template <typename T,
          int BLOCK_DIM_X,
          BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS,
          int BLOCK_DIM_Y = 1,
          int BLOCK_DIM_Z = 1,
          int LEGACY_PTX_ARCH = 0>
class BlockReduce ...
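As a minimal sketch (not from the original text, assuming the CUB headers are available via #include <cub/cub.cuh>), the non-default template parameters can be used to pick a different reduction algorithm or a multi-dimensional thread block; the alias names below are illustrative:

#include <cub/cub.cuh>

// A 1D block of 256 threads of float, using the raking algorithm
// instead of the default warp reductions
using RakingReduce = cub::BlockReduce<float, 256, cub::BLOCK_REDUCE_RAKING>;

// A 2D block of 16x16 threads (BLOCK_DIM_X = 16, BLOCK_DIM_Y = 16)
using Reduce2D = cub::BlockReduce<float, 16, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 16>;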
// Usage:
__global__ void ExampleKernel(...)
{
    // Specialize BlockReduce for a 1D block of 128 threads of type int
    using BlockReduce = cub::BlockReduce<int, 128>;

    // Allocate shared memory for BlockReduce
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Obtain a segment of consecutive items that are blocked across threads
    int thread_data[4];
    ...

    // Compute the block-wide max for thread0
    int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
}
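Note that the aggregate returned by Reduce() (and Sum()) is only valid in thread0 of the block. A minimal sketch, assuming a 1D block of 128 threads, of broadcasting the result through shared memory when every thread needs it (the kernel name BlockMaxKernel and its parameters are illustrative, not from the original):

#include <cub/cub.cuh>

__global__ void BlockMaxKernel(const int* in, int* out)
{
    using BlockReduce = cub::BlockReduce<int, 128>;

    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ int block_aggregate;   // holds the broadcast result

    int thread_data = in[blockIdx.x * blockDim.x + threadIdx.x];

    // Only thread0 receives the valid block-wide maximum
    int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());

    if (threadIdx.x == 0)
    {
        block_aggregate = aggregate;
    }
    __syncthreads();   // after this, every thread can read block_aggregate

    out[blockIdx.x * blockDim.x + threadIdx.x] = block_aggregate;
}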
__global__ void kernel(int* per_block_results)
{
    // (1) Select the desired class
    // `cub::BlockReduce` is a class template that must be instantiated for the
    // input data type and the number of threads. Internally the class is
    // specialized depending on the data type, number of threads, and hardware
    // architecture. Type aliases are often used for convenience:
    using BlockReduce = cub::BlockReduce<int, 128>;

    // (2) Query the temporary storage
    // The type and amount of temporary storage depends on the selected instantiation
    using TempStorage = typename BlockReduce::TempStorage;

    // (3) Allocate the temporary storage
    __shared__ TempStorage temp_storage;

    // (4) Pass the temporary storage
    // Temporary storage is passed to the constructor of the `BlockReduce` class
    BlockReduce block_reduce{temp_storage};

    // (5) Invoke the algorithm
    // The `Sum()` member function performs the sum reduction of `thread_data` across all 128 threads
    int thread_data[4] = {1, 2, 3, 4};
    int block_result = block_reduce.Sum(thread_data);

    // (6) Store the result
    // Only thread0 receives the valid block-wide aggregate, so it alone writes it out
    if (threadIdx.x == 0)
    {
        per_block_results[blockIdx.x] = block_result;
    }
}
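A minimal host-side sketch of how such a kernel might be launched (the block count, buffer handling, and the final host-side accumulation are illustrative assumptions, not from the original; the kernel above is assumed to be in the same translation unit):

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
    const int num_blocks = 4;

    int* d_per_block_results = nullptr;
    cudaMalloc(&d_per_block_results, num_blocks * sizeof(int));

    // Each of the 128 threads contributes 1+2+3+4 = 10, so every block writes 1280
    kernel<<<num_blocks, 128>>>(d_per_block_results);

    std::vector<int> h_results(num_blocks);
    cudaMemcpy(h_results.data(), d_per_block_results, num_blocks * sizeof(int),
               cudaMemcpyDeviceToHost);

    // Combine the per-block partial sums on the host
    int total = 0;
    for (int r : h_results)
    {
        total += r;
    }
    std::printf("total = %d\n", total);

    cudaFree(d_per_block_results);
    return 0;
}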