| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #ifndef FIREECHO_CLUSTER_LAUNCH_H |
| #define FIREECHO_CLUSTER_LAUNCH_H |
|
|
| #include <cuda.h> |
| #include <cuda_runtime.h> |
| #include <stdexcept> |
| #include <string> |
|
|
| namespace fireecho { |
|
|
| |
| |
| |
/// Cluster launch settings passed to launch_with_cluster().
struct ClusterConfig {
    // Cluster extent along each grid axis; the default is a 2x1x1 cluster.
    int cluster_dim_x{2};
    int cluster_dim_y{1};
    int cluster_dim_z{1};

    // NOTE(review): no function visible in this header reads this field —
    // confirm whether it is consumed elsewhere or is a leftover.
    int max_registers{240};

    // NOTE(review): presumably a distributed-shared-memory toggle; it is not
    // read by any function visible in this header — confirm with callers.
    bool enable_dshem{true};
};
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| inline CUresult launch_with_cluster( |
| CUfunction func, |
| dim3 grid, |
| dim3 block, |
| void** args, |
| const ClusterConfig& config = ClusterConfig(), |
| CUstream stream = 0 |
| ) { |
| |
| CUlaunchAttribute attrs[2]; |
| int num_attrs = 0; |
| |
| |
| attrs[num_attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; |
| attrs[num_attrs].value.clusterDim.x = config.cluster_dim_x; |
| attrs[num_attrs].value.clusterDim.y = config.cluster_dim_y; |
| attrs[num_attrs].value.clusterDim.z = config.cluster_dim_z; |
| num_attrs++; |
| |
| |
| attrs[num_attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; |
| attrs[num_attrs].value.clusterSchedulingPolicyPreference = |
| CU_CLUSTER_SCHEDULING_POLICY_SPREAD; |
| num_attrs++; |
| |
| |
| CUlaunchConfig launch_config = {}; |
| launch_config.gridDimX = grid.x; |
| launch_config.gridDimY = grid.y; |
| launch_config.gridDimZ = grid.z; |
| launch_config.blockDimX = block.x; |
| launch_config.blockDimY = block.y; |
| launch_config.blockDimZ = block.z; |
| launch_config.sharedMemBytes = 0; |
| launch_config.hStream = stream; |
| launch_config.attrs = attrs; |
| launch_config.numAttrs = num_attrs; |
| |
| |
| return cuLaunchKernelEx(&launch_config, func, args, nullptr); |
| } |
|
|
| |
| |
| |
| inline bool supports_clusters() { |
| int device; |
| cudaGetDevice(&device); |
| |
| cudaDeviceProp props; |
| cudaGetDeviceProperties(&props, device); |
| |
| |
| return (props.major >= 9) || (props.major == 12); |
| } |
|
|
| |
| |
| |
| inline int get_max_cluster_size() { |
| int device; |
| cudaGetDevice(&device); |
| |
| int max_cluster_size = 1; |
| cudaDeviceGetAttribute(&max_cluster_size, |
| cudaDevAttrClusterLaunch, device); |
| |
| return max_cluster_size; |
| } |
|
|
| |
| |
| |
/// Snapshot of device limits, as filled in by get_cluster_properties().
struct ClusterProperties {
    int  max_cluster_size;         // value reported by get_max_cluster_size()
    int  max_blocks_per_sm;        // from cudaDeviceProp::maxBlocksPerMultiProcessor
    int  shared_memory_per_block;  // from cudaDeviceProp::sharedMemPerBlock
    int  registers_per_block;      // from cudaDeviceProp::regsPerBlock
    bool supports_dshem;           // set when compute capability major >= 9
};
|
|
| inline ClusterProperties get_cluster_properties() { |
| ClusterProperties props = {}; |
| |
| int device; |
| cudaGetDevice(&device); |
| |
| cudaDeviceProp dev_props; |
| cudaGetDeviceProperties(&dev_props, device); |
| |
| props.max_cluster_size = get_max_cluster_size(); |
| props.max_blocks_per_sm = dev_props.maxBlocksPerMultiProcessor; |
| props.shared_memory_per_block = dev_props.sharedMemPerBlock; |
| props.registers_per_block = dev_props.regsPerBlock; |
| props.supports_dshem = (dev_props.major >= 9); |
| |
| return props; |
| } |
|
|
| |
| |
| |
| |
| extern "C" { |
|
|
| int fireecho_launch_cluster( |
| void* func_ptr, |
| int grid_x, int grid_y, int grid_z, |
| int block_x, int block_y, int block_z, |
| void** args, |
| int cluster_x, int cluster_y, int cluster_z, |
| void* stream_ptr |
| ) { |
| CUfunction func = (CUfunction)func_ptr; |
| CUstream stream = (CUstream)stream_ptr; |
| |
| ClusterConfig config; |
| config.cluster_dim_x = cluster_x; |
| config.cluster_dim_y = cluster_y; |
| config.cluster_dim_z = cluster_z; |
| |
| CUresult result = launch_with_cluster( |
| func, |
| dim3(grid_x, grid_y, grid_z), |
| dim3(block_x, block_y, block_z), |
| args, |
| config, |
| stream |
| ); |
| |
| return (int)result; |
| } |
|
|
| int fireecho_supports_clusters() { |
| return supports_clusters() ? 1 : 0; |
| } |
|
|
| int fireecho_max_cluster_size() { |
| return get_max_cluster_size(); |
| } |
|
|
| } |
|
|
| } |
|
|
| #endif |
|
|