/**
* FireEcho Kernel - SM120 Thread Block Cluster Launcher
*
* Exposes true Thread Block Cluster APIs for Blackwell (SM 12.0)
* using the CUDA Driver API's cuLaunchKernelEx with cluster attributes.
*
* Requirements:
* - CUDA 12.8+ (for SM 12.0 support)
* - Triton 3.6.0+ compiled kernel (CUfunction)
* - Blackwell GPU (RTX 5090, SM 12.0)
*
* Features:
* - True hardware cluster launch (not just num_ctas hint)
* - Distributed Shared Memory (dSMEM) access
* - Cluster barriers for synchronization
*/
#ifndef FIREECHO_CLUSTER_LAUNCH_H
#define FIREECHO_CLUSTER_LAUNCH_H
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>
namespace fireecho {
/**
* Cluster configuration for SM120 kernels.
*/
struct ClusterConfig {
int cluster_dim_x = 2; // Cluster size in X (typically 2 for 2-CTA MMA)
int cluster_dim_y = 1;
int cluster_dim_z = 1;
int max_registers = 240; // Cap for cluster occupancy
bool enable_dsmem = true; // Enable distributed shared memory (dSMEM)
int shared_mem_bytes = 0; // Dynamic shared memory in bytes (set to the Triton kernel's reported shared-memory size if it uses any)
};
/**
* Launch a Triton-compiled kernel with true SM120 cluster support.
*
* @param func The compiled CUfunction from Triton
* @param grid Grid dimensions in thread blocks (must be divisible by the cluster dimensions in each axis)
* @param block Block dimensions
* @param args Kernel arguments
* @param config Cluster configuration
* @param stream CUDA stream (0 for default)
*/
inline CUresult launch_with_cluster(
CUfunction func,
dim3 grid,
dim3 block,
void** args,
const ClusterConfig& config = ClusterConfig(),
CUstream stream = 0
) {
// Set up cluster launch attributes for SM120
CUlaunchAttribute attrs[2];
int num_attrs = 0;
// Cluster dimension attribute
attrs[num_attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
attrs[num_attrs].value.clusterDim.x = config.cluster_dim_x;
attrs[num_attrs].value.clusterDim.y = config.cluster_dim_y;
attrs[num_attrs].value.clusterDim.z = config.cluster_dim_z;
num_attrs++;
// Cluster scheduling policy (optional, for better occupancy)
attrs[num_attrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
attrs[num_attrs].value.clusterSchedulingPolicyPreference =
CU_CLUSTER_SCHEDULING_POLICY_SPREAD; // or CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING
num_attrs++;
// Configure the launch
CUlaunchConfig launch_config = {};
launch_config.gridDimX = grid.x;
launch_config.gridDimY = grid.y;
launch_config.gridDimZ = grid.z;
launch_config.blockDimX = block.x;
launch_config.blockDimY = block.y;
launch_config.blockDimZ = block.z;
launch_config.sharedMemBytes = config.shared_mem_bytes; // Dynamic shared memory required by the Triton-compiled kernel
launch_config.hStream = stream;
launch_config.attrs = attrs;
launch_config.numAttrs = num_attrs;
// Launch with cluster configuration
return cuLaunchKernelEx(&launch_config, func, args, nullptr);
}
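// Illustrative usage sketch (not part of the API): launching a Triton-compiled
// kernel with a 2-CTA cluster. The function handle, device pointers, and
// dimensions below are hypothetical placeholders; note that the grid is given
// in thread blocks and must be divisible by the cluster dimensions.
//
//   CUfunction triton_func = /* handle from the Triton-compiled kernel */ nullptr;
//   int n = 1 << 20;
//   CUdeviceptr d_in = 0, d_out = 0;          // device buffers allocated elsewhere
//   void* params[] = { &d_in, &d_out, &n };   // addresses of each kernel argument
//   fireecho::ClusterConfig cfg;              // defaults to a 2x1x1 cluster
//   cfg.shared_mem_bytes = 0;                 // set to the kernel's reported shared-memory size if needed
//   CUresult rc = fireecho::launch_with_cluster(
//       triton_func,
//       dim3(128, 1, 1),                      // grid in blocks; 128 % cluster_dim_x == 0
//       dim3(256, 1, 1),                      // threads per block
//       params,
//       cfg,
//       0);                                   // default stream
//   if (rc != CUDA_SUCCESS) { /* handle launch failure */ }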
/**
* Check if the current GPU supports Thread Block Clusters.
*/
inline bool supports_clusters() {
int device;
cudaGetDevice(&device);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
// Thread Block Clusters require SM 9.0+ (Hopper and later, including Blackwell SM 12.0)
return props.major >= 9;
}
/**
* Get the maximum portable cluster size (in thread blocks) for the current GPU.
*/
inline int get_max_cluster_size() {
int device;
cudaGetDevice(&device);
// cudaDevAttrClusterLaunch only reports whether cluster launch is supported (1/0);
// the portable maximum cluster size on supporting GPUs is 8 thread blocks.
int cluster_launch_supported = 0;
cudaDeviceGetAttribute(&cluster_launch_supported,
cudaDevAttrClusterLaunch, device);
return cluster_launch_supported ? 8 : 1;
}
/**
* Query cluster properties for SM120.
*/
struct ClusterProperties {
int max_cluster_size;
int max_blocks_per_sm;
int shared_memory_per_block;
int registers_per_block;
bool supports_dsmem; // Distributed shared memory (dSMEM) available
};
inline ClusterProperties get_cluster_properties() {
ClusterProperties props = {};
int device;
cudaGetDevice(&device);
cudaDeviceProp dev_props;
cudaGetDeviceProperties(&dev_props, device);
props.max_cluster_size = get_max_cluster_size();
props.max_blocks_per_sm = dev_props.maxBlocksPerMultiProcessor;
props.shared_memory_per_block = static_cast<int>(dev_props.sharedMemPerBlock);
props.registers_per_block = dev_props.regsPerBlock;
props.supports_dsmem = (dev_props.major >= 9); // SM 9.0+ has dSMEM
return props;
}
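// Illustrative query sketch (hypothetical usage): clamp the requested cluster
// width to what the device reports before building a ClusterConfig.
//
//   fireecho::ClusterProperties p = fireecho::get_cluster_properties();
//   fireecho::ClusterConfig cfg;
//   cfg.cluster_dim_x = (p.max_cluster_size >= 2) ? 2 : 1; // fall back to no clustering
//   cfg.enable_dsmem = p.supports_dsmem;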
/**
* Python-compatible wrapper for cluster launch.
* Can be called from Python via ctypes or pybind11.
*/
extern "C" {
int fireecho_launch_cluster(
void* func_ptr,
int grid_x, int grid_y, int grid_z,
int block_x, int block_y, int block_z,
void** args,
int cluster_x, int cluster_y, int cluster_z,
void* stream_ptr
) {
CUfunction func = (CUfunction)func_ptr;
CUstream stream = (CUstream)stream_ptr;
ClusterConfig config;
config.cluster_dim_x = cluster_x;
config.cluster_dim_y = cluster_y;
config.cluster_dim_z = cluster_z;
CUresult result = launch_with_cluster(
func,
dim3(grid_x, grid_y, grid_z),
dim3(block_x, block_y, block_z),
args,
config,
stream
);
return (int)result;
}
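// Illustrative C-side call (sketch only; `triton_func` and the buffers are
// hypothetical). Kernel arguments are passed as an array of pointers to each
// argument value, exactly as cuLaunchKernelEx expects:
//
//   CUdeviceptr d_x, d_y;                 // allocated with cuMemAlloc elsewhere
//   int n = 4096;
//   void* args[] = { &d_x, &d_y, &n };
//   int rc = fireecho_launch_cluster(
//       (void*)triton_func,               // CUfunction from Triton, passed as void*
//       64, 1, 1,                         // grid in blocks; divisible by cluster_x
//       128, 1, 1,                        // threads per block
//       args,
//       2, 1, 1,                          // 2-CTA cluster
//       NULL);                            // default stream
//   // rc is the CUresult cast to int; 0 == CUDA_SUCCESS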
int fireecho_supports_clusters() {
return supports_clusters() ? 1 : 0;
}
int fireecho_max_cluster_size() {
return get_max_cluster_size();
}
} // extern "C"
} // namespace fireecho
#endif // FIREECHO_CLUSTER_LAUNCH_H