/**
 * FireEcho Kernel - SM120 Cluster Launch Implementation
 *
 * Compile with:
 *   nvcc -shared -o libfireecho_cluster.so cluster_launch.cpp \
 *     -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda -lcudart \
 *     --compiler-options '-fPIC' -arch=sm_120
 */
#include "cluster_launch.h"

#include <cstdio>  // printf/fprintf (original had a bare `#include` with no header named -- would not compile)

namespace fireecho {

// Print a human-readable summary of this device's thread-block-cluster
// capabilities to stdout. If clusters are unsupported, prints a notice
// and returns early without querying properties.
// NOTE(review): `ClusterProperties`, `supports_clusters()` and
// `get_cluster_properties()` are declared in cluster_launch.h (not visible
// here); field semantics are taken from the labels printed below.
void print_cluster_info() {
    if (!supports_clusters()) {
        printf("Thread Block Clusters: NOT SUPPORTED\n");
        return;
    }

    ClusterProperties props = get_cluster_properties();

    printf("=== SM120 Thread Block Cluster Info ===\n");
    printf("Max Cluster Size: %d\n", props.max_cluster_size);
    printf("Max Blocks/SM: %d\n", props.max_blocks_per_sm);
    // NOTE(review): %d assumes shared_memory_per_block is int; if the header
    // declares it size_t, this is UB -- confirm against cluster_launch.h and
    // use %zu (or a cast) if so.
    printf("Shared Memory/Block: %d KB\n", props.shared_memory_per_block / 1024);
    printf("Registers/Block: %d\n", props.registers_per_block);
    printf("Distributed SMEM: %s\n", props.supports_dshem ? "YES" : "NO");
    printf("========================================\n");
}

}  // namespace fireecho

// Standalone test
#ifdef TEST_CLUSTER_LAUNCH
int main() {
    // Initialize CUDA on device 0. The original ignored the return value;
    // a failed cudaSetDevice leaves a sticky error that makes every later
    // CUDA call fail mysteriously, so report it and bail out.
    cudaError_t err = cudaSetDevice(0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(0) failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    fireecho::print_cluster_info();

    if (fireecho::supports_clusters()) {
        printf("\n✅ This GPU supports Thread Block Clusters!\n");
        printf(" Max cluster size: %d CTAs\n", fireecho::get_max_cluster_size());
    } else {
        printf("\n❌ This GPU does NOT support Thread Block Clusters.\n");
    }

    return 0;
}
#endif