/**
 * FireEcho Kernel - SM120 Cluster Launch Implementation
 *
 * Compile with:
 *   nvcc -shared -o libfireecho_cluster.so cluster_launch.cpp \
 *     -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda -lcudart \
 *     --compiler-options '-fPIC' -arch=sm_120
 */
#include "cluster_launch.h"

#include <cstdio>  // printf/fprintf (original had a bare `#include` with no header named -- would not compile)

namespace fireecho {

// Print a human-readable summary of this device's thread-block-cluster
// capabilities to stdout. If clusters are unsupported, prints a notice
// and returns early without querying properties.
// NOTE(review): `ClusterProperties`, `supports_clusters()` and
// `get_cluster_properties()` are declared in cluster_launch.h (not visible
// here); field semantics are taken from the labels printed below.
void print_cluster_info() {
    if (!supports_clusters()) {
        printf("Thread Block Clusters: NOT SUPPORTED\n");
        return;
    }

    ClusterProperties props = get_cluster_properties();

    printf("=== SM120 Thread Block Cluster Info ===\n");
    printf("Max Cluster Size: %d\n", props.max_cluster_size);
    printf("Max Blocks/SM: %d\n", props.max_blocks_per_sm);
    // NOTE(review): %d assumes shared_memory_per_block is int; if the header
    // declares it size_t, this is UB -- confirm against cluster_launch.h and
    // use %zu (or a cast) if so.
    printf("Shared Memory/Block: %d KB\n", props.shared_memory_per_block / 1024);
    printf("Registers/Block: %d\n", props.registers_per_block);
    printf("Distributed SMEM: %s\n", props.supports_dshem ? "YES" : "NO");
    printf("========================================\n");
}

}  // namespace fireecho

// Standalone test
#ifdef TEST_CLUSTER_LAUNCH
int main() {
    // Initialize CUDA on device 0. The original ignored the return value;
    // a failed cudaSetDevice leaves a sticky error that makes every later
    // CUDA call fail mysteriously, so report it and bail out.
    cudaError_t err = cudaSetDevice(0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(0) failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    fireecho::print_cluster_info();

    if (fireecho::supports_clusters()) {
        printf("\n✅ This GPU supports Thread Block Clusters!\n");
        printf(" Max cluster size: %d CTAs\n", fireecho::get_max_cluster_size());
    } else {
        printf("\n❌ This GPU does NOT support Thread Block Clusters.\n");
    }

    return 0;
}
#endif