""" FireEcho Kernel v2.5 - Full Native Triton on Blackwell + Quantum Gold ====================================================================== High-performance kernel stack for RTX 5090 (SM 12.0 Blackwell) with: - Native Triton kernels (Triton 3.6.0+ SM 12.0 support) - DARPA STELLAR lifelong learning innovations - TMA-style block pointers (115% of cuBLAS!) - FP8 tensor core operations Quick Start: from fireecho import FireEchoEngine, FireEchoConfig engine = FireEchoEngine(FireEchoConfig.llama_7b()).cuda() output = engine.generate(input_ids, max_new_tokens=100) Kernel Performance (RTX 5090): - 2-CTA MatMul: 191 TFLOPS (116% of cuBLAS!) - Blackwell cooperative MMA - TMA MatMul: 189 TFLOPS (115% of cuBLAS) - Hybrid MatMul: 174 TFLOPS (Triton/cuBLAS auto) - FP8 MatMul: 152 TFLOPS (native FP8 tensor cores) - Split-K GEMM: 80 TFLOPS (1.5x decode speedup) Features: - NVFP4 Quantization (4-bit, dual-scaling) - Paged KV Cache (vLLM-style) - Flash Attention (SDPA) - GQA Support - Multimodal (Vision + Audio) - L2 Cache Manager STELLAR Innovations: - HebbianMemory with eligibility traces & rare correlations - Neuromodulation gating (PNN: hs ⊗ tanh(hm)) - Sliced Cramer Preservation (anti-forgetting) - Context-Skill dual-temporal module Reference: DARPA L2M STELLAR - AFRL-RI-RS-TR-2022-135 """ from .fireecho_kernel import ( # Main classes FireEchoEngine, FireEchoConfig, # Components PagedKVCache, HebbianMemory, PerLayerHebbian, FusedAttention, FusedFFN, FusedTransformerBlock, MegaFusedTransformerBlock, # NEW: Maximum fusion QuantizedLinear, # STELLAR Innovations SlicedCramerPreservation, ContextSkillModel, # Multimodal VisionEncoder, AudioEncoder, MultimodalFusion, # Core Kernel Functions hybrid_matmul, hybrid_matmul_v2, quantize_nvfp4, dequantize_nvfp4, fwht_forward, fwht_inverse, stochastic_round, # Fused Kernels (Phase 1) fused_qkv_projection, fused_swiglu, splitk_matmul, # Phase 2 Kernels persistent_matmul, L2CacheManager, prefetch_to_l2, # NEW: Advanced Kernels tma_matmul, # TMA-style block pointers (115% of cuBLAS!) blackwell_2cta_matmul, # 2-CTA cooperative MMA (116% of cuBLAS!) fp8_matmul, # Native FP8 tensor cores fused_transformer_block_forward, # Functional fused block # Utilities create_small_engine, create_7b_engine, create_multimodal_engine, ) # Cluster launcher (SM120 Thread Block Clusters) from .cluster_launcher import ( ClusterLauncher, supports_clusters, get_cluster_info, cluster_matmul, ) # DSMEM & Cluster Barriers (SM 9.0+ / SM 12.0+) from .dsmem_ops import ( ClusterConfig, cluster_matmul_dsmem, cluster_attention, SuperClusterConfig, init_super_cluster, ) # FireEcho Quantum Gold - Native SM120 Quantum Simulation from .quantum import ( # Circuit building QuantumCircuit, QuantumRegister, # Simulation QuantumSimulator, StateVector, # Gates hadamard, pauli_x, pauli_y, pauli_z, rotation_x, rotation_y, rotation_z, cnot, cz, swap, # Measurement measure, measure_all, sample, get_probabilities, # Algorithms bell_state, ghz_state, qft, inverse_qft, ) __version__ = "2.5.0" __author__ = "FireEcho Team"