| | """ |
| | FireEcho Kernel v2.5 - Full Native Triton on Blackwell + Quantum Gold |
| | ====================================================================== |
| | |
| | High-performance kernel stack for RTX 5090 (SM 12.0 Blackwell) with: |
| | - Native Triton kernels (Triton 3.6.0+ with SM 12.0 support) |
| | - DARPA STELLAR lifelong learning innovations |
| | - TMA-style block pointers (115% of cuBLAS!) |
| | - FP8 tensor core operations |
| | |
| | Quick Start: |
| | from fireecho import FireEchoEngine, FireEchoConfig |
| | |
| | engine = FireEchoEngine(FireEchoConfig.llama_7b()).cuda() |
| | output = engine.generate(input_ids, max_new_tokens=100) |
| | |
| | Kernel Performance (RTX 5090): |
| | - 2-CTA MatMul: 191 TFLOPS (116% of cuBLAS!) - Blackwell cooperative MMA |
| | - TMA MatMul: 189 TFLOPS (115% of cuBLAS) |
| | - Hybrid MatMul: 174 TFLOPS (automatic Triton/cuBLAS selection) |
| | - FP8 MatMul: 152 TFLOPS (native FP8 tensor cores) |
| | - Split-K GEMM: 80 TFLOPS (1.5x decode speedup) |
| | |
| | Features: |
| | - NVFP4 Quantization (4-bit, dual-scaling) |
| | - Paged KV Cache (vLLM-style) |
| | - Flash Attention (SDPA) |
| | - GQA Support |
| | - Multimodal (Vision + Audio) |
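| | - L2 Cache Manager |
| | - Cluster launcher and DSMEM cluster ops (matmul, attention) |
| | - Quantum circuit simulator (gates, measurement, Bell/GHZ states, QFT) |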
| | |
| | STELLAR Innovations: |
| | - HebbianMemory with eligibility traces & rare correlations |
| | - Neuromodulation gating (PNN: hs ⊗ tanh(hm)) |
| | - Sliced Cramér Preservation (anti-forgetting) |
| | - Context-Skill dual-temporal module |
| | |
| | Reference: DARPA L2M STELLAR - AFRL-RI-RS-TR-2022-135 |
| | """ |
| |
|
| | from .fireecho_kernel import ( |
| | |
| | FireEchoEngine, |
| | FireEchoConfig, |
| | |
| | |
| | PagedKVCache, |
| | HebbianMemory, |
| | PerLayerHebbian, |
| | FusedAttention, |
| | FusedFFN, |
| | FusedTransformerBlock, |
| | MegaFusedTransformerBlock, |
| | QuantizedLinear, |
| | |
| | |
| | SlicedCramerPreservation, |
| | ContextSkillModel, |
| | |
| | |
| | VisionEncoder, |
| | AudioEncoder, |
| | MultimodalFusion, |
| | |
| | |
| | hybrid_matmul, |
| | hybrid_matmul_v2, |
| | quantize_nvfp4, |
| | dequantize_nvfp4, |
| | fwht_forward, |
| | fwht_inverse, |
| | stochastic_round, |
| | |
| | |
| | fused_qkv_projection, |
| | fused_swiglu, |
| | splitk_matmul, |
| | |
| | |
| | persistent_matmul, |
| | L2CacheManager, |
| | prefetch_to_l2, |
| | |
| | |
| | tma_matmul, |
| | blackwell_2cta_matmul, |
| | fp8_matmul, |
| | fused_transformer_block_forward, |
| | |
| | |
| | create_small_engine, |
| | create_7b_engine, |
| | create_multimodal_engine, |
| | ) |
| |
|
| | |
| | from .cluster_launcher import ( |
| | ClusterLauncher, |
| | supports_clusters, |
| | get_cluster_info, |
| | cluster_matmul, |
| | ) |
| |
|
| | |
| | from .dsmem_ops import ( |
| | ClusterConfig, |
| | cluster_matmul_dsmem, |
| | cluster_attention, |
| | SuperClusterConfig, |
| | init_super_cluster, |
| | ) |
| |
|
| | |
| | from .quantum import ( |
| | |
| | QuantumCircuit, |
| | QuantumRegister, |
| | |
| | QuantumSimulator, |
| | StateVector, |
| | |
| | hadamard, pauli_x, pauli_y, pauli_z, |
| | rotation_x, rotation_y, rotation_z, |
| | cnot, cz, swap, |
| | |
| | measure, measure_all, sample, get_probabilities, |
| | |
| | bell_state, ghz_state, qft, inverse_qft, |
| | ) |
| |
|
| | __version__ = "2.5.0" |
| | __author__ = "FireEcho Team" |
| |
|