"""
FireEcho Kernel v2.5 - Full Native Triton on Blackwell + Quantum Gold
======================================================================
High-performance kernel stack for RTX 5090 (SM 12.0 Blackwell) with:
- Native Triton kernels (Triton 3.6.0+ SM 12.0 support)
- DARPA STELLAR lifelong learning innovations
- TMA-style block pointers (115% of cuBLAS!)
- FP8 tensor core operations
Quick Start:
from fireecho import FireEchoEngine, FireEchoConfig
engine = FireEchoEngine(FireEchoConfig.llama_7b()).cuda()
output = engine.generate(input_ids, max_new_tokens=100)
Kernel Performance (RTX 5090):
- 2-CTA MatMul: 191 TFLOPS (116% of cuBLAS!) - Blackwell cooperative MMA
- TMA MatMul: 189 TFLOPS (115% of cuBLAS)
- Hybrid MatMul: 174 TFLOPS (Triton/cuBLAS auto)
- FP8 MatMul: 152 TFLOPS (native FP8 tensor cores)
- Split-K GEMM: 80 TFLOPS (1.5x decode speedup)
Features:
- NVFP4 Quantization (4-bit, dual-scaling)
- Paged KV Cache (vLLM-style)
- Flash Attention (SDPA)
- GQA Support
- Multimodal (Vision + Audio)
- L2 Cache Manager
STELLAR Innovations:
- HebbianMemory with eligibility traces & rare correlations
- Neuromodulation gating (PNN: hs ⊗ tanh(hm))
- Sliced Cramer Preservation (anti-forgetting)
- Context-Skill dual-temporal module
Reference: DARPA L2M STELLAR - AFRL-RI-RS-TR-2022-135
"""
from .fireecho_kernel import (
# Main classes
FireEchoEngine,
FireEchoConfig,
# Components
PagedKVCache,
HebbianMemory,
PerLayerHebbian,
FusedAttention,
FusedFFN,
FusedTransformerBlock,
MegaFusedTransformerBlock, # NEW: Maximum fusion
QuantizedLinear,
# STELLAR Innovations
SlicedCramerPreservation,
ContextSkillModel,
# Multimodal
VisionEncoder,
AudioEncoder,
MultimodalFusion,
# Core Kernel Functions
hybrid_matmul,
hybrid_matmul_v2,
quantize_nvfp4,
dequantize_nvfp4,
fwht_forward,
fwht_inverse,
stochastic_round,
# Fused Kernels (Phase 1)
fused_qkv_projection,
fused_swiglu,
splitk_matmul,
# Phase 2 Kernels
persistent_matmul,
L2CacheManager,
prefetch_to_l2,
# NEW: Advanced Kernels
tma_matmul, # TMA-style block pointers (115% of cuBLAS!)
blackwell_2cta_matmul, # 2-CTA cooperative MMA (116% of cuBLAS!)
fp8_matmul, # Native FP8 tensor cores
fused_transformer_block_forward, # Functional fused block
# Utilities
create_small_engine,
create_7b_engine,
create_multimodal_engine,
)
# Cluster launcher (SM120 Thread Block Clusters)
from .cluster_launcher import (
ClusterLauncher,
supports_clusters,
get_cluster_info,
cluster_matmul,
)
# DSMEM & Cluster Barriers (SM 9.0+ / SM 12.0+)
from .dsmem_ops import (
ClusterConfig,
cluster_matmul_dsmem,
cluster_attention,
SuperClusterConfig,
init_super_cluster,
)
# FireEcho Quantum Gold - Native SM120 Quantum Simulation
from .quantum import (
# Circuit building
QuantumCircuit,
QuantumRegister,
# Simulation
QuantumSimulator,
StateVector,
# Gates
hadamard, pauli_x, pauli_y, pauli_z,
rotation_x, rotation_y, rotation_z,
cnot, cz, swap,
# Measurement
measure, measure_all, sample, get_probabilities,
# Algorithms
bell_state, ghz_state, qft, inverse_qft,
)
# Package metadata (PEP 8 dunder convention); keep in sync with the
# release version advertised in the module docstring above.
__version__: str = "2.5.0"
__author__: str = "FireEcho Team"