__init__.py · Joysulem/FireEcho at main

FireEcho / __init__.py

Upload 3258 files

b5bff9c verified 8 days ago

3.52 kB

	"""
	FireEcho Kernel v2.5 - Full Native Triton on Blackwell + Quantum Gold
	======================================================================

	High-performance kernel stack for RTX 5090 (SM 12.0 Blackwell) with:
	- Native Triton kernels (Triton 3.6.0+ SM 12.0 support)
	- DARPA STELLAR lifelong learning innovations
	- TMA-style block pointers (115% of cuBLAS!)
	- FP8 tensor core operations

	Quick Start:
	    from fireecho import FireEchoEngine, FireEchoConfig

	    engine = FireEchoEngine(FireEchoConfig.llama_7b()).cuda()
	    output = engine.generate(input_ids, max_new_tokens=100)

	Kernel Performance (RTX 5090):
	- 2-CTA MatMul: 191 TFLOPS (116% of cuBLAS!) - Blackwell cooperative MMA
	- TMA MatMul: 189 TFLOPS (115% of cuBLAS)
	- Hybrid MatMul: 174 TFLOPS (Triton/cuBLAS auto)
	- FP8 MatMul: 152 TFLOPS (native FP8 tensor cores)
	- Split-K GEMM: 80 TFLOPS (1.5x decode speedup)

	Features:
	- NVFP4 Quantization (4-bit, dual-scaling)
	- Paged KV Cache (vLLM-style)
	- Flash Attention (SDPA)
	- GQA Support
	- Multimodal (Vision + Audio)
	- L2 Cache Manager

	STELLAR Innovations:
	- HebbianMemory with eligibility traces & rare correlations
	- Neuromodulation gating (PNN: hs ⊗ tanh(hm))
	- Sliced Cramér Preservation (anti-forgetting)
	- Context-Skill dual-temporal module

	Reference: DARPA L2M STELLAR - AFRL-RI-RS-TR-2022-135
	"""

	from .fireecho_kernel import (
	# Main classes
	FireEchoEngine,
	FireEchoConfig,

	# Components
	PagedKVCache,
	HebbianMemory,
	PerLayerHebbian,
	FusedAttention,
	FusedFFN,
	FusedTransformerBlock,
	MegaFusedTransformerBlock, # NEW: Maximum fusion
	QuantizedLinear,

	# STELLAR Innovations
	SlicedCramerPreservation,
	ContextSkillModel,

	# Multimodal
	VisionEncoder,
	AudioEncoder,
	MultimodalFusion,

	# Core Kernel Functions
	hybrid_matmul,
	hybrid_matmul_v2,
	quantize_nvfp4,
	dequantize_nvfp4,
	fwht_forward,
	fwht_inverse,
	stochastic_round,

	# Fused Kernels (Phase 1)
	fused_qkv_projection,
	fused_swiglu,
	splitk_matmul,

	# Phase 2 Kernels
	persistent_matmul,
	L2CacheManager,
	prefetch_to_l2,

	# NEW: Advanced Kernels
	tma_matmul, # TMA-style block pointers (115% of cuBLAS!)
	blackwell_2cta_matmul, # 2-CTA cooperative MMA (116% of cuBLAS!)
	fp8_matmul, # Native FP8 tensor cores
	fused_transformer_block_forward, # Functional fused block

	# Utilities
	create_small_engine,
	create_7b_engine,
	create_multimodal_engine,
	)

	# Cluster launcher (SM120 Thread Block Clusters)
	from .cluster_launcher import (
	ClusterLauncher,
	supports_clusters,
	get_cluster_info,
	cluster_matmul,
	)

	# DSMEM & Cluster Barriers (SM 9.0+ / SM 12.0+)
	from .dsmem_ops import (
	ClusterConfig,
	cluster_matmul_dsmem,
	cluster_attention,
	SuperClusterConfig,
	init_super_cluster,
	)

	# FireEcho Quantum Gold - Native SM120 Quantum Simulation
	from .quantum import (
	# Circuit building
	QuantumCircuit,
	QuantumRegister,
	# Simulation
	QuantumSimulator,
	StateVector,
	# Gates
	hadamard, pauli_x, pauli_y, pauli_z,
	rotation_x, rotation_y, rotation_z,
	cnot, cz, swap,
	# Measurement
	measure, measure_all, sample, get_probabilities,
	# Algorithms
	bell_state, ghz_state, qft, inverse_qft,
	)

	# Package metadata exposed as module attributes.
	# The version string corresponds to the "v2.5" release named in the module
	# docstring above; keep the two in sync when bumping the release.
	__version__ = "2.5.0"
	__author__ = "FireEcho Team"