FireEcho Engine/goliath_kernel.py · Joysulem/FireEcho at main

FireEcho / FireEcho Engine /goliath_kernel.py

Upload 3258 files

b5bff9c verified 7 days ago

114 kB

	# =============================================================================
	# Copyright (c) 2024-2026 Luis E. Davila Flores. All rights reserved.
	#
	# FireEcho Engine — High-Performance Inference Kernel
	# Creator & Sole Author: Luis E. Davila Flores
	#
	# Licensed under Creative Commons Attribution-NonCommercial 4.0 International
	# (CC BY-NC 4.0). You may share and adapt this work for non-commercial
	# purposes with proper attribution. Full license terms:
	# https://creativecommons.org/licenses/by-nc/4.0/
	# =============================================================================
	"""
	FireEcho Goliath — Native FP4/FP8/INT2/FE-XC/FE-XVQ Fused Triton GEMM Kernel
	===============================================================================
	Part of the FireEcho Engine — Custom inference kernel for NVIDIA Blackwell
	Copyright (c) 2025-2026 Echo (FireEcho Project). All rights reserved.

	A unified quantized GEMM kernel that dequantizes inside the Triton matmul
	loop — no separate dequantization step, no global memory materialization.

	Features:
	1. NVFP4 GEMM — Fused dequant inside Triton matmul
	2. NVFP8 GEMM — Same approach for FP8
	3. INT2 GEMM — 2-bit quantization for cold MoE experts
	4. FE-XC GEMM — Codebook 2-bit (2x8) with CodeGEMM psumbook (near-FP16 quality)
	5. FE-XVQ GEMM — Hessian-weighted codebook 2-bit (VPTQ-inspired, second-order optimal)
	6. Packed MoE — Contiguous [128, K//2, N] expert buffers, GPU expert IDs
	7. Fused SwiGLU+Down — Single-kernel gate+up+silu+mul+down
	8. Auto-dispatch — Automatically choose FP4/FP8/INT2/FE-XC/FE-XVQ based on expert temperature
	9. Unified API:
	- goliath_quantize(tensor, bits=4 or 8)
	- goliath_gemm(activations, quantized_weights)

	Design:
	- Loads packed FP4/FP8 data directly from global memory
	- Dequantizes in registers (not global memory) — zero extra traffic
	- Uses Triton block pointers for efficient memory access
	- Supports both BF16 and FP16 accumulation
	- Target: 5-10x over the 10.4 TFLOPS baseline

	Usage:
	from goliath_kernel import goliath_quantize, goliath_gemm

	# FP4 quantization (maximum compression)
	w_q4 = goliath_quantize(weights, bits=4)
	out = goliath_gemm(activations, w_q4)

	# FP8 quantization (higher accuracy)
	w_q8 = goliath_quantize(weights, bits=8)
	out = goliath_gemm(activations, w_q8)

	# Auto mode (let Goliath decide)
	w_q = goliath_quantize(weights, bits='auto')
	out = goliath_gemm(activations, w_q)
	"""

	import torch
	import triton
	import triton.language as tl
	from typing import Optional, Tuple, Union
	from dataclasses import dataclass


	# =============================================================================
	# NVFP4 Bridge (cutlass_kernels → native cuBLAS path when available)
	# =============================================================================
	try:
	from cutlass_kernels import (
	NVFP4Weights as _BridgeNVFP4Weights,
	_fused_nvfp4_matmul as _bridge_fused_nvfp4,
	_can_use_scaled_mm_fp4 as _bridge_can_use_cublas_fp4,
	_scaled_mm_fp4 as _bridge_scaled_mm_fp4,
	)
	_NVFP4_BRIDGE_AVAILABLE = True
	except Exception:
	_BridgeNVFP4Weights = None
	_bridge_fused_nvfp4 = None
	_bridge_can_use_cublas_fp4 = None
	_bridge_scaled_mm_fp4 = None
	_NVFP4_BRIDGE_AVAILABLE = False


	# =============================================================================
	# E2M1 Constants (shared with cutlass_kernels)
	# =============================================================================

	_E2M1_VALUES = torch.tensor(
	[0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6],
	dtype=torch.float32,
	)

	_E2M1_BOUNDARIES = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0])


	# =============================================================================
	# E4M3 Encode/Decode (PyTorch host-side)
	# =============================================================================

	def _encode_e4m3(values: torch.Tensor) -> torch.Tensor:
	"""Encode FP32 values to E4M3 (FP8) as uint8."""
	if hasattr(torch, 'float8_e4m3fn'):
	return values.clamp(-448.0, 448.0).to(torch.float8_e4m3fn).view(torch.uint8)
	v = values.float().clamp(-448.0, 448.0)
	sign = (v < 0).to(torch.uint8) << 7
	av = v.abs().clamp(min=0.0)
	log2_av = torch.log2(av.clamp(min=2**-9))
	exp_raw = torch.floor(log2_av).clamp(-6, 8)
	exp_biased = (exp_raw + 7).clamp(0, 15)
	mantissa_f = (av / torch.pow(2.0, exp_raw) - 1.0) * 8.0
	mantissa = mantissa_f.round().clamp(0, 7).to(torch.uint8)
	sub_mant = (av / (2*-6) 8.0).round().clamp(0, 7).to(torch.uint8)
	is_sub = exp_biased == 0
	final_mant = torch.where(is_sub, sub_mant, mantissa)
	return sign \| (exp_biased.to(torch.uint8) << 3) \| final_mant


	def _decode_e4m3(encoded: torch.Tensor) -> torch.Tensor:
	"""Decode E4M3 uint8 back to FP32."""
	if hasattr(torch, 'float8_e4m3fn'):
	return encoded.view(torch.float8_e4m3fn).float()
	sign = ((encoded >> 7) & 1).float()
	exp = ((encoded >> 3) & 0xF).long()
	mant = (encoded & 0x7).long()
	is_normal = exp > 0
	normal_val = (8 + mant).float() * torch.pow(2.0, (exp - 10).float())
	subnormal_val = mant.float() * (2.0 ** -9)
	unsigned = torch.where(is_normal, normal_val, subnormal_val)
	return torch.where(sign != 0, -unsigned, unsigned)


	# =============================================================================
	# Goliath Quantized Weight Containers
	# =============================================================================

	@dataclass
	class GoliathFP4Weights:
	"""
	Goliath FP4 quantized weights — fused-dequant format.

	16-element blocks, E4M3 scales, per-tensor FP32 scale.
	Packed as 2 nibbles per uint8 byte (same as NVFP4).

	Optional FP8 residual correction (double-buff) for near-FP16 accuracy.
	"""
	packed: torch.Tensor # [K//2, N] uint8
	block_scales: torch.Tensor # [K//16, N] uint8 (E4M3)
	tensor_scale: float # FP32 per-tensor scale
	shape: Tuple[int, int] # (K, N)
	bits: int = 4
	# FP8 residual correction (optional, "double-buff")
	residual: Optional[torch.Tensor] = None # [K, N] uint8 — E4M3 encoded
	residual_scales: Optional[torch.Tensor] = None # [K//16, N] float32 per-block

	@classmethod
	def from_float(cls, weights: torch.Tensor, training: bool = False,
	sr_seed: Optional[int] = None,
	compute_residual: bool = False) -> 'GoliathFP4Weights':
	K, N = weights.shape
	assert K % 16 == 0, f"K ({K}) must be multiple of 16 for FP4"

	device = weights.device
	w = weights.float()

	reshaped = w.view(K // 16, 16, N)
	absmax = reshaped.abs().amax(dim=1).clamp(min=1e-10)

	# Two-level scaling
	global_absmax = absmax.max().clamp(min=1e-10)
	tensor_scale = (global_absmax / 448.0).item()

	target = absmax / (tensor_scale * 6.0)
	target = target.clamp(min=1e-10)
	block_scales_fp8 = _encode_e4m3(target)

	actual_scale = _decode_e4m3(block_scales_fp8) * tensor_scale
	actual_scale = actual_scale.clamp(min=1e-10)

	normalized = (reshaped / actual_scale.unsqueeze(1)).clamp(-6.0, 6.0)

	# Vectorized bucketize quantization
	boundaries = _E2M1_BOUNDARIES.to(device)
	abs_norm = normalized.abs().reshape(-1)
	unsigned_idx = torch.bucketize(abs_norm, boundaries)

	# Stochastic rounding for training (reduces systematic quantization bias)
	if training and sr_seed is not None:
	e2m1_unsigned = _E2M1_VALUES[:8].to(device) # [0, 0.5, 1, 1.5, 2, 3, 4, 6]
	lower_val = e2m1_unsigned[unsigned_idx.clamp(max=7)]
	upper_val = e2m1_unsigned[(unsigned_idx + 1).clamp(max=7)]
	spread = (upper_val - lower_val).clamp(min=1e-10)
	frac = (abs_norm - lower_val) / spread
	gen = torch.Generator(device=device).manual_seed(sr_seed)
	rand = torch.rand_like(frac, generator=gen)
	unsigned_idx = torch.where(rand < frac, unsigned_idx + 1, unsigned_idx).clamp(0, 7)

	sign_bit = ((normalized.reshape(-1) < 0) & (unsigned_idx > 0)).to(torch.uint8) << 3
	indices = (sign_bit \| unsigned_idx.to(torch.uint8)).reshape(K, N)

	packed = (indices[0::2] \| (indices[1::2] << 4))

	# FP8 residual correction (double-buff)
	residual_e4m3 = None
	residual_scales = None
	if compute_residual:
	fp4_approx = cls(
	packed=packed, block_scales=block_scales_fp8,
	tensor_scale=tensor_scale, shape=(K, N),
	).to_float()
	residual_float = w - fp4_approx
	res_blocks = residual_float.view(K // 16, 16, N)
	res_absmax = res_blocks.abs().amax(dim=1).clamp(min=1e-10)
	res_scale = res_absmax / 448.0
	res_normalized = res_blocks / res_scale.unsqueeze(1)
	res_normalized = res_normalized.clamp(-448.0, 448.0)
	residual_e4m3 = res_normalized.view(K, N).to(torch.float8_e4m3fn).view(torch.uint8)
	residual_scales = res_scale

	return cls(
	packed=packed,
	block_scales=block_scales_fp8,
	tensor_scale=tensor_scale,
	shape=(K, N),
	residual=residual_e4m3,
	residual_scales=residual_scales,
	)

	def to_float(self) -> torch.Tensor:
	K, N = self.shape
	device = self.packed.device
	e2m1 = _E2M1_VALUES.to(device)

	low = (self.packed & 0xF).long()
	high = (self.packed >> 4).long()

	low_vals = e2m1[low.flatten()].view(K // 2, N)
	high_vals = e2m1[high.flatten()].view(K // 2, N)

	unpacked = torch.zeros(K, N, device=device, dtype=torch.float32)
	unpacked[0::2] = low_vals
	unpacked[1::2] = high_vals

	block_sf = _decode_e4m3(self.block_scales)
	scale = block_sf * self.tensor_scale
	unpacked = unpacked.view(K // 16, 16, N) * scale.unsqueeze(1)
	return unpacked.view(K, N)


	@dataclass
	class GoliathFP4NativeWeights:
	"""
	Goliath FP4 native weights for ``tl.dot_scaled`` code path.

	Layout required by ``tl.dot_scaled(..., "e2m1")``:
	- packed_col_major: [N, K//2] uint8 — E2M1 nibbles, col-major for RHS
	- scales_e8m0: [N, K//32] uint8 — E8M0 power-of-two scales (32-element blocks)

	Created from a GoliathFP4Weights instance via ``from_goliath_fp4()``.
	"""
	packed_col_major: torch.Tensor # [N, K//2] uint8
	scales_e8m0: torch.Tensor # [N, K//32] uint8
	shape: Tuple[int, int] # (K, N) — original weight shape
	bits: int = 4

	@classmethod
	def from_goliath_fp4(cls, w: GoliathFP4Weights) -> 'GoliathFP4NativeWeights':
	"""Convert GoliathFP4Weights → native dot_scaled layout.

	Transposes packed data to col-major [N, K//2] and converts
	E4M3 16-element block scales → E8M0 32-element block scales.
	"""
	K, N = w.shape
	device = w.packed.device

	# Transpose packed nibbles: [K//2, N] → [N, K//2]
	packed_col = w.packed.T.contiguous()

	# Convert E4M3 block scales to E8M0 (power-of-two, 32-element blocks)
	# w.block_scales: [K//16, N] uint8 E4M3
	# E8M0 needs [K//32, N] — merge pairs of 16-element groups
	scales_f = _decode_e4m3(w.block_scales) # [K//16, N] float32
	num_16_groups = K // 16

	if num_16_groups >= 2:
	# Pair consecutive 16-element groups → 32-element blocks
	scales_paired = scales_f.view(num_16_groups // 2, 2, N)
	# Effective scale = max of the pair * tensor_scale
	scales_32 = scales_paired.amax(dim=1) * w.tensor_scale # [K//32, N]
	else:
	scales_32 = scales_f * w.tensor_scale # [1, N] (K==16 edge case)

	# E8M0 encoding: uint8 = round(log2(val / 6.0)) + 127
	# 6.0 = max absolute E2M1 value; E8M0 is pure exponent (bias 127)
	safe_scales = scales_32.clamp(min=1e-20)
	log2_val = torch.log2(safe_scales / 6.0)
	e8m0 = (log2_val.round() + 127).clamp(0, 254).to(torch.uint8) # [K//32, N]

	# Transpose scales to [N, K//32] for col-major RHS access
	scales_col = e8m0.T.contiguous()

	return cls(
	packed_col_major=packed_col,
	scales_e8m0=scales_col,
	shape=(K, N),
	)


	@dataclass
	class GoliathFP8Weights:
	"""
	Goliath FP8 quantized weights — fused-dequant format.

	Per-block FP32 scales, data stored as uint8 E4M3 encoding.
	Block size: 32 elements (matches Triton tile granularity for shared memory).
	"""
	data: torch.Tensor # [K, N] uint8 (E4M3 encoded)
	block_scales: torch.Tensor # [K//32, N] float32 per-block scale
	shape: Tuple[int, int] # (K, N)
	bits: int = 8

	@classmethod
	def from_float(cls, weights: torch.Tensor) -> 'GoliathFP8Weights':
	K, N = weights.shape
	assert K % 32 == 0, f"K ({K}) must be multiple of 32 for FP8"

	device = weights.device
	w = weights.float()

	reshaped = w.view(K // 32, 32, N)
	absmax = reshaped.abs().amax(dim=1).clamp(min=1e-10) # [K//32, N]

	# Scale so max maps to 448 (E4M3 max)
	block_scales = absmax / 448.0 # [K//32, N]

	normalized = reshaped / block_scales.unsqueeze(1)
	normalized = normalized.clamp(-448.0, 448.0)

	# Encode as E4M3
	data = _encode_e4m3(normalized.reshape(K, N))

	return cls(
	data=data,
	block_scales=block_scales,
	shape=(K, N),
	)

	def to_float(self) -> torch.Tensor:
	K, N = self.shape
	decoded = _decode_e4m3(self.data) # [K, N]
	decoded = decoded.view(K // 32, 32, N) * self.block_scales.unsqueeze(1)
	return decoded.view(K, N)


	# =============================================================================
	# Goliath INT2 Weights — Aggressive 2-bit Quantization for Cold Experts
	# =============================================================================
	#
	# Simple uniform 2-bit quantization with group scales:
	# - 4 weights packed per uint8 byte
	# - 32-element groups with FP16 scales
	# - Values quantize to {-2, -1, 0, 1} × scale
	# - 2x smaller than FP4 (0.25 bytes/weight vs 0.5 bytes/weight)
	#
	# Use case: Cold MoE experts (rarely routed, <10% of tokens)
	# Quality: ~2-3% accuracy loss acceptable for cold experts
	#

	@dataclass
	class GoliathINT2Weights:
	"""
	Goliath INT2 quantized weights — aggressive compression for cold experts.

	32-element groups, FP16 scales, 4 weights packed per uint8 byte.
	Values map to {-2, -1, 0, 1} × scale (symmetric 2-bit).
	"""
	packed: torch.Tensor # [K//4, N] uint8 (4 weights per byte)
	block_scales: torch.Tensor # [K//32, N] float16 per-block scale
	shape: Tuple[int, int] # (K, N)
	bits: int = 2

	@classmethod
	def from_float(cls, weights: torch.Tensor, stochastic: bool = False) -> 'GoliathINT2Weights':
	"""Quantize FP32/BF16 weights to INT2 format.

	Args:
	weights: Input [K, N] tensor
	stochastic: Use stochastic rounding (reduces bias for training)

	Returns:
	GoliathINT2Weights with packed 2-bit data
	"""
	K, N = weights.shape
	assert K % 32 == 0, f"K ({K}) must be multiple of 32 for INT2"

	device = weights.device
	w = weights.float()

	# Reshape to 32-element blocks for scaling
	reshaped = w.view(K // 32, 32, N)
	absmax = reshaped.abs().amax(dim=1).clamp(min=1e-10) # [K//32, N]

	# Scale factor: map max to 2.0 (our max quantized value)
	block_scales = (absmax / 2.0).to(torch.float16) # [K//32, N]

	# Normalize to [-2, 2] range
	scale_expanded = block_scales.unsqueeze(1).float() # [K//32, 1, N]
	normalized = reshaped / scale_expanded.clamp(min=1e-10) # [K//32, 32, N]

	# Quantize to {-2, -1, 0, 1} (2-bit signed)
	if stochastic:
	# Stochastic rounding
	noise = torch.rand_like(normalized) - 0.5
	quantized = torch.round(normalized + noise * 0.5)
	else:
	quantized = torch.round(normalized)

	quantized = quantized.clamp(-2, 1).to(torch.int8) # {-2, -1, 0, 1}

	# Shift to unsigned {0, 1, 2, 3} for packing
	unsigned = (quantized + 2).to(torch.uint8) # {0, 1, 2, 3}

	# Reshape to [K, N] for packing
	unsigned = unsigned.view(K, N)

	# Pack 4 weights per byte: w0 \| (w1 << 2) \| (w2 << 4) \| (w3 << 6)
	packed = (unsigned[0::4] \|
	(unsigned[1::4] << 2) \|
	(unsigned[2::4] << 4) \|
	(unsigned[3::4] << 6)) # [K//4, N]

	return cls(
	packed=packed,
	block_scales=block_scales,
	shape=(K, N),
	)

	@classmethod
	def from_fp4(cls, fp4_weights: GoliathFP4Weights) -> 'GoliathINT2Weights':
	"""Convert FP4 weights to INT2 (for demoting cold experts)."""
	return cls.from_float(fp4_weights.to_float())

	def to_float(self) -> torch.Tensor:
	"""Dequantize INT2 weights back to FP32."""
	K, N = self.shape
	device = self.packed.device

	# Unpack 4 weights per byte
	w0 = (self.packed & 0x3).to(torch.int8) - 2 # {0,1,2,3} -> {-2,-1,0,1}
	w1 = ((self.packed >> 2) & 0x3).to(torch.int8) - 2
	w2 = ((self.packed >> 4) & 0x3).to(torch.int8) - 2
	w3 = ((self.packed >> 6) & 0x3).to(torch.int8) - 2

	# Interleave back to [K, N]
	unpacked = torch.zeros(K, N, device=device, dtype=torch.float32)
	unpacked[0::4] = w0.float()
	unpacked[1::4] = w1.float()
	unpacked[2::4] = w2.float()
	unpacked[3::4] = w3.float()

	# Apply block scales
	unpacked = unpacked.view(K // 32, 32, N)
	unpacked = unpacked * self.block_scales.unsqueeze(1).float()

	return unpacked.view(K, N)

	def memory_bytes(self) -> int:
	"""Return memory usage in bytes."""
	return self.packed.numel() + self.block_scales.numel() * 2 # FP16 = 2 bytes


	@dataclass
	class GoliathFEXCWeights:
	"""
	FE-XC (FireEcho Xtreme Compress) — Codebook-based 2-bit quantization.

	Uses AQLM-style 2x8 additive codebooks: each group of 8 weights is
	represented as C0[idx0] + C1[idx1] where C0/C1 are learned codebooks
	with 256 centroids each. Achieves near-FP16 quality at 2 bits/weight.

	Combined with CodeGEMM-style psumbook precomputation for fast inference:
	precompute psumbook[m,c,j] = dot(codebook[m][c], input[j8:(j+1)8])
	once per token, then the matmul reduces to scalar gathers + adds.

	Layout:
	codes: [K, N//8, 2] uint8 — 2 codebook indices per 8-weight group
	codebooks: [2, 256, 8] float16 — 2 codebooks, 256 centroids, 8 elements
	scales: [K] float16 — per-output-channel scale
	"""
	codes: torch.Tensor # [K, N//8, 2] uint8
	codebooks: torch.Tensor # [2, 256, 8] float16
	scales: torch.Tensor # [K] float16
	shape: Tuple[int, int] # (K, N)
	bits: int = 2
	group_size: int = 8

	@classmethod
	def from_float(
	cls,
	weights: torch.Tensor,
	codebooks: Optional[torch.Tensor] = None,
	n_centroids: int = 256,
	n_iters: int = 20,
	) -> 'GoliathFEXCWeights':
	"""Quantize FP32/BF16 weights to FE-XC 2x8 codebook format.

	Uses residual k-means: learn codebook_0 on raw groups, then
	codebook_1 on residuals. If codebooks are provided (shared),
	skips k-means and only assigns codes.

	Args:
	weights: Input [K, N] tensor
	codebooks: Optional pre-learned [2, 256, 8] codebooks (shared across experts)
	n_centroids: Number of codebook entries (default 256)
	n_iters: K-means iterations (default 20)

	Returns:
	GoliathFEXCWeights with codebook indices + shared codebooks
	"""
	K, N = weights.shape
	g = 8
	assert N % g == 0, f"N ({N}) must be multiple of group_size {g}"

	device = weights.device
	w = weights.float()

	# Reshape to groups: [K * N//g, g]
	groups = w.view(-1, g) # [K*N/8, 8]
	num_groups = groups.shape[0]

	if codebooks is None:
	# Learn codebooks via residual k-means
	codebooks = torch.zeros(2, n_centroids, g, device=device, dtype=torch.float32)

	for cb_idx in range(2):
	if cb_idx == 0:
	data = groups
	else:
	# Residual after first codebook
	nearest_0 = codebooks[0][codes_0.long()] # [num_groups, g]
	data = groups - nearest_0

	# K-means: init with random sample
	perm = torch.randperm(num_groups, device=device)[:n_centroids]
	centroids = data[perm].clone() # [256, g]

	for _ in range(n_iters):
	# Assign: find nearest centroid for each group
	# dists[i, c] = \|\|data[i] - centroids[c]\|\|^2
	dists = torch.cdist(data, centroids) # [num_groups, 256]
	assignments = dists.argmin(dim=1) # [num_groups]

	# Update centroids
	for c in range(n_centroids):
	mask = (assignments == c)
	if mask.any():
	centroids[c] = data[mask].mean(dim=0)

	codebooks[cb_idx] = centroids

	if cb_idx == 0:
	codes_0 = dists.argmin(dim=1).to(torch.uint8)

	codebooks = codebooks.to(torch.float16)
	else:
	codebooks = codebooks.to(device=device)

	# Assign codes using provided or learned codebooks
	cb_float = codebooks.float()

	# Codebook 0: nearest centroid
	dists_0 = torch.cdist(groups, cb_float[0]) # [num_groups, 256]
	codes_0 = dists_0.argmin(dim=1).to(torch.uint8) # [num_groups]

	# Residual after codebook 0
	residual = groups - cb_float[0][codes_0.long()]

	# Codebook 1: nearest centroid on residual
	dists_1 = torch.cdist(residual, cb_float[1])
	codes_1 = dists_1.argmin(dim=1).to(torch.uint8)

	# Pack codes: [K, N//g, 2]
	codes = torch.stack([codes_0, codes_1], dim=1).view(K, N // g, 2)

	# Per-output-channel scale: compensate for reconstruction error
	reconstructed = (cb_float[0][codes_0.long()] + cb_float[1][codes_1.long()]).view(K, N)
	row_norms_orig = w.norm(dim=1).clamp(min=1e-10)
	row_norms_recon = reconstructed.norm(dim=1).clamp(min=1e-10)
	scales = (row_norms_orig / row_norms_recon).to(torch.float16)

	return cls(
	codes=codes,
	codebooks=codebooks.to(torch.float16),
	scales=scales,
	shape=(K, N),
	)

	@classmethod
	def from_fp4(cls, fp4_weights: GoliathFP4Weights,
	codebooks: Optional[torch.Tensor] = None) -> 'GoliathFEXCWeights':
	"""Convert FP4 weights to FE-XC (for demoting cold experts)."""
	return cls.from_float(fp4_weights.to_float(), codebooks=codebooks)

	def to_float(self) -> torch.Tensor:
	"""Dequantize FE-XC weights back to FP32."""
	K, N = self.shape
	g = self.group_size
	cb = self.codebooks.float() # [2, 256, 8]

	codes_flat = self.codes.view(-1, 2).long() # [K*N//8, 2]
	reconstructed = cb[0][codes_flat[:, 0]] + cb[1][codes_flat[:, 1]] # [K*N//8, 8]
	reconstructed = reconstructed.view(K, N)

	# Apply per-row scale
	reconstructed = reconstructed * self.scales.float().unsqueeze(1)

	return reconstructed

	def memory_bytes(self) -> int:
	"""Return memory usage in bytes."""
	return (self.codes.numel() # uint8
	+ self.codebooks.numel() * 2 # float16
	+ self.scales.numel() * 2) # float16


	@dataclass
	class GoliathFEXVQWeights:
	"""
	FE-XVQ (FireEcho XVector Quantization) — Hessian-weighted codebook 2-bit.

	VPTQ-inspired: uses second-order information (Hessian diagonal from
	calibration data) to weight the k-means objective. Errors in important
	dimensions (high Hessian) are penalized more, producing better codebooks
	than FE-XC's plain MSE k-means.

	Same storage format as FE-XC — reuses the same inference kernel (psumbook
	CodeGEMM). Only the codebook LEARNING differs.

	For a linear layer y = Wx, the Hessian H = X^T X (input covariance).
	Quantization error δw contributes δw^T H δw to output loss.
	FE-XVQ minimizes this weighted error instead of plain \|\|δw\|\|^2.

	Layout (identical to FE-XC):
	codes: [K, N//8, 2] uint8 — 2 codebook indices per 8-weight group
	codebooks: [2, 256, 8] float16 — 2 codebooks, 256 centroids, 8 elements
	scales: [K] float16 — per-output-channel scale
	"""
	codes: torch.Tensor # [K, N//8, 2] uint8
	codebooks: torch.Tensor # [2, 256, 8] float16
	scales: torch.Tensor # [K] float16
	shape: Tuple[int, int] # (K, N)
	bits: int = 2
	group_size: int = 8

	@classmethod
	def from_float(
	cls,
	weights: torch.Tensor,
	hessian_diag: Optional[torch.Tensor] = None,
	codebooks: Optional[torch.Tensor] = None,
	n_centroids: int = 256,
	n_iters: int = 20,
	) -> 'GoliathFEXVQWeights':
	"""Quantize weights to FE-XVQ format with Hessian-weighted codebooks.

	When hessian_diag is provided, uses importance-weighted k-means:
	distance = sum(h_i * (w_i - c_i)^2) instead of plain MSE.
	This prioritizes accuracy on dimensions that matter most for output.

	The Hessian diagonal is averaged within each group of 8 to produce
	per-element importance weights [8]. Both data and centroids are
	pre-scaled by sqrt(h_avg), converting Mahalanobis distance to
	Euclidean — enabling efficient torch.cdist.

	Falls back to plain k-means (FE-XC equivalent) when hessian_diag=None.

	Args:
	weights: Input [K, N] tensor (K=out_features, N=in_features)
	hessian_diag: Optional [N] tensor — diagonal of H = X^T X
	codebooks: Optional pre-learned [2, 256, 8] codebooks
	n_centroids: Number of codebook entries (default 256)
	n_iters: K-means iterations (default 20)

	Returns:
	GoliathFEXVQWeights with Hessian-optimal codebook indices
	"""
	K, N = weights.shape
	g = 8
	assert N % g == 0, f"N ({N}) must be multiple of group_size {g}"

	device = weights.device
	w = weights.float()

	# Reshape to groups: [K * N//g, g]
	groups = w.view(-1, g) # [K*N/8, 8]
	num_groups = groups.shape[0]

	# Compute per-element importance weights [g] from Hessian diagonal
	# Average over group positions to get a single [8] weight vector.
	# This captures which of the 8 elements within each group matters most.
	# Pre-scaling by sqrt(h_avg) converts Mahalanobis→Euclidean for cdist.
	if hessian_diag is not None:
	h = hessian_diag.float().to(device)
	h = h / h.mean().clamp(min=1e-10) # normalize mean=1
	h_groups = h.view(-1, g) # [N//8, 8]
	h_avg = h_groups.mean(dim=0) # [8] per-element importance
	sqrt_h = h_avg.sqrt().unsqueeze(0) # [1, 8] for broadcasting
	else:
	sqrt_h = None
	h_avg = None

	# Transform groups to weighted space for efficient cdist
	groups_w = groups * sqrt_h if sqrt_h is not None else groups

	if codebooks is None:
	codebooks = torch.zeros(2, n_centroids, g, device=device, dtype=torch.float32)

	for cb_idx in range(2):
	if cb_idx == 0:
	data = groups
	else:
	nearest_0 = codebooks[0][codes_0.long()]
	data = groups - nearest_0

	data_w = data * sqrt_h if sqrt_h is not None else data

	# K-means init: random sample
	perm = torch.randperm(num_groups, device=device)[:n_centroids]
	centroids = data[perm].clone() # [256, g] in original space

	for _ in range(n_iters):
	# Distances in Hessian-weighted space via pre-scaled cdist
	cent_w = centroids * sqrt_h if sqrt_h is not None else centroids
	dists = torch.cdist(data_w, cent_w) # [G, 256]
	assignments = dists.argmin(dim=1)

	# Update centroids (Hessian-weighted mean in original space)
	for c in range(n_centroids):
	mask = (assignments == c)
	if mask.any():
	if h_avg is not None:
	# Weighted centroid: sum(h_avg * x) / sum(h_avg)
	d_masked = data[mask] # [count, g]
	centroids[c] = (h_avg * d_masked).sum(0) / (h_avg * mask.sum()).clamp(min=1e-10)
	else:
	centroids[c] = data[mask].mean(dim=0)

	codebooks[cb_idx] = centroids
	if cb_idx == 0:
	codes_0 = dists.argmin(dim=1).to(torch.uint8)

	codebooks = codebooks.to(torch.float16)
	else:
	codebooks = codebooks.to(device=device)

	# Assign codes using Hessian-weighted distances
	cb_float = codebooks.float()
	cb0_w = cb_float[0] * sqrt_h if sqrt_h is not None else cb_float[0]
	dists_0 = torch.cdist(groups_w, cb0_w)
	codes_0 = dists_0.argmin(dim=1).to(torch.uint8)

	residual = groups - cb_float[0][codes_0.long()]
	residual_w = residual * sqrt_h if sqrt_h is not None else residual
	cb1_w = cb_float[1] * sqrt_h if sqrt_h is not None else cb_float[1]
	dists_1 = torch.cdist(residual_w, cb1_w)
	codes_1 = dists_1.argmin(dim=1).to(torch.uint8)

	# Pack codes: [K, N//g, 2]
	codes = torch.stack([codes_0, codes_1], dim=1).view(K, N // g, 2)

	# Per-output-channel scale (Hessian-weighted norm ratio)
	reconstructed = (cb_float[0][codes_0.long()] + cb_float[1][codes_1.long()]).view(K, N)
	if hessian_diag is not None:
	h_row = hessian_diag.float().to(device).unsqueeze(0) # [1, N]
	row_norms_orig = (w * w * h_row).sum(dim=1).sqrt().clamp(min=1e-10)
	row_norms_recon = (reconstructed * reconstructed * h_row).sum(dim=1).sqrt().clamp(min=1e-10)
	else:
	row_norms_orig = w.norm(dim=1).clamp(min=1e-10)
	row_norms_recon = reconstructed.norm(dim=1).clamp(min=1e-10)
	scales = (row_norms_orig / row_norms_recon).to(torch.float16)

	return cls(
	codes=codes,
	codebooks=codebooks.to(torch.float16),
	scales=scales,
	shape=(K, N),
	)

	@classmethod
	def from_fexc(cls, fexc_weights: GoliathFEXCWeights) -> 'GoliathFEXVQWeights':
	"""Promote FE-XC weights to FE-XVQ (same data, different type tag)."""
	return cls(
	codes=fexc_weights.codes,
	codebooks=fexc_weights.codebooks,
	scales=fexc_weights.scales,
	shape=fexc_weights.shape,
	)

	def to_fexc(self) -> GoliathFEXCWeights:
	"""Downcast to FE-XC (for using FE-XC inference kernel)."""
	return GoliathFEXCWeights(
	codes=self.codes,
	codebooks=self.codebooks,
	scales=self.scales,
	shape=self.shape,
	)

	def to_float(self) -> torch.Tensor:
	"""Dequantize FE-XVQ weights back to FP32 (same as FE-XC)."""
	K, N = self.shape
	g = self.group_size
	cb = self.codebooks.float()
	codes_flat = self.codes.view(-1, 2).long()
	reconstructed = cb[0][codes_flat[:, 0]] + cb[1][codes_flat[:, 1]]
	reconstructed = reconstructed.view(K, N)
	reconstructed = reconstructed * self.scales.float().unsqueeze(1)
	return reconstructed

	def memory_bytes(self) -> int:
	"""Return memory usage in bytes."""
	return (self.codes.numel()
	+ self.codebooks.numel() * 2
	+ self.scales.numel() * 2)


	# Union type for dispatch
	GoliathWeights = Union[GoliathFP4Weights, GoliathFP8Weights, GoliathINT2Weights, GoliathFEXCWeights, GoliathFEXVQWeights]


	# =============================================================================
	# Triton JIT Helpers
	# =============================================================================

	@triton.jit
	def _int2_decode(packed_byte, offset):
	"""Decode 2-bit value from packed byte at given offset (0-3).

	packed_byte: uint8 with 4 packed 2-bit values
	offset: which 2-bit value to extract (0=bits 0-1, 1=bits 2-3, etc.)

	Returns: float32 value in {-2, -1, 0, 1}
	"""
	shift = offset * 2
	unsigned = (packed_byte >> shift) & 0x3 # Extract 2 bits
	return (unsigned.to(tl.float32) - 2.0) # Convert {0,1,2,3} -> {-2,-1,0,1}


	@triton.jit
	def _e2m1_decode(idx):
	"""Decode 4-bit E2M1 index -> float32."""
	sign = (idx >> 3) & 1
	exp = (idx >> 1) & 3
	mant = idx & 1
	is_normal = exp > 0
	subnormal_val = mant.to(tl.float32) * 0.5
	normal_val = (2 + mant).to(tl.float32) * tl.exp2((exp - 2).to(tl.float32))
	unsigned_val = tl.where(is_normal, normal_val, subnormal_val)
	return tl.where(sign != 0, -unsigned_val, unsigned_val)


	@triton.jit
	def _decode_e4m3_triton(raw_uint8):
	"""Decode E4M3 FP8 in Triton registers."""
	sign = (raw_uint8 >> 7) & 1
	exp = (raw_uint8 >> 3) & 0xF
	mant = raw_uint8 & 0x7
	is_normal = exp > 0
	normal_val = (8 + mant).to(tl.float32) * tl.exp2((exp - 10).to(tl.float32))
	subnormal_val = mant.to(tl.float32) * tl.exp2(tl.full(mant.shape, -9.0, tl.float32))
	unsigned = tl.where(is_normal, normal_val, subnormal_val)
	return tl.where(sign != 0, -unsigned, unsigned)


	# =============================================================================
	# Goliath FP4 Fused Dequant-MatMul Kernel
	# =============================================================================

	@triton.autotune(
	configs=[
	# --- Blackwell 5090 prefill configs (dual-SM, high occupancy) ---
	triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128}, num_stages=5, num_warps=16),
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=5, num_warps=16),
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128}, num_stages=5, num_warps=8),
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=4, num_warps=8),
	triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	# --- Decode-optimized (small M, maximize N-parallelism for 170 SMs) ---
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=5, num_warps=8),
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 256, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	triton.Config({'BLOCK_M': 32, 'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=5, num_warps=8),
	triton.Config({'BLOCK_M': 32, 'BLOCK_N': 256, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	# --- MoE expert small-matrix (768×2048 / 2048×768) ---
	# N=768: tiles must be 64 or 128 (768/64=12, 768/128=6)
	# K=768: smaller K needs more pipeline depth
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 64, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 128, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 64, 'BLOCK_K': 64}, num_stages=4, num_warps=2),
	triton.Config({'BLOCK_M': 32, 'BLOCK_N': 64, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
	triton.Config({'BLOCK_M': 32, 'BLOCK_N': 128, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
	],
	key=['M', 'N', 'K'],
	)
	@triton.jit
	def _goliath_fp4_kernel(
	a_ptr, # [M, K] activations
	w_packed_ptr, # [K//2, N] uint8 packed FP4
	w_scales_ptr, # [K//16, N] uint8 E4M3 scales
	out_ptr, # [M, N] output
	bias_ptr, # [N] optional bias
	tensor_scale, # FP32 per-tensor scale
	M, N, K,
	stride_am, stride_ak,
	stride_wk, stride_wn,
	stride_sk, stride_sn,
	stride_om, stride_on,
	HAS_BIAS: tl.constexpr,
	ACC_DTYPE: tl.constexpr, # 0=float32, 1=bfloat16
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	BLOCK_K: tl.constexpr,
	):
	"""
	Goliath FP4 fused dequant-matmul kernel.

	Loads packed E2M1 nibbles, dequantizes in registers via arithmetic decode,
	applies two-level E4M3 + FP32 scaling, and accumulates via tl.dot().
	The full dequantized weight matrix NEVER exists in global memory.
	"""
	pid_m = tl.program_id(0)
	pid_n = tl.program_id(1)

	offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
	offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

	acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

	HALF_BLOCK_K: tl.constexpr = BLOCK_K // 2
	SCALES_PER_TILE: tl.constexpr = BLOCK_K // 16

	for k_start in range(0, K, BLOCK_K):
	# Load A as even/odd column halves
	even_k = k_start + tl.arange(0, HALF_BLOCK_K) * 2
	odd_k = k_start + tl.arange(0, HALF_BLOCK_K) * 2 + 1

	a_even_ptrs = a_ptr + offs_m[:, None] * stride_am + even_k[None, :] * stride_ak
	a_odd_ptrs = a_ptr + offs_m[:, None] * stride_am + odd_k[None, :] * stride_ak
	mask_ae = (offs_m[:, None] < M) & (even_k[None, :] < K)
	mask_ao = (offs_m[:, None] < M) & (odd_k[None, :] < K)
	a_even = tl.load(a_even_ptrs, mask=mask_ae, other=0.0)
	a_odd = tl.load(a_odd_ptrs, mask=mask_ao, other=0.0)

	# Load packed weights
	pk_start = k_start // 2
	offs_pk = pk_start + tl.arange(0, HALF_BLOCK_K)
	w_ptrs = w_packed_ptr + offs_pk[:, None] * stride_wk + offs_n[None, :] * stride_wn
	mask_w = (offs_pk[:, None] < (K // 2)) & (offs_n[None, :] < N)
	packed = tl.load(w_ptrs, mask=mask_w, other=0).to(tl.int32)

	# Unpack + E2M1 decode in registers
	low_f = _e2m1_decode(packed & 0xF)
	high_f = _e2m1_decode((packed >> 4) & 0xF)

	# Load E4M3 scales (16-element groups = 8 packed rows each)
	scale_start = k_start // 16
	offs_local = tl.arange(0, HALF_BLOCK_K)
	group_idx = offs_local // 8

	scale_bc = tl.zeros((HALF_BLOCK_K, BLOCK_N), dtype=tl.float32)
	for sg in tl.static_range(0, SCALES_PER_TILE):
	sg_row = scale_start + sg
	sg_ptrs = w_scales_ptr + sg_row * stride_sk + offs_n * stride_sn
	sg_mask = (sg_row < (K // 16)) & (offs_n < N)
	sg_raw = tl.load(sg_ptrs, mask=sg_mask, other=0).to(tl.int32)
	sg_val = _decode_e4m3_triton(sg_raw) * tensor_scale
	sg_match = (group_idx == sg)
	scale_bc = tl.where(sg_match[:, None], sg_val[None, :], scale_bc)

	# Apply scales and cast
	w_even = (low_f * scale_bc).to(tl.bfloat16)
	w_odd = (high_f * scale_bc).to(tl.bfloat16)

	# Two half-sized dot products
	acc += tl.dot(a_even.to(tl.bfloat16), w_even)
	acc += tl.dot(a_odd.to(tl.bfloat16), w_odd)

	# Bias
	if HAS_BIAS:
	bias = tl.load(bias_ptr + offs_n, mask=offs_n < N, other=0.0).to(tl.float32)
	acc += bias[None, :]

	# Store
	out_ptrs = out_ptr + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
	mask_out = (offs_m[:, None] < M) & (offs_n[None, :] < N)
	tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_out)


	# =============================================================================
	# Goliath FP4 Native dot_scaled Kernel (tcgen05.mma.mxf4)
	# =============================================================================
	#
	# Activates only when Triton maps tl.dot_scaled to real FP4 tensor cores
	# (SM >= 10.0 + Triton with MXFP4 support). On fallback hardware the probe
	# function _can_use_goliath_dot_scaled() returns False and this path is skipped.
	#

	@triton.autotune(
	configs=[
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=3, num_warps=8),
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 256}, num_stages=3, num_warps=8),
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 256}, num_stages=3, num_warps=8),
	],
	key=['M', 'N', 'K'],
	)
	@triton.jit
	def _goliath_fp4_dot_scaled_kernel(
	a_ptr, # [M, K] BF16 activations
	w_packed_ptr, # [N, K//2] uint8 packed E2M1 (col-major)
	w_scales_ptr, # [N, K//32] uint8 E8M0 scales
	out_ptr, # [M, N] output
	bias_ptr, # [N] optional bias
	M, N, K,
	stride_am, stride_ak,
	stride_wn, stride_wk, # col-major: row=N, col=K//2
	stride_sn, stride_sk, # col-major: row=N, col=K//32
	stride_om, stride_on,
	HAS_BIAS: tl.constexpr,
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	BLOCK_K: tl.constexpr,
	):
	"""
	Goliath FP4 dot_scaled kernel — uses tl.dot_scaled for native MXFP4 TCs.

	LHS = BF16 activations (no scale), RHS = E2M1 packed weights with E8M0 scales.
	When Triton maps this to tcgen05.mma.mxf4, throughput reaches 200-800+ TFLOPS.
	"""
	pid_m = tl.program_id(0)
	pid_n = tl.program_id(1)

	offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
	offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

	acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

	HALF_BLOCK_K: tl.constexpr = BLOCK_K // 2
	SCALES_PER_TILE: tl.constexpr = BLOCK_K // 32

	for k_start in range(0, K, BLOCK_K):
	# --- LHS: load A tile [BLOCK_M, BLOCK_K] BF16 ---
	offs_k = k_start + tl.arange(0, BLOCK_K)
	a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
	mask_a = (offs_m[:, None] < M) & (offs_k[None, :] < K)
	a_tile = tl.load(a_ptrs, mask=mask_a, other=0.0).to(tl.bfloat16)

	# --- RHS: load packed weights [BLOCK_N, BLOCK_K//2] uint8 ---
	pk_start = k_start // 2
	offs_pk = pk_start + tl.arange(0, HALF_BLOCK_K)
	w_ptrs = w_packed_ptr + offs_n[:, None] * stride_wn + offs_pk[None, :] * stride_wk
	mask_w = (offs_n[:, None] < N) & (offs_pk[None, :] < (K // 2))
	b_tile = tl.load(w_ptrs, mask=mask_w, other=0) # [BLOCK_N, HALF_BLOCK_K]

	# --- RHS scales: load E8M0 [BLOCK_N, SCALES_PER_TILE] ---
	sc_start = k_start // 32
	offs_sc = sc_start + tl.arange(0, SCALES_PER_TILE)
	s_ptrs = w_scales_ptr + offs_n[:, None] * stride_sn + offs_sc[None, :] * stride_sk
	mask_s = (offs_n[:, None] < N) & (offs_sc[None, :] < (K // 32))
	b_scale = tl.load(s_ptrs, mask=mask_s, other=127) # [BLOCK_N, SCALES_PER_TILE]

	# --- tl.dot_scaled: A (bf16, no scale) × B^T (e2m1, e8m0 scale) ---
	# b_tile is [BLOCK_N, HALF_BLOCK_K], transposed for RHS of dot_scaled
	acc = tl.dot_scaled(a_tile, None, "bf16",
	b_tile.T, b_scale, "e2m1",
	acc)

	# Bias
	if HAS_BIAS:
	bias = tl.load(bias_ptr + offs_n, mask=offs_n < N, other=0.0).to(tl.float32)
	acc += bias[None, :]

	# Store
	out_ptrs = out_ptr + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
	mask_out = (offs_m[:, None] < M) & (offs_n[None, :] < N)
	tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_out)


	# =============================================================================
	# Goliath FP8 Fused Dequant-MatMul Kernel
	# =============================================================================

	@triton.autotune(
	configs=[
	# --- Blackwell 5090 prefill configs (dual-SM, high occupancy) ---
	triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128}, num_stages=5, num_warps=16),
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=5, num_warps=16),
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128}, num_stages=5, num_warps=8),
	triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=4, num_warps=8),
	triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	# --- Decode-optimized (small M, maximize N-parallelism for 170 SMs) ---
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=5, num_warps=8),
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 256, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	triton.Config({'BLOCK_M': 32, 'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=5, num_warps=8),
	triton.Config({'BLOCK_M': 32, 'BLOCK_N': 256, 'BLOCK_K': 64}, num_stages=5, num_warps=4),
	# --- MoE expert small-matrix (768×2048 / 2048×768) ---
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 64, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 128, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
	triton.Config({'BLOCK_M': 16, 'BLOCK_N': 64, 'BLOCK_K': 64}, num_stages=4, num_warps=2),
	triton.Config({'BLOCK_M': 32, 'BLOCK_N': 64, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
	triton.Config({'BLOCK_M': 32, 'BLOCK_N': 128, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
	],
	key=['M', 'N', 'K'],
	)
	@triton.jit
	def _goliath_fp8_kernel(
	a_ptr, # [M, K] activations (BF16)
	w_data_ptr, # [K, N] uint8 E4M3 encoded weights
	w_scales_ptr, # [K//32, N] float32 per-block scales
	out_ptr, # [M, N] output
	bias_ptr, # [N] optional bias
	M, N, K,
	stride_am, stride_ak,
	stride_wk, stride_wn,
	stride_sk, stride_sn,
	stride_om, stride_on,
	HAS_BIAS: tl.constexpr,
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	BLOCK_K: tl.constexpr,
	):
	"""
	Goliath FP8 fused dequant-matmul kernel.

	Loads E4M3-encoded uint8 weights, dequantizes in registers via
	_decode_e4m3_triton, applies per-block FP32 scales, accumulates via tl.dot().
	No separate dequantization step — everything happens in-register.
	32-element scale blocks for shared-memory-friendly tile sizes.
	"""
	pid_m = tl.program_id(0)
	pid_n = tl.program_id(1)

	offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
	offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

	acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

	SCALES_PER_TILE: tl.constexpr = BLOCK_K // 32

	for k_start in range(0, K, BLOCK_K):
	# Load A tile
	offs_k = k_start + tl.arange(0, BLOCK_K)
	a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
	mask_a = (offs_m[:, None] < M) & (offs_k[None, :] < K)
	a_tile = tl.load(a_ptrs, mask=mask_a, other=0.0)

	# Load weight tile as uint8
	w_ptrs = w_data_ptr + offs_k[:, None] * stride_wk + offs_n[None, :] * stride_wn
	mask_w = (offs_k[:, None] < K) & (offs_n[None, :] < N)
	w_raw = tl.load(w_ptrs, mask=mask_w, other=0).to(tl.int32)

	# Decode E4M3 in registers
	w_decoded = _decode_e4m3_triton(w_raw) # [BLOCK_K, BLOCK_N] float32

	# Apply per-block scales (32-element groups)
	scale_start = k_start // 32
	offs_local_k = tl.arange(0, BLOCK_K)
	group_idx = offs_local_k // 32

	scale_bc = tl.zeros((BLOCK_K, BLOCK_N), dtype=tl.float32)
	for sg in tl.static_range(0, SCALES_PER_TILE):
	sg_row = scale_start + sg
	sg_ptrs = w_scales_ptr + sg_row * stride_sk + offs_n * stride_sn
	sg_mask = (sg_row < (K // 32)) & (offs_n < N)
	sg_val = tl.load(sg_ptrs, mask=sg_mask, other=1.0) # [BLOCK_N] float32
	sg_match = (group_idx == sg)
	scale_bc = tl.where(sg_match[:, None], sg_val[None, :], scale_bc)

	w_scaled = (w_decoded * scale_bc).to(tl.bfloat16)

	# Matmul accumulate
	acc += tl.dot(a_tile.to(tl.bfloat16), w_scaled)

	# Bias
	if HAS_BIAS:
	bias = tl.load(bias_ptr + offs_n, mask=offs_n < N, other=0.0).to(tl.float32)
	acc += bias[None, :]

	# Store
	out_ptrs = out_ptr + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
	mask_out = (offs_m[:, None] < M) & (offs_n[None, :] < N)
	tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_out)


	# =============================================================================
	# Kernel Wrappers
	# =============================================================================

	def _goliath_fp4_matmul(
	activations: torch.Tensor,
	weights: GoliathFP4Weights,
	bias: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	"""Launch Goliath FP4 fused dequant-matmul kernel.

	Dispatches to native dot_scaled path when available (SM >= 10.0 +
	Triton MXFP4 support), otherwise uses the manual dequant kernel.
	"""
	# Tier 1: Native dot_scaled path (SM >= 10.0 + real FP4 tensor cores)
	if _can_use_goliath_dot_scaled():
	native_w = GoliathFP4NativeWeights.from_goliath_fp4(weights)
	return _goliath_fp4_dot_scaled_matmul(activations, native_w, bias)

	# Tier 1.5: NVFP4 bridge (zero-copy, identical storage layout)
	# packed [K//2, N] uint8, block_scales [K//16, N] E4M3, tensor_scale FP32
	if _NVFP4_BRIDGE_AVAILABLE:
	nvfp4_w = _BridgeNVFP4Weights(
	packed=weights.packed,
	block_scales=weights.block_scales,
	tensor_scale=weights.tensor_scale,
	shape=weights.shape,
	residual=weights.residual,
	residual_scales=weights.residual_scales,
	)
	# Prefer native cuBLAS FP4 on Blackwell (5th-gen tensor cores)
	if _bridge_can_use_cublas_fp4 and _bridge_can_use_cublas_fp4():
	return _bridge_scaled_mm_fp4(activations, nvfp4_w, bias)
	return _bridge_fused_nvfp4(activations, nvfp4_w, bias)

	# Tier 2: Manual dequant fallback
	M, K = activations.shape
	_, N = weights.shape

	a = activations.contiguous()
	if a.dtype != torch.bfloat16:
	a = a.to(torch.bfloat16)

	w_packed = weights.packed.contiguous()
	w_scales = weights.block_scales.contiguous()

	out = torch.empty(M, N, device=a.device, dtype=torch.bfloat16)

	has_bias = bias is not None
	if has_bias:
	bias_t = bias.contiguous().float()
	else:
	bias_t = torch.empty(0, device=a.device, dtype=torch.float32)

	grid = lambda META: (
	triton.cdiv(M, META['BLOCK_M']),
	triton.cdiv(N, META['BLOCK_N']),
	)

	_goliath_fp4_kernel[grid](
	a, w_packed, w_scales, out, bias_t,
	weights.tensor_scale,
	M, N, K,
	a.stride(0), a.stride(1),
	w_packed.stride(0), w_packed.stride(1),
	w_scales.stride(0), w_scales.stride(1),
	out.stride(0), out.stride(1),
	HAS_BIAS=has_bias,
	ACC_DTYPE=0,
	)

	return out


	def _goliath_fp8_matmul(
	activations: torch.Tensor,
	weights: GoliathFP8Weights,
	bias: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	"""Launch Goliath FP8 fused dequant-matmul kernel."""
	M, K = activations.shape
	_, N = weights.shape

	a = activations.contiguous()
	if a.dtype != torch.bfloat16:
	a = a.to(torch.bfloat16)

	w_data = weights.data.contiguous()
	w_scales = weights.block_scales.contiguous()

	out = torch.empty(M, N, device=a.device, dtype=torch.bfloat16)

	has_bias = bias is not None
	if has_bias:
	bias_t = bias.contiguous().float()
	else:
	bias_t = torch.empty(0, device=a.device, dtype=torch.float32)

	grid = lambda META: (
	triton.cdiv(M, META['BLOCK_M']),
	triton.cdiv(N, META['BLOCK_N']),
	)

	_goliath_fp8_kernel[grid](
	a, w_data, w_scales, out, bias_t,
	M, N, K,
	a.stride(0), a.stride(1),
	w_data.stride(0), w_data.stride(1),
	w_scales.stride(0), w_scales.stride(1),
	out.stride(0), out.stride(1),
	HAS_BIAS=has_bias,
	)

	return out


	# =============================================================================
	# dot_scaled Wrapper + Probe
	# =============================================================================

	def _goliath_fp4_dot_scaled_matmul(
	activations: torch.Tensor,
	weights: GoliathFP4NativeWeights,
	bias: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	"""Launch Goliath FP4 dot_scaled kernel (native MXFP4 tensor cores)."""
	M, K = activations.shape
	_, N = weights.shape

	a = activations.contiguous()
	if a.dtype != torch.bfloat16:
	a = a.to(torch.bfloat16)

	w_packed = weights.packed_col_major.contiguous() # [N, K//2]
	w_scales = weights.scales_e8m0.contiguous() # [N, K//32]

	out = torch.empty(M, N, device=a.device, dtype=torch.bfloat16)

	has_bias = bias is not None
	if has_bias:
	bias_t = bias.contiguous().float()
	else:
	bias_t = torch.empty(0, device=a.device, dtype=torch.float32)

	grid = lambda META: (
	triton.cdiv(M, META['BLOCK_M']),
	triton.cdiv(N, META['BLOCK_N']),
	)

	_goliath_fp4_dot_scaled_kernel[grid](
	a, w_packed, w_scales, out, bias_t,
	M, N, K,
	a.stride(0), a.stride(1),
	w_packed.stride(0), w_packed.stride(1),
	w_scales.stride(0), w_scales.stride(1),
	out.stride(0), out.stride(1),
	HAS_BIAS=has_bias,
	)

	return out


	# Cached probe result
	_DOT_SCALED_PROBE_RESULT: Optional[bool] = None


	def _can_use_goliath_dot_scaled() -> bool:
	"""Probe whether tl.dot_scaled maps to real FP4 tensor cores.

	Checks:
	1. SM >= 10.0 (Blackwell or later)
	2. tl.dot_scaled exists in Triton
	3. Runtime probe: run a small matmul with both the manual-dequant kernel
	and the dot_scaled kernel. If outputs are bit-identical, Triton is
	falling back to BF16 MMA → return False. If outputs differ with
	rel_err < 0.15 → native FP4 TCs are active → return True.
	"""
	global _DOT_SCALED_PROBE_RESULT
	if _DOT_SCALED_PROBE_RESULT is not None:
	return _DOT_SCALED_PROBE_RESULT

	_DOT_SCALED_PROBE_RESULT = False

	# Check 1: SM capability
	if not torch.cuda.is_available():
	return False
	cap = torch.cuda.get_device_capability(0)
	if cap[0] < 10:
	return False

	# Check 2: Triton API
	if not hasattr(tl, 'dot_scaled'):
	return False

	# Check 3: Runtime probe with small matmul
	try:
	M, N, K = 64, 64, 64
	torch.manual_seed(42)
	w_f = torch.randn(K, N, device='cuda', dtype=torch.float32)
	a = torch.randn(M, K, device='cuda', dtype=torch.bfloat16)

	# Quantize via Goliath FP4
	w_q = GoliathFP4Weights.from_float(w_f)

	# Path A: manual dequant kernel
	out_manual = _goliath_fp4_matmul(a, w_q)

	# Path B: dot_scaled kernel
	w_native = GoliathFP4NativeWeights.from_goliath_fp4(w_q)
	out_scaled = _goliath_fp4_dot_scaled_matmul(a, w_native)

	# Compare
	if torch.equal(out_manual, out_scaled):
	# Bit-identical → Triton using BF16 fallback
	_DOT_SCALED_PROBE_RESULT = False
	else:
	ref = out_manual.float()
	diff = (out_scaled.float() - ref).abs().mean()
	rel_err = diff / ref.abs().mean().clamp(min=1e-10)
	_DOT_SCALED_PROBE_RESULT = rel_err.item() < 0.15

	except Exception:
	_DOT_SCALED_PROBE_RESULT = False

	return _DOT_SCALED_PROBE_RESULT


	# =============================================================================
	# Auto-dispatch: FP4 vs FP8 selection
	# =============================================================================

	def _estimate_accuracy_need(weights: torch.Tensor) -> int:
	"""
	Estimate whether FP4 or FP8 is appropriate for these weights.

	Heuristic: if the weight distribution has high kurtosis (heavy tails)
	or many outliers, FP8 preserves more fidelity. Otherwise FP4 suffices.

	Returns:
	4 for FP4, 8 for FP8
	"""
	w = weights.float()
	absmax = w.abs().max()
	mean_abs = w.abs().mean()

	# Outlier ratio: if max >> mean, distribution has heavy tails
	outlier_ratio = absmax / mean_abs.clamp(min=1e-10)

	# High kurtosis → FP8 for better tail representation
	if outlier_ratio > 20.0:
	return 8

	# Check what fraction of values fall outside FP4 representable range
	# after scaling. FP4 has only 16 levels — coarse quantization.
	std = w.std()
	# If std is very small relative to max, distribution is spiky → FP8
	if std / absmax.clamp(min=1e-10) < 0.05:
	return 8

	return 4


	# =============================================================================
	# Public API
	# =============================================================================

	def goliath_quantize(
	weights: torch.Tensor,
	bits: Union[int, str] = 4,
	training: bool = False,
	sr_seed: Optional[int] = None,
	compute_residual: bool = False,
	) -> GoliathWeights:
	"""
	Quantize weights for Goliath fused GEMM.

	Args:
	weights: Input tensor [K, N] in any float dtype
	bits: 4 for FP4, 8 for FP8, 'auto' for automatic selection
	training: If True, use stochastic rounding for unbiased quantization
	sr_seed: Seed for stochastic rounding RNG
	compute_residual: Compute FP8 residual correction for FP4 weights (double-buff)

	Returns:
	GoliathFP4Weights or GoliathFP8Weights
	"""
	if bits == 'auto':
	bits = _estimate_accuracy_need(weights)

	if bits == 4:
	return GoliathFP4Weights.from_float(weights, training=training, sr_seed=sr_seed,
	compute_residual=compute_residual)
	elif bits == 8:
	return GoliathFP8Weights.from_float(weights)
	else:
	raise ValueError(f"bits must be 4, 8, or 'auto', got {bits}")


	# =============================================================================
	# Goliath Multi-Expert Fused Kernel (all active experts in ONE launch)
	# =============================================================================
	#
	# For MoE single-token decode: instead of launching N separate Goliath kernels
	# (one per active expert), this kernel processes ALL active experts' matmuls
	# in a single kernel launch. Grid dim 0 = expert index, dim 1 = output col tiles.
	# Each expert has its own packed weight, block_scales, and tensor_scale.
	#
	# This eliminates kernel launch overhead which dominates at M=1:
	# - Before: 8 experts × 2 projections = 16 launches per MoE layer × 48 = 768
	# - After: 1 launch for gate_up + 1 launch for down = 2 per layer × 48 = 96
	#

	MAX_EXPERTS: int = 16 # max active experts per launch (Qwen3: 8)

	@triton.jit
	def _goliath_fp4_multi_expert_kernel(
	a_ptr, # [E*M, K] or [M, K] activations
	# Expert weight pointers (padded to MAX_EXPERTS)
	w0_ptr, w1_ptr, w2_ptr, w3_ptr, w4_ptr, w5_ptr, w6_ptr, w7_ptr,
	# Expert scale pointers
	s0_ptr, s1_ptr, s2_ptr, s3_ptr, s4_ptr, s5_ptr, s6_ptr, s7_ptr,
	# Per-expert tensor scales (passed as array in global memory)
	tscale_ptr,
	out_ptr, # [num_experts, M, N] output (expert-batched)
	M, N, K,
	num_experts,
	stride_am, stride_ak,
	stride_wk, stride_wn,
	stride_sk, stride_sn,
	stride_oe, stride_om, stride_on,
	a_expert_stride, # 0 = shared input, >0 = per-expert input (row offset)
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	BLOCK_K: tl.constexpr,
	):
	"""Multi-expert FP4 fused dequant-matmul: all experts in ONE launch.

	When a_expert_stride=0: all experts share the same input (gate_up case).
	When a_expert_stride>0: each expert reads from row pid_expert*M of a_ptr (down case).
	"""
	pid_expert = tl.program_id(0)
	pid_n = tl.program_id(1)

	if pid_expert >= num_experts:
	return

	# Per-expert activation offset
	a_base = a_ptr + pid_expert * a_expert_stride

	# Select weight pointer for this expert
	w_packed_ptr = w0_ptr
	w_scales_ptr = s0_ptr
	if pid_expert == 1:
	w_packed_ptr = w1_ptr; w_scales_ptr = s1_ptr
	elif pid_expert == 2:
	w_packed_ptr = w2_ptr; w_scales_ptr = s2_ptr
	elif pid_expert == 3:
	w_packed_ptr = w3_ptr; w_scales_ptr = s3_ptr
	elif pid_expert == 4:
	w_packed_ptr = w4_ptr; w_scales_ptr = s4_ptr
	elif pid_expert == 5:
	w_packed_ptr = w5_ptr; w_scales_ptr = s5_ptr
	elif pid_expert == 6:
	w_packed_ptr = w6_ptr; w_scales_ptr = s6_ptr
	elif pid_expert == 7:
	w_packed_ptr = w7_ptr; w_scales_ptr = s7_ptr

	tensor_scale = tl.load(tscale_ptr + pid_expert)

	offs_m = tl.arange(0, BLOCK_M)
	offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

	acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

	HALF_BLOCK_K: tl.constexpr = BLOCK_K // 2
	SCALES_PER_TILE: tl.constexpr = BLOCK_K // 16

	for k_start in range(0, K, BLOCK_K):
	even_k = k_start + tl.arange(0, HALF_BLOCK_K) * 2
	odd_k = k_start + tl.arange(0, HALF_BLOCK_K) * 2 + 1

	a_even_ptrs = a_base + offs_m[:, None] * stride_am + even_k[None, :] * stride_ak
	a_odd_ptrs = a_base + offs_m[:, None] * stride_am + odd_k[None, :] * stride_ak
	mask_ae = (offs_m[:, None] < M) & (even_k[None, :] < K)
	mask_ao = (offs_m[:, None] < M) & (odd_k[None, :] < K)
	a_even = tl.load(a_even_ptrs, mask=mask_ae, other=0.0)
	a_odd = tl.load(a_odd_ptrs, mask=mask_ao, other=0.0)

	pk_start = k_start // 2
	offs_pk = pk_start + tl.arange(0, HALF_BLOCK_K)
	w_ptrs = w_packed_ptr + offs_pk[:, None] * stride_wk + offs_n[None, :] * stride_wn
	mask_w = (offs_pk[:, None] < (K // 2)) & (offs_n[None, :] < N)
	packed = tl.load(w_ptrs, mask=mask_w, other=0).to(tl.int32)

	low_f = _e2m1_decode(packed & 0xF)
	high_f = _e2m1_decode((packed >> 4) & 0xF)

	scale_start = k_start // 16
	offs_local = tl.arange(0, HALF_BLOCK_K)
	group_idx = offs_local // 8

	scale_bc = tl.zeros((HALF_BLOCK_K, BLOCK_N), dtype=tl.float32)
	for sg in tl.static_range(0, SCALES_PER_TILE):
	sg_row = scale_start + sg
	sg_ptrs = w_scales_ptr + sg_row * stride_sk + offs_n * stride_sn
	sg_mask = (sg_row < (K // 16)) & (offs_n < N)
	sg_raw = tl.load(sg_ptrs, mask=sg_mask, other=0).to(tl.int32)
	sg_val = _decode_e4m3_triton(sg_raw) * tensor_scale
	sg_match = (group_idx == sg)
	scale_bc = tl.where(sg_match[:, None], sg_val[None, :], scale_bc)

	w_even = (low_f * scale_bc).to(tl.bfloat16)
	w_odd = (high_f * scale_bc).to(tl.bfloat16)

	acc += tl.dot(a_even.to(tl.bfloat16), w_even)
	acc += tl.dot(a_odd.to(tl.bfloat16), w_odd)

	# Store to expert-batched output [num_experts, M, N]
	out_ptrs = out_ptr + pid_expert * stride_oe + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
	mask_out = (offs_m[:, None] < M) & (offs_n[None, :] < N)
	tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_out)


	def goliath_multi_expert_gemm(
	activations: torch.Tensor,
	expert_weights: list,
	num_experts: int = 8,
	per_expert_input: bool = False,
	) -> torch.Tensor:
	"""
	Fused multi-expert FP4 GEMM: all experts in ONE kernel launch.

	Args:
	activations: Input in BF16.
	If per_expert_input=False: [M, K] shared across all experts (gate_up case)
	If per_expert_input=True: [num_experts*M, K] stacked per-expert inputs (down case)
	expert_weights: List of GoliathFP4Weights (one per active expert)
	num_experts: Number of active experts
	per_expert_input: If True, each expert reads from its own M rows of activations

	Returns:
	Output [num_experts, M, N] in BF16
	"""
	assert num_experts <= MAX_EXPERTS, f"max {MAX_EXPERTS} experts, got {num_experts}"
	assert all(isinstance(w, GoliathFP4Weights) for w in expert_weights)

	_, N = expert_weights[0].shape

	a = activations.contiguous()
	if a.dtype != torch.bfloat16:
	a = a.to(torch.bfloat16)

	if per_expert_input:
	# activations is [num_experts*M, K], each expert gets M rows
	total_rows, K = a.shape
	M = total_rows // num_experts
	a_expert_stride = M * a.stride(0) # byte offset between expert inputs
	else:
	M, K = a.shape
	a_expert_stride = 0 # shared input

	out = torch.empty(num_experts, M, N, device=a.device, dtype=torch.bfloat16)

	# Collect tensor scales into a GPU tensor
	tscales = torch.tensor(
	[w.tensor_scale for w in expert_weights],
	dtype=torch.float32, device=a.device)

	packed_ptrs = [w.packed.contiguous() for w in expert_weights]
	scale_ptrs = [w.block_scales.contiguous() for w in expert_weights]

	# Pad to 8 with dummy (first expert repeated)
	while len(packed_ptrs) < 8:
	packed_ptrs.append(packed_ptrs[0])
	scale_ptrs.append(scale_ptrs[0])

	grid = (num_experts, triton.cdiv(N, 64))

	_goliath_fp4_multi_expert_kernel[grid](
	a,
	packed_ptrs[0], packed_ptrs[1], packed_ptrs[2], packed_ptrs[3],
	packed_ptrs[4], packed_ptrs[5], packed_ptrs[6], packed_ptrs[7],
	scale_ptrs[0], scale_ptrs[1], scale_ptrs[2], scale_ptrs[3],
	scale_ptrs[4], scale_ptrs[5], scale_ptrs[6], scale_ptrs[7],
	tscales, out,
	M, N, K, num_experts,
	a.stride(0), a.stride(1),
	packed_ptrs[0].stride(0), packed_ptrs[0].stride(1),
	scale_ptrs[0].stride(0), scale_ptrs[0].stride(1),
	out.stride(0), out.stride(1), out.stride(2),
	a_expert_stride,
	BLOCK_M=16, BLOCK_N=64, BLOCK_K=128,
	)

	return out


	def goliath_gemm(
	activations: torch.Tensor,
	weights: GoliathWeights,
	bias: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	"""
	Goliath fused dequant-matmul GEMM.

	Automatically dispatches to FP4 or FP8 kernel based on weight type.
	All dequantization happens in Triton registers — zero extra memory traffic.

	Args:
	activations: Input [M, K] in BF16/FP16/FP32
	weights: GoliathFP4Weights or GoliathFP8Weights
	bias: Optional bias [N]

	Returns:
	Output [M, N] in BF16
	"""
	if not activations.is_cuda:
	# CPU fallback
	w_deq = weights.to_float()
	d = torch.matmul(activations.float(), w_deq)
	if bias is not None:
	d = d + bias.float()
	return d.to(torch.bfloat16)

	if isinstance(weights, GoliathFP4Weights):
	return _goliath_fp4_matmul(activations, weights, bias)
	elif isinstance(weights, GoliathFP8Weights):
	return _goliath_fp8_matmul(activations, weights, bias)
	else:
	raise TypeError(f"Expected GoliathFP4Weights or GoliathFP8Weights, got {type(weights)}")


	# =============================================================================
	# Packed MoE Kernel — contiguous expert buffer + GPU-resident expert IDs
	# =============================================================================
	#
	# Instead of passing 8 separate weight pointers, pack ALL 128 experts' FP4
	# weights into contiguous [E, K//2, N] buffers. Expert selection reads from
	# a GPU tensor — zero .item() calls, zero CPU-GPU sync, CUDA-graph-safe.
	#
	# Benefits over goliath_multi_expert_gemm:
	# 1. No .item() calls (8 per layer × 48 layers = 384 CPU syncs eliminated)
	# 2. No Python weight-collection loops (~100μs × 48 layers saved)
	# 3. CUDA-graph-capturable (all inputs are static-address GPU tensors)
	# 4. Better L2 locality from contiguous weight storage
	#

	@triton.jit
	def _goliath_fp4_packed_moe_kernel(
	a_ptr, # [M, K] or [num_active*M, K] activations
	packed_w_ptr, # [E_total, K//2, N] contiguous FP4 weights
	packed_s_ptr, # [E_total, K//16, N] contiguous FP8 scales
	tscale_ptr, # [E_total] tensor scales (float32)
	expert_ids_ptr, # [num_active] selected expert indices (GPU tensor!)
	out_ptr, # [num_active, M, N] output
	M, N, K,
	num_active, # number of active experts (e.g. 8)
	stride_ew, # expert stride for packed weights (K//2 * N)
	stride_wk, stride_wn,
	stride_es, # expert stride for packed scales (K//16 * N)
	stride_sk, stride_sn,
	stride_oe, stride_om, stride_on,
	stride_am, stride_ak,
	a_expert_stride, # 0 = shared input, >0 = per-expert activation stride
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	BLOCK_K: tl.constexpr,
	):
	"""Packed MoE FP4 fused dequant-matmul: GPU-resident expert selection.

	Grid: (num_active, ceil(N/BLOCK_N), ceil(M/BLOCK_M))
	Each program handles one active expert's matmul for one tile of output rows × columns.
	Expert IDs are read from a GPU tensor — no CPU involvement.
	"""
	pid_active = tl.program_id(0) # which active expert (0..num_active-1)
	pid_n = tl.program_id(1) # output column tile
	pid_m = tl.program_id(2) # output row tile

	if pid_active >= num_active:
	return

	# Read expert index from GPU tensor (NO .item()!)
	expert_id = tl.load(expert_ids_ptr + pid_active)

	# Compute base pointers for this expert
	w_packed_ptr = packed_w_ptr + expert_id * stride_ew
	w_scales_ptr = packed_s_ptr + expert_id * stride_es
	tensor_scale = tl.load(tscale_ptr + expert_id)

	# Activation base (shared or per-expert)
	a_base = a_ptr + pid_active * a_expert_stride

	offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
	offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

	acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

	HALF_BLOCK_K: tl.constexpr = BLOCK_K // 2
	SCALES_PER_TILE: tl.constexpr = BLOCK_K // 16

	for k_start in range(0, K, BLOCK_K):
	even_k = k_start + tl.arange(0, HALF_BLOCK_K) * 2
	odd_k = k_start + tl.arange(0, HALF_BLOCK_K) * 2 + 1

	a_even_ptrs = a_base + offs_m[:, None] * stride_am + even_k[None, :] * stride_ak
	a_odd_ptrs = a_base + offs_m[:, None] * stride_am + odd_k[None, :] * stride_ak
	mask_ae = (offs_m[:, None] < M) & (even_k[None, :] < K)
	mask_ao = (offs_m[:, None] < M) & (odd_k[None, :] < K)
	a_even = tl.load(a_even_ptrs, mask=mask_ae, other=0.0)
	a_odd = tl.load(a_odd_ptrs, mask=mask_ao, other=0.0)

	pk_start = k_start // 2
	offs_pk = pk_start + tl.arange(0, HALF_BLOCK_K)
	w_ptrs = w_packed_ptr + offs_pk[:, None] * stride_wk + offs_n[None, :] * stride_wn
	mask_w = (offs_pk[:, None] < (K // 2)) & (offs_n[None, :] < N)
	packed = tl.load(w_ptrs, mask=mask_w, other=0).to(tl.int32)

	low_f = _e2m1_decode(packed & 0xF)
	high_f = _e2m1_decode((packed >> 4) & 0xF)

	scale_start = k_start // 16
	offs_local = tl.arange(0, HALF_BLOCK_K)
	group_idx = offs_local // 8

	scale_bc = tl.zeros((HALF_BLOCK_K, BLOCK_N), dtype=tl.float32)
	for sg in tl.static_range(0, SCALES_PER_TILE):
	sg_row = scale_start + sg
	sg_ptrs = w_scales_ptr + sg_row * stride_sk + offs_n * stride_sn
	sg_mask = (sg_row < (K // 16)) & (offs_n < N)
	sg_raw = tl.load(sg_ptrs, mask=sg_mask, other=0).to(tl.int32)
	sg_val = _decode_e4m3_triton(sg_raw) * tensor_scale
	sg_match = (group_idx == sg)
	scale_bc = tl.where(sg_match[:, None], sg_val[None, :], scale_bc)

	w_even = (low_f * scale_bc).to(tl.bfloat16)
	w_odd = (high_f * scale_bc).to(tl.bfloat16)

	acc += tl.dot(a_even.to(tl.bfloat16), w_even)
	acc += tl.dot(a_odd.to(tl.bfloat16), w_odd)

	# Store to expert-batched output [num_active, M, N]
	out_ptrs = out_ptr + pid_active * stride_oe + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
	mask_out = (offs_m[:, None] < M) & (offs_n[None, :] < N)
	tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_out)


	def goliath_packed_moe_gemm(
	activations: torch.Tensor,
	packed_w: torch.Tensor, # [E_total, K//2, N] uint8
	packed_s: torch.Tensor, # [E_total, K//16, N] uint8
	packed_ts: torch.Tensor, # [E_total] float32
	expert_ids: torch.Tensor, # [num_active] int64 on GPU
	num_active: int = 8,
	per_expert_input: bool = False,
	) -> torch.Tensor:
	"""Packed MoE FP4 GEMM: contiguous expert buffer + GPU-resident expert IDs.

	All expert selection happens on GPU — zero .item() calls, CUDA-graph-safe.

	Args:
	activations: Input in BF16.
	If per_expert_input=False: [M, K] shared across all experts
	If per_expert_input=True: [num_active*M, K] stacked per-expert inputs
	packed_w: [E_total, K//2, N] contiguous packed FP4 weights for all experts
	packed_s: [E_total, K//16, N] contiguous FP8 block scales
	packed_ts: [E_total] per-expert tensor scales (float32)
	expert_ids: [num_active] selected expert indices (GPU tensor, int64)
	num_active: Number of active experts
	per_expert_input: If True, each expert reads from its own M rows

	Returns:
	Output [num_active, M, N] in BF16
	"""
	N = packed_w.shape[2] # N is the last dim of packed weights

	a = activations.contiguous()
	if a.dtype != torch.bfloat16:
	a = a.to(torch.bfloat16)

	if per_expert_input:
	total_rows, K = a.shape
	M = total_rows // num_active
	a_expert_stride = M * a.stride(0)
	else:
	M, K = a.shape
	a_expert_stride = 0

	out = torch.empty(num_active, M, N, device=a.device, dtype=torch.bfloat16)

	BLOCK_M = 16
	BLOCK_N = 64
	grid = (num_active, triton.cdiv(N, BLOCK_N), triton.cdiv(M, BLOCK_M))

	_goliath_fp4_packed_moe_kernel[grid](
	a,
	packed_w, packed_s, packed_ts,
	expert_ids,
	out,
	M, N, K,
	num_active,
	packed_w.stride(0), # stride_ew
	packed_w.stride(1), packed_w.stride(2), # stride_wk, stride_wn
	packed_s.stride(0), # stride_es
	packed_s.stride(1), packed_s.stride(2), # stride_sk, stride_sn
	out.stride(0), out.stride(1), out.stride(2),
	a.stride(0), a.stride(1),
	a_expert_stride,
	BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=128,
	)

	return out


	# =============================================================================
	# Fused SwiGLU + Down Projection — Eliminates Intermediate Tensor
	# =============================================================================
	#
	# Instead of: gate_up → split → SiLU*up → down_proj (3 kernel launches)
	# We do: gate_up → fused_swiglu_down (2 kernel launches, no hidden tensor)
	#
	# The key insight: apply SwiGLU as input preprocessing in the down_proj kernel,
	# reading gate_up output and applying SiLU(gate)*up inline before GEMM.

	@triton.jit
	def _goliath_fp4_swiglu_down_kernel(
	gate_up_ptr, # [num_active, M, 2*intermediate] gate_up output (BF16)
	packed_w_ptr, # [E_total, K//2, N] down_proj FP4 weights (K=intermediate)
	packed_s_ptr, # [E_total, K//16, N] down_proj FP8 scales
	tscale_ptr, # [E_total] tensor scales (float32)
	expert_ids_ptr, # [num_active] selected expert indices (GPU tensor)
	out_ptr, # [num_active, M, N] output
	M, N, K, # K = intermediate_size, N = dim
	num_active,
	inter_size, # intermediate_size (K)
	stride_ge, stride_gm, stride_gk, # gate_up strides
	stride_ew, stride_wk, stride_wn,
	stride_es, stride_sk, stride_sn,
	stride_oe, stride_om, stride_on,
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	BLOCK_K: tl.constexpr,
	):
	"""Fused SwiGLU + FP4 down_proj GEMM.

	Reads gate_up output [num_active, M, 2inter], applies SiLU(gate)up inline,
	then performs FP4 GEMM with down_proj weights. Eliminates hidden tensor write/read.
	"""
	pid_active = tl.program_id(0)
	pid_n = tl.program_id(1)

	if pid_active >= num_active:
	return

	expert_id = tl.load(expert_ids_ptr + pid_active)

	w_packed_ptr = packed_w_ptr + expert_id * stride_ew
	w_scales_ptr = packed_s_ptr + expert_id * stride_es
	tensor_scale = tl.load(tscale_ptr + expert_id)

	gate_up_base = gate_up_ptr + pid_active * stride_ge

	offs_m = tl.arange(0, BLOCK_M)
	offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

	acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

	HALF_BLOCK_K: tl.constexpr = BLOCK_K // 2
	SCALES_PER_TILE: tl.constexpr = BLOCK_K // 16

	for k_start in range(0, K, BLOCK_K):
	even_k = k_start + tl.arange(0, HALF_BLOCK_K) * 2
	odd_k = k_start + tl.arange(0, HALF_BLOCK_K) * 2 + 1

	# Load gate (from first half) and up (from second half)
	gate_even_ptrs = gate_up_base + offs_m[:, None] * stride_gm + even_k[None, :] * stride_gk
	gate_odd_ptrs = gate_up_base + offs_m[:, None] * stride_gm + odd_k[None, :] * stride_gk
	up_even_ptrs = gate_up_base + offs_m[:, None] * stride_gm + (inter_size + even_k[None, :]) * stride_gk
	up_odd_ptrs = gate_up_base + offs_m[:, None] * stride_gm + (inter_size + odd_k[None, :]) * stride_gk

	mask_e = (offs_m[:, None] < M) & (even_k[None, :] < K)
	mask_o = (offs_m[:, None] < M) & (odd_k[None, :] < K)

	gate_even = tl.load(gate_even_ptrs, mask=mask_e, other=0.0).to(tl.float32)
	gate_odd = tl.load(gate_odd_ptrs, mask=mask_o, other=0.0).to(tl.float32)
	up_even = tl.load(up_even_ptrs, mask=mask_e, other=0.0).to(tl.float32)
	up_odd = tl.load(up_odd_ptrs, mask=mask_o, other=0.0).to(tl.float32)

	# Apply SiLU(gate) * up inline
	a_even = (tl.sigmoid(gate_even) * gate_even * up_even).to(tl.bfloat16)
	a_odd = (tl.sigmoid(gate_odd) * gate_odd * up_odd).to(tl.bfloat16)

	# Load FP4 down_proj weights
	pk_start = k_start // 2
	offs_pk = pk_start + tl.arange(0, HALF_BLOCK_K)
	w_ptrs = w_packed_ptr + offs_pk[:, None] * stride_wk + offs_n[None, :] * stride_wn
	mask_w = (offs_pk[:, None] < (K // 2)) & (offs_n[None, :] < N)
	packed = tl.load(w_ptrs, mask=mask_w, other=0).to(tl.int32)

	low_f = _e2m1_decode(packed & 0xF)
	high_f = _e2m1_decode((packed >> 4) & 0xF)

	scale_start = k_start // 16
	offs_local = tl.arange(0, HALF_BLOCK_K)
	group_idx = offs_local // 8

	scale_bc = tl.zeros((HALF_BLOCK_K, BLOCK_N), dtype=tl.float32)
	for sg in tl.static_range(0, SCALES_PER_TILE):
	sg_row = scale_start + sg
	sg_ptrs = w_scales_ptr + sg_row * stride_sk + offs_n * stride_sn
	sg_mask = (sg_row < (K // 16)) & (offs_n < N)
	sg_raw = tl.load(sg_ptrs, mask=sg_mask, other=0).to(tl.int32)
	sg_val = _decode_e4m3_triton(sg_raw) * tensor_scale
	sg_match = (group_idx == sg)
	scale_bc = tl.where(sg_match[:, None], sg_val[None, :], scale_bc)

	w_even = (low_f * scale_bc).to(tl.bfloat16)
	w_odd = (high_f * scale_bc).to(tl.bfloat16)

	acc += tl.dot(a_even, w_even)
	acc += tl.dot(a_odd, w_odd)

	out_ptrs = out_ptr + pid_active * stride_oe + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
	mask_out = (offs_m[:, None] < M) & (offs_n[None, :] < N)
	tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_out)


	def goliath_packed_moe_swiglu_down(
	gate_up_output: torch.Tensor, # [num_active, M, 2*inter] from gate_up GEMM
	packed_w: torch.Tensor, # [E_total, K//2, N] down_proj FP4 weights
	packed_s: torch.Tensor, # [E_total, K//16, N] down_proj FP8 scales
	packed_ts: torch.Tensor, # [E_total] tensor scales
	expert_ids: torch.Tensor, # [num_active] on GPU
	intermediate_size: int, # K for down_proj
	num_active: int = 8,
	) -> torch.Tensor:
	"""Fused SwiGLU + FP4 down_proj GEMM.

	Takes gate_up output, applies SiLU(gate)*up inline, performs down_proj.
	Eliminates the hidden tensor write/read cycle.

	Args:
	gate_up_output: [num_active, M, 2*intermediate] output from gate_up GEMM
	packed_w: Down projection FP4 weights [E_total, K//2, N]
	packed_s: Down projection FP8 block scales
	packed_ts: Down projection tensor scales
	expert_ids: Selected expert indices on GPU
	intermediate_size: K dimension (matches gate_up output's second half)
	num_active: Number of active experts

	Returns:
	[num_active, M, N] final output
	"""
	num_active_in, M, gu_dim = gate_up_output.shape
	assert gu_dim == 2 * intermediate_size, f"gate_up dim {gu_dim} != 2*{intermediate_size}"

	K = intermediate_size
	_, _, N = packed_w.shape[0], packed_w.shape[1] * 2, packed_w.shape[2]
	# N is the output dim (dim), K is intermediate_size

	gate_up = gate_up_output.contiguous()
	if gate_up.dtype != torch.bfloat16:
	gate_up = gate_up.to(torch.bfloat16)

	out = torch.empty(num_active, M, N, device=gate_up.device, dtype=torch.bfloat16)

	grid = (num_active, triton.cdiv(N, 64))

	_goliath_fp4_swiglu_down_kernel[grid](
	gate_up,
	packed_w, packed_s, packed_ts, expert_ids,
	out,
	M, N, K,
	num_active,
	intermediate_size,
	gate_up.stride(0), gate_up.stride(1), gate_up.stride(2),
	packed_w.stride(0), packed_w.stride(1), packed_w.stride(2),
	packed_s.stride(0), packed_s.stride(1), packed_s.stride(2),
	out.stride(0), out.stride(1), out.stride(2),
	BLOCK_M=1 if M == 1 else 16,
	BLOCK_N=64,
	BLOCK_K=32,
	)

	return out


	# =============================================================================
	# INT2 Packed MoE Kernel — For Cold Experts (2x smaller than FP4)
	# =============================================================================
	#
	# Same structure as FP4 packed MoE kernel, but with 2-bit unpacking.
	# 4 weights per byte instead of 2 = 2x less bandwidth for cold experts.
	#

	@triton.jit
	def _goliath_int2_packed_moe_kernel(
	a_ptr, # [M, K] or [num_active*M, K] activations
	packed_w_ptr, # [E_total, K//4, N] contiguous INT2 weights (4 per byte)
	packed_s_ptr, # [E_total, K//32, N] contiguous FP16 scales
	expert_ids_ptr, # [num_active] selected expert indices (GPU tensor)
	out_ptr, # [num_active, M, N] output
	M, N, K,
	num_active,
	stride_ew, # expert stride for packed weights (K//4 * N)
	stride_wk, stride_wn,
	stride_es, # expert stride for packed scales (K//32 * N)
	stride_sk, stride_sn,
	stride_oe, stride_om, stride_on,
	stride_am, stride_ak,
	a_expert_stride, # 0 = shared input, >0 = per-expert activation stride
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	BLOCK_K: tl.constexpr,
	):
	"""Packed MoE INT2 fused dequant-matmul: GPU-resident expert selection.

	Similar to FP4 kernel but unpacks 4 weights per byte instead of 2.
	Used for cold experts where 2-bit precision is acceptable.
	"""
	pid_active = tl.program_id(0)
	pid_n = tl.program_id(1)

	if pid_active >= num_active:
	return

	expert_id = tl.load(expert_ids_ptr + pid_active)

	w_packed_ptr = packed_w_ptr + expert_id * stride_ew
	w_scales_ptr = packed_s_ptr + expert_id * stride_es

	a_base = a_ptr + pid_active * a_expert_stride

	offs_m = tl.arange(0, BLOCK_M)
	offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

	acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

	# INT2: 4 weights per byte, so QUARTER_BLOCK_K
	QUARTER_BLOCK_K: tl.constexpr = BLOCK_K // 4
	SCALES_PER_TILE: tl.constexpr = BLOCK_K // 32

	for k_start in range(0, K, BLOCK_K):
	# Load packed INT2 weights [QUARTER_BLOCK_K, BLOCK_N]
	pk_start = k_start // 4
	offs_pk = pk_start + tl.arange(0, QUARTER_BLOCK_K)
	w_ptrs = w_packed_ptr + offs_pk[:, None] * stride_wk + offs_n[None, :] * stride_wn
	mask_w = (offs_pk[:, None] < (K // 4)) & (offs_n[None, :] < N)
	packed = tl.load(w_ptrs, mask=mask_w, other=0).to(tl.int32)

	# Unpack 4 weights per byte: {0,1,2,3} -> {-2,-1,0,1}
	w0 = ((packed >> 0) & 0x3).to(tl.float32) - 2.0 # [QUARTER_BLOCK_K, BLOCK_N]
	w1 = ((packed >> 2) & 0x3).to(tl.float32) - 2.0
	w2 = ((packed >> 4) & 0x3).to(tl.float32) - 2.0
	w3 = ((packed >> 6) & 0x3).to(tl.float32) - 2.0

	# Load FP16 scales [SCALES_PER_TILE groups, each covers 32 elements]
	scale_start = k_start // 32
	offs_local_k = tl.arange(0, QUARTER_BLOCK_K) # Each packed byte covers 4 elements

	scale_bc = tl.zeros((QUARTER_BLOCK_K, BLOCK_N), dtype=tl.float32)
	for sg in tl.static_range(0, SCALES_PER_TILE):
	sg_row = scale_start + sg
	sg_ptrs = w_scales_ptr + sg_row * stride_sk + offs_n * stride_sn
	sg_mask = (sg_row < (K // 32)) & (offs_n < N)
	sg_val = tl.load(sg_ptrs, mask=sg_mask, other=1.0).to(tl.float32)
	sg_match = (offs_local_k // 8 == sg)
	scale_bc = tl.where(sg_match[:, None], sg_val[None, :], scale_bc)

	# Apply scales to unpacked weights
	w0_scaled = (w0 * scale_bc).to(tl.bfloat16)
	w1_scaled = (w1 * scale_bc).to(tl.bfloat16)
	w2_scaled = (w2 * scale_bc).to(tl.bfloat16)
	w3_scaled = (w3 * scale_bc).to(tl.bfloat16)

	# Load activations with stride-4 pattern (Triton doesn't support ::4 slicing)
	# Each packed INT2 byte covers 4 consecutive K elements: [4i, 4i+1, 4i+2, 4i+3]
	offs_qk = tl.arange(0, QUARTER_BLOCK_K)
	mask_a_qk = (offs_m[:, None] < M) & ((k_start + offs_qk[None, :] * 4) < K)
	a0 = tl.load(a_base + offs_m[:, None] * stride_am + (k_start + offs_qk[None, :] * 4 + 0) * stride_ak,
	mask=mask_a_qk, other=0.0).to(tl.bfloat16)
	a1 = tl.load(a_base + offs_m[:, None] * stride_am + (k_start + offs_qk[None, :] * 4 + 1) * stride_ak,
	mask=mask_a_qk, other=0.0).to(tl.bfloat16)
	a2 = tl.load(a_base + offs_m[:, None] * stride_am + (k_start + offs_qk[None, :] * 4 + 2) * stride_ak,
	mask=mask_a_qk, other=0.0).to(tl.bfloat16)
	a3 = tl.load(a_base + offs_m[:, None] * stride_am + (k_start + offs_qk[None, :] * 4 + 3) * stride_ak,
	mask=mask_a_qk, other=0.0).to(tl.bfloat16)

	acc += tl.dot(a0, w0_scaled)
	acc += tl.dot(a1, w1_scaled)
	acc += tl.dot(a2, w2_scaled)
	acc += tl.dot(a3, w3_scaled)

	out_ptrs = out_ptr + pid_active * stride_oe + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
	mask_out = (offs_m[:, None] < M) & (offs_n[None, :] < N)
	tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_out)


	def goliath_packed_moe_int2_gemm(
	activations: torch.Tensor,
	packed_w: torch.Tensor, # [E_total, K//4, N] uint8 (4 weights per byte)
	packed_s: torch.Tensor, # [E_total, K//32, N] float16
	expert_ids: torch.Tensor, # [num_active] int64 on GPU
	num_active: int = 8,
	per_expert_input: bool = False,
	) -> torch.Tensor:
	"""Packed MoE INT2 GEMM: 2-bit weights for cold experts.

	2x smaller than FP4 = 2x less bandwidth. Use for rarely-routed experts.

	Args:
	activations: Input in BF16. [M, K] or [num_active*M, K]
	packed_w: [E_total, K//4, N] INT2 packed weights
	packed_s: [E_total, K//32, N] FP16 block scales
	expert_ids: [num_active] selected expert indices on GPU
	num_active: Number of active experts
	per_expert_input: If True, each expert reads from its own M rows

	Returns:
	Output [num_active, M, N] in BF16
	"""
	N = packed_w.shape[2]

	a = activations.contiguous()
	if a.dtype != torch.bfloat16:
	a = a.to(torch.bfloat16)

	if per_expert_input:
	total_rows, K = a.shape
	M = total_rows // num_active
	a_expert_stride = M * a.stride(0)
	else:
	M, K = a.shape
	a_expert_stride = 0

	out = torch.empty(num_active, M, N, device=a.device, dtype=torch.bfloat16)

	grid = (num_active, triton.cdiv(N, 64))

	_goliath_int2_packed_moe_kernel[grid](
	a,
	packed_w, packed_s,
	expert_ids,
	out,
	M, N, K,
	num_active,
	packed_w.stride(0),
	packed_w.stride(1), packed_w.stride(2),
	packed_s.stride(0),
	packed_s.stride(1), packed_s.stride(2),
	out.stride(0), out.stride(1), out.stride(2),
	a.stride(0), a.stride(1),
	a_expert_stride,
	BLOCK_M=16 if M > 1 else 1,
	BLOCK_N=64,
	BLOCK_K=128,
	)

	return out


	def pack_experts_int2(
	expert_weights: list, # List of GoliathINT2Weights
	) -> Tuple[torch.Tensor, torch.Tensor]:
	"""Pack INT2 expert weights into contiguous buffers.

	Args:
	expert_weights: List of GoliathINT2Weights (one per expert)

	Returns:
	packed_w: [E, K//4, N] contiguous INT2 weights
	packed_s: [E, K//32, N] contiguous FP16 scales
	"""
	E = len(expert_weights)
	K, N = expert_weights[0].shape

	device = expert_weights[0].packed.device

	packed_w = torch.zeros(E, K // 4, N, dtype=torch.uint8, device=device)
	packed_s = torch.zeros(E, K // 32, N, dtype=torch.float16, device=device)

	for i, w in enumerate(expert_weights):
	packed_w[i] = w.packed
	packed_s[i] = w.block_scales

	return packed_w, packed_s


	# =============================================================================
	# FE-XC (FireEcho Xtreme Compress) — Codebook 2-bit with CodeGEMM Psumbook
	# =============================================================================

	def fexc_precompute_psumbook(
	codebooks: torch.Tensor, # [2, 256, 8] float16
	x: torch.Tensor, # [N] or [1, N] bfloat16 — single token input
	) -> torch.Tensor:
	"""Precompute psumbook: dot products of all codebook centroids with input groups.

	psumbook[m, c, j] = dot(codebooks[m][c], x[j8:(j+1)8])

	This is computed ONCE per token and reused across all 8 active experts.
	For N=2048: psumbook is [2, 256, 256] float32 = 512KB.

	Args:
	codebooks: [2, 256, 8] float16 — shared codebooks for this layer
	x: [N] or [1, N] input vector

	Returns:
	psumbook: [2, 256, N//8] float32
	"""
	x_flat = x.view(-1).float() # [N]
	N = x_flat.shape[0]
	g = 8
	x_groups = x_flat.view(N // g, g) # [N//8, 8]
	cb = codebooks.float() # [2, 256, 8]

	# Batched matmul: [2, 256, 8] × [8, N//8] → [2, 256, N//8]
	psumbook = torch.bmm(cb, x_groups.T.unsqueeze(0).expand(2, -1, -1))
	return psumbook # [2, 256, N//8] float32


	@triton.jit
	def _goliath_fexc_packed_moe_kernel(
	psumbook_ptr, # [2, 256, num_groups] float32 — precomputed partial sums
	codes_ptr, # [E_total, K, num_groups, 2] uint8 — codebook indices
	scales_ptr, # [E_total, K] float16 — per-output-channel scales
	expert_ids_ptr, # [num_active] int64 — selected expert indices (GPU tensor)
	out_ptr, # [num_active, K] float32 output (M=1 only)
	K: tl.constexpr,
	num_groups: tl.constexpr, # N // 8
	num_active,
	stride_ec, # expert stride for codes: K * num_groups * 2
	stride_ck, # code stride per output row: num_groups * 2
	stride_cg, # code stride per group: 2
	stride_es, # expert stride for scales: K
	stride_oe, # output expert stride: K
	BLOCK_K: tl.constexpr,
	BLOCK_G: tl.constexpr,
	):
	"""FE-XC packed MoE kernel: CodeGEMM-style psumbook gather for M=1.

	For each active expert and each output row, gathers precomputed partial
	sums from psumbook using codebook indices. This replaces the traditional
	dequant-matmul with scalar gather+add operations.

	Grid: (num_active, ceil(K / BLOCK_K))
	"""
	pid_expert = tl.program_id(0)
	pid_k = tl.program_id(1)

	if pid_expert >= num_active:
	return

	expert_id = tl.load(expert_ids_ptr + pid_expert)

	offs_k = pid_k * BLOCK_K + tl.arange(0, BLOCK_K)
	mask_k = offs_k < K

	# Accumulator for output values [BLOCK_K]
	acc = tl.zeros((BLOCK_K,), dtype=tl.float32)

	# Code base pointer for this expert
	code_base = codes_ptr + expert_id * stride_ec

	# Psumbook layout: [2, 256, num_groups] row-major
	# psumbook[m, c, j] at offset: m * 256 * num_groups + c * num_groups + j
	ps_stride_m = 256 * num_groups
	ps_stride_c = num_groups

	# Loop over input groups
	for g_start in range(0, num_groups, BLOCK_G):
	offs_g = g_start + tl.arange(0, BLOCK_G)
	mask_g = offs_g < num_groups

	# Load codes for [BLOCK_K, BLOCK_G, 2]
	# codes[expert_id, k, g, 0] and codes[expert_id, k, g, 1]
	code_ptrs_0 = code_base + offs_k[:, None] * stride_ck + offs_g[None, :] * stride_cg + 0
	code_ptrs_1 = code_base + offs_k[:, None] * stride_ck + offs_g[None, :] * stride_cg + 1
	mask_kg = mask_k[:, None] & mask_g[None, :]

	codes_0 = tl.load(code_ptrs_0, mask=mask_kg, other=0).to(tl.int32) # [BLOCK_K, BLOCK_G]
	codes_1 = tl.load(code_ptrs_1, mask=mask_kg, other=0).to(tl.int32)

	# Gather from psumbook: psumbook[0, code0, g] + psumbook[1, code1, g]
	ps_ptrs_0 = psumbook_ptr + 0 * ps_stride_m + codes_0 * ps_stride_c + offs_g[None, :]
	ps_ptrs_1 = psumbook_ptr + 1 * ps_stride_m + codes_1 * ps_stride_c + offs_g[None, :]

	ps_vals_0 = tl.load(ps_ptrs_0, mask=mask_kg, other=0.0) # [BLOCK_K, BLOCK_G]
	ps_vals_1 = tl.load(ps_ptrs_1, mask=mask_kg, other=0.0)

	# Accumulate partial sums across groups
	acc += tl.sum(ps_vals_0 + ps_vals_1, axis=1) # reduce over BLOCK_G → [BLOCK_K]

	# Apply per-output-channel scale
	scale_ptrs = scales_ptr + expert_id * stride_es + offs_k
	scales = tl.load(scale_ptrs, mask=mask_k, other=1.0).to(tl.float32)
	acc = acc * scales

	# Store output
	out_ptrs = out_ptr + pid_expert * stride_oe + offs_k
	tl.store(out_ptrs, acc.to(tl.bfloat16), mask=mask_k)


	def goliath_packed_moe_fexc_gemm(
	activations: torch.Tensor, # [M, N] bfloat16 (M=1 for decode)
	packed_codes: torch.Tensor, # [E_total, K, N//8, 2] uint8
	codebooks: torch.Tensor, # [2, 256, 8] float16 (shared per layer)
	packed_scales: torch.Tensor, # [E_total, K] float16
	expert_ids: torch.Tensor, # [num_active] int64 on GPU
	psumbook: torch.Tensor = None, # [2, 256, N//8] float32 (precomputed, optional)
	num_active: int = 8,
	) -> torch.Tensor:
	"""FE-XC packed MoE GEMM: codebook 2-bit with CodeGEMM psumbook.

	Near-FP16 quality at 2 bits/weight. Precomputes psumbook once per token,
	then uses scalar gather+add instead of dequant+matmul.

	Args:
	activations: [M, N] or [1, N] input in BF16
	packed_codes: [E_total, K, N//8, 2] codebook indices
	codebooks: [2, 256, 8] shared codebooks for this layer
	packed_scales: [E_total, K] per-output-channel scales
	expert_ids: [num_active] expert indices on GPU
	psumbook: Precomputed [2, 256, N//8] (computed if None)
	num_active: Number of active experts

	Returns:
	Output [num_active, 1, K] in BF16 (M=1)
	"""
	M, N = activations.shape
	K = packed_codes.shape[1]
	num_groups = N // 8

	# Precompute psumbook if not provided
	if psumbook is None:
	psumbook = fexc_precompute_psumbook(codebooks, activations[0])

	out = torch.empty(num_active, K, device=activations.device, dtype=torch.bfloat16)

	BLOCK_K = min(64, K)
	BLOCK_G = min(64, num_groups)

	grid = (num_active, triton.cdiv(K, BLOCK_K))

	_goliath_fexc_packed_moe_kernel[grid](
	psumbook,
	packed_codes,
	packed_scales,
	expert_ids,
	out,
	K, num_groups, num_active,
	packed_codes.stride(0), # stride_ec
	packed_codes.stride(1), # stride_ck
	packed_codes.stride(2), # stride_cg
	packed_scales.stride(0), # stride_es
	out.stride(0), # stride_oe
	BLOCK_K=BLOCK_K,
	BLOCK_G=BLOCK_G,
	)

	return out.unsqueeze(1) # [num_active, 1, K]


	def pack_experts_fexc(
	expert_weights: list, # List of GoliathFEXCWeights
	) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	"""Pack FE-XC expert weights into contiguous buffers.

	Args:
	expert_weights: List of GoliathFEXCWeights (one per expert)

	Returns:
	packed_codes: [E, K, N//8, 2] contiguous uint8
	packed_scales: [E, K] contiguous float16
	codebooks: [2, 256, 8] float16 (shared, from first expert)
	"""
	E = len(expert_weights)
	K, N = expert_weights[0].shape
	g = expert_weights[0].group_size
	device = expert_weights[0].codes.device

	packed_codes = torch.zeros(E, K, N // g, 2, dtype=torch.uint8, device=device)
	packed_scales = torch.zeros(E, K, dtype=torch.float16, device=device)

	for i, w in enumerate(expert_weights):
	packed_codes[i] = w.codes
	packed_scales[i] = w.scales

	# Codebooks are shared across experts — take from first
	codebooks = expert_weights[0].codebooks

	return packed_codes, packed_scales, codebooks


	# =============================================================================
	# Benchmark
	# =============================================================================

	def benchmark_goliath(M=4096, N=4096, K=4096, warmup=10, iters=100):
	"""Benchmark Goliath FP4/FP8 vs BF16 cuBLAS."""
	import time

	if not torch.cuda.is_available():
	print("No CUDA available.")
	return

	print("=" * 60)
	print("Goliath FP4/FP8 Benchmark")
	print("=" * 60)

	w = torch.randn(K, N, device='cuda', dtype=torch.float32)
	w_fp4 = goliath_quantize(w, bits=4)
	w_fp8 = goliath_quantize(w, bits=8)
	w_bf16 = w.to(torch.bfloat16)
	a = torch.randn(M, K, device='cuda', dtype=torch.bfloat16)

	# Memory comparison
	fp4_bytes = w_fp4.packed.numel() + w_fp4.block_scales.numel()
	fp8_bytes = w_fp8.data.numel() + w_fp8.block_scales.numel() * 4
	bf16_bytes = w_bf16.numel() * 2
	print(f"Weight memory:")
	print(f" FP4: {fp4_bytes / 1e6:.1f} MB ({bf16_bytes / fp4_bytes:.1f}x compression)")
	print(f" FP8: {fp8_bytes / 1e6:.1f} MB ({bf16_bytes / fp8_bytes:.1f}x compression)")
	print(f" BF16: {bf16_bytes / 1e6:.1f} MB")
	print()

	# Warmup
	for _ in range(warmup):
	goliath_gemm(a, w_fp4)
	goliath_gemm(a, w_fp8)
	torch.matmul(a, w_bf16)
	torch.cuda.synchronize()

	# FP4
	start = time.perf_counter()
	for _ in range(iters):
	goliath_gemm(a, w_fp4)
	torch.cuda.synchronize()
	fp4_t = (time.perf_counter() - start) / iters

	# FP8
	start = time.perf_counter()
	for _ in range(iters):
	goliath_gemm(a, w_fp8)
	torch.cuda.synchronize()
	fp8_t = (time.perf_counter() - start) / iters

	# BF16 cuBLAS
	start = time.perf_counter()
	for _ in range(iters):
	torch.matmul(a, w_bf16)
	torch.cuda.synchronize()
	bf16_t = (time.perf_counter() - start) / iters

	flops = 2 * M * N * K
	print(f"{M}x{N}x{K} GEMM:")
	print(f" Goliath FP4: {flops/fp4_t/1e12:.1f} TFLOPS ({fp4_t*1000:.2f}ms)")
	print(f" Goliath FP8: {flops/fp8_t/1e12:.1f} TFLOPS ({fp8_t*1000:.2f}ms)")
	print(f" BF16 cuBLAS: {flops/bf16_t/1e12:.1f} TFLOPS ({bf16_t*1000:.2f}ms)")

	# Accuracy
	out_fp4 = goliath_gemm(a, w_fp4)
	out_fp8 = goliath_gemm(a, w_fp8)
	out_ref = torch.matmul(a.float(), w).bfloat16()

	err_fp4 = (out_fp4 - out_ref).abs().mean() / out_ref.abs().mean()
	err_fp8 = (out_fp8 - out_ref).abs().mean() / out_ref.abs().mean()
	print(f" FP4 vs FP32 ref: rel_err={err_fp4:.4f}")
	print(f" FP8 vs FP32 ref: rel_err={err_fp8:.4f}")

	# Auto mode
	w_auto = goliath_quantize(w, bits='auto')
	print(f" Auto-selected: FP{w_auto.bits}")


	# =============================================================================
	# GoliathLinear — Training Module with Custom Autograd
	# =============================================================================

	import torch.nn as nn


	class _GoliathLinearFunction(torch.autograd.Function):
	"""Custom autograd for fused FP4/FP8 forward + FP32 backward."""

	@staticmethod
	def forward(ctx, input, goliath_weights, bias, weight_fp32, bits):
	# Forward uses quantized Goliath GEMM
	# input: [M, K], goliath_weights: GoliathFP4Weights or GoliathFP8Weights
	out = goliath_gemm(input, goliath_weights, bias)
	ctx.save_for_backward(input, weight_fp32, bias)
	ctx.bits = bits
	return out

	@staticmethod
	def backward(ctx, grad_output):
	input, weight_fp32, bias = ctx.saved_tensors
	# grad_output: [M, N]

	# dA = grad_output @ W^T (dequant from master weights for accuracy)
	# weight_fp32: [out_features, in_features] = [N, K]
	dA = torch.matmul(grad_output.float(), weight_fp32.float()) # [M, K]

	# dW = grad_output^T @ input (FP32 accumulation)
	dW = torch.matmul(grad_output.float().T, input.float()) # [N, K]

	# db = grad_output.sum(dim=0) if bias exists
	db = None
	if bias is not None:
	db = grad_output.float().sum(dim=0)

	# Return gradients for: input, goliath_weights, bias, weight_fp32, bits
	return dA.to(input.dtype), None, db, dW, None


	class GoliathLinear(nn.Module):
	"""Linear layer with Goliath fused FP4/FP8 quantized forward + FP32 backward.

	Stores FP32 master weights as ``nn.Parameter`` for training. On forward,
	weights are quantized via Goliath and the fused kernel runs the matmul.
	Backward uses FP32 master weights for gradient computation.

	Args:
	in_features: Input dimension (K)
	out_features: Output dimension (N)
	bias: Whether to include a bias term
	bits: Quantization bits — 4, 8, or 'auto'
	"""

	def __init__(self, in_features: int, out_features: int, bias: bool = True,
	bits: Union[int, str] = 4):
	super().__init__()
	self.in_features = in_features
	self.out_features = out_features
	self.bits = bits

	# FP32 master weights
	self.weight = nn.Parameter(torch.empty(out_features, in_features))
	if bias:
	self.bias = nn.Parameter(torch.zeros(out_features))
	else:
	self.register_parameter('bias', None)

	nn.init.kaiming_uniform_(self.weight, a=2.23606797749979) # sqrt(5)

	# Quantization cache
	self._goliath_weights: Optional[GoliathWeights] = None
	self._weight_version: int = -1

	def _ensure_quantized(self):
	"""Re-quantize from master weights when weight data has changed."""
	# Check if weight tensor has been updated (via _version counter)
	current_version = self.weight._version
	if self._goliath_weights is not None and self._weight_version == current_version:
	return
	# weight is [out_features, in_features] = [N, K]
	# Goliath expects [K, N]
	w_kn = self.weight.data.T.contiguous().float()
	self._goliath_weights = goliath_quantize(w_kn, bits=self.bits)
	self._weight_version = current_version

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	orig_shape = x.shape[:-1]
	x_flat = x.reshape(-1, self.in_features)

	if x_flat.is_cuda and not self.training:
	# Inference: pure quantized forward (no autograd overhead)
	self._ensure_quantized()
	out = goliath_gemm(x_flat, self._goliath_weights, self.bias)
	return out.view(*orig_shape, self.out_features)

	if x_flat.is_cuda:
	# Training: quantized forward + FP32 backward via custom autograd
	self._ensure_quantized()
	out = _GoliathLinearFunction.apply(
	x_flat, self._goliath_weights, self.bias,
	self.weight, self.bits,
	)
	return out.view(*orig_shape, self.out_features)

	# CPU fallback
	return torch.nn.functional.linear(x, self.weight, self.bias)

	def extra_repr(self) -> str:
	return (f'in_features={self.in_features}, out_features={self.out_features}, '
	f'bias={self.bias is not None}, bits={self.bits}')


	# =============================================================================
	# GoliathQuantumLinear — Hybrid Training Module (FP8 Forward + Quantum Backward)
	# =============================================================================
	#
	# FireEcho training-optimized linear layer that combines:
	# Forward: BF16 master → quantize to FP8 → _goliath_fp8_kernel (2x bandwidth)
	# Backward: quantum_optimized_matmul (L2-swizzled Triton GEMM for gradients)
	# Master: BF16 weights (saves 2x memory vs FP32 GoliathLinear)
	#
	# Designed for EAGLE-3 draft head training where:
	# - Forward needs to be fast (inference-like, quantized)
	# - Backward needs to be accurate (BF16 master weights for gradient flow)
	# - Memory is tight (BF16 master saves 50% vs FP32)
	#
	# Hardware-agnostic: uses Triton (compiles to NVIDIA/AMD/Intel), no cuQuantum.

	# Lazy import for quantum module (lives at kernel/quantum/, parent of Engine/)
	_quantum_matmul = None
	_quantum_import_attempted = False

	def _ensure_quantum_import():
	"""Lazily import quantum_optimized_matmul from kernel/quantum/."""
	global _quantum_matmul, _quantum_import_attempted
	if _quantum_import_attempted:
	return _quantum_matmul is not None
	_quantum_import_attempted = True
	try:
	from quantum import quantum_optimized_matmul
	_quantum_matmul = quantum_optimized_matmul
	return True
	except ImportError:
	import sys as _sys
	import os as _os
	# kernel/ is parent of FireEcho Engine/
	_kernel_dir = _os.path.dirname(_os.path.dirname(_os.path.abspath(__file__)))
	if _kernel_dir not in _sys.path:
	_sys.path.insert(0, _kernel_dir)
	try:
	from quantum import quantum_optimized_matmul
	_quantum_matmul = quantum_optimized_matmul
	return True
	except ImportError:
	return False


	class _GoliathQuantumFunction(torch.autograd.Function):
	"""Custom autograd: Goliath FP8 forward + Quantum Gold L2-swizzled backward.

	Forward: Quantize BF16 master → FP8, run fused dequant-matmul (2x bandwidth)
	Backward: quantum_optimized_matmul for dX and dW (L2 cache swizzle, Triton GEMM)
	"""

	@staticmethod
	def forward(ctx, input, weight_bf16, bias, goliath_fp8):
	# Forward: use pre-quantized Goliath FP8 weights for 2x bandwidth
	out = goliath_gemm(input, goliath_fp8, bias)
	ctx.save_for_backward(input, weight_bf16, bias)
	return out

	@staticmethod
	def backward(ctx, grad_output):
	input, weight_bf16, bias = ctx.saved_tensors
	# grad_output: [M, N], weight_bf16: [N, K], input: [M, K]

	if _quantum_matmul is not None:
	# Quantum Gold backward: L2-swizzled Triton GEMM
	# dX = grad_output @ weight_bf16 — [M, N] × [N, K] → [M, K]
	# quantum_optimized_matmul expects [M, K] × [K, N] → [M, N]
	# So: dX = quantum_optimized_matmul(grad_output, weight_bf16.T)
	# but weight_bf16 is [N, K], so weight_bf16.T is [K, N]...
	# We need grad_output [M, N] × weight [N, K] = [M, K]
	# = quantum_optimized_matmul(grad_output, weight_bf16.T.contiguous()) won't work
	# We need: dX[M,K] = grad[M,N] @ W[N,K]
	# quantum_optimized_matmul(a[M,K], b[K,N]) → [M,N]
	# So: dX = quantum_optimized_matmul(grad_output.contiguous(),
	# weight_bf16.contiguous()) # grad[M,N] × W[N,K] → [M,K]
	# Wait — that's [M,N]×[N,K] which has inner dim N, not K.
	# quantum expects a[M,K]×b[K,N]. Here K_inner=N.
	# So this IS valid: a=[M,N_inner], b=[N_inner,K_out]
	dX = _quantum_matmul(
	grad_output.contiguous(),
	weight_bf16.contiguous() # [N, K] — inner dim N matches
	) # [M, K]

	# dW = grad_output.T @ input — [N, M] × [M, K] → [N, K]
	dW = _quantum_matmul(
	grad_output.T.contiguous(), # [N, M]
	input.contiguous() # [M, K]
	) # [N, K]
	else:
	# Fallback: standard PyTorch matmul (still BF16, still fast)
	dX = torch.matmul(grad_output, weight_bf16) # [M,N] × [N,K] → [M,K]
	dW = torch.matmul(grad_output.T, input) # [N,M] × [M,K] → [N,K]

	db = None
	if bias is not None:
	db = grad_output.sum(dim=0)

	# Gradients for: input, weight_bf16, bias, goliath_fp8
	return dX.to(input.dtype), dW.to(weight_bf16.dtype), db, None


	class GoliathQuantumLinear(nn.Module):
	"""FireEcho training-optimized linear layer.

	Combines Goliath FP8 forward (2x bandwidth savings) with Quantum Gold
	backward (L2-swizzled Triton GEMM for gradients). BF16 master weights
	save 50% memory vs FP32.

	This is the training counterpart of GoliathLinear. GoliathLinear uses FP32
	master weights and standard torch.matmul backward. GoliathQuantumLinear uses
	BF16 master weights and quantum_optimized_matmul backward.

	Args:
	in_features: Input dimension (K)
	out_features: Output dimension (N)
	bias: Whether to include a bias term
	"""

	def __init__(self, in_features: int, out_features: int, bias: bool = False):
	super().__init__()
	self.in_features = in_features
	self.out_features = out_features

	# BF16 master weights (saves 2x vs FP32)
	self.weight = nn.Parameter(
	torch.empty(out_features, in_features, dtype=torch.bfloat16))
	if bias:
	self.bias = nn.Parameter(torch.zeros(out_features, dtype=torch.bfloat16))
	else:
	self.register_parameter('bias', None)

	nn.init.kaiming_uniform_(self.weight, a=2.23606797749979) # sqrt(5)

	# FP8 quantization cache (reused every forward, re-quantized when weight changes)
	self._goliath_fp8: Optional[GoliathFP8Weights] = None
	self._weight_version: int = -1

	# Ensure quantum module is available
	_ensure_quantum_import()

	def _ensure_quantized(self):
	"""Re-quantize BF16 master → FP8 when weights have changed."""
	current_version = self.weight._version
	if self._goliath_fp8 is not None and self._weight_version == current_version:
	return
	# weight is [out_features, in_features] = [N, K]
	# Goliath expects [K, N]
	w_kn = self.weight.data.T.contiguous().float()
	self._goliath_fp8 = GoliathFP8Weights.from_float(w_kn)
	self._weight_version = current_version

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	orig_shape = x.shape[:-1]
	x_flat = x.reshape(-1, self.in_features)

	if x_flat.is_cuda and not self.training:
	# Inference: pure FP8 forward (no autograd overhead)
	self._ensure_quantized()
	out = goliath_gemm(x_flat, self._goliath_fp8, self.bias)
	return out.view(*orig_shape, self.out_features)

	if x_flat.is_cuda:
	# Training: FP8 forward + Quantum Gold backward
	self._ensure_quantized()
	out = _GoliathQuantumFunction.apply(
	x_flat, self.weight, self.bias, self._goliath_fp8,
	)
	return out.view(*orig_shape, self.out_features)

	# CPU fallback
	return torch.nn.functional.linear(x, self.weight, self.bias)

	def extra_repr(self) -> str:
	quantum_str = "quantum" if _quantum_matmul is not None else "fallback"
	return (f'in_features={self.in_features}, out_features={self.out_features}, '
	f'bias={self.bias is not None}, backward={quantum_str}')


	if __name__ == "__main__":
	print("Goliath — Native FP4/FP8 Fused Triton GEMM Kernel")
	print("=" * 60)

	if torch.cuda.is_available():
	print(f"GPU: {torch.cuda.get_device_name(0)}")
	benchmark_goliath(M=2048, N=2048, K=2048)
	print()
	benchmark_goliath(M=4096, N=4096, K=4096)
	else:
	print("No CUDA available.")