eousphoros committed
Commit ba0d7e9 · verified · 1 Parent(s): 08c3802

Upload inference/kernel.py with huggingface_hub

Files changed (1)
  1. inference/kernel.py +328 -176
inference/kernel.py CHANGED
@@ -1,88 +1,340 @@
  import torch
- import tilelang
- import tilelang.language as T
  from typing import Tuple, Optional

-
- tilelang.set_log_level("WARNING")
-
- pass_configs = {
-     tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
-     tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
-     tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True,
- }

  FP8 = "float8_e4m3"
  BF16 = "bfloat16"
  FP32 = "float32"


- def fast_log2_ceil(x):
-     bits_x = T.reinterpret("uint32", x)
-     exp_x = (bits_x >> 23) & 0xFF
-     man_bits = bits_x & ((1 << 23) - 1)
-     return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0))


- def fast_pow2(x):
-     bits_x = (x + 127) << 23
-     return T.reinterpret("float32", bits_x)
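The two bit-level helpers above feed `fast_round_scale` just below: together they round `amax / 448` up to the next power of two by working directly on the float32 bit pattern. A minimal plain-Python sketch of the same arithmetic, assuming IEEE-754 single precision (names here are illustrative, not part of the kernel):

import math
import struct

def log2_ceil_bits(x: float) -> int:
    # biased exponent and mantissa of the float32 representation of x
    bits = struct.unpack("<I", struct.pack("<f", x))[0]
    exp = (bits >> 23) & 0xFF
    man = bits & ((1 << 23) - 1)
    return exp - 127 + (1 if man != 0 else 0)          # ceil(log2(x)) for x > 0

def pow2_bits(e: int) -> float:
    # build the float32 whose value is 2**e by writing the exponent field
    return struct.unpack("<f", struct.pack("<I", (e + 127) << 23))[0]

amax, fp8_max = 3.0, 448.0
scale = pow2_bits(log2_ceil_bits(amax / fp8_max))      # rounded-up power-of-two scale
assert scale == 2.0 ** math.ceil(math.log2(amax / fp8_max)) == 2.0 ** -7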


- def fast_round_scale(amax, fp8_max_inv):
-     return fast_pow2(fast_log2_ceil(amax * fp8_max_inv))


- @tilelang.jit(pass_configs=pass_configs)
- def act_quant_kernel(
-     N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False
- ):
-     M = T.symbolic("M")
-     fp8_min = -448.0
-     fp8_max = 448.0
-     fp8_max_inv = 1 / fp8_max
-     num_stages = 0 if round_scale else 2
-     blk_m = 32
-     group_size = 128

-     @T.prim_func
-     def act_quant_kernel_(
-         X: T.Tensor[(M, N), in_dtype],
-         Y: T.Tensor[(M, N), out_dtype],
-         S: T.Tensor[(M, T.ceildiv(N, group_size)), scale_dtype],
      ):
-         with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (
-             pid_m,
-             pid_n,
          ):
-             x_shared = T.alloc_shared((blk_m, group_size), in_dtype)
-             x_local = T.alloc_fragment((blk_m, group_size), in_dtype)
-             amax_local = T.alloc_fragment((blk_m,), scale_dtype)
-             s_local = T.alloc_fragment((blk_m,), scale_dtype)
-             y_local = T.alloc_fragment((blk_m, group_size), out_dtype)
-             y_shared = T.alloc_shared((blk_m, group_size), out_dtype)
-
-             for _ in T.Pipelined(1, num_stages=num_stages):
-                 T.copy(X[pid_m * blk_m, pid_n * group_size], x_shared)
-                 T.copy(x_shared, x_local)
-                 T.reduce_absmax(x_local, amax_local, dim=1)
-                 for i in T.Parallel(blk_m):
-                     amax_local[i] = T.max(amax_local[i], 1e-4)
-                     if round_scale:
-                         s_local[i] = fast_round_scale(amax_local[i], fp8_max_inv)
-                     else:
-                         s_local[i] = amax_local[i] * fp8_max_inv
-                 for i, j in T.Parallel(blk_m, group_size):
-                     y_local[i, j] = T.clamp(
-                         x_local[i, j] / s_local[i], fp8_min, fp8_max
                      )
-                 for i in T.Parallel(blk_m):
-                     S[pid_m * blk_m + i, pid_n] = s_local[i]
-                 T.copy(y_local, y_shared)
-                 T.copy(y_shared, Y[pid_m * blk_m, pid_n * group_size])

-     return act_quant_kernel_


  def act_quant(
      x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None
@@ -99,6 +351,10 @@ def act_quant(
              - The quantized tensor with dtype `torch.float8_e4m3fn`.
              - A tensor of scaling factors with dtype `torch.float32`.
      """
      assert x.is_contiguous(), "Input tensor must be contiguous"
      assert x.size(-1) % block_size == 0, (
          f"Last dimension size must be divisible by block_size (block_size={block_size})"
@@ -111,63 +367,6 @@ def act_quant(
      return y, s


- @tilelang.jit(pass_configs=pass_configs)
- def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"):
-     assert out_dtype in [BF16, "float32"]
-
-     M = T.symbolic("M")
-     group_size = 128
-     block_M = 32
-     block_N = 128
-     block_K = 128
-
-     @T.prim_func
-     def fp8_gemm_kernel_(
-         A: T.Tensor[(M, K), FP8],
-         B: T.Tensor[(N, K), FP8],
-         C: T.Tensor[(M, N), out_dtype],
-         scales_a: T.Tensor[(M, T.ceildiv(K, group_size)), FP32],
-         scales_b: T.Tensor[(T.ceildiv(N, group_size), T.ceildiv(K, group_size)), FP32],
-     ):
-         with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (
-             bx,
-             by,
-         ):
-             A_shared = T.alloc_shared((block_M, block_K), FP8)
-             B_shared = T.alloc_shared((block_N, block_K), FP8)
-             C_shared = T.alloc_shared((block_M, block_N), out_dtype)
-             Scale_C_shared = T.alloc_shared((block_M), FP32)
-             C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
-             C_local_accum = T.alloc_fragment((block_M, block_N), accum_dtype)
-
-             # Improve L2 Cache
-             T.use_swizzle(panel_size=10)
-
-             T.clear(C_local)
-             T.clear(C_local_accum)
-             K_iters = T.ceildiv(K, block_K)
-             for k in T.Pipelined(K_iters, num_stages=4):
-                 # Load A into shared memory
-                 T.copy(A[by * block_M, k * block_K], A_shared)
-                 # Load B into shared memory
-                 T.copy(B[bx * block_N, k * block_K], B_shared)
-                 # Load scale into shared memory
-                 Scale_B = scales_b[bx * block_N // group_size, k]
-                 for i in T.Parallel(block_M):
-                     Scale_C_shared[i] = scales_a[by * block_M + i, k] * Scale_B
-
-                 T.gemm(A_shared, B_shared, C_local, transpose_B=True)
-                 # Promote to enable 2xAcc
-                 for i, j in T.Parallel(block_M, block_N):
-                     C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i]
-                 T.clear(C_local)
-             # TMA store
-             T.copy(C_local_accum, C_shared)
-             T.copy(C_shared, C[by * block_M, bx * block_N])
-
-     return fp8_gemm_kernel_
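The scale handling above (`Scale_C_shared[i] = scales_a[...] * Scale_B`) applies one scale per 1x128 activation group and one per 128x128 weight tile; the CPU fallback added later in this commit does the equivalent by dequantizing before a plain matmul. A small shape sketch of that weight-scale broadcast, assuming block_size=128 and purely illustrative scale values:

import torch

N, K, bs = 256, 384, 128
# one scale per 128x128 tile of the weight matrix, as in scales_b above
b_s = torch.arange((N // bs) * (K // bs), dtype=torch.float32).view(N // bs, K // bs)
# expand so every element of a tile sees its tile's scale
b_s_full = b_s.view(N // bs, 1, K // bs, 1).expand(N // bs, bs, K // bs, bs).reshape(N, K)
print(b_s_full[0, 0].item(), b_s_full[0, 200].item(), b_s_full[130, 0].item())  # 0.0 1.0 3.0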
-
-
  def fp8_gemm(
      a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor
  ) -> torch.Tensor:
@@ -183,6 +382,10 @@ def fp8_gemm(
      Returns:
          torch.Tensor: The result of the matrix multiplication.
      """
      assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous"
      assert a_s.is_contiguous() and b_s.is_contiguous(), (
          "Scaling factor tensors must be contiguous"
@@ -196,61 +399,6 @@ def fp8_gemm(
      return c


- @tilelang.jit(out_idx=[4], pass_configs=pass_configs)
- def fp8_index_kernel(h: int, d: int):
-     b = T.symbolic("b")
-     m = T.symbolic("m")
-     n = T.symbolic("n")
-
-     blk_n1 = 512
-     blk_n2 = 128
-
-     @T.prim_func
-     def fp8_index_kernel_(
-         q: T.Tensor[(b, m, h, d), FP8],
-         q_s: T.Tensor[(b, m, h), FP32],
-         k: T.Tensor[(b, n, d), FP8],
-         k_s: T.Tensor[(b, n), FP32],
-         o: T.Tensor[(b, m, n), FP32],
-     ) -> None:
-         with T.Kernel(b, m, T.ceildiv(n, blk_n1)) as (i_b, i_m, i1_n):
-             q_smem = T.alloc_shared((h, d), FP8)
-             T.copy(q[i_b, i_m, 0, 0], q_smem)
-
-             q_s_frag = T.alloc_fragment(h, FP32)
-             T.copy(q_s[i_b, i_m, 0], q_s_frag)
-
-             for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=2):
-                 k_smem = T.alloc_shared((blk_n2, d), FP8)
-                 T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem)
-
-                 k_s_frag = T.alloc_fragment(blk_n2, FP32)
-                 T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag)
-
-                 logits = T.alloc_fragment((blk_n2, h), FP32)
-                 T.gemm(
-                     k_smem,
-                     q_smem,
-                     logits,
-                     transpose_A=False,
-                     transpose_B=True,
-                     clear_accum=True,
-                 )
-
-                 for i_h, i3_n in T.Parallel(h, blk_n2):
-                     logits[i3_n, i_h] = T.max(logits[i3_n, i_h], 0) * q_s_frag[i_h]
-
-                 logits_sum = T.alloc_fragment(blk_n2, FP32)
-                 T.reduce_sum(logits, logits_sum, dim=1)
-
-                 for i3_n in T.Parallel(blk_n2):
-                     logits_sum[i3_n] *= k_s_frag[i3_n]
-
-                 T.copy(logits_sum, o[i_b, i_m, i1_n * blk_n1 + i2_n * blk_n2])
-
-     return fp8_index_kernel_
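Per key position, the kernel above computes index_score[b, m, n] = k_s[b, n] * sum_h q_s[b, m, h] * relu(q[b, m, h] . k[b, n]); the CPU fallback added later in this commit performs the same reduction with einsum. A compact reference restatement in plain PyTorch, assuming q_s of shape [b, m, h] and k_s of shape [b, n] (plain float tensors here for illustration; the real path passes FP8 q and k):

import torch

b, m, h, d, n = 1, 2, 4, 8, 16
q, k = torch.randn(b, m, h, d), torch.randn(b, n, d)
q_s, k_s = torch.rand(b, m, h), torch.rand(b, n)

logits = torch.relu(torch.einsum("bmhd,bnd->bmhn", q, k))               # relu(q @ k^T)
score = torch.einsum("bmhn,bmh->bmn", logits, q_s) * k_s.unsqueeze(1)   # weight by q_s, sum heads, scale by k_s
print(score.shape)  # torch.Size([1, 2, 16])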
-
-
  def fp8_index(
      q: torch.Tensor,
      q_s: torch.Tensor,
@@ -271,4 +419,8 @@ def fp8_index(
          fp32 logits -> fp32 logits_sum
          fp32 logits_sum * k_s (e8m0) -> fp32 index_score
      """
      return fp8_index_kernel(q.shape[2], q.shape[3])(q, q_s, k, k_s)
 
  import torch
  from typing import Tuple, Optional

+ # Check if CUDA is available for tilelang kernels
+ USE_TILELANG = torch.cuda.is_available()
+
+ if USE_TILELANG:
+     try:
+         import tilelang
+         import tilelang.language as T
+         tilelang.set_log_level("WARNING")
+         pass_configs = {
+             tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+             tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+             tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True,
+         }
+     except ImportError:
+         USE_TILELANG = False

  FP8 = "float8_e4m3"
  BF16 = "bfloat16"
  FP32 = "float32"


+ # ============================================================================
+ # CPU Fallback Implementations
+ # ============================================================================
+
+ def act_quant_cpu(
+     x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     CPU fallback: Quantizes input tensor to FP8 with per-block scales.
+     Uses simple per-block max scaling for FP8 quantization on CPU.
+     """
+     assert x.is_contiguous(), "Input tensor must be contiguous"
+     assert x.size(-1) % block_size == 0, (
+         f"Last dimension size must be divisible by block_size (block_size={block_size})"
+     )
+
+     N = x.size(-1)
+     fp8_max = 448.0  # Max representable value in FP8 E4M3
+
+     # Reshape for block-wise operations: [..., N] -> [..., N//block_size, block_size]
+     orig_shape = x.shape
+     x_blocks = x.view(*orig_shape[:-1], N // block_size, block_size)
+
+     # Compute per-block max (absolute value)
+     amax = x_blocks.abs().amax(dim=-1, keepdim=True).clamp(min=1e-4)
+
+     # Compute scales: scale = amax / fp8_max
+     s = (amax / fp8_max).squeeze(-1)  # [..., N//block_size]
+
+     # Quantize: y = clamp(x / scale, -fp8_max, fp8_max)
+     y_scaled = x_blocks / amax * fp8_max
+     y_scaled = y_scaled.clamp(-fp8_max, fp8_max)
+
+     # Reshape back and convert to FP8
+     y = y_scaled.view(orig_shape).to(torch.float8_e4m3fn)

+     return y, s.to(torch.float32)
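A quick way to sanity-check the per-block scaling above: dequantizing y with s should recover x up to FP8 E4M3 rounding error. A minimal sketch, assuming it runs in the same module so act_quant_cpu is in scope:

import torch

x = torch.randn(2, 256, dtype=torch.bfloat16)
y, s = act_quant_cpu(x, block_size=128)
x_hat = (y.float().view(2, 2, 128) * s.view(2, 2, 1)).view(2, 256)  # dequantize block-wise
print(y.dtype, s.shape)                        # torch.float8_e4m3fn torch.Size([2, 2])
print((x.float() - x_hat).abs().max().item())  # small relative to each block's amax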

+ def fp8_gemm_cpu(
+     a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor,
+     block_size: int = 128
+ ) -> torch.Tensor:
+     """
+     CPU fallback: FP8 GEMM with block-scaled dequantization.
+
+     Args:
+         a: [M, K] FP8 activations
+         a_s: [M, K//block_size] activation scales
+         b: [N, K] FP8 weights
+         b_s: [N//block_size, K//block_size] weight scales
+
+     Returns:
+         [M, N] output in default dtype (bf16)
+     """
+     M = a.numel() // a.size(-1)
+     K = a.size(-1)
+     N = b.size(0)

+     # Dequantize A: [M, K] = fp8_a * scale_a (broadcast over blocks)
+     a_f32 = a.view(M, K // block_size, block_size).float()
+     a_dequant = (a_f32 * a_s.view(M, -1, 1)).view(M, K)
+
+     # Dequantize B: [N, K] = fp8_b * scale_b (broadcast over blocks)
+     b_f32 = b.view(N, K // block_size, block_size).float()
+     # b_s is [N//block_size, K//block_size], need to broadcast
+     b_s_expanded = b_s.view(N // block_size, 1, K // block_size, 1).expand(
+         N // block_size, block_size, K // block_size, block_size
+     ).reshape(N, K)
+     b_dequant = b_f32.view(N, K) * b_s_expanded
+
+     # Standard matmul: [M, K] @ [K, N] -> [M, N]
+     return torch.matmul(a_dequant.to(torch.bfloat16), b_dequant.T.to(torch.bfloat16))
+
+
+ def fp8_index_cpu(
+     q: torch.Tensor,
+     q_s: torch.Tensor,
+     k: torch.Tensor,
+     k_s: torch.Tensor,
+     block_size: int = 128
+ ) -> torch.Tensor:
+     """
+     CPU fallback: Index scoring for sparse attention.

+     This computes index scores for selecting top-k positions in sparse attention.

+     Args:
+         q: [b, m, h, d] FP8 queries
+         q_s: [b, m, h] or [b, m, h, d//block_size] query weights (includes scales)
+         k: [b, n, d] FP8 keys
+         k_s: [b, n] or [b, n, d//block_size] key scales

+     Returns:
+         [b, m, n] index scores
+     """
+     b, m, h, d = q.shape
+     n = k.shape[1]
+
+     # Dequantize q and k from FP8 to float32
+     q_f32 = q.float()  # [b, m, h, d]
+     k_f32 = k.float()  # [b, n, d]
+
+     # Compute attention logits: q @ k^T -> [b, m, h, n]
+     logits = torch.einsum("bmhd,bnd->bmhn", q_f32, k_f32)
+
+     # Apply ReLU
+     logits = torch.relu(logits)
+
+     # Scale by q_s (query weights)
+     # q_s may have shape [b, m, h] or [b, m, h, num_scales]
+     if q_s.dim() == 3:
+         logits = logits * q_s.unsqueeze(-1)  # [b, m, h, 1] broadcast
+     else:
+         # q_s is [b, m, h, num_scales] - sum/average over last dim
+         logits = logits * q_s.mean(dim=-1, keepdim=True)
+
+     # Sum over heads -> [b, m, n]
+     logits_sum = logits.sum(dim=2)
+
+     # Scale by k_s (key scales)
+     # k_s may have shape [b, n] or [b, n, num_scales]
+     if k_s.dim() == 2:
+         logits_sum = logits_sum * k_s.unsqueeze(1)  # [b, 1, n] broadcast
+     else:
+         # k_s is [b, n, num_scales] - sum/average over last dim
+         logits_sum = logits_sum * k_s.mean(dim=-1).unsqueeze(1)
+
+     return logits_sum.to(torch.float32)
+
+
+ # ============================================================================
+ # Tilelang CUDA Kernels (only defined if tilelang available)
+ # ============================================================================
+
+ if USE_TILELANG:
+     def fast_log2_ceil(x):
+         bits_x = T.reinterpret("uint32", x)
+         exp_x = (bits_x >> 23) & 0xFF
+         man_bits = bits_x & ((1 << 23) - 1)
+         return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0))
+
+     def fast_pow2(x):
+         bits_x = (x + 127) << 23
+         return T.reinterpret("float32", bits_x)
+
+     def fast_round_scale(amax, fp8_max_inv):
+         return fast_pow2(fast_log2_ceil(amax * fp8_max_inv))
+
+     @tilelang.jit(pass_configs=pass_configs)
+     def act_quant_kernel(
+         N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False
      ):
+         M = T.symbolic("M")
+         fp8_min = -448.0
+         fp8_max = 448.0
+         fp8_max_inv = 1 / fp8_max
+         num_stages = 0 if round_scale else 2
+         blk_m = 32
+         group_size = 128
+
+         @T.prim_func
+         def act_quant_kernel_(
+             X: T.Tensor[(M, N), in_dtype],
+             Y: T.Tensor[(M, N), out_dtype],
+             S: T.Tensor[(M, T.ceildiv(N, group_size)), scale_dtype],
+         ):
+             with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (
+                 pid_m,
+                 pid_n,
+             ):
+                 x_shared = T.alloc_shared((blk_m, group_size), in_dtype)
+                 x_local = T.alloc_fragment((blk_m, group_size), in_dtype)
+                 amax_local = T.alloc_fragment((blk_m,), scale_dtype)
+                 s_local = T.alloc_fragment((blk_m,), scale_dtype)
+                 y_local = T.alloc_fragment((blk_m, group_size), out_dtype)
+                 y_shared = T.alloc_shared((blk_m, group_size), out_dtype)
+
+                 for _ in T.Pipelined(1, num_stages=num_stages):
+                     T.copy(X[pid_m * blk_m, pid_n * group_size], x_shared)
+                     T.copy(x_shared, x_local)
+                     T.reduce_absmax(x_local, amax_local, dim=1)
+                     for i in T.Parallel(blk_m):
+                         amax_local[i] = T.max(amax_local[i], 1e-4)
+                         if round_scale:
+                             s_local[i] = fast_round_scale(amax_local[i], fp8_max_inv)
+                         else:
+                             s_local[i] = amax_local[i] * fp8_max_inv
+                     for i, j in T.Parallel(blk_m, group_size):
+                         y_local[i, j] = T.clamp(
+                             x_local[i, j] / s_local[i], fp8_min, fp8_max
+                         )
+                     for i in T.Parallel(blk_m):
+                         S[pid_m * blk_m + i, pid_n] = s_local[i]
+                     T.copy(y_local, y_shared)
+                     T.copy(y_shared, Y[pid_m * blk_m, pid_n * group_size])
+
+         return act_quant_kernel_
+
+     @tilelang.jit(pass_configs=pass_configs)
+     def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"):
+         assert out_dtype in [BF16, "float32"]
+
+         M = T.symbolic("M")
+         group_size = 128
+         block_M = 32
+         block_N = 128
+         block_K = 128
+
+         @T.prim_func
+         def fp8_gemm_kernel_(
+             A: T.Tensor[(M, K), FP8],
+             B: T.Tensor[(N, K), FP8],
+             C: T.Tensor[(M, N), out_dtype],
+             scales_a: T.Tensor[(M, T.ceildiv(K, group_size)), FP32],
+             scales_b: T.Tensor[(T.ceildiv(N, group_size), T.ceildiv(K, group_size)), FP32],
          ):
+             with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (
+                 bx,
+                 by,
+             ):
+                 A_shared = T.alloc_shared((block_M, block_K), FP8)
+                 B_shared = T.alloc_shared((block_N, block_K), FP8)
+                 C_shared = T.alloc_shared((block_M, block_N), out_dtype)
+                 Scale_C_shared = T.alloc_shared((block_M), FP32)
+                 C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+                 C_local_accum = T.alloc_fragment((block_M, block_N), accum_dtype)
+
+                 # Improve L2 Cache
+                 T.use_swizzle(panel_size=10)
+
+                 T.clear(C_local)
+                 T.clear(C_local_accum)
+                 K_iters = T.ceildiv(K, block_K)
+                 for k in T.Pipelined(K_iters, num_stages=4):
+                     # Load A into shared memory
+                     T.copy(A[by * block_M, k * block_K], A_shared)
+                     # Load B into shared memory
+                     T.copy(B[bx * block_N, k * block_K], B_shared)
+                     # Load scale into shared memory
+                     Scale_B = scales_b[bx * block_N // group_size, k]
+                     for i in T.Parallel(block_M):
+                         Scale_C_shared[i] = scales_a[by * block_M + i, k] * Scale_B
+
+                     T.gemm(A_shared, B_shared, C_local, transpose_B=True)
+                     # Promote to enable 2xAcc
+                     for i, j in T.Parallel(block_M, block_N):
+                         C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i]
+                     T.clear(C_local)
+                 # TMA store
+                 T.copy(C_local_accum, C_shared)
+                 T.copy(C_shared, C[by * block_M, bx * block_N])
+
+         return fp8_gemm_kernel_
+
+     @tilelang.jit(out_idx=[4], pass_configs=pass_configs)
+     def fp8_index_kernel(h: int, d: int):
+         b = T.symbolic("b")
+         m = T.symbolic("m")
+         n = T.symbolic("n")
+
+         blk_n1 = 512
+         blk_n2 = 128
+
+         @T.prim_func
+         def fp8_index_kernel_(
+             q: T.Tensor[(b, m, h, d), FP8],
+             q_s: T.Tensor[(b, m, h), FP32],
+             k: T.Tensor[(b, n, d), FP8],
+             k_s: T.Tensor[(b, n), FP32],
+             o: T.Tensor[(b, m, n), FP32],
+         ) -> None:
+             with T.Kernel(b, m, T.ceildiv(n, blk_n1)) as (i_b, i_m, i1_n):
+                 q_smem = T.alloc_shared((h, d), FP8)
+                 T.copy(q[i_b, i_m, 0, 0], q_smem)
+
+                 q_s_frag = T.alloc_fragment(h, FP32)
+                 T.copy(q_s[i_b, i_m, 0], q_s_frag)
+
+                 for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=2):
+                     k_smem = T.alloc_shared((blk_n2, d), FP8)
+                     T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem)
+
+                     k_s_frag = T.alloc_fragment(blk_n2, FP32)
+                     T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag)
+
+                     logits = T.alloc_fragment((blk_n2, h), FP32)
+                     T.gemm(
+                         k_smem,
+                         q_smem,
+                         logits,
+                         transpose_A=False,
+                         transpose_B=True,
+                         clear_accum=True,
                      )
+
+                     for i_h, i3_n in T.Parallel(h, blk_n2):
+                         logits[i3_n, i_h] = T.max(logits[i3_n, i_h], 0) * q_s_frag[i_h]
+
+                     logits_sum = T.alloc_fragment(blk_n2, FP32)
+                     T.reduce_sum(logits, logits_sum, dim=1)
+
+                     for i3_n in T.Parallel(blk_n2):
+                         logits_sum[i3_n] *= k_s_frag[i3_n]

+                     T.copy(logits_sum, o[i_b, i_m, i1_n * blk_n1 + i2_n * blk_n2])
+
+         return fp8_index_kernel_
+
+
+ # ============================================================================
+ # Public API - dispatches to CUDA or CPU implementations
+ # ============================================================================

  def act_quant(
      x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None

              - The quantized tensor with dtype `torch.float8_e4m3fn`.
              - A tensor of scaling factors with dtype `torch.float32`.
      """
+     # Use CPU fallback if not on CUDA or tilelang not available
+     if not x.is_cuda or not USE_TILELANG:
+         return act_quant_cpu(x, block_size, scale_fmt)
+
      assert x.is_contiguous(), "Input tensor must be contiguous"
      assert x.size(-1) % block_size == 0, (
          f"Last dimension size must be divisible by block_size (block_size={block_size})"

      return y, s


  def fp8_gemm(
      a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor
  ) -> torch.Tensor:

      Returns:
          torch.Tensor: The result of the matrix multiplication.
      """
+     # Use CPU fallback if not on CUDA or tilelang not available
+     if not a.is_cuda or not USE_TILELANG:
+         return fp8_gemm_cpu(a, a_s, b, b_s)
+
      assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous"
      assert a_s.is_contiguous() and b_s.is_contiguous(), (
          "Scaling factor tensors must be contiguous"

      return c


  def fp8_index(
      q: torch.Tensor,
      q_s: torch.Tensor,

          fp32 logits -> fp32 logits_sum
          fp32 logits_sum * k_s (e8m0) -> fp32 index_score
      """
+     # Use CPU fallback if not on CUDA or tilelang not available
+     if not q.is_cuda or not USE_TILELANG:
+         return fp8_index_cpu(q, q_s, k, k_s)
+
      return fp8_index_kernel(q.shape[2], q.shape[3])(q, q_s, k, k_s)
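With the dispatch guards above, the public API keeps its signatures but falls back to the CPU implementations whenever the inputs are not CUDA tensors or tilelang is unavailable. A minimal smoke test of the fallback path, assuming the module is importable as inference.kernel (import path illustrative):

import torch
from inference.kernel import act_quant, fp8_index

x = torch.randn(4, 256, dtype=torch.bfloat16)          # CPU tensor -> act_quant_cpu path
y, s = act_quant(x)
print(y.dtype, s.shape)                                 # torch.float8_e4m3fn torch.Size([4, 2])

q = torch.randn(1, 2, 4, 64).to(torch.float8_e4m3fn)   # CPU tensors -> fp8_index_cpu path
k = torch.randn(1, 16, 64).to(torch.float8_e4m3fn)
q_s, k_s = torch.rand(1, 2, 4), torch.rand(1, 16)
print(fp8_index(q, q_s, k, k_s).shape)                  # torch.Size([1, 2, 16])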