Julian Bilcke committed
Commit 9dd297c
1 Parent(s): 3abc88b

attempting a fix

Files changed (1)
  1. quant.py +10 -1
quant.py CHANGED
@@ -92,18 +92,27 @@ def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype, native_fp8_support=False):
         else:
             batch_size = None
             A_input = A
+
+        # torch._scaled_mm doesn't support bias when out_dtype is Float32
+        # Apply bias separately in this case
+        use_bias_in_mm = bias is not None and out_dtype != torch.float32
+
         output = torch._scaled_mm(
             A_input,
             B.t(),
             out_dtype=out_dtype,
             scale_a=A_scale,
             scale_b=B_scale,
-            bias=bias,
+            bias=bias if use_bias_in_mm else None,
         )
         if need_reshape:
             output = output.reshape(
                 batch_size, output.shape[0] // batch_size, output.shape[1]
             )
+
+        # Apply bias separately if out_dtype is Float32
+        if bias is not None and not use_bias_in_mm:
+            output = output + bias
     else:
         output = torch.nn.functional.linear(
             A.to(out_dtype) * A_scale,
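
For reference, a minimal standalone sketch of the patched branch (the helper name and docstring below are illustrative, not part of the commit): when the requested output dtype is float32, the bias is withheld from torch._scaled_mm and added to the result afterwards, which is equivalent because a GEMM bias is just a row-wise addition to the matrix product.

import torch

def scaled_mm_with_optional_bias(A_fp8, B_fp8, A_scale, B_scale, bias, out_dtype):
    """Sketch of the workaround in this commit: only pass bias into
    torch._scaled_mm when the output dtype accepts it; otherwise add
    the bias to the result manually."""
    use_bias_in_mm = bias is not None and out_dtype != torch.float32
    output = torch._scaled_mm(
        A_fp8,
        B_fp8.t(),
        out_dtype=out_dtype,
        scale_a=A_scale,
        scale_b=B_scale,
        bias=bias if use_bias_in_mm else None,
    )
    if bias is not None and not use_bias_in_mm:
        # Equivalent to fusing the bias into the matmul: broadcast add over rows.
        output = output + bias
    return output

In the commit itself this logic stays inline in fp8_gemm (including the 3D-input reshape handling elided here); the helper above only isolates the bias handling for illustration.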