Build (AArch64)
- build/torch26-cxx11-cu126-aarch64-linux/quantization/__init__.py +2 -2
- build/torch26-cxx11-cu126-aarch64-linux/quantization/_ops.py +3 -3
- build/torch26-cxx11-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} +2 -2
- build/torch26-cxx11-cu126-aarch64-linux/quantization/compressed_tensors.py +34 -33
- build/torch26-cxx11-cu126-aarch64-linux/quantization/cutlass.py +10 -16
- build/torch26-cxx11-cu126-aarch64-linux/quantization/marlin.py +40 -74
- build/torch26-cxx11-cu126-aarch64-linux/quantization/platforms.py +69 -0
- build/torch26-cxx11-cu126-aarch64-linux/quantization/scalar_type.py +19 -2
- build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils.py +231 -170
- build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py +282 -0
- build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py +90 -68
- build/torch26-cxx98-cu126-aarch64-linux/quantization/__init__.py +2 -2
- build/torch26-cxx98-cu126-aarch64-linux/quantization/_ops.py +3 -3
- build/torch26-cxx98-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} +2 -2
- build/torch26-cxx98-cu126-aarch64-linux/quantization/compressed_tensors.py +34 -33
- build/torch26-cxx98-cu126-aarch64-linux/quantization/cutlass.py +10 -16
- build/torch26-cxx98-cu126-aarch64-linux/quantization/marlin.py +40 -74
- build/torch26-cxx98-cu126-aarch64-linux/quantization/platforms.py +69 -0
- build/torch26-cxx98-cu126-aarch64-linux/quantization/scalar_type.py +19 -2
- build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils.py +231 -170
- build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py +282 -0
- build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py +90 -68
- build/torch27-cxx11-cu126-aarch64-linux/quantization/__init__.py +2 -2
- build/torch27-cxx11-cu126-aarch64-linux/quantization/_ops.py +3 -3
- build/torch27-cxx11-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} +2 -2
- build/torch27-cxx11-cu126-aarch64-linux/quantization/compressed_tensors.py +34 -33
- build/torch27-cxx11-cu126-aarch64-linux/quantization/cutlass.py +10 -16
- build/torch27-cxx11-cu126-aarch64-linux/quantization/marlin.py +40 -74
- build/torch27-cxx11-cu126-aarch64-linux/quantization/platforms.py +69 -0
- build/torch27-cxx11-cu126-aarch64-linux/quantization/scalar_type.py +19 -2
- build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils.py +231 -170
- build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py +282 -0
- build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py +90 -68
- build/torch27-cxx11-cu128-aarch64-linux/quantization/__init__.py +2 -2
- build/torch27-cxx11-cu128-aarch64-linux/quantization/_ops.py +3 -3
- build/torch27-cxx11-cu128-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so} +2 -2
- build/torch27-cxx11-cu128-aarch64-linux/quantization/compressed_tensors.py +34 -33
- build/torch27-cxx11-cu128-aarch64-linux/quantization/cutlass.py +10 -16
- build/torch27-cxx11-cu128-aarch64-linux/quantization/marlin.py +40 -74
- build/torch27-cxx11-cu128-aarch64-linux/quantization/platforms.py +69 -0
- build/torch27-cxx11-cu128-aarch64-linux/quantization/scalar_type.py +19 -2
- build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils.py +231 -170
- build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils_fp4.py +282 -0
- build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils_fp8.py +90 -68
build/torch26-cxx11-cu126-aarch64-linux/quantization/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 from .compressed_tensors import scaled_fp8_quant, scaled_int8_quant
 from .cutlass import (
+    cutlass_scaled_mm_supports_block_fp8,
     cutlass_scaled_mm_supports_fp8,
     cutlass_scaled_mm,
     cutlass_scaled_mm_azp,
 )
 from .marlin import (
     awq_marlin_repack,
-    fp8_marlin_gemm,
     gptq_marlin_gemm,
     gptq_marlin_repack,
     gptq_marlin_24_gemm,
@@ -25,8 +25,8 @@ __all__ = [
     "awq_marlin_repack",
     "cutlass_scaled_mm",
     "cutlass_scaled_mm_azp",
+    "cutlass_scaled_mm_supports_block_fp8",
     "cutlass_scaled_mm_supports_fp8",
-    "fp8_marlin_gemm",
     "gptq_marlin_24_gemm",
     "gptq_marlin_gemm",
     "gptq_marlin_repack",
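A minimal sketch of probing the new export at runtime (assumes the installed wheel is importable as `quantization`, as the fp4 utilities below do, and that a CUDA device is visible):

    import torch
    import quantization

    major, minor = torch.cuda.get_device_capability()
    capability = major * 10 + minor  # same <major><minor> encoding used by the kernels

    # New in this build: block-wise FP8 support query alongside the per-tensor one.
    print(quantization.cutlass_scaled_mm_supports_fp8(capability))
    print(quantization.cutlass_scaled_mm_supports_block_fp8(capability))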
build/torch26-cxx11-cu126-aarch64-linux/quantization/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_0435ccb
-ops = torch.ops._quantization_0435ccb
+from . import _quantization_9035540
+ops = torch.ops._quantization_9035540

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_0435ccb::{op_name}"
+    return f"_quantization_9035540::{op_name}"
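The op namespace now carries the new build hash. A short usage sketch (assumes the extension import succeeds on the target machine):

    from quantization._ops import ops, add_op_namespace_prefix

    # Ops are exposed under the build-specific namespace, e.g.
    # "_quantization_9035540::gptq_marlin_gemm".
    print(add_op_namespace_prefix("gptq_marlin_gemm"))
    print(hasattr(ops, "gptq_marlin_gemm"))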
build/torch26-cxx11-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:aee128710f3a8587386120a226a6caddd5e77cd7a0296a1f7fad51b4028550b1
+size 159934120
build/torch26-cxx11-cu126-aarch64-linux/quantization/compressed_tensors.py
CHANGED
@@ -2,17 +2,7 @@ from typing import Optional, Tuple

 import torch

-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
-
+from ._ops import ops

 # fp8
 def scaled_fp8_quant(
@@ -21,7 +11,8 @@ def scaled_fp8_quant(
     num_token_padding: Optional[int] = None,
     scale_ub: Optional[torch.Tensor] = None,
     use_per_token_if_dynamic: bool = False,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+    output: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP8 and return quantized tensor and scale.

@@ -42,30 +33,36 @@
     in the dynamic quantization case.

     Returns:
-        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
+        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
             scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert input.ndim == 2
-    shape: Union[Tuple[int, int], torch.Size] = input.shape
-    # For rocm, the output fp8 dtype is torch.float_e3m3fnuz
-    # if current_platform.is_rocm() else torch.float8_e4m3fn
-    out_dtype = torch.float8_e4m3fn
+    assert (input.ndim == 2)
+    shape: Union[tuple[int, int], torch.Size] = input.shape
+    # For ROCm on MI300, the output fp8 dtype is torch.float_e3m3fnuz
+    out_dtype: torch.dtype = current_platform.fp8_dtype()
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
-    output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    if output is None:
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    else:
+        assert num_token_padding is None, \
+            "padding not supported if output passed in"
+        assert output.dtype == out_dtype

     if scale is None:
         if use_per_token_if_dynamic:
-            scale = torch.empty((shape[0], 1),
-                                device=input.device, dtype=torch.float32)
+            scale = torch.empty((shape[0], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            ops.dynamic_per_token_scaled_fp8_quant(
+                output, input.contiguous(), scale, scale_ub)
         else:
             scale = torch.zeros(1, device=input.device, dtype=torch.float32)
             ops.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
-        assert scale.numel() == 1
+        assert (scale.numel() == 1 and num_token_padding is None)
         ops.static_scaled_fp8_quant(output, input, scale)

     return output, scale
@@ -76,8 +73,8 @@
     input: torch.Tensor,
     scale: Optional[torch.Tensor] = None,
     azp: Optional[torch.Tensor] = None,
-    symmetric: bool = True
-) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    symmetric: bool = True
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     """
     Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.

@@ -90,21 +87,25 @@
     symmetric: Whether to use symmetric quantization (scale only, azp ignored).

     Returns:
-        Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
+        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
     """
     output = torch.empty_like(input, dtype=torch.int8)
     if scale is not None:
         # static-per-tensor quantization.
         assert symmetric == (
-            azp
-            is None), "azp must only be provided for asymmetric quantization."
+            azp
+            is None), "azp must only be provided for asymmetric quantization."
         ops.static_scaled_int8_quant(output, input, scale, azp)
         return output, scale, azp

     # dynamic-per-token quantization.
-    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
-                               device=input.device,
-                               dtype=torch.float32)
-    input_azp = None if symmetric else torch.empty_like(input_scales,
-                                                        dtype=torch.int32)
+    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
+                               device=input.device,
+                               dtype=torch.float32)
+    input_azp = None if symmetric else torch.empty_like(input_scales,
+                                                        dtype=torch.int32)
+    ops.dynamic_scaled_int8_quant(output, input.contiguous(),
+                                  input_scales, input_azp)
     return output, input_scales, input_azp
+
+
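A rough usage sketch of the new optional `output` argument to `scaled_fp8_quant` (illustrative only; assumes a CUDA device and that the platform's FP8 dtype resolves to `torch.float8_e4m3fn`):

    import torch
    from quantization import scaled_fp8_quant

    x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")

    # Dynamic per-token quantization; scales are allocated internally.
    q, scales = scaled_fp8_quant(x, use_per_token_if_dynamic=True)

    # Reuse a preallocated FP8 buffer (num_token_padding must be left unset).
    out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    q2, scales2 = scaled_fp8_quant(x, use_per_token_if_dynamic=True, output=out)
    assert q2.data_ptr() == out.data_ptr()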
build/torch26-cxx11-cu126-aarch64-linux/quantization/cutlass.py
CHANGED
@@ -2,22 +2,18 @@ from typing import Optional

 import torch

-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
+from ._ops import ops
+from .platforms import current_platform


 def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return ops.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
+
+
 def cutlass_scaled_mm(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -33,12 +29,10 @@ def cutlass_scaled_mm(
     m = a.shape[0]
     n = b.shape[1]

-    # triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-    # return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    if not cutlass_compatible_b:
+        from .triton_scaled_mm import triton_scaled_mm
+        return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

     out = torch.empty((m, n), dtype=out_dtype, device=a.device)

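A small standalone illustration of the shape gate added to `cutlass_scaled_mm` (the real check lives inline in the function above; this helper is hypothetical and only mirrors its logic):

    import torch

    def needs_triton_fallback(b: torch.Tensor) -> bool:
        # The CUTLASS path requires both dimensions of the quantized weight
        # to be multiples of 16; otherwise the Triton kernel is used.
        return not (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)

    print(needs_triton_fallback(torch.empty(512, 128)))  # False -> CUTLASS path
    print(needs_triton_fallback(torch.empty(512, 120)))  # True  -> triton_scaled_mm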
build/torch26-cxx11-cu126-aarch64-linux/quantization/marlin.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 import torch

@@ -30,58 +30,30 @@ except ImportError as e:
 from .scalar_type import ScalarType


-# fp8 marlin
-def fp8_marlin_gemm(
-    a: torch.Tensor,
-    b_q_weight: torch.Tensor,
-    b_scales: torch.Tensor,
-    workspace: torch.Tensor,
-    num_bits: int,
-    size_m: int,
-    size_n: int,
-    size_k: int,
-) -> torch.Tensor:
-    return ops.fp8_marlin_gemm(
-        a, b_q_weight, b_scales, workspace, num_bits, size_m, size_n, size_k
-    )
-
-
 # gptq_marlin
-def gptq_marlin_gemm(
-    a: torch.Tensor,
-    b_q_weight: torch.Tensor,
-    b_scales: torch.Tensor,
-    b_zeros: torch.Tensor,
-    g_idx: torch.Tensor,
-    perm: torch.Tensor,
-    workspace: torch.Tensor,
-    b_q_type: ScalarType,
-    size_m: int,
-    size_n: int,
-    size_k: int,
-    is_k_full: bool,
-    has_zp: bool = False,
-    use_fp32_reduce: bool = False,
-    is_zp_float: bool = False,
-) -> torch.Tensor:
-    return ops.gptq_marlin_gemm(
-        a,
-        b_q_weight,
-        b_scales,
-        b_zeros,
-        g_idx,
-        perm,
-        workspace,
-        b_q_type.id,
-        size_m,
-        size_n,
-        size_k,
-        is_k_full,
-        has_zp,
-        use_fp32_reduce,
-        is_zp_float,
-    )
-
+def gptq_marlin_gemm(a: torch.Tensor,
+                     c: Optional[torch.Tensor],
+                     b_q_weight: torch.Tensor,
+                     b_scales: torch.Tensor,
+                     global_scale: Optional[torch.Tensor],
+                     b_zeros: Optional[torch.Tensor],
+                     g_idx: Optional[torch.Tensor],
+                     perm: Optional[torch.Tensor],
+                     workspace: torch.Tensor,
+                     b_q_type: ScalarType,
+                     size_m: int,
+                     size_n: int,
+                     size_k: int,
+                     is_k_full: bool = True,
+                     use_atomic_add: bool = False,
+                     use_fp32_reduce: bool = False,
+                     is_zp_float: bool = False) -> torch.Tensor:
+    return ops.gptq_marlin_gemm(a, c, b_q_weight, b_scales,
+                                global_scale, b_zeros, g_idx, perm,
+                                workspace, b_q_type.id, size_m,
+                                size_n, size_k, is_k_full,
+                                use_atomic_add, use_fp32_reduce,
+                                is_zp_float)

 # gptq_marlin
 def gptq_marlin_repack(
@@ -153,14 +125,6 @@ def marlin_qqq_gemm(
 # Fake ops

 if hasattr(ops, "gptq_marlin_24_gemm"):
-    @register_fake(add_op_namespace_prefix("fp8_marlin_gemm"))
-    def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
-                              b_scales: torch.Tensor, workspace: torch.Tensor,
-                              num_bits: int, size_m: torch.SymInt,
-                              size_n: torch.SymInt,
-                              size_k: torch.SymInt) -> torch.Tensor:
-        return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device)
-
     @register_fake(add_op_namespace_prefix("gptq_marlin_24_gemm"))
     def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
                                   b_meta: torch.Tensor, b_scales: torch.Tensor,
@@ -172,20 +136,22 @@ if hasattr(ops, "gptq_marlin_24_gemm"):

     @register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))
     def _gptq_marlin_gemm_fake(a: torch.Tensor,
-                               b_q_weight: torch.Tensor,
-                               b_scales: torch.Tensor,
-                               b_zeros: torch.Tensor,
-                               g_idx: torch.Tensor,
-                               perm: torch.Tensor,
-                               workspace: torch.Tensor,
-                               b_q_type_id: int,
-                               size_m: torch.SymInt,
-                               size_n: torch.SymInt,
-                               size_k: torch.SymInt,
-                               is_k_full: bool,
-                               has_zp: bool = False,
-                               use_fp32_reduce: bool = False,
-                               is_zp_float: bool = False) -> torch.Tensor:
+                               c: Optional[torch.Tensor],
+                               b_q_weight: torch.Tensor,
+                               b_scales: torch.Tensor,
+                               global_scale: Optional[torch.Tensor],
+                               b_zeros: Optional[torch.Tensor],
+                               g_idx: Optional[torch.Tensor],
+                               perm: Optional[torch.Tensor],
+                               workspace: torch.Tensor,
+                               b_q_type_id: int,
+                               size_m: torch.SymInt,
+                               size_n: torch.SymInt,
+                               size_k: torch.SymInt,
+                               is_k_full: bool = True,
+                               use_atomic_add: bool = False,
+                               use_fp32_reduce: bool = False,
+                               is_zp_float: bool = False) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

     @register_fake(add_op_namespace_prefix("marlin_qqq_gemm"))
build/torch26-cxx11-cu126-aarch64-linux/quantization/platforms.py
ADDED
@@ -0,0 +1,69 @@
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from typing import NamedTuple
+
+import torch
+
+IS_ROCM = torch.version.hip is not None
+
+
+class DeviceCapability(NamedTuple):
+    major: int
+    minor: int
+
+    def as_version_str(self) -> str:
+        return f"{self.major}.{self.minor}"
+
+    def to_int(self) -> int:
+        """
+        Express device capability as an integer ``<major><minor>``.
+
+        It is assumed that the minor version is always a single digit.
+        """
+        assert 0 <= self.minor < 10
+        return self.major * 10 + self.minor
+
+
+class Platform(ABC):
+    simple_compile_backend: str = "inductor"
+
+    @classmethod
+    @abstractmethod
+    def get_device_name(cls, device_id: int = 0) -> str: ...
+
+    @abstractmethod
+    def is_rocm(self): ...
+
+
+class CudaPlatform(Platform):
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(0)
+
+    def is_rocm(self):
+        return False
+
+
+class RocmPlatform(Platform):
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(device_id)
+
+    def is_rocm(self):
+        return True
+
+
+current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
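A quick sketch of how the new platform shim is used (requires a visible GPU for the capability query):

    from quantization.platforms import current_platform

    # Resolves to RocmPlatform when torch reports a HIP build, else CudaPlatform.
    print(type(current_platform).__name__, current_platform.is_rocm())

    cap = current_platform.get_device_capability()
    print(cap.as_version_str(), cap.to_int())  # e.g. "9.0" and 90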
build/torch26-cxx11-cu126-aarch64-linux/quantization/scalar_type.py
CHANGED
@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import functools
 import struct
 from dataclasses import dataclass
 from enum import Enum
 from typing import Optional, Union

+_SCALAR_TYPES_ID_MAP = {}
+

 # Mirrors enum in `core/scalar_type.hpp`
 class NanRepr(Enum):
@@ -121,8 +126,8 @@ class ScalarType:
             min_raw = max_raw | sign_bit_double
             return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
         else:
-            assert (not self.is_signed() or
-                    self.size_bits <= 64), "Cannot represent min as a int64_t"
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"

             if self.is_signed():
                 return -(1 << (self.size_bits - 1))
@@ -156,6 +161,8 @@ class ScalarType:
         assert offset <= 64, \
             f"ScalarType fields too big {offset} to fit into an int64"

+        _SCALAR_TYPES_ID_MAP[val] = self
+
         return val

     @property
@@ -293,6 +300,13 @@ class ScalarType:
         ret.id  # noqa B018: make sure the id is cached
         return ret

+    @classmethod
+    def from_id(cls, scalar_type_id: int):
+        if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
+            raise ValueError(
+                f"scalar_type_id {scalar_type_id} doesn't exists.")
+        return _SCALAR_TYPES_ID_MAP[scalar_type_id]
+

 # naming generally follows: https://github.com/jax-ml/ml_dtypes
 # for floating point types (leading f) the scheme is:
@@ -319,6 +333,9 @@ class scalar_types:
     # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
     float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)

+    # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
+
     # "gptq" types
     uint2b2 = ScalarType.uint(2, 2)
     uint3b4 = ScalarType.uint(3, 4)
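The new id registry lets callers recover a `ScalarType` from the integer id that is passed across the custom-op boundary; a short sketch:

    from quantization.scalar_type import ScalarType, scalar_types

    t = scalar_types.float4_e2m1f   # fp4 type added in this commit
    type_id = t.id                  # stable int encoding of the type's fields
    assert ScalarType.from_id(type_id) == t
    print(type_id)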
build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils.py
CHANGED
@@ -1,4 +1,7 @@
-from typing import List, Optional, Tuple
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional

 import numpy
 import torch
@@ -42,7 +45,9 @@ USE_FP32_REDUCE_DEFAULT = True
 # without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
 # TODO: we may want to move this into the C++ so its closer to the actual impl
 def query_marlin_supported_quant_types(
-    has_zp: bool, device_capability: Optional[int] = None
+    has_zp: Optional[bool] = None,
+    include_fp_type: bool = True,
+    device_capability: Optional[int] = None,
 ):
     if device_capability is None:
         capability_tuple = torch.cuda.get_device_capability()
@@ -51,137 +56,141 @@ def query_marlin_supported_quant_types(
     if device_capability < 80:
         return []

+    # - has_zp is True: return quant_types that has zero points
+    # - has_zp is False: return quant_types that has not zero points
+    # - has_zp is None: both
+    if has_zp is None:
+        types0 = query_marlin_supported_quant_types(False, include_fp_type,
+                                                    device_capability)
+        types1 = query_marlin_supported_quant_types(True, include_fp_type,
+                                                    device_capability)
+        return types0 + types1
+
     if has_zp:
         # AWQ style, unsigned + runtime zero-point
-        return [scalar_types.uint4, scalar_types.uint8]
+        return [scalar_types.uint4]
     else:
         # GPTQ style, unsigned + symmetric bias
-        return [
-            scalar_types.uint4b8,
-            scalar_types.uint8b128]
+        res = [scalar_types.uint4b8, scalar_types.uint8b128]
+        if include_fp_type:
+            res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
+        return res


 def _check_marlin_supported(
-    quant_type: ScalarType,
-    group_size: Optional[int],
-    has_zp: bool,
-    device_capability: Optional[int] = None,
-) -> Tuple[bool, Optional[str]]:
+        quant_type: ScalarType,
+        group_size: Optional[int],
+        has_zp: bool,
+        device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:

     if device_capability is None:
         capability_tuple = torch.cuda.get_device_capability()
         device_capability = capability_tuple[0] * 10 + capability_tuple[1]

-    supported_types = query_marlin_supported_quant_types(
-        has_zp, device_capability)
+    supported_types = query_marlin_supported_quant_types(
+        has_zp, True, device_capability)

     if quant_type not in supported_types:
-        return (
-            False,
-            f"Marlin does not support weight_bits = {quant_type}. "
-            f"Only types = {supported_types} "
-            f"are supported (for group_size = {group_size}, "
-            f"device_capability = {device_capability}, zp = {has_zp}).",
-        )
-    if group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
-        return (
-            False,
-            f"Marlin does not support group_size = {group_size}. "
-            f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
-            "are supported.",
-        )
+        return (False, f"Marlin does not support weight_bits = {quant_type}. "
+                f"Only types = {supported_types} "
+                f"are supported (for group_size = {group_size}, "
+                f"device_capability = {device_capability}, zp = {has_zp}).")
+    if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES):
+        return (False, f"Marlin does not support group_size = {group_size}. "
+                f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
+                "are supported.")

     return True, None


-def check_marlin_supported(
-    quant_type: ScalarType,
-    group_size: int,
-    has_zp: bool = False,
-    device_capability: Optional[int] = None,
-) -> bool:
-    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
+def check_marlin_supported(quant_type: ScalarType,
+                           group_size: int,
+                           has_zp: bool = False,
+                           device_capability: Optional[int] = None) -> bool:
+    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp,
+                                      device_capability)
     return cond


-def verify_marlin_supported(
-    quant_type: ScalarType, group_size: int, has_zp: bool = False
-) -> None:
+def verify_marlin_supported(quant_type: ScalarType,
+                            group_size: int,
+                            has_zp: bool = False) -> None:
     cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
     if not cond:
         assert err_msg is not None
         raise ValueError(err_msg)


-def verify_marlin_supports_shape(
-    output_size_per_partition: int,
-    input_size_per_partition: int,
-    input_size: int,
-    group_size: int,
-) -> None:
+def verify_marlin_supports_shape(output_size_per_partition: int,
+                                 input_size_per_partition: int,
+                                 input_size: int, group_size: int) -> None:

     # Validate output_size_per_partition
     if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
-        raise ValueError(
-            f"Weight output_size_per_partition = "
-            f"{output_size_per_partition} is not divisible by "
-            f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
-            "Consider reducing tensor_parallel_size or running "
-            "with --quantization gptq."
-        )
+        raise ValueError(f"Weight output_size_per_partition = "
+                         f"{output_size_per_partition} is not divisible by "
+                         f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
+                         "Consider reducing tensor_parallel_size or running "
+                         "with --quantization gptq.")

     # Validate input_size_per_partition
     if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
-        raise ValueError(
-            f"Weight input_size_per_partition = "
-            f"{input_size_per_partition} is not divisible "
-            f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
-            "Consider reducing tensor_parallel_size or running "
-            "with --quantization gptq."
-        )
-
-    if group_size < input_size and input_size_per_partition % group_size != 0:
+        raise ValueError(f"Weight input_size_per_partition = "
+                         f"{input_size_per_partition} is not divisible "
+                         f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
+                         "Consider reducing tensor_parallel_size or running "
+                         "with --quantization gptq.")
+
+    if (group_size < input_size
+            and input_size_per_partition % group_size != 0):
         raise ValueError(
             f"Weight input_size_per_partition = {input_size_per_partition}"
-            f" is not divisible by group_size = {group_size}."
+            f" is not divisible by group_size = {group_size}. "
             "Consider reducing tensor_parallel_size or running "
-            "with --quantization gptq."
-        )
+            "with --quantization gptq.")


-def check_marlin_supports_shape(
-    output_size_per_partition: int,
-    input_size_per_partition: int,
-    input_size: int,
-    group_size: int,
-) -> Tuple[bool, Optional[str]]:
+def check_marlin_supports_shape(output_size_per_partition: int,
+                                input_size_per_partition: int,
+                                input_size: int, group_size: int) \
+                                    -> tuple[bool, Optional[str]]:
     try:
-        verify_marlin_supports_shape(
-            output_size_per_partition, input_size_per_partition,
-            input_size, group_size)
+        verify_marlin_supports_shape(output_size_per_partition,
+                                     input_size_per_partition, input_size,
+                                     group_size)
     except ValueError as e:
         return False, e.__str__()
     return True, None


-def marlin_make_workspace(
-    output_size_per_partition: int, device: torch.device
-) -> torch.Tensor:
-    max_workspace_size = (
-        output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
-    ) * GPTQ_MARLIN_MAX_PARALLEL
-
-    return torch.zeros(
-        max_workspace_size, dtype=torch.int, device=device, requires_grad=False
-    )
+def marlin_make_workspace(output_size_per_partition: int,
+                          device: torch.device) -> torch.Tensor:
+    max_workspace_size = (output_size_per_partition //
+                          GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
+
+    return torch.zeros(max_workspace_size,
+                       dtype=torch.int,
+                       device=device,
+                       requires_grad=False)
+
+
+def marlin_make_workspace_new(device: torch.device,
+                              max_blocks_per_sm: int = 1) -> torch.Tensor:
+    # In the new marlin kernel, we use the num of threadblocks as workspace
+    # size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+    sms = torch.cuda.get_device_properties(device).multi_processor_count
+    return torch.zeros(sms * max_blocks_per_sm,
+                       dtype=torch.int,
+                       device=device,
+                       requires_grad=False)


 def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
     return (not act_order) or (act_order and not is_row_parallel)


-def marlin_repeat_scales_on_all_ranks(
-    act_order: bool, group_size: int, is_row_parallel: bool
-) -> bool:
+def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
+                                      is_row_parallel: bool) -> bool:
     # Need to repeat scales on every rank if act_ordering or
     # channelwise and RowParallelLinear
     is_channelwise = group_size == -1
@@ -189,35 +198,34 @@ def marlin_repeat_scales_on_all_ranks(


 def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
-    return torch.nn.Parameter(
-        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
-    )
+    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
+                              requires_grad=False)


 def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
-    return torch.nn.Parameter(
-        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
-    )
+    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
+                              requires_grad=False)


-def marlin_sort_g_idx(g_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+def marlin_sort_g_idx(
+        g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
     return g_idx[g_idx_sort_indices], g_idx_sort_indices


 def get_scale_perms():
-    scale_perm: List[int] = []
+    scale_perm: list[int] = []
     for i in range(8):
         scale_perm.extend([i + 8 * j for j in range(8)])
-    scale_perm_single: List[int] = []
+    scale_perm_single: list[int] = []
     for i in range(4):
-        scale_perm_single.extend(
-            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
+        scale_perm_single.extend(
+            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
     return scale_perm, scale_perm_single


-def marlin_permute_scales(
-    s: torch.Tensor, size_k: int, size_n: int, group_size: int
-) -> torch.Tensor:
+def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
+                          group_size: int) -> torch.Tensor:

     scale_perm, scale_perm_single = get_scale_perms()
     if group_size < size_k and group_size != -1:
@@ -247,9 +255,8 @@ def marlin_moe_permute_scales(
     return output


-def marlin_zero_points(
-    zp: torch.Tensor, size_k: int, size_n: int, num_bits: int
-) -> torch.Tensor:
+def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
+                       num_bits: int) -> torch.Tensor:
     # Permute zero-points in a similar way to scales, but do not use the
     # "single" permutation, since zero-points are applied on every MMA
     scale_perm, _ = get_scale_perms()
@@ -270,9 +277,8 @@ def marlin_zero_points(
     return zp


-def awq_to_marlin_zero_points(
-    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
-) -> torch.Tensor:
+def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                              size_n: int, num_bits: int) -> torch.Tensor:
     # AWQ zero-points are quantized and packed on the column dim.
     # In addition, the values are permuted based on dequantizer.
     # Here we undo both of these, and then apply marlin permutation
@@ -294,9 +300,8 @@ def awq_to_marlin_zero_points(
     return marlin_zp


-def moe_awq_to_marlin_zero_points(
-    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
-):
+def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                                  size_n: int, num_bits: int):
     num_experts = q_zp_packed.shape[0]
     output = torch.empty(
         (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
@@ -304,45 +309,97 @@ def moe_awq_to_marlin_zero_points(
         dtype=q_zp_packed.dtype,
     )
     for e in range(num_experts):
-        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n, num_bits)
+        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
+                                              num_bits)
     return output


+def maybe_warn_marlin_atomic_add(device, dtype):
+    if torch.compiler.is_dynamo_compiling():
+        return
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        logger.info_once(
+            "You are running Marlin kernel with bf16 on GPUs before SM90. "
+            "You can consider change to fp16 to achieve better performance "
+            "if possible.")
+
+
+def maybe_warn_marlin_atomic_add_env():
+    if torch.compiler.is_dynamo_compiling():
+        return
+    if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+        return
+    logger.info_once(
+        "Marlin kernel can achieve better performance for small size_n "
+        "with experimental use_atomic_add feature. "
+        "You can consider set environment variable "
+        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible.")
+
+
+def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
+                                 dtype: torch.dtype) -> bool:
+
+    # the performance of atomicAdd is better than global reduce
+    # only when m*n is small and k is large
+    if n >= 2048 or k < 2048 or device.type != "cuda":
+        return False
+
+    # disable atomicAdd reduce by default,
+    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
+    if not envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+        maybe_warn_marlin_atomic_add_env()
+        return False
+
+    # sm8x doesn't support atomicAdd + bfloat16 natively
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        maybe_warn_marlin_atomic_add(device, dtype)
+        return False
+
+    return True
+
+
 def apply_gptq_marlin_linear(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    weight_scale: torch.Tensor,
-    weight_zp: torch.Tensor,
-    g_idx: torch.Tensor,
-    g_idx_sort_indices: torch.Tensor,
-    workspace: torch.Tensor,
-    wtype: ScalarType,
-    output_size_per_partition: int,
-    input_size_per_partition: int,
-    is_k_full: bool,
-    bias: Optional[torch.Tensor] = None,
-    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
-) -> torch.Tensor:
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_zp: torch.Tensor,
+        g_idx: torch.Tensor,
+        g_idx_sort_indices: torch.Tensor,
+        workspace: torch.Tensor,
+        wtype: ScalarType,
+        output_size_per_partition: int,
+        input_size_per_partition: int,
+        is_k_full: bool,
+        bias: Optional[torch.Tensor] = None,
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (output_size_per_partition,)
-
-    output = ops.gptq_marlin_gemm(
-        reshaped_x,
-        weight,
-        weight_scale,
-        weight_zp,
-        g_idx,
-        g_idx_sort_indices,
-        workspace,
-        wtype,
-        size_m=reshaped_x.shape[0],
-        size_n=output_size_per_partition,
-        size_k=input_size_per_partition,
-        is_k_full=is_k_full,
-        has_zp=False,
-        use_fp32_reduce=use_fp32_reduce,
-        is_zp_float=False,
-    )
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=output_size_per_partition,
+                                                  k=reshaped_x.size(1),
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  None,
+                                  weight,
+                                  weight_scale,
+                                  None,
+                                  weight_zp,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  wtype,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  is_k_full=is_k_full,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)

     if bias is not None:
         output.add_(bias)  # In-place add
@@ -351,39 +408,43 @@ def apply_gptq_marlin_linear(


 def apply_awq_marlin_linear(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    weight_scale: torch.Tensor,
-    weight_zp: torch.Tensor,
-    g_idx: torch.Tensor,
-    g_idx_sort_indices: torch.Tensor,
-    workspace: torch.Tensor,
-    quant_type: ScalarType,
-    output_size_per_partition: int,
-    input_size_per_partition: int,
-    bias: Optional[torch.Tensor] = None,
-    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
-) -> torch.Tensor:
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_zp: torch.Tensor,
+        g_idx: torch.Tensor,
+        g_idx_sort_indices: torch.Tensor,
+        workspace: torch.Tensor,
+        quant_type: ScalarType,
+        output_size_per_partition: int,
+        input_size_per_partition: int,
+        bias: Optional[torch.Tensor] = None,
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (output_size_per_partition,)
-
-    output = ops.gptq_marlin_gemm(
-        reshaped_x,
-        weight,
-        weight_scale,
-        weight_zp,
-        g_idx,
-        g_idx_sort_indices,
-        workspace,
-        quant_type,
-        size_m=reshaped_x.shape[0],
-        size_n=output_size_per_partition,
-        size_k=input_size_per_partition,
-        is_k_full=True,
-        has_zp=True,
-        use_fp32_reduce=use_fp32_reduce,
-        is_zp_float=False,
-    )
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=output_size_per_partition,
+                                                  k=reshaped_x.size(1),
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  None,
+                                  weight,
+                                  weight_scale,
+                                  None,
+                                  weight_zp,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  quant_type,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)

     if bias is not None:
         output.add_(bias)  # In-place add
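A runnable sketch of the shape-validation helpers whose signatures changed above (module path assumed to be `quantization.utils.marlin_utils` per this build's layout; no GPU is needed for these checks):

    from quantization.utils.marlin_utils import check_marlin_supports_shape

    # A 4-way tensor-parallel shard of an 8192x8192 layer with group_size 128.
    ok, err = check_marlin_supports_shape(output_size_per_partition=2048,
                                          input_size_per_partition=8192,
                                          input_size=8192,
                                          group_size=128)
    print(ok, err)   # (True, None)

    # A shard that violates the divisibility constraints returns the error
    # string instead of raising, so callers can fall back to another kernel.
    ok, err = check_marlin_supports_shape(output_size_per_partition=2048,
                                          input_size_per_partition=8200,
                                          input_size=8192,
                                          group_size=128)
    print(ok, err)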
build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
| 3 |
+
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
import quantization as ops
|
| 9 |
+
|
| 10 |
+
from .marlin_utils import (
|
| 11 |
+
USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
|
| 12 |
+
should_use_atomic_add_reduce)
|
| 13 |
+
from quantization.scalar_type import scalar_types
|
| 14 |
+
|
| 15 |
+
FP4_MARLIN_SUPPORTED_GROUP_SIZES = [16]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def is_fp4_marlin_supported():
|
| 19 |
+
capability = torch.cuda.get_device_capability()
|
| 20 |
+
capability = capability[0] * 10 + capability[1]
|
| 21 |
+
return capability >= 80
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def fp4_marlin_process_scales(marlin_scales):
|
| 25 |
+
if not (marlin_scales >= 0).all():
|
| 26 |
+
logger.warning_once(
|
| 27 |
+
"NVFP4 Marlin assumes the scales to be >=0, but has encountered "
|
| 28 |
+
"negative scales. Accuracy will likely be degraded. This is "
|
| 29 |
+
"because it changes the scales from FP8-S1E4M3 to a special "
|
| 30 |
+
"FP8-S0E5M3 format to speedup the dequantization.")
|
| 31 |
+
|
| 32 |
+
# convert to half first, we would convert to fp8 later
|
| 33 |
+
marlin_scales = marlin_scales.to(torch.half)
|
| 34 |
+
|
| 35 |
+
# 8 is the number of scale number using by one thread
|
| 36 |
+
marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
|
| 37 |
+
marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
|
| 38 |
+
marlin_scales.size(0) * 2, -1)
|
| 39 |
+
|
| 40 |
+
# fit the layout of fp8 dequantization
|
| 41 |
+
marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
|
| 42 |
+
marlin_scales.size(0), -1)
|
| 43 |
+
|
| 44 |
+
# We assume that weight_scale (FP8-S1E4M3) is always greater
|
| 45 |
+
# than or equal to 0. So we can convert
|
| 46 |
+
# (weight_scale * (2 ** 7) to a special FP8-S0E5M3 format.
|
| 47 |
+
# After multiplying by 2 ** 7, the top bit of FP8-S0E5M3 would always be 1
|
| 48 |
+
# when weight_scale > 0. This allows us to have an exponent bias
|
| 49 |
+
# closer to zero after dequantization.
|
| 50 |
+
|
| 51 |
+
marlin_scales = (marlin_scales * (2**7)).view(torch.int16) << 1
|
| 52 |
+
marlin_scales = marlin_scales.view(torch.float8_e4m3fn)
|
| 53 |
+
marlin_scales = marlin_scales[:, 1::2].contiguous()
|
| 54 |
+
|
| 55 |
+
return marlin_scales
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def fp4_marlin_process_global_scale(global_scale):
|
| 59 |
+
assert global_scale.dtype in [torch.half, torch.bfloat16]
|
| 60 |
+
fp4_exponent = 2
|
| 61 |
+
if global_scale.dtype == torch.half:
|
| 62 |
+
target_exponent = 5
|
| 63 |
+
elif global_scale.dtype == torch.bfloat16:
|
| 64 |
+
target_exponent = 8
|
| 65 |
+
# exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
|
| 66 |
+
# exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
|
| 67 |
+
exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
|
| 68 |
+
return global_scale * (2.0**(exponent_bias - 7))
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+

def apply_fp4_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_scale_2: torch.Tensor,
        workspace: torch.Tensor,
        size_n: int,
        size_k: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    # For GPUs that lack FP4 hardware support, we can leverage the
    # Marlin kernel for fast weight-only FP4 quantization

    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (size_n, )

    use_atomic_add = should_use_atomic_add_reduce(
        m=reshaped_x.size(0), n=size_n, k=size_k,
        device=input.device, dtype=input.dtype)

    output = ops.gptq_marlin_gemm(
        a=reshaped_x, c=None, b_q_weight=weight, b_scales=weight_scale,
        global_scale=weight_scale_2, b_zeros=None, g_idx=None, perm=None,
        workspace=workspace, b_q_type=scalar_types.float4_e2m1f,
        size_m=reshaped_x.size(0), size_n=size_n, size_k=size_k,
        use_atomic_add=use_atomic_add, use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    part_size_n = layer.output_size_per_partition
    part_size_k = layer.input_size_per_partition
    param_dtype = layer.params_dtype

    assert layer.weight.shape == (part_size_n, part_size_k // 2)

    device = layer.weight.device

    # WORKSPACE
    layer.workspace = marlin_make_workspace_new(device)

    # WEIGHT
    # Repack weights to marlin format
    perm = torch.empty(0, dtype=torch.int, device=device)
    qweight = layer.weight.view(torch.int32).T.contiguous()

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=qweight, perm=perm,
        size_k=part_size_k, size_n=part_size_n, num_bits=4)
    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)

    # WEIGHT SCALES
    # Permute scales
    weight_scale = layer.weight_scale.T.to(param_dtype)
    weight_scale = marlin_permute_scales(
        s=weight_scale, size_k=part_size_k, size_n=part_size_n, group_size=16)
    weight_scale = fp4_marlin_process_scales(weight_scale)
    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)

    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
                                              requires_grad=False)

    return


def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    e = layer.num_experts
    k = layer.hidden_size
    n = layer.intermediate_size_per_partition

    # WORKSPACE
    device = layer.w13_weight.device
    param_dtype = layer.params_dtype
    layer.workspace = marlin_make_workspace_new(device, 4)
    perm = torch.empty(0, dtype=torch.int, device=device)

    # WEIGHT
    # Repack weights to marlin format
    for name in ["w13_weight", "w2_weight"]:
        weight = getattr(layer, name)
        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        assert weight.shape == (e, size_n, size_k // 2)

        for i in range(e):
            qweight = weight[i].view(torch.int32).T.contiguous()
            marlin_qweight = ops.gptq_marlin_repack(
                b_q_weight=qweight, perm=perm,
                size_k=size_k, size_n=size_n, num_bits=4)
            tensor_list.append(marlin_qweight)

        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        weight = torch.nn.Parameter(weight, requires_grad=False)
        setattr(layer, name, weight)

    # WEIGHT SCALES
    # Permute scales
    for name in ["w13", "w2"]:
        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)

        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        for i in range(e):
            marlin_scales = marlin_permute_scales(
                s=scales[i].T, size_k=size_k, size_n=size_n, group_size=16)
            marlin_scales = fp4_marlin_process_scales(marlin_scales)
            tensor_list.append(marlin_scales)

        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        scales = torch.nn.Parameter(scales, requires_grad=False)
        setattr(layer, name + "_weight_scale", scales)

        global_scale = fp4_marlin_process_global_scale(global_scale)
        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
        setattr(layer, name + "_weight_scale_2", global_scale)
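A small shape sketch of the packing that prepare_fp4_layer_for_marlin expects (hypothetical sizes, CPU only; nothing here calls the CUDA kernels):

import torch

size_n, size_k = 8, 32  # hypothetical output/input sizes
# Two FP4 values are stored per byte, so the raw weight is a
# (size_n, size_k // 2) uint8 tensor; viewing it as int32, as done before
# gptq_marlin_repack, packs eight FP4 nibbles per element.
packed = torch.zeros(size_n, size_k // 2, dtype=torch.uint8)
assert packed.view(torch.int32).shape == (size_n, size_k // 8)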

def rand_marlin_weight_fp4_like(weight, group_size):
    assert group_size > 0
    size_n, size_k = weight.shape
    device = weight.device

    scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
    global_scale = scales.max() / 448
    scales = (scales / global_scale).to(torch.float8_e4m3fn)

    fp4_weight = torch.randint(0, 256, (size_n, size_k // 2),
                               dtype=torch.uint8, device=weight.device)
    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
                         ((fp4_weight & 0b01110000) >> 2))
    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)

    fp4_weight2 = fp4_weight << 4
    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
                         ((fp4_weight2 & 0b01110000) >> 2))
    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)

    weight_ref = torch.cat(
        [fp4_weight_part_2.unsqueeze(2),
         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
    weight_ref = weight_ref * global_scale.to(weight.dtype) * \
        scales.repeat_interleave(group_size, 1).to(weight.dtype)

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k, size_n=size_n, num_bits=4)

    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
                                          size_k=size_k, size_n=size_n,
                                          group_size=group_size)
    marlin_scales = fp4_marlin_process_scales(marlin_scales)

    global_scale = fp4_marlin_process_global_scale(global_scale)

    return weight_ref.T, marlin_qweight, marlin_scales, global_scale
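The bit trick used in rand_marlin_weight_fp4_like above can be verified in isolation: moving an FP4 E2M1 nibble's sign bit to bit 7 and its exponent/mantissa bits down by two positions lands them in FP8 E4M3 bit positions, and multiplying by 2**6 compensates the exponent-bias difference. A minimal CPU sketch, assuming a PyTorch build with float8 support; the expected-value table is the standard non-negative E2M1 set:

import torch

expected = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]  # non-negative E2M1 values
for code in range(8):
    byte = torch.tensor([code << 4], dtype=torch.uint8)  # nibble in the high half
    remapped = (byte & 0b10000000) | ((byte & 0b01110000) >> 2)
    value = remapped.view(torch.float8_e4m3fn).to(torch.float32) * (2**6)
    assert value.item() == expected[code]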
build/torch26-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py
CHANGED

The old FP8 path, which packed weights byte-by-byte and relied on the dedicated fp8_marlin_gemm op, is removed, including the previous prepare_fp8_layer_for_marlin:

-def prepare_fp8_layer_for_marlin(
-        layer: torch.nn.Module, strategy: str = "tensor") -> None:
-    part_size_n = layer.output_size_per_partition
-    part_size_k = layer.input_size_per_partition
-
-    device = layer.weight.device
-
-    # WORKSPACE
-    layer.workspace = marlin_make_workspace(part_size_n, device)
-
-    # WEIGHT
-    # Repack weights to marlin format
-    marlin_qweight = ops.gptq_marlin_repack(
-        b_q_weight=pack_fp8_to_int32(layer.weight),
-        perm=torch.empty(0, dtype=torch.int, device=device),
-        size_k=part_size_k, size_n=part_size_n, num_bits=8,
-    )
-    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
-
-    # WEIGHT SCALES
-    scales = layer.weight_scale.to(layer.orig_dtype)
-    # Permute scales
-    marlin_scales = marlin_permute_scales(
-        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=-1)
-    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)

The updated module now routes FP8 through the generic Marlin GEMM:

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

import quantization as ops

-from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+from .marlin_utils import USE_FP32_REDUCE_DEFAULT, marlin_make_workspace, marlin_permute_scales


def is_fp8_marlin_supported():
    # ... unchanged ...
    return capability >= 80


def fp8_fused_exponent_bias_into_scales(scales):
    fp8_exponent = 4
    if scales.dtype == torch.half:
        target_exponent = 5
    elif scales.dtype == torch.bfloat16:
        target_exponent = 8
    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
    s = torch.ones_like(scales) * 2
    s = s**exponent_bias
    return scales * s
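The fused exponent bias for FP8 scales works the same way as the FP4 variant above, only with a 4-bit exponent. A minimal plain-Python check of the resulting factors (the helper name fp8_fused_factor is invented for illustration):

def fp8_fused_factor(is_bf16: bool) -> float:
    fp8_exponent = 4
    target_exponent = 8 if is_bf16 else 5
    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
    return 2.0**exponent_bias

assert fp8_fused_factor(False) == 2.0**8    # fp16: 16 - 8 = 8
assert fp8_fused_factor(True) == 2.0**120   # bf16: 128 - 8 = 120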

def apply_fp8_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        workspace: torch.Tensor,
        size_n: int,
        size_k: int,
        bias: Optional[torch.Tensor],
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    # For GPUs that lack FP8 hardware support, we can leverage the
    # Marlin kernel for fast weight-only FP8 quantization

    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (size_n, )

    use_atomic_add = should_use_atomic_add_reduce(
        m=reshaped_x.size(0), n=size_n, k=size_k,
        device=input.device, dtype=input.dtype)

    output = ops.gptq_marlin_gemm(
        a=reshaped_x, c=None, b_q_weight=weight, b_scales=weight_scale,
        global_scale=None, b_zeros=None, g_idx=None, perm=None,
        workspace=workspace, b_q_type=scalar_types.float8_e4m3fn,
        size_m=reshaped_x.size(0), size_n=size_n, size_k=size_k,
        use_atomic_add=use_atomic_add, use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
                      size_k_first: bool = True) -> torch.Tensor:
    """
    Repack FP8 weights to gptq format (packed int32 elements)
    """
    assert fp8_tensor.dtype == torch.float8_e4m3fn
    assert fp8_tensor.ndim == 2

    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
    fp8_tensor = fp8_tensor.contiguous()
    # fp8_tensor is contiguous and has shape (N, K) now;
    # with `.view(torch.int32)` it becomes (N, K // 4)
    int32_tensor = fp8_tensor.view(torch.int32)
    return int32_tensor.T.contiguous() if size_k_first else int32_tensor


def marlin_quant_fp8_torch(weight, group_size):
    size_n, size_k = weight.shape
    device = weight.device

    if group_size != -1:
        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
        repeated_scales = scales.repeat_interleave(group_size, 1)
        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
    else:
        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
        repeated_scales = scales.repeat_interleave(size_k, 1)
        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales

    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=packed_weight,
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k, size_n=size_n, num_bits=8)

    marlin_scales = marlin_permute_scales(s=scales.T, size_k=size_k,
                                          size_n=size_n,
                                          group_size=group_size)

    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)

    return weight_ref.T, marlin_qweight, marlin_scales
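A shape-only sketch of pack_fp8_to_int32 (CPU, hypothetical sizes): four FP8 bytes are fused into each int32, so a (K, N) weight passed with size_k_first=True comes back as (K // 4, N).

import torch

k, n = 16, 2
w = torch.zeros(k, n, dtype=torch.uint8).view(torch.float8_e4m3fn)  # stand-in FP8 weight
packed = w.T.contiguous().view(torch.int32)         # (N, K // 4), as in the function body
assert packed.shape == (n, k // 4)
assert packed.T.contiguous().shape == (k // 4, n)    # the size_k_first=True return shape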
build/torch26-cxx98-cu126-aarch64-linux/quantization/__init__.py
CHANGED

@@ -1,12 +1,12 @@
 from .compressed_tensors import scaled_fp8_quant, scaled_int8_quant
 from .cutlass import (
+    cutlass_scaled_mm_supports_block_fp8,
     cutlass_scaled_mm_supports_fp8,
     cutlass_scaled_mm,
     cutlass_scaled_mm_azp,
 )
 from .marlin import (
     awq_marlin_repack,
-    fp8_marlin_gemm,
     gptq_marlin_gemm,
     gptq_marlin_repack,
     gptq_marlin_24_gemm,

@@ -25,8 +25,8 @@ __all__ = [
     "awq_marlin_repack",
     "cutlass_scaled_mm",
     "cutlass_scaled_mm_azp",
+    "cutlass_scaled_mm_supports_block_fp8",
     "cutlass_scaled_mm_supports_fp8",
-    "fp8_marlin_gemm",
     "gptq_marlin_24_gemm",
     "gptq_marlin_gemm",
     "gptq_marlin_repack",
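A quick import check of the public surface after this change (a sketch that assumes the built wheel is importable as `quantization`, which is how the modules in this repo refer to it):

from quantization import (cutlass_scaled_mm_supports_block_fp8,
                          gptq_marlin_gemm, scaled_fp8_quant)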
build/torch26-cxx98-cu126-aarch64-linux/quantization/_ops.py
CHANGED

@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_0435ccb
-ops = torch.ops._quantization_0435ccb
+from . import _quantization_9035540
+ops = torch.ops._quantization_9035540

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_0435ccb::{op_name}"
+    return f"_quantization_9035540::{op_name}"
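The prefixed name is the fully-qualified op name used when registering fake (meta) implementations for torch.compile, as marlin.py does further down. A minimal check, assuming the built wheel is importable as `quantization`:

from quantization._ops import add_op_namespace_prefix

assert (add_op_namespace_prefix("gptq_marlin_gemm")
        == "_quantization_9035540::gptq_marlin_gemm")
# This string is what gets passed to register_fake, e.g.
#     @register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))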
build/torch26-cxx98-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3685a434362226370f1956f59790a58d2f4c8999f9f35acafd25ca9d73bfc5ae
+size 159991696
build/torch26-cxx98-cu126-aarch64-linux/quantization/compressed_tensors.py
CHANGED

The try/except fallback around the ops import is replaced with a plain import:

-try:
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
+from ._ops import ops

The two quantization helpers are updated as follows:

# fp8
def scaled_fp8_quant(
    input: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    num_token_padding: Optional[int] = None,
    scale_ub: Optional[torch.Tensor] = None,
    use_per_token_if_dynamic: bool = False,
    output: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize input tensor to FP8 and return quantized tensor and scale.
    ...
    Returns:
        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
        scaling factor.
    """
    # This code assumes batch_dim and num_tokens are flattened
    assert (input.ndim == 2)
    shape: Union[tuple[int, int], torch.Size] = input.shape
    # For ROCm on MI300, the output fp8 dtype is torch.float8_e4m3fnuz
    out_dtype: torch.dtype = current_platform.fp8_dtype()
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
    if output is None:
        output = torch.empty(shape, device=input.device, dtype=out_dtype)
    else:
        assert num_token_padding is None, \
            "padding not supported if output passed in"
        assert output.dtype == out_dtype

    if scale is None:
        if use_per_token_if_dynamic:
            scale = torch.empty((shape[0], 1),
                                device=input.device,
                                dtype=torch.float32)
            ops.dynamic_per_token_scaled_fp8_quant(
                output, input.contiguous(), scale, scale_ub)
        else:
            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
            ops.dynamic_scaled_fp8_quant(output, input, scale)
    else:
        # num_token_padding not implemented for this case
        assert (scale.numel() == 1 and num_token_padding is None)
        ops.static_scaled_fp8_quant(output, input, scale)

    return output, scale


def scaled_int8_quant(
    input: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    azp: Optional[torch.Tensor] = None,
    symmetric: bool = True
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """
    Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.
    ...
    Returns:
        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
    """
    output = torch.empty_like(input, dtype=torch.int8)
    if scale is not None:
        # static-per-tensor quantization.
        assert symmetric == (
            azp is None), "azp must only be provided for asymmetric quantization."
        ops.static_scaled_int8_quant(output, input, scale, azp)
        return output, scale, azp

    # dynamic-per-token quantization.
    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
                               device=input.device,
                               dtype=torch.float32)
    input_azp = None if symmetric else torch.empty_like(input_scales,
                                                        dtype=torch.int32)
    ops.dynamic_scaled_int8_quant(output, input.contiguous(),
                                  input_scales, input_azp)
    return output, input_scales, input_azp
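A usage sketch for the updated scaled_fp8_quant; it assumes a CUDA device and that this kernel package is importable as `quantization`, as in the modules above:

import torch
import quantization as q

x = torch.randn(16, 4096, dtype=torch.half, device="cuda")
# Dynamic per-token quantization: one float32 scale per row of the input.
out, scale = q.scaled_fp8_quant(x, use_per_token_if_dynamic=True)
assert out.dtype == torch.float8_e4m3fn and scale.shape == (16, 1)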
build/torch26-cxx98-cu126-aarch64-linux/quantization/cutlass.py
CHANGED

@@ -2,22 +2,18 @@ from typing import Optional

 import torch

+from ._ops import ops
+from .platforms import current_platform


 def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return ops.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
+
+
 def cutlass_scaled_mm(
     a: torch.Tensor,
     b: torch.Tensor,

@@ -33,12 +29,10 @@ def cutlass_scaled_mm(
     m = a.shape[0]
     n = b.shape[1]

-    # triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-    # return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    if not cutlass_compatible_b:
+        from .triton_scaled_mm import triton_scaled_mm
+        return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
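The new fallback is keyed purely on divisibility of b's dimensions; a quick shape-only illustration of the condition (hypothetical sizes, no kernels involved):

import torch

b = torch.empty(4096, 4104)   # second dim is not a multiple of 16
cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
assert not cutlass_compatible_b  # such a b would be routed to triton_scaled_mm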
build/torch26-cxx98-cu126-aarch64-linux/quantization/marlin.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 import torch

The dedicated FP8 wrapper (and its registered fake op) is removed:

-# fp8 marlin
-def fp8_marlin_gemm(
-    a: torch.Tensor,
-    b_q_weight: torch.Tensor,
-    b_scales: torch.Tensor,
-    workspace: torch.Tensor,
-    num_bits: int,
-    size_m: int,
-    size_n: int,
-    size_k: int,
-) -> torch.Tensor:
-    return ops.fp8_marlin_gemm(
-        a, b_q_weight, b_scales, workspace, num_bits, size_m, size_n, size_k
-    )

The gptq_marlin_gemm wrapper now covers that path and gains the optional output, global-scale and zero-point tensors plus the use_atomic_add flag:

# gptq_marlin
def gptq_marlin_gemm(a: torch.Tensor, c: Optional[torch.Tensor],
                     b_q_weight: torch.Tensor, b_scales: torch.Tensor,
                     global_scale: Optional[torch.Tensor],
                     b_zeros: Optional[torch.Tensor],
                     g_idx: Optional[torch.Tensor],
                     perm: Optional[torch.Tensor],
                     workspace: torch.Tensor, b_q_type: ScalarType,
                     size_m: int, size_n: int, size_k: int,
                     is_k_full: bool = True,
                     use_atomic_add: bool = False,
                     use_fp32_reduce: bool = False,
                     is_zp_float: bool = False) -> torch.Tensor:
    return ops.gptq_marlin_gemm(a, c, b_q_weight, b_scales, global_scale,
                                b_zeros, g_idx, perm, workspace, b_q_type.id,
                                size_m, size_n, size_k, is_k_full,
                                use_atomic_add, use_fp32_reduce, is_zp_float)

The registered fake op is updated to the matching signature:

@register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))
def _gptq_marlin_gemm_fake(a: torch.Tensor, c: Optional[torch.Tensor],
                           b_q_weight: torch.Tensor, b_scales: torch.Tensor,
                           global_scale: Optional[torch.Tensor],
                           b_zeros: Optional[torch.Tensor],
                           g_idx: Optional[torch.Tensor],
                           perm: Optional[torch.Tensor],
                           workspace: torch.Tensor, b_q_type_id: int,
                           size_m: torch.SymInt, size_n: torch.SymInt,
                           size_k: torch.SymInt,
                           is_k_full: bool = True,
                           use_atomic_add: bool = False,
                           use_fp32_reduce: bool = False,
                           is_zp_float: bool = False) -> torch.Tensor:
    return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

@register_fake(add_op_namespace_prefix("marlin_qqq_gemm"))
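Under torch.compile the fake only has to produce a correctly shaped, correctly typed output so the graph can be traced without launching the CUDA kernel. A minimal illustration of that contract (plain PyTorch; the helper name fake_gemm_output and the sizes are invented for illustration):

import torch

def fake_gemm_output(a: torch.Tensor, size_m: int, size_n: int) -> torch.Tensor:
    # Mirrors the registered fake: no computation, just shape/dtype/device.
    return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

a = torch.randn(4, 128)
assert fake_gemm_output(a, 4, 256).shape == (4, 256)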
build/torch26-cxx98-cu126-aarch64-linux/quantization/platforms.py
ADDED

from abc import ABC, abstractmethod
from functools import lru_cache
from typing import NamedTuple

import torch

IS_ROCM = torch.version.hip is not None


class DeviceCapability(NamedTuple):
    major: int
    minor: int

    def as_version_str(self) -> str:
        return f"{self.major}.{self.minor}"

    def to_int(self) -> int:
        """
        Express device capability as an integer ``<major><minor>``.

        It is assumed that the minor version is always a single digit.
        """
        assert 0 <= self.minor < 10
        return self.major * 10 + self.minor


class Platform(ABC):
    simple_compile_backend: str = "inductor"

    @classmethod
    @abstractmethod
    def get_device_name(cls, device_id: int = 0) -> str: ...

    @abstractmethod
    def is_rocm(self): ...


class CudaPlatform(Platform):
    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(0)

    def is_rocm(self):
        return False


class RocmPlatform(Platform):
    @classmethod
    @lru_cache(maxsize=8)
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)

    def is_rocm(self):
        return True


current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
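A small usage sketch of the new platform shim (assumes a CUDA machine and that the wheel is importable as `quantization`; on ROCm builds torch.version.hip is set and the RocmPlatform branch is taken instead):

from quantization.platforms import current_platform

cap = current_platform.get_device_capability()
print(current_platform.get_device_name(), cap.as_version_str(), cap.to_int())
# e.g. prints the device name, "9.0", 90 on an SM90 GPU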
build/torch26-cxx98-cu126-aarch64-linux/quantization/scalar_type.py
CHANGED

@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import functools
 import struct
 from dataclasses import dataclass
 from enum import Enum
 from typing import Optional, Union

+_SCALAR_TYPES_ID_MAP = {}
+

 # Mirrors enum in `core/scalar_type.hpp`
 class NanRepr(Enum):

@@ -121,8 +126,8 @@ class ScalarType:
            min_raw = max_raw | sign_bit_double
            return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
        else:
+           assert (not self.is_signed() or self.size_bits
+                   <= 64), "Cannot represent min as a int64_t"

            if self.is_signed():
                return -(1 << (self.size_bits - 1))

@@ -156,6 +161,8 @@ class ScalarType:
        assert offset <= 64, \
            f"ScalarType fields too big {offset} to fit into an int64"

+       _SCALAR_TYPES_ID_MAP[val] = self
+
        return val

    @property

@@ -293,6 +300,13 @@ class ScalarType:
        ret.id  # noqa B018: make sure the id is cached
        return ret

+   @classmethod
+   def from_id(cls, scalar_type_id: int):
+       if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
+           raise ValueError(
+               f"scalar_type_id {scalar_type_id} doesn't exist.")
+       return _SCALAR_TYPES_ID_MAP[scalar_type_id]
+

 # naming generally follows: https://github.com/jax-ml/ml_dtypes
 # for floating point types (leading f) the scheme is:

@@ -319,6 +333,9 @@ class scalar_types:
    # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
    float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)

+   # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+   float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
+
    # "gptq" types
    uint2b2 = ScalarType.uint(2, 2)
    uint3b4 = ScalarType.uint(3, 4)
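The id map makes scalar types round-trippable across the op boundary, where only the packed integer id is passed. A minimal sketch (assuming the wheel is importable as `quantization`):

from quantization.scalar_type import ScalarType, scalar_types

t = scalar_types.float4_e2m1f
assert ScalarType.from_id(t.id) == t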
build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils.py
CHANGED

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import numpy
import torch

# ... unchanged ...

# without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
# TODO: we may want to move this into the C++ so its closer to the actual impl
def query_marlin_supported_quant_types(
    has_zp: Optional[bool] = None,
    include_fp_type: bool = True,
    device_capability: Optional[int] = None,
):
    if device_capability is None:
        capability_tuple = torch.cuda.get_device_capability()
        device_capability = capability_tuple[0] * 10 + capability_tuple[1]

    if device_capability < 80:
        return []

    # - has_zp is True: return quant_types that have zero points
    # - has_zp is False: return quant_types that have no zero points
    # - has_zp is None: both
    if has_zp is None:
        types0 = query_marlin_supported_quant_types(False, include_fp_type,
                                                    device_capability)
        types1 = query_marlin_supported_quant_types(True, include_fp_type,
                                                    device_capability)
        return types0 + types1

    if has_zp:
        # AWQ style, unsigned + runtime zero-point
        return [scalar_types.uint4]
    else:
        # GPTQ style, unsigned + symmetric bias
        res = [scalar_types.uint4b8, scalar_types.uint8b128]
        if include_fp_type:
            res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
        return res


def _check_marlin_supported(
        quant_type: ScalarType,
        group_size: Optional[int],
        has_zp: bool,
        device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:

    if device_capability is None:
        capability_tuple = torch.cuda.get_device_capability()
        device_capability = capability_tuple[0] * 10 + capability_tuple[1]

    supported_types = query_marlin_supported_quant_types(
        has_zp, True, device_capability)

    if quant_type not in supported_types:
        return (False, f"Marlin does not support weight_bits = {quant_type}. "
                f"Only types = {supported_types} "
                f"are supported (for group_size = {group_size}, "
                f"device_capability = {device_capability}, zp = {has_zp}).")
    if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES):
        return (False, f"Marlin does not support group_size = {group_size}. "
                f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
                "are supported.")

    return True, None


def check_marlin_supported(quant_type: ScalarType,
                           group_size: int,
                           has_zp: bool = False,
                           device_capability: Optional[int] = None) -> bool:
    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp,
                                      device_capability)
    return cond


def verify_marlin_supported(quant_type: ScalarType,
                            group_size: int,
                            has_zp: bool = False) -> None:
    cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
    if not cond:
        assert err_msg is not None
        raise ValueError(err_msg)


def verify_marlin_supports_shape(output_size_per_partition: int,
                                 input_size_per_partition: int,
                                 input_size: int, group_size: int) -> None:

    # Validate output_size_per_partition
    if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
        raise ValueError(f"Weight output_size_per_partition = "
                         f"{output_size_per_partition} is not divisible by "
                         f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
                         "Consider reducing tensor_parallel_size or running "
                         "with --quantization gptq.")

    # Validate input_size_per_partition
    if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
        raise ValueError(f"Weight input_size_per_partition = "
                         f"{input_size_per_partition} is not divisible "
                         f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
                         "Consider reducing tensor_parallel_size or running "
                         "with --quantization gptq.")

    if (group_size < input_size
            and input_size_per_partition % group_size != 0):
        raise ValueError(
            f"Weight input_size_per_partition = {input_size_per_partition}"
            f" is not divisible by group_size = {group_size}. "
            "Consider reducing tensor_parallel_size or running "
            "with --quantization gptq.")


def check_marlin_supports_shape(output_size_per_partition: int,
                                input_size_per_partition: int,
                                input_size: int, group_size: int) \
                                    -> tuple[bool, Optional[str]]:
    try:
        verify_marlin_supports_shape(output_size_per_partition,
                                     input_size_per_partition, input_size,
                                     group_size)
    except ValueError as e:
        return False, e.__str__()
    return True, None


def marlin_make_workspace(output_size_per_partition: int,
                          device: torch.device) -> torch.Tensor:
    max_workspace_size = (output_size_per_partition //
                          GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL

    return torch.zeros(max_workspace_size,
                       dtype=torch.int,
                       device=device,
                       requires_grad=False)


def marlin_make_workspace_new(device: torch.device,
                              max_blocks_per_sm: int = 1) -> torch.Tensor:
    # In the new marlin kernel, we use the num of threadblocks as workspace
    # size. The num of threadblocks is sms_count * max_blocks_per_sm.
    sms = torch.cuda.get_device_properties(device).multi_processor_count
    return torch.zeros(sms * max_blocks_per_sm,
                       dtype=torch.int,
                       device=device,
                       requires_grad=False)


def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
    return (not act_order) or (act_order and not is_row_parallel)


def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
                                      is_row_parallel: bool) -> bool:
    # Need to repeat scales on every rank if act_ordering or
    # channelwise and RowParallelLinear
    is_channelwise = group_size == -1
    # ... unchanged ...


def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
                              requires_grad=False)


def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
                              requires_grad=False)


def marlin_sort_g_idx(
        g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
    return g_idx[g_idx_sort_indices], g_idx_sort_indices


def get_scale_perms():
    scale_perm: list[int] = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    scale_perm_single: list[int] = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return scale_perm, scale_perm_single


def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
                          group_size: int) -> torch.Tensor:

    scale_perm, scale_perm_single = get_scale_perms()
    if group_size < size_k and group_size != -1:
        # ... unchanged ...


def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
                       num_bits: int) -> torch.Tensor:
    # Permute zero-points in a similar way to scales, but do not use the
    # "single" permutation, since zero-points are applied on every MMA
    scale_perm, _ = get_scale_perms()
    # ... unchanged ...


def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
                              size_n: int, num_bits: int) -> torch.Tensor:
    # AWQ zero-points are quantized and packed on the column dim.
    # In addition, the values are permuted based on dequantizer.
    # Here we undo both of these, and then apply marlin permutation
    # ... unchanged ...


def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
                                  size_n: int, num_bits: int):
    num_experts = q_zp_packed.shape[0]
    output = torch.empty(
        (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
        device=q_zp_packed.device,
        dtype=q_zp_packed.dtype,
    )
    for e in range(num_experts):
        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
                                              num_bits)
    return output


def maybe_warn_marlin_atomic_add(device, dtype):
    if torch.compiler.is_dynamo_compiling():
        return
    device_capability = torch.cuda.get_device_capability(device)
    if device_capability[0] < 9 and dtype == torch.bfloat16:
        logger.info_once(
            "You are running Marlin kernel with bf16 on GPUs before SM90. "
            "You can consider change to fp16 to achieve better performance "
            "if possible.")


def maybe_warn_marlin_atomic_add_env():
    if torch.compiler.is_dynamo_compiling():
        return
    if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
        return
    logger.info_once(
        "Marlin kernel can achieve better performance for small size_n "
        "with experimental use_atomic_add feature. "
        "You can consider set environment variable "
        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible.")


def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
                                 dtype: torch.dtype) -> bool:

    # the performance of atomicAdd is better than global reduce
    # only when m*n is small and k is large
    if n >= 2048 or k < 2048 or device.type != "cuda":
        return False

    # disable atomicAdd reduce by default,
    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
    if not envs.VLLM_MARLIN_USE_ATOMIC_ADD:
        maybe_warn_marlin_atomic_add_env()
        return False

    # sm8x doesn't support atomicAdd + bfloat16 natively
    device_capability = torch.cuda.get_device_capability(device)
    if device_capability[0] < 9 and dtype == torch.bfloat16:
        maybe_warn_marlin_atomic_add(device, dtype)
        return False

    return True


def apply_gptq_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_zp: torch.Tensor,
        g_idx: torch.Tensor,
        g_idx_sort_indices: torch.Tensor,
        workspace: torch.Tensor,
        wtype: ScalarType,
        output_size_per_partition: int,
        input_size_per_partition: int,
        is_k_full: bool,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (output_size_per_partition, )

    use_atomic_add = should_use_atomic_add_reduce(
        m=reshaped_x.size(0), n=output_size_per_partition,
        k=reshaped_x.size(1), device=input.device, dtype=input.dtype)

    output = ops.gptq_marlin_gemm(reshaped_x,
                                  None,
                                  weight,
                                  weight_scale,
                                  None,
                                  weight_zp,
                                  g_idx,
                                  g_idx_sort_indices,
                                  workspace,
                                  wtype,
                                  size_m=reshaped_x.shape[0],
                                  size_n=output_size_per_partition,
                                  size_k=input_size_per_partition,
                                  is_k_full=is_k_full,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce,
                                  is_zp_float=False)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def apply_awq_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_zp: torch.Tensor,
        g_idx: torch.Tensor,
        g_idx_sort_indices: torch.Tensor,
        workspace: torch.Tensor,
        quant_type: ScalarType,
        output_size_per_partition: int,
        input_size_per_partition: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (output_size_per_partition, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
|
| 427 |
+
n=output_size_per_partition,
|
| 428 |
+
k=reshaped_x.size(1),
|
| 429 |
+
device=input.device,
|
| 430 |
+
dtype=input.dtype)
|
| 431 |
+
|
| 432 |
+
output = ops.gptq_marlin_gemm(reshaped_x,
|
| 433 |
+
None,
|
| 434 |
+
weight,
|
| 435 |
+
weight_scale,
|
| 436 |
+
None,
|
| 437 |
+
weight_zp,
|
| 438 |
+
g_idx,
|
| 439 |
+
g_idx_sort_indices,
|
| 440 |
+
workspace,
|
| 441 |
+
quant_type,
|
| 442 |
+
size_m=reshaped_x.shape[0],
|
| 443 |
+
size_n=output_size_per_partition,
|
| 444 |
+
size_k=input_size_per_partition,
|
| 445 |
+
use_atomic_add=use_atomic_add,
|
| 446 |
+
use_fp32_reduce=use_fp32_reduce,
|
| 447 |
+
is_zp_float=False)
|
| 448 |
|
| 449 |
if bias is not None:
|
| 450 |
output.add_(bias) # In-place add
|
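The two apply_*_marlin_linear wrappers above share the same shape bookkeeping: the activation is flattened to a 2-D (m, k) matrix for the GEMM and the result is reshaped back with the last dimension replaced by the output partition size. A minimal pure-PyTorch sketch of that bookkeeping (fake_gemm, batch and seq are illustrative stand-ins, not part of the kernel API):

import torch

def fake_gemm(x2d: torch.Tensor, size_n: int) -> torch.Tensor:
    # Stand-in for ops.gptq_marlin_gemm: just produce an (m, size_n) result.
    return x2d.new_zeros((x2d.shape[0], size_n))

batch, seq, hidden, size_n = 2, 3, 8, 16
x = torch.randn(batch, seq, hidden)

reshaped_x = x.reshape(-1, x.shape[-1])        # (m, k) = (batch * seq, hidden)
out_shape = x.shape[:-1] + (size_n, )          # (batch, seq, size_n)
out = fake_gemm(reshaped_x, size_n).reshape(out_shape)
assert out.shape == (batch, seq, size_n)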
build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py
ADDED
@@ -0,0 +1,282 @@

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

import quantization as ops

from .marlin_utils import (
    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
    should_use_atomic_add_reduce)
from quantization.scalar_type import scalar_types

FP4_MARLIN_SUPPORTED_GROUP_SIZES = [16]


def is_fp4_marlin_supported():
    capability = torch.cuda.get_device_capability()
    capability = capability[0] * 10 + capability[1]
    return capability >= 80


def fp4_marlin_process_scales(marlin_scales):
    if not (marlin_scales >= 0).all():
        logger.warning_once(
            "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
            "negative scales. Accuracy will likely be degraded. This is "
            "because it changes the scales from FP8-S1E4M3 to a special "
            "FP8-S0E5M3 format to speedup the dequantization.")

    # convert to half first, we would convert to fp8 later
    marlin_scales = marlin_scales.to(torch.half)

    # 8 is the number of scale number using by one thread
    marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
    marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
        marlin_scales.size(0) * 2, -1)

    # fit the layout of fp8 dequantization
    marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
        marlin_scales.size(0), -1)

    # We assume that weight_scale (FP8-S1E4M3) is always greater
    # than or equal to 0. So we can convert
    # (weight_scale * (2 ** 7) to a special FP8-S0E5M3 format.
    # After multiplying by 2 ** 7, the top bit of FP8-S0E5M3 would always be 1
    # when weight_scale > 0. This allows us to have an exponent bias
    # closer to zero after dequantization.

    marlin_scales = (marlin_scales * (2**7)).view(torch.int16) << 1
    marlin_scales = marlin_scales.view(torch.float8_e4m3fn)
    marlin_scales = marlin_scales[:, 1::2].contiguous()

    return marlin_scales


def fp4_marlin_process_global_scale(global_scale):
    assert global_scale.dtype in [torch.half, torch.bfloat16]
    fp4_exponent = 2
    if global_scale.dtype == torch.half:
        target_exponent = 5
    elif global_scale.dtype == torch.bfloat16:
        target_exponent = 8
    # exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
    # exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
    exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
    return global_scale * (2.0**(exponent_bias - 7))


def apply_fp4_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_scale_2: torch.Tensor,
        workspace: torch.Tensor,
        size_n: int,
        size_k: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    # For GPUs that lack FP4 hardware support, we can leverage the
    # Marlin kernel for fast weight-only FP4 quantization

    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (size_n, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
                                                  n=size_n,
                                                  k=size_k,
                                                  device=input.device,
                                                  dtype=input.dtype)

    output = ops.gptq_marlin_gemm(a=reshaped_x,
                                  c=None,
                                  b_q_weight=weight,
                                  b_scales=weight_scale,
                                  global_scale=weight_scale_2,
                                  b_zeros=None,
                                  g_idx=None,
                                  perm=None,
                                  workspace=workspace,
                                  b_q_type=scalar_types.float4_e2m1f,
                                  size_m=reshaped_x.size(0),
                                  size_n=size_n,
                                  size_k=size_k,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    part_size_n = layer.output_size_per_partition
    part_size_k = layer.input_size_per_partition
    param_dtype = layer.params_dtype

    assert layer.weight.shape == (part_size_n, part_size_k // 2)

    device = layer.weight.device

    # WORKSPACE
    layer.workspace = marlin_make_workspace_new(device)

    # WEIGHT
    # Repack weights to marlin format
    perm = torch.empty(0, dtype=torch.int, device=device)
    qweight = layer.weight.view(torch.int32).T.contiguous()

    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                            perm=perm,
                                            size_k=part_size_k,
                                            size_n=part_size_n,
                                            num_bits=4)
    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)

    # WEIGHT SCALES
    # Permute scales
    weight_scale = layer.weight_scale.T.to(param_dtype)
    weight_scale = marlin_permute_scales(s=weight_scale,
                                         size_k=part_size_k,
                                         size_n=part_size_n,
                                         group_size=16)
    weight_scale = fp4_marlin_process_scales(weight_scale)
    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)

    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
                                              requires_grad=False)

    return


def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    e = layer.num_experts
    k = layer.hidden_size
    n = layer.intermediate_size_per_partition

    # WORKSPACE
    device = layer.w13_weight.device
    param_dtype = layer.params_dtype
    layer.workspace = marlin_make_workspace_new(device, 4)
    perm = torch.empty(0, dtype=torch.int, device=device)

    # WEIGHT
    # Repack weights to marlin format
    for name in ["w13_weight", "w2_weight"]:
        weight = getattr(layer, name)
        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        assert weight.shape == (e, size_n, size_k // 2)

        for i in range(e):
            qweight = weight[i].view(torch.int32).T.contiguous()

            marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                                    perm=perm,
                                                    size_k=size_k,
                                                    size_n=size_n,
                                                    num_bits=4)
            tensor_list.append(marlin_qweight)

        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        weight = torch.nn.Parameter(weight, requires_grad=False)

        setattr(layer, name, weight)

    # WEIGHT SCALES
    # Permute scales
    for name in ["w13", "w2"]:
        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)

        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        for i in range(e):
            marlin_scales = marlin_permute_scales(s=scales[i].T,
                                                  size_k=size_k,
                                                  size_n=size_n,
                                                  group_size=16)
            marlin_scales = fp4_marlin_process_scales(marlin_scales)
            tensor_list.append(marlin_scales)

        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        scales = torch.nn.Parameter(scales, requires_grad=False)
        setattr(layer, name + "_weight_scale", scales)

        global_scale = fp4_marlin_process_global_scale(global_scale)
        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
        setattr(layer, name + "_weight_scale_2", global_scale)


def rand_marlin_weight_fp4_like(weight, group_size):
    assert group_size > 0
    size_n, size_k = weight.shape
    device = weight.device

    scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
    global_scale = scales.max() / 448
    scales = (scales / global_scale).to(torch.float8_e4m3fn)

    fp4_weight = torch.randint(0,
                               256, (size_n, size_k // 2),
                               dtype=torch.uint8,
                               device=weight.device)
    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
                         ((fp4_weight & 0b01110000) >> 2))
    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)

    fp4_weight2 = fp4_weight << 4
    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
                         ((fp4_weight2 & 0b01110000) >> 2))
    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)

    weight_ref = torch.cat(
        [fp4_weight_part_2.unsqueeze(2),
         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
    weight_ref = weight_ref * global_scale.to(weight.dtype) * \
        scales.repeat_interleave(group_size, 1).to(weight.dtype)

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k,
        size_n=size_n,
        num_bits=4,
    )

    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
                                          size_k=size_k,
                                          size_n=size_n,
                                          group_size=group_size)
    marlin_scales = fp4_marlin_process_scales(marlin_scales)

    global_scale = fp4_marlin_process_global_scale(global_scale)

    return weight_ref.T, marlin_qweight, marlin_scales, global_scale
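The bit manipulation in rand_marlin_weight_fp4_like decodes a packed e2m1 nibble by moving its exponent/mantissa bits into the float8_e4m3fn layout and rescaling by 2**6 to undo the exponent-bias difference. A small, self-contained sketch of the same trick on a single packed byte (the example value is made up for illustration):

import torch

# One byte holding an FP4 (e2m1) value in its high nibble: 0b0111_0000 = +6.0
packed = torch.tensor([0b01110000], dtype=torch.uint8)

# Keep the sign bit, shift the exponent/mantissa bits into the e4m3 position,
# reinterpret as float8_e4m3fn, then correct the exponent bias with 2**6.
decoded = (packed & 0b10000000) | ((packed & 0b01110000) >> 2)
decoded = decoded.view(torch.float8_e4m3fn).to(torch.float32) * (2**6)
print(decoded)  # tensor([6.])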
build/torch26-cxx98-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py
CHANGED
@@ -1,10 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 from typing import Optional

 import torch

 import quantization as ops

-from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+from .marlin_utils import USE_FP32_REDUCE_DEFAULT, marlin_make_workspace, marlin_permute_scales


 def is_fp8_marlin_supported():

@@ -13,88 +16,107 @@ def is_fp8_marlin_supported():
    return capability >= 80


def fp8_fused_exponent_bias_into_scales(scales):
    fp8_exponent = 4
    if scales.dtype == torch.half:
        target_exponent = 5
    elif scales.dtype == torch.bfloat16:
        target_exponent = 8
    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
    s = torch.ones_like(scales) * 2
    s = s**exponent_bias
    return scales * s


def apply_fp8_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        workspace: torch.Tensor,
        size_n: int,
        size_k: int,
        bias: Optional[torch.Tensor],
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    # For GPUs that lack FP8 hardware support, we can leverage the
    # Marlin kernel for fast weight-only FP8 quantization

    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (size_n, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
                                                  n=size_n,
                                                  k=size_k,
                                                  device=input.device,
                                                  dtype=input.dtype)

    output = ops.gptq_marlin_gemm(a=reshaped_x,
                                  c=None,
                                  b_q_weight=weight,
                                  b_scales=weight_scale,
                                  global_scale=None,
                                  b_zeros=None,
                                  g_idx=None,
                                  perm=None,
                                  workspace=workspace,
                                  b_q_type=scalar_types.float8_e4m3fn,
                                  size_m=reshaped_x.size(0),
                                  size_n=size_n,
                                  size_k=size_k,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
                      size_k_first: bool = True) -> torch.Tensor:
    """
    Repack FP8 weights to gptq format (packed int32 elements)
    """
    assert fp8_tensor.dtype == torch.float8_e4m3fn
    assert fp8_tensor.ndim == 2

    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
    fp8_tensor = fp8_tensor.contiguous()
    # fp8_tensor is contiguous and have shape (N, K) now
    # with `.view(torch.int32)`, it become (N, K // 4)
    int32_tensor = fp8_tensor.view(torch.int32)
    return int32_tensor.T.contiguous() if size_k_first else int32_tensor


def marlin_quant_fp8_torch(weight, group_size):
    size_n, size_k = weight.shape
    device = weight.device

    if group_size != -1:
        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
        repeated_scales = scales.repeat_interleave(group_size, 1)
        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
    else:
        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
        repeated_scales = scales.repeat_interleave(size_k, 1)
        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales

    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=packed_weight,
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k,
        size_n=size_n,
        num_bits=8,
    )

    marlin_scales = marlin_permute_scales(s=scales.T,
                                          size_k=size_k,
                                          size_n=size_n,
                                          group_size=group_size)

    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)

    return weight_ref.T, marlin_qweight, marlin_scales
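pack_fp8_to_int32 relies on the fact that four consecutive float8_e4m3fn bytes can be reinterpreted as one int32, so the packing is a metadata-only view rather than a bit-shifting loop. A quick sketch of that reinterpretation (shapes are illustrative):

import torch

k, n = 8, 2
fp8 = torch.randn(k, n).to(torch.float8_e4m3fn)   # (K, N), one byte per element

packed = fp8.T.contiguous().view(torch.int32)     # (N, K // 4): 4 bytes -> 1 int32
assert packed.shape == (n, k // 4)

# The original bytes are recoverable by viewing back.
roundtrip = packed.view(torch.float8_e4m3fn).view(torch.uint8)
assert torch.equal(roundtrip, fp8.T.contiguous().view(torch.uint8))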
build/torch27-cxx11-cu126-aarch64-linux/quantization/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 from .compressed_tensors import scaled_fp8_quant, scaled_int8_quant
 from .cutlass import (
+    cutlass_scaled_mm_supports_block_fp8,
     cutlass_scaled_mm_supports_fp8,
     cutlass_scaled_mm,
     cutlass_scaled_mm_azp,
 )
 from .marlin import (
     awq_marlin_repack,
-    fp8_marlin_gemm,
     gptq_marlin_gemm,
     gptq_marlin_repack,
     gptq_marlin_24_gemm,
@@ -25,8 +25,8 @@ __all__ = [
     "awq_marlin_repack",
     "cutlass_scaled_mm",
     "cutlass_scaled_mm_azp",
+    "cutlass_scaled_mm_supports_block_fp8",
     "cutlass_scaled_mm_supports_fp8",
-    "fp8_marlin_gemm",
     "gptq_marlin_24_gemm",
     "gptq_marlin_gemm",
     "gptq_marlin_repack",
build/torch27-cxx11-cu126-aarch64-linux/quantization/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _quantization_0435ccb
-ops = torch.ops._quantization_0435ccb
+from . import _quantization_9035540
+ops = torch.ops._quantization_9035540

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_quantization_0435ccb::{op_name}"
+    return f"_quantization_9035540::{op_name}"
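add_op_namespace_prefix is what lets the Python wrappers refer to the ops of this particular build of the compiled extension, e.g. when registering fake (meta) implementations. A hedged usage sketch, assuming the package is importable as quantization:

from quantization._ops import add_op_namespace_prefix

# Fully qualified op name for this build of the extension.
print(add_op_namespace_prefix("gptq_marlin_gemm"))
# -> "_quantization_9035540::gptq_marlin_gemm"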
build/torch27-cxx11-cu126-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:219fc94b48e46777769dd2cd61785791b4fd00c58824d6de5252defbf48c30e5
+size 159999608
build/torch27-cxx11-cu126-aarch64-linux/quantization/compressed_tensors.py
CHANGED
@@ -2,17 +2,7 @@ from typing import Optional, Tuple

 import torch

-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
+from ._ops import ops

 # fp8
 def scaled_fp8_quant(

@@ -21,7 +11,8 @@ def scaled_fp8_quant(
     num_token_padding: Optional[int] = None,
     scale_ub: Optional[torch.Tensor] = None,
     use_per_token_if_dynamic: bool = False,
+    output: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP8 and return quantized tensor and scale.

@@ -42,30 +33,36 @@ def scaled_fp8_quant(
         in the dynamic quantization case.

     Returns:
+        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
         scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert input.ndim == 2
-    shape: Union[
-    # if current_platform.is_rocm() else torch.float8_e4m3fn
-    out_dtype = torch.float8_e4m3fn
+    assert (input.ndim == 2)
+    shape: Union[tuple[int, int], torch.Size] = input.shape
+    # For ROCm on MI300, the output fp8 dtype is torch.float_e3m3fnuz
+    out_dtype: torch.dtype = current_platform.fp8_dtype()
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
-    output
+    if output is None:
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    else:
+        assert num_token_padding is None, \
+            "padding not supported if output passed in"
+        assert output.dtype == out_dtype

     if scale is None:
         if use_per_token_if_dynamic:
-            scale = torch.empty((shape[0], 1),
+            scale = torch.empty((shape[0], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            ops.dynamic_per_token_scaled_fp8_quant(
+                output, input.contiguous(), scale, scale_ub)
         else:
             scale = torch.zeros(1, device=input.device, dtype=torch.float32)
             ops.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
-        assert scale.numel() == 1
+        assert (scale.numel() == 1 and num_token_padding is None)
         ops.static_scaled_fp8_quant(output, input, scale)

     return output, scale

@@ -76,8 +73,8 @@ def scaled_int8_quant(
     input: torch.Tensor,
     scale: Optional[torch.Tensor] = None,
     azp: Optional[torch.Tensor] = None,
-    symmetric: bool = True
-) ->
+    symmetric: bool = True
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     """
     Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.

@@ -90,21 +87,25 @@ def scaled_int8_quant(
     symmetric: Whether to use symmetric quantization (scale only, azp ignored).

     Returns:
+        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
     """
     output = torch.empty_like(input, dtype=torch.int8)
     if scale is not None:
         # static-per-tensor quantization.
         assert symmetric == (
-            azp
+            azp
+            is None), "azp must only be provided for asymmetric quantization."
         ops.static_scaled_int8_quant(output, input, scale, azp)
         return output, scale, azp

     # dynamic-per-token quantization.
-    input_scales = torch.empty(
-    input_azp = None if symmetric else torch.empty_like(input_scales,
+    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
+                               device=input.device,
+                               dtype=torch.float32)
+    input_azp = None if symmetric else torch.empty_like(input_scales,
+                                                        dtype=torch.int32)
+    ops.dynamic_scaled_int8_quant(output, input.contiguous(),
+                                  input_scales, input_azp)
     return output, input_scales, input_azp
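For reference, the dynamic per-token path of scaled_int8_quant computes one scale per token (row) from the row-wise absolute maximum before handing the buffers to the kernel. A rough pure-PyTorch equivalent of that reduction, ignoring the asymmetric/azp case and kernel rounding details:

import torch

def int8_quant_per_token_ref(x: torch.Tensor):
    # One scale per token: row absmax mapped onto the int8 range.
    scales = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-10) / 127.0
    q = torch.clamp(torch.round(x / scales), -128, 127).to(torch.int8)
    return q, scales.to(torch.float32)

x = torch.randn(4, 16)
q, scales = int8_quant_per_token_ref(x)
assert q.dtype == torch.int8 and scales.shape == (4, 1)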
build/torch27-cxx11-cu126-aarch64-linux/quantization/cutlass.py
CHANGED
@@ -2,22 +2,18 @@ from typing import Optional

 import torch

-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
+from ._ops import ops
+from .platforms import current_platform


 def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return ops.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
+
+
 def cutlass_scaled_mm(
     a: torch.Tensor,
     b: torch.Tensor,

@@ -33,12 +29,10 @@ def cutlass_scaled_mm(
     m = a.shape[0]
     n = b.shape[1]

-    # triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-    # return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    if not cutlass_compatible_b:
+        from .triton_scaled_mm import triton_scaled_mm
+        return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
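The new Triton fallback in cutlass_scaled_mm only kicks in when the weight matrix is not 16-aligned; the gate is simply a divisibility check on both dimensions of b. A small restatement of that check:

def cutlass_compatible(b_shape: tuple) -> bool:
    # Both dimensions of the weight must be multiples of 16 for the
    # CUTLASS path; otherwise the Triton fallback is used.
    return b_shape[0] % 16 == 0 and b_shape[1] % 16 == 0

assert cutlass_compatible((4096, 11008))
assert not cutlass_compatible((4096, 11000))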
build/torch27-cxx11-cu126-aarch64-linux/quantization/marlin.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 import torch

@@ -30,58 +30,30 @@ except ImportError as e:
 from .scalar_type import ScalarType


-# fp8 marlin
-def fp8_marlin_gemm(
-    a: torch.Tensor,
-    b_q_weight: torch.Tensor,
-    b_scales: torch.Tensor,
-    workspace: torch.Tensor,
-    num_bits: int,
-    size_m: int,
-    size_n: int,
-    size_k: int,
-) -> torch.Tensor:
-    return ops.fp8_marlin_gemm(
-        a, b_q_weight, b_scales, workspace, num_bits, size_m, size_n, size_k
-    )
-
-
 # gptq_marlin
-def gptq_marlin_gemm(
-) -> torch.Tensor:
-    return ops.gptq_marlin_gemm(
-        perm,
-        workspace,
-        b_q_type.id,
-        size_m,
-        size_n,
-        size_k,
-        is_k_full,
-        has_zp,
-        use_fp32_reduce,
-        is_zp_float,
-    )
+def gptq_marlin_gemm(a: torch.Tensor,
+                     c: Optional[torch.Tensor],
+                     b_q_weight: torch.Tensor,
+                     b_scales: torch.Tensor,
+                     global_scale: Optional[torch.Tensor],
+                     b_zeros: Optional[torch.Tensor],
+                     g_idx: Optional[torch.Tensor],
+                     perm: Optional[torch.Tensor],
+                     workspace: torch.Tensor,
+                     b_q_type: ScalarType,
+                     size_m: int,
+                     size_n: int,
+                     size_k: int,
+                     is_k_full: bool = True,
+                     use_atomic_add: bool = False,
+                     use_fp32_reduce: bool = False,
+                     is_zp_float: bool = False) -> torch.Tensor:
+    return ops.gptq_marlin_gemm(a, c, b_q_weight, b_scales,
+                                global_scale, b_zeros, g_idx, perm,
+                                workspace, b_q_type.id, size_m,
+                                size_n, size_k, is_k_full,
+                                use_atomic_add, use_fp32_reduce,
+                                is_zp_float)

 # gptq_marlin
 def gptq_marlin_repack(

@@ -153,14 +125,6 @@ def marlin_qqq_gemm(
 # Fake ops

 if hasattr(ops, "gptq_marlin_24_gemm"):
-    @register_fake(add_op_namespace_prefix("fp8_marlin_gemm"))
-    def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
-                              b_scales: torch.Tensor, workspace: torch.Tensor,
-                              num_bits: int, size_m: torch.SymInt,
-                              size_n: torch.SymInt,
-                              size_k: torch.SymInt) -> torch.Tensor:
-        return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device)
-
     @register_fake(add_op_namespace_prefix("gptq_marlin_24_gemm"))
     def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
                                   b_meta: torch.Tensor, b_scales: torch.Tensor,

@@ -172,20 +136,22 @@ if hasattr(ops, "gptq_marlin_24_gemm"):

     @register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))
     def _gptq_marlin_gemm_fake(a: torch.Tensor,
+                               c: Optional[torch.Tensor],
+                               b_q_weight: torch.Tensor,
+                               b_scales: torch.Tensor,
+                               global_scale: Optional[torch.Tensor],
+                               b_zeros: Optional[torch.Tensor],
+                               g_idx: Optional[torch.Tensor],
+                               perm: Optional[torch.Tensor],
+                               workspace: torch.Tensor,
+                               b_q_type_id: int,
+                               size_m: torch.SymInt,
+                               size_n: torch.SymInt,
+                               size_k: torch.SymInt,
+                               is_k_full: bool = True,
+                               use_atomic_add: bool = False,
+                               use_fp32_reduce: bool = False,
+                               is_zp_float: bool = False) -> torch.Tensor:
         return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

     @register_fake(add_op_namespace_prefix("marlin_qqq_gemm"))
build/torch27-cxx11-cu126-aarch64-linux/quantization/platforms.py
ADDED
@@ -0,0 +1,69 @@

from abc import ABC, abstractmethod
from functools import lru_cache
from typing import NamedTuple

import torch

IS_ROCM = torch.version.hip is not None


class DeviceCapability(NamedTuple):
    major: int
    minor: int

    def as_version_str(self) -> str:
        return f"{self.major}.{self.minor}"

    def to_int(self) -> int:
        """
        Express device capability as an integer ``<major><minor>``.

        It is assumed that the minor version is always a single digit.
        """
        assert 0 <= self.minor < 10
        return self.major * 10 + self.minor


class Platform(ABC):
    simple_compile_backend: str = "inductor"

    @classmethod
    @abstractmethod
    def get_device_name(cls, device_id: int = 0) -> str: ...

    @abstractmethod
    def is_rocm(self): ...


class CudaPlatform(Platform):
    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(0)

    def is_rocm(self):
        return False


class RocmPlatform(Platform):
    @classmethod
    @lru_cache(maxsize=8)
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)

    def is_rocm(self):
        return True


current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
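A hedged usage sketch of the new platforms helper; it assumes a visible GPU and that the package import path is quantization.platforms, and to_int() follows the <major><minor> convention used elsewhere in this package:

import torch
from quantization.platforms import current_platform

if torch.cuda.is_available():
    cap = current_platform.get_device_capability()
    # e.g. on an SM90 part: "... 9.0 90"
    print(current_platform.get_device_name(), cap.as_version_str(), cap.to_int())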
build/torch27-cxx11-cu126-aarch64-linux/quantization/scalar_type.py
CHANGED
@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import functools
 import struct
 from dataclasses import dataclass
 from enum import Enum
 from typing import Optional, Union

+_SCALAR_TYPES_ID_MAP = {}
+

 # Mirrors enum in `core/scalar_type.hpp`
 class NanRepr(Enum):
@@ -121,8 +126,8 @@ class ScalarType:
             min_raw = max_raw | sign_bit_double
             return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
         else:
-            assert (not self.is_signed() or
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"

             if self.is_signed():
                 return -(1 << (self.size_bits - 1))
@@ -156,6 +161,8 @@ class ScalarType:
         assert offset <= 64, \
             f"ScalarType fields too big {offset} to fit into an int64"

+        _SCALAR_TYPES_ID_MAP[val] = self
+
         return val

     @property
@@ -293,6 +300,13 @@ class ScalarType:
         ret.id  # noqa B018: make sure the id is cached
         return ret

+    @classmethod
+    def from_id(cls, scalar_type_id: int):
+        if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
+            raise ValueError(
+                f"scalar_type_id {scalar_type_id} doesn't exists.")
+        return _SCALAR_TYPES_ID_MAP[scalar_type_id]
+

 # naming generally follows: https://github.com/jax-ml/ml_dtypes
 # for floating point types (leading f) the scheme is:
@@ -319,6 +333,9 @@ class scalar_types:
     # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
     float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)

+    # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
+
     # "gptq" types
     uint2b2 = ScalarType.uint(2, 2)
     uint3b4 = ScalarType.uint(3, 4)
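The new _SCALAR_TYPES_ID_MAP makes ScalarType.from_id the inverse of the id property, which is how the Marlin wrappers can pass b_q_type.id across the C++ boundary and recover the type later. A hedged round-trip sketch, assuming the package import path quantization.scalar_type:

from quantization.scalar_type import ScalarType, scalar_types

wtype = scalar_types.float4_e2m1f
# The integer id handed to the kernel maps back to the same scalar type.
assert ScalarType.from_id(wtype.id) == wtype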
build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils.py
CHANGED
@@ -1,4 +1,7 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional

 import numpy
 import torch

@@ -42,7 +45,9 @@ USE_FP32_REDUCE_DEFAULT = True
 # without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
 # TODO: we may want to move this into the C++ so its closer to the actual impl
 def query_marlin_supported_quant_types(
-    has_zp:
+        has_zp: Optional[bool] = None,
+        include_fp_type: bool = True,
+        device_capability: Optional[int] = None,
 ):
     if device_capability is None:
         capability_tuple = torch.cuda.get_device_capability()

@@ -51,137 +56,141 @@ def query_marlin_supported_quant_types(
     if device_capability < 80:
         return []

+    # - has_zp is True: return quant_types that has zero points
+    # - has_zp is False: return quant_types that has not zero points
+    # - has_zp is None: both
+    if has_zp is None:
+        types0 = query_marlin_supported_quant_types(False, include_fp_type,
+                                                    device_capability)
+        types1 = query_marlin_supported_quant_types(True, include_fp_type,
+                                                    device_capability)
+        return types0 + types1
+
     if has_zp:
         # AWQ style, unsigned + runtime zero-point
-        return [scalar_types.uint4
+        return [scalar_types.uint4]
     else:
         # GPTQ style, unsigned + symmetric bias
+        res = [scalar_types.uint4b8, scalar_types.uint8b128]
+        if include_fp_type:
+            res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
+        return res


 def _check_marlin_supported(
-) -> Tuple[bool, Optional[str]]:
+        quant_type: ScalarType,
+        group_size: Optional[int],
+        has_zp: bool,
+        device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:

     if device_capability is None:
         capability_tuple = torch.cuda.get_device_capability()
         device_capability = capability_tuple[0] * 10 + capability_tuple[1]

-    supported_types = query_marlin_supported_quant_types(
+    supported_types = query_marlin_supported_quant_types(
+        has_zp, True, device_capability)

     if quant_type not in supported_types:
-        return (
-            False,
-            f"Marlin does not support group_size = {group_size}. "
-            f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
-            "are supported.",
-        )
+        return (False, f"Marlin does not support weight_bits = {quant_type}. "
+                f"Only types = {supported_types} "
+                f"are supported (for group_size = {group_size}, "
+                f"device_capability = {device_capability}, zp = {has_zp}).")
+    if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES):
+        return (False, f"Marlin does not support group_size = {group_size}. "
+                f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
+                "are supported.")

     return True, None

...

@@ -189,35 +198,34 @@ def marlin_repeat_scales_on_all_ranks(
@@ -247,9 +255,8 @@ def marlin_moe_permute_scales(
@@ -270,9 +277,8 @@ def marlin_zero_points(
@@ -294,9 +300,8 @@ def awq_to_marlin_zero_points(
@@ -304,45 +309,97 @@ def moe_awq_to_marlin_zero_points(
@@ -351,39 +408,43 @@ def apply_gptq_marlin_linear(
|
|
|
|
|
|
| 102 |
|
| 103 |
return True, None
|
| 104 |
|
| 105 |
|
| 106 |
+
def check_marlin_supported(quant_type: ScalarType,
|
| 107 |
+
group_size: int,
|
| 108 |
+
has_zp: bool = False,
|
| 109 |
+
device_capability: Optional[int] = None) -> bool:
|
| 110 |
+
cond, _ = _check_marlin_supported(quant_type, group_size, has_zp,
|
| 111 |
+
device_capability)
|
|
|
|
| 112 |
return cond
|
| 113 |
|
| 114 |
|
| 115 |
+
def verify_marlin_supported(quant_type: ScalarType,
|
| 116 |
+
group_size: int,
|
| 117 |
+
has_zp: bool = False) -> None:
|
| 118 |
cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
|
| 119 |
if not cond:
|
| 120 |
assert err_msg is not None
|
| 121 |
raise ValueError(err_msg)
|
| 122 |
|
| 123 |
|
| 124 |
+
def verify_marlin_supports_shape(output_size_per_partition: int,
|
| 125 |
+
input_size_per_partition: int,
|
| 126 |
+
input_size: int, group_size: int) -> None:
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
# Validate output_size_per_partition
|
| 129 |
if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
|
| 130 |
+
raise ValueError(f"Weight output_size_per_partition = "
|
| 131 |
+
f"{output_size_per_partition} is not divisible by "
|
| 132 |
+
f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
|
| 133 |
+
"Consider reducing tensor_parallel_size or running "
|
| 134 |
+
"with --quantization gptq.")
|
|
|
|
|
|
|
| 135 |
|
| 136 |
# Validate input_size_per_partition
|
| 137 |
if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
|
| 138 |
+
raise ValueError(f"Weight input_size_per_partition = "
|
| 139 |
+
f"{input_size_per_partition} is not divisible "
|
| 140 |
+
f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
|
| 141 |
+
"Consider reducing tensor_parallel_size or running "
|
| 142 |
+
"with --quantization gptq.")
|
| 143 |
+
|
| 144 |
+
if (group_size < input_size
|
| 145 |
+
and input_size_per_partition % group_size != 0):
|
|
|
|
| 146 |
raise ValueError(
|
| 147 |
f"Weight input_size_per_partition = {input_size_per_partition}"
|
| 148 |
+
f" is not divisible by group_size = {group_size}. "
|
| 149 |
"Consider reducing tensor_parallel_size or running "
|
| 150 |
+
"with --quantization gptq.")
|
|
|
|
| 151 |
|
| 152 |
|
| 153 |
+
def check_marlin_supports_shape(output_size_per_partition: int,
|
| 154 |
+
input_size_per_partition: int,
|
| 155 |
+
input_size: int, group_size: int) \
|
| 156 |
+
-> tuple[bool, Optional[str]]:
|
|
|
|
|
|
|
| 157 |
try:
|
| 158 |
+
verify_marlin_supports_shape(output_size_per_partition,
|
| 159 |
+
input_size_per_partition, input_size,
|
| 160 |
+
group_size)
|
| 161 |
except ValueError as e:
|
| 162 |
return False, e.__str__()
|
| 163 |
return True, None
|
| 164 |
|
| 165 |
|
| 166 |
+
def marlin_make_workspace(output_size_per_partition: int,
|
| 167 |
+
device: torch.device) -> torch.Tensor:
|
| 168 |
+
max_workspace_size = (output_size_per_partition //
|
| 169 |
+
GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
|
|
|
|
|
|
|
| 170 |
|
| 171 |
+
return torch.zeros(max_workspace_size,
|
| 172 |
+
dtype=torch.int,
|
| 173 |
+
device=device,
|
| 174 |
+
requires_grad=False)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def marlin_make_workspace_new(device: torch.device,
|
| 178 |
+
max_blocks_per_sm: int = 1) -> torch.Tensor:
|
| 179 |
+
# In the new marlin kernel, we use the num of threadblocks as workspace
|
| 180 |
+
# size. The num of threadblocks is is sms_count * max_blocks_per_sm.
|
| 181 |
+
sms = torch.cuda.get_device_properties(device).multi_processor_count
|
| 182 |
+
return torch.zeros(sms * max_blocks_per_sm,
|
| 183 |
+
dtype=torch.int,
|
| 184 |
+
device=device,
|
| 185 |
+
requires_grad=False)
|
| 186 |
|
| 187 |
|
| 188 |
def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
|
| 189 |
return (not act_order) or (act_order and not is_row_parallel)
|
| 190 |
|
| 191 |
|
| 192 |
+
def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
|
| 193 |
+
is_row_parallel: bool) -> bool:
|
|
|
|
| 194 |
# Need to repeat scales on every rank if act_ordering or
|
| 195 |
# channelwise and RowParallelLinear
|
| 196 |
is_channelwise = group_size == -1
|
|
|
|
| 198 |
|
| 199 |
|
| 200 |
def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
|
| 201 |
+
return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
|
| 202 |
+
requires_grad=False)
|
|
|
|
| 203 |
|
| 204 |
|
| 205 |
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
|
| 206 |
+
return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
|
| 207 |
+
requires_grad=False)
|
|
|
|
| 208 |
|
| 209 |
|
| 210 |
+
def marlin_sort_g_idx(
|
| 211 |
+
g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
| 212 |
g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
|
| 213 |
return g_idx[g_idx_sort_indices], g_idx_sort_indices
|
| 214 |
|
| 215 |
|
| 216 |
def get_scale_perms():
|
| 217 |
+
scale_perm: list[int] = []
|
| 218 |
for i in range(8):
|
| 219 |
scale_perm.extend([i + 8 * j for j in range(8)])
|
| 220 |
+
scale_perm_single: list[int] = []
|
| 221 |
for i in range(4):
|
| 222 |
+
scale_perm_single.extend(
|
| 223 |
+
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
|
| 224 |
return scale_perm, scale_perm_single
|
| 225 |
|
| 226 |
|
| 227 |
+
def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
|
| 228 |
+
group_size: int) -> torch.Tensor:
|
|
|
|
| 229 |
|
| 230 |
scale_perm, scale_perm_single = get_scale_perms()
|
| 231 |
if group_size < size_k and group_size != -1:
|
|
|
|
| 255 |
return output
|
| 256 |
|
| 257 |
|
| 258 |
+
def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
|
| 259 |
+
num_bits: int) -> torch.Tensor:
|
|
|
|
| 260 |
# Permute zero-points in a similar way to scales, but do not use the
|
| 261 |
# "single" permutation, since zero-points are applied on every MMA
|
| 262 |
scale_perm, _ = get_scale_perms()
|
|
|
|
| 277 |
return zp
|
| 278 |
|
| 279 |
|
| 280 |
+
def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
|
| 281 |
+
size_n: int, num_bits: int) -> torch.Tensor:
|
|
|
|
| 282 |
# AWQ zero-points are quantized and packed on the column dim.
|
| 283 |
# In addition, the values are permuted based on dequantizer.
|
| 284 |
# Here we undo both of these, and then apply marlin permutation
|
|
|
|
| 300 |
return marlin_zp
|
| 301 |
|
| 302 |
|
| 303 |
+
def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
|
| 304 |
+
size_n: int, num_bits: int):
|
|
|
|
| 305 |
num_experts = q_zp_packed.shape[0]
|
| 306 |
output = torch.empty(
|
| 307 |
(num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
|
|
|
|
| 309 |
dtype=q_zp_packed.dtype,
|
| 310 |
)
|
| 311 |
for e in range(num_experts):
|
| 312 |
+
output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
|
| 313 |
+
num_bits)
|
| 314 |
return output
|
| 315 |
|
| 316 |
|
| 317 |
+
def maybe_warn_marlin_atomic_add(device, dtype):
|
| 318 |
+
if torch.compiler.is_dynamo_compiling():
|
| 319 |
+
return
|
| 320 |
+
device_capability = torch.cuda.get_device_capability(device)
|
| 321 |
+
if device_capability[0] < 9 and dtype == torch.bfloat16:
|
| 322 |
+
logger.info_once(
|
| 323 |
+
"You are running Marlin kernel with bf16 on GPUs before SM90. "
|
| 324 |
+
"You can consider change to fp16 to achieve better performance "
|
| 325 |
+
"if possible.")
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def maybe_warn_marlin_atomic_add_env():
|
| 329 |
+
if torch.compiler.is_dynamo_compiling():
|
| 330 |
+
return
|
| 331 |
+
if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
|
| 332 |
+
return
|
| 333 |
+
logger.info_once(
|
| 334 |
+
"Marlin kernel can achieve better performance for small size_n "
|
| 335 |
+
"with experimental use_atomic_add feature. "
|
| 336 |
+
"You can consider set environment variable "
|
| 337 |
+
"VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible.")
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
|
| 341 |
+
dtype: torch.dtype) -> bool:
|
| 342 |
+
|
| 343 |
+
# the performance of atomicAdd is better than global reduce
|
| 344 |
+
# only when m*n is small and k is large
|
| 345 |
+
if n >= 2048 or k < 2048 or device.type != "cuda":
|
| 346 |
+
return False
|
| 347 |
+
|
| 348 |
+
# disable atomicAdd reduce by default,
|
| 349 |
+
# one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
|
| 350 |
+
if not envs.VLLM_MARLIN_USE_ATOMIC_ADD:
|
| 351 |
+
maybe_warn_marlin_atomic_add_env()
|
| 352 |
+
return False
|
| 353 |
+
|
| 354 |
+
# sm8x doesn't support atomicAdd + bfloat16 natively
|
| 355 |
+
device_capability = torch.cuda.get_device_capability(device)
|
| 356 |
+
if device_capability[0] < 9 and dtype == torch.bfloat16:
|
| 357 |
+
maybe_warn_marlin_atomic_add(device, dtype)
|
| 358 |
+
return False
|
| 359 |
+
|
| 360 |
+
return True
|
| 361 |
+
|
| 362 |
+
|
| 363 |
def apply_gptq_marlin_linear(
|
| 364 |
+
input: torch.Tensor,
|
| 365 |
+
weight: torch.Tensor,
|
| 366 |
+
weight_scale: torch.Tensor,
|
| 367 |
+
weight_zp: torch.Tensor,
|
| 368 |
+
g_idx: torch.Tensor,
|
| 369 |
+
g_idx_sort_indices: torch.Tensor,
|
| 370 |
+
workspace: torch.Tensor,
|
| 371 |
+
wtype: ScalarType,
|
| 372 |
+
output_size_per_partition: int,
|
| 373 |
+
input_size_per_partition: int,
|
| 374 |
+
is_k_full: bool,
|
| 375 |
+
bias: Optional[torch.Tensor] = None,
|
| 376 |
+
use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
|
|
|
|
| 377 |
reshaped_x = input.reshape(-1, input.shape[-1])
|
| 378 |
+
out_shape = input.shape[:-1] + (output_size_per_partition, )
|
| 379 |
+
|
| 380 |
+
use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
|
| 381 |
+
n=output_size_per_partition,
|
| 382 |
+
k=reshaped_x.size(1),
|
| 383 |
+
device=input.device,
|
| 384 |
+
dtype=input.dtype)
|
| 385 |
+
|
| 386 |
+
output = ops.gptq_marlin_gemm(reshaped_x,
|
| 387 |
+
None,
|
| 388 |
+
weight,
|
| 389 |
+
weight_scale,
|
| 390 |
+
None,
|
| 391 |
+
weight_zp,
|
| 392 |
+
g_idx,
|
| 393 |
+
g_idx_sort_indices,
|
| 394 |
+
workspace,
|
| 395 |
+
wtype,
|
| 396 |
+
size_m=reshaped_x.shape[0],
|
| 397 |
+
size_n=output_size_per_partition,
|
| 398 |
+
size_k=input_size_per_partition,
|
| 399 |
+
is_k_full=is_k_full,
|
| 400 |
+
use_atomic_add=use_atomic_add,
|
| 401 |
+
use_fp32_reduce=use_fp32_reduce,
|
| 402 |
+
is_zp_float=False)
|
| 403 |
|
| 404 |
if bias is not None:
|
| 405 |
output.add_(bias) # In-place add
|
|
|
|
| 408 |
|
| 409 |
|
| 410 |
def apply_awq_marlin_linear(
|
| 411 |
+
input: torch.Tensor,
|
| 412 |
+
weight: torch.Tensor,
|
| 413 |
+
weight_scale: torch.Tensor,
|
| 414 |
+
weight_zp: torch.Tensor,
|
| 415 |
+
g_idx: torch.Tensor,
|
| 416 |
+
g_idx_sort_indices: torch.Tensor,
|
| 417 |
+
workspace: torch.Tensor,
|
| 418 |
+
quant_type: ScalarType,
|
| 419 |
+
output_size_per_partition: int,
|
| 420 |
+
input_size_per_partition: int,
|
| 421 |
+
bias: Optional[torch.Tensor] = None,
|
| 422 |
+
use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
|
|
|
|
| 423 |
reshaped_x = input.reshape(-1, input.shape[-1])
|
| 424 |
+
out_shape = input.shape[:-1] + (output_size_per_partition, )
|
| 425 |
+
|
| 426 |
+
use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
|
| 427 |
+
n=output_size_per_partition,
|
| 428 |
+
k=reshaped_x.size(1),
|
| 429 |
+
device=input.device,
|
| 430 |
+
dtype=input.dtype)
|
| 431 |
+
|
| 432 |
+
output = ops.gptq_marlin_gemm(reshaped_x,
|
| 433 |
+
None,
|
| 434 |
+
weight,
|
| 435 |
+
weight_scale,
|
| 436 |
+
None,
|
| 437 |
+
weight_zp,
|
| 438 |
+
g_idx,
|
| 439 |
+
g_idx_sort_indices,
|
| 440 |
+
workspace,
|
| 441 |
+
quant_type,
|
| 442 |
+
size_m=reshaped_x.shape[0],
|
| 443 |
+
size_n=output_size_per_partition,
|
| 444 |
+
size_k=input_size_per_partition,
|
| 445 |
+
use_atomic_add=use_atomic_add,
|
| 446 |
+
use_fp32_reduce=use_fp32_reduce,
|
| 447 |
+
is_zp_float=False)
|
| 448 |
|
| 449 |
if bias is not None:
|
| 450 |
output.add_(bias) # In-place add
|
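Note: the rewritten marlin_utils module above centralizes type, group-size and shape validation before any Marlin GEMM is launched. The following is a hypothetical usage sketch, not part of the build output; the `quantization.utils.marlin_utils` import path, the CUDA device and the tensor shapes are assumptions made for illustration.

# Hedged sketch, assuming the wheel is importable as `quantization`.
import torch

from quantization.scalar_type import scalar_types
from quantization.utils.marlin_utils import (
    check_marlin_supported, marlin_make_workspace_new, marlin_permute_scales)


def prepare_marlin_inputs(size_k=4096, size_n=4096, group_size=128):
    device = torch.device("cuda")
    # Reject unsupported quant type / group size combinations early.
    assert check_marlin_supported(scalar_types.uint4b8, group_size, has_zp=False)
    # The new-style workspace is one int32 slot per resident threadblock.
    workspace = marlin_make_workspace_new(device)
    # Scales must be permuted into Marlin's tile order before the GEMM.
    scales = torch.rand(size_k // group_size, size_n, dtype=torch.half, device=device)
    return workspace, marlin_permute_scales(scales, size_k, size_n, group_size)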
build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp4.py
ADDED
|
@@ -0,0 +1,282 @@
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
| 3 |
+
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
import quantization as ops
|
| 9 |
+
|
| 10 |
+
from .marlin_utils import (
|
| 11 |
+
USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
|
| 12 |
+
should_use_atomic_add_reduce)
|
| 13 |
+
from quantization.scalar_type import scalar_types
|
| 14 |
+
|
| 15 |
+
FP4_MARLIN_SUPPORTED_GROUP_SIZES = [16]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def is_fp4_marlin_supported():
|
| 19 |
+
capability = torch.cuda.get_device_capability()
|
| 20 |
+
capability = capability[0] * 10 + capability[1]
|
| 21 |
+
return capability >= 80
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def fp4_marlin_process_scales(marlin_scales):
|
| 25 |
+
if not (marlin_scales >= 0).all():
|
| 26 |
+
logger.warning_once(
|
| 27 |
+
"NVFP4 Marlin assumes the scales to be >=0, but has encountered "
|
| 28 |
+
"negative scales. Accuracy will likely be degraded. This is "
|
| 29 |
+
"because it changes the scales from FP8-S1E4M3 to a special "
|
| 30 |
+
"FP8-S0E5M3 format to speedup the dequantization.")
|
| 31 |
+
|
| 32 |
+
# convert to half first, we would convert to fp8 later
|
| 33 |
+
marlin_scales = marlin_scales.to(torch.half)
|
| 34 |
+
|
| 35 |
+
# 8 is the number of scale number using by one thread
|
| 36 |
+
marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
|
| 37 |
+
marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
|
| 38 |
+
marlin_scales.size(0) * 2, -1)
|
| 39 |
+
|
| 40 |
+
# fit the layout of fp8 dequantization
|
| 41 |
+
marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
|
| 42 |
+
marlin_scales.size(0), -1)
|
| 43 |
+
|
| 44 |
+
# We assume that weight_scale (FP8-S1E4M3) is always greater
|
| 45 |
+
# than or equal to 0. So we can convert
|
| 46 |
+
# (weight_scale * (2 ** 7) to a special FP8-S0E5M3 format.
|
| 47 |
+
# After multiplying by 2 ** 7, the top bit of FP8-S0E5M3 would always be 1
|
| 48 |
+
# when weight_scale > 0. This allows us to have an exponent bias
|
| 49 |
+
# closer to zero after dequantization.
|
| 50 |
+
|
| 51 |
+
marlin_scales = (marlin_scales * (2**7)).view(torch.int16) << 1
|
| 52 |
+
marlin_scales = marlin_scales.view(torch.float8_e4m3fn)
|
| 53 |
+
marlin_scales = marlin_scales[:, 1::2].contiguous()
|
| 54 |
+
|
| 55 |
+
return marlin_scales
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def fp4_marlin_process_global_scale(global_scale):
|
| 59 |
+
assert global_scale.dtype in [torch.half, torch.bfloat16]
|
| 60 |
+
fp4_exponent = 2
|
| 61 |
+
if global_scale.dtype == torch.half:
|
| 62 |
+
target_exponent = 5
|
| 63 |
+
elif global_scale.dtype == torch.bfloat16:
|
| 64 |
+
target_exponent = 8
|
| 65 |
+
# exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
|
| 66 |
+
# exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
|
| 67 |
+
exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
|
| 68 |
+
return global_scale * (2.0**(exponent_bias - 7))
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def apply_fp4_marlin_linear(
|
| 72 |
+
input: torch.Tensor,
|
| 73 |
+
weight: torch.Tensor,
|
| 74 |
+
weight_scale: torch.Tensor,
|
| 75 |
+
weight_scale_2: torch.Tensor,
|
| 76 |
+
workspace: torch.Tensor,
|
| 77 |
+
size_n: int,
|
| 78 |
+
size_k: int,
|
| 79 |
+
bias: Optional[torch.Tensor] = None,
|
| 80 |
+
use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
|
| 81 |
+
# For GPUs that lack FP4 hardware support, we can leverage the
|
| 82 |
+
# Marlin kernel for fast weight-only FP4 quantization
|
| 83 |
+
|
| 84 |
+
reshaped_x = input.reshape(-1, input.shape[-1])
|
| 85 |
+
out_shape = input.shape[:-1] + (size_n, )
|
| 86 |
+
|
| 87 |
+
use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
|
| 88 |
+
n=size_n,
|
| 89 |
+
k=size_k,
|
| 90 |
+
device=input.device,
|
| 91 |
+
dtype=input.dtype)
|
| 92 |
+
|
| 93 |
+
output = ops.gptq_marlin_gemm(a=reshaped_x,
|
| 94 |
+
c=None,
|
| 95 |
+
b_q_weight=weight,
|
| 96 |
+
b_scales=weight_scale,
|
| 97 |
+
global_scale=weight_scale_2,
|
| 98 |
+
b_zeros=None,
|
| 99 |
+
g_idx=None,
|
| 100 |
+
perm=None,
|
| 101 |
+
workspace=workspace,
|
| 102 |
+
b_q_type=scalar_types.float4_e2m1f,
|
| 103 |
+
size_m=reshaped_x.size(0),
|
| 104 |
+
size_n=size_n,
|
| 105 |
+
size_k=size_k,
|
| 106 |
+
use_atomic_add=use_atomic_add,
|
| 107 |
+
use_fp32_reduce=use_fp32_reduce)
|
| 108 |
+
|
| 109 |
+
if bias is not None:
|
| 110 |
+
output.add_(bias) # In-place add
|
| 111 |
+
|
| 112 |
+
return output.reshape(out_shape)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
|
| 116 |
+
logger.warning_once(
|
| 117 |
+
"Your GPU does not have native support for FP4 computation but "
|
| 118 |
+
"FP4 quantization is being used. Weight-only FP4 compression will "
|
| 119 |
+
"be used leveraging the Marlin kernel. This may degrade "
|
| 120 |
+
"performance for compute-heavy workloads.")
|
| 121 |
+
|
| 122 |
+
part_size_n = layer.output_size_per_partition
|
| 123 |
+
part_size_k = layer.input_size_per_partition
|
| 124 |
+
param_dtype = layer.params_dtype
|
| 125 |
+
|
| 126 |
+
assert layer.weight.shape == (part_size_n, part_size_k // 2)
|
| 127 |
+
|
| 128 |
+
device = layer.weight.device
|
| 129 |
+
|
| 130 |
+
# WORKSPACE
|
| 131 |
+
layer.workspace = marlin_make_workspace_new(device)
|
| 132 |
+
|
| 133 |
+
# WEIGHT
|
| 134 |
+
# Repack weights to marlin format
|
| 135 |
+
perm = torch.empty(0, dtype=torch.int, device=device)
|
| 136 |
+
qweight = layer.weight.view(torch.int32).T.contiguous()
|
| 137 |
+
|
| 138 |
+
marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
|
| 139 |
+
perm=perm,
|
| 140 |
+
size_k=part_size_k,
|
| 141 |
+
size_n=part_size_n,
|
| 142 |
+
num_bits=4)
|
| 143 |
+
layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
|
| 144 |
+
|
| 145 |
+
# WEIGHT SCALES
|
| 146 |
+
# Permute scales
|
| 147 |
+
weight_scale = layer.weight_scale.T.to(param_dtype)
|
| 148 |
+
weight_scale = marlin_permute_scales(s=weight_scale,
|
| 149 |
+
size_k=part_size_k,
|
| 150 |
+
size_n=part_size_n,
|
| 151 |
+
group_size=16)
|
| 152 |
+
weight_scale = fp4_marlin_process_scales(weight_scale)
|
| 153 |
+
layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
|
| 154 |
+
|
| 155 |
+
weight_scale_2 = layer.weight_scale_2.to(param_dtype)
|
| 156 |
+
weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
|
| 157 |
+
layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
|
| 158 |
+
requires_grad=False)
|
| 159 |
+
|
| 160 |
+
return
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
|
| 164 |
+
logger.warning_once(
|
| 165 |
+
"Your GPU does not have native support for FP4 computation but "
|
| 166 |
+
"FP4 quantization is being used. Weight-only FP4 compression will "
|
| 167 |
+
"be used leveraging the Marlin kernel. This may degrade "
|
| 168 |
+
"performance for compute-heavy workloads.")
|
| 169 |
+
|
| 170 |
+
e = layer.num_experts
|
| 171 |
+
k = layer.hidden_size
|
| 172 |
+
n = layer.intermediate_size_per_partition
|
| 173 |
+
|
| 174 |
+
# WORKSPACE
|
| 175 |
+
device = layer.w13_weight.device
|
| 176 |
+
param_dtype = layer.params_dtype
|
| 177 |
+
layer.workspace = marlin_make_workspace_new(device, 4)
|
| 178 |
+
perm = torch.empty(0, dtype=torch.int, device=device)
|
| 179 |
+
|
| 180 |
+
# WEIGHT
|
| 181 |
+
# Repack weights to marlin format
|
| 182 |
+
for name in ["w13_weight", "w2_weight"]:
|
| 183 |
+
weight = getattr(layer, name)
|
| 184 |
+
tensor_list = []
|
| 185 |
+
if "w13" in name:
|
| 186 |
+
size_n, size_k = n * 2, k
|
| 187 |
+
else:
|
| 188 |
+
size_n, size_k = k, n
|
| 189 |
+
|
| 190 |
+
assert weight.shape == (e, size_n, size_k // 2)
|
| 191 |
+
|
| 192 |
+
for i in range(e):
|
| 193 |
+
qweight = weight[i].view(torch.int32).T.contiguous()
|
| 194 |
+
|
| 195 |
+
marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
|
| 196 |
+
perm=perm,
|
| 197 |
+
size_k=size_k,
|
| 198 |
+
size_n=size_n,
|
| 199 |
+
num_bits=4)
|
| 200 |
+
tensor_list.append(marlin_qweight)
|
| 201 |
+
|
| 202 |
+
weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
|
| 203 |
+
weight = torch.nn.Parameter(weight, requires_grad=False)
|
| 204 |
+
|
| 205 |
+
setattr(layer, name, weight)
|
| 206 |
+
|
| 207 |
+
# WEIGHT SCALES
|
| 208 |
+
# Permute scales
|
| 209 |
+
for name in ["w13", "w2"]:
|
| 210 |
+
scales = getattr(layer, name + "_weight_scale").to(param_dtype)
|
| 211 |
+
global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)
|
| 212 |
+
|
| 213 |
+
tensor_list = []
|
| 214 |
+
if "w13" in name:
|
| 215 |
+
size_n, size_k = n * 2, k
|
| 216 |
+
else:
|
| 217 |
+
size_n, size_k = k, n
|
| 218 |
+
|
| 219 |
+
for i in range(e):
|
| 220 |
+
marlin_scales = marlin_permute_scales(s=scales[i].T,
|
| 221 |
+
size_k=size_k,
|
| 222 |
+
size_n=size_n,
|
| 223 |
+
group_size=16)
|
| 224 |
+
marlin_scales = fp4_marlin_process_scales(marlin_scales)
|
| 225 |
+
tensor_list.append(marlin_scales)
|
| 226 |
+
|
| 227 |
+
scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
|
| 228 |
+
scales = torch.nn.Parameter(scales, requires_grad=False)
|
| 229 |
+
setattr(layer, name + "_weight_scale", scales)
|
| 230 |
+
|
| 231 |
+
global_scale = fp4_marlin_process_global_scale(global_scale)
|
| 232 |
+
global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
|
| 233 |
+
setattr(layer, name + "_weight_scale_2", global_scale)
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def rand_marlin_weight_fp4_like(weight, group_size):
|
| 237 |
+
assert group_size > 0
|
| 238 |
+
size_n, size_k = weight.shape
|
| 239 |
+
device = weight.device
|
| 240 |
+
|
| 241 |
+
scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
|
| 242 |
+
global_scale = scales.max() / 448
|
| 243 |
+
scales = (scales / global_scale).to(torch.float8_e4m3fn)
|
| 244 |
+
|
| 245 |
+
fp4_weight = torch.randint(0,
|
| 246 |
+
256, (size_n, size_k // 2),
|
| 247 |
+
dtype=torch.uint8,
|
| 248 |
+
device=weight.device)
|
| 249 |
+
fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
|
| 250 |
+
((fp4_weight & 0b01110000) >> 2))
|
| 251 |
+
fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
|
| 252 |
+
fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)
|
| 253 |
+
|
| 254 |
+
fp4_weight2 = fp4_weight << 4
|
| 255 |
+
fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
|
| 256 |
+
((fp4_weight2 & 0b01110000) >> 2))
|
| 257 |
+
fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
|
| 258 |
+
fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)
|
| 259 |
+
|
| 260 |
+
weight_ref = torch.cat(
|
| 261 |
+
[fp4_weight_part_2.unsqueeze(2),
|
| 262 |
+
fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
|
| 263 |
+
weight_ref = weight_ref * global_scale.to(weight.dtype) * \
|
| 264 |
+
scales.repeat_interleave(group_size, 1).to(weight.dtype)
|
| 265 |
+
|
| 266 |
+
marlin_qweight = ops.gptq_marlin_repack(
|
| 267 |
+
b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
|
| 268 |
+
perm=torch.empty(0, dtype=torch.int, device=device),
|
| 269 |
+
size_k=size_k,
|
| 270 |
+
size_n=size_n,
|
| 271 |
+
num_bits=4,
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
|
| 275 |
+
size_k=size_k,
|
| 276 |
+
size_n=size_n,
|
| 277 |
+
group_size=group_size)
|
| 278 |
+
marlin_scales = fp4_marlin_process_scales(marlin_scales)
|
| 279 |
+
|
| 280 |
+
global_scale = fp4_marlin_process_global_scale(global_scale)
|
| 281 |
+
|
| 282 |
+
return weight_ref.T, marlin_qweight, marlin_scales, global_scale
|
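The scale handling in marlin_utils_fp4.py above depends on a bit-level re-encoding of non-negative FP8-S1E4M3 scales into the "FP8-S0E5M3" layout described in its comments. The snippet below is a standalone illustration of that transformation with made-up values; it is not part of the generated file.

# Illustration only: mimics the bit trick used by fp4_marlin_process_scales.
import torch

scales = torch.tensor([[0.25, 0.5, 1.0, 2.0]], dtype=torch.half)

# For a non-negative FP16 value, multiplying by 2**7 and shifting the raw
# 16-bit pattern left by one discards the (always-zero) sign bit; the upper
# byte of each element is then the re-encoded scale the kernel dequantizes
# with a cheaper exponent bias.
raw = (scales * (2 ** 7)).view(torch.int16) << 1
fp8_view = raw.view(torch.float8_e4m3fn)   # two fp8 lanes per int16
encoded = fp8_view[:, 1::2].contiguous()   # keep the high byte of each pair
print(encoded.shape, encoded.dtype)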
build/torch27-cxx11-cu126-aarch64-linux/quantization/utils/marlin_utils_fp8.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import Optional
|
| 2 |
|
| 3 |
import torch
|
| 4 |
|
| 5 |
import quantization as ops
|
| 6 |
|
| 7 |
-
from .marlin_utils import marlin_make_workspace, marlin_permute_scales
|
| 8 |
|
| 9 |
|
| 10 |
def is_fp8_marlin_supported():
|
|
@@ -13,88 +16,107 @@ def is_fp8_marlin_supported():
|
|
| 13 |
return capability >= 80
|
| 14 |
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
def apply_fp8_marlin_linear(
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
) -> torch.Tensor:
|
| 25 |
# For GPUs that lack FP8 hardware support, we can leverage the
|
| 26 |
# Marlin kernel for fast weight-only FP8 quantization
|
| 27 |
|
| 28 |
reshaped_x = input.reshape(-1, input.shape[-1])
|
| 29 |
-
out_shape = input.shape[:-1] + (size_n,)
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
if bias is not None:
|
| 43 |
output.add_(bias) # In-place add
|
| 44 |
|
| 45 |
return output.reshape(out_shape)
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
layer: torch.nn.Module, strategy: str = "tensor"
|
| 50 |
-
) -> None:
|
| 51 |
-
part_size_n = layer.output_size_per_partition
|
| 52 |
-
part_size_k = layer.input_size_per_partition
|
| 53 |
-
|
| 54 |
-
device = layer.weight.device
|
| 55 |
-
|
| 56 |
-
# WORKSPACE
|
| 57 |
-
layer.workspace = marlin_make_workspace(part_size_n, device)
|
| 58 |
-
|
| 59 |
-
# WEIGHT
|
| 60 |
-
# Repack weights to marlin format
|
| 61 |
-
marlin_qweight = ops.gptq_marlin_repack(
|
| 62 |
-
b_q_weight=pack_fp8_to_int32(layer.weight),
|
| 63 |
-
perm=torch.empty(0, dtype=torch.int, device=device),
|
| 64 |
-
size_k=part_size_k,
|
| 65 |
-
size_n=part_size_n,
|
| 66 |
-
num_bits=8,
|
| 67 |
-
)
|
| 68 |
-
layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
|
| 69 |
-
|
| 70 |
-
# WEIGHT SCALES
|
| 71 |
-
scales = layer.weight_scale.to(layer.orig_dtype)
|
| 72 |
-
# Permute scales
|
| 73 |
-
marlin_scales = marlin_permute_scales(
|
| 74 |
-
s=scales, size_k=part_size_k, size_n=part_size_n, group_size=-1
|
| 75 |
-
)
|
| 76 |
-
layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
|
| 80 |
"""
|
| 81 |
Repack FP8 weights to gptq format (packed int32 elements)
|
| 82 |
"""
|
| 83 |
assert fp8_tensor.dtype == torch.float8_e4m3fn
|
| 84 |
-
assert fp8_tensor.
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
|
| 93 |
-
packed = (
|
| 94 |
-
byte_tensor[:, 0].to(torch.int32)
|
| 95 |
-
| (byte_tensor[:, 1].to(torch.int32) << 8)
|
| 96 |
-
| (byte_tensor[:, 2].to(torch.int32) << 16)
|
| 97 |
-
| (byte_tensor[:, 3].to(torch.int32) << 24)
|
| 98 |
-
)
|
| 99 |
|
| 100 |
-
return
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
| 3 |
+
|
| 4 |
from typing import Optional
|
| 5 |
|
| 6 |
import torch
|
| 7 |
|
| 8 |
import quantization as ops
|
| 9 |
|
| 10 |
+
from .marlin_utils import USE_FP32_REDUCE_DEFAULT, marlin_make_workspace, marlin_permute_scales
|
| 11 |
|
| 12 |
|
| 13 |
def is_fp8_marlin_supported():
|
|
|
|
| 16 |
return capability >= 80
|
| 17 |
|
| 18 |
|
| 19 |
+
def fp8_fused_exponent_bias_into_scales(scales):
|
| 20 |
+
fp8_exponent = 4
|
| 21 |
+
if scales.dtype == torch.half:
|
| 22 |
+
target_exponent = 5
|
| 23 |
+
elif scales.dtype == torch.bfloat16:
|
| 24 |
+
target_exponent = 8
|
| 25 |
+
# exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
|
| 26 |
+
# exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
|
| 27 |
+
exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
|
| 28 |
+
s = torch.ones_like(scales) * 2
|
| 29 |
+
s = s**exponent_bias
|
| 30 |
+
return scales * s
|
| 31 |
+
|
| 32 |
+
|
| 33 |
def apply_fp8_marlin_linear(
|
| 34 |
+
input: torch.Tensor,
|
| 35 |
+
weight: torch.Tensor,
|
| 36 |
+
weight_scale: torch.Tensor,
|
| 37 |
+
workspace: torch.Tensor,
|
| 38 |
+
size_n: int,
|
| 39 |
+
size_k: int,
|
| 40 |
+
bias: Optional[torch.Tensor],
|
| 41 |
+
use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
|
| 42 |
# For GPUs that lack FP8 hardware support, we can leverage the
|
| 43 |
# Marlin kernel for fast weight-only FP8 quantization
|
| 44 |
|
| 45 |
reshaped_x = input.reshape(-1, input.shape[-1])
|
| 46 |
+
out_shape = input.shape[:-1] + (size_n, )
|
| 47 |
+
|
| 48 |
+
use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
|
| 49 |
+
n=size_n,
|
| 50 |
+
k=size_k,
|
| 51 |
+
device=input.device,
|
| 52 |
+
dtype=input.dtype)
|
| 53 |
+
|
| 54 |
+
output = ops.gptq_marlin_gemm(a=reshaped_x,
|
| 55 |
+
c=None,
|
| 56 |
+
b_q_weight=weight,
|
| 57 |
+
b_scales=weight_scale,
|
| 58 |
+
global_scale=None,
|
| 59 |
+
b_zeros=None,
|
| 60 |
+
g_idx=None,
|
| 61 |
+
perm=None,
|
| 62 |
+
workspace=workspace,
|
| 63 |
+
b_q_type=scalar_types.float8_e4m3fn,
|
| 64 |
+
size_m=reshaped_x.size(0),
|
| 65 |
+
size_n=size_n,
|
| 66 |
+
size_k=size_k,
|
| 67 |
+
use_atomic_add=use_atomic_add,
|
| 68 |
+
use_fp32_reduce=use_fp32_reduce)
|
| 69 |
|
| 70 |
if bias is not None:
|
| 71 |
output.add_(bias) # In-place add
|
| 72 |
|
| 73 |
return output.reshape(out_shape)
|
| 74 |
|
| 75 |
+
def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
|
| 76 |
+
size_k_first: bool = True) -> torch.Tensor:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
"""
|
| 78 |
Repack FP8 weights to gptq format (packed int32 elements)
|
| 79 |
"""
|
| 80 |
assert fp8_tensor.dtype == torch.float8_e4m3fn
|
| 81 |
+
assert fp8_tensor.ndim == 2
|
| 82 |
+
|
| 83 |
+
fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
|
| 84 |
+
fp8_tensor = fp8_tensor.contiguous()
|
| 85 |
+
# fp8_tensor is contiguous and have shape (N, K) now
|
| 86 |
+
# with `.view(torch.int32)`, it become (N, K // 4)
|
| 87 |
+
int32_tensor = fp8_tensor.view(torch.int32)
|
| 88 |
+
return int32_tensor.T.contiguous() if size_k_first else int32_tensor
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def marlin_quant_fp8_torch(weight, group_size):
|
| 92 |
+
size_n, size_k = weight.shape
|
| 93 |
+
device = weight.device
|
| 94 |
+
|
| 95 |
+
if group_size != -1:
|
| 96 |
+
scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
|
| 97 |
+
repeated_scales = scales.repeat_interleave(group_size, 1)
|
| 98 |
+
fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
|
| 99 |
+
weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
|
| 100 |
+
else:
|
| 101 |
+
scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
|
| 102 |
+
repeated_scales = scales.repeat_interleave(size_k, 1)
|
| 103 |
+
fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
|
| 104 |
+
weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
|
| 105 |
+
|
| 106 |
+
packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
|
| 107 |
+
marlin_qweight = ops.gptq_marlin_repack(
|
| 108 |
+
b_q_weight=packed_weight,
|
| 109 |
+
perm=torch.empty(0, dtype=torch.int, device=device),
|
| 110 |
+
size_k=size_k,
|
| 111 |
+
size_n=size_n,
|
| 112 |
+
num_bits=8,
|
| 113 |
+
)
|
| 114 |
|
| 115 |
+
marlin_scales = marlin_permute_scales(s=scales.T,
|
| 116 |
+
size_k=size_k,
|
| 117 |
+
size_n=size_n,
|
| 118 |
+
group_size=group_size)
|
| 119 |
|
| 120 |
+
marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
+
return weight_ref.T, marlin_qweight, marlin_scales
|
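A hedged sketch of the FP8 helpers above; the import path, device and shapes are assumptions for illustration and the call requires the compiled ops to be loadable.

import torch

from quantization.utils.marlin_utils_fp8 import (
    marlin_quant_fp8_torch, pack_fp8_to_int32)

weight = torch.randn(256, 512, dtype=torch.half, device="cuda")

# pack_fp8_to_int32 only reinterprets groups of four FP8 bytes as one int32.
packed = pack_fp8_to_int32(weight.to(torch.float8_e4m3fn), size_k_first=False)
assert packed.shape == (256, 512 // 4)

# marlin_quant_fp8_torch additionally repacks the weight for the Marlin
# kernel and returns permuted, exponent-bias-fused scales.
weight_ref, marlin_qweight, marlin_scales = marlin_quant_fp8_torch(
    weight, group_size=128)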
build/torch27-cxx11-cu128-aarch64-linux/quantization/__init__.py
CHANGED
@@ -1,12 +1,12 @@
from .compressed_tensors import scaled_fp8_quant, scaled_int8_quant
from .cutlass import (
+    cutlass_scaled_mm_supports_block_fp8,
    cutlass_scaled_mm_supports_fp8,
    cutlass_scaled_mm,
    cutlass_scaled_mm_azp,
)
from .marlin import (
    awq_marlin_repack,
-    fp8_marlin_gemm,
    gptq_marlin_gemm,
    gptq_marlin_repack,
    gptq_marlin_24_gemm,
@@ -25,8 +25,8 @@ __all__ = [
    "awq_marlin_repack",
    "cutlass_scaled_mm",
    "cutlass_scaled_mm_azp",
+    "cutlass_scaled_mm_supports_block_fp8",
    "cutlass_scaled_mm_supports_fp8",
-    "fp8_marlin_gemm",
    "gptq_marlin_24_gemm",
    "gptq_marlin_gemm",
    "gptq_marlin_repack",
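With the change above, `fp8_marlin_gemm` is no longer exported and block-FP8 CUTLASS support is queried through the new helper. A hedged sketch of the updated public surface, assuming the wheel is importable as `quantization`:

import torch

from quantization import (
    cutlass_scaled_mm_supports_block_fp8,
    cutlass_scaled_mm_supports_fp8,
    gptq_marlin_gemm,  # FP8 weight-only GEMMs now go through this entry point
)

major, minor = torch.cuda.get_device_capability()
capability = major * 10 + minor
print(cutlass_scaled_mm_supports_fp8(capability),
      cutlass_scaled_mm_supports_block_fp8(capability))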
build/torch27-cxx11-cu128-aarch64-linux/quantization/_ops.py
CHANGED
@@ -1,9 +1,9 @@
import torch
-from . import
-ops = torch.ops.
+from . import _quantization_9035540
+ops = torch.ops._quantization_9035540

def add_op_namespace_prefix(op_name: str):
    """
    Prefix op by namespace.
    """
-    return f"
+    return f"_quantization_9035540::{op_name}"
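The _ops.py change rebinds the op table to the rebuilt shared object `_quantization_9035540`; `add_op_namespace_prefix` exists so fake-op registrations target the same namespace. A minimal sketch of how it is used (the op name is illustrative):

from quantization._ops import ops, add_op_namespace_prefix

qualified = add_op_namespace_prefix("gptq_marlin_repack")
# -> "_quantization_9035540::gptq_marlin_repack", the name under which the
#    compiled kernel was registered by the shared object.
print(qualified)
print(hasattr(ops, "gptq_marlin_repack"))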
build/torch27-cxx11-cu128-aarch64-linux/quantization/{_quantization_0435ccb.abi3.so → _quantization_9035540.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4d670f7d449a8d177ce46784fb4617dcb0edc30f8d8a62305ed1213310256167
+size 296561248
build/torch27-cxx11-cu128-aarch64-linux/quantization/compressed_tensors.py
CHANGED
@@ -2,17 +2,7 @@ from typing import Optional, Tuple

import torch

-
-    from ._ops import ops
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e
-

# fp8
def scaled_fp8_quant(
@@ -21,7 +11,8 @@ def scaled_fp8_quant(
    num_token_padding: Optional[int] = None,
    scale_ub: Optional[torch.Tensor] = None,
    use_per_token_if_dynamic: bool = False,
-
    """
    Quantize input tensor to FP8 and return quantized tensor and scale.

@@ -42,30 +33,36 @@ def scaled_fp8_quant(
        in the dynamic quantization case.

    Returns:
-
        scaling factor.
    """
    # This code assumes batch_dim and num_tokens are flattened
-    assert input.ndim == 2
-    shape: Union[
-    # For
-
-    # if current_platform.is_rocm() else torch.float8_e4m3fn
-    out_dtype = torch.float8_e4m3fn
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
-    output

    if scale is None:
        if use_per_token_if_dynamic:
-            scale = torch.empty((shape[0], 1),
-
        else:
            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
            ops.dynamic_scaled_fp8_quant(output, input, scale)
    else:
        # num_token_padding not implemented for this case
-        assert scale.numel() == 1
        ops.static_scaled_fp8_quant(output, input, scale)

    return output, scale
@@ -76,8 +73,8 @@ def scaled_int8_quant(
    input: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    azp: Optional[torch.Tensor] = None,
-    symmetric: bool = True
-) ->
    """
    Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.

@@ -90,21 +87,25 @@ def scaled_int8_quant(
    symmetric: Whether to use symmetric quantization (scale only, azp ignored).

    Returns:
-
    """
    output = torch.empty_like(input, dtype=torch.int8)
    if scale is not None:
        # static-per-tensor quantization.
        assert symmetric == (
-            azp
-
        ops.static_scaled_int8_quant(output, input, scale, azp)
        return output, scale, azp

    # dynamic-per-token quantization.
-    input_scales = torch.empty(
-
-
-    input_azp = None if symmetric else torch.empty_like(input_scales,
-
    return output, input_scales, input_azp


import torch

+from ._ops import ops

# fp8
def scaled_fp8_quant(
    num_token_padding: Optional[int] = None,
    scale_ub: Optional[torch.Tensor] = None,
    use_per_token_if_dynamic: bool = False,
+    output: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize input tensor to FP8 and return quantized tensor and scale.

        in the dynamic quantization case.

    Returns:
+        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
        scaling factor.
    """
    # This code assumes batch_dim and num_tokens are flattened
+    assert (input.ndim == 2)
+    shape: Union[tuple[int, int], torch.Size] = input.shape
+    # For ROCm on MI300, the output fp8 dtype is torch.float_e3m3fnuz
+    out_dtype: torch.dtype = current_platform.fp8_dtype()
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
+    if output is None:
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    else:
+        assert num_token_padding is None, \
+            "padding not supported if output passed in"
+        assert output.dtype == out_dtype

    if scale is None:
        if use_per_token_if_dynamic:
+            scale = torch.empty((shape[0], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            ops.dynamic_per_token_scaled_fp8_quant(
+                output, input.contiguous(), scale, scale_ub)
        else:
            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
            ops.dynamic_scaled_fp8_quant(output, input, scale)
    else:
        # num_token_padding not implemented for this case
+        assert (scale.numel() == 1 and num_token_padding is None)
        ops.static_scaled_fp8_quant(output, input, scale)

    return output, scale
    input: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    azp: Optional[torch.Tensor] = None,
+    symmetric: bool = True
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """
    Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.

    symmetric: Whether to use symmetric quantization (scale only, azp ignored).

    Returns:
+        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp.
    """
    output = torch.empty_like(input, dtype=torch.int8)
    if scale is not None:
        # static-per-tensor quantization.
        assert symmetric == (
+            azp
+            is None), "azp must only be provided for asymmetric quantization."
        ops.static_scaled_int8_quant(output, input, scale, azp)
        return output, scale, azp

    # dynamic-per-token quantization.
+    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
+                               device=input.device,
+                               dtype=torch.float32)
+    input_azp = None if symmetric else torch.empty_like(input_scales,
+                                                        dtype=torch.int32)
+    ops.dynamic_scaled_int8_quant(output, input.contiguous(),
+                                  input_scales, input_azp)
    return output, input_scales, input_azp
+
|
| 111 |
+
|
build/torch27-cxx11-cu128-aarch64-linux/quantization/cutlass.py
CHANGED
@@ -2,22 +2,18 @@ from typing import Optional

import torch

-
-
-except ImportError as e:
-    # Fallback for local development.
-    try:
-        import _quantization
-
-        ops = torch.ops._quantization
-    except ImportError:
-        raise e


def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
    return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


def cutlass_scaled_mm(
    a: torch.Tensor,
    b: torch.Tensor,
@@ -33,12 +29,10 @@ def cutlass_scaled_mm(
    m = a.shape[0]
    n = b.shape[1]

-
-
-
-
-    # triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-    # return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

    out = torch.empty((m, n), dtype=out_dtype, device=a.device)


import torch

+from ._ops import ops
+from .platforms import current_platform


def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
    return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


+def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
+    return ops.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
+
+
def cutlass_scaled_mm(
    a: torch.Tensor,
    b: torch.Tensor,
    m = a.shape[0]
    n = b.shape[1]

+    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    if not cutlass_compatible_b:
+        from .triton_scaled_mm import triton_scaled_mm
+        return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
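As the diff shows, `cutlass_scaled_mm` now routes to a Triton implementation when the weight shape is not 16-aligned. A hedged call sketch; the fp8 inputs, scale layout and column-major handling of `b` are illustrative assumptions rather than documented requirements.

import torch
from quantization import cutlass_scaled_mm

a = torch.randn(32, 128, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(128, 64, device="cuda").to(torch.float8_e4m3fn)
scale_a = torch.ones(1, device="cuda", dtype=torch.float32)
scale_b = torch.ones(1, device="cuda", dtype=torch.float32)

# b's dimensions are multiples of 16 here, so the CUTLASS path is taken;
# otherwise the call would transparently fall back to triton_scaled_mm.
out = cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.half)
print(out.shape)  # expected (32, 64)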
| 36 |
|
| 37 |
out = torch.empty((m, n), dtype=out_dtype, device=a.device)
|
| 38 |
|
build/torch27-cxx11-cu128-aarch64-linux/quantization/marlin.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING

import torch

@@ -30,58 +30,30 @@ except ImportError as e:
from .scalar_type import ScalarType


-# fp8 marlin
-def fp8_marlin_gemm(
-    a: torch.Tensor,
-    b_q_weight: torch.Tensor,
-    b_scales: torch.Tensor,
-    workspace: torch.Tensor,
-    num_bits: int,
-    size_m: int,
-    size_n: int,
-    size_k: int,
-) -> torch.Tensor:
-    return ops.fp8_marlin_gemm(
-        a, b_q_weight, b_scales, workspace, num_bits, size_m, size_n, size_k
-    )
-
-
# gptq_marlin
-def gptq_marlin_gemm(
-
-) -> torch.Tensor:
-    return ops.gptq_marlin_gemm(
-
-        perm,
-        workspace,
-        b_q_type.id,
-        size_m,
-        size_n,
-        size_k,
-        is_k_full,
-        has_zp,
-        use_fp32_reduce,
-        is_zp_float,
-    )
-

# gptq_marlin
def gptq_marlin_repack(

@@ -153,14 +125,6 @@ def marlin_qqq_gemm(
# Fake ops

if hasattr(ops, "gptq_marlin_24_gemm"):
-    @register_fake(add_op_namespace_prefix("fp8_marlin_gemm"))
-    def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
-                              b_scales: torch.Tensor, workspace: torch.Tensor,
-                              num_bits: int, size_m: torch.SymInt,
-                              size_n: torch.SymInt,
-                              size_k: torch.SymInt) -> torch.Tensor:
-        return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device)
-
    @register_fake(add_op_namespace_prefix("gptq_marlin_24_gemm"))
    def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
                                  b_meta: torch.Tensor, b_scales: torch.Tensor,

@@ -172,20 +136,22 @@ if hasattr(ops, "gptq_marlin_24_gemm"):

    @register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))
    def _gptq_marlin_gemm_fake(a: torch.Tensor,
-
        return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

    @register_fake(add_op_namespace_prefix("marlin_qqq_gemm"))


+from typing import TYPE_CHECKING, Optional

import torch

from .scalar_type import ScalarType


# gptq_marlin
+def gptq_marlin_gemm(a: torch.Tensor,
+                     c: Optional[torch.Tensor],
+                     b_q_weight: torch.Tensor,
+                     b_scales: torch.Tensor,
+                     global_scale: Optional[torch.Tensor],
+                     b_zeros: Optional[torch.Tensor],
+                     g_idx: Optional[torch.Tensor],
+                     perm: Optional[torch.Tensor],
+                     workspace: torch.Tensor,
+                     b_q_type: ScalarType,
+                     size_m: int,
+                     size_n: int,
+                     size_k: int,
+                     is_k_full: bool = True,
+                     use_atomic_add: bool = False,
+                     use_fp32_reduce: bool = False,
+                     is_zp_float: bool = False) -> torch.Tensor:
+    return ops.gptq_marlin_gemm(a, c, b_q_weight, b_scales,
+                                global_scale, b_zeros, g_idx, perm,
+                                workspace, b_q_type.id, size_m,
+                                size_n, size_k, is_k_full,
+                                use_atomic_add, use_fp32_reduce,
+                                is_zp_float)

# gptq_marlin
def gptq_marlin_repack(

# Fake ops

if hasattr(ops, "gptq_marlin_24_gemm"):
    @register_fake(add_op_namespace_prefix("gptq_marlin_24_gemm"))
    def _gptq_marlin_24_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
                                  b_meta: torch.Tensor, b_scales: torch.Tensor,

    @register_fake(add_op_namespace_prefix("gptq_marlin_gemm"))
    def _gptq_marlin_gemm_fake(a: torch.Tensor,
+                               c: Optional[torch.Tensor],
+                               b_q_weight: torch.Tensor,
+                               b_scales: torch.Tensor,
+                               global_scale: Optional[torch.Tensor],
+                               b_zeros: Optional[torch.Tensor],
+                               g_idx: Optional[torch.Tensor],
+                               perm: Optional[torch.Tensor],
+                               workspace: torch.Tensor,
+                               b_q_type_id: int,
+                               size_m: torch.SymInt,
+                               size_n: torch.SymInt,
+                               size_k: torch.SymInt,
+                               is_k_full: bool = True,
+                               use_atomic_add: bool = False,
+                               use_fp32_reduce: bool = False,
+                               is_zp_float: bool = False) -> torch.Tensor:
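Because a fake/meta implementation is registered for `gptq_marlin_gemm`, shape propagation works under torch.compile without launching the kernel. A small hedged sketch of the shape contract the fake op expresses (the helper and values below are illustrative, not part of the package):

import torch

def expected_output_shape(a: torch.Tensor, size_m: int, size_n: int) -> torch.Tensor:
    # Mirrors `return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)`:
    # the output is always (size_m, size_n) in the activation dtype.
    return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)

a = torch.zeros(8, 4096, dtype=torch.half)
print(expected_output_shape(a, a.shape[0], 11008).shape)  # torch.Size([8, 11008])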
+
is_zp_float: bool = False) -> torch.Tensor:
|
| 155 |
return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
|
| 156 |
|
| 157 |
@register_fake(add_op_namespace_prefix("marlin_qqq_gemm"))
|
build/torch27-cxx11-cu128-aarch64-linux/quantization/platforms.py
ADDED
@@ -0,0 +1,69 @@
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from typing import NamedTuple
+
+import torch
+
+IS_ROCM = torch.version.hip is not None
+
+
+class DeviceCapability(NamedTuple):
+    major: int
+    minor: int
+
+    def as_version_str(self) -> str:
+        return f"{self.major}.{self.minor}"
+
+    def to_int(self) -> int:
+        """
+        Express device capability as an integer ``<major><minor>``.
+
+        It is assumed that the minor version is always a single digit.
+        """
+        assert 0 <= self.minor < 10
+        return self.major * 10 + self.minor
+
+
+class Platform(ABC):
+    simple_compile_backend: str = "inductor"
+
+    @classmethod
+    @abstractmethod
+    def get_device_name(cls, device_id: int = 0) -> str: ...
+
+    @abstractmethod
+    def is_rocm(self): ...
+
+
+class CudaPlatform(Platform):
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(0)
+
+    def is_rocm(self):
+        return False
+
+
+class RocmPlatform(Platform):
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(device_id)
+
+    def is_rocm(self):
+        return True
+
+
+current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
|
| 69 |
+
current_platform = RocmPlatform() if IS_ROCM else CudaPlatform()
|
build/torch27-cxx11-cu128-aarch64-linux/quantization/scalar_type.py
CHANGED
@@ -1,9 +1,14 @@
import functools
import struct
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Union


# Mirrors enum in `core/scalar_type.hpp`
class NanRepr(Enum):

@@ -121,8 +126,8 @@ class ScalarType:
            min_raw = max_raw | sign_bit_double
            return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
        else:
-            assert (not self.is_signed() or
-

            if self.is_signed():
                return -(1 << (self.size_bits - 1))

@@ -156,6 +161,8 @@ class ScalarType:
        assert offset <= 64, \
            f"ScalarType fields too big {offset} to fit into an int64"

        return val

    @property

@@ -293,6 +300,13 @@ class ScalarType:
        ret.id  # noqa B018: make sure the id is cached
        return ret

# naming generally follows: https://github.com/jax-ml/ml_dtypes
# for floating point types (leading f) the scheme is:

@@ -319,6 +333,9 @@ class scalar_types:
    # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
    float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)

    # "gptq" types
    uint2b2 = ScalarType.uint(2, 2)
    uint3b4 = ScalarType.uint(3, 4)


+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
import functools
import struct
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Union

+_SCALAR_TYPES_ID_MAP = {}
+

# Mirrors enum in `core/scalar_type.hpp`
class NanRepr(Enum):

            min_raw = max_raw | sign_bit_double
            return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
        else:
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"

            if self.is_signed():
                return -(1 << (self.size_bits - 1))

        assert offset <= 64, \
            f"ScalarType fields too big {offset} to fit into an int64"

+        _SCALAR_TYPES_ID_MAP[val] = self
+
        return val

    @property

        ret.id  # noqa B018: make sure the id is cached
        return ret

+    @classmethod
+    def from_id(cls, scalar_type_id: int):
+        if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
+            raise ValueError(
+                f"scalar_type_id {scalar_type_id} doesn't exists.")
+        return _SCALAR_TYPES_ID_MAP[scalar_type_id]
+

# naming generally follows: https://github.com/jax-ml/ml_dtypes
# for floating point types (leading f) the scheme is:

    # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
    float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)

+    # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
+
| 339 |
# "gptq" types
|
| 340 |
uint2b2 = ScalarType.uint(2, 2)
|
| 341 |
uint3b4 = ScalarType.uint(3, 4)
|
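The _SCALAR_TYPES_ID_MAP added above caches every constructed type under its packed integer id, and from_id is the reverse lookup. An illustrative round trip, assuming the module is importable as quantization.scalar_type:

from quantization.scalar_type import ScalarType, scalar_types

# Each ScalarType packs its fields into a stable int64 id; building the
# class members already caches them in _SCALAR_TYPES_ID_MAP via `ret.id`.
fp4 = scalar_types.float4_e2m1f
type_id = fp4.id

# from_id reverses the packing through the cache, which is handy when only
# the integer id crosses the C++/Python boundary.
assert ScalarType.from_id(type_id) == fp4

# Ids that were never constructed raise ValueError instead of guessing.
try:
    ScalarType.from_id(-1)
except ValueError as err:
    print(err)
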
build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils.py
CHANGED
@@ -1,4 +1,7 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional

 import numpy
 import torch
@@ -42,7 +45,9 @@ USE_FP32_REDUCE_DEFAULT = True
 # without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
 # TODO: we may want to move this into the C++ so its closer to the actual impl
 def query_marlin_supported_quant_types(
+        has_zp: Optional[bool] = None,
+        include_fp_type: bool = True,
+        device_capability: Optional[int] = None,
 ):
     if device_capability is None:
         capability_tuple = torch.cuda.get_device_capability()
@@ -51,137 +56,141 @@ def query_marlin_supported_quant_types(
     if device_capability < 80:
         return []

+    # - has_zp is True: return quant_types that has zero points
+    # - has_zp is False: return quant_types that has not zero points
+    # - has_zp is None: both
+    if has_zp is None:
+        types0 = query_marlin_supported_quant_types(False, include_fp_type,
+                                                    device_capability)
+        types1 = query_marlin_supported_quant_types(True, include_fp_type,
+                                                    device_capability)
+        return types0 + types1
+
     if has_zp:
         # AWQ style, unsigned + runtime zero-point
+        return [scalar_types.uint4]
     else:
         # GPTQ style, unsigned + symmetric bias
+        res = [scalar_types.uint4b8, scalar_types.uint8b128]
+        if include_fp_type:
+            res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
+        return res


 def _check_marlin_supported(
+        quant_type: ScalarType,
+        group_size: Optional[int],
+        has_zp: bool,
+        device_capability: Optional[int] = None) -> tuple[bool, Optional[str]]:

     if device_capability is None:
         capability_tuple = torch.cuda.get_device_capability()
         device_capability = capability_tuple[0] * 10 + capability_tuple[1]

+    supported_types = query_marlin_supported_quant_types(
+        has_zp, True, device_capability)

     if quant_type not in supported_types:
+        return (False, f"Marlin does not support weight_bits = {quant_type}. "
+                f"Only types = {supported_types} "
+                f"are supported (for group_size = {group_size}, "
+                f"device_capability = {device_capability}, zp = {has_zp}).")
+    if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES):
+        return (False, f"Marlin does not support group_size = {group_size}. "
+                f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
+                "are supported.")

     return True, None


+def check_marlin_supported(quant_type: ScalarType,
+                           group_size: int,
+                           has_zp: bool = False,
+                           device_capability: Optional[int] = None) -> bool:
-    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
+    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp,
+                                      device_capability)
     return cond


+def verify_marlin_supported(quant_type: ScalarType,
+                            group_size: int,
+                            has_zp: bool = False) -> None:
     cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
     if not cond:
         assert err_msg is not None
         raise ValueError(err_msg)


+def verify_marlin_supports_shape(output_size_per_partition: int,
+                                 input_size_per_partition: int,
+                                 input_size: int, group_size: int) -> None:

     # Validate output_size_per_partition
     if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
+        raise ValueError(f"Weight output_size_per_partition = "
+                         f"{output_size_per_partition} is not divisible by "
+                         f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
+                         "Consider reducing tensor_parallel_size or running "
+                         "with --quantization gptq.")

     # Validate input_size_per_partition
     if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
+        raise ValueError(f"Weight input_size_per_partition = "
+                         f"{input_size_per_partition} is not divisible "
+                         f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
+                         "Consider reducing tensor_parallel_size or running "
+                         "with --quantization gptq.")
+
-    if group_size < input_size and input_size_per_partition % group_size != 0:
+    if (group_size < input_size
+            and input_size_per_partition % group_size != 0):
         raise ValueError(
             f"Weight input_size_per_partition = {input_size_per_partition}"
+            f" is not divisible by group_size = {group_size}. "
             "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq.")


+def check_marlin_supports_shape(output_size_per_partition: int,
+                                input_size_per_partition: int,
+                                input_size: int, group_size: int) \
+        -> tuple[bool, Optional[str]]:
     try:
+        verify_marlin_supports_shape(output_size_per_partition,
+                                     input_size_per_partition, input_size,
+                                     group_size)
     except ValueError as e:
         return False, e.__str__()
     return True, None


+def marlin_make_workspace(output_size_per_partition: int,
+                          device: torch.device) -> torch.Tensor:
+    max_workspace_size = (output_size_per_partition //
+                          GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL

+    return torch.zeros(max_workspace_size,
+                       dtype=torch.int,
+                       device=device,
+                       requires_grad=False)
+
+
+def marlin_make_workspace_new(device: torch.device,
+                              max_blocks_per_sm: int = 1) -> torch.Tensor:
+    # In the new marlin kernel, we use the num of threadblocks as workspace
+    # size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+    sms = torch.cuda.get_device_properties(device).multi_processor_count
+    return torch.zeros(sms * max_blocks_per_sm,
+                       dtype=torch.int,
+                       device=device,
+                       requires_grad=False)


 def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
     return (not act_order) or (act_order and not is_row_parallel)


+def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
+                                      is_row_parallel: bool) -> bool:
     # Need to repeat scales on every rank if act_ordering or
     # channelwise and RowParallelLinear
     is_channelwise = group_size == -1
@@ -189,35 +198,34 @@ def marlin_repeat_scales_on_all_ranks(


 def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
+                              requires_grad=False)


 def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
+                              requires_grad=False)


+def marlin_sort_g_idx(
+        g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
     return g_idx[g_idx_sort_indices], g_idx_sort_indices


 def get_scale_perms():
+    scale_perm: list[int] = []
     for i in range(8):
         scale_perm.extend([i + 8 * j for j in range(8)])
+    scale_perm_single: list[int] = []
     for i in range(4):
+        scale_perm_single.extend(
+            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
     return scale_perm, scale_perm_single


+def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
+                          group_size: int) -> torch.Tensor:

     scale_perm, scale_perm_single = get_scale_perms()
     if group_size < size_k and group_size != -1:
@@ -247,9 +255,8 @@ def marlin_moe_permute_scales(
     return output


+def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
+                       num_bits: int) -> torch.Tensor:
     # Permute zero-points in a similar way to scales, but do not use the
     # "single" permutation, since zero-points are applied on every MMA
     scale_perm, _ = get_scale_perms()
@@ -270,9 +277,8 @@ def marlin_zero_points(
     return zp


+def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                              size_n: int, num_bits: int) -> torch.Tensor:
     # AWQ zero-points are quantized and packed on the column dim.
     # In addition, the values are permuted based on dequantizer.
     # Here we undo both of these, and then apply marlin permutation
@@ -294,9 +300,8 @@ def awq_to_marlin_zero_points(
     return marlin_zp


+def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                                  size_n: int, num_bits: int):
     num_experts = q_zp_packed.shape[0]
     output = torch.empty(
         (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
@@ -304,45 +309,97 @@ def moe_awq_to_marlin_zero_points(
         dtype=q_zp_packed.dtype,
     )
     for e in range(num_experts):
+        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
+                                              num_bits)
     return output


+def maybe_warn_marlin_atomic_add(device, dtype):
+    if torch.compiler.is_dynamo_compiling():
+        return
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        logger.info_once(
+            "You are running Marlin kernel with bf16 on GPUs before SM90. "
+            "You can consider change to fp16 to achieve better performance "
+            "if possible.")
+
+
+def maybe_warn_marlin_atomic_add_env():
+    if torch.compiler.is_dynamo_compiling():
+        return
+    if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+        return
+    logger.info_once(
+        "Marlin kernel can achieve better performance for small size_n "
+        "with experimental use_atomic_add feature. "
+        "You can consider set environment variable "
+        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible.")
+
+
+def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
+                                 dtype: torch.dtype) -> bool:
+
+    # the performance of atomicAdd is better than global reduce
+    # only when m*n is small and k is large
+    if n >= 2048 or k < 2048 or device.type != "cuda":
+        return False
+
+    # disable atomicAdd reduce by default,
+    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
+    if not envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+        maybe_warn_marlin_atomic_add_env()
+        return False
+
+    # sm8x doesn't support atomicAdd + bfloat16 natively
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        maybe_warn_marlin_atomic_add(device, dtype)
+        return False
+
+    return True
+
+
 def apply_gptq_marlin_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_zp: torch.Tensor,
+        g_idx: torch.Tensor,
+        g_idx_sort_indices: torch.Tensor,
+        workspace: torch.Tensor,
+        wtype: ScalarType,
+        output_size_per_partition: int,
+        input_size_per_partition: int,
+        is_k_full: bool,
+        bias: Optional[torch.Tensor] = None,
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (output_size_per_partition,)
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=output_size_per_partition,
+                                                  k=reshaped_x.size(1),
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  None,
+                                  weight,
+                                  weight_scale,
+                                  None,
+                                  weight_zp,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  wtype,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  is_k_full=is_k_full,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)

     if bias is not None:
         output.add_(bias)  # In-place add
@@ -351,39 +408,43 @@ def apply_gptq_marlin_linear(


 def apply_awq_marlin_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_zp: torch.Tensor,
+        g_idx: torch.Tensor,
+        g_idx_sort_indices: torch.Tensor,
+        workspace: torch.Tensor,
+        quant_type: ScalarType,
+        output_size_per_partition: int,
+        input_size_per_partition: int,
+        bias: Optional[torch.Tensor] = None,
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (output_size_per_partition,)
+    out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=output_size_per_partition,
+                                                  k=reshaped_x.size(1),
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(reshaped_x,
+                                  None,
+                                  weight,
+                                  weight_scale,
+                                  None,
+                                  weight_zp,
+                                  g_idx,
+                                  g_idx_sort_indices,
+                                  workspace,
+                                  quant_type,
+                                  size_m=reshaped_x.shape[0],
+                                  size_n=output_size_per_partition,
+                                  size_k=input_size_per_partition,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce,
+                                  is_zp_float=False)

     if bias is not None:
         output.add_(bias)  # In-place add

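A sketch of how the new helpers above might be exercised together, assuming a CUDA build where the module is importable as quantization.utils.marlin_utils; with VLLM_MARLIN_USE_ATOMIC_ADD unset, the heuristic returns False even for shapes that would otherwise qualify:

import torch

from quantization.utils.marlin_utils import (
    query_marlin_supported_quant_types, should_use_atomic_add_reduce)

# Formats the Marlin path can serve on this GPU; has_zp=None merges the
# zero-point (AWQ-style) and symmetric (GPTQ-style, plus fp8/fp4) lists.
print(query_marlin_supported_quant_types(has_zp=None))

# Atomic-add reduction only pays off for skinny outputs: small m*n with a
# large reduction dimension k, on a CUDA device, and only when the
# VLLM_MARLIN_USE_ATOMIC_ADD environment variable opts in.
use_atomic = should_use_atomic_add_reduce(m=16,
                                          n=1024,
                                          k=8192,
                                          device=torch.device("cuda"),
                                          dtype=torch.float16)
print("atomic-add reduce:", use_atomic)
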
build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils_fp4.py
ADDED
@@ -0,0 +1,282 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

import quantization as ops

from .marlin_utils import (
    USE_FP32_REDUCE_DEFAULT, marlin_make_workspace_new, marlin_permute_scales,
    should_use_atomic_add_reduce)
from quantization.scalar_type import scalar_types

FP4_MARLIN_SUPPORTED_GROUP_SIZES = [16]


def is_fp4_marlin_supported():
    capability = torch.cuda.get_device_capability()
    capability = capability[0] * 10 + capability[1]
    return capability >= 80


def fp4_marlin_process_scales(marlin_scales):
    if not (marlin_scales >= 0).all():
        logger.warning_once(
            "NVFP4 Marlin assumes the scales to be >=0, but has encountered "
            "negative scales. Accuracy will likely be degraded. This is "
            "because it changes the scales from FP8-S1E4M3 to a special "
            "FP8-S0E5M3 format to speedup the dequantization.")

    # convert to half first, we would convert to fp8 later
    marlin_scales = marlin_scales.to(torch.half)

    # 8 is the number of scale number using by one thread
    marlin_scales = marlin_scales.view(marlin_scales.size(0) // 2, 2, -1, 8)
    marlin_scales = marlin_scales.permute(0, 2, 1, 3).reshape(
        marlin_scales.size(0) * 2, -1)

    # fit the layout of fp8 dequantization
    marlin_scales = marlin_scales.view(-1, 4)[:, [0, 2, 1, 3]].view(
        marlin_scales.size(0), -1)

    # We assume that weight_scale (FP8-S1E4M3) is always greater
    # than or equal to 0. So we can convert
    # (weight_scale * (2 ** 7) to a special FP8-S0E5M3 format.
    # After multiplying by 2 ** 7, the top bit of FP8-S0E5M3 would always be 1
    # when weight_scale > 0. This allows us to have an exponent bias
    # closer to zero after dequantization.

    marlin_scales = (marlin_scales * (2**7)).view(torch.int16) << 1
    marlin_scales = marlin_scales.view(torch.float8_e4m3fn)
    marlin_scales = marlin_scales[:, 1::2].contiguous()

    return marlin_scales


def fp4_marlin_process_global_scale(global_scale):
    assert global_scale.dtype in [torch.half, torch.bfloat16]
    fp4_exponent = 2
    if global_scale.dtype == torch.half:
        target_exponent = 5
    elif global_scale.dtype == torch.bfloat16:
        target_exponent = 8
    # exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
    # exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
    exponent_bias = 2**(target_exponent - 1) - 2**(fp4_exponent - 1)
    return global_scale * (2.0**(exponent_bias - 7))


def apply_fp4_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_scale_2: torch.Tensor,
        workspace: torch.Tensor,
        size_n: int,
        size_k: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    # For GPUs that lack FP4 hardware support, we can leverage the
    # Marlin kernel for fast weight-only FP4 quantization

    reshaped_x = input.reshape(-1, input.shape[-1])
    out_shape = input.shape[:-1] + (size_n, )

    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
                                                  n=size_n,
                                                  k=size_k,
                                                  device=input.device,
                                                  dtype=input.dtype)

    output = ops.gptq_marlin_gemm(a=reshaped_x,
                                  c=None,
                                  b_q_weight=weight,
                                  b_scales=weight_scale,
                                  global_scale=weight_scale_2,
                                  b_zeros=None,
                                  g_idx=None,
                                  perm=None,
                                  workspace=workspace,
                                  b_q_type=scalar_types.float4_e2m1f,
                                  size_m=reshaped_x.size(0),
                                  size_n=size_n,
                                  size_k=size_k,
                                  use_atomic_add=use_atomic_add,
                                  use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        output.add_(bias)  # In-place add

    return output.reshape(out_shape)


def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    part_size_n = layer.output_size_per_partition
    part_size_k = layer.input_size_per_partition
    param_dtype = layer.params_dtype

    assert layer.weight.shape == (part_size_n, part_size_k // 2)

    device = layer.weight.device

    # WORKSPACE
    layer.workspace = marlin_make_workspace_new(device)

    # WEIGHT
    # Repack weights to marlin format
    perm = torch.empty(0, dtype=torch.int, device=device)
    qweight = layer.weight.view(torch.int32).T.contiguous()

    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                            perm=perm,
                                            size_k=part_size_k,
                                            size_n=part_size_n,
                                            num_bits=4)
    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)

    # WEIGHT SCALES
    # Permute scales
    weight_scale = layer.weight_scale.T.to(param_dtype)
    weight_scale = marlin_permute_scales(s=weight_scale,
                                         size_k=part_size_k,
                                         size_n=part_size_n,
                                         group_size=16)
    weight_scale = fp4_marlin_process_scales(weight_scale)
    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)

    weight_scale_2 = layer.weight_scale_2.to(param_dtype)
    weight_scale_2 = fp4_marlin_process_global_scale(weight_scale_2)
    layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2,
                                              requires_grad=False)

    return


def prepare_moe_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    e = layer.num_experts
    k = layer.hidden_size
    n = layer.intermediate_size_per_partition

    # WORKSPACE
    device = layer.w13_weight.device
    param_dtype = layer.params_dtype
    layer.workspace = marlin_make_workspace_new(device, 4)
    perm = torch.empty(0, dtype=torch.int, device=device)

    # WEIGHT
    # Repack weights to marlin format
    for name in ["w13_weight", "w2_weight"]:
        weight = getattr(layer, name)
        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        assert weight.shape == (e, size_n, size_k // 2)

        for i in range(e):
            qweight = weight[i].view(torch.int32).T.contiguous()

            marlin_qweight = ops.gptq_marlin_repack(b_q_weight=qweight,
                                                    perm=perm,
                                                    size_k=size_k,
                                                    size_n=size_n,
                                                    num_bits=4)
            tensor_list.append(marlin_qweight)

        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        weight = torch.nn.Parameter(weight, requires_grad=False)

        setattr(layer, name, weight)

    # WEIGHT SCALES
    # Permute scales
    for name in ["w13", "w2"]:
        scales = getattr(layer, name + "_weight_scale").to(param_dtype)
        global_scale = getattr(layer, name + "_weight_scale_2").to(param_dtype)

        tensor_list = []
        if "w13" in name:
            size_n, size_k = n * 2, k
        else:
            size_n, size_k = k, n

        for i in range(e):
            marlin_scales = marlin_permute_scales(s=scales[i].T,
                                                  size_k=size_k,
                                                  size_n=size_n,
                                                  group_size=16)
            marlin_scales = fp4_marlin_process_scales(marlin_scales)
            tensor_list.append(marlin_scales)

        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
        scales = torch.nn.Parameter(scales, requires_grad=False)
        setattr(layer, name + "_weight_scale", scales)

        global_scale = fp4_marlin_process_global_scale(global_scale)
        global_scale = torch.nn.Parameter(global_scale, requires_grad=False)
        setattr(layer, name + "_weight_scale_2", global_scale)


def rand_marlin_weight_fp4_like(weight, group_size):
    assert group_size > 0
    size_n, size_k = weight.shape
    device = weight.device

    scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
    global_scale = scales.max() / 448
    scales = (scales / global_scale).to(torch.float8_e4m3fn)

    fp4_weight = torch.randint(0,
                               256, (size_n, size_k // 2),
                               dtype=torch.uint8,
                               device=weight.device)
    fp4_weight_part_1 = ((fp4_weight & 0b10000000) |
                         ((fp4_weight & 0b01110000) >> 2))
    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)

    fp4_weight2 = fp4_weight << 4
    fp4_weight_part_2 = ((fp4_weight2 & 0b10000000) |
                         ((fp4_weight2 & 0b01110000) >> 2))
    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)

    weight_ref = torch.cat(
        [fp4_weight_part_2.unsqueeze(2),
         fp4_weight_part_1.unsqueeze(2)], 2).view(size_n, size_k)
    weight_ref = weight_ref * global_scale.to(weight.dtype) * \
        scales.repeat_interleave(group_size, 1).to(weight.dtype)

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k,
        size_n=size_n,
        num_bits=4,
    )

    marlin_scales = marlin_permute_scales(s=scales.T.to(weight.dtype),
                                          size_k=size_k,
                                          size_n=size_n,
                                          group_size=group_size)
    marlin_scales = fp4_marlin_process_scales(marlin_scales)

    global_scale = fp4_marlin_process_global_scale(global_scale)

    return weight_ref.T, marlin_qweight, marlin_scales, global_scale

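The scale trick in fp4_marlin_process_scales can be checked with plain tensor ops: for a non-negative fp16 scale, multiplying by 2**7 folds in an exponent bias, and shifting the int16 bit pattern left by one drops the always-zero sign bit so that the high byte of each lane is the "FP8-S0E5M3" encoding. A self-contained numeric sketch (illustrative only; it mirrors the bit manipulation above but uses a uint8 view instead of a float8 view):

import torch

# Non-negative fp16 scales, as the FP4 path assumes.
s = torch.tensor([0.5, 1.0, 3.0], dtype=torch.half)

biased = s * (2**7)                        # fold the 2**7 exponent bias in
lanes = biased.view(torch.int16) << 1      # drop the (always zero) sign bit
high_byte = lanes.view(torch.uint8)[1::2]  # exponent + top mantissa bits

# Undoing the transform recovers the scale (rounded to 3 mantissa bits),
# which is what the kernel's cheap dequantization relies on: restore the
# byte to the high half, undo the shift, reinterpret as fp16, remove bias.
restored_bits = high_byte.to(torch.int16) << 7
print(restored_bits.view(torch.half) / (2**7))  # ~ tensor([0.5, 1.0, 3.0])
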
build/torch27-cxx11-cu128-aarch64-linux/quantization/utils/marlin_utils_fp8.py
CHANGED
@@ -1,10 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 from typing import Optional

 import torch

 import quantization as ops

-from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+from .marlin_utils import USE_FP32_REDUCE_DEFAULT, marlin_make_workspace, marlin_permute_scales


 def is_fp8_marlin_supported():
@@ -13,88 +16,107 @@ def is_fp8_marlin_supported():
     return capability >= 80


+def fp8_fused_exponent_bias_into_scales(scales):
+    fp8_exponent = 4
+    if scales.dtype == torch.half:
+        target_exponent = 5
+    elif scales.dtype == torch.bfloat16:
+        target_exponent = 8
+    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
+    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
+    exponent_bias = 2**(target_exponent - 1) - 2**(fp8_exponent - 1)
+    s = torch.ones_like(scales) * 2
+    s = s**exponent_bias
+    return scales * s
+
+
 def apply_fp8_marlin_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        workspace: torch.Tensor,
+        size_n: int,
+        size_k: int,
+        bias: Optional[torch.Tensor],
+        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
     # For GPUs that lack FP8 hardware support, we can leverage the
     # Marlin kernel for fast weight-only FP8 quantization

     reshaped_x = input.reshape(-1, input.shape[-1])
-    out_shape = input.shape[:-1] + (size_n,)
+    out_shape = input.shape[:-1] + (size_n, )
+
+    use_atomic_add = should_use_atomic_add_reduce(m=reshaped_x.size(0),
+                                                  n=size_n,
+                                                  k=size_k,
+                                                  device=input.device,
+                                                  dtype=input.dtype)
+
+    output = ops.gptq_marlin_gemm(a=reshaped_x,
+                                  c=None,
+                                  b_q_weight=weight,
+                                  b_scales=weight_scale,
+                                  global_scale=None,
+                                  b_zeros=None,
+                                  g_idx=None,
+                                  perm=None,
+                                  workspace=workspace,
+                                  b_q_type=scalar_types.float8_e4m3fn,
+                                  size_m=reshaped_x.size(0),
+                                  size_n=size_n,
+                                  size_k=size_k,
+                                  use_atomic_add=use_atomic_add,
+                                  use_fp32_reduce=use_fp32_reduce)

     if bias is not None:
         output.add_(bias)  # In-place add

     return output.reshape(out_shape)

-        layer: torch.nn.Module, strategy: str = "tensor"
-) -> None:
-    part_size_n = layer.output_size_per_partition
-    part_size_k = layer.input_size_per_partition
-
-    device = layer.weight.device
-
-    # WORKSPACE
-    layer.workspace = marlin_make_workspace(part_size_n, device)
-
-    # WEIGHT
-    # Repack weights to marlin format
-    marlin_qweight = ops.gptq_marlin_repack(
-        b_q_weight=pack_fp8_to_int32(layer.weight),
-        perm=torch.empty(0, dtype=torch.int, device=device),
-        size_k=part_size_k,
-        size_n=part_size_n,
-        num_bits=8,
-    )
-    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
-
-    # WEIGHT SCALES
-    scales = layer.weight_scale.to(layer.orig_dtype)
-    # Permute scales
-    marlin_scales = marlin_permute_scales(
-        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=-1
-    )
-    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
-
-
-def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+def pack_fp8_to_int32(fp8_tensor: torch.Tensor,
+                      size_k_first: bool = True) -> torch.Tensor:
     """
     Repack FP8 weights to gptq format (packed int32 elements)
     """
     assert fp8_tensor.dtype == torch.float8_e4m3fn
-    packed = (
-        byte_tensor[:, 0].to(torch.int32)
-        | (byte_tensor[:, 1].to(torch.int32) << 8)
-        | (byte_tensor[:, 2].to(torch.int32) << 16)
-        | (byte_tensor[:, 3].to(torch.int32) << 24)
-    )
+    assert fp8_tensor.ndim == 2
+
+    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
+    fp8_tensor = fp8_tensor.contiguous()
+    # fp8_tensor is contiguous and have shape (N, K) now
+    # with `.view(torch.int32)`, it become (N, K // 4)
+    int32_tensor = fp8_tensor.view(torch.int32)
+    return int32_tensor.T.contiguous() if size_k_first else int32_tensor
+
+
+def marlin_quant_fp8_torch(weight, group_size):
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    if group_size != -1:
+        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(group_size, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+    else:
+        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(size_k, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+
+    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
+    marlin_qweight = ops.gptq_marlin_repack(
+        b_q_weight=packed_weight,
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=8,
+    )
+
+    marlin_scales = marlin_permute_scales(s=scales.T,
+                                          size_k=size_k,
+                                          size_n=size_n,
+                                          group_size=group_size)
+
+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
+
+    return weight_ref.T, marlin_qweight, marlin_scales