koichi12 committed on Feb 12, 2025

Commit

63c744a

verified ·

1 Parent(s): 96a7806

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +76 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/marlin.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +122 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +135 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +84 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/triton.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +103 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-311.pyc +0 -0

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (4.95 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-311.pyc ADDED Viewed

Binary file (15.6 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-311.pyc ADDED Viewed

Binary file (9.79 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-311.pyc ADDED Viewed

Binary file (21.2 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-311.pyc ADDED Viewed

Binary file (13.8 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-311.pyc ADDED Viewed

Binary file (7.73 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc ADDED Viewed

Binary file (17 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-311.pyc ADDED Viewed

Binary file (11.2 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-311.pyc ADDED Viewed

Binary file (9.87 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-311.pyc ADDED Viewed

Binary file (8.79 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-311.pyc ADDED Viewed

Binary file (29.5 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-311.pyc ADDED Viewed

Binary file (11.8 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-311.pyc ADDED Viewed

Binary file (11.7 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-311.pyc ADDED Viewed

Binary file (24.1 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-311.pyc ADDED Viewed

Binary file (12.5 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-311.pyc ADDED Viewed

Binary file (17.8 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-311.pyc ADDED Viewed

Binary file (12.9 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-311.pyc ADDED Viewed

Binary file (4.81 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-311.pyc ADDED Viewed

Binary file (11.4 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-311.pyc ADDED Viewed

Binary file (9.62 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc ADDED Viewed

Binary file (20.1 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-311.pyc ADDED Viewed

Binary file (4.11 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-311.pyc ADDED Viewed

Binary file (11.7 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-311.pyc ADDED Viewed

Binary file (4.9 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-311.pyc ADDED Viewed

Binary file (7.74 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (220 Bytes). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Callable, Optional, Tuple
+import torch
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.scalar_type import ScalarType
+@dataclass
+class MPLinearLayerConfig:
+    """Description of a quantized linear layer that a kernel is asked to
+    implement (shapes, weight/activation types and quantization options).
+    """
+    # Shape of the complete (unpartitioned) weight.
+    full_weight_shape: Tuple[int, int]  # [in, out]
+    # Shape of the partition handled locally; the kernels in this package
+    # index it as [0] = input features, [1] = output features.
+    partition_weight_shape: Tuple[int, int]
+    # Quantized weight type (kernels read e.g. `size_bits` from it).
+    weight_type: ScalarType
+    # dtype of the activations fed to the layer.
+    act_type: torch.dtype
+    # Quantization group size along the input dimension (presumably the number
+    # of input features sharing one scale -- confirm against the checkpoint
+    # format in use).
+    group_size: int
+    # True when the layer carries explicit zero-point parameters.
+    zero_points: bool
+    # True when the layer carries a group-index (activation reordering)
+    # parameter.
+    has_g_idx: bool
+class MPLinearKernel(ABC):
+    """Abstract base class for mixed-precision linear-layer kernels.
+    Subclasses declare the minimum compute capability they need, report
+    whether they can implement a given ``MPLinearLayerConfig``, repack the
+    layer's parameters after loading and run the matmul in ``apply_weights``.
+    """
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum compute capability required by this kernel."""
+        raise NotImplementedError
+    @classmethod
+    @abstractmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        """Report whether this kernel supports the layer described by ``c``.
+        Returns:
+            A ``(supported, reason)`` pair; ``reason`` explains a refusal.
+        """
+        raise NotImplementedError
+    def __init__(self,
+                 c: MPLinearLayerConfig,
+                 w_q_param_name: str,
+                 w_s_param_name: str,
+                 w_zp_param_name: Optional[str] = None,
+                 w_gidx_param_name: Optional[str] = None) -> None:
+        """Bind the kernel to a layer config and to the layer's parameter names.
+        Args:
+            c: Description of the linear layer to implement.
+            w_q_param_name: Attribute name of the quantized weight.
+            w_s_param_name: Attribute name of the weight scales.
+            w_zp_param_name: Attribute name of the zero points; required when
+              ``c.zero_points`` is set.
+            w_gidx_param_name: Attribute name of the group indices; required
+              when ``c.has_g_idx`` is set.
+        Raises:
+            AssertionError: If the kernel cannot implement ``c`` or a required
+              parameter name is missing.
+        """
+        # `can_implement` returns a (bool, reason) tuple. Asserting on the
+        # tuple itself is always true (a non-empty tuple is truthy), so unpack
+        # it and check the flag, surfacing the reason on failure.
+        can_implement, failure_reason = self.can_implement(c)
+        assert can_implement, failure_reason
+        self.config = c
+        self.w_q_name = w_q_param_name
+        self.w_s_name = w_s_param_name
+        if c.zero_points:
+            assert w_zp_param_name is not None
+        if c.has_g_idx:
+            assert w_gidx_param_name is not None
+        self.w_zp_name = w_zp_param_name
+        self.w_gidx_name = w_gidx_param_name
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Transform the parameters stored on ``layer`` for this kernel."""
+        raise NotImplementedError
+    @abstractmethod
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Apply the quantized linear layer to ``x``, optionally adding ``bias``."""
+        raise NotImplementedError
+    def _transform_param(self, layer: torch.nn.Module, name: Optional[str],
+                         fn: Callable) -> None:
+        """Replace ``layer.<name>`` with ``fn(layer.<name>)``.
+        Does nothing when ``name`` is None or the attribute is missing/None.
+        """
+        if name is not None and getattr(layer, name, None) is not None:
+            old_param = getattr(layer, name)
+            new_param = fn(old_param)
+            # replace the parameter with torch.nn.Parameter for TorchDynamo
+            # compatibility
+            replace_parameter(
+                layer, name,
+                torch.nn.Parameter(new_param.data, requires_grad=False))
+    def _get_weight_params(
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # w_q
+                torch.Tensor,  # w_s
+                Optional[torch.Tensor],  # w_zp,
+                Optional[torch.Tensor]  # w_gidx
+            ]:
+        """Fetch (w_q, w_s, w_zp, w_gidx) from ``layer``.
+        The optional entries are None when no name was configured or the
+        attribute does not exist on the layer.
+        """
+        return (
+            getattr(layer, self.w_q_name),
+            getattr(layer, self.w_s_name),
+            # An unset name falls back to "", which is looked up with a None
+            # default.
+            getattr(layer, self.w_zp_name or "", None),
+            getattr(layer, self.w_gidx_name or "", None),
+        )

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Optional, Type
+import vllm.envs as envs
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import (  # noqa: E501
+    ExllamaLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.machete import (  # noqa: E501
+    MacheteLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.marlin import (  # noqa: E501
+    MarlinLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKernel import (  # noqa: E501
+    MPLinearKernel, MPLinearLayerConfig)
+from vllm.platforms import current_platform
+# in priority/performance order (when available)
+# The list is scanned front to back, so earlier entries are preferred over
+# later ones whenever they are usable.
+_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
+    MacheteLinearKernel,
+    MarlinLinearKernel,
+    ExllamaLinearKernel,
+]
+def choose_mp_linear_kernel(
+        config: MPLinearLayerConfig,
+        compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
+    """
+    Choose an MPLinearKernel that can implement the given config for the given
+     compute capability. Attempts to choose the best kernel in terms of
+     performance.
+    Args:
+        config (MPLinearLayerConfig): Description of the linear layer to be
+          implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+          the target device, if None uses `current_platform` to get the compute
+          capability. Defaults to None.
+    Raises:
+        ValueError: If the compute capability cannot be determined, or if no
+          kernel can implement the given config.
+    Returns:
+        Type[MPLinearKernel]: Chosen kernel.
+    """
+    if compute_capability is None:
+        if current_platform is None:
+            raise ValueError("Cannot determine compute capability")
+        _cc = current_platform.get_device_capability()
+        # The platform may report no capability at all; fail with the same
+        # explicit error instead of a TypeError from indexing None.
+        if _cc is None:
+            raise ValueError("Cannot determine compute capability")
+        # Encode (major, minor) as a single integer, e.g. (8, 0) -> 80.
+        compute_capability = _cc[0] * 10 + _cc[1]
+    failure_reasons = []
+    # Kernels are listed in priority order; return the first usable one and
+    # remember why each earlier candidate was rejected.
+    for kernel in _POSSIBLE_KERNELS:
+        if kernel.__name__ in envs.VLLM_DISABLED_KERNELS:
+            failure_reasons.append(
+                f' {kernel.__name__} disabled by environment variable')
+            continue
+        min_capability = kernel.get_min_capability()
+        if min_capability > compute_capability:
+            failure_reasons.append(
+                f"{kernel.__name__} requires capability "
+                f"{min_capability}, current compute capability "
+                f"is {compute_capability}")
+            continue
+        can_implement, failure_reason = kernel.can_implement(config)
+        if can_implement:
+            return kernel
+        failure_reasons.append(
+            f' {kernel.__name__} cannot implement due to: {failure_reason}'
+        )
+    raise ValueError(
+        "Failed to find a kernel that can implement the "\
+        "WNA16 linear layer. Reasons: \n"
+        + '\n'.join(failure_reasons))

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-311.pyc ADDED Viewed

Binary file (4.87 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (3.53 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-311.pyc ADDED Viewed

Binary file (7.87 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-311.pyc ADDED Viewed

Binary file (7.18 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/marlin.cpython-311.pyc ADDED Viewed

Binary file (6.96 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py ADDED Viewed

	@@ -0,0 +1,142 @@

+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_quantized_values_into_int32)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           permute_param_layout_)
+from vllm.scalar_type import scalar_types
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class ExllamaLinearKernel(MPLinearKernel):
+    """MPLinearKernel implemented on top of `ops.gptq_shuffle` and
+    `ops.gptq_gemm` (the Exllama GPTQ kernels).
+    """
+    SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
+    # In theory supports `scalar_types.uint2b2, scalar_types.uint3b4` too but
+    # currently untested so not added to the list
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum compute capability required by this kernel."""
+        return 60
+    @classmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        """Check whether Exllama supports the layer described by ``c``.
+        Returns:
+            ``(True, None)`` when supported, otherwise ``(False, reason)``.
+        """
+        if c.has_g_idx and\
+            c.partition_weight_shape[0] != c.full_weight_shape[0]:
+            return False, "Act reordering currently not supported by Exllama, "\
+                          "when the input features are partitioned across "\
+                          "devices"
+        if c.partition_weight_shape[1] % (32 // c.weight_type.size_bits) != 0:
+            return False, "Output features must be a multiple of the pack " \
+                            "factor (32 / num_bits) so that we can correctly " \
+                            "pack the zero points"
+        if c.act_type != torch.float16:
+            return False, "Exllama only supports float16 activations"
+        if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
+            return False, f"Quant type ({c.weight_type}) not supported by "\
+                           "Exllama, supported types are: "\
+                           f"{cls.SUPPORTED_QUANT_TYPES}"
+        if c.full_weight_shape[0] % c.group_size != 0:
+            return False, f"Group size ({c.group_size}) does not evenly divide"\
+                           " the number of input features "\
+                           f"({c.full_weight_shape[0]})"
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """Prepare the layer's parameters for the Exllama kernels.
+        Creates a zero-point tensor when the config has none, converts (or
+        creates an empty) g_idx parameter, then shuffles the packed weights
+        and casts the scales to the activation dtype.
+        Raises:
+            NotImplementedError: If there are no zero points and the weight
+              type has no bias.
+        """
+        c = self.config
+        # Resolve the device once up front: it is needed both for the
+        # synthesized zero points and for the empty g_idx below. Previously it
+        # was only bound inside the `not c.zero_points` branch, so a config
+        # with zero points but without g_idx hit a NameError.
+        device = getattr(layer, self.w_q_name).device
+        # For Exllama, we need to set a zero-point tensor if there is not one
+        if not c.zero_points:
+            self.w_zp_name = "qzeros"
+            groups = c.partition_weight_shape[0] // c.group_size
+            out_features = c.partition_weight_shape[1]
+            if c.weight_type.has_bias():
+                # if the type has a bias we have to create a zeros tensor that
+                # contains the bias values repeated for each group (-1 due to
+                # a bug in the original GPTQ checkpoint format leading to
+                # exllama kernel adding 1 to the zero points during inference)
+                # Documentation of the bug can be found here:
+                #  https://garden.danieldk.eu/GPTQ-Checkpoint-Format
+                zeros = torch.full((groups, out_features),
+                                   c.weight_type.bias - 1,
+                                   dtype=torch.int32,
+                                   device=device)
+            else:
+                raise NotImplementedError(
+                    "A 0 zero-point is not supported by Exllama due to "
+                    "a bug in the original GPTQ checkpoint format leading to "
+                    "exllama kernel adding 1 to the zero points during "
+                    "inference")
+            zeros = pack_quantized_values_into_int32(zeros,
+                                                     c.weight_type,
+                                                     packed_dim=1)
+            setattr(layer, self.w_zp_name,
+                    torch.nn.Parameter(zeros, requires_grad=False))
+        if c.has_g_idx:
+            def transform_w_g_idx(x):
+                # Exllama wants the permutation array instead of the group
+                # indices
+                return torch.argsort(x).to(torch.int)
+            self._transform_param(layer, self.w_gidx_name, transform_w_g_idx)
+        else:
+            # No act reordering: register an empty g_idx so the kernel calls
+            # below always have a tensor to pass.
+            self.w_gidx_name = "g_idx"
+            empty_g_idx = torch.nn.Parameter(torch.empty((0, ),
+                                                         dtype=torch.int,
+                                                         device=device),
+                                             requires_grad=False)
+            setattr(layer, self.w_gidx_name, empty_g_idx)
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            assert self.w_gidx_name is not None
+            # Must run after the g_idx handling above: the shuffle consumes
+            # the already-transformed g_idx stored on the layer.
+            g_idx = getattr(layer, self.w_gidx_name)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            x_cont = x.data.contiguous()
+            ops.gptq_shuffle(x_cont, g_idx, c.weight_type.size_bits)
+            return x_cont
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = x.data.contiguous()
+            return x.to(dtype=c.act_type)
+        # Repack weights and scales for Exllama
+        self._transform_param(layer, self.w_q_name, transform_w_q)
+        self._transform_param(layer, self.w_s_name, transform_w_s)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Run the quantized matmul on ``x`` and add ``bias`` if given.
+        The input is flattened to 2D for the kernel and the result is reshaped
+        to ``x.shape[:-1] + (out_features,)``.
+        """
+        c = self.config
+        x_2d = x.reshape(-1, x.shape[-1])
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)
+        assert w_zp is not None, "Zero points are required by Exllama"
+        assert w_g_idx is not None, "Group index is required by Exllama"
+        output = ops.gptq_gemm(x_2d, w_q, w_zp, w_s, w_g_idx, True,
+                               c.weight_type.size_bits)
+        if bias is not None:
+            output.add_(bias)
+        return output.reshape(out_shape)

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py ADDED Viewed

	@@ -0,0 +1,122 @@

+# SPDX-License-Identifier: Apache-2.0
+from functools import partial
+from typing import Optional, Tuple
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.machete_utils import (
+    MACHETE_SUPPORTED_GROUP_SIZES, check_machete_supports_shape,
+    query_machete_supported_quant_types)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_quantized_values_into_int32, unpack_quantized_values_into_int32)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           permute_param_layout_)
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class MacheteLinearKernel(MPLinearKernel):
+    """MPLinearKernel implemented on top of `ops.machete_prepack_B` and
+    `ops.machete_mm`.
+    """
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum compute capability required by this kernel."""
+        return 90
+    @classmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        """Check whether Machete supports the layer described by ``c``.
+        Returns a ``(supported, reason)`` pair; the final shape check is
+        delegated to `check_machete_supports_shape`.
+        """
+        if c.has_g_idx and\
+            c.partition_weight_shape[0] != c.full_weight_shape[0]:
+            return False, "Act reordering currently not supported by Machete, "\
+                          "when the input features are partitioned across "\
+                          "devices"
+        if c.zero_points:
+            return False, "Zero points currently not supported by "\
+                          " Compressed Tensors + Machete. (Kernel supports it"\
+                          " but CompressedTensorsWNA16 does not so support has"\
+                          " not been added to MacheteWNA16Kernel yet"
+        if c.weight_type not in query_machete_supported_quant_types(
+                c.zero_points):
+            return False, f"Quant type ({c.weight_type}) not supported by "\
+                           "Machete, supported types are: "\
+                           f"{query_machete_supported_quant_types(c.zero_points)}"
+        if c.group_size not in MACHETE_SUPPORTED_GROUP_SIZES:
+            return False, f"Group size ({c.group_size}) not supported by "\
+                            "Machete, supported group sizes are: "\
+                            f"{MACHETE_SUPPORTED_GROUP_SIZES}"
+        return check_machete_supports_shape(c.partition_weight_shape[0],
+                                            c.partition_weight_shape[1])
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """Repack the layer's weights and scales for Machete.
+        When the config has a g_idx, also builds ``self.act_perm``, the
+        column permutation that `apply_weights` applies to the activations.
+        """
+        c = self.config
+        if c.has_g_idx:
+            assert self.w_gidx_name is not None
+            # Sorting the group indices yields the permutation applied to the
+            # weight rows below and to the activation columns at runtime.
+            perm = torch.argsort(getattr(layer, self.w_gidx_name))\
+                .to(torch.int)
+            # Default: permute activation columns with plain indexing.
+            self.act_perm = lambda x: x[:, perm]
+            # use `ops.permute_cols` if possible
+            if c.act_type in [torch.float16, torch.bfloat16] \
+                and c.partition_weight_shape[0] % 8 == 0:
+                self.act_perm = partial(ops.permute_cols, perm=perm)
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            if c.has_g_idx:
+                # `perm` is only bound when c.has_g_idx is set (see above).
+                # Unpack, reorder the rows with it, then pack again.
+                x_unpacked = unpack_quantized_values_into_int32(x.data,
+                                                                c.weight_type,
+                                                                packed_dim=0)
+                x_perm = x_unpacked[perm, :]
+                x.data = pack_quantized_values_into_int32(x_perm,
+                                                          c.weight_type,
+                                                          packed_dim=0)
+            # `.t().contiguous().t()` keeps the logical shape but changes the
+            # memory layout (presumably column-major, as the prepack op
+            # expects -- confirm against machete_prepack_B).
+            x.data = ops.machete_prepack_B(x.data.t().contiguous().t(),
+                                           a_type=c.act_type,
+                                           b_type=c.weight_type,
+                                           group_scales_type=c.act_type)
+            return x
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = x.data.contiguous()
+            return x
+        # Repack weights and scales for Machete
+        self._transform_param(layer, self.w_q_name, transform_w_q)
+        self._transform_param(layer, self.w_s_name, transform_w_s)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Run the quantized matmul on ``x`` and add ``bias`` if given.
+        The input is flattened to 2D for the kernel and the result is reshaped
+        to ``x.shape[:-1] + (out_features,)``.
+        """
+        c = self.config
+        # Zero points and g_idx are not passed to the matmul; only the packed
+        # weights and scales are used.
+        w_q, w_s, _, _ = self._get_weight_params(layer)
+        x_2d = x.reshape(-1, x.shape[-1])
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        if c.has_g_idx:
+            # Apply the permutation built in process_weights_after_loading.
+            x_2d = self.act_perm(x_2d)
+        output = ops.machete_mm(a=x_2d,
+                                b_q=w_q,
+                                b_type=c.weight_type,
+                                b_group_zeros=None,
+                                b_group_scales=w_s,
+                                b_group_size=c.group_size)
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output.reshape(out_shape)

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py ADDED Viewed

	@@ -0,0 +1,135 @@

+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear,
+    check_marlin_supports_shape, marlin_is_k_full, marlin_make_empty_g_idx,
+    marlin_make_workspace, marlin_permute_scales, marlin_sort_g_idx,
+    query_marlin_supported_quant_types)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           permute_param_layout_)
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class MarlinLinearKernel(MPLinearKernel):
+    """Mixed-precision linear kernel that runs on the Marlin GPTQ kernels.
+    Weights and scales are converted to the Marlin layout once in
+    `process_weights_after_loading`; `apply_weights` then dispatches to
+    `apply_gptq_marlin_linear`.
+    """
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability required by this kernel."""
+        # NOTE(review): presumably encoded as major * 10 + minor (i.e. 8.0)
+        # -- confirm against the mixed-precision kernel selection code.
+        return 80
+    @classmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        """Check whether Marlin can implement the layer described by `c`.
+        Returns:
+            A `(supported, reason)` pair. When one of the explicit checks
+            below fails, `supported` is False and `reason` is a
+            human-readable message. If they all pass, the result of
+            `check_marlin_supports_shape` is returned unchanged.
+        """
+        if c.zero_points:
+            return False, "Zero points currently not supported by "\
+                          " MarlinLinearKernel. Will be added when AWQMarlin "\
+                          "is migrated over to using MPLinearKernel backend"
+        # `c.zero_points` is always falsy here because of the early return
+        # above.
+        quant_types = query_marlin_supported_quant_types(c.zero_points)
+        if c.weight_type not in quant_types:
+            return False, f"Quant type ({c.weight_type}) not supported by"\
+                          f"  Marlin, supported types are: {quant_types}"
+        if c.group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
+            return False, f"Group size ({c.group_size}) not supported by "\
+                            "Marlin, supported group sizes are: "\
+                            f"{MARLIN_SUPPORTED_GROUP_SIZES}"
+        return check_marlin_supports_shape(
+            c.partition_weight_shape[1],  # out_features
+            c.partition_weight_shape[0],  # in_features
+            c.full_weight_shape[0],  # in_features
+            c.group_size)
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale` is: {input_dim = 0, output_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Convert the loaded parameters on `layer` into the Marlin layout.
+        Sets `self.is_k_full` and `self.workspace`, makes sure the g_idx and
+        zero-point attributes exist on `layer` (empty placeholders when
+        unused), stores `layer.g_idx_sort_indices`, and finally repacks the
+        quantized weight and permutes the scales.
+        """
+        device = getattr(layer, self.w_q_name).device
+        c = self.config
+        # The layer is treated as row-parallel when this partition's
+        # in_features differs from the full layer's in_features.
+        row_parallel = (c.partition_weight_shape[0] != c.full_weight_shape[0])
+        self.is_k_full = marlin_is_k_full(c.has_g_idx, row_parallel)
+        # Allocate marlin workspace.
+        self.workspace = marlin_make_workspace(c.partition_weight_shape[1],
+                                               device)
+        # Default names since marlin requires empty parameters for these,
+        # TODO: remove this requirement from marlin (allow optional tensors)
+        if self.w_gidx_name is None:
+            self.w_gidx_name = "g_idx"
+        if self.w_zp_name is None:
+            self.w_zp_name = "w_zp"
+        if c.has_g_idx:
+            # Replace the loaded g_idx with the one returned by
+            # `marlin_sort_g_idx` and keep the accompanying sort indices;
+            # the weight repack below reads them as its `perm` argument.
+            g_idx, g_idx_sort_indices = marlin_sort_g_idx(
+                getattr(layer, self.w_gidx_name))
+            self._transform_param(layer, self.w_gidx_name, lambda _: g_idx)
+            layer.g_idx_sort_indices = g_idx_sort_indices
+        else:
+            # No activation reordering: install empty placeholders.
+            setattr(layer, self.w_gidx_name, marlin_make_empty_g_idx(device))
+            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+        if c.zero_points:
+            pass
+            # TODO (lucas): add the following when AWQMarlin is migrated over to
+            #       using MPLinearKernel backend
+            # self._transform_param(layer, self.w_zp_name, lambda x: \
+            #     marlin_zero_points(
+            #         x,
+            #         size_k=c.partition_weight_shape[0],
+            #         size_n=c.partition_weight_shape[1],
+            #         num_bits=c.weight_type.size_bits))
+        else:
+            # The empty g_idx helper is reused to create an empty zero-point
+            # placeholder tensor.
+            setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))
+        def transform_w_q(x):
+            # Normalize the packed weight's layout, then repack its data for
+            # Marlin. Must run after `layer.g_idx_sort_indices` is set above.
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            x.data = ops.gptq_marlin_repack(x.data.contiguous(),
+                                            perm=layer.g_idx_sort_indices,
+                                            size_k=c.partition_weight_shape[0],
+                                            size_n=c.partition_weight_shape[1],
+                                            num_bits=c.weight_type.size_bits)
+            return x
+        def transform_w_s(x):
+            # Normalize the scale's layout, then permute its data for Marlin.
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = marlin_permute_scales(x.data.contiguous(),
+                                           size_k=c.partition_weight_shape[0],
+                                           size_n=c.partition_weight_shape[1],
+                                           group_size=c.group_size)
+            return x
+        self._transform_param(layer, self.w_q_name, transform_w_q)
+        self._transform_param(layer, self.w_s_name, transform_w_s)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Run the quantized linear layer on `x` via
+        `apply_gptq_marlin_linear`, using the parameters stored on `layer`
+        and the workspace / `is_k_full` flag prepared in
+        `process_weights_after_loading`.
+        """
+        c = self.config
+        w_q, w_s, w_zp, w_gidx = self._get_weight_params(layer)
+        # `process_weights_after_loading` will ensure w_zp and w_gidx are not
+        #  None for marlin
+        return apply_gptq_marlin_linear(
+            input=x,
+            weight=w_q,
+            weight_scale=w_s,
+            weight_zp=w_zp,  # type: ignore
+            g_idx=w_gidx,  # type: ignore
+            g_idx_sort_indices=layer.g_idx_sort_indices,
+            workspace=self.workspace,
+            wtype=c.weight_type,
+            input_size_per_partition=c.partition_weight_shape[0],
+            output_size_per_partition=c.partition_weight_shape[1],
+            is_k_full=self.is_k_full,
+            bias=bias)

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional, Tuple
+import torch
+# Describes the quantization scheme of a scaled-matmul linear layer; the
+# kernels' `can_implement` checks and weight post-processing branch on these
+# flags.
+@dataclass
+class ScaledMMLinearLayerConfig:
+    # True when weight scales are per-channel. When False, the Cutlass kernel
+    # expands the scales of fused modules to the per-channel case.
+    is_channelwise: bool
+    # True for static activation quantization, where an input scale is read
+    # from the layer; otherwise kernels set the input scale to None.
+    is_static_input_scheme: bool
+    # True for symmetric activation quantization. When False, the Cutlass
+    # kernel computes an AZP adjustment term; in the static case it also
+    # rebuilds the input zero point.
+    input_symmetric: bool
+class ScaledMMLinearKernel(ABC):
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError
+    @classmethod
+    @abstractmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        raise NotImplementedError
+    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
+                 w_s_param_name: str, i_s_param_name: str,
+                 i_zp_param_name: str, azp_adj_param_name: str) -> None:
+        assert self.can_implement(c)
+        self.config = c
+        self.w_q_name = w_q_param_name
+        self.w_s_name = w_s_param_name
+        self.i_s_name = i_s_param_name
+        self.i_zp_name = i_zp_param_name
+        self.azp_adj_name = azp_adj_param_name
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        raise NotImplementedError
+    @abstractmethod
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        raise NotImplementedError
+    def _get_weight_params(
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # weight
+                torch.Tensor,  # weight_scale
+                Optional[torch.Tensor],  # input_scale,
+                Optional[torch.Tensor],  # input_zp
+                Optional[torch.Tensor],  # azp_adj
+            ]:
+        return (
+            getattr(layer, self.w_q_name),
+            getattr(layer, self.w_s_name),
+            getattr(layer, self.i_s_name),
+            getattr(layer, self.i_zp_name),
+            getattr(layer, self.azp_adj_name),
+        )

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import Dict, List, Optional, Type
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
+    CutlassScaledMMLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
+    ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
+    TritonScaledMMLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
+    XLAScaledMMLinearKernel)
+from vllm.platforms import PlatformEnum, current_platform
+# Candidate kernels per platform, in priority/performance order (when
+# available). `choose_scaled_mm_linear_kernel` walks the list for the current
+# platform and returns the first kernel that is usable.
+_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
+    PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
+    PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
+    PlatformEnum.ROCM: [TritonScaledMMLinearKernel],
+    PlatformEnum.TPU: [XLAScaledMMLinearKernel],
+}
+def choose_scaled_mm_linear_kernel(
+        config: ScaledMMLinearLayerConfig,
+        compute_capability: Optional[int] = None
+) -> Type[ScaledMMLinearKernel]:
+    """
+    Choose an ScalledMMLinearKernel that can implement the given config for the
+    given compute capability. Attempts to choose the best kernel in terms of
+    performance.
+    Args:
+        config (ScaledMMLinearLayerConfig): Description of the linear layer
+            to be implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+            the target device, if None uses `current_platform` to get the
+            compute capability. Defaults to None.
+    Raises:
+        ValueError: If no kernel can implement the given config.
+    Returns:
+        Type[ScaledMMLinearKernel]: Chosen kernel.
+    """
+    if compute_capability is None:
+        _cc = current_platform.get_device_capability()
+        if _cc is not None:
+            compute_capability = _cc[0] * 10 + _cc[1]
+    failure_reasons = []
+    for kernel in _POSSIBLE_KERNELS[current_platform._enum]:
+        if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "")\
+            .split(","):
+            failure_reasons.append(
+                f' {kernel.__name__} disabled by environment variable')
+            continue
+        # If the current platform uses compute_capability,
+        # make sure the kernel supports the compute cability.
+        if compute_capability is not None:
+            kernel_min_capability = kernel.get_min_capability()
+            if (kernel_min_capability is not None
+                    and kernel_min_capability > compute_capability):
+                failure_reasons.append(
+                    f"{kernel.__name__} requires capability "
+                    f"{kernel_min_capability}, current compute capability "
+                    f"is {compute_capability}")
+                continue
+        can_implement, failure_reason = kernel.can_implement(config)
+        if can_implement:
+            return kernel
+        else:
+            failure_reasons.append(
+                f' {kernel.__name__} cannot implement due to: {failure_reason}'
+            )
+    raise ValueError(
+        "Failed to find a kernel that can implement the "\
+        "ScaledMM linear layer. Reasons: \n"
+        + '\n'.join(failure_reasons))

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-311.pyc ADDED Viewed

Binary file (3.78 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (3.84 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-311.pyc ADDED Viewed

Binary file (6.34 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/triton.cpython-311.pyc ADDED Viewed

Binary file (2.58 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-311.pyc ADDED Viewed

Binary file (5.26 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py ADDED Viewed

	@@ -0,0 +1,136 @@

+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
+    """Scaled-matmul linear kernel that quantizes activations with
+    `ops.scaled_int8_quant` and multiplies with `ops.cutlass_scaled_mm`
+    (or `ops.cutlass_scaled_mm_azp` for asymmetric activations).
+    """
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum compute capability required by this kernel."""
+        return 75
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        """Return `(True, None)` on CUDA or CPU platforms, otherwise
+        `(False, reason)`. The config `c` itself is not inspected."""
+        if (not current_platform.is_cuda() and not current_platform.is_cpu()):
+            return False, "CutlassScaledMM requires running on CUDA or CPU."
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Rewrite the parameters on `layer` into the form `apply_weights`
+        expects: transposed weight, (possibly channelwise) weight scale,
+        a single input scale / zero point for static schemes (None for
+        dynamic ones), and the AZP adjustment term for asymmetric inputs.
+        """
+        # WEIGHT
+        # Cutlass kernels need transposed weight.
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(
+            layer, self.w_q_name,
+            torch.nn.Parameter(weight.t().data, requires_grad=False))
+        # WEIGHT SCALE
+        # Cutlass kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+        # INPUT SCALE
+        if self.config.is_static_input_scheme:
+            input_scale = getattr(layer, self.i_s_name)
+            if self.config.input_symmetric:
+                # Collapse the loaded input scale(s) to their maximum; no
+                # zero point is needed in the symmetric case.
+                replace_parameter(
+                    layer, self.i_s_name,
+                    torch.nn.Parameter(input_scale.max(), requires_grad=False))
+                setattr(layer, self.i_zp_name, None)
+            else:
+                input_zero_point = getattr(layer, self.i_zp_name)
+                # reconstruct the ranges
+                # The widest real-valued range covered by any loaded
+                # (scale, zero point) pair is re-quantized into one scalar
+                # scale and one scalar zero point.
+                int8_traits = torch.iinfo(torch.int8)
+                azps = input_zero_point.to(dtype=torch.int32)
+                range_max = (input_scale * (int8_traits.max - azps)).max()
+                range_min = (input_scale * (int8_traits.min - azps)).min()
+                scale = (range_max - range_min) / (int8_traits.max -
+                                                   int8_traits.min)
+                replace_parameter(
+                    layer, self.i_s_name,
+                    torch.nn.Parameter(scale, requires_grad=False))
+                # AZP loaded as int8 but used as int32
+                azp = (int8_traits.min -
+                       range_min / scale).to(dtype=torch.int32)
+                replace_parameter(layer, self.i_zp_name,
+                                  torch.nn.Parameter(azp, requires_grad=False))
+        else:
+            # Dynamic scheme: scale (and zero point) are computed from the
+            # activations at runtime, so nothing is stored on the layer.
+            setattr(layer, self.i_s_name, None)
+            setattr(layer, self.i_zp_name, None)
+        # azp_adj is the AZP adjustment term, used to account for weights.
+        # It does not depend on scales or azp, so it is the same for
+        # static and dynamic quantization.
+        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
+        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
+        if not self.config.input_symmetric:
+            # Read the weight back from the layer: this is the transposed
+            # parameter installed at the top of this method.
+            weight = getattr(layer, self.w_q_name)
+            azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32)
+            if self.config.is_static_input_scheme:
+                # cutlass_w8a8 requires azp to be folded into azp_adj
+                # in the per-tensor case
+                azp_adj = getattr(layer, self.i_zp_name) * azp_adj
+            setattr(layer, self.azp_adj_name,
+                    torch.nn.Parameter(azp_adj, requires_grad=False))
+        else:
+            setattr(layer, self.azp_adj_name, None)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Quantize `x` to int8 and run the scaled matmul against the
+        layer's weight, returning a tensor of dtype `x.dtype`."""
+        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
+        # ops.scaled_int8_quant supports both dynamic and static quant:
+        # * dynamic, i_s is None and x_s computed from x.
+        # * static, i_s is scalar and x_s is i_s.
+        # azp_adj is only set for asymmetric input quantization, so its
+        # absence marks the symmetric case.
+        symmetric = azp_adj is None
+        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
+                                               i_s,
+                                               i_zp,
+                                               symmetric=symmetric)
+        if x_zp is not None:
+            # Currently, static is always per-tensor and dynamic is per-token
+            # In the static case the zero point was already folded into
+            # azp_adj during weight processing, so none is passed here.
+            static = i_zp is not None
+            azp = None if static else x_zp
+            return ops.cutlass_scaled_mm_azp(x_q,
+                                             w_q,
+                                             scale_a=x_s,
+                                             scale_b=w_s,
+                                             out_dtype=x.dtype,
+                                             azp_adj=azp_adj,
+                                             azp=azp,
+                                             bias=bias)
+        return ops.cutlass_scaled_mm(x_q,
+                                     w_q,
+                                     scale_a=x_s,
+                                     scale_b=w_s,
+                                     out_dtype=x.dtype,
+                                     bias=bias)

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+import torch
+from vllm.platforms import current_platform
+from .cutlass import CutlassScaledMMLinearKernel
+from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
+class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 75
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        if current_platform.is_cpu():
+            return (
+                False,
+                "TritonScaledMMLinearKernel requires Triton which is not " +
+                "currently supported on CPU.")
+        if not c.input_symmetric:
+            return (False,
+                    "TritonScaledMMLinearKernel only supports symmetric " +
+                    "quantization.")
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return super().apply_weights(layer, x, bias)

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# SPDX-License-Identifier: Apache-2.0
+import warnings
+from typing import Optional, Tuple
+import torch
+from functorch.experimental.control_flow import cond  # noqa: F401
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError(
+            "TPU platform does have a concept of compute capability, "
+            "this method should not be called.")
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        if not current_platform.is_tpu():
+            return False, "ScaledMMXLA requires running on TPU."
+        if c.is_static_input_scheme:
+            return False, "ScaledMMXLA requires dynamic activation scales."
+        if not c.input_symmetric:
+            return False, "ScaledMMXLA requires symmetric activation scales."
+        if not c.is_channelwise:
+            return False, "ScaledMMXLA requires channelwise weight scales"
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # WEIGHT
+        # [out, in] (different than cutlass_scaled_mm)
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(layer, self.w_q_name,
+                          torch.nn.Parameter(weight.data, requires_grad=False))
+        # WEIGHT SCALE
+        # XLA kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+        # [out_channel,] (different than cutlass_scaled_mm)
+        weight_scale = weight_scale.squeeze(-1)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+        # Only support symmetric dynamic activation quantization.
+        setattr(layer, self.i_s_name, None)
+        setattr(layer, self.i_zp_name, None)
+        setattr(layer, self.azp_adj_name, None)
+        # Filter warning for cond usage in apply_weights. It is okay
+        # to specialize the graph since bias is not dynamic.
+        warnings.filterwarnings(
+            "ignore",
+            message=
+            "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."  # noqa: E501
+        )
+    def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+        return x
+    def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+        return x + bias
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        w_q, w_s, _, _, _ = self._get_weight_params(layer)
+        import torch_xla.experimental.xla_quantized_matmul  # noqa: F401
+        out = torch.ops.xla.quantized_matmul(x,
+                                             w_q,
+                                             w_s,
+                                             zero_point=None,
+                                             block_size=-1,
+                                             int4_weight=False,
+                                             quantize_activation=True)
+        # Explicitly capture control flow to make dynamo happy.
+        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
+        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (218 Bytes). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark.cpython-311.pyc ADDED Viewed

Binary file (21.8 kB). View file

.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-311.pyc ADDED Viewed

Binary file (11.1 kB). View file