Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +76 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/marlin.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +122 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +135 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +84 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/triton.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +103 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (4.95 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-311.pyc
ADDED
|
Binary file (15.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-311.pyc
ADDED
|
Binary file (9.79 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-311.pyc
ADDED
|
Binary file (21.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-311.pyc
ADDED
|
Binary file (13.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-311.pyc
ADDED
|
Binary file (7.73 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc
ADDED
|
Binary file (17 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-311.pyc
ADDED
|
Binary file (11.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-311.pyc
ADDED
|
Binary file (9.87 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-311.pyc
ADDED
|
Binary file (8.79 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-311.pyc
ADDED
|
Binary file (29.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-311.pyc
ADDED
|
Binary file (11.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-311.pyc
ADDED
|
Binary file (11.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-311.pyc
ADDED
|
Binary file (24.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-311.pyc
ADDED
|
Binary file (12.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-311.pyc
ADDED
|
Binary file (17.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-311.pyc
ADDED
|
Binary file (12.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-311.pyc
ADDED
|
Binary file (4.81 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-311.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-311.pyc
ADDED
|
Binary file (9.62 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc
ADDED
|
Binary file (20.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-311.pyc
ADDED
|
Binary file (4.11 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-311.pyc
ADDED
|
Binary file (11.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-311.pyc
ADDED
|
Binary file (4.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-311.pyc
ADDED
|
Binary file (7.74 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (220 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Callable, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
| 10 |
+
from vllm.scalar_type import ScalarType
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class MPLinearLayerConfig:
    """Static description of a mixed-precision linear layer.

    An instance is handed to ``MPLinearKernel.can_implement`` and to the
    kernel constructor so that a kernel can decide whether (and how) it can
    execute the layer.
    """
    # Shape of the complete, unpartitioned weight.
    full_weight_shape: Tuple[int, int]  # [in, out]
    # Shape of the weight partition held by this layer.
    # NOTE(review): presumably also ordered [in, out] — confirm with callers.
    partition_weight_shape: Tuple[int, int]
    # Scalar type of the quantized weights.
    weight_type: ScalarType
    # dtype of the activations fed to the layer.
    act_type: torch.dtype
    # Quantization group size.
    group_size: int
    # True when the layer carries a zero-point parameter.
    zero_points: bool
    # True when the layer carries a group-index (g_idx) parameter.
    has_g_idx: bool
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class MPLinearKernel(ABC):
    """Abstract base class for mixed-precision linear kernels.

    A concrete kernel states which configurations it supports
    (``get_min_capability`` / ``can_implement``), rewrites the layer's
    parameters once after loading (``process_weights_after_loading``) and
    performs the matmul (``apply_weights``).

    The kernel does not own the weights: it only remembers the attribute
    names under which they are stored on the layer.
    """

    @classmethod
    @abstractmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability required, as an int."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def can_implement(cls,
                      c: "MPLinearLayerConfig") -> Tuple[bool, Optional[str]]:
        """Report whether this kernel supports the config ``c``.

        Returns:
            ``(True, None)`` when supported, otherwise ``(False, reason)``.
        """
        raise NotImplementedError

    def __init__(self,
                 c: "MPLinearLayerConfig",
                 w_q_param_name: str,
                 w_s_param_name: str,
                 w_zp_param_name: Optional[str] = None,
                 w_gidx_param_name: Optional[str] = None) -> None:
        """Bind the kernel to a layer config and its parameter names.

        Args:
            c: Description of the layer this kernel will execute.
            w_q_param_name: Layer attribute holding the quantized weights.
            w_s_param_name: Layer attribute holding the weight scales.
            w_zp_param_name: Layer attribute holding the zero points;
                required when ``c.zero_points`` is set.
            w_gidx_param_name: Layer attribute holding the group indices;
                required when ``c.has_g_idx`` is set.

        Raises:
            AssertionError: If the kernel cannot implement ``c`` or a
                required parameter name is missing.
        """
        # `can_implement` returns a (bool, reason) tuple. Asserting on the
        # tuple itself is always true because a non-empty tuple is truthy,
        # so unpack it and assert on the flag.
        supported, reason = self.can_implement(c)
        assert supported, reason
        self.config = c
        self.w_q_name = w_q_param_name
        self.w_s_name = w_s_param_name
        if c.zero_points:
            assert w_zp_param_name is not None
        if c.has_g_idx:
            assert w_gidx_param_name is not None
        self.w_zp_name = w_zp_param_name
        self.w_gidx_name = w_gidx_param_name

    @abstractmethod
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Convert the loaded parameters on ``layer`` to the kernel's layout."""
        raise NotImplementedError

    @abstractmethod
    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply the layer to ``x``, optionally adding ``bias``."""
        raise NotImplementedError

    def _transform_param(self, layer: torch.nn.Module, name: Optional[str],
                         fn: Callable) -> None:
        """Replace ``layer.<name>`` with ``fn(layer.<name>)``.

        Does nothing when ``name`` is None or the attribute is missing or
        None on the layer.
        """
        if name is not None and getattr(layer, name, None) is not None:

            old_param = getattr(layer, name)
            new_param = fn(old_param)
            # replace the parameter with torch.nn.Parameter for TorchDynamo
            # compatibility
            replace_parameter(
                layer, name,
                torch.nn.Parameter(new_param.data, requires_grad=False))

    def _get_weight_params(
            self, layer: torch.nn.Module) -> Tuple[
                torch.Tensor,  # w_q
                torch.Tensor,  # w_s
                Optional[torch.Tensor],  # w_zp,
                Optional[torch.Tensor]  # w_gidx
            ]:
        """Fetch ``(w_q, w_s, w_zp, w_gidx)`` from ``layer``.

        The last two entries are None when the corresponding name is unset
        or the attribute does not exist on the layer.
        """
        return (
            getattr(layer, self.w_q_name),
            getattr(layer, self.w_s_name),
            # `or ""` turns an unset name into a lookup that falls back to
            # the None default.
            getattr(layer, self.w_zp_name or "", None),
            getattr(layer, self.w_gidx_name or "", None),
        )
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import List, Optional, Type
|
| 4 |
+
|
| 5 |
+
import vllm.envs as envs
|
| 6 |
+
from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import ( # noqa: E501
|
| 7 |
+
ExllamaLinearKernel)
|
| 8 |
+
from vllm.model_executor.layers.quantization.kernels.mixed_precision.machete import ( # noqa: E501
|
| 9 |
+
MacheteLinearKernel)
|
| 10 |
+
from vllm.model_executor.layers.quantization.kernels.mixed_precision.marlin import ( # noqa: E501
|
| 11 |
+
MarlinLinearKernel)
|
| 12 |
+
from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKernel import ( # noqa: E501
|
| 13 |
+
MPLinearKernel, MPLinearLayerConfig)
|
| 14 |
+
from vllm.platforms import current_platform
|
| 15 |
+
|
| 16 |
+
# in priority/performance order (when available)
# `choose_mp_linear_kernel` walks this list front to back and returns the
# first kernel that passes all of its checks.
_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
    MacheteLinearKernel,
    MarlinLinearKernel,
    ExllamaLinearKernel,
]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def choose_mp_linear_kernel(
        config: MPLinearLayerConfig,
        compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
    """
    Choose an MPLinearKernel that can implement the given config for the given
    compute capability. Attempts to choose the best kernel in terms of
    performance.

    Args:
        config (MPLinearLayerConfig): Description of the linear layer to be
            implemented.
        compute_capability (Optional[int], optional): The compute capability
            of the target device, encoded as ``major * 10 + minor``. If None,
            uses `current_platform` to get the compute capability.
            Defaults to None.

    Raises:
        ValueError: If the compute capability cannot be determined, or if no
            kernel can implement the given config.

    Returns:
        Type[MPLinearKernel]: Chosen kernel.
    """
    if compute_capability is None:
        if current_platform is None:
            raise ValueError("Cannot determine compute capability")
        _cc = current_platform.get_device_capability()
        # NOTE(review): guards against a platform that reports no capability
        # (None) so the caller gets the same ValueError as above instead of
        # an opaque subscript error — confirm against the platform interface.
        if _cc is None:
            raise ValueError("Cannot determine compute capability")
        compute_capability = _cc[0] * 10 + _cc[1]

    # One human-readable entry per rejected kernel, reported if none fits.
    failure_reasons = []
    for kernel in _POSSIBLE_KERNELS:
        if kernel.__name__ in envs.VLLM_DISABLED_KERNELS:
            failure_reasons.append(
                f' {kernel.__name__} disabled by environment variable')
            continue

        min_capability = kernel.get_min_capability()
        if min_capability > compute_capability:
            failure_reasons.append(
                f"{kernel.__name__} requires capability "
                f"{min_capability}, current compute capability "
                f"is {compute_capability}")
            continue

        can_implement, failure_reason = kernel.can_implement(config)
        if can_implement:
            # The list is in priority order, so the first match wins.
            return kernel
        else:
            failure_reasons.append(
                f' {kernel.__name__} cannot implement due to: {failure_reason}'
            )

    raise ValueError(
        "Failed to find a kernel that can implement the "\
        "WNA16 linear layer. Reasons: \n"
        + '\n'.join(failure_reasons))
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-311.pyc
ADDED
|
Binary file (4.87 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (3.53 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-311.pyc
ADDED
|
Binary file (7.87 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-311.pyc
ADDED
|
Binary file (7.18 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/marlin.cpython-311.pyc
ADDED
|
Binary file (6.96 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Tuple
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
from vllm import _custom_ops as ops
|
| 8 |
+
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
| 9 |
+
pack_quantized_values_into_int32)
|
| 10 |
+
from vllm.model_executor.parameter import (BasevLLMParameter,
|
| 11 |
+
permute_param_layout_)
|
| 12 |
+
from vllm.scalar_type import scalar_types
|
| 13 |
+
|
| 14 |
+
from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ExllamaLinearKernel(MPLinearKernel):
    """Mixed-precision linear kernel built on the Exllama GPTQ ops.

    Weights are shuffled once with ``ops.gptq_shuffle`` after loading and the
    matmul is executed with ``ops.gptq_gemm``.
    """
    SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
    # In theory supports `scalar_types.uint2b2, scalar_types.uint3b4` too but
    # currently untested so not added to the list

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability required (60)."""
        return 60

    @classmethod
    def can_implement(cls,
                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Report whether Exllama supports ``c``.

        Returns:
            ``(True, None)`` when supported, otherwise ``(False, reason)``.
        """
        if c.has_g_idx and\
            c.partition_weight_shape[0] != c.full_weight_shape[0]:
            return False, "Act reordering currently not supported by Exllama, "\
                          "when the input features are partitioned across "\
                          "devices"

        if c.partition_weight_shape[1] % (32 // c.weight_type.size_bits) != 0:
            return False, "Output features must be a multiple of the pack " \
                          "factor (32 / num_bits) so that we can correctly " \
                          "pack the zero points"

        if c.act_type != torch.float16:
            return False, "Exllama only supports float16 activations"

        if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
            return False, f"Quant type ({c.weight_type}) not supported by "\
                          "Exllama, supported types are: "\
                          f"{cls.SUPPORTED_QUANT_TYPES}"

        if c.full_weight_shape[0] % c.group_size != 0:
            return False, f"Group size ({c.group_size}) does not evenly divide"\
                          " the number of input features "\
                          f"({c.full_weight_shape[0]})"

        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Prepare the parameters on ``layer`` for ``ops.gptq_gemm``.

        Creates a zero-point tensor and/or an empty g_idx tensor when the
        layer has none, converts an existing g_idx into a permutation, then
        shuffles the quantized weights and casts the scales.
        """
        c = self.config

        # Needed both for the synthesized zero points and for the empty
        # g_idx below, so look it up before either branch. Binding it only
        # in the zero-point branch left it undefined for layers that have
        # zero points but no g_idx.
        device = getattr(layer, self.w_q_name).device

        # For Exllama, we need to set a zero-point tensor if there is not one
        if not c.zero_points:
            self.w_zp_name = "qzeros"
            groups = c.partition_weight_shape[0] // c.group_size
            out_features = c.partition_weight_shape[1]

            if c.weight_type.has_bias():
                # if the type has a bias we have to create a zeros tensor that
                # contains the bias values repeated for each group (-1 due to
                # a bug in the original GPTQ checkpoint format leading to
                # exllama kernel adding 1 to the zero points during inference)
                # Documentation of the bug can be found here:
                #  https://garden.danieldk.eu/GPTQ-Checkpoint-Format
                zeros = torch.full((groups, out_features),
                                   c.weight_type.bias - 1,
                                   dtype=torch.int32,
                                   device=device)
            else:
                raise NotImplementedError(
                    "A 0 zero-point is not supported by Exllama due to "
                    "a bug in the original GPTQ checkpoint format leading to "
                    "exllama kernel adding 1 to the zero points during "
                    "inference")
            zeros = pack_quantized_values_into_int32(zeros,
                                                     c.weight_type,
                                                     packed_dim=1)
            setattr(layer, self.w_zp_name,
                    torch.nn.Parameter(zeros, requires_grad=False))

        if c.has_g_idx:

            def transform_w_g_idx(x):
                # Exllama wants the permutation array instead of the group
                # indices
                return torch.argsort(x).to(torch.int)

            self._transform_param(layer, self.w_gidx_name, transform_w_g_idx)
        else:
            # No act reordering: register an empty g_idx tensor on the layer.
            self.w_gidx_name = "g_idx"
            empty_g_idx = torch.nn.Parameter(torch.empty((0, ),
                                                         dtype=torch.int,
                                                         device=device),
                                             requires_grad=False)
            setattr(layer, self.w_gidx_name, empty_g_idx)

        def transform_w_q(x):
            assert isinstance(x, BasevLLMParameter)
            assert self.w_gidx_name is not None
            g_idx = getattr(layer, self.w_gidx_name)

            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
            x_cont = x.data.contiguous()
            # The return value is unused; the shuffled `x_cont` is returned.
            ops.gptq_shuffle(x_cont, g_idx, c.weight_type.size_bits)
            return x_cont

        def transform_w_s(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1)
            x.data = x.data.contiguous()
            return x.to(dtype=c.act_type)

        # Repack weights and scales for Exllama
        self._transform_param(layer, self.w_q_name, transform_w_q)
        self._transform_param(layer, self.w_s_name, transform_w_s)

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the quantized matmul on ``x`` and add ``bias`` if given.

        Returns a tensor with the leading dimensions of ``x`` and a last
        dimension of ``partition_weight_shape[1]``.
        """
        c = self.config

        # Flatten all leading dimensions; the op is called on a 2D input.
        x_2d = x.reshape(-1, x.shape[-1])
        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )

        w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)

        assert w_zp is not None, "Zero points are required by Exllama"
        assert w_g_idx is not None, "Group index is required by Exllama"
        output = ops.gptq_gemm(x_2d, w_q, w_zp, w_s, w_g_idx, True,
                               c.weight_type.size_bits)

        if bias is not None:
            output.add_(bias)
        return output.reshape(out_shape)
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from functools import partial
|
| 4 |
+
from typing import Optional, Tuple
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
from vllm import _custom_ops as ops
|
| 9 |
+
from vllm.model_executor.layers.quantization.utils.machete_utils import (
|
| 10 |
+
MACHETE_SUPPORTED_GROUP_SIZES, check_machete_supports_shape,
|
| 11 |
+
query_machete_supported_quant_types)
|
| 12 |
+
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
| 13 |
+
pack_quantized_values_into_int32, unpack_quantized_values_into_int32)
|
| 14 |
+
from vllm.model_executor.parameter import (BasevLLMParameter,
|
| 15 |
+
permute_param_layout_)
|
| 16 |
+
|
| 17 |
+
from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class MacheteLinearKernel(MPLinearKernel):
    """Mixed-precision linear kernel built on the Machete ops.

    Weights are prepacked once with ``ops.machete_prepack_B`` after loading
    and the matmul is executed with ``ops.machete_mm``.
    """

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability required (90)."""
        return 90

    @classmethod
    def can_implement(cls,
                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Report whether Machete supports ``c``.

        Returns:
            ``(True, None)`` when supported, otherwise ``(False, reason)``.
        """
        if c.has_g_idx and\
            c.partition_weight_shape[0] != c.full_weight_shape[0]:
            return False, "Act reordering currently not supported by Machete, "\
                          "when the input features are partitioned across "\
                          "devices"

        if c.zero_points:
            return False, "Zero points currently not supported by "\
                          " Compressed Tensors + Machete. (Kernel supports it"\
                          " but CompressedTensorsWNA16 does not so support has"\
                          " not been added to MacheteWNA16Kernel yet)"

        # Queried once and reused for both the check and the message.
        supported_types = query_machete_supported_quant_types(c.zero_points)
        if c.weight_type not in supported_types:
            return False, f"Quant type ({c.weight_type}) not supported by "\
                          "Machete, supported types are: "\
                          f"{supported_types}"

        if c.group_size not in MACHETE_SUPPORTED_GROUP_SIZES:
            return False, f"Group size ({c.group_size}) not supported by "\
                          "Machete, supported group sizes are: "\
                          f"{MACHETE_SUPPORTED_GROUP_SIZES}"

        return check_machete_supports_shape(c.partition_weight_shape[0],
                                            c.partition_weight_shape[1])

    # note assumes that
    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Prepack the weights and scales on ``layer`` for Machete.

        When the layer has a g_idx, also builds ``self.act_perm``, the
        activation column permutation applied in ``apply_weights``, and
        permutes the weight rows to match.
        """
        c = self.config

        if c.has_g_idx:
            assert self.w_gidx_name is not None
            perm = torch.argsort(getattr(layer, self.w_gidx_name))\
                .to(torch.int)

            self.act_perm = lambda x: x[:, perm]
            # use `ops.permute_cols` if possible
            if c.act_type in [torch.float16, torch.bfloat16] \
                and c.partition_weight_shape[0] % 8 == 0:
                self.act_perm = partial(ops.permute_cols, perm=perm)

        def transform_w_q(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
            if c.has_g_idx:
                # Unpack, apply the same permutation to the rows as is
                # applied to the activation columns, then repack.
                x_unpacked = unpack_quantized_values_into_int32(x.data,
                                                                c.weight_type,
                                                                packed_dim=0)
                x_perm = x_unpacked[perm, :]
                x.data = pack_quantized_values_into_int32(x_perm,
                                                          c.weight_type,
                                                          packed_dim=0)
            x.data = ops.machete_prepack_B(x.data.t().contiguous().t(),
                                           a_type=c.act_type,
                                           b_type=c.weight_type,
                                           group_scales_type=c.act_type)
            return x

        def transform_w_s(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1)
            x.data = x.data.contiguous()
            return x

        # Repack weights and scales for Machete
        self._transform_param(layer, self.w_q_name, transform_w_q)
        self._transform_param(layer, self.w_s_name, transform_w_s)

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the quantized matmul on ``x`` and add ``bias`` if given.

        Returns a tensor with the leading dimensions of ``x`` and a last
        dimension of ``partition_weight_shape[1]``.
        """
        c = self.config
        w_q, w_s, _, _ = self._get_weight_params(layer)

        # Flatten all leading dimensions; the op is called on a 2D input.
        x_2d = x.reshape(-1, x.shape[-1])
        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )

        if c.has_g_idx:
            x_2d = self.act_perm(x_2d)

        output = ops.machete_mm(a=x_2d,
                                b_q=w_q,
                                b_type=c.weight_type,
                                b_group_zeros=None,
                                b_group_scales=w_s,
                                b_group_size=c.group_size)

        if bias is not None:
            output.add_(bias)  # In-place add

        return output.reshape(out_shape)
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Tuple
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
from vllm import _custom_ops as ops
|
| 8 |
+
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
| 9 |
+
MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear,
|
| 10 |
+
check_marlin_supports_shape, marlin_is_k_full, marlin_make_empty_g_idx,
|
| 11 |
+
marlin_make_workspace, marlin_permute_scales, marlin_sort_g_idx,
|
| 12 |
+
query_marlin_supported_quant_types)
|
| 13 |
+
from vllm.model_executor.parameter import (BasevLLMParameter,
|
| 14 |
+
permute_param_layout_)
|
| 15 |
+
|
| 16 |
+
from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class MarlinLinearKernel(MPLinearKernel):
    """Mixed-precision linear kernel built on the GPTQ-Marlin helpers."""

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability this kernel accepts."""
        return 80

    @classmethod
    def can_implement(cls,
                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Report whether Marlin can serve the layer described by ``c``.

        Returns:
            ``(True, None)``-style result from the shape check on success,
            otherwise ``(False, reason)`` naming the unsupported property.
        """
        if c.zero_points:
            reason = ("Zero points currently not supported by "
                      " MarlinLinearKernel. Will be added when AWQMarlin "
                      "is migrated over to using MPLinearKernel backend")
            return False, reason

        quant_types = query_marlin_supported_quant_types(c.zero_points)
        if c.weight_type not in quant_types:
            reason = (f"Quant type ({c.weight_type}) not supported by"
                      f" Marlin, supported types are: {quant_types}")
            return False, reason

        if c.group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
            reason = (f"Group size ({c.group_size}) not supported by "
                      "Marlin, supported group sizes are: "
                      f"{MARLIN_SUPPORTED_GROUP_SIZES}")
            return False, reason

        out_features = c.partition_weight_shape[1]
        in_features_partition = c.partition_weight_shape[0]
        in_features_full = c.full_weight_shape[0]
        return check_marlin_supports_shape(out_features,
                                           in_features_partition,
                                           in_features_full, c.group_size)

    # note assumes that
    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Repack the loaded weights and scales of ``layer`` for Marlin.

        Also allocates the Marlin workspace, records ``is_k_full`` and
        makes sure the layer carries group-index and zero-point attributes
        (empty placeholders when the config does not use them).
        """
        cfg = self.config
        device = getattr(layer, self.w_q_name).device

        # The input dimension is sharded when the partition is smaller
        # than the full weight along dim 0.
        is_row_parallel = (cfg.partition_weight_shape[0]
                           != cfg.full_weight_shape[0])
        self.is_k_full = marlin_is_k_full(cfg.has_g_idx, is_row_parallel)

        # Allocate marlin workspace.
        self.workspace = marlin_make_workspace(cfg.partition_weight_shape[1],
                                               device)

        # Default names since marlin requires empty parameters for these,
        # TODO: remove this requirement from marlin (allow optional tensors)
        if self.w_gidx_name is None:
            self.w_gidx_name = "g_idx"
        if self.w_zp_name is None:
            self.w_zp_name = "w_zp"

        if not cfg.has_g_idx:
            setattr(layer, self.w_gidx_name, marlin_make_empty_g_idx(device))
            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
        else:
            sorted_g_idx, sort_indices = marlin_sort_g_idx(
                getattr(layer, self.w_gidx_name))
            self._transform_param(layer, self.w_gidx_name,
                                  lambda _: sorted_g_idx)
            layer.g_idx_sort_indices = sort_indices

        # TODO (lucas): transform the zero points with marlin_zero_points
        #  (size_k / size_n from partition_weight_shape, num_bits from
        #  weight_type.size_bits) once AWQMarlin is migrated over to using
        #  the MPLinearKernel backend. Until then a config with zero points
        #  is left untouched here.
        if not cfg.zero_points:
            setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))

        def repack_qweight(param):
            assert isinstance(param, BasevLLMParameter)
            permute_param_layout_(param,
                                  input_dim=0,
                                  output_dim=1,
                                  packed_dim=0)
            param.data = ops.gptq_marlin_repack(
                param.data.contiguous(),
                perm=layer.g_idx_sort_indices,
                size_k=cfg.partition_weight_shape[0],
                size_n=cfg.partition_weight_shape[1],
                num_bits=cfg.weight_type.size_bits)
            return param

        def permute_scales(param):
            assert isinstance(param, BasevLLMParameter)
            permute_param_layout_(param, input_dim=0, output_dim=1)
            param.data = marlin_permute_scales(
                param.data.contiguous(),
                size_k=cfg.partition_weight_shape[0],
                size_n=cfg.partition_weight_shape[1],
                group_size=cfg.group_size)
            return param

        self._transform_param(layer, self.w_q_name, repack_qweight)
        self._transform_param(layer, self.w_s_name, permute_scales)

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply the Marlin-repacked linear layer to ``x``."""
        cfg = self.config
        qweight, scales, zero_points, g_idx = self._get_weight_params(layer)

        # `process_weights_after_loading` will ensure w_zp and w_gidx are not
        #  None for marlin
        return apply_gptq_marlin_linear(
            input=x,
            weight=qweight,
            weight_scale=scales,
            weight_zp=zero_points,  # type: ignore
            g_idx=g_idx,  # type: ignore
            g_idx_sort_indices=layer.g_idx_sort_indices,
            workspace=self.workspace,
            wtype=cfg.weight_type,
            input_size_per_partition=cfg.partition_weight_shape[0],
            output_size_per_partition=cfg.partition_weight_shape[1],
            is_k_full=self.is_k_full,
            bias=bias)
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Optional, Tuple
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class ScaledMMLinearLayerConfig:
    """Quantization-scheme flags for a scaled-MM linear layer."""
    # Weight scales are per output channel (as opposed to per tensor).
    is_channelwise: bool
    # Activation scales are static (loaded) rather than computed dynamically.
    is_static_input_scheme: bool
    # Activations are quantized symmetrically (no zero point).
    input_symmetric: bool


class ScaledMMLinearKernel(ABC):
    """Abstract interface for kernels that implement a scaled-MM linear layer.

    A kernel is constructed with a layer config and the attribute names under
    which the layer stores its weight, weight scale, input scale, input zero
    point and AZP adjustment term.
    """

    @classmethod
    @abstractmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability the kernel requires."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def can_implement(
            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Return ``(True, None)`` if ``c`` is supported, else ``(False, reason)``."""
        raise NotImplementedError

    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
                 w_s_param_name: str, i_s_param_name: str,
                 i_zp_param_name: str, azp_adj_param_name: str) -> None:
        """Validate ``c`` against this kernel and store the parameter names.

        Raises:
            ValueError: If ``can_implement(c)`` reports the config as
                unsupported.
        """
        # can_implement returns a (bool, reason) tuple. Asserting on the
        # tuple itself is always true, so unpack it and check the flag.
        supported, reason = self.can_implement(c)
        if not supported:
            raise ValueError(
                f"{type(self).__name__} cannot implement the given config: "
                f"{reason}")
        self.config = c
        self.w_q_name = w_q_param_name
        self.w_s_name = w_s_param_name
        self.i_s_name = i_s_param_name
        self.i_zp_name = i_zp_param_name
        self.azp_adj_name = azp_adj_param_name

    @abstractmethod
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Post-process the loaded parameters of ``layer`` for this kernel."""
        raise NotImplementedError

    @abstractmethod
    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply the quantized linear layer to ``x`` (optionally adding ``bias``)."""
        raise NotImplementedError

    def _get_weight_params(
            self, layer: torch.nn.Module) -> Tuple[
                torch.Tensor,  # weight
                torch.Tensor,  # weight_scale
                Optional[torch.Tensor],  # input_scale,
                Optional[torch.Tensor],  # input_zp
                Optional[torch.Tensor],  # azp_adj
            ]:
        """Fetch the five kernel parameters from ``layer`` by their stored names."""
        return (
            getattr(layer, self.w_q_name),
            getattr(layer, self.w_s_name),
            getattr(layer, self.i_s_name),
            getattr(layer, self.i_zp_name),
            getattr(layer, self.azp_adj_name),
        )
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Dict, List, Optional, Type
|
| 5 |
+
|
| 6 |
+
from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
|
| 7 |
+
CutlassScaledMMLinearKernel)
|
| 8 |
+
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
|
| 9 |
+
ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
|
| 10 |
+
from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
|
| 11 |
+
TritonScaledMMLinearKernel)
|
| 12 |
+
from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
|
| 13 |
+
XLAScaledMMLinearKernel)
|
| 14 |
+
from vllm.platforms import PlatformEnum, current_platform
|
| 15 |
+
|
| 16 |
+
# in priority/performance order (when available)
|
| 17 |
+
_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
    PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
    PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
    PlatformEnum.ROCM: [TritonScaledMMLinearKernel],
    PlatformEnum.TPU: [XLAScaledMMLinearKernel],
}


def choose_scaled_mm_linear_kernel(
        config: ScaledMMLinearLayerConfig,
        compute_capability: Optional[int] = None
) -> Type[ScaledMMLinearKernel]:
    """
    Choose a ScaledMMLinearKernel that can implement the given config for the
    given compute capability. Attempts to choose the best kernel in terms of
    performance.

    Kernels named in the comma-separated ``VLLM_DISABLED_KERNELS``
    environment variable are skipped.

    Args:
        config (ScaledMMLinearLayerConfig): Description of the linear layer
            to be implemented.
        compute_capability (Optional[int], optional): The compute capability of
            the target device, if None uses `current_platform` to get the
            compute capability. Defaults to None.

    Raises:
        ValueError: If no kernel can implement the given config, including
            when no kernels are registered for the current platform.

    Returns:
        Type[ScaledMMLinearKernel]: Chosen kernel.
    """

    if compute_capability is None:
        _cc = current_platform.get_device_capability()
        if _cc is not None:
            # Fold the two capability components into one integer.
            compute_capability = _cc[0] * 10 + _cc[1]

    # Parse the disable list once instead of once per candidate, and accept
    # whitespace around entries (e.g. "A, B").
    disabled_kernels = {
        name.strip()
        for name in os.environ.get("VLLM_DISABLED_KERNELS", "").split(",")
    }

    failure_reasons = []

    # A platform without a registry entry is reported through the documented
    # ValueError below rather than leaking a bare KeyError.
    platform = current_platform._enum
    candidates = _POSSIBLE_KERNELS.get(platform)
    if candidates is None:
        failure_reasons.append(
            f' no ScaledMM kernels are registered for platform {platform}')
        candidates = []

    for kernel in candidates:
        if kernel.__name__ in disabled_kernels:
            failure_reasons.append(
                f' {kernel.__name__} disabled by environment variable')
            continue

        # If the current platform uses compute_capability,
        # make sure the kernel supports the compute capability.
        if compute_capability is not None:
            kernel_min_capability = kernel.get_min_capability()
            if (kernel_min_capability is not None
                    and kernel_min_capability > compute_capability):
                failure_reasons.append(
                    f"{kernel.__name__} requires capability "
                    f"{kernel_min_capability}, current compute capability "
                    f"is {compute_capability}")
                continue

        can_implement, failure_reason = kernel.can_implement(config)
        if can_implement:
            return kernel
        failure_reasons.append(
            f' {kernel.__name__} cannot implement due to: {failure_reason}')

    raise ValueError(
        "Failed to find a kernel that can implement the "\
        "ScaledMM linear layer. Reasons: \n"
        + '\n'.join(failure_reasons))
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-311.pyc
ADDED
|
Binary file (3.78 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (3.84 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-311.pyc
ADDED
|
Binary file (6.34 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/triton.cpython-311.pyc
ADDED
|
Binary file (2.58 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-311.pyc
ADDED
|
Binary file (5.26 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Tuple
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
from vllm import _custom_ops as ops
|
| 8 |
+
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
| 9 |
+
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
| 10 |
+
convert_to_channelwise)
|
| 11 |
+
from vllm.platforms import current_platform
|
| 12 |
+
|
| 13 |
+
from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
|
| 14 |
+
ScaledMMLinearLayerConfig)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
    """Scaled-MM linear kernel dispatching to the ``cutlass_scaled_mm`` ops."""

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability this kernel accepts."""
        return 75

    @classmethod
    def can_implement(
            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Accept any config, provided the platform is CUDA or CPU."""
        on_supported_platform = (current_platform.is_cuda()
                                 or current_platform.is_cpu())
        if on_supported_platform:
            return True, None
        return False, "CutlassScaledMM requires running on CUDA or CPU."

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Rewrite the parameters of ``layer`` into the form the ops expect.

        Transposes the weight, widens per-tensor scales of fused modules to
        per-channel, collapses static input scales / zero points to a single
        value, and sets the AZP adjustment term (``None`` when symmetric).
        """
        cfg = self.config

        # WEIGHT
        # Cutlass kernels need transposed weight.
        loaded_weight = getattr(layer, self.w_q_name)
        transposed = torch.nn.Parameter(loaded_weight.t().data,
                                        requires_grad=False)
        replace_parameter(layer, self.w_q_name, transposed)

        # WEIGHT SCALE
        # Cutlass kernels support only per-tensor and per-channel.
        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
        # scales being passed to the kernel), convert to the per-channel case.
        w_scale = getattr(layer, self.w_s_name)
        fused = len(layer.logical_widths) > 1
        if fused and not cfg.is_channelwise:
            w_scale = convert_to_channelwise(w_scale, layer.logical_widths)
        replace_parameter(
            layer, self.w_s_name,
            torch.nn.Parameter(w_scale.data, requires_grad=False))

        # INPUT SCALE
        if not cfg.is_static_input_scheme:
            # Dynamic scheme: nothing is stored for the input.
            setattr(layer, self.i_s_name, None)
            setattr(layer, self.i_zp_name, None)
        else:
            loaded_scale = getattr(layer, self.i_s_name)

            if cfg.input_symmetric:
                replace_parameter(
                    layer, self.i_s_name,
                    torch.nn.Parameter(loaded_scale.max(),
                                       requires_grad=False))
                setattr(layer, self.i_zp_name, None)
            else:
                loaded_zp = getattr(layer, self.i_zp_name)

                # reconstruct the ranges
                int8_info = torch.iinfo(torch.int8)
                zp_i32 = loaded_zp.to(dtype=torch.int32)
                hi = (loaded_scale * (int8_info.max - zp_i32)).max()
                lo = (loaded_scale * (int8_info.min - zp_i32)).min()

                # One scale covering the combined range.
                merged_scale = (hi - lo) / (int8_info.max - int8_info.min)
                replace_parameter(
                    layer, self.i_s_name,
                    torch.nn.Parameter(merged_scale, requires_grad=False))

                # AZP loaded as int8 but used as int32
                merged_zp = (int8_info.min -
                             lo / merged_scale).to(dtype=torch.int32)
                replace_parameter(
                    layer, self.i_zp_name,
                    torch.nn.Parameter(merged_zp, requires_grad=False))

        # azp_adj is the AZP adjustment term, used to account for weights.
        # It does not depend on scales or azp, so it is the same for
        # static and dynamic quantization.
        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
        if cfg.input_symmetric:
            setattr(layer, self.azp_adj_name, None)
            return

        # Re-read the weight: it is the transposed parameter by now.
        current_weight = getattr(layer, self.w_q_name)
        adjustment = current_weight.sum(dim=0,
                                        keepdim=True,
                                        dtype=torch.int32)
        if cfg.is_static_input_scheme:
            # cutlass_w8a8 requires azp to be folded into azp_adj
            # in the per-tensor case
            adjustment = getattr(layer, self.i_zp_name) * adjustment
        setattr(layer, self.azp_adj_name,
                torch.nn.Parameter(adjustment, requires_grad=False))

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Quantize ``x`` to int8 and run the matching scaled matmul op."""
        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)

        # ops.scaled_int8_quant supports both dynamic and static quant:
        # * dynamic, i_s is None and x_s computed from x.
        # * static, i_s is scalar and x_s is i_s.
        # No adjustment term stored means the symmetric path.
        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
                                               i_s,
                                               i_zp,
                                               symmetric=(azp_adj is None))

        if x_zp is None:
            return ops.cutlass_scaled_mm(x_q,
                                         w_q,
                                         scale_a=x_s,
                                         scale_b=w_s,
                                         out_dtype=x.dtype,
                                         bias=bias)

        # Currently, static is always per-tensor and dynamic is per-token
        # In the static case the zero point was already folded into azp_adj
        # at load time, so only the dynamic case passes it on.
        is_static = i_zp is not None
        return ops.cutlass_scaled_mm_azp(x_q,
                                         w_q,
                                         scale_a=x_s,
                                         scale_b=w_s,
                                         out_dtype=x.dtype,
                                         azp_adj=azp_adj,
                                         azp=None if is_static else x_zp,
                                         bias=bias)
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Tuple
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
from vllm.platforms import current_platform
|
| 8 |
+
|
| 9 |
+
from .cutlass import CutlassScaledMMLinearKernel
|
| 10 |
+
from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
    """Scaled-MM kernel variant that reuses the Cutlass kernel's weight
    processing and apply path, with its own support checks."""

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability this kernel accepts."""
        return 75

    @classmethod
    def can_implement(
            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Reject CPU platforms and asymmetric input quantization."""
        if current_platform.is_cpu():
            reason = ("TritonScaledMMLinearKernel requires Triton which is "
                      "not currently supported on CPU.")
            return False, reason
        if c.input_symmetric:
            return True, None
        reason = ("TritonScaledMMLinearKernel only supports symmetric "
                  "quantization.")
        return False, reason

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Delegate unchanged to the parent implementation."""
        super().process_weights_after_loading(layer)

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Delegate unchanged to the parent implementation."""
        result = super().apply_weights(layer, x, bias)
        return result
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import warnings
|
| 4 |
+
from typing import Optional, Tuple
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from functorch.experimental.control_flow import cond # noqa: F401
|
| 8 |
+
|
| 9 |
+
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
| 10 |
+
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
| 11 |
+
convert_to_channelwise)
|
| 12 |
+
from vllm.platforms import current_platform
|
| 13 |
+
|
| 14 |
+
from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
|
| 15 |
+
ScaledMMLinearLayerConfig)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
    """Scaled-MM linear kernel using ``torch.ops.xla.quantized_matmul``."""

    @classmethod
    def get_min_capability(cls) -> int:
        """Always raise: compute capability is not defined for TPU.

        Raises:
            NotImplementedError: Unconditionally.
        """
        # The original message read "does have", stating the opposite of
        # why this method must not be called.
        raise NotImplementedError(
            "TPU platform does not have a concept of compute capability, "
            "this method should not be called.")

    @classmethod
    def can_implement(
            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Accept only TPU with dynamic, symmetric activations and
        channelwise weight scales.

        Returns:
            ``(True, None)`` when supported, else ``(False, reason)``.
        """

        if not current_platform.is_tpu():
            return False, "ScaledMMXLA requires running on TPU."

        if c.is_static_input_scheme:
            return False, "ScaledMMXLA requires dynamic activation scales."

        if not c.input_symmetric:
            return False, "ScaledMMXLA requires symmetric activation scales."

        if not c.is_channelwise:
            return False, "ScaledMMXLA requires channelwise weight scales"

        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Re-register the weight and weight scale of ``layer`` and clear the
        input-scale, zero-point and AZP-adjustment attributes."""
        # WEIGHT
        # [out, in] (different than cutlass_scaled_mm)
        weight = getattr(layer, self.w_q_name)
        replace_parameter(layer, self.w_q_name,
                          torch.nn.Parameter(weight.data, requires_grad=False))

        # WEIGHT SCALE
        # XLA kernels support only per-tensor and per-channel.
        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
        # scales being passed to the kernel), convert to the per-channel case.
        is_fused_module = len(layer.logical_widths) > 1
        weight_scale = getattr(layer, self.w_s_name)
        if is_fused_module and not self.config.is_channelwise:
            weight_scale = convert_to_channelwise(weight_scale,
                                                  layer.logical_widths)

        # [out_channel,] (different than cutlass_scaled_mm)
        weight_scale = weight_scale.squeeze(-1)
        replace_parameter(
            layer, self.w_s_name,
            torch.nn.Parameter(weight_scale.data, requires_grad=False))

        # Only support symmetric dynamic activation quantization.
        setattr(layer, self.i_s_name, None)
        setattr(layer, self.i_zp_name, None)
        setattr(layer, self.azp_adj_name, None)

        # Filter warning for cond usage in apply_weights. It is okay
        # to specialize the graph since bias is not dynamic.
        warnings.filterwarnings(
            "ignore",
            message=
            "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."  # noqa: E501
        )

    def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
        """``cond`` branch for a missing bias: return ``x`` unchanged."""
        return x

    def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
        """``cond`` branch for a present bias: return ``x + bias``."""
        return x + bias

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the XLA quantized matmul on ``x`` and add ``bias`` if given."""
        w_q, w_s, _, _, _ = self._get_weight_params(layer)

        # Imported for its side effect; presumably this registers
        # torch.ops.xla.quantized_matmul -- TODO confirm.
        import torch_xla.experimental.xla_quantized_matmul  # noqa: F401
        out = torch.ops.xla.quantized_matmul(x,
                                             w_q,
                                             w_s,
                                             zero_point=None,
                                             block_size=-1,
                                             int4_weight=False,
                                             quantize_activation=True)

        # Explicitly capture control flow to make dynamo happy.
        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (218 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark.cpython-311.pyc
ADDED
|
Binary file (21.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-311.pyc
ADDED
|
Binary file (11.1 kB). View file
|
|
|