koichi12 commited on
Commit
63c744a
·
verified ·
1 Parent(s): 96a7806

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  28. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +76 -0
  29. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/marlin.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  35. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +122 -0
  36. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +135 -0
  37. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  38. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +84 -0
  39. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/__init__.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/triton.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  45. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  46. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +103 -0
  47. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  48. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (4.95 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-311.pyc ADDED
Binary file (15.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-311.pyc ADDED
Binary file (9.79 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-311.pyc ADDED
Binary file (21.2 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-311.pyc ADDED
Binary file (13.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-311.pyc ADDED
Binary file (7.73 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc ADDED
Binary file (17 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-311.pyc ADDED
Binary file (11.2 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-311.pyc ADDED
Binary file (9.87 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-311.pyc ADDED
Binary file (8.79 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-311.pyc ADDED
Binary file (29.5 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-311.pyc ADDED
Binary file (11.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-311.pyc ADDED
Binary file (11.7 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-311.pyc ADDED
Binary file (24.1 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-311.pyc ADDED
Binary file (12.5 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-311.pyc ADDED
Binary file (17.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-311.pyc ADDED
Binary file (12.9 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-311.pyc ADDED
Binary file (4.81 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-311.pyc ADDED
Binary file (11.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-311.pyc ADDED
Binary file (9.62 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc ADDED
Binary file (20.1 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-311.pyc ADDED
Binary file (4.11 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-311.pyc ADDED
Binary file (11.7 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-311.pyc ADDED
Binary file (4.9 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-311.pyc ADDED
Binary file (7.74 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (220 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+ from typing import Callable, Optional, Tuple
6
+
7
+ import torch
8
+
9
+ from vllm.model_executor.layers.quantization.utils import replace_parameter
10
+ from vllm.scalar_type import ScalarType
11
+
12
+
13
@dataclass
class MPLinearLayerConfig:
    """Static description of a mixed-precision linear layer (quantized
    weights, floating-point activations), consumed by MPLinearKernel
    implementations to decide whether/how they can implement it.
    """
    full_weight_shape: Tuple[int, int]  # [in, out]
    # This rank's shard of the weight, same [in, out] orientation; differs
    # from full_weight_shape when the layer is partitioned across devices.
    partition_weight_shape: Tuple[int, int]
    # Quantized weight element type.
    weight_type: ScalarType
    # Activation dtype the kernel must accept.
    act_type: torch.dtype
    # Quantization group size along the input dimension — TODO confirm
    # sentinel value used for channelwise quantization.
    group_size: int
    # True when the scheme carries explicit zero points (asymmetric).
    zero_points: bool
    # True when the weights come with a g_idx (activation-reordering) tensor.
    has_g_idx: bool
22
+
23
+
24
class MPLinearKernel(ABC):
    """Abstract base for mixed-precision linear kernels.

    A concrete kernel declares its minimum device compute capability,
    reports whether it can implement a given :class:`MPLinearLayerConfig`,
    and rewrites the layer's weight parameters into its own storage layout
    in :meth:`process_weights_after_loading`.
    """

    @classmethod
    @abstractmethod
    def get_min_capability(cls) -> int:
        """Minimum compute capability (major*10 + minor) this kernel needs."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def can_implement(cls,
                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Return (ok, reason) — reason is None when ok, else why not."""
        raise NotImplementedError

    def __init__(self,
                 c: MPLinearLayerConfig,
                 w_q_param_name: str,
                 w_s_param_name: str,
                 w_zp_param_name: Optional[str] = None,
                 w_gidx_param_name: Optional[str] = None) -> None:
        """Record the config and the attribute names under which the layer
        stores its quantized weight, scales, zero points, and g_idx."""
        assert self.can_implement(c)
        self.config = c
        self.w_q_name = w_q_param_name
        self.w_s_name = w_s_param_name
        # Zero-point / g_idx names are mandatory only when the config says
        # the layer actually carries them.
        if c.zero_points:
            assert w_zp_param_name is not None
        if c.has_g_idx:
            assert w_gidx_param_name is not None
        self.w_zp_name = w_zp_param_name
        self.w_gidx_name = w_gidx_param_name

    @abstractmethod
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Rewrite the layer's parameters into this kernel's layout."""
        raise NotImplementedError

    @abstractmethod
    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Compute x @ W (+ bias) using this kernel's weight layout."""
        raise NotImplementedError

    def _transform_param(self, layer: torch.nn.Module, name: Optional[str],
                         fn: Callable) -> None:
        """Apply `fn` to `layer.<name>` (if present) and re-register the
        result on the layer."""
        if name is None:
            return
        current = getattr(layer, name, None)
        if current is None:
            return
        transformed = fn(current)
        # Re-wrap as torch.nn.Parameter for TorchDynamo compatibility.
        replace_parameter(
            layer, name,
            torch.nn.Parameter(transformed.data, requires_grad=False))

    def _get_weight_params(
            self, layer: torch.nn.Module) -> Tuple[
                torch.Tensor,  # w_q
                torch.Tensor,  # w_s
                Optional[torch.Tensor],  # w_zp
                Optional[torch.Tensor],  # w_gidx
            ]:
        """Fetch (w_q, w_s, w_zp, w_gidx) off the layer; the optional two
        are None when no attribute name was registered."""

        def _optional(attr: Optional[str]):
            return getattr(layer, attr or "", None)

        return (
            getattr(layer, self.w_q_name),
            getattr(layer, self.w_s_name),
            _optional(self.w_zp_name),
            _optional(self.w_gidx_name),
        )
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import List, Optional, Type
4
+
5
+ import vllm.envs as envs
6
+ from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import ( # noqa: E501
7
+ ExllamaLinearKernel)
8
+ from vllm.model_executor.layers.quantization.kernels.mixed_precision.machete import ( # noqa: E501
9
+ MacheteLinearKernel)
10
+ from vllm.model_executor.layers.quantization.kernels.mixed_precision.marlin import ( # noqa: E501
11
+ MarlinLinearKernel)
12
+ from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKernel import ( # noqa: E501
13
+ MPLinearKernel, MPLinearLayerConfig)
14
+ from vllm.platforms import current_platform
15
+
16
# in priority/performance order (when available)
_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
    MacheteLinearKernel,
    MarlinLinearKernel,
    ExllamaLinearKernel,
]


def choose_mp_linear_kernel(
        config: MPLinearLayerConfig,
        compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
    """Pick the best kernel class that can implement `config`.

    Candidates are tried in `_POSSIBLE_KERNELS` order (best performing
    first); the first one that is not disabled, meets the device's compute
    capability, and reports it can implement the config wins.

    Args:
        config (MPLinearLayerConfig): Description of the linear layer to be
            implemented.
        compute_capability (Optional[int], optional): The compute capability
            of the target device as major*10 + minor; when None it is queried
            from `current_platform`. Defaults to None.

    Raises:
        ValueError: If no kernel can implement the given config.

    Returns:
        Type[MPLinearKernel]: Chosen kernel.
    """
    if compute_capability is None:
        if current_platform is None:
            raise ValueError("Cannot determine compute capability")
        _cc = current_platform.get_device_capability()
        compute_capability = _cc[0] * 10 + _cc[1]

    failure_reasons = []
    for candidate in _POSSIBLE_KERNELS:
        # Explicitly disabled via environment variable.
        if candidate.__name__ in envs.VLLM_DISABLED_KERNELS:
            failure_reasons.append(
                f' {candidate.__name__} disabled by environment variable')
            continue

        # Device too old for this kernel.
        if candidate.get_min_capability() > compute_capability:
            failure_reasons.append(
                f"{candidate.__name__} requires capability "
                f"{candidate.get_min_capability()}, current compute capability "
                f"is {compute_capability}")
            continue

        ok, why = candidate.can_implement(config)
        if ok:
            return candidate
        failure_reasons.append(
            f' {candidate.__name__} cannot implement due to: {why}'
        )

    raise ValueError(
        "Failed to find a kernel that can implement the "
        "WNA16 linear layer. Reasons: \n"
        + '\n'.join(failure_reasons))
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-311.pyc ADDED
Binary file (4.87 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (3.53 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-311.pyc ADDED
Binary file (7.87 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-311.pyc ADDED
Binary file (7.18 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/marlin.cpython-311.pyc ADDED
Binary file (6.96 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+
7
+ from vllm import _custom_ops as ops
8
+ from vllm.model_executor.layers.quantization.utils.quant_utils import (
9
+ pack_quantized_values_into_int32)
10
+ from vllm.model_executor.parameter import (BasevLLMParameter,
11
+ permute_param_layout_)
12
+ from vllm.scalar_type import scalar_types
13
+
14
+ from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
15
+
16
+
17
class ExllamaLinearKernel(MPLinearKernel):
    """Mixed-precision linear kernel backed by the Exllama GPTQ custom ops
    (`ops.gptq_shuffle` for repacking, `ops.gptq_gemm` for the matmul).
    """
    SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
    # In theory supports `scalar_types.uint2b2, scalar_types.uint3b4` too but
    # currently untested so not added to the list

    @classmethod
    def get_min_capability(cls) -> int:
        # Minimum compute capability as major*10 + minor.
        return 60

    @classmethod
    def can_implement(cls,
                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Return (ok, reason) for whether Exllama can run this layer.

        Rejects: act-order with row-partitioned input features, output
        features not divisible by the pack factor, non-fp16 activations,
        unsupported quant types, and group sizes that don't divide the
        input features evenly.
        """
        if c.has_g_idx and\
            c.partition_weight_shape[0] != c.full_weight_shape[0]:
            return False, "Act reordering currently not supported by Exllama, "\
                          "when the input features are partitioned across "\
                          "devices"

        if c.partition_weight_shape[1] % (32 // c.weight_type.size_bits) != 0:
            return False, "Output features must be a multiple of the pack " \
                          "factor (32 / num_bits) so that we can correctly " \
                          "pack the zero points"

        if c.act_type != torch.float16:
            return False, "Exllama only supports float16 activations"

        if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
            return False, f"Quant type ({c.weight_type}) not supported by "\
                          "Exllama, supported types are: "\
                          f"{cls.SUPPORTED_QUANT_TYPES}"

        if c.full_weight_shape[0] % c.group_size != 0:
            return False, f"Group size ({c.group_size}) does not evenly divide"\
                          " the number of input features "\
                          f"({c.full_weight_shape[0]})"

        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module):
        """Synthesize missing zero-points/g_idx tensors and repack the
        quantized weight and scales into Exllama's expected layout."""
        c = self.config

        # For Exllama, we need to set a zero-point tensor if there is not one
        if not c.zero_points:
            self.w_zp_name = "qzeros"
            device = getattr(layer, self.w_q_name).device
            groups = c.partition_weight_shape[0] // c.group_size
            out_features = c.partition_weight_shape[1]

            if c.weight_type.has_bias():
                # if the type has a bias we have to create a zeros tensor that
                # contains the bias values repeated for each group (-1 due to
                # a bug in the original GPTQ checkpoint format leading to
                # exllama kernel adding 1 to the zero points during inference)
                # Documentation of the bug can be found here:
                #  https://garden.danieldk.eu/GPTQ-Checkpoint-Format
                zeros = torch.full((groups, out_features),
                                   c.weight_type.bias - 1,
                                   dtype=torch.int32,
                                   device=device)
            else:
                raise NotImplementedError(
                    "A 0 zero-point is not supported by Exllama due to "
                    "a bug in the original GPTQ checkpoint format leading to "
                    "exllama kernel adding 1 to the zero points during "
                    "inference")
            # Pack the per-group zero points into int32 words along the
            # output dimension, matching the checkpoint's qzeros layout.
            zeros = pack_quantized_values_into_int32(zeros,
                                                     c.weight_type,
                                                     packed_dim=1)
            setattr(layer, self.w_zp_name,
                    torch.nn.Parameter(zeros, requires_grad=False))

        if c.has_g_idx:

            def transform_w_g_idx(x):
                # Exllama wants the permutation array instead of the group
                # indices
                return torch.argsort(x).to(torch.int)

            self._transform_param(layer, self.w_gidx_name, transform_w_g_idx)
        else:
            # NOTE(review): `device` is only bound in the `not c.zero_points`
            # branch above — if c.zero_points were True this line would raise
            # NameError; presumably Exllama is only reached with
            # zero_points=False (GPTQ). TODO confirm.
            self.w_gidx_name = "g_idx"
            empty_g_idx = torch.nn.Parameter(torch.empty((0, ),
                                                         dtype=torch.int,
                                                         device=device),
                                             requires_grad=False)
            setattr(layer, self.w_gidx_name, empty_g_idx)

        def transform_w_q(x):
            assert isinstance(x, BasevLLMParameter)
            assert self.w_gidx_name is not None
            g_idx = getattr(layer, self.w_gidx_name)

            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
            x_cont = x.data.contiguous()
            # In-place shuffle of the packed weight into Exllama order.
            ops.gptq_shuffle(x_cont, g_idx, c.weight_type.size_bits)
            return x_cont

        def transform_w_s(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1)
            x.data = x.data.contiguous()
            # Scales must match the activation dtype for the gemm.
            return x.to(dtype=c.act_type)

        # Repack weights and scales for Exllama
        self._transform_param(layer, self.w_q_name, transform_w_q)
        self._transform_param(layer, self.w_s_name, transform_w_s)

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the Exllama GPTQ gemm on `x` (flattened to 2D), restoring
        the original leading dimensions on the result."""
        c = self.config

        x_2d = x.reshape(-1, x.shape[-1])
        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )

        w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)

        assert w_zp is not None, "Zero points are required by Exllama"
        assert w_g_idx is not None, "Group index is required by Exllama"
        output = ops.gptq_gemm(x_2d, w_q, w_zp, w_s, w_g_idx, True,
                               c.weight_type.size_bits)

        if bias is not None:
            output.add_(bias)
        return output.reshape(out_shape)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from functools import partial
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+
8
+ from vllm import _custom_ops as ops
9
+ from vllm.model_executor.layers.quantization.utils.machete_utils import (
10
+ MACHETE_SUPPORTED_GROUP_SIZES, check_machete_supports_shape,
11
+ query_machete_supported_quant_types)
12
+ from vllm.model_executor.layers.quantization.utils.quant_utils import (
13
+ pack_quantized_values_into_int32, unpack_quantized_values_into_int32)
14
+ from vllm.model_executor.parameter import (BasevLLMParameter,
15
+ permute_param_layout_)
16
+
17
+ from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
18
+
19
+
20
class MacheteLinearKernel(MPLinearKernel):
    """Mixed-precision linear kernel backed by the Machete custom ops
    (`ops.machete_prepack_B` for repacking, `ops.machete_mm` for the matmul).
    """

    @classmethod
    def get_min_capability(cls) -> int:
        # Minimum compute capability as major*10 + minor.
        return 90

    @classmethod
    def can_implement(cls,
                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Return (ok, reason) for whether Machete can run this layer.

        Rejects: act-order with row-partitioned input features, zero points,
        unsupported quant types and group sizes; finally defers to the
        shape check helper.
        """
        if c.has_g_idx and\
            c.partition_weight_shape[0] != c.full_weight_shape[0]:
            return False, "Act reordering currently not supported by Machete, "\
                          "when the input features are partitioned across "\
                          "devices"

        if c.zero_points:
            return False, "Zero points currently not supported by "\
                          " Compressed Tensors + Machete. (Kernel supports it"\
                          " but CompressedTensorsWNA16 does not so support has"\
                          " not been added to MacheteWNA16Kernel yet"

        if c.weight_type not in query_machete_supported_quant_types(
                c.zero_points):
            return False, f"Quant type ({c.weight_type}) not supported by "\
                          "Machete, supported types are: "\
                          f"{query_machete_supported_quant_types(c.zero_points)}"

        if c.group_size not in MACHETE_SUPPORTED_GROUP_SIZES:
            return False, f"Group size ({c.group_size}) not supported by "\
                          "Machete, supported group sizes are: "\
                          f"{MACHETE_SUPPORTED_GROUP_SIZES}"

        return check_machete_supports_shape(c.partition_weight_shape[0],
                                            c.partition_weight_shape[1])

    # note assumes that
    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
    #  `weight_scale` is: {input_dim = 0, output_dim = 1}
    def process_weights_after_loading(self, layer: torch.nn.Module):
        """Build the activation permutation for act-order (g_idx) layers
        and repack weight + scales into Machete's layout."""
        c = self.config

        if c.has_g_idx:
            assert self.w_gidx_name is not None
            # Sort order of g_idx gives the column permutation Machete needs
            # applied to the activations.
            perm = torch.argsort(getattr(layer, self.w_gidx_name))\
                .to(torch.int)

            self.act_perm = lambda x: x[:, perm]
            # use `ops.permute_cols` if possible
            if c.act_type in [torch.float16, torch.bfloat16] \
                and c.partition_weight_shape[0] % 8 == 0:
                self.act_perm = partial(ops.permute_cols, perm=perm)

        def transform_w_q(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
            if c.has_g_idx:
                # Apply the same row permutation to the (unpacked) weights
                # so the permuted activations line up.
                x_unpacked = unpack_quantized_values_into_int32(x.data,
                                                                c.weight_type,
                                                                packed_dim=0)
                x_perm = x_unpacked[perm, :]
                x.data = pack_quantized_values_into_int32(x_perm,
                                                          c.weight_type,
                                                          packed_dim=0)
            # .t().contiguous().t() forces a column-major copy before prepack.
            x.data = ops.machete_prepack_B(x.data.t().contiguous().t(),
                                           a_type=c.act_type,
                                           b_type=c.weight_type,
                                           group_scales_type=c.act_type)
            return x

        def transform_w_s(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1)
            x.data = x.data.contiguous()
            return x

        # Repack weights and scales for Machete
        self._transform_param(layer, self.w_q_name, transform_w_q)
        self._transform_param(layer, self.w_s_name, transform_w_s)

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the Machete mm on `x` (flattened to 2D, permuted first when
        act-order is in use), restoring the leading dims on the result."""
        c = self.config
        w_q, w_s, _, _ = self._get_weight_params(layer)

        x_2d = x.reshape(-1, x.shape[-1])
        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )

        if c.has_g_idx:
            x_2d = self.act_perm(x_2d)

        output = ops.machete_mm(a=x_2d,
                                b_q=w_q,
                                b_type=c.weight_type,
                                b_group_zeros=None,
                                b_group_scales=w_s,
                                b_group_size=c.group_size)

        if bias is not None:
            output.add_(bias)  # In-place add

        return output.reshape(out_shape)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+
7
+ from vllm import _custom_ops as ops
8
+ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
9
+ MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear,
10
+ check_marlin_supports_shape, marlin_is_k_full, marlin_make_empty_g_idx,
11
+ marlin_make_workspace, marlin_permute_scales, marlin_sort_g_idx,
12
+ query_marlin_supported_quant_types)
13
+ from vllm.model_executor.parameter import (BasevLLMParameter,
14
+ permute_param_layout_)
15
+
16
+ from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
17
+
18
+
19
class MarlinLinearKernel(MPLinearKernel):
    """Mixed-precision linear kernel backed by the GPTQ-Marlin custom ops
    (`ops.gptq_marlin_repack` for repacking, `apply_gptq_marlin_linear` for
    the matmul).
    """

    @classmethod
    def get_min_capability(cls) -> int:
        # Minimum compute capability as major*10 + minor.
        return 80

    @classmethod
    def can_implement(cls,
                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Return (ok, reason) for whether Marlin can run this layer.

        Rejects zero points, unsupported quant types and group sizes, then
        defers to the shape check helper.
        """
        if c.zero_points:
            return False, "Zero points currently not supported by "\
                          " MarlinLinearKernel. Will be added when AWQMarlin "\
                          "is migrated over to using MPLinearKernel backend"

        quant_types = query_marlin_supported_quant_types(c.zero_points)
        if c.weight_type not in quant_types:
            return False, f"Quant type ({c.weight_type}) not supported by"\
                          f" Marlin, supported types are: {quant_types}"

        if c.group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
            return False, f"Group size ({c.group_size}) not supported by "\
                          "Marlin, supported group sizes are: "\
                          f"{MARLIN_SUPPORTED_GROUP_SIZES}"

        return check_marlin_supports_shape(
            c.partition_weight_shape[1],  # out_features
            c.partition_weight_shape[0],  # in_features
            c.full_weight_shape[0],  # in_features
            c.group_size)

    # note assumes that
    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
    #  `weight_scale` is: {input_dim = 0, output_dim = 1}
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Allocate the Marlin workspace, materialize the (possibly empty)
        g_idx / zero-point tensors Marlin requires, and repack the weight
        and scales into Marlin's layout."""
        device = getattr(layer, self.w_q_name).device
        c = self.config

        # k (input features) is only "full" if this rank holds all rows.
        row_parallel = (c.partition_weight_shape[0] != c.full_weight_shape[0])
        self.is_k_full = marlin_is_k_full(c.has_g_idx, row_parallel)

        # Allocate marlin workspace.
        self.workspace = marlin_make_workspace(c.partition_weight_shape[1],
                                               device)

        # Default names since marlin requires empty parameters for these,
        # TODO: remove this requirement from marlin (allow optional tensors)
        if self.w_gidx_name is None:
            self.w_gidx_name = "g_idx"
        if self.w_zp_name is None:
            self.w_zp_name = "w_zp"

        if c.has_g_idx:
            g_idx, g_idx_sort_indices = marlin_sort_g_idx(
                getattr(layer, self.w_gidx_name))
            self._transform_param(layer, self.w_gidx_name, lambda _: g_idx)
            layer.g_idx_sort_indices = g_idx_sort_indices
        else:
            # Marlin still expects (empty) tensors when there is no g_idx.
            setattr(layer, self.w_gidx_name, marlin_make_empty_g_idx(device))
            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)

        if c.zero_points:
            pass
            # TODO (lucas): add the following when AWQMarlin is migrated over to
            # using MPLinearKernel backend
            # self._transform_param(layer, self.w_zp_name, lambda x: \
            #     marlin_zero_points(
            #         x,
            #         size_k=c.partition_weight_shape[0],
            #         size_n=c.partition_weight_shape[1],
            #         num_bits=c.weight_type.size_bits))
        else:
            setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))

        def transform_w_q(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
            x.data = ops.gptq_marlin_repack(x.data.contiguous(),
                                            perm=layer.g_idx_sort_indices,
                                            size_k=c.partition_weight_shape[0],
                                            size_n=c.partition_weight_shape[1],
                                            num_bits=c.weight_type.size_bits)
            return x

        def transform_w_s(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1)
            x.data = marlin_permute_scales(x.data.contiguous(),
                                           size_k=c.partition_weight_shape[0],
                                           size_n=c.partition_weight_shape[1],
                                           group_size=c.group_size)
            return x

        # Repack weights and scales for Marlin.
        self._transform_param(layer, self.w_q_name, transform_w_q)
        self._transform_param(layer, self.w_s_name, transform_w_s)

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Delegate to `apply_gptq_marlin_linear` with the parameters and
        workspace prepared in `process_weights_after_loading`."""
        c = self.config
        w_q, w_s, w_zp, w_gidx = self._get_weight_params(layer)

        # `process_weights_after_loading` will ensure w_zp and w_gidx are not
        # None for marlin
        return apply_gptq_marlin_linear(
            input=x,
            weight=w_q,
            weight_scale=w_s,
            weight_zp=w_zp,  # type: ignore
            g_idx=w_gidx,  # type: ignore
            g_idx_sort_indices=layer.g_idx_sort_indices,
            workspace=self.workspace,
            wtype=c.weight_type,
            input_size_per_partition=c.partition_weight_shape[0],
            output_size_per_partition=c.partition_weight_shape[1],
            is_k_full=self.is_k_full,
            bias=bias)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+ from typing import Optional, Tuple
6
+
7
+ import torch
8
+
9
+
10
@dataclass
class ScaledMMLinearLayerConfig:
    """Static description of a scaled-mm (w8a8) linear layer, used by
    kernels to decide whether they can implement it."""

    # True when weight scales are per-output-channel rather than per-tensor.
    is_channelwise: bool
    # True when the input (activation) scale is loaded from the checkpoint
    # (static quantization) instead of being computed at runtime.
    is_static_input_scheme: bool
    # True when input quantization is symmetric (no zero point needed).
    input_symmetric: bool
15
+
16
+
17
class ScaledMMLinearKernel(ABC):
    """Abstract base class for scaled-mm (w8a8) linear kernels.

    A concrete kernel advertises whether it can implement a given
    :class:`ScaledMMLinearLayerConfig`, rewrites the layer's quantization
    parameters after checkpoint load, and performs the forward pass.
    """

    @classmethod
    @abstractmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability required (e.g. 75)."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def can_implement(
            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Return ``(True, None)`` if this kernel supports ``c``, otherwise
        ``(False, reason)``."""
        raise NotImplementedError

    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
                 w_s_param_name: str, i_s_param_name: str,
                 i_zp_param_name: str, azp_adj_param_name: str) -> None:
        # BUG FIX: `can_implement` returns a (bool, reason) tuple, and a
        # non-empty tuple is always truthy, so `assert self.can_implement(c)`
        # could never fail. Unpack the result and assert on the boolean,
        # surfacing the reason on failure.
        can_implement, failure_reason = self.can_implement(c)
        assert can_implement, failure_reason
        self.config = c
        # Names of the attributes on the layer module that hold the quantized
        # weight, weight scale, input scale, input zero point, and the AZP
        # adjustment term, respectively.
        self.w_q_name = w_q_param_name
        self.w_s_name = w_s_param_name
        self.i_s_name = i_s_param_name
        self.i_zp_name = i_zp_param_name
        self.azp_adj_name = azp_adj_param_name

    @abstractmethod
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Rewrite ``layer``'s parameters into the form this kernel needs."""
        raise NotImplementedError

    @abstractmethod
    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Compute the quantized matmul for ``x`` (optionally adding bias)."""
        raise NotImplementedError

    def _get_weight_params(
        self, layer: torch.nn.Module
    ) -> Tuple[
            torch.Tensor,  # weight
            torch.Tensor,  # weight_scale
            Optional[torch.Tensor],  # input_scale
            Optional[torch.Tensor],  # input_zp
            Optional[torch.Tensor],  # azp_adj
    ]:
        """Fetch the kernel's parameters from ``layer`` by the attribute
        names recorded in ``__init__``."""
        return (
            getattr(layer, self.w_q_name),
            getattr(layer, self.w_s_name),
            getattr(layer, self.i_s_name),
            getattr(layer, self.i_zp_name),
            getattr(layer, self.azp_adj_name),
        )
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import os
4
+ from typing import Dict, List, Optional, Type
5
+
6
+ from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
7
+ CutlassScaledMMLinearKernel)
8
+ from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
9
+ ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
10
+ from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
11
+ TritonScaledMMLinearKernel)
12
+ from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
13
+ XLAScaledMMLinearKernel)
14
+ from vllm.platforms import PlatformEnum, current_platform
15
+
16
+ # in priority/performance order (when available)
17
+ _POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
18
+ PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
19
+ PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
20
+ PlatformEnum.ROCM: [TritonScaledMMLinearKernel],
21
+ PlatformEnum.TPU: [XLAScaledMMLinearKernel],
22
+ }
23
+
24
+
25
def choose_scaled_mm_linear_kernel(
        config: ScaledMMLinearLayerConfig,
        compute_capability: Optional[int] = None
) -> Type[ScaledMMLinearKernel]:
    """
    Choose a ScaledMMLinearKernel that can implement the given config for the
    given compute capability. Attempts to choose the best kernel in terms of
    performance.

    Args:
        config (ScaledMMLinearLayerConfig): Description of the linear layer
            to be implemented.
        compute_capability (Optional[int], optional): The compute capability of
            the target device, if None uses `current_platform` to get the
            compute capability. Defaults to None.

    Raises:
        ValueError: If no kernel can implement the given config.

    Returns:
        Type[ScaledMMLinearKernel]: Chosen kernel.
    """

    if compute_capability is None:
        _cc = current_platform.get_device_capability()
        if _cc is not None:
            # Encode (major, minor) as a single int, e.g. (7, 5) -> 75.
            compute_capability = _cc[0] * 10 + _cc[1]

    # Hoisted out of the loop: the set of disabled kernels is loop-invariant,
    # so parse the environment variable once instead of per candidate.
    disabled_kernels = os.environ.get("VLLM_DISABLED_KERNELS", "").split(",")

    failure_reasons = []
    for kernel in _POSSIBLE_KERNELS[current_platform._enum]:
        if kernel.__name__ in disabled_kernels:
            failure_reasons.append(
                f' {kernel.__name__} disabled by environment variable')
            continue

        # If the current platform uses compute_capability,
        # make sure the kernel supports the compute capability.
        if compute_capability is not None:
            kernel_min_capability = kernel.get_min_capability()
            if (kernel_min_capability is not None
                    and kernel_min_capability > compute_capability):
                failure_reasons.append(
                    f"{kernel.__name__} requires capability "
                    f"{kernel_min_capability}, current compute capability "
                    f"is {compute_capability}")
                continue

        can_implement, failure_reason = kernel.can_implement(config)
        if can_implement:
            return kernel
        else:
            failure_reasons.append(
                f' {kernel.__name__} cannot implement due to: {failure_reason}'
            )

    raise ValueError(
        "Failed to find a kernel that can implement the "\
        "ScaledMM linear layer. Reasons: \n"
        + '\n'.join(failure_reasons))
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-311.pyc ADDED
Binary file (3.78 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (3.84 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-311.pyc ADDED
Binary file (6.34 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/triton.cpython-311.pyc ADDED
Binary file (2.58 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-311.pyc ADDED
Binary file (5.26 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+
7
+ from vllm import _custom_ops as ops
8
+ from vllm.model_executor.layers.quantization.utils import replace_parameter
9
+ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
10
+ convert_to_channelwise)
11
+ from vllm.platforms import current_platform
12
+
13
+ from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
14
+ ScaledMMLinearLayerConfig)
15
+
16
+
17
+ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
18
+
19
+ @classmethod
20
+ def get_min_capability(cls) -> int:
21
+ return 75
22
+
23
+ @classmethod
24
+ def can_implement(
25
+ cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
26
+
27
+ if (not current_platform.is_cuda() and not current_platform.is_cpu()):
28
+ return False, "CutlassScaledMM requires running on CUDA or CPU."
29
+
30
+ return True, None
31
+
32
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
33
+ # WEIGHT
34
+ # Cutlass kernels need transposed weight.
35
+ weight = getattr(layer, self.w_q_name)
36
+ replace_parameter(
37
+ layer, self.w_q_name,
38
+ torch.nn.Parameter(weight.t().data, requires_grad=False))
39
+
40
+ # WEIGHT SCALE
41
+ # Cutlass kernels support only per-tensor and per-channel.
42
+ # If we have a fused module (QKV, MLP) with per tensor scales (thus N
43
+ # scales being passed to the kernel), convert to the per-channel case.
44
+ is_fused_module = len(layer.logical_widths) > 1
45
+ weight_scale = getattr(layer, self.w_s_name)
46
+ if is_fused_module and not self.config.is_channelwise:
47
+ weight_scale = convert_to_channelwise(weight_scale,
48
+ layer.logical_widths)
49
+ replace_parameter(
50
+ layer, self.w_s_name,
51
+ torch.nn.Parameter(weight_scale.data, requires_grad=False))
52
+
53
+ # INPUT SCALE
54
+ if self.config.is_static_input_scheme:
55
+ input_scale = getattr(layer, self.i_s_name)
56
+
57
+ if self.config.input_symmetric:
58
+ replace_parameter(
59
+ layer, self.i_s_name,
60
+ torch.nn.Parameter(input_scale.max(), requires_grad=False))
61
+ setattr(layer, self.i_zp_name, None)
62
+ else:
63
+ input_zero_point = getattr(layer, self.i_zp_name)
64
+
65
+ # reconstruct the ranges
66
+ int8_traits = torch.iinfo(torch.int8)
67
+ azps = input_zero_point.to(dtype=torch.int32)
68
+ range_max = (input_scale * (int8_traits.max - azps)).max()
69
+ range_min = (input_scale * (int8_traits.min - azps)).min()
70
+
71
+ scale = (range_max - range_min) / (int8_traits.max -
72
+ int8_traits.min)
73
+ replace_parameter(
74
+ layer, self.i_s_name,
75
+ torch.nn.Parameter(scale, requires_grad=False))
76
+
77
+ # AZP loaded as int8 but used as int32
78
+ azp = (int8_traits.min -
79
+ range_min / scale).to(dtype=torch.int32)
80
+ replace_parameter(layer, self.i_zp_name,
81
+ torch.nn.Parameter(azp, requires_grad=False))
82
+
83
+ else:
84
+ setattr(layer, self.i_s_name, None)
85
+ setattr(layer, self.i_zp_name, None)
86
+
87
+ # azp_adj is the AZP adjustment term, used to account for weights.
88
+ # It does not depend on scales or azp, so it is the same for
89
+ # static and dynamic quantization.
90
+ # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
91
+ # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
92
+ if not self.config.input_symmetric:
93
+ weight = getattr(layer, self.w_q_name)
94
+ azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32)
95
+ if self.config.is_static_input_scheme:
96
+ # cutlass_w8a8 requires azp to be folded into azp_adj
97
+ # in the per-tensor case
98
+ azp_adj = getattr(layer, self.i_zp_name) * azp_adj
99
+ setattr(layer, self.azp_adj_name,
100
+ torch.nn.Parameter(azp_adj, requires_grad=False))
101
+ else:
102
+ setattr(layer, self.azp_adj_name, None)
103
+
104
+ def apply_weights(self,
105
+ layer: torch.nn.Module,
106
+ x: torch.Tensor,
107
+ bias: Optional[torch.Tensor] = None) -> torch.Tensor:
108
+ w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
109
+
110
+ # ops.scaled_int8_quant supports both dynamic and static quant:
111
+ # * dynamic, i_s is None and x_s computed from x.
112
+ # * static, i_s is scalar and x_s is i_s.
113
+ symmetric = azp_adj is None
114
+ x_q, x_s, x_zp = ops.scaled_int8_quant(x,
115
+ i_s,
116
+ i_zp,
117
+ symmetric=symmetric)
118
+
119
+ if x_zp is not None:
120
+ # Currently, static is always per-tensor and dynamic is per-token
121
+ static = i_zp is not None
122
+ azp = None if static else x_zp
123
+ return ops.cutlass_scaled_mm_azp(x_q,
124
+ w_q,
125
+ scale_a=x_s,
126
+ scale_b=w_s,
127
+ out_dtype=x.dtype,
128
+ azp_adj=azp_adj,
129
+ azp=azp,
130
+ bias=bias)
131
+ return ops.cutlass_scaled_mm(x_q,
132
+ w_q,
133
+ scale_a=x_s,
134
+ scale_b=w_s,
135
+ out_dtype=x.dtype,
136
+ bias=bias)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+
7
+ from vllm.platforms import current_platform
8
+
9
+ from .cutlass import CutlassScaledMMLinearKernel
10
+ from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
11
+
12
+
13
class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
    """Triton-path ScaledMM kernel (selected for ROCm in
    ``_POSSIBLE_KERNELS``).

    ``process_weights_after_loading`` and ``apply_weights`` are inherited
    unchanged from :class:`CutlassScaledMMLinearKernel`; the previous
    overrides only forwarded to ``super()`` with identical arguments and
    have been removed as redundant. This subclass only narrows the set of
    configurations it accepts.
    """

    @classmethod
    def get_min_capability(cls) -> int:
        """Minimum compute capability, encoded as major*10 + minor."""
        return 75

    @classmethod
    def can_implement(
            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Reject CPU (no Triton support) and asymmetric quantization."""
        if current_platform.is_cpu():
            return (
                False,
                "TritonScaledMMLinearKernel requires Triton which is not " +
                "currently supported on CPU.")
        if not c.input_symmetric:
            return (False,
                    "TritonScaledMMLinearKernel only supports symmetric " +
                    "quantization.")
        return True, None
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import warnings
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ from functorch.experimental.control_flow import cond # noqa: F401
8
+
9
+ from vllm.model_executor.layers.quantization.utils import replace_parameter
10
+ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
11
+ convert_to_channelwise)
12
+ from vllm.platforms import current_platform
13
+
14
+ from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
15
+ ScaledMMLinearLayerConfig)
16
+
17
+
18
class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
    """ScaledMM linear kernel for TPU, implemented via
    ``torch.ops.xla.quantized_matmul``.

    Only dynamic, symmetric activation quantization with channelwise
    weight scales is supported (see :meth:`can_implement`).
    """

    @classmethod
    def get_min_capability(cls) -> int:
        # NOTE: message fixed — it previously read "does have a concept",
        # inverting the intended meaning.
        raise NotImplementedError(
            "TPU platform does not have a concept of compute capability, "
            "this method should not be called.")

    @classmethod
    def can_implement(
            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
        """Return whether this kernel supports ``c`` on the current
        platform."""

        if not current_platform.is_tpu():
            return False, "ScaledMMXLA requires running on TPU."

        if c.is_static_input_scheme:
            return False, "ScaledMMXLA requires dynamic activation scales."

        if not c.input_symmetric:
            return False, "ScaledMMXLA requires symmetric activation scales."

        if not c.is_channelwise:
            return False, "ScaledMMXLA requires channelwise weight scales"

        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Reshape weights/scales into the layout ``quantized_matmul``
        expects and drop the unused input-quantization parameters."""
        # WEIGHT
        # [out, in] (different than cutlass_scaled_mm)
        weight = getattr(layer, self.w_q_name)
        replace_parameter(layer, self.w_q_name,
                          torch.nn.Parameter(weight.data, requires_grad=False))

        # WEIGHT SCALE
        # XLA kernels support only per-tensor and per-channel.
        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
        # scales being passed to the kernel), convert to the per-channel case.
        is_fused_module = len(layer.logical_widths) > 1
        weight_scale = getattr(layer, self.w_s_name)
        if is_fused_module and not self.config.is_channelwise:
            weight_scale = convert_to_channelwise(weight_scale,
                                                  layer.logical_widths)

        # [out_channel,] (different than cutlass_scaled_mm)
        weight_scale = weight_scale.squeeze(-1)
        replace_parameter(
            layer, self.w_s_name,
            torch.nn.Parameter(weight_scale.data, requires_grad=False))

        # Only support symmetric dynamic activation quantization.
        setattr(layer, self.i_s_name, None)
        setattr(layer, self.i_zp_name, None)
        setattr(layer, self.azp_adj_name, None)

        # Filter warning for cond usage in apply_weights. It is okay
        # to specialize the graph since bias is not dynamic.
        warnings.filterwarnings(
            "ignore",
            message=
            "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."  # noqa: E501
        )

    def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
        # Branch taken by `cond` when bias is None; `bias` is unused but the
        # signature must match `add_bias`.
        return x

    def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
        return x + bias

    def apply_weights(self,
                      layer: torch.nn.Module,
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the XLA quantized matmul (activations quantized on the fly)."""
        w_q, w_s, _, _, _ = self._get_weight_params(layer)

        import torch_xla.experimental.xla_quantized_matmul  # noqa: F401
        out = torch.ops.xla.quantized_matmul(x,
                                             w_q,
                                             w_s,
                                             zero_point=None,
                                             block_size=-1,
                                             int4_weight=False,
                                             quantize_activation=True)

        # Explicitly capture control flow to make dynamo happy.
        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark.cpython-311.pyc ADDED
Binary file (21.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-311.pyc ADDED
Binary file (11.1 kB). View file