from collections import defaultdict
from enum import Enum
from typing import Annotated, Any, Dict, List, Optional, Set, Union

from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
from compressed_tensors.quantization.quant_scheme import (
    QuantizationScheme,
    preset_name_to_scheme,
)
from compressed_tensors.quantization.utils import is_module_quantized, module_type
from pydantic import BaseModel, ConfigDict, Field
from torch.nn import Module


__all__ = [
    "QuantizationStatus",
    "QuantizationConfig",
    "LIFECYCLE_ORDER",
    "DEFAULT_QUANTIZATION_METHOD",
    "DEFAULT_QUANTIZATION_FORMAT",
]

class QuantizationStatus(str, Enum):
    """
    Enum storing the different states a quantized layer can be in

    Initialized: scale, zero points, and observers have been attached to the layer
        but are set to dummy values (not yet calibrated)
    Calibration: scale and zero points have been calibrated through OBCQ or a similar
        algorithm, observers are still attached
    Frozen: scale and zero points are finalized, observers have been deleted, weights
        are still in their original precision
    Compressed: weights have been converted to their target type or compressed to
        their closest approximation
    """

    INITIALIZED = "initialized"
    CALIBRATION = "calibration"
    FROZEN = "frozen"
    COMPRESSED = "compressed"

    @classmethod
    def lifecycle_order(cls) -> List["QuantizationStatus"]:
        """
        :return: list of correct quantization lifecycle order
        """
        return LIFECYCLE_ORDER

    def __ge__(self, other):
        if other is None:
            return True
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)

    def __gt__(self, other):
        if other is None:
            return True
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) > LIFECYCLE_ORDER.index(other)

    def __lt__(self, other):
        if other is None:
            return False
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) < LIFECYCLE_ORDER.index(other)

    def __le__(self, other):
        if other is None:
            return False
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) <= LIFECYCLE_ORDER.index(other)

LIFECYCLE_ORDER = [
    QuantizationStatus.INITIALIZED,
    QuantizationStatus.CALIBRATION,
    QuantizationStatus.FROZEN,
    QuantizationStatus.COMPRESSED,
]
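
# Statuses compare according to their position in LIFECYCLE_ORDER; for example,
# QuantizationStatus.CALIBRATION < QuantizationStatus.FROZEN is True and
# QuantizationStatus.COMPRESSED >= QuantizationStatus.INITIALIZED is True.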

DEFAULT_QUANTIZATION_METHOD = "compressed-tensors"
DEFAULT_QUANTIZATION_FORMAT = "fakequant"


class QuantizationConfig(BaseModel):
| | """ |
| | Full configuration specifying how a model is quantized. Each quantized layer is |
| | mapped to a QuantizationScheme in config_groups. |
| | |
| | :param config_groups: dict of QuantizationSchemes specifying the quantization |
| | settings for each quantized layer. A group could also be a reference to |
| | a predefined scheme name, mapped to a list of its target layers/classes |
| | :param quant_method: a constant used to differentiate compressed-tensors |
| | quantization from other quantization configs |
| | :param format: specifies how the quantized model is stored on disk |
| | :quantization_status: specifies the current status of all quantized layers. It is |
| | assumed all layers are in the same state. |
| | :param kv_cache_scheme: optional QuantizationArgs, that specify the |
| | quantization of the kv cache. If None, kv cache is not quantized. |
| | When applying kv cache quantization to transformer AutoModelForCausalLM, |
| | the kv_cache_scheme gets converted into a QuantizationScheme that: |
| | - targets the `q_proj` and `k_proj` modules of the model. The outputs |
| | of those modules are the keys and values that might be cached |
| | - quantizes the outputs of the aformentioned layers, so that |
| | keys and values are compressed before storing them in the cache |
| | There is an explicit assumption that the model contains modules with |
| | `k_proj` and `v_proj` in their names. If this is not the case |
| | and kv_cache_scheme != None, the quantization of kv cache will fail |
| | :global_compression_ratio: optional informational config to report the model |
| | compression ratio acheived by the quantization config |
| | :ignore: optional list of layers to ignore from config_groups. Layers in this list |
| | are not quantized even if they match up with a target in config_groups |
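
    Example (an illustrative sketch only; the scheme and argument values below are
    assumptions for demonstration, not defaults):

        config = QuantizationConfig(
            config_groups={
                "group_0": QuantizationScheme(
                    targets=["Linear"],
                    weights=QuantizationArgs(num_bits=8, symmetric=True),
                )
            },
            kv_cache_scheme=QuantizationArgs(num_bits=8, type="float"),
            ignore=["lm_head"],
        )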
| | """ |
| |
|
| | config_groups: Dict[str, Union[QuantizationScheme, List[str]]] |
| | quant_method: str = DEFAULT_QUANTIZATION_METHOD |
| | kv_cache_scheme: Optional[QuantizationArgs] = None |
| | format: str = DEFAULT_QUANTIZATION_FORMAT |
| | quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED |
| | global_compression_ratio: Optional[float] = None |
| | ignore: Optional[List[str]] = Field(default_factory=list) |
| | |
| | |
| | run_compressed: Annotated[Any, Field(exclude=True)] = None |

    def model_post_init(self, __context):
        """
        updates any quantization schemes defined as presets to be fully loaded
        schemes
        """
        for group_name, targets_or_scheme in self.config_groups.items():
            if isinstance(targets_or_scheme, QuantizationScheme):
                continue
            self.config_groups[group_name] = preset_name_to_scheme(
                name=group_name,
                targets=targets_or_scheme,
            )
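
    # Illustrative sketch of the preset expansion above (assumes "W8A8" is a
    # registered preset scheme name):
    #   QuantizationConfig(config_groups={"W8A8": ["Linear"]})
    # ends up with config_groups["W8A8"] equal to
    #   preset_name_to_scheme(name="W8A8", targets=["Linear"])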

    def to_dict(self):
        # serialize the config to a plain dictionary
        return self.model_dump()

    @staticmethod
    def from_pretrained(
        model: Module, format: Optional[Union[str, list]] = None
    ) -> Optional["QuantizationConfig"]:
        """
        Converts a model into its associated QuantizationConfig based on the
        QuantizationScheme attached to each quantized module

        :param model: model to calculate quantization scheme of
        :param format: optional serialization format to record in the config; if not
            provided, it is inferred from the model's quantization status
        :return: filled out QuantizationConfig for the input model, or None if the
            model is not quantized
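
        Illustrative usage (a sketch; assumes ``model`` is a quantized torch module):

            config = QuantizationConfig.from_pretrained(model)
            if config is not None:
                print(config.format, list(config.config_groups))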
| | """ |
| | from compressed_tensors.modeling import IMPL_ATTR |
| | from compressed_tensors.quantization.lifecycle.initialize import ( |
| | is_attention_module, |
| | ) |
| |
|
| | |
| | |
| | quantization_schemes: List[QuantizationScheme] = list() |
| |
|
| | |
| | model_status = None |
| |
|
| | |
| | |
| | quantization_type_names: Set[str] = set() |
| |
|
| | |
| | |
| | ignore: Dict[str, List[str]] = defaultdict(list) |
| |
|
| | |
| | kv_cache_scheme: Optional[QuantizationArgs] = None |
| |
|
| | for name, submodule in model.named_modules(): |
| | layer_type: str = module_type(submodule) |
| |
|
| | |
| | has_config_group = is_module_quantized(submodule) and ( |
| | not is_attention_module(submodule) or hasattr(submodule, IMPL_ATTR) |
| | ) |
| | |
| | has_kv_cache = is_module_quantized(submodule) and is_attention_module( |
| | submodule |
| | ) |
| |
|
| | if has_config_group: |
| | |
| | model_status = getattr(submodule, "quantization_status", model_status) |
| | quantization_type_names.add(layer_type) |
| | if submodule.quantization_scheme not in quantization_schemes: |
| | quantization_schemes.append(submodule.quantization_scheme) |
| |
|
| | if has_kv_cache: |
| | model_status = getattr(submodule, "quantization_status", model_status) |
| | kv_cache_scheme = submodule.quantization_scheme.input_activations |
| |
|
| | if not has_config_group: |
| | |
| | if layer_type not in ignore: |
| | ignore[layer_type] = [] |
| | ignore[layer_type].append(name) |
| |
|
| | if ( |
| | len(quantization_schemes) == 0 and kv_cache_scheme is None |
| | ): |
| | return None |
| |
|
| | |
| | consolidated_ignore = [] |
| | for layer_type, ignore_names in ignore.items(): |
| | if layer_type in quantization_type_names: |
| | |
| | consolidated_ignore += ignore_names |
| | |
| | |
| |
|
| | |
| | config_groups = {} |
| | for idx, scheme in enumerate(quantization_schemes): |
| | group_name = "group_" + str(idx) |
| | config_groups[group_name] = scheme |
| |
|
| | |
| | if format is None: |
| | if model_status == QuantizationStatus.COMPRESSED: |
| | format = CompressionFormat.int_quantized.value |
| | else: |
| | format = CompressionFormat.dense.value |
| | elif isinstance(format, list): |
| | format = ( |
| | CompressionFormat.mixed_precision.value |
| | if len(format) > 1 |
| | else format[0] |
| | ) |
| |
|
| | return QuantizationConfig( |
| | config_groups=config_groups, |
| | quantization_status=model_status, |
| | kv_cache_scheme=kv_cache_scheme, |
| | global_compression_ratio=None, |
| | format=format, |
| | ignore=consolidated_ignore, |
| | ) |

    def requires_calibration_data(self):
        """
        :return: True if any configured scheme (or the kv cache) needs calibration
            data in order to compute its quantization parameters
        """
        if self.kv_cache_scheme is not None:
            return True

        for _, scheme in self.config_groups.items():
            if scheme.input_activations is not None:
                if scheme.input_activations.dynamic in (False, DynamicType.LOCAL):
                    return True
            if scheme.output_activations is not None:
                if not scheme.output_activations.dynamic:
                    return True

        return False
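
    # Note: fully dynamic activation quantization computes scales at runtime, so only
    # static or locally-dynamic input activations, static output activations, or a
    # kv_cache_scheme cause the method above to return True.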

    # ignore any extra fields encountered when parsing serialized configs
    model_config = ConfigDict(extra="ignore")