# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import defaultdict
from enum import Enum
from typing import Annotated, Any, Dict, List, Optional, Set, Union

from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
from compressed_tensors.quantization.quant_scheme import (
    QuantizationScheme,
    preset_name_to_scheme,
)
from compressed_tensors.quantization.utils import is_module_quantized, module_type
from pydantic import BaseModel, ConfigDict, Field
from torch.nn import Module


__all__ = [
    "QuantizationStatus",
    "QuantizationConfig",
    "LIFECYCLE_ORDER",
    "DEFAULT_QUANTIZATION_METHOD",
    "DEFAULT_QUANTIZATION_FORMAT",
]


class QuantizationStatus(str, Enum):
    """
    Enum storing the different states a quantized layer can be in

    Initialized: scale, zero points and observers have been attached to the layer
        but are set to dummy values (not yet calibrated)
    Calibration: scale and zero points have been calibrated through OBCQ or a similar
        algorithm, observers are still attached
    Frozen: scale and zero points are finalized, observers have been deleted, weights
        are still in their original precision
    Compressed: weights have been converted to their target type or compressed to
        their closest approximation
    """

    INITIALIZED = "initialized"
    CALIBRATION = "calibration"
    FROZEN = "frozen"
    COMPRESSED = "compressed"

    @classmethod
    def lifecycle_order(cls) -> List["QuantizationStatus"]:
        """
        :return: list of correct quantization lifecycle order
        """
        return LIFECYCLE_ORDER

    def __ge__(self, other):
        if other is None:
            return True
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)

    def __gt__(self, other):
        if other is None:
            return True
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) > LIFECYCLE_ORDER.index(other)

    def __lt__(self, other):
        if other is None:
            return False
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) < LIFECYCLE_ORDER.index(other)

    def __le__(self, other):
        if other is None:
            return False
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) <= LIFECYCLE_ORDER.index(other)


LIFECYCLE_ORDER = [
    QuantizationStatus.INITIALIZED,
    QuantizationStatus.CALIBRATION,
    QuantizationStatus.FROZEN,
    QuantizationStatus.COMPRESSED,
]

DEFAULT_QUANTIZATION_METHOD = "compressed-tensors"
DEFAULT_QUANTIZATION_FORMAT = "fakequant"
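
# Illustrative sketch of the lifecycle ordering above (not executed on import, not
# part of the public API): statuses compare according to their position in
# LIFECYCLE_ORDER, and every status compares as greater than None.
#
#   >>> QuantizationStatus.FROZEN >= QuantizationStatus.CALIBRATION
#   True
#   >>> QuantizationStatus.INITIALIZED < QuantizationStatus.COMPRESSED
#   True
#   >>> QuantizationStatus.CALIBRATION > None
#   True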


class QuantizationConfig(BaseModel):
    """
    Full configuration specifying how a model is quantized. Each quantized layer is
    mapped to a QuantizationScheme in config_groups.

    :param config_groups: dict of QuantizationSchemes specifying the quantization
        settings for each quantized layer. A group could also be a reference to
        a predefined scheme name, mapped to a list of its target layers/classes
    :param quant_method: a constant used to differentiate compressed-tensors
        quantization from other quantization configs
    :param format: specifies how the quantized model is stored on disk
    :param quantization_status: specifies the current status of all quantized layers.
        It is assumed all layers are in the same state.
    :param kv_cache_scheme: optional QuantizationArgs that specify the quantization of
        the kv cache. If None, the kv cache is not quantized. When applying kv cache
        quantization to a transformers AutoModelForCausalLM, the kv_cache_scheme gets
        converted into a QuantizationScheme that:
            - targets the `k_proj` and `v_proj` modules of the model. The outputs of
              those modules are the keys and values that might be cached
            - quantizes the outputs of the aforementioned layers, so that keys and
              values are compressed before storing them in the cache
        There is an explicit assumption that the model contains modules with
        `k_proj` and `v_proj` in their names. If this is not the case and
        kv_cache_scheme != None, the quantization of the kv cache will fail
    :param global_compression_ratio: optional informational config to report the
        model compression ratio achieved by the quantization config
    :param ignore: optional list of layers to ignore from config_groups. Layers in
        this list are not quantized even if they match up with a target in
        config_groups
    """

    config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
    quant_method: str = DEFAULT_QUANTIZATION_METHOD
    kv_cache_scheme: Optional[QuantizationArgs] = None
    format: str = DEFAULT_QUANTIZATION_FORMAT
    quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
    global_compression_ratio: Optional[float] = None
    ignore: Optional[List[str]] = Field(default_factory=list)
    # `run_compressed` is a dummy, unused arg kept for backwards compatibility
    # see: https://github.com/huggingface/transformers/pull/39324
    run_compressed: Annotated[Any, Field(exclude=True)] = None

    def model_post_init(self, __context):
        """
        updates any quantization schemes defined as presets to be fully loaded
        schemes
        """
        for group_name, targets_or_scheme in self.config_groups.items():
            if isinstance(targets_or_scheme, QuantizationScheme):
                continue  # scheme already defined
            self.config_groups[group_name] = preset_name_to_scheme(
                name=group_name,
                targets=targets_or_scheme,
            )
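
    # Illustrative sketch of the preset expansion performed by `model_post_init`
    # (assumes "W8A8" is one of the preset scheme names known to
    # `preset_name_to_scheme`; substitute any preset available in your version):
    #
    #   >>> config = QuantizationConfig(
    #   ...     config_groups={"W8A8": ["Linear"]},
    #   ...     ignore=["lm_head"],
    #   ... )
    #   >>> isinstance(config.config_groups["W8A8"], QuantizationScheme)
    #   True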

    def to_dict(self):
        # for compatibility with HFQuantizer
        return self.model_dump()

    @staticmethod
    def from_pretrained(
        model: Module, format: Optional[Union[str, list]] = None
    ) -> Optional["QuantizationConfig"]:
        """
        Converts a model into its associated QuantizationConfig based on the
        QuantizationScheme attached to each quantized module

        :param model: model to calculate quantization scheme of
        :return: filled out QuantizationConfig for the input model
        """
        from compressed_tensors.modeling import IMPL_ATTR
        from compressed_tensors.quantization.lifecycle.initialize import (
            is_attention_module,
        )

        # set of all quantization schemes
        # TODO: make quant config/scheme/args frozen/hashable and use a set
        quantization_schemes: List[QuantizationScheme] = list()

        # use any status from modules (in practice, use the last module)
        model_status = None

        # set of all quantized types
        # this is later used to create the ignore list
        quantization_type_names: Set[str] = set()

        # maps types to names which are not quantized
        # this is later used to create the ignore list
        ignore: Dict[str, List[str]] = defaultdict(list)

        # this keeps track of any kvcache schemes
        kv_cache_scheme: Optional[QuantizationArgs] = None

        for name, submodule in model.named_modules():
            layer_type: str = module_type(submodule)

            # add config group if quantized non-attention or attention quant
            has_config_group = is_module_quantized(submodule) and (
                not is_attention_module(submodule) or hasattr(submodule, IMPL_ATTR)
            )
            # only add kvcache if quant attention (which always implies kvcache)
            has_kv_cache = is_module_quantized(submodule) and is_attention_module(
                submodule
            )

            if has_config_group:
                # add to running set of schemes/layer_type_names
                model_status = getattr(submodule, "quantization_status", model_status)
                quantization_type_names.add(layer_type)
                if submodule.quantization_scheme not in quantization_schemes:
                    quantization_schemes.append(submodule.quantization_scheme)

            if has_kv_cache:
                model_status = getattr(submodule, "quantization_status", model_status)
                kv_cache_scheme = submodule.quantization_scheme.input_activations

            if not has_config_group:
                # add non-quantized layers to the ignore list
                if layer_type not in ignore:
                    ignore[layer_type] = []
                ignore[layer_type].append(name)

        if len(quantization_schemes) == 0 and kv_cache_scheme is None:
            # No quantized layers
            return None

        # create ignore list, only include layers whose class has ever been targeted
        consolidated_ignore = []
        for layer_type, ignore_names in ignore.items():
            if layer_type in quantization_type_names:
                # specific layers of a quantized type are ignored
                consolidated_ignore += ignore_names
            # else we leave it off the ignore list, doesn't fall under any of the
            # existing quantization schemes so it won't be quantized

        # create config groups from all unique schemes
        config_groups = {}
        for idx, scheme in enumerate(quantization_schemes):
            group_name = "group_" + str(idx)
            config_groups[group_name] = scheme

        # infer format
        if format is None:
            if model_status == QuantizationStatus.COMPRESSED:
                format = CompressionFormat.int_quantized.value
            else:
                format = CompressionFormat.dense.value
        elif isinstance(format, list):
            format = (
                CompressionFormat.mixed_precision.value
                if len(format) > 1
                else format[0]
            )

        return QuantizationConfig(
            config_groups=config_groups,
            quantization_status=model_status,
            kv_cache_scheme=kv_cache_scheme,
            global_compression_ratio=None,
            format=format,
            ignore=consolidated_ignore,
        )

    def requires_calibration_data(self):
        if self.kv_cache_scheme is not None:
            return True

        for _, scheme in self.config_groups.items():
            if scheme.input_activations is not None:
                if scheme.input_activations.dynamic in (False, DynamicType.LOCAL):
                    return True
            if scheme.output_activations is not None:
                if not scheme.output_activations.dynamic:
                    return True

        return False

    # TODO: set `extra="forbid"` when upstream transformers is compatible
    model_config = ConfigDict(extra="ignore")
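

# Illustrative end-to-end sketch (assumes a model whose quantized submodules already
# carry `quantization_scheme`/`quantization_status` attributes, e.g. after running the
# quantization lifecycle; `model` here is a placeholder, not defined in this module):
#
#   >>> config = QuantizationConfig.from_pretrained(model)
#   >>> if config is not None:  # None means no quantized layers (and no kv cache)
#   ...     needs_calibration = config.requires_calibration_data()
#   ...     serialized = config.to_dict()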