# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from enum import Enum
from typing import Annotated, Any, Dict, List, Optional, Set, Union
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
from compressed_tensors.quantization.quant_scheme import (
QuantizationScheme,
preset_name_to_scheme,
)
from compressed_tensors.quantization.utils import is_module_quantized, module_type
from pydantic import BaseModel, ConfigDict, Field
from torch.nn import Module
__all__ = [
"QuantizationStatus",
"QuantizationConfig",
"LIFECYCLE_ORDER",
"DEFAULT_QUANTIZATION_METHOD",
"DEFAULT_QUANTIZATION_FORMAT",
]
class QuantizationStatus(str, Enum):
"""
Enum storing the different states a quantized layer can be in
Initialized: scale, zero points and observers have been attached to the layer but
are set to dummy values (not yet calibrated)
Calibration: scale and zero points have been calibrated through OBCQ or similar
algorithm, observers are still attached
Frozen: scale and zero points are finalized, observers have been deleted, weights
are still in their original precision
    Compressed: weights have been converted to their target type or compressed to
        their closest approximation
"""
INITIALIZED = "initialized"
CALIBRATION = "calibration"
FROZEN = "frozen"
COMPRESSED = "compressed"
@classmethod
def lifecycle_order(cls) -> List["QuantizationStatus"]:
"""
:return: list of correct quantization lifecycle order
"""
        return LIFECYCLE_ORDER
def __ge__(self, other):
if other is None:
return True
if not isinstance(other, self.__class__):
raise NotImplementedError
return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)
def __gt__(self, other):
if other is None:
return True
if not isinstance(other, self.__class__):
raise NotImplementedError
return LIFECYCLE_ORDER.index(self) > LIFECYCLE_ORDER.index(other)
def __lt__(self, other):
if other is None:
return False
if not isinstance(other, self.__class__):
raise NotImplementedError
return LIFECYCLE_ORDER.index(self) < LIFECYCLE_ORDER.index(other)
def __le__(self, other):
if other is None:
return False
if not isinstance(other, self.__class__):
raise NotImplementedError
return LIFECYCLE_ORDER.index(self) <= LIFECYCLE_ORDER.index(other)
LIFECYCLE_ORDER = [
QuantizationStatus.INITIALIZED,
QuantizationStatus.CALIBRATION,
QuantizationStatus.FROZEN,
QuantizationStatus.COMPRESSED,
]
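# Illustrative note (not additional API): statuses compare according to their
# position in LIFECYCLE_ORDER, and any status compares as strictly greater than None:
#
#   assert QuantizationStatus.FROZEN >= QuantizationStatus.CALIBRATION
#   assert QuantizationStatus.INITIALIZED < QuantizationStatus.COMPRESSED
#   assert QuantizationStatus.CALIBRATION > None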
DEFAULT_QUANTIZATION_METHOD = "compressed-tensors"
DEFAULT_QUANTIZATION_FORMAT = "fakequant"
class QuantizationConfig(BaseModel):
"""
Full configuration specifying how a model is quantized. Each quantized layer is
mapped to a QuantizationScheme in config_groups.
:param config_groups: dict of QuantizationSchemes specifying the quantization
settings for each quantized layer. A group could also be a reference to
a predefined scheme name, mapped to a list of its target layers/classes
:param quant_method: a constant used to differentiate compressed-tensors
quantization from other quantization configs
:param format: specifies how the quantized model is stored on disk
    :param quantization_status: specifies the current status of all quantized layers.
        It is assumed all layers are in the same state.
    :param kv_cache_scheme: optional QuantizationArgs that specify the
        quantization of the kv cache. If None, the kv cache is not quantized.
        When applying kv cache quantization to a transformers AutoModelForCausalLM,
        the kv_cache_scheme gets converted into a QuantizationScheme that:
            - targets the `k_proj` and `v_proj` modules of the model. The outputs
              of those modules are the keys and values that might be cached
            - quantizes the outputs of the aforementioned layers, so that
              keys and values are compressed before storing them in the cache
        There is an explicit assumption that the model contains modules with
        `k_proj` and `v_proj` in their names. If this is not the case
        and kv_cache_scheme != None, the quantization of the kv cache will fail
    :param global_compression_ratio: optional informational config to report the
        model compression ratio achieved by the quantization config
    :param ignore: optional list of layers to ignore from config_groups. Layers in
        this list are not quantized even if they match up with a target in
        config_groups
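
    Example (illustrative; assumes "W8A16" is an available preset scheme name and
    that `Linear` is the intended target layer class)::

        config = QuantizationConfig(
            config_groups={"W8A16": ["Linear"]},
            ignore=["lm_head"],
        )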
"""
config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
quant_method: str = DEFAULT_QUANTIZATION_METHOD
kv_cache_scheme: Optional[QuantizationArgs] = None
format: str = DEFAULT_QUANTIZATION_FORMAT
quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
global_compression_ratio: Optional[float] = None
ignore: Optional[List[str]] = Field(default_factory=list)
# `run_compressed` is a dummy, unused arg for backwards compatibility
# see: https://github.com/huggingface/transformers/pull/39324
run_compressed: Annotated[Any, Field(exclude=True)] = None
def model_post_init(self, __context):
"""
updates any quantization schemes defined as presets to be fully loaded
schemes
"""
for group_name, targets_or_scheme in self.config_groups.items():
if isinstance(targets_or_scheme, QuantizationScheme):
continue # scheme already defined
self.config_groups[group_name] = preset_name_to_scheme(
name=group_name,
targets=targets_or_scheme,
)
def to_dict(self):
# for compatibility with HFQuantizer
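        # note: the resulting plain dict is typically what gets embedded under the
        # "quantization_config" key of a serialized model config (assumption based
        # on how HFQuantizer consumes it, not enforced by this module)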
return self.model_dump()
@staticmethod
def from_pretrained(
model: Module, format: Optional[Union[str, list]] = None
) -> Optional["QuantizationConfig"]:
"""
Converts a model into its associated QuantizationConfig based on the
QuantizationScheme attached to each quantized module
        :param model: model to calculate the quantization config from
        :param format: optional compression format (or list of formats) to record in
            the returned config; if None, the format is inferred from the model's
            quantization status
        :return: filled out QuantizationConfig for the input model, or None if the
            model contains no quantized layers
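
        Example (illustrative; `quantized_model` is assumed to be a module whose
        submodules already carry attached quantization schemes)::

            config = QuantizationConfig.from_pretrained(quantized_model)
            if config is not None:
                print(config.format, list(config.config_groups))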
"""
from compressed_tensors.modeling import IMPL_ATTR
from compressed_tensors.quantization.lifecycle.initialize import (
is_attention_module,
)
# set of all quantization schemes
# TODO: make quant config/scheme/args frozen/hashable and use a set
quantization_schemes: List[QuantizationScheme] = list()
# use any status from modules (in practice, use the last module)
model_status = None
# set of all quantized types
# this is later used to create the ignore list
quantization_type_names: Set[str] = set()
# maps types to names which are not quantized
# this is later used to create the ignore list
ignore: Dict[str, List[str]] = defaultdict(list)
# this keeps track of any kvcache schemes
kv_cache_scheme: Optional[QuantizationArgs] = None
for name, submodule in model.named_modules():
layer_type: str = module_type(submodule)
# add config group if quantized non-attention or attention quant
has_config_group = is_module_quantized(submodule) and (
not is_attention_module(submodule) or hasattr(submodule, IMPL_ATTR)
)
# only add kvcache if quant attention (which always implies kvcache)
has_kv_cache = is_module_quantized(submodule) and is_attention_module(
submodule
)
if has_config_group:
# add to running set of schemes/layer_type_names
model_status = getattr(submodule, "quantization_status", model_status)
quantization_type_names.add(layer_type)
if submodule.quantization_scheme not in quantization_schemes:
quantization_schemes.append(submodule.quantization_scheme)
if has_kv_cache:
model_status = getattr(submodule, "quantization_status", model_status)
kv_cache_scheme = submodule.quantization_scheme.input_activations
if not has_config_group:
# add non-quantized layers to the ignore list
if layer_type not in ignore:
ignore[layer_type] = []
ignore[layer_type].append(name)
if (
len(quantization_schemes) == 0 and kv_cache_scheme is None
): # No quantized layers
return None
# create ignore list, only include layers whose class has ever been targeted
consolidated_ignore = []
for layer_type, ignore_names in ignore.items():
if layer_type in quantization_type_names:
# specific layers of a quantized type are ignored
consolidated_ignore += ignore_names
# else we leave it off the ignore list, doesn't fall under any of the
# existing quantization schemes so it won't be quantized
# create config groups from all unique schemes
config_groups = {}
for idx, scheme in enumerate(quantization_schemes):
group_name = "group_" + str(idx)
config_groups[group_name] = scheme
# infer format
if format is None:
if model_status == QuantizationStatus.COMPRESSED:
format = CompressionFormat.int_quantized.value
else:
format = CompressionFormat.dense.value
elif isinstance(format, list):
format = (
CompressionFormat.mixed_precision.value
if len(format) > 1
else format[0]
)
return QuantizationConfig(
config_groups=config_groups,
quantization_status=model_status,
kv_cache_scheme=kv_cache_scheme,
global_compression_ratio=None,
format=format,
ignore=consolidated_ignore,
)
def requires_calibration_data(self):
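        """
        :return: True if calibration data is required, i.e. the kv cache is
            quantized or any config group uses static (or locally dynamic) input
            activation quantization or static output activation quantization;
            False otherwise
        """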
if self.kv_cache_scheme is not None:
return True
for _, scheme in self.config_groups.items():
if scheme.input_activations is not None:
if scheme.input_activations.dynamic in (False, DynamicType.LOCAL):
return True
if scheme.output_activations is not None:
if not scheme.output_activations.dynamic:
return True
return False
# TODO set `extra="forbid"` when upstream transformers is compatible
model_config = ConfigDict(extra="ignore")