from collections import defaultdict
from enum import Enum
from typing import Annotated, Any, Dict, List, Optional, Set, Union

from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
from compressed_tensors.quantization.quant_scheme import (
    QuantizationScheme,
    preset_name_to_scheme,
)
from compressed_tensors.quantization.utils import is_module_quantized, module_type
from pydantic import BaseModel, ConfigDict, Field
from torch.nn import Module


__all__ = [
    "QuantizationStatus",
    "QuantizationConfig",
    "LIFECYCLE_ORDER",
    "DEFAULT_QUANTIZATION_METHOD",
    "DEFAULT_QUANTIZATION_FORMAT",
]

class QuantizationStatus(str, Enum):
    """
    Enum storing the different states a quantized layer can be in

    Initialized: scale, zero points, and observers have been attached to the layer
        but are set to dummy values (not yet calibrated)
    Calibration: scale and zero points have been calibrated through OBCQ or a similar
        algorithm, observers are still attached
    Frozen: scale and zero points are finalized, observers have been deleted, weights
        are still in their original precision
    Compressed: weights have been converted to their target type or compressed to
        their closest approximation
    """

    INITIALIZED = "initialized"
    CALIBRATION = "calibration"
    FROZEN = "frozen"
    COMPRESSED = "compressed"

    @classmethod
    def lifecycle_order(cls) -> List["QuantizationStatus"]:
        """
        :return: list of correct quantization lifecycle order
        """
        return LIFECYCLE_ORDER

    def __ge__(self, other):
        if other is None:
            return True
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other)

    def __gt__(self, other):
        if other is None:
            return True
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) > LIFECYCLE_ORDER.index(other)

    def __lt__(self, other):
        if other is None:
            return False
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) < LIFECYCLE_ORDER.index(other)

    def __le__(self, other):
        if other is None:
            return False
        if not isinstance(other, self.__class__):
            raise NotImplementedError
        return LIFECYCLE_ORDER.index(self) <= LIFECYCLE_ORDER.index(other)

LIFECYCLE_ORDER = [
    QuantizationStatus.INITIALIZED,
    QuantizationStatus.CALIBRATION,
    QuantizationStatus.FROZEN,
    QuantizationStatus.COMPRESSED,
]
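
# Statuses compare according to their position in LIFECYCLE_ORDER; for example,
# QuantizationStatus.CALIBRATION < QuantizationStatus.FROZEN is True and
# QuantizationStatus.COMPRESSED >= QuantizationStatus.INITIALIZED is True.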

DEFAULT_QUANTIZATION_METHOD = "compressed-tensors"
DEFAULT_QUANTIZATION_FORMAT = "fakequant"


class QuantizationConfig(BaseModel):
| | """ |
| | Full configuration specifying how a model is quantized. Each quantized layer is |
| | mapped to a QuantizationScheme in config_groups. |
| | |
| | :param config_groups: dict of QuantizationSchemes specifying the quantization |
| | settings for each quantized layer. A group could also be a reference to |
| | a predefined scheme name, mapped to a list of its target layers/classes |
| | :param quant_method: a constant used to differentiate compressed-tensors |
| | quantization from other quantization configs |
| | :param format: specifies how the quantized model is stored on disk |
| | :quantization_status: specifies the current status of all quantized layers. It is |
| | assumed all layers are in the same state. |
| | :param kv_cache_scheme: optional QuantizationArgs, that specify the |
| | quantization of the kv cache. If None, kv cache is not quantized. |
| | When applying kv cache quantization to transformer AutoModelForCausalLM, |
| | the kv_cache_scheme gets converted into a QuantizationScheme that: |
| | - targets the `q_proj` and `k_proj` modules of the model. The outputs |
| | of those modules are the keys and values that might be cached |
| | - quantizes the outputs of the aformentioned layers, so that |
| | keys and values are compressed before storing them in the cache |
| | There is an explicit assumption that the model contains modules with |
| | `k_proj` and `v_proj` in their names. If this is not the case |
| | and kv_cache_scheme != None, the quantization of kv cache will fail |
| | :global_compression_ratio: optional informational config to report the model |
| | compression ratio acheived by the quantization config |
| | :ignore: optional list of layers to ignore from config_groups. Layers in this list |
| | are not quantized even if they match up with a target in config_groups |
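
    Example (an illustrative sketch only; the scheme and argument values below are
    assumptions for demonstration, not defaults):

        config = QuantizationConfig(
            config_groups={
                "group_0": QuantizationScheme(
                    targets=["Linear"],
                    weights=QuantizationArgs(num_bits=8, symmetric=True),
                )
            },
            kv_cache_scheme=QuantizationArgs(num_bits=8, type="float"),
            ignore=["lm_head"],
        )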
| | """ |
| |
|
| | config_groups: Dict[str, Union[QuantizationScheme, List[str]]] |
| | quant_method: str = DEFAULT_QUANTIZATION_METHOD |
| | kv_cache_scheme: Optional[QuantizationArgs] = None |
| | format: str = DEFAULT_QUANTIZATION_FORMAT |
| | quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED |
| | global_compression_ratio: Optional[float] = None |
| | ignore: Optional[List[str]] = Field(default_factory=list) |
| | |
| | |
| | run_compressed: Annotated[Any, Field(exclude=True)] = None |

    def model_post_init(self, __context):
        """
        updates any quantization schemes defined as presets to be fully loaded
        schemes
        """
        for group_name, targets_or_scheme in self.config_groups.items():
            if isinstance(targets_or_scheme, QuantizationScheme):
                continue
            self.config_groups[group_name] = preset_name_to_scheme(
                name=group_name,
                targets=targets_or_scheme,
            )
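
    # Illustrative sketch of the preset expansion above (assumes "W8A8" is a
    # registered preset scheme name):
    #   QuantizationConfig(config_groups={"W8A8": ["Linear"]})
    # ends up with config_groups["W8A8"] equal to
    #   preset_name_to_scheme(name="W8A8", targets=["Linear"])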

    def to_dict(self):
        # serialize the config to a plain dictionary
        return self.model_dump()

    @staticmethod
    def from_pretrained(
        model: Module, format: Optional[Union[str, list]] = None
    ) -> Optional["QuantizationConfig"]:
        """
        Converts a model into its associated QuantizationConfig based on the
        QuantizationScheme attached to each quantized module

        :param model: model to calculate quantization scheme of
        :param format: optional serialization format to record in the config; if not
            provided, it is inferred from the model's quantization status
        :return: filled out QuantizationConfig for the input model, or None if the
            model is not quantized
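
        Illustrative usage (a sketch; assumes ``model`` is a quantized torch module):

            config = QuantizationConfig.from_pretrained(model)
            if config is not None:
                print(config.format, list(config.config_groups))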
| | """ |
| | from compressed_tensors.modeling import IMPL_ATTR |
| | from compressed_tensors.quantization.lifecycle.initialize import ( |
| | is_attention_module, |
| | ) |
| |
|
| | |
| | |
| | quantization_schemes: List[QuantizationScheme] = list() |
| |
|
| | |
| | model_status = None |
| |
|
| | |
| | |
| | quantization_type_names: Set[str] = set() |
| |
|
| | |
| | |
| | ignore: Dict[str, List[str]] = defaultdict(list) |
| |
|
| | |
| | kv_cache_scheme: Optional[QuantizationArgs] = None |
| |
|
| | for name, submodule in model.named_modules(): |
| | layer_type: str = module_type(submodule) |
| |
|
| | |
| | has_config_group = is_module_quantized(submodule) and ( |
| | not is_attention_module(submodule) or hasattr(submodule, IMPL_ATTR) |
| | ) |
| | |
| | has_kv_cache = is_module_quantized(submodule) and is_attention_module( |
| | submodule |
| | ) |
| |
|
| | if has_config_group: |
| | |
| | model_status = getattr(submodule, "quantization_status", model_status) |
| | quantization_type_names.add(layer_type) |
| | if submodule.quantization_scheme not in quantization_schemes: |
| | quantization_schemes.append(submodule.quantization_scheme) |
| |
|
| | if has_kv_cache: |
| | model_status = getattr(submodule, "quantization_status", model_status) |
| | kv_cache_scheme = submodule.quantization_scheme.input_activations |
| |
|
| | if not has_config_group: |
| | |
| | if layer_type not in ignore: |
| | ignore[layer_type] = [] |
| | ignore[layer_type].append(name) |
| |
|
| | if ( |
| | len(quantization_schemes) == 0 and kv_cache_scheme is None |
| | ): |
| | return None |
| |
|
| | |
| | consolidated_ignore = [] |
| | for layer_type, ignore_names in ignore.items(): |
| | if layer_type in quantization_type_names: |
| | |
| | consolidated_ignore += ignore_names |
| | |
| | |
| |
|
| | |
| | config_groups = {} |
| | for idx, scheme in enumerate(quantization_schemes): |
| | group_name = "group_" + str(idx) |
| | config_groups[group_name] = scheme |
| |
|
| | |
| | if format is None: |
| | if model_status == QuantizationStatus.COMPRESSED: |
| | format = CompressionFormat.int_quantized.value |
| | else: |
| | format = CompressionFormat.dense.value |
| | elif isinstance(format, list): |
| | format = ( |
| | CompressionFormat.mixed_precision.value |
| | if len(format) > 1 |
| | else format[0] |
| | ) |
| |
|
| | return QuantizationConfig( |
| | config_groups=config_groups, |
| | quantization_status=model_status, |
| | kv_cache_scheme=kv_cache_scheme, |
| | global_compression_ratio=None, |
| | format=format, |
| | ignore=consolidated_ignore, |
| | ) |

    def requires_calibration_data(self):
        """
        :return: True if any configured scheme (or the kv cache) needs calibration
            data in order to compute its quantization parameters
        """
        if self.kv_cache_scheme is not None:
            return True

        for _, scheme in self.config_groups.items():
            if scheme.input_activations is not None:
                if scheme.input_activations.dynamic in (False, DynamicType.LOCAL):
                    return True
            if scheme.output_activations is not None:
                if not scheme.output_activations.dynamic:
                    return True

        return False
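
    # Note: fully dynamic activation quantization computes scales at runtime, so only
    # static or locally-dynamic input activations, static output activations, or a
    # kv_cache_scheme cause the method above to return True.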

    # ignore any extra fields encountered when parsing serialized configs
    model_config = ConfigDict(extra="ignore")