| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | import logging |
| |
|
| | import torch |
| | from compressed_tensors.quantization.lifecycle.forward import quantize |
| | from compressed_tensors.quantization.quant_config import QuantizationStatus |
| | from torch.nn import Module |
| |
|
| |
|
| | __all__ = [ |
| | "compress_quantized_weights", |
| | ] |
| |
|
| |
|
| | _LOGGER = logging.getLogger(__name__) |
| |
|
| |
|
def compress_quantized_weights(module: Module) -> None:
    """
    Quantizes the module weight representation to use fewer bits in memory

    apply to full model with `model.apply(compress_quantized_weights)`

    :param module: module to compress to quantized representation
    """
    scheme = getattr(module, "quantization_scheme", None)
    if not scheme or not scheme.weights:
        # no quantization scheme attached, or scheme does not cover weights:
        # nothing to compress for this module
        return

    status = getattr(module, "quantization_status", None)
    if status is QuantizationStatus.COMPRESSED:
        # already compressed; re-quantizing the stored int8 data would corrupt it
        return

    weight = getattr(module, "weight", None)
    scale = getattr(module, "weight_scale", None)
    # zero_point / g_idx may legitimately be absent (e.g. symmetric schemes,
    # no group reordering) — quantize() accepts None for both
    zero_point = getattr(module, "weight_zero_point", None)
    g_idx = getattr(module, "weight_g_idx", None)

    if weight is None or scale is None:
        # a weight scheme exists but the parameters needed for compression are
        # missing; warn rather than failing silently, then mark the module so
        # repeated passes over the model do not reprocess it
        _LOGGER.warning(
            "Skipping compression for module with quantization scheme but "
            "missing weight or weight_scale attributes"
        )
        module.quantization_status = QuantizationStatus.COMPRESSED
        return

    # compressed weights are a storage format, not trainable parameters
    module.weight.requires_grad = False
    module.weight.data = quantize(
        x=weight,
        scale=scale,
        zero_point=zero_point,
        g_idx=g_idx,
        args=scheme.weights,
        dtype=torch.int8,
    )

    module.quantization_status = QuantizationStatus.COMPRESSED
| |
|