# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

import torch
from compressed_tensors.quantization.lifecycle.forward import quantize
from compressed_tensors.quantization.quant_config import QuantizationStatus
from torch.nn import Module


__all__ = [
    "compress_quantized_weights",
]

_LOGGER = logging.getLogger(__name__)

def compress_quantized_weights(module: Module):
    """
    Quantizes the module weight representation to use fewer bits in memory

    apply to full model with `model.apply(compress_quantized_weights)`

    :param module: module to compress to quantized representation
    """
    scheme = getattr(module, "quantization_scheme", None)
    if not scheme or not scheme.weights:
        # no quantization scheme or weights not quantized, nothing to do
        return

    status = getattr(module, "quantization_status", None)
    if status is QuantizationStatus.COMPRESSED:
        # module is already compressed, nothing to do
        return

    weight = getattr(module, "weight", None)
    scale = getattr(module, "weight_scale", None)
    zero_point = getattr(module, "weight_zero_point", None)
    g_idx = getattr(module, "weight_g_idx", None)

    if weight is None or scale is None:
        # no weight or scale, nothing to do
        # mark as compressed here to maintain consistent status throughout the model
        module.quantization_status = QuantizationStatus.COMPRESSED
        return

    module.weight.requires_grad = False  # cannot use autograd after compression
    module.weight.data = quantize(
        x=weight,
        scale=scale,
        zero_point=zero_point,
        g_idx=g_idx,
        args=scheme.weights,
        dtype=torch.int8,
    )

    module.quantization_status = QuantizationStatus.COMPRESSED
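

# Usage sketch (illustrative, not part of the upstream module): applying the
# transform across an already-calibrated model. `model` here is a hypothetical
# torch.nn.Module whose submodules carry `quantization_scheme`, `weight_scale`,
# and `weight_zero_point` attributes populated by a prior calibration pass:
#
#     model.apply(compress_quantized_weights)
#
# Conceptually, the `quantize` call above is expected to perform standard
# affine quantization (an assumption about compressed_tensors internals, not
# a guarantee), along the lines of:
#
#     q = torch.clamp(torch.round(x / scale) + zero_point, qmin, qmax).to(torch.int8)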