|
|
""" |
|
|
Model loader for safetensors-based quantized models |
|
|
""" |
|
|
import torch |
|
|
import json |
|
|
import os |
|
|
from safetensors.torch import load_file |
|
|
from transformers import AutoModel |
|
|
from .quantization import QuantizedLinear |
|
|
|
|
|
def load_quantized_model(model_path, device="cuda", trust_remote_code=True):
    """
    Load a quantized model from safetensors weights plus quantization metadata.

    The base (unquantized) architecture is pulled from the original HF repo,
    then each linear layer listed in the metadata is replaced in place with a
    reconstructed ``QuantizedLinear``.

    Args:
        model_path: Path to a local model directory (containing
            ``model.safetensors`` and ``quantization_config.json``) or a
            HuggingFace repo id to download those files from.
        device: Device to move the reconstructed model to.
        trust_remote_code: Whether to allow the base model's custom modeling
            code to run (required for deepseek-ai/DeepSeek-OCR).

    Returns:
        The base model with quantized layers swapped in, moved to ``device``.
    """
    # The quantized checkpoint stores only weights; the architecture itself
    # comes from the original HF repo.
    print("Loading base model architecture...")
    base_model = AutoModel.from_pretrained(
        "deepseek-ai/DeepSeek-OCR",
        # Bug fix: was hardcoded to True, silently ignoring the caller's
        # trust_remote_code argument.
        trust_remote_code=trust_remote_code,
        torch_dtype=torch.bfloat16
    )

    print("Loading quantized weights from safetensors...")
    if os.path.isdir(model_path):
        safetensors_path = os.path.join(model_path, "model.safetensors")
        metadata_path = os.path.join(model_path, "quantization_config.json")
    else:
        # Not a local directory: treat model_path as a HF repo id and
        # download the two artifacts from the hub.
        from huggingface_hub import hf_hub_download
        safetensors_path = hf_hub_download(repo_id=model_path, filename="model.safetensors")
        metadata_path = hf_hub_download(repo_id=model_path, filename="quantization_config.json")

    state_dict = load_file(safetensors_path)

    with open(metadata_path, 'r') as f:
        quant_metadata = json.load(f)

    # Maps dotted module path -> layer shape/bit-width info.
    quantized_layers = quant_metadata['quantized_layers']

    print(f"Reconstructing {len(quantized_layers)} quantized layers...")
    for layer_name, layer_info in quantized_layers.items():
        # Each layer is stored as packed weights + scale + zero point,
        # with an optional bias, keyed by the original module path.
        weight_quantized = state_dict[f"{layer_name}.weight_quantized"]
        scale = state_dict[f"{layer_name}.scale"]
        zero_point = state_dict[f"{layer_name}.zero_point"]
        bias = state_dict.get(f"{layer_name}.bias", None)

        quantized_linear = QuantizedLinear(
            in_features=layer_info['in_features'],
            out_features=layer_info['out_features'],
            bits=layer_info['bits'],
            weight_data=weight_quantized,
            scale=scale,
            zero_point=zero_point,
            bias=bias
        )

        # Walk the dotted path to the parent module and swap the original
        # layer for the reconstructed QuantizedLinear in place.
        parts = layer_name.split('.')
        parent = base_model
        for part in parts[:-1]:
            parent = getattr(parent, part)
        setattr(parent, parts[-1], quantized_linear)

    # Plain string: the original used an f-string with no placeholders.
    print("✅ Model loaded successfully!")
    print(f" Compression: {quant_metadata['stats']['compression_ratio']:.2f}x")
    print(f" Size: {quant_metadata['stats']['compressed_size_mb']:.0f} MB")

    return base_model.to(device)
|
|
|
|
|
|
|
|
def from_pretrained(model_path, **kwargs):
    """
    HuggingFace-style loading entry point.

    Args:
        model_path: Path to a local model directory or HF repo id.
        **kwargs: Recognized keys are ``device`` (default ``'cuda'``) and
            ``trust_remote_code`` (default ``True``); other keys are ignored.

    Returns:
        The loaded quantized model.
    """
    device = kwargs.get('device', 'cuda')
    # Bug fix: previously trust_remote_code was silently dropped even though
    # load_quantized_model accepts it.
    trust_remote_code = kwargs.get('trust_remote_code', True)
    return load_quantized_model(
        model_path, device=device, trust_remote_code=trust_remote_code
    )
|
|
|