"""
Quantize the sentiment model to INT8 using ONNX Runtime dynamic quantization.
Usage:
python scripts/quantize_model.py
Output:
backend/models/quantized/ — contains model_quantized.onnx + tokenizer files
"""
import shutil
import tempfile
from pathlib import Path
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from transformers import AutoTokenizer
# Hugging Face model ID: Chinese RoBERTa fine-tuned for binary (JD review) sentiment.
MODEL_ID: str = "uer/roberta-base-finetuned-jd-binary-chinese"
# Destination for the quantized model + tokenizer: <repo root>/backend/models/quantized
OUTPUT_DIR: Path = Path(__file__).resolve().parent.parent / "backend" / "models" / "quantized"
def main() -> None:
    """Export the sentiment model to ONNX, quantize it to INT8, and bundle its tokenizer.

    Writes the quantized model plus tokenizer files into OUTPUT_DIR. The
    intermediate (un-quantized) ONNX export lives in a temporary directory
    that is always removed, even on failure.
    """
    print(f"[1/4] Exporting {MODEL_ID} to ONNX...")
    export_dir = tempfile.mkdtemp(prefix="onnx_export_")
    try:
        # Export the PyTorch checkpoint to ONNX and persist it so the
        # quantizer can load it from disk.
        ort_model = ORTModelForSequenceClassification.from_pretrained(
            MODEL_ID, export=True
        )
        ort_model.save_pretrained(export_dir)

        print("[2/4] Applying INT8 dynamic quantization (AVX2)...")
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        # Dynamic (weights-only) INT8 quantization targeting AVX2 kernels.
        quant_config = AutoQuantizationConfig.avx2(is_static=False)
        ORTQuantizer.from_pretrained(export_dir).quantize(
            save_dir=OUTPUT_DIR, quantization_config=quant_config
        )

        print("[3/4] Copying tokenizer files...")
        # The tokenizer is unaffected by quantization; ship it alongside the model.
        AutoTokenizer.from_pretrained(MODEL_ID).save_pretrained(str(OUTPUT_DIR))

        print(f"[4/4] Done! Quantized model saved to: {OUTPUT_DIR}")
        print("Files:")
        for artifact in sorted(OUTPUT_DIR.iterdir()):
            mb = artifact.stat().st_size / (1024 * 1024)
            print(f" {artifact.name} ({mb:.1f} MB)")
    finally:
        # Always discard the un-quantized intermediate export.
        shutil.rmtree(export_dir, ignore_errors=True)
# Script entry point: only run the quantization pipeline when executed
# directly, not when imported.
if __name__ == "__main__":
    main()