""" Quantize the sentiment model to INT8 using ONNX Runtime dynamic quantization. Usage: python scripts/quantize_model.py Output: backend/models/quantized/ — contains model_quantized.onnx + tokenizer files """ import shutil import tempfile from pathlib import Path from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer from optimum.onnxruntime.configuration import AutoQuantizationConfig from transformers import AutoTokenizer MODEL_ID = "uer/roberta-base-finetuned-jd-binary-chinese" OUTPUT_DIR = Path(__file__).resolve().parent.parent / "backend" / "models" / "quantized" def main() -> None: print(f"[1/4] Exporting {MODEL_ID} to ONNX...") tmp_dir = tempfile.mkdtemp(prefix="onnx_export_") try: model = ORTModelForSequenceClassification.from_pretrained( MODEL_ID, export=True ) model.save_pretrained(tmp_dir) print("[2/4] Applying INT8 dynamic quantization (AVX2)...") quantizer = ORTQuantizer.from_pretrained(tmp_dir) qconfig = AutoQuantizationConfig.avx2(is_static=False) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) quantizer.quantize(save_dir=OUTPUT_DIR, quantization_config=qconfig) print("[3/4] Copying tokenizer files...") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer.save_pretrained(str(OUTPUT_DIR)) print(f"[4/4] Done! Quantized model saved to: {OUTPUT_DIR}") print("Files:") for f in sorted(OUTPUT_DIR.iterdir()): size_mb = f.stat().st_size / (1024 * 1024) print(f" {f.name} ({size_mb:.1f} MB)") finally: shutil.rmtree(tmp_dir, ignore_errors=True) if __name__ == "__main__": main()