# sentimentstream-worker / scripts/quantize_model.py
# Deployed by GitHub Action "deploy: worker release from GitHub" (commit 8ff1b66).
"""
Quantize the sentiment model to INT8 using ONNX Runtime dynamic quantization.
Usage:
python scripts/quantize_model.py
Output:
backend/models/quantized/ — contains model_quantized.onnx + tokenizer files
"""
import shutil
import tempfile
from pathlib import Path
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from transformers import AutoTokenizer
# Hugging Face Hub checkpoint: RoBERTa-base fine-tuned for binary sentiment
# on Chinese JD product reviews.
MODEL_ID = "uer/roberta-base-finetuned-jd-binary-chinese"
# Destination directory: <repo-root>/backend/models/quantized
# (this script lives in <repo-root>/scripts/, hence parent.parent).
OUTPUT_DIR = Path(__file__).resolve().parent.parent / "backend" / "models" / "quantized"
def main() -> None:
    """Export MODEL_ID to ONNX, apply INT8 dynamic quantization, save to OUTPUT_DIR.

    Steps:
      1. Export the PyTorch checkpoint to an FP32 ONNX model (in a temp dir).
      2. Quantize it to INT8 with ONNX Runtime dynamic quantization (AVX2).
      3. Save the matching tokenizer files next to the quantized model.
      4. Print the resulting files and their sizes.

    Writes model_quantized.onnx + tokenizer files into OUTPUT_DIR, creating
    the directory if needed.
    """
    print(f"[1/4] Exporting {MODEL_ID} to ONNX...")
    # TemporaryDirectory guarantees the intermediate FP32 export is removed
    # even if quantization raises (replaces manual mkdtemp + try/finally rmtree).
    with tempfile.TemporaryDirectory(prefix="onnx_export_") as tmp_dir:
        # export=True converts the PyTorch checkpoint to ONNX on the fly.
        model = ORTModelForSequenceClassification.from_pretrained(
            MODEL_ID, export=True
        )
        model.save_pretrained(tmp_dir)

        print("[2/4] Applying INT8 dynamic quantization (AVX2)...")
        quantizer = ORTQuantizer.from_pretrained(tmp_dir)
        # Dynamic quantization (is_static=False): no calibration dataset needed.
        qconfig = AutoQuantizationConfig.avx2(is_static=False)
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        quantizer.quantize(save_dir=OUTPUT_DIR, quantization_config=qconfig)

        print("[3/4] Copying tokenizer files...")
        # Ship the tokenizer alongside the model so the worker can load both
        # from the same directory.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        tokenizer.save_pretrained(str(OUTPUT_DIR))

    print(f"[4/4] Done! Quantized model saved to: {OUTPUT_DIR}")
    print("Files:")
    for f in sorted(OUTPUT_DIR.iterdir()):
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f" {f.name} ({size_mb:.1f} MB)")
# Entry point: run directly as `python scripts/quantize_model.py`.
if __name__ == "__main__":
    main()