"""
Quantize the sentiment model to INT8 using ONNX Runtime dynamic quantization.

Usage:
    python scripts/quantize_model.py

Output:
    backend/models/quantized/  — contains model_quantized.onnx + tokenizer files
"""

import shutil
import tempfile
from pathlib import Path

from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from transformers import AutoTokenizer

MODEL_ID = "uer/roberta-base-finetuned-jd-binary-chinese"
OUTPUT_DIR = Path(__file__).resolve().parent.parent / "backend" / "models" / "quantized"


def main() -> None:
    print(f"[1/4] Exporting {MODEL_ID} to ONNX...")
    tmp_dir = tempfile.mkdtemp(prefix="onnx_export_")
    try:
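        # export=True converts the PyTorch checkpoint to ONNX on the fly;
        # the intermediate ONNX model is staged in a temp dir so only the
        # quantized artifact lands under backend/models/quantized.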
        model = ORTModelForSequenceClassification.from_pretrained(
            MODEL_ID, export=True
        )
        model.save_pretrained(tmp_dir)

        print("[2/4] Applying INT8 dynamic quantization (AVX2)...")
        quantizer = ORTQuantizer.from_pretrained(tmp_dir)
        qconfig = AutoQuantizationConfig.avx2(is_static=False)

        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        quantizer.quantize(save_dir=OUTPUT_DIR, quantization_config=qconfig)
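        # ORTQuantizer appends a "_quantized" suffix to the file name by
        # default, which yields the model_quantized.onnx mentioned above.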

        print("[3/4] Copying tokenizer files...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        tokenizer.save_pretrained(OUTPUT_DIR)  # save_pretrained accepts PathLike

        print(f"[4/4] Done! Quantized model saved to: {OUTPUT_DIR}")
        print("Files:")
        for f in sorted(OUTPUT_DIR.iterdir()):
            size_mb = f.stat().st_size / (1024 * 1024)
            print(f"  {f.name}  ({size_mb:.1f} MB)")
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)


if __name__ == "__main__":
    main()
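

# Usage sketch (not part of the original script, hence commented out): one
# way to load the quantized model back for inference. The file_name argument
# matches ORTQuantizer's default output name; the directory path assumes the
# command is run from the repo root.
#
#     from optimum.onnxruntime import ORTModelForSequenceClassification
#     from transformers import AutoTokenizer, pipeline
#
#     model = ORTModelForSequenceClassification.from_pretrained(
#         "backend/models/quantized", file_name="model_quantized.onnx"
#     )
#     tokenizer = AutoTokenizer.from_pretrained("backend/models/quantized")
#     classify = pipeline("text-classification", model=model, tokenizer=tokenizer)
#     print(classify("质量很好，物流也快"))  # "great quality, fast shipping" -> positive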