File size: 755 Bytes
fd0b01f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Dynamic INT8 quantization of an ONNX model via ONNX Runtime (ort). We use
# dynamic (not static) quantization because we do not have a calibration
# dataset, and dynamic quantization works well for decoder-style LMs.

from pathlib import Path


def quantize_int8(fp32_onnx: Path, out_dir: Path) -> Path:
    """Dynamically quantize an ONNX model's weights to int8.

    The quantized model is always written to ``out_dir / "model.onnx"``;
    ``out_dir`` is created (including parents) if it does not exist.

    Args:
        fp32_onnx: Path to the source ONNX model file.
        out_dir: Directory that will receive the quantized ``model.onnx``.

    Returns:
        Path to the quantized model file.

    Raises:
        FileNotFoundError: If ``fp32_onnx`` is not an existing file.
        ValueError: If the output path resolves to the input file, which
            would overwrite the source model.
    """
    fp32_onnx = Path(fp32_onnx)
    out_dir = Path(out_dir)
    int8_path = out_dir / "model.onnx"

    # Validate before touching the filesystem or importing onnxruntime, so a
    # bad call fails fast with a clear message and leaves no stray out_dir.
    if not fp32_onnx.is_file():
        raise FileNotFoundError(f"fp32 ONNX model not found: {fp32_onnx}")
    # The output filename is fixed, so pointing out_dir at the input's own
    # directory (when the input is named model.onnx) would clobber the source.
    if fp32_onnx.resolve() == int8_path.resolve():
        raise ValueError(
            f"output path {int8_path} is the same file as the input model; "
            "choose a different out_dir"
        )

    # Imported inside the function so that importing this module does not
    # itself require onnxruntime.
    from onnxruntime.quantization import quantize_dynamic, QuantType

    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"[quantize] {fp32_onnx} -> {int8_path}")
    quantize_dynamic(
        model_input=str(fp32_onnx),
        model_output=str(int8_path),
        weight_type=QuantType.QInt8,
    )
    print(f"[quantize] wrote {int8_path} ({int8_path.stat().st_size / 1e6:.1f} MB)")
    return int8_path