"""
Quantize the text model to INT8.

Produces two variants of ``textual.onnx``:

- textual_int8.onnx — full dynamic quantization (incl. Conv → ConvInteger), max speed.
- textual_int8_no_conv.onnx — quantization without Conv (no ConvInteger), for runtimes like ONNX Runtime in Rust.

Both files are written to the current working directory when the script runs.
"""
import logging
from typing import List, Optional

# The log thresholds are raised *before* onnxruntime is imported, so the
# import below intentionally does not sit at the very top of the file.
# NOTE(review): presumably this also hides import-time messages — confirm.
logging.getLogger("onnxruntime").setLevel(logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)

from onnxruntime.quantization import QuantType, quantize_dynamic  # noqa: E402

# Path of the float model that both quantized variants are derived from.
SOURCE_MODEL = "textual.onnx"

# Operator types quantized in the Conv-free variant. "Conv" is deliberately
# absent so that the resulting graph contains no ConvInteger nodes.
OP_TYPES_NO_CONV = ["MatMul", "Attention", "Gather", "Transpose", "EmbedLayerNormalization"]


def _quantize_variant(model_output: str, op_types: Optional[List[str]] = None) -> None:
    """Write one dynamically quantized INT8 copy of ``SOURCE_MODEL``.

    Args:
        model_output: Path of the quantized model file to create.
        op_types: Operator types to quantize. When ``None`` the argument is
            not forwarded at all, so ``quantize_dynamic`` applies its own
            default operator set.
    """
    options = {
        "model_input": SOURCE_MODEL,
        "model_output": model_output,
        "weight_type": QuantType.QInt8,
    }
    # Only forward the restriction when one was requested; this keeps the
    # unrestricted call identical to calling quantize_dynamic without it.
    if op_types is not None:
        options["op_types_to_quantize"] = op_types

    quantize_dynamic(**options)
    print(f"✅ {model_output} created")


# Variant 1: every operator type supported by dynamic quantization.
_quantize_variant("textual_int8.onnx")

# Variant 2: same quantization, restricted to the Conv-free operator list.
_quantize_variant("textual_int8_no_conv.onnx", OP_TYPES_NO_CONV)