"""Quantize the text model to INT8.

Reads ``textual.onnx`` and writes two dynamically quantized variants:

- textual_int8.onnx — full dynamic quantization (incl. Conv → ConvInteger),
  max speed.
- textual_int8_no_conv.onnx — quantization without Conv (no ConvInteger),
  for runtimes like ONNX Runtime in Rust.
"""

import logging

# Raise the threshold of the "onnxruntime" logger and of the root logger to
# ERROR so that lower-severity records are dropped.
# NOTE(review): this runs before the onnxruntime import below, presumably so
# that messages logged while importing are suppressed as well — keep the order.
logging.getLogger("onnxruntime").setLevel(logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)

from onnxruntime.quantization import QuantType, quantize_dynamic  # noqa: E402


def _quantize_textual(output_path, **overrides):
    """Dynamically quantize ``textual.onnx`` into *output_path* with INT8 weights.

    Any keyword arguments in *overrides* are forwarded unchanged to
    ``quantize_dynamic`` on top of the settings shared by both variants.
    """
    quantize_dynamic(
        model_input="textual.onnx",
        model_output=output_path,
        weight_type=QuantType.QInt8,
        **overrides,
    )


# Full quantization: no op-type filter is passed, so the library default
# applies (all types from IntegerOpsRegistry, incl. Conv).
_quantize_textual("textual_int8.onnx")
print("✅ textual_int8.onnx created")

# Variant without Conv: only the op types listed here are quantized, so no
# ConvInteger nodes are produced — compatible with ort in Rust etc.
OP_TYPES_NO_CONV = [
    "MatMul",
    "Attention",
    "Gather",
    "Transpose",
    "EmbedLayerNormalization",
]
_quantize_textual(
    "textual_int8_no_conv.onnx",
    op_types_to_quantize=OP_TYPES_NO_CONV,
)
print("✅ textual_int8_no_conv.onnx created")