"""Offline recipe: export a HF token-classification model to ONNX + dynamic INT8. Needs torch + optimum (not a runtime dep). Produces the artifacts vendored next to this file (quantized .onnx + tokenizer + config), which the agent loads with onnxruntime only — no torch. Validated for akdeniz27/bert-base-turkish-cased-ner (see README.md). """ import argparse import os import shutil from optimum.onnxruntime import ORTModelForTokenClassification, ORTQuantizer from optimum.onnxruntime.configuration import AutoQuantizationConfig from transformers import AutoTokenizer def main() -> None: """Export + dynamic-INT8-quantize the given HF model into ``--out``.""" ap = argparse.ArgumentParser() ap.add_argument("--model", required=True, help="HF model id, e.g. akdeniz27/bert-base-turkish-cased-ner") ap.add_argument("--out", required=True, help="output dir for the quantized model + tokenizer") args = ap.parse_args() fp32_dir = args.out + "-fp32" os.makedirs(args.out, exist_ok=True) # Export FP32 ONNX, then dynamic-INT8 quantize into `out`. ORTModelForTokenClassification.from_pretrained(args.model, export=True).save_pretrained(fp32_dir) quantizer = ORTQuantizer.from_pretrained(fp32_dir) qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False) quantizer.quantize(save_dir=args.out, quantization_config=qconfig) # Tokenizer (tokenizer.json for the Rust runtime) + config.json (id2label). AutoTokenizer.from_pretrained(args.model).save_pretrained(args.out) if not os.path.exists(os.path.join(args.out, "config.json")): shutil.copy(os.path.join(fp32_dir, "config.json"), os.path.join(args.out, "config.json")) shutil.rmtree(fp32_dir, ignore_errors=True) print(f"Exported INT8 ONNX + tokenizer to {args.out}") if __name__ == "__main__": main()