| """Offline recipe: export a HF token-classification model to ONNX + dynamic INT8. |
| |
| Needs torch + optimum (not a runtime dep). Produces the artifacts vendored next to this |
| file (quantized .onnx + tokenizer + config), which the agent loads with onnxruntime only |
| — no torch. Validated for akdeniz27/bert-base-turkish-cased-ner (see README.md). |
| """ |
|
|
| import argparse |
| import os |
| import shutil |
|
|
| from optimum.onnxruntime import ORTModelForTokenClassification, ORTQuantizer |
| from optimum.onnxruntime.configuration import AutoQuantizationConfig |
| from transformers import AutoTokenizer |
|
|
|
|
def main() -> None:
    """Export + dynamic-INT8-quantize the given HF model into ``--out``.

    Command-line arguments:
        --model: HF model id to export.
        --out:   output directory for the quantized model + tokenizer.

    Steps:
        1. Export the model to ONNX into a scratch directory.
        2. Quantize that export into ``--out`` using the AVX512-VNNI
           configuration with ``is_static=False`` and ``per_channel=False``.
        3. Save the tokenizer into ``--out`` and copy ``config.json`` from the
           scratch export if the quantizer did not write one.

    The scratch directory is a uniquely named sibling of ``--out`` and is
    removed when the function exits, whether or not the export succeeded.
    """
    # Local import: only this entry point needs it, and it keeps the block
    # self-contained without touching the file-level import section.
    import tempfile

    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True, help="HF model id, e.g. akdeniz27/bert-base-turkish-cased-ner")
    ap.add_argument("--out", required=True, help="output dir for the quantized model + tokenizer")
    args = ap.parse_args()

    os.makedirs(args.out, exist_ok=True)

    # Normalize first so a trailing slash ("models/ner/") does not place the
    # scratch directory *inside* the output directory.
    out_path = os.path.abspath(args.out)
    out_parent = os.path.dirname(out_path)
    out_name = os.path.basename(out_path)

    # mkdtemp guarantees a fresh, unique directory, so a pre-existing
    # "<out>-fp32" directory is never overwritten or deleted by this script.
    fp32_dir = tempfile.mkdtemp(prefix=f"{out_name}-fp32-", dir=out_parent)
    try:
        # Step 1: full-precision ONNX export into the scratch directory.
        ORTModelForTokenClassification.from_pretrained(args.model, export=True).save_pretrained(fp32_dir)

        # Step 2: quantize the export; the result is written to --out.
        quantizer = ORTQuantizer.from_pretrained(fp32_dir)
        qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
        quantizer.quantize(save_dir=args.out, quantization_config=qconfig)

        # Step 3: tokenizer + model config alongside the quantized graph.
        AutoTokenizer.from_pretrained(args.model).save_pretrained(args.out)
        config_dst = os.path.join(args.out, "config.json")
        if not os.path.exists(config_dst):
            shutil.copy(os.path.join(fp32_dir, "config.json"), config_dst)
    finally:
        # Best-effort cleanup that also runs on failure, so a broken export
        # does not leave the large FP32 intermediate behind.
        shutil.rmtree(fp32_dir, ignore_errors=True)

    print(f"Exported INT8 ONNX + tokenizer to {args.out}")


if __name__ == "__main__":
    main()
|
|