# Onnx_quant / app.py
# author: root — commit dee6b63 ("First Steps")
import os
import shutil
import tempfile
from pathlib import Path

import gradio as gr
from huggingface_hub import hf_hub_download, upload_file
from onnxruntime.quantization import QuantType, quantize_dynamic
from onnxruntime.quantization import matmul_4bits_quantizer, quant_utils
def quantize_model(repo_id, filename, bits, hf_token=None):
    """Download an ONNX model from the Hub, quantize it, and optionally re-upload.

    Parameters
    ----------
    repo_id : str
        Hugging Face repository id, e.g. "user/model".
    filename : str
        Path of the ONNX file inside the repo, e.g. "model.onnx".
    bits : int
        Quantization width; only 8 (dynamic int8) and 4 (weight-only MatMul)
        are supported.
    hf_token : str, optional
        HF access token. When non-empty, the quantized file is uploaded back
        to ``repo_id``.

    Returns
    -------
    tuple[str | None, str]
        (path to the quantized file, or None on failure; status message).
    """
    # Validate before any network I/O so unsupported widths fail fast.
    if bits not in (8, 4):
        return None, f"Only 8 or 4 bit supported. Got {bits}."
    # Gradio passes "" for an empty textbox; normalize to None for the Hub API.
    token = hf_token or None
    try:
        # Download the original fp32 ONNX model.
        fp32_path = hf_hub_download(repo_id=repo_id, filename=filename, token=token)
        # The output file must outlive this call (Gradio serves it to the
        # browser), so the temp dir is deliberately not cleaned up here.
        outdir = tempfile.mkdtemp()
        out_path = os.path.join(outdir, f"{Path(filename).stem}.quant{bits}.onnx")

        if bits == 8:
            # Dynamic quantization: int8 weights, activations quantized at runtime.
            quantize_dynamic(fp32_path, out_path, weight_type=QuantType.QInt8)
        else:  # bits == 4
            # Weight-only 4-bit quantization of MatMul nodes.
            model = quant_utils.load_model_with_shape_infer(Path(fp32_path))
            quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig(
                block_size=128,
                is_symmetric=True,
                accuracy_level=4,
                quant_format=quant_utils.QuantFormat.QOperator,
                op_types_to_quantize=("MatMul",),
                quant_axes=(("MatMul", 0),),
            )
            # Pass the config by keyword: positional args are block_size /
            # is_symmetric, not the algo config.
            quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
                model, algo_config=quant_config
            )
            quant.process()
            # True: allow saving tensor data as external files next to the model.
            quant.model.save_model_to_file(out_path, True)

        # If a token was provided, push the result back to the source repo.
        if token:
            upload_file(
                path_or_fileobj=out_path,
                path_in_repo=Path(out_path).name,
                repo_id=repo_id,
                token=token,
            )
            msg = f"✅ Uploaded to {repo_id} as {Path(out_path).name}"
        else:
            msg = "✅ Quantization done. Download below."
        return out_path, msg
    except Exception as e:
        # Surface any failure as a UI message rather than crashing the app.
        return None, f"❌ Error: {e}"
# Gradio UI: repo / filename / bit-width / token in; quantized file + status out.
_inputs = [
    gr.Textbox(label="Repo ID (e.g. chakatoptisol/nomic-embed-text-for-asset-management)"),
    gr.Textbox(label="Filename in repo (e.g. model.onnx)"),
    gr.Radio([8, 4], label="Quantization bits"),
    gr.Textbox(label="HF Token (optional, needed for auto-upload)"),
]
_outputs = [
    gr.File(label="Quantized ONNX File"),
    gr.Textbox(label="Status / Message"),
]
demo = gr.Interface(
    fn=quantize_model,
    inputs=_inputs,
    outputs=_outputs,
    title="ONNX Quantizer (8-bit / 4-bit)",
)
if __name__ == "__main__":
demo.launch()