import gradio as gr
from huggingface_hub import hf_hub_download, upload_file
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime.quantization import quant_utils
from pathlib import Path
import tempfile, shutil, os


def quantize_model(repo_id, filename, bits, hf_token=None):
    """Download an ONNX model from the HF Hub, quantize it, and optionally re-upload.

    Args:
        repo_id: Hugging Face repo id (e.g. "org/model-name").
        filename: Path of the ONNX file inside the repo (e.g. "model.onnx").
        bits: Quantization width; only 8 (dynamic INT8) or 4 (MatMul weight-only)
            are supported.
        hf_token: Optional HF token. When provided, the quantized file is
            uploaded back to ``repo_id``; it is also passed to the download
            call so private repos work.

    Returns:
        (out_path, message) on success, or (None, error_message) on failure.
        Errors are reported via the message rather than raised, because the
        tuple feeds directly into the Gradio File/Textbox outputs.
    """
    try:
        # Download the original fp32 ONNX model (token allows private repos).
        fp32_path = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token)

        # Write the quantized model into a fresh temp dir so repeated runs
        # never collide.
        outdir = tempfile.mkdtemp()
        out_path = os.path.join(outdir, f"{Path(filename).stem}.quant{bits}.onnx")

        if bits == 8:
            # Dynamic quantization: weights stored as signed INT8.
            quantize_dynamic(fp32_path, out_path, weight_type=QuantType.QInt8)
        elif bits == 4:
            # BUG FIX: this branch was previously commented out, so choosing
            # 4-bit silently skipped quantization and the function went on to
            # return/upload a path to a file that was never created.
            # Imported lazily: matmul_4bits_quantizer only exists in newer
            # onnxruntime releases, so the 8-bit path keeps working without it.
            from onnxruntime.quantization import matmul_4bits_quantizer

            model = quant_utils.load_model_with_shape_infer(Path(fp32_path))
            quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig(
                block_size=128,
                is_symmetric=True,
                accuracy_level=4,
                quant_format=quant_utils.QuantFormat.QOperator,
                op_types_to_quantize=("MatMul",),
                quant_axes=(("MatMul", 0),),
            )
            quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
                model, None, None, quant_config
            )
            quant.process()
            # Second arg saves external tensor data alongside the model file.
            quant.model.save_model_to_file(out_path, True)
        else:
            return None, f"Only 8 or 4 bit supported. Got {bits}."

        # If a token was provided, auto-upload the quantized file back to the repo.
        if hf_token:
            upload_file(
                path_or_fileobj=out_path,
                path_in_repo=Path(out_path).name,
                repo_id=repo_id,
                token=hf_token,
            )
            msg = f"✅ Uploaded to {repo_id} as {Path(out_path).name}"
        else:
            msg = "✅ Quantization done. Download below."

        return out_path, msg
    except Exception as e:
        # Surface any failure in the UI status box instead of crashing the app.
        return None, f"❌ Error: {e}"


# Gradio UI: maps the four text/radio inputs onto quantize_model's parameters
# and exposes (file, status-message) outputs.
demo = gr.Interface(
    fn=quantize_model,
    inputs=[
        gr.Textbox(label="Repo ID (e.g. chakatoptisol/nomic-embed-text-for-asset-management)"),
        gr.Textbox(label="Filename in repo (e.g. model.onnx)"),
        gr.Radio([8, 4], label="Quantization bits"),
        gr.Textbox(label="HF Token (optional, needed for auto-upload)"),
    ],
    outputs=[
        gr.File(label="Quantized ONNX File"),
        gr.Textbox(label="Status / Message"),
    ],
    title="ONNX Quantizer (8-bit / 4-bit)",
)

if __name__ == "__main__":
    demo.launch()