import gradio as gr
from huggingface_hub import hf_hub_download, upload_file
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime.quantization import quant_utils, matmul_4bits_quantizer
from pathlib import Path
import tempfile, shutil, os

def quantize_model(repo_id, filename, bits, hf_token=None):
    try:
        # download original ONNX
        fp32_path = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token)
        outdir = tempfile.mkdtemp()
        out_path = os.path.join(outdir, f"{Path(filename).stem}.quant{bits}.onnx")

        if bits == 8:
            quantize_dynamic(fp32_path, out_path, weight_type=QuantType.QInt8)
        elif bits == 4:
            # 4-bit weight-only quantization of MatMul weights.
            # Assumes a recent onnxruntime (>= 1.18) where DefaultWeightOnlyQuantConfig
            # accepts these arguments; older releases may need a different call.
            model = quant_utils.load_model_with_shape_infer(Path(fp32_path))
            quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig(
                block_size=128,
                is_symmetric=True,
                accuracy_level=4,
                quant_format=quant_utils.QuantFormat.QOperator,
                op_types_to_quantize=("MatMul",),
                quant_axes=(("MatMul", 0),),
            )
            quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=quant_config)
            quant.process()
            quant.model.save_model_to_file(out_path, True)
        else:
            return None, f"Only 8 or 4 bit supported. Got {bits}."

        # if token provided, auto-upload back
        if hf_token:
            upload_file(
                path_or_fileobj=out_path,
                path_in_repo=Path(out_path).name,
                repo_id=repo_id,
                token=hf_token,
            )
| msg = f"β Uploaded to {repo_id} as {Path(out_path).name}" | |
| else: | |
| msg = "β Quantization done. Download below." | |
| return out_path, msg | |
| except Exception as e: | |
| return None, f"β Error: {e}" | |
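

# Optional sanity check, kept out of the UI: loading the quantized file into an
# InferenceSession is a quick way to confirm the exported graph is still valid.
# This helper is a sketch (the name is ours); it assumes onnxruntime is installed,
# which the quantization imports above already require.
def verify_quantized(path):
    import onnxruntime as ort
    sess = ort.InferenceSession(path, providers=["CPUExecutionProvider"])
    return [inp.name for inp in sess.get_inputs()]
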

demo = gr.Interface(
    fn=quantize_model,
    inputs=[
        gr.Textbox(label="Repo ID (e.g. chakatoptisol/nomic-embed-text-for-asset-management)"),
        gr.Textbox(label="Filename in repo (e.g. model.onnx)"),
        gr.Radio([8, 4], label="Quantization bits"),
        gr.Textbox(label="HF Token (optional, needed for auto-upload)"),
    ],
    outputs=[
        gr.File(label="Quantized ONNX File"),
        gr.Textbox(label="Status / Message"),
    ],
    title="ONNX Quantizer (8-bit / 4-bit)",
)

if __name__ == "__main__":
    demo.launch()
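
# Rough requirements.txt for this Space (versions are an assumption, pin as needed):
#   gradio
#   huggingface_hub
#   onnx
#   onnxruntime>=1.18   # quantize_dynamic, quant_utils, matmul_4bits_quantizer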