#!/usr/bin/env python3
"""
Model Export for Production Deployment
=======================================
Export FinEE model to various formats:
- ONNX (cross-platform)
- GGUF (llama.cpp, mobile)
- CoreML (iOS/macOS)
- TensorRT (NVIDIA inference; not yet implemented in this script)
Author: Ranjit Behera
"""
import os
import sys
import json
import shutil
import subprocess
from pathlib import Path
from typing import Optional
import argparse
class ModelExporter:
"""
Export models to production-ready formats.
"""
SUPPORTED_FORMATS = ["onnx", "gguf", "coreml", "tensorrt", "transformers"]
def __init__(self, model_path: Path, output_dir: Path):
self.model_path = Path(model_path)
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
    def export_onnx(
        self,
        opset_version: int = 14,
        optimize: bool = True,
    ) -> Optional[Path]:
"""
Export to ONNX format.
ONNX provides:
- Cross-platform inference (CPU, GPU, mobile)
- Python, C++, C#, Java, JavaScript runtimes
- Optimized for ONNX Runtime
Requirements: transformers, optimum
"""
print("🔄 Exporting to ONNX...")
        try:
            # Import check: fails early if optimum[onnxruntime] is not installed
            from optimum.onnxruntime import ORTModelForCausalLM  # noqa: F401
            from transformers import AutoTokenizer  # noqa: F401
            print(f" Exporting model from {self.model_path}")
            # Export
            output_path = self.output_dir / "onnx"
            output_path.mkdir(exist_ok=True)
            # Use the optimum exporter CLI for the actual conversion
            cmd = [
                sys.executable, "-m", "optimum.exporters.onnx",
                "--model", str(self.model_path),
                "--task", "text-generation",
                "--opset", str(opset_version),  # assumed supported by this exporter entrypoint
                str(output_path),
            ]
            subprocess.run(cmd, check=True)
print(f"✅ ONNX model exported to {output_path}")
# Optimize if requested
if optimize:
self._optimize_onnx(output_path)
return output_path
except ImportError:
print("❌ Install optimum: pip install optimum[onnxruntime]")
return None
except Exception as e:
print(f"❌ ONNX export failed: {e}")
return None
def _optimize_onnx(self, model_dir: Path):
"""Optimize ONNX model."""
try:
from onnxruntime.transformers import optimizer
model_path = model_dir / "model.onnx"
if model_path.exists():
optimized_path = model_dir / "model_optimized.onnx"
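                # Assumption: model_type, num_heads and hidden_size must match
                # the exported model's own config; the values below are
                # placeholders for a LLaMA-7B-sized model and should be
                # adjusted for the actual FinEE checkpoint.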
opt_model = optimizer.optimize_model(
str(model_path),
model_type="gpt2", # or bert, etc.
num_heads=32,
hidden_size=4096,
)
opt_model.save_model_to_file(str(optimized_path))
print(f" Optimized model saved to {optimized_path}")
except Exception as e:
print(f" ⚠️ Optimization failed: {e}")
    def export_gguf(
        self,
        quantization: str = "q4_k_m",
    ) -> Optional[Path]:
"""
Export to GGUF format for llama.cpp.
GGUF provides:
- Fast CPU inference
- Low memory usage
- Mobile deployment (Android, iOS)
- Various quantization levels
Requirements: llama-cpp-python, llama.cpp tools
"""
print(f"🔄 Exporting to GGUF ({quantization})...")
output_path = self.output_dir / "gguf"
output_path.mkdir(exist_ok=True)
        try:
            # Look for the llama.cpp conversion script on PATH
            convert_script = (
                shutil.which("convert-hf-to-gguf")
                or shutil.which("convert_hf_to_gguf.py")
            )
            if convert_script:
                # The converter itself only writes simple types (e.g. f16, q8_0);
                # K-quants such as q4_k_m require a separate llama-quantize pass
                # over the f16 GGUF afterwards.
                f16_path = output_path / "model-f16.gguf"
                cmd = [
                    convert_script,
                    str(self.model_path),
                    "--outfile", str(f16_path),
                    "--outtype", "f16",
                ]
                subprocess.run(cmd, check=True)
                if quantization.lower() != "f16":
                    quantize_bin = shutil.which("llama-quantize")
                    if quantize_bin:
                        quant_path = output_path / f"model-{quantization}.gguf"
                        subprocess.run(
                            [quantize_bin, str(f16_path), str(quant_path), quantization.upper()],
                            check=True,
                        )
                    else:
                        print(f" ⚠️ llama-quantize not found; keeping f16 GGUF at {f16_path}")
            else:
                print(" ⚠️ llama.cpp convert tools not found")
                print(" Install: git clone https://github.com/ggerganov/llama.cpp && make")
                return None
print(f"✅ GGUF model exported to {output_path}")
return output_path
except Exception as e:
print(f"❌ GGUF export failed: {e}")
print(" To convert to GGUF:")
print(" 1. Clone llama.cpp: git clone https://github.com/ggerganov/llama.cpp")
print(" 2. Run: python convert-hf-to-gguf.py <model_path> --outtype q4_k_m")
return None
    def export_coreml(self) -> Optional[Path]:
"""
Export to CoreML for iOS/macOS.
Requirements: coremltools
"""
print("🔄 Exporting to CoreML...")
output_path = self.output_dir / "coreml"
output_path.mkdir(exist_ok=True)
try:
import coremltools as ct
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
            # Load model with tuple outputs so torch.jit.trace can handle it
            model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.float32,
                torchscript=True,
            )
            model.eval()
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            # Trace with a small example input
            example_input = tokenizer("Hello", return_tensors="pt")
            traced = torch.jit.trace(model, (example_input.input_ids,))
# Convert
mlmodel = ct.convert(
traced,
inputs=[ct.TensorType(name="input_ids", shape=(1, ct.RangeDim(1, 512)))],
minimum_deployment_target=ct.target.iOS16,
)
            mlmodel.save(str(output_path / "model.mlpackage"))
print(f"✅ CoreML model exported to {output_path}")
return output_path
except ImportError:
print("❌ Install coremltools: pip install coremltools")
return None
except Exception as e:
print(f"❌ CoreML export failed: {e}")
return None
    def export_transformers(self) -> Optional[Path]:
"""
Export as standard Transformers format (Safetensors).
This is the most compatible format for Hugging Face.
"""
print("🔄 Exporting to Transformers format...")
output_path = self.output_dir / "transformers"
output_path.mkdir(exist_ok=True)
try:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load
model = AutoModelForCausalLM.from_pretrained(self.model_path)
tokenizer = AutoTokenizer.from_pretrained(self.model_path)
# Save in safetensors format
model.save_pretrained(output_path, safe_serialization=True)
tokenizer.save_pretrained(output_path)
print(f"✅ Transformers model exported to {output_path}")
return output_path
except Exception as e:
print(f"❌ Export failed: {e}")
return None
def create_inference_code(self) -> Path:
"""Generate inference code for each format."""
code_path = self.output_dir / "inference_examples"
code_path.mkdir(exist_ok=True)
# ONNX inference
        onnx_code = '''
"""ONNX Runtime Inference (via optimum's ORT wrapper)"""
import json

from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

# Load the exported ONNX model and tokenizer from the export directory
model = ORTModelForCausalLM.from_pretrained(".")
tokenizer = AutoTokenizer.from_pretrained(".")

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=256)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return json.loads(result.split("JSON:")[-1])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''
with open(code_path / "onnx_inference.py", 'w') as f:
f.write(onnx_code)
# GGUF inference
        gguf_code = '''
"""llama.cpp Inference"""
import json

from llama_cpp import Llama

# Load
llm = Llama(model_path="model.gguf", n_ctx=512, n_gpu_layers=0)

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    output = llm(prompt, max_tokens=256, stop=["\\n\\n"])
    return json.loads(output["choices"][0]["text"])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''
with open(code_path / "gguf_inference.py", 'w') as f:
f.write(gguf_code)
# Transformers inference
        hf_code = '''
"""Hugging Face Transformers Inference"""
import json

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load
model = AutoModelForCausalLM.from_pretrained(".")
tokenizer = AutoTokenizer.from_pretrained(".")

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=256)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return json.loads(result.split("JSON:")[-1])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''
with open(code_path / "transformers_inference.py", 'w') as f:
f.write(hf_code)
print(f"✅ Inference examples saved to {code_path}")
return code_path
def export_all(self) -> dict:
"""Export to all supported formats."""
results = {}
for fmt in ["transformers", "onnx", "gguf"]:
try:
if fmt == "onnx":
results[fmt] = self.export_onnx()
elif fmt == "gguf":
results[fmt] = self.export_gguf()
elif fmt == "transformers":
results[fmt] = self.export_transformers()
except Exception as e:
results[fmt] = None
print(f"⚠️ {fmt} export failed: {e}")
self.create_inference_code()
return results
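# Programmatic usage sketch (paths are illustrative placeholders):
#   exporter = ModelExporter(Path("./finee-model"), Path("exports"))
#   results = exporter.export_all()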
def main():
parser = argparse.ArgumentParser(description="Export model to production formats")
parser.add_argument("model_path", help="Path to model")
parser.add_argument("--output", "-o", default="exports", help="Output directory")
parser.add_argument("--format", "-f", choices=ModelExporter.SUPPORTED_FORMATS + ["all"],
default="all", help="Export format")
parser.add_argument("--quantization", "-q", default="q4_k_m",
help="GGUF quantization type")
args = parser.parse_args()
exporter = ModelExporter(Path(args.model_path), Path(args.output))
if args.format == "all":
exporter.export_all()
elif args.format == "onnx":
exporter.export_onnx()
elif args.format == "gguf":
exporter.export_gguf(args.quantization)
elif args.format == "coreml":
exporter.export_coreml()
    elif args.format == "transformers":
        exporter.export_transformers()
    else:
        # "tensorrt" is accepted on the CLI but no exporter is implemented yet
        print(f"⚠️ Export format '{args.format}' is not implemented yet")
if __name__ == "__main__":
main()