|
|
|
|
|
""" |
|
|
Model Export for Production Deployment |
|
|
======================================= |
|
|
|
|
|
Export FinEE model to various formats: |
|
|
- ONNX (cross-platform) |
|
|
- GGUF (llama.cpp, mobile) |
|
|
- CoreML (iOS/macOS) |
|
|
- TensorRT (NVIDIA inference) |
|
|
|
|
|
Author: Ranjit Behera |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import json |
|
|
import shutil |
|
|
import subprocess |
|
|
from pathlib import Path |
|
|
from typing import Optional, List |
|
|
import argparse |
|
|
|
|
|
|
|
|
class ModelExporter:
    """
    Export a trained model directory to production-ready formats.

    Supported targets (see ``SUPPORTED_FORMATS``): ONNX, GGUF, CoreML,
    TensorRT and plain Transformers/safetensors. Each ``export_*`` method
    prints human-readable progress and returns the output directory on
    success, or ``None`` on failure (missing optional dependency, missing
    external tool, or conversion error) — it never raises to the caller.
    """

    # Format names accepted by the CLI --format flag ("all" is added there).
    SUPPORTED_FORMATS = ["onnx", "gguf", "coreml", "tensorrt", "transformers"]

    def __init__(self, model_path: Path, output_dir: Path):
        """
        Args:
            model_path: Directory (or Hugging Face hub id) of the model to export.
            output_dir: Root directory for all exported artifacts; created
                (with parents) if it does not already exist.
        """
        self.model_path = Path(model_path)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def export_onnx(
        self,
        opset_version: int = 14,
        optimize: bool = True,
    ) -> Optional[Path]:
        """
        Export to ONNX format.

        ONNX provides:
        - Cross-platform inference (CPU, GPU, mobile)
        - Python, C++, C#, Java, JavaScript runtimes
        - Optimized for ONNX Runtime

        Requirements: transformers, optimum

        Args:
            opset_version: Currently informational only — the optimum CLI
                exporter picks a compatible opset itself. Kept for interface
                compatibility.
            optimize: If True, run ONNX Runtime graph optimization on the
                exported model afterwards.

        Returns:
            Path to the ONNX output directory, or None on failure.
        """
        print("🔄 Exporting to ONNX...")

        try:
            # Imported lazily so this module loads without optimum installed;
            # the names are only probed for availability here.
            from optimum.onnxruntime import ORTModelForCausalLM  # noqa: F401
            from transformers import AutoTokenizer  # noqa: F401

            print(f" Loading model from {self.model_path}")

            output_path = self.output_dir / "onnx"
            output_path.mkdir(exist_ok=True)

            # Delegate to optimum's CLI exporter: it writes the graph plus
            # config/tokenizer files into output_path.
            cmd = [
                sys.executable, "-m", "optimum.exporters.onnx",
                "--model", str(self.model_path),
                "--task", "text-generation",
                str(output_path),
            ]

            subprocess.run(cmd, check=True)
            print(f"✅ ONNX model exported to {output_path}")

            if optimize:
                self._optimize_onnx(output_path)

            return output_path

        except ImportError:
            print("❌ Install optimum: pip install optimum[onnxruntime]")
            return None
        except Exception as e:
            print(f"❌ ONNX export failed: {e}")
            return None

    def _optimize_onnx(self, model_dir: Path) -> None:
        """Run ONNX Runtime transformer graph optimization on model.onnx.

        Architecture dimensions (num_heads / hidden_size) are read from the
        exported ``config.json`` when present, falling back to the previous
        hard-coded defaults (32 / 4096) otherwise, so models of any size are
        optimized correctly. Failures are reported but never propagated.
        """
        try:
            from onnxruntime.transformers import optimizer

            model_path = model_dir / "model.onnx"
            if model_path.exists():
                # Prefer the real architecture dims over a fixed assumption.
                num_heads, hidden_size = 32, 4096
                config_file = model_dir / "config.json"
                if config_file.exists():
                    try:
                        cfg = json.loads(config_file.read_text())
                        num_heads = cfg.get("num_attention_heads", num_heads)
                        hidden_size = cfg.get("hidden_size", hidden_size)
                    except (OSError, ValueError):
                        # Unreadable/invalid config: keep the defaults.
                        pass

                optimized_path = model_dir / "model_optimized.onnx"
                opt_model = optimizer.optimize_model(
                    str(model_path),
                    model_type="gpt2",
                    num_heads=num_heads,
                    hidden_size=hidden_size,
                )
                opt_model.save_model_to_file(str(optimized_path))
                print(f" Optimized model saved to {optimized_path}")
        except Exception as e:
            print(f" ⚠️ Optimization failed: {e}")

    def export_gguf(
        self,
        quantization: str = "q4_k_m",
    ) -> Optional[Path]:
        """
        Export to GGUF format for llama.cpp.

        GGUF provides:
        - Fast CPU inference
        - Low memory usage
        - Mobile deployment (Android, iOS)
        - Various quantization levels

        Requirements: llama.cpp conversion tools on PATH
        (``convert-hf-to-gguf``). Note llama-cpp-python alone cannot perform
        the HF→GGUF conversion, so there is no in-process fallback.

        Args:
            quantization: GGUF quantization/output type (e.g. "q4_k_m").

        Returns:
            Path to the GGUF output directory, or None on failure.
        """
        print(f"🔄 Exporting to GGUF ({quantization})...")

        output_path = self.output_dir / "gguf"
        output_path.mkdir(exist_ok=True)

        try:
            convert_script = shutil.which("convert-hf-to-gguf")

            if convert_script is None:
                # No conversion tool available — tell the user how to get it
                # instead of pretending a llama-cpp-python fallback exists.
                print(" ⚠️ llama.cpp convert tools not found")
                print(" Install: git clone https://github.com/ggerganov/llama.cpp && make")
                return None

            cmd = [
                convert_script,
                str(self.model_path),
                "--outfile", str(output_path / "model.gguf"),
                "--outtype", quantization,
            ]
            subprocess.run(cmd, check=True)

            print(f"✅ GGUF model exported to {output_path}")
            return output_path

        except Exception as e:
            print(f"❌ GGUF export failed: {e}")
            print(" To convert to GGUF:")
            print(" 1. Clone llama.cpp: git clone https://github.com/ggerganov/llama.cpp")
            print(" 2. Run: python convert-hf-to-gguf.py <model_path> --outtype q4_k_m")
            return None

    def export_coreml(self) -> Optional[Path]:
        """
        Export to CoreML for iOS/macOS.

        Requirements: coremltools, transformers, torch

        Returns:
            Path to the CoreML output directory, or None on failure.
        """
        print("🔄 Exporting to CoreML...")

        output_path = self.output_dir / "coreml"
        output_path.mkdir(exist_ok=True)

        try:
            import coremltools as ct
            from transformers import AutoModelForCausalLM, AutoTokenizer
            import torch

            # Load in float32 for tracing/conversion.
            model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.float32,
            )
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # Trace with a tiny example input; variable sequence lengths are
            # declared to the converter via RangeDim below.
            example_input = tokenizer("Hello", return_tensors="pt")
            traced = torch.jit.trace(model, (example_input.input_ids,))

            mlmodel = ct.convert(
                traced,
                inputs=[ct.TensorType(name="input_ids", shape=(1, ct.RangeDim(1, 512)))],
                minimum_deployment_target=ct.target.iOS16,
            )

            mlmodel.save(output_path / "model.mlpackage")
            print(f"✅ CoreML model exported to {output_path}")
            return output_path

        except ImportError:
            print("❌ Install coremltools: pip install coremltools")
            return None
        except Exception as e:
            print(f"❌ CoreML export failed: {e}")
            return None

    def export_transformers(self) -> Optional[Path]:
        """
        Export as standard Transformers format (Safetensors).

        This is the most compatible format for Hugging Face.

        Returns:
            Path to the exported directory, or None on failure.
        """
        print("🔄 Exporting to Transformers format...")

        output_path = self.output_dir / "transformers"
        output_path.mkdir(exist_ok=True)

        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer

            model = AutoModelForCausalLM.from_pretrained(self.model_path)
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # safe_serialization=True writes .safetensors weights rather than
            # pickle-based .bin files.
            model.save_pretrained(output_path, safe_serialization=True)
            tokenizer.save_pretrained(output_path)

            print(f"✅ Transformers model exported to {output_path}")
            return output_path

        except Exception as e:
            print(f"❌ Export failed: {e}")
            return None

    def create_inference_code(self) -> Path:
        """Generate runnable inference example scripts for each format.

        Each generated snippet is self-contained (includes its own imports,
        including ``json`` which all three examples use for parsing output).

        Returns:
            Path to the directory containing the generated example files.
        """

        code_path = self.output_dir / "inference_examples"
        code_path.mkdir(exist_ok=True)

        # --- ONNX Runtime example ------------------------------------------
        onnx_code = '''
"""ONNX Runtime Inference"""
import json
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Load
session = ort.InferenceSession("model.onnx")
tokenizer = AutoTokenizer.from_pretrained(".")

# Inference
def extract(text: str) -> dict:
    inputs = tokenizer(text, return_tensors="np")
    outputs = session.run(None, {"input_ids": inputs["input_ids"]})
    # Decode and parse
    result = tokenizer.decode(outputs[0][0])
    return json.loads(result)

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "onnx_inference.py", 'w') as f:
            f.write(onnx_code)

        # --- llama.cpp / GGUF example --------------------------------------
        gguf_code = '''
"""llama.cpp Inference"""
import json
from llama_cpp import Llama

# Load
llm = Llama(model_path="model.gguf", n_ctx=512, n_gpu_layers=0)

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    output = llm(prompt, max_tokens=256, stop=["\\n\\n"])
    return json.loads(output["choices"][0]["text"])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "gguf_inference.py", 'w') as f:
            f.write(gguf_code)

        # --- Hugging Face Transformers example -----------------------------
        hf_code = '''
"""Hugging Face Transformers Inference"""
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load
model = AutoModelForCausalLM.from_pretrained(".")
tokenizer = AutoTokenizer.from_pretrained(".")

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=256)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return json.loads(result.split("JSON:")[-1])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "transformers_inference.py", 'w') as f:
            f.write(hf_code)

        print(f"✅ Inference examples saved to {code_path}")
        return code_path

    def export_all(self) -> dict:
        """Export to the broadly-portable formats: transformers, onnx, gguf.

        CoreML/TensorRT are not attempted here; call their export methods
        directly on the platforms that support them. Inference examples are
        always (re)generated at the end.

        Returns:
            Mapping of format name -> output Path, or None where that
            format's export failed.
        """
        results = {}

        # Insertion order preserved: transformers first, then onnx, then gguf.
        exporters = {
            "transformers": self.export_transformers,
            "onnx": self.export_onnx,
            "gguf": self.export_gguf,
        }
        for fmt, export in exporters.items():
            try:
                results[fmt] = export()
            except Exception as e:
                # Each exporter already guards itself; this is a last-resort
                # net so one format can never abort the others.
                results[fmt] = None
                print(f"⚠️ {fmt} export failed: {e}")

        self.create_inference_code()
        return results
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and dispatch to a ModelExporter."""
    parser = argparse.ArgumentParser(description="Export model to production formats")
    parser.add_argument("model_path", help="Path to model")
    parser.add_argument("--output", "-o", default="exports", help="Output directory")
    parser.add_argument("--format", "-f", choices=ModelExporter.SUPPORTED_FORMATS + ["all"],
                        default="all", help="Export format")
    parser.add_argument("--quantization", "-q", default="q4_k_m",
                        help="GGUF quantization type")

    args = parser.parse_args()

    exporter = ModelExporter(Path(args.model_path), Path(args.output))

    # "all" is special-cased; the individual formats go through a dispatch
    # table instead of an if/elif chain.
    if args.format == "all":
        exporter.export_all()
        return

    actions = {
        "onnx": exporter.export_onnx,
        "gguf": lambda: exporter.export_gguf(args.quantization),
        "coreml": exporter.export_coreml,
        "transformers": exporter.export_transformers,
    }
    action = actions.get(args.format)
    if action is not None:
        action()


if __name__ == "__main__":
    main()
|
|
|