#!/usr/bin/env python3 """ Model Export for Production Deployment ======================================= Export FinEE model to various formats: - ONNX (cross-platform) - GGUF (llama.cpp, mobile) - CoreML (iOS/macOS) - TensorRT (NVIDIA inference) Author: Ranjit Behera """ import os import sys import json import shutil import subprocess from pathlib import Path from typing import Optional, List import argparse class ModelExporter: """ Export models to production-ready formats. """ SUPPORTED_FORMATS = ["onnx", "gguf", "coreml", "tensorrt", "transformers"] def __init__(self, model_path: Path, output_dir: Path): self.model_path = Path(model_path) self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) def export_onnx( self, opset_version: int = 14, optimize: bool = True, ) -> Path: """ Export to ONNX format. ONNX provides: - Cross-platform inference (CPU, GPU, mobile) - Python, C++, C#, Java, JavaScript runtimes - Optimized for ONNX Runtime Requirements: transformers, optimum """ print("🔄 Exporting to ONNX...") try: from optimum.onnxruntime import ORTModelForCausalLM from transformers import AutoTokenizer # Load model print(f" Loading model from {self.model_path}") # Export output_path = self.output_dir / "onnx" output_path.mkdir(exist_ok=True) # Use optimum CLI for export cmd = [ sys.executable, "-m", "optimum.exporters.onnx", "--model", str(self.model_path), "--task", "text-generation", str(output_path), ] subprocess.run(cmd, check=True) print(f"✅ ONNX model exported to {output_path}") # Optimize if requested if optimize: self._optimize_onnx(output_path) return output_path except ImportError: print("❌ Install optimum: pip install optimum[onnxruntime]") return None except Exception as e: print(f"❌ ONNX export failed: {e}") return None def _optimize_onnx(self, model_dir: Path): """Optimize ONNX model.""" try: from onnxruntime.transformers import optimizer model_path = model_dir / "model.onnx" if model_path.exists(): optimized_path = model_dir / "model_optimized.onnx" opt_model = optimizer.optimize_model( str(model_path), model_type="gpt2", # or bert, etc. num_heads=32, hidden_size=4096, ) opt_model.save_model_to_file(str(optimized_path)) print(f" Optimized model saved to {optimized_path}") except Exception as e: print(f" ⚠️ Optimization failed: {e}") def export_gguf( self, quantization: str = "q4_k_m", ) -> Path: """ Export to GGUF format for llama.cpp. GGUF provides: - Fast CPU inference - Low memory usage - Mobile deployment (Android, iOS) - Various quantization levels Requirements: llama-cpp-python, llama.cpp tools """ print(f"🔄 Exporting to GGUF ({quantization})...") output_path = self.output_dir / "gguf" output_path.mkdir(exist_ok=True) try: # Check for llama.cpp convert script convert_script = shutil.which("convert-hf-to-gguf") if convert_script: # Using llama.cpp cmd = [ convert_script, str(self.model_path), "--outfile", str(output_path / "model.gguf"), "--outtype", quantization, ] subprocess.run(cmd, check=True) else: # Try using llama-cpp-python print(" Using llama-cpp-python for conversion...") # Alternative: use Python llama.cpp bindings from llama_cpp import Llama # This requires the model to already be in GGUF print(" ⚠️ llama.cpp convert tools not found") print(" Install: git clone https://github.com/ggerganov/llama.cpp && make") return None print(f"✅ GGUF model exported to {output_path}") return output_path except Exception as e: print(f"❌ GGUF export failed: {e}") print(" To convert to GGUF:") print(" 1. Clone llama.cpp: git clone https://github.com/ggerganov/llama.cpp") print(" 2. Run: python convert-hf-to-gguf.py --outtype q4_k_m") return None def export_coreml(self) -> Path: """ Export to CoreML for iOS/macOS. Requirements: coremltools """ print("🔄 Exporting to CoreML...") output_path = self.output_dir / "coreml" output_path.mkdir(exist_ok=True) try: import coremltools as ct from transformers import AutoModelForCausalLM, AutoTokenizer import torch # Load model model = AutoModelForCausalLM.from_pretrained( self.model_path, torch_dtype=torch.float32, ) tokenizer = AutoTokenizer.from_pretrained(self.model_path) # Trace example_input = tokenizer("Hello", return_tensors="pt") traced = torch.jit.trace(model, (example_input.input_ids,)) # Convert mlmodel = ct.convert( traced, inputs=[ct.TensorType(name="input_ids", shape=(1, ct.RangeDim(1, 512)))], minimum_deployment_target=ct.target.iOS16, ) mlmodel.save(output_path / "model.mlpackage") print(f"✅ CoreML model exported to {output_path}") return output_path except ImportError: print("❌ Install coremltools: pip install coremltools") return None except Exception as e: print(f"❌ CoreML export failed: {e}") return None def export_transformers(self) -> Path: """ Export as standard Transformers format (Safetensors). This is the most compatible format for Hugging Face. """ print("🔄 Exporting to Transformers format...") output_path = self.output_dir / "transformers" output_path.mkdir(exist_ok=True) try: from transformers import AutoModelForCausalLM, AutoTokenizer # Load model = AutoModelForCausalLM.from_pretrained(self.model_path) tokenizer = AutoTokenizer.from_pretrained(self.model_path) # Save in safetensors format model.save_pretrained(output_path, safe_serialization=True) tokenizer.save_pretrained(output_path) print(f"✅ Transformers model exported to {output_path}") return output_path except Exception as e: print(f"❌ Export failed: {e}") return None def create_inference_code(self) -> Path: """Generate inference code for each format.""" code_path = self.output_dir / "inference_examples" code_path.mkdir(exist_ok=True) # ONNX inference onnx_code = ''' """ONNX Runtime Inference""" import numpy as np import onnxruntime as ort from transformers import AutoTokenizer # Load session = ort.InferenceSession("model.onnx") tokenizer = AutoTokenizer.from_pretrained(".") # Inference def extract(text: str) -> dict: inputs = tokenizer(text, return_tensors="np") outputs = session.run(None, {"input_ids": inputs["input_ids"]}) # Decode and parse result = tokenizer.decode(outputs[0][0]) return parse_json(result) # Usage result = extract("HDFC Bank Rs.500 debited") print(result) ''' with open(code_path / "onnx_inference.py", 'w') as f: f.write(onnx_code) # GGUF inference gguf_code = ''' """llama.cpp Inference""" from llama_cpp import Llama # Load llm = Llama(model_path="model.gguf", n_ctx=512, n_gpu_layers=0) # Inference def extract(text: str) -> dict: prompt = f"Extract entities from: {text}\\nJSON:" output = llm(prompt, max_tokens=256, stop=["\\n\\n"]) return json.loads(output["choices"][0]["text"]) # Usage result = extract("HDFC Bank Rs.500 debited") print(result) ''' with open(code_path / "gguf_inference.py", 'w') as f: f.write(gguf_code) # Transformers inference hf_code = ''' """Hugging Face Transformers Inference""" from transformers import AutoModelForCausalLM, AutoTokenizer # Load model = AutoModelForCausalLM.from_pretrained(".") tokenizer = AutoTokenizer.from_pretrained(".") # Inference def extract(text: str) -> dict: prompt = f"Extract entities from: {text}\\nJSON:" inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate(**inputs, max_new_tokens=256) result = tokenizer.decode(outputs[0], skip_special_tokens=True) return json.loads(result.split("JSON:")[-1]) # Usage result = extract("HDFC Bank Rs.500 debited") print(result) ''' with open(code_path / "transformers_inference.py", 'w') as f: f.write(hf_code) print(f"✅ Inference examples saved to {code_path}") return code_path def export_all(self) -> dict: """Export to all supported formats.""" results = {} for fmt in ["transformers", "onnx", "gguf"]: try: if fmt == "onnx": results[fmt] = self.export_onnx() elif fmt == "gguf": results[fmt] = self.export_gguf() elif fmt == "transformers": results[fmt] = self.export_transformers() except Exception as e: results[fmt] = None print(f"⚠️ {fmt} export failed: {e}") self.create_inference_code() return results def main(): parser = argparse.ArgumentParser(description="Export model to production formats") parser.add_argument("model_path", help="Path to model") parser.add_argument("--output", "-o", default="exports", help="Output directory") parser.add_argument("--format", "-f", choices=ModelExporter.SUPPORTED_FORMATS + ["all"], default="all", help="Export format") parser.add_argument("--quantization", "-q", default="q4_k_m", help="GGUF quantization type") args = parser.parse_args() exporter = ModelExporter(Path(args.model_path), Path(args.output)) if args.format == "all": exporter.export_all() elif args.format == "onnx": exporter.export_onnx() elif args.format == "gguf": exporter.export_gguf(args.quantization) elif args.format == "coreml": exporter.export_coreml() elif args.format == "transformers": exporter.export_transformers() if __name__ == "__main__": main()