Ranjit0034 committed on
Commit
114a2fc
·
verified ·
1 Parent(s): 1cba4da

Upload scripts/export_model.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/export_model.py +366 -0
scripts/export_model.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model Export for Production Deployment
4
+ =======================================
5
+
6
+ Export FinEE model to various formats:
7
+ - ONNX (cross-platform)
8
+ - GGUF (llama.cpp, mobile)
9
+ - CoreML (iOS/macOS)
10
+ - TensorRT (NVIDIA inference)
11
+
12
+ Author: Ranjit Behera
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ import json
18
+ import shutil
19
+ import subprocess
20
+ from pathlib import Path
21
+ from typing import Optional, List
22
+ import argparse
23
+
24
+
25
class ModelExporter:
    """
    Export models to production-ready formats.

    Supported targets:
    - onnx        : cross-platform inference via ONNX Runtime
    - gguf        : llama.cpp / mobile CPU inference
    - coreml      : iOS/macOS on-device inference
    - transformers: standard Hugging Face safetensors layout
    ("tensorrt" is advertised in SUPPORTED_FORMATS but has no exporter
    implemented here yet.)
    """

    # Formats accepted by the CLI --format flag ("all" is handled separately).
    SUPPORTED_FORMATS = ["onnx", "gguf", "coreml", "tensorrt", "transformers"]

    def __init__(self, model_path: Path, output_dir: Path):
        """
        Args:
            model_path: Local directory (or Hugging Face hub id) of the model.
            output_dir: Root directory for exported artifacts; created if missing.
        """
        self.model_path = Path(model_path)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def export_onnx(
        self,
        opset_version: int = 14,
        optimize: bool = True,
    ) -> Optional[Path]:
        """
        Export to ONNX format.

        ONNX provides:
        - Cross-platform inference (CPU, GPU, mobile)
        - Python, C++, C#, Java, JavaScript runtimes
        - Optimized for ONNX Runtime

        Args:
            opset_version: ONNX opset to target (forwarded to the optimum
                exporter; previously accepted but silently ignored).
            optimize: Run ONNX Runtime graph optimization after export.

        Returns:
            Path to the exported ONNX directory, or None on failure.

        Requirements: transformers, optimum
        """
        print("🔄 Exporting to ONNX...")

        try:
            # Imported only as an availability check so missing optimum /
            # transformers fails fast with a clear install hint below.
            from optimum.onnxruntime import ORTModelForCausalLM  # noqa: F401
            from transformers import AutoTokenizer  # noqa: F401

            print(f" Loading model from {self.model_path}")

            output_path = self.output_dir / "onnx"
            output_path.mkdir(exist_ok=True)

            # Use the optimum CLI for the actual export; it handles the
            # task-specific graph export and tokenizer/config copying.
            cmd = [
                sys.executable, "-m", "optimum.exporters.onnx",
                "--model", str(self.model_path),
                "--task", "text-generation",
                "--opset", str(opset_version),  # fix: was never passed before
                str(output_path),
            ]

            subprocess.run(cmd, check=True)
            print(f"✅ ONNX model exported to {output_path}")

            if optimize:
                self._optimize_onnx(output_path)

            return output_path

        except ImportError:
            print("❌ Install optimum: pip install optimum[onnxruntime]")
            return None
        except Exception as e:
            print(f"❌ ONNX export failed: {e}")
            return None

    def _optimize_onnx(self, model_dir: Path):
        """Run ONNX Runtime graph optimization on the exported model.

        Best-effort: failures are reported but never raised, since the
        unoptimized model remains usable.
        """
        try:
            from onnxruntime.transformers import optimizer

            model_path = model_dir / "model.onnx"
            if model_path.exists():
                optimized_path = model_dir / "model_optimized.onnx"
                # NOTE(review): model_type / num_heads / hidden_size are
                # hard-coded for a 32-head, 4096-dim architecture — confirm
                # they match the actual checkpoint before trusting the output.
                opt_model = optimizer.optimize_model(
                    str(model_path),
                    model_type="gpt2",  # or bert, etc.
                    num_heads=32,
                    hidden_size=4096,
                )
                opt_model.save_model_to_file(str(optimized_path))
                print(f" Optimized model saved to {optimized_path}")
        except Exception as e:
            print(f" ⚠️ Optimization failed: {e}")

    def export_gguf(
        self,
        quantization: str = "q4_k_m",
    ) -> Optional[Path]:
        """
        Export to GGUF format for llama.cpp.

        GGUF provides:
        - Fast CPU inference
        - Low memory usage
        - Mobile deployment (Android, iOS)
        - Various quantization levels

        Args:
            quantization: GGUF output/quantization type (e.g. "q4_k_m").

        Returns:
            Path to the GGUF output directory, or None when the conversion
            tooling is unavailable or conversion fails.

        Requirements: llama.cpp convert tools on PATH
        """
        print(f"🔄 Exporting to GGUF ({quantization})...")

        output_path = self.output_dir / "gguf"
        output_path.mkdir(exist_ok=True)

        try:
            # llama.cpp's converter must be installed and on PATH.
            convert_script = shutil.which("convert-hf-to-gguf")

            if convert_script is None:
                # Fix: the previous fallback imported llama_cpp but never used
                # it — llama-cpp-python only loads GGUF files, it cannot
                # create them, so there is no in-process fallback.
                print(" ⚠️ llama.cpp convert tools not found")
                print(" Install: git clone https://github.com/ggerganov/llama.cpp && make")
                return None

            cmd = [
                convert_script,
                str(self.model_path),
                "--outfile", str(output_path / "model.gguf"),
                "--outtype", quantization,
            ]
            subprocess.run(cmd, check=True)

            print(f"✅ GGUF model exported to {output_path}")
            return output_path

        except Exception as e:
            print(f"❌ GGUF export failed: {e}")
            print(" To convert to GGUF:")
            print(" 1. Clone llama.cpp: git clone https://github.com/ggerganov/llama.cpp")
            print(" 2. Run: python convert-hf-to-gguf.py <model_path> --outtype q4_k_m")
            return None

    def export_coreml(self) -> Optional[Path]:
        """
        Export to CoreML for iOS/macOS.

        Returns:
            Path to the CoreML output directory, or None on failure.

        Requirements: coremltools, transformers, torch
        """
        print("🔄 Exporting to CoreML...")

        output_path = self.output_dir / "coreml"
        output_path.mkdir(exist_ok=True)

        try:
            import coremltools as ct
            from transformers import AutoModelForCausalLM, AutoTokenizer
            import torch

            # Load in float32 for tracing/conversion.
            model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.float32,
            )
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # Trace with a tiny example input so torch.jit can record the graph.
            example_input = tokenizer("Hello", return_tensors="pt")
            traced = torch.jit.trace(model, (example_input.input_ids,))

            # Convert with a flexible sequence length of up to 512 tokens.
            mlmodel = ct.convert(
                traced,
                inputs=[ct.TensorType(name="input_ids", shape=(1, ct.RangeDim(1, 512)))],
                minimum_deployment_target=ct.target.iOS16,
            )

            # coremltools expects a string path for the .mlpackage output.
            mlmodel.save(str(output_path / "model.mlpackage"))
            print(f"✅ CoreML model exported to {output_path}")
            return output_path

        except ImportError:
            print("❌ Install coremltools: pip install coremltools")
            return None
        except Exception as e:
            print(f"❌ CoreML export failed: {e}")
            return None

    def export_transformers(self) -> Optional[Path]:
        """
        Export as standard Transformers format (Safetensors).

        This is the most compatible format for Hugging Face.

        Returns:
            Path to the output directory, or None on failure.
        """
        print("🔄 Exporting to Transformers format...")

        output_path = self.output_dir / "transformers"
        output_path.mkdir(exist_ok=True)

        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer

            model = AutoModelForCausalLM.from_pretrained(self.model_path)
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # safe_serialization=True writes .safetensors instead of
            # pickle-based .bin weight files.
            model.save_pretrained(output_path, safe_serialization=True)
            tokenizer.save_pretrained(output_path)

            print(f"✅ Transformers model exported to {output_path}")
            return output_path

        except Exception as e:
            print(f"❌ Export failed: {e}")
            return None

    def create_inference_code(self) -> Path:
        """Generate a runnable inference example script for each format.

        Fixes over the previous version: every generated script that calls
        json.loads now imports json, and the ONNX example no longer calls
        an undefined parse_json helper.

        Returns:
            Path to the directory containing the example scripts.
        """
        code_path = self.output_dir / "inference_examples"
        code_path.mkdir(exist_ok=True)

        # ONNX Runtime example.
        onnx_code = '''"""ONNX Runtime Inference"""
import json
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Load
session = ort.InferenceSession("model.onnx")
tokenizer = AutoTokenizer.from_pretrained(".")

# Inference
def extract(text: str) -> dict:
    inputs = tokenizer(text, return_tensors="np")
    outputs = session.run(None, {"input_ids": inputs["input_ids"]})
    # Decode and parse
    result = tokenizer.decode(outputs[0][0])
    return json.loads(result.split("JSON:")[-1])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "onnx_inference.py", 'w') as f:
            f.write(onnx_code)

        # llama.cpp (GGUF) example.
        gguf_code = '''"""llama.cpp Inference"""
import json
from llama_cpp import Llama

# Load
llm = Llama(model_path="model.gguf", n_ctx=512, n_gpu_layers=0)

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    output = llm(prompt, max_tokens=256, stop=["\\n\\n"])
    return json.loads(output["choices"][0]["text"])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "gguf_inference.py", 'w') as f:
            f.write(gguf_code)

        # Hugging Face Transformers example.
        hf_code = '''"""Hugging Face Transformers Inference"""
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load
model = AutoModelForCausalLM.from_pretrained(".")
tokenizer = AutoTokenizer.from_pretrained(".")

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=256)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return json.loads(result.split("JSON:")[-1])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "transformers_inference.py", 'w') as f:
            f.write(hf_code)

        print(f"✅ Inference examples saved to {code_path}")
        return code_path

    def export_all(self) -> dict:
        """Export to every automated format and generate inference examples.

        CoreML is excluded: it requires platform-specific tooling and is
        only run when requested explicitly via the CLI.

        Returns:
            Mapping of format name to output path (None where export failed).
        """
        results = {}

        # Dispatch table keeps per-format branching out of the loop body.
        exporters = {
            "transformers": self.export_transformers,
            "onnx": self.export_onnx,
            "gguf": self.export_gguf,
        }

        for fmt, run_export in exporters.items():
            try:
                results[fmt] = run_export()
            except Exception as e:
                # An individual failure must not abort the remaining exports.
                results[fmt] = None
                print(f"⚠️ {fmt} export failed: {e}")

        self.create_inference_code()
        return results
338
+
339
+
340
def main():
    """CLI entry point: parse arguments and run the requested export(s)."""
    arg_parser = argparse.ArgumentParser(description="Export model to production formats")
    arg_parser.add_argument("model_path", help="Path to model")
    arg_parser.add_argument("--output", "-o", default="exports", help="Output directory")
    arg_parser.add_argument("--format", "-f", choices=ModelExporter.SUPPORTED_FORMATS + ["all"],
                            default="all", help="Export format")
    arg_parser.add_argument("--quantization", "-q", default="q4_k_m",
                            help="GGUF quantization type")

    opts = arg_parser.parse_args()

    exporter = ModelExporter(Path(opts.model_path), Path(opts.output))

    # Dispatch table instead of an if/elif chain. "tensorrt" deliberately
    # has no entry and falls through as a no-op, matching the original
    # branch structure.
    actions = {
        "all": exporter.export_all,
        "onnx": exporter.export_onnx,
        "gguf": lambda: exporter.export_gguf(opts.quantization),
        "coreml": exporter.export_coreml,
        "transformers": exporter.export_transformers,
    }
    action = actions.get(opts.format)
    if action is not None:
        action()
363
+
364
+
365
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()