Ranjit0034 committed on
Commit
114a2fc
·
verified ·
1 Parent(s): 1cba4da

Upload scripts/export_model.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/export_model.py +366 -0
scripts/export_model.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model Export for Production Deployment
4
+ =======================================
5
+
6
+ Export FinEE model to various formats:
7
+ - ONNX (cross-platform)
8
+ - GGUF (llama.cpp, mobile)
9
+ - CoreML (iOS/macOS)
10
+ - TensorRT (NVIDIA inference)
11
+
12
+ Author: Ranjit Behera
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ import json
18
+ import shutil
19
+ import subprocess
20
+ from pathlib import Path
21
+ from typing import Optional, List
22
+ import argparse
23
+
24
+
25
class ModelExporter:
    """
    Export models to production-ready formats.

    Supported targets:
    - onnx        : cross-platform inference via ONNX Runtime
    - gguf        : llama.cpp / mobile CPU inference
    - coreml      : iOS/macOS on-device inference
    - transformers: standard Hugging Face safetensors layout
    ("tensorrt" is advertised in SUPPORTED_FORMATS but has no exporter
    implemented here yet.)
    """

    # Formats accepted by the CLI --format flag ("all" is handled separately).
    SUPPORTED_FORMATS = ["onnx", "gguf", "coreml", "tensorrt", "transformers"]

    def __init__(self, model_path: Path, output_dir: Path):
        """
        Args:
            model_path: Local directory (or Hugging Face hub id) of the model.
            output_dir: Root directory for exported artifacts; created if missing.
        """
        self.model_path = Path(model_path)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def export_onnx(
        self,
        opset_version: int = 14,
        optimize: bool = True,
    ) -> Optional[Path]:
        """
        Export to ONNX format.

        ONNX provides:
        - Cross-platform inference (CPU, GPU, mobile)
        - Python, C++, C#, Java, JavaScript runtimes
        - Optimized for ONNX Runtime

        Args:
            opset_version: ONNX opset to target (forwarded to the optimum
                exporter; previously accepted but silently ignored).
            optimize: Run ONNX Runtime graph optimization after export.

        Returns:
            Path to the exported ONNX directory, or None on failure.

        Requirements: transformers, optimum
        """
        print("🔄 Exporting to ONNX...")

        try:
            # Imported only as an availability check so missing optimum /
            # transformers fails fast with a clear install hint below.
            from optimum.onnxruntime import ORTModelForCausalLM  # noqa: F401
            from transformers import AutoTokenizer  # noqa: F401

            print(f" Loading model from {self.model_path}")

            output_path = self.output_dir / "onnx"
            output_path.mkdir(exist_ok=True)

            # Use the optimum CLI for the actual export; it handles the
            # task-specific graph export and tokenizer/config copying.
            cmd = [
                sys.executable, "-m", "optimum.exporters.onnx",
                "--model", str(self.model_path),
                "--task", "text-generation",
                "--opset", str(opset_version),  # fix: was never passed before
                str(output_path),
            ]

            subprocess.run(cmd, check=True)
            print(f"✅ ONNX model exported to {output_path}")

            if optimize:
                self._optimize_onnx(output_path)

            return output_path

        except ImportError:
            print("❌ Install optimum: pip install optimum[onnxruntime]")
            return None
        except Exception as e:
            print(f"❌ ONNX export failed: {e}")
            return None

    def _optimize_onnx(self, model_dir: Path):
        """Run ONNX Runtime graph optimization on the exported model.

        Best-effort: failures are reported but never raised, since the
        unoptimized model remains usable.
        """
        try:
            from onnxruntime.transformers import optimizer

            model_path = model_dir / "model.onnx"
            if model_path.exists():
                optimized_path = model_dir / "model_optimized.onnx"
                # NOTE(review): model_type / num_heads / hidden_size are
                # hard-coded for a 32-head, 4096-dim architecture — confirm
                # they match the actual checkpoint before trusting the output.
                opt_model = optimizer.optimize_model(
                    str(model_path),
                    model_type="gpt2",  # or bert, etc.
                    num_heads=32,
                    hidden_size=4096,
                )
                opt_model.save_model_to_file(str(optimized_path))
                print(f" Optimized model saved to {optimized_path}")
        except Exception as e:
            print(f" ⚠️ Optimization failed: {e}")

    def export_gguf(
        self,
        quantization: str = "q4_k_m",
    ) -> Optional[Path]:
        """
        Export to GGUF format for llama.cpp.

        GGUF provides:
        - Fast CPU inference
        - Low memory usage
        - Mobile deployment (Android, iOS)
        - Various quantization levels

        Args:
            quantization: GGUF output/quantization type (e.g. "q4_k_m").

        Returns:
            Path to the GGUF output directory, or None when the conversion
            tooling is unavailable or conversion fails.

        Requirements: llama.cpp convert tools on PATH
        """
        print(f"🔄 Exporting to GGUF ({quantization})...")

        output_path = self.output_dir / "gguf"
        output_path.mkdir(exist_ok=True)

        try:
            # llama.cpp's converter must be installed and on PATH.
            convert_script = shutil.which("convert-hf-to-gguf")

            if convert_script is None:
                # Fix: the previous fallback imported llama_cpp but never used
                # it — llama-cpp-python only loads GGUF files, it cannot
                # create them, so there is no in-process fallback.
                print(" ⚠️ llama.cpp convert tools not found")
                print(" Install: git clone https://github.com/ggerganov/llama.cpp && make")
                return None

            cmd = [
                convert_script,
                str(self.model_path),
                "--outfile", str(output_path / "model.gguf"),
                "--outtype", quantization,
            ]
            subprocess.run(cmd, check=True)

            print(f"✅ GGUF model exported to {output_path}")
            return output_path

        except Exception as e:
            print(f"❌ GGUF export failed: {e}")
            print(" To convert to GGUF:")
            print(" 1. Clone llama.cpp: git clone https://github.com/ggerganov/llama.cpp")
            print(" 2. Run: python convert-hf-to-gguf.py <model_path> --outtype q4_k_m")
            return None

    def export_coreml(self) -> Optional[Path]:
        """
        Export to CoreML for iOS/macOS.

        Returns:
            Path to the CoreML output directory, or None on failure.

        Requirements: coremltools, transformers, torch
        """
        print("🔄 Exporting to CoreML...")

        output_path = self.output_dir / "coreml"
        output_path.mkdir(exist_ok=True)

        try:
            import coremltools as ct
            from transformers import AutoModelForCausalLM, AutoTokenizer
            import torch

            # Load in float32 for tracing/conversion.
            model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.float32,
            )
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # Trace with a tiny example input so torch.jit can record the graph.
            example_input = tokenizer("Hello", return_tensors="pt")
            traced = torch.jit.trace(model, (example_input.input_ids,))

            # Convert with a flexible sequence length of up to 512 tokens.
            mlmodel = ct.convert(
                traced,
                inputs=[ct.TensorType(name="input_ids", shape=(1, ct.RangeDim(1, 512)))],
                minimum_deployment_target=ct.target.iOS16,
            )

            # coremltools expects a string path for the .mlpackage output.
            mlmodel.save(str(output_path / "model.mlpackage"))
            print(f"✅ CoreML model exported to {output_path}")
            return output_path

        except ImportError:
            print("❌ Install coremltools: pip install coremltools")
            return None
        except Exception as e:
            print(f"❌ CoreML export failed: {e}")
            return None

    def export_transformers(self) -> Optional[Path]:
        """
        Export as standard Transformers format (Safetensors).

        This is the most compatible format for Hugging Face.

        Returns:
            Path to the output directory, or None on failure.
        """
        print("🔄 Exporting to Transformers format...")

        output_path = self.output_dir / "transformers"
        output_path.mkdir(exist_ok=True)

        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer

            model = AutoModelForCausalLM.from_pretrained(self.model_path)
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # safe_serialization=True writes .safetensors instead of
            # pickle-based .bin weight files.
            model.save_pretrained(output_path, safe_serialization=True)
            tokenizer.save_pretrained(output_path)

            print(f"✅ Transformers model exported to {output_path}")
            return output_path

        except Exception as e:
            print(f"❌ Export failed: {e}")
            return None

    def create_inference_code(self) -> Path:
        """Generate a runnable inference example script for each format.

        Fixes over the previous version: every generated script that calls
        json.loads now imports json, and the ONNX example no longer calls
        an undefined parse_json helper.

        Returns:
            Path to the directory containing the example scripts.
        """
        code_path = self.output_dir / "inference_examples"
        code_path.mkdir(exist_ok=True)

        # ONNX Runtime example.
        onnx_code = '''"""ONNX Runtime Inference"""
import json
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Load
session = ort.InferenceSession("model.onnx")
tokenizer = AutoTokenizer.from_pretrained(".")

# Inference
def extract(text: str) -> dict:
    inputs = tokenizer(text, return_tensors="np")
    outputs = session.run(None, {"input_ids": inputs["input_ids"]})
    # Decode and parse
    result = tokenizer.decode(outputs[0][0])
    return json.loads(result.split("JSON:")[-1])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "onnx_inference.py", 'w') as f:
            f.write(onnx_code)

        # llama.cpp (GGUF) example.
        gguf_code = '''"""llama.cpp Inference"""
import json
from llama_cpp import Llama

# Load
llm = Llama(model_path="model.gguf", n_ctx=512, n_gpu_layers=0)

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    output = llm(prompt, max_tokens=256, stop=["\\n\\n"])
    return json.loads(output["choices"][0]["text"])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "gguf_inference.py", 'w') as f:
            f.write(gguf_code)

        # Hugging Face Transformers example.
        hf_code = '''"""Hugging Face Transformers Inference"""
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load
model = AutoModelForCausalLM.from_pretrained(".")
tokenizer = AutoTokenizer.from_pretrained(".")

# Inference
def extract(text: str) -> dict:
    prompt = f"Extract entities from: {text}\\nJSON:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=256)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return json.loads(result.split("JSON:")[-1])

# Usage
result = extract("HDFC Bank Rs.500 debited")
print(result)
'''

        with open(code_path / "transformers_inference.py", 'w') as f:
            f.write(hf_code)

        print(f"✅ Inference examples saved to {code_path}")
        return code_path

    def export_all(self) -> dict:
        """Export to every automated format and generate inference examples.

        CoreML is excluded: it requires platform-specific tooling and is
        only run when requested explicitly via the CLI.

        Returns:
            Mapping of format name to output path (None where export failed).
        """
        results = {}

        # Dispatch table keeps per-format branching out of the loop body.
        exporters = {
            "transformers": self.export_transformers,
            "onnx": self.export_onnx,
            "gguf": self.export_gguf,
        }

        for fmt, run_export in exporters.items():
            try:
                results[fmt] = run_export()
            except Exception as e:
                # An individual failure must not abort the remaining exports.
                results[fmt] = None
                print(f"⚠️ {fmt} export failed: {e}")

        self.create_inference_code()
        return results
338
+
339
+
340
def main():
    """CLI entry point: parse arguments and run the requested export(s)."""
    arg_parser = argparse.ArgumentParser(description="Export model to production formats")
    arg_parser.add_argument("model_path", help="Path to model")
    arg_parser.add_argument("--output", "-o", default="exports", help="Output directory")
    arg_parser.add_argument("--format", "-f", choices=ModelExporter.SUPPORTED_FORMATS + ["all"],
                            default="all", help="Export format")
    arg_parser.add_argument("--quantization", "-q", default="q4_k_m",
                            help="GGUF quantization type")

    opts = arg_parser.parse_args()

    exporter = ModelExporter(Path(opts.model_path), Path(opts.output))

    # Dispatch table instead of an if/elif chain. "tensorrt" deliberately
    # has no entry and falls through as a no-op, matching the original
    # branch structure.
    actions = {
        "all": exporter.export_all,
        "onnx": exporter.export_onnx,
        "gguf": lambda: exporter.export_gguf(opts.quantization),
        "coreml": exporter.export_coreml,
        "transformers": exporter.export_transformers,
    }
    action = actions.get(opts.format)
    if action is not None:
        action()
363
+
364
+
365
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()