Trouter-Library committed on
Commit f1a0ba2 · verified · 1 Parent(s): c1ead59

Create deployment.py

Files changed (1)
  1. deployment.py +345 -0
deployment.py ADDED
@@ -0,0 +1,345 @@
"""
Helion-V1 Production Deployment Script
Optimized for serving with vLLM, TGI, or custom inference servers
"""

import os
import json
import logging
from typing import Dict, List, Optional
from dataclasses import dataclass
import asyncio

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class DeploymentConfig:
    """Configuration for model deployment."""
    model_name: str = "DeepXR/Helion-V1"
    tensor_parallel_size: int = 1
    max_model_len: int = 4096
    max_num_seqs: int = 256
    gpu_memory_utilization: float = 0.90
    trust_remote_code: bool = True
    quantization: Optional[str] = None  # "awq", "gptq", or None
    dtype: str = "bfloat16"
    enforce_eager: bool = False

    # Safety settings
    max_tokens: int = 2048
    temperature: float = 0.7
    top_p: float = 0.9
    frequency_penalty: float = 0.1
    presence_penalty: float = 0.1

    # Rate limiting
    rate_limit_requests_per_minute: int = 60
    rate_limit_tokens_per_minute: int = 90000

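# Example (sketch, illustrative values only): overriding the defaults for a
# two-GPU, AWQ-quantized deployment. AWQ kernels generally expect float16,
# and the memory fraction should match the hardware actually available.
#
#   config = DeploymentConfig(
#       tensor_parallel_size=2,
#       quantization="awq",
#       dtype="float16",
#       gpu_memory_utilization=0.85,
#   )
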
class HelionDeployment:
    """
    Production deployment handler for Helion-V1.
    Supports vLLM, Text Generation Inference, and custom servers.
    """

    def __init__(self, config: DeploymentConfig):
        self.config = config
        self.model = None
        self.tokenizer = None

    def deploy_vllm(self):
        """Deploy using vLLM for high-throughput inference."""
        try:
            from vllm import LLM, SamplingParams

            logger.info("Initializing vLLM engine...")

            self.model = LLM(
                model=self.config.model_name,
                tensor_parallel_size=self.config.tensor_parallel_size,
                max_model_len=self.config.max_model_len,
                max_num_seqs=self.config.max_num_seqs,
                gpu_memory_utilization=self.config.gpu_memory_utilization,
                trust_remote_code=self.config.trust_remote_code,
                quantization=self.config.quantization,
                dtype=self.config.dtype,
                enforce_eager=self.config.enforce_eager
            )

            logger.info("✅ vLLM engine initialized successfully")
            return True

        except ImportError:
            logger.error("vLLM not installed. Install with: pip install vllm")
            return False
        except Exception as e:
            logger.error(f"Failed to initialize vLLM: {e}")
            return False

    def get_sampling_params(self) -> 'SamplingParams':
        """Get vLLM sampling parameters."""
        from vllm import SamplingParams

        return SamplingParams(
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            max_tokens=self.config.max_tokens,
            frequency_penalty=self.config.frequency_penalty,
            presence_penalty=self.config.presence_penalty
        )

    def generate_vllm(self, prompts: List[str]) -> List[str]:
        """Generate responses using vLLM."""
        if not self.model:
            raise RuntimeError("Model not initialized. Call deploy_vllm() first.")

        sampling_params = self.get_sampling_params()
        outputs = self.model.generate(prompts, sampling_params)

        return [output.outputs[0].text for output in outputs]

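    # Example (sketch) of offline batch generation with the vLLM backend; assumes
    # the DeepXR/Helion-V1 weights can be resolved from the Hub or a local path:
    #
    #   deployment = HelionDeployment(DeploymentConfig())
    #   if deployment.deploy_vllm():
    #       replies = deployment.generate_vllm(["Summarize vLLM in one sentence."])
    #       print(replies[0])
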
    def create_fastapi_server(self):
        """Create FastAPI server for HTTP API."""
        try:
            from fastapi import FastAPI, HTTPException
            from fastapi.middleware.cors import CORSMiddleware
            from pydantic import BaseModel
            from transformers import AutoTokenizer
            import uvicorn

            app = FastAPI(
                title="Helion-V1 API",
                description="Safe and helpful AI assistant API",
                version="1.0.0"
            )

            # CORS middleware
            app.add_middleware(
                CORSMiddleware,
                allow_origins=["*"],
                allow_credentials=True,
                allow_methods=["*"],
                allow_headers=["*"],
            )

            # Load the tokenizer once at startup rather than on every request
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

            # Request schema (note: max_tokens/temperature/top_p are accepted but
            # generation currently uses the DeploymentConfig defaults)
            class ChatRequest(BaseModel):
                messages: List[Dict[str, str]]
                max_tokens: Optional[int] = 512
                temperature: Optional[float] = 0.7
                top_p: Optional[float] = 0.9

            class ChatResponse(BaseModel):
                response: str
                model: str
                usage: Dict[str, int]

            @app.post("/v1/chat/completions", response_model=ChatResponse)
            async def chat_completion(request: ChatRequest):
                """OpenAI-compatible chat completion endpoint."""
                try:
                    # Format messages with the model's chat template
                    prompt = tokenizer.apply_chat_template(
                        request.messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )

                    # Generate response
                    responses = self.generate_vllm([prompt])

                    return ChatResponse(
                        response=responses[0],
                        model=self.config.model_name,
                        usage={
                            "prompt_tokens": len(tokenizer.encode(prompt)),
                            "completion_tokens": len(tokenizer.encode(responses[0])),
                            "total_tokens": len(tokenizer.encode(prompt + responses[0]))
                        }
                    )

                except Exception as e:
                    logger.error(f"Generation error: {e}")
                    raise HTTPException(status_code=500, detail=str(e))

            @app.get("/health")
            async def health_check():
                """Health check endpoint."""
                return {"status": "healthy", "model": self.config.model_name}

            @app.get("/")
            async def root():
                """Root endpoint."""
                return {
                    "name": "Helion-V1 API",
                    "version": "1.0.0",
                    "status": "online"
                }

            return app

        except ImportError:
            logger.error("FastAPI not installed. Install with: pip install fastapi uvicorn")
            return None

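    # Example request against the chat endpoint once the server is running
    # (host/port follow the uvicorn defaults in main(); payload fields follow ChatRequest):
    #
    #   curl -X POST http://localhost:8000/v1/chat/completions \
    #     -H "Content-Type: application/json" \
    #     -d '{"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 128}'
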
    def export_onnx(self, output_path: str = "./helion_onnx"):
        """Export model to ONNX format for optimized deployment."""
        try:
            from optimum.onnxruntime import ORTModelForCausalLM
            from transformers import AutoTokenizer

            logger.info("Exporting model to ONNX...")

            model = ORTModelForCausalLM.from_pretrained(
                self.config.model_name,
                export=True
            )
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

            model.save_pretrained(output_path)
            tokenizer.save_pretrained(output_path)

            logger.info(f"✅ Model exported to {output_path}")
            return True

        except ImportError:
            logger.error("Optimum not installed. Install with: pip install optimum[onnxruntime-gpu]")
            return False
        except Exception as e:
            logger.error(f"ONNX export failed: {e}")
            return False

    def create_docker_config(self, output_path: str = "./"):
        """Generate Dockerfile for containerized deployment."""
        dockerfile_content = f"""FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Set working directory
WORKDIR /app

# Install Python and dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \\
    python3.10 \\
    python3-pip \\
    git \\
    curl \\
    && rm -rf /var/lib/apt/lists/*

# Install Python packages
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Install vLLM for high-performance inference
RUN pip3 install vllm

# Copy application code
COPY . .

# Set environment variables
ENV MODEL_NAME={self.config.model_name}
ENV MAX_MODEL_LEN={self.config.max_model_len}
ENV GPU_MEMORY_UTILIZATION={self.config.gpu_memory_utilization}
ENV TENSOR_PARALLEL_SIZE={self.config.tensor_parallel_size}

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \\
    CMD curl -f http://localhost:8000/health || exit 1

# Run the application
CMD ["python3", "deployment.py", "--server"]
"""

        dockerfile_path = os.path.join(output_path, "Dockerfile")
        with open(dockerfile_path, 'w') as f:
            f.write(dockerfile_content)

        # Also create docker-compose.yml
        docker_compose_content = f"""version: '3.8'

services:
  helion-v1:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL_NAME={self.config.model_name}
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    volumes:
      - model_cache:/root/.cache/huggingface
    restart: unless-stopped

volumes:
  model_cache:
"""

        compose_path = os.path.join(output_path, "docker-compose.yml")
        with open(compose_path, 'w') as f:
            f.write(docker_compose_content)

        logger.info(f"✅ Docker configuration created in {output_path}")
        logger.info("Build with: docker-compose build")
        logger.info("Run with: docker-compose up -d")

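# A compose-free alternative (sketch; the image tag is arbitrary):
#   docker build -t helion-v1 .
#   docker run --gpus all -p 8000:8000 helion-v1
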
def main():
    """Main deployment function."""
    import argparse

    parser = argparse.ArgumentParser(description="Deploy Helion-V1")
    parser.add_argument("--model", default="DeepXR/Helion-V1", help="Model name or path")
    parser.add_argument("--backend", choices=["vllm", "tgi", "fastapi"], default="vllm")
    parser.add_argument("--server", action="store_true", help="Start HTTP server")
    parser.add_argument("--export-onnx", action="store_true", help="Export to ONNX")
    parser.add_argument("--create-docker", action="store_true", help="Create Docker config")
    parser.add_argument("--tensor-parallel", type=int, default=1)
    parser.add_argument("--quantization", choices=["awq", "gptq"], default=None)

    args = parser.parse_args()

    # Create config
    config = DeploymentConfig(
        model_name=args.model,
        tensor_parallel_size=args.tensor_parallel,
        quantization=args.quantization
    )

    deployment = HelionDeployment(config)

    if args.export_onnx:
        deployment.export_onnx()

    if args.create_docker:
        deployment.create_docker_config()

    if args.server:
        if args.backend == "vllm":
            if deployment.deploy_vllm():
                app = deployment.create_fastapi_server()
                if app:
                    import uvicorn
                    logger.info("🚀 Starting Helion-V1 server on http://0.0.0.0:8000")
                    uvicorn.run(app, host="0.0.0.0", port=8000)
        else:
            logger.error(f"Backend {args.backend} not implemented yet")
    elif not (args.export_onnx or args.create_docker):
        logger.info("No action specified. Use --help for options.")


if __name__ == "__main__":
    main()
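# Example invocations (sketch; flags as defined in main() above):
#   python deployment.py --server --backend vllm
#   python deployment.py --create-docker
#   python deployment.py --export-onnx --model DeepXR/Helion-V1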