{
  "deployment_metadata": {
    "model_name": "Helion-V2.0-Thinking",
    "version": "2.0.0",
    "deployment_date": "2024-11-27",
    "supported_frameworks": ["transformers", "vllm", "text-generation-inference", "ollama"],
    "minimum_transformers_version": "4.36.0"
  },
  
  "server_configurations": {
    "development": {
      "environment": "dev",
      "host": "0.0.0.0",
      "port": 8000,
      "workers": 1,
      "max_batch_size": 1,
      "max_concurrent_requests": 4,
      "timeout_seconds": 300,
      "enable_cors": true,
      "cors_origins": ["*"],
      "log_level": "DEBUG",
      "cache_enabled": true,
      "metrics_enabled": true
    },
    "production": {
      "environment": "prod",
      "host": "0.0.0.0",
      "port": 8000,
      "workers": 4,
      "max_batch_size": 8,
      "max_concurrent_requests": 32,
      "timeout_seconds": 180,
      "enable_cors": true,
      "cors_origins": ["https://yourdomain.com"],
      "log_level": "INFO",
      "cache_enabled": true,
      "metrics_enabled": true,
      "health_check_enabled": true,
      "auto_scaling": true
    }
  },
  
  "vllm_config": {
    "gpu_memory_utilization": 0.9,
    "max_num_seqs": 256,
    "max_num_batched_tokens": 8192,
    "max_model_len": 200000,
    "trust_remote_code": true,
    "tensor_parallel_size": 1,
    "pipeline_parallel_size": 1,
    "dtype": "bfloat16",
    "quantization": null,
    "enforce_eager": false,
    "enable_chunked_prefill": true,
    "max_num_on_the_fly": 8,
    "enable_prefix_caching": true,
    "disable_custom_all_reduce": false
  },
  
  "text_generation_inference": {
    "max_concurrent_requests": 128,
    "max_best_of": 4,
    "max_stop_sequences": 4,
    "max_input_length": 199000,
    "max_total_tokens": 200000,
    "waiting_served_ratio": 1.2,
    "max_batch_prefill_tokens": 4096,
    "max_batch_total_tokens": 200000,
    "max_waiting_tokens": 20,
    "hostname": "0.0.0.0",
    "port": 8080,
    "master_shard_uds_path": "/tmp/text-generation-server",
    "tokenizer_name": "DeepXR/Helion-V2.0-Thinking",
    "revision": "main",
    "validation_workers": 2,
    "json_output": false,
    "otlp_endpoint": null,
    "cors_allow_origin": "*",
    "watermark_gamma": null,
    "watermark_delta": null
  },
  
  "ollama_modelfile": {
    "from": "DeepXR/Helion-V2.0-Thinking",
    "template": "[INST] {{ .System }} {{ .Prompt }} [/INST]",
    "parameter": {
      "temperature": 0.7,
      "top_p": 0.9,
      "top_k": 50,
      "num_ctx": 200000,
      "num_predict": 2048,
      "stop": ["</s>", "<|end|>"],
      "repeat_penalty": 1.1,
      "seed": -1
    },
    "system": "You are Helion, a helpful AI assistant with vision and tool use capabilities."
  },
  
  "api_endpoints": {
    "generate": {
      "path": "/v1/generate",
      "method": "POST",
      "rate_limit": "100/minute",
      "request_schema": {
        "prompt": "string (required)",
        "max_tokens": "integer (optional, default: 1024)",
        "temperature": "float (optional, default: 0.7)",
        "top_p": "float (optional, default: 0.9)",
        "stream": "boolean (optional, default: false)",
        "images": "array<base64> (optional)"
      }
    },
    "chat": {
      "path": "/v1/chat/completions",
      "method": "POST",
      "rate_limit": "100/minute",
      "openai_compatible": true,
      "request_schema": {
        "messages": "array (required)",
        "model": "string (required)",
        "temperature": "float (optional)",
        "stream": "boolean (optional)"
      }
    },
    "embeddings": {
      "path": "/v1/embeddings",
      "method": "POST",
      "rate_limit": "200/minute",
      "enabled": false
    },
    "health": {
      "path": "/health",
      "method": "GET",
      "public": true
    },
    "metrics": {
      "path": "/metrics",
      "method": "GET",
      "format": "prometheus",
      "public": false
    }
  },
  
  "load_balancing": {
    "strategy": "round_robin",
    "health_check_interval_seconds": 30,
    "unhealthy_threshold": 3,
    "healthy_threshold": 2,
    "sticky_sessions": false,
    "session_affinity_ttl_seconds": 3600
  },
  
  "caching": {
    "enabled": true,
    "backend": "redis",
    "redis": {
      "host": "localhost",
      "port": 6379,
      "db": 0,
      "password": null,
      "ssl": false,
      "ttl_seconds": 3600,
      "max_connections": 50
    },
    "cache_keys": {
      "prompt_prefix": "helion:prompt:",
      "result_prefix": "helion:result:",
      "metrics_prefix": "helion:metrics:"
    },
    "cache_policies": {
      "identical_prompts": true,
      "similar_prompts": false,
      "max_cache_size_mb": 1024
    }
  },
  
  "monitoring": {
    "prometheus": {
      "enabled": true,
      "port": 9090,
      "metrics": [
        "request_count",
        "request_duration_seconds",
        "token_generation_rate",
        "gpu_memory_usage",
        "active_requests",
        "queue_size"
      ]
    },
    "logging": {
      "format": "json",
      "output": "stdout",
      "level": "INFO",
      "include_request_body": false,
      "include_response_body": false,
      "log_rotation": {
        "enabled": true,
        "max_size_mb": 100,
        "max_files": 10
      }
    },
    "tracing": {
      "enabled": false,
      "backend": "jaeger",
      "sampling_rate": 0.1
    }
  },
  
  "security": {
    "authentication": {
      "enabled": true,
      "type": "api_key",
      "api_key_header": "X-API-Key",
      "rate_limiting": true
    },
    "rate_limiting": {
      "enabled": true,
      "requests_per_minute": 100,
      "requests_per_hour": 5000,
      "burst_size": 10,
      "strategy": "sliding_window"
    },
    "input_validation": {
      "max_prompt_length": 199000,
      "max_image_size_mb": 20,
      "max_images_per_request": 10,
      "allowed_image_formats": ["jpg", "jpeg", "png", "webp"],
      "sanitize_inputs": true
    },
    "output_filtering": {
      "enabled": true,
      "pii_detection": true,
      "toxicity_filtering": true,
      "content_policy_enforcement": true
    }
  },
  
  "resource_management": {
    "gpu": {
      "memory_fraction": 0.95,
      "allow_growth": true,
      "per_process_gpu_memory_fraction": 0.9,
      "visible_devices": "0",
      "multi_gpu_strategy": "model_parallel"
    },
    "cpu": {
      "num_threads": 8,
      "num_workers": 4,
      "affinity_enabled": false
    },
    "memory": {
      "max_memory_gb": 64,
      "swap_enabled": false,
      "oom_handling": "graceful_degradation"
    }
  },
  
  "auto_scaling": {
    "enabled": false,
    "min_replicas": 1,
    "max_replicas": 10,
    "target_gpu_utilization": 0.7,
    "target_request_rate": 50,
    "scale_up_threshold": 0.8,
    "scale_down_threshold": 0.3,
    "cooldown_period_seconds": 300
  },
  
  "backup_and_recovery": {
    "checkpoint_enabled": false,
    "checkpoint_interval_hours": 24,
    "checkpoint_path": "/data/checkpoints",
    "max_checkpoints": 5,
    "recovery_strategy": "latest_checkpoint"
  },
  
  "experimental_features": {
    "speculative_decoding": false,
    "continuous_batching": true,
    "dynamic_batching": true,
    "model_compilation": false,
    "mixed_precision": true,
    "gradient_checkpointing": false
  },
  
  "model_serving_options": {
    "triton_inference_server": {
      "enabled": false,
      "model_repository": "/models",
      "backend": "python",
      "max_batch_size": 8,
      "dynamic_batching": true
    },
    "torchserve": {
      "enabled": false,
      "model_store": "/model_store",
      "batch_size": 4,
      "workers": 2
    },
    "ray_serve": {
      "enabled": false,
      "num_replicas": 2,
      "max_concurrent_queries": 16
    }
  },
  
  "cloud_deployment": {
    "aws": {
      "instance_type": "p4d.24xlarge",
      "region": "us-east-1",
      "use_spot_instances": false,
      "s3_model_path": "s3://your-bucket/models/helion-v2-thinking"
    },
    "gcp": {
      "machine_type": "a2-highgpu-8g",
      "region": "us-central1",
      "preemptible": false,
      "gcs_model_path": "gs://your-bucket/models/helion-v2-thinking"
    },
    "azure": {
      "vm_size": "Standard_NC96ads_A100_v4",
      "region": "eastus",
      "spot_instance": false,
      "blob_model_path": "https://your-storage.blob.core.windows.net/models/helion-v2-thinking"
    }
  }
}