"""
Helion-2.5-Rnd Inference Server

High-performance inference server with a vLLM backend
"""

import argparse
import json
import logging
import os
import time
from typing import AsyncGenerator, Dict, List, Optional, Union

import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
from vllm.utils import random_uuid

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class ChatMessage(BaseModel):
    """Chat message format"""
    role: str = Field(..., description="Role: system, user, or assistant")
    content: str = Field(..., description="Message content")


class ChatCompletionRequest(BaseModel):
    """Chat completion request format"""
    model: str = Field(default="DeepXR/Helion-2.5-Rnd")
    messages: List[ChatMessage]
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    top_k: int = Field(default=50, ge=0)
    max_tokens: int = Field(default=4096, ge=1)
    stream: bool = Field(default=False)
    stop: Optional[List[str]] = None
    presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0)
    frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0)
    repetition_penalty: float = Field(default=1.1, ge=1.0, le=2.0)
    n: int = Field(default=1, ge=1, le=10)
    logprobs: Optional[int] = None
    echo: bool = Field(default=False)
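
# For illustration, a minimal request body the model above accepts (a sketch,
# not part of the server; every field except `messages` falls back to its
# default):
#
#   {
#       "messages": [
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": "Summarize vLLM in one sentence."}
#       ],
#       "temperature": 0.7,
#       "stream": false
#   }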


class CompletionRequest(BaseModel):
    """Text completion request format"""
    model: str = Field(default="DeepXR/Helion-2.5-Rnd")
    prompt: Union[str, List[str]]
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    max_tokens: int = Field(default=4096, ge=1)
    stream: bool = Field(default=False)
    stop: Optional[List[str]] = None
    n: int = Field(default=1, ge=1, le=10)


class HelionInferenceServer:
    """Main inference server class"""

    def __init__(
        self,
        model_path: str,
        tensor_parallel_size: int = 2,
        max_model_len: int = 131072,
        gpu_memory_utilization: float = 0.95,
        dtype: str = "bfloat16"
    ):
        self.model_path = model_path
        self.model_name = "DeepXR/Helion-2.5-Rnd"

        engine_args = AsyncEngineArgs(
            model=model_path,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            gpu_memory_utilization=gpu_memory_utilization,
            dtype=dtype,
            trust_remote_code=True,
            enforce_eager=False,
            disable_log_stats=False,
        )

        logger.info(f"Initializing Helion-2.5-Rnd from {model_path}")
        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
        logger.info("Engine initialized successfully")

        self.request_count = 0
        self.start_time = time.time()

    def format_chat_prompt(self, messages: List[ChatMessage]) -> str:
        """Format chat messages into a ChatML-style prompt"""
        formatted = ""
        for msg in messages:
            formatted += f"<|im_start|>{msg.role}\n{msg.content}<|im_end|>\n"
        formatted += "<|im_start|>assistant\n"
        return formatted
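
    # For illustration: given a single user turn, format_chat_prompt renders
    #
    #   <|im_start|>user
    #   Hello<|im_end|>
    #   <|im_start|>assistant
    #
    # leaving the prompt open for the model to write the assistant reply.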

    async def generate(
        self,
        prompt: str,
        sampling_params: SamplingParams,
        request_id: str
    ) -> AsyncGenerator[str, None]:
        """Stream generated text for a prompt (generic helper; the HTTP
        endpoints below call the engine directly)."""
        results_generator = self.engine.generate(
            prompt,
            sampling_params,
            request_id
        )

        # vLLM yields the full output accumulated so far on each step.
        async for request_output in results_generator:
            text = request_output.outputs[0].text
            yield text

    async def chat_completion(
        self,
        request: ChatCompletionRequest
    ) -> Union[Dict, AsyncGenerator]:
        """Handle a chat completion request"""
        request_id = f"helion-{random_uuid()}"
        self.request_count += 1

        prompt = self.format_chat_prompt(request.messages)

        sampling_params = SamplingParams(
            temperature=request.temperature,
            top_p=request.top_p,
            top_k=request.top_k,
            max_tokens=request.max_tokens,
            stop=request.stop or ["<|im_end|>", "<|endoftext|>"],
            presence_penalty=request.presence_penalty,
            frequency_penalty=request.frequency_penalty,
            repetition_penalty=request.repetition_penalty,
            n=request.n,
            logprobs=request.logprobs,
        )

        if request.stream:
            # _stream_chat_completion is a coroutine that returns an async
            # generator; it must be awaited here, otherwise StreamingResponse
            # would receive the bare coroutine instead of the generator.
            return await self._stream_chat_completion(
                prompt,
                sampling_params,
                request_id,
                request.model
            )
        else:
            return await self._complete_chat_completion(
                prompt,
                sampling_params,
                request_id,
                request.model
            )

    async def _complete_chat_completion(
        self,
        prompt: str,
        sampling_params: SamplingParams,
        request_id: str,
        model: str
    ) -> Dict:
        """Non-streaming chat completion"""
        final_output = None
        async for request_output in self.engine.generate(
            prompt, sampling_params, request_id
        ):
            final_output = request_output

        if final_output is None:
            raise HTTPException(status_code=500, detail="Generation failed")

        # Build one choice per returned sequence (n may be greater than 1).
        choices = [
            {
                "index": i,
                "message": {
                    "role": "assistant",
                    "content": output.text
                },
                "finish_reason": output.finish_reason
            }
            for i, output in enumerate(final_output.outputs)
        ]

        completion_tokens = sum(len(o.token_ids) for o in final_output.outputs)
        return {
            "id": request_id,
            "object": "chat.completion",
            "created": int(time.time()),
            "model": model,
            "choices": choices,
            "usage": {
                "prompt_tokens": len(final_output.prompt_token_ids),
                "completion_tokens": completion_tokens,
                "total_tokens": len(final_output.prompt_token_ids) + completion_tokens
            }
        }

    async def _stream_chat_completion(
        self,
        prompt: str,
        sampling_params: SamplingParams,
        request_id: str,
        model: str
    ) -> AsyncGenerator:
        """Streaming chat completion (server-sent events)"""
        async def generate():
            previous_text = ""
            finish_reason = None
            async for request_output in self.engine.generate(
                prompt, sampling_params, request_id
            ):
                # Each step carries the cumulative text; emit only the delta.
                text = request_output.outputs[0].text
                delta = text[len(previous_text):]
                previous_text = text
                finish_reason = request_output.outputs[0].finish_reason

                chunk = {
                    "id": request_id,
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": model,
                    "choices": [{
                        "index": 0,
                        "delta": {"content": delta},
                        "finish_reason": None
                    }]
                }
                yield f"data: {json.dumps(chunk)}\n\n"

            # Report the engine's actual finish reason (e.g. "length" when
            # truncated by max_tokens) rather than always claiming "stop".
            final_chunk = {
                "id": request_id,
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": model,
                "choices": [{
                    "index": 0,
                    "delta": {},
                    "finish_reason": finish_reason or "stop"
                }]
            }
            yield f"data: {json.dumps(final_chunk)}\n\n"
            yield "data: [DONE]\n\n"

        return generate()
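

# For illustration: on the wire, a streaming response from the generator above
# is a series of server-sent events, each a JSON chunk prefixed with "data: ":
#
#   data: {"id": "helion-...", "object": "chat.completion.chunk", "choices": [{"delta": {"content": "Hel"}, ...}]}
#   data: {"id": "helion-...", "object": "chat.completion.chunk", "choices": [{"delta": {}, "finish_reason": "stop"}]}
#   data: [DONE]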


app = FastAPI(
    title="Helion-2.5-Rnd Inference API",
    description="Advanced language model inference server",
    version="2.5.0-rnd"
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

server: Optional[HelionInferenceServer] = None


@app.on_event("startup")
async def startup_event():
    """Initialize the model on startup"""
    global server

    model_path = os.getenv("MODEL_PATH", "/models/helion")
    tensor_parallel = int(os.getenv("TENSOR_PARALLEL_SIZE", "2"))
    max_len = int(os.getenv("MAX_MODEL_LEN", "131072"))
    gpu_util = float(os.getenv("GPU_MEMORY_UTILIZATION", "0.95"))

    server = HelionInferenceServer(
        model_path=model_path,
        tensor_parallel_size=tensor_parallel,
        max_model_len=max_len,
        gpu_memory_utilization=gpu_util
    )
    logger.info("Helion-2.5-Rnd server started successfully")


@app.get("/")
async def root():
    """Root endpoint"""
    return {
        "model": "DeepXR/Helion-2.5-Rnd",
        "version": "2.5.0-rnd",
        "status": "ready",
        "type": "research"
    }


@app.get("/health")
async def health():
    """Health check endpoint"""
    if server is None:
        raise HTTPException(status_code=503, detail="Server not initialized")

    return {
        "status": "healthy",
        "model": server.model_name,
        "requests_served": server.request_count,
        "uptime_seconds": int(time.time() - server.start_time)
    }


@app.get("/v1/models")
async def list_models():
    """List available models"""
    return {
        "object": "list",
        "data": [{
            "id": "DeepXR/Helion-2.5-Rnd",
            "object": "model",
            "created": int(time.time()),
            "owned_by": "DeepXR"
        }]
    }


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """Chat completion endpoint"""
    if server is None:
        raise HTTPException(status_code=503, detail="Server not initialized")

    try:
        result = await server.chat_completion(request)

        if request.stream:
            return StreamingResponse(
                result,
                media_type="text/event-stream"
            )
        else:
            return JSONResponse(content=result)

    except HTTPException:
        # Pass deliberate HTTP errors through unchanged instead of
        # flattening them into a generic 500.
        raise
    except Exception as e:
        logger.error(f"Error in chat completion: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
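

# Example client call (a sketch: assumes the server is reachable on
# localhost:8000 and that the third-party `requests` package is installed;
# neither is required by this module):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/v1/chat/completions",
#       json={"messages": [{"role": "user", "content": "Hello"}]},
#   )
#   print(resp.json()["choices"][0]["message"]["content"])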


@app.post("/v1/completions")
async def completions(request: CompletionRequest):
    """Text completion endpoint"""
    if server is None:
        raise HTTPException(status_code=503, detail="Server not initialized")

    # The prompt may arrive as a string or a list of strings; this chat-based
    # pipeline handles a single prompt only, so batches are rejected.
    if isinstance(request.prompt, list):
        if len(request.prompt) != 1:
            raise HTTPException(
                status_code=400,
                detail="Batched prompts are not supported"
            )
        prompt = request.prompt[0]
    else:
        prompt = request.prompt

    messages = [ChatMessage(role="user", content=prompt)]
    chat_request = ChatCompletionRequest(
        model=request.model,
        messages=messages,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens,
        stream=request.stream,
        stop=request.stop,
        n=request.n
    )

    return await chat_completions(chat_request)


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(description="Helion-2.5-Rnd Inference Server")
    parser.add_argument("--model", type=str, default="/models/helion")
    parser.add_argument("--host", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--tensor-parallel-size", type=int, default=2)
    parser.add_argument("--max-model-len", type=int, default=131072)
    parser.add_argument("--gpu-memory-utilization", type=float, default=0.95)

    args = parser.parse_args()

    # The startup hook reads its configuration from the environment, so the
    # CLI flags are forwarded through environment variables here.
    os.environ["MODEL_PATH"] = args.model
    os.environ["TENSOR_PARALLEL_SIZE"] = str(args.tensor_parallel_size)
    os.environ["MAX_MODEL_LEN"] = str(args.max_model_len)
    os.environ["GPU_MEMORY_UTILIZATION"] = str(args.gpu_memory_utilization)

    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level="info",
        access_log=True
    )


if __name__ == "__main__":
    main()
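

# Example invocation (a sketch; the script name, model path, and GPU count
# depend on your deployment):
#
#   python inference_server.py --model /models/helion \
#       --tensor-parallel-size 2 --port 8000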