Trouter-Library committed
Commit 7d95305 · verified · 1 Parent(s): a2975bd

Create server.py

Files changed (1):
  1. server.py +419 -0
server.py ADDED
@@ -0,0 +1,419 @@
"""
Helion-V1.5 Production API Server
FastAPI server with OpenAI-compatible endpoints, streaming, and monitoring
"""

import os
import time
import logging
from typing import List, Dict, Optional, AsyncIterator
from contextlib import asynccontextmanager

import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Global model instance
MODEL = None
TOKENIZER = None
SAFEGUARDS = None

class Message(BaseModel):
    """Chat message."""
    role: str = Field(..., description="Message role (system/user/assistant)")
    content: str = Field(..., description="Message content")


class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible chat completion request."""
    model: str = Field(default="DeepXR/Helion-V1.5")
    messages: List[Message]
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    max_tokens: int = Field(default=512, ge=1, le=4096)
    stream: bool = Field(default=False)
    n: int = Field(default=1, ge=1, le=1)
    stop: Optional[List[str]] = None
    presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0)
    frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0)


class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible chat completion response."""
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Dict]
    usage: Dict[str, int]


class CompletionRequest(BaseModel):
    """Text completion request."""
    prompt: str
    max_tokens: int = Field(default=512, ge=1, le=4096)
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    stream: bool = Field(default=False)

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for model loading."""
    global MODEL, TOKENIZER, SAFEGUARDS

    logger.info("Loading Helion-V1.5...")

    from transformers import AutoTokenizer, AutoModelForCausalLM
    from safeguards_v15 import HelionSafeguardSystem, create_safeguard_config

    model_name = os.getenv("MODEL_NAME", "DeepXR/Helion-V1.5")

    TOKENIZER = AutoTokenizer.from_pretrained(model_name)
    if TOKENIZER.pad_token_id is None:
        # Many causal-LM tokenizers ship without a pad token; fall back to
        # EOS so generate() has a valid pad_token_id
        TOKENIZER.pad_token_id = TOKENIZER.eos_token_id

    MODEL = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    MODEL.eval()

    # Initialize safeguards
    safeguard_mode = os.getenv("SAFEGUARD_MODE", "moderate")
    config = create_safeguard_config(mode=safeguard_mode)
    SAFEGUARDS = HelionSafeguardSystem(config)

    logger.info("Model loaded successfully")

    yield

    logger.info("Shutting down...")
    del MODEL
    del TOKENIZER
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Create FastAPI app
app = FastAPI(
    title="Helion-V1.5 API",
    description="OpenAI-compatible API for Helion-V1.5",
    version="1.5.0",
    lifespan=lifespan
)

# CORS middleware
# NOTE: wide-open origins are convenient for testing; restrict
# allow_origins before exposing this server publicly
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Request tracking middleware
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log all requests."""
    start_time = time.time()
    response = await call_next(request)
    duration = time.time() - start_time

    logger.info(
        f"{request.method} {request.url.path} "
        f"completed in {duration:.2f}s with status {response.status_code}"
    )

    return response

def generate_response(
    messages: List[Dict[str, str]],
    max_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.9,
    use_safeguards: bool = True
) -> Dict:
    """Generate a response from messages (synchronous; blocks the event loop)."""

    user_msg = messages[-1]["content"]

    if use_safeguards:
        # Check input with safeguards
        context = " ".join([m["content"] for m in messages[:-1]])

        allowed, response = SAFEGUARDS.filter_message(user_msg, context)

        if not allowed:
            return {
                "text": response,
                "blocked": True,
                "finish_reason": "content_filter"
            }

    # Apply chat template
    input_ids = TOKENIZER.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(MODEL.device)

    # Generate; fall back to greedy decoding when temperature is 0,
    # since sampling with temperature=0 raises an error
    with torch.no_grad():
        output = MODEL.generate(
            input_ids,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=temperature > 0,
            pad_token_id=TOKENIZER.pad_token_id,
            eos_token_id=TOKENIZER.eos_token_id
        )

    # Decode only the newly generated tokens
    response_text = TOKENIZER.decode(
        output[0][input_ids.shape[1]:],
        skip_special_tokens=True
    )

    # Check output with safeguards
    if use_safeguards:
        output_safe, reason = SAFEGUARDS.check_output(response_text, user_msg)
        if not output_safe:
            return {
                "text": SAFEGUARDS.get_refusal_message("default"),
                "blocked": True,
                "finish_reason": "content_filter"
            }

    return {
        "text": response_text.strip(),
        "blocked": False,
        "finish_reason": "stop",
        "prompt_tokens": input_ids.shape[1],
        "completion_tokens": output.shape[1] - input_ids.shape[1],
        "total_tokens": output.shape[1]
    }

async def stream_response(
    messages: List[Dict[str, str]],
    max_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.9
) -> AsyncIterator[str]:
    """Stream response tokens as OpenAI-style server-sent events.

    NOTE: safeguard filtering is applied only in the non-streaming path.
    """
    import json

    # Apply chat template
    input_ids = TOKENIZER.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(MODEL.device)

    # Stream generation
    from transformers import TextIteratorStreamer
    from threading import Thread

    streamer = TextIteratorStreamer(
        TOKENIZER,
        skip_prompt=True,
        skip_special_tokens=True
    )

    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=temperature > 0,
        streamer=streamer,
        pad_token_id=TOKENIZER.pad_token_id,
        eos_token_id=TOKENIZER.eos_token_id
    )

    # Run generation in a background thread so tokens can be yielded
    # as they arrive
    thread = Thread(target=MODEL.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream tokens
    for text in streamer:
        chunk = {
            "id": f"chatcmpl-{int(time.time())}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": "DeepXR/Helion-V1.5",
            "choices": [{
                "index": 0,
                "delta": {"content": text},
                "finish_reason": None
            }]
        }
        yield f"data: {json.dumps(chunk)}\n\n"

    # Final chunk
    final_chunk = {
        "id": f"chatcmpl-{int(time.time())}",
        "object": "chat.completion.chunk",
        "created": int(time.time()),
        "model": "DeepXR/Helion-V1.5",
        "choices": [{
            "index": 0,
            "delta": {},
            "finish_reason": "stop"
        }]
    }
    yield f"data: {json.dumps(final_chunk)}\n\n"
    yield "data: [DONE]\n\n"

@app.get("/")
async def root():
    """Root endpoint."""
    return {
        "name": "Helion-V1.5 API",
        "version": "1.5.0",
        "status": "online",
        "model": "DeepXR/Helion-V1.5"
    }


@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {
        "status": "healthy",
        "model_loaded": MODEL is not None,
        "device": str(MODEL.device) if MODEL is not None else None,
        "safeguards_enabled": SAFEGUARDS is not None
    }


@app.get("/v1/models")
async def list_models():
    """List available models."""
    return {
        "object": "list",
        "data": [{
            "id": "DeepXR/Helion-V1.5",
            "object": "model",
            "created": int(time.time()),
            "owned_by": "deepxr"
        }]
    }

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completions endpoint."""

    if MODEL is None or TOKENIZER is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Convert messages
    messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # Streaming response
    if request.stream:
        return StreamingResponse(
            stream_response(
                messages,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
                top_p=request.top_p
            ),
            media_type="text/event-stream"
        )

    # Non-streaming response
    result = generate_response(
        messages,
        max_tokens=request.max_tokens,
        temperature=request.temperature,
        top_p=request.top_p
    )

    response = ChatCompletionResponse(
        id=f"chatcmpl-{int(time.time())}",
        created=int(time.time()),
        model=request.model,
        choices=[{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": result["text"]
            },
            "finish_reason": result["finish_reason"]
        }],
        usage={
            "prompt_tokens": result.get("prompt_tokens", 0),
            "completion_tokens": result.get("completion_tokens", 0),
            "total_tokens": result.get("total_tokens", 0)
        }
    )

    return response

@app.post("/v1/completions")
async def completions(request: CompletionRequest):
    """Text completion endpoint."""

    if MODEL is None or TOKENIZER is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    messages = [{"role": "user", "content": request.prompt}]

    result = generate_response(
        messages,
        max_tokens=request.max_tokens,
        temperature=request.temperature,
        top_p=request.top_p
    )

    return {
        "id": f"cmpl-{int(time.time())}",
        "object": "text_completion",
        "created": int(time.time()),
        "model": "DeepXR/Helion-V1.5",
        "choices": [{
            "text": result["text"],
            "index": 0,
            "finish_reason": result["finish_reason"]
        }],
        "usage": {
            "prompt_tokens": result.get("prompt_tokens", 0),
            "completion_tokens": result.get("completion_tokens", 0),
            "total_tokens": result.get("total_tokens", 0)
        }
    }

def main():
    """Run the server."""
    import argparse

    parser = argparse.ArgumentParser(description="Helion-V1.5 API Server")
    parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
    parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
    parser.add_argument("--reload", action="store_true", help="Enable auto-reload")

    args = parser.parse_args()

    uvicorn.run(
        "server:app",
        host=args.host,
        port=args.port,
        reload=args.reload,
        log_level="info"
    )


if __name__ == "__main__":
    main()
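
Once the server is up (e.g. `python server.py --port 8000`), the chat endpoint can be exercised with any HTTP client. Below is a minimal client sketch using `requests`; the localhost base URL and the prompt are illustrative assumptions, not part of the commit.

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

payload = {
    "model": "DeepXR/Helion-V1.5",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain what a health-check endpoint is for."}
    ],
    "temperature": 0.7,
    "max_tokens": 256
}

# Non-streaming request: the server replies with a single
# OpenAI-shaped chat.completion object
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()

data = resp.json()
print(data["choices"][0]["message"]["content"])
print(data["usage"])  # prompt_tokens / completion_tokens / total_tokens
```

Because the request and response shapes mirror OpenAI's, the official `openai` Python client should also work when pointed at the server with `base_url="http://localhost:8000/v1"` and a dummy API key.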
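
For `stream=True`, the endpoint emits OpenAI-style server-sent events and terminates with a `data: [DONE]` sentinel, as implemented in `stream_response`. A sketch of consuming that stream, under the same localhost assumption:

```python
import json
import requests

payload = {
    "model": "DeepXR/Helion-V1.5",
    "messages": [{"role": "user", "content": "Write a haiku about servers."}],
    "stream": True
}

with requests.post(
    "http://localhost:8000/v1/chat/completions",
    json=payload,
    stream=True,
    timeout=120,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip the blank separator lines between SSE events
        data = line[len("data: "):]
        if data == "[DONE]":
            break  # sentinel sent after the final chunk
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)
print()
```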