File size: 2,552 Bytes

3930c05

import time
import uuid
from typing import List, Optional

import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from .model_manager import OVModelManager
from .gguf_manager import OVGGUFManager

# ASGI application object; the route and startup handlers below register
# themselves on it through its decorators.
app = FastAPI(title="OpenVinayaka API", version="1.0")

# Global Model Instance
# Single model manager shared by all requests. It stays None until either
# start_server() pre-loads a model or the first chat completion request loads
# one lazily.
model_instance = None

# One message of a chat conversation. The completion handler renders every
# message as a "role: content" line when it assembles the prompt.
class ChatMessage(BaseModel):
    role: str  # speaker label, written verbatim before the colon in the prompt
    content: str  # text of the message

# Request body accepted by POST /v1/chat/completions.
class ChatCompletionRequest(BaseModel):
    # Model identifier. A value ending in ".gguf" is loaded through
    # OVGGUFManager; anything else goes through OVModelManager.
    model: str
    # Conversation so far, in order; flattened into a single prompt string.
    messages: List[ChatMessage]
    # Accepted from the client, but not read by the handler in this file.
    temperature: Optional[float] = 0.7
    # Forwarded to the model's generate() call as max_new_tokens.
    max_tokens: Optional[int] = 100

# Response body returned by POST /v1/chat/completions.
class ChatCompletionResponse(BaseModel):
    id: str  # completion identifier; the handler prefixes it with "chatcmpl-"
    object: str = "chat.completion"  # fixed type tag for this payload
    created: int  # Unix timestamp in whole seconds, taken from time.time()
    model: str  # echoes the model name sent in the request
    # One dict per generated choice; the handler emits a single entry with
    # "index", "message" and "finish_reason" keys.
    choices: List[dict]
    # Dict with "prompt_tokens", "completion_tokens" and "total_tokens" keys.
    usage: dict

@app.on_event("startup")
async def startup_event():
    """Announce on stdout that the API server has come up."""
    banner = "🚀 OpenVinayaka API Server Started"
    print(banner)

def _load_model(model_name):
    """Build a fully initialised model manager for *model_name*.

    Names ending in ``.gguf`` are handed to ``OVGGUFManager``; every other
    name goes to ``OVModelManager``, whose ``attach_ov_hooks()`` is called
    before the manager is returned. Because the caller only receives the
    manager after all setup steps succeeded, a failure part-way through can
    never leave a half-initialised instance cached.
    """
    if model_name.endswith(".gguf"):
        print(f"Loading GGUF Model: {model_name}")
        return OVGGUFManager(model_name)

    print(f"Loading HF Model: {model_name}")
    manager = OVModelManager(model_name)
    manager.attach_ov_hooks()
    return manager


def _build_prompt(messages):
    """Flatten chat messages into ``role: content`` lines ending in ``assistant:``."""
    turns = [f"{msg.role}: {msg.content}\n" for msg in messages]
    return "".join(turns) + "assistant:"


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    """Generate one assistant reply for the supplied conversation.

    The shared ``model_instance`` is loaded lazily on the first request, using
    the model name from that request. Once a model is loaded, later requests
    reuse it regardless of the ``model`` value they send; the response still
    echoes the requested name. ``request.temperature`` is not forwarded to the
    model.

    Args:
        request: Validated request body (model name, messages, max_tokens).

    Returns:
        A ``ChatCompletionResponse`` with a single choice whose
        ``finish_reason`` is always ``"stop"``.

    Raises:
        HTTPException: 400 when ``messages`` is empty.
    """
    global model_instance

    # Reject an empty conversation before paying for a model load.
    if not request.messages:
        raise HTTPException(
            status_code=400,
            detail="'messages' must contain at least one message",
        )

    # Lazy load: the global is assigned only after the manager is fully set
    # up, so a failed load is retried on the next request instead of leaving
    # a partially configured instance behind.
    if model_instance is None:
        model_instance = _load_model(request.model)

    prompt = _build_prompt(request.messages)
    response_text = model_instance.generate(prompt, max_new_tokens=request.max_tokens)

    # Mock usage: these are character counts, not real token counts.
    prompt_len = len(prompt)
    completion_len = len(response_text)
    usage = {
        "prompt_tokens": prompt_len,
        "completion_tokens": completion_len,
        "total_tokens": prompt_len + completion_len,
    }

    return ChatCompletionResponse(
        # A random suffix keeps ids unique even for requests that arrive
        # within the same second.
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[{
            "index": 0,
            "message": {"role": "assistant", "content": response_text},
            "finish_reason": "stop",
        }],
        usage=usage,
    )

def start_server(host="0.0.0.0", port=8000, model=None):
    """Optionally pre-load a model, then serve the app with uvicorn.

    Args:
        host: Interface address passed to ``uvicorn.run``.
        port: TCP port passed to ``uvicorn.run``.
        model: Model name to load before serving. A falsy value skips the
            pre-load, leaving the model to be loaded on the first request.
    """
    global model_instance

    if model:
        wants_gguf = model.endswith(".gguf")
        if not wants_gguf:
            # Non-GGUF models also get their OV hooks attached.
            model_instance = OVModelManager(model)
            model_instance.attach_ov_hooks()
        else:
            model_instance = OVGGUFManager(model)

    uvicorn.run(app, host=host, port=port)