"""HTTP server exposing a ``/v1/chat/completions`` endpoint for OpenVinayaka.

A single model manager is kept in the module-level ``model_instance``. It is
either pre-loaded by :func:`start_server` or loaded lazily by the first
request that reaches :func:`chat_completions`.
"""

import time
import uuid
from typing import List, Optional

import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from .model_manager import OVModelManager
from .gguf_manager import OVGGUFManager

app = FastAPI(title="OpenVinayaka API", version="1.0")

# Global Model Instance
# Stays None until a model has been *fully* loaded (including hook attachment).
model_instance = None


class ChatMessage(BaseModel):
    """One turn of the conversation."""

    # Speaker label; copied verbatim into the prompt as "<role>: <content>".
    role: str
    # Text of the turn.
    content: str


class ChatCompletionRequest(BaseModel):
    """Request body accepted by ``POST /v1/chat/completions``."""

    # Model identifier; a value ending in ".gguf" selects the GGUF manager.
    model: str
    # Conversation so far, in order.
    messages: List[ChatMessage]
    # Accepted for API compatibility; not forwarded to generate() in this module.
    temperature: Optional[float] = 0.7
    # Forwarded to generate() as ``max_new_tokens``.
    max_tokens: Optional[int] = 100


class ChatCompletionResponse(BaseModel):
    """Response body returned by ``POST /v1/chat/completions``."""

    id: str
    object: str = "chat.completion"
    # Unix timestamp (whole seconds) at which the response was built.
    created: int
    model: str
    choices: List[dict]
    usage: dict


def _load_model(model_name, *, verbose=False):
    """Build a ready-to-use model manager for *model_name*.

    Names ending in ``.gguf`` are handed to ``OVGGUFManager``. Anything else
    is handed to ``OVModelManager`` and then has ``attach_ov_hooks()`` called
    on it. The manager is returned only after every step succeeded, so callers
    never publish a half-initialised model.

    Args:
        model_name: Model identifier or path.
        verbose: When true, print which kind of model is being loaded.

    Returns:
        The constructed manager object.
    """
    if model_name.endswith(".gguf"):
        if verbose:
            print(f"Loading GGUF Model: {model_name}")
        return OVGGUFManager(model_name)

    if verbose:
        print(f"Loading HF Model: {model_name}")
    manager = OVModelManager(model_name)
    manager.attach_ov_hooks()
    return manager


@app.on_event("startup")
async def startup_event():
    """Announce on stdout that the server has started."""
    print("🚀 OpenVinayaka API Server Started")


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    """Generate one assistant reply for the supplied conversation.

    The model named in the request is loaded on first use. Once a model is
    loaded it is reused for every later request.
    """
    global model_instance

    # Lazy Load Model if needed
    # The global is assigned only after _load_model() returns, so a failure
    # while constructing the manager or attaching hooks leaves it as None and
    # the next request retries instead of using a model without hooks.
    # NOTE(review): once a model is loaded, request.model is not compared with
    # it; the loaded model answers while the response echoes request.model.
    if model_instance is None:
        model_instance = _load_model(request.model, verbose=True)

    # Format Prompt: one "<role>: <content>" line per message, then the cue
    # for the assistant's turn.
    turns = [f"{msg.role}: {msg.content}\n" for msg in request.messages]
    prompt = "".join(turns) + "assistant:"

    # Generate
    # NOTE(review): this looks like a synchronous call inside an async
    # handler; if so it blocks the event loop while generating -- confirm.
    response_text = model_instance.generate(
        prompt, max_new_tokens=request.max_tokens
    )

    # Mock Usage: these are character counts, not real token counts.
    prompt_size = len(prompt)
    completion_size = len(response_text)
    usage = {
        "prompt_tokens": prompt_size,
        "completion_tokens": completion_size,
        "total_tokens": prompt_size + completion_size,
    }

    return ChatCompletionResponse(
        # A random suffix keeps ids unique even for requests that arrive
        # within the same second.
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            {
                "index": 0,
                "message": {"role": "assistant", "content": response_text},
                "finish_reason": "stop",
            }
        ],
        usage=usage,
    )


def start_server(host="0.0.0.0", port=8000, model=None):
    """Run the API server, optionally pre-loading a model first.

    Args:
        host: Interface passed to ``uvicorn.run``.
        port: TCP port passed to ``uvicorn.run``.
        model: Optional model identifier to load before serving. When falsy,
            the model is loaded lazily by the first request instead.
    """
    # Pre-load if specified
    global model_instance
    if model:
        model_instance = _load_model(model)

    uvicorn.run(app, host=host, port=port)