File size: 2,552 Bytes
3930c05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uvicorn
import time
from .model_manager import OVModelManager
from .gguf_manager import OVGGUFManager

# ASGI application object; the startup hook and the chat-completions route
# defined in this module are registered on it, and start_server() hands it
# to uvicorn.
app = FastAPI(title="OpenVinayaka API", version="1.0")

# Global Model Instance
# Holds the single loaded model manager (an OVGGUFManager or OVModelManager).
# Stays None until start_server() pre-loads a model or the first chat
# completion request lazy-loads one.
model_instance = None

class ChatMessage(BaseModel):
    # One conversation turn. The completion handler renders each message
    # into the prompt as "<role>: <content>" followed by a newline.
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    # Request body accepted by POST /v1/chat/completions.

    # Model identifier. A value ending in ".gguf" is loaded through
    # OVGGUFManager; anything else is loaded through OVModelManager.
    model: str
    # Conversation turns, rendered into the prompt in list order.
    messages: List[ChatMessage]
    # Defaults to 0.7. NOTE(review): the chat_completions handler in this
    # module does not read this field.
    temperature: Optional[float] = 0.7
    # Forwarded to the model manager's generate() as max_new_tokens;
    # defaults to 100.
    max_tokens: Optional[int] = 100

class ChatCompletionResponse(BaseModel):
    # Response body returned by POST /v1/chat/completions.

    # The handler fills this with "chatcmpl-<unix seconds>".
    id: str
    object: str = "chat.completion"
    # Unix timestamp (whole seconds) taken when the response is built.
    created: int
    # Echo of the model name sent in the request.
    model: str
    # The handler emits one entry with "index", "message" and
    # "finish_reason" keys.
    choices: List[dict]
    # The handler emits "prompt_tokens", "completion_tokens" and
    # "total_tokens"; the values are character counts, not token counts.
    usage: dict

@app.on_event("startup")
async def startup_event():
    """Print a one-line banner when the application starts up."""
    banner = "🚀 OpenVinayaka API Server Started"
    print(banner)

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    """Serve a chat completion request.

    If no model is loaded yet, the model named in ``request.model`` is loaded
    and cached in the module-level ``model_instance``. The messages are then
    flattened into a plain-text prompt and passed to the model manager's
    ``generate``.

    Args:
        request: Parsed request body (model name, messages, sampling options).

    Returns:
        A ``ChatCompletionResponse`` with a single assistant choice. The
        ``usage`` figures are character counts standing in for token counts.

    Raises:
        Any exception raised while loading the model or generating text
        propagates unchanged. A failed load leaves ``model_instance`` as
        ``None`` so the next request retries the load.
    """
    global model_instance

    # Lazy Load Model if needed
    if model_instance is None:
        # Build into a local first and publish only once setup has fully
        # succeeded. Otherwise a failure in attach_ov_hooks() would leave a
        # half-initialised manager cached for every later request.
        if request.model.endswith(".gguf"):
            print(f"Loading GGUF Model: {request.model}")
            loaded = OVGGUFManager(request.model)
        else:
            print(f"Loading HF Model: {request.model}")
            loaded = OVModelManager(request.model)
            loaded.attach_ov_hooks()
        model_instance = loaded
    # NOTE(review): once a model is cached, requests naming a different model
    # are still served by the cached one, while the response echoes
    # request.model. Confirm whether that is intended.

    # Format Prompt: one "<role>: <content>" line per message, then the cue
    # for the assistant's turn.
    history = "".join(f"{msg.role}: {msg.content}\n" for msg in request.messages)
    prompt = history + "assistant:"

    # Generate
    # NOTE(review): request.temperature is not forwarded, and this call is
    # synchronous, so the event loop is blocked until it returns.
    response_text = model_instance.generate(prompt, max_new_tokens=request.max_tokens)

    # Mock Usage: character counts stand in for real token counts.
    prompt_len = len(prompt)
    completion_len = len(response_text)
    usage = {
        "prompt_tokens": prompt_len,
        "completion_tokens": completion_len,
        "total_tokens": prompt_len + completion_len,
    }

    # Take the timestamp once so `id` and `created` always agree.
    created = int(time.time())

    return ChatCompletionResponse(
        id=f"chatcmpl-{created}",
        created=created,
        model=request.model,
        choices=[{
            "index": 0,
            "message": {"role": "assistant", "content": response_text},
            "finish_reason": "stop"
        }],
        usage=usage
    )

def start_server(host="0.0.0.0", port=8000, model=None):
    """Optionally pre-load a model, then run the API under uvicorn.

    Args:
        host: Interface address passed to ``uvicorn.run``.
        port: TCP port passed to ``uvicorn.run``.
        model: Model identifier to load before serving. A falsy value skips
            pre-loading, leaving the first request to load a model lazily.
    """
    global model_instance

    # Eager load: a ".gguf" suffix selects the GGUF manager; every other
    # identifier goes through the HF manager, which also gets its OV hooks.
    if model:
        wants_gguf = model.endswith(".gguf")
        if not wants_gguf:
            model_instance = OVModelManager(model)
            model_instance.attach_ov_hooks()
        else:
            model_instance = OVGGUFManager(model)

    uvicorn.run(app, host=host, port=port)