vaibhavlakshmi
/

OpenVinayaka-Engine

Model card Files Files and versions

OpenVinayaka-Engine / Python_Package /openvinayaka /api_server.py

vaibhavlakshmi's picture

Upload folder using huggingface_hub

3930c05 verified 3 months ago

history blame contribute delete

2.55 kB

	import time
	import uuid
	from typing import List, Optional

	import uvicorn
	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel

	from .gguf_manager import OVGGUFManager
	from .model_manager import OVModelManager

	# ASGI application object; the startup hook and the chat route defined in this
	# module register themselves on it, and start_server() hands it to uvicorn.
	app = FastAPI(title="OpenVinayaka API", version="1.0")

	# Global Model Instance
	# Process-wide model manager shared by every request. It stays None until a
	# model is loaded, either eagerly by start_server(model=...) or lazily by the
	# first call to the chat completions endpoint.
	model_instance = None

	class ChatMessage(BaseModel):
	    # One turn of a conversation, as carried in a request's `messages` list.

	    # Speaker label; the chat endpoint writes it into the prompt as "<role>: ".
	    role: str
	    # Text of the turn; written into the prompt right after the role label.
	    content: str

	class ChatCompletionRequest(BaseModel):
	    # Request body accepted by POST /v1/chat/completions.

	    # Model identifier. A value ending in ".gguf" selects the GGUF manager when
	    # the model is loaded; any other value selects the HF-style manager.
	    model: str
	    # Conversation so far, in order; each entry becomes one line of the prompt.
	    messages: List[ChatMessage]
	    # Accepted for client compatibility; the handler in this module never reads it.
	    temperature: Optional[float] = 0.7
	    # Forwarded to the model manager's generate() as `max_new_tokens`.
	    max_tokens: Optional[int] = 100

	class ChatCompletionResponse(BaseModel):
	    # Response body returned by POST /v1/chat/completions.

	    # Identifier of this completion, prefixed with "chatcmpl-" by the handler.
	    id: str
	    # Constant type tag for the payload.
	    object: str = "chat.completion"
	    # Creation time as integer seconds taken from time.time().
	    created: int
	    # Echo of the model name supplied in the request.
	    model: str
	    # Generated choices; the handler always returns exactly one entry with the
	    # keys "index", "message" and "finish_reason".
	    choices: List[dict]
	    # Keys "prompt_tokens", "completion_tokens" and "total_tokens" as filled in
	    # by the handler.
	    usage: dict

	@app.on_event("startup")
	async def startup_event():
	    """Print a one-line banner to stdout when the application starts."""
	    banner = "🚀 OpenVinayaka API Server Started"
	    print(banner)

	@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
	async def chat_completions(request: ChatCompletionRequest):
	    """Generate one assistant reply for a chat completion request.

	    If no model is loaded yet, the model named by ``request.model`` is loaded
	    first: a name ending in ``.gguf`` goes to ``OVGGUFManager``, anything else
	    to ``OVModelManager`` followed by ``attach_ov_hooks()``. The loaded manager
	    is cached in the module-level ``model_instance`` and reused afterwards.

	    Args:
	        request: Validated request body. ``messages`` and ``max_tokens`` are
	            used; ``temperature`` is accepted but not forwarded to the model.

	    Returns:
	        A ``ChatCompletionResponse`` holding a single choice whose
	        ``finish_reason`` is always ``"stop"``.
	    """
	    global model_instance

	    # Lazy-load on first use.
	    # NOTE(review): once a manager is cached, a later request naming a different
	    # model is still served by the cached one while the response echoes
	    # request.model -- confirm whether that is intended.
	    if model_instance is None:
	        if request.model.endswith(".gguf"):
	            print(f"Loading GGUF Model: {request.model}")
	            loaded = OVGGUFManager(request.model)
	        else:
	            print(f"Loading HF Model: {request.model}")
	            loaded = OVModelManager(request.model)
	            loaded.attach_ov_hooks()
	        # Publish only after setup has finished, so a failure in
	        # attach_ov_hooks() cannot leave a hook-less manager cached for every
	        # subsequent request.
	        model_instance = loaded

	    # Flatten the conversation into "role: content" lines, then cue the reply.
	    prompt = "".join(f"{msg.role}: {msg.content}\n" for msg in request.messages)
	    prompt += "assistant:"

	    # NOTE(review): generate() is invoked synchronously inside an async handler,
	    # so the event loop presumably cannot serve other requests while it runs.
	    response_text = model_instance.generate(prompt, max_new_tokens=request.max_tokens)

	    # Mock usage: these are lengths of the prompt and of the generated value,
	    # not tokenizer token counts.
	    prompt_len = len(prompt)
	    completion_len = len(response_text)
	    usage = {
	        "prompt_tokens": prompt_len,
	        "completion_tokens": completion_len,
	        "total_tokens": prompt_len + completion_len,
	    }

	    return ChatCompletionResponse(
	        # A random id stays unique even when several requests complete within
	        # the same second, which a timestamp-based id does not.
	        id=f"chatcmpl-{uuid.uuid4().hex}",
	        created=int(time.time()),
	        model=request.model,
	        choices=[{
	            "index": 0,
	            "message": {"role": "assistant", "content": response_text},
	            "finish_reason": "stop",
	        }],
	        usage=usage,
	    )

	def start_server(host="0.0.0.0", port=8000, model=None):
	    """Optionally pre-load a model, then serve ``app`` with uvicorn.

	    Args:
	        host: Interface passed to ``uvicorn.run``; the default ``"0.0.0.0"``
	            listens on all interfaces.
	        port: TCP port passed to ``uvicorn.run``.
	        model: Model identifier to load before serving. When falsy, nothing is
	            loaded here and the chat endpoint loads a model on first use.
	    """
	    global model_instance

	    # Pre-load if specified; a ".gguf" suffix selects the GGUF manager, and only
	    # the other manager gets OV hooks attached.
	    if model:
	        is_gguf = model.endswith(".gguf")
	        model_instance = OVGGUFManager(model) if is_gguf else OVModelManager(model)
	        if not is_gguf:
	            model_instance.attach_ov_hooks()

	    uvicorn.run(app, host=host, port=port)