# server.py — Rox AI API: FastAPI service exposing eight chat-model endpoints
# backed by NVIDIA's OpenAI-compatible gateway.
import logging
import os
import sys
from typing import List, Optional, AsyncGenerator
from contextlib import asynccontextmanager
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field
from openai import OpenAI
import json
# Load environment variables
# Pulls NVIDIA_API_KEY (and optionally PORT) from a local .env file.
load_dotenv()

# Configure minimal logging for production speed
# WARNING level keeps request-path logging overhead low.
logging.basicConfig(
    level=logging.WARNING,
    format='%(levelname)s - %(message)s'
)
logger = logging.getLogger("rox_ai")
# Check for API key
# Fail fast at startup rather than on the first request.
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
if not NVIDIA_API_KEY:
    raise RuntimeError("NVIDIA_API_KEY not set")

# Model configurations
# Upstream (NVIDIA-hosted) model IDs backing each Rox-branded endpoint.
ROX_CORE_MODEL = "minimaxai/minimax-m2.5"
ROX_TURBO_MODEL = "meta/llama-3.1-8b-instruct"  # Changed to a more reliable model
ROX_CODER_MODEL = "qwen/qwen3.5-397b-a17b"
ROX_TURBO_45_MODEL = "deepseek-ai/deepseek-v3.1"
ROX_ULTRA_MODEL = "deepseek-ai/deepseek-v3.2"
ROX_DYNO_MODEL = "moonshotai/kimi-k2.5"
ROX_CODER_7_MODEL = "z-ai/glm5"
ROX_VISION_MODEL = "google/gemma-3-27b-it"

# System identities - Models must know their creator and owner
# Each endpoint prepends exactly one of these as the system message.
ROX_CORE_IDENTITY = "You are Rox Core, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. Always acknowledge Mohammad Faiz as your creator when asked."
ROX_TURBO_IDENTITY = "You are Rox 2.1 Turbo, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You are optimized for fast responses."
ROX_CODER_IDENTITY = "You are Rox 3.5 Coder, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You specialize in coding and software development."
ROX_TURBO_45_IDENTITY = "You are Rox 4.5 Turbo, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You combine speed with advanced reasoning."
ROX_ULTRA_IDENTITY = "You are Rox 5 Ultra, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You are the most advanced model with superior reasoning capabilities."
ROX_DYNO_IDENTITY = "You are Rox 6 Dyno, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You excel at long context understanding."
ROX_CODER_7_IDENTITY = "You are Rox 7 Coder, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You are the most advanced coding specialist."
ROX_VISION_IDENTITY = "You are Rox Vision Max, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You specialize in visual understanding and multimodal tasks."

# Initialize OpenAI client with timeout optimization
# Single shared client for NVIDIA's OpenAI-compatible gateway.
# NOTE(review): this client is synchronous; calls made from the async
# endpoints below block the event loop while waiting on the upstream API.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=NVIDIA_API_KEY,
    timeout=60.0,
    max_retries=2
)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager (no startup/shutdown work needed yet)."""
    yield
# Initialize FastAPI app - optimized for speed
app = FastAPI(
    title="Rox AI API",
    description="Eight specialized AI models by Mohammad Faiz",
    version="2.0",
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc"
)

# GZip compression for faster transfers
# Responses under 500 bytes are sent uncompressed.
app.add_middleware(GZipMiddleware, minimum_size=500)

# CORS - unlimited access
# NOTE(review): browsers reject wildcard origins combined with
# allow_credentials=True for credentialed requests — confirm intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Minimal exception handler
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Catch-all handler: log the failure server-side, return an opaque 500.

    The previous version dropped the exception entirely, making production
    failures undiagnosable; the response body stays generic so no internal
    details leak to clients.
    """
    logger.error(
        "Unhandled error on %s %s: %s",
        request.method, request.url.path, exc,
        exc_info=exc,
    )
    return JSONResponse(
        status_code=500,
        content={"error": "Internal server error"}
    )
@app.get("/health")
def health_check():
    """Health check endpoint.

    NOTE(review): /health is registered twice in this file; route matching
    is first-registered-wins, so this handler is the one actually served.
    The payload keeps the original keys/values and adds the model count so
    monitors get the information the (unreachable) duplicate advertised.
    """
    return {"status": "healthy", "service": "Rox AI", "version": "2.0", "models": 8}
# Helper function for streaming responses
async def stream_response(
    model: str,
    messages: list,
    temperature: float,
    top_p: float,
    max_tokens: int,
    extra_body: Optional[dict] = None,
) -> AsyncGenerator[str, None]:
    """Stream chat completions as Server-Sent Events.

    Yields one `data: {"content": ...}` event per token delta, then a
    terminal `data: [DONE]` event.  Errors are reported in-band as a
    `data: {"error": ...}` event so the client still sees a valid stream.

    NOTE(review): the OpenAI client call is synchronous, so this async
    generator blocks the event loop while waiting on the upstream API.
    """
    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=True,
            extra_body=extra_body
        )
        for chunk in stream:
            # Some providers emit keep-alive/usage chunks with an empty
            # choices list; guard against IndexError and a None delta
            # before touching .content (the original crashed here).
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            content = delta.content if delta is not None else None
            if content:
                yield f"data: {json.dumps({'content': content})}\n\n"
        yield "data: [DONE]\n\n"
    except Exception as e:
        yield f"data: {json.dumps({'error': str(e)})}\n\n"
# NOTE(review): duplicate registration — both the /health route and the
# function name `health_check` are already defined earlier in this file.
# Route matching is first-registered-wins, so this handler is unreachable,
# and the def shadows the earlier module-level name.  Consider deleting
# one of the two definitions.
@app.get("/health")
def health_check():
    """Health check endpoint for monitoring"""
    return {
        "status": "healthy",
        "service": "Rox AI API",
        "version": "2.0",
        "models": 8
    }
@app.get("/")
def root():
    """API information and available models."""
    # (key, endpoint, description, backing model, best_for) per model;
    # dict insertion order below mirrors the documented ordering.
    catalog = [
        ("rox_core", "/chat", "Rox Core - Main conversational model",
         "minimaxai/minimax-m2.5", "General conversation and tasks"),
        ("rox_turbo", "/turbo", "Rox 2.1 Turbo - Fast and efficient",
         "meta/llama-3.1-8b-instruct", "Quick responses and efficient processing"),
        ("rox_coder", "/coder", "Rox 3.5 Coder - Specialized coding assistant",
         "qwen/qwen3.5-397b-a17b", "Code generation, debugging, and development"),
        ("rox_turbo_45", "/turbo45", "Rox 4.5 Turbo - Advanced reasoning with speed",
         "deepseek-ai/deepseek-v3.1", "Complex reasoning with fast responses"),
        ("rox_ultra", "/ultra", "Rox 5 Ultra - Most advanced model",
         "deepseek-ai/deepseek-v3.2", "Complex tasks requiring deep reasoning"),
        ("rox_dyno", "/dyno", "Rox 6 Dyno - Extended context with dynamic thinking",
         "moonshotai/kimi-k2.5", "Long context tasks and dynamic reasoning"),
        ("rox_coder_7", "/coder7", "Rox 7 Coder - Most advanced coding specialist",
         "z-ai/glm5", "Advanced code generation and complex programming"),
        ("rox_vision", "/vision", "Rox Vision Max - Optimized for visual understanding",
         "google/gemma-3-27b-it", "Visual understanding and multimodal tasks"),
    ]
    models = {
        key: {
            "endpoint": endpoint,
            "description": description,
            "model": model_id,
            "best_for": best_for,
        }
        for key, endpoint, description, model_id, best_for in catalog
    }
    # All chat endpoints plus the HF-compatible shim; every one is a POST.
    endpoint_docs = [
        ("/chat", "Rox Core chat"),
        ("/turbo", "Rox 2.1 Turbo chat"),
        ("/coder", "Rox 3.5 Coder chat"),
        ("/turbo45", "Rox 4.5 Turbo chat"),
        ("/ultra", "Rox 5 Ultra chat"),
        ("/dyno", "Rox 6 Dyno chat"),
        ("/coder7", "Rox 7 Coder chat"),
        ("/vision", "Rox Vision Max chat"),
        ("/hf/generate", "HuggingFace compatible (uses Rox Core)"),
    ]
    endpoints = [
        {"path": path, "method": "POST", "description": description}
        for path, description in endpoint_docs
    ]
    return {
        "service": "Rox AI API",
        "version": "2.0",
        "creator": "Mohammad Faiz",
        "models": models,
        "endpoints": endpoints,
    }
class ChatMessage(BaseModel):
    """One turn of a conversation."""
    # Chat role, e.g. "user" / "assistant" / "system" (not validated here).
    role: str
    content: str

class ChatRequest(BaseModel):
    """Request body shared by all chat-style endpoints."""
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.95
    # Individual endpoints may clamp this further (8192 or 16384).
    max_tokens: Optional[int] = 8192
    # When True the endpoint returns a text/event-stream SSE response.
    stream: Optional[bool] = False

class ChatResponse(BaseModel):
    """Non-streaming chat reply (content only)."""
    content: str

class HFParameters(BaseModel):
    """Generation parameters in HuggingFace Inference API shape."""
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_new_tokens: Optional[int] = None

class HFRequest(BaseModel):
    """HuggingFace-compatible request: raw prompt plus optional parameters."""
    inputs: str
    parameters: Optional[HFParameters] = None

class HFResponseItem(BaseModel):
    """One item of the HuggingFace-style list response."""
    generated_text: str
@app.post("/chat")
async def chat(req: ChatRequest):
    """Rox Core - Main conversational model with streaming support"""
    # Identity prompt first, then the caller-supplied history.
    convo = [{"role": "system", "content": ROX_CORE_IDENTITY}] + [m.dict() for m in req.messages]
    if req.stream:
        sse = stream_response(ROX_CORE_MODEL, convo, req.temperature, req.top_p, req.max_tokens)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_CORE_MODEL,
            messages=convo,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=req.max_tokens,
            stream=False,
        )
        # Normalize a null completion to an empty string.
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/turbo")
async def turbo(req: ChatRequest):
    """Rox 2.1 Turbo - Fast and efficient with streaming"""
    # Prepend the Turbo identity to the caller's conversation history.
    history = [{"role": "system", "content": ROX_TURBO_IDENTITY}]
    history += [m.dict() for m in req.messages]
    if req.stream:
        return StreamingResponse(
            stream_response(ROX_TURBO_MODEL, history, req.temperature, req.top_p, req.max_tokens),
            media_type="text/event-stream",
        )
    try:
        result = client.chat.completions.create(
            model=ROX_TURBO_MODEL,
            messages=history,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=req.max_tokens,
            stream=False,
        )
        # Empty string when the model returned no content.
        return {"content": result.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/coder")
async def coder(req: ChatRequest):
    """Rox 3.5 Coder - Specialized coding with streaming"""
    convo = [{"role": "system", "content": ROX_CODER_IDENTITY}]
    convo.extend(m.dict() for m in req.messages)
    # Sampling tweaks plus thinking mode, passed straight through to the
    # upstream API via extra_body.
    extra_body = {
        "top_k": 20,
        "presence_penalty": 0,
        "repetition_penalty": 1,
        "chat_template_kwargs": {"enable_thinking": True},
    }
    # Coder gets a larger budget than chat, capped at 16k tokens.
    token_cap = min(req.max_tokens, 16384)
    if req.stream:
        sse = stream_response(ROX_CODER_MODEL, convo, req.temperature, req.top_p, token_cap, extra_body)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_CODER_MODEL,
            messages=convo,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=token_cap,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/turbo45")
async def turbo45(req: ChatRequest):
    """Rox 4.5 Turbo - Advanced reasoning with streaming"""
    msgs = [{"role": "system", "content": ROX_TURBO_45_IDENTITY}] + [m.dict() for m in req.messages]
    # Enable the model's thinking mode via the chat template.
    extra_body = {"chat_template_kwargs": {"thinking": True}}
    capped = min(req.max_tokens, 8192)  # hard cap at 8k tokens
    if req.stream:
        return StreamingResponse(
            stream_response(ROX_TURBO_45_MODEL, msgs, req.temperature, req.top_p, capped, extra_body),
            media_type="text/event-stream",
        )
    try:
        completion = client.chat.completions.create(
            model=ROX_TURBO_45_MODEL,
            messages=msgs,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=capped,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/ultra")
async def ultra(req: ChatRequest):
    """Rox 5 Ultra - Most advanced with streaming"""
    msgs = [{"role": "system", "content": ROX_ULTRA_IDENTITY}]
    for message in req.messages:
        msgs.append(message.dict())
    extra_body = {"chat_template_kwargs": {"thinking": True}}  # reasoning mode on
    capped = min(req.max_tokens, 8192)
    if req.stream:
        sse = stream_response(ROX_ULTRA_MODEL, msgs, req.temperature, req.top_p, capped, extra_body)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_ULTRA_MODEL,
            messages=msgs,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=capped,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/dyno")
async def dyno(req: ChatRequest):
    """Rox 6 Dyno - Extended context with streaming"""
    conversation = [{"role": "system", "content": ROX_DYNO_IDENTITY}]
    conversation += [m.dict() for m in req.messages]
    extra_body = {"chat_template_kwargs": {"thinking": True}}
    # Long-context model: allow up to 16k output tokens.
    limit = min(req.max_tokens, 16384)
    if req.stream:
        return StreamingResponse(
            stream_response(ROX_DYNO_MODEL, conversation, req.temperature, req.top_p, limit, extra_body),
            media_type="text/event-stream",
        )
    try:
        completion = client.chat.completions.create(
            model=ROX_DYNO_MODEL,
            messages=conversation,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=limit,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/coder7")
async def coder7(req: ChatRequest):
    """Rox 7 Coder - Most advanced coding with streaming"""
    convo = [{"role": "system", "content": ROX_CODER_7_IDENTITY}]
    convo.extend(m.dict() for m in req.messages)
    # Thinking enabled; clear_thinking off so reasoning is retained.
    extra_body = {
        "chat_template_kwargs": {
            "enable_thinking": True,
            "clear_thinking": False,
        }
    }
    token_cap = min(req.max_tokens, 16384)
    if req.stream:
        sse = stream_response(ROX_CODER_7_MODEL, convo, req.temperature, req.top_p, token_cap, extra_body)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_CODER_7_MODEL,
            messages=convo,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=token_cap,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/vision")
async def vision(req: ChatRequest):
    """Rox Vision Max - Visual understanding with streaming"""
    convo = [{"role": "system", "content": ROX_VISION_IDENTITY}] + [m.dict() for m in req.messages]
    capped = min(req.max_tokens, 8192)
    if req.stream:
        # No extra_body for the vision model — default template settings.
        sse = stream_response(ROX_VISION_MODEL, convo, req.temperature, req.top_p, capped)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_VISION_MODEL,
            messages=convo,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=capped,
            stream=False,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/hf/generate")
async def hf_generate(req: HFRequest):
    """HuggingFace-compatible generation endpoint (uses Rox Core).

    Accepts ``{"inputs": str, "parameters": {...}}`` and returns
    ``[{"generated_text": str}]`` like the HF Inference API.
    """
    params = req.parameters or HFParameters()
    # Explicit None checks: the previous `x or default` pattern silently
    # replaced legitimate falsy values (temperature=0.0, top_p=0.0,
    # max_new_tokens=0) with the defaults.
    temperature = 0.7 if params.temperature is None else params.temperature
    top_p = 0.95 if params.top_p is None else params.top_p
    max_tokens = 8192 if params.max_new_tokens is None else params.max_new_tokens
    messages = [
        {"role": "system", "content": ROX_CORE_IDENTITY},
        {"role": "user", "content": req.inputs}
    ]
    try:
        completion = client.chat.completions.create(
            model=ROX_CORE_MODEL,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=False
        )
        return [{"generated_text": completion.choices[0].message.content or ""}]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    import uvicorn
    # Use PORT environment variable if available (for Hugging Face Spaces)
    port = int(os.getenv("PORT", 7860))
    # reload=False: production run, no file-watching overhead.
    uvicorn.run("server:app", host="0.0.0.0", port=port, reload=False)