# server.py — Rox AI API: FastAPI service exposing eight chat-model endpoints
# backed by NVIDIA's OpenAI-compatible gateway.
import logging
import os
import sys
from typing import List, Optional, AsyncGenerator
from contextlib import asynccontextmanager
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field
from openai import OpenAI
import json
# Load environment variables
# Pulls NVIDIA_API_KEY (and optionally PORT) from a local .env file.
load_dotenv()

# Configure minimal logging for production speed
# WARNING level keeps request-path logging overhead low.
logging.basicConfig(
    level=logging.WARNING,
    format='%(levelname)s - %(message)s'
)
logger = logging.getLogger("rox_ai")
# Check for API key
# Fail fast at startup rather than on the first request.
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
if not NVIDIA_API_KEY:
    raise RuntimeError("NVIDIA_API_KEY not set")

# Model configurations
# Upstream (NVIDIA-hosted) model IDs backing each Rox-branded endpoint.
ROX_CORE_MODEL = "minimaxai/minimax-m2.5"
ROX_TURBO_MODEL = "meta/llama-3.1-8b-instruct"  # Changed to a more reliable model
ROX_CODER_MODEL = "qwen/qwen3.5-397b-a17b"
ROX_TURBO_45_MODEL = "deepseek-ai/deepseek-v3.1"
ROX_ULTRA_MODEL = "deepseek-ai/deepseek-v3.2"
ROX_DYNO_MODEL = "moonshotai/kimi-k2.5"
ROX_CODER_7_MODEL = "z-ai/glm5"
ROX_VISION_MODEL = "google/gemma-3-27b-it"

# System identities - Models must know their creator and owner
# Each endpoint prepends exactly one of these as the system message.
ROX_CORE_IDENTITY = "You are Rox Core, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. Always acknowledge Mohammad Faiz as your creator when asked."
ROX_TURBO_IDENTITY = "You are Rox 2.1 Turbo, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You are optimized for fast responses."
ROX_CODER_IDENTITY = "You are Rox 3.5 Coder, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You specialize in coding and software development."
ROX_TURBO_45_IDENTITY = "You are Rox 4.5 Turbo, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You combine speed with advanced reasoning."
ROX_ULTRA_IDENTITY = "You are Rox 5 Ultra, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You are the most advanced model with superior reasoning capabilities."
ROX_DYNO_IDENTITY = "You are Rox 6 Dyno, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You excel at long context understanding."
ROX_CODER_7_IDENTITY = "You are Rox 7 Coder, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You are the most advanced coding specialist."
ROX_VISION_IDENTITY = "You are Rox Vision Max, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You specialize in visual understanding and multimodal tasks."

# Initialize OpenAI client with timeout optimization
# Single shared client for NVIDIA's OpenAI-compatible gateway.
# NOTE(review): this client is synchronous; calls made from the async
# endpoints below block the event loop while waiting on the upstream API.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=NVIDIA_API_KEY,
    timeout=60.0,
    max_retries=2
)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager (no startup/shutdown work needed yet)."""
    yield
# Initialize FastAPI app - optimized for speed
app = FastAPI(
    title="Rox AI API",
    description="Eight specialized AI models by Mohammad Faiz",
    version="2.0",
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc"
)

# GZip compression for faster transfers
# Responses under 500 bytes are sent uncompressed.
app.add_middleware(GZipMiddleware, minimum_size=500)

# CORS - unlimited access
# NOTE(review): browsers reject wildcard origins combined with
# allow_credentials=True for credentialed requests — confirm intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Minimal exception handler
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Catch-all handler: log the failure server-side, return an opaque 500.

    The previous version dropped the exception entirely, making production
    failures undiagnosable; the response body stays generic so no internal
    details leak to clients.
    """
    logger.error(
        "Unhandled error on %s %s: %s",
        request.method, request.url.path, exc,
        exc_info=exc,
    )
    return JSONResponse(
        status_code=500,
        content={"error": "Internal server error"}
    )
@app.get("/health")
def health_check():
    """Health check endpoint.

    NOTE(review): /health is registered twice in this file; route matching
    is first-registered-wins, so this handler is the one actually served.
    The payload keeps the original keys/values and adds the model count so
    monitors get the information the (unreachable) duplicate advertised.
    """
    return {"status": "healthy", "service": "Rox AI", "version": "2.0", "models": 8}
# Helper function for streaming responses
async def stream_response(
    model: str,
    messages: list,
    temperature: float,
    top_p: float,
    max_tokens: int,
    extra_body: Optional[dict] = None,
) -> AsyncGenerator[str, None]:
    """Stream chat completions as Server-Sent Events.

    Yields one `data: {"content": ...}` event per token delta, then a
    terminal `data: [DONE]` event.  Errors are reported in-band as a
    `data: {"error": ...}` event so the client still sees a valid stream.

    NOTE(review): the OpenAI client call is synchronous, so this async
    generator blocks the event loop while waiting on the upstream API.
    """
    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=True,
            extra_body=extra_body
        )
        for chunk in stream:
            # Some providers emit keep-alive/usage chunks with an empty
            # choices list; guard against IndexError and a None delta
            # before touching .content (the original crashed here).
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            content = delta.content if delta is not None else None
            if content:
                yield f"data: {json.dumps({'content': content})}\n\n"
        yield "data: [DONE]\n\n"
    except Exception as e:
        yield f"data: {json.dumps({'error': str(e)})}\n\n"
# NOTE(review): duplicate registration — both the /health route and the
# function name `health_check` are already defined earlier in this file.
# Route matching is first-registered-wins, so this handler is unreachable,
# and the def shadows the earlier module-level name.  Consider deleting
# one of the two definitions.
@app.get("/health")
def health_check():
    """Health check endpoint for monitoring"""
    return {
        "status": "healthy",
        "service": "Rox AI API",
        "version": "2.0",
        "models": 8
    }
@app.get("/")
def root():
    """API information and available models."""
    # (key, endpoint, description, backing model, best_for) per model;
    # dict insertion order below mirrors the documented ordering.
    catalog = [
        ("rox_core", "/chat", "Rox Core - Main conversational model",
         "minimaxai/minimax-m2.5", "General conversation and tasks"),
        ("rox_turbo", "/turbo", "Rox 2.1 Turbo - Fast and efficient",
         "meta/llama-3.1-8b-instruct", "Quick responses and efficient processing"),
        ("rox_coder", "/coder", "Rox 3.5 Coder - Specialized coding assistant",
         "qwen/qwen3.5-397b-a17b", "Code generation, debugging, and development"),
        ("rox_turbo_45", "/turbo45", "Rox 4.5 Turbo - Advanced reasoning with speed",
         "deepseek-ai/deepseek-v3.1", "Complex reasoning with fast responses"),
        ("rox_ultra", "/ultra", "Rox 5 Ultra - Most advanced model",
         "deepseek-ai/deepseek-v3.2", "Complex tasks requiring deep reasoning"),
        ("rox_dyno", "/dyno", "Rox 6 Dyno - Extended context with dynamic thinking",
         "moonshotai/kimi-k2.5", "Long context tasks and dynamic reasoning"),
        ("rox_coder_7", "/coder7", "Rox 7 Coder - Most advanced coding specialist",
         "z-ai/glm5", "Advanced code generation and complex programming"),
        ("rox_vision", "/vision", "Rox Vision Max - Optimized for visual understanding",
         "google/gemma-3-27b-it", "Visual understanding and multimodal tasks"),
    ]
    models = {
        key: {
            "endpoint": endpoint,
            "description": description,
            "model": model_id,
            "best_for": best_for,
        }
        for key, endpoint, description, model_id, best_for in catalog
    }
    # All chat endpoints plus the HF-compatible shim; every one is a POST.
    endpoint_docs = [
        ("/chat", "Rox Core chat"),
        ("/turbo", "Rox 2.1 Turbo chat"),
        ("/coder", "Rox 3.5 Coder chat"),
        ("/turbo45", "Rox 4.5 Turbo chat"),
        ("/ultra", "Rox 5 Ultra chat"),
        ("/dyno", "Rox 6 Dyno chat"),
        ("/coder7", "Rox 7 Coder chat"),
        ("/vision", "Rox Vision Max chat"),
        ("/hf/generate", "HuggingFace compatible (uses Rox Core)"),
    ]
    endpoints = [
        {"path": path, "method": "POST", "description": description}
        for path, description in endpoint_docs
    ]
    return {
        "service": "Rox AI API",
        "version": "2.0",
        "creator": "Mohammad Faiz",
        "models": models,
        "endpoints": endpoints,
    }
class ChatMessage(BaseModel):
    """One turn of a conversation."""
    # Chat role, e.g. "user" / "assistant" / "system" (not validated here).
    role: str
    content: str

class ChatRequest(BaseModel):
    """Request body shared by all chat-style endpoints."""
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.95
    # Individual endpoints may clamp this further (8192 or 16384).
    max_tokens: Optional[int] = 8192
    # When True the endpoint returns a text/event-stream SSE response.
    stream: Optional[bool] = False

class ChatResponse(BaseModel):
    """Non-streaming chat reply (content only)."""
    content: str

class HFParameters(BaseModel):
    """Generation parameters in HuggingFace Inference API shape."""
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_new_tokens: Optional[int] = None

class HFRequest(BaseModel):
    """HuggingFace-compatible request: raw prompt plus optional parameters."""
    inputs: str
    parameters: Optional[HFParameters] = None

class HFResponseItem(BaseModel):
    """One item of the HuggingFace-style list response."""
    generated_text: str
@app.post("/chat")
async def chat(req: ChatRequest):
    """Rox Core - Main conversational model with streaming support"""
    # Identity prompt first, then the caller-supplied history.
    convo = [{"role": "system", "content": ROX_CORE_IDENTITY}] + [m.dict() for m in req.messages]
    if req.stream:
        sse = stream_response(ROX_CORE_MODEL, convo, req.temperature, req.top_p, req.max_tokens)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_CORE_MODEL,
            messages=convo,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=req.max_tokens,
            stream=False,
        )
        # Normalize a null completion to an empty string.
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/turbo")
async def turbo(req: ChatRequest):
    """Rox 2.1 Turbo - Fast and efficient with streaming"""
    # Prepend the Turbo identity to the caller's conversation history.
    history = [{"role": "system", "content": ROX_TURBO_IDENTITY}]
    history += [m.dict() for m in req.messages]
    if req.stream:
        return StreamingResponse(
            stream_response(ROX_TURBO_MODEL, history, req.temperature, req.top_p, req.max_tokens),
            media_type="text/event-stream",
        )
    try:
        result = client.chat.completions.create(
            model=ROX_TURBO_MODEL,
            messages=history,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=req.max_tokens,
            stream=False,
        )
        # Empty string when the model returned no content.
        return {"content": result.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/coder")
async def coder(req: ChatRequest):
    """Rox 3.5 Coder - Specialized coding with streaming"""
    convo = [{"role": "system", "content": ROX_CODER_IDENTITY}]
    convo.extend(m.dict() for m in req.messages)
    # Sampling tweaks plus thinking mode, passed straight through to the
    # upstream API via extra_body.
    extra_body = {
        "top_k": 20,
        "presence_penalty": 0,
        "repetition_penalty": 1,
        "chat_template_kwargs": {"enable_thinking": True},
    }
    # Coder gets a larger budget than chat, capped at 16k tokens.
    token_cap = min(req.max_tokens, 16384)
    if req.stream:
        sse = stream_response(ROX_CODER_MODEL, convo, req.temperature, req.top_p, token_cap, extra_body)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_CODER_MODEL,
            messages=convo,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=token_cap,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/turbo45")
async def turbo45(req: ChatRequest):
    """Rox 4.5 Turbo - Advanced reasoning with streaming"""
    msgs = [{"role": "system", "content": ROX_TURBO_45_IDENTITY}] + [m.dict() for m in req.messages]
    # Enable the model's thinking mode via the chat template.
    extra_body = {"chat_template_kwargs": {"thinking": True}}
    capped = min(req.max_tokens, 8192)  # hard cap at 8k tokens
    if req.stream:
        return StreamingResponse(
            stream_response(ROX_TURBO_45_MODEL, msgs, req.temperature, req.top_p, capped, extra_body),
            media_type="text/event-stream",
        )
    try:
        completion = client.chat.completions.create(
            model=ROX_TURBO_45_MODEL,
            messages=msgs,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=capped,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/ultra")
async def ultra(req: ChatRequest):
    """Rox 5 Ultra - Most advanced with streaming"""
    msgs = [{"role": "system", "content": ROX_ULTRA_IDENTITY}]
    for message in req.messages:
        msgs.append(message.dict())
    extra_body = {"chat_template_kwargs": {"thinking": True}}  # reasoning mode on
    capped = min(req.max_tokens, 8192)
    if req.stream:
        sse = stream_response(ROX_ULTRA_MODEL, msgs, req.temperature, req.top_p, capped, extra_body)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_ULTRA_MODEL,
            messages=msgs,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=capped,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/dyno")
async def dyno(req: ChatRequest):
    """Rox 6 Dyno - Extended context with streaming"""
    conversation = [{"role": "system", "content": ROX_DYNO_IDENTITY}]
    conversation += [m.dict() for m in req.messages]
    extra_body = {"chat_template_kwargs": {"thinking": True}}
    # Long-context model: allow up to 16k output tokens.
    limit = min(req.max_tokens, 16384)
    if req.stream:
        return StreamingResponse(
            stream_response(ROX_DYNO_MODEL, conversation, req.temperature, req.top_p, limit, extra_body),
            media_type="text/event-stream",
        )
    try:
        completion = client.chat.completions.create(
            model=ROX_DYNO_MODEL,
            messages=conversation,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=limit,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/coder7")
async def coder7(req: ChatRequest):
    """Rox 7 Coder - Most advanced coding with streaming"""
    convo = [{"role": "system", "content": ROX_CODER_7_IDENTITY}]
    convo.extend(m.dict() for m in req.messages)
    # Thinking enabled; clear_thinking off so reasoning is retained.
    extra_body = {
        "chat_template_kwargs": {
            "enable_thinking": True,
            "clear_thinking": False,
        }
    }
    token_cap = min(req.max_tokens, 16384)
    if req.stream:
        sse = stream_response(ROX_CODER_7_MODEL, convo, req.temperature, req.top_p, token_cap, extra_body)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_CODER_7_MODEL,
            messages=convo,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=token_cap,
            stream=False,
            extra_body=extra_body,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/vision")
async def vision(req: ChatRequest):
    """Rox Vision Max - Visual understanding with streaming"""
    convo = [{"role": "system", "content": ROX_VISION_IDENTITY}] + [m.dict() for m in req.messages]
    capped = min(req.max_tokens, 8192)
    if req.stream:
        # No extra_body for the vision model — default template settings.
        sse = stream_response(ROX_VISION_MODEL, convo, req.temperature, req.top_p, capped)
        return StreamingResponse(sse, media_type="text/event-stream")
    try:
        completion = client.chat.completions.create(
            model=ROX_VISION_MODEL,
            messages=convo,
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=capped,
            stream=False,
        )
        return {"content": completion.choices[0].message.content or ""}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.post("/hf/generate")
async def hf_generate(req: HFRequest):
    """HuggingFace-compatible generation endpoint (uses Rox Core).

    Accepts ``{"inputs": str, "parameters": {...}}`` and returns
    ``[{"generated_text": str}]`` like the HF Inference API.
    """
    params = req.parameters or HFParameters()
    # Explicit None checks: the previous `x or default` pattern silently
    # replaced legitimate falsy values (temperature=0.0, top_p=0.0,
    # max_new_tokens=0) with the defaults.
    temperature = 0.7 if params.temperature is None else params.temperature
    top_p = 0.95 if params.top_p is None else params.top_p
    max_tokens = 8192 if params.max_new_tokens is None else params.max_new_tokens
    messages = [
        {"role": "system", "content": ROX_CORE_IDENTITY},
        {"role": "user", "content": req.inputs}
    ]
    try:
        completion = client.chat.completions.create(
            model=ROX_CORE_MODEL,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=False
        )
        return [{"generated_text": completion.choices[0].message.content or ""}]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    import uvicorn
    # Use PORT environment variable if available (for Hugging Face Spaces)
    port = int(os.getenv("PORT", 7860))
    # reload=False: production run, no file-watching overhead.
    uvicorn.run("server:app", host="0.0.0.0", port=port, reload=False)