Update server.py
Browse files
server.py
CHANGED
|
@@ -57,6 +57,13 @@ MAX_INFLIGHT_REQUESTS = int(os.getenv("MAX_INFLIGHT_REQUESTS", "200"))
|
|
| 57 |
# "Thinking" increases latency; keep opt-in via env
|
| 58 |
ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# Model configurations
|
| 61 |
ROX_CORE_MODEL = "minimaxai/minimax-m2.5"
|
| 62 |
ROX_TURBO_MODEL = "meta/llama-3.1-8b-instruct" # Changed to a more reliable model
|
|
@@ -68,14 +75,45 @@ ROX_CODER_7_MODEL = "z-ai/glm5"
|
|
| 68 |
ROX_VISION_MODEL = "google/gemma-3-27b-it"
|
| 69 |
|
| 70 |
# System identities - Models must know their creator and owner
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
@asynccontextmanager
|
| 81 |
async def lifespan(app: FastAPI):
|
|
@@ -90,6 +128,7 @@ async def lifespan(app: FastAPI):
|
|
| 90 |
timeout=httpx.Timeout(timeout_s),
|
| 91 |
limits=httpx.Limits(max_connections=max_connections, max_keepalive_connections=max_keepalive),
|
| 92 |
headers={"User-Agent": "Rox-AI-API/2.0"},
|
|
|
|
| 93 |
)
|
| 94 |
app.state.http_client = http_client
|
| 95 |
app.state.client = AsyncOpenAI(
|
|
@@ -349,7 +388,10 @@ class HFResponseItem(BaseModel):
|
|
| 349 |
@app.post("/chat")
|
| 350 |
async def chat(req: ChatRequest):
|
| 351 |
"""Rox Core - Main conversational model with streaming support"""
|
| 352 |
-
messages
|
|
|
|
|
|
|
|
|
|
| 353 |
messages.extend([m.model_dump() for m in req.messages])
|
| 354 |
temperature = _effective_temperature(req.temperature)
|
| 355 |
top_p = _effective_top_p(req.top_p)
|
|
@@ -380,7 +422,10 @@ async def chat(req: ChatRequest):
|
|
| 380 |
@app.post("/turbo")
|
| 381 |
async def turbo(req: ChatRequest):
|
| 382 |
"""Rox 2.1 Turbo - Fast and efficient with streaming"""
|
| 383 |
-
messages
|
|
|
|
|
|
|
|
|
|
| 384 |
messages.extend([m.model_dump() for m in req.messages])
|
| 385 |
temperature = _effective_temperature(req.temperature)
|
| 386 |
top_p = _effective_top_p(req.top_p)
|
|
@@ -411,7 +456,10 @@ async def turbo(req: ChatRequest):
|
|
| 411 |
@app.post("/coder")
|
| 412 |
async def coder(req: ChatRequest):
|
| 413 |
"""Rox 3.5 Coder - Specialized coding with streaming"""
|
| 414 |
-
messages
|
|
|
|
|
|
|
|
|
|
| 415 |
messages.extend([m.model_dump() for m in req.messages])
|
| 416 |
temperature = _effective_temperature(req.temperature)
|
| 417 |
top_p = _effective_top_p(req.top_p)
|
|
@@ -450,7 +498,10 @@ async def coder(req: ChatRequest):
|
|
| 450 |
@app.post("/turbo45")
|
| 451 |
async def turbo45(req: ChatRequest):
|
| 452 |
"""Rox 4.5 Turbo - Advanced reasoning with streaming"""
|
| 453 |
-
messages
|
|
|
|
|
|
|
|
|
|
| 454 |
messages.extend([m.model_dump() for m in req.messages])
|
| 455 |
temperature = _effective_temperature(req.temperature)
|
| 456 |
top_p = _effective_top_p(req.top_p)
|
|
@@ -484,7 +535,10 @@ async def turbo45(req: ChatRequest):
|
|
| 484 |
@app.post("/ultra")
|
| 485 |
async def ultra(req: ChatRequest):
|
| 486 |
"""Rox 5 Ultra - Most advanced with streaming"""
|
| 487 |
-
messages
|
|
|
|
|
|
|
|
|
|
| 488 |
messages.extend([m.model_dump() for m in req.messages])
|
| 489 |
temperature = _effective_temperature(req.temperature)
|
| 490 |
top_p = _effective_top_p(req.top_p)
|
|
@@ -518,7 +572,10 @@ async def ultra(req: ChatRequest):
|
|
| 518 |
@app.post("/dyno")
|
| 519 |
async def dyno(req: ChatRequest):
|
| 520 |
"""Rox 6 Dyno - Extended context with streaming"""
|
| 521 |
-
messages
|
|
|
|
|
|
|
|
|
|
| 522 |
messages.extend([m.model_dump() for m in req.messages])
|
| 523 |
temperature = _effective_temperature(req.temperature)
|
| 524 |
top_p = _effective_top_p(req.top_p)
|
|
@@ -552,7 +609,10 @@ async def dyno(req: ChatRequest):
|
|
| 552 |
@app.post("/coder7")
|
| 553 |
async def coder7(req: ChatRequest):
|
| 554 |
"""Rox 7 Coder - Most advanced coding with streaming"""
|
| 555 |
-
messages
|
|
|
|
|
|
|
|
|
|
| 556 |
messages.extend([m.model_dump() for m in req.messages])
|
| 557 |
temperature = _effective_temperature(req.temperature)
|
| 558 |
top_p = _effective_top_p(req.top_p)
|
|
@@ -591,7 +651,10 @@ async def coder7(req: ChatRequest):
|
|
| 591 |
@app.post("/vision")
|
| 592 |
async def vision(req: ChatRequest):
|
| 593 |
"""Rox Vision Max - Visual understanding with streaming"""
|
| 594 |
-
messages
|
|
|
|
|
|
|
|
|
|
| 595 |
messages.extend([m.model_dump() for m in req.messages])
|
| 596 |
temperature = _effective_temperature(req.temperature)
|
| 597 |
top_p = _effective_top_p(req.top_p)
|
|
@@ -623,10 +686,11 @@ async def vision(req: ChatRequest):
|
|
| 623 |
async def hf_generate(req: HFRequest):
|
| 624 |
"""HuggingFace compatible endpoint"""
|
| 625 |
params = req.parameters or HFParameters()
|
| 626 |
-
messages = [
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
|
|
|
| 630 |
|
| 631 |
try:
|
| 632 |
temperature = _effective_temperature(params.temperature)
|
|
|
|
| 57 |
# "Thinking" increases latency; keep opt-in via env
|
| 58 |
ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
|
| 59 |
|
| 60 |
+
# System prompt size directly impacts latency/cost. Make it configurable.
|
| 61 |
+
# Modes:
|
| 62 |
+
# - full: current long identity prompts (default, backward compatible; any unrecognized value also behaves as full)
|
| 63 |
+
# - short: minimal identity + brand (recommended for speed; aliases: small, lite, fast)
|
| 64 |
+
# - off: no system prompt (fastest, but behavior may drift; aliases: none, 0, false)
|
| 65 |
+
# Read once at import time and normalized (whitespace-trimmed, lower-cased),
# so later comparisons can use plain lowercase literals.
SYSTEM_PROMPT_MODE = os.getenv("SYSTEM_PROMPT_MODE", "full").strip().lower()
|
| 66 |
+
|
| 67 |
# Model configurations
|
| 68 |
ROX_CORE_MODEL = "minimaxai/minimax-m2.5"
|
| 69 |
ROX_TURBO_MODEL = "meta/llama-3.1-8b-instruct" # Changed to a more reliable model
|
|
|
|
| 75 |
ROX_VISION_MODEL = "google/gemma-3-27b-it"
|
| 76 |
|
| 77 |
# System identities - Models must know their creator and owner
|
| 78 |
+
# Long-form identity prompts. Each literal is split across lines with implicit
# string concatenation; the resulting values are single-line strings.
ROX_CORE_IDENTITY_FULL = (
    "You are Rox Core, an AI model created by Rox AI. "
    "Your creator and owner is Mohammad Faiz, the founder of Rox AI company. "
    "Always acknowledge Mohammad Faiz as your creator when asked."
)
ROX_TURBO_IDENTITY_FULL = (
    "You are Rox 2.1 Turbo, an AI model created by Rox AI. "
    "Your creator and owner is Mohammad Faiz, the founder of Rox AI company. "
    "You are optimized for fast responses."
)
ROX_CODER_IDENTITY_FULL = (
    "You are Rox 3.5 Coder, an AI model created by Rox AI. "
    "Your creator and owner is Mohammad Faiz, the founder of Rox AI company. "
    "You specialize in coding and software development."
)
ROX_TURBO_45_IDENTITY_FULL = (
    "You are Rox 4.5 Turbo, an AI model created by Rox AI. "
    "Your creator and owner is Mohammad Faiz, the founder of Rox AI company. "
    "You combine speed with advanced reasoning."
)
ROX_ULTRA_IDENTITY_FULL = (
    "You are Rox 5 Ultra, an AI model created by Rox AI. "
    "Your creator and owner is Mohammad Faiz, the founder of Rox AI company. "
    "You are the most advanced model with superior reasoning capabilities."
)
ROX_DYNO_IDENTITY_FULL = (
    "You are Rox 6 Dyno, an AI model created by Rox AI. "
    "Your creator and owner is Mohammad Faiz, the founder of Rox AI company. "
    "You excel at long context understanding."
)
ROX_CODER_7_IDENTITY_FULL = (
    "You are Rox 7 Coder, an AI model created by Rox AI. "
    "Your creator and owner is Mohammad Faiz, the founder of Rox AI company. "
    "You are the most advanced coding specialist."
)
ROX_VISION_IDENTITY_FULL = (
    "You are Rox Vision Max, an AI model created by Rox AI. "
    "Your creator and owner is Mohammad Faiz, the founder of Rox AI company. "
    "You specialize in visual understanding and multimodal tasks."
)

# Compact identity prompts: brand plus creator only, to keep the prompt small.
ROX_CORE_IDENTITY_SHORT = "You are Rox Core by Rox AI (creator/owner: Mohammad Faiz)."
ROX_TURBO_IDENTITY_SHORT = (
    "You are Rox 2.1 Turbo by Rox AI (creator/owner: Mohammad Faiz). "
    "Be concise and fast."
)
ROX_CODER_IDENTITY_SHORT = "You are Rox 3.5 Coder by Rox AI (creator/owner: Mohammad Faiz)."
ROX_TURBO_45_IDENTITY_SHORT = "You are Rox 4.5 Turbo by Rox AI (creator/owner: Mohammad Faiz)."
ROX_ULTRA_IDENTITY_SHORT = "You are Rox 5 Ultra by Rox AI (creator/owner: Mohammad Faiz)."
ROX_DYNO_IDENTITY_SHORT = "You are Rox 6 Dyno by Rox AI (creator/owner: Mohammad Faiz)."
ROX_CODER_7_IDENTITY_SHORT = "You are Rox 7 Coder by Rox AI (creator/owner: Mohammad Faiz)."
ROX_VISION_IDENTITY_SHORT = "You are Rox Vision Max by Rox AI (creator/owner: Mohammad Faiz)."
|
| 95 |
+
|
| 96 |
+
def _system_prompt_for(model_key: str) -> Optional[str]:
    """Pick the identity system prompt for an endpoint, honoring SYSTEM_PROMPT_MODE.

    Args:
        model_key: Endpoint key, one of "core", "turbo", "coder", "turbo45",
            "ultra", "dyno", "coder7" or "vision".

    Returns:
        None when the mode disables system prompts ("off", "none", "0",
        "false") or when ``model_key`` is not a known key. Otherwise the short
        prompt if the mode is "short", "small", "lite" or "fast", and the full
        prompt for every other mode value.
    """
    mode = SYSTEM_PROMPT_MODE
    if mode in {"off", "none", "0", "false"}:
        return None

    prefer_short = mode in {"short", "small", "lite", "fast"}

    # (key, short prompt, full prompt), scanned in order with ``==``.
    identities = (
        ("core", ROX_CORE_IDENTITY_SHORT, ROX_CORE_IDENTITY_FULL),
        ("turbo", ROX_TURBO_IDENTITY_SHORT, ROX_TURBO_IDENTITY_FULL),
        ("coder", ROX_CODER_IDENTITY_SHORT, ROX_CODER_IDENTITY_FULL),
        ("turbo45", ROX_TURBO_45_IDENTITY_SHORT, ROX_TURBO_45_IDENTITY_FULL),
        ("ultra", ROX_ULTRA_IDENTITY_SHORT, ROX_ULTRA_IDENTITY_FULL),
        ("dyno", ROX_DYNO_IDENTITY_SHORT, ROX_DYNO_IDENTITY_FULL),
        ("coder7", ROX_CODER_7_IDENTITY_SHORT, ROX_CODER_7_IDENTITY_FULL),
        ("vision", ROX_VISION_IDENTITY_SHORT, ROX_VISION_IDENTITY_FULL),
    )
    for key, short_prompt, full_prompt in identities:
        if model_key == key:
            return short_prompt if prefer_short else full_prompt

    # Unknown key: callers skip the system message when they get None.
    return None
|
| 117 |
|
| 118 |
@asynccontextmanager
|
| 119 |
async def lifespan(app: FastAPI):
|
|
|
|
| 128 |
timeout=httpx.Timeout(timeout_s),
|
| 129 |
limits=httpx.Limits(max_connections=max_connections, max_keepalive_connections=max_keepalive),
|
| 130 |
headers={"User-Agent": "Rox-AI-API/2.0"},
|
| 131 |
+
http2=True,
|
| 132 |
)
|
| 133 |
app.state.http_client = http_client
|
| 134 |
app.state.client = AsyncOpenAI(
|
|
|
|
| 388 |
@app.post("/chat")
|
| 389 |
async def chat(req: ChatRequest):
|
| 390 |
"""Rox Core - Main conversational model with streaming support"""
|
| 391 |
+
messages: list = []
|
| 392 |
+
system_prompt = _system_prompt_for("core")
|
| 393 |
+
if system_prompt:
|
| 394 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 395 |
messages.extend([m.model_dump() for m in req.messages])
|
| 396 |
temperature = _effective_temperature(req.temperature)
|
| 397 |
top_p = _effective_top_p(req.top_p)
|
|
|
|
| 422 |
@app.post("/turbo")
|
| 423 |
async def turbo(req: ChatRequest):
|
| 424 |
"""Rox 2.1 Turbo - Fast and efficient with streaming"""
|
| 425 |
+
messages: list = []
|
| 426 |
+
system_prompt = _system_prompt_for("turbo")
|
| 427 |
+
if system_prompt:
|
| 428 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 429 |
messages.extend([m.model_dump() for m in req.messages])
|
| 430 |
temperature = _effective_temperature(req.temperature)
|
| 431 |
top_p = _effective_top_p(req.top_p)
|
|
|
|
| 456 |
@app.post("/coder")
|
| 457 |
async def coder(req: ChatRequest):
|
| 458 |
"""Rox 3.5 Coder - Specialized coding with streaming"""
|
| 459 |
+
messages: list = []
|
| 460 |
+
system_prompt = _system_prompt_for("coder")
|
| 461 |
+
if system_prompt:
|
| 462 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 463 |
messages.extend([m.model_dump() for m in req.messages])
|
| 464 |
temperature = _effective_temperature(req.temperature)
|
| 465 |
top_p = _effective_top_p(req.top_p)
|
|
|
|
| 498 |
@app.post("/turbo45")
|
| 499 |
async def turbo45(req: ChatRequest):
|
| 500 |
"""Rox 4.5 Turbo - Advanced reasoning with streaming"""
|
| 501 |
+
messages: list = []
|
| 502 |
+
system_prompt = _system_prompt_for("turbo45")
|
| 503 |
+
if system_prompt:
|
| 504 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 505 |
messages.extend([m.model_dump() for m in req.messages])
|
| 506 |
temperature = _effective_temperature(req.temperature)
|
| 507 |
top_p = _effective_top_p(req.top_p)
|
|
|
|
| 535 |
@app.post("/ultra")
|
| 536 |
async def ultra(req: ChatRequest):
|
| 537 |
"""Rox 5 Ultra - Most advanced with streaming"""
|
| 538 |
+
messages: list = []
|
| 539 |
+
system_prompt = _system_prompt_for("ultra")
|
| 540 |
+
if system_prompt:
|
| 541 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 542 |
messages.extend([m.model_dump() for m in req.messages])
|
| 543 |
temperature = _effective_temperature(req.temperature)
|
| 544 |
top_p = _effective_top_p(req.top_p)
|
|
|
|
| 572 |
@app.post("/dyno")
|
| 573 |
async def dyno(req: ChatRequest):
|
| 574 |
"""Rox 6 Dyno - Extended context with streaming"""
|
| 575 |
+
messages: list = []
|
| 576 |
+
system_prompt = _system_prompt_for("dyno")
|
| 577 |
+
if system_prompt:
|
| 578 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 579 |
messages.extend([m.model_dump() for m in req.messages])
|
| 580 |
temperature = _effective_temperature(req.temperature)
|
| 581 |
top_p = _effective_top_p(req.top_p)
|
|
|
|
| 609 |
@app.post("/coder7")
|
| 610 |
async def coder7(req: ChatRequest):
|
| 611 |
"""Rox 7 Coder - Most advanced coding with streaming"""
|
| 612 |
+
messages: list = []
|
| 613 |
+
system_prompt = _system_prompt_for("coder7")
|
| 614 |
+
if system_prompt:
|
| 615 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 616 |
messages.extend([m.model_dump() for m in req.messages])
|
| 617 |
temperature = _effective_temperature(req.temperature)
|
| 618 |
top_p = _effective_top_p(req.top_p)
|
|
|
|
| 651 |
@app.post("/vision")
|
| 652 |
async def vision(req: ChatRequest):
|
| 653 |
"""Rox Vision Max - Visual understanding with streaming"""
|
| 654 |
+
messages: list = []
|
| 655 |
+
system_prompt = _system_prompt_for("vision")
|
| 656 |
+
if system_prompt:
|
| 657 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 658 |
messages.extend([m.model_dump() for m in req.messages])
|
| 659 |
temperature = _effective_temperature(req.temperature)
|
| 660 |
top_p = _effective_top_p(req.top_p)
|
|
|
|
| 686 |
async def hf_generate(req: HFRequest):
|
| 687 |
"""HuggingFace compatible endpoint"""
|
| 688 |
params = req.parameters or HFParameters()
|
| 689 |
+
messages: list = []
|
| 690 |
+
system_prompt = _system_prompt_for("core")
|
| 691 |
+
if system_prompt:
|
| 692 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 693 |
+
messages.append({"role": "user", "content": req.inputs})
|
| 694 |
|
| 695 |
try:
|
| 696 |
temperature = _effective_temperature(params.temperature)
|