Update server.py
Browse files
server.py
CHANGED
|
@@ -5,7 +5,7 @@ import time
|
|
| 5 |
import uuid
|
| 6 |
import asyncio
|
| 7 |
from typing import List, Optional, AsyncGenerator, Iterable
|
| 8 |
-
from contextlib import asynccontextmanager
|
| 9 |
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
from fastapi import FastAPI, HTTPException, Request, Response
|
|
@@ -44,7 +44,8 @@ def _parse_cors_origins(value: str) -> List[str]:
|
|
| 44 |
|
| 45 |
CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
|
| 46 |
GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
# Fast-by-default generation settings (still fully overridable per request)
|
| 50 |
DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
|
|
@@ -52,11 +53,14 @@ DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.95"))
|
|
| 52 |
DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "1024"))
|
| 53 |
|
| 54 |
# Concurrency guard to keep tail latency low under spikes
|
| 55 |
-
MAX_INFLIGHT_REQUESTS = int(os.getenv("MAX_INFLIGHT_REQUESTS", "
|
| 56 |
|
| 57 |
# "Thinking" increases latency; keep opt-in via env
|
| 58 |
ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
|
| 59 |
|
|
|
|
|
|
|
|
|
|
| 60 |
# System prompt size directly impacts latency/cost. Make it configurable.
|
| 61 |
# Modes:
|
| 62 |
# - full: current long identity prompts (default, backward compatible)
|
|
@@ -138,7 +142,10 @@ async def lifespan(app: FastAPI):
|
|
| 138 |
max_retries=max_retries,
|
| 139 |
http_client=http_client,
|
| 140 |
)
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
try:
|
| 144 |
yield
|
|
@@ -173,14 +180,15 @@ async def add_request_context(request: Request, call_next):
|
|
| 173 |
request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
|
| 174 |
start = time.perf_counter()
|
| 175 |
try:
|
| 176 |
-
#
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
| 184 |
|
| 185 |
response: Response = await call_next(request)
|
| 186 |
finally:
|
|
@@ -210,10 +218,10 @@ def _client(app_: FastAPI) -> AsyncOpenAI:
|
|
| 210 |
raise RuntimeError("Client not initialized")
|
| 211 |
return c
|
| 212 |
|
| 213 |
-
def
|
| 214 |
s = getattr(app_.state, "inflight_semaphore", None)
|
| 215 |
if s is None:
|
| 216 |
-
|
| 217 |
return s
|
| 218 |
|
| 219 |
def _effective_temperature(value: Optional[float]) -> float:
|
|
@@ -222,11 +230,14 @@ def _effective_temperature(value: Optional[float]) -> float:
|
|
| 222 |
def _effective_top_p(value: Optional[float]) -> float:
|
| 223 |
return DEFAULT_TOP_P if value is None else value
|
| 224 |
|
| 225 |
-
def _effective_max_tokens(value: Optional[int],
|
| 226 |
v = DEFAULT_MAX_TOKENS if value is None else value
|
| 227 |
if v < 1:
|
| 228 |
v = DEFAULT_MAX_TOKENS
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
def _sse_headers() -> dict:
|
| 232 |
# Helps proxies (nginx) avoid buffering and keeps SSE responsive
|
|
@@ -249,7 +260,7 @@ async def stream_response(
|
|
| 249 |
) -> AsyncGenerator[str, None]:
|
| 250 |
"""Stream responses from OpenAI API"""
|
| 251 |
try:
|
| 252 |
-
async with
|
| 253 |
stream = await _client(app_).chat.completions.create(
|
| 254 |
model=model,
|
| 255 |
messages=messages,
|
|
@@ -405,7 +416,7 @@ async def chat(req: ChatRequest):
|
|
| 405 |
)
|
| 406 |
|
| 407 |
try:
|
| 408 |
-
async with
|
| 409 |
completion = await _client(app).chat.completions.create(
|
| 410 |
model=ROX_CORE_MODEL,
|
| 411 |
messages=messages,
|
|
@@ -439,7 +450,7 @@ async def turbo(req: ChatRequest):
|
|
| 439 |
)
|
| 440 |
|
| 441 |
try:
|
| 442 |
-
async with
|
| 443 |
completion = await _client(app).chat.completions.create(
|
| 444 |
model=ROX_TURBO_MODEL,
|
| 445 |
messages=messages,
|
|
@@ -480,7 +491,7 @@ async def coder(req: ChatRequest):
|
|
| 480 |
)
|
| 481 |
|
| 482 |
try:
|
| 483 |
-
async with
|
| 484 |
completion = await _client(app).chat.completions.create(
|
| 485 |
model=ROX_CODER_MODEL,
|
| 486 |
messages=messages,
|
|
@@ -517,7 +528,7 @@ async def turbo45(req: ChatRequest):
|
|
| 517 |
)
|
| 518 |
|
| 519 |
try:
|
| 520 |
-
async with
|
| 521 |
completion = await _client(app).chat.completions.create(
|
| 522 |
model=ROX_TURBO_45_MODEL,
|
| 523 |
messages=messages,
|
|
@@ -554,7 +565,7 @@ async def ultra(req: ChatRequest):
|
|
| 554 |
)
|
| 555 |
|
| 556 |
try:
|
| 557 |
-
async with
|
| 558 |
completion = await _client(app).chat.completions.create(
|
| 559 |
model=ROX_ULTRA_MODEL,
|
| 560 |
messages=messages,
|
|
@@ -591,7 +602,7 @@ async def dyno(req: ChatRequest):
|
|
| 591 |
)
|
| 592 |
|
| 593 |
try:
|
| 594 |
-
async with
|
| 595 |
completion = await _client(app).chat.completions.create(
|
| 596 |
model=ROX_DYNO_MODEL,
|
| 597 |
messages=messages,
|
|
@@ -633,7 +644,7 @@ async def coder7(req: ChatRequest):
|
|
| 633 |
)
|
| 634 |
|
| 635 |
try:
|
| 636 |
-
async with
|
| 637 |
completion = await _client(app).chat.completions.create(
|
| 638 |
model=ROX_CODER_7_MODEL,
|
| 639 |
messages=messages,
|
|
@@ -668,7 +679,7 @@ async def vision(req: ChatRequest):
|
|
| 668 |
)
|
| 669 |
|
| 670 |
try:
|
| 671 |
-
async with
|
| 672 |
completion = await _client(app).chat.completions.create(
|
| 673 |
model=ROX_VISION_MODEL,
|
| 674 |
messages=messages,
|
|
@@ -696,7 +707,7 @@ async def hf_generate(req: HFRequest):
|
|
| 696 |
temperature = _effective_temperature(params.temperature)
|
| 697 |
top_p = _effective_top_p(params.top_p)
|
| 698 |
max_tokens = _effective_max_tokens(params.max_new_tokens, 8192)
|
| 699 |
-
async with
|
| 700 |
completion = await _client(app).chat.completions.create(
|
| 701 |
model=ROX_CORE_MODEL,
|
| 702 |
messages=messages,
|
|
|
|
| 5 |
import uuid
|
| 6 |
import asyncio
|
| 7 |
from typing import List, Optional, AsyncGenerator, Iterable
|
| 8 |
+
from contextlib import asynccontextmanager, nullcontext
|
| 9 |
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
from fastapi import FastAPI, HTTPException, Request, Response
|
|
|
|
| 44 |
|
| 45 |
CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
|
| 46 |
GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
|
| 47 |
+
# Optional safety checks (can be disabled by setting 0)
|
| 48 |
+
MAX_REQUEST_BYTES = int(os.getenv("MAX_REQUEST_BYTES", "0")) # 0 = disabled
|
| 49 |
|
| 50 |
# Fast-by-default generation settings (still fully overridable per request)
|
| 51 |
DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
|
|
|
|
| 53 |
DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "1024"))
|
| 54 |
|
| 55 |
# Concurrency guard to keep tail latency low under spikes
|
| 56 |
+
MAX_INFLIGHT_REQUESTS = int(os.getenv("MAX_INFLIGHT_REQUESTS", "0")) # 0 = disabled
|
| 57 |
|
| 58 |
# "Thinking" increases latency; keep opt-in via env
|
| 59 |
ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
|
| 60 |
|
| 61 |
+
# Optional hard cap for max_tokens (0 = disabled). Note: upstream providers may still enforce their own limits.
|
| 62 |
+
MAX_TOKENS_HARD_CAP = int(os.getenv("MAX_TOKENS_HARD_CAP", "0"))
|
| 63 |
+
|
| 64 |
# System prompt size directly impacts latency/cost. Make it configurable.
|
| 65 |
# Modes:
|
| 66 |
# - full: current long identity prompts (default, backward compatible)
|
|
|
|
| 142 |
max_retries=max_retries,
|
| 143 |
http_client=http_client,
|
| 144 |
)
|
| 145 |
+
if MAX_INFLIGHT_REQUESTS > 0:
|
| 146 |
+
app.state.inflight_semaphore = asyncio.Semaphore(MAX_INFLIGHT_REQUESTS)
|
| 147 |
+
else:
|
| 148 |
+
app.state.inflight_semaphore = None
|
| 149 |
|
| 150 |
try:
|
| 151 |
yield
|
|
|
|
| 180 |
request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
|
| 181 |
start = time.perf_counter()
|
| 182 |
try:
|
| 183 |
+
# Optional body-size protection (disabled by default)
|
| 184 |
+
if MAX_REQUEST_BYTES > 0:
|
| 185 |
+
cl = request.headers.get("content-length")
|
| 186 |
+
if cl is not None:
|
| 187 |
+
try:
|
| 188 |
+
if int(cl) > MAX_REQUEST_BYTES:
|
| 189 |
+
return JSONResponse(status_code=413, content={"error": "Request too large"})
|
| 190 |
+
except ValueError:
|
| 191 |
+
return JSONResponse(status_code=400, content={"error": "Invalid Content-Length"})
|
| 192 |
|
| 193 |
response: Response = await call_next(request)
|
| 194 |
finally:
|
|
|
|
| 218 |
raise RuntimeError("Client not initialized")
|
| 219 |
return c
|
| 220 |
|
| 221 |
+
def _inflight_context(app_: FastAPI):
    """Return an async context manager guarding in-flight request concurrency.

    Yields the app's configured semaphore when one exists on
    ``app_.state.inflight_semaphore``; otherwise a no-op ``nullcontext`` so
    callers can always write ``async with _inflight_context(app_): ...``
    without branching on whether the limit is enabled.

    NOTE(review): ``async with nullcontext()`` requires Python 3.10+ — confirm
    the deployment interpreter version.
    """
    semaphore = getattr(app_.state, "inflight_semaphore", None)
    return nullcontext() if semaphore is None else semaphore
|
| 226 |
|
| 227 |
def _effective_temperature(value: Optional[float]) -> float:
|
|
|
|
| 230 |
def _effective_top_p(value: Optional[float]) -> float:
|
| 231 |
return DEFAULT_TOP_P if value is None else value
|
| 232 |
|
| 233 |
+
def _effective_max_tokens(value: Optional[int], model_cap: int) -> int:
    """Resolve the effective max-token budget for a request.

    Falls back to ``DEFAULT_MAX_TOKENS`` when *value* is ``None`` or below 1,
    then — only if ``MAX_TOKENS_HARD_CAP`` is enabled (> 0) — clamps the
    result to both the per-model cap and the global hard cap.
    """
    tokens = value if value is not None else DEFAULT_MAX_TOKENS
    if tokens < 1:
        tokens = DEFAULT_MAX_TOKENS
    if MAX_TOKENS_HARD_CAP <= 0:
        # No hard cap enforced at this API layer; upstream providers may
        # still apply their own maximum.
        return tokens
    return min(tokens, model_cap, MAX_TOKENS_HARD_CAP)
|
| 241 |
|
| 242 |
def _sse_headers() -> dict:
|
| 243 |
# Helps proxies (nginx) avoid buffering and keeps SSE responsive
|
|
|
|
| 260 |
) -> AsyncGenerator[str, None]:
|
| 261 |
"""Stream responses from OpenAI API"""
|
| 262 |
try:
|
| 263 |
+
async with _inflight_context(app_):
|
| 264 |
stream = await _client(app_).chat.completions.create(
|
| 265 |
model=model,
|
| 266 |
messages=messages,
|
|
|
|
| 416 |
)
|
| 417 |
|
| 418 |
try:
|
| 419 |
+
async with _inflight_context(app):
|
| 420 |
completion = await _client(app).chat.completions.create(
|
| 421 |
model=ROX_CORE_MODEL,
|
| 422 |
messages=messages,
|
|
|
|
| 450 |
)
|
| 451 |
|
| 452 |
try:
|
| 453 |
+
async with _inflight_context(app):
|
| 454 |
completion = await _client(app).chat.completions.create(
|
| 455 |
model=ROX_TURBO_MODEL,
|
| 456 |
messages=messages,
|
|
|
|
| 491 |
)
|
| 492 |
|
| 493 |
try:
|
| 494 |
+
async with _inflight_context(app):
|
| 495 |
completion = await _client(app).chat.completions.create(
|
| 496 |
model=ROX_CODER_MODEL,
|
| 497 |
messages=messages,
|
|
|
|
| 528 |
)
|
| 529 |
|
| 530 |
try:
|
| 531 |
+
async with _inflight_context(app):
|
| 532 |
completion = await _client(app).chat.completions.create(
|
| 533 |
model=ROX_TURBO_45_MODEL,
|
| 534 |
messages=messages,
|
|
|
|
| 565 |
)
|
| 566 |
|
| 567 |
try:
|
| 568 |
+
async with _inflight_context(app):
|
| 569 |
completion = await _client(app).chat.completions.create(
|
| 570 |
model=ROX_ULTRA_MODEL,
|
| 571 |
messages=messages,
|
|
|
|
| 602 |
)
|
| 603 |
|
| 604 |
try:
|
| 605 |
+
async with _inflight_context(app):
|
| 606 |
completion = await _client(app).chat.completions.create(
|
| 607 |
model=ROX_DYNO_MODEL,
|
| 608 |
messages=messages,
|
|
|
|
| 644 |
)
|
| 645 |
|
| 646 |
try:
|
| 647 |
+
async with _inflight_context(app):
|
| 648 |
completion = await _client(app).chat.completions.create(
|
| 649 |
model=ROX_CODER_7_MODEL,
|
| 650 |
messages=messages,
|
|
|
|
| 679 |
)
|
| 680 |
|
| 681 |
try:
|
| 682 |
+
async with _inflight_context(app):
|
| 683 |
completion = await _client(app).chat.completions.create(
|
| 684 |
model=ROX_VISION_MODEL,
|
| 685 |
messages=messages,
|
|
|
|
| 707 |
temperature = _effective_temperature(params.temperature)
|
| 708 |
top_p = _effective_top_p(params.top_p)
|
| 709 |
max_tokens = _effective_max_tokens(params.max_new_tokens, 8192)
|
| 710 |
+
async with _inflight_context(app):
|
| 711 |
completion = await _client(app).chat.completions.create(
|
| 712 |
model=ROX_CORE_MODEL,
|
| 713 |
messages=messages,
|