Update server.py
Browse files
server.py
CHANGED
|
@@ -5,7 +5,7 @@ import time
|
|
| 5 |
import uuid
|
| 6 |
import asyncio
|
| 7 |
from typing import List, Optional, AsyncGenerator, Iterable
|
| 8 |
-
from contextlib import asynccontextmanager
|
| 9 |
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
from fastapi import FastAPI, HTTPException, Request, Response
|
|
@@ -44,7 +44,8 @@ def _parse_cors_origins(value: str) -> List[str]:
|
|
| 44 |
|
| 45 |
CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
|
| 46 |
GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
# Fast-by-default generation settings (still fully overridable per request)
|
| 50 |
DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
|
|
@@ -52,11 +53,14 @@ DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.95"))
|
|
| 52 |
DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "1024"))
|
| 53 |
|
| 54 |
# Concurrency guard to keep tail latency low under spikes
|
| 55 |
-
MAX_INFLIGHT_REQUESTS = int(os.getenv("MAX_INFLIGHT_REQUESTS", "
|
| 56 |
|
| 57 |
# "Thinking" increases latency; keep opt-in via env
|
| 58 |
ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
|
| 59 |
|
|
|
|
|
|
|
|
|
|
| 60 |
# System prompt size directly impacts latency/cost. Make it configurable.
|
| 61 |
# Modes:
|
| 62 |
# - full: current long identity prompts (default, backward compatible)
|
|
@@ -138,7 +142,10 @@ async def lifespan(app: FastAPI):
|
|
| 138 |
max_retries=max_retries,
|
| 139 |
http_client=http_client,
|
| 140 |
)
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
try:
|
| 144 |
yield
|
|
@@ -173,14 +180,15 @@ async def add_request_context(request: Request, call_next):
|
|
| 173 |
request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
|
| 174 |
start = time.perf_counter()
|
| 175 |
try:
|
| 176 |
-
#
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
| 184 |
|
| 185 |
response: Response = await call_next(request)
|
| 186 |
finally:
|
|
@@ -210,10 +218,10 @@ def _client(app_: FastAPI) -> AsyncOpenAI:
|
|
| 210 |
raise RuntimeError("Client not initialized")
|
| 211 |
return c
|
| 212 |
|
| 213 |
-
def
|
| 214 |
s = getattr(app_.state, "inflight_semaphore", None)
|
| 215 |
if s is None:
|
| 216 |
-
|
| 217 |
return s
|
| 218 |
|
| 219 |
def _effective_temperature(value: Optional[float]) -> float:
|
|
@@ -222,11 +230,14 @@ def _effective_temperature(value: Optional[float]) -> float:
|
|
| 222 |
def _effective_top_p(value: Optional[float]) -> float:
|
| 223 |
return DEFAULT_TOP_P if value is None else value
|
| 224 |
|
| 225 |
-
def _effective_max_tokens(value: Optional[int],
|
| 226 |
v = DEFAULT_MAX_TOKENS if value is None else value
|
| 227 |
if v < 1:
|
| 228 |
v = DEFAULT_MAX_TOKENS
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
def _sse_headers() -> dict:
|
| 232 |
# Helps proxies (nginx) avoid buffering and keeps SSE responsive
|
|
@@ -249,7 +260,7 @@ async def stream_response(
|
|
| 249 |
) -> AsyncGenerator[str, None]:
|
| 250 |
"""Stream responses from OpenAI API"""
|
| 251 |
try:
|
| 252 |
-
async with
|
| 253 |
stream = await _client(app_).chat.completions.create(
|
| 254 |
model=model,
|
| 255 |
messages=messages,
|
|
@@ -405,7 +416,7 @@ async def chat(req: ChatRequest):
|
|
| 405 |
)
|
| 406 |
|
| 407 |
try:
|
| 408 |
-
async with
|
| 409 |
completion = await _client(app).chat.completions.create(
|
| 410 |
model=ROX_CORE_MODEL,
|
| 411 |
messages=messages,
|
|
@@ -439,7 +450,7 @@ async def turbo(req: ChatRequest):
|
|
| 439 |
)
|
| 440 |
|
| 441 |
try:
|
| 442 |
-
async with
|
| 443 |
completion = await _client(app).chat.completions.create(
|
| 444 |
model=ROX_TURBO_MODEL,
|
| 445 |
messages=messages,
|
|
@@ -480,7 +491,7 @@ async def coder(req: ChatRequest):
|
|
| 480 |
)
|
| 481 |
|
| 482 |
try:
|
| 483 |
-
async with
|
| 484 |
completion = await _client(app).chat.completions.create(
|
| 485 |
model=ROX_CODER_MODEL,
|
| 486 |
messages=messages,
|
|
@@ -517,7 +528,7 @@ async def turbo45(req: ChatRequest):
|
|
| 517 |
)
|
| 518 |
|
| 519 |
try:
|
| 520 |
-
async with
|
| 521 |
completion = await _client(app).chat.completions.create(
|
| 522 |
model=ROX_TURBO_45_MODEL,
|
| 523 |
messages=messages,
|
|
@@ -554,7 +565,7 @@ async def ultra(req: ChatRequest):
|
|
| 554 |
)
|
| 555 |
|
| 556 |
try:
|
| 557 |
-
async with
|
| 558 |
completion = await _client(app).chat.completions.create(
|
| 559 |
model=ROX_ULTRA_MODEL,
|
| 560 |
messages=messages,
|
|
@@ -591,7 +602,7 @@ async def dyno(req: ChatRequest):
|
|
| 591 |
)
|
| 592 |
|
| 593 |
try:
|
| 594 |
-
async with
|
| 595 |
completion = await _client(app).chat.completions.create(
|
| 596 |
model=ROX_DYNO_MODEL,
|
| 597 |
messages=messages,
|
|
@@ -633,7 +644,7 @@ async def coder7(req: ChatRequest):
|
|
| 633 |
)
|
| 634 |
|
| 635 |
try:
|
| 636 |
-
async with
|
| 637 |
completion = await _client(app).chat.completions.create(
|
| 638 |
model=ROX_CODER_7_MODEL,
|
| 639 |
messages=messages,
|
|
@@ -668,7 +679,7 @@ async def vision(req: ChatRequest):
|
|
| 668 |
)
|
| 669 |
|
| 670 |
try:
|
| 671 |
-
async with
|
| 672 |
completion = await _client(app).chat.completions.create(
|
| 673 |
model=ROX_VISION_MODEL,
|
| 674 |
messages=messages,
|
|
@@ -696,7 +707,7 @@ async def hf_generate(req: HFRequest):
|
|
| 696 |
temperature = _effective_temperature(params.temperature)
|
| 697 |
top_p = _effective_top_p(params.top_p)
|
| 698 |
max_tokens = _effective_max_tokens(params.max_new_tokens, 8192)
|
| 699 |
-
async with
|
| 700 |
completion = await _client(app).chat.completions.create(
|
| 701 |
model=ROX_CORE_MODEL,
|
| 702 |
messages=messages,
|
|
|
|
| 5 |
import uuid
|
| 6 |
import asyncio
|
| 7 |
from typing import List, Optional, AsyncGenerator, Iterable
|
| 8 |
+
from contextlib import asynccontextmanager, nullcontext
|
| 9 |
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
from fastapi import FastAPI, HTTPException, Request, Response
|
|
|
|
| 44 |
|
| 45 |
CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
|
| 46 |
GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
|
| 47 |
+
# Optional safety checks (can be disabled by setting 0)
|
| 48 |
+
MAX_REQUEST_BYTES = int(os.getenv("MAX_REQUEST_BYTES", "0")) # 0 = disabled
|
| 49 |
|
| 50 |
# Fast-by-default generation settings (still fully overridable per request)
|
| 51 |
DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
|
|
|
|
| 53 |
DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "1024"))
|
| 54 |
|
| 55 |
# Concurrency guard to keep tail latency low under spikes
|
| 56 |
+
MAX_INFLIGHT_REQUESTS = int(os.getenv("MAX_INFLIGHT_REQUESTS", "0")) # 0 = disabled
|
| 57 |
|
| 58 |
# "Thinking" increases latency; keep opt-in via env
|
| 59 |
ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
|
| 60 |
|
| 61 |
+
# Optional hard cap for max_tokens (0 = disabled). Note: upstream providers may still enforce their own limits.
|
| 62 |
+
MAX_TOKENS_HARD_CAP = int(os.getenv("MAX_TOKENS_HARD_CAP", "0"))
|
| 63 |
+
|
| 64 |
# System prompt size directly impacts latency/cost. Make it configurable.
|
| 65 |
# Modes:
|
| 66 |
# - full: current long identity prompts (default, backward compatible)
|
|
|
|
| 142 |
max_retries=max_retries,
|
| 143 |
http_client=http_client,
|
| 144 |
)
|
| 145 |
+
if MAX_INFLIGHT_REQUESTS > 0:
|
| 146 |
+
app.state.inflight_semaphore = asyncio.Semaphore(MAX_INFLIGHT_REQUESTS)
|
| 147 |
+
else:
|
| 148 |
+
app.state.inflight_semaphore = None
|
| 149 |
|
| 150 |
try:
|
| 151 |
yield
|
|
|
|
| 180 |
request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
|
| 181 |
start = time.perf_counter()
|
| 182 |
try:
|
| 183 |
+
# Optional body-size protection (disabled by default)
|
| 184 |
+
if MAX_REQUEST_BYTES > 0:
|
| 185 |
+
cl = request.headers.get("content-length")
|
| 186 |
+
if cl is not None:
|
| 187 |
+
try:
|
| 188 |
+
if int(cl) > MAX_REQUEST_BYTES:
|
| 189 |
+
return JSONResponse(status_code=413, content={"error": "Request too large"})
|
| 190 |
+
except ValueError:
|
| 191 |
+
return JSONResponse(status_code=400, content={"error": "Invalid Content-Length"})
|
| 192 |
|
| 193 |
response: Response = await call_next(request)
|
| 194 |
finally:
|
|
|
|
| 218 |
raise RuntimeError("Client not initialized")
|
| 219 |
return c
|
| 220 |
|
| 221 |
+
def _inflight_context(app_: FastAPI):
    """Return an async context manager guarding in-flight request concurrency.

    Yields the app's configured semaphore when one exists on
    ``app_.state.inflight_semaphore``; otherwise a no-op ``nullcontext`` so
    callers can always write ``async with _inflight_context(app_): ...``
    without branching on whether the limit is enabled.

    NOTE(review): ``async with nullcontext()`` requires Python 3.10+ — confirm
    the deployment interpreter version.
    """
    semaphore = getattr(app_.state, "inflight_semaphore", None)
    return nullcontext() if semaphore is None else semaphore
|
| 226 |
|
| 227 |
def _effective_temperature(value: Optional[float]) -> float:
|
|
|
|
| 230 |
def _effective_top_p(value: Optional[float]) -> float:
|
| 231 |
return DEFAULT_TOP_P if value is None else value
|
| 232 |
|
| 233 |
+
def _effective_max_tokens(value: Optional[int], model_cap: int) -> int:
    """Resolve the effective max-token budget for a request.

    Falls back to ``DEFAULT_MAX_TOKENS`` when *value* is ``None`` or below 1,
    then — only if ``MAX_TOKENS_HARD_CAP`` is enabled (> 0) — clamps the
    result to both the per-model cap and the global hard cap.
    """
    tokens = value if value is not None else DEFAULT_MAX_TOKENS
    if tokens < 1:
        tokens = DEFAULT_MAX_TOKENS
    if MAX_TOKENS_HARD_CAP <= 0:
        # No hard cap enforced at this API layer; upstream providers may
        # still apply their own maximum.
        return tokens
    return min(tokens, model_cap, MAX_TOKENS_HARD_CAP)
|
| 241 |
|
| 242 |
def _sse_headers() -> dict:
|
| 243 |
# Helps proxies (nginx) avoid buffering and keeps SSE responsive
|
|
|
|
| 260 |
) -> AsyncGenerator[str, None]:
|
| 261 |
"""Stream responses from OpenAI API"""
|
| 262 |
try:
|
| 263 |
+
async with _inflight_context(app_):
|
| 264 |
stream = await _client(app_).chat.completions.create(
|
| 265 |
model=model,
|
| 266 |
messages=messages,
|
|
|
|
| 416 |
)
|
| 417 |
|
| 418 |
try:
|
| 419 |
+
async with _inflight_context(app):
|
| 420 |
completion = await _client(app).chat.completions.create(
|
| 421 |
model=ROX_CORE_MODEL,
|
| 422 |
messages=messages,
|
|
|
|
| 450 |
)
|
| 451 |
|
| 452 |
try:
|
| 453 |
+
async with _inflight_context(app):
|
| 454 |
completion = await _client(app).chat.completions.create(
|
| 455 |
model=ROX_TURBO_MODEL,
|
| 456 |
messages=messages,
|
|
|
|
| 491 |
)
|
| 492 |
|
| 493 |
try:
|
| 494 |
+
async with _inflight_context(app):
|
| 495 |
completion = await _client(app).chat.completions.create(
|
| 496 |
model=ROX_CODER_MODEL,
|
| 497 |
messages=messages,
|
|
|
|
| 528 |
)
|
| 529 |
|
| 530 |
try:
|
| 531 |
+
async with _inflight_context(app):
|
| 532 |
completion = await _client(app).chat.completions.create(
|
| 533 |
model=ROX_TURBO_45_MODEL,
|
| 534 |
messages=messages,
|
|
|
|
| 565 |
)
|
| 566 |
|
| 567 |
try:
|
| 568 |
+
async with _inflight_context(app):
|
| 569 |
completion = await _client(app).chat.completions.create(
|
| 570 |
model=ROX_ULTRA_MODEL,
|
| 571 |
messages=messages,
|
|
|
|
| 602 |
)
|
| 603 |
|
| 604 |
try:
|
| 605 |
+
async with _inflight_context(app):
|
| 606 |
completion = await _client(app).chat.completions.create(
|
| 607 |
model=ROX_DYNO_MODEL,
|
| 608 |
messages=messages,
|
|
|
|
| 644 |
)
|
| 645 |
|
| 646 |
try:
|
| 647 |
+
async with _inflight_context(app):
|
| 648 |
completion = await _client(app).chat.completions.create(
|
| 649 |
model=ROX_CODER_7_MODEL,
|
| 650 |
messages=messages,
|
|
|
|
| 679 |
)
|
| 680 |
|
| 681 |
try:
|
| 682 |
+
async with _inflight_context(app):
|
| 683 |
completion = await _client(app).chat.completions.create(
|
| 684 |
model=ROX_VISION_MODEL,
|
| 685 |
messages=messages,
|
|
|
|
| 707 |
temperature = _effective_temperature(params.temperature)
|
| 708 |
top_p = _effective_top_p(params.top_p)
|
| 709 |
max_tokens = _effective_max_tokens(params.max_new_tokens, 8192)
|
| 710 |
+
async with _inflight_context(app):
|
| 711 |
completion = await _client(app).chat.completions.create(
|
| 712 |
model=ROX_CORE_MODEL,
|
| 713 |
messages=messages,
|