Rox-Turbo committed on
Commit
f2619c7
·
verified ·
1 Parent(s): acd4ef1

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +37 -26
server.py CHANGED
@@ -5,7 +5,7 @@ import time
5
  import uuid
6
  import asyncio
7
  from typing import List, Optional, AsyncGenerator, Iterable
8
- from contextlib import asynccontextmanager
9
 
10
  from dotenv import load_dotenv
11
  from fastapi import FastAPI, HTTPException, Request, Response
@@ -44,7 +44,8 @@ def _parse_cors_origins(value: str) -> List[str]:
44
 
45
  CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
46
  GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
47
- MAX_REQUEST_BYTES = int(os.getenv("MAX_REQUEST_BYTES", str(1_000_000))) # 1MB default
 
48
 
49
  # Fast-by-default generation settings (still fully overridable per request)
50
  DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
@@ -52,11 +53,14 @@ DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.95"))
52
  DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "1024"))
53
 
54
  # Concurrency guard to keep tail latency low under spikes
55
- MAX_INFLIGHT_REQUESTS = int(os.getenv("MAX_INFLIGHT_REQUESTS", "200"))
56
 
57
  # "Thinking" increases latency; keep opt-in via env
58
  ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
59
 
 
 
 
60
  # System prompt size directly impacts latency/cost. Make it configurable.
61
  # Modes:
62
  # - full: current long identity prompts (default, backward compatible)
@@ -138,7 +142,10 @@ async def lifespan(app: FastAPI):
138
  max_retries=max_retries,
139
  http_client=http_client,
140
  )
141
- app.state.inflight_semaphore = asyncio.Semaphore(MAX_INFLIGHT_REQUESTS)
 
 
 
142
 
143
  try:
144
  yield
@@ -173,14 +180,15 @@ async def add_request_context(request: Request, call_next):
173
  request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
174
  start = time.perf_counter()
175
  try:
176
- # Protect server from huge bodies (DoS / latency blowups)
177
- cl = request.headers.get("content-length")
178
- if cl is not None:
179
- try:
180
- if int(cl) > MAX_REQUEST_BYTES:
181
- return JSONResponse(status_code=413, content={"error": "Request too large"})
182
- except ValueError:
183
- return JSONResponse(status_code=400, content={"error": "Invalid Content-Length"})
 
184
 
185
  response: Response = await call_next(request)
186
  finally:
@@ -210,10 +218,10 @@ def _client(app_: FastAPI) -> AsyncOpenAI:
210
  raise RuntimeError("Client not initialized")
211
  return c
212
 
213
- def _semaphore(app_: FastAPI) -> asyncio.Semaphore:
214
  s = getattr(app_.state, "inflight_semaphore", None)
215
  if s is None:
216
- raise RuntimeError("Semaphore not initialized")
217
  return s
218
 
219
  def _effective_temperature(value: Optional[float]) -> float:
@@ -222,11 +230,14 @@ def _effective_temperature(value: Optional[float]) -> float:
222
  def _effective_top_p(value: Optional[float]) -> float:
223
  return DEFAULT_TOP_P if value is None else value
224
 
225
- def _effective_max_tokens(value: Optional[int], cap: int) -> int:
226
  v = DEFAULT_MAX_TOKENS if value is None else value
227
  if v < 1:
228
  v = DEFAULT_MAX_TOKENS
229
- return min(v, cap)
 
 
 
230
 
231
  def _sse_headers() -> dict:
232
  # Helps proxies (nginx) avoid buffering and keeps SSE responsive
@@ -249,7 +260,7 @@ async def stream_response(
249
  ) -> AsyncGenerator[str, None]:
250
  """Stream responses from OpenAI API"""
251
  try:
252
- async with _semaphore(app_):
253
  stream = await _client(app_).chat.completions.create(
254
  model=model,
255
  messages=messages,
@@ -405,7 +416,7 @@ async def chat(req: ChatRequest):
405
  )
406
 
407
  try:
408
- async with _semaphore(app):
409
  completion = await _client(app).chat.completions.create(
410
  model=ROX_CORE_MODEL,
411
  messages=messages,
@@ -439,7 +450,7 @@ async def turbo(req: ChatRequest):
439
  )
440
 
441
  try:
442
- async with _semaphore(app):
443
  completion = await _client(app).chat.completions.create(
444
  model=ROX_TURBO_MODEL,
445
  messages=messages,
@@ -480,7 +491,7 @@ async def coder(req: ChatRequest):
480
  )
481
 
482
  try:
483
- async with _semaphore(app):
484
  completion = await _client(app).chat.completions.create(
485
  model=ROX_CODER_MODEL,
486
  messages=messages,
@@ -517,7 +528,7 @@ async def turbo45(req: ChatRequest):
517
  )
518
 
519
  try:
520
- async with _semaphore(app):
521
  completion = await _client(app).chat.completions.create(
522
  model=ROX_TURBO_45_MODEL,
523
  messages=messages,
@@ -554,7 +565,7 @@ async def ultra(req: ChatRequest):
554
  )
555
 
556
  try:
557
- async with _semaphore(app):
558
  completion = await _client(app).chat.completions.create(
559
  model=ROX_ULTRA_MODEL,
560
  messages=messages,
@@ -591,7 +602,7 @@ async def dyno(req: ChatRequest):
591
  )
592
 
593
  try:
594
- async with _semaphore(app):
595
  completion = await _client(app).chat.completions.create(
596
  model=ROX_DYNO_MODEL,
597
  messages=messages,
@@ -633,7 +644,7 @@ async def coder7(req: ChatRequest):
633
  )
634
 
635
  try:
636
- async with _semaphore(app):
637
  completion = await _client(app).chat.completions.create(
638
  model=ROX_CODER_7_MODEL,
639
  messages=messages,
@@ -668,7 +679,7 @@ async def vision(req: ChatRequest):
668
  )
669
 
670
  try:
671
- async with _semaphore(app):
672
  completion = await _client(app).chat.completions.create(
673
  model=ROX_VISION_MODEL,
674
  messages=messages,
@@ -696,7 +707,7 @@ async def hf_generate(req: HFRequest):
696
  temperature = _effective_temperature(params.temperature)
697
  top_p = _effective_top_p(params.top_p)
698
  max_tokens = _effective_max_tokens(params.max_new_tokens, 8192)
699
- async with _semaphore(app):
700
  completion = await _client(app).chat.completions.create(
701
  model=ROX_CORE_MODEL,
702
  messages=messages,
 
5
  import uuid
6
  import asyncio
7
  from typing import List, Optional, AsyncGenerator, Iterable
8
+ from contextlib import asynccontextmanager, nullcontext
9
 
10
  from dotenv import load_dotenv
11
  from fastapi import FastAPI, HTTPException, Request, Response
 
44
 
45
  CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
46
  GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
47
+ # Optional safety checks (can be disabled by setting 0)
48
+ MAX_REQUEST_BYTES = int(os.getenv("MAX_REQUEST_BYTES", "0")) # 0 = disabled
49
 
50
  # Fast-by-default generation settings (still fully overridable per request)
51
  DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
 
53
  DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "1024"))
54
 
55
  # Concurrency guard to keep tail latency low under spikes
56
+ MAX_INFLIGHT_REQUESTS = int(os.getenv("MAX_INFLIGHT_REQUESTS", "0")) # 0 = disabled
57
 
58
  # "Thinking" increases latency; keep opt-in via env
59
  ENABLE_THINKING = os.getenv("ENABLE_THINKING", "0").strip().lower() in {"1", "true", "yes", "on"}
60
 
61
+ # Optional hard cap for max_tokens (0 = disabled). Note: upstream providers may still enforce their own limits.
62
+ MAX_TOKENS_HARD_CAP = int(os.getenv("MAX_TOKENS_HARD_CAP", "0"))
63
+
64
  # System prompt size directly impacts latency/cost. Make it configurable.
65
  # Modes:
66
  # - full: current long identity prompts (default, backward compatible)
 
142
  max_retries=max_retries,
143
  http_client=http_client,
144
  )
145
+ if MAX_INFLIGHT_REQUESTS > 0:
146
+ app.state.inflight_semaphore = asyncio.Semaphore(MAX_INFLIGHT_REQUESTS)
147
+ else:
148
+ app.state.inflight_semaphore = None
149
 
150
  try:
151
  yield
 
180
  request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
181
  start = time.perf_counter()
182
  try:
183
+ # Optional body-size protection (disabled by default)
184
+ if MAX_REQUEST_BYTES > 0:
185
+ cl = request.headers.get("content-length")
186
+ if cl is not None:
187
+ try:
188
+ if int(cl) > MAX_REQUEST_BYTES:
189
+ return JSONResponse(status_code=413, content={"error": "Request too large"})
190
+ except ValueError:
191
+ return JSONResponse(status_code=400, content={"error": "Invalid Content-Length"})
192
 
193
  response: Response = await call_next(request)
194
  finally:
 
218
  raise RuntimeError("Client not initialized")
219
  return c
220
 
221
+ def _inflight_context(app_: FastAPI):
222
  s = getattr(app_.state, "inflight_semaphore", None)
223
  if s is None:
224
+ return nullcontext()
225
  return s
226
 
227
  def _effective_temperature(value: Optional[float]) -> float:
 
230
  def _effective_top_p(value: Optional[float]) -> float:
231
  return DEFAULT_TOP_P if value is None else value
232
 
233
+ def _effective_max_tokens(value: Optional[int], model_cap: int) -> int:
234
  v = DEFAULT_MAX_TOKENS if value is None else value
235
  if v < 1:
236
  v = DEFAULT_MAX_TOKENS
237
+ if MAX_TOKENS_HARD_CAP > 0:
238
+ return min(v, model_cap, MAX_TOKENS_HARD_CAP)
239
+ # No hard cap from this API layer; upstream may still enforce its own maximum.
240
+ return v
241
 
242
  def _sse_headers() -> dict:
243
  # Helps proxies (nginx) avoid buffering and keeps SSE responsive
 
260
  ) -> AsyncGenerator[str, None]:
261
  """Stream responses from OpenAI API"""
262
  try:
263
+ async with _inflight_context(app_):
264
  stream = await _client(app_).chat.completions.create(
265
  model=model,
266
  messages=messages,
 
416
  )
417
 
418
  try:
419
+ async with _inflight_context(app):
420
  completion = await _client(app).chat.completions.create(
421
  model=ROX_CORE_MODEL,
422
  messages=messages,
 
450
  )
451
 
452
  try:
453
+ async with _inflight_context(app):
454
  completion = await _client(app).chat.completions.create(
455
  model=ROX_TURBO_MODEL,
456
  messages=messages,
 
491
  )
492
 
493
  try:
494
+ async with _inflight_context(app):
495
  completion = await _client(app).chat.completions.create(
496
  model=ROX_CODER_MODEL,
497
  messages=messages,
 
528
  )
529
 
530
  try:
531
+ async with _inflight_context(app):
532
  completion = await _client(app).chat.completions.create(
533
  model=ROX_TURBO_45_MODEL,
534
  messages=messages,
 
565
  )
566
 
567
  try:
568
+ async with _inflight_context(app):
569
  completion = await _client(app).chat.completions.create(
570
  model=ROX_ULTRA_MODEL,
571
  messages=messages,
 
602
  )
603
 
604
  try:
605
+ async with _inflight_context(app):
606
  completion = await _client(app).chat.completions.create(
607
  model=ROX_DYNO_MODEL,
608
  messages=messages,
 
644
  )
645
 
646
  try:
647
+ async with _inflight_context(app):
648
  completion = await _client(app).chat.completions.create(
649
  model=ROX_CODER_7_MODEL,
650
  messages=messages,
 
679
  )
680
 
681
  try:
682
+ async with _inflight_context(app):
683
  completion = await _client(app).chat.completions.create(
684
  model=ROX_VISION_MODEL,
685
  messages=messages,
 
707
  temperature = _effective_temperature(params.temperature)
708
  top_p = _effective_top_p(params.top_p)
709
  max_tokens = _effective_max_tokens(params.max_new_tokens, 8192)
710
+ async with _inflight_context(app):
711
  completion = await _client(app).chat.completions.create(
712
  model=ROX_CORE_MODEL,
713
  messages=messages,