Antaram committed on
Commit
06d16fa
·
verified ·
1 Parent(s): 787b02e

Upload 3 files

Browse files
Files changed (1) hide show
  1. app.py +66 -66
app.py CHANGED
@@ -368,72 +368,72 @@ async def fast_chat(prompt: str = "", max_tokens: int = 512):
368
 
369
  return {"response": ''.join(response_parts)}
370
 
371
- # ===== Mini-server load tracking & coordination endpoints =====
372
-
373
- # How many concurrent requests this mini should handle
374
- MAX_CONCURRENT_REQUESTS = int(os.environ.get("MAX_CONCURRENT_REQUESTS", "1"))
375
-
376
- # In-memory tracking per process
377
- current_requests = 0
378
-
379
- # For identification / debugging
380
- MINI_SERVER_ID = os.environ.get("MINI_SERVER_ID", "mini-1")
381
-
382
-
383
- class MiniStatus(BaseModel):
384
- server_id: str
385
- max_concurrent: int
386
- current_requests: int
387
- status: str
388
-
389
-
390
- @app.get("/status")
391
- async def mini_status():
392
- """
393
- Used by the main server to know if this mini is idle/busy.
394
- """
395
- status = "busy" if current_requests >= MAX_CONCURRENT_REQUESTS else "idle"
396
- return MiniStatus(
397
- server_id=MINI_SERVER_ID,
398
- max_concurrent=MAX_CONCURRENT_REQUESTS,
399
- current_requests=current_requests,
400
- status=status,
401
- )
402
-
403
-
404
- @app.post("/reserve")
405
- async def reserve_slot():
406
- """
407
- Called by the main server BEFORE it forwards a chat request.
408
- If this mini is full, returns 429 so main server can try another mini.
409
- """
410
- global current_requests
411
- if current_requests >= MAX_CONCURRENT_REQUESTS:
412
- raise HTTPException(status_code=429, detail="Mini server busy")
413
- current_requests += 1
414
- return {
415
- "server_id": MINI_SERVER_ID,
416
- "current_requests": current_requests,
417
- "max_concurrent": MAX_CONCURRENT_REQUESTS,
418
- }
419
-
420
-
421
- @app.post("/release")
422
- async def release_slot():
423
- """
424
- Called by the main server after request is finished (stream closed/response sent).
425
- """
426
- global current_requests
427
- if current_requests > 0:
428
- current_requests -= 1
429
- return {
430
- "server_id": MINI_SERVER_ID,
431
- "current_requests": current_requests,
432
- "max_concurrent": MAX_CONCURRENT_REQUESTS,
433
- }
434
-
435
-
436
- if __name__ == "__main__":
437
  uvicorn.run(
438
  app,
439
  host="0.0.0.0",
 
368
 
369
  return {"response": ''.join(response_parts)}
370
 
371
+ # ===== Mini-server load tracking & coordination endpoints =====
372
+
373
+ # How many concurrent requests this mini should handle
374
+ MAX_CONCURRENT_REQUESTS = int(os.environ.get("MAX_CONCURRENT_REQUESTS", "1"))
375
+
376
+ # In-memory tracking per process
377
+ current_requests = 0
378
+
379
+ # For identification / debugging
380
+ MINI_SERVER_ID = os.environ.get("MINI_SERVER_ID", "mini-1")
381
+
382
+
383
+ class MiniStatus(BaseModel):
384
+ server_id: str
385
+ max_concurrent: int
386
+ current_requests: int
387
+ status: str
388
+
389
+
390
+ @app.get("/status")
391
+ async def mini_status():
392
+ """
393
+ Used by the main server to know if this mini is idle/busy.
394
+ """
395
+ status = "busy" if current_requests >= MAX_CONCURRENT_REQUESTS else "idle"
396
+ return MiniStatus(
397
+ server_id=MINI_SERVER_ID,
398
+ max_concurrent=MAX_CONCURRENT_REQUESTS,
399
+ current_requests=current_requests,
400
+ status=status,
401
+ )
402
+
403
+
404
+ @app.post("/reserve")
405
+ async def reserve_slot():
406
+ """
407
+ Called by the main server BEFORE it forwards a chat request.
408
+ If this mini is full, returns 429 so main server can try another mini.
409
+ """
410
+ global current_requests
411
+ if current_requests >= MAX_CONCURRENT_REQUESTS:
412
+ raise HTTPException(status_code=429, detail="Mini server busy")
413
+ current_requests += 1
414
+ return {
415
+ "server_id": MINI_SERVER_ID,
416
+ "current_requests": current_requests,
417
+ "max_concurrent": MAX_CONCURRENT_REQUESTS,
418
+ }
419
+
420
+
421
+ @app.post("/release")
422
+ async def release_slot():
423
+ """
424
+ Called by the main server after request is finished (stream closed/response sent).
425
+ """
426
+ global current_requests
427
+ if current_requests > 0:
428
+ current_requests -= 1
429
+ return {
430
+ "server_id": MINI_SERVER_ID,
431
+ "current_requests": current_requests,
432
+ "max_concurrent": MAX_CONCURRENT_REQUESTS,
433
+ }
434
+
435
+
436
+ if __name__ == "__main__":
437
  uvicorn.run(
438
  app,
439
  host="0.0.0.0",