msradam committed on
Commit
c92bd29
·
verified ·
1 Parent(s): b604d92

fix(mellea): restore vLLM probe with auth header + non-5xx check

Browse files
Files changed (1) hide show
  1. app/mellea_validator.py +32 -0
app/mellea_validator.py CHANGED
@@ -382,6 +382,38 @@ def reconcile_strict_streaming(
382
  _first_token_timeout = int(os.environ.get("RIPRAP_FIRST_TOKEN_TIMEOUT_S", "400"))
383
  _inter_token_timeout = int(os.environ.get("RIPRAP_TOKEN_TIMEOUT_S", "45"))
384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
  for attempt_idx in range(loop_budget):
386
  attempts = attempt_idx + 1
387
  # On reroll, append a tight feedback message naming what failed AND
 
382
  _first_token_timeout = int(os.environ.get("RIPRAP_FIRST_TOKEN_TIMEOUT_S", "400"))
383
  _inter_token_timeout = int(os.environ.get("RIPRAP_TOKEN_TIMEOUT_S", "45"))
384
 
385
+ # When PRIMARY=vllm, RunPod cold-starts take ~250-360s (container boot +
386
+ # model load). The pod starts when the warmup request hits the proxy, but
387
+ # the proxy returns 503 immediately. LiteLLM would fall back to Ollama and
388
+ # fail before RunPod is ready. Poll /v1/models here (after specialists, but
389
+ # before generation) and wait — keepalives keep the SSE connection alive.
390
+ _vllm_base = os.environ.get("RIPRAP_LLM_BASE_URL", "").rstrip("/")
391
+ if os.environ.get("RIPRAP_LLM_PRIMARY", "ollama") == "vllm" and _vllm_base:
392
+ try:
393
+ import httpx as _httpx
394
+ _probe_url = f"{_vllm_base}/models"
395
+ _probe_key = os.environ.get("RIPRAP_LLM_API_KEY", "") or "EMPTY"
396
+ _probe_headers = {"Authorization": f"Bearer {_probe_key}"}
397
+ _probe_deadline = t0 + _first_token_timeout
398
+ log.info("mellea: polling vLLM readiness at %s", _probe_url)
399
+ while time.time() < _probe_deadline:
400
+ try:
401
+ _r = _httpx.get(_probe_url, headers=_probe_headers, timeout=5.0)
402
+ # Any non-503/502/504 means the service is UP (200 = ready,
403
+ # 401 = auth-gated but alive, 404 = wrong path but alive).
404
+ if _r.status_code not in (502, 503, 504):
405
+ log.info("mellea: vLLM ready (status=%d, %.1fs elapsed)",
406
+ _r.status_code, time.time() - t0)
407
+ break
408
+ except Exception as _pe:
409
+ log.debug("mellea: vLLM probe: %r", _pe)
410
+ time.sleep(10)
411
+ else:
412
+ log.warning("mellea: vLLM not ready after %.1fs, proceeding anyway",
413
+ time.time() - t0)
414
+ except ImportError:
415
+ log.warning("mellea: httpx not available, skipping vLLM probe")
416
+
417
  for attempt_idx in range(loop_budget):
418
  attempts = attempt_idx + 1
419
  # On reroll, append a tight feedback message naming what failed AND