msradam committed on
Commit
daf3545
·
verified ·
1 Parent(s): 52a5649

debug: improve vllm-direct endpoint to test context overflow

Browse files
Files changed (1) hide show
  1. web/main.py +69 -41
web/main.py CHANGED
@@ -368,50 +368,78 @@ def api_debug_vllm_direct():
368
  if not vllm_base:
369
  return JSONResponse({"error": "RIPRAP_LLM_BASE_URL not set"}, status_code=400)
370
 
371
- # Minimal reconciler-style payload: 2 documents + system + user.
372
- payload = {
373
- "model": os.environ.get("RIPRAP_LLM_VLLM_8B_NAME", "granite4.1:3b"),
374
- "messages": [
375
- {"role": "system", "content": "You are a flood risk analyst."},
376
- {"role": "user", "content": "Write the cited paragraph now."},
377
- ],
378
- "max_tokens": 64,
379
- "temperature": 0,
380
- "stream": False,
381
- "chat_template_kwargs": {
382
- "documents": [
383
- {"doc_id": "noaa_tides", "text": "Current tide at Battery Park: 4.14 ft MLLW."},
384
- {"doc_id": "microtopo", "text": "Elevation 1.37 m, 80th pct 200m radius."},
385
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  },
387
- "documents": [
388
- {"doc_id": "noaa_tides", "text": "Current tide at Battery Park: 4.14 ft MLLW."},
389
- {"doc_id": "microtopo", "text": "Elevation 1.37 m, 80th pct 200m radius."},
390
- ],
391
  }
392
- try:
393
- with httpx.Client(timeout=30.0) as c:
394
- r = c.post(
395
- f"{vllm_base}/chat/completions",
396
- headers={"Authorization": f"Bearer {vllm_key}",
397
- "Content-Type": "application/json"},
398
- json=payload,
399
- )
400
  try:
401
- body = r.json()
402
- except Exception:
403
- body = r.text[:500]
404
- return JSONResponse({
405
- "status": r.status_code,
406
- "body": body,
407
- "model_used": payload["model"],
408
- "vllm_base": vllm_base,
409
- })
410
- except Exception as e:
411
- return JSONResponse({
412
- "error": str(e),
413
- "tb": traceback.format_exc().splitlines()[-5:],
414
- }, status_code=500)
 
 
 
 
 
 
415
 
416
 
417
  @app.get("/api/backend")
 
368
  if not vllm_base:
369
  return JSONResponse({"error": "RIPRAP_LLM_BASE_URL not set"}, status_code=400)
370
 
371
+ model_name = os.environ.get("RIPRAP_LLM_VLLM_8B_NAME", "granite4.1:3b")
372
+
373
+ # Two payloads: minimal (sanity check) and full-load (context overflow test).
374
+ # Generate a realistic 14-doc payload that approximates what the reconciler sends.
375
+ _FILLER_DOC = (
376
+ "Source: NYC OEM Sandy 2012 inundation. "
377
+ "This location is within the Sandy 2012 inundation zone, "
378
+ "which experienced flood depths of 1–4 ft. "
379
+ "FEMA Flood Zone AE. BFE 12 ft NAVD88."
380
+ )
381
+ full_docs = [
382
+ {"doc_id": f"doc_{i}", "text": f"[doc_{i}] " + _FILLER_DOC}
383
+ for i in range(14)
384
+ ]
385
+ _LONG_SYSTEM = (
386
+ "Write a flood-exposure briefing for an NYC address. "
387
+ "Use ONLY the facts in the provided documents. "
388
+ "Every sentence that contains a number MUST include a citation tag. "
389
+ "Output the four sections: Status, History, Forecast, and Risk. "
390
+ "Valid document IDs: " + ", ".join(f"doc_{i}" for i in range(14)) + "."
391
+ ) * 3 # ~500 tokens
392
+
393
+ payloads = {
394
+ "minimal": {
395
+ "model": model_name,
396
+ "messages": [
397
+ {"role": "system", "content": "You are a flood risk analyst."},
398
+ {"role": "user", "content": "Write the cited paragraph now."},
399
+ ],
400
+ "max_tokens": 64,
401
+ "temperature": 0,
402
+ "stream": False,
403
+ "chat_template_kwargs": {"documents": [
404
+ {"doc_id": "noaa_tides", "text": "Tide: 4.14 ft MLLW."},
405
+ {"doc_id": "microtopo", "text": "Elevation: 1.37 m."},
406
+ ]},
407
+ },
408
+ "full_load": {
409
+ "model": model_name,
410
+ "messages": [
411
+ {"role": "system", "content": _LONG_SYSTEM},
412
+ {"role": "user", "content": "Write the cited paragraph now."},
413
+ ],
414
+ "max_tokens": 512,
415
+ "temperature": 0,
416
+ "stream": False,
417
+ "chat_template_kwargs": {"documents": full_docs},
418
  },
 
 
 
 
419
  }
420
+ results = {}
421
+ for name, payload in payloads.items():
 
 
 
 
 
 
422
  try:
423
+ with httpx.Client(timeout=30.0) as c:
424
+ r = c.post(
425
+ f"{vllm_base}/chat/completions",
426
+ headers={"Authorization": f"Bearer {vllm_key}",
427
+ "Content-Type": "application/json"},
428
+ json=payload,
429
+ )
430
+ try:
431
+ body = r.json()
432
+ except Exception:
433
+ body = r.text[:300]
434
+ results[name] = {"status": r.status_code, "body_snippet": str(body)[:400]}
435
+ except Exception as e:
436
+ results[name] = {"error": str(e)}
437
+
438
+ return JSONResponse({
439
+ "model": model_name,
440
+ "vllm_base": vllm_base,
441
+ "results": results,
442
+ })
443
 
444
 
445
  @app.get("/api/backend")