Spaces:

msradam
/

riprap

Sleeping

App Files Files Community

msradam committed on 5 days ago

Commit

daf3545

verified ·

1 Parent(s): 52a5649

debug: improve vllm-direct endpoint to test context overflow

Browse files

Files changed (1) hide show

web/main.py +69 -41

web/main.py CHANGED Viewed

@@ -368,50 +368,78 @@ def api_debug_vllm_direct():
     if not vllm_base:
         return JSONResponse({"error": "RIPRAP_LLM_BASE_URL not set"}, status_code=400)
-    # Minimal reconciler-style payload: 2 documents + system + user.
-    payload = {
-        "model": os.environ.get("RIPRAP_LLM_VLLM_8B_NAME", "granite4.1:3b"),
-        "messages": [
-            {"role": "system", "content": "You are a flood risk analyst."},
-            {"role": "user", "content": "Write the cited paragraph now."},
-        ],
-        "max_tokens": 64,
-        "temperature": 0,
-        "stream": False,
-        "chat_template_kwargs": {
-            "documents": [
-                {"doc_id": "noaa_tides", "text": "Current tide at Battery Park: 4.14 ft MLLW."},
-                {"doc_id": "microtopo", "text": "Elevation 1.37 m, 80th pct 200m radius."},
-            ]
         },
-        "documents": [
-            {"doc_id": "noaa_tides", "text": "Current tide at Battery Park: 4.14 ft MLLW."},
-            {"doc_id": "microtopo", "text": "Elevation 1.37 m, 80th pct 200m radius."},
-        ],
     }
-    try:
-        with httpx.Client(timeout=30.0) as c:
-            r = c.post(
-                f"{vllm_base}/chat/completions",
-                headers={"Authorization": f"Bearer {vllm_key}",
-                         "Content-Type": "application/json"},
-                json=payload,
-            )
         try:
-            body = r.json()
-        except Exception:
-            body = r.text[:500]
-        return JSONResponse({
-            "status": r.status_code,
-            "body": body,
-            "model_used": payload["model"],
-            "vllm_base": vllm_base,
-        })
-    except Exception as e:
-        return JSONResponse({
-            "error": str(e),
-            "tb": traceback.format_exc().splitlines()[-5:],
-        }, status_code=500)
 @app.get("/api/backend")

     if not vllm_base:
         return JSONResponse({"error": "RIPRAP_LLM_BASE_URL not set"}, status_code=400)
+    model_name = os.environ.get("RIPRAP_LLM_VLLM_8B_NAME", "granite4.1:3b")
+    # Two payloads: minimal (sanity check) and full-load (context overflow test).
+    # Generate a realistic 14-doc payload that approximates what the reconciler sends.
+    _FILLER_DOC = (
+        "Source: NYC OEM Sandy 2012 inundation. "
+        "This location is within the Sandy 2012 inundation zone, "
+        "which experienced flood depths of 1–4 ft. "
+        "FEMA Flood Zone AE. BFE 12 ft NAVD88."
+    )
+    full_docs = [
+        {"doc_id": f"doc_{i}", "text": f"[doc_{i}] " + _FILLER_DOC}
+        for i in range(14)
+    ]
+    _LONG_SYSTEM = (
+        "Write a flood-exposure briefing for an NYC address. "
+        "Use ONLY the facts in the provided documents. "
+        "Every sentence that contains a number MUST include a citation tag. "
+        "Output the four sections: Status, History, Forecast, and Risk. "
+        "Valid document IDs: " + ", ".join(f"doc_{i}" for i in range(14)) + "."
+    ) * 3  # ~500 tokens
+    payloads = {
+        "minimal": {
+            "model": model_name,
+            "messages": [
+                {"role": "system", "content": "You are a flood risk analyst."},
+                {"role": "user", "content": "Write the cited paragraph now."},
+            ],
+            "max_tokens": 64,
+            "temperature": 0,
+            "stream": False,
+            "chat_template_kwargs": {"documents": [
+                {"doc_id": "noaa_tides", "text": "Tide: 4.14 ft MLLW."},
+                {"doc_id": "microtopo", "text": "Elevation: 1.37 m."},
+            ]},
+        },
+        "full_load": {
+            "model": model_name,
+            "messages": [
+                {"role": "system", "content": _LONG_SYSTEM},
+                {"role": "user", "content": "Write the cited paragraph now."},
+            ],
+            "max_tokens": 512,
+            "temperature": 0,
+            "stream": False,
+            "chat_template_kwargs": {"documents": full_docs},
         },
     }
+    results = {}
+    for name, payload in payloads.items():
         try:
+            with httpx.Client(timeout=30.0) as c:
+                r = c.post(
+                    f"{vllm_base}/chat/completions",
+                    headers={"Authorization": f"Bearer {vllm_key}",
+                             "Content-Type": "application/json"},
+                    json=payload,
+                )
+            try:
+                body = r.json()
+            except Exception:
+                body = r.text[:300]
+            results[name] = {"status": r.status_code, "body_snippet": str(body)[:400]}
+        except Exception as e:
+            results[name] = {"error": str(e)}
+    return JSONResponse({
+        "model": model_name,
+        "vllm_base": vllm_base,
+        "results": results,
+    })
 @app.get("/api/backend")