Spaces:
Running
Running
FoodDesert committed on
Commit ·
41dd600
1
Parent(s): c4f6ab1
Record non-fatal pipeline issues in eval JSONL outputs
Browse files- psq_rag/llm/openrouter_client.py +36 -6
- scripts/eval_pipeline.py +30 -19
psq_rag/llm/openrouter_client.py
CHANGED
|
@@ -81,12 +81,42 @@ def openrouter_chat(
|
|
| 81 |
json=payload,
|
| 82 |
)
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
# (optional) expose these as part of error_str for logging
|
| 92 |
meta = []
|
|
|
|
| 81 |
json=payload,
|
| 82 |
)
|
| 83 |
|
| 84 |
+
try:
|
| 85 |
+
data = r.json()
|
| 86 |
+
except Exception:
|
| 87 |
+
data = None
|
| 88 |
+
|
| 89 |
+
if r.status_code >= 400:
|
| 90 |
+
error_parts = [f"HTTP {r.status_code}"]
|
| 91 |
+
if isinstance(data, dict):
|
| 92 |
+
err_obj = data.get("error")
|
| 93 |
+
if isinstance(err_obj, dict):
|
| 94 |
+
code = err_obj.get("code")
|
| 95 |
+
message = err_obj.get("message")
|
| 96 |
+
if code:
|
| 97 |
+
error_parts.append(f"code={code}")
|
| 98 |
+
if message:
|
| 99 |
+
error_parts.append(str(message))
|
| 100 |
+
if len(error_parts) == 1:
|
| 101 |
+
body = (r.text or "").strip()
|
| 102 |
+
if body:
|
| 103 |
+
error_parts.append(body[:300])
|
| 104 |
+
return None, None, "OpenRouter error: " + " | ".join(error_parts)
|
| 105 |
+
|
| 106 |
+
if not isinstance(data, dict):
|
| 107 |
+
return None, None, f"OpenRouter error: HTTP {r.status_code} | non-JSON response"
|
| 108 |
+
|
| 109 |
+
choices = data.get("choices")
|
| 110 |
+
if not isinstance(choices, list) or not choices:
|
| 111 |
+
return None, None, f"OpenRouter error: HTTP {r.status_code} | missing choices"
|
| 112 |
+
|
| 113 |
+
choice0 = choices[0] if isinstance(choices[0], dict) else {}
|
| 114 |
+
message = choice0.get("message") if isinstance(choice0, dict) else {}
|
| 115 |
+
content = (message.get("content", "") if isinstance(message, dict) else "") or ""
|
| 116 |
+
content = content.strip()
|
| 117 |
+
|
| 118 |
+
finish_reason = choice0.get("finish_reason") if isinstance(choice0, dict) else None
|
| 119 |
+
native_finish_reason = choice0.get("native_finish_reason") if isinstance(choice0, dict) else None
|
| 120 |
|
| 121 |
# (optional) expose these as part of error_str for logging
|
| 122 |
meta = []
|
scripts/eval_pipeline.py
CHANGED
|
@@ -168,8 +168,10 @@ class SampleResult:
|
|
| 168 |
stage3s_time: float = 0.0
|
| 169 |
# Categorized suggestions (for ranking metrics)
|
| 170 |
categorized_suggestions: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict)
|
| 171 |
-
# Errors
|
| 172 |
-
error: Optional[str] = None
|
|
|
|
|
|
|
| 173 |
|
| 174 |
|
| 175 |
def _compute_metrics(predicted: Set[str], ground_truth: Set[str]) -> Tuple[float, float, float]:
|
|
@@ -211,10 +213,14 @@ def _process_one_sample(
|
|
| 211 |
from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags
|
| 212 |
from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
|
| 213 |
|
| 214 |
-
def log(msg: str) -> None:
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
sid = sample["id"]
|
| 220 |
caption = sample["caption"]
|
|
@@ -425,10 +431,11 @@ def _process_one_sample(
|
|
| 425 |
f"t1={result.stage1_time:.1f}s t2={result.stage2_time:.1f}s t3={result.stage3_time:.1f}s"
|
| 426 |
)
|
| 427 |
|
| 428 |
-
except Exception as e:
|
| 429 |
-
result.error = str(e)
|
| 430 |
-
|
| 431 |
-
|
|
|
|
| 432 |
|
| 433 |
return result
|
| 434 |
|
|
@@ -872,8 +879,10 @@ def main(argv=None) -> int:
|
|
| 872 |
"workers": args.workers,
|
| 873 |
"min_why": args.min_why,
|
| 874 |
"expand_implications": args.expand_implications,
|
| 875 |
-
"infer_structural": args.infer_structural,
|
| 876 |
-
"n_errors": sum(1 for r in results if r.error),
|
|
|
|
|
|
|
| 877 |
}
|
| 878 |
|
| 879 |
with out_path.open("w", encoding="utf-8") as f:
|
|
@@ -927,10 +936,11 @@ def main(argv=None) -> int:
|
|
| 927 |
# Timing
|
| 928 |
"t1": round(r.stage1_time, 2),
|
| 929 |
"t2": round(r.stage2_time, 2),
|
| 930 |
-
"t3": round(r.stage3_time, 2),
|
| 931 |
-
"t3s": round(r.stage3s_time, 2),
|
| 932 |
-
"err": r.error,
|
| 933 |
-
|
|
|
|
| 934 |
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
| 935 |
print(f"\nCompact results saved to: {out_path}")
|
| 936 |
|
|
@@ -953,9 +963,10 @@ def main(argv=None) -> int:
|
|
| 953 |
"gt_character_tags": sorted(r.gt_character_tags),
|
| 954 |
"selected_character_tags": sorted(r.selected_character_tags),
|
| 955 |
"gt_general_tags": sorted(r.gt_general_tags),
|
| 956 |
-
"selected_general_tags": sorted(r.selected_general_tags),
|
| 957 |
-
"error": r.error,
|
| 958 |
-
|
|
|
|
| 959 |
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
| 960 |
print(f"Detail results saved to: {detail_path}")
|
| 961 |
|
|
|
|
| 168 |
stage3s_time: float = 0.0
|
| 169 |
# Categorized suggestions (for ranking metrics)
|
| 170 |
categorized_suggestions: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict)
|
| 171 |
+
# Errors
|
| 172 |
+
error: Optional[str] = None
|
| 173 |
+
# Non-fatal issues/warnings captured from pipeline logs (fallbacks, retries, API errors)
|
| 174 |
+
issues: List[str] = field(default_factory=list)
|
| 175 |
|
| 176 |
|
| 177 |
def _compute_metrics(predicted: Set[str], ground_truth: Set[str]) -> Tuple[float, float, float]:
|
|
|
|
| 213 |
from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags
|
| 214 |
from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
|
| 215 |
|
| 216 |
+
def log(msg: str) -> None:
|
| 217 |
+
msg_str = str(msg)
|
| 218 |
+
msg_l = msg_str.lower()
|
| 219 |
+
if any(k in msg_l for k in ("error", "fallback", "gave up", "warning", "filtered", "refusal")):
|
| 220 |
+
result.issues.append(msg_str)
|
| 221 |
+
if verbose:
|
| 222 |
+
with print_lock:
|
| 223 |
+
print(f" [{index+1}] {msg_str}")
|
| 224 |
|
| 225 |
sid = sample["id"]
|
| 226 |
caption = sample["caption"]
|
|
|
|
| 431 |
f"t1={result.stage1_time:.1f}s t2={result.stage2_time:.1f}s t3={result.stage3_time:.1f}s"
|
| 432 |
)
|
| 433 |
|
| 434 |
+
except Exception as e:
|
| 435 |
+
result.error = str(e)
|
| 436 |
+
result.issues.append(f"fatal_exception: {e}")
|
| 437 |
+
with print_lock:
|
| 438 |
+
print(f" [{index+1}] ERROR: {e}")
|
| 439 |
|
| 440 |
return result
|
| 441 |
|
|
|
|
| 879 |
"workers": args.workers,
|
| 880 |
"min_why": args.min_why,
|
| 881 |
"expand_implications": args.expand_implications,
|
| 882 |
+
"infer_structural": args.infer_structural,
|
| 883 |
+
"n_errors": sum(1 for r in results if r.error),
|
| 884 |
+
"n_issue_samples": sum(1 for r in results if r.issues),
|
| 885 |
+
"n_issues_total": sum(len(r.issues) for r in results),
|
| 886 |
}
|
| 887 |
|
| 888 |
with out_path.open("w", encoding="utf-8") as f:
|
|
|
|
| 936 |
# Timing
|
| 937 |
"t1": round(r.stage1_time, 2),
|
| 938 |
"t2": round(r.stage2_time, 2),
|
| 939 |
+
"t3": round(r.stage3_time, 2),
|
| 940 |
+
"t3s": round(r.stage3s_time, 2),
|
| 941 |
+
"err": r.error,
|
| 942 |
+
"issues": r.issues,
|
| 943 |
+
}
|
| 944 |
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
| 945 |
print(f"\nCompact results saved to: {out_path}")
|
| 946 |
|
|
|
|
| 963 |
"gt_character_tags": sorted(r.gt_character_tags),
|
| 964 |
"selected_character_tags": sorted(r.selected_character_tags),
|
| 965 |
"gt_general_tags": sorted(r.gt_general_tags),
|
| 966 |
+
"selected_general_tags": sorted(r.selected_general_tags),
|
| 967 |
+
"error": r.error,
|
| 968 |
+
"issues": r.issues,
|
| 969 |
+
}
|
| 970 |
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
| 971 |
print(f"Detail results saved to: {detail_path}")
|
| 972 |
|