FoodDesert committed on
Commit
41dd600
·
1 Parent(s): c4f6ab1

Record non-fatal pipeline issues in eval JSONL outputs

Browse files
psq_rag/llm/openrouter_client.py CHANGED
@@ -81,12 +81,42 @@ def openrouter_chat(
81
  json=payload,
82
  )
83
 
84
- data = r.json()
85
- choice0 = data["choices"][0]
86
- content = (choice0["message"].get("content", "") or "").strip()
87
-
88
- finish_reason = choice0.get("finish_reason")
89
- native_finish_reason = choice0.get("native_finish_reason")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  # (optional) expose these as part of error_str for logging
92
  meta = []
 
81
  json=payload,
82
  )
83
 
84
+ try:
85
+ data = r.json()
86
+ except Exception:
87
+ data = None
88
+
89
+ if r.status_code >= 400:
90
+ error_parts = [f"HTTP {r.status_code}"]
91
+ if isinstance(data, dict):
92
+ err_obj = data.get("error")
93
+ if isinstance(err_obj, dict):
94
+ code = err_obj.get("code")
95
+ message = err_obj.get("message")
96
+ if code:
97
+ error_parts.append(f"code={code}")
98
+ if message:
99
+ error_parts.append(str(message))
100
+ if len(error_parts) == 1:
101
+ body = (r.text or "").strip()
102
+ if body:
103
+ error_parts.append(body[:300])
104
+ return None, None, "OpenRouter error: " + " | ".join(error_parts)
105
+
106
+ if not isinstance(data, dict):
107
+ return None, None, f"OpenRouter error: HTTP {r.status_code} | non-JSON response"
108
+
109
+ choices = data.get("choices")
110
+ if not isinstance(choices, list) or not choices:
111
+ return None, None, f"OpenRouter error: HTTP {r.status_code} | missing choices"
112
+
113
+ choice0 = choices[0] if isinstance(choices[0], dict) else {}
114
+ message = choice0.get("message") if isinstance(choice0, dict) else {}
115
+ content = (message.get("content", "") if isinstance(message, dict) else "") or ""
116
+ content = content.strip()
117
+
118
+ finish_reason = choice0.get("finish_reason") if isinstance(choice0, dict) else None
119
+ native_finish_reason = choice0.get("native_finish_reason") if isinstance(choice0, dict) else None
120
 
121
  # (optional) expose these as part of error_str for logging
122
  meta = []
scripts/eval_pipeline.py CHANGED
@@ -168,8 +168,10 @@ class SampleResult:
168
  stage3s_time: float = 0.0
169
  # Categorized suggestions (for ranking metrics)
170
  categorized_suggestions: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict)
171
- # Errors
172
- error: Optional[str] = None
 
 
173
 
174
 
175
  def _compute_metrics(predicted: Set[str], ground_truth: Set[str]) -> Tuple[float, float, float]:
@@ -211,10 +213,14 @@ def _process_one_sample(
211
  from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags
212
  from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
213
 
214
- def log(msg: str) -> None:
215
- if verbose:
216
- with print_lock:
217
- print(f" [{index+1}] {msg}")
 
 
 
 
218
 
219
  sid = sample["id"]
220
  caption = sample["caption"]
@@ -425,10 +431,11 @@ def _process_one_sample(
425
  f"t1={result.stage1_time:.1f}s t2={result.stage2_time:.1f}s t3={result.stage3_time:.1f}s"
426
  )
427
 
428
- except Exception as e:
429
- result.error = str(e)
430
- with print_lock:
431
- print(f" [{index+1}] ERROR: {e}")
 
432
 
433
  return result
434
 
@@ -872,8 +879,10 @@ def main(argv=None) -> int:
872
  "workers": args.workers,
873
  "min_why": args.min_why,
874
  "expand_implications": args.expand_implications,
875
- "infer_structural": args.infer_structural,
876
- "n_errors": sum(1 for r in results if r.error),
 
 
877
  }
878
 
879
  with out_path.open("w", encoding="utf-8") as f:
@@ -927,10 +936,11 @@ def main(argv=None) -> int:
927
  # Timing
928
  "t1": round(r.stage1_time, 2),
929
  "t2": round(r.stage2_time, 2),
930
- "t3": round(r.stage3_time, 2),
931
- "t3s": round(r.stage3s_time, 2),
932
- "err": r.error,
933
- }
 
934
  f.write(json.dumps(row, ensure_ascii=False) + "\n")
935
  print(f"\nCompact results saved to: {out_path}")
936
 
@@ -953,9 +963,10 @@ def main(argv=None) -> int:
953
  "gt_character_tags": sorted(r.gt_character_tags),
954
  "selected_character_tags": sorted(r.selected_character_tags),
955
  "gt_general_tags": sorted(r.gt_general_tags),
956
- "selected_general_tags": sorted(r.selected_general_tags),
957
- "error": r.error,
958
- }
 
959
  f.write(json.dumps(row, ensure_ascii=False) + "\n")
960
  print(f"Detail results saved to: {detail_path}")
961
 
 
168
  stage3s_time: float = 0.0
169
  # Categorized suggestions (for ranking metrics)
170
  categorized_suggestions: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict)
171
+ # Errors
172
+ error: Optional[str] = None
173
+ # Non-fatal issues/warnings captured from pipeline logs (fallbacks, retries, API errors)
174
+ issues: List[str] = field(default_factory=list)
175
 
176
 
177
  def _compute_metrics(predicted: Set[str], ground_truth: Set[str]) -> Tuple[float, float, float]:
 
213
  from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags
214
  from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
215
 
216
+ def log(msg: str) -> None:
217
+ msg_str = str(msg)
218
+ msg_l = msg_str.lower()
219
+ if any(k in msg_l for k in ("error", "fallback", "gave up", "warning", "filtered", "refusal")):
220
+ result.issues.append(msg_str)
221
+ if verbose:
222
+ with print_lock:
223
+ print(f" [{index+1}] {msg_str}")
224
 
225
  sid = sample["id"]
226
  caption = sample["caption"]
 
431
  f"t1={result.stage1_time:.1f}s t2={result.stage2_time:.1f}s t3={result.stage3_time:.1f}s"
432
  )
433
 
434
+ except Exception as e:
435
+ result.error = str(e)
436
+ result.issues.append(f"fatal_exception: {e}")
437
+ with print_lock:
438
+ print(f" [{index+1}] ERROR: {e}")
439
 
440
  return result
441
 
 
879
  "workers": args.workers,
880
  "min_why": args.min_why,
881
  "expand_implications": args.expand_implications,
882
+ "infer_structural": args.infer_structural,
883
+ "n_errors": sum(1 for r in results if r.error),
884
+ "n_issue_samples": sum(1 for r in results if r.issues),
885
+ "n_issues_total": sum(len(r.issues) for r in results),
886
  }
887
 
888
  with out_path.open("w", encoding="utf-8") as f:
 
936
  # Timing
937
  "t1": round(r.stage1_time, 2),
938
  "t2": round(r.stage2_time, 2),
939
+ "t3": round(r.stage3_time, 2),
940
+ "t3s": round(r.stage3s_time, 2),
941
+ "err": r.error,
942
+ "issues": r.issues,
943
+ }
944
  f.write(json.dumps(row, ensure_ascii=False) + "\n")
945
  print(f"\nCompact results saved to: {out_path}")
946
 
 
963
  "gt_character_tags": sorted(r.gt_character_tags),
964
  "selected_character_tags": sorted(r.selected_character_tags),
965
  "gt_general_tags": sorted(r.gt_general_tags),
966
+ "selected_general_tags": sorted(r.selected_general_tags),
967
+ "error": r.error,
968
+ "issues": r.issues,
969
+ }
970
  f.write(json.dumps(row, ensure_ascii=False) + "\n")
971
  print(f"Detail results saved to: {detail_path}")
972