Spaces:
Running
Running
FoodDesert committed on
Commit ·
41dd600
1
Parent(s): c4f6ab1
Record non-fatal pipeline issues in eval JSONL outputs
Browse files- psq_rag/llm/openrouter_client.py +36 -6
- scripts/eval_pipeline.py +30 -19
psq_rag/llm/openrouter_client.py
CHANGED
|
@@ -81,12 +81,42 @@ def openrouter_chat(
|
|
| 81 |
json=payload,
|
| 82 |
)
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
# (optional) expose these as part of error_str for logging
|
| 92 |
meta = []
|
|
|
|
| 81 |
json=payload,
|
| 82 |
)
|
| 83 |
|
| 84 |
+
try:
|
| 85 |
+
data = r.json()
|
| 86 |
+
except Exception:
|
| 87 |
+
data = None
|
| 88 |
+
|
| 89 |
+
if r.status_code >= 400:
|
| 90 |
+
error_parts = [f"HTTP {r.status_code}"]
|
| 91 |
+
if isinstance(data, dict):
|
| 92 |
+
err_obj = data.get("error")
|
| 93 |
+
if isinstance(err_obj, dict):
|
| 94 |
+
code = err_obj.get("code")
|
| 95 |
+
message = err_obj.get("message")
|
| 96 |
+
if code:
|
| 97 |
+
error_parts.append(f"code={code}")
|
| 98 |
+
if message:
|
| 99 |
+
error_parts.append(str(message))
|
| 100 |
+
if len(error_parts) == 1:
|
| 101 |
+
body = (r.text or "").strip()
|
| 102 |
+
if body:
|
| 103 |
+
error_parts.append(body[:300])
|
| 104 |
+
return None, None, "OpenRouter error: " + " | ".join(error_parts)
|
| 105 |
+
|
| 106 |
+
if not isinstance(data, dict):
|
| 107 |
+
return None, None, f"OpenRouter error: HTTP {r.status_code} | non-JSON response"
|
| 108 |
+
|
| 109 |
+
choices = data.get("choices")
|
| 110 |
+
if not isinstance(choices, list) or not choices:
|
| 111 |
+
return None, None, f"OpenRouter error: HTTP {r.status_code} | missing choices"
|
| 112 |
+
|
| 113 |
+
choice0 = choices[0] if isinstance(choices[0], dict) else {}
|
| 114 |
+
message = choice0.get("message") if isinstance(choice0, dict) else {}
|
| 115 |
+
content = (message.get("content", "") if isinstance(message, dict) else "") or ""
|
| 116 |
+
content = content.strip()
|
| 117 |
+
|
| 118 |
+
finish_reason = choice0.get("finish_reason") if isinstance(choice0, dict) else None
|
| 119 |
+
native_finish_reason = choice0.get("native_finish_reason") if isinstance(choice0, dict) else None
|
| 120 |
|
| 121 |
# (optional) expose these as part of error_str for logging
|
| 122 |
meta = []
|
scripts/eval_pipeline.py
CHANGED
|
@@ -168,8 +168,10 @@ class SampleResult:
|
|
| 168 |
stage3s_time: float = 0.0
|
| 169 |
# Categorized suggestions (for ranking metrics)
|
| 170 |
categorized_suggestions: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict)
|
| 171 |
-
# Errors
|
| 172 |
-
error: Optional[str] = None
|
|
|
|
|
|
|
| 173 |
|
| 174 |
|
| 175 |
def _compute_metrics(predicted: Set[str], ground_truth: Set[str]) -> Tuple[float, float, float]:
|
|
@@ -211,10 +213,14 @@ def _process_one_sample(
|
|
| 211 |
from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags
|
| 212 |
from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
|
| 213 |
|
| 214 |
-
def log(msg: str) -> None:
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
sid = sample["id"]
|
| 220 |
caption = sample["caption"]
|
|
@@ -425,10 +431,11 @@ def _process_one_sample(
|
|
| 425 |
f"t1={result.stage1_time:.1f}s t2={result.stage2_time:.1f}s t3={result.stage3_time:.1f}s"
|
| 426 |
)
|
| 427 |
|
| 428 |
-
except Exception as e:
|
| 429 |
-
result.error = str(e)
|
| 430 |
-
|
| 431 |
-
|
|
|
|
| 432 |
|
| 433 |
return result
|
| 434 |
|
|
@@ -872,8 +879,10 @@ def main(argv=None) -> int:
|
|
| 872 |
"workers": args.workers,
|
| 873 |
"min_why": args.min_why,
|
| 874 |
"expand_implications": args.expand_implications,
|
| 875 |
-
"infer_structural": args.infer_structural,
|
| 876 |
-
"n_errors": sum(1 for r in results if r.error),
|
|
|
|
|
|
|
| 877 |
}
|
| 878 |
|
| 879 |
with out_path.open("w", encoding="utf-8") as f:
|
|
@@ -927,10 +936,11 @@ def main(argv=None) -> int:
|
|
| 927 |
# Timing
|
| 928 |
"t1": round(r.stage1_time, 2),
|
| 929 |
"t2": round(r.stage2_time, 2),
|
| 930 |
-
"t3": round(r.stage3_time, 2),
|
| 931 |
-
"t3s": round(r.stage3s_time, 2),
|
| 932 |
-
"err": r.error,
|
| 933 |
-
|
|
|
|
| 934 |
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
| 935 |
print(f"\nCompact results saved to: {out_path}")
|
| 936 |
|
|
@@ -953,9 +963,10 @@ def main(argv=None) -> int:
|
|
| 953 |
"gt_character_tags": sorted(r.gt_character_tags),
|
| 954 |
"selected_character_tags": sorted(r.selected_character_tags),
|
| 955 |
"gt_general_tags": sorted(r.gt_general_tags),
|
| 956 |
-
"selected_general_tags": sorted(r.selected_general_tags),
|
| 957 |
-
"error": r.error,
|
| 958 |
-
|
|
|
|
| 959 |
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
| 960 |
print(f"Detail results saved to: {detail_path}")
|
| 961 |
|
|
|
|
| 168 |
stage3s_time: float = 0.0
|
| 169 |
# Categorized suggestions (for ranking metrics)
|
| 170 |
categorized_suggestions: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict)
|
| 171 |
+
# Errors
|
| 172 |
+
error: Optional[str] = None
|
| 173 |
+
# Non-fatal issues/warnings captured from pipeline logs (fallbacks, retries, API errors)
|
| 174 |
+
issues: List[str] = field(default_factory=list)
|
| 175 |
|
| 176 |
|
| 177 |
def _compute_metrics(predicted: Set[str], ground_truth: Set[str]) -> Tuple[float, float, float]:
|
|
|
|
| 213 |
from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags
|
| 214 |
from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
|
| 215 |
|
| 216 |
+
def log(msg: str) -> None:
|
| 217 |
+
msg_str = str(msg)
|
| 218 |
+
msg_l = msg_str.lower()
|
| 219 |
+
if any(k in msg_l for k in ("error", "fallback", "gave up", "warning", "filtered", "refusal")):
|
| 220 |
+
result.issues.append(msg_str)
|
| 221 |
+
if verbose:
|
| 222 |
+
with print_lock:
|
| 223 |
+
print(f" [{index+1}] {msg_str}")
|
| 224 |
|
| 225 |
sid = sample["id"]
|
| 226 |
caption = sample["caption"]
|
|
|
|
| 431 |
f"t1={result.stage1_time:.1f}s t2={result.stage2_time:.1f}s t3={result.stage3_time:.1f}s"
|
| 432 |
)
|
| 433 |
|
| 434 |
+
except Exception as e:
|
| 435 |
+
result.error = str(e)
|
| 436 |
+
result.issues.append(f"fatal_exception: {e}")
|
| 437 |
+
with print_lock:
|
| 438 |
+
print(f" [{index+1}] ERROR: {e}")
|
| 439 |
|
| 440 |
return result
|
| 441 |
|
|
|
|
| 879 |
"workers": args.workers,
|
| 880 |
"min_why": args.min_why,
|
| 881 |
"expand_implications": args.expand_implications,
|
| 882 |
+
"infer_structural": args.infer_structural,
|
| 883 |
+
"n_errors": sum(1 for r in results if r.error),
|
| 884 |
+
"n_issue_samples": sum(1 for r in results if r.issues),
|
| 885 |
+
"n_issues_total": sum(len(r.issues) for r in results),
|
| 886 |
}
|
| 887 |
|
| 888 |
with out_path.open("w", encoding="utf-8") as f:
|
|
|
|
| 936 |
# Timing
|
| 937 |
"t1": round(r.stage1_time, 2),
|
| 938 |
"t2": round(r.stage2_time, 2),
|
| 939 |
+
"t3": round(r.stage3_time, 2),
|
| 940 |
+
"t3s": round(r.stage3s_time, 2),
|
| 941 |
+
"err": r.error,
|
| 942 |
+
"issues": r.issues,
|
| 943 |
+
}
|
| 944 |
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
| 945 |
print(f"\nCompact results saved to: {out_path}")
|
| 946 |
|
|
|
|
| 963 |
"gt_character_tags": sorted(r.gt_character_tags),
|
| 964 |
"selected_character_tags": sorted(r.selected_character_tags),
|
| 965 |
"gt_general_tags": sorted(r.gt_general_tags),
|
| 966 |
+
"selected_general_tags": sorted(r.selected_general_tags),
|
| 967 |
+
"error": r.error,
|
| 968 |
+
"issues": r.issues,
|
| 969 |
+
}
|
| 970 |
f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
| 971 |
print(f"Detail results saved to: {detail_path}")
|
| 972 |
|