Inframat-x committed on
Commit
c3b8ad2
·
verified ·
1 Parent(s): a269863

Update rag_eval_metrics.py

Browse files
Files changed (1) hide show
  1. rag_eval_metrics.py +178 -141
rag_eval_metrics.py CHANGED
@@ -3,30 +3,6 @@
3
  rag_eval_metrics.py
4
 
5
  Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
6
-
7
- Inputs (CLI):
8
- --gold_csv Path to gold CSV.
9
- --logs_jsonl Path to app JSONL logs (rag_logs.jsonl).
10
- --k Cutoff for metrics (default: 8).
11
- --out_dir Output directory for metrics files (default: rag_artifacts).
12
-
13
- Outputs (written into out_dir):
14
- - metrics_per_question.csv
15
- - metrics_aggregate.json
16
-
17
- Gold CSV accepted schemas (case-insensitive headers):
18
- Minimal (doc-level):
19
- question, doc
20
- (multiple rows per question allowed)
21
- With page info (page-level optional):
22
- question, doc, page
23
- List-in-a-cell also supported:
24
- question, relevant_docs # semicolon/comma separated; page matching disabled in this column
25
-
26
- Notes:
27
- - Matching is case-insensitive on question and doc filename.
28
- - Page-level metrics only computed when GOLD includes a concrete page for that question.
29
- - Logs are produced by app.py and contain 'retrieval'->'hits' with 'doc' and 'page'.
30
  """
31
 
32
  import argparse
@@ -40,11 +16,47 @@ import pandas as pd
40
  import numpy as np
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # ----------------------------- IO Helpers ----------------------------- #
44
 
45
  def read_logs(jsonl_path: Path) -> pd.DataFrame:
46
- """Read JSONL logs and return a DataFrame with columns: question, hits(list[dict])."""
47
  rows = []
 
 
 
48
  with open(jsonl_path, "r", encoding="utf-8") as f:
49
  for line in f:
50
  line = line.strip()
@@ -54,46 +66,45 @@ def read_logs(jsonl_path: Path) -> pd.DataFrame:
54
  rec = json.loads(line)
55
  except Exception:
56
  continue
 
 
57
  q = (((rec.get("inputs") or {}).get("question")) or "").strip()
 
 
58
  retr = (rec.get("retrieval") or {})
59
  hits = retr.get("hits", [])
60
- # Normalize fields we need
61
  norm_hits = []
62
  for h in hits or []:
63
  doc = (h.get("doc") or "").strip()
64
  page = str(h.get("page") or "").strip()
 
 
65
  try:
66
- # Try int page if it looks numeric
67
  page_int = int(page)
68
  except Exception:
69
  page_int = None
 
70
  norm_hits.append({"doc": doc, "page": page_int})
 
71
  rows.append({"question": q, "hits": norm_hits})
 
72
  df = pd.DataFrame(rows)
73
  if df.empty:
74
  return pd.DataFrame(columns=["question", "hits"])
75
- # Keep last occurrence per question (latest run), but also allow multiple – we aggregate by question
76
- # For stability, group and keep the last non-empty hit list.
77
- def _pick_last_non_empty(hit_lists: List[List[dict]]) -> List[dict]:
78
- for lst in reversed(hit_lists):
79
- if lst:
80
- return lst
81
- return []
82
  df = (
83
- df.groupby(df["question"].str.casefold().str.strip(), as_index=False)
84
  .agg({"question": "last", "hits": _pick_last_non_empty})
85
  )
86
  return df
87
 
88
 
89
  def read_gold(csv_path: Path) -> pd.DataFrame:
90
- """Read gold CSV, normalize columns, and return rows with:
91
- question(cf), question_raw, doc (lowercased filename), page (optional, int or NaN).
92
- """
93
  df = pd.read_csv(csv_path)
94
- # Normalize headers
95
  cols = {c.lower().strip(): c for c in df.columns}
96
- # Find question column
 
97
  q_col = None
98
  for cand in ["question", "query", "q"]:
99
  if cand in cols:
@@ -102,19 +113,21 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
102
  if q_col is None:
103
  raise ValueError("Gold CSV must contain a 'question' column (case-insensitive).")
104
 
105
- # Accept either (doc[, page]) rows or a 'relevant_docs' list column
106
  rel_list_col = None
107
  for cand in ["relevant_docs", "relevant", "docs"]:
108
  if cand in cols:
109
  rel_list_col = cols[cand]
110
  break
111
 
 
112
  doc_col = None
113
  for cand in ["doc", "document", "file", "doc_name"]:
114
  if cand in cols:
115
  doc_col = cols[cand]
116
  break
117
 
 
118
  page_col = None
119
  for cand in ["page", "page_num", "page_number"]:
120
  if cand in cols:
@@ -122,65 +135,73 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
122
  break
123
 
124
  rows = []
 
 
125
  if rel_list_col and doc_col is None:
126
- # Each row may contain a list of docs (comma/semicolon separated)
127
  for _, r in df.iterrows():
128
  q_raw = str(r[q_col]).strip()
129
  q_norm = q_raw.casefold().strip()
 
130
  rel_val = str(r[rel_list_col]) if pd.notna(r[rel_list_col]) else ""
131
  if not rel_val:
132
- # still create an empty row (no gold docs)
133
- rows.append({"question_raw": q_raw, "question": q_norm, "doc": None, "page": np.nan})
 
 
 
 
134
  continue
135
- # split by semicolon or comma
136
  parts = [p.strip() for p in re_split_sc(rel_val)]
137
- # one row per doc (page-level off for list column)
138
  for d in parts:
139
- rows.append({"question_raw": q_raw, "question": q_norm, "doc": filename_key(d), "page": np.nan})
 
 
 
 
 
 
 
140
  elif doc_col:
141
- # Standard long form: one doc (+/- page) per row
142
  for _, r in df.iterrows():
143
  q_raw = str(r[q_col]).strip()
144
  q_norm = q_raw.casefold().strip()
 
145
  d = str(r[doc_col]).strip() if pd.notna(r[doc_col]) else ""
146
- p = r[page_col] if page_col and pd.notna(r[page_col]) else np.nan
 
147
  try:
148
  p = int(p)
149
  except Exception:
150
  p = np.nan
151
- rows.append({"question_raw": q_raw, "question": q_norm, "doc": filename_key(d), "page": p})
 
 
 
 
 
 
 
152
  else:
153
  raise ValueError("Gold CSV must contain either a 'doc' column or a 'relevant_docs' column.")
154
 
155
  gold = pd.DataFrame(rows)
156
- # drop fully empty doc rows (when no gold docs listed)
 
157
  gold["has_doc"] = gold["doc"].apply(lambda x: isinstance(x, str) and len(x) > 0)
158
  if gold["has_doc"].any():
159
  gold = gold[gold["has_doc"]].copy()
160
  gold.drop(columns=["has_doc"], inplace=True, errors="ignore")
161
- # Deduplicate
162
- gold = gold.drop_duplicates(subset=["question", "doc", "page"])
163
- return gold
164
-
165
-
166
- def filename_key(s: str) -> str:
167
- """Normalize document name to just the basename, lowercased."""
168
- s = (s or "").strip()
169
- s = s.replace("\\", "/")
170
- s = s.split("/")[-1]
171
- return s.casefold()
172
 
 
 
173
 
174
- def re_split_sc(s: str) -> List[str]:
175
- """Split on semicolons or commas."""
176
- import re
177
- return re.split(r"[;,]", s)
178
 
179
 
180
  # ----------------------------- Metric Core ----------------------------- #
181
 
182
  def dcg_at_k(relevances: List[int]) -> float:
183
- """Binary DCG with log2 discounts; ranks are 1-indexed in denominator."""
184
  dcg = 0.0
185
  for i, rel in enumerate(relevances, start=1):
186
  if rel > 0:
@@ -197,30 +218,21 @@ def ndcg_at_k(relevances: List[int]) -> float:
197
  return float(dcg / idcg)
198
 
199
 
200
- def compute_metrics_for_question(
201
- gold_docs: List[str],
202
- gold_pages: List[Optional[int]],
203
- hits: List[Dict[str, Any]],
204
- k: int
205
- ) -> Dict[str, Any]:
206
- """
207
- Returns per-question metrics at cutoff k for:
208
- - doc-level: match on doc only
209
- - page-level: match on (doc,page) where page is provided in GOLD
210
- """
211
  top = hits[:k] if hits else []
212
  pred_docs = [filename_key(h.get("doc", "")) for h in top]
213
  pred_pairs = [(filename_key(h.get("doc", "")), h.get("page", None)) for h in top]
214
 
215
- # --- DOC-LEVEL ---
216
  gold_doc_set = set([d for d in gold_docs if isinstance(d, str) and d])
 
217
  rel_bin_doc = [1 if d in gold_doc_set else 0 for d in pred_docs]
218
  hitk_doc = 1 if any(rel_bin_doc) else 0
219
  prec_doc = (sum(rel_bin_doc) / max(1, len(pred_docs))) if pred_docs else 0.0
220
  rec_doc = (sum(rel_bin_doc) / max(1, len(gold_doc_set))) if gold_doc_set else 0.0
221
  ndcg_doc = ndcg_at_k(rel_bin_doc)
222
 
223
- # --- PAGE-LEVEL (only if at least one GOLD page specified) ---
224
  gold_pairs = set()
225
  for d, p in zip(gold_docs, gold_pages):
226
  if isinstance(d, str) and d and (p is not None) and (not (isinstance(p, float) and np.isnan(p))):
@@ -231,8 +243,13 @@ def compute_metrics_for_question(
231
  gold_pairs.add((d, p_int))
232
 
233
  if gold_pairs:
234
- rel_bin_page = [1 if ((d, (p if p is not None else -1)) in gold_pairs) else 0
235
- for (d, p) in [(d, (p if isinstance(p, int) else -1)) for (d, p) in pred_pairs]]
 
 
 
 
 
236
  hitk_page = 1 if any(rel_bin_page) else 0
237
  prec_page = (sum(rel_bin_page) / max(1, len(pred_pairs))) if pred_pairs else 0.0
238
  rec_page = (sum(rel_bin_page) / max(1, len(gold_pairs))) if gold_pairs else 0.0
@@ -257,6 +274,20 @@ def compute_metrics_for_question(
257
 
258
  # ----------------------------- Orchestration ----------------------------- #
259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  def main():
261
  ap = argparse.ArgumentParser()
262
  ap.add_argument("--gold_csv", required=True, type=str)
@@ -272,28 +303,34 @@ def main():
272
  logs_path = Path(args.logs_jsonl)
273
 
274
  if not gold_path.exists():
275
- print(f"❌ gold.csv not found at {gold_path}", file=sys.stderr)
276
  sys.exit(0)
277
  if not logs_path.exists() or logs_path.stat().st_size == 0:
278
- print(f"❌ logs JSONL not found or empty at {logs_path}", file=sys.stderr)
279
  sys.exit(0)
280
 
281
- # Load data
282
  try:
283
  gold = read_gold(gold_path)
284
  except Exception as e:
285
- print(f"❌ Failed to read gold: {e}", file=sys.stderr)
 
 
 
 
 
 
 
286
  sys.exit(0)
287
- logs = read_logs(logs_path)
288
 
289
  if gold.empty:
290
- print("❌ Gold file contains no usable rows.", file=sys.stderr)
291
  sys.exit(0)
292
  if logs.empty:
293
- print("❌ Logs file contains no usable entries.", file=sys.stderr)
294
  sys.exit(0)
295
 
296
- # Build gold dict: question -> list of (doc, page)
297
  gdict: Dict[str, List[Tuple[str, Optional[int]]]] = {}
298
  for _, r in gold.iterrows():
299
  q = str(r["question"]).strip()
@@ -301,26 +338,35 @@ def main():
301
  p = r["page"] if "page" in r else np.nan
302
  gdict.setdefault(q, []).append((d, p))
303
 
304
- # Align on questions (casefolded)
305
  logs["q_norm"] = logs["question"].astype(str).str.casefold().str.strip()
 
306
  perq_rows = []
307
  not_in_logs, not_in_gold = [], []
308
 
 
309
  for q_norm, pairs in gdict.items():
310
- # Pairs is list of (doc, page)
311
- q_gold_variants = [q_norm] # already normalized
312
- # Find logs row with same normalized question
313
  row = logs[logs["q_norm"] == q_norm]
 
 
 
314
  if row.empty:
 
315
  not_in_logs.append(q_norm)
316
- # Still record a row with zeros/NaNs
317
- gdocs = [d for (d, _) in pairs]
318
- gpages = [p for (_, p) in pairs]
319
  metrics = {
320
- "hit@k_doc": 0, "precision@k_doc": 0.0, "recall@k_doc": 0.0, "ndcg@k_doc": 0.0,
321
- "hit@k_page": np.nan, "precision@k_page": np.nan, "recall@k_page": np.nan, "ndcg@k_page": np.nan,
 
 
 
 
 
 
322
  "n_gold_docs": int(len(set([d for d in gdocs if isinstance(d, str) and d]))),
323
- "n_gold_doc_pages": int(len([(d, p) for (d, p) in zip(gdocs, gpages) if isinstance(d, str) and d and pd.notna(p)])),
 
 
 
324
  "n_pred": 0
325
  }
326
  perq_rows.append({
@@ -330,91 +376,82 @@ def main():
330
  })
331
  continue
332
 
333
- # Use the last row (grouping ensured one row per question)
334
  hits = row.iloc[0]["hits"] or []
335
- # Prepare gold lists for metric function
336
- gdocs = [d for (d, _) in pairs]
337
- gpages = [p for (_, p) in pairs]
338
  metrics = compute_metrics_for_question(gdocs, gpages, hits, args.k)
339
-
340
  perq_rows.append({
341
  "question": q_norm,
342
  "covered_in_logs": 1,
343
  **metrics
344
  })
345
 
346
- # Detect questions present in logs but not in gold (for reporting)
347
  gold_qs = set(gdict.keys())
348
  for qn in logs["q_norm"].tolist():
349
  if qn not in gold_qs:
350
  not_in_gold.append(qn)
351
 
352
  perq = pd.DataFrame(perq_rows)
353
-
354
- # Aggregates over questions that are covered_in_logs == 1
355
  covered = perq[perq["covered_in_logs"] == 1].copy()
 
356
  agg = {
357
  "questions_total_gold": int(len(gdict)),
358
  "questions_covered_in_logs": int(covered.shape[0]),
359
  "questions_missing_in_logs": int(len(not_in_logs)),
360
  "questions_in_logs_not_in_gold": int(len(set(not_in_gold))),
361
  "k": int(args.k),
362
- # DOC-level
363
  "mean_hit@k_doc": float(covered["hit@k_doc"].mean()) if not covered.empty else 0.0,
364
  "mean_precision@k_doc": float(covered["precision@k_doc"].mean()) if not covered.empty else 0.0,
365
  "mean_recall@k_doc": float(covered["recall@k_doc"].mean()) if not covered.empty else 0.0,
366
  "mean_ndcg@k_doc": float(covered["ndcg@k_doc"].mean()) if not covered.empty else 0.0,
367
- # PAGE-level (skip NaNs)
368
  "mean_hit@k_page": float(covered["hit@k_page"].dropna().mean()) if covered["hit@k_page"].notna().any() else None,
369
  "mean_precision@k_page": float(covered["precision@k_page"].dropna().mean()) if covered["precision@k_page"].notna().any() else None,
370
  "mean_recall@k_page": float(covered["recall@k_page"].dropna().mean()) if covered["recall@k_page"].notna().any() else None,
371
  "mean_ndcg@k_page": float(covered["ndcg@k_page"].dropna().mean()) if covered["ndcg@k_page"].notna().any() else None,
372
- # Distribution hints
373
  "avg_gold_docs_per_q": float(perq["n_gold_docs"].mean()) if not perq.empty else 0.0,
374
  "avg_preds_per_q": float(perq["n_pred"].mean()) if not perq.empty else 0.0,
375
- # Listings (truncated for readability)
376
  "examples_missing_in_logs": list(not_in_logs[:10]),
377
  "examples_in_logs_not_in_gold": list(dict.fromkeys(not_in_gold))[:10],
378
  }
379
 
380
- # Write outputs
381
  perq_path = out_dir / "metrics_per_question.csv"
382
  agg_path = out_dir / "metrics_aggregate.json"
 
383
  perq.to_csv(perq_path, index=False)
384
  with open(agg_path, "w", encoding="utf-8") as f:
385
  json.dump(agg, f, ensure_ascii=False, indent=2)
386
 
387
- # Console summary (stdout) for app display
388
- print("RAG Evaluation Summary")
389
- print("----------------------")
390
- print(f"Gold questions: {agg['questions_total_gold']}")
391
- print(f"Covered in logs: {agg['questions_covered_in_logs']}")
392
- print(f"Missing in logs: {agg['questions_missing_in_logs']}")
393
- print(f"In logs but not in gold: {agg['questions_in_logs_not_in_gold']}")
394
- print(f"k = {agg['k']}")
395
- print()
396
- print(f"Doc-level: Hit@k={_fmt(agg['mean_hit@k_doc'])} "
397
- f"Precision@k={_fmt(agg['mean_precision@k_doc'])} "
398
- f"Recall@k={_fmt(agg['mean_recall@k_doc'])} "
399
- f"nDCG@k={_fmt(agg['mean_ndcg@k_doc'])}")
400
- if agg["mean_hit@k_page"] is not None:
401
- print(f"Page-level: Hit@k={_fmt(agg['mean_hit@k_page'])} "
402
- f"Precision@k={_fmt(agg['mean_precision@k_page'])} "
403
- f"Recall@k={_fmt(agg['mean_recall@k_page'])} "
404
- f"nDCG@k={_fmt(agg['mean_ndcg@k_page'])}")
 
 
 
 
 
 
 
405
  else:
406
- print("Page-level: (no page labels in gold)")
407
 
408
  print()
409
- print(f"Wrote per-question CSV → {perq_path}")
410
- print(f"Wrote aggregate JSON → {agg_path}")
411
-
412
-
413
- def _fmt(x: Any) -> str:
414
- try:
415
- return f"{float(x):.3f}"
416
- except Exception:
417
- return "-"
418
 
419
 
420
  if __name__ == "__main__":
 
3
  rag_eval_metrics.py
4
 
5
  Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  """
7
 
8
  import argparse
 
16
  import numpy as np
17
 
18
 
19
+ # ----------------------------- Small Utils ----------------------------- #
20
+
21
+ def filename_key(s: str) -> str:
22
+ s = (s or "").strip().replace("\\", "/").split("/")[-1]
23
+ return s.casefold()
24
+
25
+
26
+ def re_split_sc(s: str) -> List[str]:
27
+ import re
28
+ return re.split(r"[;,]", s)
29
+
30
+
31
+ def _pick_last_non_empty(hit_lists) -> List[dict]:
32
+ """
33
+ Robustly select the last non-empty hits list from a pandas Series or iterable.
34
+
35
+ This fixes the KeyError that happens when using reversed() directly on a Series
36
+ with a non-range index.
37
+ """
38
+ # Convert pandas Series or other iterables to a plain Python list
39
+ try:
40
+ values = list(hit_lists.tolist())
41
+ except AttributeError:
42
+ values = list(hit_lists)
43
+
44
+ # Walk from last to first, return first non-empty list-like
45
+ for lst in reversed(values):
46
+ if isinstance(lst, (list, tuple)) and len(lst) > 0:
47
+ return lst
48
+
49
+ # If everything was empty / NaN
50
+ return []
51
+
52
+
53
  # ----------------------------- IO Helpers ----------------------------- #
54
 
55
  def read_logs(jsonl_path: Path) -> pd.DataFrame:
 
56
  rows = []
57
+ if (not jsonl_path.exists()) or jsonl_path.stat().st_size == 0:
58
+ return pd.DataFrame(columns=["question", "hits"])
59
+
60
  with open(jsonl_path, "r", encoding="utf-8") as f:
61
  for line in f:
62
  line = line.strip()
 
66
  rec = json.loads(line)
67
  except Exception:
68
  continue
69
+
70
+ # Extract question
71
  q = (((rec.get("inputs") or {}).get("question")) or "").strip()
72
+
73
+ # Extract retrieval hits (if present)
74
  retr = (rec.get("retrieval") or {})
75
  hits = retr.get("hits", [])
 
76
  norm_hits = []
77
  for h in hits or []:
78
  doc = (h.get("doc") or "").strip()
79
  page = str(h.get("page") or "").strip()
80
+
81
+ # Normalize page to int or None
82
  try:
 
83
  page_int = int(page)
84
  except Exception:
85
  page_int = None
86
+
87
  norm_hits.append({"doc": doc, "page": page_int})
88
+
89
  rows.append({"question": q, "hits": norm_hits})
90
+
91
  df = pd.DataFrame(rows)
92
  if df.empty:
93
  return pd.DataFrame(columns=["question", "hits"])
94
+
95
+ # Group by normalized question text and keep last non-empty hits list per question
 
 
 
 
 
96
  df = (
97
+ df.groupby(df["question"].astype(str).str.casefold().str.strip(), as_index=False)
98
  .agg({"question": "last", "hits": _pick_last_non_empty})
99
  )
100
  return df
101
 
102
 
103
  def read_gold(csv_path: Path) -> pd.DataFrame:
 
 
 
104
  df = pd.read_csv(csv_path)
 
105
  cols = {c.lower().strip(): c for c in df.columns}
106
+
107
+ # --- question column ---
108
  q_col = None
109
  for cand in ["question", "query", "q"]:
110
  if cand in cols:
 
113
  if q_col is None:
114
  raise ValueError("Gold CSV must contain a 'question' column (case-insensitive).")
115
 
116
+ # --- possible relevant_docs (list-in-cell) column ---
117
  rel_list_col = None
118
  for cand in ["relevant_docs", "relevant", "docs"]:
119
  if cand in cols:
120
  rel_list_col = cols[cand]
121
  break
122
 
123
+ # --- single-doc-per-row column ---
124
  doc_col = None
125
  for cand in ["doc", "document", "file", "doc_name"]:
126
  if cand in cols:
127
  doc_col = cols[cand]
128
  break
129
 
130
+ # --- optional page column ---
131
  page_col = None
132
  for cand in ["page", "page_num", "page_number"]:
133
  if cand in cols:
 
135
  break
136
 
137
  rows = []
138
+
139
+ # Case 1: relevant_docs list column (no explicit doc_col)
140
  if rel_list_col and doc_col is None:
 
141
  for _, r in df.iterrows():
142
  q_raw = str(r[q_col]).strip()
143
  q_norm = q_raw.casefold().strip()
144
+
145
  rel_val = str(r[rel_list_col]) if pd.notna(r[rel_list_col]) else ""
146
  if not rel_val:
147
+ rows.append({
148
+ "question_raw": q_raw,
149
+ "question": q_norm,
150
+ "doc": None,
151
+ "page": np.nan
152
+ })
153
  continue
154
+
155
  parts = [p.strip() for p in re_split_sc(rel_val)]
 
156
  for d in parts:
157
+ rows.append({
158
+ "question_raw": q_raw,
159
+ "question": q_norm,
160
+ "doc": filename_key(d),
161
+ "page": np.nan
162
+ })
163
+
164
+ # Case 2: doc/page columns (one relevant doc per row)
165
  elif doc_col:
 
166
  for _, r in df.iterrows():
167
  q_raw = str(r[q_col]).strip()
168
  q_norm = q_raw.casefold().strip()
169
+
170
  d = str(r[doc_col]).strip() if pd.notna(r[doc_col]) else ""
171
+ p = r[page_col] if (page_col and pd.notna(r[page_col])) else np.nan
172
+
173
  try:
174
  p = int(p)
175
  except Exception:
176
  p = np.nan
177
+
178
+ rows.append({
179
+ "question_raw": q_raw,
180
+ "question": q_norm,
181
+ "doc": filename_key(d),
182
+ "page": p
183
+ })
184
+
185
  else:
186
  raise ValueError("Gold CSV must contain either a 'doc' column or a 'relevant_docs' column.")
187
 
188
  gold = pd.DataFrame(rows)
189
+
190
+ # Keep only rows with a valid doc (when docs exist)
191
  gold["has_doc"] = gold["doc"].apply(lambda x: isinstance(x, str) and len(x) > 0)
192
  if gold["has_doc"].any():
193
  gold = gold[gold["has_doc"]].copy()
194
  gold.drop(columns=["has_doc"], inplace=True, errors="ignore")
 
 
 
 
 
 
 
 
 
 
 
195
 
196
+ # Remove duplicates
197
+ gold = gold.drop_duplicates(subset=["question", "doc", "page"])
198
 
199
+ return gold
 
 
 
200
 
201
 
202
  # ----------------------------- Metric Core ----------------------------- #
203
 
204
  def dcg_at_k(relevances: List[int]) -> float:
 
205
  dcg = 0.0
206
  for i, rel in enumerate(relevances, start=1):
207
  if rel > 0:
 
218
  return float(dcg / idcg)
219
 
220
 
221
+ def compute_metrics_for_question(gold_docs, gold_pages, hits, k):
 
 
 
 
 
 
 
 
 
 
222
  top = hits[:k] if hits else []
223
  pred_docs = [filename_key(h.get("doc", "")) for h in top]
224
  pred_pairs = [(filename_key(h.get("doc", "")), h.get("page", None)) for h in top]
225
 
226
+ # --- Doc-level metrics ---
227
  gold_doc_set = set([d for d in gold_docs if isinstance(d, str) and d])
228
+
229
  rel_bin_doc = [1 if d in gold_doc_set else 0 for d in pred_docs]
230
  hitk_doc = 1 if any(rel_bin_doc) else 0
231
  prec_doc = (sum(rel_bin_doc) / max(1, len(pred_docs))) if pred_docs else 0.0
232
  rec_doc = (sum(rel_bin_doc) / max(1, len(gold_doc_set))) if gold_doc_set else 0.0
233
  ndcg_doc = ndcg_at_k(rel_bin_doc)
234
 
235
+ # --- Page-level metrics (only if gold has page labels) ---
236
  gold_pairs = set()
237
  for d, p in zip(gold_docs, gold_pages):
238
  if isinstance(d, str) and d and (p is not None) and (not (isinstance(p, float) and np.isnan(p))):
 
243
  gold_pairs.add((d, p_int))
244
 
245
  if gold_pairs:
246
+ rel_bin_page = []
247
+ for (d, p) in pred_pairs:
248
+ if p is None or not isinstance(p, int):
249
+ rel_bin_page.append(0)
250
+ else:
251
+ rel_bin_page.append(1 if (d, p) in gold_pairs else 0)
252
+
253
  hitk_page = 1 if any(rel_bin_page) else 0
254
  prec_page = (sum(rel_bin_page) / max(1, len(pred_pairs))) if pred_pairs else 0.0
255
  rec_page = (sum(rel_bin_page) / max(1, len(gold_pairs))) if gold_pairs else 0.0
 
274
 
275
  # ----------------------------- Orchestration ----------------------------- #
276
 
277
+ # === Dark blue and accent colors ===
278
+ COLOR_TITLE = "\033[94m" # light blue for titles
279
+ COLOR_TEXT = "\033[34m" # dark blue
280
+ COLOR_ACCENT = "\033[36m" # cyan for metrics
281
+ COLOR_RESET = "\033[0m"
282
+
283
+
284
+ def _fmt(x: Any) -> str:
285
+ try:
286
+ return f"{float(x):.3f}"
287
+ except Exception:
288
+ return "-"
289
+
290
+
291
  def main():
292
  ap = argparse.ArgumentParser()
293
  ap.add_argument("--gold_csv", required=True, type=str)
 
303
  logs_path = Path(args.logs_jsonl)
304
 
305
  if not gold_path.exists():
306
+ print(f"{COLOR_TEXT}❌ gold.csv not found at {gold_path}{COLOR_RESET}", file=sys.stderr)
307
  sys.exit(0)
308
  if not logs_path.exists() or logs_path.stat().st_size == 0:
309
+ print(f"{COLOR_TEXT}❌ logs JSONL not found or empty at {logs_path}{COLOR_RESET}", file=sys.stderr)
310
  sys.exit(0)
311
 
312
+ # Read gold
313
  try:
314
  gold = read_gold(gold_path)
315
  except Exception as e:
316
+ print(f"{COLOR_TEXT}❌ Failed to read gold: {e}{COLOR_RESET}", file=sys.stderr)
317
+ sys.exit(0)
318
+
319
+ # Read logs (with robust aggregation)
320
+ try:
321
+ logs = read_logs(logs_path)
322
+ except Exception as e:
323
+ print(f"{COLOR_TEXT}❌ Failed to read logs: {e}{COLOR_RESET}", file=sys.stderr)
324
  sys.exit(0)
 
325
 
326
  if gold.empty:
327
+ print(f"{COLOR_TEXT}❌ Gold file contains no usable rows.{COLOR_RESET}", file=sys.stderr)
328
  sys.exit(0)
329
  if logs.empty:
330
+ print(f"{COLOR_TEXT}❌ Logs file contains no usable entries.{COLOR_RESET}", file=sys.stderr)
331
  sys.exit(0)
332
 
333
+ # Build gold dict: normalized_question -> list of (doc, page)
334
  gdict: Dict[str, List[Tuple[str, Optional[int]]]] = {}
335
  for _, r in gold.iterrows():
336
  q = str(r["question"]).strip()
 
338
  p = r["page"] if "page" in r else np.nan
339
  gdict.setdefault(q, []).append((d, p))
340
 
341
+ # Normalize log questions for join
342
  logs["q_norm"] = logs["question"].astype(str).str.casefold().str.strip()
343
+
344
  perq_rows = []
345
  not_in_logs, not_in_gold = [], []
346
 
347
+ # For each gold question, compute metrics using logs
348
  for q_norm, pairs in gdict.items():
 
 
 
349
  row = logs[logs["q_norm"] == q_norm]
350
+ gdocs = [d for (d, _) in pairs]
351
+ gpages = [p for (_, p) in pairs]
352
+
353
  if row.empty:
354
+ # No logs for this gold question → zero retrieval
355
  not_in_logs.append(q_norm)
 
 
 
356
  metrics = {
357
+ "hit@k_doc": 0,
358
+ "precision@k_doc": 0.0,
359
+ "recall@k_doc": 0.0,
360
+ "ndcg@k_doc": 0.0,
361
+ "hit@k_page": np.nan,
362
+ "precision@k_page": np.nan,
363
+ "recall@k_page": np.nan,
364
+ "ndcg@k_page": np.nan,
365
  "n_gold_docs": int(len(set([d for d in gdocs if isinstance(d, str) and d]))),
366
+ "n_gold_doc_pages": int(len([
367
+ (d, p) for (d, p) in zip(gdocs, gpages)
368
+ if isinstance(d, str) and d and pd.notna(p)
369
+ ])),
370
  "n_pred": 0
371
  }
372
  perq_rows.append({
 
376
  })
377
  continue
378
 
379
+ # Use aggregated hits from read_logs
380
  hits = row.iloc[0]["hits"] or []
 
 
 
381
  metrics = compute_metrics_for_question(gdocs, gpages, hits, args.k)
 
382
  perq_rows.append({
383
  "question": q_norm,
384
  "covered_in_logs": 1,
385
  **metrics
386
  })
387
 
388
+ # Any log questions not in gold
389
  gold_qs = set(gdict.keys())
390
  for qn in logs["q_norm"].tolist():
391
  if qn not in gold_qs:
392
  not_in_gold.append(qn)
393
 
394
  perq = pd.DataFrame(perq_rows)
 
 
395
  covered = perq[perq["covered_in_logs"] == 1].copy()
396
+
397
  agg = {
398
  "questions_total_gold": int(len(gdict)),
399
  "questions_covered_in_logs": int(covered.shape[0]),
400
  "questions_missing_in_logs": int(len(not_in_logs)),
401
  "questions_in_logs_not_in_gold": int(len(set(not_in_gold))),
402
  "k": int(args.k),
 
403
  "mean_hit@k_doc": float(covered["hit@k_doc"].mean()) if not covered.empty else 0.0,
404
  "mean_precision@k_doc": float(covered["precision@k_doc"].mean()) if not covered.empty else 0.0,
405
  "mean_recall@k_doc": float(covered["recall@k_doc"].mean()) if not covered.empty else 0.0,
406
  "mean_ndcg@k_doc": float(covered["ndcg@k_doc"].mean()) if not covered.empty else 0.0,
 
407
  "mean_hit@k_page": float(covered["hit@k_page"].dropna().mean()) if covered["hit@k_page"].notna().any() else None,
408
  "mean_precision@k_page": float(covered["precision@k_page"].dropna().mean()) if covered["precision@k_page"].notna().any() else None,
409
  "mean_recall@k_page": float(covered["recall@k_page"].dropna().mean()) if covered["recall@k_page"].notna().any() else None,
410
  "mean_ndcg@k_page": float(covered["ndcg@k_page"].dropna().mean()) if covered["ndcg@k_page"].notna().any() else None,
 
411
  "avg_gold_docs_per_q": float(perq["n_gold_docs"].mean()) if not perq.empty else 0.0,
412
  "avg_preds_per_q": float(perq["n_pred"].mean()) if not perq.empty else 0.0,
 
413
  "examples_missing_in_logs": list(not_in_logs[:10]),
414
  "examples_in_logs_not_in_gold": list(dict.fromkeys(not_in_gold))[:10],
415
  }
416
 
 
417
  perq_path = out_dir / "metrics_per_question.csv"
418
  agg_path = out_dir / "metrics_aggregate.json"
419
+
420
  perq.to_csv(perq_path, index=False)
421
  with open(agg_path, "w", encoding="utf-8") as f:
422
  json.dump(agg, f, ensure_ascii=False, indent=2)
423
 
424
+ # === Console summary with color ===
425
+ print(f"{COLOR_TITLE}RAG Evaluation Summary{COLOR_RESET}")
426
+ print(f"{COLOR_TITLE}----------------------{COLOR_RESET}")
427
+ print(f"{COLOR_TEXT}Gold questions: {COLOR_ACCENT}{agg['questions_total_gold']}{COLOR_RESET}")
428
+ print(f"{COLOR_TEXT}Covered in logs: {COLOR_ACCENT}{agg['questions_covered_in_logs']}{COLOR_RESET}")
429
+ print(f"{COLOR_TEXT}Missing in logs: {COLOR_ACCENT}{agg['questions_missing_in_logs']}{COLOR_RESET}")
430
+ print(f"{COLOR_TEXT}In logs but not in gold: {COLOR_ACCENT}{agg['questions_in_logs_not_in_gold']}{COLOR_RESET}")
431
+ print(f"{COLOR_TEXT}k = {COLOR_ACCENT}{agg['k']}{COLOR_RESET}\n")
432
+
433
+ print(
434
+ f"{COLOR_TEXT}Doc-level:{COLOR_RESET} "
435
+ f"{COLOR_ACCENT}Hit@k={_fmt(agg['mean_hit@k_doc'])} "
436
+ f"Precision@k={_fmt(agg['mean_precision@k_doc'])} "
437
+ f"Recall@k={_fmt(agg['mean_recall@k_doc'])} "
438
+ f"nDCG@k={_fmt(agg['mean_ndcg@k_doc'])}{COLOR_RESET}"
439
+ )
440
+
441
+ if agg['mean_hit@k_page'] is not None:
442
+ print(
443
+ f"{COLOR_TEXT}Page-level:{COLOR_RESET} "
444
+ f"{COLOR_ACCENT}Hit@k={_fmt(agg['mean_hit@k_page'])} "
445
+ f"Precision@k={_fmt(agg['mean_precision@k_page'])} "
446
+ f"Recall@k={_fmt(agg['mean_recall@k_page'])} "
447
+ f"nDCG@k={_fmt(agg['mean_ndcg@k_page'])}{COLOR_RESET}"
448
+ )
449
  else:
450
+ print(f"{COLOR_TEXT}Page-level: (no page labels in gold){COLOR_RESET}")
451
 
452
  print()
453
+ print(f"{COLOR_TEXT}Wrote per-question CSV → {COLOR_ACCENT}{perq_path}{COLOR_RESET}")
454
+ print(f"{COLOR_TEXT}Wrote aggregate JSON → {COLOR_ACCENT}{agg_path}{COLOR_RESET}")
 
 
 
 
 
 
 
455
 
456
 
457
  if __name__ == "__main__":