Spaces:

Inframat-x
/

ML-Chatbot

Sleeping

App Files Community

Inframat-x committed on Nov 26, 2025

Commit

103dce8

verified ·

1 Parent(s): 1562548

Update rag_eval_metrics.py

Browse files

Files changed (1) hide show

rag_eval_metrics.py +189 -29

rag_eval_metrics.py CHANGED Viewed

@@ -3,6 +3,10 @@
 rag_eval_metrics.py
 Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
 """
 import argparse
@@ -15,19 +19,16 @@ from typing import Dict, List, Tuple, Any, Optional
 import pandas as pd
 import numpy as np
 # ----------------------------- Small Utils ----------------------------- #
 def filename_key(s: str) -> str:
     s = (s or "").strip().replace("\\", "/").split("/")[-1]
     return s.casefold()
 def re_split_sc(s: str) -> List[str]:
     import re
     return re.split(r"[;,]", s)
 def _pick_last_non_empty(hit_lists) -> List[dict]:
     """
     Robustly select the last non-empty hits list from a pandas Series or iterable.
@@ -49,13 +50,20 @@ def _pick_last_non_empty(hit_lists) -> List[dict]:
     # If everything was empty / NaN
     return []
 # ----------------------------- IO Helpers ----------------------------- #
 def read_logs(jsonl_path: Path) -> pd.DataFrame:
     rows = []
     if (not jsonl_path.exists()) or jsonl_path.stat().st_size == 0:
-        return pd.DataFrame(columns=["question", "hits"])
     with open(jsonl_path, "r", encoding="utf-8") as f:
         for line in f:
@@ -86,21 +94,36 @@ def read_logs(jsonl_path: Path) -> pd.DataFrame:
                 norm_hits.append({"doc": doc, "page": page_int})
-            rows.append({"question": q, "hits": norm_hits})
     df = pd.DataFrame(rows)
     if df.empty:
-        return pd.DataFrame(columns=["question", "hits"])
-    # Group by normalized question text and keep last non-empty hits list per question
     df = (
         df.groupby(df["question"].astype(str).str.casefold().str.strip(), as_index=False)
-          .agg({"question": "last", "hits": _pick_last_non_empty})
     )
     return df
-def read_gold(csv_path: Path) -> pd.DataFrame:
     df = pd.read_csv(csv_path)
     cols = {c.lower().strip(): c for c in df.columns}
@@ -134,6 +157,13 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
             page_col = cols[cand]
             break
     rows = []
     # Case 1: relevant_docs list column (no explicit doc_col)
@@ -141,6 +171,7 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
         for _, r in df.iterrows():
             q_raw = str(r[q_col]).strip()
             q_norm = q_raw.casefold().strip()
             rel_val = str(r[rel_list_col]) if pd.notna(r[rel_list_col]) else ""
             if not rel_val:
@@ -148,7 +179,8 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
                     "question_raw": q_raw,
                     "question": q_norm,
                     "doc": None,
-                    "page": np.nan
                 })
                 continue
@@ -158,7 +190,8 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
                     "question_raw": q_raw,
                     "question": q_norm,
                     "doc": filename_key(d),
-                    "page": np.nan
                 })
     # Case 2: doc/page columns (one relevant doc per row)
@@ -166,6 +199,7 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
         for _, r in df.iterrows():
             q_raw = str(r[q_col]).strip()
             q_norm = q_raw.casefold().strip()
             d = str(r[doc_col]).strip() if pd.notna(r[doc_col]) else ""
             p = r[page_col] if (page_col and pd.notna(r[page_col])) else np.nan
@@ -179,7 +213,8 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
                 "question_raw": q_raw,
                 "question": q_norm,
                 "doc": filename_key(d),
-                "page": p
             })
     else:
@@ -196,8 +231,17 @@ def read_gold(csv_path: Path) -> pd.DataFrame:
     # Remove duplicates
     gold = gold.drop_duplicates(subset=["question", "doc", "page"])
-    return gold
 # ----------------------------- Metric Core ----------------------------- #
@@ -208,7 +252,6 @@ def dcg_at_k(relevances: List[int]) -> float:
             dcg += 1.0 / np.log2(i + 1.0)
     return float(dcg)
 def ndcg_at_k(relevances: List[int]) -> float:
     dcg = dcg_at_k(relevances)
     ideal = sorted(relevances, reverse=True)
@@ -217,7 +260,6 @@ def ndcg_at_k(relevances: List[int]) -> float:
         return 0.0
     return float(dcg / idcg)
 def compute_metrics_for_question(gold_docs, gold_pages, hits, k):
     top = hits[:k] if hits else []
     pred_docs = [filename_key(h.get("doc", "")) for h in top]
@@ -271,6 +313,70 @@ def compute_metrics_for_question(gold_docs, gold_pages, hits, k):
         "n_pred": int(len(pred_docs))
     }
 # ----------------------------- Orchestration ----------------------------- #
@@ -280,14 +386,12 @@ COLOR_TEXT = "\033[34m"      # dark blue
 COLOR_ACCENT = "\033[36m"    # cyan for metrics
 COLOR_RESET = "\033[0m"
 def _fmt(x: Any) -> str:
     try:
         return f"{float(x):.3f}"
     except Exception:
         return "-"
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--gold_csv", required=True, type=str)
@@ -309,9 +413,9 @@ def main():
         print(f"{COLOR_TEXT}❌ logs JSONL not found or empty at {logs_path}{COLOR_RESET}", file=sys.stderr)
         sys.exit(0)
-    # Read gold
     try:
-        gold = read_gold(gold_path)
     except Exception as e:
         print(f"{COLOR_TEXT}❌ Failed to read gold: {e}{COLOR_RESET}", file=sys.stderr)
         sys.exit(0)
@@ -333,7 +437,7 @@ def main():
     # Build gold dict: normalized_question -> list of (doc, page)
     gdict: Dict[str, List[Tuple[str, Optional[int]]]] = {}
     for _, r in gold.iterrows():
-        q = str(r["question"]).strip()
         d = r["doc"]
         p = r["page"] if "page" in r else np.nan
         gdict.setdefault(q, []).append((d, p))
@@ -351,9 +455,9 @@ def main():
         gpages = [p for (_, p) in pairs]
         if row.empty:
-            # No logs for this gold question → zero retrieval
             not_in_logs.append(q_norm)
-            metrics = {
                 "hit@k_doc": 0,
                 "precision@k_doc": 0.0,
                 "recall@k_doc": 0.0,
@@ -369,20 +473,49 @@ def main():
                 ])),
                 "n_pred": 0
             }
             perq_rows.append({
                 "question": q_norm,
                 "covered_in_logs": 0,
-                **metrics
             })
             continue
         # Use aggregated hits from read_logs
         hits = row.iloc[0]["hits"] or []
-        metrics = compute_metrics_for_question(gdocs, gpages, hits, args.k)
         perq_rows.append({
             "question": q_norm,
             "covered_in_logs": 1,
-            **metrics
         })
     # Any log questions not in gold
@@ -414,6 +547,15 @@ def main():
         "examples_in_logs_not_in_gold": list(dict.fromkeys(not_in_gold))[:10],
     }
     perq_path = out_dir / "metrics_per_question.csv"
     agg_path = out_dir / "metrics_aggregate.json"
@@ -438,21 +580,39 @@ def main():
         f"nDCG@k={_fmt(agg['mean_ndcg@k_doc'])}{COLOR_RESET}"
     )
-    if agg['mean_hit@k_page'] is not None:
         print(
             f"{COLOR_TEXT}Page-level:{COLOR_RESET} "
             f"{COLOR_ACCENT}Hit@k={_fmt(agg['mean_hit@k_page'])}  "
             f"Precision@k={_fmt(agg['mean_precision@k_page'])}  "
-            f"Recall@k={_fmt(agg['mean_recall@k_page'])}  "
             f"nDCG@k={_fmt(agg['mean_ndcg@k_page'])}{COLOR_RESET}"
         )
     else:
         print(f"{COLOR_TEXT}Page-level: (no page labels in gold){COLOR_RESET}")
     print()
     print(f"{COLOR_TEXT}Wrote per-question CSV → {COLOR_ACCENT}{perq_path}{COLOR_RESET}")
     print(f"{COLOR_TEXT}Wrote aggregate JSON   → {COLOR_ACCENT}{agg_path}{COLOR_RESET}")
 if __name__ == "__main__":
     main()

 rag_eval_metrics.py
 Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
+Now extended to also evaluate answer quality using:
+- Lexical similarity: BLEU, ROUGE-1/2/L
+- Semantic similarity: BERTScore (Recall, F1)
 """
 import argparse
 import pandas as pd
 import numpy as np
 # ----------------------------- Small Utils ----------------------------- #
 def filename_key(s: str) -> str:
     s = (s or "").strip().replace("\\", "/").split("/")[-1]
     return s.casefold()
 def re_split_sc(s: str) -> List[str]:
     import re
     return re.split(r"[;,]", s)
 def _pick_last_non_empty(hit_lists) -> List[dict]:
     """
     Robustly select the last non-empty hits list from a pandas Series or iterable.
     # If everything was empty / NaN
     return []
 # ----------------------------- IO Helpers ----------------------------- #
 def read_logs(jsonl_path: Path) -> pd.DataFrame:
+    """
+    Read RAG JSONL logs and aggregate by question.
+    Returns a DataFrame with columns:
+      - question: original question text (last occurrence)
+      - hits:     list of dicts {doc, page} for retrieval
+      - answer:   final answer text logged for that question
+    """
     rows = []
     if (not jsonl_path.exists()) or jsonl_path.stat().st_size == 0:
+        return pd.DataFrame(columns=["question", "hits", "answer"])
     with open(jsonl_path, "r", encoding="utf-8") as f:
         for line in f:
                 norm_hits.append({"doc": doc, "page": page_int})
+            # Extract final answer text (if present)
+            out = (rec.get("output") or {})
+            ans = ((out.get("final_answer") or "")).strip()
+            rows.append({"question": q, "hits": norm_hits, "answer": ans})
     df = pd.DataFrame(rows)
     if df.empty:
+        return pd.DataFrame(columns=["question", "hits", "answer"])
+    # Group by normalized question text and keep last non-empty hits list and answer per question
     df = (
         df.groupby(df["question"].astype(str).str.casefold().str.strip(), as_index=False)
+          .agg({
+              "question": "last",
+              "hits": _pick_last_non_empty,
+              "answer": "last"
+          })
     )
     return df
+def read_gold(csv_path: Path) -> Tuple[pd.DataFrame, Dict[str, str]]:
+    """
+    Read gold CSV with retrieval labels and optional reference answers.
+    Returns:
+      - gold_df: rows with columns ['question', 'doc', 'page', 'answer', ...]
+                 where 'question' is normalized (casefold+strip)
+      - gold_answers: dict mapping normalized question -> reference answer text
+    """
     df = pd.read_csv(csv_path)
     cols = {c.lower().strip(): c for c in df.columns}
             page_col = cols[cand]
             break
+    # --- optional answer column (for QA metrics) ---
+    ans_col = None
+    for cand in ["answer", "reference_answer", "gold_answer"]:
+        if cand in cols:
+            ans_col = cols[cand]
+            break
     rows = []
     # Case 1: relevant_docs list column (no explicit doc_col)
         for _, r in df.iterrows():
             q_raw = str(r[q_col]).strip()
             q_norm = q_raw.casefold().strip()
+            ans_raw = str(r[ans_col]).strip() if (ans_col and pd.notna(r[ans_col])) else ""
             rel_val = str(r[rel_list_col]) if pd.notna(r[rel_list_col]) else ""
             if not rel_val:
                     "question_raw": q_raw,
                     "question": q_norm,
                     "doc": None,
+                    "page": np.nan,
+                    "answer": ans_raw
                 })
                 continue
                     "question_raw": q_raw,
                     "question": q_norm,
                     "doc": filename_key(d),
+                    "page": np.nan,
+                    "answer": ans_raw
                 })
     # Case 2: doc/page columns (one relevant doc per row)
         for _, r in df.iterrows():
             q_raw = str(r[q_col]).strip()
             q_norm = q_raw.casefold().strip()
+            ans_raw = str(r[ans_col]).strip() if (ans_col and pd.notna(r[ans_col])) else ""
             d = str(r[doc_col]).strip() if pd.notna(r[doc_col]) else ""
             p = r[page_col] if (page_col and pd.notna(r[page_col])) else np.nan
                 "question_raw": q_raw,
                 "question": q_norm,
                 "doc": filename_key(d),
+                "page": p,
+                "answer": ans_raw
             })
     else:
     # Remove duplicates
     gold = gold.drop_duplicates(subset=["question", "doc", "page"])
+    # Build question -> gold_answer map (normalized questions)
+    gold_answers: Dict[str, str] = {}
+    if "answer" in gold.columns:
+        tmp = (
+            gold[["question", "answer"]]
+            .dropna(subset=["answer"])
+            .drop_duplicates(subset=["question"])
+        )
+        gold_answers = dict(zip(tmp["question"], tmp["answer"]))
+    return gold, gold_answers
 # ----------------------------- Metric Core ----------------------------- #
             dcg += 1.0 / np.log2(i + 1.0)
     return float(dcg)
 def ndcg_at_k(relevances: List[int]) -> float:
     dcg = dcg_at_k(relevances)
     ideal = sorted(relevances, reverse=True)
         return 0.0
     return float(dcg / idcg)
 def compute_metrics_for_question(gold_docs, gold_pages, hits, k):
     top = hits[:k] if hits else []
     pred_docs = [filename_key(h.get("doc", "")) for h in top]
         "n_pred": int(len(pred_docs))
     }
+# ---------------------- Answer Quality Metrics ---------------------- #
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+from rouge_score import rouge_scorer
+from bert_score import score as bert_score
+_SMOOTH = SmoothingFunction().method1
+_ROUGE_SCORER = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
+def _normalize_text_for_metrics(s: str) -> str:
+    import re
+    s = (s or "").strip().lower()
+    # remove simple markdown markers
+    s = re.sub(r"\*\*|\*", "", s)
+    # drop inline citations like (Doc.pdf, p.X)
+    s = re.sub(r"\([^)]*\)", " ", s)
+    s = re.sub(r"\s+", " ", s)
+    return s.strip()
+def compute_text_metrics(pred: str, ref: str) -> Dict[str, float]:
+    """
+    Compute lexical and semantic similarity metrics between prediction and reference:
+      - BLEU
+      - ROUGE-1/2/L (F-measure)
+      - BERTScore Recall, F1
+    """
+    pred_n = _normalize_text_for_metrics(pred)
+    ref_n  = _normalize_text_for_metrics(ref)
+    if not pred_n or not ref_n:
+        return {
+            "bleu": np.nan,
+            "rouge1": np.nan,
+            "rouge2": np.nan,
+            "rougeL": np.nan,
+            "bert_recall": np.nan,
+            "bert_f1": np.nan,
+        }
+    pred_tokens = pred_n.split()
+    ref_tokens  = ref_n.split()
+    # BLEU (sentence-level with smoothing)
+    bleu = float(sentence_bleu([ref_tokens], pred_tokens, smoothing_function=_SMOOTH))
+    # ROUGE via rouge-score (F-measure)
+    rs = _ROUGE_SCORER.score(ref_n, pred_n)
+    rouge1 = float(rs["rouge1"].fmeasure)
+    rouge2 = float(rs["rouge2"].fmeasure)
+    rougeL = float(rs["rougeL"].fmeasure)
+    # BERTScore (semantic similarity)
+    P, R, F1 = bert_score([pred_n], [ref_n], lang="en", rescale_with_baseline=True)
+    bert_recall = float(R.mean().item())
+    bert_f1     = float(F1.mean().item())
+    return {
+        "bleu": bleu,
+        "rouge1": rouge1,
+        "rouge2": rouge2,
+        "rougeL": rougeL,
+        "bert_recall": bert_recall,
+        "bert_f1": bert_f1,
+    }
 # ----------------------------- Orchestration ----------------------------- #
 COLOR_ACCENT = "\033[36m"    # cyan for metrics
 COLOR_RESET = "\033[0m"
 def _fmt(x: Any) -> str:
     try:
         return f"{float(x):.3f}"
     except Exception:
         return "-"
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--gold_csv", required=True, type=str)
         print(f"{COLOR_TEXT}❌ logs JSONL not found or empty at {logs_path}{COLOR_RESET}", file=sys.stderr)
         sys.exit(0)
+    # Read gold (retrieval + QA answers)
     try:
+        gold, gold_answers = read_gold(gold_path)
     except Exception as e:
         print(f"{COLOR_TEXT}❌ Failed to read gold: {e}{COLOR_RESET}", file=sys.stderr)
         sys.exit(0)
     # Build gold dict: normalized_question -> list of (doc, page)
     gdict: Dict[str, List[Tuple[str, Optional[int]]]] = {}
     for _, r in gold.iterrows():
+        q = str(r["question"]).strip()  # already normalized in read_gold
         d = r["doc"]
         p = r["page"] if "page" in r else np.nan
         gdict.setdefault(q, []).append((d, p))
         gpages = [p for (_, p) in pairs]
         if row.empty:
+            # No logs for this gold question → zero retrieval and no answer metrics
             not_in_logs.append(q_norm)
+            base_metrics = {
                 "hit@k_doc": 0,
                 "precision@k_doc": 0.0,
                 "recall@k_doc": 0.0,
                 ])),
                 "n_pred": 0
             }
+            txt_metrics = {
+                "bleu": np.nan,
+                "rouge1": np.nan,
+                "rouge2": np.nan,
+                "rougeL": np.nan,
+                "bert_recall": np.nan,
+                "bert_f1": np.nan,
+            }
             perq_rows.append({
                 "question": q_norm,
                 "covered_in_logs": 0,
+                **base_metrics,
+                **txt_metrics,
             })
             continue
         # Use aggregated hits from read_logs
         hits = row.iloc[0]["hits"] or []
+        base_metrics = compute_metrics_for_question(gdocs, gpages, hits, args.k)
+        # Answer text: predicted vs. gold
+        pred_answer = str(row.iloc[0].get("answer", "")).strip()
+        gold_answer = str(gold_answers.get(q_norm, "")).strip()
+        if gold_answer and pred_answer:
+            txt_metrics = compute_text_metrics(pred_answer, gold_answer)
+        else:
+            txt_metrics = {
+                "bleu": np.nan,
+                "rouge1": np.nan,
+                "rouge2": np.nan,
+                "rougeL": np.nan,
+                "bert_recall": np.nan,
+                "bert_f1": np.nan,
+            }
         perq_rows.append({
             "question": q_norm,
             "covered_in_logs": 1,
+            **base_metrics,
+            **txt_metrics,
         })
     # Any log questions not in gold
         "examples_in_logs_not_in_gold": list(dict.fromkeys(not_in_gold))[:10],
     }
+    # Aggregate answer-quality metrics (lexical + semantic)
+    if "bleu" in covered.columns:
+        agg["mean_bleu"] = float(covered["bleu"].mean(skipna=True))
+        agg["mean_rouge1"] = float(covered["rouge1"].mean(skipna=True))
+        agg["mean_rouge2"] = float(covered["rouge2"].mean(skipna=True))
+        agg["mean_rougeL"] = float(covered["rougeL"].mean(skipna=True))
+        agg["mean_bert_recall"] = float(covered["bert_recall"].mean(skipna=True))
+        agg["mean_bert_f1"] = float(covered["bert_f1"].mean(skipna=True))
     perq_path = out_dir / "metrics_per_question.csv"
     agg_path = out_dir / "metrics_aggregate.json"
         f"nDCG@k={_fmt(agg['mean_ndcg@k_doc'])}{COLOR_RESET}"
     )
+    if agg.get("mean_hit@k_page") is not None:
         print(
             f"{COLOR_TEXT}Page-level:{COLOR_RESET} "
             f"{COLOR_ACCENT}Hit@k={_fmt(agg['mean_hit@k_page'])}  "
             f"Precision@k={_fmt(agg['mean_precision@k_page'])}  "
+            f"Recall={_fmt(agg['mean_recall@k_page'])}  "
             f"nDCG@k={_fmt(agg['mean_ndcg@k_page'])}{COLOR_RESET}"
         )
     else:
         print(f"{COLOR_TEXT}Page-level: (no page labels in gold){COLOR_RESET}")
+    # Lexical metrics summary
+    if "mean_bleu" in agg:
+        print(
+            f"{COLOR_TEXT}Lexical (answer quality):{COLOR_RESET} "
+            f"{COLOR_ACCENT}BLEU={_fmt(agg['mean_bleu'])}  "
+            f"ROUGE-1={_fmt(agg['mean_rouge1'])}  "
+            f"ROUGE-2={_fmt(agg['mean_rouge2'])}  "
+            f"ROUGE-L={_fmt(agg['mean_rougeL'])}{COLOR_RESET}"
+        )
+    # Semantic metrics summary
+    if "mean_bert_f1" in agg:
+        print(
+            f"{COLOR_TEXT}Semantic (BERTScore):{COLOR_RESET} "
+            f"{COLOR_ACCENT}Recall={_fmt(agg['mean_bert_recall'])}  "
+            f"F1={_fmt(agg['mean_bert_f1'])}{COLOR_RESET}"
+        )
     print()
     print(f"{COLOR_TEXT}Wrote per-question CSV → {COLOR_ACCENT}{perq_path}{COLOR_RESET}")
     print(f"{COLOR_TEXT}Wrote aggregate JSON   → {COLOR_ACCENT}{agg_path}{COLOR_RESET}")
 if __name__ == "__main__":
     main()