Inframat-x committed on
Commit
de52939
·
verified ·
1 Parent(s): 103dce8

Update rag_eval_metrics.py

Browse files
Files changed (1) hide show
  1. rag_eval_metrics.py +59 -17
rag_eval_metrics.py CHANGED
@@ -4,9 +4,12 @@ rag_eval_metrics.py
4
 
5
  Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
6
 
7
- Now extended to also evaluate answer quality using:
8
  - Lexical similarity: BLEU, ROUGE-1/2/L
9
  - Semantic similarity: BERTScore (Recall, F1)
 
 
 
10
  """
11
 
12
  import argparse
@@ -243,7 +246,7 @@ def read_gold(csv_path: Path) -> Tuple[pd.DataFrame, Dict[str, str]]:
243
 
244
  return gold, gold_answers
245
 
246
- # ----------------------------- Metric Core ----------------------------- #
247
 
248
  def dcg_at_k(relevances: List[int]) -> float:
249
  dcg = 0.0
@@ -313,14 +316,40 @@ def compute_metrics_for_question(gold_docs, gold_pages, hits, k):
313
  "n_pred": int(len(pred_docs))
314
  }
315
 
316
- # ---------------------- Answer Quality Metrics ---------------------- #
317
-
318
- from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
319
- from rouge_score import rouge_scorer
320
- from bert_score import score as bert_score
321
-
322
- _SMOOTH = SmoothingFunction().method1
323
- _ROUGE_SCORER = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  def _normalize_text_for_metrics(s: str) -> str:
326
  import re
@@ -338,7 +367,21 @@ def compute_text_metrics(pred: str, ref: str) -> Dict[str, float]:
338
  - BLEU
339
  - ROUGE-1/2/L (F-measure)
340
  - BERTScore Recall, F1
 
 
 
341
  """
 
 
 
 
 
 
 
 
 
 
 
342
  pred_n = _normalize_text_for_metrics(pred)
343
  ref_n = _normalize_text_for_metrics(ref)
344
 
@@ -595,18 +638,18 @@ def main():
595
  if "mean_bleu" in agg:
596
  print(
597
  f"{COLOR_TEXT}Lexical (answer quality):{COLOR_RESET} "
598
- f"{COLOR_ACCENT}BLEU={_fmt(agg['mean_bleu'])} "
599
- f"ROUGE-1={_fmt(agg['mean_rouge1'])} "
600
- f"ROUGE-2={_fmt(agg['mean_rouge2'])} "
601
- f"ROUGE-L={_fmt(agg['mean_rougeL'])}{COLOR_RESET}"
602
  )
603
 
604
  # Semantic metrics summary
605
  if "mean_bert_f1" in agg:
606
  print(
607
  f"{COLOR_TEXT}Semantic (BERTScore):{COLOR_RESET} "
608
- f"{COLOR_ACCENT}Recall={_fmt(agg['mean_bert_recall'])} "
609
- f"F1={_fmt(agg['mean_bert_f1'])}{COLOR_RESET}"
610
  )
611
 
612
  print()
@@ -615,4 +658,3 @@ def main():
615
 
616
  if __name__ == "__main__":
617
  main()
618
-
 
4
 
5
  Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
6
 
7
+ Extended to also evaluate answer quality using:
8
  - Lexical similarity: BLEU, ROUGE-1/2/L
9
  - Semantic similarity: BERTScore (Recall, F1)
10
+
11
+ If nltk / rouge-score / bert-score are missing, the script still runs and
12
+ returns NaN for these metrics instead of crashing.
13
  """
14
 
15
  import argparse
 
246
 
247
  return gold, gold_answers
248
 
249
+ # ----------------------------- Retrieval Metric Core ----------------------------- #
250
 
251
  def dcg_at_k(relevances: List[int]) -> float:
252
  dcg = 0.0
 
316
  "n_pred": int(len(pred_docs))
317
  }
318
 
319
+ # ---------------------- Answer Quality Metrics (with fallbacks) ---------------------- #
320
+
321
+ # Try to import optional libraries; if missing, we fall back to NaN metrics
322
+ try:
323
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
324
+ HAVE_NLTK = True
325
+ except Exception:
326
+ sentence_bleu = None
327
+ SmoothingFunction = None
328
+ HAVE_NLTK = False
329
+
330
+ try:
331
+ from rouge_score import rouge_scorer
332
+ HAVE_ROUGE = True
333
+ except Exception:
334
+ rouge_scorer = None
335
+ HAVE_ROUGE = False
336
+
337
+ try:
338
+ from bert_score import score as bert_score
339
+ HAVE_BERT = True
340
+ except Exception:
341
+ bert_score = None
342
+ HAVE_BERT = False
343
+
344
+ if HAVE_NLTK:
345
+ _SMOOTH = SmoothingFunction().method1
346
+ else:
347
+ _SMOOTH = None
348
+
349
+ if HAVE_ROUGE:
350
+ _ROUGE_SCORER = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
351
+ else:
352
+ _ROUGE_SCORER = None
353
 
354
  def _normalize_text_for_metrics(s: str) -> str:
355
  import re
 
367
  - BLEU
368
  - ROUGE-1/2/L (F-measure)
369
  - BERTScore Recall, F1
370
+
371
+ If the required libraries (nltk, rouge-score, bert-score) are not installed,
372
+ returns NaN for all metrics.
373
  """
374
+ # If any of the libraries is missing, skip answer metrics
375
+ if not (HAVE_NLTK and HAVE_ROUGE and HAVE_BERT):
376
+ return {
377
+ "bleu": np.nan,
378
+ "rouge1": np.nan,
379
+ "rouge2": np.nan,
380
+ "rougeL": np.nan,
381
+ "bert_recall": np.nan,
382
+ "bert_f1": np.nan,
383
+ }
384
+
385
  pred_n = _normalize_text_for_metrics(pred)
386
  ref_n = _normalize_text_for_metrics(ref)
387
 
 
638
  if "mean_bleu" in agg:
639
  print(
640
  f"{COLOR_TEXT}Lexical (answer quality):{COLOR_RESET} "
641
+ f"{COLOR_ACCENT}BLEU={_fmt(agg.get('mean_bleu'))} "
642
+ f"ROUGE-1={_fmt(agg.get('mean_rouge1'))} "
643
+ f"ROUGE-2={_fmt(agg.get('mean_rouge2'))} "
644
+ f"ROUGE-L={_fmt(agg.get('mean_rougeL'))}{COLOR_RESET}"
645
  )
646
 
647
  # Semantic metrics summary
648
  if "mean_bert_f1" in agg:
649
  print(
650
  f"{COLOR_TEXT}Semantic (BERTScore):{COLOR_RESET} "
651
+ f"{COLOR_ACCENT}Recall={_fmt(agg.get('mean_bert_recall'))} "
652
+ f"F1={_fmt(agg.get('mean_bert_f1'))}{COLOR_RESET}"
653
  )
654
 
655
  print()
 
658
 
659
  if __name__ == "__main__":
660
  main()