Spaces:

Inframat-x
/

ML-Chatbot

Sleeping

App Files Files Community

Inframat-x committed on Nov 26, 2025

Commit

de52939

verified ·

1 Parent(s): 103dce8

Update rag_eval_metrics.py

Browse files

Files changed (1) hide show

rag_eval_metrics.py +59 -17

rag_eval_metrics.py CHANGED Viewed

@@ -4,9 +4,12 @@ rag_eval_metrics.py
 Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
-Now extended to also evaluate answer quality using:
 - Lexical similarity: BLEU, ROUGE-1/2/L
 - Semantic similarity: BERTScore (Recall, F1)
 """
 import argparse
@@ -243,7 +246,7 @@ def read_gold(csv_path: Path) -> Tuple[pd.DataFrame, Dict[str, str]]:
     return gold, gold_answers
-# ----------------------------- Metric Core ----------------------------- #
 def dcg_at_k(relevances: List[int]) -> float:
     dcg = 0.0
@@ -313,14 +316,40 @@ def compute_metrics_for_question(gold_docs, gold_pages, hits, k):
         "n_pred": int(len(pred_docs))
     }
-# ---------------------- Answer Quality Metrics ---------------------- #
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-from rouge_score import rouge_scorer
-from bert_score import score as bert_score
-_SMOOTH = SmoothingFunction().method1
-_ROUGE_SCORER = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
 def _normalize_text_for_metrics(s: str) -> str:
     import re
@@ -338,7 +367,21 @@ def compute_text_metrics(pred: str, ref: str) -> Dict[str, float]:
       - BLEU
       - ROUGE-1/2/L (F-measure)
       - BERTScore Recall, F1
     """
     pred_n = _normalize_text_for_metrics(pred)
     ref_n  = _normalize_text_for_metrics(ref)
@@ -595,18 +638,18 @@ def main():
     if "mean_bleu" in agg:
         print(
             f"{COLOR_TEXT}Lexical (answer quality):{COLOR_RESET} "
-            f"{COLOR_ACCENT}BLEU={_fmt(agg['mean_bleu'])}  "
-            f"ROUGE-1={_fmt(agg['mean_rouge1'])}  "
-            f"ROUGE-2={_fmt(agg['mean_rouge2'])}  "
-            f"ROUGE-L={_fmt(agg['mean_rougeL'])}{COLOR_RESET}"
         )
     # Semantic metrics summary
     if "mean_bert_f1" in agg:
         print(
             f"{COLOR_TEXT}Semantic (BERTScore):{COLOR_RESET} "
-            f"{COLOR_ACCENT}Recall={_fmt(agg['mean_bert_recall'])}  "
-            f"F1={_fmt(agg['mean_bert_f1'])}{COLOR_RESET}"
         )
     print()
@@ -615,4 +658,3 @@ def main():
 if __name__ == "__main__":
     main()

 Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
+Extended to also evaluate answer quality using:
 - Lexical similarity: BLEU, ROUGE-1/2/L
 - Semantic similarity: BERTScore (Recall, F1)
+If any of nltk / rouge-score / bert-score is missing, the script still runs and
+returns NaN for all answer-quality metrics instead of crashing.
 """
 import argparse
     return gold, gold_answers
+# ----------------------------- Retrieval Metric Core ----------------------------- #
 def dcg_at_k(relevances: List[int]) -> float:
     dcg = 0.0
         "n_pred": int(len(pred_docs))
     }
+# ---------------------- Answer Quality Metrics (with fallbacks) ---------------------- #
+# Try to import optional libraries; if missing, we fall back to NaN metrics
+try:
+    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+    HAVE_NLTK = True
+except Exception:
+    sentence_bleu = None
+    SmoothingFunction = None
+    HAVE_NLTK = False
+try:
+    from rouge_score import rouge_scorer
+    HAVE_ROUGE = True
+except Exception:
+    rouge_scorer = None
+    HAVE_ROUGE = False
+try:
+    from bert_score import score as bert_score
+    HAVE_BERT = True
+except Exception:
+    bert_score = None
+    HAVE_BERT = False
+if HAVE_NLTK:
+    _SMOOTH = SmoothingFunction().method1
+else:
+    _SMOOTH = None
+if HAVE_ROUGE:
+    _ROUGE_SCORER = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
+else:
+    _ROUGE_SCORER = None
 def _normalize_text_for_metrics(s: str) -> str:
     import re
       - BLEU
       - ROUGE-1/2/L (F-measure)
       - BERTScore Recall, F1
+    If any of the required libraries (nltk, rouge-score, bert-score) is not installed,
+    returns NaN for all metrics.
     """
+    # If any of the libraries is missing, skip answer metrics
+    if not (HAVE_NLTK and HAVE_ROUGE and HAVE_BERT):
+        return {
+            "bleu": np.nan,
+            "rouge1": np.nan,
+            "rouge2": np.nan,
+            "rougeL": np.nan,
+            "bert_recall": np.nan,
+            "bert_f1": np.nan,
+        }
     pred_n = _normalize_text_for_metrics(pred)
     ref_n  = _normalize_text_for_metrics(ref)
     if "mean_bleu" in agg:
         print(
             f"{COLOR_TEXT}Lexical (answer quality):{COLOR_RESET} "
+            f"{COLOR_ACCENT}BLEU={_fmt(agg.get('mean_bleu'))}  "
+            f"ROUGE-1={_fmt(agg.get('mean_rouge1'))}  "
+            f"ROUGE-2={_fmt(agg.get('mean_rouge2'))}  "
+            f"ROUGE-L={_fmt(agg.get('mean_rougeL'))}{COLOR_RESET}"
         )
     # Semantic metrics summary
     if "mean_bert_f1" in agg:
         print(
             f"{COLOR_TEXT}Semantic (BERTScore):{COLOR_RESET} "
+            f"{COLOR_ACCENT}Recall={_fmt(agg.get('mean_bert_recall'))}  "
+            f"F1={_fmt(agg.get('mean_bert_f1'))}{COLOR_RESET}"
         )
     print()
 if __name__ == "__main__":
     main()