Spaces:
Sleeping
Sleeping
Update rag_eval_metrics.py
Browse files- rag_eval_metrics.py +59 -17
rag_eval_metrics.py
CHANGED
|
@@ -4,9 +4,12 @@ rag_eval_metrics.py
|
|
| 4 |
|
| 5 |
Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
|
| 6 |
|
| 7 |
-
|
| 8 |
- Lexical similarity: BLEU, ROUGE-1/2/L
|
| 9 |
- Semantic similarity: BERTScore (Recall, F1)
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
import argparse
|
|
@@ -243,7 +246,7 @@ def read_gold(csv_path: Path) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
| 243 |
|
| 244 |
return gold, gold_answers
|
| 245 |
|
| 246 |
-
# ----------------------------- Metric Core ----------------------------- #
|
| 247 |
|
| 248 |
def dcg_at_k(relevances: List[int]) -> float:
|
| 249 |
dcg = 0.0
|
|
@@ -313,14 +316,40 @@ def compute_metrics_for_question(gold_docs, gold_pages, hits, k):
|
|
| 313 |
"n_pred": int(len(pred_docs))
|
| 314 |
}
|
| 315 |
|
| 316 |
-
# ---------------------- Answer Quality Metrics ---------------------- #
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
from
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
def _normalize_text_for_metrics(s: str) -> str:
|
| 326 |
import re
|
|
@@ -338,7 +367,21 @@ def compute_text_metrics(pred: str, ref: str) -> Dict[str, float]:
|
|
| 338 |
- BLEU
|
| 339 |
- ROUGE-1/2/L (F-measure)
|
| 340 |
- BERTScore Recall, F1
|
|
|
|
|
|
|
|
|
|
| 341 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
pred_n = _normalize_text_for_metrics(pred)
|
| 343 |
ref_n = _normalize_text_for_metrics(ref)
|
| 344 |
|
|
@@ -595,18 +638,18 @@ def main():
|
|
| 595 |
if "mean_bleu" in agg:
|
| 596 |
print(
|
| 597 |
f"{COLOR_TEXT}Lexical (answer quality):{COLOR_RESET} "
|
| 598 |
-
f"{COLOR_ACCENT}BLEU={_fmt(agg
|
| 599 |
-
f"ROUGE-1={_fmt(agg
|
| 600 |
-
f"ROUGE-2={_fmt(agg
|
| 601 |
-
f"ROUGE-L={_fmt(agg
|
| 602 |
)
|
| 603 |
|
| 604 |
# Semantic metrics summary
|
| 605 |
if "mean_bert_f1" in agg:
|
| 606 |
print(
|
| 607 |
f"{COLOR_TEXT}Semantic (BERTScore):{COLOR_RESET} "
|
| 608 |
-
f"{COLOR_ACCENT}Recall={_fmt(agg
|
| 609 |
-
f"F1={_fmt(agg
|
| 610 |
)
|
| 611 |
|
| 612 |
print()
|
|
@@ -615,4 +658,3 @@ def main():
|
|
| 615 |
|
| 616 |
if __name__ == "__main__":
|
| 617 |
main()
|
| 618 |
-
|
|
|
|
| 4 |
|
| 5 |
Evaluate RAG retrieval quality by comparing app logs (JSONL) with a gold file (CSV).
|
| 6 |
|
| 7 |
+
Extended to also evaluate answer quality using:
|
| 8 |
- Lexical similarity: BLEU, ROUGE-1/2/L
|
| 9 |
- Semantic similarity: BERTScore (Recall, F1)
|
| 10 |
+
|
| 11 |
+
If any of nltk / rouge-score / bert-score is missing, the script still runs and
|
| 12 |
+
returns NaN for these metrics instead of crashing.
|
| 13 |
"""
|
| 14 |
|
| 15 |
import argparse
|
|
|
|
| 246 |
|
| 247 |
return gold, gold_answers
|
| 248 |
|
| 249 |
+
# ----------------------------- Retrieval Metric Core ----------------------------- #
|
| 250 |
|
| 251 |
def dcg_at_k(relevances: List[int]) -> float:
|
| 252 |
dcg = 0.0
|
|
|
|
| 316 |
"n_pred": int(len(pred_docs))
|
| 317 |
}
|
| 318 |
|
| 319 |
+
# ---------------------- Answer Quality Metrics (with fallbacks) ---------------------- #
|
| 320 |
+
|
| 321 |
+
# Try to import optional libraries; if missing, we fall back to NaN metrics
|
| 322 |
+
try:
|
| 323 |
+
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
| 324 |
+
HAVE_NLTK = True
|
| 325 |
+
except Exception:
|
| 326 |
+
sentence_bleu = None
|
| 327 |
+
SmoothingFunction = None
|
| 328 |
+
HAVE_NLTK = False
|
| 329 |
+
|
| 330 |
+
try:
|
| 331 |
+
from rouge_score import rouge_scorer
|
| 332 |
+
HAVE_ROUGE = True
|
| 333 |
+
except Exception:
|
| 334 |
+
rouge_scorer = None
|
| 335 |
+
HAVE_ROUGE = False
|
| 336 |
+
|
| 337 |
+
try:
|
| 338 |
+
from bert_score import score as bert_score
|
| 339 |
+
HAVE_BERT = True
|
| 340 |
+
except Exception:
|
| 341 |
+
bert_score = None
|
| 342 |
+
HAVE_BERT = False
|
| 343 |
+
|
| 344 |
+
if HAVE_NLTK:
|
| 345 |
+
_SMOOTH = SmoothingFunction().method1
|
| 346 |
+
else:
|
| 347 |
+
_SMOOTH = None
|
| 348 |
+
|
| 349 |
+
if HAVE_ROUGE:
|
| 350 |
+
_ROUGE_SCORER = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
|
| 351 |
+
else:
|
| 352 |
+
_ROUGE_SCORER = None
|
| 353 |
|
| 354 |
def _normalize_text_for_metrics(s: str) -> str:
|
| 355 |
import re
|
|
|
|
| 367 |
- BLEU
|
| 368 |
- ROUGE-1/2/L (F-measure)
|
| 369 |
- BERTScore Recall, F1
|
| 370 |
+
|
| 371 |
+
If any of the required libraries (nltk, rouge-score, bert-score) is not installed,
|
| 372 |
+
returns NaN for all metrics.
|
| 373 |
"""
|
| 374 |
+
# If any of the libraries is missing, skip answer metrics
|
| 375 |
+
if not (HAVE_NLTK and HAVE_ROUGE and HAVE_BERT):
|
| 376 |
+
return {
|
| 377 |
+
"bleu": np.nan,
|
| 378 |
+
"rouge1": np.nan,
|
| 379 |
+
"rouge2": np.nan,
|
| 380 |
+
"rougeL": np.nan,
|
| 381 |
+
"bert_recall": np.nan,
|
| 382 |
+
"bert_f1": np.nan,
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
pred_n = _normalize_text_for_metrics(pred)
|
| 386 |
ref_n = _normalize_text_for_metrics(ref)
|
| 387 |
|
|
|
|
| 638 |
if "mean_bleu" in agg:
|
| 639 |
print(
|
| 640 |
f"{COLOR_TEXT}Lexical (answer quality):{COLOR_RESET} "
|
| 641 |
+
f"{COLOR_ACCENT}BLEU={_fmt(agg.get('mean_bleu'))} "
|
| 642 |
+
f"ROUGE-1={_fmt(agg.get('mean_rouge1'))} "
|
| 643 |
+
f"ROUGE-2={_fmt(agg.get('mean_rouge2'))} "
|
| 644 |
+
f"ROUGE-L={_fmt(agg.get('mean_rougeL'))}{COLOR_RESET}"
|
| 645 |
)
|
| 646 |
|
| 647 |
# Semantic metrics summary
|
| 648 |
if "mean_bert_f1" in agg:
|
| 649 |
print(
|
| 650 |
f"{COLOR_TEXT}Semantic (BERTScore):{COLOR_RESET} "
|
| 651 |
+
f"{COLOR_ACCENT}Recall={_fmt(agg.get('mean_bert_recall'))} "
|
| 652 |
+
f"F1={_fmt(agg.get('mean_bert_f1'))}{COLOR_RESET}"
|
| 653 |
)
|
| 654 |
|
| 655 |
print()
|
|
|
|
| 658 |
|
| 659 |
if __name__ == "__main__":
|
| 660 |
main()
|
|
|