Spaces:
Sleeping
Sleeping
| import pytest | |
| import numpy as np | |
| from evaluation.metrics import ( | |
| precision_at_k, | |
| recall_at_k, | |
| mean_reciprocal_rank, | |
| average_precision, | |
| rag_score, | |
| bleu, | |
| rouge_l, | |
| bert_score, | |
| qags, | |
| fact_score, | |
| ragas_f, | |
| ) | |
def test_retrieval_metrics_simple():
    """Spot-check the core retrieval metrics on a tiny hand-computed ranking.

    Four documents are retrieved, three are relevant (one of which, "d5",
    is never retrieved), so the expected values below can be derived by hand.
    """
    ranked = ["d1", "d2", "d3", "d4"]
    gold = {"d2", "d4", "d5"}

    # (actual metric value, hand-computed expectation) pairs.
    checks = [
        (precision_at_k(ranked, gold, 2), 0.5),      # 1 relevant in top-2
        (precision_at_k(ranked, gold, 3), 1 / 3),    # 1 relevant in top-3
        (recall_at_k(ranked, gold, 2), 1 / 3),       # 1 of 3 relevant found
        (recall_at_k(ranked, gold, 4), 2 / 3),       # 2 of 3 relevant found
        (mean_reciprocal_rank(ranked, gold), 0.5),   # first hit at rank 2
        (average_precision(ranked, gold), 1 / 3),    # (1/2 + 2/4) / 3
    ]
    for actual, target in checks:
        assert actual == pytest.approx(target, rel=1e-6)
def test_rag_score_harmonic_mean():
    """rag_score combines component scores via a harmonic mean.

    Also checks the degenerate case: when any component is 0.0, the
    score collapses to 0.0 rather than dividing by zero.
    """
    components = {"retrieval_f1": 0.8, "generation_bleu": 0.6}
    harmonic = 2.0 / (1 / 0.8 + 1 / 0.6)
    assert rag_score(components) == pytest.approx(harmonic, rel=1e-6)

    # Zero component: the asserted behavior is a 0.0 score, not an error.
    degenerate = {"retrieval_f1": 0.0, "generation_bleu": 0.6}
    assert rag_score(degenerate) == pytest.approx(0.0, rel=1e-6)
def test_generation_metrics_fallback(preds, refs, expected_min):
    """Each generation metric returns a float equal (approx) to expected_min.

    NOTE(review): `preds`, `refs`, and `expected_min` are not defined in this
    file — presumably pytest fixtures or a parametrize decorator in
    conftest.py; confirm they are supplied there.
    """
    for metric in (bleu, rouge_l, bert_score):
        score = metric(preds, refs)
        assert isinstance(score, float)
        assert score == pytest.approx(expected_min, rel=1e-6)
def test_qags_factscore_ragas_f_fallback(preds, refs, ctxs, expected):
    """qags, fact_score, and ragas_f all agree (approx) with `expected`.

    NOTE(review): `preds`, `refs`, `ctxs`, and `expected` are not defined in
    this file — presumably fixtures or parametrization living in conftest.py;
    confirm before relying on this test.
    """
    target = pytest.approx(expected, rel=1e-6)
    assert qags(preds, refs) == target
    assert fact_score(preds, refs) == target
    # ragas_f additionally consumes the retrieved contexts.
    assert ragas_f(preds, refs, ctxs) == target