# evaluation/evaluator.py
import math
from collections import defaultdict


class Evaluator:
    """
    Computes standard IR evaluation metrics by comparing your
    system's ranked results against the ground-truth qrels.

    Metrics implemented:
        NDCG@k   - Normalized Discounted Cumulative Gain.
                   Measures ranking quality; rewards relevant docs appearing early.
                   Handles graded relevance (NFCorpus 0-3) and binary (SciFact 0-1).
        MAP@k    - Mean Average Precision.
                   Average of precision computed at each relevant doc position.
        Recall@k - Fraction of relevant docs found in the top-k.
        P@k      - Precision at k (fraction of the top-k that are relevant).
        MRR      - Mean Reciprocal Rank (position of the first relevant result).
    """

    def ndcg_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        NDCG@k - the most important metric for ranked retrieval.
        Score of 1.0 = perfect ranking, 0.0 = no relevant docs found.

        Works for both:
          - Binary relevance (SciFact): scores are 0 or 1
          - Graded relevance (NFCorpus): scores are 0, 1, 2, or 3
        """
        dcg = 0.0
        for i, (doc_id, _) in enumerate(ranked[:k]):
            rel = relevant.get(doc_id, 0)
            if rel > 0:
                dcg += rel / math.log2(i + 2)  # i+2 because log2(1) = 0

        # Ideal DCG - best possible ranking given the relevant docs
        ideal_rels = sorted(relevant.values(), reverse=True)[:k]
        idcg = sum(
            rel / math.log2(i + 2)
            for i, rel in enumerate(ideal_rels)
            if rel > 0
        )
        return dcg / idcg if idcg > 0 else 0.0
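
    # Toy worked example for ndcg_at_k (illustrative numbers only, not from a real dataset):
    #   ranked = [("d1", 0.9), ("d2", 0.7), ("d3", 0.5)], relevant = {"d1": 3, "d3": 1}, k = 3
    #   DCG  = 3/log2(2) + 0 + 1/log2(4) = 3.0 + 0.5   = 3.5
    #   IDCG = 3/log2(2) + 1/log2(3)     = 3.0 + 0.631 = 3.631   (ideal order: d1, d3)
    #   NDCG@3 = 3.5 / 3.631 ≈ 0.964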

    def map_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        MAP@k - average precision across all relevant document positions.
        For graded relevance (NFCorpus), any score >= 1 counts as relevant.
        """
        num_relevant = 0
        sum_precision = 0.0
        for i, (doc_id, _) in enumerate(ranked[:k]):
            if relevant.get(doc_id, 0) > 0:
                num_relevant += 1
                sum_precision += num_relevant / (i + 1)

        total_relevant = sum(1 for v in relevant.values() if v > 0)
        if total_relevant == 0:
            return 0.0
        return sum_precision / total_relevant
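
    # Toy worked example for map_at_k (illustrative numbers only):
    #   relevant = {"d1": 1, "d3": 2}, ranked = d1, d2, d3, k = 3
    #   hits at ranks 1 and 3  ->  precisions 1/1 and 2/3
    #   AP@3 = (1.0 + 0.667) / 2 relevant docs ≈ 0.833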

    def recall_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        Recall@k - what fraction of all relevant docs appear in the top-k.
        For graded relevance, any score >= 1 counts as relevant.
        """
        total_relevant = sum(1 for v in relevant.values() if v > 0)
        if total_relevant == 0:
            return 0.0
        found = sum(
            1 for doc_id, _ in ranked[:k]
            if relevant.get(doc_id, 0) > 0
        )
        return found / total_relevant
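
    # Toy example: 2 of the 4 relevant docs appear in the top-10  ->  Recall@10 = 2/4 = 0.5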

    def precision_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        P@k - fraction of the top-k results that are relevant.
        For graded relevance, any score >= 1 counts as relevant.
        """
        if k == 0:
            return 0.0
        hits = sum(
            1 for doc_id, _ in ranked[:k]
            if relevant.get(doc_id, 0) > 0
        )
        return hits / k
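
    # Toy example: 3 relevant docs in the top-10  ->  P@10 = 3/10 = 0.3
    # (Note: this implementation divides by k even if fewer than k results were returned.)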

    def mrr(self, ranked: list, relevant: dict) -> float:
        """
        MRR - reciprocal of the rank of the first relevant result.
        Score of 1.0 = first result is relevant.
        For graded relevance, any score >= 1 counts as relevant.
        """
        for i, (doc_id, _) in enumerate(ranked):
            if relevant.get(doc_id, 0) > 0:
                return 1.0 / (i + 1)
        return 0.0
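
    # Toy example: first relevant doc at rank 3  ->  reciprocal rank = 1/3 ≈ 0.333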

    def evaluate(
        self,
        all_results: dict,
        qrels: dict,
        k_values: list = None,
    ) -> dict:
        """
        Compute all metrics across all queries and average them.

        Args:
            all_results - {query_id: [(doc_id, score), ...]} from QueryRunner
            qrels       - {query_id: {doc_id: relevance}} from DatasetLoader
            k_values    - list of k values, e.g. [1, 5, 10, 100]

        Returns:
            dict - {
                "NDCG@10": 0.42,
                "MAP@100": 0.38,
                "Recall@100": 0.71,
                "P@10": 0.15,
                "MRR": 0.55,
                "num_queries": 300,
                "queries_with_results": 298,
                "queries_with_no_qrels": 2,
            }
        """
        if k_values is None:
            k_values = [1, 5, 10, 100]

        scores = defaultdict(list)
        num_queries = 0
        queries_with_results = 0
        queries_no_qrels = 0

        for query_id, ranked in all_results.items():
            relevant = qrels.get(query_id, {})

            # Skip queries that have no ground truth at all
            if not relevant:
                queries_no_qrels += 1
                continue

            num_queries += 1
            if ranked:
                queries_with_results += 1

            for k in k_values:
                scores[f"NDCG@{k}"].append(self.ndcg_at_k(ranked, relevant, k))
                scores[f"MAP@{k}"].append(self.map_at_k(ranked, relevant, k))
                scores[f"Recall@{k}"].append(self.recall_at_k(ranked, relevant, k))
                scores[f"P@{k}"].append(self.precision_at_k(ranked, relevant, k))
            scores["MRR"].append(self.mrr(ranked, relevant))

        # Print a diagnostic so you can see whether queries matched correctly
        print(f" Evaluated {num_queries} queries | "
              f"{queries_with_results} had results | "
              f"{queries_no_qrels} had no qrels (skipped)")

        # Average across all queries
        summary = {
            metric: round(sum(vals) / len(vals), 4) if vals else 0.0
            for metric, vals in scores.items()
        }
        summary["num_queries"] = num_queries
        summary["queries_with_results"] = queries_with_results
        summary["queries_with_no_qrels"] = queries_no_qrels
        return summary


if __name__ == "__main__":
    # Quick sanity check with toy data
    evaluator = Evaluator()

    # Fake ranked results - doc_1 is relevant, doc_2 is not
    fake_results = {
        "q1": [("doc_1", 0.95), ("doc_2", 0.80), ("doc_3", 0.60)],
        "q2": [("doc_4", 0.70), ("doc_1", 0.50)],
    }
    fake_qrels = {
        "q1": {"doc_1": 1},
        "q2": {"doc_4": 1, "doc_5": 1},
    }
    metrics = evaluator.evaluate(fake_results, fake_qrels, k_values=[1, 5, 10])
    print("\nSanity check metrics:")
    for k, v in metrics.items():
        print(f" {k}: {v}")