# evaluation/evaluator.py
import math
from collections import defaultdict
class Evaluator:
"""
Computes standard IR evaluation metrics by comparing your
system's ranked results against the ground-truth qrels.
Metrics implemented:
        NDCG@k   - Normalized Discounted Cumulative Gain.
                   Measures ranking quality; rewards relevant docs appearing early.
                   Handles graded relevance (NFCorpus 0-3) and binary (SciFact 0-1).
        MAP@k    - Mean Average Precision: the average of precision computed at
                   each relevant doc position, over all relevant docs.
        Recall@k - fraction of all relevant docs found in the top-k.
        P@k      - Precision at k: fraction of the top-k that are relevant.
        MRR      - Mean Reciprocal Rank: reciprocal rank of the first relevant result.
"""
def ndcg_at_k(self, ranked: list, relevant: dict, k: int) -> float:
"""
        NDCG@k - the most important metric for ranked retrieval.
Score of 1.0 = perfect ranking, 0.0 = no relevant docs found.
Works for both:
- Binary relevance (SciFact): scores are 0 or 1
- Graded relevance (NFCorpus): scores are 0, 1, 2, or 3
"""
dcg = 0.0
for i, (doc_id, _) in enumerate(ranked[:k]):
rel = relevant.get(doc_id, 0)
if rel > 0:
dcg += rel / math.log2(i + 2) # i+2 because log2(1) = 0
        # Ideal DCG - best possible ranking given the relevant docs
ideal_rels = sorted(relevant.values(), reverse=True)[:k]
idcg = sum(
rel / math.log2(i + 2)
for i, rel in enumerate(ideal_rels)
if rel > 0
)
return dcg / idcg if idcg > 0 else 0.0
def map_at_k(self, ranked: list, relevant: dict, k: int) -> float:
"""
        MAP@k - average precision across all relevant document positions.
For graded relevance (NFCorpus), any score >= 1 counts as relevant.
"""
num_relevant = 0
sum_precision = 0.0
for i, (doc_id, _) in enumerate(ranked[:k]):
if relevant.get(doc_id, 0) > 0:
num_relevant += 1
sum_precision += num_relevant / (i + 1)
total_relevant = sum(1 for v in relevant.values() if v > 0)
if total_relevant == 0:
return 0.0
return sum_precision / total_relevant
def recall_at_k(self, ranked: list, relevant: dict, k: int) -> float:
"""
        Recall@k - what fraction of all relevant docs appear in the top-k.
For graded relevance, any score >= 1 counts as relevant.
"""
total_relevant = sum(1 for v in relevant.values() if v > 0)
if total_relevant == 0:
return 0.0
found = sum(
1 for doc_id, _ in ranked[:k]
if relevant.get(doc_id, 0) > 0
)
return found / total_relevant
def precision_at_k(self, ranked: list, relevant: dict, k: int) -> float:
"""
        P@k - fraction of the top-k results that are relevant.
For graded relevance, any score >= 1 counts as relevant.
"""
if k == 0:
return 0.0
hits = sum(
1 for doc_id, _ in ranked[:k]
if relevant.get(doc_id, 0) > 0
)
return hits / k
def mrr(self, ranked: list, relevant: dict) -> float:
"""
        MRR - reciprocal of the rank of the first relevant result.
Score of 1.0 = first result is relevant.
For graded relevance, any score >= 1 counts as relevant.
"""
for i, (doc_id, _) in enumerate(ranked):
if relevant.get(doc_id, 0) > 0:
return 1.0 / (i + 1)
return 0.0
def evaluate(
self,
all_results: dict,
qrels: dict,
        k_values: list | None = None,
) -> dict:
"""
Compute all metrics across all queries and average them.
Args:
            all_results - {query_id: [(doc_id, score), ...]} from QueryRunner
            qrels - {query_id: {doc_id: relevance}} from DatasetLoader
            k_values - list of k values, e.g. [1, 5, 10, 100]
        Returns:
            dict - {
"NDCG@10": 0.42,
"MAP@100": 0.38,
"Recall@100": 0.71,
"P@10": 0.15,
"MRR": 0.55,
"num_queries": 300,
"queries_with_results": 298,
"queries_with_no_qrels": 2,
}
"""
if k_values is None:
k_values = [1, 5, 10, 100]
scores = defaultdict(list)
num_queries = 0
queries_with_results = 0
queries_no_qrels = 0
for query_id, ranked in all_results.items():
relevant = qrels.get(query_id, {})
# skip queries that have no ground truth at all
if not relevant:
queries_no_qrels += 1
continue
num_queries += 1
if ranked:
queries_with_results += 1
for k in k_values:
scores[f"NDCG@{k}"].append(self.ndcg_at_k(ranked, relevant, k))
scores[f"MAP@{k}"].append(self.map_at_k(ranked, relevant, k))
scores[f"Recall@{k}"].append(self.recall_at_k(ranked, relevant, k))
scores[f"P@{k}"].append(self.precision_at_k(ranked, relevant, k))
scores["MRR"].append(self.mrr(ranked, relevant))
# Print diagnostic so you can see if queries matched correctly
print(f" Evaluated {num_queries} queries | "
f"{queries_with_results} had results | "
f"{queries_no_qrels} had no qrels (skipped)")
# Average across all queries
summary = {
metric: round(sum(vals) / len(vals), 4) if vals else 0.0
for metric, vals in scores.items()
}
summary["num_queries"] = num_queries
summary["queries_with_results"] = queries_with_results
summary["queries_with_no_qrels"] = queries_no_qrels
return summary
if __name__ == "__main__":
# Quick sanity check with toy data
evaluator = Evaluator()
    # Fake ranked results - for q1, doc_1 is relevant and doc_2/doc_3 are not;
    # for q2, doc_4 is relevant and doc_5 (also relevant) is never retrieved.
fake_results = {
"q1": [("doc_1", 0.95), ("doc_2", 0.80), ("doc_3", 0.60)],
"q2": [("doc_4", 0.70), ("doc_1", 0.50)],
}
fake_qrels = {
"q1": {"doc_1": 1},
"q2": {"doc_4": 1, "doc_5": 1},
}
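    # Hand-computed expectations for this toy data (not asserted, just a guide):
    #   NDCG@1 = 1.0, NDCG@5 = NDCG@10 ~= 0.8066  (q2 never retrieves doc_5, so
    #   its IDCG counts two relevant docs while its DCG finds only one)
    #   MAP@k = Recall@k = 0.75 for every k, P@1 = 1.0, P@5 = 0.2, P@10 = 0.1
    #   MRR = 1.0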
metrics = evaluator.evaluate(fake_results, fake_qrels, k_values=[1, 5, 10])
print("\nSanity check metrics:")
for k, v in metrics.items():
print(f" {k}: {v}")