import json

import numpy as np
import requests

API_URL = "http://localhost:8000/search"
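
# Expected contract (inferred from the calls below): POST
# {"query": ..., "top_k": ...} returning {"results": [{"filename": ...}, ...]}.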
# =====================================================
# Utility: MRR
# =====================================================
def compute_mrr(all_ranks):
    """Mean Reciprocal Rank over the queries that scored a hit."""
    if not all_ranks:
        return 0.0
    rr = [1.0 / r for r in all_ranks]
    return float(np.mean(rr))
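
# For example, hit ranks [1, 2, 4] give MRR = (1 + 1/2 + 1/4) / 3 ≈ 0.583.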
# =====================================================
# Utility: NDCG@K
# =====================================================
def compute_ndcg(results, k):
    """NDCG@k for a binary relevance list, e.g. [1, 0, 0, ...]."""
    dcg = 0.0
    for rank, rel in enumerate(results[:k], start=1):
        if rel == 1:
            dcg += 1 / np.log2(rank + 1)
    idcg = 1 / np.log2(1 + 1)  # exactly one relevant doc, so ideal rank = 1
    return dcg / idcg if idcg != 0 else 0.0
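
# For example, with the single relevant doc at rank 3 (k >= 3):
# DCG = 1/log2(3 + 1) = 0.5, IDCG = 1/log2(1 + 1) = 1.0, so NDCG = 0.5.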
# =====================================================
# MAIN EVALUATION FUNCTION
# =====================================================
def run_evaluation(query_file="generated_queries.json", top_k=10):
    """
    Evaluate retrieval over the generated queries.
    top_k defaults to 10 to approximate a realistic first page of results.
    """
    with open(query_file) as f:
        queries = json.load(f)

    correct = []
    ranks = []
    ndcg_scores = []
    detailed = []

    for item in queries:
        query = item["query"]
        expected = item["doc_id"] + ".txt"  # ground-truth filename
        # ----------------------------
        # CALL API
        # ----------------------------
        resp = requests.post(API_URL, json={"query": query, "top_k": top_k})
        if resp.status_code != 200:
            # Failed requests are skipped: they are not scored, but they
            # still count toward total_queries in the summary below.
            continue
        results = resp.json().get("results", [])
        retrieved = [r["filename"] for r in results]

        # binary relevance array for NDCG (one relevant doc per query)
        relevance = [1 if fn == expected else 0 for fn in retrieved]
        # ----------------------------
        # ACCURACY
        # ----------------------------
        hit = expected in retrieved
        correct.append(1 if hit else 0)

        # ----------------------------
        # RANK for MRR
        # ----------------------------
        if hit:
            rank_position = retrieved.index(expected) + 1
            ranks.append(rank_position)
        else:
            rank_position = None

        # ----------------------------
        # NDCG
        # ----------------------------
        ndcg_scores.append(compute_ndcg(relevance, top_k))

        # ----------------------------
        # Save detail
        # ----------------------------
        detailed.append({
            "query": query,
            "expected": expected,
            "retrieved": retrieved,
            "rank": rank_position,
            "is_correct": hit,
        })
    # =====================================================
    # FINAL METRICS
    # =====================================================
    accuracy = round(float(np.mean(correct)) * 100, 2) if correct else 0.0
    mrr = round(compute_mrr(ranks), 4)
    mean_ndcg = round(float(np.mean(ndcg_scores)), 4) if ndcg_scores else 0.0

    summary = {
        "accuracy": accuracy,
        "mrr": mrr,
        "ndcg": mean_ndcg,
        "total_queries": len(queries),
        "correct_count": sum(correct),
        "incorrect_count": len(queries) - sum(correct),
        "details": detailed,
    }
    return summary
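
if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the original file):
    # assumes the search service is reachable at API_URL and that
    # generated_queries.json sits next to this script.
    summary = run_evaluation()
    print(f"Accuracy: {summary['accuracy']}%")
    print(f"MRR:      {summary['mrr']}")
    print(f"NDCG@10:  {summary['ndcg']}")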