import json import requests import numpy as np API_URL = "http://localhost:8000/search" # ===================================================== # Utility: MRR # ===================================================== def compute_mrr(all_ranks): if not all_ranks: return 0.0 rr = [1.0 / r for r in all_ranks] return float(np.mean(rr)) # ===================================================== # Utility: NDCG@K # ===================================================== def compute_ndcg(results, k): """results = [1,0,0...] relevance for retrieved docs""" dcg = 0 for rank, rel in enumerate(results[:k], start=1): if rel == 1: dcg += 1 / np.log2(rank + 1) idcg = 1 / np.log2(1 + 1) # ideal rank = 1 return dcg / idcg if idcg != 0 else 0 # ===================================================== # MAIN EVALUATION FUNCTION # ===================================================== def run_evaluation(query_file="generated_queries.json", top_k=10): """ top_k is FIXED = 10 for a realistic evaluation. """ with open(query_file) as f: queries = json.load(f) correct = [] ranks = [] ndcg_scores = [] detailed = [] for item in queries: query = item["query"] expected = item["doc_id"] + ".txt" # ---------------------------- # CALL API # ---------------------------- resp = requests.post(API_URL, json={"query": query, "top_k": top_k}) if resp.status_code != 200: continue results = resp.json().get("results", []) retrieved = [r["filename"] for r in results] # relevance array for NDCG relevance = [1 if fn == expected else 0 for fn in retrieved] # ---------------------------- # ACCURACY # ---------------------------- hit = expected in retrieved correct.append(1 if hit else 0) # ---------------------------- # RANK for MRR # ---------------------------- if hit: rank_position = retrieved.index(expected) + 1 ranks.append(rank_position) else: rank_position = None # ---------------------------- # NDCG # ---------------------------- ndcg_scores.append(compute_ndcg(relevance, top_k)) # ---------------------------- # Save detail # ---------------------------- detailed.append({ "query": query, "expected": expected, "retrieved": retrieved, "rank": rank_position, "is_correct": hit }) # ===================================================== # FINAL METRICS # ===================================================== accuracy = round(np.mean(correct) * 100, 2) mrr = round(compute_mrr(ranks), 4) mean_ndcg = round(float(np.mean(ndcg_scores)), 4) summary = { "accuracy": accuracy, "mrr": mrr, "ndcg": mean_ndcg, "total_queries": len(queries), "correct_count": sum(correct), "incorrect_count": len(queries) - sum(correct), "details": detailed } return summary