# evaluation/run_eval.py
import argparse
import json
import os
import time

from evaluation.dataset_loader import DatasetLoader
from evaluation.indexer_bridge import IndexerBridge
from evaluation.query_runner import QueryRunner
from evaluation.evaluator import Evaluator

MODES = ["dense", "sparse", "hybrid", "full"]
DISPLAY_METRICS = ["NDCG@10", "MAP@100", "Recall@100", "P@10", "MRR"]

# All supported datasets — add more here later if needed
AVAILABLE_DATASETS = {
    "scifact": "data/scifact",
    "nfcorpus": "data/nfcorpus",
}


def print_table(results: dict, title: str = ""):
    col_w = 14
    header = f"{'Mode':<10}" + "".join(f"{m:>{col_w}}" for m in DISPLAY_METRICS)
    if title:
        print(f"\n {title}")
    print("=" * len(header))
    print(header)
    print("-" * len(header))
    for mode, metrics in results.items():
        row = f"{mode:<10}"
        for m in DISPLAY_METRICS:
            val = metrics.get(m, 0.0)
            row += f"{val:>{col_w}.4f}"
        print(row)
    print("=" * len(header))


def print_comparison_table(all_dataset_results: dict):
    """
    Print a single comparison table across all datasets.
    Shows NDCG@10, MRR and MAP@100 side by side for each dataset.
    """
    print("\n" + "=" * 80)
    print("CROSS-DATASET COMPARISON — full pipeline mode")
    print("=" * 80)

    # Header
    header = f"{'Dataset':<14}{'NDCG@10':>12}{'MRR':>10}{'MAP@100':>10}"
    print(header)
    print("-" * len(header))

    for dataset, mode_results in all_dataset_results.items():
        # use "full" mode results for comparison, fall back to the first mode
        metrics = mode_results.get("full", list(mode_results.values())[0])
        ndcg = metrics.get("NDCG@10", 0.0)
        mrr = metrics.get("MRR", 0.0)
        map_ = metrics.get("MAP@100", 0.0)
        print(f"{dataset:<14}{ndcg:>12.4f}{mrr:>10.4f}{map_:>10.4f}")
    print("=" * len(header))


def run_single_dataset(dataset_name: str, dataset_path: str, args) -> dict:
    """Run the full eval pipeline for one dataset. Returns a mode → metrics dict."""
    print(f"\n{'#' * 60}")
    print(f" DATASET: {dataset_name.upper()}")
    print(f"{'#' * 60}")

    # 1 — load
    print("\n[1/4] Loading dataset...")
    loader = DatasetLoader(dataset_path)
    corpus = loader.load_corpus()
    queries = loader.load_queries()
    qrels = loader.load_qrels()

    # 2 — index
    if not args.skip_index:
        print("\n[2/4] Indexing corpus...")
        bridge = IndexerBridge(args.config)
        # pass dataset_name so fake paths are e.g. nfcorpus://doc_id
        bridge.index_corpus(corpus, batch_size=64, dataset_name=dataset_name)
    else:
        print("\n[2/4] Skipping indexing (--skip-index)")

    # 3 — run queries
    print("\n[3/4] Running queries...")
    runner = QueryRunner(args.config)
    evaluator = Evaluator()
    modes_to_run = MODES if args.mode == "all" else [args.mode]
    all_mode_results = {}
    for mode in modes_to_run:
        print(f"\n Mode: {mode}")
        t0 = time.time()
        ranked_results = runner.run(queries, top_k=args.top_k, mode=mode)
        elapsed = time.time() - t0
        metrics = evaluator.evaluate(ranked_results, qrels, k_values=[1, 5, 10, 100])
        metrics["query_time_s"] = round(elapsed, 2)
        all_mode_results[mode] = metrics
        print(f" NDCG@10={metrics.get('NDCG@10', 0):.4f} "
              f"MAP@100={metrics.get('MAP@100', 0):.4f} "
              f"MRR={metrics.get('MRR', 0):.4f}")

    # 4 — per-dataset table
    print(f"\n[4/4] Results for {dataset_name.upper()}")
    print_table(all_mode_results, title=f"EVALUATION RESULTS — {dataset_name} (pytrec_eval)")
    return all_mode_results


def main():
    parser = argparse.ArgumentParser(description="Evaluate semantic search on BEIR datasets")
    parser.add_argument(
        "--datasets", nargs="+",
        default=["scifact", "nfcorpus"],
        choices=list(AVAILABLE_DATASETS.keys()),
        help="Which datasets to evaluate, e.g. --datasets scifact nfcorpus",
    )
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--top-k", default=100, type=int)
    parser.add_argument("--skip-index", action="store_true")
    parser.add_argument("--mode", default="all", help="dense | sparse | hybrid | full | all")
    args = parser.parse_args()

    os.makedirs("results", exist_ok=True)
    all_dataset_results = {}

    for dataset_name in args.datasets:
        dataset_path = AVAILABLE_DATASETS[dataset_name]
        if not os.path.exists(dataset_path):
            print(f"\n[WARNING] Dataset folder not found: {dataset_path} — skipping {dataset_name}")
            continue

        results = run_single_dataset(dataset_name, dataset_path, args)
        all_dataset_results[dataset_name] = results

        # save per-dataset report
        report_path = f"results/eval_{dataset_name}.json"
        with open(report_path, "w") as f:
            json.dump(results, f, indent=2)
        print(f" Saved → {report_path}")

    # cross-dataset comparison (only if more than one dataset ran)
    if len(all_dataset_results) > 1:
        print_comparison_table(all_dataset_results)

    # save combined report
    combined_path = "results/eval_all.json"
    with open(combined_path, "w") as f:
        json.dump(all_dataset_results, f, indent=2)
    print(f"\nCombined report saved → {combined_path}")


if __name__ == "__main__":
    main()
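
# Example invocations (a sketch, assuming the BEIR data has already been downloaded
# into data/scifact and data/nfcorpus, config.yaml exists, and the script is run as a
# module from the repo root so the evaluation.* imports resolve):
#
#   python -m evaluation.run_eval --datasets scifact nfcorpus
#   python -m evaluation.run_eval --datasets scifact --mode dense --skip-index
#
# Reports are written to results/eval_<dataset>.json, plus results/eval_all.json when
# more than one dataset is evaluated.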