# evaluation/run_eval.py
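"""
End-to-end BEIR evaluation: load each dataset, index its corpus, run the
queries in one or more retrieval modes, and score the rankings with
pytrec_eval.

Example invocations (a sketch, assuming this file lives in the
`evaluation/` package implied by the imports below):

    python -m evaluation.run_eval --datasets scifact nfcorpus --mode all
    python -m evaluation.run_eval --datasets scifact --mode dense --skip-index
"""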
import argparse
import json
import os
import time
from evaluation.dataset_loader import DatasetLoader
from evaluation.indexer_bridge import IndexerBridge
from evaluation.query_runner import QueryRunner
from evaluation.evaluator import Evaluator
MODES = ["dense", "sparse", "hybrid", "full"]
DISPLAY_METRICS = ["NDCG@10", "MAP@100", "Recall@100", "P@10", "MRR"]
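# These keys are assumed to match the metric names Evaluator.evaluate()
# emits (pytrec_eval conventions); any key a mode is missing prints as 0.0000.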
# All supported datasets — add more here later if needed
AVAILABLE_DATASETS = {
"scifact": "data/scifact",
"nfcorpus": "data/nfcorpus",
}
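# Each path is assumed to hold the standard BEIR layout consumed by
# DatasetLoader (the exact filenames are DatasetLoader's contract), e.g.:
#   data/scifact/corpus.jsonl    (documents)
#   data/scifact/queries.jsonl   (queries)
#   data/scifact/qrels/test.tsv  (relevance judgments)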
def print_table(results: dict, title: str = ""):
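    """Print a fixed-width table with one row of DISPLAY_METRICS per retrieval mode."""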
col_w = 14
header = f"{'Mode':<10}" + "".join(f"{m:>{col_w}}" for m in DISPLAY_METRICS)
if title:
print(f"\n {title}")
print("=" * len(header))
print(header)
print("-" * len(header))
for mode, metrics in results.items():
row = f"{mode:<10}"
for m in DISPLAY_METRICS:
val = metrics.get(m, 0.0)
row += f"{val:>{col_w}.4f}"
print(row)
print("=" * len(header))
def print_comparison_table(all_dataset_results: dict):
    """
    Print a single comparison table across all datasets.
    Shows NDCG@10, MRR, and MAP@100 side by side for each dataset,
    using the "full" pipeline mode where available.
    """
print("\n" + "=" * 80)
print("CROSS-DATASET COMPARISON — full pipeline mode")
print("=" * 80)
    # Header (column widths: 14 + 12 + 10 + 10 = 46)
    header = f"{'Dataset':<14}{'NDCG@10':>12}{'MRR':>10}{'MAP@100':>10}"
    print(header)
    print("-" * len(header))
for dataset, mode_results in all_dataset_results.items():
# use "full" mode results for comparison, fallback to first mode
metrics = mode_results.get("full", list(mode_results.values())[0])
ndcg = metrics.get("NDCG@10", 0.0)
mrr = metrics.get("MRR", 0.0)
map_ = metrics.get("MAP@100", 0.0)
print(f"{dataset:<14}{ndcg:>12.4f}{mrr:>10.4f}{map_:>10.4f}")
print("=" * 46)
def run_single_dataset(dataset_name: str, dataset_path: str, args) -> dict:
"""Run full eval pipeline for one dataset. Returns mode→metrics dict."""
print(f"\n{'#'*60}")
print(f" DATASET: {dataset_name.upper()}")
print(f"{'#'*60}")
# 1 — load
print("\n[1/4] Loading dataset...")
loader = DatasetLoader(dataset_path)
corpus = loader.load_corpus()
queries = loader.load_queries()
qrels = loader.load_qrels()
# 2 — index
if not args.skip_index:
print("\n[2/4] Indexing corpus...")
bridge = IndexerBridge(args.config)
    # pass dataset_name so synthetic document paths take the form
    # <dataset>://doc_id, e.g. nfcorpus://doc_id
bridge.index_corpus(corpus, batch_size=64, dataset_name=dataset_name)
else:
print("\n[2/4] Skipping indexing (--skip-index)")
# 3 — run queries
print("\n[3/4] Running queries...")
runner = QueryRunner(args.config)
evaluator = Evaluator()
modes_to_run = MODES if args.mode == "all" else [args.mode]
all_mode_results = {}
for mode in modes_to_run:
print(f"\n Mode: {mode}")
t0 = time.time()
ranked_results = runner.run(queries, top_k=args.top_k, mode=mode)
elapsed = time.time() - t0
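        # score against qrels at cutoffs 1/5/10/100; DISPLAY_METRICS reads the @10 and @100 values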
metrics = evaluator.evaluate(ranked_results, qrels, k_values=[1, 5, 10, 100])
metrics["query_time_s"] = round(elapsed, 2)
all_mode_results[mode] = metrics
print(f" NDCG@10={metrics.get('NDCG@10', 0):.4f} "
f"MAP@100={metrics.get('MAP@100', 0):.4f} "
f"MRR={metrics.get('MRR', 0):.4f}")
# 4 — per-dataset table
print(f"\n[4/4] Results for {dataset_name.upper()}")
print_table(all_mode_results, title=f"EVALUATION RESULTS — {dataset_name} (pytrec_eval)")
return all_mode_results
def main():
parser = argparse.ArgumentParser(description="Evaluate semantic search on BEIR datasets")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=list(AVAILABLE_DATASETS.keys()),
        choices=list(AVAILABLE_DATASETS.keys()),
        help="Which datasets to evaluate, e.g. --datasets scifact nfcorpus",
    )
parser.add_argument("--config", default="config.yaml")
parser.add_argument("--top-k", default=100, type=int)
parser.add_argument("--skip-index", action="store_true")
parser.add_argument("--mode", default="all",
help="dense | sparse | hybrid | full | all")
args = parser.parse_args()
os.makedirs("results", exist_ok=True)
all_dataset_results = {}
for dataset_name in args.datasets:
dataset_path = AVAILABLE_DATASETS[dataset_name]
if not os.path.exists(dataset_path):
print(f"\n[WARNING] Dataset folder not found: {dataset_path} — skipping {dataset_name}")
continue
results = run_single_dataset(dataset_name, dataset_path, args)
all_dataset_results[dataset_name] = results
# save per-dataset report
report_path = f"results/eval_{dataset_name}.json"
with open(report_path, "w") as f:
json.dump(results, f, indent=2)
print(f" Saved → {report_path}")
# cross-dataset comparison (only if more than one dataset ran)
if len(all_dataset_results) > 1:
print_comparison_table(all_dataset_results)
# save combined report
combined_path = "results/eval_all.json"
with open(combined_path, "w") as f:
json.dump(all_dataset_results, f, indent=2)
print(f"\nCombined report saved → {combined_path}")
if __name__ == "__main__":
main()
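# Output report shapes (the per-dataset and combined files share the inner structure):
#   results/eval_<dataset>.json : {mode: {metric_name: value, ..., "query_time_s": seconds}}
#   results/eval_all.json       : {dataset: {mode: {metric_name: value, ...}}}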