# -*- coding: utf-8 -*-
"""
TREC Benchmark Script - SysCRED
================================

Run TREC-style evaluation on the fact-checking system.

This script:
1. Loads TREC AP88-90 topics and qrels
2. Runs retrieval with multiple models (BM25 and QLD, each with and
   without pseudo-relevance feedback)
3. Evaluates using pytrec_eval metrics
4. Generates comparison tables (console, JSON, and LaTeX)

Usage:
    python run_trec_benchmark.py --index /path/to/index --qrels /path/to/qrels

(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerEvaluationModelesRecherche2025
"""

import sys
import json
import argparse
import time
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
from collections import defaultdict

# Add parent directory to path so the syscred package is importable
sys.path.insert(0, str(Path(__file__).parent))

from syscred.trec_retriever import TRECRetriever, RetrievalResult
from syscred.trec_dataset import TRECDataset, SAMPLE_TOPICS
from syscred.eval_metrics import EvaluationMetrics
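# ---------------------------------------------------------------------------
# File-format notes (illustrative; the doc_ids below are toy identifiers in
# the AP88-90 naming style).
#
# Qrels consumed by the benchmark are assumed to use the standard 4-column
# TREC layout, "topic_id iteration doc_id relevance":
#
#     51 0 AP880212-0001 1
#
# Run files produced by run_configuration() use the standard 6-column TREC
# layout, "topic_id Q0 doc_id rank score run_tag":
#
#     51 Q0 AP880212-0001 1 12.500000 syscred_BM25_short
# ---------------------------------------------------------------------------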
class TRECBenchmark:
    """
    TREC-style benchmark runner for SysCRED.

    Runs multiple retrieval configurations and compares performance
    using standard IR metrics.
    """

    # Configurations to test
    CONFIGURATIONS = [
        {"name": "BM25", "model": "bm25", "prf": False},
        {"name": "BM25+PRF", "model": "bm25", "prf": True},
        {"name": "QLD", "model": "qld", "prf": False},
        {"name": "QLD+PRF", "model": "qld", "prf": True},
    ]

    # Metrics to evaluate
    METRICS = ["map", "ndcg", "P_10", "P_20", "recall_100", "recip_rank"]

    def __init__(
        self,
        index_path: Optional[str] = None,
        corpus_path: Optional[str] = None,
        topics_path: Optional[str] = None,
        qrels_path: Optional[str] = None,
        output_dir: Optional[str] = None
    ):
        """
        Initialize the benchmark runner.

        Args:
            index_path: Path to Lucene index
            corpus_path: Path to JSONL corpus
            topics_path: Path to TREC topics
            qrels_path: Path to TREC qrels
            output_dir: Directory for output files
        """
        self.index_path = index_path
        self.corpus_path = corpus_path
        self.topics_path = topics_path
        self.qrels_path = qrels_path
        self.output_dir = Path(output_dir) if output_dir else Path("benchmark_results")

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Initialize components
        self.dataset = TRECDataset(
            topics_dir=topics_path,
            qrels_dir=qrels_path,
            corpus_path=corpus_path
        )
        self.retriever = TRECRetriever(
            index_path=index_path,
            corpus_path=corpus_path,
            use_stemming=True
        )
        self.metrics = EvaluationMetrics()

        # Results storage
        self.results: Dict[str, Dict[str, Any]] = {}

    def load_data(self):
        """Load topics, qrels, and (optionally) the corpus."""
        print("\n" + "=" * 60)
        print("Loading TREC Data")
        print("=" * 60)

        # Load topics
        if self.topics_path:
            self.dataset.load_topics(self.topics_path)
        else:
            # Use sample topics
            print("[Benchmark] Using sample topics (no topics file provided)")
            self.dataset.topics = SAMPLE_TOPICS.copy()

        # Load qrels
        if self.qrels_path:
            self.dataset.load_qrels(self.qrels_path)
        else:
            print("[Benchmark] No qrels provided - evaluation will be limited")

        # Load corpus if available
        if self.corpus_path:
            self.dataset.load_corpus_jsonl(self.corpus_path)

        stats = self.dataset.get_statistics()
        print("\nDataset Statistics:")
        for key, value in stats.items():
            print(f"  {key}: {value}")

    def run_configuration(
        self,
        config: Dict[str, Any],
        query_type: str = "short",
        k: int = 100
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Run a single retrieval configuration.

        Returns:
            (run_tag, results_dict)
        """
        config_name = config["name"]
        model = config["model"]
        use_prf = config["prf"]
        run_tag = f"syscred_{config_name}_{query_type}"

        print(f"\n--- Running: {run_tag} ---")

        queries = self.dataset.get_topic_queries(query_type)
        if not queries:
            print("  No queries available!")
            return run_tag, {}

        # Run retrieval
        start_time = time.time()
        all_results = []
        run_lines = []

        for topic_id, query_text in queries.items():
            result: RetrievalResult = self.retriever.retrieve_evidence(
                claim=query_text,
                k=k,
                model=model,
                use_prf=use_prf
            )
            for evidence in result.evidences:
                all_results.append({
                    "topic_id": topic_id,
                    "doc_id": evidence.doc_id,
                    "score": evidence.score,
                    "rank": evidence.rank
                })
                # Standard 6-column TREC run format
                run_lines.append(
                    f"{topic_id} Q0 {evidence.doc_id} {evidence.rank} "
                    f"{evidence.score:.6f} {run_tag}"
                )

        elapsed = time.time() - start_time

        # Save run file
        run_file = self.output_dir / f"{run_tag}.run"
        with open(run_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(run_lines))

        print(f"  Queries: {len(queries)}")
        print(f"  Total results: {len(all_results)}")
        print(f"  Time: {elapsed:.2f}s")
        print(f"  Saved: {run_file}")

        return run_tag, {
            "config": config,
            "query_type": query_type,
            "results": all_results,
            "run_file": str(run_file),
            "elapsed_time": elapsed
        }

    def evaluate_run(self, run_tag: str, results: Dict[str, Any]) -> Dict[str, float]:
        """
        Evaluate a run using pytrec_eval.

        Returns:
            Dictionary of metric -> value (aggregated across queries).
        """
        if not self.dataset.qrels:
            print("  [Skip evaluation - no qrels]")
            return {}

        # Convert results to pytrec format: {query_id: [(doc_id, score), ...]}
        run = defaultdict(list)
        for r in results["results"]:
            run[r["topic_id"]].append((r["doc_id"], r["score"]))

        # Sort each query's results by score descending
        for qid in run:
            run[qid].sort(key=lambda x: x[1], reverse=True)

        # Convert qrels to pytrec format (pytrec_eval expects integer relevance)
        qrels = {}
        for topic_id, docs in self.dataset.qrels.items():
            qrels[topic_id] = {doc_id: int(rel) for doc_id, rel in docs.items()}

        # Evaluate
        try:
            per_query_results = self.metrics.evaluate_run(dict(run), qrels, self.METRICS)
            # Aggregate results across queries
            aggregated = self.metrics.compute_aggregate(per_query_results)
            return aggregated
        except Exception as e:
            print(f"  [Evaluation error: {e}]")
            return {}

    def run_full_benchmark(self, query_types: Optional[List[str]] = None, k: int = 100):
        """
        Run the complete benchmark suite.

        Args:
            query_types: List of query types to test ("short", "long")
            k: Number of results per query
        """
        if query_types is None:
            query_types = ["short", "long"]

        print("\n" + "=" * 60)
        print("TREC Benchmark - SysCRED")
        print("=" * 60)

        # Load data
        self.load_data()

        # Run all configurations
        print("\n" + "=" * 60)
        print("Running Retrieval Experiments")
        print("=" * 60)

        for query_type in query_types:
            for config in self.CONFIGURATIONS:
                run_tag, results = self.run_configuration(config, query_type, k)
                if results:
                    self.results[run_tag] = results
                    # Evaluate
                    metrics = self.evaluate_run(run_tag, results)
                    self.results[run_tag]["metrics"] = metrics

        # Generate report
        self.generate_report()

        return self.results

    def generate_report(self):
        """Generate the summary report (console table, JSON, LaTeX)."""
        print("\n" + "=" * 60)
        print("Benchmark Results Summary")
        print("=" * 60)

        # Table header
        header = ["Configuration", "Query", "MAP", "NDCG", "P@10", "MRR", "Time(s)"]
        print("\n" + " | ".join(f"{h:^12}" for h in header))
        print("-" * 100)

        # Table rows
        for run_tag, data in self.results.items():
            metrics = data.get("metrics", {})
            row = [
                data["config"]["name"][:12],
                data["query_type"][:5],
                f"{metrics.get('map', 0):.4f}",
                f"{metrics.get('ndcg', 0):.4f}",
                f"{metrics.get('P_10', 0):.4f}",
                f"{metrics.get('recip_rank', 0):.4f}",
                f"{data.get('elapsed_time', 0):.2f}"
            ]
            print(" | ".join(f"{v:^12}" for v in row))

        # Save detailed results (per-document lists are summarized to keep
        # the JSON compact and serializable)
        results_file = self.output_dir / "benchmark_results.json"
        serializable_results = {}
        for run_tag, data in self.results.items():
            serializable_results[run_tag] = {
                "config": data["config"],
                "query_type": data["query_type"],
                "metrics": data.get("metrics", {}),
                "elapsed_time": data.get("elapsed_time", 0),
                "num_results": len(data.get("results", []))
            }

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(serializable_results, f, indent=2)

        print(f"\nDetailed results saved to: {results_file}")

        # Generate LaTeX table
        self._generate_latex_table()

    def _generate_latex_table(self):
        """Generate a LaTeX results table for the paper."""
        latex_file = self.output_dir / "results_table.tex"

        lines = [
            r"\begin{table}[ht]",
            r"\centering",
            r"\caption{TREC AP88-90 Retrieval Results}",
            r"\label{tab:trec-results}",
            r"\begin{tabular}{l|l|cccc}",
            r"\toprule",
            r"Model & Query & MAP & NDCG & P@10 & MRR \\",
            r"\midrule"
        ]

        for run_tag, data in self.results.items():
            metrics = data.get("metrics", {})
            row = (
                f"{data['config']['name']} & {data['query_type']} & "
                f"{metrics.get('map', 0):.4f} & "
                f"{metrics.get('ndcg', 0):.4f} & "
                f"{metrics.get('P_10', 0):.4f} & "
                f"{metrics.get('recip_rank', 0):.4f} \\\\"
            )
            lines.append(row)

        lines.extend([
            r"\bottomrule",
            r"\end{tabular}",
            r"\end{table}"
        ])

        with open(latex_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))

        print(f"LaTeX table saved to: {latex_file}")
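# EvaluationMetrics is assumed to wrap pytrec_eval. The sketch below shows
# the raw pytrec_eval call it presumably makes, with toy qrels/run data;
# it is illustrative only and is not used by the benchmark.
def _pytrec_eval_sketch() -> Dict[str, Dict[str, float]]:
    """Minimal pytrec_eval usage sketch (toy data, illustrative only)."""
    import pytrec_eval

    # Qrels: {query_id: {doc_id: integer relevance}}
    qrels = {"51": {"AP880212-0001": 1, "AP880212-0002": 0}}
    # Run: {query_id: {doc_id: retrieval score}}
    run = {"51": {"AP880212-0001": 12.5, "AP880212-0002": 3.1}}

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"map", "ndcg"})
    # Returns per-query scores, e.g. {"51": {"map": ..., "ndcg": ...}}
    return evaluator.evaluate(run)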
def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Run TREC benchmark for SysCRED"
    )
    parser.add_argument("--index", "-i", help="Path to Lucene index")
    parser.add_argument("--corpus", "-c", help="Path to JSONL corpus")
    parser.add_argument("--topics", "-t", help="Path to TREC topics file/directory")
    parser.add_argument("--qrels", "-q", help="Path to TREC qrels file/directory")
    parser.add_argument(
        "--output", "-o", default="benchmark_results",
        help="Output directory for results"
    )
    parser.add_argument(
        "--k", type=int, default=100,
        help="Number of results per query"
    )

    args = parser.parse_args()

    # Run benchmark
    benchmark = TRECBenchmark(
        index_path=args.index,
        corpus_path=args.corpus,
        topics_path=args.topics,
        qrels_path=args.qrels,
        output_dir=args.output
    )
    benchmark.run_full_benchmark(k=args.k)

    print("\n" + "=" * 60)
    print("Benchmark Complete!")
    print("=" * 60)
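# Minimal programmatic usage, bypassing the CLI (paths are placeholders):
#
#     bench = TRECBenchmark(
#         index_path="indexes/ap8890",
#         topics_path="trec/topics.51-100",
#         qrels_path="trec/qrels.51-100",
#     )
#     results = bench.run_full_benchmark(query_types=["short"], k=100)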
if __name__ == "__main__":
    main()