syscred_duplicate / syscred /run_trec_benchmark.py
D Ф m i И i q ц e L Ф y e r
Deploy SysCRED with PyTorch
e70050b
# -*- coding: utf-8 -*-
"""
TREC Benchmark Script - SysCRED
================================
Run TREC-style evaluation on the fact-checking system.
This script:
1. Loads TREC AP88-90 topics and qrels
2. Runs retrieval with multiple models (BM25 and QLD, each with and without PRF)
3. Evaluates using pytrec_eval metrics
4. Generates comparison tables (console, JSON summary, LaTeX)
Usage:
python run_trec_benchmark.py --index /path/to/index --qrels /path/to/qrels
(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerEvaluationModelesRecherche2025
"""
import argparse
import json
import os
import sys
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Add the directory containing this script to the import path
sys.path.insert(0, str(Path(__file__).parent))
from syscred.trec_retriever import TRECRetriever, RetrievalResult
from syscred.trec_dataset import TRECDataset, SAMPLE_TOPICS
from syscred.eval_metrics import EvaluationMetrics
class TRECBenchmark:
    """
    TREC-style benchmark runner for SysCRED.

    Runs every entry of ``CONFIGURATIONS`` for each requested query type,
    writes one TREC run file per run, evaluates each run against the loaded
    qrels and finally emits a console table, a JSON summary and a LaTeX
    table into ``output_dir``.
    """

    # Configurations to test
    CONFIGURATIONS = [
        {"name": "BM25", "model": "bm25", "prf": False},
        {"name": "BM25+PRF", "model": "bm25", "prf": True},
        {"name": "QLD", "model": "qld", "prf": False},
        {"name": "QLD+PRF", "model": "qld", "prf": True},
    ]

    # Metrics to evaluate
    METRICS = ["map", "ndcg", "P_10", "P_20", "recall_100", "recip_rank"]

    # Subset of METRICS shown in the console and LaTeX tables, in column
    # order (MAP, NDCG, P@10, MRR). Kept in one place so both tables agree.
    _REPORT_METRICS = ("map", "ndcg", "P_10", "recip_rank")

    def __init__(
        self,
        index_path: Optional[str] = None,
        corpus_path: Optional[str] = None,
        topics_path: Optional[str] = None,
        qrels_path: Optional[str] = None,
        output_dir: Optional[str] = None,
    ):
        """
        Initialize the benchmark runner.

        Creates the output directory (including parents) if it does not
        exist and instantiates the dataset, retriever and metrics helpers.

        Args:
            index_path: Path to Lucene index
            corpus_path: Path to JSONL corpus
            topics_path: Path to TREC topics
            qrels_path: Path to TREC qrels
            output_dir: Directory for output files; defaults to
                ``benchmark_results`` when empty or None
        """
        self.index_path = index_path
        self.corpus_path = corpus_path
        self.topics_path = topics_path
        self.qrels_path = qrels_path
        self.output_dir = Path(output_dir) if output_dir else Path("benchmark_results")

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Initialize components
        self.dataset = TRECDataset(
            topics_dir=topics_path,
            qrels_dir=qrels_path,
            corpus_path=corpus_path,
        )
        self.retriever = TRECRetriever(
            index_path=index_path,
            corpus_path=corpus_path,
            use_stemming=True,
        )
        self.metrics = EvaluationMetrics()

        # Results storage: run_tag -> run data (plus "metrics" once evaluated)
        self.results: Dict[str, Dict[str, Any]] = {}

    def load_data(self):
        """Load topics, qrels and (optionally) the corpus, then print stats.

        Falls back to ``SAMPLE_TOPICS`` when no topics path was given, and
        only prints a notice when no qrels path was given.
        """
        print("\n" + "=" * 60)
        print("Loading TREC Data")
        print("=" * 60)

        # Load topics
        if self.topics_path:
            self.dataset.load_topics(self.topics_path)
        else:
            # Use sample topics
            print("[Benchmark] Using sample topics (no topics file provided)")
            self.dataset.topics = SAMPLE_TOPICS.copy()

        # Load qrels
        if self.qrels_path:
            self.dataset.load_qrels(self.qrels_path)
        else:
            print("[Benchmark] No qrels provided - evaluation will be limited")

        # Load corpus if available
        if self.corpus_path:
            self.dataset.load_corpus_jsonl(self.corpus_path)

        stats = self.dataset.get_statistics()
        print("\nDataset Statistics:")
        for key, value in stats.items():
            print(f" {key}: {value}")

    def run_configuration(
        self,
        config: Dict[str, Any],
        query_type: str = "short",
        k: int = 100,
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Run a single retrieval configuration and write its TREC run file.

        Args:
            config: Entry with "name", "model" and "prf" keys (see
                ``CONFIGURATIONS``)
            query_type: Query variant passed to the dataset's
                ``get_topic_queries``
            k: Number of results requested per query

        Returns:
            (run_tag, results_dict). ``results_dict`` is empty when the
            dataset yields no queries; otherwise it holds "config",
            "query_type", "results", "run_file" and "elapsed_time".
        """
        config_name = config["name"]
        model = config["model"]
        use_prf = config["prf"]
        run_tag = f"syscred_{config_name}_{query_type}"
        print(f"\n--- Running: {run_tag} ---")

        queries = self.dataset.get_topic_queries(query_type)
        if not queries:
            print(" No queries available!")
            return run_tag, {}

        # Run retrieval. perf_counter() is monotonic, so the measured
        # duration cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        all_results = []
        run_lines = []
        for topic_id, query_text in queries.items():
            result = self.retriever.retrieve_evidence(
                claim=query_text,
                k=k,
                model=model,
                use_prf=use_prf,
            )
            for evidence in result.evidences:
                all_results.append({
                    "topic_id": topic_id,
                    "doc_id": evidence.doc_id,
                    "score": evidence.score,
                    "rank": evidence.rank,
                })
                run_lines.append(
                    f"{topic_id} Q0 {evidence.doc_id} {evidence.rank} "
                    f"{evidence.score:.6f} {run_tag}"
                )
        elapsed = time.perf_counter() - start_time

        # Save run file: one newline-terminated record per line, with an
        # explicit encoding so the output does not depend on the locale.
        run_file = self.output_dir / f"{run_tag}.run"
        with open(run_file, "w", encoding="utf-8") as f:
            f.writelines(f"{line}\n" for line in run_lines)

        print(f" Queries: {len(queries)}")
        print(f" Total results: {len(all_results)}")
        print(f" Time: {elapsed:.2f}s")
        print(f" Saved: {run_file}")

        return run_tag, {
            "config": config,
            "query_type": query_type,
            "results": all_results,
            "run_file": str(run_file),
            "elapsed_time": elapsed,
        }

    def evaluate_run(self, run_tag: str, results: Dict[str, Any]) -> Dict[str, float]:
        """
        Evaluate a run against the loaded qrels.

        Args:
            run_tag: Identifier of the run (not used in the computation)
            results: Dict as returned by ``run_configuration``; its
                "results" list is read

        Returns:
            Dictionary of metric -> value (aggregated across queries), or an
            empty dict when no qrels are loaded or evaluation raises.
        """
        if not self.dataset.qrels:
            print(" [Skip evaluation - no qrels]")
            return {}

        # Group results per query: {query_id: [(doc_id, score), ...]}
        run = defaultdict(list)
        for r in results["results"]:
            run[r["topic_id"]].append((r["doc_id"], r["score"]))

        # Sort each query's results by score descending
        for ranked in run.values():
            ranked.sort(key=lambda pair: pair[1], reverse=True)

        # Copy qrels into plain nested dicts: {topic_id: {doc_id: rel}}
        qrels = {
            topic_id: dict(docs.items())
            for topic_id, docs in self.dataset.qrels.items()
        }

        # Evaluate. Failures are reported and turned into an empty result so
        # that one failing run does not abort the remaining runs.
        try:
            per_query_results = self.metrics.evaluate_run(dict(run), qrels, self.METRICS)
            # Aggregate results across queries
            return self.metrics.compute_aggregate(per_query_results)
        except Exception as e:
            print(f" [Evaluation error: {e}]")
            return {}

    def run_full_benchmark(self, query_types: Optional[List[str]] = None, k: int = 100):
        """
        Run the complete benchmark suite.

        Loads the data, runs every configuration for every query type,
        evaluates each non-empty run and generates the report.

        Args:
            query_types: List of query types to test ("short", "long");
                both are used when None
            k: Number of results per query

        Returns:
            ``self.results``: run_tag -> run data including "metrics".
        """
        if query_types is None:
            query_types = ["short", "long"]

        print("\n" + "=" * 60)
        print("TREC Benchmark - SysCRED")
        print("=" * 60)

        # Load data
        self.load_data()

        # Run all configurations
        print("\n" + "=" * 60)
        print("Running Retrieval Experiments")
        print("=" * 60)
        for query_type in query_types:
            for config in self.CONFIGURATIONS:
                run_tag, results = self.run_configuration(config, query_type, k)
                if results:
                    self.results[run_tag] = results
                    # Evaluate
                    metrics = self.evaluate_run(run_tag, results)
                    self.results[run_tag]["metrics"] = metrics

        # Generate report
        self.generate_report()
        return self.results

    def generate_report(self):
        """Print the summary table and write the JSON and LaTeX reports.

        Writes ``benchmark_results.json`` (per-run config, query type,
        metrics, elapsed time and result count) into ``output_dir`` and then
        delegates to ``_generate_latex_table``. Missing metrics are shown
        as 0.
        """
        print("\n" + "=" * 60)
        print("Benchmark Results Summary")
        print("=" * 60)

        # Table header; the rule below is sized from it so both stay aligned
        header = ["Configuration", "Query", "MAP", "NDCG", "P@10", "MRR", "Time(s)"]
        header_line = " | ".join(f"{h:^12}" for h in header)
        print("\n" + header_line)
        print("-" * len(header_line))

        # Table rows
        for run_tag, data in self.results.items():
            metrics = data.get("metrics", {})
            row = [
                data["config"]["name"][:12],
                data["query_type"][:5],
                *(f"{metrics.get(name, 0):.4f}" for name in self._REPORT_METRICS),
                f"{data.get('elapsed_time', 0):.2f}",
            ]
            print(" | ".join(f"{v:^12}" for v in row))

        # Save detailed results
        results_file = self.output_dir / "benchmark_results.json"

        # Keep only JSON-serializable fields; the raw result list is
        # summarised by its length.
        serializable_results = {
            run_tag: {
                "config": data["config"],
                "query_type": data["query_type"],
                "metrics": data.get("metrics", {}),
                "elapsed_time": data.get("elapsed_time", 0),
                "num_results": len(data.get("results", [])),
            }
            for run_tag, data in self.results.items()
        }
        with open(results_file, "w", encoding="utf-8") as f:
            json.dump(serializable_results, f, indent=2)
        print(f"\nDetailed results saved to: {results_file}")

        # Generate LaTeX table
        self._generate_latex_table()

    def _generate_latex_table(self):
        """Write ``results_table.tex`` with one row per stored run.

        The table uses ``\\toprule``/``\\midrule``/``\\bottomrule``, which are
        provided by the LaTeX ``booktabs`` package.
        """
        latex_file = self.output_dir / "results_table.tex"
        lines = [
            r"\begin{table}[ht]",
            r"\centering",
            r"\caption{TREC AP88-90 Retrieval Results}",
            r"\label{tab:trec-results}",
            r"\begin{tabular}{l|l|cccc}",
            r"\toprule",
            r"Model & Query & MAP & NDCG & P@10 & MRR \\",
            r"\midrule",
        ]
        for run_tag, data in self.results.items():
            metrics = data.get("metrics", {})
            cells = [
                f"{data['config']['name']}",
                f"{data['query_type']}",
                *(f"{metrics.get(name, 0):.4f}" for name in self._REPORT_METRICS),
            ]
            # Each row ends with the LaTeX row terminator "\\"
            lines.append(" & ".join(cells) + " \\\\")
        lines.extend([
            r"\bottomrule",
            r"\end{tabular}",
            r"\end{table}",
        ])
        with open(latex_file, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        print(f"LaTeX table saved to: {latex_file}")
def main():
    """Command-line entry point.

    Parses --index, --corpus, --topics, --qrels, --output and --k, builds a
    TRECBenchmark from them, runs the full benchmark with the requested
    number of results per query and prints a completion banner.
    """
    cli = argparse.ArgumentParser(description="Run TREC benchmark for SysCRED")

    # Path-like options that share the same shape: (long flag, short flag, help)
    path_options = (
        ("--index", "-i", "Path to Lucene index"),
        ("--corpus", "-c", "Path to JSONL corpus"),
        ("--topics", "-t", "Path to TREC topics file/directory"),
        ("--qrels", "-q", "Path to TREC qrels file/directory"),
    )
    for long_flag, short_flag, help_text in path_options:
        cli.add_argument(long_flag, short_flag, help=help_text)

    cli.add_argument(
        "--output", "-o",
        default="benchmark_results",
        help="Output directory for results",
    )
    cli.add_argument(
        "--k",
        type=int,
        default=100,
        help="Number of results per query",
    )
    options = cli.parse_args()

    # Run benchmark
    runner = TRECBenchmark(
        index_path=options.index,
        corpus_path=options.corpus,
        topics_path=options.topics,
        qrels_path=options.qrels,
        output_dir=options.output,
    )
    runner.run_full_benchmark(k=options.k)

    banner = "=" * 60
    print("\n" + banner)
    print("Benchmark Complete!")
    print(banner)
# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()