syscred_duplicate

Sleeping

syscred_duplicate / syscred /run_trec_benchmark.py

D Ф m i И i q ц e L Ф y e r

Deploy SysCRED with PyTorch

e70050b about 1 month ago

12.8 kB

	# -*- coding: utf-8 -*-
	"""
	TREC Benchmark Script - SysCRED
	================================
	Run TREC-style evaluation on the fact-checking system.

	This script:
	1. Loads TREC AP88-90 topics and qrels
	2. Runs retrieval with multiple configurations (BM25 and QLD, each with and without PRF)
	3. Evaluates using pytrec_eval metrics
	4. Prints a comparison table and writes JSON and LaTeX result summaries

	Usage:
	python run_trec_benchmark.py --index /path/to/index --qrels /path/to/qrels

	(c) Dominique S. Loyer - PhD Thesis Prototype
	Citation Key: loyerEvaluationModelesRecherche2025
	"""

	import argparse
	import json
	import os
	import sys
	import time
	from collections import defaultdict
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple

	# Make the ``syscred`` package importable when this file is run directly.
	# The imports below use the ``syscred.`` prefix, so the directory that has to
	# be on sys.path is the one *containing* the package, not the script's own
	# directory (which is all the previous code added).
	# NOTE(review): assumes this script sits directly inside the ``syscred``
	# package directory - confirm against the repository layout.
	_SCRIPT_DIR = Path(__file__).parent
	sys.path.insert(0, str(_SCRIPT_DIR))  # original entry, kept for compatibility
	sys.path.insert(1, str(_SCRIPT_DIR.resolve().parent))  # directory containing the package

	from syscred.trec_retriever import TRECRetriever, RetrievalResult
	from syscred.trec_dataset import TRECDataset, SAMPLE_TOPICS
	from syscred.eval_metrics import EvaluationMetrics


	class TRECBenchmark:
	    """
	    TREC-style benchmark runner for SysCRED.

	    Runs every entry of ``CONFIGURATIONS`` for each requested query type,
	    saves one TREC-format run file per run, evaluates the runs against the
	    loaded qrels (when there are any) and writes console, JSON and LaTeX
	    summaries to the output directory.
	    """

	    # Configurations to test. "model" and "prf" are forwarded to the
	    # retriever as ``model`` and ``use_prf``; "name" is used in run tags
	    # and in the report tables.
	    CONFIGURATIONS = [
	        {"name": "BM25", "model": "bm25", "prf": False},
	        {"name": "BM25+PRF", "model": "bm25", "prf": True},
	        {"name": "QLD", "model": "qld", "prf": False},
	        {"name": "QLD+PRF", "model": "qld", "prf": True},
	    ]

	    # Metrics requested from the evaluator
	    METRICS = ["map", "ndcg", "P_10", "P_20", "recall_100", "recip_rank"]

	    # Subset of METRICS shown in the summary tables, in column order
	    _REPORTED_METRICS = ("map", "ndcg", "P_10", "recip_rank")

	    def __init__(
	        self,
	        index_path: Optional[str] = None,
	        corpus_path: Optional[str] = None,
	        topics_path: Optional[str] = None,
	        qrels_path: Optional[str] = None,
	        output_dir: Optional[str] = None,
	    ):
	        """
	        Initialize the benchmark runner.

	        Creates the output directory (including parents) and constructs
	        the dataset, retriever and metrics helpers.

	        Args:
	            index_path: Path to Lucene index
	            corpus_path: Path to JSONL corpus
	            topics_path: Path to TREC topics
	            qrels_path: Path to TREC qrels
	            output_dir: Directory for output files; defaults to
	                ``benchmark_results`` when omitted or empty
	        """
	        self.index_path = index_path
	        self.corpus_path = corpus_path
	        self.topics_path = topics_path
	        self.qrels_path = qrels_path
	        self.output_dir = Path(output_dir) if output_dir else Path("benchmark_results")

	        # Create output directory
	        self.output_dir.mkdir(parents=True, exist_ok=True)

	        # Initialize components
	        self.dataset = TRECDataset(
	            topics_dir=topics_path,
	            qrels_dir=qrels_path,
	            corpus_path=corpus_path,
	        )

	        self.retriever = TRECRetriever(
	            index_path=index_path,
	            corpus_path=corpus_path,
	            use_stemming=True,
	        )

	        self.metrics = EvaluationMetrics()

	        # Results storage: run_tag -> results dictionary of that run
	        self.results: Dict[str, Dict[str, Any]] = {}

	    def load_data(self):
	        """Load topics, qrels and (when configured) the corpus, then print dataset statistics."""
	        print("\n" + "=" * 60)
	        print("Loading TREC Data")
	        print("=" * 60)

	        # Load topics
	        if self.topics_path:
	            self.dataset.load_topics(self.topics_path)
	        else:
	            # No topics file: fall back to a copy of the sample topics
	            print("[Benchmark] Using sample topics (no topics file provided)")
	            self.dataset.topics = SAMPLE_TOPICS.copy()

	        # Load qrels
	        if self.qrels_path:
	            self.dataset.load_qrels(self.qrels_path)
	        else:
	            print("[Benchmark] No qrels provided - evaluation will be limited")

	        # Load corpus if available
	        if self.corpus_path:
	            self.dataset.load_corpus_jsonl(self.corpus_path)

	        stats = self.dataset.get_statistics()
	        print("\nDataset Statistics:")
	        for key, value in stats.items():
	            print(f" {key}: {value}")

	    def run_configuration(
	        self,
	        config: Dict[str, Any],
	        query_type: str = "short",
	        k: int = 100,
	    ) -> Tuple[str, Dict[str, Any]]:
	        """
	        Run a single retrieval configuration and save its run file.

	        Args:
	            config: Configuration dictionary with "name", "model" and
	                "prf" keys (see ``CONFIGURATIONS``)
	            query_type: Query variant requested from the dataset
	            k: Number of results requested per query

	        Returns:
	            (run_tag, results_dict). ``results_dict`` is empty when the
	            dataset yields no queries; otherwise it holds the config, the
	            query type, the flat result list, the run file path and the
	            elapsed retrieval time in seconds.
	        """
	        config_name = config["name"]
	        model = config["model"]
	        use_prf = config["prf"]

	        run_tag = f"syscred_{config_name}_{query_type}"

	        print(f"\n--- Running: {run_tag} ---")

	        queries = self.dataset.get_topic_queries(query_type)

	        if not queries:
	            print(" No queries available!")
	            return run_tag, {}

	        # Run retrieval
	        start_time = time.time()

	        all_results = []
	        run_lines = []

	        for topic_id, query_text in queries.items():
	            result = self.retriever.retrieve_evidence(
	                claim=query_text,
	                k=k,
	                model=model,
	                use_prf=use_prf,
	            )

	            for evidence in result.evidences:
	                all_results.append({
	                    "topic_id": topic_id,
	                    "doc_id": evidence.doc_id,
	                    "score": evidence.score,
	                    "rank": evidence.rank,
	                })
	                # One line per retrieved document:
	                # topic Q0 doc rank score run_tag
	                run_lines.append(
	                    f"{topic_id} Q0 {evidence.doc_id} {evidence.rank} {evidence.score:.6f} {run_tag}"
	                )

	        elapsed = time.time() - start_time

	        # Save run file (explicit encoding so output does not depend on the locale)
	        run_file = self.output_dir / f"{run_tag}.run"
	        with open(run_file, "w", encoding="utf-8") as f:
	            f.write("\n".join(run_lines))

	        print(f" Queries: {len(queries)}")
	        print(f" Total results: {len(all_results)}")
	        print(f" Time: {elapsed:.2f}s")
	        print(f" Saved: {run_file}")

	        return run_tag, {
	            "config": config,
	            "query_type": query_type,
	            "results": all_results,
	            "run_file": str(run_file),
	            "elapsed_time": elapsed,
	        }

	    def evaluate_run(self, run_tag: str, results: Dict[str, Any]) -> Dict[str, float]:
	        """
	        Evaluate a run against the loaded qrels.

	        Args:
	            run_tag: Identifier of the run (not used by the evaluation
	                itself; kept so existing callers keep working)
	            results: Results dictionary produced by ``run_configuration``

	        Returns:
	            Dictionary of metric -> value aggregated across queries, or an
	            empty dictionary when no qrels are loaded or evaluation fails.
	        """
	        if not self.dataset.qrels:
	            print(" [Skip evaluation - no qrels]")
	            return {}

	        # Group results per query: {query_id: [(doc_id, score), ...]}
	        run = defaultdict(list)
	        for r in results["results"]:
	            run[r["topic_id"]].append((r["doc_id"], r["score"]))

	        # Sort each query's results by score descending
	        for ranking in run.values():
	            ranking.sort(key=lambda pair: pair[1], reverse=True)

	        # Copy qrels into plain {topic_id: {doc_id: relevance}} dictionaries
	        qrels = {
	            topic_id: dict(docs.items())
	            for topic_id, docs in self.dataset.qrels.items()
	        }

	        # Evaluate. Best effort on purpose: a failing evaluation is reported
	        # and must not abort the remaining benchmark runs.
	        try:
	            per_query_results = self.metrics.evaluate_run(dict(run), qrels, self.METRICS)
	            # Aggregate results across queries
	            return self.metrics.compute_aggregate(per_query_results)
	        except Exception as e:
	            print(f" [Evaluation error: {e}]")
	            return {}

	    def run_full_benchmark(self, query_types: Optional[List[str]] = None, k: int = 100):
	        """
	        Run the complete benchmark suite.

	        Loads the data, runs and evaluates every configuration for every
	        query type, then generates the reports.

	        Args:
	            query_types: List of query types to test ("short", "long");
	                both are used when omitted
	            k: Number of results per query

	        Returns:
	            The ``self.results`` dictionary (run_tag -> results, each with
	            an added "metrics" entry).
	        """
	        if query_types is None:
	            query_types = ["short", "long"]

	        print("\n" + "=" * 60)
	        print("TREC Benchmark - SysCRED")
	        print("=" * 60)

	        # Load data
	        self.load_data()

	        # Run all configurations
	        print("\n" + "=" * 60)
	        print("Running Retrieval Experiments")
	        print("=" * 60)

	        for query_type in query_types:
	            for config in self.CONFIGURATIONS:
	                run_tag, results = self.run_configuration(config, query_type, k)

	                # Runs without queries return an empty dict and are skipped
	                if results:
	                    self.results[run_tag] = results

	                    # Evaluate
	                    metrics = self.evaluate_run(run_tag, results)
	                    self.results[run_tag]["metrics"] = metrics

	        # Generate report
	        self.generate_report()

	        return self.results

	    @classmethod
	    def _format_metrics(cls, metrics: Dict[str, float]) -> List[str]:
	        """Format the reported metrics with four decimals; missing metrics are shown as 0."""
	        return [f"{metrics.get(name, 0):.4f}" for name in cls._REPORTED_METRICS]

	    def generate_report(self):
	        """Print the summary table and write the JSON and LaTeX result files."""
	        print("\n" + "=" * 60)
	        print("Benchmark Results Summary")
	        print("=" * 60)

	        # Table header
	        header = ["Configuration", "Query", "MAP", "NDCG", "P@10", "MRR", "Time(s)"]
	        # Plain " | " separator: the former "\|" was an invalid escape
	        # sequence and printed a stray backslash in every row.
	        print("\n" + " | ".join(f"{h:^12}" for h in header))
	        print("-" * 100)

	        # Table rows
	        for data in self.results.values():
	            metrics = data.get("metrics", {})

	            row = [
	                data["config"]["name"][:12],
	                data["query_type"][:5],
	                *self._format_metrics(metrics),
	                f"{data.get('elapsed_time', 0):.2f}",
	            ]
	            print(" | ".join(f"{v:^12}" for v in row))

	        # Save detailed results
	        results_file = self.output_dir / "benchmark_results.json"

	        # Keep only JSON-serializable summary fields; the per-document
	        # result list is reduced to its length.
	        serializable_results = {
	            run_tag: {
	                "config": data["config"],
	                "query_type": data["query_type"],
	                "metrics": data.get("metrics", {}),
	                "elapsed_time": data.get("elapsed_time", 0),
	                "num_results": len(data.get("results", [])),
	            }
	            for run_tag, data in self.results.items()
	        }

	        with open(results_file, "w", encoding="utf-8") as f:
	            json.dump(serializable_results, f, indent=2)

	        print(f"\nDetailed results saved to: {results_file}")

	        # Generate LaTeX table
	        self._generate_latex_table()

	    def _generate_latex_table(self):
	        """Write ``results_table.tex`` with one row per stored run."""
	        latex_file = self.output_dir / "results_table.tex"

	        lines = [
	            r"\begin{table}[ht]",
	            r"\centering",
	            r"\caption{TREC AP88-90 Retrieval Results}",
	            r"\label{tab:trec-results}",
	            # Column separators are plain "|"; "\|" is not a valid
	            # column specifier in a tabular preamble.
	            r"\begin{tabular}{l|l|cccc}",
	            r"\toprule",
	            r"Model & Query & MAP & NDCG & P@10 & MRR \\",
	            r"\midrule",
	        ]

	        for data in self.results.values():
	            metrics = data.get("metrics", {})
	            cells = [
	                str(data["config"]["name"]),
	                str(data["query_type"]),
	                *self._format_metrics(metrics),
	            ]
	            # Each row ends with the LaTeX line break "\\"
	            lines.append(" & ".join(cells) + r" \\")

	        lines.extend([
	            r"\bottomrule",
	            r"\end{tabular}",
	            r"\end{table}",
	        ])

	        with open(latex_file, "w", encoding="utf-8") as f:
	            f.write("\n".join(lines))

	        print(f"LaTeX table saved to: {latex_file}")


	def main(argv: Optional[List[str]] = None):
	    """
	    Command-line entry point: parse options and run the full benchmark.

	    Args:
	        argv: Argument list to parse instead of the process command line.
	            ``None`` (the default) keeps the previous behaviour of reading
	            ``sys.argv[1:]``.
	    """
	    parser = argparse.ArgumentParser(description="Run TREC benchmark for SysCRED")
	    parser.add_argument("--index", "-i", help="Path to Lucene index")
	    parser.add_argument("--corpus", "-c", help="Path to JSONL corpus")
	    parser.add_argument("--topics", "-t", help="Path to TREC topics file/directory")
	    parser.add_argument("--qrels", "-q", help="Path to TREC qrels file/directory")
	    parser.add_argument(
	        "--output", "-o",
	        default="benchmark_results",
	        help="Output directory for results",
	    )
	    parser.add_argument(
	        "--k",
	        type=int,
	        default=100,
	        help="Number of results per query",
	    )

	    args = parser.parse_args(argv)

	    # Run benchmark
	    benchmark = TRECBenchmark(
	        index_path=args.index,
	        corpus_path=args.corpus,
	        topics_path=args.topics,
	        qrels_path=args.qrels,
	        output_dir=args.output,
	    )

	    # The returned results are not needed here: the benchmark prints its
	    # own summary and writes the result files.
	    benchmark.run_full_benchmark(k=args.k)

	    print("\n" + "=" * 60)
	    print("Benchmark Complete!")
	    print("=" * 60)

	# Run the benchmark only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()