syscred_duplicate / syscred /trec_dataset.py
D Ф m i И i q ц e L Ф y e r
Deploy SysCRED with PyTorch
e70050b
# -*- coding: utf-8 -*-
"""
TREC Dataset Module - SysCRED
==============================
Loader and utilities for TREC AP88-90 dataset.
Handles:
- Topic/Query parsing
- Qrels (relevance judgments) loading
- Document corpus loading
- TREC run file generation
Based on: TREC_AP88-90_5juin2025.py
(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerEvaluationModelesRecherche2025
"""
import os
import re
import json
import tarfile
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass, field
from pathlib import Path
@dataclass
class TRECTopic:
"""A TREC topic (query)."""
topic_id: str
title: str # Short query
description: str # Long description
narrative: str = "" # Full narrative (optional)
@property
def short_query(self) -> str:
return self.title
@property
def long_query(self) -> str:
return f"{self.title} {self.description}".strip()
@dataclass
class TRECQrel:
"""A relevance judgment."""
topic_id: str
doc_id: str
relevance: int # 0=not relevant, 1=relevant, 2+=highly relevant
@dataclass
class TRECDocument:
"""A document from the corpus."""
doc_id: str
text: str
title: str = ""
date: str = ""
source: str = ""
class TRECDataset:
"""
TREC AP88-90 Dataset loader and manager.
Provides utilities for:
- Loading topics (queries)
- Loading qrels (relevance judgments)
- Loading document corpus
- Creating TREC-format run files
Usage:
dataset = TRECDataset(base_path="/path/to/trec")
topics = dataset.load_topics()
qrels = dataset.load_qrels()
"""
# Standard TREC file patterns
TOPIC_PATTERN = r"topics\.\d+\.txt"
QREL_PATTERN = r"qrels\.\d+\.txt"
def __init__(
self,
base_path: Optional[str] = None,
topics_dir: Optional[str] = None,
qrels_dir: Optional[str] = None,
corpus_path: Optional[str] = None
):
"""
Initialize the dataset loader.
Args:
base_path: Base path containing TREC data
topics_dir: Path to topics directory (overrides base_path)
qrels_dir: Path to qrels directory (overrides base_path)
corpus_path: Path to corpus file (AP.tar or JSONL)
"""
self.base_path = Path(base_path) if base_path else None
self.topics_dir = Path(topics_dir) if topics_dir else None
self.qrels_dir = Path(qrels_dir) if qrels_dir else None
self.corpus_path = Path(corpus_path) if corpus_path else None
# Loaded data
self.topics: Dict[str, TRECTopic] = {}
self.qrels: Dict[str, Dict[str, int]] = {} # topic_id -> {doc_id: relevance}
self.documents: Dict[str, TRECDocument] = {}
# Statistics
self.stats = {
"topics_loaded": 0,
"qrels_loaded": 0,
"docs_loaded": 0
}
def load_topics(self, topics_path: Optional[str] = None) -> Dict[str, TRECTopic]:
"""
Load TREC topics from file(s).
Supports standard TREC topic format with <top>, <num>, <title>, <desc>, <narr> tags.
"""
search_path = Path(topics_path) if topics_path else self.topics_dir or self.base_path
if not search_path or not search_path.exists():
print(f"[TRECDataset] Topics path not found: {search_path}")
return {}
topic_files = []
if search_path.is_file():
topic_files = [search_path]
else:
topic_files = list(search_path.glob("topics*.txt"))
for topic_file in topic_files:
self._parse_topic_file(topic_file)
self.stats["topics_loaded"] = len(self.topics)
print(f"[TRECDataset] Loaded {len(self.topics)} topics from {len(topic_files)} files")
return self.topics
def _parse_topic_file(self, file_path: Path):
"""Parse a single TREC topic file."""
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
# Find all <top>...</top> blocks
for top_match in re.finditer(r"<top>(.*?)</top>", content, re.DOTALL):
topic_content = top_match.group(1)
# Extract fields
num_match = re.search(r"<num>\s*(?:Number:)?\s*(\d+)", topic_content, re.IGNORECASE)
if not num_match:
continue
topic_id = num_match.group(1).strip()
title_match = re.search(r"<title>\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
title = title_match.group(1).strip() if title_match else ""
desc_match = re.search(r"<desc>\s*(?:Description:)?\s*(.*?)\s*(?=<narr>|<|$)", topic_content, re.IGNORECASE | re.DOTALL)
desc = desc_match.group(1).strip() if desc_match else ""
narr_match = re.search(r"<narr>\s*(?:Narrative:)?\s*(.*?)\s*(?=<|$)", topic_content, re.IGNORECASE | re.DOTALL)
narr = narr_match.group(1).strip() if narr_match else ""
if topic_id and title:
self.topics[topic_id] = TRECTopic(
topic_id=topic_id,
title=title,
description=desc,
narrative=narr
)
except Exception as e:
print(f"[TRECDataset] Error parsing {file_path}: {e}")
def load_qrels(self, qrels_path: Optional[str] = None) -> Dict[str, Dict[str, int]]:
"""
Load TREC qrels (relevance judgments).
Format: topic_id 0 doc_id relevance
"""
search_path = Path(qrels_path) if qrels_path else self.qrels_dir or self.base_path
if not search_path or not search_path.exists():
print(f"[TRECDataset] Qrels path not found: {search_path}")
return {}
qrel_files = []
if search_path.is_file():
qrel_files = [search_path]
else:
qrel_files = list(search_path.glob("qrels*.txt")) + list(search_path.glob("*.qrels"))
total_qrels = 0
for qrel_file in qrel_files:
count = self._parse_qrel_file(qrel_file)
total_qrels += count
self.stats["qrels_loaded"] = total_qrels
print(f"[TRECDataset] Loaded {total_qrels} qrels from {len(qrel_files)} files")
return self.qrels
def _parse_qrel_file(self, file_path: Path) -> int:
"""Parse a single qrel file. Returns count of qrels loaded."""
count = 0
try:
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
parts = line.strip().split()
if len(parts) >= 4:
topic_id = parts[0]
doc_id = parts[2]
relevance = int(parts[3])
if topic_id not in self.qrels:
self.qrels[topic_id] = {}
self.qrels[topic_id][doc_id] = relevance
count += 1
except Exception as e:
print(f"[TRECDataset] Error parsing {file_path}: {e}")
return count
def load_corpus_jsonl(self, jsonl_path: Optional[str] = None) -> Dict[str, TRECDocument]:
"""
Load corpus from JSONL format.
Expected format: {"id": "...", "contents": "...", "title": "..."}
"""
path = Path(jsonl_path) if jsonl_path else self.corpus_path
if not path or not path.exists():
print(f"[TRECDataset] Corpus path not found: {path}")
return {}
try:
with open(path, 'r', encoding='utf-8') as f:
for line in f:
doc = json.loads(line.strip())
doc_id = doc.get('id', doc.get('docid', ''))
text = doc.get('contents', doc.get('text', ''))
title = doc.get('title', '')
if doc_id:
self.documents[doc_id] = TRECDocument(
doc_id=doc_id,
text=text,
title=title
)
self.stats["docs_loaded"] = len(self.documents)
print(f"[TRECDataset] Loaded {len(self.documents)} documents")
except Exception as e:
print(f"[TRECDataset] Error loading corpus: {e}")
return self.documents
def get_relevant_docs(self, topic_id: str) -> Set[str]:
"""Get set of relevant document IDs for a topic."""
if topic_id not in self.qrels:
return set()
return {
doc_id for doc_id, rel in self.qrels[topic_id].items()
if rel > 0
}
def get_topic_queries(self, query_type: str = "short") -> Dict[str, str]:
"""
Get dictionary of topic_id -> query text.
Args:
query_type: "short" (title only) or "long" (title + description)
"""
if query_type == "short":
return {tid: t.short_query for tid, t in self.topics.items()}
else:
return {tid: t.long_query for tid, t in self.topics.items()}
@staticmethod
def format_trec_run(
results: List[Tuple[str, str, float, int]], # (topic_id, doc_id, score, rank)
run_tag: str
) -> str:
"""
Format results as TREC run file.
Output format: topic_id Q0 doc_id rank score run_tag
"""
lines = []
for topic_id, doc_id, score, rank in results:
lines.append(f"{topic_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}")
return "\n".join(lines)
@staticmethod
def save_trec_run(
results: List[Tuple[str, str, float, int]],
run_tag: str,
output_path: str
):
"""Save results to TREC run file."""
run_content = TRECDataset.format_trec_run(results, run_tag)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(run_content)
print(f"[TRECDataset] Saved run file: {output_path}")
def get_statistics(self) -> Dict[str, int]:
"""Get dataset statistics."""
return {
"topics": len(self.topics),
"qrels_topics": len(self.qrels),
"total_qrels": sum(len(q) for q in self.qrels.values()),
"documents": len(self.documents)
}
# --- Sample Topics for Testing (AP88-90 subset) ---
SAMPLE_TOPICS = {
"51": TRECTopic(
topic_id="51",
title="Airbus Subsidies",
description="How much government money has been used to support Airbus aircraft manufacturing?",
narrative="A relevant document will contain information on subsidies or other financial support from government sources to Airbus."
),
"52": TRECTopic(
topic_id="52",
title="Japanese Auto Sales",
description="How have Japanese automobile sales fared in the U.S.?",
narrative="A relevant document will report on sales figures, trends, or market share of Japanese automobile manufacturers in the United States."
),
"53": TRECTopic(
topic_id="53",
title="Leveraged Buyouts",
description="What are the effects of leveraged buyouts on companies and industries?",
narrative="Relevant documents discuss the impact of LBOs on corporate structure, employment, or industry dynamics."
),
"54": TRECTopic(
topic_id="54",
title="Satellite Launches",
description="What are the commercial applications of satellite launches?",
narrative="A relevant document will discuss commercial satellite launches and their business applications."
),
"55": TRECTopic(
topic_id="55",
title="Insider Trading",
description="What individuals or companies have been accused or convicted of insider trading?",
narrative="A relevant document will identify specific cases of insider trading allegations or convictions."
),
}
def create_sample_dataset() -> TRECDataset:
"""Create a sample dataset for testing."""
dataset = TRECDataset()
dataset.topics = SAMPLE_TOPICS.copy()
# Add sample qrels
dataset.qrels = {
"51": {"AP880212-0001": 1, "AP880215-0003": 1, "AP880301-0010": 0},
"52": {"AP890102-0020": 1, "AP890115-0045": 1},
"53": {"AP880325-0100": 1},
}
return dataset
# --- Testing ---
if __name__ == "__main__":
print("=" * 60)
print("SysCRED TREC Dataset - Test Suite")
print("=" * 60)
# Create sample dataset
dataset = create_sample_dataset()
print(f"\n1. Sample Topics: {len(dataset.topics)}")
for tid, topic in list(dataset.topics.items())[:3]:
print(f" {tid}: {topic.title}")
print(f" Short: {topic.short_query}")
print(f" Long: {topic.long_query[:80]}...")
print(f"\n2. Sample Qrels:")
for tid, docs in dataset.qrels.items():
print(f" Topic {tid}: {len(docs)} judgments")
print(f"\n3. Query dictionaries:")
short_queries = dataset.get_topic_queries("short")
long_queries = dataset.get_topic_queries("long")
print(f" Short queries: {len(short_queries)}")
print(f" Long queries: {len(long_queries)}")
print(f"\n4. Relevant docs for topic 51:")
relevant = dataset.get_relevant_docs("51")
print(f" {relevant}")
print(f"\n5. Statistics:")
stats = dataset.get_statistics()
for key, value in stats.items():
print(f" {key}: {value}")
print("\n" + "=" * 60)
print("Tests complete!")
print("=" * 60)