# Listing metadata: file size 14,212 bytes; e70050b (presumably the revision hash). Line-number gutter from the web listing removed.
# -*- coding: utf-8 -*-
"""
TREC Dataset Module - SysCRED
==============================
Loader and utilities for TREC AP88-90 dataset.

Handles:
- Topic/Query parsing
- Qrels (relevance judgments) loading
- Document corpus loading
- TREC run file generation

Based on: TREC_AP88-90_5juin2025.py
(c) Dominique S. Loyer - PhD Thesis Prototype
Citation Key: loyerEvaluationModelesRecherche2025
"""

import os
import re
import json
import tarfile
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass, field
from pathlib import Path


@dataclass
class TRECTopic:
    """One TREC topic: a single information need that can be issued as a query.

    The ``short_query`` and ``long_query`` properties are derived from the
    stored fields each time they are read; nothing is cached.
    """
    topic_id: str  # Topic identifier, stored as a string
    title: str  # Short, keyword-style form of the query
    description: str  # Longer statement of the information need
    narrative: str = ""  # Full narrative; optional, empty when not provided

    @property
    def short_query(self) -> str:
        """Return the title-only form of the query."""
        short_form = self.title
        return short_form

    @property
    def long_query(self) -> str:
        """Return title and description separated by one space, stripped."""
        combined = "{} {}".format(self.title, self.description)
        return combined.strip()


@dataclass
class TRECQrel:
    """A single relevance judgment linking one topic to one document."""
    topic_id: str  # Topic the judgment belongs to
    doc_id: str  # Document that was judged
    relevance: int  # Grade: 0 = not relevant, 1 = relevant, 2 or more = highly relevant


@dataclass
class TRECDocument:
    """A single corpus document; only the id and body text are required."""
    doc_id: str  # Document identifier
    text: str  # Body text of the document
    title: str = ""  # Optional; empty string when not provided
    date: str = ""  # Optional; empty string when not provided
    source: str = ""  # Optional; empty string when not provided


class TRECDataset:
    """
    TREC AP88-90 Dataset loader and manager.

    Provides utilities for:
    - Loading topics (queries)
    - Loading qrels (relevance judgments)
    - Loading a document corpus from JSONL
    - Creating TREC-format run files

    Loaded data accumulates on the instance (``topics``, ``qrels``,
    ``documents``). Calling a loader again merges into what is already
    there; an entry with an already-seen key overwrites the earlier one.

    Usage:
        dataset = TRECDataset(base_path="/path/to/trec")
        topics = dataset.load_topics()
        qrels = dataset.load_qrels()
    """

    # Standard TREC file patterns (kept as public constants; the loaders
    # below discover files with simpler glob wildcards).
    TOPIC_PATTERN = r"topics\.\d+\.txt"
    QREL_PATTERN = r"qrels\.\d+\.txt"

    # Topic-file patterns, compiled once instead of once per topic.
    # Each field runs up to the next tag or the end of the <top> block, and
    # the optional "Number:" / "Topic:" / "Description:" / "Narrative:"
    # labels are dropped from the captured text.
    _TOP_RE = re.compile(r"<top>(.*?)</top>", re.DOTALL)
    _NUM_RE = re.compile(r"<num>\s*(?:Number:)?\s*(\d+)", re.IGNORECASE)
    _TITLE_RE = re.compile(
        r"<title>\s*(?:Topic:)?\s*(.*?)\s*(?=<|$)", re.IGNORECASE | re.DOTALL
    )
    _DESC_RE = re.compile(
        r"<desc>\s*(?:Description:)?\s*(.*?)\s*(?=<|$)", re.IGNORECASE | re.DOTALL
    )
    _NARR_RE = re.compile(
        r"<narr>\s*(?:Narrative:)?\s*(.*?)\s*(?=<|$)", re.IGNORECASE | re.DOTALL
    )

    def __init__(
        self,
        base_path: Optional[str] = None,
        topics_dir: Optional[str] = None,
        qrels_dir: Optional[str] = None,
        corpus_path: Optional[str] = None
    ):
        """
        Initialize the dataset loader. No files are read here.

        Args:
            base_path: Base path containing TREC data
            topics_dir: Path to topics directory (overrides base_path)
            qrels_dir: Path to qrels directory (overrides base_path)
            corpus_path: Path to the corpus file; the only corpus loader in
                this class reads JSONL
        """
        self.base_path = Path(base_path) if base_path else None
        self.topics_dir = Path(topics_dir) if topics_dir else None
        self.qrels_dir = Path(qrels_dir) if qrels_dir else None
        self.corpus_path = Path(corpus_path) if corpus_path else None

        # Loaded data
        self.topics: Dict[str, TRECTopic] = {}
        self.qrels: Dict[str, Dict[str, int]] = {}  # topic_id -> {doc_id: relevance}
        self.documents: Dict[str, TRECDocument] = {}

        # Statistics
        self.stats = {
            "topics_loaded": 0,
            "qrels_loaded": 0,
            "docs_loaded": 0
        }

    def load_topics(self, topics_path: Optional[str] = None) -> Dict[str, TRECTopic]:
        """
        Load TREC topics from file(s).

        Supports standard TREC topic format with <top>, <num>, <title>, <desc>, <narr> tags.

        Args:
            topics_path: File or directory to read. Falls back to
                ``topics_dir``, then ``base_path``. In a directory, every
                ``topics*.txt`` file is parsed.

        Returns:
            The instance's topic dictionary (topic_id -> TRECTopic), or a new
            empty dict when the path does not exist.
        """
        search_path = Path(topics_path) if topics_path else self.topics_dir or self.base_path

        if not search_path or not search_path.exists():
            print(f"[TRECDataset] Topics path not found: {search_path}")
            return {}

        if search_path.is_file():
            topic_files = [search_path]
        else:
            # Sorted so that, when two files define the same topic, the file
            # that wins does not depend on directory listing order.
            topic_files = sorted(search_path.glob("topics*.txt"))

        for topic_file in topic_files:
            self._parse_topic_file(topic_file)

        self.stats["topics_loaded"] = len(self.topics)
        print(f"[TRECDataset] Loaded {len(self.topics)} topics from {len(topic_files)} files")

        return self.topics

    @staticmethod
    def _extract_field(pattern, text: str) -> str:
        """Return the stripped first capture group of *pattern* in *text*, or ""."""
        match = pattern.search(text)
        return match.group(1).strip() if match else ""

    def _parse_topic_file(self, file_path: Path):
        """
        Parse a single TREC topic file into ``self.topics``.

        Blocks without a topic number or without a title are skipped. A file
        that cannot be read is reported and skipped.
        """
        try:
            # errors='ignore': undecodable bytes are dropped rather than
            # failing the whole file.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as e:
            print(f"[TRECDataset] Error parsing {file_path}: {e}")
            return

        # Find all <top>...</top> blocks
        for top_match in self._TOP_RE.finditer(content):
            topic_content = top_match.group(1)

            num_match = self._NUM_RE.search(topic_content)
            if not num_match:
                continue

            # Topic files may zero-pad the number ("051") while qrels keys
            # (and the sample data in this module) use "51"; normalize so the
            # two dictionaries share keys.
            topic_id = str(int(num_match.group(1)))

            title = self._extract_field(self._TITLE_RE, topic_content)
            if not title:
                continue

            self.topics[topic_id] = TRECTopic(
                topic_id=topic_id,
                title=title,
                description=self._extract_field(self._DESC_RE, topic_content),
                narrative=self._extract_field(self._NARR_RE, topic_content)
            )

    def load_qrels(self, qrels_path: Optional[str] = None) -> Dict[str, Dict[str, int]]:
        """
        Load TREC qrels (relevance judgments).

        Format: topic_id 0 doc_id relevance

        Args:
            qrels_path: File or directory to read. Falls back to
                ``qrels_dir``, then ``base_path``. In a directory, every
                ``qrels*.txt`` and ``*.qrels`` file is parsed.

        Returns:
            The instance's qrels dictionary (topic_id -> {doc_id: relevance}),
            or a new empty dict when the path does not exist.
        """
        search_path = Path(qrels_path) if qrels_path else self.qrels_dir or self.base_path

        if not search_path or not search_path.exists():
            print(f"[TRECDataset] Qrels path not found: {search_path}")
            return {}

        if search_path.is_file():
            qrel_files = [search_path]
        else:
            # Sorted for a deterministic winner when files overlap.
            qrel_files = sorted(search_path.glob("qrels*.txt")) + sorted(search_path.glob("*.qrels"))

        total_qrels = sum(self._parse_qrel_file(qrel_file) for qrel_file in qrel_files)

        # Counts judgment lines accepted by this call (repeats included).
        self.stats["qrels_loaded"] = total_qrels
        print(f"[TRECDataset] Loaded {total_qrels} qrels from {len(qrel_files)} files")

        return self.qrels

    def _parse_qrel_file(self, file_path: Path) -> int:
        """
        Parse a single qrel file into ``self.qrels``.

        Lines with fewer than four whitespace-separated fields are ignored.
        Lines whose relevance field is not an integer are skipped and counted
        in a summary message, so one bad line no longer discards the rest of
        the file.

        Returns:
            Count of qrels loaded from this file.
        """
        count = 0
        skipped = 0
        try:
            # errors='ignore' matches the topic parser: stray bytes should
            # not abort the file.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    parts = line.split()
                    if len(parts) < 4:
                        continue

                    topic_id, doc_id = parts[0], parts[2]
                    try:
                        relevance = int(parts[3])
                    except ValueError:
                        skipped += 1
                        continue

                    self.qrels.setdefault(topic_id, {})[doc_id] = relevance
                    count += 1
        except OSError as e:
            print(f"[TRECDataset] Error parsing {file_path}: {e}")

        if skipped:
            print(f"[TRECDataset] Skipped {skipped} malformed qrel line(s) in {file_path}")

        return count

    def load_corpus_jsonl(self, jsonl_path: Optional[str] = None) -> Dict[str, TRECDocument]:
        """
        Load corpus from JSONL format.

        Expected format: {"id": "...", "contents": "...", "title": "..."}
        ("docid" and "text" are accepted as fallbacks for "id" and "contents").

        Blank lines are ignored. Lines that are not a JSON object are skipped
        and counted in a summary message instead of aborting the load.
        Records without an id are ignored.

        Args:
            jsonl_path: File to read. Falls back to ``corpus_path``.

        Returns:
            The instance's document dictionary (doc_id -> TRECDocument), or a
            new empty dict when the path does not exist.
        """
        path = Path(jsonl_path) if jsonl_path else self.corpus_path

        if not path or not path.exists():
            print(f"[TRECDataset] Corpus path not found: {path}")
            return {}

        skipped = 0
        try:
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        doc = json.loads(line)
                    except json.JSONDecodeError:
                        skipped += 1
                        continue
                    if not isinstance(doc, dict):
                        skipped += 1
                        continue

                    doc_id = doc.get('id', doc.get('docid', ''))
                    if doc_id:
                        self.documents[doc_id] = TRECDocument(
                            doc_id=doc_id,
                            text=doc.get('contents', doc.get('text', '')),
                            title=doc.get('title', '')
                        )
        except (OSError, UnicodeDecodeError) as e:
            print(f"[TRECDataset] Error loading corpus: {e}")
        else:
            print(f"[TRECDataset] Loaded {len(self.documents)} documents")

        if skipped:
            print(f"[TRECDataset] Skipped {skipped} malformed corpus line(s) in {path}")

        # Keep the counter in step with self.documents even after a failure
        # part-way through the file.
        self.stats["docs_loaded"] = len(self.documents)

        return self.documents

    def get_relevant_docs(self, topic_id: str) -> Set[str]:
        """Get set of document IDs judged relevant (relevance > 0) for a topic.

        Returns an empty set when the topic has no judgments.
        """
        return {
            doc_id for doc_id, rel in self.qrels.get(topic_id, {}).items()
            if rel > 0
        }

    def get_topic_queries(self, query_type: str = "short") -> Dict[str, str]:
        """
        Get dictionary of topic_id -> query text.

        Args:
            query_type: "short" (title only) or "long" (title + description).
                Any value other than "short" selects the long form.
        """
        if query_type == "short":
            return {tid: t.short_query for tid, t in self.topics.items()}
        return {tid: t.long_query for tid, t in self.topics.items()}

    @staticmethod
    def format_trec_run(
        results: List[Tuple[str, str, float, int]],  # (topic_id, doc_id, score, rank)
        run_tag: str
    ) -> str:
        """
        Format results as TREC run file.

        Output format: topic_id Q0 doc_id rank score run_tag

        Scores are written with six decimal places. Lines are joined with
        newlines and there is no trailing newline; an empty result list
        gives an empty string.
        """
        return "\n".join(
            f"{topic_id} Q0 {doc_id} {rank} {score:.6f} {run_tag}"
            for topic_id, doc_id, score, rank in results
        )

    @staticmethod
    def save_trec_run(
        results: List[Tuple[str, str, float, int]],
        run_tag: str,
        output_path: str
    ):
        """Save results to TREC run file at *output_path* (UTF-8, overwritten)."""
        run_content = TRECDataset.format_trec_run(results, run_tag)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(run_content)
        print(f"[TRECDataset] Saved run file: {output_path}")

    def get_statistics(self) -> Dict[str, int]:
        """Get dataset statistics, computed from the data currently held.

        Returns:
            Counts of topics, topics that have qrels, individual judgments,
            and documents.
        """
        return {
            "topics": len(self.topics),
            "qrels_topics": len(self.qrels),
            "total_qrels": sum(len(q) for q in self.qrels.values()),
            "documents": len(self.documents)
        }


# --- Sample Topics for Testing (AP88-90 subset) ---

# Hand-written sample topics, keyed by each topic's own topic_id so the key
# and the stored id cannot drift apart. Insertion order follows the tuple.
SAMPLE_TOPICS = {
    sample.topic_id: sample
    for sample in (
        TRECTopic(
            topic_id="51",
            title="Airbus Subsidies",
            description="How much government money has been used to support Airbus aircraft manufacturing?",
            narrative="A relevant document will contain information on subsidies or other financial support from government sources to Airbus.",
        ),
        TRECTopic(
            topic_id="52",
            title="Japanese Auto Sales",
            description="How have Japanese automobile sales fared in the U.S.?",
            narrative="A relevant document will report on sales figures, trends, or market share of Japanese automobile manufacturers in the United States.",
        ),
        TRECTopic(
            topic_id="53",
            title="Leveraged Buyouts",
            description="What are the effects of leveraged buyouts on companies and industries?",
            narrative="Relevant documents discuss the impact of LBOs on corporate structure, employment, or industry dynamics.",
        ),
        TRECTopic(
            topic_id="54",
            title="Satellite Launches",
            description="What are the commercial applications of satellite launches?",
            narrative="A relevant document will discuss commercial satellite launches and their business applications.",
        ),
        TRECTopic(
            topic_id="55",
            title="Insider Trading",
            description="What individuals or companies have been accused or convicted of insider trading?",
            narrative="A relevant document will identify specific cases of insider trading allegations or convictions.",
        ),
    )
}


def create_sample_dataset() -> TRECDataset:
    """Create a small in-memory dataset for testing.

    The dataset holds a copy of every topic in SAMPLE_TOPICS plus a few
    hard-coded relevance judgments for topics 51-53. No files are read.

    Returns:
        A TRECDataset whose ``topics`` and ``qrels`` are populated.
    """
    # Local import keeps this fix self-contained.
    from dataclasses import replace

    dataset = TRECDataset()
    # TRECTopic is a mutable dataclass, so a shallow dict copy would hand out
    # the very instances stored in SAMPLE_TOPICS; editing a topic on the
    # returned dataset would then silently change the module-level samples.
    # replace() with no overrides builds an equal but independent instance.
    dataset.topics = {tid: replace(topic) for tid, topic in SAMPLE_TOPICS.items()}

    # Add sample qrels
    dataset.qrels = {
        "51": {"AP880212-0001": 1, "AP880215-0003": 1, "AP880301-0010": 0},
        "52": {"AP890102-0020": 1, "AP890115-0045": 1},
        "53": {"AP880325-0100": 1},
    }

    return dataset


# --- Testing ---

if __name__ == "__main__":
    # Smoke-test walkthrough: build the sample dataset and print what each
    # accessor returns. Nothing is asserted; the output is for eyeballing.
    rule = "=" * 60
    print(rule)
    print("SysCRED TREC Dataset - Test Suite")
    print(rule)

    # Create sample dataset
    dataset = create_sample_dataset()

    print(f"\n1. Sample Topics: {len(dataset.topics)}")
    # Show only the first three topics.
    for position, (tid, topic) in enumerate(dataset.topics.items()):
        if position == 3:
            break
        print(f"   {tid}: {topic.title}")
        print(f"      Short: {topic.short_query}")
        print(f"      Long: {topic.long_query[:80]}...")

    print("\n2. Sample Qrels:")
    for tid, docs in dataset.qrels.items():
        judged = len(docs)
        print(f"   Topic {tid}: {judged} judgments")

    print("\n3. Query dictionaries:")
    short_queries = dataset.get_topic_queries("short")
    long_queries = dataset.get_topic_queries("long")
    print(f"   Short queries: {len(short_queries)}")
    print(f"   Long queries: {len(long_queries)}")

    print("\n4. Relevant docs for topic 51:")
    relevant = dataset.get_relevant_docs("51")
    print(f"   {relevant}")

    print("\n5. Statistics:")
    stats = dataset.get_statistics()
    for key in stats:
        print(f"   {key}: {stats[key]}")

    print("\n" + rule)
    print("Tests complete!")
    print(rule)