File size: 4,452 Bytes
1ea26af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
"""
GAIA Data Loader - Minimal JSONL parser for GAIA dataset
Loads tasks from GAIA JSONL format with exact field names and permanent file exclusion.
"""

import json
from typing import List, Dict


def load_tasks(path: str) -> List[Dict]:
    """
    Load tasks from GAIA JSONL format with exact field names

    Every non-blank line is parsed as one JSON object. A line is reported
    with a printed warning and skipped when it is not valid JSON, is not a
    JSON object, lacks a required field, or carries a level outside 1-3.

    Expected GAIA format fields:
    - task_id: Task identifier (required)
    - Question: Task question/query (capital Q, required)
    - Final answer: Ground truth answer (space, capital F, optional)
    - Level: Difficulty level (1, 2, or 3, capital L, required)
    - file_name: File attachment (lowercase with underscore, optional)

    Args:
        path: Path to JSONL file

    Returns:
        List of normalized task dictionaries with the keys 'task_id' (str),
        'question' (str), 'ground_truth' (str, '' when no answer is present),
        'level' (int) and 'has_file' (bool)

    Raises:
        FileNotFoundError: If the file at ``path`` does not exist
        RuntimeError: If the file cannot be read for any other reason
    """
    tasks = []

    try:
        # 'utf-8-sig' decodes plain UTF-8 unchanged but also swallows a leading
        # byte-order mark, which would otherwise make the first line invalid JSON.
        with open(path, 'r', encoding='utf-8-sig') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON at line {line_num}: {e}")
                    continue

                # Valid JSON is not necessarily an object (e.g. a list or a
                # number); skip such lines instead of failing the whole load.
                if not isinstance(obj, dict):
                    print(f"Warning: Expected a JSON object at line {line_num}")
                    continue

                # Extract GAIA format fields with exact names
                task_id = obj.get('task_id')
                question = obj.get('Question')  # Capital Q
                final_answer = obj.get('Final answer')  # Space, capital F
                level = obj.get('Level')  # Capital L
                file_name = obj.get('file_name', '')  # lowercase with underscore

                # Skip malformed entries
                if not task_id or not question or level is None:
                    print(f"Warning: Missing required fields at line {line_num}")
                    continue

                # Normalize and validate the level (accepts 1 as well as "1")
                try:
                    level_int = int(level)
                except (ValueError, TypeError):
                    print(f"Warning: Invalid level format at line {line_num}")
                    continue
                if level_int not in (1, 2, 3):
                    print(f"Warning: Invalid level {level} at line {line_num}")
                    continue

                tasks.append({
                    'task_id': str(task_id),
                    'question': str(question),
                    # Only a missing/null answer maps to ''; falsy but real
                    # answers such as 0 must be kept as their string form.
                    'ground_truth': '' if final_answer is None else str(final_answer),
                    'level': level_int,
                    'has_file': bool(file_name and str(file_name).strip()),
                })

    except FileNotFoundError as e:
        raise FileNotFoundError(f"Data file not found: {path}") from e
    except Exception as e:
        # Keep the RuntimeError contract for callers, but preserve the cause.
        raise RuntimeError(f"Failed to load data from {path}: {e}") from e

    return tasks


def filter_tasks(tasks: List[Dict], level: str = 'all', limit: int = 0) -> List[Dict]:
    """
    Select tasks by difficulty level, always dropping tasks that need a file

    The input list is never modified; a new list is returned. Tasks whose
    'has_file' flag is truthy are removed unconditionally, and the limit is
    applied after that removal.

    Args:
        tasks: List of task dictionaries (each with 'level' and 'has_file')
        level: Level filter ('1', '2', '3', or 'all')
        limit: Maximum number of tasks to return (0 or less = no limit)

    Returns:
        Filtered list of tasks

    Raises:
        ValueError: If ``level`` is neither 'all' nor convertible to an integer
    """
    candidates = tasks.copy()

    # Narrow down to the requested level unless every level is wanted
    if level != 'all':
        try:
            wanted_level = int(level)
            candidates = list(
                filter(lambda task: task['level'] == wanted_level, candidates)
            )
        except ValueError:
            raise ValueError(f"Invalid level filter: {level}. Must be '1', '2', '3', or 'all'")

    # PERMANENT exclusion of tasks requiring files (no override ever)
    file_free = [task for task in candidates if not task['has_file']]

    # A positive limit truncates the result; anything else keeps it whole
    return file_free[:limit] if limit > 0 else file_free


def get_task_stats(tasks: List[Dict]) -> Dict[str, int]:
    """
    Summarize a task list by size, difficulty level and file attachments

    Args:
        tasks: List of task dictionaries (each with 'level' and 'has_file')

    Returns:
        Dictionary with the keys 'total', 'level_1', 'level_2', 'level_3'
        and 'with_files'; all values are 0 for an empty task list
    """
    # An empty (or otherwise falsy) input is summarized as an empty list
    records = tasks if tasks else []

    def count_where(predicate) -> int:
        # Number of records for which the predicate is truthy
        return sum(1 for record in records if predicate(record))

    summary = {'total': len(records)}
    for lvl in (1, 2, 3):
        # Bind lvl as a default so each lambda tests its own level
        summary[f'level_{lvl}'] = count_where(
            lambda record, lvl=lvl: record['level'] == lvl
        )
    summary['with_files'] = count_where(lambda record: record['has_file'])

    return summary