Spaces:
Sleeping
Sleeping
File size: 4,452 Bytes
1ea26af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
#!/usr/bin/env python3
"""
GAIA Data Loader - Minimal JSONL parser for GAIA dataset
Loads tasks from GAIA JSONL format with exact field names and permanent file exclusion.
"""
import json
from typing import List, Dict
def load_tasks(path: str) -> List[Dict]:
    """
    Load tasks from a GAIA JSONL file (one JSON object per line).

    Expected GAIA format fields (exact names):
    - task_id: Task identifier
    - Question: Task question/query (capital Q)
    - Final answer: Ground truth answer (space, capital F)
    - Level: Difficulty level (1, 2, or 3, capital L)
    - file_name: File attachment (lowercase with underscore)

    Blank lines are ignored. Lines that are not valid JSON, lines whose JSON
    value is not an object, entries missing task_id / Question / Level, and
    entries whose Level is not 1, 2 or 3 are skipped after printing a warning.

    Args:
        path: Path to JSONL file

    Returns:
        List of normalized task dictionaries with the keys 'task_id' (str),
        'question' (str), 'ground_truth' (str, '' when no answer is present),
        'level' (int) and 'has_file' (bool).

    Raises:
        FileNotFoundError: If the file at ``path`` does not exist.
        RuntimeError: If any other error occurs while reading the file.
    """
    tasks = []
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON at line {line_num}: {e}")
                    continue

                # A line can be valid JSON without being an object (e.g. a
                # list or a number). Skip it like any other malformed line
                # instead of letting obj.get() abort the whole load.
                if not isinstance(obj, dict):
                    print(f"Warning: Expected a JSON object at line {line_num}")
                    continue

                # Extract GAIA format fields with exact names
                task_id = obj.get('task_id')
                question = obj.get('Question')  # Capital Q
                final_answer = obj.get('Final answer')  # Space, capital F
                level = obj.get('Level')  # Capital L
                file_name = obj.get('file_name', '')  # lowercase with underscore

                # Skip malformed entries
                if not task_id or not question or level is None:
                    print(f"Warning: Missing required fields at line {line_num}")
                    continue

                # Normalize and validate the level
                try:
                    level_int = int(level)
                except (ValueError, TypeError):
                    print(f"Warning: Invalid level format at line {line_num}")
                    continue
                if level_int not in (1, 2, 3):
                    print(f"Warning: Invalid level {level} at line {line_num}")
                    continue

                tasks.append({
                    'task_id': str(task_id),
                    'question': str(question),
                    # Only a missing/null answer maps to ''; falsy but real
                    # answers such as the number 0 must be preserved.
                    'ground_truth': '' if final_answer is None else str(final_answer),
                    'level': level_int,
                    'has_file': bool(file_name and str(file_name).strip()),
                })
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Data file not found: {path}") from e
    except Exception as e:
        raise RuntimeError(f"Failed to load data from {path}: {e}") from e
    return tasks
def filter_tasks(tasks: List[Dict], level: str = 'all', limit: int = 0) -> List[Dict]:
    """
    Narrow a task list by difficulty level, drop file-based tasks, and cap the size.

    Tasks whose 'has_file' entry is truthy are always removed; there is no
    parameter to keep them. The input list itself is not modified.

    Args:
        tasks: List of task dictionaries (each needs 'has_file', and 'level'
            when a level filter is given)
        level: Level filter ('1', '2', '3', or 'all')
        limit: Maximum number of tasks to return (0 = no limit)

    Returns:
        New list with the tasks that passed every filter, in original order

    Raises:
        ValueError: If ``level`` is not 'all' and cannot be converted to an int.
    """
    candidates = tasks.copy()

    # Optional level restriction; 'all' leaves the candidates untouched.
    if level != 'all':
        try:
            wanted_level = int(level)
            candidates = [task for task in candidates if task['level'] == wanted_level]
        except ValueError:
            raise ValueError(f"Invalid level filter: {level}. Must be '1', '2', '3', or 'all'")

    # PERMANENT exclusion of tasks requiring files (no override ever)
    file_free = []
    for task in candidates:
        if task['has_file']:
            continue
        file_free.append(task)

    # A positive limit truncates the result; anything else means "no limit".
    return file_free[:limit] if limit > 0 else file_free
def get_task_stats(tasks: List[Dict]) -> Dict[str, int]:
    """
    Summarize a task list as simple counts.

    Args:
        tasks: List of task dictionaries (each with 'level' and 'has_file')

    Returns:
        Dictionary with the keys 'total', 'level_1', 'level_2', 'level_3'
        and 'with_files'; every count is 0 when ``tasks`` is empty
    """
    summary = {'total': 0, 'level_1': 0, 'level_2': 0, 'level_3': 0, 'with_files': 0}
    if not tasks:
        return summary

    summary['total'] = len(tasks)

    # One counting pass per difficulty level.
    for key, level_value in (('level_1', 1), ('level_2', 2), ('level_3', 3)):
        summary[key] = sum(1 for task in tasks if task['level'] == level_value)

    summary['with_files'] = sum(1 for task in tasks if task['has_file'])
    return summary
|