charSLee013
feat: complete Hugging Face Spaces deployment with production-ready CognitiveKernel-Launchpad
1ea26af
#!/usr/bin/env python3
"""
GAIA Data Loader - Minimal JSONL parser for GAIA dataset
Loads tasks from GAIA JSONL files using the dataset's exact field names; tasks that require file attachments are permanently excluded at filtering time.
"""
import json
from typing import List, Dict
def load_tasks(path: str) -> List[Dict]:
    """
    Load tasks from a GAIA JSONL file using the dataset's exact field names.

    Expected GAIA format fields:
    - task_id: Task identifier
    - Question: Task question/query (capital Q)
    - Final answer: Ground truth answer (space, capital F)
    - Level: Difficulty level (1, 2, or 3, capital L)
    - file_name: File attachment (lowercase with underscore)

    Blank lines are ignored. A line is skipped, with a warning printed to
    stdout, when it is not valid JSON, is not a JSON object, lacks
    ``task_id``/``Question``/``Level``, or carries a level outside 1-3.

    Args:
        path: Path to JSONL file

    Returns:
        List of normalized task dictionaries with the keys ``task_id`` (str),
        ``question`` (str), ``ground_truth`` (str, '' when the answer is
        absent), ``level`` (int) and ``has_file`` (bool).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        RuntimeError: If loading fails for any other reason.
    """
    tasks = []
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON at line {line_num}: {e}")
                    continue

                # Valid JSON that is not an object (list, string, number...)
                # has no fields to read; treat it like any other malformed
                # line instead of letting it abort the whole load.
                if not isinstance(obj, dict):
                    print(f"Warning: Expected JSON object at line {line_num}")
                    continue

                # Extract GAIA format fields with exact names
                task_id = obj.get('task_id')
                question = obj.get('Question')  # Capital Q
                final_answer = obj.get('Final answer')  # Space, capital F
                level = obj.get('Level')  # Capital L
                file_name = obj.get('file_name', '')  # lowercase with underscore

                # Skip malformed entries
                if not task_id or not question or level is None:
                    print(f"Warning: Missing required fields at line {line_num}")
                    continue

                # Normalize and validate
                try:
                    level_int = int(level)
                except (ValueError, TypeError):
                    print(f"Warning: Invalid level format at line {line_num}")
                    continue
                if level_int not in (1, 2, 3):
                    print(f"Warning: Invalid level {level} at line {line_num}")
                    continue

                tasks.append({
                    'task_id': str(task_id),
                    'question': str(question),
                    # Compare against None so falsy-but-real answers such as
                    # a numeric 0 are kept rather than blanked out.
                    'ground_truth': str(final_answer) if final_answer is not None else '',
                    'level': level_int,
                    'has_file': bool(file_name and str(file_name).strip()),
                })
    except FileNotFoundError:
        # The replacement message already names the path; drop the redundant context.
        raise FileNotFoundError(f"Data file not found: {path}") from None
    except Exception as e:
        # Keep the original cause attached for debugging.
        raise RuntimeError(f"Failed to load data from {path}: {e}") from e
    return tasks
def filter_tasks(tasks: List[Dict], level: str = 'all', limit: int = 0) -> List[Dict]:
    """
    Filter tasks by level and apply permanent file exclusion

    The input is never mutated; a new list is always returned.

    Args:
        tasks: Task dictionaries, each with 'level' and 'has_file' keys
        level: Level filter ('1', '2', '3', or 'all'); any value that
            int() accepts is compared against each task's 'level'
        limit: Maximum number of tasks to return (0 or less = no limit)

    Returns:
        Filtered list of tasks

    Raises:
        ValueError: If ``level`` is neither 'all' nor convertible to int.
    """
    filtered = list(tasks)

    # Filter by level if specified
    if level != 'all':
        # Only the conversion is guarded. TypeError (e.g. level=None) is
        # reported as the same ValueError as an unparsable string.
        try:
            level_int = int(level)
        except (ValueError, TypeError):
            raise ValueError(
                f"Invalid level filter: {level}. Must be '1', '2', '3', or 'all'"
            ) from None
        filtered = [t for t in filtered if t['level'] == level_int]

    # PERMANENT exclusion of tasks requiring files (no override ever)
    filtered = [t for t in filtered if not t['has_file']]

    # Apply limit if specified
    if limit > 0:
        filtered = filtered[:limit]
    return filtered
def get_task_stats(tasks: List[Dict]) -> Dict[str, int]:
    """
    Summarize a task list as simple counts

    Args:
        tasks: Task dictionaries, each with 'level' and 'has_file' keys

    Returns:
        Dictionary with the keys 'total', 'level_1', 'level_2', 'level_3'
        and 'with_files'; every count is 0 when the task list is empty
    """
    # Start from an all-zero summary so the empty case needs no special literal.
    summary = dict.fromkeys(
        ('total', 'level_1', 'level_2', 'level_3', 'with_files'), 0
    )
    if not tasks:
        return summary

    summary['total'] = len(tasks)
    # One pass per difficulty level, then one for file attachments.
    for difficulty in (1, 2, 3):
        summary[f'level_{difficulty}'] = sum(
            1 for entry in tasks if entry['level'] == difficulty
        )
    summary['with_files'] = sum(1 for entry in tasks if entry['has_file'])
    return summary