Spaces:

Chars
/

CognitiveKernel-Launchpad

Sleeping

CognitiveKernel-Launchpad / gaia / data_loader.py

charSLee013

feat: complete Hugging Face Spaces deployment with production-ready CognitiveKernel-Launchpad

1ea26af 4 months ago

4.45 kB

	#!/usr/bin/env python3
	"""
	GAIA Data Loader - Minimal JSONL parser for GAIA dataset
	Loads tasks from GAIA JSONL format with exact field names and permanent file exclusion.
	"""

	import json
	from typing import List, Dict


	def load_tasks(path: str) -> List[Dict]:
	    """
	    Load tasks from a GAIA JSONL file and normalize them.

	    Each non-blank line is parsed as one JSON object using the exact GAIA
	    field names:
	    - task_id: Task identifier
	    - Question: Task question/query (capital Q)
	    - Final answer: Ground truth answer (space, capital F)
	    - Level: Difficulty level (1, 2, or 3, capital L)
	    - file_name: File attachment (lowercase with underscore)

	    A line is skipped, with a warning printed to stdout, when it is not
	    valid JSON, is not a JSON object, lacks task_id / Question / Level, or
	    has a Level that is not convertible to an int in 1-3. Skipped lines
	    never abort the load.

	    Args:
	        path: Path to JSONL file

	    Returns:
	        List of normalized task dictionaries with the keys 'task_id' (str),
	        'question' (str), 'ground_truth' (str, '' when the answer is
	        absent), 'level' (int) and 'has_file' (bool).

	    Raises:
	        FileNotFoundError: If path does not exist.
	        RuntimeError: If loading fails for any other reason.
	    """
	    tasks = []

	    try:
	        with open(path, 'r', encoding='utf-8') as f:
	            for line_num, line in enumerate(f, 1):
	                line = line.strip()
	                if not line:
	                    continue

	                try:
	                    obj = json.loads(line)
	                except json.JSONDecodeError as e:
	                    print(f"Warning: Invalid JSON at line {line_num}: {e}")
	                    continue

	                # Valid JSON that is not an object (a list, string, number)
	                # has no .get(); skip it like any other malformed line
	                # instead of letting it abort the whole load.
	                if not isinstance(obj, dict):
	                    print(f"Warning: Expected JSON object at line {line_num}")
	                    continue

	                # Extract GAIA format fields with exact names
	                task_id = obj.get('task_id')
	                question = obj.get('Question')  # Capital Q
	                final_answer = obj.get('Final answer')  # Space, capital F
	                level = obj.get('Level')  # Capital L
	                file_name = obj.get('file_name', '')  # lowercase with underscore

	                # Skip malformed entries
	                if not task_id or not question or level is None:
	                    print(f"Warning: Missing required fields at line {line_num}")
	                    continue

	                # Normalize and validate the level. OverflowError covers
	                # a JSON "Infinity" value, which json.loads accepts.
	                try:
	                    level_int = int(level)
	                except (ValueError, TypeError, OverflowError):
	                    print(f"Warning: Invalid level format at line {line_num}")
	                    continue
	                if level_int not in (1, 2, 3):
	                    print(f"Warning: Invalid level {level} at line {line_num}")
	                    continue

	                tasks.append({
	                    'task_id': str(task_id),
	                    'question': str(question),
	                    # Only a missing/null answer maps to ''; a falsy but
	                    # real answer such as numeric 0 must be preserved.
	                    'ground_truth': '' if final_answer is None else str(final_answer),
	                    'level': level_int,
	                    'has_file': bool(file_name and str(file_name).strip()),
	                })

	    except FileNotFoundError as e:
	        raise FileNotFoundError(f"Data file not found: {path}") from e
	    except Exception as e:
	        raise RuntimeError(f"Failed to load data from {path}: {e}") from e

	    return tasks


	def filter_tasks(tasks: List[Dict], level: str = 'all', limit: int = 0) -> List[Dict]:
	    """
	    Filter tasks by level and apply permanent file exclusion.

	    Tasks whose 'has_file' entry is truthy are always removed; there is no
	    way to override this. The input list is never modified; a new list is
	    returned.

	    Args:
	        tasks: List of task dictionaries (each needs 'level' and 'has_file')
	        level: Level filter ('1', '2', '3', or 'all')
	        limit: Maximum number of tasks to return (0 or less = no limit)

	    Returns:
	        Filtered list of tasks, in their original order

	    Raises:
	        ValueError: If level is neither 'all' nor convertible to an int.
	    """
	    # Resolve the level filter up front so that only the conversion itself
	    # sits inside the try; None means "do not filter by level".
	    if level == 'all':
	        wanted_level = None
	    else:
	        try:
	            wanted_level = int(level)
	        except (ValueError, TypeError) as e:
	            # TypeError covers inputs such as None, so callers always get
	            # the documented ValueError for an unusable filter.
	            raise ValueError(
	                f"Invalid level filter: {level}. Must be '1', '2', '3', or 'all'"
	            ) from e

	    # Level match first, then the PERMANENT exclusion of tasks requiring
	    # files (no override ever).
	    filtered = [
	        t for t in tasks
	        if (wanted_level is None or t['level'] == wanted_level)
	        and not t['has_file']
	    ]

	    # Apply limit if specified
	    if limit > 0:
	        filtered = filtered[:limit]

	    return filtered


	def get_task_stats(tasks: List[Dict]) -> Dict[str, int]:
	    """
	    Get basic statistics about the task list.

	    Args:
	        tasks: List of task dictionaries (each needs 'level' and 'has_file')

	    Returns:
	        Dictionary with the keys 'total', 'level_1', 'level_2', 'level_3'
	        and 'with_files'. All counts are 0 for an empty input. A task whose
	        level is not 1, 2 or 3 is counted in 'total' (and in 'with_files'
	        when its 'has_file' is truthy) but in none of the level counters.
	    """
	    stats = {'total': 0, 'level_1': 0, 'level_2': 0, 'level_3': 0, 'with_files': 0}

	    # Single pass over the tasks instead of building one throwaway list
	    # per counter.
	    for task in tasks:
	        stats['total'] += 1

	        task_level = task['level']
	        for candidate in (1, 2, 3):
	            if task_level == candidate:
	                stats[f'level_{candidate}'] += 1
	                break

	        if task['has_file']:
	            stats['with_files'] += 1

	    return stats