AgileAndy's picture
Updated improved version
f504b2e verified
"""
GAIA Dataset Utilities
Download and cache GAIA questions for local testing
"""
import os
import json
import requests
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
class GAIADatasetManager:
    """Manages GAIA dataset download and local caching.

    Questions are fetched from the scoring API and stored as JSON inside
    ``cache_dir`` next to a small metadata file describing the download.
    """

    def __init__(self, cache_dir: str = "gaia_data"):
        """
        Create the manager and make sure the cache directory exists.

        Args:
            cache_dir: Directory used for the question cache, metadata and results.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so that a nested cache path (e.g. "data/gaia") works too.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        self.api_url = "https://agents-course-unit4-scoring.hf.space"
        self.questions_url = f"{self.api_url}/questions"
        self.submit_url = f"{self.api_url}/submit"

        self.questions_cache_file = self.cache_dir / "questions.json"
        self.metadata_file = self.cache_dir / "metadata.json"

    @staticmethod
    def _read_json(path: Path) -> Any:
        """Return the parsed contents of the UTF-8 JSON file at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _write_json(path: Path, payload: Any) -> None:
        """Serialize ``payload`` as indented UTF-8 JSON into ``path``."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    def download_questions(self, force_refresh: bool = False) -> List[Dict[str, Any]]:
        """
        Download GAIA questions from scoring API.

        Args:
            force_refresh: If True, always download fresh data. If False, use cache if available.

        Returns:
            List of question dictionaries

        Raises:
            requests.exceptions.RequestException: The download failed and no
                cached copy exists to fall back on.
            ValueError: The API answered with an empty question list.
        """
        # Check cache first
        if not force_refresh and self.questions_cache_file.exists():
            print(f"📦 Loading questions from cache: {self.questions_cache_file}")
            try:
                return self._read_json(self.questions_cache_file)
            except json.JSONDecodeError as e:
                # A truncated or corrupt cache must not block the manager
                # forever; fall through and fetch a fresh copy instead.
                print(f"⚠️ Cache file is not valid JSON ({e}); downloading again")

        # Download from API (only the network calls live inside the try)
        print(f"🌐 Downloading questions from: {self.questions_url}")
        try:
            response = requests.get(self.questions_url, timeout=30)
            response.raise_for_status()
            questions = response.json()
        except requests.exceptions.RequestException as e:
            print(f"❌ Error downloading questions: {e}")
            # Fallback to cache if available
            if self.questions_cache_file.exists():
                print("📦 Falling back to cached questions")
                return self._read_json(self.questions_cache_file)
            # Bare raise keeps the original traceback intact.
            raise

        if not questions:
            raise ValueError("Fetched questions list is empty")

        # Cache the questions
        self._write_json(self.questions_cache_file, questions)

        # Update metadata
        metadata = {
            "download_time": datetime.now().isoformat(),
            "question_count": len(questions),
            "api_url": self.questions_url,
        }
        self._write_json(self.metadata_file, metadata)

        print(f"✅ Downloaded and cached {len(questions)} questions")
        return questions

    def get_cached_metadata(self) -> Optional[Dict[str, Any]]:
        """Get metadata about cached questions, or None if nothing was cached yet."""
        if self.metadata_file.exists():
            return self._read_json(self.metadata_file)
        return None

    def save_results(self, results: List[Dict[str, Any]], filename: Optional[str] = None) -> Path:
        """
        Save test results to a file inside the cache directory.

        Args:
            results: List of result dictionaries
            filename: Optional filename. If not provided, uses timestamp.

        Returns:
            Path of the file that was written.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"results_{timestamp}.json"

        filepath = self.cache_dir / filename
        self._write_json(filepath, results)

        print(f"💾 Results saved to: {filepath}")
        return filepath

    def load_dotenv(self, env_path: str = ".env") -> None:
        """
        Load environment variables from a .env file into ``os.environ``.

        Blank lines, ``#`` comments and lines without ``=`` are ignored. An
        optional leading ``export `` and one pair of matching surrounding
        quotes around the value are stripped. Existing variables with the
        same name are overwritten.

        Args:
            env_path: Path of the file to read. Defaults to ".env" in the
                current working directory.
        """
        env_file = Path(env_path)
        if not env_file.exists():
            print("⚠️ No .env file found")
            return

        print(f"📄 Loading environment variables from {env_file}")
        # utf-8-sig also accepts files saved with a BOM (common on Windows).
        with open(env_file, 'r', encoding='utf-8-sig') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#') or '=' not in line:
                    continue

                key, value = line.split('=', 1)
                key = key.strip()
                if key.startswith("export "):
                    key = key[len("export "):].strip()
                if not key:
                    # "=value" has no name; setting it would raise in os.environ.
                    continue

                value = value.strip()
                # KEY="value" / KEY='value' conventionally mean the bare value.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]

                os.environ[key] = value
        print("✅ Environment variables loaded")
def ensure_local_testing_setup() -> GAIADatasetManager:
    """
    Ensure environment is set up for 100% local testing.

    Creates a manager with the default cache directory, loads variables from
    a local .env file if one exists, and makes sure the questions are cached.

    Returns:
        GAIADatasetManager instance with questions cached

    Raises:
        Exception: Whatever ``download_questions`` raised; it is reported on
            stdout and then re-raised unchanged.
    """
    print("🔧 Setting up for local testing...")

    # Load environment variables
    manager = GAIADatasetManager()
    manager.load_dotenv()

    # Download and cache questions
    try:
        questions = manager.download_questions()
    except Exception as e:
        print(f"❌ Failed to download questions: {e}")
        # Bare raise keeps the original traceback intact.
        raise

    print(f"✅ Local testing setup complete ({len(questions)} questions cached)")
    return manager
if __name__ == "__main__":
    # Manual smoke test of the dataset manager: set everything up, then
    # print whatever metadata was recorded for the cached questions.
    print("=" * 60)
    print("GAIA Dataset Manager Test")
    print("=" * 60)

    manager = ensure_local_testing_setup()

    # Show cache metadata
    metadata = manager.get_cached_metadata()
    if metadata:
        print("\n📊 Cache Metadata:")
        for key, value in metadata.items():
            print(f" {key}: {value}")