# Spaces: Sleeping
"""
GAIA Dataset Utilities

Download and cache GAIA questions for local testing
"""
| import os | |
| import json | |
| import requests | |
| from pathlib import Path | |
| from typing import List, Dict, Any, Optional | |
| from datetime import datetime | |
class GAIADatasetManager:
    """Manage downloading GAIA questions from the scoring API and caching them locally.

    The question cache, the download metadata and any saved result files all
    live under ``cache_dir``.
    """

    def __init__(self, cache_dir: str = "gaia_data"):
        """Create the cache directory and derive the API URLs and cache paths.

        Args:
            cache_dir: Directory used for cached questions, metadata and
                results. Missing parent directories are created as well.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache_dir (e.g. "data/gaia") works on first use.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.api_url = "https://agents-course-unit4-scoring.hf.space"
        self.questions_url = f"{self.api_url}/questions"
        self.submit_url = f"{self.api_url}/submit"
        self.questions_cache_file = self.cache_dir / "questions.json"
        self.metadata_file = self.cache_dir / "metadata.json"

    @staticmethod
    def _read_json(path: Path) -> Any:
        """Return the parsed JSON content of ``path`` (read as UTF-8)."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _write_json(path: Path, payload: Any) -> None:
        """Write ``payload`` to ``path`` as indented JSON (UTF-8)."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    def download_questions(self, force_refresh: bool = False) -> List[Dict[str, Any]]:
        """Return the GAIA questions, from the local cache or the scoring API.

        Args:
            force_refresh: If True, always download fresh data. If False, use
                the cache when it exists and is readable.

        Returns:
            List of question dictionaries.

        Raises:
            requests.exceptions.RequestException: The download failed and no
                cache file exists to fall back to.
            ValueError: The API returned an empty payload.
        """
        # Check cache first
        if not force_refresh and self.questions_cache_file.exists():
            print(f"π¦ Loading questions from cache: {self.questions_cache_file}")
            try:
                return self._read_json(self.questions_cache_file)
            except (OSError, ValueError) as e:
                # A truncated or corrupt cache must not block a fresh download;
                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
                print(f"Cache unreadable, downloading instead: {e}")

        # Download from API
        print(f"π Downloading questions from: {self.questions_url}")
        try:
            # Only the network round-trip is guarded; local file errors below
            # should surface as-is rather than be mistaken for download errors.
            response = requests.get(self.questions_url, timeout=30)
            response.raise_for_status()
            questions = response.json()
        except requests.exceptions.RequestException as e:
            print(f"β Error downloading questions: {e}")
            # Fallback to cache if available
            if self.questions_cache_file.exists():
                print("π¦ Falling back to cached questions")
                return self._read_json(self.questions_cache_file)
            raise

        if not questions:
            raise ValueError("Fetched questions list is empty")

        # Cache the questions
        self._write_json(self.questions_cache_file, questions)

        # Update metadata
        metadata = {
            "download_time": datetime.now().isoformat(),
            "question_count": len(questions),
            "api_url": self.questions_url,
        }
        self._write_json(self.metadata_file, metadata)

        print(f"β Downloaded and cached {len(questions)} questions")
        return questions

    def get_cached_metadata(self) -> Optional[Dict[str, Any]]:
        """Return the metadata written by the last download, or None if absent."""
        if self.metadata_file.exists():
            return self._read_json(self.metadata_file)
        return None

    def save_results(self, results: List[Dict[str, Any]], filename: Optional[str] = None) -> Path:
        """Save test results as JSON inside the cache directory.

        Args:
            results: List of result dictionaries.
            filename: Optional filename. If not provided, a timestamped
                ``results_YYYYMMDD_HHMMSS.json`` name is used.

        Returns:
            Path of the file that was written.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"results_{timestamp}.json"
        filepath = self.cache_dir / filename
        self._write_json(filepath, results)
        print(f"πΎ Results saved to: {filepath}")
        return filepath

    def load_dotenv(self) -> None:
        """Load ``KEY=VALUE`` pairs from ``.env`` in the current directory into os.environ.

        Blank lines, ``#`` comment lines, lines without ``=`` and lines with an
        empty key are skipped. One pair of matching surrounding quotes is
        removed from a value. Existing environment variables are overwritten.
        """
        env_file = Path(".env")
        if not env_file.exists():
            print("β οΈ No .env file found")
            return

        print("π Loading environment variables from .env")
        # utf-8-sig: explicit encoding, and a leading BOM does not end up in the first key.
        with open(env_file, 'r', encoding='utf-8-sig') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#') or '=' not in line:
                    continue
                key, value = line.split('=', 1)
                key = key.strip()
                value = value.strip()
                if not key:
                    # "=value" would make os.environ reject the empty name.
                    continue
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                    value = value[1:-1]
                os.environ[key] = value
        print("β Environment variables loaded")
def ensure_local_testing_setup() -> GAIADatasetManager:
    """Prepare a GAIADatasetManager for local testing.

    Creates a manager with the default cache directory, loads variables from
    ``.env`` and makes sure the questions are available (cached or downloaded).

    Returns:
        GAIADatasetManager instance with questions cached.

    Raises:
        Exception: Whatever ``download_questions`` raised, re-raised unchanged
            after a failure message has been printed.
    """
    print("π§ Setting up for local testing...")

    manager = GAIADatasetManager()
    # Load environment variables
    manager.load_dotenv()

    # Download and cache questions
    try:
        questions = manager.download_questions()
    except Exception as e:
        print(f"β Failed to download questions: {e}")
        # Bare raise re-raises the active exception with its traceback intact.
        raise

    print(f"β Local testing setup complete ({len(questions)} questions cached)")
    return manager
if __name__ == "__main__":
    # Manual smoke test: run the local setup, then print the cache metadata.
    banner = "=" * 60
    print(banner)
    print("GAIA Dataset Manager Test")
    print(banner)

    dataset_manager = ensure_local_testing_setup()

    # Metadata is absent (None) when no download has been recorded yet.
    cache_info = dataset_manager.get_cached_metadata()
    if cache_info:
        print("\nπ Cache Metadata:")
        for field_name in cache_info:
            print(f" {field_name}: {cache_info[field_name]}")