Spaces:
Runtime error
Runtime error
| """ | |
| Paper dataset and loading utilities. | |
| """ | |
| from pathlib import Path | |
| from typing import List, Dict, Any, Optional | |
| import json | |
class PaperDataset:
    """
    Manages collection of research papers for reproduction.

    Papers are JSON files stored as ``<data_dir>/<difficulty>/*.json`` where
    ``<difficulty>`` is ``easy``, ``medium`` or ``hard``. All files are parsed
    once at construction time and kept in memory, grouped by difficulty level.
    """

    # Difficulty levels, in the order used when papers of every level are
    # flattened into a single list.
    _DIFFICULTIES = ('easy', 'medium', 'hard')

    def __init__(self, data_dir: str = "data/papers"):
        """Load every paper found under *data_dir*.

        Args:
            data_dir: Root directory holding one sub-directory per difficulty
                level. Missing sub-directories simply contribute no papers.
        """
        self.data_dir = Path(data_dir)
        self.papers = self._load_papers()

    def _load_papers(self) -> Dict[str, List[Dict[str, Any]]]:
        """Load papers from data directory.

        Loading is best-effort: a file that cannot be read, is not valid
        JSON, or does not contain a JSON object is reported on stdout and
        skipped; it never aborts the whole load.

        Returns:
            Mapping from difficulty level to the list of paper dicts found
            for that level. Each dict gets a ``'difficulty'`` key set from
            the directory it was loaded from.
        """
        papers: Dict[str, List[Dict[str, Any]]] = {
            level: [] for level in self._DIFFICULTIES
        }
        for difficulty, loaded in papers.items():
            difficulty_dir = self.data_dir / difficulty
            if not difficulty_dir.is_dir():
                continue
            # glob() yields entries in arbitrary filesystem order; sort so
            # that index-based lookups are reproducible across machines.
            for paper_file in sorted(difficulty_dir.glob('*.json')):
                try:
                    with open(paper_file, encoding='utf-8') as f:
                        paper_data = json.load(f)
                except (OSError, ValueError) as e:
                    # ValueError covers both malformed JSON and bad UTF-8.
                    print(f"⚠️ Failed to load {paper_file}: {e}")
                    continue
                if not isinstance(paper_data, dict):
                    found = type(paper_data).__name__
                    print(
                        f"⚠️ Failed to load {paper_file}: "
                        f"expected a JSON object, got {found}"
                    )
                    continue
                # The directory is authoritative for the difficulty level.
                paper_data['difficulty'] = difficulty
                loaded.append(paper_data)
        return papers

    def _select(self, difficulty: Optional[str]) -> List[Dict[str, Any]]:
        """Return the papers of one difficulty, or of all levels if falsy."""
        if difficulty:
            return self.papers.get(difficulty, [])
        return [paper for plist in self.papers.values() for paper in plist]

    def get_paper(
        self,
        difficulty: Optional[str] = None,
        index: int = 0
    ) -> Optional[Dict[str, Any]]:
        """Get a specific paper.

        Args:
            difficulty: Level to pick from; when omitted (or empty) the
                papers of all levels are considered as one list.
            index: Position in the selected list. Negative values count
                from the end, as with regular list indexing.

        Returns:
            The paper dict, or ``None`` when the difficulty is unknown or
            the index is out of range in either direction.
        """
        candidates = self._select(difficulty)
        # Bound the index on both sides so an out-of-range negative index
        # returns None instead of raising IndexError.
        if -len(candidates) <= index < len(candidates):
            return candidates[index]
        return None

    def get_random_paper(self, difficulty: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """Get random paper.

        Args:
            difficulty: Level to pick from; when omitted (or empty) the
                paper is drawn from all levels.

        Returns:
            A randomly chosen paper dict, or ``None`` if none is available.
        """
        import random

        candidates = self._select(difficulty)
        if candidates:
            return random.choice(candidates)
        return None

    def count(self, difficulty: Optional[str] = None) -> int:
        """Count papers.

        Args:
            difficulty: Level to count; when omitted (or empty) papers of
                all levels are counted.

        Returns:
            Number of loaded papers; 0 for an unknown difficulty.
        """
        if difficulty:
            return len(self.papers.get(difficulty, []))
        return sum(len(plist) for plist in self.papers.values())
# Example paper record. create_sample_papers() copies it to produce the
# ResNet/CIFAR-10 "easy" sample.
SAMPLE_PAPER_TEMPLATE = {
    # What is being reproduced.
    "title": "ResNet-50 on CIFAR-10",
    "dataset": "CIFAR-10",
    "model": "ResNet-50",
    # Reported result: value and the name of the metric it refers to.
    "target_metric": 0.95,
    "metric_name": "accuracy",
    "github_url": "https://github.com/example/resnet-cifar10",
    "key_claims": [
        "Achieves 95% accuracy on CIFAR-10",
        "Uses standard data augmentation",
        "Trains in 200 epochs",
    ],
    # Training hyperparameters recorded for this paper.
    "ground_truth_config": {
        "learning_rate": 0.0001,
        "batch_size": 64,
        "optimizer": "adamw",
        "epochs": 200,
        "weight_decay": 0.01,
        "scheduler": "cosine",
    },
}
def create_sample_papers(data_dir: str = "data/papers"):
    """Create sample paper dataset for all difficulty levels.

    Writes four JSON files (two easy, one medium, one hard) into
    ``<data_dir>/<difficulty>/``, creating directories as needed and
    overwriting files that already exist, then prints a confirmation line.

    Args:
        data_dir: Root directory to populate. Defaults to ``"data/papers"``,
            the same default ``PaperDataset`` reads from.
    """
    root = Path(data_dir)

    mnist_paper = {
        "title": "Simple CNN for MNIST Digit Classification",
        "dataset": "MNIST",
        "model": "CNN-Small",
        "target_metric": 0.99,
        "metric_name": "accuracy",
        "github_url": "https://github.com/pytorch/examples",
        "key_claims": [
            "Achieves 99% accuracy on MNIST",
            "Simple 2-layer CNN architecture",
            "Trains in under 10 epochs"
        ],
        "ground_truth_config": {
            "learning_rate": 0.001,
            "batch_size": 64,
            "optimizer": "adam",
            "epochs": 10
        },
        "difficulty": "easy"
    }

    bert_paper = {
        "title": "Fine-tuning BERT for Text Classification",
        "dataset": "GLUE-SST2",
        "model": "BERT-base",
        "target_metric": 0.92,
        "metric_name": "accuracy",
        "github_url": "https://github.com/huggingface/transformers",
        "key_claims": [
            "Achieves 92% accuracy on SST-2",
            "Fine-tunes pre-trained BERT-base",
            "Requires careful learning rate tuning"
        ],
        "ground_truth_config": {
            "learning_rate": 2e-5,
            "batch_size": 32,
            "optimizer": "adamw",
            "epochs": 3,
            "warmup_steps": 500,
            "weight_decay": 0.01
        },
        "difficulty": "medium"
    }

    gan_paper = {
        "title": "Progressive GAN for High-Resolution Image Generation",
        "dataset": "CelebA-HQ",
        "model": "ProGAN",
        "target_metric": 0.85,
        "metric_name": "FID_score_inverse",
        "github_url": "",
        "key_claims": [
            "Generates 1024x1024 face images",
            "Uses progressive training curriculum",
            "Achieves FID of 7.3 on CelebA-HQ"
        ],
        "ground_truth_config": {
            "learning_rate": 0.001,
            "batch_size": 16,
            "optimizer": "adam",
            "epochs": 600,
            "latent_dim": 512,
            "beta1": 0.0,
            "beta2": 0.99
        },
        "difficulty": "hard"
    }

    # (difficulty, file name, paper) in the order the files are written.
    # The first entry is a copy of the module-level template, so tagging it
    # with a difficulty does not modify SAMPLE_PAPER_TEMPLATE itself.
    samples = [
        ("easy", "resnet_cifar10.json",
         {**SAMPLE_PAPER_TEMPLATE, "difficulty": "easy"}),
        ("easy", "mnist_cnn.json", mnist_paper),
        ("medium", "bert_finetuning.json", bert_paper),
        ("hard", "gan_generation.json", gan_paper),
    ]

    for difficulty, file_name, paper in samples:
        target_dir = root / difficulty
        target_dir.mkdir(parents=True, exist_ok=True)
        with open(target_dir / file_name, 'w', encoding='utf-8') as f:
            json.dump(paper, f, indent=2)

    print("[OK] Sample papers created (easy: 2, medium: 1, hard: 1)")
# Running this module as a script writes the sample dataset to disk;
# importing it does not.
if __name__ == "__main__":
    create_sample_papers()