"""
Paper dataset and loading utilities.
"""

from pathlib import Path
from typing import List, Dict, Any, Optional
import json


class PaperDataset:
    """
    Manages collection of research papers for reproduction.
    Organizes by difficulty level.

    Papers are read once, at construction time, from
    ``<data_dir>/<difficulty>/*.json`` and kept in ``self.papers``, a dict
    mapping each difficulty name to the list of paper dicts loaded for it.
    """

    # Difficulty buckets, in the order they are pooled when no difficulty
    # filter is given.
    _DIFFICULTIES = ('easy', 'medium', 'hard')

    def __init__(self, data_dir: str = "data/papers"):
        """
        Load every paper found under ``data_dir``.

        Args:
            data_dir: Root directory containing one sub-directory per
                difficulty level. Missing sub-directories are treated as
                empty.
        """
        self.data_dir = Path(data_dir)
        self.papers = self._load_papers()

    def _load_papers(self) -> Dict[str, List[Dict[str, Any]]]:
        """
        Load papers from data directory.

        Loading is best-effort: a file that cannot be read, is not valid
        JSON, or does not contain a JSON object is reported on stdout and
        skipped rather than aborting the whole load.

        Returns:
            Mapping of difficulty name to the papers loaded for it. Each
            paper dict has its ``'difficulty'`` key set to the name of the
            directory it was found in.
        """
        papers: Dict[str, List[Dict[str, Any]]] = {
            difficulty: [] for difficulty in self._DIFFICULTIES
        }

        for difficulty in self._DIFFICULTIES:
            difficulty_dir = self.data_dir / difficulty
            if not difficulty_dir.is_dir():
                continue

            # glob() yields entries in filesystem order; sort so that a given
            # index always refers to the same paper on every platform.
            for paper_file in sorted(difficulty_dir.glob('*.json')):
                try:
                    # JSON is UTF-8 by specification; don't rely on the
                    # locale's default encoding.
                    with open(paper_file, encoding='utf-8') as f:
                        paper_data = json.load(f)
                    if not isinstance(paper_data, dict):
                        raise TypeError(
                            "expected a JSON object, got "
                            f"{type(paper_data).__name__}"
                        )
                    paper_data['difficulty'] = difficulty
                except Exception as e:
                    # Deliberately broad: one bad file must not prevent the
                    # remaining papers from loading.
                    print(f"⚠️  Failed to load {paper_file}: {e}")
                else:
                    papers[difficulty].append(paper_data)

        return papers

    def _select(self, difficulty: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Return the papers for one difficulty, or all papers pooled together.

        A falsy ``difficulty`` (``None`` or ``""``) means "all papers"; an
        unknown difficulty yields an empty list.
        """
        if difficulty:
            return self.papers.get(difficulty, [])
        return [paper for plist in self.papers.values() for paper in plist]

    def get_paper(
        self,
        difficulty: Optional[str] = None,
        index: int = 0
    ) -> Optional[Dict[str, Any]]:
        """
        Get a specific paper.

        Args:
            difficulty: Restrict the lookup to this difficulty level. When
                omitted, ``index`` refers to all papers pooled together.
            index: Position of the paper. Negative values count from the
                end, as with list indexing.

        Returns:
            The paper dict, or ``None`` if ``index`` is out of range in
            either direction or the difficulty is unknown.
        """
        papers_list = self._select(difficulty)
        # Check both bounds: guarding only the upper one lets an out-of-range
        # negative index raise IndexError instead of returning None.
        if -len(papers_list) <= index < len(papers_list):
            return papers_list[index]
        return None

    def get_random_paper(self, difficulty: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """
        Get random paper.

        Args:
            difficulty: Restrict the choice to this difficulty level; when
                omitted, choose among all papers.

        Returns:
            A randomly chosen paper dict, or ``None`` if there are no
            candidates.
        """
        import random

        papers_list = self._select(difficulty)
        if not papers_list:
            return None
        return random.choice(papers_list)

    def count(self, difficulty: Optional[str] = None) -> int:
        """
        Count papers.

        Args:
            difficulty: Count only this difficulty level; when omitted,
                count all papers.

        Returns:
            Number of matching papers (0 for an unknown difficulty).
        """
        return len(self._select(difficulty))


# Example paper template: a complete paper record showing the fields used by
# the sample papers. create_sample_papers() writes a copy of it (with a
# "difficulty" key added) as the first "easy" sample.
SAMPLE_PAPER_TEMPLATE = {
    "title": "ResNet-50 on CIFAR-10",
    "dataset": "CIFAR-10",
    "model": "ResNet-50",
    "target_metric": 0.95,
    "metric_name": "accuracy",
    "github_url": "https://github.com/example/resnet-cifar10",
    "key_claims": [
        "Achieves 95% accuracy on CIFAR-10",
        "Uses standard data augmentation",
        "Trains in 200 epochs"
    ],
    "ground_truth_config": {
        "learning_rate": 0.0001,
        "batch_size": 64,
        "optimizer": "adamw",
        "epochs": 200,
        "weight_decay": 0.01,
        "scheduler": "cosine"
    }
}


def create_sample_papers(data_dir: str = "data/papers") -> None:
    """
    Create sample paper dataset for all difficulty levels.

    Writes one JSON file per sample paper to ``<data_dir>/<difficulty>/``,
    creating directories as needed and overwriting files that already exist,
    then prints a one-line summary.

    Args:
        data_dir: Root directory to write into. Defaults to the same
            location ``PaperDataset`` reads from by default.
    """
    # Local import: the module's top-level imports do not include ``copy``.
    import copy

    # Deep copy so the nested list/dict of the written record are not shared
    # with the module-level template.
    easy_paper_1 = copy.deepcopy(SAMPLE_PAPER_TEMPLATE)
    easy_paper_1["difficulty"] = "easy"

    easy_paper_2 = {
        "title": "Simple CNN for MNIST Digit Classification",
        "dataset": "MNIST",
        "model": "CNN-Small",
        "target_metric": 0.99,
        "metric_name": "accuracy",
        "github_url": "https://github.com/pytorch/examples",
        "key_claims": [
            "Achieves 99% accuracy on MNIST",
            "Simple 2-layer CNN architecture",
            "Trains in under 10 epochs"
        ],
        "ground_truth_config": {
            "learning_rate": 0.001,
            "batch_size": 64,
            "optimizer": "adam",
            "epochs": 10
        },
        "difficulty": "easy"
    }

    medium_paper = {
        "title": "Fine-tuning BERT for Text Classification",
        "dataset": "GLUE-SST2",
        "model": "BERT-base",
        "target_metric": 0.92,
        "metric_name": "accuracy",
        "github_url": "https://github.com/huggingface/transformers",
        "key_claims": [
            "Achieves 92% accuracy on SST-2",
            "Fine-tunes pre-trained BERT-base",
            "Requires careful learning rate tuning"
        ],
        "ground_truth_config": {
            "learning_rate": 2e-5,
            "batch_size": 32,
            "optimizer": "adamw",
            "epochs": 3,
            "warmup_steps": 500,
            "weight_decay": 0.01
        },
        "difficulty": "medium"
    }

    hard_paper = {
        "title": "Progressive GAN for High-Resolution Image Generation",
        "dataset": "CelebA-HQ",
        "model": "ProGAN",
        "target_metric": 0.85,
        "metric_name": "FID_score_inverse",
        "github_url": "",
        "key_claims": [
            "Generates 1024x1024 face images",
            "Uses progressive training curriculum",
            "Achieves FID of 7.3 on CelebA-HQ"
        ],
        "ground_truth_config": {
            "learning_rate": 0.001,
            "batch_size": 16,
            "optimizer": "adam",
            "epochs": 600,
            "latent_dim": 512,
            "beta1": 0.0,
            "beta2": 0.99
        },
        "difficulty": "hard"
    }

    # difficulty -> [(file name, paper record), ...]; one table drives both
    # the writes and the summary so the two cannot drift apart.
    samples = {
        "easy": [
            ("resnet_cifar10.json", easy_paper_1),
            ("mnist_cnn.json", easy_paper_2),
        ],
        "medium": [("bert_finetuning.json", medium_paper)],
        "hard": [("gan_generation.json", hard_paper)],
    }

    root = Path(data_dir)
    for difficulty, entries in samples.items():
        target_dir = root / difficulty
        target_dir.mkdir(parents=True, exist_ok=True)
        for filename, paper in entries:
            # Explicit encoding so the files do not depend on the locale.
            with open(target_dir / filename, 'w', encoding='utf-8') as f:
                json.dump(paper, f, indent=2)

    summary = ", ".join(
        f"{difficulty}: {len(entries)}" for difficulty, entries in samples.items()
    )
    print(f"[OK] Sample papers created ({summary})")


# Script entry point: when this file is executed directly (not imported),
# write the sample paper JSON files to disk.
if __name__ == "__main__":
    create_sample_papers()