"""
Script to download and materialize SWE-bench Lite tasks.

This script:
1. Downloads SWE-bench Lite dataset from HuggingFace
2. Writes a placeholder buggy.py, a test.py and a metadata.json for each task
3. Stores them in one directory per instance ID under the output directory

Usage:
    python -m dataset.prepare_swebench [--max-tasks N] [--difficulty easy|medium|hard|all] [--output-dir DIR]
"""

import argparse
import os
import sys
from pathlib import Path

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from datasets import load_dataset


def get_problem_statement(row):
    """Return the row's ``problem_statement`` entry, or ``""`` if the key is absent."""
    field = "problem_statement"
    statement = row.get(field, "")
    return statement


def get_patch(row):
    """Return the row's ``patch`` entry, or ``""`` if the key is absent."""
    field = "patch"
    fix = row.get(field, "")
    return fix


def get_instance_id(row):
    """Return the row's ``instance_id`` entry, or ``""`` if the key is absent."""
    field = "instance_id"
    identifier = row.get(field, "")
    return identifier


def create_buggy_file(instance_dir: Path, row):
    """
    Write a placeholder ``buggy.py`` for one task and return its path.

    The real pre-patch source is NOT reconstructed here; doing so would
    require checking out the repository at ``base_commit``.  Instead a stub
    module is written: header comments recording the instance id, the first
    entry of the row's ``created_files`` value (when that value is
    non-empty) and the first 200 characters of the problem statement,
    followed by an empty ``solution()`` function.

    Args:
        instance_dir: Existing directory that receives ``buggy.py``.
        row: Mapping for one dataset record.  The ``instance_id``,
            ``problem_statement`` and ``created_files`` keys are read;
            each may be missing.

    Returns:
        Path of the written ``buggy.py``.
    """
    instance_id = row.get("instance_id", "")
    problem_stmt = row.get("problem_statement", "")
    created_files = row.get("created_files", [])

    def one_line(value):
        # Each value is emitted after a single "#".  A newline inside it
        # would push the remainder onto an uncommented line and make the
        # generated module unparseable, so collapse all whitespace runs.
        return " ".join(str(value).split())

    header = [f"# Buggy code for {one_line(instance_id)}"]
    if created_files:
        header.append(f"# File: {one_line(created_files[0])}")
    header.append(f"# Problem: {one_line(problem_stmt[:200])}...")

    buggy_code = (
        "\n".join(header)
        + "\n\n"
        + "def solution():\n"
        + '    """Placeholder solution - needs to be fixed."""\n'
        + "    pass\n"
    )

    buggy_file = instance_dir / "buggy.py"
    buggy_file.write_text(buggy_code, encoding="utf-8")
    return buggy_file


def create_test_file(instance_dir: Path, row):
    """
    Write a ``test.py`` skeleton for one task and return its path.

    When the row carries a non-empty ``test_cases`` list, a single
    ``TestSolution`` class is generated with one stub method per case
    (``test_case_1``, ``test_case_2``, ...), each recording the case's
    ``input`` and ``output`` values as comments.  Otherwise a single
    ``test_basic`` method is generated that imports ``solution`` from
    ``buggy`` and asserts its result is not ``None``.

    Args:
        instance_dir: Existing directory that receives ``test.py``.
        row: Mapping for one dataset record.  The ``instance_id``,
            ``problem_statement`` and ``test_cases`` keys are read; each
            may be missing.

    Returns:
        Path of the written ``test.py``.
    """
    instance_id = row.get("instance_id", "")
    problem_stmt = row.get("problem_statement", "")
    test_cases = row.get("test_cases", [])

    def one_line(value):
        # Values are embedded in "#" comments (or a one-line docstring); an
        # embedded newline would leak text into the generated code, so
        # collapse all whitespace runs into single spaces.
        return " ".join(str(value).split())

    lines = [f'"""Test file for {one_line(instance_id)}"""', "", "import unittest"]

    if test_cases:
        # One class holding every case: re-declaring the class per case
        # would leave only the last case visible to unittest.
        lines += ["", "", "class TestSolution(unittest.TestCase):"]
        for number, case in enumerate(test_cases, start=1):
            if number > 1:
                lines.append("")
            case_input = one_line(case.get("input", ""))
            case_output = one_line(case.get("output", ""))
            lines += [
                f"    def test_case_{number}(self):",
                f"        # Input: {case_input}",
                f"        # Expected: {case_output}",
                "        pass  # TODO: Add actual test",
            ]
    else:
        summary = one_line(problem_stmt[:300])
        lines += [
            "from buggy import solution",
            "",
            "",
            "class TestSolution(unittest.TestCase):",
            "    def test_basic(self):",
            '        """Test based on problem statement."""',
            f"        # Problem: {summary}...",
            "        result = solution()",
            "        self.assertIsNotNone(result)",
        ]

    lines += ["", "", 'if __name__ == "__main__":', "    unittest.main()"]
    test_code = "\n".join(lines) + "\n"

    test_file = instance_dir / "test.py"
    test_file.write_text(test_code, encoding="utf-8")
    return test_file


def create_metadata_file(instance_dir: Path, row, difficulty: str = "medium"):
    """
    Write ``metadata.json`` describing one task and return its path.

    Args:
        instance_dir: Existing directory that receives ``metadata.json``.
        row: Mapping for one dataset record.  The ``instance_id``, ``repo``,
            ``base_commit``, ``problem_statement`` and ``patch`` keys are
            copied; a missing key is stored as ``""``.
        difficulty: Label stored under the ``"difficulty"`` key.  Nothing in
            the row determines it, so the caller must supply it; the default
            keeps the label that was previously hard-coded.

    Returns:
        Path of the written ``metadata.json``.
    """
    import json

    copied_keys = ("instance_id", "repo", "base_commit", "problem_statement", "patch")
    metadata = {key: row.get(key, "") for key in copied_keys}
    metadata["difficulty"] = difficulty

    metadata_file = instance_dir / "metadata.json"
    metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    return metadata_file


def _select_task_indices(total, max_tasks, difficulty):
    """Return the dataset row indices to materialize for *difficulty*."""
    # Difficulty is approximated purely by position: the dataset is cut into
    # three consecutive index ranges.  Bounds are clamped to ``total`` so a
    # very small dataset never yields an out-of-range index.
    one_third = min(max(total // 3, 1), total)
    two_third = min(max((2 * total) // 3, one_third + 1), total)
    difficulty_ranges = {
        "easy": (0, one_third),
        "medium": (one_third, two_third),
        "hard": (two_third, total),
    }
    budget = max(max_tasks, 0)

    if difficulty == "all":
        # Split the budget across the three ranges; the remainder goes to the
        # earlier ranges so that e.g. max_tasks=1 still selects one task.
        share, extra = divmod(budget, len(difficulty_ranges))
        indices = []
        for position, (start, end) in enumerate(difficulty_ranges.values()):
            quota = share + (1 if position < extra else 0)
            indices.extend(range(start, min(end, start + quota)))
        return indices

    start, end = difficulty_ranges.get(difficulty, (0, total))
    # The cap is relative to the range start; comparing against max_tasks
    # alone would select nothing for ranges that begin past that index.
    return list(range(start, min(end, start + budget)))


def prepare_swebench_tasks(
    output_dir: Path,
    max_tasks: int = 30,
    difficulty: str = "all"
):
    """
    Download SWE-bench Lite and write one task directory per selected row.

    Each selected row gets ``output_dir/<instance_id>/`` containing the files
    written by ``create_buggy_file``, ``create_test_file`` and
    ``create_metadata_file``.  A row that fails is reported and skipped; the
    remaining rows are still processed.

    Args:
        output_dir: Directory to save tasks (created if missing)
        max_tasks: Maximum number of tasks to prepare
        difficulty: "easy", "medium", "hard", or "all".  The levels are the
            first, middle and last third of the dataset by row index; "all"
            spreads ``max_tasks`` across the three thirds.  Any other value
            selects from the start of the whole dataset.
    """
    print("Loading SWE-bench Lite dataset...")

    try:
        ds = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
    except Exception as e:
        # Best-effort fallback to a second dataset id; a failure of that
        # second attempt propagates to the caller.
        print(f"Error loading dataset: {e}")
        print("Trying alternative dataset name...")
        ds = load_dataset("swe-bench/swe-bench-lite", split="test")

    print(f"Loaded {len(ds)} tasks")

    indices = _select_task_indices(len(ds), max_tasks, difficulty)

    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Preparing {len(indices)} tasks...")

    success_count = 0
    for processed, idx in enumerate(indices, start=1):
        try:
            row = ds[idx]
            instance_id = get_instance_id(row)
            if not instance_id:
                # An empty id would resolve to output_dir itself and the
                # task's files would be written over the root directory.
                raise ValueError("row has no instance_id")

            # Create instance directory
            instance_dir = output_dir / instance_id
            instance_dir.mkdir(parents=True, exist_ok=True)

            # Create files
            create_buggy_file(instance_dir, row)
            create_test_file(instance_dir, row)
            create_metadata_file(instance_dir, row)

            success_count += 1
            if processed % 10 == 0:
                print(f"  Processed {processed}/{len(indices)} tasks...")

        except Exception as e:
            # Deliberately broad: one bad row must not abort the whole run.
            print(f"  Warning: Failed to process task {idx}: {e}")
            continue

    print(f"\nDone! Prepared {success_count}/{len(indices)} tasks in {output_dir}")
    print(f"Set SWEBENCH_TASKS_ROOT={output_dir.absolute()} to use these tasks.")


def main():
    """Parse the command-line options and run the task preparation."""
    cli = argparse.ArgumentParser(description="Prepare SWE-bench Lite tasks")

    # Flags are registered in this order, which is also their --help order.
    option_specs = (
        (
            "--max-tasks",
            {
                "type": int,
                "default": 30,
                "help": "Maximum number of tasks to download (default: 30)",
            },
        ),
        (
            "--difficulty",
            {
                "type": str,
                "default": "all",
                "choices": ["easy", "medium", "hard", "all"],
                "help": "Difficulty level to download (default: all)",
            },
        ),
        (
            "--output-dir",
            {
                "type": str,
                "default": None,
                "help": "Output directory (default: dataset/swebench_lite_tasks)",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()

    # Without an explicit directory, write next to this script.
    target_dir = (
        Path(options.output_dir)
        if options.output_dir
        else Path(__file__).parent / "swebench_lite_tasks"
    )

    prepare_swebench_tasks(
        output_dir=target_dir,
        max_tasks=options.max_tasks,
        difficulty=options.difficulty,
    )


# Run the command-line entry point only when this file is executed as a
# script (or via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()