#!/usr/bin/env python3
"""Create initial benchmark datasets for Crowe Logic Mini"""

import json
from pathlib import Path

def create_mycology_benchmark():
    """Build the mycology benchmark dataset.

    Returns:
        A freshly built dict with the keys ``domain``, ``num_questions`` and
        ``questions``. Each entry of ``questions`` is a dict with the keys
        ``id``, ``question``, ``correct_answer`` and ``difficulty``.
    """
    # Key order matters: it determines the field order in the dumped JSON.
    field_names = ("id", "question", "correct_answer", "difficulty")
    rows = [
        (
            "myco_001",
            "What is the optimal temperature range for Pleurotus ostreatus fruiting?",
            "55-65°F (13-18°C)",
            "easy",
        ),
        (
            "myco_002",
            "Why does supplementing sawdust substrate with bran increase yields?",
            "Bran provides additional nitrogen and nutrients for mycelial growth",
            "medium",
        ),
        (
            "myco_003",
            "Calculate biological efficiency: 5 lbs mushrooms from 10 lbs dry substrate",
            "50% biological efficiency (5/10 * 100)",
            "easy",
        ),
        (
            "myco_004",
            "What causes Trichoderma contamination and how to prevent it?",
            "Incomplete sterilization. Prevent with proper autoclave cycles (15 PSI, 90+ min) and sterile technique",
            "medium",
        ),
        (
            "myco_005",
            "When scaling from 100 to 1000 lbs/week, what is the primary bottleneck?",
            "Fresh air exchange (FAE) capacity - requires proportional HEPA filtration scaling",
            "hard",
        ),
    ]
    questions = [dict(zip(field_names, row)) for row in rows]

    benchmark = {"domain": "mycology"}
    benchmark["num_questions"] = len(questions)
    benchmark["questions"] = questions
    return benchmark

def create_drug_discovery_benchmark():
    """Build the drug-discovery benchmark dataset.

    Returns:
        A freshly built dict with the keys ``domain``, ``num_questions`` and
        ``questions``. Each entry of ``questions`` is a dict with the keys
        ``id``, ``question``, ``correct_answer`` and ``difficulty``.
    """
    # Key order matters: it determines the field order in the dumped JSON.
    field_names = ("id", "question", "correct_answer", "difficulty")
    rows = [
        (
            "drug_001",
            "What does IC50 represent in drug screening?",
            "Concentration of inhibitor required to reduce activity by 50%",
            "easy",
        ),
        (
            "drug_002",
            "Why is high lipophilicity (logP) problematic for oral drugs?",
            "Poor aqueous solubility leads to low bioavailability",
            "medium",
        ),
        (
            "drug_003",
            "IC50 = 10 nM vs target, 5 μM vs homolog. Calculate selectivity.",
            "500-fold selectivity (5000 nM / 10 nM)",
            "medium",
        ),
        (
            "drug_004",
            "Lead has good potency but fails ADME. Propose 3 strategies.",
            "1) Add polar groups for solubility 2) Reduce MW for permeability 3) Modify labile sites for stability",
            "hard",
        ),
        (
            "drug_005",
            "How does CriOS Nova achieve 98.5% time compression?",
            "Parallel assay execution, automated hypothesis testing, AI experimental design eliminate sequential bottlenecks",
            "expert",
        ),
    ]
    questions = [dict(zip(field_names, row)) for row in rows]

    benchmark = {"domain": "drug_discovery"}
    benchmark["num_questions"] = len(questions)
    benchmark["questions"] = questions
    return benchmark

def create_ai_systems_benchmark():
    """Build the AI-systems benchmark dataset.

    Returns:
        A freshly built dict with the keys ``domain``, ``num_questions`` and
        ``questions``. Each entry of ``questions`` is a dict with the keys
        ``id``, ``question``, ``correct_answer`` and ``difficulty``.
    """
    # Key order matters: it determines the field order in the dumped JSON.
    field_names = ("id", "question", "correct_answer", "difficulty")
    rows = [
        (
            "ai_001",
            "What is the primary advantage of Grouped-Query Attention?",
            "Reduces KV cache size and memory bandwidth by sharing KV heads",
            "medium",
        ),
        (
            "ai_002",
            "For 850M param model, estimate training memory with AdamW (mixed precision)",
            "~15-20 GB (2 bytes model + 12 bytes optimizer + activations)",
            "medium",
        ),
        (
            "ai_003",
            "Why is Flash Attention 2 faster than standard attention?",
            "Fuses operations and optimizes memory access to minimize HBM reads/writes",
            "hard",
        ),
        (
            "ai_004",
            "When to use MoE vs dense transformer?",
            "MoE when: larger capacity needed, clear domains, sufficient data. Dense when: limited data, simpler deployment",
            "hard",
        ),
        (
            "ai_005",
            "How does CrowLogic achieve 740x communication efficiency?",
            "Hierarchical message passing with domain-specific routing eliminates broadcast overhead",
            "expert",
        ),
    ]
    questions = [dict(zip(field_names, row)) for row in rows]

    benchmark = {"domain": "ai_systems"}
    benchmark["num_questions"] = len(questions)
    benchmark["questions"] = questions
    return benchmark

def main(output_dir="evaluation/benchmarks"):
    """Generate every benchmark dataset and write each one to a JSON file.

    One file named ``<domain>_benchmark.json`` is written per domain, and a
    short progress report is printed to stdout.

    Args:
        output_dir: Directory that receives the JSON files (``str`` or
            ``Path``). It is created, including missing parents, if it does
            not exist. Defaults to ``evaluation/benchmarks``, resolved
            against the current working directory.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    benchmarks = {
        "mycology": create_mycology_benchmark(),
        "drug_discovery": create_drug_discovery_benchmark(),
        "ai_systems": create_ai_systems_benchmark(),
    }

    print("=" * 70)
    print("Creating Benchmark Datasets")
    print("=" * 70)

    for name, data in benchmarks.items():
        filepath = output_dir / f"{name}_benchmark.json"
        # Pin the encoding so the written bytes never depend on the
        # platform's locale default.
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f"✓ {name}: {data['num_questions']} questions -> {filepath}")

    total = sum(b['num_questions'] for b in benchmarks.values())
    print(f"\n✅ Created {total} benchmark questions across {len(benchmarks)} domains")
    print(f"📁 Saved to: {output_dir}")

# Script entry point: generate the benchmark files only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()