crowe-logic-mini / evaluation /create_benchmarks.py
mike1210's picture
Upload evaluation/create_benchmarks.py with huggingface_hub
27efa47 verified
#!/usr/bin/env python3
"""Create initial benchmark datasets for Crowe Logic Mini"""
import json
from pathlib import Path
def create_mycology_benchmark():
    """Build the mycology benchmark dataset.

    Returns:
        dict: A mapping with the keys ``domain``, ``num_questions`` and
        ``questions`` (in that order). Each entry of ``questions`` is a dict
        with ``id``, ``question``, ``correct_answer`` and ``difficulty``.
    """
    questions = [
        {
            "id": "myco_001",
            "question": "What is the optimal temperature range for Pleurotus ostreatus fruiting?",
            "correct_answer": "55-65°F (13-18°C)",
            "difficulty": "easy",
        },
        {
            "id": "myco_002",
            "question": "Why does supplementing sawdust substrate with bran increase yields?",
            "correct_answer": "Bran provides additional nitrogen and nutrients for mycelial growth",
            "difficulty": "medium",
        },
        {
            "id": "myco_003",
            "question": "Calculate biological efficiency: 5 lbs mushrooms from 10 lbs dry substrate",
            "correct_answer": "50% biological efficiency (5/10 * 100)",
            "difficulty": "easy",
        },
        {
            "id": "myco_004",
            "question": "What causes Trichoderma contamination and how to prevent it?",
            "correct_answer": "Incomplete sterilization. Prevent with proper autoclave cycles (15 PSI, 90+ min) and sterile technique",
            "difficulty": "medium",
        },
        {
            "id": "myco_005",
            "question": "When scaling from 100 to 1000 lbs/week, what is the primary bottleneck?",
            "correct_answer": "Fresh air exchange (FAE) capacity - requires proportional HEPA filtration scaling",
            "difficulty": "hard",
        },
    ]
    return {
        "domain": "mycology",
        # Derived from the list so the count cannot drift when questions
        # are added or removed.
        "num_questions": len(questions),
        "questions": questions,
    }
def create_drug_discovery_benchmark():
    """Build the drug-discovery benchmark dataset.

    Returns:
        dict: A mapping with the keys ``domain``, ``num_questions`` and
        ``questions`` (in that order). Each entry of ``questions`` is a dict
        with ``id``, ``question``, ``correct_answer`` and ``difficulty``.
    """
    questions = [
        {
            "id": "drug_001",
            "question": "What does IC50 represent in drug screening?",
            "correct_answer": "Concentration of inhibitor required to reduce activity by 50%",
            "difficulty": "easy",
        },
        {
            "id": "drug_002",
            "question": "Why is high lipophilicity (logP) problematic for oral drugs?",
            "correct_answer": "Poor aqueous solubility leads to low bioavailability",
            "difficulty": "medium",
        },
        {
            "id": "drug_003",
            "question": "IC50 = 10 nM vs target, 5 μM vs homolog. Calculate selectivity.",
            "correct_answer": "500-fold selectivity (5000 nM / 10 nM)",
            "difficulty": "medium",
        },
        {
            "id": "drug_004",
            "question": "Lead has good potency but fails ADME. Propose 3 strategies.",
            "correct_answer": "1) Add polar groups for solubility 2) Reduce MW for permeability 3) Modify labile sites for stability",
            "difficulty": "hard",
        },
        {
            "id": "drug_005",
            "question": "How does CriOS Nova achieve 98.5% time compression?",
            "correct_answer": "Parallel assay execution, automated hypothesis testing, AI experimental design eliminate sequential bottlenecks",
            "difficulty": "expert",
        },
    ]
    return {
        "domain": "drug_discovery",
        # Derived from the list so the count cannot drift when questions
        # are added or removed.
        "num_questions": len(questions),
        "questions": questions,
    }
def create_ai_systems_benchmark():
    """Build the AI-systems benchmark dataset.

    Returns:
        dict: A mapping with the keys ``domain``, ``num_questions`` and
        ``questions`` (in that order). Each entry of ``questions`` is a dict
        with ``id``, ``question``, ``correct_answer`` and ``difficulty``.
    """
    questions = [
        {
            "id": "ai_001",
            "question": "What is the primary advantage of Grouped-Query Attention?",
            "correct_answer": "Reduces KV cache size and memory bandwidth by sharing KV heads",
            "difficulty": "medium",
        },
        {
            "id": "ai_002",
            "question": "For 850M param model, estimate training memory with AdamW (mixed precision)",
            "correct_answer": "~15-20 GB (2 bytes model + 12 bytes optimizer + activations)",
            "difficulty": "medium",
        },
        {
            "id": "ai_003",
            "question": "Why is Flash Attention 2 faster than standard attention?",
            "correct_answer": "Fuses operations and optimizes memory access to minimize HBM reads/writes",
            "difficulty": "hard",
        },
        {
            "id": "ai_004",
            "question": "When to use MoE vs dense transformer?",
            "correct_answer": "MoE when: larger capacity needed, clear domains, sufficient data. Dense when: limited data, simpler deployment",
            "difficulty": "hard",
        },
        {
            "id": "ai_005",
            "question": "How does CrowLogic achieve 740x communication efficiency?",
            "correct_answer": "Hierarchical message passing with domain-specific routing eliminates broadcast overhead",
            "difficulty": "expert",
        },
    ]
    return {
        "domain": "ai_systems",
        # Derived from the list so the count cannot drift when questions
        # are added or removed.
        "num_questions": len(questions),
        "questions": questions,
    }
def main(output_dir="evaluation/benchmarks"):
    """Write every benchmark dataset to ``<output_dir>/<name>_benchmark.json``.

    Prints a progress line per file and a summary at the end.

    Args:
        output_dir: Directory to write into, as a ``str`` or ``Path``. It is
            created (including parents) if missing. A relative path is
            resolved against the current working directory. Defaults to
            ``evaluation/benchmarks``.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    benchmarks = {
        "mycology": create_mycology_benchmark(),
        "drug_discovery": create_drug_discovery_benchmark(),
        "ai_systems": create_ai_systems_benchmark(),
    }

    print("=" * 70)
    print("Creating Benchmark Datasets")
    print("=" * 70)

    for name, data in benchmarks.items():
        filepath = output_dir / f"{name}_benchmark.json"
        # Pin the encoding so the result never depends on the platform's
        # locale default. json.dump keeps its default ASCII escaping, so the
        # bytes written are the same as before.
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f"✓ {name}: {data['num_questions']} questions -> {filepath}")

    total = sum(b['num_questions'] for b in benchmarks.values())
    print(f"\n✅ Created {total} benchmark questions across {len(benchmarks)} domains")
    print(f"📁 Saved to: {output_dir}")
# Generate the benchmark files only when this file is run as a script;
# importing it as a module must not write anything to disk.
if __name__ == "__main__":
    main()