crowe-logic-mini / evaluation /create_benchmarks.py

Upload evaluation/create_benchmarks.py with huggingface_hub

27efa47 verified 6 months ago

5.98 kB

	#!/usr/bin/env python3
	"""Create initial benchmark datasets for Crowe Logic Mini"""

	import json
	from pathlib import Path

	def create_mycology_benchmark():
	    """Build the mycology benchmark dataset.

	    Returns:
	        A new dict with the keys ``"domain"``, ``"num_questions"`` and
	        ``"questions"``. Each entry of ``"questions"`` is a dict with the
	        keys ``"id"``, ``"question"``, ``"correct_answer"`` and
	        ``"difficulty"``.
	    """
	    questions = [
	        {
	            "id": "myco_001",
	            "question": "What is the optimal temperature range for Pleurotus ostreatus fruiting?",
	            "correct_answer": "55-65°F (13-18°C)",
	            "difficulty": "easy",
	        },
	        {
	            "id": "myco_002",
	            "question": "Why does supplementing sawdust substrate with bran increase yields?",
	            "correct_answer": "Bran provides additional nitrogen and nutrients for mycelial growth",
	            "difficulty": "medium",
	        },
	        {
	            "id": "myco_003",
	            "question": "Calculate biological efficiency: 5 lbs mushrooms from 10 lbs dry substrate",
	            "correct_answer": "50% biological efficiency (5/10 * 100)",
	            "difficulty": "easy",
	        },
	        {
	            "id": "myco_004",
	            "question": "What causes Trichoderma contamination and how to prevent it?",
	            "correct_answer": "Incomplete sterilization. Prevent with proper autoclave cycles (15 PSI, 90+ min) and sterile technique",
	            "difficulty": "medium",
	        },
	        {
	            "id": "myco_005",
	            "question": "When scaling from 100 to 1000 lbs/week, what is the primary bottleneck?",
	            "correct_answer": "Fresh air exchange (FAE) capacity - requires proportional HEPA filtration scaling",
	            "difficulty": "hard",
	        },
	    ]
	    return {
	        "domain": "mycology",
	        # Derived from the list so the count cannot drift when questions
	        # are added or removed.
	        "num_questions": len(questions),
	        "questions": questions,
	    }

	def create_drug_discovery_benchmark():
	    """Build the drug-discovery benchmark dataset.

	    Returns:
	        A new dict with the keys ``"domain"``, ``"num_questions"`` and
	        ``"questions"``. Each entry of ``"questions"`` is a dict with the
	        keys ``"id"``, ``"question"``, ``"correct_answer"`` and
	        ``"difficulty"``.
	    """
	    questions = [
	        {
	            "id": "drug_001",
	            "question": "What does IC50 represent in drug screening?",
	            "correct_answer": "Concentration of inhibitor required to reduce activity by 50%",
	            "difficulty": "easy",
	        },
	        {
	            "id": "drug_002",
	            "question": "Why is high lipophilicity (logP) problematic for oral drugs?",
	            "correct_answer": "Poor aqueous solubility leads to low bioavailability",
	            "difficulty": "medium",
	        },
	        {
	            "id": "drug_003",
	            "question": "IC50 = 10 nM vs target, 5 μM vs homolog. Calculate selectivity.",
	            "correct_answer": "500-fold selectivity (5000 nM / 10 nM)",
	            "difficulty": "medium",
	        },
	        {
	            "id": "drug_004",
	            "question": "Lead has good potency but fails ADME. Propose 3 strategies.",
	            "correct_answer": "1) Add polar groups for solubility 2) Reduce MW for permeability 3) Modify labile sites for stability",
	            "difficulty": "hard",
	        },
	        {
	            "id": "drug_005",
	            "question": "How does CriOS Nova achieve 98.5% time compression?",
	            "correct_answer": "Parallel assay execution, automated hypothesis testing, AI experimental design eliminate sequential bottlenecks",
	            "difficulty": "expert",
	        },
	    ]
	    return {
	        "domain": "drug_discovery",
	        # Derived from the list so the count cannot drift when questions
	        # are added or removed.
	        "num_questions": len(questions),
	        "questions": questions,
	    }

	def create_ai_systems_benchmark():
	    """Build the AI-systems benchmark dataset.

	    Returns:
	        A new dict with the keys ``"domain"``, ``"num_questions"`` and
	        ``"questions"``. Each entry of ``"questions"`` is a dict with the
	        keys ``"id"``, ``"question"``, ``"correct_answer"`` and
	        ``"difficulty"``.
	    """
	    questions = [
	        {
	            "id": "ai_001",
	            "question": "What is the primary advantage of Grouped-Query Attention?",
	            "correct_answer": "Reduces KV cache size and memory bandwidth by sharing KV heads",
	            "difficulty": "medium",
	        },
	        {
	            "id": "ai_002",
	            "question": "For 850M param model, estimate training memory with AdamW (mixed precision)",
	            "correct_answer": "~15-20 GB (2 bytes model + 12 bytes optimizer + activations)",
	            "difficulty": "medium",
	        },
	        {
	            "id": "ai_003",
	            "question": "Why is Flash Attention 2 faster than standard attention?",
	            "correct_answer": "Fuses operations and optimizes memory access to minimize HBM reads/writes",
	            "difficulty": "hard",
	        },
	        {
	            "id": "ai_004",
	            "question": "When to use MoE vs dense transformer?",
	            "correct_answer": "MoE when: larger capacity needed, clear domains, sufficient data. Dense when: limited data, simpler deployment",
	            "difficulty": "hard",
	        },
	        {
	            "id": "ai_005",
	            "question": "How does CrowLogic achieve 740x communication efficiency?",
	            "correct_answer": "Hierarchical message passing with domain-specific routing eliminates broadcast overhead",
	            "difficulty": "expert",
	        },
	    ]
	    return {
	        "domain": "ai_systems",
	        # Derived from the list so the count cannot drift when questions
	        # are added or removed.
	        "num_questions": len(questions),
	        "questions": questions,
	    }

	def main(output_dir: "str | Path" = "evaluation/benchmarks") -> None:
	    """Write every benchmark dataset to disk as pretty-printed JSON.

	    One file named ``<domain>_benchmark.json`` is written per domain, and
	    a short progress report is printed to stdout.

	    Args:
	        output_dir: Directory that receives the JSON files. It is created
	            (including parents) if missing. The default is relative to
	            the current working directory.
	    """
	    output_dir = Path(output_dir)
	    output_dir.mkdir(parents=True, exist_ok=True)

	    benchmarks = {
	        "mycology": create_mycology_benchmark(),
	        "drug_discovery": create_drug_discovery_benchmark(),
	        "ai_systems": create_ai_systems_benchmark(),
	    }

	    print("=" * 70)
	    print("Creating Benchmark Datasets")
	    print("=" * 70)

	    for name, data in benchmarks.items():
	        filepath = output_dir / f"{name}_benchmark.json"
	        # Explicit encoding keeps the result independent of the platform
	        # locale. json.dump's default ensure_ascii=True escapes non-ASCII
	        # characters, so the text written is plain ASCII.
	        with open(filepath, "w", encoding="utf-8") as f:
	            json.dump(data, f, indent=2)
	        print(f"✓ {name}: {data['num_questions']} questions -> {filepath}")

	    total = sum(b['num_questions'] for b in benchmarks.values())
	    print(f"\n✅ Created {total} benchmark questions across {len(benchmarks)} domains")
	    print(f"📁 Saved to: {output_dir}")

	# Run the generator only when this file is executed as a script, not when
	# it is imported.
	if __name__ == "__main__":
	    main()