| | """
|
| | Science benchmarks for Vortex model.
|
| | Evaluates performance across 7 science domains.
|
| | """
|
| |
|
| | import torch
|
| | from typing import Dict, List, Tuple
|
| | from dataclasses import dataclass
|
| |
|
| |
|
@dataclass
class BenchmarkResult:
    """Outcome of a single benchmark run over one science domain.

    Instances are produced by ``ScienceBenchmark.evaluate``.
    """

    domain: str            # domain key, e.g. "physics" or "math"
    accuracy: float        # fraction of questions answered correctly (0.0-1.0)
    total_questions: int   # number of questions actually evaluated
    correct_answers: int   # how many of those were answered correctly
    details: List[Dict]    # per-question records: question/expected/generated/correct
|
| |
|
| |
|
class ScienceBenchmark:
    """Base class for science benchmarks.

    Subclasses provide the question set and the prompt/answer plumbing by
    implementing ``load_questions``, ``format_prompt``, ``extract_answer``
    and ``check_answer``; ``evaluate`` drives the generate-and-score loop.
    """

    def __init__(self, name: str, domain: str):
        self.name = name        # human-readable benchmark name
        self.domain = domain    # domain key used to index result dicts

    def load_questions(self) -> List[Dict]:
        """Load benchmark questions.

        Returns:
            List of question dicts; each must contain at least the keys
            ``"question"`` and ``"answer"`` (consumed by ``evaluate``).
        """
        raise NotImplementedError

    def evaluate(
        self,
        model,
        tokenizer,
        device: torch.device,
        max_samples: int = 100,
    ) -> BenchmarkResult:
        """Evaluate a model on this benchmark.

        Args:
            model: Vortex model exposing an HF-style ``generate`` method.
            tokenizer: Tokenizer with HF-style ``__call__`` and ``decode``.
            device: Torch device the tokenized inputs are moved to.
            max_samples: Maximum number of questions to evaluate.

        Returns:
            BenchmarkResult with accuracy and per-question details.
        """
        questions = self.load_questions()[:max_samples]
        correct = 0
        details = []

        for q in questions:
            prompt = self.format_prompt(q)
            inputs = tokenizer(prompt, return_tensors="pt").to(device)

            # Greedy decoding for reproducible scoring.
            # FIX: the original also passed temperature=0.0, which is
            # ignored when do_sample=False and triggers a warning (or a
            # validation error) in recent transformers releases; dropped.
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    do_sample=False,
                )

            generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
            answer = self.extract_answer(generated)

            is_correct = self.check_answer(answer, q["answer"])
            if is_correct:
                correct += 1

            details.append({
                "question": q["question"],
                "expected": q["answer"],
                "generated": answer,
                "correct": is_correct,
            })

        # Guard against empty question sets (placeholder benchmarks).
        accuracy = correct / len(questions) if questions else 0.0
        return BenchmarkResult(
            domain=self.domain,
            accuracy=accuracy,
            total_questions=len(questions),
            correct_answers=correct,
            details=details,
        )

    def format_prompt(self, question: Dict) -> str:
        """Format a question dict into a model prompt string."""
        raise NotImplementedError

    def extract_answer(self, text: str) -> str:
        """Extract the answer portion from generated text."""
        raise NotImplementedError

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Return True if the predicted answer matches the expected one."""
        raise NotImplementedError
|
| |
|
| |
|
class PhysicsBenchmark(ScienceBenchmark):
    """Physics benchmark (Feynman Questions style)."""

    def __init__(self):
        super().__init__("physics_benchmark", "physics")

    def load_questions(self) -> List[Dict]:
        """Return the built-in physics question set."""
        return [
            {
                "question": "What is the formula for kinetic energy?",
                "answer": "KE = 1/2 mv^2",
                "type": "formula",
            },
            {
                "question": "Explain Newton's first law of motion.",
                "answer": "An object at rest stays at rest unless acted upon by a force.",
                "type": "conceptual",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        """Build a Question/Answer prompt for the model."""
        return f"Question: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Answer:' marker.

        The prompt itself ends with 'Answer:', so the last occurrence
        delimits the model's continuation.
        """
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Lenient, case-insensitive bidirectional substring match.

        BUG FIX: an empty prediction is a substring of every expected
        answer, so the original `pred_lower in exp_lower` branch counted
        empty/whitespace generations as correct. Such predictions are now
        rejected explicitly before the substring checks.
        """
        pred_lower = predicted.lower()
        exp_lower = expected.lower()
        if not pred_lower.strip():
            return False
        return exp_lower in pred_lower or pred_lower in exp_lower
|
| |
|
| |
|
class MathBenchmark(ScienceBenchmark):
    """Math benchmark (MATH dataset style)."""

    def __init__(self):
        super().__init__("math_benchmark", "math")

    def load_questions(self) -> List[Dict]:
        """Return the built-in math question set."""
        return [
            {
                "question": "Solve for x: 2x + 5 = 15",
                "answer": "x = 5",
                "type": "algebra",
            },
            {
                "question": "What is the derivative of x^2?",
                "answer": "2x",
                "type": "calculus",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        """Build a Problem/Solution prompt for the model."""
        return f"Problem: {question['question']}\nSolution:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Solution:' marker, if any."""
        marker = "Solution:"
        if marker in text:
            return text.rsplit(marker, 1)[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Exact match after lowercasing and collapsing whitespace."""
        def canon(s: str) -> str:
            return " ".join(s.lower().split())

        return canon(predicted) == canon(expected)
|
| |
|
| |
|
class ChemistryBenchmark(ScienceBenchmark):
    """Chemistry benchmark."""

    def __init__(self):
        super().__init__("chemistry_benchmark", "chemistry")

    def load_questions(self) -> List[Dict]:
        """Return the built-in chemistry question set."""
        return [
            {
                "question": "What is the chemical formula for water?",
                "answer": "H2O",
                "type": "factual",
            },
            {
                "question": "How many protons does carbon have?",
                "answer": "6",
                "type": "factual",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        """Build the chemistry prompt ending in an 'Answer:' cue."""
        return f"Chemistry question: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the last 'Answer:' marker (whole text if absent)."""
        # rpartition yields ('', '', text) when the marker is missing,
        # so [2] is the full text in that case — same as the split form.
        return text.rpartition("Answer:")[2].strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Correct when the expected answer occurs inside the prediction."""
        pred = predicted.strip().lower()
        target = expected.strip().lower()
        return target in pred
|
| |
|
| |
|
class BiologyBenchmark(ScienceBenchmark):
    """Biology benchmark."""

    def __init__(self):
        super().__init__("biology_benchmark", "biology")

    def load_questions(self) -> List[Dict]:
        """Return the built-in biology question set."""
        return [
            {
                "question": "What is the powerhouse of the cell?",
                "answer": "mitochondria",
                "type": "factual",
            },
            {
                "question": "What molecule carries genetic information?",
                "answer": "DNA",
                "type": "factual",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        """Build the biology prompt ending in an 'Answer:' cue."""
        return f"Biology: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """Return the text after the final 'Answer:' marker, if present."""
        marker = "Answer:"
        tail = text.rsplit(marker, 1)[-1] if marker in text else text
        return tail.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Correct when the expected answer appears within the prediction."""
        return expected.strip().lower() in predicted.strip().lower()
|
| |
|
| |
|
| |
|
class EarthScienceBenchmark(ScienceBenchmark):
    """Earth science benchmark (placeholder: question set not yet authored)."""

    def __init__(self):
        super().__init__("earth_science_benchmark", "earth")

    def load_questions(self) -> List[Dict]:
        """Return the (currently empty) earth-science question set."""
        return []

    def format_prompt(self, question: Dict) -> str:
        """Build the earth-science prompt ending in an 'Answer:' cue."""
        return f"Earth Science: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """The full generated text, trimmed, is treated as the answer."""
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Strict equality after trimming and lowercasing both sides."""
        lhs = predicted.strip().lower()
        rhs = expected.strip().lower()
        return lhs == rhs
|
| |
|
| |
|
class SpaceScienceBenchmark(ScienceBenchmark):
    """Space science benchmark (placeholder: question set not yet authored)."""

    def __init__(self):
        super().__init__("space_science_benchmark", "space")

    def load_questions(self) -> List[Dict]:
        """Return the (currently empty) space-science question set."""
        return []

    def format_prompt(self, question: Dict) -> str:
        """Build the space-science prompt ending in an 'Answer:' cue."""
        return f"Space Science: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """The full generated text, trimmed, is treated as the answer."""
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Strict equality after trimming and lowercasing both sides."""
        return predicted.strip().lower() == expected.strip().lower()
|
| |
|
| |
|
class ZoologyBenchmark(ScienceBenchmark):
    """Zoology benchmark (placeholder: question set not yet authored)."""

    def __init__(self):
        super().__init__("zoology_benchmark", "zoology")

    def load_questions(self) -> List[Dict]:
        """Return the (currently empty) zoology question set."""
        return []

    def format_prompt(self, question: Dict) -> str:
        """Build the zoology prompt ending in an 'Answer:' cue."""
        return f"Zoology: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        """The full generated text, trimmed, is treated as the answer."""
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Strict equality after trimming and lowercasing both sides."""
        normalized = (s.strip().lower() for s in (predicted, expected))
        return next(normalized) == next(normalized)
|
| |
|
| |
|
def run_all_benchmarks(
    model,
    tokenizer,
    device: torch.device,
    max_samples_per_domain: int = 50,
) -> Dict[str, BenchmarkResult]:
    """Run every domain benchmark and collect the results.

    Args:
        model: Vortex model
        tokenizer: Tokenizer
        device: Torch device
        max_samples_per_domain: Max samples per domain

    Returns:
        Dictionary mapping domain to results
    """
    benchmark_suite = (
        PhysicsBenchmark(),
        MathBenchmark(),
        ChemistryBenchmark(),
        BiologyBenchmark(),
        EarthScienceBenchmark(),
        SpaceScienceBenchmark(),
        ZoologyBenchmark(),
    )

    results: Dict[str, BenchmarkResult] = {}
    for benchmark in benchmark_suite:
        print(f"Running {benchmark.name}...")
        outcome = benchmark.evaluate(
            model, tokenizer, device, max_samples=max_samples_per_domain
        )
        results[benchmark.domain] = outcome
        print(f" Accuracy: {outcome.accuracy:.2%} ({outcome.correct_answers}/{outcome.total_questions})")

    return results
|
| |
|
| |
|
def print_summary(results: Dict[str, BenchmarkResult]):
    """Print a formatted summary table of benchmark results."""
    print("\n" + "=" * 60)
    print("BENCHMARK RESULTS")
    print("=" * 60)

    for domain, result in results.items():
        print(f"{domain:15} {result.accuracy:6.2%} ({result.correct_answers}/{result.total_questions})")

    # Average only over domains that actually had questions, so that
    # placeholder (empty) benchmarks do not drag the overall score down.
    scored = [r.accuracy for r in results.values() if r.total_questions > 0]
    if scored:
        overall = sum(scored) / len(scored)
        print(f"{'OVERALL':15} {overall:6.2%}")
    print("=" * 60)
|
| |
|
| |
|
if __name__ == "__main__":
    # Standalone execution only explains usage; running the actual
    # benchmarks requires wiring in a trained model and tokenizer.
    print("This script benchmarks the model across science domains.")
    print("To run full benchmarks, integrate with a trained model.")
|