"""
Science benchmarks for the Vortex model.
Evaluates performance across 7 science domains.
"""
import torch
from typing import Dict, List
from dataclasses import dataclass

@dataclass
class BenchmarkResult:
    """Results from a single benchmark run."""

    domain: str
    accuracy: float
    total_questions: int
    correct_answers: int
    details: List[Dict]

class ScienceBenchmark:
    """Base class for science benchmarks."""

    def __init__(self, name: str, domain: str):
        self.name = name
        self.domain = domain

    def load_questions(self) -> List[Dict]:
        """Load benchmark questions."""
        raise NotImplementedError
    def evaluate(
        self,
        model,
        tokenizer,
        device: torch.device,
        max_samples: int = 100,
    ) -> BenchmarkResult:
        """
        Evaluate the model on this benchmark.

        Args:
            model: Vortex model
            tokenizer: Tokenizer
            device: Torch device
            max_samples: Maximum number of samples to evaluate

        Returns:
            BenchmarkResult
        """
        questions = self.load_questions()[:max_samples]
        correct = 0
        details = []
        for q in questions:
            # Format the prompt and tokenize
            prompt = self.format_prompt(q)
            inputs = tokenizer(prompt, return_tensors="pt").to(device)

            # Generate an answer with greedy decoding. Note: temperature is
            # not passed here; it is ignored (and in some versions rejected)
            # when do_sample=False.
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    do_sample=False,  # greedy
                )

            # Decode and extract the answer
            generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
            answer = self.extract_answer(generated)

            # Check correctness
            is_correct = self.check_answer(answer, q["answer"])
            if is_correct:
                correct += 1
            details.append({
                "question": q["question"],
                "expected": q["answer"],
                "generated": answer,
                "correct": is_correct,
            })

        accuracy = correct / len(questions) if questions else 0.0
        return BenchmarkResult(
            domain=self.domain,
            accuracy=accuracy,
            total_questions=len(questions),
            correct_answers=correct,
            details=details,
        )
    def format_prompt(self, question: Dict) -> str:
        """Format a question into a prompt."""
        raise NotImplementedError

    def extract_answer(self, text: str) -> str:
        """Extract the answer from generated text."""
        raise NotImplementedError

    def check_answer(self, predicted: str, expected: str) -> bool:
        """Check whether the predicted answer matches the expected one."""
        raise NotImplementedError

class PhysicsBenchmark(ScienceBenchmark):
    """Physics benchmark (Feynman Questions style)."""

    def __init__(self):
        super().__init__("physics_benchmark", "physics")

    def load_questions(self) -> List[Dict]:
        # Placeholder: in practice this would load from a dataset
        return [
            {
                "question": "What is the formula for kinetic energy?",
                "answer": "KE = 1/2 mv^2",
                "type": "formula",
            },
            {
                "question": "Explain Newton's first law of motion.",
                "answer": "An object at rest stays at rest unless acted upon by a force.",
                "type": "conceptual",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        return f"Question: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        # Take everything after the last "Answer:" marker
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        # Simple substring match in either direction; a more robust matcher
        # would be used in practice (see the _token_f1 sketch below)
        pred_lower = predicted.lower()
        exp_lower = expected.lower()
        return exp_lower in pred_lower or pred_lower in exp_lower
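
# A sketch of a more forgiving matcher than the substring check above: a
# simplified, set-based token-overlap F1 (ignoring token multiplicity), in
# the spirit of SQuAD-style answer scoring. The helper name and the default
# threshold are illustrative assumptions, not part of the existing API.
def _token_f1(predicted: str, expected: str, threshold: float = 0.6) -> bool:
    """Return True if token-overlap F1 between the answers meets threshold."""
    pred_tokens = set(predicted.lower().split())
    exp_tokens = set(expected.lower().split())
    if not pred_tokens or not exp_tokens:
        return False
    common = pred_tokens & exp_tokens
    if not common:
        return False
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(exp_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1 >= threshold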

class MathBenchmark(ScienceBenchmark):
    """Math benchmark (MATH dataset style)."""

    def __init__(self):
        super().__init__("math_benchmark", "math")

    def load_questions(self) -> List[Dict]:
        return [
            {
                "question": "Solve for x: 2x + 5 = 15",
                "answer": "x = 5",
                "type": "algebra",
            },
            {
                "question": "What is the derivative of x^2?",
                "answer": "2x",
                "type": "calculus",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        return f"Problem: {question['question']}\nSolution:"

    def extract_answer(self, text: str) -> str:
        if "Solution:" in text:
            return text.split("Solution:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        # Exact match after normalizing whitespace and case; brittle for
        # free-form solutions (see the numeric fallback sketched below)
        pred = " ".join(predicted.lower().split())
        exp = " ".join(expected.lower().split())
        return pred == exp
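
# Exact string match is brittle for math: "x = 5", "x=5", and "5" all fail
# to match each other. A common fallback is to compare the final numeric
# literal in each string. This regex-based helper is a minimal sketch under
# that assumption; the name _last_number_matches is hypothetical.
def _last_number_matches(predicted: str, expected: str) -> bool:
    """Compare the last numeric literal in each string, if both contain one."""
    import re  # stdlib; local import keeps this sketch self-contained

    pattern = r"-?\d+(?:\.\d+)?"
    pred_nums = re.findall(pattern, predicted)
    exp_nums = re.findall(pattern, expected)
    if not pred_nums or not exp_nums:
        return False
    return float(pred_nums[-1]) == float(exp_nums[-1])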

class ChemistryBenchmark(ScienceBenchmark):
    """Chemistry benchmark."""

    def __init__(self):
        super().__init__("chemistry_benchmark", "chemistry")

    def load_questions(self) -> List[Dict]:
        return [
            {
                "question": "What is the chemical formula for water?",
                "answer": "H2O",
                "type": "factual",
            },
            {
                "question": "How many protons does carbon have?",
                "answer": "6",
                "type": "factual",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        return f"Chemistry question: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        pred = predicted.strip().lower()
        exp = expected.strip().lower()
        return exp in pred

class BiologyBenchmark(ScienceBenchmark):
    """Biology benchmark."""

    def __init__(self):
        super().__init__("biology_benchmark", "biology")

    def load_questions(self) -> List[Dict]:
        return [
            {
                "question": "What is the powerhouse of the cell?",
                "answer": "mitochondria",
                "type": "factual",
            },
            {
                "question": "What molecule carries genetic information?",
                "answer": "DNA",
                "type": "factual",
            },
        ]

    def format_prompt(self, question: Dict) -> str:
        return f"Biology: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip()
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        pred = predicted.strip().lower()
        exp = expected.strip().lower()
        return exp in pred

# Placeholders for the remaining domains; questions to be filled in later
class EarthScienceBenchmark(ScienceBenchmark):
    def __init__(self):
        super().__init__("earth_science_benchmark", "earth")

    def load_questions(self) -> List[Dict]:
        return []

    def format_prompt(self, question: Dict) -> str:
        return f"Earth Science: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        return predicted.strip().lower() == expected.strip().lower()


class SpaceScienceBenchmark(ScienceBenchmark):
    def __init__(self):
        super().__init__("space_science_benchmark", "space")

    def load_questions(self) -> List[Dict]:
        return []

    def format_prompt(self, question: Dict) -> str:
        return f"Space Science: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        return predicted.strip().lower() == expected.strip().lower()


class ZoologyBenchmark(ScienceBenchmark):
    def __init__(self):
        super().__init__("zoology_benchmark", "zoology")

    def load_questions(self) -> List[Dict]:
        return []

    def format_prompt(self, question: Dict) -> str:
        return f"Zoology: {question['question']}\nAnswer:"

    def extract_answer(self, text: str) -> str:
        return text.strip()

    def check_answer(self, predicted: str, expected: str) -> bool:
        return predicted.strip().lower() == expected.strip().lower()

def run_all_benchmarks(
    model,
    tokenizer,
    device: torch.device,
    max_samples_per_domain: int = 50,
) -> Dict[str, BenchmarkResult]:
    """
    Run all benchmarks and return their results.

    Args:
        model: Vortex model
        tokenizer: Tokenizer
        device: Torch device
        max_samples_per_domain: Maximum samples per domain

    Returns:
        Dictionary mapping each domain to its results
    """
    benchmarks = [
        PhysicsBenchmark(),
        MathBenchmark(),
        ChemistryBenchmark(),
        BiologyBenchmark(),
        EarthScienceBenchmark(),
        SpaceScienceBenchmark(),
        ZoologyBenchmark(),
    ]
    results = {}
    for bench in benchmarks:
        print(f"Running {bench.name}...")
        result = bench.evaluate(model, tokenizer, device, max_samples=max_samples_per_domain)
        results[bench.domain] = result
        print(f"  Accuracy: {result.accuracy:.2%} ({result.correct_answers}/{result.total_questions})")
    return results

def print_summary(results: Dict[str, BenchmarkResult]):
    """Print a summary of benchmark results."""
    print("\n" + "=" * 60)
    print("BENCHMARK RESULTS")
    print("=" * 60)
    for domain, result in results.items():
        print(f"{domain:15} {result.accuracy:6.2%} ({result.correct_answers}/{result.total_questions})")

    # Overall average across domains that actually had questions
    all_accuracies = [r.accuracy for r in results.values() if r.total_questions > 0]
    if all_accuracies:
        avg = sum(all_accuracies) / len(all_accuracies)
        print(f"{'OVERALL':15} {avg:6.2%}")
    print("=" * 60)

if __name__ == "__main__":
    # Quick test
    print("This script benchmarks the model across science domains.")
    print("To run full benchmarks, integrate with a trained model.")