Spaces:
Sleeping
Sleeping
| from .base_benchmark import BaseBenchmark | |
| from typing import Dict, Any, Optional, Tuple | |
| from datasets import load_dataset | |
| import re | |
| from .evaluation_utils import normalize_math_answer, is_math_equiv | |
class MATHBenchmark(BaseBenchmark):
    """MATH (Mathematics) benchmark for competition-level problems.

    Loads the ``hendrycks/competition_math`` test split, prompts the model to
    produce a step-by-step solution ending in \\boxed{answer}, and grades the
    extracted answer against the reference using mathematical equivalence
    (``is_math_equiv``).
    """

    # Difficulty labels used by the dataset's `level` field.
    LEVELS = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
    # Subject labels used by the dataset's `type` field.
    TYPES = ['Algebra', 'Counting & Probability', 'Geometry', 'Intermediate Algebra',
             'Number Theory', 'Prealgebra', 'Precalculus']

    def __init__(self):
        super().__init__(name="MATH", dataset_name="hendrycks/competition_math")

    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load and (optionally) filter/subsample the MATH test split.

        Args:
            sample_size: If given, truncate the shuffled dataset to this many
                samples.
            **kwargs:
                difficulty: list of level names to keep (default ``['all']``,
                    meaning no filtering).
                seed: optional int for a reproducible shuffle. Previously the
                    shuffle was unseeded, so truncated subsamples were not
                    reproducible across runs.
        """
        # NOTE: this method shadows the module-level `datasets.load_dataset`;
        # the call below resolves to the imported function (a global), not to
        # this method.
        dataset = load_dataset(self.dataset_name, split='test')

        # Filter by difficulty level if specified.
        difficulty_levels = kwargs.get('difficulty', ['all'])
        if 'all' not in difficulty_levels:
            dataset = dataset.filter(lambda x: x['level'] in difficulty_levels)

        self.dataset = [
            {
                'problem': sample['problem'],
                'solution': sample['solution'],
                'level': sample['level'],
                'type': sample['type'],
                'raw_sample': sample,
            }
            for sample in dataset
        ]

        # Shuffle so a truncated subsample is not biased by dataset order.
        # A dedicated Random instance honors an optional seed and avoids
        # perturbing the global RNG state shared with other code.
        import random
        random.Random(kwargs.get('seed')).shuffle(self.dataset)

        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]

    def extract_answer(self, solution: str) -> Optional[str]:
        """Extract the final \\boxed{...} / \\fbox{...} answer from a solution.

        Uses brace-depth matching (as in lm-eval's ``last_boxed_only_string``)
        rather than a flat regex, so nested braces such as
        ``\\boxed{\\frac{1}{2}}`` are captured in full -- the previous pattern
        ``[^{}]*`` could not match any answer containing braces, which are
        ubiquitous in MATH reference solutions.

        Returns:
            The stripped content of the LAST box in document order, or None
            if no (balanced) box is present.
        """
        starts = [m.end() for m in re.finditer(r'\\boxed\{|\\fbox\{', solution)]
        if not starts:
            return None
        # Walk forward from just past the opening brace of the last box,
        # tracking brace depth until the matching close brace.
        i = starts[-1]
        depth = 1
        j = i
        while j < len(solution) and depth > 0:
            ch = solution[j]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            j += 1
        if depth != 0:
            # Unbalanced braces: no well-formed boxed answer to extract.
            return None
        return solution[i:j - 1].strip()

    def extract_model_answer(self, response: str) -> Optional[str]:
        """Extract an answer from a model response.

        Prefers a boxed answer; otherwise falls back to common phrasings.

        Returns:
            The extracted answer string, or None if nothing matched.
        """
        boxed = self.extract_answer(response)
        if boxed:
            return boxed
        # Fallback patterns, checked in order of reliability.
        for pattern in (r'answer is[\s:]*([^.\n]+)',   # "The answer is X"
                        r'therefore[,\s]+([^.\n]+)'):  # "Therefore, X"
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        return None

    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format a MATH problem as a step-by-step solution prompt."""
        prompt = f"""Solve the following mathematics problem step by step. Show all your work and put your final answer in the format \\boxed{{answer}}.
Problem: {sample['problem']}
Solution:"""
        return prompt

    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single MATH sample.

        Generates a solution via ``api.generate_with_retry``, extracts both the
        reference and the model answer, and compares them with
        ``is_math_equiv``. Any exception is caught and reported as an
        incorrect result so one bad sample cannot abort a whole run.

        Returns:
            ``(is_correct, result)`` where ``result`` carries the problem
            metadata plus either the graded answers or an ``'error'`` message.
        """
        prompt = self.format_prompt(sample)
        try:
            response = await api.generate_with_retry(prompt, **kwargs)
            correct_answer = self.extract_answer(sample['solution'])
            model_answer = self.extract_model_answer(response)
            # Grade only when both answers were extracted; otherwise the
            # sample counts as incorrect rather than guessing equivalence.
            is_correct = False
            if correct_answer and model_answer:
                is_correct = is_math_equiv(model_answer, correct_answer)
            result = {
                'problem': sample['problem'],
                'level': sample['level'],
                'type': sample['type'],
                'correct_answer': correct_answer,
                'model_answer': model_answer,
                'model_response': response,
                'is_correct': is_correct,
            }
            return is_correct, result
        except Exception as e:
            # Best-effort boundary: record the failure and keep the run going.
            result = {
                'problem': sample['problem'],
                'level': sample['level'],
                'type': sample['type'],
                'error': str(e),
                'is_correct': False,
            }
            return False, result