from .base_benchmark import BaseBenchmark
from typing import Dict, Any, Optional, Tuple
from datasets import load_dataset
import subprocess
import tempfile
import os
import sys
import re

class HumanEvalBenchmark(BaseBenchmark):
    """HumanEval code generation benchmark"""

    def __init__(self):
        super().__init__(name="HumanEval", dataset_name="openai_humaneval")

    async def load_dataset(self, sample_size: Optional[int] = None, **kwargs):
        """Load the HumanEval dataset"""
        dataset = load_dataset(self.dataset_name, split='test')
        self.dataset = []
        for sample in dataset:
            self.dataset.append({
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'canonical_solution': sample['canonical_solution'],
                'test': sample['test'],
                'entry_point': sample['entry_point'],
                'raw_sample': sample
            })
        if sample_size and len(self.dataset) > sample_size:
            self.dataset = self.dataset[:sample_size]

    def format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format a HumanEval problem as a prompt"""
        # lm-eval uses just the raw prompt without additional instructions
        return sample['prompt']

    def extract_code(self, response: str, entry_point: str, prompt: str) -> str:
        """Extract code from the model response"""
        # Clean the response and handle markdown code blocks
        code = response.strip()
        # Remove markdown code block markers
        if code.startswith('```python'):
            code = code[9:]  # Remove ```python
        elif code.startswith('```'):
            code = code[3:]  # Remove ```
        if code.endswith('```'):
            code = code[:-3]  # Remove trailing ```
        code = code.strip()
        # If the response contains the complete function, use it directly
        if f"def {entry_point}" in code:
            return code
        else:
            # Fallback: assume it's a completion to be appended after the prompt
            stop_sequences = ['\nclass', '\ndef', '\n#', '\nif __name__']
            for stop in stop_sequences:
                pos = code.find(stop)
                if pos > 0:
                    code = code[:pos]
                    break
            return prompt + code

    def run_test(self, code: str, test_code: str) -> Tuple[bool, str]:
        """Run the test code and return success status and output"""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            # Write the complete test file
            f.write(code + '\n\n' + test_code)
            f.flush()
            try:
                # Run the test in a subprocess with a timeout
                result = subprocess.run(
                    [sys.executable, f.name],
                    capture_output=True,
                    text=True,
                    timeout=10
                )
                if result.returncode == 0:
                    return True, result.stdout
                else:
                    return False, result.stderr
            except subprocess.TimeoutExpired:
                return False, "Timeout: Code execution took too long"
            except Exception as e:
                return False, f"Error running test: {str(e)}"
            finally:
                # Clean up the temporary file
                try:
                    os.unlink(f.name)
                except OSError:
                    pass

    async def evaluate_sample(self, api, sample: Dict[str, Any], **kwargs) -> Tuple[bool, Dict[str, Any]]:
        """Evaluate a single HumanEval sample"""
        prompt = self.format_prompt(sample)
        try:
            response = await api.generate_with_retry(prompt, **kwargs)
            # Extract code from the response
            code = self.extract_code(response, sample['entry_point'], sample['prompt'])
            # Run the reference tests against the extracted code
            is_correct, test_output = self.run_test(code, sample['test'])
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'model_response': response,
                'extracted_code': code,
                'is_correct': is_correct,
                'test_output': test_output,
                'entry_point': sample['entry_point']
            }
            return is_correct, result
        except Exception as e:
            result = {
                'task_id': sample['task_id'],
                'prompt': sample['prompt'],
                'error': str(e),
                'is_correct': False
            }
            return False, result
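

# --- Usage sketch (added for illustration; not part of the original module) ---
# A minimal driver showing how the benchmark might be wired end to end. The StubAPI
# client below is hypothetical: it only satisfies the generate_with_retry(prompt, **kwargs)
# interface that evaluate_sample expects and returns an empty completion, so every task
# will simply fail; swap in a real model client to get meaningful scores. Note that the
# relative import at the top means this module must be run as part of its package
# (python -m <package>.<module>), not as a standalone script.
if __name__ == "__main__":
    import asyncio

    class StubAPI:
        """Minimal stand-in client used only to exercise the evaluation pipeline."""
        async def generate_with_retry(self, prompt: str, **kwargs) -> str:
            # A real client would return model-generated code for the prompt here.
            return ""

    async def main():
        benchmark = HumanEvalBenchmark()
        await benchmark.load_dataset(sample_size=2)  # downloads openai_humaneval from the Hub
        api = StubAPI()
        for sample in benchmark.dataset:
            is_correct, result = await benchmark.evaluate_sample(api, sample)
            print(result['task_id'], 'pass' if is_correct else 'fail')

    asyncio.run(main())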