from typing import Any, Dict, List, Optional, Tuple, Set, Union
import uuid
import json
import re
import random
import ast
import copy
from pathlib import Path
import tempfile
import subprocess
import shutil
import os

from recursive_swe_bench.core.recursive_task import (
    RecursiveTask, ProblemState, EvaluationResult, Feedback, TaskStatus
)

class BugCategory:
    """Categories of bugs for classification and evolution."""
    SYNTAX = "syntax"
    LOGICAL = "logical"
    PERFORMANCE = "performance"
    SECURITY = "security"
    CONCURRENCY = "concurrency"
    EXCEPTION_HANDLING = "exception_handling"
    API_USAGE = "api_usage"
    MEMORY_MANAGEMENT = "memory_management"
    TYPE_ERROR = "type_error"
    EDGE_CASE = "edge_case"
    DATA_HANDLING = "data_handling"
    DEPENDENCY = "dependency"
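
# Illustrative usage (a sketch, not part of the original module): the category
# constants above are meant to be passed through the task config, e.g.
#
#     config = {"bug_categories": [BugCategory.LOGICAL, BugCategory.EDGE_CASE]}
#
# Categories omitted from the config are never selected when the task evolves
# and injects new bugs.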

class BugFixingTask(RecursiveTask):
    """
    A recursive task for evaluating how models fix bugs in code.

    The task presents a piece of code with one or more bugs, and evolves
    based on the model's fix attempts. As the model addresses issues,
    the task may introduce more subtle bugs, change requirements, or
    increase complexity to test adaptive problem-solving.
    """

    def __init__(
        self,
        initial_state: ProblemState,
        config: Optional[Dict[str, Any]] = None,
        test_runner: Any = None
    ):
        """
        Initialize the bug fixing task.

        Args:
            initial_state: The initial problem state
            config: Configuration options
            test_runner: Custom test runner (optional)
        """
        super().__init__(initial_state, config)
        self.test_runner = test_runner or DefaultTestRunner()
        self.bug_categories: Set[str] = set(
            self.config.get("bug_categories", [BugCategory.LOGICAL, BugCategory.SYNTAX])
        )
        self.difficulty_progression = self.config.get(
            "difficulty_progression", [0.0, 0.15, 0.3, 0.5, 0.7]
        )
        self.evolution_strategies = self.config.get(
            "evolution_strategies", ["add_subtle_bug", "change_requirements", "increase_complexity"]
        )
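
    # Example construction (hedged sketch; the ProblemState fields shown are
    # assumptions inferred from how they are read below, not a confirmed
    # constructor signature):
    #
    #     state = ProblemState(
    #         description="Fix the off-by-one error in merge()",
    #         code_context={"code": buggy_source, "tests": []},
    #         requirements=[],
    #         difficulty=0.0,
    #         evolution_stage=0,
    #     )
    #     task = BugFixingTask(state, config={"bug_categories": [BugCategory.LOGICAL]})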

    def _run_evaluation(self, solution: str) -> EvaluationResult:
        """
        Run tests to evaluate the solution.

        Args:
            solution: The solution code

        Returns:
            Evaluation results
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Write the candidate solution to a temporary module.
            solution_file = temp_path / "solution.py"
            with open(solution_file, "w") as f:
                f.write(solution)

            # Materialize the task's tests next to it.
            test_files = self._create_test_files(temp_path)

            # Execute the tests against the solution.
            results = self.test_runner.run_tests(
                solution_file=solution_file,
                test_files=test_files,
                code_context=self.state.code_context
            )

            # Convert raw results into a normalized score.
            score = self._calculate_score(results)

            return EvaluationResult(
                success=results["all_passed"],
                score=score,
                execution_results=results["execution"],
                error_details=results.get("errors"),
                test_results=results["tests"],
                metrics={
                    "passed_tests": results["passed_tests"],
                    "total_tests": results["total_tests"],
                    "execution_time": results["execution_time"],
                    "memory_usage": results.get("memory_usage", 0),
                    "code_complexity": self._calculate_complexity(solution)
                }
            )
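
    # Contract assumed of the test runner (inferred from the keys read above;
    # not a documented interface): run_tests(...) should return a dict like
    #
    #     {
    #         "all_passed": bool,
    #         "passed_tests": int,
    #         "total_tests": int,
    #         "execution": {"success": bool, ...},
    #         "execution_time": float,
    #         "tests": {test_name: {"passed": bool, "message": str, ...}},
    #         "errors": {error_type: {"message": str, "location": Any}},  # optional
    #         "memory_usage": float,  # optional
    #     }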

    def _generate_feedback(self, solution: str, result: EvaluationResult) -> Feedback:
        """
        Generate structured feedback based on evaluation results.

        Args:
            solution: The solution code
            result: The evaluation results

        Returns:
            Structured feedback
        """
        issues = []
        suggestions = []
        focus_areas = []

        if result.test_results:
            for test_name, test_result in result.test_results.items():
                if not test_result["passed"]:
                    issues.append({
                        "type": "test_failure",
                        "test": test_name,
                        "message": test_result.get("message", "Test failed"),
                        "expected": test_result.get("expected"),
                        "actual": test_result.get("actual")
                    })

        if result.error_details:
            for error_type, error_info in result.error_details.items():
                issues.append({
                    "type": "error",
                    "error_type": error_type,
                    "message": error_info.get("message", "An error occurred"),
                    "location": error_info.get("location")
                })

        for issue in issues:
            if issue["type"] == "test_failure":
                suggestion = self._generate_suggestion_for_test_failure(
                    issue, solution, result.test_results
                )
                if suggestion:
                    suggestions.append(suggestion)
            elif issue["type"] == "error":
                suggestion = self._generate_suggestion_for_error(
                    issue, solution
                )
                if suggestion:
                    suggestions.append(suggestion)

        focus_areas = self._determine_focus_areas(issues, solution, result)

        adaptation_hints = self._generate_adaptation_hints(solution, result)

        if result.success:
            summary = (
                f"Your solution passes all tests with a score of {result.score:.2f}. "
                f"The code successfully addresses the bugs in the original implementation."
            )
        else:
            passed = result.metrics.get("passed_tests", 0)
            total = result.metrics.get("total_tests", 0)
            summary = (
                f"Your solution passes {passed}/{total} tests with a score of {result.score:.2f}. "
                f"There are still issues that need to be addressed."
            )

        return Feedback(
            summary=summary,
            issues=issues,
            suggestions=suggestions,
            focus_areas=focus_areas,
            adaptation_hints=adaptation_hints
        )

    def _evolve_state(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Evolve the problem state based on the solution and feedback.

        This method implements the recursive nature of the benchmark by
        adapting the problem to challenge the model's understanding.

        Args:
            solution: The attempted solution
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            The evolved problem state
        """
        # Near-perfect solutions get a harder problem.
        if result.success and result.score > 0.95:
            return self._increase_difficulty(solution, result, feedback)

        # Close-but-imperfect solutions get a problem focused on what remains.
        elif result.score > 0.7:
            return self._focus_remaining_issues(solution, result, feedback)

        # Weak solutions get more guidance.
        else:
            return self._provide_more_guidance(solution, result, feedback)

    def _increase_difficulty(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Increase the difficulty of the problem for models that solved it well.

        Args:
            solution: The successful solution
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            The evolved problem state with increased difficulty
        """
        new_state = copy.deepcopy(self.state)

        new_state.evolution_stage += 1

        current_difficulty_idx = min(new_state.evolution_stage,
                                     len(self.difficulty_progression) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]

        strategy = self._select_evolution_strategy(solution, result, feedback)

        if strategy == "add_subtle_bug":
            self._add_subtle_bug(new_state, solution)
        elif strategy == "change_requirements":
            self._change_requirements(new_state, solution)
        elif strategy == "increase_complexity":
            self._increase_complexity(new_state, solution)

        new_state.description = self._generate_description(new_state)

        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )

        return new_state

    def _focus_remaining_issues(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Evolve the state to focus on remaining issues when the solution is close but not perfect.

        Args:
            solution: The nearly-successful solution
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            The evolved problem state focusing on remaining issues
        """
        new_state = copy.deepcopy(self.state)

        new_state.evolution_stage += 1

        current_difficulty_idx = min(new_state.evolution_stage - 1,
                                     len(self.difficulty_progression) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]

        new_state.code_context["focus_areas"] = feedback.focus_areas

        if result.test_results:
            failing_tests = [
                test_name for test_name, test_result in result.test_results.items()
                if not test_result["passed"]
            ]
            new_state.code_context["failing_tests"] = failing_tests

        new_state.description = self._generate_focused_description(
            new_state, feedback.issues
        )

        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )

        return new_state

    def _provide_more_guidance(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Evolve the state to provide more guidance when the solution was not very good.

        Args:
            solution: The unsuccessful solution
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            The evolved problem state with more guidance
        """
        new_state = copy.deepcopy(self.state)

        new_state.evolution_stage += 1

        # Step the difficulty back one notch to give the model room to recover.
        current_difficulty_idx = max(0, min(new_state.evolution_stage - 1,
                                            len(self.difficulty_progression) - 1) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]

        new_state.code_context["hints"] = self._generate_hints(
            solution, result, feedback
        )

        if result.test_results:
            detailed_test_results = {}
            for test_name, test_result in result.test_results.items():
                if not test_result["passed"]:
                    detailed_test_results[test_name] = {
                        "message": test_result.get("message", "Test failed"),
                        "expected": test_result.get("expected"),
                        "actual": test_result.get("actual"),
                        "hint": self._generate_test_hint(test_name, test_result)
                    }
            new_state.code_context["detailed_test_results"] = detailed_test_results

        new_state.description = self._generate_guided_description(
            new_state, feedback.issues, feedback.suggestions
        )

        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )

        return new_state

    def _select_evolution_strategy(self, solution: str, result: EvaluationResult, feedback: Feedback) -> str:
        """
        Select an evolution strategy based on the current state and solution.

        Args:
            solution: The current solution
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            The selected evolution strategy
        """
        available_strategies = self.evolution_strategies.copy()

        weights = {}

        # Near-perfect solutions usually get a new subtle bug.
        if result.score > 0.95:
            weights["add_subtle_bug"] = 0.6
            weights["change_requirements"] = 0.3
            weights["increase_complexity"] = 0.1

        # After several injected bugs, prefer shifting the requirements.
        elif self.state.evolution_stage >= 2 and "bug_count" in self.state.code_context and self.state.code_context["bug_count"] >= 3:
            weights["add_subtle_bug"] = 0.1
            weights["change_requirements"] = 0.7
            weights["increase_complexity"] = 0.2

        # Good-but-imperfect solutions mostly get more complexity.
        elif result.score > 0.85:
            weights["add_subtle_bug"] = 0.2
            weights["change_requirements"] = 0.2
            weights["increase_complexity"] = 0.6

        # Otherwise choose uniformly.
        else:
            weights = {strategy: 1.0 / len(available_strategies)
                       for strategy in available_strategies}

        # Normalize over the strategies actually available; fall back to a
        # uniform draw if none of the weighted strategies is configured.
        total_weight = sum(weights.get(strategy, 0) for strategy in available_strategies)
        if total_weight == 0:
            return random.choice(available_strategies)
        normalized_weights = [weights.get(strategy, 0) / total_weight
                              for strategy in available_strategies]

        return random.choices(available_strategies, weights=normalized_weights)[0]
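
    # random.choices draws one strategy with probability proportional to its
    # weight, so with {"add_subtle_bug": 0.6, "change_requirements": 0.3,
    # "increase_complexity": 0.1} a near-perfect solution is usually answered
    # with a new subtle bug, occasionally with changed requirements, and
    # rarely with extra complexity.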

    def _add_subtle_bug(self, state: ProblemState, solution: str) -> None:
        """
        Add a subtle bug to the solution code.

        Args:
            state: The problem state to modify
            solution: The current solution
        """
        # If the solution does not even parse, fall back to a syntax error.
        try:
            parsed_solution = ast.parse(solution)
        except SyntaxError:
            self._add_syntax_error(state, solution)
            return

        # Pick a bug category from the configured set.
        available_categories = list(self.bug_categories)
        if available_categories:
            bug_category = random.choice(available_categories)
        else:
            bug_category = BugCategory.LOGICAL

        if bug_category == BugCategory.SYNTAX:
            self._add_syntax_error(state, solution)
        elif bug_category == BugCategory.LOGICAL:
            self._add_logical_error(state, solution, parsed_solution)
        elif bug_category == BugCategory.PERFORMANCE:
            self._add_performance_issue(state, solution, parsed_solution)
        elif bug_category == BugCategory.EDGE_CASE:
            self._add_edge_case_issue(state, solution, parsed_solution)
        else:
            # Categories without a dedicated injector default to a logical error.
            self._add_logical_error(state, solution, parsed_solution)

        # Track how many bugs have been injected, and of which categories.
        if "bug_count" not in state.code_context:
            state.code_context["bug_count"] = 0
        state.code_context["bug_count"] += 1

        if "bug_categories" not in state.code_context:
            state.code_context["bug_categories"] = []
        state.code_context["bug_categories"].append(bug_category)

    def _change_requirements(self, state: ProblemState, solution: str) -> None:
        """
        Change the requirements to challenge the current solution.

        Args:
            state: The problem state to modify
            solution: The current solution
        """
        requirements = state.requirements

        new_requirement = self._generate_new_requirement(state, solution)
        if new_requirement:
            requirements.append(new_requirement)

        if requirements and random.random() < 0.5:
            idx = random.randint(0, len(requirements) - 1)
            requirements[idx] = self._modify_requirement(requirements[idx], state, solution)

    def _increase_complexity(self, state: ProblemState, solution: str) -> None:
        """
        Increase the complexity of the task.

        Args:
            state: The problem state to modify
            solution: The current solution
        """
        try:
            parsed_solution = ast.parse(solution)
        except SyntaxError:
            # If the solution does not parse, fall back to a requirement change.
            self._add_edge_case_requirement(state)
            return

        strategies = [
            "add_edge_cases",
            "increase_data_volume",
            "add_performance_constraint",
            "expand_functionality"
        ]

        strategy = random.choice(strategies)

        if strategy == "add_edge_cases":
            self._add_edge_case_requirement(state)
        elif strategy == "increase_data_volume":
            self._increase_data_volume(state, solution)
        elif strategy == "add_performance_constraint":
            self._add_performance_constraint(state, solution)
        elif strategy == "expand_functionality":
            self._expand_functionality(state, solution)

    def _create_test_files(self, temp_path: Path) -> List[Path]:
        """
        Create test files based on the current problem state.

        Args:
            temp_path: The temporary directory path

        Returns:
            List of test file paths
        """
        test_files = []

        if "tests" in self.state.code_context:
            for i, test in enumerate(self.state.code_context["tests"]):
                test_file = temp_path / f"test_{i}.py"
                with open(test_file, "w") as f:
                    f.write(test["content"])
                test_files.append(test_file)

        if not test_files:
            test_file = temp_path / "test_default.py"
            with open(test_file, "w") as f:
                f.write(self._generate_default_test())
            test_files.append(test_file)

        return test_files

    def _calculate_score(self, results: Dict[str, Any]) -> float:
        """
        Calculate a score based on test results.

        Args:
            results: The test results

        Returns:
            A score between 0 and 1
        """
        if results["total_tests"] == 0:
            test_score = 0.0
        else:
            test_score = results["passed_tests"] / results["total_tests"]

        execution_score = 1.0 if results["execution"]["success"] else 0.0

        # Weighted combination of test pass rate and clean execution.
        weights = self.config.get("score_weights", {"test": 0.7, "execution": 0.3})
        score = (test_score * weights["test"] + execution_score * weights["execution"])

        # Harder problems are graded slightly more strictly.
        difficulty_modifier = 1.0 + (self.state.difficulty * 0.2)
        score = score / difficulty_modifier

        return max(0.0, min(1.0, score))
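
    # Worked example (using the default weights): 8/10 tests passing with a
    # successful execution at difficulty 0.3 yields
    #
    #     score = (0.8 * 0.7 + 1.0 * 0.3) / (1.0 + 0.3 * 0.2)
    #           = 0.86 / 1.06 ≈ 0.81
    #
    # so the same raw result is worth slightly less as difficulty rises.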

    def _calculate_complexity(self, code: str) -> float:
        """
        Calculate the complexity of code.

        Args:
            code: The code to analyze

        Returns:
            A complexity score
        """
        complexity = 1

        # Count branching and boolean keywords as rough decision points.
        for pattern in ["if", "for", "while", "and", "or"]:
            complexity += code.count(f" {pattern} ")

        complexity += code.count("def ")

        # Normalize to [0, 1], saturating at 50 decision points.
        normalized = min(1.0, complexity / 50.0)

        return normalized
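
    # Example: code containing " if " once, " for " twice, and one "def "
    # scores 1 + 3 + 1 = 5, which normalizes to 5 / 50 = 0.1. This is a crude
    # keyword-counting proxy for cyclomatic complexity, not a real parse.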

    def _generate_suggestion_for_test_failure(
        self,
        issue: Dict[str, Any],
        solution: str,
        test_results: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate a suggestion for a test failure.

        Args:
            issue: The issue data
            solution: The solution code
            test_results: The test results

        Returns:
            A suggestion dictionary
        """
        test_name = issue["test"]
        test_result = test_results[test_name]

        test_content = None
        for test in self.state.code_context.get("tests", []):
            if test.get("name") == test_name:
                test_content = test.get("content")
                break

        if test_content:
            assertion_match = re.search(r"assert.*", test_content)
            assertion = assertion_match.group(0) if assertion_match else None

            test_funcs = re.findall(r"def\s+(\w+)", test_content)
            solution_funcs = re.findall(r"def\s+(\w+)", solution)

            missing_funcs = [f for f in test_funcs if f not in solution_funcs]

            if missing_funcs:
                return {
                    "type": "missing_function",
                    "message": f"Implement the missing function(s): {', '.join(missing_funcs)}",
                    "functions": missing_funcs
                }
            elif assertion:
                return {
                    "type": "fix_assertion_failure",
                    "message": f"Fix the code to pass the assertion: {assertion}",
                    "assertion": assertion,
                    "expected": test_result.get("expected"),
                    "actual": test_result.get("actual")
                }
            else:
                return {
                    "type": "fix_test_failure",
                    "message": f"Fix the code to pass the test: {test_name}",
                    "test_name": test_name
                }
        else:
            return {
                "type": "general_fix",
                "message": f"Fix the code to pass the failing test: {test_name}"
            }

    def _generate_suggestion_for_error(
        self,
        issue: Dict[str, Any],
        solution: str
    ) -> Dict[str, Any]:
        """
        Generate a suggestion for an error.

        Args:
            issue: The issue data
            solution: The solution code

        Returns:
            A suggestion dictionary
        """
        error_type = issue["error_type"]
        message = issue["message"]
        location = issue.get("location")

        if error_type == "syntax":
            return {
                "type": "fix_syntax",
                "message": f"Fix the syntax error: {message}",
                "location": location
            }
        elif error_type == "runtime":
            return {
                "type": "fix_runtime_error",
                "message": f"Fix the runtime error: {message}",
                "location": location
            }
        else:
            return {
                "type": "fix_error",
                "message": f"Fix the error: {message}",
                "error_type": error_type,
                "location": location
            }

    def _determine_focus_areas(
        self,
        issues: List[Dict[str, Any]],
        solution: str,
        result: EvaluationResult
    ) -> List[str]:
        """
        Determine focus areas based on issues and results.

        Args:
            issues: The identified issues
            solution: The solution code
            result: The evaluation results

        Returns:
            List of focus areas
        """
        focus_areas = []

        syntax_issues = [i for i in issues if i.get("error_type") == "syntax"]
        if syntax_issues:
            focus_areas.append("syntax")

        test_issues = [i for i in issues if i["type"] == "test_failure"]
        if test_issues:
            if any("expected" in i and "actual" in i for i in test_issues):
                focus_areas.append("logic")
            else:
                focus_areas.append("functionality")

        if result.metrics and "execution_time" in result.metrics:
            if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0):
                focus_areas.append("performance")

        if result.metrics and "code_complexity" in result.metrics:
            if result.metrics["code_complexity"] > self.config.get("complexity_threshold", 0.7):
                focus_areas.append("complexity")

        if not focus_areas:
            focus_areas.append("general")

        return focus_areas

    def _generate_adaptation_hints(
        self,
        solution: str,
        result: EvaluationResult
    ) -> List[Dict[str, Any]]:
        """
        Generate hints about how the problem might adapt in the next iteration.

        Args:
            solution: The solution code
            result: The evaluation results

        Returns:
            List of adaptation hints
        """
        hints = []

        if result.score > 0.8:
            hints.append({
                "type": "complexity_increase",
                "message": "The problem may become more complex in the next iteration."
            })

        if result.score > 0.9 and self.state.evolution_stage >= 1:
            hints.append({
                "type": "requirement_change",
                "message": "The requirements may change in the next iteration."
            })

        if result.score > 0.95:
            hints.append({
                "type": "new_bugs",
                "message": "New, more subtle bugs may be introduced in the next iteration."
            })

        if 0.7 < result.score < 0.95:
            focus_areas = result.metrics.get("focus_areas", [])
            if focus_areas:
                hints.append({
                    "type": "focus_shift",
                    "message": f"The next iteration may focus more on: {', '.join(focus_areas)}",
                    "areas": focus_areas
                })

        return hints

    def _generate_description(self, state: ProblemState) -> str:
        """
        Generate a description for the current problem state.

        Args:
            state: The problem state

        Returns:
            A descriptive prompt for the problem
        """
        base_desc = (
            f"Fix the bug(s) in the following code. "
            f"This is iteration {state.evolution_stage + 1} of the task."
        )

        if "bug_categories" in state.code_context:
            categories = state.code_context["bug_categories"]
            if categories:
                base_desc += f"\n\nThe code contains the following types of issues: {', '.join(categories)}."

        if state.requirements:
            base_desc += "\n\nRequirements:"
            for i, req in enumerate(state.requirements):
                base_desc += f"\n{i+1}. {req['description']}"

        difficulty_desc = "easy"
        if 0.3 < state.difficulty <= 0.6:
            difficulty_desc = "moderate"
        elif 0.6 < state.difficulty <= 0.8:
            difficulty_desc = "challenging"
        elif state.difficulty > 0.8:
            difficulty_desc = "very challenging"

        base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task."

        return base_desc

    def _generate_focused_description(self, state: ProblemState, issues: List[Dict[str, Any]]) -> str:
        """
        Generate a description focused on remaining issues.

        Args:
            state: The problem state
            issues: The identified issues

        Returns:
            A descriptive prompt focused on remaining issues
        """
        base_desc = self._generate_description(state)

        if issues:
            base_desc += "\n\nFocus on the following issues:"
            for i, issue in enumerate(issues):
                if issue["type"] == "test_failure":
                    base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}"
                else:
                    base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}"

        if "focus_areas" in state.code_context:
            areas = state.code_context["focus_areas"]
            if areas:
                base_desc += f"\n\nPay particular attention to: {', '.join(areas)}."

        return base_desc

    def _generate_guided_description(
        self,
        state: ProblemState,
        issues: List[Dict[str, Any]],
        suggestions: List[Dict[str, Any]]
    ) -> str:
        """
        Generate a description with added guidance.

        Args:
            state: The problem state
            issues: The identified issues
            suggestions: The suggested fixes

        Returns:
            A descriptive prompt with added guidance
        """
        base_desc = self._generate_description(state)

        if issues:
            base_desc += "\n\nThe following issues were identified in your previous solution:"
            for i, issue in enumerate(issues):
                if issue["type"] == "test_failure":
                    base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}"
                    if "expected" in issue and "actual" in issue:
                        base_desc += f"\n   Expected: {issue['expected']}"
                        base_desc += f"\n   Actual: {issue['actual']}"
                else:
                    base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}"
                    if "location" in issue:
                        base_desc += f"\n   Location: {issue['location']}"

        if suggestions:
            base_desc += "\n\nConsider the following suggestions:"
            for i, suggestion in enumerate(suggestions):
                base_desc += f"\n{i+1}. {suggestion['message']}"

        if "hints" in state.code_context:
            hints = state.code_context["hints"]
            if hints:
                base_desc += "\n\nHints:"
                for i, hint in enumerate(hints):
                    base_desc += f"\n{i+1}. {hint}"

        return base_desc

    def _generate_hints(
        self,
        solution: str,
        result: EvaluationResult,
        feedback: Feedback
    ) -> List[str]:
        """
        Generate hints based on the solution and feedback.

        Args:
            solution: The solution code
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            List of hints
        """
        hints = []

        if result.test_results:
            failing_tests = [
                test_name for test_name, test_result in result.test_results.items()
                if not test_result["passed"]
            ]

            if failing_tests:
                test_hint = "Focus on fixing the failing tests"

                for test_name in failing_tests[:2]:
                    test_result = result.test_results[test_name]
                    if "expected" in test_result and "actual" in test_result:
                        test_hint += f". For test '{test_name}', expected '{test_result['expected']}' but got '{test_result['actual']}'"

                hints.append(test_hint + ".")

        if result.error_details:
            for error_type, error_info in result.error_details.items():
                hints.append(f"Fix the {error_type} error: {error_info.get('message', 'Unknown error')}.")

        for area in feedback.focus_areas:
            if area == "syntax":
                hints.append("Check your syntax carefully, especially parentheses, indentation, and function definitions.")
            elif area == "logic":
                hints.append("Review the logic of your solution, especially conditional statements and loop conditions.")
            elif area == "functionality":
                hints.append("Ensure your solution implements all required functionality specified in the tests.")
            elif area == "performance":
                hints.append("Consider optimizing your solution for better performance; avoid unnecessary operations.")
            elif area == "complexity":
                hints.append("Try to simplify your solution; it may be more complex than necessary.")

        return hints

    def _generate_test_hint(self, test_name: str, test_result: Dict[str, Any]) -> str:
        """
        Generate a hint for a specific failing test.

        Args:
            test_name: The name of the test
            test_result: The test result

        Returns:
            A hint for the test
        """
        if "expected" in test_result and "actual" in test_result:
            return f"The test expected '{test_result['expected']}' but got '{test_result['actual']}'"
        elif "message" in test_result:
            return test_result["message"]
        else:
            return "The test failed, but no detailed information is available."

    def _add_syntax_error(self, state: ProblemState, solution: str) -> None:
        """
        Add a syntax error to the solution code.

        Args:
            state: The problem state to modify
            solution: The current solution
        """
        lines = solution.split('\n')

        # Choose a random non-empty, non-comment line to corrupt. Selecting
        # from a filtered list avoids looping forever when the solution
        # contains only blank lines and comments.
        candidates = [i for i, l in enumerate(lines)
                      if l.strip() and not l.strip().startswith('#')]
        if not candidates:
            return
        idx = random.choice(candidates)
        line = lines[idx]

        mod_type = random.choice([
            "remove_character",
            "add_character",
            "swap_characters",
            "change_indent"
        ])

        if mod_type == "remove_character" and line:
            char_idx = random.randint(0, len(line) - 1)
            lines[idx] = line[:char_idx] + line[char_idx+1:]

        elif mod_type == "add_character":
            char_idx = random.randint(0, len(line))
            char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."])
            lines[idx] = line[:char_idx] + char + line[char_idx:]

        elif mod_type == "swap_characters" and len(line) >= 2:
            char_idx = random.randint(0, len(line) - 2)
            lines[idx] = (line[:char_idx] + line[char_idx+1] +
                          line[char_idx] + line[char_idx+2:])

        elif mod_type == "change_indent":
            # Break indentation by shifting the line two spaces either way.
            if line.startswith("  "):
                lines[idx] = line[2:]
            else:
                lines[idx] = "  " + line

        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code

        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []

        state.code_context["bugs"].append({
            "type": "syntax",
            "line": idx + 1,
            "description": f"Syntax error introduced in line {idx + 1}"
        })

    def _add_logical_error(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
        """
        Add a logical error to the solution code.

        Args:
            state: The problem state to modify
            solution: The current solution
            parsed_solution: The parsed AST of the solution
        """
        modification_types = [
            "change_comparison",
            "invert_condition",
            "off_by_one",
            "change_operator",
            "reverse_logic"
        ]

        mod_type = random.choice(modification_types)
        lines = solution.split('\n')

        if_statements = []
        for i, line in enumerate(lines):
            if re.search(r'\bif\b|\bwhile\b|\bfor\b', line):
                if_statements.append((i, line))

        if if_statements:
            # Mutate a randomly chosen conditional or loop header.
            idx, line = random.choice(if_statements)

            if mod_type == "change_comparison":
                comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="}
                for op, new_op in comparisons.items():
                    if op in line:
                        lines[idx] = line.replace(op, new_op, 1)
                        break

            elif mod_type == "invert_condition":
                if "not" in line:
                    lines[idx] = line.replace("not ", "", 1)
                else:
                    match = re.search(r'(if|while)\s+([^:]+):', line)
                    if match:
                        condition = match.group(2)
                        lines[idx] = line.replace(condition, f"not ({condition})", 1)

            elif mod_type == "off_by_one":
                for op in ["+", "-"]:
                    if op in line:
                        match = re.search(f'\\{op}\\s*(\\d+)', line)
                        if match:
                            num = int(match.group(1))
                            new_num = num + 1 if op == "+" else max(0, num - 1)
                            lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1)
                            break

            elif mod_type == "change_operator":
                operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"}
                for op, new_op in operators.items():
                    if f" {op} " in line:
                        lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1)
                        break

            elif mod_type == "reverse_logic":
                if " and " in line:
                    parts = line.split(" and ")
                    lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1)
                elif " or " in line:
                    parts = line.split(" or ")
                    lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1)

        else:
            # No conditionals: fall back to corrupting an assignment.
            assignments = []
            for i, line in enumerate(lines):
                if "=" in line and "==" not in line and "!=" not in line:
                    assignments.append((i, line))

            if not assignments:
                # Nothing safe to mutate; leave the code unchanged.
                return

            idx, line = random.choice(assignments)

            if "+" in line:
                lines[idx] = line.replace("+", "-", 1)
            elif "-" in line:
                lines[idx] = line.replace("-", "+", 1)
            elif "*" in line:
                lines[idx] = line.replace("*", "/", 1)
            elif "/" in line:
                lines[idx] = line.replace("/", "*", 1)
            else:
                match = re.search(r'=\s*(\d+)', line)
                if match:
                    num = int(match.group(1))
                    new_num = num + random.choice([-1, 1]) * random.randint(1, 3)
                    lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1)

        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code

        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []

        state.code_context["bugs"].append({
            "type": "logical",
            "line": idx + 1,
            "description": f"Logical error introduced in line {idx + 1}: {mod_type}"
        })
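
    # Example of the kind of mutation this produces (illustrative):
    #
    #     before:  if len(items) >= limit:
    #     after:   if len(items) <= limit:   # "change_comparison"
    #
    # The bug record stored in code_context["bugs"] lets later evolutions and
    # feedback refer back to what was injected and where.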

    def _add_performance_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
        """
        Add a performance issue to the solution code.

        Args:
            state: The problem state to modify
            solution: The current solution
            parsed_solution: The parsed AST of the solution
        """
        lines = solution.split('\n')

        # Find loops to degrade.
        loops = []
        for i, line in enumerate(lines):
            if re.search(r'\bfor\b|\bwhile\b', line):
                loops.append((i, line))

        if loops:
            idx, line = random.choice(loops)

            mod_type = random.choice([
                "add_nested_loop",
                "replace_efficient_operation",
                "add_redundant_computation"
            ])

            if mod_type == "add_nested_loop":
                indent = len(line) - len(line.lstrip())
                indent_str = ' ' * indent
                loop_body_indent = indent_str + '    '

                # Find where the loop body ends.
                end_idx = idx + 1
                while end_idx < len(lines) and (not lines[end_idx].strip() or len(lines[end_idx]) - len(lines[end_idx].lstrip()) > indent):
                    end_idx += 1

                insert_pos = end_idx
                lines.insert(insert_pos, f"{loop_body_indent}for _ in range(100):  # Unnecessary loop")
                lines.insert(insert_pos + 1, f"{loop_body_indent}    pass")

            elif mod_type == "replace_efficient_operation":
                # Swap an append/extend for a slower list concatenation.
                for i in range(idx + 1, min(idx + 10, len(lines))):
                    if "append" in lines[i] or "extend" in lines[i]:
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent
                        match = re.search(r'(\w+)\.(append|extend)', lines[i])
                        if match:
                            list_name = match.group(1)
                            operation = match.group(2)
                            item = lines[i].split(f"{list_name}.{operation}(")[1].split(")")[0]

                            if operation == "append":
                                lines[i] = f"{indent_str}{list_name} = {list_name} + [{item}]  # Less efficient than append"
                            elif operation == "extend":
                                lines[i] = f"{indent_str}{list_name} = {list_name} + {item}  # Less efficient than extend"
                            break

            elif mod_type == "add_redundant_computation":
                # Insert pointless work at the top of the loop body.
                if idx + 1 < len(lines):
                    body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip())
                    body_indent_str = ' ' * body_indent

                    lines.insert(idx + 1, f"{body_indent_str}temp = []  # Redundant computation")
                    lines.insert(idx + 2, f"{body_indent_str}for i in range(1000):")
                    lines.insert(idx + 3, f"{body_indent_str}    temp.append(i)")
                    lines.insert(idx + 4, f"{body_indent_str}    temp.sort()  # Unnecessary sort in each iteration")

        else:
            # No loops: degrade a function body instead.
            function_defs = []
            for i, line in enumerate(lines):
                if line.strip().startswith("def "):
                    function_defs.append((i, line))

            if not function_defs:
                # Nothing to degrade; leave the code unchanged.
                return

            idx, line = random.choice(function_defs)

            if idx + 1 < len(lines):
                body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip())
                body_indent_str = ' ' * body_indent

                lines.insert(idx + 1, f"{body_indent_str}# Inefficient data structure usage")
                lines.insert(idx + 2, f"{body_indent_str}data = []")
                lines.insert(idx + 3, f"{body_indent_str}for i in range(1000):")
                lines.insert(idx + 4, f"{body_indent_str}    data.append(i)")
                lines.insert(idx + 5, f"{body_indent_str}    # Inefficient search operation")
                lines.insert(idx + 6, f"{body_indent_str}    if i in data:  # Linear search instead of using a set")
                lines.insert(idx + 7, f"{body_indent_str}        pass")

        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code

        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []

        state.code_context["bugs"].append({
            "type": "performance",
            "line": idx + 1,
            "description": f"Performance issue introduced around line {idx + 1}"
        })

    def _add_edge_case_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
        """
        Add an edge case issue to the solution code.

        Args:
            state: The problem state to modify
            solution: The current solution
            parsed_solution: The parsed AST of the solution
        """
        lines = solution.split('\n')

        # Locate function definitions and their line ranges.
        functions = []
        current_func = None
        func_start = None
        for i, line in enumerate(lines):
            if line.strip().startswith("def "):
                if current_func:
                    functions.append((func_start, i - 1, current_func))
                current_func = line.strip()[4:].split("(")[0]
                func_start = i
            elif i == len(lines) - 1 and current_func:
                functions.append((func_start, i, current_func))

        if not functions:
            # No functions to target; leave the code unchanged.
            return

        start_idx, end_idx, func_name = random.choice(functions)

        mod_type = random.choice([
            "remove_boundary_check",
            "introduce_zero_division",
            "handling_empty_input",
            "type_assumption"
        ])

        if mod_type == "remove_boundary_check":
            for i in range(start_idx, end_idx + 1):
                if re.search(r'if\s+.*(?:len|count|size|length|empty|<=|>=|<|>|!=)', lines[i]):
                    # Record the indentation before commenting the line out.
                    indent = len(lines[i]) - len(lines[i].lstrip())
                    body_indent = indent + 4
                    lines[i] = f"# {lines[i]}  # Boundary check removed"

                    # Comment out the check's body as well.
                    j = i + 1
                    while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent):
                        lines[j] = f"# {lines[j]}"
                        j += 1
                    break

        elif mod_type == "introduce_zero_division":
            for i in range(start_idx, end_idx + 1):
                if "/" in lines[i] and "try" not in lines[i] and "except" not in lines[i]:
                    if re.search(r'if\s+.*(?:!=\s*0|>\s*0)', lines[i]):
                        lines[i] = f"# {lines[i]}  # Denominator check removed"
                    else:
                        match = re.search(r'(\w+)\s*/\s*(\w+)', lines[i])
                        if match:
                            denominator = match.group(2)
                            indent = len(lines[i]) - len(lines[i].lstrip())
                            indent_str = ' ' * indent
                            lines.insert(i, f"{indent_str}if random.random() < 0.1:  # Introduce potential zero division")
                            lines.insert(i + 1, f"{indent_str}    {denominator} = 0")
                    break

        elif mod_type == "handling_empty_input":
            params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx])
            if params and params.group(1):
                param_list = [p.strip() for p in params.group(1).split(",")]
                if param_list:
                    param = param_list[0].split("=")[0].strip()

                    for i in range(start_idx + 1, end_idx + 1):
                        if re.search(rf'if\s+.*(?:not\s+{param}|len\s*\(\s*{param}\s*\)\s*==\s*0)', lines[i]):
                            # Record the indentation before commenting the line out.
                            indent = len(lines[i]) - len(lines[i].lstrip())
                            body_indent = indent + 4
                            lines[i] = f"# {lines[i]}  # Empty input check removed"

                            j = i + 1
                            while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent):
                                lines[j] = f"# {lines[j]}"
                                j += 1
                            break

        elif mod_type == "type_assumption":
            params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx])
            if params and params.group(1):
                param_list = [p.strip() for p in params.group(1).split(",")]
                if param_list:
                    param = param_list[0].split("=")[0].strip()

                    type_check_found = False
                    for i in range(start_idx + 1, end_idx + 1):
                        if re.search(rf'(?:isinstance|type)\s*\(\s*{param}\s*,', lines[i]):
                            lines[i] = f"# {lines[i]}  # Type check removed"
                            type_check_found = True
                            break

                    if not type_check_found:
                        # Infer the body indentation from the first non-blank line.
                        indent = 4
                        for i in range(start_idx + 1, min(start_idx + 5, end_idx + 1)):
                            if lines[i].strip():
                                indent = len(lines[i]) - len(lines[i].lstrip())
                                break

                        indent_str = ' ' * indent

                        lines.insert(start_idx + 1, f"{indent_str}# Assuming {param} is a specific type without checking")
                        lines.insert(start_idx + 2, f"{indent_str}{param}_length = len({param})  # Will fail if {param} doesn't support len()")

        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code

        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []

        state.code_context["bugs"].append({
            "type": "edge_case",
            "line": start_idx + 1,
            "description": f"Edge case issue introduced in function '{func_name}': {mod_type}"
        })

    def _generate_new_requirement(self, state: ProblemState, solution: str) -> Dict[str, Any]:
        """
        Generate a new requirement based on the current state and solution.

        Args:
            state: The current problem state
            solution: The current solution

        Returns:
            A new requirement dictionary
        """
        function_names = re.findall(r'def\s+(\w+)', solution)
        variable_names = re.findall(r'(\w+)\s*=', solution)

        req_type = random.choice([
            "edge_case_handling",
            "performance_improvement",
            "error_handling",
            "type_checking",
            "feature_addition"
        ])

        if req_type == "edge_case_handling":
            if function_names:
                func_name = random.choice(function_names)
                edge_cases = [
                    "empty input",
                    "negative values",
                    "zero values",
                    "extremely large values",
                    "special characters",
                    "duplicate values"
                ]
                edge_case = random.choice(edge_cases)
                return {
                    "type": "edge_case_handling",
                    "description": f"The function '{func_name}' should handle {edge_case} correctly.",
                    "difficulty": random.uniform(0.3, 0.7)
                }

        elif req_type == "performance_improvement":
            return {
                "type": "performance_improvement",
                "description": "The solution should be optimized to run in O(n) time or better.",
                "difficulty": random.uniform(0.4, 0.8)
            }

        elif req_type == "error_handling":
            error_types = [
                "invalid input",
                "division by zero",
                "file not found",
                "network timeout",
                "permission denied"
            ]
            error_type = random.choice(error_types)
            return {
                "type": "error_handling",
                "description": f"The code should handle {error_type} errors gracefully.",
                "difficulty": random.uniform(0.2, 0.6)
            }

        elif req_type == "type_checking":
            if function_names:
                func_name = random.choice(function_names)
                return {
                    "type": "type_checking",
                    "description": f"The function '{func_name}' should validate input types before processing.",
                    "difficulty": random.uniform(0.1, 0.5)
                }

        elif req_type == "feature_addition":
            features = [
                "logging capability",
                "progress tracking",
                "caching for repeated operations",
                "parameter validation",
                "configuration options"
            ]
            feature = random.choice(features)
            return {
                "type": "feature_addition",
                "description": f"Add {feature} to the solution.",
                "difficulty": random.uniform(0.3, 0.7)
            }

        return {
            "type": "general_improvement",
            "description": "Improve the overall code quality and readability.",
            "difficulty": random.uniform(0.1, 0.4)
        }

    def _modify_requirement(self, requirement: Dict[str, Any], state: ProblemState, solution: str) -> Dict[str, Any]:
        """
        Modify an existing requirement to make it more challenging.

        Args:
            requirement: The requirement to modify
            state: The current problem state
            solution: The current solution

        Returns:
            The modified requirement
        """
        modified_req = copy.deepcopy(requirement)

        modified_req["difficulty"] = min(1.0, requirement.get("difficulty", 0.3) + random.uniform(0.1, 0.3))

        if requirement["type"] == "edge_case_handling":
            modified_req["description"] += " Additionally, it should handle very large inputs efficiently."

        elif requirement["type"] == "performance_improvement":
            modified_req["description"] = modified_req["description"].replace("O(n)", "O(log n)")

        elif requirement["type"] == "error_handling":
            modified_req["description"] += " And provide detailed error messages for debugging."

        elif requirement["type"] == "type_checking":
            modified_req["description"] += " And automatically convert types when possible."

        elif requirement["type"] == "feature_addition":
            modified_req["description"] += " Ensure this feature is configurable via parameters."

        else:
            modified_req["description"] += " The code should also be well-documented with comments."

        return modified_req

    def _add_edge_case_requirement(self, state: ProblemState) -> None:
        """
        Add a requirement for handling edge cases.

        Args:
            state: The problem state to modify
        """
        edge_cases = [
            "empty collections",
            "null/None values",
            "boundary values (min/max)",
            "negative numbers",
            "special characters",
            "Unicode characters",
            "very large inputs",
            "malformed input"
        ]

        edge_case = random.choice(edge_cases)

        state.requirements.append({
            "type": "edge_case_handling",
            "description": f"The solution must correctly handle {edge_case}.",
            "difficulty": random.uniform(0.3, 0.7)
        })

        if "tests" in state.code_context:
            test_template = self._generate_edge_case_test(edge_case, state.code_context)
            if test_template:
                state.code_context["tests"].append({
                    "name": f"test_edge_case_{len(state.code_context['tests'])}",
                    "content": test_template,
                    "description": f"Test handling of {edge_case}"
                })

    def _increase_data_volume(self, state: ProblemState, solution: str) -> None:
        """
        Modify the problem to require handling larger data volumes.

        Args:
            state: The problem state to modify
            solution: The current solution
        """
        state.requirements.append({
            "type": "scalability",
            "description": "The solution must efficiently handle large datasets (10,000+ items).",
            "difficulty": random.uniform(0.5, 0.8)
        })

        if "tests" in state.code_context:
            for i, test in enumerate(state.code_context["tests"]):
                content = test["content"]

                for pattern, replacement in [
                    (r'\[[^\]]{0,50}\]', '[random.randint(0, 1000) for _ in range(10000)]'),
                    (r'range\(\d+\)', 'range(10000)'),
                    (r'"[^"]{0,20}"', '"' + 'a' * 10000 + '"')
                ]:
                    match = re.search(pattern, content)
                    if match and random.random() < 0.3:
                        content = content.replace(match.group(0), replacement, 1)
                        break

                state.code_context["tests"][i]["content"] = content
                state.code_context["tests"][i]["description"] = f"{test.get('description', 'Test')} (with large data)"

    def _add_performance_constraint(self, state: ProblemState, solution: str) -> None:
        """
        Add a performance constraint to the problem.

        Args:
            state: The problem state to modify
            solution: The current solution
        """
        constraints = [
            "linear time complexity (O(n))",
            "logarithmic time complexity (O(log n))",
            "constant memory usage (O(1) space)",
            "execution time under 100ms for large inputs",
            "minimal function calls"
        ]

        constraint = random.choice(constraints)

        state.requirements.append({
            "type": "performance",
            "description": f"The solution must achieve {constraint}.",
            "difficulty": random.uniform(0.6, 0.9)
        })

        if "tests" in state.code_context:
            perf_test = self._generate_performance_test(constraint, state.code_context)
            if perf_test:
                state.code_context["tests"].append({
                    "name": f"test_performance_{len(state.code_context['tests'])}",
                    "content": perf_test,
                    "description": f"Test {constraint}"
                })

    def _expand_functionality(self, state: ProblemState, solution: str) -> None:
        """
        Expand the required functionality of the solution.

        Args:
            state: The problem state to modify
            solution: The current solution
        """
        expansions = [
            "support for different input types",
            "parameterized behavior",
            "additional output formats",
            "flexible error handling",
            "integration with external systems"
        ]

        expansion = random.choice(expansions)

        state.requirements.append({
            "type": "functionality",
            "description": f"Expand the solution to include {expansion}.",
            "difficulty": random.uniform(0.4, 0.8)
        })

        if "tests" in state.code_context:
            test_template = self._generate_functionality_test(expansion, state.code_context)
            if test_template:
                state.code_context["tests"].append({
                    "name": f"test_expanded_functionality_{len(state.code_context['tests'])}",
                    "content": test_template,
                    "description": f"Test {expansion}"
                })

    def _generate_default_test(self) -> str:
        """
        Generate a default test based on the current problem state.

        Returns:
            A default test script
        """
        return """
import unittest
import sys
import os

# Add the directory containing the solution to the path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import the solution
from solution import *

class DefaultTest(unittest.TestCase):
    def test_basic_functionality(self):
        # A basic test that should pass if the solution is correct
        self.assertTrue(True, "Basic assertion failed")

    def test_expected_output(self):
        # Test expected output of main functions
        # This will need to be updated based on the specific problem
        pass

if __name__ == '__main__':
    unittest.main()
"""
| | def _generate_edge_case_test(self, edge_case: str, code_context: Dict[str, Any]) -> str: |
| | """ |
| | Generate a test for an edge case. |
| | |
| | Args: |
| | edge_case: The edge case to test |
| | code_context: The code context containing information about the problem |
| | |
| | Returns: |
| | A test script for the edge case |
| | """ |
| | |
| | function_names = [] |
| | if "code" in code_context: |
| | function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
| | |
| | if not function_names: |
| | return None |
| | |
| | |
| | function_name = random.choice(function_names) |
| | |
| | |
| | if edge_case == "empty collections": |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class EmptyCollectionTest(unittest.TestCase): |
| | def test_empty_input(self): |
| | # Test with empty list |
| | result = {function_name}([]) |
| | self.assertIsNotNone(result, "Function should handle empty list") |
| | |
| | # Test with empty string |
| | result = {function_name}("") |
| | self.assertIsNotNone(result, "Function should handle empty string") |
| | |
| | # Test with empty dict |
| | result = {function_name}({{}}) |
| | self.assertIsNotNone(result, "Function should handle empty dict") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif edge_case == "null/None values": |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class NoneValueTest(unittest.TestCase): |
| | def test_none_input(self): |
| | # Test with None as input |
| | result = {function_name}(None) |
| | self.assertIsNotNone(result, "Function should handle None input") |
| | |
| | # Test with list containing None |
| | result = {function_name}([1, None, 3]) |
| | self.assertIsNotNone(result, "Function should handle list with None values") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif edge_case == "boundary values (min/max)": |
| | return f""" |
| | # recursive_swe_bench/task_generators/bug_fixing.py (completion) |
| | |
| | import unittest |
| | import sys |
| | import os |
| | import sys |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class BoundaryValueTest(unittest.TestCase): |
| | def test_min_max_values(self): |
| | # Test with minimum integer |
| | min_int = -sys.maxsize - 1 |
| | result = {function_name}(min_int) |
| | self.assertIsNotNone(result, "Function should handle minimum integer") |
| | |
| | # Test with maximum integer |
| | max_int = sys.maxsize |
| | result = {function_name}(max_int) |
| | self.assertIsNotNone(result, "Function should handle maximum integer") |
| | |
| | # Test with very large list |
| | large_list = list(range(10000)) |
| | result = {function_name}(large_list) |
| | self.assertIsNotNone(result, "Function should handle very large inputs") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif edge_case == "negative numbers": |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class NegativeNumberTest(unittest.TestCase): |
| | def test_negative_numbers(self): |
| | # Test with negative number |
| | result = {function_name}(-1) |
| | self.assertIsNotNone(result, "Function should handle negative numbers") |
| | |
| | # Test with list of negative numbers |
| | result = {function_name}([-1, -2, -3]) |
| | self.assertIsNotNone(result, "Function should handle lists of negative numbers") |
| | |
| | # Test with mixed positive and negative |
| | result = {function_name}([-1, 0, 1]) |
| | self.assertIsNotNone(result, "Function should handle mixed positive and negative") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | else: |
| | |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class EdgeCaseTest(unittest.TestCase): |
| | def test_edge_case_{re.sub(r'[^0-9a-zA-Z_]', '_', edge_case)}(self): |
| | # Test edge case: {edge_case} |
| | # This is a placeholder test that needs to be customized for the specific edge case |
| | self.assertTrue(True, "Edge case test not implemented") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | |
| | def _generate_performance_test(self, constraint: str, code_context: Dict[str, Any]) -> str: |
| | """ |
| | Generate a performance test based on a constraint. |
| | |
| | Args: |
| | constraint: The performance constraint |
| | code_context: The code context containing information about the problem |
| | |
| | Returns: |
| | A test script for the performance constraint |
| | """ |
| | |
| | function_names = [] |
| | if "code" in code_context: |
| | function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
| | |
| | if not function_names: |
| | return None |
| | |
| | |
| | function_name = random.choice(function_names) |
| | |
| | if "time complexity" in constraint: |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | import time |
| | import random |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class PerformanceTest(unittest.TestCase): |
| | def test_time_complexity(self): |
| | # Test for {constraint} |
| | sizes = [100, 1000, 10000] |
| | times = [] |
| | |
| | for size in sizes: |
| | # Generate input of the given size |
| | input_data = [random.randint(0, 1000) for _ in range(size)] |
| | |
| | # Measure execution time |
| | start_time = time.time() |
| | {function_name}(input_data) |
| | end_time = time.time() |
| | |
| | times.append(end_time - start_time) |
| | |
| | # Check if time grows appropriately |
| | # For O(n), time should grow linearly with input size |
| | # For O(log n), time should grow logarithmically |
| | # This is a simplified check and might need adjustment |
| | if "log n" in "{constraint}": |
| | # For logarithmic time, the ratio of times should decrease |
| | ratio1 = times[1] / times[0] |
| | ratio2 = times[2] / times[1] |
| | self.assertLess(ratio2, ratio1 * 1.5, |
| | f"Growth rate appears super-logarithmic: {times}") |
| | else: # Assume linear or better |
| | # For linear time, the ratio of times should be roughly equal to ratio of sizes |
| | ratio1 = times[1] / times[0] |
| | size_ratio1 = sizes[1] / sizes[0] |
| | |
| | ratio2 = times[2] / times[1] |
| | size_ratio2 = sizes[2] / sizes[1] |
| | |
| | self.assertLess(ratio1, size_ratio1 * 1.5, |
| | f"First growth rate appears super-linear: {times}") |
| | self.assertLess(ratio2, size_ratio2 * 1.5, |
| | f"Second growth rate appears super-linear: {times}") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "execution time" in constraint: |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | import time |
| | import random |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class PerformanceTest(unittest.TestCase): |
| | def test_execution_time(self): |
| | # Test for {constraint} |
| | # Generate a large input |
| | input_data = [random.randint(0, 1000) for _ in range(10000)] |
| | |
| | # Measure execution time |
| | start_time = time.time() |
| | {function_name}(input_data) |
| | end_time = time.time() |
| | |
| | execution_time = (end_time - start_time) * 1000 # Convert to ms |
| | |
| | self.assertLess(execution_time, 100, |
| | f"Execution time exceeded 100ms: {execution_time:.2f}ms") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "memory usage" in constraint: |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | import psutil |
| | import random |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class MemoryUsageTest(unittest.TestCase): |
| | def test_memory_usage(self): |
| | # Test for {constraint} |
| | # Note: This is an approximate test and may not be accurate in all environments |
| | |
| | # Get current process |
| | process = psutil.Process(os.getpid()) |
| | |
| | # Measure memory before |
| | memory_before = process.memory_info().rss / 1024 / 1024 # MB |
| | |
| | # Generate a large input |
| | input_data = [random.randint(0, 1000) for _ in range(100000)] |
| | |
| | # Run function |
| | {function_name}(input_data) |
| | |
| | # Measure memory after |
| | memory_after = process.memory_info().rss / 1024 / 1024 # MB |
| | |
| | # Calculate memory usage |
| | memory_used = memory_after - memory_before |
| | |
| | # A crude approximation, adjust as needed |
| | self.assertLess(memory_used, 10, |
| | f"Memory usage seems high: {memory_used:.2f}MB") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | else: |
| | |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | import time |
| | import random |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class PerformanceTest(unittest.TestCase): |
| | def test_performance(self): |
| | # Test for {constraint} |
| | # This is a placeholder test that needs to be customized for the specific constraint |
| | |
| | # Generate a large input |
| | input_data = [random.randint(0, 1000) for _ in range(10000)] |
| | |
| | # Measure execution time |
| | start_time = time.time() |
| | {function_name}(input_data) |
| | end_time = time.time() |
| | |
| | execution_time = end_time - start_time |
| | |
| | # Just log the time for now |
| | print(f"Execution time: {execution_time:.4f} seconds") |
| | self.assertTrue(True, "Performance test completed") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | |
| | def _generate_functionality_test(self, expansion: str, code_context: Dict[str, Any]) -> str: |
| | """ |
| | Generate a test for expanded functionality. |
| | |
| | Args: |
| | expansion: The functionality expansion |
| | code_context: The code context containing information about the problem |
| | |
| | Returns: |
| | A test script for the expanded functionality |
| | """ |
| | |
| | function_names = [] |
| | if "code" in code_context: |
| | function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
| | |
| | if not function_names: |
| | return None |
| | |
| | |
| | function_name = random.choice(function_names) |
| | |
| | if "different input types" in expansion: |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | import json |
| | from collections import namedtuple |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class InputTypesTest(unittest.TestCase): |
| | def test_different_input_types(self): |
| | # Test with different types of inputs |
| | |
| | # Test with list |
| | list_input = [1, 2, 3] |
| | list_result = {function_name}(list_input) |
| | self.assertIsNotNone(list_result, "Function should handle list input") |
| | |
| | # Test with tuple |
| | tuple_input = (1, 2, 3) |
| | tuple_result = {function_name}(tuple_input) |
| | self.assertIsNotNone(tuple_result, "Function should handle tuple input") |
| | |
| | # Test with set |
| | set_input = {{1, 2, 3}} |
| | set_result = {function_name}(set_input) |
| | self.assertIsNotNone(set_result, "Function should handle set input") |
| | |
| | # Test with dictionary |
| | dict_input = {{"a": 1, "b": 2, "c": 3}} |
| | dict_result = {function_name}(dict_input) |
| | self.assertIsNotNone(dict_result, "Function should handle dictionary input") |
| | |
| | # Test with JSON string |
| | json_input = '{{"data": [1, 2, 3]}}' |
| | json_result = {function_name}(json_input) |
| | self.assertIsNotNone(json_result, "Function should handle JSON string") |
| | |
| | # Test with custom object |
| | Point = namedtuple('Point', ['x', 'y']) |
| | obj_input = Point(1, 2) |
| | obj_result = {function_name}(obj_input) |
| | self.assertIsNotNone(obj_result, "Function should handle custom object") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "parameterized behavior" in expansion: |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class ParameterizedTest(unittest.TestCase): |
| | def test_parameterized_behavior(self): |
| | # Test function with different parameters |
| | |
| | # Base case with default parameters |
| | base_input = [1, 2, 3] |
| | base_result = {function_name}(base_input) |
| | |
| | # The function should now accept additional parameters |
| | # These are example parameters, adjust based on the specific function |
| | |
| | # With sorting parameter |
| | try: |
| | sorted_result = {function_name}(base_input, sort=True) |
| | self.assertIsNotNone(sorted_result, "Function should handle sort parameter") |
| | except TypeError as e: |
| | self.fail(f"Function does not support sort parameter: {{e}}") |
| | |
| | # With filtering parameter |
| | try: |
| | filtered_result = {function_name}(base_input, filter_fn=lambda x: x > 1) |
| | self.assertIsNotNone(filtered_result, "Function should handle filter_fn parameter") |
| | except TypeError as e: |
| | self.fail(f"Function does not support filter_fn parameter: {{e}}") |
| | |
| | # With formatting parameter |
| | try: |
| | formatted_result = {function_name}(base_input, format="json") |
| | self.assertIsNotNone(formatted_result, "Function should handle format parameter") |
| | except TypeError as e: |
| | self.fail(f"Function does not support format parameter: {{e}}") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "additional output formats" in expansion: |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | import json |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class OutputFormatsTest(unittest.TestCase): |
| | def test_output_formats(self): |
| | # Test function with different output formats |
| | input_data = [1, 2, 3] |
| | |
| | # Original format |
| | original_result = {function_name}(input_data) |
| | |
| | # The function should now support different output formats |
| | # These are example formats, adjust based on the specific function |
| | |
| | # JSON format |
| | try: |
| | json_result = {function_name}(input_data, format="json") |
| | # Check if it's valid JSON |
| | try: |
| | json_obj = json.loads(json_result) if isinstance(json_result, str) else json_result |
| | self.assertIsNotNone(json_obj, "JSON result should be valid") |
| | except json.JSONDecodeError: |
| | self.fail("JSON result is not valid") |
| | except TypeError as e: |
| | self.fail(f"Function does not support JSON format: {{e}}") |
| | |
| | # CSV format |
| | try: |
| | csv_result = {function_name}(input_data, format="csv") |
| | self.assertIsNotNone(csv_result, "CSV result should not be None") |
| | if isinstance(csv_result, str): |
| | self.assertIn(",", csv_result, "CSV result should contain commas") |
| | except TypeError as e: |
| | self.fail(f"Function does not support CSV format: {{e}}") |
| | |
| | # XML format |
| | try: |
| | xml_result = {function_name}(input_data, format="xml") |
| | self.assertIsNotNone(xml_result, "XML result should not be None") |
| | if isinstance(xml_result, str): |
| | self.assertIn("<", xml_result, "XML result should contain tags") |
| | self.assertIn(">", xml_result, "XML result should contain tags") |
| | except TypeError as e: |
| | self.fail(f"Function does not support XML format: {{e}}") |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | else: |
| | |
| | return f""" |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {function_name} |
| | |
| | class ExpandedFunctionalityTest(unittest.TestCase): |
| | def test_expanded_functionality(self): |
| | # Test for {expansion} |
| | # This is a placeholder test that needs to be customized for the specific expansion |
| | |
| | # Basic test to verify the function exists |
| | input_data = [1, 2, 3] |
| | result = {function_name}(input_data) |
| | self.assertIsNotNone(result, "Function should return a result") |
| | |
| | # You need to add specific tests for the expanded functionality |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | |
| | def _calculate_adaptation_vector(self, solution: str, result: EvaluationResult, feedback: Feedback) -> List[float]: |
| | """ |
| | Calculate an adaptation vector based on the solution, result, and feedback. |
| | |
| | The adaptation vector encodes how the problem should evolve in future iterations, |
| | capturing dimensions like difficulty, bug type emphasis, and feedback focus. |
| | |
| | Args: |
| | solution: The current solution |
| | result: The evaluation results |
| | feedback: The feedback provided |
| | |
| | Returns: |
| | An adaptation vector (list of floats) |
| | """ |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | adaptation_vector = [0.0] * 5 |
| | |
| | |
| | if result.score > 0.95: |
| | adaptation_vector[0] = 0.2 |
| | elif result.score > 0.8: |
| | adaptation_vector[0] = 0.1 |
| | elif result.score > 0.6: |
| | adaptation_vector[0] = 0.0 |
| | elif result.score > 0.4: |
| | adaptation_vector[0] = -0.1 |
| | else: |
| | adaptation_vector[0] = -0.2 |
| | |
| | |
| | syntax_issues = sum(1 for issue in feedback.issues if issue.get("error_type") == "syntax") |
| | logical_issues = sum(1 for issue in feedback.issues if issue.get("type") == "test_failure") |
| | |
| | if syntax_issues > logical_issues: |
| | adaptation_vector[1] = -0.1 |
| | elif logical_issues > syntax_issues: |
| | adaptation_vector[1] = 0.1 |
| | |
| | |
| | if result.metrics and "execution_time" in result.metrics: |
| | if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0): |
| | adaptation_vector[2] = 0.2 |
| | else: |
| | adaptation_vector[2] = -0.1 |
| | |
| | |
| | if result.test_results: |
| | edge_case_failures = sum(1 for test_name, test_result in result.test_results.items() |
| | if not test_result["passed"] and "edge" in test_name.lower()) |
| | if edge_case_failures > 0: |
| | adaptation_vector[3] = 0.2 |
| | else: |
| | adaptation_vector[3] = 0.0 |
| | |
| | |
| | current_requirements = len(self.state.requirements) |
| | if current_requirements < 3: |
| | adaptation_vector[4] = 0.1 |
| | elif current_requirements >= 5: |
| | adaptation_vector[4] = -0.1 |
| | |
| | return adaptation_vector |
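| | # Worked example (illustrative only): with score=0.9, more logical than |
| | # syntax issues, an execution time over the threshold, one edge-case |
| | # failure, and two requirements, the vector comes out as |
| | # [0.1, 0.1, 0.2, 0.2, 0.1] - i.e. harden the task on every axis. How |
| | # each component is consumed is left to the configured evolution |
| | # strategies. |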
| |
|
| |
|
| | class DefaultTestRunner: |
| | """Default test runner for evaluating bug fixes.""" |
| | |
| | def run_tests(self, solution_file: Path, test_files: List[Path], code_context: Dict[str, Any]) -> Dict[str, Any]: |
| | """ |
| | Run tests against a solution file. |
| | |
| | Args: |
| | solution_file: Path to the solution file |
| | test_files: List of test file paths |
| | code_context: Context information about the code |
| | |
| | Returns: |
| | Dictionary of test results |
| | """ |
| | |
| | results = { |
| | "all_passed": True, |
| | "passed_tests": 0, |
| | "total_tests": 0, |
| | "tests": {}, |
| | "execution": { |
| | "success": True, |
| | "error": None, |
| | "stdout": None, |
| | "stderr": None |
| | }, |
| | "execution_time": 0.0 |
| | } |
| | |
| | |
| | try: |
| | |
| | if not solution_file.exists(): |
| | results["execution"]["success"] = False |
| | results["execution"]["error"] = "Solution file not found" |
| | results["all_passed"] = False |
| | return results |
| | |
| | |
| | import sys |
| | import importlib.util |
| | sys.path.insert(0, str(solution_file.parent)) |
| | spec = importlib.util.spec_from_file_location("solution", solution_file) |
| | solution_module = importlib.util.module_from_spec(spec) |
| | spec.loader.exec_module(solution_module) |
| | |
| | |
| | if "required_functions" in code_context: |
| | for func_name in code_context["required_functions"]: |
| | if not hasattr(solution_module, func_name): |
| | results["execution"]["success"] = False |
| | results["execution"]["error"] = f"Required function '{func_name}' not found" |
| | results["all_passed"] = False |
| | return results |
| | |
| | except Exception as e: |
| | results["execution"]["success"] = False |
| | results["execution"]["error"] = str(e) |
| | results["all_passed"] = False |
| | return results |
| | |
| | |
| | for test_file in test_files: |
| | |
| | if not test_file.exists(): |
| | continue |
| | |
| | |
| | import unittest |
| | import io |
| | from contextlib import redirect_stdout, redirect_stderr |
| | |
| | |
| | loader = unittest.TestLoader() |
| | try: |
| | tests = loader.discover(str(test_file.parent), pattern=test_file.name) |
| | |
| | |
| | test_cases = 0 |
| | for suite in tests: |
| | for test_case in suite: |
| | test_cases += test_case.countTestCases() |
| | |
| | results["total_tests"] += test_cases |
| | |
| | |
| | runner = unittest.TextTestRunner(verbosity=2) |
| | |
| | |
| | stdout_buffer = io.StringIO() |
| | stderr_buffer = io.StringIO() |
| | |
| | with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): |
| | test_result = runner.run(tests) |
| | |
| | stdout = stdout_buffer.getvalue() |
| | stderr = stderr_buffer.getvalue() |
| | |
| | |
| | if not test_result.wasSuccessful(): |
| | results["all_passed"] = False |
| | |
| | |
| | passed_tests = test_cases - len(test_result.failures) - len(test_result.errors) |
| | results["passed_tests"] += passed_tests |
| | |
| | |
| | test_name = test_file.stem |
| | results["tests"][test_name] = { |
| | "passed": test_result.wasSuccessful(), |
| | "failures": len(test_result.failures), |
| | "errors": len(test_result.errors), |
| | "skipped": len(test_result.skipped), |
| | "total": test_cases, |
| | "passed_count": passed_tests, |
| | "stdout": stdout, |
| | "stderr": stderr |
| | } |
| | |
| | |
| | for failure in test_result.failures: |
| | test_id = failure[0].id() |
| | failure_message = failure[1] |
| | |
| | |
| | import re |
| | expected_match = re.search(r'Expected\s*:(.+)', failure_message) |
| | actual_match = re.search(r'Actual\s*:(.+)', failure_message) |
| | |
| | expected = expected_match.group(1).strip() if expected_match else None |
| | actual = actual_match.group(1).strip() if actual_match else None |
| | |
| | if test_id not in results["tests"]: |
| | results["tests"][test_id] = {} |
| | |
| | results["tests"][test_id].update({ |
| | "passed": False, |
| | "message": failure_message, |
| | "expected": expected, |
| | "actual": actual |
| | }) |
| | |
| | except Exception as e: |
| | |
| | results["all_passed"] = False |
| | results["tests"][test_file.stem] = { |
| | "passed": False, |
| | "error": str(e), |
| | "failures": 1, |
| | "errors": 1, |
| | "skipped": 0, |
| | "total": 1, |
| | "passed_count": 0 |
| | } |
| | results["total_tests"] += 1 |
| | |
| | return results |
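| | # Minimal usage sketch (paths and test names are hypothetical, shown for |
| | # orientation only): |
| | #     runner = DefaultTestRunner() |
| | #     results = runner.run_tests( |
| | #         solution_file=Path("workdir/solution.py"), |
| | #         test_files=[Path("workdir/test_calculate_sum.py")], |
| | #         code_context={"required_functions": ["calculate_sum"]}, |
| | #     ) |
| | #     print(results["passed_tests"], "/", results["total_tests"]) |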
| |
|
| |
|
| | class BugFixingTaskGenerator: |
| | """Generator for bug fixing tasks.""" |
| | |
| | def __init__(self, config: Dict[str, Any] = None): |
| | """ |
| | Initialize the bug fixing task generator. |
| | |
| | Args: |
| | config: Configuration options |
| | """ |
| | self.config = config or {} |
| | self.difficulty_levels = self.config.get( |
| | "difficulty_levels", |
| | ["easy", "medium", "hard", "expert"] |
| | ) |
| | self.bug_categories = self.config.get( |
| | "bug_categories", |
| | [ |
| | BugCategory.SYNTAX, |
| | BugCategory.LOGICAL, |
| | BugCategory.EDGE_CASE, |
| | BugCategory.PERFORMANCE |
| | ] |
| | ) |
| | self.test_templates = self._load_test_templates() |
| | |
| | def generate_task(self, difficulty: str = None, bug_categories: List[str] = None) -> BugFixingTask: |
| | """ |
| | Generate a new bug fixing task. |
| | |
| | Args: |
| | difficulty: The difficulty level (easy, medium, hard, expert) |
| | bug_categories: List of bug categories to include |
| | |
| | Returns: |
| | A new bug fixing task |
| | """ |
| | |
| | if difficulty is None: |
| | difficulty = random.choice(self.difficulty_levels) |
| | |
| | |
| | if bug_categories is None: |
| | num_categories = random.randint(1, 3) |
| | bug_categories = random.sample(self.bug_categories, num_categories) |
| | |
| | |
| | problem_state = self._generate_problem_state(difficulty, bug_categories) |
| | |
| | |
| | task_config = { |
| | "difficulty": difficulty, |
| | "bug_categories": bug_categories, |
| | "convergence_criteria": { |
| | "score_threshold": 0.95, |
| | "min_iterations": 1, |
| | "max_iterations": self.config.get("max_iterations", 5), |
| | "score_delta_threshold": 0.05, |
| | "consecutive_plateau_limit": 2 |
| | }, |
| | "score_weights": { |
| | "test": 0.7, |
| | "execution": 0.3 |
| | }, |
| | "performance_threshold": 1.0, |
| | "complexity_threshold": 0.7 |
| | } |
| | |
| | |
| | return BugFixingTask(problem_state, task_config) |
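| | # Usage sketch (illustrative; category names come from BugCategory above): |
| | #     generator = BugFixingTaskGenerator() |
| | #     task = generator.generate_task( |
| | #         difficulty="medium", |
| | #         bug_categories=[BugCategory.LOGICAL, BugCategory.EDGE_CASE], |
| | #     ) |
| | #     print(task.state.description) |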
| | |
| | def _generate_problem_state(self, difficulty: str, bug_categories: List[str]) -> ProblemState: |
| | """ |
| | Generate a problem state for the given difficulty and bug categories. |
| | |
| | Args: |
| | difficulty: The difficulty level |
| | bug_categories: List of bug categories |
| | |
| | Returns: |
| | A problem state for the task |
| | """ |
| | |
| | template = self._choose_template(difficulty, bug_categories) |
| | |
| | |
| | problem_state = copy.deepcopy(template) |
| | |
| | |
| | problem_state.problem_id = str(uuid.uuid4()) |
| | |
| | |
| | problem_state.evolution_stage = 0 |
| | problem_state.adaptation_vector = [0.0] * 5 |
| | |
| | |
| | difficulty_values = { |
| | "easy": 0.25, |
| | "medium": 0.5, |
| | "hard": 0.75, |
| | "expert": 0.9 |
| | } |
| | problem_state.difficulty = difficulty_values.get(difficulty, 0.5) |
| | |
| | |
| | for category in bug_categories: |
| | self._insert_bug(problem_state, category) |
| | |
| | |
| | problem_state.description = self._generate_description(problem_state) |
| | |
| | return problem_state |
| | |
| | def _choose_template(self, difficulty: str, bug_categories: List[str]) -> ProblemState: |
| | """ |
| | Choose a template that matches the difficulty and bug categories. |
| | |
| | Args: |
| | difficulty: The difficulty level |
| | bug_categories: List of bug categories |
| | |
| | Returns: |
| | A template problem state |
| | """ |
| | |
| | |
| | |
| | |
| | code = self._generate_template_code(difficulty, bug_categories) |
| | tests = self._generate_template_tests(code) |
| | |
| | |
| | return ProblemState( |
| | problem_id="template", |
| | description="Fix the bugs in the given code.", |
| | code_context={ |
| | "code": code, |
| | "tests": tests, |
| | "bug_count": 0, |
| | "bug_categories": [] |
| | }, |
| | requirements=[ |
| | { |
| | "type": "functional", |
| | "description": "The code should pass all the provided tests.", |
| | "difficulty": 0.3 |
| | } |
| | ], |
| | difficulty=0.5, |
| | evolution_stage=0, |
| | adaptation_vector=[0.0] * 5 |
| | ) |
| | |
| | def _generate_template_code(self, difficulty: str, bug_categories: List[str]) -> str: |
| | """ |
| | Generate template code based on difficulty and bug categories. |
| | |
| | Args: |
| | difficulty: The difficulty level |
| | bug_categories: List of bug categories |
| | |
| | Returns: |
| | Template code |
| | """ |
| | |
| | templates = { |
| | "easy": """ |
| | def calculate_sum(numbers): |
| | \"\"\"Calculate the sum of a list of numbers.\"\"\" |
| | total = 0 |
| | for num in numbers: |
| | total += num |
| | return total |
| | |
| | def calculate_average(numbers): |
| | \"\"\"Calculate the average of a list of numbers.\"\"\" |
| | if not numbers: |
| | return 0 |
| | return calculate_sum(numbers) / len(numbers) |
| | """, |
| | "medium": """ |
| | def find_most_frequent(items): |
| | \"\"\"Find the most frequently occurring item in a list.\"\"\" |
| | if not items: |
| | return None |
| | |
| | counts = {} |
| | for item in items: |
| | if item in counts: |
| | counts[item] += 1 |
| | else: |
| | counts[item] = 1 |
| | |
| | max_count = 0 |
| | max_item = None |
| | for item, count in counts.items(): |
| | if count > max_count: |
| | max_count = count |
| | max_item = item |
| | |
| | return max_item |
| | |
| | def binary_search(sorted_list, target): |
| | """Perform binary search on a sorted list.""" |
| | left = 0 |
| | right = len(sorted_list) - 1 |
| | |
| | while left <= right: |
| | mid = (left + right) // 2 |
| | if sorted_list[mid] == target: |
| | return mid |
| | elif sorted_list[mid] < target: |
| | left = mid + 1 |
| | else: |
| | right = mid - 1 |
| | |
| | return -1 # Target not found |
| | """, |
| | "hard": """ |
| | def merge_sort(arr): |
| | """Sort an array using the merge sort algorithm.""" |
| | if len(arr) <= 1: |
| | return arr |
| | |
| | # Split the array into two halves |
| | mid = len(arr) // 2 |
| | left_half = arr[:mid] |
| | right_half = arr[mid:] |
| | |
| | # Recursively sort both halves |
| | left_half = merge_sort(left_half) |
| | right_half = merge_sort(right_half) |
| | |
| | # Merge the sorted halves |
| | return merge(left_half, right_half) |
| | |
| | def merge(left, right): |
| | """Merge two sorted arrays.""" |
| | result = [] |
| | i = j = 0 |
| | |
| | # Compare elements from both arrays and add the smaller one to the result |
| | while i < len(left) and j < len(right): |
| | if left[i] <= right[j]: |
| | result.append(left[i]) |
| | i += 1 |
| | else: |
| | result.append(right[j]) |
| | j += 1 |
| | |
| | # Add any remaining elements |
| | result.extend(left[i:]) |
| | result.extend(right[j:]) |
| | |
| | return result |
| | |
| | def quicksort(arr): |
| | """Sort an array using the quicksort algorithm.""" |
| | if len(arr) <= 1: |
| | return arr |
| | |
| | # Choose the pivot (using the first element for simplicity) |
| | pivot = arr[0] |
| | |
| | # Partition the array |
| | less = [x for x in arr[1:] if x <= pivot] |
| | greater = [x for x in arr[1:] if x > pivot] |
| | |
| | # Recursively sort the partitions and combine |
| | return quicksort(less) + [pivot] + quicksort(greater) |
| | """, |
| | "expert": """ |
| | class Node: |
| | """Node in a binary tree.""" |
| | def __init__(self, value): |
| | self.value = value |
| | self.left = None |
| | self.right = None |
| | |
| | def build_binary_tree(values): |
| | """Build a binary tree from a list of values.""" |
| | if not values: |
| | return None |
| | |
| | root = Node(values[0]) |
| | queue = [root] |
| | i = 1 |
| | |
| | while queue and i < len(values): |
| | node = queue.pop(0) |
| | |
| | # Add left child |
| | if i < len(values) and values[i] is not None: |
| | node.left = Node(values[i]) |
| | queue.append(node.left) |
| | i += 1 |
| | |
| | # Add right child |
| | if i < len(values) and values[i] is not None: |
| | node.right = Node(values[i]) |
| | queue.append(node.right) |
| | i += 1 |
| | |
| | return root |
| | |
| | def is_balanced(root): |
| | """Check if a binary tree is balanced.""" |
| | def height(node): |
| | if not node: |
| | return 0 |
| | return max(height(node.left), height(node.right)) + 1 |
| | |
| | def is_balanced_helper(node): |
| | if not node: |
| | return True |
| | |
| | left_height = height(node.left) |
| | right_height = height(node.right) |
| | |
| | if abs(left_height - right_height) > 1: |
| | return False |
| | |
| | return is_balanced_helper(node.left) and is_balanced_helper(node.right) |
| | |
| | return is_balanced_helper(root) |
| | |
| | def find_lca(root, p, q): |
| | """Find the lowest common ancestor of two nodes in a binary tree.""" |
| | if not root: |
| | return None |
| | |
| | if root.value == p or root.value == q: |
| | return root |
| | |
| | left_lca = find_lca(root.left, p, q) |
| | right_lca = find_lca(root.right, p, q) |
| | |
| | if left_lca and right_lca: |
| | return root |
| | |
| | return left_lca if left_lca else right_lca |
| | """ |
| | } |
| | |
| | |
| | if difficulty in templates: |
| | return templates[difficulty] |
| | else: |
| | return templates["medium"] |
| | |
| | def _generate_template_tests(self, code: str) -> List[Dict[str, Any]]: |
| | """ |
| | Generate template tests based on the code. |
| | |
| | Args: |
| | code: The template code |
| | |
| | Returns: |
| | List of test dictionaries |
| | """ |
| | |
| | function_names = re.findall(r'def\s+(\w+)', code) |
| | |
| | |
| | tests = [] |
| | for func_name in function_names: |
| | test_content = self._generate_test_for_function(func_name) |
| | if test_content: |
| | tests.append({ |
| | "name": f"test_{func_name}", |
| | "content": test_content, |
| | "description": f"Test for {func_name} function" |
| | }) |
| | |
| | return tests |
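| | # Sketch of the returned structure for the "easy" template (illustrative): |
| | #     [{"name": "test_calculate_sum", "content": "<unittest script>", |
| | #       "description": "Test for calculate_sum function"}, |
| | #      {"name": "test_calculate_average", ...}] |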
| | |
| | def _generate_test_for_function(self, func_name: str) -> str: |
| | """ |
| | Generate a test for a specific function. |
| | |
| | Args: |
| | func_name: The name of the function to test |
| | |
| | Returns: |
| | Test content |
| | """ |
| | |
| | if func_name in self.test_templates: |
| | return self.test_templates[func_name] |
| | |
| | |
| | if "sum" in func_name.lower(): |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import calculate_sum |
| | |
| | class TestCalculateSum(unittest.TestCase): |
| | def test_calculate_sum(self): |
| | self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15) |
| | self.assertEqual(calculate_sum([]), 0) |
| | self.assertEqual(calculate_sum([-1, -2, -3]), -6) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "average" in func_name.lower(): |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import calculate_average |
| | |
| | class TestCalculateAverage(unittest.TestCase): |
| | def test_calculate_average(self): |
| | self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3) |
| | self.assertEqual(calculate_average([]), 0) |
| | self.assertEqual(calculate_average([10]), 10) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "frequent" in func_name.lower(): |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import find_most_frequent |
| | |
| | class TestFindMostFrequent(unittest.TestCase): |
| | def test_find_most_frequent(self): |
| | self.assertEqual(find_most_frequent([1, 2, 2, 3, 3, 3, 4]), 3) |
| | self.assertEqual(find_most_frequent(['a', 'b', 'a', 'c', 'a']), 'a') |
| | self.assertIsNone(find_most_frequent([])) |
| | self.assertEqual(find_most_frequent([5]), 5) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "search" in func_name.lower(): |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import binary_search |
| | |
| | class TestBinarySearch(unittest.TestCase): |
| | def test_binary_search(self): |
| | self.assertEqual(binary_search([1, 2, 3, 4, 5], 3), 2) |
| | self.assertEqual(binary_search([1, 2, 3, 4, 5], 1), 0) |
| | self.assertEqual(binary_search([1, 2, 3, 4, 5], 5), 4) |
| | self.assertEqual(binary_search([1, 2, 3, 4, 5], 6), -1) |
| | self.assertEqual(binary_search([], 5), -1) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "sort" in func_name.lower(): |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {0} |
| | |
| | class Test{1}(unittest.TestCase): |
| | def test_sorting(self): |
| | self.assertEqual({0}([]), []) |
| | self.assertEqual({0}([1]), [1]) |
| | self.assertEqual({0}([3, 1, 4, 1, 5, 9, 2, 6, 5]), [1, 1, 2, 3, 4, 5, 5, 6, 9]) |
| | self.assertEqual({0}([9, 8, 7, 6, 5, 4, 3, 2, 1]), [1, 2, 3, 4, 5, 6, 7, 8, 9]) |
| | self.assertEqual({0}([1, 1, 1, 1]), [1, 1, 1, 1]) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """.format(func_name, func_name.title()) |
| | elif "balanced" in func_name.lower(): |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import Node, is_balanced |
| | |
| | class TestIsBalanced(unittest.TestCase): |
| | def test_is_balanced(self): |
| | # Create a balanced tree |
| | # 1 |
| | # / \\ |
| | # 2 3 |
| | # / \\ / \\ |
| | # 4 5 6 7 |
| | root = Node(1) |
| | root.left = Node(2) |
| | root.right = Node(3) |
| | root.left.left = Node(4) |
| | root.left.right = Node(5) |
| | root.right.left = Node(6) |
| | root.right.right = Node(7) |
| | self.assertTrue(is_balanced(root)) |
| | |
| | # Create an unbalanced tree |
| | # 1 |
| | # / \\ |
| | # 2 3 |
| | # / \\ |
| | # 4 5 |
| | #/ |
| | #6 |
| | root = Node(1) |
| | root.left = Node(2) |
| | root.right = Node(3) |
| | root.left.left = Node(4) |
| | root.left.right = Node(5) |
| | root.left.left.left = Node(6) |
| | self.assertFalse(is_balanced(root)) |
| | |
| | # Empty tree is balanced |
| | self.assertTrue(is_balanced(None)) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "lca" in func_name.lower(): |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import Node, find_lca |
| | |
| | class TestFindLCA(unittest.TestCase): |
| | def test_find_lca(self): |
| | # Create a tree |
| | # 1 |
| | # / \\ |
| | # 2 3 |
| | # / \\ / \\ |
| | # 4 5 6 7 |
| | root = Node(1) |
| | root.left = Node(2) |
| | root.right = Node(3) |
| | root.left.left = Node(4) |
| | root.left.right = Node(5) |
| | root.right.left = Node(6) |
| | root.right.right = Node(7) |
| | |
| | # Test cases |
| | self.assertEqual(find_lca(root, 4, 5).value, 2) # LCA of 4 and 5 is 2 |
| | self.assertEqual(find_lca(root, 4, 6).value, 1) # LCA of 4 and 6 is 1 |
| | self.assertEqual(find_lca(root, 3, 7).value, 3) # LCA of 3 and 7 is 3 |
| | self.assertEqual(find_lca(root, 2, 7).value, 1) # LCA of 2 and 7 is 1 |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | elif "tree" in func_name.lower(): |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import Node, build_binary_tree |
| | |
| | class TestBuildBinaryTree(unittest.TestCase): |
| | def test_build_binary_tree(self): |
| | # Test empty list |
| | self.assertIsNone(build_binary_tree([])) |
| | |
| | # Test single node |
| | root = build_binary_tree([1]) |
| | self.assertEqual(root.value, 1) |
| | self.assertIsNone(root.left) |
| | self.assertIsNone(root.right) |
| | |
| | # Test complete tree |
| | # 1 |
| | # / \\ |
| | # 2 3 |
| | # / \\ / \\ |
| | # 4 5 6 7 |
| | values = [1, 2, 3, 4, 5, 6, 7] |
| | root = build_binary_tree(values) |
| | self.assertEqual(root.value, 1) |
| | self.assertEqual(root.left.value, 2) |
| | self.assertEqual(root.right.value, 3) |
| | self.assertEqual(root.left.left.value, 4) |
| | self.assertEqual(root.left.right.value, 5) |
| | self.assertEqual(root.right.left.value, 6) |
| | self.assertEqual(root.right.right.value, 7) |
| | |
| | # Test tree with None values |
| | # 1 |
| | # / \\ |
| | # 2 3 |
| | # / / |
| | # 4 6 |
| | values = [1, 2, 3, 4, None, 6, None] |
| | root = build_binary_tree(values) |
| | self.assertEqual(root.value, 1) |
| | self.assertEqual(root.left.value, 2) |
| | self.assertEqual(root.right.value, 3) |
| | self.assertEqual(root.left.left.value, 4) |
| | self.assertIsNone(root.left.right) |
| | self.assertEqual(root.right.left.value, 6) |
| | self.assertIsNone(root.right.right) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | else: |
| | |
| | return """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import {0} |
| | |
| | class Test{1}(unittest.TestCase): |
| | def test_{0}(self): |
| | # TODO: Add specific test cases for {0} |
| | # This is a placeholder test |
| | self.assertTrue(True) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """.format(func_name, func_name.title()) |
| | |
| | def _load_test_templates(self) -> Dict[str, str]: |
| | """ |
| | Load test templates for common functions. |
| | |
| | Returns: |
| | Dictionary of test templates |
| | """ |
| | |
| | return { |
| | "calculate_sum": """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import calculate_sum |
| | |
| | class TestCalculateSum(unittest.TestCase): |
| | def test_calculate_sum(self): |
| | self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15) |
| | self.assertEqual(calculate_sum([]), 0) |
| | self.assertEqual(calculate_sum([-1, -2, -3]), -6) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """, |
| | "calculate_average": """ |
| | import unittest |
| | import sys |
| | import os |
| | |
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| | from solution import calculate_average |
| | |
| | class TestCalculateAverage(unittest.TestCase): |
| | def test_calculate_average(self): |
| | self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3) |
| | self.assertEqual(calculate_average([]), 0) |
| | self.assertEqual(calculate_average([10]), 10) |
| | |
| | if __name__ == '__main__': |
| | unittest.main() |
| | """ |
| | } |
| | |
| | def _insert_bug(self, problem_state: ProblemState, bug_category: str) -> None: |
| | """ |
| | Insert a bug of the specified category into the problem state. |
| | |
| | Args: |
| | problem_state: The problem state to modify |
| | bug_category: The category of bug to insert |
| | """ |
| | if "code" not in problem_state.code_context: |
| | return |
| | |
| | |
| | code = problem_state.code_context["code"] |
| | try: |
| | parsed_code = ast.parse(code) |
| | except SyntaxError: |
| | |
| | return |
| | |
| | |
| | if bug_category == BugCategory.SYNTAX: |
| | self._insert_syntax_bug(problem_state) |
| | elif bug_category == BugCategory.LOGICAL: |
| | self._insert_logical_bug(problem_state) |
| | elif bug_category == BugCategory.PERFORMANCE: |
| | self._insert_performance_bug(problem_state) |
| | elif bug_category == BugCategory.EDGE_CASE: |
| | self._insert_edge_case_bug(problem_state) |
| | else: |
| | |
| | self._insert_logical_bug(problem_state) |
| | |
| | |
| | if "bug_count" not in problem_state.code_context: |
| | problem_state.code_context["bug_count"] = 0 |
| | problem_state.code_context["bug_count"] += 1 |
| | |
| | if "bug_categories" not in problem_state.code_context: |
| | problem_state.code_context["bug_categories"] = [] |
| | if bug_category not in problem_state.code_context["bug_categories"]: |
| | problem_state.code_context["bug_categories"].append(bug_category) |
| | |
| | def _insert_syntax_bug(self, problem_state: ProblemState) -> None: |
| | """ |
| | Insert a syntax bug into the problem state. |
| | |
| | Args: |
| | problem_state: The problem state to modify |
| | """ |
| | code = problem_state.code_context["code"] |
| | lines = code.split('\n') |
| | if not lines: |
| | return |
| | |
| | |
| | idx = random.randint(0, len(lines) - 1) |
| | line = lines[idx] |
| | |
| | |
| | attempts = 0 |
| | while (not line.strip() or line.strip().startswith('#')) and attempts < 10: |
| | idx = random.randint(0, len(lines) - 1) |
| | line = lines[idx] |
| | attempts += 1 |
| | |
| | if attempts >= 10: |
| | |
| | for i, line in enumerate(lines): |
| | if line.strip() and not line.strip().startswith('#'): |
| | idx = i |
| | break |
| | else: |
| | return |
| | |
| | |
| | mod_type = random.choice([ |
| | "remove_character", |
| | "add_character", |
| | "swap_characters", |
| | "change_indent" |
| | ]) |
| | |
| | if mod_type == "remove_character" and line: |
| | char_idx = random.randint(0, len(line) - 1) |
| | lines[idx] = line[:char_idx] + line[char_idx+1:] |
| | |
| | elif mod_type == "add_character": |
| | char_idx = random.randint(0, len(line)) |
| | char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."]) |
| | lines[idx] = line[:char_idx] + char + line[char_idx:] |
| | |
| | elif mod_type == "swap_characters" and len(line) >= 2: |
| | char_idx = random.randint(0, len(line) - 2) |
| | lines[idx] = (line[:char_idx] + line[char_idx+1] + |
| | line[char_idx] + line[char_idx+2:]) |
| | |
| | elif mod_type == "change_indent": |
| | |
| | if line.startswith(" "): |
| | lines[idx] = line[2:] |
| | else: |
| | lines[idx] = " " + line |
| | |
| | |
| | problem_state.code_context["code"] = '\n'.join(lines) |
| | |
| | |
| | if "bugs" not in problem_state.code_context: |
| | problem_state.code_context["bugs"] = [] |
| | |
| | problem_state.code_context["bugs"].append({ |
| | "type": BugCategory.SYNTAX, |
| | "line": idx + 1, |
| | "description": f"Syntax error introduced in line {idx + 1}" |
| | }) |
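| | # Illustrative mutation (one possible outcome of "remove_character"): |
| | #     before: def calculate_sum(numbers): |
| | #     after:  def calculate_sum(numbers) |
| | # The dropped colon leaves the template with exactly the kind of |
| | # SyntaxError the solver is expected to diagnose and repair. |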
| | |
| | def _insert_logical_bug(self, problem_state: ProblemState) -> None: |
| | """ |
| | Insert a logical bug into the problem state. |
| | |
| | Args: |
| | problem_state: The problem state to modify |
| | """ |
| | code = problem_state.code_context["code"] |
| | lines = code.split('\n') |
| | if not lines: |
| | return |
| | |
| | |
| | if_statements = [] |
| | for i, line in enumerate(lines): |
| | if re.search(r'\bif\b|\bwhile\b|\bfor\b', line): |
| | if_statements.append((i, line)) |
| | |
| | |
| | mod_type = random.choice([ |
| | "change_comparison", |
| | "invert_condition", |
| | "off_by_one", |
| | "change_operator", |
| | "reverse_logic" |
| | ]) |
| | |
| | if if_statements: |
| | |
| | idx, line = random.choice(if_statements) |
| | |
| | if mod_type == "change_comparison": |
| | |
| | comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="} |
| | for op, new_op in comparisons.items(): |
| | if op in line: |
| | lines[idx] = line.replace(op, new_op, 1) |
| | break |
| | |
| | elif mod_type == "invert_condition": |
| | |
| | if "not" in line: |
| | lines[idx] = line.replace("not ", "", 1) |
| | else: |
| | match = re.search(r'(if|while)\s+([^:]+):', line) |
| | if match: |
| | condition = match.group(2) |
| | lines[idx] = line.replace(condition, f"not ({condition})", 1) |
| | |
| | elif mod_type == "off_by_one": |
| | |
| | for op in ["+", "-"]: |
| | if op in line: |
| | |
| | match = re.search(f'\\{op}\\s*(\\d+)', line) |
| | if match: |
| | num = int(match.group(1)) |
| | new_num = num + 1 if op == "+" else max(0, num - 1) |
| | lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1) |
| | break |
| | |
| | elif mod_type == "change_operator": |
| | |
| | operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"} |
| | for op, new_op in operators.items(): |
| | if f" {op} " in line: |
| | lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1) |
| | break |
| | |
| | elif mod_type == "reverse_logic": |
| | |
| | if " and " in line: |
| | parts = line.split(" and ") |
| | lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1) |
| | elif " or " in line: |
| | parts = line.split(" or ") |
| | lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1) |
| | |
| | else: |
| | |
| | |
| | assignments = [] |
| | for i, line in enumerate(lines): |
| | if "=" in line and "==" not in line and "!=" not in line: |
| | assignments.append((i, line)) |
| | |
| | if assignments: |
| | |
| | idx, line = random.choice(assignments) |
| | |
| | |
| | if "+" in line: |
| | lines[idx] = line.replace("+", "-", 1) |
| | elif "-" in line: |
| | lines[idx] = line.replace("-", "+", 1) |
| | elif "*" in line: |
| | lines[idx] = line.replace("*", "/", 1) |
| | elif "/" in line: |
| | lines[idx] = line.replace("/", "*", 1) |
| | else: |
| | |
| | match = re.search(r'=\s*(\d+)', line) |
| | if match: |
| | num = int(match.group(1)) |
| | new_num = num + random.choice([-1, 1]) * random.randint(1, 3) |
| | lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1) |
| | |
| | |
| | problem_state.code_context["code"] = '\n'.join(lines) |
| | |
| | |
| | if "bugs" not in problem_state.code_context: |
| | problem_state.code_context["bugs"] = [] |
| | |
| | problem_state.code_context["bugs"].append({ |
| | "type": BugCategory.LOGICAL, |
| | "line": idx + 1, |
| | "description": f"Logical error introduced in line {idx + 1}" |
| | }) |
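| | # Illustrative mutation (one possible outcome of "change_comparison"): |
| | #     before: if sorted_list[mid] < target: |
| | #     after:  if sorted_list[mid] > target: |
| | # The flipped comparison keeps the code syntactically valid but makes |
| | # binary_search walk the wrong half of the list. |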
| | |
| | def _insert_performance_bug(self, problem_state: ProblemState) -> None: |
| | """ |
| | Insert a performance bug into the problem state. |
| | |
| | Args: |
| | problem_state: The problem state to modify |
| | """ |
| | code = problem_state.code_context["code"] |
| | lines = code.split('\n') |
| | if not lines: |
| | return |
| | |
| | |
| | functions = [] |
| | current_func = None |
| | func_start = None |
| | for i, line in enumerate(lines): |
| | if line.strip().startswith("def "): |
| | if current_func: |
| | functions.append((func_start, i - 1, current_func)) |
| | current_func = line.strip()[4:].split("(")[0] |
| | func_start = i |
| | elif i == len(lines) - 1 and current_func: |
| | functions.append((func_start, i, current_func)) |
| | |
| | if not functions: |
| | return |
| | |
| | |
| | start_idx, end_idx, func_name = random.choice(functions) |
| | |
| | |
| | mod_type = random.choice([ |
| | "add_nested_loop", |
| | "inefficient_data_structure", |
| | "redundant_computation" |
| | ]) |
| | |
| | if mod_type == "add_nested_loop": |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if lines[i].strip(): |
| | indent = len(lines[i]) - len(lines[i].lstrip()) |
| | break |
| | else: |
| | indent = 4 |
| | |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if "for " in lines[i] or "while " in lines[i]: |
| | |
| | inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4 |
| | inner_indent_str = ' ' * inner_indent |
| | |
| | |
| | lines.insert(i + 1, f"{inner_indent_str}for _ in range(100): # Inefficient nested loop") |
| | lines.insert(i + 2, f"{inner_indent_str} pass") |
| | |
| | |
| | end_idx += 2 |
| | break |
| | else: |
| | |
| | inner_indent = indent + 4 |
| | inner_indent_str = ' ' * inner_indent |
| | |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
| | |
| | lines.insert(i, f"{' ' * indent}for i in range(100): # Inefficient loop") |
| | lines.insert(i + 1, f"{inner_indent_str}pass") |
| | |
| | |
| | end_idx += 2 |
| | break |
| | |
| | elif mod_type == "ineff |
| | # recursive_swe_bench/task_generators/bug_fixing.py (finalized) |
| | |
| | elif mod_type == "inefficient_data_structure": |
| | # Find indentation of the function |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if lines[i].strip(): |
| | indent = len(lines[i]) - len(lines[i].lstrip()) |
| | break |
| | else: |
| | indent = 4 |
| | |
| | # Find a suitable place to add inefficient data structure usage |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if "def " not in lines[i] and lines[i].strip(): |
| | # Add inefficient data structure usage after this line |
| | indent_str = ' ' * indent |
| | |
| | # Add inefficient code |
| | lines.insert(i + 1, f"{indent_str} |
| | lines.insert(i + 2, f"{indent_str}results = []") |
| | lines.insert(i + 3, f"{indent_str}for i in range(1000): # Unnecessarily large range") |
| | lines.insert(i + 4, f"{indent_str} # Using list instead of set for lookups") |
| | lines.insert(i + 5, f"{indent_str} if i % 10 in results: # O(n) lookup instead of O(1)") |
| | lines.insert(i + 6, f"{indent_str} results.append(i) # Unnecessary storage") |
| | |
| | |
| | end_idx += 6 |
| | break |
| | |
| | elif mod_type == "redundant_computation": |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if lines[i].strip(): |
| | indent = len(lines[i]) - len(lines[i].lstrip()) |
| | break |
| | else: |
| | indent = 4 |
| | |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if "for " in lines[i] or "while " in lines[i]: |
| | |
| | inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4 |
| | inner_indent_str = ' ' * inner_indent |
| | |
| | |
| | lines.insert(i + 1, f"{inner_indent_str}# Redundant computation in each iteration") |
| | lines.insert(i + 2, f"{inner_indent_str}temp_sum = 0") |
| | lines.insert(i + 3, f"{inner_indent_str}for j in range(100): # Unnecessary nested computation") |
| | lines.insert(i + 4, f"{inner_indent_str} temp_sum += j") |
| | |
| | |
| | end_idx += 4 |
| | break |
| | |
| | |
| | problem_state.code_context["code"] = '\n'.join(lines) |
| | |
| | |
| | if "bugs" not in problem_state.code_context: |
| | problem_state.code_context["bugs"] = [] |
| | |
| | problem_state.code_context["bugs"].append({ |
| | "type": BugCategory.PERFORMANCE, |
| | "line": start_idx + 1, |
| | "description": f"Performance issue introduced in function '{func_name}'" |
| | }) |
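| | # Illustrative mutation (one possible outcome of "add_nested_loop"): a |
| | #     for _ in range(100):  # Inefficient nested loop |
| | #         pass |
| | # body is injected under the first loop found, inflating the constant |
| | # factor without changing the function's observable results. |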
| | |
| | def _insert_edge_case_bug(self, problem_state: ProblemState) -> None: |
| | """ |
| | Insert an edge case bug into the problem state. |
| | |
| | Args: |
| | problem_state: The problem state to modify |
| | """ |
| | code = problem_state.code_context["code"] |
| | lines = code.split('\n') |
| | if not lines: |
| | return |
| | |
| | |
| | functions = [] |
| | current_func = None |
| | func_start = None |
| | for i, line in enumerate(lines): |
| | if line.strip().startswith("def "): |
| | if current_func: |
| | functions.append((func_start, i - 1, current_func)) |
| | current_func = line.strip()[4:].split("(")[0] |
| | func_start = i |
| | elif i == len(lines) - 1 and current_func: |
| | functions.append((func_start, i, current_func)) |
| | |
| | if not functions: |
| | return |
| | |
| | |
| | start_idx, end_idx, func_name = random.choice(functions) |
| | |
| | |
| | mod_type = random.choice([ |
| | "remove_boundary_check", |
| | "missing_edge_case", |
| | "type_assumption" |
| | ]) |
| | |
| | if mod_type == "remove_boundary_check": |
| | |
| | boundary_checks = [] |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if (re.search(r'if\s+.*(len|empty|<=|>=|<|>|==|!=)', lines[i]) and |
| | (("if not " in lines[i]) or ("if len(" in lines[i]) or |
| | ("if " in lines[i] and " == 0" in lines[i]) or |
| | ("if " in lines[i] and " == []" in lines[i]) or |
| | ("if " in lines[i] and " == ''" in lines[i]) or |
| | ("if " in lines[i] and " is None" in lines[i]))): |
| | boundary_checks.append(i) |
| | |
| | if boundary_checks: |
| | |
| | idx = random.choice(boundary_checks) |
| | |
| | |
| | lines[idx] = f"# {lines[idx]} # Boundary check removed" |
| | |
| | |
| | i = idx + 1 |
| | while i <= end_idx and (not lines[i].strip() or len(lines[i]) - len(lines[i].lstrip()) > len(lines[idx]) - len(lines[idx].lstrip())): |
| | lines[i] = f"# {lines[i]}" |
| | i += 1 |
| | else: |
| | |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
| | indent = len(lines[i]) - len(lines[i].lstrip()) |
| | indent_str = ' ' * indent |
| | |
| | |
| | lines.insert(i, f"{indent_str}# Missing check for empty input") |
| | lines.insert(i + 1, f"{indent_str}first_item = items[0] # Will fail on empty input") |
| | |
| | |
| | end_idx += 2 |
| | break |
| | |
| | elif mod_type == "missing_edge_case": |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if ("/" in lines[i] or |
| | "if " in lines[i] and "==" in lines[i] or |
| | "if " in lines[i] and "!=" in lines[i]): |
| | |
| | if "/" in lines[i] and not re.search(r'if\s+.*!=\s*0', lines[i-1]): |
| | |
| | indent = len(lines[i]) - len(lines[i].lstrip()) |
| | indent_str = ' ' * indent |
| | |
| | |
| | match = re.search(r'/\s*(\w+)', lines[i]) |
| | if match: |
| | denominator = match.group(1) |
| | |
| | |
| | j = i - 1 |
| | while j >= start_idx and len(lines[j]) - len(lines[j].lstrip()) >= indent: |
| | if f"if {denominator}" in lines[j] and "== 0" in lines[j]: |
| | lines[j] = f"# {lines[j]} # Zero division check removed" |
| | j -= 1 |
| | |
| | |
| | lines.insert(i, f"{indent_str}# Missing check for zero division") |
| | |
| | |
| | end_idx += 1 |
| | break |
| | |
| | elif ("==" in lines[i] or "!=" in lines[i]) and "None" not in lines[i]: |
| | |
| | lines[i] = f"# {lines[i]} # Edge case check removed" |
| | break |
| | else: |
| | |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
| | indent = len(lines[i]) - len(lines[i].lstrip()) |
| | indent_str = ' ' * indent |
| | |
| | |
| | lines.insert(i, f"{indent_str}# Missing handling for edge cases") |
| | lines.insert(i + 1, f"{indent_str}# This function doesn't handle special cases properly") |
| | |
| | |
| | end_idx += 2 |
| | break |
| | |
| | elif mod_type == "type_assumption": |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if re.search(r'for\s+\w+\s+in\s+\w+', lines[i]) or "=" in lines[i] and "[" in lines[i]: |
| | |
| | var_match = re.search(r'for\s+\w+\s+in\s+(\w+)', lines[i]) |
| | if not var_match: |
| | var_match = re.search(r'(\w+)\s*=', lines[i]) |
| | |
| | if var_match: |
| | var_name = var_match.group(1) |
| | indent = len(lines[i]) - len(lines[i].lstrip()) |
| | indent_str = ' ' * indent |
| | |
| | |
| | lines.insert(i + 1, f"{indent_str}# Type assumption: {var_name} is assumed to be a list") |
| | lines.insert(i + 2, f"{indent_str}if len({var_name}) > 0: # Will fail if {var_name} doesn't support len()") |
| | lines.insert(i + 3, f"{indent_str} first = {var_name}[0] # Will fail if {var_name} is not subscriptable") |
| | |
| | |
| | end_idx += 3 |
| | break |
| | else: |
| | |
| | for i in range(start_idx + 1, end_idx + 1): |
| | if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
| | indent = len(lines[i]) - len(lines[i].lstrip()) |
| | indent_str = ' ' * indent |
| | |
| | |
| | param_match = re.search(r'def\s+\w+\s*\(\s*(\w+)', lines[start_idx]) |
| | param_name = param_match.group(1) if param_match else "input_data" |
| | |
| | |
| | lines.insert(i, f"{indent_str}# Type assumption: {param_name} is assumed to be a specific type") |
| | lines.insert(i + 1, f"{indent_str}{param_name}_str = str({param_name}) # Will fail if {param_name} can't be converted to string") |
| | |
| | |
| | end_idx += 2 |
| | break |
| | |
| | |
| | problem_state.code_context["code"] = '\n'.join(lines) |
| | |
| | |
| | if "bugs" not in problem_state.code_context: |
| | problem_state.code_context["bugs"] = [] |
| | |
| | problem_state.code_context["bugs"].append({ |
| | "type": BugCategory.EDGE_CASE, |
| | "line": start_idx + 1, |
| | "description": f"Edge case bug introduced in function '{func_name}'" |
| | }) |
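| | # Illustrative mutation (one possible outcome of "remove_boundary_check"): |
| | #     before: if not items: |
| | #                 return None |
| | #     after:  # if not items: # Boundary check removed |
| | #             #     return None |
| | # Commenting out the guard reintroduces a crash on empty input. |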
| | |
| | def _generate_description(self, problem_state: ProblemState) -> str: |
| | """ |
| | Generate a description for the current problem state. |
| | |
| | Args: |
| | problem_state: The problem state |
| | |
| | Returns: |
| | A descriptive prompt for the problem |
| | """ |
| | |
| | # Summarize how many bugs the solver should expect to find
| | bug_count = problem_state.code_context.get("bug_count", 0)
| | plural = "bugs" if bug_count != 1 else "bug" |
| | |
| | base_desc = ( |
| | f"Fix the {plural} in the code below. " |
| | f"There {'are' if bug_count != 1 else 'is'} {bug_count} {plural} to find and fix." |
| | ) |
| | |
| | |
| | if "bug_categories" in problem_state.code_context: |
| | categories = problem_state.code_context["bug_categories"] |
| | if categories: |
| | category_desc = ", ".join(categories) |
| | base_desc += f"\n\nThe code contains the following types of issues: {category_desc}." |
| | |
| | |
| | # Append any explicit requirements as a numbered list
| | if problem_state.requirements:
| | base_desc += "\n\nRequirements:" |
| | for i, req in enumerate(problem_state.requirements): |
| | base_desc += f"\n{i+1}. {req['description']}" |
| | |
| | |
| | difficulty_desc = "easy" |
| | if problem_state.difficulty > 0.3 and problem_state.difficulty <= 0.6: |
| | difficulty_desc = "moderate" |
| | elif problem_state.difficulty > 0.6 and problem_state.difficulty <= 0.8: |
| | difficulty_desc = "challenging" |
| | elif problem_state.difficulty > 0.8: |
| | difficulty_desc = "very challenging" |
| | |
| | base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task." |
| | |
| | return base_desc |
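| | 
| | # Example output (illustrative, assuming a hypothetical state with
| | # bug_count=2, bug_categories=["logical", "edge_case"], no requirements,
| | # and difficulty=0.65):
| | #
| | #     Fix the bugs in the code below. There are 2 bugs to find and fix.
| | #
| | #     The code contains the following types of issues: logical, edge_case.
| | #
| | #     This is a challenging bug fixing task.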
| |
|
| |
|
| | 
| | # Standard-library imports required by DefaultTestRunner below
| | import importlib.util
| | import io
| | import sys
| | import time
| | import unittest
| | from contextlib import redirect_stdout, redirect_stderr
| | 
| | class DefaultTestRunner: |
| | """ |
| | Default test runner for evaluating solutions. |
| | |
| | This class runs tests against a solution file and collects the results. |
| | """ |
| | |
| | def run_tests( |
| | self, |
| | solution_file: Path, |
| | test_files: List[Path], |
| | code_context: Dict[str, Any] |
| | ) -> Dict[str, Any]: |
| | """ |
| | Run tests against a solution file. |
| | |
| | Args: |
| | solution_file: Path to the solution file |
| | test_files: List of test file paths |
| | code_context: Additional context about the code |
| | |
| | Returns: |
| | Dictionary containing test results |
| | """ |
| | |
| | results = { |
| | "all_passed": True, |
| | "passed_tests": 0, |
| | "total_tests": 0, |
| | "tests": {}, |
| | "execution": { |
| | "success": True, |
| | "error": None, |
| | "stdout": "", |
| | "stderr": "" |
| | }, |
| | "execution_time": 0.0 |
| | } |
| | |
| | |
| | # Bail out early if the solution file is missing
| | if not solution_file.exists():
| | results["execution"]["success"] = False |
| | results["execution"]["error"] = f"Solution file not found: {solution_file}" |
| | results["all_passed"] = False |
| | return results |
| | |
| | |
| | try:
| | start_time = time.time()
| | 
| | # Make the solution importable while it is loaded
| | sys.path.insert(0, str(solution_file.parent))
| | 
| | # Import the solution as a module so syntax and import errors surface here
| | try:
| | spec = importlib.util.spec_from_file_location(
| | "solution", solution_file)
| | solution_module = importlib.util.module_from_spec(spec)
| | spec.loader.exec_module(solution_module)
| | finally:
| | # Restore sys.path even if the import fails
| | sys.path.pop(0)
| | 
| | # Record how long the import took
| | results["execution_time"] = time.time() - start_time
| | 
| | except Exception as e:
| | results["execution"]["success"] = False
| | results["execution"]["error"] = str(e)
| | results["all_passed"] = False
| | return results
| | |
| | |
| | # Run each test file in turn and accumulate the results
| | for test_file in test_files:
| | |
| | if not test_file.exists(): |
| | continue |
| | |
| | try: |
| | |
| | loader = unittest.TestLoader() |
| | |
| | |
| | # Make the test module importable; restored in the finally below
| | sys.path.insert(0, str(test_file.parent))
| | 
| | try:
| | |
| | |
| | # Capture test output so it can be attached to the results
| | stdout_buffer = io.StringIO()
| | stderr_buffer = io.StringIO() |
| | |
| | |
| | # Discover only this specific test file
| | test_suite = loader.discover(
| | str(test_file.parent), |
| | pattern=test_file.name |
| | ) |
| | |
| | |
| | # Count the individual test cases in the suite
| | test_count = 0
| | for suite in test_suite: |
| | for test_case in suite: |
| | test_count += test_case.countTestCases() |
| | |
| | results["total_tests"] += test_count |
| | |
| | |
| | # Run the suite while capturing its stdout/stderr
| | with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
| | test_runner = unittest.TextTestRunner(verbosity=2) |
| | test_result = test_runner.run(test_suite) |
| | |
| | |
| | stdout = stdout_buffer.getvalue() |
| | stderr = stderr_buffer.getvalue() |
| | |
| | 
| | finally:
| | # Restore sys.path even if discovery or the test run raises
| | sys.path.pop(0)
| | |
| | |
| | if not test_result.wasSuccessful(): |
| | results["all_passed"] = False |
| | |
| | |
| | # Tests that neither failed nor errored are counted as passed
| | passed_tests = test_count - len(test_result.failures) - len(test_result.errors)
| | results["passed_tests"] += passed_tests |
| | |
| | |
| | # Record per-file results keyed by the test file's stem
| | test_name = test_file.stem
| | results["tests"][test_name] = { |
| | "passed": test_result.wasSuccessful(), |
| | "failures": len(test_result.failures), |
| | "errors": len(test_result.errors), |
| | "skipped": len(test_result.skipped), |
| | "total": test_count, |
| | "passed_count": passed_tests, |
| | "stdout": stdout, |
| | "stderr": stderr |
| | } |
| | |
| | |
| | # Pull expected/actual values out of each failure or error message
| | for failure in test_result.failures + test_result.errors:
| | test_id = failure[0].id().split('.')[-1] |
| | failure_message = failure[1] |
| | |
| | |
| | expected_match = re.search(r'Expected\s*:(.+)', failure_message) |
| | actual_match = re.search(r'Actual\s*:(.+)', failure_message) |
| | |
| | expected = expected_match.group(1).strip() if expected_match else None |
| | actual = actual_match.group(1).strip() if actual_match else None |
| | |
| | if test_id not in results["tests"]: |
| | results["tests"][test_id] = {} |
| | |
| | results["tests"][test_id].update({ |
| | "passed": False, |
| | "message": failure_message, |
| | "expected": expected, |
| | "actual": actual |
| | }) |
| | |
| | except Exception as e: |
| | |
| | results["all_passed"] = False |
| | test_name = test_file.stem |
| | results["tests"][test_name] = { |
| | "passed": False, |
| | "error": str(e), |
| | "failures": 0, |
| | "errors": 1, |
| | "skipped": 0, |
| | "total": 1, |
| | "passed_count": 0 |
| | } |
| | results["total_tests"] += 1 |
| | |
| | return results |
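| | 
| | # Usage sketch (illustrative; the file names and working directory below
| | # are hypothetical, and in the benchmark this runner is normally invoked
| | # via BugFixingTask._run_evaluation rather than called directly):
| | #
| | #     runner = DefaultTestRunner()
| | #     results = runner.run_tests(
| | #         solution_file=Path("workdir/solution.py"),
| | #         test_files=[Path("workdir/test_solution.py")],
| | #         code_context={},
| | #     )
| | #     print(f"{results['passed_tests']}/{results['total_tests']} tests passed")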
| |
|