| | |
| |
|
| | from dataclasses import dataclass |
| | from typing import Any, Dict, List, Optional, Tuple, Union |
| | from enum import Enum |
| | import datetime |
| | import uuid |
| | import json |
| | import copy |
| |
|
class TaskStatus(Enum):
    """Lifecycle states of a recursive task.

    The string value of each member is what ends up in the serialized form
    of a task (``RecursiveTask.to_dict`` writes ``status.value``).
    """

    # Assigned when a task object is constructed.
    INITIALIZED = "initialized"
    # Assigned after the problem state has been evolved for another round.
    IN_PROGRESS = "in_progress"
    # Convergence detected without reaching the score threshold or step cap.
    CONVERGED = "converged"
    # Convergence detected because the step count reached the configured cap.
    MAX_ITERATIONS = "max_iterations"
    # Convergence detected because the latest score met the score threshold.
    PERFECT_SOLUTION = "perfect_solution"
    # Not assigned anywhere in the code visible here; presumably set by callers.
    ABANDONED = "abandoned"
| |
|
| |
|
@dataclass
class ProblemState:
    """Snapshot of the problem a recursive task presents at one point in time.

    The first four fields (minus ``problem_id``) plus ``evolution_stage`` are
    what ``RecursiveTask.get_current_problem`` exposes to the solver; all
    seven fields are written out when a task or trajectory is serialized.
    """

    # Identifier of the problem.
    problem_id: str
    # Human-readable statement of the problem.
    description: str
    # Code-related context for the problem; schema is not defined in this file.
    code_context: Dict[str, Any]
    # Requirement records; schema is not defined in this file.
    requirements: List[Dict[str, Any]]
    # Difficulty value; scale is not defined in this file.
    difficulty: float
    # Evolution stage counter; presumably advanced by subclasses when the
    # problem evolves -- confirm against the concrete task implementations.
    evolution_stage: int
    # Adaptation vector; meaning of the components is not defined in this file.
    adaptation_vector: List[float]
| |
|
| |
|
@dataclass
class EvaluationResult:
    """Outcome of evaluating one candidate solution.

    ``score`` is the value the convergence logic in this file compares
    against its threshold and uses to detect plateaus. The dictionary
    payloads are opaque here; their schemas are set by whoever produces
    the result.
    """

    # Whether the evaluation counted as a success.
    success: bool
    # Numeric score of the solution (default convergence threshold is 0.95,
    # which suggests a 0..1 scale -- confirm with the concrete evaluators).
    score: float
    # Raw execution output.
    execution_results: Dict[str, Any]
    # Optional error information; None when not provided.
    error_details: Optional[Dict[str, Any]] = None
    # Optional test outcomes; None when not provided.
    test_results: Optional[Dict[str, Any]] = None
    # Optional named numeric metrics; None when not provided.
    metrics: Optional[Dict[str, float]] = None
| |
|
| |
|
@dataclass
class Feedback:
    """Structured feedback attached to one evaluated solution.

    All five fields are required and are serialized verbatim alongside the
    trajectory step they belong to. The schemas of the dictionary entries
    are not defined in this file.
    """

    # Short overall description of the feedback.
    summary: str
    # Issue records found in the solution.
    issues: List[Dict[str, Any]]
    # Suggestion records for improving the solution.
    suggestions: List[Dict[str, Any]]
    # Names of areas to focus on.
    focus_areas: List[str]
    # Hint records; presumably consumed when the problem is evolved --
    # confirm against the concrete task implementations.
    adaptation_hints: List[Dict[str, Any]]
| |
|
| |
|
class ConvergenceCriteria:
    """Criteria for determining when a recursive task has converged.

    Recognised configuration keys (all optional):

    * ``score_threshold`` (default 0.95): latest score at or above this
      value counts as converged.
    * ``min_iterations`` (default 1): never converge with fewer steps.
    * ``max_iterations`` (default 10): always converge at this many steps.
    * ``score_delta_threshold`` (default 0.01): a score change smaller than
      this counts as "no progress" for plateau detection.
    * ``consecutive_plateau_limit`` (default 3): number of consecutive
      "no progress" changes that counts as a plateau. Values below 1
      disable plateau detection.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Read thresholds from ``config``, falling back to the defaults.

        Args:
            config: Optional mapping of the keys listed on the class;
                ``None`` or an empty mapping selects every default.
        """
        self.config = config or {}
        self.score_threshold = self.config.get("score_threshold", 0.95)
        self.min_iterations = self.config.get("min_iterations", 1)
        self.max_iterations = self.config.get("max_iterations", 10)
        self.score_delta_threshold = self.config.get("score_delta_threshold", 0.01)
        self.consecutive_plateau_limit = self.config.get("consecutive_plateau_limit", 3)

    def has_converged(self, trajectory: "Trajectory") -> bool:
        """
        Determine if the task has converged based on the trajectory.

        Checks are applied in this order: minimum step count, maximum step
        count, score threshold on the latest step, then score plateau over
        the most recent steps.

        Args:
            trajectory: Object exposing ``steps``, a sequence whose items
                provide ``result.score``.

        Returns:
            True if any stopping rule fires, False otherwise.
        """
        steps = trajectory.steps
        step_count = len(steps)

        if step_count < self.min_iterations:
            return False

        if step_count >= self.max_iterations:
            return True

        # With min_iterations <= 0 an empty trajectory reaches this point;
        # there is no latest score to inspect, so it cannot have converged.
        if not steps:
            return False

        if steps[-1].result.score >= self.score_threshold:
            return True

        # Plateau: the last `window` score changes are all below the delta
        # threshold. That needs window + 1 scores. A window below 1 would
        # make the all() check vacuously true, so it disables the rule.
        window = self.consecutive_plateau_limit
        if window >= 1 and step_count >= window + 1:
            recent_scores = [step.result.score for step in steps[-(window + 1):]]
            changes = (
                abs(later - earlier)
                for earlier, later in zip(recent_scores, recent_scores[1:])
            )
            if all(change < self.score_delta_threshold for change in changes):
                return True

        return False
| |
|
| |
|
@dataclass
class TrajectoryStep:
    """One recorded iteration of a solution trajectory.

    Bundles the problem as it was posed, the solution that was submitted,
    and the evaluation result and feedback produced for that solution.
    """

    # Identifier of the step (Trajectory.add_step assigns a UUID4 string).
    step_id: str
    # When the step was recorded (Trajectory.add_step uses datetime.now()).
    timestamp: datetime.datetime
    # Problem state the solution was written against.
    problem_state: ProblemState
    # The submitted solution text.
    solution: str
    # Evaluation outcome for the solution.
    result: EvaluationResult
    # Feedback generated for the solution.
    feedback: Feedback
| |
|
| |
|
class Trajectory:
    """Tracks the evolution of solutions over multiple iterations.

    A trajectory is an ordered list of ``TrajectoryStep`` objects plus a
    small metadata dictionary (``start_time`` and ``task_id``). It can be
    converted to and from a JSON-compatible dictionary and saved to disk.
    """

    # Attribute/key names copied verbatim between objects and dictionaries.
    _STATE_KEYS = (
        "problem_id", "description", "code_context", "requirements",
        "difficulty", "evolution_stage", "adaptation_vector",
    )
    _RESULT_KEYS = (
        "success", "score", "execution_results",
        "error_details", "test_results", "metrics",
    )
    _FEEDBACK_KEYS = (
        "summary", "issues", "suggestions", "focus_areas", "adaptation_hints",
    )

    def __init__(self, task_id: str):
        """
        Create an empty trajectory.

        Args:
            task_id: Identifier of the task this trajectory belongs to
        """
        self.task_id = task_id
        self.steps: List["TrajectoryStep"] = []
        self.metadata: Dict[str, Any] = {
            "start_time": datetime.datetime.now(),
            "task_id": task_id,
        }

    def add_step(self, problem_state: "ProblemState", solution: str,
                 result: "EvaluationResult", feedback: "Feedback") -> None:
        """Append a new step, stamped with a fresh UUID and the current time."""
        self.steps.append(
            TrajectoryStep(
                step_id=str(uuid.uuid4()),
                timestamp=datetime.datetime.now(),
                problem_state=problem_state,
                solution=solution,
                result=result,
                feedback=feedback,
            )
        )

    def get_solution_series(self) -> List[str]:
        """Return the solutions of all steps, oldest first."""
        return [step.solution for step in self.steps]

    def get_score_series(self) -> List[float]:
        """Return the scores of all steps, oldest first."""
        return [step.result.score for step in self.steps]

    def get_latest_step(self) -> Optional["TrajectoryStep"]:
        """Return the most recent step, or None if there are no steps."""
        return self.steps[-1] if self.steps else None

    def calculate_improvement_rate(self) -> float:
        """Return (last score - first score) divided by the number of steps.

        Returns 0.0 when there are fewer than two steps. Note the divisor is
        the step count, not the number of intervals between steps.
        """
        scores = self.get_score_series()
        if len(scores) < 2:
            return 0.0
        return (scores[-1] - scores[0]) / len(scores)

    def calculate_volatility(self) -> float:
        """Return the mean absolute score change between consecutive steps.

        Returns 0.0 when there are fewer than two steps.
        """
        scores = self.get_score_series()
        if len(scores) < 2:
            return 0.0
        changes = [abs(later - earlier) for earlier, later in zip(scores, scores[1:])]
        return sum(changes) / len(changes)

    @staticmethod
    def _jsonable(value: Any) -> Any:
        """Return date/datetime values as ISO-8601 strings, others unchanged."""
        if isinstance(value, (datetime.datetime, datetime.date)):
            return value.isoformat()
        return value

    @classmethod
    def _step_to_dict(cls, step: "TrajectoryStep") -> Dict[str, Any]:
        """Convert one step into its dictionary form."""
        return {
            "step_id": step.step_id,
            "timestamp": step.timestamp.isoformat(),
            "problem_state": {
                key: getattr(step.problem_state, key) for key in cls._STATE_KEYS
            },
            "solution": step.solution,
            "result": {key: getattr(step.result, key) for key in cls._RESULT_KEYS},
            "feedback": {
                key: getattr(step.feedback, key) for key in cls._FEEDBACK_KEYS
            },
        }

    @classmethod
    def _step_from_dict(cls, step_data: Dict[str, Any]) -> "TrajectoryStep":
        """Rebuild one step, keeping its persisted id and timestamp."""
        raw_timestamp = step_data.get("timestamp")
        if isinstance(raw_timestamp, str):
            timestamp = datetime.datetime.fromisoformat(raw_timestamp)
        else:
            # Already a datetime, or absent in older dictionaries.
            timestamp = raw_timestamp or datetime.datetime.now()

        return TrajectoryStep(
            # Older dictionaries may lack an id; only then mint a new one.
            step_id=step_data.get("step_id") or str(uuid.uuid4()),
            timestamp=timestamp,
            problem_state=ProblemState(
                **{key: step_data["problem_state"][key] for key in cls._STATE_KEYS}
            ),
            solution=step_data["solution"],
            result=EvaluationResult(
                **{key: step_data["result"][key] for key in cls._RESULT_KEYS}
            ),
            feedback=Feedback(
                **{key: step_data["feedback"][key] for key in cls._FEEDBACK_KEYS}
            ),
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert the trajectory to a dictionary for serialization.

        Date/datetime values at the top level of ``metadata`` (such as
        ``start_time``) are emitted as ISO-8601 strings so that the result
        can be passed to ``json.dump``.
        """
        return {
            "task_id": self.task_id,
            "metadata": {
                key: self._jsonable(value) for key, value in self.metadata.items()
            },
            "steps": [self._step_to_dict(step) for step in self.steps],
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Trajectory":
        """Create a trajectory from a dictionary produced by ``to_dict``.

        Step ids and timestamps stored in ``data`` are preserved. An
        ISO-formatted ``metadata["start_time"]`` is parsed back into a
        datetime; an unparseable value is kept as-is.
        """
        trajectory = cls(data["task_id"])

        # Copy so later metadata edits do not leak into the caller's dict.
        metadata = dict(data["metadata"])
        start_time = metadata.get("start_time")
        if isinstance(start_time, str):
            try:
                metadata["start_time"] = datetime.datetime.fromisoformat(start_time)
            except ValueError:
                pass
        trajectory.metadata = metadata

        trajectory.steps = [cls._step_from_dict(item) for item in data["steps"]]
        return trajectory

    def save(self, filepath: str) -> None:
        """Save the trajectory to ``filepath`` as indented JSON."""
        # Serialize before opening so a failure cannot leave a truncated file.
        payload = json.dumps(self.to_dict(), indent=2)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(payload)

    @classmethod
    def load(cls, filepath: str) -> "Trajectory":
        """Load a trajectory from a JSON file written by ``save``."""
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls.from_dict(data)
| |
|
| |
|
class RecursiveTask:
    """
    Base class for recursive tasks that evolve based on model solutions.

    A recursive task provides a dynamic problem that adapts based on the
    model's attempted solutions, creating a feedback loop that more accurately
    reflects real-world software engineering challenges.

    Subclasses must implement ``_run_evaluation``, ``_generate_feedback``,
    ``_evolve_state`` and ``load``.
    """

    def __init__(self,
                 initial_state: "ProblemState",
                 config: Optional[Dict[str, Any]] = None):
        """
        Initialize the recursive task with an initial problem state.

        Args:
            initial_state: The initial state of the problem
            config: Configuration options for the task. May be omitted; the
                optional ``"convergence_criteria"`` entry is forwarded to
                ``ConvergenceCriteria``.
        """
        self.task_id = str(uuid.uuid4())
        self.state = initial_state
        self.config = config or {}
        self.trajectory = Trajectory(self.task_id)
        self.status = TaskStatus.INITIALIZED
        # Read from self.config, not the raw argument: the argument defaults
        # to None, which has no .get().
        self.convergence_criteria = ConvergenceCriteria(
            self.config.get("convergence_criteria", {}))

    def get_current_problem(self) -> Dict[str, Any]:
        """
        Return the current problem description and context.

        Returns:
            A dictionary with the current state's description, code context,
            requirements and evolution stage
        """
        current = self.state
        return {
            "description": current.description,
            "code_context": current.code_context,
            "requirements": current.requirements,
            "evolution_stage": current.evolution_stage,
        }

    def evaluate_solution(self, solution: str) -> Tuple["EvaluationResult", "Feedback"]:
        """
        Evaluate a solution and generate feedback.

        Args:
            solution: The solution to evaluate

        Returns:
            A tuple containing the evaluation result and feedback
        """
        result = self._run_evaluation(solution)
        feedback = self._generate_feedback(solution, result)
        return result, feedback

    def update_state(self,
                     solution: str,
                     result: "EvaluationResult",
                     feedback: "Feedback") -> "ProblemState":
        """
        Update the problem state based on the solution and feedback.

        Records the attempt in the trajectory, then either marks the task as
        finished (leaving the state untouched) or evolves the problem via
        ``_evolve_state`` and marks the task as in progress.

        Args:
            solution: The attempted solution
            result: The evaluation result
            feedback: The feedback provided

        Returns:
            The current problem state (evolved unless the task has converged)
        """
        # NOTE(review): the trajectory keeps a reference to self.state, not a
        # copy; a subclass whose _evolve_state mutates the state in place
        # would also alter recorded history -- confirm subclasses return a
        # new ProblemState.
        self.trajectory.add_step(
            problem_state=self.state,
            solution=solution,
            result=result,
            feedback=feedback
        )

        criteria = self.convergence_criteria
        if criteria.has_converged(self.trajectory):
            # Report the most specific reason: threshold first, then step
            # cap, otherwise a generic convergence (e.g. a score plateau).
            if self.trajectory.steps[-1].result.score >= criteria.score_threshold:
                self.status = TaskStatus.PERFECT_SOLUTION
            elif len(self.trajectory.steps) >= criteria.max_iterations:
                self.status = TaskStatus.MAX_ITERATIONS
            else:
                self.status = TaskStatus.CONVERGED
            return self.state

        self.state = self._evolve_state(solution, result, feedback)
        self.status = TaskStatus.IN_PROGRESS
        return self.state

    def _run_evaluation(self, solution: str) -> "EvaluationResult":
        """
        Run evaluation logic specific to this task.

        Args:
            solution: The solution to evaluate

        Returns:
            The evaluation result

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _generate_feedback(self,
                           solution: str,
                           result: "EvaluationResult") -> "Feedback":
        """
        Generate structured feedback based on evaluation results.

        Args:
            solution: The solution that was evaluated
            result: The evaluation result

        Returns:
            Structured feedback

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _evolve_state(self,
                      solution: str,
                      result: "EvaluationResult",
                      feedback: "Feedback") -> "ProblemState":
        """
        Evolve the problem state based on the solution and feedback.

        This method implements the recursive nature of the benchmark by
        defining how the problem changes in response to solution attempts.

        Args:
            solution: The attempted solution
            result: The evaluation result
            feedback: The feedback provided

        Returns:
            The evolved problem state

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError("Subclasses must implement this method")

    def get_trajectory(self) -> "Trajectory":
        """
        Get the complete solution trajectory for this task.

        Returns:
            The solution trajectory
        """
        return self.trajectory

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the task to a dictionary for serialization.

        Returns:
            A dictionary representation of the task
        """
        current = self.state
        return {
            "task_id": self.task_id,
            "status": self.status.value,
            "state": {
                "problem_id": current.problem_id,
                "description": current.description,
                "code_context": current.code_context,
                "requirements": current.requirements,
                "difficulty": current.difficulty,
                "evolution_stage": current.evolution_stage,
                "adaptation_vector": current.adaptation_vector,
            },
            "config": self.config,
            "trajectory": self.trajectory.to_dict(),
        }

    @staticmethod
    def _json_default(value: Any) -> str:
        """JSON fallback encoder: dates/datetimes become ISO-8601 strings."""
        if isinstance(value, (datetime.datetime, datetime.date)):
            return value.isoformat()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable")

    def save(self, filepath: str) -> None:
        """
        Save the task to a file.

        Args:
            filepath: Path to save the task

        Raises:
            TypeError: If the task contains a value that cannot be encoded
                as JSON (dates and datetimes are handled)
        """
        # The trajectory metadata carries a datetime, which json cannot
        # encode on its own. Serialize before opening the file so a failure
        # cannot leave a truncated file behind.
        payload = json.dumps(self.to_dict(), indent=2, default=self._json_default)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(payload)

    @classmethod
    def load(cls, filepath: str) -> "RecursiveTask":
        """
        Load a task from a file.

        The base class only checks that the file exists and contains valid
        JSON; rebuilding a task needs subclass-specific logic.

        Args:
            filepath: Path to load the task from

        Returns:
            The loaded task

        Raises:
            NotImplementedError: Always, in this base class, once the file
                has been read successfully
        """
        with open(filepath, "r", encoding="utf-8") as f:
            json.load(f)

        raise NotImplementedError("Subclasses must implement this method")
| |
|