| | |
| |
|
| | import json |
| | import backoff |
| | import time |
| | import anthropic |
| | from typing import Any, Dict, List, Optional, Union, Tuple |
| | import re |
| | import logging |
| |
|
| | from recursive_swe_bench.models.base_model import ModelInterface |
| |
|
class AnthropicModel(ModelInterface):
    """
    Integration with Anthropic models (Claude).

    This class provides integration with Anthropic's API for evaluating
    Claude models with Recursive-SWE-bench through recursive evaluation loops.
    The implementation features dynamic adaptation to feedback through a
    self-reflective mechanism that traces attribution paths through recursive
    iterations.
    """

    def __init__(
        self,
        model_identifier: str,
        api_key: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize the Anthropic model interface.

        Args:
            model_identifier: Anthropic model identifier
                (e.g., "claude-3-opus-20240229")
            api_key: Anthropic API key (optional if set in environment)
            config: Additional configuration options
        """
        super().__init__(model_identifier, config)

        # Fall back to environment-based credentials when no key is supplied.
        if api_key:
            self.client = anthropic.Anthropic(api_key=api_key)
        else:
            self.client = anthropic.Anthropic()

        # Prompt templates; overridable as a whole via config["prompts"].
        self.prompts = self.config.get("prompts", {
            "system": "You are an expert software engineer who specializes in debugging and fixing complex code. Your task is to fix bugs in code based on the description and test requirements provided.",
            "user_template": "# Bug Fixing Task\n\n{description}\n\n# Code\n```python\n{code}\n```\n\n{tests_description}\n\n# Your task\nFix the bugs in the code above. Focus on making the code pass all tests while maintaining good practices. Provide only the corrected code without additional explanations.",
            "reflection_template": "# Feedback on Previous Solution\n\nYour previous solution had the following issues:\n{issues}\n\n# Suggested Improvements\n{suggestions}\n\n# Test Results\n{test_results}\n\n# Reflection Prompt\nBefore providing a new solution, analyze what went wrong in your previous attempt and how you'll approach fixing it differently this time."
        })

        # Sampling parameters forwarded verbatim to the Anthropic API.
        self.api_params = self.config.get("api_params", {
            "temperature": 0.2,
            "max_tokens": 2000,
            "top_p": 0.95,
            "top_k": 50
        })

        # Knobs controlling the self-reflection / adaptation behavior.
        self.recursive_config = self.config.get("recursive_config", {
            "enable_self_reflection": True,
            "adaptation_threshold": 0.5,
            "max_reflection_depth": 3,
            "attribution_tracking": True,
            "dynamic_prompting": True,
        })

        # Mutable per-problem state; reset by _reset_recursive_state().
        self.recursive_state = {
            "reflection_depth": 0,
            "adaptation_vector": [0.0] * 5,
            "attribution_map": {},
            "error_frequency": {},
            "solution_quality_trend": [],
        }

        self.logger.info(f"Initialized Anthropic model: {model_identifier} with recursive capability")

    def solve(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> str:
        """
        Generate a solution using the Anthropic model with recursive adaptation.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            The generated solution (code extracted from the model response)
        """
        self.logger.info(f"Solving problem with Anthropic model: {self.model_identifier}")
        start_time = time.time()

        # Fold feedback history into the recursive state, or reset it for a
        # fresh problem. Kept outside the retry wrapper so a retried API call
        # cannot re-run this bookkeeping (see _create_message).
        if history:
            self._update_recursive_state(history)
        else:
            self._reset_recursive_state()

        system_prompt, user_message = self._format_messages(problem, history)

        response = self._create_message(system_prompt, user_message)

        solution = response.content[0].text

        end_time = time.time()
        self.logger.info(f"Solution generated in {end_time - start_time:.2f} seconds")

        if solution:
            self.recursive_state["reflection_depth"] += 1

        return self._extract_code(solution)

    @backoff.on_exception(
        backoff.expo,
        (anthropic.APIError, anthropic.APITimeoutError, anthropic.RateLimitError),
        max_tries=5
    )
    def _create_message(self, system_prompt: str, user_message: str):
        """
        Issue the API request with exponential-backoff retry.

        Retry is scoped to the network call only. The previous implementation
        decorated solve() itself, which re-executed the recursive-state update
        on every retry and double-counted error frequencies.

        Args:
            system_prompt: System prompt for the request
            user_message: User message for the request

        Returns:
            The raw Anthropic API response
        """
        return self.client.messages.create(
            model=self.model_identifier,
            system=system_prompt,
            messages=[
                {"role": "user", "content": user_message}
            ],
            max_tokens=self.api_params.get("max_tokens", 2000),
            temperature=self.api_params.get("temperature", 0.2),
            top_p=self.api_params.get("top_p", 0.95),
            top_k=self.api_params.get("top_k", 50)
        )

    def _format_messages(
        self,
        problem: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None
    ) -> Tuple[str, str]:
        """
        Format the problem and history into messages for the Anthropic API.

        Args:
            problem: The problem to solve
            history: Optional history of previous solution attempts

        Returns:
            Tuple of (system_prompt, user_message)
        """
        system_prompt = self.prompts["system"]

        # Augment the system prompt with recurring-error guidance and a
        # quality-trend nudge once feedback history exists.
        if self.recursive_config.get("enable_self_reflection", True) and history:
            if self.recursive_state["error_frequency"]:
                # Top three most frequent error types seen so far.
                top_errors = sorted(
                    self.recursive_state["error_frequency"].items(),
                    key=lambda x: x[1],
                    reverse=True
                )[:3]

                error_guidance = "Focus particularly on addressing these recurring issues:\n"
                for error_type, count in top_errors:
                    error_guidance += f"- {error_type} (appeared {count} times)\n"

                system_prompt += f"\n\n{error_guidance}"

            # Compare the last two scores to characterize the trend.
            if len(self.recursive_state["solution_quality_trend"]) > 1:
                trend = self.recursive_state["solution_quality_trend"]
                if trend[-1] > trend[-2]:
                    system_prompt += "\n\nYour solutions are improving. Continue this trajectory."
                elif trend[-1] < trend[-2]:
                    system_prompt += "\n\nYour solutions are declining in quality. Carefully reconsider your approach."
                else:
                    system_prompt += "\n\nYour solutions maintain the same quality. Try a different approach."

        code = problem["code_context"]["code"]

        # Describe the tests the fixed code must satisfy.
        tests_description = "# Tests\n"
        if "tests" in problem["code_context"]:
            tests_description += "The code must pass the following tests:\n\n"
            for i, test in enumerate(problem["code_context"]["tests"]):
                tests_description += f"## Test {i+1}: {test['name']}\n```python\n{test['content']}\n```\n\n"
        else:
            tests_description += "The code must work correctly according to its intended functionality."

        user_message = self.prompts["user_template"].format(
            description=problem["description"],
            code=code,
            tests_description=tests_description
        )

        # Prepend a reflection block summarizing the most recent feedback.
        if history and self.recursive_config.get("enable_self_reflection", True):
            latest_entry = history[-1]

            issues_text = "- " + "\n- ".join([issue["message"] for issue in latest_entry["feedback"]["issues"]])
            suggestions_text = "- " + "\n- ".join([suggestion["message"] for suggestion in latest_entry["feedback"]["suggestions"]])

            test_results = latest_entry.get("result", {})
            passed_tests = test_results.get("passed_tests", 0)
            total_tests = test_results.get("total_tests", 0)

            test_results_text = f"Passed {passed_tests}/{total_tests} tests."
            if "tests" in test_results:
                test_results_text += "\n\nIndividual test results:"
                for test_name, test_result in test_results["tests"].items():
                    status = "✅ Passed" if test_result.get("passed", False) else "❌ Failed"
                    test_results_text += f"\n- {test_name}: {status}"
                    if not test_result.get("passed", False) and "message" in test_result:
                        test_results_text += f"\n Error: {test_result['message']}"

            reflection_prompt = self.prompts["reflection_template"].format(
                issues=issues_text,
                suggestions=suggestions_text,
                test_results=test_results_text
            )

            user_message = f"{reflection_prompt}\n\n{user_message}"

            # Category-specific nudges derived from the latest issue types.
            if self.recursive_config.get("dynamic_prompting", True):
                error_types = [issue.get("type", "") for issue in latest_entry["feedback"]["issues"]]
                # Hoisted: the original re-joined and re-lowered this string
                # for every category check below.
                error_blob = " ".join(error_types).lower()

                if "syntax" in error_blob:
                    user_message += "\n\nPay careful attention to syntax correctness. Double-check all parentheses, indentation, and function definitions."

                if "test_failure" in error_blob:
                    user_message += "\n\nFocus on making the code pass the failing tests. Carefully trace through the code execution for each test case."

                if "edge_case" in error_blob or "boundary" in error_blob:
                    user_message += "\n\nBe sure to handle edge cases such as empty inputs, boundary values, and special cases."

                if "performance" in error_blob:
                    user_message += "\n\nOptimize your solution for better performance. Avoid unnecessary operations and inefficient data structures."

        return system_prompt, user_message

    def _extract_code(self, text: str) -> str:
        """
        Extract code from the model's response.

        Prefers the first fenced code block; falls back to the whole text
        when no fence is present.

        Args:
            text: The model's response

        Returns:
            Extracted code
        """
        code_blocks = re.findall(r'```(?:python)?\s*(.*?)\s*```', text, re.DOTALL)

        if code_blocks:
            return code_blocks[0].strip()

        return text.strip()

    def _reset_recursive_state(self):
        """Reset the recursive state for a new problem."""
        self.recursive_state = {
            "reflection_depth": 0,
            "adaptation_vector": [0.0] * 5,
            "attribution_map": {},
            "error_frequency": {},
            "solution_quality_trend": [],
        }

    def _update_recursive_state(self, history: List[Dict[str, Any]]):
        """
        Update recursive state based on solution history.

        Args:
            history: History of previous solution attempts
        """
        # Track the full score trend across attempts.
        scores = [entry.get("result", {}).get("score", 0.0) for entry in history]
        self.recursive_state["solution_quality_trend"] = scores

        if len(scores) >= 2:
            # Overall improvement since the first attempt, clamped to [-1, 1].
            improvement = scores[-1] - scores[0]
            self.recursive_state["adaptation_vector"][0] = max(-1.0, min(1.0, improvement))

            # Most recent step-to-step improvement, clamped to [-1, 1].
            recent_improvement = scores[-1] - scores[-2]
            self.recursive_state["adaptation_vector"][1] = max(-1.0, min(1.0, recent_improvement))

        # Accumulate error-type frequencies from the latest feedback.
        if history:
            latest_feedback = history[-1].get("feedback", {})
            issues = latest_feedback.get("issues", [])

            for issue in issues:
                issue_type = issue.get("type", "unknown")
                self.recursive_state["error_frequency"][issue_type] = self.recursive_state["error_frequency"].get(issue_type, 0) + 1

        self.recursive_state["reflection_depth"] = len(history)

    def get_meta_information(self) -> Dict[str, Any]:
        """
        Get meta information about the model.

        Returns:
            Dictionary containing model information
        """
        return {
            "model_name": self.model_identifier,
            "provider": "Anthropic",
            "type": "API",
            "parameters": self.api_params,
            "system_prompt": self.prompts["system"],
            "recursive_capability": self.recursive_config.get("enable_self_reflection", True),
            "reflection_depth": self.recursive_state["reflection_depth"],
            "adaptation_vector": self.recursive_state["adaptation_vector"]
        }
| |
|
| |
|
| | |
| |
|
| | import numpy as np |
| | import scipy.stats |
| | from typing import Any, Dict, List, Optional, Union |
| | import dataclasses |
| | import math |
| |
|
| | from recursive_swe_bench.core.recursive_task import Trajectory |
| |
|
| |
|
class RecursiveLearningCurveArea:
    """
    Measures the area under the learning curve across iterations.

    This metric captures the overall performance of a model throughout its
    learning trajectory, rewarding both high scores and quick improvement.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive learning curve area metric.

        Args:
            config: Configuration options ("max_score", "normalize")
        """
        self.config = config or {}
        self.max_score = self.config.get("max_score", 1.0)
        self.normalize = self.config.get("normalize", True)

    def calculate(self, trajectory: "Trajectory") -> float:
        """
        Calculate the area under the learning curve.

        Args:
            trajectory: The solution trajectory

        Returns:
            The (optionally normalized) area under the learning curve;
            0.0 for an empty or single-point trajectory when normalizing.
        """
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Trapezoidal area with unit spacing between iterations.
        area = np.trapz(scores, dx=1.0)

        if self.normalize:
            # Bug fix: n points with unit spacing span n - 1 intervals, so
            # the maximum attainable area is max_score * (n - 1). The old
            # divisor max_score * n meant a perfect run could never reach 1.0.
            max_area = self.max_score * (len(scores) - 1)
            if max_area <= 0:
                # A single sample encloses no area.
                return 0.0
            return area / max_area

        return area
| |
|
| |
|
class AdaptationRate:
    """
    Measures the rate at which the model improves its solutions.

    The rate is the net score change from the first to the last attempt,
    averaged over the number of iterations taken.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the adaptation rate metric.

        Args:
            config: Configuration options ("min_iterations")
        """
        self.config = config or {}
        self.min_iterations = self.config.get("min_iterations", 2)

    def calculate(self, trajectory: "Trajectory") -> float:
        """
        Calculate the adaptation rate.

        Args:
            trajectory: The solution trajectory

        Returns:
            Net improvement per iteration; 0.0 when the trajectory is
            shorter than the configured minimum.
        """
        series = trajectory.get_score_series()
        if len(series) < self.min_iterations:
            return 0.0

        # Net change divided by the number of steps taken to achieve it.
        steps_taken = len(series) - 1
        net_change = series[-1] - series[0]
        return net_change / steps_taken
| |
|
| |
|
class RecursiveVolatility:
    """
    Measures the volatility of solution quality across iterations.

    Volatility is the standard deviation of absolute step-to-step score
    changes, optionally normalized by the mean score.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive volatility metric.

        Args:
            config: Configuration options ("min_iterations", "normalize")
        """
        self.config = config or {}
        self.min_iterations = self.config.get("min_iterations", 3)
        self.normalize = self.config.get("normalize", True)

    def calculate(self, trajectory: "Trajectory") -> float:
        """
        Calculate the recursive volatility.

        Args:
            trajectory: The solution trajectory

        Returns:
            The (optionally mean-normalized) volatility; 0.0 for trajectories
            shorter than the configured minimum.
        """
        series = trajectory.get_score_series()
        if len(series) < self.min_iterations:
            return 0.0

        # Spread of the absolute first differences of the score series.
        deltas = np.abs(np.diff(series))
        volatility = np.std(deltas)

        mean_score = np.mean(series)
        if self.normalize and mean_score > 0:
            return volatility / mean_score

        return volatility
| |
|
| |
|
class ConvergenceIndex:
    """
    Measures how quickly the model converges to a stable solution.

    The index combines how late the score series stabilized with how far
    the final score fell short of perfect; lower is better.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the convergence index metric.

        Args:
            config: Configuration options ("stability_threshold",
                "max_score_threshold")
        """
        self.config = config or {}
        self.stability_threshold = self.config.get("stability_threshold", 0.05)
        # NOTE(review): max_score_threshold is retained for config
        # compatibility but is currently unused — the original computed a
        # "first crossing" point from it and then never used the result;
        # that dead code has been removed.
        self.max_score_threshold = self.config.get("max_score_threshold", 0.95)

    def calculate(self, trajectory: "Trajectory") -> float:
        """
        Calculate the convergence index.

        Args:
            trajectory: The solution trajectory

        Returns:
            The convergence index (lower is better); 0.0 for an empty series.
        """
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Earliest iteration after which every remaining step-to-step change
        # stays within the stability threshold.
        convergence_point = len(scores) - 1
        for i in range(1, len(scores)):
            remaining_changes = [abs(scores[j] - scores[j-1]) for j in range(i, len(scores))]
            if all(change <= self.stability_threshold for change in remaining_changes):
                convergence_point = i
                break

        # Late convergence and a low final score both raise the index; the
        # final score is clamped to [0, 1] before use.
        return (convergence_point / len(scores)) * (1.0 - max(0.0, min(1.0, scores[-1])))
| |
|
| |
|
class ErrorRecoveryEfficiency:
    """
    Measures how efficiently the model recovers from errors.

    Compares the error count at the first step with the count at the last:
    the fraction of initial errors that were eliminated by the end.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the error recovery efficiency metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}

    def calculate(self, trajectory: "Trajectory") -> float:
        """
        Calculate the error recovery efficiency.

        Args:
            trajectory: The solution trajectory

        Returns:
            Fraction of initial errors resolved by the final step; 1.0 when
            there were no initial errors, 0.0 for trajectories of fewer
            than two steps.
        """
        steps = trajectory.steps
        if not steps or len(steps) < 2:
            return 0.0

        # Prefer structured error details when a step's result carries them;
        # otherwise count feedback issues.
        error_counts = [
            len(step.result.error_details or {})
            if hasattr(step, "result") and hasattr(step.result, "error_details")
            else len(step.feedback.issues)
            for step in steps
        ]

        if not error_counts or error_counts[0] == 0:
            # Nothing to recover from counts as full efficiency.
            return 1.0

        resolved = error_counts[0] - error_counts[-1]
        return resolved / error_counts[0]
| |
|
| |
|
class DynamicComplexityHandling:
    """
    Measures how well the model handles varying problem complexity.

    Each step's score is weighted by the inverse of its problem difficulty
    (floored at 0.1 to avoid division blow-ups), then averaged.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the dynamic complexity handling metric.

        Args:
            config: Configuration options
        """
        self.config = config or {}

    def calculate(self, trajectory: "Trajectory") -> float:
        """
        Calculate the dynamic complexity handling score.

        Args:
            trajectory: The solution trajectory

        Returns:
            Mean difficulty-weighted score; 0.0 for an empty trajectory.
        """
        steps = trajectory.steps
        if not steps:
            return 0.0

        total = 0.0
        for step in steps:
            # Floor the difficulty so near-zero values cannot dominate.
            difficulty = max(0.1, step.problem_state.difficulty)
            total += step.result.score / difficulty

        return total / len(steps)
| |
|
| |
|
class RecursiveFrameworkMetrics:
    """
    Comprehensive collection of metrics for recursive evaluation.

    Registers every built-in recursive metric (plus any caller-supplied
    custom metrics) and provides uniform calculation over one or many
    trajectories.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the recursive framework metrics.

        Args:
            config: Configuration options; each metric reads its own
                sub-config by name, and "custom_metrics" may map extra
                names to ready-made metric objects.
        """
        self.config = config or {}

        # Built-in metrics, each constructed with its own config section.
        builtin = {
            "learning_curve_area": RecursiveLearningCurveArea,
            "adaptation_rate": AdaptationRate,
            "volatility": RecursiveVolatility,
            "convergence_index": ConvergenceIndex,
            "error_recovery": ErrorRecoveryEfficiency,
            "complexity_handling": DynamicComplexityHandling,
        }
        self.metrics = {
            name: metric_cls(self.config.get(name))
            for name, metric_cls in builtin.items()
        }

        # Caller-supplied metric objects are registered as-is and may
        # shadow built-ins of the same name.
        for name, metric in self.config.get("custom_metrics", {}).items():
            self.metrics[name] = metric

    def calculate_all(self, trajectory: "Trajectory") -> Dict[str, float]:
        """
        Calculate all metrics for a trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            Dictionary of metric names and values
        """
        results = {}
        for name, metric in self.metrics.items():
            results[name] = metric.calculate(trajectory)
        return results

    def calculate(self, trajectory: "Trajectory", metric_name: str) -> float:
        """
        Calculate a specific metric for a trajectory.

        Args:
            trajectory: The solution trajectory
            metric_name: The name of the metric to calculate

        Returns:
            The calculated metric value

        Raises:
            ValueError: If the metric name is not registered.
        """
        try:
            metric = self.metrics[metric_name]
        except KeyError:
            raise ValueError(f"Unknown metric: {metric_name}") from None
        return metric.calculate(trajectory)

    def aggregate_metrics(self, trajectories: List["Trajectory"]) -> Dict[str, float]:
        """
        Calculate aggregate metrics across multiple trajectories.

        Args:
            trajectories: List of solution trajectories

        Returns:
            Dictionary of per-metric mean values; empty for no trajectories.
        """
        if not trajectories:
            return {}

        per_trajectory = [self.calculate_all(t) for t in trajectories]
        count = len(per_trajectory)

        # Mean of each metric across all trajectories.
        return {
            name: sum(results[name] for results in per_trajectory) / count
            for name in self.metrics
        }
| |
|
| |
|
| | |
| |
|
| | import matplotlib.pyplot as plt |
| | import numpy as np |
| | import pandas as pd |
| | from typing import Any, Dict, List, Optional, Union |
| | import os |
| | import json |
| | import seaborn as sns |
| | from pathlib import Path |
| |
|
| | from recursive_swe_bench.core.recursive_task import Trajectory |
| |
|
| |
|
| | class RecursiveVisualizer: |
| | """ |
| | Visualization tools for recursive evaluation results. |
| | |
| | This class provides methods for visualizing recursive trajectories, |
| | metrics, and comparative analysis across models. |
| | """ |
| | |
| | def __init__(self, output_dir: Optional[str] = None, config: Dict[str, Any] = None): |
| | """ |
| | Initialize the recursive visualizer. |
| | |
| | Args: |
| | output_dir: Directory to save visualizations |
| | config: Configuration options |
| | """ |
| | self.output_dir = output_dir |
| | if output_dir: |
| | os.makedirs(output_dir, exist_ok=True) |
| | |
| | self.config = config or {} |
| | self.theme = self.config.get("theme", "default") |
| | |
| | |
| | if self.theme == "dark": |
| | plt.style.use("dark_background") |
| | self.colors = sns.color_palette("viridis", 10) |
| | else: |
| | plt.style.use("seaborn-v0_8-whitegrid") |
| | self.colors = sns.color_palette("muted", 10) |
| | |
| | sns.set_context("talk") |
| | |
    def plot_trajectory(
        self,
        trajectory: Trajectory,
        title: Optional[str] = None,
        show: bool = True,
        save_path: Optional[str] = None
    ):
        """
        Plot a solution trajectory showing score evolution.

        Draws per-iteration solution scores as a solid line with circle
        markers and, when step data is available, the per-iteration problem
        difficulty as a dashed overlay with square markers.

        Args:
            trajectory: The solution trajectory
            title: Optional title for the plot
            show: Whether to display the plot
            save_path: Optional path to save the plot (joined with
                self.output_dir when one is configured)
        """
        scores = trajectory.get_score_series()
        if not scores:
            # Nothing to plot for an empty trajectory.
            return

        plt.figure(figsize=(10, 6))

        # Score series; iterations are 1-indexed on the x-axis.
        plt.plot(range(1, len(scores) + 1), scores, marker='o',
                 linewidth=2, markersize=8, color=self.colors[0])

        # Difficulty overlay read from each step's problem state.
        difficulties = [step.problem_state.difficulty for step in trajectory.steps]
        if difficulties:
            plt.plot(range(1, len(difficulties) + 1), difficulties, marker='s',
                     linewidth=2, markersize=8, color=self.colors[1], linestyle='--',
                     label='Problem Difficulty')

        plt.title(title or f"Solution Trajectory for Task {trajectory.task_id}")
        plt.xlabel("Iteration")
        plt.ylabel("Score / Difficulty")
        plt.grid(True)
        plt.ylim(0, 1.05)
        plt.xticks(range(1, len(scores) + 1))
        # NOTE(review): the hard-coded legend labels assume both lines were
        # drawn; if the difficulty overlay is absent the labels would be
        # misassigned — confirm whether that case can occur.
        plt.legend(["Solution Score", "Problem Difficulty"])

        if save_path:
            # Save relative to the configured output directory when present.
            full_path = os.path.join(self.output_dir, save_path) if self.output_dir else save_path
            plt.savefig(full_path, bbox_inches='tight', dpi=300)

        # Either show interactively or close the figure to free memory.
        if show:
            plt.show()
        else:
            plt.close()
| | |
| | def plot_metrics_comparison( |
| | self, |
| | metrics_by_model: Dict[str, Dict[str, float]], |
| | title: Optional[str] = None, |
| | show: bool = True, |
| | save_path: Optional[str] = None |
| | ): |
| | """ |
| | Plot a comparison of metrics across models. |
| | |
| | Args: |
| | metrics_by_model: Dictionary mapping model names to metric values |
| | title: Optional title for the plot |
| | show: Whether to display the plot |
| | save_path: Optional path to save the plot |
| | """ |
| | if not metrics_by_model: |
| | return |
| | |
| | |
| | df = pd.DataFrame(metrics_by_model).T |
| | |
| | |
| | categories = list(df.columns) |
| | N = len(categories) |
| | |
| | |
| | angles = [n / float(N) * 2 * np.pi for n in range(N)] |
| | angles += angles[:1] |
| | |
| | |
| | fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True)) |
| | |
| | |
| | for i, (model, metrics) in enumerate(df.iterrows()): |
| | values = metrics.values.flatten().tolist() |
| | values += values[:1] |
| | |
| | |
| | ax.plot(angles, values, linewidth=2, linestyle='solid', |
| | label=model, color=self.colors[i % len(self.colors)]) |
| | ax.fill(angles, values, alpha=0.1, color=self.colors[i % len(self.colors)]) |
| | |
| | |
| | plt.xticks(angles[:-1], categories) |
| | |
| | |
| | plt.ylim(0, 1) |
| | |
| | |
| | plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1)) |
| | |
| | |
| | plt.title(title or "Metrics Comparison Across Models") |
| | |
| | |
| | if save_path: |
| | full_path = os.path.join(self.output_dir, save_path) if self.output_dir else save_path |
| | plt.savefig(full_path, bbox_inches='tight', |
| |
|