| import os |
| import sys |
| import json |
| from pathlib import Path |
| from typing import Optional |
|
|
| from openhands.sdk import LLM, Agent, Conversation, Tool |
| from openhands.tools.file_editor import FileEditorTool |
| from openhands.tools.task_tracker import TaskTrackerTool |
| from openhands.tools.terminal import TerminalTool |
|
|
|
|
| |
| |
| |
def evolution_evaluation_agent(
    results_dir: str,
    current_gen: int = 0,
    primary_evaluator_path: Optional[str] = None,
    task_message: Optional[str] = None,
) -> str:
    """Run the Evolution Evaluation (EV2) agent for one generation.

    Specialized agent for analyzing evolution progress and creating
    AUXILIARY evaluation metrics during the code evolution process.

    IMPORTANT: The agent is only meant to create auxiliary/supplementary
    metrics. The primary evaluation metric is fixed; the default task
    message tells the agent not to modify it.

    The LLM is configured from the environment: ``LLM_MODEL`` (defaults to
    ``vertex_ai/gemini-2.5-flash``), ``LLM_API_KEY`` and ``LLM_BASE_URL``.

    Args:
        results_dir: Path to ShinkaEvolve results directory.
        current_gen: Current generation number.
        primary_evaluator_path: Path to the primary evaluator (ground truth).
        task_message: Custom task message (None = build the default one).

    Returns:
        Path to the agent workspace directory, as a string.
    """
    banner = "=" * 80

    # The workspace lives inside the results directory and is reused
    # between calls (it is only created when missing).
    agent_workspace = Path(results_dir) / "eval_agent_memory"
    agent_workspace.mkdir(parents=True, exist_ok=True)

    print(banner)
    print("🤖 EV2 Agent Initialization")
    print(banner)
    print(f"Results Dir: {results_dir}")
    print(f"Generation: {current_gen}")
    print(f"Workspace: {agent_workspace}")
    print(banner)
    print()

    llm = LLM(
        model=os.getenv("LLM_MODEL", "vertex_ai/gemini-2.5-flash"),
        api_key=os.getenv("LLM_API_KEY"),
        base_url=os.getenv("LLM_BASE_URL"),
    )

    # The system prompt template is expected next to this module.
    ev2_prompt_path = Path(__file__).parent / "ev2_prompt.j2"

    agent = Agent(
        llm=llm,
        tools=[
            Tool(name=TerminalTool.name),
            Tool(name=FileEditorTool.name),
            Tool(name=TaskTrackerTool.name),
        ],
        system_prompt_filename=str(ev2_prompt_path),
    )

    conversation = Conversation(agent=agent, workspace=str(agent_workspace))

    if task_message is None:
        task_message = _build_default_task(
            results_dir, current_gen, agent_workspace, primary_evaluator_path
        )

    print("📝 Sending task to agent...")
    print()
    conversation.send_message(task_message)

    print("🔄 Agent working...")
    print()
    conversation.run()

    print()
    print(banner)
    print("✅ EV2 Evaluation Complete!")
    print(banner)
    print(f"📁 Workspace: {agent_workspace}")
    print(f"📝 Memory: {agent_workspace}/EVAL_AGENTS.md")
    print(banner)

    return str(agent_workspace)
|
|
|
|
def _build_default_task(
    results_dir: str,
    current_gen: int,
    workspace: Path,
    primary_evaluator_path: Optional[str] = None,
) -> str:
    """Build the default task message with evolution context.

    The message lists file locations for the given generation, the data
    available under ``results_dir``, an optional notice about the fixed
    primary evaluator, and the agent's task list.

    Args:
        results_dir: Results directory path.
        current_gen: Current generation number.
        workspace: Agent workspace path.
        primary_evaluator_path: Path to primary evaluator (ground truth).
            When falsy, the primary-evaluator section is omitted.

    Returns:
        Task message string (sections joined with newlines).
    """
    results_path = Path(results_dir)
    current_gen_dir = results_path / f"gen_{current_gen}"
    current_metrics = current_gen_dir / "results" / "metrics.json"

    # Best-effort read of the current score: a missing, unreadable or
    # malformed metrics file simply means no score line is emitted.
    try:
        with open(current_metrics, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        data = None

    current_score = None
    if isinstance(data, dict):
        score = data.get("combined_score")
        # Only numeric scores can be rendered with the ``.4f`` format below.
        if isinstance(score, (int, float)):
            current_score = score

    task_parts = [
        f"=== Generation {current_gen} Evaluation ===",
        "",
        "📁 File Locations:",
        f"- Your workspace: {workspace}",
        f"- Results directory: {results_path}",
        f"- Current generation: {current_gen_dir}",
    ]

    if current_gen > 0:
        task_parts.extend([
            f"- Current code: {current_gen_dir}/main.py",
            f"- Current metrics: {current_metrics}",
        ])
        if current_score is not None:
            task_parts.append(f"- Current score: {current_score:.4f}")

    task_parts.extend([
        "",
        "📊 Available Data:",
        f"- Evolution database: {results_path}/evolution_db_*.sqlite",
        f"- All generations: {results_path}/gen_0/ through gen_{current_gen}/",
        "- Each generation has: main.py and results/metrics.json",
    ])

    if primary_evaluator_path:
        primary_eval_path = Path(primary_evaluator_path)
        task_parts.extend([
            "",
            "⚠️ PRIMARY EVALUATOR (FIXED - DO NOT MODIFY):",
            f"- Path: {primary_eval_path}",
            "- This file defines the GROUND TRUTH evaluation metric",
            "- The primary score comes from this evaluator",
            "- You MUST NOT modify or replace this evaluator",
            "- You can READ it to understand what is being optimized",
            "- Your job is to create AUXILIARY metrics that complement it",
        ])

    task_parts.extend([
        "",
        "🎯 Your Specific Tasks for This Generation:",
        # This item must be an f-string so the generation number is filled in.
        f"1. Analyze evolution progress up to generation {current_gen}",
        "2. Review performance trends from recent generations",
        "3. Identify what aspects are NOT being measured by primary metric",
        "4. Design 2-3 auxiliary metrics that would provide useful insights",
        "5. Implement these metrics as Python functions in your workspace",
        "6. Test metrics on current generation data",
        "7. Document findings and metric designs in EVAL_AGENTS.md",
        "",
        "💾 Your Memory:",
        "- EVAL_AGENTS.md is your persistent memory across generations",
        "- Record which metrics were useful, which weren't",
        "- Build a library of reusable auxiliary metric functions",
        "",
        "💡 Good Auxiliary Metrics Examples:",
        "- Solution diversity measures",
        "- Convergence rate analysis",
        "- Robustness to perturbations",
        "- Structural pattern detection",
        "- Resource efficiency metrics",
        "",
        "Focus on aspects NOT captured by the primary evaluator.",
    ])

    return "\n".join(task_parts)
|
|
|
|
| |
| |
| |
# Command-line entry point:
#   python ev2.py <results_dir> [current_gen] [primary_evaluator_path]
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python ev2.py <results_dir> [current_gen] [primary_evaluator_path]")
        print()
        print("Arguments:")
        print(" results_dir: Path to ShinkaEvolve results directory")
        print(" current_gen: Current generation number (default: 0)")
        print(" primary_evaluator_path: Path to primary evaluator file (optional)")
        print()
        print("Examples:")
        print(" # Basic usage")
        print(" python eval_agent/ev2.py examples/circle_packing/results/results_exp_20260129 42")
        print()
        print(" # With primary evaluator (recommended)")
        print(" python eval_agent/ev2.py \\")
        print(" examples/circle_packing/results/results_circle_packing_NO_vision_WITH_refined_aux_20260118_205215 \\")
        print(" 100 \\")
        print(" examples/circle_packing/evaluate_ori.py")
        sys.exit(1)

    results_dir = sys.argv[1]

    # Report a non-integer generation as a usage error instead of a traceback.
    try:
        current_gen = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    except ValueError:
        print(f"❌ Error: current_gen must be an integer, got: {sys.argv[2]!r}")
        sys.exit(1)

    primary_evaluator_path = sys.argv[3] if len(sys.argv) > 3 else None

    if primary_evaluator_path:
        # Resolve to an absolute path and fail fast if the file is missing.
        primary_evaluator_path = os.path.abspath(primary_evaluator_path)
        if not Path(primary_evaluator_path).exists():
            print(f"❌ Error: Primary evaluator not found: {primary_evaluator_path}")
            sys.exit(1)

        print(f"📋 Primary Evaluator: {primary_evaluator_path}")
        print(" (Agent will be constrained to NOT modify this)")
        print()

    workspace = evolution_evaluation_agent(results_dir, current_gen, primary_evaluator_path)

    print()
    print("✅ Done! Check the agent workspace:")
    print(f" {workspace}")
    print()
    print("Files to inspect:")
    print(f" - {workspace}/EVAL_AGENTS.md (agent memory)")
    print(f" - {workspace}/*.py (any auxiliary metrics agent created)")
    if primary_evaluator_path:
        print()
        print("⚠️ Remember: Agent cannot modify the primary evaluator!")
        print(f" Primary evaluator remains: {primary_evaluator_path}")
    print()