| """Heuristic baseline runner for the /baseline endpoint. |
| |
| Applies expected_fixes directly to verify the environment + grader work e2e. |
| By default runs ALL scenarios of ALL tasks for deterministic, reproducible evaluation. |
| """ |
|
|
|
|
| from typing import List, Optional |
|
|
| from server.environment import CloudNativeDebugEnvironment |
| from server.graders import run_grader |
| from server.models import Action, ActionType, FileEdit, GraderResult |
| from server.tasks.task_registry import TASK_REGISTRY |
|
|
|
|
def _coerce_line_number(raw) -> Optional[int]:
    """Return ``raw`` as a 1-based line number, or None when it is unusable.

    Missing values (None), values ``int()`` cannot convert, and numbers
    below 1 all yield None so callers can treat "no usable line hint"
    uniformly instead of crashing on a malformed fix entry.
    """
    try:
        number = int(raw)
    except (TypeError, ValueError):
        return None
    return number if number >= 1 else None


def _find_similar_line(lines: List[str], expected: str) -> Optional[str]:
    """Return the first line of ``lines`` that best resembles ``expected``.

    The score of a line is the number of distinct characters it shares with
    ``expected`` (both stripped) divided by the length of the longer of the
    two stripped strings. Only scores strictly above 0.5 qualify, and ties
    keep the earliest line. Returns None when nothing qualifies or when
    ``expected`` is blank. The returned line is the original, unstripped text.
    """
    target = expected.strip()
    if not target:
        return None
    # Loop-invariant: compute the target's character set once.
    target_chars = set(target)

    best_line: Optional[str] = None
    best_score = 0.0
    for line in lines:
        candidate = line.strip()
        if not candidate:
            continue
        shared = len(set(candidate) & target_chars)
        score = shared / max(len(target), len(candidate))
        if score > 0.5 and score > best_score:
            best_line, best_score = line, score
    return best_line


def _build_contains_action(file_path: str, current_content: str, fix: dict) -> Action:
    """Build the action that makes ``fix["expected"]`` appear in the file.

    Preference order:
      1. REPLACE_LINE at the fix's ``line`` hint when it falls inside the file.
      2. EDIT_FILE swapping the most similar existing line for the expected text.
      3. ADD_LINE with the expected text when no line is similar enough.
    """
    expected = fix["expected"]
    lines = current_content.split("\n")

    line_num = _coerce_line_number(fix.get("line"))
    if line_num is not None and line_num <= len(lines):
        return Action(
            action_type=ActionType.REPLACE_LINE,
            edits=[FileEdit(
                file_path=file_path,
                line_number=line_num,
                new_content=expected,
            )],
        )

    similar_line = _find_similar_line(lines, expected)
    if similar_line is not None:
        return Action(
            action_type=ActionType.EDIT_FILE,
            edits=[FileEdit(
                file_path=file_path,
                old_content=similar_line,
                new_content=expected,
            )],
        )

    return Action(
        action_type=ActionType.ADD_LINE,
        edits=[FileEdit(
            file_path=file_path,
            new_content=expected,
        )],
    )


def _heuristic_episode(env: CloudNativeDebugEnvironment, task_id: str, scenario_id: Optional[str] = None) -> GraderResult:
    """Run one episode using a heuristic that applies expected fixes.

    The environment is reset for the given task/scenario, then each entry of
    ``env.expected_fixes`` is turned into at most one action:

    * file missing from ``env.current_files``: a ``contains`` fix writes the
      expected text via EDIT_FILE; any other fix type is skipped.
    * ``contains``: if the expected text is absent, an action is built by
      ``_build_contains_action``. The fix is skipped when
      ``env.original_files`` has no truthy entry for the file.
    * ``not_contains``: if the text is present, it is removed via DELETE_BLOCK.
    * ``line_equals``: the hinted line is replaced with ``str(expected)``;
      skipped when the ``line`` hint is missing or not a number >= 1.

    Fix types other than these three are ignored. Processing stops as soon as
    ``env.done`` is set; otherwise a SUBMIT action is sent at the end.

    Args:
        env: Environment to drive; it is reset by this call.
        task_id: Task identifier passed to ``env.reset`` and ``run_grader``.
        scenario_id: Optional scenario identifier passed to ``env.reset``.

    Returns:
        The result of ``run_grader(task_id, env.trajectory)``.
    """
    env.reset(task_id=task_id, scenario_id=scenario_id)

    for fix in env.expected_fixes:
        if env.done:
            break

        file_path = fix["file"]
        fix_type = fix["type"]

        if file_path not in env.current_files:
            # Only a "contains" fix can be satisfied for a file that does not
            # exist; other fix types have nothing to operate on.
            if fix_type == "contains":
                env.step(Action(
                    action_type=ActionType.EDIT_FILE,
                    edits=[FileEdit(
                        file_path=file_path,
                        new_content=fix["expected"],
                    )],
                ))
            continue

        current_content = env.current_files[file_path].content

        # Reset per fix so a step never reuses an action from a previous
        # iteration or runs without one.
        action: Optional[Action] = None

        if fix_type == "contains":
            if fix["expected"] not in current_content and env.original_files.get(file_path):
                action = _build_contains_action(file_path, current_content, fix)

        elif fix_type == "not_contains":
            if fix["expected"] in current_content:
                action = Action(
                    action_type=ActionType.DELETE_BLOCK,
                    edits=[FileEdit(
                        file_path=file_path,
                        old_content=fix["expected"],
                    )],
                )

        elif fix_type == "line_equals":
            line_num = _coerce_line_number(fix.get("line"))
            if line_num is not None:
                action = Action(
                    action_type=ActionType.REPLACE_LINE,
                    edits=[FileEdit(
                        file_path=file_path,
                        line_number=line_num,
                        new_content=str(fix["expected"]),
                    )],
                )

        if action is not None:
            env.step(action)

    # Close the episode explicitly unless the environment already ended it.
    if not env.done:
        env.step(Action(action_type=ActionType.SUBMIT))

    return run_grader(task_id, env.trajectory)
|
|
|
|
def run_baseline_episodes(task_id: Optional[str] = None, num_episodes: Optional[int] = None) -> List[GraderResult]:
    """Run the heuristic baseline over one task or over every registered task.

    Each scenario gets a fresh CloudNativeDebugEnvironment, and scenarios are
    visited in the order they appear in the task class's ``SCENARIOS``.

    Args:
        task_id: Task to run. A falsy value (None or "") selects every task
            in TASK_REGISTRY.
        num_episodes: Upper bound on scenarios run per task. None means no
            bound, i.e. all scenarios.

    Returns:
        One GraderResult per executed episode, in execution order.

    Raises:
        ValueError: If ``task_id`` is given but is not in TASK_REGISTRY.
    """
    selected_tasks: List[str]
    if not task_id:
        selected_tasks = list(TASK_REGISTRY.keys())
    elif task_id in TASK_REGISTRY:
        selected_tasks = [task_id]
    else:
        raise ValueError(f"Unknown task: {task_id}")

    outcomes: List[GraderResult] = []
    for current_task in selected_tasks:
        task_scenarios = TASK_REGISTRY[current_task].SCENARIOS
        for position, scenario in enumerate(task_scenarios):
            limit_reached = num_episodes is not None and position >= num_episodes
            if limit_reached:
                break
            # Fresh environment per scenario so episodes do not share state.
            outcomes.append(
                _heuristic_episode(
                    CloudNativeDebugEnvironment(),
                    current_task,
                    scenario["id"],
                )
            )

    return outcomes
|
|