"""Heuristic baseline runner for the /baseline endpoint. Applies expected_fixes directly to verify the environment + grader work e2e. By default runs ALL scenarios of ALL tasks for deterministic, reproducible evaluation. """ from typing import List, Optional from server.environment import CloudNativeDebugEnvironment from server.graders import run_grader from server.models import Action, ActionType, FileEdit, GraderResult from server.tasks.task_registry import TASK_REGISTRY def _heuristic_episode(env: CloudNativeDebugEnvironment, task_id: str, scenario_id: Optional[str] = None) -> GraderResult: """Run one episode using a heuristic that applies expected fixes.""" obs = env.reset(task_id=task_id, scenario_id=scenario_id) # Apply each expected fix as an edit_file action for fix in env.expected_fixes: if env.done: break file_path = fix["file"] if file_path not in env.current_files: # For fixes that require creating a new file (e.g. ConfigMap), # create it with the expected content if fix["type"] == "contains": action = Action( action_type=ActionType.EDIT_FILE, edits=[FileEdit( file_path=file_path, new_content=fix["expected"], )], ) env.step(action) continue current_content = env.current_files[file_path].content if fix["type"] == "contains": # Need to ensure expected string is present if fix["expected"] not in current_content: # Try to find the broken line using hint hint_text = fix.get("hint", "") # Use edit_file with old/new content based on the fix # We look at original files to find what changed original_content = env.original_files.get(file_path) if original_content: lines = current_content.split("\n") expected = fix["expected"] line_num = fix.get("line") if line_num and 1 <= line_num <= len(lines): old_line = lines[line_num - 1] action = Action( action_type=ActionType.REPLACE_LINE, edits=[FileEdit( file_path=file_path, line_number=line_num, new_content=expected, )], ) else: # Find the line with highest character overlap to expected best_line = None best_idx = None best_score = 0 for i, line in enumerate(lines): stripped = line.strip() exp_stripped = expected.strip() if not stripped or not exp_stripped: continue overlap = len(set(stripped) & set(exp_stripped)) # Use ratio of overlap to max length for scoring score = overlap / max(len(exp_stripped), len(stripped)) if score > 0.5 and score > best_score: best_line = line best_idx = i best_score = score if best_line is not None: action = Action( action_type=ActionType.EDIT_FILE, edits=[FileEdit( file_path=file_path, old_content=best_line, new_content=expected, )], ) else: # Append the expected content action = Action( action_type=ActionType.ADD_LINE, edits=[FileEdit( file_path=file_path, new_content=expected, )], ) env.step(action) elif fix["type"] == "not_contains": # Need to ensure expected string is NOT present if fix["expected"] in current_content: action = Action( action_type=ActionType.DELETE_BLOCK, edits=[FileEdit( file_path=file_path, old_content=fix["expected"], )], ) env.step(action) elif fix["type"] == "line_equals": line_num = int(fix.get("line", 0)) if line_num >= 1: action = Action( action_type=ActionType.REPLACE_LINE, edits=[FileEdit( file_path=file_path, line_number=line_num, new_content=str(fix["expected"]), )], ) env.step(action) # Submit if not already done if not env.done: env.step(Action(action_type=ActionType.SUBMIT)) return run_grader(task_id, env.trajectory) def run_baseline_episodes(task_id: Optional[str] = None, num_episodes: Optional[int] = None) -> List[GraderResult]: """Run baseline episodes across tasks. Args: task_id: Specific task to run, or None for all tasks. num_episodes: Max scenarios per task. None = run ALL scenarios (default). Returns: List of GraderResult for each episode. """ task_ids: List[str] if task_id: if task_id not in TASK_REGISTRY: raise ValueError(f"Unknown task: {task_id}") task_ids = [task_id] else: task_ids = list(TASK_REGISTRY.keys()) results: List[GraderResult] = [] for tid in task_ids: task_cls = TASK_REGISTRY[tid] scenarios = task_cls.SCENARIOS for idx, scenario in enumerate(scenarios): if num_episodes is not None and idx >= num_episodes: break env = CloudNativeDebugEnvironment() result = _heuristic_episode(env, tid, scenario["id"]) results.append(result) return results