File size: 6,647 Bytes
"""Heuristic baseline runner for the /baseline endpoint.
Applies expected_fixes directly to verify the environment + grader work e2e.
By default runs ALL scenarios of ALL tasks for deterministic, reproducible evaluation.
"""
from typing import List, Optional
from server.environment import CloudNativeDebugEnvironment
from server.graders import run_grader
from server.models import Action, ActionType, FileEdit, GraderResult
from server.tasks.task_registry import TASK_REGISTRY
def _heuristic_episode(env: CloudNativeDebugEnvironment, task_id: str, scenario_id: Optional[str] = None) -> GraderResult:
    """Run one episode by applying the environment's expected fixes directly.

    Serves as a deterministic "oracle" baseline: each entry in
    ``env.expected_fixes`` is translated into the edit action that would
    satisfy it, the episode is submitted, and the trajectory is graded.

    Args:
        env: Freshly constructed debug environment to run the episode in.
        task_id: Task to reset the environment to.
        scenario_id: Specific scenario within the task, or None for default.

    Returns:
        The GraderResult from running the task grader on ``env.trajectory``.
    """
    env.reset(task_id=task_id, scenario_id=scenario_id)
    # Apply each expected fix as an edit action.
    for fix in env.expected_fixes:
        if env.done:
            break
        file_path = fix["file"]
        if file_path not in env.current_files:
            # For fixes that require creating a new file (e.g. ConfigMap),
            # create it with the expected content.
            if fix["type"] == "contains":
                action = Action(
                    action_type=ActionType.EDIT_FILE,
                    edits=[FileEdit(
                        file_path=file_path,
                        new_content=fix["expected"],
                    )],
                )
                env.step(action)
            continue
        current_content = env.current_files[file_path].content
        if fix["type"] == "contains":
            # Need to ensure the expected string is present.
            if fix["expected"] not in current_content:
                # Look at the original files to find what changed; if the file
                # has no original counterpart there is nothing to anchor an
                # edit against, so the fix is skipped.
                original_content = env.original_files.get(file_path)
                if original_content:
                    lines = current_content.split("\n")
                    expected = fix["expected"]
                    line_num = fix.get("line")
                    if line_num and 1 <= line_num <= len(lines):
                        # The fix names an explicit 1-based line: replace it.
                        action = Action(
                            action_type=ActionType.REPLACE_LINE,
                            edits=[FileEdit(
                                file_path=file_path,
                                line_number=line_num,
                                new_content=expected,
                            )],
                        )
                    else:
                        # Find the line with highest character overlap to the
                        # expected content.
                        best_line = None
                        best_score = 0.0
                        exp_stripped = expected.strip()
                        for line in lines:
                            stripped = line.strip()
                            if not stripped or not exp_stripped:
                                continue
                            overlap = len(set(stripped) & set(exp_stripped))
                            # Ratio of overlap to the longer string, so short
                            # lines cannot win on trivially small matches.
                            score = overlap / max(len(exp_stripped), len(stripped))
                            if score > 0.5 and score > best_score:
                                best_line = line
                                best_score = score
                        if best_line is not None:
                            action = Action(
                                action_type=ActionType.EDIT_FILE,
                                edits=[FileEdit(
                                    file_path=file_path,
                                    old_content=best_line,
                                    new_content=expected,
                                )],
                            )
                        else:
                            # No plausible target line found: append the
                            # expected content instead.
                            action = Action(
                                action_type=ActionType.ADD_LINE,
                                edits=[FileEdit(
                                    file_path=file_path,
                                    new_content=expected,
                                )],
                            )
                    env.step(action)
        elif fix["type"] == "not_contains":
            # Need to ensure the expected string is NOT present.
            if fix["expected"] in current_content:
                action = Action(
                    action_type=ActionType.DELETE_BLOCK,
                    edits=[FileEdit(
                        file_path=file_path,
                        old_content=fix["expected"],
                    )],
                )
                env.step(action)
        elif fix["type"] == "line_equals":
            line_num = int(fix.get("line", 0))
            if line_num >= 1:
                action = Action(
                    action_type=ActionType.REPLACE_LINE,
                    edits=[FileEdit(
                        file_path=file_path,
                        line_number=line_num,
                        new_content=str(fix["expected"]),
                    )],
                )
                env.step(action)
    # Submit if the episode has not already terminated.
    if not env.done:
        env.step(Action(action_type=ActionType.SUBMIT))
    return run_grader(task_id, env.trajectory)
def run_baseline_episodes(task_id: Optional[str] = None, num_episodes: Optional[int] = None) -> List[GraderResult]:
    """Run heuristic baseline episodes across one or all registered tasks.

    Args:
        task_id: Specific task to run, or None to run every registered task.
        num_episodes: Maximum number of scenarios per task; None (the
            default) runs ALL scenarios of each task.

    Returns:
        One GraderResult per episode, in task/scenario registration order.

    Raises:
        ValueError: If ``task_id`` is given but not present in TASK_REGISTRY.
    """
    if task_id:
        if task_id not in TASK_REGISTRY:
            raise ValueError(f"Unknown task: {task_id}")
        selected_tasks: List[str] = [task_id]
    else:
        selected_tasks = list(TASK_REGISTRY.keys())

    results: List[GraderResult] = []
    for tid in selected_tasks:
        scenarios = TASK_REGISTRY[tid].SCENARIOS
        # None means "no cap"; negative caps behave like zero.
        cap = len(scenarios) if num_episodes is None else max(num_episodes, 0)
        for scenario in scenarios[:cap]:
            env = CloudNativeDebugEnvironment()
            results.append(_heuristic_episode(env, tid, scenario["id"]))
    return results