# cloudnative-devops-debug-env / baseline_runner.py
# Krishna1107's picture
# fixed inference
# 2794920
"""Heuristic baseline runner for the /baseline endpoint.
Applies expected_fixes directly to verify the environment + grader work e2e.
By default runs ALL scenarios of ALL tasks for deterministic, reproducible evaluation.
"""
from typing import List, Optional
from server.environment import CloudNativeDebugEnvironment
from server.graders import run_grader
from server.models import Action, ActionType, FileEdit, GraderResult
from server.tasks.task_registry import TASK_REGISTRY
def _heuristic_episode(env: CloudNativeDebugEnvironment, task_id: str, scenario_id: Optional[str] = None) -> GraderResult:
    """Run one episode using a heuristic that applies expected fixes.

    The environment is reset for the given task/scenario, then every entry of
    ``env.expected_fixes`` is translated into at most one action and stepped.
    Iteration stops early as soon as the environment reports ``done``. If the
    episode is still open afterwards, a SUBMIT action is sent.

    Args:
        env: Environment to drive; it is reset by this function.
        task_id: Task identifier passed to ``env.reset`` and ``run_grader``.
        scenario_id: Optional scenario identifier passed to ``env.reset``.

    Returns:
        The result of ``run_grader(task_id, env.trajectory)``.
    """
    # The observation returned by reset() is not needed by this heuristic.
    env.reset(task_id=task_id, scenario_id=scenario_id)

    for fix in env.expected_fixes:
        if env.done:
            break
        action = _plan_fix_action(env, fix)
        if action is not None:
            env.step(action)

    # Submit if not already done
    if not env.done:
        env.step(Action(action_type=ActionType.SUBMIT))
    return run_grader(task_id, env.trajectory)


def _plan_fix_action(env: "CloudNativeDebugEnvironment", fix: dict) -> "Optional[Action]":
    """Return the action that satisfies *fix*, or None when nothing is sent.

    Reads ``fix["file"]``, ``fix["type"]``, ``fix["expected"]`` and the
    optional ``fix["line"]``. Fix types other than "contains",
    "not_contains" and "line_equals" produce no action.
    """
    file_path = fix["file"]
    fix_type = fix["type"]

    if file_path not in env.current_files:
        # For fixes that require creating a new file (e.g. ConfigMap),
        # create it with the expected content. Other fix types on a missing
        # file are skipped.
        if fix_type != "contains":
            return None
        return Action(
            action_type=ActionType.EDIT_FILE,
            edits=[FileEdit(
                file_path=file_path,
                new_content=fix["expected"],
            )],
        )

    current_content = env.current_files[file_path].content

    if fix_type == "contains":
        # Nothing to do when the expected string is already present.
        if fix["expected"] in current_content:
            return None
        # NOTE(review): the original content is only used as a guard here;
        # a file with no (or falsy) entry in env.original_files is skipped.
        if not env.original_files.get(file_path):
            return None
        return _plan_contains_edit(fix, file_path, current_content)

    if fix_type == "not_contains":
        # Only act when the unwanted string is actually present.
        if fix["expected"] not in current_content:
            return None
        return Action(
            action_type=ActionType.DELETE_BLOCK,
            edits=[FileEdit(
                file_path=file_path,
                old_content=fix["expected"],
            )],
        )

    if fix_type == "line_equals":
        line_num = int(fix.get("line", 0))
        if line_num < 1:
            return None
        return Action(
            action_type=ActionType.REPLACE_LINE,
            edits=[FileEdit(
                file_path=file_path,
                line_number=line_num,
                new_content=str(fix["expected"]),
            )],
        )

    return None


def _plan_contains_edit(fix: dict, file_path: str, current_content: str) -> "Action":
    """Build the action that inserts ``fix["expected"]`` into an existing file.

    Preference order: replace the line named by ``fix["line"]`` when it is a
    valid 1-based index into the file, otherwise replace the most similar
    existing line, otherwise add the expected content as a new line.
    """
    expected = fix["expected"]
    lines = current_content.split("\n")

    line_num = fix.get("line")
    if line_num and 1 <= line_num <= len(lines):
        return Action(
            action_type=ActionType.REPLACE_LINE,
            edits=[FileEdit(
                file_path=file_path,
                line_number=line_num,
                new_content=expected,
            )],
        )

    closest = _most_similar_line(lines, expected)
    if closest is not None:
        return Action(
            action_type=ActionType.EDIT_FILE,
            edits=[FileEdit(
                file_path=file_path,
                old_content=closest,
                new_content=expected,
            )],
        )

    # No sufficiently similar line: add the expected content instead.
    return Action(
        action_type=ActionType.ADD_LINE,
        edits=[FileEdit(
            file_path=file_path,
            new_content=expected,
        )],
    )


def _most_similar_line(lines: List[str], expected: str) -> Optional[str]:
    """Return the line of *lines* most similar to *expected*, or None.

    The score of a line is the number of distinct characters it shares with
    *expected* (both stripped), divided by the length of the longer stripped
    string. Only scores strictly above 0.5 qualify and ties keep the earliest
    line. Blank lines are ignored, and a blank *expected* matches nothing.
    """
    target = expected.strip()
    if not target:
        return None
    target_chars = set(target)

    best_line: Optional[str] = None
    best_score = 0.0
    for line in lines:
        candidate = line.strip()
        if not candidate:
            continue
        shared = len(target_chars & set(candidate))
        score = shared / max(len(target), len(candidate))
        if score > 0.5 and score > best_score:
            best_line = line
            best_score = score
    return best_line
def run_baseline_episodes(task_id: Optional[str] = None, num_episodes: Optional[int] = None) -> List[GraderResult]:
    """Run the heuristic baseline over one task or over every registered task.

    Each scenario is executed in its own freshly constructed
    ``CloudNativeDebugEnvironment``.

    Args:
        task_id: Task to run. A falsy value (the default ``None``) selects
            every key of ``TASK_REGISTRY``.
        num_episodes: Upper bound on scenarios run per task. ``None`` (the
            default) runs all scenarios of each task.

    Returns:
        One GraderResult per executed scenario, in task then scenario order.

    Raises:
        ValueError: If ``task_id`` is given but is not in ``TASK_REGISTRY``.
    """
    # Reject an unknown task before doing any work.
    if task_id and task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}")

    selected_tasks: List[str] = [task_id] if task_id else list(TASK_REGISTRY.keys())

    collected: List[GraderResult] = []
    for current_task in selected_tasks:
        task_scenarios = TASK_REGISTRY[current_task].SCENARIOS
        for position, scenario in enumerate(task_scenarios):
            cap_reached = num_episodes is not None and position >= num_episodes
            if cap_reached:
                break
            collected.append(
                _heuristic_episode(
                    CloudNativeDebugEnvironment(),
                    current_task,
                    scenario["id"],
                )
            )
    return collected