Spaces:

jester1177
/

cloudnative-devops-debug-env

Sleeping

App Files Files Community

cloudnative-devops-debug-env / baseline_runner.py

Krishna1107

fixed inference

2794920 about 1 month ago

raw

history blame contribute delete

6.65 kB

	"""Heuristic baseline runner for the /baseline endpoint.

	Applies each scenario's expected_fixes directly to verify that the environment and
	grader work end to end. By default it runs ALL scenarios of ALL tasks for
	deterministic, reproducible evaluation.
	"""


	from typing import List, Optional

	from server.environment import CloudNativeDebugEnvironment
	from server.graders import run_grader
	from server.models import Action, ActionType, FileEdit, GraderResult
	from server.tasks.task_registry import TASK_REGISTRY


	def _fix_line_number(fix: dict) -> Optional[int]:
	    """Return the fix's ``line`` entry coerced to ``int``, or None when absent/None."""
	    raw = fix.get("line")
	    return None if raw is None else int(raw)


	def _closest_line(lines: List[str], expected: str, min_score: float = 0.5) -> Optional[str]:
	    """Return the line whose character set overlaps *expected* the most.

	    The score is ``len(set(line) & set(expected)) / max(len(line), len(expected))``
	    computed on stripped text. Only scores strictly above *min_score* qualify and
	    the first line reaching the best score wins. Returns None when nothing qualifies.
	    """
	    target = expected.strip()
	    if not target:
	        return None
	    target_chars = set(target)

	    best_line = None
	    best_score = min_score
	    for line in lines:
	        stripped = line.strip()
	        if not stripped:
	            continue
	        score = len(set(stripped) & target_chars) / max(len(target), len(stripped))
	        if score > best_score:
	            best_line = line  # keep the unstripped line: it is used as old_content
	            best_score = score
	    return best_line


	def _contains_action(env: CloudNativeDebugEnvironment, fix: dict, file_path: str, current_content: str) -> Optional[Action]:
	    """Build the action that makes ``fix["expected"]`` appear in an existing file."""
	    expected = fix["expected"]
	    if expected in current_content:
	        return None
	    # The original snapshot only acts as a gate here: when the environment has
	    # no (truthy) original entry for this path, the fix is skipped.
	    if not env.original_files.get(file_path):
	        return None

	    lines = current_content.split("\n")
	    line_num = _fix_line_number(fix)
	    if line_num is not None and 1 <= line_num <= len(lines):
	        # The fix names a valid 1-based line: overwrite it.
	        return Action(
	            action_type=ActionType.REPLACE_LINE,
	            edits=[FileEdit(
	                file_path=file_path,
	                line_number=line_num,
	                new_content=expected,
	            )],
	        )

	    # No usable line number: replace the most similar line, else append.
	    anchor = _closest_line(lines, expected)
	    if anchor is not None:
	        return Action(
	            action_type=ActionType.EDIT_FILE,
	            edits=[FileEdit(
	                file_path=file_path,
	                old_content=anchor,
	                new_content=expected,
	            )],
	        )
	    return Action(
	        action_type=ActionType.ADD_LINE,
	        edits=[FileEdit(
	            file_path=file_path,
	            new_content=expected,
	        )],
	    )


	def _action_for_fix(env: CloudNativeDebugEnvironment, fix: dict) -> Optional[Action]:
	    """Translate one expected-fix record into an Action, or None if nothing to do."""
	    file_path = fix["file"]
	    fix_type = fix["type"]

	    if file_path not in env.current_files:
	        # Fixes that require creating a new file (e.g. a ConfigMap): create it
	        # with the expected content. Other fix types cannot apply to a missing file.
	        if fix_type != "contains":
	            return None
	        return Action(
	            action_type=ActionType.EDIT_FILE,
	            edits=[FileEdit(
	                file_path=file_path,
	                new_content=fix["expected"],
	            )],
	        )

	    current_content = env.current_files[file_path].content

	    if fix_type == "contains":
	        return _contains_action(env, fix, file_path, current_content)

	    if fix_type == "not_contains":
	        # Remove the forbidden text only if it is actually present.
	        if fix["expected"] not in current_content:
	            return None
	        return Action(
	            action_type=ActionType.DELETE_BLOCK,
	            edits=[FileEdit(
	                file_path=file_path,
	                old_content=fix["expected"],
	            )],
	        )

	    if fix_type == "line_equals":
	        line_num = _fix_line_number(fix)
	        if line_num is None or line_num < 1:
	            return None
	        return Action(
	            action_type=ActionType.REPLACE_LINE,
	            edits=[FileEdit(
	                file_path=file_path,
	                line_number=line_num,
	                new_content=str(fix["expected"]),
	            )],
	        )

	    # Unknown fix types are ignored.
	    return None


	def _heuristic_episode(env: CloudNativeDebugEnvironment, task_id: str, scenario_id: Optional[str] = None) -> GraderResult:
	    """Run one episode using a heuristic that applies the expected fixes.

	    Args:
	        env: Environment to drive; it is reset for the given task/scenario.
	        task_id: Task passed to ``env.reset`` and to ``run_grader``.
	        scenario_id: Scenario passed to ``env.reset``; None is forwarded as-is.

	    Returns:
	        The result of ``run_grader(task_id, env.trajectory)`` after all fixes
	        have been attempted and the episode has been submitted.
	    """
	    env.reset(task_id=task_id, scenario_id=scenario_id)

	    for fix in env.expected_fixes:
	        if env.done:
	            break
	        action = _action_for_fix(env, fix)
	        if action is not None:
	            env.step(action)

	    # Submit if not already done
	    if not env.done:
	        env.step(Action(action_type=ActionType.SUBMIT))

	    return run_grader(task_id, env.trajectory)


	def run_baseline_episodes(task_id: Optional[str] = None, num_episodes: Optional[int] = None) -> List[GraderResult]:
	    """Run the heuristic baseline over one task or over every registered task.

	    Args:
	        task_id: Task to run. A falsy value (None or "") selects all tasks
	            in TASK_REGISTRY.
	        num_episodes: Upper bound on scenarios run per task. None means
	            every scenario of each selected task is run.

	    Returns:
	        One GraderResult per executed scenario, in task then scenario order.

	    Raises:
	        ValueError: If a non-empty task_id is not present in TASK_REGISTRY.
	    """
	    if not task_id:
	        selected: List[str] = list(TASK_REGISTRY.keys())
	    elif task_id in TASK_REGISTRY:
	        selected = [task_id]
	    else:
	        raise ValueError(f"Unknown task: {task_id}")

	    capped = num_episodes is not None
	    outcomes: List[GraderResult] = []
	    for current_task in selected:
	        task_scenarios = TASK_REGISTRY[current_task].SCENARIOS
	        for position, scenario in enumerate(task_scenarios):
	            # Stop once the per-task cap is reached.
	            if capped and position >= num_episodes:
	                break
	            # A fresh environment per scenario keeps episodes independent.
	            fresh_env = CloudNativeDebugEnvironment()
	            outcomes.append(_heuristic_episode(fresh_env, current_task, scenario["id"]))

	    return outcomes