File size: 6,647 Bytes
"""Heuristic baseline runner for the /baseline endpoint.
Applies expected_fixes directly to verify the environment + grader work e2e.
By default runs ALL scenarios of ALL tasks for deterministic, reproducible evaluation.
"""
from typing import List, Optional
from server.environment import CloudNativeDebugEnvironment
from server.graders import run_grader
from server.models import Action, ActionType, FileEdit, GraderResult
from server.tasks.task_registry import TASK_REGISTRY
def _heuristic_episode(env: CloudNativeDebugEnvironment, task_id: str, scenario_id: Optional[str] = None) -> GraderResult:
    """Run one episode by applying the environment's expected fixes directly.

    Serves as a deterministic "oracle" baseline: each entry in
    ``env.expected_fixes`` is translated into the edit action that would
    satisfy it, the episode is submitted, and the trajectory is graded.

    Args:
        env: Freshly constructed debug environment to run the episode in.
        task_id: Task to reset the environment to.
        scenario_id: Specific scenario within the task, or None for default.

    Returns:
        The GraderResult from running the task grader on ``env.trajectory``.
    """
    env.reset(task_id=task_id, scenario_id=scenario_id)
    # Apply each expected fix as an edit action.
    for fix in env.expected_fixes:
        if env.done:
            break
        file_path = fix["file"]
        if file_path not in env.current_files:
            # For fixes that require creating a new file (e.g. ConfigMap),
            # create it with the expected content.
            if fix["type"] == "contains":
                action = Action(
                    action_type=ActionType.EDIT_FILE,
                    edits=[FileEdit(
                        file_path=file_path,
                        new_content=fix["expected"],
                    )],
                )
                env.step(action)
            continue
        current_content = env.current_files[file_path].content
        if fix["type"] == "contains":
            # Need to ensure the expected string is present.
            if fix["expected"] not in current_content:
                # Look at the original files to find what changed; if the file
                # has no original counterpart there is nothing to anchor an
                # edit against, so the fix is skipped.
                original_content = env.original_files.get(file_path)
                if original_content:
                    lines = current_content.split("\n")
                    expected = fix["expected"]
                    line_num = fix.get("line")
                    if line_num and 1 <= line_num <= len(lines):
                        # The fix names an explicit 1-based line: replace it.
                        action = Action(
                            action_type=ActionType.REPLACE_LINE,
                            edits=[FileEdit(
                                file_path=file_path,
                                line_number=line_num,
                                new_content=expected,
                            )],
                        )
                    else:
                        # Find the line with highest character overlap to the
                        # expected content.
                        best_line = None
                        best_score = 0.0
                        exp_stripped = expected.strip()
                        for line in lines:
                            stripped = line.strip()
                            if not stripped or not exp_stripped:
                                continue
                            overlap = len(set(stripped) & set(exp_stripped))
                            # Ratio of overlap to the longer string, so short
                            # lines cannot win on trivially small matches.
                            score = overlap / max(len(exp_stripped), len(stripped))
                            if score > 0.5 and score > best_score:
                                best_line = line
                                best_score = score
                        if best_line is not None:
                            action = Action(
                                action_type=ActionType.EDIT_FILE,
                                edits=[FileEdit(
                                    file_path=file_path,
                                    old_content=best_line,
                                    new_content=expected,
                                )],
                            )
                        else:
                            # No plausible target line found: append the
                            # expected content instead.
                            action = Action(
                                action_type=ActionType.ADD_LINE,
                                edits=[FileEdit(
                                    file_path=file_path,
                                    new_content=expected,
                                )],
                            )
                    env.step(action)
        elif fix["type"] == "not_contains":
            # Need to ensure the expected string is NOT present.
            if fix["expected"] in current_content:
                action = Action(
                    action_type=ActionType.DELETE_BLOCK,
                    edits=[FileEdit(
                        file_path=file_path,
                        old_content=fix["expected"],
                    )],
                )
                env.step(action)
        elif fix["type"] == "line_equals":
            line_num = int(fix.get("line", 0))
            if line_num >= 1:
                action = Action(
                    action_type=ActionType.REPLACE_LINE,
                    edits=[FileEdit(
                        file_path=file_path,
                        line_number=line_num,
                        new_content=str(fix["expected"]),
                    )],
                )
                env.step(action)
    # Submit if the episode has not already terminated.
    if not env.done:
        env.step(Action(action_type=ActionType.SUBMIT))
    return run_grader(task_id, env.trajectory)
def run_baseline_episodes(task_id: Optional[str] = None, num_episodes: Optional[int] = None) -> List[GraderResult]:
    """Run heuristic baseline episodes across one or all registered tasks.

    Args:
        task_id: Specific task to run, or None to run every registered task.
        num_episodes: Maximum number of scenarios per task; None (the
            default) runs ALL scenarios of each task.

    Returns:
        One GraderResult per episode, in task/scenario registration order.

    Raises:
        ValueError: If ``task_id`` is given but not present in TASK_REGISTRY.
    """
    if task_id:
        if task_id not in TASK_REGISTRY:
            raise ValueError(f"Unknown task: {task_id}")
        selected_tasks: List[str] = [task_id]
    else:
        selected_tasks = list(TASK_REGISTRY.keys())

    results: List[GraderResult] = []
    for tid in selected_tasks:
        scenarios = TASK_REGISTRY[tid].SCENARIOS
        # None means "no cap"; negative caps behave like zero.
        cap = len(scenarios) if num_episodes is None else max(num_episodes, 0)
        for scenario in scenarios[:cap]:
            env = CloudNativeDebugEnvironment()
            results.append(_heuristic_episode(env, tid, scenario["id"]))
    return results