File size: 6,647 Bytes
a7caaff
557930c
a7caaff
2794920
557930c
 
85b7ac8
557930c
85b7ac8
4de7d31
85b7ac8
557930c
 
85b7ac8
 
4de7d31
557930c
 
85b7ac8
557930c
 
 
 
 
 
2794920
 
 
 
 
 
 
 
 
 
 
557930c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2794920
557930c
 
2794920
557930c
 
 
2794920
 
 
 
 
 
 
 
 
557930c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85b7ac8
557930c
 
 
 
 
 
 
2794920
557930c
 
 
 
2794920
557930c
 
 
 
85b7ac8
 
557930c
 
85b7ac8
 
557930c
 
 
85b7ac8
557930c
 
2794920
 
557930c
4de7d31
557930c
 
 
85b7ac8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""Heuristic baseline runner for the /baseline endpoint.

Applies expected_fixes directly to verify the environment + grader work e2e.
By default runs ALL scenarios of ALL tasks for deterministic, reproducible evaluation.
"""


from typing import List, Optional

from server.environment import CloudNativeDebugEnvironment
from server.graders import run_grader
from server.models import Action, ActionType, FileEdit, GraderResult
from server.tasks.task_registry import TASK_REGISTRY


def _coerce_line_number(raw: object) -> Optional[int]:
    """Convert a fix's ``line`` entry to an int.

    Returns None when the entry is absent or cannot be interpreted as an
    integer, so callers can skip the line-based path instead of crashing.
    """
    if raw is None:
        return None
    try:
        return int(raw)
    except (TypeError, ValueError):
        return None


def _closest_line(lines: List[str], expected: str) -> Optional[str]:
    """Return the line in *lines* that most resembles *expected*, if any.

    The score is the number of distinct characters shared by the two
    stripped strings divided by the longer stripped length. Only scores
    strictly above 0.5 qualify, and on ties the earliest line wins.
    Returns None when no line qualifies.
    """
    target = expected.strip()
    if not target:
        # Every comparison would be skipped, so there is nothing to match.
        return None
    target_chars = set(target)

    best_line: Optional[str] = None
    best_score = 0.0
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        shared = len(set(stripped) & target_chars)
        score = shared / max(len(target), len(stripped))
        if score > 0.5 and score > best_score:
            best_line, best_score = line, score
    return best_line


def _contains_action(file_path: str, current_content: str, fix: dict) -> Action:
    """Build the action that makes ``fix["expected"]`` appear in the file.

    Preference order:
      1. REPLACE_LINE at the fix's ``line`` when it is a valid 1-based index
         into the current content.
      2. EDIT_FILE replacing the most similar existing line.
      3. ADD_LINE with the expected content when no line is similar enough.
    """
    expected = fix["expected"]
    lines = current_content.split("\n")

    line_num = _coerce_line_number(fix.get("line"))
    if line_num is not None and 1 <= line_num <= len(lines):
        return Action(
            action_type=ActionType.REPLACE_LINE,
            edits=[FileEdit(
                file_path=file_path,
                line_number=line_num,
                new_content=expected,
            )],
        )

    best_line = _closest_line(lines, expected)
    if best_line is not None:
        return Action(
            action_type=ActionType.EDIT_FILE,
            edits=[FileEdit(
                file_path=file_path,
                old_content=best_line,
                new_content=expected,
            )],
        )

    return Action(
        action_type=ActionType.ADD_LINE,
        edits=[FileEdit(
            file_path=file_path,
            new_content=expected,
        )],
    )


def _heuristic_episode(env: CloudNativeDebugEnvironment, task_id: str, scenario_id: Optional[str] = None) -> GraderResult:
    """Run one episode using a heuristic that applies expected fixes.

    Args:
        env: Environment to drive; it is reset for the given task/scenario.
        task_id: Task identifier passed to ``env.reset`` and ``run_grader``.
        scenario_id: Optional scenario identifier passed to ``env.reset``.

    Returns:
        The result of ``run_grader`` on the episode's trajectory.
    """
    env.reset(task_id=task_id, scenario_id=scenario_id)

    for fix in env.expected_fixes:
        if env.done:
            break
        file_path = fix["file"]
        fix_type = fix["type"]

        if file_path not in env.current_files:
            # For fixes that require creating a new file (e.g. ConfigMap),
            # create it with the expected content. Other fix types have
            # nothing to act on when the file does not exist.
            if fix_type == "contains":
                env.step(Action(
                    action_type=ActionType.EDIT_FILE,
                    edits=[FileEdit(
                        file_path=file_path,
                        new_content=fix["expected"],
                    )],
                ))
            continue

        current_content = env.current_files[file_path].content
        action: Optional[Action] = None

        if fix_type == "contains":
            # Only act when the expected string is missing.
            # NOTE(review): the original-file lookup is used purely as a
            # presence gate; its content is never inspected. Files absent
            # from env.original_files are skipped -- confirm this is intended.
            if fix["expected"] not in current_content and env.original_files.get(file_path):
                action = _contains_action(file_path, current_content, fix)

        elif fix_type == "not_contains":
            # Only act when the forbidden string is still present.
            if fix["expected"] in current_content:
                action = Action(
                    action_type=ActionType.DELETE_BLOCK,
                    edits=[FileEdit(
                        file_path=file_path,
                        old_content=fix["expected"],
                    )],
                )

        elif fix_type == "line_equals":
            line_num = _coerce_line_number(fix.get("line"))
            if line_num is not None and line_num >= 1:
                action = Action(
                    action_type=ActionType.REPLACE_LINE,
                    edits=[FileEdit(
                        file_path=file_path,
                        line_number=line_num,
                        new_content=str(fix["expected"]),
                    )],
                )

        if action is not None:
            env.step(action)

    # Submit if not already done
    if not env.done:
        env.step(Action(action_type=ActionType.SUBMIT))

    return run_grader(task_id, env.trajectory)


def run_baseline_episodes(task_id: Optional[str] = None, num_episodes: Optional[int] = None) -> List[GraderResult]:
    """Run the heuristic baseline over one task or over every registered task.

    Args:
        task_id: Task to run. A falsy value selects every task in
            TASK_REGISTRY.
        num_episodes: Upper bound on scenarios run per task. None means
            every scenario of each selected task is run.

    Returns:
        One GraderResult per executed scenario, in registry/scenario order.

    Raises:
        ValueError: If ``task_id`` is given but not present in TASK_REGISTRY.
    """
    if not task_id:
        selected: List[str] = list(TASK_REGISTRY.keys())
    elif task_id in TASK_REGISTRY:
        selected = [task_id]
    else:
        raise ValueError(f"Unknown task: {task_id}")

    outcomes: List[GraderResult] = []
    for current_task in selected:
        for position, scenario in enumerate(TASK_REGISTRY[current_task].SCENARIOS):
            # Stop this task once the per-task cap has been reached.
            cap_reached = num_episodes is not None and position >= num_episodes
            if cap_reached:
                break
            # A fresh environment per scenario keeps episodes independent.
            outcomes.append(
                _heuristic_episode(CloudNativeDebugEnvironment(), current_task, scenario["id"])
            )

    return outcomes