File size: 10,124 Bytes
6f67ddc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d95cff9
 
 
 
 
6f67ddc
 
 
 
 
 
 
d95cff9
 
 
 
6f67ddc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7301cf5
 
 
 
 
 
 
6f67ddc
 
 
 
 
 
 
 
 
7301cf5
6f67ddc
 
7301cf5
 
 
6f67ddc
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""
Shared utilities for file editing tools — fuzzy matching, syntax validation,
and richer edit operations.

Used by both local_tools.py and the embedded sandbox server.
"""

from __future__ import annotations

# ── Unicode normalization map ────────────────────────────────────────────

UNICODE_MAP: dict[str, str] = {
    "\u2013": "-",   # en-dash
    "\u2014": "-",   # em-dash
    "\u2212": "-",   # minus sign
    "\u2018": "'",   # left single quote
    "\u2019": "'",   # right single quote
    "\u201c": '"',   # left double quote
    "\u201d": '"',   # right double quote
    "\u00a0": " ",   # non-breaking space
    "\u2003": " ",   # em space
    "\u2002": " ",   # en space
    "\u200b": "",    # zero-width space (deleted entirely)
    "\ufeff": "",    # BOM (deleted entirely)
}


def _normalize_unicode(s: str) -> str:
    """Return *s* with the typographic characters in UNICODE_MAP replaced
    by their plain-ASCII equivalents (unmapped characters pass through)."""
    # str.maketrans accepts a {1-char-str: str} mapping directly; translate
    # then performs the substitution in a single C-level pass.
    return s.translate(str.maketrans(UNICODE_MAP))


# ── 4-pass fuzzy matching ────────────────────────────────────────────────


def fuzzy_find(content: str, pattern: str) -> tuple[int | None, str | None]:
    """Find *pattern* in *content* with increasingly relaxed matching.

    Returns (start_index_in_original_content, match_note) or (None, None).
    The index always refers to the *original* content string so callers can
    use ``content[idx : idx + len(matched_text)]`` for replacement.

    Strategy (mirrors Codex):
      1. Exact match
      2. Right-trim each line (trailing whitespace)
      3. Both-sides trim (all surrounding whitespace per line)
      4. Unicode normalization on top of both-sides trim
    """
    # Pass 1 — exact substring hit; the index is already in original
    # coordinates, and no note is attached for an exact match.
    if pattern in content:
        return content.index(pattern), None

    # Helper: build a line-stripped version of a text plus the line lists
    # needed to map an index in the stripped text back to the original text,
    # so callers can apply the replacement on the original content rather
    # than the stripped copy.

    def _build_stripped(text: str, strip_fn):
        """Return (stripped_text, orig_lines, stripped_lines).

        ``stripped_text`` is *text* with *strip_fn* applied to every line
        and the lines re-joined with "\\n"; ``orig_lines`` and
        ``stripped_lines`` are the per-line before/after views that
        ``_map_back`` uses to translate indices.
        """
        orig_lines = text.split("\n")
        stripped_lines = [strip_fn(l) for l in orig_lines]
        return "\n".join(stripped_lines), orig_lines, stripped_lines

    # Pass 2 — compare with trailing whitespace removed from every line on
    # both sides.  Line count is preserved, so indices map back line-wise.
    c_rt, c_orig_lines, c_rt_lines = _build_stripped(content, str.rstrip)
    p_rt = "\n".join(l.rstrip() for l in pattern.split("\n"))
    idx = c_rt.find(p_rt)
    if idx != -1:
        orig_idx = _map_back(idx, c_orig_lines, c_rt_lines)
        return orig_idx, "(matched after trimming trailing whitespace)"

    # Pass 3 — compare with leading *and* trailing whitespace removed from
    # every line (indentation differences no longer matter).
    c_st, _, c_st_lines = _build_stripped(content, str.strip)
    p_st = "\n".join(l.strip() for l in pattern.split("\n"))
    idx = c_st.find(p_st)
    if idx != -1:
        orig_idx = _map_back(idx, c_orig_lines, c_st_lines)
        return orig_idx, "(matched after trimming whitespace)"

    # Pass 4 — additionally normalize typographic unicode (dashes, quotes,
    # exotic spaces) on top of the both-sides trim from pass 3.
    # NOTE(review): UNICODE_MAP maps a few characters (zero-width space, BOM)
    # to "", so c_norm can be shorter than c_st; an index found in c_norm is
    # then smaller than the matching index in c_st, but it is mapped back
    # using c_st's line lists as if the texts were index-aligned — confirm
    # behavior for content containing those deleted characters.
    c_norm = _normalize_unicode(c_st)
    p_norm = _normalize_unicode(p_st)
    idx = c_norm.find(p_norm)
    if idx != -1:
        orig_idx = _map_back(idx, c_orig_lines, c_st_lines)
        return orig_idx, "(matched after unicode normalization)"

    # No pass matched.
    return None, None


def _map_back(
    stripped_idx: int,
    orig_lines: list[str],
    stripped_lines: list[str],
) -> int:
    """Map a character index in the stripped/joined text back to the original text."""
    # Walk through stripped lines to find which line the index falls on
    pos = 0
    for i, sl in enumerate(stripped_lines):
        line_end = pos + len(sl)
        if stripped_idx <= line_end:
            col_in_stripped = stripped_idx - pos
            # Find where this stripped line's content starts in the original line
            ol = orig_lines[i]
            # The stripped line is a subset of the original line; find its offset
            lstripped = len(ol) - len(ol.lstrip())
            orig_col = lstripped + col_in_stripped
            # Compute absolute position in original text
            orig_pos = sum(len(orig_lines[j]) + 1 for j in range(i)) + orig_col
            return orig_pos
        pos = line_end + 1  # +1 for the \n
    # Fallback: return 0 (shouldn't happen if idx is valid)
    return 0


def fuzzy_find_original_match(content: str, pattern: str) -> tuple[str | None, str | None]:
    """Find the *original* text in content that matches pattern fuzzily.

    Returns (original_matched_text, match_note) or (None, None).
    This extracts the exact substring from the original content that
    corresponds to the fuzzy match, preserving its original whitespace/unicode.
    """
    # Fast path: a literal match needs no extraction and carries no note.
    if pattern in content:
        return pattern, None

    idx, note = fuzzy_find(content, pattern)
    if idx is None:
        return None, None

    # We need to find the original text span that corresponds to the match.
    # The fuzzy passes only strip whitespace per line — they never add or
    # remove lines — so the match spans exactly as many original lines as
    # the pattern has.
    pattern_lines = pattern.split("\n")
    n_lines = len(pattern_lines)

    # Find which original line the match starts on
    orig_lines = content.split("\n")
    char_pos = 0
    start_line = 0
    for i, ol in enumerate(orig_lines):
        # NOTE(review): when idx == char_pos + len(ol) the index points at
        # the "\n" after line i, yet this condition still selects line i —
        # possible off-by-one for matches beginning exactly at the start of
        # the next line; confirm whether fuzzy_find can return such an index.
        if char_pos + len(ol) >= idx:
            start_line = i
            break
        char_pos += len(ol) + 1

    end_line = min(start_line + n_lines, len(orig_lines))
    # Extract the original lines that were matched.
    # NOTE(review): whole lines are returned — if the fuzzy match starts
    # mid-line, the text before the match on that line is included too, and
    # a caller replacing this span would rewrite that prefix as well.
    matched_lines = orig_lines[start_line:end_line]
    original_text = "\n".join(matched_lines)
    return original_text, note


# ── Richer edit operations ───────────────────────────────────────────────


def apply_edit(
    content: str,
    old_str: str,
    new_str: str,
    mode: str = "replace",
    replace_all: bool = False,
) -> tuple[str, int, str | None]:
    """Apply an edit operation to content.

    Modes:
      - replace: replace first occurrence (or all if replace_all=True)
      - replace_all: replace all occurrences (alias)
      - append_after: insert new_str after old_str
      - prepend_before: insert new_str before old_str

    Returns (new_content, num_replacements, fuzzy_note).
    Raises ValueError if old_str not found.
    """
    if mode == "replace_all":
        replace_all = True
        mode = "replace"

    # Try exact match first, then fuzzy
    fuzzy_note = None
    if old_str not in content:
        original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
        if original_match is None:
            raise ValueError(
                "old_str was not found in the file. Make sure old_str matches "
                "the file contents exactly, including whitespace and indentation. "
                "Use the read tool to verify the current file contents before retrying."
            )
        old_str = original_match

    count = content.count(old_str)

    if mode == "replace":
        if count > 1 and not replace_all:
            raise ValueError(
                f"Found {count} matches of old_str in the file, but replace_all is "
                f"false. To replace all occurrences, set replace_all to true. To "
                f"replace only one, provide a larger old_str with more surrounding "
                f"context to uniquely identify the instance."
            )
        if replace_all:
            new_content = content.replace(old_str, new_str)
            return new_content, count, fuzzy_note
        else:
            new_content = content.replace(old_str, new_str, 1)
            return new_content, 1, fuzzy_note

    elif mode == "append_after":
        if replace_all:
            new_content = content.replace(old_str, old_str + new_str)
            return new_content, count, fuzzy_note
        else:
            idx = content.index(old_str) + len(old_str)
            new_content = content[:idx] + new_str + content[idx:]
            return new_content, 1, fuzzy_note

    elif mode == "prepend_before":
        if replace_all:
            new_content = content.replace(old_str, new_str + old_str)
            return new_content, count, fuzzy_note
        else:
            idx = content.index(old_str)
            new_content = content[:idx] + new_str + content[idx:]
            return new_content, 1, fuzzy_note

    else:
        raise ValueError(f"Unknown edit mode: {mode}. Use replace, append_after, or prepend_before.")


# ── Syntax validation (Python) ───────────────────────────────────────────


def validate_python(content: str, path: str = "") -> list[str]:
    """Run lightweight static checks over a just-written Python file.

    Everything here runs on the host (not in the sandbox), so the checks are
    purely static — no import resolution or signature inspection, since the
    packages live in the sandbox.  The sandbox server carries its own richer
    variant that inspects real signatures against installed packages.

    Args:
        content: Source text of the file to validate.
        path: File path; accepted for interface parity (currently unused).

    Returns:
        Advisory warning strings; an empty list means nothing was flagged.
        Never raises — all findings are best-effort hints.
    """
    import ast

    # 1. Syntax gate — if the file doesn't parse, nothing else is meaningful,
    #    so report the error alone and stop.
    try:
        ast.parse(content)
    except SyntaxError as exc:
        return [f"Python syntax error at line {exc.lineno}: {exc.msg}"]

    findings: list[str] = []

    # 2. Training-script heuristics: when a trainer config class appears, the
    #    script should push its model to the Hub or the artifact may vanish
    #    with the job.
    trainer_markers = ("TrainingArguments", "SFTConfig", "DPOConfig", "GRPOConfig")
    if any(marker in content for marker in trainer_markers):
        if "push_to_hub" not in content:
            findings.append(
                "Training script warning: no 'push_to_hub' found β€” model may be lost when job ends"
            )
        if "hub_model_id" not in content:
            findings.append(
                "Training script warning: no 'hub_model_id' found"
            )

    return findings