File size: 16,683 Bytes
43328de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
"""
Docker-less SWE-bench test verification.

Replicates the exact conda environment from SWE-bench Docker images
without using Docker. Uses `environment_setup_commit` from the dataset
to find environment.yml, creates a conda env, installs repo, applies
test_patch, and runs pytest.

This is the key missing piece for closing the cascade validation loop.
With this, we can verify that cascade-produced patches actually pass tests.

Requirements: conda (or mamba), git, pip
Usage:
    python dockerless_verify.py --instance django__django-12308 --patch path/to/patch.diff
    python dockerless_verify.py --batch results.jsonl --max-instances 50
"""

import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Optional, Tuple, Dict, List

# ============================================================
# CONFIG
# ============================================================

# Environment file candidate paths (from SWE-bench harness).
# Paths are relative to the repository root. Order matters:
# find_environment_yml() probes them top to bottom and returns the first
# one that exists with more than 10 bytes of content.
ENV_CANDIDATES = [
    "environment.yml",
    "dev/environment.yml",
    ".github/environment.yml",
    ".github/setup/environment.yml",
    "requirements/environment.yml",
    "ci/environment.yml",
    ".azure-pipelines/environment.yml",
    "build_tools/azure/environment.yml",
    "conda_environment.yml",
    "env/environment.yml",
]

# Clone URL for every known repository, keyed by its "owner/name" slug.
# All of them live on GitHub, so the URL is derived from the slug itself.
REPO_URLS = {
    slug: f"https://github.com/{slug}.git"
    for slug in (
        "django/django",
        "pytest-dev/pytest",
        "scikit-learn/scikit-learn",
        "sympy/sympy",
        "matplotlib/matplotlib",
        "sphinx-doc/sphinx",
        "astropy/astropy",
        "psf/requests",
        "pylint-dev/pylint",
        "pallets/flask",
        "mwaskom/seaborn",
        "pydata/xarray",
    )
}


def run(cmd: list, cwd: str = None, timeout: int = 120, env: dict = None) -> Tuple[int, str, str]:
    """Execute *cmd* as a subprocess and report how it ended.

    stdout and stderr are captured as text. Ordinary failures are reported
    through the return value instead of being raised:

    * timeout      -> ``(124, "", "TIMEOUT")``
    * other error  -> ``(-1, "", <exception message>)``

    When *env* is ``None`` or empty, the current process environment is used.
    """
    child_env = env if env else os.environ
    try:
        completed = subprocess.run(
            cmd,
            cwd=cwd,
            env=child_env,
            timeout=timeout,
            capture_output=True,
            text=True,
        )
    except subprocess.TimeoutExpired:
        return 124, "", "TIMEOUT"
    except Exception as exc:
        return -1, "", str(exc)
    return completed.returncode, completed.stdout, completed.stderr


def find_environment_yml(repo_dir: Path) -> Optional[Path]:
    """Find a conda environment file in the repo at its current checkout.

    The well-known locations in ``ENV_CANDIDATES`` are probed first, in
    order. If none qualifies, the tree is searched recursively for any file
    named ``environment.yml``; among those the shallowest path (ties broken
    alphabetically) is chosen so the result does not depend on the order in
    which the filesystem enumerates directories.

    Only regular files larger than 10 bytes qualify, which filters out
    directories, dangling symlinks and empty placeholder files.

    Args:
        repo_dir: Root of the checked-out repository.

    Returns:
        Path of the first qualifying file, or ``None`` if there is none.
    """

    def _usable(path: Path) -> bool:
        # stat() can still fail (e.g. permission problems or a file removed
        # mid-scan); treat any such path as unusable instead of crashing.
        try:
            return path.is_file() and path.stat().st_size > 10
        except OSError:
            return False

    for candidate in ENV_CANDIDATES:
        path = repo_dir / candidate
        if _usable(path):
            return path

    # Fall back to a recursive search with a deterministic preference.
    return min(
        (p for p in repo_dir.rglob("environment.yml") if _usable(p)),
        key=lambda p: (len(p.parts), str(p)),
        default=None,
    )


def get_python_version_from_yml(yml_path: Path) -> str:
    """Extract the Python ``major.minor`` version pinned in an environment.yml.

    Recognises entries such as ``- python=3.10``, ``- python=3.9.*`` or
    ``- python>=3.8,<3.11`` (the first version number of the spec is used).
    The package name must be exactly ``python``: entries like ``ipython=8.12``
    or ``biopython=1.79`` are ignored, as is anything after a ``#`` comment
    marker.

    Args:
        yml_path: Path to the environment file.

    Returns:
        The version as ``"X.Y"``, or ``"3.10"`` when the file has no
        versioned ``python`` entry.
    """
    # The lookbehind rejects matches that are merely the tail of a longer
    # package name (ipython, biopython, foo-python, ...).
    python_pin = re.compile(r'(?<![\w.-])python\s*[=!<>]+\s*(\d+\.\d+)')

    content = yml_path.read_text(encoding="utf-8", errors="replace")
    for raw_line in content.splitlines():
        line = raw_line.split("#", 1)[0]  # drop trailing / full-line comments
        m = python_pin.search(line)
        if m:
            return m.group(1)
    return "3.10"  # default when python is unpinned or absent


def setup_conda_env(repo_dir: Path, env_name: str) -> Tuple[bool, str]:
    """
    Create a conda environment for the repo and install the repo into it.

    Steps:
    1. If an environment file is found, run ``conda env create`` with it,
       retrying once with explicit conda-forge/defaults channels. Otherwise
       create a minimal env whose Python version is the first version number
       found in ``python_requires`` of setup.py (default 3.10).
    2. ``pip install -e .`` inside the env; a failure here is fatal.
    3. Best effort: install any known test-requirements file and the
       ``[test]`` extra. Failures in this step are ignored.

    Args:
        repo_dir: Root of the checked-out repository.
        env_name: Name of the conda environment to create.

    Returns (success, error_message); error_message is "" on success.
    """
    repo_cwd = str(repo_dir)
    yml_path = find_environment_yml(repo_dir)

    if yml_path:
        print(f"  Found environment.yml: {yml_path.relative_to(repo_dir)}")
        create_cmd = ["conda", "env", "create", "-f", str(yml_path), "-n", env_name, "--quiet"]
        rc, _, err = run(create_cmd, timeout=300)
        if rc != 0:
            # Try with channel flexibility
            rc2, _, err2 = run(
                create_cmd + ["--override-channels", "-c", "conda-forge", "-c", "defaults"],
                timeout=300
            )
            if rc2 != 0:
                return False, f"conda env create failed: {err[:500]}\nFallback also failed: {err2[:200]}"
    else:
        print("  No environment.yml found, creating minimal env")
        py_ver = "3.10"
        # Try to detect from setup.py
        setup_py = repo_dir / "setup.py"
        if setup_py.exists():
            # Decode leniently: only an ASCII version spec is needed, and an
            # oddly-encoded setup.py must not abort the whole verification.
            content = setup_py.read_text(encoding="utf-8", errors="replace")
            m = re.search(r'python_requires\s*=\s*[\'"]([^\'"]+)[\'"]', content)
            if m:
                m2 = re.search(r'(\d+\.\d+)', m.group(1))
                if m2:
                    py_ver = m2.group(1)

        rc, _, err = run(
            ["conda", "create", "-n", env_name, f"python={py_ver}", "pip", "-y", "--quiet"],
            timeout=120
        )
        if rc != 0:
            return False, f"conda create failed: {err[:300]}"

    # Install repo in editable mode
    print("  Installing repo (pip install -e .)...")
    rc, _, _ = run(
        ["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
        cwd=repo_cwd,
        timeout=300
    )
    if rc != 0:
        # Try without quiet to see errors
        rc2, _, err2 = run(
            ["conda", "run", "-n", env_name, "pip", "install", "-e", "."],
            cwd=repo_cwd,
            timeout=300
        )
        if rc2 != 0:
            print(f"  [WARN] pip install failed: {err2[:300]}")
            return False, f"pip install failed: {err2[:300]}"

    # Install test dependencies if any (best effort, result ignored).
    for req_file in ("test-requirements.txt", "requirements-test.txt", "requirements_test.txt",
                     "requirements/dev.txt", "requirements/test.txt", "dev-requirements.txt"):
        if (repo_dir / req_file).exists():
            print(f"  Installing test deps from {req_file}")
            # Run from the repo root with the repo-relative path so that
            # relative entries inside the file (e.g. "-e .") resolve against
            # the repo rather than the caller's working directory.
            run(["conda", "run", "-n", env_name, "pip", "install", "-r", req_file, "--quiet"],
                cwd=repo_cwd, timeout=120)

    # Also try pip install -e ".[test]" (best effort, result ignored).
    run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".[test]", "--quiet"],
        cwd=repo_cwd, timeout=120)

    return True, ""


def _parse_test_list(value) -> List[str]:
    """Normalise a FAIL_TO_PASS / PASS_TO_PASS field to a list of test ids.

    The published SWE-bench datasets serialise these fields as JSON strings
    (e.g. '["tests/test_a.py::test_x"]'), while in-memory callers may pass a
    real list. A string that is not a JSON list is treated as one test id.
    """
    if not value:
        return []
    if isinstance(value, str):
        try:
            decoded = json.loads(value)
        except ValueError:
            return [value]
        if isinstance(decoded, list):
            return [str(item) for item in decoded]
        return [value]
    return list(value)


def verify_patch(
    instance: dict,
    model_patch: str,
    work_dir: Path,
    env_name: str,
) -> dict:
    """
    Verify a model-generated patch against SWE-bench tests.

    Expects the repository to be checked out at ``work_dir / "repo"`` and
    the conda environment ``env_name`` to exist.

    Steps:
    1. Git apply the model's patch
    2. Git apply the test_patch from the instance (retried with --reject)
    3. Run FAIL_TO_PASS tests with pytest
    4. Check that (up to 20) PASS_TO_PASS tests still pass

    Args:
        instance: SWE-bench instance; reads "test_patch", "FAIL_TO_PASS" and
            "PASS_TO_PASS". The two test fields may be lists or JSON-encoded
            list strings.
        model_patch: Unified diff produced by the model.
        work_dir: Scratch directory; patch files are written here.
        env_name: Conda environment in which pytest is run.

    Returns: {success, resolved, test_output, failures, error}.
        ``success`` and ``resolved`` are both True only when every executed
        test passed. ``error`` describes a setup or regression failure;
        ``failures`` holds up to 10 FAILED lines from the FAIL_TO_PASS run.
        Exceptions are caught and reported through ``error``.
    """
    result = {
        "success": False,
        "resolved": False,
        "test_output": "",
        "failures": [],
        "error": None,
    }

    repo_dir = work_dir / "repo"

    try:
        # Step 1: Apply model patch
        model_patch_file = work_dir / "model.patch"
        model_patch_file.write_text(model_patch, encoding="utf-8")

        rc, out, err = run(
            ["git", "apply", "--verbose", str(model_patch_file)],
            cwd=str(repo_dir)
        )
        if rc != 0:
            result["error"] = f"model patch apply failed: {err[:300]}"
            return result

        # Step 2: Apply test patch
        test_patch = instance.get("test_patch", "")
        if not test_patch:
            result["error"] = "no test_patch in instance"
            return result

        test_patch_file = work_dir / "test.patch"
        test_patch_file.write_text(test_patch, encoding="utf-8")

        rc, out, err = run(
            ["git", "apply", "--verbose", str(test_patch_file)],
            cwd=str(repo_dir)
        )
        if rc != 0:
            # Try with --reject
            rc, out, err = run(
                ["git", "apply", "--reject", "--verbose", str(test_patch_file)],
                cwd=str(repo_dir)
            )
            if rc != 0:
                result["error"] = f"test patch apply failed: {err[:300]}"
                return result

        # Step 3: Run FAIL_TO_PASS tests
        # Test ids are passed to pytest as-is, e.g.
        # "astropy/modeling/tests/test_separable.py::test_name".
        fail_tests = _parse_test_list(instance.get("FAIL_TO_PASS"))
        if not fail_tests:
            result["error"] = "no FAIL_TO_PASS tests"
            return result

        pytest_cmd = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"]

        print(f"  Running {len(fail_tests)} FAIL_TO_PASS tests...")
        rc, out, err = run(pytest_cmd + fail_tests, cwd=str(repo_dir), timeout=300)

        result["test_output"] = (out + err)[:2000]

        if rc != 0:
            # Collect the pytest "FAILED path::test" summary lines.
            failures = [
                line.strip()
                for line in (out + err).split('\n')
                if 'FAILED' in line and '::' in line
            ]
            result["failures"] = failures[:10]
            return result

        # Step 4: Run PASS_TO_PASS tests to check for regressions
        pass_tests = _parse_test_list(instance.get("PASS_TO_PASS"))
        if pass_tests:
            print(f"  Running {len(pass_tests)} PASS_TO_PASS regression tests...")
            # Only the first 20 are executed to bound the run time.
            rc2, out2, err2 = run(pytest_cmd + pass_tests[:20], cwd=str(repo_dir), timeout=300)
            if rc2 != 0:
                result["error"] = f"regression test failed: {(out2+err2)[:300]}"
                return result

        result["success"] = True
        result["resolved"] = True
        return result

    except Exception as e:
        result["error"] = str(e)[:500]
        return result


def verify_one(instance: dict, model_patch: str) -> dict:
    """
    Full verification of one instance with one patch.

    Clones the repo into a temporary directory, builds the conda env at
    ``environment_setup_commit`` (falling back to the cloned HEAD if that
    commit cannot be checked out), switches to ``base_commit``, reinstalls
    the repo and runs verify_patch().

    The conda env is removed whenever its creation was attempted, including
    on failure paths. If ``base_commit`` cannot be checked out the instance
    is reported as an error rather than verified against the wrong tree.

    Args:
        instance: SWE-bench instance; requires "instance_id" and reads
            "repo", "environment_setup_commit" and "base_commit".
        model_patch: Unified diff to verify.

    Returns:
        The verify_patch() result plus "instance_id", or a short
        {instance_id, resolved=False, error} dict when setup failed.
    """
    inst_id = instance["instance_id"]
    repo = instance.get("repo", "")
    env_setup_commit = instance.get("environment_setup_commit", "")
    base_commit = instance.get("base_commit", "")

    print(f"\n{'='*60}")
    print(f"VERIFY: {inst_id}")
    print(f"  Repo: {repo}")
    print(f"  Base: {base_commit[:12]} EnvSetup: {env_setup_commit[:12]}")
    print(f"{'='*60}")

    repo_url = REPO_URLS.get(repo, f"https://github.com/{repo}.git")

    with tempfile.TemporaryDirectory(prefix=f"sweverify_{inst_id.replace('/', '_')}_") as tmpdir:
        work_dir = Path(tmpdir)
        repo_dir = work_dir / "repo"
        env_name = f"swebench_{inst_id.replace('__', '_').replace('-', '_')[:40]}"

        # Clone
        print(f"  Cloning {repo_url}...")
        rc, _, err = run(["git", "clone", repo_url, str(repo_dir)], timeout=180)
        if rc != 0:
            return {"instance_id": inst_id, "resolved": False, "error": f"clone failed: {err[:200]}"}

        # Step 1: Checkout environment_setup_commit to find environment.yml
        if env_setup_commit:
            # The fetch result is ignored; the checkout below decides.
            run(["git", "fetch", "origin", env_setup_commit],
                cwd=str(repo_dir), timeout=60)
            rc, _, _ = run(["git", "checkout", env_setup_commit],
                           cwd=str(repo_dir), timeout=30)
            if rc != 0:
                print(f"  [WARN] Cannot checkout env_setup_commit {env_setup_commit[:12]}, using HEAD")

        try:
            # Step 2: Create conda env
            success, error = setup_conda_env(repo_dir, env_name)
            if not success:
                return {"instance_id": inst_id, "resolved": False, "error": f"env setup: {error[:200]}"}

            # Step 3: Checkout base_commit
            if base_commit:
                run(["git", "fetch", "origin", base_commit],
                    cwd=str(repo_dir), timeout=60)
                rc, _, err = run(["git", "checkout", base_commit],
                                 cwd=str(repo_dir), timeout=30)
                if rc != 0:
                    # Testing the patch on any other tree would yield a
                    # meaningless verdict, so stop here.
                    print(f"  [WARN] Cannot checkout base_commit {base_commit[:12]}")
                    return {"instance_id": inst_id, "resolved": False,
                            "error": f"checkout base_commit failed: {err[:200]}"}

            # Reinstall at base commit (best effort, result ignored)
            print("  Reinstalling at base_commit...")
            run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
                cwd=str(repo_dir), timeout=120)

            # Step 4: Verify
            result = verify_patch(instance, model_patch, work_dir, env_name)
            result["instance_id"] = inst_id
            return result
        finally:
            # Cleanup conda env on every path so failed runs do not leak
            # environments; run() never raises, so this cannot mask errors.
            print(f"  Cleaning up conda env {env_name}...")
            run(["conda", "env", "remove", "-n", env_name, "-y", "--quiet"], timeout=30)


def verify_batch(instances: list, patches: dict, max_instances: int = 50,
                 output_path: str = "/app/verify_results.jsonl") -> list:
    """Verify multiple instances with their patches.

    Only the first ``max_instances`` instances are processed. Instances
    without an entry in ``patches`` are recorded as unresolved with error
    "no patch". After every verified instance all results so far are
    rewritten to ``output_path`` (with ``test_output`` replaced by its
    length). That save is best effort: an I/O failure prints a warning and
    the batch continues.

    Args:
        instances: SWE-bench instance dicts, each with an "instance_id".
        patches: Mapping of instance_id to patch text.
        max_instances: Maximum number of instances to process.
        output_path: JSONL file for incremental results; missing parent
            directories are created.

    Returns:
        One result dict per processed instance, in input order.
    """
    total = min(len(instances), max_instances)
    results = []
    for i, instance in enumerate(instances[:max_instances], start=1):
        inst_id = instance["instance_id"]
        if inst_id not in patches:
            print(f"  [{i}/{total}] SKIP {inst_id} — no patch")
            results.append({"instance_id": inst_id, "resolved": False, "error": "no patch"})
            continue

        print(f"\n{'#'*60}")
        print(f"  [{i}/{total}] {inst_id}")
        print(f"{'#'*60}")

        results.append(verify_one(instance, patches[inst_id]))

        # Save incrementally so a later crash does not lose finished work.
        try:
            parent = os.path.dirname(output_path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(output_path, "w") as f:
                for r in results:
                    r_clean = {k: v for k, v in r.items() if k != "test_output"}
                    r_clean["test_output_len"] = len(r.get("test_output", ""))
                    f.write(json.dumps(r_clean) + "\n")
        except OSError as e:
            print(f"  [WARN] Could not save incremental results to {output_path}: {e}")

    return results


def load_patches_from_results(results_file: str) -> dict:
    """Load patches from a results JSONL file.

    Each non-blank line must be a JSON object with an "instance_id". The
    patch is taken from the "patch" key, falling back to "model_patch" (the
    key used by SWE-bench prediction files). Records whose patch is missing
    or empty are skipped; blank lines are ignored.

    Args:
        results_file: Path to the JSONL file.

    Returns:
        Mapping of instance_id to patch text. If an id occurs more than
        once, the last record wins.

    Raises:
        json.JSONDecodeError: a non-blank line is not valid JSON.
        KeyError: a record has a patch but no "instance_id".
    """
    patches = {}
    with open(results_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Trailing or separating blank lines are common in JSONL.
                continue
            r = json.loads(line)
            patch = r.get("patch") or r.get("model_patch")
            if patch:
                patches[r["instance_id"]] = patch
    return patches


def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point.

    Modes:
    * ``--instance ID --patch FILE``: verify one instance and print the
      result as JSON.
    * ``--batch FILE``: verify every dataset instance that has a patch in
      the results JSONL (at most ``--max-instances``) and write the full
      results to /app/verify_results.jsonl.
    * anything else: print usage.

    Args:
        argv: Argument list; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit status: 1 when the requested instance is not in the
        dataset or the results file cannot be written, otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Docker-less SWE-bench verification")
    parser.add_argument("--instance", type=str, help="Single instance ID")
    parser.add_argument("--patch", type=str, help="Path to patch file (for single instance)")
    parser.add_argument("--batch", type=str, help="Path to results JSONL with patches")
    parser.add_argument("--max-instances", type=int, default=10)
    args = parser.parse_args(argv)

    if args.instance and args.patch:
        # Single instance mode
        # Imported lazily so printing usage does not require `datasets`.
        from datasets import load_dataset

        ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
        instance = next(
            (dict(row) for row in ds if row["instance_id"] == args.instance),
            None,
        )
        if not instance:
            print(f"Instance {args.instance} not found")
            return 1

        patch = Path(args.patch).read_text()
        result = verify_one(instance, patch)
        print(f"\nRESULT: {json.dumps(result, indent=2)}")
        return 0

    if args.batch:
        # Batch mode
        from datasets import load_dataset

        patches = load_patches_from_results(args.batch)
        print(f"Loaded {len(patches)} patches from {args.batch}")

        ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
        instances = [dict(row) for row in ds if row["instance_id"] in patches]

        print(f"Found {len(instances)} instances with patches")

        results = verify_batch(instances, patches, args.max_instances)

        resolved = [r for r in results if r.get("resolved")]
        print(f"\nVERIFICATION COMPLETE: {len(resolved)}/{len(results)} verified")

        results_path = "/app/verify_results.jsonl"
        try:
            # The target directory may not exist outside the container
            # image this script was written for.
            os.makedirs(os.path.dirname(results_path), exist_ok=True)
            with open(results_path, "w") as f:
                for r in results:
                    f.write(json.dumps(r) + "\n")
        except OSError as e:
            print(f"Could not write {results_path}: {e}", file=sys.stderr)
            return 1
        return 0

    parser.print_help()
    return 0


if __name__ == "__main__":
    sys.exit(main())