| """ |
| Docker-less SWE-bench test verification. |
| |
| Replicates the exact conda environment from SWE-bench Docker images |
| without using Docker. Uses `environment_setup_commit` from the dataset |
| to find environment.yml, creates a conda env, installs repo, applies |
| test_patch, and runs pytest. |
| |
| This is the key missing piece for closing the cascade validation loop. |
| With this, we can verify that cascade-produced patches actually pass tests. |
| |
| Requirements: conda (or mamba), git, pip |
| Usage: |
| python dockerless_verify.py --instance django__django-12308 --patch path/to/patch.diff |
| python dockerless_verify.py --batch results.jsonl --max-instances 50 |
| """ |
|
|
| import argparse |
| import json |
| import os |
| import re |
| import subprocess |
| import sys |
| import tempfile |
| import time |
| from pathlib import Path |
| from typing import Optional, Tuple, Dict, List |
|
|
| |
| |
| |
|
|
| |
# Well-known relative locations of environment.yml across SWE-bench repos.
# find_environment_yml() checks these in order before falling back to a
# recursive glob of the whole checkout.
ENV_CANDIDATES = [
    "environment.yml",
    "dev/environment.yml",
    ".github/environment.yml",
    ".github/setup/environment.yml",
    "requirements/environment.yml",
    "ci/environment.yml",
    ".azure-pipelines/environment.yml",
    "build_tools/azure/environment.yml",
    "conda_environment.yml",
    "env/environment.yml",
]
|
|
| |
# Canonical clone URLs for the SWE-bench repos ("owner/name" -> URL).
# Any repo missing here falls back to https://github.com/<repo>.git
# in verify_one(), so this map is an optimization/override, not required.
REPO_URLS = {
    "django/django": "https://github.com/django/django.git",
    "pytest-dev/pytest": "https://github.com/pytest-dev/pytest.git",
    "scikit-learn/scikit-learn": "https://github.com/scikit-learn/scikit-learn.git",
    "sympy/sympy": "https://github.com/sympy/sympy.git",
    "matplotlib/matplotlib": "https://github.com/matplotlib/matplotlib.git",
    "sphinx-doc/sphinx": "https://github.com/sphinx-doc/sphinx.git",
    "astropy/astropy": "https://github.com/astropy/astropy.git",
    "psf/requests": "https://github.com/psf/requests.git",
    "pylint-dev/pylint": "https://github.com/pylint-dev/pylint.git",
    "pallets/flask": "https://github.com/pallets/flask.git",
    "mwaskom/seaborn": "https://github.com/mwaskom/seaborn.git",
    "pydata/xarray": "https://github.com/pydata/xarray.git",
}
|
|
|
|
def run(cmd: list, cwd: str = None, timeout: int = 120, env: dict = None) -> Tuple[int, str, str]:
    """Execute *cmd* and capture its output.

    Returns a ``(returncode, stdout, stderr)`` triple and never raises:
    a timeout maps to the conventional shell code 124 with "TIMEOUT" in
    the stderr slot, and any other launch failure maps to -1 with the
    exception text in the stderr slot.
    """
    run_env = env if env else os.environ
    try:
        proc = subprocess.run(
            cmd,
            cwd=cwd,
            capture_output=True,
            text=True,
            timeout=timeout,
            env=run_env,
        )
    except subprocess.TimeoutExpired:
        return 124, "", "TIMEOUT"
    except Exception as exc:
        return -1, "", str(exc)
    return proc.returncode, proc.stdout, proc.stderr
|
|
|
|
def find_environment_yml(repo_dir: Path) -> Optional[Path]:
    """Locate a usable environment.yml inside *repo_dir*.

    The well-known ENV_CANDIDATES locations are tried first, in order;
    if none hits, a recursive search of the whole tree is attempted.
    Files of 10 bytes or fewer are treated as empty stubs and skipped.
    Returns None when nothing usable exists.
    """
    for rel in ENV_CANDIDATES:
        candidate = repo_dir / rel
        if candidate.exists() and candidate.stat().st_size > 10:
            return candidate

    # Fallback: any environment.yml anywhere in the checkout.
    for found in repo_dir.rglob("environment.yml"):
        if found.stat().st_size > 10:
            return found

    return None
|
|
|
|
def get_python_version_from_yml(yml_path: Path) -> str:
    """Extract the pinned Python version ("X.Y") from an environment.yml.

    Matches a spec such as ``python=3.8`` or ``python>=3.9``. The leading
    word boundary prevents false matches inside package names like
    ``ipython=7.19.0`` (the previous pattern would have returned "7.19"
    as the Python version). Returns "3.10" when no version is pinned,
    including the bare ``- python`` case.
    """
    content = yml_path.read_text()
    m = re.search(r'\bpython\s*[=!<>]+\s*(\d+\.\d+)', content)
    if m:
        return m.group(1)
    # No explicit pin (or only a bare "- python" entry): safe default.
    return "3.10"
|
|
|
|
def setup_conda_env(repo_dir: Path, env_name: str) -> Tuple[bool, str]:
    """Build a conda environment named *env_name* for the checkout in *repo_dir*.

    Prefers the repo's own environment.yml; otherwise creates a minimal
    python+pip env, sniffing the Python version from setup.py's
    ``python_requires`` when possible. The repo itself and any discoverable
    test requirements are then pip-installed into the env.

    Returns (success, error_message); error_message is "" on success.
    """
    yml_path = find_environment_yml(repo_dir)

    if yml_path is None:
        print(f" No environment.yml found, creating minimal env")
        py_ver = "3.10"
        setup_py = repo_dir / "setup.py"
        if setup_py.exists():
            # Best effort: pull an X.Y version out of python_requires.
            spec_match = re.search(
                r'python_requires\s*=\s*[\'"]([^\'"]+)[\'"]', setup_py.read_text()
            )
            if spec_match:
                ver_match = re.search(r'(\d+\.\d+)', spec_match.group(1))
                if ver_match:
                    py_ver = ver_match.group(1)

        rc, out, err = run(
            ["conda", "create", "-n", env_name, f"python={py_ver}", "pip", "-y", "--quiet"],
            timeout=120
        )
        if rc != 0:
            return False, f"conda create failed: {err[:300]}"
    else:
        print(f" Found environment.yml: {yml_path.relative_to(repo_dir)}")
        rc, out, err = run(
            ["conda", "env", "create", "-f", str(yml_path), "-n", env_name, "--quiet"],
            timeout=300
        )
        if rc != 0:
            # Retry with explicit channels in case the yml's channels are dead.
            rc2, out2, err2 = run(
                ["conda", "env", "create", "-f", str(yml_path), "-n", env_name,
                 "--quiet", "--override-channels", "-c", "conda-forge", "-c", "defaults"],
                timeout=300
            )
            if rc2 != 0:
                return False, f"conda env create failed: {err[:500]}\nFallback also failed: {err2[:200]}"

    # Install the repo itself in editable mode.
    print(f" Installing repo (pip install -e .)...")
    rc, out, err = run(
        ["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
        cwd=str(repo_dir),
        timeout=300
    )
    if rc != 0:
        # Retry without --quiet so the real error surfaces in err2.
        rc2, out2, err2 = run(
            ["conda", "run", "-n", env_name, "pip", "install", "-e", "."],
            cwd=str(repo_dir),
            timeout=300
        )
        if rc2 != 0:
            print(f" [WARN] pip install failed: {err2[:300]}")
            return False, f"pip install failed: {err2[:300]}"

    # Best effort: install any conventional test-requirements files found.
    req_candidates = (
        "test-requirements.txt", "requirements-test.txt", "requirements_test.txt",
        "requirements/dev.txt", "requirements/test.txt", "dev-requirements.txt",
    )
    for req_file in req_candidates:
        req_path = repo_dir / req_file
        if req_path.exists():
            print(f" Installing test deps from {req_file}")
            run(["conda", "run", "-n", env_name, "pip", "install", "-r", str(req_path), "--quiet"],
                timeout=120)

    # Best effort: declared [test] extras, ignored on failure.
    run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".[test]", "--quiet"],
        cwd=str(repo_dir), timeout=120)

    return True, ""
|
|
|
|
def verify_patch(
    instance: dict,
    model_patch: str,
    work_dir: Path,
    env_name: str,
) -> dict:
    """
    Verify a model-generated patch against SWE-bench tests.

    Steps:
    1. Git apply the model's patch
    2. Git apply the test_patch from the instance
    3. Run FAIL_TO_PASS tests
    4. Check that (a sample of) PASS_TO_PASS tests still pass

    Assumes ``work_dir / "repo"`` is already checked out at base_commit
    and the conda env *env_name* has the repo installed.

    Returns: {success, resolved, test_output, failures, error}
    """
    result = {
        "success": False,
        "resolved": False,
        "test_output": "",
        "failures": [],
        "error": None,
    }

    repo_dir = work_dir / "repo"

    try:
        # Step 1: write the candidate patch to disk and apply it.
        model_patch_file = work_dir / "model.patch"
        model_patch_file.write_text(model_patch)

        rc, out, err = run(
            ["git", "apply", "--verbose", str(model_patch_file)],
            cwd=str(repo_dir)
        )
        if rc != 0:
            result["error"] = f"model patch apply failed: {err[:300]}"
            return result

        # Step 2: apply the instance's test_patch (adds/updates tests).
        test_patch = instance.get("test_patch", "")
        if not test_patch:
            result["error"] = "no test_patch in instance"
            return result

        test_patch_file = work_dir / "test.patch"
        test_patch_file.write_text(test_patch)

        rc, out, err = run(
            ["git", "apply", "--verbose", str(test_patch_file)],
            cwd=str(repo_dir)
        )
        if rc != 0:
            # Retry with --reject so clean hunks still land; only give up
            # if even the partial apply fails outright.
            rc, out, err = run(
                ["git", "apply", "--reject", "--verbose", str(test_patch_file)],
                cwd=str(repo_dir)
            )
            if rc != 0:
                result["error"] = f"test patch apply failed: {err[:300]}"
                return result

        # Step 3: FAIL_TO_PASS tests — the patch must make these pass.
        fail_tests = instance.get("FAIL_TO_PASS", [])
        if not fail_tests:
            result["error"] = "no FAIL_TO_PASS tests"
            return result

        test_args = []
        for test in fail_tests:
            # Passed to pytest verbatim — assumes these are pytest node IDs
            # (path::test_name). NOTE(review): Django instances use the
            # Django test runner's dotted labels; confirm this works there.
            test_args.append(test)

        print(f" Running {len(fail_tests)} FAIL_TO_PASS tests...")
        cmd = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"] + test_args
        rc, out, err = run(cmd, cwd=str(repo_dir), timeout=300)

        # Keep only a bounded slice of output in the result record.
        result["test_output"] = (out + err)[:2000]

        if rc == 0:
            # Step 4: regression check — PASS_TO_PASS must still pass.
            pass_tests = instance.get("PASS_TO_PASS", [])
            if pass_tests:
                print(f" Running {len(pass_tests)} PASS_TO_PASS regression tests...")
                # Only the first 20 to bound runtime; -x stops on first failure.
                cmd2 = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"] + pass_tests[:20]
                rc2, out2, err2 = run(cmd2, cwd=str(repo_dir), timeout=300)
                if rc2 == 0:
                    result["success"] = True
                    result["resolved"] = True
                else:
                    result["error"] = f"regression test failed: {(out2+err2)[:300]}"
            else:
                # No regression suite listed — FAIL_TO_PASS alone decides.
                result["success"] = True
                result["resolved"] = True
        else:
            # Scrape pytest's FAILED lines for a compact diagnostic list.
            failures = []
            for line in (out + err).split('\n'):
                if 'FAILED' in line and '::' in line:
                    failures.append(line.strip())
            result["failures"] = failures[:10]

        return result

    except Exception as e:
        # Catch-all so a single bad instance cannot crash a batch run.
        result["error"] = str(e)[:500]
        return result
|
|
|
|
def verify_one(instance: dict, model_patch: str) -> dict:
    """
    Full verification of one instance with one patch.

    Clones the repo into a temp dir, checks out environment_setup_commit,
    builds a conda env, checks out base_commit, reinstalls, then applies
    and tests the patch via verify_patch().

    BUGFIX: env cleanup now runs in a ``finally`` — previously, any early
    return after env creation (e.g. a partially built env whose pip
    install failed) leaked the conda env, accumulating envs over a batch.

    Returns verify_patch()'s result dict plus "instance_id", or a minimal
    {"instance_id", "resolved": False, "error"} dict on setup failure.
    """
    inst_id = instance["instance_id"]
    repo = instance.get("repo", "")
    env_setup_commit = instance.get("environment_setup_commit", "")
    base_commit = instance.get("base_commit", "")

    print(f"\n{'='*60}")
    print(f"VERIFY: {inst_id}")
    print(f" Repo: {repo}")
    print(f" Base: {base_commit[:12]} EnvSetup: {env_setup_commit[:12]}")
    print(f"{'='*60}")

    # Unknown repos fall back to the canonical GitHub URL.
    repo_url = REPO_URLS.get(repo, f"https://github.com/{repo}.git")

    with tempfile.TemporaryDirectory(prefix=f"sweverify_{inst_id.replace('/', '_')}_") as tmpdir:
        work_dir = Path(tmpdir)
        repo_dir = work_dir / "repo"
        # Conda env names can't contain '-' reliably; also bound the length.
        env_name = f"swebench_{inst_id.replace('__', '_').replace('-', '_')[:40]}"

        print(f" Cloning {repo_url}...")
        rc, out, err = run(["git", "clone", repo_url, str(repo_dir)], timeout=180)
        if rc != 0:
            return {"instance_id": inst_id, "resolved": False, "error": f"clone failed: {err[:200]}"}

        # Build the env at environment_setup_commit, mirroring the
        # SWE-bench Docker images; fall back to HEAD if checkout fails.
        if env_setup_commit:
            rc, out, err = run(
                ["git", "fetch", "origin", env_setup_commit],
                cwd=str(repo_dir), timeout=60
            )
            rc, out, err = run(
                ["git", "checkout", env_setup_commit],
                cwd=str(repo_dir), timeout=30
            )
            if rc != 0:
                print(f" [WARN] Cannot checkout env_setup_commit {env_setup_commit[:12]}, using HEAD")

        try:
            success, error = setup_conda_env(repo_dir, env_name)
            if not success:
                return {"instance_id": inst_id, "resolved": False, "error": f"env setup: {error[:200]}"}

            # Move the tree to base_commit for patch application.
            if base_commit:
                rc, out, err = run(
                    ["git", "fetch", "origin", base_commit],
                    cwd=str(repo_dir), timeout=60
                )
                rc, out, err = run(
                    ["git", "checkout", base_commit],
                    cwd=str(repo_dir), timeout=30
                )
                if rc != 0:
                    print(f" [WARN] Cannot checkout base_commit {base_commit[:12]}")

            # Re-sync the installed package with the base_commit sources.
            print(f" Reinstalling at base_commit...")
            run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
                cwd=str(repo_dir), timeout=120)

            result = verify_patch(instance, model_patch, work_dir, env_name)
            result["instance_id"] = inst_id
            return result
        finally:
            # Always drop the env, even on early-return failures above.
            print(f" Cleaning up conda env {env_name}...")
            run(["conda", "env", "remove", "-n", env_name, "-y", "--quiet"], timeout=30)
|
|
|
|
def verify_batch(instances: list, patches: dict, max_instances: int = 50,
                 results_path: str = "/app/verify_results.jsonl") -> list:
    """Verify up to *max_instances* instances against their patches.

    Instances with no entry in *patches* are recorded as unresolved and
    skipped. After each verified instance a checkpoint is written to
    *results_path* (previously a hard-coded path; now a parameter with
    the old value as default, so existing callers are unaffected). The
    checkpoint strips the bulky "test_output" field, recording only its
    length, so progress survives a crash without bloating the file.

    Returns the full list of result dicts (test_output intact).
    """
    results = []
    total = min(len(instances), max_instances)
    for i, instance in enumerate(instances[:max_instances]):
        inst_id = instance["instance_id"]
        if inst_id not in patches:
            print(f" [{i+1}/{total}] SKIP {inst_id} — no patch")
            results.append({"instance_id": inst_id, "resolved": False, "error": "no patch"})
            continue

        print(f"\n{'#'*60}")
        print(f" [{i+1}/{total}] {inst_id}")
        print(f"{'#'*60}")

        result = verify_one(instance, patches[inst_id])
        results.append(result)

        # Checkpoint after each instance so a crash loses at most one result.
        with open(results_path, "w") as f:
            for r in results:
                r_clean = {k: v for k, v in r.items() if k != "test_output"}
                r_clean["test_output_len"] = len(r.get("test_output", ""))
                f.write(json.dumps(r_clean) + "\n")

    return results
|
|
|
|
def load_patches_from_results(results_file: str) -> dict:
    """Read a results JSONL file and map instance_id -> patch text.

    Records without a non-empty "patch" field are skipped; when an
    instance appears more than once, the last occurrence wins.
    """
    with open(results_file) as f:
        records = (json.loads(line) for line in f)
        return {
            rec["instance_id"]: rec["patch"]
            for rec in records
            if rec.get("patch")
        }
|
|
|
|
if __name__ == "__main__":
    # CLI: either --instance + --patch (single verification) or
    # --batch <results.jsonl> (verify every instance that has a patch).
    parser = argparse.ArgumentParser(description="Docker-less SWE-bench verification")
    parser.add_argument("--instance", type=str, help="Single instance ID")
    parser.add_argument("--patch", type=str, help="Path to patch file (for single instance)")
    parser.add_argument("--batch", type=str, help="Path to results JSONL with patches")
    parser.add_argument("--max-instances", type=int, default=10)
    args = parser.parse_args()

    # Imported lazily: `datasets` is heavy and only needed at CLI time.
    from datasets import load_dataset

    if args.instance and args.patch:
        # Single-instance mode: linear scan of SWE-bench Verified for the row.
        ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
        instance = None
        for row in ds:
            if row["instance_id"] == args.instance:
                instance = dict(row)
                break
        if not instance:
            print(f"Instance {args.instance} not found")
            sys.exit(1)

        patch = Path(args.patch).read_text()
        result = verify_one(instance, patch)
        print(f"\nRESULT: {json.dumps(result, indent=2)}")

    elif args.batch:
        # Batch mode: only instances that have a patch in the JSONL are run.
        patches = load_patches_from_results(args.batch)
        print(f"Loaded {len(patches)} patches from {args.batch}")

        ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
        instances = []
        for row in ds:
            if row["instance_id"] in patches:
                instances.append(dict(row))

        print(f"Found {len(instances)} instances with patches")

        results = verify_batch(instances, patches, args.max_instances)

        resolved = [r for r in results if r.get("resolved")]
        print(f"\nVERIFICATION COMPLETE: {len(resolved)}/{len(results)} verified")

        # Final write keeps full records (including test_output).
        # NOTE(review): this overwrites verify_batch's cleaned checkpoint
        # at the same path — confirm that is intended.
        with open("/app/verify_results.jsonl", "w") as f:
            for r in results:
                f.write(json.dumps(r) + "\n")

    else:
        parser.print_help()
|
|