""" Docker-less SWE-bench test verification. Replicates the exact conda environment from SWE-bench Docker images without using Docker. Uses `environment_setup_commit` from the dataset to find environment.yml, creates a conda env, installs repo, applies test_patch, and runs pytest. This is the key missing piece for closing the cascade validation loop. With this, we can verify that cascade-produced patches actually pass tests. Requirements: conda (or mamba), git, pip Usage: python dockerless_verify.py --instance django__django-12308 --patch path/to/patch.diff python dockerless_verify.py --batch results.jsonl --max-instances 50 """ import argparse import json import os import re import subprocess import sys import tempfile import time from pathlib import Path from typing import Optional, Tuple, Dict, List # ============================================================ # CONFIG # ============================================================ # Environment file candidate paths (from SWE-bench harness) ENV_CANDIDATES = [ "environment.yml", "dev/environment.yml", ".github/environment.yml", ".github/setup/environment.yml", "requirements/environment.yml", "ci/environment.yml", ".azure-pipelines/environment.yml", "build_tools/azure/environment.yml", "conda_environment.yml", "env/environment.yml", ] # Repo clone URLs REPO_URLS = { "django/django": "https://github.com/django/django.git", "pytest-dev/pytest": "https://github.com/pytest-dev/pytest.git", "scikit-learn/scikit-learn": "https://github.com/scikit-learn/scikit-learn.git", "sympy/sympy": "https://github.com/sympy/sympy.git", "matplotlib/matplotlib": "https://github.com/matplotlib/matplotlib.git", "sphinx-doc/sphinx": "https://github.com/sphinx-doc/sphinx.git", "astropy/astropy": "https://github.com/astropy/astropy.git", "psf/requests": "https://github.com/psf/requests.git", "pylint-dev/pylint": "https://github.com/pylint-dev/pylint.git", "pallets/flask": "https://github.com/pallets/flask.git", "mwaskom/seaborn": "https://github.com/mwaskom/seaborn.git", "pydata/xarray": "https://github.com/pydata/xarray.git", } def run(cmd: list, cwd: str = None, timeout: int = 120, env: dict = None) -> Tuple[int, str, str]: """Run a command, return (returncode, stdout, stderr).""" try: result = subprocess.run( cmd, cwd=cwd, capture_output=True, text=True, timeout=timeout, env=env or os.environ ) return result.returncode, result.stdout, result.stderr except subprocess.TimeoutExpired: return 124, "", "TIMEOUT" except Exception as e: return -1, "", str(e) def find_environment_yml(repo_dir: Path) -> Optional[Path]: """Find environment.yml in the repo at its current checkout.""" for candidate in ENV_CANDIDATES: path = repo_dir / candidate if path.exists() and path.stat().st_size > 10: return path # Try recursive search for path in repo_dir.rglob("environment.yml"): if path.stat().st_size > 10: return path return None def get_python_version_from_yml(yml_path: Path) -> str: """Extract Python version from environment.yml.""" content = yml_path.read_text() # Look for: - python=3.10 or - python=3.9.* m = re.search(r'python\s*[=!<>]+\s*(\d+\.\d+)', content) if m: return m.group(1) m = re.search(r'-\s*python\s*$', content, re.MULTILINE) if m: return "3.10" # default return "3.10" def setup_conda_env(repo_dir: Path, env_name: str) -> Tuple[bool, str]: """ Create conda environment for the repo. Returns (success, error_message). """ yml_path = find_environment_yml(repo_dir) if yml_path: print(f" Found environment.yml: {yml_path.relative_to(repo_dir)}") rc, out, err = run( ["conda", "env", "create", "-f", str(yml_path), "-n", env_name, "--quiet"], timeout=300 ) if rc != 0: # Try with channel flexibility rc2, out2, err2 = run( ["conda", "env", "create", "-f", str(yml_path), "-n", env_name, "--quiet", "--override-channels", "-c", "conda-forge", "-c", "defaults"], timeout=300 ) if rc2 != 0: return False, f"conda env create failed: {err[:500]}\nFallback also failed: {err2[:200]}" else: print(f" No environment.yml found, creating minimal env") py_ver = "3.10" # Try to detect from setup.py setup_py = repo_dir / "setup.py" if setup_py.exists(): content = setup_py.read_text() m = re.search(r'python_requires\s*=\s*[\'"]([^\'"]+)[\'"]', content) if m: ver_spec = m.group(1) m2 = re.search(r'(\d+\.\d+)', ver_spec) if m2: py_ver = m2.group(1) rc, out, err = run( ["conda", "create", "-n", env_name, f"python={py_ver}", "pip", "-y", "--quiet"], timeout=120 ) if rc != 0: return False, f"conda create failed: {err[:300]}" # Install repo in editable mode print(f" Installing repo (pip install -e .)...") rc, out, err = run( ["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"], cwd=str(repo_dir), timeout=300 ) if rc != 0: # Try without quiet to see errors rc2, out2, err2 = run( ["conda", "run", "-n", env_name, "pip", "install", "-e", "."], cwd=str(repo_dir), timeout=300 ) if rc2 != 0: print(f" [WARN] pip install failed: {err2[:300]}") return False, f"pip install failed: {err2[:300]}" # Install test dependencies if any for req_file in ["test-requirements.txt", "requirements-test.txt", "requirements_test.txt", "requirements/dev.txt", "requirements/test.txt", "dev-requirements.txt"]: req_path = repo_dir / req_file if req_path.exists(): print(f" Installing test deps from {req_file}") run(["conda", "run", "-n", env_name, "pip", "install", "-r", str(req_path), "--quiet"], timeout=120) # Also try pip install -e ".[test]" run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".[test]", "--quiet"], cwd=str(repo_dir), timeout=120) return True, "" def verify_patch( instance: dict, model_patch: str, work_dir: Path, env_name: str, ) -> dict: """ Verify a model-generated patch against SWE-bench tests. Steps: 1. Git apply the model's patch 2. Git apply the test_patch from the instance 3. Run FAIL_TO_PASS tests 4. Check that PASS_TO_PASS tests still pass Returns: {success, resolved, test_output, failures} """ result = { "success": False, "resolved": False, "test_output": "", "failures": [], "error": None, } repo_dir = work_dir / "repo" try: # Step 1: Apply model patch model_patch_file = work_dir / "model.patch" model_patch_file.write_text(model_patch) rc, out, err = run( ["git", "apply", "--verbose", str(model_patch_file)], cwd=str(repo_dir) ) if rc != 0: result["error"] = f"model patch apply failed: {err[:300]}" return result # Step 2: Apply test patch test_patch = instance.get("test_patch", "") if not test_patch: result["error"] = "no test_patch in instance" return result test_patch_file = work_dir / "test.patch" test_patch_file.write_text(test_patch) rc, out, err = run( ["git", "apply", "--verbose", str(test_patch_file)], cwd=str(repo_dir) ) if rc != 0: # Try with --reject rc, out, err = run( ["git", "apply", "--reject", "--verbose", str(test_patch_file)], cwd=str(repo_dir) ) if rc != 0: result["error"] = f"test patch apply failed: {err[:300]}" return result # Step 3: Run FAIL_TO_PASS tests fail_tests = instance.get("FAIL_TO_PASS", []) if not fail_tests: result["error"] = "no FAIL_TO_PASS tests" return result # Convert test paths to pytest format test_args = [] for test in fail_tests: # Format: "astropy/modeling/tests/test_separable.py::test_name" test_args.append(test) print(f" Running {len(fail_tests)} FAIL_TO_PASS tests...") cmd = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"] + test_args rc, out, err = run(cmd, cwd=str(repo_dir), timeout=300) result["test_output"] = (out + err)[:2000] # Check if all tests passed if rc == 0: # Step 4: Run PASS_TO_PASS tests to check for regressions pass_tests = instance.get("PASS_TO_PASS", []) if pass_tests: print(f" Running {len(pass_tests)} PASS_TO_PASS regression tests...") cmd2 = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"] + pass_tests[:20] rc2, out2, err2 = run(cmd2, cwd=str(repo_dir), timeout=300) if rc2 == 0: result["success"] = True result["resolved"] = True else: result["error"] = f"regression test failed: {(out2+err2)[:300]}" else: result["success"] = True result["resolved"] = True else: # Count failures failures = [] for line in (out + err).split('\n'): if 'FAILED' in line and '::' in line: failures.append(line.strip()) result["failures"] = failures[:10] return result except Exception as e: result["error"] = str(e)[:500] return result def verify_one(instance: dict, model_patch: str) -> dict: """ Full verification of one instance with one patch. Clones repo, sets up conda env, verifies. """ inst_id = instance["instance_id"] repo = instance.get("repo", "") env_setup_commit = instance.get("environment_setup_commit", "") base_commit = instance.get("base_commit", "") print(f"\n{'='*60}") print(f"VERIFY: {inst_id}") print(f" Repo: {repo}") print(f" Base: {base_commit[:12]} EnvSetup: {env_setup_commit[:12]}") print(f"{'='*60}") repo_url = REPO_URLS.get(repo, f"https://github.com/{repo}.git") with tempfile.TemporaryDirectory(prefix=f"sweverify_{inst_id.replace('/', '_')}_") as tmpdir: work_dir = Path(tmpdir) repo_dir = work_dir / "repo" env_name = f"swebench_{inst_id.replace('__', '_').replace('-', '_')[:40]}" # Clone print(f" Cloning {repo_url}...") rc, out, err = run(["git", "clone", repo_url, str(repo_dir)], timeout=180) if rc != 0: return {"instance_id": inst_id, "resolved": False, "error": f"clone failed: {err[:200]}"} # Step 1: Checkout environment_setup_commit to find environment.yml if env_setup_commit: rc, out, err = run( ["git", "fetch", "origin", env_setup_commit], cwd=str(repo_dir), timeout=60 ) rc, out, err = run( ["git", "checkout", env_setup_commit], cwd=str(repo_dir), timeout=30 ) if rc != 0: print(f" [WARN] Cannot checkout env_setup_commit {env_setup_commit[:12]}, using HEAD") # Step 2: Create conda env success, error = setup_conda_env(repo_dir, env_name) if not success: return {"instance_id": inst_id, "resolved": False, "error": f"env setup: {error[:200]}"} # Step 3: Checkout base_commit if base_commit: rc, out, err = run( ["git", "fetch", "origin", base_commit], cwd=str(repo_dir), timeout=60 ) rc, out, err = run( ["git", "checkout", base_commit], cwd=str(repo_dir), timeout=30 ) if rc != 0: print(f" [WARN] Cannot checkout base_commit {base_commit[:12]}") # Reinstall at base commit print(f" Reinstalling at base_commit...") run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"], cwd=str(repo_dir), timeout=120) # Step 4: Verify result = verify_patch(instance, model_patch, work_dir, env_name) result["instance_id"] = inst_id # Cleanup conda env print(f" Cleaning up conda env {env_name}...") run(["conda", "env", "remove", "-n", env_name, "-y", "--quiet"], timeout=30) return result def verify_batch(instances: list, patches: dict, max_instances: int = 50) -> list: """Verify multiple instances with their patches.""" results = [] for i, instance in enumerate(instances[:max_instances]): inst_id = instance["instance_id"] if inst_id not in patches: print(f" [{i+1}/{min(len(instances), max_instances)}] SKIP {inst_id} — no patch") results.append({"instance_id": inst_id, "resolved": False, "error": "no patch"}) continue print(f"\n{'#'*60}") print(f" [{i+1}/{min(len(instances), max_instances)}] {inst_id}") print(f"{'#'*60}") result = verify_one(instance, patches[inst_id]) results.append(result) # Save incrementally with open("/app/verify_results.jsonl", "w") as f: for r in results: r_clean = {k: v for k, v in r.items() if k != "test_output"} r_clean["test_output_len"] = len(r.get("test_output", "")) f.write(json.dumps(r_clean) + "\n") return results def load_patches_from_results(results_file: str) -> dict: """Load patches from a results JSONL file.""" patches = {} with open(results_file) as f: for line in f: r = json.loads(line) if r.get("patch"): patches[r["instance_id"]] = r["patch"] return patches if __name__ == "__main__": parser = argparse.ArgumentParser(description="Docker-less SWE-bench verification") parser.add_argument("--instance", type=str, help="Single instance ID") parser.add_argument("--patch", type=str, help="Path to patch file (for single instance)") parser.add_argument("--batch", type=str, help="Path to results JSONL with patches") parser.add_argument("--max-instances", type=int, default=10) args = parser.parse_args() from datasets import load_dataset if args.instance and args.patch: # Single instance mode ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test") instance = None for row in ds: if row["instance_id"] == args.instance: instance = dict(row) break if not instance: print(f"Instance {args.instance} not found") sys.exit(1) patch = Path(args.patch).read_text() result = verify_one(instance, patch) print(f"\nRESULT: {json.dumps(result, indent=2)}") elif args.batch: # Batch mode patches = load_patches_from_results(args.batch) print(f"Loaded {len(patches)} patches from {args.batch}") ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test") instances = [] for row in ds: if row["instance_id"] in patches: instances.append(dict(row)) print(f"Found {len(instances)} instances with patches") results = verify_batch(instances, patches, args.max_instances) resolved = [r for r in results if r.get("resolved")] print(f"\nVERIFICATION COMPLETE: {len(resolved)}/{len(results)} verified") with open("/app/verify_results.jsonl", "w") as f: for r in results: f.write(json.dumps(r) + "\n") else: parser.print_help()