"""
Docker-less SWE-bench test verification.
Replicates the exact conda environment from SWE-bench Docker images
without using Docker. Uses `environment_setup_commit` from the dataset
to find environment.yml, creates a conda env, installs repo, applies
test_patch, and runs pytest.
This is the key missing piece for closing the cascade validation loop.
With this, we can verify that cascade-produced patches actually pass tests.
Requirements: conda (or mamba), git, pip
Usage:
python dockerless_verify.py --instance django__django-12308 --patch path/to/patch.diff
python dockerless_verify.py --batch results.jsonl --max-instances 50
"""
import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Optional, Tuple
# ============================================================
# CONFIG
# ============================================================
# Environment file candidate paths (from SWE-bench harness)
ENV_CANDIDATES = [
"environment.yml",
"dev/environment.yml",
".github/environment.yml",
".github/setup/environment.yml",
"requirements/environment.yml",
"ci/environment.yml",
".azure-pipelines/environment.yml",
"build_tools/azure/environment.yml",
"conda_environment.yml",
"env/environment.yml",
]
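# find_environment_yml() falls back to a recursive rglob search if none of these match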
# Repo clone URLs
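# (anything not listed falls back to https://github.com/<repo>.git in verify_one)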
REPO_URLS = {
"django/django": "https://github.com/django/django.git",
"pytest-dev/pytest": "https://github.com/pytest-dev/pytest.git",
"scikit-learn/scikit-learn": "https://github.com/scikit-learn/scikit-learn.git",
"sympy/sympy": "https://github.com/sympy/sympy.git",
"matplotlib/matplotlib": "https://github.com/matplotlib/matplotlib.git",
"sphinx-doc/sphinx": "https://github.com/sphinx-doc/sphinx.git",
"astropy/astropy": "https://github.com/astropy/astropy.git",
"psf/requests": "https://github.com/psf/requests.git",
"pylint-dev/pylint": "https://github.com/pylint-dev/pylint.git",
"pallets/flask": "https://github.com/pallets/flask.git",
"mwaskom/seaborn": "https://github.com/mwaskom/seaborn.git",
"pydata/xarray": "https://github.com/pydata/xarray.git",
}
def run(cmd: list, cwd: Optional[str] = None, timeout: int = 120, env: Optional[dict] = None) -> Tuple[int, str, str]:
"""Run a command, return (returncode, stdout, stderr)."""
try:
result = subprocess.run(
cmd, cwd=cwd, capture_output=True, text=True,
timeout=timeout, env=env or os.environ
)
return result.returncode, result.stdout, result.stderr
except subprocess.TimeoutExpired:
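        # 124 mirrors the exit code used by GNU coreutils `timeout`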
return 124, "", "TIMEOUT"
except Exception as e:
return -1, "", str(e)
def find_environment_yml(repo_dir: Path) -> Optional[Path]:
"""Find environment.yml in the repo at its current checkout."""
for candidate in ENV_CANDIDATES:
path = repo_dir / candidate
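        # Ignore empty or near-empty placeholder files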
if path.exists() and path.stat().st_size > 10:
return path
# Try recursive search
for path in repo_dir.rglob("environment.yml"):
if path.stat().st_size > 10:
return path
return None
def get_python_version_from_yml(yml_path: Path) -> str:
"""Extract Python version from environment.yml."""
content = yml_path.read_text()
# Look for: - python=3.10 or - python=3.9.*
m = re.search(r'python\s*[=!<>]+\s*(\d+\.\d+)', content)
if m:
return m.group(1)
    return "3.10"  # default when no version is pinned
def setup_conda_env(repo_dir: Path, env_name: str) -> Tuple[bool, str]:
"""
Create conda environment for the repo.
Returns (success, error_message).
"""
yml_path = find_environment_yml(repo_dir)
if yml_path:
print(f" Found environment.yml: {yml_path.relative_to(repo_dir)}")
rc, out, err = run(
["conda", "env", "create", "-f", str(yml_path), "-n", env_name, "--quiet"],
timeout=300
)
if rc != 0:
# Try with channel flexibility
rc2, out2, err2 = run(
["conda", "env", "create", "-f", str(yml_path), "-n", env_name,
"--quiet", "--override-channels", "-c", "conda-forge", "-c", "defaults"],
timeout=300
)
if rc2 != 0:
return False, f"conda env create failed: {err[:500]}\nFallback also failed: {err2[:200]}"
else:
print(f" No environment.yml found, creating minimal env")
py_ver = "3.10"
# Try to detect from setup.py
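        # e.g. python_requires=">=3.6" yields 3.6 as the env's Python version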
setup_py = repo_dir / "setup.py"
if setup_py.exists():
content = setup_py.read_text()
m = re.search(r'python_requires\s*=\s*[\'"]([^\'"]+)[\'"]', content)
if m:
ver_spec = m.group(1)
m2 = re.search(r'(\d+\.\d+)', ver_spec)
if m2:
py_ver = m2.group(1)
rc, out, err = run(
["conda", "create", "-n", env_name, f"python={py_ver}", "pip", "-y", "--quiet"],
timeout=120
)
if rc != 0:
return False, f"conda create failed: {err[:300]}"
# Install repo in editable mode
print(f" Installing repo (pip install -e .)...")
rc, out, err = run(
["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
cwd=str(repo_dir),
timeout=300
)
if rc != 0:
# Try without quiet to see errors
rc2, out2, err2 = run(
["conda", "run", "-n", env_name, "pip", "install", "-e", "."],
cwd=str(repo_dir),
timeout=300
)
if rc2 != 0:
print(f" [WARN] pip install failed: {err2[:300]}")
return False, f"pip install failed: {err2[:300]}"
    # Best-effort: install common test-dependency files if present (failures are ignored)
for req_file in ["test-requirements.txt", "requirements-test.txt", "requirements_test.txt",
"requirements/dev.txt", "requirements/test.txt", "dev-requirements.txt"]:
req_path = repo_dir / req_file
if req_path.exists():
print(f" Installing test deps from {req_file}")
run(["conda", "run", "-n", env_name, "pip", "install", "-r", str(req_path), "--quiet"],
timeout=120)
# Also try pip install -e ".[test]"
run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".[test]", "--quiet"],
cwd=str(repo_dir), timeout=120)
return True, ""
def verify_patch(
instance: dict,
model_patch: str,
work_dir: Path,
env_name: str,
) -> dict:
"""
Verify a model-generated patch against SWE-bench tests.
Steps:
1. Git apply the model's patch
2. Git apply the test_patch from the instance
3. Run FAIL_TO_PASS tests
4. Check that PASS_TO_PASS tests still pass
    Returns: {success, resolved, test_output, failures, error}
"""
result = {
"success": False,
"resolved": False,
"test_output": "",
"failures": [],
"error": None,
}
repo_dir = work_dir / "repo"
try:
# Step 1: Apply model patch
model_patch_file = work_dir / "model.patch"
model_patch_file.write_text(model_patch)
rc, out, err = run(
["git", "apply", "--verbose", str(model_patch_file)],
cwd=str(repo_dir)
)
if rc != 0:
result["error"] = f"model patch apply failed: {err[:300]}"
return result
# Step 2: Apply test patch
test_patch = instance.get("test_patch", "")
if not test_patch:
result["error"] = "no test_patch in instance"
return result
test_patch_file = work_dir / "test.patch"
test_patch_file.write_text(test_patch)
rc, out, err = run(
["git", "apply", "--verbose", str(test_patch_file)],
cwd=str(repo_dir)
)
if rc != 0:
# Try with --reject
rc, out, err = run(
["git", "apply", "--reject", "--verbose", str(test_patch_file)],
cwd=str(repo_dir)
)
if rc != 0:
result["error"] = f"test patch apply failed: {err[:300]}"
return result
        # Step 3: Run FAIL_TO_PASS tests
        fail_tests = instance.get("FAIL_TO_PASS", [])
        if isinstance(fail_tests, str):
            # In the HF dataset this field is a JSON-encoded string, not a list
            fail_tests = json.loads(fail_tests)
        if not fail_tests:
            result["error"] = "no FAIL_TO_PASS tests"
            return result
        # Entries are already pytest node ids,
        # e.g. "astropy/modeling/tests/test_separable.py::test_name"
        test_args = list(fail_tests)
print(f" Running {len(fail_tests)} FAIL_TO_PASS tests...")
cmd = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"] + test_args
rc, out, err = run(cmd, cwd=str(repo_dir), timeout=300)
result["test_output"] = (out + err)[:2000]
# Check if all tests passed
if rc == 0:
            # Step 4: Run PASS_TO_PASS tests to check for regressions
            pass_tests = instance.get("PASS_TO_PASS", [])
            if isinstance(pass_tests, str):
                pass_tests = json.loads(pass_tests)  # JSON-encoded in the HF dataset
            if pass_tests:
                # Cap at the first 20 tests to keep runtime bounded
                print(f"  Running {min(len(pass_tests), 20)} of {len(pass_tests)} PASS_TO_PASS regression tests...")
                cmd2 = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"] + pass_tests[:20]
rc2, out2, err2 = run(cmd2, cwd=str(repo_dir), timeout=300)
if rc2 == 0:
result["success"] = True
result["resolved"] = True
else:
result["error"] = f"regression test failed: {(out2+err2)[:300]}"
else:
result["success"] = True
result["resolved"] = True
else:
# Count failures
failures = []
for line in (out + err).split('\n'):
if 'FAILED' in line and '::' in line:
failures.append(line.strip())
result["failures"] = failures[:10]
return result
except Exception as e:
result["error"] = str(e)[:500]
return result
def verify_one(instance: dict, model_patch: str) -> dict:
"""
Full verification of one instance with one patch.
Clones repo, sets up conda env, verifies.
"""
inst_id = instance["instance_id"]
repo = instance.get("repo", "")
env_setup_commit = instance.get("environment_setup_commit", "")
base_commit = instance.get("base_commit", "")
print(f"\n{'='*60}")
print(f"VERIFY: {inst_id}")
print(f" Repo: {repo}")
print(f" Base: {base_commit[:12]} EnvSetup: {env_setup_commit[:12]}")
print(f"{'='*60}")
repo_url = REPO_URLS.get(repo, f"https://github.com/{repo}.git")
with tempfile.TemporaryDirectory(prefix=f"sweverify_{inst_id.replace('/', '_')}_") as tmpdir:
work_dir = Path(tmpdir)
repo_dir = work_dir / "repo"
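        # Sanitize the instance id into a short, legal conda env name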
env_name = f"swebench_{inst_id.replace('__', '_').replace('-', '_')[:40]}"
# Clone
print(f" Cloning {repo_url}...")
rc, out, err = run(["git", "clone", repo_url, str(repo_dir)], timeout=180)
if rc != 0:
return {"instance_id": inst_id, "resolved": False, "error": f"clone failed: {err[:200]}"}
# Step 1: Checkout environment_setup_commit to find environment.yml
if env_setup_commit:
rc, out, err = run(
["git", "fetch", "origin", env_setup_commit],
cwd=str(repo_dir), timeout=60
)
rc, out, err = run(
["git", "checkout", env_setup_commit],
cwd=str(repo_dir), timeout=30
)
if rc != 0:
print(f" [WARN] Cannot checkout env_setup_commit {env_setup_commit[:12]}, using HEAD")
# Step 2: Create conda env
success, error = setup_conda_env(repo_dir, env_name)
if not success:
return {"instance_id": inst_id, "resolved": False, "error": f"env setup: {error[:200]}"}
# Step 3: Checkout base_commit
if base_commit:
rc, out, err = run(
["git", "fetch", "origin", base_commit],
cwd=str(repo_dir), timeout=60
)
rc, out, err = run(
["git", "checkout", base_commit],
cwd=str(repo_dir), timeout=30
)
if rc != 0:
print(f" [WARN] Cannot checkout base_commit {base_commit[:12]}")
# Reinstall at base commit
print(f" Reinstalling at base_commit...")
run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
cwd=str(repo_dir), timeout=120)
# Step 4: Verify
result = verify_patch(instance, model_patch, work_dir, env_name)
result["instance_id"] = inst_id
# Cleanup conda env
print(f" Cleaning up conda env {env_name}...")
run(["conda", "env", "remove", "-n", env_name, "-y", "--quiet"], timeout=30)
return result
def verify_batch(instances: list, patches: dict, max_instances: int = 50) -> list:
"""Verify multiple instances with their patches."""
results = []
for i, instance in enumerate(instances[:max_instances]):
inst_id = instance["instance_id"]
if inst_id not in patches:
print(f" [{i+1}/{min(len(instances), max_instances)}] SKIP {inst_id} — no patch")
results.append({"instance_id": inst_id, "resolved": False, "error": "no patch"})
continue
print(f"\n{'#'*60}")
print(f" [{i+1}/{min(len(instances), max_instances)}] {inst_id}")
print(f"{'#'*60}")
result = verify_one(instance, patches[inst_id])
results.append(result)
        # Save incrementally, dropping the bulky test_output (keep only its length)
with open("/app/verify_results.jsonl", "w") as f:
for r in results:
r_clean = {k: v for k, v in r.items() if k != "test_output"}
r_clean["test_output_len"] = len(r.get("test_output", ""))
f.write(json.dumps(r_clean) + "\n")
return results
def load_patches_from_results(results_file: str) -> dict:
"""Load patches from a results JSONL file."""
patches = {}
with open(results_file) as f:
for line in f:
r = json.loads(line)
if r.get("patch"):
patches[r["instance_id"]] = r["patch"]
return patches
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Docker-less SWE-bench verification")
parser.add_argument("--instance", type=str, help="Single instance ID")
parser.add_argument("--patch", type=str, help="Path to patch file (for single instance)")
parser.add_argument("--batch", type=str, help="Path to results JSONL with patches")
parser.add_argument("--max-instances", type=int, default=10)
args = parser.parse_args()
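    # Deferred import: `--help` exits in parse_args, so it works even if datasets isn't installed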
from datasets import load_dataset
if args.instance and args.patch:
# Single instance mode
ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
instance = None
for row in ds:
if row["instance_id"] == args.instance:
instance = dict(row)
break
if not instance:
print(f"Instance {args.instance} not found")
sys.exit(1)
patch = Path(args.patch).read_text()
result = verify_one(instance, patch)
print(f"\nRESULT: {json.dumps(result, indent=2)}")
elif args.batch:
# Batch mode
patches = load_patches_from_results(args.batch)
print(f"Loaded {len(patches)} patches from {args.batch}")
ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
instances = []
for row in ds:
if row["instance_id"] in patches:
instances.append(dict(row))
print(f"Found {len(instances)} instances with patches")
results = verify_batch(instances, patches, args.max_instances)
resolved = [r for r in results if r.get("resolved")]
print(f"\nVERIFICATION COMPLETE: {len(resolved)}/{len(results)} verified")
with open("/app/verify_results.jsonl", "w") as f:
for r in results:
f.write(json.dumps(r) + "\n")
else:
parser.print_help()