"""
Docker-less SWE-bench test verification.
Replicates the exact conda environment from SWE-bench Docker images
without using Docker. Uses `environment_setup_commit` from the dataset
to find environment.yml, creates a conda env, installs repo, applies
test_patch, and runs pytest.
This is the key missing piece for closing the cascade validation loop.
With this, we can verify that cascade-produced patches actually pass tests.
Requirements: conda (or mamba), git, pip
Usage:
python dockerless_verify.py --instance django__django-12308 --patch path/to/patch.diff
python dockerless_verify.py --batch results.jsonl --max-instances 50
"""
import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Optional, Tuple
# ============================================================
# CONFIG
# ============================================================
# Environment file candidate paths (from SWE-bench harness)
ENV_CANDIDATES = [
"environment.yml",
"dev/environment.yml",
".github/environment.yml",
".github/setup/environment.yml",
"requirements/environment.yml",
"ci/environment.yml",
".azure-pipelines/environment.yml",
"build_tools/azure/environment.yml",
"conda_environment.yml",
"env/environment.yml",
]
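# find_environment_yml() falls back to a recursive rglob search if none of these match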
# Repo clone URLs
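# (anything not listed falls back to https://github.com/<repo>.git in verify_one)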
REPO_URLS = {
"django/django": "https://github.com/django/django.git",
"pytest-dev/pytest": "https://github.com/pytest-dev/pytest.git",
"scikit-learn/scikit-learn": "https://github.com/scikit-learn/scikit-learn.git",
"sympy/sympy": "https://github.com/sympy/sympy.git",
"matplotlib/matplotlib": "https://github.com/matplotlib/matplotlib.git",
"sphinx-doc/sphinx": "https://github.com/sphinx-doc/sphinx.git",
"astropy/astropy": "https://github.com/astropy/astropy.git",
"psf/requests": "https://github.com/psf/requests.git",
"pylint-dev/pylint": "https://github.com/pylint-dev/pylint.git",
"pallets/flask": "https://github.com/pallets/flask.git",
"mwaskom/seaborn": "https://github.com/mwaskom/seaborn.git",
"pydata/xarray": "https://github.com/pydata/xarray.git",
}
def run(cmd: list, cwd: Optional[str] = None, timeout: int = 120, env: Optional[dict] = None) -> Tuple[int, str, str]:
"""Run a command, return (returncode, stdout, stderr)."""
try:
result = subprocess.run(
cmd, cwd=cwd, capture_output=True, text=True,
timeout=timeout, env=env or os.environ
)
return result.returncode, result.stdout, result.stderr
except subprocess.TimeoutExpired:
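        # 124 mirrors the exit code used by GNU coreutils `timeout`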
return 124, "", "TIMEOUT"
except Exception as e:
return -1, "", str(e)
def find_environment_yml(repo_dir: Path) -> Optional[Path]:
"""Find environment.yml in the repo at its current checkout."""
for candidate in ENV_CANDIDATES:
path = repo_dir / candidate
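        # Ignore empty or near-empty placeholder files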
if path.exists() and path.stat().st_size > 10:
return path
# Try recursive search
for path in repo_dir.rglob("environment.yml"):
if path.stat().st_size > 10:
return path
return None
def get_python_version_from_yml(yml_path: Path) -> str:
"""Extract Python version from environment.yml."""
content = yml_path.read_text()
# Look for: - python=3.10 or - python=3.9.*
m = re.search(r'python\s*[=!<>]+\s*(\d+\.\d+)', content)
if m:
return m.group(1)
    return "3.10"  # default when no version is pinned
def setup_conda_env(repo_dir: Path, env_name: str) -> Tuple[bool, str]:
"""
Create conda environment for the repo.
Returns (success, error_message).
"""
yml_path = find_environment_yml(repo_dir)
if yml_path:
print(f" Found environment.yml: {yml_path.relative_to(repo_dir)}")
rc, out, err = run(
["conda", "env", "create", "-f", str(yml_path), "-n", env_name, "--quiet"],
timeout=300
)
if rc != 0:
# Try with channel flexibility
rc2, out2, err2 = run(
["conda", "env", "create", "-f", str(yml_path), "-n", env_name,
"--quiet", "--override-channels", "-c", "conda-forge", "-c", "defaults"],
timeout=300
)
if rc2 != 0:
return False, f"conda env create failed: {err[:500]}\nFallback also failed: {err2[:200]}"
else:
print(f" No environment.yml found, creating minimal env")
py_ver = "3.10"
# Try to detect from setup.py
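        # e.g. python_requires=">=3.6" yields 3.6 as the env's Python version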
setup_py = repo_dir / "setup.py"
if setup_py.exists():
content = setup_py.read_text()
m = re.search(r'python_requires\s*=\s*[\'"]([^\'"]+)[\'"]', content)
if m:
ver_spec = m.group(1)
m2 = re.search(r'(\d+\.\d+)', ver_spec)
if m2:
py_ver = m2.group(1)
rc, out, err = run(
["conda", "create", "-n", env_name, f"python={py_ver}", "pip", "-y", "--quiet"],
timeout=120
)
if rc != 0:
return False, f"conda create failed: {err[:300]}"
# Install repo in editable mode
print(f" Installing repo (pip install -e .)...")
rc, out, err = run(
["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
cwd=str(repo_dir),
timeout=300
)
if rc != 0:
# Try without quiet to see errors
rc2, out2, err2 = run(
["conda", "run", "-n", env_name, "pip", "install", "-e", "."],
cwd=str(repo_dir),
timeout=300
)
if rc2 != 0:
print(f" [WARN] pip install failed: {err2[:300]}")
return False, f"pip install failed: {err2[:300]}"
    # Best-effort: install common test-dependency files if present (failures are ignored)
for req_file in ["test-requirements.txt", "requirements-test.txt", "requirements_test.txt",
"requirements/dev.txt", "requirements/test.txt", "dev-requirements.txt"]:
req_path = repo_dir / req_file
if req_path.exists():
print(f" Installing test deps from {req_file}")
run(["conda", "run", "-n", env_name, "pip", "install", "-r", str(req_path), "--quiet"],
timeout=120)
# Also try pip install -e ".[test]"
run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".[test]", "--quiet"],
cwd=str(repo_dir), timeout=120)
return True, ""
def verify_patch(
instance: dict,
model_patch: str,
work_dir: Path,
env_name: str,
) -> dict:
"""
Verify a model-generated patch against SWE-bench tests.
Steps:
1. Git apply the model's patch
2. Git apply the test_patch from the instance
3. Run FAIL_TO_PASS tests
4. Check that PASS_TO_PASS tests still pass
    Returns: {success, resolved, test_output, failures, error}
"""
result = {
"success": False,
"resolved": False,
"test_output": "",
"failures": [],
"error": None,
}
repo_dir = work_dir / "repo"
try:
# Step 1: Apply model patch
model_patch_file = work_dir / "model.patch"
model_patch_file.write_text(model_patch)
rc, out, err = run(
["git", "apply", "--verbose", str(model_patch_file)],
cwd=str(repo_dir)
)
if rc != 0:
result["error"] = f"model patch apply failed: {err[:300]}"
return result
# Step 2: Apply test patch
test_patch = instance.get("test_patch", "")
if not test_patch:
result["error"] = "no test_patch in instance"
return result
test_patch_file = work_dir / "test.patch"
test_patch_file.write_text(test_patch)
rc, out, err = run(
["git", "apply", "--verbose", str(test_patch_file)],
cwd=str(repo_dir)
)
if rc != 0:
# Try with --reject
rc, out, err = run(
["git", "apply", "--reject", "--verbose", str(test_patch_file)],
cwd=str(repo_dir)
)
if rc != 0:
result["error"] = f"test patch apply failed: {err[:300]}"
return result
        # Step 3: Run FAIL_TO_PASS tests
        fail_tests = instance.get("FAIL_TO_PASS", [])
        if isinstance(fail_tests, str):
            # In the HF dataset this field is a JSON-encoded string, not a list
            fail_tests = json.loads(fail_tests)
        if not fail_tests:
            result["error"] = "no FAIL_TO_PASS tests"
            return result
        # Entries are already pytest node ids,
        # e.g. "astropy/modeling/tests/test_separable.py::test_name"
        test_args = list(fail_tests)
print(f" Running {len(fail_tests)} FAIL_TO_PASS tests...")
cmd = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"] + test_args
rc, out, err = run(cmd, cwd=str(repo_dir), timeout=300)
result["test_output"] = (out + err)[:2000]
# Check if all tests passed
if rc == 0:
            # Step 4: Run PASS_TO_PASS tests to check for regressions
            pass_tests = instance.get("PASS_TO_PASS", [])
            if isinstance(pass_tests, str):
                pass_tests = json.loads(pass_tests)  # JSON-encoded in the HF dataset
            if pass_tests:
                # Cap at the first 20 tests to keep runtime bounded
                print(f"  Running {min(len(pass_tests), 20)} of {len(pass_tests)} PASS_TO_PASS regression tests...")
                cmd2 = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"] + pass_tests[:20]
rc2, out2, err2 = run(cmd2, cwd=str(repo_dir), timeout=300)
if rc2 == 0:
result["success"] = True
result["resolved"] = True
else:
result["error"] = f"regression test failed: {(out2+err2)[:300]}"
else:
result["success"] = True
result["resolved"] = True
else:
# Count failures
failures = []
for line in (out + err).split('\n'):
if 'FAILED' in line and '::' in line:
failures.append(line.strip())
result["failures"] = failures[:10]
return result
except Exception as e:
result["error"] = str(e)[:500]
return result
def verify_one(instance: dict, model_patch: str) -> dict:
"""
Full verification of one instance with one patch.
Clones repo, sets up conda env, verifies.
"""
inst_id = instance["instance_id"]
repo = instance.get("repo", "")
env_setup_commit = instance.get("environment_setup_commit", "")
base_commit = instance.get("base_commit", "")
print(f"\n{'='*60}")
print(f"VERIFY: {inst_id}")
print(f" Repo: {repo}")
print(f" Base: {base_commit[:12]} EnvSetup: {env_setup_commit[:12]}")
print(f"{'='*60}")
repo_url = REPO_URLS.get(repo, f"https://github.com/{repo}.git")
with tempfile.TemporaryDirectory(prefix=f"sweverify_{inst_id.replace('/', '_')}_") as tmpdir:
work_dir = Path(tmpdir)
repo_dir = work_dir / "repo"
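        # Sanitize the instance id into a short, legal conda env name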
env_name = f"swebench_{inst_id.replace('__', '_').replace('-', '_')[:40]}"
# Clone
print(f" Cloning {repo_url}...")
rc, out, err = run(["git", "clone", repo_url, str(repo_dir)], timeout=180)
if rc != 0:
return {"instance_id": inst_id, "resolved": False, "error": f"clone failed: {err[:200]}"}
# Step 1: Checkout environment_setup_commit to find environment.yml
if env_setup_commit:
rc, out, err = run(
["git", "fetch", "origin", env_setup_commit],
cwd=str(repo_dir), timeout=60
)
rc, out, err = run(
["git", "checkout", env_setup_commit],
cwd=str(repo_dir), timeout=30
)
if rc != 0:
print(f" [WARN] Cannot checkout env_setup_commit {env_setup_commit[:12]}, using HEAD")
# Step 2: Create conda env
success, error = setup_conda_env(repo_dir, env_name)
if not success:
return {"instance_id": inst_id, "resolved": False, "error": f"env setup: {error[:200]}"}
# Step 3: Checkout base_commit
if base_commit:
rc, out, err = run(
["git", "fetch", "origin", base_commit],
cwd=str(repo_dir), timeout=60
)
rc, out, err = run(
["git", "checkout", base_commit],
cwd=str(repo_dir), timeout=30
)
if rc != 0:
print(f" [WARN] Cannot checkout base_commit {base_commit[:12]}")
# Reinstall at base commit
print(f" Reinstalling at base_commit...")
run(["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
cwd=str(repo_dir), timeout=120)
# Step 4: Verify
result = verify_patch(instance, model_patch, work_dir, env_name)
result["instance_id"] = inst_id
# Cleanup conda env
print(f" Cleaning up conda env {env_name}...")
run(["conda", "env", "remove", "-n", env_name, "-y", "--quiet"], timeout=30)
return result
def verify_batch(instances: list, patches: dict, max_instances: int = 50) -> list:
"""Verify multiple instances with their patches."""
results = []
for i, instance in enumerate(instances[:max_instances]):
inst_id = instance["instance_id"]
if inst_id not in patches:
print(f" [{i+1}/{min(len(instances), max_instances)}] SKIP {inst_id} — no patch")
results.append({"instance_id": inst_id, "resolved": False, "error": "no patch"})
continue
print(f"\n{'#'*60}")
print(f" [{i+1}/{min(len(instances), max_instances)}] {inst_id}")
print(f"{'#'*60}")
result = verify_one(instance, patches[inst_id])
results.append(result)
        # Save incrementally, dropping the bulky test_output (keep only its length)
with open("/app/verify_results.jsonl", "w") as f:
for r in results:
r_clean = {k: v for k, v in r.items() if k != "test_output"}
r_clean["test_output_len"] = len(r.get("test_output", ""))
f.write(json.dumps(r_clean) + "\n")
return results
def load_patches_from_results(results_file: str) -> dict:
"""Load patches from a results JSONL file."""
patches = {}
with open(results_file) as f:
for line in f:
r = json.loads(line)
if r.get("patch"):
patches[r["instance_id"]] = r["patch"]
return patches
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Docker-less SWE-bench verification")
parser.add_argument("--instance", type=str, help="Single instance ID")
parser.add_argument("--patch", type=str, help="Path to patch file (for single instance)")
parser.add_argument("--batch", type=str, help="Path to results JSONL with patches")
parser.add_argument("--max-instances", type=int, default=10)
args = parser.parse_args()
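    # Deferred import: `--help` exits in parse_args, so it works even if datasets isn't installed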
from datasets import load_dataset
if args.instance and args.patch:
# Single instance mode
ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
instance = None
for row in ds:
if row["instance_id"] == args.instance:
instance = dict(row)
break
if not instance:
print(f"Instance {args.instance} not found")
sys.exit(1)
patch = Path(args.patch).read_text()
result = verify_one(instance, patch)
print(f"\nRESULT: {json.dumps(result, indent=2)}")
elif args.batch:
# Batch mode
patches = load_patches_from_results(args.batch)
print(f"Loaded {len(patches)} patches from {args.batch}")
ds = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
instances = []
for row in ds:
if row["instance_id"] in patches:
instances.append(dict(row))
print(f"Found {len(instances)} instances with patches")
results = verify_batch(instances, patches, args.max_instances)
resolved = [r for r in results if r.get("resolved")]
print(f"\nVERIFICATION COMPLETE: {len(resolved)}/{len(results)} verified")
with open("/app/verify_results.jsonl", "w") as f:
for r in results:
f.write(json.dumps(r) + "\n")
else:
parser.print_help()