# File size: 16,683 Bytes

# 43328de

"""
Docker-less SWE-bench test verification.

Replicates the exact conda environment from SWE-bench Docker images
without using Docker. Uses `environment_setup_commit` from the dataset
to find environment.yml, creates a conda env, installs repo, applies
test_patch, and runs pytest.

This is the key missing piece for closing the cascade validation loop.
With this, we can verify that cascade-produced patches actually pass tests.

Requirements: conda (or mamba), git, pip
Usage:
    python dockerless_verify.py --instance django__django-12308 --patch path/to/patch.diff
    python dockerless_verify.py --batch results.jsonl --max-instances 50
"""

import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Optional, Tuple, Dict, List

# ============================================================
# CONFIG
# ============================================================

# Repository-relative paths probed, in list order, when looking for a conda
# environment file; the first path that exists and is larger than 10 bytes
# wins. A recursive search for "environment.yml" is only attempted when none
# of these match.
# NOTE(review): the list is said to come from the SWE-bench harness — confirm
# against the harness version in use.
ENV_CANDIDATES = [
    "environment.yml",
    "dev/environment.yml",
    ".github/environment.yml",
    ".github/setup/environment.yml",
    "requirements/environment.yml",
    "ci/environment.yml",
    ".azure-pipelines/environment.yml",
    "build_tools/azure/environment.yml",
    "conda_environment.yml",
    "env/environment.yml",
]

# Clone URLs keyed by the instance's "owner/name" repo identifier.
# A repo that is missing from this table is cloned from
# https://github.com/<repo>.git instead, so an entry here is optional for
# GitHub-hosted repositories.
REPO_URLS = {
    "django/django": "https://github.com/django/django.git",
    "pytest-dev/pytest": "https://github.com/pytest-dev/pytest.git",
    "scikit-learn/scikit-learn": "https://github.com/scikit-learn/scikit-learn.git",
    "sympy/sympy": "https://github.com/sympy/sympy.git",
    "matplotlib/matplotlib": "https://github.com/matplotlib/matplotlib.git",
    "sphinx-doc/sphinx": "https://github.com/sphinx-doc/sphinx.git",
    "astropy/astropy": "https://github.com/astropy/astropy.git",
    "psf/requests": "https://github.com/psf/requests.git",
    "pylint-dev/pylint": "https://github.com/pylint-dev/pylint.git",
    "pallets/flask": "https://github.com/pallets/flask.git",
    "mwaskom/seaborn": "https://github.com/mwaskom/seaborn.git",
    "pydata/xarray": "https://github.com/pydata/xarray.git",
}


def run(cmd: list, cwd: str = None, timeout: int = 120, env: dict = None) -> Tuple[int, str, str]:
    """Execute *cmd* as a subprocess and report how it ended.

    Output is captured as text. The process inherits ``os.environ`` unless a
    non-empty ``env`` mapping is supplied.

    Returns (returncode, stdout, stderr). A timeout is reported as
    ``(124, "", "TIMEOUT")``; any other failure to run the command is
    reported as ``(-1, "", <exception text>)`` instead of being raised.
    """
    environment = env or os.environ
    try:
        completed = subprocess.run(
            cmd,
            cwd=cwd,
            env=environment,
            timeout=timeout,
            capture_output=True,
            text=True,
        )
    except subprocess.TimeoutExpired:
        return 124, "", "TIMEOUT"
    except Exception as exc:
        # Callers rely on a tuple for every outcome (e.g. missing executable).
        return -1, "", str(exc)
    return completed.returncode, completed.stdout, completed.stderr


def find_environment_yml(repo_dir: Path, candidates: Optional[List[str]] = None) -> Optional[Path]:
    """Find a conda environment file in the repo at its current checkout.

    Args:
        repo_dir: Root of the checked-out repository.
        candidates: Repository-relative paths to probe first, in order.
            Defaults to the module-level ``ENV_CANDIDATES``.

    Returns:
        The first candidate that is a regular file larger than 10 bytes.
        If no candidate qualifies, the shallowest ``environment.yml`` found
        by a recursive search (ties broken by path, so the choice does not
        depend on filesystem listing order). ``None`` when nothing usable
        exists.
    """
    if candidates is None:
        candidates = ENV_CANDIDATES

    def _usable(path: Path) -> bool:
        # is_file() rejects directories and broken symlinks; the size check
        # skips empty / placeholder files.
        try:
            return path.is_file() and path.stat().st_size > 10
        except OSError:
            return False

    for candidate in candidates:
        path = repo_dir / candidate
        if _usable(path):
            return path

    # Fallback: recursive search, made deterministic by preferring the
    # match closest to the repository root.
    matches = [path for path in repo_dir.rglob("environment.yml") if _usable(path)]
    if not matches:
        return None
    return min(matches, key=lambda p: (len(p.parts), p.as_posix()))


def get_python_version_from_yml(yml_path: Path) -> str:
    """Extract the ``major.minor`` Python version pinned in an environment.yml.

    Recognises specs such as ``python=3.10``, ``python>=3.8`` or
    ``python=3.9.*``. Packages whose names merely end in "python"
    (``ipython=8.1``, ``biopython=1.79``) are not mistaken for the
    interpreter pin.

    Returns:
        The first matching version as a string, or "3.10" when the file
        contains no versioned ``python`` spec.
    """
    content = yml_path.read_text(encoding="utf-8", errors="replace")
    # The lookbehind requires "python" to start a package name, so that
    # e.g. "ipython=7.2" does not match.
    match = re.search(r'(?<![\w.-])python\s*[=!<>]+\s*(\d+\.\d+)', content)
    if match:
        return match.group(1)
    return "3.10"  # default (also used for an unversioned "- python" entry)


def setup_conda_env(repo_dir: Path, env_name: str) -> Tuple[bool, str]:
    """
    Build the conda environment ``env_name`` for the checkout in ``repo_dir``.

    The environment is created from the repo's environment.yml when one is
    found (retried once with explicit conda-forge/defaults channels);
    otherwise from a bare ``python=<version> pip`` spec whose version is
    taken from setup.py's ``python_requires`` (3.10 when absent). The repo is
    then pip-installed in editable mode. Known test-requirement files and
    the ``[test]`` extra are installed afterwards, ignoring failures.

    Returns (success, error_message); error_message is "" on success.
    """
    checkout = str(repo_dir)
    pip_install = ["conda", "run", "-n", env_name, "pip", "install"]

    env_file = find_environment_yml(repo_dir)
    if not env_file:
        print("  No environment.yml found, creating minimal env")
        python_version = "3.10"
        # Try to detect the interpreter version from setup.py
        setup_script = repo_dir / "setup.py"
        if setup_script.exists():
            requires = re.search(
                r'python_requires\s*=\s*[\'"]([^\'"]+)[\'"]', setup_script.read_text()
            )
            version = re.search(r'(\d+\.\d+)', requires.group(1)) if requires else None
            if version:
                python_version = version.group(1)

        status, _, stderr = run(
            ["conda", "create", "-n", env_name, f"python={python_version}", "pip", "-y", "--quiet"],
            timeout=120,
        )
        if status != 0:
            return False, f"conda create failed: {stderr[:300]}"
    else:
        print(f"  Found environment.yml: {env_file.relative_to(repo_dir)}")
        create_cmd = ["conda", "env", "create", "-f", str(env_file), "-n", env_name, "--quiet"]
        status, _, stderr = run(create_cmd, timeout=300)
        if status != 0:
            # Second attempt with channel flexibility
            retry_cmd = create_cmd + ["--override-channels", "-c", "conda-forge", "-c", "defaults"]
            retry_status, _, retry_stderr = run(retry_cmd, timeout=300)
            if retry_status != 0:
                return False, f"conda env create failed: {stderr[:500]}\nFallback also failed: {retry_stderr[:200]}"

    # Editable install of the repo itself
    print("  Installing repo (pip install -e .)...")
    status, _, _ = run(pip_install + ["-e", ".", "--quiet"], cwd=checkout, timeout=300)
    if status != 0:
        # Run again without --quiet so the error text is available
        verbose_status, _, verbose_err = run(pip_install + ["-e", "."], cwd=checkout, timeout=300)
        if verbose_status != 0:
            failure = f"pip install failed: {verbose_err[:300]}"
            print(f"  [WARN] {failure}")
            return False, failure

    # Test dependencies: every known requirements file that exists
    known_requirement_files = (
        "test-requirements.txt",
        "requirements-test.txt",
        "requirements_test.txt",
        "requirements/dev.txt",
        "requirements/test.txt",
        "dev-requirements.txt",
    )
    for relative_name in known_requirement_files:
        requirements = repo_dir / relative_name
        if not requirements.exists():
            continue
        print(f"  Installing test deps from {relative_name}")
        run(pip_install + ["-r", str(requirements), "--quiet"], timeout=120)

    # The "[test]" extra is attempted as well; its outcome is not checked
    run(pip_install + ["-e", ".[test]", "--quiet"], cwd=checkout, timeout=120)

    return True, ""


def _parse_test_ids(value) -> List[str]:
    """Normalise a FAIL_TO_PASS / PASS_TO_PASS field to a list of test ids.

    The field may already be a list, or a JSON-encoded list stored as a
    string. A string that is not valid JSON is treated as one test id.
    """
    if not value:
        return []
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            stripped = value.strip()
            return [stripped] if stripped else []
        if isinstance(value, str):
            return [value] if value else []
    if isinstance(value, (list, tuple)):
        return [str(test) for test in value]
    return []


def verify_patch(
    instance: dict,
    model_patch: str,
    work_dir: Path,
    env_name: str,
) -> dict:
    """
    Verify a model-generated patch against SWE-bench tests.

    Steps:
    0. Validate the instance (test_patch and FAIL_TO_PASS present) before
       the work tree is modified
    1. Git apply the model's patch
    2. Git apply the test_patch from the instance (retrying with --reject)
    3. Run FAIL_TO_PASS tests with pytest inside the conda env
    4. Check that PASS_TO_PASS tests still pass (only the first 20 are run)

    Args:
        instance: Dataset row. ``FAIL_TO_PASS`` / ``PASS_TO_PASS`` may be
            lists or JSON-encoded strings; test ids are passed to pytest
            verbatim.
        model_patch: Unified diff produced by the model.
        work_dir: Directory containing the checkout in ``repo/``; the patch
            files are written here.
        env_name: Name of the conda environment to run the tests in.

    Returns: {success, resolved, test_output, failures, error}. Exceptions
    are not raised; they are reported in ``error``.
    """
    result = {
        "success": False,
        "resolved": False,
        "test_output": "",
        "failures": [],
        "error": None,
    }

    repo_dir = work_dir / "repo"

    try:
        # Step 0: fail fast on unusable instances, before touching the repo
        test_patch = instance.get("test_patch", "")
        if not test_patch:
            result["error"] = "no test_patch in instance"
            return result

        fail_tests = _parse_test_ids(instance.get("FAIL_TO_PASS", []))
        if not fail_tests:
            result["error"] = "no FAIL_TO_PASS tests"
            return result

        pass_tests = _parse_test_ids(instance.get("PASS_TO_PASS", []))

        # Step 1: Apply model patch
        model_patch_file = work_dir / "model.patch"
        model_patch_file.write_text(model_patch)

        rc, out, err = run(
            ["git", "apply", "--verbose", str(model_patch_file)],
            cwd=str(repo_dir)
        )
        if rc != 0:
            result["error"] = f"model patch apply failed: {err[:300]}"
            return result

        # Step 2: Apply test patch
        test_patch_file = work_dir / "test.patch"
        test_patch_file.write_text(test_patch)

        rc, out, err = run(
            ["git", "apply", "--verbose", str(test_patch_file)],
            cwd=str(repo_dir)
        )
        if rc != 0:
            # Try with --reject
            rc, out, err = run(
                ["git", "apply", "--reject", "--verbose", str(test_patch_file)],
                cwd=str(repo_dir)
            )
            if rc != 0:
                result["error"] = f"test patch apply failed: {err[:300]}"
                return result

        # Step 3: Run FAIL_TO_PASS tests
        # Test ids look like "astropy/modeling/tests/test_separable.py::test_name"
        pytest_cmd = ["conda", "run", "-n", env_name, "python", "-m", "pytest", "-x", "-v"]

        print(f"  Running {len(fail_tests)} FAIL_TO_PASS tests...")
        rc, out, err = run(pytest_cmd + fail_tests, cwd=str(repo_dir), timeout=300)

        result["test_output"] = (out + err)[:2000]

        if rc != 0:
            # Collect the pytest "FAILED path::test" summary lines
            failures = [
                line.strip()
                for line in (out + err).split('\n')
                if 'FAILED' in line and '::' in line
            ]
            result["failures"] = failures[:10]
            return result

        # Step 4: Run PASS_TO_PASS tests to check for regressions
        if pass_tests:
            print(f"  Running {len(pass_tests)} PASS_TO_PASS regression tests...")
            # Only the first 20 regression tests are executed
            rc2, out2, err2 = run(pytest_cmd + pass_tests[:20], cwd=str(repo_dir), timeout=300)
            if rc2 != 0:
                result["error"] = f"regression test failed: {(out2+err2)[:300]}"
                return result

        result["success"] = True
        result["resolved"] = True
        return result

    except Exception as e:
        result["error"] = str(e)[:500]
        return result


def verify_one(instance: dict, model_patch: str) -> dict:
    """
    Full verification of one instance with one patch.

    Clones the repo into a temporary directory, builds the conda env at
    ``environment_setup_commit`` (falling back to the cloned HEAD when that
    commit cannot be checked out), switches to ``base_commit``, reinstalls
    the repo and runs ``verify_patch``.

    The conda env is removed on every exit path once its creation has been
    attempted, including env-setup failures. A failure to check out
    ``base_commit`` is reported as an error instead of verifying the patch
    against the wrong commit.

    Returns a result dict that always contains ``instance_id`` and
    ``resolved``; failures carry an ``error`` message.
    """
    inst_id = instance["instance_id"]
    repo = instance.get("repo", "")
    # "or ''" guards against explicit None values, which cannot be sliced
    env_setup_commit = instance.get("environment_setup_commit", "") or ""
    base_commit = instance.get("base_commit", "") or ""

    print(f"\n{'='*60}")
    print(f"VERIFY: {inst_id}")
    print(f"  Repo: {repo}")
    print(f"  Base: {base_commit[:12]} EnvSetup: {env_setup_commit[:12]}")
    print(f"{'='*60}")

    repo_url = REPO_URLS.get(repo, f"https://github.com/{repo}.git")

    with tempfile.TemporaryDirectory(prefix=f"sweverify_{inst_id.replace('/', '_')}_") as tmpdir:
        work_dir = Path(tmpdir)
        repo_dir = work_dir / "repo"
        env_name = f"swebench_{inst_id.replace('__', '_').replace('-', '_')[:40]}"

        # Clone
        print(f"  Cloning {repo_url}...")
        rc, out, err = run(["git", "clone", repo_url, str(repo_dir)], timeout=180)
        if rc != 0:
            return {"instance_id": inst_id, "resolved": False, "error": f"clone failed: {err[:200]}"}

        # Step 1: Checkout environment_setup_commit to find environment.yml
        if env_setup_commit:
            # The fetch result is ignored on purpose: the checkout below is
            # what decides whether the commit is available.
            run(
                ["git", "fetch", "origin", env_setup_commit],
                cwd=str(repo_dir), timeout=60
            )
            rc, out, err = run(
                ["git", "checkout", env_setup_commit],
                cwd=str(repo_dir), timeout=30
            )
            if rc != 0:
                print(f"  [WARN] Cannot checkout env_setup_commit {env_setup_commit[:12]}, using HEAD")

        try:
            # Step 2: Create conda env
            success, error = setup_conda_env(repo_dir, env_name)
            if not success:
                return {"instance_id": inst_id, "resolved": False, "error": f"env setup: {error[:200]}"}

            # Step 3: Checkout base_commit
            if base_commit:
                run(
                    ["git", "fetch", "origin", base_commit],
                    cwd=str(repo_dir), timeout=60
                )
                rc, out, err = run(
                    ["git", "checkout", base_commit],
                    cwd=str(repo_dir), timeout=30
                )
                if rc != 0:
                    print(f"  [WARN] Cannot checkout base_commit {base_commit[:12]}")
                    # Testing the patch on any other commit would give a
                    # meaningless verdict, so stop here.
                    return {
                        "instance_id": inst_id,
                        "resolved": False,
                        "error": f"cannot checkout base_commit {base_commit[:12]}: {err[:200]}",
                    }

            # Reinstall at base commit
            print("  Reinstalling at base_commit...")
            rc, out, err = run(
                ["conda", "run", "-n", env_name, "pip", "install", "-e", ".", "--quiet"],
                cwd=str(repo_dir), timeout=120
            )
            if rc != 0:
                # Best-effort: the install made during env setup is still present
                print(f"  [WARN] Reinstall at base_commit failed: {err[:200]}")

            # Step 4: Verify
            result = verify_patch(instance, model_patch, work_dir, env_name)
            result["instance_id"] = inst_id
            return result
        finally:
            # Cleanup conda env on every exit path; run() never raises, and
            # removing a non-existent env only yields an ignored non-zero rc.
            print(f"  Cleaning up conda env {env_name}...")
            run(["conda", "env", "remove", "-n", env_name, "-y", "--quiet"], timeout=30)


def verify_batch(
    instances: list,
    patches: dict,
    max_instances: int = 50,
    output_path: str = "/app/verify_results.jsonl",
) -> list:
    """Verify multiple instances with their patches.

    Args:
        instances: Dataset rows; only the first ``max_instances`` are used.
        patches: Mapping of instance_id to patch text. Instances without an
            entry are recorded as ``{"resolved": False, "error": "no patch"}``
            without being verified.
        max_instances: Upper bound on the number of instances processed.
        output_path: JSONL file rewritten after every verified instance with
            all results so far (``test_output`` replaced by its length).
            Missing parent directories are created.

    Returns:
        One result dict per processed instance, in input order.
    """
    results = []
    selected = instances[:max_instances]
    total = len(selected)
    out_file = Path(output_path)

    for position, instance in enumerate(selected, start=1):
        inst_id = instance["instance_id"]
        if inst_id not in patches:
            print(f"  [{position}/{total}] SKIP {inst_id} — no patch")
            results.append({"instance_id": inst_id, "resolved": False, "error": "no patch"})
            continue

        print(f"\n{'#'*60}")
        print(f"  [{position}/{total}] {inst_id}")
        print(f"{'#'*60}")

        result = verify_one(instance, patches[inst_id])
        results.append(result)

        # Save incrementally so a crash mid-batch keeps earlier verdicts
        out_file.parent.mkdir(parents=True, exist_ok=True)
        with open(out_file, "w", encoding="utf-8") as f:
            for r in results:
                r_clean = {k: v for k, v in r.items() if k != "test_output"}
                r_clean["test_output_len"] = len(r.get("test_output", ""))
                f.write(json.dumps(r_clean) + "\n")

    return results


def load_patches_from_results(results_file: str) -> dict:
    """Load patches from a results JSONL file.

    Each non-blank line must be a JSON object with an ``instance_id``. The
    patch is read from ``patch`` or, failing that, from ``model_patch``
    (the key used by SWE-bench prediction files). Records with an empty or
    missing patch are skipped; blank lines are ignored.

    Returns:
        Mapping of instance_id to patch text. A later record for the same
        instance replaces an earlier one.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has a patch but no ``instance_id``.
    """
    patches = {}
    with open(results_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            patch = record.get("patch") or record.get("model_patch")
            if patch:
                patches[record["instance_id"]] = patch
    return patches


if __name__ == "__main__":
    # Command-line entry point: either verify one instance against a patch
    # file, or verify every patch found in a results JSONL file.
    cli = argparse.ArgumentParser(description="Docker-less SWE-bench verification")
    cli.add_argument("--instance", type=str, help="Single instance ID")
    cli.add_argument("--patch", type=str, help="Path to patch file (for single instance)")
    cli.add_argument("--batch", type=str, help="Path to results JSONL with patches")
    cli.add_argument("--max-instances", type=int, default=10)
    args = cli.parse_args()

    # Imported here, after argument parsing, as in the rest of this script
    from datasets import load_dataset

    if args.instance and args.patch:
        # Single instance mode: look the instance up by id in the test split
        dataset = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
        instance = next(
            (dict(row) for row in dataset if row["instance_id"] == args.instance),
            None,
        )
        if not instance:
            print(f"Instance {args.instance} not found")
            sys.exit(1)

        patch_text = Path(args.patch).read_text()
        outcome = verify_one(instance, patch_text)
        print(f"\nRESULT: {json.dumps(outcome, indent=2)}")

    elif args.batch:
        # Batch mode: only dataset rows that have a patch are verified
        patches = load_patches_from_results(args.batch)
        print(f"Loaded {len(patches)} patches from {args.batch}")

        dataset = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
        instances = [dict(row) for row in dataset if row["instance_id"] in patches]

        print(f"Found {len(instances)} instances with patches")

        results = verify_batch(instances, patches, args.max_instances)

        resolved_count = sum(1 for entry in results if entry.get("resolved"))
        print(f"\nVERIFICATION COMPLETE: {resolved_count}/{len(results)} verified")

        # Final dump keeps the full result dicts, including test_output
        with open("/app/verify_results.jsonl", "w") as f:
            f.writelines(json.dumps(entry) + "\n" for entry in results)

    else:
        cli.print_help()