narcolepticchicken
/

agent-cost-optimizer

Model card Files Files and versions

xet

Community

narcolepticchicken committed 21 days ago

Commit

51c9a64

verified ·

1 Parent(s): 29c4a80

Upload smoke_test_v4.py

Browse files

Files changed (1) hide show

smoke_test_v4.py +243 -0

smoke_test_v4.py ADDED Viewed

	@@ -0,0 +1,243 @@

+"""
+Cascade Smoke Test v4 — FILE EDITING APPROACH
+Instead of asking models to generate git diffs (which they can't do reliably),
+they edit files directly using <edit> tags. We run `git diff` to generate
+the patch and `git apply --check test_patch` to verify.
+This is how real SWE-bench agents (SWE-agent, Aider, OpenHands) work.
+Usage via hf_jobs:
+  operation: run
+  script: "https://huggingface.co/narcolepticchicken/agent-cost-optimizer/resolve/main/smoke_test_v4.py"
+  dependencies: ["huggingface_hub", "datasets"]
+  hardware: a10g-largex2
+  timeout: 4h
+  env: {"INSTANCE_ID": "django__django-14315"}
+"""
+import json, os, re, subprocess, sys, tempfile, time, traceback
+from datetime import datetime
+from pathlib import Path
def sh(cmd, cwd=None, timeout=120):
    """Run *cmd* through the shell and return ``(returncode, stdout, stderr)``.

    Args:
        cmd: Shell command line (executed with ``shell=True``).
        cwd: Working directory for the command, or ``None`` for the current one.
        timeout: Seconds to wait before the command is killed.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple of ``int, str, str``.  When the
        command exceeds *timeout* the return code is 124 (the value GNU
        ``timeout`` uses) and a ``[TIMEOUT ...]`` marker is appended to stderr,
        so callers that only look at the return code see an ordinary failure
        instead of an exception.
    """

    def as_text(data):
        # TimeoutExpired may carry bytes (or None) even when text=True was requested.
        if data is None:
            return ""
        if isinstance(data, bytes):
            return data.decode("utf-8", errors="replace")
        return data

    try:
        r = subprocess.run(
            cmd,
            cwd=cwd,
            capture_output=True,
            text=True,
            errors="replace",  # binary output (e.g. `cat` on a .pyc) must not raise
            timeout=timeout,
            shell=True,
        )
    except subprocess.TimeoutExpired as exc:
        stderr = as_text(exc.stderr) + f"\n[TIMEOUT after {timeout}s]"
        return 124, as_text(exc.stdout), stderr
    return r.returncode, r.stdout, r.stderr
def ensure_conda():
    """Return the path of a usable ``conda`` executable, installing Miniconda if needed.

    Looks for an existing install under ``~/miniconda3`` and ``/opt/conda``.
    If neither exists, downloads and runs the Miniconda installer into
    ``~/miniconda3``, applies non-interactive settings, and prepends its
    ``bin`` directory to ``PATH``.

    Returns:
        The conda executable path, or ``None`` when the installation failed
        (so the caller's ``if not conda`` check is meaningful).
    """
    home_conda = os.path.expanduser("~/miniconda3/bin/conda")
    for candidate in (home_conda, "/opt/conda/bin/conda"):
        if os.path.exists(candidate):
            return candidate

    print("📦 Installing Miniconda...")
    try:
        rc, out, err = sh("wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && bash /tmp/miniconda.sh -b -p $HOME/miniconda3", timeout=300)
    except subprocess.TimeoutExpired:
        rc, out, err = 124, "", "installer timed out"
    if rc or not os.path.exists(home_conda):
        # Report the tail of the installer output instead of returning a path that does not exist.
        print(f"  ⚠️ Miniconda install failed: {(out + err)[-200:]}")
        return None

    try:
        # Best-effort configuration; `; true` keeps a missing `tos` subcommand from failing the step.
        sh(f"{home_conda} config --set always_yes yes --set changeps1 no && {home_conda} tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main 2>/dev/null; true", timeout=30)
    except subprocess.TimeoutExpired:
        print("  ⚠️ conda config timed out; continuing with defaults")
    os.environ["PATH"] = os.path.expanduser("~/miniconda3/bin:") + os.environ.get("PATH", "")
    return home_conda
def call_model(client, messages, max_tokens=4096):
    """Request one chat completion and return ``(text, input_tokens, output_tokens)``.

    Args:
        client: Object exposing ``chat.completions.create(...)`` and a ``model``
            attribute.
        messages: Chat history passed through as ``messages``.
        max_tokens: Completion token limit.

    Returns:
        ``(text, input_tokens, output_tokens)``.  ``text`` is always a string
        (an empty one when the response carries no content).  When the
        response has no usage data the input count is 0 and the output count
        is estimated as ``len(text) // 4``.  Any exception is reported as
        ``("[ERROR: ...]", 0, 0)`` rather than raised.
    """
    try:
        c = client.chat.completions.create(model=client.model, messages=messages, max_tokens=max_tokens, temperature=0.2)
        # content may be None; callers take len() of the text and store it in the history.
        t = c.choices[0].message.content or ""
        usage = getattr(c, "usage", None)
        it = usage.prompt_tokens if usage and usage.prompt_tokens is not None else 0
        ot = usage.completion_tokens if usage and usage.completion_tokens is not None else len(t) // 4
        return t, it, ot
    except Exception as e:
        return f"[ERROR: {e}]", 0, 0
def apply_edits(text, repo_dir):
    """Apply every ``<edit path='file.py'>NEW_CONTENT</edit>`` tag found in *text*.

    Each tag replaces the whole content of an existing file under *repo_dir*.
    Processing stops at the first failing edit; edits before it stay applied.

    Args:
        text: Model response that may contain ``<edit>`` tags.
        repo_dir: Repository root that edit paths are relative to.

    Returns:
        ``None`` when at least one edit was found and all were applied, an
        ``"ERROR: ..."`` string for the first edit that could not be applied,
        or ``"WARNING: no <edit> tags found"`` when *text* contains no tags.
    """
    edits = re.findall(r"<edit\s+path=['\"]([^'\"]+)['\"]\s*>(.*?)</edit>", text or "", re.DOTALL)
    root = Path(repo_dir).resolve()
    for filepath, content in edits:
        full_path = (root / filepath).resolve()
        # The path comes from model output: refuse absolute paths and `..`
        # segments (or symlinks) that resolve outside the repository.
        if root not in full_path.parents:
            return f"ERROR: path {filepath} is outside the repository"
        if not full_path.is_file():
            return f"ERROR: file {filepath} does not exist"
        content = content.strip()
        if content:
            # strip() removed the final newline; restore it so `git diff`
            # does not report a spurious "No newline at end of file" hunk.
            content += "\n"
        full_path.write_text(content, encoding="utf-8")
        print(f"  ✏️ Edited {filepath} ({len(content)} bytes)")
    return None if edits else "WARNING: no <edit> tags found"
# Matches a pytest invocation token (optionally already written as
# `python -m pytest`) but not file names such as `pytest.ini`,
# `test_pytest.py` or packages such as `pytest-cov`.
_PYTEST_TOKEN = re.compile(r"(?<![\w./-])(?:python[\d.]*\s+-m\s+)?pytest(?![\w./-])")


def _wrap_pytest(cmd, conda, env_name):
    """Rewrite pytest invocations in *cmd* to run inside the conda env *env_name*."""
    runner = f"{conda} run -n {env_name} python -m pytest"
    # A callable replacement keeps backslashes in the conda path literal.
    return _PYTEST_TOKEN.sub(lambda _match: runner, cmd)


def _working_tree_diff(repo_dir):
    """Return ``git diff`` for *repo_dir* ending in a newline, or ``""`` if there are no changes."""
    try:
        _rc, diff, _err = sh("git diff", cwd=str(repo_dir), timeout=10)
    except subprocess.TimeoutExpired:
        return ""
    if not diff.strip():
        return ""
    # Do not strip(): the final newline and a trailing blank context line are
    # part of the patch format that `git apply` parses.
    return diff if diff.endswith("\n") else diff + "\n"


def run_cascade(instance, repo_dir, conda, env_name):
    """Drive the two-tier model cascade until a patch is produced.

    The smaller model (T1) gets up to 30 turns, then the larger model (T2)
    continues the same conversation for up to 30 more.  On every turn the
    response's ``<edit>`` tags are applied and its ``<bash>`` commands are run
    in *repo_dir*; their results are sent back as a single user message.  A
    ``<submit>`` tag ends the run as soon as ``git diff`` is non-empty.

    Args:
        instance: Task record; ``repo`` and ``problem_statement`` are read.
        repo_dir: Checked-out repository the models work in.
        conda: Path to the conda executable used to run pytest.
        env_name: Conda environment that pytest invocations are routed through.

    Returns:
        A dict with keys ``patch`` (diff text or ``None``), ``tier``, ``turns``,
        ``input_tokens`` and ``output_tokens``.  Token counts are cumulative
        over both tiers; ``turns`` counts turns within the reported tier.
    """
    from huggingface_hub import InferenceClient
    T1, T2 = "meta-llama/Llama-3.1-8B-Instruct", "meta-llama/Llama-3.3-70B-Instruct"
    problem = instance.get("problem_statement","")
    system = f"""You are fixing a bug in {instance['repo']}. Repository at {repo_dir}.
YOU EDIT FILES DIRECTLY — do NOT generate patches. Format:
TO EXPLORE: <bash>ls, find, grep, cat, git log, pytest commands</bash>
TO FIX: <edit path='relative/path.py'>
complete new file content here
</edit>
TO FINISH: <submit>Done</submit>
Workflow:
1. Explore the codebase to find the bug
2. Read the relevant file(s) with cat
3. Edit the file with the fix using <edit>
4. Verify with pytest
5. Submit when done"""
    messages = [
        {"role":"system","content":system},
        {"role":"user","content":f"PROBLEM:\n{problem}\n\nExplore the repository to find the relevant code."}
    ]
    # Token counters live outside the tier loop so the reported cost includes
    # the turns spent in T1 before escalating to T2.
    ti = to = 0
    tier_name, max_turns = None, 0
    for tier_name, mid, max_turns in [("T1",T1,30),("T2",T2,30)]:
        print(f"\n[{tier_name}] {mid}")
        client = InferenceClient(mid)
        for turn in range(max_turns):
            text, it, ot = call_model(client, messages, 4096)
            text = text or ""
            ti += it or 0
            to += ot or 0
            messages.append({"role":"assistant","content":text})
            print(f"  Turn {turn+1}: {it}+{ot} tok, {len(text)} ch")

            # Everything the model needs to hear about this turn is collected
            # here and sent as ONE user message, keeping roles alternating.
            feedback = []

            # Apply edits
            edit_result = apply_edits(text, repo_dir)

            # Run bash commands
            cmds = re.findall(r'<bash>(.*?)</bash>', text, re.DOTALL)
            for cmd in cmds:
                cmd = _wrap_pytest(cmd.strip(), conda, env_name)
                print(f"  $ {cmd[:120]}")
                try:
                    rc, out, err = sh(cmd, cwd=str(repo_dir), timeout=60)
                except subprocess.TimeoutExpired:
                    # A hanging model-issued command must not abort the whole run.
                    rc, out, err = 124, "", "command timed out after 60s"
                o = (out+err)[:1500]
                if rc: o += f" [EXIT:{rc}]"
                feedback.append(f"<output>\n{o}\n</output>")

            submitted = "<submit>" in text
            if edit_result:
                # Errors are always reported (once).  The "no <edit> tags"
                # warning is only sent when the turn did nothing at all, so
                # pure exploration turns are not flagged as mistakes.
                if edit_result.startswith("ERROR") or not (cmds or submitted):
                    feedback.append(edit_result)

            if submitted:
                # Generate diff
                diff = _working_tree_diff(repo_dir)
                if diff:
                    print(f"  ✅ Submitted — diff: {len(diff)} chars")
                    return {"patch":diff,"tier":tier_name,"turns":turn+1,"input_tokens":ti,"output_tokens":to}
                print(f"  ⚠️ Submitted but no diff — no changes made?")
                feedback.append("No changes detected. Did you edit any files?")

            if not feedback:
                # Only reached after a successful edit with no commands.
                feedback.append("Edits applied. Verify with pytest, then <submit>Done</submit> when finished.")
            messages.append({"role":"user","content":"\n".join(feedback)})

    # If exhausted, check if there's a diff anyway
    diff = _working_tree_diff(repo_dir)
    if diff:
        print(f"  → Found unsubmitted diff: {len(diff)} chars")
        return {"patch":diff,"tier":tier_name,"turns":max_turns,"input_tokens":ti,"output_tokens":to}
    return {"patch":None,"tier":None,"turns":0,"input_tokens":ti,"output_tokens":to}
def _as_test_list(value):
    """Normalise a test-ID field to a list of strings.

    Accepts a real list, a JSON-encoded list stored as a string, or a
    whitespace-separated string; ``None`` and empty values give ``[]``.
    """
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except ValueError:
            value = value.split()
    if not value:
        return []
    if not isinstance(value, (list, tuple)):
        value = [value]
    return [str(v) for v in value]


def verify_patch(instance, model_patch, repo_dir, conda, env_name):
    """Apply *model_patch* to a clean checkout and run the instance's tests.

    Steps: reset *repo_dir* to ``base_commit``, apply the model patch, apply
    the instance's ``test_patch`` (best effort), run up to 10 FAIL_TO_PASS
    tests with pytest and, if they pass, up to 10 PASS_TO_PASS tests.

    Args:
        instance: Task record; ``base_commit``, ``test_patch``,
            ``FAIL_TO_PASS`` and ``PASS_TO_PASS`` are read.
        model_patch: Unified diff produced by the agent.
        repo_dir: Repository checkout to verify in.
        conda: Path to the conda executable.
        env_name: Conda environment used to run pytest.

    Returns:
        A dict with ``resolved`` (bool) plus ``error`` and/or ``test_output``.
    """
    import shlex

    def write_patch(name, body):
        # `git apply` rejects a patch whose last line has no newline.
        body = body or ""
        if body and not body.endswith("\n"):
            body += "\n"
        (Path(repo_dir) / name).write_text(body, encoding="utf-8")

    def pytest_cmd(test_ids):
        # Quote each ID: test names may contain spaces, brackets or parentheses.
        ids = " ".join(shlex.quote(t) for t in test_ids[:10])
        return f"cd {repo_dir} && {conda} run -n {env_name} python -m pytest -v --tb=short -x {ids}"

    base = instance.get("base_commit","")
    tp = instance.get("test_patch","")
    # The test-ID fields may arrive as JSON-encoded strings rather than lists;
    # slicing such a string would select characters, not tests.
    f2p = _as_test_list(instance.get("FAIL_TO_PASS"))
    p2p = _as_test_list(instance.get("PASS_TO_PASS"))
    if not f2p:
        # Without explicit IDs pytest would collect and run the whole suite.
        return {"resolved":False,"error":"no FAIL_TO_PASS tests listed"}

    try:
        # Reset and apply
        sh(f"cd {repo_dir} && git checkout -f {base} && git clean -fd", timeout=30)
        write_patch("_aco.patch", model_patch)
        rc, out, err = sh(f"cd {repo_dir} && git apply --check _aco.patch", timeout=10)
        if rc: return {"resolved":False,"error":f"patch check: {err[:200]}"}
        rc, out, err = sh(f"cd {repo_dir} && git apply _aco.patch", timeout=10)
        if rc: return {"resolved":False,"error":f"patch apply: {err[:200]}"}
        write_patch("_t.patch", tp)
        # Best effort: fall back to --reject so hunks that still apply are kept.
        sh(f"cd {repo_dir} && (git apply _t.patch) || git apply --reject _t.patch 2>/dev/null; true", timeout=10)

        print(f"  F2P: pytest {' '.join(f2p[:3])}...")
        rc, out, err = sh(pytest_cmd(f2p), timeout=300)
        if rc == 0:
            if p2p:
                rc2, out2, err2 = sh(pytest_cmd(p2p), timeout=300)
                if rc2: return {"resolved":False,"error":f"P2P: {(out2+err2)[:200]}"}
            return {"resolved":True,"test_output":(out+err)[:500]}
    except subprocess.TimeoutExpired as exc:
        return {"resolved":False,"error":f"timeout: {exc}"}
    return {"resolved":False,"error":f"{len(re.findall(r'FAILED',out+err))} F2P failures","test_output":(out+err)[:500]}
def setup_env(conda, instance, repo_dir, env_name):
    """Check out the instance's base commit and build its conda environment.

    Args:
        conda: Path to the conda executable.
        instance: Task record; ``environment_setup_commit`` (optional) and
            ``base_commit`` (required) are read.
        repo_dir: Existing clone of the instance's repository.
        env_name: Name of the conda environment to create.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, message)``.  A failed
        ``pip install`` only prints a warning and is not treated as fatal.
    """

    def tail(text, lines=3):
        # Done in Python rather than with `| tail -3`: a shell pipe would
        # replace the command's exit status with tail's.
        return "\n".join(text.strip().splitlines()[-lines:])[:200]

    try:
        ec = instance.get("environment_setup_commit","")
        if ec:
            # NOTE(review): this checkout is overridden by the base-commit
            # checkout below before anything is installed — confirm it is needed.
            sh(f"cd {repo_dir} && git fetch origin {ec} && git checkout {ec}", timeout=60)
        base = instance["base_commit"]
        rc, out, err = sh(f"cd {repo_dir} && git fetch origin {base} && git checkout {base}", timeout=60)
        if rc:
            # Continuing would let the agent work on the wrong revision.
            return False, f"checkout {base[:12]}: {tail(out + err)}"

        create = f"{conda} create -n {env_name} python=3.10 pip -y"
        for _attempt in range(2):  # one retry
            rc, out, err = sh(create, timeout=300)
            if rc == 0:
                break
        else:
            return False, f"conda: {tail(out + err)}"

        # Prefer an editable install so the agent's edits are what the tests
        # import; fall back to a regular install only if that fails.
        rc, out, err = sh(f"cd {repo_dir} && {conda} run -n {env_name} pip install -e .", timeout=300)
        if rc:
            rc, out, err = sh(f"cd {repo_dir} && {conda} run -n {env_name} pip install .", timeout=300)
        if rc:
            print(f"  ⚠️ pip install failed: {tail(out + err)}")
    except subprocess.TimeoutExpired as exc:
        return False, f"timeout: {exc}"
    return True, ""
def main():
    """Run the smoke test for the instance named by ``INSTANCE_ID``.

    Pipeline: locate or install conda, load the instance from
    ``princeton-nlp/SWE-bench_Verified``, clone its repository into a
    temporary directory, build the environment, run the model cascade and
    verify the resulting patch.  The outcome is written to
    ``smoke_result.json`` in the current directory.

    Returns:
        0 when the patch resolved the instance, 1 otherwise.  Setup failures
        and a missing patch exit the process directly with status 1.
    """
    import datasets
    IID = os.environ.get("INSTANCE_ID","django__django-14315")
    print(f"🚀 CASCADE SMOKE TEST v4 (edit-based) — {IID} — {datetime.now().isoformat()}")
    conda = ensure_conda()
    if not conda: print("❌ No conda"); sys.exit(1)
    ds = datasets.load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
    instance = next((dict(r) for r in ds if r["instance_id"]==IID), None)
    if not instance: print(f"❌ {IID}"); sys.exit(1)

    # FAIL_TO_PASS may be a JSON-encoded string; len() of the raw value would
    # then count characters instead of tests.
    f2p = instance.get("FAIL_TO_PASS", [])
    if isinstance(f2p, str):
        try:
            f2p = json.loads(f2p)
        except ValueError:
            f2p = f2p.split()
    if not isinstance(f2p, (list, tuple)):
        f2p = [f2p] if f2p else []
    print(f"Repo: {instance['repo']} Base: {instance['base_commit'][:12]} F2P: {len(f2p)}\n")

    with tempfile.TemporaryDirectory(prefix="aco_v4_") as tmpdir:
        repo_dir = Path(tmpdir) / "repo"
        env_name = f"aco_{IID.replace('__','_').replace('-','_')[:30]}"
        print(f"[1/4] Clone...")
        t0=time.time()
        url=f"https://github.com/{instance['repo']}.git"
        rc,out,err=sh(f"git clone --depth 100 {url} {repo_dir}", timeout=600)
        if rc: print(f"❌ Clone: {err[:200]}"); sys.exit(1)
        ct=time.time()-t0; print(f"  ({ct:.0f}s)")
        try:
            print(f"[2/4] Env...")
            t0=time.time()
            ok,err=setup_env(conda,instance,repo_dir,env_name)
            et=time.time()-t0
            if not ok: print(f"❌ {err}"); sys.exit(1)
            print(f"  ({et:.0f}s)")
            print(f"[3/4] Cascade (edit-based)...")
            t0=time.time()
            agent=run_cascade(instance,repo_dir,conda,env_name)
            at=time.time()-t0
            if not agent["patch"]:
                result={"instance_id":IID,"resolved":False,"error":"No patch","times":{"clone":ct,"env":et,"agent":at}}
                with open("smoke_result.json","w") as f: json.dump(result,f,indent=2)
                print(f"\n❌ No patch\nSaved: smoke_result.json")
                sys.exit(1)
            print(f"\n✅ {agent['tier']} {agent['turns']}t {agent['input_tokens']}+{agent['output_tokens']}tok {at:.0f}s\n")
            print(f"[4/4] Verify...")
            t0=time.time()
            verify=verify_patch(instance,agent["patch"],repo_dir,conda,env_name)
            vt=time.time()-t0
            resolved=verify["resolved"]
            status="✅ RESOLVED" if resolved else "❌ FAILED"
            print(f"\n{'='*50}\n{status}\n{'='*50}")
            print(f"Tier: {agent['tier']} Turns: {agent['turns']}")
            print(f"Times: clone={ct:.0f}s env={et:.0f}s agent={at:.0f}s verify={vt:.0f}s")
            if not resolved: print(f"Error: {verify.get('error','?')[:300]}")
            result={"instance_id":IID,"repo":instance["repo"],"resolved":resolved,"tier":agent["tier"],"turns":agent["turns"],"input_tokens":agent["input_tokens"],"output_tokens":agent["output_tokens"],"times":{"clone":ct,"env":et,"agent":at,"verify":vt},"error":verify.get("error"),"patch_preview":agent["patch"][:500],"timestamp":datetime.now().isoformat()}
            with open("smoke_result.json","w") as f: json.dump(result,f,indent=2)
            print(f"\nSaved: smoke_result.json")
        finally:
            # Remove the conda env on every exit path (including sys.exit and
            # exceptions), not only after a completed verification.
            try:
                sh(f"{conda} env remove -n {env_name} -y --quiet 2>/dev/null; true", timeout=30)
            except subprocess.TimeoutExpired:
                print(f"  ⚠️ Timed out removing conda env {env_name}")
    print("\n🏁 DONE")
    return 0 if resolved else 1
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit
    # status.  Any uncaught exception is printed with its traceback and
    # mapped to status 1; SystemExit raised inside main() passes through.
    try:
        exit_status = main()
    except Exception as e:
        print(f"💥 {e}")
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(exit_status)