narcolepticchicken committed on
Commit
51c9a64
·
verified ·
1 Parent(s): 29c4a80

Upload smoke_test_v4.py

Browse files
Files changed (1) hide show
  1. smoke_test_v4.py +243 -0
smoke_test_v4.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cascade Smoke Test v4 — FILE EDITING APPROACH
3
+
4
+ Instead of asking models to generate git diffs (which they can't do reliably),
5
+ they edit files directly using <edit> tags. We run `git diff` to generate
6
+ the patch and `git apply --check test_patch` to verify.
7
+
8
+ This is how real SWE-bench agents (SWE-agent, Aider, OpenHands) work.
9
+
10
+ Usage via hf_jobs:
11
+ operation: run
12
+ script: "https://huggingface.co/narcolepticchicken/agent-cost-optimizer/resolve/main/smoke_test_v4.py"
13
+ dependencies: ["huggingface_hub", "datasets"]
14
+ hardware: a10g-largex2
15
+ timeout: 4h
16
+ env: {"INSTANCE_ID": "django__django-14315"}
17
+ """
18
+
19
+ import json, os, re, subprocess, sys, tempfile, time, traceback
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+
23
def sh(cmd, cwd=None, timeout=120):
    """Run ``cmd`` through the shell and return ``(returncode, stdout, stderr)``.

    Args:
        cmd: Shell command line (executed with ``shell=True``).
        cwd: Working directory for the command, or ``None`` for the current one.
        timeout: Seconds to wait before the command is killed.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple of ``int, str, str``.  When the
        command exceeds ``timeout`` the return code is 124 (the convention used
        by ``timeout(1)``), whatever output was captured so far is returned and
        a ``[TIMEOUT after Ns]`` marker is appended to stderr.
    """
    def _as_text(value):
        # TimeoutExpired may carry None or raw bytes even in text mode.
        if value is None:
            return ""
        if isinstance(value, bytes):
            return value.decode("utf-8", "replace")
        return value

    try:
        r = subprocess.run(
            cmd,
            cwd=cwd,
            capture_output=True,
            text=True,
            # Decode explicitly so binary output or an ASCII locale cannot raise.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
            shell=True,
        )
    except subprocess.TimeoutExpired as exc:
        # A slow command must not abort the whole run; report it to the caller.
        return 124, _as_text(exc.stdout), _as_text(exc.stderr) + f"\n[TIMEOUT after {timeout}s]"
    return r.returncode, r.stdout, r.stderr
26
+
27
def ensure_conda():
    """Return the path of a ``conda`` executable, installing Miniconda if needed.

    Looks for an existing conda at ``~/miniconda3/bin/conda`` and
    ``/opt/conda/bin/conda``.  If neither exists, downloads the Miniconda
    installer and installs it into ``~/miniconda3``.

    Returns:
        The path to the conda executable, or ``None`` when the installation
        failed (so callers can test the result for truthiness).
    """
    home_conda = os.path.expanduser("~/miniconda3/bin/conda")
    for p in [home_conda, "/opt/conda/bin/conda"]:
        if os.path.exists(p):
            return p

    print("📦 Installing Miniconda...")
    rc, out, err = sh("wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && bash /tmp/miniconda.sh -b -p $HOME/miniconda3", timeout=300)
    if rc or not os.path.exists(home_conda):
        # Previously the path was returned even though nothing was installed.
        print(f"❌ Miniconda install failed: {(out + err)[-300:]}")
        return None

    p = home_conda
    # Best-effort configuration; failures here are deliberately ignored ("; true").
    sh(f"{p} config --set always_yes yes --set changeps1 no && {p} tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main 2>/dev/null; true", timeout=30)
    os.environ["PATH"] = os.path.expanduser("~/miniconda3/bin:") + os.environ.get("PATH", "")
    return p
36
+
37
def call_model(client, messages, max_tokens=4096):
    """Send ``messages`` to the chat model behind ``client`` and return its reply.

    Args:
        client: Object exposing ``client.model`` and
            ``client.chat.completions.create(...)``.
        messages: Chat history as a list of ``{"role", "content"}`` dicts.
        max_tokens: Completion token limit passed to the API.

    Returns:
        ``(text, input_tokens, output_tokens)``.  A reply whose content is
        ``None`` is returned as ``""``.  When the response carries no usage
        data, input tokens are 0 and output tokens are estimated as
        ``len(text) // 4``.  Any exception is reported as
        ``("[ERROR: <message>]", 0, 0)`` instead of being raised.
    """
    try:
        c = client.chat.completions.create(model=client.model, messages=messages, max_tokens=max_tokens, temperature=0.2)
        # content may be None; treat it as an empty reply rather than an error.
        t = c.choices[0].message.content or ""
        usage = getattr(c, "usage", None)
        if usage:
            it = usage.prompt_tokens
            ot = usage.completion_tokens
        else:
            it = 0
            ot = len(t) // 4  # rough estimate when the API reports no usage
        return t, it, ot
    except Exception as e:
        # Top-level boundary for API failures: the caller receives the error as text.
        return f"[ERROR: {e}]", 0, 0
45
+
46
def apply_edits(text, repo_dir):
    """Apply <edit> tags: <edit path='file.py'>NEW_CONTENT</edit>

    Each tag replaces the whole content of an existing file under
    ``repo_dir``.  Leading blank lines and trailing whitespace around the
    content are removed and exactly one trailing newline is written, so a
    rewritten file does not pick up a "No newline at end of file" diff.

    Args:
        text: Model output that may contain zero or more ``<edit>`` tags.
        repo_dir: Root directory that the tag paths are relative to.

    Returns:
        ``None`` when at least one tag was found and every edit was applied;
        ``"WARNING: no <edit> tags found"`` when there were no tags;
        otherwise one ``"ERROR: ..."`` line per rejected edit (valid edits in
        the same message are still applied).
    """
    edits = re.findall(r"<edit\s+path=['\"]([^'\"]+)['\"]\s*>(.*?)</edit>", text, re.DOTALL)
    if not edits:
        return "WARNING: no <edit> tags found"

    root = Path(repo_dir).resolve()
    errors = []
    for filepath, content in edits:
        full_path = (root / filepath).resolve()
        # Paths come from model output: refuse absolute paths and ".." escapes.
        if root not in full_path.parents:
            errors.append(f"ERROR: path {filepath} is outside the repository")
            continue
        # is_file() also rejects directories, which would crash write_text().
        if not full_path.is_file():
            errors.append(f"ERROR: file {filepath} does not exist")
            continue
        body = re.sub(r"\A(?:[ \t]*\r?\n)+", "", content).rstrip()
        content = body + "\n" if body else ""
        full_path.write_text(content, encoding="utf-8")
        print(f" ✏️ Edited {filepath} ({len(content)} bytes)")
    return "\n".join(errors) if errors else None
57
+
58
def run_cascade(instance, repo_dir, conda, env_name):
    """Let a small model, then a larger one, try to fix the bug by editing files.

    Each tier gets up to 30 turns.  Per turn the model reply is scanned for
    ``<edit>`` tags (applied via ``apply_edits``), ``<bash>`` tags (executed in
    ``repo_dir``) and ``<submit>``.  The conversation history is shared
    between tiers.

    Args:
        instance: Task record; ``repo`` is required, ``problem_statement`` optional.
        repo_dir: Checked-out repository the model works on.
        conda: Path to the conda executable used to run pytest.
        env_name: Name of the conda environment used to run pytest.

    Returns:
        A dict with keys ``patch`` (``git diff`` output or ``None``), ``tier``,
        ``turns``, ``input_tokens`` and ``output_tokens``.  Token counts cover
        only the tier that produced the patch.
    """
    from huggingface_hub import InferenceClient
    T1, T2 = "meta-llama/Llama-3.1-8B-Instruct", "meta-llama/Llama-3.3-70B-Instruct"
    problem = instance.get("problem_statement","")

    system = f"""You are fixing a bug in {instance['repo']}. Repository at {repo_dir}.

YOU EDIT FILES DIRECTLY — do NOT generate patches. Format:

TO EXPLORE: <bash>ls, find, grep, cat, git log, pytest commands</bash>
TO FIX: <edit path='relative/path.py'>
complete new file content here
</edit>
TO FINISH: <submit>Done</submit>

Workflow:
1. Explore the codebase to find the bug
2. Read the relevant file(s) with cat
3. Edit the file with the fix using <edit>
4. Verify with pytest
5. Submit when done"""

    messages = [
        {"role":"system","content":system},
        {"role":"user","content":f"PROBLEM:\n{problem}\n\nExplore the repository to find the relevant code."}
    ]

    def _run(cmd, timeout):
        # Keep a slow command from aborting the cascade even if sh() raises.
        try:
            return sh(cmd, cwd=str(repo_dir), timeout=timeout)
        except subprocess.TimeoutExpired:
            return 124, "", f"[TIMEOUT after {timeout}s]"

    for tier_name, mid, max_turns in [("T1",T1,30),("T2",T2,30)]:
        print(f"\n[{tier_name}] {mid}")
        client = InferenceClient(mid)
        ti = to = 0
        for turn in range(max_turns):
            text, it, ot = call_model(client, messages, 4096)
            ti += it; to += ot
            messages.append({"role":"assistant","content":text})
            print(f" Turn {turn+1}: {it}+{ot} tok, {len(text)} ch")

            # All feedback for this turn is sent as a single user message.
            feedback = []

            # Apply edits
            edit_result = apply_edits(text, repo_dir)

            # Run bash commands
            # NOTE(review): this executes model-generated shell commands unsandboxed.
            cmds = re.findall(r'<bash>(.*?)</bash>', text, re.DOTALL)
            for cmd in cmds:
                cmd = _route_pytest_to_env(cmd.strip(), conda, env_name)
                print(f" $ {cmd[:120]}")
                rc, out, err = _run(cmd, 60)
                o = (out+err)[:1500]
                if rc: o += f" [EXIT:{rc}]"
                feedback.append(f"<output>\n{o}\n</output>")

            submitted = "<submit>" in text
            if edit_result is None:
                feedback.append("Edits applied.")
            elif "ERROR" in edit_result or not (cmds or submitted):
                # Errors are always reported (once); the "no <edit> tags"
                # warning only when the turn contained no action at all.
                feedback.append(edit_result)

            if submitted:
                # Generate diff.  It is returned unstripped: stripping can
                # remove a trailing blank context line and the final newline.
                rc, diff, err = _run("git diff", 10)
                if diff.strip():
                    print(f" ✅ Submitted — diff: {len(diff)} chars")
                    return {"patch":diff,"tier":tier_name,"turns":turn+1,"input_tokens":ti,"output_tokens":to}
                print(f" ⚠️ Submitted but no diff — no changes made?")
                feedback.append("No changes detected. Did you edit any files?")

            if feedback:
                messages.append({"role":"user","content":"\n".join(feedback)})

        # If exhausted, check if there's a diff anyway
        rc, diff, err = _run("git diff", 10)
        if diff.strip():
            print(f" → Found unsubmitted diff: {len(diff)} chars")
            return {"patch":diff,"tier":tier_name,"turns":max_turns,"input_tokens":ti,"output_tokens":to}

    return {"patch":None,"tier":None,"turns":0,"input_tokens":0,"output_tokens":0}


def _route_pytest_to_env(cmd, conda, env_name):
    """Rewrite pytest invocations in ``cmd`` to run inside the conda env.

    Only ``pytest`` / ``python -m pytest`` in command position (start of a
    line or after ``;``, ``&``, ``|``, ``(``, optionally behind ``VAR=value``
    assignments) is rewritten, so ``cat pytest.ini`` or ``grep pytest`` are
    left untouched.
    """
    runner = f"{conda} run -n {env_name} python -m pytest"
    pattern = re.compile(
        r"(^\s*|[;&|(]\s*)((?:\w+=\S*\s+)*)(?:python3?\s+-m\s+)?pytest(?=\s|$)",
        re.MULTILINE,
    )
    # A function replacement avoids backslash interpretation in the runner path.
    return pattern.sub(lambda m: m.group(1) + m.group(2) + runner, cmd)
133
+
134
def verify_patch(instance, model_patch, repo_dir, conda, env_name):
    """Apply ``model_patch`` and the instance's test patch, then run the tests.

    The repository is reset to ``base_commit``, the model patch is checked and
    applied with ``git apply``, the test patch is applied best-effort, and up
    to ten ``FAIL_TO_PASS`` tests (then up to ten ``PASS_TO_PASS`` tests) are
    run with pytest inside the conda environment.

    Args:
        instance: Task record; reads ``base_commit``, ``test_patch``,
            ``FAIL_TO_PASS`` and ``PASS_TO_PASS``.
        model_patch: Unified diff produced by the model run.
        repo_dir: Repository checkout to verify in.
        conda: Path to the conda executable.
        env_name: Name of the conda environment that runs pytest.

    Returns:
        ``{"resolved": True, "test_output": ...}`` on success, otherwise
        ``{"resolved": False, "error": ...}`` (plus ``test_output`` when the
        FAIL_TO_PASS run itself failed).
    """
    import shlex

    def _test_ids(value):
        # The dataset stores these fields as JSON-encoded strings; slicing the
        # raw string would yield single characters instead of test ids.
        if isinstance(value, str):
            try:
                value = json.loads(value)
            except ValueError:
                value = [value] if value.strip() else []
        return list(value or [])

    def _run(cmd, timeout):
        # Report a timeout as a failed command even if sh() raises for it.
        try:
            return sh(cmd, cwd=str(repo_dir), timeout=timeout)
        except subprocess.TimeoutExpired:
            return 124, "", f"[TIMEOUT after {timeout}s]"

    base = instance.get("base_commit","")
    tp = instance.get("test_patch","")
    f2p = _test_ids(instance.get("FAIL_TO_PASS",[]))
    if not f2p:
        # Without test ids pytest would run the whole suite and prove nothing.
        return {"resolved":False,"error":"no FAIL_TO_PASS tests listed"}

    # Reset and apply
    rc, out, err = _run(f"git checkout -f {base} && git clean -fd", 30)
    if rc: return {"resolved":False,"error":f"reset: {err[:200]}"}
    # git apply needs the final newline; add it if the patch was stripped.
    patch_text = model_patch if model_patch.endswith("\n") else model_patch + "\n"
    (Path(repo_dir)/"_aco.patch").write_text(patch_text, encoding="utf-8")
    rc, out, err = _run("git apply --check _aco.patch", 10)
    if rc: return {"resolved":False,"error":f"patch check: {err[:200]}"}
    rc, out, err = _run("git apply _aco.patch", 10)
    if rc: return {"resolved":False,"error":f"patch apply: {err[:200]}"}

    if tp.strip():
        (Path(repo_dir)/"_t.patch").write_text(tp, encoding="utf-8")
        rc, out, err = _run("git apply _t.patch", 10)
        if rc:
            # Best-effort fallback, as before, but no longer silent.
            rc, out, err = _run("git apply --reject _t.patch", 10)
            if rc: print(f" ⚠️ test patch did not apply cleanly: {err[:200]}")

    pytest_cmd = f"{conda} run -n {env_name} python -m pytest -v --tb=short -x"
    # Test ids may contain spaces, brackets or parentheses, so quote them.
    # NOTE(review): for some repositories the ids may not be pytest node ids — confirm.
    cmd = f"{pytest_cmd} {' '.join(shlex.quote(t) for t in f2p[:10])}"
    print(f" F2P: pytest {' '.join(f2p[:3])}...")
    rc, out, err = _run(cmd, 300)

    if rc == 0:
        p2p = _test_ids(instance.get("PASS_TO_PASS",[]))
        if p2p:
            cmd2 = f"{pytest_cmd} {' '.join(shlex.quote(t) for t in p2p[:10])}"
            rc2, out2, err2 = _run(cmd2, 300)
            if rc2: return {"resolved":False,"error":f"P2P: {(out2+err2)[:200]}"}
        return {"resolved":True,"test_output":(out+err)[:500]}
    return {"resolved":False,"error":f"{len(re.findall(r'FAILED',out+err))} F2P failures","test_output":(out+err)[:500]}
162
+
163
def setup_env(conda, instance, repo_dir, env_name):
    """Check out the base commit and build the conda environment for the tests.

    Args:
        conda: Path to the conda executable.
        instance: Task record; ``base_commit`` is required,
            ``environment_setup_commit`` is optional.
        repo_dir: Existing clone of the repository.
        env_name: Name of the conda environment to create.

    Returns:
        ``(True, "")`` on success, or ``(False, message)`` when the base commit
        cannot be checked out or the environment cannot be created.  A failed
        project or pytest install is only reported as a warning.
    """
    repo = str(repo_dir)
    ec = instance.get("environment_setup_commit","")
    if ec:
        # NOTE(review): nothing is installed while this commit is checked out,
        # so this step currently only fetches it — confirm the intent.
        sh(f"git fetch origin {ec} && git checkout {ec}", cwd=repo, timeout=60)
    base = instance["base_commit"]
    # Try the local checkout first; fetch only if the shallow clone lacks the commit.
    rc, out, err = sh(f"git checkout {base} || (git fetch origin {base} && git checkout {base})", cwd=repo, timeout=60)
    if rc: return False, f"checkout {base[:12]}: {(out+err)[-200:]}"

    # No "| tail" here: a pipe would report tail's exit status, hiding conda failures.
    create = f"{conda} create -n {env_name} python=3.10 pip -y"
    rc, out, err = sh(create, timeout=300)
    if rc:
        rc, out, err = sh(create, timeout=300)  # one retry
        if rc: return False, f"conda: {(out+err)[-200:]}"

    pip = f"{conda} run -n {env_name} pip install"
    rc, out, err = sh(f"{pip} -e .", cwd=repo, timeout=300)
    if rc:
        # A regular install is only a fallback; running it unconditionally
        # would replace the editable install with a snapshot copy.
        rc, out, err = sh(f"{pip} .", cwd=repo, timeout=300)
        if rc: print(f" ⚠️ project install failed: {(out+err)[-200:]}")
    # The tests are run with "python -m pytest", so pytest must be in the env.
    rc, out, err = sh(f"{pip} pytest", cwd=repo, timeout=300)
    if rc: print(f" ⚠️ pytest install failed: {(out+err)[-200:]}")
    return True, ""
176
+
177
def main():
    """Run the smoke test for one SWE-bench instance and write smoke_result.json.

    The instance is selected with the ``INSTANCE_ID`` environment variable.
    Steps: clone the repository, build the conda environment, run the model
    cascade, verify the resulting patch.  The conda environment is removed on
    every exit path.

    Returns:
        0 when the patch resolved the instance, 1 otherwise.  Setup failures
        and a missing patch terminate the process with ``sys.exit(1)``.
    """
    import datasets
    IID = os.environ.get("INSTANCE_ID","django__django-14315")
    print(f"🚀 CASCADE SMOKE TEST v4 (edit-based) — {IID} — {datetime.now().isoformat()}")

    conda = ensure_conda()
    if not conda: print("❌ No conda"); sys.exit(1)

    ds = datasets.load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
    instance = next((dict(r) for r in ds if r["instance_id"]==IID), None)
    if not instance: print(f"❌ {IID}"); sys.exit(1)
    # FAIL_TO_PASS may be a JSON-encoded string; decode it so the count is
    # the number of tests rather than the number of characters.
    f2p = instance.get("FAIL_TO_PASS",[])
    if isinstance(f2p, str):
        try:
            f2p = json.loads(f2p)
        except ValueError:
            f2p = [f2p]
    print(f"Repo: {instance['repo']} Base: {instance['base_commit'][:12]} F2P: {len(f2p)}\n")

    with tempfile.TemporaryDirectory(prefix="aco_v4_") as tmpdir:
        repo_dir = Path(tmpdir) / "repo"
        env_name = f"aco_{IID.replace('__','_').replace('-','_')[:30]}"

        try:
            print(f"[1/4] Clone...")
            t0=time.time()
            url=f"https://github.com/{instance['repo']}.git"
            rc,out,err=sh(f"git clone --depth 100 {url} {repo_dir}", timeout=600)
            if rc: print(f"❌ Clone: {err[:200]}"); sys.exit(1)
            ct=time.time()-t0; print(f" ({ct:.0f}s)")

            print(f"[2/4] Env...")
            t0=time.time()
            ok,err=setup_env(conda,instance,repo_dir,env_name)
            et=time.time()-t0
            if not ok: print(f"❌ {err}"); sys.exit(1)
            print(f" ({et:.0f}s)")

            print(f"[3/4] Cascade (edit-based)...")
            t0=time.time()
            agent=run_cascade(instance,repo_dir,conda,env_name)
            at=time.time()-t0

            if not agent["patch"]:
                result={"instance_id":IID,"resolved":False,"error":"No patch","times":{"clone":ct,"env":et,"agent":at}}
                with open("smoke_result.json","w",encoding="utf-8") as f: json.dump(result,f,indent=2)
                print(f"\n❌ No patch\nSaved: smoke_result.json")
                sys.exit(1)

            print(f"\n✅ {agent['tier']} {agent['turns']}t {agent['input_tokens']}+{agent['output_tokens']}tok {at:.0f}s\n")

            print(f"[4/4] Verify...")
            t0=time.time()
            verify=verify_patch(instance,agent["patch"],repo_dir,conda,env_name)
            vt=time.time()-t0

            status="✅ RESOLVED" if verify["resolved"] else "❌ FAILED"
            print(f"\n{'='*50}\n{status}\n{'='*50}")
            print(f"Tier: {agent['tier']} Turns: {agent['turns']}")
            print(f"Times: clone={ct:.0f}s env={et:.0f}s agent={at:.0f}s verify={vt:.0f}s")
            if not verify["resolved"]: print(f"Error: {verify.get('error','?')[:300]}")

            result={"instance_id":IID,"repo":instance["repo"],"resolved":verify["resolved"],"tier":agent["tier"],"turns":agent["turns"],"input_tokens":agent["input_tokens"],"output_tokens":agent["output_tokens"],"times":{"clone":ct,"env":et,"agent":at,"verify":vt},"error":verify.get("error"),"patch_preview":agent["patch"][:500],"timestamp":datetime.now().isoformat()}
            with open("smoke_result.json","w",encoding="utf-8") as f: json.dump(result,f,indent=2)
            print(f"\nSaved: smoke_result.json")
        finally:
            # Remove the env on every exit path, including sys.exit() above.
            # Cleanup is best-effort and must not mask the real outcome.
            try:
                sh(f"{conda} env remove -n {env_name} -y --quiet 2>/dev/null; true", timeout=120)
            except subprocess.TimeoutExpired:
                print(f" ⚠️ could not remove conda env {env_name}")

    print("\n🏁 DONE")
    return 0 if verify["resolved"] else 1
240
+
241
# Script entry point: exit with main()'s status.  sys.exit() raises
# SystemExit, which is not an Exception subclass, so it passes through the
# handler below; only unexpected errors are reported and mapped to exit code 1.
if __name__ == "__main__":
    try:
        sys.exit(main())
    except Exception as e:
        print(f"💥 {e}")
        traceback.print_exc()
        sys.exit(1)