#!/usr/bin/env python3
"""
e2e_test_v3.py — Full End-to-End test suite for v3.0

Tests every endpoint, all 3 tasks, all new intelligence modules,
multi-agent comparison, and the 3D viz-data endpoint.

The suite talks to a server at ``BASE`` and runs its sections strictly in
order. Several sections reuse the episode left open by the previous one
(e.g. section 5 submits the episode started in section 4), so the order in
``main`` must not be changed.

Exit status is 0 when every check passed and 1 otherwise.
"""
import sys
import json
import time

import requests

BASE = "http://localhost:7860"

# (connect, read) timeout in seconds applied to every request so that a hung
# server fails the affected section instead of blocking the suite forever.
REQUEST_TIMEOUT = (5, 120)

PASS = 0
FAIL = 0
RESULTS = []

VALID_STRATEGIES = ["TARGETED_DEBUGGING", "SYSTEMATIC_SEARCH", "BRUTE_FORCE",
                    "RANDOM_EXPLORATION", "SPEC_DRIVEN", "MINIMAL_EFFORT"]


# ─────────────────────────────────────────────────────────────────────────────
# Reporting helpers
# ─────────────────────────────────────────────────────────────────────────────
def check(name, condition, detail=""):
    """Record one check, print its outcome and update the PASS/FAIL counters.

    Args:
        name: Human-readable label of the check.
        condition: Truthy when the check passed.
        detail: Optional extra information appended to the printed line.
    """
    global PASS, FAIL
    status = "✅ PASS" if condition else "❌ FAIL"
    if condition:
        PASS += 1
    else:
        FAIL += 1
    msg = f" {status} {name}"
    if detail:
        msg += f" → {detail}"
    print(msg)
    RESULTS.append({"name": name, "passed": condition, "detail": detail})


def section(title):
    """Print a banner announcing the start of a test section."""
    print(f"\n{'━'*60}")
    print(f" {title}")
    print(f"{'━'*60}")


def run_section(title, func, *args):
    """Print the banner for *title* and run *func(*args)* behind a guard.

    Any exception raised by the section (network error, missing key, empty
    file list, ...) is recorded as a failed check so the remaining sections
    and the final summary still run.
    """
    section(title)
    try:
        func(*args)
    except Exception as exc:  # top-level boundary: report it and keep going
        check(f"{title}: section aborted", False, f"{type(exc).__name__}: {exc}")


# ─────────────────────────────────────────────────────────────────────────────
# HTTP / payload helpers
# ─────────────────────────────────────────────────────────────────────────────
def _get(path, **kwargs):
    """GET ``BASE + path`` with the suite-wide default timeout."""
    kwargs.setdefault("timeout", REQUEST_TIMEOUT)
    return requests.get(f"{BASE}{path}", **kwargs)


def _post(path, **kwargs):
    """POST to ``BASE + path`` with the suite-wide default timeout."""
    kwargs.setdefault("timeout", REQUEST_TIMEOUT)
    return requests.post(f"{BASE}{path}", **kwargs)


def _json(response):
    """Return the response body as a dict, or ``{}`` if it is not a JSON object.

    Keeps a non-JSON error page from aborting a section: the checks that
    follow simply fail on the empty dict.
    """
    try:
        payload = response.json()
    except ValueError:  # requests' JSONDecodeError is a ValueError subclass
        return {}
    return payload if isinstance(payload, dict) else {}


def _number(value, default):
    """Return *value* when it is an int/float, otherwise *default*.

    Guards numeric comparisons against JSON nulls and strings.
    """
    if isinstance(value, (int, float)):
        return value
    return default


def _repo_tree(payload):
    """Extract ``observation.repo_tree`` from a reset/state payload (``[]`` if absent)."""
    return (payload.get("observation") or {}).get("repo_tree") or []


def _require_first(paths, kind):
    """Return the first entry of *paths* or raise ``LookupError`` naming *kind*."""
    if not paths:
        raise LookupError(f"no {kind} files found in the task1 repo_tree")
    return paths[0]


def _step(action):
    """POST a single action dict to ``/step`` and return the response."""
    return _post("/step", json=action)


# ─────────────────────────────────────────────────────────────────────────────
# Sections
# ─────────────────────────────────────────────────────────────────────────────
def run_health_checks():
    """Section 1: ``/health`` responds with version 3.0.0 and status ok."""
    r = _get("/health")
    check("GET /health returns 200", r.status_code == 200)
    data = _json(r)
    check("Health version is 3.0.0", data.get("version") == "3.0.0", data.get("version"))
    check("Health status is ok", data.get("status") == "ok")


def run_reset_checks():
    """Section 2: ``/reset`` works for all three tasks and returns a usable observation."""
    for task in ["task1", "task2", "task3"]:
        r = _post(f"/reset?task={task}")
        check(f"POST /reset?task={task} → 200", r.status_code == 200,
              f"status={r.status_code}")
        if r.status_code != 200:
            continue
        d = _json(r)
        obs = d.get("observation") or {}
        tree = obs.get("repo_tree") or []
        check(f" {task}: has repo_tree", bool(tree), str(tree[:2]))
        check(f" {task}: has variant_id", bool((d.get("info") or {}).get("variant_id")))
        check(f" {task}: steps_remaining > 0", _number(obs.get("steps_remaining"), 0) > 0)


def run_task1_episode(repo_files):
    """Section 3: play a full task1 episode through every action type.

    Args:
        repo_files: Dict with ``"tests"`` and ``"src"`` keys; it is filled
            with the file lists discovered here so later sections can reuse
            them.
    """
    r = _post("/reset?task=task1")
    tree = _repo_tree(_json(r))
    repo_files["tests"] = [f for f in tree if f.startswith("tests/")]
    repo_files["src"] = [f for f in tree if f.startswith("src/")]
    test_path = _require_first(repo_files["tests"], "tests/")
    src_path = _require_first(repo_files["src"], "src/")

    # read_file
    r = _step({"action_type": "read_file", "path": test_path})
    check("POST /step read_file test file → 200", r.status_code == 200)
    reward = _json(r).get("reward")
    check("read_file reward >= 0", _number(reward, -1) >= 0, str(reward))

    r = _step({"action_type": "read_file", "path": src_path})
    check("POST /step read_file src file → 200", r.status_code == 200)

    # search_code
    r = _step({"action_type": "search_code", "query": "def "})
    check("POST /step search_code → 200", r.status_code == 200)

    # run_tests
    r = _step({"action_type": "run_tests"})
    check("POST /step run_tests → 200", r.status_code == 200,
          f"reward={_json(r).get('reward')}")

    # submit
    r = _step({"action_type": "submit"})
    check("POST /step submit → 200", r.status_code == 200)
    check("Episode done after submit", _json(r).get("done") is True)

    # Try stepping after done → should get 400
    r = _step({"action_type": "read_file", "path": "x.py"})
    check("POST /step after done → 400", r.status_code == 400)


def run_state_checks(test_files):
    """Section 4: ``/state`` reflects an in-progress episode (left open for section 5)."""
    _post("/reset?task=task1")
    _step({"action_type": "read_file", "path": _require_first(test_files, "tests/")})
    r = _get("/state")
    check("GET /state → 200", r.status_code == 200)
    d = _json(r)
    check("State has observation", "observation" in d)
    check("State total_steps_taken >= 1", _number(d.get("total_steps_taken"), 0) >= 1)


def run_trajectory_checks():
    """Section 5: submit the episode opened in section 4, then check trajectory/evaluation/metrics."""
    _step({"action_type": "submit"})

    r = _get("/trajectory")
    check("GET /trajectory → 200", r.status_code == 200)
    traj = _json(r)
    steps = traj.get("steps") or []
    check("Trajectory has episode_id", bool(traj.get("episode_id")))
    check("Trajectory steps > 0", len(steps) > 0, f"steps={len(steps)}")

    r = _get("/evaluate")
    check("GET /evaluate → 200", r.status_code == 200)
    ev = _json(r)
    dimensions = ev.get("dimensions") or {}
    check("Evaluation has composite_score", "composite_score" in ev,
          str(ev.get("composite_score")))
    check("Evaluation has 6 dimensions", len(dimensions) == 6, str(list(dimensions)))

    r = _get("/metrics")
    check("GET /metrics → 200", r.status_code == 200)
    m = _json(r)
    check("Metrics has timeline", "timeline" in m, str(list(m)[:5]))


def run_fault_injection_checks():
    """Section 6: enable light fault injection, verify it is reported, then switch it off."""
    r = _post("/fault-config", json={"level": "light"})
    check("POST /fault-config light → 200", r.status_code == 200)
    try:
        r = _post("/reset?task=task1")
        check("Reset with fault injection → 200", r.status_code == 200)
        fi = (_json(r).get("info") or {}).get("fault_injection") or {}
        check("Fault injection info present",
              "difficulty_multiplier" in fi or "faults_injected" in fi, str(fi))
    finally:
        # Reset back even when a check above blew up, so later sections are
        # not run with faults still enabled.
        _post("/fault-config", json={"level": "none"})


def run_failure_classifier_checks():
    """Section 7: a submit-only episode is classified as a failure (reused by sections 8–10)."""
    # Run a fresh episode with minimal effort to get a known failure
    _post("/reset?task=task1")
    _step({"action_type": "submit"})  # Submit without doing anything

    r = _get("/classify")
    check("GET /classify → 200", r.status_code == 200)
    d = _json(r)
    check("Classify has episode_id", "episode_id" in d, d.get("episode_id"))
    check("Classify has primary_failure", "primary_failure" in d, d.get("primary_failure"))
    check("Classify has success field", "success" in d)
    check("Classify success=False for minimal effort", d.get("success") is False)
    check("Classify has retry_hint", bool(d.get("retry_hint")),
          str(d.get("retry_hint") or "")[:60])


def run_strategy_checks():
    """Section 8: ``/strategy`` returns a known label plus its supporting fields."""
    r = _get("/strategy")
    check("GET /strategy → 200", r.status_code == 200)
    d = _json(r)
    check("Strategy has strategy field", "strategy" in d, d.get("strategy"))
    check("Strategy is a known label", d.get("strategy") in VALID_STRATEGIES,
          d.get("strategy"))
    check("Strategy has score 0-1", 0 <= _number(d.get("score"), -1) <= 1,
          str(d.get("score")))
    check("Strategy has exploration_ratio", "exploration_ratio" in d)
    check("Strategy has sub_patterns list", isinstance(d.get("sub_patterns"), list))


def run_advanced_metrics_checks():
    """Section 9: ``/advanced-metrics`` exposes every expected metric key."""
    r = _get("/advanced-metrics")
    check("GET /advanced-metrics → 200", r.status_code == 200)
    d = _json(r)
    expected_keys = ["reasoning_efficiency", "exploration_ratio", "decision_entropy",
                     "reliability_index", "pivot_rate", "wasteful_ratio",
                     "consistency_score"]
    for key in expected_keys:
        check(f" advanced-metrics has '{key}'", key in d, str(d.get(key, "MISSING")))
    check("reliability_index in [0,1]", 0 <= _number(d.get("reliability_index"), -1) <= 1)
    check("action_distribution is dict", isinstance(d.get("action_distribution"), dict))


def run_improvement_plan_checks():
    """Section 10: ``/improvement-plan`` returns a populated plan."""
    r = _get("/improvement-plan")
    check("GET /improvement-plan → 200", r.status_code == 200)
    d = _json(r)
    check("Plan has failure_type", "failure_type" in d, d.get("failure_type"))
    check("Plan has what_went_wrong", bool(d.get("what_went_wrong")))
    check("Plan has improved_strategy", bool(d.get("improved_strategy")))
    check("Plan has step_by_step_plan list", isinstance(d.get("step_by_step_plan"), list))
    check("Plan step_by_step_plan not empty", len(d.get("step_by_step_plan") or []) > 0)
    check("Plan has system_prompt_addon", "system_prompt_addon" in d)


def run_compare_agents_checks():
    """Section 11: ``/compare-agents`` with two named agents and with the default set."""
    r = _post("/compare-agents?task=task1&agents=test-first,minimal")
    check("POST /compare-agents (2 agents) → 200", r.status_code == 200,
          f"status={r.status_code}")
    if r.status_code == 200:
        d = _json(r)
        table = d.get("summary_table") or []
        check("Comparison has winner", "winner" in d, d.get("winner"))
        check("Comparison has summary_table", "summary_table" in d)
        check("Summary table has 2 rows", len(table) == 2, str(len(table)))
        check("Each row has score/steps/strategy", all(
            "score" in row and "steps" in row and "strategy" in row
            for row in table
        ))
        check("Comparison has insights", "insights" in d)
        check("Comparison has detailed_runs", len(d.get("detailed_runs") or []) == 2)

    # Test all 4 agents
    r = _post("/compare-agents?task=task1")
    check("POST /compare-agents (all agents) → 200", r.status_code == 200)
    if r.status_code == 200:
        table = _json(r).get("summary_table") or []
        check("All 4 agents ran", len(table) == 4, f"rows={len(table)}")


def run_viz_data_checks(test_files):
    """Section 12: ``/viz-data`` describes files, dependencies and steps of a finished episode."""
    # Run a full episode first for viz data
    _post("/reset?task=task1")
    _step({"action_type": "read_file", "path": _require_first(test_files, "tests/")})
    _step({"action_type": "submit"})

    r = _get("/viz-data")
    check("GET /viz-data → 200", r.status_code == 200)
    d = _json(r)
    files = d.get("files") or []
    check("Viz-data has files array", isinstance(d.get("files"), list), f"len={len(files)}")
    check("Viz-data files > 0", len(files) > 0)
    check("Viz-data has dependencies", isinstance(d.get("dependencies"), list))
    check("Viz-data has steps", isinstance(d.get("steps"), list))
    check("Viz-data has strategy", "strategy" in d, d.get("strategy"))
    check("Viz-data has final_score", "final_score" in d)
    if files:
        node = files[0]
        check("File node has name/type/is_bug_file",
              all(k in node for k in ["name", "type", "is_bug_file"]))


def run_invalid_action_checks():
    """Section 13: invalid tasks, actions and paths are rejected or reported cleanly."""
    _post("/reset?task=task1")

    # Invalid task
    r = _post("/reset?task=task99")
    check("Invalid task → 400", r.status_code == 400)

    # Invalid action type
    r = _step({"action_type": "hack_system"})
    check("Invalid action_type → 400 or 422", r.status_code in (400, 422))

    # Non-existent file
    r = _step({"action_type": "read_file", "path": "non_existent.py"})
    check("Read non-existent file → 200 with error", r.status_code == 200)
    obs = _json(r).get("observation") or {}
    error_text = obs.get("last_action_error")
    check("Non-existent file has error in obs", bool(error_text), str(error_text or "")[:60])


def run_security_scan_checks(src_files):
    """Section 14: writing dangerous code is accepted but raises security flags."""
    _post("/reset?task=task1")

    # Try to write a file with dangerous code
    r = _step({
        "action_type": "write_file",
        "path": src_files[0] if src_files else "src/hack.py",
        "content": "import os\nos.system('rm -rf /')\n",
    })
    check("Write dangerous code → 200", r.status_code == 200)
    if r.status_code == 200:
        info = _json(r).get("info") or {}
        flags = info.get("security_flags") or []
        check("Security flags populated for os.system", len(flags) > 0, str(flags[:2]))


def run_ui_checks():
    """Section 15: the Gradio root page and the static 3D viewer are served."""
    r = _get("/")
    check("GET / (Gradio UI) → 200", r.status_code == 200)
    check("Response is HTML", "text/html" in r.headers.get("content-type", ""))

    r = _get("/static/viz3d.html")
    check("GET /static/viz3d.html → 200", r.status_code == 200)
    page = r.text
    lowered = page.lower()
    check("viz3d.html is HTML", "html" in lowered[:200])
    check("viz3d.html has Three.js", "three" in lowered)
    check("viz3d.html has timeline-slider", "timeline-slider" in page)


def run_task2_task3_episodes():
    """Section 16: short task2/task3 episodes, then the intelligence endpoints still answer."""
    for task in ["task2", "task3"]:
        r = _post(f"/reset?task={task}")
        check(f"{task} reset → 200", r.status_code == 200)

        tree = _repo_tree(_json(r))
        tf = [f for f in tree if f.startswith("tests/")]
        sf = [f for f in tree if f.startswith("src/")]
        md = [f for f in tree if f.endswith(".md")]

        # task3 additionally reads the first markdown file before the code.
        if task == "task3" and md:
            _step({"action_type": "read_file", "path": md[0]})
        if tf:
            _step({"action_type": "read_file", "path": tf[0]})
        if sf:
            _step({"action_type": "read_file", "path": sf[0]})

        r = _step({"action_type": "submit"})
        check(f"{task} submit → done", _json(r).get("done") is True)

        # Verify all intelligence endpoints work post-episode
        r = _get("/classify")
        check(f"{task} /classify works",
              r.status_code == 200 and "primary_failure" in _json(r))
        r = _get("/strategy")
        check(f"{task} /strategy works",
              r.status_code == 200 and "strategy" in _json(r))


def run_consistency_checks():
    """Section 17: three identical task1 runs all yield a composite score in [0, 1]."""
    scores = []
    for _ in range(3):
        _post("/reset?task=task1")
        tree = _repo_tree(_json(_get("/state")))
        tf = [f for f in tree if f.startswith("tests/")]
        if tf:
            _step({"action_type": "read_file", "path": tf[0]})
        _step({"action_type": "submit"})
        # The response is not used; the request is kept from the original
        # script in case the endpoint records the run server-side — TODO confirm.
        _get("/advanced-metrics")
        scores.append(_json(_get("/evaluate")).get("composite_score", 0))

    check("3 runs completed", len(scores) == 3, str(scores))
    check("All runs have valid scores",
          all(0 <= _number(s, -1) <= 1 for s in scores), str(scores))

    # Consistency metric
    d = _json(_get("/advanced-metrics"))
    consistency = d.get("consistency_score")
    # A missing/null score must not crash the ':.3f' format spec.
    if isinstance(consistency, (int, float)):
        consistency_text = f"{consistency:.3f}"
    else:
        consistency_text = str(consistency)
    check("Consistency score populated after multiple runs",
          _number(d.get("runs_analyzed"), 0) >= 1,
          f"runs={d.get('runs_analyzed')}, consistency={consistency_text}")


# ─────────────────────────────────────────────────────────────────────────────
# Summary & entry point
# ─────────────────────────────────────────────────────────────────────────────
def print_summary():
    """Print pass/fail totals, the pass percentage and the list of failed checks."""
    total = PASS + FAIL
    percent = PASS / total * 100 if total else 0.0  # avoid 0/0 when nothing ran
    print(f"\n{'═'*60}")
    print(f" E2E RESULTS: {PASS} passed | {FAIL} failed | {total} total")
    print(f" Score: {percent:.1f}%")
    print(f"{'═'*60}")

    if FAIL > 0:
        print("\nFailed tests:")
        for result in RESULTS:
            if not result["passed"]:
                print(f" ❌ {result['name']}: {result['detail']}")


def main():
    """Run every section in order and return the process exit code (0 = all passed)."""
    # Filled by section 3 and reused by sections 4, 12 and 14.
    repo_files = {"tests": [], "src": []}

    run_section("1. HEALTH & BASIC CONNECTIVITY", run_health_checks)
    run_section("2. CORE OPENENV — ALL 3 TASKS", run_reset_checks)
    run_section("3. STEP ACTIONS — FULL EPISODE (task1)", run_task1_episode, repo_files)
    run_section("4. STATE ENDPOINT", run_state_checks, repo_files["tests"])
    run_section("5. TRAJECTORY & EVALUATION", run_trajectory_checks)
    run_section("6. FAULT INJECTION", run_fault_injection_checks)
    run_section("7. INTELLIGENCE — FAILURE CLASSIFIER", run_failure_classifier_checks)
    run_section("8. INTELLIGENCE — STRATEGY DETECTOR", run_strategy_checks)
    run_section("9. INTELLIGENCE — ADVANCED METRICS", run_advanced_metrics_checks)
    run_section("10. INTELLIGENCE — IMPROVEMENT PLAN", run_improvement_plan_checks)
    run_section("11. MULTI-AGENT COMPARISON", run_compare_agents_checks)
    run_section("12. 3D VISUALIZATION DATA", run_viz_data_checks, repo_files["tests"])
    run_section("13. INVALID ACTION HANDLING", run_invalid_action_checks)
    run_section("14. SECURITY SCANNING", run_security_scan_checks, repo_files["src"])
    run_section("15. GRADIO UI ENDPOINTS", run_ui_checks)
    run_section("16. TASK2 & TASK3 FULL EPISODE", run_task2_task3_episodes)
    run_section("17. CONSISTENCY — 3 RUNS SAME TASK", run_consistency_checks)

    print_summary()
    return 0 if FAIL == 0 else 1


if __name__ == "__main__":
    sys.exit(main())