File size: 7,391 Bytes
51133cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Pre-submission validation script for the OpenEnv Code Debug environment.
Checks all items on the hackathon pre-submission checklist.
"""

import json
import os
import sys
import subprocess
from pathlib import Path

import httpx

# Base URL of the deployed Hugging Face Space that all remote checks target.
HF_SPACE_URL = "https://arnavk-openenv-code-debugger.hf.space"
# Directory containing this script; local file checks are resolved against it.
ROOT = Path(__file__).parent

# ANSI-colored status tags (bright green / red / yellow, then reset).
PASS = "\033[92m[PASS]\033[0m"
FAIL = "\033[91m[FAIL]\033[0m"
WARN = "\033[93m[WARN]\033[0m"

# Outcome of every check, in execution order; check() appends to it and the
# summary at the end of the script tallies it.
results = []

def check(name, ok, detail=""):
    """Print one check result line and record its outcome.

    Args:
        name: Human-readable label of the check.
        ok: Truthy when the check passed, falsy when it failed.
        detail: Optional extra text; when non-empty it is appended to the
            printed line after an em dash.
    """
    if ok:
        line = f"{PASS} {name}"
    else:
        line = f"{FAIL} {name}"
    if detail:
        line += f" — {detail}"
    print(line)
    # The raw value is stored; the summary counts and classifies these.
    results.append(ok)


# ------------------------------------------------------------------
# 1. Local file structure
# ------------------------------------------------------------------
print("\n=== File Structure ===")
# (label, path) for every file the submission is required to contain.
required_files = [
    ("inference.py at root", ROOT / "inference.py"),
    ("Dockerfile at root", ROOT / "Dockerfile"),
    ("openenv.yaml exists", ROOT / "code_debug_env" / "openenv.yaml"),
    ("models.py exists", ROOT / "code_debug_env" / "models.py"),
    ("server/app.py exists", ROOT / "code_debug_env" / "server" / "app.py"),
    ("server/environment.py exists", ROOT / "code_debug_env" / "server" / "environment.py"),
    ("server/executor.py exists", ROOT / "code_debug_env" / "server" / "executor.py"),
]
for required_label, required_path in required_files:
    check(required_label, required_path.exists())

tasks_dir = ROOT / "code_debug_env" / "tasks"
task_files = list(tasks_dir.rglob("*.json"))
check("3+ task files", len(task_files) >= 3, f"{len(task_files)} found")

# Collect the "difficulty" value of every task file. A file that cannot be
# read, is not valid JSON, or is not a JSON object is reported as a failed
# check instead of aborting the whole validation run with a traceback.
difficulties = set()
for tf in task_files:
    try:
        t = json.loads(tf.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        check(f"{tf.name} is readable JSON", False, str(e))
        continue
    if not isinstance(t, dict):
        check(f"{tf.name} is a JSON object", False, f"got {type(t).__name__}")
        continue
    difficulties.add(t.get("difficulty"))
check("All 3 difficulty levels present", {"easy", "medium", "hard"}.issubset(difficulties), str(difficulties))

# ------------------------------------------------------------------
# 2. inference.py content checks
# ------------------------------------------------------------------
print("\n=== inference.py Content ===")
# Read inference.py defensively: a missing or unreadable file is reported as
# a failed check, and the content checks below then run against an empty
# string (and fail) rather than the script crashing here.
try:
    inf = (ROOT / "inference.py").read_text(encoding="utf-8")
except (OSError, UnicodeDecodeError) as e:
    check("inference.py readable", False, str(e))
    inf = ""
# Plain substring checks on the source text of inference.py.
check("Uses OpenAI client", "from openai import OpenAI" in inf)
check("Reads API_BASE_URL from env", "API_BASE_URL" in inf)
check("Reads MODEL_NAME from env", "MODEL_NAME" in inf)
check("Reads HF_TOKEN from env", "HF_TOKEN" in inf)
check("[START] log line", "[START]" in inf)
check("[STEP] log line", "[STEP]" in inf)
check("[END] log line", "[END]" in inf)

# ------------------------------------------------------------------
# 3. HF Space liveness
# ------------------------------------------------------------------
print("\n=== HF Space Liveness ===")
# Query the /health endpoint; any exception (network error, non-JSON body,
# unexpected payload type) is reported as a single failed check.
try:
    health_resp = httpx.get(f"{HF_SPACE_URL}/health", timeout=30)
    health_code = health_resp.status_code
    check("HF Space returns HTTP 200", health_code == 200, f"status={health_code}")
    health_body = health_resp.json()
    reports_healthy = health_body.get("status") == "healthy"
    check("Health response is healthy", reports_healthy, str(health_body))
except Exception as exc:
    check("HF Space reachable", False, str(exc))

# ------------------------------------------------------------------
# 4. reset() responds
# ------------------------------------------------------------------
print("\n=== reset() / step() Endpoints ===")
# Start an episode via POST /reset. `ep` holds the episode id used by the
# step() and state() checks; it is set to None if anything here raises.
try:
    reset_resp = httpx.post(f"{HF_SPACE_URL}/reset", json={}, timeout=30)
    check("POST /reset returns 200", reset_resp.status_code == 200)
    reset_body = reset_resp.json()
    ep = reset_body.get("episode_id")
    check("reset() returns episode_id", bool(ep))

    obs = reset_body.get("observation", {})
    # Each observation field must be present and truthy.
    for obs_label, obs_key in (
        ("observation has buggy_code", "buggy_code"),
        ("observation has test_descriptions", "test_descriptions"),
        ("observation has difficulty", "difficulty"),
    ):
        check(obs_label, bool(obs.get(obs_key)))
except Exception as exc:
    check("reset() works", False, str(exc))
    ep = None

# ------------------------------------------------------------------
# 5. step() returns reward in 0.0–1.0
# ------------------------------------------------------------------
if ep:
    # Submit a placeholder solution and validate the shape of the response.
    step_url = f"{HF_SPACE_URL}/step/{ep}"
    step_payload = {"action": {"code": "def placeholder(): pass"}}
    try:
        step_resp = httpx.post(step_url, json=step_payload, timeout=30)
        check("POST /step returns 200", step_resp.status_code == 200)
        step_body = step_resp.json()
        # A missing reward defaults to -1, which fails the range check below.
        reward = step_body.get("reward", -1)
        reward_in_range = 0.0 <= reward <= 1.0
        check("reward in [0.0, 1.0]", reward_in_range, f"reward={reward}")
        done_flag = step_body.get("done")
        check("done field is bool", isinstance(done_flag, bool))
    except Exception as exc:
        check("step() works", False, str(exc))

# ------------------------------------------------------------------
# 6. state() endpoint
# ------------------------------------------------------------------
if ep:
    # Fetch the state of the episode created by reset().
    state_url = f"{HF_SPACE_URL}/state/{ep}"
    try:
        state_resp = httpx.get(state_url, timeout=30)
        check("GET /state returns 200", state_resp.status_code == 200)
        state_body = state_resp.json()
        has_episode_id = bool(state_body.get("episode_id"))
        check("state has episode_id", has_episode_id)
        # Only the key's presence is required here; a step_count of 0 passes.
        check("state has step_count", "step_count" in state_body)
    except Exception as exc:
        check("state() works", False, str(exc))

# ------------------------------------------------------------------
# 7. Tasks enumeration
# ------------------------------------------------------------------
print("\n=== Task Enumeration ===")
# GET /tasks must list at least three tasks covering all difficulty levels.
try:
    tasks_resp = httpx.get(f"{HF_SPACE_URL}/tasks", timeout=30)
    check("GET /tasks returns 200", tasks_resp.status_code == 200)
    tasks = tasks_resp.json()
    task_count = len(tasks)
    check("3+ tasks listed", task_count >= 3, f"{task_count} tasks")
    task_difficulties = set()
    for task_entry in tasks:
        # A task without a "difficulty" key raises and fails the section.
        task_difficulties.add(task_entry["difficulty"])
    required_levels = {"easy", "medium", "hard"}
    check("All difficulties present in tasks endpoint", required_levels.issubset(task_difficulties))
except Exception as exc:
    check("tasks endpoint works", False, str(exc))

# ------------------------------------------------------------------
# 8. inference.py log format check (dry run on one task)
# ------------------------------------------------------------------
print("\n=== inference.py Log Format ===")
# Environment for the child process: the current environment with the
# endpoint and model settings overridden and HF_TOKEN defaulted to "".
env = {
    **os.environ,
    "API_BASE_URL": "https://router.huggingface.co/v1",
    "MODEL_NAME": "Qwen/Qwen2.5-72B-Instruct",
    "HF_TOKEN": os.environ.get("HF_TOKEN", ""),
    "ENV_URL": HF_SPACE_URL,
}

try:
    # Run inference.py with the current interpreter, capped at 300 seconds.
    proc = subprocess.run(
        [sys.executable, str(ROOT / "inference.py")],
        capture_output=True,
        text=True,
        timeout=300,
        env=env,
    )
    stdout_lines = proc.stdout.splitlines()
    # Each tag must appear at the very start of at least one stdout line.
    has_start = any(ln.startswith("[START]") for ln in stdout_lines)
    has_step = any(ln.startswith("[STEP]") for ln in stdout_lines)
    has_end = any(ln.startswith("[END]") for ln in stdout_lines)
    check("[START] line emitted", has_start)
    check("[STEP] line emitted", has_step)
    check("[END] line emitted", has_end)
    exit_code = proc.returncode
    check("inference.py exits cleanly", exit_code == 0, f"exit={exit_code}")
    if exit_code != 0 and proc.stderr:
        # Show only the first 300 characters of stderr.
        print(f"       stderr: {proc.stderr[:300]}")
except subprocess.TimeoutExpired:
    check("inference.py completes within 5 min", False, "timed out")
except Exception as exc:
    check("inference.py runs", False, str(exc))

# ------------------------------------------------------------------
# Summary
# ------------------------------------------------------------------
print("\n=== Summary ===")
# Count by truthiness so the tally matches the PASS/FAIL labels below.
passed = sum(1 for outcome in results if outcome)
total = len(results)
# One PASS/FAIL token per check, in execution order, separated by spaces so
# the individual outcomes stay readable.
print(" ".join("PASS" if outcome else "FAIL" for outcome in results))
print(f"{passed}/{total} checks passed")
if passed == total:
    print("\n[READY] All checks passed - ready to submit!")
else:
    print(f"\n[ACTION NEEDED] Fix {total - passed} failing check(s) before submitting.")
    # Exit non-zero so shells and CI pipelines can detect a failed validation.
    sys.exit(1)