File size: 11,655 Bytes
import requests
import json
# Root URL of the service under test; every request issued by run_tests() is
# built by appending an endpoint path ("/", "/reset", "/state", "/step") to it.
BASE_URL = "http://localhost:7860"
def run_tests(timeout: float = 10.0) -> None:
    """Run the HTTP smoke checks against the service at BASE_URL and print a report.

    Sixteen numbered checks exercise ``GET /``, ``GET /state``, ``POST /reset``
    and ``POST /step``. The checks run in a fixed order because several of
    them depend on server-side state left behind by an earlier request
    (for example, check 14 relies on the step already taken by check 7).
    Results are sorted by check id and printed together with a pass count,
    a disqualification flag and an estimated score.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up. A
            request that times out is recorded as a failed check instead of
            hanging the whole run.

    Returns:
        None. The report is written to stdout.
    """
    checks = []

    def _record(check_id, name, passed, expected, got):
        # One report row; key order matches what the printing loop reads.
        checks.append({
            "id": check_id, "name": name, "passed": passed,
            "expected": expected, "got": got,
        })

    def _get(path):
        return requests.get(f"{BASE_URL}{path}", timeout=timeout)

    def _post(path, **kwargs):
        return requests.post(f"{BASE_URL}{path}", timeout=timeout, **kwargs)

    # Action that claims "no bug" with every other field blank (checks 14 and 10).
    no_bug_action = {
        "bug_identified": False, "bug_location": "", "bug_type": "none",
        "bug_description": "", "severity": "none", "suggested_fix": "",
    }

    # NOTE: every check below catches Exception on purpose. This is a
    # reporting harness: a connection error, timeout, non-JSON body or
    # unexpected payload shape must show up as a FAIL row, not abort the run.

    # 1. GET /
    try:
        r = _get("/")
        passed = r.status_code == 200 and r.json().get("status") == "ok"
        _record(1, "GET / health check", passed,
                'HTTP 200 and {"status": "ok"}', f"HTTP {r.status_code} {r.text}")
    except Exception as exc:
        _record(1, "GET / health check", False, "200 OK", str(exc))

    # 15. GET /state before reset (edge case): the server should not crash.
    try:
        r = _get("/state")
        _record(15, "GET /state before any reset", r.status_code == 200,
                "HTTP 200 (No crash)", f"HTTP {r.status_code} {r.text}")
    except Exception as exc:
        _record(15, "GET /state before any reset", False, "200 OK", str(exc))

    # 2. POST /reset returns an observation carrying all required fields.
    try:
        r = _post("/reset")
        data = r.json().get("observation", {})
        required = ["task_id", "language", "difficulty", "code_snippet", "context", "pr_title", "file_path"]
        passed = all(k in data for k in required)
        _record(2, "POST /reset fields check", passed,
                f"JSON with {required}", list(data.keys()))
    except Exception as exc:
        _record(2, "POST /reset fields check", False, "Fields", str(exc))

    # 16. POST /reset without a task_id.
    try:
        r = _post("/reset")
        _record(16, "POST /reset no task_id (Random)", r.status_code == 200,
                "HTTP 200", f"HTTP {r.status_code}")
    except Exception as exc:
        _record(16, "POST /reset no task_id (Random)", False, "200 OK", str(exc))

    # 3-5. POST /reset?task_id=... for each known task. The check number is
    # derived outside the try so the except branch can always reference it.
    task_ids = ["python-off-by-one", "js-auth-privilege", "python-sql-injection"]
    for num, tid in enumerate(task_ids, start=3):
        name = f"POST /reset for {tid}"
        try:
            r = _post(f"/reset?task_id={tid}")
            # Short-circuit keeps a non-200, non-JSON error page from raising here.
            passed = r.status_code == 200 and r.json()["observation"]["task_id"] == tid
            # When passed, the returned task_id equals tid, so no second parse is needed.
            detail = tid if passed else r.text
            _record(num, name, passed,
                    f"HTTP 200 with task_id={tid}", f"HTTP {r.status_code} {detail}")
        except Exception as exc:
            _record(num, name, False, "200 OK", str(exc))

    # 6. GET /state exposes the episode bookkeeping fields.
    try:
        r = _get("/state")
        data = r.json()
        required = ["task_id", "step", "done", "total_reward"]
        passed = all(k in data for k in required)
        _record(6, "GET /state fields check", passed,
                f"JSON with {required}", list(data.keys()))
    except Exception as exc:
        _record(6, "GET /state fields check", False, "Fields", str(exc))

    # 7. POST /step with a reasonable action.
    try:
        _post("/reset?task_id=python-sql-injection")
        action = {
            "bug_identified": True,
            "bug_location": "line 2 f-string",
            "bug_type": "security-vulnerability",
            "bug_description": "SQL injection via f-string",
            "severity": "critical",
            "suggested_fix": "use parameterized query",
        }
        r = _post("/step", json=action)
        res = r.json()
        reward = res.get("reward", -1.0)  # missing reward falls outside [0, 1]
        done = res.get("done", False)
        passed = 0.0 <= reward <= 1.0 and done is True
        _record(7, "POST /step valid action", passed,
                "Reward [0,1] and done=true", f"reward={reward}, done={done}")
    except Exception as exc:
        _record(7, "POST /step valid action", False, "Result", str(exc))

    # 14. POST /step a second time in the same episode (edge case).
    # Deliberately no reset here: the first step was taken by check 7.
    try:
        r = _post("/step", json=no_bug_action)
        res = r.json()
        passed = r.status_code == 200 and "error" in res.get("info", {})
        _record(14, "POST /step twice in same episode", passed,
                "HTTP 200 and error in info", f"HTTP {r.status_code}, info={res.get('info')}")
    except Exception as exc:
        _record(14, "POST /step twice in same episode", False, "Handled error", str(exc))

    # 8. A thorough, correct answer for the SQL task should score high.
    try:
        _post("/reset?task_id=python-sql-injection")
        perfect_action = {
            "bug_identified": True,
            "bug_location": "line 2 f-string interpolation in SQL query construction",
            "bug_type": "security-vulnerability",
            "bug_description": "SQL injection vulnerability where user-supplied search_term is directly interpolated into the SQL query via f-string. An attacker can inject malicious SQL to bypass authentication, exfiltrate all user data, or drop tables. The fix is to use parameterized queries which sanitize user input automatically.",
            "severity": "critical",
            "suggested_fix": "Use db.execute('SELECT * FROM users WHERE name LIKE %s', ('%'+search_term+'%',)) instead of f-string interpolation",
        }
        r = _post("/step", json=perfect_action)
        reward = r.json().get("reward", 0.0)  # missing reward counts as a fail
        _record(8, "PERFECT action SQL", reward >= 0.85,
                "Reward >= 0.85", f"reward={reward}")
    except Exception as exc:
        _record(8, "PERFECT action SQL", False, ">=0.85", str(exc))

    # 9. Keyword stuffing without substance should score low.
    try:
        _post("/reset?task_id=python-sql-injection")
        stuffed_action = {
            "bug_identified": True,
            "bug_location": "sql",
            "bug_type": "security-vulnerability",
            "bug_description": "sql injection sql injection sql injection parameterized f-string sanitize escape malicious attack tautology union drop sql injection sql injection",
            "severity": "critical",
            "suggested_fix": "fix",
        }
        r = _post("/step", json=stuffed_action)
        reward = r.json().get("reward", 1.0)  # missing reward counts as a fail
        _record(9, "KEYWORD STUFFED action", reward <= 0.20,
                "Reward <= 0.20", f"reward={reward}")
    except Exception as exc:
        _record(9, "KEYWORD STUFFED action", False, "<=0.20", str(exc))

    # 10. bug_identified=False with empty fields must earn exactly zero.
    try:
        _post("/reset")
        r = _post("/step", json=no_bug_action)
        reward = r.json().get("reward", 1.0)  # missing reward counts as a fail
        _record(10, "Identify=False empty fields", reward == 0.0,
                "Reward exactly 0.0", f"reward={reward}")
    except Exception as exc:
        _record(10, "Identify=False empty fields", False, "0.0", str(exc))

    # 11. Partial credit when only the severity is wrong.
    # Per the original author's note, the off-by-one task expects severity
    # "critical", so "low" is deliberately wrong. The severity component is
    # reported for context; only the total reward is asserted.
    try:
        _post("/reset?task_id=python-off-by-one")
        action = {
            "bug_identified": True, "bug_location": "range", "bug_type": "off-by-one",
            "bug_description": "off-by-one error in range function call",
            "severity": "low",  # wrong severity on purpose
            "suggested_fix": "range(len(x))",
        }
        r = _post("/step", json=action)
        body = r.json()
        breakdown = body.get("info", {}).get("reward_breakdown", {})
        sev_score = breakdown.get("severity", -1.0)
        reward = body.get("reward", 0.0)
        _record(11, "Partial credit (wrong severity)", 0.0 < reward < 1.0,
                "Reward between 0 and 1 (partial credit)",
                f"reward={reward}, severity_component={sev_score}")
    except Exception as exc:
        _record(11, "Partial credit (wrong severity)", False, "Partial credit", str(exc))

    # 12-13. Reward breakdown keys and per-component score ranges.
    breakdown_checks = ((12, "Reward breakdown keys"), (13, "Component score ranges"))
    try:
        _post("/reset")
        action = {"bug_identified": True, "bug_location": "test", "bug_type": "test", "bug_description": "test test test test test test test test test test test test test test test test test test test test", "severity": "none", "suggested_fix": "test test test"}
        r = _post("/step", json=action)
        breakdown = r.json().get("info", {}).get("reward_breakdown", {})
        required = ["bug_identified", "bug_type", "bug_location", "description_quality", "fix_quality", "severity"]
        _record(12, "Reward breakdown keys", all(k in breakdown for k in required),
                f"Breakdown with {required}", list(breakdown.keys()))
        max_vals = {
            "bug_identified": 0.20, "bug_type": 0.20, "bug_location": 0.10,
            "description_quality": 0.25, "fix_quality": 0.15, "severity": 0.10,
        }
        # A missing component defaults to -1 and therefore fails the range test.
        passed_range = all(0.0 <= breakdown.get(k, -1) <= max_vals[k] for k in max_vals)
        _record(13, "Component score ranges", passed_range,
                "All components <= max", breakdown)
    except Exception as exc:
        # The two checks share one request, so an exception may strike before
        # either (or only the second) was recorded. Fail exactly the missing
        # ones so the report never loses check 13 or duplicates check 12.
        recorded_ids = {c["id"] for c in checks}
        for check_id, name in breakdown_checks:
            if check_id not in recorded_ids:
                _record(check_id, name, False, "Breakdown", str(exc))

    # Sort and print
    checks.sort(key=lambda c: c["id"])
    for c in checks:
        status = "PASS" if c["passed"] else "FAIL"
        print(f"[{c['id']}] {c['name']} — {status}")
        print(f" Expected: {c['expected']}")
        print(f" Got: {c['got']}")
        print("")

    total_checks = len(checks)  # every check records exactly one row, so this is 16
    passed_count = sum(1 for c in checks if c["passed"])
    # Disqualified if Part 1 fails.
    # NOTE(review): this only counts passes; it does not verify that the
    # specific "Part 1" checks passed. Which ids form Part 1 is not visible
    # here — confirm and tighten if needed.
    disqual = "YES" if passed_count < 7 else "NO"
    print(f"TOTAL: {passed_count}/{total_checks} passed")
    print(f"DISQUALIFICATION RISK: {disqual}")
    # Estimate score based on the fraction of checks passed.
    score = (passed_count / total_checks) * 100
    print(f"ESTIMATED SCORE: {round(score)}/100")
# Script entry point: run the full check suite only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_tests()
|