Spaces:
Running
Running
Commit ·
66d8c67
1
Parent(s): 2efa047
working
Browse files
inference.py
CHANGED
|
@@ -11,7 +11,7 @@ Usage:
|
|
| 11 |
STDOUT FORMAT (strictly required by evaluator - plaintext):
|
| 12 |
[START] task=<id> env=<benchmark> model=<model>
|
| 13 |
[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 14 |
-
[END] success=<true|false> steps=<n> score=<0.
|
| 15 |
"""
|
| 16 |
|
| 17 |
import os, sys, json, time, argparse, requests, re
|
|
@@ -31,6 +31,9 @@ API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
|
|
| 31 |
MODEL_NAME = os.getenv("MODEL_NAME", "llama-3.1-8b-instant")
|
| 32 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 33 |
HF_TOKEN_SOURCE = "HF_TOKEN"
|
|
|
|
|
|
|
|
|
|
| 34 |
if not HF_TOKEN:
|
| 35 |
HF_TOKEN = os.getenv("hf_token")
|
| 36 |
HF_TOKEN_SOURCE = "hf_token"
|
|
@@ -55,12 +58,12 @@ def _normalize_token(value: str) -> str:
|
|
| 55 |
def _format_error(error: Optional[str]) -> str:
|
| 56 |
if error is None:
|
| 57 |
return "null"
|
| 58 |
-
|
| 59 |
-
return
|
| 60 |
|
| 61 |
|
| 62 |
def _format_rewards(rewards: List[float]) -> str:
|
| 63 |
-
return "
|
| 64 |
|
| 65 |
|
| 66 |
def log_start(task_id: str, env: str, model: str) -> None:
|
|
@@ -78,7 +81,7 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
|
|
| 78 |
|
| 79 |
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 80 |
print(
|
| 81 |
-
f"[END] success={_format_bool(success)} steps={steps} score={round(score,
|
| 82 |
f"rewards={_format_rewards(rewards)}",
|
| 83 |
flush=True,
|
| 84 |
)
|
|
@@ -241,43 +244,43 @@ def run_episode(env_url: str, difficulty: str) -> tuple:
|
|
| 241 |
last_feedback = None
|
| 242 |
last_code = None
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
return success, steps_taken, rewards
|
| 282 |
|
| 283 |
|
|
@@ -291,7 +294,7 @@ def main():
|
|
| 291 |
|
| 292 |
if not HF_TOKEN:
|
| 293 |
print(
|
| 294 |
-
"# Missing HF_TOKEN (or lowercase hf_token).",
|
| 295 |
file=sys.stderr,
|
| 296 |
flush=True,
|
| 297 |
)
|
|
|
|
| 11 |
STDOUT FORMAT (strictly required by evaluator - plaintext):
|
| 12 |
[START] task=<id> env=<benchmark> model=<model>
|
| 13 |
[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 14 |
+
[END] success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...>
|
| 15 |
"""
|
| 16 |
|
| 17 |
import os, sys, json, time, argparse, requests, re
|
|
|
|
| 31 |
MODEL_NAME = os.getenv("MODEL_NAME", "llama-3.1-8b-instant")
|
| 32 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 33 |
HF_TOKEN_SOURCE = "HF_TOKEN"
|
| 34 |
+
if not HF_TOKEN:
|
| 35 |
+
HF_TOKEN = os.getenv("API_KEY")
|
| 36 |
+
HF_TOKEN_SOURCE = "API_KEY"
|
| 37 |
if not HF_TOKEN:
|
| 38 |
HF_TOKEN = os.getenv("hf_token")
|
| 39 |
HF_TOKEN_SOURCE = "hf_token"
|
|
|
|
| 58 |
def _format_error(error: Optional[str]) -> str:
|
| 59 |
if error is None:
|
| 60 |
return "null"
|
| 61 |
+
text = str(error).replace("\r", "\\r").replace("\n", "\\n")
|
| 62 |
+
return text if text else "null"
|
| 63 |
|
| 64 |
|
| 65 |
def _format_rewards(rewards: List[float]) -> str:
|
| 66 |
+
return ",".join(f"{round(r, 2):.2f}" for r in rewards)
|
| 67 |
|
| 68 |
|
| 69 |
def log_start(task_id: str, env: str, model: str) -> None:
|
|
|
|
| 81 |
|
| 82 |
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 83 |
print(
|
| 84 |
+
f"[END] success={_format_bool(success)} steps={steps} score={round(score, 2):.2f} "
|
| 85 |
f"rewards={_format_rewards(rewards)}",
|
| 86 |
flush=True,
|
| 87 |
)
|
|
|
|
| 244 |
last_feedback = None
|
| 245 |
last_code = None
|
| 246 |
|
| 247 |
+
try:
|
| 248 |
+
for attempt in range(1, MAX_STEPS + 1):
|
| 249 |
+
steps_taken = attempt
|
| 250 |
+
action = call_llm(buggy_code, instructions, difficulty, last_feedback, attempt, last_code)
|
| 251 |
+
code = action.get("fixed_code") or ""
|
| 252 |
+
last_code = code
|
| 253 |
+
|
| 254 |
+
reward = 0.0
|
| 255 |
+
done = False
|
| 256 |
+
step_error: Optional[str] = None
|
| 257 |
+
try:
|
| 258 |
+
result = env_step(env_url, code, action.get("explanation"))
|
| 259 |
+
reward = result.get("reward", 0.0)
|
| 260 |
+
done = result.get("done", False)
|
| 261 |
+
obs_r = result.get("observation", {})
|
| 262 |
+
if isinstance(obs_r, dict):
|
| 263 |
+
last_feedback = obs_r.get("feedback", "")
|
| 264 |
+
step_error = obs_r.get("last_action_error")
|
| 265 |
+
if step_error is None:
|
| 266 |
+
step_error = obs_r.get("error")
|
| 267 |
+
except Exception as e:
|
| 268 |
+
step_error = str(e)
|
| 269 |
+
|
| 270 |
+
log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, step_error)
|
| 271 |
+
rewards.append(reward)
|
| 272 |
+
|
| 273 |
+
if reward >= 1.0:
|
| 274 |
+
success = True
|
| 275 |
+
if done:
|
| 276 |
+
break
|
| 277 |
+
finally:
|
| 278 |
+
# Compute normalized score for this episode and always emit [END].
|
| 279 |
+
score = max(rewards) if rewards else 0.0
|
| 280 |
+
score = min(max(score, 0.0), 1.0)
|
| 281 |
+
success = success or (score >= SUCCESS_SCORE_THRESHOLD)
|
| 282 |
+
log_end(success, steps_taken, score, rewards)
|
| 283 |
+
|
| 284 |
return success, steps_taken, rewards
|
| 285 |
|
| 286 |
|
|
|
|
| 294 |
|
| 295 |
if not HF_TOKEN:
|
| 296 |
print(
|
| 297 |
+
"# Missing API key. Set HF_TOKEN (or API_KEY / lowercase hf_token).",
|
| 298 |
file=sys.stderr,
|
| 299 |
flush=True,
|
| 300 |
)
|
validator/__pycache__/pre_submit_check.cpython-39.pyc
CHANGED
|
Binary files a/validator/__pycache__/pre_submit_check.cpython-39.pyc and b/validator/__pycache__/pre_submit_check.cpython-39.pyc differ
|
|
|
validator/pre_submit_check.py
CHANGED
|
@@ -157,11 +157,13 @@ def run_checks(base_url: str):
|
|
| 157 |
has_step = "[STEP] step=" in content
|
| 158 |
has_end = "[END] success=" in content
|
| 159 |
avoids_json_logs = "print(json.dumps(log_entry)" not in content
|
|
|
|
| 160 |
check("inference.py emits [START] logs", has_start)
|
| 161 |
check("inference.py emits [STEP] logs", has_step)
|
| 162 |
check("inference.py emits [END] logs", has_end)
|
| 163 |
check("inference.py avoids JSON log dict dumps", avoids_json_logs)
|
| 164 |
-
|
|
|
|
| 165 |
except Exception as e:
|
| 166 |
check("inference.py log format", False, str(e))
|
| 167 |
all_passed = False
|
|
|
|
| 157 |
has_step = "[STEP] step=" in content
|
| 158 |
has_end = "[END] success=" in content
|
| 159 |
avoids_json_logs = "print(json.dumps(log_entry)" not in content
|
| 160 |
+
rewards_csv = "rewards=[" not in content
|
| 161 |
check("inference.py emits [START] logs", has_start)
|
| 162 |
check("inference.py emits [STEP] logs", has_step)
|
| 163 |
check("inference.py emits [END] logs", has_end)
|
| 164 |
check("inference.py avoids JSON log dict dumps", avoids_json_logs)
|
| 165 |
+
check("inference.py emits CSV rewards in [END]", rewards_csv)
|
| 166 |
+
all_passed &= has_start and has_step and has_end and avoids_json_logs and rewards_csv
|
| 167 |
except Exception as e:
|
| 168 |
check("inference.py log format", False, str(e))
|
| 169 |
all_passed = False
|