sameerkatte Claude Opus 4.6 (1M context) committed on
Commit
6dbb8cf
·
1 Parent(s): 8d618ab

Add [START]/[STEP]/[END] structured output markers to inference.py

Browse files

The validator parses stdout for these markers to extract per-task scores.
Previously inference.py only printed freeform text, so the Output Parsing
check failed with "No [START]/[STEP]/[END] in stdout".

Changes:
- Import sys/functools and wrap print() with flush=True so all output
is flushed immediately (no buffering inside the validator harness)
- run_episode() emits [START] task=NAME on entry
- Each step emits [STEP] step=N reward=VALUE
- Episode end emits [END] task=NAME score=VALUE steps=N

Verified locally against the live HF Space: all 3 tasks emit the
full marker set even when the LLM call fails (fallback to mark_complete).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. inference.py +22 -4
inference.py CHANGED
@@ -12,11 +12,17 @@ MANDATORY environment variables:
12
 
13
  import os
14
  import re
 
15
  import json
16
  import time
 
17
  import requests as http_requests
18
  from openai import OpenAI
19
 
 
 
 
 
20
  # === Configuration ===
21
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
22
  MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
@@ -221,7 +227,14 @@ def parse_action(text: str) -> dict:
221
 
222
 
223
  def run_episode(task_id: str, scenario_id: str = None) -> float:
224
- """Run a single episode and return the final score."""
 
 
 
 
 
 
 
225
  reset_body = {"task_id": task_id}
226
  if scenario_id:
227
  reset_body["scenario_id"] = scenario_id
@@ -266,8 +279,10 @@ def run_episode(task_id: str, scenario_id: str = None) -> float:
266
  observation = step_data["observation"]
267
  done = step_data["done"]
268
  step_count += 1
 
 
 
269
  if done:
270
- final_score = step_data["reward"]
271
  break
272
 
273
  # Main agent loop
@@ -310,13 +325,16 @@ def run_episode(task_id: str, scenario_id: str = None) -> float:
310
 
311
  observation = step_data["observation"]
312
  done = step_data["done"]
313
- final_score = step_data["reward"]
 
314
  step_count += 1
 
315
 
316
  # Small delay to avoid rate limiting
317
  time.sleep(0.3)
318
 
319
- print(f" Steps: {step_count}, Score: {final_score}")
 
320
  return final_score
321
 
322
 
 
12
 
13
  import os
14
  import re
15
+ import sys
16
  import json
17
  import time
18
+ import functools
19
  import requests as http_requests
20
  from openai import OpenAI
21
 
22
+ # All print calls flush stdout immediately so the validator can parse
23
+ # [START]/[STEP]/[END] markers in real time.
24
+ print = functools.partial(print, flush=True)
25
+
26
  # === Configuration ===
27
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
28
  MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
 
227
 
228
 
229
  def run_episode(task_id: str, scenario_id: str = None) -> float:
230
+ """Run a single episode and return the final score.
231
+
232
+ Emits ``[START]``, ``[STEP]``, and ``[END]`` markers on stdout for
233
+ the validator to parse.
234
+ """
235
+ # === [START] marker ===
236
+ print(f"[START] task={task_id}")
237
+
238
  reset_body = {"task_id": task_id}
239
  if scenario_id:
240
  reset_body["scenario_id"] = scenario_id
 
279
  observation = step_data["observation"]
280
  done = step_data["done"]
281
  step_count += 1
282
+ reward_val = step_data.get("reward", 0.0) or 0.0
283
+ final_score = reward_val
284
+ print(f"[STEP] step={step_count} reward={reward_val}")
285
  if done:
 
286
  break
287
 
288
  # Main agent loop
 
325
 
326
  observation = step_data["observation"]
327
  done = step_data["done"]
328
+ reward_val = step_data.get("reward", 0.0) or 0.0
329
+ final_score = reward_val
330
  step_count += 1
331
+ print(f"[STEP] step={step_count} reward={reward_val}")
332
 
333
  # Small delay to avoid rate limiting
334
  time.sleep(0.3)
335
 
336
+ # === [END] marker ===
337
+ print(f"[END] task={task_id} score={final_score} steps={step_count}")
338
  return final_score
339
 
340