kgdrathan committed on
Commit
db0b6ff
·
1 Parent(s): 6c997d7

test scripts and change in inference.py

Browse files
Files changed (3) hide show
  1. inference.original.py +197 -0
  2. inference.py +111 -34
  3. test_submission.sh +315 -0
inference.original.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference Script Example
3
+ ===================================
4
+ MANDATORY
5
+ - Before submitting, ensure the following variables are defined in your environment configuration:
6
+ API_BASE_URL The API endpoint for the LLM.
7
+ MODEL_NAME The model identifier to use for inference.
8
+ HF_TOKEN Your Hugging Face / API key.
9
+ LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
10
+ method
11
+
12
+ - Defaults are set only for API_BASE_URL and MODEL_NAME
13
+ (and should reflect your active inference setup):
14
+ API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
15
+ MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
16
+
17
+ - The inference script must be named `inference.py` and placed in the root directory of the project
18
+ - Participants must use OpenAI Client for all LLM calls using above variables
19
+
20
+ STDOUT FORMAT
21
+ - The script must emit exactly three line types to stdout, in this order:
22
+
23
+ [START] task=<task_name> env=<benchmark> model=<model_name>
24
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
25
+ [END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
26
+
27
+ Rules:
28
+ - One [START] line at episode begin.
29
+ - One [STEP] line per step, immediately after env.step() returns.
30
+ - One [END] line after env.close(), always emitted (even on exception).
31
+ - reward and rewards are formatted to 2 decimal places.
32
+ - done and success are lowercase booleans: true or false.
33
+ - error is the raw last_action_error string, or null if none.
34
+ - All fields on a single line with no newlines within a line.
35
+ - Each task should return a score in [0, 1]
36
+
37
+ Example:
38
+ [START] task=click-test env=miniwob model=Qwen3-VL-30B
39
+ [STEP] step=1 action=click('123') reward=0.00 done=false error=null
40
+ [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
41
+ [STEP] step=3 action=click('789') reward=1.00 done=true error=null
42
+ [END] success=true steps=3 score=1.00 rewards=0.00,0.00,1.00
43
+ """
44
+
45
+ import asyncio
46
+ import os
47
+ import textwrap
48
+ from typing import List, Optional
49
+
50
+ from my_env_v4 import MyEnvV4Action, MyEnvV4Env
51
+ from openai import OpenAI
52
+
53
+ IMAGE_NAME = os.getenv("IMAGE_NAME") # If you are using docker image
54
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
55
+
56
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
57
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
58
+ TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
59
+ BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
60
+ MAX_STEPS = 8
61
+ TEMPERATURE = 0.7
62
+ MAX_TOKENS = 150
63
+ SUCCESS_SCORE_THRESHOLD = 0.1 # normalized score in [0, 1]
64
+
65
+ # Max possible reward: each token contributes 0.1, across all steps
66
+ _MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
67
+ MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
68
+
69
+ SYSTEM_PROMPT = textwrap.dedent(
70
+ """
71
+ You are interacting with a simple echo environment.
72
+ Each turn you must send a message. The environment will echo it back.
73
+ Reward is proportional to message length: reward = len(message) * 0.1
74
+ Your goal is to maximize total reward by sending meaningful, substantive messages.
75
+ Reply with exactly one message string — no quotes, no prefixes, just the message text.
76
+ """
77
+ ).strip()
78
+
79
+
80
+ def log_start(task: str, env: str, model: str) -> None:
81
+ print(f"[START] task={task} env={env} model={model}", flush=True)
82
+
83
+
84
+ def log_step(
85
+ step: int, action: str, reward: float, done: bool, error: Optional[str]
86
+ ) -> None:
87
+ error_val = error if error else "null"
88
+ done_val = str(done).lower()
89
+ print(
90
+ f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
91
+ flush=True,
92
+ )
93
+
94
+
95
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
96
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
97
+ print(
98
+ f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
99
+ flush=True,
100
+ )
101
+
102
+
103
+ def build_user_prompt(
104
+ step: int, last_echoed: str, last_reward: float, history: List[str]
105
+ ) -> str:
106
+ history_block = "\n".join(history[-4:]) if history else "None"
107
+ return textwrap.dedent(
108
+ f"""
109
+ Step: {step}
110
+ Last echoed message: {last_echoed!r}
111
+ Last reward: {last_reward:.2f}
112
+ Previous steps:
113
+ {history_block}
114
+ Send your next message.
115
+ """
116
+ ).strip()
117
+
118
+
119
+ def get_model_message(
120
+ client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]
121
+ ) -> str:
122
+ user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
123
+ try:
124
+ completion = client.chat.completions.create(
125
+ model=MODEL_NAME,
126
+ messages=[
127
+ {"role": "system", "content": SYSTEM_PROMPT},
128
+ {"role": "user", "content": user_prompt},
129
+ ],
130
+ temperature=TEMPERATURE,
131
+ max_tokens=MAX_TOKENS,
132
+ stream=False,
133
+ )
134
+ text = (completion.choices[0].message.content or "").strip()
135
+ return text if text else "hello"
136
+ except Exception as exc:
137
+ print(f"[DEBUG] Model request failed: {exc}", flush=True)
138
+ return "hello"
139
+
140
+
141
+ async def main() -> None:
142
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
143
+
144
+ env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
145
+
146
+ history: List[str] = []
147
+ rewards: List[float] = []
148
+ steps_taken = 0
149
+ score = 0.0
150
+ success = False
151
+
152
+ log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
153
+
154
+ try:
155
+ result = await env.reset() # OpenENV.reset()
156
+ last_echoed = result.observation.echoed_message
157
+ last_reward = 0.0
158
+
159
+ for step in range(1, MAX_STEPS + 1):
160
+ if result.done:
161
+ break
162
+
163
+ message = get_model_message(client, step, last_echoed, last_reward, history)
164
+
165
+ result = await env.step(MyEnvV4Action(message=message))
166
+ obs = result.observation
167
+
168
+ reward = result.reward or 0.0
169
+ done = result.done
170
+ error = None
171
+
172
+ rewards.append(reward)
173
+ steps_taken = step
174
+ last_echoed = obs.echoed_message
175
+ last_reward = reward
176
+
177
+ log_step(step=step, action=message, reward=reward, done=done, error=error)
178
+
179
+ history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
180
+
181
+ if done:
182
+ break
183
+
184
+ score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
185
+ score = min(max(score, 0.0), 1.0) # clamp to [0, 1]
186
+ success = score >= SUCCESS_SCORE_THRESHOLD
187
+
188
+ finally:
189
+ try:
190
+ await env.close()
191
+ except Exception as e:
192
+ print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
193
+ log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
194
+
195
+
196
+ if __name__ == "__main__":
197
+ asyncio.run(main())
inference.py CHANGED
@@ -1,18 +1,47 @@
1
  """
2
- Inference Script for Curator Environment
3
- ============================================
 
4
 
5
- Environment Variables:
 
6
  API_BASE_URL The API endpoint for the LLM.
7
  MODEL_NAME The model identifier to use for inference.
8
  HF_TOKEN Your Hugging Face / API key.
9
- IMAGE_NAME Docker image name for the environment.
10
- CURATOR_TASK Task difficulty: "easy", "medium", or "hard" (default: "easy").
11
 
12
- STDOUT FORMAT:
13
- [START] task=<task_name> env=curator model=<model_name>
 
 
 
 
 
 
 
 
 
 
14
  [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
15
- [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  """
17
 
18
  import asyncio
@@ -26,40 +55,53 @@ from openai import OpenAI
26
  from client import CuratorEnv
27
  from models import CuratorAction
28
 
29
- IMAGE_NAME = os.getenv("IMAGE_NAME")
30
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
 
 
 
 
 
 
 
31
 
32
  API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
33
- MODEL_NAME = os.getenv("MODEL_NAME") or "meta-llama/Llama-3.2-1B-Instruct"
34
- TASK_NAME = os.getenv("CURATOR_TASK", "hard")
 
35
  BENCHMARK = "curator"
36
  TEMPERATURE = 0.3
37
  MAX_TOKENS = 2000
38
  SUCCESS_SCORE_THRESHOLD = 0.3
39
 
 
40
  SYSTEM_PROMPT = textwrap.dedent("""
41
  You are a content curation agent. You help users find the most relevant
42
  articles from a pool of content items based on their interest profile.
43
 
44
- Available actions (respond with valid JSON):
45
 
46
- 1. Filter (remove irrelevant items):
47
  {"action_type": "filter", "item_ids": ["id1", "id2", ...]}
48
 
49
- 2. Categorize items:
50
  {"action_type": "categorize", "categories": {"id1": "urgent", "id2": "skip", ...}}
51
  Categories: "urgent", "read_later", "share", "skip"
52
 
53
- 3. Rank items by relevance:
54
  {"action_type": "rank", "rankings": ["best_id", "second_id", ...]}
55
 
56
- 4. Final recommendation (ends episode):
57
  {"action_type": "recommend", "item_ids": ["id1", "id2", ...]}
58
 
59
- Strategy: First filter out clearly irrelevant items, then rank the remainder,
60
- then recommend the top items.
 
 
 
61
 
62
- IMPORTANT: Respond with ONLY a JSON object, no markdown or explanation.
 
63
  """).strip()
64
 
65
 
@@ -175,6 +217,7 @@ def get_model_action(
175
  client: OpenAI,
176
  obs: Any,
177
  step: int,
 
178
  last_feedback: Optional[str],
179
  messages: List[Dict[str, str]],
180
  ) -> Dict:
@@ -182,6 +225,8 @@ def get_model_action(
182
  user_prompt = build_user_prompt(obs, step, last_feedback)
183
  messages.append({"role": "user", "content": user_prompt})
184
 
 
 
185
  try:
186
  completion = client.chat.completions.create(
187
  model=MODEL_NAME,
@@ -194,29 +239,39 @@ def get_model_action(
194
  messages.append({"role": "assistant", "content": text})
195
  action = parse_action_from_response(text)
196
  if action and "action_type" in action:
 
 
 
 
 
197
  return action
198
  except Exception as exc:
199
  print(f"[DEBUG] Model request failed: {exc}", flush=True)
200
 
201
- # Fallback: recommend first N items from pool
202
  item_ids = [item.id if hasattr(item, "id") else item["id"] for item in obs.items]
203
  k = obs.task_info.recommend_k if obs.task_info else 5
204
- return {"action_type": "recommend", "item_ids": item_ids[:k]}
205
-
 
206
 
207
- async def main() -> None:
208
- llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
209
 
210
- async with CuratorEnv(base_url="http://localhost:8000") as env:
211
- rewards: List[float] = []
212
- steps_taken = 0
213
- score = 0.0
214
- success = False
215
- last_feedback: Optional[str] = None
 
 
 
 
 
216
 
217
- log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
218
 
219
- result = await env.reset(task_id=TASK_NAME)
 
220
  obs = result.observation
221
 
222
  task_info = obs.task_info
@@ -229,7 +284,7 @@ async def main() -> None:
229
  if result.done:
230
  break
231
 
232
- action_dict = get_model_action(llm_client, obs, step, last_feedback, messages)
233
  action = CuratorAction(**action_dict)
234
 
235
  result = await env.step(action)
@@ -265,8 +320,30 @@ async def main() -> None:
265
  score = min(max(score, 0.0), 1.0)
266
  success = score >= SUCCESS_SCORE_THRESHOLD
267
 
 
268
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
269
 
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  if __name__ == "__main__":
272
  asyncio.run(main())
 
1
  """
2
+ Inference Script for Curator Environment (Docker version)
3
+ =========================================================
4
+ Loads the environment from a Docker image via from_docker_image() when LOCAL_IMAGE_NAME (or IMAGE_NAME) is set; otherwise connects via from_env() using HF_SPACE_URL.
5
 
6
+ MANDATORY
7
+ - Before submitting, ensure the following variables are defined in your environment configuration:
8
  API_BASE_URL The API endpoint for the LLM.
9
  MODEL_NAME The model identifier to use for inference.
10
  HF_TOKEN Your Hugging Face / API key.
11
+ LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
12
+ method
13
 
14
+ - Defaults are set only for API_BASE_URL and MODEL_NAME
15
+ (and should reflect your active inference setup):
16
+ API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
17
+ MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
18
+
19
+ - The inference script must be named `inference.py` and placed in the root directory of the project
20
+ - Participants must use OpenAI Client for all LLM calls using above variables
21
+
22
+ STDOUT FORMAT
23
+ - The script must emit exactly three line types to stdout, in this order:
24
+
25
+ [START] task=<task_name> env=<benchmark> model=<model_name>
26
  [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
27
+ [END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
28
+
29
+ Rules:
30
+ - One [START] line at episode begin.
31
+ - One [STEP] line per step, immediately after env.step() returns.
32
+ - One [END] line after env.close(), always emitted (even on exception).
33
+ - reward and rewards are formatted to 2 decimal places.
34
+ - done and success are lowercase booleans: true or false.
35
+ - error is the raw last_action_error string, or null if none.
36
+ - All fields on a single line with no newlines within a line.
37
+ - Each task should return a score in [0, 1]
38
+
39
+ Example:
40
+ [START] task=click-test env=miniwob model=Qwen3-VL-30B
41
+ [STEP] step=1 action=click('123') reward=0.00 done=false error=null
42
+ [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
43
+ [STEP] step=3 action=click('789') reward=1.00 done=true error=null
44
+ [END] success=true steps=3 score=1.00 rewards=0.00,0.00,1.00
45
  """
46
 
47
  import asyncio
 
55
  from client import CuratorEnv
56
  from models import CuratorAction
57
 
58
+ HF_SPACE_URL = "https://huggingface.co/spaces/kgdrathan/openenv-curator"
59
+
60
+ LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME") or os.getenv("IMAGE_NAME")
61
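+ API_KEY = "<REDACTED>"  # NOTE(review): a hardcoded API key was committed here; revoke it and never commit secrets. This value is dead anyway: API_KEY is reassigned from os.getenv() below.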
62
+
63
+ API_BASE_URL = "https://openrouter.ai/api/v1"
64
+ MODEL_NAME = "nvidia/nemotron-3-nano-30b-a3b:free"
65
+
66
+ API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")
67
 
68
  API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
69
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
70
+
71
+ TASK_NAMES = os.getenv("CURATOR_TASKS", "easy,medium,hard").split(",")
72
  BENCHMARK = "curator"
73
  TEMPERATURE = 0.3
74
  MAX_TOKENS = 2000
75
  SUCCESS_SCORE_THRESHOLD = 0.3
76
 
77
+
78
  SYSTEM_PROMPT = textwrap.dedent("""
79
  You are a content curation agent. You help users find the most relevant
80
  articles from a pool of content items based on their interest profile.
81
 
82
+ Available actions (respond with ONE JSON object, nothing else):
83
 
84
+ 1. Filter remove irrelevant items from the pool:
85
  {"action_type": "filter", "item_ids": ["id1", "id2", ...]}
86
 
87
+ 2. Categorize — tag items by priority:
88
  {"action_type": "categorize", "categories": {"id1": "urgent", "id2": "skip", ...}}
89
  Categories: "urgent", "read_later", "share", "skip"
90
 
91
+ 3. Rank — order remaining items by relevance (best first):
92
  {"action_type": "rank", "rankings": ["best_id", "second_id", ...]}
93
 
94
+ 4. Recommend final selection (ENDS the episode):
95
  {"action_type": "recommend", "item_ids": ["id1", "id2", ...]}
96
 
97
+ STRATEGY you MUST follow these steps in order across multiple turns:
98
+ Step 1: FILTER out clearly irrelevant items (low match to user interests).
99
+ Step 2: CATEGORIZE remaining items based on relevance to the user profile.
100
+ Step 3: RANK the remaining items by relevance (best first).
101
+ Step 4: Only RECOMMEND when you are confident in your top picks.
102
 
103
+ DO NOT use "recommend" until you have filtered, categorized, and ranked.
104
+ IMPORTANT: Respond with ONLY a single JSON object per turn. No markdown, no explanation.
105
  """).strip()
106
 
107
 
 
217
  client: OpenAI,
218
  obs: Any,
219
  step: int,
220
+ max_steps: int,
221
  last_feedback: Optional[str],
222
  messages: List[Dict[str, str]],
223
  ) -> Dict:
 
225
  user_prompt = build_user_prompt(obs, step, last_feedback)
226
  messages.append({"role": "user", "content": user_prompt})
227
 
228
+ has_prior_work = obs.task_info and (obs.task_info.items_filtered > 0 or obs.task_info.items_categorized > 0)
229
+
230
  try:
231
  completion = client.chat.completions.create(
232
  model=MODEL_NAME,
 
239
  messages.append({"role": "assistant", "content": text})
240
  action = parse_action_from_response(text)
241
  if action and "action_type" in action:
242
+ # Block recommend on step 1 with no prior work: if it carries item_ids, convert it into a rank over those same ids
243
+ if action["action_type"] == "recommend" and step == 1 and not has_prior_work:
244
+ item_ids = action.get("item_ids", [])
245
+ if item_ids:
246
+ return {"action_type": "rank", "rankings": item_ids}
247
  return action
248
  except Exception as exc:
249
  print(f"[DEBUG] Model request failed: {exc}", flush=True)
250
 
251
+ # Fallback: recommend when step >= max_steps - 1 or prior filter/categorize work exists; otherwise rank
252
  item_ids = [item.id if hasattr(item, "id") else item["id"] for item in obs.items]
253
  k = obs.task_info.recommend_k if obs.task_info else 5
254
+ if step >= max_steps - 1 or has_prior_work:
255
+ return {"action_type": "recommend", "item_ids": item_ids[:k]}
256
+ return {"action_type": "rank", "rankings": item_ids[:k]}
257
 
 
 
258
 
259
+ async def run_episode(
260
+ env: Any,
261
+ client: OpenAI,
262
+ task_name: str,
263
+ ) -> None:
264
+ """Run one full episode for *task_name*, emitting [START] / [STEP]* / [END]."""
265
+ rewards: List[float] = []
266
+ steps_taken = 0
267
+ score = 0.0
268
+ success = False
269
+ last_feedback: Optional[str] = None
270
 
271
+ log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
272
 
273
+ try:
274
+ result = await env.reset(task_id=task_name)
275
  obs = result.observation
276
 
277
  task_info = obs.task_info
 
284
  if result.done:
285
  break
286
 
287
+ action_dict = get_model_action(client, obs, step, max_steps, last_feedback, messages)
288
  action = CuratorAction(**action_dict)
289
 
290
  result = await env.step(action)
 
320
  score = min(max(score, 0.0), 1.0)
321
  success = score >= SUCCESS_SCORE_THRESHOLD
322
 
323
+ finally:
324
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
325
 
326
 
327
+ async def main() -> None:
328
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
329
+
330
+ if LOCAL_IMAGE_NAME:
331
+ env = await CuratorEnv.from_docker_image(LOCAL_IMAGE_NAME)
332
+ else:
333
+ env = await CuratorEnv.from_env(
334
+ HF_SPACE_URL,
335
+ use_docker=False,
336
+ )
337
+
338
+ try:
339
+ for task_name in TASK_NAMES:
340
+ await run_episode(env, client, task_name)
341
+ finally:
342
+ try:
343
+ await env.close()
344
+ except Exception as e:
345
+ print(f"[DEBUG] env.close() error (cleanup): {e}", flush=True)
346
+
347
+
348
  if __name__ == "__main__":
349
  asyncio.run(main())
test_submission.sh ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # test_submission.sh — Local end-to-end test for Curator Environment
4
+ #
5
+ # Starts the server, runs inference against all 3 tasks, validates output format,
6
+ # and optionally runs pre-validation.sh.
7
+ #
8
+ # Usage:
9
+ # bash test_submission.sh
10
+ # bash test_submission.sh --skip-inference # skip the LLM inference step (the step that needs an API key)
11
+ # bash test_submission.sh --skip-prevalidation # skip pre-validation.sh
12
+ #
13
+ # Required env vars for inference:
14
+ # HF_TOKEN or API_KEY — LLM API key
15
+ #
16
+ # Optional env vars:
17
+ # API_BASE_URL — LLM endpoint (default: https://router.huggingface.co/v1)
18
+ # MODEL_NAME — model to use (default: Qwen/Qwen2.5-72B-Instruct, as set in inference.py)
19
+ # HF_SPACE_URL — your HF Space URL for pre-validation (default: https://kgdrathan-openenv-curator.hf.space)
20
+
21
+ set -uo pipefail
22
+
23
+ REPO_DIR="$(cd "$(dirname "$0")" && pwd)"
24
+ cd "$REPO_DIR"
25
+
26
+ HF_SPACE_URL="${HF_SPACE_URL:-https://kgdrathan-openenv-curator.hf.space}"
27
+ SERVER_PORT=8000
28
+ SERVER_PID=""
29
+ SKIP_INFERENCE=false
30
+ SKIP_PREVALIDATION=false
31
+
32
+ for arg in "$@"; do
33
+ case "$arg" in
34
+ --skip-inference) SKIP_INFERENCE=true ;;
35
+ --skip-prevalidation) SKIP_PREVALIDATION=true ;;
36
+ esac
37
+ done
38
+
39
+ # Colors
40
+ if [ -t 1 ]; then
41
+ RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BOLD='\033[1m'; NC='\033[0m'
42
+ else
43
+ RED=''; GREEN=''; YELLOW=''; BOLD=''; NC=''
44
+ fi
45
+
46
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
47
+ pass() { log "${GREEN}PASS${NC} -- $1"; }
48
+ fail() { log "${RED}FAIL${NC} -- $1"; }
49
+ warn() { log "${YELLOW}WARN${NC} -- $1"; }
50
+
51
+ cleanup() {
52
+ if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
53
+ log "Stopping server (PID $SERVER_PID)..."
54
+ kill "$SERVER_PID" 2>/dev/null
55
+ wait "$SERVER_PID" 2>/dev/null
56
+ fi
57
+ }
58
+ trap cleanup EXIT
59
+
60
+ PASS_COUNT=0
61
+ FAIL_COUNT=0
62
+
63
+ # =========================================================================
64
+ printf "\n${BOLD}========================================${NC}\n"
65
+ printf "${BOLD} Curator Submission Test Suite${NC}\n"
66
+ printf "${BOLD}========================================${NC}\n\n"
67
+
68
+ # =========================================================================
69
+ # Step 1: Check required files
70
+ # =========================================================================
71
+ log "${BOLD}Step 1/6: Checking required files${NC}"
72
+
73
+ REQUIRED_FILES=(
74
+ "inference.py"
75
+ "client.py"
76
+ "models.py"
77
+ "openenv.yaml"
78
+ "pyproject.toml"
79
+ "uv.lock"
80
+ "server/app.py"
81
+ "server/curator_environment.py"
82
+ "server/grader.py"
83
+ "data/tasks.json"
84
+ "data/items.json"
85
+ "data/ground_truth.json"
86
+ "Dockerfile"
87
+ )
88
+
89
+ FILES_OK=true
90
+ for f in "${REQUIRED_FILES[@]}"; do
91
+ if [ ! -f "$REPO_DIR/$f" ]; then
92
+ fail "Missing required file: $f"
93
+ FILES_OK=false
94
+ FAIL_COUNT=$((FAIL_COUNT + 1))
95
+ fi
96
+ done
97
+ if [ "$FILES_OK" = true ]; then
98
+ pass "All required files present (${#REQUIRED_FILES[@]} files)"
99
+ PASS_COUNT=$((PASS_COUNT + 1))
100
+ fi
101
+
102
+ # =========================================================================
103
+ # Step 2: Check env vars
104
+ # =========================================================================
105
+ log "${BOLD}Step 2/6: Checking environment variables${NC}"
106
+
107
+ API_KEY="${HF_TOKEN:-${API_KEY:-}}"
108
+ if [ -z "$API_KEY" ]; then
109
+ warn "HF_TOKEN / API_KEY not set — inference step will be skipped"
110
+ SKIP_INFERENCE=true
111
+ else
112
+ pass "API key is set"
113
+ PASS_COUNT=$((PASS_COUNT + 1))
114
+ fi
115
+
116
+ log " API_BASE_URL = ${API_BASE_URL:-https://router.huggingface.co/v1 (default)}"
117
+ log " MODEL_NAME = ${MODEL_NAME:-meta-llama/Llama-3.2-1B-Instruct (default)}"
118
+
119
+ # =========================================================================
120
+ # Step 3: openenv validate (local)
121
+ # =========================================================================
122
+ log "${BOLD}Step 3/6: Running openenv validate${NC}"
123
+
124
+ if ! command -v openenv &>/dev/null && ! uv run openenv validate --help &>/dev/null 2>&1; then
125
+ warn "openenv CLI not found — skipping local validation"
126
+ else
127
+ VALIDATE_CMD="openenv validate"
128
+ if ! command -v openenv &>/dev/null; then
129
+ VALIDATE_CMD="uv run openenv validate"
130
+ fi
131
+ VALIDATE_OUTPUT=$($VALIDATE_CMD 2>&1) && VALIDATE_OK=true || VALIDATE_OK=false
132
+
133
+ if [ "$VALIDATE_OK" = true ]; then
134
+ pass "openenv validate passed"
135
+ PASS_COUNT=$((PASS_COUNT + 1))
136
+ else
137
+ fail "openenv validate failed"
138
+ printf "%s\n" "$VALIDATE_OUTPUT"
139
+ FAIL_COUNT=$((FAIL_COUNT + 1))
140
+ fi
141
+ fi
142
+
143
+ # =========================================================================
144
+ # Step 4: Start server
145
+ # =========================================================================
146
+ log "${BOLD}Step 4/6: Starting local server${NC}"
147
+
148
+ # Check if port is already in use
149
+ if curl -s -o /dev/null -w "%{http_code}" http://localhost:$SERVER_PORT/health --max-time 2 2>/dev/null | grep -q "200"; then
150
+ warn "Server already running on port $SERVER_PORT — using existing server"
151
+ else
152
+ uv run uvicorn server.app:app --host 0.0.0.0 --port $SERVER_PORT &>"$REPO_DIR/.test_server.log" &
153
+ SERVER_PID=$!
154
+ log " Server starting (PID $SERVER_PID)..."
155
+
156
+ # Wait for server to be ready
157
+ READY=false
158
+ for i in $(seq 1 30); do
159
+ if curl -s -o /dev/null -w "%{http_code}" http://localhost:$SERVER_PORT/health --max-time 2 2>/dev/null | grep -q "200"; then
160
+ READY=true
161
+ break
162
+ fi
163
+ sleep 1
164
+ done
165
+
166
+ if [ "$READY" = true ]; then
167
+ pass "Server is up and healthy"
168
+ PASS_COUNT=$((PASS_COUNT + 1))
169
+ else
170
+ fail "Server failed to start within 30s"
171
+ log " Last 20 lines of server log:"
172
+ tail -20 "$REPO_DIR/.test_server.log" 2>/dev/null
173
+ FAIL_COUNT=$((FAIL_COUNT + 1))
174
+ printf "\n${RED}${BOLD}Cannot continue without server. Exiting.${NC}\n\n"
175
+ exit 1
176
+ fi
177
+ fi
178
+
179
+ # Quick endpoint checks
180
+ log " Checking endpoints..."
181
+ for ENDPOINT in "/health" "/schema" "/metadata"; do
182
+ HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:$SERVER_PORT$ENDPOINT --max-time 5 2>/dev/null)
183
+ if [ "$HTTP_CODE" = "200" ]; then
184
+ log " GET $ENDPOINT -> $HTTP_CODE OK"
185
+ else
186
+ warn " GET $ENDPOINT -> $HTTP_CODE (expected 200)"
187
+ fi
188
+ done
189
+
190
+ RESET_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST -H "Content-Type: application/json" -d '{}' http://localhost:$SERVER_PORT/reset --max-time 10 2>/dev/null)
191
+ if [ "$RESET_CODE" = "200" ]; then
192
+ log " POST /reset -> $RESET_CODE OK"
193
+ else
194
+ warn " POST /reset -> $RESET_CODE (expected 200)"
195
+ fi
196
+
197
+ # =========================================================================
198
+ # Step 5: Run inference
199
+ # =========================================================================
200
+ log "${BOLD}Step 5/6: Running inference (all 3 tasks)${NC}"
201
+
202
+ if [ "$SKIP_INFERENCE" = true ]; then
203
+ warn "Inference skipped (no API key or --skip-inference)"
204
+ else
205
+ INFERENCE_OUTPUT="$REPO_DIR/.test_inference_output.txt"
206
+ INFERENCE_EXIT=0
207
+ LOCAL_IMAGE_NAME="http://localhost:$SERVER_PORT" uv run python inference.py >"$INFERENCE_OUTPUT" 2>"$REPO_DIR/.test_inference_stderr.txt" || INFERENCE_EXIT=$?
208
+
209
+ if [ $INFERENCE_EXIT -ne 0 ]; then
210
+ fail "inference.py exited with code $INFERENCE_EXIT"
211
+ log " stderr:"
212
+ tail -20 "$REPO_DIR/.test_inference_stderr.txt" 2>/dev/null
213
+ FAIL_COUNT=$((FAIL_COUNT + 1))
214
+ else
215
+ pass "inference.py exited successfully"
216
+ PASS_COUNT=$((PASS_COUNT + 1))
217
+ fi
218
+
219
+ # Validate output format
220
+ log " Validating stdout format..."
221
+ printf "\n --- inference.py stdout ---\n"
222
+ cat "$INFERENCE_OUTPUT"
223
+ printf " --- end stdout ---\n\n"
224
+
225
+ START_COUNT=$(grep -c '^\[START\]' "$INFERENCE_OUTPUT" 2>/dev/null || echo 0)
226
+ END_COUNT=$(grep -c '^\[END\]' "$INFERENCE_OUTPUT" 2>/dev/null || echo 0)
227
+ STEP_COUNT=$(grep -c '^\[STEP\]' "$INFERENCE_OUTPUT" 2>/dev/null || echo 0)
228
+
229
+ log " [START] lines: $START_COUNT (expected: 3)"
230
+ log " [STEP] lines: $STEP_COUNT"
231
+ log " [END] lines: $END_COUNT (expected: 3)"
232
+
233
+ if [ "$START_COUNT" -ge 3 ] && [ "$END_COUNT" -ge 3 ]; then
234
+ pass "All 3 tasks produced [START]/[END] blocks"
235
+ PASS_COUNT=$((PASS_COUNT + 1))
236
+ else
237
+ fail "Expected 3 [START] and 3 [END] lines, got $START_COUNT/$END_COUNT"
238
+ FAIL_COUNT=$((FAIL_COUNT + 1))
239
+ fi
240
+
241
+ # Check each [END] has score in [0,1]
242
+ SCORE_OK=true
243
+ while IFS= read -r line; do
244
+ SCORE=$(echo "$line" | grep -oE 'score=[0-9]+\.[0-9]+' | head -1 | cut -d= -f2)
245
+ if [ -n "$SCORE" ]; then
246
+ # Check score is between 0 and 1 (using awk for float comparison)
247
+ IN_RANGE=$(awk "BEGIN { print ($SCORE >= 0.0 && $SCORE <= 1.0) ? 1 : 0 }")
248
+ if [ "$IN_RANGE" != "1" ]; then
249
+ fail "Score $SCORE is not in [0, 1]: $line"
250
+ SCORE_OK=false
251
+ fi
252
+ fi
253
+ done < <(grep '^\[END\]' "$INFERENCE_OUTPUT" 2>/dev/null)
254
+
255
+ if [ "$SCORE_OK" = true ] && [ "$END_COUNT" -ge 3 ]; then
256
+ pass "All scores in [0, 1]"
257
+ PASS_COUNT=$((PASS_COUNT + 1))
258
+ fi
259
+
260
+ # Check tasks: easy, medium, hard
261
+ for TASK in easy medium hard; do
262
+ if grep -q "\[START\] task=$TASK " "$INFERENCE_OUTPUT" 2>/dev/null; then
263
+ log " task=$TASK found"
264
+ else
265
+ fail "Missing [START] for task=$TASK"
266
+ FAIL_COUNT=$((FAIL_COUNT + 1))
267
+ fi
268
+ done
269
+
270
+ # Check no non-protocol lines on stdout
271
+ NON_PROTOCOL=$(grep -cvE '^\[(START|STEP|END)\]' "$INFERENCE_OUTPUT" 2>/dev/null || echo 0)
272
+ if [ "$NON_PROTOCOL" -gt 0 ]; then
273
+ warn "$NON_PROTOCOL non-protocol lines found on stdout (should only have [START]/[STEP]/[END])"
274
+ fi
275
+ fi
276
+
277
+ # =========================================================================
278
+ # Step 6: Pre-validation script (optional)
279
+ # =========================================================================
280
+ log "${BOLD}Step 6/6: Running pre-validation.sh${NC}"
281
+
282
+ if [ "$SKIP_PREVALIDATION" = true ]; then
283
+ warn "Pre-validation skipped (--skip-prevalidation)"
284
+ elif [ ! -f "$REPO_DIR/pre-validation.sh" ]; then
285
+ warn "pre-validation.sh not found — skipping"
286
+ else
287
+ # pre-validation.sh needs a live HF Space URL; use local server as fallback
288
+ PREVALIDATION_OUTPUT=$(bash "$REPO_DIR/pre-validation.sh" "http://localhost:$SERVER_PORT" "$REPO_DIR" 2>&1) && PREVAL_OK=true || PREVAL_OK=false
289
+
290
+ if [ "$PREVAL_OK" = true ]; then
291
+ pass "pre-validation.sh passed"
292
+ PASS_COUNT=$((PASS_COUNT + 1))
293
+ else
294
+ fail "pre-validation.sh failed"
295
+ printf "%s\n" "$PREVALIDATION_OUTPUT"
296
+ FAIL_COUNT=$((FAIL_COUNT + 1))
297
+ fi
298
+ fi
299
+
300
+ # =========================================================================
301
+ # Summary
302
+ # =========================================================================
303
+ printf "\n${BOLD}========================================${NC}\n"
304
+ TOTAL=$((PASS_COUNT + FAIL_COUNT))
305
+ if [ "$FAIL_COUNT" -eq 0 ]; then
306
+ printf "${GREEN}${BOLD} All $PASS_COUNT checks passed!${NC}\n"
307
+ else
308
+ printf "${RED}${BOLD} $FAIL_COUNT/$TOTAL checks failed.${NC}\n"
309
+ fi
310
+ printf "${BOLD}========================================${NC}\n\n"
311
+
312
+ # Cleanup temp files
313
+ rm -f "$REPO_DIR/.test_server.log" "$REPO_DIR/.test_inference_output.txt" "$REPO_DIR/.test_inference_stderr.txt"
314
+
315
+ exit $FAIL_COUNT