Files changed (1) hide show
  1. inference.py +245 -0
inference.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference Script Example
3
+ ===================================
4
+ MANDATORY
5
+ - Before submitting, ensure the following variables are defined in your environment configuration:
6
+ API_BASE_URL The API endpoint for the LLM.
7
+ MODEL_NAME The model identifier to use for inference.
8
+ HF_TOKEN Your Hugging Face / API key.
9
+ LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
10
+ method
11
+
12
+ - Defaults are set only for API_BASE_URL and MODEL_NAME
13
+ (and should reflect your active inference setup):
14
+ API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
15
+ MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
16
+
17
+ - The inference script must be named `inference.py` and placed in the root directory of the project
18
+ - Participants must use OpenAI Client for all LLM calls using above variables
19
+
20
+ STDOUT FORMAT
21
+ - The script must emit exactly three line types to stdout, in this order:
22
+
23
+ [START] task=<task_name> env=<benchmark> model=<model_name>
24
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
25
+ [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
26
+
27
+ Rules:
28
+ - One [START] line at episode begin.
29
+ - One [STEP] line per step, immediately after env.step() returns.
30
+ - One [END] line after env.close(), always emitted (even on exception).
31
+ - reward and rewards are formatted to 2 decimal places.
32
+ - done and success are lowercase booleans: true or false.
33
+ - error is the raw last_action_error string, or null if none.
34
+ - All fields on a single line with no newlines within a line.
35
+
36
+ Example:
37
+ [START] task=click-test env=miniwob model=Qwen3-VL-30B
38
+ [STEP] step=1 action=click('123') reward=0.00 done=false error=null
39
+ [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
40
+ [STEP] step=3 action=click('789') reward=1.00 done=true error=null
41
+ [END] success=true steps=3 rewards=0.00,0.00,1.00
42
+ """
43
+
44
+ import asyncio
45
+ import os
46
+ import textwrap
47
+ from typing import List, Optional
48
+
49
+ from openai import OpenAI
50
+ from dotenv import load_dotenv
51
+
52
+ # Load environment variables from .env file if present
53
+ load_dotenv()
54
+
55
+ from code_assessment_env import CodeAssessmentAction, CodeAssessmentEnv
56
# ---------------------------------------------------------------------------
# Runtime configuration, read once at import time.
# ---------------------------------------------------------------------------

# Image name handed to CodeAssessmentEnv.from_docker_image(); None when unset.
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
# HF_TOKEN wins; API_KEY is only consulted when HF_TOKEN is unset or empty.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")

# `or` (instead of a getenv default) also falls back when the variable is set
# but empty.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
TASK_NAME = os.getenv("TASK_NAME", "code_output_assessment")
BENCHMARK = os.getenv("BENCHMARK", "first_rl_proj")
MAX_STEPS = 15  # upper bound on env.step() calls per episode
TEMPERATURE = 0.7
MAX_TOKENS = 200
SUCCESS_SCORE_THRESHOLD = 0.5  # normalized score in [0, 1]

# Max possible reward with normalized grading (0-1) × difficulty multipliers:
# Easy (1x): ~5 problems × 1.0 = 5.0
# Medium (2x): ~5 problems × 2.0 = 10.0
# Hard (5x): ~5 problems × 5.0 = 25.0
# Streak bonuses: ~3-4 bonuses × 0.5 = 1.5-2.0
# Total possible: ~40.0 with perfect performance
MAX_TOTAL_REWARD = 40.0
75
+
76
# System message sent with every chat-completion request. The literal is
# indented uniformly so textwrap.dedent() can remove the margin; .strip() drops
# the leading/trailing blank lines introduced by the triple quotes.
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are solving coding problems at different difficulty levels.

    For each problem:
    1. Read the problem description carefully
    2. Look at the test case input provided
    3. Calculate or determine the correct output
    4. Respond with ONLY the answer - no explanations, just the exact output value

    Examples:
    - If asked to add "3,5", respond: 8
    - If asked to reverse "hello", respond: olleh
    - If asked for palindrome check "racecar", respond: true

    Be precise with formatting:
    - For lists, use comma-separated values: "1,2,3"
    - For true/false, use lowercase: "true" or "false"
    - For numbers, no extra spaces or characters

    You'll get higher rewards for:
    - Correct answers (especially on hard problems)
    - Maintaining a streak of correct answers
    - Solving problems quickly

    Focus on accuracy. Partial credit is available for close answers.
    """
).strip()
104
+
105
+
106
def log_start(task: str, env: str, model: str) -> None:
    """Print the single ``[START]`` line that opens an episode.

    Args:
        task: Task name reported in the ``task=`` field.
        env: Benchmark/environment name reported in the ``env=`` field.
        model: Model identifier reported in the ``model=`` field.
    """
    # flush=True so the line is written immediately rather than sitting in
    # the stdout buffer.
    print(f"[START] task={task} env={env} model={model}", flush=True)
108
+
109
+
110
def _single_line(text: str) -> str:
    """Return *text* with every line break replaced by a single space."""
    return " ".join(str(text).splitlines())


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` line describing the outcome of a single env step.

    Args:
        step: 1-based step counter.
        action: Human-readable description of the submitted action.
        reward: Reward for this step; printed with two decimals.
        done: Whether the episode finished; printed as ``true``/``false``.
        error: Error text for the step, or ``None``/empty for ``null``.
    """
    # The action text embeds raw model output, which may span several lines;
    # collapse line breaks so the record stays on exactly one stdout line.
    action_val = _single_line(action)
    error_val = (_single_line(error) if error else "") or "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={action_val} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )
117
+
118
+
119
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the single ``[END]`` line that closes an episode.

    Args:
        success: Overall outcome; printed as ``true``/``false``.
        steps: Number of steps that were actually taken.
        score: Normalized score; printed with three decimals.
        rewards: Per-step rewards; printed comma-separated with two decimals.
    """
    # NOTE(review): the module docstring's [END] template lists only
    # success/steps/rewards, while this line also prints score= — confirm
    # which form the consumer of stdout expects.
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
        flush=True,
    )
122
+
123
+
124
def build_user_prompt(
    step: int,
    problem: str,
    test_input: str,
    difficulty: str,
    feedback: str,
    is_correct: bool,
    streak: int,
    problems_solved: int,
    max_steps: Optional[int] = None,
) -> str:
    """Build the per-step user message sent to the model.

    Args:
        step: 1-based index of the current step.
        problem: Problem description text.
        test_input: Test-case input the model must evaluate.
        difficulty: Difficulty label; shown upper-cased.
        feedback: Feedback text shown when the previous answer was not correct.
        is_correct: Whether the previous answer was marked correct.
        streak: Current streak counter shown in the header.
        problems_solved: Solved-problem counter shown in the header.
        max_steps: Step budget shown in the header; defaults to ``MAX_STEPS``.

    Returns:
        The prompt text with no leading or trailing whitespace.
    """
    status = "✓ CORRECT!" if is_correct else feedback
    total_steps = MAX_STEPS if max_steps is None else max_steps

    # Assemble the lines explicitly rather than running textwrap.dedent() on an
    # interpolated f-string: dedent strips only the whitespace common to ALL
    # lines, so a multi-line problem/feedback with unindented lines would leave
    # the template's own indentation in the prompt.
    lines = [
        f"Step {step}/{total_steps} | Difficulty: {difficulty.upper()} | Solved: {problems_solved} | Streak: {streak}",
        "",
        f"Problem: {problem}",
        f"Test Input: {test_input}",
        "",
        f"Previous Feedback: {status}",
        "",
        "What is the output? (respond with just the answer)",
    ]
    return "\n".join(lines).strip()
148
+
149
+
150
def get_model_answer(
    client: OpenAI,
    step: int,
    problem: str,
    test_input: str,
    difficulty: str,
    feedback: str,
    is_correct: bool,
    streak: int,
    problems_solved: int,
) -> str:
    """Ask the model for an answer to the current problem.

    Builds the user prompt from the observation fields, sends it together with
    ``SYSTEM_PROMPT`` in one non-streaming chat-completion request, and returns
    the reply text.

    Args:
        client: OpenAI client used for the request.
        step: 1-based index of the current step.
        problem: Problem description text.
        test_input: Test-case input for the problem.
        difficulty: Difficulty label of the problem.
        feedback: Feedback text from the previous step.
        is_correct: Whether the previous answer was marked correct.
        streak: Current streak counter.
        problems_solved: Solved-problem counter.

    Returns:
        The stripped reply text, or ``"0"`` when the reply is empty or the
        request raises.
    """
    user_prompt = build_user_prompt(
        step, problem, test_input, difficulty, feedback, is_correct, streak, problems_solved
    )
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        # message.content may be None; treat that the same as an empty reply.
        text = (completion.choices[0].message.content or "").strip()
        return text if text else "0"
    except Exception as exc:
        # Deliberate best-effort: a failed request must not abort the episode,
        # so report it and submit the placeholder answer instead.
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return "0"
178
+
179
+
180
async def main() -> None:
    """Run one episode against the environment and emit the stdout protocol.

    Prints ``[START]``, then one ``[STEP]`` line per ``env.step()`` call, and
    finally ``[END]``. The ``[END]`` line is printed from the ``finally``
    clause, so it is emitted even when client creation, environment start-up,
    or a step raises; the exception itself still propagates afterwards.
    """
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    env = None  # stays None if start-up fails, so cleanup knows to skip close()

    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)

    try:
        # Client and environment are created inside the try block so that a
        # start-up failure still reaches the finally clause and logs [END].
        client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
        env = await CodeAssessmentEnv.from_docker_image(LOCAL_IMAGE_NAME)

        result = await env.reset()
        obs = result.observation

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            # Get model's answer for the current problem
            answer = get_model_answer(
                client=client,
                step=step,
                problem=obs.problem_description,
                test_input=obs.test_case_input,
                difficulty=obs.difficulty,
                feedback=obs.feedback,
                is_correct=obs.is_correct,
                streak=obs.current_streak,
                problems_solved=obs.problems_solved,
            )

            # Submit answer
            result = await env.step(CodeAssessmentAction(answer=answer))
            obs = result.observation

            # A missing (None) reward is counted as 0.0.
            reward = result.reward or 0.0
            done = result.done
            # NOTE(review): no per-step error is read from the result here, so
            # error= is always logged as null — confirm the env exposes none.
            error = None

            rewards.append(reward)
            steps_taken = step

            # Log step with problem info
            action_str = f"answer='{answer}' | correct={obs.is_correct} | difficulty={obs.difficulty}"
            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        # Calculate normalized score
        score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
        success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        if env is not None:
            try:
                await env.close()
            except Exception as e:
                # Cleanup is best-effort; a close() failure must not suppress [END].
                print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
242
+
243
+
244
# Script entry point: run the async episode driver only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())