Files changed (1) hide show
  1. inferency.py +0 -245
inferency.py DELETED
@@ -1,245 +0,0 @@
1
- """
2
- Inference Script Example
3
- ===================================
4
- MANDATORY
5
- - Before submitting, ensure the following variables are defined in your environment configuration:
6
- API_BASE_URL The API endpoint for the LLM.
7
- MODEL_NAME The model identifier to use for inference.
8
- HF_TOKEN Your Hugging Face / API key.
9
- LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
10
- method
11
-
12
- - Defaults are set only for API_BASE_URL and MODEL_NAME
13
- (and should reflect your active inference setup):
14
- API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
15
- MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
16
-
17
- - The inference script must be named `inference.py` and placed in the root directory of the project
18
- - Participants must use OpenAI Client for all LLM calls using above variables
19
-
20
- STDOUT FORMAT
21
- - The script must emit exactly three line types to stdout, in this order:
22
-
23
- [START] task=<task_name> env=<benchmark> model=<model_name>
24
- [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
25
- [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
26
-
27
- Rules:
28
- - One [START] line at episode begin.
29
- - One [STEP] line per step, immediately after env.step() returns.
30
- - One [END] line after env.close(), always emitted (even on exception).
31
- - reward and rewards are formatted to 2 decimal places.
32
- - done and success are lowercase booleans: true or false.
33
- - error is the raw last_action_error string, or null if none.
34
- - All fields on a single line with no newlines within a line.
35
-
36
- Example:
37
- [START] task=click-test env=miniwob model=Qwen3-VL-30B
38
- [STEP] step=1 action=click('123') reward=0.00 done=false error=null
39
- [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
40
- [STEP] step=3 action=click('789') reward=1.00 done=true error=null
41
- [END] success=true steps=3 rewards=0.00,0.00,1.00
42
- """
43
-
44
- import asyncio
45
- import os
46
- import textwrap
47
- from typing import List, Optional
48
-
49
- from openai import OpenAI
50
- from dotenv import load_dotenv
51
-
52
- # Load environment variables from .env file if present
53
- load_dotenv()
54
-
55
- from code_assessment_env import CodeAssessmentAction, CodeAssessmentEnv
56
# --- Runtime configuration -------------------------------------------------
# Docker image used to start the environment. The module docstring tells
# users to set LOCAL_IMAGE_NAME, so that variable is honoured first;
# IMAGE_NAME is kept as a backward-compatible fallback.
IMAGE_NAME = (
    os.getenv("LOCAL_IMAGE_NAME")
    or os.getenv("IMAGE_NAME")
    or "code_assessment_env:latest"
)
# API key for the OpenAI-compatible client: HF_TOKEN wins, API_KEY is the fallback.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")

# `or` (rather than a getenv default) also replaces empty-string values.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
TASK_NAME = os.getenv("TASK_NAME", "code_output_assessment")
BENCHMARK = os.getenv("BENCHMARK", "first_rl_proj")
MAX_STEPS = 15  # upper bound on env.step() calls per episode
TEMPERATURE = 0.7  # sampling temperature passed to the chat completion call
MAX_TOKENS = 200  # completion length cap passed to the chat completion call
SUCCESS_SCORE_THRESHOLD = 0.5  # normalized score in [0, 1]

# Max possible reward with normalized grading (0-1) × difficulty multipliers:
#   Easy   (1x): ~5 problems × 1.0 = 5.0
#   Medium (2x): ~5 problems × 2.0 = 10.0
#   Hard   (5x): ~5 problems × 5.0 = 25.0
#   Streak bonuses: ~3-4 bonuses × 0.5 = 1.5-2.0
# Total possible: ~40.0 with perfect performance
MAX_TOTAL_REWARD = 40.0
75
-
76
# System message sent with every chat completion request. It is assembled
# from explicit lines so the final text does not depend on source indentation.
SYSTEM_PROMPT = "\n".join(
    (
        "You are solving coding problems at different difficulty levels.",
        "",
        "For each problem:",
        "1. Read the problem description carefully",
        "2. Look at the test case input provided",
        "3. Calculate or determine the correct output",
        "4. Respond with ONLY the answer - no explanations, just the exact output value",
        "",
        "Examples:",
        '- If asked to add "3,5", respond: 8',
        '- If asked to reverse "hello", respond: olleh',
        '- If asked for palindrome check "racecar", respond: true',
        "",
        "Be precise with formatting:",
        '- For lists, use comma-separated values: "1,2,3"',
        '- For true/false, use lowercase: "true" or "false"',
        "- For numbers, no extra spaces or characters",
        "",
        "You'll get higher rewards for:",
        "- Correct answers (especially on hard problems)",
        "- Maintaining a streak of correct answers",
        "- Solving problems quickly",
        "",
        "Focus on accuracy. Partial credit is available for close answers.",
    )
)
104
-
105
-
106
def log_start(task: str, env: str, model: str) -> None:
    """Print the single ``[START]`` record that opens an episode.

    Args:
        task: Task name written to the ``task=`` field.
        env: Benchmark name written to the ``env=`` field.
        model: Model identifier written to the ``model=`` field.
    """
    header = "[START] task={} env={} model={}".format(task, env, model)
    print(header, flush=True)
108
-
109
-
110
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` record on a single stdout line.

    The documented output format requires every record to fit on one line,
    but ``action`` and ``error`` may carry arbitrary text (for example a
    multi-line model answer). Line breaks inside those fields are therefore
    collapsed to single spaces before printing.

    Args:
        step: 1-based step index.
        action: Text describing the submitted action.
        reward: Reward for this step; printed with two decimals.
        done: Whether the episode ended on this step; printed as
            ``true`` or ``false``.
        error: Error text for the step, or ``None``/empty for ``null``.
    """
    # splitlines() handles \n, \r\n and \r alike; joining with a space keeps
    # the record on one line without gluing adjacent words together.
    action_val = " ".join(str(action).splitlines())
    # An error consisting only of line breaks collapses to "", which is
    # reported as null just like a missing error.
    error_val = (" ".join(str(error).splitlines()) if error else "") or "null"
    done_val = "true" if done else "false"
    print(
        f"[STEP] step={step} action={action_val} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )
117
-
118
-
119
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the closing ``[END]`` record for an episode.

    Args:
        success: Episode outcome; printed as lowercase ``true``/``false``.
        steps: Number of steps taken.
        score: Normalized score; printed with three decimals.
        rewards: Per-step rewards; printed comma-separated, two decimals each.

    NOTE(review): this record carries a ``score=`` field that the ``[END]``
    format described in the module docstring does not list. Confirm which
    of the two the consumer of this output expects.
    """
    rewards_csv = ",".join([format(value, ".2f") for value in rewards])
    success_flag = str(success).lower()
    fields = (
        f"success={success_flag}",
        f"steps={steps}",
        f"score={score:.3f}",
        f"rewards={rewards_csv}",
    )
    print("[END] " + " ".join(fields), flush=True)
122
-
123
-
124
def build_user_prompt(
    step: int,
    problem: str,
    test_input: str,
    difficulty: str,
    feedback: str,
    is_correct: bool,
    streak: int,
    problems_solved: int,
    max_steps: int = 15,
) -> str:
    """Build the user message describing the current problem.

    The prompt is assembled from explicit lines instead of running
    ``textwrap.dedent`` over an f-string: dedent's result depends on the
    common leading whitespace of every line, so a multi-line ``problem`` or
    ``feedback`` value could change how the fixed lines are indented.

    Args:
        step: 1-based index of the current step.
        problem: Problem description text.
        test_input: Test-case input shown to the model.
        difficulty: Difficulty label; rendered upper-cased.
        feedback: Feedback text shown when ``is_correct`` is false.
        is_correct: When true, a fixed "correct" marker replaces ``feedback``.
        streak: Current streak counter.
        problems_solved: Number of problems solved so far.
        max_steps: Step total shown after the slash in the header. Defaults
            to 15, the value that used to be hard-coded here.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    status = "✓ CORRECT!" if is_correct else feedback
    header = (
        f"Step {step}/{max_steps} | Difficulty: {difficulty.upper()} | "
        f"Solved: {problems_solved} | Streak: {streak}"
    )
    lines = [
        header,
        "",
        f"Problem: {problem}",
        f"Test Input: {test_input}",
        "",
        f"Previous Feedback: {status}",
        "",
        "What is the output? (respond with just the answer)",
    ]
    return "\n".join(lines).strip()
148
-
149
-
150
def get_model_answer(
    client: OpenAI,
    step: int,
    problem: str,
    test_input: str,
    difficulty: str,
    feedback: str,
    is_correct: bool,
    streak: int,
    problems_solved: int
) -> str:
    """Ask the model for an answer to the current problem.

    Sends ``SYSTEM_PROMPT`` plus the prompt from ``build_user_prompt`` as a
    non-streaming chat completion and returns the stripped reply.

    Args:
        client: OpenAI client used for the request.
        step: 1-based index of the current step.
        problem: Problem description text.
        test_input: Test-case input shown to the model.
        difficulty: Difficulty label.
        feedback: Feedback text from the previous observation.
        is_correct: Whether the previous observation was marked correct.
        streak: Current streak counter.
        problems_solved: Number of problems solved so far.

    Returns:
        The model's reply with surrounding whitespace removed, or ``"0"``
        when the reply is empty or the request raises.
    """
    # Local import: this block is the only user of sys here.
    import sys

    user_prompt = build_user_prompt(step, problem, test_input, difficulty, feedback, is_correct, streak, problems_solved)
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        text = (completion.choices[0].message.content or "").strip()
        return text if text else "0"
    except Exception as exc:
        # Deliberate best-effort: a failed request must not abort the episode,
        # so fall back to "0". The diagnostic goes to stderr because stdout is
        # reserved for the [START]/[STEP]/[END] records.
        print(f"[DEBUG] Model request failed: {exc}", file=sys.stderr, flush=True)
        return "0"
178
-
179
-
180
async def main() -> None:
    """Run one episode against the environment and log it to stdout.

    Prints ``[START]``, then one ``[STEP]`` per ``env.step()`` call, then
    ``[END]``. Client creation, environment start-up and the episode loop all
    run inside the ``try`` so that ``[END]`` is printed even when start-up or
    a step raises; the exception still propagates afterwards.

    The score is the sum of collected rewards divided by
    ``MAX_TOTAL_REWARD``, clamped to [0, 1]. It is computed from whatever
    rewards were gathered, but ``success`` is only reported true when the
    loop finished without an exception.
    """
    # Local import: needed only for the stderr diagnostic below.
    import sys

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    completed = False
    env = None  # stays None if start-up fails, so the cleanup can skip close()

    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)

    try:
        client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
        env = await CodeAssessmentEnv.from_docker_image(IMAGE_NAME)

        result = await env.reset()
        obs = result.observation

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            # Get model's answer for the current problem
            answer = get_model_answer(
                client=client,
                step=step,
                problem=obs.problem_description,
                test_input=obs.test_case_input,
                difficulty=obs.difficulty,
                feedback=obs.feedback,
                is_correct=obs.is_correct,
                streak=obs.current_streak,
                problems_solved=obs.problems_solved,
            )

            # Submit answer
            result = await env.step(CodeAssessmentAction(answer=answer))
            obs = result.observation

            reward = result.reward or 0.0  # a missing reward counts as 0.0
            done = result.done
            # NOTE(review): the documented format calls this field the
            # "last_action_error string"; the observation type is not
            # visible here, so read it defensively and fall back to None.
            error = getattr(obs, "last_action_error", None)

            rewards.append(reward)
            steps_taken = step

            # Log step with problem info
            action_str = f"answer='{answer}' | correct={obs.is_correct} | difficulty={obs.difficulty}"
            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        completed = True

    finally:
        # Calculate the normalized score from the rewards collected so far,
        # clamped to [0, 1].
        if MAX_TOTAL_REWARD > 0:
            score = min(max(sum(rewards) / MAX_TOTAL_REWARD, 0.0), 1.0)
        success = completed and score >= SUCCESS_SCORE_THRESHOLD

        if env is not None:
            try:
                await env.close()
            except Exception as e:
                # Cleanup is best-effort; report on stderr so stdout carries
                # only the [START]/[STEP]/[END] records.
                print(
                    f"[DEBUG] env.close() error (container cleanup): {e}",
                    file=sys.stderr,
                    flush=True,
                )
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
242
-
243
-
244
# Script entry point: run the async episode driver only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    episode = main()
    asyncio.run(episode)