Parthiban007 committed on
Commit
efe528e
Β·
verified Β·
1 Parent(s): 090dc69

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. inference.py +171 -102
  2. server/app.py +38 -14
inference.py CHANGED
@@ -1,15 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
- import re
3
- import json
4
  import asyncio
5
  import logging
6
  from typing import List, Optional
 
7
  from openai import OpenAI
8
  from dotenv import load_dotenv
9
 
10
  load_dotenv()
11
 
12
- # --- Logging (inference.py) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  _LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
14
  logging.basicConfig(
15
  level=getattr(logging, _LOG_LEVEL, logging.INFO),
@@ -17,146 +54,178 @@ logging.basicConfig(
17
  )
18
  logger = logging.getLogger("rust_coder.inference")
19
 
20
- # --- Competition Configuration ---
21
- API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
22
- MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
23
- HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
24
- ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000"
25
-
26
- # Episode constants: 10 problems, each worth max reward 1.0
27
- MAX_STEPS = 10
28
- MAX_TOTAL_REWARD = 10.0
29
- SUCCESS_SCORE_THRESHOLD = 0.5
30
-
31
- # Import client (ensure rust_coder is in PYTHONPATH)
32
  from client import RustCoderEnv
33
  from models import RustCoderAction
34
 
35
- # --- Strict Logging Helpers ---
36
- def log_start(task: str, env: str, model: str):
37
- # REQUIRED exact stdout format (no quotes)
 
38
  print(f"[START] task={task} env={env} model={model}", flush=True)
39
 
40
- def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None):
41
- # REQUIRED exact stdout format:
42
- # [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
43
- action_str = (action or "").replace("\r", "\\r").replace("\n", "\\n")
44
- action_str = action_str[:200] # keep single-line + bounded
45
- err_field = "null" if error is None else str(error).replace("\r", "\\r").replace("\n", "\\n")
46
- reward_2 = f"{float(reward or 0.0):.2f}"
 
 
 
47
  print(
48
- f"[STEP] step={step} action={action_str} reward={reward_2} done={str(bool(done)).lower()} error={err_field}",
 
49
  flush=True,
50
  )
51
 
52
- def log_end(success: bool, steps: int, score: float, rewards: List[float]):
53
- # REQUIRED exact stdout format, rewards as comma-separated 2dp
54
- rewards_str = ",".join(f"{float(r or 0.0):.2f}" for r in rewards)
55
  print(
56
- f"[END] success={str(bool(success)).lower()} steps={steps} score={float(score or 0.0):.3f} rewards={rewards_str}",
 
57
  flush=True,
58
  )
59
 
60
- # --- LLM Solution Logic ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  async def get_model_code(prompt: str, client: OpenAI) -> str:
62
- """Call the LLM to get a Rust solution."""
63
  try:
64
- logger.info(
65
- "LLM call start model=%s base_url=%s prompt_chars=%d token_present=%s",
66
- MODEL_NAME,
67
- API_BASE_URL,
68
- len(prompt or ""),
69
- bool(HF_TOKEN),
70
- )
71
  completion = client.chat.completions.create(
72
  model=MODEL_NAME,
73
  messages=[
74
- {"role": "system", "content": "You are a senior Rust systems engineer. Return ONLY the complete, fixed Rust code. No explanation."},
 
 
 
 
 
 
 
75
  {"role": "user", "content": prompt},
76
  ],
77
- temperature=0.1,
 
78
  )
79
  text = (completion.choices[0].message.content or "").strip()
80
- logger.debug("LLM raw response chars=%d", len(text))
81
-
82
- # Extract code from markdown
83
  if "```rust" in text:
84
  text = text.split("```rust")[1].split("```")[0]
85
  elif "```" in text:
86
  text = text.split("```")[1].split("```")[0]
87
  text = text.strip()
88
- if not text:
89
- logger.warning("LLM returned empty code after cleanup.")
90
- return "// Error: empty response (no code returned)."
91
- logger.info("LLM call end: returned_code_chars=%d", len(text))
92
- return text
93
- except Exception as e:
94
- logger.exception("LLM Request failed.")
95
- return f"// Error: {e}"
96
-
97
- # --- Main Evaluation Loop ---
98
- async def main():
99
- if not HF_TOKEN:
100
- logger.error("HF_TOKEN/API_KEY not found in environment.")
101
- return
102
-
103
- client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
104
- env = RustCoderEnv(base_url=ENV_URL)
105
 
106
- log_start(task="rust_coder", env="RustCoder-v1", model=MODEL_NAME)
 
 
 
 
 
107
 
108
  rewards: List[float] = []
109
  steps_taken = 0
110
- score = 0.0
111
  success = False
112
 
 
 
113
  try:
114
- # Start the single episode (10 problems)
115
- result = await env.reset()
116
- obs = result.observation
117
-
118
- for step in range(1, MAX_STEPS + 1):
119
- if result.done:
120
- break
121
-
122
- steps_taken = step
123
-
124
- # Format prompt including header_section if available
125
- prompt = obs.problem_description
126
- if getattr(obs, "header_section", ""):
127
- prompt += f"\n\nHeader Section (must be included verbatim in final code):\n```rust\n{obs.header_section}\n```"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
- # 1. Ask model for solution to current task
130
- code_solution = await get_model_code(prompt, client)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- # 2. Environment step
133
- logger.debug("Submitting to env.step code_chars=%d", len(code_solution or ""))
134
- result = await env.step(RustCoderAction(code=code_solution))
135
- obs = result.observation
136
- reward = result.reward or 0.0
137
- done = result.done
138
 
139
- rewards.append(reward)
140
- log_step(step=step, action=code_solution, reward=reward, done=done, error=None)
141
 
142
- if done:
143
- break
144
 
145
- # Normalize score to [0, 1] matching sample format
146
- score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
147
- score = min(max(score, 0.0), 1.0)
148
- success = score >= SUCCESS_SCORE_THRESHOLD
 
 
 
 
 
 
149
 
150
- except Exception as e:
151
- logger.exception("Runtime error.")
152
- log_step(step=steps_taken + 1, action="error", reward=0.0, done=True, error=str(e))
153
 
154
- finally:
155
- try:
156
- await env.close()
157
- except Exception as e:
158
- logger.exception("env.close() error.")
159
- log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
160
 
161
  if __name__ == "__main__":
162
  asyncio.run(main())
 
1
+ """
2
+ inference.py β€” Rust Coder OpenEnv Baseline Agent
3
+
4
+ Architecture
5
+ ────────────
6
+ β€’ Runs 3 tasks (easy / medium / hard) as independent episodes.
7
+ β€’ Each task produces its own [START]…[STEP]…[END] log block.
8
+ β€’ A fresh WebSocket env connection is opened per task to avoid
9
+ HF-Space WebSocket timeouts during long LLM + compilation waits.
10
+ β€’ Scores are clamped to (0.01, 0.99) β€” strictly inside (0, 1).
11
+ β€’ If HF_TOKEN is missing, minimal fallback blocks are emitted so
12
+ the platform always receives 3 parseable task records.
13
+
14
+ Required env vars
15
+ ─────────────────
16
+ API_BASE_URL β€” LLM router URL (default: HF router)
17
+ MODEL_NAME β€” model identifier (default: Qwen 72B)
18
+ HF_TOKEN β€” HuggingFace / API key
19
+ ENV_URL β€” environment URL (default: http://localhost:8000)
20
+ """
21
+
22
  import os
 
 
23
  import asyncio
24
  import logging
25
  from typing import List, Optional
26
+
27
  from openai import OpenAI
28
  from dotenv import load_dotenv
29
 
30
  load_dotenv()
31
 
32
+ # ── Configuration ─────────────────────────────────────────────────────────────
33
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
34
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
35
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
36
+ ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000"
37
+
38
+ SUCCESS_SCORE_THRESHOLD = 0.5
39
+ TEMPERATURE = 0.1
40
+ MAX_TOKENS = 1500
41
+
42
+ # Exactly 3 tasks: easy / medium / hard (maps to problems.json indices)
43
+ EVAL_TASKS = [
44
+ {"task_id": "task_1", "start_index": 0, "difficulty": "easy"},
45
+ {"task_id": "task_3", "start_index": 2, "difficulty": "medium"},
46
+ {"task_id": "task_6", "start_index": 5, "difficulty": "hard"},
47
+ ]
48
+
49
+ # ── Logging ───────────────────────────────────────────────────────────────────
50
  _LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
51
  logging.basicConfig(
52
  level=getattr(logging, _LOG_LEVEL, logging.INFO),
 
54
  )
55
  logger = logging.getLogger("rust_coder.inference")
56
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  from client import RustCoderEnv
58
  from models import RustCoderAction
59
 
60
+
61
+ # ── Strict stdout log helpers ─────────────────────────────────────────────────
62
+
63
def log_start(task: str, env: str, model: str) -> None:
    """Emit the required ``[START]`` marker line for one task episode."""
    # Platform parses stdout; flush so the marker is never buffered.
    print("[START] task={} env={} model={}".format(task, env, model), flush=True)
65
 
66
+
67
def log_step(
    step: int,
    action: str,
    reward: float,
    done: bool,
    error: Optional[str] = None,
) -> None:
    """Emit the required single-line ``[STEP]`` record.

    Format: ``[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>``

    Args:
        step: 1-based step index within the episode.
        action: submitted action text; newlines/CRs escaped, truncated to 200 chars.
        reward: reward for this step, printed with 2 decimal places.
        done: whether the episode finished at this step.
        error: optional error message; ``null`` is printed when absent.
    """
    action_str = (action or "").replace("\r", "\\r").replace("\n", "\\n")[:200]
    # Escape \r as well as \n — a raw carriage return would break the
    # single-line log format the platform parses.
    err_field = (
        "null"
        if error is None
        else str(error).replace("\r", "\\r").replace("\n", "\\n")[:200]
    )
    print(
        f"[STEP] step={step} action={action_str} reward={reward:.2f} "
        f"done={str(bool(done)).lower()} error={err_field}",
        flush=True,
    )
81
 
82
+
83
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the required ``[END]`` summary line.

    Rewards are rendered as a comma-separated list with 2 decimal places;
    score is rendered with 3 decimal places.
    """
    formatted = [f"{value:.2f}" for value in rewards]
    summary = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={','.join(formatted)}"
    )
    print(summary, flush=True)
90
 
91
+
92
+ # ── Score clamping ────────────────────────────────────────────────────────────
93
+
94
def clamp_score(raw: float) -> float:
    """Clamp *raw* into the open interval (0, 1), rounded to 3 decimals.

    Floor 0.01: even compilation failures yield a non-zero score.
    Ceiling 0.99: prevents a theoretically-perfect submission from
    returning exactly 1.0.
    """
    bounded = float(raw)
    if bounded < 0.01:
        bounded = 0.01
    elif bounded > 0.99:
        bounded = 0.99
    return round(bounded, 3)
103
+
104
+
105
+ # ── LLM call ─────────────────────────────────────────────────────────────────
106
+
107
async def get_model_code(prompt: str, client: OpenAI) -> str:
    """Request a complete Rust solution from the LLM; strip markdown fences.

    Returns the extracted source text, a ``// empty response`` stub when the
    model returns nothing usable, or a ``// LLM error: ...`` comment when the
    API call fails.
    """
    system_msg = (
        "You are a senior Rust systems engineer. "
        "Return ONLY the complete, corrected Rust source file. "
        "No markdown fences. No commentary."
    )
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        text = (completion.choices[0].message.content or "").strip()
        # The model is told not to use fences, but strip them defensively:
        # prefer a ```rust fence, otherwise fall back to a bare ``` fence.
        for fence in ("```rust", "```"):
            if fence in text:
                text = text.split(fence)[1].split("```")[0]
                break
        text = text.strip()
        return text or "// empty response"
    except Exception as exc:
        logger.exception("LLM call failed")
        return f"// LLM error: {exc}"
136
+
137
+
138
+ # ── Single-task episode ───────────────────────────────────────────────────────
139
+
140
async def run_task(task_info: dict, client: Optional[OpenAI]) -> None:
    """Execute one evaluation task as a fully independent episode.

    A brand-new environment connection is created for every task so that a
    slow LLM call on an earlier task cannot time out this one's WebSocket.
    Exactly one [START]…[STEP]…[END] block is printed regardless of outcome.
    """
    task_id = task_info["task_id"]
    start_index = task_info["start_index"]

    log_start(task=task_id, env="RustCoder-v1", model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.01
    success = False

    # One env connection per task — avoids cross-task WebSocket timeouts.
    env = RustCoderEnv(base_url=ENV_URL)
    try:
        obs = (await env.reset(start_index=start_index)).observation

        # Assemble the LLM prompt; append the mandatory header when present.
        prompt = obs.problem_description or ""
        header = getattr(obs, "header_section", "")
        if header:
            prompt += (
                "\n\nHeader section (must be included verbatim):"
                f"\n```rust\n{header}\n```"
            )

        # Without credentials we still submit a stub so the episode completes.
        code = (
            await get_model_code(prompt, client)
            if client is not None
            else "// no HF_TOKEN — using stub"
        )

        steps_taken = 1

        step_result = await env.step(RustCoderAction(code=code))
        # Explicit None check — 0.0 is falsy but a valid reward.
        raw = step_result.reward
        raw_reward = float(raw) if raw is not None else 0.0
        score = clamp_score(raw_reward)
        rewards.append(score)
        success = score >= SUCCESS_SCORE_THRESHOLD

        log_step(step=1, action=code, reward=score, done=True, error=None)

    except Exception as exc:
        logger.exception("Task %s failed", task_id)
        score = 0.01
        rewards = [0.01]
        log_step(
            step=steps_taken + 1,
            action="error",
            reward=0.01,
            done=True,
            error=str(exc),
        )
    finally:
        # Best-effort close; a close failure must not suppress the [END] line.
        try:
            await env.close()
        except Exception:
            pass

    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
 
 
 
 
211
 
 
 
212
 
213
+ # ── Main ──────────────────────────────────────────────────────────────────────
 
214
 
215
async def main() -> None:
    """Run every configured task sequentially, sharing one LLM client.

    When no credentials are available, tasks still run with stub code so the
    platform always receives a complete, parseable log block per task.
    """
    if HF_TOKEN:
        client: Optional[OpenAI] = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    else:
        client = None
        logger.warning(
            "HF_TOKEN / API_KEY not set — LLM calls disabled. "
            "Stub code will be submitted; scores will be at floor (0.01)."
        )

    for task in EVAL_TASKS:
        await run_task(task, client)
 
228
 
 
 
 
 
 
 
229
 
230
  if __name__ == "__main__":
231
  asyncio.run(main())
server/app.py CHANGED
@@ -150,10 +150,10 @@ async def grader(task_id: str, action: RustCoderAction):
150
  Body: {"code": "<rust source code>"}
151
 
152
  Scores are strictly in the open interval (0, 1):
153
- - Minimum 0.01 β€” floor for any submission (even empty/non-compiling code)
154
- - Maximum 0.99 β€” ceiling so no submission scores a theoretical perfect 1.0
155
- - Natural range based on: Compilation(40%) + Correctness(20%) +
156
- Coverage(20%) + Elegance(10%) + Efficiency(10%)
157
  """
158
  task_meta = TASK_REGISTRY.get(task_id)
159
  if task_meta is None:
@@ -162,25 +162,49 @@ async def grader(task_id: str, action: RustCoderAction):
162
  detail=f"Unknown task_id '{task_id}'. Valid IDs: {TASK_IDS}",
163
  )
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  env = RustCoderEnvironment()
166
  env.reset(start_index=task_meta["index"])
167
  obs = env.step(action)
168
 
169
- raw_score = float(obs.reward) if obs.reward is not None else 0.0
 
170
  # Enforce strictly open interval (0, 1) β€” never exactly 0.0 or 1.0
171
- score = round(max(0.01, min(0.99, raw_score)), 4)
172
  success = score >= task_meta["success_threshold"]
173
 
174
  return {
175
- "task_id": task_id,
176
- "score": score,
177
- "passed": 1 if success else 0,
178
- "total": 1,
179
- "metric": "rust_code_quality",
180
- "reward_breakdown": obs.reward_breakdown,
181
  "compilation_success": obs.compilation_success,
182
- "compilation_output": obs.compilation_output,
183
- "test_results": obs.test_results,
184
  }
185
 
186
 
 
150
  Body: {"code": "<rust source code>"}
151
 
152
  Scores are strictly in the open interval (0, 1):
153
+ - Minimum 0.01 β€” floor for any submission (even empty/non-compiling)
154
+ - Maximum 0.99 β€” ceiling so no submission hits the theoretical perfect
155
+ - Weighted: Compilation(40%) + Correctness(20%) + Coverage(20%) +
156
+ Elegance(10%) + Efficiency(10%)
157
  """
158
  task_meta = TASK_REGISTRY.get(task_id)
159
  if task_meta is None:
 
162
  detail=f"Unknown task_id '{task_id}'. Valid IDs: {TASK_IDS}",
163
  )
164
 
165
+ _EMPTY_BREAKDOWN = {
166
+ "compilation": 0.0,
167
+ "correctness": 0.0,
168
+ "coverage": 0.0,
169
+ "elegance": 0.0,
170
+ "efficiency": 0.0,
171
+ }
172
+
173
+ # Fast path: empty code β€” skip compilation + avoid triggering auto-LLM
174
+ if not action.code.strip():
175
+ return {
176
+ "task_id": task_id,
177
+ "score": 0.01,
178
+ "passed": 0,
179
+ "total": 1,
180
+ "metric": "rust_code_quality",
181
+ "reward_breakdown": _EMPTY_BREAKDOWN,
182
+ "compilation_success": False,
183
+ "compilation_output": "No code submitted.",
184
+ "test_results": [],
185
+ }
186
+
187
+ # Full evaluation path
188
  env = RustCoderEnvironment()
189
  env.reset(start_index=task_meta["index"])
190
  obs = env.step(action)
191
 
192
+ # Explicit None check β€” 0.0 is falsy but a valid reward
193
+ raw_score = float(obs.reward if obs.reward is not None else 0.0)
194
  # Enforce strictly open interval (0, 1) β€” never exactly 0.0 or 1.0
195
+ score = round(max(0.01, min(0.99, raw_score)), 4)
196
  success = score >= task_meta["success_threshold"]
197
 
198
  return {
199
+ "task_id": task_id,
200
+ "score": score,
201
+ "passed": 1 if success else 0,
202
+ "total": 1,
203
+ "metric": "rust_code_quality",
204
+ "reward_breakdown": obs.reward_breakdown,
205
  "compilation_success": obs.compilation_success,
206
+ "compilation_output": obs.compilation_output,
207
+ "test_results": obs.test_results,
208
  }
209
 
210