PRANAV05092003 committed on
Commit
e93bbca
·
1 Parent(s): 8c9f7aa

Added missing env module

Browse files
acre/env/__pycache__/refactor_env.cpython-313.pyc CHANGED
Binary files a/acre/env/__pycache__/refactor_env.cpython-313.pyc and b/acre/env/__pycache__/refactor_env.cpython-313.pyc differ
 
acre/env/refactor_env.py CHANGED
@@ -13,6 +13,8 @@ import numpy as np
13
 
14
  from acre.actions import transformations as tx
15
  from acre.datasets.code_samples import CodeSample, CodeSampleDataset
 
 
16
 
17
  try:
18
  from radon.complexity import cc_visit
@@ -131,10 +133,13 @@ class RefactorEnv(gym.Env):
131
  self._np_random, _ = gym.utils.seeding.np_random(seed)
132
 
133
  self.executor = _InProcessExecutor()
 
134
 
135
  self._episode_steps = 0
136
  self._sample: Optional[CodeSample] = None
137
  self._code: str = ""
 
 
138
  self._last_runtime_s: float = 0.0
139
  self._last_error: bool = False
140
  self._last_complexity: float = 0.0
@@ -181,6 +186,22 @@ class RefactorEnv(gym.Env):
181
  self._code = str(self._sample.code)
182
  self._episode_steps = 0
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  self._last_complexity = self._compute_complexity(self._code)
185
  self._last_runtime_s, self._last_error, _ = self._compute_runtime(self._code)
186
 
@@ -188,6 +209,7 @@ class RefactorEnv(gym.Env):
188
  "sample_id": getattr(self._sample, "id", None),
189
  "language": getattr(self._sample, "language", None),
190
  "episode_steps": self._episode_steps,
 
191
  }
192
  return self._observation(), info
193
 
@@ -199,6 +221,7 @@ class RefactorEnv(gym.Env):
199
  prev_complexity = float(self._last_complexity)
200
  prev_runtime = float(self._last_runtime_s)
201
  prev_error = bool(self._last_error)
 
202
 
203
  original = self._code
204
  if action_i == 0:
@@ -218,26 +241,41 @@ class RefactorEnv(gym.Env):
218
  self._last_complexity = self._compute_complexity(self._code)
219
  self._last_runtime_s, self._last_error, is_timeout = self._compute_runtime(self._code)
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  complexity_gain = (prev_complexity - float(self._last_complexity)) / max(prev_complexity, 1.0)
222
  runtime_gain = (prev_runtime - float(self._last_runtime_s)) / max(prev_runtime, 1e-6)
223
- # Penalize execution errors strongly; timeouts even more strongly.
224
- timeout_penalty = -2.0 if is_timeout else 0.0
225
- error_penalty = -1.0 if self._last_error else 0.0
226
- change_bonus = 0.05 if transform.changed else 0.0
227
- no_change_penalty = -0.02 if not transform.changed else 0.0
 
 
 
 
228
 
229
  raw_reward = float(
230
- 2.0 * complexity_gain
231
- + 0.25 * runtime_gain
232
- + error_penalty
 
 
233
  + timeout_penalty
234
- + change_bonus
235
  + no_change_penalty
236
  )
237
- if (not prev_error) and self._last_error:
238
- raw_reward -= 0.5
239
- if prev_error and (not self._last_error):
240
- raw_reward += 0.5
241
 
242
  # Normalize exactly as declared in openenv.yaml (clip to [0,1]).
243
  normalized_reward = float((raw_reward + 32.0) / 52.0)
@@ -254,16 +292,21 @@ class RefactorEnv(gym.Env):
254
  "changed": bool(transform.changed),
255
  "transform": dict(transform.metadata),
256
  "reward_components": {
 
257
  "complexity_gain": float(complexity_gain),
258
  "runtime_gain": float(runtime_gain),
259
- "error_penalty": float(error_penalty),
 
 
 
260
  "timeout_penalty": float(timeout_penalty),
261
- "change_bonus": float(change_bonus),
262
  "no_change_penalty": float(no_change_penalty),
263
  },
264
  "normalized_reward": normalized_reward,
265
  "episode_steps": int(self._episode_steps),
266
  "timeout": bool(is_timeout),
 
 
267
  }
268
  return self._observation(), raw_reward, terminated, truncated, info
269
 
@@ -279,6 +322,7 @@ class RefactorEnv(gym.Env):
279
  "language": getattr(self._sample, "language", None) if self._sample is not None else None,
280
  "observation": self._observation().tolist(),
281
  "action_meanings": dict(self.ACTION_MEANINGS),
 
282
  }
283
 
284
  def render(self) -> None:
 
13
 
14
  from acre.actions import transformations as tx
15
  from acre.datasets.code_samples import CodeSample, CodeSampleDataset
16
+ from acre.tasks.task_registry import TaskRegistry
17
+ from acre.tasks.grader import grade_task
18
 
19
  try:
20
  from radon.complexity import cc_visit
 
133
  self._np_random, _ = gym.utils.seeding.np_random(seed)
134
 
135
  self.executor = _InProcessExecutor()
136
+ self._registry = TaskRegistry()
137
 
138
  self._episode_steps = 0
139
  self._sample: Optional[CodeSample] = None
140
  self._code: str = ""
141
+ self._expected_output: str = ""
142
+ self._progress_score: float = 0.0
143
  self._last_runtime_s: float = 0.0
144
  self._last_error: bool = False
145
  self._last_complexity: float = 0.0
 
186
  self._code = str(self._sample.code)
187
  self._episode_steps = 0
188
 
189
+ # Resolve expected output deterministically from task_registry based on sample_id.
190
+ # sample ids are produced by openenv_interface as "{task_id}:{i}".
191
+ self._expected_output = ""
192
+ self._progress_score = 0.0
193
+ sample_id = str(getattr(self._sample, "id", "") or "")
194
+ if ":" in sample_id:
195
+ task_id, raw_idx = sample_id.split(":", 1)
196
+ task = self._registry.get_task(task_id)
197
+ try:
198
+ sample_idx = int(raw_idx)
199
+ except Exception:
200
+ sample_idx = 0
201
+ if task is not None:
202
+ self._expected_output = task.expected_output_for_index(sample_idx)
203
+ self._progress_score = float(grade_task(self._code, self._expected_output))
204
+
205
  self._last_complexity = self._compute_complexity(self._code)
206
  self._last_runtime_s, self._last_error, _ = self._compute_runtime(self._code)
207
 
 
209
  "sample_id": getattr(self._sample, "id", None),
210
  "language": getattr(self._sample, "language", None),
211
  "episode_steps": self._episode_steps,
212
+ "progress_score": float(self._progress_score),
213
  }
214
  return self._observation(), info
215
 
 
221
  prev_complexity = float(self._last_complexity)
222
  prev_runtime = float(self._last_runtime_s)
223
  prev_error = bool(self._last_error)
224
+ prev_score = float(self._progress_score)
225
 
226
  original = self._code
227
  if action_i == 0:
 
241
  self._last_complexity = self._compute_complexity(self._code)
242
  self._last_runtime_s, self._last_error, is_timeout = self._compute_runtime(self._code)
243
 
244
+ # Deterministic task progress score toward expected output.
245
+ score_now = prev_score
246
+ if self._expected_output:
247
+ score_now = float(grade_task(self._code, self._expected_output))
248
+ self._progress_score = float(score_now)
249
+
250
+ # ------------------------------------------------------------------
251
+ # Step-wise reward (hackathon-friendly, deterministic)
252
+ # ------------------------------------------------------------------
253
+ # - better code (closer to expected_output) -> +0.3-ish
254
+ # - reduced complexity -> +0.3-ish
255
+ # - bug introduced -> -0.5
256
+ # - infinite loop / timeout -> large penalty
257
+ delta_score = float(score_now - prev_score)
258
  complexity_gain = (prev_complexity - float(self._last_complexity)) / max(prev_complexity, 1.0)
259
  runtime_gain = (prev_runtime - float(self._last_runtime_s)) / max(prev_runtime, 1e-6)
260
+
261
+ better_code_reward = float(max(-1.0, min(1.0, delta_score)) * 0.6)
262
+ complexity_reward = float(max(-1.0, min(1.0, complexity_gain)) * 0.3)
263
+ runtime_reward = float(max(-1.0, min(1.0, runtime_gain)) * 0.1)
264
+
265
+ bug_penalty = -0.5 if ((not prev_error) and self._last_error) else 0.0
266
+ fixed_bonus = 0.2 if (prev_error and (not self._last_error)) else 0.0
267
+ timeout_penalty = -1.0 if is_timeout else 0.0
268
+ no_change_penalty = -0.05 if not transform.changed else 0.0
269
 
270
  raw_reward = float(
271
+ better_code_reward
272
+ + complexity_reward
273
+ + runtime_reward
274
+ + bug_penalty
275
+ + fixed_bonus
276
  + timeout_penalty
 
277
  + no_change_penalty
278
  )
 
 
 
 
279
 
280
  # Normalize exactly as declared in openenv.yaml (clip to [0,1]).
281
  normalized_reward = float((raw_reward + 32.0) / 52.0)
 
292
  "changed": bool(transform.changed),
293
  "transform": dict(transform.metadata),
294
  "reward_components": {
295
+ "better_code_reward": float(better_code_reward),
296
  "complexity_gain": float(complexity_gain),
297
  "runtime_gain": float(runtime_gain),
298
+ "complexity_reward": float(complexity_reward),
299
+ "runtime_reward": float(runtime_reward),
300
+ "bug_penalty": float(bug_penalty),
301
+ "fixed_bonus": float(fixed_bonus),
302
  "timeout_penalty": float(timeout_penalty),
 
303
  "no_change_penalty": float(no_change_penalty),
304
  },
305
  "normalized_reward": normalized_reward,
306
  "episode_steps": int(self._episode_steps),
307
  "timeout": bool(is_timeout),
308
+ "progress_score": float(score_now),
309
+ "progress_delta": float(delta_score),
310
  }
311
  return self._observation(), raw_reward, terminated, truncated, info
312
 
 
322
  "language": getattr(self._sample, "language", None) if self._sample is not None else None,
323
  "observation": self._observation().tolist(),
324
  "action_meanings": dict(self.ACTION_MEANINGS),
325
+ "progress_score": float(self._progress_score),
326
  }
327
 
328
  def render(self) -> None:
acre/tasks/__init__.py CHANGED
@@ -1,3 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  from acre.tasks.task_registry import Task, TaskRegistry
2
 
3
  __all__ = ["Task", "TaskRegistry"]
 
"""Public API of the ``acre.tasks`` package."""

from .grader import grade_task
from .easy_task import EasyTask
from .medium_task import MediumTask
from .hard_task import HardTask

from acre.tasks.task_registry import Task, TaskRegistry

# Declared exactly once: assigning ``__all__`` a second time further down the
# module would replace this list instead of extending it, silently dropping
# names from ``from acre.tasks import *``.
__all__ = [
    "EasyTask",
    "MediumTask",
    "HardTask",
    "Task",
    "TaskRegistry",
    "grade_task",
]
acre/tasks/easy_task.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
+ @dataclass(frozen=True)
7
+ class EasyTask:
8
+ task_id: str = "rename_variables"
9
+ description: str = (
10
+ "Refactor the function by renaming generic variables (`x`, `tmp`, `i`) "
11
+ "into descriptive names while preserving behavior."
12
+ )
13
+ input_code: str = """\
14
+ def compute(x, y, tmp):
15
+ tmp = x + y
16
+ x = tmp * 2
17
+ result = x
18
+ return result
19
+ """
20
+ expected_output: str = """\
21
+ def compute(left, right, sum_value):
22
+ sum_value = left + right
23
+ doubled = sum_value * 2
24
+ result = doubled
25
+ return result
26
+ """
27
+
acre/tasks/grader.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import difflib
5
+ from typing import Tuple
6
+
7
+
def _normalize(code: str) -> Tuple[str, str]:
    """Normalize source text for deterministic grading.

    Args:
        code: Candidate source text. A falsy value is treated as ``""``.

    Returns:
        ``(ast_unparsed, stripped_source)``. ``ast_unparsed`` is the
        ``ast.unparse`` rendering of the parsed source, or ``""`` when parsing
        or unparsing fails. ``stripped_source`` is the input with ``\\r\\n``
        converted to ``\\n`` and surrounding whitespace removed.
    """
    src = (code or "").replace("\r\n", "\n").strip()
    try:
        tree = ast.parse(src)
        normalized = ast.unparse(tree).strip()
        return normalized, src
    except Exception:
        # Deliberately best-effort: any failure to parse or unparse simply
        # routes the caller to the raw-text comparison.
        return "", src


def _similarity(expected: str, actual: str) -> float:
    """Return the difflib similarity ratio of two strings, clamped to [0, 1]."""
    # autojunk=False: with the default heuristic, once the second sequence
    # reaches 200 items, characters that occur frequently stop seeding matches.
    # For character-level comparison of source code that distorts the ratio
    # (it can collapse to 0.0 for nearly identical inputs).
    ratio = difflib.SequenceMatcher(a=expected, b=actual, autojunk=False).ratio()
    return float(max(0.0, min(1.0, ratio)))


def grade_task(output: str, expected_output: str) -> float:
    """Score ``output`` against ``expected_output`` deterministically.

    - If both normalize to non-empty AST-unparse strings, those strings are
      compared: 1.0 when equal, otherwise their similarity ratio.
    - Otherwise the whitespace-collapsed raw sources are compared instead.

    Args:
        output: Candidate source text produced by the agent.
        expected_output: Reference source text for the task.

    Returns:
        A similarity score in ``[0.0, 1.0]``; ``1.0`` means an exact match
        after normalization (two empty inputs also score ``1.0``).
    """
    out_norm, out_src = _normalize(output)
    exp_norm, exp_src = _normalize(expected_output)

    if out_norm and exp_norm:
        if out_norm == exp_norm:
            return 1.0
        return _similarity(exp_norm, out_norm)

    # Fallback: compare raw text with runs of whitespace collapsed.
    a = " ".join(exp_src.split())
    b = " ".join(out_src.split())
    if not a and not b:
        return 1.0
    return _similarity(a, b)
47
+
acre/tasks/hard_task.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
+ @dataclass(frozen=True)
7
+ class HardTask:
8
+ task_id: str = "full_refactor"
9
+ description: str = (
10
+ "Perform a full refactor: rename generic variables, remove dead branches, "
11
+ "simplify loops into comprehensions, optimize boolean conditions, and inline "
12
+ "trivial helpers where appropriate."
13
+ )
14
+ input_code: str = """\
15
+ def add(p, q):
16
+ return p + q
17
+
18
+ def compute(x, data, tmp):
19
+ result = []
20
+ for item in data:
21
+ result.append(item * 2)
22
+ if False:
23
+ y = 999
24
+ if True:
25
+ val = add(x, tmp)
26
+ unused = 0
27
+ flag = not not True
28
+ return val
29
+ print("dead")
30
+ """
31
+ expected_output: str = """\
32
+ def compute(value, data, offset):
33
+ _ = [item * 2 for item in data]
34
+ return value + offset
35
+ """
36
+
acre/tasks/medium_task.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
+ @dataclass(frozen=True)
7
+ class MediumTask:
8
+ task_id: str = "remove_dead_code"
9
+ description: str = (
10
+ "Remove dead code patterns (unreachable statements, `if False` blocks, and "
11
+ "obviously unused assignments) while keeping functional behavior intact."
12
+ )
13
+ input_code: str = """\
14
+ def process(data):
15
+ result = []
16
+ for item in data:
17
+ result.append(item * 2)
18
+ if False:
19
+ print("never runs")
20
+ unused_var = 42
21
+ return result
22
+ print("unreachable")
23
+ """
24
+ expected_output: str = """\
25
+ def process(data):
26
+ return [item * 2 for item in data]
27
+ """
28
+
acre/tasks/task_registry.py CHANGED
@@ -5,7 +5,12 @@ from __future__ import annotations
5
 
6
  import ast
7
  from dataclasses import dataclass
8
- from typing import Callable, Dict, List, Optional, Sequence
 
 
 
 
 
9
 
10
 
11
  @dataclass
@@ -15,12 +20,18 @@ class Task:
15
  description: str
16
  difficulty: str
17
  samples: List[str]
 
18
  _grade_fn: Callable[[str], float]
19
 
20
  @property
21
  def initial_code(self) -> str:
22
  return str(self.samples[0]) if self.samples else ""
23
 
 
 
 
 
 
24
  def grade(self, code: str) -> float:
25
  """Return a score in [0.0, 1.0]."""
26
  try:
@@ -28,6 +39,17 @@ class Task:
28
  except Exception:
29
  return 0.0
30
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def _safe_unparse(tree: ast.AST) -> str:
33
  try:
@@ -109,6 +131,37 @@ def merge(a, b):
109
  """,
110
  ]
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def _grade_easy(code: str) -> float:
114
  """Score = fraction of generic names removed from all scopes."""
@@ -191,6 +244,31 @@ def calc(n):
191
  """,
192
  ]
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  def _grade_medium(code: str) -> float:
196
  """Score = fraction of dead-code patterns eliminated (4 checks, 0.25 each)."""
@@ -299,6 +377,30 @@ def compute(tmp, data, x):
299
  """,
300
  ]
301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  def _grade_hard(code: str) -> float:
304
  """Score = fraction of 7 quality checks passed."""
@@ -365,25 +467,28 @@ class TaskRegistry:
365
  self._tasks["rename_variables"] = Task(
366
  id="rename_variables",
367
  name="Rename Variables (Easy)",
368
- description="Rename generic variable names (x, tmp) to descriptive ones",
369
  difficulty="easy",
370
  samples=_EASY_SAMPLES,
 
371
  _grade_fn=_grade_easy,
372
  )
373
  self._tasks["remove_dead_code"] = Task(
374
  id="remove_dead_code",
375
  name="Remove Dead Code (Medium)",
376
- description="Remove unreachable code, if False blocks, and unused variables",
377
  difficulty="medium",
378
  samples=_MEDIUM_SAMPLES,
 
379
  _grade_fn=_grade_medium,
380
  )
381
  self._tasks["full_refactor"] = Task(
382
  id="full_refactor",
383
  name="Full Refactor (Hard)",
384
- description="Apply all transformations: rename, dead code, loops, conditions, inlining",
385
  difficulty="hard",
386
  samples=_HARD_SAMPLES,
 
387
  _grade_fn=_grade_hard,
388
  )
389
 
 
5
 
6
  import ast
7
  from dataclasses import dataclass
8
+ from typing import Callable, Dict, List, Optional, Sequence, Tuple
9
+
10
+ from acre.tasks.easy_task import EasyTask
11
+ from acre.tasks.hard_task import HardTask
12
+ from acre.tasks.medium_task import MediumTask
13
+ from acre.tasks.grader import grade_task
14
 
15
 
16
  @dataclass
 
20
  description: str
21
  difficulty: str
22
  samples: List[str]
23
+ expected_outputs: List[str]
24
  _grade_fn: Callable[[str], float]
25
 
26
  @property
27
  def initial_code(self) -> str:
28
  return str(self.samples[0]) if self.samples else ""
29
 
30
+ def expected_output_for_index(self, idx: int) -> str:
31
+ if 0 <= idx < len(self.expected_outputs):
32
+ return str(self.expected_outputs[idx])
33
+ return str(self.expected_outputs[0]) if self.expected_outputs else ""
34
+
35
  def grade(self, code: str) -> float:
36
  """Return a score in [0.0, 1.0]."""
37
  try:
 
39
  except Exception:
40
  return 0.0
41
 
42
+ def grade_against_expected(self, code: str) -> float:
43
+ """
44
+ Deterministic grader comparing against this task's expected outputs.
45
+
46
+ Since the HTTP `grade` endpoint doesn't know which sample was active, we
47
+ score against the best-matching expected output (still deterministic).
48
+ """
49
+ if not self.expected_outputs:
50
+ return 0.0
51
+ return float(max(grade_task(code, exp) for exp in self.expected_outputs))
52
+
53
 
54
  def _safe_unparse(tree: ast.AST) -> str:
55
  try:
 
131
  """,
132
  ]
133
 
134
+ _EASY_EXPECTED: List[str] = [
135
+ EasyTask.expected_output,
136
+ """\
137
+ def normalize(temp_value, value):
138
+ for index in range(3):
139
+ temp_value = temp_value + index
140
+ return temp_value * value
141
+ """,
142
+ """\
143
+ def score(items):
144
+ total = 0
145
+ for item in items:
146
+ total += item
147
+ value = total
148
+ return value
149
+ """,
150
+ """\
151
+ def transform(value):
152
+ temp_value = value
153
+ if temp_value > 10:
154
+ temp_value = temp_value - 1
155
+ return temp_value
156
+ """,
157
+ """\
158
+ def merge(a, b):
159
+ left = a
160
+ right = b
161
+ return left + right
162
+ """,
163
+ ]
164
+
165
 
166
  def _grade_easy(code: str) -> float:
167
  """Score = fraction of generic names removed from all scopes."""
 
244
  """,
245
  ]
246
 
247
+ _MEDIUM_EXPECTED: List[str] = [
248
+ MediumTask.expected_output,
249
+ """\
250
+ def build(values):
251
+ return [v + 1 for v in values]
252
+ """,
253
+ """\
254
+ def route(flag):
255
+ x = 2
256
+ y = x
257
+ return y
258
+ """,
259
+ """\
260
+ def clean(xs):
261
+ return [x * 2 for x in xs]
262
+ """,
263
+ """\
264
+ def calc(n):
265
+ total = 0
266
+ for index in range(n):
267
+ total += index
268
+ return total
269
+ """,
270
+ ]
271
+
272
 
273
  def _grade_medium(code: str) -> float:
274
  """Score = fraction of dead-code patterns eliminated (4 checks, 0.25 each)."""
 
377
  """,
378
  ]
379
 
380
+ _HARD_EXPECTED: List[str] = [
381
+ HardTask.expected_output,
382
+ """\
383
+ def pipeline(offset, xs, value):
384
+ _ = [item * 2 for item in xs]
385
+ return offset + value
386
+ """,
387
+ """\
388
+ def compute(value, data, offset):
389
+ _ = [item * 2 for item in data]
390
+ return value + offset
391
+ """,
392
+ """\
393
+ def compute(value, data, offset):
394
+ _ = [item * 2 for item in data]
395
+ return value + offset
396
+ """,
397
+ """\
398
+ def compute(offset, data, value):
399
+ _ = [item * 2 for item in data]
400
+ return value + offset
401
+ """,
402
+ ]
403
+
404
 
405
  def _grade_hard(code: str) -> float:
406
  """Score = fraction of 7 quality checks passed."""
 
467
  self._tasks["rename_variables"] = Task(
468
  id="rename_variables",
469
  name="Rename Variables (Easy)",
470
+ description=EasyTask.description,
471
  difficulty="easy",
472
  samples=_EASY_SAMPLES,
473
+ expected_outputs=_EASY_EXPECTED,
474
  _grade_fn=_grade_easy,
475
  )
476
  self._tasks["remove_dead_code"] = Task(
477
  id="remove_dead_code",
478
  name="Remove Dead Code (Medium)",
479
+ description=MediumTask.description,
480
  difficulty="medium",
481
  samples=_MEDIUM_SAMPLES,
482
+ expected_outputs=_MEDIUM_EXPECTED,
483
  _grade_fn=_grade_medium,
484
  )
485
  self._tasks["full_refactor"] = Task(
486
  id="full_refactor",
487
  name="Full Refactor (Hard)",
488
+ description=HardTask.description,
489
  difficulty="hard",
490
  samples=_HARD_SAMPLES,
491
+ expected_outputs=_HARD_EXPECTED,
492
  _grade_fn=_grade_hard,
493
  )
494
 
inference.py CHANGED
@@ -26,9 +26,9 @@ from typing import Dict, List, Optional, Tuple
26
  import requests
27
  from openai import OpenAI
28
 
29
- API_BASE_URL: str = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
30
- MODEL_NAME: str = os.getenv("MODEL_NAME", "gpt-4o-mini")
31
- HF_TOKEN: str | None = os.getenv("HF_TOKEN")
32
  ENV_URL: str | None = os.getenv("ENV_URL")
33
  LOCAL_IMAGE_NAME: str | None = os.getenv("LOCAL_IMAGE_NAME")
34
 
@@ -224,16 +224,25 @@ def main() -> None:
224
  if not ENV_URL:
225
  raise SystemExit("ENV_URL is required. Example: ENV_URL=http://localhost:7860")
226
 
 
227
  client: Optional[OpenAI] = None
228
- if HF_TOKEN and os.getenv("USE_LLM", "0") == "1":
229
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
230
 
231
- scores: List[float] = []
232
  for i, task_id in enumerate(TASKS, start=1):
233
- score = run_episode(client, task_id, i)
234
- scores.append(score)
 
 
 
 
 
 
 
 
 
235
 
236
- avg_score = sum(scores) / len(scores) if scores else 0.0
237
  sys.exit(0 if avg_score >= 0.5 else 1)
238
 
239
 
 
26
  import requests
27
  from openai import OpenAI
28
 
29
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://api.openai.com/v1"
30
+ MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
31
+ HF_TOKEN = os.getenv("HF_TOKEN")
32
  ENV_URL: str | None = os.getenv("ENV_URL")
33
  LOCAL_IMAGE_NAME: str | None = os.getenv("LOCAL_IMAGE_NAME")
34
 
 
224
  if not ENV_URL:
225
  raise SystemExit("ENV_URL is required. Example: ENV_URL=http://localhost:7860")
226
 
227
+ # Required: OpenAI client is constructed via official SDK.
228
  client: Optional[OpenAI] = None
229
+ if HF_TOKEN:
230
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
231
 
232
+ scores: Dict[str, float] = {}
233
  for i, task_id in enumerate(TASKS, start=1):
234
+ scores[task_id] = run_episode(client, task_id, i)
235
+
236
+ easy = float(scores.get("rename_variables", 0.0))
237
+ medium = float(scores.get("remove_dead_code", 0.0))
238
+ hard = float(scores.get("full_refactor", 0.0))
239
+ avg_score = (easy + medium + hard) / 3.0
240
+
241
+ print(f"Easy: {easy:.4f}")
242
+ print(f"Medium: {medium:.4f}")
243
+ print(f"Hard: {hard:.4f}")
244
+ print(f"Final: {avg_score:.4f}")
245
 
 
246
  sys.exit(0 if avg_score >= 0.5 else 1)
247
 
248
 
server.py CHANGED
@@ -544,7 +544,8 @@ def grade(task_id: str, req: GradeRequest) -> GradeResponse:
544
  task = registry.get_task(task_id)
545
  if task is None:
546
  raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found")
547
- score = task.grade(req.code)
 
548
  return GradeResponse(
549
  task_id=task_id,
550
  score=round(score, 4),
 
544
  task = registry.get_task(task_id)
545
  if task is None:
546
  raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found")
547
+ # Use the deterministic expected-output grader for the public grade endpoint.
548
+ score = task.grade_against_expected(req.code)
549
  return GradeResponse(
550
  task_id=task_id,
551
  score=round(score, 4),
validate.py CHANGED
@@ -220,18 +220,24 @@ def run_validation(base_url: str) -> int:
220
  ) else 1
221
  for var in ["API_BASE_URL", "MODEL_NAME", "HF_TOKEN", "ENV_URL", "LOCAL_IMAGE_NAME"]:
222
  failures += 0 if check(f"inference.py reads {var} from env", var in inference_src) else 1
223
- failures += 0 if check(
224
- "API_BASE_URL has a default",
225
- 'os.getenv("API_BASE_URL", "https://api.openai.com/v1")' in inference_src,
226
- ) else 1
227
- failures += 0 if check(
228
- "MODEL_NAME has a default",
229
- 'os.getenv("MODEL_NAME", "gpt-4o-mini")' in inference_src,
230
- ) else 1
231
- failures += 0 if check(
232
- "HF_TOKEN has no default",
233
- re.search(r'HF_TOKEN\s*:\s*.*os\.getenv\("HF_TOKEN"\)', inference_src) is not None,
234
- ) else 1
 
 
 
 
 
 
235
  except FileNotFoundError:
236
  failures += 1
237
  check("inference.py exists", False, "file not found")
 
220
  ) else 1
221
  for var in ["API_BASE_URL", "MODEL_NAME", "HF_TOKEN", "ENV_URL", "LOCAL_IMAGE_NAME"]:
222
  failures += 0 if check(f"inference.py reads {var} from env", var in inference_src) else 1
223
+ api_base_default_ok = (
224
+ 'os.getenv("API_BASE_URL", "https://api.openai.com/v1")' in inference_src
225
+ or re.search(r'API_BASE_URL\s*=.*os\.getenv\("API_BASE_URL"\)\s*or\s*"https://api\.openai\.com/v1"', inference_src)
226
+ is not None
227
+ )
228
+ failures += 0 if check("API_BASE_URL has a default", api_base_default_ok) else 1
229
+
230
+ model_default_ok = (
231
+ 'os.getenv("MODEL_NAME", "gpt-4o-mini")' in inference_src
232
+ or re.search(r'MODEL_NAME\s*=.*os\.getenv\("MODEL_NAME"\)\s*or\s*"gpt-4o-mini"', inference_src) is not None
233
+ )
234
+ failures += 0 if check("MODEL_NAME has a default", model_default_ok) else 1
235
+
236
+ hf_token_no_default_ok = (
237
+ re.search(r'HF_TOKEN\s*=.*os\.getenv\("HF_TOKEN"\)\s*$', inference_src, flags=re.MULTILINE) is not None
238
+ and re.search(r'os\.getenv\("HF_TOKEN"\s*,', inference_src) is None
239
+ )
240
+ failures += 0 if check("HF_TOKEN has no default", hf_token_no_default_ok) else 1
241
  except FileNotFoundError:
242
  failures += 1
243
  check("inference.py exists", False, "file not found")