h1manshu committed on
Commit
08fe580
·
verified ·
1 Parent(s): 31c8479

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. inference.py +15 -13
  2. openenv.yaml +28 -20
inference.py CHANGED
@@ -233,7 +233,9 @@ async def run_episode(client, env):
233
  reward = result.reward
234
  done = result.done
235
 
236
- log_step(step=step, action=response_text, reward=reward, done=done, error=None)
 
 
237
  final_score = max(final_score, reward if reward else 0.0)
238
 
239
  return final_score
@@ -243,26 +245,26 @@ async def main():
243
  client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
244
 
245
  scores = []
246
- log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
247
 
248
  async with CodeReviewEnv(base_url="https://h1manshu-code-review.hf.space") as env:
 
249
  for i in range(NUM_EPISODES):
 
 
 
 
 
250
  env.task_index = i
251
 
252
  score = await run_episode(client, env)
253
  scores.append(score)
254
 
255
- # print(f"[INFO] Scores so far: {scores}", flush=True)
256
-
257
- total_score = sum(scores)
258
- final_score = total_score / NUM_EPISODES
259
- success = final_score >= SUCCESS_SCORE_THRESHOLD
260
- log_end(
261
- success=success,
262
- steps=NUM_EPISODES * MAX_STEPS,
263
- score=final_score,
264
- rewards=scores,
265
- )
266
 
267
 
268
  if __name__ == "__main__":
 
233
  reward = result.reward
234
  done = result.done
235
 
236
+ action_str = action_dict.get("action_type", "unknown")
237
+ log_step(step=step, action=action_str, reward=reward, done=done, error=None)
238
+
239
  final_score = max(final_score, reward if reward else 0.0)
240
 
241
  return final_score
 
245
  client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
246
 
247
  scores = []
 
248
 
249
  async with CodeReviewEnv(base_url="https://h1manshu-code-review.hf.space") as env:
250
+
251
  for i in range(NUM_EPISODES):
252
+ task_name = f"task_{i+1}"
253
+
254
+ # START log must use task id from openenv.yaml
255
+ log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
256
+
257
  env.task_index = i
258
 
259
  score = await run_episode(client, env)
260
  scores.append(score)
261
 
262
+ log_end(
263
+ success=score >= SUCCESS_SCORE_THRESHOLD,
264
+ steps=MAX_STEPS,
265
+ score=score,
266
+ rewards=[score],
267
+ )
 
 
 
 
 
268
 
269
 
270
  if __name__ == "__main__":
openenv.yaml CHANGED
@@ -5,23 +5,31 @@ runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
  tasks:
8
- - id: task_easy
9
- difficulty: easy
10
- max_steps: 10
11
- grader:
12
- type: llm
13
- prompt_template: "Score this response 0.0 to 1.0 based on accuracy..."
14
-
15
- - id: task_medium
16
- difficulty: medium
17
- max_steps: 15
18
- grader:
19
- type: llm
20
- prompt_template: "Score this response 0.0 to 1.0 based on..."
21
-
22
- - id: task_hard
23
- difficulty: hard
24
- max_steps: 20
25
- grader:
26
- type: llm
27
- prompt_template: "Score this response 0.0 to 1.0 based on..."
 
 
 
 
 
 
 
 
 
5
  app: server.app:app
6
  port: 8000
7
  tasks:
8
+ - id: task_1
9
+ description: "Easy — missing import detection"
10
+ max_steps: 3
11
+ grader: graders:CodeReviewGrader
12
+ - id: task_2
13
+ description: "Medium — division by zero handling"
14
+ max_steps: 3
15
+ grader: graders:CodeReviewGrader
16
+ - id: task_3
17
+ description: "Medium — inefficient loop optimization"
18
+ max_steps: 3
19
+ grader: graders:CodeReviewGrader
20
+ - id: task_4
21
+ description: "Hard — hardcoded password security vulnerability"
22
+ max_steps: 3
23
+ grader: graders:CodeReviewGrader
24
+ - id: task_5
25
+ description: "Hard — SQL injection vulnerability"
26
+ max_steps: 3
27
+ grader: graders:CodeReviewGrader
28
+ - id: task_6
29
+ description: "Hard — cross-file null handling bug"
30
+ max_steps: 3
31
+ grader: graders:CodeReviewGrader
32
+ endpoints:
33
+ reset: /reset
34
+ step: /step
35
+ health: /health