Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- inference.py +15 -13
- openenv.yaml +28 -20
inference.py
CHANGED
|
@@ -233,7 +233,9 @@ async def run_episode(client, env):
|
|
| 233 |
reward = result.reward
|
| 234 |
done = result.done
|
| 235 |
|
| 236 |
-
|
|
|
|
|
|
|
| 237 |
final_score = max(final_score, reward if reward else 0.0)
|
| 238 |
|
| 239 |
return final_score
|
|
@@ -243,26 +245,26 @@ async def main():
|
|
| 243 |
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 244 |
|
| 245 |
scores = []
|
| 246 |
-
log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
|
| 247 |
|
| 248 |
async with CodeReviewEnv(base_url="https://h1manshu-code-review.hf.space") as env:
|
|
|
|
| 249 |
for i in range(NUM_EPISODES):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
env.task_index = i
|
| 251 |
|
| 252 |
score = await run_episode(client, env)
|
| 253 |
scores.append(score)
|
| 254 |
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
success=success,
|
| 262 |
-
steps=NUM_EPISODES * MAX_STEPS,
|
| 263 |
-
score=final_score,
|
| 264 |
-
rewards=scores,
|
| 265 |
-
)
|
| 266 |
|
| 267 |
|
| 268 |
if __name__ == "__main__":
|
|
|
|
| 233 |
reward = result.reward
|
| 234 |
done = result.done
|
| 235 |
|
| 236 |
+
action_str = action_dict.get("action_type", "unknown")
|
| 237 |
+
log_step(step=step, action=action_str, reward=reward, done=done, error=None)
|
| 238 |
+
|
| 239 |
final_score = max(final_score, reward if reward else 0.0)
|
| 240 |
|
| 241 |
return final_score
|
|
|
|
| 245 |
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 246 |
|
| 247 |
scores = []
|
|
|
|
| 248 |
|
| 249 |
async with CodeReviewEnv(base_url="https://h1manshu-code-review.hf.space") as env:
|
| 250 |
+
|
| 251 |
for i in range(NUM_EPISODES):
|
| 252 |
+
task_name = f"task_{i+1}"
|
| 253 |
+
|
| 254 |
+
# START log must use task id from openenv.yaml
|
| 255 |
+
log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
|
| 256 |
+
|
| 257 |
env.task_index = i
|
| 258 |
|
| 259 |
score = await run_episode(client, env)
|
| 260 |
scores.append(score)
|
| 261 |
|
| 262 |
+
log_end(
|
| 263 |
+
success=score >= SUCCESS_SCORE_THRESHOLD,
|
| 264 |
+
steps=MAX_STEPS,
|
| 265 |
+
score=score,
|
| 266 |
+
rewards=[score],
|
| 267 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
|
| 270 |
if __name__ == "__main__":
|
openenv.yaml
CHANGED
|
@@ -5,23 +5,31 @@ runtime: fastapi
|
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
| 7 |
tasks:
|
| 8 |
-
- id:
|
| 9 |
-
|
| 10 |
-
max_steps:
|
| 11 |
-
grader:
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
| 7 |
tasks:
|
| 8 |
+
- id: task_1
|
| 9 |
+
description: "Easy — missing import detection"
|
| 10 |
+
max_steps: 3
|
| 11 |
+
grader: graders:CodeReviewGrader
|
| 12 |
+
- id: task_2
|
| 13 |
+
description: "Medium — division by zero handling"
|
| 14 |
+
max_steps: 3
|
| 15 |
+
grader: graders:CodeReviewGrader
|
| 16 |
+
- id: task_3
|
| 17 |
+
description: "Medium — inefficient loop optimization"
|
| 18 |
+
max_steps: 3
|
| 19 |
+
grader: graders:CodeReviewGrader
|
| 20 |
+
- id: task_4
|
| 21 |
+
description: "Hard — hardcoded password security vulnerability"
|
| 22 |
+
max_steps: 3
|
| 23 |
+
grader: graders:CodeReviewGrader
|
| 24 |
+
- id: task_5
|
| 25 |
+
description: "Hard — SQL injection vulnerability"
|
| 26 |
+
max_steps: 3
|
| 27 |
+
grader: graders:CodeReviewGrader
|
| 28 |
+
- id: task_6
|
| 29 |
+
description: "Hard — cross-file null handling bug"
|
| 30 |
+
max_steps: 3
|
| 31 |
+
grader: graders:CodeReviewGrader
|
| 32 |
+
endpoints:
|
| 33 |
+
reset: /reset
|
| 34 |
+
step: /step
|
| 35 |
+
health: /health
|