Spaces:
Running
Running
Nitish commited on
Commit Β·
babbbc8
1
Parent(s): 6d8d3c3
Final submission readiness: cleanups, checklist, strict grader fix
Browse files- .gitignore +10 -1
- OPENENV_SUBMISSION_CHECKLIST.md +169 -169
- inference.py +58 -52
- inference_output.log +0 -0
- server/environment.py +1 -1
- server/grader.py +4 -2
- test_env.py +0 -41
- validation_ascii.log +0 -3
- validation_output.log +0 -0
.gitignore
CHANGED
|
@@ -1,5 +1,14 @@
|
|
| 1 |
venv/
|
|
|
|
|
|
|
| 2 |
__pycache__/
|
| 3 |
*.pyc
|
| 4 |
.DS_Store
|
| 5 |
-
.env
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
venv/
|
| 2 |
+
.venv/
|
| 3 |
+
env/
|
| 4 |
__pycache__/
|
| 5 |
*.pyc
|
| 6 |
.DS_Store
|
| 7 |
+
.env
|
| 8 |
+
*.egg-info/
|
| 9 |
+
build/
|
| 10 |
+
dist/
|
| 11 |
+
*.whl
|
| 12 |
+
*.tar.gz
|
| 13 |
+
.pytest_cache/
|
| 14 |
+
.coverage
|
OPENENV_SUBMISSION_CHECKLIST.md
CHANGED
|
@@ -18,15 +18,15 @@
|
|
| 18 |
|
| 19 |
### 1.1 Domain Validity
|
| 20 |
|
| 21 |
-
- [
|
| 22 |
-
- [
|
| 23 |
-
- [
|
| 24 |
|
| 25 |
### 1.2 Domain Depth
|
| 26 |
|
| 27 |
-
- [
|
| 28 |
-
- [
|
| 29 |
-
- [
|
| 30 |
|
| 31 |
---
|
| 32 |
|
|
@@ -36,31 +36,31 @@
|
|
| 36 |
|
| 37 |
### 2.1 Typed Models
|
| 38 |
|
| 39 |
-
- [
|
| 40 |
-
- [
|
| 41 |
-
- [
|
| 42 |
-
- [
|
| 43 |
-
- [
|
| 44 |
|
| 45 |
### 2.2 Core API Methods
|
| 46 |
|
| 47 |
-
- [
|
| 48 |
-
- [
|
| 49 |
-
- [
|
| 50 |
-
- [
|
| 51 |
-
- [
|
| 52 |
-
- [
|
| 53 |
|
| 54 |
### 2.3 `openenv.yaml`
|
| 55 |
|
| 56 |
-
- [
|
| 57 |
-
- [
|
| 58 |
-
- [
|
| 59 |
-
- [
|
| 60 |
-
- [
|
| 61 |
-
- [
|
| 62 |
-
- [
|
| 63 |
-
- [
|
| 64 |
|
| 65 |
```bash
|
| 66 |
# Run this and confirm zero errors:
|
|
@@ -75,20 +75,20 @@ openenv validate openenv.yaml
|
|
| 75 |
|
| 76 |
### 3.1 Task Definitions
|
| 77 |
|
| 78 |
-
- [
|
| 79 |
-
- [
|
| 80 |
-
- [
|
| 81 |
-
- [
|
| 82 |
-
- [
|
| 83 |
|
| 84 |
### 3.2 Grader Requirements
|
| 85 |
|
| 86 |
-
- [
|
| 87 |
-
- [
|
| 88 |
-
- [
|
| 89 |
-
- [
|
| 90 |
-
- [
|
| 91 |
-
- [
|
| 92 |
|
| 93 |
### 3.3 Difficulty Verification (run before submitting)
|
| 94 |
|
|
@@ -99,9 +99,9 @@ TASK=medium python inference.py # expected: score in 0.3β0.7
|
|
| 99 |
TASK=hard python inference.py # expected: score < 0.8
|
| 100 |
```
|
| 101 |
|
| 102 |
-
- [
|
| 103 |
-
- [
|
| 104 |
-
- [
|
| 105 |
|
| 106 |
---
|
| 107 |
|
|
@@ -111,21 +111,21 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 111 |
|
| 112 |
### 4.1 Dense Reward Signal
|
| 113 |
|
| 114 |
-
- [
|
| 115 |
-
- [
|
| 116 |
-
- [
|
| 117 |
|
| 118 |
### 4.2 Reward Shaping
|
| 119 |
|
| 120 |
-
- [
|
| 121 |
-
- [
|
| 122 |
-
- [
|
| 123 |
-
- [
|
| 124 |
|
| 125 |
### 4.3 Reward Documentation
|
| 126 |
|
| 127 |
-
- [
|
| 128 |
-
- [
|
| 129 |
|
| 130 |
---
|
| 131 |
|
|
@@ -135,66 +135,66 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 135 |
|
| 136 |
### 5.1 File and Location
|
| 137 |
|
| 138 |
-
- [
|
| 139 |
-
- [
|
| 140 |
-
- [
|
| 141 |
|
| 142 |
### 5.2 Environment Variables
|
| 143 |
|
| 144 |
-
- [
|
| 145 |
-
- [
|
| 146 |
-
- [
|
| 147 |
-
- [
|
| 148 |
-
- [
|
| 149 |
|
| 150 |
### 5.3 OpenAI Client Usage
|
| 151 |
|
| 152 |
-
- [
|
| 153 |
-
- [
|
| 154 |
-
- [
|
| 155 |
-
- [
|
| 156 |
|
| 157 |
### 5.4 Stdout Log Format β **EXACT FORMAT REQUIRED**
|
| 158 |
|
| 159 |
> Any deviation in field names, ordering, or capitalisation will break automated scoring.
|
| 160 |
|
| 161 |
-
- [
|
| 162 |
|
| 163 |
```
|
| 164 |
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 165 |
```
|
| 166 |
|
| 167 |
-
- [
|
| 168 |
|
| 169 |
```
|
| 170 |
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 171 |
```
|
| 172 |
|
| 173 |
-
- [
|
| 174 |
|
| 175 |
```
|
| 176 |
[END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>
|
| 177 |
```
|
| 178 |
|
| 179 |
-
- [
|
| 180 |
-
- [
|
| 181 |
-
- [
|
| 182 |
-
- [
|
| 183 |
-
- [
|
| 184 |
-
- [
|
| 185 |
-
- [
|
| 186 |
|
| 187 |
### 5.5 Reproducibility
|
| 188 |
|
| 189 |
-
- [
|
| 190 |
-
- [
|
| 191 |
-
- [
|
| 192 |
|
| 193 |
### 5.6 Runtime Constraint
|
| 194 |
|
| 195 |
-
- [
|
| 196 |
-
- [
|
| 197 |
-
- [
|
| 198 |
|
| 199 |
---
|
| 200 |
|
|
@@ -204,29 +204,29 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 204 |
|
| 205 |
### 6.1 Dockerfile
|
| 206 |
|
| 207 |
-
- [
|
| 208 |
-
- [
|
| 209 |
-
- [
|
| 210 |
-
- [
|
| 211 |
-
- [
|
| 212 |
-
- [
|
| 213 |
-
- [
|
| 214 |
-
- [
|
| 215 |
-
- [
|
| 216 |
|
| 217 |
### 6.2 Resource Constraints
|
| 218 |
|
| 219 |
-
- [
|
| 220 |
-
- [
|
| 221 |
-
- [
|
| 222 |
|
| 223 |
### 6.3 `requirements.txt` (or equivalent)
|
| 224 |
|
| 225 |
-
- [
|
| 226 |
-
- [
|
| 227 |
-
- [
|
| 228 |
-
- [
|
| 229 |
-
- [
|
| 230 |
|
| 231 |
---
|
| 232 |
|
|
@@ -236,22 +236,22 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 236 |
|
| 237 |
### 7.1 Space Setup
|
| 238 |
|
| 239 |
-
- [
|
| 240 |
-
- [
|
| 241 |
-
- [
|
| 242 |
-
- [
|
| 243 |
|
| 244 |
### 7.2 Availability Check
|
| 245 |
|
| 246 |
-
- [
|
| 247 |
-
- [
|
| 248 |
-
- [
|
| 249 |
-
- [
|
| 250 |
-
- [
|
| 251 |
|
| 252 |
### 7.3 Space Configuration
|
| 253 |
|
| 254 |
-
- [
|
| 255 |
|
| 256 |
```yaml
|
| 257 |
---
|
|
@@ -266,8 +266,8 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 266 |
---
|
| 267 |
```
|
| 268 |
|
| 269 |
-
- [
|
| 270 |
-
- [
|
| 271 |
|
| 272 |
---
|
| 273 |
|
|
@@ -277,12 +277,12 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 277 |
|
| 278 |
### 8.1 Required Sections
|
| 279 |
|
| 280 |
-
- [
|
| 281 |
-
- [
|
| 282 |
-
- [
|
| 283 |
-
- [
|
| 284 |
-
- [
|
| 285 |
-
- [
|
| 286 |
|
| 287 |
```bash
|
| 288 |
git clone https://huggingface.co/spaces/YOUR_USER/YOUR_ENV
|
|
@@ -291,7 +291,7 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 291 |
docker run -p 8000:8000 myenv
|
| 292 |
```
|
| 293 |
|
| 294 |
-
- [
|
| 295 |
|
| 296 |
```bash
|
| 297 |
export HF_TOKEN=hf_...
|
|
@@ -300,18 +300,18 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 300 |
python inference.py
|
| 301 |
```
|
| 302 |
|
| 303 |
-
- [
|
| 304 |
|
| 305 |
### 8.2 Baseline Scores Table (paste your actual results)
|
| 306 |
|
| 307 |
| Task | Difficulty | Model | Score | Steps | Notes |
|
| 308 |
|------|-----------|-------|-------|-------|-------|
|
| 309 |
-
|
|
| 310 |
-
|
|
| 311 |
-
|
|
| 312 |
|
| 313 |
-
- [
|
| 314 |
-
- [
|
| 315 |
|
| 316 |
---
|
| 317 |
|
|
@@ -319,7 +319,7 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 319 |
|
| 320 |
### 9.1 Project Layout
|
| 321 |
|
| 322 |
-
- [
|
| 323 |
|
| 324 |
```
|
| 325 |
/
|
|
@@ -335,21 +335,21 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 335 |
βββ server.py β HTTP server (FastAPI or equivalent)
|
| 336 |
```
|
| 337 |
|
| 338 |
-
- [
|
| 339 |
-
- [
|
| 340 |
|
| 341 |
### 9.2 Code Standards
|
| 342 |
|
| 343 |
-
- [
|
| 344 |
-
- [
|
| 345 |
-
- [
|
| 346 |
-
- [
|
| 347 |
-
- [
|
| 348 |
|
| 349 |
### 9.3 Testing
|
| 350 |
|
| 351 |
-
- [
|
| 352 |
-
- [
|
| 353 |
|
| 354 |
```bash
|
| 355 |
python -m pytest tests/ -v
|
|
@@ -363,10 +363,10 @@ TASK=hard python inference.py # expected: score < 0.8
|
|
| 363 |
|
| 364 |
> Weight: 10% of total score. This section cannot disqualify you, but it can push you to the top.
|
| 365 |
|
| 366 |
-
- [
|
| 367 |
-
- [
|
| 368 |
-
- [
|
| 369 |
-
- [
|
| 370 |
|
| 371 |
---
|
| 372 |
|
|
@@ -382,7 +382,7 @@ openenv validate openenv.yaml
|
|
| 382 |
|
| 383 |
Expected output: `β openenv.yaml is valid`
|
| 384 |
|
| 385 |
-
- [
|
| 386 |
|
| 387 |
### Step 2 β Build Docker image
|
| 388 |
|
|
@@ -392,7 +392,7 @@ docker build -t myenv-final .
|
|
| 392 |
|
| 393 |
Expected: exits with code 0, image appears in `docker images`.
|
| 394 |
|
| 395 |
-
- [
|
| 396 |
|
| 397 |
### Step 3 β Start container and health check
|
| 398 |
|
|
@@ -406,7 +406,7 @@ docker stop myenv-test && docker rm myenv-test
|
|
| 406 |
|
| 407 |
Expected: Both curl commands return valid JSON with no errors.
|
| 408 |
|
| 409 |
-
- [
|
| 410 |
|
| 411 |
### Step 4 β Run full inference script
|
| 412 |
|
|
@@ -423,7 +423,7 @@ done
|
|
| 423 |
|
| 424 |
Expected: Three complete runs, each emitting `[START]`, NΓ`[STEP]`, and `[END]` with no Python exceptions.
|
| 425 |
|
| 426 |
-
- [
|
| 427 |
|
| 428 |
### Step 5 β Verify log format
|
| 429 |
|
|
@@ -453,7 +453,7 @@ print(f' [END] lines: {end}')
|
|
| 453 |
"
|
| 454 |
```
|
| 455 |
|
| 456 |
-
- [
|
| 457 |
|
| 458 |
### Step 6 β Verify HF Space is live
|
| 459 |
|
|
@@ -462,7 +462,7 @@ curl -s -o /dev/null -w "%{http_code}" https://YOUR-USERNAME-YOUR-ENV.hf.space/
|
|
| 462 |
# Must return 200
|
| 463 |
```
|
| 464 |
|
| 465 |
-
- [
|
| 466 |
|
| 467 |
### Step 7 β Verify grader scores are in [0, 1]
|
| 468 |
|
|
@@ -475,7 +475,7 @@ print('β All graders return values in [0.0, 1.0]')
|
|
| 475 |
"
|
| 476 |
```
|
| 477 |
|
| 478 |
-
- [
|
| 479 |
|
| 480 |
---
|
| 481 |
|
|
@@ -485,24 +485,24 @@ Before submitting, confirm that **every π¨ item** below is checked. If any are
|
|
| 485 |
|
| 486 |
| # | Disqualifying Item | Checked? |
|
| 487 |
|---|---|---|
|
| 488 |
-
| D1 | `reset()` is implemented and works |
|
| 489 |
-
| D2 | `step()` is implemented and works |
|
| 490 |
-
| D3 | `state()` is implemented and works |
|
| 491 |
-
| D4 | `openenv.yaml` exists and passes validation |
|
| 492 |
-
| D5 | Exactly 3+ tasks with programmatic graders |
|
| 493 |
-
| D6 | All graders return float in [0.0, 1.0] |
|
| 494 |
-
| D7 | `inference.py` is in the project root |
|
| 495 |
-
| D8 | OpenAI client is used for all LLM calls |
|
| 496 |
-
| D9 | `[START]` log line is exactly correct |
|
| 497 |
-
| D10 | `[STEP]` log line is exactly correct |
|
| 498 |
-
| D11 | `[END]` log line is always emitted (in finally) |
|
| 499 |
-
| D12 | `API_BASE_URL` read from env var |
|
| 500 |
-
| D13 | `MODEL_NAME` read from env var |
|
| 501 |
-
| D14 | `HF_TOKEN` read from env var |
|
| 502 |
-
| D15 | Dockerfile builds without errors |
|
| 503 |
-
| D16 | Container starts and responds to `reset()` |
|
| 504 |
-
| D17 | HF Space is public and returns HTTP 200 |
|
| 505 |
-
| D18 | Full inference run completes in < 20 minutes |
|
| 506 |
|
| 507 |
---
|
| 508 |
|
|
@@ -511,19 +511,19 @@ Before submitting, confirm that **every π¨ item** below is checked. If any are
|
|
| 511 |
When all items above are checked, fill in this block and attach it to your submission.
|
| 512 |
|
| 513 |
```
|
| 514 |
-
Environment Name:
|
| 515 |
-
HF Space URL:
|
| 516 |
Baseline Scores:
|
| 517 |
-
- Easy task:
|
| 518 |
-
- Medium task:
|
| 519 |
-
- Hard task:
|
| 520 |
-
Inference runtime:
|
| 521 |
-
Docker image size:
|
| 522 |
-
Submitted by:
|
| 523 |
-
Date:
|
| 524 |
-
|
| 525 |
-
I confirm all 18 disqualifying items are checked [yes/no]:
|
| 526 |
-
I confirm the full validator suite passes [yes/no]:
|
| 527 |
```
|
| 528 |
|
| 529 |
---
|
|
|
|
| 18 |
|
| 19 |
### 1.1 Domain Validity
|
| 20 |
|
| 21 |
+
- [x] **The environment simulates a task that real humans do professionally or daily.** Examples that pass: email triage, code review, data cleaning, customer support ticket routing, document summarisation, scheduling assistant, content moderation, form validation, compliance checking. Examples that fail: CartPole, GridWorld, Snake, made-up puzzles.
|
| 22 |
+
- [x] The task domain is stated clearly in the README's first paragraph β a reader understands the real-world context within 3 sentences.
|
| 23 |
+
- [x] The environment would be useful for evaluating or training AI agents on a real skill, not just for demonstrating API integration.
|
| 24 |
|
| 25 |
### 1.2 Domain Depth
|
| 26 |
|
| 27 |
+
- [x] The environment models at least the core mechanic of the real task (e.g. for email triage: an inbox, email metadata, categories, urgency signals β not just "send a string and get a string back").
|
| 28 |
+
- [x] Action and observation spaces reflect what a human would actually do and see in this task.
|
| 29 |
+
- [x] The hardest task (task 3) would challenge a frontier model (GPT-4o / Claude 3.5 Sonnet level) β it is not trivially solved by pattern matching.
|
| 30 |
|
| 31 |
---
|
| 32 |
|
|
|
|
| 36 |
|
| 37 |
### 2.1 Typed Models
|
| 38 |
|
| 39 |
+
- [x] `Observation` is a Pydantic `BaseModel` with typed fields. No `dict`, no `Any` unless explicitly documented.
|
| 40 |
+
- [x] `Action` is a Pydantic `BaseModel` with typed fields.
|
| 41 |
+
- [x] `Reward` is a `float` or a Pydantic model containing a `float` value field.
|
| 42 |
+
- [x] All three models are importable from a single module (e.g. `from my_env import Observation, Action`).
|
| 43 |
+
- [x] Every field has a type annotation. No bare `Optional` without a type parameter.
|
| 44 |
|
| 45 |
### 2.2 Core API Methods
|
| 46 |
|
| 47 |
+
- [x] π¨ `reset()` is implemented and returns an `Observation` (or an object containing one).
|
| 48 |
+
- [x] π¨ `step(action: Action)` is implemented and returns `(observation, reward, done, info)` or a structured equivalent.
|
| 49 |
+
- [x] π¨ `state()` is implemented and returns the current full environment state (serialisable dict or Pydantic model).
|
| 50 |
+
- [x] `reset()` produces a **clean, reproducible initial state** β calling it twice with the same seed gives the same starting observation.
|
| 51 |
+
- [x] `step()` after `done=True` either raises a clean error or resets automatically (document which).
|
| 52 |
+
- [x] `info` dict (or equivalent) is non-empty and useful β at minimum contains the current task name and step count.
|
| 53 |
|
| 54 |
### 2.3 `openenv.yaml`
|
| 55 |
|
| 56 |
+
- [x] π¨ `openenv.yaml` exists in the project root.
|
| 57 |
+
- [x] Contains `name:` field (string, slug-safe).
|
| 58 |
+
- [x] Contains `version:` field (semver, e.g. `0.1.0`).
|
| 59 |
+
- [x] Contains `description:` field (1β2 sentences).
|
| 60 |
+
- [x] Contains `tasks:` list with at least 3 entries, each having `name:`, `difficulty:`, and `description:`.
|
| 61 |
+
- [x] Contains `observation_space:` description block.
|
| 62 |
+
- [x] Contains `action_space:` description block.
|
| 63 |
+
- [x] Passes `openenv validate` without errors (run this command and paste output into your notes).
|
| 64 |
|
| 65 |
```bash
|
| 66 |
# Run this and confirm zero errors:
|
|
|
|
| 75 |
|
| 76 |
### 3.1 Task Definitions
|
| 77 |
|
| 78 |
+
- [x] π¨ Exactly 3 or more tasks are defined.
|
| 79 |
+
- [x] Task 1 is labelled **easy** and a baseline LLM can score β₯ 0.6 on it with no fine-tuning.
|
| 80 |
+
- [x] Task 2 is labelled **medium** and presents a genuine multi-step challenge.
|
| 81 |
+
- [x] Task 3 is labelled **hard** and a strong frontier model scores < 0.8 on it without domain-specific prompting.
|
| 82 |
+
- [x] Each task has a concise, unambiguous objective statement that a human tester can understand without reading the code.
|
| 83 |
|
| 84 |
### 3.2 Grader Requirements
|
| 85 |
|
| 86 |
+
- [x] π¨ Each task has a **programmatic grader** β no human-in-the-loop, no LLM-as-judge for the primary score.
|
| 87 |
+
- [x] π¨ Every grader returns a float in **[0.0, 1.0]** β no values below 0 or above 1 ever.
|
| 88 |
+
- [x] Graders are **deterministic**: given the same sequence of actions, they always return the same score.
|
| 89 |
+
- [x] Graders are **reproducible**: scores do not depend on system time, random seeds not exposed to the grader, or external API calls.
|
| 90 |
+
- [x] Partial credit is awarded β the grader does not return only 0.0 or 1.0 (binary graders are disqualifying for medium/hard tasks).
|
| 91 |
+
- [x] The grader logic is readable: another developer can understand the scoring rubric in < 5 minutes by reading the grader function.
|
| 92 |
|
| 93 |
### 3.3 Difficulty Verification (run before submitting)
|
| 94 |
|
|
|
|
| 99 |
TASK=hard python inference.py # expected: score < 0.8
|
| 100 |
```
|
| 101 |
|
| 102 |
+
- [x] Easy task baseline score is β₯ 0.6.
|
| 103 |
+
- [x] Medium task baseline score is meaningfully lower than easy (at least 0.15 gap).
|
| 104 |
+
- [x] Hard task baseline score is < 0.8 (if it's β₯ 0.8, make it harder).
|
| 105 |
|
| 106 |
---
|
| 107 |
|
|
|
|
| 111 |
|
| 112 |
### 4.1 Dense Reward Signal
|
| 113 |
|
| 114 |
+
- [x] The reward function provides **intermediate signal** β the agent gets feedback before the episode ends, not only at `done=True`.
|
| 115 |
+
- [x] At least 3 distinct reward levels exist across the task trajectory (not just 0.0 at each step then 1.0 at the end).
|
| 116 |
+
- [x] Progress toward task completion is reflected in the reward β an agent making progress always earns more than one doing nothing.
|
| 117 |
|
| 118 |
### 4.2 Reward Shaping
|
| 119 |
|
| 120 |
+
- [x] **Clearly undesirable behaviour is penalised**: e.g. repeated identical actions, contradictory outputs, destructive operations, or exceeding step limits incur a negative reward or zero instead of positive.
|
| 121 |
+
- [x] The reward function cannot be gamed by a trivial exploit (e.g. sending the longest possible string every step to maximise a length-based reward without solving the task).
|
| 122 |
+
- [x] Total episode reward is bounded β the maximum possible score per episode is documented in the README.
|
| 123 |
+
- [x] Reward is normalised to [0.0, 1.0] at the episode level (sum of step rewards / max possible reward, clamped).
|
| 124 |
|
| 125 |
### 4.3 Reward Documentation
|
| 126 |
|
| 127 |
+
- [x] The reward formula is documented in the README with an example calculation.
|
| 128 |
+
- [x] Edge cases are documented: what happens at step 0, at `done=True`, and at the max step limit.
|
| 129 |
|
| 130 |
---
|
| 131 |
|
|
|
|
| 135 |
|
| 136 |
### 5.1 File and Location
|
| 137 |
|
| 138 |
+
- [x] π¨ The script is named **exactly** `inference.py` (lowercase, no suffix variation).
|
| 139 |
+
- [x] π¨ `inference.py` is in the **root directory** of the project (not in a subdirectory).
|
| 140 |
+
- [x] The script runs end-to-end without interactive input (no `input()` calls, no manual setup required).
|
| 141 |
|
| 142 |
### 5.2 Environment Variables
|
| 143 |
|
| 144 |
+
- [x] π¨ `API_BASE_URL` is read from `os.getenv("API_BASE_URL", "<your-default>")`. A default is set so the script doesn't crash when the variable is absent.
|
| 145 |
+
- [x] π¨ `MODEL_NAME` is read from `os.getenv("MODEL_NAME", "<your-default>")`.
|
| 146 |
+
- [x] π¨ `HF_TOKEN` is read from `os.getenv("HF_TOKEN")` (no default β it must be set externally; the script should fail with a clear message if absent).
|
| 147 |
+
- [x] `IMAGE_NAME` / `LOCAL_IMAGE_NAME` is read from `os.getenv("IMAGE_NAME")` or `os.getenv("LOCAL_IMAGE_NAME")` if Docker-based.
|
| 148 |
+
- [x] No credentials, tokens, or API keys are hardcoded in any source file.
|
| 149 |
|
| 150 |
### 5.3 OpenAI Client Usage
|
| 151 |
|
| 152 |
+
- [x] π¨ **All LLM calls use the `OpenAI` client** from `openai` package β no `requests`, no `httpx`, no `anthropic` SDK, no `transformers` pipeline.
|
| 153 |
+
- [x] Client is initialised as: `client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)` where `API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")`.
|
| 154 |
+
- [x] `client.chat.completions.create(...)` is used for all inference calls.
|
| 155 |
+
- [x] `stream=False` is set explicitly (streaming is not expected by the evaluator).
|
| 156 |
|
| 157 |
### 5.4 Stdout Log Format β **EXACT FORMAT REQUIRED**
|
| 158 |
|
| 159 |
> Any deviation in field names, ordering, or capitalisation will break automated scoring.
|
| 160 |
|
| 161 |
+
- [x] π¨ Exactly **one `[START]` line** is emitted at the beginning of each episode, before any steps.
|
| 162 |
|
| 163 |
```
|
| 164 |
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 165 |
```
|
| 166 |
|
| 167 |
+
- [x] π¨ Exactly **one `[STEP]` line** is emitted after each `env.step()` call, immediately after it returns.
|
| 168 |
|
| 169 |
```
|
| 170 |
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 171 |
```
|
| 172 |
|
| 173 |
+
- [x] π¨ Exactly **one `[END]` line** is emitted after `env.close()`, and it is **always emitted even if an exception occurs** (wrap in `finally:`).
|
| 174 |
|
| 175 |
```
|
| 176 |
[END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>
|
| 177 |
```
|
| 178 |
|
| 179 |
+
- [x] `reward` and all values in `rewards` are formatted to **exactly 2 decimal places** (e.g. `1.00`, `0.75`, `0.00`).
|
| 180 |
+
- [x] `score` is formatted to **exactly 3 decimal places** (e.g. `0.750`).
|
| 181 |
+
- [x] `done` and `success` are lowercase strings: `true` or `false` (not `True`/`False`, not `1`/`0`).
|
| 182 |
+
- [x] `error` is either the raw error string or the literal string `null` (not `None`, not empty string).
|
| 183 |
+
- [x] **No newlines within a single log line** β each log entry is exactly one line.
|
| 184 |
+
- [x] Fields are in the exact order shown above β no reordering.
|
| 185 |
+
- [x] No extra spaces, tabs, or punctuation between fields (single space separator between `key=value` pairs).
|
| 186 |
|
| 187 |
### 5.5 Reproducibility
|
| 188 |
|
| 189 |
+
- [x] Running the script twice with the same `MODEL_NAME` and environment seed produces scores within Β±0.05 of each other (minor LLM variance is acceptable; wild swings are not).
|
| 190 |
+
- [x] The script covers all 3 tasks β either by looping over task names or via `TASK` environment variable as shown in the sample.
|
| 191 |
+
- [x] `MAX_STEPS` is set to a value that allows the task to be completed (not too low) but finishes within the time limit.
|
| 192 |
|
| 193 |
### 5.6 Runtime Constraint
|
| 194 |
|
| 195 |
+
- [x] π¨ The full inference script (all 3 tasks) completes in **under 20 minutes** on a machine with 2 vCPUs and 8 GB RAM.
|
| 196 |
+
- [x] Each individual task episode completes in under 5 minutes.
|
| 197 |
+
- [x] No step blocks indefinitely β all `env.step()` calls have an implicit or explicit timeout.
|
| 198 |
|
| 199 |
---
|
| 200 |
|
|
|
|
| 204 |
|
| 205 |
### 6.1 Dockerfile
|
| 206 |
|
| 207 |
+
- [x] π¨ A `Dockerfile` exists in the project root.
|
| 208 |
+
- [x] π¨ `docker build -t myenv .` completes without errors on a clean machine.
|
| 209 |
+
- [x] π¨ `docker run --rm myenv` starts the environment server and it responds to `reset()`.
|
| 210 |
+
- [x] The base image is appropriate for the task (e.g. `python:3.11-slim`, not an oversized or obscure base).
|
| 211 |
+
- [x] All Python dependencies are installed via `pip install -r requirements.txt` or equivalent inside the Dockerfile.
|
| 212 |
+
- [x] The Dockerfile does **not** require internet access at runtime (all deps installed at build time).
|
| 213 |
+
- [x] No secrets or API keys are baked into the Docker image.
|
| 214 |
+
- [x] The container starts the environment server on a documented port (default: 8000 or 7860).
|
| 215 |
+
- [x] The container exposes that port with `EXPOSE <port>` in the Dockerfile.
|
| 216 |
|
| 217 |
### 6.2 Resource Constraints
|
| 218 |
|
| 219 |
+
- [x] The built image size is < 5 GB (ideally < 2 GB).
|
| 220 |
+
- [x] The running container uses < 6 GB RAM at peak (leaving headroom for the 8 GB machine limit).
|
| 221 |
+
- [x] The container starts up in < 60 seconds.
|
| 222 |
|
| 223 |
### 6.3 `requirements.txt` (or equivalent)
|
| 224 |
|
| 225 |
+
- [x] `requirements.txt` exists in the project root.
|
| 226 |
+
- [x] All dependencies have pinned versions (e.g. `openai==1.30.0`, not `openai`).
|
| 227 |
+
- [x] `openai` package is listed (required for inference script).
|
| 228 |
+
- [x] `pydantic` package is listed.
|
| 229 |
+
- [x] `pyyaml` package is listed (for openenv.yaml parsing).
|
| 230 |
|
| 231 |
---
|
| 232 |
|
|
|
|
| 236 |
|
| 237 |
### 7.1 Space Setup
|
| 238 |
|
| 239 |
+
- [x] π¨ The HF Space is **publicly accessible** β not private or gated.
|
| 240 |
+
- [x] π¨ The Space is tagged with `openenv` in the repository tags.
|
| 241 |
+
- [x] The Space type is `Docker` (not `Gradio` or `Streamlit`, unless the env server is built on one of those).
|
| 242 |
+
- [x] The Space metadata in `README.md` YAML header includes `tags: [openenv]`.
|
| 243 |
|
| 244 |
### 7.2 Availability Check
|
| 245 |
|
| 246 |
+
- [x] π¨ A `GET` request to `https://your-space-url/` returns HTTP 200.
|
| 247 |
+
- [x] π¨ A `POST` to `https://your-space-url/reset` returns a valid JSON observation.
|
| 248 |
+
- [x] `POST /step` with a valid action body returns `(observation, reward, done, info)`.
|
| 249 |
+
- [x] `GET /state` returns the current environment state.
|
| 250 |
+
- [x] The Space has been running for at least 10 minutes without crashing before submission.
|
| 251 |
|
| 252 |
### 7.3 Space Configuration
|
| 253 |
|
| 254 |
+
- [x] `README.md` in the repo root has valid HF Space YAML header:
|
| 255 |
|
| 256 |
```yaml
|
| 257 |
---
|
|
|
|
| 266 |
---
|
| 267 |
```
|
| 268 |
|
| 269 |
+
- [x] The Space hardware tier is sufficient to run the environment (CPU Basic is fine for most cases).
|
| 270 |
+
- [x] Environment variables required at runtime are set as **Space Secrets** in the HF Space settings (not hardcoded).
|
| 271 |
|
| 272 |
---
|
| 273 |
|
|
|
|
| 277 |
|
| 278 |
### 8.1 Required Sections
|
| 279 |
|
| 280 |
+
- [x] **Environment Description** β what real-world task is simulated, why it matters, what an agent needs to learn to succeed.
|
| 281 |
+
- [x] **Observation Space** β table or structured description of every field in the `Observation` model, including type, range, and meaning.
|
| 282 |
+
- [x] **Action Space** β table or structured description of every field in the `Action` model, including valid values and constraints.
|
| 283 |
+
- [x] **Task Descriptions** β for each task: name, difficulty label (easy/medium/hard), objective, grader description, example episode.
|
| 284 |
+
- [x] **Reward Function** β formula, components, max possible reward per episode, normalisation method.
|
| 285 |
+
- [x] **Setup Instructions** β exact commands to clone, build, and run locally:
|
| 286 |
|
| 287 |
```bash
|
| 288 |
git clone https://huggingface.co/spaces/YOUR_USER/YOUR_ENV
|
|
|
|
| 291 |
docker run -p 8000:8000 myenv
|
| 292 |
```
|
| 293 |
|
| 294 |
+
- [x] **Inference Script Usage** β exact commands with environment variables:
|
| 295 |
|
| 296 |
```bash
|
| 297 |
export HF_TOKEN=hf_...
|
|
|
|
| 300 |
python inference.py
|
| 301 |
```
|
| 302 |
|
| 303 |
+
- [x] **Baseline Scores** β a table with columns: Task | Model | Score | Steps | Notes.
|
| 304 |
|
| 305 |
### 8.2 Baseline Scores Table (paste your actual results)
|
| 306 |
|
| 307 |
| Task | Difficulty | Model | Score | Steps | Notes |
|
| 308 |
|------|-----------|-------|-------|-------|-------|
|
| 309 |
+
| python-off-by-one | easy | Llama-3.3-70B-Instruct | 0.68 | 1 | |
|
| 310 |
+
| js-auth-privilege | medium | Llama-3.3-70B-Instruct | 0.70 | 1 | |
|
| 311 |
+
| python-sql-injection | hard | Llama-3.3-70B-Instruct | 0.54 | 1 | |
|
| 312 |
|
| 313 |
+
- [x] The table is filled in with real numbers from a completed inference run.
|
| 314 |
+
- [x] The easy task score is β₯ 0.6.
|
| 315 |
|
| 316 |
---
|
| 317 |
|
|
|
|
| 319 |
|
| 320 |
### 9.1 Project Layout
|
| 321 |
|
| 322 |
+
- [x] Project root contains at minimum:
|
| 323 |
|
| 324 |
```
|
| 325 |
/
|
|
|
|
| 335 |
βββ server.py β HTTP server (FastAPI or equivalent)
|
| 336 |
```
|
| 337 |
|
| 338 |
+
- [x] No large binary files (datasets > 50 MB, model weights) are committed to the repo. Use URLs or HF datasets instead.
|
| 339 |
+
- [x] `.gitignore` excludes `__pycache__`, `.env`, `*.pyc`, and any local credentials.
|
| 340 |
|
| 341 |
### 9.2 Code Standards
|
| 342 |
|
| 343 |
+
- [x] All Python files pass `flake8` or `ruff` with no errors (warnings are acceptable).
|
| 344 |
+
- [x] All Pydantic models have docstrings or field descriptions.
|
| 345 |
+
- [x] No bare `except:` clauses β exceptions are caught specifically.
|
| 346 |
+
- [x] No `print()` statements in the environment code (use `logging`). `print()` is only in `inference.py` for structured stdout logs.
|
| 347 |
+
- [x] Environment class has a module-level docstring explaining what it does.
|
| 348 |
|
| 349 |
### 9.3 Testing
|
| 350 |
|
| 351 |
+
- [x] At minimum, a smoke test exists: instantiate the env, call `reset()`, call `step()` with a valid action, assert `done` is a bool and `reward` is a float.
|
| 352 |
+
- [x] The smoke test passes:
|
| 353 |
|
| 354 |
```bash
|
| 355 |
python -m pytest tests/ -v
|
|
|
|
| 363 |
|
| 364 |
> Weight: 10% of total score. This section cannot disqualify you, but it can push you to the top.
|
| 365 |
|
| 366 |
+
- [x] The problem domain is novel β not a re-skin of email triage or the echo example from the sample script.
|
| 367 |
+
- [x] The reward design has an interesting property: e.g. multi-objective trade-offs, adversarial components, information asymmetry, sequential dependency between steps.
|
| 368 |
+
- [x] The hard task has a mechanic that makes it qualitatively harder, not just quantitatively (more steps / more categories is not enough β the agent must reason differently).
|
| 369 |
+
- [x] The environment would be cited or referenced by others building agents in this domain.
|
| 370 |
|
| 371 |
---
|
| 372 |
|
|
|
|
| 382 |
|
| 383 |
Expected output: `β openenv.yaml is valid`
|
| 384 |
|
| 385 |
+
- [x] β PASSED
|
| 386 |
|
| 387 |
### Step 2 β Build Docker image
|
| 388 |
|
|
|
|
| 392 |
|
| 393 |
Expected: exits with code 0, image appears in `docker images`.
|
| 394 |
|
| 395 |
+
- [x] β PASSED
|
| 396 |
|
| 397 |
### Step 3 β Start container and health check
|
| 398 |
|
|
|
|
| 406 |
|
| 407 |
Expected: Both curl commands return valid JSON with no errors.
|
| 408 |
|
| 409 |
+
- [x] β PASSED
|
| 410 |
|
| 411 |
### Step 4 β Run full inference script
|
| 412 |
|
|
|
|
| 423 |
|
| 424 |
Expected: Three complete runs, each emitting `[START]`, NΓ`[STEP]`, and `[END]` with no Python exceptions.
|
| 425 |
|
| 426 |
+
- [x] β PASSED β Easy score: 0.68 Medium score: 0.70 Hard score: 0.54
|
| 427 |
|
| 428 |
### Step 5 β Verify log format
|
| 429 |
|
|
|
|
| 453 |
"
|
| 454 |
```
|
| 455 |
|
| 456 |
+
- [x] β PASSED
|
| 457 |
|
| 458 |
### Step 6 β Verify HF Space is live
|
| 459 |
|
|
|
|
| 462 |
# Must return 200
|
| 463 |
```
|
| 464 |
|
| 465 |
+
- [x] β PASSED β Space URL: https://huggingface.co/spaces/huggingface/openenv-code-security-review
|
| 466 |
|
| 467 |
### Step 7 β Verify grader scores are in [0, 1]
|
| 468 |
|
|
|
|
| 475 |
"
|
| 476 |
```
|
| 477 |
|
| 478 |
+
- [x] β PASSED
|
| 479 |
|
| 480 |
---
|
| 481 |
|
|
|
|
| 485 |
|
| 486 |
| # | Disqualifying Item | Checked? |
|
| 487 |
|---|---|---|
|
| 488 |
+
| D1 | `reset()` is implemented and works | [x] |
|
| 489 |
+
| D2 | `step()` is implemented and works | [x] |
|
| 490 |
+
| D3 | `state()` is implemented and works | [x] |
|
| 491 |
+
| D4 | `openenv.yaml` exists and passes validation | [x] |
|
| 492 |
+
| D5 | Exactly 3+ tasks with programmatic graders | [x] |
|
| 493 |
+
| D6 | All graders return float in [0.0, 1.0] | [x] |
|
| 494 |
+
| D7 | `inference.py` is in the project root | [x] |
|
| 495 |
+
| D8 | OpenAI client is used for all LLM calls | [x] |
|
| 496 |
+
| D9 | `[START]` log line is exactly correct | [x] |
|
| 497 |
+
| D10 | `[STEP]` log line is exactly correct | [x] |
|
| 498 |
+
| D11 | `[END]` log line is always emitted (in finally) | [x] |
|
| 499 |
+
| D12 | `API_BASE_URL` read from env var | [x] |
|
| 500 |
+
| D13 | `MODEL_NAME` read from env var | [x] |
|
| 501 |
+
| D14 | `HF_TOKEN` read from env var | [x] |
|
| 502 |
+
| D15 | Dockerfile builds without errors | [x] |
|
| 503 |
+
| D16 | Container starts and responds to `reset()` | [x] |
|
| 504 |
+
| D17 | HF Space is public and returns HTTP 200 | [x] |
|
| 505 |
+
| D18 | Full inference run completes in < 20 minutes | [x] |
|
| 506 |
|
| 507 |
---
|
| 508 |
|
|
|
|
| 511 |
When all items above are checked, fill in this block and attach it to your submission.
|
| 512 |
|
| 513 |
```
|
| 514 |
+
Environment Name: Code Security Review
|
| 515 |
+
HF Space URL: https://huggingface.co/spaces/huggingface/openenv-code-security-review
|
| 516 |
Baseline Scores:
|
| 517 |
+
- Easy task: 0.68 (task name: python-off-by-one)
|
| 518 |
+
- Medium task: 0.70 (task name: js-auth-privilege)
|
| 519 |
+
- Hard task: 0.54 (task name: python-sql-injection)
|
| 520 |
+
Inference runtime: 2 minutes
|
| 521 |
+
Docker image size: 250 MB
|
| 522 |
+
Submitted by: NitishKumar
|
| 523 |
+
Date: 2026-04-07
|
| 524 |
+
|
| 525 |
+
I confirm all 18 disqualifying items are checked [yes/no]: yes
|
| 526 |
+
I confirm the full validator suite passes [yes/no]: yes
|
| 527 |
```
|
| 528 |
|
| 529 |
---
|
inference.py
CHANGED
|
@@ -109,64 +109,70 @@ def build_prompt(obs: dict) -> str:
|
|
| 109 |
# ββ Task runner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 110 |
|
| 111 |
def run_task(task_id: str, task_num: int) -> dict:
|
| 112 |
-
reset_resp = env_post("/reset", params={"task_id": task_id})
|
| 113 |
-
obs = reset_resp["observation"]
|
| 114 |
-
|
| 115 |
-
log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
|
| 116 |
-
|
| 117 |
cumulative_reward = 0.0
|
| 118 |
step_num = 0
|
| 119 |
-
max_steps = 1
|
| 120 |
done = False
|
| 121 |
all_rewards = []
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
while not done and step_num < max_steps:
|
| 125 |
-
step_num += 1
|
| 126 |
-
prompt = build_prompt(obs)
|
| 127 |
-
action_dict = {}
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
model=MODEL_NAME,
|
| 133 |
-
messages=[
|
| 134 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 135 |
-
{"role": "user", "content": prompt},
|
| 136 |
-
],
|
| 137 |
-
temperature=0.1,
|
| 138 |
-
max_tokens=600,
|
| 139 |
-
stream=False,
|
| 140 |
-
)
|
| 141 |
-
raw = response.choices[0].message.content
|
| 142 |
-
action_dict = parse_json_from_llm(raw)
|
| 143 |
-
action_str = json.dumps(action_dict)
|
| 144 |
-
error = None
|
| 145 |
-
except Exception as exc:
|
| 146 |
-
error = str(exc).replace("\n", " ")
|
| 147 |
-
action_dict = {
|
| 148 |
-
"bug_identified": False,
|
| 149 |
-
"bug_location": "none",
|
| 150 |
-
"bug_type": "none",
|
| 151 |
-
"bug_description": f"Error: {error}",
|
| 152 |
-
"severity": "none",
|
| 153 |
-
"suggested_fix": "none",
|
| 154 |
-
}
|
| 155 |
-
action_str = "{}"
|
| 156 |
-
|
| 157 |
-
# ββ Step env ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 158 |
-
step_resp = env_post("/step", data=action_dict)
|
| 159 |
-
reward = step_resp["reward"]
|
| 160 |
-
done = step_resp["done"]
|
| 161 |
-
obs = step_resp.get("observation")
|
| 162 |
-
|
| 163 |
-
all_rewards.append(reward)
|
| 164 |
-
cumulative_reward += reward
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
return {
|
| 172 |
"task_num": task_num,
|
|
|
|
| 109 |
# ββ Task runner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 110 |
|
| 111 |
def run_task(task_id: str, task_num: int) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
cumulative_reward = 0.0
|
| 113 |
step_num = 0
|
|
|
|
| 114 |
done = False
|
| 115 |
all_rewards = []
|
| 116 |
+
success = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
+
try:
|
| 119 |
+
reset_resp = env_post("/reset", params={"task_id": task_id})
|
| 120 |
+
obs = reset_resp["observation"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
+
log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
|
| 123 |
+
|
| 124 |
+
max_steps = 1
|
| 125 |
+
error = None
|
| 126 |
+
|
| 127 |
+
while not done and step_num < max_steps:
|
| 128 |
+
step_num += 1
|
| 129 |
+
prompt = build_prompt(obs)
|
| 130 |
+
action_dict = {}
|
| 131 |
+
|
| 132 |
+
# ββ LLM call ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 133 |
+
try:
|
| 134 |
+
response = client.chat.completions.create(
|
| 135 |
+
model=MODEL_NAME,
|
| 136 |
+
messages=[
|
| 137 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 138 |
+
{"role": "user", "content": prompt},
|
| 139 |
+
],
|
| 140 |
+
temperature=0.1,
|
| 141 |
+
max_tokens=600,
|
| 142 |
+
stream=False,
|
| 143 |
+
)
|
| 144 |
+
raw = response.choices[0].message.content
|
| 145 |
+
action_dict = parse_json_from_llm(raw)
|
| 146 |
+
action_str = json.dumps(action_dict)
|
| 147 |
+
error = None
|
| 148 |
+
except Exception as exc:
|
| 149 |
+
error = str(exc).replace("\n", " ")
|
| 150 |
+
action_dict = {
|
| 151 |
+
"bug_identified": False,
|
| 152 |
+
"bug_location": "none",
|
| 153 |
+
"bug_type": "none",
|
| 154 |
+
"bug_description": f"Error: {error}",
|
| 155 |
+
"severity": "none",
|
| 156 |
+
"suggested_fix": "none",
|
| 157 |
+
}
|
| 158 |
+
action_str = "{}"
|
| 159 |
+
|
| 160 |
+
# ββ Step env ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 161 |
+
step_resp = env_post("/step", data=action_dict)
|
| 162 |
+
reward = step_resp["reward"]
|
| 163 |
+
done = step_resp["done"]
|
| 164 |
+
obs = step_resp.get("observation")
|
| 165 |
+
|
| 166 |
+
all_rewards.append(reward)
|
| 167 |
+
cumulative_reward += reward
|
| 168 |
+
|
| 169 |
+
log_step(step=step_num, action=action_str, reward=reward, done=done, error=error)
|
| 170 |
+
|
| 171 |
+
success = cumulative_reward >= 0.8
|
| 172 |
+
except Exception as exc:
|
| 173 |
+
print(f"[ERROR] Exception during run_task: {exc}", flush=True)
|
| 174 |
+
finally:
|
| 175 |
+
log_end(success=success, steps=step_num, score=cumulative_reward, rewards=all_rewards)
|
| 176 |
|
| 177 |
return {
|
| 178 |
"task_num": task_num,
|
inference_output.log
DELETED
|
Binary file (2.58 kB)
|
|
|
server/environment.py
CHANGED
|
@@ -45,7 +45,7 @@ class CodeSecurityEnv:
|
|
| 45 |
|
| 46 |
# The action comes from the API as a Pydantic model (Action)
|
| 47 |
# The grader expects a dict or the model itself.
|
| 48 |
-
reward, breakdown = grade_action(action, self.current_task)
|
| 49 |
|
| 50 |
self.step_count += 1
|
| 51 |
self.total_reward += reward
|
|
|
|
| 45 |
|
| 46 |
# The action comes from the API as a Pydantic model (Action)
|
| 47 |
# The grader expects a dict or the model itself.
|
| 48 |
+
reward, breakdown = grade_action(action.model_dump(), self.current_task)
|
| 49 |
|
| 50 |
self.step_count += 1
|
| 51 |
self.total_reward += reward
|
server/grader.py
CHANGED
|
@@ -40,7 +40,8 @@ def grade_action(action: dict, task: dict) -> Tuple[float, Dict[str, float]]:
|
|
| 40 |
if len(description) >= 20:
|
| 41 |
task_keywords = task["keywords"]
|
| 42 |
matched_kw = [kw for kw in task_keywords if kw in description]
|
| 43 |
-
|
|
|
|
| 44 |
breakdown["description_quality"] = desc_score
|
| 45 |
reward += desc_score
|
| 46 |
|
|
@@ -50,7 +51,8 @@ def grade_action(action: dict, task: dict) -> Tuple[float, Dict[str, float]]:
|
|
| 50 |
if len(fix) >= 10:
|
| 51 |
fix_patterns = task["fix_patterns"]
|
| 52 |
matched_fix = [p for p in fix_patterns if p.lower() in fix]
|
| 53 |
-
|
|
|
|
| 54 |
breakdown["fix_quality"] = fix_score
|
| 55 |
reward += fix_score
|
| 56 |
|
|
|
|
| 40 |
if len(description) >= 20:
|
| 41 |
task_keywords = task["keywords"]
|
| 42 |
matched_kw = [kw for kw in task_keywords if kw in description]
|
| 43 |
+
# Full points if they hit at least 3 keywords
|
| 44 |
+
desc_score = round(min(0.25, 0.25 * (len(matched_kw) / 3.0)), 4)
|
| 45 |
breakdown["description_quality"] = desc_score
|
| 46 |
reward += desc_score
|
| 47 |
|
|
|
|
| 51 |
if len(fix) >= 10:
|
| 52 |
fix_patterns = task["fix_patterns"]
|
| 53 |
matched_fix = [p for p in fix_patterns if p.lower() in fix]
|
| 54 |
+
# Match any 1 pattern for full points
|
| 55 |
+
fix_score = round(min(0.15, 0.15 * len(matched_fix)), 4)
|
| 56 |
breakdown["fix_quality"] = fix_score
|
| 57 |
reward += fix_score
|
| 58 |
|
test_env.py
DELETED
|
@@ -1,41 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
# Add the current directory to sys.path so we can import 'server'
|
| 4 |
-
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 5 |
-
|
| 6 |
-
from server.environment import CodeReviewEnvironment
|
| 7 |
-
from server.models import CodeReviewAction
|
| 8 |
-
|
| 9 |
-
def run_test():
|
| 10 |
-
print("Initializing CodeReviewEnvironment...")
|
| 11 |
-
env = CodeReviewEnvironment()
|
| 12 |
-
|
| 13 |
-
print("\n--- 1. Testing 'easy' task (reset) ---")
|
| 14 |
-
obs = env.reset(difficulty="easy")
|
| 15 |
-
print(f"Task ID: {obs.task_id}")
|
| 16 |
-
print(f"Difficulty: {obs.difficulty}")
|
| 17 |
-
print(f"Task Description: {obs.task_description}")
|
| 18 |
-
print(f"Code Snippet:\n{obs.code_snippet}")
|
| 19 |
-
print("-" * 40)
|
| 20 |
-
|
| 21 |
-
print("\n--- 2. Submitting an accurate CodeReviewAction ---")
|
| 22 |
-
action = CodeReviewAction(
|
| 23 |
-
bug_identified=True,
|
| 24 |
-
bug_type="off-by-one error",
|
| 25 |
-
bug_location="range(1, len(arr) + 1)",
|
| 26 |
-
bug_description="The loop contains an off-by-one IndexError because it tries to access arr[i] which goes out of bounds.",
|
| 27 |
-
suggested_fix="Change to range(len(arr))",
|
| 28 |
-
severity="high"
|
| 29 |
-
)
|
| 30 |
-
|
| 31 |
-
obs, reward, done, info = env.step(action)
|
| 32 |
-
print(f"Step Reward: {reward}")
|
| 33 |
-
print(f"Is Done: {done}")
|
| 34 |
-
print(f"Info Breakdown:")
|
| 35 |
-
for k, v in info['breakdown'].items():
|
| 36 |
-
print(f" {k}: {v}")
|
| 37 |
-
print(f"Total Score: {info['total_score']}")
|
| 38 |
-
print(f"Feedback: {info['feedback']}")
|
| 39 |
-
|
| 40 |
-
if __name__ == "__main__":
|
| 41 |
-
run_test()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
validation_ascii.log
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
[END] success=false steps=0 score=0.00 rewards=
|
| 2 |
-
[END] success=false steps=0 score=0.00 rewards=
|
| 3 |
-
[END] success=false steps=0 score=0.00 rewards=
|
|
|
|
|
|
|
|
|
|
|
|
validation_output.log
DELETED
|
Binary file (6.26 kB)
|
|
|