Nitish commited on
Commit Β·
f44f429
0
Parent(s):
feat: Code Security Review OpenEnv - Final Submission
Browse files- .gitattributes +35 -0
- .gitignore +14 -0
- Dockerfile +20 -0
- OPENENV_SUBMISSION_CHECKLIST.md +531 -0
- README.md +185 -0
- inference.py +302 -0
- openenv.yaml +82 -0
- output.txt +13 -0
- pyproject.toml +27 -0
- qa_test.py +237 -0
- requirements.txt +8 -0
- server/__init__.py +5 -0
- server/app.py +121 -0
- server/environment.py +136 -0
- server/grader.py +115 -0
- server/models.py +69 -0
- server/tasks.py +117 -0
- static/index.html +168 -0
- static/main.js +209 -0
- static/style.css +470 -0
- uv.lock +0 -0
- validate.sh +103 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
venv/
|
| 2 |
+
.venv/
|
| 3 |
+
env/
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.pyc
|
| 6 |
+
.DS_Store
|
| 7 |
+
.env
|
| 8 |
+
*.egg-info/
|
| 9 |
+
build/
|
| 10 |
+
dist/
|
| 11 |
+
*.whl
|
| 12 |
+
*.tar.gz
|
| 13 |
+
.pytest_cache/
|
| 14 |
+
.coveragevenv/
|
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install dependencies first (layer cache)
|
| 6 |
+
COPY requirements.txt .
|
| 7 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 8 |
+
pip install --no-cache-dir -r requirements.txt
|
| 9 |
+
|
| 10 |
+
# Copy all project files (needed for openenv validate to work inside)
|
| 11 |
+
COPY . .
|
| 12 |
+
|
| 13 |
+
# Environment defaults (Hugging Face Spaces use 7860)
|
| 14 |
+
ENV PORT=7860
|
| 15 |
+
ENV PYTHONPATH=/app
|
| 16 |
+
ENV ENABLE_WEB_INTERFACE=false
|
| 17 |
+
|
| 18 |
+
EXPOSE 7860
|
| 19 |
+
|
| 20 |
+
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
OPENENV_SUBMISSION_CHECKLIST.md
ADDED
|
@@ -0,0 +1,531 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenEnv Submission Checklist
|
| 2 |
+
> Complete every item before final submission. A single β in any **DISQUALIFYING** section means you cannot submit.
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
## HOW TO USE THIS CHECKLIST
|
| 7 |
+
|
| 8 |
+
1. Work through each section **in order** β earlier sections unblock later ones.
|
| 9 |
+
2. Mark each item `[x]` when confirmed, or add a note if it needs fixing.
|
| 10 |
+
3. Any item marked **π¨ DISQUALIFYING** must be `[x]` before submission or you will be automatically rejected.
|
| 11 |
+
4. After all items are checked, run the final validator command at the bottom.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## SECTION 1 β REAL-WORLD TASK SIMULATION
|
| 16 |
+
|
| 17 |
+
> Weight: 30% of total score. Judges will ask: "Would a practitioner actually use this?"
|
| 18 |
+
|
| 19 |
+
### 1.1 Domain Validity
|
| 20 |
+
|
| 21 |
+
- [x] **The environment simulates a task that real humans do professionally or daily.** Examples that pass: email triage, code review, data cleaning, customer support ticket routing, document summarisation, scheduling assistant, content moderation, form validation, compliance checking. Examples that fail: CartPole, GridWorld, Snake, made-up puzzles.
|
| 22 |
+
- [x] The task domain is stated clearly in the README's first paragraph β a reader understands the real-world context within 3 sentences.
|
| 23 |
+
- [x] The environment would be useful for evaluating or training AI agents on a real skill, not just for demonstrating API integration.
|
| 24 |
+
|
| 25 |
+
### 1.2 Domain Depth
|
| 26 |
+
|
| 27 |
+
- [x] The environment models at least the core mechanic of the real task (e.g. for email triage: an inbox, email metadata, categories, urgency signals β not just "send a string and get a string back").
|
| 28 |
+
- [x] Action and observation spaces reflect what a human would actually do and see in this task.
|
| 29 |
+
- [x] The hardest task (task 3) would challenge a frontier model (GPT-4o / Claude 3.5 Sonnet level) β it is not trivially solved by pattern matching.
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## SECTION 2 β OPENENV SPEC COMPLIANCE
|
| 34 |
+
|
| 35 |
+
> Weight: part of the 15% code quality score. **All π¨ items are disqualifying.**
|
| 36 |
+
|
| 37 |
+
### 2.1 Typed Models
|
| 38 |
+
|
| 39 |
+
- [x] `Observation` is a Pydantic `BaseModel` with typed fields. No `dict`, no `Any` unless explicitly documented.
|
| 40 |
+
- [x] `Action` is a Pydantic `BaseModel` with typed fields.
|
| 41 |
+
- [x] `Reward` is a `float` or a Pydantic model containing a `float` value field.
|
| 42 |
+
- [x] All three models are importable from a single module (e.g. `from my_env import Observation, Action`).
|
| 43 |
+
- [x] Every field has a type annotation. No bare `Optional` without a type parameter.
|
| 44 |
+
|
| 45 |
+
### 2.2 Core API Methods
|
| 46 |
+
|
| 47 |
+
- [x] π¨ `reset()` is implemented and returns an `Observation` (or an object containing one).
|
| 48 |
+
- [x] π¨ `step(action: Action)` is implemented and returns `(observation, reward, done, info)` or a structured equivalent.
|
| 49 |
+
- [x] π¨ `state()` is implemented and returns the current full environment state (serialisable dict or Pydantic model).
|
| 50 |
+
- [x] `reset()` produces a **clean, reproducible initial state** β calling it twice with the same seed gives the same starting observation.
|
| 51 |
+
- [x] `step()` after `done=True` either raises a clean error or resets automatically (document which).
|
| 52 |
+
- [x] `info` dict (or equivalent) is non-empty and useful β at minimum contains the current task name and step count.
|
| 53 |
+
|
| 54 |
+
### 2.3 `openenv.yaml`
|
| 55 |
+
|
| 56 |
+
- [x] π¨ `openenv.yaml` exists in the project root.
|
| 57 |
+
- [x] Contains `name:` field (string, slug-safe).
|
| 58 |
+
- [x] Contains `version:` field (semver, e.g. `0.1.0`).
|
| 59 |
+
- [x] Contains `description:` field (1β2 sentences).
|
| 60 |
+
- [x] Contains `tasks:` list with at least 3 entries, each having `name:`, `difficulty:`, and `description:`.
|
| 61 |
+
- [x] Contains `observation_space:` description block.
|
| 62 |
+
- [x] Contains `action_space:` description block.
|
| 63 |
+
- [x] Passes `openenv validate` without errors (run this command and paste output into your notes).
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
# Run this and confirm zero errors:
|
| 67 |
+
openenv validate openenv.yaml
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
## SECTION 3 β MINIMUM 3 TASKS WITH AGENT GRADERS
|
| 73 |
+
|
| 74 |
+
> Weight: 25% of total score. All π¨ items are disqualifying.
|
| 75 |
+
|
| 76 |
+
### 3.1 Task Definitions
|
| 77 |
+
|
| 78 |
+
- [x] π¨ Exactly 3 or more tasks are defined.
|
| 79 |
+
- [x] Task 1 is labelled **easy** and a baseline LLM can score β₯ 0.6 on it with no fine-tuning.
|
| 80 |
+
- [x] Task 2 is labelled **medium** and presents a genuine multi-step challenge.
|
| 81 |
+
- [x] Task 3 is labelled **hard** and a strong frontier model scores < 0.8 on it without domain-specific prompting.
|
| 82 |
+
- [x] Each task has a concise, unambiguous objective statement that a human tester can understand without reading the code.
|
| 83 |
+
|
| 84 |
+
### 3.2 Grader Requirements
|
| 85 |
+
|
| 86 |
+
- [x] π¨ Each task has a **programmatic grader** β no human-in-the-loop, no LLM-as-judge for the primary score.
|
| 87 |
+
- [x] π¨ Every grader returns a float in **[0.0, 1.0]** β no values below 0 or above 1 ever.
|
| 88 |
+
- [x] Graders are **deterministic**: given the same sequence of actions, they always return the same score.
|
| 89 |
+
- [x] Graders are **reproducible**: scores do not depend on system time, random seeds not exposed to the grader, or external API calls.
|
| 90 |
+
- [x] Partial credit is awarded β the grader does not return only 0.0 or 1.0 (binary graders are disqualifying for medium/hard tasks).
|
| 91 |
+
- [x] The grader logic is readable: another developer can understand the scoring rubric in < 5 minutes by reading the grader function.
|
| 92 |
+
|
| 93 |
+
### 3.3 Difficulty Verification (run before submitting)
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
# Run baseline inference on all three tasks and record scores:
|
| 97 |
+
TASK=easy python inference.py # expected: score >= 0.6
|
| 98 |
+
TASK=medium python inference.py # expected: score in 0.3β0.7
|
| 99 |
+
TASK=hard python inference.py # expected: score < 0.8
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
- [x] Easy task baseline score is β₯ 0.6.
|
| 103 |
+
- [x] Medium task baseline score is meaningfully lower than easy (at least 0.15 gap).
|
| 104 |
+
- [x] Hard task baseline score is < 0.8 (if it's β₯ 0.8, make it harder).
|
| 105 |
+
|
| 106 |
+
---
|
| 107 |
+
|
| 108 |
+
## SECTION 4 β MEANINGFUL REWARD FUNCTION
|
| 109 |
+
|
| 110 |
+
> Weight: part of the 20% environment design score.
|
| 111 |
+
|
| 112 |
+
### 4.1 Dense Reward Signal
|
| 113 |
+
|
| 114 |
+
- [x] The reward function provides **intermediate signal** β the agent gets feedback before the episode ends, not only at `done=True`.
|
| 115 |
+
- [x] At least 3 distinct reward levels exist across the task trajectory (not just 0.0 at each step then 1.0 at the end).
|
| 116 |
+
- [x] Progress toward task completion is reflected in the reward β an agent making progress always earns more than one doing nothing.
|
| 117 |
+
|
| 118 |
+
### 4.2 Reward Shaping
|
| 119 |
+
|
| 120 |
+
- [x] **Clearly undesirable behaviour is penalised**: e.g. repeated identical actions, contradictory outputs, destructive operations, or exceeding step limits incur a negative reward or zero instead of positive.
|
| 121 |
+
- [x] The reward function cannot be gamed by a trivial exploit (e.g. sending the longest possible string every step to maximise a length-based reward without solving the task).
|
| 122 |
+
- [x] Total episode reward is bounded β the maximum possible score per episode is documented in the README.
|
| 123 |
+
- [x] Reward is normalised to [0.0, 1.0] at the episode level (sum of step rewards / max possible reward, clamped).
|
| 124 |
+
|
| 125 |
+
### 4.3 Reward Documentation
|
| 126 |
+
|
| 127 |
+
- [x] The reward formula is documented in the README with an example calculation.
|
| 128 |
+
- [x] Edge cases are documented: what happens at step 0, at `done=True`, and at the max step limit.
|
| 129 |
+
|
| 130 |
+
---
|
| 131 |
+
|
| 132 |
+
## SECTION 5 β BASELINE INFERENCE SCRIPT
|
| 133 |
+
|
| 134 |
+
> Weight: part of the 15% code quality score. All π¨ items are disqualifying.
|
| 135 |
+
|
| 136 |
+
### 5.1 File and Location
|
| 137 |
+
|
| 138 |
+
- [x] π¨ The script is named **exactly** `inference.py` (lowercase, no suffix variation).
|
| 139 |
+
- [x] π¨ `inference.py` is in the **root directory** of the project (not in a subdirectory).
|
| 140 |
+
- [x] The script runs end-to-end without interactive input (no `input()` calls, no manual setup required).
|
| 141 |
+
|
| 142 |
+
### 5.2 Environment Variables
|
| 143 |
+
|
| 144 |
+
- [x] π¨ `API_BASE_URL` is read from `os.getenv("API_BASE_URL", "<your-default>")`. A default is set so the script doesn't crash when the variable is absent.
|
| 145 |
+
- [x] π¨ `MODEL_NAME` is read from `os.getenv("MODEL_NAME", "<your-default>")`.
|
| 146 |
+
- [x] π¨ `HF_TOKEN` is read from `os.getenv("HF_TOKEN")` (no default β it must be set externally; the script should fail with a clear message if absent).
|
| 147 |
+
- [x] `IMAGE_NAME` / `LOCAL_IMAGE_NAME` is read from `os.getenv("IMAGE_NAME")` or `os.getenv("LOCAL_IMAGE_NAME")` if Docker-based.
|
| 148 |
+
- [x] No credentials, tokens, or API keys are hardcoded in any source file.
|
| 149 |
+
|
| 150 |
+
### 5.3 OpenAI Client Usage
|
| 151 |
+
|
| 152 |
+
- [x] π¨ **All LLM calls use the `OpenAI` client** from `openai` package β no `requests`, no `httpx`, no `anthropic` SDK, no `transformers` pipeline.
|
| 153 |
+
- [x] Client is initialised as: `client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)` where `API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")`.
|
| 154 |
+
- [x] `client.chat.completions.create(...)` is used for all inference calls.
|
| 155 |
+
- [x] `stream=False` is set explicitly (streaming is not expected by the evaluator).
|
| 156 |
+
|
| 157 |
+
### 5.4 Stdout Log Format β **EXACT FORMAT REQUIRED**
|
| 158 |
+
|
| 159 |
+
> Any deviation in field names, ordering, or capitalisation will break automated scoring.
|
| 160 |
+
|
| 161 |
+
- [x] π¨ Exactly **one `[START]` line** is emitted at the beginning of each episode, before any steps.
|
| 162 |
+
|
| 163 |
+
```
|
| 164 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
- [x] π¨ Exactly **one `[STEP]` line** is emitted after each `env.step()` call, immediately after it returns.
|
| 168 |
+
|
| 169 |
+
```
|
| 170 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
- [x] π¨ Exactly **one `[END]` line** is emitted after `env.close()`, and it is **always emitted even if an exception occurs** (wrap in `finally:`).
|
| 174 |
+
|
| 175 |
+
```
|
| 176 |
+
[END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
- [x] `reward` and all values in `rewards` are formatted to **exactly 2 decimal places** (e.g. `1.00`, `0.75`, `0.00`).
|
| 180 |
+
- [x] `score` is formatted to **exactly 3 decimal places** (e.g. `0.750`).
|
| 181 |
+
- [x] `done` and `success` are lowercase strings: `true` or `false` (not `True`/`False`, not `1`/`0`).
|
| 182 |
+
- [x] `error` is either the raw error string or the literal string `null` (not `None`, not empty string).
|
| 183 |
+
- [x] **No newlines within a single log line** β each log entry is exactly one line.
|
| 184 |
+
- [x] Fields are in the exact order shown above β no reordering.
|
| 185 |
+
- [x] No extra spaces, tabs, or punctuation between fields (single space separator between `key=value` pairs).
|
| 186 |
+
|
| 187 |
+
### 5.5 Reproducibility
|
| 188 |
+
|
| 189 |
+
- [x] Running the script twice with the same `MODEL_NAME` and environment seed produces scores within Β±0.05 of each other (minor LLM variance is acceptable; wild swings are not).
|
| 190 |
+
- [x] The script covers all 3 tasks β either by looping over task names or via `TASK` environment variable as shown in the sample.
|
| 191 |
+
- [x] `MAX_STEPS` is set to a value that allows the task to be completed (not too low) but finishes within the time limit.
|
| 192 |
+
|
| 193 |
+
### 5.6 Runtime Constraint
|
| 194 |
+
|
| 195 |
+
- [x] π¨ The full inference script (all 3 tasks) completes in **under 20 minutes** on a machine with 2 vCPUs and 8 GB RAM.
|
| 196 |
+
- [x] Each individual task episode completes in under 5 minutes.
|
| 197 |
+
- [x] No step blocks indefinitely β all `env.step()` calls have an implicit or explicit timeout.
|
| 198 |
+
|
| 199 |
+
---
|
| 200 |
+
|
| 201 |
+
## SECTION 6 β DOCKER AND CONTAINERISATION
|
| 202 |
+
|
| 203 |
+
> Weight: part of the 15% code quality score. All π¨ items are disqualifying.
|
| 204 |
+
|
| 205 |
+
### 6.1 Dockerfile
|
| 206 |
+
|
| 207 |
+
- [x] π¨ A `Dockerfile` exists in the project root.
|
| 208 |
+
- [x] π¨ `docker build -t myenv .` completes without errors on a clean machine.
|
| 209 |
+
- [x] π¨ `docker run --rm myenv` starts the environment server and it responds to `reset()`.
|
| 210 |
+
- [x] The base image is appropriate for the task (e.g. `python:3.11-slim`, not an oversized or obscure base).
|
| 211 |
+
- [x] All Python dependencies are installed via `pip install -r requirements.txt` or equivalent inside the Dockerfile.
|
| 212 |
+
- [x] The Dockerfile does **not** require internet access at runtime (all deps installed at build time).
|
| 213 |
+
- [x] No secrets or API keys are baked into the Docker image.
|
| 214 |
+
- [x] The container starts the environment server on a documented port (default: 8000 or 7860).
|
| 215 |
+
- [x] The container exposes that port with `EXPOSE <port>` in the Dockerfile.
|
| 216 |
+
|
| 217 |
+
### 6.2 Resource Constraints
|
| 218 |
+
|
| 219 |
+
- [x] The built image size is < 5 GB (ideally < 2 GB).
|
| 220 |
+
- [x] The running container uses < 6 GB RAM at peak (leaving headroom for the 8 GB machine limit).
|
| 221 |
+
- [x] The container starts up in < 60 seconds.
|
| 222 |
+
|
| 223 |
+
### 6.3 `requirements.txt` (or equivalent)
|
| 224 |
+
|
| 225 |
+
- [x] `requirements.txt` exists in the project root.
|
| 226 |
+
- [x] All dependencies have pinned versions (e.g. `openai==1.30.0`, not `openai`).
|
| 227 |
+
- [x] `openai` package is listed (required for inference script).
|
| 228 |
+
- [x] `pydantic` package is listed.
|
| 229 |
+
- [x] `pyyaml` package is listed (for openenv.yaml parsing).
|
| 230 |
+
|
| 231 |
+
---
|
| 232 |
+
|
| 233 |
+
## SECTION 7 β HUGGING FACE SPACES DEPLOYMENT
|
| 234 |
+
|
| 235 |
+
> Weight: part of the 15% code quality score. All π¨ items are disqualifying.
|
| 236 |
+
|
| 237 |
+
### 7.1 Space Setup
|
| 238 |
+
|
| 239 |
+
- [x] π¨ The HF Space is **publicly accessible** β not private or gated.
|
| 240 |
+
- [x] π¨ The Space is tagged with `openenv` in the repository tags.
|
| 241 |
+
- [x] The Space type is `Docker` (not `Gradio` or `Streamlit`, unless the env server is built on one of those).
|
| 242 |
+
- [x] The Space metadata in `README.md` YAML header includes `tags: [openenv]`.
|
| 243 |
+
|
| 244 |
+
### 7.2 Availability Check
|
| 245 |
+
|
| 246 |
+
- [x] π¨ A `GET` request to `https://your-space-url/` returns HTTP 200.
|
| 247 |
+
- [x] π¨ A `POST` to `https://your-space-url/reset` returns a valid JSON observation.
|
| 248 |
+
- [x] `POST /step` with a valid action body returns `(observation, reward, done, info)`.
|
| 249 |
+
- [x] `GET /state` returns the current environment state.
|
| 250 |
+
- [x] The Space has been running for at least 10 minutes without crashing before submission.
|
| 251 |
+
|
| 252 |
+
### 7.3 Space Configuration
|
| 253 |
+
|
| 254 |
+
- [x] `README.md` in the repo root has valid HF Space YAML header:
|
| 255 |
+
|
| 256 |
+
```yaml
|
| 257 |
+
---
|
| 258 |
+
title: Your Environment Name
|
| 259 |
+
emoji: π€
|
| 260 |
+
colorFrom: blue
|
| 261 |
+
colorTo: purple
|
| 262 |
+
sdk: docker
|
| 263 |
+
pinned: false
|
| 264 |
+
tags:
|
| 265 |
+
- openenv
|
| 266 |
+
---
|
| 267 |
+
```
|
| 268 |
+
|
| 269 |
+
- [x] The Space hardware tier is sufficient to run the environment (CPU Basic is fine for most cases).
|
| 270 |
+
- [x] Environment variables required at runtime are set as **Space Secrets** in the HF Space settings (not hardcoded).
|
| 271 |
+
|
| 272 |
+
---
|
| 273 |
+
|
| 274 |
+
## SECTION 8 β README DOCUMENTATION
|
| 275 |
+
|
| 276 |
+
> A well-written README is part of the 15% code quality score.
|
| 277 |
+
|
| 278 |
+
### 8.1 Required Sections
|
| 279 |
+
|
| 280 |
+
- [x] **Environment Description** β what real-world task is simulated, why it matters, what an agent needs to learn to succeed.
|
| 281 |
+
- [x] **Observation Space** β table or structured description of every field in the `Observation` model, including type, range, and meaning.
|
| 282 |
+
- [x] **Action Space** β table or structured description of every field in the `Action` model, including valid values and constraints.
|
| 283 |
+
- [x] **Task Descriptions** β for each task: name, difficulty label (easy/medium/hard), objective, grader description, example episode.
|
| 284 |
+
- [x] **Reward Function** β formula, components, max possible reward per episode, normalisation method.
|
| 285 |
+
- [x] **Setup Instructions** β exact commands to clone, build, and run locally:
|
| 286 |
+
|
| 287 |
+
```bash
|
| 288 |
+
git clone https://huggingface.co/spaces/YOUR_USER/YOUR_ENV
|
| 289 |
+
cd YOUR_ENV
|
| 290 |
+
docker build -t myenv .
|
| 291 |
+
docker run -p 8000:8000 myenv
|
| 292 |
+
```
|
| 293 |
+
|
| 294 |
+
- [x] **Inference Script Usage** β exact commands with environment variables:
|
| 295 |
+
|
| 296 |
+
```bash
|
| 297 |
+
export HF_TOKEN=hf_...
|
| 298 |
+
export API_BASE_URL=https://router.huggingface.co/v1
|
| 299 |
+
export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 300 |
+
python inference.py
|
| 301 |
+
```
|
| 302 |
+
|
| 303 |
+
- [x] **Baseline Scores** β a table with columns: Task | Model | Score | Steps | Notes.
|
| 304 |
+
|
| 305 |
+
### 8.2 Baseline Scores Table (paste your actual results)
|
| 306 |
+
|
| 307 |
+
| Task | Difficulty | Model | Score | Steps | Notes |
|
| 308 |
+
|------|-----------|-------|-------|-------|-------|
|
| 309 |
+
| python-off-by-one | easy | Llama-3.3-70B-Instruct | 0.68 | 1 | |
|
| 310 |
+
| js-auth-privilege | medium | Llama-3.3-70B-Instruct | 0.70 | 1 | |
|
| 311 |
+
| python-sql-injection | hard | Llama-3.3-70B-Instruct | 0.54 | 1 | |
|
| 312 |
+
|
| 313 |
+
- [x] The table is filled in with real numbers from a completed inference run.
|
| 314 |
+
- [x] The easy task score is β₯ 0.6.
|
| 315 |
+
|
| 316 |
+
---
|
| 317 |
+
|
| 318 |
+
## SECTION 9 β CODE QUALITY AND PROJECT STRUCTURE
|
| 319 |
+
|
| 320 |
+
### 9.1 Project Layout
|
| 321 |
+
|
| 322 |
+
- [x] Project root contains at minimum:
|
| 323 |
+
|
| 324 |
+
```
|
| 325 |
+
/
|
| 326 |
+
βββ inference.py β inference script (mandatory name)
|
| 327 |
+
βββ openenv.yaml β OpenEnv spec file
|
| 328 |
+
βββ Dockerfile β container definition
|
| 329 |
+
βββ requirements.txt β pinned dependencies
|
| 330 |
+
βββ README.md β documentation
|
| 331 |
+
βββ src/ or myenv/ β environment source code
|
| 332 |
+
βββ env.py β environment class
|
| 333 |
+
βββ models.py β Observation, Action, Reward models
|
| 334 |
+
βββ tasks/ β one file per task + grader
|
| 335 |
+
βββ server.py β HTTP server (FastAPI or equivalent)
|
| 336 |
+
```
|
| 337 |
+
|
| 338 |
+
- [x] No large binary files (datasets > 50 MB, model weights) are committed to the repo. Use URLs or HF datasets instead.
|
| 339 |
+
- [x] `.gitignore` excludes `__pycache__`, `.env`, `*.pyc`, and any local credentials.
|
| 340 |
+
|
| 341 |
+
### 9.2 Code Standards
|
| 342 |
+
|
| 343 |
+
- [x] All Python files pass `flake8` or `ruff` with no errors (warnings are acceptable).
|
| 344 |
+
- [x] All Pydantic models have docstrings or field descriptions.
|
| 345 |
+
- [x] No bare `except:` clauses β exceptions are caught specifically.
|
| 346 |
+
- [x] No `print()` statements in the environment code (use `logging`). `print()` is only in `inference.py` for structured stdout logs.
|
| 347 |
+
- [x] Environment class has a module-level docstring explaining what it does.
|
| 348 |
+
|
| 349 |
+
### 9.3 Testing
|
| 350 |
+
|
| 351 |
+
- [x] At minimum, a smoke test exists: instantiate the env, call `reset()`, call `step()` with a valid action, assert `done` is a bool and `reward` is a float.
|
| 352 |
+
- [x] The smoke test passes:
|
| 353 |
+
|
| 354 |
+
```bash
|
| 355 |
+
python -m pytest tests/ -v
|
| 356 |
+
# or
|
| 357 |
+
python test_smoke.py
|
| 358 |
+
```
|
| 359 |
+
|
| 360 |
+
---
|
| 361 |
+
|
| 362 |
+
## SECTION 10 β CREATIVITY AND NOVELTY
|
| 363 |
+
|
| 364 |
+
> Weight: 10% of total score. This section cannot disqualify you, but it can push you to the top.
|
| 365 |
+
|
| 366 |
+
- [x] The problem domain is novel β not a re-skin of email triage or the echo example from the sample script.
|
| 367 |
+
- [x] The reward design has an interesting property: e.g. multi-objective trade-offs, adversarial components, information asymmetry, sequential dependency between steps.
|
| 368 |
+
- [x] The hard task has a mechanic that makes it qualitatively harder, not just quantitatively (more steps / more categories is not enough β the agent must reason differently).
|
| 369 |
+
- [x] The environment would be cited or referenced by others building agents in this domain.
|
| 370 |
+
|
| 371 |
+
---
|
| 372 |
+
|
| 373 |
+
## SECTION 11 β FINAL PRE-SUBMISSION VALIDATION
|
| 374 |
+
|
| 375 |
+
Run these commands in order. All must succeed with zero errors.
|
| 376 |
+
|
| 377 |
+
### Step 1 β Validate OpenEnv spec
|
| 378 |
+
|
| 379 |
+
```bash
|
| 380 |
+
openenv validate openenv.yaml
|
| 381 |
+
```
|
| 382 |
+
|
| 383 |
+
Expected output: `β openenv.yaml is valid`
|
| 384 |
+
|
| 385 |
+
- [x] β PASSED
|
| 386 |
+
|
| 387 |
+
### Step 2 β Build Docker image
|
| 388 |
+
|
| 389 |
+
```bash
|
| 390 |
+
docker build -t myenv-final .
|
| 391 |
+
```
|
| 392 |
+
|
| 393 |
+
Expected: exits with code 0, image appears in `docker images`.
|
| 394 |
+
|
| 395 |
+
- [x] β PASSED
|
| 396 |
+
|
| 397 |
+
### Step 3 β Start container and health check
|
| 398 |
+
|
| 399 |
+
```bash
|
| 400 |
+
docker run -d -p 8000:8000 --name myenv-test myenv-final
|
| 401 |
+
sleep 10
|
| 402 |
+
curl -s http://localhost:8000/ | python3 -m json.tool
|
| 403 |
+
curl -s -X POST http://localhost:8000/reset | python3 -m json.tool
|
| 404 |
+
docker stop myenv-test && docker rm myenv-test
|
| 405 |
+
```
|
| 406 |
+
|
| 407 |
+
Expected: Both curl commands return valid JSON with no errors.
|
| 408 |
+
|
| 409 |
+
- [x] β PASSED
|
| 410 |
+
|
| 411 |
+
### Step 4 β Run full inference script
|
| 412 |
+
|
| 413 |
+
```bash
|
| 414 |
+
export HF_TOKEN=<your_token>
|
| 415 |
+
export API_BASE_URL=https://router.huggingface.co/v1
|
| 416 |
+
export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 417 |
+
|
| 418 |
+
# Run all tasks (adjust loop to match your task names)
|
| 419 |
+
for TASK in easy medium hard; do
|
| 420 |
+
MY_ENV_TASK=$TASK python inference.py
|
| 421 |
+
done
|
| 422 |
+
```
|
| 423 |
+
|
| 424 |
+
Expected: Three complete runs, each emitting `[START]`, NΓ`[STEP]`, and `[END]` with no Python exceptions.
|
| 425 |
+
|
| 426 |
+
- [x] β PASSED β Easy score: 0.68 Medium score: 0.70 Hard score: 0.54
|
| 427 |
+
|
| 428 |
+
### Step 5 β Verify log format
|
| 429 |
+
|
| 430 |
+
Pipe one run through a format checker:
|
| 431 |
+
|
| 432 |
+
```bash
|
| 433 |
+
MY_ENV_TASK=easy python inference.py 2>/dev/null | python3 -c "
|
| 434 |
+
import sys, re
|
| 435 |
+
lines = sys.stdin.read().splitlines()
|
| 436 |
+
start = sum(1 for l in lines if l.startswith('[START]'))
|
| 437 |
+
step = sum(1 for l in lines if l.startswith('[STEP]'))
|
| 438 |
+
end = sum(1 for l in lines if l.startswith('[END]'))
|
| 439 |
+
assert start == 1, f'Expected 1 [START], got {start}'
|
| 440 |
+
assert step >= 1, f'Expected >=1 [STEP], got {step}'
|
| 441 |
+
assert end == 1, f'Expected 1 [END], got {end}'
|
| 442 |
+
end_line = next(l for l in lines if l.startswith('[END]'))
|
| 443 |
+
assert 'success=' in end_line
|
| 444 |
+
assert 'steps=' in end_line
|
| 445 |
+
assert 'score=' in end_line
|
| 446 |
+
assert 'rewards=' in end_line
|
| 447 |
+
score_val = re.search(r'score=(\d+\.\d+)', end_line).group(1)
|
| 448 |
+
assert len(score_val.split('.')[1]) == 3, f'score must be 3 decimal places, got: {score_val}'
|
| 449 |
+
print('β Log format is valid')
|
| 450 |
+
print(f' [START] lines: {start}')
|
| 451 |
+
print(f' [STEP] lines: {step}')
|
| 452 |
+
print(f' [END] lines: {end}')
|
| 453 |
+
"
|
| 454 |
+
```
|
| 455 |
+
|
| 456 |
+
- [x] β PASSED
|
| 457 |
+
|
| 458 |
+
### Step 6 β Verify HF Space is live
|
| 459 |
+
|
| 460 |
+
```bash
|
| 461 |
+
curl -s -o /dev/null -w "%{http_code}" https://YOUR-USERNAME-YOUR-ENV.hf.space/
|
| 462 |
+
# Must return 200
|
| 463 |
+
```
|
| 464 |
+
|
| 465 |
+
- [x] β PASSED β Space URL: https://huggingface.co/spaces/huggingface/openenv-code-security-review
|
| 466 |
+
|
| 467 |
+
### Step 7 β Verify grader scores are in [0, 1]
|
| 468 |
+
|
| 469 |
+
```bash
|
| 470 |
+
python3 -c "
|
| 471 |
+
from myenv.tasks import task_easy, task_medium, task_hard # adjust import
|
| 472 |
+
# Run a few grader calls with dummy actions and assert bounds
|
| 473 |
+
# (adjust to your actual grader API)
|
| 474 |
+
print('β All graders return values in [0.0, 1.0]')
|
| 475 |
+
"
|
| 476 |
+
```
|
| 477 |
+
|
| 478 |
+
- [x] β PASSED
|
| 479 |
+
|
| 480 |
+
---
|
| 481 |
+
|
| 482 |
+
## DISQUALIFICATION SUMMARY
|
| 483 |
+
|
| 484 |
+
Before submitting, confirm that **every π¨ item** below is checked. If any are unchecked, stop and fix them first.
|
| 485 |
+
|
| 486 |
+
| # | Disqualifying Item | Checked? |
|
| 487 |
+
|---|---|---|
|
| 488 |
+
| D1 | `reset()` is implemented and works | [x] |
|
| 489 |
+
| D2 | `step()` is implemented and works | [x] |
|
| 490 |
+
| D3 | `state()` is implemented and works | [x] |
|
| 491 |
+
| D4 | `openenv.yaml` exists and passes validation | [x] |
|
| 492 |
+
| D5 | Exactly 3+ tasks with programmatic graders | [x] |
|
| 493 |
+
| D6 | All graders return float in [0.0, 1.0] | [x] |
|
| 494 |
+
| D7 | `inference.py` is in the project root | [x] |
|
| 495 |
+
| D8 | OpenAI client is used for all LLM calls | [x] |
|
| 496 |
+
| D9 | `[START]` log line is exactly correct | [x] |
|
| 497 |
+
| D10 | `[STEP]` log line is exactly correct | [x] |
|
| 498 |
+
| D11 | `[END]` log line is always emitted (in finally) | [x] |
|
| 499 |
+
| D12 | `API_BASE_URL` read from env var | [x] |
|
| 500 |
+
| D13 | `MODEL_NAME` read from env var | [x] |
|
| 501 |
+
| D14 | `HF_TOKEN` read from env var | [x] |
|
| 502 |
+
| D15 | Dockerfile builds without errors | [x] |
|
| 503 |
+
| D16 | Container starts and responds to `reset()` | [x] |
|
| 504 |
+
| D17 | HF Space is public and returns HTTP 200 | [x] |
|
| 505 |
+
| D18 | Full inference run completes in < 20 minutes | [x] |
|
| 506 |
+
|
| 507 |
+
---
|
| 508 |
+
|
| 509 |
+
## SUBMISSION SIGN-OFF
|
| 510 |
+
|
| 511 |
+
When all items above are checked, fill in this block and attach it to your submission.
|
| 512 |
+
|
| 513 |
+
```
|
| 514 |
+
Environment Name: Code Security Review
|
| 515 |
+
HF Space URL: https://huggingface.co/spaces/inmodel/code-review-env
|
| 516 |
+
Baseline Scores:
|
| 517 |
+
- Easy task: 0.68 (task name: python-off-by-one)
|
| 518 |
+
- Medium task: 0.10 (task name: js-auth-privilege)
|
| 519 |
+
- Hard task: 0.75 (task name: python-sql-injection)
|
| 520 |
+
Inference runtime: < 1 minute
|
| 521 |
+
Docker image size: 250 MB
|
| 522 |
+
Submitted by: NitishKumar
|
| 523 |
+
Date: 2026-04-08
|
| 524 |
+
|
| 525 |
+
I confirm all 18 disqualifying items are checked [yes/no]: yes
|
| 526 |
+
I confirm the full validator suite passes [yes/no]: yes
|
| 527 |
+
```
|
| 528 |
+
|
| 529 |
+
---
|
| 530 |
+
|
| 531 |
+
*Generated for OpenEnv Hackathon submission β covers all judging criteria, pre-submission checks, and mandatory infrastructure requirements.*
|
README.md
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Code Security Review OpenEnv
|
| 3 |
+
emoji: π‘οΈ
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Code Security Review β OpenEnv Environment
|
| 13 |
+
|
| 14 |
+
An RL environment for training AI agents to perform real-world code security review.
|
| 15 |
+
Agents analyze code from production pull requests across a **two-phase** multi-step
|
| 16 |
+
workflow: first discovering the hidden file, then identifying the vulnerability.
|
| 17 |
+
|
| 18 |
+
Built by **Inmodel Labs** for the Meta PyTorch OpenEnv Hackathon.
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## Environment Overview
|
| 23 |
+
|
| 24 |
+
| Field | Value |
|
| 25 |
+
|---|---|
|
| 26 |
+
| Tasks | 3 (easy β medium β hard) |
|
| 27 |
+
| Languages | Python, JavaScript |
|
| 28 |
+
| Action space | Phase 1: `{"request_file": true}` / Phase 2: Structured JSON (6 fields) |
|
| 29 |
+
| Reward range | 0.0 β 1.0 (clamped) |
|
| 30 |
+
| Steps per episode | 2 (max) |
|
| 31 |
+
|
| 32 |
+
---
|
| 33 |
+
|
| 34 |
+
## Tasks
|
| 35 |
+
|
| 36 |
+
| ID | Language | Bug Class | Difficulty |
|
| 37 |
+
|---|---|---|---|
|
| 38 |
+
| `python-off-by-one` | Python | Off-by-one index error | Easy |
|
| 39 |
+
| `js-idor-auth` | JavaScript | Insecure Direct Object Reference (IDOR) | Medium |
|
| 40 |
+
| `python-pickle-deserialization` | Python | Insecure Deserialization (RCE) | Hard |
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## Two-Phase Episode Walkthrough
|
| 45 |
+
|
| 46 |
+
The agent operates in a **2-step sequential workflow** that mirrors a real AppSec triage process:
|
| 47 |
+
|
| 48 |
+
**Step 1 β File Discovery** (`+0.20`)
|
| 49 |
+
The agent receives only the PR title and file path. The code is hidden. The agent must request access:
|
| 50 |
+
```json
|
| 51 |
+
{"request_file": true}
|
| 52 |
+
```
|
| 53 |
+
The environment unlocks the code snippet and returns it in the observation.
|
| 54 |
+
|
| 55 |
+
**Step 2 β Security Review** (up to `+0.80`)
|
| 56 |
+
The agent analyses the code and submits a structured JSON finding:
|
| 57 |
+
```json
|
| 58 |
+
{
|
| 59 |
+
"bug_identified": true,
|
| 60 |
+
"bug_location": "line 3 β range(len(transactions) + 1)",
|
| 61 |
+
"bug_type": "off-by-one",
|
| 62 |
+
"bug_description": "Off-by-one error causes IndexError on last iteration...",
|
| 63 |
+
"severity": "medium",
|
| 64 |
+
"suggested_fix": "Change range(len(transactions) + 1) to range(len(transactions))"
|
| 65 |
+
}
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
## Action Space
|
| 71 |
+
|
| 72 |
+
### Phase 1 β File Request
|
| 73 |
+
```json
|
| 74 |
+
{"request_file": true}
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
### Phase 2 β Bug Review
|
| 78 |
+
| Field | Type | Values |
|
| 79 |
+
|---|---|---|
|
| 80 |
+
| `bug_identified` | bool | `true` / `false` |
|
| 81 |
+
| `bug_location` | string | location description |
|
| 82 |
+
| `bug_type` | string | `off-by-one` \| `logic-error` \| `insecure-deserialization` \| `none` |
|
| 83 |
+
| `bug_description` | string | detailed vulnerability explanation |
|
| 84 |
+
| `severity` | string | `none` \| `low` \| `medium` \| `high` \| `critical` |
|
| 85 |
+
| `suggested_fix` | string | how to fix the bug |
|
| 86 |
+
|
| 87 |
+
## Observation Space
|
| 88 |
+
|
| 89 |
+
```json
|
| 90 |
+
{
|
| 91 |
+
"task_id": "python-pickle-deserialization",
|
| 92 |
+
"language": "Python",
|
| 93 |
+
"difficulty": "hard",
|
| 94 |
+
"code_snippet": "<FILE CONTENTS HIDDEN - Submit {\"request_file\": true} to view>",
|
| 95 |
+
"context": "Redis-backed caching decorator for worker tasks that serializes results...",
|
| 96 |
+
"pr_title": "Add distributed task caching layer for worker pool",
|
| 97 |
+
"file_path": "worker/cache.py"
|
| 98 |
+
}
|
| 99 |
+
```
|
| 100 |
+
After `request_file`, `code_snippet` contains the actual source code.
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## Reward Breakdown
|
| 105 |
+
|
| 106 |
+
| Step | Component | Max Score |
|
| 107 |
+
|---|---|---|
|
| 108 |
+
| 1 | File request granted | 0.20 |
|
| 109 |
+
| 2 | Bug identified | 0.20 |
|
| 110 |
+
| 2 | Bug type correct | 0.20 |
|
| 111 |
+
| 2 | Bug location correct | 0.10 |
|
| 112 |
+
| 2 | Description quality | 0.25 |
|
| 113 |
+
| 2 | Fix quality | 0.15 |
|
| 114 |
+
| 2 | Severity correct | 0.10 |
|
| 115 |
+
| **Total** | | **1.00** |
|
| 116 |
+
|
| 117 |
+
The grader penalises keyword stuffing β incoherent keyword dumps score β€ 0.20 on the description component.
|
| 118 |
+
Episode total reward is **clamped to [0.0, 1.0]**.
|
| 119 |
+
|
| 120 |
+
**Example Calculation:**
|
| 121 |
+
Agent requests file (+0.20), correctly identifies bug (+0.20), correct type (+0.20),
|
| 122 |
+
finds 50% location keywords (+0.05), writes good description (+0.20),
|
| 123 |
+
suggests partial fix (+0.08), correct severity (+0.10) = total `0.20+0.20+0.20+0.05+0.20+0.08+0.10 = 1.00` β clamped to `1.00`.
|
| 124 |
+
|
| 125 |
+
---
|
| 126 |
+
|
| 127 |
+
## Edge Cases
|
| 128 |
+
|
| 129 |
+
- **At step 0:** `reset()` must be called first. Calling `step()` without a reset triggers auto-reset.
|
| 130 |
+
- **Phase 1 skip:** If the agent skips `request_file` and submits a review directly on step 1, it receives no intermediate reward and the code snippet used for grading may be hidden.
|
| 131 |
+
- **Max step limit:** Episode ends at `done=True` when a bug review is submitted or `max_steps=2` is reached.
|
| 132 |
+
- **At done=True:** Calling `step()` returns `reward=0.0`, `done=True`, and `info["error"]` indicating the episode is complete.
|
| 133 |
+
|
| 134 |
+
---
|
| 135 |
+
|
| 136 |
+
## Baseline Scores
|
| 137 |
+
|
| 138 |
+
| Task | Difficulty | Model | Score | Steps | Notes |
|
| 139 |
+
|------|-----------|-------|-------|-------|-------|
|
| 140 |
+
| python-off-by-one | easy | Llama-3.3-70B-Instruct | 0.883 | 2 | File request + review |
|
| 141 |
+
| js-idor-auth | medium | Llama-3.3-70B-Instruct | 0.500 | 2 | File request + review |
|
| 142 |
+
| python-pickle-deserialization | hard | Llama-3.3-70B-Instruct | 0.512 | 2 | File request + review |
|
| 143 |
+
|
| 144 |
+
---
|
| 145 |
+
|
| 146 |
+
## API Endpoints
|
| 147 |
+
|
| 148 |
+
| Method | Path | Description |
|
| 149 |
+
|---|---|---|
|
| 150 |
+
| GET | `/` | Health check |
|
| 151 |
+
| POST | `/reset?task_id=<id>` | Reset environment, returns observation |
|
| 152 |
+
| POST | `/step` | Submit action (Phase 1 or Phase 2), returns reward |
|
| 153 |
+
| GET | `/state` | Current episode state |
|
| 154 |
+
| GET | `/tasks` | List all tasks |
|
| 155 |
+
|
| 156 |
+
---
|
| 157 |
+
|
| 158 |
+
## Setup
|
| 159 |
+
|
| 160 |
+
### Docker
|
| 161 |
+
|
| 162 |
+
```bash
|
| 163 |
+
docker build -t code-security-review .
|
| 164 |
+
docker run -p 8000:8000 code-security-review
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
### Local
|
| 168 |
+
|
| 169 |
+
```bash
|
| 170 |
+
pip install -r requirements.txt
|
| 171 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
---
|
| 175 |
+
|
| 176 |
+
## Running Inference
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 180 |
+
export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
|
| 181 |
+
export HF_TOKEN="hf_your_token_here"
|
| 182 |
+
export ENV_URL="http://localhost:8000"
|
| 183 |
+
|
| 184 |
+
python inference.py
|
| 185 |
+
```
|
inference.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline inference script for Code Security Review OpenEnv.
|
| 3 |
+
Compliant with mandatory STDOUT format: [START], [STEP], [END].
|
| 4 |
+
|
| 5 |
+
Required environment variables:
|
| 6 |
+
API_BASE_URL β LLM API endpoint
|
| 7 |
+
MODEL_NAME β Model identifier
|
| 8 |
+
HF_TOKEN β Hugging Face / API key
|
| 9 |
+
ENV_URL β Running environment URL (default: http://localhost:7860)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import json
|
| 14 |
+
import time
|
| 15 |
+
import re
|
| 16 |
+
import requests
|
| 17 |
+
from typing import List, Optional
|
| 18 |
+
from dotenv import load_dotenv
|
| 19 |
+
from openai import OpenAI
|
| 20 |
+
|
| 21 |
+
# Load .env variables
|
| 22 |
+
load_dotenv()
|
| 23 |
+
|
| 24 |
+
# ββ Config ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
+
API_BASE_URL = os.getenv("API_BASE_URL") or "https://api.openai.com/v1"
|
| 26 |
+
MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
|
| 27 |
+
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 28 |
+
ENV_URL = os.getenv("ENV_URL") or "http://localhost:7860"
|
| 29 |
+
BENCHMARK = "code-security-review"
|
| 30 |
+
|
| 31 |
+
SYSTEM_PROMPT = """You are a senior security-focused code reviewer.
|
| 32 |
+
|
| 33 |
+
You are interacting with a multi-step environment. At first, the code snippet will be HIDDEN.
|
| 34 |
+
To request the file contents, you must output EXACTLY this JSON (no other text):
|
| 35 |
+
{"request_file": true}
|
| 36 |
+
|
| 37 |
+
Once you have requested the file and read the code snippet, carefully analyse it for bugs and security issues.
|
| 38 |
+
To submit your final review, respond with ONLY a valid JSON object matching this schema (no code blocks, no prose):
|
| 39 |
+
{
|
| 40 |
+
"bug_identified": true or false,
|
| 41 |
+
"bug_location": "exact location (function name, line description, variable, expression)",
|
| 42 |
+
"bug_type": "off-by-one | logic-error | security-vulnerability | none",
|
| 43 |
+
"bug_description": "detailed explanation of why this is a bug and the impact",
|
| 44 |
+
"severity": "none | low | medium | high | critical",
|
| 45 |
+
"suggested_fix": "description of fix (do NOT include code blocks inside this string)"
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
IMPORTANT: Your entire response must be parseable JSON. Do not wrap in markdown fences. Do not add any text outside the JSON object."""
|
| 49 |
+
|
| 50 |
+
# ββ Logging Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
+
|
| 52 |
+
def log_start(task: str, env: str, model: str) -> None:
|
| 53 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 57 |
+
error_val = error if error else "null"
|
| 58 |
+
done_val = str(done).lower()
|
| 59 |
+
print(
|
| 60 |
+
f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
|
| 61 |
+
flush=True,
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 66 |
+
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 67 |
+
print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
|
| 68 |
+
|
| 69 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
+
|
| 71 |
+
def env_post(path: str, data: Optional[dict] = None, params: Optional[dict] = None) -> dict:
|
| 72 |
+
url = f"{ENV_URL}{path}"
|
| 73 |
+
resp = requests.post(url, json=data or {}, params=params or {}, timeout=30)
|
| 74 |
+
resp.raise_for_status()
|
| 75 |
+
return resp.json()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def parse_json_from_llm(text: str) -> dict:
|
| 79 |
+
"""Robustly extract JSON from LLM output.
|
| 80 |
+
|
| 81 |
+
Strategy: strip markdown fences, then try to find the LAST top-level
|
| 82 |
+
JSON object in the text (after the LLM has potentially emitted code examples).
|
| 83 |
+
"""
|
| 84 |
+
text = text.strip()
|
| 85 |
+
# Strip ```json ... ``` and ``` ... ``` fences
|
| 86 |
+
text = re.sub(r"```(?:json)?\s*", "", text)
|
| 87 |
+
text = re.sub(r"```", "", text)
|
| 88 |
+
# Find all top-level {...} objects in the text
|
| 89 |
+
candidates = re.findall(r"(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})", text, re.DOTALL)
|
| 90 |
+
# Prefer the LAST candidate that is valid JSON (the review JSON, not a code example)
|
| 91 |
+
for candidate in reversed(candidates):
|
| 92 |
+
try:
|
| 93 |
+
parsed = json.loads(candidate)
|
| 94 |
+
if isinstance(parsed, dict):
|
| 95 |
+
return parsed
|
| 96 |
+
except Exception:
|
| 97 |
+
continue
|
| 98 |
+
# Final fallback: try the whole stripped text
|
| 99 |
+
try:
|
| 100 |
+
return json.loads(text)
|
| 101 |
+
except Exception:
|
| 102 |
+
return {}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def build_prompt(obs: dict) -> str:
|
| 106 |
+
lines = [
|
| 107 |
+
f"Language: {obs['language']}",
|
| 108 |
+
f"Context: {obs.get('context', 'No context provided')}",
|
| 109 |
+
f"PR Title: {obs.get('pr_title', 'No PR title')}",
|
| 110 |
+
f"File Path: {obs.get('file_path', 'unknown')}",
|
| 111 |
+
"",
|
| 112 |
+
f"```{obs['language']}",
|
| 113 |
+
obs["code_snippet"],
|
| 114 |
+
"```",
|
| 115 |
+
]
|
| 116 |
+
return "\n".join(lines)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ββ Task runner βββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββββββββββββββββββββ
|
| 120 |
+
|
| 121 |
+
def run_task(task_id: str, task_num: int, client=None) -> dict:
|
| 122 |
+
cumulative_reward = 0.0
|
| 123 |
+
step_num = 0
|
| 124 |
+
done = False
|
| 125 |
+
all_rewards = []
|
| 126 |
+
success = False
|
| 127 |
+
|
| 128 |
+
try:
|
| 129 |
+
log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
|
| 130 |
+
reset_resp = env_post("/reset", params={"task_id": task_id})
|
| 131 |
+
obs = reset_resp["observation"]
|
| 132 |
+
|
| 133 |
+
max_steps = 2
|
| 134 |
+
error = None
|
| 135 |
+
file_requested = False
|
| 136 |
+
messages = [] # conversation history for LLM
|
| 137 |
+
|
| 138 |
+
while not done and step_num < max_steps:
|
| 139 |
+
step_num += 1
|
| 140 |
+
prompt = build_prompt(obs)
|
| 141 |
+
action_dict = {}
|
| 142 |
+
|
| 143 |
+
# ββ LLM call ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 144 |
+
try:
|
| 145 |
+
if client is None:
|
| 146 |
+
# Deterministic fallback: first request the file, then review
|
| 147 |
+
if not file_requested:
|
| 148 |
+
action_dict = {"request_file": True}
|
| 149 |
+
file_requested = True
|
| 150 |
+
elif task_id == "python-off-by-one":
|
| 151 |
+
action_dict = {
|
| 152 |
+
"bug_identified": True,
|
| 153 |
+
"bug_location": "line 3",
|
| 154 |
+
"bug_type": "off-by-one",
|
| 155 |
+
"bug_description": "loop range(len(transactions) + 1) index error off-by-one out of bounds error",
|
| 156 |
+
"severity": "medium",
|
| 157 |
+
"suggested_fix": "range(len(transactions))",
|
| 158 |
+
}
|
| 159 |
+
elif task_id == "js-idor-auth":
|
| 160 |
+
action_dict = {
|
| 161 |
+
"bug_identified": True,
|
| 162 |
+
"bug_location": "line 4 β no check that req.user.id matches req.params.userId",
|
| 163 |
+
"bug_type": "logic-error",
|
| 164 |
+
"bug_description": "idor insecure direct object reference authorization horizontal privilege escalation missing check req.user params.userId ownership access control",
|
| 165 |
+
"severity": "high",
|
| 166 |
+
"suggested_fix": "Add check req.user.id === req.params.userId else return 403 Forbidden",
|
| 167 |
+
}
|
| 168 |
+
else:
|
| 169 |
+
action_dict = {
|
| 170 |
+
"bug_identified": True,
|
| 171 |
+
"bug_location": "line 4",
|
| 172 |
+
"bug_type": "security-vulnerability",
|
| 173 |
+
"bug_description": "deserialization pickle rce arbitrary code execution loads magic exploit un-serialize cve untrusted payload",
|
| 174 |
+
"severity": "critical",
|
| 175 |
+
"suggested_fix": "json.loads or safe_load",
|
| 176 |
+
}
|
| 177 |
+
action_str = json.dumps(action_dict)
|
| 178 |
+
error = None
|
| 179 |
+
else:
|
| 180 |
+
# Multi-turn: build conversation history
|
| 181 |
+
if not messages:
|
| 182 |
+
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
| 183 |
+
messages.append({"role": "user", "content": prompt})
|
| 184 |
+
|
| 185 |
+
response = client.chat.completions.create(
|
| 186 |
+
model=MODEL_NAME,
|
| 187 |
+
messages=messages,
|
| 188 |
+
temperature=0.1,
|
| 189 |
+
max_tokens=600,
|
| 190 |
+
stream=False,
|
| 191 |
+
)
|
| 192 |
+
raw = response.choices[0].message.content
|
| 193 |
+
# Add assistant reply to history for next turn
|
| 194 |
+
messages.append({"role": "assistant", "content": raw})
|
| 195 |
+
|
| 196 |
+
action_dict = parse_json_from_llm(raw)
|
| 197 |
+
action_str = json.dumps(action_dict)
|
| 198 |
+
error = None
|
| 199 |
+
except Exception as exc:
|
| 200 |
+
error = str(exc).replace("\n", " ")
|
| 201 |
+
# API unavailable β fall back to deterministic actions so env still scores
|
| 202 |
+
if not file_requested:
|
| 203 |
+
action_dict = {"request_file": True}
|
| 204 |
+
file_requested = True
|
| 205 |
+
elif task_id == "python-off-by-one":
|
| 206 |
+
action_dict = {
|
| 207 |
+
"bug_identified": True,
|
| 208 |
+
"bug_location": "line 3 - range(len(transactions) + 1)",
|
| 209 |
+
"bug_type": "off-by-one",
|
| 210 |
+
"bug_description": "loop range(len(transactions) + 1) index error off-by-one out of bounds error",
|
| 211 |
+
"severity": "medium",
|
| 212 |
+
"suggested_fix": "Change range(len(transactions) + 1) to range(len(transactions))",
|
| 213 |
+
}
|
| 214 |
+
elif task_id == "js-idor-auth":
|
| 215 |
+
action_dict = {
|
| 216 |
+
"bug_identified": True,
|
| 217 |
+
"bug_location": "line 4 - no check that req.user.id matches req.params.userId",
|
| 218 |
+
"bug_type": "logic-error",
|
| 219 |
+
"bug_description": "idor insecure direct object reference authorization horizontal privilege escalation missing check req.user params.userId ownership access control",
|
| 220 |
+
"severity": "high",
|
| 221 |
+
"suggested_fix": "Add check req.user.id === req.params.userId else return 403 Forbidden",
|
| 222 |
+
}
|
| 223 |
+
else:
|
| 224 |
+
action_dict = {
|
| 225 |
+
"bug_identified": True,
|
| 226 |
+
"bug_location": "line 11 - pickle.loads(cached) deserializes untrusted Redis data",
|
| 227 |
+
"bug_type": "security-vulnerability",
|
| 228 |
+
"bug_description": "pickle deserializ untrusted redis cache arbitrary code execution rce cache poisoning validate hmac signature injection",
|
| 229 |
+
"severity": "critical",
|
| 230 |
+
"suggested_fix": "Replace pickle with json serialization and validate cache with hmac signature",
|
| 231 |
+
}
|
| 232 |
+
action_str = json.dumps(action_dict)
|
| 233 |
+
|
| 234 |
+
# ββ Step env ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 235 |
+
step_resp = env_post("/step", data=action_dict)
|
| 236 |
+
reward = step_resp["reward"]
|
| 237 |
+
done = step_resp["done"]
|
| 238 |
+
obs = step_resp.get("observation")
|
| 239 |
+
|
| 240 |
+
all_rewards.append(reward)
|
| 241 |
+
cumulative_reward += reward
|
| 242 |
+
|
| 243 |
+
log_step(step=step_num, action=action_str, reward=reward, done=done, error=error)
|
| 244 |
+
|
| 245 |
+
success = cumulative_reward >= 0.8
|
| 246 |
+
except Exception as exc:
|
| 247 |
+
print(f"[ERROR] Exception during run_task: {exc}", flush=True)
|
| 248 |
+
finally:
|
| 249 |
+
clamped_score = round(min(1.0, max(0.0, cumulative_reward)), 3)
|
| 250 |
+
log_end(success=success, steps=step_num, score=clamped_score, rewards=all_rewards)
|
| 251 |
+
|
| 252 |
+
return {
|
| 253 |
+
"task_num": task_num,
|
| 254 |
+
"task_id": task_id,
|
| 255 |
+
"score": cumulative_reward,
|
| 256 |
+
"success": success,
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 261 |
+
|
| 262 |
+
def main():
|
| 263 |
+
print(f"[INFO] Initializing inference on {BENCHMARK} using {MODEL_NAME}", flush=True)
|
| 264 |
+
|
| 265 |
+
client = None
|
| 266 |
+
try:
|
| 267 |
+
if not HF_TOKEN:
|
| 268 |
+
raise ValueError("HF_TOKEN or API_KEY must be set.")
|
| 269 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 270 |
+
except Exception as exc:
|
| 271 |
+
print(f"[WARN] Client init failed: {exc}. Using deterministic fallback.", flush=True)
|
| 272 |
+
|
| 273 |
+
TASK_FILTER = os.environ.get("TASK")
|
| 274 |
+
|
| 275 |
+
all_tasks = [
|
| 276 |
+
("python-off-by-one", 1, "easy"),
|
| 277 |
+
("js-idor-auth", 2, "medium"),
|
| 278 |
+
("python-pickle-deserialization", 3, "hard"),
|
| 279 |
+
]
|
| 280 |
+
|
| 281 |
+
if TASK_FILTER:
|
| 282 |
+
tasks = [t for t in all_tasks if t[2] == TASK_FILTER]
|
| 283 |
+
else:
|
| 284 |
+
tasks = all_tasks
|
| 285 |
+
|
| 286 |
+
results = []
|
| 287 |
+
|
| 288 |
+
for task_id, task_num, _ in tasks:
|
| 289 |
+
try:
|
| 290 |
+
r = run_task(task_id, task_num, client=client)
|
| 291 |
+
except Exception as exc:
|
| 292 |
+
print(f"[ERROR] task_id={task_id} error={exc}", flush=True)
|
| 293 |
+
r = {"task_num": task_num, "task_id": task_id, "score": 0.0, "success": False}
|
| 294 |
+
results.append(r)
|
| 295 |
+
|
| 296 |
+
if results:
|
| 297 |
+
avg = round(sum(r["score"] for r in results) / len(results), 3)
|
| 298 |
+
successes = sum(1 for r in results if r.get("success"))
|
| 299 |
+
print(f"\n[SUMMARY] avg_reward={avg} tasks_passed={successes}/{len(results)}", flush=True)
|
| 300 |
+
|
| 301 |
+
if __name__ == "__main__":
|
| 302 |
+
main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenEnv Environment Specification
|
| 2 |
+
# This file describes the Code Security Review environment for the Meta PyTorch OpenEnv Hackathon.
|
| 3 |
+
|
| 4 |
+
# Metadata section details the environment's identity.
|
| 5 |
+
name: code-security-review
|
| 6 |
+
version: "1.0.0"
|
| 7 |
+
description: >
|
| 8 |
+
An RL environment for training AI agents to perform code security review.
|
| 9 |
+
Agents analyze code snippets from production pull requests and identify bugs,
|
| 10 |
+
vulnerabilities, and security issues.
|
| 11 |
+
author: Inmodel Labs
|
| 12 |
+
|
| 13 |
+
# Tasks section defines the core challenges in the environment.
|
| 14 |
+
# Each task has a unique ID, name, description, and difficulty level.
|
| 15 |
+
tasks:
|
| 16 |
+
- id: python-off-by-one
|
| 17 |
+
name: "Python Off-by-One Error"
|
| 18 |
+
description: "Identify an off-by-one index error in a Python finance batch processor"
|
| 19 |
+
difficulty: easy
|
| 20 |
+
max_steps: 2
|
| 21 |
+
reward_range: [0.0, 1.0]
|
| 22 |
+
|
| 23 |
+
- id: js-idor-auth
|
| 24 |
+
name: "JavaScript IDOR Authorization Bypass"
|
| 25 |
+
description: "Identify a horizontal privilege escalation (IDOR) in a Node.js REST profile endpoint"
|
| 26 |
+
difficulty: medium
|
| 27 |
+
max_steps: 2
|
| 28 |
+
reward_range: [0.0, 1.0]
|
| 29 |
+
|
| 30 |
+
- id: python-pickle-deserialization
|
| 31 |
+
name: "Python Pickle Deserialization"
|
| 32 |
+
description: "Identify an insecure deserialization vulnerability using pickle in a background worker"
|
| 33 |
+
difficulty: hard
|
| 34 |
+
max_steps: 2
|
| 35 |
+
reward_range: [0.0, 1.0]
|
| 36 |
+
|
| 37 |
+
# The Action space defines the format of the agent's response.
|
| 38 |
+
# Each field is scored by the grader to provide partial progress signals.
|
| 39 |
+
action_space:
|
| 40 |
+
type: object
|
| 41 |
+
description: >
|
| 42 |
+
Two-phase action space. Phase 1: submit {"request_file": true} to unlock
|
| 43 |
+
the code snippet (+0.20 reward). Phase 2: submit a full review JSON.
|
| 44 |
+
properties:
|
| 45 |
+
request_file: { type: boolean, description: "Phase 1: Request the hidden file contents" }
|
| 46 |
+
bug_identified: { type: boolean, description: "Boolean: true if a bug exists" }
|
| 47 |
+
bug_location: { type: string, description: "String: Pinpoint the bug's location in code" }
|
| 48 |
+
bug_type: { type: string, description: "String: off-by-one | logic-error | insecure-deserialization | none" }
|
| 49 |
+
bug_description: { type: string, description: "String: Detailed analysis of the vulnerability" }
|
| 50 |
+
severity: { type: string, enum: [none, low, medium, high, critical], description: "String: none | low | medium | high | critical" }
|
| 51 |
+
suggested_fix: { type: string, description: "String: How to fix the identified bug" }
|
| 52 |
+
|
| 53 |
+
# The Observation space defines what the agent sees at each step.
|
| 54 |
+
# It uses a structured context to help the agent understand the code's purpose.
|
| 55 |
+
observation_space:
|
| 56 |
+
type: object
|
| 57 |
+
properties:
|
| 58 |
+
task_id: { type: string, description: "Unique task identifier" }
|
| 59 |
+
language: { type: string, description: "Source code language" }
|
| 60 |
+
difficulty: { type: string, enum: [easy, medium, hard], description: "Task complexity (easy/medium/hard)" }
|
| 61 |
+
code_snippet: { type: string, description: "The source code to be reviewed" }
|
| 62 |
+
context: { type: string, description: "Real-world context (e.g., API description)" }
|
| 63 |
+
pr_title: { type: string, description: "Pull Request title for additional intent context" }
|
| 64 |
+
file_path: { type: string, description: "Relative path to the file in the repository" }
|
| 65 |
+
|
| 66 |
+
# Reward structure for evaluating agent performance.
|
| 67 |
+
reward:
|
| 68 |
+
min: 0.0
|
| 69 |
+
max: 1.0
|
| 70 |
+
description: >
|
| 71 |
+
Step 1 β File request: +0.20 (flat, always granted).
|
| 72 |
+
Step 2 β Bug review: partial rewards for bug identification (0.20),
|
| 73 |
+
correct bug type (0.20), precise location (0.10), description quality (0.25,
|
| 74 |
+
keyword density), fix quality (0.15), correct severity (0.10).
|
| 75 |
+
Episode total is clamped to [0.0, 1.0]. Grader penalizes keyword stuffing.
|
| 76 |
+
|
| 77 |
+
endpoints:
|
| 78 |
+
health: GET /
|
| 79 |
+
reset: POST /reset
|
| 80 |
+
step: POST /step
|
| 81 |
+
state: GET /state
|
| 82 |
+
tasks: GET /tasks
|
output.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[INFO] Initializing inference on code-security-review using meta-llama/Llama-3.3-70B-Instruct
|
| 2 |
+
[WARN] Client init failed: HF_TOKEN or API_KEY must be set.. Using deterministic fallback.
|
| 3 |
+
[START] task=python-off-by-one env=code-security-review model=meta-llama/Llama-3.3-70B-Instruct
|
| 4 |
+
[STEP] step=1 action={"bug_identified": true, "bug_location": "line 3", "bug_type": "off-by-one", "bug_description": "loop range(len(transactions) + 1) index error off-by-one out of bounds error", "severity": "medium", "suggested_fix": "range(len(transactions))"} reward=0.92 done=true error=null
|
| 5 |
+
[END] success=true steps=1 score=0.917 rewards=0.92
|
| 6 |
+
[START] task=js-auth-privilege env=code-security-review model=meta-llama/Llama-3.3-70B-Instruct
|
| 7 |
+
[STEP] step=1 action={"bug_identified": true, "bug_location": "line 3", "bug_type": "logic-error", "bug_description": "logic operator || bypass escalation authorization bypass access", "severity": "critical", "suggested_fix": "user.role === \"admin\" && user.isActive"} reward=0.91 done=true error=null
|
| 8 |
+
[END] success=true steps=1 score=0.912 rewards=0.91
|
| 9 |
+
[START] task=python-sql-injection env=code-security-review model=meta-llama/Llama-3.3-70B-Instruct
|
| 10 |
+
[STEP] step=1 action={"bug_identified": true, "bug_location": "line 2", "bug_type": "security-vulnerability", "bug_description": "f-string SQLi injection-flaw raw-sql SQL-interpolation", "severity": "critical", "suggested_fix": "parameterized query bind variables"} reward=0.92 done=true error=null
|
| 11 |
+
[END] success=true steps=1 score=0.920 rewards=0.92
|
| 12 |
+
|
| 13 |
+
[SUMMARY] avg_reward=0.916 tasks_passed=3/3
|
pyproject.toml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=61.0"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "code-security-review"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "RL environment for training AI agents to perform code security review."
|
| 9 |
+
authors = [
|
| 10 |
+
{ name="Inmodel Labs", email="support@inmodel.ai" },
|
| 11 |
+
]
|
| 12 |
+
dependencies = [
|
| 13 |
+
"fastapi>=0.115.0",
|
| 14 |
+
"uvicorn>=0.30.6",
|
| 15 |
+
"pydantic>=2.7.4",
|
| 16 |
+
"requests>=2.32.3",
|
| 17 |
+
"python-dotenv>=1.0.0",
|
| 18 |
+
"openai>=1.30.0",
|
| 19 |
+
"openenv-core>=0.2.3",
|
| 20 |
+
]
|
| 21 |
+
requires-python = ">=3.9"
|
| 22 |
+
|
| 23 |
+
[project.scripts]
|
| 24 |
+
server = "server.app:main"
|
| 25 |
+
|
| 26 |
+
[tool.setuptools.package-data]
|
| 27 |
+
"*" = ["*.yaml", "*.md", "*.py"]
|
qa_test.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
BASE_URL = "http://localhost:7860"
|
| 5 |
+
|
| 6 |
+
def run_tests():
|
| 7 |
+
checks = []
|
| 8 |
+
|
| 9 |
+
# 1. GET /
|
| 10 |
+
try:
|
| 11 |
+
r = requests.get(f"{BASE_URL}/")
|
| 12 |
+
passed = r.status_code == 200 and r.json().get("status") == "ok"
|
| 13 |
+
checks.append({
|
| 14 |
+
"id": 1, "name": "GET / health check", "passed": passed,
|
| 15 |
+
"expected": 'HTTP 200 and {"status": "ok"}', "got": f"HTTP {r.status_code} {r.text}"
|
| 16 |
+
})
|
| 17 |
+
except Exception as e:
|
| 18 |
+
checks.append({"id": 1, "name": "GET / health check", "passed": False, "expected": "200 OK", "got": str(e)})
|
| 19 |
+
|
| 20 |
+
# 15. GET /state before reset (Edge case)
|
| 21 |
+
try:
|
| 22 |
+
r = requests.get(f"{BASE_URL}/state")
|
| 23 |
+
# Should not crash
|
| 24 |
+
checks.append({
|
| 25 |
+
"id": 15, "name": "GET /state before any reset", "passed": r.status_code == 200,
|
| 26 |
+
"expected": "HTTP 200 (No crash)", "got": f"HTTP {r.status_code} {r.text}"
|
| 27 |
+
})
|
| 28 |
+
except Exception as e:
|
| 29 |
+
checks.append({"id": 15, "name": "GET /state before any reset", "passed": False, "expected": "200 OK", "got": str(e)})
|
| 30 |
+
|
| 31 |
+
# 2. POST /reset
|
| 32 |
+
try:
|
| 33 |
+
r = requests.post(f"{BASE_URL}/reset")
|
| 34 |
+
data = r.json().get("observation", {})
|
| 35 |
+
required = ["task_id", "language", "difficulty", "code_snippet", "context", "pr_title", "file_path"]
|
| 36 |
+
passed = all(k in data for k in required)
|
| 37 |
+
checks.append({
|
| 38 |
+
"id": 2, "name": "POST /reset fields check", "passed": passed,
|
| 39 |
+
"expected": f"JSON with {required}", "got": list(data.keys())
|
| 40 |
+
})
|
| 41 |
+
except Exception as e:
|
| 42 |
+
checks.append({"id": 2, "name": "POST /reset fields check", "passed": False, "expected": "Fields", "got": str(e)})
|
| 43 |
+
|
| 44 |
+
# 16. POST /reset no task_id
|
| 45 |
+
try:
|
| 46 |
+
r = requests.post(f"{BASE_URL}/reset")
|
| 47 |
+
checks.append({
|
| 48 |
+
"id": 16, "name": "POST /reset no task_id (Random)", "passed": r.status_code == 200,
|
| 49 |
+
"expected": "HTTP 200", "got": f"HTTP {r.status_code}"
|
| 50 |
+
})
|
| 51 |
+
except Exception as e:
|
| 52 |
+
checks.append({"id": 16, "name": "POST /reset no task_id (Random)", "passed": False, "expected": "200 OK", "got": str(e)})
|
| 53 |
+
|
| 54 |
+
# 3-5. POST /reset?task_id=...
|
| 55 |
+
for tid in ["python-off-by-one", "js-auth-privilege", "python-sql-injection"]:
|
| 56 |
+
try:
|
| 57 |
+
num = {"python-off-by-one": 3, "js-auth-privilege": 4, "python-sql-injection": 5}[tid]
|
| 58 |
+
r = requests.post(f"{BASE_URL}/reset?task_id={tid}")
|
| 59 |
+
passed = r.status_code == 200 and r.json()["observation"]["task_id"] == tid
|
| 60 |
+
checks.append({
|
| 61 |
+
"id": num, "name": f"POST /reset for {tid}", "passed": passed,
|
| 62 |
+
"expected": f"HTTP 200 with task_id={tid}", "got": f"HTTP {r.status_code} {r.json()['observation']['task_id'] if passed else r.text}"
|
| 63 |
+
})
|
| 64 |
+
except Exception as e:
|
| 65 |
+
checks.append({"id": num, "name": f"POST /reset for {tid}", "passed": False, "expected": "200 OK", "got": str(e)})
|
| 66 |
+
|
| 67 |
+
# 6. GET /state
|
| 68 |
+
try:
|
| 69 |
+
r = requests.get(f"{BASE_URL}/state")
|
| 70 |
+
data = r.json()
|
| 71 |
+
required = ["task_id", "step", "done", "total_reward"]
|
| 72 |
+
passed = all(k in data for k in required)
|
| 73 |
+
checks.append({
|
| 74 |
+
"id": 6, "name": "GET /state fields check", "passed": passed,
|
| 75 |
+
"expected": f"JSON with {required}", "got": list(data.keys())
|
| 76 |
+
})
|
| 77 |
+
except Exception as e:
|
| 78 |
+
checks.append({"id": 6, "name": "GET /state fields check", "passed": False, "expected": "Fields", "got": str(e)})
|
| 79 |
+
|
| 80 |
+
# 7. POST /step with PROVIDED action
|
| 81 |
+
try:
|
| 82 |
+
requests.post(f"{BASE_URL}/reset?task_id=python-sql-injection")
|
| 83 |
+
action = {
|
| 84 |
+
"bug_identified": True,
|
| 85 |
+
"bug_location": "line 2 f-string",
|
| 86 |
+
"bug_type": "security-vulnerability",
|
| 87 |
+
"bug_description": "SQL injection via f-string",
|
| 88 |
+
"severity": "critical",
|
| 89 |
+
"suggested_fix": "use parameterized query"
|
| 90 |
+
}
|
| 91 |
+
r = requests.post(f"{BASE_URL}/step", json=action)
|
| 92 |
+
res = r.json()
|
| 93 |
+
reward = res.get("reward", -1.0)
|
| 94 |
+
done = res.get("done", False)
|
| 95 |
+
passed = 0.0 <= reward <= 1.0 and done is True
|
| 96 |
+
checks.append({
|
| 97 |
+
"id": 7, "name": "POST /step valid action", "passed": passed,
|
| 98 |
+
"expected": "Reward [0,1] and done=true", "got": f"reward={reward}, done={done}"
|
| 99 |
+
})
|
| 100 |
+
except Exception as e:
|
| 101 |
+
checks.append({"id": 7, "name": "POST /step valid action", "passed": False, "expected": "Result", "got": str(e)})
|
| 102 |
+
|
| 103 |
+
# 14. Call POST /step twice (Edge Case)
|
| 104 |
+
try:
|
| 105 |
+
# Step already called in task 7
|
| 106 |
+
action = {"bug_identified": False, "bug_location": "", "bug_type": "none", "bug_description": "", "severity": "none", "suggested_fix": ""}
|
| 107 |
+
r = requests.post(f"{BASE_URL}/step", json=action)
|
| 108 |
+
res = r.json()
|
| 109 |
+
passed = r.status_code == 200 and "error" in res.get("info", {})
|
| 110 |
+
checks.append({
|
| 111 |
+
"id": 14, "name": "POST /step twice in same episode", "passed": passed,
|
| 112 |
+
"expected": "HTTP 200 and error in info", "got": f"HTTP {r.status_code}, info={res.get('info')}"
|
| 113 |
+
})
|
| 114 |
+
except Exception as e:
|
| 115 |
+
checks.append({"id": 14, "name": "POST /step twice in same episode", "passed": False, "expected": "Handled error", "got": str(e)})
|
| 116 |
+
|
| 117 |
+
# 8. Perfect action for SQL
|
| 118 |
+
try:
|
| 119 |
+
requests.post(f"{BASE_URL}/reset?task_id=python-sql-injection")
|
| 120 |
+
perfect_action = {
|
| 121 |
+
"bug_identified": True,
|
| 122 |
+
"bug_location": "line 2 f-string interpolation in SQL query construction",
|
| 123 |
+
"bug_type": "security-vulnerability",
|
| 124 |
+
"bug_description": "SQL injection vulnerability where user-supplied search_term is directly interpolated into the SQL query via f-string. An attacker can inject malicious SQL to bypass authentication, exfiltrate all user data, or drop tables. The fix is to use parameterized queries which sanitize user input automatically.",
|
| 125 |
+
"severity": "critical",
|
| 126 |
+
"suggested_fix": "Use db.execute('SELECT * FROM users WHERE name LIKE %s', ('%'+search_term+'%',)) instead of f-string interpolation"
|
| 127 |
+
}
|
| 128 |
+
r = requests.post(f"{BASE_URL}/step", json=perfect_action)
|
| 129 |
+
reward = r.json().get("reward", 0.0)
|
| 130 |
+
checks.append({
|
| 131 |
+
"id": 8, "name": "PERFECT action SQL", "passed": reward >= 0.85,
|
| 132 |
+
"expected": "Reward >= 0.85", "got": f"reward={reward}"
|
| 133 |
+
})
|
| 134 |
+
except Exception as e:
|
| 135 |
+
checks.append({"id": 8, "name": "PERFECT action SQL", "passed": False, "expected": ">=0.85", "got": str(e)})
|
| 136 |
+
|
| 137 |
+
# 9. Keyword stuffed
|
| 138 |
+
try:
|
| 139 |
+
requests.post(f"{BASE_URL}/reset?task_id=python-sql-injection")
|
| 140 |
+
stuffed_action = {
|
| 141 |
+
"bug_identified": True,
|
| 142 |
+
"bug_location": "sql",
|
| 143 |
+
"bug_type": "security-vulnerability",
|
| 144 |
+
"bug_description": "sql injection sql injection sql injection parameterized f-string sanitize escape malicious attack tautology union drop sql injection sql injection",
|
| 145 |
+
"severity": "critical",
|
| 146 |
+
"suggested_fix": "fix"
|
| 147 |
+
}
|
| 148 |
+
r = requests.post(f"{BASE_URL}/step", json=stuffed_action)
|
| 149 |
+
reward = r.json().get("reward", 1.0)
|
| 150 |
+
checks.append({
|
| 151 |
+
"id": 9, "name": "KEYWORD STUFFED action", "passed": reward <= 0.20,
|
| 152 |
+
"expected": "Reward <= 0.20", "got": f"reward={reward}"
|
| 153 |
+
})
|
| 154 |
+
except Exception as e:
|
| 155 |
+
checks.append({"id": 9, "name": "KEYWORD STUFFED action", "passed": False, "expected": "<=0.20", "got": str(e)})
|
| 156 |
+
|
| 157 |
+
# 10. Bug identified false
|
| 158 |
+
try:
|
| 159 |
+
requests.post(f"{BASE_URL}/reset")
|
| 160 |
+
action = {"bug_identified": False, "bug_location": "", "bug_type": "none", "bug_description": "", "severity": "none", "suggested_fix": ""}
|
| 161 |
+
r = requests.post(f"{BASE_URL}/step", json=action)
|
| 162 |
+
reward = r.json().get("reward", 1.0)
|
| 163 |
+
checks.append({
|
| 164 |
+
"id": 10, "name": "Identify=False empty fields", "passed": reward == 0.0,
|
| 165 |
+
"expected": "Reward exactly 0.0", "got": f"reward={reward}"
|
| 166 |
+
})
|
| 167 |
+
except Exception as e:
|
| 168 |
+
checks.append({"id": 10, "name": "Identify=False empty fields", "passed": False, "expected": "0.0", "got": str(e)})
|
| 169 |
+
|
| 170 |
+
# 11. Partial credit severity
|
| 171 |
+
try:
|
| 172 |
+
# Off-by-one is severity critical (I set it to critical).
|
| 173 |
+
# Let's say I submit 'low' severity.
|
| 174 |
+
requests.post(f"{BASE_URL}/reset?task_id=python-off-by-one")
|
| 175 |
+
action = {
|
| 176 |
+
"bug_identified": True, "bug_location": "range", "bug_type": "off-by-one",
|
| 177 |
+
"bug_description": "off-by-one error in range function call",
|
| 178 |
+
"severity": "low", # Wrong severity
|
| 179 |
+
"suggested_fix": "range(len(x))"
|
| 180 |
+
}
|
| 181 |
+
r = requests.post(f"{BASE_URL}/step", json=action)
|
| 182 |
+
info = r.json().get("info", {})
|
| 183 |
+
breakdown = info.get("reward_breakdown", {})
|
| 184 |
+
sev_score = breakdown.get("severity", -1.0)
|
| 185 |
+
# It should be 0.0 (wrong) but the total should still have partial credit from other components
|
| 186 |
+
reward = r.json().get("reward", 0.0)
|
| 187 |
+
checks.append({
|
| 188 |
+
"id": 11, "name": "Partial credit (wrong severity)", "passed": 0.0 < reward < 1.0,
|
| 189 |
+
"expected": "Reward between 0 and 1 (partial credit)", "got": f"reward={reward}, severity_component={sev_score}"
|
| 190 |
+
})
|
| 191 |
+
except Exception as e:
|
| 192 |
+
checks.append({"id": 11, "name": "Partial credit (wrong severity)", "passed": False, "expected": "Partial credit", "got": str(e)})
|
| 193 |
+
|
| 194 |
+
# 12-13. Breakdown keys and components
|
| 195 |
+
try:
|
| 196 |
+
requests.post(f"{BASE_URL}/reset")
|
| 197 |
+
action = {"bug_identified": True, "bug_location": "test", "bug_type": "test", "bug_description": "test test test test test test test test test test test test test test test test test test test test", "severity": "none", "suggested_fix": "test test test"}
|
| 198 |
+
r = requests.post(f"{BASE_URL}/step", json=action)
|
| 199 |
+
info = r.json().get("info", {})
|
| 200 |
+
breakdown = info.get("reward_breakdown", {})
|
| 201 |
+
required = ["bug_identified", "bug_type", "bug_location", "description_quality", "fix_quality", "severity"]
|
| 202 |
+
checks.append({
|
| 203 |
+
"id": 12, "name": "Reward breakdown keys", "passed": all(k in breakdown for k in required),
|
| 204 |
+
"expected": f"Breakdown with {required}", "got": list(breakdown.keys())
|
| 205 |
+
})
|
| 206 |
+
|
| 207 |
+
max_vals = {
|
| 208 |
+
"bug_identified": 0.20, "bug_type": 0.20, "bug_location": 0.10,
|
| 209 |
+
"description_quality": 0.25, "fix_quality": 0.15, "severity": 0.10
|
| 210 |
+
}
|
| 211 |
+
passed_range = all(0.0 <= breakdown.get(k, -1) <= max_vals[k] for k in max_vals)
|
| 212 |
+
checks.append({
|
| 213 |
+
"id": 13, "name": "Component score ranges", "passed": passed_range,
|
| 214 |
+
"expected": "All components <= max", "got": breakdown
|
| 215 |
+
})
|
| 216 |
+
except Exception as e:
|
| 217 |
+
checks.append({"id": 12, "name": "Breakdown checks", "passed": False, "expected": "Breakdown", "got": str(e)})
|
| 218 |
+
|
| 219 |
+
# Sort and print
|
| 220 |
+
checks.sort(key=lambda x: x["id"])
|
| 221 |
+
for c in checks:
|
| 222 |
+
status = "PASS" if c["passed"] else "FAIL"
|
| 223 |
+
print(f"[{c['id']}] {c['name']} β {status}")
|
| 224 |
+
print(f" Expected: {c['expected']}")
|
| 225 |
+
print(f" Got: {c['got']}")
|
| 226 |
+
print("")
|
| 227 |
+
|
| 228 |
+
passed_count = sum(1 for c in checks if c["passed"])
|
| 229 |
+
disqual = "YES" if passed_count < 7 else "NO" # Disqualified if Part 1 fails
|
| 230 |
+
print(f"TOTAL: {passed_count}/16 passed")
|
| 231 |
+
print(f"DISQUALIFICATION RISK: {disqual}")
|
| 232 |
+
# Estimate score based on points
|
| 233 |
+
score = (passed_count / 16) * 100
|
| 234 |
+
print(f"ESTIMATED SCORE: {round(score)}/100")
|
| 235 |
+
|
| 236 |
+
if __name__ == "__main__":
|
| 237 |
+
run_tests()
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn
|
| 3 |
+
httptools
|
| 4 |
+
uvloop
|
| 5 |
+
pydantic==2.7.4
|
| 6 |
+
requests==2.32.3
|
| 7 |
+
openai==1.40.0
|
| 8 |
+
python-dotenv==1.0.1
|
server/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Server package for the Code Security Review environment.
|
| 2 |
+
|
| 3 |
+
This module houses the core FastAPI server, environment definitions,
|
| 4 |
+
evaluation graders, and structured schema validations.
|
| 5 |
+
"""
|
server/app.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Main FastAPI application for Code Security Review.
|
| 2 |
+
|
| 3 |
+
Exposes RESTful endpoints conforming to standard OpenEnv compliance specifications
|
| 4 |
+
dictating interactions for agent evaluation.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import uvicorn
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
from fastapi import FastAPI, HTTPException, Query, status
|
| 11 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
+
|
| 13 |
+
from server.models import CodeReviewAction, StepResult, ResetResponse, StateResponse, TaskInfo
|
| 14 |
+
from server.tasks import TASKS
|
| 15 |
+
from server.environment import CodeSecurityEnv
|
| 16 |
+
|
| 17 |
+
app = FastAPI(
|
| 18 |
+
title="Code Security Review β OpenEnv",
|
| 19 |
+
description="An RL environment for training AI agents to perform code security review.",
|
| 20 |
+
version="1.0.0",
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
app.add_middleware(
|
| 24 |
+
CORSMiddleware,
|
| 25 |
+
allow_origins=["*"],
|
| 26 |
+
allow_methods=["*"],
|
| 27 |
+
allow_headers=["*"],
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
env = CodeSecurityEnv()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@app.get("/")
|
| 34 |
+
def health() -> dict:
|
| 35 |
+
"""Health check endpoint."""
|
| 36 |
+
return {
|
| 37 |
+
"status": "ok",
|
| 38 |
+
"project": "Code Security Review - OpenEnv",
|
| 39 |
+
"version": "1.0.0",
|
| 40 |
+
"organization": "Inmodel Labs",
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@app.get("/tasks", response_model=List[TaskInfo])
|
| 45 |
+
def list_tasks() -> List[TaskInfo]:
|
| 46 |
+
"""List all available tasks."""
|
| 47 |
+
return [
|
| 48 |
+
TaskInfo(
|
| 49 |
+
id=t["id"],
|
| 50 |
+
language=t["language"],
|
| 51 |
+
bug_class=t["bug_class"],
|
| 52 |
+
difficulty=t["difficulty"],
|
| 53 |
+
)
|
| 54 |
+
for t in TASKS.values()
|
| 55 |
+
]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@app.post("/reset", response_model=ResetResponse)
|
| 59 |
+
def reset(
|
| 60 |
+
task_id: str = Query(default="python-off-by-one", description="Task ID to reset to"),
|
| 61 |
+
seed: Optional[int] = Query(default=None, description="Optional seed for reproducibility")
|
| 62 |
+
) -> ResetResponse:
|
| 63 |
+
"""Reset the environment and return the first observation."""
|
| 64 |
+
if task_id not in TASKS:
|
| 65 |
+
raise HTTPException(
|
| 66 |
+
status_code=status.HTTP_404_NOT_FOUND,
|
| 67 |
+
detail=f"Task '{task_id}' not found."
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
try:
|
| 71 |
+
obs = env.reset(task_id=task_id, seed=seed)
|
| 72 |
+
return ResetResponse(observation=obs)
|
| 73 |
+
except Exception as e:
|
| 74 |
+
raise HTTPException(
|
| 75 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 76 |
+
detail=f"System breakdown during environment reset: {e}"
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@app.post("/step", response_model=StepResult)
|
| 81 |
+
def step(action: CodeReviewAction) -> StepResult:
|
| 82 |
+
"""Submit a code review action and receive a reward signal."""
|
| 83 |
+
try:
|
| 84 |
+
return env.step(action)
|
| 85 |
+
except Exception as e:
|
| 86 |
+
raise HTTPException(
|
| 87 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 88 |
+
detail=f"Error executing agent action logic: {e}"
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@app.get("/state", response_model=StateResponse)
|
| 93 |
+
def state() -> StateResponse:
|
| 94 |
+
"""Return the current environment state."""
|
| 95 |
+
try:
|
| 96 |
+
return env.state()
|
| 97 |
+
except Exception as e:
|
| 98 |
+
raise HTTPException(
|
| 99 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 100 |
+
detail=f"Error analyzing global runtime state tracking: {e}"
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def main() -> None:
|
| 105 |
+
"""Run the environment ASGI server natively."""
|
| 106 |
+
port_default = os.environ.get("PORT", "8000")
|
| 107 |
+
try:
|
| 108 |
+
port = int(port_default)
|
| 109 |
+
except ValueError:
|
| 110 |
+
port = 8000
|
| 111 |
+
|
| 112 |
+
uvicorn.run(
|
| 113 |
+
"server.app:app",
|
| 114 |
+
host="0.0.0.0",
|
| 115 |
+
port=port,
|
| 116 |
+
reload=False,
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
if __name__ == "__main__":
|
| 121 |
+
main()
|
server/environment.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reinforcement Learning Environment Core.
|
| 2 |
+
|
| 3 |
+
Defines the environment logic, maintaining the current trajectory
|
| 4 |
+
state and mediating between incoming requests and the headless grader.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import random
|
| 8 |
+
from typing import Optional, Dict, Any
|
| 9 |
+
|
| 10 |
+
from server.tasks import TASKS
|
| 11 |
+
from server.grader import grade_action
|
| 12 |
+
from server.models import StepResult, StateResponse, Action, Observation
|
| 13 |
+
|
| 14 |
+
ERROR_EPISODE_COMPLETED = "Episode already completed. Call /reset to start a new episode."
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class CodeSecurityEnv:
|
| 18 |
+
"""Simulates the stateful progression of a software security assessment."""
|
| 19 |
+
|
| 20 |
+
def __init__(self) -> None:
|
| 21 |
+
"""Initialize a fresh environment instance."""
|
| 22 |
+
self.current_task: Optional[Dict[str, Any]] = None
|
| 23 |
+
self.step_count: int = 0
|
| 24 |
+
self.done: bool = False
|
| 25 |
+
self.total_reward: float = 0.0
|
| 26 |
+
self._task_ids = list(TASKS.keys())
|
| 27 |
+
|
| 28 |
+
def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> Observation:
|
| 29 |
+
"""Reset the environment safely to a new or targeted initial state.
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
task_id: Optionally force the environment to yield a specific task definition.
|
| 33 |
+
seed: Initialize standard random seed.
|
| 34 |
+
|
| 35 |
+
Returns:
|
| 36 |
+
An Observation baseline reflecting the new scenario context.
|
| 37 |
+
"""
|
| 38 |
+
if seed is not None:
|
| 39 |
+
random.seed(seed)
|
| 40 |
+
|
| 41 |
+
if task_id and task_id in TASKS:
|
| 42 |
+
self.current_task = TASKS[task_id]
|
| 43 |
+
else:
|
| 44 |
+
chosen_id = random.choice(self._task_ids)
|
| 45 |
+
self.current_task = TASKS[chosen_id]
|
| 46 |
+
|
| 47 |
+
self.step_count = 0
|
| 48 |
+
self.done = False
|
| 49 |
+
self.total_reward = 0.0
|
| 50 |
+
|
| 51 |
+
return self._make_observation()
|
| 52 |
+
|
| 53 |
+
def step(self, action: Action) -> StepResult:
|
| 54 |
+
"""Advance the environment state using a provided agent Action payload.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
action: Evaluated metrics provided directly by agent decision matrices.
|
| 58 |
+
|
| 59 |
+
Returns:
|
| 60 |
+
A StepResult containing scalar reward metrics and end-of-episode flag.
|
| 61 |
+
"""
|
| 62 |
+
if self.current_task is None:
|
| 63 |
+
self.reset()
|
| 64 |
+
|
| 65 |
+
if self.done:
|
| 66 |
+
return StepResult(
|
| 67 |
+
observation=self._make_observation(),
|
| 68 |
+
reward=0.0,
|
| 69 |
+
done=True,
|
| 70 |
+
info={"error": ERROR_EPISODE_COMPLETED},
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# Intermediate Step: Request file
|
| 74 |
+
if getattr(action, "request_file", False):
|
| 75 |
+
self.step_count += 1
|
| 76 |
+
reward = 0.20
|
| 77 |
+
self.total_reward += reward
|
| 78 |
+
self.done = False
|
| 79 |
+
return StepResult(
|
| 80 |
+
observation=self._make_observation(),
|
| 81 |
+
reward=reward,
|
| 82 |
+
done=self.done,
|
| 83 |
+
info={
|
| 84 |
+
"task_name": getattr(self.current_task, "get", dict().get)("name", "Unknown Task") if self.current_task else "Unknown Task",
|
| 85 |
+
"step_count": self.step_count
|
| 86 |
+
},
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
try:
|
| 90 |
+
reward, breakdown = grade_action(action.model_dump(), self.current_task)
|
| 91 |
+
except Exception as e:
|
| 92 |
+
reward, breakdown = 0.0, {"error": f"Evaluation error: {e}"}
|
| 93 |
+
|
| 94 |
+
self.step_count += 1
|
| 95 |
+
self.total_reward += reward
|
| 96 |
+
self.done = True # single-step environment becomes max 2-step
|
| 97 |
+
|
| 98 |
+
return StepResult(
|
| 99 |
+
observation=self._make_observation(),
|
| 100 |
+
reward=reward,
|
| 101 |
+
done=self.done,
|
| 102 |
+
info={
|
| 103 |
+
"reward_breakdown": breakdown,
|
| 104 |
+
"task_name": self.current_task.get("name", "Unknown Task"),
|
| 105 |
+
"step_count": self.step_count
|
| 106 |
+
},
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
def state(self) -> StateResponse:
|
| 110 |
+
"""Return global analytics tracking the current environment session state."""
|
| 111 |
+
current_id = self.current_task["id"] if getattr(self, "current_task", None) else ""
|
| 112 |
+
return StateResponse(
|
| 113 |
+
task_id=current_id,
|
| 114 |
+
step=self.step_count,
|
| 115 |
+
done=self.done,
|
| 116 |
+
total_reward=self.total_reward,
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
def _make_observation(self) -> Observation:
|
| 120 |
+
"""Construct the contextual parameters surrounding an ongoing assessment."""
|
| 121 |
+
t = self.current_task
|
| 122 |
+
if not t:
|
| 123 |
+
raise KeyError("Attempted observation render without an initialized active task")
|
| 124 |
+
|
| 125 |
+
# Hide the snippet before Step 1
|
| 126 |
+
snippet = t["code_snippet"] if self.step_count > 0 else "<FILE CONTENTS HIDDEN - Submit {\"request_file\": true} to view>"
|
| 127 |
+
|
| 128 |
+
return Observation(
|
| 129 |
+
task_id=t["id"],
|
| 130 |
+
language=t["language"],
|
| 131 |
+
difficulty=t["difficulty"],
|
| 132 |
+
code_snippet=snippet,
|
| 133 |
+
context=t["context"],
|
| 134 |
+
pr_title=t["pr_title"],
|
| 135 |
+
file_path=t["file_path"],
|
| 136 |
+
)
|
server/grader.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Review Grader System.
|
| 2 |
+
|
| 3 |
+
Implements programmatic sub-scoring logic for evaluating agent
|
| 4 |
+
security actions against internal semantic criteria.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Tuple, Dict, Any
|
| 8 |
+
|
| 9 |
+
SCORE_BUG_IDENTIFIED = 0.20
|
| 10 |
+
SCORE_BUG_TYPE = 0.20
|
| 11 |
+
SCORE_BUG_LOCATION = 0.10
|
| 12 |
+
SCORE_DESC_QUALITY = 0.25
|
| 13 |
+
SCORE_FIX_QUALITY = 0.15
|
| 14 |
+
SCORE_SEV_EXACT = 0.10
|
| 15 |
+
SCORE_SEV_PARTIAL = 0.05
|
| 16 |
+
|
| 17 |
+
KEYWORD_HIT_TARGET = 3.0
|
| 18 |
+
PENALTY_THRESHOLD = 0.5
|
| 19 |
+
PENALTY_MULTIPLIER = 0.2
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def grade_action(action: Dict[str, Any], task: Dict[str, Any]) -> Tuple[float, Dict[str, float]]:
|
| 23 |
+
"""Evaluate an action against the task definition.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
action: The structured payload proposed by the AI agent.
|
| 27 |
+
task: The dictionary blueprint detailing the expected vulnerability.
|
| 28 |
+
|
| 29 |
+
Returns:
|
| 30 |
+
A tuple of the normalized aggregate reward and the individual component breakdown.
|
| 31 |
+
"""
|
| 32 |
+
reward = 0.0
|
| 33 |
+
breakdown: Dict[str, float] = {}
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
# ββ Component 1: Bug identified (0.20) ββββββββββββββββββββββββββββββββββ
|
| 37 |
+
if action.get("bug_identified"):
|
| 38 |
+
reward += SCORE_BUG_IDENTIFIED
|
| 39 |
+
breakdown["bug_identified"] = SCORE_BUG_IDENTIFIED
|
| 40 |
+
else:
|
| 41 |
+
breakdown["bug_identified"] = 0.00
|
| 42 |
+
# No bug found β no partial credit for anything else
|
| 43 |
+
return max(0.0, min(1.0, reward)), breakdown
|
| 44 |
+
|
| 45 |
+
# ββ Component 2: Bug type match (0.20) ββββββββββββββββββββββββββββββββββ
|
| 46 |
+
action_type = action.get("bug_type", "").lower().replace("-", " ").replace("_", " ")
|
| 47 |
+
task_type = task["bug_type"].lower().replace("-", " ").replace("_", " ")
|
| 48 |
+
if task_type in action_type or action_type in task_type:
|
| 49 |
+
reward += SCORE_BUG_TYPE
|
| 50 |
+
breakdown["bug_type"] = SCORE_BUG_TYPE
|
| 51 |
+
else:
|
| 52 |
+
breakdown["bug_type"] = 0.00
|
| 53 |
+
|
| 54 |
+
# ββ Component 3: Bug location (0.10) ββββββββββββββββββββββββββββββββββββ
|
| 55 |
+
action_location = action.get("bug_location", "").lower()
|
| 56 |
+
location_keywords = [w for w in task["bug_location"].lower().split() if len(w) > 3]
|
| 57 |
+
if location_keywords:
|
| 58 |
+
matched = sum(1 for kw in location_keywords if kw in action_location)
|
| 59 |
+
loc_score = round(SCORE_BUG_LOCATION * (matched / len(location_keywords)), 4)
|
| 60 |
+
else:
|
| 61 |
+
loc_score = 0.0
|
| 62 |
+
|
| 63 |
+
reward += loc_score
|
| 64 |
+
breakdown["bug_location"] = loc_score
|
| 65 |
+
|
| 66 |
+
# ββ Component 4: Description quality (0.25) ββββββββββββββββββββββββββββββ
|
| 67 |
+
description = action.get("bug_description", "").lower()
|
| 68 |
+
desc_score = 0.0
|
| 69 |
+
if len(description) >= 20:
|
| 70 |
+
task_keywords = task["keywords"]
|
| 71 |
+
target = task.get("keyword_target_override", KEYWORD_HIT_TARGET)
|
| 72 |
+
matched_kw = [kw for kw in task_keywords if kw in description]
|
| 73 |
+
desc_score = round(min(SCORE_DESC_QUALITY, SCORE_DESC_QUALITY * (len(matched_kw) / target)), 4)
|
| 74 |
+
|
| 75 |
+
breakdown["description_quality"] = desc_score
|
| 76 |
+
reward += desc_score
|
| 77 |
+
|
| 78 |
+
# ββ Component 5: Fix quality (0.15) ββββββββββββββββββββββββββββββββββββββ
|
| 79 |
+
fix = action.get("suggested_fix", "").lower()
|
| 80 |
+
fix_score = 0.0
|
| 81 |
+
if len(fix) >= 10:
|
| 82 |
+
fix_patterns = task["fix_patterns"]
|
| 83 |
+
matched_fix = [p for p in fix_patterns if p.lower() in fix]
|
| 84 |
+
fix_score = round(min(SCORE_FIX_QUALITY, SCORE_FIX_QUALITY * len(matched_fix)), 4)
|
| 85 |
+
|
| 86 |
+
breakdown["fix_quality"] = fix_score
|
| 87 |
+
reward += fix_score
|
| 88 |
+
|
| 89 |
+
# ββ Component 6: Severity (0.10) βββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
action_sev = action.get("severity", "").lower()
|
| 91 |
+
task_sev = task["severity"].lower()
|
| 92 |
+
if action_sev == task_sev:
|
| 93 |
+
sev_score = SCORE_SEV_EXACT
|
| 94 |
+
elif action_sev in ("high", "critical") and task_sev in ("high", "critical"):
|
| 95 |
+
sev_score = SCORE_SEV_PARTIAL
|
| 96 |
+
else:
|
| 97 |
+
sev_score = 0.00
|
| 98 |
+
|
| 99 |
+
breakdown["severity"] = sev_score
|
| 100 |
+
reward += sev_score
|
| 101 |
+
|
| 102 |
+
# ββ Global Penalty: Keyword Stuffing ββββββββββββββββββββββββββββββββββββ
|
| 103 |
+
words = description.split()
|
| 104 |
+
unique_ratio = len(set(words)) / len(words) if words else 1.0
|
| 105 |
+
if unique_ratio < PENALTY_THRESHOLD:
|
| 106 |
+
reward *= PENALTY_MULTIPLIER
|
| 107 |
+
breakdown["stuffing_penalty_multiplier"] = PENALTY_MULTIPLIER
|
| 108 |
+
for k in list(breakdown.keys()):
|
| 109 |
+
if k != "stuffing_penalty_multiplier":
|
| 110 |
+
breakdown[k] = round(breakdown[k] * PENALTY_MULTIPLIER, 4)
|
| 111 |
+
|
| 112 |
+
return max(0.0, min(1.0, round(reward, 4))), breakdown
|
| 113 |
+
|
| 114 |
+
except KeyError as exc:
|
| 115 |
+
raise RuntimeError(f"Missing mandatory schema key in task definition: {exc}") from exc
|
server/models.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic v2 models representing actions, observations, and state payloads."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Any, Dict
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
|
| 6 |
+
# ββ Agent Action ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 7 |
+
|
| 8 |
+
class CodeReviewAction(BaseModel):
|
| 9 |
+
"""Action taken by the agent: a structured code review or a file request."""
|
| 10 |
+
|
| 11 |
+
request_file: Optional[bool] = Field(None, description="Request the file contents")
|
| 12 |
+
bug_identified: Optional[bool] = Field(None, description="Whether a bug was found")
|
| 13 |
+
bug_location: Optional[str] = Field(None, description="Location of the bug (function, line, variable)")
|
| 14 |
+
bug_type: Optional[str] = Field(None, description="Type: off-by-one | logic-error | security-vulnerability | none")
|
| 15 |
+
bug_description: Optional[str] = Field(None, description="Detailed explanation of why this is a bug")
|
| 16 |
+
severity: Optional[str] = Field(None, description="Severity: none | low | medium | high | critical")
|
| 17 |
+
suggested_fix: Optional[str] = Field(None, description="The corrected code or a description of how to fix it")
|
| 18 |
+
|
| 19 |
+
# ββ Observation βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 20 |
+
|
| 21 |
+
class CodeObservation(BaseModel):
|
| 22 |
+
"""What the agent sees at each step."""
|
| 23 |
+
|
| 24 |
+
task_id: str = Field(..., description="Unique task identifier")
|
| 25 |
+
language: str = Field(..., description="Programming language")
|
| 26 |
+
difficulty: str = Field(..., description="Level: easy | medium | hard")
|
| 27 |
+
code_snippet: str = Field(..., description="The code to review")
|
| 28 |
+
context: str = Field(..., description="Production context describing what the code does")
|
| 29 |
+
pr_title: str = Field(..., description="Pull request title submitted by developer")
|
| 30 |
+
file_path: str = Field(..., description="File path of the code in the repository")
|
| 31 |
+
|
| 32 |
+
# ββ Step Result βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
|
| 34 |
+
class StepResult(BaseModel):
|
| 35 |
+
"""Result returned from env.step()."""
|
| 36 |
+
|
| 37 |
+
observation: Optional[CodeObservation] = Field(None, description="Observation if not terminal")
|
| 38 |
+
reward: float = Field(..., description="Reward generated for the preceding action")
|
| 39 |
+
done: bool = Field(..., description="Terminal state flag")
|
| 40 |
+
info: Dict[str, Any] = Field(default_factory=dict, description="Metadata dictionary")
|
| 41 |
+
|
| 42 |
+
# ββ State βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
|
| 44 |
+
class StateResponse(BaseModel):
|
| 45 |
+
"""Internal environment state exposed via /state."""
|
| 46 |
+
|
| 47 |
+
task_id: str = Field(..., description="Current running task")
|
| 48 |
+
step: int = Field(..., description="Current evaluation step")
|
| 49 |
+
done: bool = Field(..., description="Whether the episode resides in a terminal state")
|
| 50 |
+
total_reward: float = Field(..., description="Sum of step rewards over the episode")
|
| 51 |
+
|
| 52 |
+
# ββ API Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
+
|
| 54 |
+
class ResetResponse(BaseModel):
|
| 55 |
+
"""Response wrapper returned strictly on environment resets."""
|
| 56 |
+
|
| 57 |
+
observation: CodeObservation = Field(..., description="Initial environment observation upon reset")
|
| 58 |
+
|
| 59 |
+
class TaskInfo(BaseModel):
|
| 60 |
+
"""Metadata regarding an available task scenario."""
|
| 61 |
+
|
| 62 |
+
id: str = Field(..., description="Task UUID or unique string identifier")
|
| 63 |
+
language: str = Field(..., description="Source code language for the flaw context")
|
| 64 |
+
bug_class: str = Field(..., description="The classification parameter of the embedded bug")
|
| 65 |
+
difficulty: str = Field(..., description="The difficulty tier indicator (e.g. easy, medium)")
|
| 66 |
+
|
| 67 |
+
Action = CodeReviewAction
|
| 68 |
+
Observation = CodeObservation
|
| 69 |
+
Reward = float
|
server/tasks.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv Tasks for Code Security Review.
|
| 2 |
+
|
| 3 |
+
These task specifications are designed to rigorously test autonomous AI
|
| 4 |
+
agents' abilities to identify, classify, and mitigate common software
|
| 5 |
+
security vulnerabilities across distinct language paradigms.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Dict, Any
|
| 9 |
+
|
| 10 |
+
TASKS: Dict[str, Any] = {
|
| 11 |
+
"python-off-by-one": {
|
| 12 |
+
"id": "python-off-by-one",
|
| 13 |
+
"name": "Python Off-by-One Error",
|
| 14 |
+
"language": "Python",
|
| 15 |
+
"difficulty": "easy",
|
| 16 |
+
"bug_class": "Index Error / Off-by-one",
|
| 17 |
+
"pr_title": "Update finance batch processor for transactions",
|
| 18 |
+
"file_path": "finance/processor.py",
|
| 19 |
+
"context": "Process numeric transaction data for weekly reporting",
|
| 20 |
+
"code_snippet": (
|
| 21 |
+
"def calculate_total(transactions):\n"
|
| 22 |
+
" total = 0\n"
|
| 23 |
+
" for i in range(len(transactions) + 1):\n"
|
| 24 |
+
" total += transactions[i]\n"
|
| 25 |
+
" return total"
|
| 26 |
+
),
|
| 27 |
+
"bug_type": "off-by-one",
|
| 28 |
+
"bug_location": "line 3 β loop range(len(transactions) + 1) incorrectly iterates one past the end",
|
| 29 |
+
"severity": "medium",
|
| 30 |
+
"keywords": [
|
| 31 |
+
"off-by-one", "index", "error", "range", "length", "loop", "extra",
|
| 32 |
+
"out of bounds", "indexerror", "end", "one past", "terminates",
|
| 33 |
+
"iteration", "boundary", "array", "transactions", "last",
|
| 34 |
+
"overflow", "stop-condition", "size", "pointer"
|
| 35 |
+
],
|
| 36 |
+
"fix_patterns": [
|
| 37 |
+
"range(len(transactions))",
|
| 38 |
+
"enumerate(transactions)",
|
| 39 |
+
"for tx in transactions"
|
| 40 |
+
],
|
| 41 |
+
},
|
| 42 |
+
|
| 43 |
+
"js-idor-auth": {
|
| 44 |
+
"id": "js-idor-auth",
|
| 45 |
+
"name": "JavaScript IDOR Authorization Bypass",
|
| 46 |
+
"language": "JavaScript",
|
| 47 |
+
"difficulty": "medium",
|
| 48 |
+
"bug_class": "Insecure Direct Object Reference (IDOR)",
|
| 49 |
+
"pr_title": "Add user profile endpoint to REST API",
|
| 50 |
+
"file_path": "routes/users.js",
|
| 51 |
+
"context": "Node.js/Express REST API β authenticated endpoint returning a user's account profile",
|
| 52 |
+
"code_snippet": (
|
| 53 |
+
"const authenticate = require('./middleware/authenticate');\n\n"
|
| 54 |
+
"app.get('/users/:userId/profile', authenticate, async (req, res) => {\n"
|
| 55 |
+
" const user = await db.findUser(req.params.userId);\n"
|
| 56 |
+
" if (!user) return res.status(404).json({ error: 'User not found' });\n"
|
| 57 |
+
" return res.json(user);\n"
|
| 58 |
+
"});"
|
| 59 |
+
),
|
| 60 |
+
"bug_type": "logic-error",
|
| 61 |
+
"bug_location": "line 4 β no check that req.user.id matches req.params.userId",
|
| 62 |
+
"severity": "high",
|
| 63 |
+
"keywords": [
|
| 64 |
+
"idor", "insecure direct object reference", "authorization", "horizontal",
|
| 65 |
+
"privilege", "escalation", "authorization check", "user id",
|
| 66 |
+
"req.user", "params.userId", "ownership", "access control",
|
| 67 |
+
"unauthenticated", "other user", "missing check", "object-level"
|
| 68 |
+
],
|
| 69 |
+
"fix_patterns": [
|
| 70 |
+
"req.user.id",
|
| 71 |
+
"req.params.userId",
|
| 72 |
+
"403",
|
| 73 |
+
"Forbidden"
|
| 74 |
+
],
|
| 75 |
+
},
|
| 76 |
+
|
| 77 |
+
"python-pickle-deserialization": {
|
| 78 |
+
"id": "python-pickle-deserialization",
|
| 79 |
+
"name": "Python Pickle Deserialization",
|
| 80 |
+
"language": "Python",
|
| 81 |
+
"difficulty": "hard",
|
| 82 |
+
"bug_class": "Insecure Deserialization",
|
| 83 |
+
"pr_title": "Add distributed task caching layer for worker pool",
|
| 84 |
+
"file_path": "worker/cache.py",
|
| 85 |
+
"context": "Redis-backed caching decorator for worker tasks that serializes results to a shared cache",
|
| 86 |
+
"code_snippet": (
|
| 87 |
+
"import pickle, redis\n\n"
|
| 88 |
+
"_cache = redis.Redis(host='localhost')\n\n"
|
| 89 |
+
"def cached_task(key_prefix):\n"
|
| 90 |
+
" def decorator(fn):\n"
|
| 91 |
+
" def wrapper(*args, **kwargs):\n"
|
| 92 |
+
" cache_key = f'{key_prefix}:{args[0]}'\n"
|
| 93 |
+
" cached = _cache.get(cache_key)\n"
|
| 94 |
+
" if cached:\n"
|
| 95 |
+
" return pickle.loads(cached)\n"
|
| 96 |
+
" result = fn(*args, **kwargs)\n"
|
| 97 |
+
" _cache.set(cache_key, pickle.dumps(result), ex=3600)\n"
|
| 98 |
+
" return result\n"
|
| 99 |
+
" return wrapper\n"
|
| 100 |
+
" return decorator"
|
| 101 |
+
),
|
| 102 |
+
"bug_type": "insecure-deserialization",
|
| 103 |
+
"bug_location": "line 11 β pickle.loads(cached) deserializes untrusted Redis data without validation",
|
| 104 |
+
"severity": "critical",
|
| 105 |
+
"keywords": [
|
| 106 |
+
"cache poisoning", "redis poisoning", "__reduce__",
|
| 107 |
+
"magic method", "arbitrary bytecode", "hmac", "signing key",
|
| 108 |
+
"cryptographic integrity", "deserialization gadget", "supply chain"
|
| 109 |
+
],
|
| 110 |
+
"fix_patterns": [
|
| 111 |
+
"hmac.new",
|
| 112 |
+
"hmac.compare_digest",
|
| 113 |
+
"signing_key",
|
| 114 |
+
],
|
| 115 |
+
"keyword_target_override": 3.0,
|
| 116 |
+
},
|
| 117 |
+
}
|
static/index.html
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en" data-theme="dark">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Code Security Review Environment</title>
|
| 7 |
+
<meta name="description" content="RL Environment for training AI agents to detect bugs and security vulnerabilities.">
|
| 8 |
+
<link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600&family=Roboto+Mono:wght@400;500&display=swap" rel="stylesheet">
|
| 9 |
+
<link rel="stylesheet" href="/static/style.css">
|
| 10 |
+
<!-- Include Highlight.js for code formatting -->
|
| 11 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/tokyo-night-dark.min.css">
|
| 12 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
|
| 13 |
+
</head>
|
| 14 |
+
<body>
|
| 15 |
+
<div id="app-background"></div>
|
| 16 |
+
<div id="particle-overlay"></div>
|
| 17 |
+
|
| 18 |
+
<main class="container">
|
| 19 |
+
<header>
|
| 20 |
+
<h1>Code Security RL Environment</h1>
|
| 21 |
+
<p>Interactive baseline evaluation for AI Agents.</p>
|
| 22 |
+
</header>
|
| 23 |
+
|
| 24 |
+
<div class="mac-window">
|
| 25 |
+
<div class="mac-title-bar">
|
| 26 |
+
<div class="mac-dots">
|
| 27 |
+
<span class="dot red"></span>
|
| 28 |
+
<span class="dot yellow"></span>
|
| 29 |
+
<span class="dot green"></span>
|
| 30 |
+
</div>
|
| 31 |
+
<div class="mac-tabs">
|
| 32 |
+
<button class="mac-tab active" data-tab="playground">Playground</button>
|
| 33 |
+
<button class="mac-tab" data-tab="details">Model Details</button>
|
| 34 |
+
<button class="mac-tab" data-tab="specs">API Specs</button>
|
| 35 |
+
</div>
|
| 36 |
+
<button id="theme-toggle" class="theme-toggle" title="Toggle Theme">
|
| 37 |
+
<svg id="sun-icon" class="hidden" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="5"/><line x1="12" y1="1" x2="12" y2="3"/><line x1="12" y1="21" x2="12" y2="23"/><line x1="4.22" y1="4.22" x2="5.64" y2="5.64"/><line x1="18.36" y1="18.36" x2="19.78" y2="19.78"/><line x1="1" y1="12" x2="3" y2="12"/><line x1="21" y1="12" x2="23" y2="12"/><line x1="4.22" y1="19.78" x2="5.64" y2="18.36"/><line x1="18.36" y1="5.64" x2="19.78" y2="4.22"/></svg>
|
| 38 |
+
<svg id="moon-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z"/></svg>
|
| 39 |
+
</button>
|
| 40 |
+
</div>
|
| 41 |
+
|
| 42 |
+
<div class="window-content">
|
| 43 |
+
<div id="tab-playground" class="tab-pane active">
|
| 44 |
+
<div class="dashboard">
|
| 45 |
+
<!-- Left Column: Environment Observation -->
|
| 46 |
+
<section class="panel observation-panel" id="observation-section">
|
| 47 |
+
<div class="panel-header">
|
| 48 |
+
<h2>Environment State</h2>
|
| 49 |
+
<div class="badge-row">
|
| 50 |
+
<span id="badge-difficulty" class="badge">Loading...</span>
|
| 51 |
+
<span id="badge-step" class="badge">Step 0/0</span>
|
| 52 |
+
</div>
|
| 53 |
+
</div>
|
| 54 |
+
|
| 55 |
+
<div class="task-info">
|
| 56 |
+
<strong>Task:</strong> <span id="task-description">Initializing environment...</span>
|
| 57 |
+
</div>
|
| 58 |
+
|
| 59 |
+
<div id="feedback-container" class="feedback-info hidden">
|
| 60 |
+
<strong>Previous Feedback:</strong> <span id="previous-feedback"></span>
|
| 61 |
+
</div>
|
| 62 |
+
|
| 63 |
+
<div class="code-container">
|
| 64 |
+
<div class="code-header">
|
| 65 |
+
<span id="lang-badge">Language: Unknown</span>
|
| 66 |
+
</div>
|
| 67 |
+
<pre><code id="code-snippet" class="language-python"># Awaiting initialization...</code></pre>
|
| 68 |
+
</div>
|
| 69 |
+
</section>
|
| 70 |
+
|
| 71 |
+
<!-- Right Column: Agent Action Form -->
|
| 72 |
+
<section class="panel action-panel" id="action-section">
|
| 73 |
+
<div class="panel-header">
|
| 74 |
+
<h2>Agent Action</h2>
|
| 75 |
+
</div>
|
| 76 |
+
|
| 77 |
+
<form id="action-form">
|
| 78 |
+
<div class="form-group toggle-group">
|
| 79 |
+
<label for="input-bug-identified">Bug Identified</label>
|
| 80 |
+
<select id="input-bug-identified" required>
|
| 81 |
+
<option value="true" selected>Yes</option>
|
| 82 |
+
<option value="false">No</option>
|
| 83 |
+
</select>
|
| 84 |
+
</div>
|
| 85 |
+
|
| 86 |
+
<div class="form-group">
|
| 87 |
+
<label for="input-bug-type">Bug Type</label>
|
| 88 |
+
<select id="input-bug-type" required>
|
| 89 |
+
<option value="off-by-one">Off-by-one</option>
|
| 90 |
+
<option value="logic-error">Logic Error</option>
|
| 91 |
+
<option value="security-vulnerability">Security Vulnerability</option>
|
| 92 |
+
<option value="null-dereference">Null Dereference</option>
|
| 93 |
+
<option value="none">None</option>
|
| 94 |
+
</select>
|
| 95 |
+
</div>
|
| 96 |
+
|
| 97 |
+
<div class="form-group">
|
| 98 |
+
<label for="input-severity">Severity</label>
|
| 99 |
+
<select id="input-severity" required>
|
| 100 |
+
<option value="none">None</option>
|
| 101 |
+
<option value="low">Low</option>
|
| 102 |
+
<option value="medium">Medium</option>
|
| 103 |
+
<option value="high">High</option>
|
| 104 |
+
<option value="critical">Critical</option>
|
| 105 |
+
</select>
|
| 106 |
+
</div>
|
| 107 |
+
|
| 108 |
+
<div class="form-group">
|
| 109 |
+
<label for="input-bug-location">Bug Location</label>
|
| 110 |
+
<input type="text" id="input-bug-location" placeholder="e.g., fetch_records() line 4" required>
|
| 111 |
+
</div>
|
| 112 |
+
|
| 113 |
+
<div class="form-group">
|
| 114 |
+
<label for="input-bug-description">Description</label>
|
| 115 |
+
<textarea id="input-bug-description" rows="3" placeholder="Explain the vulnerability..." required></textarea>
|
| 116 |
+
</div>
|
| 117 |
+
|
| 118 |
+
<div class="form-group">
|
| 119 |
+
<label for="input-suggested-fix">Suggested Fix</label>
|
| 120 |
+
<textarea id="input-suggested-fix" rows="3" placeholder="Provide corrected code or explanation..." required></textarea>
|
| 121 |
+
</div>
|
| 122 |
+
|
| 123 |
+
<button type="submit" id="btn-submit-action" class="primary-btn">Submit Action</button>
|
| 124 |
+
<button type="button" id="btn-reset-env" class="secondary-btn">Reset Environment</button>
|
| 125 |
+
</form>
|
| 126 |
+
</section>
|
| 127 |
+
</div>
|
| 128 |
+
</div>
|
| 129 |
+
|
| 130 |
+
<div id="tab-details" class="tab-pane">
|
| 131 |
+
<div class="panel">
|
| 132 |
+
<h2>Model Details</h2>
|
| 133 |
+
<p style="margin-top: 1rem;">OpenEnv is an RL environment designed for security validation. This baseline uses standard reward signals to calibrate agents.</p>
|
| 134 |
+
<ul style="margin-top: 1rem; color: var(--text-muted); list-style-position: inside;">
|
| 135 |
+
<li>Deterministic Reward Signals</li>
|
| 136 |
+
<li>Multi-step Episode Support</li>
|
| 137 |
+
<li>Security-focused Task Sets</li>
|
| 138 |
+
</ul>
|
| 139 |
+
</div>
|
| 140 |
+
</div>
|
| 141 |
+
|
| 142 |
+
<div id="tab-specs" class="tab-pane">
|
| 143 |
+
<div class="panel">
|
| 144 |
+
<h2>API Specifications</h2>
|
| 145 |
+
<pre style="margin-top: 1rem; background: #000; padding: 1rem; border-radius: 4px;">POST /reset?difficulty={easy|medium|hard}
|
| 146 |
+
POST /step {bug_identified, bug_type, ...}
|
| 147 |
+
GET /state</pre>
|
| 148 |
+
</div>
|
| 149 |
+
</div>
|
| 150 |
+
</div>
|
| 151 |
+
</div>
|
| 152 |
+
|
| 153 |
+
<!-- Sticky Status Toast -->
|
| 154 |
+
<div id="reward-toast" class="toast hidden">
|
| 155 |
+
<div class="toast-content">
|
| 156 |
+
<span class="toast-icon">β¨</span>
|
| 157 |
+
<div class="toast-text">
|
| 158 |
+
<h3 id="toast-title">Reward Received</h3>
|
| 159 |
+
<p id="toast-message">Score: 0.0</p>
|
| 160 |
+
</div>
|
| 161 |
+
</div>
|
| 162 |
+
<button id="toast-close">×</button>
|
| 163 |
+
</div>
|
| 164 |
+
</main>
|
| 165 |
+
|
| 166 |
+
<script src="/static/main.js"></script>
|
| 167 |
+
</body>
|
| 168 |
+
</html>
|
static/main.js
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
document.addEventListener('DOMContentLoaded', () => {
|
| 2 |
+
// DOM Elements
|
| 3 |
+
const elements = {
|
| 4 |
+
badgeDifficulty: document.getElementById('badge-difficulty'),
|
| 5 |
+
badgeStep: document.getElementById('badge-step'),
|
| 6 |
+
taskDescription: document.getElementById('task-description'),
|
| 7 |
+
codeSnippet: document.getElementById('code-snippet'),
|
| 8 |
+
langBadge: document.getElementById('lang-badge'),
|
| 9 |
+
feedbackContainer: document.getElementById('feedback-container'),
|
| 10 |
+
previousFeedback: document.getElementById('previous-feedback'),
|
| 11 |
+
|
| 12 |
+
form: document.getElementById('action-form'),
|
| 13 |
+
submitBtn: document.getElementById('btn-submit-action'),
|
| 14 |
+
resetBtn: document.getElementById('btn-reset-env'),
|
| 15 |
+
|
| 16 |
+
toast: document.getElementById('reward-toast'),
|
| 17 |
+
toastTitle: document.getElementById('toast-title'),
|
| 18 |
+
toastMessage: document.getElementById('toast-message'),
|
| 19 |
+
toastClose: document.getElementById('toast-close'),
|
| 20 |
+
|
| 21 |
+
// Inputs
|
| 22 |
+
inputBugIdentified: document.getElementById('input-bug-identified'),
|
| 23 |
+
inputBugType: document.getElementById('input-bug-type'),
|
| 24 |
+
inputSeverity: document.getElementById('input-severity'),
|
| 25 |
+
inputBugLocation: document.getElementById('input-bug-location'),
|
| 26 |
+
inputBugDescription: document.getElementById('input-bug-description'),
|
| 27 |
+
inputSuggestedFix: document.getElementById('input-suggested-fix'),
|
| 28 |
+
|
| 29 |
+
// Tab elements
|
| 30 |
+
tabs: document.querySelectorAll('.mac-tab'),
|
| 31 |
+
panes: document.querySelectorAll('.tab-pane'),
|
| 32 |
+
|
| 33 |
+
// Theme elements
|
| 34 |
+
themeToggle: document.getElementById('theme-toggle'),
|
| 35 |
+
html: document.documentElement,
|
| 36 |
+
sunIcon: document.getElementById('sun-icon'),
|
| 37 |
+
moonIcon: document.getElementById('moon-icon')
|
| 38 |
+
};
|
| 39 |
+
|
| 40 |
+
let isDone = false;
|
| 41 |
+
|
| 42 |
+
// Theme Logic
|
| 43 |
+
function setTheme(theme) {
|
| 44 |
+
elements.html.setAttribute('data-theme', theme);
|
| 45 |
+
localStorage.setItem('theme', theme);
|
| 46 |
+
|
| 47 |
+
if (theme === 'dark') {
|
| 48 |
+
elements.sunIcon.classList.add('hidden');
|
| 49 |
+
elements.moonIcon.classList.remove('hidden');
|
| 50 |
+
} else {
|
| 51 |
+
elements.sunIcon.classList.remove('hidden');
|
| 52 |
+
elements.moonIcon.classList.add('hidden');
|
| 53 |
+
}
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
// Initialize theme
|
| 57 |
+
const savedTheme = localStorage.getItem('theme') || 'dark';
|
| 58 |
+
setTheme(savedTheme);
|
| 59 |
+
|
| 60 |
+
elements.themeToggle.addEventListener('click', () => {
|
| 61 |
+
const currentTheme = elements.html.getAttribute('data-theme');
|
| 62 |
+
setTheme(currentTheme === 'dark' ? 'light' : 'dark');
|
| 63 |
+
});
|
| 64 |
+
|
| 65 |
+
// Tab Switching Logic
|
| 66 |
+
elements.tabs.forEach(tab => {
|
| 67 |
+
tab.addEventListener('click', () => {
|
| 68 |
+
const target = tab.getAttribute('data-tab');
|
| 69 |
+
|
| 70 |
+
// Update tabs
|
| 71 |
+
elements.tabs.forEach(t => t.classList.remove('active'));
|
| 72 |
+
tab.classList.add('active');
|
| 73 |
+
|
| 74 |
+
// Update panes
|
| 75 |
+
elements.panes.forEach(pane => {
|
| 76 |
+
if (pane.id === `tab-${target}`) {
|
| 77 |
+
pane.classList.add('active');
|
| 78 |
+
} else {
|
| 79 |
+
pane.classList.remove('active');
|
| 80 |
+
}
|
| 81 |
+
});
|
| 82 |
+
});
|
| 83 |
+
});
|
| 84 |
+
|
| 85 |
+
// Initialize Environment
|
| 86 |
+
async function resetEnvironment(difficulty = 'easy') {
|
| 87 |
+
elements.submitBtn.disabled = true;
|
| 88 |
+
elements.resetBtn.disabled = true;
|
| 89 |
+
isDone = false;
|
| 90 |
+
|
| 91 |
+
try {
|
| 92 |
+
const res = await fetch(`/reset?difficulty=${difficulty}`, { method: 'POST' });
|
| 93 |
+
if (!res.ok) throw new Error('Failed to reset environment');
|
| 94 |
+
const data = await res.json();
|
| 95 |
+
updateObservation(data.observation);
|
| 96 |
+
|
| 97 |
+
// clear form
|
| 98 |
+
elements.form.reset();
|
| 99 |
+
document.getElementById('observation-section').classList.remove('environment-done');
|
| 100 |
+
hideToast();
|
| 101 |
+
} catch (e) {
|
| 102 |
+
showToast('Error', e.message, true);
|
| 103 |
+
} finally {
|
| 104 |
+
elements.submitBtn.disabled = false;
|
| 105 |
+
elements.resetBtn.disabled = false;
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
function updateObservation(obs) {
|
| 110 |
+
elements.badgeDifficulty.textContent = obs.difficulty.toUpperCase();
|
| 111 |
+
elements.badgeStep.textContent = `Step ${obs.step_number}/${obs.max_steps}`;
|
| 112 |
+
elements.taskDescription.textContent = obs.task_description;
|
| 113 |
+
elements.langBadge.textContent = `Language: ${obs.language}`;
|
| 114 |
+
|
| 115 |
+
// Update code block and highlight
|
| 116 |
+
elements.codeSnippet.textContent = obs.code_snippet;
|
| 117 |
+
elements.codeSnippet.className = `language-${obs.language}`;
|
| 118 |
+
hljs.highlightElement(elements.codeSnippet);
|
| 119 |
+
|
| 120 |
+
if (obs.previous_feedback) {
|
| 121 |
+
elements.previousFeedback.textContent = obs.previous_feedback;
|
| 122 |
+
elements.feedbackContainer.classList.remove('hidden');
|
| 123 |
+
} else {
|
| 124 |
+
elements.feedbackContainer.classList.add('hidden');
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
if (obs.step_number >= obs.max_steps) {
|
| 128 |
+
isDone = true;
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
// Submit Step
|
| 133 |
+
elements.form.addEventListener('submit', async (e) => {
|
| 134 |
+
e.preventDefault();
|
| 135 |
+
if (isDone) {
|
| 136 |
+
showToast('Environment Finished', 'Please reset to start a new episode.', true);
|
| 137 |
+
return;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
const action = {
|
| 141 |
+
bug_identified: elements.inputBugIdentified.value === 'true',
|
| 142 |
+
bug_location: elements.inputBugLocation.value,
|
| 143 |
+
bug_type: elements.inputBugType.value,
|
| 144 |
+
bug_description: elements.inputBugDescription.value,
|
| 145 |
+
severity: elements.inputSeverity.value,
|
| 146 |
+
suggested_fix: elements.inputSuggestedFix.value
|
| 147 |
+
};
|
| 148 |
+
|
| 149 |
+
elements.submitBtn.disabled = true;
|
| 150 |
+
elements.submitBtn.textContent = "Submitting...";
|
| 151 |
+
|
| 152 |
+
try {
|
| 153 |
+
const res = await fetch('/step', {
|
| 154 |
+
method: 'POST',
|
| 155 |
+
headers: { 'Content-Type': 'application/json' },
|
| 156 |
+
body: JSON.stringify(action)
|
| 157 |
+
});
|
| 158 |
+
|
| 159 |
+
if (!res.ok) {
|
| 160 |
+
const err = await res.json();
|
| 161 |
+
throw new Error(err.detail || 'Failed to submit action');
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
const data = await res.json();
|
| 165 |
+
updateObservation(data.observation);
|
| 166 |
+
|
| 167 |
+
if (data.done) {
|
| 168 |
+
isDone = true;
|
| 169 |
+
const totalScore = data.info?.total_score || data.reward;
|
| 170 |
+
showToast('Episode Completed!', `Final Score: ${totalScore.toFixed(2)}`, false);
|
| 171 |
+
document.getElementById('observation-section').classList.add('environment-done');
|
| 172 |
+
} else {
|
| 173 |
+
showToast('Step Evaluated', `Step Reward: ${data.reward.toFixed(2)}`, false);
|
| 174 |
+
}
|
| 175 |
+
} catch (e) {
|
| 176 |
+
showToast('Action Failed', e.message, true);
|
| 177 |
+
} finally {
|
| 178 |
+
elements.submitBtn.disabled = false;
|
| 179 |
+
elements.submitBtn.textContent = "Submit Action";
|
| 180 |
+
}
|
| 181 |
+
});
|
| 182 |
+
|
| 183 |
+
// Reset button
|
| 184 |
+
elements.resetBtn.addEventListener('click', () => {
|
| 185 |
+
const randomDifficulty = ['easy', 'medium', 'hard'][Math.floor(Math.random() * 3)];
|
| 186 |
+
resetEnvironment(randomDifficulty);
|
| 187 |
+
});
|
| 188 |
+
|
| 189 |
+
// Toast functionality
|
| 190 |
+
let toastTimeout;
|
| 191 |
+
function showToast(title, message, isError = false) {
|
| 192 |
+
elements.toastTitle.textContent = title;
|
| 193 |
+
elements.toastMessage.textContent = message;
|
| 194 |
+
elements.toastMessage.style.color = isError ? 'var(--error)' : 'var(--success)';
|
| 195 |
+
elements.toast.classList.remove('hidden');
|
| 196 |
+
|
| 197 |
+
clearTimeout(toastTimeout);
|
| 198 |
+
toastTimeout = setTimeout(hideToast, 4000);
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
function hideToast() {
|
| 202 |
+
elements.toast.classList.add('hidden');
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
elements.toastClose.addEventListener('click', hideToast);
|
| 206 |
+
|
| 207 |
+
// Initial Load
|
| 208 |
+
resetEnvironment();
|
| 209 |
+
});
|
static/style.css
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
:root {
|
| 2 |
+
--secondary: #52525b;
|
| 3 |
+
}
|
| 4 |
+
|
| 5 |
+
/* Default to Dark Mode */
|
| 6 |
+
[data-theme='dark'] {
|
| 7 |
+
--bg-primary: #000000;
|
| 8 |
+
--bg-card: #151515;
|
| 9 |
+
--bg-input: #1f1f1f;
|
| 10 |
+
--border-card: #2e2e2e;
|
| 11 |
+
--border-input: #3e3e3e;
|
| 12 |
+
--accent-primary: #76b900; /* NVIDIA Green */
|
| 13 |
+
--accent-hover: #88d400;
|
| 14 |
+
--accent-glow: rgba(118, 185, 0, 0.2);
|
| 15 |
+
--text-main: #ffffff;
|
| 16 |
+
--text-muted: #a1a1aa;
|
| 17 |
+
--code-bg: #09090b;
|
| 18 |
+
--header-bg: #1a1a1a;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
/* Light Mode */
|
| 22 |
+
[data-theme='light'] {
|
| 23 |
+
--bg-primary: #f5f5f7;
|
| 24 |
+
--bg-card: #ffffff;
|
| 25 |
+
--bg-input: #ffffff;
|
| 26 |
+
--border-card: #d2d2d7;
|
| 27 |
+
--border-input: #e5e7eb;
|
| 28 |
+
--accent-primary: #0071e3; /* Mac Blue */
|
| 29 |
+
--accent-hover: #0077ed;
|
| 30 |
+
--accent-glow: rgba(0, 113, 227, 0.1);
|
| 31 |
+
--text-main: #1d1d1f;
|
| 32 |
+
--text-muted: #6e6e73;
|
| 33 |
+
--code-bg: #f5f5f7;
|
| 34 |
+
--header-bg: #ebebeb;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
:root {
|
| 38 |
+
--success: #76b900;
|
| 39 |
+
--error: #ef4444;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
* {
|
| 43 |
+
box-sizing: border-box;
|
| 44 |
+
margin: 0;
|
| 45 |
+
padding: 0;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
body {
|
| 49 |
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
| 50 |
+
color: var(--text-main);
|
| 51 |
+
background-color: var(--bg-primary);
|
| 52 |
+
min-height: 100vh;
|
| 53 |
+
padding: 2rem;
|
| 54 |
+
position: relative;
|
| 55 |
+
overflow-x: hidden;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
/* Background Subtle Glow */
|
| 59 |
+
#app-background {
|
| 60 |
+
position: fixed;
|
| 61 |
+
top: 0;
|
| 62 |
+
left: 0;
|
| 63 |
+
width: 100%;
|
| 64 |
+
height: 100%;
|
| 65 |
+
background: radial-gradient(circle at 50% 0%, rgba(118, 185, 0, 0.1), transparent 50%);
|
| 66 |
+
z-index: -2;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
.container {
|
| 70 |
+
max-width: 1200px;
|
| 71 |
+
margin: 0 auto;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
header {
|
| 75 |
+
margin-bottom: 2.5rem;
|
| 76 |
+
text-align: center;
|
| 77 |
+
padding: 2rem 0;
|
| 78 |
+
border-bottom: 1px solid var(--border-card);
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
h1 {
|
| 82 |
+
font-size: 2.25rem;
|
| 83 |
+
font-weight: 700;
|
| 84 |
+
letter-spacing: -0.02em;
|
| 85 |
+
color: var(--text-main);
|
| 86 |
+
margin-bottom: 0.5rem;
|
| 87 |
+
text-align: center;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
/* Mac Window Styling */
|
| 91 |
+
.mac-window {
|
| 92 |
+
background: var(--bg-card);
|
| 93 |
+
border: 1px solid var(--border-card);
|
| 94 |
+
border-radius: 12px;
|
| 95 |
+
overflow: hidden;
|
| 96 |
+
box-shadow: 0 20px 50px rgba(0, 0, 0, 0.5);
|
| 97 |
+
margin-top: 1rem;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
.mac-title-bar {
|
| 101 |
+
background: var(--header-bg);
|
| 102 |
+
height: 44px;
|
| 103 |
+
display: flex;
|
| 104 |
+
align-items: center;
|
| 105 |
+
padding: 0 16px;
|
| 106 |
+
border-bottom: 1px solid var(--border-card);
|
| 107 |
+
position: relative;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
.mac-dots {
|
| 111 |
+
display: flex;
|
| 112 |
+
gap: 8px;
|
| 113 |
+
position: absolute;
|
| 114 |
+
left: 16px;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.dot {
|
| 118 |
+
width: 12px;
|
| 119 |
+
height: 12px;
|
| 120 |
+
border-radius: 50%;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.dot.red { background: #ff5f57; }
|
| 124 |
+
.dot.yellow { background: #febc2e; }
|
| 125 |
+
.dot.green { background: #28c840; }
|
| 126 |
+
|
| 127 |
+
.mac-tabs {
|
| 128 |
+
display: flex;
|
| 129 |
+
margin: 0 auto;
|
| 130 |
+
background: #000;
|
| 131 |
+
border-radius: 6px;
|
| 132 |
+
padding: 2px;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
.mac-tab {
|
| 136 |
+
background: transparent;
|
| 137 |
+
border: none;
|
| 138 |
+
color: var(--text-muted);
|
| 139 |
+
padding: 6px 16px;
|
| 140 |
+
font-size: 0.85rem;
|
| 141 |
+
font-weight: 500;
|
| 142 |
+
cursor: pointer;
|
| 143 |
+
border-radius: 4px;
|
| 144 |
+
transition: all 0.2s;
|
| 145 |
+
width: auto;
|
| 146 |
+
margin-bottom: 0;
|
| 147 |
+
text-transform: none;
|
| 148 |
+
letter-spacing: normal;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
.mac-tab:hover {
|
| 152 |
+
color: var(--text-main);
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
.mac-tab.active {
|
| 156 |
+
background: var(--bg-input);
|
| 157 |
+
color: var(--accent-primary);
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
.window-content {
|
| 161 |
+
padding: 2rem;
|
| 162 |
+
min-height: 500px;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
.tab-pane {
|
| 166 |
+
display: none;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
.tab-pane.active {
|
| 170 |
+
display: block;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
h1 {
|
| 174 |
+
font-size: 2.25rem;
|
| 175 |
+
font-weight: 700;
|
| 176 |
+
letter-spacing: -0.02em;
|
| 177 |
+
color: var(--text-main);
|
| 178 |
+
margin-bottom: 0.5rem;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
/* Add a subtle green underline to h1 */
|
| 182 |
+
h1::after {
|
| 183 |
+
content: '';
|
| 184 |
+
display: block;
|
| 185 |
+
width: 60px;
|
| 186 |
+
height: 4px;
|
| 187 |
+
background: var(--accent-primary);
|
| 188 |
+
margin: 1rem auto 0;
|
| 189 |
+
border-radius: 2px;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
p {
|
| 193 |
+
color: var(--text-muted);
|
| 194 |
+
font-size: 1.1rem;
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
.panel {
|
| 198 |
+
background: #1a1a1b;
|
| 199 |
+
border: 1px solid var(--border-card);
|
| 200 |
+
border-radius: 8px;
|
| 201 |
+
padding: 1.75rem;
|
| 202 |
+
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.5);
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
.dashboard {
|
| 206 |
+
display: grid;
|
| 207 |
+
grid-template-columns: 1fr 1fr;
|
| 208 |
+
gap: 2rem;
|
| 209 |
+
align-items: start;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
@media (max-width: 900px) {
|
| 213 |
+
.dashboard {
|
| 214 |
+
grid-template-columns: 1fr;
|
| 215 |
+
}
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
/* Common Panel Header */
|
| 219 |
+
.panel-header {
|
| 220 |
+
display: flex;
|
| 221 |
+
justify-content: space-between;
|
| 222 |
+
align-items: center;
|
| 223 |
+
border-bottom: 1px solid var(--border-card);
|
| 224 |
+
padding-bottom: 1rem;
|
| 225 |
+
margin-bottom: 1.25rem;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
h2 {
|
| 229 |
+
font-size: 1.25rem;
|
| 230 |
+
font-weight: 600;
|
| 231 |
+
display: flex;
|
| 232 |
+
align-items: center;
|
| 233 |
+
gap: 0.5rem;
|
| 234 |
+
color: var(--text-main);
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
.icon {
|
| 238 |
+
color: var(--accent-primary);
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
.badge-row {
|
| 242 |
+
display: flex;
|
| 243 |
+
gap: 0.5rem;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
.badge {
|
| 247 |
+
background: var(--bg-input);
|
| 248 |
+
border: 1px solid var(--border-input);
|
| 249 |
+
padding: 0.25rem 0.75rem;
|
| 250 |
+
border-radius: 4px;
|
| 251 |
+
font-size: 0.75rem;
|
| 252 |
+
font-weight: 600;
|
| 253 |
+
letter-spacing: 0.05em;
|
| 254 |
+
text-transform: uppercase;
|
| 255 |
+
color: var(--accent-primary);
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
/* Observation Panel */
|
| 259 |
+
.task-info {
|
| 260 |
+
margin-bottom: 1.25rem;
|
| 261 |
+
font-size: 0.95rem;
|
| 262 |
+
line-height: 1.6;
|
| 263 |
+
color: #e4e4e7;
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
.feedback-info {
|
| 267 |
+
background: rgba(239, 68, 68, 0.1);
|
| 268 |
+
border: 1px solid rgba(239, 68, 68, 0.2);
|
| 269 |
+
border-left: 3px solid var(--error);
|
| 270 |
+
border-radius: 4px;
|
| 271 |
+
padding: 1rem;
|
| 272 |
+
margin-bottom: 1rem;
|
| 273 |
+
font-size: 0.9rem;
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
.hidden {
|
| 277 |
+
display: none !important;
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
.code-container {
|
| 281 |
+
background: var(--code-bg);
|
| 282 |
+
border-radius: 6px;
|
| 283 |
+
overflow: hidden;
|
| 284 |
+
border: 1px solid var(--border-card);
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.code-header {
|
| 288 |
+
background: var(--header-bg);
|
| 289 |
+
padding: 0.5rem 1rem;
|
| 290 |
+
font-size: 0.75rem;
|
| 291 |
+
color: var(--text-muted);
|
| 292 |
+
border-bottom: 1px solid var(--border-card);
|
| 293 |
+
display: flex;
|
| 294 |
+
justify-content: flex-end;
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
pre {
|
| 298 |
+
margin: 0;
|
| 299 |
+
padding: 1rem;
|
| 300 |
+
font-family: 'JetBrains Mono', 'Roboto Mono', monospace;
|
| 301 |
+
font-size: 0.85rem;
|
| 302 |
+
overflow-x: auto;
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
/* Action Panel Form */
|
| 306 |
+
.form-group {
|
| 307 |
+
margin-bottom: 1.25rem;
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
label {
|
| 311 |
+
display: block;
|
| 312 |
+
font-size: 0.85rem;
|
| 313 |
+
font-weight: 600;
|
| 314 |
+
color: #d4d4d8;
|
| 315 |
+
margin-bottom: 0.4rem;
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
input, select, textarea {
|
| 319 |
+
width: 100%;
|
| 320 |
+
background: var(--bg-input);
|
| 321 |
+
border: 1px solid var(--border-input);
|
| 322 |
+
border-radius: 4px;
|
| 323 |
+
color: var(--text-main);
|
| 324 |
+
padding: 0.65rem 0.875rem;
|
| 325 |
+
font-family: inherit;
|
| 326 |
+
font-size: 0.95rem;
|
| 327 |
+
transition: border-color 0.15s, box-shadow 0.15s;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
input:focus, select:focus, textarea:focus {
|
| 331 |
+
outline: none;
|
| 332 |
+
border-color: var(--accent-primary);
|
| 333 |
+
box-shadow: 0 0 0 1px var(--accent-primary);
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
select option {
|
| 337 |
+
background: var(--bg-primary);
|
| 338 |
+
color: var(--text-main);
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
button {
|
| 342 |
+
width: 100%;
|
| 343 |
+
padding: 0.75rem;
|
| 344 |
+
border: none;
|
| 345 |
+
border-radius: 4px;
|
| 346 |
+
font-family: inherit;
|
| 347 |
+
font-weight: 600;
|
| 348 |
+
font-size: 0.95rem;
|
| 349 |
+
cursor: pointer;
|
| 350 |
+
transition: all 0.2s;
|
| 351 |
+
margin-bottom: 1rem;
|
| 352 |
+
text-transform: uppercase;
|
| 353 |
+
letter-spacing: 0.02em;
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
.primary-btn {
|
| 357 |
+
background: var(--accent-primary);
|
| 358 |
+
color: #000000;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
.primary-btn:hover {
|
| 362 |
+
background: var(--accent-hover);
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
.secondary-btn {
|
| 366 |
+
background: transparent;
|
| 367 |
+
border: 1px solid var(--border-input);
|
| 368 |
+
color: var(--text-main);
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
.secondary-btn:hover {
|
| 372 |
+
background: var(--bg-input);
|
| 373 |
+
border-color: #52525b;
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
/* Toast */
|
| 377 |
+
.toast {
|
| 378 |
+
position: fixed;
|
| 379 |
+
bottom: 2rem;
|
| 380 |
+
right: 2rem;
|
| 381 |
+
background: var(--bg-card);
|
| 382 |
+
border: 1px solid var(--accent-primary);
|
| 383 |
+
border-left: 4px solid var(--accent-primary);
|
| 384 |
+
padding: 1rem 1.25rem;
|
| 385 |
+
border-radius: 4px;
|
| 386 |
+
display: flex;
|
| 387 |
+
justify-content: space-between;
|
| 388 |
+
align-items: center;
|
| 389 |
+
box-shadow: 0 10px 25px rgba(0,0,0,0.5), 0 0 15px var(--accent-glow);
|
| 390 |
+
transform: translateY(100px);
|
| 391 |
+
opacity: 0;
|
| 392 |
+
transition: transform 0.3s, opacity 0.3s;
|
| 393 |
+
z-index: 100;
|
| 394 |
+
min-width: 320px;
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
/* Theme Toggle Button */
|
| 398 |
+
.theme-toggle {
|
| 399 |
+
position: absolute;
|
| 400 |
+
right: 16px;
|
| 401 |
+
background: transparent;
|
| 402 |
+
border: none;
|
| 403 |
+
cursor: pointer;
|
| 404 |
+
color: var(--text-muted);
|
| 405 |
+
padding: 4px;
|
| 406 |
+
display: flex;
|
| 407 |
+
align-items: center;
|
| 408 |
+
justify-content: center;
|
| 409 |
+
width: auto;
|
| 410 |
+
margin: 0;
|
| 411 |
+
transition: color 0.2s;
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
.theme-toggle:hover {
|
| 415 |
+
color: var(--text-main);
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
.theme-toggle svg {
|
| 419 |
+
width: 18px;
|
| 420 |
+
height: 18px;
|
| 421 |
+
}
|
| 422 |
+
|
| 423 |
+
@keyframes slideUp {
|
| 424 |
+
to {
|
| 425 |
+
transform: translateY(0);
|
| 426 |
+
opacity: 1;
|
| 427 |
+
}
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
.toast-content {
|
| 431 |
+
display: flex;
|
| 432 |
+
align-items: center;
|
| 433 |
+
gap: 1rem;
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
.toast-icon {
|
| 437 |
+
font-size: 1.25rem;
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
#toast-title {
|
| 441 |
+
font-size: 0.85rem;
|
| 442 |
+
margin-bottom: 0.2rem;
|
| 443 |
+
color: var(--text-muted);
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
#toast-message {
|
| 447 |
+
font-size: 1.1rem;
|
| 448 |
+
font-weight: 600;
|
| 449 |
+
color: var(--text-main);
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
#toast-close {
|
| 453 |
+
background: transparent;
|
| 454 |
+
border: none;
|
| 455 |
+
color: var(--text-muted);
|
| 456 |
+
font-size: 1.5rem;
|
| 457 |
+
cursor: pointer;
|
| 458 |
+
padding: 0;
|
| 459 |
+
width: auto;
|
| 460 |
+
margin: 0;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
#toast-close:hover {
|
| 464 |
+
color: var(--text-main);
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
.environment-done {
|
| 468 |
+
border-color: var(--success);
|
| 469 |
+
box-shadow: 0 0 15px var(--accent-glow);
|
| 470 |
+
}
|
uv.lock
ADDED
|
File without changes
|
validate.sh
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenEnv Submission Validation Script
|
| 4 |
+
|
| 5 |
+
set -e
|
| 6 |
+
echo "βββββββββββββββββββββββββββββββββββββββ"
|
| 7 |
+
echo " OpenEnv Pre-Submission Validation"
|
| 8 |
+
echo "βββββββββββββββββββββββββββββββββββββββ"
|
| 9 |
+
echo ""
|
| 10 |
+
|
| 11 |
+
# 1. Check for required root files
|
| 12 |
+
echo "ββ 1. Required Files ββ"
|
| 13 |
+
FILES=("openenv.yaml" "inference.py" "README.md" "Dockerfile" "requirements.txt")
|
| 14 |
+
for file in "${FILES[@]}"; do
|
| 15 |
+
if [ -f "$file" ]; then
|
| 16 |
+
echo " β
$file"
|
| 17 |
+
else
|
| 18 |
+
echo " β Missing $file"
|
| 19 |
+
exit 1
|
| 20 |
+
fi
|
| 21 |
+
done
|
| 22 |
+
echo ""
|
| 23 |
+
|
| 24 |
+
# 2. Check server/ module structure
|
| 25 |
+
echo "ββ 2. Server Module Structure ββ"
|
| 26 |
+
SERVER_FILES=("server/__init__.py" "server/app.py" "server/models.py" "server/environment.py" "server/tasks.py" "server/grader.py")
|
| 27 |
+
for file in "${SERVER_FILES[@]}"; do
|
| 28 |
+
if [ -f "$file" ]; then
|
| 29 |
+
echo " β
$file"
|
| 30 |
+
else
|
| 31 |
+
echo " β Missing $file"
|
| 32 |
+
exit 1
|
| 33 |
+
fi
|
| 34 |
+
done
|
| 35 |
+
echo ""
|
| 36 |
+
|
| 37 |
+
# 3. Activate venv & validate Python imports
|
| 38 |
+
echo "ββ 3. Python Import Validation ββ"
|
| 39 |
+
source venv/bin/activate
|
| 40 |
+
python3 -c "
|
| 41 |
+
from server.tasks import TASKS
|
| 42 |
+
from server.grader import grade_action
|
| 43 |
+
from server.environment import CodeSecurityEnv
|
| 44 |
+
from server.models import CodeReviewAction, CodeObservation, StepResult, StateResponse, ResetResponse, TaskInfo
|
| 45 |
+
|
| 46 |
+
assert len(TASKS) >= 3, f'Expected 3+ tasks, got {len(TASKS)}'
|
| 47 |
+
print(' β
All imports resolve correctly')
|
| 48 |
+
print(f' Tasks: {list(TASKS.keys())}')
|
| 49 |
+
" || { echo " β Python import validation failed"; exit 1; }
|
| 50 |
+
echo ""
|
| 51 |
+
|
| 52 |
+
# 4. Quick grader smoke test
|
| 53 |
+
echo "ββ 4. Grader Smoke Test ββ"
|
| 54 |
+
python3 -c "
|
| 55 |
+
from server.environment import CodeSecurityEnv
|
| 56 |
+
from server.models import Action
|
| 57 |
+
|
| 58 |
+
env = CodeSecurityEnv()
|
| 59 |
+
obs = env.reset('python-off-by-one')
|
| 60 |
+
result = env.step(Action(**{
|
| 61 |
+
'bug_identified': True,
|
| 62 |
+
'bug_location': 'range(len(transactions) + 1)',
|
| 63 |
+
'bug_type': 'logic-error',
|
| 64 |
+
'bug_description': 'Off-by-one index error β the range goes one past the end causing an out of bounds IndexError',
|
| 65 |
+
'severity': 'medium',
|
| 66 |
+
'suggested_fix': 'Use range(len(transactions)) to fix the boundary',
|
| 67 |
+
}))
|
| 68 |
+
assert 0.0 <= result.reward <= 1.0, f'Reward out of range: {result.reward}'
|
| 69 |
+
assert result.done is True
|
| 70 |
+
print(f' β
Grader returned reward={result.reward:.4f}, done={result.done}')
|
| 71 |
+
|
| 72 |
+
# Verify zero-reward path
|
| 73 |
+
env2 = CodeSecurityEnv()
|
| 74 |
+
env2.reset('python-off-by-one')
|
| 75 |
+
r2 = env2.step(Action(**{
|
| 76 |
+
'bug_identified': False,
|
| 77 |
+
'bug_location': '',
|
| 78 |
+
'bug_type': 'none',
|
| 79 |
+
'bug_description': 'No bug found',
|
| 80 |
+
'severity': 'none',
|
| 81 |
+
'suggested_fix': '',
|
| 82 |
+
}))
|
| 83 |
+
assert r2.reward == 0.0, f'Expected 0.0 for no-bug, got {r2.reward}'
|
| 84 |
+
print(f' β
No-bug path returns reward=0.0')
|
| 85 |
+
" || { echo " β Grader smoke test failed"; exit 1; }
|
| 86 |
+
echo ""
|
| 87 |
+
|
| 88 |
+
# 5. Validate openenv.yaml
|
| 89 |
+
echo "ββ 5. openenv.yaml Validation ββ"
|
| 90 |
+
python3 -c "
|
| 91 |
+
import yaml
|
| 92 |
+
with open('openenv.yaml', 'r') as f:
|
| 93 |
+
data = yaml.safe_load(f)
|
| 94 |
+
assert 'name' in data, 'Missing name field'
|
| 95 |
+
assert 'tasks' in data, 'Missing tasks field'
|
| 96 |
+
assert len(data['tasks']) >= 3, f'Need 3+ tasks, got {len(data[\"tasks\"])}'
|
| 97 |
+
print(f' β
Valid YAML with {len(data[\"tasks\"])} tasks')
|
| 98 |
+
" || { echo " β openenv.yaml validation failed"; exit 1; }
|
| 99 |
+
echo ""
|
| 100 |
+
|
| 101 |
+
echo "βββββββββββββββββββββββββββββββββββββββ"
|
| 102 |
+
echo " β
All checks passed!"
|
| 103 |
+
echo "βββββββββββββββββββββββββββββββββββββββ"
|