Spaces:
Configuration error
Configuration error
Upload 34 files
Browse files- Dockerfile +40 -0
- README.md +326 -5
- RULES.md +261 -0
- __pycache__/app.cpython-313.pyc +0 -0
- app.py +145 -0
- baseline_agent.py +316 -0
- corpus/__init__.py +1 -0
- corpus/__pycache__/__init__.cpython-313.pyc +0 -0
- corpus/__pycache__/snippets.cpython-313.pyc +0 -0
- corpus/snippets.py +390 -0
- env/__init__.py +1 -0
- env/__pycache__/__init__.cpython-313.pyc +0 -0
- env/__pycache__/environment.cpython-313.pyc +0 -0
- env/__pycache__/models.cpython-313.pyc +0 -0
- env/environment.py +317 -0
- env/models.py +117 -0
- graders/__init__.py +1 -0
- graders/__pycache__/__init__.cpython-313.pyc +0 -0
- graders/__pycache__/graders.cpython-313.pyc +0 -0
- graders/graders.py +313 -0
- inference.py +304 -0
- openenv-code-review.tar.gz +3 -0
- openenv.yaml +163 -0
- pyproject.toml +31 -0
- requirements.txt +7 -0
- server/__init__.py +1 -0
- server/app.py +34 -0
- templates/index.html +807 -0
- tests/__init__.py +1 -0
- tests/__pycache__/__init__.cpython-313.pyc +0 -0
- tests/__pycache__/test_env.cpython-313-pytest-9.0.3.pyc +0 -0
- tests/test_env.py +269 -0
- uv.lock +0 -0
- validate-submission.sh +185 -0
Dockerfile
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Build stage ----
FROM python:3.11-slim AS builder

WORKDIR /app

# Install dependencies into a virtual environment so the runtime stage can
# copy a self-contained /opt/venv without carrying any build tooling.
COPY requirements.txt .
RUN python -m venv /opt/venv && \
    /opt/venv/bin/pip install --upgrade pip && \
    /opt/venv/bin/pip install --no-cache-dir -r requirements.txt

# ---- Runtime stage ----
FROM python:3.11-slim

# HF Spaces expects the app to listen on port 7860
ENV PORT=7860 \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PATH="/opt/venv/bin:$PATH"

WORKDIR /app

# Create the non-root user (HF Spaces security requirement) BEFORE copying
# files so ownership can be set at COPY time. The previous post-hoc
# `RUN chown -R appuser:appuser /app` rewrote every file in a new layer,
# roughly doubling the size contributed by /app in the final image.
RUN useradd -m -u 1000 appuser

# Copy virtual env from builder, owned by the runtime user
COPY --from=builder --chown=appuser:appuser /opt/venv /opt/venv

# Copy application code, owned by the runtime user.
# NOTE(review): this copies the whole build context, including __pycache__/
# directories and openenv-code-review.tar.gz — add a .dockerignore to slim
# the image.
COPY --chown=appuser:appuser . .

USER appuser

EXPOSE 7860

# Health check — uses stdlib urllib so no extra packages (e.g. curl) are needed
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')"

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
README.md
CHANGED
|
@@ -1,10 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
| 1 |
+
# 🔍 CodeReview OpenEnv
|
| 2 |
+
|
| 3 |
+
An **OpenEnv-compliant AI training environment** that simulates professional Python code review. Agents learn to identify bugs, security vulnerabilities, performance bottlenecks, style issues, and documentation gaps — exactly as a senior engineer would in a real pull-request workflow.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Why Code Review?
|
| 8 |
+
|
| 9 |
+
Code review is one of the highest-leverage tasks in software engineering. It is:
|
| 10 |
+
|
| 11 |
+
- **Real-world**: Every professional software team does it daily
|
| 12 |
+
- **Structured enough to grade**: Issues have objectively correct or incorrect assessments
|
| 13 |
+
- **Rich in partial signal**: An agent that spots 3/5 critical issues is measurably better than one that spots 1/5
|
| 14 |
+
- **Scalable in difficulty**: Easy (bugs only) → Hard (all categories + written summary)
|
| 15 |
+
|
| 16 |
+
This makes it an ideal domain for training and evaluating LLM-based agents on multi-step reasoning and quality estimation tasks.
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## Environment Description
|
| 21 |
+
|
| 22 |
+
```
|
| 23 |
+
CodeReviewEnv
|
| 24 |
+
├── Task 1 – Easy : Bug detection + Code style (calculator.py, 31 lines)
|
| 25 |
+
├── Task 2 – Medium : Security + Performance audit (user_service.py, 55 lines)
|
| 26 |
+
└── Task 3 – Hard : Full review, all 5 categories (data_pipeline.py, 49 lines)
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
Each task presents a Python snippet containing intentional flaws. The agent submits `ReviewComment` objects across one or more steps, then finalises with `submit=True`. A deterministic grader scores the review against ground-truth issues.
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Observation Space
|
| 34 |
+
|
| 35 |
+
What the agent sees on each step:
|
| 36 |
+
|
| 37 |
+
| Field | Type | Description |
|
| 38 |
+
|---|---|---|
|
| 39 |
+
| `task_id` | `str` | Active task identifier |
|
| 40 |
+
| `step` | `int` | Current step (0-indexed) |
|
| 41 |
+
| `snippet.file_name` | `str` | Logical file name (e.g. `auth.py`) |
|
| 42 |
+
| `snippet.source` | `str` | Full Python source code |
|
| 43 |
+
| `instructions` | `str` | Review scope, difficulty, and guidance |
|
| 44 |
+
| `previous_comments` | `list[ReviewComment]` | All comments submitted so far |
|
| 45 |
+
| `feedback` | `str \| None` | Env feedback on the last action |
|
| 46 |
+
| `done` | `bool` | Whether the episode has ended |
|
| 47 |
+
|
| 48 |
+
---
|
| 49 |
+
|
| 50 |
+
## Action Space
|
| 51 |
+
|
| 52 |
+
What the agent submits on each step:
|
| 53 |
+
|
| 54 |
+
```json
|
| 55 |
+
{
|
| 56 |
+
"comments": [
|
| 57 |
+
{
|
| 58 |
+
"line": 10,
|
| 59 |
+
"category": "security",
|
| 60 |
+
"severity": "critical",
|
| 61 |
+
"message": "SQL injection via string interpolation in query.",
|
| 62 |
+
"suggestion": "Use parameterised queries: cursor.execute('...', (username,))"
|
| 63 |
+
}
|
| 64 |
+
],
|
| 65 |
+
"summary": "Overall review summary (required for task_3_hard)",
|
| 66 |
+
"submit": true
|
| 67 |
+
}
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
| Field | Type | Values |
|
| 71 |
+
|---|---|---|
|
| 72 |
+
| `comments[].line` | `int \| null` | 1-indexed line number; `null` for file-level |
|
| 73 |
+
| `comments[].category` | `enum` | `bug`, `security`, `performance`, `style`, `documentation` |
|
| 74 |
+
| `comments[].severity` | `enum` | `low`, `medium`, `high`, `critical` |
|
| 75 |
+
| `comments[].message` | `str` | 5–500 chars |
|
| 76 |
+
| `comments[].suggestion` | `str \| null` | Optional fix suggestion |
|
| 77 |
+
| `summary` | `str \| null` | Required for `task_3_hard`, optional otherwise |
|
| 78 |
+
| `submit` | `bool` | `true` finalises the review and triggers the grader |
|
| 79 |
+
|
| 80 |
---
|
| 81 |
+
|
| 82 |
+
## Reward Function
|
| 83 |
+
|
| 84 |
+
Rewards are shaped to provide signal over the **full trajectory**, not just on terminal submit.
|
| 85 |
+
|
| 86 |
+
### Per-step (incremental) rewards
|
| 87 |
+
|
| 88 |
+
| Event | Reward |
|
| 89 |
+
|---|---|
|
| 90 |
+
| New valid comment added | `+0.05` per comment (max `+0.15`) |
|
| 91 |
+
| Progress signal (grader score delta) | `+0.5 × Δscore` |
|
| 92 |
+
| Empty step (no new comments) | `−0.05` |
|
| 93 |
+
| Spam (> 2.5× expected comments) | `−0.10` |
|
| 94 |
+
|
| 95 |
+
### On `submit=True` (terminal)
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
submit_reward = score × 0.8 + (0.2 if score ≥ threshold else −0.2)
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
### Per-category penalties (applied to terminal grader score)
|
| 102 |
+
|
| 103 |
+
| Event | Penalty |
|
| 104 |
+
|---|---|
|
| 105 |
+
| False positive (fabricated issue) | `−0.08–0.12` per comment |
|
| 106 |
+
| Missed CRITICAL security issue | `−0.15–0.20` |
|
| 107 |
+
| Missed HIGH issue | `−0.08–0.10` |
|
| 108 |
+
| No summary on task 3 | `−0.10` |
|
| 109 |
+
|
| 110 |
+
All rewards are clipped to `[−1.0, 1.0]`.
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## Task Descriptions
|
| 115 |
+
|
| 116 |
+
### Task 1 – Easy: Bug Detection & Style Review
|
| 117 |
+
**File**: `calculator.py` (31 lines) | **Max steps**: 5 | **Pass threshold**: 0.55
|
| 118 |
+
|
| 119 |
+
Covers basic utility functions: `divide`, `average`, `celsius_to_fahrenheit`, `find_max`, `count_words`.
|
| 120 |
+
|
| 121 |
+
**Ground-truth issues (6)**:
|
| 122 |
+
- `divide()` — no zero-division guard (HIGH bug)
|
| 123 |
+
- `average()` — crashes on empty list (HIGH bug)
|
| 124 |
+
- `celsius_to_fahrenheit` — off-by-one (+31 vs +32) (MEDIUM bug)
|
| 125 |
+
- `find_max()` — crashes on empty list (MEDIUM bug)
|
| 126 |
+
- `for i in range(len(lst))` — unpythonic iteration (LOW style)
|
| 127 |
+
- Manual `Counter` reimplementation (LOW style)
|
| 128 |
+
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
### Task 2 – Medium: Security & Performance Audit
|
| 132 |
+
**File**: `user_service.py` (55 lines) | **Max steps**: 7 | **Pass threshold**: 0.60
|
| 133 |
+
|
| 134 |
+
A SQLite-backed user management service with authentication.
|
| 135 |
+
|
| 136 |
+
**Ground-truth issues (6)**:
|
| 137 |
+
- SQL injection in `get_user()` — f-string query (CRITICAL security)
|
| 138 |
+
- MD5 password hashing in `create_user()` (CRITICAL security)
|
| 139 |
+
- SQL injection in `delete_user()` (CRITICAL security)
|
| 140 |
+
- MD5 reuse in `authenticate()` (HIGH security)
|
| 141 |
+
- `fetchall()` on unbounded table (HIGH performance)
|
| 142 |
+
- New DB connection per query, no pooling (MEDIUM performance)
|
| 143 |
+
|
| 144 |
+
---
|
| 145 |
+
|
| 146 |
+
### Task 3 – Hard: Comprehensive Code Review
|
| 147 |
+
**File**: `data_pipeline.py` (49 lines) | **Max steps**: 10 | **Pass threshold**: 0.65
|
| 148 |
+
|
| 149 |
+
An analytics data pipeline with CSV loading, row transformation, caching, and stats.
|
| 150 |
+
|
| 151 |
+
**Ground-truth issues (13 across all 5 categories)**:
|
| 152 |
+
- `subprocess.run(shell=True)` with user input — OS command injection (CRITICAL security)
|
| 153 |
+
- `pickle.loads()` on arbitrary cache data — RCE risk (CRITICAL security)
|
| 154 |
+
- Pickling into module-level dict (HIGH security)
|
| 155 |
+
- `compute_stats()` ZeroDivisionError on empty data (HIGH bug)
|
| 156 |
+
- Missing `"value"` key → silent KeyError (MEDIUM bug)
|
| 157 |
+
- `open()` without encoding (MEDIUM bug)
|
| 158 |
+
- Two-pass iteration in `compute_stats` (MEDIUM performance)
|
| 159 |
+
- Subprocess per row instead of batching (MEDIUM performance)
|
| 160 |
+
- `str(stats)` instead of JSON export (LOW style)
|
| 161 |
+
- Module-level mutable global cache (LOW style)
|
| 162 |
+
- `load_data()` missing docstring (LOW documentation)
|
| 163 |
+
- `process_row()` missing docstring (LOW documentation)
|
| 164 |
+
- Insufficient module-level docstring (LOW documentation)
|
| 165 |
+
|
| 166 |
+
A **written summary** is required (`summary` field) — absence incurs a `−0.10` score penalty.
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
## Expected Baseline Scores (gpt-4o)
|
| 171 |
+
|
| 172 |
+
| Task | Score | Pass? | Notes |
|
| 173 |
+
|---|---|---|---|
|
| 174 |
+
| `task_1_easy` | ~0.75 | ✅ | GPT-4o reliably spots ZeroDivisionError and off-by-one |
|
| 175 |
+
| `task_2_medium` | ~0.65 | ✅ | SQL injection found; MD5 usually flagged; perf issues partial |
|
| 176 |
+
| `task_3_hard` | ~0.55 | ❌ | Pickle RCE and shell injection found; docs often missed (below the 0.65 pass threshold) |
|
| 177 |
+
|
| 178 |
+
---
|
| 179 |
+
|
| 180 |
+
## Setup & Usage
|
| 181 |
+
|
| 182 |
+
### Option A — Docker (recommended)
|
| 183 |
+
|
| 184 |
+
```bash
|
| 185 |
+
# Build
|
| 186 |
+
docker build -t code-review-env .
|
| 187 |
+
|
| 188 |
+
# Run (port 7860)
|
| 189 |
+
docker run -p 7860:7860 code-review-env
|
| 190 |
+
|
| 191 |
+
# Test it
|
| 192 |
+
curl http://localhost:7860/health
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
### Option B — Local Python
|
| 196 |
+
|
| 197 |
+
```bash
|
| 198 |
+
# Install dependencies
|
| 199 |
+
pip install -r requirements.txt
|
| 200 |
+
|
| 201 |
+
# Start the server
|
| 202 |
+
uvicorn app:app --host 0.0.0.0 --port 7860 --reload
|
| 203 |
+
|
| 204 |
+
# Open docs
|
| 205 |
+
open http://localhost:7860/docs
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
### Run the test suite
|
| 209 |
+
|
| 210 |
+
```bash
|
| 211 |
+
pytest tests/ -v
|
| 212 |
+
# Expected: 25 passed
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
### Run the baseline agent
|
| 216 |
+
|
| 217 |
+
```bash
|
| 218 |
+
export OPENAI_API_KEY=sk-...
|
| 219 |
+
|
| 220 |
+
# All tasks (direct mode — no server needed)
|
| 221 |
+
python baseline_agent.py
|
| 222 |
+
|
| 223 |
+
# Single task
|
| 224 |
+
python baseline_agent.py --task task_2_medium
|
| 225 |
+
|
| 226 |
+
# Against a running HTTP server
|
| 227 |
+
python baseline_agent.py --mode http --base-url http://localhost:7860
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
---
|
| 231 |
+
|
| 232 |
+
## API Reference
|
| 233 |
+
|
| 234 |
+
| Endpoint | Method | Description |
|
| 235 |
+
|---|---|---|
|
| 236 |
+
| `/` | GET | HTML landing page |
|
| 237 |
+
| `/health` | GET | Health check |
|
| 238 |
+
| `/tasks` | GET | List all task specs |
|
| 239 |
+
| `/reset` | POST | Start or restart an episode |
|
| 240 |
+
| `/step` | POST | Submit an action |
|
| 241 |
+
| `/state` | GET | Get full serialisable state |
|
| 242 |
+
| `/docs` | GET | Interactive Swagger UI |
|
| 243 |
+
|
| 244 |
+
### Example: Full episode via curl
|
| 245 |
+
|
| 246 |
+
```bash
|
| 247 |
+
# 1. Reset
|
| 248 |
+
curl -X POST http://localhost:7860/reset \
|
| 249 |
+
-H 'Content-Type: application/json' \
|
| 250 |
+
-d '{"task_id": "task_1_easy", "session_id": "demo"}'
|
| 251 |
+
|
| 252 |
+
# 2. Step
|
| 253 |
+
curl -X POST http://localhost:7860/step \
|
| 254 |
+
-H 'Content-Type: application/json' \
|
| 255 |
+
-d '{
|
| 256 |
+
"session_id": "demo",
|
| 257 |
+
"action": {
|
| 258 |
+
"comments": [
|
| 259 |
+
{
|
| 260 |
+
"line": 2,
|
| 261 |
+
"category": "bug",
|
| 262 |
+
"severity": "high",
|
| 263 |
+
"message": "divide() will raise ZeroDivisionError when b is 0.",
|
| 264 |
+
"suggestion": "Guard with: if b == 0: raise ValueError"
|
| 265 |
+
}
|
| 266 |
+
],
|
| 267 |
+
"submit": true
|
| 268 |
+
}
|
| 269 |
+
}'
|
| 270 |
+
|
| 271 |
+
# 3. Check state
|
| 272 |
+
curl "http://localhost:7860/state?session_id=demo"
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
---
|
| 276 |
+
|
| 277 |
+
## Project Structure
|
| 278 |
+
|
| 279 |
+
```
|
| 280 |
+
openenv-code-review/
|
| 281 |
+
├── app.py # FastAPI HTTP server
|
| 282 |
+
├── openenv.yaml # OpenEnv spec metadata
|
| 283 |
+
├── Dockerfile # Container definition
|
| 284 |
+
├── requirements.txt
|
| 285 |
+
├── baseline_agent.py # gpt-4o baseline inference script
|
| 286 |
+
│
|
| 287 |
+
├── env/
|
| 288 |
+
│ ├── models.py # Pydantic typed models (Observation, Action, Reward, …)
|
| 289 |
+
│ └── environment.py # CodeReviewEnv — step() / reset() / state()
|
| 290 |
+
│
|
| 291 |
+
├── corpus/
|
| 292 |
+
│ └── snippets.py # Python snippets with ground-truth issues
|
| 293 |
+
│
|
| 294 |
+
├── graders/
|
| 295 |
+
│ └── graders.py # Task1Grader, Task2Grader, Task3Grader
|
| 296 |
+
│
|
| 297 |
+
└── tests/
|
| 298 |
+
└── test_env.py # 25-test pytest suite (all passing)
|
| 299 |
+
```
|
| 300 |
+
|
| 301 |
+
---
|
| 302 |
+
|
| 303 |
+
## Deploying to Hugging Face Spaces
|
| 304 |
+
|
| 305 |
+
1. Create a new Space with **Docker** SDK
|
| 306 |
+
2. Push this repository to the Space
|
| 307 |
+
3. Set `OPENAI_API_KEY` as a Space secret (only needed for baseline script)
|
| 308 |
+
4. The Space will auto-build and expose port 7860
|
| 309 |
+
|
| 310 |
+
```yaml
|
| 311 |
+
# README.md frontmatter for HF Spaces
|
| 312 |
+
---
|
| 313 |
+
title: CodeReview OpenEnv
|
| 314 |
+
emoji: 🔍
|
| 315 |
+
colorFrom: blue
|
| 316 |
+
colorTo: indigo
|
| 317 |
sdk: docker
|
| 318 |
pinned: false
|
| 319 |
+
tags:
|
| 320 |
+
- openenv
|
| 321 |
+
- code-review
|
| 322 |
+
- ai-agent
|
| 323 |
+
- evaluation
|
| 324 |
---
|
| 325 |
+
```
|
| 326 |
+
|
| 327 |
+
---
|
| 328 |
+
|
| 329 |
+
## License
|
| 330 |
|
| 331 |
+
MIT
|
RULES.md
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RULES.md — CodeReview OpenEnv Agent Grounding Rules
|
| 2 |
+
|
| 3 |
+
You are an AI agent operating inside the **CodeReview OpenEnv** environment.
|
| 4 |
+
Read every rule below before generating any action. Violating these rules
|
| 5 |
+
will cause your score to drop or your episode to terminate with a penalty.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## 1. YOUR ONLY JOB
|
| 10 |
+
|
| 11 |
+
You are reviewing a **Python source file** for real issues.
|
| 12 |
+
You are **not** writing code. You are **not** explaining Python concepts.
|
| 13 |
+
You are **not** summarising the file. You are finding specific, locatable
|
| 14 |
+
problems and describing them precisely.
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## 2. OUTPUT FORMAT — NON-NEGOTIABLE
|
| 19 |
+
|
| 20 |
+
You must respond with **one JSON object and nothing else**.
|
| 21 |
+
No markdown. No backticks. No preamble. No explanation outside the JSON.
|
| 22 |
+
|
| 23 |
+
```
|
| 24 |
+
{
|
| 25 |
+
"comments": [ ...ReviewComment objects... ],
|
| 26 |
+
"summary": "string or null",
|
| 27 |
+
"submit": true or false
|
| 28 |
+
}
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
Any response that is not valid JSON will be treated as an empty action
|
| 32 |
+
and penalised with −0.05 reward.
|
| 33 |
+
|
| 34 |
+
---
|
| 35 |
+
|
| 36 |
+
## 3. ReviewComment SCHEMA — EXACT TYPES REQUIRED
|
| 37 |
+
|
| 38 |
+
Every object inside `comments` must have exactly these fields:
|
| 39 |
+
|
| 40 |
+
| Field | Type | Allowed values / constraints |
|
| 41 |
+
|--------------|-----------------|-----------------------------------------------------------|
|
| 42 |
+
| `line` | int or null | 1-indexed line number from the code. null = file-level |
|
| 43 |
+
| `category` | string (enum) | `"bug"` `"security"` `"performance"` `"style"` `"documentation"` |
|
| 44 |
+
| `severity` | string (enum) | `"low"` `"medium"` `"high"` `"critical"` |
|
| 45 |
+
| `message` | string | 5–500 characters. Must describe the SPECIFIC issue. |
|
| 46 |
+
| `suggestion` | string or null | Optional fix. Max 500 characters. |
|
| 47 |
+
|
| 48 |
+
Do not add extra fields. Do not omit required fields. Do not use integers
|
| 49 |
+
for `category` or `severity`.
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## 4. CATEGORY SCOPE — ONLY FLAG WHAT YOU ARE ASKED TO FLAG
|
| 54 |
+
|
| 55 |
+
The `instructions` field in the observation tells you which categories
|
| 56 |
+
to check. **Do not submit comments for categories outside that scope.**
|
| 57 |
+
|
| 58 |
+
- Task 1 (Easy): `bug`, `style` only
|
| 59 |
+
- Task 2 (Medium): `security`, `performance` only
|
| 60 |
+
- Task 3 (Hard): all five categories
|
| 61 |
+
|
| 62 |
+
Submitting comments in the wrong category is treated as a false positive
|
| 63 |
+
and incurs a penalty. The grader will ignore them.
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## 5. LINE NUMBERS — BE PRECISE
|
| 68 |
+
|
| 69 |
+
- Count lines from **1** (the first line of the source is line 1).
|
| 70 |
+
- The source shown in the observation has line numbers prefixed — use them.
|
| 71 |
+
- If you cannot pinpoint a line, use `null` (file-level comment).
|
| 72 |
+
- Do not guess or approximate. Off-by-more-than-3 lines reduces your score.
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## 6. NO FABRICATION
|
| 77 |
+
|
| 78 |
+
Do not invent issues that are not present in the code.
|
| 79 |
+
Every comment you submit must correspond to a real, demonstrable problem
|
| 80 |
+
in the snippet as written. Ask yourself:
|
| 81 |
+
|
| 82 |
+
> "Can I point to the exact line where this fails and show the failure?"
|
| 83 |
+
|
| 84 |
+
If the answer is no, do not submit that comment.
|
| 85 |
+
|
| 86 |
+
False positives reduce your score. Many false positives can bring your
|
| 87 |
+
score below zero.
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
## 7. SEVERITY CALIBRATION
|
| 92 |
+
|
| 93 |
+
Use severity consistently:
|
| 94 |
+
|
| 95 |
+
| Severity | Meaning | Examples |
|
| 96 |
+
|------------|------------------------------------------------------------|---------------------------------------------------|
|
| 97 |
+
| `critical` | Exploitable in production. Immediate risk of data loss, RCE, auth bypass. | SQL injection, pickle.loads on untrusted data, shell=True with user input |
|
| 98 |
+
| `high` | Causes crashes, data corruption, or major security weakness under normal use. | ZeroDivisionError on empty input, MD5 passwords, fetchall() on unbounded table |
|
| 99 |
+
| `medium` | Incorrect behaviour in edge cases, significant performance hit, notable security weakness. | Missing encoding param, off-by-one in formula, O(n) per-row subprocess |
|
| 100 |
+
| `low` | Style, readability, minor inefficiency, missing docs. | Unpythonic loop, manual Counter, missing docstring |
|
| 101 |
+
|
| 102 |
+
Do not mark everything as `critical`. Severity inflation is penalised.
|
| 103 |
+
|
| 104 |
+
---
|
| 105 |
+
|
| 106 |
+
## 8. MESSAGE QUALITY
|
| 107 |
+
|
| 108 |
+
A good message answers three questions:
|
| 109 |
+
1. **What** is wrong?
|
| 110 |
+
2. **Where** exactly (line / function)?
|
| 111 |
+
3. **Why** does it matter?
|
| 112 |
+
|
| 113 |
+
**Good**: `"average() divides by len(numbers) without checking for an empty list; raises ZeroDivisionError when called with []."`
|
| 114 |
+
|
| 115 |
+
**Bad**: `"This function has a bug."` — too vague, will not match ground truth.
|
| 116 |
+
**Bad**: `"Consider adding error handling."` — not specific enough.
|
| 117 |
+
**Bad**: `"Line 8 is problematic."` — no description of the actual problem.
|
| 118 |
+
|
| 119 |
+
Minimum 5 characters. Maximum 500 characters.
|
| 120 |
+
|
| 121 |
+
---
|
| 122 |
+
|
| 123 |
+
## 9. SUGGESTIONS ARE OPTIONAL BUT VALUABLE
|
| 124 |
+
|
| 125 |
+
- If you include a `suggestion`, make it concrete and correct Python.
|
| 126 |
+
- Do not include suggestions that are themselves buggy or insecure.
|
| 127 |
+
- A suggestion that introduces a new vulnerability is worse than no suggestion.
|
| 128 |
+
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
## 10. THE `summary` FIELD
|
| 132 |
+
|
| 133 |
+
- **Task 3 (Hard) only**: `summary` is **required**. Omitting it deducts 0.10 from your score.
|
| 134 |
+
- For Tasks 1 and 2: `summary` is optional. Include it if it adds value.
|
| 135 |
+
- The summary should cover the overall risk level and the main themes found.
|
| 136 |
+
- Mention key categories found: e.g. "security", "injection", "pickle", "performance", "documentation".
|
| 137 |
+
- More relevant keywords in the summary = small score bonus (up to +0.15).
|
| 138 |
+
|
| 139 |
+
---
|
| 140 |
+
|
| 141 |
+
## 11. WHEN TO SET `"submit": true`
|
| 142 |
+
|
| 143 |
+
Set `submit` to `true` when you believe your review is complete.
|
| 144 |
+
The grader runs immediately on submit and the episode ends.
|
| 145 |
+
|
| 146 |
+
Set `submit` to `false` if you want to add more comments in the next step.
|
| 147 |
+
You have `max_steps` steps per episode (varies by task: 5 / 7 / 10).
|
| 148 |
+
|
| 149 |
+
Rules:
|
| 150 |
+
- You MUST set `submit: true` on your final step.
|
| 151 |
+
- If you run out of steps without submitting, the episode auto-terminates.
|
| 152 |
+
- Do not waste steps submitting empty comment lists. Each empty step costs −0.05.
|
| 153 |
+
|
| 154 |
+
Recommended strategy: submit everything in **one step** unless you are
|
| 155 |
+
doing iterative refinement across multiple steps.
|
| 156 |
+
|
| 157 |
+
---
|
| 158 |
+
|
| 159 |
+
## 12. DEDUPLICATION — DO NOT REPEAT YOURSELF
|
| 160 |
+
|
| 161 |
+
The environment deduplicates comments across steps by `(line, category, message[:40])`.
|
| 162 |
+
Submitting the same comment again in a later step gives you zero credit for it.
|
| 163 |
+
Check `previous_comments` in the observation and do not re-submit anything
|
| 164 |
+
already there.
|
| 165 |
+
|
| 166 |
+
---
|
| 167 |
+
|
| 168 |
+
## 13. DO NOT SPAM
|
| 169 |
+
|
| 170 |
+
Submitting more than 2.5× the expected number of comments triggers a spam penalty (−0.10).
|
| 171 |
+
Quality over quantity. If you find 6 real issues, submit 6.
|
| 172 |
+
Do not pad with speculative or low-confidence comments to boost apparent coverage.
|
| 173 |
+
|
| 174 |
+
---
|
| 175 |
+
|
| 176 |
+
## 14. MULTI-STEP STRATEGY (if using more than 1 step)
|
| 177 |
+
|
| 178 |
+
Step 1 — Read carefully. Submit your highest-confidence comments.
|
| 179 |
+
Step 2 — Review `feedback` and `previous_comments` in the observation.
|
| 180 |
+
Add only NEW comments not already submitted.
|
| 181 |
+
Step N — Set `submit: true` when confident you have covered all categories.
|
| 182 |
+
|
| 183 |
+
Do not submit `submit: true` before you have reviewed the whole file.
|
| 184 |
+
|
| 185 |
+
---
|
| 186 |
+
|
| 187 |
+
## 15. WHAT THE GRADER CHECKS
|
| 188 |
+
|
| 189 |
+
The grader matches your comments against a hidden ground-truth list using:
|
| 190 |
+
- **Category match** (exact)
|
| 191 |
+
- **Line proximity** (within ±3 lines)
|
| 192 |
+
- **Keyword overlap** (≥25% of significant words from the truth message appear in yours)
|
| 193 |
+
- **Severity proximity** (within 1 level)
|
| 194 |
+
|
| 195 |
+
You get full credit for exact matches, partial credit (0.5×) for right issue
|
| 196 |
+
wrong line. You get nothing for wrong category, and a penalty for fabricated issues.
|
| 197 |
+
|
| 198 |
+
**Implication**: Write messages in plain, specific language that describes the
|
| 199 |
+
actual vulnerability or flaw. Technical terms matter (e.g. "SQL injection",
|
| 200 |
+
"ZeroDivisionError", "MD5", "shell=True", "pickle.loads").
|
| 201 |
+
|
| 202 |
+
---
|
| 203 |
+
|
| 204 |
+
## 16. FORBIDDEN BEHAVIOURS
|
| 205 |
+
|
| 206 |
+
The following will actively hurt your score:
|
| 207 |
+
|
| 208 |
+
| Behaviour | Consequence |
|
| 209 |
+
|---|---|
|
| 210 |
+
| Responding with non-JSON text | Treated as empty action, −0.05 |
|
| 211 |
+
| Submitting comments in wrong category | False positive penalty |
|
| 212 |
+
| Using categories not in the task scope | False positive penalty |
|
| 213 |
+
| Inventing issues not in the code | False positive penalty per comment |
|
| 214 |
+
| Marking all issues as `critical` | Severity mismatch reduces match score |
|
| 215 |
+
| Repeating already-submitted comments | No credit (deduped) |
|
| 216 |
+
| Submitting > 2.5× expected comments | Spam penalty −0.10 |
|
| 217 |
+
| Omitting `summary` on Task 3 | −0.10 from final score |
|
| 218 |
+
| Calling `submit: true` with 0 comments | Episode ends with near-zero score |
|
| 219 |
+
|
| 220 |
+
---
|
| 221 |
+
|
| 222 |
+
## 17. CHECKLIST BEFORE YOU RESPOND
|
| 223 |
+
|
| 224 |
+
Before generating your JSON, run through this mentally:
|
| 225 |
+
|
| 226 |
+
- [ ] Is my response a single valid JSON object with no surrounding text?
|
| 227 |
+
- [ ] Does every comment have all 5 fields with correct types?
|
| 228 |
+
- [ ] Are all my categories within the task scope defined in `instructions`?
|
| 229 |
+
- [ ] Is every line number accurate (1-indexed from the source)?
|
| 230 |
+
- [ ] Can I justify every comment with a specific line and a concrete failure mode?
|
| 231 |
+
- [ ] Have I avoided re-submitting comments from `previous_comments`?
|
| 232 |
+
- [ ] For Task 3: have I included a `summary` with key technical themes?
|
| 233 |
+
- [ ] Is my severity realistic (not everything is `critical`)?
|
| 234 |
+
- [ ] Should I set `submit: true` now, or do I have more to add?
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
## QUICK REFERENCE
|
| 239 |
+
|
| 240 |
+
```json
|
| 241 |
+
{
|
| 242 |
+
"comments": [
|
| 243 |
+
{
|
| 244 |
+
"line": 10,
|
| 245 |
+
"category": "security",
|
| 246 |
+
"severity": "critical",
|
| 247 |
+
"message": "get_user() interpolates username directly into the SQL query string, enabling SQL injection attacks.",
|
| 248 |
+
"suggestion": "Use parameterised queries: cursor.execute('SELECT * FROM users WHERE username=?', (username,))"
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"line": 19,
|
| 252 |
+
"category": "security",
|
| 253 |
+
"severity": "critical",
|
| 254 |
+
"message": "MD5 is a cryptographically broken and extremely fast hash, unsuitable for password storage: attackers can brute-force billions of MD5 guesses per second.",
|
| 255 |
+
"suggestion": "Replace with bcrypt.hashpw(password.encode(), bcrypt.gensalt()) or hashlib.scrypt."
|
| 256 |
+
}
|
| 257 |
+
],
|
| 258 |
+
"summary": "Critical security issues found: SQL injection on lines 10 and 52, broken MD5 password hashing on lines 19 and 46. Performance issue: fetchall() loads entire table. Connection pooling absent.",
|
| 259 |
+
"submit": true
|
| 260 |
+
}
|
| 261 |
+
```
|
__pycache__/app.cpython-313.pyc
ADDED
|
Binary file (6.58 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI HTTP server for CodeReview OpenEnv.
|
| 3 |
+
|
| 4 |
+
Exposes the environment as a REST API for agents to interact with.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from typing import Any, Dict, Optional
|
| 10 |
+
|
| 11 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 12 |
+
from fastapi.responses import HTMLResponse
|
| 13 |
+
from pydantic import BaseModel
|
| 14 |
+
|
| 15 |
+
from env.environment import CodeReviewEnv, TASK_SPECS
|
| 16 |
+
from env.models import Action, ReviewCategory, ReviewComment, Severity
|
| 17 |
+
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
# App setup
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
|
| 22 |
+
# Single FastAPI application instance; served by uvicorn (HF Spaces port 7860).
app = FastAPI(
    title="CodeReview OpenEnv",
    description="An OpenEnv-compliant AI training environment for Python code review.",
    version="1.0.0",
)

# In-memory session store: session_id -> live environment instance.
# NOTE(review): per-process only — state is lost on restart and would not be
# shared across multiple uvicorn workers.
SESSIONS: Dict[str, CodeReviewEnv] = {}
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
# Request / Response schemas
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
|
| 36 |
+
class ResetRequest(BaseModel):
    """Body for POST /reset: which task to start and under which session."""

    # Must be a key of TASK_SPECS; validated in the /reset endpoint.
    task_id: str = "task_1_easy"
    # Client-chosen identifier; one environment instance is kept per session.
    session_id: str = "default"
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class StepRequest(BaseModel):
    """Body for POST /step: the target session and the raw action payload."""

    # Session previously created by POST /reset.
    session_id: str = "default"
    # Raw action dict; parsed leniently into an Action model in the endpoint.
    action: Dict[str, Any]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
# Endpoints
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
|
| 50 |
+
import os
|
| 51 |
+
|
| 52 |
+
@app.get("/", response_class=HTMLResponse)
|
| 53 |
+
def landing_page():
    """Serve the static HTML landing page, or a minimal error page if absent."""
    here = os.path.dirname(__file__)
    template_path = os.path.join(here, "templates", "index.html")
    if not os.path.exists(template_path):
        return "<html><body><h1>Error: templates/index.html not found.</h1></body></html>"
    with open(template_path, "r", encoding="utf-8") as fh:
        return fh.read()
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@app.get("/health")
|
| 64 |
+
def health():
    """Liveness probe used by the hosting platform; always reports OK."""
    status = {"status": "ok"}
    return status
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@app.get("/tasks")
|
| 70 |
+
def list_tasks():
    """Return the spec of every available task, keyed by task id."""
    specs = {}
    for task_id, spec in TASK_SPECS.items():
        specs[task_id] = spec.model_dump()
    return specs
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@app.post("/reset")
|
| 79 |
+
def reset(req: ResetRequest):
    """Create a fresh environment for req.task_id and bind it to req.session_id."""
    if req.task_id not in TASK_SPECS:
        valid = list(TASK_SPECS.keys())
        raise HTTPException(
            status_code=400,
            detail=f"Unknown task_id '{req.task_id}'. Choose from: {valid}",
        )
    # A reset always replaces any environment already bound to this session.
    environment = CodeReviewEnv(task_id=req.task_id)
    first_obs = environment.reset()
    SESSIONS[req.session_id] = environment
    return {"observation": first_obs.model_dump(), "session_id": req.session_id}
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@app.post("/step")
|
| 93 |
+
def step(req: StepRequest):
|
| 94 |
+
"""Submit an action for the given session."""
|
| 95 |
+
env = SESSIONS.get(req.session_id)
|
| 96 |
+
if env is None:
|
| 97 |
+
raise HTTPException(
|
| 98 |
+
status_code=404,
|
| 99 |
+
detail=f"Session '{req.session_id}' not found. Call /reset first.",
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# Parse the action dict into an Action model
|
| 103 |
+
action_dict = req.action
|
| 104 |
+
comments = []
|
| 105 |
+
for c in action_dict.get("comments", []):
|
| 106 |
+
try:
|
| 107 |
+
comments.append(ReviewComment(
|
| 108 |
+
line=c.get("line"),
|
| 109 |
+
category=ReviewCategory(c.get("category", "bug")),
|
| 110 |
+
severity=Severity(c.get("severity", "medium")),
|
| 111 |
+
message=c.get("message", ""),
|
| 112 |
+
suggestion=c.get("suggestion"),
|
| 113 |
+
))
|
| 114 |
+
except Exception:
|
| 115 |
+
pass # skip malformed comments
|
| 116 |
+
|
| 117 |
+
action = Action(
|
| 118 |
+
comments=comments,
|
| 119 |
+
summary=action_dict.get("summary"),
|
| 120 |
+
submit=action_dict.get("submit", False),
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
try:
|
| 124 |
+
result = env.step(action)
|
| 125 |
+
except RuntimeError as e:
|
| 126 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 127 |
+
|
| 128 |
+
return {
|
| 129 |
+
"observation": result.observation.model_dump(),
|
| 130 |
+
"reward": result.reward.model_dump(),
|
| 131 |
+
"done": result.done,
|
| 132 |
+
"info": result.info,
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
@app.get("/state")
|
| 137 |
+
def get_state(session_id: str = Query(default="default")):
|
| 138 |
+
"""Return full serialisable state for the given session."""
|
| 139 |
+
env = SESSIONS.get(session_id)
|
| 140 |
+
if env is None:
|
| 141 |
+
raise HTTPException(
|
| 142 |
+
status_code=404,
|
| 143 |
+
detail=f"Session '{session_id}' not found. Call /reset first.",
|
| 144 |
+
)
|
| 145 |
+
return env.state().model_dump()
|
baseline_agent.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
baseline_agent.py – Baseline inference script for CodeReview OpenEnv.
|
| 4 |
+
|
| 5 |
+
Runs gpt-4o against all three tasks using the OpenAI client.
|
| 6 |
+
Reads credentials from OPENAI_API_KEY environment variable.
|
| 7 |
+
Connects to the env either locally (direct Python import) or via HTTP.
|
| 8 |
+
|
| 9 |
+
Usage
|
| 10 |
+
-----
|
| 11 |
+
# Direct mode (no server needed):
|
| 12 |
+
python baseline_agent.py
|
| 13 |
+
|
| 14 |
+
# Against a running server:
|
| 15 |
+
python baseline_agent.py --mode http --base-url http://localhost:7860
|
| 16 |
+
|
| 17 |
+
# Single task:
|
| 18 |
+
python baseline_agent.py --task task_2_medium
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import argparse
|
| 24 |
+
import json
|
| 25 |
+
import os
|
| 26 |
+
import sys
|
| 27 |
+
import textwrap
|
| 28 |
+
import time
|
| 29 |
+
from typing import Any, Dict, List, Optional
|
| 30 |
+
|
| 31 |
+
import requests
|
| 32 |
+
from openai import OpenAI
|
| 33 |
+
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
# Configuration
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
|
| 38 |
+
# Model and credentials come from the environment so no code change is needed
# to switch models or keys.
MODEL = os.environ.get("BASELINE_MODEL", "gpt-4o")
API_KEY = os.environ.get("OPENAI_API_KEY", "")
# Only used in --mode http (address of a running CodeReview OpenEnv server).
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")
# All task ids, in increasing difficulty order.
TASKS = ["task_1_easy", "task_2_medium", "task_3_hard"]
|
| 42 |
+
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
# Prompt construction
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
|
| 47 |
+
SYSTEM_PROMPT = textwrap.dedent("""
|
| 48 |
+
You are an expert Python code reviewer.
|
| 49 |
+
You will be given a code snippet along with review instructions.
|
| 50 |
+
Your job is to produce a JSON action object that identifies issues in the code.
|
| 51 |
+
|
| 52 |
+
The JSON object you return must match this schema exactly:
|
| 53 |
+
{
|
| 54 |
+
"comments": [
|
| 55 |
+
{
|
| 56 |
+
"line": <int or null>,
|
| 57 |
+
"category": <"bug"|"security"|"performance"|"style"|"documentation">,
|
| 58 |
+
"severity": <"low"|"medium"|"high"|"critical">,
|
| 59 |
+
"message": "<clear description of the issue>",
|
| 60 |
+
"suggestion": "<optional fix>"
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
+
"summary": "<overall assessment – required for hard tasks, optional otherwise>",
|
| 64 |
+
"submit": true
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
Rules:
|
| 68 |
+
- Only flag genuine issues. Do not fabricate problems.
|
| 69 |
+
- Be precise about line numbers (1-indexed from the code).
|
| 70 |
+
- Match the categories listed in the instructions.
|
| 71 |
+
- Always set "submit": true when you believe your review is complete.
|
| 72 |
+
- Return ONLY the JSON object. No markdown, no explanations.
|
| 73 |
+
""").strip()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def build_user_message(observation: dict) -> str:
    """Render one environment observation into the user prompt for the LLM."""
    snippet = observation["snippet"]
    instructions = observation["instructions"]
    previous = observation.get("previous_comments", [])

    # Prefix each source line with a 1-indexed, right-aligned line number.
    source_lines = snippet["source"].splitlines()
    numbered_source = "\n".join(
        f"{idx:3d} {text}" for idx, text in enumerate(source_lines, start=1)
    )

    parts = [f"""
{instructions}

### File: {snippet['file_name']}
```python
{numbered_source}
```
"""]
    if previous:
        # Remind the model what it already said so it does not repeat itself.
        parts.append(f"\n### Your previous comments ({len(previous)} so far):\n")
        for c in previous:
            parts.append(
                f" - L{c.get('line','?')} [{c['category']}] {c['message'][:80]}\n"
            )

    return "".join(parts).strip()
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ---------------------------------------------------------------------------
|
| 103 |
+
# Direct mode (import env directly)
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
|
| 106 |
+
def run_direct(task_id: str, client: OpenAI) -> dict:
    """Run the agent against the environment by direct Python import.

    Loops the LLM until the episode is done or max_steps is reached.
    Returns a summary dict: task_id, steps, total_reward, final_score,
    passed, threshold.
    """
    # Import here to avoid circular dependency when running in HTTP mode
    sys.path.insert(0, os.path.dirname(__file__))
    from env.environment import CodeReviewEnv
    from env.models import Action, ReviewComment, ReviewCategory, Severity

    env = CodeReviewEnv(task_id=task_id)
    obs = env.reset()

    total_reward = 0.0
    final_score = 0.0
    steps_taken = 0

    for step_num in range(env.spec.max_steps):
        user_msg = build_user_message(obs.model_dump())

        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                response_format={"type": "json_object"},
            )
            raw = response.choices[0].message.content or "{}"
            action_dict = json.loads(raw)
        except Exception as e:
            # On any API/JSON failure, submit an empty review so the episode
            # still terminates instead of hanging.
            print(f" [!] LLM error on step {step_num}: {e}")
            action_dict = {"comments": [], "submit": True}

        # Build Action — malformed comment dicts are dropped, not fatal.
        comments = []
        for c in action_dict.get("comments", []):
            try:
                comments.append(ReviewComment(
                    line=c.get("line"),
                    category=ReviewCategory(c.get("category", "bug")),
                    severity=Severity(c.get("severity", "medium")),
                    message=c.get("message", ""),
                    suggestion=c.get("suggestion"),
                ))
            except Exception:
                pass  # skip malformed comments

        action = Action(
            comments=comments,
            summary=action_dict.get("summary"),
            # Default True here (unlike the server's False): the baseline
            # should finish an episode even if the model omits "submit".
            submit=action_dict.get("submit", True),
        )

        result = env.step(action)
        total_reward += result.reward.value
        steps_taken += 1
        # Last grader score seen wins; defaults to 0.0 if absent.
        final_score = result.info.get("grader", {}).get("score", 0.0)

        print(f" Step {step_num+1}: reward={result.reward.value:+.3f} | "
              f"comments={result.info['total_comments']} | "
              f"score={final_score:.3f}")

        obs = result.observation
        if result.done:
            break

    passed = final_score >= env.spec.passing_threshold
    return {
        "task_id": task_id,
        "steps": steps_taken,
        "total_reward": round(total_reward, 4),
        "final_score": round(final_score, 4),
        "passed": passed,
        "threshold": env.spec.passing_threshold,
    }
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ---------------------------------------------------------------------------
|
| 184 |
+
# HTTP mode (against a running server)
|
| 185 |
+
# ---------------------------------------------------------------------------
|
| 186 |
+
|
| 187 |
+
def run_http(task_id: str, client: OpenAI, base_url: str) -> dict:
    """Run the agent against a live HTTP server.

    Same loop as run_direct(), but every env interaction goes through the
    REST API at base_url. Returns the same summary dict shape.
    """
    # Unique per run so parallel invocations don't share server-side state.
    session_id = f"baseline-{task_id}-{int(time.time())}"
    headers = {"Content-Type": "application/json"}

    # Reset
    r = requests.post(f"{base_url}/reset",
                      json={"task_id": task_id, "session_id": session_id}, headers=headers)
    r.raise_for_status()
    obs = r.json()["observation"]

    # Get task spec for threshold
    tasks_r = requests.get(f"{base_url}/tasks")
    spec = tasks_r.json()[task_id]
    max_steps = spec["max_steps"]
    threshold = spec["passing_threshold"]

    total_reward = 0.0
    final_score = 0.0
    steps_taken = 0

    for step_num in range(max_steps):
        user_msg = build_user_message(obs)

        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                response_format={"type": "json_object"},
            )
            action_dict = json.loads(response.choices[0].message.content or "{}")
        except Exception as e:
            # Fall back to an empty submitted review so the episode ends.
            print(f" [!] LLM error: {e}")
            action_dict = {"comments": [], "submit": True}

        # The server parses the raw dict leniently; no client-side validation.
        step_r = requests.post(
            f"{base_url}/step",
            json={"session_id": session_id, "action": action_dict},
            headers=headers,
        )
        step_r.raise_for_status()
        result = step_r.json()

        total_reward += result["reward"]["value"]
        steps_taken += 1
        final_score = result["info"].get("grader", {}).get("score", 0.0)

        print(f" Step {step_num+1}: reward={result['reward']['value']:+.3f} | "
              f"comments={result['info']['total_comments']} | "
              f"score={final_score:.3f}")

        obs = result["observation"]
        if result["done"]:
            break

    return {
        "task_id": task_id,
        "steps": steps_taken,
        "total_reward": round(total_reward, 4),
        "final_score": round(final_score, 4),
        "passed": final_score >= threshold,
        "threshold": threshold,
    }
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
# ---------------------------------------------------------------------------
|
| 257 |
+
# Main
|
| 258 |
+
# ---------------------------------------------------------------------------
|
| 259 |
+
|
| 260 |
+
def main():
    """CLI entry point: run the baseline on one or all tasks and print a table.

    Exits with status 1 if OPENAI_API_KEY is unset. Writes a machine-readable
    copy of the results to baseline_results.json in the working directory.
    """
    parser = argparse.ArgumentParser(description="Baseline agent for CodeReview OpenEnv")
    parser.add_argument("--mode", choices=["direct", "http"], default="direct")
    parser.add_argument("--base-url", default=ENV_BASE_URL)
    parser.add_argument("--task", choices=TASKS + ["all"], default="all")
    args = parser.parse_args()

    if not API_KEY:
        print("ERROR: OPENAI_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY)
    tasks_to_run = TASKS if args.task == "all" else [args.task]

    print(f"\n{'='*60}")
    print(f" CodeReview OpenEnv – Baseline Agent ({MODEL})")
    print(f" Mode: {args.mode}")
    print(f"{'='*60}\n")

    results: List[dict] = []
    for task_id in tasks_to_run:
        print(f"▶ Running {task_id} ...")
        t0 = time.time()
        if args.mode == "direct":
            r = run_direct(task_id, client)
        else:
            r = run_http(task_id, client, args.base_url)
        elapsed = round(time.time() - t0, 1)
        r["elapsed_s"] = elapsed
        results.append(r)
        status = "✅ PASSED" if r["passed"] else "❌ FAILED"
        print(f" → {status} | score={r['final_score']:.3f} | reward={r['total_reward']:+.3f} | {elapsed}s\n")

    # Summary table
    # results is never empty here: tasks_to_run always has >= 1 entry, so the
    # averages below cannot divide by zero.
    print(f"\n{'='*60}")
    print(f" BASELINE RESULTS")
    print(f"{'='*60}")
    print(f" {'Task':<22} {'Score':>7} {'Threshold':>10} {'Reward':>8} {'Pass':>6}")
    print(f" {'-'*55}")
    for r in results:
        print(f" {r['task_id']:<22} {r['final_score']:>7.3f} {r['threshold']:>10.2f} "
              f"{r['total_reward']:>+8.3f} {'✅' if r['passed'] else '❌':>6}")
    avg_score = sum(r["final_score"] for r in results) / len(results)
    pass_rate = sum(1 for r in results if r["passed"]) / len(results)
    print(f" {'-'*55}")
    print(f" {'AVERAGE':<22} {avg_score:>7.3f} {'':>10} {'':>8} {pass_rate*100:>5.0f}%")
    print(f"{'='*60}\n")

    # Save results
    out_path = "baseline_results.json"
    with open(out_path, "w") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print(f" Results saved to {out_path}")
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
if __name__ == "__main__":
|
| 316 |
+
main()
|
corpus/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# corpus package
|
corpus/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (155 Bytes). View file
|
|
|
corpus/__pycache__/snippets.cpython-313.pyc
ADDED
|
Binary file (11.7 kB). View file
|
|
|
corpus/snippets.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Code corpus: Python snippets with embedded ground-truth issues.
|
| 3 |
+
|
| 4 |
+
Each entry has:
|
| 5 |
+
- snippet : CodeSnippet to show the agent
|
| 6 |
+
- issues : list of ground-truth ReviewComment objects the grader checks against
|
| 7 |
+
- task_id : which task this belongs to
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from env.models import CodeSnippet, ReviewCategory, ReviewComment, Severity
|
| 13 |
+
|
| 14 |
+
# ---------------------------------------------------------------------------
|
| 15 |
+
# TASK 1 – Easy (Bug detection + Code style)
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
|
| 18 |
+
TASK1_SNIPPET = CodeSnippet(
|
| 19 |
+
file_name="calculator.py",
|
| 20 |
+
source='''\
|
| 21 |
+
def divide(a, b):
|
| 22 |
+
return a / b # line 2
|
| 23 |
+
|
| 24 |
+
def average(numbers):
|
| 25 |
+
total = 0
|
| 26 |
+
for n in numbers:
|
| 27 |
+
total = total + n
|
| 28 |
+
return total / len(numbers) # line 8
|
| 29 |
+
|
| 30 |
+
def celsius_to_fahrenheit(c):
|
| 31 |
+
return c * 9/5 + 31 # line 11 (bug: should be +32)
|
| 32 |
+
|
| 33 |
+
def is_palindrome(s):
|
| 34 |
+
return s == s[::-1] # line 14
|
| 35 |
+
|
| 36 |
+
def find_max(lst):
|
| 37 |
+
max_val = lst[0] # line 17
|
| 38 |
+
for i in range(len(lst)):
|
| 39 |
+
if lst[i] > max_val:
|
| 40 |
+
max_val = lst[i]
|
| 41 |
+
return max_val # line 21
|
| 42 |
+
|
| 43 |
+
def count_words(text):
|
| 44 |
+
words = text.split(" ")
|
| 45 |
+
wordcount = {}
|
| 46 |
+
for w in words:
|
| 47 |
+
if w in wordcount:
|
| 48 |
+
wordcount[w] = wordcount[w]+1
|
| 49 |
+
else:
|
| 50 |
+
wordcount[w] = 1
|
| 51 |
+
return wordcount # line 30
|
| 52 |
+
''',
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
# Ground truth for task 1. `line` numbers are 1-indexed into TASK1_SNIPPET's
# source; the grader matches agent comments against these entries.
TASK1_ISSUES: list[ReviewComment] = [
    # ---- Bugs ----
    ReviewComment(
        line=2,
        category=ReviewCategory.BUG,
        severity=Severity.HIGH,
        message="divide() has no guard against division by zero; will raise ZeroDivisionError when b=0.",
        suggestion="Add `if b == 0: raise ValueError('b must not be zero')` before returning.",
    ),
    ReviewComment(
        line=8,
        category=ReviewCategory.BUG,
        severity=Severity.HIGH,
        message="average() crashes with ZeroDivisionError on an empty list.",
        suggestion="Guard with `if not numbers: return 0.0` or raise ValueError.",
    ),
    ReviewComment(
        line=11,
        category=ReviewCategory.BUG,
        severity=Severity.MEDIUM,
        message="celsius_to_fahrenheit uses +31 instead of +32, giving wrong results.",
        suggestion="Change `+ 31` to `+ 32`.",
    ),
    ReviewComment(
        line=17,
        category=ReviewCategory.BUG,
        severity=Severity.MEDIUM,
        message="find_max() crashes with IndexError on an empty list.",
        suggestion="Add `if not lst: raise ValueError('list is empty')` at the top.",
    ),
    # ---- Style ----
    ReviewComment(
        line=18,
        category=ReviewCategory.STYLE,
        severity=Severity.LOW,
        message="Iterating with `for i in range(len(lst))` is unpythonic; prefer `for val in lst`.",
        suggestion="Replace loop body with `for val in lst: if val > max_val: max_val = val`.",
    ),
    ReviewComment(
        line=25,
        category=ReviewCategory.STYLE,
        severity=Severity.LOW,
        message="count_words manually reimplements collections.Counter; use the stdlib instead.",
        suggestion="Replace with `from collections import Counter; return Counter(text.split())`.",
    ),
]
|
| 101 |
+
|
| 102 |
+
# ---------------------------------------------------------------------------
|
| 103 |
+
# TASK 2 – Medium (Security + Performance)
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
|
| 106 |
+
TASK2_SNIPPET = CodeSnippet(
|
| 107 |
+
file_name="user_service.py",
|
| 108 |
+
source='''\
|
| 109 |
+
import sqlite3
|
| 110 |
+
import hashlib
|
| 111 |
+
import os
|
| 112 |
+
|
| 113 |
+
DB_PATH = "users.db"
|
| 114 |
+
|
| 115 |
+
def get_user(username):
|
| 116 |
+
conn = sqlite3.connect(DB_PATH)
|
| 117 |
+
cursor = conn.cursor()
|
| 118 |
+
query = f"SELECT * FROM users WHERE username = \'{ username }\'" # line 10
|
| 119 |
+
cursor.execute(query)
|
| 120 |
+
result = cursor.fetchone()
|
| 121 |
+
conn.close()
|
| 122 |
+
return result
|
| 123 |
+
|
| 124 |
+
def create_user(username, password):
|
| 125 |
+
conn = sqlite3.connect(DB_PATH)
|
| 126 |
+
cursor = conn.cursor()
|
| 127 |
+
pw_hash = hashlib.md5(password.encode()).hexdigest() # line 19
|
| 128 |
+
cursor.execute(
|
| 129 |
+
"INSERT INTO users (username, password) VALUES (?, ?)",
|
| 130 |
+
(username, pw_hash),
|
| 131 |
+
)
|
| 132 |
+
conn.commit()
|
| 133 |
+
conn.close()
|
| 134 |
+
|
| 135 |
+
def load_all_users():
|
| 136 |
+
conn = sqlite3.connect(DB_PATH)
|
| 137 |
+
cursor = conn.cursor()
|
| 138 |
+
cursor.execute("SELECT * FROM users")
|
| 139 |
+
rows = cursor.fetchall() # line 31
|
| 140 |
+
conn.close()
|
| 141 |
+
users = []
|
| 142 |
+
for row in rows:
|
| 143 |
+
users.append({
|
| 144 |
+
"id": row[0],
|
| 145 |
+
"username": row[1],
|
| 146 |
+
"password": row[2],
|
| 147 |
+
})
|
| 148 |
+
return users
|
| 149 |
+
|
| 150 |
+
def authenticate(username, password):
|
| 151 |
+
user = get_user(username)
|
| 152 |
+
if user is None:
|
| 153 |
+
return False
|
| 154 |
+
pw_hash = hashlib.md5(password.encode()).hexdigest() # line 46
|
| 155 |
+
return user[2] == pw_hash
|
| 156 |
+
|
| 157 |
+
def delete_user(username):
|
| 158 |
+
conn = sqlite3.connect(DB_PATH)
|
| 159 |
+
cursor = conn.cursor()
|
| 160 |
+
query = f"DELETE FROM users WHERE username = \'{ username }\'" # line 52
|
| 161 |
+
cursor.execute(query)
|
| 162 |
+
conn.commit()
|
| 163 |
+
conn.close()
|
| 164 |
+
''',
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
# Ground truth for task 2. `line` numbers are 1-indexed into TASK2_SNIPPET's
# source; the grader matches agent comments against these entries.
TASK2_ISSUES: list[ReviewComment] = [
    # ---- Security ----
    ReviewComment(
        line=10,
        category=ReviewCategory.SECURITY,
        severity=Severity.CRITICAL,
        message="SQL injection vulnerability: username is interpolated directly into the query string.",
        suggestion="Use parameterised queries: `cursor.execute('SELECT * FROM users WHERE username=?', (username,))`",
    ),
    ReviewComment(
        line=19,
        category=ReviewCategory.SECURITY,
        severity=Severity.CRITICAL,
        message="MD5 is cryptographically broken and must not be used for password hashing.",
        suggestion="Replace with `bcrypt.hashpw(password.encode(), bcrypt.gensalt())` or `hashlib.scrypt`.",
    ),
    ReviewComment(
        line=52,
        category=ReviewCategory.SECURITY,
        severity=Severity.CRITICAL,
        message="delete_user() is also vulnerable to SQL injection via string interpolation.",
        suggestion="Use parameterised queries: `cursor.execute('DELETE FROM users WHERE username=?', (username,))`",
    ),
    ReviewComment(
        line=46,
        category=ReviewCategory.SECURITY,
        severity=Severity.HIGH,
        message="authenticate() re-hashes with MD5 for comparison; same broken-hash issue as create_user.",
        suggestion="Adopt bcrypt.checkpw() or equivalent constant-time comparison.",
    ),
    # ---- Performance ----
    ReviewComment(
        line=31,
        category=ReviewCategory.PERFORMANCE,
        severity=Severity.HIGH,
        message="fetchall() loads the entire users table into memory; will OOM on large tables.",
        suggestion="Use `cursor.fetchmany(size=1000)` in a loop or add a LIMIT clause.",
    ),
    ReviewComment(
        line=8,
        category=ReviewCategory.PERFORMANCE,
        severity=Severity.MEDIUM,
        message="A new DB connection is opened and closed for every single query; connection pooling should be used.",
        suggestion="Use a module-level connection or a context-manager pool (e.g. `sqlite3.connect` as a shared resource).",
    ),
]
|
| 213 |
+
|
| 214 |
+
# ---------------------------------------------------------------------------
|
| 215 |
+
# TASK 3 – Hard (All categories: Bug + Security + Performance + Style + Docs)
|
| 216 |
+
# ---------------------------------------------------------------------------
|
| 217 |
+
|
| 218 |
+
# Deliberately flawed pipeline used as the "hard" review target.
# NOTE(review): the inline "# line NN" markers inside the source string pin
# the positions that TASK3_ISSUES refers to — do not reflow this string, or
# the graders' line-number tolerance checks will drift.
TASK3_SNIPPET = CodeSnippet(
    file_name="data_pipeline.py",
    source='''\
"""Data pipeline for processing CSV exports from the analytics platform."""

import csv
import os
import pickle
import subprocess
import time

CACHE = {}

def load_data(filepath):
    with open(filepath) as f:  # line 12
        reader = csv.DictReader(f)
        data = []
        for row in reader:
            data.append(row)
        return data

def process_row(row, transform_script):
    result = subprocess.run(transform_script, shell=True, input=str(row))  # line 20
    return result.stdout

def cache_result(key, value):
    CACHE[key] = pickle.dumps(value)  # line 24

def get_cached(key):
    if key in CACHE:
        return pickle.loads(CACHE[key])  # line 28

def compute_stats(data):
    n = len(data)  # line 31
    total = sum(float(row["value"]) for row in data)
    mean = total / n
    variance = sum((float(row["value"]) - mean) ** 2 for row in data) / n
    return {"mean": mean, "variance": variance, "count": n}

def run_pipeline(filepath, transform_script=None):
    data = load_data(filepath)
    if transform_script:
        processed = []
        for row in data:
            processed.append(process_row(row, transform_script))
        data = processed
    stats = compute_stats(data)
    cache_result(filepath, stats)
    return stats

def export_results(stats, output_path):
    with open(output_path, "w") as f:  # line 47
        f.write(str(stats))
''',
)
|
| 273 |
+
|
| 274 |
+
# Ground truth for the hard task: one entry per intentional defect in
# TASK3_SNIPPET, spanning all five review categories. Line numbers refer to
# the "# line NN" markers embedded in the snippet's source string.
TASK3_ISSUES: list[ReviewComment] = [
    # ---- Security ----
    ReviewComment(
        line=20,
        category=ReviewCategory.SECURITY,
        severity=Severity.CRITICAL,
        message="subprocess.run with shell=True and user-supplied transform_script enables arbitrary OS command injection.",
        suggestion="Avoid shell=True; pass args as a list or whitelist allowed scripts.",
    ),
    ReviewComment(
        line=28,
        category=ReviewCategory.SECURITY,
        severity=Severity.CRITICAL,
        message="pickle.loads() on untrusted/arbitrary cache data allows arbitrary code execution.",
        suggestion="Replace pickle with json.dumps/loads for serialisable data, or sign+verify the payload.",
    ),
    ReviewComment(
        line=24,
        category=ReviewCategory.SECURITY,
        severity=Severity.HIGH,
        message="Storing pickled data in a module-level dict means deserialization risk persists across calls.",
        suggestion="Use JSON for the cache and validate schemas on retrieval.",
    ),
    # ---- Bugs ----
    ReviewComment(
        line=31,
        category=ReviewCategory.BUG,
        severity=Severity.HIGH,
        message="compute_stats() raises ZeroDivisionError when data is empty (n=0).",
        suggestion="Guard with `if not data: return {'mean': 0, 'variance': 0, 'count': 0}`.",
    ),
    ReviewComment(
        line=32,
        category=ReviewCategory.BUG,
        severity=Severity.MEDIUM,
        message="If any row is missing the 'value' key, a KeyError will silently abort the pipeline.",
        suggestion="Use `row.get('value', 0)` or validate schema at load time.",
    ),
    ReviewComment(
        line=12,
        category=ReviewCategory.BUG,
        severity=Severity.MEDIUM,
        message="open(filepath) without encoding='utf-8' will use the system locale; may fail on non-ASCII data.",
        suggestion="Use `open(filepath, encoding='utf-8')`.",
    ),
    # ---- Performance ----
    ReviewComment(
        line=31,
        category=ReviewCategory.PERFORMANCE,
        severity=Severity.MEDIUM,
        message="compute_stats() iterates over data twice (once for sum, once for variance); single-pass Welford's algorithm is more efficient.",
        suggestion="Use Welford's online algorithm or numpy for large datasets.",
    ),
    ReviewComment(
        line=38,
        category=ReviewCategory.PERFORMANCE,
        severity=Severity.MEDIUM,
        message="process_row() spawns a new subprocess for every row; should batch or vectorise the transformation.",
        suggestion="Pass all rows to a single subprocess call or use a Python-native transform function.",
    ),
    # ---- Style ----
    ReviewComment(
        line=47,
        category=ReviewCategory.STYLE,
        severity=Severity.LOW,
        message="export_results writes str(stats) (a Python dict repr) rather than valid JSON or CSV.",
        suggestion="Use `import json; f.write(json.dumps(stats, indent=2))`.",
    ),
    ReviewComment(
        line=9,
        category=ReviewCategory.STYLE,
        severity=Severity.LOW,
        message="Module-level mutable CACHE dict is a global side-effect; makes the pipeline hard to test and thread-unsafe.",
        suggestion="Encapsulate state inside a Pipeline class or pass cache explicitly.",
    ),
    # ---- Documentation ----
    ReviewComment(
        line=12,
        category=ReviewCategory.DOCUMENTATION,
        severity=Severity.LOW,
        message="load_data() has no docstring; expected CSV schema (required columns, types) is undocumented.",
        suggestion="Add a docstring describing filepath, expected columns, and return type.",
    ),
    ReviewComment(
        line=19,
        category=ReviewCategory.DOCUMENTATION,
        severity=Severity.LOW,
        message="process_row() does not document what transform_script should be, its expected format, or return value.",
        suggestion="Add docstring: args, expected script interface, return type, and example.",
    ),
    ReviewComment(
        # line=None marks a file-level comment (matches any position in graders).
        line=None,
        category=ReviewCategory.DOCUMENTATION,
        severity=Severity.LOW,
        message="Module-level docstring is too vague; doesn't mention side-effects, required CSV schema, or dependencies.",
        suggestion="Expand the module docstring with usage example, required columns, and external dependencies.",
    ),
]
|
| 372 |
+
|
| 373 |
+
# ---------------------------------------------------------------------------
|
| 374 |
+
# Registry
|
| 375 |
+
# ---------------------------------------------------------------------------
|
| 376 |
+
|
| 377 |
+
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------

# Maps task_id -> {"snippet": CodeSnippet, "issues": list[ReviewComment]}.
# Keys must stay in sync with TASK_SPECS in env/environment.py, which indexes
# CORPUS by the same task_id strings.
CORPUS: dict[str, dict] = {
    "task_1_easy": {
        "snippet": TASK1_SNIPPET,
        "issues": TASK1_ISSUES,
    },
    "task_2_medium": {
        "snippet": TASK2_SNIPPET,
        "issues": TASK2_ISSUES,
    },
    "task_3_hard": {
        "snippet": TASK3_SNIPPET,
        "issues": TASK3_ISSUES,
    },
}
|
env/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# env package
|
env/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (152 Bytes). View file
|
|
|
env/__pycache__/environment.cpython-313.pyc
ADDED
|
Binary file (11.8 kB). View file
|
|
|
env/__pycache__/models.cpython-313.pyc
ADDED
|
Binary file (5.37 kB). View file
|
|
|
env/environment.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeReviewEnv – main OpenEnv environment.
|
| 3 |
+
|
| 4 |
+
Interface
|
| 5 |
+
---------
|
| 6 |
+
env = CodeReviewEnv(task_id="task_1_easy")
|
| 7 |
+
obs = env.reset()
|
| 8 |
+
result = env.step(action)
|
| 9 |
+
state = env.state()
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import time
|
| 15 |
+
from typing import Any, Dict, List, Optional
|
| 16 |
+
|
| 17 |
+
from corpus.snippets import CORPUS
|
| 18 |
+
from env.models import (
|
| 19 |
+
Action,
|
| 20 |
+
CodeSnippet,
|
| 21 |
+
EnvironmentState,
|
| 22 |
+
Observation,
|
| 23 |
+
Reward,
|
| 24 |
+
ReviewComment,
|
| 25 |
+
StepResult,
|
| 26 |
+
TaskDifficulty,
|
| 27 |
+
TaskSpec,
|
| 28 |
+
)
|
| 29 |
+
from graders.graders import GRADERS
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Task specs
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
# Static catalogue of the three benchmark tasks. Keys must match the task_id
# keys of corpus.snippets.CORPUS (CodeReviewEnv indexes both by the same id).
TASK_SPECS: dict[str, TaskSpec] = {
    "task_1_easy": TaskSpec(
        task_id="task_1_easy",
        title="Bug Detection & Style Review",
        difficulty=TaskDifficulty.EASY,
        categories=["bug", "style"],
        description=(
            "Review calculator.py for correctness bugs (division by zero, off-by-one, "
            "empty collection crashes) and Python style issues. "
            "You do NOT need to check for security or performance."
        ),
        max_steps=5,
        passing_threshold=0.55,
    ),
    "task_2_medium": TaskSpec(
        task_id="task_2_medium",
        title="Security & Performance Audit",
        difficulty=TaskDifficulty.MEDIUM,
        categories=["security", "performance"],
        description=(
            "Audit user_service.py for security vulnerabilities (SQL injection, weak "
            "hashing, unsafe deserialization) and performance problems (unbounded queries, "
            "connection churn). Identify ALL critical security issues – missing one costs "
            "heavily."
        ),
        max_steps=7,
        passing_threshold=0.60,
    ),
    "task_3_hard": TaskSpec(
        task_id="task_3_hard",
        title="Comprehensive Code Review",
        difficulty=TaskDifficulty.HARD,
        categories=["bug", "security", "performance", "style", "documentation"],
        description=(
            "Perform a full production-grade review of data_pipeline.py covering bugs, "
            "security flaws, performance issues, code style, and documentation gaps. "
            "You MUST provide a written summary of overall findings. "
            "This snippet has intentional issues across all five categories."
        ),
        max_steps=10,
        passing_threshold=0.65,
    ),
}
|
| 78 |
+
|
| 79 |
+
# ---------------------------------------------------------------------------
|
| 80 |
+
# Environment
|
| 81 |
+
# ---------------------------------------------------------------------------
|
| 82 |
+
|
| 83 |
+
INSTRUCTIONS_TEMPLATE = """
|
| 84 |
+
You are performing a Python code review.
|
| 85 |
+
|
| 86 |
+
Task: {title}
|
| 87 |
+
Difficulty: {difficulty}
|
| 88 |
+
Categories to check: {categories}
|
| 89 |
+
|
| 90 |
+
{description}
|
| 91 |
+
|
| 92 |
+
Your job:
|
| 93 |
+
1. Read the code snippet carefully.
|
| 94 |
+
2. Identify issues matching the specified categories.
|
| 95 |
+
3. For each issue, provide: line number (if applicable), category, severity, a clear message, and an optional fix suggestion.
|
| 96 |
+
4. When you are satisfied, set `submit=True` in your action.
|
| 97 |
+
{summary_note}
|
| 98 |
+
|
| 99 |
+
The code will be shown in the observation. Previous comments you have already submitted are also included so you can refine or expand them across steps.
|
| 100 |
+
""".strip()
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class CodeReviewEnv:
    """
    OpenEnv-compliant environment for Python code review tasks.

    Episode flow: ``reset()`` yields the snippet plus instructions; each
    ``step(action)`` merges the action's (deduplicated) comments into the
    running review, grades the accumulated set, and returns an incremental
    reward. The episode ends when the agent sets ``submit=True`` or when
    ``max_steps`` is exhausted.
    """

    def __init__(self, task_id: str = "task_1_easy"):
        # Fail fast on an unknown task so callers get a clear error message.
        if task_id not in TASK_SPECS:
            raise ValueError(f"Unknown task_id '{task_id}'. Choose from: {list(TASK_SPECS)}")

        self.task_id = task_id
        self.spec: TaskSpec = TASK_SPECS[task_id]
        self.corpus_entry: dict = CORPUS[task_id]
        self.grader = GRADERS[task_id]
        self.ground_truth: List[ReviewComment] = self.corpus_entry["issues"]
        self.snippet: CodeSnippet = self.corpus_entry["snippet"]

        # Mutable per-episode state; reset() re-initialises all of these.
        self._step: int = 0
        self._done: bool = False
        self._comments: List[ReviewComment] = []
        self._total_reward: float = 0.0
        self._grader_scores: Dict[str, float] = {}
        self._last_feedback: Optional[str] = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def reset(self) -> Observation:
        """Reset the environment to initial state and return first observation."""
        self._step = 0
        self._done = False
        self._comments = []
        self._total_reward = 0.0
        self._grader_scores = {}
        self._last_feedback = None
        return self._build_observation()

    def step(self, action: Action) -> StepResult:
        """
        Advance the environment by one step.

        Parameters
        ----------
        action : Action
            Comments produced this step plus optional submit flag.

        Returns
        -------
        StepResult with (observation, reward, done, info)

        Raises
        ------
        RuntimeError
            If called after the episode has already finished.
        """
        if self._done:
            raise RuntimeError("Episode is done; call reset() first.")

        self._step += 1

        # Accumulate comments (deduplicate by message fingerprint).
        # Ordering matters: extend first, then grade the full accumulated set.
        new_comments = self._deduplicate(action.comments)
        self._comments.extend(new_comments)

        # Compute incremental reward for new comments
        reward, feedback, grader_result = self._compute_reward(action, new_comments)
        self._grader_scores = grader_result
        # Running total is re-rounded each step to keep the float tidy.
        self._total_reward = round(self._total_reward + reward.value, 4)
        self._last_feedback = feedback

        # Determine done: explicit submit, or step budget exhausted.
        done = action.submit or self._step >= self.spec.max_steps
        self._done = done

        obs = self._build_observation(feedback=feedback, done=done)
        info: Dict[str, Any] = {
            "step": self._step,
            "new_comments": len(new_comments),
            "total_comments": len(self._comments),
            "grader": grader_result,
            "passed": grader_result.get("score", 0.0) >= self.spec.passing_threshold,
        }

        return StepResult(observation=obs, reward=reward, done=done, info=info)

    def state(self) -> EnvironmentState:
        """Return full serialisable state snapshot."""
        return EnvironmentState(
            task_id=self.task_id,
            step=self._step,
            max_steps=self.spec.max_steps,
            total_reward=self._total_reward,
            comments_so_far=self._comments,
            done=self._done,
            grader_scores=self._grader_scores,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _build_observation(
        self,
        feedback: Optional[str] = None,
        done: bool = False,
    ) -> Observation:
        """Render the instructions template and package the current view."""
        # Only the hard task requires a written summary from the agent.
        summary_note = (
            "\n5. You MUST include a `summary` field with your overall assessment."
            if self.task_id == "task_3_hard"
            else ""
        )
        instructions = INSTRUCTIONS_TEMPLATE.format(
            title=self.spec.title,
            difficulty=self.spec.difficulty.value.upper(),
            categories=", ".join(self.spec.categories),
            description=self.spec.description,
            summary_note=summary_note,
        )
        return Observation(
            task_id=self.task_id,
            step=self._step,
            snippet=self.snippet,
            instructions=instructions,
            # Copy so later mutation of self._comments cannot alias the observation.
            previous_comments=list(self._comments),
            feedback=feedback or self._last_feedback,
            done=done,
        )

    def _compute_reward(
        self,
        action: Action,
        new_comments: List[ReviewComment],
    ) -> tuple[Reward, str, dict]:
        """
        Compute reward with partial progress signals.

        Components
        ----------
        * +step_signal : positive if new valid comments were added
        * +submit_bonus : grader score applied on final submit
        * -loop_penalty : penalty for submitting zero new comments repeatedly
        * -over_comment : penalty for > 2× the expected number of comments
        """
        # Run grader against ALL accumulated comments
        full_action = Action(
            comments=self._comments,
            summary=action.summary,
            submit=action.submit,
        )
        grader_result = self.grader.grade(full_action, self.ground_truth)
        current_score = grader_result["score"]

        breakdown: Dict[str, float] = {}
        reward_val = 0.0

        if action.submit:
            # Final reward = full grader score (0–1 mapped to -0.2–1.0)
            submit_reward = current_score * 0.8 + (0.2 if current_score >= self.spec.passing_threshold else -0.2)
            reward_val += submit_reward
            breakdown["submit_reward"] = round(submit_reward, 4)
            feedback = (
                f"Review submitted. Score: {current_score:.3f} "
                f"({'PASSED' if current_score >= self.spec.passing_threshold else 'FAILED'}). "
                f"Matched {grader_result['matched_count']}/{grader_result['total_ground_truth']} issues."
            )
        else:
            # Incremental reward: positive if new valid comments detected
            if new_comments:
                # Small positive signal for adding comments (+0.05 per comment, capped)
                step_reward = min(0.05 * len(new_comments), 0.15)
                reward_val += step_reward
                breakdown["step_reward"] = round(step_reward, 4)

                # Progress signal: reward increase in grader score.
                # We run a "previous" grader check without new comments to get delta.
                # NOTE(review): exclusion is by value equality (pydantic __eq__),
                # which relies on _deduplicate having kept comments distinct.
                prev_action = Action(
                    comments=[c for c in self._comments if c not in new_comments],
                    summary=None,
                    submit=False,
                )
                prev_result = self.grader.grade(prev_action, self.ground_truth)
                score_delta = current_score - prev_result["score"]
                if score_delta > 0:
                    progress_reward = round(score_delta * 0.5, 4)
                    reward_val += progress_reward
                    breakdown["progress_reward"] = progress_reward
            else:
                # Penalty for empty step
                reward_val -= 0.05
                breakdown["empty_step_penalty"] = -0.05

            # Penalty for too many comments (spam)
            expected = grader_result["total_ground_truth"]
            if len(self._comments) > expected * 2.5:
                spam_penalty = -0.10
                reward_val += spam_penalty
                breakdown["spam_penalty"] = spam_penalty

            feedback = (
                f"Step {self._step}: Added {len(new_comments)} comment(s). "
                f"Running score: {current_score:.3f}. "
                f"Steps remaining: {self.spec.max_steps - self._step}."
            )

        # Clamp to [-1, 1] so a single step can never swamp the episode.
        reward_val = round(max(-1.0, min(1.0, reward_val)), 4)
        return Reward(value=reward_val, breakdown=breakdown, reason=feedback), feedback, grader_result

    def _deduplicate(self, incoming: List[ReviewComment]) -> List[ReviewComment]:
        """Remove comments whose (line, category, message[:40]) already exist.

        The 40-char message prefix is a cheap fingerprint: two comments on the
        same line/category whose messages agree for 40 chars count as one.
        """
        existing_keys = {
            (c.line, c.category, c.message[:40]) for c in self._comments
        }
        new: List[ReviewComment] = []
        for c in incoming:
            key = (c.line, c.category, c.message[:40])
            if key not in existing_keys:
                existing_keys.add(key)
                new.append(c)
        return new
|
env/models.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic typed models for CodeReview OpenEnv.
|
| 3 |
+
|
| 4 |
+
Defines all core data structures: enums for review categories and severities,
|
| 5 |
+
code snippets, review comments, actions, observations, rewards, step results,
|
| 6 |
+
task specifications, and environment state.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from enum import Enum
|
| 12 |
+
from typing import Any, Dict, List, Optional
|
| 13 |
+
|
| 14 |
+
from pydantic import BaseModel, Field
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ---------------------------------------------------------------------------
|
| 18 |
+
# Enums
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
|
| 21 |
+
class ReviewCategory(str, Enum):
    """Categories of code review issues.

    Mixing in ``str`` makes members compare equal to their plain string
    values, so they serialise cleanly in pydantic models.
    """
    BUG = "bug"
    SECURITY = "security"
    PERFORMANCE = "performance"
    STYLE = "style"
    DOCUMENTATION = "documentation"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class Severity(str, Enum):
    """Severity levels for review comments, ordered LOW < MEDIUM < HIGH < CRITICAL.

    Graders weight matched issues by severity (see graders.SEVERITY_WEIGHT).
    """
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class TaskDifficulty(str, Enum):
    """Difficulty levels for tasks (one per benchmark task)."""
    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
# Core models
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
|
| 49 |
+
class CodeSnippet(BaseModel):
    """A Python source code snippet for review."""
    # Logical file name shown to the agent (e.g. "data_pipeline.py").
    file_name: str
    # Full source text of the snippet, including any inline line markers.
    source: str
    # Language tag; the corpus currently only contains Python.
    language: str = "python"
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class ReviewComment(BaseModel):
    """A single review comment identifying an issue in the code."""
    # 1-based line number within the snippet; None marks a file-level comment.
    line: Optional[int] = None
    category: ReviewCategory
    severity: Severity = Severity.MEDIUM
    # Human-readable description of the issue.
    message: str
    # Optional concrete fix proposal.
    suggestion: Optional[str] = None
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class Action(BaseModel):
    """Agent action: a list of review comments plus control flags."""
    # Comments added this step (may be empty).
    comments: List[ReviewComment] = Field(default_factory=list)
    # Overall written assessment; required by the hard task on submit.
    summary: Optional[str] = None
    # True to finish the episode and trigger final grading.
    submit: bool = False
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class Observation(BaseModel):
    """What the agent sees on each step."""
    task_id: str
    step: int
    # The code under review (constant for the whole episode).
    snippet: CodeSnippet
    # Rendered task instructions (see INSTRUCTIONS_TEMPLATE).
    instructions: str
    # Comments accumulated so far, so the agent can refine across steps.
    previous_comments: List[ReviewComment] = Field(default_factory=list)
    # Environment feedback string from the previous step, if any.
    feedback: Optional[str] = None
    done: bool = False
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class Reward(BaseModel):
    """Reward signal returned after each step."""
    # Scalar reward, clamped to [-1.0, 1.0] by the environment.
    value: float = 0.0
    # Per-component contributions (e.g. "step_reward", "spam_penalty").
    breakdown: Dict[str, float] = Field(default_factory=dict)
    # Human-readable explanation of this step's reward.
    reason: str = ""
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class StepResult(BaseModel):
    """Result of a single environment step (gym-style tuple as a model)."""
    observation: Observation
    reward: Reward
    done: bool
    # Diagnostics: step counter, comment counts, grader output, pass flag.
    info: Dict[str, Any] = Field(default_factory=dict)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class TaskSpec(BaseModel):
    """Specification for a single task."""
    task_id: str
    title: str
    difficulty: TaskDifficulty
    # Review categories the agent is asked to cover (lowercase names).
    categories: List[str]
    description: str
    # Episode ends after this many steps even without submit.
    max_steps: int
    # Minimum grader score (0-1) counted as a pass.
    passing_threshold: float
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class EnvironmentState(BaseModel):
    """Full serialisable state snapshot of the environment."""
    task_id: str
    step: int
    max_steps: int
    # Sum of per-step reward values, rounded each step.
    total_reward: float
    comments_so_far: List[ReviewComment] = Field(default_factory=list)
    done: bool
    # Most recent grader result dict (score, matched_count, ...).
    grader_scores: Dict[str, Any] = Field(default_factory=dict)
|
graders/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# graders package
|
graders/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (156 Bytes). View file
|
|
|
graders/__pycache__/graders.cpython-313.pyc
ADDED
|
Binary file (13.9 kB). View file
|
|
|
graders/graders.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent graders for all three tasks.
|
| 3 |
+
|
| 4 |
+
Each grader implements:
|
| 5 |
+
grade(action: Action, ground_truth: list[ReviewComment]) -> dict
|
| 6 |
+
|
| 7 |
+
Scoring philosophy
|
| 8 |
+
------------------
|
| 9 |
+
* True positive (found real issue) → positive reward
|
| 10 |
+
* False positive (fabricated issue) → small penalty
|
| 11 |
+
* Missed critical issue → large penalty
|
| 12 |
+
* Summary quality (task 3) → bonus
|
| 13 |
+
* Partial credit for correct category/severity with wrong line
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import re
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
from typing import List, Optional
|
| 21 |
+
|
| 22 |
+
from env.models import Action, ReviewCategory, ReviewComment, Severity
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
# Helpers
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
|
| 29 |
+
# Reward weight applied when a ground-truth issue of this severity is matched:
# finding a CRITICAL issue is worth four times a LOW one.
SEVERITY_WEIGHT: dict[Severity, float] = {
    Severity.CRITICAL: 1.0,
    Severity.HIGH: 0.75,
    Severity.MEDIUM: 0.5,
    Severity.LOW: 0.25,
}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _category_match(a: ReviewComment, b: ReviewComment) -> bool:
    """Exact category agreement; a hard prerequisite for any match."""
    return a.category == b.category
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _severity_close(a: ReviewComment, b: ReviewComment) -> bool:
    """Return True when the two severities are identical or adjacent.

    E.g. HIGH matches MEDIUM/HIGH/CRITICAL but not LOW.
    """
    rank = {Severity.LOW: 0, Severity.MEDIUM: 1, Severity.HIGH: 2, Severity.CRITICAL: 3}
    return abs(rank[a.severity] - rank[b.severity]) <= 1
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _line_close(a: ReviewComment, b: ReviewComment, tolerance: int = 3) -> bool:
|
| 47 |
+
if a.line is None or b.line is None:
|
| 48 |
+
return True # file-level comments always match positionally
|
| 49 |
+
return abs(a.line - b.line) <= tolerance
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _message_relevant(comment: ReviewComment, truth: ReviewComment) -> bool:
|
| 53 |
+
"""Check if comment message contains keywords from the truth message."""
|
| 54 |
+
# Pull significant words (>4 chars) from the ground truth message
|
| 55 |
+
truth_keywords = {
|
| 56 |
+
w.lower()
|
| 57 |
+
for w in re.findall(r"\b\w{4,}\b", truth.message)
|
| 58 |
+
if w.lower() not in {"this", "that", "with", "from", "will", "should", "must", "have", "been", "when"}
|
| 59 |
+
}
|
| 60 |
+
comment_text = (comment.message + " " + (comment.suggestion or "")).lower()
|
| 61 |
+
if not truth_keywords:
|
| 62 |
+
return True
|
| 63 |
+
overlap = sum(1 for kw in truth_keywords if kw in comment_text)
|
| 64 |
+
return overlap / len(truth_keywords) >= 0.25 # 25% keyword overlap
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@dataclass
class MatchResult:
    """Outcome of matching one agent comment against the ground truth."""

    matched: bool = False   # full match: category + line + message all agree
    partial: bool = False   # right category/message, but wrong line
    score: float = 0.0      # reward contribution of this match
    reason: str = ""        # human-readable explanation for debugging
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _match_comment_to_truth(
    comment: ReviewComment,
    truth_list: List[ReviewComment],
    already_matched: set[int],
) -> tuple[MatchResult, Optional[int]]:
    """Try to match a single agent comment against the ground-truth list.

    Returns the best-scoring ``MatchResult`` plus the index of the matched
    ground-truth entry (``None`` if nothing matched). Entries whose index
    is in *already_matched* are skipped so each ground-truth issue is
    credited at most once.

    Note: the original ``elif`` re-tested ``_category_match`` even though
    the loop already ``continue``s on a category mismatch; that redundant
    check is removed here without changing behavior.
    """
    best = MatchResult()
    best_idx: Optional[int] = None

    for idx, truth in enumerate(truth_list):
        if idx in already_matched:
            continue
        if not _category_match(comment, truth):
            continue

        line_ok = _line_close(comment, truth)
        sev_ok = _severity_close(comment, truth)
        msg_ok = _message_relevant(comment, truth)

        # Without any keyword overlap there is neither a full nor a
        # partial match for this truth entry.
        if not msg_ok:
            continue

        if line_ok:
            # Full match: weight by severity; discount by 30% when the
            # agent's severity is more than one rank off.
            score = SEVERITY_WEIGHT[truth.severity] * (1.0 if sev_ok else 0.7)
            result = MatchResult(matched=True, partial=False, score=score,
                                 reason=f"TP: {truth.category} L{truth.line}")
        else:
            # Partial credit: right issue, wrong line.
            score = SEVERITY_WEIGHT[truth.severity] * 0.5
            result = MatchResult(matched=False, partial=True, score=score,
                                 reason=f"Partial: right issue wrong line for {truth.category}")

        if result.score > best.score:
            best = result
            best_idx = idx

    return best, best_idx
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
# Base grader
|
| 120 |
+
# ---------------------------------------------------------------------------
|
| 121 |
+
|
| 122 |
+
class BaseGrader:
    """Common interface shared by the three task graders."""

    TASK_ID: str = ""                        # task identifier this grader serves
    CATEGORIES: list[ReviewCategory] = []    # review categories in scope

    def grade(
        self,
        action: Action,
        ground_truth: List[ReviewComment],
    ) -> dict:
        """Score *action* against *ground_truth*; implemented by subclasses."""
        raise NotImplementedError
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
# Task 1 – Easy (Bug + Style)
|
| 136 |
+
# ---------------------------------------------------------------------------
|
| 137 |
+
|
| 138 |
+
class Task1Grader(BaseGrader):
    """Easy task: bug + style detection on a short snippet."""

    TASK_ID = "task_1_easy"
    CATEGORIES = [ReviewCategory.BUG, ReviewCategory.STYLE]

    def grade(self, action: Action, ground_truth: List[ReviewComment]) -> dict:
        """Score a review: recall minus false-positive and missed-issue penalties.

        Returns a dict with the final score in [0, 1], a per-component
        breakdown, and match counts.
        """
        credited: set[int] = set()
        earned = 0.0
        false_positive = 0.0

        for comment in action.comments:
            if comment.category not in self.CATEGORIES:
                false_positive += 0.05  # off-topic category
                continue
            match, truth_idx = _match_comment_to_truth(comment, ground_truth, credited)
            if not (match.matched or match.partial):
                false_positive += 0.1  # fabricated issue
                continue
            earned += match.score
            if truth_idx is not None:
                credited.add(truth_idx)

        # Best achievable TP score for the in-scope categories.
        achievable = sum(SEVERITY_WEIGHT[t.severity]
                         for t in ground_truth if t.category in self.CATEGORIES)
        recall = earned / achievable if achievable > 0 else 0.0

        # Penalise in-scope HIGH/CRITICAL issues the agent never found.
        missed = 0.0
        for truth_idx, truth in enumerate(ground_truth):
            if (truth_idx not in credited
                    and truth.severity in (Severity.HIGH, Severity.CRITICAL)
                    and truth.category in self.CATEGORIES):
                missed += 0.15

        capped_fp = min(false_positive, 0.3)
        final = round(max(0.0, min(1.0, recall - capped_fp - missed)), 4)

        return {
            "score": final,
            "breakdown": {
                "recall": round(recall, 4),
                "fp_penalty": round(-capped_fp, 4),
                "missed_critical_penalty": round(-missed, 4),
            },
            "matched_count": len(credited),
            "total_ground_truth": len([t for t in ground_truth
                                       if t.category in self.CATEGORIES]),
        }
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ---------------------------------------------------------------------------
|
| 189 |
+
# Task 2 – Medium (Security + Performance)
|
| 190 |
+
# ---------------------------------------------------------------------------
|
| 191 |
+
|
| 192 |
+
class Task2Grader(BaseGrader):
    """Medium task: security + performance audit."""

    TASK_ID = "task_2_medium"
    CATEGORIES = [ReviewCategory.SECURITY, ReviewCategory.PERFORMANCE]

    def grade(self, action: Action, ground_truth: List[ReviewComment]) -> dict:
        """Score the audit; missed security issues carry heavy penalties."""
        credited: set[int] = set()
        earned = 0.0
        false_positive = 0.0

        for comment in action.comments:
            if comment.category not in self.CATEGORIES:
                false_positive += 0.03  # off-topic category
                continue
            match, truth_idx = _match_comment_to_truth(comment, ground_truth, credited)
            if not (match.matched or match.partial):
                false_positive += 0.12  # fabricated issue
                continue
            earned += match.score
            if truth_idx is not None:
                credited.add(truth_idx)

        achievable = sum(SEVERITY_WEIGHT[t.severity]
                         for t in ground_truth if t.category in self.CATEGORIES)
        recall = earned / achievable if achievable > 0 else 0.0

        # Missed security issues: criticals cost double what highs do.
        missed = 0.0
        for truth_idx, truth in enumerate(ground_truth):
            if truth_idx in credited or truth.category != ReviewCategory.SECURITY:
                continue
            if truth.severity == Severity.CRITICAL:
                missed += 0.20
            elif truth.severity == Severity.HIGH:
                missed += 0.10

        capped_fp = min(false_positive, 0.3)
        final = round(max(0.0, min(1.0, recall - capped_fp - missed)), 4)

        return {
            "score": final,
            "breakdown": {
                "recall": round(recall, 4),
                "fp_penalty": round(-capped_fp, 4),
                "missed_security_penalty": round(-missed, 4),
            },
            "matched_count": len(credited),
            "total_ground_truth": len([t for t in ground_truth
                                       if t.category in self.CATEGORIES]),
        }
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# ---------------------------------------------------------------------------
|
| 243 |
+
# Task 3 – Hard (All categories + summary required)
|
| 244 |
+
# ---------------------------------------------------------------------------
|
| 245 |
+
|
| 246 |
+
class Task3Grader(BaseGrader):
    """Hard task: all categories, plus a required written summary."""

    TASK_ID = "task_3_hard"
    CATEGORIES = list(ReviewCategory)

    def grade(self, action: Action, ground_truth: List[ReviewComment]) -> dict:
        """Score the full review.

        Adds a summary-quality bonus (up to +0.15), penalises a missing
        summary (-0.10), and penalises missed HIGH/CRITICAL issues of any
        category.
        """
        credited: set[int] = set()
        earned = 0.0
        false_positive = 0.0

        # All categories are in scope, so every comment goes to matching.
        for comment in action.comments:
            match, truth_idx = _match_comment_to_truth(comment, ground_truth, credited)
            if not (match.matched or match.partial):
                false_positive += 0.08  # fabricated issue
                continue
            earned += match.score
            if truth_idx is not None:
                credited.add(truth_idx)

        achievable = sum(SEVERITY_WEIGHT[t.severity] for t in ground_truth)
        recall = earned / achievable if achievable > 0 else 0.0

        # Summary quality bonus: +0.025 per key theme mentioned, capped at 0.15.
        summary_bonus = 0.0
        if action.summary:
            text = action.summary.lower()
            themes = ["security", "injection", "pickle",
                      "performance", "documentation", "bug"]
            summary_bonus = min(0.15, sum(1 for kw in themes if kw in text) * 0.025)

        # The hard task requires an overall summary.
        summary_penalty = 0.0 if action.summary else 0.10

        # Missed HIGH/CRITICAL issues of any category.
        missed = 0.0
        for truth_idx, truth in enumerate(ground_truth):
            if truth_idx in credited:
                continue
            if truth.severity == Severity.CRITICAL:
                missed += 0.15
            elif truth.severity == Severity.HIGH:
                missed += 0.08

        capped_fp = min(false_positive, 0.3)
        raw = recall + summary_bonus - capped_fp - missed - summary_penalty
        final = round(max(0.0, min(1.0, raw)), 4)

        return {
            "score": final,
            "breakdown": {
                "recall": round(recall, 4),
                "summary_bonus": round(summary_bonus, 4),
                "fp_penalty": round(-capped_fp, 4),
                "missed_critical_penalty": round(-missed, 4),
                "summary_penalty": round(-summary_penalty, 4),
            },
            "matched_count": len(credited),
            "total_ground_truth": len(ground_truth),
        }
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
# ---------------------------------------------------------------------------
|
| 306 |
+
# Registry
|
| 307 |
+
# ---------------------------------------------------------------------------
|
| 308 |
+
|
| 309 |
+
# Task-id → grader-instance registry used by the environment; each grader
# is keyed by its own TASK_ID attribute.
GRADERS: dict[str, BaseGrader] = {
    grader.TASK_ID: grader
    for grader in (Task1Grader(), Task2Grader(), Task3Grader())
}
|
inference.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Inference Script for CodeReview OpenEnv
|
| 4 |
+
===================================
|
| 5 |
+
MANDATORY
|
| 6 |
+
- Before submitting, ensure the following variables are defined in your environment configuration:
|
| 7 |
+
API_BASE_URL The API endpoint for the LLM.
|
| 8 |
+
MODEL_NAME The model identifier to use for inference.
|
| 9 |
+
HF_TOKEN Your Hugging Face / API key.
|
| 10 |
+
|
| 11 |
+
- Defaults are set only for API_BASE_URL and MODEL_NAME
|
| 12 |
+
(and should reflect your active inference setup):
|
| 13 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
|
| 14 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
|
| 15 |
+
|
| 16 |
+
- The inference script must be named `inference.py` and placed in the root directory of the project
|
| 17 |
+
- Participants must use OpenAI Client for all LLM calls using above variables
|
| 18 |
+
|
| 19 |
+
STDOUT FORMAT
|
| 20 |
+
- The script must emit exactly three line types to stdout, in this order:
|
| 21 |
+
|
| 22 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 23 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 24 |
+
[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
|
| 25 |
+
|
| 26 |
+
Rules:
|
| 27 |
+
- One [START] line at episode begin.
|
| 28 |
+
- One [STEP] line per step, immediately after env.step() returns.
|
| 29 |
+
- One [END] line after the episode, always emitted (even on exception).
|
| 30 |
+
- reward and rewards are formatted to 2 decimal places.
|
| 31 |
+
- done and success are lowercase booleans: true or false.
|
| 32 |
+
- error is the raw last_action_error string, or null if none.
|
| 33 |
+
- All fields on a single line with no newlines within a line.
|
| 34 |
+
- Each task should return score in [0, 1]
|
| 35 |
+
|
| 36 |
+
Example:
|
| 37 |
+
[START] task=task_1_easy env=code_review model=Qwen/Qwen2.5-72B-Instruct
|
| 38 |
+
[STEP] step=1 action=review(comments=6,submit=true) reward=0.85 done=true error=null
|
| 39 |
+
[END] success=true steps=1 score=0.850 rewards=0.85
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
from __future__ import annotations
|
| 43 |
+
|
| 44 |
+
import asyncio
|
| 45 |
+
import json
|
| 46 |
+
import os
|
| 47 |
+
import sys
|
| 48 |
+
import textwrap
|
| 49 |
+
from typing import Any, Dict, List, Optional
|
| 50 |
+
|
| 51 |
+
from openai import OpenAI
|
| 52 |
+
|
| 53 |
+
# Ensure project root is on the import path
|
| 54 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 55 |
+
|
| 56 |
+
from env.environment import CodeReviewEnv, TASK_SPECS
|
| 57 |
+
from env.models import Action, ReviewComment, ReviewCategory, Severity
|
| 58 |
+
|
| 59 |
+
# ---------------------------------------------------------------------------
|
| 60 |
+
# Configuration
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
|
| 63 |
+
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME") # If using docker image
|
| 64 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 65 |
+
|
| 66 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 67 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 68 |
+
BENCHMARK = os.getenv("BENCHMARK", "code_review")
|
| 69 |
+
TASK_NAME = os.getenv("CODE_REVIEW_TASK", "all") # "all" or a specific task id
|
| 70 |
+
TASKS = ["task_1_easy", "task_2_medium", "task_3_hard"]
|
| 71 |
+
TEMPERATURE = 0.2
|
| 72 |
+
MAX_TOKENS = 2048
|
| 73 |
+
|
| 74 |
+
# ---------------------------------------------------------------------------
|
| 75 |
+
# System prompt for code review
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
|
| 78 |
+
SYSTEM_PROMPT = textwrap.dedent("""
|
| 79 |
+
You are an expert Python code reviewer.
|
| 80 |
+
You will be given a code snippet along with review instructions.
|
| 81 |
+
Your job is to produce a JSON action object that identifies issues in the code.
|
| 82 |
+
|
| 83 |
+
The JSON object you return must match this schema exactly:
|
| 84 |
+
{
|
| 85 |
+
"comments": [
|
| 86 |
+
{
|
| 87 |
+
"line": <int or null>,
|
| 88 |
+
"category": <"bug"|"security"|"performance"|"style"|"documentation">,
|
| 89 |
+
"severity": <"low"|"medium"|"high"|"critical">,
|
| 90 |
+
"message": "<clear description of the issue>",
|
| 91 |
+
"suggestion": "<optional fix>"
|
| 92 |
+
}
|
| 93 |
+
],
|
| 94 |
+
"summary": "<overall assessment – required for hard tasks, optional otherwise>",
|
| 95 |
+
"submit": true
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
Rules:
|
| 99 |
+
- Only flag genuine issues. Do not fabricate problems.
|
| 100 |
+
- Be precise about line numbers (1-indexed from the code).
|
| 101 |
+
- Match the categories listed in the instructions.
|
| 102 |
+
- Always set "submit": true when you believe your review is complete.
|
| 103 |
+
- Return ONLY the JSON object. No markdown, no explanations.
|
| 104 |
+
""").strip()
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ---------------------------------------------------------------------------
|
| 108 |
+
# Logging helpers (exact STDOUT format from spec)
|
| 109 |
+
# ---------------------------------------------------------------------------
|
| 110 |
+
|
| 111 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory [START] line at the beginning of an episode."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one [STEP] line immediately after env.step() returns.

    ``reward`` is rendered at 2 decimal places, ``done`` as a lowercase
    boolean, and a missing error as the literal string ``null``.
    """
    fields = (
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error if error else 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the final [END] line; rewards are comma-joined at 2 dp."""
    joined = ",".join(f"{r:.2f}" for r in rewards)
    line = (f"[END] success={str(success).lower()} steps={steps} "
            f"score={score:.3f} rewards={joined}")
    print(line, flush=True)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# ---------------------------------------------------------------------------
|
| 133 |
+
# LLM interaction
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
|
| 136 |
+
def build_user_message(obs_dict: dict) -> str:
    """Build the LLM user prompt from an observation dict.

    Includes the task instructions, the snippet with 1-indexed line
    numbers inside a fenced code block, and (when present) a recap of
    previously submitted comments.
    """
    snippet = obs_dict["snippet"]
    history = obs_dict.get("previous_comments", [])

    # Right-aligned, 1-indexed line numbers for the snippet source.
    numbered = "\n".join(
        f"{lineno:3d} {text}"
        for lineno, text in enumerate(snippet["source"].splitlines(), start=1)
    )

    msg = "\n".join([
        "",
        obs_dict["instructions"],
        "",
        f"### File: {snippet['file_name']}",
        "```python",
        numbered,
        "```",
        "",
    ])

    if history:
        msg += f"\n### Your previous comments ({len(history)} so far):\n"
        for item in history:
            msg += " - L{} [{}] {}\n".format(
                item.get("line", "?"),
                item.get("category", "?"),
                item.get("message", "")[:80],
            )

    return msg.strip()
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def get_model_action(client: OpenAI, obs_dict: dict) -> dict:
    """Call the LLM and return a parsed action dict.

    Falls back to an empty submitted review on any request or JSON-parse
    failure so that an episode can always proceed.
    """
    prompt = build_user_message(obs_dict)

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            response_format={"type": "json_object"},
            stream=False,
        )
        payload = (response.choices[0].message.content or "{}").strip()
        return json.loads(payload)
    except Exception as exc:
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return {"comments": [], "submit": True}
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# ---------------------------------------------------------------------------
|
| 192 |
+
# Action parsing
|
| 193 |
+
# ---------------------------------------------------------------------------
|
| 194 |
+
|
| 195 |
+
def parse_action(action_dict: dict) -> Action:
    """Convert a raw action dict into a typed Action model.

    Malformed comment entries are dropped rather than failing the step.
    """
    parsed: List[ReviewComment] = []
    for raw in action_dict.get("comments", []):
        try:
            parsed.append(
                ReviewComment(
                    line=raw.get("line"),
                    category=ReviewCategory(raw.get("category", "bug")),
                    severity=Severity(raw.get("severity", "medium")),
                    message=raw.get("message", ""),
                    suggestion=raw.get("suggestion"),
                )
            )
        except Exception:
            continue  # skip malformed comments

    return Action(
        comments=parsed,
        summary=action_dict.get("summary"),
        submit=action_dict.get("submit", True),
    )
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def format_action_str(action_dict: dict) -> str:
    """Format an action dict into a compact string for STEP logging."""
    comment_count = len(action_dict.get("comments", []))
    submit_flag = str(action_dict.get("submit", False)).lower()
    return f"review(comments={comment_count},submit={submit_flag})"
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# ---------------------------------------------------------------------------
|
| 225 |
+
# Task runner
|
| 226 |
+
# ---------------------------------------------------------------------------
|
| 227 |
+
|
| 228 |
+
async def run_task(task_id: str, client: OpenAI) -> dict:
    """Run a single code-review task episode and return its results.

    Emits exactly one [START] line, one [STEP] line per env.step(), and
    always one [END] line (even on exception). The returned dict holds
    the task id, the clamped final score, and a success flag.
    """
    env = CodeReviewEnv(task_id=task_id)
    obs = env.reset()

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    try:
        for step_no in range(1, env.spec.max_steps + 1):
            observation = obs.model_dump()

            # Query the model and convert its JSON into a typed Action.
            raw_action = get_model_action(client, observation)
            typed_action = parse_action(raw_action)

            result = env.step(typed_action)
            step_reward = result.reward.value

            rewards.append(step_reward)
            steps_taken = step_no

            # NOTE(review): error is always logged as null; if the env
            # exposes a last_action_error it could be surfaced here.
            log_step(step=step_no, action=format_action_str(raw_action),
                     reward=step_reward, done=result.done, error=None)

            obs = result.observation

            if result.done:
                score = result.info.get("grader", {}).get("score", 0.0)
                success = score >= env.spec.passing_threshold
                break

    except Exception as exc:
        print(f"[DEBUG] Error during task {task_id}: {exc}", flush=True)

    finally:
        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {"task_id": task_id, "score": score, "success": success}
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# ---------------------------------------------------------------------------
|
| 280 |
+
# Main
|
| 281 |
+
# ---------------------------------------------------------------------------
|
| 282 |
+
|
| 283 |
+
async def main() -> None:
    """Run every requested task sequentially and print a debug summary."""
    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

    selected = TASKS if TASK_NAME == "all" else [TASK_NAME]

    results: List[dict] = []
    for task_id in selected:
        outcome = await run_task(task_id, client)
        results.append(outcome)

    # Debug summary on stderr (not part of the stdout spec).
    avg = sum(r["score"] for r in results) / len(results) if results else 0.0
    passed = sum(1 for r in results if r["success"])
    print(
        f"\n[SUMMARY] tasks={len(results)} passed={passed} avg_score={avg:.3f}",
        file=sys.stderr,
        flush=True,
    )


if __name__ == "__main__":
    asyncio.run(main())
|
openenv-code-review.tar.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d389c58d2d84185dae21c86ccda3422c3bec52ab239859e93b90f721e5ce7fe1
|
| 3 |
+
size 50132
|
openenv.yaml
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: code-review-env
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
An OpenEnv-compliant AI training environment that simulates professional
|
| 5 |
+
Python code review. Agents learn to identify bugs, security vulnerabilities,
|
| 6 |
+
performance issues, style problems, and documentation gaps across three
|
| 7 |
+
progressively harder tasks.
|
| 8 |
+
|
| 9 |
+
tags:
|
| 10 |
+
- openenv
|
| 11 |
+
- code-review
|
| 12 |
+
- python
|
| 13 |
+
- security
|
| 14 |
+
- software-engineering
|
| 15 |
+
|
| 16 |
+
author: imaginephoenix / rawgenn.tech
|
| 17 |
+
license: MIT
|
| 18 |
+
|
| 19 |
+
environment:
|
| 20 |
+
class: CodeReviewEnv
|
| 21 |
+
module: env.environment
|
| 22 |
+
entrypoint: app.py
|
| 23 |
+
framework: fastapi
|
| 24 |
+
|
| 25 |
+
observation_space:
|
| 26 |
+
type: object
|
| 27 |
+
description: >
|
| 28 |
+
What the agent sees each step. Contains the code snippet to review,
|
| 29 |
+
task instructions, all previously submitted comments, and optional
|
| 30 |
+
feedback from the last step.
|
| 31 |
+
fields:
|
| 32 |
+
task_id:
|
| 33 |
+
type: string
|
| 34 |
+
description: Identifier of the active task
|
| 35 |
+
step:
|
| 36 |
+
type: integer
|
| 37 |
+
description: Current step number (0-indexed)
|
| 38 |
+
snippet:
|
| 39 |
+
type: object
|
| 40 |
+
description: Python source code to review
|
| 41 |
+
fields:
|
| 42 |
+
file_name: { type: string }
|
| 43 |
+
source: { type: string, description: "Full Python source with line numbers" }
|
| 44 |
+
language: { type: string, const: "python" }
|
| 45 |
+
instructions:
|
| 46 |
+
type: string
|
| 47 |
+
description: Review instructions and scope for this task
|
| 48 |
+
previous_comments:
|
| 49 |
+
type: array
|
| 50 |
+
description: All review comments submitted in prior steps
|
| 51 |
+
feedback:
|
| 52 |
+
type: string
|
| 53 |
+
nullable: true
|
| 54 |
+
description: Environment feedback on the most recent action
|
| 55 |
+
done:
|
| 56 |
+
type: boolean
|
| 57 |
+
|
| 58 |
+
action_space:
|
| 59 |
+
type: object
|
| 60 |
+
description: >
|
| 61 |
+
What the agent submits. A list of review comments (each with line,
|
| 62 |
+
category, severity, message, optional suggestion) plus an optional
|
| 63 |
+
overall summary and a submit flag.
|
| 64 |
+
fields:
|
| 65 |
+
comments:
|
| 66 |
+
type: array
|
| 67 |
+
items:
|
| 68 |
+
type: object
|
| 69 |
+
fields:
|
| 70 |
+
line: { type: integer, nullable: true, description: "1-indexed line number" }
|
| 71 |
+
category:
|
| 72 |
+
type: string
|
| 73 |
+
enum: [bug, security, performance, style, documentation]
|
| 74 |
+
severity:
|
| 75 |
+
type: string
|
| 76 |
+
enum: [low, medium, high, critical]
|
| 77 |
+
message: { type: string, minLength: 5, maxLength: 500 }
|
| 78 |
+
suggestion: { type: string, nullable: true, maxLength: 500 }
|
| 79 |
+
summary:
|
| 80 |
+
type: string
|
| 81 |
+
nullable: true
|
| 82 |
+
description: "Required for task_3_hard; optional otherwise"
|
| 83 |
+
submit:
|
| 84 |
+
type: boolean
|
| 85 |
+
description: "Set true to finalise the review and trigger the grader"
|
| 86 |
+
|
| 87 |
+
reward:
|
| 88 |
+
type: float
|
| 89 |
+
range: [-1.0, 1.0]
|
| 90 |
+
description: >
|
| 91 |
+
Shaped reward with partial progress signals. Incremental positive reward
|
| 92 |
+
for each new valid comment added (proportional to issue severity). On
|
| 93 |
+
submit: final grader score mapped to [-0.2, 1.0]. Penalties for false
|
| 94 |
+
positives, missed criticals, and spamming low-quality comments.
|
| 95 |
+
|
| 96 |
+
tasks:
|
| 97 |
+
- id: task_1_easy
|
| 98 |
+
title: "Bug Detection & Style Review"
|
| 99 |
+
difficulty: easy
|
| 100 |
+
categories: [bug, style]
|
| 101 |
+
max_steps: 5
|
| 102 |
+
passing_threshold: 0.55
|
| 103 |
+
description: >
|
| 104 |
+
Review calculator.py (31 lines) for division-by-zero bugs, off-by-one
|
| 105 |
+
errors, empty-collection crashes, and Python style anti-patterns.
|
| 106 |
+
|
| 107 |
+
- id: task_2_medium
|
| 108 |
+
title: "Security & Performance Audit"
|
| 109 |
+
difficulty: medium
|
| 110 |
+
categories: [security, performance]
|
| 111 |
+
max_steps: 7
|
| 112 |
+
passing_threshold: 0.60
|
| 113 |
+
description: >
|
| 114 |
+
Audit user_service.py (55 lines) for SQL injection, broken MD5 password
|
| 115 |
+
hashing, unbounded DB queries, and connection churn. Missed critical
|
| 116 |
+
security issues carry heavy penalties.
|
| 117 |
+
|
| 118 |
+
- id: task_3_hard
|
| 119 |
+
title: "Comprehensive Code Review"
|
| 120 |
+
difficulty: hard
|
| 121 |
+
categories: [bug, security, performance, style, documentation]
|
| 122 |
+
max_steps: 10
|
| 123 |
+
passing_threshold: 0.65
|
| 124 |
+
description: >
|
| 125 |
+
Full production-grade review of data_pipeline.py (49 lines). Covers
|
| 126 |
+
all five categories including shell injection, unsafe pickle
|
| 127 |
+
deserialization, ZeroDivisionError, and missing docstrings. An overall
|
| 128 |
+
written summary is required.
|
| 129 |
+
|
| 130 |
+
api_endpoints:
|
| 131 |
+
- path: /reset
|
| 132 |
+
method: POST
|
| 133 |
+
description: Start or restart an episode
|
| 134 |
+
- path: /step
|
| 135 |
+
method: POST
|
| 136 |
+
description: Submit an action
|
| 137 |
+
- path: /state
|
| 138 |
+
method: GET
|
| 139 |
+
description: Get full serialisable state
|
| 140 |
+
- path: /tasks
|
| 141 |
+
method: GET
|
| 142 |
+
description: List all available tasks
|
| 143 |
+
- path: /health
|
| 144 |
+
method: GET
|
| 145 |
+
description: Health check
|
| 146 |
+
|
| 147 |
+
baseline:
|
| 148 |
+
model: gpt-4o
|
| 149 |
+
script: baseline_agent.py
|
| 150 |
+
expected_scores:
|
| 151 |
+
task_1_easy: ~0.75
|
| 152 |
+
task_2_medium: ~0.65
|
| 153 |
+
task_3_hard: ~0.55
|
| 154 |
+
|
| 155 |
+
docker:
|
| 156 |
+
base_image: python:3.11-slim
|
| 157 |
+
port: 7860
|
| 158 |
+
build: docker build -t code-review-env .
|
| 159 |
+
run: docker run -p 7860:7860 code-review-env
|
| 160 |
+
|
| 161 |
+
huggingface:
|
| 162 |
+
space_sdk: docker
|
| 163 |
+
tags: [openenv, code-review, ai-agent, evaluation]
|
pyproject.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "code-review-env"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "An OpenEnv-compliant AI training environment that simulates professional Python code review."
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
license = { text = "MIT" }
|
| 7 |
+
requires-python = ">=3.11"
|
| 8 |
+
|
| 9 |
+
dependencies = [
|
| 10 |
+
"fastapi>=0.104.0",
|
| 11 |
+
"uvicorn[standard]>=0.24.0",
|
| 12 |
+
"pydantic>=2.5.0",
|
| 13 |
+
"requests>=2.31.0",
|
| 14 |
+
"openai>=1.6.0",
|
| 15 |
+
"openenv-core>=0.2.0",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
[project.optional-dependencies]
|
| 19 |
+
dev = [
|
| 20 |
+
"pytest>=7.4.0",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
[project.scripts]
|
| 24 |
+
server = "server.app:main"
|
| 25 |
+
|
| 26 |
+
[build-system]
|
| 27 |
+
requires = ["setuptools>=68.0", "wheel"]
|
| 28 |
+
build-backend = "setuptools.build_meta"
|
| 29 |
+
|
| 30 |
+
[tool.setuptools.packages.find]
|
| 31 |
+
include = ["env*", "corpus*", "graders*", "server*"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.104.0
|
| 2 |
+
uvicorn[standard]>=0.24.0
|
| 3 |
+
pydantic>=2.5.0
|
| 4 |
+
requests>=2.31.0
|
| 5 |
+
openai>=1.6.0
|
| 6 |
+
pytest>=7.4.0
|
| 7 |
+
openenv-core>=0.2.0
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# server package
|
server/app.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Server entry point for CodeReview OpenEnv.
|
| 3 |
+
|
| 4 |
+
This module provides the main() entry point used by:
|
| 5 |
+
- pyproject.toml [project.scripts] server = "server.app:main"
|
| 6 |
+
- openenv serve
|
| 7 |
+
- uv run server
|
| 8 |
+
|
| 9 |
+
It imports and runs the FastAPI app defined in the root app.py.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
# Ensure project root is importable
|
| 18 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main(host: str = "0.0.0.0", port: int = 7860, workers: int = 1) -> None:
|
| 22 |
+
"""Start the CodeReview OpenEnv server."""
|
| 23 |
+
import uvicorn
|
| 24 |
+
|
| 25 |
+
uvicorn.run(
|
| 26 |
+
"app:app",
|
| 27 |
+
host=host,
|
| 28 |
+
port=int(os.environ.get("PORT", port)),
|
| 29 |
+
workers=workers,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
|
| 34 |
+
main()
|
templates/index.html
ADDED
|
@@ -0,0 +1,807 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>CodeReview OpenEnv</title>
|
| 7 |
+
<!-- Google Fonts for modern typography -->
|
| 8 |
+
<link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;700&family=Fira+Code:wght@400;500&display=swap" rel="stylesheet">
|
| 9 |
+
<!-- PrismJS for code syntax highlighting -->
|
| 10 |
+
<link href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/themes/prism-tomorrow.min.css" rel="stylesheet" />
|
| 11 |
+
<style>
|
| 12 |
+
:root {
|
| 13 |
+
--bg-color: #0b1120;
|
| 14 |
+
--surface-color: rgba(30, 41, 59, 0.7);
|
| 15 |
+
--surface-border: rgba(255, 255, 255, 0.08);
|
| 16 |
+
--text-primary: #f8fafc;
|
| 17 |
+
--text-secondary: #94a3b8;
|
| 18 |
+
--primary-accent: #3b82f6;
|
| 19 |
+
--primary-glow: rgba(59, 130, 246, 0.5);
|
| 20 |
+
--secondary-accent: #8b5cf6;
|
| 21 |
+
--danger: #ef4444;
|
| 22 |
+
--success: #10b981;
|
| 23 |
+
--warning: #f59e0b;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
body {
|
| 27 |
+
margin: 0;
|
| 28 |
+
padding: 0;
|
| 29 |
+
font-family: 'Outfit', sans-serif;
|
| 30 |
+
background-color: var(--bg-color);
|
| 31 |
+
background-image:
|
| 32 |
+
radial-gradient(at 0% 0%, rgba(59, 130, 246, 0.15) 0px, transparent 50%),
|
| 33 |
+
radial-gradient(at 100% 100%, rgba(139, 92, 246, 0.15) 0px, transparent 50%);
|
| 34 |
+
background-attachment: fixed;
|
| 35 |
+
color: var(--text-primary);
|
| 36 |
+
min-height: 100vh;
|
| 37 |
+
display: flex;
|
| 38 |
+
flex-direction: column;
|
| 39 |
+
overflow-x: hidden;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
/* Glassmorphism Classes */
|
| 43 |
+
.glass-panel {
|
| 44 |
+
background: var(--surface-color);
|
| 45 |
+
backdrop-filter: blur(12px);
|
| 46 |
+
-webkit-backdrop-filter: blur(12px);
|
| 47 |
+
border: 1px solid var(--surface-border);
|
| 48 |
+
border-radius: 16px;
|
| 49 |
+
box-shadow: 0 4px 30px rgba(0, 0, 0, 0.1);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
header {
|
| 53 |
+
padding: 20px 40px;
|
| 54 |
+
display: flex;
|
| 55 |
+
justify-content: space-between;
|
| 56 |
+
align-items: center;
|
| 57 |
+
border-bottom: 1px solid var(--surface-border);
|
| 58 |
+
background: rgba(11, 17, 32, 0.8);
|
| 59 |
+
backdrop-filter: blur(8px);
|
| 60 |
+
position: sticky;
|
| 61 |
+
top: 0;
|
| 62 |
+
z-index: 100;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
h1 {
|
| 66 |
+
margin: 0;
|
| 67 |
+
font-size: 1.5rem;
|
| 68 |
+
font-weight: 700;
|
| 69 |
+
background: linear-gradient(135deg, #fff, #94a3b8);
|
| 70 |
+
-webkit-background-clip: text;
|
| 71 |
+
-webkit-text-fill-color: transparent;
|
| 72 |
+
display: flex;
|
| 73 |
+
align-items: center;
|
| 74 |
+
gap: 10px;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
.controls {
|
| 78 |
+
display: flex;
|
| 79 |
+
gap: 15px;
|
| 80 |
+
align-items: center;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
select, button, input, textarea {
|
| 84 |
+
font-family: 'Outfit', sans-serif;
|
| 85 |
+
outline: none;
|
| 86 |
+
transition: all 0.3s ease;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
select {
|
| 90 |
+
padding: 10px 16px;
|
| 91 |
+
background: rgba(30, 41, 59, 0.8);
|
| 92 |
+
color: white;
|
| 93 |
+
border: 1px solid var(--surface-border);
|
| 94 |
+
border-radius: 8px;
|
| 95 |
+
font-size: 0.95rem;
|
| 96 |
+
cursor: pointer;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
select:focus {
|
| 100 |
+
border-color: var(--primary-accent);
|
| 101 |
+
box-shadow: 0 0 0 2px var(--primary-glow);
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
button {
|
| 105 |
+
padding: 10px 20px;
|
| 106 |
+
border: none;
|
| 107 |
+
border-radius: 8px;
|
| 108 |
+
font-weight: 600;
|
| 109 |
+
cursor: pointer;
|
| 110 |
+
display: inline-flex;
|
| 111 |
+
align-items: center;
|
| 112 |
+
gap: 8px;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
.btn-primary {
|
| 116 |
+
background: linear-gradient(135deg, var(--primary-accent), var(--secondary-accent));
|
| 117 |
+
color: white;
|
| 118 |
+
box-shadow: 0 4px 15px var(--primary-glow);
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
.btn-primary:hover {
|
| 122 |
+
transform: translateY(-2px);
|
| 123 |
+
box-shadow: 0 6px 20px var(--primary-glow);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
.btn-primary:active {
|
| 127 |
+
transform: translateY(1px);
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.btn-outline {
|
| 131 |
+
background: transparent;
|
| 132 |
+
color: var(--text-primary);
|
| 133 |
+
border: 1px solid var(--surface-border);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.btn-outline:hover {
|
| 137 |
+
background: rgba(255, 255, 255, 0.05);
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
main {
|
| 141 |
+
display: flex;
|
| 142 |
+
flex: 1;
|
| 143 |
+
padding: 20px;
|
| 144 |
+
gap: 20px;
|
| 145 |
+
height: calc(100vh - 100px);
|
| 146 |
+
box-sizing: border-box;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
/* Loading Overlay */
|
| 150 |
+
#loader {
|
| 151 |
+
position: fixed;
|
| 152 |
+
top: 0; left: 0; right: 0; bottom: 0;
|
| 153 |
+
background: var(--bg-color);
|
| 154 |
+
display: flex;
|
| 155 |
+
justify-content: center;
|
| 156 |
+
align-items: center;
|
| 157 |
+
z-index: 1000;
|
| 158 |
+
transition: opacity 0.5s ease;
|
| 159 |
+
}
|
| 160 |
+
.spinner {
|
| 161 |
+
width: 50px;
|
| 162 |
+
height: 50px;
|
| 163 |
+
border: 3px solid rgba(255,255,255,0.1);
|
| 164 |
+
border-radius: 50%;
|
| 165 |
+
border-top-color: var(--primary-accent);
|
| 166 |
+
animation: spin 1s ease-in-out infinite;
|
| 167 |
+
}
|
| 168 |
+
@keyframes spin { to { transform: rotate(360deg); } }
|
| 169 |
+
|
| 170 |
+
/* Left Pane - Code Snippet */
|
| 171 |
+
.pane-left {
|
| 172 |
+
flex: 1.2;
|
| 173 |
+
display: flex;
|
| 174 |
+
flex-direction: column;
|
| 175 |
+
overflow: hidden;
|
| 176 |
+
animation: slideInLeft 0.5s cubic-bezier(0.16, 1, 0.3, 1);
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
.pane-header {
|
| 180 |
+
padding: 15px 20px;
|
| 181 |
+
border-bottom: 1px solid var(--surface-border);
|
| 182 |
+
font-weight: 600;
|
| 183 |
+
display: flex;
|
| 184 |
+
justify-content: space-between;
|
| 185 |
+
align-items: center;
|
| 186 |
+
font-size: 0.9rem;
|
| 187 |
+
color: var(--text-secondary);
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.code-container {
|
| 191 |
+
flex: 1;
|
| 192 |
+
overflow: auto;
|
| 193 |
+
border-bottom-left-radius: 16px;
|
| 194 |
+
border-bottom-right-radius: 16px;
|
| 195 |
+
background: rgba(0, 0, 0, 0.2);
|
| 196 |
+
padding: 0;
|
| 197 |
+
margin: 0;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
pre[class*="language-"] {
|
| 201 |
+
margin: 0;
|
| 202 |
+
padding: 20px;
|
| 203 |
+
background: transparent !important;
|
| 204 |
+
font-family: 'Fira Code', monospace;
|
| 205 |
+
font-size: 0.9rem;
|
| 206 |
+
line-height: 1.5;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
/* Right Pane - Instructions & Actions */
|
| 210 |
+
.pane-right {
|
| 211 |
+
flex: 0.8;
|
| 212 |
+
display: flex;
|
| 213 |
+
flex-direction: column;
|
| 214 |
+
gap: 20px;
|
| 215 |
+
overflow-y: auto;
|
| 216 |
+
animation: slideInRight 0.5s cubic-bezier(0.16, 1, 0.3, 1);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.card {
|
| 220 |
+
padding: 20px;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
.card-title {
|
| 224 |
+
font-size: 1.1rem;
|
| 225 |
+
font-weight: 600;
|
| 226 |
+
margin-top: 0;
|
| 227 |
+
margin-bottom: 15px;
|
| 228 |
+
color: var(--text-primary);
|
| 229 |
+
display: flex;
|
| 230 |
+
align-items: center;
|
| 231 |
+
gap: 10px;
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
.instructions-text {
|
| 235 |
+
font-size: 0.95rem;
|
| 236 |
+
color: var(--text-secondary);
|
| 237 |
+
line-height: 1.6;
|
| 238 |
+
white-space: pre-wrap;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
.badge {
|
| 242 |
+
display: inline-block;
|
| 243 |
+
padding: 4px 10px;
|
| 244 |
+
border-radius: 20px;
|
| 245 |
+
font-size: 0.75rem;
|
| 246 |
+
font-weight: 600;
|
| 247 |
+
background: rgba(255, 255, 255, 0.1);
|
| 248 |
+
text-transform: uppercase;
|
| 249 |
+
letter-spacing: 0.5px;
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
.badge.easy { color: var(--success); background: rgba(16, 185, 129, 0.1); }
|
| 253 |
+
.badge.medium { color: var(--warning); background: rgba(245, 158, 11, 0.1); }
|
| 254 |
+
.badge.hard { color: var(--danger); background: rgba(239, 68, 68, 0.1); }
|
| 255 |
+
|
| 256 |
+
/* Comments Form */
|
| 257 |
+
.form-group {
|
| 258 |
+
margin-bottom: 15px;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
.form-group label {
|
| 262 |
+
display: block;
|
| 263 |
+
margin-bottom: 8px;
|
| 264 |
+
font-size: 0.85rem;
|
| 265 |
+
color: var(--text-secondary);
|
| 266 |
+
font-weight: 500;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
.form-row {
|
| 270 |
+
display: flex;
|
| 271 |
+
gap: 15px;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.form-row > div {
|
| 275 |
+
flex: 1;
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
input[type="number"], input[type="text"], textarea {
|
| 279 |
+
width: 100%;
|
| 280 |
+
padding: 10px 12px;
|
| 281 |
+
background: rgba(15, 23, 42, 0.6);
|
| 282 |
+
border: 1px solid var(--surface-border);
|
| 283 |
+
border-radius: 8px;
|
| 284 |
+
color: white;
|
| 285 |
+
box-sizing: border-box;
|
| 286 |
+
resize: vertical;
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
input:focus, textarea:focus {
|
| 290 |
+
border-color: var(--primary-accent);
|
| 291 |
+
box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.2);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
/* Staged Comments List */
|
| 295 |
+
.comments-list {
|
| 296 |
+
margin-top: 15px;
|
| 297 |
+
display: flex;
|
| 298 |
+
flex-direction: column;
|
| 299 |
+
gap: 10px;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
.comment-item {
|
| 303 |
+
background: rgba(0, 0, 0, 0.2);
|
| 304 |
+
border-left: 3px solid var(--primary-accent);
|
| 305 |
+
padding: 12px 15px;
|
| 306 |
+
border-radius: 4px 8px 8px 4px;
|
| 307 |
+
font-size: 0.9rem;
|
| 308 |
+
animation: fadeIn 0.3s ease;
|
| 309 |
+
position: relative;
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
.comment-item .meta {
|
| 313 |
+
font-size: 0.8rem;
|
| 314 |
+
color: var(--text-secondary);
|
| 315 |
+
margin-bottom: 5px;
|
| 316 |
+
display: flex;
|
| 317 |
+
justify-content: space-between;
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
.comment-item .remove-btn {
|
| 321 |
+
position: absolute;
|
| 322 |
+
top: 10px;
|
| 323 |
+
right: 10px;
|
| 324 |
+
background: none;
|
| 325 |
+
border: none;
|
| 326 |
+
padding: 0;
|
| 327 |
+
color: var(--danger);
|
| 328 |
+
font-size: 1.2rem;
|
| 329 |
+
cursor: pointer;
|
| 330 |
+
box-shadow: none;
|
| 331 |
+
opacity: 0.6;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
.comment-item .remove-btn:hover {
|
| 335 |
+
opacity: 1;
|
| 336 |
+
transform: scale(1.1);
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
textarea#summary {
|
| 340 |
+
height: 80px;
|
| 341 |
+
margin-top: 10px;
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
.submit-section {
|
| 345 |
+
display: flex;
|
| 346 |
+
justify-content: flex-end;
|
| 347 |
+
margin-top: 20px;
|
| 348 |
+
border-top: 1px solid var(--surface-border);
|
| 349 |
+
padding-top: 20px;
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
/* Modal */
|
| 353 |
+
#result-modal {
|
| 354 |
+
position: fixed;
|
| 355 |
+
top: 0; left: 0; right: 0; bottom: 0;
|
| 356 |
+
background: rgba(0,0,0,0.7);
|
| 357 |
+
backdrop-filter: blur(5px);
|
| 358 |
+
display: none;
|
| 359 |
+
justify-content: center;
|
| 360 |
+
align-items: center;
|
| 361 |
+
z-index: 2000;
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
.modal-content {
|
| 365 |
+
width: 100%;
|
| 366 |
+
max-width: 500px;
|
| 367 |
+
padding: 30px;
|
| 368 |
+
text-align: center;
|
| 369 |
+
animation: popIn 0.4s cubic-bezier(0.16, 1, 0.3, 1);
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
.score-circle {
|
| 373 |
+
width: 120px;
|
| 374 |
+
height: 120px;
|
| 375 |
+
border-radius: 50%;
|
| 376 |
+
background: linear-gradient(135deg, var(--bg-color), rgba(30,41,59,1));
|
| 377 |
+
border: 4px solid var(--success);
|
| 378 |
+
display: flex;
|
| 379 |
+
flex-direction: column;
|
| 380 |
+
justify-content: center;
|
| 381 |
+
align-items: center;
|
| 382 |
+
margin: 0 auto 20px;
|
| 383 |
+
box-shadow: 0 0 30px rgba(16, 185, 129, 0.3);
|
| 384 |
+
position: relative;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
.score-circle.failed {
|
| 388 |
+
border-color: var(--danger);
|
| 389 |
+
box-shadow: 0 0 30px rgba(239, 68, 68, 0.3);
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
.score-value {
|
| 393 |
+
font-size: 2.5rem;
|
| 394 |
+
font-weight: 700;
|
| 395 |
+
margin: 0;
|
| 396 |
+
line-height: 1;
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
.score-label {
|
| 400 |
+
font-size: 0.8rem;
|
| 401 |
+
color: var(--text-secondary);
|
| 402 |
+
text-transform: uppercase;
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
.reward-breakdown {
|
| 406 |
+
text-align: left;
|
| 407 |
+
margin-top: 25px;
|
| 408 |
+
background: rgba(0,0,0,0.2);
|
| 409 |
+
padding: 15px;
|
| 410 |
+
border-radius: 8px;
|
| 411 |
+
font-size: 0.9rem;
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
/* Animations */
|
| 415 |
+
@keyframes slideInLeft {
|
| 416 |
+
from { transform: translateX(-30px); opacity: 0; }
|
| 417 |
+
to { transform: translateX(0); opacity: 1; }
|
| 418 |
+
}
|
| 419 |
+
@keyframes slideInRight {
|
| 420 |
+
from { transform: translateX(30px); opacity: 0; }
|
| 421 |
+
to { transform: translateX(0); opacity: 1; }
|
| 422 |
+
}
|
| 423 |
+
@keyframes fadeIn {
|
| 424 |
+
from { opacity: 0; transform: translateY(10px); }
|
| 425 |
+
to { opacity: 1; transform: translateY(0); }
|
| 426 |
+
}
|
| 427 |
+
@keyframes popIn {
|
| 428 |
+
0% { transform: scale(0.9); opacity: 0; }
|
| 429 |
+
70% { transform: scale(1.02); opacity: 1; }
|
| 430 |
+
100% { transform: scale(1); opacity: 1; }
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
/* Highlight specific lines */
|
| 434 |
+
.line-highlight {
|
| 435 |
+
background: rgba(239, 68, 68, 0.2);
|
| 436 |
+
display: inline-block;
|
| 437 |
+
width: 100%;
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
</style>
|
| 441 |
+
</head>
|
| 442 |
+
<body>
|
| 443 |
+
|
| 444 |
+
<div id="loader">
|
| 445 |
+
<div class="spinner"></div>
|
| 446 |
+
</div>
|
| 447 |
+
|
| 448 |
+
<header>
|
| 449 |
+
<h1>
|
| 450 |
+
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
| 451 |
+
<polyline points="16 18 22 12 16 6"></polyline>
|
| 452 |
+
<polyline points="8 6 2 12 8 18"></polyline>
|
| 453 |
+
</svg>
|
| 454 |
+
CodeReview Hub
|
| 455 |
+
</h1>
|
| 456 |
+
<div class="controls">
|
| 457 |
+
<select id="task-select">
|
| 458 |
+
<option value="">Loading tasks...</option>
|
| 459 |
+
</select>
|
| 460 |
+
<button class="btn-primary" onclick="initSession()">
|
| 461 |
+
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M2 12h4l2-9 5 18 3-10 4 3"/></svg>
|
| 462 |
+
Load Environment
|
| 463 |
+
</button>
|
| 464 |
+
</div>
|
| 465 |
+
</header>
|
| 466 |
+
|
| 467 |
+
<main id="main-content" style="opacity: 0; transition: opacity 0.5s ease;">
|
| 468 |
+
|
| 469 |
+
<div class="glass-panel pane-left">
|
| 470 |
+
<div class="pane-header">
|
| 471 |
+
<span id="file-name">filename.py</span>
|
| 472 |
+
<span class="badge" id="task-difficulty">EASY</span>
|
| 473 |
+
</div>
|
| 474 |
+
<div class="code-container">
|
| 475 |
+
<pre><code class="language-python" id="code-block"># Load a task to see code</code></pre>
|
| 476 |
+
</div>
|
| 477 |
+
</div>
|
| 478 |
+
|
| 479 |
+
<div class="pane-right">
|
| 480 |
+
|
| 481 |
+
<div class="glass-panel card">
|
| 482 |
+
<h2 class="card-title">
|
| 483 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="var(--primary-accent)" stroke-width="2"><circle cx="12" cy="12" r="10"/><path d="M12 16v-4"/><path d="M12 8h.01"/></svg>
|
| 484 |
+
Review Instructions
|
| 485 |
+
</h2>
|
| 486 |
+
<div class="instructions-text" id="instructions">
|
| 487 |
+
Select a task and click 'Load Environment' to begin your code review session.
|
| 488 |
+
</div>
|
| 489 |
+
</div>
|
| 490 |
+
|
| 491 |
+
<div class="glass-panel card" id="review-panel" style="display: none;">
|
| 492 |
+
<h2 class="card-title">
|
| 493 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="var(--secondary-accent)" stroke-width="2"><path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"/></svg>
|
| 494 |
+
Add Comment
|
| 495 |
+
</h2>
|
| 496 |
+
|
| 497 |
+
<div class="form-row">
|
| 498 |
+
<div class="form-group">
|
| 499 |
+
<label>Line # (optional)</label>
|
| 500 |
+
<input type="number" id="c-line" placeholder="e.g. 15" min="1">
|
| 501 |
+
</div>
|
| 502 |
+
<div class="form-group">
|
| 503 |
+
<label>Category</label>
|
| 504 |
+
<select id="c-category">
|
| 505 |
+
<option value="bug">Bug</option>
|
| 506 |
+
<option value="security">Security</option>
|
| 507 |
+
<option value="performance">Performance</option>
|
| 508 |
+
<option value="style">Style</option>
|
| 509 |
+
<option value="documentation">Documentation</option>
|
| 510 |
+
</select>
|
| 511 |
+
</div>
|
| 512 |
+
<div class="form-group">
|
| 513 |
+
<label>Severity</label>
|
| 514 |
+
<select id="c-severity">
|
| 515 |
+
<option value="low">Low</option>
|
| 516 |
+
<option value="medium" selected>Medium</option>
|
| 517 |
+
<option value="high">High</option>
|
| 518 |
+
<option value="critical">Critical</option>
|
| 519 |
+
</select>
|
| 520 |
+
</div>
|
| 521 |
+
</div>
|
| 522 |
+
|
| 523 |
+
<div class="form-group">
|
| 524 |
+
<label>Message</label>
|
| 525 |
+
<textarea id="c-message" rows="2" placeholder="Describe the issue..."></textarea>
|
| 526 |
+
</div>
|
| 527 |
+
|
| 528 |
+
<div class="form-group">
|
| 529 |
+
<label>Suggestion (optional)</label>
|
| 530 |
+
<input type="text" id="c-suggestion" placeholder="Proposed fix code...">
|
| 531 |
+
</div>
|
| 532 |
+
|
| 533 |
+
<button class="btn-outline" style="width: 100%; justify-content: center;" onclick="stageComment()">
|
| 534 |
+
+ Add to Review
|
| 535 |
+
</button>
|
| 536 |
+
|
| 537 |
+
<div class="comments-list" id="staged-comments">
|
| 538 |
+
<!-- Dynamic comments go here -->
|
| 539 |
+
</div>
|
| 540 |
+
|
| 541 |
+
<div class="form-group" style="margin-top: 20px; padding-top: 20px; border-top: 1px solid var(--surface-border);">
|
| 542 |
+
<label>Overall Summary (required for Hard tasks)</label>
|
| 543 |
+
<textarea id="summary" placeholder="Provide an overall assessment of the code quality..."></textarea>
|
| 544 |
+
</div>
|
| 545 |
+
|
| 546 |
+
<div class="submit-section">
|
| 547 |
+
<button class="btn-primary" onclick="submitReview()">
|
| 548 |
+
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M22 11.08V12a10 10 0 1 1-5.93-9.14"/><polyline points="22 4 12 14.01 9 11.01"/></svg>
|
| 549 |
+
Submit Review
|
| 550 |
+
</button>
|
| 551 |
+
</div>
|
| 552 |
+
</div>
|
| 553 |
+
|
| 554 |
+
</div>
|
| 555 |
+
</main>
|
| 556 |
+
|
| 557 |
+
<!-- Results Modal -->
|
| 558 |
+
<div id="result-modal">
|
| 559 |
+
<div class="glass-panel modal-content">
|
| 560 |
+
<h2 style="margin-top: 0;">Evaluation Complete</h2>
|
| 561 |
+
|
| 562 |
+
<div class="score-circle" id="modal-score-circle">
|
| 563 |
+
<p class="score-value" id="modal-score">0.0</p>
|
| 564 |
+
<p class="score-label">Score</p>
|
| 565 |
+
</div>
|
| 566 |
+
|
| 567 |
+
<h3 id="modal-status" style="margin-bottom: 5px;">Passed!</h3>
|
| 568 |
+
<p id="modal-desc" style="color: var(--text-secondary); margin-top: 0; font-size: 0.9rem;"></p>
|
| 569 |
+
|
| 570 |
+
<div class="reward-breakdown" id="modal-breakdown">
|
| 571 |
+
<!-- Breakdown inserted here -->
|
| 572 |
+
</div>
|
| 573 |
+
|
| 574 |
+
<button class="btn-primary" style="margin-top: 20px; width: 100%; justify-content: center;" onclick="closeModal()">
|
| 575 |
+
Continue
|
| 576 |
+
</button>
|
| 577 |
+
</div>
|
| 578 |
+
</div>
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/prism.min.js"></script>
|
| 582 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/components/prism-python.min.js"></script>
|
| 583 |
+
|
| 584 |
+
<script>
|
| 585 |
+
// ---- Client-side state ----
// Random per-page session id sent to the backend so concurrent browser
// tabs get independent episodes.
let currentSessionId = 'session_' + Math.floor(Math.random() * 100000);
// Comments staged locally before the review is submitted as one action.
let stagedComments = [];
// NOTE(review): never updated after declaration — presumably meant to be
// populated from the task spec's max_steps; confirm against the backend.
let maxSteps = 1;

// ---- Page initialisation ----
// On load: populate the task <select> from GET /tasks, then fade the
// loader out and the main content in.
document.addEventListener('DOMContentLoaded', async () => {
    try {
        const res = await fetch('/tasks');
        const tasks = await res.json();
        const select = document.getElementById('task-select');
        // Drop the "Loading tasks..." placeholder option.
        select.innerHTML = '';

        for (const [id, spec] of Object.entries(tasks)) {
            const option = document.createElement('option');
            option.value = id;
            option.textContent = `${spec.title} (${spec.difficulty})`;
            select.appendChild(option);
        }

        // Fade the loader out, then remove it from the layout once the
        // 0.5s CSS opacity transition has finished.
        document.getElementById('loader').style.opacity = '0';
        setTimeout(() => document.getElementById('loader').style.display = 'none', 500);
        document.getElementById('main-content').style.opacity = '1';

    } catch (e) {
        alert("Failed to connect to backend api.");
    }
});
|
| 613 |
+
|
| 614 |
+
// Return `source` with 1-based line numbers prefixed to every line,
// e.g. "1 | first\n2 | second\n". Numbers are right-aligned to the width
// of the largest line number so the code columns line up.
//
// Fix: the original claimed to inject line numbering but returned the
// source unchanged apart from appending a trailing newline.
function getLineNumberedSource(source) {
    const lines = source.split('\n');
    // Width of the largest line number, for right alignment.
    const width = String(lines.length).length;
    let result = '';
    for (let i = 0; i < lines.length; i++) {
        // i is 0-based; displayed numbers are 1-based.
        result += `${String(i + 1).padStart(width)} | ${lines[i]}\n`;
    }
    return result;
}
|
| 623 |
+
|
| 624 |
+
// Start (or restart) an episode for the task selected in the header:
// clears locally staged state, POSTs /reset, and populates the UI from
// the returned observation.
async function initSession() {
    const taskId = document.getElementById('task-select').value;
    if(!taskId) return;

    // Show the full-screen loader while the reset round-trip is in flight.
    document.getElementById('loader').style.display = 'flex';
    document.getElementById('loader').style.opacity = '1';

    // Discard any comments/summary staged during the previous episode.
    stagedComments = [];
    renderComments();
    document.getElementById('c-message').value = '';
    document.getElementById('c-suggestion').value = '';
    document.getElementById('c-line').value = '';
    document.getElementById('summary').value = '';

    try {
        const res = await fetch('/reset', {
            method: 'POST',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify({task_id: taskId, session_id: currentSessionId})
        });
        const data = await res.json();
        const obs = data.observation;

        document.getElementById('instructions').textContent = obs.instructions;
        document.getElementById('file-name').textContent = obs.snippet.file_name;

        // Difficulty badge colour is inferred from the task-id naming
        // convention (task_*_easy / _medium / _hard), not from the response.
        let diffBadge = document.getElementById('task-difficulty');
        diffBadge.className = 'badge';
        if(taskId.includes('easy')) diffBadge.classList.add('easy');
        else if(taskId.includes('medium')) diffBadge.classList.add('medium');
        else diffBadge.classList.add('hard');

        // classList[1] is the difficulty class added just above ('badge' is [0]).
        diffBadge.textContent = diffBadge.classList[1].toUpperCase();

        // textContent (not innerHTML) keeps the snippet inert; Prism then
        // highlights it in place.
        const codeBlock = document.getElementById('code-block');
        codeBlock.textContent = obs.snippet.source;
        Prism.highlightElement(codeBlock);

        document.getElementById('review-panel').style.display = 'block';

    } catch (e) {
        alert("Error starting session.");
    } finally {
        // Always fade the loader back out, whether reset succeeded or not.
        document.getElementById('loader').style.opacity = '0';
        setTimeout(() => document.getElementById('loader').style.display = 'none', 500);
    }
}
|
| 671 |
+
|
| 672 |
+
// Read the comment form, validate it, append the comment to the staged
// list, re-render the list, and reset the form fields for the next entry.
function stageComment() {
    const field = (id) => document.getElementById(id);

    const lineText = field('c-line').value;
    const body = field('c-message').value;

    // A comment without a message is meaningless — reject it up front.
    if (!body) {
        alert("Message is required.");
        return;
    }

    const staged = {
        category: field('c-category').value,
        severity: field('c-severity').value,
        message: body,
    };
    // Line and suggestion are optional; omit the keys entirely when blank
    // so the backend payload only carries what the user provided.
    if (lineText) staged.line = parseInt(lineText);
    const fix = field('c-suggestion').value;
    if (fix) staged.suggestion = fix;

    stagedComments.push(staged);
    renderComments();

    // Reset the form for the next comment.
    field('c-message').value = '';
    field('c-suggestion').value = '';
    field('c-line').value = '';
}
|
| 698 |
+
|
| 699 |
+
// Remove the staged comment at `index` (in place, so the module-level
// stagedComments binding is preserved) and re-render the visible list.
function removeComment(index) {
    stagedComments.splice(index, 1);
    renderComments();
}
|
| 703 |
+
|
| 704 |
+
// Render stagedComments into the #staged-comments container.
// User-entered text (category, severity, message, suggestion) is HTML-escaped
// before being injected via innerHTML, preventing markup/script injection.
// The HTML is assembled into one string and assigned once, instead of the
// original `innerHTML +=` per item which re-parses the container each time.
function renderComments() {
    const list = document.getElementById('staged-comments');
    // Minimal HTML escaper for text placed inside element content/attributes.
    const esc = (s) => String(s)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');
    const html = stagedComments.map((c, i) => {
        const color = c.category === 'bug' ? 'var(--danger)' : c.category === 'security' ? '#ef4444' : 'var(--primary-accent)';
        return `
            <div class="comment-item" style="border-left-color: ${color}">
                <button class="remove-btn" onclick="removeComment(${i})">×</button>
                <div class="meta">
                    <span style="color: ${color}; font-weight: bold;">[${esc(c.category.toUpperCase())}] ${esc(c.severity)}</span>
                    <span>Line: ${c.line || 'global'}</span>
                </div>
                <div style="margin-bottom: ${c.suggestion ? '5px' : '0'}">${esc(c.message)}</div>
                ${c.suggestion ? `<div style="font-family: monospace; font-size: 0.8rem; background: rgba(0,0,0,0.3); padding: 5px; border-radius: 4px;">Fix: ${esc(c.suggestion)}</div>` : ''}
            </div>`;
    }).join('');
    list.innerHTML = html;
}
|
| 722 |
+
|
| 723 |
+
// Collect the staged comments (plus an optional summary) into a submit
// action, POST it to /step for grading, and show the resulting score modal.
// The loader overlay is shown for the duration and faded out afterwards.
async function submitReview() {
    const summaryText = document.getElementById('summary').value;
    const loader = document.getElementById('loader');

    const action = { comments: stagedComments, submit: true };
    if (summaryText) {
        action.summary = summaryText;
    }

    loader.style.display = 'flex';
    loader.style.opacity = '1';

    try {
        const payload = {
            session_id: currentSessionId,
            action: action
        };
        const response = await fetch('/step', {
            method: 'POST',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify(payload)
        });
        const data = await response.json();
        showResults(data);
    } catch (err) {
        alert("Failed to submit review.");
    } finally {
        // Fade out, then remove the loader from layout once the fade ends.
        loader.style.opacity = '0';
        setTimeout(() => loader.style.display = 'none', 500);
    }
}
|
| 755 |
+
|
| 756 |
+
// Populate and open the results modal from a /step response.
// Expects data.info.grader {score, threshold} and data.reward
// {breakdown, reason}; every field is treated as optional.
function showResults(data) {
    const modal = document.getElementById('result-modal');
    const scoreVal = document.getElementById('modal-score');
    const circle = document.getElementById('modal-score-circle');
    const status = document.getElementById('modal-status');
    const desc = document.getElementById('modal-desc');
    const breakdown = document.getElementById('modal-breakdown');

    // `??` (not `||`) so a legitimate numeric 0 score/threshold is kept
    // instead of being replaced by the fallback. `data.info` is also
    // optional-chained so a missing info object cannot throw.
    const score = data.info?.grader?.score ?? 0;
    const threshold = data.info?.grader?.threshold ?? 0.5;
    const passed = score >= threshold;

    scoreVal.textContent = score.toFixed(2);

    if(passed) {
        circle.classList.remove('failed');
        status.textContent = "Great Review!";
        status.style.color = "var(--success)";
    } else {
        circle.classList.add('failed');
        status.textContent = "Needs Improvement";
        status.style.color = "var(--danger)";
    }

    desc.textContent = `Passing threshold was ${threshold.toFixed(2)}`;

    let bkHTML = `<h4 style="margin-top:0; color:var(--text-secondary);">Reward Breakdown</h4>`;
    // Guard against a missing reward object entirely.
    const reward = data.reward || {};
    if(reward.breakdown) {
        for (const [key, val] of Object.entries(reward.breakdown)) {
            const color = val >= 0 ? 'var(--success)' : 'var(--danger)';
            bkHTML += `<div style="display:flex; justify-content:space-between; margin-bottom:5px;">
                <span>${key}</span>
                <strong style="color:${color}">${val > 0 ? '+'+val.toFixed(2) : val.toFixed(2)}</strong>
            </div>`;
        }
    }
    bkHTML += `<div style="border-top:1px solid #333; margin-top:10px; padding-top:10px;">
        <strong>Reason: </strong> <span style="color:#bbb">${reward.reason ?? ''}</span>
    </div>`;

    breakdown.innerHTML = bkHTML;

    modal.style.display = 'flex';
}
|
| 800 |
+
|
| 801 |
+
// Hide the results modal (shown by showResults).
function closeModal() {
    document.getElementById('result-modal').style.display = 'none';
}
|
| 804 |
+
|
| 805 |
+
</script>
|
| 806 |
+
</body>
|
| 807 |
+
</html>
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# tests package
|
tests/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (154 Bytes). View file
|
|
|
tests/__pycache__/test_env.cpython-313-pytest-9.0.3.pyc
ADDED
|
Binary file (34.4 kB). View file
|
|
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test suite for CodeReview OpenEnv.
|
| 3 |
+
Run with: pytest tests/ -v
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
import sys
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 13 |
+
|
| 14 |
+
from env.environment import CodeReviewEnv
|
| 15 |
+
from env.models import Action, ReviewCategory, ReviewComment, Severity
|
| 16 |
+
from graders.graders import Task1Grader, Task2Grader, Task3Grader
|
| 17 |
+
from corpus.snippets import CORPUS
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
# Fixtures
|
| 22 |
+
# ---------------------------------------------------------------------------
|
| 23 |
+
|
| 24 |
+
def perfect_action(task_id: str) -> Action:
    """Build an action containing all ground-truth comments for a task."""
    ground_truth = CORPUS[task_id]["issues"]
    # Copy the corpus list so the action never aliases shared ground truth.
    return Action(
        comments=[issue for issue in ground_truth],
        summary="Perfect review.",
        submit=True,
    )
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def empty_action(submit: bool = False) -> Action:
    """Return an action carrying no comments; *submit* controls episode end."""
    no_comments = []
    return Action(comments=no_comments, submit=submit)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def single_bug_action() -> Action:
    """Return a submitting action with one correct division-by-zero finding."""
    bug_comment = ReviewComment(
        line=2,
        category=ReviewCategory.BUG,
        severity=Severity.HIGH,
        message=(
            "divide() has no guard against division by zero "
            "will raise ZeroDivisionError"
        ),
        suggestion="Add a check for b==0",
    )
    return Action(comments=[bug_comment], submit=True)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# Grader unit tests
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
class TestTask1Grader:
    """Unit tests for Task1Grader against the easy task's ground truth."""

    # Shared across tests: grader instance and the corpus ground-truth issues.
    grader = Task1Grader()
    ground_truth = CORPUS["task_1_easy"]["issues"]

    def test_perfect_score_close_to_one(self):
        # Submitting every ground-truth comment should land near the top score.
        action = perfect_action("task_1_easy")
        result = self.grader.grade(action, self.ground_truth)
        assert result["score"] >= 0.80, f"Expected ≥0.80 got {result['score']}"

    def test_empty_action_scores_zero(self):
        # An empty submission should be graded close to zero.
        result = self.grader.grade(empty_action(submit=True), self.ground_truth)
        assert result["score"] < 0.15

    def test_single_correct_bug_gives_positive_score(self):
        # A single correct finding must earn some credit.
        result = self.grader.grade(single_bug_action(), self.ground_truth)
        assert result["score"] > 0.0

    def test_wrong_category_penalised(self):
        # The same finding mislabelled (SECURITY instead of BUG) must not
        # outscore the correctly categorised version.
        action = Action(
            comments=[
                ReviewComment(
                    line=2, category=ReviewCategory.SECURITY,
                    severity=Severity.HIGH,
                    message="divide has no guard against division by zero",
                )
            ],
            submit=True,
        )
        result_wrong = self.grader.grade(action, self.ground_truth)
        result_right = self.grader.grade(single_bug_action(), self.ground_truth)
        assert result_right["score"] >= result_wrong["score"]

    def test_fabricated_comment_penalised(self):
        # Ten copies of an invented issue should collapse the score: the
        # grader must punish fabrication, not reward comment volume.
        fabricated = Action(
            comments=[
                ReviewComment(
                    line=5, category=ReviewCategory.BUG,
                    severity=Severity.CRITICAL,
                    message="Imaginary crash that does not exist in the code at all",
                )
            ] * 10,
            submit=True,
        )
        result = self.grader.grade(fabricated, self.ground_truth)
        assert result["score"] <= 0.1

    def test_score_in_range(self):
        # Scores are normalised to [0, 1].
        action = perfect_action("task_1_easy")
        result = self.grader.grade(action, self.ground_truth)
        assert 0.0 <= result["score"] <= 1.0
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class TestTask2Grader:
    """Unit tests for Task2Grader against the medium task's ground truth."""

    grader = Task2Grader()
    ground_truth = CORPUS["task_2_medium"]["issues"]

    def test_perfect_score_close_to_one(self):
        # The full ground-truth review should score near the top.
        action = perfect_action("task_2_medium")
        result = self.grader.grade(action, self.ground_truth)
        assert result["score"] >= 0.75

    def test_missing_critical_sql_injection_penalised(self):
        # Remove the SQL injection comment from perfect action
        issues = [i for i in self.ground_truth
                  if not ("SQL injection" in i.message or "injection" in i.message.lower())]
        action = Action(comments=issues, submit=True)
        full_action = perfect_action("task_2_medium")
        full_result = self.grader.grade(full_action, self.ground_truth)
        partial_result = self.grader.grade(action, self.ground_truth)
        # Dropping a critical finding must strictly lower the score.
        assert full_result["score"] > partial_result["score"]

    def test_score_in_range(self):
        # Scores are normalised to [0, 1].
        action = perfect_action("task_2_medium")
        result = self.grader.grade(action, self.ground_truth)
        assert 0.0 <= result["score"] <= 1.0
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class TestTask3Grader:
    """Unit tests for Task3Grader, which also rewards a written summary."""

    grader = Task3Grader()
    ground_truth = CORPUS["task_3_hard"]["issues"]

    def test_perfect_with_summary_beats_without(self):
        # Identical comments with vs. without a summary: including the
        # summary must never lower the score.
        with_summary = perfect_action("task_3_hard")
        without_summary = Action(
            comments=list(self.ground_truth), summary=None, submit=True
        )
        r_with = self.grader.grade(with_summary, self.ground_truth)
        r_without = self.grader.grade(without_summary, self.ground_truth)
        assert r_with["score"] >= r_without["score"]

    def test_summary_penalty_applied_when_missing(self):
        # A submission without a summary must carry a negative
        # "summary_penalty" entry in the breakdown.
        action = Action(comments=[], summary=None, submit=True)
        result = self.grader.grade(action, self.ground_truth)
        assert result["breakdown"].get("summary_penalty", 0) < 0

    def test_score_in_range(self):
        # Scores are normalised to [0, 1].
        action = perfect_action("task_3_hard")
        result = self.grader.grade(action, self.ground_truth)
        assert 0.0 <= result["score"] <= 1.0
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
# ---------------------------------------------------------------------------
|
| 155 |
+
# Environment integration tests
|
| 156 |
+
# ---------------------------------------------------------------------------
|
| 157 |
+
|
| 158 |
+
class TestEnvironmentAPI:
    """Integration tests for the CodeReviewEnv reset/step/state lifecycle."""

    def test_reset_returns_observation(self):
        # reset() yields the initial observation: correct task, step 0,
        # and a non-empty Python snippet.
        env = CodeReviewEnv("task_1_easy")
        obs = env.reset()
        assert obs.task_id == "task_1_easy"
        assert obs.step == 0
        assert obs.snippet.language == "python"
        assert len(obs.snippet.source) > 0

    def test_step_increments_step_counter(self):
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(empty_action(submit=False))
        assert result.observation.step == 1

    def test_step_submit_ends_episode(self):
        # submit=True terminates the episode regardless of comment content.
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(empty_action(submit=True))
        assert result.done is True

    def test_step_after_done_raises(self):
        # Stepping a finished episode is a usage error, not a silent no-op.
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        env.step(empty_action(submit=True))
        with pytest.raises(RuntimeError):
            env.step(empty_action())

    def test_state_matches_step(self):
        # state() reflects the number of steps taken and the active task.
        env = CodeReviewEnv("task_2_medium")
        env.reset()
        env.step(single_bug_action())
        state = env.state()
        assert state.step == 1
        assert state.task_id == "task_2_medium"

    def test_max_steps_auto_terminates(self):
        # Even without submit=True the episode ends after spec.max_steps.
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = None
        for _ in range(env.spec.max_steps):
            result = env.step(empty_action(submit=False))
        assert result.done is True

    def test_reward_in_range(self):
        # Per-step reward stays within the documented [-1, 1] band.
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        result = env.step(single_bug_action())
        assert -1.0 <= result.reward.value <= 1.0

    def test_reset_clears_state(self):
        # A second reset() wipes step count, accumulated reward and comments.
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        env.step(single_bug_action())
        env.reset()
        state = env.state()
        assert state.step == 0
        assert state.total_reward == 0.0
        assert len(state.comments_so_far) == 0

    def test_deduplication_prevents_duplicate_comments(self):
        # Submitting the same comment across two steps should be stored once.
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        # First step: submit=False so episode stays open
        step1_action = Action(comments=[
            ReviewComment(
                line=2, category=ReviewCategory.BUG, severity=Severity.HIGH,
                message="divide() has no guard against division by zero will raise ZeroDivisionError",
                suggestion="Add a check for b==0",
            )
        ], submit=False)
        env.step(step1_action)
        # Second step: same comment again (should be deduped)
        step2_action = Action(comments=[
            ReviewComment(
                line=2, category=ReviewCategory.BUG, severity=Severity.HIGH,
                message="divide() has no guard against division by zero will raise ZeroDivisionError",
                suggestion="Add a check for b==0",
            )
        ], submit=True)
        env.step(step2_action)
        state = env.state()
        assert len(state.comments_so_far) == 1

    def test_all_three_tasks_init(self):
        # Every shipped task id constructs and resets cleanly.
        for tid in ["task_1_easy", "task_2_medium", "task_3_hard"]:
            env = CodeReviewEnv(tid)
            obs = env.reset()
            assert obs.task_id == tid

    def test_invalid_task_raises(self):
        # Unknown task ids are rejected at construction time.
        with pytest.raises(ValueError):
            CodeReviewEnv("task_9_impossible")

    def test_hard_task_requires_summary_field(self):
        env = CodeReviewEnv("task_3_hard")
        env.reset()
        # Submit without summary – should still work but score less
        action = Action(comments=[], summary=None, submit=True)
        result = env.step(action)
        assert result.done is True
        # Verify summary penalty is applied
        assert result.info["grader"]["breakdown"].get("summary_penalty", 0) < 0

    def test_full_episode_task1(self):
        """Full happy-path episode: submit all ground truth → should pass."""
        env = CodeReviewEnv("task_1_easy")
        env.reset()
        action = perfect_action("task_1_easy")
        result = env.step(action)
        assert result.done
        assert result.info["passed"] is True
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
validate-submission.sh
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh — OpenEnv Submission Validator
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
+
#
|
| 7 |
+
# Prerequisites:
|
| 8 |
+
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
+
# - openenv-core: pip install openenv-core
|
| 10 |
+
# - curl (usually pre-installed)
|
| 11 |
+
#
|
| 12 |
+
# Run:
|
| 13 |
+
# curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
|
| 14 |
+
#
|
| 15 |
+
# Or download and run locally:
|
| 16 |
+
# chmod +x validate-submission.sh
|
| 17 |
+
# ./validate-submission.sh <ping_url> [repo_dir]
|
| 18 |
+
#
|
| 19 |
+
# Arguments:
|
| 20 |
+
# ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
|
| 21 |
+
# repo_dir Path to your repo (default: current directory)
|
| 22 |
+
#
|
| 23 |
+
# Examples:
|
| 24 |
+
# ./validate-submission.sh https://my-team.hf.space
|
| 25 |
+
# ./validate-submission.sh https://my-team.hf.space ./my-repo
|
| 26 |
+
#
|
| 27 |
+
|
| 28 |
+
# -u: unset variables are errors; pipefail: a pipeline fails if any stage
# fails. NOTE(review): -e is omitted — presumably so each step can inspect
# exit statuses and report via fail()/stop_at() instead of aborting; confirm.
set -uo pipefail

# Maximum seconds to allow `docker build` before treating it as hung.
DOCKER_BUILD_TIMEOUT=600
# Enable ANSI colours only when stdout is a terminal; emit plain text
# otherwise (e.g. piped into a log file or CI output).
if [ -t 1 ]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[1;33m'
    BOLD='\033[1m'
    NC='\033[0m'
else
    RED='' GREEN='' YELLOW='' BOLD='' NC=''
fi
|
| 40 |
+
|
| 41 |
+
# Run a command with a time limit.
# Usage: run_with_timeout SECS CMD [ARGS...]
# Prefers GNU `timeout` (or `gtimeout`, the coreutils name on macOS);
# otherwise falls back to a background watcher that kills the command
# after SECS. Returns the command's exit status.
run_with_timeout() {
    local secs="$1"; shift
    if command -v timeout &>/dev/null; then
        timeout "$secs" "$@"
    elif command -v gtimeout &>/dev/null; then
        gtimeout "$secs" "$@"
    else
        # Portable fallback: run the command in the background and spawn a
        # watcher subshell that kills it once the deadline passes.
        "$@" &
        local pid=$!
        ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
        local watcher=$!
        # Capture the command's exit status before touching the watcher.
        wait "$pid" 2>/dev/null
        local rc=$?
        # The command finished (or was killed); stop and reap the watcher
        # so it doesn't linger for the full SECS.
        kill "$watcher" 2>/dev/null
        wait "$watcher" 2>/dev/null
        return $rc
    fi
}
|
| 59 |
+
|
| 60 |
+
# Create a temp file named with a recognisable prefix; fall back to a plain
# mktemp call on systems where the template form is unsupported.
portable_mktemp() {
    local prefix="${1:-validate}"
    local tmpdir="${TMPDIR:-/tmp}"
    mktemp "${tmpdir}/${prefix}-XXXXXX" 2>/dev/null || mktemp
}
|
| 64 |
+
|
| 65 |
+
# Temp files registered for deletion on exit. The ${arr[@]+...} expansion
# keeps `rm -f` safe under `set -u` when the array is still empty.
CLEANUP_FILES=()
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
trap cleanup EXIT

# --- Argument parsing -------------------------------------------------------
PING_URL="${1:-}"
REPO_DIR="${2:-.}"

if [ -z "$PING_URL" ]; then
    printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
    printf "\n"
    printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
    printf " repo_dir Path to your repo (default: current directory)\n"
    exit 1
fi

# Canonicalise the repo path; fail fast if the directory doesn't exist.
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
    printf "Error: directory '%s' not found\n" "${2:-.}"
    exit 1
fi
# Strip a trailing slash so URL concatenation below is predictable.
PING_URL="${PING_URL%/}"
export PING_URL
# Count of passed checks, incremented by pass().
PASS=0
|
| 87 |
+
|
| 88 |
+
# Logging helpers: timestamped output, pass/fail reporting, user hints.
log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
fail() { log "${RED}FAILED${NC} -- $1"; }
hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
# Abort the run after a failed step, naming where validation stopped.
stop_at() {
    printf "\n"
    printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
    exit 1
}
|
| 97 |
+
|
| 98 |
+
# --- Banner -----------------------------------------------------------------
printf "\n"
printf "${BOLD}========================================${NC}\n"
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
printf "${BOLD}========================================${NC}\n"
log "Repo: $REPO_DIR"
log "Ping URL: $PING_URL"
printf "\n"

# --- Step 1/3: the deployed Space must answer POST /reset with HTTP 200 -----
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")
# NOTE(review): stderr is redirected into the same file as the response body
# (-o and 2> share $CURL_OUTPUT), so a curl error message can clobber the
# captured body — confirm this is intended; the body is never shown anyway.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
 -H "Content-Type: application/json" -d '{}' \
 "$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")

if [ "$HTTP_CODE" = "200" ]; then
    pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
    # 000 is the sentinel from the `|| printf` fallback: curl never got a
    # status (connection refused, DNS failure, or --max-time exceeded).
    fail "HF Space not reachable (connection failed or timed out)"
    hint "Check your network connection and that the Space is running."
    hint "Try: curl -s -o /dev/null -w '%%{http_code}' -X POST $PING_URL/reset"
    stop_at "Step 1"
else
    fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
    hint "Make sure your Space is running and the URL is correct."
    hint "Try opening $PING_URL in your browser first."
    stop_at "Step 1"
fi

# --- Step 2/3: the Docker image must build locally --------------------------
log "${BOLD}Step 2/3: Running docker build${NC} ..."

if ! command -v docker &>/dev/null; then
    fail "docker command not found"
    hint "Install Docker: https://docs.docker.com/get-docker/"
    stop_at "Step 2"
fi

# Accept a Dockerfile either at the repo root or under server/.
if [ -f "$REPO_DIR/Dockerfile" ]; then
    DOCKER_CONTEXT="$REPO_DIR"
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
    DOCKER_CONTEXT="$REPO_DIR/server"
else
    fail "No Dockerfile found in repo root or server/ directory"
    stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"

# Build under a timeout; BUILD_OK flips to true only on a zero exit status.
BUILD_OK=false
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true

if [ "$BUILD_OK" = true ]; then
    pass "Docker build succeeded"
else
    fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
    # Show only the tail of the build log — the error is almost always last.
    printf "%s\n" "$BUILD_OUTPUT" | tail -20
    stop_at "Step 2"
fi

# --- Step 3/3: the openenv CLI's own validation must pass -------------------
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

if ! command -v openenv &>/dev/null; then
    fail "openenv command not found"
    hint "Install it: pip install openenv-core"
    stop_at "Step 3"
fi

# Run from inside the repo (subshell, so the caller's cwd is untouched).
VALIDATE_OK=false
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true

if [ "$VALIDATE_OK" = true ]; then
    pass "openenv validate passed"
    [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
else
    fail "openenv validate failed"
    printf "%s\n" "$VALIDATE_OUTPUT"
    stop_at "Step 3"
fi

# --- Success banner (reached only if all three steps passed) ----------------
printf "\n"
printf "${BOLD}========================================${NC}\n"
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
printf "${BOLD}========================================${NC}\n"
printf "\n"

exit 0
|