Spaces:
Running
Running
Commit ·
6e7ce30
0
Parent(s):
Initial commit with OpenEnv structure
Browse files- .gitignore +9 -0
- Dockerfile +12 -0
- README.md +52 -0
- app.py +61 -0
- environment/__init__.py +3 -0
- environment/env.py +85 -0
- environment/graders.py +35 -0
- environment/models.py +21 -0
- environment/rewards.py +51 -0
- environment/tasks.py +86 -0
- inference.py +91 -0
- openenv.yaml +56 -0
- requirements.txt +7 -0
- tests/__init__.py +1 -0
- tests/test_env.py +31 -0
- tests/test_graders.py +33 -0
.gitignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.env
|
| 4 |
+
.pytest_cache/
|
| 5 |
+
.DS_Store
|
| 6 |
+
.git/
|
| 7 |
+
node_modules/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY . .
|
| 9 |
+
|
| 10 |
+
EXPOSE 7860
|
| 11 |
+
|
| 12 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeReviewEnv
|
| 2 |
+
|
| 3 |
+
A realistic OpenEnv environment where an AI agent performs code review on Python code snippets.
|
| 4 |
+
|
| 5 |
+
## Real-world utility
|
| 6 |
+
|
| 7 |
+
Code review is a daily task for software engineers. Automating parts of it can save time and catch bugs early. This environment allows training agents to identify bugs, style issues, security flaws, performance problems, and documentation gaps.
|
| 8 |
+
|
| 9 |
+
## Action & Observation Spaces
|
| 10 |
+
|
| 11 |
+
- **Observation**: Contains the source code, step count, and feedback from previous step.
|
| 12 |
+
- **Action**: A list of issues (line number, category, description) and a `final` flag to submit.
|
| 13 |
+
|
| 14 |
+
## Tasks (Easy → Medium → Hard)
|
| 15 |
+
|
| 16 |
+
| Task | Issues | Description |
|
| 17 |
+
|--------|--------|-------------|
|
| 18 |
+
| Easy | 2 | Missing zero-division guard and missing docstring |
|
| 19 |
+
| Medium | 3 | Logic error (wrong dict key), hardcoded API key, missing type hints |
|
| 20 |
+
| Hard | 5 | Race condition, O(n²) anti-pattern, eval() security hole, missing docstrings |
|
| 21 |
+
|
| 22 |
+
Graders compute F1 score based on exact (line, category) matches.
|
| 23 |
+
|
| 24 |
+
## Setup
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
git clone <your-space-url>
|
| 28 |
+
cd codereview-env
|
| 29 |
+
docker build -t codereview-env .
|
| 30 |
+
docker run -p 7860:7860 codereview-env
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Baseline Inference
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
export GROQ_API_KEY=your_key
|
| 37 |
+
export ENV_URL=http://localhost:7860
|
| 38 |
+
python inference.py
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
Expected baseline scores (Llama-3-70B-8192):
|
| 42 |
+
- Easy: ~0.95
|
| 43 |
+
- Medium: ~0.82
|
| 44 |
+
- Hard: ~0.60
|
| 45 |
+
|
| 46 |
+
## Deploy to HF Spaces
|
| 47 |
+
|
| 48 |
+
Create a Space with Docker and push this repo. For the server, set `API_BASE_URL` and `MODEL_NAME`; for running `inference.py` against it, also set `ENV_URL` and an API key (`GROQ_API_KEY`, `HF_TOKEN`, or `OPENAI_API_KEY`).
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
This implementation satisfies all OpenEnv requirements, including real-world utility, varying difficulty, 0.0-1.0 grading, and reproducible baseline inference.
|
app.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from environment.env import CodeReviewEnv
from environment.models import Action, Observation, Reward

app = FastAPI(title="CodeReviewEnv")

# In-memory store for environments keyed by session ID.
# NOTE(review): entries are never evicted, so a long-running deployment grows
# without bound — consider a TTL or LRU eviction policy.
sessions = {}


class ResetRequest(BaseModel):
    """Payload for /reset: which task to start an episode for."""
    task_id: str


class StepRequest(BaseModel):
    """Payload for /step: the session to act in and the action to apply."""
    session_id: str
    action: Action


@app.get("/health")
async def health():
    """Liveness probe for the container/Space runtime."""
    return {"status": "ok"}


@app.post("/reset")
async def reset_endpoint(req: ResetRequest):
    """Create a fresh environment for the requested task.

    Returns the initial observation plus a session ID that the client must
    echo back on every subsequent /step call.
    """
    try:
        # Create a new session with a unique ID
        session_id = str(uuid.uuid4())
        env = CodeReviewEnv(req.task_id)
        obs = env.reset()
        sessions[session_id] = env

        # model_dump() replaces the pydantic-v1 dict(), which is deprecated
        # under the pydantic>=2 pinned in requirements.txt.
        return {
            "session_id": session_id,
            "observation": obs.model_dump(),
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))


@app.post("/step")
async def step_endpoint(req: StepRequest):
    """Advance the session's environment by one step with the given action."""
    if req.session_id not in sessions:
        raise HTTPException(status_code=400, detail="Invalid session ID or expired environment.")

    env = sessions[req.session_id]
    try:
        obs, reward, done, info = env.step(req.action)
        return {
            "observation": obs.model_dump(),
            "reward": reward.model_dump(),
            "done": done,
            "info": info,
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))


@app.get("/state")
async def state_endpoint(session_id: str):
    """Return the raw internal state dict for a session (debugging aid)."""
    if session_id not in sessions:
        raise HTTPException(status_code=400, detail="Invalid session ID.")
    return sessions[session_id].state()
|
environment/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Makes 'environment' a proper Python package
|
| 2 |
+
from .env import CodeReviewEnv
|
| 3 |
+
from .models import Action, Observation, Reward
|
environment/env.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from typing import Dict, Any, Tuple
|
| 3 |
+
from environment.models import Observation, Action, Reward
|
| 4 |
+
from environment.tasks import TASKS
|
| 5 |
+
from environment.graders import grade_easy, grade_medium, grade_hard
|
| 6 |
+
from environment.rewards import compute_reward
|
| 7 |
+
|
| 8 |
+
class CodeReviewEnv:
    """Single-episode environment where an agent reviews one code snippet.

    The agent submits lists of suspected issues via step(); the episode ends
    when the agent sets ``final=True`` on an action or the step budget is
    exhausted, at which point the terminal reward is the F1 score of the
    submitted issues against the task's ground truth.
    """

    # Explicit task -> grader dispatch. The original used an if/elif chain
    # whose bare `else` silently graded any unexpected task id as "hard";
    # a direct lookup fails loudly instead (unreachable today, because
    # __init__ validates task_id against TASKS).
    _GRADERS = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}

    def __init__(self, task_id: str):
        """Bind the environment to one of the tasks defined in TASKS.

        Raises:
            ValueError: if ``task_id`` is not a known task.
        """
        if task_id not in TASKS:
            raise ValueError(f"Unknown task: {task_id}")
        self.task_id = task_id
        self._state = None
        self._step_count = 0
        self._done = False
        self._final_f1 = None
        self._ground_truth = TASKS[task_id]["ground_truth"]
        self._max_steps = TASKS[task_id]["max_steps"]
        # Use a local random instance for isolation: seeding it never touches
        # the global `random` module state shared with other sessions.
        self._rng = random.Random(42)

    def reset(self) -> Observation:
        """Start a fresh episode and return the initial observation."""
        self._rng.seed(42)  # fixed seed keeps every episode deterministic
        self._step_count = 0
        self._done = False
        self._final_f1 = None
        task = TASKS[self.task_id]
        self._state = {
            "code": task["code"],
            "instructions": task["instructions"],
            "issues_reported": []
        }
        return Observation(
            code=self._state["code"],
            step_count=self._step_count,
            previous_feedback="",
            done=False
        )

    def step(self, action: Action) -> Tuple[Observation, Reward, bool, Dict[str, Any]]:
        """Apply one review action; return (observation, reward, done, info).

        Raises:
            RuntimeError: if called after the episode already terminated.
        """
        if self._done:
            raise RuntimeError("Episode already done. Call reset().")

        self._step_count += 1
        self._state["issues_reported"] = action.issues

        # Dense shaped reward for this step (may be overridden on termination).
        reward_obj = compute_reward(
            action=action,
            ground_truth=self._ground_truth,
            step_count=self._step_count,
            max_steps=self._max_steps
        )

        done = False
        info = {}

        if action.final or self._step_count >= self._max_steps:
            # Terminal step: grade the final attempt with the task's grader.
            final_score = self._GRADERS[self.task_id](action.issues)
            self._final_f1 = final_score
            done = True
            info["final_f1"] = final_score
            # Override reward: the final F1 is the terminal-step reward.
            reward_obj = Reward(value=final_score, reason=f"Episode finished. F1={final_score}")

        self._done = done

        obs = Observation(
            code=self._state["code"],
            step_count=self._step_count,
            previous_feedback=reward_obj.reason,
            done=done
        )

        return obs, reward_obj, done, info

    def state(self) -> Dict[str, Any]:
        """Return a shallow copy of the mutable episode state (for /state)."""
        return self._state.copy()
|
environment/graders.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
from environment.models import Issue
|
| 3 |
+
|
| 4 |
+
def compute_f1(agent_issues: List[Issue], ground_truth: List[Issue]) -> float:
    """Deterministic grader: exact match on (line, category) pairs.

    Precision/recall are computed over *sets* of (line, category) tuples, so
    duplicate reports of the same issue neither help nor hurt, and the
    free-text description is ignored.

    Returns:
        The F1 score rounded to 3 decimals, in [0.0, 1.0].
    """
    truth_set = {(issue.line, issue.category) for issue in ground_truth}
    agent_set = {(issue.line, issue.category) for issue in agent_issues}

    true_positives = len(truth_set & agent_set)
    false_positives = len(agent_set - truth_set)
    false_negatives = len(truth_set - agent_set)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0

    if precision + recall == 0:
        return 0.0
    f1 = 2 * (precision * recall) / (precision + recall)
    return round(f1, 3)


def _grade(task_id: str, agent_issues: List[Issue]) -> float:
    """Grade *agent_issues* against the named task's ground truth.

    TASKS is imported lazily, preserving the original pattern — presumably to
    avoid module import-order issues between graders and tasks (TODO confirm).
    """
    from environment.tasks import TASKS
    return compute_f1(agent_issues, TASKS[task_id]["ground_truth"])


def grade_easy(agent_issues: List[Issue]) -> float:
    """Grade against the 'easy' task's ground truth."""
    return _grade("easy", agent_issues)


def grade_medium(agent_issues: List[Issue]) -> float:
    """Grade against the 'medium' task's ground truth."""
    return _grade("medium", agent_issues)


def grade_hard(agent_issues: List[Issue]) -> float:
    """Grade against the 'hard' task's ground truth."""
    return _grade("hard", agent_issues)
|
environment/models.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Literal, Optional
|
| 3 |
+
|
| 4 |
+
class Issue(BaseModel):
    """A single code-review finding reported by the agent (or ground truth)."""
    # 1-based line number within the reviewed snippet.
    line: int = Field(..., ge=1)
    category: Literal["bug", "style", "security", "performance", "documentation"]
    # Free-text explanation; capped to keep actions compact.
    description: str = Field(..., max_length=200)

class Action(BaseModel):
    """Agent action: the issues found so far, plus a flag to submit them."""
    issues: List[Issue] = Field(default_factory=list)
    # When True the episode is graded and terminated.
    final: bool = False

class Observation(BaseModel):
    """What the agent sees each step: the code plus feedback from last step."""
    code: str
    step_count: int
    previous_feedback: str = ""
    done: bool = False

class Reward(BaseModel):
    """Scalar reward with a human-readable explanation."""
    value: float
    reason: str
environment/rewards.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from environment.models import Issue, Action, Reward
|
| 2 |
+
from environment.graders import compute_f1
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
def compute_reward(
    action: Action,
    ground_truth: List[Issue],
    step_count: int,
    max_steps: int
) -> Reward:
    """Dense shaped reward for a single (possibly non-terminal) step.

    Components:
      - +0.2 per correctly identified issue (true positive)
      - -0.1 per false positive
      - -0.05 * step_count (cumulative efficiency pressure)
      - +0.5 bonus if all issues found and final=True
      - if final=True but imperfect, the reward is replaced by the F1 score
    The result is clipped to [-1.0, 1.0].

    Note: ``max_steps`` is currently unused here; the step budget is enforced
    by the environment itself.
    """
    # Compute current F1 based on issues reported so far
    current_f1 = compute_f1(action.issues, ground_truth)

    # Per-step penalty, kept as a positive magnitude and subtracted below.
    # BUG FIX: the original stored a negative value and then interpolated it
    # after a literal '-' in the feedback string, printing e.g. "--0.05".
    step_penalty = 0.05 * step_count

    # True positives: count matching (line, category)
    truth_set = {(i.line, i.category) for i in ground_truth}
    agent_set = {(i.line, i.category) for i in action.issues}
    tp_count = len(truth_set & agent_set)
    fp_count = len(agent_set - truth_set)

    tp_reward = tp_count * 0.2
    fp_penalty = fp_count * 0.1

    reward_value = tp_reward - fp_penalty - step_penalty

    # Bonus for early completion with all issues
    all_found = (tp_count == len(ground_truth))
    if action.final and all_found:
        reward_value += 0.5
        reason = f"Final answer correct! F1={current_f1}"
    elif action.final:
        reason = f"Final answer submitted with F1={current_f1}"
        # If final but not all correct, still give F1 score as final reward
        reward_value = current_f1
    else:
        reason = f"Step {step_count}: {tp_count}/{len(ground_truth)} issues found. +{tp_reward:.2f} -{fp_penalty:.2f} -{step_penalty:.2f}"

    # Clip to [-1, 1] for stability
    reward_value = max(-1.0, min(1.0, reward_value))

    return Reward(value=reward_value, reason=reason)
|
environment/tasks.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from environment.models import Issue
|
| 2 |
+
|
| 3 |
+
# Task definitions: each entry bundles the code under review, the ground-truth
# issue list it is graded against, a step budget, and agent-facing instructions.
# NOTE(review): indentation inside the code strings was reconstructed with a
# conventional 4-space style; the ground-truth `line` values index into these
# strings, so verify they still point at the intended lines.
TASKS = {
    "easy": {
        # Averaging helper with a division-by-zero hazard on empty input.
        "code": '''def calculate_average(numbers):
    """Calculate average of a list of numbers."""
    total = 0
    count = 0
    for n in numbers:
        total += n
        count += 1
    # Missing division by zero check
    return total / count
''',
        "ground_truth": [
            Issue(line=8, category="bug", description="No handling of empty list: division by zero"),
            Issue(line=2, category="documentation", description="Missing docstring param/return description")
        ],
        "max_steps": 3,
        "instructions": "Find all bugs and documentation issues in the code."
    },
    "medium": {
        # Wrong dict key, a hardcoded secret, and missing type hints.
        "code": '''def process_user_data(users):
    results = []
    for user in users:
        if user['active'] == True:
            # Logic error: using 'name' instead of 'username'
            results.append(user['name'].upper())
    return results

def fetch_users(api_key):
    # Security issue: hardcoded API key
    return [{'name': 'Alice', 'username': 'alice123', 'active': True},
            {'name': 'Bob', 'username': 'bob456', 'active': False}]
''',
        "ground_truth": [
            Issue(line=4, category="bug", description="Logic error: should use 'username' key, not 'name'"),
            Issue(line=9, category="security", description="Hardcoded API key – expose secret"),
            Issue(line=1, category="style", description="Missing type hints and docstring")
        ],
        "max_steps": 4,
        "instructions": "Find bug, security issue, and style violation."
    },
    "hard": {
        # Race condition, quadratic list building, and eval() on user input.
        "code": '''import threading

counter = 0

def increment():
    global counter
    for _ in range(1000):
        counter += 1 # Race condition

def start_threads():
    threads = []
    for _ in range(10):
        t = threading.Thread(target=increment)
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    return counter

def expensive_loop(n):
    result = []
    for i in range(n):
        # Performance anti-pattern: repeated list concatenation
        result = result + [i**2]
    return result

# Security: using eval on user input
def process_expression(expr):
    return eval(expr)
''',
        "ground_truth": [
            Issue(line=8, category="bug", description="Race condition on global counter without lock"),
            Issue(line=20, category="performance", description="O(n^2) due to list concatenation; use .append()"),
            Issue(line=27, category="security", description="eval() on user input allows arbitrary code execution"),
            Issue(line=1, category="style", description="Missing module docstring and proper structure"),
            Issue(line=13, category="documentation", description="No docstring for start_threads function")
        ],
        "max_steps": 6,
        "instructions": "Find concurrency bug, performance anti-pattern, security flaw, and documentation/style issues."
    }
}
|
inference.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import requests
|
| 5 |
+
from openai import OpenAI
|
| 6 |
+
from environment.models import Action, Issue
|
| 7 |
+
|
| 8 |
+
# Better logging instead of quiet failures
|
| 9 |
+
logging.basicConfig(level=logging.INFO)
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
|
| 13 |
+
API_KEY = os.getenv("GROQ_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
|
| 14 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "llama3-70b-8192")
|
| 15 |
+
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860") # Set this for HF Spaces
|
| 16 |
+
|
| 17 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 18 |
+
|
| 19 |
+
def parse_llm_response(text: str) -> Action:
    """Parse raw LLM output into an Action.

    Expects a JSON list of issue objects, optionally wrapped in a markdown
    code fence. On any parsing or validation failure, an empty final Action
    is returned so the episode still terminates cleanly.
    """
    try:
        # Strip a surrounding markdown fence, preferring an explicit ```json.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]

        parsed = json.loads(text.strip())
        return Action(issues=[Issue(**entry) for entry in parsed], final=True)
    except Exception as e:
        logger.error(f"Failed to parse LLM response: {e}")
        # An empty issue list signals the model produced nothing usable.
        return Action(issues=[], final=True)
|
| 35 |
+
|
| 36 |
+
def run_task(task_id: str) -> float:
    """Run one baseline episode against the environment server.

    Resets the environment, asks the LLM for a one-shot review of the
    observed code, submits it as a final action, and returns the terminal
    reward value.
    """
    # 1. Reset environment to get initial observation and session_id.
    logger.info(f"Running task: {task_id}")
    # BUG FIX: requests without a timeout can hang forever if the server is
    # unreachable; a 30s timeout bounds each HTTP call.
    resp = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=30)
    resp.raise_for_status()
    reset_data = resp.json()

    session_id = reset_data["session_id"]
    obs = reset_data["observation"]

    # 2. Build prompt using the code from the observation.
    prompt = f"""You are a code reviewer. Analyze the following Python code and list all issues (bugs, style, security, performance, documentation).
Return a JSON list where each item has: "line" (int), "category" (one of: bug, style, security, performance, documentation), "description" (string).
Example: [{{"line": 5, "category": "bug", "description": "Division by zero"}}]

Code:
{obs['code']}
"""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0  # Reproducibility
        )
        raw = response.choices[0].message.content
        logger.debug(f"Raw Output: {raw}")
    except Exception as e:
        logger.error(f"OpenAI completion error: {e}")
        raw = "[]"  # fall back to "no issues found" so the episode still ends

    action = parse_llm_response(raw)

    # 3. Take a single final step using the session_id.
    step_resp = requests.post(
        f"{ENV_URL}/step",
        json={
            "session_id": session_id,
            # model_dump() is the pydantic-v2 replacement for deprecated dict().
            "action": action.model_dump()
        },
        timeout=30,
    )
    step_resp.raise_for_status()
    data = step_resp.json()

    final_reward = data["reward"]["value"]
    logger.info(f"Task {task_id}: Final Score = {final_reward:.3f}")
    return final_reward
|
| 79 |
+
|
| 80 |
+
if __name__ == "__main__":
    # Run every task, recording 0.0 for any task that crashes, then print a
    # summary table of baseline scores.
    scores = {}
    for task in ("easy", "medium", "hard"):
        try:
            result = run_task(task)
        except Exception as e:
            logger.error(f"Task execution failed ({task}): {e}")
            result = 0.0
        scores[task] = result

    print("\n=== Baseline Results ===")
    for task, score in scores.items():
        print(f"{task}: {score:.3f}")
|
openenv.yaml
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CodeReviewEnv
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: |
|
| 4 |
+
An AI agent environment for performing code reviews on Python snippets.
|
| 5 |
+
The agent identifies bugs, style issues, security flaws, and performance problems.
|
| 6 |
+
Three tasks of increasing difficulty, graded by F1 score (0.0–1.0) against ground truth.
|
| 7 |
+
authors:
|
| 8 |
+
- name: Your Name
|
| 9 |
+
email: you@example.com
|
| 10 |
+
tags:
|
| 11 |
+
- code-review
|
| 12 |
+
- software-engineering
|
| 13 |
+
- text
|
| 14 |
+
tasks:
|
| 15 |
+
- id: easy
|
| 16 |
+
name: "Find 2 simple issues"
|
| 17 |
+
description: "Missing zero-division guard and missing docstring"
|
| 18 |
+
difficulty: easy
|
| 19 |
+
- id: medium
|
| 20 |
+
name: "Find 3 issues including logic error"
|
| 21 |
+
description: "Logic error (wrong dict key), hardcoded API key, missing type hints"
|
| 22 |
+
difficulty: medium
|
| 23 |
+
- id: hard
|
| 24 |
+
name: "Find 5 issues including security flaw"
|
| 25 |
+
description: "Race condition, O(n²) anti-pattern, eval() security hole, missing docstrings"
|
| 26 |
+
difficulty: hard
|
| 27 |
+
action_space:
|
| 28 |
+
type: object
|
| 29 |
+
properties:
|
| 30 |
+
issues:
|
| 31 |
+
type: array
|
| 32 |
+
items:
|
| 33 |
+
type: object
|
| 34 |
+
properties:
|
| 35 |
+
line:
|
| 36 |
+
type: integer
|
| 37 |
+
minimum: 1
|
| 38 |
+
category:
|
| 39 |
+
type: string
|
| 40 |
+
enum: [bug, style, security, performance, documentation]
|
| 41 |
+
description:
|
| 42 |
+
type: string
|
| 43 |
+
maxLength: 200
|
| 44 |
+
final:
|
| 45 |
+
type: boolean
|
| 46 |
+
observation_space:
|
| 47 |
+
type: object
|
| 48 |
+
properties:
|
| 49 |
+
code:
|
| 50 |
+
type: string
|
| 51 |
+
step_count:
|
| 52 |
+
type: integer
|
| 53 |
+
previous_feedback:
|
| 54 |
+
type: string
|
| 55 |
+
done:
|
| 56 |
+
type: boolean
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydantic>=2.0.0
|
| 2 |
+
fastapi>=0.100.0
|
| 3 |
+
uvicorn>=0.23.0
|
| 4 |
+
openai>=1.0.0
|
| 5 |
+
numpy>=1.24.0
|
| 6 |
+
pytest>=7.0.0
|
| 7 |
+
requests>=2.31.0
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Empty init file for Python testing package
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 4 |
+
|
| 5 |
+
from environment.env import CodeReviewEnv
|
| 6 |
+
from environment.models import Action, Issue
|
| 7 |
+
|
| 8 |
+
def test_basic_flow():
    """End-to-end smoke test: reset, one intermediate step, one final step."""
    env = CodeReviewEnv("easy")
    first_obs = env.reset()
    assert "calculate_average" in first_obs.code
    assert first_obs.step_count == 0

    # Intermediate (non-final) step reporting one correct issue.
    probe = Action(issues=[Issue(line=8, category="bug", description="test")], final=False)
    obs, reward, done, info = env.step(probe)

    assert obs.step_count == 1
    assert not done
    assert reward.value > -1.0

    # Final submission terminates the episode and reports an F1 score.
    submission = Action(issues=[Issue(line=8, category="bug", description="test")], final=True)
    obs, reward, done, info = env.step(submission)

    assert done
    assert "final_f1" in info
    print("Environment basic flow test passed.")

if __name__ == "__main__":
    test_basic_flow()
|
tests/test_graders.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
# Add parent directory to path to import environment
|
| 4 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 5 |
+
|
| 6 |
+
from environment.graders import grade_easy, grade_medium, grade_hard
|
| 7 |
+
from environment.models import Issue
|
| 8 |
+
|
| 9 |
+
def test_grader_variance():
    """Check the easy grader separates perfect, empty, partial, and wrong answers."""
    # Exact match with ground truth -> perfect F1.
    perfect = [
        Issue(line=8, category="bug", description=""),
        Issue(line=2, category="documentation", description="")
    ]
    assert grade_easy(perfect) == 1.0

    # Nothing reported -> zero.
    assert grade_easy([]) == 0.0

    # One of two issues found: TP=1, FP=0, FN=1 -> P=1, R=0.5 -> F1 = 2/3.
    partial = [Issue(line=8, category="bug", description="")]
    assert grade_easy(partial) == 0.667

    # A single wrong report -> zero.
    fp = [Issue(line=99, category="bug", description="")]
    assert grade_easy(fp) == 0.0

    print("All grader variance tests passed.")

if __name__ == "__main__":
    test_grader_variance()
|