Spaces:
Running
Running
Nitish committed on
Commit ·
e4853aa
1
Parent(s): babbbc8
chore: production standardisation of the server codebase
Browse files- server/__init__.py +5 -0
- server/app.py +47 -15
- server/environment.py +45 -12
- server/grader.py +102 -70
- server/models.py +24 -20
- server/tasks.py +9 -3
server/__init__.py
CHANGED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Server package for the Code Security Review environment.
|
| 2 |
+
|
| 3 |
+
This module houses the core FastAPI server, environment definitions,
|
| 4 |
+
evaluation graders, and structured schema validations.
|
| 5 |
+
"""
|
server/app.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import uvicorn
|
| 3 |
from typing import List, Optional
|
| 4 |
-
from fastapi import FastAPI, HTTPException, Query
|
| 5 |
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
|
| 7 |
from server.models import CodeReviewAction, StepResult, ResetResponse, StateResponse, TaskInfo
|
|
@@ -25,7 +31,7 @@ env = CodeSecurityEnv()
|
|
| 25 |
|
| 26 |
|
| 27 |
@app.get("/")
|
| 28 |
-
def health():
|
| 29 |
"""Health check endpoint."""
|
| 30 |
return {
|
| 31 |
"status": "ok",
|
|
@@ -36,7 +42,7 @@ def health():
|
|
| 36 |
|
| 37 |
|
| 38 |
@app.get("/tasks", response_model=List[TaskInfo])
|
| 39 |
-
def list_tasks():
|
| 40 |
"""List all available tasks."""
|
| 41 |
return [
|
| 42 |
TaskInfo(
|
|
@@ -53,30 +59,56 @@ def list_tasks():
|
|
| 53 |
def reset(
|
| 54 |
task_id: str = Query(default="python-off-by-one", description="Task ID to reset to"),
|
| 55 |
seed: Optional[int] = Query(default=None, description="Optional seed for reproducibility")
|
| 56 |
-
):
|
| 57 |
"""Reset the environment and return the first observation."""
|
| 58 |
if task_id not in TASKS:
|
| 59 |
-
raise HTTPException(
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
@app.post("/step", response_model=StepResult)
|
| 65 |
-
def step(action: CodeReviewAction):
|
| 66 |
"""Submit a code review action and receive a reward signal."""
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
@app.get("/state", response_model=StateResponse)
|
| 72 |
-
def state():
|
| 73 |
"""Return the current environment state."""
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
def main():
|
| 78 |
-
"""Run the environment server."""
|
| 79 |
-
port = int(os.environ.get("PORT", 8000))
|
| 80 |
uvicorn.run(
|
| 81 |
"server.app:app",
|
| 82 |
host="0.0.0.0",
|
|
|
|
| 1 |
+
"""Main FastAPI application for Code Security Review.
|
| 2 |
+
|
| 3 |
+
Exposes RESTful endpoints conforming to standard OpenEnv compliance specifications
|
| 4 |
+
dictating interactions for agent evaluation.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
import os
|
| 8 |
import uvicorn
|
| 9 |
from typing import List, Optional
|
| 10 |
+
from fastapi import FastAPI, HTTPException, Query, status
|
| 11 |
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
|
| 13 |
from server.models import CodeReviewAction, StepResult, ResetResponse, StateResponse, TaskInfo
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
@app.get("/")
|
| 34 |
+
def health() -> dict:
|
| 35 |
"""Health check endpoint."""
|
| 36 |
return {
|
| 37 |
"status": "ok",
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
@app.get("/tasks", response_model=List[TaskInfo])
|
| 45 |
+
def list_tasks() -> List[TaskInfo]:
|
| 46 |
"""List all available tasks."""
|
| 47 |
return [
|
| 48 |
TaskInfo(
|
|
|
|
| 59 |
def reset(
|
| 60 |
task_id: str = Query(default="python-off-by-one", description="Task ID to reset to"),
|
| 61 |
seed: Optional[int] = Query(default=None, description="Optional seed for reproducibility")
|
| 62 |
+
) -> ResetResponse:
|
| 63 |
"""Reset the environment and return the first observation."""
|
| 64 |
if task_id not in TASKS:
|
| 65 |
+
raise HTTPException(
|
| 66 |
+
status_code=status.HTTP_404_NOT_FOUND,
|
| 67 |
+
detail=f"Task '{task_id}' not found."
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
try:
|
| 71 |
+
obs = env.reset(task_id=task_id, seed=seed)
|
| 72 |
+
return ResetResponse(observation=obs)
|
| 73 |
+
except Exception as e:
|
| 74 |
+
raise HTTPException(
|
| 75 |
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 76 |
+
detail=f"System breakdown during environment reset: {e}"
|
| 77 |
+
)
|
| 78 |
|
| 79 |
|
| 80 |
@app.post("/step", response_model=StepResult)
def step(action: CodeReviewAction) -> StepResult:
    """Submit a code review action and receive a reward signal."""
    # The docstring above is kept to one line on purpose: FastAPI publishes
    # endpoint docstrings as the OpenAPI operation description.
    #
    # action: the request body, already validated against CodeReviewAction.
    # Returns whatever env.step(action) produces.
    # Raises HTTPException (500) if env.step raises anything.
    try:
        return env.step(action)
    except Exception as exc:
        # Boundary handler: turn any failure into an HTTP 500, but chain the
        # original exception so its traceback stays attached as __cause__.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error executing agent action logic: {exc}",
        ) from exc
|
| 90 |
|
| 91 |
|
| 92 |
@app.get("/state", response_model=StateResponse)
def state() -> StateResponse:
    """Return the current environment state."""
    # The docstring above is kept to one line on purpose: FastAPI publishes
    # endpoint docstrings as the OpenAPI operation description.
    #
    # Returns whatever env.state() produces.
    # Raises HTTPException (500) if env.state raises anything.
    try:
        return env.state()
    except Exception as exc:
        # Boundary handler: turn any failure into an HTTP 500, but chain the
        # original exception so its traceback stays attached as __cause__.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error analyzing global runtime state tracking: {exc}",
        ) from exc
|
| 102 |
+
|
| 103 |
|
| 104 |
+
def main() -> None:
|
| 105 |
+
"""Run the environment ASGI server natively."""
|
| 106 |
+
port_default = os.environ.get("PORT", "8000")
|
| 107 |
+
try:
|
| 108 |
+
port = int(port_default)
|
| 109 |
+
except ValueError:
|
| 110 |
+
port = 8000
|
| 111 |
|
|
|
|
|
|
|
|
|
|
| 112 |
uvicorn.run(
|
| 113 |
"server.app:app",
|
| 114 |
host="0.0.0.0",
|
server/environment.py
CHANGED
|
@@ -1,26 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import random
|
| 2 |
-
from typing import Optional, Dict,
|
| 3 |
|
| 4 |
from server.tasks import TASKS
|
| 5 |
from server.grader import grade_action
|
| 6 |
-
from server.models import
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
class CodeSecurityEnv:
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
self.step_count: int = 0
|
| 12 |
self.done: bool = False
|
| 13 |
self.total_reward: float = 0.0
|
| 14 |
self._task_ids = list(TASKS.keys())
|
| 15 |
|
| 16 |
def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> Observation:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
if seed is not None:
|
| 18 |
random.seed(seed)
|
| 19 |
|
| 20 |
if task_id and task_id in TASKS:
|
| 21 |
self.current_task = TASKS[task_id]
|
| 22 |
else:
|
| 23 |
-
# Pick a task by its ID
|
| 24 |
chosen_id = random.choice(self._task_ids)
|
| 25 |
self.current_task = TASKS[chosen_id]
|
| 26 |
|
|
@@ -31,8 +51,15 @@ class CodeSecurityEnv:
|
|
| 31 |
return self._make_observation()
|
| 32 |
|
| 33 |
def step(self, action: Action) -> StepResult:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
if self.current_task is None:
|
| 35 |
-
# Auto-reset if called before reset()
|
| 36 |
self.reset()
|
| 37 |
|
| 38 |
if self.done:
|
|
@@ -40,16 +67,17 @@ class CodeSecurityEnv:
|
|
| 40 |
observation=self._make_observation(),
|
| 41 |
reward=0.0,
|
| 42 |
done=True,
|
| 43 |
-
info={"error":
|
| 44 |
)
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
| 49 |
|
| 50 |
self.step_count += 1
|
| 51 |
self.total_reward += reward
|
| 52 |
-
self.done = True # single-step environment
|
| 53 |
|
| 54 |
return StepResult(
|
| 55 |
observation=self._make_observation(),
|
|
@@ -63,7 +91,8 @@ class CodeSecurityEnv:
|
|
| 63 |
)
|
| 64 |
|
| 65 |
def state(self) -> StateResponse:
|
| 66 |
-
|
|
|
|
| 67 |
return StateResponse(
|
| 68 |
task_id=current_id,
|
| 69 |
step=self.step_count,
|
|
@@ -72,7 +101,11 @@ class CodeSecurityEnv:
|
|
| 72 |
)
|
| 73 |
|
| 74 |
def _make_observation(self) -> Observation:
|
|
|
|
| 75 |
t = self.current_task
|
|
|
|
|
|
|
|
|
|
| 76 |
return Observation(
|
| 77 |
task_id=t["id"],
|
| 78 |
language=t["language"],
|
|
|
|
| 1 |
+
"""Reinforcement Learning Environment Core.
|
| 2 |
+
|
| 3 |
+
Defines the environment logic, maintaining the current trajectory
|
| 4 |
+
state and mediating between incoming requests and the headless grader.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
import random
|
| 8 |
+
from typing import Optional, Dict, Any
|
| 9 |
|
| 10 |
from server.tasks import TASKS
|
| 11 |
from server.grader import grade_action
|
| 12 |
+
from server.models import StepResult, StateResponse, Action, Observation
|
| 13 |
+
|
| 14 |
+
ERROR_EPISODE_COMPLETED = "Episode already completed. Call /reset to start a new episode."
|
| 15 |
+
|
| 16 |
|
| 17 |
class CodeSecurityEnv:
|
| 18 |
+
"""Simulates the stateful progression of a software security assessment."""
|
| 19 |
+
|
| 20 |
+
def __init__(self) -> None:
|
| 21 |
+
"""Initialize a fresh environment instance."""
|
| 22 |
+
self.current_task: Optional[Dict[str, Any]] = None
|
| 23 |
self.step_count: int = 0
|
| 24 |
self.done: bool = False
|
| 25 |
self.total_reward: float = 0.0
|
| 26 |
self._task_ids = list(TASKS.keys())
|
| 27 |
|
| 28 |
def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> Observation:
|
| 29 |
+
"""Reset the environment safely to a new or targeted initial state.
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
task_id: Optionally force the environment to yield a specific task definition.
|
| 33 |
+
seed: Initialize standard random seed.
|
| 34 |
+
|
| 35 |
+
Returns:
|
| 36 |
+
An Observation baseline reflecting the new scenario context.
|
| 37 |
+
"""
|
| 38 |
if seed is not None:
|
| 39 |
random.seed(seed)
|
| 40 |
|
| 41 |
if task_id and task_id in TASKS:
|
| 42 |
self.current_task = TASKS[task_id]
|
| 43 |
else:
|
|
|
|
| 44 |
chosen_id = random.choice(self._task_ids)
|
| 45 |
self.current_task = TASKS[chosen_id]
|
| 46 |
|
|
|
|
| 51 |
return self._make_observation()
|
| 52 |
|
| 53 |
def step(self, action: Action) -> StepResult:
|
| 54 |
+
"""Advance the environment state using a provided agent Action payload.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
action: Evaluated metrics provided directly by agent decision matrices.
|
| 58 |
+
|
| 59 |
+
Returns:
|
| 60 |
+
A StepResult containing scalar reward metrics and end-of-episode flag.
|
| 61 |
+
"""
|
| 62 |
if self.current_task is None:
|
|
|
|
| 63 |
self.reset()
|
| 64 |
|
| 65 |
if self.done:
|
|
|
|
| 67 |
observation=self._make_observation(),
|
| 68 |
reward=0.0,
|
| 69 |
done=True,
|
| 70 |
+
info={"error": ERROR_EPISODE_COMPLETED},
|
| 71 |
)
|
| 72 |
|
| 73 |
+
try:
|
| 74 |
+
reward, breakdown = grade_action(action.model_dump(), self.current_task)
|
| 75 |
+
except Exception as e:
|
| 76 |
+
reward, breakdown = 0.0, {"error": f"Evaluation error: {e}"}
|
| 77 |
|
| 78 |
self.step_count += 1
|
| 79 |
self.total_reward += reward
|
| 80 |
+
self.done = True # single-step environment
|
| 81 |
|
| 82 |
return StepResult(
|
| 83 |
observation=self._make_observation(),
|
|
|
|
| 91 |
)
|
| 92 |
|
| 93 |
def state(self) -> StateResponse:
|
| 94 |
+
"""Return global analytics tracking the current environment session state."""
|
| 95 |
+
current_id = self.current_task["id"] if getattr(self, "current_task", None) else ""
|
| 96 |
return StateResponse(
|
| 97 |
task_id=current_id,
|
| 98 |
step=self.step_count,
|
|
|
|
| 101 |
)
|
| 102 |
|
| 103 |
def _make_observation(self) -> Observation:
|
| 104 |
+
"""Construct the contextual parameters surrounding an ongoing assessment."""
|
| 105 |
t = self.current_task
|
| 106 |
+
if not t:
|
| 107 |
+
raise KeyError("Attempted observation render without an initialized active task")
|
| 108 |
+
|
| 109 |
return Observation(
|
| 110 |
task_id=t["id"],
|
| 111 |
language=t["language"],
|
server/grader.py
CHANGED
|
@@ -1,82 +1,114 @@
|
|
| 1 |
-
|
| 2 |
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
reward = 0.0
|
| 6 |
breakdown: Dict[str, float] = {}
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
breakdown["severity"] = sev_score
|
| 69 |
-
reward += sev_score
|
| 70 |
|
| 71 |
-
|
| 72 |
-
description = action.get("bug_description", "").lower()
|
| 73 |
-
words = description.split()
|
| 74 |
-
unique_ratio = len(set(words)) / len(words) if words else 1.0
|
| 75 |
-
if unique_ratio < 0.5:
|
| 76 |
-
reward *= 0.2 # Heavy global penalty
|
| 77 |
-
breakdown["stuffing_penalty_multiplier"] = 0.2
|
| 78 |
-
for k in list(breakdown.keys()):
|
| 79 |
-
if k != "stuffing_penalty_multiplier":
|
| 80 |
-
breakdown[k] = round(breakdown[k] * 0.2, 4)
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
| 1 |
+
"""Review Grader System.
|
| 2 |
|
| 3 |
+
Implements programmatic sub-scoring logic for evaluating agent
|
| 4 |
+
security actions against internal semantic criteria.
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
+
from typing import Tuple, Dict, Any
|
| 8 |
+
|
| 9 |
+
# Maximum contribution of each scoring component. Severity contributes either
# the exact or the partial value, so the six maxima add up to 1.0.
SCORE_BUG_IDENTIFIED = 0.20
SCORE_BUG_TYPE = 0.20
SCORE_BUG_LOCATION = 0.10
SCORE_DESC_QUALITY = 0.25
SCORE_FIX_QUALITY = 0.15
SCORE_SEV_EXACT = 0.10
SCORE_SEV_PARTIAL = 0.05

# Number of matched task keywords that earns the full description score.
KEYWORD_HIT_TARGET = 3.0
# Unique-word ratio below which a description counts as keyword stuffing.
PENALTY_THRESHOLD = 0.5
# Factor applied to the reward and to every component when stuffing is found.
PENALTY_MULTIPLIER = 0.2


def _as_lower_text(value: Any) -> str:
    """Return *value* as a lower-cased string, mapping ``None`` to ``""``."""
    return "" if value is None else str(value).lower()


def _normalize_bug_type(value: Any) -> str:
    """Lower-case a bug type and treat ``-`` / ``_`` as spaces."""
    return _as_lower_text(value).replace("-", " ").replace("_", " ").strip()


def _task_field(task: Dict[str, Any], key: str) -> Any:
    """Fetch a mandatory task key, raising ``RuntimeError`` if it is absent."""
    try:
        return task[key]
    except KeyError as exc:
        raise RuntimeError(f"Missing mandatory schema key in task definition: {exc}") from exc


def _count_hits(needles: Any, haystack: str) -> int:
    """Count non-empty *needles* found, case-insensitively, in *haystack*.

    *haystack* must already be lower-cased.
    """
    lowered = (_as_lower_text(needle) for needle in needles)
    return sum(1 for needle in lowered if needle and needle in haystack)


def grade_action(
    action: Dict[str, Any],
    task: Dict[str, Any],
) -> Tuple[float, Dict[str, float]]:
    """Evaluate an action against the task definition.

    Six components are scored and summed: bug identified, bug type, bug
    location, description quality, fix quality and severity. If the
    description repeats itself so much that fewer than ``PENALTY_THRESHOLD``
    of its words are unique, the reward and every component are multiplied
    by ``PENALTY_MULTIPLIER``.

    Args:
        action: The agent's review. Keys read: ``bug_identified``,
            ``bug_type``, ``bug_location``, ``bug_description``,
            ``suggested_fix`` and ``severity``. Missing or ``None`` text
            fields are treated as empty strings.
        task: The task definition. Keys read: ``bug_type``, ``bug_location``
            and ``severity`` always; ``keywords`` only when the description
            has at least 20 characters; ``fix_patterns`` only when the fix
            has at least 10 characters.

    Returns:
        A tuple ``(reward, breakdown)``: the reward rounded to 4 decimals and
        clamped to ``[0.0, 1.0]``, and the per-component scores. When
        ``bug_identified`` is falsy the result is
        ``(0.0, {"bug_identified": 0.0})``.

    Raises:
        RuntimeError: If a task key that is needed is missing.
    """
    breakdown: Dict[str, float] = {}

    # -- Component 1: bug identified (0.20) ---------------------------------
    if not action.get("bug_identified"):
        # No bug found -> no partial credit for anything else.
        breakdown["bug_identified"] = 0.0
        return 0.0, breakdown

    reward = SCORE_BUG_IDENTIFIED
    breakdown["bug_identified"] = SCORE_BUG_IDENTIFIED

    # -- Component 2: bug type match (0.20) ---------------------------------
    action_type = _normalize_bug_type(action.get("bug_type"))
    task_type = _normalize_bug_type(_task_field(task, "bug_type"))
    # An empty answer must not match: "" is a substring of every string.
    type_matches = bool(action_type) and (
        task_type in action_type or action_type in task_type
    )
    type_score = SCORE_BUG_TYPE if type_matches else 0.0
    reward += type_score
    breakdown["bug_type"] = type_score

    # -- Component 3: bug location (0.10) -----------------------------------
    action_location = _as_lower_text(action.get("bug_location"))
    expected_location = _as_lower_text(_task_field(task, "bug_location"))
    # Only words longer than three characters are significant.
    location_keywords = [word for word in expected_location.split() if len(word) > 3]
    if location_keywords:
        matched = sum(1 for word in location_keywords if word in action_location)
        loc_score = round(SCORE_BUG_LOCATION * (matched / len(location_keywords)), 4)
    else:
        loc_score = 0.0
    reward += loc_score
    breakdown["bug_location"] = loc_score

    # -- Component 4: description quality (0.25) ----------------------------
    description = _as_lower_text(action.get("bug_description"))
    desc_score = 0.0
    if len(description) >= 20:
        # Keywords are lower-cased before matching, like the fix patterns.
        keyword_hits = _count_hits(_task_field(task, "keywords"), description)
        desc_score = round(
            min(SCORE_DESC_QUALITY, SCORE_DESC_QUALITY * (keyword_hits / KEYWORD_HIT_TARGET)),
            4,
        )
    breakdown["description_quality"] = desc_score
    reward += desc_score

    # -- Component 5: fix quality (0.15) ------------------------------------
    fix = _as_lower_text(action.get("suggested_fix"))
    fix_score = 0.0
    if len(fix) >= 10:
        pattern_hits = _count_hits(_task_field(task, "fix_patterns"), fix)
        fix_score = round(min(SCORE_FIX_QUALITY, SCORE_FIX_QUALITY * pattern_hits), 4)
    breakdown["fix_quality"] = fix_score
    reward += fix_score

    # -- Component 6: severity (0.10) ---------------------------------------
    action_sev = _as_lower_text(action.get("severity"))
    task_sev = _as_lower_text(_task_field(task, "severity"))
    upper_tiers = ("high", "critical")
    if action_sev == task_sev:
        sev_score = SCORE_SEV_EXACT
    elif action_sev in upper_tiers and task_sev in upper_tiers:
        # high vs. critical (either way round) earns half credit.
        sev_score = SCORE_SEV_PARTIAL
    else:
        sev_score = 0.0
    breakdown["severity"] = sev_score
    reward += sev_score

    # -- Global penalty: keyword stuffing -----------------------------------
    words = description.split()
    unique_ratio = len(set(words)) / len(words) if words else 1.0
    if unique_ratio < PENALTY_THRESHOLD:
        reward *= PENALTY_MULTIPLIER
        breakdown = {
            name: round(score * PENALTY_MULTIPLIER, 4)
            for name, score in breakdown.items()
        }
        # Added last so the multiplier itself is not scaled.
        breakdown["stuffing_penalty_multiplier"] = PENALTY_MULTIPLIER

    return max(0.0, min(1.0, round(reward, 4))), breakdown
|
server/models.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
from typing import Optional, Any, Dict
|
| 3 |
|
|
|
|
|
|
|
| 4 |
|
| 5 |
# ── Agent Action ──────────────────────────────────────────────────────────────
|
| 6 |
|
| 7 |
class CodeReviewAction(BaseModel):
|
| 8 |
"""Action taken by the agent: a structured code review."""
|
|
|
|
| 9 |
bug_identified: bool = Field(..., description="Whether a bug was found")
|
| 10 |
bug_location: str = Field(..., description="Location of the bug (function, line, variable)")
|
| 11 |
bug_type: str = Field(..., description="Type: off-by-one | logic-error | security-vulnerability | none")
|
|
@@ -13,11 +15,11 @@ class CodeReviewAction(BaseModel):
|
|
| 13 |
severity: str = Field(..., description="Severity: none | low | medium | high | critical")
|
| 14 |
suggested_fix: str = Field(..., description="The corrected code or a description of how to fix it")
|
| 15 |
|
| 16 |
-
|
| 17 |
# ── Observation ───────────────────────────────────────────────────────────────
|
| 18 |
|
| 19 |
class CodeObservation(BaseModel):
|
| 20 |
"""What the agent sees at each step."""
|
|
|
|
| 21 |
task_id: str = Field(..., description="Unique task identifier")
|
| 22 |
language: str = Field(..., description="Programming language")
|
| 23 |
difficulty: str = Field(..., description="Level: easy | medium | hard")
|
|
@@ -26,38 +28,40 @@ class CodeObservation(BaseModel):
|
|
| 26 |
pr_title: str = Field(..., description="Pull request title submitted by developer")
|
| 27 |
file_path: str = Field(..., description="File path of the code in the repository")
|
| 28 |
|
| 29 |
-
|
| 30 |
# ── Step Result ───────────────────────────────────────────────────────────────
|
| 31 |
|
| 32 |
class StepResult(BaseModel):
|
| 33 |
"""Result returned from env.step()."""
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
|
| 40 |
# ── State ─────────────────────────────────────────────────────────────────────
|
| 41 |
|
| 42 |
class StateResponse(BaseModel):
|
| 43 |
"""Internal environment state exposed via /state."""
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
|
| 50 |
# ── API Helpers ───────────────────────────────────────────────────────────────
|
| 51 |
|
| 52 |
class ResetResponse(BaseModel):
|
| 53 |
-
|
| 54 |
-
|
|
|
|
| 55 |
|
| 56 |
class TaskInfo(BaseModel):
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
|
| 62 |
Action = CodeReviewAction
|
| 63 |
Observation = CodeObservation
|
|
|
|
| 1 |
+
"""Pydantic v2 models representing actions, observations, and state payloads."""
|
|
|
|
| 2 |
|
| 3 |
+
from typing import Optional, Any, Dict
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
|
| 6 |
# ── Agent Action ──────────────────────────────────────────────────────────────
|
| 7 |
|
| 8 |
class CodeReviewAction(BaseModel):
|
| 9 |
"""Action taken by the agent: a structured code review."""
|
| 10 |
+
|
| 11 |
bug_identified: bool = Field(..., description="Whether a bug was found")
|
| 12 |
bug_location: str = Field(..., description="Location of the bug (function, line, variable)")
|
| 13 |
bug_type: str = Field(..., description="Type: off-by-one | logic-error | security-vulnerability | none")
|
|
|
|
| 15 |
severity: str = Field(..., description="Severity: none | low | medium | high | critical")
|
| 16 |
suggested_fix: str = Field(..., description="The corrected code or a description of how to fix it")
|
| 17 |
|
|
|
|
| 18 |
# ── Observation ───────────────────────────────────────────────────────────────
|
| 19 |
|
| 20 |
class CodeObservation(BaseModel):
|
| 21 |
"""What the agent sees at each step."""
|
| 22 |
+
|
| 23 |
task_id: str = Field(..., description="Unique task identifier")
|
| 24 |
language: str = Field(..., description="Programming language")
|
| 25 |
difficulty: str = Field(..., description="Level: easy | medium | hard")
|
|
|
|
| 28 |
pr_title: str = Field(..., description="Pull request title submitted by developer")
|
| 29 |
file_path: str = Field(..., description="File path of the code in the repository")
|
| 30 |
|
|
|
|
| 31 |
# ── Step Result ───────────────────────────────────────────────────────────────
|
| 32 |
|
| 33 |
class StepResult(BaseModel):
    """Result returned from env.step()."""

    # The class docstring is left exactly as it was: Pydantic publishes it as
    # the model description in the generated JSON schema.
    observation: Optional[CodeObservation] = Field(
        default=None,
        description="Observation if not terminal",
    )
    reward: float = Field(
        description="Reward generated for the preceding action",
    )
    done: bool = Field(
        description="Terminal state flag",
    )
    info: Dict[str, Any] = Field(
        default_factory=dict,
        description="Metadata dictionary",
    )
|
| 40 |
|
| 41 |
# ── State ─────────────────────────────────────────────────────────────────────
|
| 42 |
|
| 43 |
class StateResponse(BaseModel):
    """Internal environment state exposed via /state."""

    # All four fields are required (no default is supplied).
    task_id: str = Field(
        description="Current running task",
    )
    step: int = Field(
        description="Current evaluation step",
    )
    done: bool = Field(
        description="Whether the episode resides in a terminal state",
    )
    total_reward: float = Field(
        description="Sum of step rewards over the episode",
    )
|
| 50 |
|
| 51 |
# ── API Helpers ───────────────────────────────────────────────────────────────
|
| 52 |
|
| 53 |
class ResetResponse(BaseModel):
    """Response wrapper returned strictly on environment resets."""

    # Required: a reset response always carries an observation.
    observation: CodeObservation = Field(
        description="Initial environment observation upon reset",
    )
|
| 57 |
|
| 58 |
class TaskInfo(BaseModel):
    """Metadata regarding an available task scenario."""

    # All four fields are required (no default is supplied).
    id: str = Field(
        description="Task UUID or unique string identifier",
    )
    language: str = Field(
        description="Source code language for the flaw context",
    )
    bug_class: str = Field(
        description="The classification parameter of the embedded bug",
    )
    difficulty: str = Field(
        description="The difficulty tier indicator (e.g. easy, medium)",
    )
|
| 65 |
|
| 66 |
Action = CodeReviewAction
|
| 67 |
Observation = CodeObservation
|
server/tasks.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
# These tasks are designed to test AI agents' ability to identify common security vulnerabilities.
|
| 3 |
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"python-off-by-one": {
|
| 6 |
"id": "python-off-by-one",
|
| 7 |
"name": "Python Off-by-One Error",
|
|
|
|
| 1 |
+
"""OpenEnv Tasks for Code Security Review.
|
|
|
|
| 2 |
|
| 3 |
+
These task specifications are designed to rigorously test autonomous AI
|
| 4 |
+
agents' abilities to identify, classify, and mitigate common software
|
| 5 |
+
security vulnerabilities across distinct language paradigms.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Dict, Any
|
| 9 |
+
|
| 10 |
+
TASKS: Dict[str, Any] = {
|
| 11 |
"python-off-by-one": {
|
| 12 |
"id": "python-off-by-one",
|
| 13 |
"name": "Python Off-by-One Error",
|