Spaces:

inmodel
/

code-review-env

Sleeping

App Files Files Community

Nitish committed on Apr 7

Commit

e4853aa

1 Parent(s): babbbc8

chore: production standardisation of the server codebase

Browse files

Files changed (6) hide show

server/__init__.py +5 -0
server/app.py +47 -15
server/environment.py +45 -12
server/grader.py +102 -70
server/models.py +24 -20
server/tasks.py +9 -3

server/__init__.py CHANGED Viewed

	@@ -0,0 +1,5 @@

+"""Server package for the Code Security Review environment.
+This module houses the core FastAPI server, environment definitions,
+evaluation graders, and structured schema validations.
+"""

server/app.py CHANGED Viewed

@@ -1,7 +1,13 @@
 import os
 import uvicorn
 from typing import List, Optional
-from fastapi import FastAPI, HTTPException, Query
 from fastapi.middleware.cors import CORSMiddleware
 from server.models import CodeReviewAction, StepResult, ResetResponse, StateResponse, TaskInfo
@@ -25,7 +31,7 @@ env = CodeSecurityEnv()
 @app.get("/")
-def health():
     """Health check endpoint."""
     return {
         "status": "ok",
@@ -36,7 +42,7 @@ def health():
 @app.get("/tasks", response_model=List[TaskInfo])
-def list_tasks():
     """List all available tasks."""
     return [
         TaskInfo(
@@ -53,30 +59,56 @@ def list_tasks():
 def reset(
     task_id: str = Query(default="python-off-by-one", description="Task ID to reset to"),
     seed: Optional[int] = Query(default=None, description="Optional seed for reproducibility")
-):
     """Reset the environment and return the first observation."""
     if task_id not in TASKS:
-        raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found.")
-    obs = env.reset(task_id=task_id, seed=seed)
-    return ResetResponse(observation=obs)
 @app.post("/step", response_model=StepResult)
-def step(action: CodeReviewAction):
     """Submit a code review action and receive a reward signal."""
-    result = env.step(action)
-    return result
 @app.get("/state", response_model=StateResponse)
-def state():
     """Return the current environment state."""
-    return env.state()
-def main():
-    """Run the environment server."""
-    port = int(os.environ.get("PORT", 8000))
     uvicorn.run(
         "server.app:app",
         host="0.0.0.0",

+"""Main FastAPI application for Code Security Review.
+Exposes RESTful endpoints conforming to standard OpenEnv compliance specifications
+dictating interactions for agent evaluation.
+"""
 import os
 import uvicorn
 from typing import List, Optional
+from fastapi import FastAPI, HTTPException, Query, status
 from fastapi.middleware.cors import CORSMiddleware
 from server.models import CodeReviewAction, StepResult, ResetResponse, StateResponse, TaskInfo
 @app.get("/")
+def health() -> dict:
     """Health check endpoint."""
     return {
         "status": "ok",
 @app.get("/tasks", response_model=List[TaskInfo])
+def list_tasks() -> List[TaskInfo]:
     """List all available tasks."""
     return [
         TaskInfo(
 def reset(
     task_id: str = Query(default="python-off-by-one", description="Task ID to reset to"),
     seed: Optional[int] = Query(default=None, description="Optional seed for reproducibility")
+) -> ResetResponse:
     """Reset the environment and return the first observation."""
     if task_id not in TASKS:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"Task '{task_id}' not found."
+        )
+    try:
+        obs = env.reset(task_id=task_id, seed=seed)
+        return ResetResponse(observation=obs)
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"System breakdown during environment reset: {e}"
+        )
 @app.post("/step", response_model=StepResult)
+def step(action: CodeReviewAction) -> StepResult:
     """Submit a code review action and receive a reward signal."""
+    try:
+        return env.step(action)
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error executing agent action logic: {e}"
+        )
 @app.get("/state", response_model=StateResponse)
+def state() -> StateResponse:
     """Return the current environment state."""
+    try:
+        return env.state()
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error analyzing global runtime state tracking: {e}"
+        )
+def main() -> None:
+    """Run the environment ASGI server natively."""
+    port_default = os.environ.get("PORT", "8000")
+    try:
+         port = int(port_default)
+    except ValueError:
+         port = 8000
     uvicorn.run(
         "server.app:app",
         host="0.0.0.0",

server/environment.py CHANGED Viewed

@@ -1,26 +1,46 @@
 import random
-from typing import Optional, Dict, Tuple
 from server.tasks import TASKS
 from server.grader import grade_action
-from server.models import CodeObservation, StepResult, StateResponse, Action, Observation
 class CodeSecurityEnv:
-    def __init__(self):
-        self.current_task: Optional[dict] = None
         self.step_count: int = 0
         self.done: bool = False
         self.total_reward: float = 0.0
         self._task_ids = list(TASKS.keys())
     def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> Observation:
         if seed is not None:
             random.seed(seed)
         if task_id and task_id in TASKS:
             self.current_task = TASKS[task_id]
         else:
-            # Pick a task by its ID
             chosen_id = random.choice(self._task_ids)
             self.current_task = TASKS[chosen_id]
@@ -31,8 +51,15 @@ class CodeSecurityEnv:
         return self._make_observation()
     def step(self, action: Action) -> StepResult:
         if self.current_task is None:
-            # Auto-reset if called before reset()
             self.reset()
         if self.done:
@@ -40,16 +67,17 @@ class CodeSecurityEnv:
                 observation=self._make_observation(),
                 reward=0.0,
                 done=True,
-                info={"error": "Episode already completed. Call /reset to start a new episode."},
             )
-        # The action comes from the API as a Pydantic model (Action)
-        # The grader expects a dict or the model itself.
-        reward, breakdown = grade_action(action.model_dump(), self.current_task)
         self.step_count += 1
         self.total_reward += reward
-        self.done = True  # single-step environment — one action per episode
         return StepResult(
             observation=self._make_observation(),
@@ -63,7 +91,8 @@ class CodeSecurityEnv:
         )
     def state(self) -> StateResponse:
-        current_id = self.current_task["id"] if self.current_task else ""
         return StateResponse(
             task_id=current_id,
             step=self.step_count,
@@ -72,7 +101,11 @@ class CodeSecurityEnv:
         )
     def _make_observation(self) -> Observation:
         t = self.current_task
         return Observation(
             task_id=t["id"],
             language=t["language"],

+"""Reinforcement Learning Environment Core.
+Defines the environment logic, maintaining the current trajectory
+state and mediating between incoming requests and the headless grader.
+"""
 import random
+from typing import Optional, Dict, Any
 from server.tasks import TASKS
 from server.grader import grade_action
+from server.models import StepResult, StateResponse, Action, Observation
+ERROR_EPISODE_COMPLETED = "Episode already completed. Call /reset to start a new episode."
 class CodeSecurityEnv:
+    """Simulates the stateful progression of a software security assessment."""
+    def __init__(self) -> None:
+        """Initialize a fresh environment instance."""
+        self.current_task: Optional[Dict[str, Any]] = None
         self.step_count: int = 0
         self.done: bool = False
         self.total_reward: float = 0.0
         self._task_ids = list(TASKS.keys())
     def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> Observation:
+        """Reset the environment safely to a new or targeted initial state.
+        Args:
+            task_id: Optionally force the environment to yield a specific task definition.
+            seed: Initialize standard random seed.
+        Returns:
+            An Observation baseline reflecting the new scenario context.
+        """
         if seed is not None:
             random.seed(seed)
         if task_id and task_id in TASKS:
             self.current_task = TASKS[task_id]
         else:
             chosen_id = random.choice(self._task_ids)
             self.current_task = TASKS[chosen_id]
         return self._make_observation()
     def step(self, action: Action) -> StepResult:
+        """Advance the environment state using a provided agent Action payload.
+        Args:
+            action: Evaluated metrics provided directly by agent decision matrices.
+        Returns:
+            A StepResult containing scalar reward metrics and end-of-episode flag.
+        """
         if self.current_task is None:
             self.reset()
         if self.done:
                 observation=self._make_observation(),
                 reward=0.0,
                 done=True,
+                info={"error": ERROR_EPISODE_COMPLETED},
             )
+        try:
+            reward, breakdown = grade_action(action.model_dump(), self.current_task)
+        except Exception as e:
+            reward, breakdown = 0.0, {"error": f"Evaluation error: {e}"}
         self.step_count += 1
         self.total_reward += reward
+        self.done = True  # single-step environment
         return StepResult(
             observation=self._make_observation(),
         )
     def state(self) -> StateResponse:
+        """Return global analytics tracking the current environment session state."""
+        current_id = self.current_task["id"] if getattr(self, "current_task", None) else ""
         return StateResponse(
             task_id=current_id,
             step=self.step_count,
         )
     def _make_observation(self) -> Observation:
+        """Construct the contextual parameters surrounding an ongoing assessment."""
         t = self.current_task
+        if not t:
+            raise KeyError("Attempted observation render without an initialized active task")
         return Observation(
             task_id=t["id"],
             language=t["language"],

server/grader.py CHANGED Viewed

@@ -1,82 +1,114 @@
-from typing import Tuple, Dict
-def grade_action(action: dict, task: dict) -> Tuple[float, Dict[str, float]]:
     reward = 0.0
     breakdown: Dict[str, float] = {}
-    # ── Component 1: Bug identified (0.20) ──────────────────────────────────
-    if action.get("bug_identified"):
-        reward += 0.20
-        breakdown["bug_identified"] = 0.20
-    else:
-        breakdown["bug_identified"] = 0.00
-        # No bug found → no partial credit for anything else
-        return max(0.0, min(1.0, reward)), breakdown
-    # ── Component 2: Bug type match (0.20) ──────────────────────────────────
-    action_type = action.get("bug_type", "").lower().replace("-", " ").replace("_", " ")
-    task_type = task["bug_type"].lower().replace("-", " ").replace("_", " ")
-    if task_type in action_type or action_type in task_type:
-        reward += 0.20
-        breakdown["bug_type"] = 0.20
-    else:
-        breakdown["bug_type"] = 0.00
-    # ── Component 3: Bug location (0.10) ────────────────────────────────────
-    action_location = action.get("bug_location", "").lower()
-    location_keywords = [w for w in task["bug_location"].lower().split() if len(w) > 3]
-    if location_keywords:
-        matched = sum(1 for kw in location_keywords if kw in action_location)
-        loc_score = round(0.10 * (matched / len(location_keywords)), 4)
-    else:
-        loc_score = 0.0
-    reward += loc_score
-    breakdown["bug_location"] = loc_score
-    # ── Component 4: Description quality (0.25) ──────────────────────────────
-    description = action.get("bug_description", "").lower()
-    desc_score = 0.0
-    if len(description) >= 20:
-        task_keywords = task["keywords"]
-        matched_kw = [kw for kw in task_keywords if kw in description]
-        # Full points if they hit at least 3 keywords
-        desc_score = round(min(0.25, 0.25 * (len(matched_kw) / 3.0)), 4)
-    breakdown["description_quality"] = desc_score
-    reward += desc_score
-    # ── Component 5: Fix quality (0.15) ──────────────────────────────────────
-    fix = action.get("suggested_fix", "").lower()
-    fix_score = 0.0
-    if len(fix) >= 10:
-        fix_patterns = task["fix_patterns"]
-        matched_fix = [p for p in fix_patterns if p.lower() in fix]
-        # Match any 1 pattern for full points
-        fix_score = round(min(0.15, 0.15 * len(matched_fix)), 4)
-    breakdown["fix_quality"] = fix_score
-    reward += fix_score
-    # ── Component 6: Severity (0.10) ─────────────────────────────────────────
-    action_sev = action.get("severity", "").lower()
-    task_sev = task["severity"].lower()
-    if action_sev == task_sev:
-        sev_score = 0.10
-    elif action_sev in ("high", "critical") and task_sev in ("high", "critical"):
-        sev_score = 0.05
-    else:
-        sev_score = 0.00
-    breakdown["severity"] = sev_score
-    reward += sev_score
-    # ── Global Penalty: Keyword Stuffing ────────────────────────────────────
-    description = action.get("bug_description", "").lower()
-    words = description.split()
-    unique_ratio = len(set(words)) / len(words) if words else 1.0
-    if unique_ratio < 0.5:
-        reward *= 0.2  # Heavy global penalty
-        breakdown["stuffing_penalty_multiplier"] = 0.2
-        for k in list(breakdown.keys()):
-            if k != "stuffing_penalty_multiplier":
-                breakdown[k] = round(breakdown[k] * 0.2, 4)
-    return max(0.0, min(1.0, round(reward, 4))), breakdown

+"""Review Grader System.
+Implements programmatic sub-scoring logic for evaluating agent
+security actions against internal semantic criteria.
+"""
+from typing import Tuple, Dict, Any
# Maximum points awarded by each scoring component (they sum to 1.0).
SCORE_BUG_IDENTIFIED = 0.20
SCORE_BUG_TYPE = 0.20
SCORE_BUG_LOCATION = 0.10
SCORE_DESC_QUALITY = 0.25
SCORE_FIX_QUALITY = 0.15
SCORE_SEV_EXACT = 0.10
SCORE_SEV_PARTIAL = 0.05
# Number of task keywords a description must mention to earn full credit.
KEYWORD_HIT_TARGET = 3.0
# A description whose unique-word ratio falls below the threshold is treated
# as keyword stuffing and the whole score is scaled by the multiplier.
PENALTY_THRESHOLD = 0.5
PENALTY_MULTIPLIER = 0.2


def _action_text(action: Dict[str, Any], key: str) -> str:
    """Return ``action[key]`` as lower-cased text; missing or ``None`` becomes ``""``."""
    value = action.get(key)
    return "" if value is None else str(value).lower()


def _normalize_label(label: str) -> str:
    """Lower-case a bug-type label and unify ``-``/``_``/whitespace separators."""
    return " ".join(label.lower().replace("-", " ").replace("_", " ").split())


def grade_action(action: Dict[str, Any], task: Dict[str, Any]) -> Tuple[float, Dict[str, float]]:
    """Evaluate an action against the task definition.

    Scoring is the sum of six components (bug identified, bug type, bug
    location, description quality, fix quality, severity), followed by a
    global keyword-stuffing penalty on the description.

    Args:
        action: The structured payload proposed by the AI agent. Missing or
            ``None`` text fields are scored as empty strings.
        task: The dictionary blueprint detailing the expected vulnerability.
            Reads ``bug_type``, ``bug_location``, ``keywords``,
            ``fix_patterns`` and ``severity``.

    Returns:
        A tuple of the aggregate reward, clamped to ``[0.0, 1.0]`` and rounded
        to four decimals, and the individual component breakdown.

    Raises:
        RuntimeError: If the task definition lacks a key the grader needs.
    """
    reward = 0.0
    breakdown: Dict[str, float] = {}
    try:
        # ── Component 1: Bug identified ─────────────────────────────────────
        if action.get("bug_identified"):
            reward += SCORE_BUG_IDENTIFIED
            breakdown["bug_identified"] = SCORE_BUG_IDENTIFIED
        else:
            breakdown["bug_identified"] = 0.00
            # No bug found → no partial credit for anything else
            return max(0.0, min(1.0, reward)), breakdown

        # ── Component 2: Bug type match ─────────────────────────────────────
        action_type = _normalize_label(_action_text(action, "bug_type"))
        task_type = _normalize_label(task["bug_type"])
        # The empty string is a substring of every string, so a blank answer
        # must be rejected explicitly or it would earn full credit.
        type_matches = bool(action_type) and bool(task_type) and (
            task_type in action_type or action_type in task_type
        )
        type_score = SCORE_BUG_TYPE if type_matches else 0.00
        reward += type_score
        breakdown["bug_type"] = type_score

        # ── Component 3: Bug location ───────────────────────────────────────
        action_location = _action_text(action, "bug_location")
        # Only words longer than three characters count as location keywords.
        location_keywords = [w for w in task["bug_location"].lower().split() if len(w) > 3]
        if location_keywords:
            matched = sum(1 for kw in location_keywords if kw in action_location)
            loc_score = round(SCORE_BUG_LOCATION * (matched / len(location_keywords)), 4)
        else:
            loc_score = 0.0
        reward += loc_score
        breakdown["bug_location"] = loc_score

        # ── Component 4: Description quality ────────────────────────────────
        description = _action_text(action, "bug_description")
        desc_score = 0.0
        if len(description) >= 20:
            # Keywords are lower-cased because the description already is;
            # otherwise a keyword containing a capital could never match.
            matched_kw = [kw for kw in task["keywords"] if kw.lower() in description]
            desc_score = round(
                min(SCORE_DESC_QUALITY, SCORE_DESC_QUALITY * (len(matched_kw) / KEYWORD_HIT_TARGET)),
                4,
            )
        breakdown["description_quality"] = desc_score
        reward += desc_score

        # ── Component 5: Fix quality ────────────────────────────────────────
        fix = _action_text(action, "suggested_fix")
        fix_score = 0.0
        if len(fix) >= 10:
            matched_fix = [p for p in task["fix_patterns"] if p.lower() in fix]
            # A single matching pattern already earns the full component.
            fix_score = round(min(SCORE_FIX_QUALITY, SCORE_FIX_QUALITY * len(matched_fix)), 4)
        breakdown["fix_quality"] = fix_score
        reward += fix_score

        # ── Component 6: Severity ───────────────────────────────────────────
        action_sev = _action_text(action, "severity")
        task_sev = task["severity"].lower()
        if action_sev == task_sev:
            sev_score = SCORE_SEV_EXACT
        elif action_sev in ("high", "critical") and task_sev in ("high", "critical"):
            # Confusing "high" with "critical" still earns half credit.
            sev_score = SCORE_SEV_PARTIAL
        else:
            sev_score = 0.00
        breakdown["severity"] = sev_score
        reward += sev_score

        # ── Global Penalty: Keyword Stuffing ────────────────────────────────
        words = description.split()
        unique_ratio = len(set(words)) / len(words) if words else 1.0
        if unique_ratio < PENALTY_THRESHOLD:
            reward *= PENALTY_MULTIPLIER
            # Scale every component recorded so far, then record the multiplier.
            for component in list(breakdown):
                breakdown[component] = round(breakdown[component] * PENALTY_MULTIPLIER, 4)
            breakdown["stuffing_penalty_multiplier"] = PENALTY_MULTIPLIER
        return max(0.0, min(1.0, round(reward, 4))), breakdown
    except KeyError as exc:
        raise RuntimeError(f"Missing mandatory schema key in task definition: {exc}") from exc

server/models.py CHANGED Viewed

@@ -1,11 +1,13 @@
-from pydantic import BaseModel, Field
-from typing import Optional, Any, Dict
 # ── Agent Action ──────────────────────────────────────────────────────────────
 class CodeReviewAction(BaseModel):
     """Action taken by the agent: a structured code review."""
     bug_identified: bool = Field(..., description="Whether a bug was found")
     bug_location: str = Field(..., description="Location of the bug (function, line, variable)")
     bug_type: str = Field(..., description="Type: off-by-one | logic-error | security-vulnerability | none")
@@ -13,11 +15,11 @@ class CodeReviewAction(BaseModel):
     severity: str = Field(..., description="Severity: none | low | medium | high | critical")
     suggested_fix: str = Field(..., description="The corrected code or a description of how to fix it")
 # ── Observation ───────────────────────────────────────────────────────────────
 class CodeObservation(BaseModel):
     """What the agent sees at each step."""
     task_id: str = Field(..., description="Unique task identifier")
     language: str = Field(..., description="Programming language")
     difficulty: str = Field(..., description="Level: easy | medium | hard")
@@ -26,38 +28,40 @@ class CodeObservation(BaseModel):
     pr_title: str = Field(..., description="Pull request title submitted by developer")
     file_path: str = Field(..., description="File path of the code in the repository")
 # ── Step Result ───────────────────────────────────────────────────────────────
 class StepResult(BaseModel):
     """Result returned from env.step()."""
-    observation: Optional[CodeObservation] = None
-    reward: float
-    done: bool
-    info: Dict[str, Any]
 # ── State ─────────────────────────────────────────────────────────────────────
 class StateResponse(BaseModel):
     """Internal environment state exposed via /state."""
-    task_id: str
-    step: int
-    done: bool
-    total_reward: float
 # ── API Helpers ───────────────────────────────────────────────────────────────
 class ResetResponse(BaseModel):
-    observation: CodeObservation
 class TaskInfo(BaseModel):
-    id: str
-    language: str
-    bug_class: str
-    difficulty: str
 Action = CodeReviewAction
 Observation = CodeObservation

+"""Pydantic v2 models representing actions, observations, and state payloads."""
+from typing import Optional, Any, Dict
+from pydantic import BaseModel, Field
 # ── Agent Action ──────────────────────────────────────────────────────────────
 class CodeReviewAction(BaseModel):
     """Action taken by the agent: a structured code review."""
     bug_identified: bool = Field(..., description="Whether a bug was found")
     bug_location: str = Field(..., description="Location of the bug (function, line, variable)")
     bug_type: str = Field(..., description="Type: off-by-one | logic-error | security-vulnerability | none")
     severity: str = Field(..., description="Severity: none | low | medium | high | critical")
     suggested_fix: str = Field(..., description="The corrected code or a description of how to fix it")
 # ── Observation ───────────────────────────────────────────────────────────────
 class CodeObservation(BaseModel):
     """What the agent sees at each step."""
     task_id: str = Field(..., description="Unique task identifier")
     language: str = Field(..., description="Programming language")
     difficulty: str = Field(..., description="Level: easy | medium | hard")
     pr_title: str = Field(..., description="Pull request title submitted by developer")
     file_path: str = Field(..., description="File path of the code in the repository")
 # ── Step Result ───────────────────────────────────────────────────────────────
 class StepResult(BaseModel):
     """Result returned from env.step()."""
+    observation: Optional[CodeObservation] = Field(None, description="Observation if not terminal")
+    reward: float = Field(..., description="Reward generated for the preceding action")
+    done: bool = Field(..., description="Terminal state flag")
+    info: Dict[str, Any] = Field(default_factory=dict, description="Metadata dictionary")
 # ── State ─────────────────────────────────────────────────────────────────────
 class StateResponse(BaseModel):
     """Internal environment state exposed via /state."""
+    task_id: str = Field(..., description="Current running task")
+    step: int = Field(..., description="Current evaluation step")
+    done: bool = Field(..., description="Whether the episode resides in a terminal state")
+    total_reward: float = Field(..., description="Sum of step rewards over the episode")
 # ── API Helpers ───────────────────────────────────────────────────────────────
 class ResetResponse(BaseModel):
+    """Response wrapper returned strictly on environment resets."""
+    observation: CodeObservation = Field(..., description="Initial environment observation upon reset")
 class TaskInfo(BaseModel):
+    """Metadata regarding an available task scenario."""
+    id: str = Field(..., description="Task UUID or unique string identifier")
+    language: str = Field(..., description="Source code language for the flaw context")
+    bug_class: str = Field(..., description="The classification parameter of the embedded bug")
+    difficulty: str = Field(..., description="The difficulty tier indicator (e.g. easy, medium)")
 Action = CodeReviewAction
 Observation = CodeObservation

server/tasks.py CHANGED Viewed

@@ -1,7 +1,13 @@
-# OpenEnv Tasks for Code Security Review
-# These tasks are designed to test AI agents' ability to identify common security vulnerabilities.
-TASKS = {
     "python-off-by-one": {
         "id": "python-off-by-one",
         "name": "Python Off-by-One Error",

+"""OpenEnv Tasks for Code Security Review.
+These task specifications are designed to rigorously test autonomous AI
+agents' abilities to identify, classify, and mitigate common software
+security vulnerabilities across distinct language paradigms.
+"""
+from typing import Dict, Any
+TASKS: Dict[str, Any] = {
     "python-off-by-one": {
         "id": "python-off-by-one",
         "name": "Python Off-by-One Error",