Nitish committed on
Commit
e4853aa
Β·
1 Parent(s): babbbc8

chore: production standardisation of the server codebase

Browse files
Files changed (6) hide show
  1. server/__init__.py +5 -0
  2. server/app.py +47 -15
  3. server/environment.py +45 -12
  4. server/grader.py +102 -70
  5. server/models.py +24 -20
  6. server/tasks.py +9 -3
server/__init__.py CHANGED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Server package for the Code Security Review environment.
2
+
3
+ This module houses the core FastAPI server, environment definitions,
4
+ evaluation graders, and structured schema validations.
5
+ """
server/app.py CHANGED
@@ -1,7 +1,13 @@
 
 
 
 
 
 
1
  import os
2
  import uvicorn
3
  from typing import List, Optional
4
- from fastapi import FastAPI, HTTPException, Query
5
  from fastapi.middleware.cors import CORSMiddleware
6
 
7
  from server.models import CodeReviewAction, StepResult, ResetResponse, StateResponse, TaskInfo
@@ -25,7 +31,7 @@ env = CodeSecurityEnv()
25
 
26
 
27
  @app.get("/")
28
- def health():
29
  """Health check endpoint."""
30
  return {
31
  "status": "ok",
@@ -36,7 +42,7 @@ def health():
36
 
37
 
38
  @app.get("/tasks", response_model=List[TaskInfo])
39
- def list_tasks():
40
  """List all available tasks."""
41
  return [
42
  TaskInfo(
@@ -53,30 +59,56 @@ def list_tasks():
53
  def reset(
54
  task_id: str = Query(default="python-off-by-one", description="Task ID to reset to"),
55
  seed: Optional[int] = Query(default=None, description="Optional seed for reproducibility")
56
- ):
57
  """Reset the environment and return the first observation."""
58
  if task_id not in TASKS:
59
- raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found.")
60
- obs = env.reset(task_id=task_id, seed=seed)
61
- return ResetResponse(observation=obs)
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  @app.post("/step", response_model=StepResult)
65
- def step(action: CodeReviewAction):
66
  """Submit a code review action and receive a reward signal."""
67
- result = env.step(action)
68
- return result
 
 
 
 
 
69
 
70
 
71
  @app.get("/state", response_model=StateResponse)
72
- def state():
73
  """Return the current environment state."""
74
- return env.state()
 
 
 
 
 
 
 
75
 
 
 
 
 
 
 
 
76
 
77
- def main():
78
- """Run the environment server."""
79
- port = int(os.environ.get("PORT", 8000))
80
  uvicorn.run(
81
  "server.app:app",
82
  host="0.0.0.0",
 
1
+ """Main FastAPI application for Code Security Review.
2
+
3
+ Exposes RESTful endpoints conforming to standard OpenEnv compliance specifications
4
+ dictating interactions for agent evaluation.
5
+ """
6
+
7
  import os
8
  import uvicorn
9
  from typing import List, Optional
10
+ from fastapi import FastAPI, HTTPException, Query, status
11
  from fastapi.middleware.cors import CORSMiddleware
12
 
13
  from server.models import CodeReviewAction, StepResult, ResetResponse, StateResponse, TaskInfo
 
31
 
32
 
33
  @app.get("/")
34
+ def health() -> dict:
35
  """Health check endpoint."""
36
  return {
37
  "status": "ok",
 
42
 
43
 
44
  @app.get("/tasks", response_model=List[TaskInfo])
45
+ def list_tasks() -> List[TaskInfo]:
46
  """List all available tasks."""
47
  return [
48
  TaskInfo(
 
59
def reset(
    task_id: str = Query(default="python-off-by-one", description="Task ID to reset to"),
    seed: Optional[int] = Query(default=None, description="Optional seed for reproducibility")
) -> ResetResponse:
    """Reset the environment and return the first observation."""
    # Reject unknown task ids up front with a 404 before touching the env.
    if task_id not in TASKS:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Task '{task_id}' not found."
        )

    # Any failure while resetting (or validating the observation payload)
    # is surfaced as a 500 rather than an unhandled server error.
    try:
        first_obs = env.reset(task_id=task_id, seed=seed)
        payload = ResetResponse(observation=first_obs)
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"System breakdown during environment reset: {e}"
        )
    return payload
78
 
79
 
80
@app.post("/step", response_model=StepResult)
def step(action: CodeReviewAction) -> StepResult:
    """Submit a code review action and receive a reward signal."""
    # Delegate to the environment; wrap any failure in a 500 so the
    # client always receives a structured HTTP error.
    try:
        outcome = env.step(action)
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error executing agent action logic: {e}"
        )
    return outcome
90
 
91
 
92
@app.get("/state", response_model=StateResponse)
def state() -> StateResponse:
    """Return the current environment state."""
    # Snapshot the environment; convert unexpected failures into a 500.
    try:
        snapshot = env.state()
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error analyzing global runtime state tracking: {e}"
        )
    return snapshot
102
+
103
 
104
+ def main() -> None:
105
+ """Run the environment ASGI server natively."""
106
+ port_default = os.environ.get("PORT", "8000")
107
+ try:
108
+ port = int(port_default)
109
+ except ValueError:
110
+ port = 8000
111
 
 
 
 
112
  uvicorn.run(
113
  "server.app:app",
114
  host="0.0.0.0",
server/environment.py CHANGED
@@ -1,26 +1,46 @@
 
 
 
 
 
 
1
  import random
2
- from typing import Optional, Dict, Tuple
3
 
4
  from server.tasks import TASKS
5
  from server.grader import grade_action
6
- from server.models import CodeObservation, StepResult, StateResponse, Action, Observation
 
 
 
7
 
8
  class CodeSecurityEnv:
9
- def __init__(self):
10
- self.current_task: Optional[dict] = None
 
 
 
11
  self.step_count: int = 0
12
  self.done: bool = False
13
  self.total_reward: float = 0.0
14
  self._task_ids = list(TASKS.keys())
15
 
16
  def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> Observation:
 
 
 
 
 
 
 
 
 
17
  if seed is not None:
18
  random.seed(seed)
19
 
20
  if task_id and task_id in TASKS:
21
  self.current_task = TASKS[task_id]
22
  else:
23
- # Pick a task by its ID
24
  chosen_id = random.choice(self._task_ids)
25
  self.current_task = TASKS[chosen_id]
26
 
@@ -31,8 +51,15 @@ class CodeSecurityEnv:
31
  return self._make_observation()
32
 
33
  def step(self, action: Action) -> StepResult:
 
 
 
 
 
 
 
 
34
  if self.current_task is None:
35
- # Auto-reset if called before reset()
36
  self.reset()
37
 
38
  if self.done:
@@ -40,16 +67,17 @@ class CodeSecurityEnv:
40
  observation=self._make_observation(),
41
  reward=0.0,
42
  done=True,
43
- info={"error": "Episode already completed. Call /reset to start a new episode."},
44
  )
45
 
46
- # The action comes from the API as a Pydantic model (Action)
47
- # The grader expects a dict or the model itself.
48
- reward, breakdown = grade_action(action.model_dump(), self.current_task)
 
49
 
50
  self.step_count += 1
51
  self.total_reward += reward
52
- self.done = True # single-step environment β€” one action per episode
53
 
54
  return StepResult(
55
  observation=self._make_observation(),
@@ -63,7 +91,8 @@ class CodeSecurityEnv:
63
  )
64
 
65
  def state(self) -> StateResponse:
66
- current_id = self.current_task["id"] if self.current_task else ""
 
67
  return StateResponse(
68
  task_id=current_id,
69
  step=self.step_count,
@@ -72,7 +101,11 @@ class CodeSecurityEnv:
72
  )
73
 
74
  def _make_observation(self) -> Observation:
 
75
  t = self.current_task
 
 
 
76
  return Observation(
77
  task_id=t["id"],
78
  language=t["language"],
 
1
+ """Reinforcement Learning Environment Core.
2
+
3
+ Defines the environment logic, maintaining the current trajectory
4
+ state and mediating between incoming requests and the headless grader.
5
+ """
6
+
7
  import random
8
+ from typing import Optional, Dict, Any
9
 
10
  from server.tasks import TASKS
11
  from server.grader import grade_action
12
+ from server.models import StepResult, StateResponse, Action, Observation
13
+
14
+ ERROR_EPISODE_COMPLETED = "Episode already completed. Call /reset to start a new episode."
15
+
16
 
17
  class CodeSecurityEnv:
18
+ """Simulates the stateful progression of a software security assessment."""
19
+
20
def __init__(self) -> None:
    """Initialize a fresh environment instance."""
    # No task is active until reset() selects one.
    self.current_task: Optional[Dict[str, Any]] = None
    # Number of step() calls in the current episode.
    self.step_count: int = 0
    # Episode-terminal flag; set by step() (single-step environment).
    self.done: bool = False
    # Accumulated reward over the episode.
    self.total_reward: float = 0.0
    # Snapshot of available task ids, used for random selection on reset.
    self._task_ids = list(TASKS.keys())
27
 
28
  def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> Observation:
29
+ """Reset the environment safely to a new or targeted initial state.
30
+
31
+ Args:
32
+ task_id: Optionally force the environment to yield a specific task definition.
33
+ seed: Initialize standard random seed.
34
+
35
+ Returns:
36
+ An Observation baseline reflecting the new scenario context.
37
+ """
38
  if seed is not None:
39
  random.seed(seed)
40
 
41
  if task_id and task_id in TASKS:
42
  self.current_task = TASKS[task_id]
43
  else:
 
44
  chosen_id = random.choice(self._task_ids)
45
  self.current_task = TASKS[chosen_id]
46
 
 
51
  return self._make_observation()
52
 
53
  def step(self, action: Action) -> StepResult:
54
+ """Advance the environment state using a provided agent Action payload.
55
+
56
+ Args:
57
+ action: Evaluated metrics provided directly by agent decision matrices.
58
+
59
+ Returns:
60
+ A StepResult containing scalar reward metrics and end-of-episode flag.
61
+ """
62
  if self.current_task is None:
 
63
  self.reset()
64
 
65
  if self.done:
 
67
  observation=self._make_observation(),
68
  reward=0.0,
69
  done=True,
70
+ info={"error": ERROR_EPISODE_COMPLETED},
71
  )
72
 
73
+ try:
74
+ reward, breakdown = grade_action(action.model_dump(), self.current_task)
75
+ except Exception as e:
76
+ reward, breakdown = 0.0, {"error": f"Evaluation error: {e}"}
77
 
78
  self.step_count += 1
79
  self.total_reward += reward
80
+ self.done = True # single-step environment
81
 
82
  return StepResult(
83
  observation=self._make_observation(),
 
91
  )
92
 
93
  def state(self) -> StateResponse:
94
+ """Return global analytics tracking the current environment session state."""
95
+ current_id = self.current_task["id"] if getattr(self, "current_task", None) else ""
96
  return StateResponse(
97
  task_id=current_id,
98
  step=self.step_count,
 
101
  )
102
 
103
  def _make_observation(self) -> Observation:
104
+ """Construct the contextual parameters surrounding an ongoing assessment."""
105
  t = self.current_task
106
+ if not t:
107
+ raise KeyError("Attempted observation render without an initialized active task")
108
+
109
  return Observation(
110
  task_id=t["id"],
111
  language=t["language"],
server/grader.py CHANGED
@@ -1,82 +1,114 @@
1
- from typing import Tuple, Dict
2
 
 
 
 
3
 
4
- def grade_action(action: dict, task: dict) -> Tuple[float, Dict[str, float]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  reward = 0.0
6
  breakdown: Dict[str, float] = {}
7
 
8
- # ── Component 1: Bug identified (0.20) ──────────────────────────────────
9
- if action.get("bug_identified"):
10
- reward += 0.20
11
- breakdown["bug_identified"] = 0.20
12
- else:
13
- breakdown["bug_identified"] = 0.00
14
- # No bug found β†’ no partial credit for anything else
15
- return max(0.0, min(1.0, reward)), breakdown
 
 
 
 
 
 
 
 
 
 
16
 
17
- # ── Component 2: Bug type match (0.20) ──────────────────────────────────
18
- action_type = action.get("bug_type", "").lower().replace("-", " ").replace("_", " ")
19
- task_type = task["bug_type"].lower().replace("-", " ").replace("_", " ")
20
- if task_type in action_type or action_type in task_type:
21
- reward += 0.20
22
- breakdown["bug_type"] = 0.20
23
- else:
24
- breakdown["bug_type"] = 0.00
 
 
 
25
 
26
- # ── Component 3: Bug location (0.10) ────────────────────────────────────
27
- action_location = action.get("bug_location", "").lower()
28
- location_keywords = [w for w in task["bug_location"].lower().split() if len(w) > 3]
29
- if location_keywords:
30
- matched = sum(1 for kw in location_keywords if kw in action_location)
31
- loc_score = round(0.10 * (matched / len(location_keywords)), 4)
32
- else:
33
- loc_score = 0.0
34
- reward += loc_score
35
- breakdown["bug_location"] = loc_score
36
 
37
- # ── Component 4: Description quality (0.25) ──────────────────────────────
38
- description = action.get("bug_description", "").lower()
39
- desc_score = 0.0
40
- if len(description) >= 20:
41
- task_keywords = task["keywords"]
42
- matched_kw = [kw for kw in task_keywords if kw in description]
43
- # Full points if they hit at least 3 keywords
44
- desc_score = round(min(0.25, 0.25 * (len(matched_kw) / 3.0)), 4)
45
- breakdown["description_quality"] = desc_score
46
- reward += desc_score
47
 
48
- # ── Component 5: Fix quality (0.15) ──────────────────────────────────────
49
- fix = action.get("suggested_fix", "").lower()
50
- fix_score = 0.0
51
- if len(fix) >= 10:
52
- fix_patterns = task["fix_patterns"]
53
- matched_fix = [p for p in fix_patterns if p.lower() in fix]
54
- # Match any 1 pattern for full points
55
- fix_score = round(min(0.15, 0.15 * len(matched_fix)), 4)
56
- breakdown["fix_quality"] = fix_score
57
- reward += fix_score
 
 
58
 
59
- # ── Component 6: Severity (0.10) ─────────────────────────────────────────
60
- action_sev = action.get("severity", "").lower()
61
- task_sev = task["severity"].lower()
62
- if action_sev == task_sev:
63
- sev_score = 0.10
64
- elif action_sev in ("high", "critical") and task_sev in ("high", "critical"):
65
- sev_score = 0.05
66
- else:
67
- sev_score = 0.00
68
- breakdown["severity"] = sev_score
69
- reward += sev_score
70
 
71
- # ── Global Penalty: Keyword Stuffing ────────────────────────────────────
72
- description = action.get("bug_description", "").lower()
73
- words = description.split()
74
- unique_ratio = len(set(words)) / len(words) if words else 1.0
75
- if unique_ratio < 0.5:
76
- reward *= 0.2 # Heavy global penalty
77
- breakdown["stuffing_penalty_multiplier"] = 0.2
78
- for k in list(breakdown.keys()):
79
- if k != "stuffing_penalty_multiplier":
80
- breakdown[k] = round(breakdown[k] * 0.2, 4)
81
 
82
- return max(0.0, min(1.0, round(reward, 4))), breakdown
 
 
1
+ """Review Grader System.
2
 
3
+ Implements programmatic sub-scoring logic for evaluating agent
4
+ security actions against internal semantic criteria.
5
+ """
6
 
7
+ from typing import Tuple, Dict, Any
8
+
9
# Reward weights for each scoring component (sum to 1.0).
SCORE_BUG_IDENTIFIED = 0.20
SCORE_BUG_TYPE = 0.20
SCORE_BUG_LOCATION = 0.10
SCORE_DESC_QUALITY = 0.25
SCORE_FIX_QUALITY = 0.15
SCORE_SEV_EXACT = 0.10
SCORE_SEV_PARTIAL = 0.05

# Description keyword hits needed for full description credit.
KEYWORD_HIT_TARGET = 3.0
# Unique-word ratio below which the stuffing penalty applies.
PENALTY_THRESHOLD = 0.5
PENALTY_MULTIPLIER = 0.2


def grade_action(action: Dict[str, Any], task: Dict[str, Any]) -> Tuple[float, Dict[str, float]]:
    """Evaluate an action against the task definition.

    Args:
        action: The structured payload proposed by the AI agent.
        task: The dictionary blueprint detailing the expected vulnerability.

    Returns:
        A tuple of the normalized aggregate reward (clamped to [0, 1]) and
        the individual component breakdown.

    Raises:
        RuntimeError: If the task definition is missing a mandatory schema key.
    """
    reward = 0.0
    breakdown: Dict[str, float] = {}

    try:
        # ── Component 1: Bug identified (0.20) ──────────────────────────────
        if action.get("bug_identified"):
            reward += SCORE_BUG_IDENTIFIED
            breakdown["bug_identified"] = SCORE_BUG_IDENTIFIED
        else:
            breakdown["bug_identified"] = 0.00
            # No bug found → no partial credit for anything else.
            return max(0.0, min(1.0, reward)), breakdown

        # ── Component 2: Bug type match (0.20) ──────────────────────────────
        action_type = action.get("bug_type", "").lower().replace("-", " ").replace("_", " ")
        task_type = task["bug_type"].lower().replace("-", " ").replace("_", " ")
        # BUG FIX: an empty (or whitespace-only) bug_type is a substring of
        # every string, so the bare bidirectional containment check awarded
        # full type credit for an empty answer. Require a non-empty
        # normalized type before testing containment.
        if action_type.strip() and (task_type in action_type or action_type in task_type):
            reward += SCORE_BUG_TYPE
            breakdown["bug_type"] = SCORE_BUG_TYPE
        else:
            breakdown["bug_type"] = 0.00

        # ── Component 3: Bug location (0.10) ────────────────────────────────
        # Partial credit: fraction of significant (>3 char) location keywords
        # that appear in the agent's stated location.
        action_location = action.get("bug_location", "").lower()
        location_keywords = [w for w in task["bug_location"].lower().split() if len(w) > 3]
        if location_keywords:
            matched = sum(1 for kw in location_keywords if kw in action_location)
            loc_score = round(SCORE_BUG_LOCATION * (matched / len(location_keywords)), 4)
        else:
            loc_score = 0.0

        reward += loc_score
        breakdown["bug_location"] = loc_score

        # ── Component 4: Description quality (0.25) ─────────────────────────
        description = action.get("bug_description", "").lower()
        desc_score = 0.0
        if len(description) >= 20:  # too-short descriptions earn nothing
            matched_kw = [kw for kw in task["keywords"] if kw in description]
            # Full points once KEYWORD_HIT_TARGET keywords are matched.
            desc_score = round(min(SCORE_DESC_QUALITY, SCORE_DESC_QUALITY * (len(matched_kw) / KEYWORD_HIT_TARGET)), 4)

        breakdown["description_quality"] = desc_score
        reward += desc_score

        # ── Component 5: Fix quality (0.15) ─────────────────────────────────
        fix = action.get("suggested_fix", "").lower()
        fix_score = 0.0
        if len(fix) >= 10:
            matched_fix = [p for p in task["fix_patterns"] if p.lower() in fix]
            # Matching any single pattern earns full fix credit.
            fix_score = round(min(SCORE_FIX_QUALITY, SCORE_FIX_QUALITY * len(matched_fix)), 4)

        breakdown["fix_quality"] = fix_score
        reward += fix_score

        # ── Component 6: Severity (0.10) ────────────────────────────────────
        action_sev = action.get("severity", "").lower()
        task_sev = task["severity"].lower()
        if action_sev == task_sev:
            sev_score = SCORE_SEV_EXACT
        elif action_sev in ("high", "critical") and task_sev in ("high", "critical"):
            # Half credit for landing in the right high-severity band.
            sev_score = SCORE_SEV_PARTIAL
        else:
            sev_score = 0.00

        breakdown["severity"] = sev_score
        reward += sev_score

        # ── Global Penalty: Keyword Stuffing ────────────────────────────────
        # A low unique-word ratio in the description indicates keyword
        # repetition to game Component 4; scale everything down.
        words = description.split()
        unique_ratio = len(set(words)) / len(words) if words else 1.0
        if unique_ratio < PENALTY_THRESHOLD:
            reward *= PENALTY_MULTIPLIER
            breakdown["stuffing_penalty_multiplier"] = PENALTY_MULTIPLIER
            for k in list(breakdown.keys()):
                if k != "stuffing_penalty_multiplier":
                    breakdown[k] = round(breakdown[k] * PENALTY_MULTIPLIER, 4)

        return max(0.0, min(1.0, round(reward, 4))), breakdown

    except KeyError as exc:
        raise RuntimeError(f"Missing mandatory schema key in task definition: {exc}") from exc
server/models.py CHANGED
@@ -1,11 +1,13 @@
1
- from pydantic import BaseModel, Field
2
- from typing import Optional, Any, Dict
3
 
 
 
4
 
5
  # ── Agent Action ──────────────────────────────────────────────────────────────
6
 
7
  class CodeReviewAction(BaseModel):
8
  """Action taken by the agent: a structured code review."""
 
9
  bug_identified: bool = Field(..., description="Whether a bug was found")
10
  bug_location: str = Field(..., description="Location of the bug (function, line, variable)")
11
  bug_type: str = Field(..., description="Type: off-by-one | logic-error | security-vulnerability | none")
@@ -13,11 +15,11 @@ class CodeReviewAction(BaseModel):
13
  severity: str = Field(..., description="Severity: none | low | medium | high | critical")
14
  suggested_fix: str = Field(..., description="The corrected code or a description of how to fix it")
15
 
16
-
17
  # ── Observation ───────────────────────────────────────────────────────────────
18
 
19
  class CodeObservation(BaseModel):
20
  """What the agent sees at each step."""
 
21
  task_id: str = Field(..., description="Unique task identifier")
22
  language: str = Field(..., description="Programming language")
23
  difficulty: str = Field(..., description="Level: easy | medium | hard")
@@ -26,38 +28,40 @@ class CodeObservation(BaseModel):
26
  pr_title: str = Field(..., description="Pull request title submitted by developer")
27
  file_path: str = Field(..., description="File path of the code in the repository")
28
 
29
-
30
  # ── Step Result ───────────────────────────────────────────────────────────────
31
 
32
  class StepResult(BaseModel):
33
  """Result returned from env.step()."""
34
- observation: Optional[CodeObservation] = None
35
- reward: float
36
- done: bool
37
- info: Dict[str, Any]
38
-
39
 
40
  # ── State ─────────────────────────────────────────────────────────────────────
41
 
42
  class StateResponse(BaseModel):
43
  """Internal environment state exposed via /state."""
44
- task_id: str
45
- step: int
46
- done: bool
47
- total_reward: float
48
-
49
 
50
  # ── API Helpers ───────────────────────────────────────────────────────────────
51
 
52
  class ResetResponse(BaseModel):
53
- observation: CodeObservation
54
-
 
55
 
56
  class TaskInfo(BaseModel):
57
- id: str
58
- language: str
59
- bug_class: str
60
- difficulty: str
 
 
61
 
62
  Action = CodeReviewAction
63
  Observation = CodeObservation
 
1
+ """Pydantic v2 models representing actions, observations, and state payloads."""
 
2
 
3
+ from typing import Optional, Any, Dict
4
+ from pydantic import BaseModel, Field
5
 
6
  # ── Agent Action ──────────────────────────────────────────────────────────────
7
 
8
  class CodeReviewAction(BaseModel):
9
  """Action taken by the agent: a structured code review."""
10
+
11
  bug_identified: bool = Field(..., description="Whether a bug was found")
12
  bug_location: str = Field(..., description="Location of the bug (function, line, variable)")
13
  bug_type: str = Field(..., description="Type: off-by-one | logic-error | security-vulnerability | none")
 
15
  severity: str = Field(..., description="Severity: none | low | medium | high | critical")
16
  suggested_fix: str = Field(..., description="The corrected code or a description of how to fix it")
17
 
 
18
  # ── Observation ───────────────────────────────────────────────────────────────
19
 
20
  class CodeObservation(BaseModel):
21
  """What the agent sees at each step."""
22
+
23
  task_id: str = Field(..., description="Unique task identifier")
24
  language: str = Field(..., description="Programming language")
25
  difficulty: str = Field(..., description="Level: easy | medium | hard")
 
28
  pr_title: str = Field(..., description="Pull request title submitted by developer")
29
  file_path: str = Field(..., description="File path of the code in the repository")
30
 
 
31
  # ── Step Result ───────────────────────────────────────────────────────────────
32
 
33
class StepResult(BaseModel):
    """Result returned from env.step()."""

    # None when no observation accompanies the result (see description).
    observation: Optional[CodeObservation] = Field(None, description="Observation if not terminal")
    reward: float = Field(..., description="Reward generated for the preceding action")
    done: bool = Field(..., description="Terminal state flag")
    # Defaults to an empty dict; carries auxiliary data such as error notes.
    info: Dict[str, Any] = Field(default_factory=dict, description="Metadata dictionary")
40
 
41
  # ── State ─────────────────────────────────────────────────────────────────────
42
 
43
class StateResponse(BaseModel):
    """Internal environment state exposed via /state."""

    task_id: str = Field(..., description="Current running task")
    step: int = Field(..., description="Current evaluation step")
    done: bool = Field(..., description="Whether the episode resides in a terminal state")
    total_reward: float = Field(..., description="Sum of step rewards over the episode")
50
 
51
  # ── API Helpers ───────────────────────────────────────────────────────────────
52
 
53
class ResetResponse(BaseModel):
    """Response wrapper returned strictly on environment resets."""

    observation: CodeObservation = Field(..., description="Initial environment observation upon reset")
57
 
58
class TaskInfo(BaseModel):
    """Metadata regarding an available task scenario."""

    id: str = Field(..., description="Task UUID or unique string identifier")
    language: str = Field(..., description="Source code language for the flaw context")
    bug_class: str = Field(..., description="The classification parameter of the embedded bug")
    difficulty: str = Field(..., description="The difficulty tier indicator (e.g. easy, medium)")
65
 
66
  Action = CodeReviewAction
67
  Observation = CodeObservation
server/tasks.py CHANGED
@@ -1,7 +1,13 @@
1
- # OpenEnv Tasks for Code Security Review
2
- # These tasks are designed to test AI agents' ability to identify common security vulnerabilities.
3
 
4
- TASKS = {
 
 
 
 
 
 
 
5
  "python-off-by-one": {
6
  "id": "python-off-by-one",
7
  "name": "Python Off-by-One Error",
 
1
+ """OpenEnv Tasks for Code Security Review.
 
2
 
3
+ These task specifications are designed to rigorously test autonomous AI
4
+ agents' abilities to identify, classify, and mitigate common software
5
+ security vulnerabilities across distinct language paradigms.
6
+ """
7
+
8
+ from typing import Dict, Any
9
+
10
+ TASKS: Dict[str, Any] = {
11
  "python-off-by-one": {
12
  "id": "python-off-by-one",
13
  "name": "Python Off-by-One Error",