File size: 7,489 Bytes
d1cfa81
0bbb422
d1cfa81
319df19
 
 
0bbb422
 
62b2af2
319df19
 
 
 
 
 
0bbb422
319df19
 
 
 
 
 
 
 
 
 
 
0bbb422
 
 
319df19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1cfa81
0bbb422
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319df19
 
 
 
 
 
 
 
 
 
 
 
0bbb422
319df19
0bbb422
319df19
 
 
 
 
 
 
 
 
 
 
 
 
0bbb422
319df19
0bbb422
319df19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bbb422
319df19
0bbb422
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319df19
 
 
 
 
0bbb422
319df19
 
 
 
 
 
 
 
 
 
0bbb422
 
 
 
 
319df19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e708130
319df19
 
 
 
0bbb422
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Manifest header: identity, summary and licensing of the environment.
name: 'code-review-env'
# Quoted so the version is always read as a string.
version: '1.1.0'
# Folded scalar: the line breaks below are joined with single spaces.
description: >
  An OpenEnv benchmark where an AI agent reviews buggy Python code
  and learns to identify security vulnerabilities, logic errors, and
  code smells using a fixed taxonomy of issue tags.  Simulates the
  real-world software engineering task of pull-request review with
  deterministic, multi-dimensional grading and an iterative refinement
  mechanic for multi-step learning.
author: 'Dolphin-Syndrom'
license: 'BSD-3-Clause'

spec:
  # Fields exposed to the agent in each observation.
  observation_space:
    task_id:
      type: string
      description: Current task identifier (task_extra_easy, task_easy, task_medium, task_hard, task_expert)
    file_name:
      type: string
      description: File name associated with the code snippet under review
    task_description:
      type: string
      description: Instructions describing what the agent should review and return
    code_snippet:
      type: string
      description: Python code snippet containing planted issues for review
    feedback:
      type: string
      description: >
        Grading feedback including score, found/missed counts, category hints
        for iterative refinement, and severity assessment guidance
    step_number:
      type: integer
      description: Current step number within the episode (starts at 0 after reset)
    available_issue_tags:
      type: array
      # NOTE(review): unlike action_space.review.issues_found, no `items`
      # type is declared for this array; confirm whether consumers need one.
      # The separator after "issues_found" is an em dash (U+2014); it had
      # been stored as mis-decoded bytes.
      description: >
        Allowed issue tags the agent can use in issues_found —
        null_pointer, missing_return, type_error, index_out_of_bounds,
        sql_injection, hardcoded_secret, missing_input_validation,
        race_condition, timing_attack, improper_error_handling,
        integer_overflow, path_traversal

  # Shape of the action the agent submits: a single `review` object.
  action_space:
    review:
      type: object
      properties:
        # Free-text explanation accompanying the tags.
        review_comment:
          type: string
          description: >-
            Human-readable review explaining identified issues
            and suggested fixes
        # Issue tags reported by the agent, one string per tag.
        issues_found:
          type: array
          items:
            type: string
          description: 'List of issue tags found by the agent, chosen from ISSUE_TAXONOMY'
        # One overall severity for the whole review.
        severity:
          type: string
          enum:
            - low
            - medium
            - high
            - critical
          description: 'Overall severity level assessed by the agent'
      # All three properties must be present in every action.
      required:
        - review_comment
        - issues_found
        - severity

  # Lower and upper bound of the reward, in that order.
  reward_range:
    - 0.0
    - 1.0
  # Step limit; presumably per episode (confirm against the environment).
  max_steps: 3

tasks:
  # Warm-up task with a single planted issue.
  task_extra_easy:
    # Separator is an em dash (U+2014); it had been stored as mojibake.
    name: 'Extra Easy — Index Out of Bounds'
    description: >
      Review a simple data utility for an off-by-one index error.
      Single planted issue for agent warm-up.
    difficulty: extra_easy
    # Tags that make up the `planted` set in the scoring formula below.
    planted_issues: [index_out_of_bounds]
    grader:
      type: deterministic
      # Folded scalar, so the formula is read as one line of text.
      # Subtractions use the minus sign (U+2212), restored from mojibake.
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  # Two planted issues in a user-service function.
  task_easy:
    # Separator is an em dash (U+2014); it had been stored as mojibake.
    name: 'Easy — Null Pointer & Missing Return'
    description: >
      Review a simple user-service function for a null-pointer dereference
      and a missing return statement.
    difficulty: easy
    # Tags that make up the `planted` set in the scoring formula below.
    planted_issues: [null_pointer, missing_return]
    grader:
      type: deterministic
      # Folded scalar, so the formula is read as one line of text.
      # Subtractions use the minus sign (U+2212), restored from mojibake.
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  # Two planted security issues in an authentication module.
  task_medium:
    # Separator is an em dash (U+2014); it had been stored as mojibake.
    name: 'Medium — SQL Injection & Hardcoded Secret'
    description: >
      Review an authentication module for SQL injection via f-string
      interpolation and a hardcoded secret key.
    difficulty: medium
    # Tags that make up the `planted` set in the scoring formula below.
    planted_issues: [sql_injection, hardcoded_secret]
    grader:
      type: deterministic
      # Folded scalar, so the formula is read as one line of text.
      # Subtractions use the minus sign (U+2212), restored from mojibake.
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  # Three planted issues in a payment-processing function.
  task_hard:
    # Separator is an em dash (U+2014); it had been stored as mojibake.
    name: 'Hard — Race Condition, Error Handling & Timing Attack'
    description: >
      Review a payment-processing function for a non-atomic
      balance check-and-decrement (race condition), a bare except that
      silently swallows payment errors, and a non-constant-time
      token comparison (timing attack).
    difficulty: hard
    # Tags that make up the `planted` set in the scoring formula below.
    planted_issues: [race_condition, improper_error_handling, timing_attack]
    grader:
      type: deterministic
      # Folded scalar, so the formula is read as one line of text.
      # Subtractions use the minus sign (U+2212), restored from mojibake.
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  # Four planted issues in a file-processing pipeline.
  task_expert:
    # Separator is an em dash (U+2014); it had been stored as mojibake.
    name: 'Expert — Path Traversal, Overflow, Input Validation & Type Error'
    description: >
      Review a file-processing pipeline for path traversal via unsanitized
      user input, integer overflow in size arithmetic, missing input
      validation on uploaded content, and a type error from unchecked
      string-to-int conversion.
    difficulty: expert
    # Tags that make up the `planted` set in the scoring formula below.
    planted_issues: [path_traversal, integer_overflow, missing_input_validation, type_error]
    grader:
      type: deterministic
      # Folded scalar, so the formula is read as one line of text.
      # Subtractions use the minus sign (U+2212), restored from mojibake.
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

# Human-readable description of how rewards are composed.
reward_function:
  summary:
    - 'Dense rewards are provided per step so agents receive signal across the full trajectory.'
    - 'Final task scores are deterministic and normalized to 0.0–1.0 by the graders.'
    - 'Iterative refinement feedback enables agents to improve across steps within an episode.'
  components:
    # Fraction of planted issues that were found; carries no fixed `value`.
    recall_reward:
      description: >
        Fractional reward proportional to
        |correctly found issues| / |planted issues|.
        This is the primary learning signal encouraging
        comprehensive detection.
    # Additive bonus tied to keywords in the review comment.
    quality_bonus:
      value: +0.05
      description: >
        Per correctly-found issue whose associated keywords appear
        in the agent's free-text review_comment
        (e.g. "sql" for sql_injection).
    # Additive bonus for a matching severity assessment.
    severity_bonus:
      value: +0.05
      description: >
        Awarded when the agent's severity assessment matches the
        expected level for the task's difficulty
        (e.g. "critical" for hard tasks).
    # Negative value: deducted for each false-positive tag.
    precision_penalty:
      value: -0.10
      description: >
        Per false-positive issue tag submitted.
        Discourages hallucinated or overly aggressive flagging.

# Server settings and the endpoints it lists.
server:
  # Quoted so the address is unambiguously a string.
  host: '0.0.0.0'
  port: 8000
  # Written as `module.path:attribute`.
  entrypoint: 'server.app:app'
  # One entry per endpoint: method, two spaces, then path.
  endpoints:
    - 'GET  /health'
    - 'GET  /tasks'
    - 'POST /reset'
    - 'POST /step'
    - 'GET  /state'
    - 'POST /grader'
    - 'POST /baseline'
    - 'GET  /ws'

# Python version constraint and package requirements.
dependencies:
  python: '>=3.10'
  # Requirement specifiers, quoted so brackets and comparison operators
  # are always read as plain text.
  packages:
    - 'openenv-core[core]>=0.2.2'
    - 'openai>=1.0'
    - 'httpx>=0.24.0'
    - 'plotly>=6.6.0'
    - 'pandas>=2.3.3'
    - 'gradio>=4.0'
    - 'pydantic>=2.0.0'
    - 'uvicorn>=0.24.0'
    - 'fastapi>=0.104.0'

# Self-reported validation status for this environment.
# NOTE(review): nothing in this file shows how these values are produced;
# presumably they are recorded by hand or by a release check. Confirm they
# are kept in sync.
validation:
  openenv_spec: true
  docker_build: true
  baseline_reproducible: true
  # Matches the five entries declared under `tasks`.
  tasks_count: 5
  # Presumably the test count at the time of writing; verify it.
  tests_passing: 32