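# NOTE(review): the four lines below look like file-viewer page residue, not
# configuration. They are kept as comments so the document parses as YAML.
# Spaces:
# Sleeping
# Sleeping
# File size: 7,489 Bytes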
---
# Environment metadata.
# The commit-hash / line-number gutter that was fused onto the `name:` line
# was not YAML and has been removed.
name: code-review-env
version: 1.1.0
# Folded scalar (`>`): the indented lines are joined with single spaces.
description: >
  An OpenEnv benchmark where an AI agent reviews buggy Python code and learns
  to identify security vulnerabilities, logic errors, and code smells using a
  fixed taxonomy of issue tags. Simulates the real-world software engineering
  task of pull-request review with deterministic, multi-dimensional grading
  and an iterative refinement mechanic for multi-step learning.
author: Dolphin-Syndrom
license: BSD-3-Clause
# Observation / action schema for the environment.
# NOTE(review): nesting was reconstructed from a copy that had lost all
# indentation. In particular, confirm that `reward_range` and `max_steps`
# belong under `spec` rather than at the top level.
spec:
  # Fields returned to the agent on every observation.
  observation_space:
    task_id:
      type: string
      description: Current task identifier (task_extra_easy, task_easy, task_medium, task_hard, task_expert)
    file_name:
      type: string
      description: File name associated with the code snippet under review
    task_description:
      type: string
      description: Instructions describing what the agent should review and return
    code_snippet:
      type: string
      description: Python code snippet containing planted issues for review
    feedback:
      type: string
      description: >
        Grading feedback including score, found/missed counts, category hints
        for iterative refinement, and severity assessment guidance
    step_number:
      type: integer
      description: Current step number within the episode (starts at 0 after reset)
    available_issue_tags:
      type: array
      description: >
        Allowed issue tags the agent can use in issues_found —
        null_pointer, missing_return, type_error, index_out_of_bounds,
        sql_injection, hardcoded_secret, missing_input_validation,
        race_condition, timing_attack, improper_error_handling,
        integer_overflow, path_traversal

  # The single action the agent can submit: a structured review.
  action_space:
    review:
      type: object
      properties:
        review_comment:
          type: string
          description: Human-readable review explaining identified issues and suggested fixes
        issues_found:
          type: array
          items:
            type: string
          description: List of issue tags found by the agent, chosen from ISSUE_TAXONOMY
        severity:
          type: string
          enum: [low, medium, high, critical]
          description: Overall severity level assessed by the agent
      # All three properties must be present in every review action.
      required: [review_comment, issues_found, severity]

  reward_range: [0.0, 1.0]
  max_steps: 3
# Task catalogue: five tasks of increasing difficulty.
# Every `planted_issues` tag appears in the issue-tag list documented under
# `available_issue_tags`, and every task carries the same deterministic
# scoring text.
# NOTE(review): mis-decoded characters were restored as "—" in task names and
# "∩" / "−" in the scoring formulas — confirm against the upstream file.
tasks:
  task_extra_easy:
    name: Extra Easy — Index Out of Bounds
    description: >
      Review a simple data utility for an off-by-one index error.
      Single planted issue for agent warm-up.
    difficulty: extra_easy
    planted_issues: [index_out_of_bounds]
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  task_easy:
    name: Easy — Null Pointer & Missing Return
    description: >
      Review a simple user-service function for a null-pointer dereference
      and a missing return statement.
    difficulty: easy
    planted_issues: [null_pointer, missing_return]
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  task_medium:
    name: Medium — SQL Injection & Hardcoded Secret
    description: >
      Review an authentication module for SQL injection via f-string
      interpolation and a hardcoded secret key.
    difficulty: medium
    planted_issues: [sql_injection, hardcoded_secret]
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  task_hard:
    name: Hard — Race Condition, Error Handling & Timing Attack
    description: >
      Review a payment-processing function for a non-atomic
      balance check-and-decrement (race condition), a bare except that
      silently swallows payment errors, and a non-constant-time
      token comparison (timing attack).
    difficulty: hard
    planted_issues: [race_condition, improper_error_handling, timing_attack]
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  task_expert:
    name: Expert — Path Traversal, Overflow, Input Validation & Type Error
    description: >
      Review a file-processing pipeline for path traversal via unsanitized
      user input, integer overflow in size arithmetic, missing input
      validation on uploaded content, and a type error from unchecked
      string-to-int conversion.
    difficulty: expert
    planted_issues: [path_traversal, integer_overflow, missing_input_validation, type_error]
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)
# Human-readable description of how rewards are composed.
# NOTE(review): nesting reconstructed from a flattened copy; the en dash in
# "0.0–1.0" replaces a mis-decoded character — confirm against upstream.
reward_function:
  summary:
    - Dense rewards are provided per step so agents receive signal across the full trajectory.
    - Final task scores are deterministic and normalized to 0.0–1.0 by the graders.
    - Iterative refinement feedback enables agents to improve across steps within an episode.
  components:
    recall_reward:
      description: >
        Fractional reward proportional to |correctly found issues| / |planted issues|.
        This is the primary learning signal encouraging comprehensive detection.
    quality_bonus:
      # Unquoted, so YAML loads this as the float 0.05.
      value: +0.05
      description: >
        Per correctly-found issue whose associated keywords appear in the
        agent's free-text review_comment (e.g. "sql" for sql_injection).
    severity_bonus:
      value: +0.05
      description: >
        Awarded when the agent's severity assessment matches the expected
        level for the task's difficulty (e.g. "critical" for hard tasks).
    precision_penalty:
      # Negative float; applied once per false-positive tag (see description).
      value: -0.10
      description: >
        Per false-positive issue tag submitted. Discourages hallucinated
        or overly aggressive flagging.
# Server settings and the routes it lists.
# NOTE(review): nesting reconstructed from a flattened copy — confirm that
# `endpoints` is a child of `server`.
server:
  host: 0.0.0.0
  port: 8000
  # Written as module:attribute; presumably an ASGI app reference — confirm.
  entrypoint: server.app:app
  # Each entry is "<HTTP method> <path>".
  endpoints:
    - GET /health
    - GET /tasks
    - POST /reset
    - POST /step
    - GET /state
    - POST /grader
    - POST /baseline
    - GET /ws
# Runtime requirements.
dependencies:
  # Quoted so the leading ">" is not read as a folded-scalar indicator.
  python: ">=3.10"
  # Package requirement strings with minimum versions.
  packages:
    - openenv-core[core]>=0.2.2
    - openai>=1.0
    - httpx>=0.24.0
    - plotly>=6.6.0
    - pandas>=2.3.3
    - gradio>=4.0
    - pydantic>=2.0.0
    - uvicorn>=0.24.0
    - fastapi>=0.104.0
# Self-reported validation status for this environment.
# The lone "|" that followed this section was file-viewer residue (and a
# dangling block-scalar indicator), so it has been removed.
validation:
  openenv_spec: true
  docker_build: true
  baseline_reproducible: true
  # Matches the five entries defined under `tasks`.
  tasks_count: 5
  tests_passing: 32