---
name: code-review-env
version: "1.0.0"
description: >
  An OpenEnv-compliant AI training environment that simulates professional
  Python code review. Agents learn to identify bugs, security vulnerabilities,
  performance issues, style problems, and documentation gaps across three
  progressively harder tasks.

tags:
  - openenv
  - code-review
  - python
  - security
  - software-engineering

author: imaginephoenix / rawgenn.tech
license: MIT

environment:
  class: CodeReviewEnv
  module: env.environment
  entrypoint: app.py
  framework: fastapi

observation_space:
  type: object
  description: >
    What the agent sees each step. Contains the code snippet to review,
    task instructions, all previously submitted comments, and optional
    feedback from the last step.
  fields:
    task_id:
      type: string
      description: Identifier of the active task
    step:
      type: integer
      description: Current step number (0-indexed)
    snippet:
      type: object
      description: Python source code to review
      fields:
        file_name: { type: string }
        source: { type: string, description: "Full Python source with line numbers" }
        language: { type: string, const: "python" }
    instructions:
      type: string
      description: Review instructions and scope for this task
    previous_comments:
      type: array
      description: All review comments submitted in prior steps
    feedback:
      type: string
      nullable: true
      description: Environment feedback on the most recent action
    done:
      type: boolean

action_space:
  type: object
  description: >
    What the agent submits. A list of review comments (each with line,
    category, severity, message, optional suggestion) plus an optional
    overall summary and a submit flag.
  fields:
    comments:
      type: array
      items:
        type: object
        fields:
          line: { type: integer, nullable: true, description: "1-indexed line number" }
          category:
            type: string
            enum: [bug, security, performance, style, documentation]
          severity:
            type: string
            enum: [low, medium, high, critical]
          message: { type: string, minLength: 5, maxLength: 500 }
          suggestion: { type: string, nullable: true, maxLength: 500 }
    summary:
      type: string
      nullable: true
      description: "Required for task_3_hard; optional otherwise"
    submit:
      type: boolean
      description: "Set true to finalise the review and trigger the grader"

reward:
  type: float
  range: [-1.0, 1.0]
  description: >
    Shaped reward with partial progress signals. Incremental positive reward
    for each new valid comment added (proportional to issue severity). On
    submit: final grader score mapped to [-0.2, 1.0]. Penalties for false
    positives, missed criticals, and spamming low-quality comments.

tasks:
  - id: task_1_easy
    title: "Bug Detection & Style Review"
    difficulty: easy
    categories: [bug, style]
    max_steps: 5
    passing_threshold: 0.55
    description: >
      Review calculator.py (31 lines) for division-by-zero bugs, off-by-one
      errors, empty-collection crashes, and Python style anti-patterns.

  - id: task_2_medium
    title: "Security & Performance Audit"
    difficulty: medium
    categories: [security, performance]
    max_steps: 7
    passing_threshold: 0.60
    description: >
      Audit user_service.py (55 lines) for SQL injection, broken MD5 password
      hashing, unbounded DB queries, and connection churn. Missed critical
      security issues carry heavy penalties.

  - id: task_3_hard
    title: "Comprehensive Code Review"
    difficulty: hard
    categories: [bug, security, performance, style, documentation]
    max_steps: 10
    passing_threshold: 0.65
    description: >
      Full production-grade review of data_pipeline.py (49 lines). Covers
      all five categories including shell injection, unsafe pickle
      deserialization, ZeroDivisionError, and missing docstrings. An overall
      written summary is required.

api_endpoints:
  - path: /reset
    method: POST
    description: Start or restart an episode
  - path: /step
    method: POST
    description: Submit an action
  - path: /state
    method: GET
    description: Get full serialisable state
  - path: /tasks
    method: GET
    description: List all available tasks
  - path: /health
    method: GET
    description: Health check

baseline:
  model: gpt-4o
  script: baseline_agent.py
  expected_scores:
    task_1_easy: "~0.75"
    task_2_medium: "~0.65"
    task_3_hard: "~0.55"

docker:
  base_image: python:3.11-slim
  port: 7860
  build: docker build -t code-review-env .
  run: docker run -p 7860:7860 code-review-env

huggingface:
  space_sdk: docker
  tags: [openenv, code-review, ai-agent, evaluation]