# Hugging Face Space: Dolphin-Syndrom / code-review-env (status: Sleeping)
# File size: 7,489 Bytes

# Identity and packaging metadata for the environment.
name: 'code-review-env'
# Quoted so the version always loads as a string.
version: '1.1.0'
# Folded scalar: the line breaks below collapse to single spaces when loaded.
description: >
  An OpenEnv benchmark where an AI agent reviews buggy Python code and learns
  to identify security vulnerabilities, logic errors, and code smells using a
  fixed taxonomy of issue tags.  Simulates the real-world software engineering
  task of pull-request review with deterministic, multi-dimensional grading
  and an iterative refinement mechanic for multi-step learning.
author: 'Dolphin-Syndrom'
license: 'BSD-3-Clause'

# Interface contract between the environment and the reviewing agent.
spec:
  # Fields of each observation handed to the agent.
  observation_space:
    task_id:
      type: string
      description: >-
        Current task identifier (task_extra_easy, task_easy, task_medium,
        task_hard, task_expert)
    file_name:
      type: string
      description: File name associated with the code snippet under review
    task_description:
      type: string
      description: Instructions describing what the agent should review and return
    code_snippet:
      type: string
      description: Python code snippet containing planted issues for review
    feedback:
      type: string
      description: >
        Grading feedback including score, found/missed counts, category hints
        for iterative refinement, and severity assessment guidance
    step_number:
      type: integer
      description: Current step number within the episode (starts at 0 after reset)
    available_issue_tags:
      type: array
      # Element type spelled out so this array is declared the same way as
      # action_space.review.issues_found.
      items:
        type: string
      description: >
        Allowed issue tags the agent can use in issues_found —
        null_pointer, missing_return, type_error, index_out_of_bounds,
        sql_injection, hardcoded_secret, missing_input_validation,
        race_condition, timing_attack, improper_error_handling,
        integer_overflow, path_traversal

  # Shape of the single `review` action the agent submits.
  action_space:
    review:
      type: object
      properties:
        review_comment:
          type: string
          description: >-
            Human-readable review explaining identified issues and suggested
            fixes
        issues_found:
          type: array
          items:
            type: string
          # NOTE(review): ISSUE_TAXONOMY is not defined in this manifest;
          # presumably it is the tag list enumerated under
          # observation_space.available_issue_tags — confirm.
          description: List of issue tags found by the agent, chosen from ISSUE_TAXONOMY
        severity:
          type: string
          enum: [low, medium, high, critical]
          description: Overall severity level assessed by the agent
      # All three properties must be present in every action.
      required: [review_comment, issues_found, severity]

  # Reward bounds; the task graders clamp their scores to this same interval.
  reward_range: [0.0, 1.0]
  # Step limit for an episode.
  max_steps: 3

# Benchmark tasks, ordered from easiest to hardest.
# Every task uses the same deterministic scoring formula.
tasks:
  # Warm-up task with a single planted issue.
  task_extra_easy:
    name: 'Extra Easy — Index Out of Bounds'
    description: >
      Review a simple data utility for an off-by-one index error.
      Single planted issue for agent warm-up.
    difficulty: extra_easy
    planted_issues:
      - index_out_of_bounds
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  # Two planted issues in a user-service function.
  task_easy:
    name: 'Easy — Null Pointer & Missing Return'
    description: >
      Review a simple user-service function for a null-pointer dereference
      and a missing return statement.
    difficulty: easy
    planted_issues:
      - null_pointer
      - missing_return
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  # Two planted security issues in an authentication module.
  task_medium:
    name: 'Medium — SQL Injection & Hardcoded Secret'
    description: >
      Review an authentication module for SQL injection via f-string
      interpolation and a hardcoded secret key.
    difficulty: medium
    planted_issues:
      - sql_injection
      - hardcoded_secret
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  # Three planted issues in a payment-processing function.
  task_hard:
    name: 'Hard — Race Condition, Error Handling & Timing Attack'
    description: >
      Review a payment-processing function for a non-atomic
      balance check-and-decrement (race condition), a bare except that
      silently swallows payment errors, and a non-constant-time
      token comparison (timing attack).
    difficulty: hard
    planted_issues:
      - race_condition
      - improper_error_handling
      - timing_attack
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

  # Four planted issues in a file-processing pipeline.
  task_expert:
    name: 'Expert — Path Traversal, Overflow, Input Validation & Type Error'
    description: >
      Review a file-processing pipeline for path traversal via unsanitized
      user input, integer overflow in size arithmetic, missing input
      validation on uploaded content, and a type error from unchecked
      string-to-int conversion.
    difficulty: expert
    planted_issues:
      - path_traversal
      - integer_overflow
      - missing_input_validation
      - type_error
    grader:
      type: deterministic
      scoring: >
        base = |correct ∩ planted| / |planted|;
        bonus = +0.05 per correct issue with keyword match in comment;
        severity_bonus = +0.05 if severity matches expected level;
        penalty = −0.1 per false-positive;
        score = clamp(base + bonuses − penalty, 0.0, 1.0)

# Prose breakdown of the reward; the numeric values agree with the scoring
# formula repeated under each task's grader.
reward_function:
  summary:
    - 'Dense rewards are provided per step so agents receive signal across the full trajectory.'
    - 'Final task scores are deterministic and normalized to 0.0–1.0 by the graders.'
    - 'Iterative refinement feedback enables agents to improve across steps within an episode.'
  components:
    # Base term: fraction of planted issues that the agent tagged.
    recall_reward:
      description: >
        Fractional reward proportional to |correctly found issues| / |planted issues|.
        This is the primary learning signal encouraging comprehensive detection.
    # Additive bonus, applied once per correctly-found issue.
    quality_bonus:
      value: +0.05
      description: >
        Per correctly-found issue whose associated keywords appear in the
        agent's free-text review_comment (e.g. "sql" for sql_injection).
    # Additive bonus for a matching severity assessment.
    severity_bonus:
      value: +0.05
      description: >
        Awarded when the agent's severity assessment matches the expected
        level for the task's difficulty (e.g. "critical" for hard tasks).
    # Deduction, applied once per false-positive tag.
    precision_penalty:
      value: -0.10
      description: >
        Per false-positive issue tag submitted. Discourages hallucinated
        or overly aggressive flagging.

# HTTP server settings and the routes it exposes.
server:
  # 0.0.0.0 is the IPv4 wildcard address.
  host: '0.0.0.0'
  port: 8000
  entrypoint: 'server.app:app'
  # Entries are "<METHOD> <path>" strings; the double space after GET is part
  # of the original values and is kept verbatim.
  endpoints:
    - 'GET  /health'
    - 'GET  /tasks'
    - 'POST /reset'
    - 'POST /step'
    - 'GET  /state'
    - 'POST /grader'
    - 'POST /baseline'
    - 'GET  /ws'

# Runtime requirements, written as version specifiers.
dependencies:
  # Must stay quoted: a bare 3.10 would load as the float 3.1.
  python: '>=3.10'
  # Quoted so the brackets and comparison operators are plainly string data.
  packages:
    - 'openenv-core[core]>=0.2.2'
    - 'openai>=1.0'
    - 'httpx>=0.24.0'
    - 'plotly>=6.6.0'
    - 'pandas>=2.3.3'
    - 'gradio>=4.0'
    - 'pydantic>=2.0.0'
    - 'uvicorn>=0.24.0'
    - 'fastapi>=0.104.0'

# Validation checklist for this environment.
# NOTE(review): how these flags are produced is not visible in this file;
# presumably they are recorded by the author — confirm.
validation:
  openenv_spec: true
  docker_build: true
  baseline_reproducible: true
  # Agrees with the five entries defined under `tasks`.
  tasks_count: 5
  # NOTE(review): cannot be checked from this file; verify against the test suite.
  tests_passing: 32