Spaces:

inmodel
/

code-review-env

Running

Nitish

docs: final README and openenv.yaml sync with new tasks and baselines

4a2e8a2 4 days ago

3.86 kB

	# OpenEnv Environment Specification
	# This file describes the Code Security Review environment for the Meta PyTorch OpenEnv Hackathon.

	# Environment identity: name, release version, a short summary, and author.
	name: code-security-review
	# Quoted so the version is always read as a string, never as a number.
	version: "1.0.0"
	# Folded scalar: the wrapped lines below are joined into a single paragraph.
	description: >
	  An RL environment for training AI agents to perform code security review.
	  Agents analyze code snippets from production pull requests and identify bugs,
	  vulnerabilities, and security issues.
	author: Inmodel Labs

	# Catalogue of review challenges offered by the environment.
	# Every entry carries a unique id, a display name, a one-line description,
	# a difficulty tier (easy / medium / hard), a step budget (max_steps), and
	# the [min, max] reward interval for the task.
	tasks:
	  - id: python-off-by-one
	    name: "Python Off-by-One Error"
	    description: "Identify an off-by-one index error in a Python finance batch processor"
	    difficulty: easy
	    max_steps: 2
	    reward_range: [0.0, 1.0]

	  - id: js-idor-auth
	    name: "JavaScript IDOR Authorization Bypass"
	    description: "Identify a horizontal privilege escalation (IDOR) in a Node.js REST profile endpoint"
	    difficulty: medium
	    max_steps: 2
	    reward_range: [0.0, 1.0]

	  - id: python-pickle-deserialization
	    name: "Python Pickle Deserialization"
	    description: "Identify an insecure deserialization vulnerability using pickle in a background worker"
	    difficulty: hard
	    max_steps: 2
	    reward_range: [0.0, 1.0]

	# The Action space defines the format of the agent's response.
	# Per the description below, an episode has two phases: a file request,
	# then a full review. Each review field is scored by the grader to provide
	# partial progress signals.
	action_space:
	  type: object
	  description: >
	    Two-phase action space. Phase 1: submit {"request_file": true} to unlock
	    the code snippet (+0.20 reward). Phase 2: submit a full review JSON.
	  properties:
	    request_file:
	      type: boolean
	      description: "Phase 1: Request the hidden file contents"
	    bug_identified:
	      type: boolean
	      description: "Boolean: true if a bug exists"
	    bug_location:
	      type: string
	      description: "String: Pinpoint the bug's location in code"
	    bug_type:
	      type: string
	      # Plain "|" separators: a backslash before "|" is not a valid escape
	      # in a double-quoted YAML scalar and makes the document unparseable.
	      # NOTE(review): no listed value obviously matches the js-idor-auth
	      # task — confirm which bug_type the grader expects for it.
	      description: "String: off-by-one | logic-error | insecure-deserialization | none"
	    bug_description:
	      type: string
	      description: "String: Detailed analysis of the vulnerability"
	    severity:
	      type: string
	      enum: [none, low, medium, high, critical]
	      description: "String: none | low | medium | high | critical"
	    suggested_fix:
	      type: string
	      description: "String: How to fix the identified bug"

	# Observation space: the structured object the agent receives at each step.
	# Alongside the snippet itself it carries context fields (description of the
	# surrounding system, PR title, file path) describing the code's purpose.
	observation_space:
	  type: object
	  properties:
	    task_id:
	      type: string
	      description: "Unique task identifier"
	    language:
	      type: string
	      description: "Source code language"
	    difficulty:
	      type: string
	      enum: [easy, medium, hard]
	      description: "Task complexity (easy/medium/hard)"
	    code_snippet:
	      type: string
	      description: "The source code to be reviewed"
	    context:
	      type: string
	      description: "Real-world context (e.g., API description)"
	    pr_title:
	      type: string
	      description: "Pull Request title for additional intent context"
	    file_path:
	      type: string
	      description: "Relative path to the file in the repository"

	# How an episode is scored. The per-step components listed in the description
	# add up to more than 1.0 (0.20 for the file request plus 1.00 of review
	# components), which is why the description states the total is clamped.
	reward:
	  min: 0.0
	  max: 1.0
	  description: >
	    Step 1 — File request: +0.20 (flat, always granted).
	    Step 2 — Bug review: partial rewards for bug identification (0.20),
	    correct bug type (0.20), precise location (0.10), description quality (0.25,
	    keyword density), fix quality (0.15), correct severity (0.10).
	    Episode total is clamped to [0.0, 1.0]. Grader penalizes keyword stuffing.

	# Routes exposed by the environment, written as "<HTTP method> <path>".
	endpoints:
	  health: GET /
	  reset: POST /reset
	  step: POST /step
	  state: GET /state
	  tasks: GET /tasks