# Provenance (captured from the Hugging Face Spaces file viewer):
#   Space:  Avnishjain / code-review  (status shown: "Configuration error")
#   File:   code-review / openenv.yaml  (4.85 kB)
#   Commit: "Upload 26 files" by Avnishjain, ec566e9 (verified), 5 days ago
	name: code-review-env
	version: "1.0.0"
	description: >
	An OpenEnv-compliant AI training environment that simulates professional
	Python code review. Agents learn to identify bugs, security vulnerabilities,
	performance issues, style problems, and documentation gaps across three
	progressively harder tasks.

	tags:
	- openenv
	- code-review
	- python
	- security
	- software-engineering

	author: imaginephoenix / rawgenn.tech
	license: MIT

	environment:
	class: CodeReviewEnv
	module: env.environment
	entrypoint: app.py
	framework: fastapi

	observation_space:
	type: object
	description: >
	What the agent sees each step. Contains the code snippet to review,
	task instructions, all previously submitted comments, and optional
	feedback from the last step.
	fields:
	task_id:
	type: string
	description: Identifier of the active task
	step:
	type: integer
	description: Current step number (0-indexed)
	snippet:
	type: object
	description: Python source code to review
	fields:
	file_name: { type: string }
	source: { type: string, description: "Full Python source with line numbers" }
	language: { type: string, const: "python" }
	instructions:
	type: string
	description: Review instructions and scope for this task
	previous_comments:
	type: array
	description: All review comments submitted in prior steps
	feedback:
	type: string
	nullable: true
	description: Environment feedback on the most recent action
	done:
	type: boolean

	action_space:
	type: object
	description: >
	What the agent submits. A list of review comments (each with line,
	category, severity, message, optional suggestion) plus an optional
	overall summary and a submit flag.
	fields:
	comments:
	type: array
	items:
	type: object
	fields:
	line: { type: integer, nullable: true, description: "1-indexed line number" }
	category:
	type: string
	enum: [bug, security, performance, style, documentation]
	severity:
	type: string
	enum: [low, medium, high, critical]
	message: { type: string, minLength: 5, maxLength: 500 }
	suggestion: { type: string, nullable: true, maxLength: 500 }
	summary:
	type: string
	nullable: true
	description: "Required for task_3_hard; optional otherwise"
	submit:
	type: boolean
	description: "Set true to finalise the review and trigger the grader"

	reward:
	type: float
	range: [-1.0, 1.0]
	description: >
	Shaped reward with partial progress signals. Incremental positive reward
	for each new valid comment added (proportional to issue severity). On
	submit: final grader score mapped to [-0.2, 1.0]. Penalties for false
	positives, missed criticals, and spamming low-quality comments.

	tasks:
	- id: task_1_easy
	title: "Bug Detection & Style Review"
	difficulty: easy
	categories: [bug, style]
	max_steps: 5
	passing_threshold: 0.55
	description: >
	Review calculator.py (31 lines) for division-by-zero bugs, off-by-one
	errors, empty-collection crashes, and Python style anti-patterns.

	- id: task_2_medium
	title: "Security & Performance Audit"
	difficulty: medium
	categories: [security, performance]
	max_steps: 7
	passing_threshold: 0.60
	description: >
	Audit user_service.py (55 lines) for SQL injection, broken MD5 password
	hashing, unbounded DB queries, and connection churn. Missed critical
	security issues carry heavy penalties.

	- id: task_3_hard
	title: "Comprehensive Code Review"
	difficulty: hard
	categories: [bug, security, performance, style, documentation]
	max_steps: 10
	passing_threshold: 0.65
	description: >
	Full production-grade review of data_pipeline.py (49 lines). Covers
	all five categories including shell injection, unsafe pickle
	deserialization, ZeroDivisionError, and missing docstrings. An overall
	written summary is required.

	api_endpoints:
	- path: /reset
	method: POST
	description: Start or restart an episode
	- path: /step
	method: POST
	description: Submit an action
	- path: /state
	method: GET
	description: Get full serialisable state
	- path: /tasks
	method: GET
	description: List all available tasks
	- path: /health
	method: GET
	description: Health check

	baseline:
	model: gpt-4o
	script: baseline_agent.py
	expected_scores:
	task_1_easy: ~0.75
	task_2_medium: ~0.65
	task_3_hard: ~0.55

	docker:
	base_image: python:3.11-slim
	port: 7860
	build: docker build -t code-review-env .
	run: docker run -p 7860:7860 code-review-env

	huggingface:
	space_sdk: docker
	tags: [openenv, code-review, ai-agent, evaluation]