Spaces:

PrakashCider
/

teamforge

Sleeping

teamforge / openenv.yaml

Your Name

fix(OpenEnv): implement system-wide [0.1, 0.9] boundary scrub for Phase 2 compliance

efa2d2a about 1 month ago

7.38 kB

	# Package metadata for the benchmark environment.
	name: teamforge
	# Quoted so the version is read as a string, not a number.
	version: "1.1.0"
	# Folded scalar: the three lines below are joined into one paragraph.
	description: >
	  A structured multi-phase benchmark for autonomous software engineering agents.
	  The agent simulates a full software development team: planning, coding, testing,
	  reviewing, and reflecting — inside a real isolated Git repository.

	author: TeamForge
	license: MIT

	# ── OpenEnv Interface ──────────────────────────────────────────────────────────
	# Entry point and the methods (reset / step / state) the environment exposes.
	entry_point: environment.TeamForgeEnv

	methods:
	  reset:
	    description: "Start a new episode for the given task_id. Returns initial Observation."
	    parameters:
	      task_id:
	        type: string
	        # Must list every id declared under the top-level `tasks` key;
	        # `bonus_task` is declared there, so it is accepted here as well.
	        enum:
	          - easy_bugfix_chunk_list
	          - medium_refactor_stats
	          - hard_lru_cache_performance
	          - bonus_task
	        description: "Which task to run this episode."
	  step:
	    description: "Execute one typed action. Returns updated Observation with reward."
	    parameters:
	      action:
	        type: object
	        description: "A typed Action model (see action_space below)."
	  state:
	    # No parameters are declared for this method.
	    description: "Return current environment state as a plain serialisable dict."

	# ── Action Space ───────────────────────────────────────────────────────────────
	# Every action carries a literal `type` field whose value equals the action
	# name; that field is the discriminator for the union.
	action_space:
	  type: union
	  description: "One of 8 structured actions. Discriminated by the `type` field."
	  actions:
	    - name: plan_step
	      fields:
	        type: {type: literal, value: plan_step}
	        step_number: {type: integer, minimum: 1}
	        description: {type: string, minLength: 5}
	        estimated_effort: {type: string, enum: [low, medium, high]}
	        depends_on: {type: array, items: integer}

	    - name: edit_file
	      fields:
	        type: {type: literal, value: edit_file}
	        file_path: {type: string}
	        content: {type: string}
	        reason: {type: string, minLength: 5}

	    - name: run_tests
	      fields:
	        type: {type: literal, value: run_tests}
	        test_path: {type: string, nullable: true}
	        timeout_seconds: {type: integer, minimum: 5, maximum: 120, default: 30}

	    - name: run_lint
	      fields:
	        type: {type: literal, value: run_lint}
	        fix: {type: boolean, default: false}
	        file_path: {type: string, nullable: true}

	    - name: generate_review
	      fields:
	        type: {type: literal, value: generate_review}
	        focus_areas: {type: array, items: string}
	        review_text: {type: string, minLength: 20}

	    - name: commit
	      fields:
	        type: {type: literal, value: commit}
	        message: {type: string, minLength: 10}
	        files: {type: array, items: string}

	    - name: self_reflect
	      fields:
	        type: {type: literal, value: self_reflect}
	        what_went_well: {type: string, minLength: 10}
	        what_to_improve: {type: string, minLength: 10}
	        adjusted_plan: {type: string, nullable: true}

	    - name: request_iteration
	      fields:
	        type: {type: literal, value: request_iteration}
	        reason: {type: string, minLength: 10}
	        target_issues: {type: array, items: string}

	# ── Observation Space ──────────────────────────────────────────────────────────
	# Shape of the Observation object returned by reset() and step().
	observation_space:
	  type: object
	  description: "Full typed Observation returned after every step() and reset()."
	  fields:
	    # Task identity and progress.
	    task_id: {type: string}
	    task_description: {type: string}
	    # Includes `bonus` because a task under `tasks` declares that difficulty.
	    difficulty: {type: string, enum: [easy, medium, hard, bonus]}
	    step_number: {type: integer}
	    max_steps: {type: integer}
	    phase: {type: string, enum: [planning, coding, testing, reviewing, reflecting, done]}
	    # Repository snapshot.
	    repo_files: {type: array, description: "List of FileSnapshot objects (path, content, size_bytes)"}
	    git_log: {type: array, items: string}
	    # Outcome of the most recent action.
	    last_action_type: {type: string, nullable: true}
	    last_action_status: {type: string, enum: [success, failure, partial]}
	    last_action_output: {type: string}
	    test_results: {type: object, nullable: true, description: "TestResult: passed, failed, errors, output, duration_seconds"}
	    lint_results: {type: object, nullable: true, description: "LintResult: violations, output, score"}
	    # Artifacts accumulated over the episode.
	    plan: {type: array, description: "List of PlanStep actions issued so far"}
	    reviews: {type: array, description: "List of ReviewArtifact objects"}
	    reflections: {type: array, description: "List of ReflectionArtifact objects"}
	    # Reward signal and termination.
	    reward: {type: number, description: "Reward for the last action"}
	    cumulative_reward: {type: number}
	    done: {type: boolean}
	    info: {type: object}

	# ── Reward ─────────────────────────────────────────────────────────────────────
	# NOTE(review): `range` lists 0.0 and 1.0 as bounds, while the description
	# says rewards are strictly between 0 and 1 — confirm whether the bounds are
	# meant to be exclusive or whether `range` should be narrowed.
	reward:
	  range: [0.0, 1.0]
	  type: dense
	  description: >
	    Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
	    clean lint, reviews, reflections, commits. Always strictly between 0 and 1.

	# ── Tasks ──────────────────────────────────────────────────────────────────────
	# One entry per task. All entries name the same grader and score_range;
	# they differ in id, difficulty, step budget and description.
	tasks:
	  - id: easy_bugfix_chunk_list
	    difficulty: easy
	    max_steps: 20
	    description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
	    grader: grader.grade_task
	    score_range: [0.0, 1.0]

	  - id: medium_refactor_stats
	    difficulty: medium
	    max_steps: 30
	    description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
	    grader: grader.grade_task
	    score_range: [0.0, 1.0]

	  - id: hard_lru_cache_performance
	    difficulty: hard
	    max_steps: 40
	    description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
	    grader: grader.grade_task
	    score_range: [0.0, 1.0]

	  - id: bonus_task
	    difficulty: bonus
	    max_steps: 10
	    description: "Bonus: Optimize the LRU cache for memory efficiency. Gradual memory reduction is rewarded."
	    grader: grader.grade_task
	    score_range: [0.0, 1.0]

	# ── Infrastructure ─────────────────────────────────────────────────────────────
	# Resource requirements and the per-episode time limit.
	runtime:
	  # Quoted because the value starts with `>`, a YAML indicator character.
	  python: ">=3.11"
	  memory_gb: 8
	  vcpu: 2
	  max_episode_minutes: 20

	# Inference script and the environment variables declared for it.
	inference:
	  script: inference.py
	  env_vars:
	    API_BASE_URL: "https://api.groq.com/openai/v1"
	    # NOTE(review): confirm this model id is still served by the endpoint above.
	    MODEL_NAME: "llama3-8b-8192"
	    # Left empty here; presumably supplied at deploy time — confirm.
	    HF_TOKEN: ""

	# Deployment settings: Dockerfile name, Spaces flag, and app path.
	deployment:
	  dockerfile: Dockerfile
	  huggingface_spaces: true
	  gradio_app: server/app.py

	# ── API Endpoints (for OpenEnv validator) ──────────────────────────────────────
	# HTTP method and path for each endpoint. `body` values are JSON written as
	# single-quoted YAML strings so the inner double quotes need no escaping.
	api:
	  reset:
	    method: POST
	    path: /reset
	    body: '{"task_id": "easy_bugfix_chunk_list"}'
	  step:
	    method: POST
	    path: /step
	    body: '{"action": {"type": "run_tests"}}'
	  state:
	    method: GET
	    path: /state
	  health:
	    method: GET
	    path: /health