Spaces:

eastbrick
/

releaseops-env

Sleeping

App Files Files Community

releaseops-env / server /app.py

eastbrick

first commit

408d02c about 1 month ago

raw

history blame contribute delete

4.33 kB

	"""FastAPI application for ReleaseOps-Env.

	Exposes the core OpenEnv endpoints (reset/step/state/health) via create_app,
	plus the hackathon-required endpoints: /tasks, /grader, /baseline.
	"""

	import json
	import os
	from pathlib import Path

	from fastapi import HTTPException
	from openenv.core.env_server.http_server import create_app
	from releaseops_env.models import ReleaseAction, ReleaseObservation
	from server.releaseops_environment import ReleaseOpsEnvironment

	# Location of the task definitions: a "tasks" directory that sits next to the
	# directory containing this file. The endpoints below read one
	# ground_truth.json per task sub-directory from here.
	TASKS_DIR = Path(__file__).parent.parent / "tasks"

	# ── Core OpenEnv app ────────────────────────────────────────────────
	# The application object is built by OpenEnv's create_app from the environment
	# class and its action/observation models. Per the module docstring this is
	# what provides the core reset/step/state/health endpoints; the extra routes
	# defined further down are registered on this same object.
	app = create_app(
	ReleaseOpsEnvironment,
	ReleaseAction,
	ReleaseObservation,
	env_name="releaseops_env",
	)


	# ── /tasks — list available tasks and action schema ─────────────────
	@app.get("/tasks")
	def list_tasks():
	    """Return all available tasks with metadata and the action schema.

	    Every sub-directory of ``TASKS_DIR`` is visited in sorted order and its
	    ``ground_truth.json`` is read. Entries that are not directories, or that
	    have no such file, are skipped. If ``TASKS_DIR`` itself does not exist
	    the task list is simply empty rather than the request failing.

	    Returns:
	        A dict with three keys:

	        * ``tasks`` - one dict per task with ``task_id`` (the directory
	          name), ``difficulty``, ``optimal_decision``, ``max_steps`` and
	          ``expected_score_range``; values missing from the ground-truth
	          file fall back to ``"unknown"``, ``""``, ``12`` and ``{}``.
	        * ``action_schema`` - ``ReleaseAction.model_json_schema()``.
	        * ``observation_schema`` - ``ReleaseObservation.model_json_schema()``.
	    """
	    # A missing tasks directory would make iterdir() raise FileNotFoundError;
	    # treat it as "no tasks available" instead.
	    task_dirs = sorted(TASKS_DIR.iterdir()) if TASKS_DIR.is_dir() else []

	    tasks = []
	    for task_dir in task_dirs:
	        gt_path = task_dir / "ground_truth.json"
	        # is_file() (rather than exists()) also rejects a directory that
	        # happens to be named ground_truth.json.
	        if not task_dir.is_dir() or not gt_path.is_file():
	            continue

	        # JSON text is UTF-8; do not depend on the platform's default encoding.
	        with gt_path.open(encoding="utf-8") as f:
	            gt = json.load(f)

	        tasks.append(
	            {
	                "task_id": task_dir.name,
	                "difficulty": gt.get("difficulty", "unknown"),
	                "optimal_decision": gt.get("optimal_decision", ""),
	                "max_steps": gt.get("max_steps", 12),
	                "expected_score_range": gt.get("expected_score_range", {}),
	            }
	        )

	    return {
	        "tasks": tasks,
	        "action_schema": ReleaseAction.model_json_schema(),
	        "observation_schema": ReleaseObservation.model_json_schema(),
	    }


	# ── /grader — run the grader on one task using a built-in reference trajectory ──
	@app.post("/grader")
	def run_grader(task_id: str = "easy_001"):
	    """Play a reference trajectory for one task and return the grader result.

	    The task is validated first. Then a fresh environment is created and
	    reset to ``task_id``, a fixed list of evidence-gathering actions is
	    played, and - unless the episode has already finished - a final decision
	    built from the task's ``ground_truth.json`` is submitted.

	    Args:
	        task_id: Name of a task directory directly under ``TASKS_DIR``.

	    Returns:
	        A dict with ``task_id``, ``score``, ``grader_breakdown``, ``done``,
	        ``steps_taken`` and ``cumulative_reward``, taken from the last
	        observation and the environment state.

	    Raises:
	        HTTPException: 404 when ``task_id`` is not a plain directory name or
	            the task has no ``ground_truth.json``.
	    """
	    # task_id arrives from the request and is joined into a filesystem path,
	    # so only a single path component is accepted: no separators, no "..",
	    # no absolute paths.
	    if task_id in {"", ".", ".."} or Path(task_id).name != task_id:
	        raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found")

	    # Validate before touching the environment so an unknown task yields a
	    # clean 404 instead of whatever env.reset() does with an unknown id.
	    gt_path = TASKS_DIR / task_id / "ground_truth.json"
	    if not gt_path.is_file():
	        raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found")

	    # JSON text is UTF-8; do not depend on the platform's default encoding.
	    with gt_path.open(encoding="utf-8") as f:
	        gt = json.load(f)

	    env = ReleaseOpsEnvironment()
	    obs = env.reset(task_id=task_id)

	    # Play a standard evidence-gathering trajectory
	    evidence_actions = [
	        ReleaseAction(action_type="inspect_change", section="diff"),
	        ReleaseAction(action_type="inspect_change", section="tests"),
	        ReleaseAction(action_type="inspect_change", section="approvals"),
	        ReleaseAction(action_type="inspect_dependencies"),
	        ReleaseAction(
	            action_type="search_incidents", keywords=["retry", "timeout", "latency"]
	        ),
	        ReleaseAction(action_type="check_policy"),
	    ]

	    for action in evidence_actions:
	        obs = env.step(action)
	        # The environment may end the episode early (obs.done); stop stepping.
	        if obs.done:
	            break

	    # Submit the final decision only if the episode is still running.
	    if not obs.done:
	        obs = env.step(
	            ReleaseAction(
	                action_type="submit_decision",
	                final_decision=gt.get("optimal_decision", "block"),
	                reason_codes=gt.get("required_reason_codes", []),
	            )
	        )

	    return {
	        "task_id": task_id,
	        "score": obs.final_score,
	        "grader_breakdown": obs.grader_breakdown,
	        "done": obs.done,
	        "steps_taken": env.state.step_count,
	        "cumulative_reward": obs.cumulative_reward,
	    }


	# ── /baseline — run baseline agent on all tasks ─────────────────────
	@app.post("/baseline")
	def run_baseline_endpoint():
	    """Run the bundled heuristic baseline and return whatever it reports.

	    The work is delegated entirely to
	    ``baseline.heuristic_agent.run_heuristic_baseline``; its return value is
	    passed through unchanged. As documented for this endpoint, the baseline
	    is a rule-based agent run against all tasks and needs no LLM API key.
	    """
	    # Imported at call time, so the baseline package is only loaded when this
	    # endpoint is actually hit.
	    from baseline.heuristic_agent import run_heuristic_baseline as _run_baseline

	    return _run_baseline()


	def main(host: str = "0.0.0.0", port: int = 7860):
	    """Serve ``app`` with uvicorn.

	    Calling ``main()`` with no arguments behaves exactly as before; the
	    parameters only make the bind address configurable for callers that
	    need a different one.

	    Args:
	        host: Address to bind to. Defaults to ``"0.0.0.0"``.
	        port: TCP port to listen on. Defaults to ``7860``.
	    """
	    # uvicorn is imported only when the server is actually launched.
	    import uvicorn

	    uvicorn.run(app, host=host, port=port)


	# Launch the server only when this file is executed as a script; importing
	# the module just defines `app` and its routes.
	if __name__ == "__main__":
	main()