# NOTE(review): the three lines here ("Spaces: / Sleeping / Sleeping") were
# Hugging Face page chrome captured by the scrape, not part of the source file.
"""
TeamForge — Hugging Face Spaces Entry Point
===========================================
Exposes BOTH:
1. REST API (for OpenEnv validator: POST /reset, POST /step, GET /state)
2. Gradio UI (for human demo)
The OpenEnv validator POSTs to /reset — this must return a valid JSON observation.
"""
from __future__ import annotations

import json
import os
import threading
from typing import Any, Dict, Optional

import gradio as gr
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from environment import TeamForgeEnv
from models import (
    Commit, EditFile, GenerateReview,
    PlanStep, RunLint, RunTests, SelfReflect, RequestIteration,
)
from tasks import ALL_TASK_IDS
# ── Shared environment instance ───────────────────────────────────────────────
# One module-level environment shared by the REST API and the Gradio UI.
# _lock serializes all access: FastAPI and Gradio handlers may run on
# different threads, and TeamForgeEnv is not assumed thread-safe.
env = TeamForgeEnv()
_obs = None  # last Observation from reset()/step(); None until first reset
_lock = threading.Lock()

# ── FastAPI app (REST endpoints for OpenEnv validator) ────────────────────────
api = FastAPI(title="TeamForge OpenEnv API", version="1.0.0")
class ResetRequest(BaseModel):
    """Body of POST /reset; defaults to the easy bug-fix task."""

    task_id: str = "easy_bugfix_chunk_list"
class StepRequest(BaseModel):
    """Body of POST /step: a raw action dict whose "type" key selects the action model."""

    action: Dict[str, Any]
# NOTE(review): route decorator restored — it appears to have been lost in
# extraction; without it this health check is never registered.
@api.get("/")
def root():
    """Health check — validator pings this first."""
    return {"status": "ok", "env": "teamforge", "version": "1.0.0"}
# NOTE(review): route decorator restored — lost in extraction.
@api.get("/health")
def health():
    """Minimal liveness probe."""
    return {"status": "ok"}
# NOTE(review): route decorator restored — the docstring documents POST /reset,
# but the decorator was lost in extraction.
@api.post("/reset")
def reset(req: Optional[ResetRequest] = None):
    """
    OpenEnv reset endpoint.

    POST /reset {"task_id": "easy_bugfix_chunk_list"}
    Returns the full Observation as JSON.

    An unknown or missing task_id silently falls back to the default easy
    task so the validator can always start an episode.
    """
    global _obs
    with _lock:
        try:
            task_id = req.task_id if req else "easy_bugfix_chunk_list"
            if task_id not in ALL_TASK_IDS:
                task_id = "easy_bugfix_chunk_list"
            _obs = env.reset(task_id)
            return JSONResponse(content=_obs.model_dump())
        except Exception as e:
            # Surface any environment failure as a 500 carrying the message.
            raise HTTPException(status_code=500, detail=str(e)) from e
# NOTE(review): route decorator restored — the docstring documents POST /step,
# but the decorator was lost in extraction.
@api.post("/step")
def step(req: StepRequest):
    """
    OpenEnv step endpoint.

    POST /step {"action": {"type": "run_tests", ...}}
    Returns the updated Observation as JSON. If called before /reset, a
    default episode is started automatically.
    """
    global _obs
    with _lock:
        if _obs is None:
            # Validator may call /step first; start a default episode.
            _obs = env.reset("easy_bugfix_chunk_list")
        # Dispatch table: action "type" string -> pydantic action model.
        dispatch = {
            "plan_step": PlanStep,
            "edit_file": EditFile,
            "run_tests": RunTests,
            "run_lint": RunLint,
            "generate_review": GenerateReview,
            "commit": Commit,
            "self_reflect": SelfReflect,
            "request_iteration": RequestIteration,
        }
        try:
            action_data = req.action
            action_type = action_data.get("type", "")
            cls = dispatch.get(action_type)
            if cls is None:
                raise HTTPException(status_code=400, detail=f"Unknown action type: {action_type}")
            action = cls(**action_data)
            _obs = env.step(action)
            return JSONResponse(content=_obs.model_dump())
        except HTTPException:
            raise  # propagate the 400 above unchanged
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e
# NOTE(review): route decorator restored — the docstring documents GET /state,
# but the decorator was lost in extraction.
@api.get("/state")
def state():
    """
    OpenEnv state endpoint.

    GET /state → current environment state dict.
    """
    with _lock:
        return JSONResponse(content=env.state())
# NOTE(review): route decorator restored — lost in extraction.
@api.get("/tasks")
def tasks():
    """List all available task ids."""
    return {"tasks": ALL_TASK_IDS}
# NOTE(review): route decorator restored — lost in extraction. The HTTP method
# is assumed GET (read-only grading of the current episode); confirm against
# the validator spec.
@api.get("/grade")
def grade():
    """Grade the current episode and return the result as JSON."""
    with _lock:
        # NOTE(review): presumably env.grade() handles the no-episode case;
        # verify it does not raise before the first /reset.
        result = env.grade()
        return JSONResponse(content=result.model_dump())
# ── Gradio UI (for human demo) ────────────────────────────────────────────────
def gradio_reset(task_id: str):
    """Start a fresh episode from the UI; return (markdown summary, state JSON)."""
    global _obs
    with _lock:
        # Tear down any previous sandbox before resetting.
        # NOTE(review): reaches into a private attribute — consider exposing
        # an env.close()/env.teardown() API instead.
        env._sandbox.teardown()
        _obs = env.reset(task_id)
        state_dict = env.state()
        desc = f"## ✅ Episode started: `{task_id}`\n\n"
        desc += f"**Files in repo:** `{[f.path for f in _obs.repo_files]}`\n\n"
        desc += f"**Task:**\n{_obs.task_description[:400]}"
        return desc, json.dumps(state_dict, indent=2)
def gradio_run_demo(task_id: str):
    """Run the scripted demo solution for the easy task and return a text log."""
    if task_id != "easy_bugfix_chunk_list":
        return "⚠️ Scripted demo only available for easy_bugfix_chunk_list.\nFor other tasks, use the API or inference.py"
    global _obs
    with _lock:
        env._sandbox.teardown()
        _obs = env.reset(task_id)
    log = [f"[START] task={task_id}\n"]
    # Fixed action sequence: plan -> fix -> test -> lint -> review -> reflect -> commit.
    steps = [
        PlanStep(step_number=1, description="Read chunk_list, find range() bug", estimated_effort="low"),
        PlanStep(step_number=2, description="Fix range(0,len(lst)-1,n) → range(0,len(lst),n)", estimated_effort="low"),
        EditFile(
            file_path="utils/list_ops.py",
            content='"""List utility operations."""\nfrom typing import Any, List\n\n\ndef chunk_list(lst: List[Any], n: int) -> List[List[Any]]:\n    """Split lst into chunks of size n."""\n    if n <= 0:\n        raise ValueError("Chunk size must be positive")\n    return [lst[i : i + n] for i in range(0, len(lst), n)]\n\n\ndef flatten(lst: List[List[Any]]) -> List[Any]:\n    """Flatten one level."""\n    return [item for sublist in lst for item in sublist]\n',
            reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
        ),
        RunTests(),
        RunLint(fix=False),
        GenerateReview(
            focus_areas=["correctness", "off-by-one", "range"],
            review_text=(
                "Bug was off-by-one in range() — range(0,len(lst)-1,n) dropped last chunk. "
                "Fix: range(0,len(lst),n). All 7 tests pass. Lint clean. O(n) complexity preserved."
            ),
        ),
        SelfReflect(
            what_went_well="Identified off-by-one immediately from test_odd_split assertion.",
            what_to_improve="Should run lint before tests next time.",
        ),
        Commit(message="fix(list_ops): correct off-by-one in chunk_list range() call"),
    ]
    with _lock:
        for action in steps:
            _obs = env.step(action)
            tr = _obs.test_results
            log.append(
                f"[STEP {_obs.step_number:2d}] {action.type:20s} "
                f"reward={_obs.reward:+.4f} "
                f"tests={'%dp/%df' % (tr.passed, tr.failed) if tr else 'N/A'}"
            )
        result = env.grade()
    log.append(f"\n[END] FINAL_SCORE={result.final_score:.4f} | PASSED={result.passed}")
    log.append(f"  test_pass_rate = {result.test_pass_rate:.4f}")
    log.append(f"  lint_score     = {result.lint_score:.4f}")
    log.append(f"  efficiency     = {result.efficiency_score:.4f}")
    log.append(f"  review_quality = {result.review_quality:.4f}")
    return "\n".join(log)
# NOTE(review): emoji in the title, markdown, and button labels below were
# mojibake in the extracted source; restored to plausible originals — confirm
# against the deployed Space.
with gr.Blocks(title="TeamForge — OpenEnv Benchmark") as demo:
    gr.Markdown("""
# 🏗️ TeamForge: OpenEnv Benchmark for Autonomous Software Engineering Agents

**REST API available at this Space URL** — the OpenEnv validator uses:
- `POST /reset` — start episode
- `POST /step` — execute action
- `GET /state` — current state

> Simulates a full software development team: **Plan → Code → Test → Review → Reflect**
""")
    with gr.Row():
        task_dd = gr.Dropdown(choices=ALL_TASK_IDS, value=ALL_TASK_IDS[0], label="Task")
        reset_btn = gr.Button("🔄 Init Episode", variant="secondary")
        demo_btn = gr.Button("▶ Run Demo (Easy Task)", variant="primary")
    with gr.Row():
        obs_out = gr.Markdown(label="Observation")
        state_out = gr.Code(label="State JSON", language="json")
    log_out = gr.Textbox(label="Episode Log", lines=20, interactive=False)
    reset_btn.click(gradio_reset, inputs=[task_dd], outputs=[obs_out, state_out])
    demo_btn.click(gradio_run_demo, inputs=[task_dd], outputs=[log_out])
# ── Mount Gradio inside FastAPI ───────────────────────────────────────────────
from gradio.routes import mount_gradio_app

# Serve the Gradio demo under /ui while the REST API stays at the root,
# so one process answers both the validator and human visitors.
app = mount_gradio_app(api, demo, path="/ui")
# ── Entry point ───────────────────────────────────────────────────────────────
def main():
    """Run the combined FastAPI + Gradio app (HF Spaces expects port 7860)."""
    # String default so int() always receives a str, matching os.getenv's type.
    port = int(os.getenv("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)


if __name__ == "__main__":
    main()