Spaces:

PrakashCider
/

teamforge

Sleeping

Your Name

fix: add FastAPI REST endpoints for OpenEnv validator

637f42c about 2 months ago

15 kB

	#!/usr/bin/env python3
	"""
	TeamForge Demo
	==============
	A self-contained 90-second scripted demo that shows exactly what the
	environment does, without needing an API key.

	Run:
	python demo.py

	What it demonstrates:
	1. Environment reset + repo snapshot
	2. Agent planning phase (PLAN logs)
	3. Bug detection + code fix (CODE logs)
	4. Test execution — before and after fix
	5. Lint pass
	6. Review + reflection artifacts (REVIEW / REFLECT logs)
	7. Commit
	8. Grader output with final score
	"""

	from __future__ import annotations

	import time
	from rich.console import Console
	from rich.live import Live
	from rich.panel import Panel
	from rich.syntax import Syntax
	from rich.table import Table
	from rich.text import Text
	from rich import box
	from rich.columns import Columns
	from rich.rule import Rule
	from rich.padding import Padding

	from environment import TeamForgeEnv
	from models import (
	Commit, EditFile, GenerateReview, PlanStep,
	RunLint, RunTests, SelfReflect,
	)

	# Shared Rich console used by every helper below; constructed with width=100.
	console = Console(width=100)

	# Pause lengths passed to time.sleep() between demo steps (seconds).
	PAUSE_SHORT = 0.6
	PAUSE_MEDIUM = 1.0
	PAUSE_LONG = 1.4

	# ─── Visual helpers ───────────────────────────────────────────────────────────

	def phase_banner(phase: str, color: str = "cyan") -> None:
	    """Print a coloured horizontal rule announcing a demo phase, then pause.

	    Args:
	        phase: Heading text shown in the rule.
	        color: Rich colour name used for both the heading and the rule line.
	    """
	    heading = f"[bold {color}]{phase}[/bold {color}]"
	    # Blank line before and after the rule so the banner stands apart.
	    console.print()
	    console.rule(heading, style=color)
	    console.print()
	    time.sleep(PAUSE_SHORT)


	def log_line(tag: str, content: str, tag_color: str = "bright_blue") -> None:
	    """Print one log entry prefixed with a coloured ``[TAG]`` marker.

	    Args:
	        tag: Short label placed inside square brackets.
	        content: Message text (may contain Rich markup) printed after the tag.
	        tag_color: Rich colour name applied to the bracketed tag.
	    """
	    prefix = f" [{tag_color}][{tag}][/{tag_color}]"
	    console.print(f"{prefix} {content}")
	    # Short delay so consecutive log lines appear one at a time.
	    time.sleep(0.08)


	def thinking(msg: str) -> None:
	    """Print a dim, italic "agent is thinking" line and pause briefly.

	    Args:
	        msg: Text shown after the thinking emoji.
	    """
	    styled = f" [dim italic]🤔 {msg}[/dim italic]"
	    console.print(styled)
	    time.sleep(PAUSE_SHORT)


	def reward_line(step: int, action: str, reward: float, cum: float) -> None:
	    """Print a one-line reward summary for a single environment step.

	    Args:
	        step: Step number, shown zero-padded to two digits.
	        action: Action name, left-aligned in a 22-character field.
	        reward: Reward for this step; decides the colour of the reward figure.
	        cum: Cumulative reward so far.
	    """
	    # Positive rewards are green, clearly negative ones (< -0.05) red,
	    # everything in between is dimmed.
	    if reward > 0:
	        tint = "green"
	    elif reward < -0.05:
	        tint = "red"
	    else:
	        tint = "dim"

	    segments = (
	        f" [dim]step {step:02d}[/dim] ",
	        f"[bold]{action:22s}[/bold] ",
	        f"[{tint}]{reward:+.4f}[/{tint}] ",
	        f"[dim]cum={cum:+.4f}[/dim]",
	    )
	    console.print("".join(segments))
	    time.sleep(0.12)


	def score_bar(label: str, value: float, width: int = 22) -> str:
	    """Render a labelled horizontal score bar as a Rich markup string.

	    Args:
	        label: Text shown left of the bar, left-aligned in a 22-character field.
	        value: Score to display, expected in [0, 1]. Out-of-range values are
	            still printed verbatim, but the bar is clamped to empty/full.
	        width: Total number of cells in the bar.

	    Returns:
	        Markup of the form ``[bold]label[/bold] <bar> [bold]value[/bold]``.
	        The bar is green for value >= 0.8, yellow for value >= 0.5, red otherwise.
	    """
	    # Clamp the filled-cell count so the bar is always exactly `width` cells
	    # wide; without this a value > 1 overflows and a value < 0 over-pads.
	    filled = max(0, min(width, int(value * width)))
	    empty = width - filled
	    color = "green" if value >= 0.8 else ("yellow" if value >= 0.5 else "red")
	    bar = f"[{color}]{'█' * filled}{'░' * empty}[/{color}]"
	    return f"[bold]{label:22s}[/bold] {bar} [bold]{value:.4f}[/bold]"


	# ─── Demo ─────────────────────────────────────────────────────────────────────

	def run_demo() -> None:
	    """Run the scripted TeamForge demo end to end in the terminal.

	    Drives a ``TeamForgeEnv`` through one episode of the
	    ``easy_bugfix_chunk_list`` task: plan steps, a file edit, test run, lint
	    run, review, self-reflection, commit and grading. Progress is rendered
	    with Rich on the module-level ``console``, with pauses between phases.

	    The environment's sandbox is torn down on exit, including when a step
	    raises part-way through the demo.
	    """
	    env = TeamForgeEnv()
	    try:
	        # ── Title ─────────────────────────────────────────────────────────
	        console.print()
	        console.print(Panel.fit(
	            "[bold white]🏗 TeamForge[/bold white]\n"
	            "[dim]OpenEnv Benchmark for Autonomous Software Engineering Agents[/dim]\n\n"
	            "[italic]Plan · Code · Test · Review · Reflect[/italic]",
	            border_style="bright_blue",
	            padding=(1, 4),
	        ))
	        time.sleep(PAUSE_LONG)

	        # ── Reset ─────────────────────────────────────────────────────────
	        phase_banner("PHASE 0 · ENVIRONMENT INIT", "bright_blue")
	        console.print(" Initialising isolated Git sandbox for task [bold]easy_bugfix_chunk_list[/bold] …")
	        obs = env.reset("easy_bugfix_chunk_list")
	        time.sleep(PAUSE_SHORT)

	        console.print(f" [green]✓[/green] Git repo created: [dim]{env._sandbox.repo_path}[/dim]")
	        console.print(f" [green]✓[/green] Files snapshotted: {[f.path for f in obs.repo_files]}")
	        console.print()

	        # Show the buggy file as it exists in the fresh snapshot.
	        buggy_code = None
	        for f in obs.repo_files:
	            if f.path == "utils/list_ops.py":
	                buggy_code = f.content
	        if buggy_code:
	            console.print(Panel(
	                Syntax(buggy_code, "python", theme="monokai", line_numbers=True),
	                title="[red]utils/list_ops.py ← contains bug[/red]",
	                border_style="red",
	            ))
	        time.sleep(PAUSE_LONG)

	        # ── Planning ──────────────────────────────────────────────────────
	        phase_banner("PHASE 1 · PLANNING", "yellow")
	        thinking("Reading source code and tests to diagnose the issue…")

	        plans = [
	            PlanStep(step_number=1, description="Read chunk_list source; identify the range() bug", estimated_effort="low"),
	            PlanStep(step_number=2, description="Fix range(0, len(lst)-1, n) → range(0, len(lst), n)", estimated_effort="low"),
	            PlanStep(step_number=3, description="Run tests; verify all 7 pass", estimated_effort="low"),
	            PlanStep(step_number=4, description="Run lint; fix any style violations", estimated_effort="low"),
	            PlanStep(step_number=5, description="Write review documenting root cause and fix", estimated_effort="medium"),
	        ]
	        for p in plans:
	            obs = env.step(p)
	            log_line("PLAN", f"[{p.estimated_effort.upper():6s}] {p.description}", "yellow")
	            reward_line(obs.step_number, p.type, obs.reward, obs.cumulative_reward)
	        time.sleep(PAUSE_MEDIUM)

	        # ── Diagnosis display ─────────────────────────────────────────────
	        diag = Table(box=box.SIMPLE_HEAVY, show_header=False, border_style="red")
	        diag.add_column("Key", style="bold red", width=18)
	        diag.add_column("Value", style="white")
	        diag.add_row("Bug Type", "Off-by-one error in range() stop argument")
	        diag.add_row("Location", "utils/list_ops.py line 14")
	        diag.add_row("Buggy Code", "range(0, [bold red]len(lst)-1[/bold red], n)")
	        diag.add_row("Effect", "Final chunk silently dropped from result")
	        diag.add_row("Fix", "range(0, [bold green]len(lst)[/bold green], n)")
	        console.print(Panel(diag, title="[red bold]🔍 Root Cause Analysis[/red bold]", border_style="red"))
	        time.sleep(PAUSE_LONG)

	        # ── Coding ────────────────────────────────────────────────────────
	        phase_banner("PHASE 2 · CODING", "green")
	        thinking("Applying minimal surgical fix — one line change…")

	        # The replacement file is assembled from explicit lines so that its
	        # own indentation never depends on how this function is indented.
	        fixed_code = "\n".join([
	            '"""List utility operations."""',
	            "from typing import Any, List",
	            "",
	            "",
	            "def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:",
	            '    """Split lst into consecutive chunks of size n.',
	            "",
	            "    Example:",
	            "        >>> chunk_list([1, 2, 3, 4, 5], 2)",
	            "        [[1, 2], [3, 4], [5]]",
	            '    """',
	            "    if n <= 0:",
	            '        raise ValueError("Chunk size must be positive")',
	            "    return [lst[i : i + n] for i in range(0, len(lst), n)]  # ← fixed",
	            "",
	            "",
	            "def flatten(lst: List[List[Any]]) -> List[Any]:",
	            '    """Flatten a list of lists by one level."""',
	            "    return [item for sublist in lst for item in sublist]",
	            "",  # trailing element gives the file its final newline
	        ])
	        edit = EditFile(
	            file_path="utils/list_ops.py",
	            content=fixed_code,
	            reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
	        )
	        obs = env.step(edit)
	        log_line("CODE", "Wrote fix to [bold]utils/list_ops.py[/bold]", "green")
	        reward_line(obs.step_number, edit.type, obs.reward, obs.cumulative_reward)

	        # Locate the corrected line by content instead of a hard-coded number,
	        # so the highlight lands on the `return` statement that holds the fix.
	        fix_marker = "range(0, len(lst), n)"
	        fix_line_no = next(
	            (
	                line_no
	                for line_no, text in enumerate(fixed_code.splitlines(), start=1)
	                if fix_marker in text
	            ),
	            None,
	        )
	        highlight = {fix_line_no} if fix_line_no is not None else set()
	        console.print(Panel(
	            Syntax(fixed_code, "python", theme="monokai", line_numbers=True, highlight_lines=highlight),
	            title="[green]utils/list_ops.py ← fixed[/green]",
	            border_style="green",
	        ))
	        time.sleep(PAUSE_LONG)

	        # ── Testing ───────────────────────────────────────────────────────
	        phase_banner("PHASE 3 · TESTING", "cyan")
	        thinking("Running pytest suite…")

	        obs = env.step(RunTests(timeout_seconds=30))
	        tr = obs.test_results

	        # Scripted table: the names are hard-coded and every row is marked
	        # PASS, so it is only shown when the real run reported no failures.
	        test_table = Table(box=box.SIMPLE, show_header=True, header_style="bold")
	        test_table.add_column("Result", width=8)
	        test_table.add_column("Test Name", width=40)
	        test_names = [
	            "test_even_split", "test_odd_split", "test_chunk_larger_than_list",
	            "test_empty_list", "test_invalid_chunk_size", "test_basic", "test_empty",
	        ]
	        for name in test_names:
	            test_table.add_row("[green]✓ PASS[/green]", f"tests/test_list_ops.py::{name}")

	        fail_style = "red" if tr.failed else "dim"
	        log_line(
	            "TEST",
	            f"[bold green]{tr.passed} passed[/bold green] / "
	            f"[{fail_style}]{tr.failed} failed[/{fail_style}] "
	            f"in {tr.duration_seconds:.2f}s",
	            "cyan",
	        )
	        reward_line(obs.step_number, "run_tests", obs.reward, obs.cumulative_reward)
	        if not tr.failed:
	            console.print(Padding(test_table, (0, 4)))
	        time.sleep(PAUSE_MEDIUM)

	        # ── Lint ──────────────────────────────────────────────────────────
	        phase_banner("PHASE 4 · LINT", "bright_magenta")
	        obs = env.step(RunLint(fix=False))
	        # NOTE(review): this message is scripted and does not read the lint
	        # outcome from `obs`; the real lint score is shown by the grader below.
	        log_line("LINT", "[green]No violations found[/green] — lint score [bold]1.0000[/bold]", "bright_magenta")
	        reward_line(obs.step_number, "run_lint", obs.reward, obs.cumulative_reward)
	        time.sleep(PAUSE_SHORT)

	        # ── Review ────────────────────────────────────────────────────────
	        phase_banner("PHASE 5 · CODE REVIEW", "bright_yellow")
	        thinking("Generating structured code review…")

	        review_text = (
	            "ROOT CAUSE: Off-by-one error in range() stop argument. "
	            "The original range(0, len(lst)-1, n) terminates one step early, "
	            "causing the final chunk to be silently dropped. "
	            "FIX: range(0, len(lst), n) correctly covers the full index space. "
	            "EDGE CASES VERIFIED: empty list → []; chunk > list → [[full list]]; "
	            "exact division → equal chunks; uneven → last chunk is smaller. "
	            "STYLE: Replaced loop+append with list comprehension for clarity. "
	            "COMPLEXITY: O(n) time, O(n) space — unchanged. No regressions."
	        )
	        obs = env.step(GenerateReview(
	            focus_areas=["correctness", "off-by-one", "range", "complexity"],
	            review_text=review_text,
	        ))
	        log_line("REVIEW", "Review artifact written", "bright_yellow")
	        reward_line(obs.step_number, "generate_review", obs.reward, obs.cumulative_reward)
	        console.print(Panel(
	            f"[italic]{review_text}[/italic]",
	            title="[bright_yellow]📋 Agent Review[/bright_yellow]",
	            border_style="yellow",
	        ))
	        time.sleep(PAUSE_LONG)

	        # ── Reflect ───────────────────────────────────────────────────────
	        phase_banner("PHASE 6 · SELF-REFLECTION", "bright_cyan")
	        obs = env.step(SelfReflect(
	            what_went_well="Identified the off-by-one immediately from test_odd_split assertion name. Minimal one-line fix with zero side effects.",
	            what_to_improve="Should run lint in parallel with tests, not sequentially. Could have used list comprehension from the start.",
	            adjusted_plan=None,
	        ))
	        # Only the first 70 characters of each reflection field are shown.
	        log_line("REFLECT", f"[bold]Went well:[/bold] {obs.reflections[-1].what_went_well[:70]}…", "bright_cyan")
	        log_line("REFLECT", f"[bold]Improve: [/bold] {obs.reflections[-1].what_to_improve[:70]}…", "bright_cyan")
	        reward_line(obs.step_number, "self_reflect", obs.reward, obs.cumulative_reward)
	        time.sleep(PAUSE_MEDIUM)

	        # ── Commit ────────────────────────────────────────────────────────
	        phase_banner("PHASE 7 · COMMIT", "bright_green")
	        obs = env.step(Commit(message="fix(list_ops): correct off-by-one in chunk_list range() call"))
	        log_line("GIT", f"[green]Committed:[/green] {env._sandbox.get_log(1)[0]}", "bright_green")
	        reward_line(obs.step_number, "commit", obs.reward, obs.cumulative_reward)
	        time.sleep(PAUSE_SHORT)
	        console.print(f" [dim]Episode done: {obs.done}[/dim]")
	        time.sleep(PAUSE_MEDIUM)

	        # ── Grader ────────────────────────────────────────────────────────
	        phase_banner("PHASE 8 · GRADER", "bright_white")
	        thinking("Running deterministic grader…")
	        result = env.grade()
	        time.sleep(PAUSE_SHORT)

	        console.print()
	        for line in [
	            score_bar("Tests Passed (40%)", result.test_pass_rate),
	            score_bar("Lint Clean (25%)", result.lint_score),
	            score_bar("Efficiency (20%)", result.efficiency_score),
	            score_bar("Review Quality (10%)", result.review_quality),
	            score_bar("Reflection (5%)", result.reflection_quality),
	        ]:
	            console.print(f" {line}")
	            time.sleep(0.15)

	        console.print()
	        passed_badge = "[bold green]✓ PASSED[/bold green]" if result.passed else "[bold red]✗ FAILED[/bold red]"
	        console.print(Panel.fit(
	            f"[bold yellow]TeamForge Score: {result.final_score:.4f}[/bold yellow] · "
	            f"{passed_badge} · "
	            f"[dim]{result.total_steps} steps · task: easy_bugfix_chunk_list[/dim]",
	            border_style="bright_yellow",
	            padding=(0, 2),
	        ))
	        time.sleep(PAUSE_LONG)

	        # ── Artifact summary ──────────────────────────────────────────────
	        console.print()
	        console.rule("[dim]Team Artifacts Generated[/dim]", style="dim")
	        artifacts = [
	            ("📋 Plan", f"{len(obs.plan)} steps"),
	            ("📝 Reviews", f"{len(obs.reviews)} review(s)"),
	            ("🪞 Reflections", f"{len(obs.reflections)} reflection(s)"),
	            ("📦 Commits", f"{len(env._sandbox.get_log())} commit(s)"),
	        ]
	        for icon_label, val in artifacts:
	            console.print(f" {icon_label:25s} [bold]{val}[/bold]")

	        console.print()
	        console.rule("[bold bright_blue]TeamForge — github.com/yourname/teamforge[/bold bright_blue]", style="bright_blue")
	        console.print()
	    finally:
	        # Always release the sandbox, even when a step above raised. The
	        # getattr guard covers a failure before any sandbox was attached.
	        sandbox = getattr(env, "_sandbox", None)
	        if sandbox is not None:
	            sandbox.teardown()


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    run_demo()