teamforge / demo.py
Your Name
fix: add FastAPI REST endpoints for OpenEnv validator
637f42c
#!/usr/bin/env python3
"""
TeamForge Demo
==============
A self-contained 90-second scripted demo that shows exactly what the
environment does, without needing an API key.
Run:
python demo.py
What it demonstrates:
1. Environment reset + repo snapshot
2. Agent planning phase (PLAN logs)
3. Bug detection + code fix (CODE logs)
4. Test execution — before and after fix
5. Lint pass
6. Review + reflection artifacts (REVIEW / REFLECT logs)
7. Commit
8. Grader output with final score
"""
from __future__ import annotations
import time
from rich.console import Console
from rich.live import Live
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from rich.text import Text
from rich import box
from rich.columns import Columns
from rich.rule import Rule
from rich.padding import Padding
from environment import TeamForgeEnv
from models import (
Commit, EditFile, GenerateReview, PlanStep,
RunLint, RunTests, SelfReflect,
)
# Shared Rich console used by every helper in this file; output is rendered
# at a fixed width of 100 columns.
console = Console(width=100)
# Pause lengths, in seconds, passed to time.sleep() to pace the demo output.
PAUSE_SHORT = 0.6
PAUSE_MEDIUM = 1.0
PAUSE_LONG = 1.4
# ─── Visual helpers ───────────────────────────────────────────────────────────
def phase_banner(phase: str, color: str = "cyan") -> None:
    """Announce a new demo phase with a coloured horizontal rule.

    Args:
        phase: Title text shown in the middle of the rule.
        color: Rich colour name applied to both the title and the rule line.

    The rule is padded with one blank line above and below, and the function
    sleeps for ``PAUSE_SHORT`` seconds afterwards.
    """
    title = f"[bold {color}]{phase}[/bold {color}]"
    console.print()
    console.rule(title, style=color)
    console.print()
    time.sleep(PAUSE_SHORT)
def log_line(tag: str, content: str, tag_color: str = "bright_blue") -> None:
    """Print one ``[TAG] content`` log line and pause for 0.08 seconds.

    Args:
        tag: Label shown in square brackets (callers in this file pass
            upper-case tags such as ``PLAN`` or ``CODE``).
        content: Message text; it is printed as Rich markup.
        tag_color: Rich colour name applied to the bracketed tag only.
    """
    prefix = f"[{tag_color}][{tag}][/{tag_color}]"
    console.print(f" {prefix} {content}")
    time.sleep(0.08)
def thinking(msg: str) -> None:
    """Print a dim, italic "agent is thinking" line and pause briefly.

    Args:
        msg: Text shown after the thinking-face emoji; printed as Rich markup.

    Sleeps for ``PAUSE_SHORT`` seconds after printing.
    """
    # The emoji had been corrupted into a mojibake byte sequence; restored.
    console.print(f" [dim italic]🤔 {msg}[/dim italic]")
    time.sleep(PAUSE_SHORT)
def reward_line(step: int, action: str, reward: float, cum: float) -> None:
    """Print one per-step reward row and pause for 0.12 seconds.

    Args:
        step: Step number, rendered zero-padded to two digits.
        action: Action name, left-aligned in a 22-character column.
        reward: Reward for this step, shown signed with four decimals.
        cum: Cumulative reward so far, shown signed with four decimals.

    The reward is coloured green when positive, red when below -0.05, and
    dim otherwise.
    """
    if reward > 0:
        r_color = "green"
    elif reward < -0.05:
        r_color = "red"
    else:
        r_color = "dim"

    row = (
        f" [dim]step {step:02d}[/dim] "
        f"[bold]{action:22s}[/bold] "
        f"[{r_color}]{reward:+.4f}[/{r_color}] "
        f"[dim]cum={cum:+.4f}[/dim]"
    )
    console.print(row)
    time.sleep(0.12)
def score_bar(label: str, value: float, width: int = 22) -> str:
    """Return a Rich-markup line showing *value* as a horizontal bar.

    Args:
        label: Row label, left-aligned in a 22-character bold column.
        value: Score to display; the bar treats it as a fraction of 1.0.
        width: Total number of cells in the bar.

    Returns:
        A markup string of the form ``label  bar  value`` where the bar is
        green for values >= 0.8, yellow for values >= 0.5 and red otherwise.
        The numeric value is printed unmodified with four decimals.
    """
    full_cell = "\u2588"   # FULL BLOCK
    empty_cell = "\u2591"  # LIGHT SHADE

    # Clamp only the fill fraction so an out-of-range score cannot make the
    # bar shorter or longer than *width* cells.
    fraction = min(1.0, max(0.0, value))
    filled = int(fraction * width)

    color = "green" if value >= 0.8 else ("yellow" if value >= 0.5 else "red")
    cells = full_cell * filled + empty_cell * (width - filled)
    bar = f"[{color}]{cells}[/{color}]"
    return f"[bold]{label:22s}[/bold] {bar} [bold]{value:.4f}[/bold]"
# ─── Demo ─────────────────────────────────────────────────────────────────────
# Corrected contents of utils/list_ops.py that the scripted agent writes in
# the coding phase.  Kept at module level so the embedded source starts at
# column 0 and is written to the sandbox as valid, properly indented Python.
_FIXED_LIST_OPS = '''\
"""List utility operations."""
from typing import Any, List


def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:
    """Split *lst* into consecutive chunks of size *n*.

    Example:
        >>> chunk_list([1, 2, 3, 4, 5], 2)
        [[1, 2], [3, 4], [5]]
    """
    if n <= 0:
        raise ValueError("Chunk size must be positive")
    return [lst[i : i + n] for i in range(0, len(lst), n)]  # ← fixed


def flatten(lst: List[List[Any]]) -> List[Any]:
    """Flatten a list of lists by one level."""
    return [item for sublist in lst for item in sublist]
'''


def _run_scripted_episode(env: TeamForgeEnv) -> None:
    """Play the fixed demo script against *env* and render every phase.

    The script resets the environment on the ``easy_bugfix_chunk_list`` task,
    then issues plan, edit, test, lint, review, reflect and commit actions in
    that order, and finally prints the grader result and an artifact summary.
    Cleanup of the sandbox is left to the caller.
    """
    task_id = "easy_bugfix_chunk_list"

    # ── Title ────────────────────────────────────────────────────────────────
    console.print()
    console.print(Panel.fit(
        "[bold white]🏗 TeamForge[/bold white]\n"
        "[dim]OpenEnv Benchmark for Autonomous Software Engineering Agents[/dim]\n\n"
        "[italic]Plan · Code · Test · Review · Reflect[/italic]",
        border_style="bright_blue",
        padding=(1, 4),
    ))
    time.sleep(PAUSE_LONG)

    # ── Reset ────────────────────────────────────────────────────────────────
    phase_banner("PHASE 0 · ENVIRONMENT INIT", "bright_blue")
    console.print(f" Initialising isolated Git sandbox for task [bold]{task_id}[/bold] …")
    obs = env.reset(task_id)
    time.sleep(PAUSE_SHORT)
    console.print(f" [green]✓[/green] Git repo created: [dim]{env._sandbox.repo_path}[/dim]")
    console.print(f" [green]✓[/green] Files snapshotted: {[f.path for f in obs.repo_files]}")
    console.print()

    # Show the buggy file, if the snapshot contains it.
    buggy_code = next(
        (f.content for f in obs.repo_files if f.path == "utils/list_ops.py"),
        None,
    )
    if buggy_code:
        console.print(Panel(
            Syntax(buggy_code, "python", theme="monokai", line_numbers=True),
            title="[red]utils/list_ops.py ← contains bug[/red]",
            border_style="red",
        ))
    time.sleep(PAUSE_LONG)

    # ── Planning ─────────────────────────────────────────────────────────────
    phase_banner("PHASE 1 · PLANNING", "yellow")
    thinking("Reading source code and tests to diagnose the issue…")
    plans = [
        PlanStep(step_number=1, description="Read chunk_list source; identify the range() bug", estimated_effort="low"),
        PlanStep(step_number=2, description="Fix range(0, len(lst)-1, n) → range(0, len(lst), n)", estimated_effort="low"),
        PlanStep(step_number=3, description="Run tests; verify all 7 pass", estimated_effort="low"),
        PlanStep(step_number=4, description="Run lint; fix any style violations", estimated_effort="low"),
        PlanStep(step_number=5, description="Write review documenting root cause and fix", estimated_effort="medium"),
    ]
    for p in plans:
        obs = env.step(p)
        log_line("PLAN", f"[{p.estimated_effort.upper():6s}] {p.description}", "yellow")
        reward_line(obs.step_number, p.type, obs.reward, obs.cumulative_reward)
    time.sleep(PAUSE_MEDIUM)

    # ── Diagnosis display ────────────────────────────────────────────────────
    diag = Table(box=box.SIMPLE_HEAVY, show_header=False, border_style="red")
    diag.add_column("Key", style="bold red", width=18)
    diag.add_column("Value", style="white")
    diag.add_row("Bug Type", "Off-by-one error in range() stop argument")
    diag.add_row("Location", "utils/list_ops.py line 14")
    diag.add_row("Buggy Code", "range(0, [bold red]len(lst)-1[/bold red], n)")
    diag.add_row("Effect", "Final chunk silently dropped from result")
    diag.add_row("Fix", "range(0, [bold green]len(lst)[/bold green], n)")
    console.print(Panel(diag, title="[red bold]🔍 Root Cause Analysis[/red bold]", border_style="red"))
    time.sleep(PAUSE_LONG)

    # ── Coding ───────────────────────────────────────────────────────────────
    phase_banner("PHASE 2 · CODING", "green")
    thinking("Applying minimal surgical fix — one line change…")
    fixed_code = _FIXED_LIST_OPS
    edit = EditFile(
        file_path="utils/list_ops.py",
        content=fixed_code,
        reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
    )
    obs = env.step(edit)
    log_line("CODE", "Wrote fix to [bold]utils/list_ops.py[/bold]", "green")
    reward_line(obs.step_number, edit.type, obs.reward, obs.cumulative_reward)

    # Locate the corrected line by content rather than hard-coding its
    # number, so the highlight stays on the fix if the snippet is reflowed.
    fix_marker = "range(0, len(lst), n)"
    fix_line = next(
        (
            line_no
            for line_no, text in enumerate(fixed_code.splitlines(), start=1)
            if fix_marker in text
        ),
        None,
    )
    console.print(Panel(
        Syntax(
            fixed_code,
            "python",
            theme="monokai",
            line_numbers=True,
            highlight_lines={fix_line} if fix_line is not None else set(),
        ),
        title="[green]utils/list_ops.py ← fixed[/green]",
        border_style="green",
    ))
    time.sleep(PAUSE_LONG)

    # ── Testing ──────────────────────────────────────────────────────────────
    phase_banner("PHASE 3 · TESTING", "cyan")
    thinking("Running pytest suite…")
    obs = env.step(RunTests(timeout_seconds=30))
    tr = obs.test_results
    test_table = Table(box=box.SIMPLE, show_header=True, header_style="bold")
    test_table.add_column("Result", width=8)
    test_table.add_column("Test Name", width=40)
    # NOTE(review): the rows below are scripted and always show PASS; only the
    # summary log line reflects the counts reported in ``tr``.
    test_names = [
        "test_even_split", "test_odd_split", "test_chunk_larger_than_list",
        "test_empty_list", "test_invalid_chunk_size", "test_basic", "test_empty",
    ]
    for name in test_names:
        test_table.add_row("[green]✓ PASS[/green]", f"tests/test_list_ops.py::{name}")
    fail_color = "red" if tr.failed else "dim"
    log_line(
        "TEST",
        f"[bold green]{tr.passed} passed[/bold green] / "
        f"[{fail_color}]{tr.failed} failed[/{fail_color}] "
        f"in {tr.duration_seconds:.2f}s",
        "cyan",
    )
    reward_line(obs.step_number, "run_tests", obs.reward, obs.cumulative_reward)
    console.print(Padding(test_table, (0, 4)))
    time.sleep(PAUSE_MEDIUM)

    # ── Lint ─────────────────────────────────────────────────────────────────
    phase_banner("PHASE 4 · LINT", "bright_magenta")
    obs = env.step(RunLint(fix=False))
    # NOTE(review): this message is scripted; it is not derived from ``obs``.
    log_line("LINT", "[green]No violations found[/green] — lint score [bold]1.0000[/bold]", "bright_magenta")
    reward_line(obs.step_number, "run_lint", obs.reward, obs.cumulative_reward)
    time.sleep(PAUSE_SHORT)

    # ── Review ───────────────────────────────────────────────────────────────
    phase_banner("PHASE 5 · CODE REVIEW", "bright_yellow")
    thinking("Generating structured code review…")
    review_text = (
        "ROOT CAUSE: Off-by-one error in range() stop argument. "
        "The original range(0, len(lst)-1, n) terminates one step early, "
        "causing the final chunk to be silently dropped. "
        "FIX: range(0, len(lst), n) correctly covers the full index space. "
        "EDGE CASES VERIFIED: empty list → []; chunk > list → [[full list]]; "
        "exact division → equal chunks; uneven → last chunk is smaller. "
        "STYLE: Replaced loop+append with list comprehension for clarity. "
        "COMPLEXITY: O(n) time, O(n) space — unchanged. No regressions."
    )
    obs = env.step(GenerateReview(
        focus_areas=["correctness", "off-by-one", "range", "complexity"],
        review_text=review_text,
    ))
    log_line("REVIEW", "Review artifact written", "bright_yellow")
    reward_line(obs.step_number, "generate_review", obs.reward, obs.cumulative_reward)
    # Render the review as plain styled Text rather than markup: the text
    # contains "[[full list]]", which Rich markup would parse as a style tag
    # and drop from the output.
    console.print(Panel(
        Text(review_text, style="italic"),
        title="[bright_yellow]📋 Agent Review[/bright_yellow]",
        border_style="yellow",
    ))
    time.sleep(PAUSE_LONG)

    # ── Reflect ──────────────────────────────────────────────────────────────
    phase_banner("PHASE 6 · SELF-REFLECTION", "bright_cyan")
    obs = env.step(SelfReflect(
        what_went_well="Identified the off-by-one immediately from test_odd_split assertion name. Minimal one-line fix with zero side effects.",
        what_to_improve="Should run lint in parallel with tests, not sequentially. Could have used list comprehension from the start.",
        adjusted_plan=None,
    ))
    latest = obs.reflections[-1]
    log_line("REFLECT", f"[bold]Went well:[/bold] {latest.what_went_well[:70]}…", "bright_cyan")
    log_line("REFLECT", f"[bold]Improve: [/bold] {latest.what_to_improve[:70]}…", "bright_cyan")
    reward_line(obs.step_number, "self_reflect", obs.reward, obs.cumulative_reward)
    time.sleep(PAUSE_MEDIUM)

    # ── Commit ───────────────────────────────────────────────────────────────
    phase_banner("PHASE 7 · COMMIT", "bright_green")
    obs = env.step(Commit(message="fix(list_ops): correct off-by-one in chunk_list range() call"))
    log_line("GIT", f"[green]Committed:[/green] {env._sandbox.get_log(1)[0]}", "bright_green")
    reward_line(obs.step_number, "commit", obs.reward, obs.cumulative_reward)
    time.sleep(PAUSE_SHORT)
    console.print(f" [dim]Episode done: {obs.done}[/dim]")
    time.sleep(PAUSE_MEDIUM)

    # ── Grader ───────────────────────────────────────────────────────────────
    phase_banner("PHASE 8 · GRADER", "bright_white")
    thinking("Running deterministic grader…")
    result = env.grade()
    time.sleep(PAUSE_SHORT)
    console.print()
    score_rows = [
        score_bar("Tests Passed (40%)", result.test_pass_rate),
        score_bar("Lint Clean (25%)", result.lint_score),
        score_bar("Efficiency (20%)", result.efficiency_score),
        score_bar("Review Quality (10%)", result.review_quality),
        score_bar("Reflection (5%)", result.reflection_quality),
    ]
    for line in score_rows:
        console.print(f" {line}")
        time.sleep(0.15)
    console.print()
    passed_badge = "[bold green]✓ PASSED[/bold green]" if result.passed else "[bold red]✗ FAILED[/bold red]"
    console.print(Panel.fit(
        f"[bold yellow]TeamForge Score: {result.final_score:.4f}[/bold yellow] · "
        f"{passed_badge} · "
        f"[dim]{result.total_steps} steps · task: {task_id}[/dim]",
        border_style="bright_yellow",
        padding=(0, 2),
    ))
    time.sleep(PAUSE_LONG)

    # ── Artifact summary ─────────────────────────────────────────────────────
    console.print()
    console.rule("[dim]Team Artifacts Generated[/dim]", style="dim")
    artifacts = [
        ("📋 Plan", f"{len(obs.plan)} steps"),
        ("📝 Reviews", f"{len(obs.reviews)} review(s)"),
        ("🪞 Reflections", f"{len(obs.reflections)} reflection(s)"),
        ("📦 Commits", f"{len(env._sandbox.get_log())} commit(s)"),
    ]
    for icon_label, val in artifacts:
        console.print(f" {icon_label:25s} [bold]{val}[/bold]")
    console.print()
    console.rule("[bold bright_blue]TeamForge — github.com/yourname/teamforge[/bold bright_blue]", style="bright_blue")
    console.print()


def run_demo() -> None:
    """Run the scripted TeamForge demo end to end.

    Creates a ``TeamForgeEnv``, plays the fixed episode script against it and
    renders each phase to the shared console.  The environment's sandbox is
    torn down in a ``finally`` clause so it is released even when a step
    raises or the demo is interrupted part-way through.
    """
    env = TeamForgeEnv()
    try:
        _run_scripted_episode(env)
    finally:
        # The sandbox attribute may not be populated if the failure happened
        # before the environment was reset, so look it up defensively.
        sandbox = getattr(env, "_sandbox", None)
        if sandbox is not None:
            sandbox.teardown()
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run_demo()