#!/usr/bin/env python3 """ TeamForge Demo ============== A self-contained 90-second scripted demo that shows exactly what the environment does, without needing an API key. Run: python demo.py What it demonstrates: 1. Environment reset + repo snapshot 2. Agent planning phase (PLAN logs) 3. Bug detection + code fix (CODE logs) 4. Test execution — before and after fix 5. Lint pass 6. Review + reflection artifacts (REVIEW / REFLECT logs) 7. Commit 8. Grader output with final score """ from __future__ import annotations import time from rich.console import Console from rich.live import Live from rich.panel import Panel from rich.syntax import Syntax from rich.table import Table from rich.text import Text from rich import box from rich.columns import Columns from rich.rule import Rule from rich.padding import Padding from environment import TeamForgeEnv from models import ( Commit, EditFile, GenerateReview, PlanStep, RunLint, RunTests, SelfReflect, ) console = Console(width=100) PAUSE_SHORT = 0.6 PAUSE_MEDIUM = 1.0 PAUSE_LONG = 1.4 # ─── Visual helpers ─────────────────────────────────────────────────────────── def phase_banner(phase: str, color: str = "cyan") -> None: console.print() console.rule(f"[bold {color}]{phase}[/bold {color}]", style=color) console.print() time.sleep(PAUSE_SHORT) def log_line(tag: str, content: str, tag_color: str = "bright_blue") -> None: console.print(f" [{tag_color}][{tag}][/{tag_color}] {content}") time.sleep(0.08) def thinking(msg: str) -> None: console.print(f" [dim italic]🤔 {msg}[/dim italic]") time.sleep(PAUSE_SHORT) def reward_line(step: int, action: str, reward: float, cum: float) -> None: r_color = "green" if reward > 0 else ("red" if reward < -0.05 else "dim") console.print( f" [dim]step {step:02d}[/dim] " f"[bold]{action:22s}[/bold] " f"[{r_color}]{reward:+.4f}[/{r_color}] " f"[dim]cum={cum:+.4f}[/dim]" ) time.sleep(0.12) def score_bar(label: str, value: float, width: int = 22) -> str: filled = int(value * width) color = "green" if value >= 0.8 else ("yellow" if value >= 0.5 else "red") bar = f"[{color}]{'█' * filled}{'░' * (width - filled)}[/{color}]" return f"[bold]{label:22s}[/bold] {bar} [bold]{value:.4f}[/bold]" # ─── Demo ───────────────────────────────────────────────────────────────────── def run_demo() -> None: env = TeamForgeEnv() # ── Title ───────────────────────────────────────────────────────────────── console.print() console.print(Panel.fit( "[bold white]🏗 TeamForge[/bold white]\n" "[dim]OpenEnv Benchmark for Autonomous Software Engineering Agents[/dim]\n\n" "[italic]Plan · Code · Test · Review · Reflect[/italic]", border_style="bright_blue", padding=(1, 4), )) time.sleep(PAUSE_LONG) # ── Reset ───────────────────────────────────────────────────────────────── phase_banner("PHASE 0 · ENVIRONMENT INIT", "bright_blue") console.print(" Initialising isolated Git sandbox for task [bold]easy_bugfix_chunk_list[/bold] …") obs = env.reset("easy_bugfix_chunk_list") time.sleep(PAUSE_SHORT) console.print(f" [green]✓[/green] Git repo created: [dim]{env._sandbox.repo_path}[/dim]") console.print(f" [green]✓[/green] Files snapshotted: {[f.path for f in obs.repo_files]}") console.print() # Show the buggy file buggy_code = None for f in obs.repo_files: if f.path == "utils/list_ops.py": buggy_code = f.content if buggy_code: console.print(Panel( Syntax(buggy_code, "python", theme="monokai", line_numbers=True), title="[red]utils/list_ops.py ← contains bug[/red]", border_style="red", )) time.sleep(PAUSE_LONG) # ── Planning ────────────────────────────────────────────────────────────── phase_banner("PHASE 1 · PLANNING", "yellow") thinking("Reading source code and tests to diagnose the issue…") plans = [ PlanStep(step_number=1, description="Read chunk_list source; identify the range() bug", estimated_effort="low"), PlanStep(step_number=2, description="Fix range(0, len(lst)-1, n) → range(0, len(lst), n)", estimated_effort="low"), PlanStep(step_number=3, description="Run tests; verify all 7 pass", estimated_effort="low"), PlanStep(step_number=4, description="Run lint; fix any style violations", estimated_effort="low"), PlanStep(step_number=5, description="Write review documenting root cause and fix", estimated_effort="medium"), ] for p in plans: obs = env.step(p) log_line("PLAN", f"[{p.estimated_effort.upper():6s}] {p.description}", "yellow") reward_line(obs.step_number, p.type, obs.reward, obs.cumulative_reward) time.sleep(PAUSE_MEDIUM) # ── Diagnosis display ───────────────────────────────────────────────────── diag = Table(box=box.SIMPLE_HEAVY, show_header=False, border_style="red") diag.add_column("Key", style="bold red", width=18) diag.add_column("Value", style="white") diag.add_row("Bug Type", "Off-by-one error in range() stop argument") diag.add_row("Location", "utils/list_ops.py line 14") diag.add_row("Buggy Code", "range(0, [bold red]len(lst)-1[/bold red], n)") diag.add_row("Effect", "Final chunk silently dropped from result") diag.add_row("Fix", "range(0, [bold green]len(lst)[/bold green], n)") console.print(Panel(diag, title="[red bold]🔍 Root Cause Analysis[/red bold]", border_style="red")) time.sleep(PAUSE_LONG) # ── Coding ──────────────────────────────────────────────────────────────── phase_banner("PHASE 2 · CODING", "green") thinking("Applying minimal surgical fix — one line change…") fixed_code = '''\ """List utility operations.""" from typing import Any, List def chunk_list(lst: List[Any], n: int) -> List[List[Any]]: """Split *lst* into consecutive chunks of size *n*. Example: >>> chunk_list([1, 2, 3, 4, 5], 2) [[1, 2], [3, 4], [5]] """ if n <= 0: raise ValueError("Chunk size must be positive") return [lst[i : i + n] for i in range(0, len(lst), n)] # ← fixed def flatten(lst: List[List[Any]]) -> List[Any]: """Flatten a list of lists by one level.""" return [item for sublist in lst for item in sublist] ''' edit = EditFile( file_path="utils/list_ops.py", content=fixed_code, reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)", ) obs = env.step(edit) log_line("CODE", "Wrote fix to [bold]utils/list_ops.py[/bold]", "green") reward_line(obs.step_number, edit.type, obs.reward, obs.cumulative_reward) console.print(Panel( Syntax(fixed_code, "python", theme="monokai", line_numbers=True, highlight_lines={13}), title="[green]utils/list_ops.py ← fixed[/green]", border_style="green", )) time.sleep(PAUSE_LONG) # ── Testing ─────────────────────────────────────────────────────────────── phase_banner("PHASE 3 · TESTING", "cyan") thinking("Running pytest suite…") obs = env.step(RunTests(timeout_seconds=30)) tr = obs.test_results test_table = Table(box=box.SIMPLE, show_header=True, header_style="bold") test_table.add_column("Result", width=8) test_table.add_column("Test Name", width=40) test_names = [ "test_even_split", "test_odd_split", "test_chunk_larger_than_list", "test_empty_list", "test_invalid_chunk_size", "test_basic", "test_empty", ] for name in test_names: test_table.add_row("[green]✓ PASS[/green]", f"tests/test_list_ops.py::{name}") log_line("TEST", f"[bold green]{tr.passed} passed[/bold green] / " f"[{'red' if tr.failed else 'dim'}]{tr.failed} failed[/{'red' if tr.failed else 'dim'}] " f"in {tr.duration_seconds:.2f}s", "cyan") reward_line(obs.step_number, "run_tests", obs.reward, obs.cumulative_reward) console.print(Padding(test_table, (0, 4))) time.sleep(PAUSE_MEDIUM) # ── Lint ────────────────────────────────────────────────────────────────── phase_banner("PHASE 4 · LINT", "bright_magenta") obs = env.step(RunLint(fix=False)) log_line("LINT", "[green]No violations found[/green] — lint score [bold]1.0000[/bold]", "bright_magenta") reward_line(obs.step_number, "run_lint", obs.reward, obs.cumulative_reward) time.sleep(PAUSE_SHORT) # ── Review ──────────────────────────────────────────────────────────────── phase_banner("PHASE 5 · CODE REVIEW", "bright_yellow") thinking("Generating structured code review…") review_text = ( "ROOT CAUSE: Off-by-one error in range() stop argument. " "The original range(0, len(lst)-1, n) terminates one step early, " "causing the final chunk to be silently dropped. " "FIX: range(0, len(lst), n) correctly covers the full index space. " "EDGE CASES VERIFIED: empty list → []; chunk > list → [[full list]]; " "exact division → equal chunks; uneven → last chunk is smaller. " "STYLE: Replaced loop+append with list comprehension for clarity. " "COMPLEXITY: O(n) time, O(n) space — unchanged. No regressions." ) obs = env.step(GenerateReview( focus_areas=["correctness", "off-by-one", "range", "complexity"], review_text=review_text, )) log_line("REVIEW", "Review artifact written", "bright_yellow") reward_line(obs.step_number, "generate_review", obs.reward, obs.cumulative_reward) console.print(Panel( f"[italic]{review_text}[/italic]", title="[bright_yellow]📋 Agent Review[/bright_yellow]", border_style="yellow", )) time.sleep(PAUSE_LONG) # ── Reflect ─────────────────────────────────────────────────────────────── phase_banner("PHASE 6 · SELF-REFLECTION", "bright_cyan") obs = env.step(SelfReflect( what_went_well="Identified the off-by-one immediately from test_odd_split assertion name. Minimal one-line fix with zero side effects.", what_to_improve="Should run lint in parallel with tests, not sequentially. Could have used list comprehension from the start.", adjusted_plan=None, )) log_line("REFLECT", f"[bold]Went well:[/bold] {obs.reflections[-1].what_went_well[:70]}…", "bright_cyan") log_line("REFLECT", f"[bold]Improve: [/bold] {obs.reflections[-1].what_to_improve[:70]}…", "bright_cyan") reward_line(obs.step_number, "self_reflect", obs.reward, obs.cumulative_reward) time.sleep(PAUSE_MEDIUM) # ── Commit ──────────────────────────────────────────────────────────────── phase_banner("PHASE 7 · COMMIT", "bright_green") obs = env.step(Commit(message="fix(list_ops): correct off-by-one in chunk_list range() call")) log_line("GIT", f"[green]Committed:[/green] {env._sandbox.get_log(1)[0]}", "bright_green") reward_line(obs.step_number, "commit", obs.reward, obs.cumulative_reward) time.sleep(PAUSE_SHORT) console.print(f" [dim]Episode done: {obs.done}[/dim]") time.sleep(PAUSE_MEDIUM) # ── Grader ──────────────────────────────────────────────────────────────── phase_banner("PHASE 8 · GRADER", "bright_white") thinking("Running deterministic grader…") result = env.grade() time.sleep(PAUSE_SHORT) console.print() for line in [ score_bar("Tests Passed (40%)", result.test_pass_rate), score_bar("Lint Clean (25%)", result.lint_score), score_bar("Efficiency (20%)", result.efficiency_score), score_bar("Review Quality (10%)", result.review_quality), score_bar("Reflection (5%)", result.reflection_quality), ]: console.print(f" {line}") time.sleep(0.15) console.print() passed_badge = "[bold green]✓ PASSED[/bold green]" if result.passed else "[bold red]✗ FAILED[/bold red]" console.print(Panel.fit( f"[bold yellow]TeamForge Score: {result.final_score:.4f}[/bold yellow] · " f"{passed_badge} · " f"[dim]{result.total_steps} steps · task: easy_bugfix_chunk_list[/dim]", border_style="bright_yellow", padding=(0, 2), )) time.sleep(PAUSE_LONG) # ── Artifact summary ────────────────────────────────────────────────────── console.print() console.rule("[dim]Team Artifacts Generated[/dim]", style="dim") artifacts = [ ("📋 Plan", f"{len(obs.plan)} steps"), ("📝 Reviews", f"{len(obs.reviews)} review(s)"), ("🪞 Reflections",f"{len(obs.reflections)} reflection(s)"), ("📦 Commits", f"{len(env._sandbox.get_log())} commit(s)"), ] for icon_label, val in artifacts: console.print(f" {icon_label:25s} [bold]{val}[/bold]") console.print() console.rule("[bold bright_blue]TeamForge — github.com/yourname/teamforge[/bold bright_blue]", style="bright_blue") console.print() env._sandbox.teardown() if __name__ == "__main__": run_demo()