Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| TeamForge Demo | |
| ============== | |
| A self-contained 90-second scripted demo that shows exactly what the | |
| environment does, without needing an API key. | |
| Run: | |
| python demo.py | |
| What it demonstrates: | |
| 1. Environment reset + repo snapshot | |
| 2. Agent planning phase (PLAN logs) | |
| 3. Bug detection + code fix (CODE logs) | |
| 4. Test execution — before and after fix | |
| 5. Lint pass | |
| 6. Review + reflection artifacts (REVIEW / REFLECT logs) | |
| 7. Commit | |
| 8. Grader output with final score | |
| """ | |
| from __future__ import annotations | |
| import time | |
| from rich.console import Console | |
| from rich.live import Live | |
| from rich.panel import Panel | |
| from rich.syntax import Syntax | |
| from rich.table import Table | |
| from rich.text import Text | |
| from rich import box | |
| from rich.columns import Columns | |
| from rich.rule import Rule | |
| from rich.padding import Padding | |
| from environment import TeamForgeEnv | |
| from models import ( | |
| Commit, EditFile, GenerateReview, PlanStep, | |
| RunLint, RunTests, SelfReflect, | |
| ) | |
# One shared Rich console, fixed at 100 columns so every panel and rule in the
# demo lines up the same way regardless of the terminal it runs in.
console = Console(width=100)

# Pacing (in seconds) between demo beats: short, medium and long pauses.
PAUSE_SHORT, PAUSE_MEDIUM, PAUSE_LONG = 0.6, 1.0, 1.4
| # ─── Visual helpers ────────────────────────────────────────────────────────── | |
def phase_banner(phase: str, color: str = "cyan") -> None:
    """Announce a new demo phase with a coloured horizontal rule.

    Args:
        phase: Title rendered in bold inside the rule.
        color: Rich colour name applied to both the title and the rule line.
    """
    heading = f"[bold {color}]{phase}[/bold {color}]"
    console.print()
    console.rule(heading, style=color)
    console.print()
    # Brief beat so the viewer registers the phase change.
    time.sleep(PAUSE_SHORT)
def log_line(tag: str, content: str, tag_color: str = "bright_blue") -> None:
    """Print one tagged log entry, then pause briefly.

    Args:
        tag: Short label shown in square brackets (e.g. ``PLAN``, ``CODE``).
        content: Message text; may itself contain Rich markup.
        tag_color: Rich colour name used for the bracketed tag.
    """
    prefix = f"[{tag_color}][{tag}][/{tag_color}]"
    console.print(f" {prefix} {content}")
    # Tiny delay so consecutive log lines appear to stream in.
    time.sleep(0.08)
def thinking(msg: str) -> None:
    """Print a dimmed, italic "agent is thinking" line, then pause.

    Args:
        msg: Description of what the scripted agent is pondering.
    """
    # NOTE(review): the emoji was reconstructed from a mis-decoded copy of this
    # file (surviving bytes F0 .. A4 ..); confirm it is the intended glyph.
    console.print(f" [dim italic]🤔 {msg}[/dim italic]")
    time.sleep(PAUSE_SHORT)
def reward_line(step: int, action: str, reward: float, cum: float) -> None:
    """Print a one-line reward summary for a single environment step.

    Args:
        step: Step index, zero-padded to two digits.
        action: Action name, left-aligned in a 22-character column.
        reward: Reward for this step; shown signed with four decimals.
        cum: Cumulative reward so far; shown signed with four decimals.
    """
    # Positive rewards are green, clearly negative ones red, the rest dimmed.
    if reward > 0:
        r_color = "green"
    elif reward < -0.05:
        r_color = "red"
    else:
        r_color = "dim"

    segments = (
        f" [dim]step {step:02d}[/dim] ",
        f"[bold]{action:22s}[/bold] ",
        f"[{r_color}]{reward:+.4f}[/{r_color}] ",
        f"[dim]cum={cum:+.4f}[/dim]",
    )
    console.print("".join(segments))
    time.sleep(0.12)
def score_bar(label: str, value: float, width: int = 22) -> str:
    """Return a Rich-markup progress bar line for one grader component.

    Args:
        label: Component name, left-aligned in a 22-character column.
        value: Score for the component, nominally in ``[0, 1]``.
        width: Total number of bar cells.

    Returns:
        A markup string: bold label, coloured bar, and the score formatted to
        four decimals. The bar is green at >= 0.8, yellow at >= 0.5, else red.
    """
    # Clamp only the fill ratio so an out-of-range score can neither overflow
    # the bar nor yield a negative cell count; the number shown stays as given.
    ratio = min(max(value, 0.0), 1.0)
    filled = int(ratio * width)
    color = "green" if value >= 0.8 else ("yellow" if value >= 0.5 else "red")
    # Distinct glyphs for filled vs. empty cells; with a single shared glyph
    # the bar would look identical for every score.
    bar = f"[{color}]{'█' * filled}{'░' * (width - filled)}[/{color}]"
    return f"[bold]{label:22s}[/bold] {bar} [bold]{value:.4f}[/bold]"
| # ─── Demo ──────────────────────────────────────────────────────────────────── | |
def run_demo() -> None:
    """Run the scripted TeamForge walkthrough and print every phase.

    Creates a ``TeamForgeEnv``, plays the fixed episode (plan, edit, test,
    lint, review, reflect, commit, grade) and always tears the sandbox down
    afterwards, even if a step raises part-way through.
    """
    env = TeamForgeEnv()
    try:
        _play_episode(env)
    finally:
        # Clean up on failure as well as on success so an aborted demo does
        # not leave its sandbox behind. The attribute is looked up
        # defensively because reset() may have failed before it was set.
        sandbox = getattr(env, "_sandbox", None)
        if sandbox is not None:
            sandbox.teardown()


def _play_episode(env: TeamForgeEnv) -> None:
    """Drive *env* through the fixed demo script, printing as it goes."""
    # NOTE(review): non-ASCII glyphs in the strings below were reconstructed
    # from a mis-decoded copy of this file. The punctuation (· … — → ✓ ✗) is
    # determined by the surviving bytes and context; the emoji on the title,
    # root-cause panel and plan/review entries are best guesses - confirm
    # against the original source.

    # ── Title ────────────────────────────────────────────────────────────────
    console.print()
    console.print(Panel.fit(
        "[bold white]🚀 TeamForge[/bold white]\n"
        "[dim]OpenEnv Benchmark for Autonomous Software Engineering Agents[/dim]\n\n"
        "[italic]Plan · Code · Test · Review · Reflect[/italic]",
        border_style="bright_blue",
        padding=(1, 4),
    ))
    time.sleep(PAUSE_LONG)

    # ── Reset ────────────────────────────────────────────────────────────────
    phase_banner("PHASE 0 · ENVIRONMENT INIT", "bright_blue")
    console.print(" Initialising isolated Git sandbox for task [bold]easy_bugfix_chunk_list[/bold] …")
    obs = env.reset("easy_bugfix_chunk_list")
    time.sleep(PAUSE_SHORT)
    console.print(f" [green]✓[/green] Git repo created: [dim]{env._sandbox.repo_path}[/dim]")
    console.print(f" [green]✓[/green] Files snapshotted: {[f.path for f in obs.repo_files]}")
    console.print()

    # Show the buggy file
    buggy_code = None
    for f in obs.repo_files:
        if f.path == "utils/list_ops.py":
            buggy_code = f.content
    if buggy_code:
        console.print(Panel(
            Syntax(buggy_code, "python", theme="monokai", line_numbers=True),
            title="[red]utils/list_ops.py — contains bug[/red]",
            border_style="red",
        ))
    time.sleep(PAUSE_LONG)

    # ── Planning ─────────────────────────────────────────────────────────────
    phase_banner("PHASE 1 · PLANNING", "yellow")
    thinking("Reading source code and tests to diagnose the issue…")
    plans = [
        PlanStep(step_number=1, description="Read chunk_list source; identify the range() bug", estimated_effort="low"),
        PlanStep(step_number=2, description="Fix range(0, len(lst)-1, n) → range(0, len(lst), n)", estimated_effort="low"),
        PlanStep(step_number=3, description="Run tests; verify all 7 pass", estimated_effort="low"),
        PlanStep(step_number=4, description="Run lint; fix any style violations", estimated_effort="low"),
        PlanStep(step_number=5, description="Write review documenting root cause and fix", estimated_effort="medium"),
    ]
    for p in plans:
        obs = env.step(p)
        log_line("PLAN", f"[{p.estimated_effort.upper():6s}] {p.description}", "yellow")
        reward_line(obs.step_number, p.type, obs.reward, obs.cumulative_reward)
    time.sleep(PAUSE_MEDIUM)

    # ── Diagnosis display ────────────────────────────────────────────────────
    diag = Table(box=box.SIMPLE_HEAVY, show_header=False, border_style="red")
    diag.add_column("Key", style="bold red", width=18)
    diag.add_column("Value", style="white")
    diag.add_row("Bug Type", "Off-by-one error in range() stop argument")
    diag.add_row("Location", "utils/list_ops.py line 14")
    diag.add_row("Buggy Code", "range(0, [bold red]len(lst)-1[/bold red], n)")
    diag.add_row("Effect", "Final chunk silently dropped from result")
    diag.add_row("Fix", "range(0, [bold green]len(lst)[/bold green], n)")
    console.print(Panel(diag, title="[red bold]🔍 Root Cause Analysis[/red bold]", border_style="red"))
    time.sleep(PAUSE_LONG)

    # ── Coding ───────────────────────────────────────────────────────────────
    phase_banner("PHASE 2 · CODING", "green")
    thinking("Applying minimal surgical fix — one line change…")
    # Full replacement content for the buggy module. The text starts at
    # column 0 because it is written to the sandbox verbatim.
    fixed_code = '''\
"""List utility operations."""

from typing import Any, List


def chunk_list(lst: List[Any], n: int) -> List[List[Any]]:
    """Split *lst* into consecutive chunks of size *n*.

    Example:
        >>> chunk_list([1, 2, 3, 4, 5], 2)
        [[1, 2], [3, 4], [5]]
    """
    if n <= 0:
        raise ValueError("Chunk size must be positive")
    return [lst[i : i + n] for i in range(0, len(lst), n)]  # ← fixed


def flatten(lst: List[List[Any]]) -> List[Any]:
    """Flatten a list of lists by one level."""
    return [item for sublist in lst for item in sublist]
'''
    edit = EditFile(
        file_path="utils/list_ops.py",
        content=fixed_code,
        reason="Fix off-by-one: range stop was len(lst)-1, should be len(lst)",
    )
    obs = env.step(edit)
    log_line("CODE", "Wrote fix to [bold]utils/list_ops.py[/bold]", "green")
    reward_line(obs.step_number, edit.type, obs.reward, obs.cumulative_reward)
    # Find the highlighted line from its marker instead of hard-coding a line
    # number, so the highlight keeps tracking the fix if the snippet changes.
    changed_lines = {
        number
        for number, text in enumerate(fixed_code.splitlines(), start=1)
        if "← fixed" in text
    }
    console.print(Panel(
        Syntax(fixed_code, "python", theme="monokai", line_numbers=True, highlight_lines=changed_lines),
        title="[green]utils/list_ops.py — fixed[/green]",
        border_style="green",
    ))
    time.sleep(PAUSE_LONG)

    # ── Testing ──────────────────────────────────────────────────────────────
    phase_banner("PHASE 3 · TESTING", "cyan")
    thinking("Running pytest suite…")
    obs = env.step(RunTests(timeout_seconds=30))
    tr = obs.test_results
    test_table = Table(box=box.SIMPLE, show_header=True, header_style="bold")
    test_table.add_column("Result", width=8)
    test_table.add_column("Test Name", width=40)
    test_names = [
        "test_even_split", "test_odd_split", "test_chunk_larger_than_list",
        "test_empty_list", "test_invalid_chunk_size", "test_basic", "test_empty",
    ]
    for name in test_names:
        test_table.add_row("[green]✓ PASS[/green]", f"tests/test_list_ops.py::{name}")
    fail_style = "red" if tr.failed else "dim"
    log_line(
        "TEST",
        f"[bold green]{tr.passed} passed[/bold green] / "
        f"[{fail_style}]{tr.failed} failed[/{fail_style}] "
        f"in {tr.duration_seconds:.2f}s",
        "cyan",
    )
    reward_line(obs.step_number, "run_tests", obs.reward, obs.cumulative_reward)
    # The per-test rows are scripted rather than read from the results, so
    # show the all-PASS table only when the run really reported no failures.
    if not tr.failed:
        console.print(Padding(test_table, (0, 4)))
    time.sleep(PAUSE_MEDIUM)

    # ── Lint ─────────────────────────────────────────────────────────────────
    phase_banner("PHASE 4 · LINT", "bright_magenta")
    obs = env.step(RunLint(fix=False))
    # NOTE(review): this message is scripted; it is not derived from the
    # RunLint observation.
    log_line("LINT", "[green]No violations found[/green] — lint score [bold]1.0000[/bold]", "bright_magenta")
    reward_line(obs.step_number, "run_lint", obs.reward, obs.cumulative_reward)
    time.sleep(PAUSE_SHORT)

    # ── Review ───────────────────────────────────────────────────────────────
    phase_banner("PHASE 5 · CODE REVIEW", "bright_yellow")
    thinking("Generating structured code review…")
    review_text = (
        "ROOT CAUSE: Off-by-one error in range() stop argument. "
        "The original range(0, len(lst)-1, n) terminates one step early, "
        "causing the final chunk to be silently dropped. "
        "FIX: range(0, len(lst), n) correctly covers the full index space. "
        "EDGE CASES VERIFIED: empty list → []; chunk > list → [[full list]]; "
        "exact division → equal chunks; uneven → last chunk is smaller. "
        "STYLE: Replaced loop+append with list comprehension for clarity. "
        "COMPLEXITY: O(n) time, O(n) space — unchanged. No regressions."
    )
    obs = env.step(GenerateReview(
        focus_areas=["correctness", "off-by-one", "range", "complexity"],
        review_text=review_text,
    ))
    log_line("REVIEW", "Review artifact written", "bright_yellow")
    reward_line(obs.step_number, "generate_review", obs.reward, obs.cumulative_reward)
    # Render the review as a Text object, not as markup: it contains literal
    # square brackets ("[[full list]]") that markup parsing would consume as
    # a style tag and drop from the output.
    console.print(Panel(
        Text(review_text, style="italic"),
        title="[bright_yellow]📝 Agent Review[/bright_yellow]",
        border_style="yellow",
    ))
    time.sleep(PAUSE_LONG)

    # ── Reflect ──────────────────────────────────────────────────────────────
    phase_banner("PHASE 6 · SELF-REFLECTION", "bright_cyan")
    obs = env.step(SelfReflect(
        what_went_well="Identified the off-by-one immediately from test_odd_split assertion name. Minimal one-line fix with zero side effects.",
        what_to_improve="Should run lint in parallel with tests, not sequentially. Could have used list comprehension from the start.",
        adjusted_plan=None,
    ))
    log_line("REFLECT", f"[bold]Went well:[/bold] {obs.reflections[-1].what_went_well[:70]}…", "bright_cyan")
    log_line("REFLECT", f"[bold]Improve: [/bold] {obs.reflections[-1].what_to_improve[:70]}…", "bright_cyan")
    reward_line(obs.step_number, "self_reflect", obs.reward, obs.cumulative_reward)
    time.sleep(PAUSE_MEDIUM)

    # ── Commit ───────────────────────────────────────────────────────────────
    phase_banner("PHASE 7 · COMMIT", "bright_green")
    obs = env.step(Commit(message="fix(list_ops): correct off-by-one in chunk_list range() call"))
    log_line("GIT", f"[green]Committed:[/green] {env._sandbox.get_log(1)[0]}", "bright_green")
    reward_line(obs.step_number, "commit", obs.reward, obs.cumulative_reward)
    time.sleep(PAUSE_SHORT)
    console.print(f" [dim]Episode done: {obs.done}[/dim]")
    time.sleep(PAUSE_MEDIUM)

    # ── Grader ───────────────────────────────────────────────────────────────
    phase_banner("PHASE 8 · GRADER", "bright_white")
    thinking("Running deterministic grader…")
    result = env.grade()
    time.sleep(PAUSE_SHORT)
    console.print()
    for line in [
        score_bar("Tests Passed (40%)", result.test_pass_rate),
        score_bar("Lint Clean (25%)", result.lint_score),
        score_bar("Efficiency (20%)", result.efficiency_score),
        score_bar("Review Quality (10%)", result.review_quality),
        score_bar("Reflection (5%)", result.reflection_quality),
    ]:
        console.print(f" {line}")
        time.sleep(0.15)
    console.print()
    passed_badge = "[bold green]✓ PASSED[/bold green]" if result.passed else "[bold red]✗ FAILED[/bold red]"
    console.print(Panel.fit(
        f"[bold yellow]TeamForge Score: {result.final_score:.4f}[/bold yellow] · "
        f"{passed_badge} · "
        f"[dim]{result.total_steps} steps · task: easy_bugfix_chunk_list[/dim]",
        border_style="bright_yellow",
        padding=(0, 2),
    ))
    time.sleep(PAUSE_LONG)

    # ── Artifact summary ─────────────────────────────────────────────────────
    console.print()
    console.rule("[dim]Team Artifacts Generated[/dim]", style="dim")
    artifacts = [
        ("📋 Plan", f"{len(obs.plan)} steps"),
        ("📝 Reviews", f"{len(obs.reviews)} review(s)"),
        ("🪞 Reflections", f"{len(obs.reflections)} reflection(s)"),
        ("📦 Commits", f"{len(env._sandbox.get_log())} commit(s)"),
    ]
    for icon_label, val in artifacts:
        console.print(f" {icon_label:25s} [bold]{val}[/bold]")
    console.print()
    console.rule("[bold bright_blue]TeamForge — github.com/yourname/teamforge[/bold bright_blue]", style="bright_blue")
    console.print()
if __name__ == "__main__":
    # Play the walkthrough only when executed as a script, so importing this
    # module (e.g. to reuse the visual helpers) has no side effects.
    run_demo()