Spaces:
Running
Running
| """Visualize inference.py task scores and per-step rewards. | |
| Generates matplotlib and plotly bar charts (PNG + SVG) under plots/. | |
| Two figures are produced: | |
| 1. inference_results_* — LLM-only view: per-task final score + per-step rewards | |
| 2. baseline_comparison_* — LLM vs random / sequential / smart baselines | |
| LLM data is the inference.py run on 2026-04-08 against | |
| meta-llama/Llama-3.3-70B-Instruct via the HF router. Baseline numbers come | |
| from `python baseline.py --agent all --task all --seed 42` and are converted | |
| to the same normalized score the LLM reports: | |
| score = 0.7 * (bugs_found / total_bugs) + 0.3 * (coverage_pct / 100) | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import matplotlib.pyplot as plt | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| OUT_DIR = Path(__file__).parent | |
| OUT_DIR.mkdir(parents=True, exist_ok=True) | |
| TASKS = ["basic_validation", "edge_cases", "security_workflows"] | |
| SCORES = [0.647, 0.772, 0.581] | |
| STEPS = [18, 27, 29] | |
| AVG_SCORE = 0.667 | |
| # --- Baseline rollout results (seed=42) --- | |
| # Each entry: (bugs_found, total_bugs, coverage_pct, steps) | |
| BASELINE_RAW = { | |
| "random": { | |
| "basic_validation": (1, 3, 40.0, 25), | |
| "edge_cases": (2, 9, 50.0, 35), | |
| "security_workflows": (3, 13, 50.0, 45), | |
| }, | |
| "sequential": { | |
| "basic_validation": (3, 3, 50.0, 25), | |
| "edge_cases": (4, 9, 50.0, 35), | |
| "security_workflows": (4, 13, 50.0, 45), | |
| }, | |
| "smart": { | |
| "basic_validation": (3, 3, 50.0, 25), | |
| "edge_cases": (9, 9, 50.0, 35), | |
| "security_workflows": (12, 13, 50.0, 45), | |
| }, | |
| } | |
| def normalized_score(bugs_found: int, total_bugs: int, coverage_pct: float) -> float: | |
| """Same formula as inference.compute_task_score — keeps everything in [0, 1].""" | |
| bug_ratio = (bugs_found / total_bugs) if total_bugs > 0 else 0.0 | |
| cov_ratio = max(0.0, min(1.0, coverage_pct / 100.0)) | |
| return max(0.0, min(1.0, 0.70 * bug_ratio + 0.30 * cov_ratio)) | |
| # Pre-compute normalized scores for each baseline + LLM | |
| AGENT_LABELS = ["random", "sequential", "smart", "llm (Llama-3.3-70B)"] | |
| LLM_SCORES_BY_TASK = dict(zip(TASKS, SCORES)) | |
| AGENT_SCORES: dict[str, list[float]] = {} | |
| for agent_name, per_task in BASELINE_RAW.items(): | |
| AGENT_SCORES[agent_name] = [ | |
| normalized_score(*per_task[t][:3]) for t in TASKS | |
| ] | |
| AGENT_SCORES["llm (Llama-3.3-70B)"] = [LLM_SCORES_BY_TASK[t] for t in TASKS] | |
| AGENT_AVG = {a: sum(s) / len(s) for a, s in AGENT_SCORES.items()} | |
| AGENT_COLORS = { | |
| "random": "#9E9E9E", | |
| "sequential": "#F4A261", | |
| "smart": "#2A9D8F", | |
| "llm (Llama-3.3-70B)": "#6A4C93", | |
| } | |
| PER_STEP_REWARDS = { | |
| "basic_validation": [ | |
| 0.33, 0.23, 0.28, 0.18, 0.13, 0.28, 0.25, 0.28, 0.28, | |
| 0.18, 0.23, 0.33, 0.13, 0.03, 0.03, 0.13, -0.05, 0.03, | |
| ], | |
| "edge_cases": [ | |
| 0.33, 0.28, 0.28, 0.08, 0.18, 0.25, 0.48, 0.28, 0.33, | |
| 0.08, 0.33, 0.03, 0.23, 0.33, 0.28, 0.18, 0.03, 0.08, | |
| 0.08, 0.13, 0.13, 0.08, 0.13, 0.00, 0.33, 0.08, 0.00, | |
| ], | |
| "security_workflows": [ | |
| 0.33, 0.28, 0.28, 0.08, 0.03, 0.18, 0.48, 0.23, 0.28, | |
| 0.25, 0.33, 0.33, 0.23, 0.33, 0.28, 0.08, 0.18, 0.03, | |
| 0.13, 0.13, 0.13, 0.08, 0.00, 0.13, 0.00, -0.05, -0.05, | |
| 0.03, -0.05, | |
| ], | |
| } | |
| COLORS = { | |
| "basic_validation": "#4C72B0", | |
| "edge_cases": "#55A868", | |
| "security_workflows": "#C44E52", | |
| } | |
| # ---------- matplotlib ---------- | |
| def plot_matplotlib() -> None: | |
| fig, axes = plt.subplots(1, 2, figsize=(13, 5.2)) | |
| # 1. Final scores per task | |
| ax = axes[0] | |
| bar_colors = [COLORS[t] for t in TASKS] | |
| bars = ax.bar(TASKS, SCORES, color=bar_colors, edgecolor="black", linewidth=0.6) | |
| ax.axhline(AVG_SCORE, color="#333", linestyle="--", linewidth=1.2, | |
| label=f"avg = {AVG_SCORE:.3f}") | |
| ax.set_ylim(0, 1.0) | |
| ax.set_ylabel("Final score") | |
| ax.set_title("Inference final score by task") | |
| ax.legend(loc="upper right", frameon=False) | |
| for bar, score, steps in zip(bars, SCORES, STEPS): | |
| ax.text( | |
| bar.get_x() + bar.get_width() / 2, | |
| bar.get_height() + 0.015, | |
| f"{score:.3f}\n({steps} steps)", | |
| ha="center", va="bottom", fontsize=9, | |
| ) | |
| ax.tick_params(axis="x", rotation=15) | |
| # 2. Per-step rewards (grouped over step index) | |
| ax = axes[1] | |
| max_len = max(len(v) for v in PER_STEP_REWARDS.values()) | |
| width = 0.27 | |
| x_base = list(range(1, max_len + 1)) | |
| for i, task in enumerate(TASKS): | |
| rewards = PER_STEP_REWARDS[task] | |
| xs = [x + (i - 1) * width for x in range(1, len(rewards) + 1)] | |
| ax.bar(xs, rewards, width=width, color=COLORS[task], | |
| label=task, edgecolor="black", linewidth=0.3) | |
| ax.axhline(0, color="#666", linewidth=0.8) | |
| ax.set_xlabel("Step") | |
| ax.set_ylabel("Reward") | |
| ax.set_title("Per-step reward by task") | |
| ax.set_xticks(x_base[::2]) | |
| ax.legend(frameon=False, fontsize=9) | |
| fig.suptitle( | |
| "inference.py — meta-llama/Llama-3.3-70B-Instruct (avg score 0.667)", | |
| fontsize=12, fontweight="bold", | |
| ) | |
| fig.tight_layout(rect=(0, 0, 1, 0.96)) | |
| png_path = OUT_DIR / "inference_results_matplotlib.png" | |
| svg_path = OUT_DIR / "inference_results_matplotlib.svg" | |
| fig.savefig(png_path, dpi=160, bbox_inches="tight") | |
| fig.savefig(svg_path, bbox_inches="tight") | |
| plt.close(fig) | |
| print(f"[matplotlib] wrote {png_path}") | |
| print(f"[matplotlib] wrote {svg_path}") | |
| # ---------- plotly ---------- | |
| def plot_plotly() -> None: | |
| fig = make_subplots( | |
| rows=1, cols=2, | |
| column_widths=[0.4, 0.6], | |
| subplot_titles=("Final score by task", "Per-step reward by task"), | |
| ) | |
| # 1. Final scores | |
| fig.add_trace( | |
| go.Bar( | |
| x=TASKS, | |
| y=SCORES, | |
| marker_color=[COLORS[t] for t in TASKS], | |
| text=[f"{s:.3f}<br>({n} steps)" for s, n in zip(SCORES, STEPS)], | |
| textposition="outside", | |
| name="Final score", | |
| showlegend=False, | |
| ), | |
| row=1, col=1, | |
| ) | |
| fig.add_hline( | |
| y=AVG_SCORE, line_dash="dash", line_color="#333", | |
| annotation_text=f"avg = {AVG_SCORE:.3f}", | |
| annotation_position="top left", | |
| row=1, col=1, | |
| ) | |
| # 2. Per-step rewards (grouped bars) | |
| for task in TASKS: | |
| rewards = PER_STEP_REWARDS[task] | |
| fig.add_trace( | |
| go.Bar( | |
| x=list(range(1, len(rewards) + 1)), | |
| y=rewards, | |
| name=task, | |
| marker_color=COLORS[task], | |
| ), | |
| row=1, col=2, | |
| ) | |
| fig.update_yaxes(title_text="Final score", range=[0, 1.0], row=1, col=1) | |
| fig.update_yaxes(title_text="Reward", row=1, col=2) | |
| fig.update_xaxes(title_text="Step", row=1, col=2) | |
| fig.update_layout( | |
| title=dict( | |
| text="inference.py — meta-llama/Llama-3.3-70B-Instruct (avg score 0.667)", | |
| x=0.5, xanchor="center", | |
| ), | |
| barmode="group", | |
| bargap=0.2, | |
| template="plotly_white", | |
| width=1300, | |
| height=560, | |
| legend=dict(orientation="h", y=-0.18, x=0.5, xanchor="center"), | |
| margin=dict(t=80, b=80, l=60, r=30), | |
| ) | |
| png_path = OUT_DIR / "inference_results_plotly.png" | |
| svg_path = OUT_DIR / "inference_results_plotly.svg" | |
| fig.write_image(png_path, scale=2) | |
| fig.write_image(svg_path) | |
| print(f"[plotly] wrote {png_path}") | |
| print(f"[plotly] wrote {svg_path}") | |
| # ---------- baseline comparison: matplotlib ---------- | |
| def plot_baselines_matplotlib() -> None: | |
| fig, axes = plt.subplots(1, 2, figsize=(13.5, 5.4)) | |
| # 1. Grouped bars per task | |
| ax = axes[0] | |
| n_agents = len(AGENT_LABELS) | |
| width = 0.2 | |
| x = list(range(len(TASKS))) | |
| for i, agent in enumerate(AGENT_LABELS): | |
| offset = (i - (n_agents - 1) / 2) * width | |
| xs = [xi + offset for xi in x] | |
| bars = ax.bar( | |
| xs, AGENT_SCORES[agent], width=width, | |
| color=AGENT_COLORS[agent], label=agent, | |
| edgecolor="black", linewidth=0.4, | |
| ) | |
| for bar, val in zip(bars, AGENT_SCORES[agent]): | |
| ax.text( | |
| bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.012, | |
| f"{val:.2f}", ha="center", va="bottom", fontsize=7.5, | |
| ) | |
| ax.set_xticks(x) | |
| ax.set_xticklabels(TASKS, rotation=10) | |
| ax.set_ylim(0, 1.0) | |
| ax.set_ylabel("Normalized score") | |
| ax.set_title("Per-task score: baselines vs LLM") | |
| ax.legend(frameon=False, fontsize=8.5, loc="upper right") | |
| # 2. Average score across all 3 tasks | |
| ax = axes[1] | |
| avgs = [AGENT_AVG[a] for a in AGENT_LABELS] | |
| colors = [AGENT_COLORS[a] for a in AGENT_LABELS] | |
| bars = ax.bar(AGENT_LABELS, avgs, color=colors, edgecolor="black", linewidth=0.6) | |
| for bar, val in zip(bars, avgs): | |
| ax.text( | |
| bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.012, | |
| f"{val:.3f}", ha="center", va="bottom", fontsize=10, fontweight="bold", | |
| ) | |
| ax.set_ylim(0, 1.0) | |
| ax.set_ylabel("Mean score (3 tasks)") | |
| ax.set_title("Average score across all tasks") | |
| ax.tick_params(axis="x", rotation=12) | |
| fig.suptitle( | |
| "Baseline agents vs LLM — score = 0.7·bug_ratio + 0.3·coverage_ratio", | |
| fontsize=12, fontweight="bold", | |
| ) | |
| fig.tight_layout(rect=(0, 0, 1, 0.95)) | |
| png_path = OUT_DIR / "baseline_comparison_matplotlib.png" | |
| svg_path = OUT_DIR / "baseline_comparison_matplotlib.svg" | |
| fig.savefig(png_path, dpi=160, bbox_inches="tight") | |
| fig.savefig(svg_path, bbox_inches="tight") | |
| plt.close(fig) | |
| print(f"[matplotlib] wrote {png_path}") | |
| print(f"[matplotlib] wrote {svg_path}") | |
| # ---------- baseline comparison: plotly ---------- | |
| def plot_baselines_plotly() -> None: | |
| fig = make_subplots( | |
| rows=1, cols=2, | |
| column_widths=[0.62, 0.38], | |
| subplot_titles=("Per-task score: baselines vs LLM", "Average score across all tasks"), | |
| ) | |
| # 1. Grouped bars per task | |
| for agent in AGENT_LABELS: | |
| fig.add_trace( | |
| go.Bar( | |
| x=TASKS, | |
| y=AGENT_SCORES[agent], | |
| name=agent, | |
| marker_color=AGENT_COLORS[agent], | |
| text=[f"{v:.2f}" for v in AGENT_SCORES[agent]], | |
| textposition="outside", | |
| legendgroup=agent, | |
| ), | |
| row=1, col=1, | |
| ) | |
| # 2. Average score | |
| avgs = [AGENT_AVG[a] for a in AGENT_LABELS] | |
| fig.add_trace( | |
| go.Bar( | |
| x=AGENT_LABELS, | |
| y=avgs, | |
| marker_color=[AGENT_COLORS[a] for a in AGENT_LABELS], | |
| text=[f"{v:.3f}" for v in avgs], | |
| textposition="outside", | |
| showlegend=False, | |
| ), | |
| row=1, col=2, | |
| ) | |
| fig.update_yaxes(title_text="Normalized score", range=[0, 1.05], row=1, col=1) | |
| fig.update_yaxes(title_text="Mean score (3 tasks)", range=[0, 1.05], row=1, col=2) | |
| fig.update_layout( | |
| title=dict( | |
| text="Baseline agents vs LLM — score = 0.7·bug_ratio + 0.3·coverage_ratio", | |
| x=0.5, xanchor="center", | |
| ), | |
| barmode="group", | |
| bargap=0.18, | |
| template="plotly_white", | |
| width=1400, | |
| height=580, | |
| legend=dict(orientation="h", y=-0.18, x=0.5, xanchor="center"), | |
| margin=dict(t=80, b=90, l=60, r=30), | |
| ) | |
| png_path = OUT_DIR / "baseline_comparison_plotly.png" | |
| svg_path = OUT_DIR / "baseline_comparison_plotly.svg" | |
| fig.write_image(png_path, scale=2) | |
| fig.write_image(svg_path) | |
| print(f"[plotly] wrote {png_path}") | |
| print(f"[plotly] wrote {svg_path}") | |
| if __name__ == "__main__": | |
| plot_matplotlib() | |
| plot_plotly() | |
| plot_baselines_matplotlib() | |
| plot_baselines_plotly() | |