Spaces:

Mayank022
/

api-testing-env

Running

File size: 11,877 Bytes

bafcc7e

"""Visualize inference.py task scores and per-step rewards.

Generates matplotlib and plotly bar charts (PNG + SVG) in this script's own
directory (OUT_DIR), which is expected to be plots/.

Two figures are produced:
  1. inference_results_*  — LLM-only view: per-task final score + per-step rewards
  2. baseline_comparison_* — LLM vs random / sequential / smart baselines

LLM data is the inference.py run on 2026-04-08 against
meta-llama/Llama-3.3-70B-Instruct via the HF router. Baseline numbers come
from `python baseline.py --agent all --task all --seed 42` and are converted
to the same normalized score the LLM reports:
    score = 0.7 * (bugs_found / total_bugs) + 0.3 * (coverage_pct / 100)
"""

from __future__ import annotations

from pathlib import Path

import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# All charts are written into the directory that contains this script.
# mkdir(..., exist_ok=True) is a no-op when that directory already exists.
OUT_DIR = Path(__file__).parent
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- LLM inference run results (index-aligned: one entry per task) ---
TASKS = ["basic_validation", "edge_cases", "security_workflows"]
SCORES = [0.647, 0.772, 0.581]  # final score per task
STEPS = [18, 27, 29]            # step count shown next to each score

# These three lists are zipped together by the plotting code, and zip()
# silently truncates to the shortest input, so fail loudly if an edit ever
# leaves them out of step.
if not (len(TASKS) == len(SCORES) == len(STEPS)):
    raise ValueError(
        "TASKS, SCORES and STEPS must have the same length, got "
        f"{len(TASKS)}, {len(SCORES)} and {len(STEPS)}"
    )

# Mean of SCORES rounded to three decimals. Derived instead of typed by hand
# so it cannot drift from the per-task numbers (0.667 for the data above).
AVG_SCORE = round(sum(SCORES) / len(SCORES), 3)

# --- Baseline rollout results (seed=42) ---
# Each entry: (bugs_found, total_bugs, coverage_pct, steps)
BASELINE_RAW = {
    "random": {
        "basic_validation":    (1, 3,  40.0, 25),
        "edge_cases":          (2, 9,  50.0, 35),
        "security_workflows":  (3, 13, 50.0, 45),
    },
    "sequential": {
        "basic_validation":    (3, 3,  50.0, 25),
        "edge_cases":          (4, 9,  50.0, 35),
        "security_workflows":  (4, 13, 50.0, 45),
    },
    "smart": {
        "basic_validation":    (3, 3,  50.0, 25),
        "edge_cases":          (9, 9,  50.0, 35),
        "security_workflows":  (12, 13, 50.0, 45),
    },
}


def normalized_score(bugs_found: int, total_bugs: int, coverage_pct: float) -> float:
    """Blend bug-discovery ratio and coverage into a single score in [0, 1].

    Uses the weighting described for inference.compute_task_score:
    70% bugs found / total bugs, 30% coverage fraction.

    Args:
        bugs_found: Number of bugs the agent discovered.
        total_bugs: Number of bugs available; a non-positive value makes the
            bug component count as 0.
        coverage_pct: Coverage expressed as a percentage; clamped to 0-100.

    Returns:
        The weighted score, clamped to the closed interval [0, 1].
    """

    def _clamp_unit(value: float) -> float:
        # Pin a value to the closed unit interval.
        return max(0.0, min(1.0, value))

    if total_bugs > 0:
        found_fraction = bugs_found / total_bugs
    else:
        # Nothing to find: avoid dividing by zero and award no bug credit.
        found_fraction = 0.0

    coverage_fraction = _clamp_unit(coverage_pct / 100.0)
    return _clamp_unit(0.70 * found_fraction + 0.30 * coverage_fraction)


# Derived lookup tables: normalized scores per agent, plus the LLM's own scores.
# AGENT_LABELS fixes the left-to-right order used by the comparison charts.
AGENT_LABELS = ["random", "sequential", "smart", "llm (Llama-3.3-70B)"]
LLM_SCORES_BY_TASK = {task: score for task, score in zip(TASKS, SCORES)}

# Baselines are converted from their raw (bugs_found, total_bugs, coverage_pct)
# triple; the trailing step count in each tuple is not part of the score.
# The LLM entry is appended last and reuses the scores it already reported.
AGENT_SCORES: dict[str, list[float]] = {
    **{
        baseline: [normalized_score(*rollouts[task][:3]) for task in TASKS]
        for baseline, rollouts in BASELINE_RAW.items()
    },
    "llm (Llama-3.3-70B)": [LLM_SCORES_BY_TASK[task] for task in TASKS],
}

# Plain arithmetic mean of each agent's per-task scores.
AGENT_AVG = {
    label: sum(task_scores) / len(task_scores)
    for label, task_scores in AGENT_SCORES.items()
}

# Bar colour per agent in the baseline-comparison figures.
AGENT_COLORS = {
    "random":               "#9E9E9E",
    "sequential":           "#F4A261",
    "smart":                "#2A9D8F",
    "llm (Llama-3.3-70B)":  "#6A4C93",
}

# Reward the LLM received at each step, keyed by task (one value per step).
PER_STEP_REWARDS = {
    "basic_validation": [
        0.33, 0.23, 0.28, 0.18, 0.13, 0.28, 0.25, 0.28, 0.28,
        0.18, 0.23, 0.33, 0.13, 0.03, 0.03, 0.13, -0.05, 0.03,
    ],
    "edge_cases": [
        0.33, 0.28, 0.28, 0.08, 0.18, 0.25, 0.48, 0.28, 0.33,
        0.08, 0.33, 0.03, 0.23, 0.33, 0.28, 0.18, 0.03, 0.08,
        0.08, 0.13, 0.13, 0.08, 0.13, 0.00, 0.33, 0.08, 0.00,
    ],
    "security_workflows": [
        0.33, 0.28, 0.28, 0.08, 0.03, 0.18, 0.48, 0.23, 0.28,
        0.25, 0.33, 0.33, 0.23, 0.33, 0.28, 0.08, 0.18, 0.03,
        0.13, 0.13, 0.13, 0.08, 0.00, 0.13, 0.00, -0.05, -0.05,
        0.03, -0.05,
    ],
}

# Bar colour per task in the LLM-only figures.
COLORS = {
    "basic_validation": "#4C72B0",
    "edge_cases": "#55A868",
    "security_workflows": "#C44E52",
}


# ---------- matplotlib ----------
def plot_matplotlib() -> None:
    """Render the LLM-only figure with matplotlib and save it as PNG and SVG.

    Left panel: one bar per task showing its final score, annotated with the
    score and step count, plus a dashed line at AVG_SCORE.
    Right panel: per-step rewards, drawn as side-by-side bars (one per task)
    at every step index.

    Writes inference_results_matplotlib.png and .svg into OUT_DIR and prints
    the path of each file written.
    """
    fig, axes = plt.subplots(1, 2, figsize=(13, 5.2))
    try:
        # 1. Final scores per task
        ax = axes[0]
        bar_colors = [COLORS[t] for t in TASKS]
        bars = ax.bar(TASKS, SCORES, color=bar_colors, edgecolor="black", linewidth=0.6)
        ax.axhline(AVG_SCORE, color="#333", linestyle="--", linewidth=1.2,
                   label=f"avg = {AVG_SCORE:.3f}")
        ax.set_ylim(0, 1.0)
        ax.set_ylabel("Final score")
        ax.set_title("Inference final score by task")
        ax.legend(loc="upper right", frameon=False)
        for bar, score, steps in zip(bars, SCORES, STEPS):
            # Label sits just above the bar, centred on it.
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.015,
                f"{score:.3f}\n({steps} steps)",
                ha="center", va="bottom", fontsize=9,
            )
        ax.tick_params(axis="x", rotation=15)

        # 2. Per-step rewards (grouped over step index)
        ax = axes[1]
        max_len = max(len(v) for v in PER_STEP_REWARDS.values())
        width = 0.27
        n_tasks = len(TASKS)
        x_base = list(range(1, max_len + 1))
        for i, task in enumerate(TASKS):
            rewards = PER_STEP_REWARDS[task]
            # Centre the group of bars on the integer step index for any
            # number of tasks (for three tasks this is the original i - 1).
            offset = (i - (n_tasks - 1) / 2) * width
            xs = [step + offset for step in range(1, len(rewards) + 1)]
            ax.bar(xs, rewards, width=width, color=COLORS[task],
                   label=task, edgecolor="black", linewidth=0.3)
        ax.axhline(0, color="#666", linewidth=0.8)  # zero line: rewards can be negative
        ax.set_xlabel("Step")
        ax.set_ylabel("Reward")
        ax.set_title("Per-step reward by task")
        ax.set_xticks(x_base[::2])  # tick every second step to keep labels readable
        ax.legend(frameon=False, fontsize=9)

        # Title is built from AVG_SCORE so it always agrees with the dashed line.
        fig.suptitle(
            f"inference.py — meta-llama/Llama-3.3-70B-Instruct (avg score {AVG_SCORE:.3f})",
            fontsize=12, fontweight="bold",
        )
        # Leave the top 4% of the canvas free for the suptitle.
        fig.tight_layout(rect=(0, 0, 1, 0.96))

        png_path = OUT_DIR / "inference_results_matplotlib.png"
        svg_path = OUT_DIR / "inference_results_matplotlib.svg"
        fig.savefig(png_path, dpi=160, bbox_inches="tight")
        fig.savefig(svg_path, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    print(f"[matplotlib] wrote {png_path}")
    print(f"[matplotlib] wrote {svg_path}")


# ---------- plotly ----------
def plot_plotly() -> None:
    """Render the LLM-only figure with plotly and save it as PNG and SVG.

    Left panel: one bar per task showing its final score (labelled with the
    score and step count) and a dashed horizontal line at AVG_SCORE.
    Right panel: per-step rewards as grouped bars, one trace per task.

    Writes inference_results_plotly.png and .svg into OUT_DIR and prints the
    path of each file written.
    """
    fig = make_subplots(
        rows=1, cols=2,
        column_widths=[0.4, 0.6],
        subplot_titles=("Final score by task", "Per-step reward by task"),
    )

    # 1. Final scores
    fig.add_trace(
        go.Bar(
            x=TASKS,
            y=SCORES,
            marker_color=[COLORS[t] for t in TASKS],
            text=[f"{s:.3f}<br>({n} steps)" for s, n in zip(SCORES, STEPS)],
            textposition="outside",
            name="Final score",
            showlegend=False,  # the legend is reserved for the per-task reward traces
        ),
        row=1, col=1,
    )
    fig.add_hline(
        y=AVG_SCORE, line_dash="dash", line_color="#333",
        annotation_text=f"avg = {AVG_SCORE:.3f}",
        annotation_position="top left",
        row=1, col=1,
    )

    # 2. Per-step rewards (grouped bars)
    for task in TASKS:
        rewards = PER_STEP_REWARDS[task]
        fig.add_trace(
            go.Bar(
                x=list(range(1, len(rewards) + 1)),  # 1-based step index
                y=rewards,
                name=task,
                marker_color=COLORS[task],
            ),
            row=1, col=2,
        )

    fig.update_yaxes(title_text="Final score", range=[0, 1.0], row=1, col=1)
    fig.update_yaxes(title_text="Reward", row=1, col=2)
    fig.update_xaxes(title_text="Step", row=1, col=2)
    fig.update_layout(
        title=dict(
            # Built from AVG_SCORE so the title always agrees with the dashed line.
            text=f"inference.py — meta-llama/Llama-3.3-70B-Instruct (avg score {AVG_SCORE:.3f})",
            x=0.5, xanchor="center",
        ),
        barmode="group",
        bargap=0.2,
        template="plotly_white",
        width=1300,
        height=560,
        # Horizontal legend centred below the plots.
        legend=dict(orientation="h", y=-0.18, x=0.5, xanchor="center"),
        margin=dict(t=80, b=80, l=60, r=30),
    )

    png_path = OUT_DIR / "inference_results_plotly.png"
    svg_path = OUT_DIR / "inference_results_plotly.svg"
    fig.write_image(png_path, scale=2)
    fig.write_image(svg_path)
    print(f"[plotly] wrote {png_path}")
    print(f"[plotly] wrote {svg_path}")


# ---------- baseline comparison: matplotlib ----------
def plot_baselines_matplotlib() -> None:
    """Render the baseline-vs-LLM comparison with matplotlib (PNG and SVG).

    Left panel: grouped bars, one group per task and one bar per agent in
    AGENT_LABELS order, each labelled with its score.
    Right panel: one bar per agent showing its mean score across the tasks.

    Writes baseline_comparison_matplotlib.png and .svg into OUT_DIR and
    prints the path of each file written.
    """
    fig, axes = plt.subplots(1, 2, figsize=(13.5, 5.4))
    try:
        # 1. Grouped bars per task
        ax = axes[0]
        n_agents = len(AGENT_LABELS)
        # Each task's group spans 0.8 x-units regardless of the agent count
        # (0.2 per bar for the current four agents).
        width = 0.8 / max(n_agents, 1)
        x = list(range(len(TASKS)))
        for i, agent in enumerate(AGENT_LABELS):
            # Centre the whole group on the task's integer x position.
            offset = (i - (n_agents - 1) / 2) * width
            xs = [xi + offset for xi in x]
            bars = ax.bar(
                xs, AGENT_SCORES[agent], width=width,
                color=AGENT_COLORS[agent], label=agent,
                edgecolor="black", linewidth=0.4,
            )
            for bar, val in zip(bars, AGENT_SCORES[agent]):
                ax.text(
                    bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.012,
                    f"{val:.2f}", ha="center", va="bottom", fontsize=7.5,
                )
        ax.set_xticks(x)
        ax.set_xticklabels(TASKS, rotation=10)
        ax.set_ylim(0, 1.0)
        ax.set_ylabel("Normalized score")
        ax.set_title("Per-task score: baselines vs LLM")
        ax.legend(frameon=False, fontsize=8.5, loc="upper right")

        # 2. Average score across all tasks
        ax = axes[1]
        avgs = [AGENT_AVG[a] for a in AGENT_LABELS]
        colors = [AGENT_COLORS[a] for a in AGENT_LABELS]
        bars = ax.bar(AGENT_LABELS, avgs, color=colors, edgecolor="black", linewidth=0.6)
        for bar, val in zip(bars, avgs):
            ax.text(
                bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.012,
                f"{val:.3f}", ha="center", va="bottom", fontsize=10, fontweight="bold",
            )
        ax.set_ylim(0, 1.0)
        # Task count comes from TASKS so the label cannot go stale.
        ax.set_ylabel(f"Mean score ({len(TASKS)} tasks)")
        ax.set_title("Average score across all tasks")
        ax.tick_params(axis="x", rotation=12)

        fig.suptitle(
            "Baseline agents vs LLM — score = 0.7·bug_ratio + 0.3·coverage_ratio",
            fontsize=12, fontweight="bold",
        )
        # Leave the top 5% of the canvas free for the suptitle.
        fig.tight_layout(rect=(0, 0, 1, 0.95))

        png_path = OUT_DIR / "baseline_comparison_matplotlib.png"
        svg_path = OUT_DIR / "baseline_comparison_matplotlib.svg"
        fig.savefig(png_path, dpi=160, bbox_inches="tight")
        fig.savefig(svg_path, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    print(f"[matplotlib] wrote {png_path}")
    print(f"[matplotlib] wrote {svg_path}")


# ---------- baseline comparison: plotly ----------
def plot_baselines_plotly() -> None:
    """Render the baseline-vs-LLM comparison with plotly (PNG and SVG).

    Left panel: grouped bars with one trace per agent across the tasks.
    Right panel: a single trace with each agent's mean score.

    Writes baseline_comparison_plotly.png and .svg into OUT_DIR and prints
    the path of each file written.
    """
    panel_titles = ("Per-task score: baselines vs LLM", "Average score across all tasks")
    figure = make_subplots(
        rows=1,
        cols=2,
        column_widths=[0.62, 0.38],
        subplot_titles=panel_titles,
    )

    # Left panel: one bar trace per agent, in AGENT_LABELS order.
    for label in AGENT_LABELS:
        task_scores = AGENT_SCORES[label]
        per_task_bar = go.Bar(
            x=TASKS,
            y=task_scores,
            name=label,
            marker_color=AGENT_COLORS[label],
            text=[f"{value:.2f}" for value in task_scores],
            textposition="outside",
            legendgroup=label,
        )
        figure.add_trace(per_task_bar, row=1, col=1)

    # Right panel: mean score per agent; hidden from the legend because the
    # left-panel traces already list every agent.
    mean_scores = [AGENT_AVG[label] for label in AGENT_LABELS]
    mean_bar = go.Bar(
        x=AGENT_LABELS,
        y=mean_scores,
        marker_color=[AGENT_COLORS[label] for label in AGENT_LABELS],
        text=[f"{value:.3f}" for value in mean_scores],
        textposition="outside",
        showlegend=False,
    )
    figure.add_trace(mean_bar, row=1, col=2)

    # Both y-axes share the same range; the upper bound is slightly above 1.
    axis_titles = ("Normalized score", "Mean score (3 tasks)")
    for column, axis_title in enumerate(axis_titles, start=1):
        figure.update_yaxes(title_text=axis_title, range=[0, 1.05], row=1, col=column)

    layout_options = {
        "title": dict(
            text="Baseline agents vs LLM — score = 0.7·bug_ratio + 0.3·coverage_ratio",
            x=0.5, xanchor="center",
        ),
        "barmode": "group",
        "bargap": 0.18,
        "template": "plotly_white",
        "width": 1400,
        "height": 580,
        # Horizontal legend centred below the plots.
        "legend": dict(orientation="h", y=-0.18, x=0.5, xanchor="center"),
        "margin": dict(t=80, b=90, l=60, r=30),
    }
    figure.update_layout(**layout_options)

    # Export both files first, then report them, PNG before SVG each time.
    exports = (
        (OUT_DIR / "baseline_comparison_plotly.png", {"scale": 2}),
        (OUT_DIR / "baseline_comparison_plotly.svg", {}),
    )
    for target, export_options in exports:
        figure.write_image(target, **export_options)
    for target, _ in exports:
        print(f"[plotly] wrote {target}")


if __name__ == "__main__":
    # Produce every figure in order: LLM-only views first, then the
    # baseline comparisons, each with matplotlib before plotly.
    for render_figure in (
        plot_matplotlib,
        plot_plotly,
        plot_baselines_matplotlib,
        plot_baselines_plotly,
    ):
        render_figure()