Spaces:

Mayank022
/

api-testing-env

Running

App Files Files Community

api-testing-env / plots /plot_inference_results.py

Mayank022

Upload folder using huggingface_hub

bafcc7e verified about 15 hours ago

raw

history blame contribute delete

11.9 kB

	"""Visualize inference.py task scores and per-step rewards.

	Generates matplotlib and plotly bar charts (PNG + SVG) under plots/.

	Two figures are produced:
	1. inference_results_* — LLM-only view: per-task final score + per-step rewards
	2. baseline_comparison_* — LLM vs random / sequential / smart baselines

	LLM data is the inference.py run on 2026-04-08 against
	meta-llama/Llama-3.3-70B-Instruct via the HF router. Baseline numbers come
	from `python baseline.py --agent all --task all --seed 42` and are converted
	to the same normalized score the LLM reports:
	score = 0.7 * (bugs_found / total_bugs) + 0.3 * (coverage_pct / 100)
	"""

	from __future__ import annotations

	from pathlib import Path

	import matplotlib.pyplot as plt
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots

	# All figures are written next to this script.
	OUT_DIR = Path(__file__).parent
	OUT_DIR.mkdir(parents=True, exist_ok=True)

	# --- LLM run results; SCORES and STEPS are index-aligned with TASKS ---
	TASKS = ["basic_validation", "edge_cases", "security_workflows"]
	SCORES = [0.647, 0.772, 0.581]
	STEPS = [18, 27, 29]
	# Derived from SCORES (rounded to 3 decimals) so the average shown in the
	# figures cannot drift from the per-task numbers when they are updated.
	AVG_SCORE = round(sum(SCORES) / len(SCORES), 3)

	# --- Baseline rollout results (seed=42) ---
	# Each entry: (bugs_found, total_bugs, coverage_pct, steps)
	BASELINE_RAW = {
	    "random": {
	        "basic_validation": (1, 3, 40.0, 25),
	        "edge_cases": (2, 9, 50.0, 35),
	        "security_workflows": (3, 13, 50.0, 45),
	    },
	    "sequential": {
	        "basic_validation": (3, 3, 50.0, 25),
	        "edge_cases": (4, 9, 50.0, 35),
	        "security_workflows": (4, 13, 50.0, 45),
	    },
	    "smart": {
	        "basic_validation": (3, 3, 50.0, 25),
	        "edge_cases": (9, 9, 50.0, 35),
	        "security_workflows": (12, 13, 50.0, 45),
	    },
	}


	def normalized_score(bugs_found: int, total_bugs: int, coverage_pct: float) -> float:
	    """Return the normalized task score, clamped to [0, 1].

	    score = 0.70 * (bugs_found / total_bugs) + 0.30 * (coverage_pct / 100)

	    Args:
	        bugs_found: Number of bugs the agent discovered.
	        total_bugs: Number of bugs in the task; when this is not positive
	            the bug term contributes 0 instead of dividing by zero.
	        coverage_pct: Coverage as a percentage; the resulting ratio is
	            clamped to [0, 1] before weighting.

	    Returns:
	        The weighted sum, clamped to [0, 1].
	    """
	    bug_weight = 0.70
	    coverage_weight = 0.30

	    bug_ratio = (bugs_found / total_bugs) if total_bugs > 0 else 0.0
	    cov_ratio = max(0.0, min(1.0, coverage_pct / 100.0))
	    # The final clamp also bounds the result when bug_ratio falls outside [0, 1].
	    return max(0.0, min(1.0, bug_weight * bug_ratio + coverage_weight * cov_ratio))


	# Pre-compute normalized scores for each baseline + LLM
	# Single source of truth for the LLM label, which is used as a dict key in
	# AGENT_LABELS, AGENT_SCORES and AGENT_COLORS.
	LLM_LABEL = "llm (Llama-3.3-70B)"
	AGENT_LABELS = ["random", "sequential", "smart", LLM_LABEL]
	LLM_SCORES_BY_TASK = dict(zip(TASKS, SCORES))

	# Per-agent score lists, index-aligned with TASKS. The first three fields of
	# each baseline tuple are (bugs_found, total_bugs, coverage_pct); the trailing
	# step count is not part of the score.
	AGENT_SCORES: dict[str, list[float]] = {
	    agent_name: [normalized_score(*per_task[task][:3]) for task in TASKS]
	    for agent_name, per_task in BASELINE_RAW.items()
	}
	AGENT_SCORES[LLM_LABEL] = [LLM_SCORES_BY_TASK[task] for task in TASKS]

	# Unweighted mean over tasks for each agent.
	AGENT_AVG = {
	    agent: sum(scores) / len(scores) for agent, scores in AGENT_SCORES.items()
	}

	AGENT_COLORS = {
	    "random": "#9E9E9E",
	    "sequential": "#F4A261",
	    "smart": "#2A9D8F",
	    LLM_LABEL: "#6A4C93",
	}

	# Reward recorded at every step of the LLM run, keyed by task name.
	# Values are listed in step order (step 1 first), six per row.
	PER_STEP_REWARDS = {
	    "basic_validation": [
	        0.33, 0.23, 0.28, 0.18, 0.13, 0.28,
	        0.25, 0.28, 0.28, 0.18, 0.23, 0.33,
	        0.13, 0.03, 0.03, 0.13, -0.05, 0.03,
	    ],
	    "edge_cases": [
	        0.33, 0.28, 0.28, 0.08, 0.18, 0.25,
	        0.48, 0.28, 0.33, 0.08, 0.33, 0.03,
	        0.23, 0.33, 0.28, 0.18, 0.03, 0.08,
	        0.08, 0.13, 0.13, 0.08, 0.13, 0.00,
	        0.33, 0.08, 0.00,
	    ],
	    "security_workflows": [
	        0.33, 0.28, 0.28, 0.08, 0.03, 0.18,
	        0.48, 0.23, 0.28, 0.25, 0.33, 0.33,
	        0.23, 0.33, 0.28, 0.08, 0.18, 0.03,
	        0.13, 0.13, 0.13, 0.08, 0.00, 0.13,
	        0.00, -0.05, -0.05, 0.03, -0.05,
	    ],
	}

	# Bar color (hex) used for each task in the LLM-only figures.
	COLORS = {
	    "basic_validation": "#4C72B0",
	    "edge_cases": "#55A868",
	    "security_workflows": "#C44E52",
	}


	# ---------- matplotlib ----------
	def plot_matplotlib() -> None:
	    """Render the LLM-only figure with matplotlib and save it as PNG and SVG.

	    Left panel: final score per task, annotated with the score and step
	    count, plus a dashed line at AVG_SCORE. Right panel: per-step rewards
	    drawn as bars grouped by step index, one bar per task.

	    Writes inference_results_matplotlib.{png,svg} into OUT_DIR and prints
	    the paths written.
	    """
	    fig, axes = plt.subplots(1, 2, figsize=(13, 5.2))

	    # 1. Final scores per task
	    ax = axes[0]
	    bar_colors = [COLORS[t] for t in TASKS]
	    bars = ax.bar(TASKS, SCORES, color=bar_colors, edgecolor="black", linewidth=0.6)
	    ax.axhline(AVG_SCORE, color="#333", linestyle="--", linewidth=1.2,
	               label=f"avg = {AVG_SCORE:.3f}")
	    ax.set_ylim(0, 1.0)
	    ax.set_ylabel("Final score")
	    ax.set_title("Inference final score by task")
	    ax.legend(loc="upper right", frameon=False)
	    for bar, score, steps in zip(bars, SCORES, STEPS):
	        # Centre the annotation just above the top of each bar.
	        ax.text(
	            bar.get_x() + bar.get_width() / 2,
	            bar.get_height() + 0.015,
	            f"{score:.3f}\n({steps} steps)",
	            ha="center", va="bottom", fontsize=9,
	        )
	    ax.tick_params(axis="x", rotation=15)

	    # 2. Per-step rewards (grouped over step index)
	    ax = axes[1]
	    max_len = max(len(v) for v in PER_STEP_REWARDS.values())
	    width = 0.27
	    n_tasks = len(TASKS)
	    x_base = list(range(1, max_len + 1))
	    for i, task in enumerate(TASKS):
	        rewards = PER_STEP_REWARDS[task]
	        # Centre the group of bars on the integer step position; for three
	        # tasks this is the same as the offsets -width, 0, +width.
	        offset = (i - (n_tasks - 1) / 2) * width
	        xs = [step + offset for step in range(1, len(rewards) + 1)]
	        ax.bar(xs, rewards, width=width, color=COLORS[task],
	               label=task, edgecolor="black", linewidth=0.3)
	    ax.axhline(0, color="#666", linewidth=0.8)
	    ax.set_xlabel("Step")
	    ax.set_ylabel("Reward")
	    ax.set_title("Per-step reward by task")
	    # Label every second step to keep the axis readable.
	    ax.set_xticks(x_base[::2])
	    ax.legend(frameon=False, fontsize=9)

	    # Use AVG_SCORE rather than a hand-typed number so the title always
	    # agrees with the dashed average line.
	    fig.suptitle(
	        f"inference.py — meta-llama/Llama-3.3-70B-Instruct (avg score {AVG_SCORE:.3f})",
	        fontsize=12, fontweight="bold",
	    )
	    # Leave room at the top of the figure for the suptitle.
	    fig.tight_layout(rect=(0, 0, 1, 0.96))

	    png_path = OUT_DIR / "inference_results_matplotlib.png"
	    svg_path = OUT_DIR / "inference_results_matplotlib.svg"
	    fig.savefig(png_path, dpi=160, bbox_inches="tight")
	    fig.savefig(svg_path, bbox_inches="tight")
	    plt.close(fig)
	    print(f"[matplotlib] wrote {png_path}")
	    print(f"[matplotlib] wrote {svg_path}")


	# ---------- plotly ----------
	def plot_plotly() -> None:
	    """Render the LLM-only figure with plotly and save it as PNG and SVG.

	    Left subplot: final score per task with a dashed line at AVG_SCORE.
	    Right subplot: per-step rewards as grouped bars, one trace per task.

	    Writes inference_results_plotly.{png,svg} into OUT_DIR via
	    fig.write_image and prints the paths written.
	    """
	    fig = make_subplots(
	        rows=1, cols=2,
	        column_widths=[0.4, 0.6],
	        subplot_titles=("Final score by task", "Per-step reward by task"),
	    )

	    # 1. Final scores
	    fig.add_trace(
	        go.Bar(
	            x=TASKS,
	            y=SCORES,
	            marker_color=[COLORS[t] for t in TASKS],
	            text=[f"{s:.3f}<br>({n} steps)" for s, n in zip(SCORES, STEPS)],
	            textposition="outside",
	            name="Final score",
	            showlegend=False,
	        ),
	        row=1, col=1,
	    )
	    fig.add_hline(
	        y=AVG_SCORE, line_dash="dash", line_color="#333",
	        annotation_text=f"avg = {AVG_SCORE:.3f}",
	        annotation_position="top left",
	        row=1, col=1,
	    )

	    # 2. Per-step rewards (grouped bars)
	    for task in TASKS:
	        rewards = PER_STEP_REWARDS[task]
	        fig.add_trace(
	            go.Bar(
	                x=list(range(1, len(rewards) + 1)),
	                y=rewards,
	                name=task,
	                marker_color=COLORS[task],
	            ),
	            row=1, col=2,
	        )

	    fig.update_yaxes(title_text="Final score", range=[0, 1.0], row=1, col=1)
	    fig.update_yaxes(title_text="Reward", row=1, col=2)
	    fig.update_xaxes(title_text="Step", row=1, col=2)
	    fig.update_layout(
	        title=dict(
	            # Use AVG_SCORE rather than a hand-typed number so the title
	            # always agrees with the dashed average line.
	            text=(
	                "inference.py — meta-llama/Llama-3.3-70B-Instruct "
	                f"(avg score {AVG_SCORE:.3f})"
	            ),
	            x=0.5, xanchor="center",
	        ),
	        barmode="group",
	        bargap=0.2,
	        template="plotly_white",
	        width=1300,
	        height=560,
	        # Horizontal legend placed below the plots.
	        legend=dict(orientation="h", y=-0.18, x=0.5, xanchor="center"),
	        margin=dict(t=80, b=80, l=60, r=30),
	    )

	    png_path = OUT_DIR / "inference_results_plotly.png"
	    svg_path = OUT_DIR / "inference_results_plotly.svg"
	    fig.write_image(png_path, scale=2)
	    fig.write_image(svg_path)
	    print(f"[plotly] wrote {png_path}")
	    print(f"[plotly] wrote {svg_path}")


	# ---------- baseline comparison: matplotlib ----------
	def plot_baselines_matplotlib() -> None:
	    """Render the baseline-vs-LLM comparison with matplotlib (PNG and SVG).

	    Left panel: per-task normalized score as grouped bars, one bar per
	    agent in AGENT_LABELS. Right panel: each agent's mean score over all
	    tasks.

	    Writes baseline_comparison_matplotlib.{png,svg} into OUT_DIR and prints
	    the paths written.
	    """
	    fig, axes = plt.subplots(1, 2, figsize=(13.5, 5.4))

	    # 1. Grouped bars per task
	    ax = axes[0]
	    n_agents = len(AGENT_LABELS)
	    # Each task's group spans 0.8 x-units in total (0.2 per bar for 4 agents).
	    width = 0.8 / n_agents
	    x = list(range(len(TASKS)))
	    for i, agent in enumerate(AGENT_LABELS):
	        # Shift each agent's bars so the group is centred on the task tick.
	        offset = (i - (n_agents - 1) / 2) * width
	        xs = [xi + offset for xi in x]
	        bars = ax.bar(
	            xs, AGENT_SCORES[agent], width=width,
	            color=AGENT_COLORS[agent], label=agent,
	            edgecolor="black", linewidth=0.4,
	        )
	        for bar, val in zip(bars, AGENT_SCORES[agent]):
	            ax.text(
	                bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.012,
	                f"{val:.2f}", ha="center", va="bottom", fontsize=7.5,
	            )
	    ax.set_xticks(x)
	    ax.set_xticklabels(TASKS, rotation=10)
	    ax.set_ylim(0, 1.0)
	    ax.set_ylabel("Normalized score")
	    ax.set_title("Per-task score: baselines vs LLM")
	    ax.legend(frameon=False, fontsize=8.5, loc="upper right")

	    # 2. Average score across all tasks
	    ax = axes[1]
	    avgs = [AGENT_AVG[a] for a in AGENT_LABELS]
	    colors = [AGENT_COLORS[a] for a in AGENT_LABELS]
	    bars = ax.bar(AGENT_LABELS, avgs, color=colors, edgecolor="black", linewidth=0.6)
	    for bar, val in zip(bars, avgs):
	        ax.text(
	            bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.012,
	            f"{val:.3f}", ha="center", va="bottom", fontsize=10, fontweight="bold",
	        )
	    ax.set_ylim(0, 1.0)
	    # Task count comes from TASKS so the label stays correct if tasks change.
	    ax.set_ylabel(f"Mean score ({len(TASKS)} tasks)")
	    ax.set_title("Average score across all tasks")
	    ax.tick_params(axis="x", rotation=12)

	    fig.suptitle(
	        "Baseline agents vs LLM — score = 0.7·bug_ratio + 0.3·coverage_ratio",
	        fontsize=12, fontweight="bold",
	    )
	    # Leave room at the top of the figure for the suptitle.
	    fig.tight_layout(rect=(0, 0, 1, 0.95))

	    png_path = OUT_DIR / "baseline_comparison_matplotlib.png"
	    svg_path = OUT_DIR / "baseline_comparison_matplotlib.svg"
	    fig.savefig(png_path, dpi=160, bbox_inches="tight")
	    fig.savefig(svg_path, bbox_inches="tight")
	    plt.close(fig)
	    print(f"[matplotlib] wrote {png_path}")
	    print(f"[matplotlib] wrote {svg_path}")


	# ---------- baseline comparison: plotly ----------
	def plot_baselines_plotly() -> None:
	    """Render the baseline-vs-LLM comparison with plotly (PNG and SVG).

	    Left subplot: per-task normalized score as grouped bars, one trace per
	    agent in AGENT_LABELS. Right subplot: each agent's mean score over all
	    tasks.

	    Writes baseline_comparison_plotly.{png,svg} into OUT_DIR via
	    fig.write_image and prints the paths written.
	    """
	    fig = make_subplots(
	        rows=1, cols=2,
	        column_widths=[0.62, 0.38],
	        subplot_titles=("Per-task score: baselines vs LLM", "Average score across all tasks"),
	    )

	    # 1. Grouped bars per task
	    for agent in AGENT_LABELS:
	        fig.add_trace(
	            go.Bar(
	                x=TASKS,
	                y=AGENT_SCORES[agent],
	                name=agent,
	                marker_color=AGENT_COLORS[agent],
	                text=[f"{v:.2f}" for v in AGENT_SCORES[agent]],
	                textposition="outside",
	                legendgroup=agent,
	            ),
	            row=1, col=1,
	        )

	    # 2. Average score
	    avgs = [AGENT_AVG[a] for a in AGENT_LABELS]
	    fig.add_trace(
	        go.Bar(
	            x=AGENT_LABELS,
	            y=avgs,
	            marker_color=[AGENT_COLORS[a] for a in AGENT_LABELS],
	            text=[f"{v:.3f}" for v in avgs],
	            textposition="outside",
	            # The agents are already named on this subplot's x-axis.
	            showlegend=False,
	        ),
	        row=1, col=2,
	    )

	    fig.update_yaxes(title_text="Normalized score", range=[0, 1.05], row=1, col=1)
	    # Task count comes from TASKS so the label stays correct if tasks change.
	    fig.update_yaxes(
	        title_text=f"Mean score ({len(TASKS)} tasks)",
	        range=[0, 1.05], row=1, col=2,
	    )
	    fig.update_layout(
	        title=dict(
	            text="Baseline agents vs LLM — score = 0.7·bug_ratio + 0.3·coverage_ratio",
	            x=0.5, xanchor="center",
	        ),
	        barmode="group",
	        bargap=0.18,
	        template="plotly_white",
	        width=1400,
	        height=580,
	        # Horizontal legend placed below the plots.
	        legend=dict(orientation="h", y=-0.18, x=0.5, xanchor="center"),
	        margin=dict(t=80, b=90, l=60, r=30),
	    )

	    png_path = OUT_DIR / "baseline_comparison_plotly.png"
	    svg_path = OUT_DIR / "baseline_comparison_plotly.svg"
	    fig.write_image(png_path, scale=2)
	    fig.write_image(svg_path)
	    print(f"[plotly] wrote {png_path}")
	    print(f"[plotly] wrote {svg_path}")


	# Script entry point: generate all four figures (LLM-only and baseline
	# comparison, each rendered with matplotlib and with plotly).
	if __name__ == "__main__":
	    plot_matplotlib()
	    plot_plotly()
	    plot_baselines_matplotlib()
	    plot_baselines_plotly()