"""Generate judge-friendly SVG plots from evaluation comparison CSV. This module intentionally avoids matplotlib to keep plotting deterministic in restricted CI/sandbox environments. """ from __future__ import annotations import csv from pathlib import Path ARTIFACT_DIR = Path("artifacts/evals") COMPARISON_CSV = ARTIFACT_DIR / "comparison.csv" def _load_rows() -> list[dict[str, str]]: with COMPARISON_CSV.open() as f: return list(csv.DictReader(f)) def _svg_header(width: int, height: int) -> list[str]: return [ f'', '', ] def _svg_footer() -> list[str]: return [""] def plot_reward_by_task(rows: list[dict[str, str]]) -> None: tasks = [row["task_id"] for row in rows] baseline = [float(row["baseline_reward"]) for row in rows] improved = [float(row["improved_reward"]) for row in rows] width, height = 1360, 520 left, right, top, bottom = 80, 40, 70, 110 plot_w = width - left - right plot_h = height - top - bottom group_w = plot_w / max(len(tasks), 1) bar_w = max(group_w * 0.32, 10) lines = _svg_header(width, height) lines.append('Baseline vs Improved Reward by Task') lines.append(f'') lines.append(f'') for tick in range(0, 6): value = tick / 5 y = top + plot_h - (value * plot_h) lines.append(f'') lines.append(f'{value:.1f}') for idx, task in enumerate(tasks): gx = left + (idx * group_w) + (group_w * 0.5) b_h = baseline[idx] * plot_h i_h = improved[idx] * plot_h b_x = gx - bar_w - 2 i_x = gx + 2 b_y = top + plot_h - b_h i_y = top + plot_h - i_h lines.append(f'') lines.append(f'') lines.append( f'{task}' ) legend_y = 52 lines.append(f'') lines.append(f'Baseline') lines.append(f'') lines.append(f'Improved') lines.extend(_svg_footer()) (ARTIFACT_DIR / "reward_by_task.svg").write_text("\n".join(lines)) def plot_violation_before_after(rows: list[dict[str, str]]) -> None: tasks = [row["task_id"] for row in rows] baseline = [int(row["baseline_violations"]) for row in rows] improved = [int(row["improved_violations"]) for row in rows] max_v = max(max(baseline, default=0), max(improved, default=0), 1) width, height = 1360, 500 left, right, top, bottom = 80, 40, 70, 100 plot_w = width - left - right plot_h = height - top - bottom def point_x(idx: int) -> float: return left + (idx / max(len(tasks) - 1, 1)) * plot_w def point_y(value: int) -> float: return top + plot_h - ((value / max_v) * plot_h) lines = _svg_header(width, height) lines.append('Commitment Violations Before vs After') lines.append(f'') lines.append(f'') for tick in range(max_v + 1): y = point_y(tick) lines.append(f'') lines.append(f'{tick}') baseline_points = " ".join(f"{point_x(i):.2f},{point_y(v):.2f}" for i, v in enumerate(baseline)) improved_points = " ".join(f"{point_x(i):.2f},{point_y(v):.2f}" for i, v in enumerate(improved)) lines.append(f'') lines.append(f'') for i, task in enumerate(tasks): x = point_x(i) lines.append(f'') lines.append(f'') lines.append( f'{task}' ) legend_y = 52 lines.append(f'') lines.append(f'Baseline') lines.append(f'') lines.append(f'Improved') lines.extend(_svg_footer()) (ARTIFACT_DIR / "violations_before_after.svg").write_text("\n".join(lines)) def main() -> None: rows = _load_rows() plot_reward_by_task(rows) plot_violation_before_after(rows) print("Wrote SVG plots to", ARTIFACT_DIR) if __name__ == "__main__": main()