"""Generate judge-friendly SVG plots from evaluation comparison CSV.
This module intentionally avoids matplotlib to keep plotting deterministic
in restricted CI/sandbox environments.
"""
from __future__ import annotations
import csv
from pathlib import Path
# Directory (relative to the current working directory) that holds the input
# CSV and receives the generated SVG files.
ARTIFACT_DIR = Path("artifacts/evals")
# Comparison CSV read by _load_rows(); the plot functions read the columns
# task_id, baseline_reward, improved_reward, baseline_violations and
# improved_violations from its rows.
COMPARISON_CSV = ARTIFACT_DIR / "comparison.csv"
def _load_rows() -> list[dict[str, str]]:
    """Read every data row of ``COMPARISON_CSV`` into a list of dicts.

    The first CSV line is used as the header, so each returned dict maps a
    column name to the raw (string) cell value of that row.

    Returns:
        One dict per data row, in file order. Empty if the file only has a
        header (or is empty).

    Raises:
        OSError: If the CSV file cannot be opened (e.g. it does not exist).
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # altered by universal-newline translation. The explicit encoding keeps
    # parsing independent of the machine's locale.
    with COMPARISON_CSV.open(newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))
def _svg_header(width: int, height: int) -> list[str]:
    """Return the opening lines of an SVG document of the given pixel size.

    The root ``<svg>`` element is left open; callers append their shapes and
    then the lines from ``_svg_footer()`` to close the document.

    Args:
        width: Canvas width in pixels.
        height: Canvas height in pixels.

    Returns:
        A new list holding the ``<svg>`` opening tag followed by a white
        background rectangle covering the whole canvas.
    """
    return [
        (
            f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" '
            f'height="{height}" viewBox="0 0 {width} {height}" '
            f'font-family="Arial, Helvetica, sans-serif">'
        ),
        f'<rect x="0" y="0" width="{width}" height="{height}" fill="#ffffff"/>',
    ]


def _svg_footer() -> list[str]:
    """Return the lines that close a document started by ``_svg_header``."""
    return ["</svg>"]
def plot_reward_by_task(rows: list[dict[str, str]]) -> None:
    """Write a grouped bar chart of baseline vs improved reward per task.

    One pair of bars is drawn per row on a fixed 0.0-1.0 reward axis and the
    document is saved as ``reward_by_task.svg`` inside ``ARTIFACT_DIR``.

    Args:
        rows: CSV records; each must provide ``task_id``, ``baseline_reward``
            and ``improved_reward`` (rewards as float-parseable strings).

    Raises:
        KeyError: If a row lacks one of the required columns.
        ValueError: If a reward value cannot be parsed as a float.
    """
    # Imported locally so the function is self-contained. Task ids come
    # straight from the CSV and must be XML-escaped before being embedded.
    from xml.sax.saxutils import escape

    tasks = [row["task_id"] for row in rows]
    baseline = [float(row["baseline_reward"]) for row in rows]
    improved = [float(row["improved_reward"]) for row in rows]

    width, height = 1360, 520
    left, right, top, bottom = 80, 40, 70, 110
    plot_w = width - left - right
    plot_h = height - top - bottom
    axis_y = top + plot_h  # y coordinate of the reward == 0 baseline
    axis_x_end = left + plot_w
    # max(..., 1) avoids a division by zero when there are no rows.
    group_w = plot_w / max(len(tasks), 1)
    bar_w = max(group_w * 0.32, 10)

    def bar_height(reward: float) -> float:
        # The axis is fixed at 0..1; clamp so an out-of-range reward cannot
        # produce a negative height (invalid for <rect>) or leave the plot.
        return min(max(reward, 0.0), 1.0) * plot_h

    lines = _svg_header(width, height)
    lines.append(
        f'<text x="{width // 2}" y="34" text-anchor="middle" font-size="22" '
        'font-weight="bold">Baseline vs Improved Reward by Task</text>'
    )
    # Y axis, then X axis.
    lines.append(
        f'<line x1="{left}" y1="{top}" x2="{left}" y2="{axis_y}" stroke="#333333"/>'
    )
    lines.append(
        f'<line x1="{left}" y1="{axis_y}" x2="{axis_x_end}" y2="{axis_y}" '
        f'stroke="#333333"/>'
    )

    # Horizontal grid lines and labels at 0.0, 0.2, ..., 1.0.
    for tick in range(0, 6):
        value = tick / 5
        y = top + plot_h - (value * plot_h)
        lines.append(
            f'<line x1="{left}" y1="{y:.2f}" x2="{axis_x_end}" y2="{y:.2f}" '
            f'stroke="#dddddd"/>'
        )
        lines.append(
            f'<text x="{left - 10}" y="{y + 4:.2f}" text-anchor="end" '
            f'font-size="12">{value:.1f}</text>'
        )

    for idx, (task, b_val, i_val) in enumerate(zip(tasks, baseline, improved)):
        gx = left + (idx * group_w) + (group_w * 0.5)  # centre of the group
        b_h = bar_height(b_val)
        i_h = bar_height(i_val)
        # Baseline bar sits left of the group centre, improved bar right of
        # it, with a 4px gap between the two.
        b_x = gx - bar_w - 2
        i_x = gx + 2
        b_y = axis_y - b_h
        i_y = axis_y - i_h
        lines.append(
            f'<rect x="{b_x:.2f}" y="{b_y:.2f}" width="{bar_w:.2f}" '
            f'height="{b_h:.2f}" fill="#9ca3af"/>'
        )
        lines.append(
            f'<rect x="{i_x:.2f}" y="{i_y:.2f}" width="{bar_w:.2f}" '
            f'height="{i_h:.2f}" fill="#2563eb"/>'
        )
        # Labels are rotated around their anchor so long task ids do not
        # collide with the neighbouring group.
        lines.append(
            f'<text x="{gx:.2f}" y="{axis_y + 16}" text-anchor="end" font-size="11" '
            f'transform="rotate(-35 {gx:.2f} {axis_y + 16})">{escape(task)}</text>'
        )

    legend_y = 52
    lines.append(
        f'<rect x="{left}" y="{legend_y - 11}" width="14" height="14" fill="#9ca3af"/>'
    )
    lines.append(f'<text x="{left + 20}" y="{legend_y}" font-size="13">Baseline</text>')
    lines.append(
        f'<rect x="{left + 100}" y="{legend_y - 11}" width="14" height="14" '
        f'fill="#2563eb"/>'
    )
    lines.append(f'<text x="{left + 120}" y="{legend_y}" font-size="13">Improved</text>')

    lines.extend(_svg_footer())
    # SVG without an XML declaration is decoded as UTF-8, so write it that
    # way instead of relying on the locale's default encoding.
    (ARTIFACT_DIR / "reward_by_task.svg").write_text(
        "\n".join(lines), encoding="utf-8"
    )
def plot_violation_before_after(rows: list[dict[str, str]]) -> None:
    """Write a line chart of commitment violations before vs after per task.

    Two polylines (baseline and improved) are drawn over the tasks in row
    order and the document is saved as ``violations_before_after.svg``
    inside ``ARTIFACT_DIR``.

    Args:
        rows: CSV records; each must provide ``task_id``,
            ``baseline_violations`` and ``improved_violations`` (the counts
            as int-parseable strings).

    Raises:
        KeyError: If a row lacks one of the required columns.
        ValueError: If a violation count cannot be parsed as an int.
    """
    # Imported locally so the function is self-contained. Task ids come
    # straight from the CSV and must be XML-escaped before being embedded.
    from xml.sax.saxutils import escape

    tasks = [row["task_id"] for row in rows]
    baseline = [int(row["baseline_violations"]) for row in rows]
    improved = [int(row["improved_violations"]) for row in rows]
    # The trailing 1 keeps the scale non-zero when every count is 0 (or
    # there are no rows), so point_y never divides by zero.
    max_v = max(max(baseline, default=0), max(improved, default=0), 1)

    width, height = 1360, 500
    left, right, top, bottom = 80, 40, 70, 100
    plot_w = width - left - right
    plot_h = height - top - bottom
    axis_y = top + plot_h
    axis_x_end = left + plot_w

    def point_x(idx: int) -> float:
        # Spread points evenly over the full width; a single task sits at
        # the left edge because the denominator is floored at 1.
        return left + (idx / max(len(tasks) - 1, 1)) * plot_w

    def point_y(value: int) -> float:
        return top + plot_h - ((value / max_v) * plot_h)

    lines = _svg_header(width, height)
    lines.append(
        f'<text x="{width // 2}" y="34" text-anchor="middle" font-size="22" '
        'font-weight="bold">Commitment Violations Before vs After</text>'
    )
    # Y axis, then X axis.
    lines.append(
        f'<line x1="{left}" y1="{top}" x2="{left}" y2="{axis_y}" stroke="#333333"/>'
    )
    lines.append(
        f'<line x1="{left}" y1="{axis_y}" x2="{axis_x_end}" y2="{axis_y}" '
        f'stroke="#333333"/>'
    )

    # One tick per integer is unreadable for large counts, so cap the number
    # of ticks at roughly ten (step == 1 whenever max_v <= 10).
    tick_step = max(1, -(-max_v // 10))
    for tick in range(0, max_v + 1, tick_step):
        y = point_y(tick)
        lines.append(
            f'<line x1="{left}" y1="{y:.2f}" x2="{axis_x_end}" y2="{y:.2f}" '
            f'stroke="#dddddd"/>'
        )
        lines.append(
            f'<text x="{left - 10}" y="{y + 4:.2f}" text-anchor="end" '
            f'font-size="12">{tick}</text>'
        )

    baseline_points = " ".join(
        f"{point_x(i):.2f},{point_y(v):.2f}" for i, v in enumerate(baseline)
    )
    improved_points = " ".join(
        f"{point_x(i):.2f},{point_y(v):.2f}" for i, v in enumerate(improved)
    )
    lines.append(
        f'<polyline points="{baseline_points}" fill="none" stroke="#dc2626" '
        'stroke-width="2"/>'
    )
    lines.append(
        f'<polyline points="{improved_points}" fill="none" stroke="#16a34a" '
        'stroke-width="2"/>'
    )

    for i, (task, b_val, i_val) in enumerate(zip(tasks, baseline, improved)):
        x = point_x(i)
        lines.append(
            f'<circle cx="{x:.2f}" cy="{point_y(b_val):.2f}" r="4" fill="#dc2626"/>'
        )
        lines.append(
            f'<circle cx="{x:.2f}" cy="{point_y(i_val):.2f}" r="4" fill="#16a34a"/>'
        )
        # Labels are rotated around their anchor so long task ids do not
        # collide with the neighbouring point.
        lines.append(
            f'<text x="{x:.2f}" y="{axis_y + 16}" text-anchor="end" font-size="11" '
            f'transform="rotate(-35 {x:.2f} {axis_y + 16})">{escape(task)}</text>'
        )

    legend_y = 52
    lines.append(
        f'<rect x="{left}" y="{legend_y - 11}" width="14" height="14" fill="#dc2626"/>'
    )
    lines.append(f'<text x="{left + 20}" y="{legend_y}" font-size="13">Baseline</text>')
    lines.append(
        f'<rect x="{left + 100}" y="{legend_y - 11}" width="14" height="14" '
        f'fill="#16a34a"/>'
    )
    lines.append(f'<text x="{left + 120}" y="{legend_y}" font-size="13">Improved</text>')

    lines.extend(_svg_footer())
    # SVG without an XML declaration is decoded as UTF-8, so write it that
    # way instead of relying on the locale's default encoding.
    (ARTIFACT_DIR / "violations_before_after.svg").write_text(
        "\n".join(lines), encoding="utf-8"
    )
def main() -> None:
    """Load the comparison rows once and render every plot from them."""
    records = _load_rows()
    # Plots are rendered in a fixed order: rewards first, then violations.
    for render in (plot_reward_by_task, plot_violation_before_after):
        render(records)
    print("Wrote SVG plots to", ARTIFACT_DIR)
# Run the plot generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()