# codecourt/scripts/evaluate.py
# ayussssssiiii's picture
# Initial HF Space snapshot
# fcb838d
"""
Evaluation script — generates before/after comparison plots.
Run after training to produce the graphs for your README and presentation.
Usage:
    python scripts/evaluate.py \
        --baseline ./outputs/baseline_results.json \
        --trained ./outputs/grpo_solver/training_log_history.json \
        --output ./outputs/plots/
"""
import sys
import os
import json
import argparse
from pathlib import Path
# Select matplotlib's "Agg" backend through MPLBACKEND, unless the caller
# has already set that environment variable.
os.environ.setdefault("MPLBACKEND", "Agg")
# Put the parent of this script's directory at the front of sys.path
# (presumably so project-local packages import when run as a script).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def load_json(path: str) -> dict:
with open(path) as f:
return json.load(f)
def smooth(arr, window=10):
    """Return the trailing moving average of *arr*.

    Element ``k`` of the result is the mean of the (at most) ``window``
    values of ``arr`` ending at position ``k``; the first few entries
    average over however many values are available so far.
    """
    averaged = []
    for end in range(1, len(arr) + 1):
        # Window covers arr[begin:end], clipped at the start of the data.
        begin = max(0, end - window)
        span = arr[begin:end]
        averaged.append(sum(span) / (end - begin))
    return averaged
def plot_reward_curves(history: list, output_dir: str):
    """Render the 2x2 training-metrics figure and save ``training_curves.png``.

    The record layout is chosen by inspecting the first record:

    * Episode records (first record has an ``"episode"`` key): plots the
      smoothed ``setter_reward`` / ``solver_reward``, the smoothed
      ``solver_pass_rate`` (missing values count as 0), Elo ratings when
      the first record carries ``setter_elo`` / ``solver_elo``, and a
      binned, stacked distribution of the ``outcome`` field.
    * Step records (anything else): only records that have ``"step"`` and
      at least one of ``"reward"`` / ``"reward_pass_rate"`` are used. The
      Elo and outcome panels show an explanatory message instead.

    Args:
        history: List of per-episode or per-step metric dictionaries.
        output_dir: Directory the PNG is written to; created if missing.

    Returns:
        None. Prints a warning and returns early when ``history`` is empty,
        matplotlib cannot be imported, or no plottable step record exists.
    """
    # Guard first: the layout detection below indexes history[0].
    if not history:
        print("⚠ No training history to plot")
        return

    try:
        import matplotlib.pyplot as plt
        import matplotlib.style as style
        style.use('dark_background')
    except ImportError:
        print("⚠ matplotlib not installed. pip install matplotlib")
        return

    if "episode" in history[0]:
        x_axis = [r["episode"] for r in history]
        setter_rewards = smooth([r["setter_reward"] for r in history])
        solver_rewards = smooth([r["solver_reward"] for r in history])
        pass_rates = smooth([r.get("solver_pass_rate", 0) for r in history])
        setter_elo = [r["setter_elo"] for r in history] if "setter_elo" in history[0] else None
        solver_elo = [r["solver_elo"] for r in history] if "solver_elo" in history[0] else None
        outcomes_source = history
        x_label = "Episode"
    else:
        log_records = [
            r for r in history
            if "step" in r and ("reward" in r or "reward_pass_rate" in r)
        ]
        if not log_records:
            print("⚠ No plottable training metrics found")
            return
        x_axis = [r["step"] for r in log_records]
        # All-zero setter rewards make the reward panel skip the setter line.
        setter_rewards = [0.0 for _ in log_records]
        solver_rewards = smooth([r.get("reward", 0.0) for r in log_records])
        pass_rates = smooth([r.get("reward_pass_rate", 0.0) for r in log_records])
        setter_elo = None
        solver_elo = None
        outcomes_source = []
        x_label = "Training Step"

    fig, axes = plt.subplots(2, 2, figsize=(14, 9))
    fig.patch.set_facecolor('#0d0d0d')
    fig.suptitle('CodeCourt — Training Metrics', color='white',
                 fontsize=16, fontweight='bold', y=0.98)

    COLORS = {
        'setter': '#ff6b35',
        'solver': '#4ecdc4',
        'invalid': '#888888',
        'grid': '#333333',
        'text': '#cccccc',
    }

    def style_ax(ax, title, xlabel, ylabel):
        """Apply the shared dark theme, title and axis labels to *ax*."""
        ax.set_facecolor('#1a1a1a')
        ax.set_title(title, color='white', fontsize=11, pad=8)
        ax.set_xlabel(xlabel, color=COLORS['text'], fontsize=9)
        ax.set_ylabel(ylabel, color=COLORS['text'], fontsize=9)
        ax.tick_params(colors=COLORS['text'])
        ax.grid(True, color=COLORS['grid'], linewidth=0.5, alpha=0.7)
        for spine in ax.spines.values():
            spine.set_color('#444444')

    # 1. Reward curves
    ax = axes[0, 0]
    if any(value != 0.0 for value in setter_rewards):
        ax.plot(x_axis, setter_rewards, color=COLORS['setter'],
                linewidth=1.5, label='Setter Reward')
    ax.plot(x_axis, solver_rewards, color=COLORS['solver'],
            linewidth=1.5, label='Solver Reward')
    ax.axhline(0, color='#555555', linewidth=0.8, linestyle='--')
    ax.legend(facecolor='#2a2a2a', edgecolor='#555555',
              labelcolor='white', fontsize=9)
    style_ax(ax, 'Reward Curves (smoothed, window=10)',
             x_label, 'Avg Reward')

    # 2. Solver pass rate over time
    ax = axes[0, 1]
    ax.plot(x_axis, [p * 100 for p in pass_rates],
            color=COLORS['solver'], linewidth=1.5)
    ax.set_ylim(0, 105)
    ax.axhline(50, color='#ffaa00', linewidth=0.8, linestyle='--',
               label='50% baseline')
    ax.legend(facecolor='#2a2a2a', edgecolor='#555555',
              labelcolor='white', fontsize=9)
    style_ax(ax, 'Solver Pass Rate (%)', x_label, 'Pass Rate %')

    # 3. Elo ratings
    ax = axes[1, 0]
    if setter_elo is not None and solver_elo is not None:
        ax.plot(x_axis, setter_elo, color=COLORS['setter'],
                linewidth=1.5, label='Setter Elo')
        ax.plot(x_axis, solver_elo, color=COLORS['solver'],
                linewidth=1.5, label='Solver Elo')
        ax.axhline(1000, color='#555555', linewidth=0.8, linestyle='--')
        ax.legend(facecolor='#2a2a2a', edgecolor='#555555',
                  labelcolor='white', fontsize=9)
        style_ax(ax, 'Elo Rating Progression', x_label, 'Elo Rating')
    else:
        ax.text(0.5, 0.5, 'GRPO run logs reward metrics,\nnot match Elo.',
                ha='center', va='center', color='white', fontsize=11)
        ax.set_axis_off()

    # 4. Outcome distribution (stacked bar, binned)
    ax = axes[1, 1]
    if not outcomes_source:
        ax.text(0.5, 0.5, 'Outcome bins are available for\nlegacy episode runs only.',
                ha='center', va='center', color='white', fontsize=11)
        ax.set_axis_off()
    else:
        # Aim for roughly 20 bins regardless of run length.
        bin_size = max(1, len(outcomes_source) // 20)
        bins = []
        setter_wins_pct = []
        solver_wins_pct = []
        invalid_pct = []
        for i in range(0, len(outcomes_source), bin_size):
            chunk = outcomes_source[i:i + bin_size]
            if not chunk:
                continue
            bins.append(i)
            outcomes = [r["outcome"] for r in chunk]
            n = len(outcomes)
            setter_wins_pct.append(outcomes.count("setter_wins") / n * 100)
            solver_wins_pct.append(outcomes.count("solver_wins") / n * 100)
            invalid_pct.append(outcomes.count("invalid") / n * 100)

        bar_width = bin_size * 0.8
        ax.bar(bins, setter_wins_pct, width=bar_width,
               color=COLORS['setter'], alpha=0.8, label='Setter Wins')
        ax.bar(bins, solver_wins_pct, width=bar_width,
               bottom=setter_wins_pct, color=COLORS['solver'],
               alpha=0.8, label='Solver Wins')
        # Invalid episodes were tallied but never drawn; stack them on top
        # so each bar accounts for every counted outcome.
        stacked_base = [a + b for a, b in zip(setter_wins_pct, solver_wins_pct)]
        ax.bar(bins, invalid_pct, width=bar_width,
               bottom=stacked_base, color=COLORS['invalid'],
               alpha=0.8, label='Invalid')
        ax.set_ylim(0, 105)
        ax.legend(facecolor='#2a2a2a', edgecolor='#555555',
                  labelcolor='white', fontsize=9)
        style_ax(ax, 'Outcome Distribution Over Time',
                 'Episode', 'Percentage %')

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, 'training_curves.png')
    # Save and close this specific figure rather than the implicit current one.
    fig.savefig(out_path, dpi=150, bbox_inches='tight',
                facecolor=fig.get_facecolor())
    print(f"✓ Saved: {out_path}")
    plt.close(fig)
def plot_before_after(baseline: dict, trained_history: list, output_dir: str):
    """Before/after comparison — the killer demo chart.

    Draws three two-bar panels (solver pass rate, average solver reward,
    setter win rate) comparing baseline numbers against the trained run and
    saves them as ``before_after.png``.

    Trained values are averages over the last quarter of the records. For
    episode records (first record has ``"episode"``) they come from
    ``solver_pass_rate``, ``solver_reward`` and ``outcome``; for step
    records they come from ``reward_pass_rate`` and ``reward``, and the
    setter win rate is fixed at 0.0.

    Args:
        baseline: Baseline results; metrics are read from its ``"summary"``
            entry when present, otherwise from the dict itself. Missing
            metrics fall back to 0.31 (pass rate), -15 (reward) and 0.4
            (setter win rate). ``None`` is treated as an empty dict.
        trained_history: List of per-episode or per-step metric dicts.
        output_dir: Directory the PNG is written to; created if missing.

    Returns:
        None. Prints a warning and returns early when ``trained_history``
        is empty or matplotlib cannot be imported.
    """
    # Guard first: the layout detection below indexes trained_history[0].
    if not trained_history:
        print("⚠ No training history to compare against the baseline")
        return

    try:
        import matplotlib.pyplot as plt
        import matplotlib.style as style
        style.use('dark_background')
    except ImportError:
        print("⚠ matplotlib not installed. pip install matplotlib")
        return

    # Compute trained metrics (last 25% of training)
    if "episode" in trained_history[0]:
        n = len(trained_history)
        last_quarter = trained_history[n * 3 // 4:]
        count = max(len(last_quarter), 1)
        trained_pass_rate = sum(
            r.get("solver_pass_rate", 0) for r in last_quarter
        ) / count
        trained_solver_reward = sum(r["solver_reward"] for r in last_quarter) / count
        trained_setter_win_rate = (
            sum(1 for r in last_quarter if r["outcome"] == "setter_wins")
            / count * 100
        )
    else:
        log_records = [
            r for r in trained_history
            if "step" in r and ("reward" in r or "reward_pass_rate" in r)
        ]
        last_quarter = log_records[len(log_records) * 3 // 4:]
        # max(..., 1) keeps the averages at 0 when no record qualifies.
        count = max(len(last_quarter), 1)
        trained_pass_rate = sum(
            r.get("reward_pass_rate", 0) for r in last_quarter
        ) / count
        trained_solver_reward = sum(r.get("reward", 0) for r in last_quarter) / count
        trained_setter_win_rate = 0.0

    baseline = baseline or {}
    baseline_summary = baseline.get("summary", baseline)
    baseline_pass = baseline_summary.get("avg_solver_pass_rate", 0.31)

    fig, axes = plt.subplots(1, 3, figsize=(14, 5))
    fig.patch.set_facecolor('#0d0d0d')
    fig.suptitle('CodeCourt — Before vs After Training',
                 color='white', fontsize=15, fontweight='bold')

    BEFORE = '#ff6b35'
    AFTER = '#4ecdc4'
    BG = '#1a1a1a'

    # (title, before value, after value, unit) for each panel.
    metrics = [
        ("Solver Pass Rate", baseline_pass * 100, trained_pass_rate * 100, "%"),
        (
            "Avg Solver Reward",
            baseline_summary.get("avg_solver_reward", -15),
            trained_solver_reward,
            "pts",
        ),
        (
            "Setter Win Rate",
            baseline_summary.get("setter_win_rate", 0.4) * 100,
            trained_setter_win_rate,
            "%",
        ),
    ]

    for ax, (title, before_val, after_val, unit) in zip(axes, metrics):
        ax.set_facecolor(BG)
        bars = ax.bar(['Before\n(Untrained)', 'After\n(Trained)'],
                      [before_val, after_val],
                      color=[BEFORE, AFTER], width=0.5,
                      edgecolor='#333333')
        # Value labels
        for bar, val in zip(bars, [before_val, after_val]):
            ax.text(bar.get_x() + bar.get_width() / 2,
                    bar.get_height() + abs(before_val) * 0.05,
                    f"{val:.1f}{unit}",
                    ha='center', va='bottom', color='white',
                    fontsize=13, fontweight='bold')
        ax.set_title(title, color='white', fontsize=11, pad=10)
        ax.tick_params(colors='#cccccc')
        ax.set_ylabel(unit, color='#cccccc', fontsize=9)
        ax.grid(True, axis='y', color='#333333', linewidth=0.5)
        for spine in ax.spines.values():
            spine.set_color('#444444')

    plt.tight_layout(rect=[0, 0, 1, 0.93])
    # Create the target directory here too, so the function works when
    # called on its own.
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, 'before_after.png')
    # Save and close this specific figure rather than the implicit current one.
    fig.savefig(out_path, dpi=150, bbox_inches='tight',
                facecolor=fig.get_facecolor())
    print(f"✓ Saved: {out_path}")
    plt.close(fig)
def build_evaluation_summary(baseline: dict | None, trained_history: list) -> dict:
log_records = [row for row in trained_history if isinstance(row, dict) and "step" in row]
baseline_summary = (baseline or {}).get("summary", baseline or {})
baseline_pass = baseline_summary.get("avg_solver_pass_rate")
baseline_reward = baseline_summary.get("avg_solver_reward")
if log_records:
final_record = log_records[-1]
trained_pass = final_record.get("reward_pass_rate")
trained_reward = final_record.get("reward")
trained_robustness = final_record.get("reward_robustness")
if trained_pass is None:
pass_values = [row.get("reward_pass_rate") for row in log_records if row.get("reward_pass_rate") is not None]
trained_pass = pass_values[-1] if pass_values else None
if trained_reward is None:
reward_values = [row.get("reward") for row in log_records if row.get("reward") is not None]
trained_reward = reward_values[-1] if reward_values else None
if trained_robustness is None:
robustness_values = [
row.get("reward_robustness")
for row in log_records
if row.get("reward_robustness") is not None
]
trained_robustness = robustness_values[-1] if robustness_values else None
setter_win_rate = None
else:
episodes = [row for row in trained_history if isinstance(row, dict) and "episode" in row]
tail = episodes[len(episodes) * 3 // 4:] if episodes else []
trained_pass = (
sum(row.get("solver_pass_rate", 0) for row in tail) / max(len(tail), 1)
if tail else None
)
trained_reward = (
sum(row.get("solver_reward", 0) for row in tail) / max(len(tail), 1)
if tail else None
)
trained_robustness = None
setter_win_rate = (
sum(1 for row in tail if row.get("outcome") == "setter_wins") / max(len(tail), 1)
if tail else None
)
return {
"baseline_pass_rate": baseline_pass,
"trained_pass_rate": trained_pass,
"pass_rate_delta": (trained_pass - baseline_pass) if baseline_pass is not None and trained_pass is not None else None,
"baseline_reward": baseline_reward,
"trained_reward": trained_reward,
"reward_delta": (trained_reward - baseline_reward) if baseline_reward is not None and trained_reward is not None else None,
"trained_robustness": trained_robustness,
"trained_setter_win_rate": setter_win_rate,
}
def generate_reports(baseline_path: Path | None, trained_path: Path, output_dir: Path):
    """Produce every evaluation artifact for one training run.

    Loads the training log, writes the reward-curve plot, writes the
    before/after plot when a baseline file exists, and saves the numeric
    summary as ``evaluation_summary.json`` inside ``output_dir``.

    Args:
        baseline_path: Baseline results file, or ``None`` to skip the
            before/after comparison plot.
        trained_path: Training log JSON; either a list of records or an
            object whose ``"episodes"`` entry holds them.
        output_dir: Directory for all outputs; created if missing.

    Returns:
        The summary dictionary that was written to disk.
    """
    payload = load_json(str(trained_path))
    if isinstance(payload, list):
        history = payload
    else:
        # Fall back to the object itself when there is no "episodes" entry.
        history = payload.get("episodes", payload)

    os.makedirs(output_dir, exist_ok=True)
    plot_reward_curves(history, str(output_dir))

    baseline = None
    if baseline_path and baseline_path.exists():
        baseline = load_json(str(baseline_path))
        plot_before_after(baseline, history, str(output_dir))

    summary = build_evaluation_summary(baseline, history)
    summary_path = output_dir / "evaluation_summary.json"
    with open(summary_path, "w") as handle:
        handle.write(json.dumps(summary, indent=2))
    print(f"✓ Saved: {summary_path}")
    return summary
def main():
    """Command-line entry point: parse options, check inputs, build reports.

    Exits early with a hint when the training log is missing. A missing
    baseline only produces a warning; the remaining reports are still
    generated.
    """
    parser = argparse.ArgumentParser()
    option_defaults = (
        ("--baseline", "./outputs/baseline_results.json"),
        ("--trained", "./outputs/grpo_solver/training_log_history.json"),
        ("--output", "./outputs/plots/"),
    )
    for flag, default in option_defaults:
        parser.add_argument(flag, type=str, default=default)
    opts = parser.parse_args()

    print("\n CodeCourt Evaluation")
    print("=" * 50)

    # Load data
    if not os.path.exists(opts.trained):
        print(f"⚠ No training history at {opts.trained}")
        print(" Run: python scripts/train.py --train-samples 54 --max-steps 30")
        return

    # Loaded here only to report how many records the log holds.
    loaded = load_json(opts.trained)
    if isinstance(loaded, list):
        history = loaded
    else:
        history = loaded.get("episodes", loaded)
    print(f"Loaded {len(history)} training episodes")

    if os.path.exists(opts.baseline):
        baseline_path = Path(opts.baseline)
    else:
        baseline_path = None
        print(f"⚠ No baseline at {opts.baseline} — before/after chart will be skipped")
        print(" Run: python scripts/baseline.py")

    generate_reports(baseline_path, Path(opts.trained), Path(opts.output))
    print(f"\n✓ All plots saved to: {opts.output}")


if __name__ == "__main__":
    main()