""" AP Commander — GRPO Training Script Tracks: overall reward, per-component rewards, decision distribution, format compliance, env errors, sample generations, reward curve. """ import os, json, re, random, time, datetime, collections # Reduce CUDA memory fragmentation — must be set before torch is imported os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True') class _StopTraining(Exception): """Raised inside reward fn when /app/stop_requested flag is found.""" import requests import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import numpy as np ENV_URL = 'https://pathikreet-ap-clerk-env.hf.space' MODEL_NAME = os.environ.get('MODEL_NAME', 'Qwen/Qwen2.5-7B-Instruct') NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', '6')) NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', '16')) # 32 OOMs on 7B/A10G; 16 fits (~16 GB est.) LOG_SAMPLES_EVERY = 20 # print a sample generation every N reward calls SYSTEM_PROMPT = """You are an AI Accounts Payable Clerk. Review the invoice, PO, and GRN, then output ONLY a single valid JSON object. No prose, no markdown, no explanation outside the JSON. Valid decisions: APPROVE_FULL | APPROVE_PARTIAL | REJECT | ESCALATE | QUERY_VENDOR Valid reason codes: MATCH_CONFIRMED | QUANTITY_MISMATCH | PRICE_DISCREPANCY | POLICY_VIOLATION | NO_PO_FOUND | DUPLICATE_INVOICE | VENDOR_MISMATCH | TAX_DISCREPANCY | PENDING_CLARIFICATION | MANAGER_REVIEW Example (do not copy — generate your own based on the actual invoice): {"decision": "REJECT", "approved_amount": 0.0, "reason_code": "NO_PO_FOUND", "explanation": "Invoice INV-2024-5821 rejected: no open PO found for vendor TechCorp. Policy Rule 5 mandates a valid OPEN PO before payment."} Your response must start with { and end with } with no other text.""" TRAIN_TASKS = [ 'easy_perfect_match', 'easy_no_po_found', 'medium_quantity_shortfall', 'medium_price_discrepancy', 'medium_split_delivery', 'medium_vendor_mismatch', 'hard_policy_violation', 'hard_duplicate_invoice', 'hard_partial_po_match', 'hard_tax_discrepancy', 'hard_currency_conversion', 'hard_manager_preapproval', 'hard_credit_memo', 'long_invoice_dispute', 'long_policy_migration', 'long_batch_reconciliation', 'long_manager_chain', 'long_fraud_investigation', 'long_audit_trail', 'long_multi_vendor_split', ] EVAL_TASKS = TRAIN_TASKS VALID_DECISIONS = {'APPROVE_FULL','APPROVE_PARTIAL','REJECT','ESCALATE','QUERY_VENDOR','HOLD'} VALID_REASON_CODES = {'MATCH_CONFIRMED','QUANTITY_MISMATCH','PRICE_DISCREPANCY','POLICY_VIOLATION', 'NO_PO_FOUND','DUPLICATE_INVOICE','VENDOR_MISMATCH','TAX_DISCREPANCY', 'PENDING_CLARIFICATION','MANAGER_REVIEW'} _TASK_DIFFICULTY = { 'easy_perfect_match': 'easy', 'easy_no_po_found': 'easy', 'medium_quantity_shortfall': 'medium', 'medium_price_discrepancy': 'medium', 'medium_split_delivery': 'medium', 'medium_vendor_mismatch': 'medium', 'hard_policy_violation': 'hard', 'hard_duplicate_invoice': 'hard', 'hard_partial_po_match': 'hard', 'hard_tax_discrepancy': 'hard', 'hard_currency_conversion': 'hard', 'hard_manager_preapproval': 'hard', 'hard_credit_memo': 'hard', 'long_invoice_dispute': 'long', 'long_policy_migration': 'long', 'long_batch_reconciliation': 'long', 'long_manager_chain': 'long', 'long_fraud_investigation': 'long', 'long_audit_trail': 'long', 'long_multi_vendor_split': 'long', } _DIFFICULTY_ORDER = ['easy', 'medium', 'hard', 'long'] _UNLOCK_THRESHOLDS = {'easy': 0.70, 'medium': 0.65, 'hard': 0.60} # ── Curriculum sampler ────────────────────────────────────────────────────────── class CurriculumSampler: """Tracks per-difficulty running mean; unlocks harder tiers once thresholds are met.""" def __init__(self): self._rewards: dict = collections.defaultdict(list) # task_id → [rewards] self.unlocked: set = {'easy'} def record(self, task_id: str, reward: float): self._rewards[task_id].append(reward) self._try_unlock() def mean_for_difficulty(self, diff: str) -> float: vals = [] for tid, d in _TASK_DIFFICULTY.items(): if d == diff: vals.extend(self._rewards.get(tid, [])) return sum(vals) / len(vals) if vals else 0.0 def _try_unlock(self): for i, diff in enumerate(_DIFFICULTY_ORDER[:-1]): if diff in self.unlocked: m = self.mean_for_difficulty(diff) if m >= _UNLOCK_THRESHOLDS.get(diff, 0.70): nxt = _DIFFICULTY_ORDER[i + 1] if nxt not in self.unlocked: self.unlocked.add(nxt) print(f'\n[CURRICULUM] Unlocked {nxt}! mean({diff})={m:.3f} ' f'>= threshold {_UNLOCK_THRESHOLDS[diff]}') def gate_task(self, task_id: str) -> str: if _TASK_DIFFICULTY.get(task_id, 'easy') in self.unlocked: return task_id easiest = [t for t, d in _TASK_DIFFICULTY.items() if d == 'easy'] return random.choice(easiest) def build_dataset_tasks(self) -> list: """Return (task_id, seed) pairs for all currently unlocked difficulties.""" rows = [] seeds_per_diff = {'easy': 10, 'medium': 5, 'hard': 2, 'long': 2} for task_id, diff in _TASK_DIFFICULTY.items(): if diff in self.unlocked: n = seeds_per_diff[diff] rows.extend([(task_id, s) for s in range(1, n + 1)]) return rows def status_line(self) -> str: parts = [] for d in _DIFFICULTY_ORDER: m = self.mean_for_difficulty(d) unlk = '✓' if d in self.unlocked else '✗' parts.append(f'{d}={m:.2f}{unlk}') return ' | '.join(parts) CURRICULUM = CurriculumSampler() # ── Per-step greedy follow-up policy ─────────────────────────────────────────── def _greedy_followup(obs_dict: dict) -> dict: """Scripted terminal action after an intermediate step, based on revealed context notes.""" notes = ' '.join(obs_dict.get('context_notes', [])).lower() total = abs(float(obs_dict.get('invoice', {}).get('invoice_total', 0) or 0)) # Manager / VP approved → APPROVE_FULL if any(k in notes for k in ('manager approved', 'vp approved', 'cfo approved', 'pre-approved', 'pre-approv', 'approved by')): return {'decision': 'APPROVE_FULL', 'approved_amount': total, 'reason_code': 'MATCH_CONFIRMED', 'explanation': f'Approval confirmed via escalation chain. Approving ${total:.2f}.'} # Compliance cleared → APPROVE_FULL if 'compliance' in notes and any(k in notes for k in ('cleared', 'approved', 'pass')): return {'decision': 'APPROVE_FULL', 'approved_amount': total, 'reason_code': 'MATCH_CONFIRMED', 'explanation': f'Compliance review cleared. Approving ${total:.2f}.'} # Fraudulent / duplicate / deny → REJECT if any(k in notes for k in ('fraudulent', 'duplicate', 'already paid', 'deny', 'invalid', 'false claim')): return {'decision': 'REJECT', 'approved_amount': 0.0, 'reason_code': 'DUPLICATE_INVOICE', 'explanation': 'Vendor response or audit confirms fraud/duplicate. Rejecting.'} # Compliance flagged / SOX violation → REJECT if any(k in notes for k in ('flagged', 'violation', 'sox', 'gdpr', 'non-compliant')): return {'decision': 'REJECT', 'approved_amount': 0.0, 'reason_code': 'POLICY_VIOLATION', 'explanation': 'Compliance review flagged a violation. Rejecting.'} # Confused vendor / ambiguous → ESCALATE if any(k in notes for k in ('confused', 'unclear', 'unable to confirm')): return {'decision': 'ESCALATE', 'approved_amount': 0.0, 'reason_code': 'MANAGER_REVIEW', 'explanation': 'Vendor response ambiguous. Escalating to manager.'} # Default: safe rejection return {'decision': 'REJECT', 'approved_amount': 0.0, 'reason_code': 'PENDING_CLARIFICATION', 'explanation': 'Could not resolve after investigation. Rejecting for safety.'} # ── Metrics tracker ──────────────────────────────────────────────────────────── class Metrics: def __init__(self): self.step = 0 self.reward_history = [] # (step, mean_reward) — overall self.loss_history = [] # (step, loss, grad_norm) — from TRL callback self.diff_reward_hist = collections.defaultdict(list) # diff → [(step, mean)] self.format_history = [] # (step, format_rate) — compliance over time self.episode_len_hist = [] # all episode lengths (for histogram) self.ep_len_by_task = collections.defaultdict(list) # task_id → [lengths] self.ep_len_history = [] # (step, mean_ep_len) — rolling mean per step self.decision_history = [] # [(step, Counter)] for stacked-bar over time self.decision_counts = collections.Counter() self.parse_failures = 0 self.env_errors = 0 self.format_scores = [] self.reward_by_task = collections.defaultdict(list) self.total_calls = 0 self._start_time = time.time() self._step_decisions = collections.Counter() # decisions in current step batch def log_step(self, rewards, decisions, format_ok_list, task_ids, errors, episode_lengths=None): self.step += 1 self.total_calls += len(rewards) mean_r = sum(rewards) / len(rewards) if rewards else 0.0 self.reward_history.append((self.step, mean_r)) diff_rewards: dict = collections.defaultdict(list) for tid, r in zip(task_ids, rewards): d = _TASK_DIFFICULTY.get(tid, 'easy') diff_rewards[d].append(r) for d, rs in diff_rewards.items(): self.diff_reward_hist[d].append((self.step, sum(rs) / len(rs))) for d in decisions: self.decision_counts[d] += 1 self._step_decisions[d] += 1 # Snapshot decision distribution every step for stacked-bar self.decision_history.append((self.step, dict(self._step_decisions))) fmt_ok_count = sum(1 for ok in format_ok_list if ok) fmt_rate = fmt_ok_count / len(format_ok_list) if format_ok_list else 0.0 self.format_history.append((self.step, fmt_rate)) for ok in format_ok_list: self.format_scores.append(1.0 if ok else 0.0) for tid, r in zip(task_ids, rewards): self.reward_by_task[tid].append(r) if episode_lengths: for tid, ep_len in zip(task_ids, episode_lengths): self.episode_len_hist.append(ep_len) self.ep_len_by_task[tid].append(ep_len) self.ep_len_history.append((self.step, sum(episode_lengths) / len(episode_lengths))) self.env_errors += errors self._flush_live() def _flush_live(self): recent = self.reward_history[-20:] recent_mean = sum(r for _, r in recent) / len(recent) if recent else 0.0 fmt_rate = sum(self.format_scores) / len(self.format_scores) if self.format_scores else 0.0 task_means = {t: round(sum(v)/len(v), 3) for t, v in self.reward_by_task.items()} elapsed = (time.time() - self._start_time) / 60 payload = { 'step': self.step, 'total_calls': self.total_calls, 'recent_mean': round(recent_mean, 4), 'format_rate': round(fmt_rate, 4), 'parse_failures': self.parse_failures, 'env_errors': self.env_errors, 'elapsed_min': round(elapsed, 1), 'reward_history': [{'step': s, 'reward': r} for s, r in self.reward_history], 'loss_history': [{'step': s, 'loss': l, 'grad_norm': g} for s, l, g in self.loss_history], 'format_history': [{'step': s, 'rate': r} for s, r in self.format_history], 'diff_reward_hist': {d: [{'step': s, 'reward': r} for s, r in v] for d, v in self.diff_reward_hist.items()}, 'ep_len_history': [{'step': s, 'mean_len': l} for s, l in self.ep_len_history], 'decision_counts': dict(self.decision_counts), 'task_means': task_means, } try: with open('/app/metrics_live.json', 'w') as f: json.dump(payload, f) except Exception: pass def print_summary(self): recent = self.reward_history[-10:] if self.reward_history else [] recent_mean = sum(r for _, r in recent) / len(recent) if recent else 0.0 fmt_rate = sum(self.format_scores) / len(self.format_scores) if self.format_scores else 0.0 print(f'\n[METRICS] step={self.step} | recent_reward={recent_mean:.3f} | ' f'format_ok={fmt_rate:.1%} | parse_fails={self.parse_failures} | ' f'env_errors={self.env_errors} | total_calls={self.total_calls}') top_decisions = self.decision_counts.most_common(5) print(f'[METRICS] decisions: {dict(top_decisions)}') if self.reward_by_task: task_means = {t: round(sum(v)/len(v), 3) for t, v in self.reward_by_task.items()} print(f'[METRICS] per_task_reward: {task_means}') def save_all_metrics_figures(self, run_dir: str): """Save six training metric figures (reward curve, difficulty curves, format rate, etc.).""" PALETTE = {'easy': '#3fb950', 'medium': '#d29922', 'hard': '#f85149', 'long': '#a371f7'} BG = '#0d1117' PANEL = '#161b22' GRID = '#21262d' TEXT = '#e6edf3' SUBTEXT = '#8b949e' ACCENT = '#58a6ff' def _setup(ax, xlabel='', ylabel='', title=''): ax.set_facecolor(PANEL) ax.tick_params(colors=TEXT, labelsize=8) for sp in ax.spines.values(): sp.set_color('#30363d') ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.yaxis.grid(True, color=GRID, linewidth=0.6, alpha=0.8) ax.xaxis.grid(True, color=GRID, linewidth=0.4, alpha=0.4) ax.set_axisbelow(True) if xlabel: ax.set_xlabel(xlabel, color=SUBTEXT, fontsize=9) if ylabel: ax.set_ylabel(ylabel, color=SUBTEXT, fontsize=9) if title: ax.set_title(title, color=TEXT, fontsize=10, fontweight='bold', pad=8) def _smooth(values, window=None): if len(values) < 3: return values w = window or max(3, len(values) // 12) return np.convolve(values, np.ones(w)/w, mode='valid'), w def _caption(fig, text): fig.text(0.5, 0.01, text, ha='center', va='bottom', color=SUBTEXT, fontsize=7, style='italic') ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') # ── Figure 1: Mean Episode Return (reward curve) ────────────────────── if self.reward_history: fig, ax = plt.subplots(figsize=(10, 4)) fig.patch.set_facecolor(BG) steps = [s for s, _ in self.reward_history] rewards = [r for _, r in self.reward_history] ax.plot(steps, rewards, color=ACCENT, alpha=0.25, linewidth=1, label='Per-batch mean') if len(rewards) >= 5: sm, w = _smooth(rewards) ax.plot(steps[w-1:], sm, color=ACCENT, linewidth=2, label=f'EMA (w={w})') ax.axhline(0.5, color=SUBTEXT, linestyle='--', linewidth=1, alpha=0.5, label='Chance baseline (0.5)') recent_mean = sum(rewards[-20:]) / min(20, len(rewards)) ax.axhline(recent_mean, color='#f78166', linestyle=':', linewidth=1.5, label=f'Recent mean = {recent_mean:.3f}') ax.set_ylim(0, 1.05) ax.legend(fontsize=8, facecolor=PANEL, edgecolor='#30363d', labelcolor=TEXT) _setup(ax, xlabel='Training Step (reward function call batch)', ylabel='Mean Episode Return [0.01 – 0.99]', title='Training Reward Curve — AP Commander GRPO') _caption(fig, f'Each step = one GRPO batch. Reward = discounted accumulated score from AP Commander environment. | {ts}') plt.tight_layout(rect=[0, 0.04, 1, 1]) p = os.path.join(run_dir, 'fig1_reward_curve.png') plt.savefig(p, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[METRICS] {p}') # ── Figure 2: Per-Difficulty Learning Curves ────────────────────────── if self.diff_reward_hist: fig, ax = plt.subplots(figsize=(10, 4)) fig.patch.set_facecolor(BG) for diff in _DIFFICULTY_ORDER: hist = self.diff_reward_hist.get(diff, []) if not hist: continue steps_d = [s for s, _ in hist] rewards_d = [r for _, r in hist] color = PALETTE.get(diff, ACCENT) ax.plot(steps_d, rewards_d, color=color, alpha=0.20, linewidth=1) if len(rewards_d) >= 5: sm, w = _smooth(rewards_d) ax.plot(steps_d[w-1:], sm, color=color, linewidth=2.5, label=f'{diff} (n={len(steps_d)})') else: ax.plot(steps_d, rewards_d, color=color, linewidth=2.5, label=diff) for thr_diff, thr_val in _UNLOCK_THRESHOLDS.items(): ax.axhline(thr_val, color=PALETTE.get(thr_diff, SUBTEXT), linestyle='--', linewidth=0.8, alpha=0.5) ax.set_ylim(0, 1.05) ax.legend(fontsize=9, facecolor=PANEL, edgecolor='#30363d', labelcolor=TEXT) _setup(ax, xlabel='Training Step', ylabel='Mean Reward per Difficulty Tier [0.01 – 0.99]', title='Curriculum Learning Curves — Easy / Medium / Hard / Long-Horizon') _caption(fig, f'Dashed lines = curriculum unlock thresholds. Each line = rolling mean of all tasks in that difficulty tier. | {ts}') plt.tight_layout(rect=[0, 0.04, 1, 1]) p = os.path.join(run_dir, 'fig2_difficulty_curves.png') plt.savefig(p, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[METRICS] {p}') # ── Figure 3: Episode Length Distribution ───────────────────────────── if self.episode_len_hist: fig, axes = plt.subplots(1, 2, figsize=(12, 4)) fig.patch.set_facecolor(BG) # Overall histogram max_len = max(self.episode_len_hist) bins = range(1, max_len + 2) axes[0].hist(self.episode_len_hist, bins=bins, color=ACCENT, alpha=0.85, edgecolor=BG, rwidth=0.8) _setup(axes[0], xlabel='Episode Length (number of env steps)', ylabel='Count of Episodes', title='Episode Length Distribution (all tasks)') axes[0].axvline(np.mean(self.episode_len_hist), color='#f78166', linestyle='--', linewidth=1.5, label=f'Mean = {np.mean(self.episode_len_hist):.1f}') axes[0].legend(fontsize=8, facecolor=PANEL, edgecolor='#30363d', labelcolor=TEXT) # Per-difficulty mean episode length bar diff_ep_means = {} for diff in _DIFFICULTY_ORDER: lens = [] for tid, d in _TASK_DIFFICULTY.items(): if d == diff: lens.extend(self.ep_len_by_task.get(tid, [])) if lens: diff_ep_means[diff] = np.mean(lens) if diff_ep_means: diffs = list(diff_ep_means.keys()) means = list(diff_ep_means.values()) colors = [PALETTE.get(d, ACCENT) for d in diffs] axes[1].bar(diffs, means, color=colors, alpha=0.85, edgecolor=BG, width=0.5) for i, (d, m) in enumerate(zip(diffs, means)): axes[1].text(i, m + 0.05, f'{m:.1f}', ha='center', color=TEXT, fontsize=9, fontweight='bold') axes[1].set_ylim(0, max(means) * 1.3) _setup(axes[1], xlabel='Difficulty Tier', ylabel='Mean Episode Length (steps)', title='Mean Episode Length by Difficulty') fig.suptitle('Episode Length Analysis — Multi-Step Decision Behavior', color=TEXT, fontsize=11, y=1.01) _caption(fig, f'Long-horizon tasks expected to have higher mean episode lengths as agent learns to use ESCALATE/QUERY_VENDOR. | {ts}') plt.tight_layout(rect=[0, 0.04, 1, 1]) p = os.path.join(run_dir, 'fig3_episode_lengths.png') plt.savefig(p, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[METRICS] {p}') # ── Figure 4: Format Compliance Rate Over Time ──────────────────────── if self.format_history: fig, ax = plt.subplots(figsize=(10, 3.5)) fig.patch.set_facecolor(BG) steps_f = [s for s, _ in self.format_history] fmt_vals = [r for _, r in self.format_history] ax.plot(steps_f, fmt_vals, color='#d29922', alpha=0.25, linewidth=1) if len(fmt_vals) >= 5: sm, w = _smooth(fmt_vals) ax.plot(steps_f[w-1:], sm, color='#d29922', linewidth=2.5, label=f'EMA (w={w})') final_rate = sum(self.format_scores) / max(1, len(self.format_scores)) ax.axhline(final_rate, color='#3fb950', linestyle='--', linewidth=1.5, label=f'Overall rate = {final_rate:.1%}') ax.set_ylim(0, 1.05) ax.legend(fontsize=8, facecolor=PANEL, edgecolor='#30363d', labelcolor=TEXT) _setup(ax, xlabel='Training Step', ylabel='Format Compliance Rate [0 – 1]', title='JSON Format Compliance Over Training') _caption(fig, f'Format compliance = fraction of completions producing valid JSON with correct fields. Parse failures = {self.parse_failures}. | {ts}') plt.tight_layout(rect=[0, 0.04, 1, 1]) p = os.path.join(run_dir, 'fig4_format_compliance.png') plt.savefig(p, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[METRICS] {p}') # ── Figure 5: Decision Distribution Over Time (stacked bar) ────────── if self.decision_history and len(self.decision_history) >= 3: all_decisions = sorted(set(self.decision_counts.keys())) # Sample ~20 evenly-spaced checkpoints for readability n_checkpoints = min(20, len(self.decision_history)) idxs = [int(i * (len(self.decision_history) - 1) / (n_checkpoints - 1)) for i in range(n_checkpoints)] ckpt_steps = [self.decision_history[i][0] for i in idxs] ckpt_counts = [self.decision_history[i][1] for i in idxs] # Convert to fractions fracs = [] for c in ckpt_counts: total_c = sum(c.values()) or 1 fracs.append({d: c.get(d, 0) / total_c for d in all_decisions}) fig, ax = plt.subplots(figsize=(12, 4)) fig.patch.set_facecolor(BG) dec_colors = ['#3fb950','#f85149','#d29922','#a371f7','#58a6ff','#f0883e'] bottom = np.zeros(len(ckpt_steps)) for j, dec in enumerate(all_decisions): vals = np.array([f[dec] for f in fracs]) ax.bar(range(len(ckpt_steps)), vals, bottom=bottom, label=dec, color=dec_colors[j % len(dec_colors)], alpha=0.85, edgecolor=BG) bottom += vals ax.set_xticks(range(len(ckpt_steps))) ax.set_xticklabels([str(s) for s in ckpt_steps], rotation=45, fontsize=7) ax.set_ylim(0, 1.05) ax.legend(fontsize=7, facecolor=PANEL, edgecolor='#30363d', labelcolor=TEXT, loc='upper right', bbox_to_anchor=(1.15, 1)) _setup(ax, xlabel='Training Step (checkpoint)', ylabel='Fraction of Decisions', title='Decision Distribution Over Training (Stacked Bar)') _caption(fig, f'Each bar = cumulative decision distribution up to that checkpoint. Ideal: APPROVE_FULL grows for easy tasks, REJECT for fraud/duplicate tasks. | {ts}') plt.tight_layout(rect=[0, 0.04, 0.88, 1]) p = os.path.join(run_dir, 'fig5_decision_distribution.png') plt.savefig(p, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[METRICS] {p}') # ── Figure 6: Per-Task Training Mean (horizontal bar) ───────────────── if self.reward_by_task: task_means = {t: sum(v)/len(v) for t, v in self.reward_by_task.items()} tasks = sorted(task_means, key=lambda t: (_DIFFICULTY_ORDER.index(_TASK_DIFFICULTY.get(t,'easy')), t)) means = [task_means[t] for t in tasks] colors = [PALETTE.get(_TASK_DIFFICULTY.get(t,'easy'), ACCENT) for t in tasks] short = [t.replace('easy_','').replace('medium_','').replace('hard_','').replace('long_','').replace('_',' ').title() for t in tasks] fig, ax = plt.subplots(figsize=(10, max(4, len(tasks) * 0.45))) fig.patch.set_facecolor(BG) yp = range(len(tasks)) ax.barh(list(yp), means, color=colors, alpha=0.85, edgecolor=BG) ax.set_yticks(list(yp)) ax.set_yticklabels(short, fontsize=8) ax.set_xlim(0, 1.05) overall_mean = sum(means) / len(means) ax.axvline(overall_mean, color='#f78166', linestyle='--', linewidth=1.5, label=f'Overall mean = {overall_mean:.3f}') ax.axvline(0.5, color=SUBTEXT, linestyle=':', linewidth=1, alpha=0.5) for i, m in enumerate(means): ax.text(m + 0.01, i, f'{m:.3f}', va='center', color=TEXT, fontsize=7) from matplotlib.patches import Patch legend_els = [Patch(facecolor=PALETTE[d], label=d.title()) for d in _DIFFICULTY_ORDER if d in PALETTE] legend_els.append(plt.Line2D([0],[0], color='#f78166', linestyle='--', label=f'Mean {overall_mean:.3f}')) ax.legend(handles=legend_els, fontsize=8, facecolor=PANEL, edgecolor='#30363d', labelcolor=TEXT) _setup(ax, xlabel='Mean Training Reward [0.01 – 0.99]', ylabel='Task', title='Per-Task Training Mean Reward (all episodes)') _caption(fig, f'Tasks ordered by difficulty. Green ≥ 0.7 = curriculum mastered. Orange = in progress. Red < 0.4 = needs more training. | {ts}') plt.tight_layout(rect=[0, 0.04, 1, 1]) p = os.path.join(run_dir, 'fig6_per_task_means.png') plt.savefig(p, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[METRICS] {p}') METRICS = Metrics() _EPISODE_LOG_PATH: str = '' # set to run_dir/episodes.jsonl once run_dir is known # ── Helpers ──────────────────────────────────────────────────────────────────── def obs_to_prompt(obs: dict) -> str: inv = obs['invoice'] lines = '\n'.join( f" {li['description']}: qty={li['quantity']}, unit_price=${li['unit_price']:.2f}" for li in inv.get('line_items', []) ) pos = '\n'.join( f" PO {p['po_number']} ({p['status']}) {p['vendor_name']}: " + ', '.join(f"{l['description']} qty={l['ordered_quantity']} @${l['agreed_unit_price']:.2f}" for l in p.get('lines', [])) for p in obs.get('purchase_orders', []) ) grns = '\n'.join( f" GRN {g['grn_id']} (PO {g['po_number']}): " + ', '.join(f"{l['description']} recv={l['received_quantity']}" for l in g.get('lines', [])) for g in obs.get('goods_receipts', []) ) context = '\n'.join(f' {n}' for n in obs.get('context_notes', [])) paid = ', '.join(obs.get('paid_invoice_ids', [])) return ( f"TASK: {obs['task_name']}\n{obs['task_description']}\n\n" f"INVOICE {inv['invoice_id']} | {inv['vendor_name']} | ${inv['invoice_total']:,.2f}\n{lines}\n" f"Freight: ${inv.get('freight_charge',0):.2f}\n\n" f"PURCHASE ORDERS:\n{pos}\n\nGOODS RECEIPTS:\n{grns}\n" + (f"PAID LEDGER: {paid}\n" if paid else "") + (f"CONTEXT:\n{context}\n" if context else "") + f"\nPOLICY:\n{obs['company_policy']}\n\nOutput JSON decision." ) def parse_action(raw: str) -> tuple[dict, bool]: """Returns (action_dict, format_ok). format_ok=False means parse failed.""" clean = re.sub(r'```(?:json)?\s*|\s*```', '', raw).strip() m = re.search(r'\{.*\}', clean, re.DOTALL) if m: try: action = json.loads(m.group()) # Validate required fields and enum values if (action.get('decision') in VALID_DECISIONS and action.get('reason_code') in VALID_REASON_CODES and isinstance(action.get('approved_amount'), (int, float)) and isinstance(action.get('explanation'), str) and len(action.get('explanation', '')) > 10): return action, True except Exception: pass METRICS.parse_failures += 1 return {'decision': 'REJECT', 'approved_amount': 0.0, 'reason_code': 'NO_PO_FOUND', 'explanation': 'parse error fallback'}, False def run_episode(task_id: str, action_json: dict, seed=None) -> float: try: r = requests.post(f'{ENV_URL}/reset', json={'task_id': task_id, 'seed': seed}, timeout=20) r.raise_for_status() data = r.json() step_r = requests.post(f'{ENV_URL}/step', json={'session_id': data['session_id'], 'action': action_json}, timeout=20) step_r.raise_for_status() return float(step_r.json()['reward']['score']) except Exception: return 0.01 def run_episode_accumulated(task_id: str, first_action: dict, seed=None, discount: float = 0.9, max_steps: int = 20, episode_log: list | None = None) -> tuple[float, int]: """ Run a full multi-step episode accumulating discounted per-step rewards. Returns (score, episode_length) so callers can track step counts. Model's first action starts the episode; _greedy_followup() handles subsequent steps so multi-step sequences earn full accumulated credit. E.g. QUERY_VENDOR→REJECT = 0.01 + 0.9*0.99 = 0.901 > shortcut REJECT = ~0.4 episode_log: if provided, appended with one dict per env step for JSONL logging. """ try: r = requests.post(f'{ENV_URL}/reset', json={'task_id': task_id, 'seed': seed}, timeout=20) r.raise_for_status() reset_data = r.json() session_id = reset_data['session_id'] action = first_action total = 0.0 steps_taken = 0 for step_n in range(max_steps): step_r = requests.post(f'{ENV_URL}/step', json={'session_id': session_id, 'action': action}, timeout=20) step_r.raise_for_status() result = step_r.json() r_score = float(result['reward']['score']) done = result['done'] obs_back = result.get('observation', {}) total += (discount ** step_n) * r_score steps_taken = step_n + 1 if episode_log is not None: episode_log.append({ 'step_n': step_n, 'decision': action.get('decision'), 'approved_amount': action.get('approved_amount'), 'reason_code': action.get('reason_code'), 'explanation': (action.get('explanation') or '')[:120], 'step_score': round(r_score, 4), 'done': done, 'context_notes': obs_back.get('context_notes', []), 'action_history': obs_back.get('action_history', []), }) if done: break action = _greedy_followup(obs_back) return min(0.99, max(0.01, total)), steps_taken except Exception as e: return 0.01, 1 def env_reward_fn(completions, task_id=None, seed=None, **kwargs): """ Environment reward: accumulated discounted per-step reward from AP Commander. Curriculum gating redirects locked tasks to easier ones during early training. Writes one JSONL record per episode to _EPISODE_LOG_PATH for full verifiability. """ task_ids = task_id if task_id is not None else ['easy_perfect_match'] * len(completions) seeds = seed if seed is not None else [random.randint(1, 999)] * len(completions) rewards, decisions, format_ok_list, ep_lengths, errors = [], [], [], [], 0 for completion, tid, s in zip(completions, task_ids, seeds): # Curriculum gating disabled: all 20 tasks train simultaneously from step 1. # Gating caused mode collapse in Run 2 — hard tasks never trained. action, fmt_ok = parse_action(completion) episode_steps = [] try: score, ep_len = run_episode_accumulated( tid, action, seed=int(s), episode_log=episode_steps) except Exception as e: score, ep_len = 0.01, 1 errors += 1 rewards.append(score) ep_lengths.append(ep_len) decisions.append(action.get('decision', 'UNKNOWN')) format_ok_list.append(fmt_ok) CURRICULUM.record(tid, score) if _EPISODE_LOG_PATH: try: record = { 'reward_step': METRICS.step + 1, 'call_n': METRICS.total_calls + len(rewards), 'task_id': tid, 'seed': int(s), 'format_ok': fmt_ok, 'score': round(score, 4), 'episode_len': ep_len, 'final_decision': action.get('decision'), 'steps': episode_steps, 'ts': datetime.datetime.now().isoformat(), } with open(_EPISODE_LOG_PATH, 'a') as _f: _f.write(json.dumps(record) + '\n') except Exception: pass if METRICS.total_calls % LOG_SAMPLES_EVERY == 0: print(f'\n[SAMPLE] task={tid} seed={s} fmt={fmt_ok} ' f'score={score:.3f} ep_len={ep_len}') print(f' {action.get("decision")} ${action.get("approved_amount")} ' f'{action.get("reason_code")}') print(f' {str(action.get("explanation",""))[:100]}') print(f' curriculum: {CURRICULUM.status_line()}') if episode_steps: actor_notes = [n for step in episode_steps for n in step.get('context_notes', [])] if actor_notes: print(f' actor_responses: {actor_notes[:2]}') METRICS.log_step(rewards, decisions, format_ok_list, list(task_ids), errors, episode_lengths=ep_lengths) if METRICS.step % 5 == 0: METRICS.print_summary() print(f'[CURRICULUM] {CURRICULUM.status_line()}') # Graceful stop: app.py writes this flag when Stop button is clicked if os.path.exists('/app/stop_requested'): print(f'\n[STOP] Stop flag detected at step {METRICS.step}. Saving weights before exit...') os.remove('/app/stop_requested') raise _StopTraining() return rewards def format_reward_fn(completions, **kwargs): """Format reward: +0.15 if valid JSON with correct fields, -0.15 otherwise. Increased from ±0.05 — format failures were 55% in Run 2, drowning env signal.""" results = [] for completion in completions: _, ok = parse_action(completion) results.append(0.15 if ok else -0.15) return results OVERSIGHT_SYSTEM_PROMPT = """You are a Fleet AI Oversight Agent reviewing completed AP Clerk decisions. For each episode summary provided, decide: CLEAR, FLAG_FOR_REVIEW, or ESCALATE_TO_AUDIT. Output valid JSON only: {"episode_id": "...", "verdict": "CLEAR|FLAG_FOR_REVIEW|ESCALATE_TO_AUDIT", "signal": "specific reason with $ amounts or %", "confidence": 0.0-1.0} Your response must start with { and end with } with no other text.""" def eval_oversight(model, tokenizer, seed: int = 99) -> float: """Run one oversight session (5 episodes) and return mean reward across all steps.""" import torch, json as _json model.eval() try: reset = requests.post(f'{ENV_URL}/oversight/reset', json={'num_episodes': 5, 'seed': seed}, timeout=20).json() session_id = reset['session_id'] summaries = reset['observation']['episode_summaries'] scores = [] for ep in summaries: prompt = ( f"Episode ID: {ep['episode_id']}\n" f"Vendor: {ep['vendor_name']} Invoice: {ep['invoice_id']} " f"Total: ${ep['invoice_total']:.2f}\n" f"Decision: {ep['final_decision']} Reason: {ep['reason_code']}\n" f"Explanation: {ep['explanation']}\n" f"Known fraud patterns: {reset['observation'].get('known_fraud_patterns', [])}\n" f"Audit budget remaining: {reset['observation']['audit_budget']}" ) messages = [{'role': 'system', 'content': OVERSIGHT_SYSTEM_PROMPT}, {'role': 'user', 'content': prompt}] text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(text, return_tensors='pt').to('cuda') with torch.no_grad(): out = model.generate(**inputs, max_new_tokens=150, temperature=0.1, do_sample=True) raw = tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True) try: action = _json.loads(raw.strip()) action.setdefault('episode_id', ep['episode_id']) action.setdefault('verdict', 'CLEAR') action.setdefault('signal', 'no signal') action.setdefault('confidence', 0.5) except Exception: action = {'episode_id': ep['episode_id'], 'verdict': 'CLEAR', 'signal': 'parse error', 'confidence': 0.5} resp = requests.post(f'{ENV_URL}/oversight/step', json={'session_id': session_id, 'action': action}, timeout=20).json() scores.append(float(resp['reward']['score'])) mean = sum(scores) / len(scores) if scores else 0.01 print(f' oversight session mean: {mean:.3f} scores={[round(s,2) for s in scores]}') return mean except Exception as e: print(f' oversight eval error: {e}') return 0.01 # ── Eval helper ──────────────────────────────────────────────────────────────── def eval_task(model, tokenizer, task_id: str, seed: int = 99) -> float: import torch model.eval() try: reset = requests.post(f'{ENV_URL}/reset', json={'task_id': task_id, 'seed': seed}, timeout=20).json() obs, session_id = reset['observation'], reset['session_id'] messages = [{'role': 'system', 'content': SYSTEM_PROMPT}, {'role': 'user', 'content': obs_to_prompt(obs)}] text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(text, return_tensors='pt').to('cuda') with torch.no_grad(): out = model.generate(**inputs, max_new_tokens=250, temperature=0.1, do_sample=True) raw = tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True) action, fmt_ok = parse_action(raw) score = float(requests.post(f'{ENV_URL}/step', json={'session_id': session_id, 'action': action}, timeout=20).json()['reward']['score']) print(f' output: {raw[:120].strip()}') return score except Exception as e: print(f' eval error: {e}') return 0.01 # ── Main ─────────────────────────────────────────────────────────────────────── def _make_run_dir() -> str: """Create timestamped run directory under /app/runs/grpo/MODEL-NEpoch-DATETIME.""" model_slug = MODEL_NAME.split('/')[-1].lower().replace('.', '-') ts = datetime.datetime.now().strftime('%Y-%m-%d_%H%M') run_dir = f'/app/runs/grpo/{model_slug}-{NUM_EPOCHS}ep-{ts}' os.makedirs(run_dir, exist_ok=True) return run_dir def _save_loss_curve(trainer, run_dir: str): """Extract loss + grad_norm from TRL trainer log history and save as loss_curve.png.""" try: history = trainer.state.log_history steps = [e['step'] for e in history if 'loss' in e] losses = [e['loss'] for e in history if 'loss' in e] grads = [e.get('grad_norm', None) for e in history if 'loss' in e] BG, TEXT, BLUE, ORANGE = '#0d1117', '#e6edf3', '#58a6ff', '#f0883e' fig, axes = plt.subplots(2, 1, figsize=(10, 6), facecolor=BG) fig.suptitle( f'AP Commander GRPO — Loss & Gradient Norm | {MODEL_NAME.split("/")[-1]}', color=TEXT, fontsize=11, fontweight='bold' ) ax1 = axes[0] ax1.set_facecolor(BG) ax1.plot(steps, losses, color=BLUE, linewidth=1.2, alpha=0.6, label='per-step loss') if len(steps) > 10: smooth = [sum(losses[max(0,i-10):i+1])/len(losses[max(0,i-10):i+1]) for i in range(len(losses))] ax1.plot(steps, smooth, color=BLUE, linewidth=2, label='smooth (w=10)') ax1.axhline(0, color='white', linewidth=0.5, alpha=0.3) ax1.set_ylabel('Loss', color=TEXT); ax1.set_xlabel('') ax1.tick_params(colors=TEXT); ax1.spines[:].set_color('#30363d') ax1.legend(facecolor=BG, labelcolor=TEXT, fontsize=8) for spine in ax1.spines.values(): spine.set_color('#30363d') ax2 = axes[1] ax2.set_facecolor(BG) valid_grads = [(s, g) for s, g in zip(steps, grads) if g is not None] if valid_grads: gs, gv = zip(*valid_grads) ax2.plot(gs, gv, color=ORANGE, linewidth=1.2, alpha=0.7) ax2.set_ylabel('Grad Norm', color=TEXT); ax2.set_xlabel('Training Step', color=TEXT) ax2.tick_params(colors=TEXT) for spine in ax2.spines.values(): spine.set_color('#30363d') fig.text(0.5, 0.01, f'GRPO loss (group-relative policy gradient). Negative values normal mid-training. ' f'Grad norm collapse (<0.1) indicates entropy saturation.', ha='center', color='#8b949e', fontsize=7) plt.tight_layout(rect=[0, 0.04, 1, 1]) out = os.path.join(run_dir, 'loss_curve.png') plt.savefig(out, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[LOSS CURVE] Saved {out} ({len(steps)} steps)') except Exception as e: print(f'[LOSS CURVE] skipped: {e}') def _save_reward_curve_png(run_dir: str): """Save final reward curve PNG from METRICS for submission evidence.""" try: if not METRICS.reward_history: return BG, TEXT, BLUE = '#0d1117', '#e6edf3', '#58a6ff' steps = [s for s, _ in METRICS.reward_history] rewards = [r for _, r in METRICS.reward_history] fig, ax = plt.subplots(figsize=(10, 4), facecolor=BG) ax.set_facecolor(BG) ax.plot(steps, rewards, color=BLUE, alpha=0.4, linewidth=1, label='Per-step') if len(rewards) >= 10: w = max(5, len(rewards) // 15) sm = [sum(rewards[max(0,i-w):i+1])/len(rewards[max(0,i-w):i+1]) for i in range(len(rewards))] ax.plot(steps, sm, color=BLUE, linewidth=2.5, label=f'Smooth (w={w})') mean_r = sum(rewards[-20:]) / min(20, len(rewards)) ax.axhline(mean_r, color='#f78166', linestyle='--', linewidth=1.2, label=f'Recent mean: {mean_r:.3f}') ax.set_ylim(0, 1.05) ax.set_xlabel('Training Step', color=TEXT, fontsize=10) ax.set_ylabel('Mean Reward', color=TEXT, fontsize=10) ax.set_title(f'GRPO Reward Curve — {MODEL_NAME.split("/")[-1]} | {NUM_EPOCHS} epochs', color=TEXT, fontsize=11, fontweight='bold') ax.tick_params(colors=TEXT) for spine in ax.spines.values(): spine.set_color('#30363d') ax.legend(facecolor=BG, labelcolor=TEXT, fontsize=9) fig.text(0.5, 0.01, f'Mean reward over {len(steps)} training steps. ' f'Higher = better decision quality across {len(METRICS.reward_by_task)} tasks.', ha='center', color='#8b949e', fontsize=7) plt.tight_layout(rect=[0, 0.04, 1, 1]) out = os.path.join(run_dir, 'reward_curve.png') plt.savefig(out, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[REWARD CURVE] Saved {out}') except Exception as e: print(f'[REWARD CURVE] skipped: {e}') def main(): # Authenticate with HF Hub if token provided (needed for gated models like Llama-3) hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN') if hf_token: from huggingface_hub import login login(token=hf_token, add_to_git_credential=False) print('[AUTH] Logged in to HF Hub.') else: print('[AUTH] No HF_TOKEN set — using public models only (Qwen recommended).') # All run artifacts go into this timestamped dir — never overwrite a previous run RUN_DIR = _make_run_dir() print(f'[RUN] Artifacts → {RUN_DIR}') # Point the global episode log path so env_reward_fn can write structured logs global _EPISODE_LOG_PATH _EPISODE_LOG_PATH = os.path.join(RUN_DIR, 'episodes.jsonl') print(f'[RUN] Episode log → {_EPISODE_LOG_PATH}') print(f'[ENV] Checking {ENV_URL}...') h = requests.get(f'{ENV_URL}/health', timeout=30).json() print(f"[ENV] status={h['status']} tasks={h.get('total_tasks')}") print(f'[MODEL] Loading {MODEL_NAME}...') import torch from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from peft import LoraConfig, get_peft_model, TaskType from datasets import Dataset from trl import GRPOConfig, GRPOTrainer bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, ) tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, quantization_config=bnb_config, device_map='auto', trust_remote_code=True, ) model.enable_input_require_grads() model.gradient_checkpointing_enable() lora_cfg = LoraConfig( r=16, lora_alpha=32, target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'], lora_dropout=0, bias='none', task_type=TaskType.CAUSAL_LM, ) model = get_peft_model(model, lora_cfg) model.print_trainable_parameters() # Baseline eval (before training) print('\n[BASELINE] Before training:') baseline = {} for t in EVAL_TASKS: s = eval_task(model, tokenizer, t) baseline[t] = s print(f' {t}: {s:.3f}') baseline['oversight_agent'] = eval_oversight(model, tokenizer, seed=99) print(f' oversight_agent: {baseline["oversight_agent"]:.3f}') print(f' Mean: {sum(baseline.values())/len(baseline):.3f}') model.train() # Hard/long get more seeds so multi-step sequences see enough variation to learn from _SEEDS_PER_DIFF = {'easy': 5, 'medium': 8, 'hard': 20, 'long': 20} task_seed_pairs = [ (tid, s) for tid in TRAIN_TASKS for s in range(1, _SEEDS_PER_DIFF.get(_TASK_DIFFICULTY.get(tid, 'easy'), 5) + 1) ] total_prompts = len(task_seed_pairs) print(f'\n[DATASET] Building prompts ({total_prompts} total: easy×5 medium×8 hard/long×20 seeds)...') rows = [] for task_id, seed in task_seed_pairs: try: reset = requests.post(f'{ENV_URL}/reset', json={'task_id': task_id, 'seed': seed}, timeout=20).json() obs = reset['observation'] messages = [{'role': 'system', 'content': SYSTEM_PROMPT}, {'role': 'user', 'content': obs_to_prompt(obs)}] rows.append({ 'prompt': tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True), 'task_id': task_id, 'seed': seed, }) except Exception as e: print(f' skip {task_id} seed={seed}: {e}') dataset = Dataset.from_list(rows) by_diff = {d: sum(1 for r in rows if _TASK_DIFFICULTY.get(r['task_id'],'easy')==d) for d in ['easy','medium','hard','long']} print(f'[DATASET] {len(dataset)} samples — easy:{by_diff["easy"]} medium:{by_diff["medium"]} hard:{by_diff["hard"]} long:{by_diff["long"]} | curriculum: {CURRICULUM.status_line()}') # Train print(f'\n[TRAIN] {NUM_EPOCHS} epochs | {NUM_GENERATIONS} generations/prompt | {len(dataset)} samples') model.train() _cuda_ok = torch.cuda.is_available() _use_bf16 = _cuda_ok and torch.cuda.get_device_capability()[0] >= 8 # Ampere+ (A100, H100) _use_fp16 = _cuda_ok and not _use_bf16 print(f'[DTYPE] cuda={_cuda_ok} bf16={_use_bf16} fp16={_use_fp16}') # per_device_train_batch_size must equal num_generations (TRL GRPO requirement). config = GRPOConfig( output_dir = './ap_commander_grpo', num_train_epochs = NUM_EPOCHS, per_device_train_batch_size = NUM_GENERATIONS, num_generations = NUM_GENERATIONS, gradient_accumulation_steps = 1, learning_rate = 1e-5, max_completion_length = 200, temperature = 0.7, beta = 0.1, bf16 = _use_bf16, fp16 = _use_fp16, logging_steps = 1, save_steps = 999, report_to = 'none', remove_unused_columns = False, ) from transformers import TrainerCallback class _LossCallback(TrainerCallback): def on_log(self, args, state, control, logs=None, **kwargs): if logs and 'loss' in logs: METRICS.loss_history.append(( state.global_step, round(float(logs['loss']), 5), round(float(logs.get('grad_norm', 0)), 5), )) METRICS._flush_live() trainer = GRPOTrainer( model=model, processing_class=tokenizer, reward_funcs=[env_reward_fn, format_reward_fn], args=config, train_dataset=dataset, callbacks=[_LossCallback()], ) stopped_early = False try: result = trainer.train() print(f'\n[TRAIN] Done. Loss: {result.training_loss:.4f}') except _StopTraining: stopped_early = True print(f'\n[STOP] Early stop at step {METRICS.step}. Saving weights now...') # Save curves and metrics figures _save_loss_curve(trainer, RUN_DIR) _save_reward_curve_png(RUN_DIR) METRICS.print_summary() METRICS.save_all_metrics_figures(RUN_DIR) # Save LoRA adapters (guide point 16: save adapters directly, do NOT merge 4-bit naively) adapter_dir = os.path.join(RUN_DIR, 'adapter') label = 'early-stop' if stopped_early else 'final' print(f'[SAVE] Saving LoRA adapters ({label}) to {adapter_dir}...') model.save_pretrained(adapter_dir) tokenizer.save_pretrained(adapter_dir) # Upload adapter to HF Hub — repo is {username}/ap-commander-adapter, auto-detected from token hf_token_save = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN') if hf_token_save: try: from huggingface_hub import HfApi api = HfApi(token=hf_token_save) username = api.whoami(token=hf_token_save)['name'] adapter_repo = f'{username}/ap-commander-adapter' api.create_repo(repo_id=adapter_repo, repo_type='model', exist_ok=True, token=hf_token_save) api.upload_folder( folder_path=adapter_dir, repo_id=adapter_repo, repo_type='model', commit_message=f'GRPO {datetime.datetime.now().strftime("%Y-%m-%d")} — {MODEL_NAME} {NUM_EPOCHS}ep', ) print(f'[SAVE] Adapter pushed to HF Hub: {adapter_repo}') except Exception as e: print(f'[SAVE] HF Hub upload skipped: {e}') else: print('[SAVE] HF Hub upload skipped: HF_TOKEN not set') # Post-training eval (all tasks + oversight) print('\n[POST-EVAL] After training:') post = {} model.eval() for t in EVAL_TASKS: s = eval_task(model, tokenizer, t) post[t] = s print(f' {t}: {s:.3f}') post['oversight_agent'] = eval_oversight(model, tokenizer, seed=99) print(f' oversight_agent: {post["oversight_agent"]:.3f}') print(f' Mean: {sum(post.values())/len(post):.3f}') print('\n[COMPARE]') for t in list(EVAL_TASKS) + ['oversight_agent']: d = post[t] - baseline[t] sym = '+' if d >= 0 else '' print(f' {t:<35} {baseline[t]:.3f} -> {post[t]:.3f} ({sym}{d:.3f})') # ── Before/After comparison figure (results.png — key result for demo) ──── BG, TEXT, SUBTEXT = '#0d1117', '#e6edf3', '#8b949e' PANEL, GRID = '#161b22', '#21262d' _fmt_rate = sum(METRICS.format_scores) / max(1, len(METRICS.format_scores)) eval_tasks_sorted = sorted( EVAL_TASKS, key=lambda t: (_DIFFICULTY_ORDER.index(_TASK_DIFFICULTY.get(t,'easy')), t) ) DIFF_COLORS = {'easy': '#3fb950', 'medium': '#d29922', 'hard': '#f85149', 'long': '#a371f7'} fig = plt.figure(figsize=(18, max(8, len(eval_tasks_sorted) * 0.45 + 2))) fig.patch.set_facecolor(BG) gs = fig.add_gridspec(1, 2, wspace=0.38) # Panel left: before/after horizontal bars ax_l = fig.add_subplot(gs[0, 0]) ax_l.set_facecolor(PANEL) yp = np.arange(len(eval_tasks_sorted)) short = [t.replace('easy_','').replace('medium_','').replace('hard_','').replace('long_','') .replace('_',' ').title() for t in eval_tasks_sorted] bar_h = 0.35 bars_b = ax_l.barh(yp - bar_h/2, [baseline.get(t, 0) for t in eval_tasks_sorted], bar_h, label='Before GRPO', color='#f85149', alpha=0.85, edgecolor=BG) bars_a = ax_l.barh(yp + bar_h/2, [post.get(t, 0) for t in eval_tasks_sorted], bar_h, label='After GRPO', color='#3fb950', alpha=0.85, edgecolor=BG) ax_l.set_yticks(yp) ax_l.set_yticklabels(short, fontsize=8, color=TEXT) ax_l.set_xlim(0, 1.15) ax_l.axvline(0.5, color=SUBTEXT, linestyle='--', linewidth=1, alpha=0.5) # Color-code y-tick labels by difficulty for i, t in enumerate(eval_tasks_sorted): ax_l.get_yticklabels()[i].set_color(DIFF_COLORS.get(_TASK_DIFFICULTY.get(t,'easy'), TEXT)) ax_l.legend(fontsize=9, facecolor=PANEL, edgecolor='#30363d', labelcolor=TEXT) ax_l.set_xlabel('Task Score [0.01 – 0.99]', color=SUBTEXT, fontsize=9) ax_l.set_ylabel('Task (color = difficulty tier)', color=SUBTEXT, fontsize=9) ax_l.set_title(f'Before vs After GRPO — {NUM_EPOCHS} Epochs', color=TEXT, fontsize=11, fontweight='bold', pad=10) ax_l.tick_params(colors=TEXT, labelsize=8) for sp in ax_l.spines.values(): sp.set_color('#30363d') ax_l.spines['top'].set_visible(False); ax_l.spines['right'].set_visible(False) ax_l.xaxis.grid(True, color=GRID, linewidth=0.6, alpha=0.7) ax_l.set_axisbelow(True) # Panel right: delta (improvement) per task ax_r = fig.add_subplot(gs[0, 1]) ax_r.set_facecolor(PANEL) deltas = [post.get(t, 0) - baseline.get(t, 0) for t in eval_tasks_sorted] d_colors = ['#3fb950' if d >= 0 else '#f85149' for d in deltas] ax_r.barh(yp, deltas, color=d_colors, alpha=0.85, edgecolor=BG) ax_r.set_yticks(yp) ax_r.set_yticklabels(short, fontsize=8, color=TEXT) ax_r.axvline(0, color=SUBTEXT, linewidth=1) for i, d in enumerate(deltas): ax_r.text(d + 0.005 * np.sign(d + 1e-9), i, f'{d:+.3f}', va='center', color=TEXT, fontsize=7) ax_r.set_xlabel('Score Delta (After − Before)', color=SUBTEXT, fontsize=9) ax_r.set_ylabel('Task', color=SUBTEXT, fontsize=9) ax_r.set_title('GRPO Improvement per Task', color=TEXT, fontsize=11, fontweight='bold', pad=10) ax_r.tick_params(colors=TEXT, labelsize=8) for sp in ax_r.spines.values(): sp.set_color('#30363d') ax_r.spines['top'].set_visible(False); ax_r.spines['right'].set_visible(False) ax_r.xaxis.grid(True, color=GRID, linewidth=0.6, alpha=0.7) ax_r.set_axisbelow(True) mean_before = sum(baseline.get(t,0) for t in eval_tasks_sorted) / len(eval_tasks_sorted) mean_after = sum(post.get(t,0) for t in eval_tasks_sorted) / len(eval_tasks_sorted) fig.suptitle( f'AP Commander GRPO — {MODEL_NAME.split("/")[-1]} | {NUM_EPOCHS} epochs | ' f'{NUM_GENERATIONS} generations | {len(TRAIN_TASKS)} tasks\n' f'Overall: {mean_before:.3f} → {mean_after:.3f} (+{mean_after-mean_before:.3f}) ' f'| format={_fmt_rate:.1%} | parse_fails={METRICS.parse_failures} ' f'| {datetime.datetime.now().strftime("%Y-%m-%d")}', color=TEXT, fontsize=10, y=1.01 ) fig.text(0.5, -0.01, 'Task colors: green=easy, yellow=medium, red=hard, purple=long-horizon. ' 'Score range [0.01, 0.99] as per AP Commander environment specification.', ha='center', color=SUBTEXT, fontsize=8, style='italic') results_png = os.path.join(RUN_DIR, 'results.png') plt.savefig(results_png, dpi=130, bbox_inches='tight', facecolor=BG) plt.close() print(f'[DONE] Saved {results_png}') # Save JSON fmt_rate = sum(METRICS.format_scores) / max(1, len(METRICS.format_scores)) output = { 'timestamp': datetime.datetime.now().isoformat(), 'run_dir': RUN_DIR, 'model': MODEL_NAME, 'epochs': NUM_EPOCHS, 'num_generations': NUM_GENERATIONS, 'per_device_train_batch_size': NUM_GENERATIONS, 'gradient_accumulation_steps': 2, 'learning_rate': 1e-5, 'lora_r': 16, 'lora_alpha': 32, 'seeds_easy_medium': 5, 'seeds_hard_long': 10, 'total_train_prompts': len(dataset), 'train_tasks': TRAIN_TASKS, 'eval_tasks': list(EVAL_TASKS), 'hardware': 'A10G (HF Spaces)', 'baseline': baseline, 'post_training': post, 'delta': {t: round(post.get(t,0) - baseline.get(t,0), 4) for t in EVAL_TASKS}, 'overall_baseline': round(mean_before, 4), 'overall_post': round(mean_after, 4), 'overall_delta': round(mean_after - mean_before, 4), 'episode_log': _EPISODE_LOG_PATH, 'metrics': { 'total_reward_calls': METRICS.total_calls, 'parse_failures': METRICS.parse_failures, 'env_errors': METRICS.env_errors, 'format_rate': round(fmt_rate, 4), 'decision_counts': dict(METRICS.decision_counts), 'per_task_mean': {t: round(sum(v)/len(v), 4) for t, v in METRICS.reward_by_task.items()}, 'mean_episode_length': round(sum(METRICS.episode_len_hist) / max(1, len(METRICS.episode_len_hist)), 2), 'by_difficulty_post': {d: round(sum(post.get(t,0) for t,diff in _TASK_DIFFICULTY.items() if diff==d and t in post) / max(1, sum(1 for t,diff in _TASK_DIFFICULTY.items() if diff==d and t in post)), 4) for d in _DIFFICULTY_ORDER}, }, 'figures': [ 'fig1_reward_curve.png', 'fig2_difficulty_curves.png', 'fig3_episode_lengths.png', 'fig4_format_compliance.png', 'fig5_decision_distribution.png', 'fig6_per_task_means.png', 'results.png', ], } results_json = os.path.join(RUN_DIR, 'training_results.json') with open(results_json, 'w') as f: json.dump(output, f, indent=2) print(f'[DONE] Saved {results_json}') # Copy live metrics into run dir as snapshot try: import shutil shutil.copy('/app/metrics_live.json', os.path.join(RUN_DIR, 'metrics_live.json')) except Exception: pass # Persist entire run dir to HF Space repo (runs/grpo/MODEL-NEP-DATETIME/) # so artifacts survive container restarts and each run is independently addressable repo_run_path = RUN_DIR.replace('/app/', '') # strip /app/ prefix for repo path hf_token_up = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN') if hf_token_up: try: from huggingface_hub import HfApi api = HfApi(token=hf_token_up) username = api.whoami(token=hf_token_up)['name'] training_repo = f'{username}/ap-commander-training' api.upload_folder( folder_path=RUN_DIR, path_in_repo=repo_run_path, repo_id=training_repo, repo_type='space', commit_message=f'Run artifacts: {os.path.basename(RUN_DIR)}', ignore_patterns=['adapter/*'], ) print(f'[UPLOAD] Run folder → {repo_run_path} in {training_repo}') except Exception as e: print(f'[UPLOAD] artifact upload failed: {e}') else: print('[UPLOAD] artifact upload skipped: HF_TOKEN not set') if __name__ == '__main__': main()