"""
AP Commander — GRPO Training Script
Tracks: overall reward, per-component rewards, decision distribution,
        format compliance, env errors, sample generations, reward curve.
"""
import os, json, re, random, time, datetime, collections
import requests
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

# Base URL of the AP Commander environment service; the /reset and /step
# endpoints used by the episode runners live under it.
ENV_URL = 'https://pathikreet-ap-clerk-env.hf.space'

# Run configuration, each overridable through an environment variable of the
# same name.
MODEL_NAME = os.getenv('MODEL_NAME', 'Qwen/Qwen2.5-7B-Instruct')
NUM_EPOCHS = int(os.getenv('NUM_EPOCHS', '3'))
NUM_GENERATIONS = int(os.getenv('NUM_GENERATIONS', '8'))

# Print a sample generation every N reward calls.
LOG_SAMPLES_EVERY = 20

# Instruction text describing the JSON action schema the model must emit.
SYSTEM_PROMPT = """You are an AI Accounts Payable Clerk. Review the invoice, PO, and GRN, then output ONLY valid JSON:
{"decision": "APPROVE_FULL"|"APPROVE_PARTIAL"|"REJECT"|"ESCALATE"|"QUERY_VENDOR",
 "approved_amount": <float>,
 "reason_code": "MATCH_CONFIRMED"|"QUANTITY_MISMATCH"|"PRICE_DISCREPANCY"|"POLICY_VIOLATION"|"NO_PO_FOUND"|"DUPLICATE_INVOICE"|"VENDOR_MISMATCH"|"TAX_DISCREPANCY"|"PENDING_CLARIFICATION"|"MANAGER_REVIEW",
 "explanation": "<cite specific $ amounts>"}"""

# Single source of truth for the task catalogue: task ids grouped by curriculum
# tier, tiers listed in curriculum order.
_TASKS_BY_TIER = {
    'easy': [
        'easy_perfect_match',
        'easy_no_po_found',
    ],
    'medium': [
        'medium_quantity_shortfall',
        'medium_price_discrepancy',
        'medium_split_delivery',
        'medium_vendor_mismatch',
    ],
    'hard': [
        'hard_policy_violation',
        'hard_duplicate_invoice',
        'hard_partial_po_match',
        'hard_tax_discrepancy',
    ],
    'long': [
        'long_invoice_dispute',
        'long_policy_migration',
        'long_batch_reconciliation',
        'long_manager_chain',
        'long_fraud_investigation',
        'long_audit_trail',
        'long_multi_vendor_split',
    ],
}

# Training and evaluation cover the same task ids (kept as two separate lists).
TRAIN_TASKS = [task for tier_tasks in _TASKS_BY_TIER.values() for task in tier_tasks]
EVAL_TASKS = list(TRAIN_TASKS)

# Enum values accepted when validating a parsed action.
VALID_DECISIONS = {
    'APPROVE_FULL',
    'APPROVE_PARTIAL',
    'REJECT',
    'ESCALATE',
    'QUERY_VENDOR',
    'HOLD',
}
VALID_REASON_CODES = {
    'MATCH_CONFIRMED',
    'QUANTITY_MISMATCH',
    'PRICE_DISCREPANCY',
    'POLICY_VIOLATION',
    'NO_PO_FOUND',
    'DUPLICATE_INVOICE',
    'VENDOR_MISMATCH',
    'TAX_DISCREPANCY',
    'PENDING_CLARIFICATION',
    'MANAGER_REVIEW',
}

# Task difficulty map used by curriculum sampler (task id -> tier name).
_TASK_DIFFICULTY = {
    task: tier
    for tier, tier_tasks in _TASKS_BY_TIER.items()
    for task in tier_tasks
}
# Tiers from easiest to hardest, and the mean reward a tier must reach before
# the tier after it is unlocked.
_DIFFICULTY_ORDER = ['easy', 'medium', 'hard', 'long']
_UNLOCK_THRESHOLDS = {'easy': 0.70, 'medium': 0.65, 'hard': 0.60}


# ── Curriculum sampler ──────────────────────────────────────────────────────────

class CurriculumSampler:
    """
    Curriculum gate over the difficulty tiers.

    Every recorded reward is kept per task. Whenever the all-time mean reward of
    an already-unlocked tier reaches that tier's unlock threshold, the following
    tier is unlocked as well. The unlocked set drives both dataset construction
    (build_dataset_tasks) and reward-time redirection of locked tasks (gate_task).
    """

    def __init__(self):
        # task_id -> every reward recorded for that task, in arrival order
        self._rewards:  dict = collections.defaultdict(list)
        # tier names currently available; the easy tier is open from the start
        self.unlocked:  set  = {'easy'}

    def record(self, task_id: str, reward: float):
        """Store one episode reward and re-evaluate the unlock conditions."""
        self._rewards[task_id].append(reward)
        self._try_unlock()

    def mean_for_difficulty(self, diff: str) -> float:
        """Mean of all rewards recorded for tasks of tier `diff` (0.0 if none)."""
        pooled = [
            reward
            for tid, tier in _TASK_DIFFICULTY.items()
            if tier == diff
            for reward in self._rewards.get(tid, [])
        ]
        if not pooled:
            return 0.0
        return sum(pooled) / len(pooled)

    def _try_unlock(self):
        """Unlock the tier after each unlocked tier whose mean meets its threshold."""
        for tier, following in zip(_DIFFICULTY_ORDER, _DIFFICULTY_ORDER[1:]):
            if tier not in self.unlocked:
                continue
            tier_mean = self.mean_for_difficulty(tier)
            reached = tier_mean >= _UNLOCK_THRESHOLDS.get(tier, 0.70)
            if not reached or following in self.unlocked:
                continue
            self.unlocked.add(following)
            print(f'\n[CURRICULUM] Unlocked {following}! mean({tier})={tier_mean:.3f} '
                  f'>= threshold {_UNLOCK_THRESHOLDS[tier]}')

    def gate_task(self, task_id: str) -> str:
        """
        Return `task_id` when its tier is unlocked (unknown ids count as easy);
        otherwise return a randomly chosen easy-tier task instead.
        """
        tier = _TASK_DIFFICULTY.get(task_id, 'easy')
        if tier in self.unlocked:
            return task_id
        easy_pool = [tid for tid, t in _TASK_DIFFICULTY.items() if t == 'easy']
        return random.choice(easy_pool)

    def build_dataset_tasks(self) -> list:
        """
        Build the curriculum-weighted list of (task_id, seed) pairs.

        Only tasks from unlocked tiers are included; each contributes seeds
        1..n where n depends on the tier:
          easy   -> 10 seeds
          medium ->  5 seeds
          hard   ->  2 seeds
          long   ->  2 seeds
        """
        seed_budget = {'easy': 10, 'medium': 5, 'hard': 2, 'long': 2}
        return [
            (task_id, seed)
            for task_id, tier in _TASK_DIFFICULTY.items()
            if tier in self.unlocked
            for seed in range(1, seed_budget[tier] + 1)
        ]

    def status_line(self) -> str:
        """One-line summary: per-tier mean reward plus a locked/unlocked mark."""
        return ' | '.join(
            f"{tier}={self.mean_for_difficulty(tier):.2f}{'✓' if tier in self.unlocked else '✗'}"
            for tier in _DIFFICULTY_ORDER
        )


# Module-wide curriculum state; the reward function gates task ids through it
# and records every episode score into it.
CURRICULUM = CurriculumSampler()


# ── Per-step greedy follow-up policy ───────────────────────────────────────────

def _greedy_followup(obs_dict: dict) -> dict:
    """
    Scripted policy for the follow-up steps of a multi-step rollout.

    All `context_notes` of the observation are joined and lower-cased, then
    matched against keyword rules in a fixed priority order; the first rule
    that fires decides the action. Approvals use the absolute value of the
    invoice's `invoice_total` as the approved amount.

    Returns an action dict with the keys decision, approved_amount,
    reason_code and explanation.
    """
    note_text = ' '.join(obs_dict.get('context_notes', [])).lower()
    invoice_total = abs(float(obs_dict.get('invoice', {}).get('invoice_total', 0) or 0))

    def mentions(*keywords):
        # True when at least one keyword occurs anywhere in the joined notes.
        return any(kw in note_text for kw in keywords)

    def make_action(decision, amount, reason_code, explanation):
        return {'decision': decision, 'approved_amount': amount,
                'reason_code': reason_code, 'explanation': explanation}

    # 1. Sign-off from the approval chain -> pay in full.
    if mentions('manager approved', 'vp approved', 'cfo approved',
                'pre-approved', 'pre-approv', 'approved by'):
        return make_action(
            'APPROVE_FULL', invoice_total, 'MATCH_CONFIRMED',
            f'Approval confirmed via escalation chain. Approving ${invoice_total:.2f}.')

    # 2. Compliance mentioned together with a positive outcome -> pay in full.
    if 'compliance' in note_text and mentions('cleared', 'approved', 'pass'):
        return make_action(
            'APPROVE_FULL', invoice_total, 'MATCH_CONFIRMED',
            f'Compliance review cleared. Approving ${invoice_total:.2f}.')

    # 3. Fraud / duplicate / denial wording -> reject as duplicate.
    if mentions('fraudulent', 'duplicate', 'already paid', 'deny',
                'invalid', 'false claim'):
        return make_action(
            'REJECT', 0.0, 'DUPLICATE_INVOICE',
            'Vendor response or audit confirms fraud/duplicate. Rejecting.')

    # 4. Compliance flag or regulatory wording -> reject as policy violation.
    if mentions('flagged', 'violation', 'sox', 'gdpr', 'non-compliant'):
        return make_action(
            'REJECT', 0.0, 'POLICY_VIOLATION',
            'Compliance review flagged a violation. Rejecting.')

    # 5. Ambiguous vendor answer -> hand over to a manager.
    if mentions('confused', 'unclear', 'unable to confirm'):
        return make_action(
            'ESCALATE', 0.0, 'MANAGER_REVIEW',
            'Vendor response ambiguous. Escalating to manager.')

    # 6. Nothing matched: fall back to a safe rejection.
    return make_action(
        'REJECT', 0.0, 'PENDING_CLARIFICATION',
        'Could not resolve after investigation. Rejecting for safety.')


# ── Metrics tracker ────────────────────────────────────────────────────────────

class Metrics:
    """
    Accumulates training statistics across reward-function batches.

    log_step() ingests one scored batch, updates every history and rewrites a
    JSON snapshot at ``live_path`` (best effort). print_summary() prints a
    console digest and save_all_metrics_figures() renders six PNG figures from
    the accumulated histories.
    """

    # Destination of the live JSON snapshot written by _flush_live(); override
    # on the class or on an instance to relocate it.
    live_path = '/app/metrics_live.json'

    # Dark GitHub-style theme shared by every figure.
    _PALETTE = {'easy': '#3fb950', 'medium': '#d29922', 'hard': '#f85149', 'long': '#a371f7'}
    _BG      = '#0d1117'
    _PANEL   = '#161b22'
    _GRID    = '#21262d'
    _EDGE    = '#30363d'
    _TEXT    = '#e6edf3'
    _SUBTEXT = '#8b949e'
    _ACCENT  = '#58a6ff'
    _MARKER  = '#f78166'

    def __init__(self):
        self.step               = 0
        self.reward_history     = []   # (step, mean_reward) — overall
        self.diff_reward_hist   = collections.defaultdict(list)  # diff → [(step, mean)]
        self.format_history     = []   # (step, format_rate) — compliance over time
        self.episode_len_hist   = []   # all episode lengths (for histogram)
        self.ep_len_by_task     = collections.defaultdict(list)  # task_id → [lengths]
        self.decision_history   = []   # [(step, {decision: cumulative count})] for stacked-bar
        self.decision_counts    = collections.Counter()
        self.parse_failures     = 0
        self.env_errors         = 0
        self.format_scores      = []   # 1.0 / 0.0 per completion
        self.reward_by_task     = collections.defaultdict(list)
        self.total_calls        = 0
        self._start_time        = time.time()
        # Cumulative decision counts (never reset); a copy is snapshotted into
        # decision_history on every step.
        self._step_decisions    = collections.Counter()

    def log_step(self, rewards, decisions, format_ok_list, task_ids, errors,
                 episode_lengths=None):
        """
        Record one batch of scored completions.

        rewards, decisions, format_ok_list and task_ids are parallel per-completion
        sequences; `errors` is the number of environment errors in the batch and
        `episode_lengths` (optional) the per-completion number of env steps.
        Advances `step` by one and rewrites the live snapshot.
        """
        self.step += 1
        self.total_calls += len(rewards)
        mean_r = sum(rewards) / len(rewards) if rewards else 0.0
        self.reward_history.append((self.step, mean_r))

        # Per-difficulty reward history (task ids missing from the map count as easy)
        diff_rewards: dict = collections.defaultdict(list)
        for tid, r in zip(task_ids, rewards):
            diff_rewards[_TASK_DIFFICULTY.get(tid, 'easy')].append(r)
        for d, rs in diff_rewards.items():
            self.diff_reward_hist[d].append((self.step, sum(rs) / len(rs)))

        for d in decisions:
            self.decision_counts[d] += 1
            self._step_decisions[d] += 1
        # Snapshot the cumulative decision distribution every step for the stacked bar
        self.decision_history.append((self.step, dict(self._step_decisions)))

        fmt_ok_count = sum(1 for ok in format_ok_list if ok)
        fmt_rate = fmt_ok_count / len(format_ok_list) if format_ok_list else 0.0
        self.format_history.append((self.step, fmt_rate))
        self.format_scores.extend(1.0 if ok else 0.0 for ok in format_ok_list)

        for tid, r in zip(task_ids, rewards):
            self.reward_by_task[tid].append(r)

        if episode_lengths:
            for tid, ep_len in zip(task_ids, episode_lengths):
                self.episode_len_hist.append(ep_len)
                self.ep_len_by_task[tid].append(ep_len)

        self.env_errors += errors
        self._flush_live()

    def _flush_live(self):
        """
        Write the current counters and full reward history to ``live_path``.

        The snapshot goes to a sibling ``.tmp`` file first and is then moved into
        place with os.replace, so a concurrent reader never sees a half-written
        file. Failures are ignored on purpose: live metrics must never interrupt
        training.
        """
        recent = self.reward_history[-20:]
        recent_mean = sum(r for _, r in recent) / len(recent) if recent else 0.0
        fmt_rate = sum(self.format_scores) / len(self.format_scores) if self.format_scores else 0.0
        task_means = {t: round(sum(v)/len(v), 3) for t, v in self.reward_by_task.items()}
        elapsed = (time.time() - self._start_time) / 60
        payload = {
            'step':           self.step,
            'total_calls':    self.total_calls,
            'recent_mean':    round(recent_mean, 4),
            'format_rate':    round(fmt_rate, 4),
            'parse_failures': self.parse_failures,
            'env_errors':     self.env_errors,
            'elapsed_min':    round(elapsed, 1),
            'reward_history': [{'step': s, 'reward': r} for s, r in self.reward_history],
            'decision_counts': dict(self.decision_counts),
            'task_means':     task_means,
        }
        tmp_path = f'{self.live_path}.tmp'
        try:
            with open(tmp_path, 'w') as f:
                json.dump(payload, f)
            os.replace(tmp_path, self.live_path)
        except (OSError, TypeError, ValueError):
            # Best effort: unwritable directory or unserialisable value.
            pass

    def print_summary(self):
        """Print a short digest of recent reward, format rate, errors and decisions."""
        recent = self.reward_history[-10:] if self.reward_history else []
        recent_mean = sum(r for _, r in recent) / len(recent) if recent else 0.0
        fmt_rate = sum(self.format_scores) / len(self.format_scores) if self.format_scores else 0.0
        print(f'\n[METRICS] step={self.step} | recent_reward={recent_mean:.3f} | '
              f'format_ok={fmt_rate:.1%} | parse_fails={self.parse_failures} | '
              f'env_errors={self.env_errors} | total_calls={self.total_calls}')
        top_decisions = self.decision_counts.most_common(5)
        print(f'[METRICS] decisions: {dict(top_decisions)}')
        if self.reward_by_task:
            task_means = {t: round(sum(v)/len(v), 3) for t, v in self.reward_by_task.items()}
            print(f'[METRICS] per_task_reward: {task_means}')

    # ── Figure helpers ────────────────────────────────────────────────────────

    @staticmethod
    def _smooth(values, window=None):
        """
        Return ``(moving_average, window)`` for a uniform (simple) moving average.

        The window defaults to max(3, len(values) // 12) and is clamped to the
        series length. Series shorter than 3 points are returned unsmoothed with
        window 1, so callers can always unpack two values and plot the result
        against ``steps[window - 1:]``.
        """
        n = len(values)
        if n < 3:
            return np.asarray(values, dtype=float), 1
        w = window or max(3, n // 12)
        w = max(1, min(int(w), n))
        return np.convolve(values, np.ones(w) / w, mode='valid'), w

    def _new_figure(self, *args, **kwargs):
        """plt.subplots() with the themed figure background applied."""
        fig, axes = plt.subplots(*args, **kwargs)
        fig.patch.set_facecolor(self._BG)
        return fig, axes

    def _style_axes(self, ax, xlabel='', ylabel='', title=''):
        """Apply the shared panel colours, grid, spines and axis labels to `ax`."""
        ax.set_facecolor(self._PANEL)
        ax.tick_params(colors=self._TEXT, labelsize=8)
        for sp in ax.spines.values():
            sp.set_color(self._EDGE)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.yaxis.grid(True, color=self._GRID, linewidth=0.6, alpha=0.8)
        ax.xaxis.grid(True, color=self._GRID, linewidth=0.4, alpha=0.4)
        ax.set_axisbelow(True)
        if xlabel:
            ax.set_xlabel(xlabel, color=self._SUBTEXT, fontsize=9)
        if ylabel:
            ax.set_ylabel(ylabel, color=self._SUBTEXT, fontsize=9)
        if title:
            ax.set_title(title, color=self._TEXT, fontsize=10, fontweight='bold', pad=8)

    def _legend(self, ax, **kwargs):
        """ax.legend() with the themed box colours."""
        ax.legend(facecolor=self._PANEL, edgecolor=self._EDGE, labelcolor=self._TEXT, **kwargs)

    def _caption(self, fig, text):
        """Place an italic caption centred under the plot area."""
        fig.text(0.5, 0.01, text, ha='center', va='bottom',
                 color=self._SUBTEXT, fontsize=7, style='italic')

    def _save(self, fig, run_dir, filename, rect=(0, 0.04, 1, 1)):
        """Lay out, save and close `fig`; the figure is closed even if saving fails."""
        fig.tight_layout(rect=rect)
        path = os.path.join(run_dir, filename)
        try:
            fig.savefig(path, dpi=130, bbox_inches='tight', facecolor=self._BG)
        finally:
            plt.close(fig)
        print(f'[METRICS] {path}')

    # ── Figures ───────────────────────────────────────────────────────────────

    def save_all_metrics_figures(self, run_dir: str):
        """
        Save six standard RL research metric figures to run_dir.
        All figures follow conventions used in academic RL papers:
        - Named axes (xlabel, ylabel)
        - Figure caption as fig.text below the plot
        - Dark GitHub-style theme consistent with project
        - Smoothed curves with raw data visible in background

        `run_dir` is created if it does not exist. A figure is skipped when the
        history it is built from is empty.
        """
        if run_dir:
            os.makedirs(run_dir, exist_ok=True)
        ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
        for render in (self._fig_reward_curve,
                       self._fig_difficulty_curves,
                       self._fig_episode_lengths,
                       self._fig_format_compliance,
                       self._fig_decision_distribution,
                       self._fig_per_task_means):
            render(run_dir, ts)

    def _fig_reward_curve(self, run_dir, ts):
        """Figure 1: per-batch mean reward with a moving-average overlay."""
        if not self.reward_history:
            return
        fig, ax = self._new_figure(figsize=(10, 4))
        steps   = [s for s, _ in self.reward_history]
        rewards = [r for _, r in self.reward_history]
        ax.plot(steps, rewards, color=self._ACCENT, alpha=0.25, linewidth=1, label='Per-batch mean')
        if len(rewards) >= 5:
            sm, w = self._smooth(rewards)
            ax.plot(steps[w-1:], sm, color=self._ACCENT, linewidth=2, label=f'Moving avg (w={w})')
        ax.axhline(0.5, color=self._SUBTEXT, linestyle='--', linewidth=1, alpha=0.5,
                   label='Chance baseline (0.5)')
        recent_mean = sum(rewards[-20:]) / min(20, len(rewards))
        ax.axhline(recent_mean, color=self._MARKER, linestyle=':', linewidth=1.5,
                   label=f'Recent mean = {recent_mean:.3f}')
        ax.set_ylim(0, 1.05)
        self._legend(ax, fontsize=8)
        self._style_axes(ax, xlabel='Training Step (reward function call batch)',
                         ylabel='Mean Episode Return  [0.01 – 0.99]',
                         title='Training Reward Curve — AP Commander GRPO')
        self._caption(fig, f'Each step = one GRPO batch. Reward = discounted accumulated score from AP Commander environment. | {ts}')
        self._save(fig, run_dir, 'fig1_reward_curve.png')

    def _fig_difficulty_curves(self, run_dir, ts):
        """Figure 2: one learning curve per difficulty tier plus unlock thresholds."""
        if not self.diff_reward_hist:
            return
        fig, ax = self._new_figure(figsize=(10, 4))
        for diff in _DIFFICULTY_ORDER:
            hist = self.diff_reward_hist.get(diff, [])
            if not hist:
                continue
            steps_d   = [s for s, _ in hist]
            rewards_d = [r for _, r in hist]
            color = self._PALETTE.get(diff, self._ACCENT)
            ax.plot(steps_d, rewards_d, color=color, alpha=0.20, linewidth=1)
            if len(rewards_d) >= 5:
                sm, w = self._smooth(rewards_d)
                ax.plot(steps_d[w-1:], sm, color=color, linewidth=2.5,
                        label=f'{diff} (n={len(steps_d)})')
            else:
                # Too few points to smooth: draw the raw series as the main line.
                ax.plot(steps_d, rewards_d, color=color, linewidth=2.5, label=diff)
        for thr_diff, thr_val in _UNLOCK_THRESHOLDS.items():
            ax.axhline(thr_val, color=self._PALETTE.get(thr_diff, self._SUBTEXT),
                       linestyle='--', linewidth=0.8, alpha=0.5)
        ax.set_ylim(0, 1.05)
        self._legend(ax, fontsize=9)
        self._style_axes(ax, xlabel='Training Step',
                         ylabel='Mean Reward per Difficulty Tier  [0.01 – 0.99]',
                         title='Curriculum Learning Curves — Easy / Medium / Hard / Long-Horizon')
        self._caption(fig, f'Dashed lines = curriculum unlock thresholds. Each line = rolling mean of all tasks in that difficulty tier. | {ts}')
        self._save(fig, run_dir, 'fig2_difficulty_curves.png')

    def _fig_episode_lengths(self, run_dir, ts):
        """Figure 3: episode-length histogram and mean length per difficulty tier."""
        if not self.episode_len_hist:
            return
        fig, axes = self._new_figure(1, 2, figsize=(12, 4))
        # Overall histogram, one integer-wide bin per episode length
        max_len = max(self.episode_len_hist)
        bins = range(1, max(max_len, 1) + 2)
        axes[0].hist(self.episode_len_hist, bins=bins, color=self._ACCENT, alpha=0.85,
                     edgecolor=self._BG, rwidth=0.8)
        self._style_axes(axes[0], xlabel='Episode Length (number of env steps)',
                         ylabel='Count of Episodes',
                         title='Episode Length Distribution (all tasks)')
        overall_len = np.mean(self.episode_len_hist)
        axes[0].axvline(overall_len, color=self._MARKER, linestyle='--', linewidth=1.5,
                        label=f'Mean = {overall_len:.1f}')
        self._legend(axes[0], fontsize=8)
        # Per-difficulty mean episode length bar
        diff_ep_means = {}
        for diff in _DIFFICULTY_ORDER:
            lens = []
            for tid, d in _TASK_DIFFICULTY.items():
                if d == diff:
                    lens.extend(self.ep_len_by_task.get(tid, []))
            if lens:
                diff_ep_means[diff] = np.mean(lens)
        if diff_ep_means:
            diffs  = list(diff_ep_means.keys())
            means  = list(diff_ep_means.values())
            colors = [self._PALETTE.get(d, self._ACCENT) for d in diffs]
            axes[1].bar(diffs, means, color=colors, alpha=0.85, edgecolor=self._BG, width=0.5)
            for i, m in enumerate(means):
                axes[1].text(i, m + 0.05, f'{m:.1f}', ha='center', color=self._TEXT,
                             fontsize=9, fontweight='bold')
            axes[1].set_ylim(0, max(means) * 1.3)
        # Styled even when empty so the panel matches the theme.
        self._style_axes(axes[1], xlabel='Difficulty Tier',
                         ylabel='Mean Episode Length (steps)',
                         title='Mean Episode Length by Difficulty')
        fig.suptitle('Episode Length Analysis — Multi-Step Decision Behavior',
                     color=self._TEXT, fontsize=11, y=1.01)
        self._caption(fig, f'Long-horizon tasks expected to have higher mean episode lengths as agent learns to use ESCALATE/QUERY_VENDOR. | {ts}')
        self._save(fig, run_dir, 'fig3_episode_lengths.png')

    def _fig_format_compliance(self, run_dir, ts):
        """Figure 4: per-step format-compliance rate with the overall rate marked."""
        if not self.format_history:
            return
        fig, ax = self._new_figure(figsize=(10, 3.5))
        steps_f  = [s for s, _ in self.format_history]
        fmt_vals = [r for _, r in self.format_history]
        ax.plot(steps_f, fmt_vals, color='#d29922', alpha=0.25, linewidth=1)
        if len(fmt_vals) >= 5:
            sm, w = self._smooth(fmt_vals)
            ax.plot(steps_f[w-1:], sm, color='#d29922', linewidth=2.5,
                    label=f'Moving avg (w={w})')
        final_rate = sum(self.format_scores) / max(1, len(self.format_scores))
        ax.axhline(final_rate, color='#3fb950', linestyle='--', linewidth=1.5,
                   label=f'Overall rate = {final_rate:.1%}')
        ax.set_ylim(0, 1.05)
        self._legend(ax, fontsize=8)
        self._style_axes(ax, xlabel='Training Step',
                         ylabel='Format Compliance Rate  [0 – 1]',
                         title='JSON Format Compliance Over Training')
        self._caption(fig, f'Format compliance = fraction of completions producing valid JSON with correct fields. Parse failures = {self.parse_failures}. | {ts}')
        self._save(fig, run_dir, 'fig4_format_compliance.png')

    def _fig_decision_distribution(self, run_dir, ts):
        """Figure 5: cumulative decision mix at up to 20 evenly spaced checkpoints."""
        if len(self.decision_history) < 3:
            return
        all_decisions = sorted(self.decision_counts)
        # Sample ~20 evenly-spaced checkpoints for readability
        n_checkpoints = min(20, len(self.decision_history))
        last = len(self.decision_history) - 1
        idxs = [int(i * last / (n_checkpoints - 1)) for i in range(n_checkpoints)]
        ckpt_steps  = [self.decision_history[i][0] for i in idxs]
        ckpt_counts = [self.decision_history[i][1] for i in idxs]
        # Convert counts to fractions (guarding against an empty snapshot)
        fracs = []
        for c in ckpt_counts:
            total_c = sum(c.values()) or 1
            fracs.append({d: c.get(d, 0) / total_c for d in all_decisions})
        fig, ax = self._new_figure(figsize=(12, 4))
        dec_colors = ['#3fb950', '#f85149', '#d29922', '#a371f7', '#58a6ff', '#f0883e']
        positions = range(len(ckpt_steps))
        bottom = np.zeros(len(ckpt_steps))
        for j, dec in enumerate(all_decisions):
            vals = np.array([f[dec] for f in fracs])
            ax.bar(positions, vals, bottom=bottom,
                   label=dec, color=dec_colors[j % len(dec_colors)],
                   alpha=0.85, edgecolor=self._BG)
            bottom += vals
        ax.set_xticks(positions)
        ax.set_xticklabels([str(s) for s in ckpt_steps], rotation=45, fontsize=7)
        ax.set_ylim(0, 1.05)
        self._legend(ax, fontsize=7, loc='upper right', bbox_to_anchor=(1.15, 1))
        self._style_axes(ax, xlabel='Training Step (checkpoint)',
                         ylabel='Fraction of Decisions',
                         title='Decision Distribution Over Training (Stacked Bar)')
        self._caption(fig, f'Each bar = cumulative decision distribution up to that checkpoint. Ideal: APPROVE_FULL grows for easy tasks, REJECT for fraud/duplicate tasks. | {ts}')
        # Narrower layout rectangle leaves room for the legend outside the axes.
        self._save(fig, run_dir, 'fig5_decision_distribution.png', rect=(0, 0.04, 0.88, 1))

    def _fig_per_task_means(self, run_dir, ts):
        """Figure 6: mean training reward per task, bars coloured by difficulty tier."""
        if not self.reward_by_task:
            return
        task_means = {t: sum(v)/len(v) for t, v in self.reward_by_task.items()}
        tasks  = sorted(task_means,
                        key=lambda t: (_DIFFICULTY_ORDER.index(_TASK_DIFFICULTY.get(t, 'easy')), t))
        means  = [task_means[t] for t in tasks]
        colors = [self._PALETTE.get(_TASK_DIFFICULTY.get(t, 'easy'), self._ACCENT) for t in tasks]
        short  = [t.replace('easy_', '').replace('medium_', '').replace('hard_', '')
                   .replace('long_', '').replace('_', ' ').title() for t in tasks]

        fig, ax = self._new_figure(figsize=(10, max(4, len(tasks) * 0.45)))
        yp = list(range(len(tasks)))
        ax.barh(yp, means, color=colors, alpha=0.85, edgecolor=self._BG)
        ax.set_yticks(yp)
        ax.set_yticklabels(short, fontsize=8)
        ax.set_xlim(0, 1.05)
        overall_mean = sum(means) / len(means)
        ax.axvline(overall_mean, color=self._MARKER, linestyle='--', linewidth=1.5,
                   label=f'Overall mean = {overall_mean:.3f}')
        ax.axvline(0.5, color=self._SUBTEXT, linestyle=':', linewidth=1, alpha=0.5)
        for i, m in enumerate(means):
            ax.text(m + 0.01, i, f'{m:.3f}', va='center', color=self._TEXT, fontsize=7)
        from matplotlib.patches import Patch
        legend_els = [Patch(facecolor=self._PALETTE[d], label=d.title())
                      for d in _DIFFICULTY_ORDER if d in self._PALETTE]
        legend_els.append(plt.Line2D([0], [0], color=self._MARKER, linestyle='--',
                                     label=f'Mean {overall_mean:.3f}'))
        self._legend(ax, handles=legend_els, fontsize=8)
        self._style_axes(ax, xlabel='Mean Training Reward  [0.01 – 0.99]',
                         ylabel='Task',
                         title='Per-Task Training Mean Reward (all episodes)')
        self._caption(fig, f'Tasks ordered by difficulty; bar colour = difficulty tier. Dashed line = mean across tasks, dotted line = 0.5 reference. | {ts}')
        self._save(fig, run_dir, 'fig6_per_task_means.png')


# Module-wide metrics accumulator; parse_action() bumps its parse_failures
# counter and the reward function reads its step / total_calls counters.
METRICS = Metrics()
# Destination of the per-episode JSONL log; the reward function writes records
# only while this is non-empty.
_EPISODE_LOG_PATH: str = ''   # set to run_dir/episodes.jsonl once run_dir is known

# ── Helpers ────────────────────────────────────────────────────────────────────

def obs_to_prompt(obs: dict) -> str:
    """
    Render an environment observation as the plain-text user prompt.

    Sections, in order: task name/description, invoice header with its line
    items and freight charge, purchase orders, goods receipts, then the paid
    ledger and context notes (each only when non-empty), and finally the
    company policy followed by the instruction to output a JSON decision.
    """
    invoice = obs['invoice']

    item_rows = []
    for item in invoice.get('line_items', []):
        item_rows.append(
            f"  {item['description']}: qty={item['quantity']}, unit_price=${item['unit_price']:.2f}")
    items_text = '\n'.join(item_rows)

    po_rows = []
    for po in obs.get('purchase_orders', []):
        head = f"  PO {po['po_number']} ({po['status']}) {po['vendor_name']}: "
        terms = [
            f"{ln['description']} qty={ln['ordered_quantity']} @${ln['agreed_unit_price']:.2f}"
            for ln in po.get('lines', [])
        ]
        po_rows.append(head + ', '.join(terms))
    po_text = '\n'.join(po_rows)

    grn_rows = []
    for grn in obs.get('goods_receipts', []):
        head = f"  GRN {grn['grn_id']} (PO {grn['po_number']}): "
        received = [f"{ln['description']} recv={ln['received_quantity']}"
                    for ln in grn.get('lines', [])]
        grn_rows.append(head + ', '.join(received))
    grn_text = '\n'.join(grn_rows)

    context_text = '\n'.join(f'  {note}' for note in obs.get('context_notes', []))
    paid_text = ', '.join(obs.get('paid_invoice_ids', []))

    pieces = [
        f"TASK: {obs['task_name']}\n{obs['task_description']}\n\n",
        f"INVOICE {invoice['invoice_id']} | {invoice['vendor_name']} | ${invoice['invoice_total']:,.2f}\n{items_text}\n",
        f"Freight: ${invoice.get('freight_charge',0):.2f}\n\n",
        f"PURCHASE ORDERS:\n{po_text}\n\nGOODS RECEIPTS:\n{grn_text}\n",
    ]
    # Optional sections are dropped entirely when they would be empty.
    if paid_text:
        pieces.append(f"PAID LEDGER: {paid_text}\n")
    if context_text:
        pieces.append(f"CONTEXT:\n{context_text}\n")
    pieces.append(f"\nPOLICY:\n{obs['company_policy']}\n\nOutput JSON decision.")
    return ''.join(pieces)


def parse_action(raw: str) -> tuple[dict, bool]:
    """
    Extract and validate the JSON action from a model completion.

    Markdown code fences are stripped, then the text is scanned for the first
    JSON object that satisfies the action schema:
      - decision in VALID_DECISIONS, reason_code in VALID_REASON_CODES
      - approved_amount is an int or float (booleans are rejected)
      - explanation is a string longer than 10 characters

    Objects are decoded with JSONDecoder.raw_decode starting at each '{', so
    surrounding prose — even prose that itself contains braces — does not
    invalidate an otherwise well-formed action. An object that decodes but fails
    validation is skipped as a whole; objects nested inside it are not considered.

    Returns (action_dict, format_ok). When nothing valid is found,
    METRICS.parse_failures is incremented and a REJECT fallback action is
    returned with format_ok=False.
    """
    def _is_valid(candidate) -> bool:
        if not isinstance(candidate, dict):
            return False
        decision = candidate.get('decision')
        reason = candidate.get('reason_code')
        amount = candidate.get('approved_amount')
        explanation = candidate.get('explanation')
        return (
            # isinstance guards keep unhashable JSON values out of the set lookups
            isinstance(decision, str) and decision in VALID_DECISIONS
            and isinstance(reason, str) and reason in VALID_REASON_CODES
            # bool is a subclass of int, so JSON true/false must be excluded explicitly
            and isinstance(amount, (int, float)) and not isinstance(amount, bool)
            and isinstance(explanation, str) and len(explanation) > 10
        )

    clean = re.sub(r'```(?:json)?\s*|\s*```', '', raw).strip()
    decoder = json.JSONDecoder()
    start = clean.find('{')
    while start != -1:
        try:
            candidate, end = decoder.raw_decode(clean, start)
        except (json.JSONDecodeError, RecursionError):
            # Not a JSON object at this brace; try the next one.
            start = clean.find('{', start + 1)
            continue
        if _is_valid(candidate):
            return candidate, True
        # Decoded but not a valid action: resume after the whole object.
        start = clean.find('{', end)

    METRICS.parse_failures += 1
    return {'decision': 'REJECT', 'approved_amount': 0.0,
            'reason_code': 'NO_PO_FOUND', 'explanation': 'parse error fallback'}, False


def run_episode(task_id: str, action_json: dict, seed=None) -> float:
    """
    Play a single-step episode: reset the environment for `task_id`/`seed`,
    submit `action_json` once, and return the step's reward score.

    Any failure (network, HTTP status, unexpected response shape) yields the
    floor score 0.01 instead of raising.
    """
    reset_payload = {'task_id': task_id, 'seed': seed}
    try:
        reset_resp = requests.post(f'{ENV_URL}/reset', json=reset_payload, timeout=20)
        reset_resp.raise_for_status()
        session_id = reset_resp.json()['session_id']

        step_payload = {'session_id': session_id, 'action': action_json}
        step_resp = requests.post(f'{ENV_URL}/step', json=step_payload, timeout=20)
        step_resp.raise_for_status()

        score = step_resp.json()['reward']['score']
        return float(score)
    except Exception:
        return 0.01


def run_episode_accumulated(task_id: str, first_action: dict, seed=None,
                             discount: float = 0.9, max_steps: int = 20,
                             episode_log: list | None = None) -> tuple[float, int]:
    """
    Run a full multi-step episode accumulating discounted per-step rewards.

    The model's `first_action` starts the episode; _greedy_followup() chooses
    every later action until the environment reports done or `max_steps` is
    reached, so multi-step sequences earn full accumulated credit.
    E.g. QUERY_VENDOR→REJECT = 0.01 + 0.9*0.99 = 0.901 > shortcut REJECT = ~0.4

    Args:
        task_id: environment task to reset into.
        first_action: action dict submitted at step 0.
        seed: forwarded to the environment's /reset endpoint.
        discount: per-step discount; step n contributes discount**n * score.
        max_steps: upper bound on environment steps.
        episode_log: if provided, appended with one dict per env step for JSONL logging.

    Returns:
        (score, episode_length) with score clamped to [0.01, 0.99].

    This function never raises for ordinary errors: a failed request or an
    unexpected response is counted in METRICS.env_errors, reported on stdout,
    and the episode scores (0.01, 1). Without that bookkeeping the failure
    would be invisible, because callers only ever see the floor score.
    """
    steps_taken = 0
    try:
        r = requests.post(f'{ENV_URL}/reset',
                          json={'task_id': task_id, 'seed': seed}, timeout=20)
        r.raise_for_status()
        session_id = r.json()['session_id']
        action = first_action
        total  = 0.0
        for step_n in range(max_steps):
            step_r = requests.post(f'{ENV_URL}/step',
                                   json={'session_id': session_id, 'action': action},
                                   timeout=20)
            step_r.raise_for_status()
            result    = step_r.json()
            r_score   = float(result['reward']['score'])
            done      = result['done']
            # A null/missing observation is treated as an empty one so the
            # follow-up policy can still fall back to its default action.
            obs_back  = result.get('observation') or {}
            total    += (discount ** step_n) * r_score
            steps_taken = step_n + 1
            if episode_log is not None:
                episode_log.append({
                    'step_n':          step_n,
                    'decision':        action.get('decision'),
                    'approved_amount': action.get('approved_amount'),
                    'reason_code':     action.get('reason_code'),
                    'explanation':     (action.get('explanation') or '')[:120],
                    'step_score':      round(r_score, 4),
                    'done':            done,
                    'context_notes':   obs_back.get('context_notes', []),
                    'action_history':  obs_back.get('action_history', []),
                })
            if done:
                break
            action = _greedy_followup(obs_back)
        return min(0.99, max(0.01, total)), steps_taken
    except Exception as e:
        # Boundary handler: training must continue, but the failure is recorded.
        METRICS.env_errors += 1
        print(f'[ENV ERROR] task={task_id} seed={seed} after {steps_taken} step(s): '
              f'{type(e).__name__}: {e}')
        return 0.01, 1


# ── Two independent reward functions (guide: use multiple, not one) ─────────────

def env_reward_fn(completions, task_id=None, seed=None, **kwargs):
    """
    Environment reward: accumulated discounted per-step reward from AP Commander.

    Each completion is parsed into an action, the task is passed through the
    curriculum gate (locked tasks are redirected to another task id), and the
    episode is played via run_episode_accumulated(). One JSONL record per
    episode is appended to _EPISODE_LOG_PATH when that path is set.

    Args:
        completions: generated texts, one per sample.
        task_id:     per-completion task ids (presumably supplied by the trainer
                     from the dataset column of the same name); defaults to
                     'easy_perfect_match' for every completion.
        seed:        per-completion env seeds; defaults to ONE random seed that is
                     shared by all completions in this call.
        **kwargs:    ignored; accepted so extra columns do not break the call.

    Returns:
        List of float rewards, one per (completion, task, seed) triple.
    """
    n_completions = len(completions)
    task_ids = task_id if task_id is not None else ['easy_perfect_match'] * n_completions
    seeds    = seed    if seed    is not None else [random.randint(1, 999)] * n_completions

    rewards, decisions, format_ok_list, ep_lengths, errors = [], [], [], [], 0
    for completion, tid, s in zip(completions, task_ids, seeds):
        gated_tid = CURRICULUM.gate_task(tid)
        if gated_tid != tid:
            print(f'[CURRICULUM] gate {tid} → {gated_tid}')

        action, fmt_ok = parse_action(completion)
        episode_steps = []
        try:
            score, ep_len = run_episode_accumulated(
                gated_tid, action, seed=int(s), episode_log=episode_steps)
        except Exception as e:
            # Best-effort: a failed episode earns the floor reward and is counted.
            print(f'[ENV] episode failed task={gated_tid} seed={s}: {e}')
            score, ep_len = 0.01, 1
            errors += 1

        rewards.append(score)
        ep_lengths.append(ep_len)
        decisions.append(action.get('decision', 'UNKNOWN'))
        format_ok_list.append(fmt_ok)
        CURRICULUM.record(gated_tid, score)

        # Running 1-based index of this completion across all reward calls.
        # METRICS.total_calls is only read here; it is assumed to advance in
        # METRICS.log_step() after the loop — TODO confirm.
        call_n = METRICS.total_calls + len(rewards)

        # Write structured episode record to JSONL for full verifiability
        if _EPISODE_LOG_PATH:
            try:
                record = {
                    'reward_step':   METRICS.step + 1,
                    'call_n':        call_n,
                    'task_id':       tid,
                    'gated_task_id': gated_tid,
                    'seed':          int(s),
                    'format_ok':     fmt_ok,
                    'score':         round(score, 4),
                    'episode_len':   ep_len,
                    'final_decision': action.get('decision'),
                    'steps':         episode_steps,
                    'ts':            datetime.datetime.now().isoformat(),
                }
                with open(_EPISODE_LOG_PATH, 'a') as _f:
                    _f.write(json.dumps(record) + '\n')
            except Exception as log_err:
                # Logging must never break training, but do not hide the failure.
                print(f'[LOG] episode record not written: {log_err}')

        # Print one sample every LOG_SAMPLES_EVERY completions. Gating on the
        # per-completion index (not the per-batch counter) avoids dumping a
        # whole batch at once or skipping sampling entirely.
        if call_n % LOG_SAMPLES_EVERY == 0:
            gated_note = f'→{gated_tid}' if gated_tid != tid else ''
            print(f'\n[SAMPLE] task={tid}{gated_note} seed={s} fmt={fmt_ok} '
                  f'score={score:.3f} ep_len={ep_len}')
            print(f'  {action.get("decision")} ${action.get("approved_amount")} '
                  f'{action.get("reason_code")}')
            print(f'  {str(action.get("explanation",""))[:100]}')
            print(f'  curriculum: {CURRICULUM.status_line()}')
            if episode_steps:
                actor_notes = [n for step in episode_steps
                               for n in step.get('context_notes', [])]
                if actor_notes:
                    print(f'  actor_responses: {actor_notes[:2]}')

    METRICS.log_step(rewards, decisions, format_ok_list, list(task_ids), errors,
                     episode_lengths=ep_lengths)
    if METRICS.step % 5 == 0:
        METRICS.print_summary()
        print(f'[CURRICULUM] {CURRICULUM.status_line()}')
    return rewards


def format_reward_fn(completions, **kwargs):
    """
    Format reward, independent of the environment score.

    Every completion is run through parse_action(); its second return value
    (the format-ok flag) selects +0.05 when truthy and -0.05 otherwise.
    Extra keyword arguments are accepted and ignored.
    """
    bonus, penalty = 0.05, -0.05
    return [bonus if parse_action(text)[1] else penalty for text in completions]


# ── Eval helper ────────────────────────────────────────────────────────────────

def eval_task(model, tokenizer, task_id: str, seed: int = 99) -> float:
    """
    Score the model on the FIRST step of one task.

    Resets the environment for (task_id, seed), prompts the model with the
    returned observation, parses the generation into an action and submits it
    with a single /step call. Only that one step's reward is returned; no
    follow-up steps are played here.

    The model is switched to eval mode and left there — callers that go on to
    train must call model.train() themselves.

    Returns:
        The step's reward score, or 0.01 if anything fails (the error is printed).
    """
    import torch
    model.eval()
    try:
        reset_resp = requests.post(f'{ENV_URL}/reset',
                                   json={'task_id': task_id, 'seed': seed}, timeout=20)
        # Fail with the HTTP status instead of an opaque KeyError on an error body.
        reset_resp.raise_for_status()
        reset = reset_resp.json()
        obs, session_id = reset['observation'], reset['session_id']
        messages = [{'role': 'system', 'content': SYSTEM_PROMPT},
                    {'role': 'user',   'content': obs_to_prompt(obs)}]
        text    = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Follow the model's own device rather than assuming 'cuda'; fall back to
        # 'cuda' for model objects that do not expose a .device attribute.
        device  = getattr(model, 'device', 'cuda')
        inputs  = tokenizer(text, return_tensors='pt').to(device)
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=250, temperature=0.1, do_sample=True)
        # Decode only the newly generated tokens (strip the prompt prefix).
        raw    = tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
        action, _fmt_ok = parse_action(raw)
        step_resp = requests.post(f'{ENV_URL}/step',
                                  json={'session_id': session_id, 'action': action},
                                  timeout=20)
        step_resp.raise_for_status()
        score  = float(step_resp.json()['reward']['score'])
        print(f'    output: {raw[:120].strip()}')
        return score
    except Exception as e:
        # Best-effort evaluation: report and return the floor score.
        print(f'    eval error: {e}')
        return 0.01


# ── Main ───────────────────────────────────────────────────────────────────────

def _make_run_dir() -> str:
    """
    Create and return this run's artifact directory.

    The path is /app/runs/grpo/<model>-<N>ep-<YYYY-MM-DD_HHMM>, where <model> is
    the last path segment of MODEL_NAME, lower-cased, with dots turned into
    dashes. An already existing directory is reused without error.
    """
    slug = MODEL_NAME.split('/')[-1].lower().replace('.', '-')
    stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H%M')
    target = '/app/runs/grpo/' + f'{slug}-{NUM_EPOCHS}ep-{stamp}'
    os.makedirs(target, exist_ok=True)
    return target


def main():
    """
    Run the full GRPO pipeline end to end.

    Steps, in order: optional HF Hub login, run-directory creation, environment
    health check, 4-bit model + LoRA setup, baseline eval over EVAL_TASKS,
    prompt dataset construction from TRAIN_TASKS, GRPO training with the two
    reward functions, adapter save/upload, post-training eval, before/after
    figure (results.png), training_results.json, and upload of the run folder.

    Side effects: sets the module-level _EPISODE_LOG_PATH, writes files under
    the run directory, and performs network calls to the environment and HF Hub.
    Hub uploads are best-effort: failures are printed, not raised.
    """
    global _EPISODE_LOG_PATH

    # Authenticate with HF Hub if token provided (needed for gated models like Llama-3)
    hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
    if hf_token:
        from huggingface_hub import login
        login(token=hf_token, add_to_git_credential=False)
        print('[AUTH] Logged in to HF Hub.')
    else:
        print('[AUTH] No HF_TOKEN set — using public models only (Qwen recommended).')

    # All run artifacts go into this timestamped dir — never overwrite a previous run
    RUN_DIR = _make_run_dir()
    print(f'[RUN] Artifacts → {RUN_DIR}')

    # Point the global episode log path so env_reward_fn can write structured logs
    _EPISODE_LOG_PATH = os.path.join(RUN_DIR, 'episodes.jsonl')
    print(f'[RUN] Episode log → {_EPISODE_LOG_PATH}')

    print(f'[ENV] Checking {ENV_URL}...')
    h = requests.get(f'{ENV_URL}/health', timeout=30).json()
    print(f"[ENV] status={h['status']} tasks={h.get('total_tasks')}")

    print(f'[MODEL] Loading {MODEL_NAME}...')
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model, TaskType
    from datasets import Dataset
    from trl import GRPOConfig, GRPOTrainer

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map='auto',
        trust_remote_code=True,
    )
    model.enable_input_require_grads()
    model.gradient_checkpointing_enable()

    lora_cfg = LoraConfig(
        r=16, lora_alpha=16,
        target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],
        lora_dropout=0, bias='none',
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()

    # Baseline eval (before training)
    print('\n[BASELINE] Before training:')
    baseline = {}
    for t in EVAL_TASKS:
        s = eval_task(model, tokenizer, t)
        baseline[t] = s
        print(f'  {t}: {s:.3f}')
    print(f'  Mean: {sum(baseline.values())/max(1, len(baseline)):.3f}')
    model.train()

    # Dataset contains every TRAIN_TASKS entry × 5 seeds.
    # gate_task() in env_reward_fn handles curriculum redirection at reward time:
    # locked tasks (medium/hard/long) redirect to easy during early training.
    # As curriculum unlocks thresholds, redirection stops and full task variety flows.
    print(f'\n[DATASET] Building prompts ({len(TRAIN_TASKS)} tasks × 5 seeds = {len(TRAIN_TASKS)*5})...')
    task_seed_pairs = [(tid, s) for tid in TRAIN_TASKS for s in range(1, 6)]
    rows = []
    for task_id, seed in task_seed_pairs:
        try:
            reset_resp = requests.post(f'{ENV_URL}/reset',
                                       json={'task_id': task_id, 'seed': seed}, timeout=20)
            # Surface HTTP failures in the skip message instead of a bare KeyError.
            reset_resp.raise_for_status()
            obs   = reset_resp.json()['observation']
            messages = [{'role': 'system', 'content': SYSTEM_PROMPT},
                        {'role': 'user',   'content': obs_to_prompt(obs)}]
            rows.append({
                'prompt':   tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True),
                'task_id':  task_id,
                'seed':     seed,
            })
        except Exception as e:
            print(f'  skip {task_id} seed={seed}: {e}')

    # Every reset failing would otherwise hand the trainer an empty dataset.
    if not rows:
        raise RuntimeError(f'[DATASET] No prompts could be built from {ENV_URL}; aborting.')

    dataset = Dataset.from_list(rows)
    print(f'[DATASET] {len(dataset)} samples across {len(TRAIN_TASKS)} tasks '
          f'({sum(1 for r in rows if _TASK_DIFFICULTY.get(r["task_id"],"easy")=="long")} long-horizon) '
          f'| curriculum: {CURRICULUM.status_line()}')

    # Train
    print(f'\n[TRAIN] {NUM_EPOCHS} epochs | {NUM_GENERATIONS} generations/prompt | {len(dataset)} samples')
    model.train()
    # generation_batch_size = per_device_train_batch_size (TRL default).
    # TRL requires: generation_batch_size % num_generations == 0.
    # Simplest fix: set per_device_train_batch_size = num_generations.
    config = GRPOConfig(
        output_dir            = './ap_commander_grpo',
        num_train_epochs      = NUM_EPOCHS,
        per_device_train_batch_size = NUM_GENERATIONS,
        num_generations       = NUM_GENERATIONS,
        gradient_accumulation_steps = 1,
        learning_rate         = 2e-5,
        max_completion_length = 250,
        temperature           = 0.9,
        logging_steps         = 1,
        save_steps            = 999,
        report_to             = 'none',
        # Keep task_id/seed columns so they reach the reward functions.
        remove_unused_columns = False,
    )
    # Two independent reward functions (guide: use multiple, not one combined signal)
    trainer = GRPOTrainer(
        model=model, processing_class=tokenizer,
        reward_funcs=[env_reward_fn, format_reward_fn],
        args=config, train_dataset=dataset,
    )
    result = trainer.train()
    print(f'\n[TRAIN] Done. Loss: {result.training_loss:.4f}')

    METRICS.print_summary()
    METRICS.save_all_metrics_figures(RUN_DIR)

    # Save LoRA adapters (guide point 16: save adapters directly, do NOT merge 4-bit naively)
    adapter_dir = os.path.join(RUN_DIR, 'adapter')
    print(f'[SAVE] Saving LoRA adapters to {adapter_dir}...')
    model.save_pretrained(adapter_dir)
    tokenizer.save_pretrained(adapter_dir)

    # Upload adapter to HF Hub as a model repo
    try:
        from huggingface_hub import HfApi
        api = HfApi()
        api.upload_folder(
            folder_path=adapter_dir,
            repo_id='Pathikreet/ap-commander-adapter',
            repo_type='model',
            commit_message=f'GRPO {datetime.datetime.now().strftime("%Y-%m-%d")} — {MODEL_NAME} {NUM_EPOCHS}ep',
        )
        print('[SAVE] Adapter pushed to HF Hub: Pathikreet/ap-commander-adapter')
    except Exception as e:
        print(f'[SAVE] HF Hub upload skipped: {e}')

    # Post-training eval (every task in EVAL_TASKS)
    print('\n[POST-EVAL] After training:')
    post = {}
    model.eval()
    for t in EVAL_TASKS:
        s = eval_task(model, tokenizer, t)
        post[t] = s
        print(f'  {t}: {s:.3f}')
    print(f'  Mean: {sum(post.values())/max(1, len(post)):.3f}')

    print('\n[COMPARE]')
    for t in EVAL_TASKS:
        d = post[t] - baseline[t]
        sym = '+' if d >= 0 else ''
        print(f'  {t:<35} {baseline[t]:.3f} -> {post[t]:.3f}  ({sym}{d:.3f})')

    # ── Before/After comparison figure (results.png — key result for demo) ────
    BG, TEXT, SUBTEXT = '#0d1117', '#e6edf3', '#8b949e'
    PANEL, GRID = '#161b22', '#21262d'
    # Computed once; used by both the figure title and the JSON summary.
    fmt_rate = sum(METRICS.format_scores) / max(1, len(METRICS.format_scores))

    eval_tasks_sorted = sorted(
        EVAL_TASKS,
        key=lambda t: (_DIFFICULTY_ORDER.index(_TASK_DIFFICULTY.get(t,'easy')), t)
    )
    DIFF_COLORS = {'easy': '#3fb950', 'medium': '#d29922', 'hard': '#f85149', 'long': '#a371f7'}

    fig = plt.figure(figsize=(18, max(8, len(eval_tasks_sorted) * 0.45 + 2)))
    fig.patch.set_facecolor(BG)
    gs  = fig.add_gridspec(1, 2, wspace=0.38)

    # Panel left: before/after horizontal bars
    ax_l = fig.add_subplot(gs[0, 0])
    ax_l.set_facecolor(PANEL)
    yp   = np.arange(len(eval_tasks_sorted))
    short = [t.replace('easy_','').replace('medium_','').replace('hard_','').replace('long_','')
              .replace('_',' ').title() for t in eval_tasks_sorted]
    bar_h = 0.35
    ax_l.barh(yp - bar_h/2, [baseline.get(t, 0) for t in eval_tasks_sorted],
              bar_h, label='Before GRPO', color='#f85149', alpha=0.85, edgecolor=BG)
    ax_l.barh(yp + bar_h/2, [post.get(t, 0)     for t in eval_tasks_sorted],
              bar_h, label='After GRPO',  color='#3fb950', alpha=0.85, edgecolor=BG)
    ax_l.set_yticks(yp)
    ax_l.set_yticklabels(short, fontsize=8, color=TEXT)
    ax_l.set_xlim(0, 1.15)
    ax_l.axvline(0.5, color=SUBTEXT, linestyle='--', linewidth=1, alpha=0.5)
    ax_l.legend(fontsize=9, facecolor=PANEL, edgecolor='#30363d', labelcolor=TEXT)
    ax_l.set_xlabel('Task Score  [0.01 – 0.99]', color=SUBTEXT, fontsize=9)
    ax_l.set_ylabel('Task (color = difficulty tier)', color=SUBTEXT, fontsize=9)
    ax_l.set_title(f'Before vs After GRPO — {NUM_EPOCHS} Epochs', color=TEXT, fontsize=11,
                   fontweight='bold', pad=10)
    ax_l.tick_params(colors=TEXT, labelsize=8)
    # Color-code y-tick labels by difficulty. This must run AFTER tick_params:
    # tick_params(colors=...) also resets the label color and would otherwise
    # wipe out the per-task colors.
    for tick_label, t in zip(ax_l.get_yticklabels(), eval_tasks_sorted):
        tick_label.set_color(DIFF_COLORS.get(_TASK_DIFFICULTY.get(t,'easy'), TEXT))
    for sp in ax_l.spines.values(): sp.set_color('#30363d')
    ax_l.spines['top'].set_visible(False); ax_l.spines['right'].set_visible(False)
    ax_l.xaxis.grid(True, color=GRID, linewidth=0.6, alpha=0.7)
    ax_l.set_axisbelow(True)

    # Panel right: delta (improvement) per task
    ax_r = fig.add_subplot(gs[0, 1])
    ax_r.set_facecolor(PANEL)
    deltas = [post.get(t, 0) - baseline.get(t, 0) for t in eval_tasks_sorted]
    d_colors = ['#3fb950' if d >= 0 else '#f85149' for d in deltas]
    ax_r.barh(yp, deltas, color=d_colors, alpha=0.85, edgecolor=BG)
    ax_r.set_yticks(yp)
    ax_r.set_yticklabels(short, fontsize=8, color=TEXT)
    ax_r.axvline(0, color=SUBTEXT, linewidth=1)
    for i, d in enumerate(deltas):
        # Nudge the label away from the bar end; the epsilon makes sign(0) positive.
        ax_r.text(d + 0.005 * np.sign(d + 1e-9), i, f'{d:+.3f}',
                  va='center', color=TEXT, fontsize=7)
    ax_r.set_xlabel('Score Delta (After − Before)', color=SUBTEXT, fontsize=9)
    ax_r.set_ylabel('Task', color=SUBTEXT, fontsize=9)
    ax_r.set_title('GRPO Improvement per Task', color=TEXT, fontsize=11,
                   fontweight='bold', pad=10)
    ax_r.tick_params(colors=TEXT, labelsize=8)
    for sp in ax_r.spines.values(): sp.set_color('#30363d')
    ax_r.spines['top'].set_visible(False); ax_r.spines['right'].set_visible(False)
    ax_r.xaxis.grid(True, color=GRID, linewidth=0.6, alpha=0.7)
    ax_r.set_axisbelow(True)

    n_eval = max(1, len(eval_tasks_sorted))
    mean_before = sum(baseline.get(t,0) for t in eval_tasks_sorted) / n_eval
    mean_after  = sum(post.get(t,0)     for t in eval_tasks_sorted) / n_eval
    # The '+' format spec renders the sign itself, so a regression shows as
    # '(-0.012)' rather than '(+-0.012)'.
    fig.suptitle(
        f'AP Commander GRPO — {MODEL_NAME.split("/")[-1]}  |  {NUM_EPOCHS} epochs  |  '
        f'{NUM_GENERATIONS} generations  |  {len(TRAIN_TASKS)} tasks\n'
        f'Overall: {mean_before:.3f} → {mean_after:.3f}  ({mean_after-mean_before:+.3f})  '
        f'|  format={fmt_rate:.1%}  |  parse_fails={METRICS.parse_failures}  '
        f'|  {datetime.datetime.now().strftime("%Y-%m-%d")}',
        color=TEXT, fontsize=10, y=1.01
    )
    fig.text(0.5, -0.01,
             'Task colors: green=easy, yellow=medium, red=hard, purple=long-horizon. '
             'Score range [0.01, 0.99] as per AP Commander environment specification.',
             ha='center', color=SUBTEXT, fontsize=8, style='italic')
    results_png = os.path.join(RUN_DIR, 'results.png')
    plt.savefig(results_png, dpi=130, bbox_inches='tight', facecolor=BG)
    plt.close()
    print(f'[DONE] Saved {results_png}')

    # Save JSON
    output = {
        'timestamp':       datetime.datetime.now().isoformat(),
        'run_dir':         RUN_DIR,
        'model':           MODEL_NAME,
        'epochs':          NUM_EPOCHS,
        'num_generations': NUM_GENERATIONS,
        'per_device_train_batch_size': NUM_GENERATIONS,
        'train_tasks':     TRAIN_TASKS,
        'eval_tasks':      list(EVAL_TASKS),
        'hardware':        'A10G (HF Spaces)',
        'baseline':        baseline,
        'post_training':   post,
        'delta':           {t: round(post.get(t,0) - baseline.get(t,0), 4) for t in EVAL_TASKS},
        'overall_baseline': round(mean_before, 4),
        'overall_post':     round(mean_after, 4),
        'overall_delta':    round(mean_after - mean_before, 4),
        'episode_log':      _EPISODE_LOG_PATH,
        'metrics': {
            'total_reward_calls':   METRICS.total_calls,
            'parse_failures':       METRICS.parse_failures,
            'env_errors':           METRICS.env_errors,
            'format_rate':          round(fmt_rate, 4),
            'decision_counts':      dict(METRICS.decision_counts),
            'per_task_mean':        {t: round(sum(v)/max(1, len(v)), 4) for t, v in METRICS.reward_by_task.items()},
            'mean_episode_length':  round(sum(METRICS.episode_len_hist) / max(1, len(METRICS.episode_len_hist)), 2),
            # Mean post-training score per difficulty tier, over evaluated tasks only.
            'by_difficulty_post':   {d: round(sum(post.get(t,0) for t,diff in _TASK_DIFFICULTY.items()
                                                  if diff==d and t in post) /
                                              max(1, sum(1 for t,diff in _TASK_DIFFICULTY.items()
                                                         if diff==d and t in post)), 4)
                                     for d in _DIFFICULTY_ORDER},
        },
        'figures': [
            'fig1_reward_curve.png',
            'fig2_difficulty_curves.png',
            'fig3_episode_lengths.png',
            'fig4_format_compliance.png',
            'fig5_decision_distribution.png',
            'fig6_per_task_means.png',
            'results.png',
        ],
    }
    results_json = os.path.join(RUN_DIR, 'training_results.json')
    with open(results_json, 'w') as f:
        json.dump(output, f, indent=2)
    print(f'[DONE] Saved {results_json}')

    # Copy live metrics into run dir as snapshot (best-effort: the file may not exist)
    try:
        import shutil
        shutil.copy('/app/metrics_live.json', os.path.join(RUN_DIR, 'metrics_live.json'))
    except Exception as e:
        print(f'[DONE] metrics_live.json snapshot skipped: {e}')

    # Persist entire run dir to HF Space repo (runs/grpo/MODEL-NEP-DATETIME/)
    # so artifacts survive container restarts and each run is independently addressable
    repo_run_path = RUN_DIR.replace('/app/', '')  # strip /app/ prefix for repo path
    try:
        from huggingface_hub import HfApi
        api = HfApi()
        api.upload_folder(
            folder_path=RUN_DIR,
            path_in_repo=repo_run_path,
            repo_id='Pathikreet/ap-commander-training',
            repo_type='space',
            commit_message=f'Run artifacts: {os.path.basename(RUN_DIR)}',
            ignore_patterns=['adapter/*'],  # adapter uploaded separately to model repo
        )
        print(f'[UPLOAD] Run folder → {repo_run_path} in Pathikreet/ap-commander-training')
    except Exception as e:
        print(f'[UPLOAD] artifact upload failed: {e}')


# Script entry point: run the training pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()