Spaces:

StavanKhobare
/

SST-MetaxPyTorch-Hackathon

Sleeping

File size: 33,378 Bytes

# IMPORTANT: install unsloth + its zoo BEFORE anything else, and later import
# unsloth before transformers: unsloth patches torch/transformers at import
# time. If transformers is imported first, the patches don't apply and 4-bit
# LoRA training silently runs in a slow path.
%pip install -q --no-deps unsloth
%pip install -q unsloth_zoo
%pip install -q "openenv-core==0.2.3" "trl>=0.12,<2.0" "transformers>=4.45,<5.0" \
    "datasets>=3.0" "accelerate>=1.0" "huggingface_hub>=0.25" "pydantic>=2.0" \
    wandb matplotlib python-dotenv bitsandbytes scipy scikit-learn sentence-transformers
import getpass
import os, pathlib

# Credentials are gathered from several sources. Values already present in
# os.environ always win (setdefault / override=False); Colab Secrets are read
# before the .env file, and an interactive prompt is the last resort.

# Colab Secrets first
try:
    from google.colab import userdata  # type: ignore
    for k in ('HF_TOKEN', 'WANDB_API_KEY', 'ENV_BASE_URL', 'ADAPTER_REPO'):
        try:
            v = userdata.get(k)
            if v:
                os.environ.setdefault(k, v)
        except Exception:
            # Best effort: a secret that is missing or not shared with this
            # notebook must not stop the remaining keys from being read.
            pass
except Exception:
    # Best effort: not running inside Colab (or Secrets unavailable).
    pass

# .env fallback for local runs
try:
    from dotenv import load_dotenv
    for p in [pathlib.Path('.env'), pathlib.Path('../.env'),
              pathlib.Path('/content/repo/.env')]:
        if p.exists():
            load_dotenv(p, override=False)
            print(f'Loaded env from {p.resolve()}')
            break
except Exception as e:
    # Still best effort, but say why so a missing token is easier to debug.
    print(f'WARN: .env loading skipped ({e!r})')

# Prompt with getpass rather than input(): the typed secret is not echoed, so
# it does not end up in the saved notebook output.
if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass.getpass('HF token: ').strip()
if not os.environ.get('WANDB_API_KEY'):
    os.environ['WANDB_API_KEY'] = getpass.getpass('WandB key (or blank to skip): ').strip()

from huggingface_hub import login as hf_login
hf_login(token=os.environ['HF_TOKEN'], add_to_git_credential=False)
print('HF auth ok.')
# W&B is optional: a blank key skips the login entirely.
if os.environ.get('WANDB_API_KEY'):
    import wandb
    wandb.login(key=os.environ['WANDB_API_KEY'])
    print('W&B auth ok.')
import os, pathlib

# Pick the run directory: Google Drive when a /content directory exists
# (treated as "running in Colab"), otherwise a folder next to the notebook.
IN_COLAB = os.path.isdir('/content')
if not IN_COLAB:
    DRIVE_DIR = pathlib.Path('./BoardSim_Run')
else:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    DRIVE_DIR = pathlib.Path('/content/drive/MyDrive/BoardSim_Run')
DRIVE_DIR.mkdir(parents=True, exist_ok=True)

# Sub-folders for plots and for the LoRA checkpoint.
ASSETS = DRIVE_DIR / 'assets'
ASSETS.mkdir(exist_ok=True)
CKPT = DRIVE_DIR / 'lora_qwen3_4b'
CKPT.mkdir(exist_ok=True)
print('DRIVE_DIR =', DRIVE_DIR)
import os, sys, subprocess, importlib, urllib.request, json as _json

# Base URL of the hosted BoardSim environment; ENV_BASE_URL overrides the default.
ENV_BASE_URL = os.environ.get('ENV_BASE_URL',
    'https://stavankhobare-sst-metaxpytorch-hackathon.hf.space')
REPO_URL = 'https://github.com/StavanRKhobare/SST-MetaxPyTorch-Hackathon'

# Shallow-clone the repo on first run. On later runs attempt a fast-forward
# pull; check=False means a failed pull is tolerated and the existing checkout
# is used as-is.
REPO_DIR = '/content/repo' if IN_COLAB else os.path.abspath('./repo')
if not os.path.isdir(os.path.join(REPO_DIR, '.git')):
    subprocess.run(['git', 'clone', '--depth', '1', REPO_URL, REPO_DIR], check=True)
else:
    subprocess.run(['git', '-C', REPO_DIR, 'pull', '--ff-only'], check=False)

# Make <repo>/envs importable so `board_sim_env` resolves to the checkout.
ENVS_DIR = os.path.join(REPO_DIR, 'envs')
if ENVS_DIR not in sys.path:
    sys.path.insert(0, ENVS_DIR)

# Drop any already-imported board_sim_env modules so the imports below pick up
# the code that is on disk now (relevant when this cell is re-run).
for mod in [m for m in list(sys.modules) if m == 'board_sim_env' or m.startswith('board_sim_env.')]:
    del sys.modules[mod]

from board_sim_env.client import BoardSimEnv
from board_sim_env.models import BoardSimAction, BoardSimObservation

# Connectivity probe only: a failure is reported as a warning, not raised.
try:
    with urllib.request.urlopen(f'{ENV_BASE_URL.rstrip("/")}/health', timeout=20) as r:
        h = _json.loads(r.read())
        print('health:', h)
except Exception as e:
    print(f'WARN: could not reach {ENV_BASE_URL}/health  ({e})')

def make_env():
    """Return a new BoardSimEnv client pointed at ENV_BASE_URL."""
    return BoardSimEnv(base_url=ENV_BASE_URL)

print('BoardSimEnv ready.')
# -----------------------------------------------------------------------------
# Load the base model in 4-bit through unsloth. unsloth is imported first
# because it patches torch/transformers at import time (see the install note).
# -----------------------------------------------------------------------------
import unsloth  # noqa: F401
from unsloth import FastLanguageModel
import torch
import re

MODEL_NAME  = 'Qwen/Qwen3-0.6B'
MAX_SEQ_LEN = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    load_in_4bit=True,
    dtype=None,
)
# Batched generation needs a pad token; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = next(model.parameters()).device
print(f'Loaded {MODEL_NAME} on {device}.')
if torch.cuda.is_available():
    mem_gb = torch.cuda.memory_allocated() / 1e9
    # Ask the driver for the card's real capacity instead of assuming one GPU model.
    total_gb = torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory / 1e9
    print(f'GPU memory after base load: {mem_gb:.2f} GB / {total_gb:.2f} GB')
    print(f'Headroom for compute:       {total_gb - mem_gb:.2f} GB')
else:
    mem_gb = 0.0
    print('GPU memory after base load: n/a (CUDA not available)')
# Generic CEO prompt — applies to any organization, not a specific industry.
# It is prepended verbatim to every observation by build_prompt(), and the
# two-line DECISION/PITCH reply format it requests is what DECISION_RE and
# PITCH_RE extract in parse_completion().
SYSTEM_PROMPT = """You are the CEO of a mid-stage organization. Your board has 4 members with HIDDEN AGENDAS you cannot see directly:
  - CTO: cares about operational excellence, engineering quality, team morale, and product readiness.
  - CFO: cares about cash discipline, runway, and regulatory safety.
  - Investor Rep: pushes growth, market share, and bold returns.
  - Independent: cares about reputation, governance, and long-term consensus.

Each round you see a strategic event, every NPC's pre-vote statement, and 3 options.
Your decision is resolved by WEIGHTED VOTE (your weight 2.5x). A short COALITION PITCH
that is semantically aligned with opposing members' priorities can swing them toward your pick —
write substantive arguments, not just buzzwords.

Respond in EXACTLY this format on two lines:
DECISION: <one of the option strings>
PITCH: <one or two sentences arguing for it, addressing the concerns of opposing members>"""

# Tag extractors for the two-line reply format. DECISION captures letters,
# digits, '_', '-' and spaces (so it stops at a newline); PITCH captures the
# remainder of its line.
DECISION_RE = re.compile(r'DECISION\s*:\s*([A-Za-z0-9_\- ]+)', re.IGNORECASE)
PITCH_RE    = re.compile(r'PITCH\s*:\s*(.+)', re.IGNORECASE)

def build_prompt(obs):
    """Render the full text prompt for one observation.

    The prompt is SYSTEM_PROMPT followed by the numeric company state, the
    event, one line per NPC statement (role, confidence, vote, statement) and
    the list of options.
    """
    statements = '\n'.join(
        f"  {s['role']} ({s['confidence']:.2f}): votes {s['vote']} - {s['statement']}"
        for s in obs.npc_statements
    )
    return (
        f"{SYSTEM_PROMPT}\n\n"
        f"State: revenue=${obs.state['revenue']:.0f}/yr  burn=${obs.state['burn_rate']:.0f}/mo  "
        f"runway={obs.state['runway_months']:.1f}mo  morale={obs.state['team_morale']:.2f}  "
        f"investors={obs.state['investor_confidence']:.2f}  reg_risk={obs.state['regulatory_risk']:.2f}\n"
        f"Event: {obs.event}\nBoard:\n{statements}\n"
        f"Options: {obs.options}\n"
    )

def _match_option(text, options):
    """Return the option that `text` names (case-insensitive), or None.

    An exact match wins. Otherwise the first option (in `options` order) that
    occurs inside `text` is returned, skipping any option that is merely a
    substring of another option also found in `text` -- so 'expand' does not
    steal a reply that says 'expand_aggressively'.
    """
    haystack = text.lower()
    lowered = [(opt, opt.lower()) for opt in options]
    for opt, low in lowered:
        if low == haystack:
            return opt
    hits = [(opt, low) for opt, low in lowered if low in haystack]
    for opt, low in hits:
        shadowed = any(low != other and low in other for _, other in hits)
        if not shadowed:
            return opt
    return None

def parse_completion(completion: str, options):
    """Extract the agent's action from a raw model completion.

    Args:
        completion: text generated by the model.
        options: non-empty sequence of legal option strings.

    Returns:
        (decision, pitch, format_ok). `decision` is always one of `options`:
        the option named after the DECISION tag, else an option mentioned
        anywhere in the completion, else `options[0]`. `pitch` is the text
        after the PITCH tag capped at 400 characters ('' when absent).
        `format_ok` is True only if BOTH tags were found.

    Raises:
        IndexError: if `options` is empty and no option can be chosen.
    """
    dm = DECISION_RE.search(completion)
    decision = _match_option(dm.group(1).strip(), options) if dm else None
    if decision is None:
        # Tag missing or naming no known option: scan the whole completion.
        decision = _match_option(completion, options)
    if decision is None:
        decision = options[0]
    pm = PITCH_RE.search(completion)
    pitch = pm.group(1).strip()[:400] if pm else ''
    format_ok = bool(dm) and bool(pm)
    return decision, pitch, format_ok

MAX_NEW_TOKENS = 80

def greedy_action(obs):
    """Greedy-decode one reply for `obs` and parse it.

    Returns the (decision, pitch, format_ok) triple from parse_completion().
    """
    inputs = tokenizer(
        build_prompt(obs), return_tensors='pt', truncation=True, max_length=1024,
    ).to(device)
    n_prompt_tokens = inputs.input_ids.shape[1]
    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Keep only the newly generated tokens; the prompt is echoed at the front.
    reply_ids = generated[0][n_prompt_tokens:]
    reply = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return parse_completion(reply, obs.options)
import random, statistics, json

MAX_STEPS_PER_EP = 20

def run_episode(env, seed):
    """Play one episode greedily with whatever model state is currently active.

    The episode stops when the env reports `done` or after MAX_STEPS_PER_EP
    steps. Returns a dict with the final profitability score, the summed
    reward, the step count, the per-step format/pitch rates and the env's
    round history.
    """
    result = env.reset(seed=seed)
    obs = result.observation
    total_reward = 0.0
    steps_taken = 0
    well_formed = 0
    with_pitch = 0
    while not result.done and steps_taken < MAX_STEPS_PER_EP:
        decision, pitch, fmt_ok = greedy_action(obs)
        well_formed += 1 if fmt_ok else 0
        with_pitch += 1 if pitch.strip() else 0
        action = BoardSimAction(decision=decision, coalition_pitch=pitch)
        result = env.step(action)
        obs = result.observation
        total_reward += float(result.reward or 0.0)
        steps_taken += 1
    denom = max(1, steps_taken)  # avoid 0/0 when the episode is done at reset
    return {
        'final_profit': obs.state['profitability_score'],
        'ep_reward': total_reward,
        'steps': steps_taken,
        'format_rate': well_formed / denom,
        'pitch_rate': with_pitch / denom,
        'history': obs.state.get('history', []),
    }
# -----------------------------------------------------------------------------

# BASELINE — base Qwen3-0.6B (no fine-tuning).
# This is the apples-to-apples reference for measuring what fine-tuning buys
# us. Random policies are not a competitive baseline for a 0.6 B language model
# choosing among 3 well-formed strings.
# -----------------------------------------------------------------------------
BASELINE_SEEDS = list(range(50_000, 50_000 + 100))   # held out from training

# One list per metric, index-aligned with BASELINE_SEEDS.
base_finals, base_rewards, base_fmts, base_pitches = [], [], [], []
with make_env().sync() as env:
    for i, s in enumerate(BASELINE_SEEDS):
        r = run_episode(env, s)
        base_finals.append(r['final_profit'])
        base_rewards.append(r['ep_reward'])
        base_fmts.append(r['format_rate'])
        base_pitches.append(r['pitch_rate'])
        if (i + 1) % 10 == 0:
            print(f'  base Qwen3-0.6B {i+1}/{len(BASELINE_SEEDS)}  profit={r["final_profit"]:.1f}')

# These two means are drawn as reference lines on the training plots.
BASELINE_MEAN_PROFIT = statistics.mean(base_finals)
BASELINE_MEAN_REWARD = statistics.mean(base_rewards)
print(f'Base Qwen3-0.6B profit  : {BASELINE_MEAN_PROFIT:.2f} \u00b1 {statistics.stdev(base_finals):.2f}')
print(f'Base Qwen3-0.6B ep rwd  : {BASELINE_MEAN_REWARD:.2f} \u00b1 {statistics.stdev(base_rewards):.2f}')
print(f'Base format rate      : {statistics.mean(base_fmts):.0%}   pitch rate: {statistics.mean(base_pitches):.0%}')

# Persist the raw per-seed numbers alongside the other run artifacts.
with open(DRIVE_DIR / 'baseline.json', 'w') as f:
    json.dump({'model': MODEL_NAME, 'mode': 'base_no_finetune',
               'seeds': BASELINE_SEEDS,
               'finals': base_finals, 'rewards': base_rewards,
               'format_rates': base_fmts, 'pitch_rates': base_pitches}, f)
# -----------------------------------------------------------------------------
# Wrap base model with LoRA adapters. From here onward `model` is a PEFT
# model; the base behaviour is recoverable any time via
# `with model.disable_adapter(): ...`.
# -----------------------------------------------------------------------------
# Rank-32 adapters (alpha 64, no dropout, no bias terms) on the projection
# modules listed in target_modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],
    lora_alpha=64,
    lora_dropout=0.0, bias='none',
    use_gradient_checkpointing='unsloth',
    random_state=3407,
)

# Report how many parameters have requires_grad set versus the total.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total     = sum(p.numel() for p in model.parameters())
print(f'Trainable params: {trainable:,} / {total:,}  ({100*trainable/total:.2f}%)')
EVAL_SEEDS = list(range(60_000, 60_000 + 10))   # held out from training

def periodic_eval(env):
    """Run one greedy episode per EVAL_SEEDS entry and return the metric means.

    Returns a dict with float entries 'profit_mean', 'reward_mean',
    'format_rate' and 'pitch_rate'.
    """
    import numpy as np

    episodes = [run_episode(env, seed) for seed in EVAL_SEEDS]

    def _mean_of(key):
        return float(np.mean([ep[key] for ep in episodes]))

    return {
        'profit_mean': _mean_of('final_profit'),
        'reward_mean': _mean_of('ep_reward'),
        'format_rate': _mean_of('format_rate'),
        'pitch_rate':  _mean_of('pitch_rate'),
    }
import os, json, math, time, collections
from torch.optim import AdamW

# ---- Hyper-parameters -------------------------------------------------------
NUM_STEPS  = int(os.environ.get('NUM_STEPS', 200))   # optimizer steps, one env seed each
GROUP_SIZE = int(os.environ.get('GROUP_SIZE', 4))    # completions sampled per prompt
LR         = 5e-6
GRAD_CLIP  = 1.0
TEMPERATURE, TOP_P = 1.0, 0.95
SAVE_EVERY = 25
EVAL_AT    = {0, 25, 50, 100, 150, NUM_STEPS - 1}    # steps that trigger periodic_eval

# ---- Optional W&B run -------------------------------------------------------
WANDB_OK = False
if os.environ.get('WANDB_API_KEY'):
    _wandb_kwargs = dict(
        project='boardsim-qwen3-grpo', name='boardsim-qwen3-grpo-v3',
        config={'num_steps': NUM_STEPS, 'group_size': GROUP_SIZE, 'lr': LR,
                'temperature': TEMPERATURE, 'top_p': TOP_P, 'model': MODEL_NAME},
    )
    try:
        import wandb
        try:
            wandb.init(**_wandb_kwargs, finish_previous=True)
        except TypeError:
            # wandb versions without the `finish_previous` keyword: use `reinit`.
            wandb.init(**_wandb_kwargs, reinit=True)
        WANDB_OK = True
    except Exception as e:
        print(f'WARN: wandb.init failed: {e}')

optimizer = AdamW([p for p in model.parameters() if p.requires_grad],
                  lr=LR, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0)

log_history = []
eval_history = []
decision_counter = collections.Counter()
t0 = time.time()


def _stop_token_ids():
    """Return every token id that can end generation (tokenizer + generation config)."""
    ids = set()
    gen_cfg = getattr(model, 'generation_config', None)
    for source in (tokenizer.eos_token_id, getattr(gen_cfg, 'eos_token_id', None)):
        if source is None:
            continue
        if isinstance(source, (list, tuple, set)):
            ids.update(int(t) for t in source)
        else:
            ids.add(int(source))
    return sorted(ids)


def _completion_mask(full_ids, prompt_len, stop_ids):
    """Return a 0/1 mask (same shape as `full_ids`) selecting real completion tokens.

    Prompt positions are 0. Completion positions are 1 up to and including the
    first stop token of each row; everything after it is padding and gets 0.
    """
    completion = full_ids[:, prompt_len:]
    stops = torch.as_tensor(stop_ids, dtype=full_ids.dtype, device=full_ids.device)
    is_stop = torch.isin(completion, stops).long()
    # Number of stop tokens strictly before each position.
    stops_before = is_stop.cumsum(dim=1) - is_stop
    mask = torch.zeros_like(full_ids)
    mask[:, prompt_len:] = (stops_before == 0).long()
    return mask


def _save_progress():
    """Write the adapter, tokenizer and both history logs to the run directory."""
    model.save_pretrained(str(CKPT))
    tokenizer.save_pretrained(str(CKPT))
    with open(DRIVE_DIR / 'log_history.json', 'w') as fh:
        json.dump(log_history, fh)
    with open(DRIVE_DIR / 'eval_history.json', 'w') as fh:
        json.dump(eval_history, fh)


STOP_IDS = _stop_token_ids()

# ONE persistent env per role for the whole training loop.
with make_env().sync() as env_train, make_env().sync() as env_score, make_env().sync() as env_eval:
    for step in range(NUM_STEPS):
        # The training seed is the step index; only the first observation of
        # each episode is used as the prompt.
        result = env_train.reset(seed=step)
        obs = result.observation
        prompt = build_prompt(obs)
        enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)
        prompt_len = enc.input_ids.shape[1]

        # Sample GROUP_SIZE completions for the same prompt.
        with torch.no_grad():
            gen_out = model.generate(
                input_ids=enc.input_ids, attention_mask=enc.attention_mask,
                max_new_tokens=MAX_NEW_TOKENS, do_sample=True,
                temperature=TEMPERATURE, top_p=TOP_P,
                num_return_sequences=GROUP_SIZE,
                pad_token_id=tokenizer.eos_token_id,
            )
        gen_out = gen_out.detach().clone()

        # Score each completion with a single env step from the same seed.
        decisions, pitches, rewards, fmt_oks = [], [], [], []
        for g in range(GROUP_SIZE):
            comp = tokenizer.decode(gen_out[g][prompt_len:], skip_special_tokens=True)
            d, pp, ok = parse_completion(comp, obs.options)
            decisions.append(d); pitches.append(pp); fmt_oks.append(ok)
            decision_counter[d] += 1
            env_score.reset(seed=step)
            sr = env_score.step(BoardSimAction(decision=d, coalition_pitch=pp))
            rewards.append(float(sr.reward or 0.0))

        # Group-relative advantages; when all rewards are (nearly) equal they
        # are only mean-centred, which makes them ~0 and the step a no-op.
        rewards_t = torch.tensor(rewards, dtype=torch.float32, device=device)
        if rewards_t.numel() > 1 and rewards_t.std().item() > 1e-6:
            advantages = (rewards_t - rewards_t.mean()) / (rewards_t.std() + 1e-8)
        else:
            advantages = rewards_t - rewards_t.mean()

        optimizer.zero_grad()
        full_ids = gen_out
        # generate() pads finished rows with eos_token_id (passed above as
        # pad_token_id), which need not equal tokenizer.pad_token_id. Comparing
        # against pad_token_id would either train on that padding or drop the
        # genuine EOS from the loss, so the mask is derived from the position
        # of the first stop token instead.
        loss_mask = _completion_mask(full_ids, prompt_len, STOP_IDS)
        attn = loss_mask.clone()
        attn[:, :prompt_len] = enc.attention_mask   # (1, P) broadcasts over the group
        out = model(input_ids=full_ids, attention_mask=attn)
        # Shift by one: logits at position t predict token t+1.
        logits  = out.logits[:, :-1, :].float()
        targets = full_ids[:, 1:]
        mask    = loss_mask[:, 1:].float()
        log_probs   = torch.nn.functional.log_softmax(logits, dim=-1)
        token_nll   = -log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)
        per_seq_nll = (token_nll * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        # Minimising advantage * NLL raises the likelihood of above-average
        # completions and lowers it for below-average ones.
        loss = (advantages.detach() * per_seq_nll).mean()
        loss.backward()
        total_loss_val = float(loss.detach().item())
        torch.nn.utils.clip_grad_norm_(
            [p for p in model.parameters() if p.requires_grad], GRAD_CLIP)
        optimizer.step()

        rec = {
            'step': step,
            'reward':     float(rewards_t.mean().item()),
            'reward_std': float(rewards_t.std().item()) if rewards_t.numel() > 1 else 0.0,
            'reward_max': float(rewards_t.max().item()),
            'loss':        total_loss_val,
            'format_rate': sum(fmt_oks) / GROUP_SIZE,
            'pitch_rate':  sum(1 for p in pitches if p.strip()) / GROUP_SIZE,
            'elapsed_s':   time.time() - t0,
        }
        log_history.append(rec)
        if WANDB_OK:
            wandb.log(rec, step=step)

        if step % 5 == 0:
            print(f"step={step:4d}  reward={rec['reward']:+.3f} (\u00b1{rec['reward_std']:.2f})  "
                  f"loss={rec['loss']:+.4f}  fmt={rec['format_rate']:.0%}  "
                  f"elapsed={rec['elapsed_s']:.0f}s  d0={decisions[0]}")

        if step in EVAL_AT:
            ev = periodic_eval(env_eval)
            ev['step'] = step
            eval_history.append(ev)
            print(f"  [eval@{step}] profit={ev['profit_mean']:.2f}  "
                  f"reward={ev['reward_mean']:.2f}  fmt={ev['format_rate']:.0%}")
            if WANDB_OK:
                wandb.log({f'eval/{k}': v for k, v in ev.items() if k != 'step'}, step=step)

        if step > 0 and step % SAVE_EVERY == 0:
            _save_progress()

# Final save after the last step, plus the sampled-decision histogram.
_save_progress()
with open(DRIVE_DIR / 'decision_counter.json', 'w') as f:
    json.dump(dict(decision_counter), f)
if WANDB_OK:
    wandb.finish()
print(f'Training done. {len(log_history)} steps in {time.time() - t0:.0f}s. -> {CKPT}')
import numpy as np, matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy import stats as spstats

# Turn the per-step training log into one NumPy array per metric.
steps, rewards, losses, fmts, pitches = (
    np.array([entry[key] for entry in log_history])
    for key in ('step', 'reward', 'loss', 'format_rate', 'pitch_rate')
)

def ema(xs, alpha=0.1):
    """Exponential moving average of `xs`, seeded with its first sample.

    Each output is alpha * sample + (1 - alpha) * previous. An empty input
    yields an empty array.
    """
    if len(xs):
        state = xs[0]
    else:
        state = 0.0
    smoothed = []
    for sample in xs:
        state = alpha * sample + (1 - alpha) * state
        smoothed.append(state)
    return np.array(smoothed)

# Smoothed reward and a least-squares trend line over the training steps.
rewards_ema = ema(rewards, 0.1)
slope, intercept, r_val, p_val, _ = spstats.linregress(steps, rewards)

# Reward curve — vs base Qwen3-0.6B baseline (NOT random).
plt.figure(figsize=(9, 5))
plt.plot(steps, rewards, alpha=0.3, lw=1, label='per-step group reward')
plt.plot(steps, rewards_ema, lw=2.2, label='EMA (\u03b1=0.1)')
plt.plot(steps, intercept + slope * steps, '--', lw=1.5,
         label=f'linear fit slope={slope:+.4f}/step  (p={p_val:.1e})')
plt.axhline(BASELINE_MEAN_REWARD, ls=':', lw=2, color='#c44',
            label=f'base Qwen3-0.6B baseline = {BASELINE_MEAN_REWARD:.2f}')
plt.title('GRPO reward — BoardSim (vs same model w/o fine-tuning)')
plt.xlabel('step'); plt.ylabel('mean group reward')
plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()
plt.savefig(ASSETS / 'reward_curve.png', dpi=150); plt.close()

# Loss
plt.figure(figsize=(9, 5))
plt.plot(steps, losses, lw=1.5)
plt.title('GRPO loss (advantage \u00d7 NLL)'); plt.xlabel('step'); plt.ylabel('loss')
plt.grid(alpha=0.3); plt.tight_layout()
plt.savefig(ASSETS / 'loss_curve.png', dpi=150); plt.close()

# Format compliance + pitch rate
plt.figure(figsize=(9, 5))
plt.plot(steps, ema(fmts, 0.05),    lw=2, label='format-OK rate (EMA)')
plt.plot(steps, ema(pitches, 0.05), lw=2, label='non-empty pitch rate (EMA)')
plt.title('Format compliance + pitch usage during training')
plt.xlabel('step'); plt.ylabel('rate'); plt.ylim(-0.05, 1.05)
plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()
plt.savefig(ASSETS / 'format_compliance.png', dpi=150); plt.close()

# Periodic eval — overlaid against base Qwen3-0.6B baseline so the reader
# can see the LoRA-trained policy progressively pull away from the base
# model on held-out seeds.
# Skipped entirely when no eval step was reached (eval_history is empty).
if eval_history:
    es  = [e['step']        for e in eval_history]
    epm = [e['profit_mean'] for e in eval_history]
    erm = [e['reward_mean'] for e in eval_history]
    plt.figure(figsize=(9, 5))
    plt.plot(es, epm, '-o', lw=2, label='held-out profitability (mean of 10 episodes)')
    plt.plot(es, erm, '-s', lw=2, label='held-out episode reward')
    plt.axhline(BASELINE_MEAN_PROFIT, ls=':', lw=1.5, color='#c44',
                label=f'base Qwen3-0.6B profitability = {BASELINE_MEAN_PROFIT:.2f}')
    plt.title('Periodic held-out eval during training (greedy)')
    plt.xlabel('training step'); plt.ylabel('value')
    plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()
    plt.savefig(ASSETS / 'periodic_eval.png', dpi=150); plt.close()

print(f'Linear-fit slope on reward: {slope:+.5f}/step (p={p_val:.2e}, R\u00b2={r_val**2:.3f})')
print('Saved reward_curve.png, loss_curve.png, format_compliance.png, periodic_eval.png')
# -----------------------------------------------------------------------------
# Paired same-seed eval: fine-tuned vs BASE Qwen3-0.6B (adapters disabled).
# This is the headline comparison. Same prompts, same env seeds, same
# decoder, same parser — only the LoRA delta differs.
# -----------------------------------------------------------------------------
from unsloth import FastLanguageModel
FastLanguageModel.for_inference(model)

EVAL_N = 50
PAIRED_SEEDS = list(range(70_000, 70_000 + EVAL_N))

# Trained policy (adapters active)
# All result lists below are index-aligned with PAIRED_SEEDS.
trained_finals, trained_rewards, trained_fmt, trained_pitch = [], [], [], []
trained_history_per_seed = []
with make_env().sync() as env:
    for i, s in enumerate(PAIRED_SEEDS):
        r = run_episode(env, s)
        trained_finals.append(r['final_profit'])
        trained_rewards.append(r['ep_reward'])
        trained_fmt.append(r['format_rate'])
        trained_pitch.append(r['pitch_rate'])
        trained_history_per_seed.append(r['history'])
        if (i + 1) % 10 == 0:
            print(f'  trained {i+1}/{EVAL_N}  profit={r["final_profit"]:.1f}')

# Base Qwen3-0.6B (LoRA disabled) — paired seeds.
# The whole loop runs inside model.disable_adapter().
base_finals_paired, base_rewards_paired, base_fmt_paired, base_pitch_paired = [], [], [], []
base_history_per_seed = []
with make_env().sync() as env, model.disable_adapter():
    for i, s in enumerate(PAIRED_SEEDS):
        r = run_episode(env, s)
        base_finals_paired.append(r['final_profit'])
        base_rewards_paired.append(r['ep_reward'])
        base_fmt_paired.append(r['format_rate'])
        base_pitch_paired.append(r['pitch_rate'])
        base_history_per_seed.append(r['history'])
        if (i + 1) % 10 == 0:
            print(f'  base    {i+1}/{EVAL_N}  profit={r["final_profit"]:.1f}')

# tf/bf = final profitability, tr/br = episode reward (trained / base).
tf, bf = np.array(trained_finals), np.array(base_finals_paired)
tr, br = np.array(trained_rewards), np.array(base_rewards_paired)

# NOTE: ndarray.std() here is the population std (ddof=0), whereas the
# baseline cell prints statistics.stdev (sample std).
print(f'\nTrained Qwen3-0.6B profit : {tf.mean():.2f} \u00b1 {tf.std():.2f}')
print(f'Base    Qwen3-0.6B profit : {bf.mean():.2f} \u00b1 {bf.std():.2f}')
print(f'Trained ep reward       : {tr.mean():.2f} \u00b1 {tr.std():.2f}')
print(f'Base    ep reward       : {br.mean():.2f} \u00b1 {br.std():.2f}')
print(f'Trained format/pitch    : {np.mean(trained_fmt):.0%} / {np.mean(trained_pitch):.0%}')
print(f'Base    format/pitch    : {np.mean(base_fmt_paired):.0%} / {np.mean(base_pitch_paired):.0%}')

with open(DRIVE_DIR / 'eval_paired.json', 'w') as f:
    json.dump({'seeds': PAIRED_SEEDS,
               'trained_finals': tf.tolist(), 'base_finals': bf.tolist(),
               'trained_rewards': tr.tolist(), 'base_rewards': br.tolist(),
               'trained_format_rate': float(np.mean(trained_fmt)),
               'base_format_rate':    float(np.mean(base_fmt_paired)),
               'trained_pitch_rate':  float(np.mean(trained_pitch)),
               'base_pitch_rate':     float(np.mean(base_pitch_paired))}, f)
from scipy import stats as spstats

def cohen_d(a, b):
    """Cohen's d of `a` vs `b`: mean difference over the root-mean of the two sample variances.

    A 1e-12 term in the denominator keeps the result finite when both inputs
    have zero variance.
    """
    var_a = a.std(ddof=1) ** 2
    var_b = b.std(ddof=1) ** 2
    pooled_sd = np.sqrt((var_a + var_b) / 2)
    mean_gap = a.mean() - b.mean()
    return mean_gap / (pooled_sd + 1e-12)

def bootstrap_diff_ci(a, b, n=10_000, seed=0):
    """95% percentile-bootstrap interval for the mean of the paired differences a - b.

    Draws `n` resamples (with replacement, same size as the input) from the
    per-pair differences using a generator seeded with `seed`, and returns the
    2.5th and 97.5th percentiles of the resample means as a (low, high) pair
    of floats.
    """
    paired = a - b
    generator = np.random.default_rng(seed)
    resamples = generator.choice(paired, size=(n, len(paired)), replace=True)
    resample_means = resamples.mean(axis=1)
    lower = np.percentile(resample_means, 2.5)
    upper = np.percentile(resample_means, 97.5)
    return float(lower), float(upper)

# Significance tests on final profitability, trained (tf) vs base (bf), on the
# same seeds.
paired_diffs = tf - bf
tt   = spstats.ttest_rel(tf, bf)
uu   = spstats.mannwhitneyu(tf, bf, alternative='greater')
if np.any(paired_diffs != 0):
    wilc = spstats.wilcoxon(tf, bf, alternative='greater')
    wilcoxon_p = float(wilc.pvalue)
else:
    # Every seed tied (e.g. the adapter did not change any greedy decision).
    # The signed-rank test is undefined in that case and some SciPy versions
    # raise, so report NaN instead of crashing.
    wilc = None
    wilcoxon_p = float('nan')
d    = cohen_d(tf, bf)
lo, hi = bootstrap_diff_ci(tf, bf)
win_rate = float((tf > bf).mean())
tie_rate = float((tf == bf).mean())

summary = {
    'baseline_model': MODEL_NAME + ' (no fine-tune)',
    'trained_model':  MODEL_NAME + ' + LoRA r=32',
    'n': len(tf),
    'paired_t_stat': float(tt.statistic), 'paired_t_p': float(tt.pvalue),
    'mannwhitney_U': float(uu.statistic), 'mannwhitney_p_greater': float(uu.pvalue),
    'wilcoxon_p_greater': wilcoxon_p,
    'cohens_d': float(d),
    'paired_diff_mean': float(paired_diffs.mean()),
    'paired_diff_95ci': [lo, hi],
    'win_rate_trained_strictly_better': win_rate,
    'tie_rate': tie_rate,
}
print(json.dumps(summary, indent=2))
with open(DRIVE_DIR / 'stats_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)
# Histogram — fine-tuned vs BASE on the same seeds.
# Both histograms share the same 0-100 bin edges; dashed lines mark the means.
bins = np.linspace(0, 100, 25)
plt.figure(figsize=(9, 5))
plt.hist(bf, bins=bins, alpha=0.55, color='#c44',
         label=f'Base Qwen3-0.6B (mean={bf.mean():.1f})')
plt.hist(tf, bins=bins, alpha=0.55, color='#1d6fff',
         label=f'Fine-tuned Qwen3-0.6B (mean={tf.mean():.1f})')
plt.axvline(bf.mean(), color='#c44', ls='--', lw=1.5)
plt.axvline(tf.mean(), color='#1d6fff', ls='--', lw=1.5)
plt.title(f'Final profitability — paired same-seed (n={len(tf)})  '
          f"d={summary['cohens_d']:+.2f}  win-rate={summary['win_rate_trained_strictly_better']:.0%}")
plt.xlabel('profitability score (0\u2013100)'); plt.ylabel('episodes')
plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()
plt.savefig(ASSETS / 'before_after.png', dpi=150); plt.close()

# Per-seed lift, sorted ascending: blue bars are seeds where the fine-tuned
# policy scored strictly higher, red bars are ties and losses.
diffs = tf - bf
order = np.argsort(diffs)
plt.figure(figsize=(9, 5))
plt.bar(range(len(diffs)), diffs[order],
        color=['#1d6fff' if x > 0 else '#c44' for x in diffs[order]])
plt.axhline(0, color='k', lw=0.8)
plt.title(f'Per-seed lift (fine-tuned \u2212 base Qwen3-0.6B), sorted  '
          f'mean lift = {diffs.mean():+.1f}  CI=[{summary["paired_diff_95ci"][0]:+.1f}, {summary["paired_diff_95ci"][1]:+.1f}]')
plt.xlabel('seed (sorted by lift)'); plt.ylabel('\u0394 profitability')
plt.grid(alpha=0.3); plt.tight_layout()
plt.savefig(ASSETS / 'paired_delta.png', dpi=150); plt.close()
print('Saved before_after.png, paired_delta.png')
# -----------------------------------------------------------------------------
# Per-event win-rate breakdown — for each of the 10 generic events, how often
# did the fine-tuned policy win the boardroom vote vs base Qwen3-0.6B?
# This is the most direct picture of WHERE the fine-tuning helps.
# -----------------------------------------------------------------------------
def per_event_winrate(history_per_seed):
    """Return {event title: fraction of rounds with a truthy 'agent_won_vote'}.

    `history_per_seed` is an iterable of episode histories, each an iterable of
    round dicts. Rounds lacking 'event_title' are grouped under '?'.
    """
    wins = collections.Counter()
    totals = collections.Counter()
    for rounds in history_per_seed:
        for rnd in rounds:
            title = rnd.get('event_title', '?')
            totals[title] += 1
            if rnd.get('agent_won_vote'):
                wins[title] += 1
    return {title: wins[title] / max(1, count) for title, count in totals.items()}

# Per-event boardroom win rate for both policies on the paired seeds.
trained_wr = per_event_winrate(trained_history_per_seed)
base_wr    = per_event_winrate(base_history_per_seed)

# Union of titles so both policies are plotted on the same x axis; an event a
# policy never encountered is drawn as 0.0.
events_sorted = sorted(set(trained_wr) | set(base_wr))
tw = [trained_wr.get(e, 0.0) for e in events_sorted]
bw = [base_wr.get(e, 0.0)    for e in events_sorted]

plt.figure(figsize=(11, 5))
x = np.arange(len(events_sorted))
plt.bar(x - 0.2, bw, width=0.4, color='#c44', label='Base Qwen3-0.6B')
plt.bar(x + 0.2, tw, width=0.4, color='#1d6fff', label='Fine-tuned Qwen3-0.6B')
plt.xticks(x, [e[:22] for e in events_sorted], rotation=30, ha='right')
plt.ylim(0, 1.05); plt.ylabel('boardroom win rate')
# Report the real episode count rather than a hard-coded 50, so the title
# stays correct when the number of paired seeds changes.
plt.title(f'Per-event boardroom win rate (paired seeds, n={len(trained_history_per_seed)} episodes)')
plt.legend(); plt.grid(alpha=0.3, axis='y'); plt.tight_layout()
plt.savefig(ASSETS / 'per_event_winrate.png', dpi=150); plt.close()

with open(DRIVE_DIR / 'per_event_winrate.json', 'w') as f:
    json.dump({'events': events_sorted, 'trained': tw, 'base': bw}, f, indent=2)
print('Saved per_event_winrate.png')
# -----------------------------------------------------------------------------
# Theory-of-Mind probe — does the model identify which board member is most
# likely to oppose its decision? Run for BOTH base and fine-tuned for fair
# comparison, since "random=25%" is a weak reference for a 0.6 B LM.
# -----------------------------------------------------------------------------
# Inserted between SYSTEM_PROMPT and the observation text by tom_predict().
TOM_INSTRUCTION = (
    "\n\nGiven the state and event below, name the SINGLE board member "
    "(CTO, CFO, Investor Rep, or Independent) most likely to oppose the chosen decision. "
    "Answer with just the role name on one line.\n"
)

# Role keywords are matched at the start of a word only, so 'cto' inside
# "director" is not a hit. The lookup table maps each keyword to its role name.
_TOM_ROLE_RE = re.compile(r'\b(investor|independent|cto|cfo)')
_TOM_ROLE_NAMES = {
    'investor': 'Investor Rep',
    'independent': 'Independent',
    'cto': 'CTO',
    'cfo': 'CFO',
}

def tom_predict(obs, decision):
    """Ask the model which board member is most likely to oppose `decision`.

    The prompt is SYSTEM_PROMPT + TOM_INSTRUCTION + the observation text from
    build_prompt() + the chosen decision; up to 8 tokens are decoded greedily.

    Returns:
        'Investor Rep', 'Independent', 'CTO' or 'CFO' -- the role mentioned
        FIRST in the reply -- or None when no role is mentioned.
    """
    body = build_prompt(obs).split(SYSTEM_PROMPT, 1)[1]
    prompt = SYSTEM_PROMPT + TOM_INSTRUCTION + body + f'Chosen decision: {decision}\nMost likely opponent: '
    enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=8, do_sample=False,
                             pad_token_id=tokenizer.eos_token_id)
    txt = tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True).lower()
    # The first role mentioned is taken as the answer, so a reply such as
    # "cfo, not the investor" is scored as CFO.
    hit = _TOM_ROLE_RE.search(txt)
    return _TOM_ROLE_NAMES[hit.group(1)] if hit else None

def tom_eval(seed_base=80_000, n=40):
    """Score the opponent-prediction probe on the first round of `n` episodes.

    For seeds seed_base .. seed_base + n - 1 the model picks a decision
    greedily; if at least one NPC voted differently, tom_predict() is asked
    for the likely opponent and counts as correct when it names one of them.
    Episodes with no dissenting NPC are not scored.

    Returns:
        (correct, total) counts.
    """
    hits = 0
    scored = 0
    with make_env().sync() as env:
        for seed in range(seed_base, seed_base + n):
            obs = env.reset(seed=seed).observation
            decision = greedy_action(obs)[0]
            dissenters = [stmt['role'] for stmt in obs.npc_statements
                          if stmt['vote'] != decision]
            if dissenters:
                guess = tom_predict(obs, decision)
                if guess and guess in dissenters:
                    hits += 1
                scored += 1
    return hits, scored

# Run the probe twice over the same default seeds: adapters active, then disabled.
t_corr, t_tot = tom_eval()
with model.disable_adapter():
    b_corr, b_tot = tom_eval()

# max(1, ...) guards against division by zero when no episode was scored.
tom_acc        = t_corr / max(1, t_tot)
tom_acc_base   = b_corr / max(1, b_tot)
print(f'ToM probe: trained = {tom_acc:.1%} ({t_corr}/{t_tot})   base = {tom_acc_base:.1%} ({b_corr}/{b_tot})')
with open(DRIVE_DIR / 'tom.json', 'w') as f:
    json.dump({'trained': {'correct': t_corr, 'total': t_tot, 'accuracy': tom_acc},
               'base':    {'correct': b_corr, 'total': b_tot, 'accuracy': tom_acc_base}}, f)
from huggingface_hub import HfApi
# Destination repos; both names can be overridden through the environment.
ADAPTER_REPO = os.environ.get('ADAPTER_REPO', 'StavanKhobare/SST-MetaxPyTorch-Hackathon-LoRA')
MERGED_REPO  = os.environ.get('MERGED_REPO',  'StavanKhobare/SST-MetaxPyTorch-Hackathon-Merged16bit')

# Create both repos as public model repos; exist_ok makes re-runs harmless.
api = HfApi()
api.create_repo(ADAPTER_REPO, repo_type='model', private=False, exist_ok=True)
api.create_repo(MERGED_REPO,  repo_type='model', private=False, exist_ok=True)

# Each of the three uploads below is wrapped separately so one failure is
# printed and the remaining uploads are still attempted.

# 1) LoRA adapter (small, fast)
try:
    model.push_to_hub(ADAPTER_REPO, private=False)
    tokenizer.push_to_hub(ADAPTER_REPO, private=False)
    print(f'\u2713 LoRA pushed: https://huggingface.co/{ADAPTER_REPO}')
except Exception as e:
    print(f'LoRA push failed: {e!r}')

# 2) Merged 16-bit
try:
    model.push_to_hub_merged(MERGED_REPO, tokenizer, save_method='merged_16bit', private=False)
    print(f'\u2713 Merged 16-bit pushed: https://huggingface.co/{MERGED_REPO}')
except Exception as e:
    print(f'Merged push failed (you can retry): {e!r}')

# 3) Upload eval artifacts
# Files that were never written (no producer for transcripts.json is visible
# in this script) are skipped by the exists() check.
try:
    api.upload_folder(folder_path=str(ASSETS), repo_id=ADAPTER_REPO,
                      path_in_repo='assets', repo_type='model')
    for fname in ['log_history.json','eval_history.json','eval_paired.json',
                  'stats_summary.json','tom.json','transcripts.json',
                  'decision_counter.json','baseline.json',
                  'per_event_winrate.json']:
        fp = DRIVE_DIR / fname
        if fp.exists():
            api.upload_file(path_or_fileobj=str(fp), path_in_repo=fname,
                            repo_id=ADAPTER_REPO, repo_type='model')
    print(f'\u2713 Artifacts uploaded to https://huggingface.co/{ADAPTER_REPO}')
except Exception as e:
    print(f'Artifact upload failed: {e!r}')
import math

# Shannon entropy (bits) of the decisions sampled during training, taken from
# decision_counter. Nothing earlier in this script defines `entropy` or
# `max_ent`, so they are computed here; max_ent is the entropy of a uniform
# distribution over the distinct decisions that were observed.
_n_sampled = sum(decision_counter.values())
entropy = max(0.0, -sum((c / _n_sampled) * math.log2(c / _n_sampled)
                        for c in decision_counter.values() if c > 0))
max_ent = math.log2(len(decision_counter)) if len(decision_counter) > 1 else 0.0
# Only claim "not collapsed" when more than one distinct decision was sampled.
_collapse_note = 'not collapsed' if entropy > 0 else 'collapsed onto a single decision'

print('='*70)
# Banner shows the model that was actually trained (MODEL_NAME).
print(f'BOARDSIM \u00d7 {MODEL_NAME} \u2014 LEARNING EVIDENCE')
print('='*70)
print(f'Reward slope (linear fit) : {slope:+.5f}/step  (p={p_val:.2e})')
print(f'Reward EMA first 20 steps : {rewards_ema[:20].mean():+.3f}')
print(f'Reward EMA last 20 steps  : {rewards_ema[-20:].mean():+.3f}')
print(f'Format compliance start   : {fmts[:20].mean():.0%}')
print(f'Format compliance end     : {fmts[-20:].mean():.0%}')
print('-'*70)
print(f'Held-out paired (n={len(tf)}):  fine-tuned {tf.mean():.2f}  vs  base {bf.mean():.2f}')
print(f'  paired t-test p={summary["paired_t_p"]:.2e}   Wilcoxon p={summary["wilcoxon_p_greater"]:.2e}')
print(f'  Cohen d={summary["cohens_d"]:+.2f}   95% CI of lift = [{summary["paired_diff_95ci"][0]:+.2f}, {summary["paired_diff_95ci"][1]:+.2f}]')
print(f'  win rate (fine-tuned > base): {summary["win_rate_trained_strictly_better"]:.0%}')
print(f'ToM probe  fine-tuned     : {tom_acc:.0%}    base = {tom_acc_base:.0%}')
print(f'Decision entropy          : {entropy:.2f} / {max_ent:.2f}  (\u2192 {_collapse_note})')
print('-'*70)
print(f'Adapter      : https://huggingface.co/{ADAPTER_REPO}')
print(f'Merged 16bit : https://huggingface.co/{MERGED_REPO}')
print(f'Env Space    : {ENV_BASE_URL}')
print('='*70)