| |
| |
| |
| %pip install -q --no-deps unsloth |
| %pip install -q unsloth_zoo |
| %pip install -q "openenv-core==0.2.3" "trl>=0.12,<2.0" "transformers>=4.45,<5.0" \ |
| "datasets>=3.0" "accelerate>=1.0" "huggingface_hub>=0.25" "pydantic>=2.0" \ |
| wandb matplotlib python-dotenv bitsandbytes scipy scikit-learn sentence-transformers |
| import os, pathlib |
| |
| try: |
| from google.colab import userdata |
| for k in ('HF_TOKEN', 'WANDB_API_KEY', 'ENV_BASE_URL', 'ADAPTER_REPO'): |
| try: |
| v = userdata.get(k) |
| if v: |
| os.environ.setdefault(k, v) |
| except Exception: |
| pass |
| except Exception: |
| pass |
|
|
| |
| try: |
| from dotenv import load_dotenv |
| for p in [pathlib.Path('.env'), pathlib.Path('../.env'), |
| pathlib.Path('/content/repo/.env')]: |
| if p.exists(): |
| load_dotenv(p, override=False) |
| print(f'Loaded env from {p.resolve()}') |
| break |
| except Exception: |
| pass |
|
|
| if not os.environ.get('HF_TOKEN'): |
| os.environ['HF_TOKEN'] = input('HF token: ').strip() |
| if not os.environ.get('WANDB_API_KEY'): |
| os.environ['WANDB_API_KEY'] = input('WandB key (or blank to skip): ').strip() |
|
|
| from huggingface_hub import login as hf_login |
| hf_login(token=os.environ['HF_TOKEN'], add_to_git_credential=False) |
| print('HF auth ok.') |
| if os.environ.get('WANDB_API_KEY'): |
| import wandb |
| wandb.login(key=os.environ['WANDB_API_KEY']) |
| print('W&B auth ok.') |
| import os, pathlib |
|
|
| IN_COLAB = os.path.isdir('/content') |
| if IN_COLAB: |
| from google.colab import drive |
| drive.mount('/content/drive', force_remount=False) |
| DRIVE_DIR = pathlib.Path('/content/drive/MyDrive/BoardSim_Run') |
| else: |
| DRIVE_DIR = pathlib.Path('./BoardSim_Run') |
| DRIVE_DIR.mkdir(parents=True, exist_ok=True) |
| ASSETS = DRIVE_DIR / 'assets'; ASSETS.mkdir(exist_ok=True) |
| CKPT = DRIVE_DIR / 'lora_qwen3_4b'; CKPT.mkdir(exist_ok=True) |
| print('DRIVE_DIR =', DRIVE_DIR) |
| import os, sys, subprocess, importlib, urllib.request, json as _json |
|
|
| ENV_BASE_URL = os.environ.get('ENV_BASE_URL', |
| 'https://stavankhobare-sst-metaxpytorch-hackathon.hf.space') |
| REPO_URL = 'https://github.com/StavanRKhobare/SST-MetaxPyTorch-Hackathon' |
|
|
| REPO_DIR = '/content/repo' if IN_COLAB else os.path.abspath('./repo') |
| if not os.path.isdir(os.path.join(REPO_DIR, '.git')): |
| subprocess.run(['git', 'clone', '--depth', '1', REPO_URL, REPO_DIR], check=True) |
| else: |
| subprocess.run(['git', '-C', REPO_DIR, 'pull', '--ff-only'], check=False) |
|
|
| ENVS_DIR = os.path.join(REPO_DIR, 'envs') |
| if ENVS_DIR not in sys.path: |
| sys.path.insert(0, ENVS_DIR) |
|
|
| for mod in [m for m in list(sys.modules) if m == 'board_sim_env' or m.startswith('board_sim_env.')]: |
| del sys.modules[mod] |
|
|
| from board_sim_env.client import BoardSimEnv |
| from board_sim_env.models import BoardSimAction, BoardSimObservation |
|
|
| try: |
| with urllib.request.urlopen(f'{ENV_BASE_URL.rstrip("/")}/health', timeout=20) as r: |
| h = _json.loads(r.read()) |
| print('health:', h) |
| except Exception as e: |
| print(f'WARN: could not reach {ENV_BASE_URL}/health ({e})') |
|
|
| def make_env(): |
| return BoardSimEnv(base_url=ENV_BASE_URL) |
|
|
| print('BoardSimEnv ready.') |
| |
| import unsloth |
| from unsloth import FastLanguageModel |
| import torch |
| import re |
|
|
| MODEL_NAME = 'Qwen/Qwen3-0.6B' |
| MAX_SEQ_LEN = 2048 |
|
|
| model, tokenizer = FastLanguageModel.from_pretrained( |
| model_name=MODEL_NAME, |
| max_seq_length=MAX_SEQ_LEN, |
| load_in_4bit=True, |
| dtype=None, |
| ) |
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
|
|
| device = next(model.parameters()).device |
| print(f'Loaded {MODEL_NAME} on {device}.') |
| mem_gb = torch.cuda.memory_allocated() / 1e9 |
| print(f'GPU memory after base load: {mem_gb:.2f} GB / 14.56 GB') |
| print(f'Headroom for compute: {14.56 - mem_gb:.2f} GB') |
| |
| SYSTEM_PROMPT = """You are the CEO of a mid-stage organization. Your board has 4 members with HIDDEN AGENDAS you cannot see directly: |
| - CTO: cares about operational excellence, engineering quality, team morale, and product readiness. |
| - CFO: cares about cash discipline, runway, and regulatory safety. |
| - Investor Rep: pushes growth, market share, and bold returns. |
| - Independent: cares about reputation, governance, and long-term consensus. |
| |
| Each round you see a strategic event, every NPC's pre-vote statement, and 3 options. |
| Your decision is resolved by WEIGHTED VOTE (your weight 2.5x). A short COALITION PITCH |
| that is semantically aligned with opposing members' priorities can swing them toward your pick — |
| write substantive arguments, not just buzzwords. |
| |
| Respond in EXACTLY this format on two lines: |
| DECISION: <one of the option strings> |
| PITCH: <one or two sentences arguing for it, addressing the concerns of opposing members>""" |
|
|
| DECISION_RE = re.compile(r'DECISION\s*:\s*([A-Za-z0-9_\- ]+)', re.IGNORECASE) |
| PITCH_RE = re.compile(r'PITCH\s*:\s*(.+)', re.IGNORECASE) |
|
|
| def build_prompt(obs): |
| statements = '\n'.join( |
| f" {s['role']} ({s['confidence']:.2f}): votes {s['vote']} - {s['statement']}" |
| for s in obs.npc_statements |
| ) |
| return ( |
| f"{SYSTEM_PROMPT}\n\n" |
| f"State: revenue=${obs.state['revenue']:.0f}/yr burn=${obs.state['burn_rate']:.0f}/mo " |
| f"runway={obs.state['runway_months']:.1f}mo morale={obs.state['team_morale']:.2f} " |
| f"investors={obs.state['investor_confidence']:.2f} reg_risk={obs.state['regulatory_risk']:.2f}\n" |
| f"Event: {obs.event}\nBoard:\n{statements}\n" |
| f"Options: {obs.options}\n" |
| ) |
|
|
| def parse_completion(completion: str, options): |
| """Returns (decision, pitch, format_ok). format_ok=True only if BOTH tags parsed.""" |
| decision = options[0] |
| decision_ok = False |
| dm = DECISION_RE.search(completion) |
| if dm: |
| cand = dm.group(1).strip().lower() |
| for opt in options: |
| if opt.lower() == cand or opt.lower() in cand: |
| decision = opt; decision_ok = True; break |
| if not decision_ok: |
| for opt in options: |
| if opt.lower() in completion.lower(): |
| decision = opt; break |
| pm = PITCH_RE.search(completion) |
| pitch = pm.group(1).strip()[:400] if pm else '' |
| format_ok = bool(dm) and bool(pm) |
| return decision, pitch, format_ok |
|
|
| MAX_NEW_TOKENS = 80 |
|
|
| def greedy_action(obs): |
| prompt = build_prompt(obs) |
| enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device) |
| with torch.no_grad(): |
| out = model.generate( |
| **enc, max_new_tokens=MAX_NEW_TOKENS, |
| do_sample=False, pad_token_id=tokenizer.eos_token_id, |
| ) |
| completion = tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True) |
| return parse_completion(completion, obs.options) |
| import random, statistics, json |
|
|
| MAX_STEPS_PER_EP = 20 |
|
|
| def run_episode(env, seed): |
| """Runs ONE full episode using the currently-active model state |
| (base if adapters disabled, fine-tuned otherwise). Returns dense metrics.""" |
| result = env.reset(seed=seed) |
| obs = result.observation |
| ep_r, n, fmt_hits, pitch_hits = 0.0, 0, 0, 0 |
| while not result.done and n < MAX_STEPS_PER_EP: |
| decision, pitch, fmt_ok = greedy_action(obs) |
| if fmt_ok: fmt_hits += 1 |
| if pitch.strip(): pitch_hits += 1 |
| result = env.step(BoardSimAction(decision=decision, coalition_pitch=pitch)) |
| obs = result.observation |
| ep_r += float(result.reward or 0.0) |
| n += 1 |
| return { |
| 'final_profit': obs.state['profitability_score'], |
| 'ep_reward': ep_r, 'steps': n, |
| 'format_rate': fmt_hits / max(1, n), 'pitch_rate': pitch_hits / max(1, n), |
| 'history': obs.state.get('history', []), |
| } |
| |
|
|
| |
| |
| |
| |
| |
| BASELINE_SEEDS = list(range(50_000, 50_000 + 100)) |
|
|
| base_finals, base_rewards, base_fmts, base_pitches = [], [], [], [] |
| with make_env().sync() as env: |
| for i, s in enumerate(BASELINE_SEEDS): |
| r = run_episode(env, s) |
| base_finals.append(r['final_profit']) |
| base_rewards.append(r['ep_reward']) |
| base_fmts.append(r['format_rate']) |
| base_pitches.append(r['pitch_rate']) |
| if (i + 1) % 10 == 0: |
| print(f' base Qwen3-0.6B {i+1}/{len(BASELINE_SEEDS)} profit={r["final_profit"]:.1f}') |
|
|
| BASELINE_MEAN_PROFIT = statistics.mean(base_finals) |
| BASELINE_MEAN_REWARD = statistics.mean(base_rewards) |
| print(f'Base Qwen3-0.6B profit : {BASELINE_MEAN_PROFIT:.2f} \u00b1 {statistics.stdev(base_finals):.2f}') |
| print(f'Base Qwen3-0.6B ep rwd : {BASELINE_MEAN_REWARD:.2f} \u00b1 {statistics.stdev(base_rewards):.2f}') |
| print(f'Base format rate : {statistics.mean(base_fmts):.0%} pitch rate: {statistics.mean(base_pitches):.0%}') |
|
|
| with open(DRIVE_DIR / 'baseline.json', 'w') as f: |
| json.dump({'model': MODEL_NAME, 'mode': 'base_no_finetune', |
| 'seeds': BASELINE_SEEDS, |
| 'finals': base_finals, 'rewards': base_rewards, |
| 'format_rates': base_fmts, 'pitch_rates': base_pitches}, f) |
| |
| |
| |
| |
| |
| model = FastLanguageModel.get_peft_model( |
| model, |
| r=32, |
| target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'], |
| lora_alpha=64, |
| lora_dropout=0.0, bias='none', |
| use_gradient_checkpointing='unsloth', |
| random_state=3407, |
| ) |
|
|
| trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) |
| total = sum(p.numel() for p in model.parameters()) |
| print(f'Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)') |
| EVAL_SEEDS = list(range(60_000, 60_000 + 10)) |
|
|
| def periodic_eval(env): |
| profits, rewards, fmts, pitches = [], [], [], [] |
| for s in EVAL_SEEDS: |
| r = run_episode(env, s) |
| profits.append(r['final_profit']); rewards.append(r['ep_reward']) |
| fmts.append(r['format_rate']); pitches.append(r['pitch_rate']) |
| import numpy as np |
| return {'profit_mean': float(np.mean(profits)), |
| 'reward_mean': float(np.mean(rewards)), |
| 'format_rate': float(np.mean(fmts)), |
| 'pitch_rate': float(np.mean(pitches))} |
| import os, json, math, time, collections |
| from torch.optim import AdamW |
|
|
| NUM_STEPS = int(os.environ.get('NUM_STEPS', 200)) |
| GROUP_SIZE = int(os.environ.get('GROUP_SIZE', 4)) |
| LR = 5e-6 |
| GRAD_CLIP = 1.0 |
| TEMPERATURE, TOP_P = 1.0, 0.95 |
| SAVE_EVERY = 25 |
| EVAL_AT = {0, 25, 50, 100, 150, NUM_STEPS - 1} |
|
|
| WANDB_OK = False |
| if os.environ.get('WANDB_API_KEY'): |
| try: |
| import wandb |
| wandb.init(project='boardsim-qwen3-grpo', name='boardsim-qwen3-grpo-v3', |
| config={'num_steps': NUM_STEPS, 'group_size': GROUP_SIZE, 'lr': LR, |
| 'temperature': TEMPERATURE, 'top_p': TOP_P, 'model': MODEL_NAME}, |
| finish_previous=True) |
| WANDB_OK = True |
| except TypeError: |
| wandb.init(project='boardsim-qwen3-grpo', name='boardsim-qwen3-grpo-v3', |
| config={'num_steps': NUM_STEPS, 'group_size': GROUP_SIZE, 'lr': LR, |
| 'temperature': TEMPERATURE, 'top_p': TOP_P, 'model': MODEL_NAME}, |
| reinit=True) |
| WANDB_OK = True |
| except Exception as e: |
| print(f'WARN: wandb.init failed: {e}') |
|
|
| optimizer = AdamW([p for p in model.parameters() if p.requires_grad], |
| lr=LR, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0) |
|
|
| log_history = [] |
| eval_history = [] |
| decision_counter = collections.Counter() |
| t0 = time.time() |
|
|
| |
| with make_env().sync() as env_train, make_env().sync() as env_score, make_env().sync() as env_eval: |
| for step in range(NUM_STEPS): |
| result = env_train.reset(seed=step) |
| obs = result.observation |
| prompt = build_prompt(obs) |
| enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device) |
| prompt_len = enc.input_ids.shape[1] |
|
|
| with torch.no_grad(): |
| gen_out = model.generate( |
| input_ids=enc.input_ids, attention_mask=enc.attention_mask, |
| max_new_tokens=MAX_NEW_TOKENS, do_sample=True, |
| temperature=TEMPERATURE, top_p=TOP_P, |
| num_return_sequences=GROUP_SIZE, |
| pad_token_id=tokenizer.eos_token_id, |
| ) |
| gen_out = gen_out.detach().clone() |
|
|
| decisions, pitches, rewards, fmt_oks = [], [], [], [] |
| for g in range(GROUP_SIZE): |
| comp = tokenizer.decode(gen_out[g][prompt_len:], skip_special_tokens=True) |
| d, pp, ok = parse_completion(comp, obs.options) |
| decisions.append(d); pitches.append(pp); fmt_oks.append(ok) |
| decision_counter[d] += 1 |
| env_score.reset(seed=step) |
| sr = env_score.step(BoardSimAction(decision=d, coalition_pitch=pp)) |
| rewards.append(float(sr.reward or 0.0)) |
|
|
| rewards_t = torch.tensor(rewards, dtype=torch.float32, device=device) |
| if rewards_t.numel() > 1 and rewards_t.std().item() > 1e-6: |
| advantages = (rewards_t - rewards_t.mean()) / (rewards_t.std() + 1e-8) |
| else: |
| advantages = rewards_t - rewards_t.mean() |
|
|
| optimizer.zero_grad() |
| full_ids = gen_out |
| attn = (full_ids != tokenizer.pad_token_id).long() |
| loss_mask = attn.clone() |
| loss_mask[:, :prompt_len] = 0 |
| out = model(input_ids=full_ids, attention_mask=attn) |
| logits = out.logits[:, :-1, :].float() |
| targets = full_ids[:, 1:] |
| mask = loss_mask[:, 1:].float() |
| log_probs = torch.nn.functional.log_softmax(logits, dim=-1) |
| token_nll = -log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1) |
| per_seq_nll = (token_nll * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0) |
| loss = (advantages.detach() * per_seq_nll).mean() |
| loss.backward() |
| total_loss_val = float(loss.detach().item()) |
| torch.nn.utils.clip_grad_norm_( |
| [p for p in model.parameters() if p.requires_grad], GRAD_CLIP) |
| optimizer.step() |
|
|
| rec = { |
| 'step': step, |
| 'reward': float(rewards_t.mean().item()), |
| 'reward_std': float(rewards_t.std().item()) if rewards_t.numel() > 1 else 0.0, |
| 'reward_max': float(rewards_t.max().item()), |
| 'loss': total_loss_val, |
| 'format_rate': sum(fmt_oks) / GROUP_SIZE, |
| 'pitch_rate': sum(1 for p in pitches if p.strip()) / GROUP_SIZE, |
| 'elapsed_s': time.time() - t0, |
| } |
| log_history.append(rec) |
| if WANDB_OK: |
| wandb.log(rec, step=step) |
|
|
| if step % 5 == 0: |
| print(f"step={step:4d} reward={rec['reward']:+.3f} (\u00b1{rec['reward_std']:.2f}) " |
| f"loss={rec['loss']:+.4f} fmt={rec['format_rate']:.0%} " |
| f"elapsed={rec['elapsed_s']:.0f}s d0={decisions[0]}") |
|
|
| if step in EVAL_AT: |
| ev = periodic_eval(env_eval) |
| ev['step'] = step |
| eval_history.append(ev) |
| print(f" [eval@{step}] profit={ev['profit_mean']:.2f} " |
| f"reward={ev['reward_mean']:.2f} fmt={ev['format_rate']:.0%}") |
| if WANDB_OK: |
| wandb.log({f'eval/{k}': v for k, v in ev.items() if k != 'step'}, step=step) |
|
|
| if step > 0 and step % SAVE_EVERY == 0: |
| model.save_pretrained(str(CKPT)) |
| tokenizer.save_pretrained(str(CKPT)) |
| with open(DRIVE_DIR / 'log_history.json', 'w') as f: |
| json.dump(log_history, f) |
| with open(DRIVE_DIR / 'eval_history.json', 'w') as f: |
| json.dump(eval_history, f) |
|
|
| model.save_pretrained(str(CKPT)) |
| tokenizer.save_pretrained(str(CKPT)) |
| with open(DRIVE_DIR / 'log_history.json', 'w') as f: |
| json.dump(log_history, f) |
| with open(DRIVE_DIR / 'eval_history.json', 'w') as f: |
| json.dump(eval_history, f) |
| with open(DRIVE_DIR / 'decision_counter.json', 'w') as f: |
| json.dump(dict(decision_counter), f) |
| if WANDB_OK: |
| wandb.finish() |
| print(f'Training done. {len(log_history)} steps in {time.time() - t0:.0f}s. -> {CKPT}') |
| import numpy as np, matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| from scipy import stats as spstats |
|
|
| steps = np.array([e['step'] for e in log_history]) |
| rewards = np.array([e['reward'] for e in log_history]) |
| losses = np.array([e['loss'] for e in log_history]) |
| fmts = np.array([e['format_rate'] for e in log_history]) |
| pitches = np.array([e['pitch_rate'] for e in log_history]) |
|
|
| def ema(xs, alpha=0.1): |
| out, s = [], xs[0] if len(xs) else 0.0 |
| for x in xs: |
| s = alpha * x + (1 - alpha) * s |
| out.append(s) |
| return np.array(out) |
|
|
| rewards_ema = ema(rewards, 0.1) |
| slope, intercept, r_val, p_val, _ = spstats.linregress(steps, rewards) |
|
|
| |
| plt.figure(figsize=(9, 5)) |
| plt.plot(steps, rewards, alpha=0.3, lw=1, label='per-step group reward') |
| plt.plot(steps, rewards_ema, lw=2.2, label='EMA (\u03b1=0.1)') |
| plt.plot(steps, intercept + slope * steps, '--', lw=1.5, |
| label=f'linear fit slope={slope:+.4f}/step (p={p_val:.1e})') |
| plt.axhline(BASELINE_MEAN_REWARD, ls=':', lw=2, color='#c44', |
| label=f'base Qwen3-0.6B baseline = {BASELINE_MEAN_REWARD:.2f}') |
| plt.title('GRPO reward — BoardSim (vs same model w/o fine-tuning)') |
| plt.xlabel('step'); plt.ylabel('mean group reward') |
| plt.legend(); plt.grid(alpha=0.3); plt.tight_layout() |
| plt.savefig(ASSETS / 'reward_curve.png', dpi=150); plt.close() |
|
|
| |
| plt.figure(figsize=(9, 5)) |
| plt.plot(steps, losses, lw=1.5) |
| plt.title('GRPO loss (advantage \u00d7 NLL)'); plt.xlabel('step'); plt.ylabel('loss') |
| plt.grid(alpha=0.3); plt.tight_layout() |
| plt.savefig(ASSETS / 'loss_curve.png', dpi=150); plt.close() |
|
|
| |
| plt.figure(figsize=(9, 5)) |
| plt.plot(steps, ema(fmts, 0.05), lw=2, label='format-OK rate (EMA)') |
| plt.plot(steps, ema(pitches, 0.05), lw=2, label='non-empty pitch rate (EMA)') |
| plt.title('Format compliance + pitch usage during training') |
| plt.xlabel('step'); plt.ylabel('rate'); plt.ylim(-0.05, 1.05) |
| plt.legend(); plt.grid(alpha=0.3); plt.tight_layout() |
| plt.savefig(ASSETS / 'format_compliance.png', dpi=150); plt.close() |
|
|
| |
| |
| |
| if eval_history: |
| es = [e['step'] for e in eval_history] |
| epm = [e['profit_mean'] for e in eval_history] |
| erm = [e['reward_mean'] for e in eval_history] |
| plt.figure(figsize=(9, 5)) |
| plt.plot(es, epm, '-o', lw=2, label='held-out profitability (mean of 10 episodes)') |
| plt.plot(es, erm, '-s', lw=2, label='held-out episode reward') |
| plt.axhline(BASELINE_MEAN_PROFIT, ls=':', lw=1.5, color='#c44', |
| label=f'base Qwen3-0.6B profitability = {BASELINE_MEAN_PROFIT:.2f}') |
| plt.title('Periodic held-out eval during training (greedy)') |
| plt.xlabel('training step'); plt.ylabel('value') |
| plt.legend(); plt.grid(alpha=0.3); plt.tight_layout() |
| plt.savefig(ASSETS / 'periodic_eval.png', dpi=150); plt.close() |
|
|
| print(f'Linear-fit slope on reward: {slope:+.5f}/step (p={p_val:.2e}, R\u00b2={r_val**2:.3f})') |
| print('Saved reward_curve.png, loss_curve.png, format_compliance.png, periodic_eval.png') |
| |
| |
| |
| |
| |
| from unsloth import FastLanguageModel |
| FastLanguageModel.for_inference(model) |
|
|
| EVAL_N = 50 |
| PAIRED_SEEDS = list(range(70_000, 70_000 + EVAL_N)) |
|
|
| |
| trained_finals, trained_rewards, trained_fmt, trained_pitch = [], [], [], [] |
| trained_history_per_seed = [] |
| with make_env().sync() as env: |
| for i, s in enumerate(PAIRED_SEEDS): |
| r = run_episode(env, s) |
| trained_finals.append(r['final_profit']) |
| trained_rewards.append(r['ep_reward']) |
| trained_fmt.append(r['format_rate']) |
| trained_pitch.append(r['pitch_rate']) |
| trained_history_per_seed.append(r['history']) |
| if (i + 1) % 10 == 0: |
| print(f' trained {i+1}/{EVAL_N} profit={r["final_profit"]:.1f}') |
|
|
| |
| base_finals_paired, base_rewards_paired, base_fmt_paired, base_pitch_paired = [], [], [], [] |
| base_history_per_seed = [] |
| with make_env().sync() as env, model.disable_adapter(): |
| for i, s in enumerate(PAIRED_SEEDS): |
| r = run_episode(env, s) |
| base_finals_paired.append(r['final_profit']) |
| base_rewards_paired.append(r['ep_reward']) |
| base_fmt_paired.append(r['format_rate']) |
| base_pitch_paired.append(r['pitch_rate']) |
| base_history_per_seed.append(r['history']) |
| if (i + 1) % 10 == 0: |
| print(f' base {i+1}/{EVAL_N} profit={r["final_profit"]:.1f}') |
|
|
| tf, bf = np.array(trained_finals), np.array(base_finals_paired) |
| tr, br = np.array(trained_rewards), np.array(base_rewards_paired) |
|
|
| print(f'\nTrained Qwen3-0.6B profit : {tf.mean():.2f} \u00b1 {tf.std():.2f}') |
| print(f'Base Qwen3-0.6B profit : {bf.mean():.2f} \u00b1 {bf.std():.2f}') |
| print(f'Trained ep reward : {tr.mean():.2f} \u00b1 {tr.std():.2f}') |
| print(f'Base ep reward : {br.mean():.2f} \u00b1 {br.std():.2f}') |
| print(f'Trained format/pitch : {np.mean(trained_fmt):.0%} / {np.mean(trained_pitch):.0%}') |
| print(f'Base format/pitch : {np.mean(base_fmt_paired):.0%} / {np.mean(base_pitch_paired):.0%}') |
|
|
| with open(DRIVE_DIR / 'eval_paired.json', 'w') as f: |
| json.dump({'seeds': PAIRED_SEEDS, |
| 'trained_finals': tf.tolist(), 'base_finals': bf.tolist(), |
| 'trained_rewards': tr.tolist(), 'base_rewards': br.tolist(), |
| 'trained_format_rate': float(np.mean(trained_fmt)), |
| 'base_format_rate': float(np.mean(base_fmt_paired)), |
| 'trained_pitch_rate': float(np.mean(trained_pitch)), |
| 'base_pitch_rate': float(np.mean(base_pitch_paired))}, f) |
| from scipy import stats as spstats |
|
|
| def cohen_d(a, b): |
| pooled = np.sqrt(((a.std(ddof=1)**2) + (b.std(ddof=1)**2)) / 2) |
| return (a.mean() - b.mean()) / (pooled + 1e-12) |
|
|
| def bootstrap_diff_ci(a, b, n=10_000, seed=0): |
| rng = np.random.default_rng(seed) |
| diffs = a - b |
| boots = rng.choice(diffs, size=(n, len(diffs)), replace=True).mean(axis=1) |
| return float(np.percentile(boots, 2.5)), float(np.percentile(boots, 97.5)) |
|
|
| tt = spstats.ttest_rel(tf, bf) |
| uu = spstats.mannwhitneyu(tf, bf, alternative='greater') |
| wilc = spstats.wilcoxon(tf, bf, alternative='greater') |
| d = cohen_d(tf, bf) |
| lo, hi = bootstrap_diff_ci(tf, bf) |
| win_rate = float((tf > bf).mean()) |
| tie_rate = float((tf == bf).mean()) |
|
|
| summary = { |
| 'baseline_model': MODEL_NAME + ' (no fine-tune)', |
| 'trained_model': MODEL_NAME + ' + LoRA r=32', |
| 'n': len(tf), |
| 'paired_t_stat': float(tt.statistic), 'paired_t_p': float(tt.pvalue), |
| 'mannwhitney_U': float(uu.statistic), 'mannwhitney_p_greater': float(uu.pvalue), |
| 'wilcoxon_p_greater': float(wilc.pvalue), |
| 'cohens_d': float(d), |
| 'paired_diff_mean': float((tf - bf).mean()), |
| 'paired_diff_95ci': [lo, hi], |
| 'win_rate_trained_strictly_better': win_rate, |
| 'tie_rate': tie_rate, |
| } |
| print(json.dumps(summary, indent=2)) |
| with open(DRIVE_DIR / 'stats_summary.json', 'w') as f: |
| json.dump(summary, f, indent=2) |
| |
| bins = np.linspace(0, 100, 25) |
| plt.figure(figsize=(9, 5)) |
| plt.hist(bf, bins=bins, alpha=0.55, color='#c44', |
| label=f'Base Qwen3-0.6B (mean={bf.mean():.1f})') |
| plt.hist(tf, bins=bins, alpha=0.55, color='#1d6fff', |
| label=f'Fine-tuned Qwen3-0.6B (mean={tf.mean():.1f})') |
| plt.axvline(bf.mean(), color='#c44', ls='--', lw=1.5) |
| plt.axvline(tf.mean(), color='#1d6fff', ls='--', lw=1.5) |
| plt.title(f'Final profitability — paired same-seed (n={len(tf)}) ' |
| f"d={summary['cohens_d']:+.2f} win-rate={summary['win_rate_trained_strictly_better']:.0%}") |
| plt.xlabel('profitability score (0\u2013100)'); plt.ylabel('episodes') |
| plt.legend(); plt.grid(alpha=0.3); plt.tight_layout() |
| plt.savefig(ASSETS / 'before_after.png', dpi=150); plt.close() |
|
|
| diffs = tf - bf |
| order = np.argsort(diffs) |
| plt.figure(figsize=(9, 5)) |
| plt.bar(range(len(diffs)), diffs[order], |
| color=['#1d6fff' if x > 0 else '#c44' for x in diffs[order]]) |
| plt.axhline(0, color='k', lw=0.8) |
| plt.title(f'Per-seed lift (fine-tuned \u2212 base Qwen3-0.6B), sorted ' |
| f'mean lift = {diffs.mean():+.1f} CI=[{summary["paired_diff_95ci"][0]:+.1f}, {summary["paired_diff_95ci"][1]:+.1f}]') |
| plt.xlabel('seed (sorted by lift)'); plt.ylabel('\u0394 profitability') |
| plt.grid(alpha=0.3); plt.tight_layout() |
| plt.savefig(ASSETS / 'paired_delta.png', dpi=150); plt.close() |
| print('Saved before_after.png, paired_delta.png') |
| |
| |
| |
| |
| |
| def per_event_winrate(history_per_seed): |
| bucket = collections.defaultdict(lambda: [0, 0]) |
| for hist in history_per_seed: |
| for rd in hist: |
| t = rd.get('event_title', '?') |
| bucket[t][1] += 1 |
| if rd.get('agent_won_vote'): |
| bucket[t][0] += 1 |
| return {t: (w / max(1, n)) for t, (w, n) in bucket.items()} |
|
|
| trained_wr = per_event_winrate(trained_history_per_seed) |
| base_wr = per_event_winrate(base_history_per_seed) |
|
|
| events_sorted = sorted(set(trained_wr) | set(base_wr)) |
| tw = [trained_wr.get(e, 0.0) for e in events_sorted] |
| bw = [base_wr.get(e, 0.0) for e in events_sorted] |
|
|
| plt.figure(figsize=(11, 5)) |
| x = np.arange(len(events_sorted)) |
| plt.bar(x - 0.2, bw, width=0.4, color='#c44', label='Base Qwen3-0.6B') |
| plt.bar(x + 0.2, tw, width=0.4, color='#1d6fff', label='Fine-tuned Qwen3-0.6B') |
| plt.xticks(x, [e[:22] for e in events_sorted], rotation=30, ha='right') |
| plt.ylim(0, 1.05); plt.ylabel('boardroom win rate') |
| plt.title('Per-event boardroom win rate (paired seeds, n=50 episodes)') |
| plt.legend(); plt.grid(alpha=0.3, axis='y'); plt.tight_layout() |
| plt.savefig(ASSETS / 'per_event_winrate.png', dpi=150); plt.close() |
|
|
| with open(DRIVE_DIR / 'per_event_winrate.json', 'w') as f: |
| json.dump({'events': events_sorted, 'trained': tw, 'base': bw}, f, indent=2) |
| print('Saved per_event_winrate.png') |
| |
| |
| |
| |
| |
| TOM_INSTRUCTION = ( |
| "\n\nGiven the state and event below, name the SINGLE board member " |
| "(CTO, CFO, Investor Rep, or Independent) most likely to oppose the chosen decision. " |
| "Answer with just the role name on one line.\n" |
| ) |
|
|
| def tom_predict(obs, decision): |
| body = build_prompt(obs).split(SYSTEM_PROMPT, 1)[1] |
| prompt = SYSTEM_PROMPT + TOM_INSTRUCTION + body + f'Chosen decision: {decision}\nMost likely opponent: ' |
| enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device) |
| with torch.no_grad(): |
| out = model.generate(**enc, max_new_tokens=8, do_sample=False, |
| pad_token_id=tokenizer.eos_token_id) |
| txt = tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True).lower() |
| if 'investor' in txt: return 'Investor Rep' |
| if 'independent' in txt: return 'Independent' |
| if 'cto' in txt: return 'CTO' |
| if 'cfo' in txt: return 'CFO' |
| return None |
|
|
| def tom_eval(seed_base=80_000, n=40): |
| correct = total = 0 |
| with make_env().sync() as env: |
| for ep in range(n): |
| result = env.reset(seed=seed_base + ep) |
| obs = result.observation |
| decision, _, _ = greedy_action(obs) |
| opposed = [s['role'] for s in obs.npc_statements if s['vote'] != decision] |
| if not opposed: |
| continue |
| pred = tom_predict(obs, decision) |
| if pred and pred in opposed: |
| correct += 1 |
| total += 1 |
| return correct, total |
|
|
| t_corr, t_tot = tom_eval() |
| with model.disable_adapter(): |
| b_corr, b_tot = tom_eval() |
|
|
| tom_acc = t_corr / max(1, t_tot) |
| tom_acc_base = b_corr / max(1, b_tot) |
| print(f'ToM probe: trained = {tom_acc:.1%} ({t_corr}/{t_tot}) base = {tom_acc_base:.1%} ({b_corr}/{b_tot})') |
| with open(DRIVE_DIR / 'tom.json', 'w') as f: |
| json.dump({'trained': {'correct': t_corr, 'total': t_tot, 'accuracy': tom_acc}, |
| 'base': {'correct': b_corr, 'total': b_tot, 'accuracy': tom_acc_base}}, f) |
| from huggingface_hub import HfApi |
| ADAPTER_REPO = os.environ.get('ADAPTER_REPO', 'StavanKhobare/SST-MetaxPyTorch-Hackathon-LoRA') |
| MERGED_REPO = os.environ.get('MERGED_REPO', 'StavanKhobare/SST-MetaxPyTorch-Hackathon-Merged16bit') |
|
|
| api = HfApi() |
| api.create_repo(ADAPTER_REPO, repo_type='model', private=False, exist_ok=True) |
| api.create_repo(MERGED_REPO, repo_type='model', private=False, exist_ok=True) |
|
|
| |
| try: |
| model.push_to_hub(ADAPTER_REPO, private=False) |
| tokenizer.push_to_hub(ADAPTER_REPO, private=False) |
| print(f'\u2713 LoRA pushed: https://huggingface.co/{ADAPTER_REPO}') |
| except Exception as e: |
| print(f'LoRA push failed: {e!r}') |
|
|
| |
| try: |
| model.push_to_hub_merged(MERGED_REPO, tokenizer, save_method='merged_16bit', private=False) |
| print(f'\u2713 Merged 16-bit pushed: https://huggingface.co/{MERGED_REPO}') |
| except Exception as e: |
| print(f'Merged push failed (you can retry): {e!r}') |
|
|
| |
| try: |
| api.upload_folder(folder_path=str(ASSETS), repo_id=ADAPTER_REPO, |
| path_in_repo='assets', repo_type='model') |
| for fname in ['log_history.json','eval_history.json','eval_paired.json', |
| 'stats_summary.json','tom.json','transcripts.json', |
| 'decision_counter.json','baseline.json', |
| 'per_event_winrate.json']: |
| fp = DRIVE_DIR / fname |
| if fp.exists(): |
| api.upload_file(path_or_fileobj=str(fp), path_in_repo=fname, |
| repo_id=ADAPTER_REPO, repo_type='model') |
| print(f'\u2713 Artifacts uploaded to https://huggingface.co/{ADAPTER_REPO}') |
| except Exception as e: |
| print(f'Artifact upload failed: {e!r}') |
| print('='*70) |
| print('BOARDSIM \u00d7 QWEN3-4B \u2014 LEARNING EVIDENCE') |
| print('='*70) |
| print(f'Reward slope (linear fit) : {slope:+.5f}/step (p={p_val:.2e})') |
| print(f'Reward EMA first 20 steps : {rewards_ema[:20].mean():+.3f}') |
| print(f'Reward EMA last 20 steps : {rewards_ema[-20:].mean():+.3f}') |
| print(f'Format compliance start : {fmts[:20].mean():.0%}') |
| print(f'Format compliance end : {fmts[-20:].mean():.0%}') |
| print('-'*70) |
| print(f'Held-out paired (n={len(tf)}): fine-tuned {tf.mean():.2f} vs base {bf.mean():.2f}') |
| print(f' paired t-test p={summary["paired_t_p"]:.2e} Wilcoxon p={summary["wilcoxon_p_greater"]:.2e}') |
| print(f' Cohen d={summary["cohens_d"]:+.2f} 95% CI of lift = [{summary["paired_diff_95ci"][0]:+.2f}, {summary["paired_diff_95ci"][1]:+.2f}]') |
| print(f' win rate (fine-tuned > base): {summary["win_rate_trained_strictly_better"]:.0%}') |
| print(f'ToM probe fine-tuned : {tom_acc:.0%} base = {tom_acc_base:.0%}') |
| print(f'Decision entropy : {entropy:.2f} / {max_ent:.2f} (\u2192 not collapsed)') |
| print('-'*70) |
| print(f'Adapter : https://huggingface.co/{ADAPTER_REPO}') |
| print(f'Merged 16bit : https://huggingface.co/{MERGED_REPO}') |
| print(f'Env Space : {ENV_BASE_URL}') |
| print('='*70) |
|
|