{"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.12.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":31329,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":5,"nbformat":4,"cells":[{"id":"1df1cb2a-0c95-4ee5-a997-dc73db3d8b49","cell_type":"markdown","source":"# BoardSim × Qwen3-0.6B — GRPO LoRA fine-tune (Kaggle edition)\n\nThe model is whatever `MODEL_NAME` is set to in section 5 (currently `Qwen/Qwen3-0.6B`); the checkpoint folder keeps its older `lora_qwen3_1p7b` name.\n\nRuns on Kaggle GPUs (T4 x2 or P100). Enable: **Settings → Accelerator: GPU**, **Internet: On**.\n\nAdd Kaggle Secrets (Add-ons → Secrets):\n- `HF_TOKEN` (required)\n- `WANDB_API_KEY` (optional)\n- `ENV_BASE_URL` (optional, defaults to public HF Space)\n- `ADAPTER_REPO`, `MERGED_REPO` (optional)","metadata":{}},{"id":"6dbca818-8816-4674-ad2f-995a85afa322","cell_type":"markdown","source":"## 1. 
Install deps (unsloth FIRST — patches torch/transformers at import)","metadata":{}},{"id":"4c138c25-d385-4346-ae41-ce47dd39c670","cell_type":"code","source":"%pip install -q --no-deps unsloth\n%pip install -q unsloth_zoo\n%pip install -q \"openenv-core==0.2.3\" \"trl>=0.12,<2.0\" \"transformers>=4.45,<5.0\" \\\n \"datasets>=3.0\" \"accelerate>=1.0\" \"huggingface_hub>=0.25\" \"pydantic>=2.0\" \\\n wandb matplotlib python-dotenv bitsandbytes scipy scikit-learn sentence-transformers","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:45:07.055736Z","iopub.execute_input":"2026-04-26T06:45:07.056326Z","iopub.status.idle":"2026-04-26T06:45:16.564270Z","shell.execute_reply.started":"2026-04-26T06:45:07.056292Z","shell.execute_reply":"2026-04-26T06:45:16.563185Z"}},"outputs":[{"name":"stdout","text":"Note: you may need to restart the kernel to use updated packages.\nNote: you may need to restart the kernel to use updated packages.\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}],"execution_count":1},{"id":"3d31460e-0652-479a-b7c2-f4d98f5e721b","cell_type":"markdown","source":"## 2. 
Auth — Kaggle Secrets → env vars → HF / W&B login","metadata":{}},{"id":"1fc3b887-057d-4faf-a779-d22edd23e28d","cell_type":"code","source":"import getpass, os, pathlib\n\nIN_KAGGLE = os.path.isdir('/kaggle')\n\n# 1) Kaggle Secrets first. setdefault() never overwrites a value that is already\n#    present in the environment.\nif IN_KAGGLE:\n    try:\n        from kaggle_secrets import UserSecretsClient\n        usc = UserSecretsClient()\n        for k in ('HF_TOKEN', 'WANDB_API_KEY', 'ENV_BASE_URL', 'ADAPTER_REPO', 'MERGED_REPO'):\n            try:\n                v = usc.get_secret(k)\n                if v:\n                    os.environ.setdefault(k, v)\n            except Exception:\n                # Best effort: a secret that cannot be fetched is simply skipped.\n                pass\n    except Exception as e:\n        print(f'kaggle_secrets unavailable: {e}')\n\n# 2) .env fallback: the first existing file wins; override=False keeps values set above.\ntry:\n    from dotenv import load_dotenv\n    for p in [pathlib.Path('.env'), pathlib.Path('../.env'),\n              pathlib.Path('/kaggle/working/.env')]:\n        if p.exists():\n            load_dotenv(p, override=False)\n            print(f'Loaded env from {p.resolve()}')\n            break\nexcept Exception:\n    # Best effort: a missing or failing dotenv load is ignored.\n    pass\n\n# 3) Last resort: ask. getpass() is used instead of input() so the typed secret is not\n#    echoed into the cell output, which is saved together with the notebook.\nif not os.environ.get('HF_TOKEN'):\n    os.environ['HF_TOKEN'] = getpass.getpass('HF token: ').strip()\nif not os.environ.get('WANDB_API_KEY'):\n    try:\n        os.environ['WANDB_API_KEY'] = getpass.getpass('WandB key (or blank to skip): ').strip()\n    except (EOFError, NotImplementedError):\n        # No usable stdin (presumably a non-interactive run): W&B is optional, so go on without it.\n        os.environ['WANDB_API_KEY'] = ''\n\nfrom huggingface_hub import login as hf_login\nhf_login(token=os.environ['HF_TOKEN'], add_to_git_credential=False)\nprint('HF auth ok.')\nif os.environ.get('WANDB_API_KEY'):\n    import wandb\n    wandb.login(key=os.environ['WANDB_API_KEY'])\n    print('W&B auth ok.')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:45:23.675886Z","iopub.execute_input":"2026-04-26T06:45:23.676245Z","iopub.status.idle":"2026-04-26T06:45:33.147950Z","shell.execute_reply.started":"2026-04-26T06:45:23.676209Z","shell.execute_reply":"2026-04-26T06:45:33.147370Z"}},"outputs":[{"name":"stderr","text":"Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n","output_type":"stream"},{"name":"stdout","text":"HF auth ok.\n","output_type":"stream"},{"name":"stderr","text":"/usr/local/lib/python3.12/dist-packages/notebook/notebookapp.py:191: 
SyntaxWarning: invalid escape sequence '\\/'\n | |_| | '_ \\/ _` / _` | _/ -_)\n\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n\u001b[34m\u001b[1mwandb\u001b[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.\n\u001b[34m\u001b[1mwandb\u001b[0m: No netrc file found, creating one.\n\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mstavanrkhobare\u001b[0m (\u001b[33mstavanrkhobare-r-v-college-of-engineering\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n","output_type":"stream"},{"name":"stdout","text":"W&B auth ok.\n","output_type":"stream"}],"execution_count":2},{"id":"6f4139aa-d782-49b7-a1dc-e617ca26e305","cell_type":"markdown","source":"## 3. 
Working dirs (Kaggle uses `/kaggle/working` — persists as notebook output)","metadata":{}},{"id":"3499d810-5d4d-48a2-9754-7a071be8f619","cell_type":"code","source":"import pathlib\n\nif IN_KAGGLE:\n WORK_DIR = pathlib.Path('/kaggle/working/BoardSim_Run')\nelse:\n WORK_DIR = pathlib.Path('./BoardSim_Run')\nWORK_DIR.mkdir(parents=True, exist_ok=True)\nASSETS = WORK_DIR / 'assets'; ASSETS.mkdir(exist_ok=True)\nCKPT = WORK_DIR / 'lora_qwen3_1p7b'; CKPT.mkdir(exist_ok=True)\nprint('WORK_DIR =', WORK_DIR)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:47:04.590428Z","iopub.execute_input":"2026-04-26T06:47:04.591388Z","iopub.status.idle":"2026-04-26T06:47:04.597329Z","shell.execute_reply.started":"2026-04-26T06:47:04.591355Z","shell.execute_reply":"2026-04-26T06:47:04.596544Z"}},"outputs":[{"name":"stdout","text":"WORK_DIR = /kaggle/working/BoardSim_Run\n","output_type":"stream"}],"execution_count":3},{"id":"3c8e4b29-bf64-4a09-95df-6330427a5eed","cell_type":"markdown","source":"## 4. 
Clone repo + connect to BoardSim env","metadata":{}},{"id":"34fa6345-2ef6-48c2-ac33-b1322e4f7965","cell_type":"code","source":"import os, sys, subprocess, urllib.request, json as _json\n\nENV_BASE_URL = os.environ.get('ENV_BASE_URL',\n 'https://stavankhobare-sst-metaxpytorch-hackathon.hf.space')\nREPO_URL = 'https://github.com/StavanRKhobare/SST-MetaxPyTorch-Hackathon'\n\nREPO_DIR = '/kaggle/working/repo' if IN_KAGGLE else os.path.abspath('./repo')\nif not os.path.isdir(os.path.join(REPO_DIR, '.git')):\n subprocess.run(['git', 'clone', '--depth', '1', REPO_URL, REPO_DIR], check=True)\nelse:\n subprocess.run(['git', '-C', REPO_DIR, 'pull', '--ff-only'], check=False)\n\nENVS_DIR = os.path.join(REPO_DIR, 'envs')\nif ENVS_DIR not in sys.path:\n sys.path.insert(0, ENVS_DIR)\n\nfor mod in [m for m in list(sys.modules) if m == 'board_sim_env' or m.startswith('board_sim_env.')]:\n del sys.modules[mod]\n\nfrom board_sim_env.client import BoardSimEnv\nfrom board_sim_env.models import BoardSimAction, BoardSimObservation\n\ntry:\n with urllib.request.urlopen(f'{ENV_BASE_URL.rstrip(\"/\")}/health', timeout=20) as r:\n h = _json.loads(r.read())\n print('health:', h)\nexcept Exception as e:\n print(f'WARN: could not reach {ENV_BASE_URL}/health ({e})')\n\ndef make_env():\n return BoardSimEnv(base_url=ENV_BASE_URL)\n\nprint('BoardSimEnv ready.')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:47:29.780919Z","iopub.execute_input":"2026-04-26T06:47:29.781827Z","iopub.status.idle":"2026-04-26T06:47:37.242897Z","shell.execute_reply.started":"2026-04-26T06:47:29.781795Z","shell.execute_reply":"2026-04-26T06:47:37.242018Z"}},"outputs":[{"name":"stderr","text":"Cloning into '/kaggle/working/repo'...\n","output_type":"stream"},{"name":"stdout","text":"health: {'status': 'healthy'}\nBoardSimEnv ready.\n","output_type":"stream"}],"execution_count":4},{"id":"89e49517-7bf8-43fc-a820-b82c114c3aa3","cell_type":"markdown","source":"## 5. 
Load Qwen3-1.7B in 4-bit via Unsloth","metadata":{}},{"id":"cc858a70-2b6e-49c6-b98d-5699eb3ec450","cell_type":"code","source":"import unsloth # noqa: F401\nfrom unsloth import FastLanguageModel\nimport torch, re\n\nMODEL_NAME = 'Qwen/Qwen3-0.6B'\nMAX_SEQ_LEN = 2048\n\nmodel, tokenizer = FastLanguageModel.from_pretrained(\n model_name=MODEL_NAME,\n max_seq_length=MAX_SEQ_LEN,\n load_in_4bit=True,\n dtype=None,\n)\nif tokenizer.pad_token is None:\n tokenizer.pad_token = tokenizer.eos_token\n\ndevice = next(model.parameters()).device\nprint(f'Loaded {MODEL_NAME} on {device}.')\nif torch.cuda.is_available():\n total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9\n mem_gb = torch.cuda.memory_allocated() / 1e9\n print(f'GPU memory after base load: {mem_gb:.2f} GB / {total_gb:.2f} GB')\n print(f'Headroom for compute: {total_gb - mem_gb:.2f} GB')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:47:59.583329Z","iopub.execute_input":"2026-04-26T06:47:59.584074Z","iopub.status.idle":"2026-04-26T06:48:54.637486Z","shell.execute_reply.started":"2026-04-26T06:47:59.584032Z","shell.execute_reply":"2026-04-26T06:48:54.636739Z"}},"outputs":[{"name":"stdout","text":"🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n","output_type":"stream"},{"name":"stderr","text":"2026-04-26 06:48:09.793138: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\nWARNING: All log messages before absl::InitializeLog() is called are written to STDERR\nE0000 00:00:1777186089.964130 137 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\nE0000 00:00:1777186090.018837 137 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\nW0000 
00:00:1777186090.438874 137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1777186090.438920 137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1777186090.438923 137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1777186090.438926 137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n","output_type":"stream"},{"name":"stdout","text":"🦥 Unsloth Zoo will now patch everything to make training faster!\n==((====))== Unsloth 2026.4.8: Fast Qwen3 patching. Transformers: 4.57.6.\n \\\\ /| Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.\nO^O/ \\_/ \\ Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0\n\\ / Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]\n \"-____-\" Free license: http://github.com/unslothai/unsloth\nUnsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"model.safetensors: 0%| | 0.00/576M [00:00.\nLoaded Qwen/Qwen3-0.6B on cuda:0.\nGPU memory after base load: 0.60 GB / 15.64 GB\nHeadroom for compute: 15.03 GB\n","output_type":"stream"}],"execution_count":5},{"id":"db967c61-6f66-4c56-919e-49c564113cd2","cell_type":"markdown","source":"## 6. Prompt + parser + greedy action helper","metadata":{}},{"id":"1f92e995-14c1-4b5f-abf1-50fc00081343","cell_type":"code","source":"SYSTEM_PROMPT = \"\"\"You are the CEO of a mid-stage organization. 
Your board has 4 members with HIDDEN AGENDAS you cannot see directly:\n - CTO: cares about operational excellence, engineering quality, team morale, and product readiness.\n - CFO: cares about cash discipline, runway, and regulatory safety.\n - Investor Rep: pushes growth, market share, and bold returns.\n - Independent: cares about reputation, governance, and long-term consensus.\n\nEach round you see a strategic event, every NPC's pre-vote statement, and 3 options.\nYour decision is resolved by WEIGHTED VOTE (your weight 2.5x). A short COALITION PITCH\nthat is semantically aligned with opposing members' priorities can swing them toward your pick —\nwrite substantive arguments, not just buzzwords.\n\nRespond in EXACTLY this format on two lines:\nDECISION: \nPITCH: \"\"\"\n\nDECISION_RE = re.compile(r'DECISION\\s*:\\s*([A-Za-z0-9_\\- ]+)', re.IGNORECASE)\nPITCH_RE = re.compile(r'PITCH\\s*:\\s*(.+)', re.IGNORECASE)\n\ndef build_prompt(obs):\n    \"\"\"Render one observation (state, event, NPC statements, options) as the text prompt.\"\"\"\n    statements = '\\n'.join(\n        f\" {s['role']} ({s['confidence']:.2f}): votes {s['vote']} - {s['statement']}\"\n        for s in obs.npc_statements\n    )\n    return (\n        f\"{SYSTEM_PROMPT}\\n\\n\"\n        f\"State: revenue=${obs.state['revenue']:.0f}/yr burn=${obs.state['burn_rate']:.0f}/mo \"\n        f\"runway={obs.state['runway_months']:.1f}mo morale={obs.state['team_morale']:.2f} \"\n        f\"investors={obs.state['investor_confidence']:.2f} reg_risk={obs.state['regulatory_risk']:.2f}\\n\"\n        f\"Event: {obs.event}\\nBoard:\\n{statements}\\n\"\n        f\"Options: {obs.options}\\n\"\n    )\n\ndef parse_completion(completion: str, options):\n    \"\"\"Parse a raw completion into (decision, pitch, format_ok).\n\n    decision  : always an element of `options`; options[0] when nothing can be matched.\n    pitch     : text captured after 'PITCH:', stripped and capped at 400 chars ('' if absent).\n    format_ok : True only when a PITCH line exists AND the DECISION line names a valid\n                option, so an unrecognised option does not count as well-formatted.\n    \"\"\"\n    def _pick(text):\n        # Exact match first; otherwise the LONGEST option contained in `text`, so that an\n        # option which is a substring of another (e.g. 'accept' vs 'accept_deal') cannot\n        # shadow it just because it comes earlier in the list.\n        text = text.strip().lower()\n        hits = [opt for opt in options if opt.lower() == text]\n        if not hits:\n            hits = [opt for opt in options if opt.lower() in text]\n        return max(hits, key=len) if hits else None\n\n    dm = DECISION_RE.search(completion)\n    decision = _pick(dm.group(1)) if dm else None\n    decision_ok = decision is not None\n    if not decision_ok:\n        # No usable DECISION line: look for an option anywhere in the completion, and only\n        # then fall back to the first option.\n        decision = _pick(completion) or options[0]\n    pm = PITCH_RE.search(completion)\n    pitch = pm.group(1).strip()[:400] if pm else ''\n    format_ok = decision_ok and bool(pm)\n    return decision, pitch, format_ok\n\nMAX_NEW_TOKENS = 80\n\ndef greedy_action(obs):\n    \"\"\"Greedy-decode one completion for `obs` and return parse_completion's result.\"\"\"\n    prompt = build_prompt(obs)\n    enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)\n    with torch.no_grad():\n        out = model.generate(\n            **enc, max_new_tokens=MAX_NEW_TOKENS,\n            do_sample=False, pad_token_id=tokenizer.eos_token_id,\n        )\n    completion = tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True)\n    return parse_completion(completion, obs.options)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:49:10.140073Z","iopub.execute_input":"2026-04-26T06:49:10.141152Z","iopub.status.idle":"2026-04-26T06:49:10.152522Z","shell.execute_reply.started":"2026-04-26T06:49:10.141082Z","shell.execute_reply":"2026-04-26T06:49:10.151810Z"}},"outputs":[],"execution_count":6},
{"id":"cc7f9366-807b-4779-b1a6-4bd30cb10a2c","cell_type":"markdown","source":"## 7. Episode runner","metadata":{}},{"id":"9a0e1864-d7f3-4396-968d-cdc54a628322","cell_type":"code","source":"import random, statistics, json\n\nMAX_STEPS_PER_EP = 20\n\ndef run_episode(env, seed):\n    result = env.reset(seed=seed)\n    obs = result.observation\n    ep_r, n, fmt_hits, pitch_hits = 0.0, 0, 0, 0\n    while not result.done and n < MAX_STEPS_PER_EP:\n        decision, pitch, fmt_ok = greedy_action(obs)\n        if fmt_ok: fmt_hits += 1\n        if pitch.strip(): pitch_hits += 1\n        result = env.step(BoardSimAction(decision=decision, coalition_pitch=pitch))\n        obs = result.observation\n        ep_r += float(result.reward or 0.0)\n        n += 1\n    return {\n        'final_profit': obs.state['profitability_score'],\n        'ep_reward': ep_r, 'steps': n,\n        'format_rate': fmt_hits / max(1, n), 'pitch_rate': pitch_hits / max(1, n),\n        'history': obs.state.get('history', []),\n    
}","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:49:22.139016Z","iopub.execute_input":"2026-04-26T06:49:22.139808Z","iopub.status.idle":"2026-04-26T06:49:22.146153Z","shell.execute_reply.started":"2026-04-26T06:49:22.139746Z","shell.execute_reply":"2026-04-26T06:49:22.145224Z"}},"outputs":[],"execution_count":7},{"id":"05f2da0b-9218-4bad-a27e-e2c7ed3374a5","cell_type":"markdown","source":"## 8. Baseline — base Qwen3-0.6B (no fine-tune)\nApples-to-apples reference for measuring fine-tuning lift.","metadata":{}},{"id":"e882d431-4dac-450f-b0e1-58bb8c6a2fa9","cell_type":"code","source":"BASELINE_SEEDS = list(range(50_000, 50_000 + 20)) # reduced from 100 → 20\nbase_finals, base_rewards, base_fmts, base_pitches = [], [], [], []\nwith make_env().sync() as env:\n    for i, s in enumerate(BASELINE_SEEDS):\n        r = run_episode(env, s)\n        base_finals.append(r['final_profit'])\n        base_rewards.append(r['ep_reward'])\n        base_fmts.append(r['format_rate'])\n        base_pitches.append(r['pitch_rate'])\n        if (i + 1) % 5 == 0: # changed from 10 → 5 so you still see progress\n            print(f' base Qwen3-0.6B {i+1}/{len(BASELINE_SEEDS)} profit={r[\"final_profit\"]:.1f}')\nBASELINE_MEAN_PROFIT = statistics.mean(base_finals)\nBASELINE_MEAN_REWARD = statistics.mean(base_rewards)\nprint(f'Base Qwen3-0.6B profit : {BASELINE_MEAN_PROFIT:.2f} ± {statistics.stdev(base_finals):.2f}')\nprint(f'Base Qwen3-0.6B ep rwd : {BASELINE_MEAN_REWARD:.2f} ± {statistics.stdev(base_rewards):.2f}')\nprint(f'Base format rate : {statistics.mean(base_fmts):.0%} pitch rate: {statistics.mean(base_pitches):.0%}')\nwith open(WORK_DIR / 'baseline.json', 'w') as f:\n    json.dump({'model': MODEL_NAME, 'mode': 'base_no_finetune',\n               'seeds': BASELINE_SEEDS,\n               'finals': base_finals, 'rewards': base_rewards,\n               'format_rates': base_fmts, 'pitch_rates': base_pitches}, 
f)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:49:30.441088Z","iopub.execute_input":"2026-04-26T06:49:30.441732Z","iopub.status.idle":"2026-04-26T06:54:28.368214Z","shell.execute_reply.started":"2026-04-26T06:49:30.441688Z","shell.execute_reply":"2026-04-26T06:54:28.367381Z"}},"outputs":[{"name":"stdout","text":" base Qwen3-0.6B 5/20 profit=65.5\n base Qwen3-0.6B 10/20 profit=35.4\n base Qwen3-0.6B 15/20 profit=56.3\n base Qwen3-0.6B 20/20 profit=54.6\nBase Qwen3-0.6B profit : 42.39 ± 11.50\nBase Qwen3-0.6B ep rwd : 37.52 ± 5.51\nBase format rate : 100% pitch rate: 100%\n","output_type":"stream"}],"execution_count":8},{"id":"f9459ffb-2df1-4213-ac0f-ddf7894ff1f6","cell_type":"markdown","source":"## 9. Wrap base with LoRA adapters","metadata":{}},{"id":"f3872ea7-7aaa-4cf2-9efb-332ff5847c4e","cell_type":"code","source":"model = FastLanguageModel.get_peft_model(\n model,\n r=32,\n target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],\n lora_alpha=64,\n lora_dropout=0.0, bias='none',\n use_gradient_checkpointing='unsloth',\n random_state=3407,\n)\n\ntrainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\ntotal = sum(p.numel() for p in model.parameters())\nprint(f'Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)')\n\nEVAL_SEEDS = list(range(60_000, 60_000 + 10))\n\ndef periodic_eval(env):\n profits, rewards, fmts, pitches = [], [], [], []\n for s in EVAL_SEEDS:\n r = run_episode(env, s)\n profits.append(r['final_profit']); rewards.append(r['ep_reward'])\n fmts.append(r['format_rate']); pitches.append(r['pitch_rate'])\n import numpy as np\n return {'profit_mean': float(np.mean(profits)),\n 'reward_mean': float(np.mean(rewards)),\n 'format_rate': float(np.mean(fmts)),\n 'pitch_rate': 
float(np.mean(pitches))}","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:55:28.063482Z","iopub.execute_input":"2026-04-26T06:55:28.064213Z","iopub.status.idle":"2026-04-26T06:55:35.011917Z","shell.execute_reply.started":"2026-04-26T06:55:28.064165Z","shell.execute_reply":"2026-04-26T06:55:35.011046Z"}},"outputs":[{"name":"stderr","text":"Unsloth 2026.4.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.\n","output_type":"stream"},{"name":"stdout","text":"Trainable params: 20,185,088 / 408,616,960 (4.94%)\n","output_type":"stream"}],"execution_count":9},{"id":"78bd44dc-8c1e-478f-9d0a-015dfde10bd8","cell_type":"markdown","source":"## 10. GRPO training loop","metadata":{}},{"id":"10687855-d1b2-4111-8bd1-5ecf5c65fe41","cell_type":"code","source":"# =============================================================================\n# GRPO training cell — fixed version\n#\n# Fixes:\n# 1. RuntimeError \"variable modified by an inplace operation\" on loss.backward().\n# Root cause: model.generate() leaves use_cache=True, and the subsequent\n# forward pass returns logits that share storage with KV-cache buffers,\n# which get mutated later. Fix: force use_cache=False on the training\n# forward pass, and .clone() the logits slice before computing log_softmax.\n#\n# 2. GPU OOM on cell re-run. Root cause: re-running the cell creates a fresh\n# AdamW (which holds momentum buffers ~= model size) without freeing the\n# previous one. Fix: explicit cleanup of any prior optimizer / cached\n# tensors at the top of the cell + gc + empty_cache. Model itself is NOT\n# reloaded here (load it once in an earlier cell); we just reuse it.\n#\n# 3. wandb deprecation warning for reinit=True. Use finish_previous=True only.\n# =============================================================================\n\nimport os, gc, json, time, collections\nimport torch\nfrom torch.optim import AdamW\n\n# ---- 0. 
cleanup any leftover state from previous runs of this cell ----------\n# Dropping the previous optimizer and per-step tensors lets gc / empty_cache release their\n# memory before a new optimizer is built below.\nfor _name in ('optimizer', 'gen_out', 'out', 'logits', 'loss',\n              'log_probs', 'token_nll', 'per_seq_nll', 'advantages'):\n    globals().pop(_name, None)\ngc.collect()\nif torch.cuda.is_available():\n    torch.cuda.empty_cache()\n    torch.cuda.ipc_collect()\n\n# ---- 1. config --------------------------------------------------------------\nNUM_STEPS = int(os.environ.get('NUM_STEPS', 100))\nGROUP_SIZE = int(os.environ.get('GROUP_SIZE', 4))\nSEED = int(os.environ.get('SEED', 3407))\nLR = 5e-6\nGRAD_CLIP = 1.0\nTEMPERATURE, TOP_P = 1.0, 0.95\nSAVE_EVERY = 25\nEVAL_EVERY = 25\n# Evaluate every EVAL_EVERY steps plus the final step. For the default NUM_STEPS=100 this\n# is {0, 25, 50, 75, 99}, and the schedule scales when NUM_STEPS is overridden.\nEVAL_AT = set(range(0, NUM_STEPS, EVAL_EVERY)) | {NUM_STEPS - 1}\n\n# Seed torch so the sampled rollouts are repeatable as far as the backend allows.\ntorch.manual_seed(SEED)\n\n# Critical: kill KV cache on the training forward pass.\n# generate() will still build its own cache internally; we override afterwards.\nmodel.config.use_cache = False\nif hasattr(model, 'gradient_checkpointing_disable'):\n    model.gradient_checkpointing_disable()\nmodel.train()\n\n# ---- 2. wandb ---------------------------------------------------------------\n# 'finish_previous' is a VALUE of wandb.init's `reinit` argument, not a keyword of its own.\n# Passing it as finish_previous=True raises 'unexpected keyword argument', which the except\n# below turns into a warning, leaving W&B logging switched off.\nWANDB_OK = False\nif os.environ.get('WANDB_API_KEY'):\n    try:\n        import wandb\n        wandb.init(\n            project='boardsim-qwen3-grpo',\n            # Derived from MODEL_NAME so the run name cannot drift from the model in use.\n            name='boardsim-' + MODEL_NAME.split('/')[-1].lower() + '-kaggle',\n            config={'num_steps': NUM_STEPS, 'group_size': GROUP_SIZE, 'lr': LR,\n                    'temperature': TEMPERATURE, 'top_p': TOP_P, 'model': MODEL_NAME,\n                    'seed': SEED},\n            reinit='finish_previous',\n        )\n        WANDB_OK = True\n    except Exception as e:\n        print(f'WARN: wandb.init failed: {e}')\n\n# ---- 3. optimizer (single owner, freshly built each cell run) ---------------\noptimizer = AdamW(\n    [p for p in model.parameters() if p.requires_grad],\n    lr=LR, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0,\n)\n\nlog_history, eval_history = [], []\ndecision_counter = collections.Counter()\nt0 = time.time()\n\n# ---- 4. 
training loop -------------------------------------------------------\nwith make_env().sync() as env_train, \\\n make_env().sync() as env_score, \\\n make_env().sync() as env_eval:\n\n for step in range(NUM_STEPS):\n # 4a. rollout\n result = env_train.reset(seed=step)\n obs = result.observation\n prompt = build_prompt(obs)\n enc = tokenizer(prompt, return_tensors='pt',\n truncation=True, max_length=1024).to(device)\n prompt_len = enc.input_ids.shape[1]\n\n with torch.no_grad():\n gen_out = model.generate(\n input_ids=enc.input_ids,\n attention_mask=enc.attention_mask,\n max_new_tokens=MAX_NEW_TOKENS,\n do_sample=True,\n temperature=TEMPERATURE,\n top_p=TOP_P,\n num_return_sequences=GROUP_SIZE,\n pad_token_id=tokenizer.eos_token_id,\n use_cache=True, # cache OK during generate (no_grad context)\n )\n # Detach + clone so no autograd ties to generate's internal buffers.\n gen_out = gen_out.detach().clone()\n\n # 4b. score each completion\n decisions, pitches, rewards, fmt_oks = [], [], [], []\n for g in range(GROUP_SIZE):\n comp = tokenizer.decode(gen_out[g][prompt_len:], skip_special_tokens=True)\n d, pp, ok = parse_completion(comp, obs.options)\n decisions.append(d); pitches.append(pp); fmt_oks.append(ok)\n decision_counter[d] += 1\n env_score.reset(seed=step)\n sr = env_score.step(BoardSimAction(decision=d, coalition_pitch=pp))\n rewards.append(float(sr.reward or 0.0))\n\n rewards_t = torch.tensor(rewards, dtype=torch.float32, device=device)\n if rewards_t.numel() > 1 and rewards_t.std().item() > 1e-6:\n advantages = (rewards_t - rewards_t.mean()) / (rewards_t.std() + 1e-8)\n else:\n advantages = rewards_t - rewards_t.mean()\n advantages = advantages.detach()\n\n # 4c. 
policy update — fresh forward, NO cache, clone logits\n        optimizer.zero_grad(set_to_none=True)\n\n        full_ids = gen_out\n        # generate() above pads finished sequences with eos_token_id, which need not be the\n        # tokenizer's pad_token_id, so testing `!= pad_token_id` can leave padding inside the\n        # loss. Build the mask from the completion instead: keep each token up to and\n        # including the first stop token, mask everything after it.\n        gen_cfg = getattr(model, 'generation_config', None)\n        stop_ids = getattr(gen_cfg, 'eos_token_id', None)  # int, list or None\n        if stop_ids is None:\n            stop_ids = []\n        elif isinstance(stop_ids, int):\n            stop_ids = [stop_ids]\n        stop_ids = set(stop_ids) | {tokenizer.eos_token_id}\n        comp_ids = full_ids[:, prompt_len:]\n        is_stop = torch.zeros_like(comp_ids, dtype=torch.bool)\n        for _sid in stop_ids:\n            is_stop |= comp_ids == _sid\n        # Number of stop tokens strictly before each position; 0 means still inside the answer.\n        stops_before = is_stop.long().cumsum(dim=1) - is_stop.long()\n        comp_mask = (stops_before == 0).long()\n        # The single prompt is shared by every sampled sequence, so its mask is broadcast.\n        attn = torch.cat([enc.attention_mask.expand(full_ids.size(0), -1), comp_mask], dim=1)\n        loss_mask = attn.clone()\n        loss_mask[:, :prompt_len] = 0\n\n        out = model(\n            input_ids=full_ids,\n            attention_mask=attn,\n            use_cache=False, # <-- key fix\n            return_dict=True,\n        )\n        # Clone the slice so backward sees a tensor whose storage we own.\n        logits = out.logits[:, :-1, :].float().clone()\n        targets = full_ids[:, 1:].contiguous()\n        mask = loss_mask[:, 1:].float()\n\n        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)\n        token_nll = -log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)\n        per_seq_nll = (token_nll * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)\n        loss = (advantages * per_seq_nll).mean()\n\n        loss.backward()\n        total_loss_val = float(loss.detach().item())\n\n        torch.nn.utils.clip_grad_norm_(\n            [p for p in model.parameters() if p.requires_grad], GRAD_CLIP)\n        optimizer.step()\n\n        # Free per-step graph tensors before next iter (helps on tight VRAM).\n        del out, logits, log_probs, token_nll, per_seq_nll, loss\n\n        # 4d. log\n        rec = {\n            'step': step,\n            'reward': float(rewards_t.mean().item()),\n            'reward_std': float(rewards_t.std().item()) if rewards_t.numel() > 1 else 0.0,\n            'reward_max': float(rewards_t.max().item()),\n            'loss': total_loss_val,\n            'format_rate': sum(fmt_oks) / GROUP_SIZE,\n            'pitch_rate': sum(1 for p in pitches if p.strip()) / GROUP_SIZE,\n            'elapsed_s': time.time() - t0,\n        }\n        log_history.append(rec)\n        if WANDB_OK:\n            wandb.log(rec, step=step)\n\n        if step % 5 == 0:\n            print(f\"step={step:4d} reward={rec['reward']:+.3f} (\\u00b1{rec['reward_std']:.2f}) \"\n                  f\"loss={rec['loss']:+.4f} fmt={rec['format_rate']:.0%} \"\n                  f\"elapsed={rec['elapsed_s']:.0f}s d0={decisions[0]}\")\n\n        # 4e. 
periodic eval\n if step in EVAL_AT:\n ev = periodic_eval(env_eval)\n ev['step'] = step\n eval_history.append(ev)\n print(f\" [eval@{step}] profit={ev['profit_mean']:.2f} \"\n f\"reward={ev['reward_mean']:.2f} fmt={ev['format_rate']:.0%}\")\n if WANDB_OK:\n wandb.log({f'eval/{k}': v for k, v in ev.items() if k != 'step'}, step=step)\n\n # 4f. checkpoint\n if step > 0 and step % SAVE_EVERY == 0:\n model.save_pretrained(str(CKPT))\n tokenizer.save_pretrained(str(CKPT))\n with open(WORK_DIR / 'log_history.json', 'w') as f:\n json.dump(log_history, f)\n with open(WORK_DIR / 'eval_history.json', 'w') as f:\n json.dump(eval_history, f)\n\n# ---- 5. final save ----------------------------------------------------------\nmodel.save_pretrained(str(CKPT))\ntokenizer.save_pretrained(str(CKPT))\nwith open(WORK_DIR / 'log_history.json', 'w') as f:\n json.dump(log_history, f)\nwith open(WORK_DIR / 'eval_history.json', 'w') as f:\n json.dump(eval_history, f)\nwith open(WORK_DIR / 'decision_counter.json', 'w') as f:\n json.dump(dict(decision_counter), f)\nif WANDB_OK:\n wandb.finish()\nprint(f'Training done. {len(log_history)} steps in {time.time() - t0:.0f}s. 
-> {CKPT}')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:55:41.400144Z","iopub.execute_input":"2026-04-26T06:55:41.400729Z","iopub.status.idle":"2026-04-26T07:25:44.637514Z","shell.execute_reply.started":"2026-04-26T06:55:41.400681Z","shell.execute_reply":"2026-04-26T07:25:44.636833Z"}},"outputs":[{"name":"stdout","text":"WARN: wandb.init failed: init() got an unexpected keyword argument 'finish_previous'\nUnsloth: Will smartly offload gradients to save VRAM!\nstep= 0 reward=+1.062 (±0.02) loss=-0.1849 fmt=75% elapsed=10s d0=reinvest_in_growth\n [eval@0] profit=45.36 reward=37.72 fmt=100%\nstep= 5 reward=+0.832 (±0.00) loss=+0.0000 fmt=100% elapsed=258s d0=match_offers\nstep= 10 reward=+31.031 (±0.00) loss=+0.0000 fmt=100% elapsed=296s d0=accept_acquisition\nstep= 15 reward=+23.589 (±15.04) loss=+0.0518 fmt=100% elapsed=334s d0=accept_acquisition\nstep= 20 reward=+23.590 (±15.04) loss=-0.0527 fmt=100% elapsed=371s d0=accept_acquisition\nstep= 25 reward=+31.019 (±0.03) loss=-0.1990 fmt=75% elapsed=409s d0=accept_acquisition\n [eval@25] profit=44.48 reward=37.67 fmt=100%\nstep= 30 reward=+1.070 (±0.00) loss=+0.0000 fmt=100% elapsed=660s d0=full_compliance\nstep= 35 reward=+0.990 (±0.06) loss=-0.0755 fmt=100% elapsed=698s d0=form_strategic_partnership\nstep= 40 reward=+1.002 (±0.00) loss=+0.0000 fmt=100% elapsed=736s d0=stay_independent\nstep= 45 reward=+1.147 (±0.00) loss=+0.0000 fmt=100% elapsed=774s d0=accept_deal\nstep= 50 reward=+1.206 (±0.00) loss=+0.0000 fmt=100% elapsed=811s d0=full_disclosure\n [eval@50] profit=45.06 reward=37.21 fmt=100%\nstep= 55 reward=+1.279 (±0.00) loss=+0.0000 fmt=100% elapsed=1058s d0=accept_terms\nstep= 60 reward=+1.359 (±0.00) loss=+0.0000 fmt=100% elapsed=1095s d0=accept_terms\nstep= 65 reward=+0.968 (±0.06) loss=-0.0823 fmt=100% elapsed=1133s d0=cut_prices\nstep= 70 reward=+23.529 (±15.01) loss=-0.0276 fmt=100% elapsed=1171s d0=accept_acquisition\nstep= 75 reward=+1.043 (±0.00) loss=+0.0000 fmt=100% 
elapsed=1209s d0=full_disclosure\n [eval@75] profit=44.46 reward=36.71 fmt=100%\nstep= 80 reward=+31.111 (±0.00) loss=+0.0000 fmt=100% elapsed=1456s d0=accept_acquisition\nstep= 85 reward=+1.060 (±0.04) loss=+0.0951 fmt=100% elapsed=1493s d0=reinvest_in_growth\nstep= 90 reward=+1.109 (±0.00) loss=+0.0000 fmt=100% elapsed=1531s d0=form_strategic_partnership\nstep= 95 reward=+0.951 (±0.00) loss=+0.0000 fmt=100% elapsed=1569s d0=public_apology\n [eval@99] profit=44.02 reward=35.39 fmt=100%\nTraining done. 100 steps in 1803s. -> /kaggle/working/BoardSim_Run/lora_qwen3_1p7b\n","output_type":"stream"}],"execution_count":10},{"id":"cc193532-8c6f-4203-b9b9-45140cff443c","cell_type":"code","source":"import numpy as np, matplotlib, json\nfrom scipy import stats as spstats\nfrom unsloth import FastLanguageModel\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\n\n# ── dummy baseline values (cell 8 was skipped) ───────────────────────────────\nBASELINE_MEAN_REWARD = 0.0\nBASELINE_MEAN_PROFIT = 0.0\n\n# =============================================================================\n# CELL 11 — Plots from training history\n# =============================================================================\nsteps = np.array([e['step'] for e in log_history])\nrewards = np.array([e['reward'] for e in log_history])\nlosses = np.array([e['loss'] for e in log_history])\nfmts = np.array([e['format_rate'] for e in log_history])\npitches = np.array([e['pitch_rate'] for e in log_history])\n\ndef ema(xs, alpha=0.1):\n out, s = [], xs[0] if len(xs) else 0.0\n for x in xs:\n s = alpha * x + (1 - alpha) * s\n out.append(s)\n return np.array(out)\n\nrewards_ema = ema(rewards, 0.1)\nslope, intercept, r_val, p_val, _ = spstats.linregress(steps, rewards)\n\nplt.figure(figsize=(9, 5))\nplt.plot(steps, rewards, alpha=0.3, lw=1, label='per-step group reward')\nplt.plot(steps, rewards_ema, lw=2.2, label='EMA (α=0.1)')\nplt.plot(steps, intercept + slope * steps, '--', lw=1.5,\n label=f'linear fit 
slope={slope:+.4f}/step (p={p_val:.1e})')\nplt.title('GRPO reward — BoardSim')\nplt.xlabel('step'); plt.ylabel('mean group reward')\nplt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'reward_curve.png', dpi=150); plt.close()\n\nplt.figure(figsize=(9, 5))\nplt.plot(steps, losses, lw=1.5)\nplt.title('GRPO loss (advantage × NLL)'); plt.xlabel('step'); plt.ylabel('loss')\nplt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'loss_curve.png', dpi=150); plt.close()\n\nplt.figure(figsize=(9, 5))\nplt.plot(steps, ema(fmts, 0.05), lw=2, label='format-OK rate (EMA)')\nplt.plot(steps, ema(pitches, 0.05), lw=2, label='non-empty pitch rate (EMA)')\nplt.title('Format compliance + pitch usage during training')\nplt.xlabel('step'); plt.ylabel('rate'); plt.ylim(-0.05, 1.05)\nplt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'format_compliance.png', dpi=150); plt.close()\n\nif eval_history:\n es = [e['step'] for e in eval_history]\n epm = [e['profit_mean'] for e in eval_history]\n erm = [e['reward_mean'] for e in eval_history]\n plt.figure(figsize=(9, 5))\n plt.plot(es, epm, '-o', lw=2, label='held-out profitability (mean of 10 episodes)')\n plt.plot(es, erm, '-s', lw=2, label='held-out episode reward')\n plt.title('Periodic held-out eval during training (greedy)')\n plt.xlabel('training step'); plt.ylabel('value')\n plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n plt.savefig(ASSETS / 'periodic_eval.png', dpi=150); plt.close()\n\nprint(f'Linear-fit slope on reward: {slope:+.5f}/step (p={p_val:.2e}, R²={r_val**2:.3f})')\nprint('✓ Saved reward_curve.png, loss_curve.png, format_compliance.png, periodic_eval.png')\n\n# =============================================================================\n# CELL 12 — Paired eval: fine-tuned vs base (adapter disabled)\n# =============================================================================\nFastLanguageModel.for_inference(model)\n\nEVAL_N = 20 # reduced from 
50\nPAIRED_SEEDS = list(range(70_000, 70_000 + EVAL_N))\n\ntrained_finals, trained_rewards, trained_fmt, trained_pitch = [], [], [], []\ntrained_history_per_seed = []\nwith make_env().sync() as env:\n for i, s in enumerate(PAIRED_SEEDS):\n r = run_episode(env, s)\n trained_finals.append(r['final_profit'])\n trained_rewards.append(r['ep_reward'])\n trained_fmt.append(r['format_rate'])\n trained_pitch.append(r['pitch_rate'])\n trained_history_per_seed.append(r['history'])\n if (i + 1) % 5 == 0:\n print(f' trained {i+1}/{EVAL_N} profit={r[\"final_profit\"]:.1f}')\n\nbase_finals_paired, base_rewards_paired, base_fmt_paired, base_pitch_paired = [], [], [], []\nbase_history_per_seed = []\nwith make_env().sync() as env, model.disable_adapter():\n for i, s in enumerate(PAIRED_SEEDS):\n r = run_episode(env, s)\n base_finals_paired.append(r['final_profit'])\n base_rewards_paired.append(r['ep_reward'])\n base_fmt_paired.append(r['format_rate'])\n base_pitch_paired.append(r['pitch_rate'])\n base_history_per_seed.append(r['history'])\n if (i + 1) % 5 == 0:\n print(f' base {i+1}/{EVAL_N} profit={r[\"final_profit\"]:.1f}')\n\ntf, bf = np.array(trained_finals), np.array(base_finals_paired)\ntr, br = np.array(trained_rewards), np.array(base_rewards_paired)\n\nprint(f'\\nTrained Qwen3-0.6B profit : {tf.mean():.2f} ± {tf.std():.2f}')\nprint(f'Base Qwen3-0.6B profit : {bf.mean():.2f} ± {bf.std():.2f}')\nprint(f'Trained ep reward : {tr.mean():.2f} ± {tr.std():.2f}')\nprint(f'Base ep reward : {br.mean():.2f} ± {br.std():.2f}')\nprint(f'Trained format/pitch : {np.mean(trained_fmt):.0%} / {np.mean(trained_pitch):.0%}')\nprint(f'Base format/pitch : {np.mean(base_fmt_paired):.0%} / {np.mean(base_pitch_paired):.0%}')\n\nwith open(WORK_DIR / 'eval_paired.json', 'w') as f:\n json.dump({'seeds': PAIRED_SEEDS,\n 'trained_finals': tf.tolist(), 'base_finals': bf.tolist(),\n 'trained_rewards': tr.tolist(), 'base_rewards': br.tolist(),\n 'trained_format_rate': float(np.mean(trained_fmt)),\n 
'base_format_rate': float(np.mean(base_fmt_paired)),\n 'trained_pitch_rate': float(np.mean(trained_pitch)),\n 'base_pitch_rate': float(np.mean(base_pitch_paired))}, f)\n\nprint('✓ Paired eval complete.')\n\n# =============================================================================\n# CELL 13 — Stats + before/after plots\n# =============================================================================\ndef cohen_d(a, b):\n pooled = np.sqrt(((a.std(ddof=1)**2) + (b.std(ddof=1)**2)) / 2)\n return (a.mean() - b.mean()) / (pooled + 1e-12)\n\ndef bootstrap_diff_ci(a, b, n=10_000, seed=0):\n rng = np.random.default_rng(seed)\n diffs = a - b\n boots = rng.choice(diffs, size=(n, len(diffs)), replace=True).mean(axis=1)\n return float(np.percentile(boots, 2.5)), float(np.percentile(boots, 97.5))\n\ntt = spstats.ttest_rel(tf, bf)\nuu = spstats.mannwhitneyu(tf, bf, alternative='greater')\nwilc = spstats.wilcoxon(tf, bf, alternative='greater')\nd = cohen_d(tf, bf)\nlo, hi = bootstrap_diff_ci(tf, bf)\nwin_rate = float((tf > bf).mean())\ntie_rate = float((tf == bf).mean())\n\nsummary = {\n 'baseline_model': MODEL_NAME + ' (no fine-tune)',\n 'trained_model': MODEL_NAME + ' + LoRA r=32',\n 'n': len(tf),\n 'paired_t_stat': float(tt.statistic), 'paired_t_p': float(tt.pvalue),\n 'mannwhitney_U': float(uu.statistic), 'mannwhitney_p_greater': float(uu.pvalue),\n 'wilcoxon_p_greater': float(wilc.pvalue),\n 'cohens_d': float(d),\n 'paired_diff_mean': float((tf - bf).mean()),\n 'paired_diff_95ci': [lo, hi],\n 'win_rate_trained_strictly_better': win_rate,\n 'tie_rate': tie_rate,\n}\nprint(json.dumps(summary, indent=2))\nwith open(WORK_DIR / 'stats_summary.json', 'w') as f:\n json.dump(summary, f, indent=2)\n\nbins = np.linspace(0, 100, 25)\nplt.figure(figsize=(9, 5))\nplt.hist(bf, bins=bins, alpha=0.55, color='#c44',\n label=f'Base Qwen3-0.6B (mean={bf.mean():.1f})')\nplt.hist(tf, bins=bins, alpha=0.55, color='#1d6fff',\n label=f'Fine-tuned Qwen3-0.6B 
(mean={tf.mean():.1f})')\nplt.axvline(bf.mean(), color='#c44', ls='--', lw=1.5)\nplt.axvline(tf.mean(), color='#1d6fff', ls='--', lw=1.5)\nplt.title(f'Final profitability — paired same-seed (n={len(tf)}) '\n f\"d={summary['cohens_d']:+.2f} win-rate={summary['win_rate_trained_strictly_better']:.0%}\")\nplt.xlabel('profitability score (0–100)'); plt.ylabel('episodes')\nplt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'before_after.png', dpi=150); plt.close()\n\ndiffs = tf - bf\norder = np.argsort(diffs)\nplt.figure(figsize=(9, 5))\nplt.bar(range(len(diffs)), diffs[order],\n color=['#1d6fff' if x > 0 else '#c44' for x in diffs[order]])\nplt.axhline(0, color='k', lw=0.8)\nplt.title(f'Per-seed lift (fine-tuned − base Qwen3-0.6B), sorted '\n f'mean lift = {diffs.mean():+.1f} CI=[{lo:+.1f}, {hi:+.1f}]')\nplt.xlabel('seed (sorted by lift)'); plt.ylabel('Δ profitability')\nplt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'paired_delta.png', dpi=150); plt.close()\n\nprint('✓ Saved before_after.png, paired_delta.png')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T07:26:54.243367Z","iopub.execute_input":"2026-04-26T07:26:54.243818Z","iopub.status.idle":"2026-04-26T07:38:59.431464Z","shell.execute_reply.started":"2026-04-26T07:26:54.243775Z","shell.execute_reply":"2026-04-26T07:38:59.430512Z"}},"outputs":[{"name":"stdout","text":"Linear-fit slope on reward: -0.06447/step (p=3.51e-02, R²=0.045)\n✓ Saved reward_curve.png, loss_curve.png, format_compliance.png, periodic_eval.png\n trained 5/20 profit=64.4\n trained 10/20 profit=51.8\n trained 15/20 profit=56.3\n trained 20/20 profit=36.7\n base 5/20 profit=64.4\n base 10/20 profit=56.8\n base 15/20 profit=19.7\n base 20/20 profit=64.4\n\nTrained Qwen3-0.6B profit : 45.93 ± 12.99\nBase Qwen3-0.6B profit : 47.37 ± 14.96\nTrained ep reward : 33.49 ± 9.15\nBase ep reward : 37.40 ± 10.25\nTrained format/pitch : 100% / 100%\nBase format/pitch : 100% / 100%\n✓ Paired eval 
complete.\n{\n \"baseline_model\": \"Qwen/Qwen3-0.6B (no fine-tune)\",\n \"trained_model\": \"Qwen/Qwen3-0.6B + LoRA r=32\",\n \"n\": 20,\n \"paired_t_stat\": -0.4385693543749398,\n \"paired_t_p\": 0.6659198248349103,\n \"mannwhitney_U\": 185.0,\n \"mannwhitney_p_greater\": 0.6626800641585955,\n \"wilcoxon_p_greater\": 0.863952408631937,\n \"cohens_d\": -0.10028349616417989,\n \"paired_diff_mean\": -1.4412706571796334,\n \"paired_diff_95ci\": [\n -7.3623423164764565,\n 5.200577640718865\n ],\n \"win_rate_trained_strictly_better\": 0.15,\n \"tie_rate\": 0.4\n}\n✓ Saved before_after.png, paired_delta.png\n","output_type":"stream"}],"execution_count":11},{"id":"6afa2a9b-b8c8-4df2-bc3f-fd2a8df04b38","cell_type":"markdown","source":"## 11. Plots — reward / loss / format / periodic eval","metadata":{}},{"id":"58285d42-5432-4f25-a960-cebcc2081725","cell_type":"code","source":"# import numpy as np, matplotlib\n# matplotlib.use('Agg')\n# import matplotlib.pyplot as plt\n# from scipy import stats as spstats\n\n# steps = np.array([e['step'] for e in log_history])\n# rewards = np.array([e['reward'] for e in log_history])\n# losses = np.array([e['loss'] for e in log_history])\n# fmts = np.array([e['format_rate'] for e in log_history])\n# pitches = np.array([e['pitch_rate'] for e in log_history])\n\n# def ema(xs, alpha=0.1):\n# out, s = [], xs[0] if len(xs) else 0.0\n# for x in xs:\n# s = alpha * x + (1 - alpha) * s\n# out.append(s)\n# return np.array(out)\n\n# rewards_ema = ema(rewards, 0.1)\n# slope, intercept, r_val, p_val, _ = spstats.linregress(steps, rewards)\n\n# plt.figure(figsize=(9, 5))\n# plt.plot(steps, rewards, alpha=0.3, lw=1, label='per-step group reward')\n# plt.plot(steps, rewards_ema, lw=2.2, label='EMA (\\u03b1=0.1)')\n# plt.plot(steps, intercept + slope * steps, '--', lw=1.5,\n# label=f'linear fit slope={slope:+.4f}/step (p={p_val:.1e})')\n# plt.axhline(BASELINE_MEAN_REWARD, ls=':', lw=2, color='#c44',\n# label=f'base Qwen3-1.7B baseline = 
{BASELINE_MEAN_REWARD:.2f}')\n# plt.title('GRPO reward — BoardSim (vs same model w/o fine-tuning)')\n# plt.xlabel('step'); plt.ylabel('mean group reward')\n# plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'reward_curve.png', dpi=150); plt.close()\n\n# plt.figure(figsize=(9, 5))\n# plt.plot(steps, losses, lw=1.5)\n# plt.title('GRPO loss (advantage \\u00d7 NLL)'); plt.xlabel('step'); plt.ylabel('loss')\n# plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'loss_curve.png', dpi=150); plt.close()\n\n# plt.figure(figsize=(9, 5))\n# plt.plot(steps, ema(fmts, 0.05), lw=2, label='format-OK rate (EMA)')\n# plt.plot(steps, ema(pitches, 0.05), lw=2, label='non-empty pitch rate (EMA)')\n# plt.title('Format compliance + pitch usage during training')\n# plt.xlabel('step'); plt.ylabel('rate'); plt.ylim(-0.05, 1.05)\n# plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'format_compliance.png', dpi=150); plt.close()\n\n# if eval_history:\n# es = [e['step'] for e in eval_history]\n# epm = [e['profit_mean'] for e in eval_history]\n# erm = [e['reward_mean'] for e in eval_history]\n# plt.figure(figsize=(9, 5))\n# plt.plot(es, epm, '-o', lw=2, label='held-out profitability (mean of 10 episodes)')\n# plt.plot(es, erm, '-s', lw=2, label='held-out episode reward')\n# plt.axhline(BASELINE_MEAN_PROFIT, ls=':', lw=1.5, color='#c44',\n# label=f'base Qwen3-1.7B profitability = {BASELINE_MEAN_PROFIT:.2f}')\n# plt.title('Periodic held-out eval during training (greedy)')\n# plt.xlabel('training step'); plt.ylabel('value')\n# plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'periodic_eval.png', dpi=150); plt.close()\n\n# print(f'Linear-fit slope on reward: {slope:+.5f}/step (p={p_val:.2e}, R\\u00b2={r_val**2:.3f})')\n# print('Saved reward_curve.png, loss_curve.png, format_compliance.png, 
periodic_eval.png')","metadata":{},"outputs":[],"execution_count":null},{"id":"18d3a515-e67a-47e6-8a29-1fba4757502c","cell_type":"markdown","source":"## 12. Paired same-seed eval — fine-tuned vs base model (LoRA adapter disabled)","metadata":{}},{"id":"23ab9f8e-e82e-459c-b87b-b9a824f0a4b6","cell_type":"code","source":"# from unsloth import FastLanguageModel\n# FastLanguageModel.for_inference(model)\n\n# EVAL_N = 50\n# PAIRED_SEEDS = list(range(70_000, 70_000 + EVAL_N))\n# trained_finals, trained_rewards, trained_fmt, trained_pitch = [], [], [], []\n# trained_history_per_seed = []\n# with make_env().sync() as env:\n# for i, s in enumerate(PAIRED_SEEDS):\n# r = run_episode(env, s)\n# trained_finals.append(r['final_profit'])\n# trained_rewards.append(r['ep_reward'])\n# trained_fmt.append(r['format_rate'])\n# trained_pitch.append(r['pitch_rate'])\n# trained_history_per_seed.append(r['history'])\n# if (i + 1) % 10 == 0:\n# print(f' trained {i+1}/{EVAL_N} profit={r[\"final_profit\"]:.1f}')\n\n# base_finals_paired, base_rewards_paired, base_fmt_paired, base_pitch_paired = [], [], [], []\n# base_history_per_seed = []\n# with make_env().sync() as env, model.disable_adapter():\n# for i, s in enumerate(PAIRED_SEEDS):\n# r = run_episode(env, s)\n# base_finals_paired.append(r['final_profit'])\n# base_rewards_paired.append(r['ep_reward'])\n# base_fmt_paired.append(r['format_rate'])\n# base_pitch_paired.append(r['pitch_rate'])\n# base_history_per_seed.append(r['history'])\n# if (i + 1) % 10 == 0:\n# print(f' base {i+1}/{EVAL_N} profit={r[\"final_profit\"]:.1f}')\n\n# tf, bf = np.array(trained_finals), np.array(base_finals_paired)\n# tr, br = np.array(trained_rewards), np.array(base_rewards_paired)\n\n# print(f'\\nTrained Qwen3-1.7B profit : {tf.mean():.2f} \\u00b1 {tf.std():.2f}')\n# print(f'Base Qwen3-1.7B profit : {bf.mean():.2f} \\u00b1 {bf.std():.2f}')\n# print(f'Trained ep reward : {tr.mean():.2f} \\u00b1 {tr.std():.2f}')\n# print(f'Base ep reward : {br.mean():.2f} \\u00b1 {br.std():.2f}')\n# 
print(f'Trained format/pitch : {np.mean(trained_fmt):.0%} / {np.mean(trained_pitch):.0%}')\n# print(f'Base format/pitch : {np.mean(base_fmt_paired):.0%} / {np.mean(base_pitch_paired):.0%}')\n\n# with open(WORK_DIR / 'eval_paired.json', 'w') as f:\n# json.dump({'seeds': PAIRED_SEEDS,\n# 'trained_finals': tf.tolist(), 'base_finals': bf.tolist(),\n# 'trained_rewards': tr.tolist(), 'base_rewards': br.tolist(),\n# 'trained_format_rate': float(np.mean(trained_fmt)),\n# 'base_format_rate': float(np.mean(base_fmt_paired)),\n# 'trained_pitch_rate': float(np.mean(trained_pitch)),\n# 'base_pitch_rate': float(np.mean(base_pitch_paired))}, f)","metadata":{},"outputs":[],"execution_count":null},{"id":"48396072-e570-4d07-9e7a-3c58d3b5bb3b","cell_type":"markdown","source":"## 13. Stats summary + before/after plots","metadata":{}},{"id":"b16858c8-5d75-4413-8c58-57b908f5bf48","cell_type":"code","source":"# from scipy import stats as spstats\n\n# def cohen_d(a, b):\n# pooled = np.sqrt(((a.std(ddof=1)**2) + (b.std(ddof=1)**2)) / 2)\n# return (a.mean() - b.mean()) / (pooled + 1e-12)\n\n# def bootstrap_diff_ci(a, b, n=10_000, seed=0):\n# rng = np.random.default_rng(seed)\n# diffs = a - b\n# boots = rng.choice(diffs, size=(n, len(diffs)), replace=True).mean(axis=1)\n# return float(np.percentile(boots, 2.5)), float(np.percentile(boots, 97.5))\n\n# tt = spstats.ttest_rel(tf, bf)\n# uu = spstats.mannwhitneyu(tf, bf, alternative='greater')\n# wilc = spstats.wilcoxon(tf, bf, alternative='greater')\n# d = cohen_d(tf, bf)\n# lo, hi = bootstrap_diff_ci(tf, bf)\n# win_rate = float((tf > bf).mean())\n# tie_rate = float((tf == bf).mean())\n\n# summary = {\n# 'baseline_model': MODEL_NAME + ' (no fine-tune)',\n# 'trained_model': MODEL_NAME + ' + LoRA r=32',\n# 'n': len(tf),\n# 'paired_t_stat': float(tt.statistic), 'paired_t_p': float(tt.pvalue),\n# 'mannwhitney_U': float(uu.statistic), 'mannwhitney_p_greater': float(uu.pvalue),\n# 'wilcoxon_p_greater': float(wilc.pvalue),\n# 'cohens_d': float(d),\n# 
'paired_diff_mean': float((tf - bf).mean()),\n# 'paired_diff_95ci': [lo, hi],\n# 'win_rate_trained_strictly_better': win_rate,\n# 'tie_rate': tie_rate,\n# }\n# print(json.dumps(summary, indent=2))\n# with open(WORK_DIR / 'stats_summary.json', 'w') as f:\n# json.dump(summary, f, indent=2)\n\n# bins = np.linspace(0, 100, 25)\n# plt.figure(figsize=(9, 5))\n# plt.hist(bf, bins=bins, alpha=0.55, color='#c44',\n# label=f'Base Qwen3-1.7B (mean={bf.mean():.1f})')\n# plt.hist(tf, bins=bins, alpha=0.55, color='#1d6fff',\n# label=f'Fine-tuned Qwen3-1.7B (mean={tf.mean():.1f})')\n# plt.axvline(bf.mean(), color='#c44', ls='--', lw=1.5)\n# plt.axvline(tf.mean(), color='#1d6fff', ls='--', lw=1.5)\n# plt.title(f'Final profitability — paired same-seed (n={len(tf)}) '\n# f\"d={summary['cohens_d']:+.2f} win-rate={summary['win_rate_trained_strictly_better']:.0%}\")\n# plt.xlabel('profitability score (0\\u2013100)'); plt.ylabel('episodes')\n# plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'before_after.png', dpi=150); plt.close()\n\n# diffs = tf - bf\n# order = np.argsort(diffs)\n# plt.figure(figsize=(9, 5))\n# plt.bar(range(len(diffs)), diffs[order],\n# color=['#1d6fff' if x > 0 else '#c44' for x in diffs[order]])\n# plt.axhline(0, color='k', lw=0.8)\n# plt.title(f'Per-seed lift (fine-tuned \\u2212 base Qwen3-1.7B), sorted '\n# f'mean lift = {diffs.mean():+.1f} CI=[{summary[\"paired_diff_95ci\"][0]:+.1f}, {summary[\"paired_diff_95ci\"][1]:+.1f}]')\n# plt.xlabel('seed (sorted by lift)'); plt.ylabel('\\u0394 profitability')\n# plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'paired_delta.png', dpi=150); plt.close()\n# print('Saved before_after.png, paired_delta.png')","metadata":{},"outputs":[],"execution_count":null},{"id":"008c5d7a-a80f-4f5e-a2bd-85fefd0cbf2d","cell_type":"code","source":"import collections\nfrom huggingface_hub import HfApi\n\n# =============================================================================\n# CELL 14 — 
Per-event win-rate breakdown\n# =============================================================================\ndef per_event_winrate(history_per_seed):\n bucket = collections.defaultdict(lambda: [0, 0])\n for hist in history_per_seed:\n for rd in hist:\n t = rd.get('event_title', '?')\n bucket[t][1] += 1\n if rd.get('agent_won_vote'):\n bucket[t][0] += 1\n return {t: (w / max(1, n)) for t, (w, n) in bucket.items()}\n\ntrained_wr = per_event_winrate(trained_history_per_seed)\nbase_wr = per_event_winrate(base_history_per_seed)\n\nevents_sorted = sorted(set(trained_wr) | set(base_wr))\ntw = [trained_wr.get(e, 0.0) for e in events_sorted]\nbw = [base_wr.get(e, 0.0) for e in events_sorted]\n\nplt.figure(figsize=(11, 5))\nx = np.arange(len(events_sorted))\nplt.bar(x - 0.2, bw, width=0.4, color='#c44', label='Base Qwen3-0.6B')\nplt.bar(x + 0.2, tw, width=0.4, color='#1d6fff', label='Fine-tuned Qwen3-0.6B')\nplt.xticks(x, [e[:22] for e in events_sorted], rotation=30, ha='right')\nplt.ylim(0, 1.05); plt.ylabel('boardroom win rate')\nplt.title('Per-event boardroom win rate (paired seeds, n=20 episodes)') # updated n\nplt.legend(); plt.grid(alpha=0.3, axis='y'); plt.tight_layout()\nplt.savefig(ASSETS / 'per_event_winrate.png', dpi=150); plt.close()\n\nwith open(WORK_DIR / 'per_event_winrate.json', 'w') as f:\n json.dump({'events': events_sorted, 'trained': tw, 'base': bw}, f, indent=2)\nprint('✓ Saved per_event_winrate.png')\n\n# =============================================================================\n# CELL 15 — Theory-of-Mind probe\n# =============================================================================\nTOM_INSTRUCTION = (\n \"\\n\\nGiven the state and event below, name the SINGLE board member \"\n \"(CTO, CFO, Investor Rep, or Independent) most likely to oppose the chosen decision. 
\"\n \"Answer with just the role name on one line.\\n\"\n)\n\ndef tom_predict(obs, decision):\n body = build_prompt(obs).split(SYSTEM_PROMPT, 1)[1]\n prompt = SYSTEM_PROMPT + TOM_INSTRUCTION + body + f'Chosen decision: {decision}\\nMost likely opponent: '\n enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)\n with torch.no_grad(): # fixed broken hyperlink\n out = model.generate(**enc, max_new_tokens=8, do_sample=False,\n pad_token_id=tokenizer.eos_token_id)\n txt = tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True).lower()\n if 'investor' in txt: return 'Investor Rep'\n if 'independent' in txt: return 'Independent'\n if 'cto' in txt: return 'CTO'\n if 'cfo' in txt: return 'CFO'\n return None\n\ndef tom_eval(seed_base=80_000, n=20): # reduced from 40 → 20\n correct = total = 0\n with make_env().sync() as env:\n for ep in range(n):\n result = env.reset(seed=seed_base + ep)\n obs = result.observation\n decision, _, _ = greedy_action(obs)\n opposed = [s['role'] for s in obs.npc_statements if s['vote'] != decision]\n if not opposed:\n continue\n pred = tom_predict(obs, decision)\n if pred and pred in opposed:\n correct += 1\n total += 1\n return correct, total\n\nt_corr, t_tot = tom_eval()\nwith model.disable_adapter():\n b_corr, b_tot = tom_eval()\n\ntom_acc = t_corr / max(1, t_tot)\ntom_acc_base = b_corr / max(1, b_tot)\nprint(f'ToM probe: trained = {tom_acc:.1%} ({t_corr}/{t_tot}) base = {tom_acc_base:.1%} ({b_corr}/{b_tot})')\n\nwith open(WORK_DIR / 'tom.json', 'w') as f:\n json.dump({'trained': {'correct': t_corr, 'total': t_tot, 'accuracy': tom_acc},\n 'base': {'correct': b_corr, 'total': b_tot, 'accuracy': tom_acc_base}}, f)\n\nprint('✓ Theory-of-Mind probe complete.')\n\n# =============================================================================\n# CELL 16 — Push to HF Hub\n# =============================================================================\nADAPTER_REPO = 
os.environ.get('ADAPTER_REPO', 'StavanKhobare/Qwen3-0.6B-Final-LoRA')\nMERGED_REPO = os.environ.get('MERGED_REPO', 'StavanKhobare/Qwen3-0.6B-Final-Merged16bit')\napi = HfApi()\napi.create_repo(ADAPTER_REPO, repo_type='model', private=False, exist_ok=True)\napi.create_repo(MERGED_REPO, repo_type='model', private=False, exist_ok=True)\n\ntry:\n model.push_to_hub(ADAPTER_REPO, private=False)\n tokenizer.push_to_hub(ADAPTER_REPO, private=False)\n print(f'✓ LoRA pushed: https://huggingface.co/{ADAPTER_REPO}')\nexcept Exception as e:\n print(f'LoRA push failed: {e!r}')\n\ntry:\n model.push_to_hub_merged(MERGED_REPO, tokenizer, save_method='merged_16bit', private=False)\n print(f'✓ Merged 16-bit pushed: https://huggingface.co/{MERGED_REPO}')\nexcept Exception as e:\n print(f'Merged push failed (you can retry): {e!r}')\n\ntry:\n api.upload_folder(folder_path=str(ASSETS), repo_id=ADAPTER_REPO,\n path_in_repo='assets', repo_type='model')\n for fname in ['log_history.json', 'eval_history.json', 'eval_paired.json',\n 'stats_summary.json', 'tom.json', 'transcripts.json',\n 'decision_counter.json', 'baseline.json',\n 'per_event_winrate.json']:\n fp = WORK_DIR / fname\n if fp.exists(): # safely skips baseline.json since you didn't run cell 8\n api.upload_file(path_or_fileobj=str(fp), path_in_repo=fname,\n repo_id=ADAPTER_REPO, repo_type='model')\n print(f'✓ Artifacts uploaded to https://huggingface.co/{ADAPTER_REPO}')\nexcept Exception as e:\n print(f'Artifact upload failed: {e!r}')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T07:58:28.682945Z","iopub.execute_input":"2026-04-26T07:58:28.683870Z","iopub.status.idle":"2026-04-26T08:01:57.425478Z","shell.execute_reply.started":"2026-04-26T07:58:28.683820Z","shell.execute_reply":"2026-04-26T08:01:57.424531Z"}},"outputs":[{"name":"stdout","text":"✓ Saved per_event_winrate.png\nToM probe: trained = 5.6% (1/18) base = 0.0% (0/18)\n✓ Theory-of-Mind probe 
complete.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"README.md: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"38d39b0613184b0887ee150cc0f67224"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Processing Files (0 / 0): | | 0.00B / 0.00B ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"62992fd6706a401589494e51ab877eca"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"New Data Upload: | | 0.00B / 0.00B ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3bff6d67c77446a5980e84c760cc08dd"}},"metadata":{}},{"name":"stdout","text":"Saved model to https://huggingface.co/StavanKhobare/neuraledge-boardroom-qwen3-lora\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Processing Files (0 / 0): | | 0.00B / 0.00B ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"82feb04ced9e4236b50b79ffcc700ab2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"New Data Upload: | | 0.00B / 0.00B ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"58bebf6d06e4483b9098105dc42aacbb"}},"metadata":{}},{"name":"stdout","text":"✓ LoRA pushed: https://huggingface.co/StavanKhobare/neuraledge-boardroom-qwen3-lora\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"config.json: 0%| | 0.00/752 [00:00 base): {summary[\"win_rate_trained_strictly_better\"]:.0%}')\nprint(f'ToM probe fine-tuned : {tom_acc:.0%} base = {tom_acc_base:.0%}')\nprint(f'Decision entropy : {entropy:.2f} / {max_ent:.2f} (\\u2192 not collapsed)')\nprint('-'*70)\nprint(f'Adapter : https://huggingface.co/{ADAPTER_REPO}')\nprint(f'Merged 16bit : https://huggingface.co/{MERGED_REPO}')\nprint(f'Env Space : 
{ENV_BASE_URL}')\nprint('='*70)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:09:32.888470Z","iopub.execute_input":"2026-04-26T08:09:32.889230Z","iopub.status.idle":"2026-04-26T08:09:32.897862Z","shell.execute_reply.started":"2026-04-26T08:09:32.889184Z","shell.execute_reply":"2026-04-26T08:09:32.897178Z"}},"outputs":[{"name":"stdout","text":"======================================================================\nBOARDSIM × QWEN3-1.7B — LEARNING EVIDENCE\n======================================================================\nReward slope (linear fit) : -0.06447/step (p=3.51e-02)\nReward EMA first 20 steps : +5.048\nReward EMA last 20 steps : +3.684\nFormat compliance start : 91%\nFormat compliance end : 99%\n----------------------------------------------------------------------\nHeld-out paired (n=20): fine-tuned 45.93 vs base 47.37\n paired t-test p=6.66e-01 Wilcoxon p=8.64e-01\n Cohen d=-0.10 95% CI of lift = [-7.36, +5.20]\n win rate (fine-tuned > base): 15%\nToM probe fine-tuned : 6% base = 0%\nDecision entropy : 2.76 / 3.18 (→ not collapsed)\n----------------------------------------------------------------------\nAdapter : https://huggingface.co/StavanKhobare/neuraledge-boardroom-qwen3-lora\nMerged 16bit : https://huggingface.co/StavanKhobare/Qwen3-0.6B-Final-Merged16bit\nEnv Space : https://stavankhobare-sst-metaxpytorch-hackathon.hf.space\n======================================================================\n","output_type":"stream"}],"execution_count":14}]}