File size: 76,662 Bytes
312c390
1
{"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.12.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":31329,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":5,"nbformat":4,"cells":[{"id":"1df1cb2a-0c95-4ee5-a997-dc73db3d8b49","cell_type":"markdown","source":"# BoardSim × Qwen3-0.6B — GRPO LoRA fine-tune (Kaggle edition)\n\nRuns on Kaggle GPUs (T4 x2 or P100). Enable: **Settings → Accelerator: GPU**, **Internet: On**.\n\nAdd Kaggle Secrets (Add-ons → Secrets):\n- `HF_TOKEN` (required)\n- `WANDB_API_KEY` (optional)\n- `ENV_BASE_URL` (optional, defaults to public HF Space)\n- `ADAPTER_REPO`, `MERGED_REPO` (optional)","metadata":{}},{"id":"6dbca818-8816-4674-ad2f-995a85afa322","cell_type":"markdown","source":"## 1. 
Install deps (unsloth FIRST — patches torch/transformers at import)","metadata":{}},{"id":"4c138c25-d385-4346-ae41-ce47dd39c670","cell_type":"code","source":"%pip install -q --no-deps unsloth\n%pip install -q unsloth_zoo\n%pip install -q \"openenv-core==0.2.3\" \"trl>=0.12,<2.0\" \"transformers>=4.45,<5.0\" \\\n    \"datasets>=3.0\" \"accelerate>=1.0\" \"huggingface_hub>=0.25\" \"pydantic>=2.0\" \\\n    wandb matplotlib python-dotenv bitsandbytes scipy scikit-learn sentence-transformers","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:45:07.055736Z","iopub.execute_input":"2026-04-26T06:45:07.056326Z","iopub.status.idle":"2026-04-26T06:45:16.564270Z","shell.execute_reply.started":"2026-04-26T06:45:07.056292Z","shell.execute_reply":"2026-04-26T06:45:16.563185Z"}},"outputs":[{"name":"stdout","text":"Note: you may need to restart the kernel to use updated packages.\nNote: you may need to restart the kernel to use updated packages.\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}],"execution_count":1},{"id":"3d31460e-0652-479a-b7c2-f4d98f5e721b","cell_type":"markdown","source":"## 2. 
Auth — Kaggle Secrets → env vars → HF / W&B login","metadata":{}},{"id":"1fc3b887-057d-4faf-a779-d22edd23e28d","cell_type":"code","source":"import os, pathlib\n\nIN_KAGGLE = os.path.isdir('/kaggle')\n\n# Kaggle Secrets first\nif IN_KAGGLE:\n    try:\n        from kaggle_secrets import UserSecretsClient\n        usc = UserSecretsClient()\n        for k in ('HF_TOKEN', 'WANDB_API_KEY', 'ENV_BASE_URL', 'ADAPTER_REPO', 'MERGED_REPO'):\n            try:\n                v = usc.get_secret(k)\n                if v:\n                    os.environ.setdefault(k, v)\n            except Exception:\n                pass\n    except Exception as e:\n        print(f'kaggle_secrets unavailable: {e}')\n\n# .env fallback\ntry:\n    from dotenv import load_dotenv\n    for p in [pathlib.Path('.env'), pathlib.Path('../.env'),\n              pathlib.Path('/kaggle/working/.env')]:\n        if p.exists():\n            load_dotenv(p, override=False)\n            print(f'Loaded env from {p.resolve()}')\n            break\nexcept Exception:\n    pass\n\nif not os.environ.get('HF_TOKEN'):\n    os.environ['HF_TOKEN'] = input('HF token: ').strip()\nif not os.environ.get('WANDB_API_KEY'):\n    os.environ['WANDB_API_KEY'] = input('WandB key (or blank to skip): ').strip()\n\nfrom huggingface_hub import login as hf_login\nhf_login(token=os.environ['HF_TOKEN'], add_to_git_credential=False)\nprint('HF auth ok.')\nif os.environ.get('WANDB_API_KEY'):\n    import wandb\n    wandb.login(key=os.environ['WANDB_API_KEY'])\n    print('W&B auth ok.')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:45:23.675886Z","iopub.execute_input":"2026-04-26T06:45:23.676245Z","iopub.status.idle":"2026-04-26T06:45:33.147950Z","shell.execute_reply.started":"2026-04-26T06:45:23.676209Z","shell.execute_reply":"2026-04-26T06:45:33.147370Z"}},"outputs":[{"name":"stderr","text":"Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just 
configured.\n","output_type":"stream"},{"name":"stdout","text":"HF auth ok.\n","output_type":"stream"},{"name":"stderr","text":"/usr/local/lib/python3.12/dist-packages/notebook/notebookapp.py:191: SyntaxWarning: invalid escape sequence '\\/'\n  | |_| | '_ \\/ _` / _` |  _/ -_)\n\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n\u001b[34m\u001b[1mwandb\u001b[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.\n\u001b[34m\u001b[1mwandb\u001b[0m: No netrc file found, creating one.\n\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mstavanrkhobare\u001b[0m (\u001b[33mstavanrkhobare-r-v-college-of-engineering\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n","output_type":"stream"},{"name":"stdout","text":"W&B auth ok.\n","output_type":"stream"}],"execution_count":2},{"id":"6f4139aa-d782-49b7-a1dc-e617ca26e305","cell_type":"markdown","source":"## 3. 
Working dirs (Kaggle uses `/kaggle/working` — persists as notebook output)","metadata":{}},{"id":"3499d810-5d4d-48a2-9754-7a071be8f619","cell_type":"code","source":"import pathlib\n\nif IN_KAGGLE:\n    WORK_DIR = pathlib.Path('/kaggle/working/BoardSim_Run')\nelse:\n    WORK_DIR = pathlib.Path('./BoardSim_Run')\nWORK_DIR.mkdir(parents=True, exist_ok=True)\nASSETS = WORK_DIR / 'assets'; ASSETS.mkdir(exist_ok=True)\nCKPT   = WORK_DIR / 'lora_qwen3_1p7b'; CKPT.mkdir(exist_ok=True)\nprint('WORK_DIR =', WORK_DIR)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:47:04.590428Z","iopub.execute_input":"2026-04-26T06:47:04.591388Z","iopub.status.idle":"2026-04-26T06:47:04.597329Z","shell.execute_reply.started":"2026-04-26T06:47:04.591355Z","shell.execute_reply":"2026-04-26T06:47:04.596544Z"}},"outputs":[{"name":"stdout","text":"WORK_DIR = /kaggle/working/BoardSim_Run\n","output_type":"stream"}],"execution_count":3},{"id":"3c8e4b29-bf64-4a09-95df-6330427a5eed","cell_type":"markdown","source":"## 4. 
Clone repo + connect to BoardSim env","metadata":{}},{"id":"34fa6345-2ef6-48c2-ac33-b1322e4f7965","cell_type":"code","source":"import os, sys, subprocess, urllib.request, json as _json\n\nENV_BASE_URL = os.environ.get('ENV_BASE_URL',\n    'https://stavankhobare-sst-metaxpytorch-hackathon.hf.space')\nREPO_URL = 'https://github.com/StavanRKhobare/SST-MetaxPyTorch-Hackathon'\n\nREPO_DIR = '/kaggle/working/repo' if IN_KAGGLE else os.path.abspath('./repo')\nif not os.path.isdir(os.path.join(REPO_DIR, '.git')):\n    subprocess.run(['git', 'clone', '--depth', '1', REPO_URL, REPO_DIR], check=True)\nelse:\n    subprocess.run(['git', '-C', REPO_DIR, 'pull', '--ff-only'], check=False)\n\nENVS_DIR = os.path.join(REPO_DIR, 'envs')\nif ENVS_DIR not in sys.path:\n    sys.path.insert(0, ENVS_DIR)\n\nfor mod in [m for m in list(sys.modules) if m == 'board_sim_env' or m.startswith('board_sim_env.')]:\n    del sys.modules[mod]\n\nfrom board_sim_env.client import BoardSimEnv\nfrom board_sim_env.models import BoardSimAction, BoardSimObservation\n\ntry:\n    with urllib.request.urlopen(f'{ENV_BASE_URL.rstrip(\"/\")}/health', timeout=20) as r:\n        h = _json.loads(r.read())\n        print('health:', h)\nexcept Exception as e:\n    print(f'WARN: could not reach {ENV_BASE_URL}/health  ({e})')\n\ndef make_env():\n    return BoardSimEnv(base_url=ENV_BASE_URL)\n\nprint('BoardSimEnv ready.')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:47:29.780919Z","iopub.execute_input":"2026-04-26T06:47:29.781827Z","iopub.status.idle":"2026-04-26T06:47:37.242897Z","shell.execute_reply.started":"2026-04-26T06:47:29.781795Z","shell.execute_reply":"2026-04-26T06:47:37.242018Z"}},"outputs":[{"name":"stderr","text":"Cloning into '/kaggle/working/repo'...\n","output_type":"stream"},{"name":"stdout","text":"health: {'status': 'healthy'}\nBoardSimEnv 
ready.\n","output_type":"stream"}],"execution_count":4},{"id":"89e49517-7bf8-43fc-a820-b82c114c3aa3","cell_type":"markdown","source":"## 5. Load Qwen3-0.6B in 4-bit via Unsloth","metadata":{}},{"id":"cc858a70-2b6e-49c6-b98d-5699eb3ec450","cell_type":"code","source":"import unsloth  # noqa: F401\nfrom unsloth import FastLanguageModel\nimport torch, re\n\nMODEL_NAME  = 'Qwen/Qwen3-0.6B'\nMAX_SEQ_LEN = 2048\n\nmodel, tokenizer = FastLanguageModel.from_pretrained(\n    model_name=MODEL_NAME,\n    max_seq_length=MAX_SEQ_LEN,\n    load_in_4bit=True,\n    dtype=None,\n)\nif tokenizer.pad_token is None:\n    tokenizer.pad_token = tokenizer.eos_token\n\ndevice = next(model.parameters()).device\nprint(f'Loaded {MODEL_NAME} on {device}.')\nif torch.cuda.is_available():\n    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9\n    mem_gb   = torch.cuda.memory_allocated() / 1e9\n    print(f'GPU memory after base load: {mem_gb:.2f} GB / {total_gb:.2f} GB')\n    print(f'Headroom for compute:       {total_gb - mem_gb:.2f} GB')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:47:59.583329Z","iopub.execute_input":"2026-04-26T06:47:59.584074Z","iopub.status.idle":"2026-04-26T06:48:54.637486Z","shell.execute_reply.started":"2026-04-26T06:47:59.584032Z","shell.execute_reply":"2026-04-26T06:48:54.636739Z"}},"outputs":[{"name":"stdout","text":"🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n","output_type":"stream"},{"name":"stderr","text":"2026-04-26 06:48:09.793138: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\nWARNING: All log messages before absl::InitializeLog() is called are written to STDERR\nE0000 00:00:1777186089.964130     137 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\nE0000 
00:00:1777186090.018837     137 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\nW0000 00:00:1777186090.438874     137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1777186090.438920     137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1777186090.438923     137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\nW0000 00:00:1777186090.438926     137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n","output_type":"stream"},{"name":"stdout","text":"🦥 Unsloth Zoo will now patch everything to make training faster!\n==((====))==  Unsloth 2026.4.8: Fast Qwen3 patching. Transformers: 4.57.6.\n   \\\\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.\nO^O/ \\_/ \\    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0\n\\        /    Bfloat16 = FALSE. FA [Xformers = None. 
FA2 = False]\n \"-____-\"     Free license: http://github.com/unslothai/unsloth\nUnsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"model.safetensors:   0%|          | 0.00/576M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f4d30c9bb3c544709c132f5f772c0f2c"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d56a9314279d47d684f676dfab8dd820"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"2ce6008eed824df098f00acac3ae395c"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"vocab.json: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3c2e3321f6d94ede87ab3d1fd62c8200"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"merges.txt: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"dca610c0b9c942468f132d897d05f9e9"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a6c50695406c42639111f46522cd2466"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e0fd7d77c3cf42e0ac7525c83e491b4f"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer.json:   
0%|          | 0.00/11.4M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c16044082e094b3dbfd9a0c0cba2f7f1"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"chat_template.jinja: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"526237c688f044f3bf59fb6a80354dc5"}},"metadata":{}},{"name":"stdout","text":"unsloth/qwen3-0.6b-unsloth-bnb-4bit does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.\nLoaded Qwen/Qwen3-0.6B on cuda:0.\nGPU memory after base load: 0.60 GB / 15.64 GB\nHeadroom for compute:       15.03 GB\n","output_type":"stream"}],"execution_count":5},{"id":"db967c61-6f66-4c56-919e-49c564113cd2","cell_type":"markdown","source":"## 6. Prompt + parser + greedy action helper","metadata":{}},{"id":"1f92e995-14c1-4b5f-abf1-50fc00081343","cell_type":"code","source":"SYSTEM_PROMPT = \"\"\"You are the CEO of a mid-stage organization. Your board has 4 members with HIDDEN AGENDAS you cannot see directly:\n  - CTO: cares about operational excellence, engineering quality, team morale, and product readiness.\n  - CFO: cares about cash discipline, runway, and regulatory safety.\n  - Investor Rep: pushes growth, market share, and bold returns.\n  - Independent: cares about reputation, governance, and long-term consensus.\n\nEach round you see a strategic event, every NPC's pre-vote statement, and 3 options.\nYour decision is resolved by WEIGHTED VOTE (your weight 2.5x). 
A short COALITION PITCH\nthat is semantically aligned with opposing members' priorities can swing them toward your pick —\nwrite substantive arguments, not just buzzwords.\n\nRespond in EXACTLY this format on two lines:\nDECISION: <one of the option strings>\nPITCH: <one or two sentences arguing for it, addressing the concerns of opposing members>\"\"\"\n\nDECISION_RE = re.compile(r'DECISION\\s*:\\s*([A-Za-z0-9_\\- ]+)', re.IGNORECASE)\nPITCH_RE    = re.compile(r'PITCH\\s*:\\s*(.+)', re.IGNORECASE)\n\ndef build_prompt(obs):\n    statements = '\\n'.join(\n        f\"  {s['role']} ({s['confidence']:.2f}): votes {s['vote']} - {s['statement']}\"\n        for s in obs.npc_statements\n    )\n    return (\n        f\"{SYSTEM_PROMPT}\\n\\n\"\n        f\"State: revenue=${obs.state['revenue']:.0f}/yr  burn=${obs.state['burn_rate']:.0f}/mo  \"\n        f\"runway={obs.state['runway_months']:.1f}mo  morale={obs.state['team_morale']:.2f}  \"\n        f\"investors={obs.state['investor_confidence']:.2f}  reg_risk={obs.state['regulatory_risk']:.2f}\\n\"\n        f\"Event: {obs.event}\\nBoard:\\n{statements}\\n\"\n        f\"Options: {obs.options}\\n\"\n    )\n\ndef parse_completion(completion: str, options):\n    decision = options[0]\n    decision_ok = False\n    dm = DECISION_RE.search(completion)\n    if dm:\n        cand = dm.group(1).strip().lower()\n        for opt in options:\n            if opt.lower() == cand or opt.lower() in cand:\n                decision = opt; decision_ok = True; break\n    if not decision_ok:\n        for opt in options:\n            if opt.lower() in completion.lower():\n                decision = opt; break\n    pm = PITCH_RE.search(completion)\n    pitch = pm.group(1).strip()[:400] if pm else ''\n    format_ok = bool(dm) and bool(pm)\n    return decision, pitch, format_ok\n\nMAX_NEW_TOKENS = 80\n\ndef greedy_action(obs):\n    prompt = build_prompt(obs)\n    enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)\n 
   with torch.no_grad():\n        out = model.generate(\n            **enc, max_new_tokens=MAX_NEW_TOKENS,\n            do_sample=False, pad_token_id=tokenizer.eos_token_id,\n        )\n    completion = tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True)\n    return parse_completion(completion, obs.options)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:49:10.140073Z","iopub.execute_input":"2026-04-26T06:49:10.141152Z","iopub.status.idle":"2026-04-26T06:49:10.152522Z","shell.execute_reply.started":"2026-04-26T06:49:10.141082Z","shell.execute_reply":"2026-04-26T06:49:10.151810Z"}},"outputs":[],"execution_count":6},{"id":"cc7f9366-807b-4779-b1a6-4bd30cb10a2c","cell_type":"markdown","source":"## 7. Episode runner","metadata":{}},{"id":"9a0e1864-d7f3-4396-968d-cdc54a628322","cell_type":"code","source":"import random, statistics, json\n\nMAX_STEPS_PER_EP = 20\n\ndef run_episode(env, seed):\n    result = env.reset(seed=seed)\n    obs = result.observation\n    ep_r, n, fmt_hits, pitch_hits = 0.0, 0, 0, 0\n    while not result.done and n < MAX_STEPS_PER_EP:\n        decision, pitch, fmt_ok = greedy_action(obs)\n        if fmt_ok: fmt_hits += 1\n        if pitch.strip(): pitch_hits += 1\n        result = env.step(BoardSimAction(decision=decision, coalition_pitch=pitch))\n        obs = result.observation\n        ep_r += float(result.reward or 0.0)\n        n += 1\n    return {\n        'final_profit': obs.state['profitability_score'],\n        'ep_reward': ep_r, 'steps': n,\n        'format_rate': fmt_hits / max(1, n), 'pitch_rate': pitch_hits / max(1, n),\n        'history': obs.state.get('history', []),\n    
}","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:49:22.139016Z","iopub.execute_input":"2026-04-26T06:49:22.139808Z","iopub.status.idle":"2026-04-26T06:49:22.146153Z","shell.execute_reply.started":"2026-04-26T06:49:22.139746Z","shell.execute_reply":"2026-04-26T06:49:22.145224Z"}},"outputs":[],"execution_count":7},{"id":"05f2da0b-9218-4bad-a27e-e2c7ed3374a5","cell_type":"markdown","source":"## 8. Baseline — base Qwen3-0.6B (no fine-tune)\nApples-to-apples reference for measuring fine-tuning lift.","metadata":{}},{"id":"e882d431-4dac-450f-b0e1-58bb8c6a2fa9","cell_type":"code","source":"BASELINE_SEEDS = list(range(50_000, 50_000 + 20))  # reduced from 100 → 20\nbase_finals, base_rewards, base_fmts, base_pitches = [], [], [], []\nwith make_env().sync() as env:\n    for i, s in enumerate(BASELINE_SEEDS):\n        r = run_episode(env, s)\n        base_finals.append(r['final_profit'])\n        base_rewards.append(r['ep_reward'])\n        base_fmts.append(r['format_rate'])\n        base_pitches.append(r['pitch_rate'])\n        if (i + 1) % 5 == 0:  # changed from 10 → 5 so you still see progress\n            print(f'  base Qwen3-0.6B {i+1}/{len(BASELINE_SEEDS)}  profit={r[\"final_profit\"]:.1f}')\nBASELINE_MEAN_PROFIT = statistics.mean(base_finals)\nBASELINE_MEAN_REWARD = statistics.mean(base_rewards)\nprint(f'Base Qwen3-0.6B profit  : {BASELINE_MEAN_PROFIT:.2f} ± {statistics.stdev(base_finals):.2f}')\nprint(f'Base Qwen3-0.6B ep rwd  : {BASELINE_MEAN_REWARD:.2f} ± {statistics.stdev(base_rewards):.2f}')\nprint(f'Base format rate        : {statistics.mean(base_fmts):.0%}   pitch rate: {statistics.mean(base_pitches):.0%}')\nwith open(WORK_DIR / 'baseline.json', 'w') as f:\n    json.dump({'model': MODEL_NAME, 'mode': 'base_no_finetune',\n               'seeds': BASELINE_SEEDS,\n               'finals': base_finals, 'rewards': base_rewards,\n               'format_rates': base_fmts, 'pitch_rates': base_pitches}, 
f)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:49:30.441088Z","iopub.execute_input":"2026-04-26T06:49:30.441732Z","iopub.status.idle":"2026-04-26T06:54:28.368214Z","shell.execute_reply.started":"2026-04-26T06:49:30.441688Z","shell.execute_reply":"2026-04-26T06:54:28.367381Z"}},"outputs":[{"name":"stdout","text":"  base Qwen3-0.6B 5/20  profit=65.5\n  base Qwen3-0.6B 10/20  profit=35.4\n  base Qwen3-0.6B 15/20  profit=56.3\n  base Qwen3-0.6B 20/20  profit=54.6\nBase Qwen3-0.6B profit  : 42.39 ± 11.50\nBase Qwen3-0.6B ep rwd  : 37.52 ± 5.51\nBase format rate        : 100%   pitch rate: 100%\n","output_type":"stream"}],"execution_count":8},{"id":"f9459ffb-2df1-4213-ac0f-ddf7894ff1f6","cell_type":"markdown","source":"## 9. Wrap base with LoRA adapters","metadata":{}},{"id":"f3872ea7-7aaa-4cf2-9efb-332ff5847c4e","cell_type":"code","source":"model = FastLanguageModel.get_peft_model(\n    model,\n    r=32,\n    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],\n    lora_alpha=64,\n    lora_dropout=0.0, bias='none',\n    use_gradient_checkpointing='unsloth',\n    random_state=3407,\n)\n\ntrainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\ntotal     = sum(p.numel() for p in model.parameters())\nprint(f'Trainable params: {trainable:,} / {total:,}  ({100*trainable/total:.2f}%)')\n\nEVAL_SEEDS = list(range(60_000, 60_000 + 10))\n\ndef periodic_eval(env):\n    profits, rewards, fmts, pitches = [], [], [], []\n    for s in EVAL_SEEDS:\n        r = run_episode(env, s)\n        profits.append(r['final_profit']); rewards.append(r['ep_reward'])\n        fmts.append(r['format_rate']); pitches.append(r['pitch_rate'])\n    import numpy as np\n    return {'profit_mean': float(np.mean(profits)),\n            'reward_mean': float(np.mean(rewards)),\n            'format_rate': float(np.mean(fmts)),\n            'pitch_rate':  
float(np.mean(pitches))}","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:55:28.063482Z","iopub.execute_input":"2026-04-26T06:55:28.064213Z","iopub.status.idle":"2026-04-26T06:55:35.011917Z","shell.execute_reply.started":"2026-04-26T06:55:28.064165Z","shell.execute_reply":"2026-04-26T06:55:35.011046Z"}},"outputs":[{"name":"stderr","text":"Unsloth 2026.4.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.\n","output_type":"stream"},{"name":"stdout","text":"Trainable params: 20,185,088 / 408,616,960  (4.94%)\n","output_type":"stream"}],"execution_count":9},{"id":"78bd44dc-8c1e-478f-9d0a-015dfde10bd8","cell_type":"markdown","source":"## 10. GRPO training loop","metadata":{}},{"id":"10687855-d1b2-4111-8bd1-5ecf5c65fe41","cell_type":"code","source":"# =============================================================================\n# GRPO training cell — fixed version\n#\n# Fixes:\n#  1. RuntimeError \"variable modified by an inplace operation\" on loss.backward().\n#     Root cause: model.generate() leaves use_cache=True, and the subsequent\n#     forward pass returns logits that share storage with KV-cache buffers,\n#     which get mutated later. Fix: force use_cache=False on the training\n#     forward pass, and .clone() the logits slice before computing log_softmax.\n#\n#  2. GPU OOM on cell re-run. Root cause: re-running the cell creates a fresh\n#     AdamW (which holds momentum buffers ~= model size) without freeing the\n#     previous one. Fix: explicit cleanup of any prior optimizer / cached\n#     tensors at the top of the cell + gc + empty_cache. Model itself is NOT\n#     reloaded here (load it once in an earlier cell); we just reuse it.\n#\n#  3. wandb deprecation warning for reinit=True. Fix: pass the string form\n#     reinit='finish_previous'. (wandb.init() has no finish_previous= keyword;\n#     passing one raises TypeError and silently disables logging.)\n# =============================================================================\n\nimport os, gc, json, time, collections\nimport torch\nfrom torch.optim import AdamW\n\n# ---- 0. 
cleanup any leftover state from previous runs of this cell ----------\nfor _name in ('optimizer', 'gen_out', 'out', 'logits', 'loss',\n              'log_probs', 'token_nll', 'per_seq_nll', 'advantages'):\n    if _name in globals():\n        try:\n            del globals()[_name]\n        except Exception:\n            pass\ngc.collect()\nif torch.cuda.is_available():\n    torch.cuda.empty_cache()\n    torch.cuda.ipc_collect()\n\n# ---- 1. config --------------------------------------------------------------\nNUM_STEPS  = int(os.environ.get('NUM_STEPS', 100))\nGROUP_SIZE = int(os.environ.get('GROUP_SIZE', 4))\nLR         = 5e-6\nGRAD_CLIP  = 1.0\nTEMPERATURE, TOP_P = 1.0, 0.95\nSAVE_EVERY = 25\nEVAL_AT    = {0, 25, 50, 75, NUM_STEPS - 1}\n\n# Critical: kill KV cache on the training forward pass.\n# generate() will still build its own cache internally; we override afterwards.\nmodel.config.use_cache = False\nmodel.gradient_checkpointing_disable() if hasattr(model, 'gradient_checkpointing_disable') else None\nmodel.train()\n\n# ---- 2. wandb (no deprecated reinit) ----------------------------------------\nWANDB_OK = False\nif os.environ.get('WANDB_API_KEY'):\n    try:\n        import wandb\n        wandb.init(\n            project='boardsim-qwen3-grpo',\n            name='boardsim-qwen3-1p7b-kaggle',\n            config={'num_steps': NUM_STEPS, 'group_size': GROUP_SIZE, 'lr': LR,\n                    'temperature': TEMPERATURE, 'top_p': TOP_P, 'model': MODEL_NAME},\n            # String form of reinit: finish any run left open by a previous\n            # execution of this cell, then start a new one.\n            reinit='finish_previous',\n        )\n        WANDB_OK = True\n    except Exception as e:\n        print(f'WARN: wandb.init failed: {e}')\n\n# ---- 3. optimizer (single owner, freshly built each cell run) ---------------\noptimizer = AdamW(\n    [p for p in model.parameters() if p.requires_grad],\n    lr=LR, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0,\n)\n\nlog_history, eval_history = [], []\ndecision_counter = collections.Counter()\nt0 = time.time()\n\n# ---- 4. 
training loop -------------------------------------------------------\nwith make_env().sync() as env_train, \\\n     make_env().sync() as env_score, \\\n     make_env().sync() as env_eval:\n\n    for step in range(NUM_STEPS):\n        # 4a. rollout\n        result = env_train.reset(seed=step)\n        obs = result.observation\n        prompt = build_prompt(obs)\n        enc = tokenizer(prompt, return_tensors='pt',\n                        truncation=True, max_length=1024).to(device)\n        prompt_len = enc.input_ids.shape[1]\n\n        with torch.no_grad():\n            gen_out = model.generate(\n                input_ids=enc.input_ids,\n                attention_mask=enc.attention_mask,\n                max_new_tokens=MAX_NEW_TOKENS,\n                do_sample=True,\n                temperature=TEMPERATURE,\n                top_p=TOP_P,\n                num_return_sequences=GROUP_SIZE,\n                # Pad finished sequences with the tokenizer's pad id, because step 4c\n                # builds its attention/loss masks from `!= tokenizer.pad_token_id`.\n                # Padding with eos_token_id (a different id once Unsloth installs its\n                # own pad token) would leave that padding attended to and averaged\n                # into the per-sequence NLL.\n                pad_token_id=tokenizer.pad_token_id,\n                use_cache=True,  # cache OK during generate (no_grad context)\n            )\n        # Detach + clone so no autograd ties to generate's internal buffers.\n        gen_out = gen_out.detach().clone()\n\n        # 4b. 
score each completion\n        decisions, pitches, rewards, fmt_oks = [], [], [], []\n        for g in range(GROUP_SIZE):\n            comp = tokenizer.decode(gen_out[g][prompt_len:], skip_special_tokens=True)\n            d, pp, ok = parse_completion(comp, obs.options)\n            decisions.append(d); pitches.append(pp); fmt_oks.append(ok)\n            decision_counter[d] += 1\n            env_score.reset(seed=step)\n            sr = env_score.step(BoardSimAction(decision=d, coalition_pitch=pp))\n            rewards.append(float(sr.reward or 0.0))\n\n        rewards_t = torch.tensor(rewards, dtype=torch.float32, device=device)\n        if rewards_t.numel() > 1 and rewards_t.std().item() > 1e-6:\n            advantages = (rewards_t - rewards_t.mean()) / (rewards_t.std() + 1e-8)\n        else:\n            advantages = rewards_t - rewards_t.mean()\n        advantages = advantages.detach()\n\n        # 4c. policy update — fresh forward, NO cache, clone logits\n        optimizer.zero_grad(set_to_none=True)\n\n        full_ids = gen_out\n        attn     = (full_ids != tokenizer.pad_token_id).long()\n        loss_mask = attn.clone()\n        loss_mask[:, :prompt_len] = 0\n\n        out = model(\n            input_ids=full_ids,\n            attention_mask=attn,\n            use_cache=False,         # <-- key fix\n            return_dict=True,\n        )\n        # Clone the slice so backward sees a tensor whose storage we own.\n        logits  = out.logits[:, :-1, :].float().clone()\n        targets = full_ids[:, 1:].contiguous()\n        mask    = loss_mask[:, 1:].float()\n\n        log_probs   = torch.nn.functional.log_softmax(logits, dim=-1)\n        token_nll   = -log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)\n        per_seq_nll = (token_nll * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)\n        loss = (advantages * per_seq_nll).mean()\n\n        loss.backward()\n        total_loss_val = float(loss.detach().item())\n\n        
torch.nn.utils.clip_grad_norm_(\n            [p for p in model.parameters() if p.requires_grad], GRAD_CLIP)\n        optimizer.step()\n\n        # Free per-step graph tensors before next iter (helps on tight VRAM).\n        del out, logits, log_probs, token_nll, per_seq_nll, loss\n\n        # 4d. log\n        rec = {\n            'step':        step,\n            'reward':      float(rewards_t.mean().item()),\n            'reward_std':  float(rewards_t.std().item()) if rewards_t.numel() > 1 else 0.0,\n            'reward_max':  float(rewards_t.max().item()),\n            'loss':        total_loss_val,\n            'format_rate': sum(fmt_oks) / GROUP_SIZE,\n            'pitch_rate':  sum(1 for p in pitches if p.strip()) / GROUP_SIZE,\n            'elapsed_s':   time.time() - t0,\n        }\n        log_history.append(rec)\n        if WANDB_OK:\n            wandb.log(rec, step=step)\n\n        if step % 5 == 0:\n            print(f\"step={step:4d}  reward={rec['reward']:+.3f} (\\u00b1{rec['reward_std']:.2f})  \"\n                  f\"loss={rec['loss']:+.4f}  fmt={rec['format_rate']:.0%}  \"\n                  f\"elapsed={rec['elapsed_s']:.0f}s  d0={decisions[0]}\")\n\n        # 4e. periodic eval\n        if step in EVAL_AT:\n            ev = periodic_eval(env_eval)\n            ev['step'] = step\n            eval_history.append(ev)\n            print(f\"  [eval@{step}] profit={ev['profit_mean']:.2f}  \"\n                  f\"reward={ev['reward_mean']:.2f}  fmt={ev['format_rate']:.0%}\")\n            if WANDB_OK:\n                wandb.log({f'eval/{k}': v for k, v in ev.items() if k != 'step'}, step=step)\n\n        # 4f. 
checkpoint\n        if step > 0 and step % SAVE_EVERY == 0:\n            model.save_pretrained(str(CKPT))\n            tokenizer.save_pretrained(str(CKPT))\n            with open(WORK_DIR / 'log_history.json', 'w') as f:\n                json.dump(log_history, f)\n            with open(WORK_DIR / 'eval_history.json', 'w') as f:\n                json.dump(eval_history, f)\n\n# ---- 5. final save ----------------------------------------------------------\nmodel.save_pretrained(str(CKPT))\ntokenizer.save_pretrained(str(CKPT))\nwith open(WORK_DIR / 'log_history.json', 'w') as f:\n    json.dump(log_history, f)\nwith open(WORK_DIR / 'eval_history.json', 'w') as f:\n    json.dump(eval_history, f)\nwith open(WORK_DIR / 'decision_counter.json', 'w') as f:\n    json.dump(dict(decision_counter), f)\nif WANDB_OK:\n    wandb.finish()\nprint(f'Training done. {len(log_history)} steps in {time.time() - t0:.0f}s. -> {CKPT}')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T06:55:41.400144Z","iopub.execute_input":"2026-04-26T06:55:41.400729Z","iopub.status.idle":"2026-04-26T07:25:44.637514Z","shell.execute_reply.started":"2026-04-26T06:55:41.400681Z","shell.execute_reply":"2026-04-26T07:25:44.636833Z"}},"outputs":[{"name":"stdout","text":"WARN: wandb.init failed: init() got an unexpected keyword argument 'finish_previous'\nUnsloth: Will smartly offload gradients to save VRAM!\nstep=   0  reward=+1.062 (±0.02)  loss=-0.1849  fmt=75%  elapsed=10s  d0=reinvest_in_growth\n  [eval@0] profit=45.36  reward=37.72  fmt=100%\nstep=   5  reward=+0.832 (±0.00)  loss=+0.0000  fmt=100%  elapsed=258s  d0=match_offers\nstep=  10  reward=+31.031 (±0.00)  loss=+0.0000  fmt=100%  elapsed=296s  d0=accept_acquisition\nstep=  15  reward=+23.589 (±15.04)  loss=+0.0518  fmt=100%  elapsed=334s  d0=accept_acquisition\nstep=  20  reward=+23.590 (±15.04)  loss=-0.0527  fmt=100%  elapsed=371s  d0=accept_acquisition\nstep=  25  reward=+31.019 (±0.03)  loss=-0.1990  fmt=75%  elapsed=409s  
d0=accept_acquisition\n  [eval@25] profit=44.48  reward=37.67  fmt=100%\nstep=  30  reward=+1.070 (±0.00)  loss=+0.0000  fmt=100%  elapsed=660s  d0=full_compliance\nstep=  35  reward=+0.990 (±0.06)  loss=-0.0755  fmt=100%  elapsed=698s  d0=form_strategic_partnership\nstep=  40  reward=+1.002 (±0.00)  loss=+0.0000  fmt=100%  elapsed=736s  d0=stay_independent\nstep=  45  reward=+1.147 (±0.00)  loss=+0.0000  fmt=100%  elapsed=774s  d0=accept_deal\nstep=  50  reward=+1.206 (±0.00)  loss=+0.0000  fmt=100%  elapsed=811s  d0=full_disclosure\n  [eval@50] profit=45.06  reward=37.21  fmt=100%\nstep=  55  reward=+1.279 (±0.00)  loss=+0.0000  fmt=100%  elapsed=1058s  d0=accept_terms\nstep=  60  reward=+1.359 (±0.00)  loss=+0.0000  fmt=100%  elapsed=1095s  d0=accept_terms\nstep=  65  reward=+0.968 (±0.06)  loss=-0.0823  fmt=100%  elapsed=1133s  d0=cut_prices\nstep=  70  reward=+23.529 (±15.01)  loss=-0.0276  fmt=100%  elapsed=1171s  d0=accept_acquisition\nstep=  75  reward=+1.043 (±0.00)  loss=+0.0000  fmt=100%  elapsed=1209s  d0=full_disclosure\n  [eval@75] profit=44.46  reward=36.71  fmt=100%\nstep=  80  reward=+31.111 (±0.00)  loss=+0.0000  fmt=100%  elapsed=1456s  d0=accept_acquisition\nstep=  85  reward=+1.060 (±0.04)  loss=+0.0951  fmt=100%  elapsed=1493s  d0=reinvest_in_growth\nstep=  90  reward=+1.109 (±0.00)  loss=+0.0000  fmt=100%  elapsed=1531s  d0=form_strategic_partnership\nstep=  95  reward=+0.951 (±0.00)  loss=+0.0000  fmt=100%  elapsed=1569s  d0=public_apology\n  [eval@99] profit=44.02  reward=35.39  fmt=100%\nTraining done. 100 steps in 1803s. 
-> /kaggle/working/BoardSim_Run/lora_qwen3_1p7b\n","output_type":"stream"}],"execution_count":10},{"id":"cc193532-8c6f-4203-b9b9-45140cff443c","cell_type":"code","source":"import numpy as np, matplotlib, json\nfrom scipy import stats as spstats\nfrom unsloth import FastLanguageModel\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\n\n# ── dummy baseline values (cell 8 was skipped) ───────────────────────────────\nBASELINE_MEAN_REWARD = 0.0\nBASELINE_MEAN_PROFIT = 0.0\n\n# =============================================================================\n# CELL 11 — Plots from training history\n# =============================================================================\nsteps   = np.array([e['step']        for e in log_history])\nrewards = np.array([e['reward']      for e in log_history])\nlosses  = np.array([e['loss']        for e in log_history])\nfmts    = np.array([e['format_rate'] for e in log_history])\npitches = np.array([e['pitch_rate']  for e in log_history])\n\ndef ema(xs, alpha=0.1):\n    out, s = [], xs[0] if len(xs) else 0.0\n    for x in xs:\n        s = alpha * x + (1 - alpha) * s\n        out.append(s)\n    return np.array(out)\n\nrewards_ema = ema(rewards, 0.1)\nslope, intercept, r_val, p_val, _ = spstats.linregress(steps, rewards)\n\nplt.figure(figsize=(9, 5))\nplt.plot(steps, rewards, alpha=0.3, lw=1, label='per-step group reward')\nplt.plot(steps, rewards_ema, lw=2.2, label='EMA (α=0.1)')\nplt.plot(steps, intercept + slope * steps, '--', lw=1.5,\n         label=f'linear fit slope={slope:+.4f}/step  (p={p_val:.1e})')\nplt.title('GRPO reward — BoardSim')\nplt.xlabel('step'); plt.ylabel('mean group reward')\nplt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'reward_curve.png', dpi=150); plt.close()\n\nplt.figure(figsize=(9, 5))\nplt.plot(steps, losses, lw=1.5)\nplt.title('GRPO loss (advantage × NLL)'); plt.xlabel('step'); plt.ylabel('loss')\nplt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 
'loss_curve.png', dpi=150); plt.close()\n\nplt.figure(figsize=(9, 5))\nplt.plot(steps, ema(fmts, 0.05),    lw=2, label='format-OK rate (EMA)')\nplt.plot(steps, ema(pitches, 0.05), lw=2, label='non-empty pitch rate (EMA)')\nplt.title('Format compliance + pitch usage during training')\nplt.xlabel('step'); plt.ylabel('rate'); plt.ylim(-0.05, 1.05)\nplt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'format_compliance.png', dpi=150); plt.close()\n\nif eval_history:\n    es  = [e['step']        for e in eval_history]\n    epm = [e['profit_mean'] for e in eval_history]\n    erm = [e['reward_mean'] for e in eval_history]\n    plt.figure(figsize=(9, 5))\n    plt.plot(es, epm, '-o', lw=2, label='held-out profitability (mean of 10 episodes)')\n    plt.plot(es, erm, '-s', lw=2, label='held-out episode reward')\n    plt.title('Periodic held-out eval during training (greedy)')\n    plt.xlabel('training step'); plt.ylabel('value')\n    plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n    plt.savefig(ASSETS / 'periodic_eval.png', dpi=150); plt.close()\n\nprint(f'Linear-fit slope on reward: {slope:+.5f}/step (p={p_val:.2e}, R²={r_val**2:.3f})')\nprint('✓ Saved reward_curve.png, loss_curve.png, format_compliance.png, periodic_eval.png')\n\n# =============================================================================\n# CELL 12 — Paired eval: fine-tuned vs base (adapter disabled)\n# =============================================================================\nFastLanguageModel.for_inference(model)\n\nEVAL_N = 20  # reduced from 50\nPAIRED_SEEDS = list(range(70_000, 70_000 + EVAL_N))\n\ntrained_finals, trained_rewards, trained_fmt, trained_pitch = [], [], [], []\ntrained_history_per_seed = []\nwith make_env().sync() as env:\n    for i, s in enumerate(PAIRED_SEEDS):\n        r = run_episode(env, s)\n        trained_finals.append(r['final_profit'])\n        trained_rewards.append(r['ep_reward'])\n        trained_fmt.append(r['format_rate'])\n        
trained_pitch.append(r['pitch_rate'])\n        trained_history_per_seed.append(r['history'])\n        if (i + 1) % 5 == 0:\n            print(f'  trained {i+1}/{EVAL_N}  profit={r[\"final_profit\"]:.1f}')\n\nbase_finals_paired, base_rewards_paired, base_fmt_paired, base_pitch_paired = [], [], [], []\nbase_history_per_seed = []\nwith make_env().sync() as env, model.disable_adapter():\n    for i, s in enumerate(PAIRED_SEEDS):\n        r = run_episode(env, s)\n        base_finals_paired.append(r['final_profit'])\n        base_rewards_paired.append(r['ep_reward'])\n        base_fmt_paired.append(r['format_rate'])\n        base_pitch_paired.append(r['pitch_rate'])\n        base_history_per_seed.append(r['history'])\n        if (i + 1) % 5 == 0:\n            print(f'  base    {i+1}/{EVAL_N}  profit={r[\"final_profit\"]:.1f}')\n\ntf, bf = np.array(trained_finals), np.array(base_finals_paired)\ntr, br = np.array(trained_rewards), np.array(base_rewards_paired)\n\nprint(f'\\nTrained Qwen3-0.6B profit : {tf.mean():.2f} ± {tf.std():.2f}')\nprint(f'Base    Qwen3-0.6B profit : {bf.mean():.2f} ± {bf.std():.2f}')\nprint(f'Trained ep reward         : {tr.mean():.2f} ± {tr.std():.2f}')\nprint(f'Base    ep reward         : {br.mean():.2f} ± {br.std():.2f}')\nprint(f'Trained format/pitch      : {np.mean(trained_fmt):.0%} / {np.mean(trained_pitch):.0%}')\nprint(f'Base    format/pitch      : {np.mean(base_fmt_paired):.0%} / {np.mean(base_pitch_paired):.0%}')\n\nwith open(WORK_DIR / 'eval_paired.json', 'w') as f:\n    json.dump({'seeds': PAIRED_SEEDS,\n               'trained_finals': tf.tolist(), 'base_finals': bf.tolist(),\n               'trained_rewards': tr.tolist(), 'base_rewards': br.tolist(),\n               'trained_format_rate': float(np.mean(trained_fmt)),\n               'base_format_rate':    float(np.mean(base_fmt_paired)),\n               'trained_pitch_rate':  float(np.mean(trained_pitch)),\n               'base_pitch_rate':     float(np.mean(base_pitch_paired))}, 
f)\n\nprint('✓ Paired eval complete.')\n\n# =============================================================================\n# CELL 13 — Stats + before/after plots\n# =============================================================================\ndef cohen_d(a, b):\n    pooled = np.sqrt(((a.std(ddof=1)**2) + (b.std(ddof=1)**2)) / 2)\n    return (a.mean() - b.mean()) / (pooled + 1e-12)\n\ndef bootstrap_diff_ci(a, b, n=10_000, seed=0):\n    rng = np.random.default_rng(seed)\n    diffs = a - b\n    boots = rng.choice(diffs, size=(n, len(diffs)), replace=True).mean(axis=1)\n    return float(np.percentile(boots, 2.5)), float(np.percentile(boots, 97.5))\n\ntt   = spstats.ttest_rel(tf, bf)\nuu   = spstats.mannwhitneyu(tf, bf, alternative='greater')\nwilc = spstats.wilcoxon(tf, bf, alternative='greater')\nd    = cohen_d(tf, bf)\nlo, hi = bootstrap_diff_ci(tf, bf)\nwin_rate = float((tf > bf).mean())\ntie_rate = float((tf == bf).mean())\n\nsummary = {\n    'baseline_model': MODEL_NAME + ' (no fine-tune)',\n    'trained_model':  MODEL_NAME + ' + LoRA r=32',\n    'n': len(tf),\n    'paired_t_stat': float(tt.statistic), 'paired_t_p': float(tt.pvalue),\n    'mannwhitney_U': float(uu.statistic), 'mannwhitney_p_greater': float(uu.pvalue),\n    'wilcoxon_p_greater': float(wilc.pvalue),\n    'cohens_d': float(d),\n    'paired_diff_mean': float((tf - bf).mean()),\n    'paired_diff_95ci': [lo, hi],\n    'win_rate_trained_strictly_better': win_rate,\n    'tie_rate': tie_rate,\n}\nprint(json.dumps(summary, indent=2))\nwith open(WORK_DIR / 'stats_summary.json', 'w') as f:\n    json.dump(summary, f, indent=2)\n\nbins = np.linspace(0, 100, 25)\nplt.figure(figsize=(9, 5))\nplt.hist(bf, bins=bins, alpha=0.55, color='#c44',\n         label=f'Base Qwen3-0.6B (mean={bf.mean():.1f})')\nplt.hist(tf, bins=bins, alpha=0.55, color='#1d6fff',\n         label=f'Fine-tuned Qwen3-0.6B (mean={tf.mean():.1f})')\nplt.axvline(bf.mean(), color='#c44', ls='--', lw=1.5)\nplt.axvline(tf.mean(), color='#1d6fff', 
ls='--', lw=1.5)\nplt.title(f'Final profitability — paired same-seed (n={len(tf)})  '\n          f\"d={summary['cohens_d']:+.2f}  win-rate={summary['win_rate_trained_strictly_better']:.0%}\")\nplt.xlabel('profitability score (0–100)'); plt.ylabel('episodes')\nplt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'before_after.png', dpi=150); plt.close()\n\ndiffs = tf - bf\norder = np.argsort(diffs)\nplt.figure(figsize=(9, 5))\nplt.bar(range(len(diffs)), diffs[order],\n        color=['#1d6fff' if x > 0 else '#c44' for x in diffs[order]])\nplt.axhline(0, color='k', lw=0.8)\nplt.title(f'Per-seed lift (fine-tuned − base Qwen3-0.6B), sorted  '\n          f'mean lift = {diffs.mean():+.1f}  CI=[{lo:+.1f}, {hi:+.1f}]')\nplt.xlabel('seed (sorted by lift)'); plt.ylabel('Δ profitability')\nplt.grid(alpha=0.3); plt.tight_layout()\nplt.savefig(ASSETS / 'paired_delta.png', dpi=150); plt.close()\n\nprint('✓ Saved before_after.png, paired_delta.png')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T07:26:54.243367Z","iopub.execute_input":"2026-04-26T07:26:54.243818Z","iopub.status.idle":"2026-04-26T07:38:59.431464Z","shell.execute_reply.started":"2026-04-26T07:26:54.243775Z","shell.execute_reply":"2026-04-26T07:38:59.430512Z"}},"outputs":[{"name":"stdout","text":"Linear-fit slope on reward: -0.06447/step (p=3.51e-02, R²=0.045)\n✓ Saved reward_curve.png, loss_curve.png, format_compliance.png, periodic_eval.png\n  trained 5/20  profit=64.4\n  trained 10/20  profit=51.8\n  trained 15/20  profit=56.3\n  trained 20/20  profit=36.7\n  base    5/20  profit=64.4\n  base    10/20  profit=56.8\n  base    15/20  profit=19.7\n  base    20/20  profit=64.4\n\nTrained Qwen3-0.6B profit : 45.93 ± 12.99\nBase    Qwen3-0.6B profit : 47.37 ± 14.96\nTrained ep reward         : 33.49 ± 9.15\nBase    ep reward         : 37.40 ± 10.25\nTrained format/pitch      : 100% / 100%\nBase    format/pitch      : 100% / 100%\n✓ Paired eval complete.\n{\n  
\"baseline_model\": \"Qwen/Qwen3-0.6B (no fine-tune)\",\n  \"trained_model\": \"Qwen/Qwen3-0.6B + LoRA r=32\",\n  \"n\": 20,\n  \"paired_t_stat\": -0.4385693543749398,\n  \"paired_t_p\": 0.6659198248349103,\n  \"mannwhitney_U\": 185.0,\n  \"mannwhitney_p_greater\": 0.6626800641585955,\n  \"wilcoxon_p_greater\": 0.863952408631937,\n  \"cohens_d\": -0.10028349616417989,\n  \"paired_diff_mean\": -1.4412706571796334,\n  \"paired_diff_95ci\": [\n    -7.3623423164764565,\n    5.200577640718865\n  ],\n  \"win_rate_trained_strictly_better\": 0.15,\n  \"tie_rate\": 0.4\n}\n✓ Saved before_after.png, paired_delta.png\n","output_type":"stream"}],"execution_count":11},{"id":"6afa2a9b-b8c8-4df2-bc3f-fd2a8df04b38","cell_type":"markdown","source":"## 11. Plots — reward / loss / format / periodic eval","metadata":{}},{"id":"58285d42-5432-4f25-a960-cebcc2081725","cell_type":"code","source":"# import numpy as np, matplotlib\n# matplotlib.use('Agg')\n# import matplotlib.pyplot as plt\n# from scipy import stats as spstats\n\n# steps   = np.array([e['step']    for e in log_history])\n# rewards = np.array([e['reward']  for e in log_history])\n# losses  = np.array([e['loss']    for e in log_history])\n# fmts    = np.array([e['format_rate'] for e in log_history])\n# pitches = np.array([e['pitch_rate']  for e in log_history])\n\n# def ema(xs, alpha=0.1):\n#     out, s = [], xs[0] if len(xs) else 0.0\n#     for x in xs:\n#         s = alpha * x + (1 - alpha) * s\n#         out.append(s)\n#     return np.array(out)\n\n# rewards_ema = ema(rewards, 0.1)\n# slope, intercept, r_val, p_val, _ = spstats.linregress(steps, rewards)\n\n# plt.figure(figsize=(9, 5))\n# plt.plot(steps, rewards, alpha=0.3, lw=1, label='per-step group reward')\n# plt.plot(steps, rewards_ema, lw=2.2, label='EMA (\\u03b1=0.1)')\n# plt.plot(steps, intercept + slope * steps, '--', lw=1.5,\n#          label=f'linear fit slope={slope:+.4f}/step  (p={p_val:.1e})')\n# plt.axhline(BASELINE_MEAN_REWARD, ls=':', lw=2, 
color='#c44',\n#             label=f'base Qwen3-1.7B baseline = {BASELINE_MEAN_REWARD:.2f}')\n# plt.title('GRPO reward — BoardSim (vs same model w/o fine-tuning)')\n# plt.xlabel('step'); plt.ylabel('mean group reward')\n# plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'reward_curve.png', dpi=150); plt.close()\n\n# plt.figure(figsize=(9, 5))\n# plt.plot(steps, losses, lw=1.5)\n# plt.title('GRPO loss (advantage \\u00d7 NLL)'); plt.xlabel('step'); plt.ylabel('loss')\n# plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'loss_curve.png', dpi=150); plt.close()\n\n# plt.figure(figsize=(9, 5))\n# plt.plot(steps, ema(fmts, 0.05),    lw=2, label='format-OK rate (EMA)')\n# plt.plot(steps, ema(pitches, 0.05), lw=2, label='non-empty pitch rate (EMA)')\n# plt.title('Format compliance + pitch usage during training')\n# plt.xlabel('step'); plt.ylabel('rate'); plt.ylim(-0.05, 1.05)\n# plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'format_compliance.png', dpi=150); plt.close()\n\n# if eval_history:\n#     es  = [e['step']        for e in eval_history]\n#     epm = [e['profit_mean'] for e in eval_history]\n#     erm = [e['reward_mean'] for e in eval_history]\n#     plt.figure(figsize=(9, 5))\n#     plt.plot(es, epm, '-o', lw=2, label='held-out profitability (mean of 10 episodes)')\n#     plt.plot(es, erm, '-s', lw=2, label='held-out episode reward')\n#     plt.axhline(BASELINE_MEAN_PROFIT, ls=':', lw=1.5, color='#c44',\n#                 label=f'base Qwen3-1.7B profitability = {BASELINE_MEAN_PROFIT:.2f}')\n#     plt.title('Periodic held-out eval during training (greedy)')\n#     plt.xlabel('training step'); plt.ylabel('value')\n#     plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n#     plt.savefig(ASSETS / 'periodic_eval.png', dpi=150); plt.close()\n\n# print(f'Linear-fit slope on reward: {slope:+.5f}/step (p={p_val:.2e}, R\\u00b2={r_val**2:.3f})')\n# print('Saved reward_curve.png, loss_curve.png, 
format_compliance.png, periodic_eval.png')","metadata":{},"outputs":[],"execution_count":null},{"id":"18d3a515-e67a-47e6-8a29-1fba4757502c","cell_type":"markdown","source":"## 12. Paired same-seed eval — fine-tuned vs base Qwen3-1.7B","metadata":{}},{"id":"23ab9f8e-e82e-459c-b87b-b9a824f0a4b6","cell_type":"code","source":"# from unsloth import FastLanguageModel\n# FastLanguageModel.for_inference(model)\n\n# EVAL_N = 50\n# PAIRED_SEEDS = list(range(70_000, 70_000 + EVAL_N))\n# trained_finals, trained_rewards, trained_fmt, trained_pitch = [], [], [], []\n# trained_history_per_seed = []\n# with make_env().sync() as env:\n#     for i, s in enumerate(PAIRED_SEEDS):\n#         r = run_episode(env, s)\n#         trained_finals.append(r['final_profit'])\n#         trained_rewards.append(r['ep_reward'])\n#         trained_fmt.append(r['format_rate'])\n#         trained_pitch.append(r['pitch_rate'])\n#         trained_history_per_seed.append(r['history'])\n#         if (i + 1) % 10 == 0:\n#             print(f'  trained {i+1}/{EVAL_N}  profit={r[\"final_profit\"]:.1f}')\n\n# base_finals_paired, base_rewards_paired, base_fmt_paired, base_pitch_paired = [], [], [], []\n# base_history_per_seed = []\n# with make_env().sync() as env, model.disable_adapter():\n#     for i, s in enumerate(PAIRED_SEEDS):\n#         r = run_episode(env, s)\n#         base_finals_paired.append(r['final_profit'])\n#         base_rewards_paired.append(r['ep_reward'])\n#         base_fmt_paired.append(r['format_rate'])\n#         base_pitch_paired.append(r['pitch_rate'])\n#         base_history_per_seed.append(r['history'])\n#         if (i + 1) % 10 == 0:\n#             print(f'  base    {i+1}/{EVAL_N}  profit={r[\"final_profit\"]:.1f}')\n\n# tf, bf = np.array(trained_finals), np.array(base_finals_paired)\n# tr, br = np.array(trained_rewards), np.array(base_rewards_paired)\n\n# print(f'\\nTrained Qwen3-1.7B profit : {tf.mean():.2f} \\u00b1 {tf.std():.2f}')\n# print(f'Base    Qwen3-1.7B profit : 
{bf.mean():.2f} \\u00b1 {bf.std():.2f}')\n# print(f'Trained ep reward         : {tr.mean():.2f} \\u00b1 {tr.std():.2f}')\n# print(f'Base    ep reward         : {br.mean():.2f} \\u00b1 {br.std():.2f}')\n# print(f'Trained format/pitch      : {np.mean(trained_fmt):.0%} / {np.mean(trained_pitch):.0%}')\n# print(f'Base    format/pitch      : {np.mean(base_fmt_paired):.0%} / {np.mean(base_pitch_paired):.0%}')\n\n# with open(WORK_DIR / 'eval_paired.json', 'w') as f:\n#     json.dump({'seeds': PAIRED_SEEDS,\n#                'trained_finals': tf.tolist(), 'base_finals': bf.tolist(),\n#                'trained_rewards': tr.tolist(), 'base_rewards': br.tolist(),\n#                'trained_format_rate': float(np.mean(trained_fmt)),\n#                'base_format_rate':    float(np.mean(base_fmt_paired)),\n#                'trained_pitch_rate':  float(np.mean(trained_pitch)),\n#                'base_pitch_rate':     float(np.mean(base_pitch_paired))}, f)","metadata":{},"outputs":[],"execution_count":null},{"id":"48396072-e570-4d07-9e7a-3c58d3b5bb3b","cell_type":"markdown","source":"## 13. 
Stats summary + before/after plots","metadata":{}},{"id":"b16858c8-5d75-4413-8c58-57b908f5bf48","cell_type":"code","source":"# from scipy import stats as spstats\n\n# def cohen_d(a, b):\n#     pooled = np.sqrt(((a.std(ddof=1)**2) + (b.std(ddof=1)**2)) / 2)\n#     return (a.mean() - b.mean()) / (pooled + 1e-12)\n\n# def bootstrap_diff_ci(a, b, n=10_000, seed=0):\n#     rng = np.random.default_rng(seed)\n#     diffs = a - b\n#     boots = rng.choice(diffs, size=(n, len(diffs)), replace=True).mean(axis=1)\n#     return float(np.percentile(boots, 2.5)), float(np.percentile(boots, 97.5))\n\n# tt   = spstats.ttest_rel(tf, bf)\n# uu   = spstats.mannwhitneyu(tf, bf, alternative='greater')\n# wilc = spstats.wilcoxon(tf, bf, alternative='greater')\n# d    = cohen_d(tf, bf)\n# lo, hi = bootstrap_diff_ci(tf, bf)\n# win_rate = float((tf > bf).mean())\n# tie_rate = float((tf == bf).mean())\n\n# summary = {\n#     'baseline_model': MODEL_NAME + ' (no fine-tune)',\n#     'trained_model':  MODEL_NAME + ' + LoRA r=32',\n#     'n': len(tf),\n#     'paired_t_stat': float(tt.statistic), 'paired_t_p': float(tt.pvalue),\n#     'mannwhitney_U': float(uu.statistic), 'mannwhitney_p_greater': float(uu.pvalue),\n#     'wilcoxon_p_greater': float(wilc.pvalue),\n#     'cohens_d': float(d),\n#     'paired_diff_mean': float((tf - bf).mean()),\n#     'paired_diff_95ci': [lo, hi],\n#     'win_rate_trained_strictly_better': win_rate,\n#     'tie_rate': tie_rate,\n# }\n# print(json.dumps(summary, indent=2))\n# with open(WORK_DIR / 'stats_summary.json', 'w') as f:\n#     json.dump(summary, f, indent=2)\n\n# bins = np.linspace(0, 100, 25)\n# plt.figure(figsize=(9, 5))\n# plt.hist(bf, bins=bins, alpha=0.55, color='#c44',\n#          label=f'Base Qwen3-1.7B (mean={bf.mean():.1f})')\n# plt.hist(tf, bins=bins, alpha=0.55, color='#1d6fff',\n#          label=f'Fine-tuned Qwen3-1.7B (mean={tf.mean():.1f})')\n# plt.axvline(bf.mean(), color='#c44', ls='--', lw=1.5)\n# plt.axvline(tf.mean(), color='#1d6fff', 
ls='--', lw=1.5)\n# plt.title(f'Final profitability — paired same-seed (n={len(tf)})  '\n#           f\"d={summary['cohens_d']:+.2f}  win-rate={summary['win_rate_trained_strictly_better']:.0%}\")\n# plt.xlabel('profitability score (0\\u2013100)'); plt.ylabel('episodes')\n# plt.legend(); plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'before_after.png', dpi=150); plt.close()\n\n# diffs = tf - bf\n# order = np.argsort(diffs)\n# plt.figure(figsize=(9, 5))\n# plt.bar(range(len(diffs)), diffs[order],\n#         color=['#1d6fff' if x > 0 else '#c44' for x in diffs[order]])\n# plt.axhline(0, color='k', lw=0.8)\n# plt.title(f'Per-seed lift (fine-tuned \\u2212 base Qwen3-1.7B), sorted  '\n#           f'mean lift = {diffs.mean():+.1f}  CI=[{summary[\"paired_diff_95ci\"][0]:+.1f}, {summary[\"paired_diff_95ci\"][1]:+.1f}]')\n# plt.xlabel('seed (sorted by lift)'); plt.ylabel('\\u0394 profitability')\n# plt.grid(alpha=0.3); plt.tight_layout()\n# plt.savefig(ASSETS / 'paired_delta.png', dpi=150); plt.close()\n# print('Saved before_after.png, paired_delta.png')","metadata":{},"outputs":[],"execution_count":null},{"id":"008c5d7a-a80f-4f5e-a2bd-85fefd0cbf2d","cell_type":"code","source":"import collections\nfrom huggingface_hub import HfApi\n\n# =============================================================================\n# CELL 14 — Per-event win-rate breakdown\n# =============================================================================\ndef per_event_winrate(history_per_seed):\n    bucket = collections.defaultdict(lambda: [0, 0])\n    for hist in history_per_seed:\n        for rd in hist:\n            t = rd.get('event_title', '?')\n            bucket[t][1] += 1\n            if rd.get('agent_won_vote'):\n                bucket[t][0] += 1\n    return {t: (w / max(1, n)) for t, (w, n) in bucket.items()}\n\ntrained_wr = per_event_winrate(trained_history_per_seed)\nbase_wr    = per_event_winrate(base_history_per_seed)\n\nevents_sorted = sorted(set(trained_wr) | 
set(base_wr))\ntw = [trained_wr.get(e, 0.0) for e in events_sorted]\nbw = [base_wr.get(e, 0.0)    for e in events_sorted]\n\nplt.figure(figsize=(11, 5))\nx = np.arange(len(events_sorted))\nplt.bar(x - 0.2, bw, width=0.4, color='#c44', label='Base Qwen3-0.6B')\nplt.bar(x + 0.2, tw, width=0.4, color='#1d6fff', label='Fine-tuned Qwen3-0.6B')\nplt.xticks(x, [e[:22] for e in events_sorted], rotation=30, ha='right')\nplt.ylim(0, 1.05); plt.ylabel('boardroom win rate')\nplt.title('Per-event boardroom win rate (paired seeds, n=20 episodes)')  # updated n\nplt.legend(); plt.grid(alpha=0.3, axis='y'); plt.tight_layout()\nplt.savefig(ASSETS / 'per_event_winrate.png', dpi=150); plt.close()\n\nwith open(WORK_DIR / 'per_event_winrate.json', 'w') as f:\n    json.dump({'events': events_sorted, 'trained': tw, 'base': bw}, f, indent=2)\nprint('✓ Saved per_event_winrate.png')\n\n# =============================================================================\n# CELL 15 — Theory-of-Mind probe\n# =============================================================================\nTOM_INSTRUCTION = (\n    \"\\n\\nGiven the state and event below, name the SINGLE board member \"\n    \"(CTO, CFO, Investor Rep, or Independent) most likely to oppose the chosen decision. 
\"\n    \"Answer with just the role name on one line.\\n\"\n)\n\ndef tom_predict(obs, decision):\n    \"\"\"Ask the model which board role is most likely to oppose decision.\n\n    Builds a probe prompt from the regular prompt for obs, decodes greedily for at most\n    8 new tokens, and returns 'Investor Rep', 'Independent', 'CTO' or 'CFO' according to\n    the first role keyword found in the completion, or None when none is present.\n    \"\"\"\n    body = build_prompt(obs).split(SYSTEM_PROMPT, 1)[1]\n    prompt = SYSTEM_PROMPT + TOM_INSTRUCTION + body + f'Chosen decision: {decision}\\nMost likely opponent: '\n    # NOTE(review): if the tokenizer truncates from the right (the usual default), a prompt over\n    # 1024 tokens would lose the trailing 'Most likely opponent:' cue; confirm prompts stay shorter.\n    enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)\n    with torch.no_grad():  # inference only; no gradients are needed\n        out = model.generate(**enc, max_new_tokens=8, do_sample=False,\n                             pad_token_id=tokenizer.eos_token_id)\n    # Decode only the newly generated tokens, then keyword-match in a fixed order.\n    txt = tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True).lower()\n    if 'investor'    in txt: return 'Investor Rep'\n    if 'independent' in txt: return 'Independent'\n    if 'cto'         in txt: return 'CTO'\n    if 'cfo'         in txt: return 'CFO'\n    return None\n\ndef tom_eval(seed_base=80_000, n=20):  # reduced from 40 → 20\n    \"\"\"Run the opposition probe on n episodes seeded seed_base .. seed_base + n - 1.\n\n    Returns (correct, total). Episodes in which no NPC votes against the greedy decision\n    are skipped and not counted in total; an unparseable prediction counts as incorrect.\n    \"\"\"\n    correct = total = 0\n    with make_env().sync() as env:\n        for ep in range(n):\n            result = env.reset(seed=seed_base + ep)\n            obs = result.observation\n            decision, _, _ = greedy_action(obs)\n            # Roles whose stated vote differs from the decision the agent picked.\n            opposed = [s['role'] for s in obs.npc_statements if s['vote'] != decision]\n            if not opposed:\n                continue\n            pred = tom_predict(obs, decision)\n            if pred and pred in opposed:\n                correct += 1\n            total += 1\n    return correct, total\n\n# Same probe twice: once with the adapter active, once with it disabled.\nt_corr, t_tot = tom_eval()\nwith model.disable_adapter():\n    b_corr, b_tot = tom_eval()\n\n# max(1, ...) avoids a ZeroDivisionError when every episode was skipped.\ntom_acc      = t_corr / max(1, t_tot)\ntom_acc_base = b_corr / max(1, b_tot)\nprint(f'ToM probe: trained = {tom_acc:.1%} ({t_corr}/{t_tot})   base = {tom_acc_base:.1%} ({b_corr}/{b_tot})')\n\nwith open(WORK_DIR / 'tom.json', 'w') as f:\n    json.dump({'trained': {'correct': t_corr, 'total': t_tot, 'accuracy': tom_acc},\n               'base':    {'correct': b_corr, 'total': b_tot, 'accuracy': tom_acc_base}}, f)\n\nprint('✓ Theory-of-Mind 
probe complete.')\n\n# =============================================================================\n# CELL 16 — Push to HF Hub\n# =============================================================================\nADAPTER_REPO = os.environ.get('ADAPTER_REPO', 'StavanKhobare/Qwen3-0.6B-Final-LoRA')\nMERGED_REPO  = os.environ.get('MERGED_REPO',  'StavanKhobare/Qwen3-0.6B-Final-Merged16bit')\napi = HfApi()\napi.create_repo(ADAPTER_REPO, repo_type='model', private=False, exist_ok=True)\napi.create_repo(MERGED_REPO,  repo_type='model', private=False, exist_ok=True)\n\ntry:\n    model.push_to_hub(ADAPTER_REPO, private=False)\n    tokenizer.push_to_hub(ADAPTER_REPO, private=False)\n    print(f'✓ LoRA pushed: https://huggingface.co/{ADAPTER_REPO}')\nexcept Exception as e:\n    print(f'LoRA push failed: {e!r}')\n\ntry:\n    model.push_to_hub_merged(MERGED_REPO, tokenizer, save_method='merged_16bit', private=False)\n    print(f'✓ Merged 16-bit pushed: https://huggingface.co/{MERGED_REPO}')\nexcept Exception as e:\n    print(f'Merged push failed (you can retry): {e!r}')\n\ntry:\n    api.upload_folder(folder_path=str(ASSETS), repo_id=ADAPTER_REPO,\n                      path_in_repo='assets', repo_type='model')\n    for fname in ['log_history.json', 'eval_history.json', 'eval_paired.json',\n                  'stats_summary.json', 'tom.json', 'transcripts.json',\n                  'decision_counter.json', 'baseline.json',\n                  'per_event_winrate.json']:\n        fp = WORK_DIR / fname\n        if fp.exists():  # safely skips baseline.json since you didn't run cell 8\n            api.upload_file(path_or_fileobj=str(fp), path_in_repo=fname,\n                            repo_id=ADAPTER_REPO, repo_type='model')\n    print(f'✓ Artifacts uploaded to https://huggingface.co/{ADAPTER_REPO}')\nexcept Exception as e:\n    print(f'Artifact upload failed: 
{e!r}')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T07:58:28.682945Z","iopub.execute_input":"2026-04-26T07:58:28.683870Z","iopub.status.idle":"2026-04-26T08:01:57.425478Z","shell.execute_reply.started":"2026-04-26T07:58:28.683820Z","shell.execute_reply":"2026-04-26T08:01:57.424531Z"}},"outputs":[{"name":"stdout","text":"✓ Saved per_event_winrate.png\nToM probe: trained = 5.6% (1/18)   base = 0.0% (0/18)\n✓ Theory-of-Mind probe complete.\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"README.md: 0.00B [00:00, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"38d39b0613184b0887ee150cc0f67224"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Processing Files (0 / 0): |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"62992fd6706a401589494e51ab877eca"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"New Data Upload: |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3bff6d67c77446a5980e84c760cc08dd"}},"metadata":{}},{"name":"stdout","text":"Saved model to https://huggingface.co/StavanKhobare/neuraledge-boardroom-qwen3-lora\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Processing Files (0 / 0): |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"82feb04ced9e4236b50b79ffcc700ab2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"New Data Upload: |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"58bebf6d06e4483b9098105dc42aacbb"}},"metadata":{}},{"name":"stdout","text":"✓ LoRA pushed: 
https://huggingface.co/StavanKhobare/neuraledge-boardroom-qwen3-lora\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c282738a9cd74317930784b5546a1818"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Processing Files (0 / 0): |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"38d4269773424cfeb60d5be35d6111fd"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"New Data Upload: |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"91c4cf66924343e7b6dec4c30e6408d8"}},"metadata":{}},{"name":"stdout","text":"Found HuggingFace hub cache directory: /root/.cache/huggingface/hub\nChecking cache directory for required files...\nCache check failed: model.safetensors not found in local cache.\nNot all required files found in cache. Will proceed with downloading.\nChecking cache directory for required files...\nCache check failed: tokenizer.model not found in local cache.\nNot all required files found in cache. 
Will proceed with downloading.\n","output_type":"stream"},{"name":"stderr","text":"Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6e12f5763e154751be8f84ca75d1377c"}},"metadata":{}},{"name":"stderr","text":"Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:03<00:00,  3.95s/it]\n","output_type":"stream"},{"name":"stdout","text":"Note: tokenizer.model not found (this is OK for non-SentencePiece models)\n","output_type":"stream"},{"name":"stderr","text":"Unsloth: Merging weights into 16bit:   0%|          | 0/1 [00:00<?, ?it/s]","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Processing Files (0 / 0): |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8791dd427e6a418c9fef04f61435bf69"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"New Data Upload: |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6fcc233303344eb7a3b7e2e08dfb86bc"}},"metadata":{}},{"name":"stderr","text":"Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:22<00:00, 22.27s/it]\n","output_type":"stream"},{"name":"stdout","text":"Unsloth: Merge process complete. 
Saved to `/kaggle/working/StavanKhobare/Qwen3-0.6B-Final-Merged16bit`\n✓ Merged 16-bit pushed: https://huggingface.co/StavanKhobare/Qwen3-0.6B-Final-Merged16bit\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Processing Files (0 / 0): |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"5f0b9f8d7ab84cc4a03cbe6ac07cfbfe"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"New Data Upload: |          |  0.00B /  0.00B            ","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"69c08e98d34846fba29aab72d88b5584"}},"metadata":{}},{"name":"stdout","text":"✓ Artifacts uploaded to https://huggingface.co/StavanKhobare/neuraledge-boardroom-qwen3-lora\n","output_type":"stream"}],"execution_count":12},{"id":"84189ed0-f11f-455b-ac0b-58c607c91f54","cell_type":"markdown","source":"## 14. Per-event win-rate breakdown","metadata":{}},{"id":"c58b32c2-b662-4bbe-bf21-74eab4ba86cf","cell_type":"code","source":"# def per_event_winrate(history_per_seed):\n#     bucket = collections.defaultdict(lambda: [0, 0])\n#     for hist in history_per_seed:\n#         for rd in hist:\n#             t = rd.get('event_title', '?')\n#             bucket[t][1] += 1\n#             if rd.get('agent_won_vote'):\n#                 bucket[t][0] += 1\n#     return {t: (w / max(1, n)) for t, (w, n) in bucket.items()}\n\n# trained_wr = per_event_winrate(trained_history_per_seed)\n# base_wr    = per_event_winrate(base_history_per_seed)\n\n# events_sorted = sorted(set(trained_wr) | set(base_wr))\n# tw = [trained_wr.get(e, 0.0) for e in events_sorted]\n# bw = [base_wr.get(e, 0.0)    for e in events_sorted]\n\n# plt.figure(figsize=(11, 5))\n# x = np.arange(len(events_sorted))\n# plt.bar(x - 0.2, bw, width=0.4, color='#c44', label='Base Qwen3-1.7B')\n# plt.bar(x + 0.2, tw, width=0.4, color='#1d6fff', label='Fine-tuned Qwen3-1.7B')\n# 
plt.xticks(x, [e[:22] for e in events_sorted], rotation=30, ha='right')\n# plt.ylim(0, 1.05); plt.ylabel('boardroom win rate')\n# plt.title('Per-event boardroom win rate (paired seeds, n=50 episodes)')\n# plt.legend(); plt.grid(alpha=0.3, axis='y'); plt.tight_layout()\n# plt.savefig(ASSETS / 'per_event_winrate.png', dpi=150); plt.close()\n\n# with open(WORK_DIR / 'per_event_winrate.json', 'w') as f:\n#     json.dump({'events': events_sorted, 'trained': tw, 'base': bw}, f, indent=2)\n# print('Saved per_event_winrate.png')","metadata":{},"outputs":[],"execution_count":null},{"id":"3ce6fc80-86b7-4312-b7d7-9f0cc607bf46","cell_type":"markdown","source":"## 15. Theory-of-Mind probe","metadata":{}},{"id":"d939b235-4893-46fc-b03a-8c68d61bec37","cell_type":"code","source":"# TOM_INSTRUCTION = (\n#     \"\\n\\nGiven the state and event below, name the SINGLE board member \"\n#     \"(CTO, CFO, Investor Rep, or Independent) most likely to oppose the chosen decision. \"\n#     \"Answer with just the role name on one line.\\n\"\n# )\n\n# def tom_predict(obs, decision):\n#     body = build_prompt(obs).split(SYSTEM_PROMPT, 1)[1]\n#     prompt = SYSTEM_PROMPT + TOM_INSTRUCTION + body + f'Chosen decision: {decision}\\nMost likely opponent: '\n#     enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)\n#     with torch.no_grad():\n#         out = model.generate(**enc, max_new_tokens=8, do_sample=False,\n#                              pad_token_id=tokenizer.eos_token_id)\n#     txt = tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True).lower()\n#     if 'investor'    in txt: return 'Investor Rep'\n#     if 'independent' in txt: return 'Independent'\n#     if 'cto'         in txt: return 'CTO'\n#     if 'cfo'         in txt: return 'CFO'\n#     return None\n\n# def tom_eval(seed_base=80_000, n=40):\n#     correct = total = 0\n#     with make_env().sync() as env:\n#         for ep in range(n):\n#             result = 
env.reset(seed=seed_base + ep)\n#             obs = result.observation\n#             decision, _, _ = greedy_action(obs)\n#             opposed = [s['role'] for s in obs.npc_statements if s['vote'] != decision]\n#             if not opposed:\n#                 continue\n#             pred = tom_predict(obs, decision)\n#             if pred and pred in opposed:\n#                 correct += 1\n#             total += 1\n#     return correct, total\n\n# t_corr, t_tot = tom_eval()\n# with model.disable_adapter():\n#     b_corr, b_tot = tom_eval()\n\n# tom_acc        = t_corr / max(1, t_tot)\n# tom_acc_base   = b_corr / max(1, b_tot)\n# print(f'ToM probe: trained = {tom_acc:.1%} ({t_corr}/{t_tot})   base = {tom_acc_base:.1%} ({b_corr}/{b_tot})')\n# with open(WORK_DIR / 'tom.json', 'w') as f:\n#     json.dump({'trained': {'correct': t_corr, 'total': t_tot, 'accuracy': tom_acc},\n#                'base':    {'correct': b_corr, 'total': b_tot, 'accuracy': tom_acc_base}}, f)","metadata":{},"outputs":[],"execution_count":null},{"id":"31fcdf77-b9aa-4363-8e32-c5164cc668a7","cell_type":"markdown","source":"## 16. 
Push to HF Hub","metadata":{}},{"id":"9b3c0dcf-5506-4e34-888d-f73663443263","cell_type":"code","source":"# from huggingface_hub import HfApi\n# ADAPTER_REPO = os.environ.get('ADAPTER_REPO', 'StavanKhobare/SST-MetaxPyTorch-Hackathon-LoRA')\n# MERGED_REPO  = os.environ.get('MERGED_REPO',  'StavanKhobare/SST-MetaxPyTorch-Hackathon-Merged16bit')\n\n# api = HfApi()\n# api.create_repo(ADAPTER_REPO, repo_type='model', private=False, exist_ok=True)\n# api.create_repo(MERGED_REPO,  repo_type='model', private=False, exist_ok=True)\n\n# try:\n#     model.push_to_hub(ADAPTER_REPO, private=False)\n#     tokenizer.push_to_hub(ADAPTER_REPO, private=False)\n#     print(f'\\u2713 LoRA pushed: https://huggingface.co/{ADAPTER_REPO}')\n# except Exception as e:\n#     print(f'LoRA push failed: {e!r}')\n\n# try:\n#     model.push_to_hub_merged(MERGED_REPO, tokenizer, save_method='merged_16bit', private=False)\n#     print(f'\\u2713 Merged 16-bit pushed: https://huggingface.co/{MERGED_REPO}')\n# except Exception as e:\n#     print(f'Merged push failed (you can retry): {e!r}')\n\n# try:\n#     api.upload_folder(folder_path=str(ASSETS), repo_id=ADAPTER_REPO,\n#                       path_in_repo='assets', repo_type='model')\n#     for fname in ['log_history.json','eval_history.json','eval_paired.json',\n#                   'stats_summary.json','tom.json','transcripts.json',\n#                   'decision_counter.json','baseline.json',\n#                   'per_event_winrate.json']:\n#         fp = WORK_DIR / fname\n#         if fp.exists():\n#             api.upload_file(path_or_fileobj=str(fp), path_in_repo=fname,\n#                             repo_id=ADAPTER_REPO, repo_type='model')\n#     print(f'\\u2713 Artifacts uploaded to https://huggingface.co/{ADAPTER_REPO}')\n# except Exception as e:\n#     print(f'Artifact upload failed: {e!r}')","metadata":{},"outputs":[],"execution_count":null},{"id":"b945bb18-26d7-4ecb-9fe5-a681dcb221d8","cell_type":"markdown","source":"## 17. 
Final summary","metadata":{}},{"id":"bf13ed07-5225-49f2-b80f-7e92c3f6ee3a","cell_type":"code","source":"import math\n# Final report. Every name other than `math` must already exist in the kernel:\n# decision_counter, slope, p_val, rewards_ema, fmts, tf, bf, summary, tom_acc,\n# tom_acc_base, ADAPTER_REPO, MERGED_REPO, ENV_BASE_URL.\n# NOTE(review): the banner below says 1.7B, yet the saved output of this cell shows a\n# merged repo named Qwen3-0.6B-Final-Merged16bit - confirm which base model was trained.\n\n# Decision entropy (over GRPO rollouts), in nats; the 1e-12 term guards against log(0).\n_total = sum(decision_counter.values())\n_probs = [c / _total for c in decision_counter.values()] if _total else []\nentropy = -sum(p * math.log(p + 1e-12) for p in _probs) if _probs else 0.0\n# Upper bound: a uniform distribution over the distinct decisions recorded.\nmax_ent = math.log(len(decision_counter)) if decision_counter else 0.0\n\n# Derive the verdict from the numbers instead of always printing 'not collapsed'.\n# The 0.5 cut-off on entropy / max_ent is a heuristic, not a statistical test.\nif not _total:\n    entropy_verdict = 'no rollouts recorded'\nelif max_ent > 0 and entropy / max_ent >= 0.5:\n    entropy_verdict = 'not collapsed'\nelse:\n    entropy_verdict = 'collapsed'\n\nprint('='*70)\nprint('BOARDSIM \\u00d7 QWEN3-1.7B \\u2014 LEARNING EVIDENCE')\nprint('='*70)\nprint(f'Reward slope (linear fit) : {slope:+.5f}/step  (p={p_val:.2e})')\nprint(f'Reward EMA first 20 steps : {rewards_ema[:20].mean():+.3f}')\nprint(f'Reward EMA last 20 steps  : {rewards_ema[-20:].mean():+.3f}')\nprint(f'Format compliance start   : {fmts[:20].mean():.0%}')\nprint(f'Format compliance end     : {fmts[-20:].mean():.0%}')\nprint('-'*70)\nprint(f'Held-out paired (n={len(tf)}):  fine-tuned {tf.mean():.2f}  vs  base {bf.mean():.2f}')\nprint(f'  paired t-test p={summary[\"paired_t_p\"]:.2e}   Wilcoxon p={summary[\"wilcoxon_p_greater\"]:.2e}')\nprint(f'  Cohen d={summary[\"cohens_d\"]:+.2f}   95% CI of lift = [{summary[\"paired_diff_95ci\"][0]:+.2f}, {summary[\"paired_diff_95ci\"][1]:+.2f}]')\nprint(f'  win rate (fine-tuned > base): {summary[\"win_rate_trained_strictly_better\"]:.0%}')\nprint(f'ToM probe  fine-tuned     : {tom_acc:.0%}    base = {tom_acc_base:.0%}')\nprint(f'Decision entropy          : {entropy:.2f} / {max_ent:.2f}  (\\u2192 {entropy_verdict})')\nprint('-'*70)\nprint(f'Adapter      : https://huggingface.co/{ADAPTER_REPO}')\nprint(f'Merged 16bit : https://huggingface.co/{MERGED_REPO}')\nprint(f'Env Space    : 
{ENV_BASE_URL}')\nprint('='*70)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:09:32.888470Z","iopub.execute_input":"2026-04-26T08:09:32.889230Z","iopub.status.idle":"2026-04-26T08:09:32.897862Z","shell.execute_reply.started":"2026-04-26T08:09:32.889184Z","shell.execute_reply":"2026-04-26T08:09:32.897178Z"}},"outputs":[{"name":"stdout","text":"======================================================================\nBOARDSIM × QWEN3-1.7B — LEARNING EVIDENCE\n======================================================================\nReward slope (linear fit) : -0.06447/step  (p=3.51e-02)\nReward EMA first 20 steps : +5.048\nReward EMA last 20 steps  : +3.684\nFormat compliance start   : 91%\nFormat compliance end     : 99%\n----------------------------------------------------------------------\nHeld-out paired (n=20):  fine-tuned 45.93  vs  base 47.37\n  paired t-test p=6.66e-01   Wilcoxon p=8.64e-01\n  Cohen d=-0.10   95% CI of lift = [-7.36, +5.20]\n  win rate (fine-tuned > base): 15%\nToM probe  fine-tuned     : 6%    base = 0%\nDecision entropy          : 2.76 / 3.18  (→ not collapsed)\n----------------------------------------------------------------------\nAdapter      : https://huggingface.co/StavanKhobare/neuraledge-boardroom-qwen3-lora\nMerged 16bit : https://huggingface.co/StavanKhobare/Qwen3-0.6B-Final-Merged16bit\nEnv Space    : https://stavankhobare-sst-metaxpytorch-hackathon.hf.space\n======================================================================\n","output_type":"stream"}],"execution_count":14}]}