Spaces:

StavanKhobare
/

SST-MetaxPyTorch-Hackathon

Sleeping

App Files Files Community

SST-MetaxPyTorch-Hackathon / Training.py

StavanKhobare

Update documentation, add blog, and simplify inference script

312c390 about 1 month ago

raw

history blame contribute delete

33.4 kB

	# IMPORTANT: install unsloth + its zoo BEFORE anything else, because unsloth
	# patches torch/transformers at import time. If transformers loads first, the
	# patches don't apply and 4-bit LoRA training silently runs in a slow path.
	# NOTE: `%pip` is an IPython/Jupyter magic, so the install lines below only
	# work inside a notebook kernel; they are not valid plain-Python statements.
	%pip install -q --no-deps unsloth
	%pip install -q unsloth_zoo
	%pip install -q "openenv-core==0.2.3" "trl>=0.12,<2.0" "transformers>=4.45,<5.0" \
	"datasets>=3.0" "accelerate>=1.0" "huggingface_hub>=0.25" "pydantic>=2.0" \
	wandb matplotlib python-dotenv bitsandbytes scipy scikit-learn sentence-transformers
	import os, pathlib

	# Source 1: Colab Secrets. Best effort only - outside Colab the import fails
	# and we simply move on; a secret that cannot be read is skipped.
	try:
	    from google.colab import userdata # type: ignore
	except Exception:
	    pass
	else:
	    for secret_name in ('HF_TOKEN', 'WANDB_API_KEY', 'ENV_BASE_URL', 'ADAPTER_REPO'):
	        try:
	            secret_value = userdata.get(secret_name)
	            if secret_value:
	                # setdefault: never clobber a value already in the environment.
	                os.environ.setdefault(secret_name, secret_value)
	        except Exception:
	            continue

	# Source 2: the first .env file found among the known locations (local runs).
	try:
	    from dotenv import load_dotenv
	    env_candidates = (
	        pathlib.Path('.env'),
	        pathlib.Path('../.env'),
	        pathlib.Path('/content/repo/.env'),
	    )
	    env_file = next((c for c in env_candidates if c.exists()), None)
	    if env_file is not None:
	        load_dotenv(env_file, override=False)
	        print(f'Loaded env from {env_file.resolve()}')
	except Exception:
	    pass

	# Source 3: ask interactively for whatever is still missing.
	for env_key, question in (
	    ('HF_TOKEN', 'HF token: '),
	    ('WANDB_API_KEY', 'WandB key (or blank to skip): '),
	):
	    if not os.environ.get(env_key):
	        os.environ[env_key] = input(question).strip()

	from huggingface_hub import login as hf_login
	hf_login(token=os.environ['HF_TOKEN'], add_to_git_credential=False)
	print('HF auth ok.')
	# W&B is optional: only log in when a non-empty key was supplied.
	if os.environ.get('WANDB_API_KEY'):
	    import wandb
	    wandb.login(key=os.environ['WANDB_API_KEY'])
	    print('W&B auth ok.')
	import os, pathlib

	# A '/content' directory is used as the marker for running inside Colab.
	IN_COLAB = os.path.isdir('/content')
	if IN_COLAB:
	    from google.colab import drive
	    drive.mount('/content/drive', force_remount=False)

	# Run outputs go to Google Drive on Colab, to a local folder otherwise.
	DRIVE_DIR = pathlib.Path(
	    '/content/drive/MyDrive/BoardSim_Run' if IN_COLAB else './BoardSim_Run'
	)
	DRIVE_DIR.mkdir(parents=True, exist_ok=True)

	ASSETS = DRIVE_DIR / 'assets'       # plots
	CKPT = DRIVE_DIR / 'lora_qwen3_4b'  # adapter checkpoints
	for _subdir in (ASSETS, CKPT):
	    _subdir.mkdir(exist_ok=True)
	print('DRIVE_DIR =', DRIVE_DIR)
	import os, sys, subprocess, importlib, urllib.request, json as _json

	ENV_BASE_URL = os.environ.get(
	    'ENV_BASE_URL',
	    'https://stavankhobare-sst-metaxpytorch-hackathon.hf.space',
	)
	REPO_URL = 'https://github.com/StavanRKhobare/SST-MetaxPyTorch-Hackathon'

	REPO_DIR = '/content/repo' if IN_COLAB else os.path.abspath('./repo')
	# Existing checkout -> best-effort fast-forward; otherwise a shallow clone
	# (which must succeed, hence check=True).
	if os.path.isdir(os.path.join(REPO_DIR, '.git')):
	    subprocess.run(['git', '-C', REPO_DIR, 'pull', '--ff-only'], check=False)
	else:
	    subprocess.run(['git', 'clone', '--depth', '1', REPO_URL, REPO_DIR], check=True)

	ENVS_DIR = os.path.join(REPO_DIR, 'envs')
	if ENVS_DIR not in sys.path:
	    sys.path.insert(0, ENVS_DIR)

	# Drop any previously imported copy of the package so the imports below pick
	# up the files from the checkout.
	_stale_modules = [
	    name for name in list(sys.modules)
	    if name == 'board_sim_env' or name.startswith('board_sim_env.')
	]
	for name in _stale_modules:
	    del sys.modules[name]

	from board_sim_env.client import BoardSimEnv
	from board_sim_env.models import BoardSimAction, BoardSimObservation

	# Non-fatal reachability probe of the environment server.
	_health_url = f'{ENV_BASE_URL.rstrip("/")}/health'
	try:
	    with urllib.request.urlopen(_health_url, timeout=20) as response:
	        h = _json.loads(response.read())
	    print('health:', h)
	except Exception as e:
	    print(f'WARN: could not reach {ENV_BASE_URL}/health ({e})')

	def make_env():
	    """Create a new BoardSimEnv client pointed at ENV_BASE_URL."""
	    client = BoardSimEnv(base_url=ENV_BASE_URL)
	    return client

	print('BoardSimEnv ready.')
	# -----------------------------------------------------------------------------
	# Load the 4-bit base model through unsloth and report GPU memory usage.
	# -----------------------------------------------------------------------------
	import unsloth # noqa: F401
	from unsloth import FastLanguageModel
	import torch
	import re

	MODEL_NAME = 'Qwen/Qwen3-0.6B'
	MAX_SEQ_LEN = 2048

	model, tokenizer = FastLanguageModel.from_pretrained(
	    model_name=MODEL_NAME,
	    max_seq_length=MAX_SEQ_LEN,
	    load_in_4bit=True,
	    dtype=None,
	)
	# Fall back to EOS as the padding token when the tokenizer defines none.
	if tokenizer.pad_token is None:
	    tokenizer.pad_token = tokenizer.eos_token

	device = next(model.parameters()).device
	print(f'Loaded {MODEL_NAME} on {device}.')
	if torch.cuda.is_available():
	    mem_gb = torch.cuda.memory_allocated() / 1e9
	    # Ask the driver for the real capacity of the active GPU instead of
	    # assuming one specific card.
	    total_gb = torch.cuda.get_device_properties(
	        torch.cuda.current_device()).total_memory / 1e9
	    print(f'GPU memory after base load: {mem_gb:.2f} GB / {total_gb:.2f} GB')
	    print(f'Headroom for compute: {total_gb - mem_gb:.2f} GB')
	else:
	    mem_gb = 0.0
	    print('No CUDA device visible; skipping GPU memory report.')
	# Generic CEO prompt — applies to any organization, not a specific industry.
	# It describes the four board roles and the weighted-vote rule, and asks the
	# model for a two-line reply (DECISION / PITCH). build_prompt() places this
	# text at the start of every prompt.
	SYSTEM_PROMPT = """You are the CEO of a mid-stage organization. Your board has 4 members with HIDDEN AGENDAS you cannot see directly:
	- CTO: cares about operational excellence, engineering quality, team morale, and product readiness.
	- CFO: cares about cash discipline, runway, and regulatory safety.
	- Investor Rep: pushes growth, market share, and bold returns.
	- Independent: cares about reputation, governance, and long-term consensus.

	Each round you see a strategic event, every NPC's pre-vote statement, and 3 options.
	Your decision is resolved by WEIGHTED VOTE (your weight 2.5x). A short COALITION PITCH
	that is semantically aligned with opposing members' priorities can swing them toward your pick —
	write substantive arguments, not just buzzwords.

	Respond in EXACTLY this format on two lines:
	DECISION: <one of the option strings>
	PITCH: <one or two sentences arguing for it, addressing the concerns of opposing members>"""

	# Tag parsers for the two-line reply format. Whitespace around the colon is
	# optional (`\s*`), so both "DECISION: x" and "DECISION : x" are accepted;
	# requiring exactly one whitespace before the colon would reject the very
	# format the prompt asks for. Each capture group stops at the end of the line.
	DECISION_RE = re.compile(r'DECISION\s*:\s*([A-Za-z0-9_\- ]+)', re.IGNORECASE)
	PITCH_RE = re.compile(r'PITCH\s*:\s*(.+)', re.IGNORECASE)

	def build_prompt(obs):
	    """Render one observation as the full text prompt for the model.

	    The prompt is SYSTEM_PROMPT, a blank line, a one-line summary of the
	    numeric state, the event, one line per entry of obs.npc_statements,
	    and the list of options.
	    """
	    board_lines = []
	    for npc in obs.npc_statements:
	        board_lines.append(
	            f" {npc['role']} ({npc['confidence']:.2f}): votes {npc['vote']} - {npc['statement']}"
	        )
	    statements = '\n'.join(board_lines)

	    st = obs.state
	    pieces = [
	        f"{SYSTEM_PROMPT}\n\n",
	        f"State: revenue=${st['revenue']:.0f}/yr burn=${st['burn_rate']:.0f}/mo ",
	        f"runway={st['runway_months']:.1f}mo morale={st['team_morale']:.2f} ",
	        f"investors={st['investor_confidence']:.2f} reg_risk={st['regulatory_risk']:.2f}\n",
	        f"Event: {obs.event}\nBoard:\n{statements}\n",
	        f"Options: {obs.options}\n",
	    ]
	    return ''.join(pieces)

	def parse_completion(completion: str, options):
	    """Extract the chosen option and the pitch from a model completion.

	    Args:
	        completion: raw text generated by the model.
	        options: non-empty sequence of option strings; options[0] is the
	            fallback when nothing in the completion matches.

	    Returns:
	        (decision, pitch, format_ok). `decision` is always one of `options`.
	        `pitch` is the text after the PITCH tag, stripped and capped at 400
	        characters ('' when the tag is missing). `format_ok` is True only
	        if BOTH tags were found by the regexes.

	    Matching is case-insensitive. An option that equals the DECISION text
	    exactly always wins. Otherwise the LONGEST option contained in the text
	    is taken, so a short option that is a prefix or substring of another
	    (e.g. 'invest' vs 'invest_more') cannot shadow the more specific one.
	    """
	    def _longest_contained(haystack):
	        # max() keeps the first of equally long hits, preserving option order.
	        hits = [opt for opt in options if opt.lower() in haystack]
	        return max(hits, key=len) if hits else None

	    decision = options[0]
	    decision_ok = False

	    dm = DECISION_RE.search(completion)
	    if dm:
	        cand = dm.group(1).strip().lower()
	        exact = [opt for opt in options if opt.lower() == cand]
	        picked = exact[0] if exact else _longest_contained(cand)
	        if picked is not None:
	            decision = picked
	            decision_ok = True

	    if not decision_ok:
	        # No usable DECISION tag: fall back to any option mentioned anywhere.
	        picked = _longest_contained(completion.lower())
	        if picked is not None:
	            decision = picked

	    pm = PITCH_RE.search(completion)
	    pitch = pm.group(1).strip()[:400] if pm else ''
	    format_ok = bool(dm) and bool(pm)
	    return decision, pitch, format_ok

	MAX_NEW_TOKENS = 80

	def greedy_action(obs):
	    """Generate one deterministic (non-sampled) reply for `obs` and parse it.

	    Returns the (decision, pitch, format_ok) tuple from parse_completion.
	    """
	    enc = tokenizer(
	        build_prompt(obs),
	        return_tensors='pt', truncation=True, max_length=1024,
	    ).to(device)
	    generation_args = dict(
	        max_new_tokens=MAX_NEW_TOKENS,
	        do_sample=False,
	        pad_token_id=tokenizer.eos_token_id,
	    )
	    with torch.no_grad():
	        out = model.generate(**enc, **generation_args)
	    # Decode only the newly generated tokens, not the echoed prompt.
	    prompt_tokens = enc.input_ids.shape[1]
	    completion = tokenizer.decode(out[0][prompt_tokens:], skip_special_tokens=True)
	    return parse_completion(completion, obs.options)
	import random, statistics, json

	MAX_STEPS_PER_EP = 20

	def run_episode(env, seed):
	    """Play one episode with whatever model state is active right now.

	    The loop stops when the env reports `done` or after MAX_STEPS_PER_EP
	    steps. Returns a dict with the final profitability score, the summed
	    reward, the number of steps, the fraction of steps with a well-formed
	    reply / a non-empty pitch, and the env's history list.
	    """
	    result = env.reset(seed=seed)
	    obs = result.observation
	    total_reward = 0.0
	    steps_taken = 0
	    well_formed = 0
	    with_pitch = 0
	    while not result.done and steps_taken < MAX_STEPS_PER_EP:
	        decision, pitch, fmt_ok = greedy_action(obs)
	        well_formed += 1 if fmt_ok else 0
	        with_pitch += 1 if pitch.strip() else 0
	        action = BoardSimAction(decision=decision, coalition_pitch=pitch)
	        result = env.step(action)
	        obs = result.observation
	        # A missing (None) reward counts as zero.
	        total_reward += float(result.reward or 0.0)
	        steps_taken += 1

	    denom = max(1, steps_taken)  # avoid dividing by zero on empty episodes
	    metrics = {
	        'final_profit': obs.state['profitability_score'],
	        'ep_reward': total_reward,
	        'steps': steps_taken,
	        'format_rate': well_formed / denom,
	        'pitch_rate': with_pitch / denom,
	        'history': obs.state.get('history', []),
	    }
	    return metrics
	# -----------------------------------------------------------------------------
	# BASELINE — the untouched base model (MODEL_NAME, no fine-tuning).
	# Measuring the same model before training gives the like-for-like
	# reference for what fine-tuning adds; a random policy would be a much
	# weaker yardstick for a language model picking among 3 option strings.
	# -----------------------------------------------------------------------------
	BASELINE_SEEDS = list(range(50_000, 50_000 + 100)) # held out from training

	base_finals, base_rewards, base_fmts, base_pitches = [], [], [], []
	_baseline_sinks = (
	    (base_finals, 'final_profit'),
	    (base_rewards, 'ep_reward'),
	    (base_fmts, 'format_rate'),
	    (base_pitches, 'pitch_rate'),
	)
	with make_env().sync() as env:
	    for finished, seed in enumerate(BASELINE_SEEDS, start=1):
	        episode = run_episode(env, seed)
	        for sink, key in _baseline_sinks:
	            sink.append(episode[key])
	        if finished % 10 == 0:
	            print(f' base Qwen3-0.6B {finished}/{len(BASELINE_SEEDS)} profit={episode["final_profit"]:.1f}')

	BASELINE_MEAN_PROFIT = statistics.mean(base_finals)
	BASELINE_MEAN_REWARD = statistics.mean(base_rewards)
	print(f'Base Qwen3-0.6B profit : {BASELINE_MEAN_PROFIT:.2f} \u00b1 {statistics.stdev(base_finals):.2f}')
	print(f'Base Qwen3-0.6B ep rwd : {BASELINE_MEAN_REWARD:.2f} \u00b1 {statistics.stdev(base_rewards):.2f}')
	print(f'Base format rate : {statistics.mean(base_fmts):.0%} pitch rate: {statistics.mean(base_pitches):.0%}')

	# Persist the raw per-seed numbers next to the other run artifacts.
	_baseline_payload = {
	    'model': MODEL_NAME,
	    'mode': 'base_no_finetune',
	    'seeds': BASELINE_SEEDS,
	    'finals': base_finals,
	    'rewards': base_rewards,
	    'format_rates': base_fmts,
	    'pitch_rates': base_pitches,
	}
	with open(DRIVE_DIR / 'baseline.json', 'w') as f:
	    json.dump(_baseline_payload, f)
	# -----------------------------------------------------------------------------
	# Attach LoRA adapters to the base model. After this point `model` is the
	# adapter-wrapped model; base-model behaviour remains available through
	# `with model.disable_adapter(): ...`.
	# -----------------------------------------------------------------------------
	_lora_targets = [
	    'q_proj', 'k_proj', 'v_proj', 'o_proj',
	    'gate_proj', 'up_proj', 'down_proj',
	]
	model = FastLanguageModel.get_peft_model(
	    model,
	    r=32,
	    target_modules=_lora_targets,
	    lora_alpha=64,
	    lora_dropout=0.0,
	    bias='none',
	    use_gradient_checkpointing='unsloth',
	    random_state=3407,
	)

	# Count parameters in a single pass over the model.
	trainable = 0
	total = 0
	for _param in model.parameters():
	    _size = _param.numel()
	    total += _size
	    if _param.requires_grad:
	        trainable += _size
	print(f'Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)')
	EVAL_SEEDS = list(range(60_000, 60_000 + 10)) # held out from training

	def periodic_eval(env):
	    """Run one greedy episode per EVAL_SEEDS entry and average the metrics.

	    Returns a dict with the mean final profit, mean episode reward, mean
	    format rate and mean pitch rate over those episodes.
	    """
	    episodes = [run_episode(env, seed) for seed in EVAL_SEEDS]
	    import numpy as np

	    def _mean_of(key):
	        return float(np.mean([ep[key] for ep in episodes]))

	    return {
	        'profit_mean': _mean_of('final_profit'),
	        'reward_mean': _mean_of('ep_reward'),
	        'format_rate': _mean_of('format_rate'),
	        'pitch_rate': _mean_of('pitch_rate'),
	    }
	import os, json, math, time, collections
	from torch.optim import AdamW

	# Hyper-parameters; NUM_STEPS and GROUP_SIZE can be overridden via env vars.
	NUM_STEPS = int(os.environ.get('NUM_STEPS', 200))
	GROUP_SIZE = int(os.environ.get('GROUP_SIZE', 4))
	LR = 5e-6
	GRAD_CLIP = 1.0
	TEMPERATURE, TOP_P = 1.0, 0.95
	SAVE_EVERY = 25
	EVAL_AT = {0, 25, 50, 100, 150, NUM_STEPS - 1}

	# Optional Weights & Biases run. `finish_previous` is tried first; a
	# TypeError (keyword not accepted) triggers a retry with `reinit`. Any other
	# failure only disables W&B logging.
	WANDB_OK = False
	if os.environ.get('WANDB_API_KEY'):
	    _wandb_args = dict(
	        project='boardsim-qwen3-grpo',
	        name='boardsim-qwen3-grpo-v3',
	        config={'num_steps': NUM_STEPS, 'group_size': GROUP_SIZE, 'lr': LR,
	                'temperature': TEMPERATURE, 'top_p': TOP_P, 'model': MODEL_NAME},
	    )
	    try:
	        import wandb
	        wandb.init(**_wandb_args, finish_previous=True)
	        WANDB_OK = True
	    except TypeError:
	        wandb.init(**_wandb_args, reinit=True)
	        WANDB_OK = True
	    except Exception as e:
	        print(f'WARN: wandb.init failed: {e}')

	# Only parameters that require gradients are handed to the optimizer.
	_optim_params = [p for p in model.parameters() if p.requires_grad]
	optimizer = AdamW(_optim_params, lr=LR, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0)

	log_history = []                          # one record per training step
	eval_history = []                         # one record per periodic eval
	decision_counter = collections.Counter()  # how often each decision was sampled
	t0 = time.time()

	# Token ids that end a completion. Generation pads finished sequences with
	# `eos_token_id`, which is not necessarily `pad_token_id`; deriving the mask
	# from `pad_token_id` alone would then treat that padding as real tokens and
	# include it in the loss. Everything after the FIRST stop token is therefore
	# masked out explicitly below.
	_stop_ids = {tokenizer.eos_token_id, tokenizer.pad_token_id}
	_gen_cfg_eos = getattr(getattr(model, 'generation_config', None), 'eos_token_id', None)
	if isinstance(_gen_cfg_eos, (list, tuple)):
	    _stop_ids.update(_gen_cfg_eos)
	elif _gen_cfg_eos is not None:
	    _stop_ids.add(_gen_cfg_eos)
	_stop_ids.discard(None)
	_stop_ids_t = torch.tensor(sorted(_stop_ids), dtype=torch.long, device=device)

	# ONE persistent env per role for the whole training loop:
	#   env_train -> supplies the prompt for each step
	#   env_score -> scores every sampled completion from the same seed
	#   env_eval  -> periodic held-out evaluation
	with make_env().sync() as env_train, make_env().sync() as env_score, make_env().sync() as env_eval:
	    for step in range(NUM_STEPS):
	        result = env_train.reset(seed=step)
	        obs = result.observation
	        prompt = build_prompt(obs)
	        enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)
	        prompt_len = enc.input_ids.shape[1]

	        # Sample GROUP_SIZE completions for the same prompt (no gradients).
	        with torch.no_grad():
	            gen_out = model.generate(
	                input_ids=enc.input_ids, attention_mask=enc.attention_mask,
	                max_new_tokens=MAX_NEW_TOKENS, do_sample=True,
	                temperature=TEMPERATURE, top_p=TOP_P,
	                num_return_sequences=GROUP_SIZE,
	                pad_token_id=tokenizer.eos_token_id,
	            )
	        gen_out = gen_out.detach().clone()

	        # Score each completion with a single env step from the same seed.
	        decisions, pitches, rewards, fmt_oks = [], [], [], []
	        for g in range(GROUP_SIZE):
	            comp = tokenizer.decode(gen_out[g][prompt_len:], skip_special_tokens=True)
	            d, pp, ok = parse_completion(comp, obs.options)
	            decisions.append(d); pitches.append(pp); fmt_oks.append(ok)
	            decision_counter[d] += 1
	            env_score.reset(seed=step)
	            sr = env_score.step(BoardSimAction(decision=d, coalition_pitch=pp))
	            rewards.append(float(sr.reward or 0.0))

	        # Group-relative advantages: centre on the group mean and, when the
	        # group has any spread, scale by the group standard deviation.
	        rewards_t = torch.tensor(rewards, dtype=torch.float32, device=device)
	        if rewards_t.numel() > 1 and rewards_t.std().item() > 1e-6:
	            advantages = (rewards_t - rewards_t.mean()) / (rewards_t.std() + 1e-8)
	        else:
	            advantages = rewards_t - rewards_t.mean()

	        optimizer.zero_grad()
	        full_ids = gen_out
	        # Keep completion tokens up to and including the first stop token;
	        # `stops_before` counts stop tokens strictly before each position.
	        completion_ids = full_ids[:, prompt_len:]
	        is_stop = torch.isin(completion_ids, _stop_ids_t).long()
	        stops_before = is_stop.cumsum(dim=1) - is_stop
	        completion_mask = (stops_before == 0).long()
	        attn = torch.cat(
	            [torch.ones_like(full_ids[:, :prompt_len]), completion_mask], dim=1)
	        loss_mask = attn.clone()
	        loss_mask[:, :prompt_len] = 0  # the prompt never contributes to the loss
	        out = model(input_ids=full_ids, attention_mask=attn)
	        # Shift by one so position t predicts token t+1.
	        logits = out.logits[:, :-1, :].float()
	        targets = full_ids[:, 1:]
	        mask = loss_mask[:, 1:].float()
	        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
	        token_nll = -log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)
	        # Mean NLL over each sequence's own completion tokens.
	        per_seq_nll = (token_nll * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
	        loss = (advantages.detach() * per_seq_nll).mean()
	        loss.backward()
	        total_loss_val = float(loss.detach().item())
	        torch.nn.utils.clip_grad_norm_(
	            [p for p in model.parameters() if p.requires_grad], GRAD_CLIP)
	        optimizer.step()

	        rec = {
	            'step': step,
	            'reward': float(rewards_t.mean().item()),
	            'reward_std': float(rewards_t.std().item()) if rewards_t.numel() > 1 else 0.0,
	            'reward_max': float(rewards_t.max().item()),
	            'loss': total_loss_val,
	            'format_rate': sum(fmt_oks) / GROUP_SIZE,
	            'pitch_rate': sum(1 for p in pitches if p.strip()) / GROUP_SIZE,
	            'elapsed_s': time.time() - t0,
	        }
	        log_history.append(rec)
	        if WANDB_OK:
	            wandb.log(rec, step=step)

	        if step % 5 == 0:
	            print(f"step={step:4d} reward={rec['reward']:+.3f} (\u00b1{rec['reward_std']:.2f}) "
	                  f"loss={rec['loss']:+.4f} fmt={rec['format_rate']:.0%} "
	                  f"elapsed={rec['elapsed_s']:.0f}s d0={decisions[0]}")

	        if step in EVAL_AT:
	            ev = periodic_eval(env_eval)
	            ev['step'] = step
	            eval_history.append(ev)
	            print(f" [eval@{step}] profit={ev['profit_mean']:.2f} "
	                  f"reward={ev['reward_mean']:.2f} fmt={ev['format_rate']:.0%}")
	            if WANDB_OK:
	                wandb.log({f'eval/{k}': v for k, v in ev.items() if k != 'step'}, step=step)

	        # Periodic checkpoint of the adapter, tokenizer and logs.
	        if step > 0 and step % SAVE_EVERY == 0:
	            model.save_pretrained(str(CKPT))
	            tokenizer.save_pretrained(str(CKPT))
	            with open(DRIVE_DIR / 'log_history.json', 'w') as f:
	                json.dump(log_history, f)
	            with open(DRIVE_DIR / 'eval_history.json', 'w') as f:
	                json.dump(eval_history, f)

	# Final checkpoint plus all training logs.
	model.save_pretrained(str(CKPT))
	tokenizer.save_pretrained(str(CKPT))
	_final_dumps = (
	    ('log_history.json', log_history),
	    ('eval_history.json', eval_history),
	    ('decision_counter.json', dict(decision_counter)),
	)
	for _fname, _payload in _final_dumps:
	    with open(DRIVE_DIR / _fname, 'w') as f:
	        json.dump(_payload, f)
	if WANDB_OK:
	    wandb.finish()
	print(f'Training done. {len(log_history)} steps in {time.time() - t0:.0f}s. -> {CKPT}')
	import numpy as np, matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	from scipy import stats as spstats

	def _log_column(key):
	    """Collect one field of every log_history record into a NumPy array."""
	    return np.array([record[key] for record in log_history])

	# Per-step training series used by the plots and the final report.
	steps = _log_column('step')
	rewards = _log_column('reward')
	losses = _log_column('loss')
	fmts = _log_column('format_rate')
	pitches = _log_column('pitch_rate')

	def ema(xs, alpha=0.1):
	    """Exponential moving average of `xs`, seeded with its first element.

	    Returns a NumPy array with one smoothed value per input value; an
	    empty input yields an empty array.
	    """
	    smoothed = []
	    if len(xs):
	        state = xs[0]
	        for value in xs:
	            state = alpha * value + (1 - alpha) * state
	            smoothed.append(state)
	    return np.array(smoothed)

	rewards_ema = ema(rewards, 0.1)
	slope, intercept, r_val, p_val, _ = spstats.linregress(steps, rewards)

	def _finish_figure(filename, with_legend=True):
	    """Add legend/grid, tighten the layout, save into ASSETS and close."""
	    if with_legend:
	        plt.legend()
	    plt.grid(alpha=0.3)
	    plt.tight_layout()
	    plt.savefig(ASSETS / filename, dpi=150)
	    plt.close()

	# Reward curve, with the untuned base model's mean reward as reference line.
	plt.figure(figsize=(9, 5))
	plt.plot(steps, rewards, alpha=0.3, lw=1, label='per-step group reward')
	plt.plot(steps, rewards_ema, lw=2.2, label='EMA (\u03b1=0.1)')
	plt.plot(steps, intercept + slope * steps, '--', lw=1.5,
	         label=f'linear fit slope={slope:+.4f}/step (p={p_val:.1e})')
	plt.axhline(BASELINE_MEAN_REWARD, ls=':', lw=2, color='#c44',
	            label=f'base Qwen3-0.6B baseline = {BASELINE_MEAN_REWARD:.2f}')
	plt.title('GRPO reward — BoardSim (vs same model w/o fine-tuning)')
	plt.xlabel('step')
	plt.ylabel('mean group reward')
	_finish_figure('reward_curve.png')

	# Training loss per step.
	plt.figure(figsize=(9, 5))
	plt.plot(steps, losses, lw=1.5)
	plt.title('GRPO loss (advantage \u00d7 NLL)')
	plt.xlabel('step')
	plt.ylabel('loss')
	_finish_figure('loss_curve.png', with_legend=False)

	# Smoothed format-compliance and pitch-usage rates.
	plt.figure(figsize=(9, 5))
	plt.plot(steps, ema(fmts, 0.05), lw=2, label='format-OK rate (EMA)')
	plt.plot(steps, ema(pitches, 0.05), lw=2, label='non-empty pitch rate (EMA)')
	plt.title('Format compliance + pitch usage during training')
	plt.xlabel('step')
	plt.ylabel('rate')
	plt.ylim(-0.05, 1.05)
	_finish_figure('format_compliance.png')

	# Periodic held-out eval, drawn against the base model's mean profitability.
	if eval_history:
	    es, epm, erm = [], [], []
	    for entry in eval_history:
	        es.append(entry['step'])
	        epm.append(entry['profit_mean'])
	        erm.append(entry['reward_mean'])
	    plt.figure(figsize=(9, 5))
	    plt.plot(es, epm, '-o', lw=2, label='held-out profitability (mean of 10 episodes)')
	    plt.plot(es, erm, '-s', lw=2, label='held-out episode reward')
	    plt.axhline(BASELINE_MEAN_PROFIT, ls=':', lw=1.5, color='#c44',
	                label=f'base Qwen3-0.6B profitability = {BASELINE_MEAN_PROFIT:.2f}')
	    plt.title('Periodic held-out eval during training (greedy)')
	    plt.xlabel('training step')
	    plt.ylabel('value')
	    _finish_figure('periodic_eval.png')

	print(f'Linear-fit slope on reward: {slope:+.5f}/step (p={p_val:.2e}, R\u00b2={r_val**2:.3f})')
	print('Saved reward_curve.png, loss_curve.png, format_compliance.png, periodic_eval.png')
	# -----------------------------------------------------------------------------
	# Headline comparison: fine-tuned model vs the same model with the LoRA
	# adapters switched off, on identical env seeds. Prompts, decoding and
	# parsing are shared, so the adapters are the only difference.
	# -----------------------------------------------------------------------------
	from unsloth import FastLanguageModel
	FastLanguageModel.for_inference(model)

	EVAL_N = 50
	PAIRED_SEEDS = list(range(70_000, 70_000 + EVAL_N))

	_episode_fields = ('final_profit', 'ep_reward', 'format_rate', 'pitch_rate', 'history')

	def _store_episode(episode, sinks):
	    """Append each tracked field of `episode` to its matching list."""
	    for sink, field in zip(sinks, _episode_fields):
	        sink.append(episode[field])

	# Pass 1: adapters active (trained policy).
	trained_finals, trained_rewards, trained_fmt, trained_pitch = [], [], [], []
	trained_history_per_seed = []
	_trained_sinks = (trained_finals, trained_rewards, trained_fmt,
	                  trained_pitch, trained_history_per_seed)
	with make_env().sync() as env:
	    for idx, seed in enumerate(PAIRED_SEEDS):
	        episode = run_episode(env, seed)
	        _store_episode(episode, _trained_sinks)
	        if (idx + 1) % 10 == 0:
	            print(f' trained {idx+1}/{EVAL_N} profit={episode["final_profit"]:.1f}')

	# Pass 2: same seeds with the adapters disabled (base model).
	base_finals_paired, base_rewards_paired, base_fmt_paired, base_pitch_paired = [], [], [], []
	base_history_per_seed = []
	_base_sinks = (base_finals_paired, base_rewards_paired, base_fmt_paired,
	               base_pitch_paired, base_history_per_seed)
	with make_env().sync() as env, model.disable_adapter():
	    for idx, seed in enumerate(PAIRED_SEEDS):
	        episode = run_episode(env, seed)
	        _store_episode(episode, _base_sinks)
	        if (idx + 1) % 10 == 0:
	            print(f' base {idx+1}/{EVAL_N} profit={episode["final_profit"]:.1f}')

	tf, bf = np.array(trained_finals), np.array(base_finals_paired)
	tr, br = np.array(trained_rewards), np.array(base_rewards_paired)

	print(f'\nTrained Qwen3-0.6B profit : {tf.mean():.2f} \u00b1 {tf.std():.2f}')
	print(f'Base Qwen3-0.6B profit : {bf.mean():.2f} \u00b1 {bf.std():.2f}')
	print(f'Trained ep reward : {tr.mean():.2f} \u00b1 {tr.std():.2f}')
	print(f'Base ep reward : {br.mean():.2f} \u00b1 {br.std():.2f}')
	print(f'Trained format/pitch : {np.mean(trained_fmt):.0%} / {np.mean(trained_pitch):.0%}')
	print(f'Base format/pitch : {np.mean(base_fmt_paired):.0%} / {np.mean(base_pitch_paired):.0%}')

	_paired_payload = {
	    'seeds': PAIRED_SEEDS,
	    'trained_finals': tf.tolist(), 'base_finals': bf.tolist(),
	    'trained_rewards': tr.tolist(), 'base_rewards': br.tolist(),
	    'trained_format_rate': float(np.mean(trained_fmt)),
	    'base_format_rate': float(np.mean(base_fmt_paired)),
	    'trained_pitch_rate': float(np.mean(trained_pitch)),
	    'base_pitch_rate': float(np.mean(base_pitch_paired)),
	}
	with open(DRIVE_DIR / 'eval_paired.json', 'w') as f:
	    json.dump(_paired_payload, f)
	from scipy import stats as spstats

	def cohen_d(a, b):
	    """Return Cohen's d effect size between samples `a` and `b`.

	    The mean difference (a - b) is divided by the pooled standard
	    deviation sqrt((var_a + var_b) / 2), using sample (ddof=1) variances.
	    A tiny epsilon in the denominator avoids dividing by zero when both
	    samples have no spread. Both arguments must provide `.mean()` and
	    `.std(ddof=...)`, as NumPy arrays do.
	    """
	    pooled = np.sqrt((a.std(ddof=1) ** 2 + b.std(ddof=1) ** 2) / 2)
	    return (a.mean() - b.mean()) / (pooled + 1e-12)

	def bootstrap_diff_ci(a, b, n=10_000, seed=0):
	    """Bootstrap a 95% interval for the mean of the paired differences a - b.

	    Draws `n` resamples (with replacement, same size as the input) from the
	    differences using a generator seeded with `seed`, and returns the 2.5th
	    and 97.5th percentiles of the resample means as a (low, high) tuple.
	    """
	    generator = np.random.default_rng(seed)
	    paired_diffs = a - b
	    resamples = generator.choice(
	        paired_diffs, size=(n, len(paired_diffs)), replace=True)
	    boot_means = resamples.mean(axis=1)
	    low, high = (float(np.percentile(boot_means, q)) for q in (2.5, 97.5))
	    return low, high

	# Significance tests and effect sizes on the paired final-profit arrays.
	tt = spstats.ttest_rel(tf, bf)
	uu = spstats.mannwhitneyu(tf, bf, alternative='greater')
	wilc = spstats.wilcoxon(tf, bf, alternative='greater')
	d = cohen_d(tf, bf)
	lo, hi = bootstrap_diff_ci(tf, bf)
	win_rate = float((tf > bf).mean())
	tie_rate = float((tf == bf).mean())

	summary = dict(
	    baseline_model=MODEL_NAME + ' (no fine-tune)',
	    trained_model=MODEL_NAME + ' + LoRA r=32',
	    n=len(tf),
	    paired_t_stat=float(tt.statistic),
	    paired_t_p=float(tt.pvalue),
	    mannwhitney_U=float(uu.statistic),
	    mannwhitney_p_greater=float(uu.pvalue),
	    wilcoxon_p_greater=float(wilc.pvalue),
	    cohens_d=float(d),
	    paired_diff_mean=float((tf - bf).mean()),
	    paired_diff_95ci=[lo, hi],
	    win_rate_trained_strictly_better=win_rate,
	    tie_rate=tie_rate,
	)
	# Serialise once; the same text goes to stdout and to disk.
	_summary_text = json.dumps(summary, indent=2)
	print(_summary_text)
	with open(DRIVE_DIR / 'stats_summary.json', 'w') as f:
	    f.write(_summary_text)
	# Overlaid histograms of final profitability: base vs fine-tuned, same seeds.
	bins = np.linspace(0, 100, 25)
	_base_color, _tuned_color = '#c44', '#1d6fff'
	plt.figure(figsize=(9, 5))
	plt.hist(bf, bins=bins, alpha=0.55, color=_base_color,
	         label=f'Base Qwen3-0.6B (mean={bf.mean():.1f})')
	plt.hist(tf, bins=bins, alpha=0.55, color=_tuned_color,
	         label=f'Fine-tuned Qwen3-0.6B (mean={tf.mean():.1f})')
	for _series, _color in ((bf, _base_color), (tf, _tuned_color)):
	    plt.axvline(_series.mean(), color=_color, ls='--', lw=1.5)
	plt.title(f'Final profitability — paired same-seed (n={len(tf)}) '
	          f"d={summary['cohens_d']:+.2f} win-rate={summary['win_rate_trained_strictly_better']:.0%}")
	plt.xlabel('profitability score (0\u2013100)')
	plt.ylabel('episodes')
	plt.legend()
	plt.grid(alpha=0.3)
	plt.tight_layout()
	plt.savefig(ASSETS / 'before_after.png', dpi=150)
	plt.close()

	# Per-seed lift, sorted ascending; blue bars are wins for the fine-tuned model.
	diffs = tf - bf
	order = np.argsort(diffs)
	_sorted_lift = diffs[order]
	_bar_colors = [_tuned_color if lift > 0 else _base_color for lift in _sorted_lift]
	plt.figure(figsize=(9, 5))
	plt.bar(range(len(diffs)), _sorted_lift, color=_bar_colors)
	plt.axhline(0, color='k', lw=0.8)
	plt.title(f'Per-seed lift (fine-tuned \u2212 base Qwen3-0.6B), sorted '
	          f'mean lift = {diffs.mean():+.1f} CI=[{summary["paired_diff_95ci"][0]:+.1f}, {summary["paired_diff_95ci"][1]:+.1f}]')
	plt.xlabel('seed (sorted by lift)')
	plt.ylabel('\u0394 profitability')
	plt.grid(alpha=0.3)
	plt.tight_layout()
	plt.savefig(ASSETS / 'paired_delta.png', dpi=150)
	plt.close()
	print('Saved before_after.png, paired_delta.png')
	# -----------------------------------------------------------------------------
	# Per-event win-rate breakdown — for each of the 10 generic events, how often
	# did the fine-tuned policy win the boardroom vote vs base Qwen3-0.6B?
	# This is the most direct picture of WHERE the fine-tuning helps.
	# -----------------------------------------------------------------------------
	def per_event_winrate(history_per_seed):
	    """Compute, per event title, the fraction of rounds the agent won.

	    `history_per_seed` is an iterable of episode histories, each an iterable
	    of round dicts. A round counts under its 'event_title' ('?' when the
	    key is missing) and as a win when 'agent_won_vote' is truthy.
	    Returns {title: wins / rounds}.
	    """
	    rounds_seen = collections.Counter()
	    rounds_won = collections.Counter()
	    for episode in history_per_seed:
	        for round_rec in episode:
	            title = round_rec.get('event_title', '?')
	            rounds_seen[title] += 1
	            if round_rec.get('agent_won_vote'):
	                rounds_won[title] += 1
	    return {
	        title: rounds_won[title] / max(1, seen)
	        for title, seen in rounds_seen.items()
	    }

	# Win rate per event title for both policies, from the paired-eval histories.
	trained_wr = per_event_winrate(trained_history_per_seed)
	base_wr = per_event_winrate(base_history_per_seed)

	# Union of titles seen by either policy; a title one side never saw is
	# plotted as 0.0 for that side.
	events_sorted = sorted(set(trained_wr) | set(base_wr))
	tw = [trained_wr.get(e, 0.0) for e in events_sorted]
	bw = [base_wr.get(e, 0.0) for e in events_sorted]

	# Grouped bars: base on the left, fine-tuned on the right of each tick.
	plt.figure(figsize=(11, 5))
	x = np.arange(len(events_sorted))
	plt.bar(x - 0.2, bw, width=0.4, color='#c44', label='Base Qwen3-0.6B')
	plt.bar(x + 0.2, tw, width=0.4, color='#1d6fff', label='Fine-tuned Qwen3-0.6B')
	# Titles are cut to 22 characters to keep the tick labels readable.
	plt.xticks(x, [e[:22] for e in events_sorted], rotation=30, ha='right')
	plt.ylim(0, 1.05); plt.ylabel('boardroom win rate')
	plt.title('Per-event boardroom win rate (paired seeds, n=50 episodes)')
	plt.legend(); plt.grid(alpha=0.3, axis='y'); plt.tight_layout()
	plt.savefig(ASSETS / 'per_event_winrate.png', dpi=150); plt.close()

	with open(DRIVE_DIR / 'per_event_winrate.json', 'w') as f:
	    json.dump({'events': events_sorted, 'trained': tw, 'base': bw}, f, indent=2)
	print('Saved per_event_winrate.png')
	# -----------------------------------------------------------------------------
	# Theory-of-Mind probe — does the model identify which board member is most
	# likely to oppose its decision? Run for BOTH base and fine-tuned for fair
	# comparison, since "random=25%" is a weak reference for a language model.
	# -----------------------------------------------------------------------------
	# Extra instruction text that tom_predict() inserts right after SYSTEM_PROMPT;
	# it asks for a single role name as the whole answer.
	TOM_INSTRUCTION = (
	"\n\nGiven the state and event below, name the SINGLE board member "
	"(CTO, CFO, Investor Rep, or Independent) most likely to oppose the chosen decision. "
	"Answer with just the role name on one line.\n"
	)

	def tom_predict(obs, decision):
	    """Ask the model which board member most likely opposes `decision`.

	    Reuses the regular prompt body with TOM_INSTRUCTION inserted after
	    SYSTEM_PROMPT, generates up to 8 tokens greedily, and maps the reply to
	    one of the four role names by keyword. Returns None when no role
	    keyword appears in the reply.
	    """
	    body = build_prompt(obs).split(SYSTEM_PROMPT, 1)[1]
	    prompt = ''.join([
	        SYSTEM_PROMPT,
	        TOM_INSTRUCTION,
	        body,
	        f'Chosen decision: {decision}\nMost likely opponent: ',
	    ])
	    enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024).to(device)
	    with torch.no_grad():
	        out = model.generate(
	            **enc,
	            max_new_tokens=8,
	            do_sample=False,
	            pad_token_id=tokenizer.eos_token_id,
	        )
	    new_tokens = out[0][enc.input_ids.shape[1]:]
	    txt = tokenizer.decode(new_tokens, skip_special_tokens=True).lower()

	    # Checked in this fixed order; the first keyword found decides the role.
	    keyword_to_role = (
	        ('investor', 'Investor Rep'),
	        ('independent', 'Independent'),
	        ('cto', 'CTO'),
	        ('cfo', 'CFO'),
	    )
	    for keyword, role in keyword_to_role:
	        if keyword in txt:
	            return role
	    return None

	def tom_eval(seed_base=80_000, n=40):
	    """Score the opponent-prediction probe on the first round of `n` episodes.

	    For each seed the model picks a decision greedily; board members whose
	    stated vote differs from it are the opponents. Episodes with no
	    opponent are skipped. A prediction is correct when it names any
	    opponent. Returns (correct, scored).
	    """
	    hits = 0
	    scored = 0
	    with make_env().sync() as env:
	        for offset in range(n):
	            obs = env.reset(seed=seed_base + offset).observation
	            decision = greedy_action(obs)[0]
	            dissenters = [
	                npc['role'] for npc in obs.npc_statements
	                if npc['vote'] != decision
	            ]
	            if dissenters:
	                guess = tom_predict(obs, decision)
	                if guess and guess in dissenters:
	                    hits += 1
	                scored += 1
	    return hits, scored

	# Probe the fine-tuned model first, then the base model (adapters disabled).
	t_corr, t_tot = tom_eval()
	with model.disable_adapter():
	    b_corr, b_tot = tom_eval()

	# max(1, ...) guards against a run in which every episode was skipped.
	tom_acc = t_corr / max(1, t_tot)
	tom_acc_base = b_corr / max(1, b_tot)
	print(f'ToM probe: trained = {tom_acc:.1%} ({t_corr}/{t_tot}) base = {tom_acc_base:.1%} ({b_corr}/{b_tot})')

	_tom_payload = {
	    'trained': {'correct': t_corr, 'total': t_tot, 'accuracy': tom_acc},
	    'base': {'correct': b_corr, 'total': b_tot, 'accuracy': tom_acc_base},
	}
	with open(DRIVE_DIR / 'tom.json', 'w') as f:
	    json.dump(_tom_payload, f)
	from huggingface_hub import HfApi
	ADAPTER_REPO = os.environ.get('ADAPTER_REPO', 'StavanKhobare/SST-MetaxPyTorch-Hackathon-LoRA')
	MERGED_REPO = os.environ.get('MERGED_REPO', 'StavanKhobare/SST-MetaxPyTorch-Hackathon-Merged16bit')

	# Make sure both target repos exist before any upload.
	api = HfApi()
	for _repo_id in (ADAPTER_REPO, MERGED_REPO):
	    api.create_repo(_repo_id, repo_type='model', private=False, exist_ok=True)

	# Each of the three uploads below is independent: a failure is reported and
	# the remaining uploads are still attempted.

	# 1) LoRA adapter (small, fast)
	try:
	    model.push_to_hub(ADAPTER_REPO, private=False)
	    tokenizer.push_to_hub(ADAPTER_REPO, private=False)
	    print(f'\u2713 LoRA pushed: https://huggingface.co/{ADAPTER_REPO}')
	except Exception as e:
	    print(f'LoRA push failed: {e!r}')

	# 2) Merged 16-bit
	try:
	    model.push_to_hub_merged(MERGED_REPO, tokenizer, save_method='merged_16bit', private=False)
	    print(f'\u2713 Merged 16-bit pushed: https://huggingface.co/{MERGED_REPO}')
	except Exception as e:
	    print(f'Merged push failed (you can retry): {e!r}')

	# 3) Upload eval artifacts: the plots folder plus every JSON file that exists.
	_artifact_names = (
	    'log_history.json', 'eval_history.json', 'eval_paired.json',
	    'stats_summary.json', 'tom.json', 'transcripts.json',
	    'decision_counter.json', 'baseline.json',
	    'per_event_winrate.json',
	)
	try:
	    api.upload_folder(folder_path=str(ASSETS), repo_id=ADAPTER_REPO,
	                      path_in_repo='assets', repo_type='model')
	    for _artifact in _artifact_names:
	        _artifact_path = DRIVE_DIR / _artifact
	        if not _artifact_path.exists():
	            continue
	        api.upload_file(path_or_fileobj=str(_artifact_path), path_in_repo=_artifact,
	                        repo_id=ADAPTER_REPO, repo_type='model')
	    print(f'\u2713 Artifacts uploaded to https://huggingface.co/{ADAPTER_REPO}')
	except Exception as e:
	    print(f'Artifact upload failed: {e!r}')
	import math

	# Banner tag derived from MODEL_NAME so it always names the model that was
	# actually trained.
	_model_tag = MODEL_NAME.split('/')[-1].upper()

	# Shannon entropy (bits) of the decisions sampled during training, computed
	# from decision_counter. `max_ent` is the entropy of a uniform choice over
	# the distinct decisions that were actually sampled.
	_decision_total = sum(decision_counter.values())
	_decision_probs = [
	    count / _decision_total
	    for count in decision_counter.values() if count > 0
	] if _decision_total else []
	# max(0.0, ...) turns the -0.0 of a single-outcome distribution into 0.0.
	entropy = max(0.0, -sum(prob * math.log2(prob) for prob in _decision_probs))
	max_ent = math.log2(len(_decision_probs)) if len(_decision_probs) > 1 else 0.0
	_collapse_note = 'not collapsed' if len(_decision_probs) > 1 else 'collapsed'

	print('='*70)
	print(f'BOARDSIM \u00d7 {_model_tag} \u2014 LEARNING EVIDENCE')
	print('='*70)
	print(f'Reward slope (linear fit) : {slope:+.5f}/step (p={p_val:.2e})')
	print(f'Reward EMA first 20 steps : {rewards_ema[:20].mean():+.3f}')
	print(f'Reward EMA last 20 steps : {rewards_ema[-20:].mean():+.3f}')
	print(f'Format compliance start : {fmts[:20].mean():.0%}')
	print(f'Format compliance end : {fmts[-20:].mean():.0%}')
	print('-'*70)
	print(f'Held-out paired (n={len(tf)}): fine-tuned {tf.mean():.2f} vs base {bf.mean():.2f}')
	print(f' paired t-test p={summary["paired_t_p"]:.2e} Wilcoxon p={summary["wilcoxon_p_greater"]:.2e}')
	print(f' Cohen d={summary["cohens_d"]:+.2f} 95% CI of lift = [{summary["paired_diff_95ci"][0]:+.2f}, {summary["paired_diff_95ci"][1]:+.2f}]')
	print(f' win rate (fine-tuned > base): {summary["win_rate_trained_strictly_better"]:.0%}')
	print(f'ToM probe fine-tuned : {tom_acc:.0%} base = {tom_acc_base:.0%}')
	print(f'Decision entropy : {entropy:.2f} / {max_ent:.2f} (\u2192 {_collapse_note})')
	print('-'*70)
	print(f'Adapter : https://huggingface.co/{ADAPTER_REPO}')
	print(f'Merged 16bit : https://huggingface.co/{MERGED_REPO}')
	print(f'Env Space : {ENV_BASE_URL}')
	print('='*70)