Spaces:

AMFORGE
/

sam-mm-space

Running

App Files Files Community

sam-mm-space / app.py

ameforge

Update app.py

0252509 verified 17 days ago

Raw

History Blame Contribute Delete

41.8 kB

	"""
	=============================================================================
	SAM-MM Benchmark — reproducible per-family evaluation for the SAM-MM line
	SparseMind / AMFORGE
	=============================================================================
	Checkpoint-driven and fully self-contained: the held-out eval set is GENERATED
	INTERNALLY (disjoint seed 99991), so no external data file or generator script
	is needed. It renders frames/mel from each sample's spec, greedy-decodes the
	answer, and reports per-family exact match + a CHAT/ACTION breakdown + aggregate.

	Notebook (Colab/Kaggle): edit the variables at the top of main() and run.
	Terminal:
	python samg_mm_benchmark.py --ckpt AMFORGE/sam-mm-reasoning-checkpoints:best.pt --families reasoning
	python samg_mm_benchmark.py --ckpt AMFORGE/sam-mm-audio-reasoning-checkpoints:best.pt --families audio
	python samg_mm_benchmark.py --ckpt ./best.pt --n 3000 --n-per 100

	Self-contained: the SAM-MM model, the renderers, the tokenizer resolver are
	inlined verbatim. External vision/audio LMs are NOT comparable on these
	SAM-specific synthetic tasks (different input pipelines), so this is an honest
	internal per-family report; add a baseline column only where truly comparable.
	=============================================================================
	"""
	from __future__ import annotations
	import os, sys, json, math, random, argparse
	from dataclasses import dataclass, asdict
	from typing import Optional
	from enum import IntEnum
	import torch, torch.nn as nn, torch.nn.functional as F
	try: import sentencepiece as spm
	except ImportError:
	os.system(f"{sys.executable} -m pip install -q sentencepiece --break-system-packages"); import sentencepiece as spm

	# Runtime device: CUDA when available, otherwise CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	# True only when CUDA is available and reports bfloat16 support.
	BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
	# Hub repository and file name of the SentencePiece tokenizer model.
	TOK_REPO, TOK_FILE = "AMFORGE/samg_mm_tok", "samg_mm_tokenizer.model"
	# Repository resolve_ckpt() falls back to when given a bare file name.
	DEFAULT_CKPT_REPO = "AMFORGE/sam-mm-reasoning-checkpoints"
	# Names printed in the parameter-count banner of the model.
	ORGANIZATION, MODEL_NAME = "AMFORGE", "SAM-MM"

	def _pip(p):
	    """Best-effort ``pip install -q`` of *p* into the running interpreter.

	    The command is handed to ``subprocess.run`` as an argument list, so an
	    interpreter path containing spaces or shell metacharacters is passed
	    through untouched. *p* may name several space-separated packages.

	    A failed install is not raised here; the caller's follow-up import is
	    what surfaces the problem.
	    """
	    import subprocess  # local import: keeps the block self-contained

	    cmd = [sys.executable, "-m", "pip", "install", "-q",
	           *str(p).split(), "--break-system-packages"]
	    subprocess.run(cmd, check=False)

	def get_hf_token():
	    """Return a Hugging Face access token, or ``""`` when none is found.

	    Sources are tried in this order, stopping at the first non-empty value:
	      1. the ``HF_TOKEN`` environment variable,
	      2. the Kaggle secret named ``HF_TOKEN``,
	      3. the Colab user-data entry named ``HF_TOKEN``,
	      4. the file ``~/.cache/huggingface/token``.
	    Every source is best-effort: an unavailable provider is skipped.
	    """
	    t = os.environ.get("HF_TOKEN") or ""
	    if not t:
	        try:
	            from kaggle_secrets import UserSecretsClient
	            t = UserSecretsClient().get_secret("HF_TOKEN") or ""
	        except Exception:
	            # Not on Kaggle, or the secret is not attached.
	            pass
	    if not t:
	        try:
	            from google.colab import userdata
	            t = userdata.get("HF_TOKEN") or ""
	        except Exception:
	            # Not on Colab, or the secret is not granted.
	            pass
	    if not t:
	        p = os.path.expanduser("~/.cache/huggingface/token")
	        if os.path.exists(p):
	            try:
	                # Context manager closes the handle deterministically.
	                with open(p) as fh:
	                    t = fh.read().strip()
	            except OSError:
	                # Unreadable token file: treat as "no token".
	                t = ""
	    return t

	def resolve_tokenizer(token=None):
	    """Return a path to the SentencePiece model file.

	    A copy in the working directory or in ``tokenizer/`` wins; otherwise the
	    file is downloaded from ``TOK_REPO`` (installing ``huggingface_hub`` on
	    demand) using *token* for authentication.
	    """
	    local_candidates = (TOK_FILE, os.path.join("tokenizer", TOK_FILE))
	    local_hit = next((c for c in local_candidates if os.path.isfile(c)), None)
	    if local_hit is not None:
	        return local_hit
	    try:
	        from huggingface_hub import hf_hub_download
	    except ImportError:
	        _pip("huggingface_hub")
	        from huggingface_hub import hf_hub_download
	    return hf_hub_download(TOK_REPO, TOK_FILE, token=token)

	def resolve_ckpt(spec, token=None):
	    """Resolve a checkpoint spec to a local file path.

	    *spec* is one of:
	      - an existing local file (returned unchanged),
	      - ``"repo:file"`` (downloaded from that Hub repo),
	      - a bare ``"file"`` (downloaded from ``DEFAULT_CKPT_REPO``).
	    A spec starting with ``/`` is never split on ``:``.
	    """
	    if os.path.isfile(spec):
	        return spec
	    try:
	        from huggingface_hub import hf_hub_download
	    except ImportError:
	        _pip("huggingface_hub")
	        from huggingface_hub import hf_hub_download
	    names_a_repo = ":" in spec and not spec.startswith("/")
	    if names_a_repo:
	        repo, _, fn = spec.partition(":")
	    else:
	        repo, fn = DEFAULT_CKPT_REPO, spec
	    return hf_hub_download(repo, fn, token=token)

	# =============================================================================
	# SAM-MM model — INLINED VERBATIM from samg_mm_train.py (state_dict-compatible)
	# =============================================================================
	class NeuronType(IntEnum):
	    """Functional type assigned to each channel of a diversity layer."""
	    STEM = 0
	    EXCITATORY = 1
	    INHIBITORY = 2
	    MEMORY = 3
	    RELAY = 4
	    MODULATORY = 5
	    PATTERN = 6

	# Share of channels given to each type at initialisation (shares sum to 1.0).
	# Key order is kept as-is because the type manager iterates it before shuffling.
	TARGET_DISTRIBUTION = {
	    NeuronType.STEM: .10,
	    NeuronType.EXCITATORY: .35,
	    NeuronType.INHIBITORY: .10,
	    NeuronType.MEMORY: .15,
	    NeuronType.RELAY: .10,
	    NeuronType.MODULATORY: .08,
	    NeuronType.PATTERN: .12,
	}

	@dataclass
	class Config:
	    """Hyper-parameters of the SAM-MM model and its training run.

	    Field order is part of the interface (positional construction).
	    """
	    # text backbone
	    vocab_size: int = 32000
	    dim: int = 320
	    n_layers: int = 8
	    n_heads: int = 8
	    max_seq_len: int = 1024
	    channel_top_k: int = 120
	    token_top_k: int = 128
	    ffn_mult: int = 4
	    dropout: float = 0.1
	    pad_id: int = 0
	    eos_id: int = 2
	    use_diversity: bool = True
	    # multimodal
	    v_dim: int = 320
	    v_layers: int = 7
	    v_patch: int = 8
	    img: int = 96
	    a_dim: int = 320
	    a_layers: int = 6
	    mel: int = 64
	    phys_dim: int = 320
	    phys_slots: int = 4
	    # plasticity (text diversity layers)
	    target_stem_ratio: float = .10
	    min_stem_ratio: float = .08
	    stem_plasticity: float = .012
	    reversion_rate: float = .012
	    min_age_before_revert: int = 8
	    update_interval: int = 10
	    baseline_revert_ratio: float = .5
	    inhibition_strength: float = .08
	    modulation_strength: float = .1
	    excitation_strength: float = .3
	    # train
	    batch_size: int = 16
	    grad_accum: int = 2
	    lr: float = 5e-4
	    max_steps: int = 40000
	    warmup: int = 1500
	    eval_every: int = 1000
	    save_every: int = 1000
	    patience: int = 12
	    log_every: int = 50
	    aux_phys_w: float = 0.5

	class DynamicTypeManager(nn.Module):
	    """Holds a neuron type per channel and re-assigns types while training.

	    Registered buffers:
	      neuron_types        long[dim]   NeuronType value of each channel
	      activation_history  float[dim]  moving average of mean |activation|
	      age                 long[dim]   steps since the channel last changed type
	      cycle_counter       long scalar number of training steps seen
	    """

	    def __init__(self, dim, cfg):
	        super().__init__()
	        self.dim, self.cfg = dim, cfg
	        # int(dim * share) channels per type; the rounding remainder becomes
	        # STEM (0); the assignment is then shuffled with the global `random`.
	        t = []
	        for nt, p in TARGET_DISTRIBUTION.items():
	            t += [nt.value] * int(dim * p)
	        while len(t) < dim:
	            t.append(0)
	        random.shuffle(t)
	        self.register_buffer("neuron_types", torch.tensor(t, dtype=torch.long))
	        self.register_buffer("activation_history", torch.zeros(dim))
	        self.register_buffer("age", torch.randint(0, cfg.min_age_before_revert, (dim,), dtype=torch.long))
	        self.register_buffer("cycle_counter", torch.tensor(0, dtype=torch.long))

	    def get_type_mask(self, t):
	        """Return a float[dim] mask that is 1.0 where the channel has type *t*."""
	        return (self.neuron_types == t.value).float()

	    @torch.no_grad()
	    def step(self, x):
	        """Update statistics from activations *x*; no-op outside training.

	        *x* is reduced with ``mean((0, 1))``, so its last axis is the channel
	        axis. Every ``cfg.update_interval`` steps two things happen:
	          1. low-activity, old-enough specialised channels revert to STEM;
	          2. the oldest STEM channels above the floor are re-specialised by
	             sampling a type from TARGET_DISTRIBUTION (STEM excluded).
	        """
	        if not self.training:
	            return
	        self.activation_history.mul_(.95).add_(x.abs().mean((0, 1)).float(), alpha=.05)
	        self.age += 1
	        self.cycle_counter += 1
	        if self.cycle_counter.item() % self.cfg.update_interval:
	            return

	        # --- 1. reversion: specialised -> STEM --------------------------------
	        spec = (self.neuron_types != 0).nonzero().view(-1)
	        if len(spec) > 4:
	            acts = self.activation_history[spec]
	            thr = torch.quantile(acts, .30).item()
	            cand = spec[(acts <= max(thr, 1e-6)) & (self.age[spec] > self.cfg.min_age_before_revert)]
	            n = min(max(1, int(self.dim * self.cfg.reversion_rate)), len(cand))
	            if n > 0:
	                # topk on the negated history picks the least active candidates
	                sel = cand[(-self.activation_history[cand]).topk(n)[1]]
	                self.neuron_types[sel] = 0
	                self.age[sel] = 0
	                self.activation_history[sel] *= .5

	        # --- 2. differentiation: STEM -> specialised --------------------------
	        stem = (self.neuron_types == 0).nonzero().view(-1)
	        floor = max(2, int(self.dim * self.cfg.min_stem_ratio))
	        if len(stem) > floor:
	            n = min(max(1, int(self.dim * self.cfg.stem_plasticity)), len(stem) - floor)
	            sel = stem[self.age[stem].float().topk(n)[1]]
	            # Loop-invariant sampling table, built once.
	            choices = list(NeuronType)[1:]
	            w = torch.tensor([TARGET_DISTRIBUTION[t] for t in NeuronType if t != NeuronType.STEM])
	            probs = w / w.sum()
	            for ni in sel:
	                self.neuron_types[ni] = choices[torch.multinomial(probs, 1).item()].value
	                self.age[ni] = 0

	class GentleInhibition(nn.Module):
	    """Attenuates small activations on channels selected by a type mask."""

	    def __init__(s, d, c):
	        super().__init__()
	        s.k = c.inhibition_strength
	        s.noise_detector = nn.Sequential(nn.Linear(d, d // 4), nn.ReLU(), nn.Linear(d // 4, d), nn.Sigmoid())
	        s.threshold = nn.Parameter(torch.tensor(.15))

	    def forward(s, x, m):
	        """Scale *x* by ``1 - k`` where it is suppressed and *m* is 1.

	        An element is suppressed when ``|x|`` is below the learned threshold
	        and the noise detector outputs less than 0.3. *m* is a per-channel
	        mask broadcast over the two leading axes.
	        """
	        sup = (x.abs() < s.threshold) & (s.noise_detector(x) < .3)
	        return x * (1 - sup.float() * m.view(1, 1, -1) * s.k)
	class StrongExcitation(nn.Module):
	    """Adds a gated residual update on channels selected by a type mask."""

	    def __init__(s, d, c):
	        super().__init__()
	        s.k = c.excitation_strength
	        s.integrator = nn.Sequential(nn.Linear(d, d), nn.GELU(), nn.Linear(d, d))
	        s.importance = nn.Sequential(nn.Linear(d, d), nn.Sigmoid())

	    def forward(s, x, m):
	        """Return ``x + integrator(x) * importance(x) * m * k``.

	        *m* is a per-channel mask broadcast over the two leading axes.
	        """
	        return x + s.integrator(x) * s.importance(x) * m.view(1, 1, -1) * s.k
	class GentleModulation(nn.Module):
	    """Adds a running-mean context signal on channels selected by a type mask."""

	    def __init__(s, d, c):
	        super().__init__()
	        s.k = c.modulation_strength
	        s.context = nn.Sequential(nn.Linear(d, d // 4), nn.GELU(), nn.Linear(d // 4, d), nn.Tanh())

	    def forward(s, x, m):
	        """Return ``x + context(prefix_mean(x)) * m * k`` for *x* of shape (B, T, D).

	        The prefix mean at position t averages positions 0..t, so a position
	        never sees later ones.
	        """
	        B, T, D = x.shape
	        den = torch.arange(1, T + 1, device=x.device, dtype=x.dtype).view(1, T, 1)
	        return x + s.context(x.cumsum(1) / den) * m.view(1, 1, -1) * s.k
	class PatternDetection(nn.Module):
	    """Adds left-padded width-3 and width-5 conv features on masked channels."""

	    def __init__(s, d):
	        super().__init__()
	        # groups=d//2: every output channel reads two input channels.
	        s.conv3 = nn.Conv1d(d, d // 2, 3, groups=d // 2)
	        s.conv5 = nn.Conv1d(d, d // 2, 5, groups=d // 2)
	        s.combine = nn.Linear(d, d)

	    def forward(s, x, m):
	        """Return ``x + combine([conv3 | conv5]) * m * 0.2`` for *x* of shape (B, T, D).

	        Both convolutions are padded on the left only (2 and 4 steps), so the
	        output keeps length T and position t depends on positions <= t.
	        """
	        xt = x.transpose(1, 2)
	        p = torch.cat([s.conv3(F.pad(xt, (2, 0))).transpose(1, 2),
	                       s.conv5(F.pad(xt, (4, 0))).transpose(1, 2)], -1)
	        return x + s.combine(p) * m.view(1, 1, -1) * .2
	class RelayNetwork(nn.Module):
	    """Adds a sigmoid-gated linear transform on channels selected by a type mask."""

	    def __init__(s, d):
	        super().__init__()
	        s.gate = nn.Sequential(nn.Linear(d, d), nn.Sigmoid())
	        s.transform = nn.Linear(d, d)

	    def forward(s, x, m):
	        """Return ``x + transform(x) * gate(x) * m * 0.2``.

	        *m* is a per-channel mask broadcast over the two leading axes.
	        """
	        return x + s.transform(x) * s.gate(x) * m.view(1, 1, -1) * .2
	class BalancedDiversityLayer(nn.Module):
	    """Residual layer that applies one sub-module per neuron type in sequence."""

	    def __init__(self, d, c):
	        super().__init__()
	        # Construction order is kept: the type manager draws from the global
	        # RNGs when it is built.
	        self.type_manager = DynamicTypeManager(d, c)
	        self.inhibition = GentleInhibition(d, c)
	        self.excitation = StrongExcitation(d, c)
	        self.modulation = GentleModulation(d, c)
	        self.pattern = PatternDetection(d)
	        self.relay = RelayNetwork(d)
	        self.norm = nn.LayerNorm(d)
	        self.output = nn.Linear(d, d)

	    def forward(self, x):
	        """Normalise, run the typed stages, and add half of the projected result to *x*."""
	        residual = x
	        h = self.norm(x)
	        manager = self.type_manager
	        manager.step(h)  # statistics / re-typing; no-op outside training
	        pipeline = (
	            (self.excitation, NeuronType.EXCITATORY),
	            (self.pattern, NeuronType.PATTERN),
	            (self.relay, NeuronType.RELAY),
	            (self.modulation, NeuronType.MODULATORY),
	            (self.inhibition, NeuronType.INHIBITORY),
	        )
	        for stage, kind in pipeline:
	            h = stage(h, manager.get_type_mask(kind))
	        return residual + self.output(h) * .5
	class SparseGate(nn.Module):
	    """Keeps the top-k scored channels of each vector, with a straight-through gradient."""

	    def __init__(self, d, k):
	        super().__init__()
	        self.k = k
	        self.scorer = nn.Sequential(nn.Linear(d, d // 4), nn.SiLU(), nn.Linear(d // 4, d))
	        # Zero-initialised head: all scores start equal, so the gate passes everything.
	        final = self.scorer[-1]
	        nn.init.zeros_(final.weight)
	        nn.init.zeros_(final.bias)

	    def forward(self, x):
	        """Multiply *x* by a hard top-k mask whose gradient comes from a soft sigmoid."""
	        scores = torch.sigmoid(self.scorer(x))
	        keep = min(self.k, x.shape[-1])
	        kth_score = scores.topk(keep, -1)[0][..., -1:]
	        hard = (scores >= kth_score).float()
	        soft = torch.sigmoid((scores - kth_score) * 10)
	        # Forward value equals `hard`; backward flows through `soft`.
	        mask = hard - soft.detach() + soft
	        return x * mask
	class SparseAttn(nn.Module):
	    """Causal multi-head self-attention restricted to the top-k keys per query."""

	    def __init__(s, d, h, tk):
	        super().__init__()
	        s.h, s.hd, s.tk = h, d // h, tk
	        s.qkv = nn.Linear(d, 3 * d)
	        s.out = nn.Linear(d, d)

	    def forward(s, x):
	        """Attend over *x* of shape (B, T, D) and return a tensor of the same shape.

	        Scores are scaled by ``head_dim ** -0.5`` and masked causally; only the
	        ``min(tk, T)`` highest scores per query survive the softmax.
	        """
	        B, T, D = x.shape
	        q, k, v = s.qkv(x).reshape(B, T, 3, s.h, s.hd).permute(2, 0, 3, 1, 4)
	        a = (q @ k.transpose(-2, -1)) * s.hd ** -.5
	        a = a.masked_fill(torch.triu(torch.ones(T, T, device=x.device), 1).bool(), float("-inf"))
	        _, i = a.topk(min(s.tk, T), -1)
	        m = torch.zeros_like(a, dtype=torch.bool).scatter_(-1, i, True)
	        # nan_to_num guards rows whose surviving scores are all -inf.
	        a = torch.nan_to_num(F.softmax(a.masked_fill(~m, float("-inf")), -1), 0.)
	        return s.out((a @ v).transpose(1, 2).reshape(B, T, D))
	class SparseFFN(nn.Module):
	    """Feed-forward block: expand by *m*, SiLU, sparse channel gate, project back."""

	    def __init__(s, d, m, ck):
	        super().__init__()
	        s.up = nn.Linear(d, d * m)
	        s.gate = SparseGate(d * m, ck)  # ck = channels kept in the hidden width
	        s.down = nn.Linear(d * m, d)

	    def forward(s, x):
	        """Return ``down(gate(silu(up(x))))``."""
	        return s.down(s.gate(F.silu(s.up(x))))
	class Block(nn.Module):
	    """Pre-norm block: sparse attention, optional diversity layer, sparse FFN.

	    ``dim``, ``heads``, ``tk`` and ``ck`` override the matching Config fields
	    when truthy. The index ``i`` is accepted but not used here.
	    """

	    def __init__(self, c, i, dim=None, heads=None, tk=None, ck=None, div=False):
	        super().__init__()
	        width = dim or c.dim
	        n_heads = heads or c.n_heads
	        token_k = tk or c.token_top_k
	        channel_k = (ck or c.channel_top_k) * c.ffn_mult
	        self.n1 = nn.LayerNorm(width)
	        self.attn = SparseAttn(width, n_heads, token_k)
	        self.n2 = nn.LayerNorm(width)
	        self.ffn = SparseFFN(width, c.ffn_mult, channel_k)
	        self.drop = nn.Dropout(c.dropout)
	        self.div = div
	        if div:
	            self.diversity = BalancedDiversityLayer(width, c)

	    def forward(self, x):
	        """Apply attention, the diversity layer if enabled, then the FFN, each residually."""
	        h = x + self.drop(self.attn(self.n1(x)))
	        if self.div:
	            h = self.diversity(h)
	        return h + self.drop(self.ffn(self.n2(h)))

	# =============================================================================
	# Encoders + PhysicsCore
	# =============================================================================
	class VisionEncoder(nn.Module):
	    """Patch encoder: RGB image -> (img // v_patch)**2 tokens of width v_dim -> proj to dim."""

	    def __init__(self, c):
	        super().__init__()
	        n_tokens = (c.img // c.v_patch) ** 2
	        self.patch = nn.Conv2d(3, c.v_dim, c.v_patch, c.v_patch)
	        self.pos = nn.Parameter(torch.randn(1, n_tokens, c.v_dim) * .02)
	        # tk = n_tokens, so each query may attend to every patch allowed by the mask.
	        layers = [
	            Block(c, i, dim=c.v_dim, heads=8, tk=n_tokens, ck=int(c.v_dim * .375))
	            for i in range(c.v_layers)
	        ]
	        self.blocks = nn.ModuleList(layers)
	        self.norm = nn.LayerNorm(c.v_dim)
	        self.proj = nn.Linear(c.v_dim, c.dim)

	    def forward(self, img):
	        """Encode a batch of images into patch tokens of width ``dim``."""
	        tokens = self.patch(img).flatten(2).transpose(1, 2)
	        tokens = tokens + self.pos
	        for layer in self.blocks:
	            tokens = layer(tokens)
	        return self.proj(self.norm(tokens))  # B, n_tokens, dim

	class AudioEncoder(nn.Module):
	    """Log-mel encoder: (B, 1, mel, T) -> about T/4 tokens of width a_dim -> proj to dim."""

	    def __init__(self, c):
	        super().__init__()
	        # Two stride-2 convolutions downsample both axes by 4.
	        self.stem = nn.Sequential(
	            nn.Conv2d(1, 32, 3, 2, 1), nn.GELU(),
	            nn.Conv2d(32, c.a_dim, 3, 2, 1), nn.GELU(),
	        )
	        layers = [
	            Block(c, i, dim=c.a_dim, heads=8, tk=64, ck=int(c.a_dim * .375))
	            for i in range(c.a_layers)
	        ]
	        self.blocks = nn.ModuleList(layers)
	        self.norm = nn.LayerNorm(c.a_dim)
	        self.proj = nn.Linear(c.a_dim, c.dim)

	    def forward(self, mel):
	        """Encode a batch of spectrograms into time tokens of width ``dim``."""
	        feats = self.stem(mel)                   # B, a_dim, mel/4, T/4
	        tokens = feats.mean(2).transpose(1, 2)   # average over frequency -> B, T/4, a_dim
	        for layer in self.blocks:
	            tokens = layer(tokens)
	        return self.proj(self.norm(tokens))

	class PhysicsCore(nn.Module):
	    """Recurrent latent-state module over per-frame visual summaries.

	    A GRU cell consumes one frame embedding per step, nudged by an action
	    embedding; a small MLP predicts the next frame embedding from each state.
	    Learned slots then read the final state through attention.
	    Auxiliary loss = MSE + (1 - cosine) between prediction t and frame t+1.
	    """

	    def __init__(self, c):
	        super().__init__()
	        self.slots = nn.Parameter(torch.randn(1, c.phys_slots, c.dim) * .02)
	        self.read = nn.MultiheadAttention(c.dim, 4, batch_first=True)
	        self.cell = nn.GRUCell(c.dim, c.phys_dim)
	        self.act = nn.Linear(c.dim, c.phys_dim)
	        self.pred = nn.Sequential(nn.Linear(c.phys_dim, c.dim), nn.GELU(), nn.Linear(c.dim, c.dim))
	        self.to_seq = nn.Linear(c.phys_dim, c.dim)
	        self.pd = c.phys_dim

	    def forward(self, frames, action):
	        """Return ``(slots, aux)``.

	        frames: (B, T, dim) per-frame embeddings; action: (B, dim).
	        slots:  (B, phys_slots, dim). aux is the float ``0.`` when T == 1.
	        """
	        batch, n_frames, _ = frames.shape
	        state = frames.new_zeros(batch, self.pd)
	        drive = self.act(action)
	        predictions = []
	        for step in range(n_frames):
	            state = self.cell(frames[:, step] + 0., state) + 0.1 * drive
	            predictions.append(self.pred(state))
	        pred = torch.stack(predictions, 1)  # B, T, dim; entry t predicts frame t+1

	        if n_frames > 1:
	            target = frames[:, 1:].detach()
	            guess = pred[:, :-1]
	            aux = F.mse_loss(guess, target) + (1 - F.cosine_similarity(guess, target, -1).mean())
	        else:
	            aux = 0.

	        queries = self.slots.expand(batch, -1, -1)
	        key = self.to_seq(state).unsqueeze(1)
	        value = self.to_seq(state).unsqueeze(1)
	        slots, _ = self.read(queries, key, value)
	        return slots, aux  # B, slots, dim

	class SAMMM(nn.Module):
	    """SAM-MM: text decoder with a prefix of vision, physics and audio tokens.

	    Output logits are tied to the token embedding matrix.
	    """

	    def __init__(self, c):
	        super().__init__()
	        self.cfg = c
	        self.tok_emb = nn.Embedding(c.vocab_size, c.dim)
	        self.pos_emb = nn.Embedding(c.max_seq_len, c.dim)
	        self.drop = nn.Dropout(c.dropout)
	        # Even-indexed layers carry a diversity layer when enabled.
	        self.blocks = nn.ModuleList(
	            [Block(c, i, div=(i % 2 == 0 and c.use_diversity)) for i in range(c.n_layers)]
	        )
	        self.norm = nn.LayerNorm(c.dim)
	        self.vision = VisionEncoder(c)
	        self.audio = AudioEncoder(c)
	        self.phys = PhysicsCore(c)
	        self.mode = nn.Embedding(3, c.dim)  # 0=[VIS] 1=[AUD] 2=[PHYS] separators
	        self.apply(self._init)
	        self.n_params = sum(p.numel() for p in self.parameters())
	        print(f"\n{MODEL_NAME} by {ORGANIZATION}: {self.n_params:,} params")

	    @staticmethod
	    def _init(mod):
	        """Normal(0, 0.02) weights for linear/conv/embedding layers; zero biases."""
	        if isinstance(mod, (nn.Linear, nn.Conv2d, nn.Conv1d)):
	            nn.init.normal_(mod.weight, std=0.02)
	            if mod.bias is not None:
	                nn.init.zeros_(mod.bias)
	        elif isinstance(mod, nn.Embedding):
	            nn.init.normal_(mod.weight, std=0.02)

	    def fuse(self, ids, frames=None, mel=None):
	        """Build the input sequence and return ``(x, n_pref, aux)``.

	        Prefix layout, each part present only when its input is given:
	          [VIS] + tokens of the first frame + [PHYS] + physics slots
	          [AUD] + audio tokens
	        followed by the text token embeddings. ``n_pref`` is the prefix
	        length; ``aux`` is the physics auxiliary loss (zero without frames).
	        """
	        batch = ids.shape[0]
	        aux = ids.new_zeros(1, dtype=torch.float32).squeeze()
	        prefix = []
	        if frames is not None:
	            fb, n_frames, ch, height, width = frames.shape
	            vis = self.vision(frames.reshape(fb * n_frames, ch, height, width))
	            vis = vis.reshape(fb, n_frames, -1, self.cfg.dim)
	            per_frame = vis.mean(2)
	            action = self.tok_emb(ids).mean(1)  # mean prompt embedding stands in for the action
	            slots, aux = self.phys(per_frame, action)
	            prefix.extend([
	                self.mode.weight[0].expand(batch, 1, -1), vis[:, 0],
	                self.mode.weight[2].expand(batch, 1, -1), slots,
	            ])
	        if mel is not None:
	            prefix.extend([self.mode.weight[1].expand(batch, 1, -1), self.audio(mel)])
	        x = torch.cat(prefix + [self.tok_emb(ids)], 1)
	        n_pref = x.shape[1] - ids.shape[1]
	        return x, n_pref, aux

	    def forward(self, ids, targets=None, frames=None, mel=None):
	        """Return ``(logits, loss, aux)``; logits cover text positions only.

	        ``loss`` is None without *targets*; otherwise it is the cross-entropy
	        (ignoring ``pad_id``) plus ``aux_phys_w * aux``.
	        """
	        x, n_pref, aux = self.fuse(ids, frames, mel)
	        positions = torch.arange(x.shape[1], device=x.device)
	        x = self.drop(x + self.pos_emb(positions))
	        for layer in self.blocks:
	            x = layer(x)
	        logits = F.linear(self.norm(x), self.tok_emb.weight)[:, n_pref:]
	        if targets is None:
	            return logits, None, aux
	        lm = F.cross_entropy(
	            logits.reshape(-1, self.cfg.vocab_size),
	            targets.reshape(-1),
	            ignore_index=self.cfg.pad_id,
	        )
	        loss = lm + self.cfg.aux_phys_w * aux
	        return logits, loss, aux
	# Frame side length in pixels (IMG) and number of frames per rendered clip (T).
	IMG, T = 96, 8

	def render(x, y, img=96, r=4):
	    """Draw a 2r x 2r square in channel 0 of a blank (3, img, img) frame.

	    The centre is clamped to [r, img - r - 1] on both axes and truncated to int.
	    """
	    lo, hi = r, img - r - 1
	    col = int(max(lo, min(hi, x)))
	    row = int(max(lo, min(hi, y)))
	    frame = torch.zeros(3, img, img)
	    frame[0, row - r:row + r, col - r:col + r] = 1.0
	    return frame

	def render_world(kind, seed):
	    """Render T frames of a seeded scene and return them stacked as (T, 3, IMG, IMG).

	    kind: "ball" (accelerating fall), "spring" (horizontal sine), "bounce"
	    (reverses after passing IMG - 12); anything else is the two-body scene
	    (second body drawn in channel 1, both stop once within 10 px).
	    """
	    rng = random.Random(seed)
	    frames = []
	    if kind == "ball":
	        x, y = 10., 10.
	        vx = rng.uniform(5, 9)
	        vy = rng.uniform(-2, 0)
	        g = 1.1
	        for _ in range(T):
	            frames.append(render(x, y, IMG))
	            x += vx
	            vy += g
	            y += vy
	    elif kind == "spring":
	        c = IMG // 2
	        A = rng.uniform(15, 30)
	        w = rng.uniform(.5, 1.0)
	        frames.extend(render(c + A * math.sin(w * t), c, IMG) for t in range(T))
	    elif kind == "bounce":
	        x, y = 12., IMG // 2
	        vx = rng.uniform(7, 11)
	        for _ in range(T):
	            frames.append(render(x, y, IMG))
	            x += vx
	            if x > IMG - 12:
	                vx = -vx
	    else:  # twobody
	        x1, x2, y = 15., float(IMG - 15), IMG // 2
	        v = rng.uniform(4, 7)
	        for _ in range(T):
	            frame = render(x1, y, IMG)
	            frame[1] = render(x2, y, IMG)[0]
	            frames.append(frame)
	            if abs(x2 - x1) > 10:
	                x1 += v
	                x2 -= v
	    return torch.stack(frames)

	# Bitmap font: each digit character maps to a 7-row x 5-column float tensor
	# (1 = lit pixel), stored below as 35 row-major values.
	_DIG = {d: torch.tensor(b).reshape(7, 5).float() for d, b in {
	"0": [1,1,1,1,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,1,1,1,1],
	"1": [0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,1,0],
	"2": [1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1],
	"3": [1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1,1],
	"4": [1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1],
	"5": [1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1,1],
	"6": [1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1,0,0,0,1,1,1,1,1,1],
	"7": [1,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0],
	"8": [1,1,1,1,1,1,0,0,0,1,1,0,0,0,1,1,1,1,1,1,1,0,0,0,1,1,0,0,0,1,1,1,1,1,1],
	"9": [1,1,1,1,1,1,0,0,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1,1],
	}.items()}

	def render_ocr(num, img=96):
	    """Render the digit string *num* as a static clip of shape (T, 3, img, img).

	    Glyphs from ``_DIG`` are upscaled 4x, written to all three channels, and
	    laid out left to right from x = 6 with a 4-pixel gap.
	    """
	    left_margin, top, scale = 6, img // 2 - 14, 4
	    glyph_w, glyph_h = 5 * scale, 7 * scale
	    canvas = torch.zeros(3, img, img)
	    for pos, ch in enumerate(num):
	        glyph = F.interpolate(_DIG[ch][None, None], scale_factor=scale).squeeze()
	        left = left_margin + pos * (glyph_w + 4)
	        canvas[:, top:top + glyph_h, left:left + glyph_w] = glyph
	    return canvas.unsqueeze(0).repeat(T, 1, 1, 1)

	def render_robot(seed, img=96):
	    """Render a static clip (T identical frames) of one square at a seeded position."""
	    rng = random.Random(seed)
	    cx = rng.uniform(20, img - 20)
	    cy = rng.uniform(20, img - 20)
	    still = render(cx, cy, img)
	    return still.unsqueeze(0).repeat(T, 1, 1, 1)

	def _trailing_digits(label):
	d = ""
	for ch in reversed(label):
	if ch.isdigit(): d = ch + d
	elif d: break
	return d or "0"

	def render_from_spec(spec):
	    """Render the clip described by *spec* (a dict with a "kind" key).

	    Physics kinds and "robot" use ``spec["seed"]``; "ocr" renders the trailing
	    digits of ``spec["label"]``; any other kind yields an all-zero clip.
	    """
	    kind = spec["kind"]
	    if kind == "ocr":
	        return render_ocr(_trailing_digits(spec.get("label", "0")))
	    if kind == "robot":
	        return render_robot(spec["seed"])
	    if kind in ("ball", "spring", "bounce", "twobody"):
	        return render_world(kind, spec["seed"])
	    return torch.zeros(T, 3, IMG, IMG)

	# --- audio: synthetic mel (stable) + real ESC-50 mel + runtime pool -----------
	import hashlib
	def _stable_hash(s): return int(hashlib.md5(s.encode()).hexdigest(), 16)

	def synth_audio(sound, n_mels=64, n_t=64):
	    """Synthesise a pseudo-mel of shape (1, n_mels, n_t) for a sound class name.

	    An 8-bin band, whose position is derived from a stable hash of *sound*,
	    ramps from 0.2 to 1.0 over time. Gaussian noise (std 0.05) from the global
	    torch RNG is added on top, so only the band is reproducible across calls.
	    """
	    first_bin = _stable_hash(sound) % 32
	    ramp = torch.linspace(0.2, 1.0, n_t)
	    mel = torch.zeros(1, n_mels, n_t)
	    mel[0, first_bin:first_bin + 8] = ramp
	    mel += torch.randn(1, n_mels, n_t) * 0.05
	    return mel

	def wav_to_mel(wav, sr=16000, n_mels=64, n_t=64):
	    """Turn a waveform into a normalised log-magnitude map of shape (1, n_mels, n_t).

	    Multi-channel input is averaged over axis 0; clips shorter than 512
	    samples are zero-padded. The STFT magnitude (n_fft=400, hop=160, Hann
	    window) is log1p-compressed, bilinearly resized, then standardised.
	    *sr* is accepted but not used.
	    """
	    n_fft, hop, min_samples = 400, 160, 512
	    if wav.dim() > 1:
	        wav = wav.mean(0)
	    missing = min_samples - wav.numel()
	    if missing > 0:
	        wav = F.pad(wav, (0, missing))
	    window = torch.hann_window(n_fft)
	    magnitude = torch.stft(wav, n_fft=n_fft, hop_length=hop,
	                           window=window, return_complex=True).abs()
	    log_mag = torch.log1p(magnitude).unsqueeze(0).unsqueeze(0)
	    resized = F.interpolate(log_mag, size=(n_mels, n_t), mode="bilinear", align_corners=False)
	    mel = resized[0]
	    return (mel - mel.mean()) / (mel.std() + 1e-5)

	def load_esc50(n=600):
	    """Stream up to *n* ESC-50 clips and return a list of ``(mel, category)`` pairs.

	    Returns ``[]`` when the dataset cannot be loaded or yields nothing;
	    callers then fall back to synthetic audio only.
	    """
	    try:
	        from datasets import load_dataset
	        stream = load_dataset("ashraq/esc50", split="train", streaming=True)
	        pool = []
	        for count, example in enumerate(stream):
	            if count >= n:
	                break
	            audio = example["audio"]
	            wav = torch.tensor(audio["array"]).float()
	            cat = example.get("category") or str(example.get("target", "sound"))
	            mel = wav_to_mel(wav, audio.get("sampling_rate", 16000))
	            pool.append((mel, str(cat)))
	        if pool:
	            print(f"[esc50] {len(pool)} real clips loaded", flush=True)
	            return pool
	    except Exception as e:
	        # Missing package, no network, schema change: degrade gracefully.
	        print(f"[esc50] unreachable ({type(e).__name__}) — synthetic-only audio", flush=True)
	    return []

	def render_av(spec):
	    """Return ``(frames, mel)`` for a sample spec.

	    The mel is synthesised when the spec carries a "sound" key; otherwise it
	    is an all-zero (1, 64, 64) tensor.
	    """
	    frames = render_from_spec(spec)
	    if "sound" in spec:
	        mel = synth_audio(spec["sound"])
	    else:
	        mel = torch.zeros(1, 64, 64)
	    return frames, mel

	def esc_sample(pool):
	    """Pick a random ``(mel, category)`` from *pool* and return ``(prompt, answer, mel)``."""
	    mel, category = random.choice(pool)
	    prompt = "[AUD] what is this sound? [CHAT]"
	    answer = f"step 1: classify the sound. Answer: {category}"
	    return prompt, answer, mel

	# =============================================================================
	class Tok:
	    """Thin wrapper around the SentencePiece tokenizer used by the model."""

	    def __init__(self, token=None):
	        model_path = resolve_tokenizer(token)
	        self.sp = spm.SentencePieceProcessor()
	        self.sp.Load(model_path)
	        self.vocab = self.sp.GetPieceSize()

	    def enc(self, t):
	        """Encode text *t* into a list of token ids."""
	        return self.sp.EncodeAsIds(t)

	    def dec(self, ids):
	        """Decode a list of token ids back into text."""
	        return self.sp.DecodeIds(ids)

	# Token ids for end-of-sequence and padding, and the text length L that
	# make_batch() pads or truncates every sample to.
	EOS = 2; PAD = 0; L = 80


	def make_batch(tok, rows, idx, esc_pool=None, p_esc=0.22):
	    """Build one batch and return ``(ids, targets, frames, mels)`` on ``device``.

	    For each index in *idx* the matching row is rendered, unless *esc_pool*
	    is non-empty and a random draw below *p_esc* swaps in a pooled audio clip
	    with blank frames. Text is ``prompt + " " + answer + EOS`` cut or padded
	    to L + 1 tokens; targets are the inputs shifted by one, with prompt
	    positions set to PAD so only answer tokens are supervised.
	    """
	    ids_in, tgts, frs, mls = [], [], [], []
	    for j in idx:
	        use_pool = bool(esc_pool) and random.random() < p_esc
	        if use_pool:
	            prompt, answer, mel = esc_sample(esc_pool)
	            frames = torch.zeros(T, 3, IMG, IMG)
	        else:
	            row = rows[j]
	            prompt, answer = row["prompt"], row["answer"]
	            frames, mel = render_av(row["spec"])
	        prompt_ids = tok.enc(prompt)
	        answer_ids = tok.enc(" " + answer) + [EOS]
	        full = (prompt_ids + answer_ids)[:L + 1]
	        full += [PAD] * (L + 1 - len(full))
	        # Targets before index len(prompt) - 1 belong to the prompt.
	        n_masked = min(max(len(prompt_ids) - 1, 0), L)
	        ids_in.append(full[:L])
	        tgts.append([PAD] * n_masked + full[1 + n_masked:L + 1])
	        frs.append(frames)
	        mls.append(mel)
	    ii = torch.tensor(ids_in, device=device)
	    tt = torch.tensor(tgts, device=device)
	    ff = torch.stack(frs).to(device)
	    mm = torch.stack(mls).to(device)
	    return ii, tt, ff, mm

	# =============================================================================
	# Eval — per family; CHAT matches the Answer span, ACTION matches the plan
	# =============================================================================
	def _extract_json(s):
	i = s.find("{")
	if i < 0: return None
	depth = 0
	for k in range(i, len(s)):
	if s[k] == "{": depth += 1
	elif s[k] == "}":
	depth -= 1
	if depth == 0:
	try: return json.loads(s[i:k + 1])
	except Exception: return None
	return None

	@torch.no_grad()
	def generate(model, tok, prompt, frames, mel, max_new=48):
	    """Greedy-decode up to *max_new* tokens after *prompt* and return the text.

	    The model is put in eval mode and re-run on the whole sequence at every
	    step; decoding stops at EOS, which is not included in the output.
	    """
	    model.eval()
	    ids = torch.tensor([tok.enc(prompt)], device=device)
	    frame_batch = frames.unsqueeze(0).to(device)
	    mel_batch = mel.unsqueeze(0).to(device)
	    produced = []
	    while len(produced) < max_new:
	        logits, _, _ = model(ids, None, frame_batch, mel_batch)
	        token = int(logits[0, -1].argmax())
	        if token == EOS:
	            break
	        produced.append(token)
	        ids = torch.cat([ids, torch.tensor([[token]], device=device)], 1)
	    return tok.dec(produced)

	def _chat_match(pred, gold):
	g = gold.split("Answer:")[-1].strip()
	p = pred.split("Answer:")[-1].strip() if "Answer:" in pred else pred.strip()
	return p.startswith(g) or g in p

	def _action_match(pred, gold):
	    """Return True when *pred* contains a JSON object equal to the one in *gold*."""
	    predicted = _extract_json(pred)
	    if predicted is None:
	        return False
	    return predicted == _extract_json(gold)

	# =============================================================================
	# Eval generators — INLINED (no external file / no --data needed)
	# =============================================================================
	def _aj(o): return json.dumps(o, separators=(",", ":"))
	def _fr(): return random.random() < 0.30

	# ---------------------------------------------------------------------------
	# Deterministic physics simulation — returns ground-truth facts (no torch here;
	# the finetune renders pixels, the generator only needs the trajectory facts).
	# Mirrors gen_world() in samg_mm_train.py kind-for-kind so frames match.
	# ---------------------------------------------------------------------------
	def simulate_facts(kind, seed):
	    """Replay a seeded trajectory and return the facts used to build answers.

	    A private ``random.Random(seed)`` is used, so the global RNG stream is
	    untouched. Only the draws that feed a returned value are made; they are
	    the first draws in each branch.
	    NOTE(review): "bounce" reports ``reaches_right=True`` whether or not a
	    bounce happened — confirm this is what the labels were trained on.
	    """
	    rng = random.Random(seed)
	    if kind == "ball":
	        vx = rng.uniform(5, 9)
	        x = 10.
	        for _ in range(T):
	            x += vx
	        return {"dynamic": "gravity", "direction": "right",
	                "reaches_right": x > IMG - 12, "dx": vx}
	    if kind == "spring":
	        amplitude = rng.uniform(15, 30)
	        return {"dynamic": "oscillation", "direction": "oscillating",
	                "amplitude": amplitude, "reaches_right": False}
	    if kind == "bounce":
	        x = 12.
	        vx = rng.uniform(7, 11)
	        bounced = False
	        for _ in range(T):
	            x += vx
	            if x > IMG - 12:
	                vx = -vx
	                bounced = True
	        return {"dynamic": "collision", "direction": "right then left",
	                "bounces": bounced, "reaches_right": True}
	    # twobody: every returned fact is constant
	    return {"dynamic": "collision", "direction": "converging",
	            "collides": True, "reaches_right": False}
	# Scene kinds the physics/vision generators sample from.
	PHYS_KINDS = ["ball", "spring", "bounce", "twobody"]
	# Text prefixes put in front of the digits of an OCR label.
	OCR_PREFIX = ["speed=", "temp=", "qos=", "zone ", "dock ", "id="]
	def v_motion():
	    """Generate a [VIS]/[CHAT] sample asking which way the object moves."""
	    # Global RNG draw order: kind, seed, language flag.
	    kind = random.choice(PHYS_KINDS)
	    seed = random.randint(0, 2**31 - 1)
	    facts = simulate_facts(kind, seed)
	    french = _fr()
	    if french:
	        q = "dans quel sens se déplace l'objet ?"
	    else:
	        q = "which way does the object move?"
	    d = facts["direction"]
	    trace = (f"step 1: track the bright object across frames. "
	             f"step 2: its horizontal position evolves -> {d}. Answer: {d}")
	    return dict(family="v_motion", fmt="CHAT", use_v=True, use_a=False, use_p=False,
	                spec={"kind": kind, "seed": seed},
	                prompt=f"[VIS] {q} [CHAT]", answer=trace)

	def v_ocr():
	    """Generate a [VIS] [OCR]/[CHAT] sample asking for the 2-3 digit number shown."""
	    # Global RNG draw order: prefix, digit count, digits, language flag, seed.
	    prefix = random.choice(OCR_PREFIX)
	    n_digits = random.randint(2, 3)
	    num = "".join(random.choice("0123456789") for _ in range(n_digits))
	    label = prefix + num
	    french = _fr()
	    q = "quel nombre est affiché ?" if french else "what number is shown?"
	    trace = f"step 1: read the bitmap label. step 2: digits = {num}. Answer: {num}"
	    seed = random.randint(0, 2**31 - 1)
	    return dict(family="v_ocr", fmt="CHAT", use_v=True, use_a=False, use_p=False,
	                spec={"kind": "ocr", "seed": seed, "label": label},
	                prompt=f"[VIS] [OCR] {q} [CHAT]", answer=trace)

	# ---------------------------------------------------------------------------
	# PHYSICS — [CHAT]
	# ---------------------------------------------------------------------------
	def p_identify():
	    """Generate a [VIS] [PHYS]/[CHAT] sample asking which dynamic governs the motion."""
	    # Global RNG draw order: kind, seed, language flag.
	    kind = random.choice(PHYS_KINDS)
	    seed = random.randint(0, 2**31 - 1)
	    facts = simulate_facts(kind, seed)
	    french = _fr()
	    if french:
	        q = "quelle dynamique régit ce mouvement ?"
	    else:
	        q = "what dynamic governs this motion?"
	    dyn = facts["dynamic"]
	    cues = {
	        "gravity": "constant downward acceleration",
	        "oscillation": "periodic back-and-forth around a center",
	        "collision": "abrupt velocity reversal on contact",
	    }
	    trace = f"step 1: observe {cues[dyn]}. step 2: that is {dyn}. Answer: {dyn}"
	    return dict(family="p_identify", fmt="CHAT", use_v=True, use_a=False, use_p=True,
	                spec={"kind": kind, "seed": seed},
	                prompt=f"[VIS] [PHYS] {q} [CHAT]", answer=trace)

	def p_predict():
	    """Generate a [VIS] [PHYS]/[CHAT] yes/no sample about the scene's outcome."""
	    # Global RNG draw order: kind, seed, language flag.
	    kind = random.choice(PHYS_KINDS)
	    seed = random.randint(0, 2**31 - 1)
	    facts = simulate_facts(kind, seed)
	    french = _fr()
	    if kind in ("ball", "bounce"):
	        ans = "yes" if facts.get("reaches_right") else "no"
	        q = ("l'objet atteint-il le bord droit ?" if french
	             else "does the object reach the right edge?")
	        if ans == "yes":
	            reason = "its rightward velocity carries it to the wall"
	        else:
	            reason = "it falls or stops before the wall"
	    elif kind == "twobody":
	        ans = "yes"
	        q = ("les deux corps vont-ils entrer en collision ?" if french
	             else "will the two bodies collide?")
	        reason = "they approach from both sides and meet in the middle"
	    else:  # spring
	        ans = "no"
	        q = ("l'objet quitte-t-il le centre durablement ?" if french
	             else "does the object leave the center permanently?")
	        reason = "it oscillates and returns to the center each period"
	    return dict(family="p_predict", fmt="CHAT", use_v=True, use_a=False, use_p=True,
	                spec={"kind": kind, "seed": seed},
	                prompt=f"[VIS] [PHYS] {q} [CHAT]",
	                answer=f"step 1: {reason}. Answer: {ans}")

	# ---------------------------------------------------------------------------
	# CROSS-MODAL — [ACTION] ({domain,op,params} schema, as MM base pretraining)
	# ---------------------------------------------------------------------------
	def x_robot():
	    """Generate a [VIS]/[ACTION] sample: push toward a target, answered by a ros move."""
	    # Global RNG draw order: seed, language flag, target. Speed and angle come
	    # from a private RNG seeded with `seed`.
	    seed = random.randint(0, 2**31 - 1)
	    rng = random.Random(seed)
	    french = _fr()
	    target = random.choice(["dock", "block", "marker", "exit"])
	    speed = round(rng.uniform(0.2, 0.9), 2)
	    angle = rng.randint(0, 359)
	    q = f"pousse vers le {target}" if french else f"push toward the {target}"
	    params = {"speed": speed, "angle": angle, "duration_s": 1}
	    action = {"domain": "ros", "op": "move", "params": params}
	    return dict(family="x_robot", fmt="ACTION", use_v=True, use_a=False, use_p=False,
	                spec={"kind": "robot", "seed": seed},
	                prompt=f"[VIS] {q} [ACTION]", answer=_aj(action))

	def x_sensor():
	    """Generate a [VIS] [OCR]/[ACTION] sample: read a speed, slow down if over 50."""
	    # Global RNG draw order: value, language flag, seed.
	    limit = 50
	    val = random.randint(10, 99)
	    label = "speed=" + str(val)
	    french = _fr()
	    if french:
	        q = f"si la vitesse dépasse {limit}, ralentis"
	    else:
	        q = f"if speed exceeds {limit}, slow down"
	    if val > limit:
	        action = {"domain": "ros", "op": "set_speed", "params": {"value": limit}}
	    else:
	        action = {"domain": "ros", "op": "continue", "params": {}}
	    seed = random.randint(0, 2**31 - 1)
	    return dict(family="x_sensor", fmt="ACTION", use_v=True, use_a=False, use_p=False,
	                spec={"kind": "ocr", "seed": seed, "label": label},
	                prompt=f"[VIS] [OCR] {q} [ACTION]", answer=_aj(action))
	# Sound description -> physical cause; a_identify() samples its keys.
	SOUND_CAUSE = {
	"sharp impact": "collision",
	"double impact": "collision",
	"rhythmic creak": "oscillation",
	"whoosh then thud": "falling object",
	"servo whir": "motor",
	}
	# Scene kind -> the sound that fits it; a_match() uses it to build yes/no pairs.
	KIND_SOUND = {"ball": "whoosh then thud", "spring": "rhythmic creak",
	"bounce": "sharp impact", "twobody": "double impact"}
	# Sound -> (gold action object, phrase naming the sound in the prompt); used by a_event().
	SOUND_ACTION = {
	"alarm": ({"domain": "ros", "op": "stop", "params": {}}, "an alarm"),
	"servo whir": ({"domain": "ros", "op": "continue", "params": {}}, "a servo whir"),
	"sharp impact": ({"domain": "ros", "op": "halt", "params": {"reason": "collision"}}, "an impact"),
	"rhythmic creak": ({"domain": "ros", "op": "slow", "params": {"value": 20}}, "a creak"),
	}

	def a_identify():
	    """Generate an [AUD]/[CHAT] sample asking what produced the sound."""
	    # Global RNG draw order: sound, language flag.
	    snd = random.choice(list(SOUND_CAUSE.keys()))
	    cause = SOUND_CAUSE[snd]
	    french = _fr()
	    q = "qu'est-ce qui a produit ce son ?" if french else "what produced this sound?"
	    descriptions = {
	        "collision": "a sharp broadband transient",
	        "oscillation": "a periodic rhythmic tone",
	        "falling object": "a rising sweep followed by a thud",
	        "motor": "a steady mechanical hum",
	    }
	    desc = descriptions[cause]
	    trace = f"step 1: hear {desc}. step 2: that indicates {cause}. Answer: {cause}"
	    return dict(family="a_identify", fmt="CHAT", use_v=False, use_a=True, use_p=False,
	                spec={"kind": "audio", "sound": snd},
	                prompt=f"[AUD] {q} [CHAT]", answer=trace)

	def a_match():
	    """Generate a [VIS] [AUD] [PHYS]/[CHAT] sample: does the sound match the motion?

	    Half of the samples pair the scene with its own sound ("yes"); the rest
	    use a sound belonging to a different scene kind ("no").
	    """
	    # Global RNG draw order: kind, seed, language flag, coin flip, [mismatch pick].
	    kind = random.choice(PHYS_KINDS)
	    seed = random.randint(0, 2**31 - 1)
	    true_sound = KIND_SOUND[kind]
	    french = _fr()
	    matched = random.random() < 0.5
	    if matched:
	        snd, ans, reason = true_sound, "yes", "the sound fits the motion"
	    else:
	        others = [s for s in KIND_SOUND.values() if s != true_sound]
	        snd = random.choice(others)
	        ans, reason = "no", "the sound does not fit the motion"
	    if french:
	        q = "le son correspond-il au mouvement ?"
	    else:
	        q = "does the sound match the motion?"
	    return dict(family="a_match", fmt="CHAT", use_v=True, use_a=True, use_p=True,
	                spec={"kind": kind, "seed": seed, "sound": snd},
	                prompt=f"[VIS] [AUD] [PHYS] {q} [CHAT]",
	                answer=f"step 1: {reason}. Answer: {ans}")

	def a_event():
	    """Generate an [AUD]/[ACTION] sample: on hearing a sound, emit the mapped action."""
	    # Global RNG draw order: sound, language flag.
	    snd = random.choice(list(SOUND_ACTION.keys()))
	    action, desc = SOUND_ACTION[snd]
	    french = _fr()
	    # op -> (French instruction, English instruction)
	    wording = {
	        "stop": ("arrête le robot", "stop the robot"),
	        "continue": ("continue", "keep going"),
	        "halt": ("stoppe net", "halt immediately"),
	        "slow": ("ralentis", "slow down"),
	    }
	    in_french, in_english = wording[action["op"]]
	    if french:
	        q = f"si tu entends {desc}, {in_french}"
	    else:
	        q = f"if you hear {desc}, {in_english}"
	    return dict(family="a_event", fmt="ACTION", use_v=False, use_a=True, use_p=False,
	                spec={"kind": "audio", "sound": snd},
	                prompt=f"[AUD] {q} [ACTION]", answer=_aj(action))


	# builder: produce a held-out eval set in-memory (disjoint seed, no files)
	def build_eval(n=1800, seed=99991, families="auto"):
	    """Generate *n* held-out eval rows in memory and return them as a list of dicts.

	    families: "reasoning" selects the six visual/physics/cross-modal
	    generators; any other value ("auto", "audio", ...) selects all nine.

	    The global ``random`` module is seeded with *seed* for the duration of
	    the call, and its previous state is restored afterwards even if a
	    generator raises, so callers' random streams are never left reseeded.
	    Each row also gets a "text" key holding ``prompt + " " + answer``.
	    """
	    if families == "reasoning":
	        gens = [v_motion, v_ocr, p_identify, p_predict, x_robot, x_sensor]
	    else:
	        gens = [v_motion, v_ocr, p_identify, p_predict, x_robot, x_sensor,
	                a_identify, a_match, a_event]
	    _st = random.getstate()
	    random.seed(seed)
	    rows = []
	    try:
	        for _ in range(n):
	            s = random.choice(gens)()
	            s["text"] = s["prompt"] + " " + s["answer"]
	            rows.append(s)
	    finally:
	        # Always hand the caller's RNG state back.
	        random.setstate(_st)
	    return rows
	# =============================================================================
	# Benchmark
	# =============================================================================
	def load_ckpt(model, path):
	    """Load the weights stored at `path` into `model` (strict key matching).

	    The file may hold either a dict with the state dict under "model" or a
	    bare state dict.

	    Returns:
	        (step, best) read from the loaded object, defaulting to "?" and
	        None when those keys are absent.
	    """
	    # NOTE(review): torch.load may unpickle arbitrary objects depending on
	    # the installed torch version's `weights_only` default; only point this
	    # at trusted checkpoints.
	    checkpoint = torch.load(path, map_location=device)
	    state_dict = checkpoint.get("model", checkpoint)
	    model.load_state_dict(state_dict, strict=True)
	    step = checkpoint.get("step", "?")
	    best = checkpoint.get("best", None)
	    return step, best


	# =============================================================================
	# SAM-MM — HuggingFace Space (self-contained; weights pulled from HF)
	# Architecture inlined above. Set HF_TOKEN as a Space secret for private repos.
	# =============================================================================
	import io
	try:
	from PIL import Image
	except ImportError:
	os.system(f"{sys.executable} -m pip install -q Pillow --break-system-packages"); from PIL import Image
	import gradio as gr

	# Dropdown label -> checkpoint reference handed to resolve_ckpt().
	CHECKPOINTS = {
	    "Reasoning — vision + physics": "AMFORGE/sam-mm-reasoning-checkpoints:best.pt",
	    "Audio-reasoning — + sound": "AMFORGE/sam-mm-audio-reasoning-checkpoints:best.pt",
	}

	# Dropdown label -> family name (key into FAMFUNC).
	SCENES = {
	    "🪐 Physics — identify the dynamic": "p_identify",
	    "🎯 Physics — predict the outcome": "p_predict",
	    "➡️ Vision — direction of motion": "v_motion",
	    "🔢 Vision — read the number (OCR)": "v_ocr",
	    "🛰️ Cross-modal — sensor → action": "x_sensor",
	    "🔊 Audio — identify the sound": "a_identify",
	    "🎬 Audio — match sight + sound": "a_match",
	    "⚡ Audio — sound → action": "a_event",
	}

	# Family name -> generator that builds one fresh sample of that family.
	# NOTE(review): x_robot has no entry here or in SCENES — confirm that is intended.
	FAMFUNC = {
	    "p_identify": p_identify,
	    "p_predict": p_predict,
	    "v_motion": v_motion,
	    "v_ocr": v_ocr,
	    "x_sensor": x_sensor,
	    "a_identify": a_identify,
	    "a_match": a_match,
	    "a_event": a_event,
	}

	# Families that involve a sound cue: exactly the "a_"-prefixed ones
	# (a_identify, a_match, a_event).
	AUDIO_FAMS = {name for name in FAMFUNC if name.startswith("a_")}

	# Single-slot cache of the last loaded model/tokenizer and its checkpoint ref.
	_STATE = dict(model=None, tok=None, ckpt=None)

	def _load(ckpt):
	    """Return the (model, tokenizer) pair for checkpoint reference `ckpt`.

	    The most recently loaded pair is kept in `_STATE`; it is reused when the
	    same `ckpt` is requested again. Otherwise a tokenizer and a fresh
	    `SAMMM(Config())` are built, the resolved checkpoint is loaded into the
	    model, the model is switched to eval mode, and the cache is replaced.
	    """
	    cached_model = _STATE["model"]
	    if cached_model is not None and _STATE["ckpt"] == ckpt:
	        return cached_model, _STATE["tok"]

	    token = get_hf_token()
	    tok = Tok(token=token)
	    model = SAMMM(Config()).to(device)
	    weights_path = resolve_ckpt(ckpt, token)
	    load_ckpt(model, weights_path)
	    model.eval()

	    _STATE["model"] = model
	    _STATE["tok"] = tok
	    _STATE["ckpt"] = ckpt
	    return model, tok

	def _montage(frames):
	    """Tile the frames side by side into one upscaled RGB strip for display.

	    Args:
	        frames: tensor indexed by frame along its first axis; each frame is
	            clamped to [0, 1], moved from channel-first to channel-last and
	            scaled to 8-bit.
	            # assumes each frame is (3, IMG, IMG) float — TODO confirm
	            # against render_av.

	    Returns:
	        A PIL image holding the frames left to right with a 2-pixel gap on a
	        dark background, enlarged 4x with nearest-neighbour resampling so the
	        pixels stay crisp.
	    """
	    gap = 2      # pixels of background between neighbouring tiles
	    scale = 4    # integer upscaling factor of the final strip
	    n = frames.shape[0]
	    tiles = [(frame.clamp(0, 1).permute(1, 2, 0) * 255).byte().cpu().numpy()
	             for frame in frames]
	    # The multiplication operators were missing from these expressions
	    # ("IMGn + (n-1)2", "w4", "IMG4"), which made the function unparsable.
	    w = IMG * n + (n - 1) * gap
	    img = Image.new("RGB", (w, IMG), (17, 18, 26))
	    for i, t in enumerate(tiles):
	        img.paste(Image.fromarray(t), (i * (IMG + gap), 0))
	    return img.resize((w * scale, IMG * scale), Image.NEAREST)

	def _infer(ckpt_label, scene_label, max_new):
	    """Generate one fresh sample for the chosen scene and run the model on it.

	    Args:
	        ckpt_label: key of `CHECKPOINTS`.
	        scene_label: key of `SCENES`.
	        max_new: maximum number of new tokens; coerced to int.

	    Returns:
	        (sample, frames, got, gold, ok, fam), where `got` / `gold` are the
	        text after the last "Answer:" marker in the prediction / reference
	        (the whole stripped string when the marker is absent) and `ok` is
	        the result of `_chat_match` on the full prediction and reference.

	    Raises:
	        KeyError: for an unknown checkpoint or scene label.
	    """
	    ckpt = CHECKPOINTS[ckpt_label]
	    fam = SCENES[scene_label]
	    sample = FAMFUNC[fam]()
	    frames, mel = render_av(sample["spec"])
	    model, tok = _load(ckpt)
	    pred = generate(model, tok, sample["prompt"], frames, mel, max_new=int(max_new))

	    marker = "Answer:"
	    # rpartition yields the whole string as the tail when the marker is
	    # missing, so no separate "marker present?" branch is needed.
	    gold = sample["answer"].rpartition(marker)[2].strip()
	    got = pred.rpartition(marker)[2].strip()
	    # NOTE(review): ACTION-format scenes are scored with _chat_match as
	    # well — confirm that is the intended comparison.
	    ok = _chat_match(pred, sample["answer"])
	    return sample, frames, got, gold, ok, fam

	def run_one(ckpt_label, scene_label, max_new):
	    """Run a single fresh scene and format the result for the UI.

	    Returns:
	        (image, markdown): the frame montage and a Markdown report with the
	        verdict, the prompt, the model answer and the ground truth. The
	        report gains a note about the sound cue when the sample spec has a
	        "sound" entry, and a warning when an audio scene is run on a
	        checkpoint whose label does not contain "audio".
	    """
	    s, frames, got, gold, ok, fam = _infer(ckpt_label, scene_label, max_new)

	    audio_on_non_audio_ckpt = fam in AUDIO_FAMS and "audio" not in ckpt_label.lower()
	    verdict = "✅ correct" if ok else "❌ mismatch"

	    parts = [
	        f"### {verdict}\n",
	        f"Prompt → SAM-MM\n```\n{s['prompt']}\n```\n",
	        f"Model answer: `{got}`  •  Ground truth: `{gold}`",
	    ]
	    if "sound" in s["spec"]:
	        parts.append(f"\n\n*Sound cue `{s['spec']['sound']}` → a deterministic log-mel "
	                     f"(no audible file; the encoder reads the spectrogram).*")
	    if audio_on_non_audio_ckpt:
	        parts.append("\n\n> ⚠️ This is an audio scene on the reasoning checkpoint — "
	                     "it never learned sound, so a correct answer here is chance. "
	                     "Switch the checkpoint to Audio-reasoning to test it for real.")
	    md = "".join(parts)
	    return _montage(frames), md

	def run_batch(ckpt_label, scene_label, max_new, n=20):
	    """Run `n` fresh scenes of one family and summarise exact-match accuracy.

	    Args:
	        ckpt_label: key of `CHECKPOINTS`.
	        scene_label: key of `SCENES`.
	        max_new: maximum number of new tokens per generation.
	        n: number of scenes to run; coerced to int, values below 1 run
	            nothing.

	    Returns:
	        Markdown: a heading with hits/total and the accuracy percentage,
	        followed by one line per scene showing the model answer against the
	        ground truth.
	    """
	    total = max(int(n), 0)
	    hits = 0
	    lines = []
	    for _ in range(total):
	        _, _, got, gold, ok, _ = _infer(ckpt_label, scene_label, max_new)
	        hits += int(ok)
	        lines.append(f"{'✅' if ok else '❌'} `{got}` vs `{gold}`")
	    # Guard the division: an empty batch used to raise ZeroDivisionError.
	    acc = 100 * hits / total if total else 0.0
	    head = f"### {hits}/{total} correct   →   {acc:.0f}% exact-match\n\n"
	    return head + "\n".join(lines)

	# Stylesheet handed to gr.Blocks(css=...): caps the container width at 980px,
	# centres the element with id "title", renders images under id "frames" with
	# pixelated scaling and rounded corners, and hides the page footer.
	CSS = """
	.gradio-container {max-width: 980px !important}
	#title {text-align:center}
	#frames img {image-rendering: pixelated; border-radius: 10px}
	footer {display:none !important}
	"""
	# Gradio UI definition. The resulting Blocks app is bound to `demo`.
	# NOTE(review): indentation was lost in this copy of the file, so the nesting
	# of the `with` blocks below cannot be read off the text. Presumably the
	# controls live in the scale=1 column and the image/markdown outputs in the
	# scale=2 column — confirm against the original before re-indenting.
	with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate"), css=CSS,
	title="SAM-MM · multimodal demo") as demo:
	# Page title; elem_id="title" is targeted by the CSS rule that centres it.
	gr.Markdown("# 🧠 SAM-MM — a 58M multimodal model that reasons", elem_id="title")
	# Introductory blurb shown under the title.
	gr.Markdown(
	"Pick a scene. SAM-MM perceives the rendered frames (and, for audio scenes, a "
	"log-mel spectrogram), then answers in `[CHAT]` text or a `[ACTION]` JSON record. "
	"Frames are synthetic — this is the model's native world. Nothing is hard-coded: each "
	"scene is freshly generated, the model decodes token-by-token, and the answer is checked "
	"against ground truth computed independently.")
	with gr.Row():
	with gr.Column(scale=1):
	# Inputs: checkpoint and scene default to the first entry of their dicts.
	ckpt = gr.Dropdown(list(CHECKPOINTS), value=list(CHECKPOINTS)[0], label="Checkpoint")
	scene = gr.Dropdown(list(SCENES), value=list(SCENES)[0], label="Scene")
	# Generation budget: 24..96 tokens in steps of 8, default 64.
	max_new = gr.Slider(24, 96, value=64, step=8, label="max new tokens")
	with gr.Row():
	b1 = gr.Button("Generate & run", variant="primary")
	b2 = gr.Button("Run 20 (accuracy)")
	with gr.Column(scale=2):
	# Outputs of run_one: the frame montage and the per-scene report.
	img = gr.Image(label="What SAM-MM sees", elem_id="frames")
	md = gr.Markdown()
	# Output of run_batch: the accuracy summary.
	batch = gr.Markdown()
	# Wire the buttons. run_batch receives only three inputs, so its `n`
	# parameter keeps its default value.
	b1.click(run_one, [ckpt, scene, max_new], [img, md])
	b2.click(run_batch, [ckpt, scene, max_new], [batch])
	# Closing caveats shown at the bottom of the page.
	gr.Markdown(
	"---\nHonest notes. Physics & motion are SAM-MM's strength (its world-model carries "
	"real dynamics). OCR generalizes to unseen numbers but isn't perfect. The cross-modal "
	"`[ACTION]` family is weaker. Audio is the weak modality — it was trained on synthetic "
	"pseudo-mel, so strong audio scores here partly reflect that, not true listening. "
	"Architecture internals are proprietary and not exposed.")

	# Start the Gradio app only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	demo.launch()