Create cell2_model.py

cf3bcc2 verified 19 days ago

13.5 kB

	"""
	Patch Cross-Attention Shape Classifier — VAE-Matched (8×16×16)
	================================================================
	Replaces Conv3d backbone with v11-style decomposition + cross-attention.

	Input: (B, 8, 16, 16) binary voxel grid
	→ Decompose into patches (macro grid)
	→ Shared patch encoder (learned MLP features + handcrafted statistics)
	→ Positional embedding
	→ Cross-attention layers (self-attention: every patch attends to all patches)
	→ Pool → Classify

	Patch scheme: 2×4×4 patches → 4×4×4 macro grid (64 patches, 32 voxels each)
	- Preserves aspect ratio at macro level
	- 32 voxels per patch = tractable for shared MLP
	- 64 patches = reasonable sequence length for attention
	"""

	import math
	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	# === Grid Constants ===========================================================
	# Voxel grid extents, ordered (Z, Y, X).
	GZ, GY, GX = 8, 16, 16
	GRID_SHAPE = (GZ, GY, GX)
	GRID_VOLUME = GZ * GY * GX  # 2048 voxels in total

	# Patch decomposition: extents of one local patch, ordered (Z, Y, X).
	PATCH_Z, PATCH_Y, PATCH_X = 2, 4, 4
	PATCH_VOL = PATCH_Z * PATCH_Y * PATCH_X  # 32 voxels per patch

	# Macro grid: how many patches fit along each axis.
	MACRO_Z = GZ // PATCH_Z  # 4
	MACRO_Y = GY // PATCH_Y  # 4
	MACRO_X = GX // PATCH_X  # 4
	MACRO_N = MACRO_Z * MACRO_Y * MACRO_X  # 64 patches per grid

	# Shape classes (list position is the class index).
	CLASS_NAMES = [
	    "point",
	    "line_x",
	    "line_y",
	    "line_z",
	    "line_diag",
	    "cross",
	    "l_shape",
	    "collinear",
	    "triangle_xy",
	    "triangle_xz",
	    "triangle_3d",
	    "square_xy",
	    "square_xz",
	    "rectangle",
	    "coplanar",
	    "plane",
	    "tetrahedron",
	    "pyramid",
	    "pentachoron",
	    "cube",
	    "cuboid",
	    "triangular_prism",
	    "octahedron",
	    "arc",
	    "helix",
	    "circle",
	    "ellipse",
	    "disc",
	    "sphere",
	    "hemisphere",
	    "cylinder",
	    "cone",
	    "capsule",
	    "torus",
	    "shell",
	    "tube",
	    "bowl",
	    "saddle",
	]

	# Curvature categories (list position is the category index).
	CURVATURE_NAMES = [
	    "none",
	    "convex",
	    "concave",
	    "cylindrical",
	    "conical",
	    "toroidal",
	    "hyperbolic",
	    "helical",
	]

	# Head widths follow the name lists above.
	NUM_CLASSES = len(CLASS_NAMES)  # 38
	NUM_CURVATURES = len(CURVATURE_NAMES)  # 8


	# === SwiGLU ===================================================================

	class SwiGLU(nn.Module):
	    """
	    Gated linear unit with a SiLU gate.

	    Two parallel linear maps (in_dim -> out_dim) are applied to the input;
	    the result is the first projection multiplied element-wise by the SiLU
	    of the second.
	    """

	    def __init__(self, in_dim, out_dim):
	        super().__init__()
	        # w1 is created before w2 so seeded initialisation stays the same.
	        self.w1 = nn.Linear(in_dim, out_dim)  # value branch
	        self.w2 = nn.Linear(in_dim, out_dim)  # gate branch

	    def forward(self, x):
	        value = self.w1(x)
	        gate = F.silu(self.w2(x))
	        return value * gate


	# === Patch Encoder ============================================================

	class PatchEncoder(nn.Module):
	    """
	    Encoder shared by every 2×4×4 local patch.

	    A learned embedding of the flattened voxels is concatenated with seven
	    handcrafted statistics and fused by a small MLP.

	    Input: (M, 2, 4, 4) binary grids where M = B * 64
	    Output: (M, patch_feat_dim) feature vectors
	    """

	    def __init__(self, patch_feat_dim=96):
	        super().__init__()

	        # Learned branch: flattened voxels -> patch_feat_dim.
	        # (Built before `combine` so seeded initialisation is unchanged.)
	        self.mlp = nn.Sequential(
	            nn.Linear(PATCH_VOL, 256),
	            nn.GELU(),
	            nn.Linear(256, 128),
	            nn.GELU(),
	            nn.Linear(128, patch_feat_dim),
	        )

	        # Handcrafted branch width:
	        # occupancy(1) + per-axis std(3) + surface ratio(1)
	        # + z_spread(1) + yx_spread(1) = 7
	        hand_dim = 7
	        fused_in = patch_feat_dim + hand_dim
	        self.combine = nn.Sequential(
	            nn.Linear(fused_in, patch_feat_dim),
	            nn.GELU(),
	            nn.Linear(patch_feat_dim, patch_feat_dim),
	        )

	    @staticmethod
	    def _handcrafted(patches, flat):
	        """Return the (M, 7) handcrafted statistics for `patches` / its flattened view."""
	        occupancy = flat.mean(dim=-1, keepdim=True)

	        # Std of the mean-occupancy profile along z, y and x (in that order).
	        profile_std = [
	            patches.mean(dim=reduced).std(dim=1, keepdim=True)
	            for reduced in ((2, 3), (1, 3), (1, 2))
	        ]

	        # Surface ratio: occupied voxels whose zero-padded 3×3×3 neighbourhood
	        # is not completely full, divided by the occupied count (at least 1).
	        # NOTE(review): with a patch depth of 2 every voxel touches the zero
	        # padding along z, so for (M, 2, 4, 4) inputs this looks like it is
	        # always 1 for a non-empty patch and 0 for an empty one.
	        padded = F.pad(patches.unsqueeze(1), (1, 1, 1, 1, 1, 1), mode='constant', value=0)
	        nbr_mean = F.avg_pool3d(padded, kernel_size=3, stride=1, padding=0).squeeze(1)
	        on_surface = (nbr_mean < 1.0) & (patches > 0.5)
	        n_surface = on_surface.float().sum(dim=(1, 2, 3))
	        n_filled = flat.sum(dim=-1).clamp(min=1)
	        surf_ratio = (n_surface / n_filled).unsqueeze(-1)

	        # Spread: fraction of z slices / (y, x) columns holding any voxel.
	        z_used = patches.sum(dim=(2, 3)) > 0
	        z_spread = z_used.float().mean(dim=1, keepdim=True)
	        yx_used = patches.sum(dim=1) > 0
	        yx_spread = yx_used.float().mean(dim=(1, 2)).unsqueeze(-1)

	        return torch.cat(
	            [occupancy, *profile_std, surf_ratio, z_spread, yx_spread], dim=-1)

	    def forward(self, patches):
	        """patches: (M, 2, 4, 4)"""
	        flat = patches.reshape(patches.shape[0], -1)
	        learned = self.mlp(flat)
	        hand = self._handcrafted(patches, flat)
	        return self.combine(torch.cat([learned, hand], dim=-1))


	# === Cross-Attention Block ====================================================

	class CrossAttentionBlock(nn.Module):
	    """
	    Pre-norm transformer block: LN → MHA → residual → LN → FFN → residual.

	    Despite the name, query, key and value all come from the same patch
	    sequence, i.e. this is self-attention over patches.

	    Args:
	        embed_dim: Token width (nn.MultiheadAttention requires it to be
	            divisible by num_heads).
	        num_heads: Number of attention heads.
	        ff_mult: Hidden width of the feed-forward net as a multiple of embed_dim.
	        dropout: Dropout probability used inside the attention module and
	            after the feed-forward output projection.
	    """

	    def __init__(self, embed_dim, num_heads=8, ff_mult=2, dropout=0.05):
	        super().__init__()
	        self.ln1 = nn.LayerNorm(embed_dim)
	        self.attn = nn.MultiheadAttention(
	            embed_dim, num_heads=num_heads, batch_first=True, dropout=dropout)
	        self.ln2 = nn.LayerNorm(embed_dim)
	        self.ff = nn.Sequential(
	            nn.Linear(embed_dim, embed_dim * ff_mult), nn.GELU(),
	            nn.Linear(embed_dim * ff_mult, embed_dim),
	            nn.Dropout(dropout))

	    def forward(self, x):
	        """Apply the block to x of shape (B, N, embed_dim); the shape is preserved."""
	        # Self-attention (each patch attends to all patches)
	        normed = self.ln1(x)
	        # The attention map is discarded, so do not ask for it:
	        # need_weights=False skips building the averaged weights and lets
	        # PyTorch use its optimised attention implementation.
	        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
	        x = x + attn_out
	        x = x + self.ff(self.ln2(x))
	        return x


	# === Main Classifier ==========================================================

	class PatchCrossAttentionClassifier(nn.Module):
	    """
	    8×16×16 → patch decomposition → shared encoder → cross-attention → classify.

	    Architecture:
	    1. Decompose (B, 8, 16, 16) into (B, 64, 2, 4, 4) patches
	    2. Shared PatchEncoder → (B, 64, patch_feat_dim)
	    3. Project + add 3D positional embedding → (B, 64, embed_dim)
	    4. N cross-attention layers
	    5. Global pool → classify

	    Roughly 0.64M parameters with the default arguments; the count scales
	    with embed_dim, patch_feat_dim and n_layers.

	    Args:
	        n_classes: Width of the main classification head.
	        embed_dim: Token width inside the attention stack.
	        patch_feat_dim: Output width of the shared PatchEncoder.
	        n_layers: Number of CrossAttentionBlock layers.
	        n_heads: Attention heads per layer.
	        dropout: Dropout probability passed to every attention block.
	    """

	    def __init__(self, n_classes=NUM_CLASSES, embed_dim=128, patch_feat_dim=96,
	                 n_layers=3, n_heads=8, dropout=0.05):
	        super().__init__()
	        self.embed_dim = embed_dim
	        self.patch_feat_dim = patch_feat_dim

	        # Shared patch encoder
	        self.patch_encoder = PatchEncoder(patch_feat_dim)

	        # Project patch features + occupancy + position → embed_dim
	        patch_in = patch_feat_dim + 1 + 3  # feat + occ + 3D pos
	        self.patch_proj = nn.Sequential(
	            nn.Linear(patch_in, embed_dim), nn.GELU(),
	            nn.Linear(embed_dim, embed_dim))

	        # Learnable positional embedding, one vector per macro-grid cell
	        self.pos_embed = nn.Parameter(torch.randn(1, MACRO_N, embed_dim) * 0.02)

	        # Cross-attention layers
	        self.layers = nn.ModuleList([
	            CrossAttentionBlock(embed_dim, n_heads, ff_mult=2, dropout=dropout)
	            for _ in range(n_layers)
	        ])

	        # Final norm before pooling
	        self.final_ln = nn.LayerNorm(embed_dim)

	        # Global features: statistics of the full grid.
	        # occupancy(1) + axis std(3) + surface ratio(1) + symmetry(3) + extent(3)
	        n_global = 11  # same as VAEShapeClassifier handcrafted
	        self.global_proj = nn.Sequential(
	            nn.Linear(n_global, 64), nn.GELU(),
	            nn.Linear(64, 64))

	        # Classification
	        class_in = embed_dim + 64  # pooled attention + global features
	        self.class_in = class_in
	        self.classifier = nn.Sequential(
	            nn.Linear(class_in, 256), nn.GELU(), nn.Dropout(0.1),
	            nn.Linear(256, 128), nn.GELU(),
	            nn.Linear(128, n_classes))

	        # Auxiliary heads (all read the same fused feature vector)
	        self.dim_head = nn.Sequential(
	            nn.Linear(class_in, 64), nn.GELU(), nn.Linear(64, 4))
	        self.curved_head = nn.Sequential(
	            nn.Linear(class_in, 64), nn.GELU(), nn.Linear(64, 1))
	        self.curv_type_head = nn.Sequential(
	            nn.Linear(class_in, 64), nn.GELU(), nn.Linear(64, NUM_CURVATURES))

	        # Precompute macro grid positions, normalised to [0, 1] per axis.
	        # "ij" indexing + row-major reshape gives z-major order, the same
	        # order in which _decompose_patches flattens the patches.
	        coords = torch.stack(torch.meshgrid(
	            torch.arange(MACRO_Z, dtype=torch.float32) / max(MACRO_Z - 1, 1),
	            torch.arange(MACRO_Y, dtype=torch.float32) / max(MACRO_Y - 1, 1),
	            torch.arange(MACRO_X, dtype=torch.float32) / max(MACRO_X - 1, 1),
	            indexing="ij"), dim=-1)
	        self.register_buffer("macro_pos", coords.reshape(1, MACRO_N, 3))

	    def _decompose_patches(self, grid):
	        """
	        (B, 8, 16, 16) → (B*64, 2, 4, 4)

	        Reshape into (B, 4, 2, 4, 4, 4, 4) then permute/flatten.
	        Z: 8 = 4 macro × 2 local
	        Y: 16 = 4 macro × 4 local
	        X: 16 = 4 macro × 4 local

	        Patches of one sample are contiguous in the output and ordered
	        z-major over the macro grid.
	        """
	        B = grid.shape[0]
	        # (B, 8, 16, 16) → (B, MZ, PZ, MY, PY, MX, PX)
	        x = grid.reshape(B, MACRO_Z, PATCH_Z, MACRO_Y, PATCH_Y, MACRO_X, PATCH_X)
	        # → (B, MZ, MY, MX, PZ, PY, PX)
	        x = x.permute(0, 1, 3, 5, 2, 4, 6).contiguous()
	        # → (B*64, 2, 4, 4)
	        return x.reshape(B * MACRO_N, PATCH_Z, PATCH_Y, PATCH_X)

	    def _global_features(self, grid):
	        """
	        Extract global geometric statistics from (B, 8, 16, 16) grid.

	        Returns a (B, 11) tensor laid out as
	        [occupancy, std_z, std_y, std_x, surface ratio,
	         sym_z, sym_y, sym_x, extent_z, extent_y, extent_x].
	        """
	        B = grid.shape[0]
	        flat = grid.reshape(B, -1)

	        occ = flat.mean(dim=-1, keepdim=True)

	        # Std of the mean-occupancy profile along each axis
	        ax_z = grid.mean(dim=(2, 3)).std(dim=1, keepdim=True)
	        ax_y = grid.mean(dim=(1, 3)).std(dim=1, keepdim=True)
	        ax_x = grid.mean(dim=(1, 2)).std(dim=1, keepdim=True)

	        # Surface ratio: occupied voxels whose zero-padded 3×3×3
	        # neighbourhood is not completely full, over the occupied count.
	        padded = F.pad(grid.unsqueeze(1), (1,1,1,1,1,1), mode='constant', value=0)
	        neighbors = F.avg_pool3d(padded, kernel_size=3, stride=1, padding=0)
	        neighbors = neighbors.squeeze(1)
	        surface = ((neighbors < 1.0) & (grid > 0.5)).float().sum(dim=(1,2,3))
	        total = flat.sum(dim=-1).clamp(min=1)  # avoid 0/0 on empty grids
	        surf_ratio = (surface / total).unsqueeze(-1)

	        # Axis projection symmetry: each max-projection is compared with a
	        # copy flipped along BOTH of its axes (a 180° rotation).
	        proj_z = grid.max(dim=1).values
	        proj_y = grid.max(dim=2).values
	        proj_x = grid.max(dim=3).values

	        sym_z = 1.0 - (proj_z - torch.flip(proj_z, [1, 2])).abs().mean(dim=(1, 2))
	        sym_y = 1.0 - (proj_y - torch.flip(proj_y, [1, 2])).abs().mean(dim=(1, 2))
	        sym_x = 1.0 - (proj_x - torch.flip(proj_x, [1, 2])).abs().mean(dim=(1, 2))
	        sym = torch.stack([sym_z, sym_y, sym_x], dim=-1)

	        # Spatial extent: fraction of slices along each axis holding any voxel
	        z_extent = (grid.sum(dim=(2, 3)) > 0).float().sum(dim=1, keepdim=True) / GZ
	        y_extent = (grid.sum(dim=(1, 3)) > 0).float().sum(dim=1, keepdim=True) / GY
	        x_extent = (grid.sum(dim=(1, 2)) > 0).float().sum(dim=1, keepdim=True) / GX
	        extent = torch.cat([z_extent, y_extent, x_extent], dim=-1)

	        return torch.cat([occ, ax_z, ax_y, ax_x, surf_ratio, sym, extent], dim=-1)

	    def forward(self, grid, labels=None):
	        """
	        grid: (B, 8, 16, 16) binary voxel grid. Bool / integer grids are
	            converted to the model's floating dtype.
	        labels: not used by this method.

	        Returns a dict with
	            "class_logits":     (B, n_classes)
	            "dim_logits":       (B, 4)
	            "is_curved_pred":   (B, 1) raw head output (no sigmoid applied here)
	            "curv_type_logits": (B, NUM_CURVATURES)
	            "features":         (B, class_in) fused vector fed to every head

	        Raises:
	            ValueError: if grid is not shaped (B, 8, 16, 16).
	        """
	        # Any 4-D tensor with 2048 voxels per sample would survive the patch
	        # reshape (e.g. a transposed (B, 16, 16, 8) grid) and silently yield
	        # meaningless patches, so reject wrong layouts up front.
	        if grid.dim() != 4 or tuple(grid.shape[1:]) != GRID_SHAPE:
	            raise ValueError(
	                f"expected grid of shape (B, {GZ}, {GY}, {GX}), "
	                f"got {tuple(grid.shape)}")
	        # mean()/std() and the linear layers need a floating dtype; binary
	        # grids are often stored as bool or uint8.
	        if not grid.is_floating_point():
	            grid = grid.to(self.pos_embed.dtype)

	        B = grid.shape[0]

	        # === Global features ===
	        global_feat = self.global_proj(self._global_features(grid))

	        # === Patch decomposition + encoding ===
	        patches = self._decompose_patches(grid)  # (B*64, 2, 4, 4)
	        patch_feats = self.patch_encoder(patches)  # (B*64, patch_feat_dim)
	        patch_feats = patch_feats.reshape(B, MACRO_N, self.patch_feat_dim)

	        # Per-patch occupancy
	        patch_occ = patches.reshape(B, MACRO_N, PATCH_VOL).mean(dim=-1, keepdim=True)

	        # Combine: features + occupancy + position
	        pos = self.macro_pos.expand(B, -1, -1)
	        patch_input = torch.cat([patch_feats, patch_occ, pos], dim=-1)
	        x = self.patch_proj(patch_input)

	        # Add learnable positional embedding (broadcast over the batch)
	        x = x + self.pos_embed

	        # === Cross-attention layers ===
	        for layer in self.layers:
	            x = layer(x)

	        x = self.final_ln(x)

	        # === Pool: mean over patches ===
	        pooled = x.mean(dim=1)  # (B, embed_dim)

	        # === Combine with global features ===
	        feat = torch.cat([pooled, global_feat], dim=-1)  # (B, class_in)

	        # === Classification ===
	        class_logits = self.classifier(feat)
	        dim_logits = self.dim_head(feat)
	        is_curved = self.curved_head(feat)
	        curv_logits = self.curv_type_head(feat)

	        return {
	            "class_logits": class_logits,
	            "dim_logits": dim_logits,
	            "is_curved_pred": is_curved,
	            "curv_type_logits": curv_logits,
	            "features": feat,
	        }


	# === Confidence ===============================================================

	def compute_confidence(logits):
	    """
	    Summarise how confident a softmax prediction is.

	    Args:
	        logits: Tensor of shape (..., C); softmax is taken over the last dim.

	    Returns:
	        Dict of tensors, each with shape logits.shape[:-1]:
	            "max_prob":   largest class probability
	            "margin":     top-1 minus top-2 probability (equals max_prob
	                          when there is only one class)
	            "entropy":    Shannon entropy divided by log(C), so it lies in
	                          [0, 1] (0 when there is only one class)
	            "confidence": the same tensor as "margin"
	    """
	    n_classes = logits.shape[-1]
	    probs = F.softmax(logits, dim=-1)
	    log_probs = F.log_softmax(logits, dim=-1)

	    max_prob = probs.max(dim=-1).values

	    if n_classes > 1:
	        top2 = probs.topk(2, dim=-1).values
	        # Ellipsis indexing keeps this valid for any number of leading dims.
	        margin = top2[..., 0] - top2[..., 1]
	        max_entropy = math.log(n_classes)
	    else:
	        # A single class has no runner-up (topk(2) would raise) and
	        # log(1) == 0 would turn the normalisation into a division by zero.
	        margin = max_prob.clone()
	        max_entropy = 1.0

	    # Masked classes (logit == -inf) have p == 0 and log p == -inf; clamping
	    # the log makes their term 0 instead of 0 * -inf = NaN. Finite values
	    # are unaffected.
	    safe_log = log_probs.clamp(min=torch.finfo(log_probs.dtype).min)
	    entropy = -(probs * safe_log).sum(dim=-1)

	    return {"max_prob": max_prob, "margin": margin,
	            "entropy": entropy / max_entropy, "confidence": margin}


	# === Sanity check =============================================================
	if __name__ == "__main__":
	    # Smoke test: build the default model, report its size, then push an
	    # all-zero batch of two grids through it without tracking gradients.
	    _m = PatchCrossAttentionClassifier()
	    _n = sum(map(torch.numel, _m.parameters()))
	    print(f'PatchCrossAttentionClassifier: {_n:,} params')
	    print(f' Patches: {MACRO_Z}×{MACRO_Y}×{MACRO_X} = {MACRO_N} patches of {PATCH_Z}×{PATCH_Y}×{PATCH_X}')

	    _dummy = torch.zeros((2, GZ, GY, GX))
	    with torch.no_grad():
	        _out = _m(_dummy)

	    print(f' class_logits: {_out["class_logits"].shape}')
	    print(f' features: {_out["features"].shape}')
	    print(f' class_in: {_m.class_in}')

	    # Drop the temporaries so they do not linger in the module namespace.
	    del _m, _dummy, _out