# SymbolicLight-PoC / model.py
# (Hugging Face upload header: symboliclight-ai, "Upload 4 files", commit 17c23ae, verified)
#!/usr/bin/env python3
"""
SymbolicLight — Next-Generation Neuro-Symbolic Spiking Large Model Architecture
==================================================
Key Innovations:
1. SparseTCAM: Spiking sparse routing replacing Self-Attention
2. LIF Neurons: Event-driven replacing dense activation
3. EntropyGate: On-demand compute depth (early exit for simple queries)
4. BayesianHead: Bayesian token selection replacing Softmax
5. STDP: Online learning during inference (no backward pass needed)
"""
import math
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
# ============================================================================
# Configuration
# ============================================================================
@dataclass
class SymbolicLightConfig:
    """SymbolicLight 0.1B default configuration.

    Groups standard Transformer-like sizing knobs together with the
    spiking/neuro-symbolic hyperparameters specific to SymbolicLight.
    """
    vocab_size: int = 32000              # Vocabulary size (reusing a Qwen/Llama tokenizer)
    embed_dim: int = 768                 # Embedding dimension
    n_layers: int = 12                   # Number of SymbolicLightBlock layers
    n_heads: int = 12                    # Number of channels in SparseTCAM
    head_dim: int = 64                   # Dimension per channel (embed_dim / n_heads)
    intermediate_dim: int = 2048         # Feed-forward intermediate dimension
    max_seq_len: int = 2048              # Maximum sequence length
    dropout: float = 0.1                 # Dropout probability
    # --- SymbolicLight-specific parameters ---
    spike_threshold: float = 1.0         # LIF neuron firing threshold
    leak_factor: float = 0.95            # Membrane-potential leak factor
    stdp_lr: float = 0.01                # STDP learning rate
    entropy_exit_threshold: float = 0.3  # EntropyGate early-exit threshold
    enable_entropy_exit: bool = False    # Enable EntropyGate early exit (requires a trained early-exit head)
    enable_stdp: bool = False            # Enable STDP (disabled during pre-training, enabled during deployment)
# ============================================================================
# Surrogate Gradient Function (enables BP training for non-differentiable spikes)
# ============================================================================
class SurrogateSpike(torch.autograd.Function):
    """Hard-threshold spike with a sigmoid surrogate gradient.

    Forward: hard threshold -> 0/1 spikes (non-differentiable).
    Backward: the derivative of a scaled sigmoid stands in as the gradient.
    This is the key mathematical trick enabling SNNs to be trained with
    backpropagation.
    """

    sigma = 10.0  # Steepness of the surrogate gradient

    @staticmethod
    def forward(ctx, membrane_potential, threshold):
        # `threshold` is a plain float: keep it on `ctx` rather than wrapping
        # it in a tensor. The original wrapped it via torch.tensor(threshold),
        # which lives on the CPU and causes a device mismatch in backward()
        # when the membrane potential is on an accelerator.
        ctx.save_for_backward(membrane_potential)
        ctx.threshold = threshold
        return (membrane_potential >= threshold).float()

    @staticmethod
    def backward(ctx, grad_output):
        (membrane_potential,) = ctx.saved_tensors
        # Surrogate gradient: d/dx sigmoid(σx) = σ·sigmoid(σx)·(1 - sigmoid(σx))
        x = SurrogateSpike.sigma * (membrane_potential - ctx.threshold)
        sigmoid_x = torch.sigmoid(x)
        surrogate_grad = SurrogateSpike.sigma * sigmoid_x * (1.0 - sigmoid_x)
        # No gradient with respect to the (non-tensor) threshold argument.
        return grad_output * surrogate_grad, None
def surrogate_spike(membrane_potential: torch.Tensor, threshold: float = 1.0) -> torch.Tensor:
    """Convenience wrapper around :class:`SurrogateSpike`.

    Returns a 0/1 spike tensor (membrane_potential >= threshold) whose
    backward pass uses the sigmoid surrogate gradient.
    """
    return SurrogateSpike.apply(membrane_potential, threshold)
# ============================================================================
# ② SpikeEncoder
# ============================================================================
class SpikeEncoder(nn.Module):
    """Convert discrete token IDs into spatio-temporal spike tensors.

    Pipeline: token_id -> Embedding (+ learned positions) -> LayerNorm -> LIF spiking.
    """

    def __init__(self, config: "SymbolicLightConfig"):
        super().__init__()
        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)
        self.norm = nn.LayerNorm(config.embed_dim)
        self.threshold = config.spike_threshold
        self.leak = config.leak_factor
        # Positional encoding: learnable embedding (simple and effective).
        self.pos_embedding = nn.Embedding(config.max_seq_len, config.embed_dim)
        # Membrane potential, re-created per batch. Registered as a
        # NON-persistent buffer: its shape depends on the batch size, so it
        # must not be written into (or loaded from) a state_dict checkpoint —
        # the original registration made it persistent by default.
        self.register_buffer("v_mem", None, persistent=False)

    def _init_membrane(self, shape: torch.Size, device: torch.device) -> None:
        """Initialize/reset the membrane potential to zeros."""
        self.v_mem = torch.zeros(shape, device=device)

    def forward(self, token_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            token_ids: [batch, seq_len] integer token IDs.
        Returns:
            spikes:     [batch, seq_len, embed_dim] sparse 0/1 spikes.
            continuous: [batch, seq_len, embed_dim] continuous representation
                        (used downstream for residual connections).
        """
        B, S = token_ids.shape
        positions = torch.arange(S, device=token_ids.device).unsqueeze(0)
        # Embedding + positional encoding.
        x = self.embedding(token_ids) + self.pos_embedding(positions)
        x = self.norm(x)
        # LIF spiking: iterate step-by-step over "time" (the sequence axis).
        self._init_membrane((B, x.size(-1)), x.device)
        spikes_list = []
        for t in range(S):
            # Leak + integrate.
            self.v_mem = self.v_mem * self.leak + x[:, t, :]
            # Fire (surrogate gradient keeps this backpropagatable).
            spike = surrogate_spike(self.v_mem, self.threshold)
            # Hard reset of the membrane wherever a spike fired.
            self.v_mem = self.v_mem * (1.0 - spike)
            spikes_list.append(spike)
        spikes = torch.stack(spikes_list, dim=1)  # [B, S, D]
        # Return spikes plus the continuous representation for residual paths.
        return spikes, x
# ============================================================================
# ③a SparseTCAM — In-Memory Compute Sparse Routing (Replaces Self-Attention)
# ============================================================================
class SparseTCAM(nn.Module):
    """Software emulation of the S100 Graph-TCAM in-memory compute unit.

    Contrast with self-attention:
      - Attention:  Q @ K^T -> dense all-to-all matmul, O(n^2).
      - SparseTCAM: spikes x weights -> only weight rows hit by a spike are
        touched, O(n*k) with k << n.
    On GPU this "sparse read" is emulated with a spike mask.
    """

    def __init__(self, config: "SymbolicLightConfig"):
        super().__init__()
        self.n_heads = config.n_heads
        self.head_dim = config.head_dim
        self.embed_dim = config.embed_dim
        self.threshold = config.spike_threshold
        self.leak = config.leak_factor
        # TCAM weight matrix (in-memory compute: storage doubles as compute).
        self.tcam_proj = nn.Linear(config.embed_dim, config.embed_dim, bias=False)
        # Output projection.
        self.out_proj = nn.Linear(config.embed_dim, config.embed_dim, bias=False)
        self.norm = nn.LayerNorm(config.embed_dim)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, spikes: torch.Tensor, continuous: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            spikes:     [B, S, D] input spikes (sparse 0/1).
            continuous: [B, S, D] continuous representation (residual path).
        Returns:
            out_spikes:     [B, S, D] output spikes.
            out_continuous: [B, S, D] updated continuous representation.
        """
        batch, seq_len, dim = spikes.shape
        # Spike-mask routing: only positions that carry at least one spike
        # participate in the TCAM lookup.
        energy = spikes.sum(dim=-1)                  # [B, S] spike energy per position
        active = (energy > 0).unsqueeze(-1).float()  # [B, S, 1]
        # In-memory addressing: spikes x TCAM weights. On a real chip this is
        # a single-cycle TCAM match; here it is a masked matmul.
        routed = self.tcam_proj(spikes * active)
        # Split into channels (multi-head-like, but driven by spike channels).
        routed = routed.view(batch, seq_len, self.n_heads, self.head_dim)
        # Causal temporal aggregation: a running (cumulative) mean stands in
        # for all-to-all attention and cannot see future positions.
        running_sum = torch.cumsum(routed, dim=1)
        steps = torch.arange(1, seq_len + 1, device=spikes.device).float().view(1, seq_len, 1, 1)
        aggregated = running_sum / steps
        # Merge channels and project out.
        aggregated = aggregated.view(batch, seq_len, dim)
        projected = self.out_proj(self.dropout(aggregated))
        # Residual + LayerNorm, then re-spike via the LIF threshold.
        out_continuous = self.norm(continuous + projected)
        out_spikes = surrogate_spike(out_continuous, self.threshold)
        return out_spikes, out_continuous
# ============================================================================
# ③b EntropyGate — Entropy Gating (On-demand compute depth)
# ============================================================================
class EntropyGate(nn.Module):
    """Entropy-based gate for on-demand compute depth (S22 Entropy Engine).

    Low entropy  -> the model is already confident; later layers can be skipped.
    High entropy -> the model is still uncertain; continue to deeper layers.
    A plain Transformer always runs every layer regardless of query difficulty.
    """

    def __init__(self, config: "SymbolicLightConfig"):
        super().__init__()
        self.threshold = config.entropy_exit_threshold
        # Learnable exit predictor. NOTE(review): currently unused by
        # forward(); kept so any checkpoints that include it stay loadable.
        self.gate = nn.Linear(config.embed_dim, 1)

    def forward(self, spikes: torch.Tensor) -> Tuple[torch.Tensor, bool]:
        """
        Returns:
            entropy:     scalar mean binary entropy of the spike firing rate.
            should_exit: True when entropy falls below threshold at eval time
                         (always False while training).
        """
        # Firing rate: fraction of activated dimensions at each position.
        rate = spikes.mean(dim=-1)  # [B, S]
        # Binary entropy H(p) = -p*log(p) - (1-p)*log(1-p); clamp for stability.
        p = rate.clamp(1e-7, 1 - 1e-7)
        entropy = -(p * p.log() + (1 - p) * (1 - p).log()).mean()
        # Exit is only ever suggested outside of training.
        should_exit = False if self.training else entropy.item() < self.threshold
        return entropy, should_exit
# ============================================================================
# ③c FeedForward — Feed-Forward Network (with spike sparsity)
# ============================================================================
class SpikingFeedForward(nn.Module):
    """Spiking replacement for the Transformer 2-layer MLP.

    Key difference: the hidden layer is activated with LIF spikes (0/1)
    instead of GELU/ReLU, making the intermediate representation extremely
    sparse.
    """

    def __init__(self, config: "SymbolicLightConfig"):
        super().__init__()
        self.up = nn.Linear(config.embed_dim, config.intermediate_dim, bias=False)
        self.down = nn.Linear(config.intermediate_dim, config.embed_dim, bias=False)
        self.norm = nn.LayerNorm(config.embed_dim)
        self.threshold = config.spike_threshold
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shortcut = x
        hidden = self.up(x)
        # LIF spike in place of GELU: hidden becomes a sparse 0/1 tensor.
        hidden = surrogate_spike(hidden, self.threshold)
        projected = self.down(self.dropout(hidden))
        return self.norm(shortcut + projected)
# ============================================================================
# ③ SymbolicLightBlock — Single Layer Compute Block
# ============================================================================
class SymbolicLightBlock(nn.Module):
    """One complete SymbolicLight layer.

    Composition: SparseTCAM (in-memory routing) -> SpikingFeedForward
    (spiking feed-forward) -> EntropyGate (entropy gating).
    """

    def __init__(self, config: "SymbolicLightConfig"):
        super().__init__()
        self.tcam = SparseTCAM(config)
        self.ffn = SpikingFeedForward(config)
        self.entropy_gate = EntropyGate(config)

    def forward(self, spikes: torch.Tensor, continuous: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, bool]:
        # In-memory compute routing (attention replacement).
        spikes, continuous = self.tcam(spikes, continuous)
        # Spiking feed-forward on the continuous path.
        continuous = self.ffn(continuous)
        # Re-derive spikes from the refreshed continuous representation.
        spikes = surrogate_spike(continuous, self.tcam.threshold)
        # Entropy gate: may recommend an early exit to the caller.
        _, should_exit = self.entropy_gate(spikes)
        return spikes, continuous, should_exit
# ============================================================================
# ④ BayesianHead — Bayesian Output Head (Replaces Softmax)
# ============================================================================
class BayesianHead(nn.Module):
    """Bayesian output head (S100 LALU array) replacing a plain Softmax head.

    Softmax:  P(word) = exp(logit) / Σexp(logits)             <- brute normalization
    Bayesian: P(word|context) ∝ P(context|word) x P(word)     <- posterior update
    V1 approximates the Bayesian update as an addition in the log domain.
    """

    def __init__(self, config: "SymbolicLightConfig"):
        super().__init__()
        self.output_proj = nn.Linear(config.embed_dim, config.vocab_size, bias=False)
        # Learnable log-prior (word-frequency bias), initialized flat.
        self.log_prior = nn.Parameter(torch.zeros(config.vocab_size))

    def forward(self, continuous: torch.Tensor) -> torch.Tensor:
        """
        Args:
            continuous: [B, S, D]
        Returns:
            logits: [B, S, vocab_size] log-domain scores, directly usable
                    with CrossEntropy training.
        """
        # Likelihood term: log P(context | word).
        log_likelihood = self.output_proj(continuous)  # [B, S, V]
        # Bayesian update in log space: posterior = likelihood + prior.
        # The [V]-shaped prior broadcasts over [B, S, V] automatically.
        return log_likelihood + self.log_prior
# ============================================================================
# ⑤ STDP Online Learner
# ============================================================================
class STDPUpdater:
    """STDP online learner (S100 ILE inductive learning engine).

    Activated ONLY during inference (model.eval()). Applies purely local,
    spike-timing-based weight updates — no loss.backward() required.
    """

    def __init__(self, config: "SymbolicLightConfig"):
        self.lr = config.stdp_lr           # STDP learning rate
        self.enabled = config.enable_stdp  # master switch

    @torch.no_grad()
    def update(self, model: nn.Module, pre_spikes: torch.Tensor, post_spikes: torch.Tensor):
        """Apply one STDP step to every block's TCAM weights.

        STDP rules:
          - Pre-synaptic fires first -> strengthen connection (LTP)
          - Post-synaptic fires first -> weaken connection (LTD)

        Args:
            model:       object exposing `blocks[i].tcam.tcam_proj`.
            pre_spikes:  [B, S, D] spikes entering the network.
            post_spikes: [B, S, D] spikes leaving the network.
        """
        if not self.enabled:
            return
        # Causal co-activity: did input and output channels both fire at all?
        causal = (pre_spikes.sum(dim=1, keepdim=True) > 0) & (post_spikes.sum(dim=1, keepdim=True) > 0)
        if causal.any():
            # Batch/time-averaged firing rate per channel.
            pre_rate = pre_spikes.mean(dim=(0, 1))    # [D]
            post_rate = post_spikes.mean(dim=(0, 1))  # [D]
            # LTP: Hebbian outer product post ⊗ pre, shaped [D, D] to match
            # tcam_proj.weight ([out_features, in_features]). The original
            # computed a dot product (a single scalar) and added it uniformly
            # to every weight, which strengthens nothing selectively.
            delta = self.lr * torch.outer(post_rate, pre_rate)
            for block in model.blocks:
                w = block.tcam.tcam_proj.weight
                w.add_(delta * 0.001)  # minute local update
                w.clamp_(-5, 5)
# ============================================================================
# Complete Model
# ============================================================================
class SymbolicLightModel(nn.Module):
    """SymbolicLight: next-generation neuro-symbolic spiking large model.

    Usage:
        config = SymbolicLightConfig()
        model = SymbolicLightModel(config)
        # Training
        logits = model(token_ids)
        loss = F.cross_entropy(logits.view(-1, config.vocab_size), targets.view(-1))
        # Inference (autoregressive generation)
        output_ids = model.generate(prompt_ids, max_new_tokens=100)
    """

    def __init__(self, config: "SymbolicLightConfig"):
        super().__init__()
        self.config = config
        self.spike_encoder = SpikeEncoder(config)
        self.blocks = nn.ModuleList([
            SymbolicLightBlock(config) for _ in range(config.n_layers)
        ])
        self.output_head = BayesianHead(config)
        self.stdp = STDPUpdater(config)
        # Weight initialization.
        self.apply(self._init_weights)
        # Report the parameter count once at construction time.
        n_params = sum(p.numel() for p in self.parameters())
        print(f"[SymbolicLight] Model initialization complete | Parameters: {n_params/1e6:.1f}M ({n_params/1e9:.3f}B)")

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding weights; zero biases."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        """
        Forward propagation (shared by training and inference).
        Args:
            token_ids: [batch, seq_len]
        Returns:
            logits: [batch, seq_len, vocab_size]
        """
        # (2) Spike encoding.
        spikes, continuous = self.spike_encoder(token_ids)
        initial_spikes = spikes  # retained as the "pre" side of STDP
        # (3) Layer-by-layer processing.
        for block in self.blocks:
            spikes, continuous, should_exit = block(spikes, continuous)
            # EntropyGate early exit: only when explicitly enabled and not
            # in training mode. The gate is untrained (no auxiliary loss yet),
            # so it stays off by default to avoid spurious exits at shallow
            # layers; keep disabled until an early-exit head is trained.
            if should_exit and not self.training and self.config.enable_entropy_exit:
                break
        # (4) Bayesian output head.
        logits = self.output_head(continuous)
        # (5) STDP online learning (opt-in, inference only).
        if not self.training and self.config.enable_stdp:
            self.stdp.update(self, initial_spikes, spikes)
        return logits

    @torch.no_grad()
    def generate(self, prompt_ids: torch.Tensor, max_new_tokens: int = 100,
                 temperature: float = 0.8, top_k: int = 50) -> torch.Tensor:
        """
        Autoregressive text generation.
        Args:
            prompt_ids: [1, prompt_len] prompt token IDs.
            max_new_tokens: maximum number of new tokens to generate.
            temperature: sampling temperature (higher = more random).
            top_k: sample only from the k highest-probability tokens
                   (0 disables the filter).
        Returns:
            The full generated token sequence (prompt + new tokens).
        """
        self.eval()
        generated = prompt_ids.clone()
        for _ in range(max_new_tokens):
            # Truncate the context window to the model's maximum length.
            input_ids = generated[:, -self.config.max_seq_len:]
            # Forward inference.
            logits = self.forward(input_ids)
            # Only the last position matters for the next token.
            next_logits = logits[:, -1, :] / temperature
            # Top-k filtering.
            if top_k > 0:
                top_k_vals, _ = torch.topk(next_logits, top_k)
                min_top_k = top_k_vals[:, -1].unsqueeze(-1)
                next_logits[next_logits < min_top_k] = float('-inf')
            # Sample the next token.
            probs = F.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            # Append to the running sequence.
            generated = torch.cat([generated, next_token], dim=1)
            # EOS detection (token id 2 is assumed to be EOS). `.item()` is
            # only valid for a single sequence, so guard on batch size; the
            # original crashed here for batch > 1 inputs.
            if generated.size(0) == 1 and next_token.item() == 2:
                break
        return generated

    def get_sparsity_stats(self) -> dict:
        """Return per-stage spike sparsity on a random probe (papers/debug).

        Runs the probe in eval mode (dropout off) and on the model's own
        device; the previous training/eval mode is restored on exit.
        """
        stats = {}
        was_training = self.training
        self.eval()
        # Create the probe on the same device as the parameters — the
        # original always allocated it on CPU, which fails for GPU-resident
        # models.
        device = next(self.parameters()).device
        with torch.no_grad():
            dummy = torch.randint(0, 100, (1, 32), device=device)
            spikes, _ = self.spike_encoder(dummy)
            stats['encoder_sparsity'] = 1.0 - spikes.mean().item()
            for i, block in enumerate(self.blocks):
                # NOTE(review): the spike tensor is fed in as both the spike
                # and continuous inputs here — presumably a deliberate probe
                # simplification; confirm before relying on these numbers.
                spikes, _, _ = block(spikes, spikes)
                stats[f'block_{i}_sparsity'] = 1.0 - spikes.mean().item()
        self.train(was_training)
        return stats
# ============================================================================
# Quick Validation
# ============================================================================
def _run_smoke_test() -> None:
    """Build a full-size model and exercise forward, stats, and generation."""
    print("=" * 60)
    print(" SymbolicLight Model Architecture Validation")
    print("=" * 60)
    config = SymbolicLightConfig(
        vocab_size=32000,
        embed_dim=768,
        n_layers=12,
        n_heads=12,
        head_dim=64,
        intermediate_dim=2048,
    )
    model = SymbolicLightModel(config)
    # Dummy input: batch=2, seq=128.
    dummy_input = torch.randint(0, 32000, (2, 128))
    print(f"\nInput: batch=2, seq_len=128")
    # Forward propagation; expect logits of shape [2, 128, 32000].
    logits = model(dummy_input)
    print(f"Output logits: {logits.shape}")
    # Sparsity analysis.
    stats = model.get_sparsity_stats()
    print(f"\nSparsity analysis:")
    for k, v in stats.items():
        print(f" {k}: {v*100:.1f}% Silent")
    # Generation test.
    prompt = torch.randint(0, 32000, (1, 10))
    print(f"\nAutoregressive generation test (prompt length=10, generating 20 tokens)...")
    output = model.generate(prompt, max_new_tokens=20)
    print(f"Generated sequence length: {output.shape[1]}")
    print("\n[PASS] SymbolicLight model architecture verified!")


if __name__ == "__main__":
    _run_smoke_test()