Upload LPatchTST checkpoint and source

64fca1b verified 10 days ago

11.4 kB

	from __future__ import annotations
	import contextlib
	import torch
	import torch.nn as nn
	from enum import Enum

	class InputMode(str, Enum):
	TOKENS_ONLY = "tokens_only" # discrete tokens from KronosTokenizer only
	FEATURES_ONLY = "features_only" # 21 engineered features only (original mode)
	COMBINED = "combined" # tokens + features concatenated then projected


	class InputStem(nn.Module):
	"""
	Converts raw inputs into a unified (B, L, d_model) tensor
	regardless of input_mode. The rest of the model never needs
	to know which mode is active.
	"""
	def __init__(self, input_mode: InputMode, d_model: int,
	n_tokens: int, # vocab size for token embedding (hierarchical sum)
	n_features: int, # number of engineered features (e.g. 21)
	s1_bits: int = 6,
	s2_bits: int = 6):
	super().__init__()
	self.mode = InputMode(input_mode)
	self.d_model = d_model
	self.s1_bits = s1_bits
	self.s2_bits = s2_bits

	if self.mode in (InputMode.TOKENS_ONLY, InputMode.COMBINED):
	self.embed_coarse = nn.Embedding(2 ** s1_bits, d_model)
	self.embed_fine = nn.Embedding(2 ** s2_bits, d_model)
	self.tok_dropout = nn.Dropout(0.05) # Reduce from 0.50 → 0.05: less aggressive token regularization

	if self.mode in (InputMode.FEATURES_ONLY, InputMode.COMBINED):
	self.feature_proj = nn.Linear(n_features, d_model)

	if self.mode == InputMode.COMBINED:
	# After summing token_emb + feature_proj, project back to d_model
	# Use a gating mechanism so model learns how much to trust each source
	self.gate = nn.Sequential(
	nn.Linear(d_model * 2, d_model * 2),
	nn.SiLU(),
	nn.Linear(d_model * 2, d_model)
	)

	def forward(self, tokens=None, features=None):
	"""
	tokens : tuple (idx_coarse, idx_fine) each (B, L) — from tokenizer.encode(half=True)
	OR None if mode is features_only
	features : (B, L, n_features) float tensor
	OR None if mode is tokens_only
	Returns : (B, L, d_model)
	"""
	if self.mode == InputMode.TOKENS_ONLY:
	assert tokens is not None, "tokens required for tokens_only mode"
	idx_c, idx_f = tokens
	emb = self.embed_coarse(idx_c) + self.embed_fine(idx_f) # (B, L, d_model)
	return self.tok_dropout(emb) # Apply dropout here

	elif self.mode == InputMode.FEATURES_ONLY:
	assert features is not None, "features required for features_only mode"
	return self.feature_proj(features) # (B, L, d_model)

	elif self.mode == InputMode.COMBINED:
	assert tokens is not None and features is not None
	idx_c, idx_f = tokens
	tok_emb = self.embed_coarse(idx_c) + self.embed_fine(idx_f) # (B, L, d_model)
	feat_emb = self.feature_proj(features) # (B, L, d_model)
	# Gated fusion — model learns weighting between discrete and continuous
	fused = self.gate(torch.cat([tok_emb, feat_emb], dim=-1)) # (B, L, d_model)
	return fused


	class PatchTST(nn.Module):
	"""
	Channel-independent PatchTST with support for multi-modal input stems.

	Architecture:
	1. InputStem: (tokens, features) -> (B, L, d_model)
	2. Patching: Unfold -> (B, num_patches, patch_len * d_model) -> Linear(d_model)
	3. Positional Embedding
	4. LSTM (Optional): Temporal smoothing/context
	5. Transformer Encoder: Patch-to-Patch attention
	6. Aggregation Head: Global pooling -> Projection
	"""

	def __init__(
	self,
	seq_len: int = 400,
	num_features: int = 21,
	patch_len: int = 16,
	stride: int = 8,
	d_model: int = 128,
	n_heads: int = 4,
	n_layers: int = 2,
	lstm_layers: int = 0,
	dropout: float = 0.2,
	aggregation: str = "mixing",
	input_mode: str = "features_only",
	vocab_size: int = 4096,
	s1_bits: int = 6,
	s2_bits: int = 6,
	**legacy_kwargs,
	):
	super().__init__()

	self.seq_len = int(seq_len)
	self.num_features = int(num_features)
	self.patch_len = int(patch_len)
	self.stride = int(stride)
	self.d_model = int(d_model)
	self.n_heads = int(n_heads)
	self.n_layers = int(n_layers)
	self.lstm_layers = int(lstm_layers)
	self.dropout_rate = float(dropout)
	self.aggregation = aggregation.lower().strip()
	self.input_mode = InputMode(input_mode)

	# ── 1. Input Stem ────────────────────────────────────────────────────
	self.input_stem = InputStem(
	input_mode=self.input_mode,
	d_model=self.d_model,
	n_tokens=vocab_size,
	n_features=self.num_features,
	s1_bits=s1_bits,
	s2_bits=s2_bits
	)

	# ── 2. Patching ──────────────────────────────────────────────────────
	self.patch_embed = nn.Linear(self.patch_len * self.d_model, self.d_model)
	self.num_patches = (self.seq_len - self.patch_len) // self.stride + 1
	self.register_buffer(
	"pos_embedding_base",
	torch.randn(1, self.num_patches, self.d_model) * 0.02
	)
	self.dropout = nn.Dropout(dropout)

	# ── 4. LSTM (Temporal Context) ───────────────────────────────────────
	if self.lstm_layers > 0:
	self.lstm = nn.LSTM(
	input_size=self.d_model,
	hidden_size=self.d_model,
	num_layers=self.lstm_layers,
	batch_first=True,
	dropout=dropout if self.lstm_layers > 1 else 0,
	)
	else:
	self.lstm = None

	# ── 5. Transformer Encoder ───────────────────────────────────────────
	encoder_layer = nn.TransformerEncoderLayer(
	d_model=self.d_model,
	nhead=n_heads,
	dim_feedforward=self.d_model * 4,
	dropout=dropout,
	batch_first=True,
	norm_first=True,
	)
	self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
	self.enc_dropout = nn.Dropout(dropout)

	# ── 6. Head ──────────────────────────────────────────────────────────
	if self.aggregation == "mean":
	self.head = nn.Linear(self.num_patches * self.d_model, 1)
	else: # "mixing"
	self.feature_head = nn.Sequential(
	nn.Linear(self.d_model, self.d_model // 2), # 128 → 64
	nn.GELU(),
	nn.Dropout(0.1),
	nn.Linear(self.d_model // 2, 1), # 64 → 1
	)

	self.apply(self._init_weights)

	for proj in filter(None, [getattr(self, "head", None), getattr(self, "feature_head", None)]):
	if isinstance(proj, nn.Linear):
	nn.init.trunc_normal_(proj.weight, std=0.02)
	nn.init.zeros_(proj.bias)

	def _init_weights(self, m: nn.Module) -> None:
	if isinstance(m, nn.Linear):
	nn.init.trunc_normal_(m.weight, std=0.02)
	if m.bias is not None:
	nn.init.zeros_(m.bias)
	elif isinstance(m, nn.Embedding):
	# Standard init (0.02): balances expressivity vs OOV noise
	nn.init.normal_(m.weight, std=0.02)

	elif isinstance(m, nn.LSTM):
	for name, param in m.named_parameters():
	if 'weight_ih' in name:
	nn.init.xavier_uniform_(param.data)
	elif 'weight_hh' in name:
	nn.init.orthogonal_(param.data)
	elif 'bias' in name:
	nn.init.constant_(param.data, 0)

	def forward(self, tokens=None, features=None) -> torch.Tensor:
	"""
	Args:
	tokens : tuple (s1, s2) each (B, L) or None
	features : (B, L, F) float or None
	"""
	# Step 1: Unified embedding via stem
	x = self.input_stem(tokens=tokens, features=features) # (B, L, d_model)

	# Step 2: Patching
	# unfold: (B, L, d_model) -> (B, num_patches, patch_len, d_model)
	x = x.unfold(1, self.patch_len, self.stride)
	# flatten: (B, num_patches, patch_len * d_model)
	x = x.reshape(x.shape[0], x.shape[1], -1)
	# project: (B, num_patches, d_model)
	x = self.patch_embed(x)

	# Step 3: Positional Embedding — interpolate if seq len changed
	num_patches_actual = x.shape[1]
	if num_patches_actual == self.num_patches:
	pos = self.pos_embedding_base
	else:
	# Linear interpolation to handle variable-length sequences at val/test
	pos = torch.nn.functional.interpolate(
	self.pos_embedding_base.transpose(1, 2), # (1, d_model, num_patches)
	size=num_patches_actual,
	# Linear interpolation for 1D sequence data
	mode='linear',
	align_corners=False
	).transpose(1, 2) # (1, num_patches_actual, d_model)
	x = x + pos
	x = self.dropout(x)

	# Step 4: LSTM (if present)
	if self.lstm is not None:
	x, _ = self.lstm(x)

	# Step 5: Transformer Encoder
	x = self.encoder(x)
	x = self.enc_dropout(x)

	# Step 6: Aggregation
	if self.aggregation == "mean":
	x_flat = x.reshape(x.shape[0], -1)
	x = self.head(x_flat)
	return x / (1.0 + x.abs()) # softsign: same range as tanh, gradient never vanishes
	else:
	# Global average pooling over patches
	pooled = torch.mean(x, dim=1)
	x = self.feature_head(pooled)
	return x / (1.0 + x.abs()) # softsign: same range as tanh, gradient never vanishes


	class LPatchTST(PatchTST):
	"""
	Refined LPatchTST that uses InputStem and follows the
	Patch -> LSTM -> Transformer -> Head pipeline.
	"""
	def __init__(self,
	input_mode: str = "combined",
	n_features: int = 21,
	vocab_size: int = 4096,
	s1_bits: int = 6,
	s2_bits: int = 6,
	d_model: int = 128,
	patch_len: int = 8,
	stride: int = 4,
	**kwargs):
	super().__init__(
	input_mode=input_mode,
	num_features=n_features,
	vocab_size=vocab_size,
	s1_bits=s1_bits,
	s2_bits=s2_bits,
	d_model=d_model,
	patch_len=patch_len,
	stride=stride,
	**kwargs
	)