ducheng678

Initial WaveLSFromer project

093b0a5 about 2 months ago

7.09 kB

	import math
	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	# --------------------------
	# Utilities / Norm / Activations
	# --------------------------

	class RMSNorm(nn.Module):
	    """Root-mean-square normalisation over the last axis with a learnable gain.

	    Each vector along the last dimension is multiplied by
	    ``1 / sqrt(mean(x ** 2) + eps)`` and then by a per-feature ``weight``
	    (initialised to ones).  Unlike LayerNorm there is no mean subtraction
	    and no bias.

	    Args:
	        d_model: size of the last (feature) dimension.
	        eps: constant added to the mean square before the reciprocal
	            square root, to keep all-zero vectors finite.
	    """

	    def __init__(self, d_model, eps=1e-8):
	        super().__init__()
	        self.eps = eps
	        self.weight = nn.Parameter(torch.ones(d_model))

	    def forward(self, x):
	        """Normalise ``x`` of shape ``(..., d_model)``; the shape is preserved."""
	        # In float16 the squares overflow for |x| above ~255 and eps=1e-8
	        # rounds to zero, so half-precision inputs get their statistic
	        # computed in float32.  float32 / float64 inputs take the exact
	        # same path as before.
	        if x.dtype in (torch.float16, torch.bfloat16):
	            stat_input = x.float()
	        else:
	            stat_input = x
	        inv_rms = (
	            stat_input.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
	        )
	        # Cast back so the output dtype follows the usual promotion of
	        # weight and x, independent of the float32 detour above.
	        return self.weight * x * inv_rms.to(x.dtype)


	def get_activation(name: str):
	    """Build an activation module from a case-insensitive name.

	    Recognised names:
	        "relu"            -> nn.ReLU
	        "gelu", "geglu"   -> nn.GELU
	        "silu", "swish"   -> nn.SiLU

	    ``None`` or an empty string selects ReLU.  Any other name also falls
	    back to ReLU (the historical behaviour), but a warning is now emitted
	    so that a misspelt or unsupported name does not go unnoticed.

	    Returns:
	        A newly constructed ``nn.Module``.
	    """
	    # Local import keeps this function self-contained; the module header
	    # does not import ``warnings``.
	    import warnings

	    key = (name or "relu").lower()
	    factories = {
	        "relu": nn.ReLU,
	        "gelu": nn.GELU,
	        "geglu": nn.GELU,
	        "silu": nn.SiLU,
	        "swish": nn.SiLU,
	    }
	    factory = factories.get(key)
	    if factory is None:
	        warnings.warn(
	            f"Unknown activation {name!r}; falling back to ReLU.",
	            stacklevel=2,
	        )
	        factory = nn.ReLU
	    return factory()


	class SwiGLU(nn.Module):
	    """Gated feed-forward block: ``proj(silu(gate) * value)``.

	    One linear layer maps ``d_model`` to ``2 * d_ff`` features; the result
	    is cut in half along the last axis into a gate half and a value half.
	    The SiLU-activated gate multiplies the value, and a second linear
	    layer maps the ``d_ff`` product back to ``d_model``.

	    Args:
	        d_model: input and output feature size.
	        d_ff: width of each of the two hidden halves.
	    """

	    def __init__(self, d_model, d_ff):
	        super().__init__()
	        # Fused projection producing both halves in one matmul.
	        self.w12 = nn.Linear(d_model, 2 * d_ff, bias=True)
	        self.proj = nn.Linear(d_ff, d_model, bias=True)

	    def forward(self, x):
	        """Apply the gated FFN along the last axis of ``x``."""
	        fused = self.w12(x)
	        gate, value = torch.chunk(fused, 2, dim=-1)
	        gated = F.silu(gate) * value
	        return self.proj(gated)


	# --------------------------
	# Conv Layer (kept signature)
	# --------------------------

	class ConvLayer(nn.Module):
	    """Length-reducing block: circular Conv1d -> BatchNorm1d -> ELU -> MaxPool1d.

	    The convolution keeps the channel count (``c_in`` in, ``c_in`` out,
	    kernel size 3).  The max-pool uses kernel 3, stride 2, padding 1, so
	    with a convolution padding of 1 a sequence of length ``L`` comes out
	    with length ``ceil(L / 2)``.

	    Args:
	        c_in: number of channels, i.e. the feature size ``D`` of the
	            ``[B, L, D]`` input.
	    """

	    def __init__(self, c_in):
	        super().__init__()
	        self.downConv = nn.Conv1d(
	            in_channels=c_in,
	            out_channels=c_in,
	            kernel_size=3,
	            padding=self._circular_padding(),
	            padding_mode="circular",
	        )
	        self.norm = nn.BatchNorm1d(c_in)
	        self.activation = nn.ELU()
	        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

	    @staticmethod
	    def _circular_padding() -> int:
	        """Return 1 for torch >= 1.5, else 2, comparing versions numerically.

	        The previous ``torch.__version__ >= "1.5.0"`` check is lexicographic
	        whenever the version is a plain string (so "1.10.0" sorts below
	        "1.5.0").  Parsing major/minor as integers removes that dependency.
	        NOTE(review): the reason pre-1.5 releases need padding=2 is not
	        visible in this file; the threshold is preserved as found.
	        """
	        release = torch.__version__.split("+")[0]
	        try:
	            major, minor = (int(part) for part in release.split(".")[:2])
	        except ValueError:
	            # Unparseable version string: assume a current release.
	            return 1
	        return 1 if (major, minor) >= (1, 5) else 2

	    def forward(self, x):
	        """Map ``[B, L, D]`` to ``[B, L', D]`` with a shorter length ``L'``."""
	        # Conv1d / BatchNorm1d / MaxPool1d expect channels before length.
	        y = x.permute(0, 2, 1)  # B, D, L
	        y = self.downConv(y)
	        y = self.norm(y)
	        y = self.activation(y)
	        y = self.maxPool(y)
	        # Back to channels-last, as a contiguous tensor.
	        return y.transpose(1, 2).contiguous()  # B, L', D


	# --------------------------
	# Encoder Layer (kept signature)
	# --------------------------

	class EncoderLayer(nn.Module):
	    """Self-attention + SwiGLU feed-forward block with scaled residuals.

	    Signature (unchanged):
	        __init__(self, attention, d_model, d_ff=None, dropout=0.1,
	                 activation="relu", ln_mode="pre", conv_layer=False)
	        forward(self, x, attn_mask=None) -> (x, attn)

	    Behaviour:
	    - ``ln_mode == "post"``: each branch output is added to the residual
	      and then normalised with ``nn.LayerNorm`` (``post_ln1`` / ``post_ln2``).
	    - any other ``ln_mode`` (default "pre"): the branch input is normalised
	      with ``RMSNorm`` (``norm1`` / ``norm2``) and the residual is left
	      un-normalised.
	    - Every branch goes through dropout and is scaled by ``1 / sqrt(2)``
	      before being added to the residual.
	    - The feed-forward branch is always ``SwiGLU``; ``self.activation`` is
	      built from ``activation`` but is not used in ``forward``.
	    - ``attention`` is called as ``attention(q, k, v, attn_mask=...)`` and
	      must return ``(new_x, attn)``.

	    Args:
	        attention: self-attention module with the call signature above.
	        d_model: feature size of the input.
	        d_ff: hidden width of the feed-forward block; defaults to
	            ``4 * d_model`` when falsy.
	        dropout: dropout probability applied to every branch output.
	        activation: name passed to ``get_activation``.
	        ln_mode: "post" selects the post-norm path; anything else is pre-norm.
	        conv_layer: when true, a ``ConvLayer(d_model)`` branch is added
	            ahead of attention.
	    """

	    def __init__(
	        self,
	        attention,
	        d_model,
	        d_ff=None,
	        dropout=0.1,
	        activation="relu",
	        ln_mode="pre",
	        conv_layer=False,
	    ):
	        super().__init__()
	        self.attention = attention
	        self.conv_layer = ConvLayer(d_model) if conv_layer else None
	        self.dropout = nn.Dropout(dropout)
	        # Kept as an attribute for API compatibility; forward() does not use it.
	        self.activation = get_activation(activation)
	        self.ln_mode = ln_mode

	        # Core hyperparameters
	        self.d_model = d_model
	        self.d_ff = d_ff or 4 * d_model
	        self.res_scale = 1.0 / math.sqrt(2.0)

	        # Pre-norm path normalisers.  They are created in "post" mode too
	        # (where forward() does not call them) so attribute names and
	        # state_dict keys stay the same across modes.
	        self.norm1 = RMSNorm(d_model)
	        self.norm2 = RMSNorm(d_model)

	        self.ff = SwiGLU(d_model, self.d_ff)

	        # Post-norm path normalisers: these LayerNorms are what the "post"
	        # branch of forward() actually applies.
	        if self.ln_mode == "post":
	            self.post_ln1 = nn.LayerNorm(d_model)
	            self.post_ln2 = nn.LayerNorm(d_model)

	    def forward(self, x, attn_mask=None):
	        """Run the block on ``x`` of shape ``[B, L, D]``.

	        Returns:
	            ``(x, attn)`` where ``attn`` is whatever the attention module
	            returned as its second value.
	        """
	        if self.conv_layer is not None:
	            # NOTE(review): ConvLayer max-pools with stride 2, so its output
	            # is shorter than x for L > 2 and this addition raises a shape
	            # error (L == 2 silently broadcasts).  The intended residual
	            # form is not clear from this file; behaviour is left as found.
	            x = x + self.dropout(self.conv_layer(x)) * self.res_scale

	        if self.ln_mode == "post":
	            # -------- Post-norm path --------
	            new_x, attn = self.attention(x, x, x, attn_mask=attn_mask)
	            x = self.post_ln1(x + self.dropout(new_x) * self.res_scale)

	            y = self.ff(x)
	            x = self.post_ln2(x + self.dropout(y) * self.res_scale)
	            return x, attn

	        # -------- Default: pre-norm path --------
	        # Normalise once and reuse for q, k and v; previously the same
	        # norm was recomputed three times per call.
	        normed = self.norm1(x)
	        h, attn = self.attention(normed, normed, normed, attn_mask=attn_mask)
	        x = x + self.dropout(h) * self.res_scale

	        y = self.ff(self.norm2(x))
	        x = x + self.dropout(y) * self.res_scale

	        return x, attn


	# --------------------------
	# Encoder (kept signature)
	# --------------------------

	class Encoder(nn.Module):
	    """Sequential stack of attention layers with optional interleaved layers.

	    Signature (unchanged):
	        __init__(self, attn_layers, conv_layers=None, norm_layer=None)
	        forward(self, x, attn_mask=None) -> (x, attns)

	    After attention layer ``i`` the layer ``conv_layers[i]`` is applied when
	    it exists; attention layers beyond the end of ``conv_layers`` run
	    without one.  ``norm_layer``, if given, is applied once at the end.
	    """

	    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
	        super().__init__()
	        self.attn_layers = nn.ModuleList(attn_layers)
	        if conv_layers is None:
	            self.conv_layers = None
	        else:
	            self.conv_layers = nn.ModuleList(conv_layers)
	        self.norm = norm_layer  # may be None or an nn.Module

	    def forward(self, x, attn_mask=None):
	        """Run every layer on ``x`` and collect each layer's attention output.

	        The same ``attn_mask`` is forwarded to every attention layer.

	        Returns:
	            ``(x, attns)`` with one ``attns`` entry per attention layer, in
	            order.
	        """
	        distillers = () if self.conv_layers is None else self.conv_layers
	        n_distillers = len(distillers)

	        attns = []
	        hidden = x
	        for depth, layer in enumerate(self.attn_layers):
	            hidden, weights = layer(hidden, attn_mask=attn_mask)
	            attns.append(weights)
	            if depth < n_distillers:
	                hidden = distillers[depth](hidden)

	        if self.norm is None:
	            return hidden, attns
	        return self.norm(hidden), attns


	# --------------------------
	# Encoder Stack (kept signature)
	# --------------------------

	class EncoderStack(nn.Module):
	    """Run several encoders on progressively shorter tails of one sequence.

	    Signature (unchanged):
	        __init__(self, encoders, inp_lens, d_model)
	        forward(self, x, attn_mask=None) -> (x_stack, attns)

	    Encoder ``k`` is paired with ``inp_lens[k]`` and receives the last
	    ``L // 2 ** inp_lens[k]`` time steps of the input.  The encoder outputs
	    are concatenated along the sequence axis.

	    Args:
	        encoders: iterable of encoder modules, each callable as
	            ``encoder(x, attn_mask=...)`` and returning ``(x, attn)``.
	        inp_lens: one exponent per encoder, paired positionally.
	        d_model: stored on the instance; not used by ``forward``.
	    """

	    def __init__(self, encoders, inp_lens, d_model):
	        super().__init__()
	        self.encoders = nn.ModuleList(encoders)
	        self.inp_lens = inp_lens
	        self.d_model = d_model

	    def forward(self, x, attn_mask=None):
	        """Encode the tails of ``x`` (``[B, L, D]``) and concatenate the results.

	        Returns:
	            ``(x_stack, attns)``: the outputs joined on the sequence axis
	            and a list with each encoder's second return value.
	        """
	        seq_len = x.shape[1]
	        x_stack = []
	        attns = []
	        for i_len, encoder in zip(self.inp_lens, self.encoders):
	            inp_len = seq_len // (2 ** i_len)
	            # Slice from an explicit start index.  The former
	            # ``x[:, -inp_len:, :]`` selected the WHOLE sequence when
	            # inp_len was 0 (because -0 == 0); a level that is too deep
	            # now gets the empty tail it asked for.
	            start = seq_len - inp_len
	            # NOTE(review): the same attn_mask is passed to every level even
	            # though the tail lengths differ — confirm the mask is
	            # length-agnostic.
	            x_s, attn = encoder(x[:, start:, :], attn_mask=attn_mask)
	            x_stack.append(x_s)
	            attns.append(attn)
	        # Concatenate on the sequence-length axis.
	        return torch.cat(x_stack, dim=-2), attns