| """ |
| Published baseline models for DailyAct-5M benchmark. |
| |
| ASFormer: Transformer for Action Segmentation (Yi et al., BMVC 2021) |
| - Multi-stage encoder-decoder transformer with dilated attention |
| - For temporal action segmentation (Exp 2) and contact detection (Exp 3) |
| |
| TinyHAR: Lightweight Deep Learning Model for HAR (Zhou et al., ISWC 2022 Best Paper) |
| - Multi-scale temporal convolution + cross-channel attention + temporal pooling |
| - Implemented as backbone in models.py for scene recognition (Exp 1) |
| """ |
|
|
| import math |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
|
|
| |
| |
| |
|
|
class PositionalEncoding1D(nn.Module):
    """Sinusoidal positional encoding added to (B, T, d_model) inputs."""

    def __init__(self, d_model, dropout=0.1, max_len=10000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        if d_model % 2 == 1:
            # Odd d_model: the cosine half has one column fewer than div_term.
            pe[:, 1::2] = torch.cos(position * div_term[:-1])
        else:
            pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


class ConvFeedForward(nn.Module):
    """Position-wise convolutional feed-forward block used in ASFormer."""

    def __init__(self, d_model, kernel_size=3, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.conv1 = nn.Conv1d(d_model, d_model * 2, kernel_size, padding=kernel_size // 2)
        self.conv2 = nn.Conv1d(d_model * 2, d_model, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (B, T, d_model)
        residual = x
        x = self.norm(x)
        x = x.permute(0, 2, 1)  # (B, d_model, T) for Conv1d
        x = self.dropout(F.relu(self.conv1(x)))
        x = self.dropout(self.conv2(x))
        x = x.permute(0, 2, 1)
        return residual + x


class DilatedAttention(nn.Module):
    """Multi-head self-attention with a dilated temporal mask.

    At dilation d and window size w, position t attends to positions
    {t + k*d : k in [-w, w]}, creating a hierarchical receptive field
    when layers with increasing dilation are stacked.
    """

    def __init__(self, d_model, dilation, num_heads=1, dropout=0.1, window_size=5):
        super().__init__()
        self.d_model = d_model
        self.dilation = dilation
        self.window_size = window_size
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.norm = nn.LayerNorm(d_model)
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

        # Boolean masks are cached per (length, dilation, window, device).
        self._mask_cache = {}

    def _get_dilated_mask(self, T, device):
        """Create or retrieve a cached (T, T) dilated attention mask."""
        key = (T, self.dilation, self.window_size, device)
        if key not in self._mask_cache:
            positions = torch.arange(T, device=device)
            diff = positions.unsqueeze(1) - positions.unsqueeze(0)
            mask = torch.zeros(T, T, dtype=torch.bool, device=device)
            for w in range(-self.window_size, self.window_size + 1):
                mask |= (diff == w * self.dilation)
            self._mask_cache[key] = mask
        return self._mask_cache[key]
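
    # Illustrative example (values chosen here, not used by the code): with T = 6,
    # dilation = 2 and window_size = 1, the allowed offsets are {-2, 0, +2}, so
    # position 3 attends to positions {1, 3, 5}.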
|
|
    def forward(self, x, cross_kv=None):
        # x: (B, T, d_model); cross_kv, if given, must have the same length T.
        B, T, D = x.shape
        residual = x
        x = self.norm(x)

        if cross_kv is not None:
            # Cross-attention: queries from x, keys/values from the encoder features.
            q = self.qkv(x)[:, :, :D]
            kv = self.qkv(cross_kv)[:, :, D:]
            q = q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
            k = kv[:, :, :D].view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
            v = kv[:, :, D:].view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        else:
            qkv = self.qkv(x).view(B, T, 3, self.num_heads, self.head_dim)
            qkv = qkv.permute(2, 0, 3, 1, 4)
            q, k, v = qkv[0], qkv[1], qkv[2]

        scale = self.head_dim ** -0.5
        attn = (q @ k.transpose(-2, -1)) * scale

        # Disallow all positions outside the dilated window.
        dilated_mask = self._get_dilated_mask(T, x.device)
        attn = attn.masked_fill(~dilated_mask.unsqueeze(0).unsqueeze(0), float('-inf'))

        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)

        out = (attn @ v).transpose(1, 2).reshape(B, T, D)
        out = self.out_proj(out)
        return residual + self.dropout(out)


class ASFormerEncoderBlock(nn.Module):
    """Single encoder block: dilated self-attention + conv feed-forward."""

    def __init__(self, d_model, dilation, num_heads=1, kernel_size=3,
                 dropout=0.1, window_size=5):
        super().__init__()
        self.self_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size)
        self.ffn = ConvFeedForward(d_model, kernel_size, dropout)

    def forward(self, x):
        x = self.self_attn(x)
        x = self.ffn(x)
        return x


class ASFormerDecoderBlock(nn.Module):
    """Single decoder block: self-attention + cross-attention + conv feed-forward."""

    def __init__(self, d_model, dilation, num_heads=1, kernel_size=3,
                 dropout=0.1, window_size=5):
        super().__init__()
        self.self_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size)
        self.cross_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size)
        self.ffn = ConvFeedForward(d_model, kernel_size, dropout)

    def forward(self, x, enc_features):
        x = self.self_attn(x)
        x = self.cross_attn(x, cross_kv=enc_features)
        x = self.ffn(x)
        return x


class ASFormerEncoder(nn.Module):
    """ASFormer encoder: input projection + N dilated attention layers + output head."""

    def __init__(self, input_dim, d_model, num_classes, num_layers=5,
                 num_heads=1, kernel_size=3, dropout=0.1, window_size=5):
        super().__init__()
        self.input_proj = nn.Conv1d(input_dim, d_model, 1)
        self.pos_enc = PositionalEncoding1D(d_model, dropout)
        self.layers = nn.ModuleList([
            ASFormerEncoderBlock(d_model, 2 ** i, num_heads, kernel_size, dropout, window_size)
            for i in range(num_layers)
        ])
        self.output_proj = nn.Conv1d(d_model, num_classes, 1)

    def forward(self, x):
        # x: (B, T, input_dim)
        x = x.permute(0, 2, 1)
        x = self.input_proj(x)
        x = x.permute(0, 2, 1)
        x = self.pos_enc(x)

        for layer in self.layers:
            x = layer(x)

        features = x
        logits = self.output_proj(x.permute(0, 2, 1)).permute(0, 2, 1)
        return features, logits  # (B, T, d_model), (B, T, num_classes)


class ASFormerDecoder(nn.Module):
    """ASFormer decoder: refinement stage with cross-attention to encoder features."""

    def __init__(self, input_dim, d_model, num_classes, num_layers=5,
                 num_heads=1, kernel_size=3, dropout=0.1, window_size=5):
        super().__init__()
        self.input_proj = nn.Conv1d(input_dim, d_model, 1)
        self.pos_enc = PositionalEncoding1D(d_model, dropout)
        self.layers = nn.ModuleList([
            ASFormerDecoderBlock(d_model, 2 ** i, num_heads, kernel_size, dropout, window_size)
            for i in range(num_layers)
        ])
        self.output_proj = nn.Conv1d(d_model, num_classes, 1)

    def forward(self, dec_input, enc_features):
        # dec_input: (B, T, input_dim); enc_features: (B, T, d_model)
        x = dec_input.permute(0, 2, 1)
        x = self.input_proj(x)
        x = x.permute(0, 2, 1)
        x = self.pos_enc(x)

        for layer in self.layers:
            x = layer(x, enc_features)

        logits = self.output_proj(x.permute(0, 2, 1)).permute(0, 2, 1)
        return x, logits


class ASFormer(nn.Module):
    """ASFormer: Transformer for Action Segmentation (Yi et al., BMVC 2021).

    Multi-stage encoder-decoder transformer for frame-level action segmentation.
    Takes (B, T, input_dim) features and returns a list of per-stage logits, each
    of shape (B, T, num_classes), for multi-stage training (same interface as MSTCN).

    Args:
        input_dim: Input feature dimension
        num_classes: Number of action classes
        hidden_dim: Hidden dimension (d_model)
        num_layers: Number of attention layers per stage (dilations 1, 2, ..., 2^(num_layers-1))
        num_decoders: Number of decoder (refinement) stages
        num_heads: Number of attention heads
        kernel_size: Feed-forward convolution kernel size
        dropout: Dropout rate
        window_size: Dilated attention window size
    """

    def __init__(self, input_dim, num_classes, hidden_dim=64, num_layers=5,
                 num_decoders=3, num_heads=1, kernel_size=3, dropout=0.1,
                 window_size=5):
        super().__init__()
        self.encoder = ASFormerEncoder(
            input_dim, hidden_dim, num_classes, num_layers,
            num_heads, kernel_size, dropout, window_size
        )
        self.decoders = nn.ModuleList([
            ASFormerDecoder(
                num_classes, hidden_dim, num_classes, num_layers,
                num_heads, kernel_size, dropout, window_size
            ) for _ in range(num_decoders)
        ])

    def forward(self, x):
        # x: (B, T, input_dim)
        outputs = []
        enc_features, enc_logits = self.encoder(x)
        outputs.append(enc_logits)

        for decoder in self.decoders:
            # Each refinement stage consumes the previous stage's class probabilities.
            # The detach stops gradients flowing back through earlier stage logits;
            # the encoder still receives gradients via the cross-attended features.
            dec_input = F.softmax(outputs[-1], dim=-1).detach()
            _, dec_logits = decoder(dec_input, enc_features)
            outputs.append(dec_logits)

        return outputs
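

# Hedged sketch, not part of the published ASFormer/DailyAct-5M training code: one
# common way to consume the per-stage logits above is MS-TCN-style frame-wise
# cross-entropy summed over stages (the smoothing term used in those papers is
# omitted here). The helper name below is illustrative only.
def multistage_cross_entropy(outputs, targets):
    """Sum frame-wise cross-entropy over every stage's (B, T, C) logits.

    targets: (B, T) long tensor of frame labels.
    """
    return sum(
        F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        for logits in outputs
    )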
|
|
|
|
class ASFormerContact(nn.Module):
    """ASFormer adapted for binary contact detection (Exp 3).

    Wraps ASFormer with num_classes=2 and returns only the final-stage output of
    shape (B, T, 2), compatible with the exp3 training loop. Multi-stage refinement
    still runs internally, but only a single output is returned.
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=5, num_decoders=2,
                 num_heads=1, dropout=0.1):
        super().__init__()
        self.asformer = ASFormer(
            input_dim, num_classes=2, hidden_dim=hidden_dim,
            num_layers=num_layers, num_decoders=num_decoders,
            num_heads=num_heads, dropout=dropout
        )

    def forward(self, x):
        # x: (B, T, input_dim)
        outputs = self.asformer(x)
        return outputs[-1]
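

# Minimal smoke test (a sketch with assumed dimensions, not tied to the DailyAct-5M
# data loaders): checks the multi-stage and contact-detection output shapes.
if __name__ == "__main__":
    x = torch.randn(2, 128, 12)  # (B, T, input_dim) dummy feature batch

    seg_model = ASFormer(input_dim=12, num_classes=8)
    stage_logits = seg_model(x)  # list of (B, T, num_classes), one per stage
    assert all(s.shape == (2, 128, 8) for s in stage_logits)

    contact_model = ASFormerContact(input_dim=12)
    contact_logits = contact_model(x)  # final-stage (B, T, 2) logits
    assert contact_logits.shape == (2, 128, 2)

    print(f"stages: {len(stage_logits)}, contact logits: {tuple(contact_logits.shape)}")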
|
|