| """ |
| Published baseline models for DailyAct-5M benchmark. |
| |
| ASFormer: Transformer for Action Segmentation (Yi et al., BMVC 2021) |
| - Multi-stage encoder-decoder transformer with dilated attention |
| - For temporal action segmentation (Exp 2) and contact detection (Exp 3) |
| |
| TinyHAR: Lightweight Deep Learning Model for HAR (Zhou et al., ISWC 2022 Best Paper) |
| - Multi-scale temporal convolution + cross-channel attention + temporal pooling |
| - Implemented as backbone in models.py for scene recognition (Exp 1) |
| """ |
|
|
| import math |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
|
|
| |
| |
| |
|
|
class PositionalEncoding1D(nn.Module):
    """Sinusoidal positional encoding added to (B, T, d_model) inputs."""

    def __init__(self, d_model, dropout=0.1, max_len=10000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        if d_model % 2 == 1:
            # Odd d_model: the cosine half has one column fewer than div_term.
            pe[:, 1::2] = torch.cos(position * div_term[:-1])
        else:
            pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


class ConvFeedForward(nn.Module):
    """Position-wise convolutional feed-forward block used in ASFormer."""

    def __init__(self, d_model, kernel_size=3, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.conv1 = nn.Conv1d(d_model, d_model * 2, kernel_size, padding=kernel_size // 2)
        self.conv2 = nn.Conv1d(d_model * 2, d_model, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (B, T, d_model)
        residual = x
        x = self.norm(x)
        x = x.permute(0, 2, 1)  # (B, d_model, T) for Conv1d
        x = self.dropout(F.relu(self.conv1(x)))
        x = self.dropout(self.conv2(x))
        x = x.permute(0, 2, 1)
        return residual + x


class DilatedAttention(nn.Module):
    """Multi-head self-attention with a dilated temporal mask.

    At dilation d and window size w, position t attends to positions
    {t + k*d : k in [-w, w]}, creating a hierarchical receptive field
    when layers with increasing dilation are stacked.
    """

    def __init__(self, d_model, dilation, num_heads=1, dropout=0.1, window_size=5):
        super().__init__()
        self.d_model = d_model
        self.dilation = dilation
        self.window_size = window_size
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.norm = nn.LayerNorm(d_model)
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

        # Boolean masks are cached per (length, dilation, window, device).
        self._mask_cache = {}

    def _get_dilated_mask(self, T, device):
        """Create or retrieve a cached (T, T) dilated attention mask."""
        key = (T, self.dilation, self.window_size, device)
        if key not in self._mask_cache:
            positions = torch.arange(T, device=device)
            diff = positions.unsqueeze(1) - positions.unsqueeze(0)
            mask = torch.zeros(T, T, dtype=torch.bool, device=device)
            for w in range(-self.window_size, self.window_size + 1):
                mask |= (diff == w * self.dilation)
            self._mask_cache[key] = mask
        return self._mask_cache[key]
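
    # Illustrative example (values chosen here, not used by the code): with T = 6,
    # dilation = 2 and window_size = 1, the allowed offsets are {-2, 0, +2}, so
    # position 3 attends to positions {1, 3, 5}.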
|
|
    def forward(self, x, cross_kv=None):
        # x: (B, T, d_model); cross_kv, if given, must have the same length T.
        B, T, D = x.shape
        residual = x
        x = self.norm(x)

        if cross_kv is not None:
            # Cross-attention: queries from x, keys/values from the encoder features.
            q = self.qkv(x)[:, :, :D]
            kv = self.qkv(cross_kv)[:, :, D:]
            q = q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
            k = kv[:, :, :D].view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
            v = kv[:, :, D:].view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        else:
            qkv = self.qkv(x).view(B, T, 3, self.num_heads, self.head_dim)
            qkv = qkv.permute(2, 0, 3, 1, 4)
            q, k, v = qkv[0], qkv[1], qkv[2]

        scale = self.head_dim ** -0.5
        attn = (q @ k.transpose(-2, -1)) * scale

        # Disallow all positions outside the dilated window.
        dilated_mask = self._get_dilated_mask(T, x.device)
        attn = attn.masked_fill(~dilated_mask.unsqueeze(0).unsqueeze(0), float('-inf'))

        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)

        out = (attn @ v).transpose(1, 2).reshape(B, T, D)
        out = self.out_proj(out)
        return residual + self.dropout(out)


class ASFormerEncoderBlock(nn.Module):
    """Single encoder block: dilated self-attention + conv feed-forward."""

    def __init__(self, d_model, dilation, num_heads=1, kernel_size=3,
                 dropout=0.1, window_size=5):
        super().__init__()
        self.self_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size)
        self.ffn = ConvFeedForward(d_model, kernel_size, dropout)

    def forward(self, x):
        x = self.self_attn(x)
        x = self.ffn(x)
        return x


class ASFormerDecoderBlock(nn.Module):
    """Single decoder block: self-attention + cross-attention + conv feed-forward."""

    def __init__(self, d_model, dilation, num_heads=1, kernel_size=3,
                 dropout=0.1, window_size=5):
        super().__init__()
        self.self_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size)
        self.cross_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size)
        self.ffn = ConvFeedForward(d_model, kernel_size, dropout)

    def forward(self, x, enc_features):
        x = self.self_attn(x)
        x = self.cross_attn(x, cross_kv=enc_features)
        x = self.ffn(x)
        return x


class ASFormerEncoder(nn.Module):
    """ASFormer encoder: input projection + N dilated attention layers + output head."""

    def __init__(self, input_dim, d_model, num_classes, num_layers=5,
                 num_heads=1, kernel_size=3, dropout=0.1, window_size=5):
        super().__init__()
        self.input_proj = nn.Conv1d(input_dim, d_model, 1)
        self.pos_enc = PositionalEncoding1D(d_model, dropout)
        self.layers = nn.ModuleList([
            ASFormerEncoderBlock(d_model, 2 ** i, num_heads, kernel_size, dropout, window_size)
            for i in range(num_layers)
        ])
        self.output_proj = nn.Conv1d(d_model, num_classes, 1)

    def forward(self, x):
        # x: (B, T, input_dim)
        x = x.permute(0, 2, 1)
        x = self.input_proj(x)
        x = x.permute(0, 2, 1)
        x = self.pos_enc(x)

        for layer in self.layers:
            x = layer(x)

        features = x
        logits = self.output_proj(x.permute(0, 2, 1)).permute(0, 2, 1)
        return features, logits  # (B, T, d_model), (B, T, num_classes)


class ASFormerDecoder(nn.Module):
    """ASFormer decoder: refinement stage with cross-attention to encoder features."""

    def __init__(self, input_dim, d_model, num_classes, num_layers=5,
                 num_heads=1, kernel_size=3, dropout=0.1, window_size=5):
        super().__init__()
        self.input_proj = nn.Conv1d(input_dim, d_model, 1)
        self.pos_enc = PositionalEncoding1D(d_model, dropout)
        self.layers = nn.ModuleList([
            ASFormerDecoderBlock(d_model, 2 ** i, num_heads, kernel_size, dropout, window_size)
            for i in range(num_layers)
        ])
        self.output_proj = nn.Conv1d(d_model, num_classes, 1)

    def forward(self, dec_input, enc_features):
        # dec_input: (B, T, input_dim); enc_features: (B, T, d_model)
        x = dec_input.permute(0, 2, 1)
        x = self.input_proj(x)
        x = x.permute(0, 2, 1)
        x = self.pos_enc(x)

        for layer in self.layers:
            x = layer(x, enc_features)

        logits = self.output_proj(x.permute(0, 2, 1)).permute(0, 2, 1)
        return x, logits


class ASFormer(nn.Module):
    """ASFormer: Transformer for Action Segmentation (Yi et al., BMVC 2021).

    Multi-stage encoder-decoder transformer for frame-level action segmentation.
    Takes (B, T, input_dim) features and returns a list of per-stage logits, each
    of shape (B, T, num_classes), for multi-stage training (same interface as MSTCN).

    Args:
        input_dim: Input feature dimension
        num_classes: Number of action classes
        hidden_dim: Hidden dimension (d_model)
        num_layers: Number of attention layers per stage (dilations 1, 2, ..., 2^(num_layers-1))
        num_decoders: Number of decoder (refinement) stages
        num_heads: Number of attention heads
        kernel_size: Feed-forward convolution kernel size
        dropout: Dropout rate
        window_size: Dilated attention window size
    """

    def __init__(self, input_dim, num_classes, hidden_dim=64, num_layers=5,
                 num_decoders=3, num_heads=1, kernel_size=3, dropout=0.1,
                 window_size=5):
        super().__init__()
        self.encoder = ASFormerEncoder(
            input_dim, hidden_dim, num_classes, num_layers,
            num_heads, kernel_size, dropout, window_size
        )
        self.decoders = nn.ModuleList([
            ASFormerDecoder(
                num_classes, hidden_dim, num_classes, num_layers,
                num_heads, kernel_size, dropout, window_size
            ) for _ in range(num_decoders)
        ])

    def forward(self, x):
        # x: (B, T, input_dim)
        outputs = []
        enc_features, enc_logits = self.encoder(x)
        outputs.append(enc_logits)

        for decoder in self.decoders:
            # Each refinement stage consumes the previous stage's class probabilities.
            # The detach stops gradients flowing back through earlier stage logits;
            # the encoder still receives gradients via the cross-attended features.
            dec_input = F.softmax(outputs[-1], dim=-1).detach()
            _, dec_logits = decoder(dec_input, enc_features)
            outputs.append(dec_logits)

        return outputs
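

# Hedged sketch, not part of the published ASFormer/DailyAct-5M training code: one
# common way to consume the per-stage logits above is MS-TCN-style frame-wise
# cross-entropy summed over stages (the smoothing term used in those papers is
# omitted here). The helper name below is illustrative only.
def multistage_cross_entropy(outputs, targets):
    """Sum frame-wise cross-entropy over every stage's (B, T, C) logits.

    targets: (B, T) long tensor of frame labels.
    """
    return sum(
        F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        for logits in outputs
    )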
|
|
|
|
class ASFormerContact(nn.Module):
    """ASFormer adapted for binary contact detection (Exp 3).

    Wraps ASFormer with num_classes=2 and returns only the final-stage output of
    shape (B, T, 2), compatible with the exp3 training loop. Multi-stage refinement
    still runs internally, but only a single output is returned.
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=5, num_decoders=2,
                 num_heads=1, dropout=0.1):
        super().__init__()
        self.asformer = ASFormer(
            input_dim, num_classes=2, hidden_dim=hidden_dim,
            num_layers=num_layers, num_decoders=num_decoders,
            num_heads=num_heads, dropout=dropout
        )

    def forward(self, x):
        # x: (B, T, input_dim)
        outputs = self.asformer(x)
        return outputs[-1]
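

# Minimal smoke test (a sketch with assumed dimensions, not tied to the DailyAct-5M
# data loaders): checks the multi-stage and contact-detection output shapes.
if __name__ == "__main__":
    x = torch.randn(2, 128, 12)  # (B, T, input_dim) dummy feature batch

    seg_model = ASFormer(input_dim=12, num_classes=8)
    stage_logits = seg_model(x)  # list of (B, T, num_classes), one per stage
    assert all(s.shape == (2, 128, 8) for s in stage_logits)

    contact_model = ASFormerContact(input_dim=12)
    contact_logits = contact_model(x)  # final-stage (B, T, 2) logits
    assert contact_logits.shape == (2, 128, 2)

    print(f"stages: {len(stage_logits)}, contact logits: {tuple(contact_logits.shape)}")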
|
|