PULSE-code / experiments /nets /published_models.py

Upload folder using huggingface_hub

b4b2877 verified 3 days ago

25.9 kB

	"""
	Published baseline models for NeurIPS 2026 benchmark experiments.

	Contains faithful implementations of 6 published models:
	1. DeepConvLSTM (Ordonez & Roggen, Sensors 2016) - Exp1/Exp3
	2. InceptionTime (Fawaz et al., DMKD 2020) - Exp1/Exp3
	3. MS-TCN++ (Li et al., TPAMI 2020) - Exp2
	4. DiffAct (Liu et al., ICCV 2023) - Exp2
	5. UnderPressure (Mourot et al., SCA/CGF 2022) - Exp3/Exp4a
	6. emg2pose (Meta, NeurIPS 2024 D&B) - Exp4b
	"""

	import math
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np


	# ============================================================
	# 1. DeepConvLSTM (Ordonez & Roggen, Sensors 2016)
	# "Deep Convolutional and LSTM Recurrent Neural Networks
	# for Multimodal Wearable Activity Recognition"
	# 4 Conv layers -> 2 LSTM layers -> pooling/per-frame output
	# ============================================================

	class DeepConvLSTMBackbone(nn.Module):
	"""DeepConvLSTM backbone for sequence-level classification (Exp1).

	Input: (B, T, C), optional mask
	Output: (B, output_dim)
	"""

	def __init__(self, input_dim, hidden_dim=128, num_conv_layers=4,
	conv_filters=64, conv_kernel=5, num_lstm_layers=2):
	super().__init__()
	conv_layers = []
	in_ch = input_dim
	for i in range(num_conv_layers):
	out_ch = conv_filters
	conv_layers.append(nn.Sequential(
	nn.Conv1d(in_ch, out_ch, conv_kernel, padding=conv_kernel // 2),
	nn.BatchNorm1d(out_ch),
	nn.ReLU(),
	nn.Dropout(0.1 if i < num_conv_layers - 1 else 0.2),
	))
	in_ch = out_ch
	self.convs = nn.ModuleList(conv_layers)

	self.lstm = nn.LSTM(
	conv_filters, hidden_dim, num_layers=num_lstm_layers,
	batch_first=True, bidirectional=False,
	dropout=0.2 if num_lstm_layers > 1 else 0,
	)
	self.output_dim = hidden_dim

	def forward(self, x, mask=None):
	# x: (B, T, C) -> Conv expects (B, C, T)
	x = x.permute(0, 2, 1)
	for conv in self.convs:
	x = conv(x)
	x = x.permute(0, 2, 1) # (B, T, conv_filters)

	out, (h_n, _) = self.lstm(x)
	# Use last hidden state
	feat = h_n[-1] # (B, hidden_dim)
	return feat


	class DeepConvLSTMContact(nn.Module):
	"""DeepConvLSTM for frame-level contact detection (Exp3).

	Input: (B, T, C)
	Output: (B, T, 2)
	"""

	def __init__(self, input_dim, hidden_dim=64, num_conv_layers=4,
	conv_filters=64, conv_kernel=5):
	super().__init__()
	conv_layers = []
	in_ch = input_dim
	for i in range(num_conv_layers):
	conv_layers.append(nn.Sequential(
	nn.Conv1d(in_ch, conv_filters, conv_kernel, padding=conv_kernel // 2),
	nn.BatchNorm1d(conv_filters),
	nn.ReLU(),
	nn.Dropout(0.1),
	))
	in_ch = conv_filters
	self.convs = nn.ModuleList(conv_layers)
	self.lstm = nn.LSTM(conv_filters, hidden_dim, num_layers=2,
	batch_first=True, bidirectional=True, dropout=0.2)
	self.head = nn.Linear(hidden_dim * 2, 2)

	def forward(self, x):
	x = x.permute(0, 2, 1)
	for conv in self.convs:
	x = conv(x)
	x = x.permute(0, 2, 1)
	out, _ = self.lstm(x)
	return self.head(out)


	# ============================================================
	# 2. InceptionTime (Fawaz et al., DMKD 2020)
	# "InceptionTime: Finding AlexNet for Time Series Classification"
	# Inception modules with multi-scale convolutions + residual
	# ============================================================

	class InceptionModule(nn.Module):
	"""Single Inception module for time series."""

	def __init__(self, in_channels, n_filters=32, kernel_sizes=(9, 19, 39),
	bottleneck_channels=32):
	super().__init__()
	# Bottleneck
	self.bottleneck = nn.Conv1d(in_channels, bottleneck_channels, 1, bias=False)

	# Parallel convolutions with different kernel sizes (odd kernels for symmetric padding)
	self.convs = nn.ModuleList()
	for ks in kernel_sizes:
	self.convs.append(
	nn.Conv1d(bottleneck_channels, n_filters, ks,
	padding=(ks - 1) // 2, bias=False)
	)

	# MaxPool branch
	self.maxpool_conv = nn.Sequential(
	nn.MaxPool1d(3, stride=1, padding=1),
	nn.Conv1d(in_channels, n_filters, 1, bias=False),
	)

	self.bn = nn.BatchNorm1d(n_filters * (len(kernel_sizes) + 1))
	self.relu = nn.ReLU()

	def forward(self, x):
	# x: (B, C, T)
	x_bottleneck = self.bottleneck(x)
	conv_outputs = [conv(x_bottleneck) for conv in self.convs]
	conv_outputs.append(self.maxpool_conv(x))
	out = torch.cat(conv_outputs, dim=1)
	return self.relu(self.bn(out))


	class InceptionBlock(nn.Module):
	"""Stack of Inception modules with a residual connection."""

	def __init__(self, in_channels, n_filters=32, depth=3):
	super().__init__()
	n_out = n_filters * 4 # 3 conv branches + 1 maxpool branch
	modules = []
	for i in range(depth):
	inc = in_channels if i == 0 else n_out
	modules.append(InceptionModule(inc, n_filters))
	self.modules_list = nn.ModuleList(modules)

	# Residual connection
	self.use_residual = (in_channels != n_out)
	if self.use_residual:
	self.residual = nn.Sequential(
	nn.Conv1d(in_channels, n_out, 1, bias=False),
	nn.BatchNorm1d(n_out),
	)
	self.relu = nn.ReLU()

	def forward(self, x):
	residual = x
	for mod in self.modules_list:
	x = mod(x)
	if self.use_residual:
	residual = self.residual(residual)
	return self.relu(x + residual)


	class InceptionTimeBackbone(nn.Module):
	"""InceptionTime backbone for sequence-level classification (Exp1).

	Input: (B, T, C), optional mask
	Output: (B, output_dim)
	"""

	def __init__(self, input_dim, hidden_dim=128, n_filters=32, num_blocks=2, depth=3):
	super().__init__()
	blocks = []
	in_ch = input_dim
	for i in range(num_blocks):
	blocks.append(InceptionBlock(in_ch, n_filters, depth))
	in_ch = n_filters * 4
	self.blocks = nn.ModuleList(blocks)
	self.output_dim = n_filters * 4

	def forward(self, x, mask=None):
	# x: (B, T, C) -> (B, C, T)
	x = x.permute(0, 2, 1)
	for block in self.blocks:
	x = block(x)
	# Global average pooling with mask
	if mask is not None:
	x = (x * mask.unsqueeze(1).float()).sum(2) / mask.sum(1, keepdim=True).float().clamp(min=1)
	else:
	x = x.mean(2)
	return x # (B, n_filters*4)


	class InceptionTimeContact(nn.Module):
	"""InceptionTime for frame-level contact detection (Exp3).

	Input: (B, T, C)
	Output: (B, T, 2)
	"""

	def __init__(self, input_dim, hidden_dim=64, n_filters=32, num_blocks=2, depth=3):
	super().__init__()
	blocks = []
	in_ch = input_dim
	for i in range(num_blocks):
	blocks.append(InceptionBlock(in_ch, n_filters, depth))
	in_ch = n_filters * 4
	self.blocks = nn.ModuleList(blocks)
	self.head = nn.Conv1d(n_filters * 4, 2, 1)

	def forward(self, x):
	x = x.permute(0, 2, 1)
	for block in self.blocks:
	x = block(x)
	out = self.head(x)
	return out.permute(0, 2, 1) # (B, T, 2)


	# ============================================================
	# 3. MS-TCN++ (Li et al., TPAMI 2020)
	# "MS-TCN++: Multi-Stage Temporal Convolutional Network
	# for Action Segmentation"
	# Key improvement: dual dilated layers in each residual block
	# ============================================================

	class DualDilatedResBlock(nn.Module):
	"""Dual dilated residual block (MS-TCN++ key contribution).

	Uses two parallel dilated convolutions with different dilation rates
	to capture both short-range and long-range temporal patterns.
	"""

	def __init__(self, channels, dilation1, dilation2):
	super().__init__()
	# Branch 1: smaller dilation
	self.conv1_dilated = nn.Conv1d(
	channels, channels, 3,
	padding=dilation1, dilation=dilation1
	)
	# Branch 2: larger dilation
	self.conv2_dilated = nn.Conv1d(
	channels, channels, 3,
	padding=dilation2, dilation=dilation2
	)
	self.conv_fusion = nn.Conv1d(channels, channels, 1)
	self.bn = nn.BatchNorm1d(channels)
	self.dropout = nn.Dropout(0.3)

	def forward(self, x):
	residual = x
	out1 = F.relu(self.conv1_dilated(x))
	out2 = F.relu(self.conv2_dilated(x))
	out = out1 + out2
	out = self.dropout(F.relu(self.bn(self.conv_fusion(out))))
	return out + residual


	class MSTCNPPStage(nn.Module):
	"""Single stage of MS-TCN++ with dual dilated layers."""

	def __init__(self, in_channels, hidden_channels, num_classes, num_layers=10):
	super().__init__()
	self.input_conv = nn.Conv1d(in_channels, hidden_channels, 1)
	self.layers = nn.ModuleList()
	for i in range(num_layers):
	dilation1 = 2 ** i
	dilation2 = 2 (i + 1) if i < num_layers - 1 else 2 i
	self.layers.append(DualDilatedResBlock(hidden_channels, dilation1, dilation2))
	self.output_conv = nn.Conv1d(hidden_channels, num_classes, 1)

	def forward(self, x):
	x = self.input_conv(x)
	for layer in self.layers:
	x = layer(x)
	return self.output_conv(x)


	class MSTCNPP(nn.Module):
	"""MS-TCN++ for temporal action segmentation (Exp2).

	Input: (B, T, C)
	Output: list of (B, T, num_classes) per stage
	"""

	def __init__(self, input_dim, num_classes, hidden_dim=64, num_stages=4, num_layers=10):
	super().__init__()
	self.stages = nn.ModuleList()
	# First stage: input features -> predictions
	self.stages.append(MSTCNPPStage(input_dim, hidden_dim, num_classes, num_layers))
	# Refinement stages: predictions -> refined predictions
	for _ in range(num_stages - 1):
	self.stages.append(MSTCNPPStage(num_classes, hidden_dim, num_classes, num_layers))

	def forward(self, x):
	x = x.permute(0, 2, 1) # (B, C, T)
	outputs = []
	for stage in self.stages:
	x = stage(x)
	outputs.append(x.permute(0, 2, 1)) # (B, T, num_classes)
	# Feed softmax of predictions to next stage
	if stage != self.stages[-1]:
	x = F.softmax(x, dim=1)
	return outputs


	# ============================================================
	# 4. DiffAct (Liu et al., ICCV 2023)
	# "Diffusion Action Segmentation"
	# Denoising diffusion model for iterative action refinement.
	# Simplified but faithful implementation.
	# ============================================================

	class ConditionalLayerNorm(nn.Module):
	"""Layer norm conditioned on diffusion timestep."""

	def __init__(self, channels):
	super().__init__()
	self.norm = nn.GroupNorm(1, channels) # equivalent to LayerNorm for 1D

	def forward(self, x):
	return self.norm(x)


	class DiffActBlock(nn.Module):
	"""Residual block for DiffAct denoising network."""

	def __init__(self, channels, dilation, time_emb_dim):
	super().__init__()
	self.conv1 = nn.Conv1d(channels, channels, 3, padding=dilation, dilation=dilation)
	self.conv2 = nn.Conv1d(channels, channels, 1)
	self.norm1 = ConditionalLayerNorm(channels)
	self.norm2 = ConditionalLayerNorm(channels)
	self.time_proj = nn.Linear(time_emb_dim, channels)
	self.dropout = nn.Dropout(0.1)

	def forward(self, x, time_emb):
	residual = x
	x = self.norm1(x)
	x = F.relu(self.conv1(x))
	# Add time embedding
	t = self.time_proj(time_emb).unsqueeze(-1) # (B, C, 1)
	x = x + t
	x = self.norm2(x)
	x = self.dropout(F.relu(self.conv2(x)))
	return x + residual


	class DiffActConditionEncoder(nn.Module):
	"""Temporal feature encoder for conditioning the denoising network."""

	def __init__(self, input_dim, hidden_dim, num_layers=6):
	super().__init__()
	self.input_conv = nn.Conv1d(input_dim, hidden_dim, 1)
	self.layers = nn.ModuleList()
	for i in range(num_layers):
	dilation = 2 ** (i % 5)
	self.layers.append(nn.Sequential(
	nn.Conv1d(hidden_dim, hidden_dim, 3, padding=dilation, dilation=dilation),
	nn.BatchNorm1d(hidden_dim),
	nn.ReLU(),
	nn.Dropout(0.1),
	))

	def forward(self, x):
	x = self.input_conv(x)
	for layer in self.layers:
	x = layer(x) + x # residual
	return x


	class SinusoidalTimeEmbedding(nn.Module):
	"""Sinusoidal positional embedding for diffusion timestep."""

	def __init__(self, dim):
	super().__init__()
	self.dim = dim
	self.mlp = nn.Sequential(
	nn.Linear(dim, dim * 4),
	nn.GELU(),
	nn.Linear(dim * 4, dim),
	)

	def forward(self, t):
	half_dim = self.dim // 2
	emb = math.log(10000) / (half_dim - 1)
	emb = torch.exp(torch.arange(half_dim, device=t.device) * -emb)
	emb = t.unsqueeze(-1).float() * emb.unsqueeze(0)
	emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
	return self.mlp(emb)


	class DiffAct(nn.Module):
	"""DiffAct: Diffusion Action Segmentation (Exp2).

	During training: noises ground-truth action probabilities and denoises.
	During inference: iteratively denoises from pure noise.

	Input: (B, T, C)
	Output: list of (B, T, num_classes) [final denoised prediction]
	"""

	def __init__(self, input_dim, num_classes, hidden_dim=64,
	num_encoder_layers=6, num_denoise_layers=6,
	num_diffusion_steps=10):
	super().__init__()
	self.num_classes = num_classes
	self.num_steps = num_diffusion_steps

	# Condition encoder: extract temporal features from input
	self.condition_encoder = DiffActConditionEncoder(input_dim, hidden_dim, num_encoder_layers)

	# Initial prediction head (non-diffusion baseline)
	self.initial_head = nn.Conv1d(hidden_dim, num_classes, 1)

	# Time embedding
	self.time_emb = SinusoidalTimeEmbedding(hidden_dim)

	# Denoising network
	self.denoise_input = nn.Conv1d(num_classes + hidden_dim, hidden_dim, 1)
	self.denoise_blocks = nn.ModuleList()
	for i in range(num_denoise_layers):
	dilation = 2 ** (i % 5)
	self.denoise_blocks.append(DiffActBlock(hidden_dim, dilation, hidden_dim))
	self.denoise_output = nn.Conv1d(hidden_dim, num_classes, 1)

	# Noise schedule (cosine)
	self._setup_noise_schedule()

	def _setup_noise_schedule(self):
	steps = self.num_steps
	s = 0.008
	t = torch.linspace(0, steps, steps + 1)
	alphas_cumprod = torch.cos(((t / steps) + s) / (1 + s) * math.pi * 0.5) ** 2
	alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
	betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
	betas = torch.clamp(betas, 0.0001, 0.999)
	alphas = 1.0 - betas
	alphas_cumprod = torch.cumprod(alphas, dim=0)
	self.register_buffer('betas', betas)
	self.register_buffer('alphas_cumprod', alphas_cumprod)
	self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
	self.register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1 - alphas_cumprod))

	def _add_noise(self, x_start, t, noise=None):
	"""Add noise to x_start at timestep t."""
	if noise is None:
	noise = torch.randn_like(x_start)
	sqrt_alpha = self.sqrt_alphas_cumprod[t].view(-1, 1, 1)
	sqrt_one_minus = self.sqrt_one_minus_alphas_cumprod[t].view(-1, 1, 1)
	return sqrt_alpha * x_start + sqrt_one_minus * noise

	def _denoise_step(self, x_noisy, cond_features, time_emb):
	"""Single denoising step."""
	x = torch.cat([x_noisy, cond_features], dim=1) # (B, C+hidden, T)
	x = self.denoise_input(x)
	for block in self.denoise_blocks:
	x = block(x, time_emb)
	return self.denoise_output(x)

	def forward(self, x):
	"""
	Training: returns [initial_pred, denoised_pred]
	Inference: returns [initial_pred, iteratively_denoised_pred]
	"""
	x_in = x.permute(0, 2, 1) # (B, C, T)
	B, _, T = x_in.shape

	# Encode condition features
	cond = self.condition_encoder(x_in) # (B, hidden, T)
	initial_logits = self.initial_head(cond).permute(0, 2, 1) # (B, T, num_classes)

	if self.training:
	# Training: noise the initial prediction and denoise (end-to-end)
	x_start = F.softmax(initial_logits, dim=-1).permute(0, 2, 1) # (B, C, T)
	t = torch.randint(0, self.num_steps, (B,), device=x.device)
	noise = torch.randn_like(x_start)
	x_noisy = self._add_noise(x_start.detach(), t, noise)
	time_emb = self.time_emb(t)
	denoised = self._denoise_step(x_noisy, cond, time_emb)
	return [initial_logits, denoised.permute(0, 2, 1)]
	else:
	# Inference: iterative denoising from noise
	x_t = torch.randn(B, self.num_classes, T, device=x.device)
	for step in reversed(range(self.num_steps)):
	t = torch.full((B,), step, device=x.device, dtype=torch.long)
	time_emb = self.time_emb(t)
	pred_noise = self._denoise_step(x_t, cond, time_emb)
	# Simplified DDPM update
	alpha = self.alphas_cumprod[step]
	alpha_prev = self.alphas_cumprod[step - 1] if step > 0 else torch.tensor(1.0)
	beta = self.betas[step]
	x_t = (1 / torch.sqrt(1 - beta)) * (
	x_t - beta / self.sqrt_one_minus_alphas_cumprod[step] * pred_noise
	)
	if step > 0:
	x_t = x_t + torch.sqrt(beta) * torch.randn_like(x_t) * 0.5
	return [initial_logits, x_t.permute(0, 2, 1)]


	# ============================================================
	# 5. UnderPressure (Mourot et al., SCA/CGF 2022)
	# "UnderPressure: Deep Learning for Foot Contact Detection,
	# Ground Reaction Force Estimation and Footskate Cleanup"
	# GRU-based architecture for contact detection + force regression.
	# Adapted for hand contact detection and MoCap->Pressure prediction.
	# ============================================================

	class UnderPressureContact(nn.Module):
	"""UnderPressure model adapted for hand contact detection (Exp3).

	Architecture: Conv feature extractor -> BiGRU -> contact prediction head
	Input: (B, T, C)
	Output: (B, T, 2) [right_contact, left_contact]
	"""

	def __init__(self, input_dim, hidden_dim=64, num_gru_layers=2):
	super().__init__()
	# Feature extractor (conv layers for local temporal patterns)
	self.feature_extractor = nn.Sequential(
	nn.Conv1d(input_dim, hidden_dim, 7, padding=3),
	nn.BatchNorm1d(hidden_dim),
	nn.ReLU(),
	nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2),
	nn.BatchNorm1d(hidden_dim),
	nn.ReLU(),
	)
	# BiGRU for temporal modeling
	self.gru = nn.GRU(
	hidden_dim, hidden_dim, num_layers=num_gru_layers,
	batch_first=True, bidirectional=True,
	dropout=0.2 if num_gru_layers > 1 else 0,
	)
	# Contact prediction head
	self.contact_head = nn.Sequential(
	nn.Linear(hidden_dim * 2, hidden_dim),
	nn.ReLU(),
	nn.Dropout(0.2),
	nn.Linear(hidden_dim, 2),
	)

	def forward(self, x):
	# x: (B, T, C) -> (B, C, T)
	feat = self.feature_extractor(x.permute(0, 2, 1))
	feat = feat.permute(0, 2, 1) # (B, T, hidden)
	gru_out, _ = self.gru(feat)
	return self.contact_head(gru_out) # (B, T, 2)


	class UnderPressureRegressor(nn.Module):
	"""UnderPressure model adapted for MoCap -> Pressure regression (Exp4a).

	Architecture: Conv feature extractor -> BiGRU -> pressure regression head
	Input: (B, T, input_dim)
	Output: (B, T, output_dim)
	"""

	def __init__(self, input_dim, output_dim, hidden_dim=128, num_gru_layers=2):
	super().__init__()
	self.feature_extractor = nn.Sequential(
	nn.Conv1d(input_dim, hidden_dim, 7, padding=3),
	nn.BatchNorm1d(hidden_dim),
	nn.ReLU(),
	nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2),
	nn.BatchNorm1d(hidden_dim),
	nn.ReLU(),
	nn.Conv1d(hidden_dim, hidden_dim, 3, padding=1),
	nn.BatchNorm1d(hidden_dim),
	nn.ReLU(),
	)
	self.gru = nn.GRU(
	hidden_dim, hidden_dim, num_layers=num_gru_layers,
	batch_first=True, bidirectional=True,
	dropout=0.2 if num_gru_layers > 1 else 0,
	)
	self.regression_head = nn.Sequential(
	nn.Linear(hidden_dim * 2, hidden_dim),
	nn.ReLU(),
	nn.Dropout(0.2),
	nn.Linear(hidden_dim, output_dim),
	)

	def forward(self, x):
	feat = self.feature_extractor(x.permute(0, 2, 1))
	feat = feat.permute(0, 2, 1)
	gru_out, _ = self.gru(feat)
	return self.regression_head(gru_out)


	# ============================================================
	# 6. emg2pose (Meta/Facebook Research, NeurIPS 2024 D&B)
	# "emg2pose: A Large and Diverse Benchmark for
	# Surface Electromyographic Hand Pose Estimation"
	# CNN feature extractor + Transformer encoder,
	# with optional velocity-based integration (vemg2pose).
	# ============================================================

	class EMG2PoseEncoder(nn.Module):
	"""CNN + Transformer encoder from emg2pose."""

	def __init__(self, input_dim, hidden_dim=128, num_transformer_layers=4, nhead=4):
	super().__init__()
	# Multi-scale CNN feature extractor
	self.conv_small = nn.Sequential(
	nn.Conv1d(input_dim, hidden_dim // 2, 3, padding=1),
	nn.BatchNorm1d(hidden_dim // 2),
	nn.ReLU(),
	)
	self.conv_medium = nn.Sequential(
	nn.Conv1d(input_dim, hidden_dim // 4, 7, padding=3),
	nn.BatchNorm1d(hidden_dim // 4),
	nn.ReLU(),
	)
	self.conv_large = nn.Sequential(
	nn.Conv1d(input_dim, hidden_dim // 4, 15, padding=7),
	nn.BatchNorm1d(hidden_dim // 4),
	nn.ReLU(),
	)
	# Projection to hidden_dim
	self.proj = nn.Sequential(
	nn.Conv1d(hidden_dim, hidden_dim, 1),
	nn.BatchNorm1d(hidden_dim),
	nn.ReLU(),
	)
	# Transformer encoder for temporal modeling
	encoder_layer = nn.TransformerEncoderLayer(
	d_model=hidden_dim, nhead=nhead,
	dim_feedforward=hidden_dim * 4,
	dropout=0.1, batch_first=True,
	)
	self.transformer = nn.TransformerEncoder(encoder_layer, num_transformer_layers)

	def forward(self, x):
	# x: (B, T, C) -> (B, C, T)
	x_t = x.permute(0, 2, 1)
	f_small = self.conv_small(x_t)
	f_medium = self.conv_medium(x_t)
	f_large = self.conv_large(x_t)
	feat = torch.cat([f_small, f_medium, f_large], dim=1)
	feat = self.proj(feat).permute(0, 2, 1) # (B, T, hidden)
	return self.transformer(feat)


	class EMG2Pose(nn.Module):
	"""emg2pose model for EMG -> Hand Pose regression (Exp4b).

	Predicts per-frame hand joint positions from EMG signals.
	Uses velocity-based integration (vemg2pose variant):
	predict velocity -> integrate to get positions.

	Input: (B, T, input_dim) [EMG channels]
	Output: (B, T, output_dim) [hand joint positions]
	"""

	def __init__(self, input_dim, output_dim, hidden_dim=128,
	num_transformer_layers=4, use_velocity=True):
	super().__init__()
	self.use_velocity = use_velocity
	self.encoder = EMG2PoseEncoder(input_dim, hidden_dim, num_transformer_layers)

	if use_velocity:
	# Predict velocity, then integrate
	self.velocity_head = nn.Sequential(
	nn.Linear(hidden_dim, hidden_dim // 2),
	nn.ReLU(),
	nn.Dropout(0.1),
	nn.Linear(hidden_dim // 2, output_dim),
	)
	# Learnable initial position
	self.initial_pos = nn.Parameter(torch.zeros(1, 1, output_dim))
	else:
	# Direct position prediction
	self.position_head = nn.Sequential(
	nn.Linear(hidden_dim, hidden_dim // 2),
	nn.ReLU(),
	nn.Dropout(0.1),
	nn.Linear(hidden_dim // 2, output_dim),
	)

	def forward(self, x):
	features = self.encoder(x) # (B, T, hidden)

	if self.use_velocity:
	velocity = self.velocity_head(features) # (B, T, output_dim)
	# Cumulative sum to integrate velocity -> position
	positions = torch.cumsum(velocity, dim=1) + self.initial_pos
	return positions
	else:
	return self.position_head(features)