Spaces:

Ace-119
/

StressDetect

Running

App Files Files Community

StressDetect / models /architecture.py

Ace-119

Implement multichannel CNN with multi-head attention and transformer wrappers

1b8fe16 about 1 month ago

raw

history blame contribute delete

21.1 kB

	"""
	models/architecture.py
	======================
	Phase 2 → Phase 5: Model Architecture

	Tier 1 — OptimizedMultichannelCNN (PyTorch from scratch)
	Parallel 1D convolution channels (kernel sizes 2, 3, 5) with
	min_len trimming to prevent shape mismatch on concatenation,
	followed by multi-head self-attention (default 4 heads) and a
	classification head.

	Tier 2 — Transformer wrappers
	Lightweight wrappers around HuggingFace models:
	- ``DeBERTaStressClassifier`` (DeBERTa-v3-Small)
	- ``MiniLMStressClassifier`` (MiniLM-L6-v2)

	Calibration
	-----------
	- ``TemperatureScaling`` — post-hoc probability calibration (Guo et al. 2017).
	Wraps any classifier and divides logits by a learned scalar ``T`` before
	softmax, reducing overconfidence.

	Attention
	---------
	- ``MultiHeadSelfAttention`` (new default) — scaled dot-product attention
	split across ``num_heads`` independent subspaces then projected back.
	Produces richer features and more interpretable per-token importance
	weights compared to single-head dot-product attention.
	- ``DotProductSelfAttention`` kept for backward compatibility (single head).

	Design Guardrails
	-----------------
	- Conv1D outputs are trimmed to ``min_len`` before concatenation —
	this is the CRITICAL guard against tensor shape mismatch.
	- Self-attention returns attention weights alongside the pooled vector
	for downstream explainability / heatmap rendering.
	"""

	from __future__ import annotations

	import hashlib
	from typing import Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	# ---------------------------------------------------------------------------
	# Stop-word dampening
	# ---------------------------------------------------------------------------
	# Common English stop words that carry little semantic signal for stress
	# detection but tend to dominate attention weights (e.g. 'I', 'the', 'a').
	# Reducing their embedding magnitude before the conv layers prevents the
	# attention mechanism from over-emphasising them, which was observed in
	# heatmap analysis.
	# ---------------------------------------------------------------------------

	_STOP_WORDS: frozenset[str] = frozenset({
	"i", "me", "my", "myself", "we", "our", "ours", "ourselves",
	"you", "your", "yours", "yourself", "yourselves",
	"he", "him", "his", "himself", "she", "her", "hers", "herself",
	"it", "its", "itself", "they", "them", "their", "theirs", "themselves",
	"what", "which", "who", "whom", "this", "that", "these", "those",
	"am", "is", "are", "was", "were", "be", "been", "being",
	"have", "has", "had", "having", "do", "does", "did", "doing",
	"a", "an", "the", "and", "but", "if", "or", "because", "as",
	"until", "while", "of", "at", "by", "for", "with", "about",
	"against", "between", "through", "during", "before", "after",
	"above", "below", "to", "from", "up", "down", "in", "out",
	"on", "off", "over", "under", "again", "further", "then", "once",
	"here", "there", "when", "where", "why", "how", "all", "both",
	"each", "few", "more", "most", "other", "some", "such", "no",
	"nor", "not", "only", "own", "same", "so", "than", "too", "very",
	"s", "t", "can", "will", "just", "don", "should", "now",
	"d", "ll", "m", "o", "re", "ve", "y",
	})


	def _compute_stop_word_ids(vocab_size: int) -> set[int]:
	"""Return hash-based token IDs for :data:`_STOP_WORDS`.

	Uses the same ``md5`` hashing scheme as the project tokenizers in
	``api/main.py`` and ``training/train.py`` so that the IDs match at
	both training and inference time.
	"""
	ids: set[int] = set()
	for word in _STOP_WORDS:
	token_id = (
	int(
	hashlib.md5(
	word.encode("utf-8"), usedforsecurity=False
	).hexdigest(),
	16,
	)
	% (vocab_size - 1)
	+ 1
	)
	ids.add(token_id)
	return ids


	# ---------------------------------------------------------------------------
	# Tier 1: OptimizedMultichannelCNN
	# ---------------------------------------------------------------------------


	class DotProductSelfAttention(nn.Module):
	"""Simple scaled dot-product self-attention over a sequence.

	Input shape : ``(batch, seq_len, hidden)``
	Output shape: ``(batch, hidden)`` (attended pool) + ``(batch, seq_len)``

	Kept for backward compatibility. New code should prefer
	:class:`MultiHeadSelfAttention`.
	"""

	def __init__(self, hidden_dim: int) -> None:
	super().__init__()
	self.query = nn.Linear(hidden_dim, hidden_dim)
	self.key = nn.Linear(hidden_dim, hidden_dim)
	self.value = nn.Linear(hidden_dim, hidden_dim)
	self.scale = hidden_dim ** 0.5

	def forward(
	self, x: torch.Tensor
	) -> tuple[torch.Tensor, torch.Tensor]:
	"""
	Parameters
	----------
	x : Tensor, shape ``(B, L, H)``

	Returns
	-------
	pooled : Tensor, shape ``(B, H)``
	weights : Tensor, shape ``(B, L)`` — attention weights (for heatmaps)
	"""
	q = self.query(x) # (B, L, H)
	k = self.key(x) # (B, L, H)
	v = self.value(x) # (B, L, H)

	scores = torch.bmm(q, k.transpose(1, 2)) / self.scale # (B, L, L)
	attn = F.softmax(scores, dim=-1) # (B, L, L)

	context = torch.bmm(attn, v) # (B, L, H)

	# Pool: mean of attention-weighted values
	pooled = context.mean(dim=1) # (B, H)

	# Per-token importance: mean attention received from all queries
	weights = attn.mean(dim=1) # (B, L)

	return pooled, weights


	class MultiHeadSelfAttention(nn.Module):
	"""Multi-head scaled dot-product self-attention (Vaswani et al. 2017).

	Splits the hidden dimension into ``num_heads`` independent subspaces,
	computes scaled dot-product attention within each head, then concatenates
	and projects the results. This produces richer feature representations
	than single-head attention and yields more interpretable per-token
	importance weights for heatmap rendering.

	Input shape : ``(batch, seq_len, hidden)``
	Output shape: ``(batch, hidden)`` (attended pool) + ``(batch, seq_len)``

	Parameters
	----------
	hidden_dim : int
	Total hidden dimension. Must be divisible by ``num_heads``.
	num_heads : int
	Number of parallel attention heads. Default: 4.
	dropout : float
	Dropout applied to attention weights during training.
	"""

	def __init__(
	self, hidden_dim: int, num_heads: int = 4, dropout: float = 0.1
	) -> None:
	super().__init__()
	if hidden_dim % num_heads != 0:
	raise ValueError(
	f"hidden_dim ({hidden_dim}) must be divisible by "
	f"num_heads ({num_heads})."
	)
	self.num_heads = num_heads
	self.d_k = hidden_dim // num_heads
	self.scale = self.d_k ** 0.5

	self.query = nn.Linear(hidden_dim, hidden_dim)
	self.key = nn.Linear(hidden_dim, hidden_dim)
	self.value = nn.Linear(hidden_dim, hidden_dim)
	self.out_proj = nn.Linear(hidden_dim, hidden_dim)
	self.attn_dropout = nn.Dropout(dropout)

	def forward(
	self, x: torch.Tensor
	) -> tuple[torch.Tensor, torch.Tensor]:
	"""
	Parameters
	----------
	x : Tensor, shape ``(B, L, H)``

	Returns
	-------
	pooled : Tensor, shape ``(B, H)``
	weights : Tensor, shape ``(B, L)`` — per-token importance (for heatmaps)
	"""
	B, L, H = x.shape

	# Project and reshape to (B, num_heads, L, d_k)
	q = self.query(x).view(B, L, self.num_heads, self.d_k).transpose(1, 2)
	k = self.key(x).view(B, L, self.num_heads, self.d_k).transpose(1, 2)
	v = self.value(x).view(B, L, self.num_heads, self.d_k).transpose(1, 2)

	# Scaled dot-product attention: (B, num_heads, L, L)
	scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale
	attn = F.softmax(scores, dim=-1) # (B, num_heads, L, L)
	attn = self.attn_dropout(attn)

	# Context: (B, num_heads, L, d_k)
	context = torch.matmul(attn, v)

	# Merge heads: (B, L, H)
	context = context.transpose(1, 2).contiguous().view(B, L, H)

	# Output projection
	out = self.out_proj(context) # (B, L, H)

	# Mean-pool across the sequence
	pooled = out.mean(dim=1) # (B, H)

	# Per-token importance: average attention weight received across
	# all heads and all query positions
	weights = attn.mean(dim=1).mean(dim=1) # (B, L)

	return pooled, weights


	# ---------------------------------------------------------------------------
	# Temperature Scaling — post-hoc probability calibration
	# ---------------------------------------------------------------------------


	class TemperatureScaling(nn.Module):
	"""Post-hoc calibration via temperature scaling (Guo et al. 2017).

	Divides logits by a single learnable scalar temperature ``T > 0`` before
	softmax.

	* ``T > 1`` → probabilities are smoothed toward 0.5 (reduces overconfidence).
	* ``T < 1`` → probabilities become more extreme.
	* ``T = 1`` → no effect (identity).

	The temperature is calibrated on a held-out validation set by minimising
	NLL loss. During inference with an uncalibrated model, keep ``T = 1.0``.

	Parameters
	----------
	temperature : float
	Initial temperature. Defaults to 1.0 (no calibration).

	Example
	-------
	>>> ts = TemperatureScaling(temperature=1.5)
	>>> scaled_logits = ts(logits) # use before softmax
	>>> ts.calibrate(val_logits, val_labels) # fit T on a held-out set
	"""

	def __init__(self, temperature: float = 1.0) -> None:
	super().__init__()
	self.temperature = nn.Parameter(
	torch.ones(1) * max(temperature, 1e-6)
	)

	def forward(self, logits: torch.Tensor) -> torch.Tensor:
	"""Return temperature-scaled logits."""
	return logits / self.temperature.clamp(min=1e-6)

	def calibrate(
	self, logits: torch.Tensor, labels: torch.Tensor
	) -> None:
	"""Fit the temperature on a held-out (logits, labels) set.

	Uses L-BFGS to minimise NLL. ``logits`` and ``labels`` should
	be collected on the validation set before calling this method.

	Parameters
	----------
	logits : Tensor, shape ``(N, C)``
	Raw (uncalibrated) model logits.
	labels : Tensor, shape ``(N,)``
	Ground-truth class indices.
	"""
	from torch.optim import LBFGS

	nll = nn.CrossEntropyLoss()
	optimizer = LBFGS([self.temperature], lr=0.01, max_iter=50)

	def _eval() -> torch.Tensor:
	optimizer.zero_grad()
	loss = nll(self.forward(logits), labels)
	loss.backward()
	return loss

	optimizer.step(_eval)


	class OptimizedMultichannelCNN(nn.Module):
	"""Multi-channel 1D CNN with multi-head self-attention for stress detection.

	Architecture
	------------
	1. Embedding layer (with stop-word dampening)
	2. Three parallel Conv1D branches (kernel sizes 2, 3, 5)
	3. min_len trimming — outputs are trimmed to the shortest length
	before concatenation to prevent tensor shape mismatches.
	4. Multi-head self-attention (default 4 heads)
	5. Classification head (FC → Dropout → FC)

	Parameters
	----------
	vocab_size : int
	Size of the token vocabulary.
	embed_dim : int
	Embedding dimension.
	num_filters : int
	Number of filters per Conv1D branch.
	kernel_sizes : tuple[int, ...]
	Kernel sizes for the parallel Conv1D branches.
	num_classes : int
	Number of output classes (default 2: stress / no-stress).
	dropout : float
	Dropout probability.
	aux_dim : int
	Optional numeric feature dimension appended to pooled CNN features.
	num_attention_heads : int
	Number of attention heads. Must divide ``num_filters * len(kernel_sizes)``
	evenly. Set to 1 to use single-head dot-product attention (legacy).
	"""

	def __init__(
	self,
	vocab_size: int,
	embed_dim: int = 128,
	num_filters: int = 64,
	kernel_sizes: tuple[int, ...] = (2, 3, 5),
	num_classes: int = 2,
	dropout: float = 0.3,
	aux_dim: int = 0,
	stop_word_dampening: float = 0.3,
	num_attention_heads: int = 4,
	) -> None:
	super().__init__()
	self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

	# ── Stop-word dampening ──
	# Build a per-token-ID lookup: 1.0 for stop words, 0.0 otherwise.
	# During forward() each embedding is scaled by
	# 1.0 - is_stop * (1.0 - stop_word_dampening)
	# so content words keep their full magnitude while stop-word
	# embeddings are reduced to ``stop_word_dampening`` of their
	# original magnitude.
	self.stop_word_dampening = stop_word_dampening
	stop_ids = _compute_stop_word_ids(vocab_size)
	stop_mask = torch.zeros(vocab_size, dtype=torch.float)
	for sid in stop_ids:
	stop_mask[sid] = 1.0
	# persistent=False → not part of state_dict, avoids checkpoint compat issues
	self.register_buffer("_stop_word_lookup", stop_mask, persistent=False)

	# Parallel Conv1D branches
	self.convs = nn.ModuleList(
	[
	nn.Conv1d(embed_dim, num_filters, kernel_size=ks, padding=0)
	for ks in kernel_sizes
	]
	)

	total_filters = num_filters * len(kernel_sizes)

	# ── Attention ──
	# Use multi-head attention when possible; fall back to single-head
	# dot-product attention if total_filters is not divisible by num_heads.
	if num_attention_heads > 1 and total_filters % num_attention_heads == 0:
	self.attention: nn.Module = MultiHeadSelfAttention(
	total_filters, num_heads=num_attention_heads, dropout=dropout
	)
	else:
	self.attention = DotProductSelfAttention(total_filters)

	self.aux_dim = aux_dim
	aux_hidden = min(aux_dim, total_filters // 2) if aux_dim > 0 else 0
	self.aux_projection = (
	nn.Sequential(
	nn.Linear(aux_dim, aux_hidden),
	nn.ReLU(),
	nn.Dropout(dropout),
	)
	if aux_dim > 0
	else None
	)

	combined_dim = total_filters + aux_hidden

	# Classification head
	self.classifier = nn.Sequential(
	nn.Linear(combined_dim, combined_dim // 2),
	nn.ReLU(),
	nn.Dropout(dropout),
	nn.Linear(combined_dim // 2, num_classes),
	)

	self.dropout = nn.Dropout(dropout)

	def forward(
	self,
	input_ids: torch.Tensor,
	aux_features: Optional[torch.Tensor] = None,
	) -> dict[str, torch.Tensor]:
	"""
	Parameters
	----------
	input_ids : Tensor, shape ``(B, L)``
	Token indices.

	Returns
	-------
	dict with keys:
	``logits`` : Tensor, shape ``(B, num_classes)``
	``attention_weights`` : Tensor, shape ``(B, seq_len')``
	"""
	# Embedding: (B, L) → (B, L, E)
	x = self.embedding(input_ids)
	x = self.dropout(x)

	# ── Stop-word dampening ──
	# Reduce embedding magnitudes for stop-word tokens so that the
	# subsequent conv + attention layers do not over-emphasise them.
	is_stop = self._stop_word_lookup[input_ids] # (B, L), 0 or 1
	dampening = 1.0 - is_stop * (1.0 - self.stop_word_dampening) # (B, L)
	x = x * dampening.unsqueeze(-1) # (B, L, E)

	# Conv1D expects (B, C, L) — transpose
	x_t = x.transpose(1, 2) # (B, E, L)

	# Apply parallel convolutions + ReLU
	conv_outputs = []
	for conv in self.convs:
	c = F.relu(conv(x_t)) # (B, F, L')
	conv_outputs.append(c)

	# ─── CRITICAL: Trim to min_len to prevent shape mismatch ───
	min_len = min(c.size(2) for c in conv_outputs)
	conv_outputs = [c[:, :, :min_len] for c in conv_outputs]

	# Concatenate along the filter dimension: (B, F*3, min_len)
	merged = torch.cat(conv_outputs, dim=1)

	# Transpose back for attention: (B, min_len, F*3)
	merged = merged.transpose(1, 2)

	# Self-attention (multi-head or single-head)
	pooled, attn_weights = self.attention(merged) # (B, F*3), (B, min_len)

	if self.aux_projection is not None:
	if aux_features is None:
	aux_features = torch.zeros(
	pooled.size(0),
	self.aux_dim,
	device=pooled.device,
	)
	aux_emb = self.aux_projection(aux_features)
	pooled = torch.cat([pooled, aux_emb], dim=1)

	# Classification
	logits = self.classifier(pooled) # (B, num_classes)

	return {"logits": logits, "attention_weights": attn_weights}


	# ---------------------------------------------------------------------------
	# Tier 2: Transformer wrappers
	# ---------------------------------------------------------------------------


	class DeBERTaStressClassifier(nn.Module):
	"""Stress classifier wrapping ``microsoft/deberta-v3-small``.

	Uses the HuggingFace ``transformers`` library for the backbone and
	adds a simple classification head.
	"""

	MODEL_NAME = "microsoft/deberta-v3-small"

	def __init__(self, num_classes: int = 2, dropout: float = 0.1) -> None:
	super().__init__()
	from transformers import AutoModel

	self.backbone = AutoModel.from_pretrained(self.MODEL_NAME)
	hidden = self.backbone.config.hidden_size
	# +1 for optional sentiment feature
	self.classifier = nn.Sequential(
	nn.Dropout(dropout),
	nn.Linear(hidden + 1, num_classes),
	)

	def forward(
	self,
	input_ids: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	sentiment: Optional[torch.Tensor] = None,
	) -> dict[str, torch.Tensor]:
	outputs = self.backbone(
	input_ids=input_ids, attention_mask=attention_mask
	)
	# CLS token pooling
	pooled = outputs.last_hidden_state[:, 0, :]

	if sentiment is not None:
	sentiment = sentiment.unsqueeze(1) if sentiment.dim() == 1 else sentiment
	pooled = torch.cat([pooled, sentiment], dim=1)
	else:
	# Append neutral sentiment (0.5) when not provided
	neutral = torch.full(
	(pooled.size(0), 1), 0.5,
	device=pooled.device, dtype=pooled.dtype,
	)
	pooled = torch.cat([pooled, neutral], dim=1)

	logits = self.classifier(pooled)
	return {"logits": logits}


	class MiniLMStressClassifier(nn.Module):
	"""Stress classifier wrapping ``sentence-transformers/all-MiniLM-L6-v2``.

	Uses mean pooling over the last hidden state as the sentence
	representation.
	"""

	MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

	def __init__(self, num_classes: int = 2, dropout: float = 0.1) -> None:
	super().__init__()
	from transformers import AutoModel

	self.backbone = AutoModel.from_pretrained(self.MODEL_NAME)
	hidden = self.backbone.config.hidden_size
	# +1 for optional sentiment feature
	self.classifier = nn.Sequential(
	nn.Dropout(dropout),
	nn.Linear(hidden + 1, num_classes),
	)

	def forward(
	self,
	input_ids: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	sentiment: Optional[torch.Tensor] = None,
	) -> dict[str, torch.Tensor]:
	outputs = self.backbone(
	input_ids=input_ids, attention_mask=attention_mask
	)
	# Mean pooling
	hidden_states = outputs.last_hidden_state
	if attention_mask is not None:
	mask = attention_mask.unsqueeze(-1).float()
	pooled = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(
	min=1e-9
	)
	else:
	pooled = hidden_states.mean(dim=1)

	if sentiment is not None:
	sentiment = sentiment.unsqueeze(1) if sentiment.dim() == 1 else sentiment
	pooled = torch.cat([pooled, sentiment], dim=1)
	else:
	neutral = torch.full(
	(pooled.size(0), 1), 0.5,
	device=pooled.device, dtype=pooled.dtype,
	)
	pooled = torch.cat([pooled, neutral], dim=1)

	logits = self.classifier(pooled)
	return {"logits": logits}