Spaces:
Sleeping
Sleeping
File size: 5,710 Bytes
c5c9261 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 | """
Classification Heads — Pooling strategies and MLP classifier for deepfake detection.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class AttentiveStatsPooling(nn.Module):
    """
    Attentive Statistics Pooling.

    A small scorer (Linear -> Tanh -> Linear) produces one scalar score per
    frame. The scores are softmax-normalised over the time axis and used as
    weights for a weighted mean and a weighted standard deviation, which are
    concatenated into one vector per sequence.
    Used in ECAPA-TDNN and top speaker verification systems.
    """

    def __init__(self, hidden_size: int, attention_dim: int = 128):
        """
        Args:
            hidden_size: feature dimension of each input frame.
            attention_dim: width of the hidden layer of the frame scorer.
        """
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, attention_dim),
            nn.Tanh(),
            nn.Linear(attention_dim, 1),
        )
        self.output_size = hidden_size * 2  # mean + std concatenated

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """
        Args:
            x: (batch, time, hidden_size)
            mask: optional (batch, time) mask; non-zero / True marks frames
                to keep. Boolean and integer 0/1 masks are both accepted.
        Returns:
            (batch, hidden_size * 2) — weighted mean and std
        """
        # One unnormalised attention score per frame.
        scores = self.attention(x).squeeze(-1)  # (batch, time)

        if mask is not None:
            # Cast first: `~` on an integer mask is a bitwise NOT, and
            # masked_fill only accepts boolean masks.
            keep = mask.bool()
            # Use the most negative finite value instead of -inf. Masked
            # frames still get exactly zero weight after softmax, but a row
            # with every frame masked degrades to uniform weights instead of
            # producing NaN.
            scores = scores.masked_fill(~keep, torch.finfo(scores.dtype).min)

        weights = F.softmax(scores, dim=-1).unsqueeze(-1)  # (batch, time, 1)

        # Weighted mean
        mean = torch.sum(x * weights, dim=1)  # (batch, hidden)

        # Weighted std; the clamp keeps sqrt (and its gradient) finite when
        # the variance is zero.
        var = torch.sum(weights * (x - mean.unsqueeze(1)) ** 2, dim=1)
        std = torch.sqrt(var.clamp(min=1e-6))

        return torch.cat([mean, std], dim=-1)  # (batch, hidden*2)
class MultiHeadAttentionPooling(nn.Module):
    """
    Multi-Head Attention Pooling.

    A single learned query vector attends over all frames of the sequence
    through multi-head attention (frames act as keys and values); the one
    attended output vector is the pooled representation.
    """

    def __init__(self, hidden_size: int, num_heads: int = 4):
        """
        Args:
            hidden_size: feature dimension of each input frame.
            num_heads: number of attention heads.
        """
        super().__init__()
        self.num_heads = num_heads
        # Learned pooling query, shared by every sequence in the batch.
        self.query = nn.Parameter(torch.randn(1, 1, hidden_size))
        self.mha = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.output_size = hidden_size
        nn.init.xavier_uniform_(self.query)

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """
        Args:
            x: (batch, time, hidden_size)
            mask: optional (batch, time) mask; non-zero / True marks frames
                to keep. Masked frames are excluded from the attention.
        Returns:
            (batch, hidden_size)
        """
        batch_size = x.size(0)
        query = self.query.expand(batch_size, -1, -1)  # (batch, 1, hidden)

        # nn.MultiheadAttention uses the opposite convention: True in
        # key_padding_mask means "ignore this key", so invert the keep-mask.
        key_padding_mask = None
        if mask is not None:
            key_padding_mask = ~mask.bool()

        out, _ = self.mha(query, x, x, key_padding_mask=key_padding_mask)  # (batch, 1, hidden)
        return out.squeeze(1)  # (batch, hidden)
class MeanPooling(nn.Module):
    """Simple mean pooling over the time axis, optionally restricted by a mask."""

    def __init__(self, hidden_size: int):
        """
        Args:
            hidden_size: feature dimension of each input frame; also the
                size of the pooled output.
        """
        super().__init__()
        self.output_size = hidden_size

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """
        Args:
            x: (batch, time, hidden_size)
            mask: optional (batch, time) mask; non-zero / True marks frames
                that contribute to the mean.
        Returns:
            (batch, hidden_size) — mean over the kept frames. A row whose
            mask is entirely zero yields zeros instead of NaN.
        """
        if mask is None:
            return x.mean(dim=1)

        weights = mask.unsqueeze(-1).float()  # (batch, time, 1)
        total = (x * weights).sum(dim=1)  # (batch, hidden)
        count = weights.sum(dim=1)  # (batch, 1)
        # Avoid 0 / 0 for fully masked rows. The numerator is already zero
        # there, so dividing by 1 returns a zero vector.
        count = count.masked_fill(count == 0, 1.0)
        return total / count
class DeepfakeClassifier(nn.Module):
    """
    Full classification model = Backbone + Pooling + MLP Head.

    The backbone turns the input into a sequence of hidden states, the
    pooling layer collapses the time axis, and a three-layer MLP with batch
    norm and dropout maps the pooled vector to class logits.
    """

    def __init__(self, backbone: nn.Module, hidden_size: int,
                 num_labels: int = 2, classifier_hidden: int = 256,
                 dropout: float = 0.3, pooling_type: str = "attentive_stats"):
        """
        Args:
            backbone: feature extractor; called as
                ``backbone(input_values, attention_mask=...)`` and expected
                to return an object with a ``last_hidden_state`` attribute.
            hidden_size: feature dimension of the backbone's hidden states.
            num_labels: number of output classes.
            classifier_hidden: width of the first MLP layer (the second
                layer is half of it).
            dropout: dropout probability after the first MLP layer (half of
                it is used after the second).
            pooling_type: one of "attentive_stats", "multi_head", "mean".
        Raises:
            ValueError: if ``pooling_type`` is not one of the known names.
        """
        super().__init__()
        self.backbone = backbone

        # Select pooling strategy
        if pooling_type == "attentive_stats":
            self.pooling = AttentiveStatsPooling(hidden_size)
        elif pooling_type == "multi_head":
            self.pooling = MultiHeadAttentionPooling(hidden_size)
        elif pooling_type == "mean":
            self.pooling = MeanPooling(hidden_size)
        else:
            raise ValueError(f"Unknown pooling: {pooling_type}")
        pool_output_size = self.pooling.output_size

        # MLP classification head with batch norm
        self.classifier = nn.Sequential(
            nn.Linear(pool_output_size, classifier_hidden),
            nn.BatchNorm1d(classifier_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(classifier_hidden, classifier_hidden // 2),
            nn.BatchNorm1d(classifier_hidden // 2),
            nn.ReLU(),
            nn.Dropout(dropout / 2),
            nn.Linear(classifier_hidden // 2, num_labels),
        )

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal weights and zero biases for the head's Linear layers."""
        for m in self.classifier:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                nn.init.zeros_(m.bias)

    def _frame_mask(self, hidden_states: torch.Tensor,
                    attention_mask: torch.Tensor = None):
        """Return a (batch, seq_len) boolean keep-mask for pooling, or None.

        The attention mask given to the model may be at input resolution
        while the hidden states are at frame resolution. A mask is returned
        only when it can be aligned with the hidden states reliably:

        * the mask already has one entry per hidden-state frame, or
        * the backbone exposes ``_get_feature_vector_attention_mask`` (as
          Hugging Face Wav2Vec2-style encoders do) to downsample it.

        Otherwise None is returned and pooling runs over every frame.
        """
        if attention_mask is None:
            return None
        seq_len = hidden_states.size(1)
        if attention_mask.dim() == 2 and attention_mask.size(1) == seq_len:
            return attention_mask.bool()
        to_frames = getattr(self.backbone, "_get_feature_vector_attention_mask", None)
        if callable(to_frames):
            return to_frames(seq_len, attention_mask).bool()
        return None

    def forward(self, input_values: torch.Tensor,
                attention_mask: torch.Tensor = None) -> torch.Tensor:
        """
        Args:
            input_values: (batch, time) raw waveform
            attention_mask: (batch, time) attention mask
        Returns:
            logits: (batch, num_labels)
        """
        # Extract features from backbone
        outputs = self.backbone(input_values, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # (batch, seq_len, hidden)

        # Pool across time, skipping padded frames when a frame-level mask
        # can be derived.
        pooled = self.pooling(
            hidden_states, self._frame_mask(hidden_states, attention_mask)
        )  # (batch, pool_dim)

        # Classify
        return self.classifier(pooled)  # (batch, num_labels)

    def extract_embeddings(self, input_values: torch.Tensor,
                           attention_mask: torch.Tensor = None) -> torch.Tensor:
        """Extract embeddings (before classification head) for analysis.

        Args:
            input_values: (batch, time) raw waveform
            attention_mask: optional (batch, time) attention mask, handled
                the same way as in ``forward``.
        Returns:
            (batch, pool_dim) pooled embeddings.
        """
        if attention_mask is None:
            # Keep the original call shape when no mask is supplied.
            outputs = self.backbone(input_values)
        else:
            outputs = self.backbone(input_values, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        return self.pooling(
            hidden_states, self._frame_mask(hidden_states, attention_mask)
        )
|