Lakoc committed on
Commit
57fe226
·
verified ·
1 Parent(s): 7e5f173

Update SCBs.py

Browse files
Files changed (1) hide show
  1. SCBs.py +24 -19
SCBs.py CHANGED
@@ -88,6 +88,8 @@ class CrossAttentionEnrollBlockNew(nn.Module):
88
  nn.Dropout(config.dropout if hasattr(config, 'dropout') else 0.1)
89
  )
90
 
 
 
91
 
92
 
93
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -97,25 +99,28 @@ class CrossAttentionEnrollBlockNew(nn.Module):
97
  Returns:
98
  Updated hidden states of same shape
99
  """
100
- q_channel = hidden_states[:, 0] # (B, T, F)
101
- kv_channel = hidden_states[:, 1] # (B, T, F)
102
-
103
- # Cross-attention
104
- attn_output = self.cross_attn(
105
- hidden_states=q_channel,
106
- key_value_states=kv_channel,
107
- output_attentions=False
108
- )[0]
109
-
110
- # Concatenate attention output with original normalized query
111
- q_concat = torch.cat([attn_output, q_channel], dim=-1) # (B, T, 2*F)
112
-
113
- # Feed-forward processing (no normalization to preserve initialization)
114
- # updated_q = self.ffn(q_concat) # (B, T, F)
115
- updated_q = q_channel + torch.tanh(self.cross_gate) * self.ffn(q_concat)
116
-
117
- # Return stacked result (only query channel is updated)
118
- return torch.stack([updated_q, kv_channel], dim=1)
 
 
 
119
 
120
  class SpeakerCommunicationBlock(nn.Module):
121
  def __init__(self, config):
 
88
  nn.Dropout(config.dropout if hasattr(config, 'dropout') else 0.1)
89
  )
90
 
91
+ self.enabled = True
92
+
93
 
94
 
95
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
99
  Returns:
100
  Updated hidden states of same shape
101
  """
102
+ if self.enabled:
103
+ q_channel = hidden_states[:, 0] # (B, T, F)
104
+ kv_channel = hidden_states[:, 1] # (B, T, F)
105
+
106
+ # Cross-attention
107
+ attn_output = self.cross_attn(
108
+ hidden_states=q_channel,
109
+ key_value_states=kv_channel,
110
+ output_attentions=False
111
+ )[0]
112
+
113
+ # Concatenate attention output with original normalized query
114
+ q_concat = torch.cat([attn_output, q_channel], dim=-1) # (B, T, 2*F)
115
+
116
+ # Feed-forward processing (no normalization to preserve initialization)
117
+ # updated_q = self.ffn(q_concat) # (B, T, F)
118
+ updated_q = q_channel + torch.tanh(self.cross_gate) * self.ffn(q_concat)
119
+
120
+ # Return stacked result (only query channel is updated)
121
+ return torch.stack([updated_q, kv_channel], dim=1)
122
+ else:
123
+ return hidden_states
124
 
125
  class SpeakerCommunicationBlock(nn.Module):
126
  def __init__(self, config):