Delete contrastive_loss.py
Browse files- contrastive_loss.py +0 -140
contrastive_loss.py
DELETED
|
@@ -1,140 +0,0 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
import torch.nn as nn
|
| 3 |
-
import torch.nn.functional as F
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class ContrastiveLoss(nn.Module):
    """InfoNCE-style contrastive loss over two concatenated frame groups.

    Each batch item carries 2T frame embeddings — presumably two groups of T
    frames concatenated along time (e.g. two speakers; TODO confirm with
    callers). Same-group pairs are treated as positives and pulled together;
    cross-group pairs are negatives and pushed apart.
    """

    def __init__(self, temperature=0.25, distance_metric='cosine'):
        """
        Args:
            temperature: softmax temperature dividing all similarities.
            distance_metric: only 'cosine' is supported.
        """
        super().__init__()
        self.temperature = temperature
        self.distance_metric = distance_metric

    def compute_similarity(self, embeddings):
        """Return the temperature-scaled pairwise similarity logits.

        Args:
            embeddings: [B, 2T, D] float tensor.

        Returns:
            [B, 2T, 2T] tensor of cosine similarities divided by temperature.

        Raises:
            ValueError: if distance_metric is anything other than 'cosine'.
        """
        if self.distance_metric == 'cosine':
            embeddings = F.normalize(embeddings, p=2, dim=-1)  # [B, 2T, D]
            sim = torch.matmul(embeddings, embeddings.transpose(-1, -2))  # [B, 2T, 2T]
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return sim / self.temperature

    def pairwise_and_no_diag(self, m):
        """Pairwise AND of a boolean vector against itself, diagonal cleared.

        Args:
            m: [B, T] boolean tensor.

        Returns:
            [B, T, T] boolean tensor with out[b, i, j] = m[b, i] & m[b, j]
            and out[b, i, i] forced to False (no self-pairs).
        """
        m_i = m.unsqueeze(2)  # [B, T, 1]
        m_j = m.unsqueeze(1)  # [B, 1, T]
        out = m_i & m_j       # [B, T, T]
        diag = torch.eye(m.size(1), dtype=torch.bool, device=m.device).unsqueeze(0)
        return out & ~diag

    def forward(self, embeddings, pos_indicator_mask):
        """Compute the contrastive loss.

        Args:
            embeddings: [B, 2T, D] float tensor, the two groups' frames
                concatenated along the time axis.
            pos_indicator_mask: [B, 2T] boolean tensor marking the positions
                that belong to each group (first T = group 1, last T = group 2).

        Returns:
            Scalar contrastive loss (0.0 when no positive pair exists).
        """
        B, two_T, D = embeddings.shape
        T = two_T // 2
        sim = self.compute_similarity(embeddings)  # [B, 2T, 2T]

        # Split the indicator into the two concatenated halves.
        m1 = pos_indicator_mask[:, :T]  # [B, T]
        m2 = pos_indicator_mask[:, T:]  # [B, T]

        # Positives: same-group pairs, assembled block-diagonally so each
        # half's pairs only index into its own T columns. Diagonal excluded.
        pos_block1 = self.pairwise_and_no_diag(m1)  # [B, T, T]
        pos_block2 = self.pairwise_and_no_diag(m2)  # [B, T, T]
        pos_mask = torch.cat([
            torch.cat([pos_block1, torch.zeros_like(pos_block1)], dim=2),  # [B, T, 2T]
            torch.cat([torch.zeros_like(pos_block2), pos_block2], dim=2)   # [B, T, 2T]
        ], dim=1)  # [B, 2T, 2T]

        # Negatives: cross-group pairs where both positions are active,
        # placed in the off-diagonal blocks (and mirrored for symmetry).
        cross = m1.unsqueeze(2) & m2.unsqueeze(1)  # [B, T, T]
        neg_mask = torch.cat([
            torch.cat([torch.zeros_like(cross), cross], dim=2),                 # [B, T, 2T]
            torch.cat([cross.transpose(1, 2), torch.zeros_like(cross)], dim=2)  # [B, T, 2T]
        ], dim=1)  # [B, 2T, 2T]

        # Defensive: both masks are already diagonal-free by construction
        # (pairwise_and_no_diag / off-diagonal blocks), but clear [i, i]
        # self-pairs explicitly anyway.
        identity_mask = torch.eye(two_T, dtype=torch.bool, device=embeddings.device).unsqueeze(0)  # [1, 2T, 2T]
        pos_mask &= ~identity_mask
        neg_mask &= ~identity_mask

        if not pos_mask.any():
            # No positive pairs found: return a zero that still participates
            # in autograd so callers can backprop unconditionally.
            return torch.tensor(0.0, device=embeddings.device, requires_grad=True)

        # InfoNCE, computed entirely in log space. The previous exp-space
        # version (exp(sim) then ratios) overflowed at low temperature and
        # produced inf*0 = NaN on masked entries; log-sum-exp is exactly
        # equivalent — -log(e^p / (e^p + A)) = logaddexp(p, log A) - p —
        # but numerically stable.
        #
        # Per-anchor negative term A = (10 / 2T) * sum(exp(neg sims)),
        # matching the original "10 * mean over the full row" scaling.
        # NOTE(review): the 10/(2T) factor looks like a tuned heuristic —
        # confirm it is intentional before changing.
        neg_logits = sim.masked_fill(~neg_mask, float('-inf'))  # [B, 2T, 2T]
        log_neg = torch.logsumexp(neg_logits, dim=2) + sim.new_full((), 10.0 / two_T).log()  # [B, 2T]

        # Gather the positive logits and each one's anchor-row negative term.
        pos_logits = sim[pos_mask]  # [num_pos_pairs]
        pos_indices = torch.nonzero(pos_mask, as_tuple=False)  # [num_pos_pairs, 3]
        log_neg_for_pos = log_neg[pos_indices[:, 0], pos_indices[:, 1]]  # [num_pos_pairs]

        # -log(exp(pos) / (exp(pos) + A)); rows with no negatives give
        # log_neg = -inf and hence a loss of exactly 0, as before.
        loss = torch.logaddexp(pos_logits, log_neg_for_pos) - pos_logits
        return loss.mean()
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
# Example usage and testing
|
| 103 |
-
def create_example_data():
    """Create random example inputs matching ContrastiveLoss.forward's contract.

    The loss expects embeddings of shape [B, 2T, D] (two groups of T frames
    concatenated along time) and a boolean activity indicator of shape
    [B, 2T]. The previous version built [B, T, B, T] pair masks stacked into
    a 5-D tensor, which the loss cannot consume and which crashed the demo.

    Returns:
        embeddings: [B, 2T, D] float tensor of random frame embeddings.
        pos_indicator_mask: [B, 2T] boolean tensor of active positions.
    """
    B, T, D = 2, 3, 64

    # Random frame embeddings for both concatenated groups.
    embeddings = torch.randn(B, 2 * T, D)

    # Mark every position active, then clear one frame so the demo also
    # exercises the masking path.
    pos_indicator_mask = torch.ones(B, 2 * T, dtype=torch.bool)
    pos_indicator_mask[0, T - 1] = False

    return embeddings, pos_indicator_mask
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
if __name__ == "__main__":
    # Smoke test with inputs matching ContrastiveLoss.forward's contract:
    # embeddings [B, 2T, D] plus a boolean activity mask [B, 2T]. (The old
    # demo passed a stacked [2, B, T, B, T] pair-mask tensor, which the loss
    # cannot consume.) Data is built inline and seeded for reproducibility.
    B, T, D = 2, 3, 64
    torch.manual_seed(0)
    embeddings = torch.randn(B, 2 * T, D)
    pos_indicator_mask = torch.ones(B, 2 * T, dtype=torch.bool)

    # Initialize loss function and compute the loss.
    contrastive_loss = ContrastiveLoss(temperature=0.07, distance_metric='cosine')
    loss = contrastive_loss(embeddings, pos_indicator_mask)
    print(f"Contrastive Loss: {loss.item():.4f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|