marcoyang
/

spear-xlarge-speech-audio

Model card Files Files and versions

marcoyang committed on Nov 3, 2025

Commit

9324520

·

1 Parent(s): 43a8860

fix

Files changed (2) hide show

spear_model.py +2 -3
spear_modules.py +52 -0

spear_model.py CHANGED Viewed

@@ -237,11 +237,10 @@ class SpearEncoder(nn.Module):
         self.distillation_delta = distillation_delta
         if num_codebooks > 0:
-            from multi_quantization.prediction import JointCodebookLoss
             self.codebook_loss_net = JointCodebookLoss(
-                predictor_channels=encoder_dim,
                 num_codebooks=num_codebooks * self.teacher_frame_ratio,
-                is_joint=False,
                 reduction="none",
             )
         else:

         self.distillation_delta = distillation_delta
         if num_codebooks > 0:
+            from .spear_modules import JointCodebookLoss
             self.codebook_loss_net = JointCodebookLoss(
+                input_dim=encoder_dim,
                 num_codebooks=num_codebooks * self.teacher_frame_ratio,
                 reduction="none",
             )
         else:

spear_modules.py CHANGED Viewed

@@ -32,6 +32,58 @@ def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor:
     diff = torch.abs(x - y)
     return max_value + torch.log1p(torch.exp(-diff))
 # RuntimeError: Exporting the operator logaddexp to ONNX opset version
 # 14 is not supported. Please feel free to request support or submit

     diff = torch.abs(x - y)
     return max_value + torch.log1p(torch.exp(-diff))
+class JointCodebookLoss(torch.nn.Module):
+    """Cross-entropy loss over several codebooks sharing one linear projection.
+
+    One linear layer maps each input frame to ``num_codebooks * codebook_size``
+    logits. The logits are split into ``num_codebooks`` groups of
+    ``codebook_size`` classes and each group is scored against its own target
+    index with cross-entropy.
+
+    Args:
+        input_dim: Size of the last dimension of the input features.
+        num_codebooks: Number of codebook indices predicted per frame.
+        codebook_size: Number of classes in each codebook.
+        ignore_index: Target value that is excluded from the loss.
+        reduction: Reduction handed to cross-entropy; one of ``"none"``,
+            ``"mean"`` or ``"sum"``.
+
+    Raises:
+        ValueError: If ``reduction`` is not one of the supported values.
+    """
+
+    # The reductions accepted by torch.nn.functional.cross_entropy.
+    _REDUCTIONS = ("none", "mean", "sum")
+
+    def __init__(
+        self,
+        input_dim: int = 512,
+        num_codebooks: int = 16,
+        codebook_size: int = 256,
+        ignore_index: int = -100,
+        reduction: str = "none",
+    ):
+        super().__init__()
+        # Fail at construction time instead of on the first forward pass.
+        if reduction not in self._REDUCTIONS:
+            raise ValueError(
+                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
+            )
+        self.input_dim = input_dim
+        self.num_codebooks = num_codebooks
+        self.codebook_size = codebook_size
+        self.reduction = reduction
+        self.ignore_index = ignore_index
+        # Attribute name ``proj`` is kept so existing checkpoints still load.
+        self.proj = torch.nn.Linear(input_dim, num_codebooks * codebook_size)
+
+    def _logits(self, input: torch.Tensor) -> torch.Tensor:
+        """Project ``input`` (B, T, C) to logits (B, T, num_codebooks, codebook_size)."""
+        B, T, _ = input.shape
+        return self.proj(input).view(B, T, self.num_codebooks, self.codebook_size)
+
+    def forward_logprobs(self, input: torch.Tensor):
+        """Return log-probabilities over each codebook's classes.
+
+        Args:
+            input: Features of shape (B, T, C).
+
+        Returns:
+            Tensor of shape (B, T, num_codebooks, codebook_size), normalised
+            with log-softmax over the last dimension.
+        """
+        return torch.nn.functional.log_softmax(self._logits(input), dim=-1)
+
+    def forward(self, input, target, return_log_probs: bool = False):
+        """Compute the cross-entropy between predicted and target codebook indices.
+
+        Args:
+            input: Features of shape (B, T, C).
+            target: Codebook indices of shape (B, T, num_codebooks); entries
+                equal to ``ignore_index`` do not contribute to the loss.
+            return_log_probs: If True, also return the log-probabilities.
+
+        Returns:
+            The loss, shaped (B, T, num_codebooks) when ``reduction`` is
+            ``"none"`` and otherwise whatever cross-entropy returns for the
+            configured reduction. When ``return_log_probs`` is True, a tuple
+            ``(loss, log_probs)`` with ``log_probs`` of shape
+            (B, T, num_codebooks, codebook_size).
+        """
+        B, T, _ = input.shape
+        logits = self._logits(input)  # (B, T, N, codebook_size)
+        # cross_entropy wants (num_items, num_classes) logits and flat targets.
+        loss = torch.nn.functional.cross_entropy(
+            logits.reshape(-1, self.codebook_size),
+            target.reshape(-1),
+            ignore_index=self.ignore_index,
+            reduction=self.reduction,
+        )
+        if self.reduction == "none":
+            # Undo the flattening so the loss lines up with ``target``.
+            loss = loss.view(B, T, self.num_codebooks)
+        if return_log_probs:
+            return loss, torch.nn.functional.log_softmax(logits, dim=-1)
+        return loss
 # RuntimeError: Exporting the operator logaddexp to ONNX opset version
 # 14 is not supported. Please feel free to request support or submit