robinfaro
/

molm_log_prob

@@ -151,19 +151,16 @@ class MoLM(PreTrainedModel):
             weighted_log_probs = log_probs + log_weights_exp  # (E, B, T, V)
             combined_log_probs = torch.logsumexp(weighted_log_probs, dim=0)  # (B, T, V)
-            combined_logits = combined_log_probs  # because loss works with log-probs if used properly
         else:
             # Equal-weight mixture: every expert gets weight 1 / active_experts_count, so this
             # averages the experts' probabilities (not their log-probs), evaluated in log space.
             log_weights = torch.log(1.0 / active_experts_count.float().clamp(min=1.0)).view(1, -1, 1, 1)  # (1, B, 1, 1)
             weighted_log_probs = log_probs + log_weights
             combined_log_probs = torch.logsumexp(weighted_log_probs, dim=0)  # (B, T, V)
-            combined_logits = combined_log_probs  # because loss works with log-probs if used properly
         # Calculate the loss if targets are provided
         if targets is not None:
-            #loss = F.cross_entropy(combined_logits.view(-1, combined_logits.size(-1)), targets.view(-1), ignore_index=-1)
-            loss = F.nll_loss(combined_logits.view(-1, combined_logits.size(-1)), targets.view(-1), ignore_index=-1)
             loss_to_log = loss.item()
             # Add auxiliary router losses (only if routing is used and we're training)
@@ -188,7 +185,7 @@ class MoLM(PreTrainedModel):
             loss_to_log = None
         return Output(
-            logits=combined_logits,
             loss=loss,
             combined_log_probs=combined_log_probs,
             loss_to_log=loss_to_log,

             weighted_log_probs = log_probs + log_weights_exp  # (E, B, T, V)
             combined_log_probs = torch.logsumexp(weighted_log_probs, dim=0)  # (B, T, V)
         else:
             # Equal-weight mixture: every expert gets weight 1 / active_experts_count, so this
             # averages the experts' probabilities (not their log-probs), evaluated in log space.
             log_weights = torch.log(1.0 / active_experts_count.float().clamp(min=1.0)).view(1, -1, 1, 1)  # (1, B, 1, 1)
             weighted_log_probs = log_probs + log_weights
             combined_log_probs = torch.logsumexp(weighted_log_probs, dim=0)  # (B, T, V)
         # Calculate the loss if targets are provided
         if targets is not None:
+            loss = F.nll_loss(combined_log_probs.view(-1, combined_log_probs.size(-1)), targets.view(-1), ignore_index=-1)
             loss_to_log = loss.item()
             # Add auxiliary router losses (only if routing is used and we're training)
             loss_to_log = None
         return Output(
+            logits=torch.Tensor([expert_output for expert_output in expert_outputs]),
             loss=loss,
             combined_log_probs=combined_log_probs,
             loss_to_log=loss_to_log,