openNemo-9B

@@ -482,9 +482,11 @@ class NemotronHMamba2Mixer(nn.Module):
         # Decay matrix L via cumsum difference — replaces segment_sum which
         # expanded [chunk] → [chunk, chunk] via O(n^2) broadcast.
         # Math: L[i,j] = exp(A_cumsum[i] - A_cumsum[j]) for j <= i
-        L = torch.exp(A_cumsum[..., :, None] - A_cumsum[..., None, :])
-        L = L * torch.tril(torch.ones(
-            self.chunk_size, self.chunk_size, device=L.device, dtype=L.dtype))
         # Contract ssm_state via einsum FIRST — avoids materializing the
         # [chunk, chunk, heads, state] outer product (was 68GB in fp32).
@@ -510,11 +512,13 @@ class NemotronHMamba2Mixer(nn.Module):
         states = torch.cat([previous_states, states], dim=1)
         # Inter-chunk decay via cumsum difference (n_chunks is small, ~16)
         chunk_cumA = torch.cumsum(F.pad(A_cumsum[:, :, :, -1], (1, 0)), dim=-1)
         n_plus1 = chunk_cumA.shape[-1]
-        decay_chunk = torch.exp(chunk_cumA[..., :, None] - chunk_cumA[..., None, :])
-        decay_chunk = decay_chunk * torch.tril(torch.ones(
-            n_plus1, n_plus1, device=decay_chunk.device, dtype=decay_chunk.dtype))
         decay_chunk = decay_chunk.transpose(1, 3)
         # Contract n_chunks+1 via einsum

         # Decay matrix L via cumsum difference — replaces segment_sum which
         # expanded [chunk] → [chunk, chunk] via O(n^2) broadcast.
         # Math: L[i,j] = exp(A_cumsum[i] - A_cumsum[j]) for j <= i
+        # Mask BEFORE exp: upper-triangle (j > i) entries can overflow to inf in bf16, and a post-exp tril multiply then gives inf * 0 = NaN.
+        L_arg = A_cumsum[..., :, None] - A_cumsum[..., None, :]
+        causal_mask = torch.tril(torch.ones(
+            self.chunk_size, self.chunk_size, device=L_arg.device, dtype=torch.bool))
+        L = torch.exp(L_arg.masked_fill(~causal_mask, float('-inf')))
         # Contract ssm_state via einsum FIRST — avoids materializing the
         # [chunk, chunk, heads, state] outer product (was 68GB in fp32).
         states = torch.cat([previous_states, states], dim=1)
         # Inter-chunk decay via cumsum difference (n_chunks is small, ~16)
+        # Mask with -inf BEFORE exp so masked entries become exactly 0 rather than inf * 0 = NaN in bf16.
         chunk_cumA = torch.cumsum(F.pad(A_cumsum[:, :, :, -1], (1, 0)), dim=-1)
         n_plus1 = chunk_cumA.shape[-1]
+        decay_arg = chunk_cumA[..., :, None] - chunk_cumA[..., None, :]
+        chunk_mask = torch.tril(torch.ones(
+            n_plus1, n_plus1, device=decay_arg.device, dtype=torch.bool))
+        decay_chunk = torch.exp(decay_arg.masked_fill(~chunk_mask, float('-inf')))
         decay_chunk = decay_chunk.transpose(1, 3)
         # Contract n_chunks+1 via einsum