nvidia
/

Hymba-1.5B-Instruct

@@ -1679,40 +1679,43 @@ class HymbaBlock(nn.Module):
             A = -torch.exp(self.A_log[index].float())
             time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
-            if use_precomputed_states:
-                scan_outputs = selective_state_update(
-                    cache_params.ssm_states[self.layer_idx],
-                    hidden_states[..., 0],
-                    discrete_time_step[..., 0],
-                    A,
-                    B[:, 0],
-                    C[:, 0],
-                    self.D[index],
-                    gate[..., 0],
-                    time_proj_bias,
-                    dt_softplus=True,
-                ).unsqueeze(-1)
-            else:
-                outputs = selective_scan_fn(
-                    hidden_states,
-                    discrete_time_step,
-                    A,
-                    B.transpose(1, 2),
-                    C.transpose(1, 2),
-                    self.D[index].float(),
-                    z=gate,
-                    delta_bias=time_proj_bias,
-                    delta_softplus=True,
-                    return_last_state=True,
-                )
-                if len(outputs) == 3:
-                    scan_outputs, ssm_state, _ = outputs
                 else:
-                    scan_outputs, ssm_state = outputs
-                if ssm_state is not None and cache_params is not None:
-                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
             scan_outputs = scan_outputs.transpose(1, 2)

             A = -torch.exp(self.A_log[index].float())
             time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
+            try:
+                if use_precomputed_states:
+                    scan_outputs = selective_state_update(
+                        cache_params.ssm_states[self.layer_idx],
+                        hidden_states[..., 0],
+                        discrete_time_step[..., 0],
+                        A,
+                        B[:, 0],
+                        C[:, 0],
+                        self.D[index],
+                        gate[..., 0],
+                        time_proj_bias,
+                        dt_softplus=True,
+                    ).unsqueeze(-1)
                 else:
+                    outputs = selective_scan_fn(
+                        hidden_states,
+                        discrete_time_step,
+                        A,
+                        B.transpose(1, 2),
+                        C.transpose(1, 2),
+                        self.D[index].float(),
+                        z=gate,
+                        delta_bias=time_proj_bias,
+                        delta_softplus=True,
+                        return_last_state=True,
+                    )
+                    if len(outputs) == 3:
+                        scan_outputs, ssm_state, _ = outputs
+                    else:
+                        scan_outputs, ssm_state = outputs
+                    if ssm_state is not None and cache_params is not None:
+                        cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+            except:
+                print(f"use_precomputed_states {use_precomputed_states}; {index} {self.D}, {delta_bias} ")
             scan_outputs = scan_outputs.transpose(1, 2)