nvidia
/

Hymba-1.5B-Instruct

@@ -1679,102 +1679,110 @@ class HymbaBlock(nn.Module):
             A = -torch.exp(self.A_log[index].float())
             time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
-            # try:
-            if use_precomputed_states:
-                scan_outputs = selective_state_update(
-                    cache_params.ssm_states[self.layer_idx],
-                    hidden_states[..., 0],
-                    discrete_time_step[..., 0],
-                    A,
-                    B[:, 0],
-                    C[:, 0],
-                    self.D[index],
-                    gate[..., 0],
-                    time_proj_bias,
-                    dt_softplus=True,
-                ).unsqueeze(-1)
-            else:
-                outputs = selective_scan_fn(
-                    hidden_states,
-                    discrete_time_step,
-                    A,
-                    B.transpose(1, 2),
-                    C.transpose(1, 2),
-                    self.D[index].float(),
-                    z=gate,
-                    delta_bias=time_proj_bias,
-                    delta_softplus=True,
-                    return_last_state=True,
-                )
-                if len(outputs) == 3:
-                    scan_outputs, ssm_state, _ = outputs
                 else:
-                    scan_outputs, ssm_state = outputs
-                if ssm_state is not None and cache_params is not None:
-                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
-            # except Exception as e:
-            #     print("\n\n\n\n")
-            #     print(e)
-            #     print(f"use_precomputed_states {use_precomputed_states}; {index} {self.D}, {time_proj_bias} ")
-            #     print(f"{self.D[index]} ")
-            #     # cache_params.ssm_states[self.layer_idx],
-            #     #         hidden_states[..., 0],
-            #     #         discrete_time_step[..., 0],
-            #     #         A,
-            #     #         B[:, 0],
-            #     #         C[:, 0],
-            #     #         self.D[index],
-            #     #         gate[..., 0],
-            #     #         time_proj_bias,
-            #     print("=== Variable Values ===")
-            #     try:
-            #         print(f"cache_params.ssm_states[{self.layer_idx}]: {cache_params.ssm_states[self.layer_idx]}")
-            #     except Exception as e:
-            #         print(f"Error accessing cache_params.ssm_states[{self.layer_idx}]: {e}")
-            #     try:
-            #         print(f"hidden_states[..., 0]: {hidden_states[..., 0]}")
-            #     except Exception as e:
-            #         print(f"Error accessing hidden_states[..., 0]: {e}")
-            #     try:
-            #         print(f"discrete_time_step[..., 0]: {discrete_time_step[..., 0]}")
-            #     except Exception as e:
-            #         print(f"Error accessing discrete_time_step[..., 0]: {e}")
-            #     try:
-            #         print(f"A: {A}")
-            #     except Exception as e:
-            #         print(f"Error accessing A: {e}")
-            #     try:
-            #         print(f"B[:, 0]: {B[:, 0]}")
-            #     except Exception as e:
-            #         print(f"Error accessing B[:, 0]: {e}")
-            #     try:
-            #         print(f"C[:, 0]: {C[:, 0]}")
-            #     except Exception as e:
-            #         print(f"Error accessing C[:, 0]: {e}")
-            #     try:
-            #         print(f"D[index]: {self.D[index]}")
-            #     except Exception as e:
-            #         print(f"Error accessing D[{index}]: {e}")
-            #     try:
-            #         print(f"gate[..., 0]: {gate[..., 0]}")
-            #     except Exception as e:
-            #         print(f"Error accessing gate[..., 0]: {e}")
-            #     try:
-            #         print(f"time_proj_bias: {time_proj_bias}")
-            #     except Exception as e:
-            #         print(f"Error accessing time_proj_bias: {e}")
-            #     print("\n\n\n\n")
             scan_outputs = scan_outputs.transpose(1, 2)

             A = -torch.exp(self.A_log[index].float())
             time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
+            try:
+                if use_precomputed_states:
+                    scan_outputs = selective_state_update(
+                        cache_params.ssm_states[self.layer_idx],
+                        hidden_states[..., 0],
+                        discrete_time_step[..., 0],
+                        A,
+                        B[:, 0],
+                        C[:, 0],
+                        self.D[index],
+                        gate[..., 0],
+                        time_proj_bias,
+                        dt_softplus=True,
+                    ).unsqueeze(-1)
                 else:
+                    outputs = selective_scan_fn(
+                        hidden_states,
+                        discrete_time_step,
+                        A,
+                        B.transpose(1, 2),
+                        C.transpose(1, 2),
+                        self.D[index].float(),
+                        z=gate,
+                        delta_bias=time_proj_bias,
+                        delta_softplus=True,
+                        return_last_state=True,
+                    )
+                    if len(outputs) == 3:
+                        scan_outputs, ssm_state, _ = outputs
+                    else:
+                        scan_outputs, ssm_state = outputs
+                    if ssm_state is not None and cache_params is not None:
+                        cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+            except Exception as e:
+                print("\n\n\n\n")
+                print(e)
+                print(f"use_precomputed_states {use_precomputed_states}; {index} {self.D}, {time_proj_bias} ")
+                print(f"{self.D[index]} ")
+                # cache_params.ssm_states[self.layer_idx],
+                #         hidden_states[..., 0],
+                #         discrete_time_step[..., 0],
+                #         A,
+                #         B[:, 0],
+                #         C[:, 0],
+                #         self.D[index],
+                #         gate[..., 0],
+                #         time_proj_bias,
+                print("=== Variable Values ===")
+                try:
+                    print(f"cache_params.ssm_states[{self.layer_idx}]: {cache_params.ssm_states[self.layer_idx]}")
+                    print(f"{cache_params.ssm_states[self.layer_idx].shape}")
+                except Exception as e:
+                    print(f"Error accessing cache_params.ssm_states[{self.layer_idx}]: {e}")
+                try:
+                    print(f"hidden_states[..., 0]: {hidden_states[..., 0]}")
+                    print(f"hidden_states[..., 0] shape: {hidden_states[..., 0].shape}")
+                except Exception as e:
+                    print(f"Error accessing hidden_states[..., 0]: {e}")
+                try:
+                    print(f"discrete_time_step[..., 0]: {discrete_time_step[..., 0]}")
+                    print(f"discrete_time_step[..., 0].shape: {discrete_time_step[..., 0].shape}")
+                except Exception as e:
+                    print(f"Error accessing discrete_time_step[..., 0]: {e}")
+                try:
+                    print(f"A: {A}")
+                    print(f"A.shape: {A.shape}")
+                except Exception as e:
+                    print(f"Error accessing A: {e}")
+                try:
+                    print(f"B[:, 0]: {B[:, 0].shape}")
+                    print(f"B[:, 0].shape: {B[:, 0].shape}")
+                except Exception as e:
+                    print(f"Error accessing B[:, 0]: {e}")
+                try:
+                    print(f"C[:, 0]: {C[:, 0]}")
+                    print(f"C[:, 0].shape: {C[:, 0].shape}")
+                except Exception as e:
+                    print(f"Error accessing C[:, 0]: {e}")
+                try:
+                    print(f"D[index]: {self.D[index]}")
+                    print(f"D[index].shape: {self.D[index].shape}")
+                except Exception as e:
+                    print(f"Error accessing D[{index}]: {e}")
+                try:
+                    print(f"gate[..., 0]: {gate[..., 0]}")
+                    print(f"gate[..., 0].shape: {gate[..., 0].shape}")
+                except Exception as e:
+                    print(f"Error accessing gate[..., 0]: {e}")
+                try:
+                    print(f"time_proj_bias: {time_proj_bias}")
+                except Exception as e:
+                    print(f"Error accessing time_proj_bias: {e}")
+                print("\n\n\n\n")
             scan_outputs = scan_outputs.transpose(1, 2)