Upload model

- config.json +6 -1
- modeling_mamba.py +60 -11

config.json CHANGED
```diff
@@ -1,6 +1,10 @@
 {
+  "architectures": [
+    "MambaModelForCausalLM"
+  ],
   "auto_map": {
-    "AutoConfig": "configuration_mamba.MambaConfig"
+    "AutoConfig": "configuration_mamba.MambaConfig",
+    "AutoModelForCausalLM": "modeling_mamba.MambaModelForCausalLM"
   },
   "bias": false,
   "conv_bias": true,
@@ -14,6 +18,7 @@
   "model_type": "mamba",
   "n_layer": 24,
   "pad_vocab_size_multiple": 8,
+  "torch_dtype": "float32",
   "transformers_version": "4.37.2",
   "vocab_size": 50280
 }
```
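With `architectures` and the extended `auto_map` in place, the checkpoint can be routed through the repo's own `configuration_mamba.py` and `modeling_mamba.py` via the standard Auto classes. A minimal loading sketch; `user/mamba-hf` is a placeholder repo id, since the commit page does not show the actual model name:

```python
# Minimal loading sketch. "user/mamba-hf" is a hypothetical repo id standing
# in for this repo's actual name, which the commit page does not show.
from transformers import AutoConfig, AutoModelForCausalLM

# auto_map points the Auto classes at the custom code shipped in the repo,
# so trust_remote_code=True is required to execute it.
config = AutoConfig.from_pretrained("user/mamba-hf", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("user/mamba-hf", trust_remote_code=True)
```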
modeling_mamba.py CHANGED
```diff
@@ -241,25 +241,74 @@ class MambaModel(MambaPreTrainedModel):
     # def set_input_embeddings(self, value):
     #     self.embedding = value
 
+    # def forward(
+    #     self,
+    #     input_ids: torch.LongTensor = None,
+    #     **kwargs,
+    # ) -> Union[Tuple, BaseModelOutputWithPast]:
+    #     x = self.embedding(input_ids)
+    #     all_hidden_states = list()
+    #     for layer in self.layers:
+    #         x = layer(x)
+    #         all_hidden_states.append(x)
+
+    #     hidden_states = self.norm_f(x)
+
+    #     return BaseModelOutputWithPast(
+    #         last_hidden_state=hidden_states,
+    #         hidden_states=all_hidden_states,
+    #     )
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
+        output_hidden_states=False,
+        return_dict: Optional[bool] = None,
         **kwargs,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        x = self.embedding(input_ids)
-        all_hidden_states = list()
-        for layer in self.layers:
-            x = layer(x)
-            all_hidden_states.append(x)
-
-        hidden_states = self.norm_f(x)
+    # ) -> BaseModelOutput:
+    # ) -> Union[Tuple, BaseModelOutputWithPast]:
+    ) -> BaseModelOutputWithPast:
+        batch_size = input_ids.shape[0]
+        hidden_size = self.config.d_model
+        hidden_states: Tuple[torch.Tensor[(batch_size, sequence_length, hidden_size)]] = ()
+        sequence_length = input_ids.shape[1]
+        output_hidden_states = output_hidden_states or self.config.output_hidden_states
+
+        last_hidden_state = self.embedding(input_ids)
+        assert last_hidden_state.shape == (
+            batch_size,
+            sequence_length,
+            hidden_size,
+        ), f"{last_hidden_state.shape} != {(batch_size, sequence_length, hidden_size)}"
+        hidden_states += (last_hidden_state,)
 
+        for layer in self.layers:
+            last_hidden_state = layer(last_hidden_state)
+            assert last_hidden_state.shape == (
+                batch_size,
+                sequence_length,
+                hidden_size,
+            ), f"{last_hidden_state.shape} != {(batch_size, sequence_length, hidden_size)}"
+            hidden_states += (last_hidden_state,)
+
+        last_hidden_state = self.norm_f(last_hidden_state)
+        assert last_hidden_state.shape == (
+            batch_size,
+            sequence_length,
+            hidden_size,
+        ), f"{last_hidden_state.shape} != {(batch_size, sequence_length, hidden_size)}"
+        hidden_states += (last_hidden_state,)
+
+        assert (
+            len(hidden_states) == self.config.n_layer + 2
+        ), f"{len(hidden_states)} != {self.config.n_layer + 2}"
+
+        # return BaseModelOutput(
         return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            hidden_states=all_hidden_states,
+            hidden_states=hidden_states if output_hidden_states else None,
+            last_hidden_state=last_hidden_state,
         )
 
-
 class MambaModelForCausalLM(MambaPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
```
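The rewritten `MambaModel.forward` threads a single `last_hidden_state` through the stack and collects every intermediate into a tuple: the embedding output, each of the `n_layer` layer outputs, and the final `norm_f` output, i.e. `n_layer + 2` entries, returned only when `output_hidden_states` is requested. A hypothetical call sketch; the `backbone` name and how you obtain a bare `MambaModel` are assumptions (e.g. an attribute of the causal-LM wrapper):

```python
# Hypothetical usage sketch; `backbone` is assumed to be a MambaModel instance
# loaded from this repo (e.g. the inner model of MambaModelForCausalLM).
import torch

input_ids = torch.randint(0, 50280, (2, 16))  # (batch_size, sequence_length)
out = backbone(input_ids=input_ids, output_hidden_states=True)

# Embedding output + 24 layer outputs + final norm_f output = n_layer + 2.
assert len(out.hidden_states) == backbone.config.n_layer + 2

# last_hidden_state (and every tuple entry) has shape
# (batch_size, sequence_length, d_model).
print(out.last_hidden_state.shape)
```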