Upload model
- config.json +6 -1
- modeling_mamba.py +74 -80
config.json
CHANGED
@@ -1,6 +1,10 @@
 {
+  "architectures": [
+    "MambaLMHeadModel"
+  ],
   "auto_map": {
-    "AutoConfig": "configuration_mamba.MambaConfig"
+    "AutoConfig": "configuration_mamba.MambaConfig",
+    "AutoModelForCausalLM": "modeling_mamba.MambaLMHeadModel"
   },
   "bias": false,
   "conv_bias": true,
@@ -14,6 +18,7 @@
   "model_type": "mamba",
   "n_layer": 24,
   "pad_vocab_size_multiple": 8,
+  "torch_dtype": "float32",
   "transformers_version": "4.37.2",
   "vocab_size": 50280
 }
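The new architectures and auto_map entries are what let transformers resolve the custom classes shipped in this repository. A minimal loading sketch, assuming a placeholder repo id (the actual Hub repo id is not shown in this commit):

# Placeholder repo id used only for illustration; substitute the actual Hub repository.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "your-username/your-mamba-model"

# trust_remote_code=True lets transformers import configuration_mamba.MambaConfig and
# modeling_mamba.MambaLMHeadModel from the repository's own Python files, as declared in auto_map.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)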
modeling_mamba.py
CHANGED
@@ -380,23 +380,17 @@ class MambaModel(MambaPretrainedModel):
             **kwargs,
         )
 
-        # self.embedding = nn.Embedding(
-        #     num_embeddings=config.vocab_size,
-        #     embedding_dim=config.d_model,
-        # )
-
-
         self.embedding = nn.Embedding(
-            num_embeddings=config.vocab_size,
-            embedding_dim=config.d_model,
+            num_embeddings=self.config.vocab_size,
+            embedding_dim=self.config.d_model,
         )
 
         self.layers = nn.ModuleList(
-            [ResidualBlock(config) for _ in range(self.config.n_layer)]
+            [ResidualBlock(self.config) for _ in range(self.config.n_layer)]
         )
         # self.layers = nn.ModuleList([MambaBlock(config) for _ in range(config.n_layer)])
         # # self.norm_f = RMSNorm(d_model=embedding_dim)
-        self.norm_f = RMSNorm(config.d_model)
+        self.norm_f = RMSNorm(self.config.d_model)
 
         # self.gradient_checkpointing = False
         # # self.post_init()
@@ -454,54 +448,54 @@ class MambaModel(MambaPretrainedModel):
     # def set_input_embeddings(self, value):
     #     self.embed_out = value
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        output_hidden_states=False,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+        # ) -> BaseModelOutput:
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        batch_size = input_ids.shape[0]
+        hidden_size = self.config.hidden_size
+        hidden_states: Tuple[Tensor[(batch_size, sequence_length, hidden_size)]] = ()
+        sequence_length = input_ids.shape[1]
+        output_hidden_states = output_hidden_states or self.config.output_hidden_states
+
+        last_hidden_state = self.embedding(input_ids)
+        assert last_hidden_state.shape == (
+            batch_size,
+            sequence_length,
+            hidden_size,
+        ), f"{last_hidden_state.shape} != {(batch_size, sequence_length, hidden_size)}"
+        hidden_states += (last_hidden_state,)
+
+        for layer in self.layers:
+            last_hidden_state = layer(last_hidden_state)
+            assert last_hidden_state.shape == (
+                batch_size,
+                sequence_length,
+                hidden_size,
+            ), f"{last_hidden_state.shape} != {(batch_size, sequence_length, hidden_size)}"
+            hidden_states += (last_hidden_state,)
+
+        last_hidden_state = self.norm_f(last_hidden_state)
+        assert last_hidden_state.shape == (
+            batch_size,
+            sequence_length,
+            hidden_size,
+        ), f"{last_hidden_state.shape} != {(batch_size, sequence_length, hidden_size)}"
+        hidden_states += (last_hidden_state,)
+
+        assert (
+            len(hidden_states) == self.config.n_layer + 2
+        ), f"{len(hidden_states)} != {self.config.n_layer + 2}"
+
+        # return BaseModelOutput(
+        return BaseModelOutputWithPast(
+            hidden_states=hidden_states if output_hidden_states else None,
+            last_hidden_state=last_hidden_state,
+        )
 
 
 # Influences:
@@ -538,31 +532,31 @@ class MambaLMHeadModel(MambaPretrainedModel):
         # Initialize weights and apply final processing
        self.post_init()
 
-
-
-
-
-
-
-
-
-
-
-
-
+    def forward(
+        self, input_ids, output_hidden_states=False, **kwargs
+    ) -> CausalLMOutput:
+        batch_size = input_ids.shape[0]
+        sequence_length = input_ids.shape[1]
+        vocab_size = self.config.vocab_size
+        output_hidden_states = output_hidden_states or self.config.output_hidden_states
+
+        outputs = self.backbone(
+            input_ids=input_ids,
+            output_hidden_states=output_hidden_states,
+        )
 
-
+        last_hidden_state = outputs.last_hidden_state
 
-
-
-
-
-
+        logits: torch.FloatTensor[batch_size, sequence_length, vocab_size] = (
+            self.lm_head(
+                last_hidden_state,
+            )
+        )
 
-
-
-
-
+        return CausalLMOutput(
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            logits=logits,
+        )
 
     # # def prepare_inputs_for_generation(
     # # self, input_ids, attention_mask=None, **model_kwargs