mjschock
/

mamba-130m

@@ -1,21 +1,19 @@
 {
-  "architectures": [
-    "MambaModelForCausalLM"
-  ],
   "auto_map": {
-    "AutoConfig": "configuration_mamba.MambaConfig",
-    "AutoModel": "modeling_mamba.MambaModel",
-    "AutoModelForCausalLM": "modeling_mamba.MambaModelForCausalLM"
   },
   "d_model": 768,
-  "fused_add_norm": true,
   "model_type": "mamba",
   "n_layer": 24,
   "pad_vocab_size_multiple": 8,
-  "residual_in_fp32": true,
-  "rms_norm": true,
-  "ssm_cfg": {},
-  "torch_dtype": "float16",
   "transformers_version": "4.37.2",
-  "vocab_size": 50277
 }

 {
   "auto_map": {
+    "AutoConfig": "configuration_mamba.MambaConfig"
   },
+  "bias": false,
+  "conv_bias": true,
+  "d_conv": 4,
+  "d_inner": 1536,
   "d_model": 768,
+  "d_state": 16,
+  "dt_rank": 48,
+  "expand": 2,
+  "hidden_size": 768,
   "model_type": "mamba",
   "n_layer": 24,
   "pad_vocab_size_multiple": 8,
   "transformers_version": "4.37.2",
+  "vocab_size": 50280
 }

configuration_mamba.py CHANGED Viewed

@@ -1,30 +1,61 @@
-import mamba_ssm
 from transformers import PretrainedConfig
-mamba_config_defaults = mamba_ssm.models.config_mamba.MambaConfig()
 class MambaConfig(PretrainedConfig):
     """Configuration for a Mamba model, stored as a ``PretrainedConfig``.

     Every default is read from ``mamba_config_defaults``, the module-level
     ``mamba_ssm`` config instance, so the defaults follow whatever the
     installed ``mamba_ssm`` package declares.

     Args:
         d_model: Stored unchanged as ``self.d_model``.
         fused_add_norm: Stored unchanged as ``self.fused_add_norm``.
         n_layer: Stored unchanged as ``self.n_layer``.
         pad_vocab_size_multiple: Stored unchanged as
             ``self.pad_vocab_size_multiple``.
         residual_in_fp32: Stored unchanged as ``self.residual_in_fp32``.
         rms_norm: Stored unchanged as ``self.rms_norm``.
         ssm_cfg: Mapping of extra settings; a shallow copy is stored as
             ``self.ssm_cfg``.
         vocab_size: Stored unchanged as ``self.vocab_size``.
         **kwargs: Forwarded to ``PretrainedConfig.__init__``.
     """

     model_type = "mamba"

     def __init__(
         self,
         d_model: int = mamba_config_defaults.d_model,
         fused_add_norm: bool = mamba_config_defaults.fused_add_norm,
         n_layer: int = mamba_config_defaults.n_layer,
         pad_vocab_size_multiple: int = mamba_config_defaults.pad_vocab_size_multiple,
         residual_in_fp32: bool = mamba_config_defaults.residual_in_fp32,
         rms_norm: bool = mamba_config_defaults.rms_norm,
         ssm_cfg: dict = mamba_config_defaults.ssm_cfg,
         vocab_size: int = mamba_config_defaults.vocab_size,
         **kwargs,
     ):
         self.d_model = d_model
         self.fused_add_norm = fused_add_norm
         self.n_layer = n_layer
         self.pad_vocab_size_multiple = pad_vocab_size_multiple
         self.residual_in_fp32 = residual_in_fp32
         self.rms_norm = rms_norm
         # The default is the one dict owned by ``mamba_config_defaults``;
         # copy it so mutating one config never leaks into another instance
         # (or into the shared defaults object).
         self.ssm_cfg = dict(ssm_cfg) if ssm_cfg is not None else {}
         self.vocab_size = vocab_size
         # Called last so the base class receives only the leftover kwargs.
         super().__init__(**kwargs)

+import math
+from typing import Union
 from transformers import PretrainedConfig
+# Inspired by:
+# - https://huggingface.co/Q-bert/Mamba-130M/blob/9fad7fb5fb9c9416fab4f70ecd62498478be2074/configuration_mamba.py#L5
+# - https://github.com/johnma2006/mamba-minimal/blob/03de542a36d873f6e6c4057ad687278cc6ae944d/model.py#L33
+# - https://github.com/state-spaces/mamba/blob/009bec5ee37f586844a3fc89c040a9c1a9d8badf/mamba_ssm/models/config_mamba.py#L5
 class MambaConfig(PretrainedConfig):
     """Configuration for a Mamba model, stored as a ``PretrainedConfig``.

     The constructor stores its arguments on the instance and derives three
     further values:

     * ``d_inner`` is ``int(expand * d_model)``.
     * ``dt_rank`` becomes ``ceil(d_model / 16)`` when ``"auto"`` is given.
     * ``vocab_size`` is rounded up to the next multiple of
       ``pad_vocab_size_multiple``.

     ``hidden_size`` is set to the same value as ``d_model``.

     Args:
         bias: Stored unchanged (presumably toggles bias terms on the linear
             projections; confirm against the modeling code).
         conv_bias: Stored unchanged (presumably toggles the convolution
             bias; confirm against the modeling code).
         d_conv: Stored unchanged.
         d_model: Stored unchanged; also exposed as ``hidden_size``.
         d_state: Stored unchanged.
         dt_rank: Either an explicit integer or the string ``"auto"``.
         expand: Multiplier applied to ``d_model`` to obtain ``d_inner``.
         n_layer: Stored unchanged.
         pad_vocab_size_multiple: ``vocab_size`` is rounded up to a multiple
             of this value.
         vocab_size: Vocabulary size before padding.
         **kwargs: Forwarded to ``PretrainedConfig.__init__``.

     Raises:
         ValueError: If ``dt_rank`` is a string other than ``"auto"``.
     """

     model_type: str = "mamba"

     def __init__(
         self,
         bias: bool = False,
         conv_bias: bool = True,
         d_conv: int = 4,
         d_model: int = 2560,
         d_state: int = 16,
         dt_rank: Union[int, str] = "auto",
         expand: int = 2,
         n_layer: int = 64,  # TODO: Rename to num_hidden_layers?
         pad_vocab_size_multiple: int = 8,
         vocab_size: int = 50277,
         **kwargs,
     ) -> None:
         # NOTE: fused_add_norm, initializer_range, residual_in_fp32, rms_norm
         # and ssm_config are not exposed by this config; they were left
         # commented out of the signature.

         if isinstance(dt_rank, str):
             # Fail here rather than storing a string the model cannot use.
             if dt_rank != "auto":
                 raise ValueError(
                     f"dt_rank must be an int or 'auto', got {dt_rank!r}"
                 )
             # The divisor 16 follows the reference implementations linked
             # above this class. NOTE(review): it appears to be a fixed
             # constant rather than d_state (whose default is also 16);
             # confirm against upstream.
             dt_rank = math.ceil(d_model / 16)

         # Round the vocabulary size up to the next multiple of the padding.
         remainder = vocab_size % pad_vocab_size_multiple
         if remainder != 0:
             vocab_size += pad_vocab_size_multiple - remainder

         self.bias = bias
         self.conv_bias = conv_bias
         self.d_conv = d_conv
         self.d_model = d_model
         self.d_state = d_state
         self.dt_rank = dt_rank
         self.expand = expand
         self.n_layer = n_layer
         self.pad_vocab_size_multiple = pad_vocab_size_multiple
         self.vocab_size = vocab_size
         self.d_inner = int(expand * d_model)

         # TODO: According to https://huggingface.co/docs/transformers/create_a_model#configuration,
         # "all NLP models have the hidden_size, num_attention_heads, num_hidden_layers and vocab_size attributes in common."
         self.hidden_size = d_model

         # Called last so the base class receives only the leftover kwargs.
         super().__init__(**kwargs)