Update README.md
README.md CHANGED
@@ -14,19 +14,21 @@ tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
 # === Step 1: Define tiny model config ===
 config = MambaConfig(
-    d_model=
-    n_layer=2,
-    d_state=
-    expand=2,
-    conv_kernel=3,
-    vocab_size=50280,
+    d_model=16,            # Dimensionality of the input embeddings (model hidden size)
+    n_layer=2,             # Number of Mamba layers (or blocks) in the model
+    d_state=32,            # Dimensionality of the internal state used in the Mamba block (e.g., for state-space modeling)
+    expand=2,              # Expansion factor used in the Mamba block, typically to widen the intermediate dimensions
+    conv_kernel=3,         # Size of the convolution kernel used in the Mamba block (affects temporal mixing)
+    vocab_size=50280,      # Size of the vocabulary (number of unique tokens)
+    num_hidden_layers=32,  # Total number of hidden layers in the model (could override `n_layer`)
+    hidden_size=64,        # Size of hidden states used in the model layers (could override `d_model`)
 )
 
 # === Step 2: Create model from config ===
 model = MambaForCausalLM(config)
 
 # === Step 4: Save model and tokenizer to disk ===
-output_dir = "./tiny-
+output_dir = "./tiny-mamba2"
 os.makedirs(output_dir, exist_ok=True)
 model.save_pretrained(output_dir)
 tokenizer.save_pretrained(output_dir)
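A quick way to sanity-check the snippet above is to round-trip the saved artifacts. The following is a minimal sketch, not part of the README change itself, assuming the `./tiny-mamba2` directory written by `save_pretrained` and the standard `transformers` Mamba classes:

```python
# Minimal sketch (illustration only): reload the tiny model and tokenizer
# saved to ./tiny-mamba2 above, then run a short generation as a smoke test.
import torch
from transformers import AutoTokenizer, MambaForCausalLM

model = MambaForCausalLM.from_pretrained("./tiny-mamba2")
tokenizer = AutoTokenizer.from_pretrained("./tiny-mamba2")

print(f"parameters: {model.num_parameters():,}")  # confirm the model really is tiny

inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=5)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

Because the config passes both `d_model`/`n_layer` and `hidden_size`/`num_hidden_layers`, printing `model.config` is also a cheap way to confirm which of the potentially overlapping values the instantiated model actually uses.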