Update README.md
README.md CHANGED
@@ -14,19 +14,21 @@ tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
 # === Step 1: Define tiny model config ===
 config = MambaConfig(
-    d_model=
-    n_layer=2,
-    d_state=
-    expand=2,
-    conv_kernel=3,
-    vocab_size=50280,
+    d_model=16,            # Dimensionality of the input embeddings (model hidden size)
+    n_layer=2,             # Number of Mamba layers (or blocks) in the model
+    d_state=32,            # Dimensionality of the internal state used in the Mamba block (e.g., for state-space modeling)
+    expand=2,              # Expansion factor used in the Mamba block, typically to widen the intermediate dimensions
+    conv_kernel=3,         # Size of the convolution kernel used in the Mamba block (affects temporal mixing)
+    vocab_size=50280,      # Size of the vocabulary (number of unique tokens)
+    num_hidden_layers=32,  # Total number of hidden layers in the model (could override `n_layer`)
+    hidden_size=64,        # Size of hidden states used in the model layers (could override `d_model`)
 )
 
 # === Step 2: Create model from config ===
 model = MambaForCausalLM(config)
 
 # === Step 4: Save model and tokenizer to disk ===
-output_dir = "./tiny-
+output_dir = "./tiny-mamba2"
 os.makedirs(output_dir, exist_ok=True)
 model.save_pretrained(output_dir)
 tokenizer.save_pretrained(output_dir)
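A quick way to sanity-check the snippet above is to round-trip the saved artifacts. The following is a minimal sketch, not part of the README change itself, assuming the `./tiny-mamba2` directory written by `save_pretrained` and the standard `transformers` Mamba classes:

```python
# Minimal sketch (illustration only): reload the tiny model and tokenizer
# saved to ./tiny-mamba2 above, then run a short generation as a smoke test.
import torch
from transformers import AutoTokenizer, MambaForCausalLM

model = MambaForCausalLM.from_pretrained("./tiny-mamba2")
tokenizer = AutoTokenizer.from_pretrained("./tiny-mamba2")

print(f"parameters: {model.num_parameters():,}")  # confirm the model really is tiny

inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=5)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

Because the config passes both `d_model`/`n_layer` and `hidden_size`/`num_hidden_layers`, printing `model.config` is also a cheap way to confirm which of the potentially overlapping values the instantiated model actually uses.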