rkazants committed · Commit fc9d126 · verified · 1 Parent(s): 67fec6a

Update README.md

Files changed (1):
  1. README.md (+9 -7)
README.md CHANGED
@@ -14,19 +14,21 @@ tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
 # === Step 1: Define tiny model config ===
 config = MambaConfig(
-    d_model=64,        # Smaller hidden dimension
-    n_layer=2,         # Just one layer
-    d_state=16,        # Minimal state size
-    expand=2,          # No expansion (linear)
-    conv_kernel=3,     # Smallest convolution kernel
-    vocab_size=50280,
+    d_model=16,            # Dimensionality of the input embeddings (model hidden size)
+    n_layer=2,             # Number of Mamba layers (or blocks) in the model
+    d_state=32,            # Dimensionality of the internal state used in the Mamba block (e.g., for state-space modeling)
+    expand=2,              # Expansion factor used in the Mamba block, typically to widen the intermediate dimensions
+    conv_kernel=3,         # Size of the convolution kernel used in the Mamba block (affects temporal mixing)
+    vocab_size=50280,      # Size of the vocabulary (number of unique tokens)
+    num_hidden_layers=32,  # Total number of hidden layers in the model (could override `n_layer`)
+    hidden_size=64,        # Size of hidden states used in the model layers (could override `d_model`)
 )
 
 # === Step 2: Create model from config ===
 model = MambaForCausalLM(config)
 
 # === Step 4: Save model and tokenizer to disk ===
-output_dir = "./tiny-mamba"
+output_dir = "./tiny-mamba2"
 os.makedirs(output_dir, exist_ok=True)
 model.save_pretrained(output_dir)
 tokenizer.save_pretrained(output_dir)
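
For context, the hunk only covers lines 14–34 of the README, so the imports and the `model_dir` definition are not visible here. Below is a minimal, self-contained sketch of how the updated snippet plausibly reads once this commit is applied; the import lines, the `model_dir` value, and the extra comments about field names are assumptions, not part of the commit. In the `transformers` `MambaConfig`, the canonical size fields are `hidden_size`, `num_hidden_layers`, and `state_size`, so the `d_model` / `n_layer` / `d_state` keywords are most likely just stored on the config while `hidden_size=64` and `num_hidden_layers=32` determine the actual model shape.

```python
# Minimal sketch of the updated snippet; imports and model_dir are assumptions.
import os

from transformers import AutoTokenizer, MambaConfig, MambaForCausalLM

# Hypothetical source checkpoint for the tokenizer; the README defines model_dir
# above the hunk shown in this commit, so the real value may differ.
model_dir = "state-spaces/mamba-130m-hf"
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# === Step 1: Define tiny model config ===
config = MambaConfig(
    d_model=16,            # stored on the config; transformers reads hidden_size instead (assumption)
    n_layer=2,             # stored on the config; transformers reads num_hidden_layers instead (assumption)
    d_state=32,            # stored on the config; the canonical transformers field is state_size (assumption)
    expand=2,              # expansion factor inside each Mamba block
    conv_kernel=3,         # depthwise convolution kernel size
    vocab_size=50280,      # tokenizer vocabulary size
    num_hidden_layers=32,  # number of Mamba blocks actually instantiated
    hidden_size=64,        # embedding / hidden dimension actually used
)

# === Step 2: Create model from config ===
model = MambaForCausalLM(config)

# === Step 4: Save model and tokenizer to disk ===
output_dir = "./tiny-mamba2"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
```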
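
As a quick sanity check, the saved directory can be reloaded and run. This is a sketch under the assumption that the model was written to `./tiny-mamba2` as above; the prompt and generation length are arbitrary, and since the weights are untrained the output is noise — it only confirms that the config, weights, and tokenizer round-trip through disk.

```python
import torch
from transformers import AutoTokenizer, MambaForCausalLM

output_dir = "./tiny-mamba2"

# Reload the tiny model and its tokenizer from disk.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = MambaForCausalLM.from_pretrained(output_dir)
model.eval()

# Generate a few tokens just to confirm everything loads together.
inputs = tokenizer("Hello", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```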