constantin101 committed on
Commit 13e0583 · verified · 1 Parent(s): 7f1f94d

Update config.json

Files changed (1)
  1. config.json +5 -5
config.json CHANGED
@@ -18,10 +18,10 @@
   "d_model": 1728,
   "n_layers": 16,
   "n_heads": 16,
-  "n_kv_heads": 8, // grouped attention (optional but recommended)
+  "n_kv_heads": 8,

   "embedding_dropout": 0.0,
-  "embedding_size": 126464, // keep as-is to match tokenizer
+  "embedding_size": 126464,
   "eos_token_id": 126081,
   "flash_attention": false,
   "include_bias": false,
@@ -36,8 +36,8 @@
   "mask_token_id": 126336,
   "max_sequence_length": 4096,

-  "mlp_hidden_size": 4608, // ~2.66 × d_model (good for SwiGLU/SiLU)
-  "mlp_ratio": 2.66, // keep consistent with hidden size
+  "mlp_hidden_size": 4608,
+  "mlp_ratio": 2.66,

   "model_type": "llada",
   "multi_query_attention": null,
@@ -53,6 +53,6 @@
   "use_cache": false,

   "vocab_size": 126464,
-  "weight_tying": true // critical for getting under ~0.8B
+  "weight_tying": true
   }
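
The removed // annotations are the point of this commit: JSON has no comment syntax, so the previous config.json would fail in any standard parser (Python's json module raises a JSONDecodeError on the first //). A minimal sketch of the check, assuming the cleaned config.json is in the working directory:

import json

with open("config.json") as f:
    cfg = json.load(f)  # parses cleanly once the // comments are gone

# Sanity checks on the values touched by this diff
assert cfg["embedding_size"] == cfg["vocab_size"] == 126464
assert cfg["mlp_hidden_size"] == 4608            # 4608 / 1728 ≈ 2.67 ≈ mlp_ratio
assert cfg["n_heads"] == 16 and cfg["n_kv_heads"] == 8   # 2:1 grouped-query attention

The old inline note on weight_tying ("critical for getting under ~0.8B") can be checked with rough arithmetic. A back-of-the-envelope sketch, assuming a SwiGLU MLP with three d_model × mlp_hidden_size projections, grouped-query attention, no biases (include_bias is false), and ignoring norm parameters; these architectural details are assumptions, not taken from the repo:

# Hypothetical parameter count for the config above (assumptions noted in the lead-in).
d_model, n_layers, n_heads, n_kv_heads = 1728, 16, 16, 8
mlp_hidden, vocab = 4608, 126464

head_dim = d_model // n_heads                    # 108
attn = 2 * d_model * d_model                     # W_q and W_o
attn += 2 * d_model * (n_kv_heads * head_dim)    # W_k and W_v (GQA)
mlp = 3 * d_model * mlp_hidden                   # gate, up, down projections
embed = vocab * d_model                          # token embedding table

tied = n_layers * (attn + mlp) + embed           # output head shares the embedding
untied = tied + embed                            # separate lm_head matrix

print(f"tied:   {tied / 1e9:.2f}B")              # ~0.74B
print(f"untied: {untied / 1e9:.2f}B")            # ~0.96B

Under these assumptions, tying the output head to the token embedding saves roughly 218M parameters (126464 × 1728), which is what would move the model from ~0.96B down to ~0.74B, consistent with the removed note.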