constantin101 committed on
Commit 13e0583 · verified · 1 Parent(s): 7f1f94d

Update config.json

Files changed (1)
  1. config.json +5 -5
config.json CHANGED
@@ -18,10 +18,10 @@
   "d_model": 1728,
   "n_layers": 16,
   "n_heads": 16,
-  "n_kv_heads": 8, // grouped attention (optional but recommended)
+  "n_kv_heads": 8,

   "embedding_dropout": 0.0,
-  "embedding_size": 126464, // keep as-is to match tokenizer
+  "embedding_size": 126464,
   "eos_token_id": 126081,
   "flash_attention": false,
   "include_bias": false,
@@ -36,8 +36,8 @@
   "mask_token_id": 126336,
   "max_sequence_length": 4096,

-  "mlp_hidden_size": 4608, // ~2.66 × d_model (good for SwiGLU/SiLU)
-  "mlp_ratio": 2.66, // keep consistent with hidden size
+  "mlp_hidden_size": 4608,
+  "mlp_ratio": 2.66,

   "model_type": "llada",
   "multi_query_attention": null,
@@ -53,6 +53,6 @@
   "use_cache": false,

   "vocab_size": 126464,
-  "weight_tying": true // critical for getting under ~0.8B
+  "weight_tying": true
   }
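
The removed // annotations are the point of this commit: JSON has no comment syntax, so the previous config.json would fail in any standard parser (Python's json module raises a JSONDecodeError on the first //). A minimal sketch of the check, assuming the cleaned config.json is in the working directory:

import json

with open("config.json") as f:
    cfg = json.load(f)  # parses cleanly once the // comments are gone

# Sanity checks on the values touched by this diff
assert cfg["embedding_size"] == cfg["vocab_size"] == 126464
assert cfg["mlp_hidden_size"] == 4608            # 4608 / 1728 ≈ 2.67 ≈ mlp_ratio
assert cfg["n_heads"] == 16 and cfg["n_kv_heads"] == 8   # 2:1 grouped-query attention

The old inline note on weight_tying ("critical for getting under ~0.8B") can be checked with rough arithmetic. A back-of-the-envelope sketch, assuming a SwiGLU MLP with three d_model × mlp_hidden_size projections, grouped-query attention, no biases (include_bias is false), and ignoring norm parameters; these architectural details are assumptions, not taken from the repo:

# Hypothetical parameter count for the config above (assumptions noted in the lead-in).
d_model, n_layers, n_heads, n_kv_heads = 1728, 16, 16, 8
mlp_hidden, vocab = 4608, 126464

head_dim = d_model // n_heads                    # 108
attn = 2 * d_model * d_model                     # W_q and W_o
attn += 2 * d_model * (n_kv_heads * head_dim)    # W_k and W_v (GQA)
mlp = 3 * d_model * mlp_hidden                   # gate, up, down projections
embed = vocab * d_model                          # token embedding table

tied = n_layers * (attn + mlp) + embed           # output head shares the embedding
untied = tied + embed                            # separate lm_head matrix

print(f"tied:   {tied / 1e9:.2f}B")              # ~0.74B
print(f"untied: {untied / 1e9:.2f}B")            # ~0.96B

Under these assumptions, tying the output head to the token embedding saves roughly 218M parameters (126464 × 1728), which is what would move the model from ~0.96B down to ~0.74B, consistent with the removed note.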