{
  "block_size": 1024,
  "vocab_size": 50257,
  "n_layer": 24,
  "n_head": 16,
  "n_kv_head": 8,
  "n_embd": 1024,
  "dropout": 0.1,
  "rope_theta": 10000.0,
  "architecture": "MedSLM",
  "model_type": "med-slm",
  "improvements": [
    "RMSNorm (replaces LayerNorm)",
    "Rotary Positional Embeddings / RoPE (replaces learned absolute)",
    "SwiGLU activation (replaces GELU)",
    "Grouped-Query Attention / GQA (replaces standard MHA)"
  ]
}