{"dim": 5120, "n_layers": 40, "head_dim": 128, "hidden_dim": 14336, "n_heads": 32, "n_kv_heads": 8, "use_biases": false, "causal": true, "rope_theta": 1000000.0, "norm_eps": 1e-05, "init": "DEFAULT", "dropout": 0.0, "vocab_size": 131072, "model_parallel": 1, "is_sequence_parallel": false, "context_parallel": 1, "model_pipelining": 1, "efficient_attn": true, "fused_rms_norm": true, "ragged_attention": null, "checkpoint": true, "use_cache": false, "max_concurrent_tokens": 65536, "rms_norm": "PRE", "cust_bwd": false, "recompute_w1_every": 0, "recompute_w3_every": 0, "recompute_attn_every": 0, "freeze_nonembedding": false, "zero2": false, "cutlass": false, "attn_tanh_gating": null, "softmax_tanh_gating": null, "deterministic_flash_attn": false, "moe": null, "mamba": null, "multimodal": null, "quantization": null, "layer_drop": null, "max_seq_len": 131072}