{
  "model_type": "causal_diffusion",
  "interpretable": true,
  "n_layers": 32,
  "n_head": 32,
  "n_embd": 4096,
  "block_size": 4096,
  "n_kv_heads": 4,
  "diff_block_size": 64,
  "use_rms_norm": true,
  "norm_eps": 1e-05,
  "norm_order": "post",
  "use_qk_norm": true,
  "use_rope": true,
  "rope_base": 500000.0,
  "rope_full_precision": true,
  "mlp_type": "swiglu",
  "activation": "gelu",
  "mlp_ratio": 4,
  "intermediate_size": null,
  "use_bias": false,
  "clip_qkv": 10.0,
  "weight_sharing": true,
  "pad_token_id": 100277,
  "bos_token_id": 100278,
  "eos_token_id": 100257,
  "endofchunk_token_id": 100279,
  "mask_token_id": 100280,
  "vocab_size": 100281,
  "concept": {
    "n_concepts": 33732,
    "n_unknown_concepts": 101196,
    "max_concepts": 16,
    "concept_dim": 4096,
    "use_attention_known": false,
    "use_attention_unknown": false,
    "topk_known": 16,
    "topk_known_features": 32,
    "unknown_topk": 128,
    "use_unknown": true,
    "apply_topk_to_unknown": true,
    "topk_on_logits": false,
    "factorize_unknown": true,
    "factorize_rank": 256,
    "use_epsilon_correction": true,
    "block_size": 4096,
    "pad_multiple": 16,
    "store_unknown_weights": false,
    "inject_layer": 16,
    "inject_alpha": 1.0
  }
}