{
  "_comment_activation": "MLP activation function aligned with ModernBERT",
  "_comment_attention": "Attention and RoPE settings for shorter context",
  "_comment_bias": "Bias settings for Linear layers aligned with ModernBERT",
  "_comment_dropout": "Dropout rates aligned with ModernBERT",
  "_comment_initialization": "Initialization scheme aligned with ModernBERT",
  "_comment_misc": "Other settings for decoder-style model",
  "_comment_normalization": "LayerNorm settings aligned with ModernBERT",
  "activation_type": "silu",
  "alibi": false,
  "alibi_bias_max": 8.0,
  "architectures": [
    "LLaDAModelLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attention_layer_norm": false,
  "attention_layer_norm_with_affine": true,
  "auto_map": {
    "AutoConfig": "configuration_llada.LLaDAConfig",
    "AutoModel": "modeling_llada.LLaDAModelLM"
  },
  "bias_for_layer_norm": false,
  "block_group_size": 1,
  "block_type": "llama",
  "bos_token_id": 0,
  "d_model": 576,
  "dtype": "float32",
  "embedding_dropout": 0.0,
  "embedding_size": 50368,
  "eos_token_id": 0,
  "flash_attention": false,
  "include_bias": false,
  "include_qkv_bias": false,
  "init_cutoff_factor": 2.0,
  "init_device": "meta",
  "init_fn": "full_megatron",
  "init_std": 0.02,
  "input_emb_norm": false,
  "is_llama_config": true,
  "layer_norm_type": "default",
  "layer_norm_with_affine": true,
  "loss_normalization": "masked_tokens",
  "mask_token_id": 50256,
  "max_position_embeddings": 8192,
  "max_sequence_length": 2048,
  "mlp_hidden_size": 1536,
  "mlp_ratio": 4,
  "model_type": "llada",
  "multi_query_attention": null,
  "n_heads": 9,
  "n_kv_heads": 3,
  "n_layers": 30,
  "pad_token_id": 50283,
  "precision": null,
  "pretraining_tp": 1,
  "residual_dropout": 0.0,
  "rms_norm_eps": 1e-05,
  "rope": true,
  "rope_full_precision": true,
  "rope_interleaved": false,
  "rope_scaling": null,
  "rope_theta": 100000.0,
  "scale_logits": false,
  "transformers_version": "4.57.1",
  "use_cache": false,
  "vocab_size": 50368,
  "weight_tying": true
}