{ "architectures": [ "Model" ], "attention_dropout": 0.0, "bos_token_id": 1, "clip_qkv": null, "eos_token_id": 2, "fused_attention_norm": false, "gated_mlp": true, "glm_tokens": null, "gradient_checkpointing": false, "hidden_act": "silu", "hidden_size": 768, "initializer_range": 0.02, "inter_sequence_attention": true, "intermediate_size": 2304, "max_num_sequences": 512, "max_position_embeddings": 65536, "mlm_loss_coef": 1.0, "model_type": "progen3", "moe_expert_selection": "switch", "moe_grouped_gemm": true, "moe_implementation": "megablocks", "moe_memory_optimized": true, "moe_world_size": 1, "msa_style_attention": true, "no_ffn_gradient_checkpointing": false, "num_attention_heads": 12, "num_experts": 8, "num_experts_per_tok": 2, "num_hidden_layers": 17, "num_key_value_heads": 12, "output_router_weights": false, "pad_token_id": 0, "quantize_inputs_num_bits": null, "quantize_rematerialize_num_bits": null, "quantize_scatter_num_bits": null, "rms_norm_eps": 1e-05, "rope_theta": 100000.0, "router_aux_loss_coef": 0.05, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.42.4", "use_cache": true, "vocab_size": 134 }