{ "architectures": [ "BantamForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "attention_head_groups": [ { "head_dim": 128, "kv_heads": 1, "query_heads": 3 }, { "head_dim": 64, "kv_heads": 2, "query_heads": 6 } ], "attn_logit_softcapping": 0.0, "bos_token_id": 1, "dtype": "bfloat16", "eos_token_id": 2, "final_logit_softcapping": 0.0, "head_dim": 85, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 2304, "label_smoothing": 0.0, "layer_configs": [ { "attention_head_groups": [ { "head_dim": 64, "kv_heads": 3, "query_heads": 12 } ], "intermediate_size": 2304, "num_attention_heads": 12, "num_key_value_heads": 3, "window": 128 }, { "attention_head_groups": [ { "head_dim": 64, "kv_heads": 3, "query_heads": 12 } ], "intermediate_size": 2304, "num_attention_heads": 12, "num_key_value_heads": 3, "window": 128 }, { "attention_head_groups": [ { "head_dim": 64, "kv_heads": 3, "query_heads": 12 } ], "intermediate_size": 2368, "num_attention_heads": 12, "num_key_value_heads": 3, "window": 128 }, { "attention_head_groups": [ { "head_dim": 64, "kv_heads": 3, "query_heads": 12 } ], "intermediate_size": 2400, "num_attention_heads": 12, "num_key_value_heads": 3 }, { "attention_head_groups": [ { "head_dim": 80, "kv_heads": 2, "query_heads": 6 }, { "head_dim": 96, "kv_heads": 1, "query_heads": 3 } ], "expert_type": "topk", "intermediate_size": 2432, "moe_aux_loss_weight": 0.01, "moe_capacity_factor": 1.05, "moe_drop_policy": "random", "moe_intermediate_size": 2432, "moe_router_jitter": 0.05, "moe_top_k": 2, "num_attention_heads": 9, "num_experts": 6, "num_key_value_heads": 3, "window": 256 }, { "attention_head_groups": [ { "head_dim": 80, "kv_heads": 2, "query_heads": 6 }, { "head_dim": 96, "kv_heads": 1, "query_heads": 3 } ], "intermediate_size": 2368, "num_attention_heads": 9, "num_key_value_heads": 3, "window": 256 }, { "attention_head_groups": [ { "head_dim": 80, "kv_heads": 2, "query_heads": 6 }, { "head_dim": 96, "kv_heads": 1, "query_heads": 3 } ], "intermediate_size": 2432, "num_attention_heads": 9, "num_key_value_heads": 3, "window": 256 }, { "attention_head_groups": [ { "head_dim": 80, "kv_heads": 2, "query_heads": 6 }, { "head_dim": 96, "kv_heads": 1, "query_heads": 3 } ], "intermediate_size": 2368, "num_attention_heads": 9, "num_key_value_heads": 3 }, { "intermediate_size": 2304, "window": 256 }, { "intermediate_size": 2368, "window": 256 }, { "intermediate_size": 2400, "window": 256 }, { "intermediate_size": 2432, "window": 256 }, { "intermediate_size": 2432 }, { "intermediate_size": 2400, "window": 512 }, { "intermediate_size": 2432, "window": 512 }, { "intermediate_size": 2432, "window": 512 }, { "expert_type": "topk", "intermediate_size": 2432, "moe_aux_loss_weight": 0.01, "moe_capacity_factor": 1.1, "moe_drop_policy": "random", "moe_intermediate_size": 2432, "moe_router_jitter": 0.05, "moe_top_k": 2, "num_experts": 8, "window": 512 }, { "expert_type": "topk", "intermediate_size": 2432, "moe_aux_loss_weight": 0.01, "moe_capacity_factor": 1.1, "moe_drop_policy": "random", "moe_intermediate_size": 2432, "moe_router_jitter": 0.05, "moe_top_k": 2, "num_experts": 8 }, { "intermediate_size": 2368, "window": 512 }, { "intermediate_size": 2400 } ], "max_head_dim": 128, "max_position_embeddings": 2048, "mlp_dropout": 0.0, "model_type": "bantam", "num_attention_heads": 9, "num_attention_sinks": 4, "num_hidden_layers": 20, "num_key_value_heads": 3, "pad_token_id": 1, "qk_norm": true, "qk_norm_eps": 1e-06, "residual_dropout": 0.0, "rms_norm_eps": 1e-06, "rope_scaling": null, "rope_theta": 10000.0, "scaled_embeddings": false, "sink_boost": 0.25, "transformers_version": "4.57.1", "use_cache": false, "vocab_size": 49152, "z_loss_weight": 0.0 }