Bantam-285m / config.json
Theoistic's picture
Upload folder using huggingface_hub
bc9042e verified
{
"architectures": [
"BantamForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"attention_head_groups": [
{
"head_dim": 128,
"kv_heads": 1,
"query_heads": 3
},
{
"head_dim": 64,
"kv_heads": 2,
"query_heads": 6
}
],
"attn_logit_softcapping": 0.0,
"bos_token_id": 1,
"dtype": "bfloat16",
"eos_token_id": 2,
"final_logit_softcapping": 0.0,
"head_dim": 85,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 2304,
"label_smoothing": 0.0,
"layer_configs": [
{
"attention_head_groups": [
{
"head_dim": 64,
"kv_heads": 3,
"query_heads": 12
}
],
"intermediate_size": 2304,
"num_attention_heads": 12,
"num_key_value_heads": 3,
"window": 128
},
{
"attention_head_groups": [
{
"head_dim": 64,
"kv_heads": 3,
"query_heads": 12
}
],
"intermediate_size": 2304,
"num_attention_heads": 12,
"num_key_value_heads": 3,
"window": 128
},
{
"attention_head_groups": [
{
"head_dim": 64,
"kv_heads": 3,
"query_heads": 12
}
],
"intermediate_size": 2368,
"num_attention_heads": 12,
"num_key_value_heads": 3,
"window": 128
},
{
"attention_head_groups": [
{
"head_dim": 64,
"kv_heads": 3,
"query_heads": 12
}
],
"intermediate_size": 2400,
"num_attention_heads": 12,
"num_key_value_heads": 3
},
{
"attention_head_groups": [
{
"head_dim": 80,
"kv_heads": 2,
"query_heads": 6
},
{
"head_dim": 96,
"kv_heads": 1,
"query_heads": 3
}
],
"expert_type": "topk",
"intermediate_size": 2432,
"moe_aux_loss_weight": 0.01,
"moe_capacity_factor": 1.05,
"moe_drop_policy": "random",
"moe_intermediate_size": 2432,
"moe_router_jitter": 0.05,
"moe_top_k": 2,
"num_attention_heads": 9,
"num_experts": 6,
"num_key_value_heads": 3,
"window": 256
},
{
"attention_head_groups": [
{
"head_dim": 80,
"kv_heads": 2,
"query_heads": 6
},
{
"head_dim": 96,
"kv_heads": 1,
"query_heads": 3
}
],
"intermediate_size": 2368,
"num_attention_heads": 9,
"num_key_value_heads": 3,
"window": 256
},
{
"attention_head_groups": [
{
"head_dim": 80,
"kv_heads": 2,
"query_heads": 6
},
{
"head_dim": 96,
"kv_heads": 1,
"query_heads": 3
}
],
"intermediate_size": 2432,
"num_attention_heads": 9,
"num_key_value_heads": 3,
"window": 256
},
{
"attention_head_groups": [
{
"head_dim": 80,
"kv_heads": 2,
"query_heads": 6
},
{
"head_dim": 96,
"kv_heads": 1,
"query_heads": 3
}
],
"intermediate_size": 2368,
"num_attention_heads": 9,
"num_key_value_heads": 3
},
{
"intermediate_size": 2304,
"window": 256
},
{
"intermediate_size": 2368,
"window": 256
},
{
"intermediate_size": 2400,
"window": 256
},
{
"intermediate_size": 2432,
"window": 256
},
{
"intermediate_size": 2432
},
{
"intermediate_size": 2400,
"window": 512
},
{
"intermediate_size": 2432,
"window": 512
},
{
"intermediate_size": 2432,
"window": 512
},
{
"expert_type": "topk",
"intermediate_size": 2432,
"moe_aux_loss_weight": 0.01,
"moe_capacity_factor": 1.1,
"moe_drop_policy": "random",
"moe_intermediate_size": 2432,
"moe_router_jitter": 0.05,
"moe_top_k": 2,
"num_experts": 8,
"window": 512
},
{
"expert_type": "topk",
"intermediate_size": 2432,
"moe_aux_loss_weight": 0.01,
"moe_capacity_factor": 1.1,
"moe_drop_policy": "random",
"moe_intermediate_size": 2432,
"moe_router_jitter": 0.05,
"moe_top_k": 2,
"num_experts": 8
},
{
"intermediate_size": 2368,
"window": 512
},
{
"intermediate_size": 2400
}
],
"max_head_dim": 128,
"max_position_embeddings": 2048,
"mlp_dropout": 0.0,
"model_type": "bantam",
"num_attention_heads": 9,
"num_attention_sinks": 4,
"num_hidden_layers": 20,
"num_key_value_heads": 3,
"pad_token_id": 1,
"qk_norm": true,
"qk_norm_eps": 1e-06,
"residual_dropout": 0.0,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 10000.0,
"scaled_embeddings": false,
"sink_boost": 0.25,
"transformers_version": "4.57.1",
"use_cache": false,
"vocab_size": 49152,
"z_loss_weight": 0.0
}