{
  "architectures": ["AlbertMoE"],
  "model_type": "albert-moe",
  "hidden_size": 256,
  "num_hidden_layers": 22,
  "num_attention_heads": 4,
  "num_experts": 12,
  "num_experts_per_tok": 3,
  "max_position_embeddings": 256,
  "vocab_size": 32000,
  "weight_quantization": "ternary",
  "weight_bits": 1.58,
  "num_parameters": 134000000,
  "active_parameters_per_token": 33000000,
  "expert_skip_rate": 0.75,
  "positional_encoding": "rope",
  "training_framework": "candle",
  "training_language": "rust",
  "max_seq_len": 256,
  "num_layers": 22,
  "num_heads": 4
}