{
  "architectures": ["AlbertMoE"],
  "model_type": "albert-moe",
  "hidden_size": 256,
  "num_hidden_layers": 22,
  "num_attention_heads": 4,
  "num_experts": 12,
  "num_experts_per_tok": 3,
  "max_position_embeddings": 256,
  "vocab_size": 32000,
  "weight_quantization": "ternary",
  "weight_bits": 1.58,
  "num_parameters": 134000000,
  "active_parameters_per_token": 33000000,
  "expert_skip_rate": 0.75,
  "positional_encoding": "rope",
  "training_framework": "candle",
  "training_language": "rust",
  "max_seq_len": 256,
  "num_layers": 22,
  "num_heads": 4
}