Instructions to use AlexWortega/moe100m-physics-tinybpe with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use AlexWortega/moe100m-physics-tinybpe with Transformers:
```python
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("AlexWortega/moe100m-physics-tinybpe", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
```json
{
  "vocab_size": 512,
  "d_model": 640,
  "n_layers": 14,
  "n_q_heads": 10,
  "n_kv_heads": 2,
  "head_dim": 64,
  "rope_partial": 32,
  "rope_theta": 10000.0,
  "d_ff": 1024,
  "n_routed_experts": 8,
  "n_shared_experts": 1,
  "top_k": 2,
  "moe_first_layer": 1,
  "router_z_coef": 0.001,
  "router_noise_std": 0.0,
  "router_aux_coef": 0.001,
  "bias_update_rate": 0.001,
  "max_seq_len": 1024,
  "tie_embeddings": true,
  "rms_eps": 1e-06,
  "init_std": 0.02,
  "mup_base_d": 512,
  "attn_backend": "sdpa",
  "moe_backend": "grouped",
  "moe_capacity_factor": 1.5,
  "smear_gate": true,
  "use_chunked_ce": true,
  "ce_chunk_tokens": 512,
  "ce_checkpoint_chunks": true,
  "use_liger_ce": true,
  "_model_class": "MoEModel",
  "architectures": [
    "MoEModel"
  ]
}
```