---
# Run configuration for a Mixture-of-Experts variant of Llama-3.2-1B.
# NOTE(review): reconstructed from a whitespace-collapsed single line;
# keys and values are unchanged, only structure/formatting restored.

# Run identity and model sources.
run-title: micro-moe-llama-1b
model: micro-moe-llama-1b
base-model: meta-llama/Llama-3.2-1B
tokenizer: meta-llama/Llama-3.2-1B-Instruct

# MoE routing settings.
num-experts: 4
top-k-experts: 1               # presumably experts activated per token — confirm with consumer
jitter-noise: 0
router-aux-loss-coef: 0.000    # aux loss disabled (coefficient zero)
use-load-balancing: false
use-router: true

# Data / sequence handling.
mask-input: true
max-length: 8192

# Components to train.
trainable:
  - model