---
# Training configuration for a micro Mixture-of-Experts fine-tune of SmolLM2-135M.
# NOTE(review): key semantics below are inferred from names — confirm against the consumer.
run-title: micro-moe-smollm2-135m
model: micro-moe-smollm2-135m
base-model: HuggingFaceTB/SmolLM2-135M
tokenizer: HuggingFaceTB/SmolLM2-135M-Instruct

# MoE routing hyperparameters
num-experts: 4
top-k-experts: 1
jitter-noise: 0
router-aux-loss-coef: 0.0
use-load-balancing: false
use-router: true

# Training / data options
mask-input: true
max-length: 8192
trainable:
  - model