{
  "args": {
    "capacity_factor": 1.0,
    "context_window": 512,
    "d_ff": 256,
    "device": "cuda:0",
    "dim": 256,
    "dtype_str": "bfloat16",
    "fp8_recipe": "tensorwise",
    "impl": "grouped",
    "k": 1,
    "moe_type": "pytorch",
    "moe_zloss_weight": 0.1,
    "n_heads": 2,
    "n_kv_heads": 2,
    "n_layers": 1,
    "n_regression_heads": 48,
    "num_experts": 2,
    "output_moe_weights": false,
    "soft_cap": 20,
    "theta": 10000.0,
    "torch_compile": false,
    "use_sparse": false,
    "vocab_size": 30000
  }
}