{
  "vocab_size": 256,
  "hidden_dim": 64,
  "patch_encoder_dim": 32,
  "n_layers": 3,
  "n_heads": 4,
  "n_experts": 4,
  "top_k_experts": 2,
  "expert_hidden_mult": 2,
  "patch_encoder_layers": 1,
  "max_seq_length": 512,
  "max_patches": 64,
  "learning_rate": 0.001,
  "weight_decay": 0.01,
  "beta1": 0.9,
  "beta2": 0.98,
  "epsilon": 1e-08,
  "batch_size": 256,
  "micro_batch_size": 4,
  "gradient_clip": 0.5,
  "load_balance_coefficient": 0.01,
  "dropout": 0.0,
  "use_prenorm": true,
  "entropy_threshold": 0.4,
  "min_patch_size": 4,
  "max_patch_size": 32
}