{
  "vocab_size": 32000,
  "hidden_size": 512,
  "num_layers": 8,
  "num_attention_heads": 8,
  "num_key_value_heads": 2,
  "intermediate_size": 1365,
  "max_position_embeddings": 2048,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "learning_rate": 0.0005,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "gradient_clip_val": 1.0,
  "warmup_steps": 1000,
  "max_steps": 50000,
  "batch_size": 2,
  "gradient_accumulation_steps": 16,
  "eval_interval": 500,
  "save_interval": 2500,
  "max_length": 512,
  "dataloader_workers": 0
}