{
  "vocab_size": 32000,
  "hidden_size": 512,
  "num_layers": 8,
  "num_attention_heads": 8,
  "num_key_value_heads": 2,
  "intermediate_size": 1365,
  "max_position_embeddings": 2048,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "learning_rate": 0.0005,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "gradient_clip_val": 1.0,
  "warmup_steps": 1000,
  "max_steps": 50000,
  "batch_size": 2,
  "gradient_accumulation_steps": 16,
  "eval_interval": 500,
  "save_interval": 2500,
  "max_length": 512,
  "dataloader_workers": 0
}