{"hidden_dim": 768, "n_heads": 12, "n_layer": 12, "head_dim": 64, "batch_size": 8, "max_position_embedding": 512, "rms_norm_eps": 1e-05, "attention_dropout": 0.1, "rope_theta": 10000.0, "device": "cuda", "intermediate_dim": 3072, "n_kv_heads": 4, "vocab_size": 32000, "padding_idx": 2, "mlp_dropout": 0.1, "max_tokens": 3000000000, "lr": 0.0003, "weight_decay": 0.1, "max_grad_norm": 1.0, "grad_accum_steps": 8, "log_interval": 50, "eval_interval": 500, "val_batches": 50, "use_compile": true, "use_amp": true, "wandb_project": "tiny-llama"}