{
  "model": {
    "vocab_size": 32000,
    "d_model": 768,
    "n_heads": 12,
    "n_layers": 12,
    "d_ff": 2560,
    "max_seq_len": 1024,
    "dropout": 0.0,
    "activation": "swiglu",
    "norm": "rmsnorm",
    "norm_first": true,
    "bias": false,
    "pos_encoding": "learned",
    "weight_tying": true
  },
  "training": {
    "batch_size": 128,
    "learning_rate": 0.0006,
    "max_epochs": 1,
    "grad_clip": 1.0,
    "pad_id": 0,
    "log_every": 100,
    "attention_log_every": 2000,
    "device": "cuda",
    "checkpoint_dir": "checkpoints/llama_124m",
    "tensorboard_dir": "runs/llama_124m",
    "gradient_accumulation_steps": 4,
    "precision": "bfloat16"
  },
  "tokenizer_path": "data/llama_124m/tok_32k.model",
  "total_parameters": 124472064,
  "train_tokens": 9651061760,
  "warmup_steps": 750,
  "seed": 42
}