```json
{
  "model": {
    "vocab_size": 32000,
    "d_model": 768,
    "n_heads": 12,
    "n_layers": 12,
    "d_ff": 2560,
    "max_seq_len": 1024,
    "dropout": 0.0,
    "activation": "swiglu",
    "norm": "rmsnorm",
    "norm_first": true,
    "bias": false,
    "pos_encoding": "learned",
    "weight_tying": true
  },
  "training": {
    "batch_size": 128,
    "learning_rate": 0.0006,
    "max_epochs": 1,
    "grad_clip": 1.0,
    "pad_id": 0,
    "log_every": 100,
    "attention_log_every": 2000,
    "device": "cuda",
    "checkpoint_dir": "checkpoints/llama_124m",
    "tensorboard_dir": "runs/llama_124m",
    "gradient_accumulation_steps": 4,
    "precision": "bfloat16"
  },
  "tokenizer_path": "data/llama_124m/tok_32k.model",
  "total_parameters": 124472064,
  "train_tokens": 9651061760,
  "warmup_steps": 750,
  "seed": 42
}
```
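The `total_parameters` figure is consistent with the model hyperparameters. Below is a minimal sanity-check sketch (the variable names and the exact layer accounting are my assumptions, not part of the config): it assumes the standard LLaMA-style layout the settings imply, i.e. tied input/output embeddings counted once (`weight_tying`), a three-matrix SwiGLU FFN (`activation`), two RMSNorm gain vectors per block plus one final gain (`norm`), learned positional embeddings (`pos_encoding`), and no bias terms (`bias: false`).

```python
# Sanity-check of total_parameters under the assumptions above.
# All hyperparameter values are copied from the JSON config.

vocab_size, d_model, n_layers = 32000, 768, 12
d_ff, max_seq_len = 2560, 1024

tok_emb = vocab_size * d_model     # tied with the LM head, counted once
pos_emb = max_seq_len * d_model    # "pos_encoding": "learned"

attn = 4 * d_model * d_model       # Q, K, V, and output projections
ffn = 3 * d_model * d_ff           # SwiGLU: gate, up, and down matrices
norms = 2 * d_model                # two RMSNorm gains per block
per_layer = attn + ffn + norms

total = tok_emb + pos_emb + n_layers * per_layer + d_model  # + final norm
assert total == 124_472_064, total
print(f"{total:,} parameters")     # 124,472,064 -- matches the config
```

For scale: each optimizer step consumes `batch_size * gradient_accumulation_steps * max_seq_len` = 524,288 tokens, so `train_tokens` corresponds to roughly 18,400 optimizer steps, putting `warmup_steps` = 750 at about 4% of the run.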