accum-freq: 4
beta1: 0.9
beta2: 0.95
data-key: "json"
dataset-resampled: True
# delete-previous-checkpoint: False
# Total: 25B per epoch * 40 epochs = 1T tokens
epochs: 40
fsdp: True
fsdp-limit-all-gathers: True
# grad-checkpointing: False
grad-clip-norm: 1
log-every-n-steps: 20
model: "open_lm_7b"
name: "sample_7b"
precision: "amp_bfloat16"
report-to: "wandb"
seed: 124
train-data-mix-weights: [0.725, 0.275]
train-data: ["TODO"]
train-num-samples: 25_000_000_000
wandb-project-name: "lm1"
workers: 4
logs: /opt/ml/checkpoints/
# Some important parameters, double-checked with Mitchell:
batch-size: 16
ffn-type: swiglu
# fsdp-amp: False
fsdp-pure-bf16: True
fsdp-backward-prefetch: True
lr: 3.e-4
lr-cooldown-end: 3.e-5
model-norm: "gain_only_lp_layer_norm"
qk-norm: True
warmup: 5000
wd: 0.1
z-loss-coefficient: 1.e-4
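
Below is a minimal sketch of one way to drive a run from this file, assuming each YAML key maps onto the matching `--key` flag of open_lm's argparse entry point (`open_lm.main`), with booleans treated as presence/absence flags and lists as space-separated values. The filename `sample_7b.yaml`, these conversion rules, and the `torchrun` invocation are illustrative assumptions, not the repo's documented interface; verify flag names and types against `open_lm/params.py`.

```python
# Sketch (assumed interface, not documented): convert the YAML config
# above into an argv list for open_lm's argparse entry point.
import shlex

import yaml  # pip install pyyaml


def config_to_argv(path: str) -> list[str]:
    with open(path) as f:
        cfg = yaml.safe_load(f)  # comments are dropped; 25_000_000_000 parses as an int
    argv: list[str] = []
    for key, value in cfg.items():
        flag = f"--{key}"
        if isinstance(value, bool):
            if value:  # assume store_true flags: present if True, omitted if False
                argv.append(flag)
        elif isinstance(value, list):
            argv.append(flag)  # assume nargs-style options take space-separated values
            argv.extend(str(v) for v in value)
        else:
            argv.extend([flag, str(value)])
    return argv


if __name__ == "__main__":
    argv = config_to_argv("sample_7b.yaml")  # hypothetical filename for the config above
    # Illustrative launch command; adjust --nproc-per-node to your GPU count.
    print("torchrun --nproc-per-node 8 -m open_lm.main " + shlex.join(argv))
```

With the config saved as `sample_7b.yaml`, this prints a `torchrun` command whose flags mirror the keys one-to-one, e.g. `train-data-mix-weights: [0.725, 0.275]` becomes `--train-data-mix-weights 0.725 0.275` and `fsdp: True` becomes a bare `--fsdp`, the usual wiring for argparse `store_true` options.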