#!/usr/bin/env bash
#
# Launch a single-process `torchrun` job that runs the `benchmark.main` module
# for one model configuration, with profiling and FSDP flags enabled.
#
# Requires `torchrun` on PATH. The --train-data value names an `aws s3 cp`
# pipe, so the AWS CLI and S3 access are presumably needed at run time as
# well -- confirm against benchmark.main.

# Value forwarded to --batch-size.
BATCHSIZE=1
# Model name forwarded to --model.
MODEL="large2048"
# Run name forwarded to --name; derived from MODEL.
EXP_NAME="benchmark-${MODEL}"

# The double quotes around --train-data keep the shell from brace-expanding
# {0..7} and {0000000..0000300} and from splitting on the spaces: the whole
# pattern is handed to benchmark.main as one argument.
torchrun --nproc-per-node 1 -m benchmark.main \
  --train-data "pipe:aws s3 cp s3://s-laion/redpajama-tars/8192-v1/{0..7}/shard-{0000000..0000300}.tar -" \
  --train-num-samples 30720 \
  --workers 6 \
  --precision amp_bfloat16 \
  --grad-checkpointing \
  --grad-clip-norm 1 \
  --log-every-n-steps 1 \
  --fsdp \
  --profile \
  --batch-size "$BATCHSIZE" \
  --model "$MODEL" \
  --name "$EXP_NAME"