| Start training (first run): | |
| python train.py ^ | |
| --config 150M ^ | |
| --data_dir tokenizer/data ^ | |
| --batch_size 2 ^ | |
| --grad_accum 16 ^ | |
| --grad_checkpoint ^ | |
| --dtype bf16 ^ | |
| --max_steps 5000 ^ | |
| --run_dir runs/sllm_150m ^ | |
| --log_every 10 ^ | |
| --save_every 500 ^ | |
| --val_every 500 ^ | |
| --val_steps 20 ^ | |
| --warmup_steps 200 | |
| Resume from where you stopped: | |
| python train.py --resume --data_dir tokenizer/data --batch_size 2 --grad_accum 16 --grad_checkpoint --dtype bf16 --extra_steps 5000 --run_dir runs/sllm_150m --log_every 10 --save_every 500 --val_every 500 --val_steps 20 --warmup_steps 200 | |
| Plot while training (in a second terminal): | |
| conda activate pytorch | |
| cd c:\geetesh\aimldl\projects\sllm | |
| python plot_training.py --run_dir runs/sllm_150m --live --interval 30 | |
| Prepare the fine-tuning data: | |
| python finetune/prepare_data.py | |
| Fine-tune starting from a base checkpoint: | |
| python finetune/sft_train.py --base_ckpt runs/sllm_150m/ckpt_0011500.pt --run_dir runs/sllm_150m_chat --max_steps 2500 --batch_size 4 --grad_accum 8 --grad_checkpoint | |
| Chat using the fine-tuned run directory: | |
| python finetune/chat.py --run_dir runs/sllm_150m_chat | |