#!/bin/bash
# CLaRa stage-1 SFT training launcher (OpenRLHF).
# Expects the repo at /mnt/ceph_rbd/comp_rag/clara and the base model under
# /mnt/ceph_rbd/model. Set DEBUG=1 to launch under debugpy instead of torchrun.

# -e: abort on any unhandled error; -x: trace each command (NB: traces also
# echo variable values, including tokens); pipefail: a pipeline fails when
# any stage fails, not just the last one.
set -ex -o pipefail

# DEBUG=1 switches to the interactive debugpy launch path further below.
DEBUG=${DEBUG:-0}
# --- paths & experiment configuration ---------------------------------------
# ${PYTHONPATH:+:$PYTHONPATH} avoids a trailing ':' when PYTHONPATH is unset;
# an empty PYTHONPATH element silently puts the CWD on Python's import path.
export PYTHONPATH=/mnt/ceph_rbd/comp_rag/clara${PYTHONPATH:+:$PYTHONPATH}
export WANDB_DIR=/mnt/ceph_rbd/comp_rag/clara/debug_data/wandb_logs

data_path=/mnt/ceph_rbd/comp_rag/clara/debug_data
SAVE_MODEL_NAME=clara_cluster2_2m_mix_stage1
SAVE_PATH=/mnt/ceph_rbd/comp_rag/clara/debug_data/train_checkpoint/$SAVE_MODEL_NAME
# SECURITY: do not hardcode a real token here — 'set -x' above traces every
# command, so the value would leak into logs. Pass it via the environment;
# 'xx' is only a placeholder default.
WANDB_TOKEN=${WANDB_TOKEN:-xx}
MODEL_PATH=/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2

mkdir -p "$SAVE_PATH"
# --- distributed topology ----------------------------------------------------
# Exported so NCCL debug logging actually reaches the torchrun worker
# processes — a plain (non-exported) assignment is not inherited by children.
export NCCL_DEBUG=INFO
NUM_NODES=1
MASTER=127.0.0.1
MASTER_PORT=29500
NODE_RANK=0
NUM_LOCAL_GPUS=4
WORLD_SIZE=$((NUM_LOCAL_GPUS * NUM_NODES))

echo "Number of nodes: ${NUM_NODES}"
echo "WORLD_SIZE: ${WORLD_SIZE}"
echo "Number of local GPUs: ${NUM_LOCAL_GPUS}"
echo "Master: ${MASTER}"
echo "Master port: ${MASTER_PORT}"
echo "Node rank: ${NODE_RANK}"

# 'command -v' is the portable, builtin replacement for 'which'.
echo "Currently using $(command -v python)"
# Flag string for openrlhf.cli.train_sft (expanded unquoted after 'torchrun -m'
# so it word-splits into separate arguments).
# NOTE(review): --save_steps -2 is unusual (-1 conventionally disables
# step-based checkpointing in OpenRLHF) — confirm the intended semantics.
# The stage1/compression flags (--stage, --generation_top_k, --qa_loss,
# --doc_max_length, --compress_rate, --mse_loss) are CLaRa-specific
# extensions to the stock train_sft CLI — presumably defined in the fork.
training_commands="openrlhf.cli.train_sft \
    --max_len 2048 \
    --dataset $data_path/pretrain_data.jsonl \
    --pretrain $MODEL_PATH \
    --train_batch_size 128 \
    --micro_train_batch_size 2 \
    --ckpt_path $SAVE_PATH \
    --max_samples 500 \
    --save_path $SAVE_PATH \
    --save_steps -2 \
    --logging_steps 1 \
    --eval_steps 20 \
    --zero_stage 2 \
    --max_epochs 1 \
    --bf16 \
    --flash_attn \
    --learning_rate 1e-4 \
    --stage stage1 \
    --generation_top_k 1 \
    --qa_loss \
    --doc_max_length 256 \
    --compress_rate 32 \
    --mse_loss \
    --gradient_checkpointing"
# torchrun arguments; intentionally unquoted at the call sites so the string
# splits into separate flags.
# NOTE(review): both the c10d rendezvous endpoint AND the legacy
# --master_addr/--master_port are supplied; with --rdzv_backend c10d the
# rendezvous endpoint is what torchrun uses and the master_* flags are
# redundant — confirm against the installed torch version before pruning.
DISTRIBUTED_ARGS="--nproc_per_node ${NUM_LOCAL_GPUS} --nnodes ${NUM_NODES} --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint ${MASTER}:${MASTER_PORT} --master_addr ${MASTER} --master_port ${MASTER_PORT} --node_rank ${NODE_RANK}"
echo "Starting CLaRa training on node $NODE_RANK of $NUM_NODES nodes..."
if [[ "$DEBUG" -eq 0 ]]; then
  # The original nested if/else invoked the identical torchrun command in
  # three separate branches; the only branch-specific step is the optional
  # EFA fabric probe on multi-node runs, so run that guarded side step first
  # and launch torchrun exactly once.
  if [[ "$NUM_NODES" -gt 1 ]] && command -v fi_info >/dev/null 2>&1; then
    fi_info -p efa -t FI_EP_RDM
  fi
  # DISTRIBUTED_ARGS / training_commands are flat flag strings that must
  # word-split into separate arguments — keep them unquoted.
  # shellcheck disable=SC2086
  torchrun $DISTRIBUTED_ARGS -m $training_commands
else
  # Debug path: wait for a debugpy client on port 5678 before executing.
  # NOTE(review): WORLD_SIZE=1 conflicts with --nproc_per_node=2, and
  # torch.distributed.launch is deprecated in favor of torchrun — confirm
  # this path still works on the installed torch version.
  # shellcheck disable=SC2086
  WORLD_SIZE=1 LOCAL_RANK=0 \
    python -m debugpy --listen 0.0.0.0:5678 --wait-for-client \
    -m torch.distributed.launch --nproc_per_node=2 --master_port=20001 \
    -m $training_commands
fi
# Snapshot the model implementation next to the checkpoint for reproducibility.
# NOTE(review): the source path is relative to the CWD at invocation time —
# this only works when the script is launched from its own directory; consider
# anchoring it on "$(dirname "$0")". Destination is quoted against splitting.
cp ../openrlhf/models/modeling_clara.py "$SAVE_PATH"

echo "CLaRa training completed successfully!"