#!/usr/bin/env bash
#
# Start a local Ray head node with 8 GPUs and submit an OpenRLHF PPO training
# job (python3 -m openrlhf.cli.train_ppo_ray) to it.
#
# Usage: <script> METHOD PRETRAIN_PATH DATA_PATH REWARD_API
#   METHOD         value passed to --advantage_estimator
#   PRETRAIN_PATH  value passed to --pretrain
#   DATA_PATH      value passed to --prompt_data
#   REWARD_API     value passed to --remote_rm_url
#
# Environment:
#   EXP_NAME  optional; name of the sub-directory (under the current working
#             directory) that receives train.log and checkpoint/. When unset
#             or empty, both are written directly under the current directory.
#
# Exit status: 2 on a usage error, 1 if the output directory cannot be
# created, otherwise the status of the submit/tee pipeline.

set -x
set -u
# Make the final pipeline report the job's failure instead of tee's success.
# Guarded so the script still runs under a /bin/sh without pipefail support.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

if [ "$#" -lt 4 ]; then
  printf 'Usage: %s METHOD PRETRAIN_PATH DATA_PATH REWARD_API\n' "${0##*/}" >&2
  exit 2
fi

METHOD=$1
# Previously this value was overwritten by the literal string "PERTRAIN_PATH",
# so the caller-supplied path never reached --pretrain.
PRETRAIN_PATH=$2
DATA_PATH=$3
REWARD_API=$4

# EXP_NAME is not set anywhere in this script; it is taken from the caller's
# environment and defaults to empty so that `set -u` does not abort.
EXP_NAME=${EXP_NAME:-}

working_dir=$(pwd)
LOG_PATH="${working_dir}/${EXP_NAME}/train.log"
SAVE_PATH="${working_dir}/${EXP_NAME}/checkpoint"
# Creating the checkpoint directory also creates the log file's parent.
mkdir -p -- "${SAVE_PATH}" || exit 1

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# A failure here is reported but not fatal, matching the original behaviour of
# continuing to the submit step regardless of this command's status.
if ! ray start --head --node-ip-address 0.0.0.0 --num-gpus 8; then
  printf 'warning: "ray start" failed; attempting to submit the job anyway\n' >&2
fi

# The backslash after --address is required: without it the submit command
# ends on that line and "-- python3 ..." is run as a separate command.
ray job submit --address="http://127.0.0.1:8265" \
  -- python3 -m openrlhf.cli.train_ppo_ray \
  --ref_num_nodes 1 \
  --ref_num_gpus_per_node 8 \
  --actor_num_nodes 1 \
  --actor_num_gpus_per_node 8 \
  --vllm_num_engines 8 \
  --vllm_tensor_parallel_size 1 \
  --colocate_all_models \
  --vllm_gpu_memory_utilization 0.5 \
  --vllm_enable_sleep \
  --deepspeed_enable_sleep \
  --enforce_eager \
  --pretrain "${PRETRAIN_PATH}" \
  --remote_rm_url "${REWARD_API}" \
  --save_path "${SAVE_PATH}" \
  --micro_train_batch_size 8 \
  --train_batch_size 128 \
  --micro_rollout_batch_size 16 \
  --rollout_batch_size 128 \
  --n_samples_per_prompt 4 \
  --max_samples 30000 \
  --max_epochs 1 \
  --prompt_max_len 1024 \
  --generate_max_len 1024 \
  --zero_stage 3 \
  --bf16 \
  --actor_learning_rate 5e-7 \
  --init_kl_coef 0.01 \
  --use_kl_loss \
  --advantage_estimator "${METHOD}" \
  --prompt_data "${DATA_PATH}" \
  --input_key query \
  --apply_chat_template \
  --packing_samples \
  --normalize_reward \
  --adam_offload \
  --flash_attn \
  --gradient_checkpointing \
  2>&1 | tee "${LOG_PATH}"