Text Classification
Transformers
Safetensors
qwen2
text-generation
text-embeddings-inference
sarosavo committed on
Commit
a4e1f57
·
verified ·
1 Parent(s): c9f8692

Upload RLVR_train.sh

Browse files

upload rlvr training script

Files changed (1) hide show
  1. reward_server/RLVR_train.sh +58 -0
reward_server/RLVR_train.sh ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# RLVR_train.sh - start a local Ray head node and submit an OpenRLHF
# `openrlhf.cli.train_ppo_ray` job that queries a remote reward endpoint.
#
# Usage:
#   bash RLVR_train.sh METHOD PRETRAIN_PATH DATA_PATH REWARD_API [EXP_NAME]
#
# Arguments:
#   METHOD         passed to --advantage_estimator
#                  (reinforce_baseline, reinforce, rloo)
#   PRETRAIN_PATH  passed to --pretrain (e.g. Qwen/Qwen2.5-7B)
#   DATA_PATH      passed to --prompt_data (e.g. path_to_your_qa_data.jsonl)
#   REWARD_API     passed to --remote_rm_url
#                  (e.g. http://127.0.0.1:8000/get_reward)
#   EXP_NAME       optional; sub-directory of the current directory that
#                  receives train.log and checkpoint/. May also be supplied
#                  through the EXP_NAME environment variable. Defaults to
#                  "." so outputs land directly in the current directory,
#                  which is where an unset EXP_NAME placed them before.
#
# Outputs:
#   ${PWD}/${EXP_NAME}/train.log    combined stdout/stderr of the Ray job
#   ${PWD}/${EXP_NAME}/checkpoint   directory handed to --save_path

if [ "$#" -lt 4 ]; then
  printf 'Usage: %s METHOD PRETRAIN_PATH DATA_PATH REWARD_API [EXP_NAME]\n' \
    "${0##*/}" >&2
  exit 2
fi

# Trace every command so the log shows the exact invocation that was run.
set -x

# Report the training job's exit status instead of tee's. Guarded because
# some /bin/sh implementations do not support the pipefail option.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

METHOD=$1        # reinforce_baseline, reinforce, rloo
PRETRAIN_PATH=$2 # Qwen/Qwen2.5-7B
DATA_PATH=$3     # path_to_your_qa_data.jsonl
REWARD_API=$4    # http://127.0.0.1:8000/get_reward

# Precedence: 5th positional argument, then the environment, then ".".
EXP_NAME=${5:-${EXP_NAME:-.}}

working_dir=$(pwd)
LOG_PATH="${working_dir}/${EXP_NAME}/train.log"
SAVE_PATH="${working_dir}/${EXP_NAME}/checkpoint"

# Creating SAVE_PATH also creates the parent directory that holds LOG_PATH,
# so tee can open the log file below.
mkdir -p -- "${SAVE_PATH}" || exit 1

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
ray start --head --node-ip-address 0.0.0.0 --num-gpus 8

# Everything after the bare `--` is the job entrypoint. The backslash after
# --address is required: without it `ray job submit` ends on that line with
# no entrypoint and the python command is never submitted.
ray job submit --address="http://127.0.0.1:8265" \
  -- python3 -m openrlhf.cli.train_ppo_ray \
  --ref_num_nodes 1 \
  --ref_num_gpus_per_node 8 \
  --actor_num_nodes 1 \
  --actor_num_gpus_per_node 8 \
  --vllm_num_engines 8 \
  --vllm_tensor_parallel_size 1 \
  --colocate_all_models \
  --vllm_gpu_memory_utilization 0.5 \
  --vllm_enable_sleep \
  --deepspeed_enable_sleep \
  --enforce_eager \
  --pretrain "${PRETRAIN_PATH}" \
  --remote_rm_url "${REWARD_API}" \
  --save_path "${SAVE_PATH}" \
  --micro_train_batch_size 8 \
  --train_batch_size 128 \
  --micro_rollout_batch_size 16 \
  --rollout_batch_size 128 \
  --n_samples_per_prompt 4 \
  --max_samples 30000 \
  --max_epochs 1 \
  --prompt_max_len 1024 \
  --generate_max_len 1024 \
  --zero_stage 3 \
  --bf16 \
  --actor_learning_rate 5e-7 \
  --init_kl_coef 0.01 \
  --use_kl_loss \
  --advantage_estimator "${METHOD}" \
  --prompt_data "${DATA_PATH}" \
  --input_key query \
  --apply_chat_template \
  --packing_samples \
  --normalize_reward \
  --adam_offload \
  --flash_attn \
  --gradient_checkpointing \
  2>&1 | tee -- "${LOG_PATH}"