#!/usr/bin/env bash
#
# Launch script: runs `swift rlhf` (rlhf_type=grpo, loss_type=dapo,
# train_type=lora) on the merged Qwen3-ASR checkpoint, using a custom
# model/template registration plugin and a custom reward plugin.
#
# Required environment:
#   WANDB_API_KEY   Weights & Biases API key. It is intentionally NOT stored
#                   in this file; export it in the calling shell or load it
#                   from a secret store before running the script.
#
# NOTE(review): an API key used to be hard-coded in this script. Treat that
# key as leaked and rotate it.
#
# Only POSIX-sh constructs are used so the script keeps working when it is
# invoked as `sh script.sh` as well as `bash script.sh`.

# Abort on the first failing command and on any unset variable, so a typo in
# a variable name cannot silently turn into an empty path.
set -eu

# ---------------------------------------------------------------------------
# Paths. Everything lives under one user directory; the run name is defined
# once so that the W&B run, the output directory and the reward-debug
# directory can never drift apart.
# ---------------------------------------------------------------------------
BASE_DIR="/mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin"
UPLOAD_DIR="${BASE_DIR}/0416upload"
RUN_NAME="qwen3asr_dapo_reward56_4x4x12_12gen_4GPU"
OUTPUT_DIR="${UPLOAD_DIR}/out/${RUN_NAME}"

TRAIN_DATA="${UPLOAD_DIR}/lora_0323_10w+55w+error+syn_with_domain_train90_targeted_rl_train90_loramerged_basewer_3suppress_server.jsonl"
VAL_DATA="${UPLOAD_DIR}/lora_0323_10w+55w+error+syn_with_domain_train90_targeted_rl_val5_sample5p_server.jsonl"

# ---------------------------------------------------------------------------
# Distributed launch settings.
# NOTE(review): MASTER_ADDR=127.0.0.1 together with NPROC_PER_NODE=4 looks
# like a single-node, 4-process run (the run name says "4GPU") - confirm.
# ---------------------------------------------------------------------------
export MASTER_PORT=29540
export MASTER_ADDR=127.0.0.1
export NPROC_PER_NODE=4
export SWIFT_SINGLE_DEVICE_MODE=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# ---------------------------------------------------------------------------
# Weights & Biases logging.
# ---------------------------------------------------------------------------
# Fail fast with a clear message instead of embedding a credential here.
: "${WANDB_API_KEY:?WANDB_API_KEY must be set in the environment (do not hard-code it in this script)}"
export WANDB_API_KEY
export WANDB_BASE_URL="https://api.wandb.ai"
export WANDB_PROJECT="qwen3_asr_swift_dapo"
export WANDB_ENTITY="pang_kaiyu-none"
export WANDB_MODE=online

# ---------------------------------------------------------------------------
# Debug switches exported for the reward plugin.
# NOTE(review): presumably read by 0417_reward.py - verify against the plugin.
# ---------------------------------------------------------------------------
export ASR_REWARD_DEBUG=1
export ASR_REWARD_DEBUG_PATH="${OUTPUT_DIR}/reward_debug"
export ASR_REWARD_DEBUG_MAX_ROWS=1000

# ---------------------------------------------------------------------------
# Training command. Every expansion is quoted so each path is passed as
# exactly one argument. --external_plugins receives two separate paths:
# the model/template registration module and the reward module.
# NOTE(review): generation_batch_size (48) equals
# per_device_train_batch_size (4) * num_generations (12); keep them in sync
# if either is changed - confirm the exact constraint in the swift docs.
# ---------------------------------------------------------------------------
swift rlhf \
  --rlhf_type grpo \
  --external_plugins \
    "${UPLOAD_DIR}/my_qwen3_asr_dapo_register.py" \
    "${UPLOAD_DIR}/0417_reward.py" \
  --model "${BASE_DIR}/qwen3-asr-merged" \
  --model_type my_qwen3_asr_rl \
  --template my_qwen3_asr_rl \
  --dataset "${TRAIN_DATA}" \
  --val_dataset "${VAL_DATA}" \
  --reward_funcs asr_wer_sub_len_cmp_hallu_dirty_v56 \
  --train_type lora \
  --use_vllm false \
  --log_completions true \
  --loss_type dapo \
  --advantage_estimator grpo \
  --scale_rewards group \
  --num_iterations 2 \
  --beta 0.04 \
  --epsilon_high 0.28 \
  --dynamic_sample true \
  --max_resample_times 4 \
  --overlong_filter true \
  --truncation_strategy delete \
  --num_generations 12 \
  --generation_batch_size 48 \
  --per_device_train_batch_size 4 \
  --per_device_eval_batch_size 4 \
  --gradient_accumulation_steps 12 \
  --num_generations_eval 4 \
  --max_completion_length 256 \
  --temperature 0.50 \
  --top_p 0.95 \
  --top_k 50 \
  --repetition_penalty 1.08 \
  --learning_rate 5e-5 \
  --lr_scheduler_type cosine \
  --warmup_ratio 0.03 \
  --report_to wandb \
  --run_name "${RUN_NAME}" \
  --output_dir "${OUTPUT_DIR}" \
  --save_strategy steps \
  --save_steps 20 \
  --logging_steps 5 \
  --freeze_llm false \
  --freeze_vit false \
  --freeze_aligner false \
  --remove_unused_columns false \
  --padding_side left