#!/usr/bin/env bash
#
# Launches a `swift rlhf` run (--rlhf_type grpo, --loss_type dapo) with LoRA
# on the merged Qwen3-ASR checkpoint, using a custom model/template plugin and
# a custom reward plugin, and reports to Weights & Biases.
#
# Usage:
#   WANDB_API_KEY=<your key> bash <this script>
#
# Required environment:
#   WANDB_API_KEY  W&B credential. It is intentionally NOT stored in this file.
#                  NOTE(review): an API key used to be committed here in plain
#                  text; treat that key as leaked and revoke/rotate it.

set -euo pipefail

######################
# Paths
######################
# Shared working directory that holds the datasets, plugins and outputs.
WORK_DIR="/mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/0416upload"

TRAIN_DATA="${WORK_DIR}/lora_0323_10w+55w+error+syn_with_domain_train90_targeted_rl_train90_loramerged_basewer_3suppress_server.jsonl"
# Newly added validation set.
VAL_DATA="${WORK_DIR}/lora_0323_10w+55w+error+syn_with_domain_train90_targeted_rl_val5_sample5p_server.jsonl"

MODEL_DIR="/mnt/dhwfile/raise/user/panjiabao/huxiaobin/zhanghaobin/qwen3-asr-merged"

# One name drives the W&B run name, the output directory and the reward-debug
# directory so the three cannot drift apart.
RUN_NAME="qwen3asr_dapo_reward56_4x4x12_12gen_4GPU"
OUTPUT_DIR="${WORK_DIR}/out/${RUN_NAME}"

######################
# Distributed rendezvous
######################
export MASTER_PORT=29540
export MASTER_ADDR=127.0.0.1

######################
# 0. Basic environment variables (wandb)
######################
export WANDB_BASE_URL="https://api.wandb.ai"
export WANDB_PROJECT="qwen3_asr_swift_dapo"   # project name, as shown in the W&B UI
export WANDB_ENTITY="pang_kaiyu-none"         # entity, as shown in the W&B UI

# The credential must come from the caller's environment, never from this file.
if [[ -n "${WANDB_API_KEY:-}" ]]; then
  export WANDB_API_KEY
else
  # Not fatal on purpose: wandb is expected to fall back to credentials stored
  # by `wandb login` -- TODO confirm on the training host.
  printf 'warning: WANDB_API_KEY is not set; relying on stored wandb credentials\n' >&2
fi

# Original author's note: intended so that only one process writes wandb logs
# during multi-GPU training (optional).
export WANDB_MODE=online

######################
# Runtime / debugging environment
######################
export NPROC_PER_NODE=4
export SWIFT_SINGLE_DEVICE_MODE=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Variables prefixed ASR_REWARD_DEBUG* are presumably read by the custom reward
# plugin (0417_reward.py) -- verify against that file.
export ASR_REWARD_DEBUG=1
export ASR_REWARD_DEBUG_PATH="${OUTPUT_DIR}/reward_debug"
export ASR_REWARD_DEBUG_MAX_ROWS=1000

######################
# Training command
######################
# Arguments are kept in an array (same order as the original command line) so
# every path is passed as exactly one word and groups can carry comments.
swift_args=(
  --rlhf_type grpo
  # Two plugin files: model/template registration and the reward function.
  --external_plugins
  "${WORK_DIR}/my_qwen3_asr_dapo_register.py"
  "${WORK_DIR}/0417_reward.py"
  --model "${MODEL_DIR}"
  --model_type my_qwen3_asr_rl
  --template my_qwen3_asr_rl
  --dataset "${TRAIN_DATA}"
  --val_dataset "${VAL_DATA}"
  --reward_funcs asr_wer_sub_len_cmp_hallu_dirty_v56
  --train_type lora
  --use_vllm false
  --log_completions true

  # Loss / advantage configuration.
  --loss_type dapo
  --advantage_estimator grpo
  --scale_rewards group
  --num_iterations 2
  --beta 0.04
  --epsilon_high 0.28
  --dynamic_sample true
  --max_resample_times 4
  --overlong_filter true
  --truncation_strategy delete

  # Generation and batch sizing.
  --num_generations 12
  --generation_batch_size 48
  --per_device_train_batch_size 4
  --per_device_eval_batch_size 4
  --gradient_accumulation_steps 12
  --num_generations_eval 4
  --max_completion_length 256

  # Sampling.
  --temperature 0.50
  --top_p 0.95
  --top_k 50
  --repetition_penalty 1.08

  # Optimizer schedule.
  --learning_rate 5e-5
  --lr_scheduler_type cosine
  --warmup_ratio 0.03

  # Reporting and checkpointing.
  --report_to wandb
  --run_name "${RUN_NAME}"
  --output_dir "${OUTPUT_DIR}"
  --save_strategy steps
  --save_steps 20
  --logging_steps 5

  # Trainable modules and data handling.
  --freeze_llm false
  --freeze_vit false
  --freeze_aligner false
  --remove_unused_columns false
  --padding_side left
)

swift rlhf "${swift_args[@]}"