File size: 1,564 Bytes
981b783 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# Read the user's zsh startup file into this shell before using conda.
# NOTE(review): presumably this is where conda's shell hook gets initialised so
# that `conda activate` works in a non-interactive script — confirm.
. ~/.zshrc
# Switch to the `llama-factory` conda environment for the commands that follow.
conda activate llama-factory
# echo "[$(date)] SFT Training Start"
# CUDA_VISIBLE_DEVICES=0,1,2,3 FORCE_TORCHRUN=1 llamafactory-cli train qwen2.5_full_sft.yaml > logs/train_sft.log 2>&1 && \
# echo "[$(date)] SFT Training End"
# echo "[$(date)] DPO Training Start"
# CUDA_VISIBLE_DEVICES=0,1,2,3 FORCE_TORCHRUN=1 llamafactory-cli train qwen2.5_lora_dpo.yaml > logs/train_dpo.log 2>&1 && \
# echo "[$(date)] DPO Training End"
# echo "[$(date)] Merging Checkpoints"
# CUDA_VISIBLE_DEVICES=0 zsh run_merge_fix.sh /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/sft 600 800 > logs/merge.log 2>&1 && \
# echo "[$(date)] Merging Checkpoints End"
# conda activate optima-vllm
# --- Inference stage ----------------------------------------------------------
# Starts four infer.sh runs in parallel (one per GPU, each with its own log
# file) and then blocks until every run has exited, so that the
# "Inference End" timestamp marks real completion rather than the moment the
# jobs were launched.

# Directory holding the merged DPO checkpoints (checkpoint-<step>).
ckpt_root=/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged

# PIDs of the background runs, collected for the barrier below.
infer_pids=()

# Start one infer.sh run in the background and remember its PID.
# Arguments:
#   $1 - GPU index; exported as CUDA_VISIBLE_DEVICES and also forwarded as
#        infer.sh's first argument
#   $2 - checkpoint step; selects ${ckpt_root}/checkpoint-<step> and is also
#        forwarded as infer.sh's third argument
#   $3 - language pair (e.g. en-de); forwarded as infer.sh's last argument
# Outputs: stdout and stderr of the run go to logs/infer_<step>_<pair>.log
# The literal `true` second argument is forwarded unchanged; its meaning is
# defined by infer.sh.
launch_infer() {
  local gpu=$1 step=$2 pair=$3
  CUDA_VISIBLE_DEVICES="$gpu" zsh infer.sh "$gpu" true "$step" \
    "${ckpt_root}/checkpoint-${step}" "$pair" \
    > "logs/infer_${step}_${pair}.log" 2>&1 &
  infer_pids+=("$!")
}

# The redirections above fail if the log directory does not exist yet.
mkdir -p logs

echo "[$(date)] Inference Start"
launch_infer 0 600 en-de
launch_infer 1 600 en-fr
launch_infer 2 800 en-fr
launch_infer 3 1000 en-fr

# Barrier: wait for every run and count the ones that exited non-zero.
infer_failed=0
for infer_pid in "${infer_pids[@]}"; do
  wait "$infer_pid" || infer_failed=$((infer_failed + 1))
done
echo "[$(date)] Inference End"

# Report failures on stderr; the script's exit status is left unchanged.
if [ "$infer_failed" -gt 0 ]; then
  echo "[$(date)] ${infer_failed} inference job(s) failed; see logs/infer_*.log" >&2
fi
|