Delete train_openchat_starling.sh
Browse files- train_openchat_starling.sh +0 -54
train_openchat_starling.sh
DELETED
|
@@ -1,54 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
#
# Launches train/train.py under torchrun (8 processes on one node) with
# distillation enabled, using openchat_3.5 as the base model and "starling"
# as the teacher name in the checkpoint path. After the training command
# returns, waits 60 seconds and then cancels a scheduler job via `yhcancel`.
#
# NOTE: `set -e` is intentionally not enabled. The trailing sleep/yhcancel
# must run even if training fails, which is how the script has always behaved.

PROJ_PATH=/GLOBALFS/sysu_xjquan_2/yangzy/fusechat
BASE_MODEL_NAME=openchat_3.5
TEACHER_MODEL_NAME=starling
MODEL_PATH="/GLOBALFS/sysu_xjquan_2/yangzy/models/${BASE_MODEL_NAME}"
MODEL_SAVE_PATH="${PROJ_PATH}/model_ckpt/${BASE_MODEL_NAME}_${TEACHER_MODEL_NAME}_ckpt1"

# Job ID handed to yhcancel at the end of the script. Defaults to the
# previously hard-coded value; can be overridden from the environment.
# NOTE(review): presumably this is the job that holds the allocation the
# script runs in -- confirm, and consider deriving it from the scheduler.
CANCEL_JOB_ID="${CANCEL_JOB_ID:-2510}"

# -p: create missing parents and do not complain if the directory already
# exists (e.g. when the script is re-run).
mkdir -p -- "${MODEL_SAVE_PATH}"

# Comma-separated list of the four representation splits (split0..split3),
# passed to train.py as a single --data_path value.
dataset_dir=""
for split in 0 1 2 3; do
  dataset_dir+="${dataset_dir:+,}${PROJ_PATH}/representations/openchat_starling_internlm_representation_fnan_split${split}"
done

# OpenChat-3.5-7B <-> Starling-LM-7B-alpha
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
torchrun --nproc_per_node=8 --master_port=20001 "${PROJ_PATH}/train/train.py" \
  --model_name_or_path "${MODEL_PATH}" \
  --data_path "${dataset_dir}" \
  --bf16 True \
  --output_dir "${MODEL_SAVE_PATH}" \
  --num_train_epochs 3 \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --gradient_accumulation_steps 2 \
  --evaluation_strategy "no" \
  --save_strategy "epoch" \
  --save_steps 10000 \
  --save_total_limit 5 \
  --learning_rate 5e-6 \
  --weight_decay 0. \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --fsdp "full_shard auto_wrap" \
  --fsdp_transformer_layer_cls_to_wrap 'MistralDecoderLayer' \
  --tf32 True \
  --model_max_length 2048 \
  --gradient_checkpointing True \
  --conv_temp "openchat" \
  --lazy_preprocess True \
  --flash_attn_transformers True \
  --do_train \
  --do_distill \
  --distill_with_ref_model True \
  --distill_with_aligned_model_0 True \
  --distill_with_aligned_model_1 False \
  --distill_loss_type "ce" \
  --distill_teacher_temperature 1.0 \
  --lm_loss_weight 0.9 \
  --distill_greater_as_gt True \
  --distill_greater_as_gt_type hard \
  --dataloader_num_workers 8 \
  --remove_unused_columns False

# Pause for a minute before cancelling the job.
# NOTE(review): presumably this lets output and checkpoints flush -- confirm.
sleep 60s

yhcancel "${CANCEL_JOB_ID}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|