#!/bin/bash
#
# Launches one torchrun worker running train.py with the DeepSpeed config
# ds_configs/stage2.json, and appends the run's stdout to
# $OUTPUT_DIR/run.log.
#
# Run from the directory that contains train.py, train_8B.sh and
# ds_configs/, since all of those paths (and the ./.cache directories)
# are relative to the current working directory.

# Abort on failed commands and on unset variables. With pipefail the final
# pipeline reports a torchrun failure instead of tee's (successful) status.
set -euo pipefail

# Optional HTTP(S) proxy settings (disabled).
# export https_proxy=http://192.168.102.101:7890
# export http_proxy=http://192.168.102.101:7890
# # Mirror for HuggingFace (bypasses the proxy entirely)
# export HF_ENDPOINT=https://hf-mirror.com
# export no_proxy="hf-mirror.com"

# NOTE: training with DeepSpeed requires first fixing a bug in "tf4.53"
# (presumably transformers 4.53 -- TODO confirm which bug/patch is meant).

export WANDB_MODE=offline # no network access

# Keep every cache under the working directory.
export HF_HOME="./.cache"
export HF_DATASETS_CACHE="./.cache/datasets"
# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
# favor of HF_HOME; kept here so older versions behave the same.
export TRANSFORMERS_CACHE="./.cache/transformers"
export TIMM_CACHE="./.cache/timm"

# Run identity; these three values determine the output directory and the
# run name passed to train.py.
MODEL=LLaMA3_modify
MAX_LEN=8192
RUN_NAME=zigzag_mask0_a19_b64_1e-5
OUTPUT_DIR="experiments/${MODEL}/${MAX_LEN}/${RUN_NAME}"

mkdir -p -- "${OUTPUT_DIR}"

# Keep a copy of the launch script next to the results. This is best-effort:
# a missing train_8B.sh is reported but does not stop the run.
cp -- train_8B.sh "${OUTPUT_DIR}/train.sh" ||
  printf 'warning: could not copy train_8B.sh into %s\n' "${OUTPUT_DIR}" >&2

# (unused) --master_port=29588
export CUDA_VISIBLE_DEVICES="1" # the GPU may need to be selected explicitly

# Arguments forwarded verbatim to train.py.
train_args=(
  --t 2.0
  --alpha 19
  --ddp_find_unused_parameters=False
  --model_name_or_path modify_llama3_zigzag
  --output_dir "${OUTPUT_DIR}"
  --cache_dir /inspire/hdd/project/heziweiproject/heziwei-25044/projects_lmlu/datasets
  --model_max_length "${MAX_LEN}"
  --report_to wandb
  --run_name "${MODEL}-${RUN_NAME}-${MAX_LEN}"
  --use_flash_attn True
  --low_rank_training False

  # Schedule, checkpointing and logging.
  # NOTE(review): save_steps (1000) is larger than max_steps (500), and both
  # num_train_epochs and max_steps are given -- confirm how train.py
  # resolves these.
  --num_train_epochs 1
  --save_strategy "steps"
  --save_steps 1000
  --save_total_limit 2
  --logging_steps 1
  --tf32 True
  --max_steps 500

  # Batch size and optimizer.
  --per_device_train_batch_size 8
  --gradient_accumulation_steps 8
  --learning_rate 1e-5
  --weight_decay 0.1
  --adam_beta1 0.9
  --adam_beta2 0.95
  --bf16 True

  --deepspeed "ds_configs/stage2.json"
)

# Only stdout is piped into tee; stderr still goes to the terminal and is not
# written to run.log.
NCCL_P2P_DISABLE=0 NCCL_IB_DISABLE=0 WANDB_PROJECT="kvcache" \
  torchrun --nproc_per_node=1 --master_port=29591 train.py "${train_args[@]}" \
  | tee -a "${OUTPUT_DIR}/run.log"