Dyna / train.sh
Elemmire's picture
Upload folder using huggingface_hub
17f3380 verified
#!/bin/bash
# Launch a single-process torchrun job for train.py with DeepSpeed and append
# its stdout to $OUTPUT_DIR/run.log.
#
# Run from the directory that contains train.py, train_8B.sh and ds_configs/:
# those paths, the ./.cache directories and experiments/ are all relative to
# the current working directory.
#
# Optional proxy / HuggingFace mirror settings (disabled):
# export https_proxy=http://192.168.102.101:7890
# export http_proxy=http://192.168.102.101:7890
# # Mirror for HuggingFace (bypasses the proxy entirely)
# export HF_ENDPOINT=https://hf-mirror.com
# export no_proxy="hf-mirror.com"
#
# NOTE: training with DeepSpeed (ds) requires fixing a bug in tf 4.53 first!
# ("tf" presumably refers to the transformers package -- TODO confirm.)

# The training command is piped into tee. Without pipefail the pipeline (and
# therefore this script) would report the exit status of tee and hide a
# failed torchrun from schedulers and wrapper scripts.
set -o pipefail

# Keep Weights & Biases offline (no network access).
export WANDB_MODE=offline

# Keep every download/cache directory local to the working directory.
export HF_HOME="./.cache"
export HF_DATASETS_CACHE="./.cache/datasets"
export TRANSFORMERS_CACHE="./.cache/transformers"
export TIMM_CACHE="./.cache/timm"

# Experiment identity; these three values determine the output directory and
# the reported run name.
MODEL=LLaMA3_modify
MAX_LEN=8192
RUN_NAME=zigzag_mask0_a19_b64_1e-5
OUTPUT_DIR="experiments/$MODEL/$MAX_LEN/$RUN_NAME"

mkdir -p -- "$OUTPUT_DIR"

# Keep a copy of the launch script next to the results.
# NOTE(review): this copies train_8B.sh, not necessarily the script being
# executed -- confirm that the archived copy matches what was actually run.
cp -- train_8B.sh "$OUTPUT_DIR/train.sh"

# May need to select a specific GPU.
export CUDA_VISIBLE_DEVICES="1"

# torchrun launcher options (previously tried port: --master_port=29588).
launcher_args=(
  --nproc_per_node=1
  --master_port=29591
)

# Arguments forwarded to train.py.
train_args=(
  # Options specific to this training script.
  --t 2.0
  --alpha 19
  --ddp_find_unused_parameters=False
  --model_name_or_path modify_llama3_zigzag
  --output_dir "$OUTPUT_DIR"
  --cache_dir /inspire/hdd/project/heziweiproject/heziwei-25044/projects_lmlu/datasets
  --model_max_length "$MAX_LEN"
  --report_to wandb
  --run_name "$MODEL-$RUN_NAME-$MAX_LEN"
  --use_flash_attn True
  --low_rank_training False

  # Schedule, checkpointing and logging.
  # NOTE(review): if train.py parses HuggingFace TrainingArguments, a positive
  # max_steps overrides num_train_epochs, and save_steps (1000) above
  # max_steps (500) means no intermediate step checkpoint is written --
  # confirm this is intended.
  --num_train_epochs 1
  --save_strategy "steps"
  --save_steps 1000
  --save_total_limit 2
  --logging_steps 1
  --tf32 True
  --max_steps 500

  # Batch size and optimizer.
  --per_device_train_batch_size 8
  --gradient_accumulation_steps 8
  --learning_rate 1e-5
  --weight_decay 0.1
  --adam_beta1 0.9
  --adam_beta2 0.95
  --bf16 True

  --deepspeed "ds_configs/stage2.json"
)

# Only stdout is appended to run.log; stderr still goes straight to the
# terminal. This pipeline must stay the last command so that, together with
# pipefail, the script exits with the status of torchrun.
NCCL_P2P_DISABLE=0 NCCL_IB_DISABLE=0 WANDB_PROJECT="kvcache" \
  torchrun "${launcher_args[@]}" train.py "${train_args[@]}" \
  | tee -a "$OUTPUT_DIR/run.log"