#!/bin/bash
#
# Launches a single-process torchrun job that runs train.py with DeepSpeed
# (ds_configs/stage2.json). A copy of train_8B.sh is stored in the run's
# output directory, and the job's stdout is appended to run.log there.
#
# Usage: run from the directory that contains train.py, train_8B.sh and
# ds_configs/ (all paths below are relative to the current directory).

# Make the `torchrun | tee` pipeline report torchrun's failure; without this
# the pipeline's exit status is always tee's, so a crashed run looks successful.
set -o pipefail

# Optional HTTP(S) proxy:
# export https_proxy=http://192.168.102.101:7890
# export http_proxy=http://192.168.102.101:7890
#
# Mirror for HuggingFace (bypasses the proxy entirely):
# export HF_ENDPOINT=https://hf-mirror.com
# export no_proxy="hf-mirror.com"

# NOTE: training with DeepSpeed requires patching a bug in "tf4.53"
# (presumably transformers 4.53 -- confirm which fix is meant).

export WANDB_MODE=offline  # no network access: keep wandb logging local

# Keep all download caches under ./.cache in the working directory.
export HF_HOME="./.cache"
export HF_DATASETS_CACHE="./.cache/datasets"
export TRANSFORMERS_CACHE="./.cache/transformers"
export TIMM_CACHE="./.cache/timm"

MODEL=LLaMA3_modify
MAX_LEN=8192
RUN_NAME=zigzag_mask0_a19_b64_1e-5
OUTPUT_DIR="experiments/${MODEL}/${MAX_LEN}/${RUN_NAME}"

mkdir -p -- "$OUTPUT_DIR"

# Snapshot the launcher next to the results (presumably this script itself is
# train_8B.sh -- confirm). Failure here is non-fatal: the run still starts.
cp -- train_8B.sh "$OUTPUT_DIR/train.sh"

# Alternative rendezvous port used previously: --master_port=29588

export CUDA_VISIBLE_DEVICES="1"  # may need to be changed to select another GPU

# NCCL_* and WANDB_PROJECT are set only in the environment of this command.
# NOTE(review): --max_steps 500 is below --save_steps 1000 and is passed
# alongside --num_train_epochs 1; how train.py resolves these is not visible
# here -- confirm that checkpoints are written as intended.
# NOTE(review): only stdout is piped to run.log; stderr goes to the terminal.
NCCL_P2P_DISABLE=0 NCCL_IB_DISABLE=0 WANDB_PROJECT="kvcache" \
  torchrun --nproc_per_node=1 --master_port=29591 train.py \
  --t 2.0 \
  --alpha 19 \
  --ddp_find_unused_parameters=False \
  --model_name_or_path modify_llama3_zigzag \
  --output_dir "$OUTPUT_DIR" \
  --cache_dir /inspire/hdd/project/heziweiproject/heziwei-25044/projects_lmlu/datasets \
  --model_max_length "$MAX_LEN" \
  --report_to wandb \
  --run_name "${MODEL}-${RUN_NAME}-${MAX_LEN}" \
  --use_flash_attn True \
  --low_rank_training False \
  --num_train_epochs 1 \
  --save_strategy "steps" \
  --save_steps 1000 \
  --save_total_limit 2 \
  --logging_steps 1 \
  --tf32 True \
  --max_steps 500 \
  --per_device_train_batch_size 8 \
  --gradient_accumulation_steps 8 \
  --learning_rate 1e-5 \
  --weight_decay 0.1 \
  --adam_beta1 0.9 \
  --adam_beta2 0.95 \
  --bf16 True \
  --deepspeed "ds_configs/stage2.json" \
  | tee -a "$OUTPUT_DIR/run.log"