Elemmire
/

Dyna

Model card Files Files and versions

Dyna / train.sh

Elemmire's picture

Upload folder using huggingface_hub

17f3380 verified about 1 month ago

history blame contribute delete

1.86 kB

	#!/bin/bash
	# Launches train.py through torchrun (one process, --nproc_per_node=1) with
	# the DeepSpeed config ds_configs/stage2.json, and appends the launcher's
	# stdout to run.log inside the experiment output directory.

	# The final command is a pipeline into tee; without pipefail the script's
	# exit status would be tee's and a failed training run would look successful.
	set -o pipefail

	# export https_proxy=http://192.168.102.101:7890
	# export http_proxy=http://192.168.102.101:7890
	# # Mirror used for HuggingFace (bypasses the proxy entirely)
	# export HF_ENDPOINT=https://hf-mirror.com
	# export no_proxy="hf-mirror.com"

	### When training with ds, a bug in tf4.53 has to be fixed first!
	### NOTE(review): "ds"/"tf" presumably mean DeepSpeed / transformers — confirm.
	export WANDB_MODE=offline ## no network access

	# Keep all caches local to the working directory.
	export HF_HOME="./.cache"
	export HF_DATASETS_CACHE="./.cache/datasets"
	export TRANSFORMERS_CACHE="./.cache/transformers"
	export TIMM_CACHE="./.cache/timm"

	MODEL=LLaMA3_modify
	MAX_LEN=8192
	RUN_NAME=zigzag_mask0_a19_b64_1e-5
	OUTPUT_DIR="experiments/$MODEL/$MAX_LEN/$RUN_NAME"

	mkdir -p "$OUTPUT_DIR"
	# Snapshot the launch script next to the run's outputs.
	# NOTE(review): this copies train_8B.sh from the current directory; confirm
	# that is the script actually being launched.
	cp train_8B.sh "$OUTPUT_DIR/train.sh"

	#--master_port=29588
	export CUDA_VISIBLE_DEVICES="1" # may need to pin a specific GPU

	# Arguments forwarded to train.py, in the same order as before. An array is
	# used so each value stays one word and the groups can carry comments.
	train_args=(
	  --t 2.0
	  --alpha 19
	  --ddp_find_unused_parameters=False
	  --model_name_or_path modify_llama3_zigzag
	  --output_dir "$OUTPUT_DIR"
	  --cache_dir /inspire/hdd/project/heziweiproject/heziwei-25044/projects_lmlu/datasets
	  --model_max_length "$MAX_LEN"
	  --report_to wandb
	  --run_name "${MODEL}-${RUN_NAME}-${MAX_LEN}"
	  --use_flash_attn True
	  --low_rank_training False
	  --num_train_epochs 1
	  --save_strategy "steps"
	  --save_steps 1000
	  --save_total_limit 2
	  --logging_steps 1
	  --tf32 True
	  --max_steps 500

	  # Batch size and optimizer settings.
	  --per_device_train_batch_size 8
	  --gradient_accumulation_steps 8
	  --learning_rate 1e-5
	  --weight_decay 0.1
	  --adam_beta1 0.9
	  --adam_beta2 0.95
	  --bf16 True

	  --deepspeed "ds_configs/stage2.json"
	)

	# The pipe must be unescaped: a "\|" would be passed to train.py as a literal
	# argument and run.log would never be written. Only stdout is piped to tee;
	# stderr still goes to the terminal.
	NCCL_P2P_DISABLE=0 NCCL_IB_DISABLE=0 WANDB_PROJECT="kvcache" \
	  torchrun --nproc_per_node=1 --master_port=29591 train.py "${train_args[@]}" \
	  | tee -a "$OUTPUT_DIR/run.log"