# Parse key=value command-line arguments: every positional argument is
# evaluated as a shell assignment, e.g. `lr=3e-4` sets the variable `lr`.
args=$@
for arg in $args; do
    eval "$arg"
done

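# Example invocation (the script name, dataset path, and output path below are
# illustrative placeholders, not values from this repo):
#   bash train.sh type=gla data=/path/to/corpus path=exp/gla-1.3B \
#       batch=8 lr=3e-4 steps=20480 warmup=1024
# The `${var:=default}` expansions in the echo lines below both print and
# assign the default for any variable that was not set this way.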
| | echo "model: ${model:=fla-hub/gla-1.3B-100B}" |
| | echo "tokenizer: ${tokenizer:=/mnt/jfzn/msj/download_model/delta_net-1.3B-100B}" |
| | echo "project: ${project:=fla}" |
| | echo "type: ${type:=gla}" |
| | echo "data: ${data:=}" |
| | echo "name: ${name:=}" |
| | echo "cache: ${cache:=}" |
| | echo "varlen: ${varlen:=false}" |
| | echo "seed: ${seed:=42}" |
| | echo "context: ${context:=2048}" |
| | echo "steps: ${steps:=0}" |
| | echo "save: ${save:=2048}" |
| | echo "limit: ${limit:=1}" |
| | echo "preprocessing: ${preprocessing:=32}" |
| | echo "workers: ${workers:=32}" |
| | echo "prefetch: ${prefetch:=2}" |
| | echo "logging: ${logging:=32}" |
| | echo "config: ${config:=configs/deepspeed.yaml}" |
| |
|
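# Optimization hyperparameters, checkpoint/output paths, and multi-node
# topology; all overridable on the command line like the settings above.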
| | echo "lr: ${lr:=3e-4}" |
| | echo "scheduler: ${scheduler:=cosine_with_min_lr}" |
| | echo "epochs: ${epochs:=1}" |
| | echo "optim: ${optim:=adamw_torch_fused}" |
| | echo "decay: ${decay:=0.01}" |
| | echo "beta1: ${beta1:=0.9}" |
| | echo "beta2: ${beta2:=0.95}" |
| | echo "norm: ${norm:=1.0}" |
| | echo "batch: ${batch:=32}" |
| | echo "update: ${update:=1}" |
| | echo "warmup: ${warmup:=512}" |
| | echo "path: ${path:=}" |
| | echo "checkpoint: ${checkpoint:=}" |
| | echo "node: ${node:=}" |
| | echo "rank: ${rank:=}" |
| | echo "ip: ${ip:=}" |
| | echo "port: ${port:=}" |
| | echo "nodes: ${nodes:=1}" |
| | echo "gpus: ${gpus:=8}" |
| | echo "tasks: ${tasks:run}" |
| |
|
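# Assemble the argument string handed to the Python entry point
# (run.py / finetune.py, launched at the bottom of this script).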
| | params="--model_name_or_path $model \ |
| | --tokenizer $tokenizer \ |
| | --use_fast_tokenizer \ |
| | --do_train \ |
| | --dataset $data \ |
| | --context_length $context \ |
| | --preprocessing_num_workers $preprocessing \ |
| | --dataloader_num_workers $workers \ |
| | --dataloader_prefetch_factor $prefetch \ |
| | --output_dir $path \ |
| | --overwrite_output_dir \ |
| | --logging_steps $logging \ |
| | --include_num_input_tokens_seen \ |
| | --save_steps $save \ |
| | --save_total_limit $limit \ |
| | --learning_rate $lr \ |
| | --lr_scheduler_type $scheduler \ |
| | --warmup_steps $warmup \ |
| | --optim $optim \ |
| | --weight_decay $decay \ |
| | --adam_beta1=$beta1 \ |
| | --adam_beta2=$beta2 \ |
| | --max_grad_norm $norm \ |
| | --num_train_epochs $epochs \ |
| | --per_device_train_batch_size $batch \ |
| | --gradient_accumulation_steps $update \ |
| | --seed $seed \ |
| | --logging_steps $logging \ |
| | --log_level info \ |
| | --bf16" |
| |
|
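# Optional arguments: each flag below is appended only when the corresponding
# variable was provided (or, for steps, is positive).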
if [ $steps -gt 0 ]; then
    params+=" --max_steps $steps"
fi

if [ "$name" != "" ]; then
    params+=" --dataset_name $name"
fi
if [ "$cache" != "" ]; then
    params+=" --cache_dir $cache"
fi
if [ "$varlen" == "true" ]; then
    params+=" --varlen"
fi
if [ "$checkpoint" != "" ]; then
    params+=" --resume_from_checkpoint $checkpoint"
    echo "Resuming from checkpoint: $checkpoint"
fi

# Disable third-party experiment reporting (e.g., wandb).
params+=" --report_to none"

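# Distributed launch setup: extra accelerate flags are added only for
# multi-node runs, i.e. when a machine rank was passed in.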
| | echo "Launching training..." |
| | accelerate_params="" |
| | if [ "$rank" != "" ]; then |
| | accelerate_params+=" --machine_rank $rank \ |
| | --num_processes $((nodes * gpus)) \ |
| | --num_machines $nodes \ |
| | --main_process_ip $ip \ |
| | --main_process_port $port \ |
| | --same_network" |
| | fi |
| |
|
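# If the accelerate config path mentions "deepspeed", write a bf16 ZeRO
# stage-2 DeepSpeed JSON config plus a matching accelerate YAML config.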
if [[ $config == *"deepspeed"* ]]; then
cat <<EOF > "configs/ds_config.json"
{
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "zero_allow_untested_optimizer": true,
    "bf16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "reduce_scatter": true,
        "reduce_bucket_size": 5e8,
        "overlap_comm": false,
        "contiguous_gradients": true
    }
}
EOF
cat <<EOF > $config
compute_environment: LOCAL_MACHINE
distributed_type: DEEPSPEED
deepspeed_config:
  deepspeed_config_file: configs/ds_config.json
  zero3_init_flag: true
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: $gpus
use_cpu: false
EOF
fi
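# If the config path mentions "fsdp" instead, write an accelerate FSDP config
# (transformer auto-wrap, HYBRID_SHARD_ZERO2 sharding, bf16).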
if [[ $config == *"fsdp"* ]]; then
cat <<EOF > $config
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_forward_prefetch: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_offload_params: false
  fsdp_sharding_strategy: HYBRID_SHARD_ZERO2
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: $nodes
num_processes: $((nodes * gpus))
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
EOF
fi

cat $config

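# Snapshot the launcher, configs, and source trees into the output directory
# for reproducibility, then force offline mode for transformers/datasets.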
set -x
mkdir -p $path
cp * $path
cp -r configs $path
cp -r flame $path
cp -r fla2 $path
cp -r fla3 $path

export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
if [ "$date" == "" ]; then
    date=$(date +%Y%m%d%H%M)
fi

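# Pretraining: launch run.py through accelerate when "run" is among the tasks.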
| | if [[ "$tasks" == *run* ]]; then |
| | echo "HERE" |
| | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch $accelerate_params --main_process_port 23455 --config_file $config \ |
| | run.py $params |
| | fi |
| |
|
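# Finetuning: launch finetune.py with the same arguments when "finetune" is
# among the tasks.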
| | if [[ "$tasks" == *finetune* ]]; then |
| | echo "THERE" |
| | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch $accelerate_params --main_process_port 23455 --config_file $config \ |
| | finetune.py $params |
| | fi |
| |
|
| | echo "RUNNING DONE!" |
| |
|