#!/bin/bash
#
# Launches a single-process torchrun job that runs train.py for the
# LLaMA3_modify "zigzag" experiment and appends the job's stdout to
# $OUTPUT_DIR/run.log.
#
# All paths (train.py, train_8B.sh, ds_configs/, experiments/, ./.cache) are
# relative, so run this from the directory that contains them.

# Abort on errors and unset variables, and make a pipeline fail when any stage
# fails. Without pipefail the `torchrun | tee` pipeline below would report
# tee's exit status, hiding a crashed training run from the caller.
set -euo pipefail

# Put Weights & Biases in offline mode.
export WANDB_MODE=offline

# Cache-location environment variables, all under a local ./.cache tree.
# NOTE(review): verify each variable is honoured by the installed library
# versions.
export HF_HOME="./.cache"
export HF_DATASETS_CACHE="./.cache/datasets"
export TRANSFORMERS_CACHE="./.cache/transformers"
export TIMM_CACHE="./.cache/timm"

# Experiment identity; together these determine the output directory and the
# run name passed to train.py.
# NOTE(review): RUN_NAME appears to encode --alpha (a19), the effective batch
# size 8 x 8 (b64) and --learning_rate (1e-5); keep it in sync with the flags
# below if that is the convention.
MODEL=LLaMA3_modify
MAX_LEN=8192
RUN_NAME=zigzag_mask0_a19_b64_1e-5
OUTPUT_DIR="experiments/$MODEL/$MAX_LEN/$RUN_NAME"

mkdir -p -- "$OUTPUT_DIR"

# Snapshot the launch script next to the outputs. Best-effort: a failed copy
# only warns and the run still proceeds.
cp -- train_8B.sh "$OUTPUT_DIR/train.sh" \
  || printf 'warning: could not copy train_8B.sh to %s\n' "$OUTPUT_DIR" >&2

# Expose only GPU index 1 to the job.
export CUDA_VISIBLE_DEVICES="1"

# Arguments forwarded to train.py, in the original order.
train_args=(
  # Options whose meaning is defined by train.py / the model code.
  --t 2.0
  --alpha 19
  --ddp_find_unused_parameters=False
  --model_name_or_path modify_llama3_zigzag
  --output_dir "$OUTPUT_DIR"
  --cache_dir /inspire/hdd/project/heziweiproject/heziwei-25044/projects_lmlu/datasets
  --model_max_length "$MAX_LEN"
  --report_to wandb
  --run_name "$MODEL-$RUN_NAME-$MAX_LEN"
  --use_flash_attn True
  --low_rank_training False

  # Schedule, checkpointing and logging.
  # NOTE(review): --save_steps (1000) exceeds --max_steps (500). If train.py
  # follows Hugging Face TrainingArguments semantics, no step-based checkpoint
  # is written during the run and --max_steps takes precedence over
  # --num_train_epochs. Confirm this is intended.
  --num_train_epochs 1
  --save_strategy "steps"
  --save_steps 1000
  --save_total_limit 2
  --logging_steps 1
  --tf32 True
  --max_steps 500

  # Batch size, optimizer and precision.
  --per_device_train_batch_size 8
  --gradient_accumulation_steps 8
  --learning_rate 1e-5
  --weight_decay 0.1
  --adam_beta1 0.9
  --adam_beta2 0.95
  --bf16 True

  # DeepSpeed configuration file.
  --deepspeed "ds_configs/stage2.json"
)

# The NCCL_* and WANDB_PROJECT assignments apply to this command only.
# Only stdout is piped into tee; stderr goes straight to the terminal and is
# not written to run.log. With pipefail set above, a non-zero torchrun exit
# becomes this script's exit status.
NCCL_P2P_DISABLE=0 NCCL_IB_DISABLE=0 WANDB_PROJECT="kvcache" \
  torchrun --nproc_per_node=1 --master_port=29591 train.py "${train_args[@]}" \
  | tee -a "$OUTPUT_DIR/run.log"