#!/usr/bin/bash
# Hugging Face cache location and mirror endpoint for this machine.
export HF_HOME="/root/workspace/huggingface_cache"
export HF_ENDPOINT=https://hf-mirror.com
# export HF_HOME="../../autodl-fs/hf_cache"

# Forward every CLI argument to the trainer as one flat string.
params=""
[ $# -eq 0 ] || params="$*"

# Environment variables double as launch parameters for convenience, e.g.:
#   NNODE=1 NGPU=8 LOG_RANK=0 ./train.sh
NNODE="${NNODE:-1}"
NGPU="${NGPU:-4}"
DEVICES="${DEVICES:-0,1,2,3}"
LOG_RANK="${LOG_RANK:-0}"

# Rendezvous endpoint defaults suit a single-node run; multi-node launches
# should set MASTER_ADDR/MASTER_PORT explicitly.
[[ -n "${MASTER_ADDR}" ]] || export MASTER_ADDR="localhost"
[[ -n "${MASTER_PORT}" ]] || export MASTER_PORT="0"
# Usage:
#   bash train.sh -h
#
# Training a 340M model:
#   NNODE=1 NGPU=8 LOG_RANK=0 bash train.sh \
#     --job.config_file flame/models/fla.toml \
#     --job.dump_folder exp/transformer-340M-10B/batch32.seqlen2048.warmup1024.update1.steps20480.lr3e-4 \
#     --model.config configs/transformer_340M.json \
#     --model.tokenizer_path fla-hub/transformer-1.3B-100B \
#     --optimizer.name AdamW \
#     --optimizer.eps 1e-15 \
#     --optimizer.lr 3e-4 \
#     --lr_scheduler.warmup_steps 1024 \
#     --lr_scheduler.lr_min 0.1 \
#     --lr_scheduler.decay_type cosine \
#     --training.batch_size 32 \
#     --training.seq_len 2048 \
#     --training.gradient_accumulation_steps 1 \
#     --training.steps 20480 \
#     --training.max_norm 1.0 \
#     --training.skip_nan_inf \
#     --training.dataset HuggingFaceFW/fineweb-edu \
#     --training.dataset_name default \
#     --training.dataset_split train \
#     --training.streaming \
#     --training.num_workers 32 \
#     --training.prefetch_factor 2 \
#     --training.seed 42 \
#     --training.compile \
#     --training.tensor_parallel_degree 1 \
#     --training.disable_loss_parallel \
#     --checkpoint.interval 2048 \
#     --checkpoint.load_step -1 \
#     --metrics.log_freq 1
echo "Launching training..."
set -x

# Pull a few values back out of the raw argument string; they are needed
# both for the code snapshot below and for the post-training HF conversion.
# Each lookahead grabs the token following the flag; empty if the flag is absent.
path=$(grep -oP '(?<=--job.dump_folder )[^ ]+' <<< "$params")
steps=$(grep -oP '(?<=--training.steps )[^ ]+' <<< "$params")
config=$(grep -oP '(?<=--model.config )[^ ]+' <<< "$params")
tokenizer=$(grep -oP '(?<=--model.tokenizer_path )[^ ]+' <<< "$params")

# Resolve the model type from the HF config JSON (importing fla first
# registers the custom model classes with transformers).
model=$(
  python -c "import fla, sys; from transformers import AutoConfig; print(AutoConfig.from_pretrained(sys.argv[1]).to_json_string())" "$config" | jq -r '.model_type'
)

# Snapshot the code used for this run into the dump folder so the experiment
# is reproducible. Quote "$path" everywhere — it comes from user input and
# may be empty or contain spaces; '--' guards against paths starting with '-'.
mkdir -p -- "$path"
cp -- * "$path"
cp -r -- configs "$path"
cp -r -- flame "$path"
cp -r -- 3rdparty/flash-linear-attention/fla "$path"
cp -r -- 3rdparty/torchtitan/torchtitan "$path"

# for offline systems
# export TRANSFORMERS_OFFLINE=1
# export HF_DATASETS_OFFLINE=1
# export HF_HUB_OFFLINE=1

# Timestamp used to build the run id; overridable via the 'date' env var
# so restarts can reuse an existing W&B run.
date=${date:-$(date +%Y%m%d%H%M)}

RUN_NAME="$model-$(basename -- "$path")"
RUN_ID="$RUN_NAME-$date"

# Allow resuming the same W&B run across restarts; caller-provided values
# always win over the defaults derived here.
export WANDB_RESUME=allow
export WANDB_PROJECT="${WANDB_PROJECT:-fla}"
export WANDB_NAME="${WANDB_NAME:-$RUN_NAME}"
export WANDB_RUN_ID="${WANDB_RUN_ID:-$RUN_ID}"
# Launch the distributed training job.
#
# BUGFIX: the commented-out systemd-run line previously sat *inside* the
# backslash-continuation chain; the '#' terminated the command there, so
# CUDA_VISIBLE_DEVICES and PYTORCH_CUDA_ALLOC_CONF were set as plain shell
# variables and torchrun launched WITHOUT them. Keep comments out of
# continued command lines.
# (To cap memory, prefix the command with:
#   systemd-run --scope --user -p MemoryHigh=80G)
CUDA_VISIBLE_DEVICES="${DEVICES}" \
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
torchrun --nnodes="${NNODE}" \
  --nproc_per_node="${NGPU}" \
  --rdzv_backend c10d \
  --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
  --local-ranks-filter "${LOG_RANK}" \
  --role rank \
  --tee 3 \
  --log-dir "$path/logs" \
  -m flame.train \
  $params
# $params is intentionally unquoted: it holds many space-separated flags
# that must word-split back into individual torchrun arguments.
echo "TRAINING DONE!"

# Convert the final DCP checkpoint (at --training.steps) to HF format.
echo "Converting the DCP checkpoints to HF format..."
python -m flame.utils.convert_dcp_to_hf \
  --path "$path" \
  --step "$steps" \
  --config "$config" \
  --tokenizer "$tokenizer"
echo "RUNNING DONE!"