ZhenbinWang committed on
Commit
e1c85df
·
verified ·
1 Parent(s): 01036d5

Upload train.sh

Browse files
Files changed (1) hide show
  1. train.sh +129 -0
train.sh ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/bash

# Hugging Face cache location and download mirror.
export HF_HOME="../hf_cache"
export HF_ENDPOINT=https://hf-mirror.com
# export HF_HOME="../../autodl-fs/hf_cache"

# All CLI arguments collapsed into one flat string (empty when none given);
# the launcher later word-splits this back into individual flags.
params="$*"

# Environment variables double as local parameters for convenience, e.g.:
#   NNODE=1 NGPU=8 LOG_RANK=0 ./train.sh
NNODE=${NNODE:-1}
NGPU=${NGPU:-1}
DEVICES=${DEVICES:-0}
LOG_RANK=${LOG_RANK:-0}

# Rendezvous defaults for single-node runs; port 0 lets the OS pick one.
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
export MASTER_PORT="${MASTER_PORT:-0}"
28
+
29
+ : '
30
+ Usage:
31
+
32
+ bash train.sh -h
33
+
34
+ Training a 340M model:
35
+
36
+ NNODE=1 NGPU=8 LOG_RANK=0 bash train.sh \
37
+ --job.config_file flame/models/fla.toml \
38
+ --job.dump_folder exp/transformer-340M-10B/batch32.seqlen2048.warmup1024.update1.steps20480.lr3e-4 \
39
+ --model.config configs/transformer_340M.json \
40
+ --model.tokenizer_path fla-hub/transformer-1.3B-100B \
41
+ --optimizer.name AdamW \
42
+ --optimizer.eps 1e-15 \
43
+ --optimizer.lr 3e-4 \
44
+ --lr_scheduler.warmup_steps 1024 \
45
+ --lr_scheduler.lr_min 0.1 \
46
+ --lr_scheduler.decay_type cosine \
47
+ --training.batch_size 32 \
48
+ --training.seq_len 2048 \
49
+ --training.gradient_accumulation_steps 1 \
50
+ --training.steps 20480 \
51
+ --training.max_norm 1.0 \
52
+ --training.skip_nan_inf \
53
+ --training.dataset HuggingFaceFW/fineweb-edu \
54
+ --training.dataset_name default \
55
+ --training.dataset_split train \
56
+ --training.streaming \
57
+ --training.num_workers 32 \
58
+ --training.prefetch_factor 2 \
59
+ --training.seed 42 \
60
+ --training.compile \
61
+ --training.tensor_parallel_degree 1 \
62
+ --training.disable_loss_parallel \
63
+ --checkpoint.interval 2048 \
64
+ --checkpoint.load_step -1 \
65
+ --metrics.log_freq 1
66
+ '
67
+
68
echo "Launching training..."

set -x

# Recover key settings from the flat argument string. The '.' in each option
# name is escaped so the lookbehind matches literally rather than any char.
# NOTE(review): this parsing breaks if an option value contains spaces; the
# launcher below also word-splits $params deliberately, so values with
# spaces are unsupported end to end.
path=$(grep -oP '(?<=--job\.dump_folder )[^ ]+' <<< "$params")
steps=$(grep -oP '(?<=--training\.steps )[^ ]+' <<< "$params")
config=$(grep -oP '(?<=--model\.config )[^ ]+' <<< "$params")
tokenizer=$(grep -oP '(?<=--model\.tokenizer_path )[^ ]+' <<< "$params")

# Fail fast instead of running mkdir/cp/torchrun against empty paths.
if [[ -z "$path" || -z "$config" ]]; then
    echo "error: --job.dump_folder and --model.config are required" >&2
    exit 1
fi

# Resolve the model type from the config JSON via transformers + jq.
model=$(
    python -c "import fla, sys; from transformers import AutoConfig; print(AutoConfig.from_pretrained(sys.argv[1]).to_json_string())" "$config" | jq -r '.model_type'
)

# Snapshot the code into the dump folder for reproducibility.
mkdir -p -- "$path"
cp -- * "$path"            # top-level files only; 'omitting directory' warnings are expected
cp -r -- configs "$path"
cp -r -- flame "$path"
cp -r -- 3rdparty/flash-linear-attention/fla "$path"
cp -r -- 3rdparty/torchtitan/torchtitan "$path"

# for offline systems
# export TRANSFORMERS_OFFLINE=1
# export HF_DATASETS_OFFLINE=1
# export HF_HUB_OFFLINE=1

# Run timestamp; the lowercase 'date' env var is kept as an override for
# backward compatibility even though it shadows the command name.
date=${date:-$(date +%Y%m%d%H%M)}
RUN_NAME="$model-$(basename -- "$path")"
RUN_ID="$RUN_NAME-$date"

# wandb bookkeeping: allow resuming the same run across restarts; each
# variable may be pre-set by the caller, otherwise default it here.
export WANDB_RESUME=allow
export WANDB_PROJECT="${WANDB_PROJECT:-fla}"
export WANDB_NAME="${WANDB_NAME:-$RUN_NAME}"
export WANDB_RUN_ID="${WANDB_RUN_ID:-$RUN_ID}"

# Launch distributed training. $params is intentionally unquoted so each
# whitespace-separated flag/value becomes its own argv entry.
CUDA_VISIBLE_DEVICES=${DEVICES} \
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
torchrun --nnodes=${NNODE} \
    --nproc_per_node=${NGPU} \
    --rdzv_backend c10d \
    --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
    --local-ranks-filter ${LOG_RANK} \
    --role rank \
    --tee 3 \
    --log-dir "$path/logs" \
    -m flame.train \
    $params

echo "TRAINING DONE!"
echo "Converting the DCP checkpoints to HF format..."

# Convert the final distributed (DCP) checkpoint to a Hugging Face model dir.
python -m flame.utils.convert_dcp_to_hf \
    --path "$path" \
    --step "$steps" \
    --config "$config" \
    --tokenizer "$tokenizer"

echo "RUNNING DONE!"