File size: 3,040 Bytes
# Positional arguments:
#   $1 dataset_name          label embedded in the output directory name
#   $2 vocab_size            target vocabulary size; must be a positive integer
#   $3 corpus_dir            corpus location; required here for both stages,
#                            forwarded to train_tokenizer in stage 1
#   $4 stage                 training phase to run: 1 or 2
#   $5 phase1_tokenizer_dir  read in stage 2 only (source of merges.txt / meta.json)
#   $6 num_inherit_merges    read in stage 2 only; falls back to 0 when absent
dataset_name=$1
vocab_size=$2
corpus_dir=$3
stage=$4
phase1_tokenizer_dir=$5
num_inherit_merges=$6

# The first four arguments are mandatory regardless of stage.
if [[ -z "$dataset_name" || -z "$vocab_size" || -z "$corpus_dir" || -z "$stage" ]]; then
  echo "Error: Missing required arguments." >&2
  echo "Usage: $0 <dataset_name> <vocab_size> <corpus_dir> <stage> <phase1_tokenizer_dir> <num_inherit_merges>" >&2
  exit 1
fi

# Digits only, with at least one non-zero digit. A pure regex check keeps this
# independent of integer width and of octal interpretation of leading zeros.
positive_int_re='^0*[1-9][0-9]*$'
if ! [[ "$vocab_size" =~ $positive_int_re ]]; then
  echo "Error: Vocab size must be a positive integer." >&2
  exit 1
fi

# Only the two training phases are supported.
case "$stage" in
  1 | 2) ;;
  *)
    echo "Error: Invalid stage. Please specify either 1 or 2." >&2
    exit 1
    ;;
esac
#######################################
# Abbreviate a non-negative decimal integer for use in a directory name.
# Values >= 10^9 / 10^6 / 10^3 are integer-divided by that power and given
# a G / M / K suffix (truncating, so 1500000 -> 1M); smaller values are
# printed as plain decimal.
# Arguments: $1 - digit string; anything else is echoed back unchanged
# Outputs:   the abbreviated label on stdout
#######################################
humanize_count() {
  local raw=$1
  # Not a plain digit string (e.g. empty): nothing to compute, pass it through.
  if ! [[ "$raw" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$raw"
    return 0
  fi
  # Force base 10: inside $(( )) a leading zero would otherwise mean octal,
  # turning 07000 into 3584 and making 08000 an arithmetic error.
  local -i n=$((10#$raw))
  if ((n >= 10 ** 9)); then
    printf '%sG\n' "$((n / 10 ** 9))"
  elif ((n >= 10 ** 6)); then
    printf '%sM\n' "$((n / 10 ** 6))"
  elif ((n >= 10 ** 3)); then
    printf '%sK\n' "$((n / 10 ** 3))"
  else
    printf '%s\n' "$n"
  fi
}

# Short form of vocab_size used when composing tokenizer directory names.
vocab_size_str=$(humanize_count "$vocab_size")
# Run the requested training phase.
#   stage 1: train into tokenizer_json/<dataset>_vocab<size>_stage1 from
#            corpus_dir with whitespace pre-tokenization enabled.
#   stage 2: seed tokenizer_json/<dataset>_vocab<size>_from<N>_stage2 with the
#            first N lines of the phase-1 merges.txt plus its meta.json, then
#            run train_tokenizer on it with whitespace pre-tokenization disabled.
# The script's exit status is that of the final train_tokenizer invocation.
if [[ "$stage" == 1 ]]; then
  tokenizer_dir="tokenizer_json/${dataset_name}_vocab${vocab_size_str}_stage${stage}"
  echo "Phase 1 tokenizer training: $tokenizer_dir"
  python -m train_tokenizer \
    --output_dir "$tokenizer_dir" \
    --corpus_dir "$corpus_dir" \
    --vocab_size "$vocab_size" \
    --do_whitespace_pretokenization true
elif [[ "$stage" == 2 ]]; then
  # num_inherit_merges is optional: anything that is not a non-negative
  # integer (including an absent argument) falls back to 0.
  if [[ "$num_inherit_merges" =~ ^[0-9]+$ ]]; then
    # Normalise to base 10 so a leading zero is not read as octal in the
    # arithmetic below.
    num_inherit_merges=$((10#$num_inherit_merges))
  else
    echo "Warning: num_inherit_merges is invalid or missing. Defaulting to 0." >&2
    num_inherit_merges=0
  fi

  # The phase-1 output must exist before anything is copied from it.
  if [[ ! -d "$phase1_tokenizer_dir" ]]; then
    echo "Error: Phase 1 tokenizer directory '$phase1_tokenizer_dir' not found!" >&2
    exit 1
  fi

  # Abbreviate the merge count (truncating division) for the directory name.
  if ((num_inherit_merges >= 10 ** 9)); then
    num_inherit_merges_str="$((num_inherit_merges / 10 ** 9))G"
  elif ((num_inherit_merges >= 10 ** 6)); then
    num_inherit_merges_str="$((num_inherit_merges / 10 ** 6))M"
  elif ((num_inherit_merges >= 10 ** 3)); then
    num_inherit_merges_str="$((num_inherit_merges / 10 ** 3))K"
  else
    num_inherit_merges_str="$num_inherit_merges"
  fi

  phase2_tokenizer_dir="tokenizer_json/${dataset_name}_vocab${vocab_size_str}_from${num_inherit_merges_str}_stage${stage}"
  echo "Phase 2 tokenizer training: $phase2_tokenizer_dir"

  # Stop on the first failed preparation step rather than training on a
  # partially seeded directory.
  if ! mkdir -p -- "$phase2_tokenizer_dir"; then
    echo "Error: Could not create '$phase2_tokenizer_dir'." >&2
    exit 1
  fi
  # Inherit only the first num_inherit_merges lines of the phase-1 merges.
  if ! head -n "$num_inherit_merges" -- "$phase1_tokenizer_dir/merges.txt" \
    > "$phase2_tokenizer_dir/merges.txt"; then
    echo "Error: Could not read '$phase1_tokenizer_dir/merges.txt'." >&2
    exit 1
  fi
  if ! cp -- "$phase1_tokenizer_dir/meta.json" "$phase2_tokenizer_dir/meta.json"; then
    echo "Error: Could not copy '$phase1_tokenizer_dir/meta.json'." >&2
    exit 1
  fi

  # NOTE(review): unlike stage 1, no --corpus_dir is passed here — confirm
  # train_tokenizer obtains the corpus location some other way in this mode.
  python -m train_tokenizer \
    --output_dir "$phase2_tokenizer_dir" \
    --vocab_size "$vocab_size" \
    --do_whitespace_pretokenization false
else
  # Defensive fallback for any stage value other than 1 or 2.
  echo "Error: Invalid stage specified. Please choose '1' for Phase 1 or '2' for Phase 2." >&2
  exit 1
fi
|