#!/usr/bin/env bash
#
# Train a tokenizer in one of two stages by invoking `python -m train_tokenizer`.
#
# Usage:
#   <script> <dataset_name> <vocab_size> <corpus_dir> <stage> \
#            [phase1_tokenizer_dir] [num_inherit_merges]
#
# Arguments:
#   dataset_name          label used to build the output directory name
#   vocab_size            positive integer, forwarded as --vocab_size
#   corpus_dir            forwarded as --corpus_dir (stage 1 only)
#   stage                 1 or 2
#   phase1_tokenizer_dir  (stage 2) directory holding merges.txt and meta.json
#   num_inherit_merges    (stage 2) number of leading lines of merges.txt to
#                         copy; non-negative integer, defaults to 0
#
# Stage 1 writes to  tokenizer_json/<dataset>_vocab<size>_stage1
# Stage 2 writes to  tokenizer_json/<dataset>_vocab<size>_from<merges>_stage2
# where <size>/<merges> are abbreviated with K/M/G suffixes (integer division).
#
# NOTE(review): corpus_dir is required for both stages but stage 2 does not
# forward it; presumably train_tokenizer locates the corpus through the copied
# meta.json -- confirm against train_tokenizer.

set -euo pipefail

# Print every argument on its own line to stderr and exit with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Print a non-negative decimal integer abbreviated with a K/M/G suffix
# (truncating division), or unchanged when it is below 1000.
# $1 must consist of digits only; it is always read as base 10 so that a
# leading zero is not interpreted as octal.
format_count() {
  local -r n=$((10#$1))
  if ((n >= 10 ** 9)); then
    printf '%sG\n' "$((n / 10 ** 9))"
  elif ((n >= 10 ** 6)); then
    printf '%sM\n' "$((n / 10 ** 6))"
  elif ((n >= 10 ** 3)); then
    printf '%sK\n' "$((n / 10 ** 3))"
  else
    printf '%s\n' "$n"
  fi
}

# Stage 1: train on the corpus with whitespace pretokenization enabled.
# Arguments: $1 dataset_name, $2 vocab_size, $3 vocab_size_str, $4 corpus_dir
run_stage1() {
  local -r dataset_name=$1 vocab_size=$2 vocab_size_str=$3 corpus_dir=$4
  local -r tokenizer_dir="tokenizer_json/${dataset_name}_vocab${vocab_size_str}_stage1"

  echo "Phase 1 tokenizer training: $tokenizer_dir"
  python -m train_tokenizer \
    --output_dir "$tokenizer_dir" \
    --corpus_dir "$corpus_dir" \
    --vocab_size "$vocab_size" \
    --do_whitespace_pretokenization true
}

# Stage 2: seed a new tokenizer directory with the first num_inherit_merges
# lines of the phase 1 merges.txt plus its meta.json, then train with
# whitespace pretokenization disabled.
# Arguments: $1 dataset_name, $2 vocab_size, $3 vocab_size_str,
#            $4 phase1_tokenizer_dir, $5 num_inherit_merges (may be empty)
run_stage2() {
  local -r dataset_name=$1 vocab_size=$2 vocab_size_str=$3
  local -r phase1_tokenizer_dir=$4
  local num_inherit_merges=$5

  # Missing or non-numeric values fall back to 0 (inherit nothing).
  if ! [[ "$num_inherit_merges" =~ ^[0-9]+$ ]]; then
    echo "Warning: num_inherit_merges is invalid or missing. Defaulting to 0." >&2
    num_inherit_merges=0
  fi
  # Force base 10 so that e.g. "0100" means 100 rather than octal 64.
  num_inherit_merges=$((10#$num_inherit_merges))

  if [[ ! -d "$phase1_tokenizer_dir" ]]; then
    die "Error: Phase 1 tokenizer directory '$phase1_tokenizer_dir' not found!"
  fi

  # Fail before creating any output if the inputs to copy are missing.
  local required
  for required in merges.txt meta.json; do
    if [[ ! -f "$phase1_tokenizer_dir/$required" ]]; then
      die "Error: '$phase1_tokenizer_dir/$required' not found!"
    fi
  done

  local num_inherit_merges_str
  num_inherit_merges_str=$(format_count "$num_inherit_merges")
  local -r phase2_tokenizer_dir="tokenizer_json/${dataset_name}_vocab${vocab_size_str}_from${num_inherit_merges_str}_stage2"

  echo "Phase 2 tokenizer training: $phase2_tokenizer_dir"
  mkdir -p -- "$phase2_tokenizer_dir"
  head -n "$num_inherit_merges" -- "$phase1_tokenizer_dir/merges.txt" \
    > "$phase2_tokenizer_dir/merges.txt"
  cp -- "$phase1_tokenizer_dir/meta.json" "$phase2_tokenizer_dir/meta.json"

  python -m train_tokenizer \
    --output_dir "$phase2_tokenizer_dir" \
    --vocab_size "$vocab_size" \
    --do_whitespace_pretokenization false
}

# Validate the positional arguments and dispatch to the requested stage.
main() {
  local dataset_name=${1:-} vocab_size=${2:-} corpus_dir=${3:-} stage=${4:-}
  local phase1_tokenizer_dir=${5:-} num_inherit_merges=${6:-}

  if [[ -z "$dataset_name" || -z "$vocab_size" || -z "$corpus_dir" || -z "$stage" ]]; then
    die "Error: Missing required arguments." \
      "Usage: $0 <dataset_name> <vocab_size> <corpus_dir> <stage> [phase1_tokenizer_dir] [num_inherit_merges]"
  fi

  # Digits only, and greater than zero when read as base 10.
  if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || ((10#$vocab_size <= 0)); then
    die "Error: Vocab size must be a positive integer."
  fi
  # Strip leading zeros so later arithmetic never treats the value as octal.
  vocab_size=$((10#$vocab_size))

  if ! [[ "$stage" =~ ^[1-2]$ ]]; then
    die "Error: Invalid stage. Please specify either 1 or 2."
  fi

  local vocab_size_str
  vocab_size_str=$(format_count "$vocab_size")

  # stage is guaranteed to be 1 or 2 by the validation above.
  if [[ "$stage" == 1 ]]; then
    run_stage1 "$dataset_name" "$vocab_size" "$vocab_size_str" "$corpus_dir"
  else
    run_stage2 "$dataset_name" "$vocab_size" "$vocab_size_str" \
      "$phase1_tokenizer_dir" "$num_inherit_merges"
  fi
}

main "$@"