#!/usr/bin/env bash
#
# Builds the combined "3B" train/val token files:
#   1. reassembles chunked *.bin.chunk* files into single .bin files,
#   2. collects whichever per-dataset .bin shards exist under $DATA,
#   3. hands them to data/merge_bins.py to produce 3b_train.bin / 3b_val.bin,
#   4. prints size and token statistics for the result.

# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail

# Run from the parent of this script's directory so the relative data/ paths
# used throughout the script resolve no matter where it is invoked from.
cd "$(dirname "$0")/.."

# Directory (relative to the working directory set above) holding all .bin files.
DATA="data"

# Start banner with a timestamp so the build duration can be read off the log.
echo "=================================================================="
echo " 3B 통합 데이터셋 빌드 | 시작: $(date)"
echo "=================================================================="

#######################################
# Concatenate "<prefix>.bin.chunk*" files into a single output file.
# Does nothing when no chunk exists; leaves an existing output untouched.
# Arguments:
#   $1 - chunk path prefix (chunks are matched as "$1.bin.chunk*")
#   $2 - path of the merged output file
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 when merged, skipped, or nothing to do; non-zero if the merge fails.
#######################################
merge_chunks() {
  local prefix=$1
  local output=$2
  local chunk tmp_output
  local -a chunks=()

  # Glob instead of parsing `ls`: the prefix is quoted, so spaces or glob
  # characters in it are safe. Expansion order is the shell's sorted order,
  # matching the previous `ls | sort`.
  # NOTE(review): this is lexical order; it assumes chunk suffixes sort
  # correctly as text (e.g. zero-padded numbers) - confirm against the writer.
  for chunk in "${prefix}".bin.chunk*; do
    # With no match the pattern stays literal; -e filters that case out.
    if [[ -e "$chunk" ]]; then
      chunks+=("$chunk")
    fi
  done

  if (( ${#chunks[@]} == 0 )); then
    return 0
  fi
  if [[ -f "$output" ]]; then
    echo " [SKIP] $output 이미 존재"
    return 0
  fi

  echo " 청크 병합: $(basename -- "$prefix")"

  # Write to a temporary sibling and rename on success. A failed or
  # interrupted cat must not leave a truncated "$output" behind, because the
  # next run would treat it as already merged and skip it.
  tmp_output="${output}.partial.$$"
  if ! cat -- "${chunks[@]}" > "$tmp_output"; then
    rm -f -- "$tmp_output"
    return 1
  fi
  mv -- "$tmp_output" "$output" || return 1

  echo " 완료: $(du -sh -- "$output" | cut -f1)"
}

# Reassemble the datasets that are stored as chunks. For each name the chunks
# "$DATA/<name>.bin.chunk*" are merged into "$DATA/<name>.bin".
for chunked_dataset in \
  cosmo_auto_math_text_train \
  cosmo_auto_math_text_val \
  cosmo_web_v2_train \
  cosmo_web_v2_val; do
  merge_chunks "$DATA/$chunked_dataset" "$DATA/$chunked_dataset.bin"
done

# Candidate training shards. Only the ones present on disk are merged, so a
# partially downloaded corpus still builds.
train_candidates=(
  "$DATA/korean_train.bin"
  "$DATA/hplt_ko_train.bin"
  "$DATA/korean_c4_train.bin"
  "$DATA/cc100_ko_train.bin"
  "$DATA/namuwiki_2023b_train.bin"
  "$DATA/korean_namuwiki_train.bin"
  "$DATA/wikipedia_ko_train.bin"
  "$DATA/korean_wiki_train.bin"
  "$DATA/open_web_math_train.bin"
  "$DATA/mathpile_train.bin"
  "$DATA/cosmo_auto_math_text_train.bin"
  "$DATA/cosmo_stories_train.bin"
  "$DATA/cosmo_web_v2_train.bin"
  "$DATA/cosmo_stanford_train.bin"
  "$DATA/cosmo_wikihow_train.bin"
  "$DATA/cosmo_openstax_train.bin"
  "$DATA/cosmo_khanacademy_train.bin"
)

# Arrays (not a space-joined string) keep every path a single argument.
TRAIN_FILES=()
for f in "${train_candidates[@]}"; do
  if [[ -f "$f" ]]; then
    TRAIN_FILES+=("$f")
  fi
done

# Candidate validation shards, filtered the same way.
val_candidates=(
  "$DATA/korean_val.bin"
  "$DATA/hplt_ko_val.bin"
  "$DATA/korean_c4_val.bin"
  "$DATA/cc100_ko_val.bin"
  "$DATA/namuwiki_2023b_val.bin"
  "$DATA/open_web_math_val.bin"
  "$DATA/mathpile_val.bin"
  "$DATA/cosmo_auto_math_text_val.bin"
  "$DATA/cosmo_stories_val.bin"
  "$DATA/cosmo_web_v2_val.bin"
)

VAL_FILES=()
for f in "${val_candidates[@]}"; do
  if [[ -f "$f" ]]; then
    VAL_FILES+=("$f")
  fi
done

# Fail before any merging starts if either list is empty; otherwise
# merge_bins.py would be called with only the output path.
if (( ${#TRAIN_FILES[@]} == 0 )); then
  echo "[ERROR] 병합할 train 파일이 없습니다" >&2
  exit 1
fi
if (( ${#VAL_FILES[@]} == 0 )); then
  echo "[ERROR] 병합할 val 파일이 없습니다" >&2
  exit 1
fi

# merge_bins.py is called as: <input>... <output> (output path last).
echo ""
echo "train 파일 병합 → $DATA/3b_train.bin ..."
python3 "$DATA/merge_bins.py" "${TRAIN_FILES[@]}" "$DATA/3b_train.bin"

echo ""
echo "val 파일 병합 → $DATA/3b_val.bin ..."
python3 "$DATA/merge_bins.py" "${VAL_FILES[@]}" "$DATA/3b_val.bin"

#######################################
# Print token statistics for a merged token file.
# Arguments:
#   $1 - path to the merged .bin file
# Outputs:
#   The token count in billions and, for a non-empty file, how many passes
#   over it are needed to reach 60B tokens.
# Returns:
#   Non-zero if the file is missing or python3 fails.
#######################################
print_token_summary() {
  local bin_path=$1

  if [[ ! -f "$bin_path" ]]; then
    echo "[ERROR] 파일 없음: $bin_path" >&2
    return 1
  fi

  # The path is passed as an argument instead of being pasted into the Python
  # source, so unusual characters in it cannot alter the code.
  python3 -c "
import os
import sys

size_bytes = os.path.getsize(sys.argv[1])
# Two bytes per token, as in the original calculation
# (presumably 16-bit token ids - confirm against merge_bins.py).
tokens = size_bytes // 2
billions = tokens / 1e9
print(f'3b_train: {billions:.2f}B tokens')
# An empty file would make the repeat count a division by zero; skip it.
if tokens > 0:
    print(f'60B 달성 에포크: {60 / billions:.1f}x 반복 필요')
" "$bin_path"
}

# Final report: on-disk sizes, token statistics and the finish timestamp.
echo ""
echo "=================================================================="
du -sh "$DATA/3b_train.bin" "$DATA/3b_val.bin"
print_token_summary "$DATA/3b_train.bin"
echo "완료: $(date)"
echo "=================================================================="