#!/usr/bin/env bash
#
# Builds the combined 3B dataset: merges the per-corpus .bin files under
# data/ into data/3b_train.bin and data/3b_val.bin.

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Work from the parent of the directory holding this script so that the
# relative data/ paths used throughout resolve the same way regardless of the
# caller's working directory.
cd -- "$(dirname -- "$0")/.."

# Directory (relative to the working directory set above) holding the inputs.
readonly DATA="data"

echo "=================================================================="
echo " 3B 통합 데이터셋 빌드 | 시작: $(date)"
echo "=================================================================="
#######################################
# Concatenate the pieces of a split .bin file back into a single file.
# Arguments:
#   $1 - path prefix; the pieces are the files matching "<prefix>.bin.chunk*"
#   $2 - path of the merged file to create
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 when there is nothing to do (no pieces found, or $2 already exists) or
#   the merge succeeded; non-zero if concatenation or the final rename failed.
#######################################
merge_chunks() {
  local prefix="$1"
  local output="$2"
  local -a chunks=()
  local chunk tmp

  # Pathname expansion already yields the matches in sorted order, so there is
  # no need to parse `ls`. An unmatched pattern stays literal, hence the -e test.
  # NOTE(review): the order is a plain string sort, so chunk suffixes are
  # presumably fixed-width (chunk00, chunk01, ...) — confirm.
  for chunk in "${prefix}".bin.chunk*; do
    if [[ -e "$chunk" ]]; then
      chunks+=("$chunk")
    fi
  done

  if (( ${#chunks[@]} == 0 )); then
    return 0
  fi
  if [[ -f "$output" ]]; then
    echo " [SKIP] $output 이미 존재"
    return 0
  fi

  echo " 청크 병합: $(basename -- "$prefix")"

  # Assemble into a temporary file and rename it at the end: a failed or
  # interrupted merge must not leave a partial $output behind, because the
  # existence check above would then skip the merge on every later run.
  tmp="${output}.tmp.$$"
  if ! cat -- "${chunks[@]}" > "$tmp"; then
    rm -f -- "$tmp"
    return 1
  fi
  if ! mv -- "$tmp" "$output"; then
    rm -f -- "$tmp"
    return 1
  fi

  echo " 완료: $(du -sh -- "$output" | cut -f1)"
}
# Reassemble the corpora that may be stored as chunked pieces: for each corpus,
# the train split first, then the val split.
for corpus in cosmo_auto_math_text cosmo_web_v2; do
  for split in train val; do
    merge_chunks "$DATA/${corpus}_${split}" "$DATA/${corpus}_${split}.bin"
  done
done
# Build TRAIN_FILES: a space-separated list (each entry preceded by a space)
# of the training .bin files that actually exist under $DATA. Candidates are
# checked in the order listed; missing files are skipped silently.
TRAIN_FILES=""
train_corpora=(
  korean
  hplt_ko
  korean_c4
  cc100_ko
  namuwiki_2023b
  korean_namuwiki
  wikipedia_ko
  korean_wiki
  open_web_math
  mathpile
  cosmo_auto_math_text
  cosmo_stories
  cosmo_web_v2
  cosmo_stanford
  cosmo_wikihow
  cosmo_openstax
  cosmo_khanacademy
)
for corpus in "${train_corpora[@]}"; do
  candidate="$DATA/${corpus}_train.bin"
  if [[ -f "$candidate" ]]; then
    TRAIN_FILES+=" $candidate"
  fi
done
# Build VAL_FILES: a space-separated list (each entry preceded by a space) of
# the validation .bin files that actually exist under $DATA. Candidates are
# checked in the order listed; missing files are skipped silently.
VAL_FILES=""
val_corpora=(
  korean
  hplt_ko
  korean_c4
  cc100_ko
  namuwiki_2023b
  open_web_math
  mathpile
  cosmo_auto_math_text
  cosmo_stories
  cosmo_web_v2
)
for corpus in "${val_corpora[@]}"; do
  candidate="$DATA/${corpus}_val.bin"
  if [[ -f "$candidate" ]]; then
    VAL_FILES+=" $candidate"
  fi
done
# Merge the collected per-corpus files into data/3b_train.bin and
# data/3b_val.bin, then report the resulting sizes.
#
# TRAIN_FILES / VAL_FILES are space-separated path lists. `read -a` splits
# them on whitespace without the glob expansion an unquoted $VAR would also
# perform, so each path reaches merge_bins.py as exactly one argument.
read -r -a train_inputs <<< "$TRAIN_FILES"
read -r -a val_inputs <<< "$VAL_FILES"

# Fail fast, before the (potentially long) train merge starts, instead of
# invoking merge_bins.py with only an output path.
if (( ${#train_inputs[@]} == 0 )); then
  echo "error: no train input files found; nothing to merge" >&2
  exit 1
fi
if (( ${#val_inputs[@]} == 0 )); then
  echo "error: no val input files found; nothing to merge" >&2
  exit 1
fi

echo ""
echo "train 파일 병합 → data/3b_train.bin ..."
# merge_bins.py is called as: <input>... <output>
python3 data/merge_bins.py "${train_inputs[@]}" data/3b_train.bin

echo ""
echo "val 파일 병합 → data/3b_val.bin ..."
python3 data/merge_bins.py "${val_inputs[@]}" data/3b_val.bin

echo ""
echo "=================================================================="
du -sh data/3b_train.bin data/3b_val.bin

# Report the token count of the merged train file and how many passes over it
# are needed to reach 60B tokens.
# NOTE(review): size // 2 assumes 2 bytes per token — confirm against the
# format merge_bins.py writes.
python3 -c "
import os
import sys
sz = os.path.getsize('data/3b_train.bin')
tok = sz // 2
print(f'3b_train: {tok/1e9:.2f}B tokens')
if tok == 0:
    # Avoid a ZeroDivisionError traceback; still exit non-zero.
    sys.exit('error: data/3b_train.bin holds no tokens; cannot compute epochs')
print(f'60B 달성 에포크: {60/(tok/1e9):.1f}x 반복 필요')
"
echo "완료: $(date)"
echo "=================================================================="