# frankenstallm / data /tokenize_cc100.sh
# pathcosmos's picture
# feat: Add data pipeline scripts + phase reports (Tier 3 - reproducibility)
# b3d361d verified
#!/usr/bin/env bash
# data/tokenize_cc100.sh
# Tokenizes the CC-100 Korean corpus and merges the result with the existing korean_train.bin.
#
# Bug fix relative to build_korean_dataset.sh:
#   - In Step 6 of build_korean_dataset.sh, an empty cc100_ko directory caused
#     find_input_files() in prepare.py to raise FileNotFoundError.
#     This script checks up front that cc100_ko/*.txt files exist and, if they
#     do not, exits with a clear guidance message.
#
# Prerequisites:
#   1. tokenizer/korean_sp/tokenizer.json - SP tokenizer already trained and converted
#   2. data/raw/cc100_ko/*.txt - CC-100 download finished
#      (if missing, run first: bash data/download_cc100.sh)
#   3. data/korean_train.bin - existing merged training data (the merge target)
#      (if missing, tokenization still runs; only the merge step is skipped)
#
# Usage (from the project root):
# bash data/tokenize_cc100.sh
#
# Outputs:
#   data/korean_cc100_train.bin     - CC-100 training tokens
#   data/korean_cc100_val.bin       - CC-100 validation tokens
#   data/korean_train_combined.bin  - existing korean_train.bin merged with CC-100
#                                     (created only when korean_train.bin exists)
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# --- Path configuration ---
# Resolve the directory that holds this script, then work from its parent
# (the project root) so the relative paths below resolve the same way no
# matter where the script is invoked from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")"
cd "${PROJECT_ROOT}"

# Inputs: raw CC-100 text shards and the tokenizer definition.
RAW_DIR="data/raw"
CC100_DIR="${RAW_DIR}/cc100_ko"
TOKENIZER_JSON="tokenizer/korean_sp/tokenizer.json"

# --- Binary token files ---
# EXISTING_TRAIN_BIN is read (if present); the other three are written.
BIN_DIR="data"
EXISTING_TRAIN_BIN="${BIN_DIR}/korean_train.bin"
CC100_TRAIN_BIN="${BIN_DIR}/korean_cc100_train.bin"
CC100_VAL_BIN="${BIN_DIR}/korean_cc100_val.bin"
COMBINED_TRAIN_BIN="${BIN_DIR}/korean_train_combined.bin"
# --- Pre-flight checks ---
# Verifies the inputs before any tokenization starts:
#   1. the tokenizer file must exist (fatal otherwise),
#   2. at least one CC-100 .txt shard must exist (fatal otherwise),
#   3. the existing training .bin is optional (warning only).
# Reads TOKENIZER_JSON, CC100_DIR, EXISTING_TRAIN_BIN and PROJECT_ROOT;
# sets CC100_FILE_COUNT (and EXISTING_SIZE when the existing .bin is present).
echo "=== CC-100 ํ† ํฌ๋‚˜์ด์ง• ๋ฐ ๋ณ‘ํ•ฉ ==="
echo "ํ”„๋กœ์ ํŠธ ๋ฃจํŠธ: $PROJECT_ROOT"
echo ""

# Check 1: the tokenizer file must exist.
# The remediation hints are sent to stderr together with the ERROR line so the
# whole diagnostic stays in one stream.
if [[ ! -f "$TOKENIZER_JSON" ]]; then
  {
    echo "ERROR: ํ† ํฌ๋‚˜์ด์ € ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: $TOKENIZER_JSON"
    echo ""
    echo "ํ•ด๊ฒฐ ๋ฐฉ๋ฒ•: ํ† ํฌ๋‚˜์ด์ €๋ฅผ ๋จผ์ € ํ•™์Šตํ•˜๊ณ  ๋ณ€ํ™˜ํ•˜์„ธ์š”."
    echo " python tokenizer/train_sp_tokenizer.py --input <ํ…์ŠคํŠธํŒŒ์ผ> --output_dir tokenizer/korean_sp"
    echo " python tokenizer/convert_sp_to_hf.py --model tokenizer/korean_sp/tokenizer.model --output $TOKENIZER_JSON"
  } >&2
  exit 1
fi
echo "[OK] ํ† ํฌ๋‚˜์ด์ €: $TOKENIZER_JSON"

# Check 2: at least one CC-100 .txt shard must exist.
# find exits non-zero when $CC100_DIR is missing or unreadable. The script runs
# with `set -e -o pipefail`, so that status would travel through the pipeline
# and the assignment and abort the script silently, before the guidance below
# is printed. `|| true` neutralises it so that case is counted as zero shards.
CC100_FILE_COUNT=$( { find "$CC100_DIR" -maxdepth 1 -name "*.txt" 2>/dev/null || true; } | wc -l )
if [[ "$CC100_FILE_COUNT" -eq 0 ]]; then
  {
    echo "ERROR: CC-100 ํ…์ŠคํŠธ ํŒŒ์ผ์ด ์—†์Šต๋‹ˆ๋‹ค: $CC100_DIR/*.txt"
    echo ""
    echo "ํ•ด๊ฒฐ ๋ฐฉ๋ฒ•: CC-100 ๋จผ์ € ๋‹ค์šด๋กœ๋“œํ•˜์„ธ์š”."
    echo " bash data/download_cc100.sh"
    echo ""
    echo "์ฃผ์˜: build_korean_dataset.sh ์˜ --text_col text ๋ฒ„๊ทธ๋กœ ๋‹ค์šด๋กœ๋“œํ–ˆ๋‹ค๋ฉด"
    echo " ํ•ด๋‹น ํŒŒ์ผ๋“ค์€ ๋นˆ ๋‚ด์šฉ์ด๋ฏ€๋กœ ์‚ญ์ œ ํ›„ ์žฌ๋‹ค์šด๋กœ๋“œ๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค."
    echo " rm -f \"$CC100_DIR\"/*.txt && bash data/download_cc100.sh"
  } >&2
  exit 1
fi
echo "[OK] CC-100 ์ƒค๋“œ ํŒŒ์ผ: ${CC100_FILE_COUNT}๊ฐœ ($CC100_DIR)"

# Check 3: the existing korean_train.bin is optional. Its absence only produces
# a warning; the script carries on and the merge step is skipped later.
if [[ -f "$EXISTING_TRAIN_BIN" ]]; then
  EXISTING_SIZE=$(du -sh "$EXISTING_TRAIN_BIN" 2>/dev/null | cut -f1)
  echo "[OK] ๊ธฐ์กด ํ•™์Šต ๋ฐ์ดํ„ฐ: $EXISTING_TRAIN_BIN ($EXISTING_SIZE) โ€” ๋ณ‘ํ•ฉ ์˜ˆ์ •"
else
  echo "[WARN] ๊ธฐ์กด ํ•™์Šต ๋ฐ์ดํ„ฐ ์—†์Œ: $EXISTING_TRAIN_BIN"
  echo " ํ† ํฌ๋‚˜์ด์ง•๋งŒ ์ง„ํ–‰ํ•˜๊ณ , ๋ณ‘ํ•ฉ ๋‹จ๊ณ„๋Š” ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค."
fi
echo ""
# --- Step 1: tokenize CC-100 ---
# Per the original author's note, prepare.py derives the validation .bin path
# by replacing 'train' with 'val' in the --output path, and --val_split 0.002
# holds out 0.2% of the data for validation (about 3M tokens for 10M rows).
# NOTE(review): prepare.py is not visible from here - confirm against it.
echo "[1/2] CC-100 ํ† ํฌ๋‚˜์ด์ง•..."
echo " ์ž…๋ ฅ: $CC100_DIR/*.txt (${CC100_FILE_COUNT}๊ฐœ ํŒŒ์ผ)"
echo " ์ถœ๋ ฅ: $CC100_TRAIN_BIN"
echo " ์ถœ๋ ฅ: $CC100_VAL_BIN (val_split=0.2%)"
echo ""

# The --input glob is quoted, so the shell leaves it unexpanded and prepare.py
# receives the pattern itself.
prepare_args=(
  --input "$CC100_DIR/*.txt"
  --output "$CC100_TRAIN_BIN"
  --tokenizer "$TOKENIZER_JSON"
  --val_split 0.002
  --seed 42
)
python data/prepare.py "${prepare_args[@]}"

# Report the on-disk size of each output that was actually produced.
echo ""
echo "[์™„๋ฃŒ] ํ† ํฌ๋‚˜์ด์ง• ๊ฒฐ๊ณผ:"
for produced_bin in "$CC100_TRAIN_BIN" "$CC100_VAL_BIN"; do
  if [[ -f "$produced_bin" ]]; then
    echo " $produced_bin ($(du -sh "$produced_bin" | cut -f1))"
  fi
done
echo ""
# --- Step 2: merge with the existing korean_train.bin ---
# The merge output goes to a separate file ($COMBINED_TRAIN_BIN). The original
# author's note states that the existing korean_train.bin is not overwritten,
# so the combined file can be reviewed before it is swapped in.
# NOTE(review): merge_bins.py is not visible from here; the argument order
# (input, input, output) is inferred from the labels echoed below.
if [[ ! -f "$CC100_TRAIN_BIN" ]]; then
  # Step 1 left no CC-100 training .bin. Report that explicitly: previously
  # this case fell into the branch below, which wrongly claimed that
  # korean_train.bin was missing and that the CC-100 data had been created.
  echo "[2/2] [WARN] Merge skipped: CC-100 training data not found: $CC100_TRAIN_BIN" >&2
elif [[ -f "$EXISTING_TRAIN_BIN" ]]; then
  echo "[2/2] ๊ธฐ์กด ํ•™์Šต ๋ฐ์ดํ„ฐ์™€ ๋ณ‘ํ•ฉ..."
  echo " ์ž…๋ ฅ1: $EXISTING_TRAIN_BIN"
  echo " ์ž…๋ ฅ2: $CC100_TRAIN_BIN"
  echo " ์ถœ๋ ฅ: $COMBINED_TRAIN_BIN"
  echo ""
  python data/merge_bins.py \
    "$EXISTING_TRAIN_BIN" \
    "$CC100_TRAIN_BIN" \
    "$COMBINED_TRAIN_BIN"
  echo ""
  echo "[์™„๋ฃŒ] ๋ณ‘ํ•ฉ ๊ฒฐ๊ณผ:"
  echo " $COMBINED_TRAIN_BIN ($(du -sh "$COMBINED_TRAIN_BIN" | cut -f1))"
  echo ""
  # Print (do not run) the commands that would swap the combined file in,
  # keeping the current file as *_backup.bin.
  echo "๋ณ‘ํ•ฉ ํŒŒ์ผ์„ ๊ธฐ์กด ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ๊ต์ฒดํ•˜๋ ค๋ฉด:"
  echo " mv \"$EXISTING_TRAIN_BIN\" \"${EXISTING_TRAIN_BIN%.bin}_backup.bin\""
  echo " mv \"$COMBINED_TRAIN_BIN\" \"$EXISTING_TRAIN_BIN\""
else
  # CC-100 data exists but there is no existing korean_train.bin to merge with.
  echo "[2/2] ๋ณ‘ํ•ฉ ๊ฑด๋„ˆ๋œ€ โ€” ๊ธฐ์กด korean_train.bin ์—†์Œ."
  echo " CC-100 ํ•™์Šต ๋ฐ์ดํ„ฐ๋งŒ ๋‹จ๋…์œผ๋กœ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: $CC100_TRAIN_BIN"
fi
# --- Final summary ---

#######################################
# Print the number of uint16 entries in a .bin file, with thousands separators.
# The path is handed to Python through argv instead of being spliced into the
# program text, so a quote or backslash in the path cannot break the snippet.
# Uses `python`, the same interpreter name the tokenize and merge steps invoke.
# Arguments: $1 - path to the .bin file
# Outputs:   the count on stdout, or the fallback text when it cannot be
#            computed (Python errors are discarded)
# Returns:   0 in either case, so a failed count never aborts the script
#######################################
count_tokens() {
  python -c '
import sys
import numpy as np
tokens = np.memmap(sys.argv[1], dtype="uint16", mode="r")
print(f"{len(tokens):,}")
' "$1" 2>/dev/null || echo "๊ณ„์‚ฐ ๋ถˆ๊ฐ€"
}

echo ""
echo "=== ์™„๋ฃŒ ==="
echo ""
echo "์ƒ์„ฑ๋œ ํŒŒ์ผ:"
# List every output that exists, with its token count and on-disk size.
for f in "$CC100_TRAIN_BIN" "$CC100_VAL_BIN" "$COMBINED_TRAIN_BIN"; do
  if [[ -f "$f" ]]; then
    TOKEN_COUNT=$(count_tokens "$f")
    echo " $f โ†’ ${TOKEN_COUNT} ํ† ํฐ ($(du -sh "$f" | cut -f1))"
  fi
done
echo ""
echo "ํ•™์Šต ์žฌ์‹œ์ž‘ ์‹œ combined ํŒŒ์ผ์„ configs/small_fp8_run1.yaml ์˜"
echo "data_path ์— ์ง€์ •ํ•˜๊ฑฐ๋‚˜, ๊ธฐ์กด korean_train.bin ์„ ๊ต์ฒดํ•˜์„ธ์š”."