File size: 7,364 Bytes
b3d361d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env bash
# data/tokenize_cc100.sh
# Tokenizes the CC-100 Korean corpus and merges the result with the existing
# korean_train.bin.
#
# Bug fixed relative to build_korean_dataset.sh:
#   - In Step 6 of build_korean_dataset.sh, an empty cc100_ko directory caused
#     find_input_files() in prepare.py to raise FileNotFoundError.
#     This script checks up front whether cc100_ko/*.txt files exist and,
#     if they do not, exits with a clear guidance message.
#
# Prerequisites:
#   1. tokenizer/korean_sp/tokenizer.json  — SP tokenizer already trained and converted
#   2. data/raw/cc100_ko/*.txt             — CC-100 download completed
#      (if missing, run first: bash data/download_cc100.sh)
#   3. data/korean_train.bin               — existing merged training data (merge target)
#      (if missing, tokenization still runs and only the merge step is skipped)
#
# Usage (from the project root):
#   bash data/tokenize_cc100.sh
#
# Outputs:
#   data/korean_cc100_train.bin  — CC-100 training tokens
#   data/korean_cc100_val.bin    — CC-100 validation tokens
#   data/korean_train_combined.bin  — existing korean_train.bin + CC-100, merged
#                                     (created only when korean_train.bin exists)

set -o errexit -o nounset -o pipefail

# ─── Path setup ──────────────────────────────────────────────────────────────
# Find the directory that holds this script and change into its parent (the
# project root), so every relative path below resolves the same way no matter
# where the script was launched from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")"
cd "${PROJECT_ROOT}"

# Input locations, relative to the project root.
BIN_DIR="data"
RAW_DIR="data/raw"
CC100_DIR="${RAW_DIR}/cc100_ko"
TOKENIZER_JSON="tokenizer/korean_sp/tokenizer.json"

# ─── Output file paths ───────────────────────────────────────────────────────
# EXISTING_TRAIN_BIN is an optional input; the other three are written by the
# steps further down.
EXISTING_TRAIN_BIN="${BIN_DIR}/korean_train.bin"
CC100_TRAIN_BIN="${BIN_DIR}/korean_cc100_train.bin"
CC100_VAL_BIN="${BIN_DIR}/korean_cc100_val.bin"
COMBINED_TRAIN_BIN="${BIN_DIR}/korean_train_combined.bin"

# ─── Pre-flight checks ───────────────────────────────────────────────────────
# Verifies the inputs before any long-running work starts:
#   1. tokenizer JSON             - required, exits 1 when missing
#   2. CC-100 *.txt shards        - required, exits 1 when none are found
#   3. existing korean_train.bin  - optional, only a warning when missing
# Defines CC100_FILE_COUNT: the number of *.txt entries directly under
# $CC100_DIR, which later steps print in their progress messages.
echo "=== CC-100 토크나이징 및 병합 ==="
echo "프로젝트 루트: $PROJECT_ROOT"
echo ""

# Check 1: the tokenizer file must exist.
if [ ! -f "$TOKENIZER_JSON" ]; then
    # The whole diagnostic (error line and remediation hints) goes to stderr.
    {
        echo "ERROR: 토크나이저 파일을 찾을 수 없습니다: $TOKENIZER_JSON"
        echo ""
        echo "해결 방법: 토크나이저를 먼저 학습하고 변환하세요."
        echo "  python tokenizer/train_sp_tokenizer.py --input <텍스트파일> --output_dir tokenizer/korean_sp"
        echo "  python tokenizer/convert_sp_to_hf.py --model tokenizer/korean_sp/tokenizer.model --output $TOKENIZER_JSON"
    } >&2
    exit 1
fi
echo "[OK] 토크나이저: $TOKENIZER_JSON"

# Check 2: at least one CC-100 .txt file must exist.
# find exits non-zero when $CC100_DIR is missing or unreadable. With
# `set -o pipefail` that status becomes the status of the command substitution,
# and `set -e` would then abort the script right here, before the guidance
# below is printed. `|| true` neutralizes find's status so a missing directory
# is simply counted as zero files.
CC100_FILE_COUNT=$( { find "$CC100_DIR" -maxdepth 1 -name "*.txt" 2>/dev/null || true; } | wc -l )
# Some wc implementations pad the number with spaces; reduce it to a bare
# integer so it prints cleanly in the messages below.
CC100_FILE_COUNT=$((CC100_FILE_COUNT))
if [ "$CC100_FILE_COUNT" -eq 0 ]; then
    {
        echo "ERROR: CC-100 텍스트 파일이 없습니다: $CC100_DIR/*.txt"
        echo ""
        echo "해결 방법: CC-100 먼저 다운로드하세요."
        echo "  bash data/download_cc100.sh"
        echo ""
        echo "주의: build_korean_dataset.sh 의 --text_col text 버그로 다운로드했다면"
        echo "      해당 파일들은 빈 내용이므로 삭제 후 재다운로드가 필요합니다."
        echo "  rm -f \"$CC100_DIR\"/*.txt && bash data/download_cc100.sh"
    } >&2
    exit 1
fi
echo "[OK] CC-100 샤드 파일: ${CC100_FILE_COUNT}개 ($CC100_DIR)"

# Check 3: the existing korean_train.bin is optional (warn only, do not abort).
if [ -f "$EXISTING_TRAIN_BIN" ]; then
    # Human-readable size, shown only for information.
    EXISTING_SIZE=$(du -sh "$EXISTING_TRAIN_BIN" 2>/dev/null | cut -f1)
    echo "[OK] 기존 학습 데이터: $EXISTING_TRAIN_BIN ($EXISTING_SIZE) — 병합 예정"
else
    echo "[WARN] 기존 학습 데이터 없음: $EXISTING_TRAIN_BIN"
    echo "       토크나이징만 진행하고, 병합 단계는 건너뜁니다."
fi

echo ""

# ─── Step 1: CC-100 tokenization ─────────────────────────────────────────────
# Runs data/prepare.py over the CC-100 shards and writes the training .bin.
# Per the original author's note, prepare.py derives the validation file name
# by replacing 'train' with 'val' in the --output path, so $CC100_VAL_BIN is
# expected to appear as a side effect (prepare.py itself is not visible here).
# --val_split 0.002 holds out 0.2% as the validation set; the original note
# estimates that as roughly 3M tokens for 10 million rows.

echo "[1/2] CC-100 토크나이징..."
echo "  입력: $CC100_DIR/*.txt  (${CC100_FILE_COUNT}개 파일)"
echo "  출력: $CC100_TRAIN_BIN"
echo "  출력: $CC100_VAL_BIN   (val_split=0.2%)"
echo ""

# The glob is quoted, so the shell hands the pattern to prepare.py unexpanded;
# prepare.py is expected to expand it itself (presumably in find_input_files()).
# A non-zero exit from prepare.py stops the script because of `set -e`.
python data/prepare.py \
    --input "$CC100_DIR/*.txt" \
    --output "$CC100_TRAIN_BIN" \
    --tokenizer "$TOKENIZER_JSON" \
    --val_split 0.002 \
    --seed 42

# Report the size of each output file that actually exists.
echo ""
echo "[완료] 토크나이징 결과:"
if [ -f "$CC100_TRAIN_BIN" ]; then
    echo "  $CC100_TRAIN_BIN  ($(du -sh "$CC100_TRAIN_BIN" | cut -f1))"
fi
if [ -f "$CC100_VAL_BIN" ]; then
    echo "  $CC100_VAL_BIN   ($(du -sh "$CC100_VAL_BIN" | cut -f1))"
fi
echo ""

# ─── Step 2: merge with the existing korean_train.bin ────────────────────────
# The merged data is written to korean_train_combined.bin. The existing
# korean_train.bin is never overwritten, so the result can be reviewed before
# it is swapped in.

# Without the CC-100 training file there is nothing to merge and nothing to
# report as created. Fail loudly instead of printing the "merge skipped"
# message, which would wrongly blame a missing korean_train.bin and claim the
# CC-100 data had been generated.
if [ ! -f "$CC100_TRAIN_BIN" ]; then
    echo "ERROR: CC-100 학습 데이터가 생성되지 않았습니다: $CC100_TRAIN_BIN" >&2
    exit 1
fi

if [ -f "$EXISTING_TRAIN_BIN" ]; then
    echo "[2/2] 기존 학습 데이터와 병합..."
    echo "  입력1: $EXISTING_TRAIN_BIN"
    echo "  입력2: $CC100_TRAIN_BIN"
    echo "  출력:  $COMBINED_TRAIN_BIN"
    echo ""

    # Argument order follows the labels printed above: the two inputs first,
    # then the output path.
    python data/merge_bins.py \
        "$EXISTING_TRAIN_BIN" \
        "$CC100_TRAIN_BIN" \
        "$COMBINED_TRAIN_BIN"

    echo ""
    echo "[완료] 병합 결과:"
    echo "  $COMBINED_TRAIN_BIN  ($(du -sh "$COMBINED_TRAIN_BIN" | cut -f1))"
    echo ""
    # Only print the swap commands; running them is left to the user.
    echo "병합 파일을 기존 학습 데이터로 교체하려면:"
    echo "  mv \"$EXISTING_TRAIN_BIN\" \"${EXISTING_TRAIN_BIN%.bin}_backup.bin\""
    echo "  mv \"$COMBINED_TRAIN_BIN\" \"$EXISTING_TRAIN_BIN\""
else
    # korean_train.bin is optional: skipping the merge is not an error.
    echo "[2/2] 병합 건너뜀 — 기존 korean_train.bin 없음."
    echo "  CC-100 학습 데이터만 단독으로 생성되었습니다: $CC100_TRAIN_BIN"
fi

# ─── Final summary ───────────────────────────────────────────────────────────

#######################################
# Prints the number of tokens stored in a .bin file, with thousands
# separators. The file is read as a flat array of uint16 values
# (NOTE(review): assumes prepare.py/merge_bins.py write uint16 — confirm).
# Arguments: $1 - path to the .bin file
# Outputs:   the token count on stdout, or the fallback text "계산 불가"
#            when no interpreter could compute it (e.g. numpy is missing or
#            the file cannot be mapped)
# Returns:   0 always; a failed count is best-effort, not an error
#######################################
_count_tokens() {
    local bin_path=$1
    local interpreter count
    # python3 is tried first (the original choice), then python, which is the
    # interpreter the rest of this script uses for prepare.py/merge_bins.py.
    for interpreter in python3 python; do
        # The path is passed as argv[1] rather than pasted into the Python
        # source, so quotes or backslashes in the path cannot break the code.
        if count=$("$interpreter" -c '
import sys
import numpy as np
tokens = np.memmap(sys.argv[1], dtype="uint16", mode="r")
print(f"{len(tokens):,}")
' "$bin_path" 2>/dev/null); then
            printf '%s\n' "$count"
            return 0
        fi
    done
    echo "계산 불가"
}

echo ""
echo "=== 완료 ==="
echo ""
echo "생성된 파일:"
# List every output file that exists, with its token count and size on disk.
for f in "$CC100_TRAIN_BIN" "$CC100_VAL_BIN" "$COMBINED_TRAIN_BIN"; do
    if [ -f "$f" ]; then
        TOKEN_COUNT=$(_count_tokens "$f")
        echo "  $f  →  ${TOKEN_COUNT} 토큰  ($(du -sh "$f" | cut -f1))"
    fi
done
echo ""
echo "학습 재시작 시 combined 파일을 configs/small_fp8_run1.yaml 의"
echo "data_path 에 지정하거나, 기존 korean_train.bin 을 교체하세요."