File size: 2,780 Bytes
48ecd01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env bash
#
# Builds the combined "3B" dataset. Runs from the directory one level above
# the directory containing this script and works on the files under ./data.
set -euo pipefail

# Every path used below is relative to the parent of the script's directory.
# BASH_SOURCE (rather than $0) still points at this file when it is sourced,
# and "--" keeps a path starting with "-" from being parsed as an option.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.."
DATA="data"

echo "=================================================================="
echo "  3B 통합 데이터셋 빌드  |  시작: $(date)"
echo "=================================================================="

# Concatenate the split chunks of a dataset back into a single .bin file.
#
# Arguments:
#   $1 - chunk path prefix; the chunks are the files matching
#        "<prefix>.bin.chunk*"
#   $2 - path of the merged output file
# Outputs:
#   progress messages on stdout
# Returns:
#   0 when there are no chunks, when the output already exists, or when the
#   merge succeeded; non-zero if concatenation or the final rename fails.
merge_chunks() {
    local prefix="$1"
    local output="$2"
    local -a found=() chunks=()
    local chunk tmp

    # Glob instead of parsing `ls`, so paths containing spaces stay intact.
    for chunk in "${prefix}".bin.chunk*; do
        # An unmatched glob is left as the literal pattern; skip it.
        [[ -e "$chunk" ]] || continue
        found+=("$chunk")
    done
    if (( ${#found[@]} == 0 )); then
        return 0
    fi

    if [[ -f "$output" ]]; then
        echo "  [SKIP] $output 이미 존재"
        return 0
    fi

    # Version sort keeps numeric suffixes in order (chunk2 before chunk10);
    # a plain lexical sort would misorder them once there are 10+ chunks.
    while IFS= read -r chunk; do
        chunks+=("$chunk")
    done < <(printf '%s\n' "${found[@]}" | LC_ALL=C sort -V)

    echo "  청크 병합: $(basename -- "$prefix")"

    # Write under a temporary name and rename at the end, so an interrupted
    # run never leaves a truncated "$output" that later runs would [SKIP].
    tmp="${output}.tmp.$$"
    if ! cat -- "${chunks[@]}" > "$tmp"; then
        rm -f -- "$tmp"
        return 1
    fi
    mv -- "$tmp" "$output"
    echo "  완료: $(du -sh -- "$output" | cut -f1)"
}

# Reassemble the chunked cosmo datasets: for each name, the chunk set
# "$DATA/<name>.bin.chunk*" is handed to merge_chunks with the target
# "$DATA/<name>.bin".
for dataset in \
    cosmo_auto_math_text_train \
    cosmo_auto_math_text_val \
    cosmo_web_v2_train \
    cosmo_web_v2_val; do
    merge_chunks "$DATA/$dataset" "$DATA/$dataset.bin"
done

# Training inputs: keep only the candidate .bin files that exist on disk.
# An array (not a space-joined string) keeps each path a single argument.
TRAIN_FILES=()
for f in \
    "$DATA/korean_train.bin" \
    "$DATA/hplt_ko_train.bin" \
    "$DATA/korean_c4_train.bin" \
    "$DATA/cc100_ko_train.bin" \
    "$DATA/namuwiki_2023b_train.bin" \
    "$DATA/korean_namuwiki_train.bin" \
    "$DATA/wikipedia_ko_train.bin" \
    "$DATA/korean_wiki_train.bin" \
    "$DATA/open_web_math_train.bin" \
    "$DATA/mathpile_train.bin" \
    "$DATA/cosmo_auto_math_text_train.bin" \
    "$DATA/cosmo_stories_train.bin" \
    "$DATA/cosmo_web_v2_train.bin" \
    "$DATA/cosmo_stanford_train.bin" \
    "$DATA/cosmo_wikihow_train.bin" \
    "$DATA/cosmo_openstax_train.bin" \
    "$DATA/cosmo_khanacademy_train.bin"; do
    if [[ -f "$f" ]]; then
        TRAIN_FILES+=("$f")
    fi
done

# Validation inputs, filtered the same way.
VAL_FILES=()
for f in \
    "$DATA/korean_val.bin" \
    "$DATA/hplt_ko_val.bin" \
    "$DATA/korean_c4_val.bin" \
    "$DATA/cc100_ko_val.bin" \
    "$DATA/namuwiki_2023b_val.bin" \
    "$DATA/open_web_math_val.bin" \
    "$DATA/mathpile_val.bin" \
    "$DATA/cosmo_auto_math_text_val.bin" \
    "$DATA/cosmo_stories_val.bin" \
    "$DATA/cosmo_web_v2_val.bin"; do
    if [[ -f "$f" ]]; then
        VAL_FILES+=("$f")
    fi
done

# Fail before any merging starts if either list is empty: merge_bins.py would
# otherwise be invoked with the output path as its only argument.
if (( ${#TRAIN_FILES[@]} == 0 )); then
    echo "error: no training .bin files found under $DATA" >&2
    exit 1
fi
if (( ${#VAL_FILES[@]} == 0 )); then
    echo "error: no validation .bin files found under $DATA" >&2
    exit 1
fi

# merge_bins.py is given the input files followed by the output path.
echo ""
echo "train 파일 병합 → data/3b_train.bin ..."
python3 data/merge_bins.py "${TRAIN_FILES[@]}" data/3b_train.bin

echo ""
echo "val 파일 병합 → data/3b_val.bin ..."
python3 data/merge_bins.py "${VAL_FILES[@]}" data/3b_val.bin

# Final report: on-disk sizes of the merged files, an estimated token count
# for the training set, and how many passes over it add up to 60B tokens.
echo ""
echo "=================================================================="
du -sh data/3b_train.bin data/3b_val.bin
# The quoted heredoc delimiter stops the shell from expanding anything inside
# the Python source.
python3 - <<'PY'
import os

size_bytes = os.path.getsize('data/3b_train.bin')
# assumes each token is stored in 2 bytes (e.g. uint16) -- TODO confirm
tokens = size_bytes // 2
print(f'3b_train: {tokens/1e9:.2f}B tokens')
# Skip the ratio for an empty file instead of raising ZeroDivisionError.
if tokens > 0:
    print(f'60B 달성 에포크: {60/(tokens/1e9):.1f}x 반복 필요')
PY
echo "완료: $(date)"
echo "=================================================================="