#!/bin/bash
# frankenstallm / source / scripts / check_korean_data.sh
# pathcosmos's picture
# Upload folder using huggingface_hub (#17)
# 48ecd01
# Korean training-data status check script.
# Purpose: report the state of the Korean datasets, the tokenizer, and the
# raw data files.
set -e

# Project root, resolved relative to this script's location (its parent
# directory). All relative paths used afterwards are relative to it.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${PROJECT_ROOT}"

# Report title followed by one blank line.
printf '%s\n' "=== ํ•œ๊ตญ์–ด ํ•™์Šต ๋ฐ์ดํ„ฐ ํ˜„ํ™ฉ ===" ""

# ============================================================================
# 1. Training binary data check
# ============================================================================
printf '%s\n' "[ ํ•™์Šต ๋ฐ”์ด๋„ˆ๋ฆฌ ๋ฐ์ดํ„ฐ ]"
#######################################
# Print one status line for a tokenized binary dataset file.
#
# The file is treated as a flat array of uint32 token ids (4 bytes per
# token), so the token count is derived directly from the byte size. The
# count is reported as unavailable when the file is empty or its size is not
# a multiple of 4 bytes.
#
# Arguments:
#   $1 - path to the binary file
#   $2 - label printed in the first column
# Outputs:
#   One line on stdout: missing / present with size, token count and an
#   estimated number of training steps / present with a "count failed" note.
#######################################
check_binary_data() {
  local file=$1
  local name=$2

  if [[ ! -f "$file" ]]; then
    printf " %-20s : ์—†์Œ\n" "$name"
    return 0
  fi

  local size
  size=$(du -h -- "$file" | cut -f1)

  # uint32 storage: 4 bytes per token.
  local -r bytes_per_token=4
  # 1B-model step size:
  # tokens_per_step = batch_size * grad_accum * seq_len * num_gpus
  #                 = 8 * 4 * 4096 * 8 = 1,048,576 tokens/step
  local -r tokens_per_step=1048576

  # Read the size in bytes; the path is only ever passed as a redirection
  # target, so quotes or other special characters in it are harmless.
  local byte_count
  byte_count=$({ wc -c < "$file"; } 2>/dev/null) || byte_count=""
  # Some wc implementations pad the number with blanks.
  byte_count=${byte_count//[[:space:]]/}

  local token_count=""
  if [[ "$byte_count" =~ ^[0-9]+$ ]] \
    && (( byte_count > 0 && byte_count % bytes_per_token == 0 )); then
    token_count=$(( byte_count / bytes_per_token ))
  fi

  if [[ -n "$token_count" ]]; then
    local estimated_steps=$(( token_count / tokens_per_step ))
    # %'d asks printf for locale-dependent thousands grouping.
    printf " %-20s : ์กด์žฌ (%s, %'d ํ† ํฐ, ~%'d steps)\n" \
      "$name" "$size" "$token_count" "$estimated_steps"
  else
    printf " %-20s : ์กด์žฌ (%s, ํ† ํฐ ๊ณ„์‚ฐ ์‹คํŒจ)\n" "$name" "$size"
  fi
}
# Report every known training/validation binary under data/, labelled by its
# file name.
for bin_name in korean_train.bin korean_val.bin train.bin val.bin; do
  check_binary_data "data/${bin_name}" "${bin_name}"
done
printf '\n'

# ============================================================================
# 2. Tokenizer check
# ============================================================================
printf '%s\n' "[ ํ† ํฌ๋‚˜์ด์ € ]"
#######################################
# Print one status line for a tokenizer directory.
# Arguments:
#   $1 - directory to inspect
#   $2 - label printed in the first column
# Outputs:
#   "present" with the recursive count of regular files, or "missing" when
#   the directory does not exist.
#######################################
check_tokenizer() {
  local dir=$1
  local name=$2

  # Guard clause: nothing to count when the directory is absent.
  if [[ ! -d "$dir" ]]; then
    printf " %-20s : ์—†์Œ\n" "$name"
    return
  fi

  local n_files
  n_files=$(find "$dir" -type f | wc -l)
  printf " %-20s : ์กด์žฌ (%d๊ฐœ ํŒŒ์ผ)\n" "$name" "$n_files"
}
# Report the Korean SentencePiece tokenizer directory and the top-level
# tokenizer directory.
check_tokenizer "tokenizer/korean_sp" "korean_sp"
check_tokenizer "tokenizer" "default tokenizer"
printf '\n'

# ============================================================================
# 3. Raw data directory check
# ============================================================================
printf '%s\n' "[ ์›๋ณธ ๋ฐ์ดํ„ฐ ]"
#######################################
# Print one status line for a raw-data directory.
# Arguments:
#   $1 - directory to inspect
#   $2 - label printed in the first column
# Outputs:
#   "missing" when the directory does not exist, a "directory only, 0 files"
#   note when it holds no top-level regular files, otherwise the top-level
#   file count and the directory's total size as reported by du -sh.
#######################################
check_raw_data() {
  local dir=$1
  local name=$2

  [[ -d "$dir" ]] || { printf " %-20s : ์—†์Œ\n" "$name"; return; }

  # Only regular files directly inside the directory are counted; the size
  # covers the whole directory tree.
  local n_files dir_size
  n_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
  dir_size=$(du -sh "$dir" 2>/dev/null | cut -f1)

  if (( n_files > 0 )); then
    printf " %-20s : %'d ํŒŒ์ผ (%s)\n" "$name" "$n_files" "$dir_size"
  else
    printf " %-20s : ์—†์Œ (๋””๋ ‰ํ† ๋ฆฌ๋งŒ ์กด์žฌ, 0 ํŒŒ์ผ)\n" "$name"
  fi
}
# Report each per-corpus raw data directory under data/raw/.
for corpus in cc100_ko c4_ko namuwiki_ko; do
  check_raw_data "data/raw/${corpus}" "${corpus}/"
done
# Wikipedia data sits directly under data/raw/ (files named ko_wiki_*.txt and
# en_wiki_*.txt) rather than in a per-corpus subdirectory.
echo ""
echo "[ ์œ„ํ‚คํ”ผ๋””์•„ ๋ฐ์ดํ„ฐ ]"

# find's stderr is discarded so that a missing data/raw simply yields a count
# of 0 instead of printing an error into the report.
ko_wiki_count=$(find "data/raw" -maxdepth 1 -name "ko_wiki_*.txt" 2>/dev/null | wc -l)
en_wiki_count=$(find "data/raw" -maxdepth 1 -name "en_wiki_*.txt" 2>/dev/null | wc -l)

# A line is printed only for a language that has at least one matching file.
if (( ko_wiki_count > 0 )); then
  printf " %-20s : %'d ํŒŒ์ผ\n" "ko_wiki" "$ko_wiki_count"
fi
if (( en_wiki_count > 0 )); then
  printf " %-20s : %'d ํŒŒ์ผ\n" "en_wiki" "$en_wiki_count"
fi
echo ""
# ============================================================================
# 4. Overall readiness summary
# ============================================================================
echo "[ ์ข…ํ•ฉ ์ƒํƒœ ]"

# Training binaries: ready when either the Korean train/val pair or the
# generic train/val pair is completely present.
binary_ready=false
if { [[ -f "data/korean_train.bin" && -f "data/korean_val.bin" ]]; } \
  || { [[ -f "data/train.bin" && -f "data/val.bin" ]]; }; then
  binary_ready=true
fi

# Tokenizer: ready when tokenizer/korean_sp exists and holds tokenizer.model.
tokenizer_ready=false
if [[ -d "tokenizer/korean_sp" && -f "tokenizer/korean_sp/tokenizer.model" ]]; then
  tokenizer_ready=true
fi

# Raw data: ready when at least one of the corpus directories exists and the
# three together hold at least one top-level regular file. Directories that
# do not exist contribute 0 (find's error output is discarded).
raw_ready=false
if [[ -d "data/raw/c4_ko" || -d "data/raw/namuwiki_ko" || -d "data/raw/cc100_ko" ]]; then
  count=$((
    $(find "data/raw/c4_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)
    + $(find "data/raw/namuwiki_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)
    + $(find "data/raw/cc100_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)
  ))
  if (( count > 0 )); then
    raw_ready=true
  fi
fi

# One line per component with a ready / not-ready marker.
printf " ํ•™์Šต์šฉ ๋ฐ”์ด๋„ˆ๋ฆฌ : %s\n" \
  "$(if [ "$binary_ready" = true ]; then echo "โœ“ ์ค€๋น„๋จ"; else echo "โœ— ๋ฏธ์ค€๋น„"; fi)"
printf " ํ† ํฌ๋‚˜์ด์ € : %s\n" \
  "$(if [ "$tokenizer_ready" = true ]; then echo "โœ“ ์ค€๋น„๋จ"; else echo "โœ— ๋ฏธ์ค€๋น„"; fi)"
printf " ์›๋ณธ ๋ฐ์ดํ„ฐ : %s\n" \
  "$(if [ "$raw_ready" = true ]; then echo "โœ“ ์ค€๋น„๋จ"; else echo "โœ— ๋ฏธ์ค€๋น„"; fi)"
echo ""
# ============================================================================
# 5. Training configuration parameters
# ============================================================================
# Static reference values (1B-model baseline): batch size, sequence length,
# GPU count, gradient accumulation and the resulting tokens per step,
# followed by the closing "check complete" banner.
printf '%s\n' \
  "[ ํ•™์Šต ์„ค์ • (1B ๋ชจ๋ธ ๊ธฐ์ค€) ]" \
  " ๋ฐฐ์น˜ ์‚ฌ์ด์ฆˆ : 8" \
  " ์‹œํ€€์Šค ๊ธธ์ด : 4096" \
  " GPU ์ˆ˜ : 8" \
  " ๊ทธ๋ž˜๋””์–ธํŠธ ๋ˆ„์  : 4" \
  " ํ† ํฐ/์Šคํ… : 8 ร— 4 ร— 4096 ร— 8 = 1,048,576" \
  "" \
  "=== ๊ฒ€์‚ฌ ์™„๋ฃŒ ==="