#!/bin/bash
#
# Print a status report for the Korean training data: tokenized binary
# datasets, tokenizer directories, raw corpora, and an overall readiness
# summary. All paths are resolved relative to the project root.

# Abort on the first unhandled command failure.
set -e

# The project root is the parent of the directory containing this script;
# every relative path below (data/, tokenizer/) is resolved from there.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${PROJECT_ROOT}"

echo "=== 한국어 학습 데이터 현황 ==="
echo ""

# Section: tokenized binary datasets.
echo "[ 학습 바이너리 데이터 ]"
| |
|
#######################################
# Print a one-line status for a tokenized binary dataset file.
#
# The file is treated as a flat array of 4-byte (uint32) tokens, so the
# token count is the file size divided by 4. The size is read with `wc -c`
# rather than by interpolating the path into an inline Python/numpy snippet:
# that avoids breaking on paths containing quotes and removes the
# python3/numpy dependency. As before, an empty file or a size that is not
# a multiple of 4 is reported as a token-count failure.
#
# Arguments:
#   $1 - path to the binary file
#   $2 - label printed in the first column
# Outputs:
#   One status line on stdout: missing, present with token/step estimate,
#   or present with a token-count failure.
#######################################
check_binary_data() {
  local file=$1
  local name=$2
  local -r bytes_per_token=4
  # Tokens consumed per training step (8 x 4 x 4096 x 8), used only for the
  # rough step estimate.
  local -r tokens_per_step=1048576

  if [[ ! -f "$file" ]]; then
    printf " %-20s : 없음\n" "$name"
    return 0
  fi

  local size
  size=$(du -h -- "$file" | cut -f1)

  # Some wc implementations pad the count with spaces; strip them so the
  # value can be validated as a plain integer.
  local byte_count
  byte_count=$(wc -c 2>/dev/null < "$file" | tr -d '[:space:]')

  if [[ "$byte_count" =~ ^[0-9]+$ ]] && [[ "$byte_count" != 0 ]] \
    && ((byte_count % bytes_per_token == 0)); then
    local token_count=$((byte_count / bytes_per_token))
    local estimated_steps=$((token_count / tokens_per_step))
    printf " %-20s : 존재 (%s, %'d 토큰, ~%'d steps)\n" \
      "$name" "$size" "$token_count" "$estimated_steps"
  else
    printf " %-20s : 존재 (%s, 토큰 계산 실패)\n" "$name" "$size"
  fi
}
| |
|
# Report the Korean-specific binaries first, then the generic train/val pair.
check_binary_data "data/korean_train.bin" "korean_train.bin"
check_binary_data "data/korean_val.bin" "korean_val.bin"
check_binary_data "data/train.bin" "train.bin"
check_binary_data "data/val.bin" "val.bin"

echo ""

# Section: tokenizer directories.
echo "[ 토크나이저 ]"
| |
|
#######################################
# Print a one-line status for a tokenizer directory.
#
# Counts regular files recursively under the directory. The count is taken
# from NUL-delimited find output so that a file name containing a newline
# is still counted once.
#
# Arguments:
#   $1 - tokenizer directory
#   $2 - label printed in the first column
# Outputs:
#   One status line on stdout: missing, or present with the file count.
#######################################
check_tokenizer() {
  local dir=$1
  local name=$2

  if [[ ! -d "$dir" ]]; then
    printf " %-20s : 없음\n" "$name"
    return 0
  fi

  local files
  files=$(find "$dir" -type f -print0 | tr -dc '\0' | wc -c)
  # $((...)) normalises the whitespace-padded output of some wc builds.
  printf " %-20s : 존재 (%d개 파일)\n" "$name" "$((files))"
}
| |
|
# The Korean SentencePiece tokenizer lives under tokenizer/; the second
# call counts every file under the parent directory.
check_tokenizer "tokenizer/korean_sp" "korean_sp"
check_tokenizer "tokenizer" "default tokenizer"

echo ""

# Section: raw corpora.
echo "[ 원본 데이터 ]"
| |
|
#######################################
# Print a one-line status for a raw-corpus directory.
#
# Only regular files directly inside the directory are counted
# (-maxdepth 1). The count comes from NUL-delimited find output so that
# unusual file names cannot inflate it. The directory size is computed
# only when there is at least one file to report.
#
# Arguments:
#   $1 - raw data directory
#   $2 - label printed in the first column
# Outputs:
#   One status line on stdout: missing, present but empty, or the file
#   count with the total directory size.
#######################################
check_raw_data() {
  local dir=$1
  local name=$2

  if [[ ! -d "$dir" ]]; then
    printf " %-20s : 없음\n" "$name"
    return 0
  fi

  local file_count
  file_count=$(find "$dir" -maxdepth 1 -type f -print0 | tr -dc '\0' | wc -c)
  # Normalise whitespace-padded wc output into a plain integer.
  file_count=$((file_count))

  if [[ "$file_count" -eq 0 ]]; then
    printf " %-20s : 없음 (디렉토리만 존재, 0 파일)\n" "$name"
    return 0
  fi

  local total_size
  total_size=$(du -sh -- "$dir" 2>/dev/null | cut -f1)
  printf " %-20s : %'d 파일 (%s)\n" "$name" "$file_count" "$total_size"
}
| |
|
# One status line per raw corpus directory under data/raw.
check_raw_data "data/raw/cc100_ko" "cc100_ko/"
check_raw_data "data/raw/c4_ko" "c4_ko/"
check_raw_data "data/raw/namuwiki_ko" "namuwiki_ko/"
| |
|
| | |
#######################################
# Count entries directly inside data/raw named <prefix>_wiki_*.txt.
# A missing data/raw directory counts as zero (find errors are discarded).
# Arguments:
#   $1 - language prefix, e.g. "ko" or "en"
# Outputs:
#   The count as a plain integer on stdout.
#######################################
count_wiki_dumps() {
  local prefix=$1
  local n
  n=$(find "data/raw" -maxdepth 1 -name "${prefix}_wiki_*.txt" -print0 2>/dev/null \
    | tr -dc '\0' | wc -c)
  printf '%d\n' "$((n))"
}

#######################################
# Print the Wikipedia section: one line per language that has at least one
# dump file in data/raw. The previous `du -sh data/raw` call was dropped:
# its result was never printed and it measured the whole raw directory.
# Globals:
#   ko_wiki_count, en_wiki_count (written)
# Outputs:
#   Section header, zero to two status lines, and a trailing blank line.
#######################################
print_wiki_status() {
  echo ""
  echo "[ 위키피디아 데이터 ]"

  ko_wiki_count=$(count_wiki_dumps ko)
  en_wiki_count=$(count_wiki_dumps en)

  if [[ "$ko_wiki_count" -gt 0 ]]; then
    printf " %-20s : %'d 파일\n" "ko_wiki" "$ko_wiki_count"
  fi

  if [[ "$en_wiki_count" -gt 0 ]]; then
    printf " %-20s : %'d 파일\n" "en_wiki" "$en_wiki_count"
  fi

  echo ""
}

print_wiki_status
| |
|
| | |
| | |
| | |
#######################################
# Render a readiness flag as its display label.
# NOTE(review): the check/cross glyphs were reconstructed from a garbled
# copy of this file; confirm they match the intended characters.
# Arguments:
#   $1 - "true" or anything else
# Outputs:
#   The ready or not-ready label on stdout.
#######################################
status_label() {
  if [[ "$1" == true ]]; then
    echo "✓ 준비됨"
  else
    echo "✗ 미준비"
  fi
}

#######################################
# Print the overall readiness summary.
#   - binaries:  both Korean train/val files, or both generic train/val
#   - tokenizer: tokenizer/korean_sp/tokenizer.model exists
#   - raw data:  at least one regular file directly inside any of the
#                c4_ko, namuwiki_ko or cc100_ko directories
# Globals:
#   binary_ready, tokenizer_ready, raw_ready (written: "true" or "false")
# Outputs:
#   Section header, three status lines, and a trailing blank line.
#######################################
print_summary_status() {
  echo "[ 종합 상태 ]"

  binary_ready=false
  if [[ -f "data/korean_train.bin" && -f "data/korean_val.bin" ]] \
    || [[ -f "data/train.bin" && -f "data/val.bin" ]]; then
    binary_ready=true
  fi

  tokenizer_ready=false
  if [[ -f "tokenizer/korean_sp/tokenizer.model" ]]; then
    tokenizer_ready=true
  fi

  raw_ready=false
  local raw_dir raw_files
  for raw_dir in "data/raw/c4_ko" "data/raw/namuwiki_ko" "data/raw/cc100_ko"; do
    [[ -d "$raw_dir" ]] || continue
    raw_files=$(find "$raw_dir" -maxdepth 1 -type f -print0 2>/dev/null \
      | tr -dc '\0' | wc -c)
    if [[ "$((raw_files))" -gt 0 ]]; then
      raw_ready=true
      break
    fi
  done

  printf " 학습용 바이너리 : %s\n" "$(status_label "$binary_ready")"
  printf " 토크나이저 : %s\n" "$(status_label "$tokenizer_ready")"
  printf " 원본 데이터 : %s\n" "$(status_label "$raw_ready")"

  echo ""
}

print_summary_status
| |
|
| | |
| | |
| | |
# Reference training configuration. The tokens-per-step figure printed here
# is the same 1048576 that check_binary_data uses for its step estimate.
echo "[ 학습 설정 (1B 모델 기준) ]"
echo " 배치 사이즈 : 8"
echo " 시퀀스 길이 : 4096"
echo " GPU 수 : 8"
echo " 그래디언트 누적 : 4"
echo " 토큰/스텝 : 8 × 4 × 4096 × 8 = 1,048,576"
echo ""

echo "=== 검사 완료 ==="
| |
|