#!/bin/bash
#
# Prints a status report of the training artifacts found under the project
# root: binary token files, tokenizer directories and raw corpus directories.
#
# NOTE(review): the user-facing strings in this script look mis-encoded
# (mojibake). They are reproduced verbatim here; confirm the file's encoding
# against the original before changing them.

# Stop at the first command that fails.
set -e

# The project root is the parent of the directory containing this script.
# Every relative path used below is resolved against it.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
PROJECT_ROOT="$(cd "${script_dir}/.." && pwd)"
cd "${PROJECT_ROOT}"

# Report title, a blank line, then the header of the binary-data section.
printf '%s\n' \
  "=== ํ๊ตญ์ด ํ์ต ๋ฐ์ดํฐ ํํฉ ===" \
  "" \
  "[ ํ์ต ๋ฐ์ด๋๋ฆฌ ๋ฐ์ดํฐ ]"

#######################################
# Print a one-line status for a binary token file.
#
# The token count is derived from the file size: the file is treated as a
# flat array of 4-byte unsigned integers (the previous implementation read it
# with numpy as np.uint32). Counting bytes in the shell avoids starting
# python3/numpy per file and avoids pasting the path into Python source text,
# which broke on paths containing a single quote.
#
# Arguments:
#   $1 - path of the binary file to inspect
#   $2 - display name printed in the first column
# Outputs:
#   One line on stdout: "missing", "present with size/tokens/steps", or
#   "present but token count failed" (empty file, unreadable file, or a size
#   that is not a multiple of 4 bytes - the same cases np.memmap rejects).
# Returns:
#   The exit status of the final printf.
#######################################
check_binary_data() {
  local file=$1
  local name=$2
  local -r bytes_per_token=4
  # Tokens consumed per optimizer step, used for the step estimate.
  local -r tokens_per_step=1048576

  if [[ ! -f "$file" ]]; then
    printf " %-20s : ์์\n" "$name"
    return
  fi

  # Human-readable size; an empty value is tolerated if du fails.
  local size
  size=$(du -h -- "$file" | cut -f1) || size=""

  # token_count stays empty whenever the size cannot be turned into a count.
  local byte_count
  local token_count=""
  if byte_count=$(wc -c 2>/dev/null <"$file"); then
    # Arithmetic expansion also strips the padding some wc builds emit.
    byte_count=$((byte_count))
    if ((byte_count > 0 && byte_count % bytes_per_token == 0)); then
      token_count=$((byte_count / bytes_per_token))
    fi
  fi

  if [[ -n "$token_count" ]]; then
    local estimated_steps=$((token_count / tokens_per_step))
    printf " %-20s : ์กด์ฌ (%s, %'d ํ ํฐ, ~%'d steps)\n" \
      "$name" "$size" "$token_count" "$estimated_steps"
  else
    printf " %-20s : ์กด์ฌ (%s, ํ ํฐ ๊ณ์ฐ ์คํจ)\n" "$name" "$size"
  fi
}

# Report every known binary dataset; the label is the file's own name.
for bin_name in korean_train.bin korean_val.bin train.bin val.bin; do
  check_binary_data "data/${bin_name}" "${bin_name}"
done

# Blank separator line, then the header of the tokenizer section.
printf '\n%s\n' "[ ํ ํฌ๋์ด์ ]"

#######################################
# Print a one-line status for a tokenizer directory.
# Arguments:
#   $1 - directory to inspect
#   $2 - display name printed in the first column
# Outputs:
#   One line on stdout: either "missing", or "present" together with the
#   number of regular files found anywhere below the directory.
#######################################
check_tokenizer() {
  local dir=$1
  local name=$2

  # Guard clause: nothing to count when the directory is absent.
  if ! [ -d "$dir" ]; then
    printf " %-20s : ์์\n" "$name"
    return
  fi

  # Recursive count of regular files (no -maxdepth limit).
  local n_files
  n_files=$(find "$dir" -type f | wc -l)
  printf " %-20s : ์กด์ฌ (%d๊ฐ ํ์ผ)\n" "$name" "$n_files"
}

# Tokenizer locations to report, written as "directory|label" pairs.
for tokenizer_entry in \
  "tokenizer/korean_sp|korean_sp" \
  "tokenizer|default tokenizer"; do
  check_tokenizer "${tokenizer_entry%%|*}" "${tokenizer_entry#*|}"
done

# Blank separator line, then the header of the raw-data section.
printf '\n%s\n' "[ ์๋ณธ ๋ฐ์ดํฐ ]"

#######################################
# Print a one-line status for a raw corpus directory.
# Arguments:
#   $1 - directory to inspect
#   $2 - display name printed in the first column
# Outputs:
#   One line on stdout: "missing", "directory exists but holds 0 files", or
#   the file count together with the directory's total size.
#######################################
check_raw_data() {
  local dir=$1
  local name=$2

  # Guard clause: absent directory.
  if ! [ -d "$dir" ]; then
    printf " %-20s : ์์\n" "$name"
    return
  fi

  # The size covers the whole tree, while the count only looks at regular
  # files directly inside the directory (-maxdepth 1).
  local disk_usage
  local n_files
  disk_usage=$(du -sh "$dir" 2>/dev/null | cut -f1)
  n_files=$(find "$dir" -maxdepth 1 -type f | wc -l)

  if ((n_files == 0)); then
    printf " %-20s : ์์ (๋๋ ํ ๋ฆฌ๋ง ์กด์ฌ, 0 ํ์ผ)\n" "$name"
  else
    printf " %-20s : %'d ํ์ผ (%s)\n" "$name" "$n_files" "$disk_usage"
  fi
}

# Report each raw corpus directory; the label is the directory name plus "/".
for corpus in cc100_ko c4_ko namuwiki_ko; do
  check_raw_data "data/raw/${corpus}" "${corpus}/"
done

# Wikipedia section: counts ko_wiki_*.txt / en_wiki_*.txt entries located
# directly in data/raw (not in sub-directories). A line is printed only for a
# language that has at least one match.
echo ""
echo "[ ์ํคํผ๋์ ๋ฐ์ดํฐ ]"

# Default to zero and only search when data/raw exists, so a missing
# directory no longer makes find print errors into the report.
# The former ko_wiki_size value was dropped: it was never used, and it ran
# `du -sh` over the whole of data/raw rather than over the wiki files.
ko_wiki_count=0
en_wiki_count=0
if [[ -d "data/raw" ]]; then
  ko_wiki_count=$(find "data/raw" -maxdepth 1 -name "ko_wiki_*.txt" | wc -l)
  en_wiki_count=$(find "data/raw" -maxdepth 1 -name "en_wiki_*.txt" | wc -l)
fi

# Arithmetic tests also cope with the padding some wc builds emit.
if ((ko_wiki_count > 0)); then
  printf " %-20s : %'d ํ์ผ\n" "ko_wiki" "$ko_wiki_count"
fi

if ((en_wiki_count > 0)); then
  printf " %-20s : %'d ํ์ผ\n" "en_wiki" "$en_wiki_count"
fi

echo ""

# Overall readiness summary for the three artifact groups.
echo "[ ์ข
ํฉ ์ํ ]"

# Binary data counts as ready when either complete train/val pair exists.
if { [ -f "data/korean_train.bin" ] && [ -f "data/korean_val.bin" ]; } ||
  { [ -f "data/train.bin" ] && [ -f "data/val.bin" ]; }; then
  binary_ready=true
else
  binary_ready=false
fi

# The tokenizer counts as ready only when its model file is present.
if [ -d "tokenizer/korean_sp" ] && [ -f "tokenizer/korean_sp/tokenizer.model" ]; then
  tokenizer_ready=true
else
  tokenizer_ready=false
fi

# Raw data counts as ready when the known corpus directories together hold
# at least one top-level regular file.
raw_ready=false
if [ -d "data/raw/c4_ko" ] || [ -d "data/raw/namuwiki_ko" ] || [ -d "data/raw/cc100_ko" ]; then
  count=0
  for raw_dir in "data/raw/c4_ko" "data/raw/namuwiki_ko" "data/raw/cc100_ko"; do
    # Errors from directories that do not exist are discarded; they add 0.
    count=$((count + $(find "$raw_dir" -maxdepth 1 -type f 2>/dev/null | wc -l)))
  done
  if [ "$count" -gt 0 ]; then
    raw_ready=true
  fi
fi

# Maps a true/false flag to the label shown in the summary.
status_label() {
  if [ "$1" = true ]; then
    echo "โ ์ค๋น๋จ"
  else
    echo "โ ๋ฏธ์ค๋น"
  fi
}

printf " ํ์ต์ฉ ๋ฐ์ด๋๋ฆฌ : %s\n" "$(status_label "$binary_ready")"
printf " ํ ํฌ๋์ด์ : %s\n" "$(status_label "$tokenizer_ready")"
printf " ์๋ณธ ๋ฐ์ดํฐ : %s\n" "$(status_label "$raw_ready")"

echo ""

# Static reference block: the training configuration assumed by the step
# estimate (8 x 4 x 4096 x 8 = 1,048,576 tokens per step), followed by a
# blank line and the closing banner. One argument is printed per line.
printf '%s\n' \
  "[ ํ์ต ์ค์ (1B ๋ชจ๋ธ ๊ธฐ์ค) ]" \
  " ๋ฐฐ์น ์ฌ์ด์ฆ : 8" \
  " ์ํ์ค ๊ธธ์ด : 4096" \
  " GPU ์ : 8" \
  " ๊ทธ๋๋์ธํธ ๋์ : 4" \
  " ํ ํฐ/์คํ
: 8 ร 4 ร 4096 ร 8 = 1,048,576" \
  "" \
  "=== ๊ฒ์ฌ ์๋ฃ ==="