Use LLaMA 2 pretrained tokenizer and remove tokenizer_mode option
Replace custom tokenizer training/loading modes with a single pretrained
LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf, 32K vocab). This simplifies
setup_data_pipeline() to require no tokenizer arguments, using
DataConfig.tokenizer_name as the single source of truth.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- llm_lab/config/data_config.py +4 -4
- llm_lab/config/model_config.py +1 -1
- llm_lab/data/pipeline.py +4 -30
- llm_lab/data/tokenizer.py +8 -4
- notebooks/01_data_pipeline.ipynb +5 -45
- notebooks/03_training.ipynb +1 -11
- notebooks/04_evaluation.ipynb +3 -7
- notebooks/05_debugging.ipynb +1 -11
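The call-site change, sketched below for reference (illustrative snippet, not part of the diff; the DataConfig import path is assumed):

    # Before: tokenizer selection was a per-call argument.
    # tok, train_dl, val_dl = setup_data_pipeline(tokenizer_mode="pretrained",
    #                                             tokenizer_path="mistralai/Mistral-7B-v0.1")

    # After: DataConfig.tokenizer_name is the single source of truth.
    from llm_lab.config import DataConfig          # import path assumed
    from llm_lab.data import setup_data_pipeline

    tok, train_dl, val_dl = setup_data_pipeline(config=DataConfig())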
llm_lab/config/data_config.py
CHANGED
@@ -18,10 +18,10 @@ class DataConfig:
     text_column: str = "text"  # column name containing text
 
     # ── Tokenizer ──
-    ...
-    # ...
-    ...
-    vocab_size: int = 32_000
+    # LLaMA 2 tokenizer mirror (no HuggingFace authentication required)
+    # SentencePiece BPE, 32K vocab (optimal for 1B-scale models)
+    tokenizer_name: str = "NousResearch/Llama-2-7b-hf"
+    vocab_size: int = 32_000  # LLaMA 2 vocab size
 
     # ── Sequence ──
     max_seq_len: int = 2048
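With the tokenizer expressed as a config field, switching to another pretrained tokenizer is a config override rather than a new mode; a minimal sketch (the alternative name below is just an example, and vocab_size has to be kept in sync by hand):

    from llm_lab.config import DataConfig  # import path assumed

    # Default LLaMA 2 setup.
    cfg = DataConfig()  # tokenizer_name="NousResearch/Llama-2-7b-hf", vocab_size=32_000

    # Hypothetical override: any HF tokenizer name works, but vocab_size must match it.
    cfg_alt = DataConfig(tokenizer_name="mistralai/Mistral-7B-v0.1", vocab_size=32_000)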
llm_lab/config/model_config.py
CHANGED
@@ -10,7 +10,7 @@ class ModelConfig:
     - small: ~100M (for intermediate validation)
     - base: ~1.1B (final target)
     """
-    vocab_size: int = 32_000
+    vocab_size: int = 32_000  # LLaMA 2 tokenizer vocab size
     hidden_dim: int = 2048  # d_model: base dimension of the model
     num_layers: int = 22  # number of Transformer blocks
     num_heads: int = 16  # number of Query heads
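ModelConfig.vocab_size only gains a comment here, but it must stay equal to the tokenizer vocab; a small illustrative check (not in the diff, import paths assumed):

    from llm_lab.config import DataConfig, ModelConfig  # import paths assumed

    data_config = DataConfig()
    model_config = ModelConfig()
    # The embedding table must match the LLaMA 2 tokenizer vocab.
    assert model_config.vocab_size == data_config.vocab_size == 32_000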
llm_lab/data/pipeline.py
CHANGED
@@ -87,34 +87,18 @@ def train_tokenizer_from_dataset(config: DataConfig) -> Tokenizer:
 
 
 def setup_data_pipeline(
-    tokenizer_mode: str = "train_new",
-    tokenizer_path: Optional[str] = None,
     config: Optional[DataConfig] = None,
 ) -> tuple:
     """Sets up the data pipeline in one call.
 
-    ...
-    ...
-        "train_new" - Train a new BPE tokenizer
-        "load_trained" - Load a previously trained tokenizer
-        "pretrained" - Use a pretrained HuggingFace tokenizer
-    tokenizer_path:
-        "train_new" -> Save directory (default: ./tokenizer)
-        "load_trained" -> Path to the saved tokenizer
-        "pretrained" -> HF model name (default: mistralai/Mistral-7B-v0.1)
+    Uses the LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf) by default.
+    Configurable via DataConfig.tokenizer_name.
 
     Returns:
         (tokenizer, train_dataloader, val_dataloader)
 
     Example usage (Colab):
-        ...
-        tok, train_dl, val_dl = setup_data_pipeline("train_new")
-
-        # Method 2: Load an existing tokenizer
-        tok, train_dl, val_dl = setup_data_pipeline("load_trained", "./tokenizer")
-
-        # Method 3: Use a pretrained tokenizer (simplest)
-        tok, train_dl, val_dl = setup_data_pipeline("pretrained")
+        tok, train_dl, val_dl = setup_data_pipeline()
     """
     config = config or DataConfig()
 
@@ -124,17 +108,7 @@ def setup_data_pipeline(
 
     # ── Step 1: Tokenizer ──
     tokenizer = Tokenizer(config)
-
-    if tokenizer_mode == "train_new":
-        tokenizer = train_tokenizer_from_dataset(config)
-    elif tokenizer_mode == "load_trained":
-        path = tokenizer_path or config.tokenizer_save_dir
-        tokenizer.load_trained_hf(path)
-    elif tokenizer_mode == "pretrained":
-        name = tokenizer_path or "mistralai/Mistral-7B-v0.1"
-        tokenizer.load_pretrained_hf(name)
-    else:
-        raise ValueError(f"Unknown tokenizer_mode: {tokenizer_mode}")
+    tokenizer.load_pretrained_hf()
 
     # ── Step 2: Training DataLoader ──
     print("\n[DataLoader] Creating training DataLoader...")
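With the mode switch gone, both documented entry points reduce to a single call; a short usage sketch mirroring the docstring above (the batch_size value is arbitrary, DataConfig import path assumed):

    from llm_lab.config import DataConfig          # import path assumed
    from llm_lab.data import setup_data_pipeline

    tok, train_dl, val_dl = setup_data_pipeline()                                  # all defaults
    tok, train_dl, val_dl = setup_data_pipeline(config=DataConfig(batch_size=8))   # custom config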
llm_lab/data/tokenizer.py
CHANGED
@@ -137,15 +137,19 @@ class Tokenizer:
     # Method 3: Load a pretrained HF tokenizer
     # ────────────────────────────────────────────────
 
-    def load_pretrained_hf(self, name_or_path: str = ...):
+    def load_pretrained_hf(self, name_or_path: Optional[str] = None):
         """Loads a pretrained tokenizer from HuggingFace.
 
-        ...
-        ...
-        ...
+        Default: LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf mirror).
+        - vocab_size: 32,000
+        - SentencePiece BPE, optimal for 1B-scale models (TinyLlama, LLaMA 1/2)
+        - No HuggingFace authentication required (community mirror)
+        Official source (requires HF auth):
+        - "meta-llama/Llama-2-7b-hf"
         """
         from transformers import AutoTokenizer
 
+        name_or_path = name_or_path or self.config.tokenizer_name
         print(f"[Tokenizer] Loading HF tokenizer: {name_or_path}")
         tokenizer = AutoTokenizer.from_pretrained(name_or_path)
 
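The new default argument means the method can be called bare and still resolve a tokenizer; a usage sketch (Tokenizer/DataConfig import paths assumed from the file locations):

    from llm_lab.config import DataConfig          # import path assumed
    from llm_lab.data.tokenizer import Tokenizer   # import path assumed

    tok = Tokenizer(DataConfig())
    tok.load_pretrained_hf()                             # falls back to config.tokenizer_name
    tok.load_pretrained_hf("meta-llama/Llama-2-7b-hf")   # explicit name, requires HF auth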
notebooks/01_data_pipeline.ipynb
CHANGED
@@ -3,21 +3,7 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "# 01. Data Pipeline\n",
-    "\n",
-    "Tokenizer preparation → data streaming → sequence packing → batch construction\n",
-    "\n",
-    "**Pipeline flow:**\n",
-    "```\n",
-    "FineWeb-Edu (HuggingFace)\n",
-    "  ↓ Load via streaming (no disk storage)\n",
-    "  ↓ Tokenization (BPE, vocab=32K)\n",
-    "  ↓ Sequence packing (concatenate documents up to max_seq_len)\n",
-    "  ↓ Batch construction (input_ids, targets)\n",
-    "  ↓ Transfer to GPU\n",
-    "```"
-   ]
+   "source": "# 01. Data Pipeline\n\nTokenizer preparation → data streaming → sequence packing → batch construction\n\n**Pipeline flow:**\n```\nFineWeb-Edu (HuggingFace)\n  ↓ Load via streaming (no disk storage)\n  ↓ Tokenization (LLaMA 2 BPE, vocab=32K)\n  ↓ Sequence packing (concatenate documents up to max_seq_len)\n  ↓ Batch construction (input_ids, targets)\n  ↓ Transfer to GPU\n```"
   },
   {
    "cell_type": "code",
@@ -59,45 +45,19 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "data_config = DataConfig(\n",
-    "    dataset_name=\"HuggingFaceFW/fineweb-edu\",\n",
-    "    dataset_subset=\"sample-10BT\",\n",
-    "    vocab_size=32_000,\n",
-    "    max_seq_len=2048,\n",
-    "    batch_size=4,\n",
-    "    num_workers=2,\n",
-    ")\n",
-    "\n",
-    "print(f\"Dataset: {data_config.dataset_name} ({data_config.dataset_subset})\")\n",
-    "print(f\"Sequence length: {data_config.max_seq_len}\")\n",
-    "print(f\"Batch size: {data_config.batch_size}\")\n",
-    "print(f\"Tokens/batch: {data_config.batch_size * data_config.max_seq_len:,}\")"
-   ]
+   "source": "data_config = DataConfig(\n    dataset_name=\"HuggingFaceFW/fineweb-edu\",\n    dataset_subset=\"sample-10BT\",\n    max_seq_len=2048,\n    batch_size=4,\n    num_workers=2,\n)\n\nprint(f\"Dataset: {data_config.dataset_name} ({data_config.dataset_subset})\")\nprint(f\"Tokenizer: {data_config.tokenizer_name} (vocab_size={data_config.vocab_size:,})\")\nprint(f\"Sequence length: {data_config.max_seq_len}\")\nprint(f\"Batch size: {data_config.batch_size}\")\nprint(f\"Tokens/batch: {data_config.batch_size * data_config.max_seq_len:,}\")"
   },
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "## 2. Tokenizer Setup\n",
-    "\n",
-    "Choose one of three methods:\n",
-    "- `\"pretrained\"` → pretrained HuggingFace tokenizer (simplest)\n",
-    "- `\"train_new\"` → train a new BPE tokenizer\n",
-    "- `\"load_trained\"` → load a previously trained tokenizer"
-   ]
+   "source": "## 2. Tokenizer + Data Pipeline Setup\n\nUses the **LLaMA 2 tokenizer** (`NousResearch/Llama-2-7b-hf`).\n- vocab_size: 32,000\n- SentencePiece BPE, the standard tokenizer for 1B-scale models (TinyLlama, LLaMA 1/2)\n- Community mirror, usable without HuggingFace authentication"
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
-    "    tokenizer_mode=\"pretrained\",  # can be changed to \"train_new\" or \"load_trained\"\n",
-    "    config=data_config,\n",
-    ")"
-   ]
+   "source": "tokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
   },
   {
    "cell_type": "markdown",
@@ -166,4 +126,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
notebooks/03_training.ipynb
CHANGED
@@ -110,17 +110,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Create the model\n",
-    "model = LLMModel(model_config)\n",
-    "print(f\"Model parameters: {model.count_parameters():,}\")\n",
-    "\n",
-    "# Data pipeline\n",
-    "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
-    "    tokenizer_mode=\"pretrained\",\n",
-    "    config=data_config,\n",
-    ")"
-   ]
+   "source": "# Create the model\nmodel = LLMModel(model_config)\nprint(f\"Model parameters: {model.count_parameters():,}\")\n\n# Data pipeline (LLaMA 2 tokenizer is used automatically)\ntokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
   },
   {
    "cell_type": "markdown",
notebooks/04_evaluation.ipynb
CHANGED
@@ -85,14 +85,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": ...,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "from llm_lab.data import setup_data_pipeline\n",
-    "\n",
-    "tokenizer, train_dl, val_dl = setup_data_pipeline(tokenizer_mode=\"train_new\")"
-   ]
+   "source": "from llm_lab.data import setup_data_pipeline\n\ntokenizer, train_dl, val_dl = setup_data_pipeline()"
   },
   {
    "cell_type": "code",
@@ -198,4 +194,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
notebooks/05_debugging.ipynb
CHANGED
@@ -101,17 +101,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# --- Model ---\n",
-    "model = LLMModel(model_config).to(device)\n",
-    "print(f\"Model parameters: {model.count_parameters():,}\")\n",
-    "\n",
-    "# --- Data pipeline ---\n",
-    "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
-    "    tokenizer_mode=\"pretrained\",\n",
-    "    config=data_config,\n",
-    ")"
-   ]
+   "source": "# --- Model ---\nmodel = LLMModel(model_config).to(device)\nprint(f\"Model parameters: {model.count_parameters():,}\")\n\n# --- Data pipeline (LLaMA 2 tokenizer is used automatically) ---\ntokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
   },
   {
    "cell_type": "markdown",