Vjeong Claude Opus 4.6 committed on
Commit
a5ca4e4
ยท
1 Parent(s): 8626149

Use LLaMA 2 pretrained tokenizer and remove tokenizer_mode option

Browse files

Replace custom tokenizer training/loading modes with a single pretrained
LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf, 32K vocab). This simplifies
setup_data_pipeline() to require no tokenizer arguments, using
DataConfig.tokenizer_name as the single source of truth.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

llm_lab/config/data_config.py CHANGED
@@ -18,10 +18,10 @@ class DataConfig:
18
  text_column: str = "text" # column name containing text
19
 
20
  # โ”€โ”€ Tokenizer โ”€โ”€
21
- tokenizer_type: str = "sentencepiece" # "sentencepiece" or "hf"
22
- # path to a pretrained tokenizer (trains a new one if not provided)
23
- tokenizer_path: Optional[str] = None
24
- vocab_size: int = 32_000
25
 
26
  # โ”€โ”€ Sequence โ”€โ”€
27
  max_seq_len: int = 2048
 
18
  text_column: str = "text" # column name containing text
19
 
20
  # โ”€โ”€ Tokenizer โ”€โ”€
21
+ # LLaMA 2 tokenizer mirror โ€” no HuggingFace authentication required
22
+ # SentencePiece BPE, 32K vocab โ€” optimal for 1B-scale models
23
+ tokenizer_name: str = "NousResearch/Llama-2-7b-hf"
24
+ vocab_size: int = 32_000 # LLaMA 2 vocab size
25
 
26
  # โ”€โ”€ Sequence โ”€โ”€
27
  max_seq_len: int = 2048
llm_lab/config/model_config.py CHANGED
@@ -10,7 +10,7 @@ class ModelConfig:
10
  - small: ~100M (for intermediate validation)
11
  - base: ~1.1B (final target)
12
  """
13
- vocab_size: int = 32_000
14
  hidden_dim: int = 2048 # d_model: base dimension of the model
15
  num_layers: int = 22 # number of Transformer blocks
16
  num_heads: int = 16 # number of Query heads
 
10
  - small: ~100M (for intermediate validation)
11
  - base: ~1.1B (final target)
12
  """
13
+ vocab_size: int = 32_000 # LLaMA 2 tokenizer vocab size
14
  hidden_dim: int = 2048 # d_model: base dimension of the model
15
  num_layers: int = 22 # number of Transformer blocks
16
  num_heads: int = 16 # number of Query heads
llm_lab/data/pipeline.py CHANGED
@@ -87,34 +87,18 @@ def train_tokenizer_from_dataset(config: DataConfig) -> Tokenizer:
87
 
88
 
89
  def setup_data_pipeline(
90
- tokenizer_mode: str = "train_new",
91
- tokenizer_path: Optional[str] = None,
92
  config: Optional[DataConfig] = None,
93
  ) -> tuple:
94
  """Sets up the data pipeline in one call.
95
 
96
- Args:
97
- tokenizer_mode:
98
- "train_new" - Train a new BPE tokenizer
99
- "load_trained" - Load a previously trained tokenizer
100
- "pretrained" - Use a pretrained HuggingFace tokenizer
101
- tokenizer_path:
102
- "train_new" -> Save directory (default: ./tokenizer)
103
- "load_trained" -> Path to the saved tokenizer
104
- "pretrained" -> HF model name (default: mistralai/Mistral-7B-v0.1)
105
 
106
  Returns:
107
  (tokenizer, train_dataloader, val_dataloader)
108
 
109
  Example usage (Colab):
110
- # Method 1: Train a new tokenizer
111
- tok, train_dl, val_dl = setup_data_pipeline("train_new")
112
-
113
- # Method 2: Load an existing tokenizer
114
- tok, train_dl, val_dl = setup_data_pipeline("load_trained", "./tokenizer")
115
-
116
- # Method 3: Use a pretrained tokenizer (simplest)
117
- tok, train_dl, val_dl = setup_data_pipeline("pretrained")
118
  """
119
  config = config or DataConfig()
120
 
@@ -124,17 +108,7 @@ def setup_data_pipeline(
124
 
125
  # โ”€โ”€ Step 1: Tokenizer โ”€โ”€
126
  tokenizer = Tokenizer(config)
127
-
128
- if tokenizer_mode == "train_new":
129
- tokenizer = train_tokenizer_from_dataset(config)
130
- elif tokenizer_mode == "load_trained":
131
- path = tokenizer_path or config.tokenizer_save_dir
132
- tokenizer.load_trained_hf(path)
133
- elif tokenizer_mode == "pretrained":
134
- name = tokenizer_path or "mistralai/Mistral-7B-v0.1"
135
- tokenizer.load_pretrained_hf(name)
136
- else:
137
- raise ValueError(f"Unknown tokenizer_mode: {tokenizer_mode}")
138
 
139
  # โ”€โ”€ Step 2: Training DataLoader โ”€โ”€
140
  print("\n[DataLoader] Creating training DataLoader...")
 
87
 
88
 
89
  def setup_data_pipeline(
 
 
90
  config: Optional[DataConfig] = None,
91
  ) -> tuple:
92
  """Sets up the data pipeline in one call.
93
 
94
+ Uses the LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf) by default.
95
+ Configurable via DataConfig.tokenizer_name.
 
 
 
 
 
 
 
96
 
97
  Returns:
98
  (tokenizer, train_dataloader, val_dataloader)
99
 
100
  Example usage (Colab):
101
+ tok, train_dl, val_dl = setup_data_pipeline()
 
 
 
 
 
 
 
102
  """
103
  config = config or DataConfig()
104
 
 
108
 
109
  # โ”€โ”€ Step 1: Tokenizer โ”€โ”€
110
  tokenizer = Tokenizer(config)
111
+ tokenizer.load_pretrained_hf()
 
 
 
 
 
 
 
 
 
 
112
 
113
  # โ”€โ”€ Step 2: Training DataLoader โ”€โ”€
114
  print("\n[DataLoader] Creating training DataLoader...")
llm_lab/data/tokenizer.py CHANGED
@@ -137,15 +137,19 @@ class Tokenizer:
137
  # Method 3: Load a pretrained HF tokenizer
138
  # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
139
 
140
- def load_pretrained_hf(self, name_or_path: str = "meta-llama/Llama-2-7b-hf"):
141
  """Loads a pretrained tokenizer from HuggingFace.
142
 
143
- The simplest method. The LLaMA tokenizer has a 32K vocab and is BPE-based.
144
- Note: meta-llama models may require HF approval to access.
145
- Alternative: mistralai/Mistral-7B-v0.1 (no approval required)
 
 
 
146
  """
147
  from transformers import AutoTokenizer
148
 
 
149
  print(f"[Tokenizer] Loading HF tokenizer: {name_or_path}")
150
  tokenizer = AutoTokenizer.from_pretrained(name_or_path)
151
 
 
137
  # Method 3: Load a pretrained HF tokenizer
138
  # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
139
 
140
+ def load_pretrained_hf(self, name_or_path: Optional[str] = None):
141
  """Loads a pretrained tokenizer from HuggingFace.
142
 
143
+ Default: LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf mirror).
144
+ - vocab_size : 32,000
145
+ - SentencePiece BPE โ€” optimal for 1B-scale models (TinyLlama, LLaMA 1/2)
146
+ - No HuggingFace authentication required (community mirror)
147
+ Official source (requires HF auth):
148
+ - "meta-llama/Llama-2-7b-hf"
149
  """
150
  from transformers import AutoTokenizer
151
 
152
+ name_or_path = name_or_path or self.config.tokenizer_name
153
  print(f"[Tokenizer] Loading HF tokenizer: {name_or_path}")
154
  tokenizer = AutoTokenizer.from_pretrained(name_or_path)
155
 
notebooks/01_data_pipeline.ipynb CHANGED
@@ -3,21 +3,7 @@
3
  {
4
  "cell_type": "markdown",
5
  "metadata": {},
6
- "source": [
7
- "# 01. ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ\n",
8
- "\n",
9
- "ํ† ํฌ๋‚˜์ด์ € ์ค€๋น„ โ†’ ๋ฐ์ดํ„ฐ ์ŠคํŠธ๋ฆฌ๋ฐ โ†’ ์‹œํ€€์Šค ํŒจํ‚น โ†’ ๋ฐฐ์น˜ ๊ตฌ์„ฑ\n",
10
- "\n",
11
- "**ํŒŒ์ดํ”„๋ผ์ธ ํ๋ฆ„:**\n",
12
- "```\n",
13
- "FineWeb-Edu (HuggingFace)\n",
14
- " โ†’ Streaming์œผ๋กœ ๋กœ๋“œ (๋””์Šคํฌ ์ €์žฅ ์—†์Œ)\n",
15
- " โ†’ ํ† ํฌ๋‚˜์ด์ง• (BPE, vocab=32K)\n",
16
- " โ†’ ์‹œํ€€์Šค ํŒจํ‚น (์—ฌ๋Ÿฌ ๋ฌธ์„œ๋ฅผ max_seq_len์œผ๋กœ ์—ฐ๊ฒฐ)\n",
17
- " โ†’ ๋ฐฐ์น˜ ๊ตฌ์„ฑ (input_ids, targets)\n",
18
- " โ†’ GPU ์ „์†ก\n",
19
- "```"
20
- ]
21
  },
22
  {
23
  "cell_type": "code",
@@ -59,45 +45,19 @@
59
  "execution_count": null,
60
  "metadata": {},
61
  "outputs": [],
62
- "source": [
63
- "data_config = DataConfig(\n",
64
- " dataset_name=\"HuggingFaceFW/fineweb-edu\",\n",
65
- " dataset_subset=\"sample-10BT\",\n",
66
- " vocab_size=32_000,\n",
67
- " max_seq_len=2048,\n",
68
- " batch_size=4,\n",
69
- " num_workers=2,\n",
70
- ")\n",
71
- "\n",
72
- "print(f\"๋ฐ์ดํ„ฐ์…‹: {data_config.dataset_name} ({data_config.dataset_subset})\")\n",
73
- "print(f\"์‹œํ€€์Šค ๊ธธ์ด: {data_config.max_seq_len}\")\n",
74
- "print(f\"๋ฐฐ์น˜ ํฌ๊ธฐ: {data_config.batch_size}\")\n",
75
- "print(f\"ํ† ํฐ/๋ฐฐ์น˜: {data_config.batch_size * data_config.max_seq_len:,}\")"
76
- ]
77
  },
78
  {
79
  "cell_type": "markdown",
80
  "metadata": {},
81
- "source": [
82
- "## 2. ํ† ํฌ๋‚˜์ด์ € ์„ค์ •\n",
83
- "\n",
84
- "์„ธ ๊ฐ€์ง€ ๋ฐฉ๋ฒ• ์ค‘ ์„ ํƒ:\n",
85
- "- `\"pretrained\"` โ€” HuggingFace ์‚ฌ์ „ํ•™์Šต ํ† ํฌ๋‚˜์ด์ € (๊ฐ€์žฅ ๊ฐ„ํŽธ)\n",
86
- "- `\"train_new\"` โ€” BPE ํ† ํฌ๋‚˜์ด์ € ์ƒˆ๋กœ ํ•™์Šต\n",
87
- "- `\"load_trained\"` โ€” ์ด์ „์— ํ•™์Šตํ•œ ํ† ํฌ๋‚˜์ด์ € ๋กœ๋“œ"
88
- ]
89
  },
90
  {
91
  "cell_type": "code",
92
  "execution_count": null,
93
  "metadata": {},
94
  "outputs": [],
95
- "source": [
96
- "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
97
- " tokenizer_mode=\"pretrained\", # \"train_new\" ๋˜๋Š” \"load_trained\"๋กœ ๋ณ€๊ฒฝ ๊ฐ€๋Šฅ\n",
98
- " config=data_config,\n",
99
- ")"
100
- ]
101
  },
102
  {
103
  "cell_type": "markdown",
@@ -166,4 +126,4 @@
166
  },
167
  "nbformat": 4,
168
  "nbformat_minor": 4
169
- }
 
3
  {
4
  "cell_type": "markdown",
5
  "metadata": {},
6
+ "source": "# 01. ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ\n\nํ† ํฌ๋‚˜์ด์ € ์ค€๋น„ โ†’ ๋ฐ์ดํ„ฐ ์ŠคํŠธ๋ฆฌ๋ฐ โ†’ ์‹œํ€€์Šค ํŒจํ‚น โ†’ ๋ฐฐ์น˜ ๊ตฌ์„ฑ\n\n**ํŒŒ์ดํ”„๋ผ์ธ ํ๋ฆ„:**\n```\nFineWeb-Edu (HuggingFace)\n โ†’ Streaming์œผ๋กœ ๋กœ๋“œ (๋””์Šคํฌ ์ €์žฅ ์—†์Œ)\n โ†’ ํ† ํฌ๋‚˜์ด์ง• (LLaMA 2 BPE, vocab=32K)\n โ†’ ์‹œํ€€์Šค ํŒจํ‚น (์—ฌ๋Ÿฌ ๋ฌธ์„œ๋ฅผ max_seq_len์œผ๋กœ ์—ฐ๊ฒฐ)\n โ†’ ๋ฐฐ์น˜ ๊ตฌ์„ฑ (input_ids, targets)\n โ†’ GPU ์ „์†ก\n```"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  },
8
  {
9
  "cell_type": "code",
 
45
  "execution_count": null,
46
  "metadata": {},
47
  "outputs": [],
48
+ "source": "data_config = DataConfig(\n dataset_name=\"HuggingFaceFW/fineweb-edu\",\n dataset_subset=\"sample-10BT\",\n max_seq_len=2048,\n batch_size=4,\n num_workers=2,\n)\n\nprint(f\"๋ฐ์ดํ„ฐ์…‹: {data_config.dataset_name} ({data_config.dataset_subset})\")\nprint(f\"ํ† ํฌ๋‚˜์ด์ €: {data_config.tokenizer_name} (vocab_size={data_config.vocab_size:,})\")\nprint(f\"์‹œํ€€์Šค ๊ธธ์ด: {data_config.max_seq_len}\")\nprint(f\"๋ฐฐ์น˜ ํฌ๊ธฐ: {data_config.batch_size}\")\nprint(f\"ํ† ํฐ/๋ฐฐ์น˜: {data_config.batch_size * data_config.max_seq_len:,}\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  },
50
  {
51
  "cell_type": "markdown",
52
  "metadata": {},
53
+ "source": "## 2. ํ† ํฌ๋‚˜์ด์ € + ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ ์„ค์ •\n\n**LLaMA 2 ํ† ํฌ๋‚˜์ด์ €** (`NousResearch/Llama-2-7b-hf`)๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.\n- vocab_size: 32,000\n- SentencePiece BPE โ€” 1B๊ธ‰ ๋ชจ๋ธ(TinyLlama, LLaMA 1/2)์˜ ํ‘œ์ค€ ํ† ํฌ๋‚˜์ด์ €\n- HuggingFace ์ธ์ฆ ์—†์ด ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ๊ณต์‹ ๋ฏธ๋Ÿฌ"
 
 
 
 
 
 
 
54
  },
55
  {
56
  "cell_type": "code",
57
  "execution_count": null,
58
  "metadata": {},
59
  "outputs": [],
60
+ "source": "tokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
 
 
 
 
 
61
  },
62
  {
63
  "cell_type": "markdown",
 
126
  },
127
  "nbformat": 4,
128
  "nbformat_minor": 4
129
+ }
notebooks/03_training.ipynb CHANGED
@@ -110,17 +110,7 @@
110
  "execution_count": null,
111
  "metadata": {},
112
  "outputs": [],
113
- "source": [
114
- "# ๋ชจ๋ธ ์ƒ์„ฑ\n",
115
- "model = LLMModel(model_config)\n",
116
- "print(f\"๋ชจ๋ธ ํŒŒ๋ผ๋ฏธํ„ฐ: {model.count_parameters():,}\")\n",
117
- "\n",
118
- "# ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ\n",
119
- "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
120
- " tokenizer_mode=\"pretrained\",\n",
121
- " config=data_config,\n",
122
- ")"
123
- ]
124
  },
125
  {
126
  "cell_type": "markdown",
 
110
  "execution_count": null,
111
  "metadata": {},
112
  "outputs": [],
113
+ "source": "# ๋ชจ๋ธ ์ƒ์„ฑ\nmodel = LLMModel(model_config)\nprint(f\"๋ชจ๋ธ ํŒŒ๋ผ๋ฏธํ„ฐ: {model.count_parameters():,}\")\n\n# ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ (GPT-2 ํ† ํฌ๋‚˜์ด์ € ์ž๋™ ์‚ฌ์šฉ)\ntokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
 
 
 
 
 
 
 
 
 
 
114
  },
115
  {
116
  "cell_type": "markdown",
notebooks/04_evaluation.ipynb CHANGED
@@ -85,14 +85,10 @@
85
  },
86
  {
87
  "cell_type": "code",
88
- "execution_count": 5,
89
  "metadata": {},
90
  "outputs": [],
91
- "source": [
92
- "from llm_lab.data import setup_data_pipeline\n",
93
- "\n",
94
- "tokenizer, train_dl, val_dl = setup_data_pipeline(tokenizer_mode=\"train_new\")"
95
- ]
96
  },
97
  {
98
  "cell_type": "code",
@@ -198,4 +194,4 @@
198
  },
199
  "nbformat": 4,
200
  "nbformat_minor": 4
201
- }
 
85
  },
86
  {
87
  "cell_type": "code",
88
+ "execution_count": null,
89
  "metadata": {},
90
  "outputs": [],
91
+ "source": "from llm_lab.data import setup_data_pipeline\n\ntokenizer, train_dl, val_dl = setup_data_pipeline()"
 
 
 
 
92
  },
93
  {
94
  "cell_type": "code",
 
194
  },
195
  "nbformat": 4,
196
  "nbformat_minor": 4
197
+ }
notebooks/05_debugging.ipynb CHANGED
@@ -101,17 +101,7 @@
101
  "execution_count": null,
102
  "metadata": {},
103
  "outputs": [],
104
- "source": [
105
- "# --- Model ---\n",
106
- "model = LLMModel(model_config).to(device)\n",
107
- "print(f\"Model parameters: {model.count_parameters():,}\")\n",
108
- "\n",
109
- "# --- Data pipeline ---\n",
110
- "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
111
- " tokenizer_mode=\"pretrained\",\n",
112
- " config=data_config,\n",
113
- ")"
114
- ]
115
  },
116
  {
117
  "cell_type": "markdown",
 
101
  "execution_count": null,
102
  "metadata": {},
103
  "outputs": [],
104
+ "source": "# --- Model ---\nmodel = LLMModel(model_config).to(device)\nprint(f\"Model parameters: {model.count_parameters():,}\")\n\n# --- Data pipeline (LLaMA 2 tokenizer ์ž๋™ ์‚ฌ์šฉ) ---\ntokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
 
 
 
 
 
 
 
 
 
 
105
  },
106
  {
107
  "cell_type": "markdown",