Vjeong Claude Opus 4.6 committed on
Commit 33ba3d1 · 1 Parent(s): e70bc05

Remove unused tokenizer training code (train_bpe, load_sentencepiece, load_trained_hf)

Since the project now uses the pretrained LLaMA 2 tokenizer exclusively,
remove all custom tokenizer training infrastructure that is no longer called.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
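
For context, the only tokenizer path this commit keeps is the pretrained loader. Below is a minimal usage sketch, not code from the diff: the model identifier passed to load_pretrained_hf() is an assumed example, and the encode()/decode() calls assume the class's "Common interface" section wraps _encode_fn/_decode_fn as the removed methods did.

    from llm_lab.config import DataConfig
    from llm_lab.data import Tokenizer

    config = DataConfig()
    tokenizer = Tokenizer(config)

    # Assumed identifier for illustration; name_or_path is Optional, so the
    # method presumably falls back to a config-supplied default when omitted.
    tokenizer.load_pretrained_hf("meta-llama/Llama-2-7b-hf")

    ids = tokenizer.encode("Hello, world!")  # common encode/decode interface
    print(tokenizer.decode(ids))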

llm_lab/config/data_config.py CHANGED
@@ -33,9 +33,5 @@ class DataConfig:
     num_workers: int = 2      # number of DataLoader workers
     prefetch_factor: int = 4  # number of batches to prefetch
 
-    # ── Tokenizer training settings (when training from scratch) ──
-    tokenizer_train_samples: int = 50_000  # number of documents to use for training
-    tokenizer_save_dir: str = "./tokenizer"
-
     # ── Validation data ──
     val_ratio: float = 0.001  # use 0.1% of total data for validation
llm_lab/data/__init__.py CHANGED
@@ -1,11 +1,11 @@
 """Data pipeline module — tokenizer, streaming, and sequence packing."""
 from .tokenizer import Tokenizer
 from .dataset import PackedStreamingDataset, ValidationDataset
-from .pipeline import create_train_dataloader, train_tokenizer_from_dataset, setup_data_pipeline
+from .pipeline import create_train_dataloader, setup_data_pipeline
 from .diagnostics import DataPipelineDiagnostics
 
 __all__ = [
     "Tokenizer", "PackedStreamingDataset", "ValidationDataset",
-    "create_train_dataloader", "train_tokenizer_from_dataset",
-    "setup_data_pipeline", "DataPipelineDiagnostics",
+    "create_train_dataloader", "setup_data_pipeline",
+    "DataPipelineDiagnostics",
 ]
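
After this change the module's public surface is the loader, the two dataset classes, the two pipeline helpers, and the diagnostics class. A sketch of how the remaining entry point is called; note that setup_data_pipeline() is only known to return a tuple, so what it contains is an assumption here:

    from llm_lab.config import DataConfig
    from llm_lab.data import setup_data_pipeline

    # Takes an optional DataConfig and returns a tuple; treating its contents
    # as (tokenizer, dataloader, ...) is an assumption for illustration only.
    result = setup_data_pipeline(DataConfig())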
llm_lab/data/pipeline.py CHANGED
@@ -1,4 +1,4 @@
-"""Data pipeline integration — DataLoader creation, tokenizer training, and Quick Start."""
+"""Data pipeline integration — DataLoader creation and Quick Start."""
 
 from typing import Optional
 
@@ -47,45 +47,6 @@ def create_train_dataloader(
     return dataloader
 
 
-def train_tokenizer_from_dataset(config: DataConfig) -> Tokenizer:
-    """Trains a BPE tokenizer from the dataset.
-
-    There is no need to use the entire dataset; 50K documents is sufficient,
-    since the tokenizer vocab only needs to reflect the statistics of the full data.
-    """
-    from datasets import load_dataset
-
-    print(f"[Train Tokenizer] Training tokenizer from {config.dataset_name}")
-    print(f"[Train Tokenizer] Number of training documents: {config.tokenizer_train_samples:,}")
-
-    # Create text iterator
-    ds = load_dataset(
-        config.dataset_name,
-        name=config.dataset_subset,
-        split=config.dataset_split,
-        streaming=True,
-        trust_remote_code=True,
-    )
-
-    def text_iterator():
-        count = 0
-        for example in ds:
-            if count >= config.tokenizer_train_samples:
-                break
-            text = example[config.text_column]
-            if text and text.strip():
-                yield text
-                count += 1
-                if count % 10_000 == 0:
-                    print(f"  ... {count:,} documents processed")
-
-    # Train tokenizer
-    tokenizer = Tokenizer(config)
-    tokenizer.train_bpe(text_iterator(), save_dir=config.tokenizer_save_dir)
-
-    return tokenizer
-
-
 def setup_data_pipeline(
     config: Optional[DataConfig] = None,
 ) -> tuple:
llm_lab/data/tokenizer.py CHANGED
@@ -1,24 +1,15 @@
-"""Tokenizer wrapper — SentencePiece / HuggingFace BPE integration."""
+"""Tokenizer wrapper — loads a pretrained HuggingFace tokenizer."""
 
-import os
-import json
-from typing import Optional, Iterator, List
+from typing import Optional, List
 
 from llm_lab.config import DataConfig
 
 
 class Tokenizer:
-    """Unified tokenizer wrapper.
+    """Pretrained tokenizer wrapper.
 
-    Supports three methods:
-      1) Load an existing SentencePiece model
-      2) Train a new tokenizer using the HuggingFace tokenizers library
-      3) Load a pretrained HF tokenizer (e.g., LLaMA tokenizer)
-
-    Why not implement from scratch?
-    - Training a BPE tokenizer involves large-scale text statistics processing,
-      which has little direct relevance to understanding model architecture.
-    - However, understanding how a tokenizer works (BPE merge rules) is still important.
+    Loads a pretrained HF tokenizer (e.g., LLaMA 2 tokenizer) and provides
+    a unified encode/decode interface for the training pipeline.
 
     BPE (Byte Pair Encoding) core principle:
       1) Split text into byte/character units
@@ -37,106 +28,6 @@ class Tokenizer:
         self.eos_id: int = 2  # End of Sequence
         self.pad_id: int = 0  # Padding
 
-    # ────────────────────────────────────────────────
-    # Method 1: Load a SentencePiece model
-    # ────────────────────────────────────────────────
-
-    def load_sentencepiece(self, model_path: str):
-        """Loads an existing SentencePiece model."""
-        import sentencepiece as spm
-
-        self._tokenizer = spm.SentencePieceProcessor()
-        self._tokenizer.Load(model_path)
-
-        self.vocab_size = self._tokenizer.GetPieceSize()
-        self.bos_id = self._tokenizer.bos_id()
-        self.eos_id = self._tokenizer.eos_id()
-        self.pad_id = self._tokenizer.pad_id()
-        self._encode_fn = self._tokenizer.Encode
-        self._decode_fn = self._tokenizer.Decode
-
-        print(f"[Tokenizer] SentencePiece loaded: vocab_size={self.vocab_size}")
-
-    # ────────────────────────────────────────────────
-    # Method 2: Train a BPE tokenizer with HuggingFace tokenizers
-    # ────────────────────────────────────────────────
-
-    def train_bpe(self, text_iterator: Iterator[str], save_dir: Optional[str] = None):
-        """Trains a BPE tokenizer from scratch.
-
-        Args:
-            text_iterator: Iterator that yields training text strings
-            save_dir: Directory path to save the trained tokenizer
-
-        Key insights:
-        - Larger vocab_size: common expressions become 1 token → shorter sequences
-        - Smaller vocab_size: saves embedding parameters, but sequences get longer
-        - 32K is a good balance point for English
-        """
-        from tokenizers import Tokenizer as HFTokenizer
-        from tokenizers.models import BPE
-        from tokenizers.trainers import BpeTrainer
-        from tokenizers.pre_tokenizers import ByteLevel
-        from tokenizers.decoders import ByteLevel as ByteLevelDecoder
-        from tokenizers.processors import TemplateProcessing
-
-        print("[Tokenizer] Starting BPE tokenizer training...")
-
-        # Create BPE model
-        tokenizer = HFTokenizer(BPE(unk_token="<unk>"))
-        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
-        tokenizer.decoder = ByteLevelDecoder()
-
-        # Define special tokens
-        special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]
-
-        # Configure trainer
-        trainer = BpeTrainer(
-            vocab_size=self.config.vocab_size,
-            special_tokens=special_tokens,
-            min_frequency=2,  # Only merge pairs that appear at least twice
-            show_progress=True,
-        )
-
-        # Run training
-        tokenizer.train_from_iterator(text_iterator, trainer=trainer)
-
-        # Post-processing: automatically add BOS/EOS
-        tokenizer.post_processor = TemplateProcessing(
-            single="<s> $A </s>",
-            special_tokens=[("<s>", 1), ("</s>", 2)],
-        )
-
-        self._tokenizer = tokenizer
-        self.vocab_size = tokenizer.get_vocab_size()
-        self.pad_id = 0
-        self.bos_id = 1
-        self.eos_id = 2
-
-        self._encode_fn = lambda text: tokenizer.encode(text).ids
-        self._decode_fn = lambda ids: tokenizer.decode(ids)
-
-        # Save
-        save_dir = save_dir or self.config.tokenizer_save_dir
-        os.makedirs(save_dir, exist_ok=True)
-        tokenizer.save(os.path.join(save_dir, "tokenizer.json"))
-        # Save metadata
-        meta = {
-            "vocab_size": self.vocab_size,
-            "bos_id": self.bos_id,
-            "eos_id": self.eos_id,
-            "pad_id": self.pad_id,
-        }
-        with open(os.path.join(save_dir, "tokenizer_meta.json"), "w") as f:
-            json.dump(meta, f, indent=2)
-
-        print(f"[Tokenizer] Training complete: vocab_size={self.vocab_size}")
-        print(f"[Tokenizer] Saved to: {save_dir}")
-
-    # ────────────────────────────────────────────────
-    # Method 3: Load a pretrained HF tokenizer
-    # ────────────────────────────────────────────────
-
     def load_pretrained_hf(self, name_or_path: Optional[str] = None):
         """Loads a pretrained tokenizer from HuggingFace.
 
@@ -164,29 +55,6 @@ class Tokenizer:
 
         print(f"[Tokenizer] Loaded: vocab_size={self.vocab_size}")
 
-    def load_trained_hf(self, path: str):
-        """Reloads a tokenizer previously trained with train_bpe()."""
-        from tokenizers import Tokenizer as HFTokenizer
-        from tokenizers.decoders import ByteLevel as ByteLevelDecoder
-
-        tokenizer = HFTokenizer.from_file(os.path.join(path, "tokenizer.json"))
-        # Ensure ByteLevel decoder is set (may be missing in older tokenizer files)
-        if tokenizer.decoder is None:
-            tokenizer.decoder = ByteLevelDecoder()
-        with open(os.path.join(path, "tokenizer_meta.json"), "r") as f:
-            meta = json.load(f)
-
-        self._tokenizer = tokenizer
-        self.vocab_size = meta["vocab_size"]
-        self.bos_id = meta["bos_id"]
-        self.eos_id = meta["eos_id"]
-        self.pad_id = meta["pad_id"]
-
-        self._encode_fn = lambda text: tokenizer.encode(text).ids
-        self._decode_fn = lambda ids: tokenizer.decode(ids)
-
-        print(f"[Tokenizer] Loaded: vocab_size={self.vocab_size}")
-
     # ────────────────────────────────────────────────
     # Common interface
     # ────────────────────────────────────────────────
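
The class docstring kept by this commit still walks through the BPE core principle (split into byte/character units, then repeatedly merge the most frequent adjacent pair) even though the training code is gone. As a side note for readers, here is a self-contained toy of that merge loop; it is illustrative only and does not appear anywhere in the repo:

    from collections import Counter

    def most_frequent_pair(words):
        """Count adjacent symbol pairs across a toy corpus of symbol tuples."""
        pairs = Counter()
        for symbols, freq in words.items():
            for a, b in zip(symbols, symbols[1:]):
                pairs[(a, b)] += freq
        return max(pairs, key=pairs.get)

    def merge_pair(words, pair):
        """Replace every occurrence of `pair` with one merged symbol."""
        merged = {}
        for symbols, freq in words.items():
            out, i = [], 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == pair:
                    out.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    out.append(symbols[i])
                    i += 1
            merged[tuple(out)] = freq
        return merged

    # Toy corpus: word -> frequency, each word pre-split into characters.
    words = {tuple("lower"): 2, tuple("lowest"): 1, tuple("newer"): 3}
    for _ in range(3):
        pair = most_frequent_pair(words)
        words = merge_pair(words, pair)
        print("merged", pair, "->", list(words))

The first iteration merges ('w', 'e'), the most frequent pair in the toy corpus; real BPE trainers such as the removed train_bpe() run this loop until the target vocab_size is reached.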