"""Tokenizer wrapper — loads a pretrained HuggingFace tokenizer."""
from typing import Optional, List
from llm_lab.config import DataConfig
class Tokenizer:
    """Pretrained tokenizer wrapper.

    Loads a pretrained HF tokenizer (e.g., LLaMA 2 tokenizer) and provides
    a unified encode/decode interface for the training pipeline.

    BPE (Byte Pair Encoding) core principle:
      1) Split text into byte/character units
      2) Repeatedly merge the most frequent adjacent pair
      3) Repeat until vocab_size is reached
      -> Frequent words become a single token; rare words are split into
         multiple tokens.
    """

    def __init__(self, config: "DataConfig"):
        """Store the config and default special-token IDs.

        The actual HF tokenizer is loaded lazily via load_pretrained_hf();
        until then encode()/decode() raise RuntimeError.
        """
        self.config = config
        self._tokenizer = None
        self.vocab_size = config.vocab_size
        # Installed by load_pretrained_hf(); kept None so misuse fails with
        # a clear RuntimeError instead of an opaque AttributeError.
        self._encode_fn = None
        self._decode_fn = None
        # Special token IDs (defaults; overwritten after loading).
        self.bos_id: int = 1  # Beginning of Sequence
        self.eos_id: int = 2  # End of Sequence
        self.pad_id: int = 0  # Padding

    def load_pretrained_hf(self, name_or_path: Optional[str] = None):
        """Loads a pretrained tokenizer from HuggingFace.

        Default: LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf mirror).
        - vocab_size : 32,000
        - SentencePiece BPE — optimal for 1B-scale models (TinyLlama, LLaMA 1/2)
        - No HuggingFace authentication required (community mirror)

        Official source (requires HF auth):
        - "meta-llama/Llama-2-7b-hf"
        """
        from transformers import AutoTokenizer

        name_or_path = name_or_path or self.config.tokenizer_name
        print(f"[Tokenizer] Loading HF tokenizer: {name_or_path}")
        tokenizer = AutoTokenizer.from_pretrained(name_or_path)
        self._tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size
        # BUGFIX: `x or default` would replace a legitimate token ID of 0
        # with the default (0 is falsy); compare against None instead.
        self.bos_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1
        self.eos_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2
        self.pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
        self._encode_fn = lambda text: tokenizer.encode(text, add_special_tokens=False)
        self._decode_fn = lambda ids: tokenizer.decode(ids)
        print(f"[Tokenizer] Loaded: vocab_size={self.vocab_size}")

    # ------------------------------------------------
    # Common interface
    # ------------------------------------------------
    def encode(self, text: str, add_special_tokens: bool = False) -> List[int]:
        """Text -> list of token IDs.

        Args:
            text: Input string to tokenize.
            add_special_tokens: When True, wrap the IDs in [bos_id, ..., eos_id].

        Raises:
            RuntimeError: If no tokenizer has been loaded yet.
        """
        if self._encode_fn is None:
            raise RuntimeError("Tokenizer not loaded; call load_pretrained_hf() first.")
        ids = self._encode_fn(text)
        if add_special_tokens:
            ids = [self.bos_id] + ids + [self.eos_id]
        return ids

    def decode(self, ids: List[int]) -> str:
        """List of token IDs -> text.

        Raises:
            RuntimeError: If no tokenizer has been loaded yet.
        """
        if self._decode_fn is None:
            raise RuntimeError("Tokenizer not loaded; call load_pretrained_hf() first.")
        return self._decode_fn(ids)

    def __len__(self) -> int:
        """Vocabulary size of the loaded (or configured) tokenizer."""
        return self.vocab_size