"""BPE tokenizer training and loading."""

import os
from typing import Iterator, Optional

from tokenizers import Tokenizer, models, pre_tokenizers, trainers

import config

# Order matters: the trainer assigns special-token ids in list order.
_SPECIAL_TOKENS = ["[PAD]", "[BOS]", "[EOS]", "[UNK]", "[MASK]"]
_PAD_TOKEN = "[PAD]"
_UNK_TOKEN = "[UNK]"


def _iter_lines(text_path: str) -> Iterator[str]:
    """Yield the lines of a UTF-8 text file one at a time to keep memory flat."""
    with open(text_path, "r", encoding="utf-8") as f:
        yield from f


def train_tokenizer(text_path: str, vocab_size: Optional[int] = None) -> Tokenizer:
    """Train a BPE tokenizer from a text file using whitespace pre-tokenization.

    The trained tokenizer is configured with fixed-length padding and
    truncation (both ``config.MAX_SEQ_LEN``), written to
    ``config.TOKENIZER_PATH`` and returned.

    Args:
        text_path: Path to the UTF-8 training corpus; it is streamed line by line.
        vocab_size: Target vocabulary size. Falls back to ``config.VOCAB_SIZE``
            when ``None``.

    Returns:
        The trained ``Tokenizer`` (padding and truncation still enabled).

    Raises:
        FileNotFoundError: If ``text_path`` does not exist (raised when the
            corpus is first read).
    """
    if vocab_size is None:
        vocab_size = config.VOCAB_SIZE

    # The model must be told which token stands in for unknown input;
    # merely listing "[UNK]" among the special tokens does not wire it up,
    # leaving the model with nothing to emit for out-of-vocabulary characters.
    tokenizer = Tokenizer(models.BPE(unk_token=_UNK_TOKEN))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=list(_SPECIAL_TOKENS),
        min_frequency=2,
    )

    tokenizer.train_from_iterator(_iter_lines(text_path), trainer=trainer)

    # NO post-processor: BOS/EOS are added manually in dataset/generate.
    tokenizer.enable_padding(
        length=config.MAX_SEQ_LEN,
        pad_id=tokenizer.token_to_id(_PAD_TOKEN),
    )
    tokenizer.enable_truncation(max_length=config.MAX_SEQ_LEN)

    os.makedirs(config.DATA_DIR, exist_ok=True)
    # TOKENIZER_PATH is not guaranteed to live directly inside DATA_DIR, so
    # make sure its own parent directory exists before saving.
    target_dir = os.path.dirname(config.TOKENIZER_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    tokenizer.save(config.TOKENIZER_PATH)
    print(f"Tokenizer saved: {config.TOKENIZER_PATH} | vocab={tokenizer.get_vocab_size()}")
    return tokenizer


def load_tokenizer() -> Tokenizer:
    """Load the tokenizer previously saved at ``config.TOKENIZER_PATH``.

    Returns:
        The deserialized ``Tokenizer``.

    Raises:
        FileNotFoundError: If no file exists at ``config.TOKENIZER_PATH``.
    """
    if not os.path.exists(config.TOKENIZER_PATH):
        raise FileNotFoundError(f"Tokenizer not found at {config.TOKENIZER_PATH}")
    return Tokenizer.from_file(config.TOKENIZER_PATH)


if __name__ == "__main__":
    tok = train_tokenizer(config.DATA_TEXT_PATH)
    # Disable padding/truncation on the in-memory instance only, so the demo
    # output shows the raw tokens rather than a MAX_SEQ_LEN-long padded row.
    tok.no_padding()
    tok.no_truncation()
    enc = tok.encode("Привет! Как дела?")
    print(f"Tokens: {enc.tokens}")
    print(f"IDs: {enc.ids}")
    print(f"Decoded: {tok.decode(enc.ids)}")