"""BPE tokenizer training and loading."""

import os
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

import config


def train_tokenizer(text_path: str, vocab_size: int = None) -> Tokenizer:
    """Train a BPE tokenizer on a text file, save it, and return it.

    The text is split with the ``Whitespace`` pre-tokenizer and merges are
    learned by a ``BpeTrainer``. The trained tokenizer is configured with
    fixed-length padding and truncation at ``config.MAX_SEQ_LEN`` and is
    written to ``config.TOKENIZER_PATH``.

    Args:
        text_path: Path to a UTF-8 text file; it is fed to the trainer one
            line at a time.
        vocab_size: Target vocabulary size. ``None`` (the default) means
            ``config.VOCAB_SIZE``.

    Returns:
        The trained tokenizer, with padding and truncation enabled.
    """
    if vocab_size is None:
        vocab_size = config.VOCAB_SIZE

    unk_token = "[UNK]"
    # Order matters: special tokens receive the lowest ids in this order.
    special_tokens = ["[PAD]", "[BOS]", "[EOS]", unk_token, "[MASK]"]

    # The model itself must be told which token stands in for unknown symbols;
    # listing "[UNK]" only in the trainer's special tokens reserves an id for
    # it but the model would never emit it.
    tokenizer = Tokenizer(models.BPE(unk_token=unk_token))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        min_frequency=2,
    )

    def line_iterator():
        # Stream lines from the file instead of reading it whole, to save memory.
        with open(text_path, "r", encoding="utf-8") as f:
            yield from f

    tokenizer.train_from_iterator(line_iterator(), trainer=trainer)

    # NO post-processor — BOS/EOS are added manually in dataset/generate.
    tokenizer.enable_padding(
        length=config.MAX_SEQ_LEN,
        pad_id=tokenizer.token_to_id("[PAD]"),
    )
    tokenizer.enable_truncation(max_length=config.MAX_SEQ_LEN)

    os.makedirs(config.DATA_DIR, exist_ok=True)
    # Also make sure the directory that will actually hold the file exists,
    # in case TOKENIZER_PATH does not live directly under DATA_DIR.
    save_dir = os.path.dirname(config.TOKENIZER_PATH)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    tokenizer.save(config.TOKENIZER_PATH)
    print(f"Tokenizer saved: {config.TOKENIZER_PATH} | vocab={tokenizer.get_vocab_size()}")
    return tokenizer


def load_tokenizer() -> Tokenizer:
    """Load the tokenizer stored at ``config.TOKENIZER_PATH``.

    Returns:
        The tokenizer deserialized from that file.

    Raises:
        FileNotFoundError: If nothing exists at ``config.TOKENIZER_PATH``.
    """
    tokenizer_file = config.TOKENIZER_PATH
    if os.path.exists(tokenizer_file):
        return Tokenizer.from_file(tokenizer_file)
    raise FileNotFoundError(f"Tokenizer not found at {tokenizer_file}")


if __name__ == "__main__":
    # Smoke test: train on the configured corpus, then round-trip a sample.
    tokenizer = train_tokenizer(config.DATA_TEXT_PATH)

    # Switch off padding and truncation so the printout shows only the
    # tokens produced for the sample itself.
    tokenizer.no_padding()
    tokenizer.no_truncation()

    sample = tokenizer.encode("Привет! Как дела?")
    token_ids = sample.ids
    print(f"Tokens: {sample.tokens}")
    print(f"IDs:    {token_ids}")
    print(f"Decoded: {tokenizer.decode(token_ids)}")