Transformers
English
Hindi
Sanskrit
sovereign-ai
ecological-intelligence
indian-llm
environmental-protection
File size: 2,075 Bytes
cd37343
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
from tokenizers.processors import TemplateProcessing

def train_sovereign_tokenizer(corpus_path, vocab_size=50257,
                              output_path="data/processed/aravalli_tokenizer.json"):
    """
    Train a custom byte-level BPE tokenizer optimized for Indic and
    Ecological technical terms.

    Args:
        corpus_path: Directory scanned (non-recursively) for ``.txt`` corpus files.
        vocab_size: Target vocabulary size; 50,257 matches model_config.yaml.
        output_path: Where the trained tokenizer JSON is written. The parent
            directory is created if missing.

    Raises:
        FileNotFoundError: If ``corpus_path`` contains no ``.txt`` files.
    """
    # ByteLevel pre-tokenization lets us represent any UTF-8 input as bytes,
    # so <|unk|> should effectively never be emitted.
    tokenizer = Tokenizer(BPE(unk_token="<|unk|>"))
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    # Special tokens include the GOEC protocol markers (SN, NE, IPN).
    # Seeding initial_alphabet with the full byte-level alphabet guarantees
    # coverage of every byte before merges are learned.
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[
            "<|endoftext|>",
            "<|unk|>",
            "<|pad|>",
            "CATEGORY_SN",
            "CATEGORY_NE",
            "CATEGORY_IPN"
        ],
        show_progress=True,
        initial_alphabet=ByteLevel.alphabet()
    )

    # Collect the corpus up front so an empty directory fails loudly here
    # instead of with an opaque error inside tokenizer.train().
    files = [os.path.join(corpus_path, f) for f in os.listdir(corpus_path) if f.endswith(".txt")]
    if not files:
        raise FileNotFoundError(f"No .txt corpus files found in {corpus_path}")

    print(f"Commencing Tokenizer Training on {corpus_path}...")
    tokenizer.train(files, trainer)

    # Look the id up rather than hard-coding 0, so the template stays correct
    # even if the special-token ordering above ever changes.
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>",
        special_tokens=[("<|endoftext|>", eot_id)],
    )

    # Ensure the output directory exists; save() does not create it and would
    # otherwise raise FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    tokenizer.save(output_path)
    print("Sovereign Tokenizer Enacted and Saved to data/processed/")

if __name__ == "__main__":
    # Ensure the corpus directory exists (exist_ok avoids the check/create race).
    os.makedirs("data/raw/", exist_ok=True)
    # A freshly created data/raw/ is empty; training on it would only crash,
    # so give the operator an actionable message instead.
    if any(f.endswith(".txt") for f in os.listdir("data/raw/")):
        train_sovereign_tokenizer("data/raw/")
    else:
        print("No .txt corpus files found in data/raw/ — add corpus files before training.")