File size: 3,162 Bytes

5230c6c

"""Step 3: Encode Corpus to Token Arrays
======================================
Tokenizes the train/val/test splits and saves each one as a uint16 NumPy
`.npy` file that can later be opened memory-mapped (`np.load(..., mmap_mode='r')`).

Why memmap?
  - A 35MB corpus encodes to ~8-12M tokens (at ~3 chars/token for Bengali BPE)
  - Storing as np.uint16 costs ~16-24MB RAM — trivial
  - memmap means PyTorch DataLoader can read random windows WITHOUT loading
    the whole array into RAM — important if the corpus is later scaled up

Usage:
    python s03_encode_data.py
"""

import json
from pathlib import Path

import numpy as np
import sentencepiece as spm

# ---------------------------------------------------------------------------
# Both paths are relative, so they resolve against the current working directory.
# DATA_DIR: holds the `<split>.txt` inputs and receives the encoded token files.
DATA_DIR = Path('../../data')
# TOK_DIR: holds `tokenizer_config.json`, which main() reads and rewrites.
TOK_DIR = Path('tokenizer')


def encode_split(sp: spm.SentencePieceProcessor, split: str, config: dict) -> int:
    """Encode one split (train/val/test) to a flat uint16 array saved as ``.npy``.

    Reads ``DATA_DIR/<split>.txt``, tokenizes the whole text with ``sp`` and
    writes the token ids to ``DATA_DIR/<split>_tokens.npy``.

    Args:
        sp: Loaded SentencePiece processor used for encoding.
        split: Split name; selects both the input and the output file name.
        config: Tokenizer configuration. Not read here; kept so the call
            signature stays stable for callers.

    Returns:
        The number of tokens written, or 0 if the split's text file is missing.

    Raises:
        ValueError: If any token id falls outside the uint16 range, since
            storing it would corrupt the data.
    """
    text_path = DATA_DIR / f'{split}.txt'
    if not text_path.exists():
        print(f'  {split}.txt not found, skipping.')
        return 0

    text = text_path.read_text(encoding='utf-8')
    print(f'  Encoding {split} ({len(text):,} chars)...', end=' ', flush=True)

    # Encode the full text in one shot — SentencePiece handles large inputs
    token_ids = sp.encode(text, out_type=int)
    n_tokens = len(token_ids)

    # Go through int64 first so the range check sees the real ids; casting
    # straight to uint16 could wrap out-of-range values instead of failing.
    ids = np.asarray(token_ids, dtype=np.int64)
    uint16_max = np.iinfo(np.uint16).max
    if ids.size and (ids.min() < 0 or ids.max() > uint16_max):
        raise ValueError(
            f'{split}: token ids span [{int(ids.min())}, {int(ids.max())}], '
            f'which does not fit in uint16 (0..{uint16_max}).'
        )
    arr = ids.astype(np.uint16)

    # Write exactly one file. The name already ends in .npy, so np.save keeps
    # it as-is instead of appending a second extension.
    out_path = DATA_DIR / f'{split}_tokens.npy'
    np.save(str(out_path), arr)

    print(f'{n_tokens:,} tokens  →  {out_path}')
    return n_tokens


def main():
    """Encode every corpus split and record the token counts in the tokenizer config.

    Loads ``tokenizer_config.json`` from ``TOK_DIR``, loads the SentencePiece
    model it points to, encodes the train/val/test splits, writes the per-split
    token counts back into the config under ``encoding_stats`` and prints a
    summary table.

    Raises:
        FileNotFoundError: If the tokenizer config file does not exist.
    """
    cfg_file = TOK_DIR / 'tokenizer_config.json'
    if not cfg_file.exists():
        raise FileNotFoundError(
            'tokenizer_config.json not found. Run s02_train_tokenizer.py first.'
        )
    cfg = json.loads(cfg_file.read_text(encoding='utf-8'))

    # Bring up the tokenizer described by the config.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(cfg['model_path'])
    print(f'Loaded tokenizer: vocab_size={cfg["vocab_size"]}')

    # Encode the splits in a fixed order; a missing split contributes 0 tokens.
    print('\nEncoding splits:')
    counts = {
        name: encode_split(tokenizer, name, cfg)
        for name in ('train', 'val', 'test')
    }

    # Persist the counts next to the rest of the tokenizer settings.
    cfg['encoding_stats'] = counts
    serialized = json.dumps(cfg, ensure_ascii=False, indent=2)
    cfg_file.write_text(serialized, encoding='utf-8')

    # Summary table: one row per split, then the grand total.
    grand_total = sum(counts.values())
    print('\n' + '=' * 55)
    for name, count in counts.items():
        if grand_total:
            share = 100 * count / grand_total
        else:
            share = 0  # nothing was encoded; avoid dividing by zero
        print(f'  {name:5s}: {count:>9,} tokens  ({share:.1f}%)')
    print(f'  {"total":5s}: {grand_total:>9,} tokens')
    print('\nToken files: data/train_tokens.npy, val_tokens.npy, test_tokens.npy')
    print('\nNext step: python s04_train_model.py')


# Run the encoding step only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()