File size: 3,162 Bytes
"""Step 3: Encode Corpus to Token Arrays
======================================
Tokenizes the train/val/test splits and saves each one as a NumPy `.npy`
array of uint16 token ids that can be memory-mapped at load time
(`np.load(path, mmap_mode='r')`).

Why memmap?
- A 35MB corpus encodes to ~8-12M tokens (at ~3 chars/token for Bengali BPE)
- Storing as np.uint16 costs ~16-24MB RAM — trivial
- memmap means a PyTorch DataLoader can read random windows WITHOUT loading
  the whole array into RAM — important if the corpus is later scaled up

Usage:
    python s03_encode_data.py
"""
import json
from pathlib import Path
import numpy as np
import sentencepiece as spm
# ---------------------------------------------------------------------------
# Both paths are relative, so they resolve against the current working
# directory of the process running this script.
# Directory holding the `<split>.txt` inputs; the encoded token arrays are
# written here as well.
DATA_DIR = Path('../../data')
# Directory expected to contain `tokenizer_config.json`.
TOK_DIR = Path('tokenizer')
def encode_split(sp: spm.SentencePieceProcessor, split: str, config: dict) -> int:
    """Encode one split (train/val/test) and save it as a flat uint16 array.

    Reads ``DATA_DIR/<split>.txt``, tokenizes the whole text with ``sp`` and
    writes the token ids to ``DATA_DIR/<split>_tokens.npy``. The ``.npy``
    file can later be opened lazily with ``np.load(path, mmap_mode='r')``.

    Args:
        sp: Loaded SentencePiece processor used to encode the text.
        split: Split name; selects both the input and the output file name.
        config: Tokenizer config dict. Not read by this function; kept so
            existing callers keep working.

    Returns:
        The number of tokens written, or 0 if the split's text file does
        not exist (the split is skipped with a message).

    Raises:
        ValueError: If any token id does not fit in uint16.
    """
    text_path = DATA_DIR / f'{split}.txt'
    if not text_path.exists():
        print(f' {split}.txt not found, skipping.')
        return 0

    text = text_path.read_text(encoding='utf-8')
    print(f' Encoding {split} ({len(text):,} chars)...', end=' ', flush=True)

    # Encode the full text in one call.
    token_ids = sp.encode(text, out_type=int)
    n_tokens = len(token_ids)

    # Stage the ids in a wide integer type first so the range can be checked
    # explicitly: narrowing straight to uint16 either wraps silently or raises
    # an opaque OverflowError, depending on the NumPy version.
    wide = np.asarray(token_ids, dtype=np.int64)
    uint16_max = np.iinfo(np.uint16).max
    if wide.size and (wide.min() < 0 or wide.max() > uint16_max):
        raise ValueError(
            f'Token ids for split {split!r} span {int(wide.min())}..'
            f'{int(wide.max())}, which does not fit in uint16 '
            f'(0..{uint16_max}).'
        )
    arr = wide.astype(np.uint16)

    # Write exactly one file per split. np.save appends ".npy" to names that
    # lack it, so saving to "<split>.bin" would only leave behind a redundant
    # "<split>.bin.npy" copy of the same data.
    out_path = DATA_DIR / f'{split}_tokens.npy'
    np.save(str(out_path), arr)
    print(f'{n_tokens:,} tokens → {out_path}')
    return n_tokens
def main():
    """Encode every corpus split and record the token counts.

    Loads ``tokenizer_config.json`` from ``TOK_DIR``, loads the SentencePiece
    model named by its ``model_path`` entry, encodes the train/val/test
    splits via ``encode_split``, stores the per-split token counts back into
    the config file under ``encoding_stats`` and prints a summary table.

    Raises:
        FileNotFoundError: If the tokenizer config file does not exist.
    """
    cfg_file = TOK_DIR / 'tokenizer_config.json'
    if not cfg_file.exists():
        raise FileNotFoundError(
            'tokenizer_config.json not found. Run s02_train_tokenizer.py first.'
        )
    config = json.loads(cfg_file.read_text(encoding='utf-8'))

    # Bring up the tokenizer described by the config.
    sp = spm.SentencePieceProcessor()
    sp.load(config['model_path'])
    print(f'Loaded tokenizer: vocab_size={config["vocab_size"]}')

    # One token count per split, in train/val/test order.
    print('\nEncoding splits:')
    token_counts = {
        name: encode_split(sp, name, config)
        for name in ('train', 'val', 'test')
    }

    # Persist the counts next to the tokenizer settings.
    config['encoding_stats'] = token_counts
    serialized = json.dumps(config, ensure_ascii=False, indent=2)
    cfg_file.write_text(serialized, encoding='utf-8')

    # Summary table: each split's count and its share of the total.
    total = sum(token_counts.values())
    print(f'\n{"=" * 55}')
    for split, n in token_counts.items():
        if total:
            pct = 100 * n / total
        else:
            pct = 0
        print(f' {split:5s}: {n:>9,} tokens ({pct:.1f}%)')
    print(f' {"total":5s}: {total:>9,} tokens')
    print('\nToken files: data/train_tokens.npy, val_tokens.npy, test_tokens.npy')
    print('\nNext step: python s04_train_model.py')
# Run the encoding pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|