"""Step 3: Encode Corpus to Token Arrays
======================================
Tokenizes train/val/test splits and saves them as NumPy ``.npy`` arrays that
can be memory-mapped.

Why memmap?
- A 35MB corpus encodes to ~8-12M tokens (at ~3 chars/token for Bengali BPE)
- Storing as np.uint16 costs ~16-24MB RAM — trivial
- memmap means PyTorch DataLoader can read random windows WITHOUT loading
  the whole array into RAM — important if the corpus is later scaled up

Usage:
    python s03_encode_data.py
"""
|
|
| import json |
| from pathlib import Path |
|
|
| import numpy as np |
| import sentencepiece as spm |
|
|
| |
# Both locations are relative paths, so they resolve against the current
# working directory of the process rather than the location of this file.
DATA_DIR = Path('..', '..', 'data')  # holds <split>.txt inputs and token outputs
TOK_DIR = Path('tokenizer')  # holds tokenizer_config.json
|
|
|
|
def encode_split(sp: spm.SentencePieceProcessor, split: str, config: dict) -> int:
    """Encode one split (train/val/test) to a flat uint16 ``.npy`` array.

    Reads ``DATA_DIR/<split>.txt``, tokenizes the whole text with *sp*, and
    writes the token ids to ``DATA_DIR/<split>_tokens.npy``. The file is a
    regular ``.npy`` array; consumers that want memory-mapped access can open
    it with ``np.load(path, mmap_mode='r')``.

    Args:
        sp: Loaded SentencePiece processor used to encode the text.
        split: Split name; selects the input and output file names.
        config: Tokenizer config dict. Not read here; kept so the call
            signature stays compatible with existing callers.

    Returns:
        The number of tokens written, or 0 if the split's text file is
        missing (in which case nothing is written).

    Raises:
        ValueError: If any token id falls outside the uint16 range, which
            would otherwise corrupt the stored ids.
    """
    text_path = DATA_DIR / f'{split}.txt'
    if not text_path.exists():
        print(f' {split}.txt not found, skipping.')
        return 0

    text = text_path.read_text(encoding='utf-8')
    print(f' Encoding {split} ({len(text):,} chars)...', end=' ', flush=True)

    token_ids = sp.encode(text, out_type=int)
    n_tokens = len(token_ids)

    # Validate in a wide integer type first: casting straight to uint16 would
    # either wrap silently or fail with an opaque OverflowError, depending on
    # the NumPy version, when the vocabulary exceeds 65536 entries.
    ids = np.asarray(token_ids, dtype=np.int64)
    uint16_max = int(np.iinfo(np.uint16).max)
    if ids.size and (ids.min() < 0 or ids.max() > uint16_max):
        raise ValueError(
            f'Token ids for split {split!r} span '
            f'[{int(ids.min())}, {int(ids.max())}], which does not fit in '
            f'uint16 (0..{uint16_max}).'
        )
    arr = ids.astype(np.uint16)

    # Single output file. np.save would append ".npy" to any other suffix,
    # so the name already carries it.
    out_path = DATA_DIR / f'{split}_tokens.npy'
    np.save(out_path, arr)

    print(f'{n_tokens:,} tokens -> {out_path}')
    return n_tokens
|
|
|
|
def main():
    """Encode every corpus split and record the resulting token counts.

    Reads ``tokenizer_config.json`` from ``TOK_DIR``, loads the SentencePiece
    model named by its ``model_path`` entry, runs ``encode_split`` for the
    train, val and test splits, stores the per-split token counts back into
    the config file under ``encoding_stats``, and prints a summary.

    Raises:
        FileNotFoundError: If ``tokenizer_config.json`` does not exist.
    """
    config_path = TOK_DIR / 'tokenizer_config.json'
    if not config_path.exists():
        raise FileNotFoundError(
            'tokenizer_config.json not found. Run s02_train_tokenizer.py first.'
        )
    raw_config = config_path.read_text(encoding='utf-8')
    config = json.loads(raw_config)

    # Load the trained tokenizer model referenced by the config.
    sp = spm.SentencePieceProcessor()
    sp.load(config['model_path'])
    print(f'Loaded tokenizer: vocab_size={config["vocab_size"]}')

    # Encode the splits in a fixed order; a missing split is recorded as 0.
    print('\nEncoding splits:')
    stats = {
        split: encode_split(sp, split, config)
        for split in ('train', 'val', 'test')
    }

    # Persist the counts alongside the rest of the tokenizer config.
    config['encoding_stats'] = stats
    serialized = json.dumps(config, ensure_ascii=False, indent=2)
    config_path.write_text(serialized, encoding='utf-8')

    # Summary table: per-split count and share of the total.
    print(f'\n{"=" * 55}')
    total = sum(stats.values())
    for split, n in stats.items():
        if total:
            pct = 100 * n / total
        else:
            pct = 0
        print(f' {split:5s}: {n:>9,} tokens ({pct:.1f}%)')
    print(f' {"total":5s}: {total:>9,} tokens')
    print('\nToken files: data/train_tokens.npy, val_tokens.npy, test_tokens.npy')
    print('\nNext step: python s04_train_model.py')
|
|
|
|
| if __name__ == '__main__': |
| main() |
|
|