barunsaha
/

banalata

Model card Files Files and versions

banalata / s03_encode_data.py

barunsaha's picture

Upload s03_encode_data.py with huggingface_hub

5230c6c verified about 1 month ago

history blame contribute delete

3.16 kB

	"""Step 3: Encode Corpus to Token Arrays
	======================================
	Tokenizes train/val/test splits and saves them as NumPy .npy arrays that can be memory-mapped when loaded.

	Why memmap?
	- A 35MB corpus encodes to ~8-12M tokens (at ~3 chars/token for Bengali BPE)
	- Storing as np.uint16 costs ~16-24MB RAM — trivial
	- memmap means PyTorch DataLoader can read random windows WITHOUT loading
	the whole array into RAM — important if the corpus is later scaled up

	Usage:
	python s03_encode_data.py
	"""

	import json
	from pathlib import Path

	import numpy as np
	import sentencepiece as spm

	# ---------------------------------------------------------------------------
	# Both paths are relative, so they resolve against the current working
	# directory at run time.
	# DATA_DIR: where the per-split '<split>.txt' files are read from and the
	# encoded token arrays are written to.
	DATA_DIR = Path('../../data')
	# TOK_DIR: where 'tokenizer_config.json' is looked up.
	TOK_DIR = Path('tokenizer')


	def encode_split(sp: spm.SentencePieceProcessor, split: str, config: dict) -> int:
	    """Encode one split (train/val/test) into a flat uint16 token array on disk.

	    Reads ``DATA_DIR/<split>.txt``, tokenizes the whole text with ``sp`` and
	    writes the token ids to ``DATA_DIR/<split>_tokens.npy`` with ``np.save``.

	    Args:
	        sp: SentencePiece processor used to encode the text.
	        split: Split name; selects both the input and the output file name.
	        config: Tokenizer config. Not used by this function; kept so the
	            call signature stays unchanged.

	    Returns:
	        The number of tokens written, or 0 when the split's text file does
	        not exist (in which case nothing is written).

	    Raises:
	        ValueError: If any token id does not fit into uint16.
	    """
	    text_path = DATA_DIR / f'{split}.txt'
	    if not text_path.exists():
	        print(f' {split}.txt not found, skipping.')
	        return 0

	    text = text_path.read_text(encoding='utf-8')
	    print(f' Encoding {split} ({len(text):,} chars)...', end=' ', flush=True)

	    # Encode the full text in one call.
	    token_ids = sp.encode(text, out_type=int)
	    n_tokens = len(token_ids)

	    # Fail loudly instead of risking a silent wrap-around in the uint16 cast
	    # if the tokenizer ever produces ids beyond the dtype's range.
	    uint16_max = int(np.iinfo(np.uint16).max)
	    largest_id = max(token_ids, default=0)
	    if largest_id > uint16_max:
	        raise ValueError(
	            f'Token id {largest_id} in split {split!r} does not fit in uint16 '
	            f'(max {uint16_max}).'
	        )

	    arr = np.array(token_ids, dtype=np.uint16)

	    # Write exactly one file per split. A previous extra save to
	    # '<split>.bin' left a stray '<split>.bin.npy' copy next to this file.
	    out_path = DATA_DIR / f'{split}_tokens.npy'
	    np.save(out_path, arr)

	    print(f'{n_tokens:,} tokens → {out_path}')
	    return n_tokens


	def main():
	    """Encode every corpus split and record the resulting token counts.

	    Loads ``tokenizer_config.json`` from ``TOK_DIR``, loads the SentencePiece
	    model named by its ``model_path`` entry, runs ``encode_split`` for the
	    train, val and test splits, stores the per-split counts under the
	    ``encoding_stats`` key of the same config file, and prints a summary.

	    Raises:
	        FileNotFoundError: If the tokenizer config file does not exist.
	    """
	    # Read the tokenizer configuration.
	    cfg_file = TOK_DIR / 'tokenizer_config.json'
	    if not cfg_file.exists():
	        raise FileNotFoundError(
	            'tokenizer_config.json not found. Run s02_train_tokenizer.py first.'
	        )
	    cfg = json.loads(cfg_file.read_text(encoding='utf-8'))

	    # Bring up the tokenizer described by the config.
	    tokenizer = spm.SentencePieceProcessor()
	    tokenizer.load(cfg['model_path'])
	    print(f'Loaded tokenizer: vocab_size={cfg["vocab_size"]}')

	    # Encode each split in order, collecting its token count.
	    print('\nEncoding splits:')
	    counts = {
	        name: encode_split(tokenizer, name, cfg)
	        for name in ('train', 'val', 'test')
	    }

	    # Persist the counts in the tokenizer config file.
	    cfg['encoding_stats'] = counts
	    cfg_file.write_text(
	        json.dumps(cfg, ensure_ascii=False, indent=2),
	        encoding='utf-8',
	    )

	    # Per-split and overall summary.
	    print(f'\n{"=" * 55}')
	    grand_total = sum(counts.values())
	    for name, count in counts.items():
	        # Avoid dividing by zero when no split produced any tokens.
	        share = 100 * count / grand_total if grand_total else 0
	        print(f' {name:5s}: {count:>9,} tokens ({share:.1f}%)')
	    print(f' {"total":5s}: {grand_total:>9,} tokens')
	    print('\nToken files: data/train_tokens.npy, val_tokens.npy, test_tokens.npy')
	    print('\nNext step: python s04_train_model.py')


	# Script entry point: run the encoding step only on direct execution,
	# not when this module is imported.
	if __name__ == '__main__':
	    main()