# gpt2-from-scratch / tokenizer.py
# Commit f7ba1e0 (GPUburnout): Add multi-model GPT-2 demo with Tiny, Medium, and GPT-2 Small
"""
Tokenizer for Language Model
Converts text to numbers (tokens) and back
"""
import json
import os
class CharacterTokenizer:
    """Simple character-level tokenizer for tiny language models.

    Assigns each unique character of the training text an integer ID
    (sorted order, so IDs are deterministic for a given character set).
    Characters/IDs outside the vocabulary are silently dropped by
    encode()/decode().
    """

    def __init__(self):
        """Initialize an empty tokenizer; call build_vocab*() or load() before use."""
        self.char_to_idx = {}  # char -> int token ID
        self.idx_to_char = {}  # int token ID -> char
        self.vocab_size = 0

    def build_vocab(self, text):
        """Build vocabulary from in-memory text.

        Args:
            text: Full training text as a single string.

        Returns:
            The vocabulary size (number of unique characters).
        """
        print("\nBuilding character vocabulary...")
        # Get unique characters and sort them so IDs are deterministic
        chars = sorted(set(text))
        self.vocab_size = len(chars)
        # Create forward and reverse mappings
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
        print(f"Vocabulary size: {self.vocab_size} characters")
        print(f"Characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))
        return self.vocab_size

    def build_vocab_from_file(self, filepath, chunk_size=100*1024*1024):
        """Build vocabulary from a large file using streaming (memory-efficient).

        Args:
            filepath: Path to text file
            chunk_size: Size of chunks to read, in characters (default: 100M)

        Returns:
            The vocabulary size (number of unique characters).
        """
        print(f"\nBuilding character vocabulary from file: {filepath}")
        print(f"Chunk size: {chunk_size / (1024*1024):.0f}MB")
        # Get file size (bytes) for the progress estimate below
        file_size = os.path.getsize(filepath)
        file_size_gb = file_size / (1024**3)
        print(f"File size: {file_size_gb:.2f} GB")
        # Collect unique characters by reading the file in chunks so the
        # whole text is never held in memory at once
        unique_chars = set()
        total_read = 0
        with open(filepath, 'r', encoding='utf-8') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                # Add unique characters from this chunk
                unique_chars.update(chunk)
                total_read += len(chunk)
                # Progress is approximate: total_read counts characters while
                # file_size is bytes; /1.5 is a rough bytes-per-char guess
                progress_pct = (total_read / (file_size / 1.5)) * 100
                if progress_pct <= 100:
                    print(f"  Progress: {progress_pct:.1f}% | Unique chars found: {len(unique_chars)}", end='\r')
        print()  # New line after progress
        # Sort characters so IDs are deterministic, then build mappings
        chars = sorted(unique_chars)
        self.vocab_size = len(chars)
        # Create forward and reverse mappings
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
        print(f"\nVocabulary size: {self.vocab_size} characters")
        print(f"Sample characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))
        return self.vocab_size

    def encode(self, text):
        """Convert text to a list of token IDs; unknown characters are dropped."""
        return [self.char_to_idx[ch] for ch in text if ch in self.char_to_idx]

    def decode(self, tokens):
        """Convert a list of token IDs back to text; unknown IDs are dropped."""
        return ''.join([self.idx_to_char[idx] for idx in tokens if idx in self.idx_to_char])

    def save(self, filepath='models/tokenizer.json'):
        """Save tokenizer to a JSON file.

        Args:
            filepath: Destination path; parent directories are created.

        Returns:
            The path the tokenizer was written to.
        """
        # Only create a directory when the path actually contains one:
        # os.makedirs('') raises FileNotFoundError for bare filenames.
        dirname = os.path.dirname(filepath)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        tokenizer_data = {
            'type': 'character',
            'vocab_size': self.vocab_size,
            'char_to_idx': self.char_to_idx,
            # JSON object keys must be strings, so stringify the int IDs
            'idx_to_char': {str(k): v for k, v in self.idx_to_char.items()}
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_data, f, indent=2, ensure_ascii=False)
        print(f"\nTokenizer saved to: {filepath}")
        return filepath

    def load(self, filepath='models/tokenizer.json'):
        """Load tokenizer state from a JSON file written by save(); returns self."""
        with open(filepath, 'r', encoding='utf-8') as f:
            tokenizer_data = json.load(f)
        self.vocab_size = tokenizer_data['vocab_size']
        self.char_to_idx = tokenizer_data['char_to_idx']
        # Undo the str() applied to keys in save()
        self.idx_to_char = {int(k): v for k, v in tokenizer_data['idx_to_char'].items()}
        print(f"\nTokenizer loaded from: {filepath}")
        print(f"Vocabulary size: {self.vocab_size}")
        return self

    def get_stats(self):
        """Print tokenizer statistics to stdout."""
        print("\n" + "="*80)
        print("TOKENIZER STATISTICS")
        print("="*80)
        print(f"Type: Character-level")
        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Sample characters: {list(self.char_to_idx.keys())[:20]}")
        print("="*80)
def main():
    """Build a character tokenizer from the Shakespeare corpus and smoke-test it."""
    rule = "=" * 80
    print("\n" + rule)
    print("TOKENIZER BUILDER")
    print(rule)
    # Locate the training corpus; bail out early if it was never downloaded.
    dataset_file = 'data/tiny_shakespeare.txt'
    if not os.path.exists(dataset_file):
        print(f"\nError: Dataset not found at {dataset_file}")
        print("Please run dataset_loader.py first.")
        return
    print(f"\nLoading text from: {dataset_file}")
    with open(dataset_file, 'r', encoding='utf-8') as f:
        corpus = f.read()
    print(f"Loaded {len(corpus):,} characters")
    # Fit the vocabulary on the full corpus
    tokenizer = CharacterTokenizer()
    tokenizer.build_vocab(corpus)
    # Round-trip a short ASCII probe string
    print("\n" + rule)
    print("TESTING TOKENIZER")
    print(rule)
    probe = "Hello, World!"
    print(f"\nOriginal text: {probe}")
    token_ids = tokenizer.encode(probe)
    print(f"Encoded: {token_ids}")
    round_trip = tokenizer.decode(token_ids)
    print(f"Decoded: {round_trip}")
    print("Test passed!" if probe == round_trip else "Test failed!")
    # Round-trip a slice of the actual corpus
    sample = corpus[:100]
    print(f"\nShakespeare sample: {sample}")
    sample_ids = tokenizer.encode(sample)
    print(f"Encoded (first 20 tokens): {sample_ids[:20]}")
    assert sample == tokenizer.decode(sample_ids), "Encoding/decoding mismatch!"
    print("Shakespeare encoding test passed!")
    # Report statistics and persist to the default location
    tokenizer.get_stats()
    tokenizer.save()
    print("\n" + rule)
    print("TOKENIZER BUILD COMPLETE")
    print(rule)
    print(f"\nTokenizer ready for model training!")
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    print(f"Saved to: models/tokenizer.json")
    print(f"\nNext step: Build the model architecture")
    print(rule + "\n")
# Run the build/test pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()