# gpt2-from-scratch / tokenizer.py
# Commit f7ba1e0 (GPUburnout): Add multi-model GPT-2 demo with Tiny, Medium, and GPT-2 Small
"""
Tokenizer for Language Model
Converts text to numbers (tokens) and back
"""
import json
import os
class CharacterTokenizer:
    """Simple character-level tokenizer for tiny language models.

    Assigns each unique character of the training text an integer ID
    (sorted order, so IDs are deterministic for a given character set).
    Characters/IDs outside the vocabulary are silently dropped by
    encode()/decode().
    """

    def __init__(self):
        """Initialize an empty tokenizer; call build_vocab*() or load() before use."""
        self.char_to_idx = {}  # char -> int token ID
        self.idx_to_char = {}  # int token ID -> char
        self.vocab_size = 0

    def build_vocab(self, text):
        """Build vocabulary from in-memory text.

        Args:
            text: Full training text as a single string.

        Returns:
            The vocabulary size (number of unique characters).
        """
        print("\nBuilding character vocabulary...")
        # Get unique characters and sort them so IDs are deterministic
        chars = sorted(set(text))
        self.vocab_size = len(chars)
        # Create forward and reverse mappings
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
        print(f"Vocabulary size: {self.vocab_size} characters")
        print(f"Characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))
        return self.vocab_size

    def build_vocab_from_file(self, filepath, chunk_size=100*1024*1024):
        """Build vocabulary from a large file using streaming (memory-efficient).

        Args:
            filepath: Path to text file
            chunk_size: Size of chunks to read, in characters (default: 100M)

        Returns:
            The vocabulary size (number of unique characters).
        """
        print(f"\nBuilding character vocabulary from file: {filepath}")
        print(f"Chunk size: {chunk_size / (1024*1024):.0f}MB")
        # Get file size (bytes) for the progress estimate below
        file_size = os.path.getsize(filepath)
        file_size_gb = file_size / (1024**3)
        print(f"File size: {file_size_gb:.2f} GB")
        # Collect unique characters by reading the file in chunks so the
        # whole text is never held in memory at once
        unique_chars = set()
        total_read = 0
        with open(filepath, 'r', encoding='utf-8') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                # Add unique characters from this chunk
                unique_chars.update(chunk)
                total_read += len(chunk)
                # Progress is approximate: total_read counts characters while
                # file_size is bytes; /1.5 is a rough bytes-per-char guess
                progress_pct = (total_read / (file_size / 1.5)) * 100
                if progress_pct <= 100:
                    print(f"  Progress: {progress_pct:.1f}% | Unique chars found: {len(unique_chars)}", end='\r')
        print()  # New line after progress
        # Sort characters so IDs are deterministic, then build mappings
        chars = sorted(unique_chars)
        self.vocab_size = len(chars)
        # Create forward and reverse mappings
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
        print(f"\nVocabulary size: {self.vocab_size} characters")
        print(f"Sample characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))
        return self.vocab_size

    def encode(self, text):
        """Convert text to a list of token IDs; unknown characters are dropped."""
        return [self.char_to_idx[ch] for ch in text if ch in self.char_to_idx]

    def decode(self, tokens):
        """Convert a list of token IDs back to text; unknown IDs are dropped."""
        return ''.join([self.idx_to_char[idx] for idx in tokens if idx in self.idx_to_char])

    def save(self, filepath='models/tokenizer.json'):
        """Save tokenizer to a JSON file.

        Args:
            filepath: Destination path; parent directories are created.

        Returns:
            The path the tokenizer was written to.
        """
        # Only create a directory when the path actually contains one:
        # os.makedirs('') raises FileNotFoundError for bare filenames.
        dirname = os.path.dirname(filepath)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        tokenizer_data = {
            'type': 'character',
            'vocab_size': self.vocab_size,
            'char_to_idx': self.char_to_idx,
            # JSON object keys must be strings, so stringify the int IDs
            'idx_to_char': {str(k): v for k, v in self.idx_to_char.items()}
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_data, f, indent=2, ensure_ascii=False)
        print(f"\nTokenizer saved to: {filepath}")
        return filepath

    def load(self, filepath='models/tokenizer.json'):
        """Load tokenizer state from a JSON file written by save(); returns self."""
        with open(filepath, 'r', encoding='utf-8') as f:
            tokenizer_data = json.load(f)
        self.vocab_size = tokenizer_data['vocab_size']
        self.char_to_idx = tokenizer_data['char_to_idx']
        # Undo the str() applied to keys in save()
        self.idx_to_char = {int(k): v for k, v in tokenizer_data['idx_to_char'].items()}
        print(f"\nTokenizer loaded from: {filepath}")
        print(f"Vocabulary size: {self.vocab_size}")
        return self

    def get_stats(self):
        """Print tokenizer statistics to stdout."""
        print("\n" + "="*80)
        print("TOKENIZER STATISTICS")
        print("="*80)
        print(f"Type: Character-level")
        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Sample characters: {list(self.char_to_idx.keys())[:20]}")
        print("="*80)
def main():
    """Build a character tokenizer from the Shakespeare corpus and smoke-test it."""
    rule = "=" * 80
    print("\n" + rule)
    print("TOKENIZER BUILDER")
    print(rule)
    # Locate the training corpus; bail out early if it was never downloaded.
    dataset_file = 'data/tiny_shakespeare.txt'
    if not os.path.exists(dataset_file):
        print(f"\nError: Dataset not found at {dataset_file}")
        print("Please run dataset_loader.py first.")
        return
    print(f"\nLoading text from: {dataset_file}")
    with open(dataset_file, 'r', encoding='utf-8') as f:
        corpus = f.read()
    print(f"Loaded {len(corpus):,} characters")
    # Fit the vocabulary on the full corpus
    tokenizer = CharacterTokenizer()
    tokenizer.build_vocab(corpus)
    # Round-trip a short ASCII probe string
    print("\n" + rule)
    print("TESTING TOKENIZER")
    print(rule)
    probe = "Hello, World!"
    print(f"\nOriginal text: {probe}")
    token_ids = tokenizer.encode(probe)
    print(f"Encoded: {token_ids}")
    round_trip = tokenizer.decode(token_ids)
    print(f"Decoded: {round_trip}")
    print("Test passed!" if probe == round_trip else "Test failed!")
    # Round-trip a slice of the actual corpus
    sample = corpus[:100]
    print(f"\nShakespeare sample: {sample}")
    sample_ids = tokenizer.encode(sample)
    print(f"Encoded (first 20 tokens): {sample_ids[:20]}")
    assert sample == tokenizer.decode(sample_ids), "Encoding/decoding mismatch!"
    print("Shakespeare encoding test passed!")
    # Report statistics and persist to the default location
    tokenizer.get_stats()
    tokenizer.save()
    print("\n" + rule)
    print("TOKENIZER BUILD COMPLETE")
    print(rule)
    print(f"\nTokenizer ready for model training!")
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    print(f"Saved to: models/tokenizer.json")
    print(f"\nNext step: Build the model architecture")
    print(rule + "\n")
# Run the build/test pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()