""" Tokenizer for Language Model Converts text to numbers (tokens) and back """ import json import os class CharacterTokenizer: """Simple character-level tokenizer for tiny language models""" def __init__(self): """Initialize tokenizer""" self.char_to_idx = {} self.idx_to_char = {} self.vocab_size = 0 def build_vocab(self, text): """Build vocabulary from text""" print("\nBuilding character vocabulary...") # Get unique characters and sort them chars = sorted(set(text)) self.vocab_size = len(chars) # Create mappings self.char_to_idx = {ch: i for i, ch in enumerate(chars)} self.idx_to_char = {i: ch for i, ch in enumerate(chars)} print(f"Vocabulary size: {self.vocab_size} characters") print(f"Characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else "")) return self.vocab_size def build_vocab_from_file(self, filepath, chunk_size=100*1024*1024): """Build vocabulary from a large file using streaming (memory-efficient) Args: filepath: Path to text file chunk_size: Size of chunks to read (default: 100MB) """ print(f"\nBuilding character vocabulary from file: {filepath}") print(f"Chunk size: {chunk_size / (1024*1024):.0f}MB") # Get file size file_size = os.path.getsize(filepath) file_size_gb = file_size / (1024**3) print(f"File size: {file_size_gb:.2f} GB") # Collect unique characters by reading file in chunks unique_chars = set() total_read = 0 with open(filepath, 'r', encoding='utf-8') as f: while True: chunk = f.read(chunk_size) if not chunk: break # Add unique characters from this chunk unique_chars.update(chunk) total_read += len(chunk) # Progress update (calculate based on character count) progress_pct = (total_read / (file_size / 1.5)) * 100 # Approximate chars from bytes if progress_pct <= 100: print(f" Progress: {progress_pct:.1f}% | Unique chars found: {len(unique_chars)}", end='\r') print() # New line after progress # Sort characters and build mappings chars = sorted(unique_chars) self.vocab_size = len(chars) # Create mappings self.char_to_idx = {ch: i for i, ch in enumerate(chars)} self.idx_to_char = {i: ch for i, ch in enumerate(chars)} print(f"\nVocabulary size: {self.vocab_size} characters") print(f"Sample characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else "")) return self.vocab_size def encode(self, text): """Convert text to list of token IDs""" return [self.char_to_idx[ch] for ch in text if ch in self.char_to_idx] def decode(self, tokens): """Convert list of token IDs back to text""" return ''.join([self.idx_to_char[idx] for idx in tokens if idx in self.idx_to_char]) def save(self, filepath='models/tokenizer.json'): """Save tokenizer to JSON file""" os.makedirs(os.path.dirname(filepath), exist_ok=True) tokenizer_data = { 'type': 'character', 'vocab_size': self.vocab_size, 'char_to_idx': self.char_to_idx, 'idx_to_char': {str(k): v for k, v in self.idx_to_char.items()} } with open(filepath, 'w', encoding='utf-8') as f: json.dump(tokenizer_data, f, indent=2, ensure_ascii=False) print(f"\nTokenizer saved to: {filepath}") return filepath def load(self, filepath='models/tokenizer.json'): """Load tokenizer from JSON file""" with open(filepath, 'r', encoding='utf-8') as f: tokenizer_data = json.load(f) self.vocab_size = tokenizer_data['vocab_size'] self.char_to_idx = tokenizer_data['char_to_idx'] self.idx_to_char = {int(k): v for k, v in tokenizer_data['idx_to_char'].items()} print(f"\nTokenizer loaded from: {filepath}") print(f"Vocabulary size: {self.vocab_size}") return self def get_stats(self): """Print tokenizer statistics""" print("\n" + "="*80) print("TOKENIZER STATISTICS") print("="*80) print(f"Type: Character-level") print(f"Vocabulary size: {self.vocab_size}") print(f"Sample characters: {list(self.char_to_idx.keys())[:20]}") print("="*80) def main(): """Main function to build and test tokenizer""" print("\n" + "="*80) print("TOKENIZER BUILDER") print("="*80) # Load dataset dataset_file = 'data/tiny_shakespeare.txt' if not os.path.exists(dataset_file): print(f"\nError: Dataset not found at {dataset_file}") print("Please run dataset_loader.py first.") return print(f"\nLoading text from: {dataset_file}") with open(dataset_file, 'r', encoding='utf-8') as f: text = f.read() print(f"Loaded {len(text):,} characters") # Build tokenizer tokenizer = CharacterTokenizer() tokenizer.build_vocab(text) # Test tokenizer print("\n" + "="*80) print("TESTING TOKENIZER") print("="*80) test_text = "Hello, World!" print(f"\nOriginal text: {test_text}") encoded = tokenizer.encode(test_text) print(f"Encoded: {encoded}") decoded = tokenizer.decode(encoded) print(f"Decoded: {decoded}") if test_text == decoded: print("Test passed!") else: print("Test failed!") # Test with Shakespeare sample shakespeare_sample = text[:100] print(f"\nShakespeare sample: {shakespeare_sample}") encoded_sample = tokenizer.encode(shakespeare_sample) print(f"Encoded (first 20 tokens): {encoded_sample[:20]}") decoded_sample = tokenizer.decode(encoded_sample) assert shakespeare_sample == decoded_sample, "Encoding/decoding mismatch!" print("Shakespeare encoding test passed!") # Show statistics tokenizer.get_stats() # Save tokenizer tokenizer.save() print("\n" + "="*80) print("TOKENIZER BUILD COMPLETE") print("="*80) print(f"\nTokenizer ready for model training!") print(f"Vocabulary size: {tokenizer.vocab_size}") print(f"Saved to: models/tokenizer.json") print(f"\nNext step: Build the model architecture") print("="*80 + "\n") if __name__ == "__main__": main()