| """ | |
| Tokenizer for Language Model | |
| Converts text to numbers (tokens) and back | |
| """ | |
| import json | |
| import os | |
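
# Typical workflow (a sketch; the downstream training code is assumed, not
# defined in this file):
#
#   tok = CharacterTokenizer()
#   tok.build_vocab(text)                  # or tok.build_vocab_from_file(path)
#   ids = tok.encode("Hello, World!")      # text -> list of token IDs
#   text_back = tok.decode(ids)            # token IDs -> text
#   tok.save('models/tokenizer.json')      # persist alongside the trained model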

class CharacterTokenizer:
    """Simple character-level tokenizer for tiny language models"""

    def __init__(self):
        """Initialize tokenizer"""
        self.char_to_idx = {}
        self.idx_to_char = {}
        self.vocab_size = 0

    def build_vocab(self, text):
        """Build vocabulary from text"""
        print("\nBuilding character vocabulary...")

        # Get unique characters and sort them
        chars = sorted(set(text))
        self.vocab_size = len(chars)

        # Create mappings
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}

        print(f"Vocabulary size: {self.vocab_size} characters")
        print(f"Characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))

        return self.vocab_size
    def build_vocab_from_file(self, filepath, chunk_size=100*1024*1024):
        """Build vocabulary from a large file using streaming (memory-efficient)

        Args:
            filepath: Path to text file
            chunk_size: Number of characters to read per chunk
                (default: ~100M characters, roughly 100MB for ASCII text)
        """
        print(f"\nBuilding character vocabulary from file: {filepath}")
        print(f"Chunk size: {chunk_size / (1024*1024):.0f}M characters")

        # Get file size
        file_size = os.path.getsize(filepath)
        file_size_gb = file_size / (1024**3)
        print(f"File size: {file_size_gb:.2f} GB")

        # Collect unique characters by reading file in chunks
        unique_chars = set()
        total_read = 0

        with open(filepath, 'r', encoding='utf-8') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break

                # Add unique characters from this chunk
                unique_chars.update(chunk)
                total_read += len(chunk)
                # Progress update: total_read counts characters while file_size
                # is in bytes, so this is an approximation (exact for ASCII text,
                # an underestimate for multi-byte UTF-8 characters)
                progress_pct = min(total_read / file_size, 1.0) * 100
                print(f"  Progress: {progress_pct:.1f}% | Unique chars found: {len(unique_chars)}", end='\r')

        print()  # New line after progress

        # Sort characters and build mappings
        chars = sorted(unique_chars)
        self.vocab_size = len(chars)

        # Create mappings
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}

        print(f"\nVocabulary size: {self.vocab_size} characters")
        print(f"Sample characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))

        return self.vocab_size
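
    # Example use of build_vocab_from_file above (hypothetical path): for a
    # corpus too large to hold in memory,
    #   tok.build_vocab_from_file('data/large_corpus.txt', chunk_size=50*1024*1024)
    # streams the file chunk by chunk and accumulates the character set, instead
    # of calling build_vocab() on the full text.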

    def encode(self, text):
        """Convert text to list of token IDs"""
        return [self.char_to_idx[ch] for ch in text if ch in self.char_to_idx]

    def decode(self, tokens):
        """Convert list of token IDs back to text"""
        return ''.join([self.idx_to_char[idx] for idx in tokens if idx in self.idx_to_char])
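
    # Example round trip (hypothetical 2-character vocabulary where
    # char_to_idx == {'a': 0, 'b': 1}): encode("abba") -> [0, 1, 1, 0] and
    # decode([0, 1, 1, 0]) -> "abba". Characters or IDs outside the vocabulary
    # are silently skipped rather than raising a KeyError.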

    def save(self, filepath='models/tokenizer.json'):
        """Save tokenizer to JSON file"""
        # Create the target directory if the path includes one
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)

        tokenizer_data = {
            'type': 'character',
            'vocab_size': self.vocab_size,
            'char_to_idx': self.char_to_idx,
            # JSON object keys must be strings, so integer IDs are stringified here
            'idx_to_char': {str(k): v for k, v in self.idx_to_char.items()}
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_data, f, indent=2, ensure_ascii=False)

        print(f"\nTokenizer saved to: {filepath}")
        return filepath
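
    # The saved JSON looks roughly like this (illustrative, tiny vocabulary):
    # {
    #   "type": "character",
    #   "vocab_size": 2,
    #   "char_to_idx": {"a": 0, "b": 1},
    #   "idx_to_char": {"0": "a", "1": "b"}
    # }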

    def load(self, filepath='models/tokenizer.json'):
        """Load tokenizer from JSON file"""
        with open(filepath, 'r', encoding='utf-8') as f:
            tokenizer_data = json.load(f)

        self.vocab_size = tokenizer_data['vocab_size']
        self.char_to_idx = tokenizer_data['char_to_idx']
        self.idx_to_char = {int(k): v for k, v in tokenizer_data['idx_to_char'].items()}

        print(f"\nTokenizer loaded from: {filepath}")
        print(f"Vocabulary size: {self.vocab_size}")
        return self

    def get_stats(self):
        """Print tokenizer statistics"""
        print("\n" + "="*80)
        print("TOKENIZER STATISTICS")
        print("="*80)
        print("Type: Character-level")
        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Sample characters: {list(self.char_to_idx.keys())[:20]}")
        print("="*80)


def main():
    """Main function to build and test tokenizer"""
    print("\n" + "="*80)
    print("TOKENIZER BUILDER")
    print("="*80)

    # Load dataset
    dataset_file = 'data/tiny_shakespeare.txt'
    if not os.path.exists(dataset_file):
        print(f"\nError: Dataset not found at {dataset_file}")
        print("Please run dataset_loader.py first.")
        return

    print(f"\nLoading text from: {dataset_file}")
    with open(dataset_file, 'r', encoding='utf-8') as f:
        text = f.read()
    print(f"Loaded {len(text):,} characters")

    # Build tokenizer
    tokenizer = CharacterTokenizer()
    tokenizer.build_vocab(text)

    # Test tokenizer
    print("\n" + "="*80)
    print("TESTING TOKENIZER")
    print("="*80)

    test_text = "Hello, World!"
    print(f"\nOriginal text: {test_text}")

    encoded = tokenizer.encode(test_text)
    print(f"Encoded: {encoded}")

    decoded = tokenizer.decode(encoded)
    print(f"Decoded: {decoded}")

    if test_text == decoded:
        print("Test passed!")
    else:
        print("Test failed!")

    # Test with Shakespeare sample
    shakespeare_sample = text[:100]
    print(f"\nShakespeare sample: {shakespeare_sample}")

    encoded_sample = tokenizer.encode(shakespeare_sample)
    print(f"Encoded (first 20 tokens): {encoded_sample[:20]}")

    decoded_sample = tokenizer.decode(encoded_sample)
    assert shakespeare_sample == decoded_sample, "Encoding/decoding mismatch!"
    print("Shakespeare encoding test passed!")

    # Show statistics
    tokenizer.get_stats()

    # Save tokenizer
    tokenizer.save()

    print("\n" + "="*80)
    print("TOKENIZER BUILD COMPLETE")
    print("="*80)
    print("\nTokenizer ready for model training!")
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    print("Saved to: models/tokenizer.json")
    print("\nNext step: Build the model architecture")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()