"""
Tokenizer for Language Model
Converts text to numbers (tokens) and back
"""
import json
import os
class CharacterTokenizer:
    """Simple character-level tokenizer for tiny language models.

    Each distinct character of the training text is one token. Token IDs are
    the positions of the characters in sorted order, so the same set of
    characters always yields the same mapping.
    """

    def __init__(self):
        """Initialize an empty tokenizer (no vocabulary yet)."""
        self.char_to_idx = {}  # character -> token ID
        self.idx_to_char = {}  # token ID -> character
        self.vocab_size = 0

    def _set_vocab(self, chars):
        """Install the given characters as the vocabulary.

        Args:
            chars: Iterable of unique characters.

        Returns:
            The characters as a sorted list (index == token ID).
        """
        ordered = sorted(chars)
        self.vocab_size = len(ordered)
        self.char_to_idx = {ch: i for i, ch in enumerate(ordered)}
        self.idx_to_char = dict(enumerate(ordered))
        return ordered

    def build_vocab(self, text):
        """Build vocabulary from an in-memory string.

        Args:
            text: Text whose distinct characters become the vocabulary.

        Returns:
            The vocabulary size.
        """
        print("\nBuilding character vocabulary...")
        chars = self._set_vocab(set(text))
        print(f"Vocabulary size: {self.vocab_size} characters")
        print(f"Characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))
        return self.vocab_size

    def build_vocab_from_file(self, filepath, chunk_size=100*1024*1024):
        """Build vocabulary from a large file using streaming (memory-efficient).

        Args:
            filepath: Path to a UTF-8 text file.
            chunk_size: Number of characters to read per chunk. The file is
                opened in text mode, so this counts characters, not bytes
                (default: 100 * 1024 * 1024).

        Returns:
            The vocabulary size.
        """
        print(f"\nBuilding character vocabulary from file: {filepath}")
        print(f"Chunk size: {chunk_size / (1024*1024):.0f}MB")

        file_size = os.path.getsize(filepath)
        file_size_gb = file_size / (1024**3)
        print(f"File size: {file_size_gb:.2f} GB")

        # Only the set of distinct characters is kept, never the whole text.
        unique_chars = set()
        with open(filepath, 'r', encoding='utf-8') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                unique_chars.update(chunk)

                # Report progress from the byte offset of the underlying
                # binary buffer instead of guessing a bytes-per-character
                # ratio. The text layer may read slightly ahead, so clamp the
                # value to 100%.
                if file_size:
                    progress_pct = min(100.0, f.buffer.tell() / file_size * 100)
                    print(f" Progress: {progress_pct:.1f}% | Unique chars found: {len(unique_chars)}", end='\r')
        print()  # New line after the '\r'-terminated progress output

        chars = self._set_vocab(unique_chars)
        print(f"\nVocabulary size: {self.vocab_size} characters")
        print(f"Sample characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))
        return self.vocab_size

    def encode(self, text):
        """Convert text to a list of token IDs.

        Characters that are not in the vocabulary are skipped, so the result
        may be shorter than ``text``.
        """
        return [self.char_to_idx[ch] for ch in text if ch in self.char_to_idx]

    def decode(self, tokens):
        """Convert a list of token IDs back to text.

        IDs that are not in the vocabulary are skipped.
        """
        return ''.join(self.idx_to_char[idx] for idx in tokens if idx in self.idx_to_char)

    def save(self, filepath='models/tokenizer.json'):
        """Save tokenizer to a JSON file, creating parent directories if needed.

        Args:
            filepath: Destination path.

        Returns:
            ``filepath``, unchanged.
        """
        # dirname is '' for a bare filename, and os.makedirs('') raises
        # FileNotFoundError, so only create a directory when there is one.
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)

        tokenizer_data = {
            'type': 'character',
            'vocab_size': self.vocab_size,
            'char_to_idx': self.char_to_idx,
            # JSON object keys must be strings; load() converts them back.
            'idx_to_char': {str(k): v for k, v in self.idx_to_char.items()},
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_data, f, indent=2, ensure_ascii=False)
        print(f"\nTokenizer saved to: {filepath}")
        return filepath

    def load(self, filepath='models/tokenizer.json'):
        """Load tokenizer from a JSON file written by ``save``.

        Args:
            filepath: Path of the JSON file.

        Returns:
            This tokenizer, so calls can be chained.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            tokenizer_data = json.load(f)
        self.vocab_size = tokenizer_data['vocab_size']
        self.char_to_idx = tokenizer_data['char_to_idx']
        # Keys were stringified for JSON; restore the integer token IDs.
        self.idx_to_char = {int(k): v for k, v in tokenizer_data['idx_to_char'].items()}
        print(f"\nTokenizer loaded from: {filepath}")
        print(f"Vocabulary size: {self.vocab_size}")
        return self

    def get_stats(self):
        """Print tokenizer statistics."""
        print("\n" + "="*80)
        print("TOKENIZER STATISTICS")
        print("="*80)
        print("Type: Character-level")
        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Sample characters: {list(self.char_to_idx)[:20]}")
        print("="*80)
def main():
    """Build a character tokenizer from the dataset file, test it, and save it.

    Reads 'data/tiny_shakespeare.txt', builds the vocabulary, runs two
    encode/decode round-trip checks, prints statistics, and saves the
    tokenizer to its default location. Returns early with an error message
    when the dataset file does not exist.
    """
    rule = "=" * 80

    def banner(title):
        # Section header: blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    banner("TOKENIZER BUILDER")

    # Locate the dataset; bail out early when it is missing.
    dataset_file = 'data/tiny_shakespeare.txt'
    if not os.path.exists(dataset_file):
        print(f"\nError: Dataset not found at {dataset_file}")
        print("Please run dataset_loader.py first.")
        return

    print(f"\nLoading text from: {dataset_file}")
    with open(dataset_file, 'r', encoding='utf-8') as source:
        text = source.read()
    print(f"Loaded {len(text):,} characters")

    # Build the vocabulary from the full text.
    tokenizer = CharacterTokenizer()
    tokenizer.build_vocab(text)

    # Round-trip a fixed string; a mismatch is reported but not fatal.
    banner("TESTING TOKENIZER")
    test_text = "Hello, World!"
    print(f"\nOriginal text: {test_text}")
    encoded = tokenizer.encode(test_text)
    print(f"Encoded: {encoded}")
    decoded = tokenizer.decode(encoded)
    print(f"Decoded: {decoded}")
    print("Test passed!" if test_text == decoded else "Test failed!")

    # Round-trip the first 100 characters of the dataset; this one must match.
    shakespeare_sample = text[:100]
    print(f"\nShakespeare sample: {shakespeare_sample}")
    encoded_sample = tokenizer.encode(shakespeare_sample)
    print(f"Encoded (first 20 tokens): {encoded_sample[:20]}")
    decoded_sample = tokenizer.decode(encoded_sample)
    assert shakespeare_sample == decoded_sample, "Encoding/decoding mismatch!"
    print("Shakespeare encoding test passed!")

    # Report and persist.
    tokenizer.get_stats()
    tokenizer.save()

    banner("TOKENIZER BUILD COMPLETE")
    print("\nTokenizer ready for model training!")
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    print("Saved to: models/tokenizer.json")
    print("\nNext step: Build the model architecture")
    print(rule + "\n")
# Run the build/test routine only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|