"""Build a word-level vocabulary from the text corpus at DATA_PATH.

The corpus is lowercased, split into ``\\w+`` words, and the most frequent
words (after the reserved special tokens) are written to VOCAB_PATH as a
JSON object mapping token -> integer id.
"""
import json
import re
from collections import Counter
from pathlib import Path


DATA_PATH = 'codsworth/data/train/tinystories.txt'
VOCAB_PATH = 'codsworth/tokenizer.json'
VOCAB_SIZE = 20000

# Reserved tokens; they always receive ids 0..len(SPECIAL_TOKENS)-1.
SPECIAL_TOKENS = ['<pad>', '<unk>', '<bos>', '<eos>']

# Compiled once because it is applied to every line of the corpus.
WORD_PATTERN = re.compile(r'\b\w+\b')


def count_words(lines):
    """Count lowercased word frequencies over an iterable of text lines.

    The input is consumed one line at a time, so the full corpus, its
    lowercased copy and the complete token list are never held in memory
    together. ``\\w`` never matches a newline, so no token can span two
    lines and the result equals tokenizing the concatenated text at once.

    Args:
        lines: Iterable of strings (e.g. an open text file).

    Returns:
        A ``(word_counts, total_chars, total_tokens)`` tuple where
        ``word_counts`` is a ``Counter`` keyed by lowercased word,
        ``total_chars`` is the summed length of all lines and
        ``total_tokens`` is the number of words found.
    """
    word_counts = Counter()
    total_chars = 0
    total_tokens = 0
    for line in lines:
        total_chars += len(line)
        words = WORD_PATTERN.findall(line.lower())
        total_tokens += len(words)
        # Updating per line keeps first-seen insertion order, which is what
        # Counter.most_common() uses to break frequency ties.
        word_counts.update(words)
    return word_counts, total_chars, total_tokens


def build_vocab(word_counts: Counter, vocab_size: int, special_tokens: list) -> dict:
    """Map special tokens and the most frequent words to consecutive ids.

    Args:
        word_counts: Word frequencies, e.g. from ``count_words``.
        vocab_size: Maximum vocabulary size, special tokens included.
        special_tokens: Tokens assigned ids ``0..len(special_tokens)-1``.

    Returns:
        A dict mapping token -> id. Words follow the special tokens in
        descending frequency order. If ``vocab_size`` leaves no room for
        words, only the special tokens are returned.
    """
    vocab = {token: i for i, token in enumerate(special_tokens)}
    # Clamp so an undersized vocab_size cannot produce a negative request.
    word_budget = max(vocab_size - len(special_tokens), 0)
    most_common = word_counts.most_common(word_budget)
    for i, (word, _) in enumerate(most_common, start=len(special_tokens)):
        vocab[word] = i
    return vocab


def main() -> None:
    """Read DATA_PATH, build the vocabulary and save it to VOCAB_PATH."""
    print(f"Loading data from {DATA_PATH}...")

    # Tokenizing and counting happen together in one streaming pass, so both
    # stage messages are printed up front and the totals afterwards.
    print("Tokenizing...")
    print("Counting frequencies...")
    # errors='ignore' silently drops undecodable bytes (kept from the
    # original as a deliberate best-effort read).
    with open(DATA_PATH, 'r', encoding='utf-8', errors='ignore') as f:
        word_counts, total_chars, total_tokens = count_words(f)

    print(f"Text length: {total_chars:,} characters")
    print(f"Total tokens: {total_tokens:,}")

    print(f"Special tokens: {SPECIAL_TOKENS}")
    vocab = build_vocab(word_counts, VOCAB_SIZE, SPECIAL_TOKENS)
    print(f"Vocabulary size: {len(vocab)}")

    print(f"Saving to {VOCAB_PATH}...")
    # Make sure the output directory exists before opening the file.
    Path(VOCAB_PATH).parent.mkdir(parents=True, exist_ok=True)
    with open(VOCAB_PATH, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False)

    print("Done!")
    print(f"Sample: {list(vocab.items())[:10]}")


if __name__ == "__main__":
    main()