import json import re from collections import Counter from pathlib import Path DATA_PATH = 'codsworth/data/train/tinystories.txt' VOCAB_PATH = 'codsworth/tokenizer.json' VOCAB_SIZE = 20000 print(f"Loading data from {DATA_PATH}...") with open(DATA_PATH, 'r', encoding='utf-8', errors='ignore') as f: text = f.read() print(f"Text length: {len(text):,} characters") print("Tokenizing...") tokens = re.findall(r'\b\w+\b', text.lower()) print(f"Total tokens: {len(tokens):,}") print("Counting frequencies...") word_counts = Counter(tokens) special_tokens = ['', '', '', ''] print(f"Special tokens: {special_tokens}") most_common = [word for word, _ in word_counts.most_common(VOCAB_SIZE - len(special_tokens))] vocab = {} for i, token in enumerate(special_tokens): vocab[token] = i for i, word in enumerate(most_common): vocab[word] = i + len(special_tokens) print(f"Vocabulary size: {len(vocab)}") print(f"Saving to {VOCAB_PATH}...") with open(VOCAB_PATH, 'w', encoding='utf-8') as f: json.dump(vocab, f, ensure_ascii=False) print("Done!") print(f"Sample: {list(vocab.items())[:10]}")