# codsworth-3.8m / create_tokenizer.py
# Last commit by Jaqshanahan: "Initial upload of Codsworth model" (b84d85a, verified)
import json
import re
from collections import Counter
from pathlib import Path
# Build a word-level vocabulary from a plain-text corpus and save it as JSON.
#
# The vocabulary maps each token string to an integer id. The special tokens
# take the first ids; the remaining ids go to the most frequent lower-cased
# words of the corpus, most frequent first.

DATA_PATH = 'codsworth/data/train/tinystories.txt'
VOCAB_PATH = 'codsworth/tokenizer.json'
VOCAB_SIZE = 20000

# Reserved entries that always occupy the first ids of the vocabulary.
SPECIAL_TOKENS = ['<pad>', '<unk>', '<bos>', '<eos>']

# A token is a maximal run of word characters; punctuation and whitespace
# are discarded.
_WORD_RE = re.compile(r'\b\w+\b')


def count_words(lines):
    """Count lower-cased word tokens over an iterable of text lines.

    The input is consumed one line at a time, so a large corpus never has to
    be held in memory as a single string or as a list of all its tokens.
    A token cannot span a line break (a newline is not a word character),
    so counting per line gives the same result as tokenizing the whole text.

    Args:
        lines: Iterable of strings, for example an open text file.

    Returns:
        A ``(char_count, word_counts)`` tuple: the total number of characters
        seen (before lower-casing) and a ``Counter`` mapping each token to
        its frequency.
    """
    char_count = 0
    word_counts = Counter()
    for line in lines:
        char_count += len(line)
        word_counts.update(_WORD_RE.findall(line.lower()))
    return char_count, word_counts


def build_vocab(word_counts, vocab_size, special_tokens):
    """Build the token -> id mapping.

    Args:
        word_counts: ``Counter`` of token frequencies.
        vocab_size: Target number of entries, special tokens included.
        special_tokens: Sequence of reserved tokens; they receive ids
            ``0 .. len(special_tokens) - 1`` in the given order.

    Returns:
        A dict mapping every special token and the most frequent words to
        consecutive integer ids. All special tokens are always present, even
        when ``vocab_size`` is smaller than their number.
    """
    # Clamp so a tiny vocab_size simply yields no regular words.
    word_slots = max(vocab_size - len(special_tokens), 0)
    ordered = list(special_tokens)
    ordered.extend(word for word, _ in word_counts.most_common(word_slots))
    return {token: index for index, token in enumerate(ordered)}


def main(data_path=DATA_PATH, vocab_path=VOCAB_PATH, vocab_size=VOCAB_SIZE):
    """Read the corpus, build the vocabulary, and write it as JSON.

    Args:
        data_path: Path of the UTF-8 text corpus to read.
        vocab_path: Path of the JSON file to write.
        vocab_size: Target vocabulary size, special tokens included.

    Returns:
        The vocabulary dict that was written to ``vocab_path``.

    Raises:
        OSError: If the corpus cannot be read or the output cannot be written.
    """
    print(f"Loading data from {data_path}...")
    # errors='ignore' is kept from the original: undecodable bytes are
    # dropped rather than aborting the whole run.
    with open(data_path, 'r', encoding='utf-8', errors='ignore') as f:
        char_count, word_counts = count_words(f)

    # Reading, tokenizing and counting now happen in the single streaming
    # pass above; the progress messages keep their original order so the
    # script's output is unchanged.
    print(f"Text length: {char_count:,} characters")
    print("Tokenizing...")
    print(f"Total tokens: {sum(word_counts.values()):,}")
    print("Counting frequencies...")

    print(f"Special tokens: {SPECIAL_TOKENS}")
    vocab = build_vocab(word_counts, vocab_size, SPECIAL_TOKENS)
    print(f"Vocabulary size: {len(vocab)}")

    print(f"Saving to {vocab_path}...")
    # Make sure the destination directory exists before opening the file.
    Path(vocab_path).parent.mkdir(parents=True, exist_ok=True)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False)

    print("Done!")
    print(f"Sample: {list(vocab.items())[:10]}")
    return vocab


if __name__ == "__main__":
    main()