Jaqshanahan
/

codsworth-3.8m

Model card Files Files and versions

codsworth-3.8m / create_tokenizer.py

Jaqshanahan's picture

Initial upload of Codsworth model

b84d85a verified about 1 month ago

history blame contribute delete

1.13 kB

	import json
	import re
	from collections import Counter
	from pathlib import Path

	DATA_PATH = 'codsworth/data/train/tinystories.txt'
	VOCAB_PATH = 'codsworth/tokenizer.json'
	VOCAB_SIZE = 20000

	# Reserved entries; build_vocab() always gives them ids 0..3, in this order.
	SPECIAL_TOKENS = ('<pad>', '<unk>', '<bos>', '<eos>')


	def tokenize(text):
	    """Return the lowercase word tokens found in *text*.

	    A token is a maximal run of ``\\w`` characters, so whitespace and
	    punctuation are discarded rather than kept as tokens.

	    Args:
	        text: The raw text to split.

	    Returns:
	        A list of lowercase token strings, in order of appearance.
	    """
	    return re.findall(r'\b\w+\b', text.lower())


	def build_vocab(word_counts, vocab_size=VOCAB_SIZE):
	    """Map the special tokens and the most frequent words to integer ids.

	    Special tokens come first (ids ``0 .. len(SPECIAL_TOKENS) - 1``); the
	    remaining slots are filled with words in descending frequency order.
	    Words with equal counts keep the order in which the counter first saw
	    them.

	    Args:
	        word_counts: A ``collections.Counter`` of token -> occurrences.
	        vocab_size: Upper bound on the number of entries, special tokens
	            included. If it is smaller than the number of special tokens,
	            the result contains the special tokens only.

	    Returns:
	        A dict mapping each token string to its integer id.
	    """
	    # Clamp so an undersized vocab_size never hands a negative count to
	    # most_common().
	    word_slots = max(vocab_size - len(SPECIAL_TOKENS), 0)
	    ordered = list(SPECIAL_TOKENS)
	    ordered.extend(word for word, _ in word_counts.most_common(word_slots))
	    # tokenize() cannot produce '<...>' strings, so corpus words do not
	    # collide with the special tokens and every id is unique.
	    return {token: index for index, token in enumerate(ordered)}


	def main():
	    """Read DATA_PATH, build the vocabulary and write it to VOCAB_PATH as JSON."""
	    print(f"Loading data from {DATA_PATH}...")

	    # errors='ignore' drops undecodable bytes instead of aborting the run.
	    with open(DATA_PATH, 'r', encoding='utf-8', errors='ignore') as f:
	        text = f.read()

	    print(f"Text length: {len(text):,} characters")

	    print("Tokenizing...")
	    tokens = tokenize(text)
	    print(f"Total tokens: {len(tokens):,}")

	    print("Counting frequencies...")
	    word_counts = Counter(tokens)

	    print(f"Special tokens: {list(SPECIAL_TOKENS)}")

	    vocab = build_vocab(word_counts)

	    print(f"Vocabulary size: {len(vocab)}")

	    print(f"Saving to {VOCAB_PATH}...")
	    with open(VOCAB_PATH, 'w', encoding='utf-8') as f:
	        # ensure_ascii=False keeps non-ASCII tokens readable in the file.
	        json.dump(vocab, f, ensure_ascii=False)

	    print("Done!")
	    print(f"Sample: {list(vocab.items())[:10]}")


	if __name__ == "__main__":
	    main()