# slm-tiny-stories / tokenizer.py
import json
def build_tokenizer_word_by_word_from_texts(texts, save_path="tokenizer_words.json", pad_token="[PAD]"):
    """Build a word-level tokenizer from a list of texts and save it as JSON."""
    all_text = " ".join(texts)
    words = all_text.split()
    unique_words = sorted(set(words))
    # Reserve index 0 for the padding token if it is not already in the vocabulary.
    if pad_token not in unique_words:
        unique_words.insert(0, pad_token)
    stoi = {word: i for i, word in enumerate(unique_words)}
    itos = {i: word for word, i in stoi.items()}
    tokenizer = {"stoi": stoi, "itos": itos, "pad_token": pad_token}
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, ensure_ascii=False, indent=2)
    print(f"Word-level tokenizer saved to {save_path} ({len(stoi)} tokens)")
    # Return the mappings so callers can use them without reloading from disk.
    return tokenizer
def build_tokenizer_char_by_char_from_texts(texts, save_path="tokenizer.json", pad_token="[PAD]"):
    """Build a character-level tokenizer from a list of texts and save it as JSON."""
    all_text = " ".join(texts)
    unique_chars = sorted(set(all_text))
    # Reserve index 0 for the padding token if it is not already in the vocabulary.
    if pad_token not in unique_chars:
        unique_chars.insert(0, pad_token)
    stoi = {ch: i for i, ch in enumerate(unique_chars)}
    itos = {i: ch for ch, i in stoi.items()}
    tokenizer = {"stoi": stoi, "itos": itos, "pad_token": pad_token}
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer, f, ensure_ascii=False, indent=2)
    print(f"Character-level tokenizer saved to {save_path} ({len(stoi)} tokens)")
    # Return the mappings so callers can use them without reloading from disk.
    return tokenizer
def load_tokenizer(path="tokenizer.json", level="char"):
    """Load a saved tokenizer; `level` ("char" or "word") must match how it was
    built. Returns (stoi, itos, encode, decode, pad_token_id)."""
    with open(path, "r", encoding="utf-8") as f:
        tokenizer = json.load(f)
    stoi = tokenizer["stoi"]
    # JSON serializes dict keys as strings; convert itos keys back to ints.
    itos = {int(k): v for k, v in tokenizer["itos"].items()}
    # The original loader always split on whitespace, which silently breaks
    # character-level tokenizers; pick encode/decode based on `level` instead.
    if level == "word":
        encode = lambda s: [stoi[word] for word in s.split()]
        decode = lambda ids: " ".join(itos[i] for i in ids)
    else:
        encode = lambda s: [stoi[ch] for ch in s]
        decode = lambda ids: "".join(itos[i] for i in ids)
    pad_token = tokenizer.get("pad_token", "[PAD]")
    pad_token_id = stoi[pad_token]
    return stoi, itos, encode, decode, pad_token_id
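# Minimal usage sketch (not part of the original module): builds both
# tokenizers from two toy sentences, reloads them, and round-trips a string
# through encode/decode. The sample texts are illustrative assumptions; the
# save paths match the defaults above.
if __name__ == "__main__":
    sample_texts = ["once upon a time", "there was a tiny story"]

    # Character-level round trip.
    build_tokenizer_char_by_char_from_texts(sample_texts, save_path="tokenizer.json")
    _, _, encode, decode, pad_id = load_tokenizer("tokenizer.json", level="char")
    assert decode(encode("a tiny story")) == "a tiny story"

    # Word-level round trip.
    build_tokenizer_word_by_word_from_texts(sample_texts, save_path="tokenizer_words.json")
    _, _, encode_w, decode_w, _ = load_tokenizer("tokenizer_words.json", level="word")
    assert decode_w(encode_w("a tiny story")) == "a tiny story"

    print("round-trip OK, char-level pad id:", pad_id)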