triton329
/

gpt2

Model card Files Files and versions

gpt2 / src /data /tokenizer.py

triton329's picture

Upload folder using huggingface_hub

c21e887 verified 13 days ago

History Blame Contribute Delete

1.96 kB

	import re
	import os
	import json
	from abc import ABC, abstractmethod
	from typing import Iterable

	import tiktoken

	# Token boundary pattern: a single punctuation character, a double dash, or
	# one whitespace character.  The capturing group makes re.split() return the
	# delimiters as tokens too.  The alternation bars must stay unescaped --
	# r'\|' would match a literal "|" and the pattern would never split anything.
	_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')


	def build_vocab(text: str) -> dict[str, int]:
	    """Build a vocab dict {word -> int} from raw text.

	    The text is lowercased and split with ``_PATTERN``; empty and
	    whitespace-only pieces are discarded.  The unique tokens are sorted and
	    numbered from 0, and the special tokens ``<unk>`` and ``<eos>`` receive
	    the last two ids, in that order.

	    Args:
	        text: Raw corpus text.

	    Returns:
	        Mapping from token string to a contiguous integer id.
	    """
	    special_tokens = ["<unk>", "<eos>"]
	    tokens = [t for t in _PATTERN.split(text.lower()) if t and not t.isspace()]
	    # Exclude the special tokens if they occur literally in the corpus: listing
	    # them twice would leave an unused id, so max(id) would exceed
	    # len(vocab) - 1.
	    all_words: list[str] = sorted(set(tokens).difference(special_tokens))
	    all_words.extend(special_tokens)
	    return {word: idx for idx, word in enumerate(all_words)}


	def save_vocab(vocab: dict[str, int], file_path: str) -> None:
	    """Write ``vocab`` as ``vocab.json`` inside the directory ``file_path``.

	    The directory (including missing parents) is created when it does not
	    exist yet.  The JSON is written with a two-space indent.

	    Args:
	        vocab: Token-to-id mapping to serialise.
	        file_path: Directory that will contain ``vocab.json``.
	    """
	    os.makedirs(file_path, exist_ok=True)
	    destination = os.path.join(file_path, "vocab.json")
	    with open(destination, "w") as out_file:
	        json.dump(vocab, out_file, indent=2)


	class BaseTokenizer(ABC):
	    """Abstract tokenizer interface: text to token ids and back."""

	    @abstractmethod
	    def encode(self, text: str) -> list[int]:
	        """Convert ``text`` into a list of integer token ids."""

	    @abstractmethod
	    def decode(self, tokens: Iterable[int]) -> str:
	        """Convert an iterable of token ids back into a string."""


	class SimpleTokenizer(BaseTokenizer):
	    """Word-level tokenizer backed by a fixed str→int vocabulary."""

	    def __init__(self, vocab: dict[str, int]) -> None:
	        """Store ``vocab`` and derive the reverse id-to-token table from it."""
	        self.str_to_int = vocab
	        self.int_to_str = {idx: word for word, idx in vocab.items()}

	    def encode(self, text: str) -> list[int]:
	        """Return the token ids of ``text`` followed by the ``<eos>`` id.

	        The text is lowercased and split with ``_PATTERN``; empty and
	        whitespace-only pieces are skipped.  Tokens missing from the
	        vocabulary map to the ``<unk>`` id.  A ``KeyError`` is raised if the
	        vocabulary lacks ``<eos>``, or lacks ``<unk>`` while there is at least
	        one token to look up.
	        """
	        table = self.str_to_int
	        ids = []
	        for piece in _PATTERN.split(text.lower()):
	            if not piece or piece.isspace():
	                continue
	            # The fallback is looked up for every token, as dict.get()
	            # evaluates its default argument eagerly.
	            ids.append(table.get(piece, table["<unk>"]))
	        ids.append(table["<eos>"])
	        return ids

	    def decode(self, tokens: Iterable[int]) -> str:
	        """Join the tokens for ``tokens`` with single spaces.

	        Ids that are not in the vocabulary are rendered as ``<unk>``.
	        """
	        words = [self.int_to_str.get(token_id, "<unk>") for token_id in tokens]
	        return " ".join(words)


	class TikTokenizer(BaseTokenizer):
	    """Tokenizer that delegates to a tiktoken encoding (BPE, as used by GPT-2)."""

	    def __init__(self, model_type: str) -> None:
	        """Load the tiktoken encoding registered under the name ``model_type``."""
	        encoding = tiktoken.get_encoding(model_type)
	        self.tokenizer = encoding

	    def encode(self, text: str) -> list[int]:
	        """Return the token ids that the wrapped encoding produces for ``text``."""
	        token_ids = self.tokenizer.encode(text)
	        return token_ids

	    def decode(self, tokens: Iterable[int]) -> str:
	        """Decode ``tokens`` back to text with the wrapped encoding."""
	        # Materialise the iterable first: the original passes a list to decode().
	        id_list = list(tokens)
	        return self.tokenizer.decode(id_list)