gpt2 / src /data /tokenizer.py
triton329's picture
Upload folder using huggingface_hub
c21e887 verified
Raw
History Blame Contribute Delete
1.96 kB
import re
import os
import json
from abc import ABC, abstractmethod
from typing import Iterable
import tiktoken
_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
def build_vocab(text: str) -> dict[str, int]:
"""Build a vocab dict {word -> int} from raw text."""
tokens = [t for t in _PATTERN.split(text.lower()) if t and not t.isspace()]
all_words: list[str] = sorted(set(tokens))
all_words.extend(["<unk>", "<eos>"])
return {word: idx for idx, word in enumerate(all_words)}
def save_vocab(vocab: dict[str, int], file_path: str) -> None:
os.makedirs(file_path, exist_ok=True)
with open(os.path.join(file_path, "vocab.json"), "w") as f:
json.dump(vocab, f, indent=2)
class BaseTokenizer(ABC):
@abstractmethod
def encode(self, text: str) -> list[int]: ...
@abstractmethod
def decode(self, tokens: Iterable[int]) -> str: ...
class SimpleTokenizer(BaseTokenizer):
"""Vanilla tokenizer that builds a str→int map over the whole dataset."""
def __init__(self, vocab: dict[str, int]) -> None:
self.str_to_int = vocab
self.int_to_str = {v: k for k, v in vocab.items()}
def encode(self, text: str) -> list[int]:
tokens = [t for t in _PATTERN.split(text.lower()) if t and not t.isspace()]
ids = [self.str_to_int.get(t, self.str_to_int["<unk>"]) for t in tokens]
ids.append(self.str_to_int["<eos>"])
return ids
def decode(self, tokens: Iterable[int]) -> str:
return " ".join(self.int_to_str.get(t, "<unk>") for t in tokens)
class TikTokenizer(BaseTokenizer):
"""BPE tokenizer used in GPT-2, via the tiktoken library."""
def __init__(self, model_type: str) -> None:
self.tokenizer = tiktoken.get_encoding(model_type)
def encode(self, text: str) -> list[int]:
return self.tokenizer.encode(text)
def decode(self, tokens: Iterable[int]) -> str:
return self.tokenizer.decode(list(tokens))