| import re |
| import os |
| import json |
| from abc import ABC, abstractmethod |
| from typing import Iterable |
|
|
| import tiktoken |
|
|
| _PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)') |
|
|
|
|
| def build_vocab(text: str) -> dict[str, int]: |
| """Build a vocab dict {word -> int} from raw text.""" |
| tokens = [t for t in _PATTERN.split(text.lower()) if t and not t.isspace()] |
| all_words: list[str] = sorted(set(tokens)) |
| all_words.extend(["<unk>", "<eos>"]) |
| return {word: idx for idx, word in enumerate(all_words)} |
|
|
|
|
| def save_vocab(vocab: dict[str, int], file_path: str) -> None: |
| os.makedirs(file_path, exist_ok=True) |
| with open(os.path.join(file_path, "vocab.json"), "w") as f: |
| json.dump(vocab, f, indent=2) |
|
|
|
|
| class BaseTokenizer(ABC): |
| @abstractmethod |
| def encode(self, text: str) -> list[int]: ... |
|
|
| @abstractmethod |
| def decode(self, tokens: Iterable[int]) -> str: ... |
|
|
|
|
| class SimpleTokenizer(BaseTokenizer): |
| """Vanilla tokenizer that builds a str→int map over the whole dataset.""" |
|
|
| def __init__(self, vocab: dict[str, int]) -> None: |
| self.str_to_int = vocab |
| self.int_to_str = {v: k for k, v in vocab.items()} |
|
|
| def encode(self, text: str) -> list[int]: |
| tokens = [t for t in _PATTERN.split(text.lower()) if t and not t.isspace()] |
| ids = [self.str_to_int.get(t, self.str_to_int["<unk>"]) for t in tokens] |
| ids.append(self.str_to_int["<eos>"]) |
| return ids |
|
|
| def decode(self, tokens: Iterable[int]) -> str: |
| return " ".join(self.int_to_str.get(t, "<unk>") for t in tokens) |
|
|
|
|
| class TikTokenizer(BaseTokenizer): |
| """BPE tokenizer used in GPT-2, via the tiktoken library.""" |
|
|
| def __init__(self, model_type: str) -> None: |
| self.tokenizer = tiktoken.get_encoding(model_type) |
|
|
| def encode(self, text: str) -> list[int]: |
| return self.tokenizer.encode(text) |
|
|
| def decode(self, tokens: Iterable[int]) -> str: |
| return self.tokenizer.decode(list(tokens)) |
|
|