ace-1 committed
Commit 6c7e241 · verified · 1 Parent(s): 7ca7a76

Upload mgpt2 tokenizer

README.md ADDED
@@ -0,0 +1,22 @@
+ # ace-1/mgpt2-tokenizer
+
+ Custom mgpt2 tokenizer (pure Python), exported for loading with Hugging Face `AutoTokenizer` and `trust_remote_code=True`.
+
+ ## Usage
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained('ace-1/mgpt2-tokenizer', trust_remote_code=True)
+ print(tok.encode('hello world'))
+ ```
+
+ ## Contents
+ - Trained tokenizer artifact: `mgpt2_dev.model` (native `.model` format)
+ - Python implementation under `tokenizer/` (loaded via `trust_remote_code`)
+
+ ## Evaluation
+
+ Evaluated on `heldout_eval.txt` with `--limit 10000`.
+ See `evaluation.json` for metrics (bytes/token, p95 tokens/line, and a bucket breakdown: latin / mixed).
+
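The evaluation script itself is not included in this upload. As an illustrative sketch (the file paths and the `--limit` handling below are assumptions inferred from `evaluation.json`), the headline per-line metrics for the candidate tokenizer can be recomputed roughly like this:

```python
# Hedged sketch, not the actual evaluation code: bytes/token and p95 tokens/line
# for the candidate tokenizer on the held-out file (paths are assumptions).
import statistics

from tokenizer.regex_tokenizer import RegexTokenizer

tok = RegexTokenizer()
tok.load("tokenizer/artifacts/mgpt2_dev.model")

total_bytes = 0
total_tokens = 0
tokens_per_line = []
with open("tokenizer/artifacts/heldout_eval.txt", encoding="utf-8") as f:
    for lineno, line in enumerate(f):
        if lineno >= 10000:  # mirrors --limit 10000
            break
        ids = tok.encode(line)
        total_bytes += len(line.encode("utf-8"))
        total_tokens += len(ids)
        tokens_per_line.append(len(ids))

print("bytes/token:", total_bytes / total_tokens)
print("p95 tokens/line:", statistics.quantiles(tokens_per_line, n=20)[-1])
```
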
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<|endoftext|>": 7995
+ }
evaluation.json ADDED
@@ -0,0 +1,129 @@
+ {
+   "text": "tokenizer/artifacts/heldout_eval.txt",
+   "limit": 10000,
+   "overall": [
+     {
+       "name": "tiktoken_cl100k_base",
+       "total_chars": 43290048,
+       "total_bytes": 43442607,
+       "total_tokens": 11515953,
+       "tokens_per_1k_chars": 266.0184853571888,
+       "tokens_per_1k_bytes": 265.0842984630273,
+       "bytes_per_token": 3.772384882084878,
+       "chars_per_token": 3.759137259417436,
+       "p50_tokens_per_line": 604,
+       "p95_tokens_per_line": 3719,
+       "p95_tokens_per_1k_bytes_per_line": 394.09722222222223
+     },
+     {
+       "name": "mgpt2_GPT4Tokenizer_reference",
+       "total_chars": 43290048,
+       "total_bytes": 43442607,
+       "total_tokens": 11515953,
+       "tokens_per_1k_chars": 266.0184853571888,
+       "tokens_per_1k_bytes": 265.0842984630273,
+       "bytes_per_token": 3.772384882084878,
+       "chars_per_token": 3.759137259417436,
+       "p50_tokens_per_line": 604,
+       "p95_tokens_per_line": 3719,
+       "p95_tokens_per_1k_bytes_per_line": 394.09722222222223
+     },
+     {
+       "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2_dev.model)",
+       "total_chars": 43290048,
+       "total_bytes": 43442607,
+       "total_tokens": 11749984,
+       "tokens_per_1k_chars": 271.4246008690034,
+       "tokens_per_1k_bytes": 270.47142912026436,
+       "bytes_per_token": 3.697248183486888,
+       "chars_per_token": 3.68426442112602,
+       "p50_tokens_per_line": 658,
+       "p95_tokens_per_line": 3543,
+       "p95_tokens_per_1k_bytes_per_line": 334.74443399184696
+     }
+   ],
+   "by_bucket": {
+     "latin": [
+       {
+         "name": "tiktoken_cl100k_base",
+         "total_chars": 42393232,
+         "total_bytes": 42542626,
+         "total_tokens": 11183977,
+         "tokens_per_1k_chars": 263.81515332447407,
+         "tokens_per_1k_bytes": 262.88873188034984,
+         "bytes_per_token": 3.8038906911199835,
+         "chars_per_token": 3.790532831031394,
+         "p50_tokens_per_line": 601,
+         "p95_tokens_per_line": 3613,
+         "p95_tokens_per_1k_bytes_per_line": 394.2307692307692
+       },
+       {
+         "name": "mgpt2_GPT4Tokenizer_reference",
+         "total_chars": 42393232,
+         "total_bytes": 42542626,
+         "total_tokens": 11183977,
+         "tokens_per_1k_chars": 263.81515332447407,
+         "tokens_per_1k_bytes": 262.88873188034984,
+         "bytes_per_token": 3.8038906911199835,
+         "chars_per_token": 3.790532831031394,
+         "p50_tokens_per_line": 601,
+         "p95_tokens_per_line": 3613,
+         "p95_tokens_per_1k_bytes_per_line": 394.2307692307692
+       },
+       {
+         "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2_dev.model)",
+         "total_chars": 42393232,
+         "total_bytes": 42542626,
+         "total_tokens": 11499062,
+         "tokens_per_1k_chars": 271.2475897096027,
+         "tokens_per_1k_bytes": 270.2950682922112,
+         "bytes_per_token": 3.6996605462254224,
+         "chars_per_token": 3.686668703934286,
+         "p50_tokens_per_line": 654,
+         "p95_tokens_per_line": 3472,
+         "p95_tokens_per_1k_bytes_per_line": 334.74443399184696
+       }
+     ],
+     "mixed": [
+       {
+         "name": "tiktoken_cl100k_base",
+         "total_chars": 896816,
+         "total_bytes": 899981,
+         "total_tokens": 331976,
+         "tokens_per_1k_chars": 370.17180781788016,
+         "tokens_per_1k_bytes": 368.8700094779779,
+         "bytes_per_token": 2.710982119189339,
+         "chars_per_token": 2.701448297467287,
+         "p50_tokens_per_line": 1688,
+         "p95_tokens_per_line": 13746,
+         "p95_tokens_per_1k_bytes_per_line": 390.70425858204555
+       },
+       {
+         "name": "mgpt2_GPT4Tokenizer_reference",
+         "total_chars": 896816,
+         "total_bytes": 899981,
+         "total_tokens": 331976,
+         "tokens_per_1k_chars": 370.17180781788016,
+         "tokens_per_1k_bytes": 368.8700094779779,
+         "bytes_per_token": 2.710982119189339,
+         "chars_per_token": 2.701448297467287,
+         "p50_tokens_per_line": 1688,
+         "p95_tokens_per_line": 13746,
+         "p95_tokens_per_1k_bytes_per_line": 390.70425858204555
+       },
+       {
+         "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2_dev.model)",
+         "total_chars": 896816,
+         "total_bytes": 899981,
+         "total_tokens": 250922,
+         "tokens_per_1k_chars": 279.7920643699488,
+         "tokens_per_1k_bytes": 278.8081081711725,
+         "bytes_per_token": 3.5866962641777125,
+         "chars_per_token": 3.574082782697412,
+         "p50_tokens_per_line": 1316,
+         "p95_tokens_per_line": 9925,
+         "p95_tokens_per_1k_bytes_per_line": 333.2627791300667
+       }
+     ]
+   }
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "eos_token": "<|endoftext|>"
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31d08b39fa5466b3913866a8dc1a26fa2f86578b814bab25ac978641609a6a40
+ size 65352
tokenizer.vocab ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/__init__.py ADDED
@@ -0,0 +1,15 @@
+ from .base import Tokenizer
+ from .basic import BasicTokenizer
+ from .regex_tokenizer import RegexTokenizer
+ from .gpt4 import GPT4Tokenizer
+ from .patterns import GPT4_SPLIT_PATTERN, INDIC_SPLIT_PATTERN
+
+ __all__ = [
+     "Tokenizer",
+     "BasicTokenizer",
+     "RegexTokenizer",
+     "GPT4Tokenizer",
+     "GPT4_SPLIT_PATTERN",
+     "INDIC_SPLIT_PATTERN",
+ ]
+
tokenizer/base.py ADDED
@@ -0,0 +1,158 @@
+ """
+ A minimal implementation of Byte-Pair Encoding (BPE) tokenization.
+
+ BPE is a subword tokenization algorithm that iteratively merges the most frequent pairs of bytes or characters
+ to build a vocabulary of subword tokens. This implementation is inspired by Andrej Karpathy's minbpe
+ (https://github.com/karpathy/minbpe).
+ """
+ import unicodedata
+
+ def get_stats(ids, freq):
+     for pair in zip(ids[:-1], ids[1:]):
+         freq[pair] = freq.get(pair, 0) + 1
+
+ def merge(ids, pair, idx):
+     newids = []
+     i = 0
+     while i < len(ids):
+         if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+             newids.append(idx)
+             i += 2
+         else:
+             newids.append(ids[i])
+             i += 1
+     return newids
+
+ def visualise_tokens(token_values: list[bytes]) -> None:
+     background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
+     # If token boundaries do not occur at unicode character boundaries, it's unclear how best to
+     # visualise the token. Here, we'll just use the unicode replacement character to represent some
+     # fraction of a character.
+     unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values]
+
+     running_length = 0
+     last_color = None
+     for token in unicode_token_values:
+         color = background[running_length % len(background)]
+         if color == last_color:
+             color = background[(running_length + 1) % len(background)]
+             assert color != last_color
+         last_color = color
+         running_length += len(token)
+         print(color + token, end="")
+     print("\u001b[0m")
+
+ # two helper functions for pretty printing tokens in save()...
+ def replace_control_characters(s: str) -> str:
+     # we don't want to print control characters
+     # which distort the output (e.g. \n or much worse)
+     # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
+     # http://www.unicode.org/reports/tr44/#GC_Values_Table
+     chars = []
+     for ch in s:
+         if unicodedata.category(ch)[0] != "C":
+             chars.append(ch) # this character is ok
+         else:
+             chars.append(f"\\u{ord(ch):04x}") # escape
+     return "".join(chars)
+
+ def render_token(t: bytes) -> str:
+     # pretty print a token, escaping control characters
+     s = t.decode('utf-8', errors='replace')
+     s = replace_control_characters(s)
+     return s
+
+ #--------------------------------------------------------------------------------------------------
+ class Tokenizer:
+     def __init__(self):
+         self.merges = {} # (int, int) -> int
+         self.pattern = "" # str
+         self.special_tokens = {} # str -> int e.g {'<|endoftext|>': 100257}
+         self.inverse_special_tokens = {} # int -> str
+         self.vocab = self._build_vocab() # int -> bytes
+
+     def _build_vocab(self):
+         vocab = {idx: bytes([idx]) for idx in range(256)}
+         for (p0, p1), idx in self.merges.items():
+             vocab[idx] = vocab[p0] + vocab[p1]
+         return vocab
+
+     def train(self, text, vocab_size, verbose=False):
+         raise NotImplementedError
+
+     def decode(self, ids) -> str:
+         raise NotImplementedError
+
+     def encode(self, text, verbose=False) -> list[int]:
+         raise NotImplementedError
+
+     def save(self, file_prefix):
+         """
+         Saves two files: file_prefix.vocab and file_prefix.model
+         This is inspired by (but not equivalent to!) sentencepiece's model saving:
+         - model file is the critical one, intended for load()
+         - vocab file is just a pretty printed version for human inspection only
+         """
+         # write the model: to be used in load() later
+         model_file = file_prefix + ".model"
+         with open(model_file, 'w') as f:
+             # write the version, pattern and merges, that's all that's needed
+             f.write("minbpe v1\n")
+             f.write(f"{self.pattern}\n")
+             # write the special tokens, first the number of them, then each one
+             f.write(f"{len(self.special_tokens)}\n")
+             for special, idx in self.special_tokens.items():
+                 f.write(f"{special} {idx}\n")
+             # the merges dict
+             for idx1, idx2 in self.merges:
+                 f.write(f"{idx1} {idx2}\n")
+         # write the vocab: for the human to look at
+         vocab_file = file_prefix + ".vocab"
+         inverted_merges = {idx: pair for pair, idx in self.merges.items()}
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             for idx, token in self.vocab.items():
+                 # note: many tokens may be partial utf-8 sequences
+                 # and cannot be decoded into valid strings. Here we're using
+                 # errors='replace' to replace them with the replacement char �.
+                 # this also means that we couldn't possibly use .vocab in load()
+                 # because decoding in this way is a lossy operation!
+                 s = render_token(token)
+                 # find the children of this token, if any
+                 if idx in inverted_merges:
+                     # if this token has children, render it nicely as a merge
+                     idx0, idx1 = inverted_merges[idx]
+                     s0 = render_token(self.vocab[idx0])
+                     s1 = render_token(self.vocab[idx1])
+                     f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
+                 else:
+                     # otherwise this is a leaf token, just print it
+                     # (this should just be the first 256 tokens, the bytes)
+                     f.write(f"[{s}] {idx}\n")
+
+     def load(self, model_file):
+         """Inverse of save() but only for the model file"""
+         assert model_file.endswith(".model")
+         # read the model file
+         merges = {}
+         special_tokens = {}
+         idx = 256
+         with open(model_file, 'r', encoding="utf-8") as f:
+             # read the version
+             version = f.readline().strip()
+             assert version == "minbpe v1"
+             # read the pattern
+             self.pattern = f.readline().strip()
+             # read the special tokens
+             num_special = int(f.readline().strip())
+             for _ in range(num_special):
+                 special, special_idx = f.readline().strip().split()
+                 special_tokens[special] = int(special_idx)
+             # read the merges
+             for line in f:
+                 idx1, idx2 = map(int, line.split())
+                 merges[(idx1, idx2)] = idx
+                 idx += 1
+         self.merges = merges
+         self.special_tokens = special_tokens
+         self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
+         self.vocab = self._build_vocab()
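For orientation, a minimal sketch of the `.model` layout that `save()` writes and `load()` reads back, using a concrete subclass; the toy corpus, vocab size, and output prefix below are arbitrary choices:

```python
# Sketch of the .model format produced by Tokenizer.save() above
# (toy corpus, vocab size, and the /tmp output prefix are illustrative).
from tokenizer.basic import BasicTokenizer

tok = BasicTokenizer()
tok.train("hello hello hello world", vocab_size=258)  # exactly 2 merges
tok.save("/tmp/toy")  # writes /tmp/toy.model and /tmp/toy.vocab

print(open("/tmp/toy.model").read())
# Layout:
#   minbpe v1     <- version line
#   <pattern>     <- split regex (an empty line for BasicTokenizer)
#   0             <- number of special tokens, then one "<token> <id>" line each
#   <id1> <id2>   <- one merge per line, implicitly assigned ids 256, 257, ...
```
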
tokenizer/basic.py ADDED
@@ -0,0 +1,52 @@
+ try:
+     from .base import Tokenizer, get_stats, merge, visualise_tokens
+ except ImportError: # allow running as a script from inside `tokenizer/`
+     from base import Tokenizer, get_stats, merge, visualise_tokens
+
+ class BasicTokenizer(Tokenizer):
+     def __init__(self):
+         super().__init__()
+
+     def train(self, text, vocab_size, verbose=False):
+         # 'ids' is a list of integers, each representing a byte from the UTF-8 encoded string
+         ids = list(text.encode("utf-8")) # list[int]
+         if verbose:
+             print(f"len(text) = {len(text)}")
+             print(f"len(tokens) = {len(ids)}")
+
+         num_merges = vocab_size - 256
+
+         merges = {}
+         vocab = {idx: bytes([idx]) for idx in range(256)}
+         for i in range(num_merges):
+             stats = {}
+             get_stats(ids, stats)
+             pair = max(stats, key=stats.get) # (int, int)
+             idx = 256 + i
+             ids = merge(ids, pair, idx)
+             merges[pair] = idx
+             vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+             if verbose and i % 100 == 0:
+                 print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
+
+         self.vocab = vocab
+         self.merges = merges
+
+     def decode(self, ids) -> str:
+         text = b"".join([self.vocab[id] for id in ids])
+         text = text.decode(encoding="utf-8", errors="replace")
+         return text
+
+     def encode(self, text, verbose=False) -> list[int]:
+         tokens = list(text.encode("utf-8"))
+         while len(tokens) >= 2:
+             if verbose:
+                 visualise_tokens([self.vocab[token] for token in tokens])
+             stats = {}
+             get_stats(tokens, stats)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             if pair not in self.merges:
+                 break
+             idx = self.merges[pair]
+             tokens = merge(tokens, pair, idx)
+         return tokens
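A quick round-trip sketch with `BasicTokenizer` (the training text and `vocab_size` are toy values); byte-level BPE encoding followed by decoding is lossless:

```python
# Round-trip sketch for BasicTokenizer: train, encode, and decode back.
from tokenizer.basic import BasicTokenizer

tok = BasicTokenizer()
tok.train("the quick brown fox jumps over the lazy dog " * 20, vocab_size=300)
ids = tok.encode("the quick fox")
print(ids)
assert tok.decode(ids) == "the quick fox"
```
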
tokenizer/gpt4.py ADDED
@@ -0,0 +1,105 @@
+ try:
+     from .regex_tokenizer import RegexTokenizer
+     from .base import visualise_tokens, get_stats, merge
+     from .patterns import GPT4_SPLIT_PATTERN
+ except ImportError: # allow running as a script from inside `tokenizer/`
+     from regex_tokenizer import RegexTokenizer
+     from base import visualise_tokens, get_stats, merge
+     from patterns import GPT4_SPLIT_PATTERN
+ from typing import Optional
+ import regex as re
+ import tiktoken
+ GPT4_SPECIAL_TOKENS = {
+     '<|endoftext|>': 100257,
+     '<|fim_prefix|>': 100258,
+     '<|fim_middle|>': 100259,
+     '<|fim_suffix|>': 100260,
+     '<|endofprompt|>': 100276
+ }
+
+ def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
+     parts = [bytes([b]) for b in token]
+     while True:
+         min_idx = None
+         min_rank = None
+         for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+             rank = mergeable_ranks.get(pair[0] + pair[1])
+             if rank is not None and (min_rank is None or rank < min_rank):
+                 min_idx = i
+                 min_rank = rank
+         if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+             break
+         assert min_idx is not None
+         parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+     return parts
+
+ def recover_merges(mergeable_ranks: dict[bytes, int]) -> dict[tuple[int, int], int]:
+     merges = {}
+     for token, rank in mergeable_ranks.items():
+         if len(token) == 1:
+             continue
+         pair = tuple(bpe(mergeable_ranks, token, max_rank=rank))
+         assert len(pair) == 2
+         ix0 = mergeable_ranks[pair[0]]
+         ix1 = mergeable_ranks[pair[1]]
+         merges[(ix0, ix1)] = rank
+     return merges
+
+ class GPT4Tokenizer(RegexTokenizer):
+     def __init__(self):
+         super().__init__(GPT4_SPLIT_PATTERN)
+         enc = tiktoken.get_encoding("cl100k_base")
+         mergeable_ranks = enc._mergeable_ranks
+         self.merges = recover_merges(mergeable_ranks)
+         vocab = {idx: bytes([idx]) for idx in range(256)}
+         for pair, idx in self.merges.items():
+             vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+         self.vocab = vocab
+         # for some reason, the tokens corresponding to individual bytes
+         # are permuted in a different order. This is completely non-sensical
+         # and probably historical, but therefore we have to deal with it here
+         self.byte_shuffle = {idx: mergeable_ranks[bytes([idx])] for idx in range(256)}
+         self.inverse_byte_shuffle = {v: k for k, v in self.byte_shuffle.items()}
+         self.register_special_tokens(GPT4_SPECIAL_TOKENS)
+
+     def train(self, text: str, vocab_size: int = 50_257, verbose: bool = False):
+         raise NotImplementedError
+
+     def _encode_chunk(self, chunk_bytes: bytes, verbose: bool = False) -> list[int]:
+         chunk_bytes = bytes(self.byte_shuffle[b] for b in chunk_bytes)
+         ids = list(chunk_bytes)
+         while len(ids) >= 2:
+             if verbose:
+                 decodable_ids = [] # each id can be multiple bytes i.e. any utf-8 character
+                 for id in ids:
+                     char = self.vocab[id] # id can be > 256 after merging
+                     decodable_ids.append(bytes(self.inverse_byte_shuffle[b] for b in char))
+                 visualise_tokens(decodable_ids)
+             stats = {}
+             get_stats(ids, stats)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             if pair not in self.merges:
+                 break
+             idx = self.merges[pair]
+             ids = merge(ids, pair, idx)
+         return ids
+
+     def decode(self, ids) -> str:
+         part_bytes = []
+         for id in ids:
+             if id in self.vocab:
+                 char = self.vocab[id] # id can be > 256 after merging
+                 part_bytes.extend(self.inverse_byte_shuffle[b] for b in char)
+             elif id in self.inverse_special_tokens:
+                 part_bytes.extend(self.inverse_special_tokens[id].encode("utf-8"))
+             else:
+                 raise ValueError(f"id={id} not in vocab or special_tokens")
+         text_bytes = bytes(part_bytes)
+         text = text_bytes.decode(encoding="utf-8", errors="replace")
+         return text
+
+     def save(self, path: str):
+         raise NotImplementedError("GPT4Tokenizer not meant to be saved")
+
+     def load(self, path: str):
+         raise NotImplementedError("GPT4Tokenizer not meant to be loaded")
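In the spirit of minbpe's tests, the recovered merges plus the byte shuffle are expected to reproduce tiktoken's `cl100k_base` ids on ordinary text (no special tokens in the input); a small parity sketch:

```python
# Parity sketch: GPT4Tokenizer should agree with tiktoken's cl100k_base on ordinary text.
# Requires the `tiktoken` package and a network/cache hit for the encoding file.
import tiktoken

from tokenizer.gpt4 import GPT4Tokenizer

text = "hello world!!!? lol123 😉"
enc = tiktoken.get_encoding("cl100k_base")
tok = GPT4Tokenizer()
print(enc.encode(text))
print(tok.encode(text))  # expected to match the line above
```
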
tokenizer/hf_tokenizer.py ADDED
@@ -0,0 +1,89 @@
+ from __future__ import annotations
+
+ import os
+ from typing import Any, Optional
+
+ from transformers import PreTrainedTokenizer
+
+ from tokenizer.regex_tokenizer import RegexTokenizer
+
+
+ class MGPT2Tokenizer(PreTrainedTokenizer):
+     """
+     Hugging Face-compatible (slow) tokenizer wrapper around `RegexTokenizer`.
+
+     This is intended for publishing alongside the model using `trust_remote_code=True`.
+     """
+
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(self, model_file: str, **kwargs: Any):
+         if not model_file.endswith(".model"):
+             raise ValueError(f"model_file must end with .model, got: {model_file}")
+
+         self._tok = RegexTokenizer()
+         self._tok.load(model_file)
+
+         # Bind common special tokens if present in the trained tokenizer.
+         special = self._tok.special_tokens
+         kwargs.setdefault("eos_token", "<|endoftext|>" if "<|endoftext|>" in special else None)
+         kwargs.setdefault("unk_token", None)
+         kwargs.setdefault("pad_token", None)
+         kwargs.setdefault("bos_token", None)
+
+         super().__init__(**kwargs)
+
+         self.model_file = model_file
+
+     @property
+     def vocab_size(self) -> int:
+         # vocab is sparse only if merges are incomplete; generally size is max_id+1
+         return max(self._tok.vocab.keys()) + 1
+
+     def get_vocab(self) -> dict[str, int]:
+         # Provide a stable token-string mapping for HF internals.
+         inv_special = self._tok.inverse_special_tokens
+         vocab: dict[str, int] = {}
+         for i in range(self.vocab_size):
+             if i in inv_special:
+                 vocab[inv_special[i]] = i
+             else:
+                 vocab[f"<|bytebpe_{i}|>"] = i
+         return vocab
+
+     def _tokenize(self, text: str, **kwargs: Any) -> list[str]:
+         ids = self._tok.encode(text, allowed_special="all")
+         inv_special = self._tok.inverse_special_tokens
+         out: list[str] = []
+         for i in ids:
+             out.append(inv_special.get(i, f"<|bytebpe_{i}|>"))
+         return out
+
+     def _convert_token_to_id(self, token: str) -> int:
+         if token in self._tok.special_tokens:
+             return self._tok.special_tokens[token]
+         if token.startswith("<|bytebpe_") and token.endswith("|>"):
+             inner = token[len("<|bytebpe_") : -len("|>")]
+             return int(inner)
+         raise KeyError(f"Unknown token string: {token!r}")
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self._tok.inverse_special_tokens.get(index, f"<|bytebpe_{index}|>")
+
+     def convert_tokens_to_string(self, tokens: list[str]) -> str:
+         ids = [self._convert_token_to_id(t) for t in tokens]
+         return self._tok.decode(ids)
+
+     def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None) -> list[int]:
+         if token_ids_1 is not None:
+             raise ValueError("This tokenizer does not support pair inputs.")
+         return token_ids_0
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+         os.makedirs(save_directory, exist_ok=True)
+         prefix = filename_prefix or "tokenizer"
+         out_prefix = os.path.join(save_directory, prefix)
+         # Save in the native `.model`/`.vocab` format (human + machine readable for this repo).
+         self._tok.save(out_prefix)
+         return (out_prefix + ".model",)
+
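For local experiments that bypass the Hub, the wrapper can also be constructed directly from a `.model` file; the path below is an assumption about the local checkout:

```python
# Local-usage sketch: instantiate MGPT2Tokenizer directly from a trained .model file.
# The model path is an assumption; point it at tokenizer.model or mgpt2_dev.model in your checkout.
from tokenizer.hf_tokenizer import MGPT2Tokenizer

tok = MGPT2Tokenizer(model_file="tokenizer.model")
enc = tok("hello world")
print(enc["input_ids"])
print(tok.decode(enc["input_ids"]))
```
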
tokenizer/patterns.py ADDED
@@ -0,0 +1,13 @@
+ """
+ Regex patterns used by tokenizers in this package.
+
+ Keep patterns centralized so experiments + training scripts + notebooks
+ stay in sync.
+ """
+
+ # Default GPT-4-ish split pattern (as used in `RegexTokenizer` and `GPT4Tokenizer`)
+ GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
+
+ # Indic-focused experimental pattern (Hindi Devanagari + Kannada ranges and punctuation)
+ INDIC_SPLIT_PATTERN = r"""(?i) 's|'t|'re|'ve|'m|'ll|'d| ?\b[\p{L}\u0900-\u0963|\u0966-\u097F]+\b| ?\b[\p{L}\u0C80-\u0C9E|\u0CA0-\u0CFF]+\b| ?[\p{N}]+| ?[.,!?;:'\"-]| ?[\u0964-\u0965]| ?[\u0C9E-\u0C9F]| ?[^\s\p{L}\p{N}\u0900-\u097F\u0C80-\u0CFF]+| \s+(?!\S)| \s+"""
+
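To make the split behaviour concrete, a small illustration of how `GPT4_SPLIT_PATTERN` pre-chunks text before byte-level BPE runs within each chunk (the printed result is indicative):

```python
# Illustration: the split pattern pre-chunks text; BPE merges then happen within chunks only.
# Note: the pattern uses possessive quantifiers, so the third-party `regex` module is required.
import regex as re

from tokenizer.patterns import GPT4_SPLIT_PATTERN

chunks = re.findall(GPT4_SPLIT_PATTERN, "Hello world, it's 2024!")
print(chunks)
# e.g. ['Hello', ' world', ',', ' it', "'s", ' ', '202', '4', '!']
```
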
tokenizer/regex_tokenizer.py ADDED
@@ -0,0 +1,246 @@
+ try:
+     from .base import get_stats, merge, visualise_tokens
+     from .basic import BasicTokenizer
+     from .patterns import GPT4_SPLIT_PATTERN
+ except ImportError: # allow running as a script from inside `tokenizer/`
+     from base import get_stats, merge, visualise_tokens
+     from basic import BasicTokenizer
+     from patterns import GPT4_SPLIT_PATTERN
+ from collections import Counter, defaultdict
+ import heapq
+ import regex as re
+ from tqdm import tqdm
+ import time
+
+ class RegexTokenizer(BasicTokenizer):
+     def __init__(self, regex: str = GPT4_SPLIT_PATTERN):
+         super().__init__()
+         self.pattern = regex
+         self.regex = re.compile(self.pattern)
+
+     def register_special_tokens(self, special_tokens: dict[str, int]):
+         self.special_tokens = special_tokens
+         self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
+
+     @staticmethod
+     def _merge_word(word: tuple[int, ...], pair: tuple[int, int], new_id: int) -> tuple[int, ...]:
+         """Merge all non-overlapping occurrences of `pair` in `word`."""
+         out: list[int] = []
+         i = 0
+         while i < len(word):
+             if i < len(word) - 1 and word[i] == pair[0] and word[i + 1] == pair[1]:
+                 out.append(new_id)
+                 i += 2
+             else:
+                 out.append(word[i])
+                 i += 1
+         return tuple(out)
+
+     @staticmethod
+     def _pair_occurrences(word: tuple[int, ...]) -> dict[tuple[int, int], int]:
+         """Return unweighted pair -> count for a single word/chunk."""
+         if len(word) < 2:
+             return {}
+         counts: dict[tuple[int, int], int] = {}
+         a = word[0]
+         for b in word[1:]:
+             p = (a, b)
+             counts[p] = counts.get(p, 0) + 1
+             a = b
+         return counts
+
+     def train(
+         self,
+         text: str,
+         vocab_size: int = 50_257,
+         verbose: bool = False,
+         *,
+         min_chunk_freq: int = 1,
+         max_chunks: int | None = None,
+     ):
+         assert vocab_size >= 256, "Vocab size must be at least 256"
+         num_merges = vocab_size - 256
+
+         # Count chunk frequencies without storing a giant list of chunks.
+         # Each unique chunk becomes a "word" in classic BPE training.
+         chunk_counts: Counter[bytes] = Counter()
+         for m in self.regex.finditer(text):
+             s = m.group(0)
+             if s:
+                 chunk_counts[s.encode("utf-8")] += 1
+
+         # Heuristic speed knobs: ignore rare chunks and/or cap unique chunk types.
+         # This massively reduces training state on web-scale corpora and keeps code simple.
+         if min_chunk_freq > 1:
+             chunk_counts = Counter({b: f for b, f in chunk_counts.items() if f >= min_chunk_freq})
+         if max_chunks is not None and len(chunk_counts) > max_chunks:
+             chunk_counts = Counter(dict(chunk_counts.most_common(max_chunks)))
+
+         # words: tuple(symbol_ids) -> frequency
+         words: dict[tuple[int, ...], int] = {}
+         for b, freq in chunk_counts.items():
+             words[tuple(b)] = freq
+
+         # Global pair stats and a reverse index pair -> set(words containing it)
+         pair_counts: dict[tuple[int, int], int] = defaultdict(int)
+         pair_to_words: dict[tuple[int, int], set[tuple[int, ...]]] = defaultdict(set)
+         for w, freq in words.items():
+             local = self._pair_occurrences(w)
+             for p, occ in local.items():
+                 pair_counts[p] += freq * occ
+                 pair_to_words[p].add(w)
+
+         # Max-heap for fast "most frequent pair" selection (lazy updates).
+         heap: list[tuple[int, tuple[int, int]]] = [(-c, p) for p, c in pair_counts.items()]
+         heapq.heapify(heap)
+
+         merges = {}
+         vocab = {idx: bytes([idx]) for idx in range(256)}
+
+         def bump_pair(p: tuple[int, int], delta: int) -> None:
+             if delta == 0:
+                 return
+             new = pair_counts.get(p, 0) + delta
+             if new <= 0:
+                 pair_counts.pop(p, None)
+                 pair_to_words.pop(p, None)
+                 return
+             pair_counts[p] = new
+             heapq.heappush(heap, (-new, p))
+
+         for i in tqdm(range(num_merges), desc="Training tokenizer"):
+             start_time = time.time()
+
+             # Pop stale heap entries until the top matches current counts.
+             while heap:
+                 negc, p = heap[0]
+                 c = pair_counts.get(p, 0)
+                 if c > 0 and -negc == c:
+                     break
+                 heapq.heappop(heap)
+             if not heap:
+                 break
+
+             pair = heap[0][1]
+             count = pair_counts.get(pair, 0)
+             if count <= 0:
+                 break
+
+             idx = 256 + i
+             merges[pair] = idx
+             vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+
+             affected = list(pair_to_words.get(pair, ()))
+             if not affected:
+                 pair_counts.pop(pair, None)
+                 pair_to_words.pop(pair, None)
+                 continue
+
+             # Apply merge to all words that contain the best pair.
+             for w in affected:
+                 freq = words.get(w)
+                 if not freq:
+                     continue
+
+                 new_w = self._merge_word(w, pair, idx)
+                 if new_w == w:
+                     continue
+
+                 # Remove old word contributions
+                 old_local = self._pair_occurrences(w)
+                 for p, occ in old_local.items():
+                     bump_pair(p, -freq * occ)
+                     s = pair_to_words.get(p)
+                     if s is not None:
+                         s.discard(w)
+                         if not s:
+                             pair_to_words.pop(p, None)
+
+                 # Update words dict (merge words that collapse to the same new tuple)
+                 del words[w]
+                 words[new_w] = words.get(new_w, 0) + freq
+
+                 # Add new word contributions
+                 new_local = self._pair_occurrences(new_w)
+                 for p, occ in new_local.items():
+                     bump_pair(p, freq * occ)
+                     pair_to_words[p].add(new_w)
+
+             # This pair should be fully merged away.
+             pair_counts.pop(pair, None)
+             pair_to_words.pop(pair, None)
+
+             if verbose and i % 10 == 0:
+                 time_taken = time.time() - start_time
+                 tqdm.write(
+                     f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) "
+                     f"had {count} occurrences (took {time_taken:.2f}s)"
+                 )
+
+         self.merges = merges
+         self.vocab = vocab
+
+     def decode(self, ids) -> str:
+         part_bytes = []
+         for id in ids:
+             if id in self.vocab:
+                 part_bytes.append(self.vocab[id]) # id can be > 256 after merging
+             elif id in getattr(self, "inverse_special_tokens", {}):
+                 part_bytes.append(self.inverse_special_tokens[id].encode("utf-8"))
+             else:
+                 raise ValueError(f"id={id} not in vocab or special_tokens")
+         text_bytes = b"".join(part_bytes)
+         text = text_bytes.decode(encoding="utf-8", errors="replace")
+         return text
+
+     def _encode_chunk(self, chunk_bytes: bytes, verbose=False) -> list[int]:
+         tokens = list(chunk_bytes)
+         while len(tokens) >= 2:
+             if verbose:
+                 visualise_tokens([self.vocab[token] for token in tokens]) # token can be > 256 after merging
+             stats = {}
+             get_stats(tokens, stats)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             if pair not in self.merges:
+                 break
+             idx = self.merges[pair]
+             tokens = merge(tokens, pair, idx)
+         return tokens
+
+     def encode_ordinary(self, text, verbose=False) -> list[int]:
+         chunk_texts = re.findall(self.regex, text)
+         ids_list = []
+         for i, chunk in enumerate(chunk_texts):
+             if verbose:
+                 print()
+                 print(f"encoding chunk {i+1}/{len(chunk_texts)}: {chunk}")
+             chunk_bytes = chunk.encode("utf-8") # raw bytes
+             ids = self._encode_chunk(chunk_bytes, verbose)
+             ids_list.extend(ids)
+         return ids_list
+
+     def encode(self, text, verbose=False, allowed_special="none") -> list[int]:
+         special = {}
+         if allowed_special == "all":
+             special = self.special_tokens
+         elif allowed_special == "none":
+             special = {}
+         elif allowed_special == "none_raise":
+             special = {}
+             assert all(token not in text for token in self.special_tokens), "Text contains special tokens that are not allowed"
+         elif isinstance(allowed_special, set):
+             special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
+         else:
+             raise ValueError(f"allowed_special={allowed_special} not understood.")
+         if not special:
+             return self.encode_ordinary(text, verbose)
+         special_pattern = "(" + "|".join(re.escape(token) for token in special) + ")"
+         parts = re.split(special_pattern, text)
+         ids = []
+         for part in parts:
+             if part in special:
+                 ids.append(special[part])
+             else:
+                 ids.extend(self.encode_ordinary(part, verbose))
+         return ids
+
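An end-to-end usage sketch for `RegexTokenizer`; the corpus, `vocab_size`, and the special-token id 300 are toy values chosen for illustration:

```python
# Usage sketch: train a small RegexTokenizer, register the repo's special token,
# and encode text that contains it.
from tokenizer.regex_tokenizer import RegexTokenizer

tok = RegexTokenizer()
tok.train("a tiny training corpus, repeated for effect. " * 200, vocab_size=300)
tok.register_special_tokens({"<|endoftext|>": 300})

ids = tok.encode("hello<|endoftext|>world", allowed_special="all")
print(ids)
print(tok.decode(ids))  # "hello<|endoftext|>world"
```
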
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "added_tokens_decoder": {
+     "7995": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": null,
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": null,
+   "tokenizer_class": "MGPT2Tokenizer",
+   "unk_token": null,
+   "auto_map": {
+     "AutoTokenizer": "tokenizer.hf_tokenizer.MGPT2Tokenizer"
+   }
+ }