scripts
- README.md +19 -0
- __init__.py +3 -0
- base.py +300 -0
- helper.py +94 -0
- mana_tokenizer.py +70 -0
README.md
CHANGED
@@ -10,6 +10,25 @@ language:

The Mana Tokenizer is a custom-trained BPE tokenizer designed for Persian text. It is trained on a large combined Persian corpus and built with BPE using high character coverage to handle diverse Persian text.

## Quick Start

```python
from mana_tokenizer import ManaTokenizer

tokenizer = ManaTokenizer()
text = "سلام من یک متن تست برای تست این تست هستم."
print(tokenizer.encode(text))
print(tokenizer.decode(tokenizer.encode(text)))
```

You can also add special tokens:

```python
tokenizer.register_special_tokens({"</s>": 100269})
```

Batch encode:

```python
tokenizer.batch_encode(["یک متن طولانی"])
```

## Special Tokens

- **user Token:** `<|user|>`
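Beyond the Quick Start, the `Tokenizer` base class in `base.py` also provides `save()` and `load()`. A minimal round-trip sketch, assuming write access to the working directory (the `mana` file prefix is only an example):

```python
from mana_tokenizer import ManaTokenizer

tokenizer = ManaTokenizer()
tokenizer.save("mana")        # writes mana.model (for load()) and mana.vocab (human-readable)

restored = ManaTokenizer()
restored.load("mana.model")   # restores the split pattern, special tokens, and merges
print(restored.decode(restored.encode("سلام")))
```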
__init__.py
ADDED
@@ -0,0 +1,3 @@

```python
from .base import Tokenizer
from .mana_tokenizer import ManaTokenizer
from . import helper  # relative import so the module resolves inside the package
```
base.py
ADDED
@@ -0,0 +1,300 @@

```python
from collections import Counter
from functools import lru_cache
import requests
from datasets import IterableDataset, Dataset
from pyarrow import ChunkedArray
from joblib import Parallel, delayed, cpu_count
import time
import os
import regex as re
import csv
from . import helper


class Tokenizer:
    """Base class for Tokenizers"""

    def __init__(self, pattern=None, multiprocess=True, store_dict=False, stop_list_size=0, freq_cutoff=1):
        # default: vocab size of 256 (all bytes), no merges, no patterns
        MANA_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re|می|نمی|به|بی|در|باز|بر|فرا|هم|ور|وا|ف|ک|چ|ن|پ|ا|از|ای|ی|ها|ترین|تر|ات|ان|ت|ٔ|یی|ا)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
        self.merges = {}  # (int, int) -> int
        self.pattern = ""  # str
        self.special_tokens = {}  # str -> int, e.g. {'<|endoftext|>': 100257}
        self.vocab = self._build_vocab()  # int -> bytes
        self.pattern = MANA_SPLIT_PATTERN if pattern is None else pattern
        self.compiled_pattern = re.compile(self.pattern)
        self.multiprocess = multiprocess
        if multiprocess:
            self._cpus = cpu_count()
        else:
            self._cpus = 1
        self.store_dict = store_dict
        self.stop_list_size = stop_list_size
        self.stop_words = {}
        self.freq_cutoff = freq_cutoff

    def _id_dict_to_list(self, ids):
        if self.stop_list_size:
            # get twice as many to be sure to be able to get X chunks of length > 1
            top2X = ids.most_common(2 * self.stop_list_size)
            index = len(self.vocab)
            stop_index = index + self.stop_list_size
            stop_words = {}
            for key, val in top2X:
                if len(key) > 1:  # and re.match(r'^ [A-Za-z\'’`]+$[A-Za-z]*', key):
                    stop_words[key] = index
                    self.vocab[index] = key.encode('utf-8')
                    index += 1
                    if index == stop_index:
                        break
            self.stop_words = stop_words
            if self.freq_cutoff > 1:
                return [([*key.encode('utf-8')], val) for key, val in ids.items()
                        if (val >= self.freq_cutoff and key not in self.stop_words)]
            else:
                return [([*key.encode('utf-8')], val) for key, val in ids.items()
                        if key not in self.stop_words]
        else:  # self.stop_list_size == 0
            if self.freq_cutoff > 1:
                return [([*key.encode('utf-8')], val) for key, val in ids.items()
                        if val >= self.freq_cutoff]
            else:
                return [([*key.encode('utf-8')], val) for key, val in ids.items()]

    def _import_data(self, data):
        # determine if `data` is a text as a string, a path to a file, a url to
        # a text document, a dictionary of datasets kwargs, or a list of any of
        # the above. Return a list of 2-tuples of bytes objects and their counts.
        ids = Counter()
        if not isinstance(data, (list, tuple)):
            data = (data,)
        for item in data:
            # convert to ChunkedArray, dict, or str of text to parse
            if isinstance(item, Dataset):
                item = item.data['text']
            elif isinstance(item, str) and item.endswith('.csv'):  # csv file from previous data load
                with open(item, 'r') as f:
                    reader = csv.reader(f)
                    next(reader)
                    item = {k: int(v) for k, v in reader}
            elif isinstance(item, str):
                if item.startswith('https://') or item.startswith('http://'):
                    item = requests.get(item).text  # if it's a url, assume it's to a text file
                elif os.path.isfile(item) and item.endswith('.txt'):
                    with open(item, 'r', encoding='utf-8') as f:
                        item = f.read()
            # process data
            if isinstance(item, dict):
                last_item = item.popitem()
                if last_item[1] != 0:
                    print('Warning: the csv file or dictionary passed does not seem to have been made by this tokenizer.')
                    item[last_item[0]] = last_item[1]
                elif last_item[0] != self.pattern:
                    print('Warning: the dictionary or csv file passed did not use the same split pattern.')
                ids.update(item)
            elif isinstance(item, str):  # assume the string is the text itself
                ids.update(re.findall(self.compiled_pattern, item))
            elif isinstance(item, ChunkedArray):
                batch_size = len(item) // (self._cpus * 2) or 1
                batches = [item[i:i + batch_size] for i in range(0, len(item), batch_size)]
                print(f'Processing {len(batches)} batches of size {batch_size}')
                results = Parallel(n_jobs=self._cpus)(delayed(helper._process_string_scalar)(batch, self.compiled_pattern) for batch in batches)
                for result in results:  # aggregate results into one Counter
                    ids.update(result)
            elif isinstance(item, IterableDataset):
                print('Serially processing IterableDataset...')
                for _dict in item:
                    ids.update(re.findall(self.compiled_pattern, _dict['text']))

        if self.store_dict:  # store dict compression of dataset to a csv file if requested
            ids[self.pattern] = 0  # store the pattern used to split the text as the last key
            formatted_time = time.strftime('%Y-%m-%d-%H_%M', time.localtime())
            filename = f'{formatted_time}-dataset-dict.csv'
            try:
                with open(filename, 'w', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow(['text_chunk', 'count'])
                    for key, value in ids.items():
                        writer.writerow([key, value])
                print(f"Stored dictionary of {len(ids)} keys to {filename}")
            except Exception:
                print('Failed to store dictionary of dataset.')
            del ids[self.pattern]  # remove the pattern key from the ids dict

        ids = self._id_dict_to_list(ids)
        return ids

    def train(self, text, vocab_size, verbose=False):
        # Tokenizer can train a vocabulary of size vocab_size from text
        raise NotImplementedError

    def _build_vocab(self):
        # vocab is simply and deterministically derived from merges
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        for special, idx in self.special_tokens.items():
            vocab[idx] = special.encode("utf-8")
        return vocab

    def register_special_tokens(self, special_tokens):
        # special_tokens is a dictionary of str -> int
        # example: {"<|endoftext|>": 100257}
        self.special_tokens = special_tokens
        self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}

    def save(self, file_prefix):
        """
        Saves two files: file_prefix.vocab and file_prefix.model
        This is inspired by (but not equivalent to!) sentencepiece's model saving:
        - model file is the critical one, intended for load() later
        - vocab file is just a pretty printed version for human inspection only
        """
        # write the model: to be used in load() later
        model_file = file_prefix + ".model"
        with open(model_file, 'w', encoding='utf-8') as f:
            # write the version, pattern and merges, that's all that's needed
            f.write("mana v1\n")
            f.write(f"{self.pattern}\n")
            # write the special tokens, first the number of them, then each one
            f.write(f"{len(self.special_tokens)}\n")
            for special, idx in self.special_tokens.items():
                f.write(f"{special} {idx}\n")
            # the merges dict
            for key in self.merges:
                if isinstance(key, tuple):
                    f.write(f"{key[0]} {key[1]}\n")
                else:
                    f.write(f"{key}\n")

        # write the vocab: for the human to look at
        vocab_file = file_prefix + ".vocab"
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, token in self.vocab.items():
                s = helper.render_token(token)
                # find the children of this token, if any
                if idx in inverted_merges:
                    idx0, idx1 = inverted_merges[idx]
                    s0 = helper.render_token(self.vocab[idx0])
                    s1 = helper.render_token(self.vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                else:
                    f.write(f"[{s}] {idx}\n")

    def load(self, model_file):
        """Inverse of save() but only for the model file"""
        assert model_file.endswith(".model")
        # read the model file
        merges = {}
        special_tokens = {}
        idx = 256
        with open(model_file, 'r', encoding="utf-8") as f:
            # read the version
            version = f.readline().strip()
            assert version == "mana v1"
            # read the pattern
            self.pattern = f.readline().strip()
            # read the special tokens
            num_special = int(f.readline().strip())
            for _ in range(num_special):
                special, special_idx = f.readline().strip().split()
                special_tokens[special] = int(special_idx)
            # read the merges
            for line in f:
                idx1, idx2 = map(int, line.split())
                merges[(idx1, idx2)] = idx
                idx += 1
        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()

    def decode(self, ids):
        # given ids (list of integers), return Python string
        part_bytes = [self.vocab[idx] if idx in self.vocab
                      else self.inverse_special_tokens[idx].encode("utf-8")
                      for idx in ids]  # raises KeyError if any idx is not a valid token
        text_bytes = b"".join(part_bytes)
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    @lru_cache(maxsize=131072)
    def _encode_chunk(self, chunk):
        if chunk in self.stop_words:  # TODO: revisit this if statement
            return [self.stop_words[chunk]]
        # return the token chunk as a list of ints, similar to a bytes object
        chunk = [*chunk.encode("utf-8")]
        len_chunk = len(chunk)
        while len_chunk >= 2:
            # find the pair with the lowest merge index
            low = 987654321
            for i in range(len_chunk - 1):
                current_pair = (chunk[i], chunk[i+1])
                new_val = self.merges.get(current_pair, 987654321)
                if new_val < low:
                    pair = current_pair
                    low = new_val
            if low == 987654321:  # no merges were found
                break  # nothing else can be merged
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            len_chunk = helper.merge(chunk, pair, idx, len_chunk)
        return chunk  # list of ints

    def encode_ordinary(self, text):
        """Encoding that ignores any special tokens."""
        ids = []
        for chunk in re.findall(self.compiled_pattern, text):
            ids.extend(self._encode_chunk(chunk))
        return ids

    def encode(self, text, allowed_special="none_raise"):
        """
        Unlike encode_ordinary, this function handles special tokens.
        allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
        if none_raise, then an error is raised if any special token is encountered in text
        this is the default tiktoken behavior right now as well
        any other behavior is either annoying, or a major footgun
        """
        # decode the user desire w.r.t. handling of special tokens
        special = None
        if allowed_special == "all":
            special = self.special_tokens
        elif allowed_special == "none":
            special = {}
        elif allowed_special == "none_raise":
            special = {}
            assert all(token not in text for token in self.special_tokens)
        elif isinstance(allowed_special, set):
            special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
        else:
            raise ValueError(f"allowed_special={allowed_special} not understood")
        if not special:  # shortcut: if no special tokens, just use the ordinary encoding
            return self.encode_ordinary(text)
        # split on special tokens. Note that surrounding the pattern with ()
        # makes it into a capturing group, so the special tokens will be included
        special_pattern = f"({'|'.join([re.escape(k) for k in special])})"
        special_chunks = re.split(special_pattern, text)
        # now all the special characters are separated from the rest of the text
        # all chunks of text are encoded separately, then results are joined
        ids = []
        for part in special_chunks:
            special_token = special.get(part)
            if special_token is None:  # this is an ordinary sequence, encode it normally
                ids.extend(self.encode_ordinary(part))
            else:  # this is a special token, encode it separately as a special case
                ids.append(special_token)
        return ids

    def batch_encode(self, texts, allowed_special="none_raise"):
        """
        Encode a list of texts in batch mode.
        Each text will be encoded according to the handling of special tokens specified in allowed_special.

        Parameters:
            texts (list of str): List of texts to encode.
            allowed_special (str|set): Special token handling mode.

        Returns:
            list of list of int: A list where each element is the encoded form of a text in `texts`.
        """
        return [self.encode(text, allowed_special=allowed_special) for text in texts]
```
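As a usage note for the special-token handling described in the `encode` docstring above, a small sketch (the token ids follow the `MANA_SPECIAL_TOKENS` table in `mana_tokenizer.py`; the text is illustrative):

```python
tok = Tokenizer()
tok.register_special_tokens({'<|user|>': 100258, '<|end|>': 100257})
text = "<|user|>سلام<|end|>"

tok.encode(text, allowed_special="all")    # '<|user|>' and '<|end|>' map to 100258 and 100257
tok.encode(text, allowed_special="none")   # the special markers are encoded as plain text
tok.encode(text)                           # default "none_raise": raises AssertionError for this text
```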
helper.py
ADDED
@@ -0,0 +1,94 @@

```python
from collections import Counter, defaultdict
import unicodedata
import regex as re


def get_stats(ids):
    """
    Given `ids`, a list of 2-tuples of (iterable of ints, int count),
    returns a defaultdict with the counts of occurrences of all consecutive
    pairs of integers within each chunk, multiplied by the count associated
    with that chunk. Pairs are not counted between the last element of one
    chunk and the first element of the next chunk.

    Example:
        get_stats([([97, 98, 99], 2), ([98, 99, 100], 1), ([101, 101, 101], 1)])
        -> defaultdict(<class 'int'>, {(97, 98): 2, (98, 99): 3, (99, 100): 1, (101, 101): 2})
    """
    counts = defaultdict(int)
    for chunk, num in ids:
        last_index = len(chunk) - 1
        i = 0
        while i < last_index:
            j = i + 1
            counts[(chunk[i], chunk[j])] += num
            i = j
    return counts


def merge_batch_get_stats(ids, pairs):
    counts = defaultdict(int)
    for chunk, num in ids:
        last_index = len(chunk) - 1
        i = 0
        while i < last_index:
            j = i + 1
            token = pairs.get((chunk[i], chunk[j]))
            if token is not None:
                chunk[i] = token
                del chunk[j]
                last_index -= 1
                if i:
                    counts[(chunk[i-1], chunk[i])] += num
            i = j
        if i and i == last_index:
            counts[(chunk[-2], chunk[i])] += num
    return counts


def merge(ids, pair, idx, len_ids):
    """
    In the list of integers (ids), replace all consecutive occurrences
    of pair with the new integer token idx
    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
    """
    i = 0
    while i + 1 < len_ids:
        j = i + 1
        if ids[i] == pair[0] and ids[j] == pair[1]:
            ids[i] = idx
            del ids[j]
            len_ids -= 1
        i = j
    return len_ids


def replace_control_characters(s: str) -> str:
    # we don't want to print control characters
    # which distort the output (e.g. \n or much worse)
    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
    # http://www.unicode.org/reports/tr44/#GC_Values_Table
    chars = []
    for ch in s:
        if unicodedata.category(ch)[0] != "C":
            chars.append(ch)  # this character is ok
        else:
            chars.append(f"\\u{ord(ch):04x}")  # escape
    return "".join(chars)


def render_token(t: bytes) -> str:
    # pretty print a token, escaping control characters
    s = t.decode('utf-8', errors='replace')
    s = replace_control_characters(s)
    return s


def _process_dicts(batch, compiled_pattern):  # for a raw datasets.Dataset
    counter = Counter()
    for item in batch:
        counter.update(re.findall(compiled_pattern, item))
    return counter


def _process_string_scalar(batch, compiled_pattern):
    counter = Counter()
    for item in batch:
        counter.update(re.findall(compiled_pattern, item.as_py()))
    return counter
```
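To make the helper contract concrete, here is a small illustrative walk-through with made-up byte values; the shape of `ids` matches what `Tokenizer._id_dict_to_list` produces:

```python
# chunks as mutable lists of byte values, paired with their occurrence counts
ids = [([104, 105, 104, 105], 3), ([104, 105], 1)]

stats = get_stats(ids)             # {(104, 105): 7, (105, 104): 3}
best = max(stats, key=stats.get)   # (104, 105) is the most frequent pair
new_len = merge(ids[0][0], best, 256, len(ids[0][0]))
# ids[0][0] is now [256, 256] and new_len == 2
```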
mana_tokenizer.py
ADDED
@@ -0,0 +1,70 @@

```python
from .base import Tokenizer
from .helper import get_stats, merge_batch_get_stats
from heapq import nlargest
import time

MANA_SPECIAL_TOKENS = {
    '<|end|>': 100257,
    '<|user|>': 100258,
    '<|assistant|>': 100259,
    '<|system|>': 100260
}


class ManaTokenizer(Tokenizer):
    def __init__(self, pattern=None, multiprocess=True, store_dict=False, stop_list_size=0, freq_cutoff=1):
        """
        - pattern: optional string to override the default Mana split pattern
        - special_tokens: str -> int dictionary of special tokens
          example: {'<|endoftext|>': 100257}
        """
        super().__init__(pattern, multiprocess, store_dict, stop_list_size, freq_cutoff)
        # register after the base __init__ so the defaults are not reset by its empty special_tokens dict
        self.register_special_tokens(MANA_SPECIAL_TOKENS)

    def train(self, data, vocab_size, cap_divisor=2, max_batch_size=0, verbose=False):
        t0 = time.time()
        ids = self._import_data(data)  # [(bytes, int)] -> text chunks and their counts
        t1 = time.time()
        print(f'Time spent loading data: {t1-t0:.2f}')

        merges = self.merges  # {(int, int): int} -> token pair to new token
        vocab = self.vocab  # {int: bytes} -> token to its bytes representation
        batch_count = 0
        curr_vocab_size = len(vocab)
        num_merges = vocab_size - curr_vocab_size
        merges_remaining = num_merges
        if max_batch_size < 1:
            max_batch_size = num_merges
        stats = get_stats(ids)  # stats are later updated by merge_batch_get_stats
        start_time = time.time()
        while merges_remaining > 0:
            seen_first = set()  # tokens seen in the first position in pairs
            seen_last = set()  # tokens seen in the last position in pairs
            pairs_to_merge = {}
            num_pairs_to_search = min(merges_remaining // cap_divisor, len(vocab), max_batch_size) or 1
            top_pairs = nlargest(num_pairs_to_search, stats, key=stats.get)
            for first, last in top_pairs:  # pairs are (first, last) tuples
                if first in seen_last or last in seen_first:  # unsafe merge
                    seen_first.add(first)
                    seen_last.add(last)
                    continue  # skip this pair but keep looking for safe merges in top_pairs
                seen_first.add(first)
                seen_last.add(last)
                pairs_to_merge[(first, last)] = curr_vocab_size
                vocab[curr_vocab_size] = vocab[first] + vocab[last]
                curr_vocab_size += 1
            merges_remaining -= len(pairs_to_merge)
            merges.update(pairs_to_merge)  # save the merges
            batch_count += 1
            if merges_remaining:  # no need to merge last batch
                stats = merge_batch_get_stats(ids, pairs_to_merge)  # replace pairs_to_merge keys in ids with their values
            if verbose:
                t2 = time.time()
                time_taken = t2 - start_time
                avg_time_per_batch = time_taken / batch_count
                estimated_remaining_time = avg_time_per_batch * (num_merges - merges_remaining)
                estimated_end_time = time.strftime("%H:%M:%S", time.localtime(time.time() + estimated_remaining_time))
                print(f"Batch {batch_count} merged {len(pairs_to_merge)} pairs in {t2-t1:.2f} sec. "
                      f"Merges remaining: {merges_remaining}. Estimated end time: {estimated_end_time}")
                t1 = t2

        self.merges = merges  # used in encode()
        self.vocab = vocab  # used in decode()
```
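A minimal training sketch based on the `train` signature above; the corpus string, vocab size, and file prefix are illustrative only:

```python
from mana_tokenizer import ManaTokenizer

tok = ManaTokenizer()
corpus = "سلام دنیا. این یک متن نمونه برای آموزش است. " * 200   # any str, .txt path, URL, or datasets.Dataset also works
tok.train(corpus, vocab_size=300, verbose=True)   # learns merges until the vocab reaches 300 entries
tok.save("mana-demo")                             # persists the result for a later load()
```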