Spaces:
Build error
Build error
Added app file
Browse files
app.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from bpe import Tokenizer
|
| 3 |
+
import random
|
| 4 |
+
import colorsys
|
| 5 |
+
|
| 6 |
+
# Set page config
st.set_page_config(
    page_title="English BPE Tokenizer Visualizer",  # browser tab title
    layout="wide"  # use the full browser width for the two-column layout
)
|
| 11 |
+
|
| 12 |
+
# Load the trained tokenizer
@st.cache_resource
def load_tokenizer():
    """Build a Tokenizer and load the pre-trained model (cached by Streamlit)."""
    tok = Tokenizer()
    # NOTE(review): the doubled ".model.model" suffix looks odd, but save()
    # appends ".model" to the given prefix, so a prefix that already ends in
    # ".model" produces exactly this filename — confirm against the repo.
    tok.load("models/EnglishBPE_6999.model.model")
    return tok
|
| 18 |
+
|
| 19 |
+
# Load example texts
@st.cache_data
def load_examples():
    """Load the two bundled example texts from disk.

    Returns:
        (example1, example2): stripped text of data/testdata1.txt and
        data/testdata2.txt. If either file cannot be read, a Streamlit
        error is shown and short built-in sample sentences are returned
        instead, so the app keeps working.
    """
    # Fallback examples in case files can't be loaded. Binding these before
    # the try-block fixes the original UnboundLocalError: on any read failure
    # the function reached `return example1, example2` with nothing bound.
    example1 = "The quick brown fox jumps over the lazy dog."
    example2 = "Byte Pair Encoding merges frequent character pairs into new tokens."
    try:
        with open("data/testdata1.txt", "r", encoding="utf-8") as f:
            example1 = f.read().strip()
        with open("data/testdata2.txt", "r", encoding="utf-8") as f:
            example2 = f.read().strip()
    except Exception as e:
        st.error(f"Error loading example texts: {str(e)}")
    return example1, example2
|
| 32 |
+
|
| 33 |
+
def generate_distinct_colors(n):
    """Return n visually distinct hex color strings (e.g. "#a1b2c3").

    Hues are spread evenly around the color wheel; saturation and value
    get a small random jitter so adjacent tokens are easier to tell apart.
    """
    palette = []
    for step in range(n):
        hue = step / n
        sat = 0.7 + random.random() * 0.3
        val = 0.8 + random.random() * 0.2
        red, green, blue = colorsys.hsv_to_rgb(hue, sat, val)
        palette.append(
            "#{:02x}{:02x}{:02x}".format(
                int(red * 255), int(green * 255), int(blue * 255)
            )
        )
    return palette
|
| 45 |
+
|
| 46 |
+
def process_text(text, tokenizer):
    """Tokenize *text* and build a color-coded HTML visualization.

    Args:
        text: the input string to tokenize.
        tokenizer: object exposing encode(str) -> list[int] and
            decode(list[int]) -> str.

    Returns:
        (html_string, tokens) on success, or (error_html, None) on failure.
    """
    import html  # local import keeps the file's top-level imports unchanged

    try:
        # Get tokens
        tokens = tokenizer.encode(text)

        # One stable color per distinct token id
        unique_tokens = list(set(tokens))
        colors = generate_distinct_colors(len(unique_tokens))
        token_colors = dict(zip(unique_tokens, colors))

        # Create HTML visualization
        html_parts = []
        decoded_tokens = [tokenizer.decode([token]) for token in tokens]

        for token, token_text in zip(tokens, decoded_tokens):
            color = token_colors[token]
            # Escape the decoded token text: characters like <, > and & in
            # user input would otherwise break or inject markup, since this
            # string is rendered with unsafe_allow_html=True.
            safe_text = html.escape(token_text)
            html_parts.append(
                f'<span style="background-color: {color}; padding: 0 2px; '
                f'border-radius: 3px;" title="Token ID: {token}">{safe_text}</span>'
            )

        return ''.join(html_parts), tokens
    except Exception as e:
        return f"<span style='color: red'>Error processing text: {str(e)}</span>", None
|
| 67 |
+
|
| 68 |
+
def main():
    """Streamlit entry point: collect input, tokenize, and render results."""
    # Load tokenizer and examples
    tokenizer = load_tokenizer()
    example1, example2 = load_examples()

    # Title and description
    st.title("English BPE Tokenizer Visualizer")
    st.markdown("Enter text to see how it gets tokenized, with color-coded visualization")

    # Example selector
    example_option = st.selectbox(
        "Choose an example or enter your own text below:",
        ["Custom Input", "Example 1", "Example 2"]
    )

    # Text input, pre-filled when one of the examples is selected
    prefills = {"Example 1": example1, "Example 2": example2}
    if example_option in prefills:
        text = st.text_area("Enter Text", value=prefills[example_option], height=100)
    else:
        text = st.text_area("Enter Text", height=100)

    # Process on button press, or automatically once text is present
    if st.button("Process Text") or text:
        if not text.strip():
            st.warning("Please enter some text to process.")
            return

        # Two columns: visualization on the left, token details on the right
        col1, col2 = st.columns([2, 1])

        visualization, tokens = process_text(text, tokenizer)

        with col1:
            st.subheader("Visualization")
            st.markdown(visualization, unsafe_allow_html=True)

        with col2:
            if tokens is not None:
                st.subheader("Token Information")
                st.write(f"Token count: {len(tokens)}")
                st.write("Tokens:", tokens)
|
| 111 |
+
|
| 112 |
+
# Run the app when executed directly (e.g. via `streamlit run app.py`).
if __name__ == "__main__":
    main()
|
bpe.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from utils import get_stats, merge, render_token
|
| 2 |
+
import regex as re
|
| 3 |
+
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
|
| 4 |
+
class Tokenizer:
    """Byte-pair-encoding tokenizer in the style of minbpe.

    Ids 0..255 map to single raw bytes; each learned merge mints the next
    sequential id starting at 256. Supports train()/encode()/decode() plus
    save()/load() of a simple "minbpe v1" text model format.
    """

    def __init__(self):
        # default: vocab size of 256 (all bytes), no merges, no special tokens
        self.merges = {}  # (int, int) -> int
        self.pattern = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"  # str
        self.compiled_pattern = re.compile(self.pattern)
        self.special_tokens = {}  # str -> int, e.g. {'<|endoftext|>': 100257}
        self.vocab = self._build_vocab()  # int -> bytes
        self.compression_ratio = 0

    def _build_vocab(self):
        """Derive the id -> bytes vocab deterministically from merges."""
        # relies on dict insertion order: each merge only references ids
        # created before it, so its parents are already in vocab
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        for special, idx in self.special_tokens.items():
            vocab[idx] = special.encode("utf-8")
        return vocab

    def train(self, text, vocab_size, verbose=False):
        """Learn up to vocab_size - 256 merges from `text`.

        Populates self.merges / self.vocab and records the achieved
        compression ratio (original byte count / final token count).
        """
        assert vocab_size >= 256
        # pre-split with the GPT-style regex, then rejoin chunks with spaces
        text = ' '.join(self.compiled_pattern.findall(text))
        num_merges = vocab_size - 256

        # input text preprocessing
        text_bytes = text.encode("utf-8")  # raw bytes
        ids = list(text_bytes)  # list of integers in range 0..255
        original_ids = ids.copy()

        # iteratively merge the most common pairs to create new tokens
        merges = {}  # (int, int) -> int
        vocab = {idx: bytes([idx]) for idx in range(256)}  # int -> bytes
        for i in range(num_merges):
            # count up the number of times every consecutive pair appears
            stats = get_stats(ids)
            if not stats:
                # fewer than two ids remain, so no pair exists to merge;
                # guards the max() below against crashing on an empty dict
                break
            # find the pair with the highest count
            pair = max(stats, key=stats.get)
            # mint a new token: assign it the next available id
            idx = 256 + i
            # replace all occurrences of pair in ids with idx
            ids = merge(ids, pair, idx)
            # save the merge
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
            # prints
            if verbose:
                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")

        # save class variables
        self.merges = merges  # used in encode()
        self.vocab = vocab  # used in decode()
        # empty input would divide by zero; report 0 in that degenerate case
        self.compression_ratio = round(len(original_ids) / len(ids), 1) if ids else 0

    def encode(self, text):
        """Return the list of token ids for `text` using the learned merges."""
        text_bytes = text.encode("utf-8")  # raw bytes
        ids = list(text_bytes)  # list of integers in range 0..255
        while len(ids) >= 2:
            # find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            # subtle: if there are no more merges available, the key will
            # result in an inf for every single pair, and the min will be
            # just the first pair in the list, arbitrarily
            # we can detect this terminating case by a membership check
            if pair not in self.merges:
                break  # nothing else can be merged anymore
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids

    def decode(self, ids):
        """Given ids (a list of integers), return the decoded Python string."""
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        # partial utf-8 sequences are replaced with � rather than raising
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def save(self, file_prefix):
        """
        Saves two files: file_prefix.vocab and file_prefix.model
        This is inspired (but not equivalent to!) sentencepiece's model saving:
        - model file is the critical one, intended for load()
        - vocab file is just a pretty printed version for human inspection only
        """
        # write the model: to be used in load() later
        model_file = file_prefix + ".model"
        with open(model_file, 'w') as f:
            # write the version, pattern and compression ratio
            f.write("minbpe v1\n")
            f.write(f"{self.pattern}\n")
            f.write(f"{self.compression_ratio}\n")  # Save compression ratio as string
            # write the special tokens, first the number of them, then each one
            f.write(f"{len(self.special_tokens)}\n")
            for special, idx in self.special_tokens.items():
                f.write(f"{special} {idx}\n")
            # the merges dict; ids are implicit (256, 257, ... in file order)
            for idx1, idx2 in self.merges:
                f.write(f"{idx1} {idx2}\n")
        # write the vocab: for the human to look at
        vocab_file = file_prefix + ".vocab"
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, token in self.vocab.items():
                # note: many tokens may be partial utf-8 sequences
                # and cannot be decoded into valid strings. Here we're using
                # errors='replace' to replace them with the replacement char �.
                # this also means that we couldn't possibly use .vocab in load()
                # because decoding in this way is a lossy operation!
                s = render_token(token)
                # find the children of this token, if any
                if idx in inverted_merges:
                    # if this token has children, render it nicely as a merge
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(self.vocab[idx0])
                    s1 = render_token(self.vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                else:
                    # otherwise this is leaf token, just print it
                    # (this should just be the first 256 tokens, the bytes)
                    f.write(f"[{s}] {idx}\n")

    def load(self, model_file):
        """Inverse of save() but only for the model file"""
        assert model_file.endswith(".model")
        merges = {}
        special_tokens = {}
        idx = 256

        with open(model_file, 'r', encoding="utf-8") as f:
            # read the version
            version = f.readline().strip()
            assert version == "minbpe v1"

            # read the pattern
            self.pattern = f.readline().strip()
            self.compiled_pattern = re.compile(self.pattern)

            # read the compression ratio safely
            compression_ratio_line = f.readline().strip()
            try:
                self.compression_ratio = float(compression_ratio_line)
            except ValueError:
                raise ValueError(f"Expected a float for compression ratio, got: {compression_ratio_line}")

            # read the special tokens count safely
            num_special_line = f.readline().strip()
            if num_special_line.isdigit():  # Ensure it's a valid integer
                num_special = int(num_special_line)
            else:
                raise ValueError(f"Expected an integer for number of special tokens, got: {num_special_line}")

            # Read special tokens if any
            for _ in range(num_special):
                line = f.readline().strip()
                if line:
                    special, idx_str = line.rsplit(" ", 1)
                    special_tokens[special] = int(idx_str)

            # Read merges: new token ids are assigned sequentially from 256
            for line in f:
                parts = line.split()
                if len(parts) == 2:
                    idx1, idx2 = map(int, parts)
                    merges[(idx1, idx2)] = idx
                    idx += 1

        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()
|
utils.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unicodedata
|
| 2 |
+
def get_stats(ids, counts=None):
    """
    Count consecutive pairs in a list of integers.

    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    An existing counts dict may be passed in to be updated in place.
    """
    if counts is None:
        counts = {}
    # zip the list against itself shifted by one to walk consecutive pairs
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        counts[key] = counts.get(key, 0) + 1
    return counts
|
| 12 |
+
|
| 13 |
+
def merge(ids, pair, idx):
    """
    Replace every consecutive occurrence of `pair` in `ids` with the
    single token `idx`, scanning left to right without overlap.

    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
    """
    first, second = pair
    out = []
    pos = 0
    length = len(ids)
    while pos < length:
        # a match needs one more element after pos, hence the bound check
        if pos + 1 < length and ids[pos] == first and ids[pos + 1] == second:
            out.append(idx)
            pos += 2  # consume both halves of the pair
        else:
            out.append(ids[pos])
            pos += 1
    return out
|
| 30 |
+
|
| 31 |
+
# first two helper functions...
|
| 32 |
+
def replace_control_characters(s: str) -> str:
    # Control characters (Unicode category "C", e.g. \n) distort printed
    # output, so render them as \uXXXX escapes instead of emitting them raw.
    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
    # http://www.unicode.org/reports/tr44/#GC_Values_Table
    out = []
    for ch in s:
        if unicodedata.category(ch).startswith("C"):
            out.append(f"\\u{ord(ch):04x}")  # escape the control character
        else:
            out.append(ch)  # printable character, keep as-is
    return "".join(out)
|
| 44 |
+
|
| 45 |
+
def render_token(t: bytes) -> str:
    """Pretty-print a token's bytes, escaping control characters."""
    # lossy decode: invalid utf-8 becomes the replacement character
    decoded = t.decode('utf-8', errors='replace')
    return replace_control_characters(decoded)
|
| 50 |
+
|
| 51 |
+
|