import gradio as gr
import os
import json
import ast
import tiktoken
from collections import Counter, deque
from functools import lru_cache


class BPETokenizerSimple:
    """Minimal byte-pair-encoding tokenizer restored from a JSON artifact.

    Uses the GPT-2 convention of prefixing a token with "Ġ" to mark a
    preceding space.
    """

    def __init__(self):
        # vocab: token id -> token string
        # inverse_vocab: token string -> token id
        # bpe_merges: (left_id, right_id) pair -> merged token id
        self.vocab = {}
        self.inverse_vocab = {}
        self.bpe_merges = {}

    @classmethod
    def load_tokenizer(cls, file_path):
        """Build a tokenizer from the JSON file at *file_path*.

        Raises OSError if the file is missing/unreadable, and
        json.JSONDecodeError / KeyError / ValueError if the payload is
        malformed.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            tokenizer_data = json.load(f)
        tokenizer = cls()
        # JSON object keys are strings; vocab ids must be ints.
        tokenizer.vocab = {int(k): v for k, v in tokenizer_data["vocab"].items()}
        tokenizer.inverse_vocab = tokenizer_data["inverse_vocab"]
        # Merge keys are tuple reprs like "(12, 34)". literal_eval parses
        # them safely; eval() here would execute arbitrary code from the
        # artifact file (security fix).
        tokenizer.bpe_merges = {
            ast.literal_eval(k): v for k, v in tokenizer_data["bpe_merges"].items()
        }
        return tokenizer

    def encode(self, text):
        """Return the list of token ids for *text*."""
        tokens = []
        # Isolate newlines as their own pseudo-words so they never receive
        # a "Ġ" space marker.
        words = text.replace("\n", " \n ").split()
        for i, word in enumerate(words):
            if i > 0 and not word.startswith("\n"):
                tokens.append("Ġ" + word)  # "Ġ" marks the preceding space
            else:
                tokens.append(word)
        token_ids = []
        for token in tokens:
            if token in self.inverse_vocab:
                # Whole word is already a known token.
                token_ids.append(self.inverse_vocab[token])
            else:
                token_ids.extend(self.tokenize_with_bpe(token))
        return token_ids

    def tokenize_with_bpe(self, token):
        """Split an out-of-vocabulary *token* into ids via repeated BPE merges."""
        # Start from per-character ids.
        token_ids = [self.inverse_vocab.get(char) for char in token]
        if None in token_ids:
            # At least one character is unknown; fall back to id 0.
            return [0]
        # Greedy left-to-right merge passes until no adjacent pair merges.
        can_merge = True
        while can_merge and len(token_ids) > 1:
            can_merge = False
            new_tokens = []
            i = 0
            while i < len(token_ids) - 1:
                pair = (token_ids[i], token_ids[i + 1])
                if pair in self.bpe_merges:
                    new_tokens.append(self.bpe_merges[pair])
                    i += 2  # consumed both halves of the pair
                    can_merge = True
                else:
                    new_tokens.append(token_ids[i])
                    i += 1
            if i < len(token_ids):
                # Trailing element not part of any pair this pass.
                new_tokens.append(token_ids[i])
            token_ids = new_tokens
        return token_ids

    def decode(self, token_ids):
        """Reassemble text from *token_ids*; "Ġ" prefixes become spaces."""
        # "".join instead of repeated "+=": linear, not quadratic.
        parts = []
        for tid in token_ids:
            t = self.vocab.get(tid, "")
            parts.append(" " + t[1:] if t.startswith("Ġ") else t)
        return "".join(parts)


# Load the custom tokenizer artifact; on failure the UI degrades gracefully
# (custom columns stay empty) instead of crashing at import time.
TOKENIZER_FILE = "bpe_tokenizer_artifacts/bpe_tokenizer_simple.json"
try:
    custom_tokenizer = BPETokenizerSimple.load_tokenizer(TOKENIZER_FILE)
except (OSError, json.JSONDecodeError, KeyError, ValueError):
    # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit are
    # no longer swallowed; only expected load failures disable the tokenizer.
    custom_tokenizer = None

# Reference tokenizer for comparison.
tik_tokenizer = tiktoken.get_encoding("gpt2")


def compare_tokenizers(text):
    """Tokenize *text* with both tokenizers.

    Returns four strings: custom ids, custom tokens, tiktoken ids,
    tiktoken tokens (token lists joined with " | ").
    """
    if not text:
        return "", "", "", ""
    # Custom BPE (may be unavailable if the artifact failed to load).
    if custom_tokenizer:
        custom_ids = custom_tokenizer.encode(text)
        custom_tokens = [custom_tokenizer.vocab.get(i, "UNK") for i in custom_ids]
    else:
        custom_ids, custom_tokens = [], []
    # Tiktoken (GPT-2): decode each id individually to show subword pieces.
    tik_ids = tik_tokenizer.encode(text)
    tik_tokens = [tik_tokenizer.decode([i]) for i in tik_ids]
    return str(custom_ids), " | ".join(custom_tokens), str(tik_ids), " | ".join(tik_tokens)


with gr.Blocks() as demo:
    gr.Markdown("# BPE vs Tiktoken Comparison")
    text_input = gr.Textbox(label="Input Text", placeholder="Type something to compare...")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Your Custom BPE")
            custom_ids_out = gr.Textbox(label="Token IDs")
            custom_tokens_out = gr.Textbox(label="Subword Tokens")
        with gr.Column():
            gr.Markdown("### Tiktoken (GPT-2)")
            tik_ids_out = gr.Textbox(label="Token IDs")
            tik_tokens_out = gr.Textbox(label="Subword Tokens")
    # Re-run the comparison on every edit of the input box.
    text_input.change(
        compare_tokenizers,
        inputs=text_input,
        outputs=[custom_ids_out, custom_tokens_out, tik_ids_out, tik_tokens_out],
    )

demo.launch()