# csvis's picture
# Upload app.py with huggingface_hub
# e997627 verified
import gradio as gr
import os
import json
import tiktoken
from collections import Counter, deque
from functools import lru_cache
# Include the BPETokenizerSimple class definition
class BPETokenizerSimple:
    """Minimal byte-pair-encoding tokenizer restored from a JSON artifact.

    Attributes:
        vocab: token id (int) -> token string.
        inverse_vocab: token string -> token id (int).
        bpe_merges: (left_id, right_id) pair -> merged token id.
    """

    def __init__(self):
        self.vocab = {}          # id -> token string
        self.inverse_vocab = {}  # token string -> id
        self.bpe_merges = {}     # (id, id) -> merged id

    @classmethod
    def load_tokenizer(cls, file_path):
        """Load tokenizer state from a JSON file.

        JSON object keys are always strings, so vocab keys are converted
        back to int and bpe_merges keys (stored as tuple reprs such as
        "(1, 2)") are parsed back into tuples.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError / KeyError / ValueError: on malformed data.
        """
        import ast  # local import: only needed when loading from disk

        with open(file_path, "r", encoding="utf-8") as f:
            tokenizer_data = json.load(f)
        tokenizer = cls()
        tokenizer.vocab = {int(k): v for k, v in tokenizer_data["vocab"].items()}
        tokenizer.inverse_vocab = tokenizer_data["inverse_vocab"]
        # ast.literal_eval parses the "(a, b)" key strings safely; the
        # previous eval() would execute arbitrary code from the artifact.
        tokenizer.bpe_merges = {
            ast.literal_eval(k): v for k, v in tokenizer_data["bpe_merges"].items()
        }
        return tokenizer

    def encode(self, text):
        """Encode text into a list of token ids.

        Words after the first are prefixed with "Ġ" (GPT-2-style marker
        for a preceding space). NOTE: str.split() with no argument also
        splits on newline characters, so the replace() below never yields
        a standalone newline token — newlines are effectively treated as
        spaces and the startswith check never fires.
        """
        tokens = []
        words = text.replace("\n", " \n ").split()
        for i, word in enumerate(words):
            if i > 0 and not word.startswith("\n"):
                tokens.append("Ġ" + word)
            else:
                tokens.append(word)
        token_ids = []
        for token in tokens:
            if token in self.inverse_vocab:
                # Whole word is a known token.
                token_ids.append(self.inverse_vocab[token])
            else:
                # Out-of-vocabulary word: fall back to character-level BPE.
                token_ids.extend(self.tokenize_with_bpe(token))
        return token_ids

    def tokenize_with_bpe(self, token):
        """Split an out-of-vocabulary token into ids via BPE merges.

        Returns [0] if any character is missing from the vocabulary
        (fallback "unknown" id, preserved from the original behavior).
        """
        token_ids = [self.inverse_vocab.get(char) for char in token]
        if None in token_ids:
            return [0]  # unknown character: fall back to id 0
        # Greedy left-to-right merging, one pass per round, until no pair
        # in the sequence appears in bpe_merges.
        can_merge = True
        while can_merge and len(token_ids) > 1:
            can_merge = False
            new_tokens = []
            i = 0
            while i < len(token_ids) - 1:
                pair = (token_ids[i], token_ids[i + 1])
                if pair in self.bpe_merges:
                    new_tokens.append(self.bpe_merges[pair])
                    i += 2  # both elements consumed by the merge
                    can_merge = True
                else:
                    new_tokens.append(token_ids[i])
                    i += 1
            if i < len(token_ids):
                # Trailing element not consumed by a final merge.
                new_tokens.append(token_ids[i])
            token_ids = new_tokens
        return token_ids

    def decode(self, token_ids):
        """Decode ids back to text; "Ġ"-prefixed tokens restore the space."""
        parts = []
        for tid in token_ids:
            t = self.vocab.get(tid, "")  # unknown ids decode to nothing
            parts.append(" " + t[1:] if t.startswith("Ġ") else t)
        # join() instead of repeated += (the original was quadratic).
        return "".join(parts)
# Load the custom tokenizer artifact; fall back to None so the UI can
# still run (the custom columns just stay empty) if the file is missing
# or malformed.
TOKENIZER_FILE = "bpe_tokenizer_artifacts/bpe_tokenizer_simple.json"
try:
    custom_tokenizer = BPETokenizerSimple.load_tokenizer(TOKENIZER_FILE)
except Exception as exc:
    # Narrowed from a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt; log why the fallback was taken instead of hiding it.
    print(f"Warning: could not load custom tokenizer from {TOKENIZER_FILE}: {exc}")
    custom_tokenizer = None
# Load tiktoken tokenizer
# Reference tokenizer: OpenAI's GPT-2 BPE encoding via tiktoken.
tik_tokenizer = tiktoken.get_encoding("gpt2")
def compare_tokenizers(text):
    """Run both tokenizers on *text* and return display-ready strings.

    Returns a 4-tuple: (custom token ids, custom subword tokens,
    tiktoken token ids, tiktoken subword tokens). Falsy input yields
    four empty strings.
    """
    if not text:
        return "", "", "", ""

    # Custom BPE (may be unavailable if the artifact failed to load).
    if custom_tokenizer:
        bpe_ids = custom_tokenizer.encode(text)
        bpe_tokens = [custom_tokenizer.vocab.get(tid, "UNK") for tid in bpe_ids]
    else:
        bpe_ids = []
        bpe_tokens = []

    # Tiktoken (GPT-2): decode each id individually to show the subwords.
    gpt2_ids = tik_tokenizer.encode(text)
    gpt2_tokens = [tik_tokenizer.decode([tid]) for tid in gpt2_ids]

    return str(bpe_ids), " | ".join(bpe_tokens), str(gpt2_ids), " | ".join(gpt2_tokens)
# Two-column UI: the same input text tokenized by the custom BPE on the
# left and tiktoken's GPT-2 encoding on the right, updated live on typing.
with gr.Blocks() as demo:
    gr.Markdown("# BPE vs Tiktoken Comparison")
    text_input = gr.Textbox(label="Input Text", placeholder="Type something to compare...")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Your Custom BPE")
            bpe_ids_box = gr.Textbox(label="Token IDs")
            bpe_tokens_box = gr.Textbox(label="Subword Tokens")
        with gr.Column():
            gr.Markdown("### Tiktoken (GPT-2)")
            gpt2_ids_box = gr.Textbox(label="Token IDs")
            gpt2_tokens_box = gr.Textbox(label="Subword Tokens")
    # Re-run the comparison on every keystroke.
    text_input.change(
        compare_tokenizers,
        inputs=text_input,
        outputs=[bpe_ids_box, bpe_tokens_box, gpt2_ids_box, gpt2_tokens_box],
    )
demo.launch()