Spaces:
Sleeping
Sleeping
File size: 3,855 Bytes
06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 e997627 06afc12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import gradio as gr
import os
import json
import tiktoken
from collections import Counter, deque
from functools import lru_cache
# Include the BPETokenizerSimple class definition
class BPETokenizerSimple:
    """Minimal byte-pair-encoding tokenizer restored from a JSON artifact.

    The three lookup tables are plain dicts:

    * ``vocab``          -- token id -> token string
    * ``inverse_vocab``  -- token string -> token id
    * ``bpe_merges``     -- (left id, right id) -> id of the merged token
    """

    def __init__(self):
        self.vocab = {}
        self.inverse_vocab = {}
        self.bpe_merges = {}

    @classmethod
    def load_tokenizer(cls, file_path):
        """Build a tokenizer from the JSON file at *file_path*.

        The file must contain the keys ``"vocab"``, ``"inverse_vocab"`` and
        ``"bpe_merges"``. JSON object keys are always strings, so vocab keys
        are converted back to ``int`` and merge keys are parsed back into
        Python literals (expected to look like ``"(12, 34)"``).

        Raises:
            OSError: the file cannot be opened.
            json.JSONDecodeError: the file is not valid JSON.
            KeyError: one of the required top-level keys is missing.
            ValueError / SyntaxError: a merge key is not a plain literal.
        """
        # Local import so this class does not depend on a file-level import.
        import ast

        with open(file_path, "r", encoding="utf-8") as f:
            tokenizer_data = json.load(f)

        tokenizer = cls()
        tokenizer.vocab = {int(k): v for k, v in tokenizer_data["vocab"].items()}
        tokenizer.inverse_vocab = tokenizer_data["inverse_vocab"]
        # SECURITY: this used to be eval(k), which executes arbitrary code
        # embedded in the artifact. literal_eval only accepts literals
        # (tuples, numbers, strings, ...) and rejects everything else.
        tokenizer.bpe_merges = {
            ast.literal_eval(k): v
            for k, v in tokenizer_data["bpe_merges"].items()
        }
        return tokenizer

    def encode(self, text):
        """Return the list of token ids for *text*.

        The text is split on whitespace; every word except the first is
        prefixed with ``"Ġ"``. A word found verbatim in ``inverse_vocab``
        maps to its id, any other word goes through ``tokenize_with_bpe``.
        """
        # NOTE(review): str.split() with no argument also splits on "\n", so
        # the newline markers padded in here are discarded again and the
        # startswith("\n") test below never matches. Kept as-is because
        # changing it would alter the ids produced for multi-line input.
        words = text.replace("\n", " \n ").split()
        tokens = [
            word if i == 0 or word.startswith("\n") else "Ġ" + word
            for i, word in enumerate(words)
        ]

        token_ids = []
        for token in tokens:
            if token in self.inverse_vocab:
                token_ids.append(self.inverse_vocab[token])
            else:
                token_ids.extend(self.tokenize_with_bpe(token))
        return token_ids

    def tokenize_with_bpe(self, token):
        """Split *token* into characters and merge them using ``bpe_merges``.

        Returns ``[0]`` if any character is missing from ``inverse_vocab``
        (presumably id 0 is the unknown token -- confirm against the
        artifact). Otherwise returns the ids left once no adjacent pair can
        be merged any more.
        """
        token_ids = [self.inverse_vocab.get(char) for char in token]
        if None in token_ids:
            return [0]

        # Each pass scans left to right and replaces every adjacent pair that
        # has a merge entry; passes repeat until one changes nothing.
        can_merge = True
        while can_merge and len(token_ids) > 1:
            can_merge = False
            new_tokens = []
            i = 0
            while i < len(token_ids) - 1:
                pair = (token_ids[i], token_ids[i + 1])
                if pair in self.bpe_merges:
                    new_tokens.append(self.bpe_merges[pair])
                    i += 2
                    can_merge = True
                else:
                    new_tokens.append(token_ids[i])
                    i += 1
            # The last id is still pending unless it was consumed by a merge.
            if i < len(token_ids):
                new_tokens.append(token_ids[i])
            token_ids = new_tokens
        return token_ids

    def decode(self, token_ids):
        """Turn *token_ids* back into text.

        A leading ``"Ġ"`` on a token becomes a space; ids missing from
        ``vocab`` contribute nothing.
        """
        pieces = []
        for tid in token_ids:
            t = self.vocab.get(tid, "")
            pieces.append(" " + t[1:] if t.startswith("Ġ") else t)
        # join instead of repeated += keeps long outputs linear-time.
        return "".join(pieces)
# Load the custom tokenizer. This is best-effort: when the artifact is missing
# or cannot be parsed, custom_tokenizer is left as None.
TOKENIZER_FILE = "bpe_tokenizer_artifacts/bpe_tokenizer_simple.json"
try:
    custom_tokenizer = BPETokenizerSimple.load_tokenizer(TOKENIZER_FILE)
except Exception as exc:
    # A bare "except:" would also swallow KeyboardInterrupt/SystemExit and
    # hide the reason; report it and fall back instead.
    print(f"Could not load custom tokenizer from {TOKENIZER_FILE}: {exc!r}")
    custom_tokenizer = None

# Load tiktoken tokenizer
tik_tokenizer = tiktoken.get_encoding("gpt2")
def compare_tokenizers(text):
    """Tokenize *text* with both tokenizers and format the results for display.

    Returns a 4-tuple of strings: custom token ids, custom tokens joined by
    ``" | "``, tiktoken ids, and tiktoken tokens joined by ``" | "``. Empty
    input gives four empty strings. When ``custom_tokenizer`` is not
    available, the custom columns show an empty list and an empty string.
    """
    if not text:
        return "", "", "", ""

    # Custom BPE: ids first, then each id looked up in the vocab, with "UNK"
    # for ids the vocab does not contain.
    if custom_tokenizer:
        custom_ids = custom_tokenizer.encode(text)
        lookup = custom_tokenizer.vocab.get
        custom_tokens = [lookup(token_id, "UNK") for token_id in custom_ids]
    else:
        custom_ids = []
        custom_tokens = []

    # Tiktoken (GPT-2): decode every id on its own to show the pieces.
    tik_ids = tik_tokenizer.encode(text)
    tik_tokens = [tik_tokenizer.decode([token_id]) for token_id in tik_ids]

    separator = " | "
    return (
        str(custom_ids),
        separator.join(custom_tokens),
        str(tik_ids),
        separator.join(tik_tokens),
    )
# UI: one input box feeding two columns of read-outs placed in a single row,
# one column per tokenizer.
with gr.Blocks() as demo:
    gr.Markdown("# BPE vs Tiktoken Comparison")
    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Type something to compare...",
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Your Custom BPE")
            custom_ids_out = gr.Textbox(label="Token IDs")
            custom_tokens_out = gr.Textbox(label="Subword Tokens")
        with gr.Column():
            gr.Markdown("### Tiktoken (GPT-2)")
            tik_ids_out = gr.Textbox(label="Token IDs")
            tik_tokens_out = gr.Textbox(label="Subword Tokens")

    # Order must match the 4-tuple returned by compare_tokenizers.
    result_boxes = [
        custom_ids_out,
        custom_tokens_out,
        tik_ids_out,
        tik_tokens_out,
    ]
    # Re-run the comparison whenever the input text changes.
    text_input.change(compare_tokenizers, inputs=text_input, outputs=result_boxes)

demo.launch()
|