Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import json | |
| import tiktoken | |
| from collections import Counter, deque | |
| from functools import lru_cache | |
# Include the BPETokenizerSimple class definition
class BPETokenizerSimple:
    """A minimal byte-pair-encoding (BPE) tokenizer.

    The vocabulary and merge rules are loaded from a JSON artifact produced
    elsewhere; this class only performs encoding/decoding with them. It uses
    the GPT-2 convention of marking a word-leading space with "Ġ".
    """

    def __init__(self):
        self.vocab = {}          # token id (int) -> token string
        self.inverse_vocab = {}  # token string -> token id (int)
        self.bpe_merges = {}     # (left id, right id) -> merged token id

    @classmethod
    def load_tokenizer(cls, file_path):
        """Build a tokenizer from the JSON artifact at *file_path*.

        Fixes: the original defined this with a ``cls`` parameter but without
        ``@classmethod``, so ``BPETokenizerSimple.load_tokenizer(path)`` raised
        ``TypeError`` (one positional argument short). It also ran ``eval`` on
        the serialized merge keys; ``ast.literal_eval`` safely parses those
        "(id1, id2)" tuple strings instead.

        Raises:
            OSError: if the file cannot be opened/read.
            ValueError / KeyError: if the JSON lacks the expected schema.
        """
        import ast  # local import so the module-level import block stays untouched

        with open(file_path, "r", encoding="utf-8") as f:
            tokenizer_data = json.load(f)
        tokenizer = cls()
        # JSON object keys are always strings; restore the integer ids.
        tokenizer.vocab = {int(k): v for k, v in tokenizer_data["vocab"].items()}
        tokenizer.inverse_vocab = tokenizer_data["inverse_vocab"]
        # Merge keys were serialized as "(id1, id2)" strings.
        tokenizer.bpe_merges = {
            ast.literal_eval(k): v for k, v in tokenizer_data["bpe_merges"].items()
        }
        return tokenizer

    def encode(self, text):
        """Encode *text* into a list of token ids.

        Whitespace-split words after the first get a "Ġ" prefix (GPT-2 style)
        to encode the preceding space; newlines are isolated into their own
        tokens and are not prefixed.
        """
        tokens = []
        words = text.replace("\n", " \n ").split()
        for i, word in enumerate(words):
            if i > 0 and not word.startswith("\n"):
                tokens.append("Ġ" + word)
            else:
                tokens.append(word)
        token_ids = []
        for token in tokens:
            if token in self.inverse_vocab:
                # The whole word is already a single known token.
                token_ids.append(self.inverse_vocab[token])
            else:
                token_ids.extend(self.tokenize_with_bpe(token))
        return token_ids

    def tokenize_with_bpe(self, token):
        """Split *token* into ids by repeatedly applying known BPE merges.

        Starts from per-character ids and merges adjacent pairs found in
        ``self.bpe_merges`` until no pair matches. Returns ``[0]`` if any
        character is missing from the vocabulary (id 0 acts as the unknown
        token here).

        NOTE(review): merges are applied greedily left-to-right in each pass,
        not in merge-rank order, so results may differ from canonical BPE —
        original behavior kept as-is.
        """
        token_ids = [self.inverse_vocab.get(char, None) for char in token]
        if None in token_ids:
            return [0]
        can_merge = True
        while can_merge and len(token_ids) > 1:
            can_merge = False
            new_tokens = []
            i = 0
            while i < len(token_ids) - 1:
                pair = (token_ids[i], token_ids[i + 1])
                if pair in self.bpe_merges:
                    new_tokens.append(self.bpe_merges[pair])
                    i += 2  # consume both halves of the merged pair
                    can_merge = True
                else:
                    new_tokens.append(token_ids[i])
                    i += 1
            if i < len(token_ids):  # trailing id that was not part of a pair
                new_tokens.append(token_ids[i])
            token_ids = new_tokens
        return token_ids

    def decode(self, token_ids):
        """Decode a list of token ids back into a string.

        A leading "Ġ" on a token renders as a space before it; unknown ids
        decode to the empty string.
        """
        res = ""
        for tid in token_ids:
            t = self.vocab.get(tid, "")
            res += " " + t[1:] if t.startswith("Ġ") else t
        return res
# Load custom tokenizer; fall back to None (handled by the UI callback)
# if the artifact is missing or malformed.
TOKENIZER_FILE = "bpe_tokenizer_artifacts/bpe_tokenizer_simple.json"
try:
    custom_tokenizer = BPETokenizerSimple.load_tokenizer(TOKENIZER_FILE)
except (OSError, ValueError, SyntaxError, KeyError, TypeError):
    # Narrowed from a bare `except:` that swallowed everything (including
    # KeyboardInterrupt and real bugs). These cover: unreadable file,
    # bad JSON / bad merge-key literals, and a missing schema key.
    custom_tokenizer = None

# Load tiktoken tokenizer (reference GPT-2 BPE implementation).
tik_tokenizer = tiktoken.get_encoding("gpt2")
def compare_tokenizers(text):
    """Run *text* through both tokenizers and return display strings.

    Returns a 4-tuple of strings: custom token ids, custom subword tokens,
    tiktoken ids, tiktoken subword tokens. All four are empty for empty
    input; the custom columns show "[]" / "" when the custom tokenizer
    failed to load.
    """
    if not text:
        return "", "", "", ""

    # Custom BPE (may be unavailable if the artifact failed to load).
    if custom_tokenizer is None:
        custom_ids, custom_tokens = [], []
    else:
        custom_ids = custom_tokenizer.encode(text)
        custom_tokens = [custom_tokenizer.vocab.get(tid, "UNK") for tid in custom_ids]

    # Tiktoken (GPT-2): decode each id individually to show the subwords.
    gpt2_ids = tik_tokenizer.encode(text)
    gpt2_tokens = [tik_tokenizer.decode([tid]) for tid in gpt2_ids]

    return (
        str(custom_ids),
        " | ".join(custom_tokens),
        str(gpt2_ids),
        " | ".join(gpt2_tokens),
    )
# Two-column Gradio UI: custom BPE output on the left, GPT-2 reference on
# the right, refreshed live as the user types.
with gr.Blocks() as demo:
    gr.Markdown("# BPE vs Tiktoken Comparison")
    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Type something to compare...",
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Your Custom BPE")
            custom_ids_out = gr.Textbox(label="Token IDs")
            custom_tokens_out = gr.Textbox(label="Subword Tokens")
        with gr.Column():
            gr.Markdown("### Tiktoken (GPT-2)")
            tik_ids_out = gr.Textbox(label="Token IDs")
            tik_tokens_out = gr.Textbox(label="Subword Tokens")

    # Re-run the comparison on every change of the input box.
    text_input.change(
        compare_tokenizers,
        inputs=text_input,
        outputs=[custom_ids_out, custom_tokens_out, tik_ids_out, tik_tokens_out],
    )

demo.launch()