File size: 3,855 Bytes
06afc12
 
 
 
e997627
06afc12
 
 
e997627
06afc12
 
 
 
 
 
e997627
 
 
 
 
 
 
 
 
06afc12
 
 
 
 
 
 
 
 
 
 
 
e997627
06afc12
e997627
06afc12
 
 
 
e997627
06afc12
 
 
 
 
 
 
 
e997627
06afc12
 
 
 
 
e997627
06afc12
 
 
 
e997627
 
 
 
 
06afc12
e997627
06afc12
 
e997627
 
 
06afc12
e997627
 
06afc12
e997627
 
 
 
 
 
 
 
 
 
 
 
 
06afc12
 
e997627
 
 
06afc12
e997627
 
 
 
 
 
 
 
 
 
 
06afc12
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112

import ast
import json
import os
from collections import Counter, deque
from functools import lru_cache

import gradio as gr
import tiktoken

# Include the BPETokenizerSimple class definition
class BPETokenizerSimple:
    """Minimal byte-pair-encoding tokenizer driven by a saved vocabulary.

    The tokenizer holds no training logic; its three lookup tables are
    filled in by ``load_tokenizer`` from a JSON artifact.
    """

    def __init__(self):
        # token id -> token string
        self.vocab = {}
        # token string -> token id
        self.inverse_vocab = {}
        # (left token id, right token id) -> merged token id
        self.bpe_merges = {}

    @staticmethod
    def _parse_merge_key(key):
        """Turn a serialized merge key such as ``"(12, 7)"`` into a Python value.

        ``ast.literal_eval`` only accepts literals, so a tampered tokenizer
        file cannot execute code the way ``eval`` would allow.

        Raises:
            ValueError: if ``key`` is not a valid Python literal.
        """
        try:
            return ast.literal_eval(key)
        except (ValueError, TypeError, SyntaxError) as err:
            raise ValueError(f"Invalid BPE merge key: {key!r}") from err

    @classmethod
    def load_tokenizer(cls, file_path):
        """Create a tokenizer from a JSON file.

        The JSON object must provide the keys ``"vocab"``,
        ``"inverse_vocab"`` and ``"bpe_merges"``. Vocabulary keys are
        converted to ``int``; merge keys are parsed as Python literals.

        Args:
            file_path: Path of the JSON file to read (UTF-8).

        Returns:
            A new instance of ``cls`` with its lookup tables populated.

        Raises:
            OSError: if the file cannot be opened.
            KeyError: if one of the required top-level keys is missing.
            ValueError: if the JSON is malformed, a vocab key is not an
                integer, or a merge key is not a valid literal.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            tokenizer_data = json.load(f)
        tokenizer = cls()
        tokenizer.vocab = {int(k): v for k, v in tokenizer_data["vocab"].items()}
        tokenizer.inverse_vocab = tokenizer_data["inverse_vocab"]
        tokenizer.bpe_merges = {
            cls._parse_merge_key(k): v
            for k, v in tokenizer_data["bpe_merges"].items()
        }
        return tokenizer

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        The text is split on whitespace; every word except the first is
        prefixed with ``"Ġ"`` to mark the preceding space. Words found in
        ``inverse_vocab`` map directly to their id, all others go through
        ``tokenize_with_bpe``.
        """
        # NOTE(review): str.split() without arguments also splits on "\n",
        # so the padded newlines are discarded here and the
        # startswith("\n") check below never fires. Kept as-is to preserve
        # the ids this tokenizer has always produced - confirm whether
        # newlines were meant to survive as their own token.
        words = text.replace("\n", " \n ").split()
        tokens = [
            word if i == 0 or word.startswith("\n") else "Ġ" + word
            for i, word in enumerate(words)
        ]
        token_ids = []
        for token in tokens:
            if token in self.inverse_vocab:
                token_ids.append(self.inverse_vocab[token])
            else:
                token_ids.extend(self.tokenize_with_bpe(token))
        return token_ids

    def tokenize_with_bpe(self, token):
        """Encode one out-of-vocabulary token by merging character ids.

        Starts from the id of each character and repeatedly sweeps left to
        right, replacing every adjacent pair found in ``bpe_merges`` with
        its merged id, until a sweep performs no merge.

        Returns:
            The resulting id list, or ``[0]`` if any character of ``token``
            is missing from ``inverse_vocab``.
        """
        token_ids = [self.inverse_vocab.get(char, None) for char in token]
        if None in token_ids:
            # At least one character is unknown: fall back to id 0.
            return [0]
        can_merge = True
        while can_merge and len(token_ids) > 1:
            can_merge = False
            new_tokens = []
            i = 0
            while i < len(token_ids) - 1:
                pair = (token_ids[i], token_ids[i + 1])
                if pair in self.bpe_merges:
                    new_tokens.append(self.bpe_merges[pair])
                    i += 2  # both members of the pair are consumed
                    can_merge = True
                else:
                    new_tokens.append(token_ids[i])
                    i += 1
            # The pair scan stops one short; keep a trailing unpaired id.
            if i < len(token_ids):
                new_tokens.append(token_ids[i])
            token_ids = new_tokens
        return token_ids

    def decode(self, token_ids):
        """Convert token ids back into text.

        A leading ``"Ġ"`` on a token is rendered as a space; ids missing
        from ``vocab`` contribute nothing to the output.
        """
        pieces = []
        for tid in token_ids:
            t = self.vocab.get(tid, "")
            pieces.append(" " + t[1:] if t.startswith("Ġ") else t)
        return "".join(pieces)

# Load custom tokenizer
TOKENIZER_FILE = "bpe_tokenizer_artifacts/bpe_tokenizer_simple.json"
try:
    custom_tokenizer = BPETokenizerSimple.load_tokenizer(TOKENIZER_FILE)
except Exception as exc:
    # Best-effort: the app keeps running without the custom tokenizer when
    # its artifact is missing or unreadable. Catching Exception (instead of
    # a bare except) lets KeyboardInterrupt/SystemExit propagate, and the
    # message makes the reason for the fallback visible in the logs.
    print(f"Warning: could not load custom tokenizer from {TOKENIZER_FILE!r}: {exc}")
    custom_tokenizer = None

# Load tiktoken tokenizer
tik_tokenizer = tiktoken.get_encoding("gpt2")

def compare_tokenizers(text):
    """Tokenize ``text`` with both tokenizers and format the results.

    Returns a 4-tuple of strings: custom token ids, custom tokens joined
    by " | ", tiktoken ids, and tiktoken tokens joined by " | ". Empty
    input yields four empty strings. If no custom tokenizer was loaded,
    its id list is empty and its token string is "".
    """
    if not text:
        return "", "", "", ""

    # Custom BPE side (skipped entirely when the tokenizer is unavailable).
    bpe_ids = []
    bpe_pieces = []
    if custom_tokenizer:
        bpe_ids = custom_tokenizer.encode(text)
        id_to_piece = custom_tokenizer.vocab
        bpe_pieces = [id_to_piece.get(token_id, "UNK") for token_id in bpe_ids]

    # Tiktoken (GPT-2) side: decode each id on its own to show the pieces.
    gpt2_ids = tik_tokenizer.encode(text)
    gpt2_pieces = [tik_tokenizer.decode([token_id]) for token_id in gpt2_ids]

    return (
        str(bpe_ids),
        " | ".join(bpe_pieces),
        str(gpt2_ids),
        " | ".join(gpt2_pieces),
    )

with gr.Blocks() as demo:
    # Page title followed by the single text input driving both columns.
    gr.Markdown("# BPE vs Tiktoken Comparison")
    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Type something to compare...",
    )

    with gr.Row():
        # Left column: output of the custom BPE tokenizer.
        with gr.Column():
            gr.Markdown("### Your Custom BPE")
            custom_ids_out = gr.Textbox(label="Token IDs")
            custom_tokens_out = gr.Textbox(label="Subword Tokens")

        # Right column: output of tiktoken's GPT-2 encoding.
        with gr.Column():
            gr.Markdown("### Tiktoken (GPT-2)")
            tik_ids_out = gr.Textbox(label="Token IDs")
            tik_tokens_out = gr.Textbox(label="Subword Tokens")

    # Recompute all four outputs whenever the input text changes; the
    # output order matches the tuple returned by compare_tokenizers.
    text_input.change(
        compare_tokenizers,
        inputs=text_input,
        outputs=[
            custom_ids_out,
            custom_tokens_out,
            tik_ids_out,
            tik_tokens_out,
        ],
    )

demo.launch()