csvis committed on
Commit
06afc12
·
verified ·
1 Parent(s): b7aaae0

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +226 -0
app.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import os
4
+ import json
5
+ from collections import Counter, deque
6
+ from functools import lru_cache
7
+
8
+
9
class BPETokenizerSimple:
    """Minimal character-level byte-pair-encoding (BPE) tokenizer.

    Spaces are represented by the marker character "Ġ" attached to the
    front of the word that follows them. The tokenizer keeps three tables:

    * ``vocab``: token ID -> token string
    * ``inverse_vocab``: token string -> token ID
    * ``bpe_merges``: (left ID, right ID) -> merged token ID
    """

    def __init__(self):
        self.vocab = {}
        self.inverse_vocab = {}
        self.bpe_merges = {}

    def train(self, text, vocab_size, allowed_special=frozenset({"<|endoftext|>"})):
        """Learn a vocabulary and merge table from ``text``.

        Args:
            text: Training corpus.
            vocab_size: Upper bound on the total number of token IDs
                (base characters, special tokens and learned merges).
            allowed_special: Iterable of special-token strings that receive
                their own IDs; a falsy value adds none.
        """
        # Start from a clean merge table so retraining the same instance does
        # not mix merges (and their now-meaningless IDs) from an earlier run.
        self.bpe_merges = {}

        # Replace every space except a leading one with the "Ġ" marker;
        # a space at index 0 is dropped entirely.
        processed_text = []
        for i, char in enumerate(text):
            if char == " " and i != 0:
                processed_text.append("Ġ")
            if char != " ":
                processed_text.append(char)
        processed_text = "".join(processed_text)

        # Base vocabulary: the first 256 code points, then any other
        # character seen in the corpus, then the space marker.
        unique_chars = [chr(i) for i in range(256)]
        seen = set(unique_chars)
        for char in sorted(set(processed_text)):
            if char not in seen:
                unique_chars.append(char)
                seen.add(char)
        if "Ġ" not in seen:
            unique_chars.append("Ġ")

        self.vocab = dict(enumerate(unique_chars))
        self.inverse_vocab = {char: i for i, char in self.vocab.items()}

        if allowed_special:
            for token in allowed_special:
                if token not in self.inverse_vocab:
                    new_id = len(self.vocab)
                    self.vocab[new_id] = token
                    self.inverse_vocab[token] = new_id

        token_ids = [self.inverse_vocab[char] for char in processed_text]

        # Repeatedly merge the most frequent adjacent pair into a new ID
        # until the vocabulary budget is used up or nothing can be merged.
        for new_id in range(len(self.vocab), vocab_size):
            if len(token_ids) < 2:
                break
            pair_id = self.find_freq_pair(token_ids, mode="most")
            if pair_id is None:
                break

            updated = self.replace_pair(token_ids, pair_id, new_id)
            if updated == token_ids:
                break

            token_ids = updated
            self.bpe_merges[pair_id] = new_id

        # Materialise the merged token strings. Merges are stored in creation
        # order, so both halves of a pair are already in the vocabulary.
        for (p0, p1), new_id in self.bpe_merges.items():
            merged_token = self.vocab[p0] + self.vocab[p1]
            self.vocab[new_id] = merged_token
            self.inverse_vocab[merged_token] = new_id

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Each newline becomes its own "\\n" token. Every word except the very
        first token is prefixed with the "Ġ" space marker.

        Raises:
            ValueError: If a character of ``text`` is not in the vocabulary.
        """
        # Split on "\n" first so newlines survive as standalone tokens; a
        # bare str.split() would swallow them together with other whitespace.
        words = []
        for line_index, line in enumerate(text.split("\n")):
            if line_index > 0:
                words.append("\n")
            words.extend(line.split())

        tokens = []
        for i, word in enumerate(words):
            if i > 0 and not word.startswith("\n"):
                tokens.append("Ġ" + word)
            else:
                tokens.append(word)

        token_ids = []
        for token in tokens:
            if token in self.inverse_vocab:
                # The whole token is already a vocabulary entry.
                token_ids.append(self.inverse_vocab[token])
            else:
                token_ids.extend(self.tokenize_with_bpe(token))

        return token_ids

    def tokenize_with_bpe(self, token):
        """Split ``token`` into characters and apply the learned merges.

        Returns:
            The list of token IDs left once no adjacent pair can be merged.

        Raises:
            ValueError: If a character of ``token`` is not in the vocabulary.
        """
        token_ids = [self.inverse_vocab.get(char, None) for char in token]
        if None in token_ids:
            missing_chars = [char for char, tid in zip(token, token_ids) if tid is None]
            raise ValueError(f"Characters not found in vocab: {missing_chars}")

        # Sweep left to right, merging every known pair, and repeat until a
        # full sweep performs no merge.
        can_merge = True
        while can_merge and len(token_ids) > 1:
            can_merge = False
            new_tokens = []
            i = 0
            while i < len(token_ids) - 1:
                pair = (token_ids[i], token_ids[i + 1])
                if pair in self.bpe_merges:
                    new_tokens.append(self.bpe_merges[pair])
                    i += 2
                    can_merge = True
                else:
                    new_tokens.append(token_ids[i])
                    i += 1
            # Carry over the last ID when it was not consumed by a merge.
            if i < len(token_ids):
                new_tokens.append(token_ids[i])
            token_ids = new_tokens

        return token_ids

    def decode(self, token_ids):
        """Convert a sequence of token IDs back into text.

        A token starting with the "Ġ" marker is rendered with a leading space.

        Raises:
            ValueError: If an ID is not in the vocabulary.
        """
        pieces = []
        for token_id in token_ids:
            if token_id not in self.vocab:
                raise ValueError(f"Token ID {token_id} not found in vocab.")
            token = self.vocab[token_id]
            if token.startswith("Ġ"):
                pieces.append(" " + token[1:])
            else:
                pieces.append(token)
        return "".join(pieces)

    def save_tokenizer(self, file_path):
        """Write the vocabulary and merge table to ``file_path`` as JSON.

        JSON object keys must be strings, so integer IDs and merge pairs are
        stored in their ``str()`` form, e.g. ``"(104, 101)"``.
        """
        tokenizer_data = {
            "vocab": {str(k): v for k, v in self.vocab.items()},
            "inverse_vocab": self.inverse_vocab,
            "bpe_merges": {str(k): v for k, v in self.bpe_merges.items()},
        }
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(tokenizer_data, f, ensure_ascii=False, indent=4)

    @classmethod
    def load_tokenizer(cls, file_path):
        """Build a tokenizer from a JSON file written by ``save_tokenizer``.

        Raises:
            ValueError: If a merge key is not of the form ``"(int, int)"``.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            tokenizer_data = json.load(f)

        tokenizer = cls()
        tokenizer.vocab = {int(k): v for k, v in tokenizer_data["vocab"].items()}
        tokenizer.inverse_vocab = tokenizer_data["inverse_vocab"]
        # Merge keys are parsed strictly instead of being passed to eval(),
        # so a tampered tokenizer file cannot execute code.
        tokenizer.bpe_merges = {
            cls._parse_pair_key(k): v for k, v in tokenizer_data["bpe_merges"].items()
        }
        return tokenizer

    @staticmethod
    def _parse_pair_key(key):
        """Parse a serialized merge key such as ``"(104, 101)"`` into a tuple of ints."""
        parts = key.strip().strip("()").split(",")
        if len(parts) != 2:
            raise ValueError(f"Invalid BPE merge key: {key!r}")
        return int(parts[0]), int(parts[1])

    def get_special_token_id(self, token):
        """Return the ID of ``token``, or ``None`` if it is not in the vocabulary."""
        # Deliberately not memoised: the lookup is a plain dict access, and a
        # cache on the method would return stale answers after (re)training
        # and keep every instance alive.
        return self.inverse_vocab.get(token, None)

    @staticmethod
    def find_freq_pair(token_ids, mode="most"):
        """Return the most or least frequent adjacent ID pair.

        Args:
            token_ids: Sequence of token IDs.
            mode: ``"most"`` or ``"least"``.

        Returns:
            The selected ``(left, right)`` pair, or ``None`` when
            ``token_ids`` has fewer than two elements.

        Raises:
            ValueError: If ``mode`` is neither ``"most"`` nor ``"least"``.
        """
        if len(token_ids) < 2:
            return None
        pairs = Counter(zip(token_ids, token_ids[1:]))
        if not pairs:
            return None

        if mode == "most":
            return max(pairs.items(), key=lambda x: x[1])[0]
        elif mode == "least":
            return min(pairs.items(), key=lambda x: x[1])[0]
        else:
            raise ValueError("Invalid mode. Choose 'most' or 'least'.")

    @staticmethod
    def replace_pair(token_ids, pair_id, new_id):
        """Return a copy of ``token_ids`` with each ``pair_id`` occurrence replaced by ``new_id``.

        Occurrences are matched left to right without overlap.
        """
        dq = deque(token_ids)
        replaced = []

        while dq:
            current = dq.popleft()
            if dq and (current, dq[0]) == pair_id:
                replaced.append(new_id)
                # Consume the second half of the pair as well.
                dq.popleft()
            else:
                replaced.append(current)

        return replaced
173
+
174
+
175
# Path of the serialized tokenizer that the app expects to find at startup.
TOKENIZER_FILE = "bpe_tokenizer_artifacts/bpe_tokenizer_simple.json"

try:
    bpe_tokenizer = BPETokenizerSimple.load_tokenizer(TOKENIZER_FILE)
except FileNotFoundError:
    print(f"Error: Tokenizer file not found at {TOKENIZER_FILE}. Please ensure it's uploaded to the Space.")
    # Fall back to an untrained tokenizer so the module still imports;
    # its vocabulary is empty until it is trained or loaded.
    bpe_tokenizer = BPETokenizerSimple()
else:
    print("Tokenizer loaded successfully!")
186
+
187
def encode_text(text):
    """Encode ``text`` with the module-level tokenizer for display in the UI.

    Returns:
        The ``str()`` form of the token ID list, an empty string for empty
        input, or an error message when the text cannot be encoded.
    """
    if not text:
        return ""
    try:
        token_ids = bpe_tokenizer.encode(text)
    except ValueError as e:
        # Raised for characters missing from the vocabulary; report it as
        # text, matching how decode_ids reports its errors.
        return f"Error encoding: {e}"
    return str(token_ids)
192
+
193
def decode_ids(id_string):
    """Decode a textual list of token IDs with the module-level tokenizer.

    Args:
        id_string: User-supplied text such as ``"[424, 256, 654]"``.

    Returns:
        The decoded text, an empty string for empty input, or an error
        message when the input is malformed or cannot be decoded.
    """
    import ast

    if not id_string:
        return ""
    try:
        # literal_eval accepts only Python literals, so input typed into the
        # web UI cannot execute code the way eval() would allow.
        token_ids = ast.literal_eval(id_string)
        if not isinstance(token_ids, list) or not all(isinstance(x, int) for x in token_ids):
            return "Invalid input: Please provide a list of integers, e.g., [424, 256, 654]"
        return bpe_tokenizer.decode(token_ids)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        return f"Error decoding: {e}. Please provide a valid Python list of integers."
204
+
205
+
206
# Gradio UI: one row of text boxes plus a button for encoding, and a second
# row plus button for decoding.
with gr.Blocks() as demo:
    gr.Markdown("# BPE Tokenizer Demo")
    gr.Markdown("Encode text to BPE token IDs and decode token IDs back to text.")

    # Encoding: free text in, token ID list out.
    with gr.Row():
        text_input = gr.Textbox(lines=5, label="Input Text for Encoding")
        encoded_output = gr.Textbox(lines=5, label="Encoded Token IDs")

    encode_button = gr.Button("Encode Text")
    encode_button.click(fn=encode_text, inputs=text_input, outputs=encoded_output)

    # Decoding: token ID list in, text out.
    with gr.Row():
        ids_input = gr.Textbox(lines=5, label="Input Token IDs for Decoding (e.g., [1, 2, 3])")
        decoded_output = gr.Textbox(lines=5, label="Decoded Text")

    decode_button = gr.Button("Decode IDs")
    decode_button.click(fn=decode_ids, inputs=ids_input, outputs=decoded_output)


demo.launch()