Rahul2020 committed on
Commit
c8cd7bf
·
verified ·
1 Parent(s): ebe2e0f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +517 -14
app.py CHANGED
@@ -3,6 +3,10 @@ from fastapi import FastAPI
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from transformers import PreTrainedTokenizerFast
5
  import os
 
 
 
 
6
 
7
  # --------------------------------------
8
  # LOAD TOKENIZER
@@ -24,10 +28,26 @@ print("Tokenizer loaded: vocab =", tokenizer.vocab_size)
24
  # ENCODE / DECODE FUNCTIONS
25
  # --------------------------------------
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def encode_text(text: str):
28
- """Basic encode: returns token IDs as CSV, token count, and compression ratio."""
29
- enc = tokenizer(text, add_special_tokens=False)
30
  token_ids = enc["input_ids"]
 
 
 
31
  token_count = len(token_ids)
32
  csv_ids = ",".join(str(x) for x in token_ids)
33
 
@@ -35,15 +55,273 @@ def encode_text(text: str):
35
  char_count = len(text)
36
  compression_ratio = char_count / token_count if token_count > 0 else 0.0
37
 
38
- return csv_ids, token_count, f"{compression_ratio:.2f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  def decode_ids(ids: str):
41
- """Decode from comma-separated IDs to text."""
42
  try:
43
  arr = [int(x) for x in ids.split(",") if x.strip()]
44
- return tokenizer.decode(arr)
45
- except:
46
- return "❌ Invalid ID list"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  # --------------------------------------
49
  # FASTAPI REST BACKEND
@@ -83,25 +361,250 @@ def decode_endpoint(ids: str):
83
  # GRADIO FRONTEND
84
  # --------------------------------------
85
 
86
- with gr.Blocks(title="Hindi Tokenizer") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode")
 
 
 
88
 
89
  with gr.Tab("Encode"):
90
  text_in = gr.Textbox(label="Enter text", lines=3)
91
 
 
 
 
92
  with gr.Row():
93
  token_count_out = gr.Number(label="Token Count", precision=0)
94
  compression_ratio_out = gr.Textbox(label="Compression Ratio (chars/token)", interactive=False)
95
 
96
- ids_out = gr.Textbox(label="Token IDs", lines=8, max_lines=20)
97
- btn = gr.Button("Encode")
98
- btn.click(encode_text, text_in, [ids_out, token_count_out, compression_ratio_out])
 
 
 
99
 
100
  with gr.Tab("Decode"):
101
  ids_in = gr.Textbox(label="Comma-separated token IDs", lines=4)
102
- text_out = gr.Textbox(label="Decoded text", lines=8, max_lines=20)
103
- btn3 = gr.Button("Decode")
104
- btn3.click(decode_ids, ids_in, text_out)
 
 
 
 
 
 
 
 
105
 
106
  # Mount FastAPI + Gradio
107
 
 
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from transformers import PreTrainedTokenizerFast
5
  import os
6
+ import json
7
+ import random
8
+ import hashlib
9
+ import re
10
 
11
  # --------------------------------------
12
  # LOAD TOKENIZER
 
28
  # ENCODE / DECODE FUNCTIONS
29
  # --------------------------------------
30
 
31
def get_color_for_token(token_id, seed=None):
    """Return a deterministic pastel HSL color string for a token ID.

    The color is derived purely from the MD5 digest of ``str(token_id)``,
    so equal IDs always map to the same color.

    Parameters:
        token_id: the integer (or str-able) token ID to color.
        seed: ignored; kept for backward compatibility with existing callers
              that pass ``seed=42``.  The original implementation called
              ``random.seed(seed)`` here, which silently reseeded the global
              RNG as a side effect while contributing nothing to the result
              (the color is hash-based) — that dead side effect is removed.

    Returns:
        A CSS color string like ``"hsl(123, 75%, 80%)"``.
    """
    # Hash-based color: stable across processes (unlike hash()).
    hash_obj = hashlib.md5(str(token_id).encode())
    hash_int = int(hash_obj.hexdigest(), 16)
    # HSL with bounded saturation/lightness keeps colors light enough
    # for black text to stay readable.
    hue = hash_int % 360
    saturation = 60 + (hash_int % 30)
    lightness = 75 + (hash_int % 15)
    return f"hsl({hue}, {saturation}%, {lightness}%)"
def encode_text(text: str):
    """Encode *text* with the loaded BPE tokenizer for the Gradio UI.

    Parameters:
        text: raw input string to tokenize.

    Returns a 6-tuple:
        csv_ids           -- token IDs as a comma-separated string
        token_count       -- number of tokens (int)
        compression_ratio -- chars-per-token formatted "%.2f" (str)
        token_ids_html    -- clickable colored token-ID chips (HTML str)
        token_json        -- JSON list of {idx, token, id} records (str)
        input_html        -- clickable colored mirror of the input text (HTML str)
    """
    enc = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
    token_ids = enc["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    offsets = enc.get("offset_mapping", [])

    token_count = len(token_ids)
    csv_ids = ",".join(str(x) for x in token_ids)

    # Compression ratio = characters per token (0.0 for empty input).
    char_count = len(text)
    compression_ratio = char_count / token_count if token_count > 0 else 0.0

    # Character span of every token (None markers for offset-less tokens).
    token_ranges = []
    for idx, (start, end) in enumerate(offsets):
        if start is not None and end is not None:
            token_ranges.append((idx, start, end))
        else:
            token_ranges.append((idx, None, None))

    # Whitespace-delimited words with their character spans.
    words_with_positions = []
    for match in re.finditer(r'\S+', text):
        words_with_positions.append((match.group(), match.start(), match.end()))

    # token index -> indices of the words whose spans it overlaps.
    token_to_words_map = {}
    for token_idx, token_start, token_end in token_ranges:
        if token_start is not None and token_end is not None:
            token_to_words_map[token_idx] = [
                word_idx
                for word_idx, (word, word_start, word_end) in enumerate(words_with_positions)
                if token_start < word_end and token_end > word_start
            ]

    # Per-token metadata handed to the UI as JSON (stored in gr.State).
    token_data = [
        {"idx": i, "token": token, "id": token_id}
        for i, (token, token_id) in enumerate(zip(tokens, token_ids))
    ]
    token_json = json.dumps(token_data)

    # NOTE: the original version also built a large inline `highlight_script`
    # <script> string here, but never returned or used it — the page-level
    # `highlight_js` injected via gr.Blocks(head=...) provides these handlers,
    # so that dead code is removed.

    # Clickable mirror of the input text: one colored <span> per word.
    input_word_html_parts = []
    for word, word_start, word_end in words_with_positions:
        # BUGFIX: '&' must be escaped FIRST; the original escaped it last,
        # which double-escaped the freshly produced '&lt;'/'&gt;' entities
        # into '&amp;lt;'/'&amp;gt;'.
        word_escaped = word.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

        # Tokens whose character spans overlap this word.
        word_token_indices = [
            token_idx
            for token_idx, token_start, token_end in token_ranges
            if token_start is not None and token_end is not None
            and token_start < word_end and token_end > word_start
        ]

        if word_token_indices:
            token_id_for_word = token_ids[word_token_indices[0]]
            color = get_color_for_token(token_id_for_word, seed=42)
            token_indices_str = ",".join(map(str, word_token_indices))
            input_word_html_parts.append(
                f'<span class="input-word-tag" data-word-tokens="{token_indices_str}" '
                f'style="background-color: {color}; padding: 2px 6px; margin: 2px; '
                f'border-radius: 4px; cursor: pointer; display: inline-block; '
                f'border: 2px solid transparent; transition: all 0.2s;" '
                f'onclick="highlightInputWord(\'{token_indices_str}\')" '
                f'onmouseover="this.style.borderColor=\'#333\'" '
                f'onmouseout="if(!document.querySelector(\'.input-word-tag.highlighted\')) this.style.borderColor=\'transparent\'">{word_escaped}</span>'
            )
        else:
            # Word produced no offset-bearing token: render uncolored.
            input_word_html_parts.append(f'<span style="padding: 2px 6px; margin: 2px;">{word_escaped}</span>')

    input_html = (
        '<div style="line-height: 2; padding: 10px; background: #ffffff; '
        'border: 2px solid #e0e0e0; border-radius: 8px; min-height: 60px;">'
        + " ".join(input_word_html_parts) + '</div>'
    )

    # Colored token-ID chips, each labeled with (up to 2) overlapped words.
    token_ids_html_parts = []
    for i, token_id in enumerate(token_ids):
        color = get_color_for_token(token_id, seed=42)
        word_indices = token_to_words_map.get(i, [])
        word_labels = [words_with_positions[idx][0] for idx in word_indices]
        word_label = ", ".join(word_labels[:2]) if word_labels else ""  # show first 2 words as label

        token_ids_html_parts.append(
            f'<div class="token-id-tag" data-token-idx="{i}" data-token-id="{token_id}" '
            f'style="background-color: {color}; padding: 6px 10px; margin: 4px; '
            f'border-radius: 6px; cursor: pointer; display: inline-block; vertical-align: top; '
            f'border: 2px solid transparent; transition: all 0.2s; text-align: center; min-width: 60px;" '
            f'onclick="highlightTokenId({i})" '
            f'onmouseover="this.style.borderColor=\'#333\'" '
            # BUGFIX: the original onmouseout emitted nested unescaped quotes
            # (querySelector('.token-id-tag[data-token-idx='{i}']...')), a JS
            # syntax error on every mouseout; testing this element's own class
            # list is equivalent and well-formed.
            f'onmouseout="if(!this.classList.contains(\'highlighted\')) this.style.borderColor=\'transparent\'">'
            f'<div style="font-family: monospace; font-weight: bold; font-size: 14px; color: #000;">{token_id}</div>'
            f'<div style="font-size: 10px; color: #555; margin-top: 2px; word-break: break-word; max-width: 80px;">{word_label if word_label else "&nbsp;"}</div>'
            f'</div>'
        )

    token_ids_html = (
        '<div style="padding: 10px; background: #f8f9fa; border-radius: 8px; margin-top: 10px;">'
        + "".join(token_ids_html_parts) + '</div>'
    )

    return csv_ids, token_count, f"{compression_ratio:.2f}", token_ids_html, token_json, input_html
def decode_ids(ids: str):
    """Decode comma-separated token IDs back to text, with clickable HTML views.

    Parameters:
        ids: e.g. "12,345,6" — blank entries between commas are ignored.

    Returns:
        (decoded_html, decode_token_ids_html, decoded_text) on success, or
        (error_html, "", error_message) if the input cannot be parsed/decoded.
    """
    try:
        arr = [int(x) for x in ids.split(",") if x.strip()]
        decoded_text = tokenizer.decode(arr, skip_special_tokens=False)

        # Re-encode the decoded text to recover character offsets per token.
        # NOTE(review): BPE round-trips are not guaranteed stable, so the
        # re-encoded token list may differ from `arr`; offset indices are
        # therefore bounds-checked against len(arr) before use.
        enc_with_offsets = tokenizer(decoded_text, add_special_tokens=False, return_offsets_mapping=True)
        offsets = enc_with_offsets.get("offset_mapping", [])
        # (The original also computed convert_ids_to_tokens(arr) here but
        # never used it — dead local removed.)

        # Character span of every re-encoded token.
        token_ranges = []
        for idx, (start, end) in enumerate(offsets):
            if start is not None and end is not None:
                token_ranges.append((idx, start, end))
            else:
                token_ranges.append((idx, None, None))

        # Whitespace-delimited words of the decoded text with their spans.
        words_with_positions = []
        for match in re.finditer(r'\S+', decoded_text):
            words_with_positions.append((match.group(), match.start(), match.end()))

        # Clickable decoded text: one colored <span> per word.
        word_html_parts = []
        for word, word_start, word_end in words_with_positions:
            # BUGFIX: escape '&' first so '&lt;'/'&gt;' are not double-escaped
            # (the original escaped '&' last).
            word_escaped = word.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

            # Tokens whose character spans overlap this word.
            word_token_indices = [
                token_idx
                for token_idx, token_start, token_end in token_ranges
                if token_start is not None and token_end is not None
                and token_start < word_end and token_end > word_start
            ]

            if word_token_indices and word_token_indices[0] < len(arr):
                token_id_for_word = arr[word_token_indices[0]]
                color = get_color_for_token(token_id_for_word, seed=42)
                token_indices_str = ",".join(map(str, word_token_indices))
                word_html_parts.append(
                    f'<span class="decode-word-tag" data-word-tokens="{token_indices_str}" '
                    f'style="background-color: {color}; padding: 2px 6px; margin: 2px; '
                    f'border-radius: 4px; cursor: pointer; display: inline-block; '
                    f'border: 2px solid transparent; transition: all 0.2s;" '
                    f'onclick="highlightDecodeWord(\'{token_indices_str}\')" '
                    f'onmouseover="this.style.borderColor=\'#333\'" '
                    f'onmouseout="if(!document.querySelector(\'.decode-word-tag.highlighted\')) this.style.borderColor=\'transparent\'">{word_escaped}</span>'
                )
            else:
                word_html_parts.append(f'<span style="padding: 2px 6px; margin: 2px;">{word_escaped}</span>')

        # token index -> indices of overlapped words (feeds the chip labels).
        token_to_words_map = {}
        for token_idx, token_start, token_end in token_ranges:
            if token_start is not None and token_end is not None:
                token_to_words_map[token_idx] = [
                    word_idx
                    for word_idx, (word, word_start, word_end) in enumerate(words_with_positions)
                    if token_start < word_end and token_end > word_start
                ]

        decoded_html = (
            '<div style="line-height: 2; padding: 10px; background: #ffffff; '
            'border: 2px solid #e0e0e0; border-radius: 8px; min-height: 60px;">'
            + " ".join(word_html_parts) + '</div>'
        )

        # Colored token-ID chips mirroring the Encode tab.
        decode_token_ids_html_parts = []
        for i, token_id in enumerate(arr):
            color = get_color_for_token(token_id, seed=42)
            word_indices = token_to_words_map.get(i, [])
            word_labels = [words_with_positions[idx][0] for idx in word_indices if idx < len(words_with_positions)]
            word_label = ", ".join(word_labels[:2]) if word_labels else ""  # show first 2 words as label

            decode_token_ids_html_parts.append(
                f'<div class="decode-token-id-tag" data-token-idx="{i}" data-token-id="{token_id}" '
                f'style="background-color: {color}; padding: 6px 10px; margin: 4px; '
                f'border-radius: 6px; cursor: pointer; display: inline-block; vertical-align: top; '
                f'border: 2px solid transparent; transition: all 0.2s; text-align: center; min-width: 60px;" '
                f'onclick="highlightDecodeTokenId({i})" '
                f'onmouseover="this.style.borderColor=\'#333\'" '
                # BUGFIX: original selector had nested unescaped quotes — a JS
                # syntax error on every mouseout; classList check is equivalent.
                f'onmouseout="if(!this.classList.contains(\'highlighted\')) this.style.borderColor=\'transparent\'">'
                f'<div style="font-family: monospace; font-weight: bold; font-size: 14px; color: #000;">{token_id}</div>'
                f'<div style="font-size: 10px; color: #555; margin-top: 2px; word-break: break-word; max-width: 80px;">{word_label if word_label else "&nbsp;"}</div>'
                f'</div>'
            )

        decode_token_ids_html = (
            '<div style="padding: 10px; background: #f8f9fa; border-radius: 8px; margin-top: 10px;">'
            + "".join(decode_token_ids_html_parts) + '</div>'
        )

        return decoded_html, decode_token_ids_html, decoded_text

    except Exception as e:
        # Broad catch is deliberate: any parse/decode failure becomes a
        # friendly in-UI message instead of a stack trace.
        error_msg = f"❌ Invalid ID list: {str(e)}"
        return f"<div style='padding: 10px; color: red;'>{error_msg}</div>", "", error_msg
326
  # --------------------------------------
327
  # FASTAPI REST BACKEND
 
361
  # GRADIO FRONTEND
362
  # --------------------------------------
363
 
364
# JavaScript injected into the page <head> via gr.Blocks(head=highlight_js).
# Implements bidirectional click-highlighting between word <span>s and
# token-ID chips on both the Encode and Decode tabs; handler names are
# referenced from the onclick attributes emitted by encode_text/decode_ids.
# NOTE(review): original in-string indentation was lost in extraction; the
# JS below is re-indented conventionally, logic unchanged — confirm against
# the committed file.
highlight_js = """
<script>
let currentHighlighted = null;

function clearHighlights() {
    if (currentHighlighted) {
        currentHighlighted.forEach(el => {
            el.classList.remove('highlighted');
            el.style.borderColor = 'transparent';
            el.style.boxShadow = 'none';
        });
    }
    currentHighlighted = null;
}

function highlightToken(tokenIdx) {
    clearHighlights();
    const tokenEl = document.querySelector(`.token-tag[data-token-idx="${tokenIdx}"]`);
    if (!tokenEl) return;

    const tokenId = tokenEl.getAttribute('data-token-id');
    const highlighted = [tokenEl];

    // Highlight all tokens with same ID
    document.querySelectorAll(`.token-tag[data-token-id="${tokenId}"]`).forEach(el => {
        el.classList.add('highlighted');
        el.style.borderColor = '#ff0000';
        el.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
        highlighted.push(el);
    });

    // Highlight corresponding words
    document.querySelectorAll('.word-tag').forEach(wordEl => {
        const tokenIndices = wordEl.getAttribute('data-word-tokens').split(',');
        if (tokenIndices.includes(tokenIdx.toString())) {
            wordEl.classList.add('highlighted');
            wordEl.style.borderColor = '#ff0000';
            wordEl.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
            highlighted.push(wordEl);
        }
    });

    currentHighlighted = highlighted;
}

function highlightWord(tokenIndicesStr) {
    clearHighlights();
    const tokenIndices = tokenIndicesStr.split(',');
    const highlighted = [];

    // Highlight words
    document.querySelectorAll(`.word-tag[data-word-tokens="${tokenIndicesStr}"]`).forEach(el => {
        el.classList.add('highlighted');
        el.style.borderColor = '#ff0000';
        el.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
        highlighted.push(el);
    });

    // Highlight corresponding tokens
    tokenIndices.forEach(idx => {
        document.querySelectorAll(`.token-tag[data-token-idx="${idx}"]`).forEach(el => {
            el.classList.add('highlighted');
            el.style.borderColor = '#ff0000';
            el.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
            highlighted.push(el);
        });
    });

    currentHighlighted = highlighted;
}

function highlightDecodeToken(tokenIdx) {
    clearHighlights();
    const tokenEl = document.querySelector(`.decode-token-tag[data-token-idx="${tokenIdx}"]`);
    if (!tokenEl) return;

    const highlighted = [tokenEl];
    tokenEl.classList.add('highlighted');
    tokenEl.style.borderColor = '#ff0000';
    tokenEl.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';

    // Highlight corresponding words in decoded text
    document.querySelectorAll('.decode-word-tag').forEach(wordEl => {
        const tokenIndices = wordEl.getAttribute('data-word-tokens').split(',');
        if (tokenIndices.includes(tokenIdx.toString())) {
            wordEl.classList.add('highlighted');
            wordEl.style.borderColor = '#ff0000';
            wordEl.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
            highlighted.push(wordEl);
        }
    });

    currentHighlighted = highlighted;
}

function highlightInputWord(tokenIndicesStr) {
    clearHighlights();
    const tokenIndices = tokenIndicesStr.split(',');
    const highlighted = [];

    // Highlight input words
    document.querySelectorAll(`.input-word-tag[data-word-tokens="${tokenIndicesStr}"]`).forEach(el => {
        el.classList.add('highlighted');
        el.style.borderColor = '#ff0000';
        el.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
        highlighted.push(el);
    });

    // Highlight corresponding token IDs
    tokenIndices.forEach(idx => {
        document.querySelectorAll(`.token-id-tag[data-token-idx="${idx}"]`).forEach(el => {
            el.classList.add('highlighted');
            el.style.borderColor = '#ff0000';
            el.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
            el.style.transform = 'scale(1.1)';
            highlighted.push(el);
        });
    });

    currentHighlighted = highlighted;
}

function highlightTokenId(tokenIdx) {
    clearHighlights();
    const tokenIdEl = document.querySelector(`.token-id-tag[data-token-idx="${tokenIdx}"]`);
    if (!tokenIdEl) return;

    const highlighted = [tokenIdEl];
    tokenIdEl.classList.add('highlighted');
    tokenIdEl.style.borderColor = '#ff0000';
    tokenIdEl.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
    tokenIdEl.style.transform = 'scale(1.1)';

    // Find input words that contain this token
    document.querySelectorAll('.input-word-tag').forEach(wordEl => {
        const tokenIndices = wordEl.getAttribute('data-word-tokens');
        if (tokenIndices) {
            const tokenList = tokenIndices.split(',');
            if (tokenList.includes(tokenIdx.toString())) {
                wordEl.classList.add('highlighted');
                wordEl.style.borderColor = '#ff0000';
                wordEl.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
                highlighted.push(wordEl);
            }
        }
    });

    currentHighlighted = highlighted;
}

function highlightDecodeWord(tokenIndicesStr) {
    clearHighlights();
    const tokenIndices = tokenIndicesStr.split(',');
    const highlighted = [];

    // Highlight words
    document.querySelectorAll(`.decode-word-tag[data-word-tokens="${tokenIndicesStr}"]`).forEach(el => {
        el.classList.add('highlighted');
        el.style.borderColor = '#ff0000';
        el.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
        highlighted.push(el);
    });

    // Highlight corresponding token IDs
    tokenIndices.forEach(idx => {
        document.querySelectorAll(`.decode-token-id-tag[data-token-idx="${idx}"]`).forEach(el => {
            el.classList.add('highlighted');
            el.style.borderColor = '#ff0000';
            el.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
            el.style.transform = 'scale(1.1)';
            highlighted.push(el);
        });
    });

    currentHighlighted = highlighted;
}

function highlightDecodeTokenId(tokenIdx) {
    clearHighlights();
    const tokenIdEl = document.querySelector(`.decode-token-id-tag[data-token-idx="${tokenIdx}"]`);
    if (!tokenIdEl) return;

    const highlighted = [tokenIdEl];
    tokenIdEl.classList.add('highlighted');
    tokenIdEl.style.borderColor = '#ff0000';
    tokenIdEl.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
    tokenIdEl.style.transform = 'scale(1.1)';

    // Find decoded words that contain this token
    document.querySelectorAll('.decode-word-tag').forEach(wordEl => {
        const tokenIndices = wordEl.getAttribute('data-word-tokens');
        if (tokenIndices) {
            const tokenList = tokenIndices.split(',');
            if (tokenList.includes(tokenIdx.toString())) {
                wordEl.classList.add('highlighted');
                wordEl.style.borderColor = '#ff0000';
                wordEl.style.boxShadow = '0 0 8px rgba(255,0,0,0.5)';
                highlighted.push(wordEl);
            }
        }
    });

    currentHighlighted = highlighted;
}
</script>
"""
# Gradio UI. `head=highlight_js` injects the click-highlighting JS into the
# page <head>; the onclick attributes generated by encode_text/decode_ids
# call those handlers.
with gr.Blocks(title="Hindi Tokenizer", head=highlight_js) as demo:
    gr.Markdown("## 🔡 Hindi BPE Tokenizer — Encode / Decode")

    # Hidden component holding the per-token JSON emitted by encode_text
    # (not displayed; kept for client-side use).
    token_data_store = gr.State(value="")

    with gr.Tab("Encode"):
        text_in = gr.Textbox(label="Enter text", lines=3)

        # Clickable colored mirror of the input, filled in by encode_text.
        gr.Markdown("### 📝 Input Text (Click words to highlight token IDs)")
        input_html_out = gr.HTML(label="Clickable Input Text", value="<div style='padding: 10px; color: #666; font-style: italic;'>Enter text above and click Encode to see clickable words</div>")

        with gr.Row():
            token_count_out = gr.Number(label="Token Count", precision=0)
            compression_ratio_out = gr.Textbox(label="Compression Ratio (chars/token)", interactive=False)

        # Colored token-ID chips, filled in by encode_text.
        gr.Markdown("### Token IDs (Click to highlight words)")
        token_ids_html_out = gr.HTML(label="Token IDs with Labels")

        ids_out = gr.Textbox(label="Token IDs (CSV)", lines=4, max_lines=10, interactive=False)
        btn = gr.Button("Encode", variant="primary")
        # Output order must match encode_text's 6-tuple return.
        btn.click(encode_text, text_in, [ids_out, token_count_out, compression_ratio_out, token_ids_html_out, token_data_store, input_html_out])

    with gr.Tab("Decode"):
        ids_in = gr.Textbox(label="Comma-separated token IDs", lines=4)

        # Clickable colored decoded text, filled in by decode_ids.
        gr.Markdown("### 📝 Decoded Text (Click words to highlight token IDs)")
        decoded_text_html_out = gr.HTML(label="Clickable Decoded Text", value="<div style='padding: 10px; color: #666; font-style: italic;'>Enter token IDs above and click Decode to see clickable words</div>")

        gr.Markdown("### Token IDs (Click to highlight words)")
        decode_token_ids_html_out = gr.HTML(label="Token IDs with Labels")

        decoded_text_out = gr.Textbox(label="Decoded Text", lines=4, max_lines=10, interactive=False)

        btn3 = gr.Button("Decode", variant="primary")
        # Output order must match decode_ids' 3-tuple return.
        btn3.click(decode_ids, ids_in, [decoded_text_html_out, decode_token_ids_html_out, decoded_text_out])
 
609
  # Mount FastAPI + Gradio
610