dnzblgn committed on
Commit
69dff55
·
verified ·
1 Parent(s): 3e59d07

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -16
app.py CHANGED
@@ -1,9 +1,8 @@
1
  import random
2
  import gradio as gr
3
- from transformers import AutoTokenizer
4
- from transformers import PreTrainedTokenizerFast
5
 
6
- # Load all tokenizers (at startup)
7
  tokenizers = {
8
  "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
9
  "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
@@ -15,28 +14,42 @@ tokenizers = {
15
  "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
16
  }
17
 
18
- # Fancy token visualization with random colors
 
 
 
 
 
 
 
 
 
19
  def visualize_tokens(text, tokenizer_name, show_token_ids):
20
  tokenizer = tokenizers[tokenizer_name]
21
  encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
22
  token_ids = encoded["input_ids"]
23
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
24
 
 
 
 
 
 
 
25
  def random_pastel():
26
  r = lambda: random.randint(100, 255)
27
  return f"rgb({r()},{r()},{r()})"
28
 
29
- # Helper to detect special tokens
30
  def is_special_token(token):
31
- return token.startswith('[') and token.endswith(']') or token.startswith('<') and token.endswith('>') or token in tokenizer.all_special_tokens
 
 
 
 
32
 
33
  html_tokens = []
34
- for idx, token in enumerate(tokens):
35
- if is_special_token(token):
36
- color = "lightgray" # special token color
37
- else:
38
- color = random_pastel()
39
-
40
  html_token = f"""
41
  <span style='
42
  display:inline-block;
@@ -52,12 +65,17 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
52
  html_tokens.append(html_token)
53
 
54
  html_output = "".join(html_tokens)
55
-
56
  if show_token_ids:
57
  html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
58
 
59
- return html_output, f"🔢 Token Count: {len(tokens)}"
 
 
 
 
60
 
 
61
 
62
  # Gradio app
63
  with gr.Blocks() as app:
@@ -72,12 +90,13 @@ with gr.Blocks() as app:
72
  with gr.Column():
73
  html_output = gr.HTML(label="Tokens Visualized")
74
  token_count = gr.Label(label="Token Count")
 
75
 
76
  tokenize_btn.click(
77
  visualize_tokens,
78
  inputs=[text_input, tokenizer_choice, show_ids],
79
- outputs=[html_output, token_count]
80
  )
81
 
82
- # Launch
83
  app.launch()
 
1
  import random
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, PreTrainedTokenizerFast
 
4
 
5
+ # Load all tokenizers at startup
6
  tokenizers = {
7
  "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
8
  "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
 
14
  "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
15
  }
16
 
17
+ # Decode byte-level tokens back to UTF-8 for visual clarity (only for Tabularis models)
18
def _non_ascii_byte_placeholders():
    """Build the placeholder-character -> byte table for non-ASCII bytes.

    Byte-level BPE vocabularies (GPT-2 style) keep "printable" bytes as the
    Latin-1 character with the same code point and replace every other byte,
    in ascending order, with the characters U+0100, U+0101, ...  Only the
    placeholders for bytes >= 0x80 are returned: those are the ones needed to
    reassemble multi-byte UTF-8 sequences.  Placeholders that stand for ASCII
    control bytes are intentionally left out so that such tokens keep their
    visible marker instead of turning into invisible whitespace.
    """
    self_mapped = (
        set(range(0x21, 0x7F)) | set(range(0xA1, 0xAD)) | set(range(0xAE, 0x100))
    )
    table = {}
    offset = 0
    for byte in range(256):
        if byte in self_mapped:
            continue
        if byte >= 0x80:
            table[chr(256 + offset)] = byte
        offset += 1
    return table


_PLACEHOLDER_TO_BYTE = _non_ascii_byte_placeholders()


def decode_byte_token(token):
    """Return a human-readable form of a byte-level BPE token.

    The "Ġ" space marker is removed, every remaining character is turned
    back into the byte it represents, and the bytes are decoded as UTF-8.

    Args:
        token: Token string as produced by ``convert_ids_to_tokens``.

    Returns:
        The decoded text, or ``token`` unchanged when it cannot be decoded
        (a character that represents no byte, or an incomplete/invalid UTF-8
        sequence such as a lone lead byte).
    """
    token_clean = token.replace("Ġ", "")
    try:
        # Characters without a placeholder entry represent themselves; a
        # code point above 255 makes bytes() raise ValueError.
        byte_seq = bytes(_PLACEHOLDER_TO_BYTE.get(c, ord(c)) for c in token_clean)
        return byte_seq.decode("utf-8")
    except ValueError:  # includes UnicodeDecodeError
        return token  # fallback: show the raw token
25
+
26
+ # Token visualization
27
  def visualize_tokens(text, tokenizer_name, show_token_ids):
28
  tokenizer = tokenizers[tokenizer_name]
29
  encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
30
  token_ids = encoded["input_ids"]
31
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
32
 
33
+ # Decode byte-level tokens for your custom tokenizer
34
+ if tokenizer_name.startswith("Tabularis"):
35
+ readable_tokens = [decode_byte_token(t) for t in tokens]
36
+ else:
37
+ readable_tokens = tokens
38
+
39
  def random_pastel():
40
  r = lambda: random.randint(100, 255)
41
  return f"rgb({r()},{r()},{r()})"
42
 
 
43
  def is_special_token(token):
44
+ return (
45
+ token.startswith('[') and token.endswith(']')
46
+ or token.startswith('<') and token.endswith('>')
47
+ or token in tokenizer.all_special_tokens
48
+ )
49
 
50
  html_tokens = []
51
+ for token in readable_tokens:
52
+ color = "lightgray" if is_special_token(token) else random_pastel()
 
 
 
 
53
  html_token = f"""
54
  <span style='
55
  display:inline-block;
 
65
  html_tokens.append(html_token)
66
 
67
  html_output = "".join(html_tokens)
68
+
69
  if show_token_ids:
70
  html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
71
 
72
+ # Show decoded output using tokenizer.decode
73
+ try:
74
+ decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
75
+ except Exception:
76
+ decoded_output = "[Could not decode using this tokenizer]"
77
 
78
+ return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
79
 
80
  # Gradio app
81
  with gr.Blocks() as app:
 
90
  with gr.Column():
91
  html_output = gr.HTML(label="Tokens Visualized")
92
  token_count = gr.Label(label="Token Count")
93
+ decoded_output = gr.Textbox(label="Decoded Text", lines=3)
94
 
95
  tokenize_btn.click(
96
  visualize_tokens,
97
  inputs=[text_input, tokenizer_choice, show_ids],
98
+ outputs=[html_output, token_count, decoded_output]
99
  )
100
 
101
+ # Launch the app
102
  app.launch()