dnzblgn committed on
Commit 6bed62f · verified · 1 Parent(s): 87989ba

Update app.py

Files changed (1)
  1. app.py  +5 -16
app.py CHANGED

@@ -3,7 +3,6 @@ import gradio as gr
 import unicodedata
 from transformers import AutoTokenizer, PreTrainedTokenizerFast
 
-# Load all tokenizers at startup
 tokenizers = {
     "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
     "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
@@ -15,7 +14,7 @@ tokenizers = {
     "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
 }
 
-#decode byte-level tokens back to UTF-8, normalized (fixes ß, ä, ö, ü)
+# Decode byte-level tokens back to UTF-8, normalized
 def decode_byte_token(token):
     token_clean = token.replace("Ġ", "")
     try:
@@ -24,7 +23,6 @@ def decode_byte_token(token):
     except Exception:
         return token_clean  # fallback
 
-
 def visualize_tokens(text, tokenizer_name, show_token_ids):
     tokenizer = tokenizers[tokenizer_name]
     encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
@@ -47,19 +45,16 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
         prefix = ""
         token_body = token
 
-        # Handle byte-level space marker
         if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
             prefix = "Ġ"
             token_body = token[1:]
 
-        # Decode the token body only (not Ġ), with normalization
         try:
             byte_seq = bytes([ord(c) for c in token_body])
             decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
         except Exception:
-            decoded = token_body # fallback
+            decoded = token_body
 
-        # Combine prefix with decoded token
         label = f"{prefix}{decoded}"
         color = "lightgray" if is_special_token(token) else random_pastel()
 
@@ -82,14 +77,9 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
     if show_token_ids:
         html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
 
-    #decode full sentence
-    try:
-        decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
-    except Exception:
-        decoded_output = "[Could not decode using this tokenizer]"
-
-    return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
+    return html_output, f"🔢 Token Count: {len(tokens)}"
 
+#app
 with gr.Blocks() as app:
     gr.Markdown("# 🚀 German Tokenizers")
 
@@ -102,12 +92,11 @@ with gr.Blocks() as app:
     with gr.Column():
         html_output = gr.HTML(label="Tokens Visualized")
         token_count = gr.Label(label="Token Count")
-        decoded_output = gr.Textbox(label="Decoded Text", lines=3)
 
     tokenize_btn.click(
         visualize_tokens,
         inputs=[text_input, tokenizer_choice, show_ids],
-        outputs=[html_output, token_count, decoded_output]
+        outputs=[html_output, token_count]
     )
 
 app.launch()
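For reference, a minimal standalone sketch of the byte-to-UTF-8 round trip that decode_byte_token and the try block in visualize_tokens rely on. It assumes a byte-level vocabulary in which "Ġ" marks a leading space and each remaining character's code point equals the raw byte value, which is the same simplifying assumption the app's ord()-based conversion makes (GPT-2-style byte maps remap some bytes, so those tokens fall through to the fallback):

import unicodedata

def decode_byte_token(token: str) -> str:
    # Strip the byte-level leading-space marker before decoding.
    token_body = token[1:] if token.startswith("Ġ") else token
    try:
        byte_seq = bytes(ord(c) for c in token_body)  # chars -> raw bytes
        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
    except (ValueError, UnicodeDecodeError):
        return token_body  # fallback: show the raw token text

# "ä" is carried as the two byte-characters "Ã" (0xC3) and "¤" (0xA4):
print(decode_byte_token("Ã¤"))    # -> ä
print(decode_byte_token("Ġwie"))  # -> wie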
 
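And a minimal sketch of the Gradio wiring after this commit: with the decoded-text Textbox removed, the click handler returns exactly two values, matched by the two remaining output components. The visualize function below is a hypothetical stand-in for visualize_tokens; the Blocks/Button/HTML/Label calls mirror the app above:

import gradio as gr

def visualize(text):
    # Hypothetical stand-in for visualize_tokens: returns (html, count label).
    html = " ".join(f"<span style='background:lightgray'>{t}</span>"
                    for t in text.split())
    return html, f"🔢 Token Count: {len(text.split())}"

with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Enter text")
    tokenize_btn = gr.Button("Tokenize")
    html_output = gr.HTML(label="Tokens Visualized")
    token_count = gr.Label(label="Token Count")
    # Two return values -> two output components; the counts must match.
    tokenize_btn.click(visualize,
                       inputs=[text_input],
                       outputs=[html_output, token_count])

demo.launch()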