dnzblgn committed
Commit 87989ba · verified · 1 Parent(s): 9a93602

Update app.py

Files changed (1): app.py (+9 -10)
app.py CHANGED
@@ -1,5 +1,6 @@
 import random
 import gradio as gr
+import unicodedata
 from transformers import AutoTokenizer, PreTrainedTokenizerFast
 
 # Load all tokenizers at startup
@@ -14,15 +15,16 @@ tokenizers = {
     "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
 }
 
-# Decode byte-level tokens back to UTF-8 (for Tabularis only)
+# Decode byte-level tokens back to UTF-8, normalized (fixes ß, ä, ö, ü)
 def decode_byte_token(token):
     token_clean = token.replace("Ġ", "")
     try:
         byte_seq = bytes([ord(c) for c in token_clean])
-        return byte_seq.decode("utf-8")
+        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
     except Exception:
         return token_clean  # fallback
 
+
 def visualize_tokens(text, tokenizer_name, show_token_ids):
     tokenizer = tokenizers[tokenizer_name]
     encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
@@ -45,19 +47,19 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
         prefix = ""
         token_body = token
 
-        # For your byte-level tokenizers: separate Ġ from body
+        # Handle byte-level space marker
         if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
             prefix = "Ġ"
             token_body = token[1:]
 
-        # Try to decode only the body (not Ġ)
+        # Decode the token body only (not Ġ), with normalization
         try:
             byte_seq = bytes([ord(c) for c in token_body])
-            decoded = byte_seq.decode("utf-8")
+            decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
         except Exception:
             decoded = token_body  # fallback
 
-        # Display = prefix (e.g. Ġ) + decoded characters
+        # Combine prefix with decoded token
         label = f"{prefix}{decoded}"
         color = "lightgray" if is_special_token(token) else random_pastel()
 
@@ -80,7 +82,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
     if show_token_ids:
         html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
 
-    # Decode full sentence
+    # Decode the full sentence
     try:
         decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
     except Exception:
@@ -88,8 +90,6 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
 
     return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
 
-
-# Gradio app
 with gr.Blocks() as app:
     gr.Markdown("# 🚀 German Tokenizers")
 
@@ -110,5 +110,4 @@ with gr.Blocks() as app:
         outputs=[html_output, token_count, decoded_output]
     )
 
-# Launch the app
 app.launch()
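
For context, a minimal standalone sketch of what the new normalization step does. The token body "Ã¤" is a hypothetical example (the two UTF-8 bytes of "ä" rendered as printable characters); the actual byte-to-character mapping used by the Tabularis tokenizers may differ.

import unicodedata

# A byte-level BPE tokenizer stores "ä" as its two UTF-8 bytes
# (0xC3, 0xA4) rendered as the printable characters "Ã¤".
token_body = "Ã¤"                          # hypothetical token body
raw = bytes(ord(c) for c in token_body)    # b'\xc3\xa4'
text = raw.decode("utf-8")                 # "ä"

# NFC collapses decomposed sequences (base letter + combining mark)
# into single precomposed code points, so ä/ö/ü display as one glyph.
decomposed = "a\u0308"                     # "a" + COMBINING DIAERESIS
print(unicodedata.normalize("NFC", decomposed) == "\u00e4")  # True
print(unicodedata.normalize("NFC", text))                    # ä

NFC is a no-op on text that is already precomposed, so applying it unconditionally in decode_byte_token is safe.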