dnzblgn committed on
Commit 0a581ef · verified · 1 Parent(s): 69dff55

Update app.py

Files changed (1)
  1. app.py +17 -14
app.py CHANGED
@@ -14,28 +14,22 @@ tokenizers = {
     "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
 }
 
-# Decode byte-level tokens back to UTF-8 for visual clarity (only for Tabularis models)
+# Decode byte-level tokens back to UTF-8 (for Tabularis only)
 def decode_byte_token(token):
     token_clean = token.replace("Ġ", "")
     try:
         byte_seq = bytes([ord(c) for c in token_clean])
         return byte_seq.decode("utf-8")
     except Exception:
-        return token  # fallback
+        return token_clean  # fallback
 
-# Token visualization
+# Token visualization function
 def visualize_tokens(text, tokenizer_name, show_token_ids):
     tokenizer = tokenizers[tokenizer_name]
     encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
     token_ids = encoded["input_ids"]
     tokens = tokenizer.convert_ids_to_tokens(token_ids)
 
-    # Decode byte-level tokens for your custom tokenizer
-    if tokenizer_name.startswith("Tabularis"):
-        readable_tokens = [decode_byte_token(t) for t in tokens]
-    else:
-        readable_tokens = tokens
-
     def random_pastel():
         r = lambda: random.randint(100, 255)
         return f"rgb({r()},{r()},{r()})"
@@ -48,10 +42,19 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
     )
 
     html_tokens = []
-    for token in readable_tokens:
+    for token in tokens:
+        visible_token = token
+
+        # Decode clean UTF-8 version for hover
+        if tokenizer_name.startswith("Tabularis"):
+            decoded = decode_byte_token(token)
+        else:
+            decoded = token
+
         color = "lightgray" if is_special_token(token) else random_pastel()
+
         html_token = f"""
-        <span style='
+        <span title="{decoded}" style='
             display:inline-block;
             margin:4px;
             padding:8px 12px;
@@ -60,7 +63,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
             font-size:18px;
             font-family:monospace;
             font-weight:bold;
-        '>{token}</span>
+        '>{visible_token}</span>
         """
         html_tokens.append(html_token)
 
@@ -69,7 +72,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
     if show_token_ids:
         html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
 
-    # Show decoded output using tokenizer.decode
+    # Decode full sentence
     try:
         decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
     except Exception:
@@ -79,7 +82,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
 
 # Gradio app
 with gr.Blocks() as app:
-    gr.Markdown("# 🚀 German Tokenizers ")
+    gr.Markdown("# 🚀 German Tokenizers")
 
     with gr.Row():
         with gr.Column():
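
For reference, a minimal standalone sketch of the byte-level decoding this commit keeps relying on (the decode_byte_token helper in the diff). It assumes GPT-2-style byte-level BPE tokens, where each character of a token stands for one raw byte via its code point and "Ġ" marks a leading space; the sample token "ĠÃ¼ber" is a hypothetical illustration, not taken from the app.

# Sketch only: assumes ord(char) equals the underlying byte value (GPT-2-style byte-level BPE).
def decode_byte_token(token):
    token_clean = token.replace("Ġ", "")                 # strip the leading-space marker
    try:
        byte_seq = bytes([ord(c) for c in token_clean])  # reinterpret each character as a raw byte
        return byte_seq.decode("utf-8")                   # reassemble multi-byte UTF-8 characters
    except Exception:
        return token_clean                                # fallback for characters outside 0-255

# Hypothetical example: the token "ĠÃ¼ber" carries the bytes 0xC3 0xBC 'b' 'e' 'r',
# which decode back to the readable German word "über".
print(decode_byte_token("ĠÃ¼ber"))  # -> über

With this commit, that decoded form is surfaced as a hover tooltip (the span's title attribute) while the raw token itself stays visible in the colored chip.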