dnzblgn committed on
Commit
9a93602
·
verified ·
1 Parent(s): 0a581ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -11
app.py CHANGED
@@ -23,7 +23,6 @@ def decode_byte_token(token):
23
  except Exception:
24
  return token_clean # fallback
25
 
26
- # Token visualization function
27
  def visualize_tokens(text, tokenizer_name, show_token_ids):
28
  tokenizer = tokenizers[tokenizer_name]
29
  encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
@@ -43,18 +42,27 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
43
 
44
  html_tokens = []
45
  for token in tokens:
46
- visible_token = token
47
-
48
- # Decode clean UTF-8 version for hover
49
- if tokenizer_name.startswith("Tabularis"):
50
- decoded = decode_byte_token(token)
51
- else:
52
- decoded = token
53
-
 
 
 
 
 
 
 
 
 
54
  color = "lightgray" if is_special_token(token) else random_pastel()
55
 
56
  html_token = f"""
57
- <span title="{decoded}" style='
58
  display:inline-block;
59
  margin:4px;
60
  padding:8px 12px;
@@ -63,7 +71,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
63
  font-size:18px;
64
  font-family:monospace;
65
  font-weight:bold;
66
- '>{visible_token}</span>
67
  """
68
  html_tokens.append(html_token)
69
 
@@ -80,6 +88,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
80
 
81
  return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
82
 
 
83
  # Gradio app
84
  with gr.Blocks() as app:
85
  gr.Markdown("# 🚀 German Tokenizers")
 
23
  except Exception:
24
  return token_clean # fallback
25
 
 
26
  def visualize_tokens(text, tokenizer_name, show_token_ids):
27
  tokenizer = tokenizers[tokenizer_name]
28
  encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
 
42
 
43
  html_tokens = []
44
  for token in tokens:
45
+ prefix = ""
46
+ token_body = token
47
+
48
+ # For your byte-level tokenizers: separate Ġ from body
49
+ if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
50
+ prefix = "Ġ"
51
+ token_body = token[1:]
52
+
53
+ # Try to decode only the body (not Ġ)
54
+ try:
55
+ byte_seq = bytes([ord(c) for c in token_body])
56
+ decoded = byte_seq.decode("utf-8")
57
+ except Exception:
58
+ decoded = token_body # fallback
59
+
60
+ # Display = prefix (e.g. Ġ) + decoded characters
61
+ label = f"{prefix}{decoded}"
62
  color = "lightgray" if is_special_token(token) else random_pastel()
63
 
64
  html_token = f"""
65
+ <span title="{token}" style='
66
  display:inline-block;
67
  margin:4px;
68
  padding:8px 12px;
 
71
  font-size:18px;
72
  font-family:monospace;
73
  font-weight:bold;
74
+ '>{label}</span>
75
  """
76
  html_tokens.append(html_token)
77
 
 
88
 
89
  return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
90
 
91
+
92
  # Gradio app
93
  with gr.Blocks() as app:
94
  gr.Markdown("# 🚀 German Tokenizers")