Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,7 +23,6 @@ def decode_byte_token(token):
|
|
| 23 |
except Exception:
|
| 24 |
return token_clean # fallback
|
| 25 |
|
| 26 |
-
# Token visualization function
|
| 27 |
def visualize_tokens(text, tokenizer_name, show_token_ids):
|
| 28 |
tokenizer = tokenizers[tokenizer_name]
|
| 29 |
encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
|
|
@@ -43,18 +42,27 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
|
|
| 43 |
|
| 44 |
html_tokens = []
|
| 45 |
for token in tokens:
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
color = "lightgray" if is_special_token(token) else random_pastel()
|
| 55 |
|
| 56 |
html_token = f"""
|
| 57 |
-
<span title="{
|
| 58 |
display:inline-block;
|
| 59 |
margin:4px;
|
| 60 |
padding:8px 12px;
|
|
@@ -63,7 +71,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
|
|
| 63 |
font-size:18px;
|
| 64 |
font-family:monospace;
|
| 65 |
font-weight:bold;
|
| 66 |
-
'>{
|
| 67 |
"""
|
| 68 |
html_tokens.append(html_token)
|
| 69 |
|
|
@@ -80,6 +88,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
|
|
| 80 |
|
| 81 |
return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
|
| 82 |
|
|
|
|
| 83 |
# Gradio app
|
| 84 |
with gr.Blocks() as app:
|
| 85 |
gr.Markdown("# 🚀 German Tokenizers")
|
|
|
|
| 23 |
except Exception:
|
| 24 |
return token_clean # fallback
|
| 25 |
|
|
|
|
| 26 |
def visualize_tokens(text, tokenizer_name, show_token_ids):
|
| 27 |
tokenizer = tokenizers[tokenizer_name]
|
| 28 |
encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
|
|
|
|
| 42 |
|
| 43 |
html_tokens = []
|
| 44 |
for token in tokens:
|
| 45 |
+
prefix = ""
|
| 46 |
+
token_body = token
|
| 47 |
+
|
| 48 |
+
# For your byte-level tokenizers: separate Ġ from body
|
| 49 |
+
if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
|
| 50 |
+
prefix = "Ġ"
|
| 51 |
+
token_body = token[1:]
|
| 52 |
+
|
| 53 |
+
# Try to decode only the body (not Ġ)
|
| 54 |
+
try:
|
| 55 |
+
byte_seq = bytes([ord(c) for c in token_body])
|
| 56 |
+
decoded = byte_seq.decode("utf-8")
|
| 57 |
+
except Exception:
|
| 58 |
+
decoded = token_body # fallback
|
| 59 |
+
|
| 60 |
+
# Display = prefix (e.g. Ġ) + decoded characters
|
| 61 |
+
label = f"{prefix}{decoded}"
|
| 62 |
color = "lightgray" if is_special_token(token) else random_pastel()
|
| 63 |
|
| 64 |
html_token = f"""
|
| 65 |
+
<span title="{token}" style='
|
| 66 |
display:inline-block;
|
| 67 |
margin:4px;
|
| 68 |
padding:8px 12px;
|
|
|
|
| 71 |
font-size:18px;
|
| 72 |
font-family:monospace;
|
| 73 |
font-weight:bold;
|
| 74 |
+
'>{label}</span>
|
| 75 |
"""
|
| 76 |
html_tokens.append(html_token)
|
| 77 |
|
|
|
|
| 88 |
|
| 89 |
return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
|
| 90 |
|
| 91 |
+
|
| 92 |
# Gradio app
|
| 93 |
with gr.Blocks() as app:
|
| 94 |
gr.Markdown("# 🚀 German Tokenizers")
|