Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,7 +3,6 @@ import gradio as gr
|
|
| 3 |
import unicodedata
|
| 4 |
from transformers import AutoTokenizer, PreTrainedTokenizerFast
|
| 5 |
|
| 6 |
-
# Load all tokenizers at startup
|
| 7 |
tokenizers = {
|
| 8 |
"Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
|
| 9 |
"Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
|
|
@@ -15,7 +14,7 @@ tokenizers = {
|
|
| 15 |
"DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
|
| 16 |
}
|
| 17 |
|
| 18 |
-
#
|
| 19 |
def decode_byte_token(token):
|
| 20 |
token_clean = token.replace("Ġ", "")
|
| 21 |
try:
|
|
@@ -24,7 +23,6 @@ def decode_byte_token(token):
|
|
| 24 |
except Exception:
|
| 25 |
return token_clean # fallback
|
| 26 |
|
| 27 |
-
|
| 28 |
def visualize_tokens(text, tokenizer_name, show_token_ids):
|
| 29 |
tokenizer = tokenizers[tokenizer_name]
|
| 30 |
encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
|
|
@@ -47,19 +45,16 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
|
|
| 47 |
prefix = ""
|
| 48 |
token_body = token
|
| 49 |
|
| 50 |
-
# Handle byte-level space marker
|
| 51 |
if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
|
| 52 |
prefix = "Ġ"
|
| 53 |
token_body = token[1:]
|
| 54 |
|
| 55 |
-
# Decode the token body only (not Ġ), with normalization
|
| 56 |
try:
|
| 57 |
byte_seq = bytes([ord(c) for c in token_body])
|
| 58 |
decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
|
| 59 |
except Exception:
|
| 60 |
-
decoded = token_body
|
| 61 |
|
| 62 |
-
# Combine prefix with decoded token
|
| 63 |
label = f"{prefix}{decoded}"
|
| 64 |
color = "lightgray" if is_special_token(token) else random_pastel()
|
| 65 |
|
|
@@ -82,14 +77,9 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
|
|
| 82 |
if show_token_ids:
|
| 83 |
html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
|
| 84 |
|
| 85 |
-
|
| 86 |
-
try:
|
| 87 |
-
decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
|
| 88 |
-
except Exception:
|
| 89 |
-
decoded_output = "[Could not decode using this tokenizer]"
|
| 90 |
-
|
| 91 |
-
return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
|
| 92 |
|
|
|
|
| 93 |
with gr.Blocks() as app:
|
| 94 |
gr.Markdown("# 🚀 German Tokenizers")
|
| 95 |
|
|
@@ -102,12 +92,11 @@ with gr.Blocks() as app:
|
|
| 102 |
with gr.Column():
|
| 103 |
html_output = gr.HTML(label="Tokens Visualized")
|
| 104 |
token_count = gr.Label(label="Token Count")
|
| 105 |
-
decoded_output = gr.Textbox(label="Decoded Text", lines=3)
|
| 106 |
|
| 107 |
tokenize_btn.click(
|
| 108 |
visualize_tokens,
|
| 109 |
inputs=[text_input, tokenizer_choice, show_ids],
|
| 110 |
-
outputs=[html_output, token_count
|
| 111 |
)
|
| 112 |
|
| 113 |
app.launch()
|
|
|
|
| 3 |
import unicodedata
|
| 4 |
from transformers import AutoTokenizer, PreTrainedTokenizerFast
|
| 5 |
|
|
|
|
| 6 |
tokenizers = {
|
| 7 |
"Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
|
| 8 |
"Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
|
|
|
|
| 14 |
"DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
|
| 15 |
}
|
| 16 |
|
| 17 |
+
# Decode byte-level tokens back to UTF-8, normalized
|
| 18 |
def decode_byte_token(token):
|
| 19 |
token_clean = token.replace("Ġ", "")
|
| 20 |
try:
|
|
|
|
| 23 |
except Exception:
|
| 24 |
return token_clean # fallback
|
| 25 |
|
|
|
|
| 26 |
def visualize_tokens(text, tokenizer_name, show_token_ids):
|
| 27 |
tokenizer = tokenizers[tokenizer_name]
|
| 28 |
encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
|
|
|
|
| 45 |
prefix = ""
|
| 46 |
token_body = token
|
| 47 |
|
|
|
|
| 48 |
if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
|
| 49 |
prefix = "Ġ"
|
| 50 |
token_body = token[1:]
|
| 51 |
|
|
|
|
| 52 |
try:
|
| 53 |
byte_seq = bytes([ord(c) for c in token_body])
|
| 54 |
decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
|
| 55 |
except Exception:
|
| 56 |
+
decoded = token_body
|
| 57 |
|
|
|
|
| 58 |
label = f"{prefix}{decoded}"
|
| 59 |
color = "lightgray" if is_special_token(token) else random_pastel()
|
| 60 |
|
|
|
|
| 77 |
if show_token_ids:
|
| 78 |
html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
|
| 79 |
|
| 80 |
+
return html_output, f"🔢 Token Count: {len(tokens)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
+
# Gradio UI: page layout, output widgets, and the tokenize-button click wiring
|
| 83 |
with gr.Blocks() as app:
|
| 84 |
gr.Markdown("# 🚀 German Tokenizers")
|
| 85 |
|
|
|
|
| 92 |
with gr.Column():
|
| 93 |
html_output = gr.HTML(label="Tokens Visualized")
|
| 94 |
token_count = gr.Label(label="Token Count")
|
|
|
|
| 95 |
|
| 96 |
tokenize_btn.click(
|
| 97 |
visualize_tokens,
|
| 98 |
inputs=[text_input, tokenizer_choice, show_ids],
|
| 99 |
+
outputs=[html_output, token_count]
|
| 100 |
)
|
| 101 |
|
| 102 |
app.launch()
|