Spaces:

dnzblgn
/

Tokenizers

Build error

App Files Community

dnzblgn committed on May 6, 2025

Commit

6bed62f

verified ·

1 Parent(s): 87989ba

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -16

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import gradio as gr
 import unicodedata
 from transformers import AutoTokenizer, PreTrainedTokenizerFast
-# Load all tokenizers at startup
 tokenizers = {
     "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
     "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
@@ -15,7 +14,7 @@ tokenizers = {
     "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
 }
-#decode byte-level tokens back to UTF-8, normalized (fixes ß, ä, ö, ü)
 def decode_byte_token(token):
     token_clean = token.replace("Ġ", "")
     try:
@@ -24,7 +23,6 @@ def decode_byte_token(token):
     except Exception:
         return token_clean  # fallback
 def visualize_tokens(text, tokenizer_name, show_token_ids):
     tokenizer = tokenizers[tokenizer_name]
     encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
@@ -47,19 +45,16 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
         prefix = ""
         token_body = token
-        # Handle byte-level space marker
         if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
             prefix = "Ġ"
             token_body = token[1:]
-        # Decode the token body only (not Ġ), with normalization
         try:
             byte_seq = bytes([ord(c) for c in token_body])
             decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
         except Exception:
-            decoded = token_body  # fallback
-        # Combine prefix with decoded token
         label = f"{prefix}{decoded}"
         color = "lightgray" if is_special_token(token) else random_pastel()
@@ -82,14 +77,9 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
     if show_token_ids:
         html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
-    #decode full sentence
-    try:
-        decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
-    except Exception:
-        decoded_output = "[Could not decode using this tokenizer]"
-    return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
 with gr.Blocks() as app:
     gr.Markdown("# 🚀 German Tokenizers")
@@ -102,12 +92,11 @@ with gr.Blocks() as app:
         with gr.Column():
             html_output = gr.HTML(label="Tokens Visualized")
             token_count = gr.Label(label="Token Count")
-            decoded_output = gr.Textbox(label="Decoded Text", lines=3)
     tokenize_btn.click(
         visualize_tokens,
         inputs=[text_input, tokenizer_choice, show_ids],
-        outputs=[html_output, token_count, decoded_output]
     )
 app.launch()

 import unicodedata
 from transformers import AutoTokenizer, PreTrainedTokenizerFast
 tokenizers = {
     "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
     "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
     "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
 }
+# Decode byte-level tokens back to UTF-8, normalized
 def decode_byte_token(token):
     token_clean = token.replace("Ġ", "")
     try:
     except Exception:
         return token_clean  # fallback
 def visualize_tokens(text, tokenizer_name, show_token_ids):
     tokenizer = tokenizers[tokenizer_name]
     encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
         prefix = ""
         token_body = token
         if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
             prefix = "Ġ"
             token_body = token[1:]
         try:
             byte_seq = bytes([ord(c) for c in token_body])
             decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
         except Exception:
+            decoded = token_body
         label = f"{prefix}{decoded}"
         color = "lightgray" if is_special_token(token) else random_pastel()
     if show_token_ids:
         html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
+    return html_output, f"🔢 Token Count: {len(tokens)}"
+#app
 with gr.Blocks() as app:
     gr.Markdown("# 🚀 German Tokenizers")
         with gr.Column():
             html_output = gr.HTML(label="Tokens Visualized")
             token_count = gr.Label(label="Token Count")
     tokenize_btn.click(
         visualize_tokens,
         inputs=[text_input, tokenizer_choice, show_ids],
+        outputs=[html_output, token_count]
     )
 app.launch()