Update app.py
app.py CHANGED
@@ -1,5 +1,6 @@
 import random
 import gradio as gr
+import unicodedata
 from transformers import AutoTokenizer, PreTrainedTokenizerFast
 
 # Load all tokenizers at startup
@@ -14,15 +15,16 @@ tokenizers = {
     "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
 }
 
-#
+# Decode byte-level tokens back to UTF-8, normalized (fixes ß, ä, ö, ü)
 def decode_byte_token(token):
     token_clean = token.replace("Ġ", "")
     try:
         byte_seq = bytes([ord(c) for c in token_clean])
-        return byte_seq.decode("utf-8")
+        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
     except Exception:
         return token_clean  # fallback
 
+
 def visualize_tokens(text, tokenizer_name, show_token_ids):
     tokenizer = tokenizers[tokenizer_name]
     encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
@@ -45,19 +47,19 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
         prefix = ""
         token_body = token
 
-        #
+        # Handle the byte-level space marker
         if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
            prefix = "Ġ"
            token_body = token[1:]
 
-        #
+        # Decode the token body only (not Ġ), with normalization
         try:
             byte_seq = bytes([ord(c) for c in token_body])
-            decoded = byte_seq.decode("utf-8")
+            decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
         except Exception:
             decoded = token_body  # fallback
 
-        #
+        # Combine the prefix with the decoded token
         label = f"{prefix}{decoded}"
         color = "lightgray" if is_special_token(token) else random_pastel()
 
@@ -80,7 +82,7 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
     if show_token_ids:
         html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
 
-    #
+    # Decode the full sentence
     try:
         decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
     except Exception:
@@ -88,8 +90,6 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
 
     return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
 
-
-# Gradio app
 with gr.Blocks() as app:
     gr.Markdown("# 🚀 German Tokenizers")
 
@@ -110,5 +110,4 @@ with gr.Blocks() as app:
         outputs=[html_output, token_count, decoded_output]
     )
 
-# Launch the app
 app.launch()
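
For context on what decode_byte_token is undoing: byte-level BPE tokenizers (GPT-2 style) represent each UTF-8 byte of the input as a printable character, which is why mapping each character back through ord() recovers the raw bytes. A minimal sketch, using a hypothetical token string rather than output from any of the loaded tokenizers:

# Byte-level BPE encodes the UTF-8 bytes of "ü" (0xC3 0xBC) as the
# printable characters "Ã" (0xC3) and "¼" (0xBC); ord() maps each
# character back to its byte value before UTF-8 decoding.
token = "Ã¼"  # hypothetical byte-level token for "ü"
print(bytes([ord(c) for c in token]).decode("utf-8"))  # -> ü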
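
And what the added unicodedata.normalize("NFC", ...) call contributes: if decoding ever yields a decomposed character sequence (base letter plus combining mark), NFC composes it into single code points, so ß, ä, ö, ü compare and render as expected. A small illustration of that behavior:

import unicodedata

decomposed = "scho\u0308n"  # "o" + U+0308 COMBINING DIAERESIS
print(decomposed == "schön")                                # False
print(unicodedata.normalize("NFC", decomposed) == "schön")  # True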