Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import AutoTokenizer
|
|
|
|
| 3 |
|
| 4 |
# List of available tokenizers
|
| 5 |
tokenizers = [
|
|
@@ -10,10 +11,50 @@ tokenizers = [
|
|
| 10 |
"xlnet-base-cased"
|
| 11 |
]
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def tokenize_text(text, tokenizer_name):
|
| 14 |
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
|
| 15 |
-
tokens = tokenizer.
|
| 16 |
-
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def compare_tokenizers(text, selected_tokenizers):
|
| 19 |
results = {}
|
|
@@ -28,10 +69,11 @@ iface = gr.Interface(
|
|
| 28 |
gr.Textbox(label="Enter text to tokenize"),
|
| 29 |
gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers")
|
| 30 |
],
|
| 31 |
-
outputs=gr.
|
| 32 |
title="Tokenizer Comparison",
|
| 33 |
description="Compare tokenization results from different tokenizers.",
|
| 34 |
)
|
| 35 |
|
| 36 |
# Launch the app
|
| 37 |
-
iface.launch()
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import AutoTokenizer
|
| 3 |
+
import random
|
| 4 |
|
| 5 |
# List of available tokenizers
|
| 6 |
tokenizers = [
|
|
|
|
| 11 |
"xlnet-base-cased"
|
| 12 |
]
|
| 13 |
|
| 14 |
+
def generate_colored_html(tokens, decoded_tokens):
    """Render tokens as colour-highlighted HTML spans inside one ``<div>``.

    Each (token ID, decoded text) pair becomes a ``<span>`` with a randomly
    chosen background colour (never the same colour twice in a row) and a
    ``title`` tooltip giving the token index, decoded text and token ID.
    Known special-token markers are replaced with readable labels, and a
    token containing a newline is shown as ``[NEWLINE]`` followed by
    ``<br>``.

    The decoded text is HTML-escaped before being placed in the markup, so
    characters such as ``<``, ``&`` and quotes are displayed literally
    instead of being interpreted as tags or terminating the ``title``
    attribute.

    Args:
        tokens: Token IDs, in order.
        decoded_tokens: Decoded text for each ID, aligned with ``tokens``.
            The two are paired with ``zip``, so surplus items in the longer
            sequence are ignored.

    Returns:
        An HTML string: a ``<div>`` wrapping the space-separated spans.
    """
    # Function-scope import so this block does not depend on a change to
    # the file's import section.
    from html import escape

    colors = ["#FFDDC1", "#C1FFD4", "#D4C1FF", "#FFC1C1", "#C1FFFD"]
    text_color = "#000000"
    last_color = None
    background_color = "#F0F0F0"
    html_tokens = []

    special_token_replacements = {
        '<pad>': '[Padding]',
        '<s>': '[Start of Sentence]',
        '</s>': '[End of Sentence]',
        '<unk>': '[Unknown]',
        '<mask>': '[Masked]',
        '[CLS]': '[Class]',
        '[SEP]': '[Separator]'
    }

    for index, (token_id, piece) in enumerate(zip(tokens, decoded_tokens)):
        # str.replace is a no-op when the marker is absent, so no prior
        # membership test is needed. Replacement runs on the raw text,
        # before escaping, so markers like "<s>" are still recognisable.
        for special_token, replacement in special_token_replacements.items():
            piece = piece.replace(special_token, replacement)

        # Escape for both element content and quoted attribute values
        # (escape() also converts single and double quotes).
        safe_piece = escape(piece)
        hover_info = escape(
            f"Token Index: {index}, Token: {piece}, Token ID: {token_id}"
        )

        # Exclude the previous colour so adjacent tokens stay distinguishable.
        color = random.choice([c for c in colors if c != last_color])
        last_color = color

        if '\n' in piece:
            html_tokens.append(
                f"<span style='background-color: {color}; color: {text_color};' "
                f"title='{hover_info}'>[NEWLINE]</span><br>"
            )
        else:
            html_tokens.append(
                f'<span style="background-color: {color}; color: {text_color}; '
                f'text-decoration: none;" title="{hover_info}">{safe_piece}</span>'
            )

    html_output = " ".join(html_tokens)
    return f'<div style="background-color: {background_color}; padding: 10px;">{html_output}</div>'
|
| 51 |
+
|
| 52 |
def tokenize_text(text, tokenizer_name):
    """Tokenize ``text`` with the named tokenizer and return its HTML view.

    Loads the tokenizer through ``AutoTokenizer.from_pretrained``, encodes
    the text with special tokens included, decodes every token ID on its
    own, and passes both sequences to ``generate_colored_html``.

    Args:
        text: The text to tokenize.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``generate_colored_html`` returns for the token IDs and
        their individually decoded strings.
    """
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    token_ids = tok.encode(text, add_special_tokens=True)
    # Decode one ID at a time so each piece lines up with its token ID.
    pieces = list(map(tok.decode, token_ids))
    return generate_colored_html(token_ids, pieces)
|
| 58 |
|
| 59 |
def compare_tokenizers(text, selected_tokenizers):
|
| 60 |
results = {}
|
|
|
|
| 69 |
gr.Textbox(label="Enter text to tokenize"),
|
| 70 |
gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers")
|
| 71 |
],
|
| 72 |
+
outputs=gr.HTML(label="Tokenization Results"),
|
| 73 |
title="Tokenizer Comparison",
|
| 74 |
description="Compare tokenization results from different tokenizers.",
|
| 75 |
)
|
| 76 |
|
| 77 |
# Launch the app
|
| 78 |
+
iface.launch()
|
| 79 |
+
|