rt2195355 committed on
Commit
3a03bb1
·
verified ·
1 Parent(s): 5b7a444

Create app.py

Browse files

Deployed the tokenizers.

Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import BertTokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
3
+ from tokenizers import ByteLevelBPETokenizer
4
+ from gensim.models import FastText
5
# Hub checkpoint identifiers; the PolyLM and ByT5 ids are each used for both a
# tokenizer and a model, so they are named once here.
BERT_CHECKPOINT = "bert-base-cased"
MBERT_CHECKPOINT = "bert-base-multilingual-cased"
POLYLM_CHECKPOINT = "DAMO-NLP-MT/polylm-1.7b"
BYT5_CHECKPOINT = "google/byt5-small"

# WordPiece tokenizers (monolingual and multilingual BERT).
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
mbert_tokenizer = BertTokenizer.from_pretrained(MBERT_CHECKPOINT)

# Untrained byte-level BPE tokenizer; process_text() trains it on demand.
bpe_tokenizer = ByteLevelBPETokenizer()

# NOTE(review): fasttext_model, polylm_model and byt5_model are assigned here
# but not referenced anywhere else in the visible file; only the tokenizers
# are used by process_text(). Confirm whether the model loads are needed.
fasttext_model = FastText(vector_size=100, window=5, min_count=1)

polylm_tokenizer = AutoTokenizer.from_pretrained(POLYLM_CHECKPOINT)
polylm_model = AutoModelForCausalLM.from_pretrained(POLYLM_CHECKPOINT)

byt5_tokenizer = AutoTokenizer.from_pretrained(BYT5_CHECKPOINT)
byt5_model = AutoModelForSeq2SeqLM.from_pretrained(BYT5_CHECKPOINT)
15
+
16
def process_text(input_text, show_tokens, tokenizer_type, display_mode):
    """Tokenize ``input_text`` and render the result as HTML-safe markup.

    Args:
        input_text: Raw user text. ``None`` is treated as the empty string.
        show_tokens: In "Tokens" mode, wrap every token in a coloured
            ``<span>`` when true; otherwise join the tokens with spaces.
        tokenizer_type: One of "BERT", "Multilingual BERT", "BPE",
            "FastText", "PolyLM" or "ByT5". Any other value yields no tokens.
        display_mode: "Tokens" or "Token Values". Any other value yields
            empty markup (the token count is still reported).

    Returns:
        A ``(markup, token_count)`` tuple. ``markup`` is a string in which all
        token text has been HTML-escaped; ``token_count`` is ``len(tokens)``.
    """
    # Function-scope import keeps this block self-contained.
    import html

    text = input_text if input_text is not None else ""

    tokens = []
    if tokenizer_type == "BERT":
        tokens = bert_tokenizer.tokenize(text)
    elif tokenizer_type == "Multilingual BERT":
        tokens = mbert_tokenizer.tokenize(text)
    elif tokenizer_type == "BPE":
        # The BPE tokenizer starts untrained, so it is (re)trained on the
        # current input before encoding it.
        bpe_tokenizer.train_from_iterator([text], vocab_size=1000, min_frequency=1)
        tokens = bpe_tokenizer.encode(text).tokens
    elif tokenizer_type == "FastText":
        # No model call here: this branch is plain whitespace splitting.
        tokens = text.split()
    elif tokenizer_type == "PolyLM":
        tokens = polylm_tokenizer.tokenize(text)
    elif tokenizer_type == "ByT5":
        tokens = byt5_tokenizer.tokenize(text)

    token_count = len(tokens)

    # Everything returned below is rendered by an HTML component, and the
    # tokens derive from user-typed text, so escape before interpolating.
    if display_mode == "Tokens":
        if show_tokens:
            token_html = "".join(
                f'<span style="background-color:hsl({(idx * 50) % 360}, 70%, 40%); '
                f'padding:2px; border-radius:5px; color: black;">'
                f"{html.escape(str(token))}</span> "
                for idx, token in enumerate(tokens)
            )
            return token_html, token_count
        return html.escape(" ".join(tokens)), token_count

    if display_mode == "Token Values":
        return html.escape(str(tokens)), token_count

    # Unknown display mode: keep the two-value contract instead of falling
    # off the end and returning None, which would break tuple unpacking.
    return "", token_count
45
+
46
# Gradio front end: a text box and three controls feed process_text(); the
# rendered markup and the token count are shown next to them.
# NOTE(review): the nesting below is reconstructed from a flattened diff. The
# two radios and the checkbox are assumed to sit directly under Blocks rather
# than inside the second Row -- confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown("# Tokenizer Explorer")
    gr.Markdown("Choose a tokenizer and see how your text is tokenized. Toggle 'Show Tokens' to view highlighted tokens.")

    with gr.Row():
        input_text = gr.Textbox(
            label="Input Text",
            placeholder="Type your text here...",
            lines=5,
        )
        output_display = gr.HTML(label="Output Display")

    with gr.Row():
        token_count_display = gr.Number(
            label="Number of Tokens",
            value=0,
            interactive=False,
        )

    tokenizer_type = gr.Radio(
        choices=["BERT", "Multilingual BERT", "BPE", "FastText", "PolyLM", "ByT5"],
        label="Choose Tokenizer",
        value="BERT",
    )
    display_mode = gr.Radio(
        choices=["Tokens", "Token Values"],
        label="Display Mode",
        value="Tokens",
    )
    show_tokens = gr.Checkbox(label="Show Tokens", value=True)

    def update_output(input_text, show_tokens, tokenizer_type, display_mode):
        """Forward the current control values to process_text()."""
        rendered, count = process_text(input_text, show_tokens, tokenizer_type, display_mode)
        return rendered, count

    # Every control re-renders the output when it changes; handlers are
    # registered in this order with the same inputs and outputs each time.
    controls = [input_text, show_tokens, tokenizer_type, display_mode]
    for control in controls:
        control.change(
            fn=update_output,
            inputs=list(controls),
            outputs=[output_display, token_count_display],
        )

demo.launch()