# Hugging Face Space -- status shown on the hosting page at capture time: Build error
import html

import gradio as gr
from gensim.models import FastText
from tokenizers import ByteLevelBPETokenizer
from transformers import BertTokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
# Hub checkpoint identifiers for the pretrained tokenizers/models loaded below.
BERT_CHECKPOINT = "bert-base-cased"
MBERT_CHECKPOINT = "bert-base-multilingual-cased"
POLYLM_CHECKPOINT = "DAMO-NLP-MT/polylm-1.7b"
BYT5_CHECKPOINT = "google/byt5-small"

# WordPiece tokenizers (English and multilingual BERT vocabularies).
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
mbert_tokenizer = BertTokenizer.from_pretrained(MBERT_CHECKPOINT)

# Untrained byte-level BPE tokenizer; process_text trains it on the submitted
# text before encoding.
bpe_tokenizer = ByteLevelBPETokenizer()

# NOTE(review): fasttext_model, polylm_model and byt5_model are instantiated
# here but never referenced by the code visible in this file -- only the
# tokenizers are used. Confirm nothing else needs them before dropping the
# loads.
fasttext_model = FastText(vector_size=100, window=5, min_count=1)

polylm_tokenizer = AutoTokenizer.from_pretrained(POLYLM_CHECKPOINT)
polylm_model = AutoModelForCausalLM.from_pretrained(POLYLM_CHECKPOINT)

byt5_tokenizer = AutoTokenizer.from_pretrained(BYT5_CHECKPOINT)
byt5_model = AutoModelForSeq2SeqLM.from_pretrained(BYT5_CHECKPOINT)
def process_text(input_text, show_tokens, tokenizer_type, display_mode):
    """Tokenize ``input_text`` and render the tokens as HTML-safe markup.

    Args:
        input_text: Raw text entered by the user. Empty or ``None`` input
            produces no tokens.
        show_tokens: In "Tokens" mode, wrap every token in a coloured
            ``<span>`` when true; otherwise join the tokens with spaces.
        tokenizer_type: One of "BERT", "Multilingual BERT", "BPE",
            "FastText", "PolyLM" or "ByT5". Any other value produces no
            tokens.
        display_mode: "Token Values" renders the ``str()`` of the token
            list; any other value is handled as "Tokens".

    Returns:
        A ``(markup, token_count)`` tuple. Token text inside ``markup`` is
        HTML-escaped because the result is displayed by an HTML component,
        so characters such as ``<`` or ``&`` in the input are shown
        literally instead of being interpreted as markup.
    """
    tokens = []
    if input_text:
        if tokenizer_type == "BERT":
            tokens = bert_tokenizer.tokenize(input_text)
        elif tokenizer_type == "Multilingual BERT":
            tokens = mbert_tokenizer.tokenize(input_text)
        elif tokenizer_type == "BPE":
            # The shared BPE tokenizer is (re)trained on the submitted text
            # on every call, then used to encode that same text.
            bpe_tokenizer.train_from_iterator(
                [input_text], vocab_size=1000, min_frequency=1
            )
            tokens = bpe_tokenizer.encode(input_text).tokens
        elif tokenizer_type == "FastText":
            # Plain whitespace split; no model is consulted here.
            tokens = input_text.split()
        elif tokenizer_type == "PolyLM":
            tokens = polylm_tokenizer.tokenize(input_text)
        elif tokenizer_type == "ByT5":
            tokens = byt5_tokenizer.tokenize(input_text)

    token_count = len(tokens)

    if display_mode == "Token Values":
        return html.escape(str(tokens)), token_count

    # "Tokens" mode; also the fallback for unrecognised modes so the caller
    # always receives a 2-tuple instead of None.
    if show_tokens:
        # Cycle the hue by 50 degrees per token so neighbours are distinguishable.
        spans = [
            f'<span style="background-color:hsl({(idx * 50) % 360}, 70%, 40%); '
            f'padding:2px; border-radius:5px; color: black;">'
            f"{html.escape(token)}</span> "
            for idx, token in enumerate(tokens)
        ]
        return "".join(spans), token_count

    return html.escape(" ".join(tokens)), token_count
# Layout and event wiring for the tokenizer explorer UI.
# NOTE(review): indentation was lost in the captured source; the grouping of
# the controls inside the second row below is a reconstruction -- confirm it
# against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# Tokenizer Explorer")
    gr.Markdown("Choose a tokenizer and see how your text is tokenized. Toggle 'Show Tokens' to view highlighted tokens.")

    # Input on the left, rendered tokens on the right.
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Type your text here...", lines=5)
        output_display = gr.HTML(label="Output Display")

    # Token count plus the controls that select tokenizer and rendering.
    with gr.Row():
        token_count_display = gr.Number(label="Number of Tokens", value=0, interactive=False)
        tokenizer_type = gr.Radio(
            ["BERT", "Multilingual BERT", "BPE", "FastText", "PolyLM", "ByT5"],
            label="Choose Tokenizer",
            value="BERT",
        )
        display_mode = gr.Radio(
            ["Tokens", "Token Values"],
            label="Display Mode",
            value="Tokens",
        )
        show_tokens = gr.Checkbox(label="Show Tokens", value=True)

    def update_output(input_text, show_tokens, tokenizer_type, display_mode):
        """Run process_text on the current control values and return its two outputs."""
        token_output, token_count = process_text(
            input_text=input_text,
            show_tokens=show_tokens,
            tokenizer_type=tokenizer_type,
            display_mode=display_mode,
        )
        return token_output, token_count

    # Every control re-renders the output when its value changes; all four
    # listeners share the same handler, inputs and outputs.
    for changed_component in (input_text, show_tokens, tokenizer_type, display_mode):
        changed_component.change(
            fn=update_output,
            inputs=[input_text, show_tokens, tokenizer_type, display_mode],
            outputs=[output_display, token_count_display],
        )

demo.launch()