"""Gradio app for comparing tokenizer performance across languages."""
import random

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff


def load_data():
    """Read the dataset from ``data.csv`` in the current working directory.

    Returns:
        A pandas DataFrame with the contents of the CSV file.
    """
    return pd.read_csv('data.csv', low_memory=False)


def get_example_text_data(data, subset, tokenizer_col):
    """Build a per-language table for one randomly chosen example id.

    Args:
        data: DataFrame whose ``id`` column supplies the candidate ids.
        subset: DataFrame the rows are taken from; must contain the columns
            ``id``, ``lang``, ``iso``, ``text`` and ``tokenizer_col``.
        tokenizer_col: Name of the column holding the token counts.

    Returns:
        The rows of ``subset`` that carry the chosen id, indexed by
        ``Language``, with columns ``ISO``, ``Text`` and ``Num Tokens``,
        sorted by ``ISO``.
    """
    random_id = random.choice(data['id'].values)
    sample = subset.loc[
        subset['id'] == random_id,
        ['lang', 'iso', 'text', tokenizer_col],
    ]
    sample = sample.rename(columns={'lang': 'Language'}).set_index('Language')
    sample.columns = ['ISO', 'Text', 'Num Tokens']
    return sample.sort_values(by='ISO')


TOKENIZERS = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
]

# Kept at column 0 so no line can be read as an indented Markdown code block.
INTRO_MARKDOWN = """
# All languages are NOT created (tokenized) equal!

This project compares the tokenization length for different languages.
For some tokenizers, tokenizing a message in one language may result in
10-20x more tokens than a comparable message in another language
(e.g. try English vs. Burmese).

This is part of a larger project of measuring inequality in NLP.
See the original article 'All languages are NOT created (tokenized) equal'
on [Art Fish Intelligence](https://www.artfish.ai/).
"""

# Load data once at startup
val_data = load_data()
all_languages = sorted(val_data.lang.unique())


def _build_metrics_html(languages, token_counts):
    """Return an HTML snippet listing each language with its median count."""
    # NOTE(review): the markup of these literals was not recoverable from the
    # file as received (only the newlines and placeholders survived). The
    # unstyled <div> wrappers are a minimal reconstruction -- confirm against
    # the intended design.
    cards = []
    for lang, counts in zip(languages, token_counts):
        median_val = int(np.median(counts))
        cards.append(
            f"<div>\n<div>{lang}</div>\n<div>{median_val}</div>\n</div>\n"
        )
    return "<div>\n" + "".join(cards) + "</div>\n"


def _build_distribution_figure(languages, token_counts, histogram_flag):
    """Return the distribution plot with one trace group per language."""
    dist_fig = ff.create_distplot(
        token_counts,
        group_labels=languages,
        show_hist=histogram_flag,
    )
    dist_fig.update_layout(
        title={
            'text': "Token Distribution",
            'font': {'size': 25},
            'automargin': True,
            'yref': "paper",
        },
        xaxis_title="Number of Tokens",
        yaxis_title="Density",
        height=500,
    )
    return dist_fig


def _build_median_bar_figure(tokenizer_name):
    """Return a bar chart of the 7 lowest and 7 highest median languages."""
    # Medians are taken over every language in the dataset, not only the
    # selected ones; sort once and slice both ends.
    sorted_medians = (
        val_data.groupby('lang')[tokenizer_name].median().sort_values()
    )
    combined = pd.concat([
        sorted_medians.head(7).reset_index().assign(type="shortest"),
        sorted_medians.tail(7).reset_index().assign(type="longest"),
    ]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)

    bar_fig = px.bar(
        combined,
        x=tokenizer_name,
        y="lang",
        orientation='h',
        color='type',
        color_discrete_sequence=px.colors.qualitative.D3,
    )
    bar_fig.update_traces(hovertemplate='%{y}: %{x} tokens')
    bar_fig.update_layout(
        title={
            'text': "Top Langs with Shortest and Longest Median Token Lengths",
            'font': {'size': 25},
            'automargin': True,
            'yref': "paper",
            'pad': {'b': 20},
        },
        xaxis={
            'title': "Number of Tokens",
            'showgrid': True,
            'gridwidth': 1,
            'gridcolor': "LightGrey",
        },
        yaxis={'title': "Language"},
        height=400,
        showlegend=False,
    )
    return bar_fig


def update_visualizations(tokenizer_name, selected_languages, histogram_flag):
    """Update all visualizations based on selected parameters.

    Args:
        tokenizer_name: Column of ``val_data`` holding the token counts.
        selected_languages: Values of the ``lang`` column to compare.
        histogram_flag: Whether the distribution plot also shows histograms.

    Returns:
        A 4-tuple ``(metrics_html, dist_fig, bar_fig, examples)`` matching the
        HTML, two plot and dataframe outputs of the interface. All four are
        ``None`` when no language is selected. When ``tokenizer_name`` is not
        a column of ``val_data``, the first item is an explanatory message and
        the other three are ``None``.
    """
    if not selected_languages:
        return None, None, None, None
    if tokenizer_name not in val_data.columns:
        # The message goes to the HTML output; the last slot feeds the
        # dataframe output, which expects tabular data rather than text.
        return "Tokenizer data not available", None, None, None

    # Filter data for selected languages
    subset_df = val_data[val_data.lang.isin(selected_languages)]
    token_counts = [
        subset_df.loc[subset_df.lang == lang, tokenizer_name]
        for lang in selected_languages
    ]

    # Move 'Language' out of the index so it is an ordinary column of the
    # table sent to the dataframe output.
    examples = get_example_text_data(
        val_data, subset_df, tokenizer_name
    ).reset_index()

    return (
        _build_metrics_html(selected_languages, token_counts),
        _build_distribution_figure(
            selected_languages, token_counts, histogram_flag
        ),
        _build_median_bar_figure(tokenizer_name),
        examples,
    )


def update_tokenizer_info(tokenizer_name):
    """Return a Markdown note saying where the given tokenizer comes from."""
    if tokenizer_name != 'openai/gpt4':
        url = f"https://huggingface.co/{tokenizer_name}"
        return f"Tokenizer available on [HuggingFace hub]({url})"
    return "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"


# Create Gradio interface
with gr.Blocks(title="Tokenizer Language Comparison") as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Settings")
            tokenizer = gr.Dropdown(
                choices=TOKENIZERS,
                value="openai/gpt4",
                label="Select Tokenizer",
            )
            tokenizer_info = gr.Markdown()
            data_source_text = (
                "### Data Source\n"
                "The data in this figure is the validation set of the "
                "[Amazon Massive](https://huggingface.co/datasets/"
                "AmazonScience/massive/viewer/af-ZA/validation) "
                "dataset, which consists of 2033 short sentences and "
                "phrases translated into 51 different languages. "
                "Learn more about the dataset from [Amazon's blog post]"
                "(https://www.amazon.science/blog/"
                "amazon-releases-51-language-dataset-for-language-understanding).\n\n"
                f"**Data loaded:** {len(val_data)} rows"
            )
            gr.Markdown(data_source_text)
            languages = gr.CheckboxGroup(
                choices=all_languages,
                value=['English', 'Spanish', 'Chinese', 'Burmese'],
                label="Select Languages (max 6)",
            )
            show_hist = gr.Checkbox(label="Show histogram", value=False)
            update_btn = gr.Button(
                "🔄 Refresh Example Texts", variant="secondary"
            )

        with gr.Column(scale=3):
            gr.Markdown("## Visualizations")
            metrics = gr.HTML(label="Median Token Lengths")
            with gr.Row():
                dist_plot = gr.Plot(label="Token Distribution")
            with gr.Row():
                bar_plot = gr.Plot(label="Shortest vs Longest Languages")
            gr.Markdown("### Example Texts")
            examples_df = gr.Dataframe(label="Random Sample")

    # pylint: disable=no-member
    tokenizer.change(
        fn=update_tokenizer_info,
        inputs=[tokenizer],
        outputs=[tokenizer_info],
    )

    # Update visualizations when inputs change
    inputs = [tokenizer, languages, show_hist]
    outputs = [metrics, dist_plot, bar_plot, examples_df]
    tokenizer.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    languages.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    show_hist.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    update_btn.click(fn=update_visualizations, inputs=inputs, outputs=outputs)

    # Load initial visualization
    demo.load(fn=update_visualizations, inputs=inputs, outputs=outputs)
    # The change event does not cover the default selection, so fill in the
    # tokenizer note on page load as well.
    demo.load(
        fn=update_tokenizer_info,
        inputs=[tokenizer],
        outputs=[tokenizer_info],
    )
    # pylint: enable=no-member

if __name__ == "__main__":
    demo.launch()