"""Gradio app for comparing tokenizer performance across languages."""
import random
import gradio as gr
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px
def load_data():
    """Read the per-language token-count dataset from ``data.csv``."""
    # low_memory=False parses the whole file in one pass so mixed-dtype
    # columns don't trigger chunked-inference warnings.
    return pd.read_csv('data.csv', low_memory=False)
def get_example_text_data(data, subset, tokenizer_col):
    """Return one randomly sampled example as a per-language display table.

    A random ``id`` is drawn from *data*; every row of *subset* sharing that
    id (one translation per language) is reshaped into a frame indexed by
    language with 'ISO', 'Text', and 'Num Tokens' columns, sorted by ISO code.
    """
    sample_id = random.choice(data['id'].values)
    example = (
        subset[subset['id'] == sample_id]
        .copy()
        .rename(columns={'lang': 'Language'})
        .set_index('Language')
    )
    example = example[['iso', 'text', tokenizer_col]]
    example.columns = ['ISO', 'Text', 'Num Tokens']
    return example.sort_values(by='ISO')
# Tokenizers offered in the dropdown. "openai/gpt4" is handled via tiktoken
# (see update_tokenizer_info below); every other entry is a Hugging Face Hub
# model identifier whose column name in data.csv must match exactly.
TOKENIZERS = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
]
# Load data once at startup (module import) so every Gradio callback shares
# the same in-memory DataFrame instead of re-reading the CSV per request.
val_data = load_data()
# Alphabetical list of every language in the dataset; populates the
# language-selection checkbox group.
all_languages = sorted(val_data.lang.unique())
def update_visualizations(tokenizer_name, selected_languages, histogram_flag):
    """Build every output widget for the chosen tokenizer and languages.

    Args:
        tokenizer_name: Column of ``val_data`` holding per-row token counts
            for one tokenizer (an entry of ``TOKENIZERS``).
        selected_languages: Languages to compare; an empty selection clears
            all outputs.
        histogram_flag: If True, overlay histograms on the distribution plot.

    Returns:
        Tuple of (median-metrics HTML, distribution figure, bar figure,
        example-text dataframe). All ``None`` for an empty selection; the
        last element is an error string when the tokenizer column is missing.
    """
    if not selected_languages:
        return None, None, None, None
    if tokenizer_name not in val_data.columns:
        # data.csv has no column for this tokenizer (e.g. not yet computed).
        return None, None, None, "Tokenizer data not available"

    # Filter data for selected languages.
    subset_df = val_data[val_data.lang.isin(selected_languages)]

    # Median token length per selected language, rendered as a row of cards.
    # FIX: the original string literals here were truncated/empty (the HTML
    # markup was lost), leaving broken fragments; this rebuilds a minimal
    # flexbox card layout around the same median values.
    metrics_html = "<div style='display: flex; flex-wrap: wrap; gap: 12px;'>"
    for lang in selected_languages:
        median_val = int(np.median(subset_df[subset_df.lang == lang][tokenizer_name]))
        metrics_html += f"""
        <div style='border: 1px solid #ccc; border-radius: 8px;
                    padding: 10px 18px; text-align: center;'>
            <div style='font-size: 14px; color: #555;'>{lang}</div>
            <div style='font-size: 26px; font-weight: bold;'>{median_val}</div>
        </div>
        """
    metrics_html += "</div>"

    # Kernel-density plot (optionally with histogram) of token counts.
    dist_fig = ff.create_distplot(
        [val_data[val_data.lang == lang][tokenizer_name] for lang in selected_languages],
        group_labels=selected_languages,
        show_hist=histogram_flag
    )
    dist_fig.update_layout(
        title={'text': "Token Distribution", 'font': {'size': 25},
               'automargin': True, 'yref': "paper"},
        xaxis_title="Number of Tokens",
        yaxis_title="Density",
        height=500
    )

    # Bar chart of the 7 shortest- and 7 longest-median languages, computed
    # over ALL languages (not just the current selection) by design.
    median_grouped = val_data.groupby('lang')[tokenizer_name].median()
    combined = pd.concat([
        median_grouped.sort_values().head(7).reset_index().assign(type="shortest"),
        median_grouped.sort_values().tail(7).reset_index().assign(type="longest")
    ]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)
    bar_fig = px.bar(
        combined, x=tokenizer_name, y="lang", orientation='h', color='type',
        color_discrete_sequence=px.colors.qualitative.D3
    )
    bar_fig.update_traces(hovertemplate='%{y}: %{x} tokens')
    bar_fig.update_layout(
        title={'text': "Top Langs with Shortest and Longest Median Token Lengths",
               'font': {'size': 25}, 'automargin': True, 'yref': "paper",
               'pad': {'b': 20}},
        xaxis={'title': "Number of Tokens", 'showgrid': True,
               'gridwidth': 1, 'gridcolor': "LightGrey"},
        yaxis={'title': "Language"},
        height=400,
        showlegend=False
    )

    return (metrics_html, dist_fig, bar_fig,
            get_example_text_data(val_data, subset_df, tokenizer_name))
# Build the Gradio UI: a settings column (tokenizer + language pickers) on
# the left and the visualizations on the right, all wired to
# update_visualizations.
with gr.Blocks(title="Tokenizer Language Comparison") as demo:
    gr.Markdown(
        """
# All languages are NOT created (tokenized) equal!
This project compares the tokenization length for different languages.
For some tokenizers, tokenizing a message in one language may result in
10-20x more tokens than a comparable message in another language
(e.g. try English vs. Burmese).
This is part of a larger project of measuring inequality in NLP.
See the original article 'All languages are NOT created (tokenized) equal'
on [Art Fish Intelligence](https://www.artfish.ai/).
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Settings")
            tokenizer = gr.Dropdown(
                choices=TOKENIZERS,
                value="openai/gpt4",
                label="Select Tokenizer",
            )
            tokenizer_info = gr.Markdown()
            data_source_text = (
                f"### Data Source\n"
                f"The data in this figure is the validation set of the "
                f"[Amazon Massive](https://huggingface.co/datasets/"
                f"AmazonScience/massive/viewer/af-ZA/validation) "
                f"dataset, which consists of 2033 short sentences and "
                f"phrases translated into 51 different languages. "
                f"Learn more about the dataset from [Amazon's blog post]"
                f"(https://www.amazon.science/blog/"
                f"amazon-releases-51-language-dataset-for-language-understanding).\n\n"
                f"**Data loaded:** {len(val_data)} rows"
            )
            gr.Markdown(data_source_text)
            languages = gr.CheckboxGroup(
                choices=all_languages,
                value=['English', 'Spanish', 'Chinese', 'Burmese'],
                label="Select Languages (max 6)",
            )
            show_hist = gr.Checkbox(label="Show histogram", value=False)
            update_btn = gr.Button("🔄 Refresh Example Texts", variant="secondary")

        with gr.Column(scale=3):
            gr.Markdown("## Visualizations")
            metrics = gr.HTML(label="Median Token Lengths")
            with gr.Row():
                dist_plot = gr.Plot(label="Token Distribution")
            with gr.Row():
                bar_plot = gr.Plot(label="Shortest vs Longest Languages")
            gr.Markdown("### Example Texts")
            examples_df = gr.Dataframe(label="Random Sample")

    def update_tokenizer_info(tokenizer_name):
        """Link to the selected tokenizer's source (tiktoken vs. HF hub)."""
        if tokenizer_name == 'openai/gpt4':
            return "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
        url = f"https://huggingface.co/{tokenizer_name}"
        return f"Tokenizer available on [HuggingFace hub]({url})"

    # pylint: disable=no-member
    tokenizer.change(
        fn=update_tokenizer_info,
        inputs=[tokenizer],
        outputs=[tokenizer_info],
    )

    # Re-render every visualization whenever any input changes.
    viz_inputs = [tokenizer, languages, show_hist]
    viz_outputs = [metrics, dist_plot, bar_plot, examples_df]
    tokenizer.change(fn=update_visualizations, inputs=viz_inputs, outputs=viz_outputs)
    languages.change(fn=update_visualizations, inputs=viz_inputs, outputs=viz_outputs)
    show_hist.change(fn=update_visualizations, inputs=viz_inputs, outputs=viz_outputs)
    update_btn.click(fn=update_visualizations, inputs=viz_inputs, outputs=viz_outputs)

    # Populate the page on first load.
    demo.load(fn=update_visualizations, inputs=viz_inputs, outputs=viz_outputs)
    # pylint: enable=no-member

if __name__ == "__main__":
    demo.launch()