"""Gradio app for comparing tokenizer performance across languages."""
import random
import gradio as gr
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px


def load_data():
    """Load the data from CSV file."""
    return pd.read_csv('data.csv', low_memory=False)
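
# Note: the expected schema of data.csv is not documented here; it is inferred
# from how the dataframe is used below (an assumption): one row per
# (utterance, language) pair with columns 'id', 'lang', 'iso', 'text', plus one
# column per tokenizer name holding the precomputed token count for that text.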


def get_example_text_data(data, subset, tokenizer_col):
    """Get example text data from a random sample."""
    random_id = random.choice(data['id'].values)
    tempdf = subset[subset['id'] == random_id].copy()
    tempdf.rename(columns={'lang': 'Language'}, inplace=True)
    tempdf.set_index('Language', inplace=True)
    tempdf = tempdf[['iso', 'text', tokenizer_col]]
    tempdf.columns = ['ISO', 'Text', 'Num Tokens']
    tempdf.sort_values(by='ISO', inplace=True)
    return tempdf


TOKENIZERS = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
]
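
# Assumption, based on the `tokenizer_name not in val_data.columns` check below:
# each entry in TOKENIZERS is also a column name in data.csv holding precomputed
# token counts, so no tokenizer model is actually loaded at request time.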
# Load data once at startup
val_data = load_data()
all_languages = sorted(val_data.lang.unique())


def update_visualizations(tokenizer_name, selected_languages, histogram_flag):
    """Update all visualizations based on selected parameters."""
    if not selected_languages:
        return None, None, None, None
    if tokenizer_name not in val_data.columns:
        return None, None, None, "Tokenizer data not available"

    # Filter data for selected languages
    subset_df = val_data[val_data.lang.isin(selected_languages)]

    # Create median metrics HTML
    metrics_html = "<div style='display: flex; gap: 20px; flex-wrap: wrap;'>"
    for lang in selected_languages:
        median_val = int(np.median(subset_df[subset_df.lang == lang][tokenizer_name]))
        metrics_html += f"""
        <div style='padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
            <div style='font-size: 14px; color: #666;'>{lang}</div>
            <div style='font-size: 24px; font-weight: bold;'>{median_val}</div>
        </div>
        """
    metrics_html += "</div>"

    # Create distribution plot
    dist_fig = ff.create_distplot(
        [val_data[val_data.lang == lang][tokenizer_name] for lang in selected_languages],
        group_labels=selected_languages,
        show_hist=histogram_flag
    )
    dist_fig.update_layout(
        title={'text': "Token Distribution", 'font': {'size': 25},
               'automargin': True, 'yref': "paper"},
        xaxis_title="Number of Tokens",
        yaxis_title="Density",
        height=500
    )

    # Create bar chart for shortest/longest languages
    median_grouped = val_data.groupby('lang')[tokenizer_name].median()
    combined = pd.concat([
        median_grouped.sort_values().head(7).reset_index().assign(type="shortest"),
        median_grouped.sort_values().tail(7).reset_index().assign(type="longest")
    ]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)

    bar_fig = px.bar(
        combined, x=tokenizer_name, y="lang", orientation='h', color='type',
        color_discrete_sequence=px.colors.qualitative.D3
    )
    bar_fig.update_traces(hovertemplate='%{y}: %{x} tokens')
    bar_fig.update_layout(
        title={'text': "Top Langs with Shortest and Longest Median Token Lengths",
               'font': {'size': 25}, 'automargin': True, 'yref': "paper",
               'pad': {'b': 20}},
        xaxis={'title': "Number of Tokens", 'showgrid': True,
               'gridwidth': 1, 'gridcolor': "LightGrey"},
        yaxis={'title': "Language"},
        height=400,
        showlegend=False
    )

    return (metrics_html, dist_fig, bar_fig,
            get_example_text_data(val_data, subset_df, tokenizer_name))
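
# The four return values above map, in order, onto the `outputs` list wired up
# in the Blocks section below: metrics (HTML), dist_plot, bar_plot, examples_df.
# A quick local sanity check (a sketch; it assumes data.csv is present and that
# "openai/gpt4" and these language names exist in it):
#
#     html, dist, bars, table = update_visualizations(
#         "openai/gpt4", ["English", "Burmese"], histogram_flag=False
#     )
#     print(table.head())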
# Create Gradio interface
with gr.Blocks(title="Tokenizer Language Comparison") as demo:
    gr.Markdown(
        """
        # All languages are NOT created (tokenized) equal!

        This project compares tokenization lengths across languages. With some
        tokenizers, a message in one language can produce 10-20x more tokens
        than a comparable message in another language (e.g. try English vs.
        Burmese). This is part of a larger project on measuring inequality in NLP.

        See the original article, 'All languages are NOT created (tokenized) equal',
        on [Art Fish Intelligence](https://www.artfish.ai/).
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Settings")
            tokenizer = gr.Dropdown(
                choices=TOKENIZERS,
                value="openai/gpt4",
                label="Select Tokenizer"
            )
            tokenizer_info = gr.Markdown()

            data_source_text = (
                f"### Data Source\n"
                f"The data in this figure is the validation set of the "
                f"[Amazon Massive](https://huggingface.co/datasets/"
                f"AmazonScience/massive/viewer/af-ZA/validation) "
                f"dataset, which consists of 2033 short sentences and "
                f"phrases translated into 51 different languages. "
                f"Learn more about the dataset from [Amazon's blog post]"
                f"(https://www.amazon.science/blog/"
                f"amazon-releases-51-language-dataset-for-language-understanding).\n\n"
                f"**Data loaded:** {len(val_data)} rows"
            )
            gr.Markdown(data_source_text)

            languages = gr.CheckboxGroup(
                choices=all_languages,
                value=['English', 'Spanish', 'Chinese', 'Burmese'],
                label="Select Languages (max 6)"
            )
            show_hist = gr.Checkbox(label="Show histogram", value=False)
            update_btn = gr.Button("🔄 Refresh Example Texts", variant="secondary")

        with gr.Column(scale=3):
            gr.Markdown("## Visualizations")
            metrics = gr.HTML(label="Median Token Lengths")
            with gr.Row():
                dist_plot = gr.Plot(label="Token Distribution")
            with gr.Row():
                bar_plot = gr.Plot(label="Shortest vs Longest Languages")
            gr.Markdown("### Example Texts")
            examples_df = gr.Dataframe(label="Random Sample")

    def update_tokenizer_info(tokenizer_name):
        """Update the tokenizer information display."""
        if tokenizer_name not in ['openai/gpt4']:
            url = f"https://huggingface.co/{tokenizer_name}"
            return f"Tokenizer available on the [Hugging Face Hub]({url})"
        return "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"

    # pylint: disable=no-member
    tokenizer.change(
        fn=update_tokenizer_info,
        inputs=[tokenizer],
        outputs=[tokenizer_info]
    )

    # Update visualizations when inputs change
    inputs = [tokenizer, languages, show_hist]
    outputs = [metrics, dist_plot, bar_plot, examples_df]
    tokenizer.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    languages.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    show_hist.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    update_btn.click(fn=update_visualizations, inputs=inputs, outputs=outputs)

    # Load initial visualization
    demo.load(fn=update_visualizations, inputs=inputs, outputs=outputs)
    # pylint: enable=no-member


if __name__ == "__main__":
    demo.launch()