| """Gradio app for comparing tokenizer performance across languages.""" | |
| import random | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.figure_factory as ff | |
| import plotly.express as px | |
| def load_data(): | |
| """Load the data from CSV file.""" | |
| return pd.read_csv('data.csv', low_memory=False) | |
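
# Assumed layout of data.csv (the file itself is not shown here): one row per
# (id, language) pair, with columns 'id', 'lang', 'iso' and 'text', plus one
# numeric column per tokenizer listed in TOKENIZERS holding that tokenizer's
# token count for the text.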


def get_example_text_data(subset, tokenizer_col):
    """Return one randomly sampled utterance in each selected language."""
    # Sample the id from the already language-filtered subset, so the chosen
    # id is guaranteed to have rows for the selected languages.
    random_id = random.choice(subset['id'].values)
    tempdf = subset[subset['id'] == random_id].copy()
    tempdf.rename(columns={'lang': 'Language'}, inplace=True)
    tempdf.set_index('Language', inplace=True)
    tempdf = tempdf[['iso', 'text', tokenizer_col]]
    tempdf.columns = ['ISO', 'Text', 'Num Tokens']
    tempdf.sort_values(by='ISO', inplace=True)
    return tempdf
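
# Note: the MASSIVE ids are parallel across languages, so a single utterance
# id selects translations of the same sentence in every selected language.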


TOKENIZERS = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
]
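# Each name above doubles as a column in data.csv holding the token counts
# precomputed with that tokenizer; no tokenizer is loaded at runtime.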

# Load data once at startup
val_data = load_data()
all_languages = sorted(val_data.lang.unique())
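# With the MASSIVE validation split described in the UI below, this should be
# roughly 2033 rows per language across 51 languages.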


def update_visualizations(tokenizer_name, selected_languages, histogram_flag):
    """Update all visualizations based on the selected parameters."""
    if not selected_languages:
        return None, None, None, None
    if tokenizer_name not in val_data.columns:
        # The error message belongs in the metrics HTML slot, the first output.
        return "Tokenizer data not available", None, None, None
    # Enforce the "max 6" limit promised by the language checkbox label.
    selected_languages = selected_languages[:6]

    # Filter data for the selected languages
    subset_df = val_data[val_data.lang.isin(selected_languages)]

    # Build one median-token-count card per language as inline HTML
    metrics_html = "<div style='display: flex; gap: 20px; flex-wrap: wrap;'>"
    for lang in selected_languages:
        median_val = int(np.median(subset_df[subset_df.lang == lang][tokenizer_name]))
        metrics_html += f"""
        <div style='padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
            <div style='font-size: 14px; color: #666;'>{lang}</div>
            <div style='font-size: 24px; font-weight: bold;'>{median_val}</div>
        </div>
        """
    metrics_html += "</div>"

    # Distribution plot: one density curve per language, with an optional
    # histogram overlay
    dist_fig = ff.create_distplot(
        [val_data[val_data.lang == lang][tokenizer_name] for lang in selected_languages],
        group_labels=selected_languages,
        show_hist=histogram_flag
    )
    dist_fig.update_layout(
        title={'text': "Token Distribution", 'font': {'size': 25},
               'automargin': True, 'yref': "paper"},
        xaxis_title="Number of Tokens",
        yaxis_title="Density",
        height=500
    )

    # Horizontal bar chart of the seven languages with the shortest and the
    # seven with the longest median token counts
    median_grouped = val_data.groupby('lang')[tokenizer_name].median()
    combined = pd.concat([
        median_grouped.sort_values().head(7).reset_index().assign(type="shortest"),
        median_grouped.sort_values().tail(7).reset_index().assign(type="longest")
    ]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)
    bar_fig = px.bar(
        combined, x=tokenizer_name, y="lang", orientation='h', color='type',
        color_discrete_sequence=px.colors.qualitative.D3
    )
    bar_fig.update_traces(hovertemplate='%{y}: %{x} tokens')
    bar_fig.update_layout(
        title={'text': "Languages with the Shortest and Longest Median Token Counts",
               'font': {'size': 25}, 'automargin': True, 'yref': "paper",
               'pad': {'b': 20}},
        xaxis={'title': "Number of Tokens", 'showgrid': True,
               'gridwidth': 1, 'gridcolor': "LightGrey"},
        yaxis={'title': "Language"},
        height=400,
        showlegend=False
    )

    return (metrics_html, dist_fig, bar_fig,
            get_example_text_data(subset_df, tokenizer_name))


# Build the Gradio interface
with gr.Blocks(title="Tokenizer Language Comparison") as demo:
    gr.Markdown(
        """
        # All languages are NOT created (tokenized) equal!

        This project compares tokenization lengths across languages. With some
        tokenizers, a message in one language can yield 10-20x more tokens than
        a comparable message in another (e.g. try English vs. Burmese).

        This is part of a larger project on measuring inequality in NLP. See the
        original article, 'All languages are NOT created (tokenized) equal', on
        [Art Fish Intelligence](https://www.artfish.ai/).
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Settings")
            tokenizer = gr.Dropdown(
                choices=TOKENIZERS,
                value="openai/gpt4",
                label="Select Tokenizer"
            )
            tokenizer_info = gr.Markdown()
            data_source_text = (
                "### Data Source\n"
                "The data in this figure is the validation set of the "
                "[Amazon MASSIVE](https://huggingface.co/datasets/"
                "AmazonScience/massive/viewer/af-ZA/validation) "
                "dataset, which consists of 2033 short sentences and "
                "phrases translated into 51 different languages. "
                "Learn more about the dataset in [Amazon's blog post]"
                "(https://www.amazon.science/blog/"
                "amazon-releases-51-language-dataset-for-language-understanding).\n\n"
                f"**Data loaded:** {len(val_data)} rows"
            )
            gr.Markdown(data_source_text)
            languages = gr.CheckboxGroup(
                choices=all_languages,
                value=['English', 'Spanish', 'Chinese', 'Burmese'],
                label="Select Languages (max 6)"
            )
            show_hist = gr.Checkbox(label="Show histogram", value=False)
            update_btn = gr.Button("🔄 Refresh Example Texts", variant="secondary")

        with gr.Column(scale=3):
            gr.Markdown("## Visualizations")
            metrics = gr.HTML(label="Median Token Lengths")
            with gr.Row():
                dist_plot = gr.Plot(label="Token Distribution")
            with gr.Row():
                bar_plot = gr.Plot(label="Shortest vs Longest Languages")
            gr.Markdown("### Example Texts")
            examples_df = gr.Dataframe(label="Random Sample")

    def update_tokenizer_info(tokenizer_name):
        """Update the tokenizer information display."""
        if tokenizer_name != 'openai/gpt4':
            url = f"https://huggingface.co/{tokenizer_name}"
            return f"Tokenizer available on the [Hugging Face hub]({url})"
        return "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"
    # pylint: disable=no-member
    tokenizer.change(
        fn=update_tokenizer_info,
        inputs=[tokenizer],
        outputs=[tokenizer_info]
    )

    # Re-render the visualizations whenever any input changes
    inputs = [tokenizer, languages, show_hist]
    outputs = [metrics, dist_plot, bar_plot, examples_df]
    tokenizer.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    languages.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    show_hist.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    update_btn.click(fn=update_visualizations, inputs=inputs, outputs=outputs)

    # Render the initial visualizations when the page loads
    demo.load(fn=update_visualizations, inputs=inputs, outputs=outputs)
    # pylint: enable=no-member


if __name__ == "__main__":
    demo.launch()
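
# To run locally (assuming this file is saved as app.py next to data.csv):
#   python app.py
# then open the local URL Gradio prints (http://127.0.0.1:7860 by default).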