File size: 7,782 Bytes
8bedd21
9da3062
 
 
8bedd21
8e64677
 
 
 
 
 
9da3062
 
 
8bedd21
 
 
 
9da3062
 
 
 
 
8bedd21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e64677
 
8bedd21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da3062
 
8bedd21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e86313c
8bedd21
 
9da3062
 
8bedd21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""Gradio app for comparing tokenizer performance across languages."""

import random

import gradio as gr
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px

def load_data():
    """Read the tokenizer dataset from ``data.csv`` into a DataFrame.

    ``low_memory=False`` makes pandas read the file in one pass so column
    dtypes are inferred consistently.
    """
    frame = pd.read_csv('data.csv', low_memory=False)
    return frame

def get_example_text_data(data, subset, tokenizer_col):
    """Pick one random sample id and return its rows, one per language.

    A random ``id`` is drawn from *data*; the matching rows of *subset*
    are returned as a DataFrame indexed by language with columns
    'ISO', 'Text' and 'Num Tokens' (the count from *tokenizer_col*),
    sorted by ISO code.
    """
    sample_id = random.choice(data['id'].values)
    rows = subset[subset['id'] == sample_id].copy()
    rows = rows.rename(columns={'lang': 'Language'}).set_index('Language')
    rows = rows[['iso', 'text', tokenizer_col]]
    rows.columns = ['ISO', 'Text', 'Num Tokens']
    return rows.sort_values(by='ISO')

# Tokenizers offered in the UI dropdown.  Each entry is expected to match a
# column of data.csv holding per-row token counts (update_visualizations
# checks membership in val_data.columns at runtime and reports
# "Tokenizer data not available" otherwise).
TOKENIZERS = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
]

# Load data once at import time; this module-level state is shared (read-only)
# by every Gradio callback below.
val_data = load_data()
# Sorted language names used to populate the CheckboxGroup choices.
all_languages = sorted(val_data.lang.unique())

def update_visualizations(tokenizer_name, selected_languages, histogram_flag):
    """Build all output widgets for the current control settings.

    Args:
        tokenizer_name: Name of the ``val_data`` column holding per-row
            token counts for the chosen tokenizer.
        selected_languages: Languages (values of ``val_data.lang``) to compare.
        histogram_flag: If True, the distribution plot also shows histograms.

    Returns:
        Tuple ``(metrics_html, dist_fig, bar_fig, examples_df)``; all
        ``None`` when no language is selected, and the last element is an
        error string when ``tokenizer_name`` has no data column.
    """
    if not selected_languages:
        return None, None, None, None

    if tokenizer_name not in val_data.columns:
        return None, None, None, "Tokenizer data not available"

    # Filter once and reuse for the metrics, the distplot and the examples
    # (previously the full frame was refiltered per language for the distplot).
    subset_df = val_data[val_data.lang.isin(selected_languages)]

    metrics_html = _median_metrics_html(subset_df, selected_languages,
                                        tokenizer_name)
    dist_fig = _token_distribution_fig(subset_df, selected_languages,
                                       tokenizer_name, histogram_flag)
    bar_fig = _extremes_bar_fig(tokenizer_name)

    return (metrics_html, dist_fig, bar_fig,
            get_example_text_data(val_data, subset_df, tokenizer_name))


def _median_metrics_html(subset_df, selected_languages, tokenizer_name):
    """Render one small HTML card per language showing its median token count."""
    cards = ["<div style='display: flex; gap: 20px; flex-wrap: wrap;'>"]
    for lang in selected_languages:
        median_val = int(np.median(subset_df[subset_df.lang == lang][tokenizer_name]))
        cards.append(f"""
        <div style='padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
            <div style='font-size: 14px; color: #666;'>{lang}</div>
            <div style='font-size: 24px; font-weight: bold;'>{median_val}</div>
        </div>
        """)
    cards.append("</div>")
    return "".join(cards)


def _token_distribution_fig(subset_df, selected_languages, tokenizer_name,
                            histogram_flag):
    """Overlaid token-count density curves, one per selected language."""
    dist_fig = ff.create_distplot(
        [subset_df[subset_df.lang == lang][tokenizer_name]
         for lang in selected_languages],
        group_labels=selected_languages,
        show_hist=histogram_flag
    )
    dist_fig.update_layout(
        title={'text': "Token Distribution", 'font': {'size': 25},
               'automargin': True, 'yref': "paper"},
        xaxis_title="Number of Tokens",
        yaxis_title="Density",
        height=500
    )
    return dist_fig


def _extremes_bar_fig(tokenizer_name):
    """Bar chart of the 7 languages with the shortest and the 7 with the
    longest median token lengths, computed over the full dataset."""
    # Sort once and slice both ends (was sorted twice before).
    medians = val_data.groupby('lang')[tokenizer_name].median().sort_values()
    combined = pd.concat([
        medians.head(7).reset_index().assign(type="shortest"),
        medians.tail(7).reset_index().assign(type="longest")
    ]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)

    bar_fig = px.bar(
        combined, x=tokenizer_name, y="lang", orientation='h', color='type',
        color_discrete_sequence=px.colors.qualitative.D3
    )
    bar_fig.update_traces(hovertemplate='%{y}: %{x} tokens')
    bar_fig.update_layout(
        title={'text': "Top Langs with Shortest and Longest Median Token Lengths",
               'font': {'size': 25}, 'automargin': True, 'yref': "paper",
               'pad': {'b': 20}},
        xaxis={'title': "Number of Tokens", 'showgrid': True,
               'gridwidth': 1, 'gridcolor': "LightGrey"},
        yaxis={'title': "Language"},
        height=400,
        showlegend=False
    )
    return bar_fig

# Build the Gradio UI: a settings column (tokenizer, languages, options)
# beside a visualization column (metrics, plots, example texts), then wire
# every input change to update_visualizations.
with gr.Blocks(title="Tokenizer Language Comparison") as demo:
    gr.Markdown(
        """
        # All languages are NOT created (tokenized) equal!

        This project compares the tokenization length for different languages.
        For some tokenizers, tokenizing a message in one language may result in
        10-20x more tokens than a comparable message in another language
        (e.g. try English vs. Burmese).

        This is part of a larger project of measuring inequality in NLP.
        See the original article 'All languages are NOT created (tokenized) equal'
        on [Art Fish Intelligence](https://www.artfish.ai/).
        """
    )

    with gr.Row():
        # Left column: all user controls.
        with gr.Column(scale=1):
            gr.Markdown("## Settings")

            tokenizer = gr.Dropdown(
                choices=TOKENIZERS,
                value="openai/gpt4",
                label="Select Tokenizer"
            )

            # Populated by update_tokenizer_info whenever the dropdown changes.
            tokenizer_info = gr.Markdown()

            data_source_text = (
                f"### Data Source\n"
                f"The data in this figure is the validation set of the "
                f"[Amazon Massive](https://huggingface.co/datasets/"
                f"AmazonScience/massive/viewer/af-ZA/validation) "
                f"dataset, which consists of 2033 short sentences and "
                f"phrases translated into 51 different languages. "
                f"Learn more about the dataset from [Amazon's blog post]"
                f"(https://www.amazon.science/blog/"
                f"amazon-releases-51-language-dataset-for-language-understanding).\n\n"
                f"**Data loaded:** {len(val_data)} rows"
            )
            gr.Markdown(data_source_text)

            # NOTE(review): the "max 6" in the label is not enforced anywhere
            # visible in this file — confirm whether a hard limit is intended.
            languages = gr.CheckboxGroup(
                choices=all_languages,
                value=['English', 'Spanish', 'Chinese', 'Burmese'],
                label="Select Languages (max 6)"
            )

            show_hist = gr.Checkbox(label="Show histogram", value=False)

            # Re-runs update_visualizations; get_example_text_data draws a new
            # random sample id on every call, so clicking refreshes the table.
            update_btn = gr.Button("🔄 Refresh Example Texts", variant="secondary")

        # Right column: outputs produced by update_visualizations.
        with gr.Column(scale=3):
            gr.Markdown("## Visualizations")

            metrics = gr.HTML(label="Median Token Lengths")

            with gr.Row():
                dist_plot = gr.Plot(label="Token Distribution")

            with gr.Row():
                bar_plot = gr.Plot(label="Shortest vs Longest Languages")

            gr.Markdown("### Example Texts")
            examples_df = gr.Dataframe(label="Random Sample")

    def update_tokenizer_info(tokenizer_name):
        """Return a markdown link to the tokenizer's source: the HuggingFace
        hub for hub-hosted tokenizers, tiktoken for 'openai/gpt4'."""
        if tokenizer_name not in ['openai/gpt4']:
            url = f"https://huggingface.co/{tokenizer_name}"
            return f"Tokenizer available on [HuggingFace hub]({url})"
        return "Tokenized using [tiktoken](https://github.com/openai/tiktoken)"

    # pylint: disable=no-member
    tokenizer.change(
        fn=update_tokenizer_info,
        inputs=[tokenizer],
        outputs=[tokenizer_info]
    )

    # Update visualizations when any input changes; the same (inputs, outputs)
    # wiring is shared by all four triggers and the initial page load.
    inputs = [tokenizer, languages, show_hist]
    outputs = [metrics, dist_plot, bar_plot, examples_df]

    tokenizer.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    languages.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    show_hist.change(fn=update_visualizations, inputs=inputs, outputs=outputs)
    update_btn.click(fn=update_visualizations, inputs=inputs, outputs=outputs)

    # Load initial visualization when the page first renders.
    demo.load(fn=update_visualizations, inputs=inputs, outputs=outputs)
    # pylint: enable=no-member

if __name__ == "__main__":
    # Start the Gradio server only when run as a script, not on import.
    demo.launch()