Spaces:
Running
Running
File size: 8,822 Bytes
1bca40f baeda9f 1bca40f baeda9f 1bca40f bc3a3c1 b9fc216 bc3a3c1 1bca40f 1d0b778 baeda9f f7ac4d6 99bd427 1bca40f f7ac4d6 99bd427 f7ac4d6 1bca40f baeda9f bc3a3c1 baeda9f 1bca40f 99bd427 baeda9f 1bca40f baeda9f 1bca40f baeda9f 1bca40f baeda9f 1bca40f baeda9f 1bca40f baeda9f 1bca40f baeda9f 1bca40f baeda9f 1bca40f baeda9f 1bca40f baeda9f 1bca40f f7ac4d6 baeda9f 1d0b778 99bd427 f7ac4d6 99bd427 f7ac4d6 baeda9f f7ac4d6 baeda9f f7ac4d6 99bd427 f7ac4d6 baeda9f 99bd427 f7ac4d6 99bd427 baeda9f 99bd427 baeda9f 99bd427 baeda9f f7ac4d6 baeda9f f7ac4d6 baeda9f 1d0b778 baeda9f f7ac4d6 baeda9f 99bd427 f7ac4d6 baeda9f 1bca40f bc3a3c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
import csv
from collections import defaultdict
import gradio as gr
import pandas as pd
def strip_colname(x):
    """Return *x* without a leading ``score_`` prefix; other names pass through."""
    prefix = "score_"
    return x[len(prefix):] if x.startswith(prefix) else x
# Markdown shown at the top of the leaderboard tab, right under the title.
INTRO = """The current leaderboard displays performance across all filtered directions based on the dev subset of BOUQuET.
A smarter leaderboard and the code for reproducing the evaluations will be published soon!
"""
# Markdown section explaining how language-variety codes are composed
# (language code + script code + optional dialect code).
LANGS_EXPLANATION = """## Languages
Below, we give a brief description of each language variety participating in the leaderboard.
Each language variety is identified by
an [ISO 639-3 code](https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes) (the first 3 letters) for the language,
an [ISO 15924](https://en.wikipedia.org/wiki/ISO_15924) code (the next 4 letters) for the writing system,
and optionally, a [Glottolog code](https://glottolog.org/) for the dialect.
The varieties with a secondary language code (Egyptian Arabic, Colloquial Malay) use code-switching,
i.e. the speakers switch between the two languages (a colloquial and a standardized variety)
depending on the context (e.g. the formality level).
For a fuller description of the languages and the codes used to represent them, please refer
to https://huggingface.co/datasets/facebook/bouquet#languages and the [BOUQuET paper](https://arxiv.org/abs/2502.04314).
"""
# Markdown section describing the quality metrics offered by the leaderboard.
# NOTE(review): the `xcomet_both` entry contains an empty Markdown link "[]()";
# the model name and URL still need to be filled in.
METRICS_EXPLANATION = """## Metrics
1. `metricx_both`: [google/metricx-24-hybrid-xl-v2p6](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6) score based on both source and reference. **Attention: lower is better!**
2. `xcomet_both`: []() score based on both source and reference.
3. `CHRFpp`: ChrF++ score ([sacrebleu](https://github.com/mjpost/sacrebleu) implementation) based on reference.
4. `glotlid_ref`: probability, as predicted by the [GlotLID model](https://huggingface.co/cis-lmu/glotlid), that translation and reference are in the same language.
"""
# Markdown placeholder section for the descriptions of the evaluated systems.
SYSTEMS_EXPLANATION = """## Systems
Descriptions of the implementation of the systems will come out later.
"""
def leaderboard_tab():
    """Build the "Leaderboard" Gradio tab.

    Reads ``data/benchmark_stats.tsv`` (per-system scores) and
    ``data/language_metadata.tsv`` (language descriptions), then creates a
    ``gr.Tab`` containing:

    * a "Systems ranking" table of per-system metric means, filterable by
      level, source language and target language;
    * a "Languages difficulty" bar plot of one metric per language for
      translation into or out of English;
    * the explanatory Markdown sections and the language metadata table.

    The function has no return statement; its effect is the creation of the
    Gradio components and event listeners.
    NOTE(review): presumably it has to be called inside an active
    ``gr.Blocks`` context -- confirm against the caller.
    """
    # QUOTE_NONE: quote characters in the TSV are read as ordinary text.
    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    # Drop the "score_" prefix so the columns match the names in `metrics`.
    stats.columns = [strip_colname(c) for c in stats.columns]
    metrics = ["metricx_both", "xcomet_both", "CHRFpp", "glotlid_ref"]
    systems = sorted(set(stats["system"]))
    levels = ["sentence_level", "paragraph_level"]
    # Special dropdown entries that do not correspond to a value in the data.
    ALL = "ALL"
    MEAN = "Average"
    BEST = "Best"
    XX2EN = "Everything-into-English"
    EN2XX = "English-into-Everything"
    # Index the translation directions present in the data in both
    # orientations, so each language dropdown can be narrowed by the other.
    lang_src2tgt = defaultdict(set)
    lang_tgt2src = defaultdict(set)
    langs_src = set()
    langs_tgt = set()
    for src_lang, tgt_lang in stats[["src_lang", "tgt_lang"]].drop_duplicates().values:
        lang_src2tgt[src_lang].add(tgt_lang)
        lang_tgt2src[tgt_lang].add(src_lang)
        langs_src.add(src_lang)
        langs_tgt.add(tgt_lang)
    langs_df = pd.read_csv("data/language_metadata.tsv", sep="\t")
    # Map language-variety codes ("<ISO 639-3>_<ISO 15924>[_<Glottocode>]")
    # to the human-readable language name.
    lang2name = {}
    for i, row in langs_df.iterrows():
        code = row["ISO 639-3"] + "_" + row["ISO 15924"]
        # The isinstance check skips non-string cells (presumably empty cells
        # that pandas read as NaN).
        if isinstance(row["Glottocode"], str) and len(row["Glottocode"]) > 0:
            code = code + "_" + row["Glottocode"]
        lang2name[code] = row["Language"]
        if isinstance(row["Secondary ISO 639-3"], str) and len(
            row["Secondary ISO 639-3"]
        ):
            # Register the same name under a second code in which the first
            # three characters are replaced by the secondary ISO 639-3 code.
            code = row["Secondary ISO 639-3"] + code[3:]
            lang2name[code] = row["Language"]
    # Report languages that occur in the stats but have no metadata entry.
    for lang in langs_src.union(langs_tgt):
        if lang not in lang2name:
            print(f"Name not found for {lang}")

    def named_langs(langs_list):
        """Turn language codes into dropdown choices.

        A code with a known name becomes a ``(label, value)`` tuple whose
        label is "<code> — <name>"; a code without a name stays a bare string.
        """
        return [
            (f"{lang} — {lang2name[lang]}", lang) if lang in lang2name else lang
            for lang in langs_list
        ]

    with gr.Tab("Leaderboard"):
        gr.Markdown("# BOUQuET translation leaderboard")
        gr.Markdown(INTRO)
        gr.Markdown("## Systems ranking")
        # Inputs
        gr_level = gr.Dropdown(levels, value="sentence_level", label="Level")
        gr_src_lang = gr.Dropdown(
            [ALL] + named_langs(sorted(langs_src)), value=ALL, label="Source lang"
        )
        gr_tgt_lang = gr.Dropdown(
            [ALL] + named_langs(sorted(langs_tgt)), value=ALL, label="Target lang"
        )
        # Interactivity
        inputs = [gr_level, gr_src_lang, gr_tgt_lang]

        def get_lb(level, src_lang, tgt_lang):
            """Compute the ranking table for the selected filters.

            Keeps the rows of the given level (and of the given source/target
            language unless it is ``ALL``), averages every metric per system,
            and returns a pandas Styler sorted by ascending ``metricx_both``
            (METRICS_EXPLANATION states that lower is better for it).
            """
            filtered = stats[stats["level"].eq(level)]
            if src_lang != ALL:
                filtered = filtered[filtered["src_lang"].eq(src_lang)]
            if tgt_lang != ALL:
                filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
            means = (
                filtered.groupby(["system"])[metrics]
                .mean()
                .reset_index()
                .sort_values("metricx_both")
            )
            means.columns = [strip_colname(c) for c in means.columns]
            # Colour gradient on the cells, numbers shown with 4 decimals.
            styler = means.style.background_gradient().format(precision=4)
            return styler

        # Render the initial table from the default values of the controls.
        df_all = get_lb(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(df_all)
        # Recompute the table whenever any of the three controls changes.
        for inp in inputs:
            inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)

        # Interdependency of the controls
        def src2tgt(src_lang, tgt_lang):
            """Limit the target choices to those paired with ``src_lang``.

            With ``ALL`` as source, every target language is offered. The
            current target value is passed back unchanged.
            """
            if src_lang == ALL:
                choices = [ALL] + named_langs(sorted(langs_tgt))
            else:
                choices = [ALL] + named_langs(sorted(lang_src2tgt[src_lang]))
            return gr.update(choices=choices, value=tgt_lang)

        def tgt2src(src_lang, tgt_lang):
            """Limit the source choices to those paired with ``tgt_lang``.

            With ``ALL`` as target, every source language is offered. The
            current source value is passed back unchanged.
            """
            if tgt_lang == ALL:
                choices = [ALL] + named_langs(sorted(langs_src))
            else:
                choices = [ALL] + named_langs(sorted(lang_tgt2src[tgt_lang]))
            return gr.update(choices=choices, value=src_lang)

        # These listeners use the `input` event, not `change`.
        # NOTE(review): presumably so that the programmatic updates made by
        # one handler do not trigger the other -- confirm.
        gr_src_lang.input(
            fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang
        )
        gr_tgt_lang.input(
            fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang
        )
        gr.Markdown("## Languages difficulty")
        gr_system = gr.Dropdown(
            [MEAN, BEST] + systems, value=MEAN, label="Translation system"
        )
        gr_direction = gr.Dropdown(
            [XX2EN, EN2XX], value=XX2EN, label="Translation direction"
        )
        gr_metric = gr.Dropdown(metrics, label="Quality metric", value="metricx_both")
        gr_level2 = gr.Dropdown(levels, value="sentence_level", label="Level")
        bar_controls = [gr_system, gr_direction, gr_metric, gr_level2]

        def get_hist(system, direction, metric, level):
            """Compute the per-language bar-plot data and settings.

            Selects the rows translating from or into ``eng_Latn`` (depending
            on ``direction``) at the given level, optionally for one system,
            and aggregates ``metric`` per non-English language. Returns the
            result of ``gr.update(...)`` holding the data and plot options.
            """
            # decide on the data to process
            if direction == EN2XX:
                direction_filter = stats["src_lang"].eq("eng_Latn")
                lang_col = "tgt_lang"
            else:
                direction_filter = stats["tgt_lang"].eq("eng_Latn")
                lang_col = "src_lang"
            if system in (MEAN, BEST):
                # NOTE(review): casting the system names to bool gives True
                # for every non-empty name; presumably meant as a "keep all
                # systems" mask -- confirm.
                system_filter = stats["system"].astype(bool)
            else:
                system_filter = stats["system"].eq(system)
            subset = stats[system_filter & direction_filter & stats["level"].eq(level)]
            # Compute the means and update the plot
            grouped = subset.groupby(lang_col)[metric]
            # For metricx_both the minimum is taken as the best score; for
            # the other metrics the maximum is.
            if metric == "metricx_both":
                bests = grouped.min()
                best_sys = grouped.idxmin()
            else:
                bests = grouped.max()
                best_sys = grouped.idxmax()
            # "Best" plots the best score per language; any other choice
            # plots the mean over the rows kept in `subset`.
            if system == BEST:
                means = bests
            else:
                means = grouped.mean()
            report = (
                pd.DataFrame(
                    {
                        metric: means,
                        # best_sys holds row labels of `subset`; look up the
                        # system name stored in each of those rows.
                        "best_system": subset.loc[best_sys]["system"].values,
                    }
                )
                # Ascending order only for metricx_both, descending otherwise.
                .sort_values(metric, ascending=(metric == "metricx_both"))
                .reset_index()
            )
            # Languages without a known name get an empty tooltip label.
            report["lang_name"] = [lang2name.get(lang, "") for lang in report[lang_col]]
            tooltip_columns = ["lang_name", "best_system"]
            return gr.update(
                value=report,
                x=lang_col,
                y=metric,
                x_label_angle=-90,
                height=500,
                sort="y",
                tooltip=tooltip_columns,
            )

        # The same function supplies the initial plot arguments (unpacked as
        # keyword arguments) and the updates for later control changes.
        default_bar = get_hist(*[x.value for x in bar_controls])
        gr_barplot = gr.BarPlot(**default_bar)
        for inp in bar_controls:
            inp.change(fn=get_hist, inputs=bar_controls, outputs=gr_barplot)
        gr.Markdown(METRICS_EXPLANATION)
        gr.Markdown(SYSTEMS_EXPLANATION)
        gr.Markdown(LANGS_EXPLANATION)
        # Language metadata table without the "Class" column; missing values
        # are rendered as empty strings.
        gr.Dataframe(langs_df.drop(columns=["Class"]).style.format(na_rep=""))
|