Spaces:
Running
Running
File size: 5,280 Bytes
c708975 ca25c6f c708975 ca25c6f 653989f ca25c6f 3288843 47afef3 653989f ca25c6f 653989f ca25c6f 653989f ca25c6f 653989f c708975 653989f c708975 ca25c6f 653989f ca25c6f 3288843 ca25c6f 653989f ca25c6f 3288843 ca25c6f 653989f ca25c6f 653989f ca25c6f c708975 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import random
from pathlib import Path
import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns
# Directory containing this script. Data files are resolved against it so the
# app also starts when it is launched from a different working directory.
abs_path = Path(__file__).parent

# English core benchmark table. The "#Tokens" and "AVG" columns shipped in the
# CSV are discarded; the average is recomputed below from the remaining columns.
df_core = pd.read_csv(abs_path / "opensci-ref-table.csv").drop(
    columns=["#Tokens", "AVG"]
)

# Every column after the first one is treated as a benchmark score.
# NOTE(review): assumes the first CSV column is "Model" (the UI searches on
# that name) - confirm against the data file.
benchmarks_core = df_core.columns[1:]
df_core["Average ⬆️"] = df_core[benchmarks_core].mean(axis=1)

# Best average first.
df_core = df_core.sort_values(by="Average ⬆️", ascending=False)
# Instruction-tuning results. The code below reads the columns "model_B",
# "benchmark", "preference" and (further down the file) "instruction_index".
df_instruction_tuning = pd.read_csv(
    Path(__file__).parent / "results_instruction_tuning.csv.zip"
)

# Leave out every model whose name contains "12b". The explicit .copy() makes
# the result an independent frame, so the column assignment below does not
# write into a slice of the frame returned by read_csv.
df_instruction_tuning = df_instruction_tuning[
    ~df_instruction_tuning["model_B"].str.contains("12b")
].copy()

# Keep only the last path component of the model identifier
# (e.g. "org/name" -> "name").
df_instruction_tuning["model_B"] = (
    df_instruction_tuning["model_B"].str.split("/").str[-1]
)

# One row per model, one column per benchmark; pivot_table averages the
# "preference" values that fall into the same cell.
df_instruction_tuning_pivot = (
    df_instruction_tuning.pivot_table(
        index="model_B", columns="benchmark", values="preference"
    )
    .rename_axis(index="Model")
    .reset_index()
)

# str.capitalize() upper-cases the first character and lower-cases the rest,
# so a benchmark such as "m-arena-hard-EU" is shown as "M-arena-hard-eu".
df_instruction_tuning_pivot.columns = [
    name.capitalize() for name in df_instruction_tuning_pivot.columns
]

# Column 0 is "Model"; everything after it is a benchmark score.
df_instruction_tuning_pivot["Average ⬆️"] = df_instruction_tuning_pivot.iloc[
    :, 1:
].mean(axis=1)

# Best average first.
df_instruction_tuning_pivot = df_instruction_tuning_pivot.sort_values(
    by="Average ⬆️", ascending=False
)
# Per-language breakdown of the "m-arena-hard-EU" benchmark.
df_mah_pivot = df_instruction_tuning[
    df_instruction_tuning["benchmark"] == "m-arena-hard-EU"
].copy()

# The language code is whatever follows the last "-" in instruction_index.
# It is derived from the filtered rows only, so rows of other benchmarks
# (whose instruction_index need not follow that format) are never parsed.
df_mah_pivot["lang"] = df_mah_pivot["instruction_index"].apply(
    lambda index_value: index_value.split("-")[-1]
)

# One row per model, one column per language; pivot_table averages the
# "preference" values that fall into the same cell.
df_mah_pivot = df_mah_pivot.pivot_table(
    index="model_B", columns="lang", values="preference"
)
df_mah_pivot["Average ⬆️"] = df_mah_pivot.mean(axis=1)

# Best average first, then expose the model name as a regular "Model" column.
df_mah_pivot = (
    df_mah_pivot.sort_values(by="Average ⬆️", ascending=False)
    .rename_axis(index="Model")
    .reset_index()
)
# Model names kept as a module-level list. Nothing in the visible part of this
# file reads it. "Llama-3.1-8B" is deliberately not part of the list.
cols = [
    "Llama-3.1-Tulu-3-8B-SFT", "Llama-3.2-3B-Instruct",
    "Llama-3.1-Tulu-3-8B-DPO", "Apertus-8B-Instruct-2509",
]
# England flag emoji: WAVING BLACK FLAG followed by the tag characters "gbeng"
# and a CANCEL TAG. Spelled with escapes because tag characters are invisible
# in most editors and are easily damaged when the file is re-encoded.
_ENGLAND_FLAG = (
    "\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F"
)
# NOTE(review): the "Instruction-tuning" tab label carries three stray tag
# characters ("g", "b", "e") between the dart emoji and the England flag. They
# are kept so the label is unchanged, but they look like a copy/paste leftover
# and can probably be removed.
_STRAY_TAG_CHARS = "\U000E0067\U000E0062\U000E0065"


def _add_leaderboard(df, *, preselect_all_columns=True):
    """Create a Leaderboard for *df* inside the currently open Gradio context.

    Values are rounded to two decimals for display. The "Model" column cannot
    be deselected and is the column the search box filters on.

    Args:
        df: Table to display; expected to contain a "Model" column.
        preselect_all_columns: When True, every column of *df* is passed as
            the default selection. When False, ``default_selection`` is not
            passed at all and the library default applies.

    Returns:
        The created ``Leaderboard`` component.
    """
    select_options = {
        "cant_deselect": ["Model"],
        "label": "Select Columns to Display:",
    }
    if preselect_all_columns:
        select_options["default_selection"] = list(df.columns)
    return Leaderboard(
        value=df.round(2),
        select_columns=SelectColumns(**select_options),
        search_columns=SearchColumns(
            primary_column="Model",
            label="Filter a model",
            secondary_columns=[],
        ),
    )


# Flag shown next to each language column of the multi-lingual table.
language_flags = {
    "cs": "🇨🇿",
    "de": "🇩🇪",
    "el": "🇬🇷",
    "en": "🇬🇧",
    "es": "🇪🇸",
    "fr": "🇫🇷",
    "it": "🇮🇹",
    "nl": "🇳🇱",
    "pl": "🇵🇱",
    "pt": "🇵🇹",
    "ro": "🇷🇴",
    "uk": "🇺🇦",
}
# Append the flag to every language column; other columns ("Model", the
# average) keep their name.
df_mah_pivot.columns = [
    f"{x} {language_flags[x]}" if x in language_flags else x
    for x in df_mah_pivot.columns
]

with gr.Blocks() as demo:
    gr.Markdown("\n# 🥇 OpenEuroLLM Leaderboard 🇪🇺\n")
    with gr.Tabs():
        with gr.Tab(f"English Core {_ENGLAND_FLAG}🇺🇸"):
            _add_leaderboard(df_core)

        with gr.Tab(f"Instruction-tuning 🎯{_STRAY_TAG_CHARS}{_ENGLAND_FLAG}"):
            gr.Markdown(
                "\nWinrate against Llama-3.1-8B-Instruct using "
                "Llama-3.1-70B-Instruct as the LLM-judge.\n"
            )
            # This tab does not pass a default column selection.
            _add_leaderboard(
                df_instruction_tuning_pivot, preselect_all_columns=False
            )

        with gr.Tab("Instruction-tuning multi-lingual 🎯🇪🇺"):
            gr.Markdown(
                "\nWinrate on m-Arena-Hard instructions against "
                "Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the "
                "LLM-judge.\n"
            )
            _add_leaderboard(df_mah_pivot)
# Start the Gradio app only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch()
|