"""Gradio app rendering the OpenEuroLLM leaderboard.

Builds three leaderboard tabs from local CSV dumps:
  * English core benchmarks (``opensci-ref-table.csv``)
  * Instruction-tuning winrates vs Llama-3.1-8B-Instruct
  * Multi-lingual (m-Arena-Hard) winrates broken down per EU language
"""

import random  # NOTE(review): unused here; kept in case other chunks rely on it
from pathlib import Path

import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SearchColumns, SelectColumns

# Resolve data files relative to this script rather than the CWD, so the app
# works no matter where it is launched from.  (abs_path was previously
# computed but never used -- every read_csv call relied on the CWD.)
abs_path = Path(__file__).parent

# ---------------------------------------------------------------------------
# English core benchmarks
# ---------------------------------------------------------------------------
df_core = pd.read_csv(abs_path / "opensci-ref-table.csv")
# Drop bookkeeping columns; the average is recomputed below from the
# remaining benchmark columns.
df_core.drop(["#Tokens", "AVG"], axis=1, inplace=True)
benchmarks_core = df_core.columns[1:]  # first column is the model name
df_core["Average ⬆️"] = df_core.loc[:, benchmarks_core].mean(axis=1)
df_core.sort_values(by="Average ⬆️", ascending=False, inplace=True)

# ---------------------------------------------------------------------------
# Instruction-tuning winrates (long format: one row per model/benchmark pair)
# ---------------------------------------------------------------------------
df_instruction_tuning = pd.read_csv(abs_path / "results_instruction_tuning.csv.zip")
# 12b models are excluded from the comparison.
df_instruction_tuning = df_instruction_tuning[
    ~df_instruction_tuning.model_B.str.contains("12b")
]
# Strip the org prefix ("org/model" -> "model").
df_instruction_tuning.model_B = df_instruction_tuning.model_B.apply(
    lambda s: s.split("/")[-1]
)

# Pivot to one row per model, one column per benchmark.
df_instruction_tuning_pivot = df_instruction_tuning.pivot_table(
    index="model_B", columns="benchmark", values="preference"
)
df_instruction_tuning_pivot.index.rename("Model", inplace=True)
df_instruction_tuning_pivot.reset_index(drop=False, inplace=True)
df_instruction_tuning_pivot.columns = [
    x.capitalize() for x in df_instruction_tuning_pivot.columns
]
# First column is the model name; average over the benchmark columns only.
df_instruction_tuning_pivot["Average ⬆️"] = df_instruction_tuning_pivot.loc[
    :, df_instruction_tuning_pivot.columns[1:]
].mean(axis=1)
df_instruction_tuning_pivot.sort_values(by="Average ⬆️", ascending=False, inplace=True)

# ---------------------------------------------------------------------------
# Multi-lingual (m-Arena-Hard) winrates, one column per language
# ---------------------------------------------------------------------------
df_mah_pivot = df_instruction_tuning[
    df_instruction_tuning.benchmark == "m-arena-hard-EU"
].copy()
# instruction_index ends in the language code ("...-de" -> "de"); the
# assignment aligns on the shared index of the filtered copy.
df_mah_pivot["lang"] = df_instruction_tuning.instruction_index.apply(
    lambda s: s.split("-")[-1]
)
df_mah_pivot = df_mah_pivot.pivot_table(
    index="model_B", columns="lang", values="preference"
)
df_mah_pivot["Average ⬆️"] = df_mah_pivot.mean(axis=1)
df_mah_pivot.sort_values(by="Average ⬆️", ascending=False, inplace=True)
df_mah_pivot.index.rename("Model", inplace=True)
df_mah_pivot.reset_index(drop=False, inplace=True)

# NOTE(review): currently unused in this file -- presumably a column subset
# for a planned/other view; verify before removing.
cols = [
    #'Llama-3.1-8B',
    "Llama-3.1-Tulu-3-8B-SFT",
    "Llama-3.2-3B-Instruct",
    "Llama-3.1-Tulu-3-8B-DPO",
    "Apertus-8B-Instruct-2509",
]

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🥇 OpenEuroLLM Leaderboard 🇪🇺
        """
    )
    with gr.Tabs():
        with gr.Tab("English Core 🏴󠁧󠁢󠁥󠁮󠁧󠁿🇺🇸"):
            Leaderboard(
                value=df_core.round(2),
                select_columns=SelectColumns(
                    default_selection=list(df_core.columns),
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )
        with gr.Tab("Instruction-tuning 🎯󠁧󠁢󠁥🏴󠁧󠁢󠁥󠁮󠁧󠁿"):
            gr.Markdown(
                """
                Winrate against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
                """
            )
            Leaderboard(
                value=df_instruction_tuning_pivot.round(2),
                select_columns=SelectColumns(
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )
        with gr.Tab("Instruction-tuning multi-lingual 🎯🇪🇺"):
            gr.Markdown(
                """
                Winrate on m-Arena-Hard instructions against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
                """
            )
            # Decorate the language-code columns with their country flags.
            language_flags = {
                "cs": "🇨🇿",
                "de": "🇩🇪",
                "el": "🇬🇷",
                "en": "🇬🇧",
                "es": "🇪🇸",
                "fr": "🇫🇷",
                "it": "🇮🇹",
                "nl": "🇳🇱",
                "pl": "🇵🇱",
                "pt": "🇵🇹",
                "ro": "🇷🇴",
                "uk": "🇺🇦",
            }
            df_mah_pivot.columns = [
                f"{x} {language_flags[x]}" if x in language_flags else x
                for x in df_mah_pivot.columns
            ]
            Leaderboard(
                value=df_mah_pivot.round(2),
                select_columns=SelectColumns(
                    default_selection=list(df_mah_pivot.columns),
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )

if __name__ == "__main__":
    demo.launch()