Spaces:
Running
Running
| import random | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns | |
# Directory containing this script; data files are resolved against it so the
# app does not depend on the current working directory.
abs_path = Path(__file__).parent

# --- "English Core" table -------------------------------------------------
# Load the reference table and discard the "#Tokens" and precomputed "AVG"
# columns; the average is recomputed below from the remaining columns.
df_core = pd.read_csv(abs_path / "opensci-ref-table.csv")
df_core.drop(columns=["#Tokens", "AVG"], inplace=True)

# Every column after the first is treated as a benchmark score
# (presumably the first column is "Model" -- the UI searches on that name).
benchmarks_core = df_core.columns[1:]

# Row-wise mean over the benchmark columns, best average first.
df_core["Average ⬆️"] = df_core[benchmarks_core].mean(axis=1)
df_core.sort_values(by="Average ⬆️", ascending=False, inplace=True)
# --- "Instruction-tuning" table -------------------------------------------
# Raw per-instruction results; pandas infers the zip compression from the
# file extension. The path is resolved next to this script.
df_instruction_tuning = pd.read_csv(abs_path / "results_instruction_tuning.csv.zip")

# Exclude rows whose model_B identifier contains "12b". Take an explicit copy
# so the column assignment below modifies an independent frame instead of a
# slice of the original (avoids pandas' SettingWithCopy ambiguity).
df_instruction_tuning = df_instruction_tuning[
    ~df_instruction_tuning.model_B.str.contains("12b")
].copy()

# Keep only the last path component of the model identifier ("org/name" -> "name").
df_instruction_tuning["model_B"] = (
    df_instruction_tuning["model_B"].str.rsplit("/", n=1).str[-1]
)

# One row per model, one column per benchmark; pivot_table averages the
# "preference" values that fall into each cell.
df_instruction_tuning_pivot = (
    df_instruction_tuning.pivot_table(
        index="model_B", columns="benchmark", values="preference"
    )
    .rename_axis(index="Model")
    .reset_index()
)

# Normalise the headers: first letter upper-case, the rest lower-case.
df_instruction_tuning_pivot.columns = [
    x.capitalize() for x in df_instruction_tuning_pivot.columns
]

# The first column is the model name; average all benchmark columns after it
# and rank the models by that average, best first.
df_instruction_tuning_pivot["Average ⬆️"] = df_instruction_tuning_pivot[
    df_instruction_tuning_pivot.columns[1:]
].mean(axis=1)
df_instruction_tuning_pivot.sort_values(
    by="Average ⬆️", ascending=False, inplace=True
)
# --- "Instruction-tuning multi-lingual" table -----------------------------
# Restrict to the m-arena-hard-EU benchmark; copy so that adding the "lang"
# column does not write into a slice of df_instruction_tuning.
df_mah_pivot = df_instruction_tuning[
    df_instruction_tuning.benchmark == "m-arena-hard-EU"
].copy()

# The language code is the last "-"-separated token of instruction_index.
# It is derived from the filtered rows only, rather than from every row of
# df_instruction_tuning and then relying on index alignment.
df_mah_pivot["lang"] = df_mah_pivot["instruction_index"].str.rsplit("-", n=1).str[-1]

# One row per model, one column per language; cells hold the mean preference.
df_mah_pivot = df_mah_pivot.pivot_table(
    index="model_B", columns="lang", values="preference"
)

# Average across all language columns, rank best first, then expose the
# model name as a regular "Model" column.
df_mah_pivot["Average ⬆️"] = df_mah_pivot.mean(axis=1)
df_mah_pivot = (
    df_mah_pivot.sort_values(by="Average ⬆️", ascending=False)
    .rename_axis(index="Model")
    .reset_index()
)
# Model names kept as a module-level list. Nothing in the visible part of this
# file reads it. The "Llama-3.1-8B" entry is disabled.
cols = [
    # "Llama-3.1-8B",
    "Llama-3.1-Tulu-3-8B-SFT",
    "Llama-3.2-3B-Instruct",
    "Llama-3.1-Tulu-3-8B-DPO",
    "Apertus-8B-Instruct-2509",
]
# England flag emoji: black flag followed by the tag characters "gbeng" and a
# cancel tag. Written with escapes because the tag characters are invisible in
# most editors and easily lost when the file is copied around.
_ENGLAND_FLAG = (
    "\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F"
)

# --- UI -------------------------------------------------------------------
# Three tabs, each showing one of the data frames prepared above in a
# Leaderboard component with column selection and a model-name search box.
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # 🥇 OpenEuroLLM Leaderboard 🇪🇺
    """
    )
    with gr.Tabs():
        with gr.Tab(f"English Core {_ENGLAND_FLAG}🇺🇸"):
            Leaderboard(
                value=df_core.round(2),
                select_columns=SelectColumns(
                    default_selection=list(df_core.columns),
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )
        with gr.Tab(f"Instruction-tuning 🎯{_ENGLAND_FLAG}"):
            gr.Markdown(
                """
            Winrate against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
            """
            )
            Leaderboard(
                value=df_instruction_tuning_pivot.round(2),
                # No default_selection is passed here, so the component's own
                # default column selection applies.
                select_columns=SelectColumns(
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )
        with gr.Tab("Instruction-tuning multi-lingual 🎯🇪🇺"):
            gr.Markdown(
                """
            Winrate on m-Arena-Hard instructions against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
            """
            )
            # Flag shown next to each language-code column header.
            language_flags = {
                "cs": "🇨🇿",
                "de": "🇩🇪",
                "el": "🇬🇷",
                "en": "🇬🇧",
                "es": "🇪🇸",
                "fr": "🇫🇷",
                "it": "🇮🇹",
                "nl": "🇳🇱",
                "pl": "🇵🇱",
                "pt": "🇵🇹",
                "ro": "🇷🇴",
                "uk": "🇺🇦",
            }
            # Append the flag to language columns; other columns (e.g. "Model",
            # the average) keep their names unchanged.
            df_mah_pivot.columns = [
                f"{x} {language_flags[x]}" if x in language_flags else x
                for x in df_mah_pivot.columns
            ]
            Leaderboard(
                value=df_mah_pivot.round(2),
                select_columns=SelectColumns(
                    default_selection=list(df_mah_pivot.columns),
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )

# Start the Gradio server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()