import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler

from content import *
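
# Benchmark identifiers as they appear in the result files. BENCHMARKS and METRICS
# are parallel lists: METRICS[i] is the metric we report for BENCHMARKS[i].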
| ARC = "arc" | |
| HELLASWAG = "hellaswag" | |
| MMLU = "mmlu" | |
| TRUTHFULQA = "truthfulqa" | |
| BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA] | |
| METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"] | |


def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
     dictionaries of the form {benchmark_name: performance_score}
    """
    performance_dict = defaultdict(dict)
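    # Each result file is expected to follow the output format of, e.g., EleutherAI's
    # lm-evaluation-harness (an assumption based on the keys checked below): a
    # top-level "results" dict and a "config" dict containing a "model_args" string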
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data or "config" not in data:
            continue
        results = data["results"]
        config = data["config"]

        if "model_args" not in config:
            continue
        model_args = config["model_args"].split(",")
        pretrained = [x for x in model_args if x.startswith("pretrained=")]
        if len(pretrained) != 1:
            continue
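        # "pretrained" has the form "pretrained=<org>/<model_name>"; keep only the model name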
        pretrained = pretrained[0].split("=")[1]
        pretrained = pretrained.split("/")[-1]
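
        # Result keys have the form "<task>_<lang>", e.g. "arc_nl"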
        for lang_task, perfs in results.items():
            task, lang = lang_task.split("_")
            assert task in BENCHMARKS
            if lang and task:
                metric = METRICS[BENCHMARKS.index(task)]
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p

    return dict(performance_dict)


def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Builds a dataframe from the performance dictionary
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the
     values are dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
    """
    data = []
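    # dutch_models.json maps a model name to its Dutch training type (cf. the legend
    # in the interface below); models that are not listed default to "NA"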
    dutch_training_info = json.loads(
        Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8")
    )
    for (pretrained, lang), perfs in performance_dict.items():
        arc_perf = perfs.get(ARC, 0.0)
        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
        mmlu_perf = perfs.get(MMLU, 0.0)
        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
        training_type = dutch_training_info.get(pretrained, "NA")
        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
        row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
        data.append(row)

    df = pd.DataFrame.from_records(data, columns=COLS)
    df = df.sort_values(by=[AVERAGE_COL], ascending=False)

    return df


def style_df(df: DataFrame) -> Styler:
    """
    Styles the dataframe by formatting scores to two decimals and putting the max value in bold per column
    :param df: the dataframe to style
    :return: the Styler
    """
    styler = df.style.format("{:.2f}", subset=df.columns[2:])

    def highlight_max(col):
        return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

    # Bold the best score per benchmark: axis=0 applies highlight_max column-wise
    styler = styler.apply(highlight_max, axis=0, subset=df.columns[2:])
    # Hide the index
    styler = styler.hide()

    return styler


MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"

COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
# One datatype per column in COLS (the training-type column is also a string)
TYPES = ["str", "str", "number", "number", "number", "number", "number"]

results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)
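
# Static Gradio interface: the leaderboard table, a legend for the training types,
# and a LaTeX export of the styled table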
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
    gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )
| gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on" | |
| " Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data") | |
| gr.Markdown("## LaTeX") | |
| gr.Code(styled_df.to_latex(convert_css=True)) | |
| gr.Markdown(CREDIT, elem_classes="markdown-text") | |
| gr.Markdown(CITATION, elem_classes="markdown-text") | |


if __name__ == "__main__":
    demo.launch()