Bram Vanroy committed on
Commit ·
2c801d0
1
Parent(s): 851256b
add training type
Browse files
app.py
CHANGED
|
@@ -62,18 +62,22 @@ def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float
|
|
| 62 |
:return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
|
| 63 |
"""
|
| 64 |
data = []
|
|
|
|
|
|
|
| 65 |
for (pretrained, lang), perfs in performance_dict.items():
|
| 66 |
arc_perf = perfs.get(ARC, 0.0)
|
| 67 |
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
|
| 68 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
| 69 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
|
|
|
| 70 |
|
| 71 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
| 72 |
-
row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
|
| 73 |
data.append(row)
|
| 74 |
|
| 75 |
df = pd.DataFrame.from_records(data, columns=COLS)
|
| 76 |
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
|
|
|
| 77 |
return df
|
| 78 |
|
| 79 |
|
|
@@ -83,12 +87,12 @@ def style_df(df: DataFrame) -> Styler:
|
|
| 83 |
:param df: the dataframe to style
|
| 84 |
:return: the Styler
|
| 85 |
"""
|
| 86 |
-
styler = df.style.format("{:.2f}", subset=df.columns[
|
| 87 |
|
| 88 |
def highlight_max(col):
|
| 89 |
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
|
| 90 |
|
| 91 |
-
styler = styler.apply(highlight_max, axis=1, subset=df.columns[
|
| 92 |
styler = styler.hide()
|
| 93 |
return styler
|
| 94 |
|
|
@@ -99,8 +103,9 @@ ARC_COL = "ARC (25-shot)"
|
|
| 99 |
HELLASWAG_COL = "HellaSwag (10-shot)️"
|
| 100 |
MMLU_COL = "MMLU (5-shot)"
|
| 101 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
|
|
|
| 102 |
|
| 103 |
-
COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
|
| 104 |
TYPES = ["str", "number", "number", "number", "number", "number"]
|
| 105 |
|
| 106 |
results = collect_results()
|
|
@@ -117,6 +122,8 @@ with gr.Blocks() as demo:
|
|
| 117 |
datatype=TYPES,
|
| 118 |
elem_id="leaderboard-table",
|
| 119 |
)
|
|
|
|
|
|
|
| 120 |
|
| 121 |
gr.Markdown("## LaTeX")
|
| 122 |
gr.Code(styled_df.to_latex(convert_css=True))
|
|
|
|
| 62 |
:return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
|
| 63 |
"""
|
| 64 |
data = []
|
| 65 |
+
dutch_training_info = json.loads(Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8"))
|
| 66 |
+
|
| 67 |
for (pretrained, lang), perfs in performance_dict.items():
|
| 68 |
arc_perf = perfs.get(ARC, 0.0)
|
| 69 |
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
|
| 70 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
| 71 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
| 72 |
+
training_type = dutch_training_info.get(pretrained, "NA")
|
| 73 |
|
| 74 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
| 75 |
+
row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
|
| 76 |
data.append(row)
|
| 77 |
|
| 78 |
df = pd.DataFrame.from_records(data, columns=COLS)
|
| 79 |
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
| 80 |
+
|
| 81 |
return df
|
| 82 |
|
| 83 |
|
|
|
|
| 87 |
:param df: the dataframe to style
|
| 88 |
:return: the Styler
|
| 89 |
"""
|
| 90 |
+
styler = df.style.format("{:.2f}", subset=df.columns[2:])
|
| 91 |
|
| 92 |
def highlight_max(col):
|
| 93 |
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
|
| 94 |
|
| 95 |
+
styler = styler.apply(highlight_max, axis=1, subset=df.columns[2:])
|
| 96 |
styler = styler.hide()
|
| 97 |
return styler
|
| 98 |
|
|
|
|
| 103 |
HELLASWAG_COL = "HellaSwag (10-shot)️"
|
| 104 |
MMLU_COL = "MMLU (5-shot)"
|
| 105 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
| 106 |
+
TRAIN_TYPE_COL = "Training type"
|
| 107 |
|
| 108 |
+
COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
|
| 109 |
# Gradio datatypes, one per entry in COLS and in the same order: the model name
# and the training type are text, the average and the four benchmark scores
# are numeric. Must be kept in sync with COLS whenever a column is added.
TYPES = ["str", "str", "number", "number", "number", "number", "number"]
|
| 110 |
|
| 111 |
results = collect_results()
|
|
|
|
| 122 |
datatype=TYPES,
|
| 123 |
elem_id="leaderboard-table",
|
| 124 |
)
|
| 125 |
+
gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on"
|
| 126 |
+
" Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data")
|
| 127 |
|
| 128 |
gr.Markdown("## LaTeX")
|
| 129 |
gr.Code(styled_df.to_latex(convert_css=True))
|