import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler

from content import *
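
# Benchmark identifiers as they appear in the result files. BENCHMARKS and METRICS
# are parallel lists: METRICS[i] is the metric we report for BENCHMARKS[i].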
| ARC = "arc" | |
| HELLASWAG = "hellaswag" | |
| MMLU = "mmlu" | |
| TRUTHFULQA = "truthfulqa" | |
| BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA] | |
| METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"] | |


def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
     dictionaries of the form {benchmark_name: performance_score}
    """
    performance_dict = defaultdict(dict)
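    # Each result file is expected to follow the output format of, e.g., EleutherAI's
    # lm-evaluation-harness (an assumption based on the keys checked below): a
    # top-level "results" dict and a "config" dict containing a "model_args" string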
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data or "config" not in data:
            continue
        results = data["results"]
        config = data["config"]

        if "model_args" not in config:
            continue
        model_args = config["model_args"].split(",")
        pretrained = [x for x in model_args if x.startswith("pretrained=")]
        if len(pretrained) != 1:
            continue
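        # "pretrained" has the form "pretrained=<org>/<model_name>"; keep only the model name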
        pretrained = pretrained[0].split("=")[1]
        pretrained = pretrained.split("/")[-1]
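
        # Result keys have the form "<task>_<lang>", e.g. "arc_nl"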
        for lang_task, perfs in results.items():
            task, lang = lang_task.split("_")
            assert task in BENCHMARKS
            if lang and task:
                metric = METRICS[BENCHMARKS.index(task)]
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p

    return dict(performance_dict)


def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Builds a dataframe from the performance dictionary
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the
     values are dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
    """
    data = []
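    # dutch_models.json maps a model name to its Dutch training type (cf. the legend
    # in the interface below); models that are not listed default to "NA"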
    dutch_training_info = json.loads(
        Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8")
    )
    for (pretrained, lang), perfs in performance_dict.items():
        arc_perf = perfs.get(ARC, 0.0)
        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
        mmlu_perf = perfs.get(MMLU, 0.0)
        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
        training_type = dutch_training_info.get(pretrained, "NA")
        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
        row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
        data.append(row)

    df = pd.DataFrame.from_records(data, columns=COLS)
    df = df.sort_values(by=[AVERAGE_COL], ascending=False)

    return df


def style_df(df: DataFrame) -> Styler:
    """
    Styles the dataframe by formatting scores to two decimals and putting the max value in bold per column
    :param df: the dataframe to style
    :return: the Styler
    """
    styler = df.style.format("{:.2f}", subset=df.columns[2:])

    def highlight_max(col):
        return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

    # Bold the best score per benchmark: axis=0 applies highlight_max column-wise
    styler = styler.apply(highlight_max, axis=0, subset=df.columns[2:])
    # Hide the index
    styler = styler.hide()

    return styler


MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"

COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
# One datatype per column in COLS (the training-type column is also a string)
TYPES = ["str", "str", "number", "number", "number", "number", "number"]

results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)
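
# Static Gradio interface: the leaderboard table, a legend for the training types,
# and a LaTeX export of the styled table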
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
    gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )
| gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on" | |
| " Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data") | |
| gr.Markdown("## LaTeX") | |
| gr.Code(styled_df.to_latex(convert_css=True)) | |
| gr.Markdown(CREDIT, elem_classes="markdown-text") | |
| gr.Markdown(CITATION, elem_classes="markdown-text") | |


if __name__ == "__main__":
    demo.launch()