round results
Browse files- functions.py +7 -7
functions.py
CHANGED
|
@@ -40,7 +40,7 @@ def get_task_summary(results):
|
|
| 40 |
{"dataset_type":"HuggingFaceH4/ifeval",
|
| 41 |
"dataset_name":"IFEval (0-Shot)",
|
| 42 |
"metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
|
| 43 |
-
"metric_value":results["IFEval"],
|
| 44 |
"dataset_config": None, # don't know
|
| 45 |
"dataset_split": None, # don't know
|
| 46 |
"dataset_revision":None,
|
|
@@ -51,7 +51,7 @@ def get_task_summary(results):
|
|
| 51 |
{"dataset_type":"BBH",
|
| 52 |
"dataset_name":"BBH (3-Shot)",
|
| 53 |
"metric_type":"acc_norm",
|
| 54 |
-
"metric_value":results["BBH"],
|
| 55 |
"dataset_config": None, # don't know
|
| 56 |
"dataset_split": None, # don't know
|
| 57 |
"dataset_revision":None,
|
|
@@ -63,7 +63,7 @@ def get_task_summary(results):
|
|
| 63 |
"dataset_type":"hendrycks/competition_math",
|
| 64 |
"dataset_name":"MATH Lvl 5 (4-Shot)",
|
| 65 |
"metric_type":"exact_match",
|
| 66 |
-
"metric_value":results["MATH Lvl 5"],
|
| 67 |
"dataset_config": None, # don't know
|
| 68 |
"dataset_split": None, # don't know
|
| 69 |
"dataset_revision":None,
|
|
@@ -75,7 +75,7 @@ def get_task_summary(results):
|
|
| 75 |
"dataset_type":"Idavidrein/gpqa",
|
| 76 |
"dataset_name":"GPQA (0-shot)",
|
| 77 |
"metric_type":"acc_norm",
|
| 78 |
-
"metric_value":results["GPQA"],
|
| 79 |
"dataset_config": None, # don't know
|
| 80 |
"dataset_split": None, # don't know
|
| 81 |
"dataset_revision":None,
|
|
@@ -87,7 +87,7 @@ def get_task_summary(results):
|
|
| 87 |
"dataset_type":"TAUR-Lab/MuSR",
|
| 88 |
"dataset_name":"MuSR (0-shot)",
|
| 89 |
"metric_type":"acc_norm",
|
| 90 |
-
"metric_value":results["MUSR"],
|
| 91 |
"dataset_config": None, # don't know
|
| 92 |
"dataset_split": None, # don't know
|
| 93 |
"dataset_args":{"num_few_shot": 0},
|
|
@@ -98,7 +98,7 @@ def get_task_summary(results):
|
|
| 98 |
"dataset_type":"TIGER-Lab/MMLU-Pro",
|
| 99 |
"dataset_name":"MMLU-PRO (5-shot)",
|
| 100 |
"metric_type":"acc",
|
| 101 |
-
"metric_value":results["MMLU-PRO"],
|
| 102 |
"dataset_config":"main",
|
| 103 |
"dataset_split":"test",
|
| 104 |
"dataset_args":{"num_few_shot": 5},
|
|
@@ -113,7 +113,7 @@ def get_eval_results(repo):
|
|
| 113 |
task_summary = get_task_summary(results)
|
| 114 |
md_writer = MarkdownTableWriter()
|
| 115 |
md_writer.headers = ["Metric", "Value"]
|
| 116 |
-
md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
|
| 117 |
|
| 118 |
text = f"""
|
| 119 |
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
|
|
|
|
| 40 |
{"dataset_type":"HuggingFaceH4/ifeval",
|
| 41 |
"dataset_name":"IFEval (0-Shot)",
|
| 42 |
"metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
|
| 43 |
+
"metric_value": round(results["IFEval"], 2),
|
| 44 |
"dataset_config": None, # don't know
|
| 45 |
"dataset_split": None, # don't know
|
| 46 |
"dataset_revision":None,
|
|
|
|
| 51 |
{"dataset_type":"BBH",
|
| 52 |
"dataset_name":"BBH (3-Shot)",
|
| 53 |
"metric_type":"acc_norm",
|
| 54 |
+
"metric_value": round(results["BBH"], 2),
|
| 55 |
"dataset_config": None, # don't know
|
| 56 |
"dataset_split": None, # don't know
|
| 57 |
"dataset_revision":None,
|
|
|
|
| 63 |
"dataset_type":"hendrycks/competition_math",
|
| 64 |
"dataset_name":"MATH Lvl 5 (4-Shot)",
|
| 65 |
"metric_type":"exact_match",
|
| 66 |
+
"metric_value": round(results["MATH Lvl 5"], 2),
|
| 67 |
"dataset_config": None, # don't know
|
| 68 |
"dataset_split": None, # don't know
|
| 69 |
"dataset_revision":None,
|
|
|
|
| 75 |
"dataset_type":"Idavidrein/gpqa",
|
| 76 |
"dataset_name":"GPQA (0-shot)",
|
| 77 |
"metric_type":"acc_norm",
|
| 78 |
+
"metric_value": round(results["GPQA"], 2),
|
| 79 |
"dataset_config": None, # don't know
|
| 80 |
"dataset_split": None, # don't know
|
| 81 |
"dataset_revision":None,
|
|
|
|
| 87 |
"dataset_type":"TAUR-Lab/MuSR",
|
| 88 |
"dataset_name":"MuSR (0-shot)",
|
| 89 |
"metric_type":"acc_norm",
|
| 90 |
+
"metric_value": round(results["MUSR"], 2),
|
| 91 |
"dataset_config": None, # don't know
|
| 92 |
"dataset_split": None, # don't know
|
| 93 |
"dataset_args":{"num_few_shot": 0},
|
|
|
|
| 98 |
"dataset_type":"TIGER-Lab/MMLU-Pro",
|
| 99 |
"dataset_name":"MMLU-PRO (5-shot)",
|
| 100 |
"metric_type":"acc",
|
| 101 |
+
"metric_value": round(results["MMLU-PRO"], 2),
|
| 102 |
"dataset_config":"main",
|
| 103 |
"dataset_split":"test",
|
| 104 |
"dataset_args":{"num_few_shot": 5},
|
|
|
|
| 113 |
task_summary = get_task_summary(results)
|
| 114 |
md_writer = MarkdownTableWriter()
|
| 115 |
md_writer.headers = ["Metric", "Value"]
|
| 116 |
+
md_writer.value_matrix = [["Avg.", round(results['Average ⬆️'], 2)]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
|
| 117 |
|
| 118 |
text = f"""
|
| 119 |
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
|