Spaces:

MERaLiON
/

SeaEval_Leaderboard

Running

App Files Community

binwang committed on Dec 21, 2023

Commit

ee26773

1 Parent(s): 8024fdd

datasets

Browse files

Files changed (1) hide show

app.py +618 -1

app.py CHANGED Viewed

@@ -816,6 +816,392 @@ def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
 # Malay (zsm) -> English tables must come from the zsm2eng loader, not the Chinese (zho) one.
 FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zsm2eng(eval_mode="zero_shot")
 FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zsm2eng(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
@@ -1179,7 +1565,7 @@ with block:
-        # dataset 10:
         with gr.TabItem("FLORES Malay to English Translation"):
             with gr.Row():
                 gr.Markdown("""
@@ -1206,6 +1592,237 @@ with block:
                             datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
                             type="pandas",
                         )
     gr.Markdown(r"""

 # Malay (zsm) -> English tables must come from the zsm2eng loader, not the Chinese (zho) one.
 FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zsm2eng(eval_mode="zero_shot")
 FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zsm2eng(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
+    """Build the MMLU leaderboard table for one evaluation mode.
+
+    For every model in MODEL_LIST, the entries stored under
+    ALL_RESULTS[model][eval_mode]['mmlu'] are reduced to the median of their
+    'accuracy' values. If that reduction fails, the entries are printed and
+    the accuracy falls back to -1.
+
+    Args:
+        eval_mode: Key selecting the evaluation setting inside
+            ALL_RESULTS[model]; this file calls it with "zero_shot" and
+            "five_shot".
+        fillna: When true, NaN cells are replaced with "" before returning.
+        rank: When true, the frame is passed through
+            add_rank(df, compute_average=True) (defined elsewhere in this
+            file; presumably adds rank/average columns - confirm there).
+
+    Returns:
+        A pandas DataFrame with one row per distinct "Model" value.
+    """
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = list(ALL_RESULTS[model][eval_mode]['mmlu'].values())
+        try:
+            accuracy = median([results['accuracy'] for results in results_list])
+        except Exception:
+            # Best effort: keep building the table when a model has no usable
+            # scores. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
+            # still propagate.
+            print(results_list)
+            accuracy = -1
+        df_list.append({
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "Accuracy": accuracy,
+        })
+    df = pd.DataFrame(df_list)
+    # Collapse rows that share the same "Model" cell, keeping the first
+    # non-null value of every other column.
+    df = df.groupby("Model", as_index=False).first()
+    # Move "Model" to the front; the other columns keep their current order.
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df = df.fillna("")
+    return df
+MMLU_ZERO_SHOT = get_data_mmlu(eval_mode="zero_shot")
+MMLU_FIVE_SHOT = get_data_mmlu(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
+    """Build the MMLU Full leaderboard table for one evaluation mode.
+
+    For every model in MODEL_LIST, the entries stored under
+    ALL_RESULTS[model][eval_mode]['mmlu_full'] are reduced to the median of
+    their 'accuracy' values. If that reduction fails, the entries are printed
+    and the accuracy falls back to -1.
+
+    Args:
+        eval_mode: Key selecting the evaluation setting inside
+            ALL_RESULTS[model]; this file calls it with "zero_shot" and
+            "five_shot".
+        fillna: When true, NaN cells are replaced with "" before returning.
+        rank: When true, the frame is passed through
+            add_rank(df, compute_average=True) (defined elsewhere in this
+            file; presumably adds rank/average columns - confirm there).
+
+    Returns:
+        A pandas DataFrame with one row per distinct "Model" value.
+    """
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = list(ALL_RESULTS[model][eval_mode]['mmlu_full'].values())
+        try:
+            accuracy = median([results['accuracy'] for results in results_list])
+        except Exception:
+            # Best effort: keep building the table when a model has no usable
+            # scores. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
+            # still propagate.
+            print(results_list)
+            accuracy = -1
+        df_list.append({
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "Accuracy": accuracy,
+        })
+    df = pd.DataFrame(df_list)
+    # Collapse rows that share the same "Model" cell, keeping the first
+    # non-null value of every other column.
+    df = df.groupby("Model", as_index=False).first()
+    # Move "Model" to the front; the other columns keep their current order.
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df = df.fillna("")
+    return df
+MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
+MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
+    """Build the C_EVAL leaderboard table for one evaluation mode.
+
+    For every model in MODEL_LIST, the entries stored under
+    ALL_RESULTS[model][eval_mode]['c_eval'] are reduced to the median of
+    their 'accuracy' values. If that reduction fails, the entries are printed
+    and the accuracy falls back to -1.
+
+    Args:
+        eval_mode: Key selecting the evaluation setting inside
+            ALL_RESULTS[model]; this file calls it with "zero_shot" and
+            "five_shot".
+        fillna: When true, NaN cells are replaced with "" before returning.
+        rank: When true, the frame is passed through
+            add_rank(df, compute_average=True) (defined elsewhere in this
+            file; presumably adds rank/average columns - confirm there).
+
+    Returns:
+        A pandas DataFrame with one row per distinct "Model" value.
+    """
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = list(ALL_RESULTS[model][eval_mode]['c_eval'].values())
+        try:
+            accuracy = median([results['accuracy'] for results in results_list])
+        except Exception:
+            # Best effort: keep building the table when a model has no usable
+            # scores. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
+            # still propagate.
+            print(results_list)
+            accuracy = -1
+        df_list.append({
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "Accuracy": accuracy,
+        })
+    df = pd.DataFrame(df_list)
+    # Collapse rows that share the same "Model" cell, keeping the first
+    # non-null value of every other column.
+    df = df.groupby("Model", as_index=False).first()
+    # Move "Model" to the front; the other columns keep their current order.
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df = df.fillna("")
+    return df
+C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
+C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
+    """Build the C_EVAL Full leaderboard table for one evaluation mode.
+
+    For every model in MODEL_LIST, the entries stored under
+    ALL_RESULTS[model][eval_mode]['c_eval_full'] are reduced to the median of
+    their 'accuracy' values. If that reduction fails, the entries are printed
+    and the accuracy falls back to -1.
+
+    Args:
+        eval_mode: Key selecting the evaluation setting inside
+            ALL_RESULTS[model]; this file calls it with "zero_shot" and
+            "five_shot".
+        fillna: When true, NaN cells are replaced with "" before returning.
+        rank: When true, the frame is passed through
+            add_rank(df, compute_average=True) (defined elsewhere in this
+            file; presumably adds rank/average columns - confirm there).
+
+    Returns:
+        A pandas DataFrame with one row per distinct "Model" value.
+    """
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = list(ALL_RESULTS[model][eval_mode]['c_eval_full'].values())
+        try:
+            accuracy = median([results['accuracy'] for results in results_list])
+        except Exception:
+            # Best effort: keep building the table when a model has no usable
+            # scores. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
+            # still propagate.
+            print(results_list)
+            accuracy = -1
+        df_list.append({
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "Accuracy": accuracy,
+        })
+    df = pd.DataFrame(df_list)
+    # Collapse rows that share the same "Model" cell, keeping the first
+    # non-null value of every other column.
+    df = df.groupby("Model", as_index=False).first()
+    # Move "Model" to the front; the other columns keep their current order.
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df = df.fillna("")
+    return df
+C_EVAL_FULL_ZERO_SHOT = get_data_c_eval_full(eval_mode="zero_shot")
+C_EVAL_FULL_FIVE_SHOT = get_data_c_eval_full(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
+    """Build the CMMLU leaderboard table for one evaluation mode.
+
+    For every model in MODEL_LIST, the entries stored under
+    ALL_RESULTS[model][eval_mode]['cmmlu'] are reduced to the median of their
+    'accuracy' values. If that reduction fails, the entries are printed and
+    the accuracy falls back to -1.
+
+    Args:
+        eval_mode: Key selecting the evaluation setting inside
+            ALL_RESULTS[model]; this file calls it with "zero_shot" and
+            "five_shot".
+        fillna: When true, NaN cells are replaced with "" before returning.
+        rank: When true, the frame is passed through
+            add_rank(df, compute_average=True) (defined elsewhere in this
+            file; presumably adds rank/average columns - confirm there).
+
+    Returns:
+        A pandas DataFrame with one row per distinct "Model" value.
+    """
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = list(ALL_RESULTS[model][eval_mode]['cmmlu'].values())
+        try:
+            accuracy = median([results['accuracy'] for results in results_list])
+        except Exception:
+            # Best effort: keep building the table when a model has no usable
+            # scores. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
+            # still propagate.
+            print(results_list)
+            accuracy = -1
+        df_list.append({
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "Accuracy": accuracy,
+        })
+    df = pd.DataFrame(df_list)
+    # Collapse rows that share the same "Model" cell, keeping the first
+    # non-null value of every other column.
+    df = df.groupby("Model", as_index=False).first()
+    # Move "Model" to the front; the other columns keep their current order.
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df = df.fillna("")
+    return df
+CMMLU_ZERO_SHOT = get_data_cmmlu(eval_mode="zero_shot")
+CMMLU_FIVE_SHOT = get_data_cmmlu(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
+    """Build the CMMLU Full leaderboard table for one evaluation mode.
+
+    For every model in MODEL_LIST, the entries stored under
+    ALL_RESULTS[model][eval_mode]['cmmlu_full'] are reduced to the median of
+    their 'accuracy' values. If that reduction fails, the entries are printed
+    and the accuracy falls back to -1.
+
+    Args:
+        eval_mode: Key selecting the evaluation setting inside
+            ALL_RESULTS[model]; this file calls it with "zero_shot" and
+            "five_shot".
+        fillna: When true, NaN cells are replaced with "" before returning.
+        rank: When true, the frame is passed through
+            add_rank(df, compute_average=True) (defined elsewhere in this
+            file; presumably adds rank/average columns - confirm there).
+
+    Returns:
+        A pandas DataFrame with one row per distinct "Model" value.
+    """
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = list(ALL_RESULTS[model][eval_mode]['cmmlu_full'].values())
+        try:
+            accuracy = median([results['accuracy'] for results in results_list])
+        except Exception:
+            # Best effort: keep building the table when a model has no usable
+            # scores. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
+            # still propagate.
+            print(results_list)
+            accuracy = -1
+        df_list.append({
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "Accuracy": accuracy,
+        })
+    df = pd.DataFrame(df_list)
+    # Collapse rows that share the same "Model" cell, keeping the first
+    # non-null value of every other column.
+    df = df.groupby("Model", as_index=False).first()
+    # Move "Model" to the front; the other columns keep their current order.
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df = df.fillna("")
+    return df
+CMMLU_FULL_ZERO_SHOT = get_data_cmmlu_full(eval_mode="zero_shot")
+CMMLU_FULL_FIVE_SHOT = get_data_cmmlu_full(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
+    """Build the ZBench leaderboard table for one evaluation mode.
+
+    For every model in MODEL_LIST, the entries stored under
+    ALL_RESULTS[model][eval_mode]['zbench'] are reduced to the median of
+    their 'accuracy' values. If that reduction fails, the entries are printed
+    and the accuracy falls back to -1.
+
+    Args:
+        eval_mode: Key selecting the evaluation setting inside
+            ALL_RESULTS[model]; this file calls it with "zero_shot" and
+            "five_shot".
+        fillna: When true, NaN cells are replaced with "" before returning.
+        rank: When true, the frame is passed through
+            add_rank(df, compute_average=True) (defined elsewhere in this
+            file; presumably adds rank/average columns - confirm there).
+
+    Returns:
+        A pandas DataFrame with one row per distinct "Model" value.
+    """
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = list(ALL_RESULTS[model][eval_mode]['zbench'].values())
+        try:
+            accuracy = median([results['accuracy'] for results in results_list])
+        except Exception:
+            # Best effort: keep building the table when a model has no usable
+            # scores. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
+            # still propagate.
+            print(results_list)
+            accuracy = -1
+        df_list.append({
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "Accuracy": accuracy,
+        })
+    df = pd.DataFrame(df_list)
+    # Collapse rows that share the same "Model" cell, keeping the first
+    # non-null value of every other column.
+    df = df.groupby("Model", as_index=False).first()
+    # Move "Model" to the front; the other columns keep their current order.
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df = df.fillna("")
+    return df
+ZBENCH_ZERO_SHOT = get_data_zbench(eval_mode="zero_shot")
+ZBENCH_FIVE_SHOT = get_data_zbench(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+        # dataset 11:
         with gr.TabItem("FLORES Malay to English Translation"):
             with gr.Row():
                 gr.Markdown("""
                             datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
                             type="pandas",
                         )
+        # dataset 12:
+        with gr.TabItem("MMLU"):
+            with gr.Row():
+                gr.Markdown("""
+                **MMLU Leaderboard** 🔮
+                - **Metric:** Accuracy.
+                - **Languages:** English
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            MMLU_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(MMLU_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            MMLU_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(MMLU_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 13:
+        with gr.TabItem("MMLU Full"):
+            with gr.Row():
+                gr.Markdown("""
+                **MMLU Full Leaderboard** 🔮
+                - **Metric:** Accuracy.
+                - **Languages:** English
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            MMLU_FULL_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            MMLU_FULL_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 14:
+        with gr.TabItem("C_EVAL"):
+            with gr.Row():
+                gr.Markdown("""
+                **C_EVAL Leaderboard** 🔮
+                - **Metric:** Accuracy.
+                - **Languages:** Chinese
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            C_EVAL_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(C_EVAL_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            C_EVAL_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 15:
+        with gr.TabItem("C_EVAL Full"):
+            with gr.Row():
+                gr.Markdown("""
+                **C_EVAL Full Leaderboard** 🔮
+                - **Metric:** Accuracy.
+                - **Languages:** Chinese
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            C_EVAL_FULL_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            C_EVAL_FULL_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 16:
+        with gr.TabItem("CMMLU"):
+            with gr.Row():
+                gr.Markdown("""
+                **CMMLU Leaderboard** 🔮
+                - **Metric:** Accuracy.
+                - **Languages:** Chinese
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            CMMLU_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(CMMLU_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            CMMLU_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(CMMLU_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 17:
+        with gr.TabItem("CMMLU Full"):
+            with gr.Row():
+                gr.Markdown("""
+                **CMMLU Full Leaderboard** 🔮
+                - **Metric:** Accuracy.
+                - **Languages:** Chinese
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            CMMLU_FULL_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            CMMLU_FULL_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 18:
+        with gr.TabItem("ZBench"):
+            with gr.Row():
+                gr.Markdown("""
+                **ZBench Leaderboard** 🔮
+                - **Metric:** Accuracy.
+                - **Languages:** Chinese
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            ZBENCH_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(ZBENCH_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            ZBENCH_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(ZBENCH_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
     gr.Markdown(r"""