Spaces:
Running
Running
update leaderboard
Browse files- all_results.json +0 -0
- app.py +32 -29
all_results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
CHANGED
|
@@ -2031,89 +2031,92 @@ with block:
|
|
| 2031 |
with gr.TabItem("Cross-Lingual Consistency"):
|
| 2032 |
|
| 2033 |
# dataset 1: cross-mmlu
|
| 2034 |
-
|
|
|
|
|
|
|
|
|
|
| 2035 |
with gr.TabItem("Zero Shot"):
|
| 2036 |
with gr.TabItem("Overall"):
|
| 2037 |
with gr.Row():
|
| 2038 |
-
|
| 2039 |
-
|
| 2040 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 2041 |
type="pandas",
|
| 2042 |
)
|
| 2043 |
with gr.TabItem("Language Performance"):
|
| 2044 |
|
| 2045 |
with gr.Row():
|
| 2046 |
-
|
| 2047 |
-
|
| 2048 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 2049 |
type="pandas",
|
| 2050 |
)
|
| 2051 |
with gr.TabItem("Five Shot"):
|
| 2052 |
with gr.TabItem("Overall"):
|
| 2053 |
|
| 2054 |
with gr.Row():
|
| 2055 |
-
|
| 2056 |
-
|
| 2057 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 2058 |
type="pandas",
|
| 2059 |
)
|
| 2060 |
with gr.TabItem("Language Performance"):
|
| 2061 |
|
| 2062 |
with gr.Row():
|
| 2063 |
gr.components.Dataframe(
|
| 2064 |
-
|
| 2065 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 2066 |
type="pandas",
|
| 2067 |
)
|
| 2068 |
|
| 2069 |
with gr.Row():
|
| 2070 |
gr.Markdown("""
|
| 2071 |
-
**Cross-
|
| 2072 |
- **Metric:** Cross-Lingual Consistency, Accuracy, AC3
|
| 2073 |
-
- **Languages:** English, Chinese, Spanish, Vietnamese
|
| 2074 |
""")
|
| 2075 |
|
| 2076 |
-
|
| 2077 |
-
with gr.TabItem("Cross-
|
| 2078 |
with gr.TabItem("Zero Shot"):
|
| 2079 |
with gr.TabItem("Overall"):
|
| 2080 |
with gr.Row():
|
| 2081 |
-
|
| 2082 |
-
|
| 2083 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 2084 |
type="pandas",
|
| 2085 |
)
|
| 2086 |
with gr.TabItem("Language Performance"):
|
| 2087 |
|
| 2088 |
with gr.Row():
|
| 2089 |
-
|
| 2090 |
-
|
| 2091 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 2092 |
type="pandas",
|
| 2093 |
)
|
| 2094 |
with gr.TabItem("Five Shot"):
|
| 2095 |
with gr.TabItem("Overall"):
|
| 2096 |
|
| 2097 |
with gr.Row():
|
| 2098 |
-
|
| 2099 |
-
|
| 2100 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 2101 |
type="pandas",
|
| 2102 |
)
|
| 2103 |
with gr.TabItem("Language Performance"):
|
| 2104 |
|
| 2105 |
with gr.Row():
|
| 2106 |
gr.components.Dataframe(
|
| 2107 |
-
|
| 2108 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 2109 |
type="pandas",
|
| 2110 |
)
|
| 2111 |
|
| 2112 |
with gr.Row():
|
| 2113 |
gr.Markdown("""
|
| 2114 |
-
**Cross-
|
| 2115 |
- **Metric:** Cross-Lingual Consistency, Accuracy, AC3
|
| 2116 |
-
- **Languages:** English, Chinese,
|
| 2117 |
""")
|
| 2118 |
|
| 2119 |
|
|
|
|
| 2031 |
with gr.TabItem("Cross-Lingual Consistency"):
|
| 2032 |
|
| 2033 |
# dataset 1: cross-mmlu
|
| 2034 |
+
|
| 2035 |
+
|
| 2036 |
+
# dataset 1: cross-mmlu
|
| 2037 |
+
with gr.TabItem("Cross-MMLU"):
|
| 2038 |
with gr.TabItem("Zero Shot"):
|
| 2039 |
with gr.TabItem("Overall"):
|
| 2040 |
with gr.Row():
|
| 2041 |
+
cross_mmlu_zero_shot_overall = gr.components.Dataframe(
|
| 2042 |
+
CROSS_MMLU_ZERO_SHOT_OVERALL,
|
| 2043 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
|
| 2044 |
type="pandas",
|
| 2045 |
)
|
| 2046 |
with gr.TabItem("Language Performance"):
|
| 2047 |
|
| 2048 |
with gr.Row():
|
| 2049 |
+
cross_mmlu_zero_shot_overall = gr.components.Dataframe(
|
| 2050 |
+
CROSS_MMLU_ZERO_SHOT_LANGUAGE,
|
| 2051 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
|
| 2052 |
type="pandas",
|
| 2053 |
)
|
| 2054 |
with gr.TabItem("Five Shot"):
|
| 2055 |
with gr.TabItem("Overall"):
|
| 2056 |
|
| 2057 |
with gr.Row():
|
| 2058 |
+
cross_mmlu_zero_shot_overall = gr.components.Dataframe(
|
| 2059 |
+
CROSS_MMLU_FIVE_SHOT_OVERALL,
|
| 2060 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
|
| 2061 |
type="pandas",
|
| 2062 |
)
|
| 2063 |
with gr.TabItem("Language Performance"):
|
| 2064 |
|
| 2065 |
with gr.Row():
|
| 2066 |
gr.components.Dataframe(
|
| 2067 |
+
CROSS_MMLU_FIVE_SHOT_LANGUAGE,
|
| 2068 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
|
| 2069 |
type="pandas",
|
| 2070 |
)
|
| 2071 |
|
| 2072 |
with gr.Row():
|
| 2073 |
gr.Markdown("""
|
| 2074 |
+
**Cross-MMLU Leaderboard** 🔮
|
| 2075 |
- **Metric:** Cross-Lingual Consistency, Accuracy, AC3
|
| 2076 |
+
- **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
|
| 2077 |
""")
|
| 2078 |
|
| 2079 |
+
|
| 2080 |
+
with gr.TabItem("Cross-XQUAD"):
|
| 2081 |
with gr.TabItem("Zero Shot"):
|
| 2082 |
with gr.TabItem("Overall"):
|
| 2083 |
with gr.Row():
|
| 2084 |
+
cross_xquad_zero_shot_overall = gr.components.Dataframe(
|
| 2085 |
+
CROSS_XQUAD_ZERO_SHOT_OVERALL,
|
| 2086 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_OVERALL.columns),
|
| 2087 |
type="pandas",
|
| 2088 |
)
|
| 2089 |
with gr.TabItem("Language Performance"):
|
| 2090 |
|
| 2091 |
with gr.Row():
|
| 2092 |
+
cross_xquad_zero_shot_overall = gr.components.Dataframe(
|
| 2093 |
+
CROSS_XQUAD_ZERO_SHOT_LANGUAGE,
|
| 2094 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_LANGUAGE.columns),
|
| 2095 |
type="pandas",
|
| 2096 |
)
|
| 2097 |
with gr.TabItem("Five Shot"):
|
| 2098 |
with gr.TabItem("Overall"):
|
| 2099 |
|
| 2100 |
with gr.Row():
|
| 2101 |
+
cross_xquad_zero_shot_overall = gr.components.Dataframe(
|
| 2102 |
+
CROSS_XQUAD_FIVE_SHOT_OVERALL,
|
| 2103 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_OVERALL.columns),
|
| 2104 |
type="pandas",
|
| 2105 |
)
|
| 2106 |
with gr.TabItem("Language Performance"):
|
| 2107 |
|
| 2108 |
with gr.Row():
|
| 2109 |
gr.components.Dataframe(
|
| 2110 |
+
CROSS_XQUAD_FIVE_SHOT_LANGUAGE,
|
| 2111 |
+
datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_LANGUAGE.columns),
|
| 2112 |
type="pandas",
|
| 2113 |
)
|
| 2114 |
|
| 2115 |
with gr.Row():
|
| 2116 |
gr.Markdown("""
|
| 2117 |
+
**Cross-XQUAD Leaderboard** 🔮
|
| 2118 |
- **Metric:** Cross-Lingual Consistency, Accuracy, AC3
|
| 2119 |
+
- **Languages:** English, Chinese, Spanish, Vietnamese
|
| 2120 |
""")
|
| 2121 |
|
| 2122 |
|