陈俊杰
committed on
Commit
·
c3f31ee
1
Parent(s):
9a263a1
fontSize
Browse files
app.py
CHANGED
|
@@ -126,6 +126,7 @@ st.markdown("""
|
|
| 126 |
.main-text {
|
| 127 |
font-size: 18px;
|
| 128 |
line-height: 1.6;
|
|
|
|
| 129 |
}
|
| 130 |
</style>
|
| 131 |
""", unsafe_allow_html=True)
|
|
@@ -142,8 +143,8 @@ elif page == "Methodology":
|
|
| 142 |
st.image("asserts/method.svg", use_column_width=True)
|
| 143 |
st.markdown("""
|
| 144 |
<ol class='main-text'>
|
| 145 |
-
<li>First, we choose four subtasks as shown in the table below:</li>
|
| 146 |
-
<table>
|
| 147 |
<thead>
|
| 148 |
<tr>
|
| 149 |
<th style="text-align: left">Task</th>
|
|
@@ -174,9 +175,9 @@ elif page == "Methodology":
|
|
| 174 |
</tr>
|
| 175 |
</tbody>
|
| 176 |
</table>
|
| 177 |
-
<li>Second, we choose a series of popular LLMs during the competition to generate answers.</li>
|
| 178 |
-
<li>Third, we manually annotate the answer sets for each question, which will be used as gold standards for evaluating the performance of different evaluation methods.</li>
|
| 179 |
-
<li>Last, we will collect evaluation results from participants and calculate consistency with manually annotated results. We will use Accuracy, Kendall’s tau and Spearman correlation coefficient as the evaluation metrics.</li>
|
| 180 |
</ol>
|
| 181 |
""",unsafe_allow_html=True)
|
| 182 |
|
|
@@ -196,39 +197,31 @@ elif page == "Datasets":
|
|
| 196 |
elif page == "Important Dates":
|
| 197 |
st.header("Important Dates")
|
| 198 |
st.markdown("""
|
| 199 |
-
<p class='main-text'><em>All deadlines are at 11:59pm in the Anywhere on Earth (AOE) timezone.</em><br />
|
| 200 |
-
<span class=
|
| 201 |
-
<span class=
|
| 202 |
-
<span class=
|
| 203 |
-
<span class=
|
| 204 |
-
<span class=
|
| 205 |
-
<span class=
|
| 206 |
-
<span class=
|
| 207 |
-
<span class=
|
| 208 |
""",unsafe_allow_html=True)
|
| 209 |
elif page == "Evaluation Measures":
|
| 210 |
st.header("Evaluation Measures")
|
| 211 |
st.markdown("""
|
| 212 |
-
<
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
$$
|
| 217 |
-
\\tau=\\frac{C-D}{\\frac{1}{2}n(n-1)}
|
| 218 |
-
$$
|
| 219 |
-
|
| 220 |
-
where:
|
| 221 |
-
- C is the number of concordant pairs,
|
| 222 |
-
- D is the number of discordant pairs,
|
| 223 |
-
- n is the number of pairs.
|
| 224 |
-
- **Spearman's Rank Correlation Coefficient:** Measures the strength and direction of the association between two ranked variables.
|
| 225 |
-
$$
|
| 226 |
-
\\rho = 1 - \\frac{6 \sum d_i^2}{n(n^2 - 1)}
|
| 227 |
-
$$
|
| 228 |
where:
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
""",unsafe_allow_html=True)
|
| 233 |
elif page == "Data and File format":
|
| 234 |
st.header("Data and File format")
|
|
@@ -254,10 +247,12 @@ elif page == "LeaderBoard":
|
|
| 254 |
st.markdown("""
|
| 255 |
<div class='main-text'>
|
| 256 |
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
|
| 257 |
-
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
|
|
|
|
|
|
| 261 |
</div>
|
| 262 |
""", unsafe_allow_html=True)
|
| 263 |
# 创建示例数据
|
|
@@ -309,19 +304,19 @@ This leaderboard is used to show the performance of the **automatic evaluation m
|
|
| 309 |
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
| 310 |
|
| 311 |
with tab1:
|
| 312 |
-
st.markdown("""Task: Dialogue Generation; Dataset: DailyDialog""", unsafe_allow_html=True)
|
| 313 |
st.dataframe(df1, use_container_width=True)
|
| 314 |
|
| 315 |
with tab2:
|
| 316 |
-
st.markdown("""Task: Text Expansion; Dataset: WritingPrompts""", unsafe_allow_html=True)
|
| 317 |
st.dataframe(df2, use_container_width=True)
|
| 318 |
|
| 319 |
with tab3:
|
| 320 |
-
st.markdown("""Task: Summary Generation; Dataset: Xsum""", unsafe_allow_html=True)
|
| 321 |
st.dataframe(df3, use_container_width=True)
|
| 322 |
|
| 323 |
with tab4:
|
| 324 |
-
st.markdown("""Task: Non-Factoid QA; Dataset: NF_CATS""", unsafe_allow_html=True)
|
| 325 |
st.dataframe(df4, use_container_width=True)
|
| 326 |
elif page == "Organisers":
|
| 327 |
st.header("Organisers")
|
|
|
|
| 126 |
.main-text {
|
| 127 |
font-size: 18px;
|
| 128 |
line-height: 1.6;
|
| 129 |
+
color: #4CAF50;
|
| 130 |
}
|
| 131 |
</style>
|
| 132 |
""", unsafe_allow_html=True)
|
|
|
|
| 143 |
st.image("asserts/method.svg", use_column_width=True)
|
| 144 |
st.markdown("""
|
| 145 |
<ol class='main-text'>
|
| 146 |
+
<li class='main-text'>First, we choose four subtasks as shown in the table below:</li>
|
| 147 |
+
<table class='main-text'>
|
| 148 |
<thead>
|
| 149 |
<tr>
|
| 150 |
<th style="text-align: left">Task</th>
|
|
|
|
| 175 |
</tr>
|
| 176 |
</tbody>
|
| 177 |
</table>
|
| 178 |
+
<li class='main-text'>Second, we choose a series of popular LLMs during the competition to generate answers.</li>
|
| 179 |
+
<li class='main-text'>Third, we manually annotate the answer sets for each question, which will be used as gold standards for evaluating the performance of different evaluation methods.</li>
|
| 180 |
+
<li class='main-text'>Last, we will collect evaluation results from participants and calculate consistency with manually annotated results. We will use Accuracy, Kendall’s tau and Spearman correlation coefficient as the evaluation metrics.</li>
|
| 181 |
</ol>
|
| 182 |
""",unsafe_allow_html=True)
|
| 183 |
|
|
|
|
| 197 |
elif page == "Important Dates":
|
| 198 |
st.header("Important Dates")
|
| 199 |
st.markdown("""
|
| 200 |
+
<p class='main-text'><em class='main-text'>All deadlines are at 11:59pm in the Anywhere on Earth (AOE) timezone.</em><br />
|
| 201 |
+
<span class='main-text'><strong>Kickoff Event</strong>:</span> <span class='main-text'>March 29, 2024</span><br />
|
| 202 |
+
<span class='main-text'><strong>Dataset Release</strong>:</span> <span class='main-text'>👉May 1, 2024</span><br />
|
| 203 |
+
<span class='main-text'><strong>System Output Submission Deadline</strong>:</span> <span class='main-text'>Jan 15, 2025</span><br />
|
| 204 |
+
<span class='main-text'><strong>Evaluation Results Release</strong>:</span> <span class='main-text'>Feb 1, 2025</span> <br />
|
| 205 |
+
<span class='main-text'><strong>Task overview release (draft)</strong>:</span> <span class='main-text'>Feb 1, 2025</span><br />
|
| 206 |
+
<span class='main-text'><strong>Submission Due of Participant Papers (draft)</strong>:</span> <span class='main-text'>March 1, 2025</span><br />
|
| 207 |
+
<span class='main-text'><strong>Camera-Ready Participant Paper Due</strong>:</span> <span class='main-text'>May 1, 2025</span><br />
|
| 208 |
+
<span class='main-text'><strong>NTCIR-18 Conference</strong>:</span> <span class='main-text'>Jun 10-13 2025</span><br /></p>
|
| 209 |
""",unsafe_allow_html=True)
|
| 210 |
elif page == "Evaluation Measures":
|
| 211 |
st.header("Evaluation Measures")
|
| 212 |
st.markdown("""
|
| 213 |
+
<ul class='main-text'>
|
| 214 |
+
<li><strong>Acc(Accuracy): </strong>The proportion of identical preference results between the model and human annotations. Specifically, we first convert individual scores (ranks) into pairwise preferences and then calculate consistency with human annotations.</li>
|
| 215 |
+
<li><strong>Kendall's tau: </strong>Measures the ordinal association between two ranked variables. $$\\tau = \\frac{C-D}{\\frac{1}{2}n(n-1)}$$
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
where:
|
| 217 |
+
C is the number of concordant pairs,
|
| 218 |
+
D is the number of discordant pairs,
|
| 219 |
+
n is the number of pairs.</li>
|
| 220 |
+
<li><strong>Spearman's Rank Correlation Coefficient: </strong>Measures the strength and direction of the association between two ranked variables. $$\\rho = 1 - \\frac{6 \\sum d_i^2}{n(n^2 - 1)}$$
|
| 221 |
+
where:
|
| 222 |
+
\(d_i\) is the difference between the ranks of corresponding elements in the two lists,
|
| 223 |
+
n is the number of elements.</li>
|
| 224 |
+
</ul>
|
| 225 |
""",unsafe_allow_html=True)
|
| 226 |
elif page == "Data and File format":
|
| 227 |
st.header("Data and File format")
|
|
|
|
| 247 |
st.markdown("""
|
| 248 |
<div class='main-text'>
|
| 249 |
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
|
| 250 |
+
<ul class='main-text'>
|
| 251 |
+
<li>Dialogue Generation (DG)</li>
|
| 252 |
+
<li>Text Expansion (TE)</li>
|
| 253 |
+
<li>Summary Generation (SG)</li>
|
| 254 |
+
<li>Non-Factoid QA (NFQA)</li>
|
| 255 |
+
</ul>
|
| 256 |
</div>
|
| 257 |
""", unsafe_allow_html=True)
|
| 258 |
# 创建示例数据
|
|
|
|
| 304 |
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
|
| 305 |
|
| 306 |
with tab1:
|
| 307 |
+
st.markdown("""<div class='main-text'>Task: Dialogue Generation; Dataset: DailyDialog</div>""", unsafe_allow_html=True)
|
| 308 |
st.dataframe(df1, use_container_width=True)
|
| 309 |
|
| 310 |
with tab2:
|
| 311 |
+
st.markdown("""<div class='main-text'>Task: Text Expansion; Dataset: WritingPrompts</div>""", unsafe_allow_html=True)
|
| 312 |
st.dataframe(df2, use_container_width=True)
|
| 313 |
|
| 314 |
with tab3:
|
| 315 |
+
st.markdown("""<div class='main-text'>Task: Summary Generation; Dataset: Xsum</div>""", unsafe_allow_html=True)
|
| 316 |
st.dataframe(df3, use_container_width=True)
|
| 317 |
|
| 318 |
with tab4:
|
| 319 |
+
st.markdown("""<div class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</div>""", unsafe_allow_html=True)
|
| 320 |
st.dataframe(df4, use_container_width=True)
|
| 321 |
elif page == "Organisers":
|
| 322 |
st.header("Organisers")
|