import pandas as pd
import streamlit as st
# Inject page-wide CSS overrides: larger <h1> headings, Helvetica for the
# `.stDataFrame` container, and a 500px minimum width for `.dataframe`
# header/body cells.
# `unsafe_allow_html=True` is needed so the raw <style> element is passed
# through instead of being escaped by st.markdown.
# (The inline CSS comment below reads "heading font size".)
st.markdown("""
<style>
h1 {
font-size: 2.5em; /* 标题字体大小 */
}
.stDataFrame {
font-family: Helvetica;
}
.dataframe th, .dataframe td {
width: auto;
min-width: 500px;
}
</style>
""", unsafe_allow_html=True)
# Page title shown at the top of the app.
st.title("🏆AEOLLM Leaderboard")
# Introductory blurb: what the leaderboard shows, the four tasks it covers,
# and a link to the project page.
# The text is kept at column 0 inside the literal so Markdown does not treat
# it as an indented code block.
# NOTE(review): this text contains no raw HTML, so `unsafe_allow_html=True`
# looks unnecessary here -- kept to preserve existing behavior; confirm.
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Summary Generation (SG)
- Non-Factoid QA (NFQA)
- Dialogue Generation (DG)
- Text Expansion (TE).

Details of AEOLLM can be found at the link: [https://cjj826.github.io/AEOLLM/](https://cjj826.github.io/AEOLLM/)
""", unsafe_allow_html=True)
# Summary Generation (SG) leaderboard table: one row per submitted method.
# NOTE(review): "acc", "tau" and "s" are presumably accuracy, Kendall's tau
# and Spearman correlation -- confirm against the task definition.
# NOTE(review): the same values appear in all four task tables in this file,
# which suggests placeholder data -- confirm before publishing.
SG = {
    "methods": ["Model A", "Model B", "Model C"],
    "team": ["U1", "U2", "U3"],
    "acc": [0.75, 0.64, 0.83],
    "tau": [0.05, 0.28, 0.16],
    "s": [0.12, 0.27, 0.18],
}
df1 = pd.DataFrame(SG)
# Non-Factoid QA (NFQA) leaderboard table: one row per submitted method.
# NOTE(review): "acc", "tau" and "s" are presumably accuracy, Kendall's tau
# and Spearman correlation -- confirm against the task definition.
# NOTE(review): the same values appear in all four task tables in this file,
# which suggests placeholder data -- confirm before publishing.
NFQA = {
    "methods": ["Model A", "Model B", "Model C"],
    "team": ["U1", "U2", "U3"],
    "acc": [0.75, 0.64, 0.83],
    "tau": [0.05, 0.28, 0.16],
    "s": [0.12, 0.27, 0.18],
}
df2 = pd.DataFrame(NFQA)
# Dialogue Generation (DG) leaderboard table: one row per submitted method.
# NOTE(review): "acc", "tau" and "s" are presumably accuracy, Kendall's tau
# and Spearman correlation -- confirm against the task definition.
# NOTE(review): the same values appear in all four task tables in this file,
# which suggests placeholder data -- confirm before publishing.
DG = {
    "methods": ["Model A", "Model B", "Model C"],
    "team": ["U1", "U2", "U3"],
    "acc": [0.75, 0.64, 0.83],
    "tau": [0.05, 0.28, 0.16],
    "s": [0.12, 0.27, 0.18],
}
df3 = pd.DataFrame(DG)
# Text Expansion (TE) leaderboard table: one row per submitted method.
# NOTE(review): "acc", "tau" and "s" are presumably accuracy, Kendall's tau
# and Spearman correlation -- confirm against the task definition.
# NOTE(review): the same values appear in all four task tables in this file,
# which suggests placeholder data -- confirm before publishing.
TE = {
    "methods": ["Model A", "Model B", "Model C"],
    "team": ["U1", "U2", "U3"],
    "acc": [0.75, 0.64, 0.83],
    "tau": [0.05, 0.28, 0.16],
    "s": [0.12, 0.27, 0.18],
}
df4 = pd.DataFrame(TE)
# One tab per task, labelled with the task abbreviation.
tab1, tab2, tab3, tab4 = st.tabs(["SG", "NFQA", "DG", "TE"])

# Each tab shows the task's full name as a header followed by its results
# table stretched to the container width. The four tabs differ only in the
# heading and the DataFrame, so they are rendered from a single loop.
for tab, heading, frame in (
    (tab1, "Summary Generation", df1),
    (tab2, "Non-Factoid QA", df2),
    (tab3, "Dialogue Generation", df3),
    (tab4, "Text Expansion", df4),
):
    with tab:
        st.header(heading)
        st.dataframe(frame, use_container_width=True)