Spaces:
Sleeping
Sleeping
Commit ·
2d3d046
1
Parent(s): 7172210
update
Browse files
app.py
CHANGED
|
@@ -62,7 +62,7 @@ def get_license(model): # 대소문자 무시하고 모델을 매칭하기 위
|
|
| 62 |
|
| 63 |
# dataframe_full
|
| 64 |
df_full_rs = df_rs.copy()
|
| 65 |
-
df_full_rs.rename(columns={'score': '
|
| 66 |
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
|
| 67 |
|
| 68 |
df_full_rs = df_full_rs.drop(columns=['turn']) # 모델별 turn1,2 score 합병
|
|
@@ -70,16 +70,16 @@ df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean
|
|
| 70 |
df_full_rs = df_full_rs.round(2)
|
| 71 |
df_full_rs.replace("", np.nan, inplace=True)
|
| 72 |
|
| 73 |
-
df_full_rs['
|
| 74 |
-
df_full_rs['
|
| 75 |
for idx, j_model in df_full_rs['judge_model'].items():
|
| 76 |
if j_model == 'keval':
|
| 77 |
-
df_full_rs.at[idx, '
|
| 78 |
else :
|
| 79 |
-
df_full_rs.at[idx, '
|
| 80 |
df_full_rs = df_full_rs.drop(columns=['judge_model'])
|
| 81 |
|
| 82 |
-
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() #
|
| 83 |
df_full_rs = df_full_rs.round(2)
|
| 84 |
df_full_rs.replace("", np.nan, inplace=True)
|
| 85 |
|
|
@@ -93,9 +93,9 @@ df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
|
|
| 93 |
df_full_rs['License'] = '' # License 열 추가
|
| 94 |
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
|
| 95 |
|
| 96 |
-
df_full_rs = df_full_rs.sort_values(by='
|
| 97 |
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
|
| 98 |
-
df_full_rs = df_full_rs.drop(columns=['
|
| 99 |
|
| 100 |
plot_models = df_full_rs['model'].unique() # model detail view를 위한 models 리스트
|
| 101 |
|
|
@@ -230,18 +230,18 @@ def search_keval_plot(dropdown_model): # keval plot 함수 정의
|
|
| 230 |
#gradio
|
| 231 |
with gr.Blocks() as demo:
|
| 232 |
gr.Markdown("")
|
| 233 |
-
gr.Markdown("# 🏆
|
| 234 |
gr.Markdown("")
|
| 235 |
-
gr.Markdown("#### The Ko-
|
| 236 |
gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
|
| 237 |
-
gr.Markdown("-
|
| 238 |
-
gr.Markdown("-
|
| 239 |
gr.Markdown("")
|
| 240 |
-
gr.Markdown("github : https://github.com/davidkim205/
|
| 241 |
gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
|
| 242 |
gr.Markdown("")
|
| 243 |
|
| 244 |
-
with gr.TabItem("
|
| 245 |
gr.Dataframe(value=df_full_rs)
|
| 246 |
with gr.TabItem("Openai Judgment"):
|
| 247 |
gr.Dataframe(value=df_openai)
|
|
|
|
| 62 |
|
| 63 |
# dataframe_full
|
| 64 |
df_full_rs = df_rs.copy()
|
| 65 |
+
df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
|
| 66 |
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
|
| 67 |
|
| 68 |
df_full_rs = df_full_rs.drop(columns=['turn']) # 모델별 turn1,2 score 합병
|
|
|
|
| 70 |
df_full_rs = df_full_rs.round(2)
|
| 71 |
df_full_rs.replace("", np.nan, inplace=True)
|
| 72 |
|
| 73 |
+
df_full_rs['Ko-Bench/openai'] = '' # Ko-Bench/openai, Ko-Bench/keval 열 추가
|
| 74 |
+
df_full_rs['Ko-Bench/keval'] = ''
|
| 75 |
for idx, j_model in df_full_rs['judge_model'].items():
|
| 76 |
if j_model == 'keval':
|
| 77 |
+
df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
|
| 78 |
else :
|
| 79 |
+
df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
|
| 80 |
df_full_rs = df_full_rs.drop(columns=['judge_model'])
|
| 81 |
|
| 82 |
+
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # Ko-Bench/openai, Ko-Bench/keval 행 합병
|
| 83 |
df_full_rs = df_full_rs.round(2)
|
| 84 |
df_full_rs.replace("", np.nan, inplace=True)
|
| 85 |
|
|
|
|
| 93 |
df_full_rs['License'] = '' # License 열 추가
|
| 94 |
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
|
| 95 |
|
| 96 |
+
df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
|
| 97 |
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
|
| 98 |
+
df_full_rs = df_full_rs.drop(columns=['Ko-Bench'])
|
| 99 |
|
| 100 |
plot_models = df_full_rs['model'].unique() # model detail view를 위한 models 리스트
|
| 101 |
|
|
|
|
| 230 |
#gradio
|
| 231 |
with gr.Blocks() as demo:
|
| 232 |
gr.Markdown("")
|
| 233 |
+
gr.Markdown("# 🏆 Ko-Bench Leaderboard")
|
| 234 |
gr.Markdown("")
|
| 235 |
+
gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
|
| 236 |
gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
|
| 237 |
+
gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
|
| 238 |
+
gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
|
| 239 |
gr.Markdown("")
|
| 240 |
+
gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
|
| 241 |
gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
|
| 242 |
gr.Markdown("")
|
| 243 |
|
| 244 |
+
with gr.TabItem("Ko-Bench"):
|
| 245 |
gr.Dataframe(value=df_full_rs)
|
| 246 |
with gr.TabItem("Openai Judgment"):
|
| 247 |
gr.Dataframe(value=df_openai)
|