Spaces:
Runtime error
Runtime error
j_yoon.song committed on
Commit ·
0865d34
1
Parent(s): abb7c49
init
Browse files- app.py +118 -33
- src/about.py +2 -2
- src/config.py +39 -0
- src/data/export_category_250618.csv +1 -33
- src/data_utils.py +0 -0
app.py
CHANGED
|
@@ -3,6 +3,7 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
|
| 3 |
import pandas as pd
|
| 4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 5 |
from huggingface_hub import snapshot_download
|
|
|
|
| 6 |
|
| 7 |
from src.about import (
|
| 8 |
CITATION_BUTTON_LABEL,
|
|
@@ -57,37 +58,39 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
|
|
| 57 |
pending_eval_queue_df,
|
| 58 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 59 |
|
| 60 |
-
def init_leaderboard(dataframe):
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
| 90 |
|
|
|
|
| 91 |
|
| 92 |
demo = gr.Blocks(css=custom_css)
|
| 93 |
with demo:
|
|
@@ -95,8 +98,91 @@ with demo:
|
|
| 95 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 96 |
|
| 97 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
@@ -201,5 +287,4 @@ with demo:
|
|
| 201 |
scheduler = BackgroundScheduler()
|
| 202 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 203 |
scheduler.start()
|
| 204 |
-
print("test")
|
| 205 |
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
+
from src.data_utils import get_dataframe_category, get_dataframe_language
|
| 7 |
|
| 8 |
from src.about import (
|
| 9 |
CITATION_BUTTON_LABEL,
|
|
|
|
| 58 |
pending_eval_queue_df,
|
| 59 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 60 |
|
| 61 |
+
# def init_leaderboard(dataframe):
|
| 62 |
+
# if dataframe is None or dataframe.empty:
|
| 63 |
+
# raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 64 |
+
# return Leaderboard(
|
| 65 |
+
# value=dataframe,
|
| 66 |
+
# datatype=[c.type for c in fields(AutoEvalColumn)],
|
| 67 |
+
# select_columns=SelectColumns(
|
| 68 |
+
# default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
| 69 |
+
# cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
| 70 |
+
# label="Select Columns to Display:",
|
| 71 |
+
# ),
|
| 72 |
+
# search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
| 73 |
+
# hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
| 74 |
+
# filter_columns=[
|
| 75 |
+
# ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
| 76 |
+
# ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
| 77 |
+
# ColumnFilter(
|
| 78 |
+
# AutoEvalColumn.params.name,
|
| 79 |
+
# type="slider",
|
| 80 |
+
# min=0.01,
|
| 81 |
+
# max=150,
|
| 82 |
+
# label="Select the number of parameters (B)",
|
| 83 |
+
# ),
|
| 84 |
+
# ColumnFilter(
|
| 85 |
+
# AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
| 86 |
+
# ),
|
| 87 |
+
# ],
|
| 88 |
+
# bool_checkboxgroup_label="Hide models",
|
| 89 |
+
# interactive=False,
|
| 90 |
+
# )
|
| 91 |
+
|
| 92 |
|
| 93 |
+
tab_keys = ["Category", "Language"]
|
| 94 |
|
| 95 |
demo = gr.Blocks(css=custom_css)
|
| 96 |
with demo:
|
|
|
|
| 98 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 99 |
|
| 100 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 101 |
+
|
| 102 |
+
def search_leaderboard(query, df):
    """Return the rows of ``df`` in which any cell contains ``query``.

    Every cell is converted to ``str`` and matched case-insensitively, so the
    search covers all columns, not only the model name.

    Args:
        query: Text typed into the search box. ``None``, an empty string or a
            whitespace-only string means "no filter".
        df: The full leaderboard ``DataFrame``.

    Returns:
        ``df`` itself when there is nothing to filter, otherwise a new
        ``DataFrame`` holding only the matching rows.
    """
    if query is None or not query.strip():
        return df
    if df.empty:
        # ``DataFrame.apply(axis=1)`` on an empty frame does not yield a
        # boolean row mask, so short-circuit instead.
        return df

    def row_matches(row):
        # ``regex=False``: the query is user input and must be matched
        # literally; characters such as "(" or "+" would otherwise be parsed
        # as a regular expression and raise ``re.error``.
        return row.astype(str).str.contains(query, case=False, regex=False).any()

    return df[df.apply(row_matches, axis=1)]
|
| 107 |
+
|
| 108 |
+
def update_modelselector_group(groups, df):
    """Return the model names that belong to the selected groups.

    Args:
        groups: Group labels currently ticked in the group selector. ``None``
            or an empty list means no group is selected.
        df: Leaderboard ``DataFrame`` with ``"Group"`` and ``"Model Name"``
            columns.

    Returns:
        A list of unique model names, in order of first appearance in ``df``.
        An empty list is returned when no group is selected, so the return
        type is always a list.
    """
    if not groups:
        return []

    in_selected_groups = df["Group"].isin(groups)
    return df.loc[in_selected_groups, "Model Name"].unique().tolist()
|
| 121 |
+
|
| 122 |
+
def update_columnselector_group(columns, groups, df):
    """Build a column selection: chosen leading columns plus all score columns.

    Of the currently selected ``columns`` only those that are among the first
    three columns of ``df`` are kept; every column of ``df`` from the fourth
    onward is then appended.

    Args:
        columns: Currently selected column names; ``None`` is treated as an
            empty selection.
        groups: Accepted for signature compatibility; not used.
        df: Leaderboard ``DataFrame`` whose column order defines the split.

    Returns:
        A new list of column names. ``columns`` itself is not modified.
    """
    leading_columns = list(df.columns[:3])
    kept = [name for name in (columns or []) if name in leading_columns]
    return kept + list(df.columns[3:])
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def update_leaderboard(models, columns, df):
    """Return the leaderboard restricted to the chosen models and columns.

    Args:
        models: Model names to keep; ``None`` is treated as an empty selection.
        columns: Column names to show; ``None`` is treated as an empty
            selection. ``"Model Name"`` is always shown.
        df: Full leaderboard ``DataFrame`` with a ``"Model Name"`` column.

    Returns:
        A new ``DataFrame`` with the selected rows, the selected columns in
        ``df``'s original column order, and numeric values rounded to three
        decimals. ``df`` is left unmodified.
    """
    wanted_models = models or []
    wanted_columns = columns or []

    visible_columns = [
        name for name in df.columns if name in wanted_columns or name == "Model Name"
    ]
    row_mask = df["Model Name"].isin(wanted_models)

    # Select rows and columns in one ``.loc`` call and let ``round`` return a
    # fresh frame; assigning rounded columns back into a filtered slice
    # triggers pandas' SettingWithCopyWarning.
    return df.loc[row_mask, visible_columns].round(3)
|
| 146 |
+
|
| 147 |
+
def get_models_by_group(df, groups):
    """Return the ``"Model Name"`` of every row whose ``"Group"`` is selected.

    Unlike a unique listing, names are returned once per matching row, in row
    order.

    Args:
        df: Leaderboard ``DataFrame`` with ``"Group"`` and ``"Model Name"``
            columns.
        groups: Group labels to keep; ``None`` or empty selects nothing.

    Returns:
        A list of model names (possibly empty).
    """
    if not groups:
        # ``Series.isin(None)`` raises; an empty selection matches no rows.
        return []
    return df.loc[df["Group"].isin(groups), "Model Name"].tolist()
|
| 149 |
+
|
| 150 |
+
# Build one tab per entry of ``tab_keys``; each tab gets its own search box,
# group/column/model selectors and results table.
#
# NOTE(review): the visible changes add no import that binds ``configs``, yet
# the column defaults below read it. It is imported here from the
# ``src/config.py`` module so the lookups resolve -- confirm this is the
# intended module.
# NOTE(review): the nesting of the ``with`` blocks below is reconstructed;
# verify it against the intended page layout.
from src import config as configs

for key in tab_keys:
    with gr.TabItem(key, visible=True):
        if key == "Category":
            df = get_dataframe_category()
            on_load_columns = configs.ON_LOAD_COLUMNS_CATEGORY[3:]
        else:
            df = get_dataframe_language()
            on_load_columns = configs.ON_LOAD_COLUMNS_LANG[3:]
        # Handlers receive the full frame through this per-tab state.
        df_state = gr.State(df)

        with gr.Row():
            with gr.Column():
                search_box = gr.Textbox(label="Search Model by Name")
                # All groups are offered and all are ticked initially.
                group_list = df["Group"].unique().tolist()
                group_selector = gr.CheckboxGroup(
                    choices=group_list,
                    value=group_list,
                    label="Select Model Group",
                )
                # Columns from the fourth onward are selectable; the first
                # three are excluded from the selector.
                column_selector = gr.CheckboxGroup(
                    choices=df.columns.tolist()[3:],
                    value=on_load_columns,
                    label="Select Columns",
                )

            with gr.Column():
                # Accordion label is Korean for "Details".
                with gr.Accordion("세부 사항", open=False):
                    model_group = df["Model Name"].tolist()
                    model_selector = gr.CheckboxGroup(
                        choices=model_group,
                        value=model_group,
                        label="Select Models",
                    )

        ld = gr.DataFrame(value=df.round(3))

        # Define change functions for user interaction
        search_box.change(
            fn=search_leaderboard,
            inputs=[search_box, df_state],
            outputs=ld,
        )
        group_selector.change(
            fn=update_modelselector_group,
            inputs=[group_selector, df_state],
            outputs=model_selector,
        )
        model_selector.change(
            fn=update_leaderboard,
            inputs=[model_selector, column_selector, df_state],
            outputs=ld,
        )
        column_selector.change(
            fn=update_leaderboard,
            inputs=[model_selector, column_selector, df_state],
            outputs=ld,
        )
|
| 183 |
+
|
| 184 |
+
# with gr.TabItem("Docs"):
|
| 185 |
+
# gr.Markdown((Path(__file__).parent / "docs.md").read_text())
|
| 186 |
|
| 187 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 188 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
| 287 |
scheduler = BackgroundScheduler()
|
| 288 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 289 |
scheduler.start()
|
|
|
|
| 290 |
demo.queue(default_concurrency_limit=40).launch()
|
src/about.py
CHANGED
|
@@ -21,11 +21,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
|
|
| 21 |
|
| 22 |
|
| 23 |
# Your leaderboard name
|
| 24 |
-
TITLE = """<h1 align="center" id="space-title">
|
| 25 |
|
| 26 |
# What does your leaderboard evaluate?
|
| 27 |
INTRODUCTION_TEXT = """
|
| 28 |
-
|
| 29 |
"""
|
| 30 |
|
| 31 |
# Which evaluations are you running? how can people reproduce what you have?
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
# Your leaderboard name
|
| 24 |
+
TITLE = """<h1 align="center" id="space-title">🥇 ProductivityBench (v1)</h1>"""
|
| 25 |
|
| 26 |
# What does your leaderboard evaluate?
|
| 27 |
INTRODUCTION_TEXT = """
|
| 28 |
+
ProductivityBench is designed to evaluate LLMs for Productivity Assistants which stand for human's job productivity.
|
| 29 |
"""
|
| 30 |
|
| 31 |
# Which evaluations are you running? how can people reproduce what you have?
|
src/config.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Column presets for the leaderboard UI.
#
# Both ON_LOAD_* lists start with the same three columns; app.py slices them
# with ``[3:]`` to obtain the default ticked entries of the column selector.
_LEADING_COLUMNS = ["Model Name", "Group", "Overall"]

# Columns shown on load in the "Language" tab (language codes).
ON_LOAD_COLUMNS_LANG = _LEADING_COLUMNS + [
    "KO", "EN", "JA", "ZH",
    "PL", "DE", "PT", "ES",
    "FR", "IT", "RU", "VI",
]

# Columns shown on load in the "Category" tab.
# NOTE(review): "Repeatition" is spelled as-is; presumably it must match the
# column header of the loaded data -- verify before correcting the spelling.
ON_LOAD_COLUMNS_CATEGORY = _LEADING_COLUMNS + [
    "Content Generation",
    "Editing",
    "Data Analysis",
    "Reasoning",
    "Samsung Knowledge",
    "Hallucination",
    "Safety",
    "Repeatition",
    "Summarization",
    "Translation",
    "Multi-Turn",
]

# Names of the available column groupings.
COLUMN_GROUP_LIST = ["Category", "Language"]
|
src/data/export_category_250618.csv
CHANGED
|
@@ -8,36 +8,4 @@
|
|
| 8 |
"deepseek_r1" "DeepSeek" "55.27" "61.69" "54.76" "68.67" "68.00" "46.67" "51.67" "20.00" "46.67" "67.81" "49.00" "43.33"
|
| 9 |
"deepseek_r1_0528" "DeepSeek" "52.60" "59.09" "51.19" "65.33" "65.00" "38.33" "43.33" "27.50" "53.33" "69.18" "41.33" "41.67"
|
| 10 |
"deepseek_v3" "DeepSeek" "56.99" "62.99" "58.93" "58.00" "59.00" "36.67" "41.67" "25.00" "40.00" "72.60" "60.00" "46.67"
|
| 11 |
-
"deepseek_v3_0324" "DeepSeek" "54.51" "55.84" "48.21" "63.33" "70.00" "43.33" "50.00" "20.00" "46.67" "72.95" "49.67" "43.33"
|
| 12 |
-
"gemini-1.5-flash" "Gemini" "45.24" "50.65" "42.26" "46.67" "43.00" "20.00" "53.33" "21.25" "13.33" "66.44" "40.00" "39.44"
|
| 13 |
-
"gemini-1.5-pro" "Gemini" "52.48" "57.14" "50.00" "50.00" "54.00" "43.33" "51.67" "33.75" "30.00" "69.52" "52.00" "40.56"
|
| 14 |
-
"gemini-2.0-flash" "Gemini" "55.27" "54.55" "54.17" "56.00" "51.00" "58.33" "60.00" "20.00" "40.00" "74.32" "56.00" "42.22"
|
| 15 |
-
"gemini-2.5-pro-05-06" "Gemini" "63.98" "62.99" "61.90" "70.67" "72.00" "48.33" "73.33" "23.75" "43.33" "78.77" "66.00" "52.78"
|
| 16 |
-
"Gemma-2-27B-it" "Gemma" "43.14" "51.95" "38.10" "42.67" "29.00" "21.67" "48.33" "37.50" "20.00" "62.33" "41.00" "32.78"
|
| 17 |
-
"Gemma-3-1B-it" "Gemma" "12.96" "25.32" "10.12" "15.33" "9.00" "0.00" "11.67" "27.50" "6.67" "22.60" "2.67" "6.11"
|
| 18 |
-
"Gemma-3-4B-it" "Gemma" "29.61" "40.91" "28.57" "30.00" "20.00" "13.33" "20.00" "28.75" "10.00" "51.03" "22.00" "16.11"
|
| 19 |
-
"Gemma-3-12B-it" "Gemma" "42.50" "51.30" "48.81" "37.33" "30.00" "23.33" "31.67" "33.75" "16.67" "66.44" "37.33" "28.33"
|
| 20 |
-
"Gemma-3-27B-it" "Gemma" "44.09" "53.25" "44.64" "50.00" "39.00" "33.33" "45.00" "26.25" "23.33" "63.36" "33.67" "34.44"
|
| 21 |
-
"gpt-4o" "GPT" "56.42" "61.04" "61.31" "58.67" "49.00" "45.00" "51.67" "35.00" "43.33" "73.29" "53.00" "45.56"
|
| 22 |
-
"gpt-o1" "GPT" "67.92" "68.18" "76.19" "74.00" "69.00" "35.00" "65.00" "30.00" "66.67" "84.59" "66.67" "58.33"
|
| 23 |
-
"gpt-o3" "GPT" "70.33" "76.62" "75.00" "74.67" "79.00" "53.33" "58.33" "23.75" "76.67" "83.56" "74.00" "53.89"
|
| 24 |
-
"gpt-o4-mini" "GPT" "65.31" "75.97" "63.69" "76.00" "77.00" "41.67" "55.00" "30.00" "66.67" "81.85" "59.67" "51.67"
|
| 25 |
-
"llama3_1_8b_inst" "Llama" "25.79" "37.66" "25.00" "31.33" "18.00" "13.33" "36.67" "23.75" "13.33" "37.67" "17.00" "15.00"
|
| 26 |
-
"llama3_1_70b_inst" "Llama" "40.79" "45.45" "41.67" "49.33" "35.00" "23.33" "43.33" "21.25" "20.00" "54.79" "37.33" "32.22"
|
| 27 |
-
"llama3_1_405b_fp8_inst" "Llama" "48.03" "50.00" "48.81" "52.67" "47.00" "30.00" "50.00" "22.50" "33.33" "64.04" "47.33" "36.67"
|
| 28 |
-
"llama3_3_70b_inst" "Llama" "40.60" "48.70" "43.45" "45.33" "38.00" "16.67" "40.00" "20.00" "16.67" "58.56" "32.67" "33.89"
|
| 29 |
-
"llama4_scout" "Llama" "44.98" "46.75" "39.88" "52.67" "43.00" "31.67" "41.67" "22.50" "23.33" "61.30" "44.00" "37.22"
|
| 30 |
-
"llama4_maverick" "Llama" "51.65" "54.55" "43.45" "58.67" "55.00" "36.67" "55.00" "32.50" "16.67" "64.04" "53.33" "44.44"
|
| 31 |
-
"Mixtral-8x7B-Instruct-v0.1" "Mistral" "22.81" "26.62" "16.07" "24.67" "13.00" "16.67" "38.33" "23.75" "23.33" "37.67" "13.00" "18.33"
|
| 32 |
-
"phi-4" "Phi" "39.83" "45.45" "39.88" "47.33" "45.00" "16.67" "33.33" "46.25" "23.33" "51.71" "33.00" "27.78"
|
| 33 |
-
"Qwen2-72B-Instruct" "Qwen" "39.52" "42.86" "38.69" "34.67" "31.00" "18.33" "51.67" "32.50" "23.33" "56.16" "37.33" "31.67"
|
| 34 |
-
"Qwen2.5-14B-Instruct" "Qwen" "37.99" "45.45" "27.98" "36.67" "39.00" "21.67" "51.67" "32.50" "26.67" "54.45" "32.67" "28.89"
|
| 35 |
-
"Qwen2.5-32B-Instruct" "Qwen" "43.84" "51.95" "38.10" "47.33" "45.00" "21.67" "55.00" "35.00" "20.00" "63.36" "36.00" "31.67"
|
| 36 |
-
"Qwen2.5-72B-Instruct" "Qwen" "46.19" "52.60" "43.45" "50.67" "42.00" "23.33" "48.33" "37.50" "30.00" "65.41" "39.00" "36.11"
|
| 37 |
-
"Qwen-QwQ-32B" "Qwen" "47.46" "54.55" "45.24" "65.33" "66.00" "25.00" "36.67" "21.25" "26.67" "65.07" "39.33" "29.44"
|
| 38 |
-
"Qwen3-235B-A22B" "Qwen" "48.09" "59.74" "41.67" "65.33" "71.00" "33.33" "41.67" "20.00" "33.33" "66.44" "30.33" "38.89"
|
| 39 |
-
"Gauss2.2-37B-Instruct-250430" "Gauss" "50.70" "52.60" "50.60" "43.33" "42.00" "28.33" "41.67" "26.25" "26.67" "71.23" "58.00" "40.00"
|
| 40 |
-
"Gauss2.2-37B-Think-250430" "Gauss" "46.00" "57.14" "40.48" "59.33" "59.00" "26.67" "36.67" "20.00" "36.67" "60.62" "39.67" "32.78"
|
| 41 |
-
"GaussO-Owl-Ultra-Think-250604" "Gauss" "57.05" "63.64" "52.98" "66.00" "57.00" "48.33" "55.00" "37.50" "33.33" "75.00" "53.67" "40.56"
|
| 42 |
-
"GaussO-Owl-Ultra-Think-250423" "Gauss" "56.10" "61.04" "47.62" "68.00" "69.00" "48.33" "51.67" "23.75" "53.33" "69.86" "53.00" "44.44"
|
| 43 |
-
"GaussO-Owl-Ultra-Instruct-250423" "Gauss" "58.58" "64.94" "55.95" "63.33" "69.00" "41.67" "53.33" "25.00" "36.67" "73.97" "60.00" "44.44"
|
|
|
|
| 8 |
"deepseek_r1" "DeepSeek" "55.27" "61.69" "54.76" "68.67" "68.00" "46.67" "51.67" "20.00" "46.67" "67.81" "49.00" "43.33"
|
| 9 |
"deepseek_r1_0528" "DeepSeek" "52.60" "59.09" "51.19" "65.33" "65.00" "38.33" "43.33" "27.50" "53.33" "69.18" "41.33" "41.67"
|
| 10 |
"deepseek_v3" "DeepSeek" "56.99" "62.99" "58.93" "58.00" "59.00" "36.67" "41.67" "25.00" "40.00" "72.60" "60.00" "46.67"
|
| 11 |
+
"deepseek_v3_0324" "DeepSeek" "54.51" "55.84" "48.21" "63.33" "70.00" "43.33" "50.00" "20.00" "46.67" "72.95" "49.67" "43.33"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/data_utils.py
ADDED
|
File without changes
|