Commit
·
40afe38
1
Parent(s):
e43ddac
Dramatically simplify everything
Browse files
- app.py +14 -119
- src/about.py +0 -16
- src/display/utils.py +0 -62
- src/envs.py +3 -9
- src/leaderboard/populate.py +47 -0
- src/leaderboard/read_evals.py +0 -110
- src/leaderboard/utils.py +18 -0
- src/populate.py +0 -56
- src/submission/check_validity.py +0 -36
- src/submission/submit.py +0 -70
app.py
CHANGED
|
@@ -2,73 +2,37 @@ import gradio as gr
|
|
| 2 |
import pandas as pd
|
| 3 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 4 |
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
|
| 5 |
-
from huggingface_hub import snapshot_download
|
| 6 |
|
|
|
|
|
|
|
| 7 |
from src.about import INTRODUCTION_TEXT, TITLE
|
| 8 |
from src.display.css_html_js import custom_css
|
| 9 |
-
from src.
|
| 10 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 11 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 12 |
-
from src.submission.submit import add_new_eval
|
| 13 |
|
| 14 |
|
| 15 |
def restart_space():
|
| 16 |
API.restart_space(repo_id=REPO_ID)
|
| 17 |
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
print(EVAL_REQUESTS_PATH)
|
| 22 |
-
snapshot_download(
|
| 23 |
-
repo_id=QUEUE_REPO,
|
| 24 |
-
local_dir=EVAL_REQUESTS_PATH,
|
| 25 |
-
repo_type="dataset",
|
| 26 |
-
tqdm_class=None,
|
| 27 |
-
etag_timeout=30,
|
| 28 |
-
token=TOKEN,
|
| 29 |
-
)
|
| 30 |
-
except Exception:
|
| 31 |
-
restart_space()
|
| 32 |
-
try:
|
| 33 |
-
print(EVAL_RESULTS_PATH)
|
| 34 |
-
snapshot_download(
|
| 35 |
-
repo_id=RESULTS_REPO,
|
| 36 |
-
local_dir=EVAL_RESULTS_PATH,
|
| 37 |
-
repo_type="dataset",
|
| 38 |
-
tqdm_class=None,
|
| 39 |
-
etag_timeout=30,
|
| 40 |
-
token=TOKEN,
|
| 41 |
-
)
|
| 42 |
-
except Exception:
|
| 43 |
-
restart_space()
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 47 |
-
|
| 48 |
-
(
|
| 49 |
-
finished_eval_queue_df,
|
| 50 |
-
running_eval_queue_df,
|
| 51 |
-
pending_eval_queue_df,
|
| 52 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 53 |
-
|
| 54 |
|
| 55 |
-
def init_leaderboard(dataframe):
|
| 56 |
if dataframe is None or dataframe.empty:
|
| 57 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
|
|
|
| 58 |
return Leaderboard(
|
| 59 |
value=dataframe,
|
| 60 |
-
datatype=[c.type for c in
|
| 61 |
select_columns=SelectColumns(
|
| 62 |
-
default_selection=[c.name for c in
|
| 63 |
-
cant_deselect=[c.name for c in
|
| 64 |
label="Select Columns to Display:",
|
| 65 |
),
|
| 66 |
-
search_columns=[
|
| 67 |
-
hide_columns=[c.name for c in
|
| 68 |
filter_columns=[
|
| 69 |
-
ColumnFilter(AutoEvalColumn.inference_provider.name, type="checkboxgroup", label="Inference Provider"),
|
| 70 |
ColumnFilter(
|
| 71 |
-
|
| 72 |
type="slider",
|
| 73 |
min=0.01,
|
| 74 |
max=100,
|
|
@@ -87,79 +51,10 @@ with demo:
|
|
| 87 |
|
| 88 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 89 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 90 |
-
leaderboard = init_leaderboard(
|
| 91 |
-
|
| 92 |
-
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
| 93 |
-
with gr.Column():
|
| 94 |
-
with gr.Row():
|
| 95 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 96 |
-
|
| 97 |
-
with gr.Column():
|
| 98 |
-
with gr.Accordion(
|
| 99 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
| 100 |
-
open=False,
|
| 101 |
-
):
|
| 102 |
-
with gr.Row():
|
| 103 |
-
finished_eval_table = gr.components.Dataframe(
|
| 104 |
-
value=finished_eval_queue_df,
|
| 105 |
-
headers=EVAL_COLS,
|
| 106 |
-
datatype=EVAL_TYPES,
|
| 107 |
-
row_count=5,
|
| 108 |
-
)
|
| 109 |
-
with gr.Accordion(
|
| 110 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
| 111 |
-
open=False,
|
| 112 |
-
):
|
| 113 |
-
with gr.Row():
|
| 114 |
-
running_eval_table = gr.components.Dataframe(
|
| 115 |
-
value=running_eval_queue_df,
|
| 116 |
-
headers=EVAL_COLS,
|
| 117 |
-
datatype=EVAL_TYPES,
|
| 118 |
-
row_count=5,
|
| 119 |
-
)
|
| 120 |
-
|
| 121 |
-
with gr.Accordion(
|
| 122 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
| 123 |
-
open=False,
|
| 124 |
-
):
|
| 125 |
-
with gr.Row():
|
| 126 |
-
pending_eval_table = gr.components.Dataframe(
|
| 127 |
-
value=pending_eval_queue_df,
|
| 128 |
-
headers=EVAL_COLS,
|
| 129 |
-
datatype=EVAL_TYPES,
|
| 130 |
-
row_count=5,
|
| 131 |
-
)
|
| 132 |
-
with gr.Row():
|
| 133 |
-
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
| 134 |
-
|
| 135 |
-
with gr.Row():
|
| 136 |
-
with gr.Column():
|
| 137 |
-
model_id_textbox = gr.Textbox(label="Model ID")
|
| 138 |
-
inference_provider = gr.Dropdown(
|
| 139 |
-
choices=[
|
| 140 |
-
"Together",
|
| 141 |
-
"OpenAI",
|
| 142 |
-
"Anthropic",
|
| 143 |
-
"Vertex AI",
|
| 144 |
-
],
|
| 145 |
-
label="Inference Provider",
|
| 146 |
-
multiselect=False,
|
| 147 |
-
value=None,
|
| 148 |
-
interactive=True,
|
| 149 |
-
)
|
| 150 |
|
| 151 |
-
submit_button = gr.Button("Submit Eval")
|
| 152 |
-
submission_result = gr.Markdown()
|
| 153 |
-
submit_button.click(
|
| 154 |
-
add_new_eval,
|
| 155 |
-
[
|
| 156 |
-
model_id_textbox,
|
| 157 |
-
inference_provider,
|
| 158 |
-
],
|
| 159 |
-
submission_result,
|
| 160 |
-
)
|
| 161 |
|
| 162 |
scheduler = BackgroundScheduler()
|
| 163 |
-
scheduler.add_job(restart_space, "interval", seconds=
|
| 164 |
scheduler.start()
|
| 165 |
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 4 |
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
|
|
|
|
| 5 |
|
| 6 |
+
from leaderboard.populate import load_results
|
| 7 |
+
from leaderboard.utils import COLUMNS
|
| 8 |
from src.about import INTRODUCTION_TEXT, TITLE
|
| 9 |
from src.display.css_html_js import custom_css
|
| 10 |
+
from src.envs import API, REPO_ID
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def restart_space():
|
| 14 |
API.restart_space(repo_id=REPO_ID)
|
| 15 |
|
| 16 |
|
| 17 |
+
def init_leaderboard(dataframe: pd.DataFrame):
|
| 18 |
+
dataframe = load_results()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
|
|
|
| 20 |
if dataframe is None or dataframe.empty:
|
| 21 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 22 |
+
|
| 23 |
return Leaderboard(
|
| 24 |
value=dataframe,
|
| 25 |
+
datatype=[c.type for c in COLUMNS],
|
| 26 |
select_columns=SelectColumns(
|
| 27 |
+
default_selection=[c.name for c in COLUMNS if c.displayed_by_default],
|
| 28 |
+
cant_deselect=[c.name for c in COLUMNS if c.never_hidden],
|
| 29 |
label="Select Columns to Display:",
|
| 30 |
),
|
| 31 |
+
search_columns=["Model"],
|
| 32 |
+
hide_columns=[c.name for c in COLUMNS if c.hidden],
|
| 33 |
filter_columns=[
|
|
|
|
| 34 |
ColumnFilter(
|
| 35 |
+
"Average",
|
| 36 |
type="slider",
|
| 37 |
min=0.01,
|
| 38 |
max=100,
|
|
|
|
| 51 |
|
| 52 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 53 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 54 |
+
leaderboard = init_leaderboard()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
scheduler = BackgroundScheduler()
|
| 58 |
+
scheduler.add_job(restart_space, "interval", seconds=3600)
|
| 59 |
scheduler.start()
|
| 60 |
demo.queue(default_concurrency_limit=40).launch()
|
src/about.py
CHANGED
|
@@ -1,19 +1,3 @@
|
|
| 1 |
-
from dataclasses import dataclass
|
| 2 |
-
from enum import Enum
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
@dataclass
|
| 6 |
-
class Task:
|
| 7 |
-
benchmark: str
|
| 8 |
-
metric: str
|
| 9 |
-
col_name: str
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class Tasks(Enum):
|
| 13 |
-
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 14 |
-
robloxQA = Task("robloxqa", "acc", "RobloxQA")
|
| 15 |
-
|
| 16 |
-
|
| 17 |
# Your leaderboard name
|
| 18 |
TITLE = """<h1 align="center" id="space-title">Roblox LLM Leaderboard</h1>"""
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Your leaderboard name
|
| 2 |
TITLE = """<h1 align="center" id="space-title">Roblox LLM Leaderboard</h1>"""
|
| 3 |
|
src/display/utils.py
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
from dataclasses import dataclass, make_dataclass
|
| 2 |
-
from enum import Enum
|
| 3 |
-
|
| 4 |
-
import pandas as pd
|
| 5 |
-
|
| 6 |
-
from src.about import Tasks
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def fields(raw_class):
|
| 10 |
-
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
# These classes are for user facing column names,
|
| 14 |
-
# to avoid having to change them all around the code
|
| 15 |
-
# when a modif is needed
|
| 16 |
-
@dataclass
|
| 17 |
-
class ColumnContent:
|
| 18 |
-
name: str
|
| 19 |
-
type: str
|
| 20 |
-
displayed_by_default: bool
|
| 21 |
-
hidden: bool = False
|
| 22 |
-
never_hidden: bool = False
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
## For the queue columns in the submission tab
|
| 26 |
-
@dataclass(frozen=True)
|
| 27 |
-
class EvalQueueColumn: # Queue column
|
| 28 |
-
model_id = ColumnContent("model_id", "str", True)
|
| 29 |
-
inference_provider = ColumnContent("inference_provider", "str", True)
|
| 30 |
-
status = ColumnContent("status", "str", True)
|
| 31 |
-
date = ColumnContent("date", "str", True)
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
## All the model information that we might need
|
| 35 |
-
@dataclass
|
| 36 |
-
class ModelDetails:
|
| 37 |
-
name: str
|
| 38 |
-
display_name: str = ""
|
| 39 |
-
symbol: str = "" # emoji
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
## Leaderboard columns
|
| 43 |
-
auto_eval_column_dict = [
|
| 44 |
-
["model_id", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)],
|
| 45 |
-
["inference_provider", ColumnContent, ColumnContent("Inference Provider", "str", False)],
|
| 46 |
-
]
|
| 47 |
-
# Scores
|
| 48 |
-
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 49 |
-
for task in Tasks:
|
| 50 |
-
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
| 51 |
-
|
| 52 |
-
# We use make dataclass to dynamically fill the scores from Tasks
|
| 53 |
-
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
# Column selection
|
| 57 |
-
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
| 58 |
-
|
| 59 |
-
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
| 60 |
-
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 61 |
-
|
| 62 |
-
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/envs.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
|
@@ -7,19 +8,12 @@ from huggingface_hub import HfApi
|
|
| 7 |
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
| 8 |
|
| 9 |
OWNER = "boatbomber"
|
| 10 |
-
# ----------------------------------
|
| 11 |
-
|
| 12 |
REPO_ID = f"{OWNER}/roblox-llm-leaderboard"
|
| 13 |
-
QUEUE_REPO = f"{OWNER}/roblox-llm-leaderboard-requests"
|
| 14 |
RESULTS_REPO = f"{OWNER}/roblox-llm-leaderboard-results"
|
| 15 |
|
| 16 |
# If you setup a cache later, just change HF_HOME
|
| 17 |
-
CACHE_PATH = os.getenv("HF_HOME", ".")
|
| 18 |
|
| 19 |
-
|
| 20 |
-
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 21 |
-
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
| 22 |
-
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 23 |
-
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
| 24 |
|
| 25 |
API = HfApi(token=TOKEN)
|
|
|
|
| 1 |
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
|
| 4 |
from huggingface_hub import HfApi
|
| 5 |
|
|
|
|
| 8 |
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
| 9 |
|
| 10 |
OWNER = "boatbomber"
|
|
|
|
|
|
|
| 11 |
REPO_ID = f"{OWNER}/roblox-llm-leaderboard"
|
|
|
|
| 12 |
RESULTS_REPO = f"{OWNER}/roblox-llm-leaderboard-results"
|
| 13 |
|
| 14 |
# If you setup a cache later, just change HF_HOME
|
| 15 |
+
CACHE_PATH = Path(os.getenv("HF_HOME", "."))
|
| 16 |
|
| 17 |
+
RESULTS_REPO_PATH = CACHE_PATH / "roblox-llm-leaderboard-results"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
API = HfApi(token=TOKEN)
|
src/leaderboard/populate.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from huggingface_hub import snapshot_download
|
| 5 |
+
|
| 6 |
+
from leaderboard.utils import COLUMNS
|
| 7 |
+
from src.envs import RESULTS_REPO, RESULTS_REPO_PATH, TOKEN
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def download_result_data():
    """Download a snapshot of the results dataset repository.

    Prints the repository id, then calls ``snapshot_download`` for
    ``RESULTS_REPO`` (a ``"dataset"`` repo) with ``RESULTS_REPO_PATH`` as the
    local directory, authenticating with ``TOKEN``.
    """
    print(f"Downloading {RESULTS_REPO}")

    # Collected in one place so the download configuration reads as data.
    download_kwargs = {
        "repo_id": RESULTS_REPO,
        "local_dir": RESULTS_REPO_PATH,
        "repo_type": "dataset",
        "tqdm_class": None,
        "etag_timeout": 30,
        "token": TOKEN,
    }
    snapshot_download(**download_kwargs)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_results() -> pd.DataFrame:
    """Build the leaderboard dataframe from the downloaded result files.

    If ``RESULTS_REPO_PATH`` does not exist yet, the results repository is
    downloaded first. Every ``*.json`` file found (recursively) under
    ``RESULTS_REPO_PATH / "results"`` is read as one evaluation: its
    ``"Results"`` mapping is removed, each entry of that mapping is copied
    onto the top level of the evaluation, and an ``"Average"`` entry holding
    the mean of the result values is added.

    Returns:
        A dataframe restricted to the columns named in ``COLUMNS``, sorted by
        ``"Average"`` in descending order.

    Raises:
        ValueError: If the ``results`` directory is missing or not a directory.
        KeyError: If a result file has no ``"Results"`` entry.
    """
    if not RESULTS_REPO_PATH.exists():
        download_result_data()

    results_dir = RESULTS_REPO_PATH / "results"
    if not results_dir.is_dir():
        raise ValueError("No results found in the results directory")

    data = []
    # Sorted so that row order (and tie order after the stable sort below)
    # does not depend on filesystem enumeration order.
    for file in sorted(results_dir.rglob("*.json")):
        with open(file, encoding="utf-8") as f:
            evaluation = json.load(f)

        results = evaluation.pop("Results")

        # An evaluation with no scores has no meaningful average; use NaN
        # instead of dividing by zero so one empty file cannot crash the app.
        if results:
            evaluation["Average"] = sum(results.values()) / len(results)
        else:
            evaluation["Average"] = float("nan")
        evaluation.update(results)

        data.append(evaluation)

    dataframe = pd.DataFrame(data, columns=[c.name for c in COLUMNS])
    # sort_values returns a new frame; the result must be kept, otherwise the
    # leaderboard stays in file order.
    return dataframe.sort_values(by=["Average"], ascending=False, ignore_index=True)
|
src/leaderboard/read_evals.py
DELETED
|
@@ -1,110 +0,0 @@
|
|
| 1 |
-
import glob
|
| 2 |
-
import json
|
| 3 |
-
import math
|
| 4 |
-
import os
|
| 5 |
-
from dataclasses import dataclass, field
|
| 6 |
-
|
| 7 |
-
import dateutil
|
| 8 |
-
import numpy as np
|
| 9 |
-
|
| 10 |
-
from src.display.utils import AutoEvalColumn, Tasks
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
@dataclass
|
| 14 |
-
class EvalResult:
|
| 15 |
-
"""Represents one full evaluation."""
|
| 16 |
-
|
| 17 |
-
eval_name: str = ""
|
| 18 |
-
date: str = ""
|
| 19 |
-
model_id: str = ""
|
| 20 |
-
inference_provider: str = ""
|
| 21 |
-
results: dict = field(default_factory=dict)
|
| 22 |
-
|
| 23 |
-
@classmethod
|
| 24 |
-
def init_from_json_file(self, json_filepath):
|
| 25 |
-
"""Inits the result from the specific model result file"""
|
| 26 |
-
with open(json_filepath) as fp:
|
| 27 |
-
data = json.load(fp)
|
| 28 |
-
|
| 29 |
-
raw_results = data.get("results", {})
|
| 30 |
-
config = data.get("config")
|
| 31 |
-
inference_provider = config.get("inference_provider", "Unknown")
|
| 32 |
-
model_id = config.get("model_id", "Unknown")
|
| 33 |
-
date = config.get("date", "Unknown")
|
| 34 |
-
|
| 35 |
-
# Extract results available in this file (some results are split in several files)
|
| 36 |
-
results = {}
|
| 37 |
-
for task in Tasks:
|
| 38 |
-
task = task.value
|
| 39 |
-
|
| 40 |
-
# We average all scores of a given metric (not all metrics are present in all files)
|
| 41 |
-
accs = np.array([v.get(task.metric, None) for k, v in raw_results.items() if task.benchmark == k])
|
| 42 |
-
if accs.size == 0 or any([acc is None for acc in accs]):
|
| 43 |
-
continue
|
| 44 |
-
|
| 45 |
-
mean_acc = np.mean(accs) * 100.0
|
| 46 |
-
results[task.benchmark] = mean_acc
|
| 47 |
-
|
| 48 |
-
return self(
|
| 49 |
-
eval_name=f"{inference_provider}:{model_id}",
|
| 50 |
-
model_id=model_id,
|
| 51 |
-
inference_provider=inference_provider,
|
| 52 |
-
results=results,
|
| 53 |
-
date=date,
|
| 54 |
-
)
|
| 55 |
-
|
| 56 |
-
def to_dict(self):
|
| 57 |
-
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 58 |
-
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
| 59 |
-
data_dict = {
|
| 60 |
-
"eval_name": self.eval_name, # not a column, just a save name,
|
| 61 |
-
"model_id": self.model_id,
|
| 62 |
-
"inference_provider": self.inference_provider,
|
| 63 |
-
"date": self.date,
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
for task in Tasks:
|
| 67 |
-
data_dict[task.value.col_name] = self.results[task.value.benchmark]
|
| 68 |
-
|
| 69 |
-
return data_dict
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
| 73 |
-
"""From the path of the results folder root, extract all needed info for results"""
|
| 74 |
-
model_result_filepaths = []
|
| 75 |
-
|
| 76 |
-
for root, _, files in os.walk(results_path):
|
| 77 |
-
# We should only have json files in model results
|
| 78 |
-
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
| 79 |
-
continue
|
| 80 |
-
|
| 81 |
-
# Sort the files by date
|
| 82 |
-
try:
|
| 83 |
-
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
| 84 |
-
except dateutil.parser._parser.ParserError:
|
| 85 |
-
files = [files[-1]]
|
| 86 |
-
|
| 87 |
-
for file in files:
|
| 88 |
-
model_result_filepaths.append(os.path.join(root, file))
|
| 89 |
-
|
| 90 |
-
eval_results = {}
|
| 91 |
-
for model_result_filepath in model_result_filepaths:
|
| 92 |
-
# Creation of result
|
| 93 |
-
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
| 94 |
-
|
| 95 |
-
# Store results of same eval together
|
| 96 |
-
eval_name = eval_result.eval_name
|
| 97 |
-
if eval_name in eval_results.keys():
|
| 98 |
-
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
| 99 |
-
else:
|
| 100 |
-
eval_results[eval_name] = eval_result
|
| 101 |
-
|
| 102 |
-
results = []
|
| 103 |
-
for v in eval_results.values():
|
| 104 |
-
try:
|
| 105 |
-
v.to_dict() # we test if the dict version is complete
|
| 106 |
-
results.append(v)
|
| 107 |
-
except KeyError: # not all eval values present
|
| 108 |
-
continue
|
| 109 |
-
|
| 110 |
-
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard/utils.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@dataclass
|
| 5 |
+
class ColumnContent:
|
| 6 |
+
name: str
|
| 7 |
+
type: str
|
| 8 |
+
displayed_by_default: bool
|
| 9 |
+
hidden: bool = False
|
| 10 |
+
never_hidden: bool = False
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
## Leaderboard columns
|
| 14 |
+
COLUMNS = [
|
| 15 |
+
ColumnContent("Model", type="str", displayed_by_default=True, never_hidden=True),
|
| 16 |
+
ColumnContent("Average", type="number", displayed_by_default=True),
|
| 17 |
+
ColumnContent("RobloxQA", type="number", displayed_by_default=True),
|
| 18 |
+
]
|
src/populate.py
DELETED
|
@@ -1,56 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
|
| 4 |
-
import pandas as pd
|
| 5 |
-
|
| 6 |
-
from src.display.formatting import has_no_nan_values, make_clickable_model
|
| 7 |
-
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
| 8 |
-
from src.leaderboard.read_evals import get_raw_eval_results
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
| 12 |
-
"""Creates a dataframe from all the individual experiment results"""
|
| 13 |
-
raw_data = get_raw_eval_results(results_path, requests_path)
|
| 14 |
-
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
-
|
| 16 |
-
df = pd.DataFrame.from_records(all_data_json)
|
| 17 |
-
# filter out if any of the benchmarks have not been produced
|
| 18 |
-
df = df[has_no_nan_values(df, benchmark_cols)]
|
| 19 |
-
|
| 20 |
-
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
| 21 |
-
df = df[cols].round(decimals=2)
|
| 22 |
-
|
| 23 |
-
return df
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
| 27 |
-
"""Creates the different dataframes for the evaluation queues requestes"""
|
| 28 |
-
entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
|
| 29 |
-
all_evals = []
|
| 30 |
-
|
| 31 |
-
for entry in entries:
|
| 32 |
-
if ".json" in entry:
|
| 33 |
-
file_path = os.path.join(save_path, entry)
|
| 34 |
-
with open(file_path) as fp:
|
| 35 |
-
data = json.load(fp)
|
| 36 |
-
|
| 37 |
-
all_evals.append(data)
|
| 38 |
-
elif ".md" not in entry:
|
| 39 |
-
# this is a folder
|
| 40 |
-
sub_entries = [
|
| 41 |
-
e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")
|
| 42 |
-
]
|
| 43 |
-
for sub_entry in sub_entries:
|
| 44 |
-
file_path = os.path.join(save_path, entry, sub_entry)
|
| 45 |
-
with open(file_path) as fp:
|
| 46 |
-
data = json.load(fp)
|
| 47 |
-
|
| 48 |
-
all_evals.append(data)
|
| 49 |
-
|
| 50 |
-
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
| 51 |
-
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
| 52 |
-
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
|
| 53 |
-
df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
|
| 54 |
-
df_running = pd.DataFrame.from_records(running_list, columns=cols)
|
| 55 |
-
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
|
| 56 |
-
return df_finished[cols], df_running[cols], df_pending[cols]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/submission/check_validity.py
DELETED
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
import re
|
| 4 |
-
from collections import defaultdict
|
| 5 |
-
from datetime import datetime, timedelta, timezone
|
| 6 |
-
|
| 7 |
-
import huggingface_hub
|
| 8 |
-
from huggingface_hub import ModelCard
|
| 9 |
-
from huggingface_hub.hf_api import ModelInfo
|
| 10 |
-
from transformers import AutoConfig
|
| 11 |
-
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def already_submitted_models(requested_models_dir: str) -> set[str]:
|
| 15 |
-
"""Gather a list of already submitted models to avoid duplicates"""
|
| 16 |
-
depth = 1
|
| 17 |
-
file_names = []
|
| 18 |
-
users_to_submission_dates = defaultdict(list)
|
| 19 |
-
|
| 20 |
-
for root, _, files in os.walk(requested_models_dir):
|
| 21 |
-
current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
|
| 22 |
-
if current_depth == depth:
|
| 23 |
-
for file in files:
|
| 24 |
-
if not file.endswith(".json"):
|
| 25 |
-
continue
|
| 26 |
-
with open(os.path.join(root, file), "r") as f:
|
| 27 |
-
info = json.load(f)
|
| 28 |
-
file_names.append(f"{info['inference_provider']}:{info['model_id']}")
|
| 29 |
-
|
| 30 |
-
# Select organisation
|
| 31 |
-
if info["model_id"].count("/") == 0 or "submitted_time" not in info:
|
| 32 |
-
continue
|
| 33 |
-
organisation, _ = info["model_id"].split("/")
|
| 34 |
-
users_to_submission_dates[organisation].append(info["submitted_time"])
|
| 35 |
-
|
| 36 |
-
return set(file_names), users_to_submission_dates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/submission/submit.py
DELETED
|
@@ -1,70 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
from datetime import datetime, timezone
|
| 4 |
-
|
| 5 |
-
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
-
from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
|
| 7 |
-
from src.submission.check_validity import already_submitted_models
|
| 8 |
-
|
| 9 |
-
REQUESTED_MODELS = None
|
| 10 |
-
USERS_TO_SUBMISSION_DATES = None
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def add_new_eval(
|
| 14 |
-
model_id: str,
|
| 15 |
-
inference_provider: str,
|
| 16 |
-
):
|
| 17 |
-
global REQUESTED_MODELS
|
| 18 |
-
global USERS_TO_SUBMISSION_DATES
|
| 19 |
-
if not REQUESTED_MODELS:
|
| 20 |
-
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
| 21 |
-
|
| 22 |
-
if not inference_provider:
|
| 23 |
-
return styled_error("Please select an inference provider.")
|
| 24 |
-
|
| 25 |
-
user_name = ""
|
| 26 |
-
model_path = model
|
| 27 |
-
if "/" in model:
|
| 28 |
-
user_name = model.split("/")[0]
|
| 29 |
-
model_path = model.split("/")[1]
|
| 30 |
-
|
| 31 |
-
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 32 |
-
|
| 33 |
-
print("Adding new eval")
|
| 34 |
-
|
| 35 |
-
eval_entry = {
|
| 36 |
-
"submitted_time": current_time,
|
| 37 |
-
"status": "PENDING",
|
| 38 |
-
"model_id": model_id,
|
| 39 |
-
"inference_provider": inference_provider,
|
| 40 |
-
}
|
| 41 |
-
|
| 42 |
-
# Check for duplicate submission
|
| 43 |
-
if f"{inference_provider}:{model_id}" in REQUESTED_MODELS:
|
| 44 |
-
return styled_warning("This model has been already submitted.")
|
| 45 |
-
|
| 46 |
-
print("Creating eval file")
|
| 47 |
-
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
| 48 |
-
os.makedirs(OUT_DIR, exist_ok=True)
|
| 49 |
-
out_path = f"{OUT_DIR}/{model_path}_eval_request_{inference_provider}.json"
|
| 50 |
-
|
| 51 |
-
with open(out_path, "w") as f:
|
| 52 |
-
f.write(json.dumps(eval_entry))
|
| 53 |
-
|
| 54 |
-
print("Uploading eval file")
|
| 55 |
-
API.upload_file(
|
| 56 |
-
path_or_fileobj=out_path,
|
| 57 |
-
path_in_repo=out_path.split("eval-queue/")[1],
|
| 58 |
-
repo_id=QUEUE_REPO,
|
| 59 |
-
repo_type="dataset",
|
| 60 |
-
commit_message=f"Add {model_id} to eval queue",
|
| 61 |
-
)
|
| 62 |
-
|
| 63 |
-
# Remove the local file
|
| 64 |
-
os.remove(out_path)
|
| 65 |
-
|
| 66 |
-
REQUESTED_MODELS.add(f"{inference_provider}:{model_id}")
|
| 67 |
-
|
| 68 |
-
return styled_message(
|
| 69 |
-
"Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
| 70 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|