boatbomber committed · Commit 40afe38 · 1 Parent(s): e43ddac

Dramatically simplify everything

app.py CHANGED
@@ -2,73 +2,37 @@ import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
-from huggingface_hub import snapshot_download

+from leaderboard.populate import load_results
+from leaderboard.utils import COLUMNS
 from src.about import INTRODUCTION_TEXT, TITLE
 from src.display.css_html_js import custom_css
-from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, fields
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.envs import API, REPO_ID


 def restart_space():
     API.restart_space(repo_id=REPO_ID)


-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH,
-        repo_type="dataset",
-        tqdm_class=None,
-        etag_timeout=30,
-        token=TOKEN,
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO,
-        local_dir=EVAL_RESULTS_PATH,
-        repo_type="dataset",
-        tqdm_class=None,
-        etag_timeout=30,
-        token=TOKEN,
-    )
-except Exception:
-    restart_space()
-
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
+def init_leaderboard():
+    dataframe = load_results()

-def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
     return Leaderboard(
         value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
+        datatype=[c.type for c in COLUMNS],
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            default_selection=[c.name for c in COLUMNS if c.displayed_by_default],
+            cant_deselect=[c.name for c in COLUMNS if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model_id.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        search_columns=["Model"],
+        hide_columns=[c.name for c in COLUMNS if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.inference_provider.name, type="checkboxgroup", label="Inference Provider"),
             ColumnFilter(
-                AutoEvalColumn.average.name,
+                "Average",
                 type="slider",
                 min=0.01,
                 max=100,
@@ -87,79 +51,10 @@ with demo:

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_id_textbox = gr.Textbox(label="Model ID")
-                    inference_provider = gr.Dropdown(
-                        choices=[
-                            "Together",
-                            "OpenAI",
-                            "Anthropic",
-                            "Vertex AI",
-                        ],
-                        label="Inference Provider",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
+            leaderboard = init_leaderboard()

-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_id_textbox,
-                    inference_provider,
-                ],
-                submission_result,
-            )

 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
src/about.py CHANGED
@@ -1,19 +1,3 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    robloxQA = Task("robloxqa", "acc", "RobloxQA")
-
-
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Roblox LLM Leaderboard</h1>"""

src/display/utils.py DELETED
@@ -1,62 +0,0 @@
-from dataclasses import dataclass, make_dataclass
-from enum import Enum
-
-import pandas as pd
-
-from src.about import Tasks
-
-
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-
-
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
-@dataclass
-class ColumnContent:
-    name: str
-    type: str
-    displayed_by_default: bool
-    hidden: bool = False
-    never_hidden: bool = False
-
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model_id = ColumnContent("model_id", "str", True)
-    inference_provider = ColumnContent("inference_provider", "str", True)
-    status = ColumnContent("status", "str", True)
-    date = ColumnContent("date", "str", True)
-
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-## Leaderboard columns
-auto_eval_column_dict = [
-    ["model_id", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)],
-    ["inference_provider", ColumnContent, ColumnContent("Inference Provider", "str", False)],
-]
-# Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-
-
-# Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
src/envs.py CHANGED
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path

 from huggingface_hub import HfApi

@@ -7,19 +8,12 @@ from huggingface_hub import HfApi
 TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

 OWNER = "boatbomber"
-# ----------------------------------
-
 REPO_ID = f"{OWNER}/roblox-llm-leaderboard"
-QUEUE_REPO = f"{OWNER}/roblox-llm-leaderboard-requests"
 RESULTS_REPO = f"{OWNER}/roblox-llm-leaderboard-results"

 # If you setup a cache later, just change HF_HOME
-CACHE_PATH = os.getenv("HF_HOME", ".")
+CACHE_PATH = Path(os.getenv("HF_HOME", "."))

-# Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+RESULTS_REPO_PATH = CACHE_PATH / "roblox-llm-leaderboard-results"

 API = HfApi(token=TOKEN)
src/leaderboard/populate.py ADDED
@@ -0,0 +1,47 @@
+import json
+
+import pandas as pd
+from huggingface_hub import snapshot_download
+
+from leaderboard.utils import COLUMNS
+from src.envs import RESULTS_REPO, RESULTS_REPO_PATH, TOKEN
+
+
+def download_result_data():
+    print(f"Downloading {RESULTS_REPO}")
+    snapshot_download(
+        repo_id=RESULTS_REPO,
+        local_dir=RESULTS_REPO_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
+    )
+
+
+def load_results() -> pd.DataFrame:
+    if not RESULTS_REPO_PATH.exists():
+        download_result_data()
+
+    data = []
+
+    results_dir = RESULTS_REPO_PATH / "results"
+    if not (results_dir.exists() and results_dir.is_dir()):
+        raise ValueError("No results found in the results directory")
+
+    for file in results_dir.rglob("*.json"):
+        with open(file) as f:
+            evaluation = json.load(f)
+
+        results = evaluation["Results"]
+        del evaluation["Results"]
+
+        evaluation["Average"] = sum(results.values()) / len(results)
+        for key, value in results.items():
+            evaluation[key] = value
+
+        data.append(evaluation)
+
+    dataframe = pd.DataFrame(data, columns=[c.name for c in COLUMNS])
+    dataframe = dataframe.sort_values(by=["Average"], ascending=False)
+    return dataframe
src/leaderboard/read_evals.py DELETED
@@ -1,110 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass, field
-
-import dateutil
-import numpy as np
-
-from src.display.utils import AutoEvalColumn, Tasks
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation."""
-
-    eval_name: str = ""
-    date: str = ""
-    model_id: str = ""
-    inference_provider: str = ""
-    results: dict = field(default_factory=dict)
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        raw_results = data.get("results", {})
-        config = data.get("config")
-        inference_provider = config.get("inference_provider", "Unknown")
-        model_id = config.get("model_id", "Unknown")
-        date = config.get("date", "Unknown")
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in raw_results.items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=f"{inference_provider}:{model_id}",
-            model_id=model_id,
-            inference_provider=inference_provider,
-            results=results,
-            date=date,
-        )
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            "model_id": self.model_id,
-            "inference_provider": self.inference_provider,
-            "date": self.date,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-
-    return results
src/leaderboard/utils.py ADDED
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class ColumnContent:
+    name: str
+    type: str
+    displayed_by_default: bool
+    hidden: bool = False
+    never_hidden: bool = False
+
+
+## Leaderboard columns
+COLUMNS = [
+    ColumnContent("Model", type="str", displayed_by_default=True, never_hidden=True),
+    ColumnContent("Average", type="number", displayed_by_default=True),
+    ColumnContent("RobloxQA", type="number", displayed_by_default=True),
+]
src/populate.py DELETED
@@ -1,56 +0,0 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-
-    return df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [
-                e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")
-            ]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/check_validity.py DELETED
@@ -1,36 +0,0 @@
-import json
-import os
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta, timezone
-
-import huggingface_hub
-from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import AutoTokenizer
-
-
-def already_submitted_models(requested_models_dir: str) -> set[str]:
-    """Gather a list of already submitted models to avoid duplicates"""
-    depth = 1
-    file_names = []
-    users_to_submission_dates = defaultdict(list)
-
-    for root, _, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-                with open(os.path.join(root, file), "r") as f:
-                    info = json.load(f)
-                file_names.append(f"{info['inference_provider']}:{info['model_id']}")
-
-                # Select organisation
-                if info["model_id"].count("/") == 0 or "submitted_time" not in info:
-                    continue
-                organisation, _ = info["model_id"].split("/")
-                users_to_submission_dates[organisation].append(info["submitted_time"])
-
-    return set(file_names), users_to_submission_dates
src/submission/submit.py DELETED
@@ -1,70 +0,0 @@
-import json
-import os
-from datetime import datetime, timezone
-
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
-from src.submission.check_validity import already_submitted_models
-
-REQUESTED_MODELS = None
-USERS_TO_SUBMISSION_DATES = None
-
-
-def add_new_eval(
-    model_id: str,
-    inference_provider: str,
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-
-    if not inference_provider:
-        return styled_error("Please select an inference provider.")
-
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    print("Adding new eval")
-
-    eval_entry = {
-        "submitted_time": current_time,
-        "status": "PENDING",
-        "model_id": model_id,
-        "inference_provider": inference_provider,
-    }
-
-    # Check for duplicate submission
-    if f"{inference_provider}:{model_id}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{inference_provider}.json"
-
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model_id} to eval queue",
-    )
-
-    # Remove the local file
-    os.remove(out_path)
-
-    REQUESTED_MODELS.add(f"{inference_provider}:{model_id}")
-
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )