boatbomber committed
Commit f8efcf6 · 1 Parent(s): d0faba1

Submit model id and provider
app.py CHANGED
@@ -1,29 +1,12 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download
 
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
+from src.about import INTRODUCTION_TEXT, TITLE
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
+from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, fields
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
@@ -32,18 +15,29 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
@@ -57,6 +51,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -68,20 +63,16 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=[AutoEvalColumn.model_id.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(AutoEvalColumn.inference_provider.name, type="checkboxgroup", label="Inference Provider"),
            ColumnFilter(
-                AutoEvalColumn.params.name,
+                AutoEvalColumn.average.name,
                type="slider",
                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+                max=100,
+                label="Average score",
            ),
        ],
        bool_checkboxgroup_label="Hide models",
@@ -98,9 +89,6 @@ with demo:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
@@ -146,59 +134,32 @@ with demo:
 
                 with gr.Row():
                     with gr.Column():
-                        model_name_textbox = gr.Textbox(label="Model name")
-                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                        model_type = gr.Dropdown(
-                            choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                            label="Model type",
+                        model_id_textbox = gr.Textbox(label="Model ID")
+                        inference_provider = gr.Dropdown(
+                            choices=[
+                                "Together",
+                                "OpenAI",
+                                "Anthropic",
+                                "Vertex AI",
+                            ],
+                            label="Inference Provider",
                            multiselect=False,
                            value=None,
                            interactive=True,
                        )
 
-                    with gr.Column():
-                        precision = gr.Dropdown(
-                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                            label="Precision",
-                            multiselect=False,
-                            value="float16",
-                            interactive=True,
-                        )
-                        weight_type = gr.Dropdown(
-                            choices=[i.value.name for i in WeightType],
-                            label="Weights type",
-                            multiselect=False,
-                            value="Original",
-                            interactive=True,
-                        )
-                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
                submit_button = gr.Button("Submit Eval")
                submission_result = gr.Markdown()
                submit_button.click(
                    add_new_eval,
                    [
-                        model_name_textbox,
-                        base_model_name_textbox,
-                        revision_name_textbox,
-                        precision,
-                        weight_type,
-                        model_type,
+                        model_id_textbox,
+                        inference_provider,
                    ],
                    submission_result,
                )
 
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
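
For context on the wiring above: a minimal, self-contained sketch of the APScheduler pattern app.py relies on, where a background job restarts the Space every 30 minutes so the queue and results snapshots get re-downloaded. `do_restart` is a stand-in for `API.restart_space(repo_id=REPO_ID)`.

```python
from apscheduler.schedulers.background import BackgroundScheduler


def do_restart():
    # Stand-in for API.restart_space(repo_id=REPO_ID) in app.py.
    print("restarting space")


scheduler = BackgroundScheduler()
scheduler.add_job(do_restart, "interval", seconds=1800)  # fire every 30 minutes
scheduler.start()

# In app.py, demo.queue(default_concurrency_limit=40).launch() blocks here,
# keeping the background scheduler alive for the life of the process.
```
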
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -8,65 +9,15 @@ class Task:
     col_name: str
 
 
-# Select your tasks here
-# ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    robloxQA = Task("robloxqa", "acc", "RobloxQA")
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Roblox LLM Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
-"""
-
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
+Tracking LLM capabilities regarding Roblox game development.
 """
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,85 +21,37 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
-## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
+    model_id = ColumnContent("model_id", "str", True)
+    inference_provider = ColumnContent("inference_provider", "str", True)
     status = ColumnContent("status", "str", True)
+    date = ColumnContent("date", "str", True)
+
 
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
+    symbol: str = ""  # emoji
+
+
+## Leaderboard columns
+auto_eval_column_dict = [
+    ["model_id", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)],
+    ["inference_provider", ColumnContent, ColumnContent("Inference Provider", "str", False)],
+]
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
@@ -107,4 +60,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
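
The `make_dataclass` trick above is worth spelling out: each `auto_eval_column_dict` entry is a `(field_name, annotation, default)` triple, and because dataclass defaults live on the class, `AutoEvalColumn.model_id.name` works without ever instantiating. A minimal sketch of the same mechanism (here `ColumnContent` is frozen so its instances are hashable defaults, a stricter choice than the file above):

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [
        # (field_name, annotation, default) -- the default is what class-level
        # attribute access returns.
        ["model_id", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)],
        ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
    ],
    frozen=True,
)

print(AutoEvalColumn.model_id.name)  # -> "Model"
print(AutoEvalColumn.average.type)   # -> "number"
```
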
 
src/leaderboard/read_evals.py CHANGED
@@ -7,30 +7,18 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np
 
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
+from dataclasses import field  # needed for the default_factory below
+from src.display.utils import AutoEvalColumn, Tasks
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
+    """Represents one full evaluation."""
+
+    eval_name: str = ""
+    date: str = ""
+    model_id: str = ""
+    inference_provider: str = ""
+    results: dict = field(default_factory=dict)  # a bare {} default is rejected by dataclasses
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -38,33 +26,11 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
+        raw_results = data.get("results", {})
         config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
+        inference_provider = config.get("inference_provider", "Unknown")
+        model_id = config.get("model_id", "Unknown")
+        date = config.get("date", "Unknown")
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -72,7 +38,7 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array([v.get(task.metric, None) for k, v in raw_results.items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -80,50 +46,21 @@ class EvalResult:
             results[task.benchmark] = mean_acc
 
         return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
+            eval_name=f"{inference_provider}:{model_id}",
+            model_id=model_id,
+            inference_provider=inference_provider,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
+            date=date,
        )
 
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            "model_id": self.model_id,
+            "inference_provider": self.inference_provider,
+            "date": self.date,
        }
 
         for task in Tasks:
@@ -132,28 +69,6 @@ class EvalResult:
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
@@ -176,7 +91,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -188,7 +102,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict()  # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
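
`init_from_json_file` now trusts the result file alone instead of cross-referencing a request file. A hedged sketch of the JSON shape it expects, with the key names taken from the diff and every concrete value purely illustrative:

```python
import json
import tempfile

# config.* feeds the identity fields; results.<benchmark>.<metric> feeds the
# scores that the Tasks loop averages.
example = {
    "config": {
        "model_id": "some-org/some-model",   # illustrative
        "inference_provider": "Together",    # illustrative
        "date": "2025-01-01",                # illustrative
    },
    "results": {
        "robloxqa": {"acc": 0.42},           # illustrative score
    },
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fp:
    json.dump(example, fp)

# EvalResult.init_from_json_file(fp.name) would then produce an entry whose
# eval_name is "Together:some-org/some-model".
```
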
src/populate.py CHANGED
@@ -14,11 +14,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
@@ -33,20 +34,17 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+            sub_entries = [
+                e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")
+            ]
             for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 all_evals.append(data)
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
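
The reordering in `get_leaderboard_df` matters: incomplete rows are dropped *before* sorting and the `df[cols]` projection, so a model missing any benchmark never appears. A small pandas sketch of the same filter-then-sort shape, with an inline stand-in for `has_no_nan_values`:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["a", "b", "c"],
        "Average ⬆️": [80.0, 90.0, 70.0],
        "RobloxQA": [80.0, np.nan, 70.0],  # "b" never produced a score
    }
)

benchmark_cols = ["RobloxQA"]
df = df[df[benchmark_cols].notna().all(axis=1)]  # inline has_no_nan_values
df = df.sort_values(by=["Average ⬆️"], ascending=False)
df = df.round(decimals=2)
print(df)  # "b" is filtered out before it can be ranked
```
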
src/submission/check_validity.py CHANGED
@@ -10,69 +10,6 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-
-    return True, ""
-
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
-    try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        if test_tokenizer:
-            try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-            except ValueError as e:
-                return (
-                    False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
-                )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
-        return True, None, config
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
-        )
-
-    except Exception as e:
-        return False, "was not found on hub!", None
-
-
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
 
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
@@ -88,12 +25,12 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             continue
         with open(os.path.join(root, file), "r") as f:
             info = json.load(f)
-            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
+            file_names.append(f"{info['inference_provider']}:{info['model_id']}")
 
             # Select organisation
-            if info["model"].count("/") == 0 or "submitted_time" not in info:
+            if info["model_id"].count("/") == 0 or "submitted_time" not in info:
                 continue
-            organisation, _ = info["model"].split("/")
+            organisation, _ = info["model_id"].split("/")
             users_to_submission_dates[organisation].append(info["submitted_time"])
 
     return set(file_names), users_to_submission_dates
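
The duplicate-detection key changes shape here, from `<model>_<revision>_<precision>` to `<provider>:<model_id>`, matching the `eval_name` built in read_evals.py. A quick sketch against an illustrative request payload:

```python
info = {
    "model_id": "some-org/some-model",         # illustrative
    "inference_provider": "Together",          # illustrative
    "submitted_time": "2025-01-01T00:00:00Z",  # illustrative
}

key = f"{info['inference_provider']}:{info['model_id']}"
assert key == "Together:some-org/some-model"

# The rate-limit bookkeeping still groups submissions by hub organisation:
organisation, _ = info["model_id"].split("/")
assert organisation == "some-org"
```
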
src/submission/submit.py CHANGED
@@ -3,101 +3,50 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+from src.submission.check_validity import already_submitted_models
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
+    model_id: str,
+    inference_provider: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
+    if not inference_provider:
+        return styled_error("Please select an inference provider.")
+
     user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
+    model_path = model_id
+    if "/" in model_id:
+        user_name = model_id.split("/")[0]
+        model_path = model_id.split("/")[1]
 
-    precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-
-    # Seems good, creating the eval
     print("Adding new eval")
 
     eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
         "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
+        "status": "PENDING",
+        "model_id": model_id,
+        "inference_provider": inference_provider,
     }
 
     # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+    if f"{inference_provider}:{model_id}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")
 
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{inference_provider}.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
@@ -108,12 +57,14 @@ def add_new_eval(
         path_in_repo=out_path.split("eval-queue/")[1],
         repo_id=QUEUE_REPO,
         repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
+        commit_message=f"Add {model_id} to eval queue",
     )
 
     # Remove the local file
     os.remove(out_path)
 
+    REQUESTED_MODELS.add(f"{inference_provider}:{model_id}")
+
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
    )
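
End to end, a submission now reduces to a small JSON request file. A sketch of what `add_new_eval("some-org/some-model", "Together")` would write, with the timestamp illustrative:

```python
import json

eval_entry = {
    "submitted_time": "2025-01-01T00:00:00Z",  # illustrative timestamp
    "status": "PENDING",
    "model_id": "some-org/some-model",
    "inference_provider": "Together",
}

# Written locally to <EVAL_REQUESTS_PATH>/some-org/some-model_eval_request_Together.json,
# uploaded to QUEUE_REPO, then removed; the in-memory REQUESTED_MODELS set is
# also updated so a re-submit in the same session is caught immediately.
print(json.dumps(eval_entry, indent=2))
```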