unknown committed on
Commit eccaf98 · 1 Parent(s): 33e428a
Files changed (50)
  1. .gitignore +0 -4
  2. README.md +1 -48
  3. about.py +16 -0
  4. app.py +55 -168
  5. src/display/css_html_js.py → css_html_js.py +0 -0
  6. src/envs.py → envs.py +0 -0
  7. eval-queue/sgi-bench/Claude-Opus-4.1_eval_request_False_float16_Original.json +0 -14
  8. eval-queue/sgi-bench/Claude-Sonnet-4.5_eval_request_False_float16_Original.json +0 -14
  9. eval-queue/sgi-bench/GPT-4.1_eval_request_False_float16_Original.json +0 -14
  10. eval-queue/sgi-bench/GPT-4o_eval_request_False_float16_Original.json +0 -14
  11. eval-queue/sgi-bench/GPT-5.1_eval_request_False_float16_Original.json +0 -14
  12. eval-queue/sgi-bench/GPT-5_eval_request_False_float16_Original.json +0 -14
  13. eval-queue/sgi-bench/Gemini-2.5-Flash_eval_request_False_float16_Original.json +0 -14
  14. eval-queue/sgi-bench/Gemini-2.5-Pro_eval_request_False_float16_Original.json +0 -14
  15. eval-queue/sgi-bench/Gemini-3-Pro_eval_request_False_float16_Original.json +0 -14
  16. eval-queue/sgi-bench/Grok-4_eval_request_False_float16_Original.json +0 -14
  17. eval-queue/sgi-bench/Intern-S1-mini_eval_request_False_float16_Original.json +0 -14
  18. eval-queue/sgi-bench/Intern-S1_eval_request_False_float16_Original.json +0 -14
  19. eval-queue/sgi-bench/Llama-4-Scout_eval_request_False_float16_Original.json +0 -14
  20. eval-queue/sgi-bench/Qwen3-8B_eval_request_False_float16_Original.json +0 -14
  21. eval-queue/sgi-bench/Qwen3-Max_eval_request_False_float16_Original.json +0 -14
  22. eval-queue/sgi-bench/Qwen3-VL-235B-A22B_eval_request_False_float16_Original.json +0 -14
  23. eval-queue/sgi-bench/o3_eval_request_False_float16_Original.json +0 -14
  24. eval-queue/sgi-bench/o4-mini_eval_request_False_float16_Original.json +0 -14
  25. eval-results/sgi-bench/Claude-Opus-4.1/results_20251203T061115Z.json +0 -24
  26. eval-results/sgi-bench/Claude-Sonnet-4.5/results_20251203T061115Z.json +0 -24
  27. eval-results/sgi-bench/GPT-4.1/results_20251203T061115Z.json +0 -24
  28. eval-results/sgi-bench/GPT-4o/results_20251203T061115Z.json +0 -24
  29. eval-results/sgi-bench/GPT-5.1/results_20251203T061115Z.json +0 -24
  30. eval-results/sgi-bench/GPT-5/results_20251203T061115Z.json +0 -24
  31. eval-results/sgi-bench/Gemini-2.5-Flash/results_20251203T061115Z.json +0 -24
  32. eval-results/sgi-bench/Gemini-2.5-Pro/results_20251203T061115Z.json +0 -24
  33. eval-results/sgi-bench/Gemini-3-Pro/results_20251203T061115Z.json +0 -24
  34. eval-results/sgi-bench/Grok-4/results_20251203T061115Z.json +0 -24
  35. eval-results/sgi-bench/Intern-S1-mini/results_20251203T061115Z.json +0 -24
  36. eval-results/sgi-bench/Intern-S1/results_20251203T061115Z.json +0 -24
  37. eval-results/sgi-bench/Llama-4-Scout/results_20251203T061115Z.json +0 -24
  38. eval-results/sgi-bench/Qwen3-8B/results_20251203T061115Z.json +0 -24
  39. eval-results/sgi-bench/Qwen3-Max/results_20251203T061115Z.json +0 -24
  40. eval-results/sgi-bench/Qwen3-VL-235B-A22B/results_20251203T061115Z.json +0 -24
  41. eval-results/sgi-bench/o3/results_20251203T061115Z.json +0 -24
  42. eval-results/sgi-bench/o4-mini/results_20251203T061115Z.json +0 -24
  43. scripts/generate_sgi_results.py +0 -131
  44. src/about.py +0 -75
  45. src/display/formatting.py +0 -27
  46. src/display/utils.py +0 -121
  47. src/leaderboard/read_evals.py +0 -196
  48. src/populate.py +0 -58
  49. src/submission/check_validity.py +0 -99
  50. src/submission/submit.py +0 -119
.gitignore CHANGED
@@ -6,8 +6,4 @@ __pycache__/
 *ipynb
 .vscode/
 
-# eval-queue/
-# eval-results/
-# eval-queue-bk/
-# eval-results-bk/
 logs/
README.md CHANGED
@@ -1,48 +1 @@
----
-title: SGI Bench
-emoji: 🥇
-colorFrom: green
-colorTo: indigo
-sdk: gradio
-app_file: app.py
-pinned: true
-license: mit
-short_description: A Definition of Scientific General Intelligence
-sdk_version: 5.43.1
-tags:
--  - leaderboard
----
-
-# Start the configuration
-
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
-
-# Code logic for more complex edits
-
-You'll find
-- the main table's columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+# SGI-Bench
about.py ADDED
@@ -0,0 +1,16 @@
+TITLE = """<h1 align="center" id="space-title">SGI Leaderboard</h1>"""
+
+
+INTRODUCTION_TEXT = """
+## Scientific General Intelligence (SGI) is defined as an AI system that can autonomously navigate the full, iterative cycle of scientific inquiry—Deliberation, Conception, Action, and Perception—with the versatility and proficiency of a human scientist. SGI-Bench operationalizes this definition via four scientist-aligned task families: deep research, idea generation, AI-assisted experiments (dry/wet), and multimodal experimental reasoning. The benchmark spans 10 disciplines and ~1,000 expert-curated samples inspired by Science's 125 Big Questions.
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
+@article{sgi2025,
+  title={SGI-Bench: Scientific Intelligence Benchmark via Scientist-Aligned Workflows},
+  author={Research Team},
+  journal={arXiv preprint arXiv:2401.xxxxx},
+  year={2025}
+}
+"""
app.py CHANGED
@@ -2,97 +2,76 @@ import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-import os
-
-from src.about import (
+from about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
     TITLE,
 )
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-
+from css_html_js import custom_css
+from envs import API, REPO_ID
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    if os.path.isdir(EVAL_REQUESTS_PATH):
-        print("Using local eval-queue cache")
-    else:
-        snapshot_download(
-            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-        )
-except Exception:
-    print("Skipping remote snapshot for eval-queue; using local cache.")
-
-try:
-    print(EVAL_RESULTS_PATH)
-    if os.path.isdir(EVAL_RESULTS_PATH):
-        print("Using local eval-results cache")
-    else:
-        snapshot_download(
-            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-        )
-except Exception:
-    print("Skipping remote snapshot for eval-results; using local cache.")
-
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
+LEADERBOARD_DATA = [
+    {"name": "Intern-S1", "type": "Open", "scores": [15.74, 38.09, 28.79, 29.02, 28.87]},
+    {"name": "Intern-S1-mini", "type": "Open", "scores": [11.06, 36.04, 16.97, 12.42, 16.84]},
+    {"name": "Qwen3-VL-235B-A22B", "type": "Open", "scores": [11.97, 39.28, 28.41, 30.30, 31.62]},
+    {"name": "Qwen3-Max", "type": "Open", "scores": [15.38, 39.83, 33.21, 33.62, 37.80]},
+    {"name": "Qwen3-8B", "type": "Open", "scores": [8.18, 35.78, 18.45, 9.96, 23.37]},
+    {"name": "Llama-4-Scout", "type": "Open", "scores": [7.86, 29.72, 20.37, 21.66, 25.77]},
+    {"name": "GPT-4o", "type": "Closed", "scores": [7.86, 35.95, 26.94, 31.31, 32.30]},
+    {"name": "GPT-4.1", "type": "Closed", "scores": [11.32, 36.49, 34.32, 36.63, 38.49]},
+    {"name": "GPT-5", "type": "Closed", "scores": [14.47, 55.40, 29.89, 16.31, 38.14]},
+    {"name": "GPT-5.1", "type": "Closed", "scores": [11.64, 47.12, 31.00, 22.77, 34.02]},
+    {"name": "o3", "type": "Closed", "scores": [12.89, 46.07, 31.73, 30.04, 32.65]},
+    {"name": "o4-mini", "type": "Closed", "scores": [11.95, 40.78, 35.79, 28.86, 33.33]},
+    {"name": "Gemini-2.5-Flash", "type": "Closed", "scores": [10.69, 39.13, 21.03, 18.55, 34.36]},
+    {"name": "Gemini-2.5-Pro", "type": "Closed", "scores": [15.09, 39.95, 22.51, 22.05, 41.24]},
+    {"name": "Gemini-3-Pro", "type": "Closed", "scores": [18.48, 39.68, 36.64, 32.45, 41.92]},
+    {"name": "Claude-Opus-4.1", "type": "Closed", "scores": [12.93, 40.29, 34.69, 25.38, 38.83]},
+    {"name": "Claude-Sonnet-4.5", "type": "Closed", "scores": [13.84, 43.20, 35.79, 30.15, 37.80]},
+    {"name": "Grok-4", "type": "Closed", "scores": [13.31, 37.12, 33.71, 29.01, 30.24]},
+]
+
+def build_leaderboard_df():
+    task_cols = ["Deep Research", "Idea Generation", "Dry Experiment", "Wet Experiment", "Experimental Reasoning"]
+    rows = []
+    for item in LEADERBOARD_DATA:
+        name = item["name"]
+        type = item["type"]
+        scores = item["scores"]
+        row = {
+            "Type": type,
+            "Model": name,
+            "SGI-Score": round(sum(scores) / len(scores), 2),
+        }
+        for i, col in enumerate(task_cols):
+            row[col] = scores[i]
+        rows.append(row)
+    cols = ["Type", "Model", "SGI-Score"] + task_cols
+    df = pd.DataFrame(rows, columns=cols).sort_values(by=["SGI-Score"], ascending=False).round(decimals=2)
+    return df
+
+LEADERBOARD_DF = build_leaderboard_df()
 
 def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
+    datatypes = ["str", "str", "number", "number", "number", "number", "number", "number"]
+    default_selection = ["Type","Model","SGI-Score","Deep Research","Idea Generation","Dry Experiment","Wet Experiment","Experimental Reasoning"]
+    cant_deselect = ["Type","Model"]
     return Leaderboard(
         value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
+        datatype=datatypes,
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            default_selection=default_selection,
+            cant_deselect=cant_deselect,
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
+        search_columns=["Model"],
+        hide_columns=[],
+        filter_columns=[ColumnFilter("Type", type="checkboxgroup", label="Model types")],
         interactive=False,
     )
 
@@ -102,102 +81,10 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+    leaderboard = init_leaderboard(LEADERBOARD_DF)
 
     with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
+        with gr.Accordion("📖 Citation", open=False):
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
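For reference, the "SGI-Score" column introduced by the new `build_leaderboard_df` is simply the unweighted mean of the five task scores, rounded to two decimals. A minimal standalone sketch (reusing the Gemini-3-Pro row from `LEADERBOARD_DATA` above) reproduces the value the function computes:

```python
# Minimal sketch of the SGI-Score aggregation in build_leaderboard_df:
# the score is the plain mean of the five per-task scores, rounded to 2 dp.
scores = [18.48, 39.68, 36.64, 32.45, 41.92]  # Gemini-3-Pro row above
sgi_score = round(sum(scores) / len(scores), 2)
print(sgi_score)  # 33.83
```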
src/display/css_html_js.py → css_html_js.py RENAMED
File without changes
src/envs.py → envs.py RENAMED
File without changes
eval-queue/sgi-bench/Claude-Opus-4.1_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Claude-Opus-4.1",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Claude-Sonnet-4.5_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Claude-Sonnet-4.5",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/GPT-4.1_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/GPT-4.1",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/GPT-4o_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/GPT-4o",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/GPT-5.1_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/GPT-5.1",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/GPT-5_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/GPT-5",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Gemini-2.5-Flash_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Gemini-2.5-Flash",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Gemini-2.5-Pro_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Gemini-2.5-Pro",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Gemini-3-Pro_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Gemini-3-Pro",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Grok-4_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Grok-4",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Intern-S1-mini_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Intern-S1-mini",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔓 : Open",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Intern-S1_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Intern-S1",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔓 : Open",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Llama-4-Scout_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Llama-4-Scout",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔓 : Open",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Qwen3-8B_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Qwen3-8B",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔓 : Open",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Qwen3-Max_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Qwen3-Max",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔓 : Open",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/Qwen3-VL-235B-A22B_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/Qwen3-VL-235B-A22B",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔓 : Open",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/o3_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/o3",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-queue/sgi-bench/o4-mini_eval_request_False_float16_Original.json DELETED
@@ -1,14 +0,0 @@
-{
-  "model": "sgi-bench/o4-mini",
-  "base_model": "",
-  "revision": "main",
-  "precision": "float16",
-  "weight_type": "Original",
-  "status": "FINISHED",
-  "submitted_time": "2025-12-03T06:11:15Z",
-  "model_type": "🔒 : Closed",
-  "likes": 0,
-  "params": 0,
-  "license": "?",
-  "private": false
-}
eval-results/sgi-bench/Claude-Opus-4.1/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Claude-Opus-4.1",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1293
-    },
-    "idea_generation": {
-      "acc": 0.4029
-    },
-    "dry_experiment": {
-      "acc": 0.3469
-    },
-    "wet_experiment": {
-      "acc": 0.2538
-    },
-    "experimental_reasoning": {
-      "acc": 0.3883
-    }
-  }
-}
eval-results/sgi-bench/Claude-Sonnet-4.5/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Claude-Sonnet-4.5",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1384
-    },
-    "idea_generation": {
-      "acc": 0.432
-    },
-    "dry_experiment": {
-      "acc": 0.3579
-    },
-    "wet_experiment": {
-      "acc": 0.3015
-    },
-    "experimental_reasoning": {
-      "acc": 0.378
-    }
-  }
-}
eval-results/sgi-bench/GPT-4.1/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/GPT-4.1",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1132
-    },
-    "idea_generation": {
-      "acc": 0.3649
-    },
-    "dry_experiment": {
-      "acc": 0.3432
-    },
-    "wet_experiment": {
-      "acc": 0.3663
-    },
-    "experimental_reasoning": {
-      "acc": 0.3849
-    }
-  }
-}
eval-results/sgi-bench/GPT-4o/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/GPT-4o",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.0786
-    },
-    "idea_generation": {
-      "acc": 0.3595
-    },
-    "dry_experiment": {
-      "acc": 0.2694
-    },
-    "wet_experiment": {
-      "acc": 0.3131
-    },
-    "experimental_reasoning": {
-      "acc": 0.323
-    }
-  }
-}
eval-results/sgi-bench/GPT-5.1/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/GPT-5.1",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1164
-    },
-    "idea_generation": {
-      "acc": 0.4712
-    },
-    "dry_experiment": {
-      "acc": 0.31
-    },
-    "wet_experiment": {
-      "acc": 0.2277
-    },
-    "experimental_reasoning": {
-      "acc": 0.3402
-    }
-  }
-}
eval-results/sgi-bench/GPT-5/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/GPT-5",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1447
-    },
-    "idea_generation": {
-      "acc": 0.554
-    },
-    "dry_experiment": {
-      "acc": 0.2989
-    },
-    "wet_experiment": {
-      "acc": 0.1631
-    },
-    "experimental_reasoning": {
-      "acc": 0.3814
-    }
-  }
-}
eval-results/sgi-bench/Gemini-2.5-Flash/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Gemini-2.5-Flash",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1069
-    },
-    "idea_generation": {
-      "acc": 0.3913
-    },
-    "dry_experiment": {
-      "acc": 0.2103
-    },
-    "wet_experiment": {
-      "acc": 0.1855
-    },
-    "experimental_reasoning": {
-      "acc": 0.3436
-    }
-  }
-}
eval-results/sgi-bench/Gemini-2.5-Pro/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Gemini-2.5-Pro",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1509
-    },
-    "idea_generation": {
-      "acc": 0.3995
-    },
-    "dry_experiment": {
-      "acc": 0.2251
-    },
-    "wet_experiment": {
-      "acc": 0.2205
-    },
-    "experimental_reasoning": {
-      "acc": 0.4124
-    }
-  }
-}
eval-results/sgi-bench/Gemini-3-Pro/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Gemini-3-Pro",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1848
-    },
-    "idea_generation": {
-      "acc": 0.3968
-    },
-    "dry_experiment": {
-      "acc": 0.3664
-    },
-    "wet_experiment": {
-      "acc": 0.3245
-    },
-    "experimental_reasoning": {
-      "acc": 0.4192
-    }
-  }
-}
eval-results/sgi-bench/Grok-4/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Grok-4",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1331
-    },
-    "idea_generation": {
-      "acc": 0.3712
-    },
-    "dry_experiment": {
-      "acc": 0.3371
-    },
-    "wet_experiment": {
-      "acc": 0.2901
-    },
-    "experimental_reasoning": {
-      "acc": 0.3024
-    }
-  }
-}
eval-results/sgi-bench/Intern-S1-mini/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Intern-S1-mini",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1106
-    },
-    "idea_generation": {
-      "acc": 0.3604
-    },
-    "dry_experiment": {
-      "acc": 0.1697
-    },
-    "wet_experiment": {
-      "acc": 0.1242
-    },
-    "experimental_reasoning": {
-      "acc": 0.1684
-    }
-  }
-}
eval-results/sgi-bench/Intern-S1/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Intern-S1",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1574
-    },
-    "idea_generation": {
-      "acc": 0.3809
-    },
-    "dry_experiment": {
-      "acc": 0.2879
-    },
-    "wet_experiment": {
-      "acc": 0.2902
-    },
-    "experimental_reasoning": {
-      "acc": 0.2887
-    }
-  }
-}
eval-results/sgi-bench/Llama-4-Scout/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Llama-4-Scout",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.0786
-    },
-    "idea_generation": {
-      "acc": 0.2972
-    },
-    "dry_experiment": {
-      "acc": 0.2037
-    },
-    "wet_experiment": {
-      "acc": 0.2166
-    },
-    "experimental_reasoning": {
-      "acc": 0.2577
-    }
-  }
-}
eval-results/sgi-bench/Qwen3-8B/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Qwen3-8B",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.0818
-    },
-    "idea_generation": {
-      "acc": 0.3578
-    },
-    "dry_experiment": {
-      "acc": 0.1845
-    },
-    "wet_experiment": {
-      "acc": 0.0996
-    },
-    "experimental_reasoning": {
-      "acc": 0.2337
-    }
-  }
-}
eval-results/sgi-bench/Qwen3-Max/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Qwen3-Max",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1538
-    },
-    "idea_generation": {
-      "acc": 0.3983
-    },
-    "dry_experiment": {
-      "acc": 0.3321
-    },
-    "wet_experiment": {
-      "acc": 0.3362
-    },
-    "experimental_reasoning": {
-      "acc": 0.378
-    }
-  }
-}
eval-results/sgi-bench/Qwen3-VL-235B-A22B/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/Qwen3-VL-235B-A22B",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1197
-    },
-    "idea_generation": {
-      "acc": 0.3928
-    },
-    "dry_experiment": {
-      "acc": 0.2841
-    },
-    "wet_experiment": {
-      "acc": 0.303
-    },
-    "experimental_reasoning": {
-      "acc": 0.3162
-    }
-  }
-}
eval-results/sgi-bench/o3/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/o3",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1289
-    },
-    "idea_generation": {
-      "acc": 0.4607
-    },
-    "dry_experiment": {
-      "acc": 0.3173
-    },
-    "wet_experiment": {
-      "acc": 0.3004
-    },
-    "experimental_reasoning": {
-      "acc": 0.3265
-    }
-  }
-}
eval-results/sgi-bench/o4-mini/results_20251203T061115Z.json DELETED
@@ -1,24 +0,0 @@
-{
-  "config": {
-    "model_dtype": "float16",
-    "model_name": "sgi-bench/o4-mini",
-    "model_sha": ""
-  },
-  "results": {
-    "deep_research": {
-      "acc": 0.1195
-    },
-    "idea_generation": {
-      "acc": 0.4078
-    },
-    "dry_experiment": {
-      "acc": 0.3579
-    },
-    "wet_experiment": {
-      "acc": 0.2886
-    },
-    "experimental_reasoning": {
-      "acc": 0.3333
-    }
-  }
-}
scripts/generate_sgi_results.py DELETED
@@ -1,131 +0,0 @@
-import os
-import json
-from datetime import datetime, timezone
-
-# Use local relative paths to avoid optional dependencies during generation
-EVAL_RESULTS_PATH = "eval-results"
-EVAL_REQUESTS_PATH = "eval-queue"
-
-# Leaderboard data provided by user
-MODELS = [
-    {"name": "Intern-S1", "type": "Open", "scores": [15.74, 38.09, 28.79, 29.02, 28.87]},
-    {"name": "Intern-S1-mini", "type": "Open", "scores": [11.06, 36.04, 16.97, 12.42, 16.84]},
-    {"name": "Qwen3-VL-235B-A22B", "type": "Open", "scores": [11.97, 39.28, 28.41, 30.30, 31.62]},
-    {"name": "Qwen3-Max", "type": "Open", "scores": [15.38, 39.83, 33.21, 33.62, 37.80]},
-    {"name": "Qwen3-8B", "type": "Open", "scores": [8.18, 35.78, 18.45, 9.96, 23.37]},
-    {"name": "Llama-4-Scout", "type": "Open", "scores": [7.86, 29.72, 20.37, 21.66, 25.77]},
-    {"name": "GPT-4o", "type": "Closed", "scores": [7.86, 35.95, 26.94, 31.31, 32.30]},
-    {"name": "GPT-4.1", "type": "Closed", "scores": [11.32, 36.49, 34.32, 36.63, 38.49]},
-    {"name": "GPT-5", "type": "Closed", "scores": [14.47, 55.40, 29.89, 16.31, 38.14]},
-    {"name": "GPT-5.1", "type": "Closed", "scores": [11.64, 47.12, 31.00, 22.77, 34.02]},
-    {"name": "o3", "type": "Closed", "scores": [12.89, 46.07, 31.73, 30.04, 32.65]},
-    {"name": "o4-mini", "type": "Closed", "scores": [11.95, 40.78, 35.79, 28.86, 33.33]},
-    {"name": "Gemini-2.5-Flash", "type": "Closed", "scores": [10.69, 39.13, 21.03, 18.55, 34.36]},
-    {"name": "Gemini-2.5-Pro", "type": "Closed", "scores": [15.09, 39.95, 22.51, 22.05, 41.24]},
-    {"name": "Gemini-3-Pro", "type": "Closed", "scores": [18.48, 39.68, 36.64, 32.45, 41.92]},
-    {"name": "Claude-Opus-4.1", "type": "Closed", "scores": [12.93, 40.29, 34.69, 25.38, 38.83]},
-    {"name": "Claude-Sonnet-4.5", "type": "Closed", "scores": [13.84, 43.20, 35.79, 30.15, 37.80]},
-    {"name": "Grok-4", "type": "Closed", "scores": [13.31, 37.12, 33.71, 29.01, 30.24]},
-]
-
-# Task keys must match Tasks Enum in src/about.py
-TASK_KEYS = [
-    "deep_research",
-    "idea_generation",
-    "dry_experiment",
-    "wet_experiment",
-    "experimental_reasoning",
-]
-
-# Convert percentages to decimals expected by read_evals (it multiplies by 100)
-def pct_to_decimal(p):
-    return round(p / 100.0, 6)
-
-def ensure_dir(p):
-    os.makedirs(p, exist_ok=True)
-
-def write_result_json(org, model, scores):
-    model_full = f"{org}/{model}"
-    # Place each model's JSON in its own subfolder under eval-results
-    model_dir = os.path.join(EVAL_RESULTS_PATH, org, model)
-    ensure_dir(model_dir)
-
-    # Minimal config expected by read_evals.py
-    cfg = {
-        "model_dtype": "float16",
-        "model_name": model_full,
-        "model_sha": "",
-    }
-
-    # Build results mapping
-    results = {}
-    for key, score in zip(TASK_KEYS, scores):
-        results[key] = {"acc": pct_to_decimal(score)}
-
-    payload = {
-        "config": cfg,
-        "results": results,
-    }
-
-    # Filename pattern is flexible; read_evals walks directories and reads all JSONs
-    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
-    out_path = os.path.join(model_dir, f"results_{ts}.json")
-    with open(out_path, "w", encoding="utf-8") as f:
-        json.dump(payload, f, ensure_ascii=False, indent=2)
-    return out_path
-
-def write_request_json(org, model, model_type):
-    # Ensure request file lives under eval-queue/{org}/
-    org_dir = os.path.join(EVAL_REQUESTS_PATH, org)
-    ensure_dir(org_dir)
-
-    model_full = f"{org}/{model}"
-    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    # Model type label must be parsable by ModelType.from_str
-    type_label = "🔓 : Open" if model_type == "Open" else "🔒 : Closed"
-
-    entry = {
-        "model": model_full,
-        "base_model": "",
-        "revision": "main",
-        "precision": "float16",
-        "weight_type": "Original",
-        "status": "FINISHED",
-        "submitted_time": now,
-        "model_type": type_label,
-        "likes": 0,
-        "params": 0,
-        "license": "?",
-        "private": False,
-    }
-
-    # File naming convention similar to submit.py
-    out_path = os.path.join(org_dir, f"{model}_eval_request_False_float16_Original.json")
-    with open(out_path, "w", encoding="utf-8") as f:
-        json.dump(entry, f, ensure_ascii=False, indent=2)
-    return out_path
-
-def main():
-    org = "sgi-bench"
-    ensure_dir(EVAL_RESULTS_PATH)
-    ensure_dir(EVAL_REQUESTS_PATH)
-
-    result_paths = []
-    request_paths = []
-
-    for m in MODELS:
-        res_path = write_result_json(org, m["name"], m["scores"])
-        req_path = write_request_json(org, m["name"], m["type"])
-        result_paths.append(res_path)
-        request_paths.append(req_path)
-
-    print("Generated result JSONs:")
-    for p in result_paths:
-        print(" -", p)
-    print("Generated request JSONs:")
-    for p in request_paths:
-        print(" -", p)
-
-if __name__ == "__main__":
-    main()
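Note the round-trip convention the deleted comments describe: the generator stores each percentage as a decimal precisely because `read_evals.py` multiplies scores by 100 when rebuilding the leaderboard columns. A minimal sketch of that convention, using Intern-S1's Idea Generation score from `MODELS` above:

```python
# Sketch of the percentage <-> decimal round-trip noted in the deleted
# comments: generate_sgi_results.py writes decimals to the results JSONs,
# and read_evals.py multiplies by 100 when it rebuilds the table.
def pct_to_decimal(p):
    return round(p / 100.0, 6)

stored = pct_to_decimal(38.09)  # 0.3809, as written to the results JSON
displayed = stored * 100.0      # 38.09, as recomputed by read_evals.py
assert abs(displayed - 38.09) < 1e-9
```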
src/about.py DELETED
@@ -1,75 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # SGI-Bench tasks mapped to leaderboard columns
-    deep_research = Task("deep_research", "acc", "Deep Research")
-    idea_generation = Task("idea_generation", "acc", "Idea Generation")
-    dry_experiment = Task("dry_experiment", "acc", "Dry Experiment")
-    wet_experiment = Task("wet_experiment", "acc", "Wet Experiment")
-    experimental_reasoning = Task("experimental_reasoning", "acc", "Experimental Reasoning")
-
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
-
-# Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
-
-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
-Intro text
-"""
-
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""
src/display/formatting.py DELETED
@@ -1,27 +0,0 @@
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
-
-
-def styled_error(error):
-    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
-
-
-def styled_warning(warn):
-    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
-
-
-def styled_message(message):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-
-
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
-
-
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)
src/display/utils.py DELETED
@@ -1,121 +0,0 @@
-from dataclasses import dataclass, make_dataclass
-from typing import ClassVar
-from enum import Enum
-
-import pandas as pd
-
-from src.about import Tasks
-
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-
-
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
-@dataclass
-class ColumnContent:
-    name: str
-    type: str
-    displayed_by_default: bool
-    hidden: bool = False
-    never_hidden: bool = False
-
-## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ClassVar[ColumnContent], ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ClassVar[ColumnContent], ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ClassVar[ColumnContent], ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ClassVar[ColumnContent], ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ClassVar[ColumnContent], ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ClassVar[ColumnContent], ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ClassVar[ColumnContent], ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ClassVar[ColumnContent], ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ClassVar[ColumnContent], ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ClassVar[ColumnContent], ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ClassVar[ColumnContent], ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ClassVar[ColumnContent], ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ClassVar[ColumnContent], ColumnContent("Model sha", "str", False, False)])
-
-# Build AutoEvalColumn as a simple class to hold ColumnContent descriptors
-class AutoEvalColumn:
-    pass
-
-# Populate attributes from auto_eval_column_dict
-for _name, _type, _default in auto_eval_column_dict:
-    setattr(AutoEvalColumn, _name, _default)
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    Open = ModelDetails(name="Open", symbol="🔓")
-    Closed = ModelDetails(name="Closed", symbol="🔒")
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "Open" in type or "🔓" in type:
-            return ModelType.Open
-        if "Closed" in type or "🔒" in type:
-            return ModelType.Closed
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-
-# Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
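The "model_type" labels in the deleted request files ("🔒 : Closed", "🔓 : Open") are exactly the strings `ModelType.to_str(" : ")` produces, which is what lets `ModelType.from_str` recover the enum member by substring match. A small sketch of that round-trip (assuming the `src/` package deleted in this commit is still importable):

```python
# Round-trip between the request-file labels and the ModelType enum,
# assuming the src/display/utils.py module removed above is on the path.
from src.display.utils import ModelType

assert ModelType.Closed.to_str(" : ") == "🔒 : Closed"        # label as written
assert ModelType.from_str("🔒 : Closed") is ModelType.Closed  # label as parsed
assert ModelType.from_str("🔓 : Open") is ModelType.Open
```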
src/leaderboard/read_evals.py DELETED
@@ -1,196 +0,0 @@
1
- import glob
2
- import json
3
- import math
4
- import os
5
- from dataclasses import dataclass
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
-
14
-
15
- @dataclass
16
- class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
- model: str
23
- revision: str # commit hash, "" if main
24
- results: dict
25
- precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
- license: str = "?"
30
- likes: int = 0
31
- num_params: int = 0
32
- date: str = "" # submission date of request file
33
- still_on_hub: bool = False
34
-
35
- @classmethod
36
- def init_from_json_file(self, json_filepath):
37
- """Inits the result from the specific model result file"""
38
- with open(json_filepath) as fp:
39
- data = json.load(fp)
40
-
41
- config = data.get("config")
42
-
43
- # Precision
44
- precision = Precision.from_str(config.get("model_dtype"))
45
-
46
- # Get model and org
47
- org_and_model = config.get("model_name", config.get("model_args", None))
48
- org_and_model = org_and_model.split("/", 1)
49
-
50
- if len(org_and_model) == 1:
51
- org = None
52
- model = org_and_model[0]
53
- result_key = f"{model}_{precision.value.name}"
54
- else:
55
- org = org_and_model[0]
56
- model = org_and_model[1]
57
- result_key = f"{org}_{model}_{precision.value.name}"
58
- full_model = "/".join(org_and_model)
59
-
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
63
- architecture = "?"
64
- if model_config is not None:
65
-         architectures = getattr(model_config, "architectures", None)
-         if architectures:
-             architecture = ";".join(architectures)
-
-         # Extract results available in this file (some results are split in several files)
-         results = {}
-         for task in Tasks:
-             task = task.value
-
-             # We average all scores of a given metric (not all metrics are present in all files)
-             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-             if accs.size == 0 or any([acc is None for acc in accs]):
-                 continue
-
-             mean_acc = np.mean(accs) * 100.0
-             results[task.benchmark] = mean_acc
-
-         return self(
-             eval_name=result_key,
-             full_model=full_model,
-             org=org,
-             model=model,
-             results=results,
-             precision=precision,
-             revision=config.get("model_sha", ""),
-             still_on_hub=still_on_hub,
-             architecture=architecture,
-         )
-
-     def update_with_request_file(self, requests_path):
-         """Finds the relevant request file for the current model and updates info with it"""
-         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-         try:
-             with open(request_file, "r") as f:
-                 request = json.load(f)
-             self.model_type = ModelType.from_str(request.get("model_type", ""))
-             self.weight_type = WeightType[request.get("weight_type", "Original")]
-             self.license = request.get("license", "?")
-             self.likes = request.get("likes", 0)
-             self.num_params = request.get("params", 0)
-             self.date = request.get("submitted_time", "")
-         except Exception:
-             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
-     def to_dict(self):
-         """Converts the Eval Result to a dict compatible with our dataframe display"""
-         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-         data_dict = {
-             "eval_name": self.eval_name,  # not a column, just a save name
-             AutoEvalColumn.precision.name: self.precision.value.name,
-             AutoEvalColumn.model_type.name: self.model_type.value.name,
-             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-             AutoEvalColumn.architecture.name: self.architecture,
-             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-             AutoEvalColumn.revision.name: self.revision,
-             AutoEvalColumn.average.name: average,
-             AutoEvalColumn.license.name: self.license,
-             AutoEvalColumn.likes.name: self.likes,
-             AutoEvalColumn.params.name: self.num_params,
-             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-         }
-
-         for task in Tasks:
-             data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-         return data_dict
-
-
- def get_request_file_for_model(requests_path, model_name, precision):
-     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED."""
-     request_files = os.path.join(
-         requests_path,
-         f"{model_name}_eval_request_*.json",
-     )
-     request_files = glob.glob(request_files)
-
-     # Select the correct request file (by precision)
-     request_file = ""
-     request_files = sorted(request_files, reverse=True)
-     for tmp_request_file in request_files:
-         with open(tmp_request_file, "r") as f:
-             req_content = json.load(f)
-             if (
-                 req_content["status"] in ["FINISHED"]
-                 and req_content["precision"] == precision.split(".")[-1]
-             ):
-                 request_file = tmp_request_file
-     return request_file
-
-
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-     """From the path of the results folder root, extract all needed info for results"""
-     model_result_filepaths = []
-
-     for root, _, files in os.walk(results_path):
-         # We should only have json files in model results
-         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-             continue
-
-         # Sort the files by date; the key only slices the timestamp string,
-         # so the dateutil except below is effectively unreachable
-         try:
-             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-         except dateutil.parser._parser.ParserError:
-             files = [files[-1]]
-
-         for file in files:
-             model_result_filepaths.append(os.path.join(root, file))
-
-     eval_results = {}
-     for model_result_filepath in model_result_filepaths:
-         # Creation of result
-         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-         eval_result.update_with_request_file(requests_path)
-
-         # Store results of the same eval together
-         eval_name = eval_result.eval_name
-         if eval_name in eval_results:
-             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-         else:
-             eval_results[eval_name] = eval_result
-
-     results = []
-     for v in eval_results.values():
-         try:
-             v.to_dict()  # we test if the dict version is complete
-             results.append(v)
-         except KeyError:  # not all eval values present
-             continue
-
-     return results
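For reference, a minimal sketch of how these deleted helpers chained together, assuming the folder layout from this commit (local eval-results/ and eval-queue/ directories) and that the src package is importable; the paths are illustrative:

    # Sketch: load every result file, enrich it with its request file, and
    # keep only entries whose benchmark set is complete.
    from src.leaderboard.read_evals import get_raw_eval_results

    raw = get_raw_eval_results("eval-results", "eval-queue")
    for res in raw:
        # get_raw_eval_results already filtered out incomplete entries,
        # so to_dict() cannot raise KeyError here
        row = res.to_dict()
        print(row["eval_name"])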
src/populate.py DELETED
@@ -1,58 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
-
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
- df = df[cols].round(decimals=2)
19
-
20
- # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
-
24
-
25
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
- """Creates the different dataframes for the evaluation queues requestes"""
27
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
- all_evals = []
29
-
30
- for entry in entries:
31
- if ".json" in entry:
32
- file_path = os.path.join(save_path, entry)
33
- with open(file_path) as fp:
34
- data = json.load(fp)
35
-
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
-
39
- all_evals.append(data)
40
- elif ".md" not in entry:
41
- # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
- for sub_entry in sub_entries:
44
- file_path = os.path.join(save_path, entry, sub_entry)
45
- with open(file_path) as fp:
46
- data = json.load(fp)
47
-
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
- all_evals.append(data)
51
-
52
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
55
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
- return df_finished[cols], df_running[cols], df_pending[cols]
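A usage sketch for the two functions above, assuming COLS, BENCHMARK_COLS, and EVAL_COLS are exported by src/display/utils.py as in this leaderboard template family (the names come from the surrounding code and are not re-verified here):

    # Sketch: build the main leaderboard table plus the three queue dataframes.
    from src.display.utils import COLS, BENCHMARK_COLS, EVAL_COLS  # assumed exports
    from src.populate import get_evaluation_queue_df, get_leaderboard_df

    leaderboard_df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
    finished, running, pending = get_evaluation_queue_df("eval-queue", EVAL_COLS)
    print(leaderboard_df.head(), len(pending))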
src/submission/check_validity.py DELETED
@@ -1,99 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
- def check_model_card(repo_id: str) -> tuple[bool, str]:
14
- """Checks if the model card and license exist and have been filled"""
15
- try:
16
- card = ModelCard.load(repo_id)
17
- except huggingface_hub.utils.EntryNotFoundError:
18
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
-
20
- # Enforce license metadata
21
- if card.data.license is None:
22
- if not ("license_name" in card.data and "license_link" in card.data):
23
- return False, (
24
- "License not found. Please add a license to your model card using the `license` metadata or a"
25
- " `license_name`/`license_link` pair."
26
- )
27
-
28
- # Enforce card content
29
- if len(card.text) < 200:
30
- return False, "Please add a description to your model card, it is too short."
31
-
32
- return True, ""
33
-
34
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
- try:
37
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
- if test_tokenizer:
39
- try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- except ValueError as e:
42
- return (
43
- False,
44
- f"uses a tokenizer which is not in a transformers release: {e}",
45
- None
46
- )
47
- except Exception as e:
48
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
- return True, None, config
50
-
51
- except ValueError:
52
- return (
53
- False,
54
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
- None
56
- )
57
-
58
- except Exception as e:
59
- return False, "was not found on hub!", None
60
-
61
-
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
- try:
65
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
- model_size = size_factor * model_size
71
- return model_size
72
-
73
- def get_model_arch(model_info: ModelInfo):
74
- """Gets the model architecture from the configuration"""
75
- return model_info.config.get("architectures", "Unknown")
76
-
77
- def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
- depth = 1
80
- file_names = []
81
- users_to_submission_dates = defaultdict(list)
82
-
83
- for root, _, files in os.walk(requested_models_dir):
84
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
- if current_depth == depth:
86
- for file in files:
87
- if not file.endswith(".json"):
88
- continue
89
- with open(os.path.join(root, file), "r") as f:
90
- info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
-
93
- # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
- continue
96
- organisation, _ = info["model"].split("/")
97
- users_to_submission_dates[organisation].append(info["submitted_time"])
98
-
99
- return set(file_names), users_to_submission_dates
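These checks can also be exercised on their own; a small sketch, assuming a public model id (illustrative) and no auth token:

    # Sketch: run the submission-time validity checks against one repo.
    from src.submission.check_validity import check_model_card, is_model_on_hub

    card_ok, card_msg = check_model_card("openai-community/gpt2")
    print("model card ok:", card_ok, card_msg)

    on_hub, err, cfg = is_model_on_hub("openai-community/gpt2", revision="main", test_tokenizer=True)
    print("loadable from the hub:", on_hub, err or "")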
src/submission/submit.py DELETED
@@ -1,119 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
-
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
-
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
-
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
- "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
- }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )
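For context, add_new_eval is the handler behind the leaderboard's submit form; a minimal Gradio wiring sketch follows, with component labels and choice lists that are illustrative rather than taken from this repo's app.py:

    # Sketch: hook add_new_eval up to a submit button.
    import gradio as gr
    from src.submission.submit import add_new_eval

    with gr.Blocks() as demo:
        model = gr.Textbox(label="Model name (org/model)")
        base_model = gr.Textbox(label="Base model (for Delta/Adapter weights)")
        revision = gr.Textbox(label="Revision", value="main")
        precision = gr.Dropdown(["float16", "bfloat16"], label="Precision", value="float16")
        weight_type = gr.Dropdown(["Original", "Delta", "Adapter"], label="Weight type", value="Original")
        model_type = gr.Dropdown(["pretrained", "fine-tuned"], label="Model type")
        output = gr.Markdown()
        gr.Button("Submit").click(
            add_new_eval,
            inputs=[model, base_model, revision, precision, weight_type, model_type],
            outputs=output,
        )

    demo.launch()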