unknown committed on
Commit 6a84810 · 1 Parent(s): 17e66ba
Files changed (4)
  1. app.py +17 -9
  2. scripts/generate_sgi_results.py +131 -0
  3. src/about.py +6 -3
  4. src/display/utils.py +28 -17
app.py CHANGED
@@ -3,6 +3,7 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+import os
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -35,18 +36,25 @@ def restart_space():
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    if os.path.isdir(EVAL_REQUESTS_PATH):
+        print("Using local eval-queue cache")
+    else:
+        snapshot_download(
+            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
 except Exception:
-    restart_space()
+    print("Skipping remote snapshot for eval-queue; using local cache.")
+
 try:
     print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    if os.path.isdir(EVAL_RESULTS_PATH):
+        print("Using local eval-results cache")
+    else:
+        snapshot_download(
+            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
 except Exception:
-    restart_space()
+    print("Skipping remote snapshot for eval-results; using local cache.")
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -201,4 +209,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
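The guarded initialisation above prefers an existing local checkout over a network snapshot. A minimal local smoke test, assuming the eval-queue/eval-results paths written by scripts/generate_sgi_results.py below:

# Hypothetical smoke test, not part of the commit: seed the local caches,
# then launch the app so the os.path.isdir() guards skip snapshot_download.
import subprocess

subprocess.run(["python", "scripts/generate_sgi_results.py"], check=True)  # writes eval-queue/ and eval-results/
subprocess.run(["python", "app.py"], check=True)  # starts Gradio against the local JSONs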
scripts/generate_sgi_results.py ADDED
@@ -0,0 +1,131 @@
+import os
+import json
+from datetime import datetime, timezone
+
+# Use local relative paths to avoid optional dependencies during generation
+EVAL_RESULTS_PATH = "eval-results"
+EVAL_REQUESTS_PATH = "eval-queue"
+
+# Leaderboard data provided by user
+MODELS = [
+    {"name": "Intern-S1", "type": "Open", "scores": [15.74, 38.09, 28.79, 29.02, 28.87]},
+    {"name": "Intern-S1-mini", "type": "Open", "scores": [11.06, 36.04, 16.97, 12.42, 16.84]},
+    {"name": "Qwen3-VL-235B-A22B", "type": "Open", "scores": [11.97, 39.28, 28.41, 30.30, 31.62]},
+    {"name": "Qwen3-Max", "type": "Open", "scores": [15.38, 39.83, 33.21, 33.62, 37.80]},
+    {"name": "Qwen3-8B", "type": "Open", "scores": [8.18, 35.78, 18.45, 9.96, 23.37]},
+    {"name": "Llama-4-Scout", "type": "Open", "scores": [7.86, 29.72, 20.37, 21.66, 25.77]},
+    {"name": "GPT-4o", "type": "Closed", "scores": [7.86, 35.95, 26.94, 31.31, 32.30]},
+    {"name": "GPT-4.1", "type": "Closed", "scores": [11.32, 36.49, 34.32, 36.63, 38.49]},
+    {"name": "GPT-5", "type": "Closed", "scores": [14.47, 55.40, 29.89, 16.31, 38.14]},
+    {"name": "GPT-5.1", "type": "Closed", "scores": [11.64, 47.12, 31.00, 22.77, 34.02]},
+    {"name": "o3", "type": "Closed", "scores": [12.89, 46.07, 31.73, 30.04, 32.65]},
+    {"name": "o4-mini", "type": "Closed", "scores": [11.95, 40.78, 35.79, 28.86, 33.33]},
+    {"name": "Gemini-2.5-Flash", "type": "Closed", "scores": [10.69, 39.13, 21.03, 18.55, 34.36]},
+    {"name": "Gemini-2.5-Pro", "type": "Closed", "scores": [15.09, 39.95, 22.51, 22.05, 41.24]},
+    {"name": "Gemini-3-Pro", "type": "Closed", "scores": [18.48, 39.68, 36.64, 32.45, 41.92]},
+    {"name": "Claude-Opus-4.1", "type": "Closed", "scores": [12.93, 40.29, 34.69, 25.38, 38.83]},
+    {"name": "Claude-Sonnet-4.5", "type": "Closed", "scores": [13.84, 43.20, 35.79, 30.15, 37.80]},
+    {"name": "Grok-4", "type": "Closed", "scores": [13.31, 37.12, 33.71, 29.01, 30.24]},
+]
+
+# Task keys must match the Tasks enum in src/about.py
+TASK_KEYS = [
+    "deep_research",
+    "idea_generation",
+    "dry_experiment",
+    "wet_experiment",
+    "experimental_reasoning",
+]
+
+# Convert percentages to the decimals expected by read_evals (it multiplies by 100)
+def pct_to_decimal(p):
+    return round(p / 100.0, 6)
+
+def ensure_dir(p):
+    os.makedirs(p, exist_ok=True)
+
+def write_result_json(org, model, scores):
+    model_full = f"{org}/{model}"
+    # Place each model's JSON in its own subfolder under eval-results
+    model_dir = os.path.join(EVAL_RESULTS_PATH, org, model)
+    ensure_dir(model_dir)
+
+    # Minimal config expected by read_evals.py
+    cfg = {
+        "model_dtype": "float16",
+        "model_name": model_full,
+        "model_sha": "",
+    }
+
+    # Build results mapping
+    results = {}
+    for key, score in zip(TASK_KEYS, scores):
+        results[key] = {"acc": pct_to_decimal(score)}
+
+    payload = {
+        "config": cfg,
+        "results": results,
+    }
+
+    # Filename pattern is flexible; read_evals walks directories and reads all JSONs
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    out_path = os.path.join(model_dir, f"results_{ts}.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(payload, f, ensure_ascii=False, indent=2)
+    return out_path
+
+def write_request_json(org, model, model_type):
+    # Ensure the request file lives under eval-queue/{org}/
+    org_dir = os.path.join(EVAL_REQUESTS_PATH, org)
+    ensure_dir(org_dir)
+
+    model_full = f"{org}/{model}"
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    # Model type label must be parsable by ModelType.from_str
+    type_label = "🔓 : Open" if model_type == "Open" else "🔒 : Closed"
+
+    entry = {
+        "model": model_full,
+        "base_model": "",
+        "revision": "main",
+        "precision": "float16",
+        "weight_type": "Original",
+        "status": "FINISHED",
+        "submitted_time": now,
+        "model_type": type_label,
+        "likes": 0,
+        "params": 0,
+        "license": "?",
+        "private": False,
+    }
+
+    # File naming convention similar to submit.py
+    out_path = os.path.join(org_dir, f"{model}_eval_request_False_float16_Original.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(entry, f, ensure_ascii=False, indent=2)
+    return out_path
+
+def main():
+    org = "sgi-bench"
+    ensure_dir(EVAL_RESULTS_PATH)
+    ensure_dir(EVAL_REQUESTS_PATH)
+
+    result_paths = []
+    request_paths = []
+
+    for m in MODELS:
+        res_path = write_result_json(org, m["name"], m["scores"])
+        req_path = write_request_json(org, m["name"], m["type"])
+        result_paths.append(res_path)
+        request_paths.append(req_path)
+
+    print("Generated result JSONs:")
+    for p in result_paths:
+        print(" -", p)
+    print("Generated request JSONs:")
+    for p in request_paths:
+        print(" -", p)
+
+if __name__ == "__main__":
+    main()
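Since pct_to_decimal() stores scores as decimals and read_evals multiplies them back by 100, a generated file can be spot-checked against the MODELS table. A small sketch, assuming the directory layout produced by main() above:

# Hypothetical check, not part of the commit: the newest Intern-S1 file
# should round-trip its Deep Research score back to 15.74.
import glob
import json

path = sorted(glob.glob("eval-results/sgi-bench/Intern-S1/results_*.json"))[-1]
with open(path, encoding="utf-8") as f:
    payload = json.load(f)

acc = payload["results"]["deep_research"]["acc"]  # stored as 0.1574
print(round(acc * 100, 2))  # 15.74, the value the leaderboard displays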
src/about.py CHANGED
@@ -11,9 +11,12 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # SGI-Bench tasks mapped to leaderboard columns
+    deep_research = Task("deep_research", "acc", "Deep Research")
+    idea_generation = Task("idea_generation", "acc", "Idea Generation")
+    dry_experiment = Task("dry_experiment", "acc", "Dry Experiment")
+    wet_experiment = Task("wet_experiment", "acc", "Wet Experiment")
+    experimental_reasoning = Task("experimental_reasoning", "acc", "Experimental Reasoning")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
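The enum's member names are the JSON task keys and its col_name values become the leaderboard headers. A minimal sketch of that mapping, assuming the Space's modules are importable:

# Hypothetical check, not part of the commit: src/display/utils.py builds
# BENCHMARK_COLS with exactly this comprehension.
from src.about import Tasks

print([t.name for t in Tasks])            # keys read from the result JSONs
print([t.value.col_name for t in Tasks])  # ['Deep Research', 'Idea Generation',
                                          #  'Dry Experiment', 'Wet Experiment',
                                          #  'Experimental Reasoning']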
src/display/utils.py CHANGED
@@ -1,4 +1,5 @@
 from dataclasses import dataclass, make_dataclass
+from typing import ClassVar
 from enum import Enum
 
 import pandas as pd
@@ -23,25 +24,30 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["model_type_symbol", ClassVar[ColumnContent], ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ClassVar[ColumnContent], ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ClassVar[ColumnContent], ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ClassVar[ColumnContent], ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+auto_eval_column_dict.append(["model_type", ClassVar[ColumnContent], ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ClassVar[ColumnContent], ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ClassVar[ColumnContent], ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ClassVar[ColumnContent], ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ClassVar[ColumnContent], ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ClassVar[ColumnContent], ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ClassVar[ColumnContent], ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ClassVar[ColumnContent], ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ClassVar[ColumnContent], ColumnContent("Model sha", "str", False, False)])
+
+# Build AutoEvalColumn as a simple class to hold ColumnContent descriptors
+class AutoEvalColumn:
+    pass
+
+# Populate attributes from auto_eval_column_dict
+for _name, _type, _default in auto_eval_column_dict:
+    setattr(AutoEvalColumn, _name, _default)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -62,6 +68,8 @@ class ModelDetails:
 
 
 class ModelType(Enum):
+    Open = ModelDetails(name="Open", symbol="🔓")
+    Closed = ModelDetails(name="Closed", symbol="🔒")
     PT = ModelDetails(name="pretrained", symbol="🟢")
     FT = ModelDetails(name="fine-tuned", symbol="🔶")
     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
@@ -73,6 +81,10 @@ class ModelType(Enum):
 
     @staticmethod
     def from_str(type):
+        if "Open" in type or "🔓" in type:
+            return ModelType.Open
+        if "Closed" in type or "🔒" in type:
+            return ModelType.Closed
         if "fine-tuned" in type or "🔶" in type:
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
@@ -107,4 +119,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
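A short usage sketch of the two behaviours this file now provides; treat it as an assumed consumption pattern rather than Space code:

# Hypothetical check, not part of the commit.
from src.display.utils import AutoEvalColumn, ModelType

# AutoEvalColumn is a plain class now, so columns are ordinary attributes
print(AutoEvalColumn.model.never_hidden)  # True

# from_str() recognises the labels that scripts/generate_sgi_results.py
# writes into the request JSONs
print(ModelType.from_str("🔓 : Open"))    # ModelType.Open
print(ModelType.from_str("🔒 : Closed"))  # ModelType.Closed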