unknown
committed on
Commit · 6a84810
1 Parent(s): 17e66ba
update
Files changed:
- app.py +17 -9
- scripts/generate_sgi_results.py +131 -0
- src/about.py +6 -3
- src/display/utils.py +28 -17
app.py
CHANGED
@@ -3,6 +3,7 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+import os
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -35,18 +36,25 @@ def restart_space():
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    if os.path.isdir(EVAL_REQUESTS_PATH):
+        print("Using local eval-queue cache")
+    else:
+        snapshot_download(
+            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
 except Exception:
-    restart_space()
+    print("Skipping remote snapshot for eval-queue; using local cache.")
+
 try:
     print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    if os.path.isdir(EVAL_RESULTS_PATH):
+        print("Using local eval-results cache")
+    else:
+        snapshot_download(
+            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
 except Exception:
-    restart_space()
+    print("Skipping remote snapshot for eval-results; using local cache.")
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -201,4 +209,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
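Note on the change above: the Space now prefers a pre-populated local eval-queue/ or eval-results/ directory (seeded by scripts/generate_sgi_results.py below) and only calls snapshot_download when the directory is missing; a failed download now logs a message instead of calling restart_space(). A minimal standalone sketch of the same guard pattern, reusing the argument values from app.py; the helper name ensure_local_copy is illustrative and not part of the commit:

import os
from huggingface_hub import snapshot_download

def ensure_local_copy(repo_id, local_dir, token=None):
    # Reuse a pre-populated local directory; only hit the Hub when it is absent.
    if os.path.isdir(local_dir):
        print(f"Using local cache at {local_dir}")
        return
    try:
        snapshot_download(
            repo_id=repo_id, local_dir=local_dir, repo_type="dataset", etag_timeout=30, token=token
        )
    except Exception:
        # Degrade gracefully instead of restarting the Space.
        print(f"Skipping remote snapshot for {repo_id}; using local cache.")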
scripts/generate_sgi_results.py
ADDED
@@ -0,0 +1,131 @@
+import os
+import json
+from datetime import datetime, timezone
+
+# Use local relative paths to avoid optional dependencies during generation
+EVAL_RESULTS_PATH = "eval-results"
+EVAL_REQUESTS_PATH = "eval-queue"
+
+# Leaderboard data provided by user
+MODELS = [
+    {"name": "Intern-S1", "type": "Open", "scores": [15.74, 38.09, 28.79, 29.02, 28.87]},
+    {"name": "Intern-S1-mini", "type": "Open", "scores": [11.06, 36.04, 16.97, 12.42, 16.84]},
+    {"name": "Qwen3-VL-235B-A22B", "type": "Open", "scores": [11.97, 39.28, 28.41, 30.30, 31.62]},
+    {"name": "Qwen3-Max", "type": "Open", "scores": [15.38, 39.83, 33.21, 33.62, 37.80]},
+    {"name": "Qwen3-8B", "type": "Open", "scores": [8.18, 35.78, 18.45, 9.96, 23.37]},
+    {"name": "Llama-4-Scout", "type": "Open", "scores": [7.86, 29.72, 20.37, 21.66, 25.77]},
+    {"name": "GPT-4o", "type": "Closed", "scores": [7.86, 35.95, 26.94, 31.31, 32.30]},
+    {"name": "GPT-4.1", "type": "Closed", "scores": [11.32, 36.49, 34.32, 36.63, 38.49]},
+    {"name": "GPT-5", "type": "Closed", "scores": [14.47, 55.40, 29.89, 16.31, 38.14]},
+    {"name": "GPT-5.1", "type": "Closed", "scores": [11.64, 47.12, 31.00, 22.77, 34.02]},
+    {"name": "o3", "type": "Closed", "scores": [12.89, 46.07, 31.73, 30.04, 32.65]},
+    {"name": "o4-mini", "type": "Closed", "scores": [11.95, 40.78, 35.79, 28.86, 33.33]},
+    {"name": "Gemini-2.5-Flash", "type": "Closed", "scores": [10.69, 39.13, 21.03, 18.55, 34.36]},
+    {"name": "Gemini-2.5-Pro", "type": "Closed", "scores": [15.09, 39.95, 22.51, 22.05, 41.24]},
+    {"name": "Gemini-3-Pro", "type": "Closed", "scores": [18.48, 39.68, 36.64, 32.45, 41.92]},
+    {"name": "Claude-Opus-4.1", "type": "Closed", "scores": [12.93, 40.29, 34.69, 25.38, 38.83]},
+    {"name": "Claude-Sonnet-4.5", "type": "Closed", "scores": [13.84, 43.20, 35.79, 30.15, 37.80]},
+    {"name": "Grok-4", "type": "Closed", "scores": [13.31, 37.12, 33.71, 29.01, 30.24]},
+]
+
+# Task keys must match Tasks Enum in src/about.py
+TASK_KEYS = [
+    "deep_research",
+    "idea_generation",
+    "dry_experiment",
+    "wet_experiment",
+    "experimental_reasoning",
+]
+
+# Convert percentages to decimals expected by read_evals (it multiplies by 100)
+def pct_to_decimal(p):
+    return round(p / 100.0, 6)
+
+def ensure_dir(p):
+    os.makedirs(p, exist_ok=True)
+
+def write_result_json(org, model, scores):
+    model_full = f"{org}/{model}"
+    # Place each model's JSON in its own subfolder under eval-results
+    model_dir = os.path.join(EVAL_RESULTS_PATH, org, model)
+    ensure_dir(model_dir)
+
+    # Minimal config expected by read_evals.py
+    cfg = {
+        "model_dtype": "float16",
+        "model_name": model_full,
+        "model_sha": "",
+    }
+
+    # Build results mapping
+    results = {}
+    for key, score in zip(TASK_KEYS, scores):
+        results[key] = {"acc": pct_to_decimal(score)}
+
+    payload = {
+        "config": cfg,
+        "results": results,
+    }
+
+    # Filename pattern is flexible; read_evals walks directories and reads all JSONs
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    out_path = os.path.join(model_dir, f"results_{ts}.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(payload, f, ensure_ascii=False, indent=2)
+    return out_path
+
+def write_request_json(org, model, model_type):
+    # Ensure request file lives under eval-queue/{org}/
+    org_dir = os.path.join(EVAL_REQUESTS_PATH, org)
+    ensure_dir(org_dir)
+
+    model_full = f"{org}/{model}"
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    # Model type label must be parsable by ModelType.from_str
+    type_label = "🔓 : Open" if model_type == "Open" else "🔒 : Closed"
+
+    entry = {
+        "model": model_full,
+        "base_model": "",
+        "revision": "main",
+        "precision": "float16",
+        "weight_type": "Original",
+        "status": "FINISHED",
+        "submitted_time": now,
+        "model_type": type_label,
+        "likes": 0,
+        "params": 0,
+        "license": "?",
+        "private": False,
+    }
+
+    # File naming convention similar to submit.py
+    out_path = os.path.join(org_dir, f"{model}_eval_request_False_float16_Original.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(entry, f, ensure_ascii=False, indent=2)
+    return out_path
+
+def main():
+    org = "sgi-bench"
+    ensure_dir(EVAL_RESULTS_PATH)
+    ensure_dir(EVAL_REQUESTS_PATH)
+
+    result_paths = []
+    request_paths = []
+
+    for m in MODELS:
+        res_path = write_result_json(org, m["name"], m["scores"])
+        req_path = write_request_json(org, m["name"], m["type"])
+        result_paths.append(res_path)
+        request_paths.append(req_path)
+
+    print("Generated result JSONs:")
+    for p in result_paths:
+        print(" -", p)
+    print("Generated request JSONs:")
+    for p in request_paths:
+        print(" -", p)
+
+if __name__ == "__main__":
+    main()
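For orientation, a hand-derived sketch of what the script emits for the first MODELS entry (the timestamp in results_<ts>.json varies per run, and read_evals later multiplies each "acc" by 100 to recover the percentages):

# Expected payload of eval-results/sgi-bench/Intern-S1/results_<ts>.json,
# derived by hand from MODELS[0] via pct_to_decimal:
expected_payload = {
    "config": {
        "model_dtype": "float16",
        "model_name": "sgi-bench/Intern-S1",
        "model_sha": "",
    },
    "results": {
        "deep_research": {"acc": 0.1574},
        "idea_generation": {"acc": 0.3809},
        "dry_experiment": {"acc": 0.2879},
        "wet_experiment": {"acc": 0.2902},
        "experimental_reasoning": {"acc": 0.2887},
    },
}

The matching request file lands at eval-queue/sgi-bench/Intern-S1_eval_request_False_float16_Original.json with status "FINISHED", so the queue pipeline treats every seeded model as already evaluated.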
src/about.py
CHANGED
@@ -11,9 +11,12 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # SGI-Bench tasks mapped to leaderboard columns
+    deep_research = Task("deep_research", "acc", "Deep Research")
+    idea_generation = Task("idea_generation", "acc", "Idea Generation")
+    dry_experiment = Task("dry_experiment", "acc", "Dry Experiment")
+    wet_experiment = Task("wet_experiment", "acc", "Wet Experiment")
+    experimental_reasoning = Task("experimental_reasoning", "acc", "Experimental Reasoning")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
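The five enum members line up one-to-one with TASK_KEYS in scripts/generate_sgi_results.py, and the display layer derives its benchmark columns straight from this Enum. A small check, using the same expression src/display/utils.py uses:

from src.about import Tasks

# Same derivation as BENCHMARK_COLS in src/display/utils.py; with the Tasks
# above this prints the five leaderboard column names:
print([t.value.col_name for t in Tasks])
# ['Deep Research', 'Idea Generation', 'Dry Experiment', 'Wet Experiment', 'Experimental Reasoning']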
src/display/utils.py
CHANGED
@@ -1,4 +1,5 @@
 from dataclasses import dataclass, make_dataclass
+from typing import ClassVar
 from enum import Enum
 
 import pandas as pd
@@ -23,25 +24,30 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["model_type_symbol", ClassVar[ColumnContent], ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ClassVar[ColumnContent], ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ClassVar[ColumnContent], ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ClassVar[ColumnContent], ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+auto_eval_column_dict.append(["model_type", ClassVar[ColumnContent], ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ClassVar[ColumnContent], ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ClassVar[ColumnContent], ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ClassVar[ColumnContent], ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ClassVar[ColumnContent], ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ClassVar[ColumnContent], ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ClassVar[ColumnContent], ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ClassVar[ColumnContent], ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ClassVar[ColumnContent], ColumnContent("Model sha", "str", False, False)])
+
+# Build AutoEvalColumn as a simple class to hold ColumnContent descriptors
+class AutoEvalColumn:
+    pass
+
+# Populate attributes from auto_eval_column_dict
+for _name, _type, _default in auto_eval_column_dict:
+    setattr(AutoEvalColumn, _name, _default)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -62,6 +68,8 @@ class ModelDetails:
 
 
 class ModelType(Enum):
+    Open = ModelDetails(name="Open", symbol="🔓")
+    Closed = ModelDetails(name="Closed", symbol="🔒")
     PT = ModelDetails(name="pretrained", symbol="🟢")
     FT = ModelDetails(name="fine-tuned", symbol="🔶")
     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
@@ -73,6 +81,10 @@ class ModelType(Enum):
 
     @staticmethod
     def from_str(type):
+        if "Open" in type or "🔓" in type:
+            return ModelType.Open
+        if "Closed" in type or "🔒" in type:
+            return ModelType.Closed
         if "fine-tuned" in type or "🔶" in type:
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
@@ -107,4 +119,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
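Why the utils.py change: building AutoEvalColumn with make_dataclass(..., frozen=True) fails on recent Python versions (3.11+ rejects unhashable field defaults, and the ColumnContent instances used as defaults are unhashable), so the commit presumably switches to a plain class populated via setattr, with the types recorded as ClassVar[ColumnContent], while keeping attribute access identical. A quick sanity check, assuming the repo imports cleanly:

from src.display.utils import AutoEvalColumn

# Attribute-style access works exactly as with the old dataclass version:
assert AutoEvalColumn.model.name == "Model"
assert AutoEvalColumn.average.name == "Average ⬆️"
assert AutoEvalColumn.model_type_symbol.never_hidden is True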