unknown
committed on
Commit · 6a84810
1 Parent(s): 17e66ba
update
Files changed:
- app.py +17 -9
- scripts/generate_sgi_results.py +131 -0
- src/about.py +6 -3
- src/display/utils.py +28 -17
app.py
CHANGED
@@ -3,6 +3,7 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+import os
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -35,18 +36,25 @@ def restart_space():
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    if os.path.isdir(EVAL_REQUESTS_PATH):
+        print("Using local eval-queue cache")
+    else:
+        snapshot_download(
+            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
 except Exception:
-    restart_space()
+    print("Skipping remote snapshot for eval-queue; using local cache.")
+
 try:
     print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
+    if os.path.isdir(EVAL_RESULTS_PATH):
+        print("Using local eval-results cache")
+    else:
+        snapshot_download(
+            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        )
 except Exception:
-    restart_space()
+    print("Skipping remote snapshot for eval-results; using local cache.")
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -201,4 +209,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
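Note on the change above: the Space now prefers a pre-populated local eval-queue/ or eval-results/ directory (seeded by scripts/generate_sgi_results.py below) and only calls snapshot_download when the directory is missing; a failed download now logs a message instead of calling restart_space(). A minimal standalone sketch of the same guard pattern, reusing the argument values from app.py; the helper name ensure_local_copy is illustrative and not part of the commit:

import os
from huggingface_hub import snapshot_download

def ensure_local_copy(repo_id, local_dir, token=None):
    # Reuse a pre-populated local directory; only hit the Hub when it is absent.
    if os.path.isdir(local_dir):
        print(f"Using local cache at {local_dir}")
        return
    try:
        snapshot_download(
            repo_id=repo_id, local_dir=local_dir, repo_type="dataset", etag_timeout=30, token=token
        )
    except Exception:
        # Degrade gracefully instead of restarting the Space.
        print(f"Skipping remote snapshot for {repo_id}; using local cache.")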
scripts/generate_sgi_results.py
ADDED
@@ -0,0 +1,131 @@
+import os
+import json
+from datetime import datetime, timezone
+
+# Use local relative paths to avoid optional dependencies during generation
+EVAL_RESULTS_PATH = "eval-results"
+EVAL_REQUESTS_PATH = "eval-queue"
+
+# Leaderboard data provided by user
+MODELS = [
+    {"name": "Intern-S1", "type": "Open", "scores": [15.74, 38.09, 28.79, 29.02, 28.87]},
+    {"name": "Intern-S1-mini", "type": "Open", "scores": [11.06, 36.04, 16.97, 12.42, 16.84]},
+    {"name": "Qwen3-VL-235B-A22B", "type": "Open", "scores": [11.97, 39.28, 28.41, 30.30, 31.62]},
+    {"name": "Qwen3-Max", "type": "Open", "scores": [15.38, 39.83, 33.21, 33.62, 37.80]},
+    {"name": "Qwen3-8B", "type": "Open", "scores": [8.18, 35.78, 18.45, 9.96, 23.37]},
+    {"name": "Llama-4-Scout", "type": "Open", "scores": [7.86, 29.72, 20.37, 21.66, 25.77]},
+    {"name": "GPT-4o", "type": "Closed", "scores": [7.86, 35.95, 26.94, 31.31, 32.30]},
+    {"name": "GPT-4.1", "type": "Closed", "scores": [11.32, 36.49, 34.32, 36.63, 38.49]},
+    {"name": "GPT-5", "type": "Closed", "scores": [14.47, 55.40, 29.89, 16.31, 38.14]},
+    {"name": "GPT-5.1", "type": "Closed", "scores": [11.64, 47.12, 31.00, 22.77, 34.02]},
+    {"name": "o3", "type": "Closed", "scores": [12.89, 46.07, 31.73, 30.04, 32.65]},
+    {"name": "o4-mini", "type": "Closed", "scores": [11.95, 40.78, 35.79, 28.86, 33.33]},
+    {"name": "Gemini-2.5-Flash", "type": "Closed", "scores": [10.69, 39.13, 21.03, 18.55, 34.36]},
+    {"name": "Gemini-2.5-Pro", "type": "Closed", "scores": [15.09, 39.95, 22.51, 22.05, 41.24]},
+    {"name": "Gemini-3-Pro", "type": "Closed", "scores": [18.48, 39.68, 36.64, 32.45, 41.92]},
+    {"name": "Claude-Opus-4.1", "type": "Closed", "scores": [12.93, 40.29, 34.69, 25.38, 38.83]},
+    {"name": "Claude-Sonnet-4.5", "type": "Closed", "scores": [13.84, 43.20, 35.79, 30.15, 37.80]},
+    {"name": "Grok-4", "type": "Closed", "scores": [13.31, 37.12, 33.71, 29.01, 30.24]},
+]
+
+# Task keys must match Tasks Enum in src/about.py
+TASK_KEYS = [
+    "deep_research",
+    "idea_generation",
+    "dry_experiment",
+    "wet_experiment",
+    "experimental_reasoning",
+]
+
+# Convert percentages to decimals expected by read_evals (it multiplies by 100)
+def pct_to_decimal(p):
+    return round(p / 100.0, 6)
+
+def ensure_dir(p):
+    os.makedirs(p, exist_ok=True)
+
+def write_result_json(org, model, scores):
+    model_full = f"{org}/{model}"
+    # Place each model's JSON in its own subfolder under eval-results
+    model_dir = os.path.join(EVAL_RESULTS_PATH, org, model)
+    ensure_dir(model_dir)
+
+    # Minimal config expected by read_evals.py
+    cfg = {
+        "model_dtype": "float16",
+        "model_name": model_full,
+        "model_sha": "",
+    }
+
+    # Build results mapping
+    results = {}
+    for key, score in zip(TASK_KEYS, scores):
+        results[key] = {"acc": pct_to_decimal(score)}
+
+    payload = {
+        "config": cfg,
+        "results": results,
+    }
+
+    # Filename pattern is flexible; read_evals walks directories and reads all JSONs
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    out_path = os.path.join(model_dir, f"results_{ts}.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(payload, f, ensure_ascii=False, indent=2)
+    return out_path
+
+def write_request_json(org, model, model_type):
+    # Ensure request file lives under eval-queue/{org}/
+    org_dir = os.path.join(EVAL_REQUESTS_PATH, org)
+    ensure_dir(org_dir)
+
+    model_full = f"{org}/{model}"
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    # Model type label must be parsable by ModelType.from_str
+    type_label = "🔓 : Open" if model_type == "Open" else "🔒 : Closed"
+
+    entry = {
+        "model": model_full,
+        "base_model": "",
+        "revision": "main",
+        "precision": "float16",
+        "weight_type": "Original",
+        "status": "FINISHED",
+        "submitted_time": now,
+        "model_type": type_label,
+        "likes": 0,
+        "params": 0,
+        "license": "?",
+        "private": False,
+    }
+
+    # File naming convention similar to submit.py
+    out_path = os.path.join(org_dir, f"{model}_eval_request_False_float16_Original.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(entry, f, ensure_ascii=False, indent=2)
+    return out_path
+
+def main():
+    org = "sgi-bench"
+    ensure_dir(EVAL_RESULTS_PATH)
+    ensure_dir(EVAL_REQUESTS_PATH)
+
+    result_paths = []
+    request_paths = []
+
+    for m in MODELS:
+        res_path = write_result_json(org, m["name"], m["scores"])
+        req_path = write_request_json(org, m["name"], m["type"])
+        result_paths.append(res_path)
+        request_paths.append(req_path)
+
+    print("Generated result JSONs:")
+    for p in result_paths:
+        print(" -", p)
+    print("Generated request JSONs:")
+    for p in request_paths:
+        print(" -", p)
+
+if __name__ == "__main__":
+    main()
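For orientation, a hand-derived sketch of what the script emits for the first MODELS entry (the timestamp in results_<ts>.json varies per run, and read_evals later multiplies each "acc" by 100 to recover the percentages):

# Expected payload of eval-results/sgi-bench/Intern-S1/results_<ts>.json,
# derived by hand from MODELS[0] via pct_to_decimal:
expected_payload = {
    "config": {
        "model_dtype": "float16",
        "model_name": "sgi-bench/Intern-S1",
        "model_sha": "",
    },
    "results": {
        "deep_research": {"acc": 0.1574},
        "idea_generation": {"acc": 0.3809},
        "dry_experiment": {"acc": 0.2879},
        "wet_experiment": {"acc": 0.2902},
        "experimental_reasoning": {"acc": 0.2887},
    },
}

The matching request file lands at eval-queue/sgi-bench/Intern-S1_eval_request_False_float16_Original.json with status "FINISHED", so the queue pipeline treats every seeded model as already evaluated.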
src/about.py
CHANGED
@@ -11,9 +11,12 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # SGI-Bench tasks mapped to leaderboard columns
+    deep_research = Task("deep_research", "acc", "Deep Research")
+    idea_generation = Task("idea_generation", "acc", "Idea Generation")
+    dry_experiment = Task("dry_experiment", "acc", "Dry Experiment")
+    wet_experiment = Task("wet_experiment", "acc", "Wet Experiment")
+    experimental_reasoning = Task("experimental_reasoning", "acc", "Experimental Reasoning")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
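The five enum members line up one-to-one with TASK_KEYS in scripts/generate_sgi_results.py, and the display layer derives its benchmark columns straight from this Enum. A small check, using the same expression src/display/utils.py uses:

from src.about import Tasks

# Same derivation as BENCHMARK_COLS in src/display/utils.py; with the Tasks
# above this prints the five leaderboard column names:
print([t.value.col_name for t in Tasks])
# ['Deep Research', 'Idea Generation', 'Dry Experiment', 'Wet Experiment', 'Experimental Reasoning']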
src/display/utils.py
CHANGED
@@ -1,4 +1,5 @@
 from dataclasses import dataclass, make_dataclass
+from typing import ClassVar
 from enum import Enum
 
 import pandas as pd
@@ -23,25 +24,30 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["model_type_symbol", ClassVar[ColumnContent], ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ClassVar[ColumnContent], ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ClassVar[ColumnContent], ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ClassVar[ColumnContent], ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+auto_eval_column_dict.append(["model_type", ClassVar[ColumnContent], ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ClassVar[ColumnContent], ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ClassVar[ColumnContent], ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ClassVar[ColumnContent], ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ClassVar[ColumnContent], ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ClassVar[ColumnContent], ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ClassVar[ColumnContent], ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ClassVar[ColumnContent], ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ClassVar[ColumnContent], ColumnContent("Model sha", "str", False, False)])
+
+# Build AutoEvalColumn as a simple class to hold ColumnContent descriptors
+class AutoEvalColumn:
+    pass
+
+# Populate attributes from auto_eval_column_dict
+for _name, _type, _default in auto_eval_column_dict:
+    setattr(AutoEvalColumn, _name, _default)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -62,6 +68,8 @@ class ModelDetails:
 
 
 class ModelType(Enum):
+    Open = ModelDetails(name="Open", symbol="🔓")
+    Closed = ModelDetails(name="Closed", symbol="🔒")
     PT = ModelDetails(name="pretrained", symbol="🟢")
     FT = ModelDetails(name="fine-tuned", symbol="🔶")
     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
@@ -73,6 +81,10 @@ class ModelType(Enum):
 
     @staticmethod
     def from_str(type):
+        if "Open" in type or "🔓" in type:
+            return ModelType.Open
+        if "Closed" in type or "🔒" in type:
+            return ModelType.Closed
         if "fine-tuned" in type or "🔶" in type:
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
@@ -107,4 +119,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
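Why the utils.py change: building AutoEvalColumn with make_dataclass(..., frozen=True) fails on recent Python versions (3.11+ rejects unhashable field defaults, and the ColumnContent instances used as defaults are unhashable), so the commit presumably switches to a plain class populated via setattr, with the types recorded as ClassVar[ColumnContent], while keeping attribute access identical. A quick sanity check, assuming the repo imports cleanly:

from src.display.utils import AutoEvalColumn

# Attribute-style access works exactly as with the old dataclass version:
assert AutoEvalColumn.model.name == "Model"
assert AutoEvalColumn.average.name == "Average ⬆️"
assert AutoEvalColumn.model_type_symbol.never_hidden is True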