import os
import json
from datetime import datetime, timezone

# Use local relative paths to avoid optional dependencies during generation
EVAL_RESULTS_PATH = "eval-results"  # destination for per-model result JSONs
EVAL_REQUESTS_PATH = "eval-queue"  # destination for eval-request JSONs

# Leaderboard data provided by user
# Each entry holds a display name, an "Open"/"Closed" access label, and five
# percentage scores. The score order presumably matches TASK_KEYS below
# (deep_research, idea_generation, ...) — TODO confirm against the leaderboard.
MODELS = [
    {"name": "Intern-S1", "type": "Open", "scores": [15.74, 38.09, 28.79, 29.02, 28.87]},
    {"name": "Intern-S1-mini", "type": "Open", "scores": [11.06, 36.04, 16.97, 12.42, 16.84]},
    {"name": "Qwen3-VL-235B-A22B", "type": "Open", "scores": [11.97, 39.28, 28.41, 30.30, 31.62]},
    {"name": "Qwen3-Max", "type": "Open", "scores": [15.38, 39.83, 33.21, 33.62, 37.80]},
    {"name": "Qwen3-8B", "type": "Open", "scores": [8.18, 35.78, 18.45, 9.96, 23.37]},
    {"name": "Llama-4-Scout", "type": "Open", "scores": [7.86, 29.72, 20.37, 21.66, 25.77]},
    {"name": "GPT-4o", "type": "Closed", "scores": [7.86, 35.95, 26.94, 31.31, 32.30]},
    {"name": "GPT-4.1", "type": "Closed", "scores": [11.32, 36.49, 34.32, 36.63, 38.49]},
    {"name": "GPT-5", "type": "Closed", "scores": [14.47, 55.40, 29.89, 16.31, 38.14]},
    {"name": "GPT-5.1", "type": "Closed", "scores": [11.64, 47.12, 31.00, 22.77, 34.02]},
    {"name": "o3", "type": "Closed", "scores": [12.89, 46.07, 31.73, 30.04, 32.65]},
    {"name": "o4-mini", "type": "Closed", "scores": [11.95, 40.78, 35.79, 28.86, 33.33]},
    {"name": "Gemini-2.5-Flash", "type": "Closed", "scores": [10.69, 39.13, 21.03, 18.55, 34.36]},
    {"name": "Gemini-2.5-Pro", "type": "Closed", "scores": [15.09, 39.95, 22.51, 22.05, 41.24]},
    {"name": "Gemini-3-Pro", "type": "Closed", "scores": [18.48, 39.68, 36.64, 32.45, 41.92]},
    {"name": "Claude-Opus-4.1", "type": "Closed", "scores": [12.93, 40.29, 34.69, 25.38, 38.83]},
    {"name": "Claude-Sonnet-4.5", "type": "Closed", "scores": [13.84, 43.20, 35.79, 30.15, 37.80]},
    {"name": "Grok-4", "type": "Closed", "scores": [13.31, 37.12, 33.71, 29.01, 30.24]},
]

# Task keys must match Tasks Enum in src/about.py
TASK_KEYS = [
    "deep_research",
    "idea_generation",
    "dry_experiment",
    "wet_experiment",
    "experimental_reasoning",
]

# Convert percentages to decimals expected by read_evals (it multiplies by 100)
def pct_to_decimal(percent):
    """Convert a percentage (e.g. 15.74) to a fraction rounded to 6 places."""
    fraction = percent / 100.0
    return round(fraction, 6)

def ensure_dir(path):
    """Create *path* (including any missing parents); no-op if it exists."""
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)

def write_result_json(org, model, scores):
    """Write one model's result JSON under eval-results/{org}/{model}/.

    The payload carries the minimal "config" and "results" sections that
    read_evals.py expects; scores are paired positionally with TASK_KEYS
    and stored as decimal fractions. Returns the path of the written file.
    """
    full_name = f"{org}/{model}"
    # Each model gets its own subfolder under eval-results
    target_dir = os.path.join(EVAL_RESULTS_PATH, org, model)
    ensure_dir(target_dir)

    payload = {
        # Minimal config expected by read_evals.py
        "config": {
            "model_dtype": "float16",
            "model_name": full_name,
            "model_sha": "",
        },
        # One {"acc": fraction} entry per task, keyed by TASK_KEYS order
        "results": {
            task: {"acc": pct_to_decimal(value)}
            for task, value in zip(TASK_KEYS, scores)
        },
    }

    # Filename pattern is flexible; read_evals walks directories and reads all JSONs
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    destination = os.path.join(target_dir, f"results_{stamp}.json")
    with open(destination, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
    return destination

def write_request_json(org, model, model_type):
    """Write a FINISHED eval-request JSON under eval-queue/{org}/.

    *model_type* is "Open" or "Closed" and is mapped to the label string
    that ModelType.from_str can parse. Returns the path of the written file.
    """
    # Ensure request file lives under eval-queue/{org}/
    destination_dir = os.path.join(EVAL_REQUESTS_PATH, org)
    ensure_dir(destination_dir)

    full_name = f"{org}/{model}"
    submitted = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Model type label must be parsable by ModelType.from_str
    if model_type == "Open":
        type_label = "πŸ”“ : Open"
    else:
        type_label = "πŸ”’ : Closed"

    request = {
        "model": full_name,
        "base_model": "",
        "revision": "main",
        "precision": "float16",
        "weight_type": "Original",
        "status": "FINISHED",
        "submitted_time": submitted,
        "model_type": type_label,
        "likes": 0,
        "params": 0,
        "license": "?",
        "private": False,
    }

    # File naming convention similar to submit.py
    destination = os.path.join(
        destination_dir, f"{model}_eval_request_False_float16_Original.json"
    )
    with open(destination, "w", encoding="utf-8") as fh:
        json.dump(request, fh, ensure_ascii=False, indent=2)
    return destination

def main():
    """Generate result and request JSON fixtures for every model in MODELS."""
    org = "sgi-bench"
    ensure_dir(EVAL_RESULTS_PATH)
    ensure_dir(EVAL_REQUESTS_PATH)

    result_files = []
    request_files = []
    for entry in MODELS:
        result_files.append(write_result_json(org, entry["name"], entry["scores"]))
        request_files.append(write_request_json(org, entry["name"], entry["type"]))

    print("Generated result JSONs:")
    for written in result_files:
        print("  -", written)
    print("Generated request JSONs:")
    for written in request_files:
        print("  -", written)

# Generate the fixture files only when executed as a script, not on import.
if __name__ == "__main__":
    main()