# Generates eval-results and eval-queue JSON files for the SGI-Bench leaderboard.
import os
import json
from datetime import datetime, timezone
# Use local relative paths to avoid optional dependencies during generation
EVAL_RESULTS_PATH = "eval-results"
EVAL_REQUESTS_PATH = "eval-queue"
# Leaderboard data provided by user.
# Each "scores" list holds five percentage values; write_result_json pairs
# them positionally with TASK_KEYS below, so the order is:
# deep_research, idea_generation, dry_experiment, wet_experiment,
# experimental_reasoning.
MODELS = [
    {"name": "Intern-S1", "type": "Open", "scores": [15.74, 38.09, 28.79, 29.02, 28.87]},
    {"name": "Intern-S1-mini", "type": "Open", "scores": [11.06, 36.04, 16.97, 12.42, 16.84]},
    {"name": "Qwen3-VL-235B-A22B", "type": "Open", "scores": [11.97, 39.28, 28.41, 30.30, 31.62]},
    {"name": "Qwen3-Max", "type": "Open", "scores": [15.38, 39.83, 33.21, 33.62, 37.80]},
    {"name": "Qwen3-8B", "type": "Open", "scores": [8.18, 35.78, 18.45, 9.96, 23.37]},
    {"name": "Llama-4-Scout", "type": "Open", "scores": [7.86, 29.72, 20.37, 21.66, 25.77]},
    {"name": "GPT-4o", "type": "Closed", "scores": [7.86, 35.95, 26.94, 31.31, 32.30]},
    {"name": "GPT-4.1", "type": "Closed", "scores": [11.32, 36.49, 34.32, 36.63, 38.49]},
    {"name": "GPT-5", "type": "Closed", "scores": [14.47, 55.40, 29.89, 16.31, 38.14]},
    {"name": "GPT-5.1", "type": "Closed", "scores": [11.64, 47.12, 31.00, 22.77, 34.02]},
    {"name": "o3", "type": "Closed", "scores": [12.89, 46.07, 31.73, 30.04, 32.65]},
    {"name": "o4-mini", "type": "Closed", "scores": [11.95, 40.78, 35.79, 28.86, 33.33]},
    {"name": "Gemini-2.5-Flash", "type": "Closed", "scores": [10.69, 39.13, 21.03, 18.55, 34.36]},
    {"name": "Gemini-2.5-Pro", "type": "Closed", "scores": [15.09, 39.95, 22.51, 22.05, 41.24]},
    {"name": "Gemini-3-Pro", "type": "Closed", "scores": [18.48, 39.68, 36.64, 32.45, 41.92]},
    {"name": "Claude-Opus-4.1", "type": "Closed", "scores": [12.93, 40.29, 34.69, 25.38, 38.83]},
    {"name": "Claude-Sonnet-4.5", "type": "Closed", "scores": [13.84, 43.20, 35.79, 30.15, 37.80]},
    {"name": "Grok-4", "type": "Closed", "scores": [13.31, 37.12, 33.71, 29.01, 30.24]},
]
# Task keys must match Tasks Enum in src/about.py
TASK_KEYS = [
    "deep_research",
    "idea_generation",
    "dry_experiment",
    "wet_experiment",
    "experimental_reasoning",
]
def pct_to_decimal(p):
    """Convert a percentage (e.g. 15.74) to a decimal fraction (0.1574).

    read_evals multiplies the stored value by 100, so the JSON files must
    hold fractions, rounded to 6 places to keep the output tidy.
    """
    fraction = p / 100.0
    return round(fraction, 6)
def ensure_dir(p):
    """Create directory *p* (including parents) if it does not already exist."""
    os.makedirs(p, exist_ok=True)
def write_result_json(org, model, scores):
    """Write a minimal eval-results JSON file for one model.

    Parameters
    ----------
    org : str
        Organization name; used both in the "org/model" id and the output path.
    model : str
        Model name without the org prefix.
    scores : sequence of float
        Percentage scores, one per entry of TASK_KEYS, in the same order.

    Returns
    -------
    str
        Path of the JSON file that was written.

    Raises
    ------
    ValueError
        If ``scores`` does not have exactly one value per task key; the
        previous zip()-based pairing would have silently dropped the extras.
    """
    # Guard against silent truncation: zip() stops at the shorter sequence.
    if len(scores) != len(TASK_KEYS):
        raise ValueError(
            f"expected {len(TASK_KEYS)} scores for {org}/{model}, got {len(scores)}"
        )
    model_full = f"{org}/{model}"
    # Place each model's JSON in its own subfolder under eval-results
    model_dir = os.path.join(EVAL_RESULTS_PATH, org, model)
    ensure_dir(model_dir)
    # Minimal config expected by read_evals.py
    cfg = {
        "model_dtype": "float16",
        "model_name": model_full,
        "model_sha": "",
    }
    # read_evals multiplies "acc" by 100, so store decimal fractions.
    results = {
        key: {"acc": pct_to_decimal(score)}
        for key, score in zip(TASK_KEYS, scores)
    }
    payload = {
        "config": cfg,
        "results": results,
    }
    # Filename pattern is flexible; read_evals walks directories and reads all
    # JSONs. The UTC timestamp just keeps filenames unique across runs.
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = os.path.join(model_dir, f"results_{ts}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    return out_path
def write_request_json(org, model, model_type):
    """Write an eval-queue request JSON for one model and return its path.

    The entry is marked FINISHED so the leaderboard treats the result files
    as already evaluated.
    """
    # Request files live under eval-queue/{org}/
    org_dir = os.path.join(EVAL_REQUESTS_PATH, org)
    ensure_dir(org_dir)
    # Model type label must be parsable by ModelType.from_str
    if model_type == "Open":
        type_label = "π : Open"
    else:
        type_label = "π : Closed"
    entry = {
        "model": f"{org}/{model}",
        "base_model": "",
        "revision": "main",
        "precision": "float16",
        "weight_type": "Original",
        "status": "FINISHED",
        "submitted_time": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "model_type": type_label,
        "likes": 0,
        "params": 0,
        "license": "?",
        "private": False,
    }
    # File naming convention mirrors submit.py
    out_path = os.path.join(org_dir, f"{model}_eval_request_False_float16_Original.json")
    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(entry, handle, ensure_ascii=False, indent=2)
    return out_path
def main():
    """Generate result and request JSON files for every model in MODELS."""
    org = "sgi-bench"
    ensure_dir(EVAL_RESULTS_PATH)
    ensure_dir(EVAL_REQUESTS_PATH)
    generated_results = []
    generated_requests = []
    for entry in MODELS:
        generated_results.append(write_result_json(org, entry["name"], entry["scores"]))
        generated_requests.append(write_request_json(org, entry["name"], entry["type"]))
    # Report everything that was written, grouped by kind.
    for header, paths in (
        ("Generated result JSONs:", generated_results),
        ("Generated request JSONs:", generated_requests),
    ):
        print(header)
        for path in paths:
            print(" -", path)


if __name__ == "__main__":
    main()