|
|
import os |
|
|
import json |
|
|
from datetime import datetime, timezone |
|
|
|
|
|
|
|
|
# Output directory for per-model result JSON files (one <org>/<model> subtree each).
EVAL_RESULTS_PATH = "eval-results"

# Output directory for per-model eval request JSON files (one <org> subtree each).
EVAL_REQUESTS_PATH = "eval-queue"
|
|
|
|
|
|
|
|
# Benchmark entries to materialize on disk.
# Each dict holds the display name, the access type ("Open" weights vs. "Closed"
# API), and five percentage scores whose order matches TASK_KEYS
# (write_result_json zips the two lists together).
MODELS = [
    {"name": "Intern-S1", "type": "Open", "scores": [15.74, 38.09, 28.79, 29.02, 28.87]},
    {"name": "Intern-S1-mini", "type": "Open", "scores": [11.06, 36.04, 16.97, 12.42, 16.84]},
    {"name": "Qwen3-VL-235B-A22B", "type": "Open", "scores": [11.97, 39.28, 28.41, 30.30, 31.62]},
    {"name": "Qwen3-Max", "type": "Open", "scores": [15.38, 39.83, 33.21, 33.62, 37.80]},
    {"name": "Qwen3-8B", "type": "Open", "scores": [8.18, 35.78, 18.45, 9.96, 23.37]},
    {"name": "Llama-4-Scout", "type": "Open", "scores": [7.86, 29.72, 20.37, 21.66, 25.77]},
    {"name": "GPT-4o", "type": "Closed", "scores": [7.86, 35.95, 26.94, 31.31, 32.30]},
    {"name": "GPT-4.1", "type": "Closed", "scores": [11.32, 36.49, 34.32, 36.63, 38.49]},
    {"name": "GPT-5", "type": "Closed", "scores": [14.47, 55.40, 29.89, 16.31, 38.14]},
    {"name": "GPT-5.1", "type": "Closed", "scores": [11.64, 47.12, 31.00, 22.77, 34.02]},
    {"name": "o3", "type": "Closed", "scores": [12.89, 46.07, 31.73, 30.04, 32.65]},
    {"name": "o4-mini", "type": "Closed", "scores": [11.95, 40.78, 35.79, 28.86, 33.33]},
    {"name": "Gemini-2.5-Flash", "type": "Closed", "scores": [10.69, 39.13, 21.03, 18.55, 34.36]},
    {"name": "Gemini-2.5-Pro", "type": "Closed", "scores": [15.09, 39.95, 22.51, 22.05, 41.24]},
    {"name": "Gemini-3-Pro", "type": "Closed", "scores": [18.48, 39.68, 36.64, 32.45, 41.92]},
    {"name": "Claude-Opus-4.1", "type": "Closed", "scores": [12.93, 40.29, 34.69, 25.38, 38.83]},
    {"name": "Claude-Sonnet-4.5", "type": "Closed", "scores": [13.84, 43.20, 35.79, 30.15, 37.80]},
    {"name": "Grok-4", "type": "Closed", "scores": [13.31, 37.12, 33.71, 29.01, 30.24]},
]
|
|
|
|
|
|
|
|
# Evaluation task identifiers, in the same order as each model's score list in
# MODELS; used as the keys of the "results" section in every result JSON.
TASK_KEYS = [
    "deep_research",
    "idea_generation",
    "dry_experiment",
    "wet_experiment",
    "experimental_reasoning",
]
|
|
|
|
|
|
|
|
def pct_to_decimal(p):
    """Convert a percentage (e.g. 15.74) to a 0-1 fraction rounded to 6 places."""
    fraction = p / 100.0
    return round(fraction, 6)
|
|
|
|
|
def ensure_dir(p):
    """Create directory *p* (including parents); a no-op if it already exists."""
    if os.path.isdir(p):
        return
    # exist_ok guards against a concurrent creator between the check and here.
    os.makedirs(p, exist_ok=True)
|
|
|
|
|
def write_result_json(org, model, scores):
    """Write a leaderboard-style result JSON for one model.

    Creates ``EVAL_RESULTS_PATH/<org>/<model>/results_<timestamp>.json`` with a
    "config" section (dtype, full model name, empty sha) and a "results"
    section holding one ``{"acc": <fraction>}`` entry per task in TASK_KEYS.

    Args:
        org: Organization name; forms both the "<org>/<model>" identifier and
            the output subdirectory.
        model: Model name.
        scores: Percentage scores, one per TASK_KEYS entry, in the same order.

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If ``scores`` does not contain exactly one value per task
            key. (Previously zip() silently truncated a mismatch, producing a
            valid-looking but incomplete result file.)
    """
    if len(scores) != len(TASK_KEYS):
        raise ValueError(
            f"expected {len(TASK_KEYS)} scores for {org}/{model}, got {len(scores)}"
        )

    model_full = f"{org}/{model}"
    model_dir = os.path.join(EVAL_RESULTS_PATH, org, model)
    ensure_dir(model_dir)

    cfg = {
        "model_dtype": "float16",
        "model_name": model_full,
        "model_sha": "",
    }

    # One accuracy entry per task, converted from percentage to a 0-1 decimal.
    results = {
        key: {"acc": pct_to_decimal(score)}
        for key, score in zip(TASK_KEYS, scores)
    }

    payload = {
        "config": cfg,
        "results": results,
    }

    # Timestamped filename so repeated runs never overwrite earlier results.
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = os.path.join(model_dir, f"results_{ts}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    return out_path
|
|
|
|
|
def write_request_json(org, model, model_type):
    """Write a leaderboard-style eval request JSON for one model.

    Creates ``EVAL_REQUESTS_PATH/<org>/<model>_eval_request_False_float16_Original.json``
    describing an already-FINISHED submission.

    Args:
        org: Organization name (also the output subdirectory).
        model: Model name.
        model_type: "Open" gets the Open label; any other value gets Closed.

    Returns:
        Path of the JSON file that was written.
    """
    target_dir = os.path.join(EVAL_REQUESTS_PATH, org)
    ensure_dir(target_dir)

    full_name = f"{org}/{model}"
    submitted_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # NOTE(review): the "π" prefix looks like a mis-encoded marker glyph
    # (leaderboard type labels usually carry an emoji here) — confirm the
    # intended character before changing it; kept byte-for-byte for now.
    if model_type == "Open":
        type_label = "π : Open"
    else:
        type_label = "π : Closed"

    entry = {
        "model": full_name,
        "base_model": "",
        "revision": "main",
        "precision": "float16",
        "weight_type": "Original",
        "status": "FINISHED",
        "submitted_time": submitted_at,
        "model_type": type_label,
        "likes": 0,
        "params": 0,
        "license": "?",
        "private": False,
    }

    file_name = f"{model}_eval_request_False_float16_Original.json"
    destination = os.path.join(target_dir, file_name)
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(entry, handle, ensure_ascii=False, indent=2)
    return destination
|
|
|
|
|
def main():
    """Generate one result JSON and one request JSON for every model in MODELS."""
    org = "sgi-bench"
    ensure_dir(EVAL_RESULTS_PATH)
    ensure_dir(EVAL_REQUESTS_PATH)

    # Accumulate (result_path, request_path) pairs, one per model, in order.
    generated = []
    for entry in MODELS:
        result_file = write_result_json(org, entry["name"], entry["scores"])
        request_file = write_request_json(org, entry["name"], entry["type"])
        generated.append((result_file, request_file))

    print("Generated result JSONs:")
    for result_file, _ in generated:
        print(" -", result_file)
    print("Generated request JSONs:")
    for _, request_file in generated:
        print(" -", request_file)


if __name__ == "__main__":
    main()
|
|
|