unknown
committed on
Commit
·
33e428a
1
Parent(s):
81aa3ec
update
Browse files- .gitignore +4 -4
- eval-queue/sgi-bench/Claude-Opus-4.1_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Claude-Sonnet-4.5_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/GPT-4.1_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/GPT-4o_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/GPT-5.1_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/GPT-5_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Gemini-2.5-Flash_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Gemini-2.5-Pro_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Gemini-3-Pro_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Grok-4_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Intern-S1-mini_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Intern-S1_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Llama-4-Scout_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Qwen3-8B_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Qwen3-Max_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/Qwen3-VL-235B-A22B_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/o3_eval_request_False_float16_Original.json +14 -0
- eval-queue/sgi-bench/o4-mini_eval_request_False_float16_Original.json +14 -0
- eval-results/sgi-bench/Claude-Opus-4.1/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Claude-Sonnet-4.5/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/GPT-4.1/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/GPT-4o/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/GPT-5.1/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/GPT-5/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Gemini-2.5-Flash/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Gemini-2.5-Pro/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Gemini-3-Pro/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Grok-4/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Intern-S1-mini/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Intern-S1/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Llama-4-Scout/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Qwen3-8B/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Qwen3-Max/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/Qwen3-VL-235B-A22B/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/o3/results_20251203T061115Z.json +24 -0
- eval-results/sgi-bench/o4-mini/results_20251203T061115Z.json +24 -0
.gitignore
CHANGED
|
@@ -6,8 +6,8 @@ __pycache__/
|
|
| 6 |
*ipynb
|
| 7 |
.vscode/
|
| 8 |
|
| 9 |
-
eval-queue/
|
| 10 |
-
eval-results/
|
| 11 |
-
eval-queue-bk/
|
| 12 |
-
eval-results-bk/
|
| 13 |
logs/
|
|
|
|
| 6 |
*ipynb
|
| 7 |
.vscode/
|
| 8 |
|
| 9 |
+
# eval-queue/
|
| 10 |
+
# eval-results/
|
| 11 |
+
# eval-queue-bk/
|
| 12 |
+
# eval-results-bk/
|
| 13 |
logs/
|
eval-queue/sgi-bench/Claude-Opus-4.1_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Claude-Opus-4.1",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Claude-Sonnet-4.5_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Claude-Sonnet-4.5",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/GPT-4.1_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/GPT-4.1",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/GPT-4o_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/GPT-4o",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/GPT-5.1_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/GPT-5.1",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/GPT-5_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/GPT-5",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Gemini-2.5-Flash_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Gemini-2.5-Flash",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Gemini-2.5-Pro_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Gemini-2.5-Pro",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Gemini-3-Pro_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Gemini-3-Pro",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Grok-4_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Grok-4",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Intern-S1-mini_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Intern-S1-mini",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔓 : Open",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Intern-S1_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Intern-S1",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔓 : Open",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Llama-4-Scout_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Llama-4-Scout",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔓 : Open",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Qwen3-8B_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Qwen3-8B",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔓 : Open",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Qwen3-Max_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Qwen3-Max",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔓 : Open",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/Qwen3-VL-235B-A22B_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/Qwen3-VL-235B-A22B",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔓 : Open",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/o3_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/o3",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-queue/sgi-bench/o4-mini_eval_request_False_float16_Original.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "sgi-bench/o4-mini",
|
| 3 |
+
"base_model": "",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"precision": "float16",
|
| 6 |
+
"weight_type": "Original",
|
| 7 |
+
"status": "FINISHED",
|
| 8 |
+
"submitted_time": "2025-12-03T06:11:15Z",
|
| 9 |
+
"model_type": "🔒 : Closed",
|
| 10 |
+
"likes": 0,
|
| 11 |
+
"params": 0,
|
| 12 |
+
"license": "?",
|
| 13 |
+
"private": false
|
| 14 |
+
}
|
eval-results/sgi-bench/Claude-Opus-4.1/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Claude-Opus-4.1",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1293
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.4029
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.3469
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.2538
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3883
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Claude-Sonnet-4.5/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Claude-Sonnet-4.5",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1384
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.432
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.3579
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.3015
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.378
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/GPT-4.1/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/GPT-4.1",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1132
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3649
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.3432
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.3663
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3849
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/GPT-4o/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/GPT-4o",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.0786
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3595
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.2694
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.3131
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.323
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/GPT-5.1/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/GPT-5.1",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1164
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.4712
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.31
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.2277
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3402
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/GPT-5/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/GPT-5",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1447
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.554
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.2989
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.1631
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3814
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Gemini-2.5-Flash/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Gemini-2.5-Flash",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1069
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3913
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.2103
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.1855
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3436
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Gemini-2.5-Pro/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Gemini-2.5-Pro",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1509
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3995
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.2251
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.2205
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.4124
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Gemini-3-Pro/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Gemini-3-Pro",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1848
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3968
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.3664
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.3245
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.4192
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Grok-4/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Grok-4",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1331
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3712
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.3371
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.2901
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3024
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Intern-S1-mini/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Intern-S1-mini",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1106
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3604
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.1697
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.1242
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.1684
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Intern-S1/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Intern-S1",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1574
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3809
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.2879
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.2902
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.2887
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Llama-4-Scout/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Llama-4-Scout",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.0786
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.2972
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.2037
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.2166
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.2577
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Qwen3-8B/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Qwen3-8B",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.0818
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3578
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.1845
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.0996
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.2337
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Qwen3-Max/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Qwen3-Max",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1538
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3983
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.3321
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.3362
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.378
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/Qwen3-VL-235B-A22B/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/Qwen3-VL-235B-A22B",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1197
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.3928
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.2841
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.303
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3162
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/o3/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/o3",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1289
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.4607
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.3173
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.3004
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3265
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
eval-results/sgi-bench/o4-mini/results_20251203T061115Z.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config": {
|
| 3 |
+
"model_dtype": "float16",
|
| 4 |
+
"model_name": "sgi-bench/o4-mini",
|
| 5 |
+
"model_sha": ""
|
| 6 |
+
},
|
| 7 |
+
"results": {
|
| 8 |
+
"deep_research": {
|
| 9 |
+
"acc": 0.1195
|
| 10 |
+
},
|
| 11 |
+
"idea_generation": {
|
| 12 |
+
"acc": 0.4078
|
| 13 |
+
},
|
| 14 |
+
"dry_experiment": {
|
| 15 |
+
"acc": 0.3579
|
| 16 |
+
},
|
| 17 |
+
"wet_experiment": {
|
| 18 |
+
"acc": 0.2886
|
| 19 |
+
},
|
| 20 |
+
"experimental_reasoning": {
|
| 21 |
+
"acc": 0.3333
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|