unknown committed on
Commit
33e428a
·
1 Parent(s): 81aa3ec
Files changed (37) hide show
  1. .gitignore +4 -4
  2. eval-queue/sgi-bench/Claude-Opus-4.1_eval_request_False_float16_Original.json +14 -0
  3. eval-queue/sgi-bench/Claude-Sonnet-4.5_eval_request_False_float16_Original.json +14 -0
  4. eval-queue/sgi-bench/GPT-4.1_eval_request_False_float16_Original.json +14 -0
  5. eval-queue/sgi-bench/GPT-4o_eval_request_False_float16_Original.json +14 -0
  6. eval-queue/sgi-bench/GPT-5.1_eval_request_False_float16_Original.json +14 -0
  7. eval-queue/sgi-bench/GPT-5_eval_request_False_float16_Original.json +14 -0
  8. eval-queue/sgi-bench/Gemini-2.5-Flash_eval_request_False_float16_Original.json +14 -0
  9. eval-queue/sgi-bench/Gemini-2.5-Pro_eval_request_False_float16_Original.json +14 -0
  10. eval-queue/sgi-bench/Gemini-3-Pro_eval_request_False_float16_Original.json +14 -0
  11. eval-queue/sgi-bench/Grok-4_eval_request_False_float16_Original.json +14 -0
  12. eval-queue/sgi-bench/Intern-S1-mini_eval_request_False_float16_Original.json +14 -0
  13. eval-queue/sgi-bench/Intern-S1_eval_request_False_float16_Original.json +14 -0
  14. eval-queue/sgi-bench/Llama-4-Scout_eval_request_False_float16_Original.json +14 -0
  15. eval-queue/sgi-bench/Qwen3-8B_eval_request_False_float16_Original.json +14 -0
  16. eval-queue/sgi-bench/Qwen3-Max_eval_request_False_float16_Original.json +14 -0
  17. eval-queue/sgi-bench/Qwen3-VL-235B-A22B_eval_request_False_float16_Original.json +14 -0
  18. eval-queue/sgi-bench/o3_eval_request_False_float16_Original.json +14 -0
  19. eval-queue/sgi-bench/o4-mini_eval_request_False_float16_Original.json +14 -0
  20. eval-results/sgi-bench/Claude-Opus-4.1/results_20251203T061115Z.json +24 -0
  21. eval-results/sgi-bench/Claude-Sonnet-4.5/results_20251203T061115Z.json +24 -0
  22. eval-results/sgi-bench/GPT-4.1/results_20251203T061115Z.json +24 -0
  23. eval-results/sgi-bench/GPT-4o/results_20251203T061115Z.json +24 -0
  24. eval-results/sgi-bench/GPT-5.1/results_20251203T061115Z.json +24 -0
  25. eval-results/sgi-bench/GPT-5/results_20251203T061115Z.json +24 -0
  26. eval-results/sgi-bench/Gemini-2.5-Flash/results_20251203T061115Z.json +24 -0
  27. eval-results/sgi-bench/Gemini-2.5-Pro/results_20251203T061115Z.json +24 -0
  28. eval-results/sgi-bench/Gemini-3-Pro/results_20251203T061115Z.json +24 -0
  29. eval-results/sgi-bench/Grok-4/results_20251203T061115Z.json +24 -0
  30. eval-results/sgi-bench/Intern-S1-mini/results_20251203T061115Z.json +24 -0
  31. eval-results/sgi-bench/Intern-S1/results_20251203T061115Z.json +24 -0
  32. eval-results/sgi-bench/Llama-4-Scout/results_20251203T061115Z.json +24 -0
  33. eval-results/sgi-bench/Qwen3-8B/results_20251203T061115Z.json +24 -0
  34. eval-results/sgi-bench/Qwen3-Max/results_20251203T061115Z.json +24 -0
  35. eval-results/sgi-bench/Qwen3-VL-235B-A22B/results_20251203T061115Z.json +24 -0
  36. eval-results/sgi-bench/o3/results_20251203T061115Z.json +24 -0
  37. eval-results/sgi-bench/o4-mini/results_20251203T061115Z.json +24 -0
.gitignore CHANGED
@@ -6,8 +6,8 @@ __pycache__/
6
  *ipynb
7
  .vscode/
8
 
9
- eval-queue/
10
- eval-results/
11
- eval-queue-bk/
12
- eval-results-bk/
13
  logs/
 
6
  *ipynb
7
  .vscode/
8
 
9
+ # eval-queue/
10
+ # eval-results/
11
+ # eval-queue-bk/
12
+ # eval-results-bk/
13
  logs/
eval-queue/sgi-bench/Claude-Opus-4.1_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Claude-Opus-4.1",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Claude-Sonnet-4.5_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Claude-Sonnet-4.5",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/GPT-4.1_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/GPT-4.1",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/GPT-4o_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/GPT-4o",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/GPT-5.1_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/GPT-5.1",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/GPT-5_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/GPT-5",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Gemini-2.5-Flash_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Gemini-2.5-Flash",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Gemini-2.5-Pro_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Gemini-2.5-Pro",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Gemini-3-Pro_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Gemini-3-Pro",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Grok-4_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Grok-4",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Intern-S1-mini_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Intern-S1-mini",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔓 : Open",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Intern-S1_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Intern-S1",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔓 : Open",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Llama-4-Scout_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Llama-4-Scout",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔓 : Open",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Qwen3-8B_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Qwen3-8B",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔓 : Open",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Qwen3-Max_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Qwen3-Max",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔓 : Open",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/Qwen3-VL-235B-A22B_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/Qwen3-VL-235B-A22B",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔓 : Open",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/o3_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/o3",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-queue/sgi-bench/o4-mini_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sgi-bench/o4-mini",
3
+ "base_model": "",
4
+ "revision": "main",
5
+ "precision": "float16",
6
+ "weight_type": "Original",
7
+ "status": "FINISHED",
8
+ "submitted_time": "2025-12-03T06:11:15Z",
9
+ "model_type": "🔒 : Closed",
10
+ "likes": 0,
11
+ "params": 0,
12
+ "license": "?",
13
+ "private": false
14
+ }
eval-results/sgi-bench/Claude-Opus-4.1/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Claude-Opus-4.1",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1293
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.4029
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.3469
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.2538
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3883
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Claude-Sonnet-4.5/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Claude-Sonnet-4.5",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1384
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.432
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.3579
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.3015
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.378
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/GPT-4.1/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/GPT-4.1",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1132
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3649
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.3432
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.3663
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3849
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/GPT-4o/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/GPT-4o",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.0786
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3595
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.2694
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.3131
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.323
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/GPT-5.1/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/GPT-5.1",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1164
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.4712
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.31
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.2277
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3402
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/GPT-5/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/GPT-5",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1447
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.554
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.2989
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.1631
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3814
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Gemini-2.5-Flash/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Gemini-2.5-Flash",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1069
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3913
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.2103
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.1855
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3436
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Gemini-2.5-Pro/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Gemini-2.5-Pro",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1509
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3995
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.2251
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.2205
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.4124
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Gemini-3-Pro/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Gemini-3-Pro",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1848
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3968
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.3664
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.3245
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.4192
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Grok-4/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Grok-4",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1331
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3712
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.3371
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.2901
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3024
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Intern-S1-mini/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Intern-S1-mini",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1106
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3604
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.1697
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.1242
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.1684
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Intern-S1/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Intern-S1",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1574
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3809
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.2879
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.2902
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.2887
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Llama-4-Scout/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Llama-4-Scout",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.0786
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.2972
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.2037
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.2166
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.2577
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Qwen3-8B/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Qwen3-8B",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.0818
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3578
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.1845
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.0996
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.2337
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Qwen3-Max/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Qwen3-Max",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1538
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3983
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.3321
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.3362
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.378
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/Qwen3-VL-235B-A22B/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/Qwen3-VL-235B-A22B",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1197
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.3928
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.2841
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.303
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3162
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/o3/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/o3",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1289
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.4607
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.3173
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.3004
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3265
22
+ }
23
+ }
24
+ }
eval-results/sgi-bench/o4-mini/results_20251203T061115Z.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_dtype": "float16",
4
+ "model_name": "sgi-bench/o4-mini",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "deep_research": {
9
+ "acc": 0.1195
10
+ },
11
+ "idea_generation": {
12
+ "acc": 0.4078
13
+ },
14
+ "dry_experiment": {
15
+ "acc": 0.3579
16
+ },
17
+ "wet_experiment": {
18
+ "acc": 0.2886
19
+ },
20
+ "experimental_reasoning": {
21
+ "acc": 0.3333
22
+ }
23
+ }
24
+ }