bensondccnqwc commited on
Commit
47dd5aa
·
verified ·
1 Parent(s): 9b1fbcc

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  2. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  5. eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  7. eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  9. eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  11. eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  13. eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  15. eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  17. eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  19. eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  21. eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  23. eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  25. eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  27. eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  28. eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  29. eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  30. eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  31. eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  32. eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  33. eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  34. eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  35. eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  36. eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  37. eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  38. eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  39. eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  40. eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  41. eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  42. eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  43. eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  44. eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  45. eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  46. eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  47. eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  48. eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  49. eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  50. eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 61.8,
7
+ "pass_acc": 61.8,
8
+ "pass@k": {
9
+ "1": 61.8
10
+ },
11
+ "time_use_in_second": 209.1798279285431,
12
+ "time_use_in_minite": "3:29"
13
+ }
eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 1,
6
+ "acc": 23.2,
7
+ "pass_acc": 23.2,
8
+ "pass@k": {
9
+ "1": 23.2
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 47.9,
13
+ "Dynamics and Control (2.003 Spring 2005)": 38.5,
14
+ "Ecology I (1.018J Fall 2009)": 0.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 15.1,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 11.3,
18
+ "Physical Chemistry (5.61 Fall 2017)": 0.0,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 44.4,
20
+ "Relativity (8.033 Fall 2006)": 18.2
21
+ },
22
+ "time_use_in_second": 163.57887530326843,
23
+ "time_use_in_minite": "2:43"
24
+ }
eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 3,
5
+ "empty_samples": 7,
6
+ "acc": 46.5,
7
+ "pass_acc": 46.5,
8
+ "pass@k": {
9
+ "1": 46.5
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 31.0,
13
+ "astronomy": 53.9,
14
+ "college_biology": 57.6,
15
+ "college_chemistry": 41.0,
16
+ "college_computer_science": 43.0,
17
+ "college_mathematics": 32.0,
18
+ "college_physics": 36.3,
19
+ "computer_security": 50.0,
20
+ "conceptual_physics": 56.6,
21
+ "electrical_engineering": 57.9,
22
+ "elementary_mathematics": 41.8,
23
+ "high_school_biology": 66.8,
24
+ "high_school_chemistry": 47.8,
25
+ "high_school_computer_science": 50.0,
26
+ "high_school_mathematics": 20.4,
27
+ "high_school_physics": 41.7,
28
+ "high_school_statistics": 48.1,
29
+ "machine_learning": 46.4
30
+ },
31
+ "time_use_in_second": 424.80005264282227,
32
+ "time_use_in_minite": "7:04"
33
+ }
eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 22.2,
7
+ "pass_acc": 22.2,
8
+ "pass@k": {
9
+ "1": 22.2
10
+ },
11
+ "time_use_in_second": 370.25228238105774,
12
+ "time_use_in_minite": "6:10"
13
+ }
eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3
10
+ },
11
+ "time_use_in_second": 39.19697904586792,
12
+ "time_use_in_minite": "0:39"
13
+ }
eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 0.0,
7
+ "pass_acc": 0.0,
8
+ "pass@k": {
9
+ "1": 0.0
10
+ },
11
+ "time_use_in_second": 145.47153449058533,
12
+ "time_use_in_minite": "2:25"
13
+ }
eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 32.5,
7
+ "pass_acc": 32.5,
8
+ "pass@k": {
9
+ "1": 32.5
10
+ },
11
+ "time_use_in_second": 146.0725929737091,
12
+ "time_use_in_minite": "2:26"
13
+ }
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 79.9,
7
+ "pass_acc": 79.9,
8
+ "pass@k": {
9
+ "1": 79.9
10
+ },
11
+ "time_use_in_second": 211.3379385471344,
12
+ "time_use_in_minite": "3:31"
13
+ }
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 62.2,
7
+ "pass_acc": 62.2,
8
+ "pass@k": {
9
+ "1": 62.2
10
+ },
11
+ "time_use_in_second": 205.48666501045227,
12
+ "time_use_in_minite": "3:25"
13
+ }
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 3,
6
+ "acc": 22.4,
7
+ "pass_acc": 22.4,
8
+ "pass@k": {
9
+ "1": 22.4
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 43.8,
13
+ "Dynamics and Control (2.003 Spring 2005)": 42.3,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 66.7,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 7.5,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 11.3,
18
+ "Physical Chemistry (5.61 Fall 2017)": 0.0,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 50.0,
20
+ "Relativity (8.033 Fall 2006)": 9.1
21
+ },
22
+ "time_use_in_second": 169.33431148529053,
23
+ "time_use_in_minite": "2:49"
24
+ }
eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 4,
5
+ "empty_samples": 8,
6
+ "acc": 48.5,
7
+ "pass_acc": 48.5,
8
+ "pass@k": {
9
+ "1": 48.5
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 33.0,
13
+ "astronomy": 59.2,
14
+ "college_biology": 60.4,
15
+ "college_chemistry": 41.0,
16
+ "college_computer_science": 48.0,
17
+ "college_mathematics": 37.0,
18
+ "college_physics": 41.2,
19
+ "computer_security": 53.0,
20
+ "conceptual_physics": 61.7,
21
+ "electrical_engineering": 54.5,
22
+ "elementary_mathematics": 49.7,
23
+ "high_school_biology": 63.2,
24
+ "high_school_chemistry": 47.3,
25
+ "high_school_computer_science": 58.0,
26
+ "high_school_mathematics": 17.4,
27
+ "high_school_physics": 41.7,
28
+ "high_school_statistics": 50.5,
29
+ "machine_learning": 45.5
30
+ },
31
+ "time_use_in_second": 467.60332226753235,
32
+ "time_use_in_minite": "7:47"
33
+ }
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 24.9,
7
+ "pass_acc": 24.9,
8
+ "pass@k": {
9
+ "1": 24.9
10
+ },
11
+ "time_use_in_second": 352.9934434890747,
12
+ "time_use_in_minite": "5:52"
13
+ }
eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 6.7
10
+ },
11
+ "time_use_in_second": 149.70813250541687,
12
+ "time_use_in_minite": "2:29"
13
+ }
eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3
10
+ },
11
+ "time_use_in_second": 146.4937720298767,
12
+ "time_use_in_minite": "2:26"
13
+ }
eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 27.5,
7
+ "pass_acc": 27.5,
8
+ "pass@k": {
9
+ "1": 27.5
10
+ },
11
+ "time_use_in_second": 153.41063284873962,
12
+ "time_use_in_minite": "2:33"
13
+ }
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 80.2,
7
+ "pass_acc": 80.2,
8
+ "pass@k": {
9
+ "1": 80.2
10
+ },
11
+ "time_use_in_second": 194.10877060890198,
12
+ "time_use_in_minite": "3:14"
13
+ }
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 62.8,
7
+ "pass_acc": 62.8,
8
+ "pass@k": {
9
+ "1": 62.8
10
+ },
11
+ "time_use_in_second": 194.2277638912201,
12
+ "time_use_in_minite": "3:14"
13
+ }
eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 27.9,
7
+ "pass_acc": 27.9,
8
+ "pass@k": {
9
+ "1": 27.9
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 54.2,
13
+ "Dynamics and Control (2.003 Spring 2005)": 46.2,
14
+ "Ecology I (1.018J Fall 2009)": 60.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 0.0,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 15.1,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 16.5,
18
+ "Physical Chemistry (5.61 Fall 2017)": 9.1,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 44.4,
20
+ "Relativity (8.033 Fall 2006)": 18.2
21
+ },
22
+ "time_use_in_second": 174.37697458267212,
23
+ "time_use_in_minite": "2:54"
24
+ }
eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 5,
6
+ "acc": 52.0,
7
+ "pass_acc": 52.0,
8
+ "pass@k": {
9
+ "1": 52.0
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 34.0,
13
+ "astronomy": 65.1,
14
+ "college_biology": 63.9,
15
+ "college_chemistry": 46.0,
16
+ "college_computer_science": 48.0,
17
+ "college_mathematics": 38.0,
18
+ "college_physics": 50.0,
19
+ "computer_security": 46.0,
20
+ "conceptual_physics": 60.9,
21
+ "electrical_engineering": 55.2,
22
+ "elementary_mathematics": 53.4,
23
+ "high_school_biology": 67.4,
24
+ "high_school_chemistry": 56.7,
25
+ "high_school_computer_science": 67.0,
26
+ "high_school_mathematics": 21.9,
27
+ "high_school_physics": 48.3,
28
+ "high_school_statistics": 50.9,
29
+ "machine_learning": 50.0
30
+ },
31
+ "time_use_in_second": 409.8589496612549,
32
+ "time_use_in_minite": "6:49"
33
+ }
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 25.3,
7
+ "pass_acc": 25.3,
8
+ "pass@k": {
9
+ "1": 25.3
10
+ },
11
+ "time_use_in_second": 418.00734639167786,
12
+ "time_use_in_minite": "6:58"
13
+ }
eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 6.7
10
+ },
11
+ "time_use_in_second": 152.48893404006958,
12
+ "time_use_in_minite": "2:32"
13
+ }