johndoe123345 commited on
Commit
5ad5580
·
verified ·
1 Parent(s): 8eb3cce

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  5. eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  7. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  9. eval_results/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  11. eval_results/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  13. eval_results/global_step_15/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results/global_step_15/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  15. eval_results/global_step_15/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results/global_step_15/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  17. eval_results/global_step_15/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results/global_step_15/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  19. eval_results/global_step_15/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results/global_step_15/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  21. eval_results/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  23. eval_results/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  25. eval_results/global_step_25/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results/global_step_25/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  27. eval_results/global_step_25/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  28. eval_results/global_step_25/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  29. eval_results/global_step_25/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  30. eval_results/global_step_25/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  31. eval_results/global_step_25/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  32. eval_results/global_step_25/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  33. eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  34. eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  35. eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  36. eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  37. eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  38. eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  39. eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  40. eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  41. eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  42. eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  43. eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  44. eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  45. eval_results/global_step_35/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  46. eval_results/global_step_35/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  47. eval_results/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  48. eval_results/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  49. eval_results/global_step_35/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  50. eval_results/global_step_35/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
.gitattributes CHANGED
@@ -47,3 +47,7 @@ global_step_115/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -
47
  global_step_75/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
  global_step_55/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
  global_step_15/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
47
  global_step_75/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
  global_step_55/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
  global_step_15/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ global_step_30/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ global_step_95/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ global_step_100/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ global_step_40/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 20,
6
+ "acc": 75.7,
7
+ "pass_acc": 75.7,
8
+ "pass@k": {
9
+ "1": 75.7
10
+ },
11
+ "time_use_in_second": 125.95953345298767,
12
+ "time_use_in_minite": "2:05"
13
+ }
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 9,
6
+ "acc": 52.2,
7
+ "pass_acc": 52.2,
8
+ "pass@k": {
9
+ "1": 52.2
10
+ },
11
+ "time_use_in_second": 94.54320168495178,
12
+ "time_use_in_minite": "1:34"
13
+ }
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 18,
6
+ "acc": 20.6,
7
+ "pass_acc": 20.6,
8
+ "pass@k": {
9
+ "1": 20.6
10
+ },
11
+ "time_use_in_second": 200.8112862110138,
12
+ "time_use_in_minite": "3:20"
13
+ }
eval_results/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 13.3,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 13.3
10
+ },
11
+ "time_use_in_second": 34.98631310462952,
12
+ "time_use_in_minite": "0:34"
13
+ }
eval_results/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 52.5,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 52.5
10
+ },
11
+ "time_use_in_second": 68.07727694511414,
12
+ "time_use_in_minite": "1:08"
13
+ }
eval_results/global_step_15/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_15/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 90.2,
7
+ "pass_acc": 90.2,
8
+ "pass@k": {
9
+ "1": 90.2
10
+ },
11
+ "time_use_in_second": 63.200294494628906,
12
+ "time_use_in_minite": "1:03"
13
+ }
eval_results/global_step_15/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_15/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 1,
6
+ "acc": 72.4,
7
+ "pass_acc": 72.4,
8
+ "pass@k": {
9
+ "1": 72.4
10
+ },
11
+ "time_use_in_second": 57.943002700805664,
12
+ "time_use_in_minite": "0:57"
13
+ }
eval_results/global_step_15/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_15/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 35.7,
7
+ "pass_acc": 35.7,
8
+ "pass@k": {
9
+ "1": 35.7
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 64.6,
13
+ "Dynamics and Control (2.003 Spring 2005)": 46.2,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 28.3,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 21.6,
18
+ "Physical Chemistry (5.61 Fall 2017)": 9.1,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 55.6,
20
+ "Relativity (8.033 Fall 2006)": 36.4
21
+ },
22
+ "time_use_in_second": 84.02002787590027,
23
+ "time_use_in_minite": "1:24"
24
+ }
eval_results/global_step_15/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_15/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 36.3,
7
+ "pass_acc": 36.3,
8
+ "pass@k": {
9
+ "1": 36.3
10
+ },
11
+ "time_use_in_second": 138.21381092071533,
12
+ "time_use_in_minite": "2:18"
13
+ }
eval_results/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 10.0,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 10.0
10
+ },
11
+ "time_use_in_second": 68.03684830665588,
12
+ "time_use_in_minite": "1:08"
13
+ }
eval_results/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 52.5,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 52.5
10
+ },
11
+ "time_use_in_second": 30.228216886520386,
12
+ "time_use_in_minite": "0:30"
13
+ }
eval_results/global_step_25/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_25/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 91.3,
7
+ "pass_acc": 91.3,
8
+ "pass@k": {
9
+ "1": 91.3
10
+ },
11
+ "time_use_in_second": 61.750991344451904,
12
+ "time_use_in_minite": "1:01"
13
+ }
eval_results/global_step_25/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_25/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 73.4,
7
+ "pass_acc": 73.4,
8
+ "pass@k": {
9
+ "1": 73.4
10
+ },
11
+ "time_use_in_second": 102.64481163024902,
12
+ "time_use_in_minite": "1:42"
13
+ }
eval_results/global_step_25/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_25/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 32.7,
7
+ "pass_acc": 32.7,
8
+ "pass@k": {
9
+ "1": 32.7
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 58.3,
13
+ "Dynamics and Control (2.003 Spring 2005)": 53.8,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 13.2,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 23.7,
18
+ "Physical Chemistry (5.61 Fall 2017)": 9.1,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 55.6,
20
+ "Relativity (8.033 Fall 2006)": 27.3
21
+ },
22
+ "time_use_in_second": 36.20563268661499,
23
+ "time_use_in_minite": "0:36"
24
+ }
eval_results/global_step_25/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_25/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 38.5,
7
+ "pass_acc": 38.5,
8
+ "pass@k": {
9
+ "1": 38.5
10
+ },
11
+ "time_use_in_second": 137.9393606185913,
12
+ "time_use_in_minite": "2:17"
13
+ }
eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 16.7,
7
+ "pass_acc": 16.7,
8
+ "pass@k": {
9
+ "1": 16.7
10
+ },
11
+ "time_use_in_second": 67.79033923149109,
12
+ "time_use_in_minite": "1:07"
13
+ }
eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 52.5,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 52.5
10
+ },
11
+ "time_use_in_second": 22.189570665359497,
12
+ "time_use_in_minite": "0:22"
13
+ }
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 89.9,
7
+ "pass_acc": 89.9,
8
+ "pass@k": {
9
+ "1": 89.9
10
+ },
11
+ "time_use_in_second": 110.01273012161255,
12
+ "time_use_in_minite": "1:50"
13
+ }
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 76.0,
7
+ "pass_acc": 76.0,
8
+ "pass@k": {
9
+ "1": 76.0
10
+ },
11
+ "time_use_in_second": 101.99508786201477,
12
+ "time_use_in_minite": "1:41"
13
+ }
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 33.8,
7
+ "pass_acc": 33.8,
8
+ "pass@k": {
9
+ "1": 33.8
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 62.5,
13
+ "Dynamics and Control (2.003 Spring 2005)": 46.2,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 20.8,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 22.7,
18
+ "Physical Chemistry (5.61 Fall 2017)": 9.1,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 50.0,
20
+ "Relativity (8.033 Fall 2006)": 36.4
21
+ },
22
+ "time_use_in_second": 36.29343605041504,
23
+ "time_use_in_minite": "0:36"
24
+ }
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 36.6,
7
+ "pass_acc": 36.6,
8
+ "pass@k": {
9
+ "1": 36.6
10
+ },
11
+ "time_use_in_second": 140.6587872505188,
12
+ "time_use_in_minite": "2:20"
13
+ }
eval_results/global_step_35/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_35/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 13.3,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 13.3
10
+ },
11
+ "time_use_in_second": 66.62974858283997,
12
+ "time_use_in_minite": "1:06"
13
+ }
eval_results/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 55.0,
7
+ "pass_acc": 55.0,
8
+ "pass@k": {
9
+ "1": 55.0
10
+ },
11
+ "time_use_in_second": 19.278455018997192,
12
+ "time_use_in_minite": "0:19"
13
+ }
eval_results/global_step_35/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_35/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 91.3,
7
+ "pass_acc": 91.3,
8
+ "pass@k": {
9
+ "1": 91.3
10
+ },
11
+ "time_use_in_second": 64.08755540847778,
12
+ "time_use_in_minite": "1:04"
13
+ }