bensondccnqwc commited on
Commit
e3edc3d
·
verified ·
1 Parent(s): 4041846

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  2. eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  5. eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  7. eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  9. eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  11. eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  13. eval_results_avg4/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results_avg4/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  15. eval_results_avg4/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results_avg4/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  17. eval_results_avg4/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results_avg4/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  19. eval_results_avg4/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results_avg4/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  21. eval_results_avg4/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results_avg4/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  23. eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  25. eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  27. eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  28. eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  29. eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  30. eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  31. eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  32. eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  33. eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  34. eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  35. eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  36. eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  37. eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  38. eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  39. eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  40. eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  41. eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  42. eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  43. eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  44. eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  45. eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  46. eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  47. eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  48. eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  49. eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  50. eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 38.1,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 38.1,
10
+ "2": 48.3,
11
+ "4": 60.0
12
+ },
13
+ "time_use_in_second": 168.34224152565002,
14
+ "time_use_in_minite": "2:48"
15
+ }
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 6.7,
10
+ "2": 8.3,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 215.86974668502808,
14
+ "time_use_in_minite": "3:35"
15
+ }
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 3.3,
10
+ "2": 6.1,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 199.11678457260132,
14
+ "time_use_in_minite": "3:19"
15
+ }
eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 37.5,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 37.5,
10
+ "2": 48.3,
11
+ "4": 60.0
12
+ },
13
+ "time_use_in_second": 172.48754286766052,
14
+ "time_use_in_minite": "2:52"
15
+ }
eval_results_avg4/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 5.0,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 5.0,
10
+ "2": 7.8,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 211.78608345985413,
14
+ "time_use_in_minite": "3:31"
15
+ }
eval_results_avg4/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 1.7,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 1.7,
10
+ "2": 2.8,
11
+ "4": 3.3
12
+ },
13
+ "time_use_in_second": 195.0695161819458,
14
+ "time_use_in_minite": "3:15"
15
+ }
eval_results_avg4/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 35.0,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 35.0,
10
+ "2": 47.5,
11
+ "4": 60.0
12
+ },
13
+ "time_use_in_second": 183.63848447799683,
14
+ "time_use_in_minite": "3:03"
15
+ }
eval_results_avg4/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 8.3,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 8.3,
10
+ "2": 14.4,
11
+ "4": 23.3
12
+ },
13
+ "time_use_in_second": 188.62129831314087,
14
+ "time_use_in_minite": "3:08"
15
+ }
eval_results_avg4/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 1.7,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 1.7,
10
+ "2": 2.8,
11
+ "4": 3.3
12
+ },
13
+ "time_use_in_second": 166.4781849384308,
14
+ "time_use_in_minite": "2:46"
15
+ }
eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 36.2,
7
+ "pass_acc": 62.5,
8
+ "pass@k": {
9
+ "1": 36.2,
10
+ "2": 49.2,
11
+ "4": 62.5
12
+ },
13
+ "time_use_in_second": 172.03055691719055,
14
+ "time_use_in_minite": "2:52"
15
+ }
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 10.0,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 10.0,
10
+ "2": 16.7,
11
+ "4": 23.3
12
+ },
13
+ "time_use_in_second": 189.44807291030884,
14
+ "time_use_in_minite": "3:09"
15
+ }
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 1.7,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 1.7,
10
+ "2": 2.8,
11
+ "4": 3.3
12
+ },
13
+ "time_use_in_second": 202.21822237968445,
14
+ "time_use_in_minite": "3:22"
15
+ }
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 39.4,
7
+ "pass_acc": 62.5,
8
+ "pass@k": {
9
+ "1": 39.4,
10
+ "2": 52.1,
11
+ "4": 62.5
12
+ },
13
+ "time_use_in_second": 175.33796977996826,
14
+ "time_use_in_minite": "2:55"
15
+ }
eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 5.0,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 5.0,
10
+ "2": 7.8,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 201.41547870635986,
14
+ "time_use_in_minite": "3:21"
15
+ }
eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 4.2,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 4.2,
10
+ "2": 7.8,
11
+ "4": 13.3
12
+ },
13
+ "time_use_in_second": 166.89051413536072,
14
+ "time_use_in_minite": "2:46"
15
+ }
eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 36.2,
7
+ "pass_acc": 62.5,
8
+ "pass@k": {
9
+ "1": 36.2,
10
+ "2": 49.6,
11
+ "4": 62.5
12
+ },
13
+ "time_use_in_second": 185.09522104263306,
14
+ "time_use_in_minite": "3:05"
15
+ }
eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 10.0,
7
+ "pass_acc": 20.0,
8
+ "pass@k": {
9
+ "1": 10.0,
10
+ "2": 15.6,
11
+ "4": 20.0
12
+ },
13
+ "time_use_in_second": 182.3788356781006,
14
+ "time_use_in_minite": "3:02"
15
+ }
eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 7.5,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 7.5,
10
+ "2": 10.0,
11
+ "4": 13.3
12
+ },
13
+ "time_use_in_second": 164.56986451148987,
14
+ "time_use_in_minite": "2:44"
15
+ }
eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 34.4,
7
+ "pass_acc": 57.5,
8
+ "pass@k": {
9
+ "1": 34.4,
10
+ "2": 46.7,
11
+ "4": 57.5
12
+ },
13
+ "time_use_in_second": 164.12454390525818,
14
+ "time_use_in_minite": "2:44"
15
+ }
eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 11.7,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 11.7,
10
+ "2": 17.8,
11
+ "4": 23.3
12
+ },
13
+ "time_use_in_second": 174.74434995651245,
14
+ "time_use_in_minite": "2:54"
15
+ }
eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 2.5,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 2.5,
10
+ "2": 5.0,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 144.3609230518341,
14
+ "time_use_in_minite": "2:24"
15
+ }
eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 39.4,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 39.4,
10
+ "2": 47.9,
11
+ "4": 52.5
12
+ },
13
+ "time_use_in_second": 155.9563455581665,
14
+ "time_use_in_minite": "2:35"
15
+ }
eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 9.2,
7
+ "pass_acc": 20.0,
8
+ "pass@k": {
9
+ "1": 9.2,
10
+ "2": 13.3,
11
+ "4": 20.0
12
+ },
13
+ "time_use_in_second": 175.5485875606537,
14
+ "time_use_in_minite": "2:55"
15
+ }