johndoe123345 commited on
Commit
14184dd
·
verified ·
1 Parent(s): fb24969

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +43 -0
  2. eval_results/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  4. eval_results/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  5. eval_results/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  6. eval_results/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  7. eval_results/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  8. eval_results/global_step_110/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  9. eval_results/global_step_110/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  10. eval_results/global_step_110/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  11. eval_results/global_step_110/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  12. eval_results/global_step_110/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  13. eval_results/global_step_110/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  14. eval_results/global_step_110/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  15. eval_results/global_step_110/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results/global_step_110/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  17. eval_results/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  19. eval_results/global_step_120/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results/global_step_120/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  21. eval_results/global_step_120/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results/global_step_120/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  23. eval_results/global_step_120/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results/global_step_120/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  25. eval_results/global_step_120/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results/global_step_120/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  27. eval_results/global_step_120/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  28. eval_results/global_step_120/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  29. eval_results/global_step_120/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  30. eval_results/global_step_120/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  31. eval_results/global_step_120/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  32. eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  33. eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  34. eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  35. eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  36. eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  37. eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  38. eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  39. eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  40. eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  41. eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  42. eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  43. eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  44. eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  45. eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  46. eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  47. eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  48. eval_results/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  49. eval_results/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  50. eval_results/global_step_50/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
.gitattributes CHANGED
@@ -38,3 +38,46 @@ global_step_60/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -t
38
  global_step_70/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
  global_step_90/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
  global_step_20/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  global_step_70/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
  global_step_90/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
  global_step_20/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ global_step_30/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ eval_results_avg32/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
43
+ eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
45
+ eval_results_avg32/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
46
+ eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
47
+ eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
48
+ eval_results_avg32/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
49
+ eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
50
+ eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
51
+ eval_results_avg32/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
52
+ eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
53
+ eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
54
+ eval_results_avg32/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
55
+ eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
56
+ eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
57
+ eval_results_avg32/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
58
+ eval_results_avg32/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
59
+ eval_results_avg32/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
60
+ eval_results_avg32/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
61
+ eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
62
+ eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
63
+ eval_results_avg32/plots/eval_results_avg32_avg_stop_tokens.png filter=lfs diff=lfs merge=lfs -text
64
+ eval_results_avg32/plots/eval_results_avg32_repeat_ratio_and_token_length.png filter=lfs diff=lfs merge=lfs -text
65
+ eval_results_avg32/plots/eval_results_avg32_tokens_keywords.png filter=lfs diff=lfs merge=lfs -text
66
+ eval_results_avg32/plots/eval_results_avg32_acc_tokens.png filter=lfs diff=lfs merge=lfs -text
67
+ eval_results_avg32/plots/eval_results_avg32_clip_ratio.png filter=lfs diff=lfs merge=lfs -text
68
+ eval_results_avg32/plots/eval_results_avg32_correct_tokens.png filter=lfs diff=lfs merge=lfs -text
69
+ eval_results_avg32/plots/eval_results_avg32_wrong_tokens.png filter=lfs diff=lfs merge=lfs -text
70
+ eval_results_avg32/plots/eval_results_avg32_acc_keywords.png filter=lfs diff=lfs merge=lfs -text
71
+ eval_results_avg32/plots/eval_results_avg32_box_ratio_and_token_length.png filter=lfs diff=lfs merge=lfs -text
72
+ eval_results_avg32/plots/eval_results_avg32_acc_pass_acc.png filter=lfs diff=lfs merge=lfs -text
73
+ eval_results_avg32/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
74
+ eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
75
+ eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
76
+ eval_results_avg32/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
77
+ eval_results_avg32/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
78
+ eval_results_avg32/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
79
+ eval_results_avg32/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
80
+ eval_results_avg32/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
81
+ eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
82
+ eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
83
+ global_step_100/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
eval_results/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 26.7,
7
+ "pass_acc": 26.7,
8
+ "pass@k": {
9
+ "1": 26.7
10
+ },
11
+ "time_use_in_second": 286.882128238678,
12
+ "time_use_in_minite": "4:46"
13
+ }
eval_results/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 20.0,
7
+ "pass_acc": 20.0,
8
+ "pass@k": {
9
+ "1": 20.0
10
+ },
11
+ "time_use_in_second": 254.04739665985107,
12
+ "time_use_in_minite": "4:14"
13
+ }
eval_results/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 70.0,
7
+ "pass_acc": 70.0,
8
+ "pass@k": {
9
+ "1": 70.0
10
+ },
11
+ "time_use_in_second": 166.89173412322998,
12
+ "time_use_in_minite": "2:46"
13
+ }
eval_results/global_step_110/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_110/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 93.6,
7
+ "pass_acc": 93.6,
8
+ "pass@k": {
9
+ "1": 93.6
10
+ },
11
+ "time_use_in_second": 154.87166595458984,
12
+ "time_use_in_minite": "2:34"
13
+ }
eval_results/global_step_110/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_110/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 85.6,
7
+ "pass_acc": 85.6,
8
+ "pass@k": {
9
+ "1": 85.6
10
+ },
11
+ "time_use_in_second": 467.85400772094727,
12
+ "time_use_in_minite": "7:47"
13
+ }
eval_results/global_step_110/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_110/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 39.7,
7
+ "pass_acc": 39.7,
8
+ "pass@k": {
9
+ "1": 39.7
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 62.5,
13
+ "Dynamics and Control (2.003 Spring 2005)": 53.8,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 100.0,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 28.3,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 22.7,
18
+ "Physical Chemistry (5.61 Fall 2017)": 36.4,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 72.2,
20
+ "Relativity (8.033 Fall 2006)": 45.5
21
+ },
22
+ "time_use_in_second": 129.5655951499939,
23
+ "time_use_in_minite": "2:09"
24
+ }
eval_results/global_step_110/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 78.4,
7
+ "pass_acc": 78.4,
8
+ "pass@k": {
9
+ "1": 78.4
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 75.0,
13
+ "astronomy": 82.2,
14
+ "college_biology": 86.8,
15
+ "college_chemistry": 64.0,
16
+ "college_computer_science": 80.0,
17
+ "college_mathematics": 75.0,
18
+ "college_physics": 72.5,
19
+ "computer_security": 61.0,
20
+ "conceptual_physics": 81.3,
21
+ "electrical_engineering": 70.3,
22
+ "elementary_mathematics": 84.4,
23
+ "high_school_biology": 88.4,
24
+ "high_school_chemistry": 81.8,
25
+ "high_school_computer_science": 75.0,
26
+ "high_school_mathematics": 75.9,
27
+ "high_school_physics": 70.2,
28
+ "high_school_statistics": 79.2,
29
+ "machine_learning": 70.5
30
+ },
31
+ "time_use_in_second": 922.6077558994293,
32
+ "time_use_in_minite": "15:22"
33
+ }
eval_results/global_step_110/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_110/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 52.0,
7
+ "pass_acc": 52.0,
8
+ "pass@k": {
9
+ "1": 52.0
10
+ },
11
+ "time_use_in_second": 1118.1542921066284,
12
+ "time_use_in_minite": "18:38"
13
+ }
eval_results/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 20.0,
7
+ "pass_acc": 20.0,
8
+ "pass@k": {
9
+ "1": 20.0
10
+ },
11
+ "time_use_in_second": 279.13312292099,
12
+ "time_use_in_minite": "4:39"
13
+ }
eval_results/global_step_120/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_120/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 13.3,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 13.3
10
+ },
11
+ "time_use_in_second": 231.71165895462036,
12
+ "time_use_in_minite": "3:51"
13
+ }
eval_results/global_step_120/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_120/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 67.5,
7
+ "pass_acc": 67.5,
8
+ "pass@k": {
9
+ "1": 67.5
10
+ },
11
+ "time_use_in_second": 233.46848917007446,
12
+ "time_use_in_minite": "3:53"
13
+ }
eval_results/global_step_120/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_120/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 93.9,
7
+ "pass_acc": 93.9,
8
+ "pass@k": {
9
+ "1": 93.9
10
+ },
11
+ "time_use_in_second": 361.95891976356506,
12
+ "time_use_in_minite": "6:01"
13
+ }
eval_results/global_step_120/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_120/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 86.0,
7
+ "pass_acc": 86.0,
8
+ "pass@k": {
9
+ "1": 86.0
10
+ },
11
+ "time_use_in_second": 490.3726477622986,
12
+ "time_use_in_minite": "8:10"
13
+ }
eval_results/global_step_120/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_120/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 43.0,
7
+ "pass_acc": 43.0,
8
+ "pass@k": {
9
+ "1": 43.0
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 66.7,
13
+ "Dynamics and Control (2.003 Spring 2005)": 61.5,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 66.7,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 32.1,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 27.8,
18
+ "Physical Chemistry (5.61 Fall 2017)": 27.3,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 66.7,
20
+ "Relativity (8.033 Fall 2006)": 54.5
21
+ },
22
+ "time_use_in_second": 290.67028999328613,
23
+ "time_use_in_minite": "4:50"
24
+ }
eval_results/global_step_120/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 77.5,
7
+ "pass_acc": 77.5,
8
+ "pass@k": {
9
+ "1": 77.5
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 73.0,
13
+ "astronomy": 80.9,
14
+ "college_biology": 88.2,
15
+ "college_chemistry": 65.0,
16
+ "college_computer_science": 70.0,
17
+ "college_mathematics": 70.0,
18
+ "college_physics": 69.6,
19
+ "computer_security": 59.0,
20
+ "conceptual_physics": 83.0,
21
+ "electrical_engineering": 69.7,
22
+ "elementary_mathematics": 85.7,
23
+ "high_school_biology": 88.7,
24
+ "high_school_chemistry": 77.3,
25
+ "high_school_computer_science": 73.0,
26
+ "high_school_mathematics": 73.3,
27
+ "high_school_physics": 74.2,
28
+ "high_school_statistics": 77.8,
29
+ "machine_learning": 68.8
30
+ },
31
+ "time_use_in_second": 914.7490997314453,
32
+ "time_use_in_minite": "15:14"
33
+ }
eval_results/global_step_120/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_120/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 52.9,
7
+ "pass_acc": 52.9,
8
+ "pass@k": {
9
+ "1": 52.9
10
+ },
11
+ "time_use_in_second": 1139.7145493030548,
12
+ "time_use_in_minite": "18:59"
13
+ }
eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 23.3,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 23.3
10
+ },
11
+ "time_use_in_second": 332.4983825683594,
12
+ "time_use_in_minite": "5:32"
13
+ }
eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 20.0,
7
+ "pass_acc": 20.0,
8
+ "pass@k": {
9
+ "1": 20.0
10
+ },
11
+ "time_use_in_second": 301.8720078468323,
12
+ "time_use_in_minite": "5:01"
13
+ }
eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 93.6,
7
+ "pass_acc": 93.6,
8
+ "pass@k": {
9
+ "1": 93.6
10
+ },
11
+ "time_use_in_second": 149.86318922042847,
12
+ "time_use_in_minite": "2:29"
13
+ }
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 84.6,
7
+ "pass_acc": 84.6,
8
+ "pass@k": {
9
+ "1": 84.6
10
+ },
11
+ "time_use_in_second": 586.838475227356,
12
+ "time_use_in_minite": "9:46"
13
+ }
eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 79.5,
7
+ "pass_acc": 79.5,
8
+ "pass@k": {
9
+ "1": 79.5
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 75.0,
13
+ "astronomy": 82.9,
14
+ "college_biology": 87.5,
15
+ "college_chemistry": 62.0,
16
+ "college_computer_science": 66.0,
17
+ "college_mathematics": 80.0,
18
+ "college_physics": 76.5,
19
+ "computer_security": 65.0,
20
+ "conceptual_physics": 81.3,
21
+ "electrical_engineering": 73.1,
22
+ "elementary_mathematics": 85.2,
23
+ "high_school_biology": 91.0,
24
+ "high_school_chemistry": 78.3,
25
+ "high_school_computer_science": 77.0,
26
+ "high_school_mathematics": 81.9,
27
+ "high_school_physics": 77.5,
28
+ "high_school_statistics": 77.8,
29
+ "machine_learning": 68.8
30
+ },
31
+ "time_use_in_second": 902.6363174915314,
32
+ "time_use_in_minite": "15:02"
33
+ }
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 48.1,
7
+ "pass_acc": 48.1,
8
+ "pass@k": {
9
+ "1": 48.1
10
+ },
11
+ "time_use_in_second": 1647.6752095222473,
12
+ "time_use_in_minite": "27:27"
13
+ }
eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 20.0,
7
+ "pass_acc": 20.0,
8
+ "pass@k": {
9
+ "1": 20.0
10
+ },
11
+ "time_use_in_second": 297.49846267700195,
12
+ "time_use_in_minite": "4:57"
13
+ }
eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 23.3,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 23.3
10
+ },
11
+ "time_use_in_second": 271.2436902523041,
12
+ "time_use_in_minite": "4:31"
13
+ }
eval_results/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 75.0,
7
+ "pass_acc": 75.0,
8
+ "pass@k": {
9
+ "1": 75.0
10
+ },
11
+ "time_use_in_second": 247.3225588798523,
12
+ "time_use_in_minite": "4:07"
13
+ }
eval_results/global_step_50/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff