bensondccnqwc commited on
Commit
dfe5215
·
verified ·
1 Parent(s): acef52a

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +22 -0
  2. eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +3 -0
  3. eval_results/plots/eval_results_acc_keywords.png +3 -0
  4. eval_results/plots/eval_results_acc_pass_acc.png +3 -0
  5. eval_results/plots/eval_results_acc_tokens.png +3 -0
  6. eval_results/plots/eval_results_avg_stop_tokens.png +3 -0
  7. eval_results/plots/eval_results_box_ratio_and_token_length.png +3 -0
  8. eval_results/plots/eval_results_clip_ratio.png +3 -0
  9. eval_results/plots/eval_results_correct_tokens.png +3 -0
  10. eval_results/plots/eval_results_repeat_ratio_and_token_length.png +3 -0
  11. eval_results/plots/eval_results_tokens_keywords.png +3 -0
  12. eval_results/plots/eval_results_wrong_tokens.png +3 -0
  13. eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  15. eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  17. eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  18. eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  19. eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  20. eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  21. eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  22. eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  23. eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  24. eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  25. eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  26. eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  27. eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  28. eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  29. eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  30. eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  31. eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  32. eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  33. eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  34. eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  35. eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  36. eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  37. eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  38. eval_results_avg4/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  39. eval_results_avg4/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  40. eval_results_avg4/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  41. eval_results_avg4/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  42. eval_results_avg4/plots/eval_results_avg4_acc_keywords.png +3 -0
  43. eval_results_avg4/plots/eval_results_avg4_acc_pass_acc.png +3 -0
  44. eval_results_avg4/plots/eval_results_avg4_acc_tokens.png +3 -0
  45. eval_results_avg4/plots/eval_results_avg4_avg_stop_tokens.png +3 -0
  46. eval_results_avg4/plots/eval_results_avg4_box_ratio_and_token_length.png +3 -0
  47. eval_results_avg4/plots/eval_results_avg4_clip_ratio.png +3 -0
  48. eval_results_avg4/plots/eval_results_avg4_correct_tokens.png +3 -0
  49. eval_results_avg4/plots/eval_results_avg4_tokens_keywords.png +3 -0
  50. eval_results_merged/merged.csv +12 -0
.gitattributes CHANGED
@@ -33,3 +33,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ global_step_90/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ global_step_70/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ global_step_80/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
40
+ eval_results/plots/eval_results_acc_pass_acc.png filter=lfs diff=lfs merge=lfs -text
41
+ eval_results/plots/eval_results_acc_keywords.png filter=lfs diff=lfs merge=lfs -text
42
+ eval_results/plots/eval_results_acc_tokens.png filter=lfs diff=lfs merge=lfs -text
43
+ eval_results/plots/eval_results_avg_stop_tokens.png filter=lfs diff=lfs merge=lfs -text
44
+ eval_results/plots/eval_results_box_ratio_and_token_length.png filter=lfs diff=lfs merge=lfs -text
45
+ eval_results/plots/eval_results_clip_ratio.png filter=lfs diff=lfs merge=lfs -text
46
+ eval_results/plots/eval_results_correct_tokens.png filter=lfs diff=lfs merge=lfs -text
47
+ eval_results/plots/eval_results_repeat_ratio_and_token_length.png filter=lfs diff=lfs merge=lfs -text
48
+ eval_results/plots/eval_results_tokens_keywords.png filter=lfs diff=lfs merge=lfs -text
49
+ eval_results/plots/eval_results_wrong_tokens.png filter=lfs diff=lfs merge=lfs -text
50
+ eval_results_avg4/plots/eval_results_avg4_acc_keywords.png filter=lfs diff=lfs merge=lfs -text
51
+ eval_results_avg4/plots/eval_results_avg4_acc_pass_acc.png filter=lfs diff=lfs merge=lfs -text
52
+ eval_results_avg4/plots/eval_results_avg4_acc_tokens.png filter=lfs diff=lfs merge=lfs -text
53
+ eval_results_avg4/plots/eval_results_avg4_avg_stop_tokens.png filter=lfs diff=lfs merge=lfs -text
54
+ eval_results_avg4/plots/eval_results_avg4_clip_ratio.png filter=lfs diff=lfs merge=lfs -text
55
+ eval_results_avg4/plots/eval_results_avg4_box_ratio_and_token_length.png filter=lfs diff=lfs merge=lfs -text
56
+ eval_results_avg4/plots/eval_results_avg4_correct_tokens.png filter=lfs diff=lfs merge=lfs -text
57
+ eval_results_avg4/plots/eval_results_avg4_tokens_keywords.png filter=lfs diff=lfs merge=lfs -text
eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a42fa1343b5e1c8505957614a09bf3ab4ed175d5ab2b1c05f2ab9dc1ba0db40
3
+ size 12132789
eval_results/plots/eval_results_acc_keywords.png ADDED

Git LFS Details

  • SHA256: a1f50968ac4e74fdc31661e4a281d177a15a1b572b7def441bc4f631a39f378f
  • Pointer size: 131 Bytes
  • Size of remote file: 425 kB
eval_results/plots/eval_results_acc_pass_acc.png ADDED

Git LFS Details

  • SHA256: be9d66bb613fb652228b623de07c057a57d11d75f5ece6b91d1a90161e6e5d57
  • Pointer size: 131 Bytes
  • Size of remote file: 330 kB
eval_results/plots/eval_results_acc_tokens.png ADDED

Git LFS Details

  • SHA256: c2a9d5e4afbc5e39ebe06b94b0f23e5f5b1e7de6a39130261e2fc7f75251fdcf
  • Pointer size: 131 Bytes
  • Size of remote file: 416 kB
eval_results/plots/eval_results_avg_stop_tokens.png ADDED

Git LFS Details

  • SHA256: 0100f884f6457e59dfce30669b0a835df439a42462e8775edf2cd5953f522819
  • Pointer size: 131 Bytes
  • Size of remote file: 471 kB
eval_results/plots/eval_results_box_ratio_and_token_length.png ADDED

Git LFS Details

  • SHA256: accb3442879996e364a25c9d48c3453fd0d28d383f567c97dfcf8788f30d9b55
  • Pointer size: 131 Bytes
  • Size of remote file: 422 kB
eval_results/plots/eval_results_clip_ratio.png ADDED

Git LFS Details

  • SHA256: 35a790bcdce14dd0aaa8ecf96cd13b3a8cb77b977afbe9aa57fa6de2cf65843e
  • Pointer size: 131 Bytes
  • Size of remote file: 375 kB
eval_results/plots/eval_results_correct_tokens.png ADDED

Git LFS Details

  • SHA256: 121fc28908a40c1693aba086068b3616207c9d43bc3ec3ae341a11c731aff9c7
  • Pointer size: 131 Bytes
  • Size of remote file: 442 kB
eval_results/plots/eval_results_repeat_ratio_and_token_length.png ADDED

Git LFS Details

  • SHA256: ee3565ac5cdebfdc88a1e4b6f4fcc3033b13c3c7c1151036ff45763a42109bc7
  • Pointer size: 131 Bytes
  • Size of remote file: 438 kB
eval_results/plots/eval_results_tokens_keywords.png ADDED

Git LFS Details

  • SHA256: f980e298d690320b6d27a00fa38f6e1c1e3a5e87d57a8278dac818033406ff39
  • Pointer size: 131 Bytes
  • Size of remote file: 388 kB
eval_results/plots/eval_results_wrong_tokens.png ADDED

Git LFS Details

  • SHA256: 2d46400575e6ff292b1651daf786677e5ee68ac1ba059d4e83c15d378c1811ce
  • Pointer size: 131 Bytes
  • Size of remote file: 416 kB
eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 40.0,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 40.0,
10
+ "2": 50.4,
11
+ "4": 60.0
12
+ },
13
+ "time_use_in_second": 21.898350715637207,
14
+ "time_use_in_minite": "0:21"
15
+ }
eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 4.2,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 4.2,
10
+ "2": 5.0,
11
+ "4": 6.7
12
+ },
13
+ "time_use_in_second": 20.911217212677002,
14
+ "time_use_in_minite": "0:20"
15
+ }
eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 1.7,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 1.7,
10
+ "2": 2.8,
11
+ "4": 3.3
12
+ },
13
+ "time_use_in_second": 18.055476665496826,
14
+ "time_use_in_minite": "0:18"
15
+ }
eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 44.4,
7
+ "pass_acc": 62.5,
8
+ "pass@k": {
9
+ "1": 44.4,
10
+ "2": 56.7,
11
+ "4": 62.5
12
+ },
13
+ "time_use_in_second": 21.025193214416504,
14
+ "time_use_in_minite": "0:21"
15
+ }
eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 9.2,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 9.2,
10
+ "2": 14.4,
11
+ "4": 23.3
12
+ },
13
+ "time_use_in_second": 20.478790760040283,
14
+ "time_use_in_minite": "0:20"
15
+ }
eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 0.8,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 0.8,
10
+ "2": 1.7,
11
+ "4": 3.3
12
+ },
13
+ "time_use_in_second": 19.925731897354126,
14
+ "time_use_in_minite": "0:19"
15
+ }
eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 35.0,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 35.0,
10
+ "2": 46.2,
11
+ "4": 60.0
12
+ },
13
+ "time_use_in_second": 22.096985340118408,
14
+ "time_use_in_minite": "0:22"
15
+ }
eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 7.5,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 7.5,
10
+ "2": 10.6,
11
+ "4": 13.3
12
+ },
13
+ "time_use_in_second": 20.141565561294556,
14
+ "time_use_in_minite": "0:20"
15
+ }
eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 2.5,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 2.5,
10
+ "2": 4.4,
11
+ "4": 6.7
12
+ },
13
+ "time_use_in_second": 18.300720691680908,
14
+ "time_use_in_minite": "0:18"
15
+ }
eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 30.6,
7
+ "pass_acc": 55.0,
8
+ "pass@k": {
9
+ "1": 30.6,
10
+ "2": 42.9,
11
+ "4": 55.0
12
+ },
13
+ "time_use_in_second": 19.80013918876648,
14
+ "time_use_in_minite": "0:19"
15
+ }
eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 4.2,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 4.2,
10
+ "2": 6.7,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 20.099088191986084,
14
+ "time_use_in_minite": "0:20"
15
+ }
eval_results_avg4/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 0.0,
7
+ "pass_acc": 0.0,
8
+ "pass@k": {
9
+ "1": 0.0,
10
+ "2": 0.0,
11
+ "4": 0.0
12
+ },
13
+ "time_use_in_second": 16.414506196975708,
14
+ "time_use_in_minite": "0:16"
15
+ }
eval_results_avg4/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 26.2,
7
+ "pass_acc": 40.0,
8
+ "pass@k": {
9
+ "1": 26.2,
10
+ "2": 33.3,
11
+ "4": 40.0
12
+ },
13
+ "time_use_in_second": 20.395983457565308,
14
+ "time_use_in_minite": "0:20"
15
+ }
eval_results_avg4/plots/eval_results_avg4_acc_keywords.png ADDED

Git LFS Details

  • SHA256: 199a7017dbcc8ee10026e2daab4d6e3d9eeb82c403d9a96020cc7596f685080b
  • Pointer size: 131 Bytes
  • Size of remote file: 205 kB
eval_results_avg4/plots/eval_results_avg4_acc_pass_acc.png ADDED

Git LFS Details

  • SHA256: 5bf1ca613c6fc150e291dbc624cce99c860f66d9e224019366eba4118c2ef72e
  • Pointer size: 131 Bytes
  • Size of remote file: 206 kB
eval_results_avg4/plots/eval_results_avg4_acc_tokens.png ADDED

Git LFS Details

  • SHA256: af29655e2a60e09e11f676f44fbe2b6e3ea3c033f9303aabc99c71691e3a8ac1
  • Pointer size: 131 Bytes
  • Size of remote file: 205 kB
eval_results_avg4/plots/eval_results_avg4_avg_stop_tokens.png ADDED

Git LFS Details

  • SHA256: 7248c406a56d93489a6ec98c269c66664a51908f3b0e0a938a4845e214db8b29
  • Pointer size: 131 Bytes
  • Size of remote file: 223 kB
eval_results_avg4/plots/eval_results_avg4_box_ratio_and_token_length.png ADDED

Git LFS Details

  • SHA256: b8cd0593b6a09ce0694bf636dabc752562fa9b90bbb536823204240a85f7a02c
  • Pointer size: 131 Bytes
  • Size of remote file: 205 kB
eval_results_avg4/plots/eval_results_avg4_clip_ratio.png ADDED

Git LFS Details

  • SHA256: 8e0d25303230df6ee944f55c98c2b05e0a6f9d9d5691857dcced02a15eba5d8c
  • Pointer size: 131 Bytes
  • Size of remote file: 181 kB
eval_results_avg4/plots/eval_results_avg4_correct_tokens.png ADDED

Git LFS Details

  • SHA256: 2e979496a10e39947a55f491887416f75b8fdd49c50d704fdc847236fab08a49
  • Pointer size: 131 Bytes
  • Size of remote file: 222 kB
eval_results_avg4/plots/eval_results_avg4_tokens_keywords.png ADDED

Git LFS Details

  • SHA256: 7ce68d176461cb36d6b254b808bd8243ba86e2806fbec69c3296638546eb427e
  • Pointer size: 131 Bytes
  • Size of remote file: 194 kB
eval_results_merged/merged.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global_step,aime24_acc_avg4,aime25_acc_avg4,amc23_acc_avg4,aime24_acc,aime25_acc,amc23_acc,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,gpqa_pass@1:1_samples_ood
2
+ 0,3.30,2.50,25.00,3.30,3.30,22.50,59.40,41.00,11.00,19.30,39.10,20.5,
3
+ 10,4.20,3.30,32.50,6.70,6.70,37.50,74.10,57.80,21.00,21.00,42.90,22.0,
4
+ 20,5.00,2.50,35.60,6.70,0.00,25.00,77.60,59.40,24.60,24.10,46.00,25.9,
5
+ 30,5.00,5.80,36.20,6.70,10.00,35.00,78.90,63.20,26.80,24.90,50.60,29.2,
6
+ 40,5.80,1.70,35.00,10.00,3.30,40.00,80.90,64.00,26.10,28.40,56.80,27.7,
7
+ 50,5.80,0.80,40.00,3.30,3.30,35.00,81.00,63.20,27.90,27.00,57.70,25.7,
8
+ 60,4.20,1.70,44.40,6.70,3.30,22.50,80.20,64.20,24.60,25.90,59.20,26.1,
9
+ 70,9.20,0.80,35.00,13.30,6.70,32.50,80.70,63.20,27.20,26.10,60.90,25.5,
10
+ 80,7.50,2.50,30.60,3.30,0.00,25.00,78.70,57.00,21.70,22.70,44.60,22.4,
11
+ 90,4.20,0.00,26.20,3.30,3.30,37.50,80.00,60.40,21.70,25.30,53.10,24.0,
12
+ 100,4.20,0.00,30.00,6.70,0.00,30.00,80.90,56.20,21.00,25.30,45.30,22.2,