Kadins commited on
Commit
86b0749
·
verified ·
1 Parent(s): 3d18e86

Model save

Browse files
README.md CHANGED
@@ -26,7 +26,7 @@ print(output["generated_text"])
26
 
27
  ## Training procedure
28
 
29
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/build_r1/huggingface/runs/521r385l)
30
 
31
 
32
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
@@ -34,7 +34,7 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
34
  ### Framework versions
35
 
36
  - TRL: 0.16.0.dev0
37
- - Transformers: 4.50.0.dev0
38
  - Pytorch: 2.5.1+cu124
39
  - Datasets: 3.3.1
40
  - Tokenizers: 0.21.0
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/build_r1/huggingface/runs/sbd67t58)
30
 
31
 
32
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
34
  ### Framework versions
35
 
36
  - TRL: 0.16.0.dev0
37
+ - Transformers: 4.49.0
38
  - Pytorch: 2.5.1+cu124
39
  - Datasets: 3.3.1
40
  - Tokenizers: 0.21.0
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.041315901221643234,
4
- "train_runtime": 6824.0967,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 1.099,
7
- "train_steps_per_second": 0.008
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.03965516161473318,
4
+ "train_runtime": 10204.823,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 1.102,
7
+ "train_steps_per_second": 0.009
8
  }
config.json CHANGED
@@ -22,7 +22,7 @@
22
  "sliding_window": 4096,
23
  "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
- "transformers_version": "4.50.0.dev0",
26
  "use_cache": false,
27
  "use_mrope": false,
28
  "use_sliding_window": false,
 
22
  "sliding_window": 4096,
23
  "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.49.0",
26
  "use_cache": false,
27
  "use_mrope": false,
28
  "use_sliding_window": false,
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "bos_token_id": 151643,
3
  "eos_token_id": 151643,
4
  "max_new_tokens": 2048,
5
- "transformers_version": "4.50.0.dev0"
6
  }
 
2
  "bos_token_id": 151643,
3
  "eos_token_id": 151643,
4
  "max_new_tokens": 2048,
5
+ "transformers_version": "4.49.0"
6
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1276b52a444f36ce7a2438ec8eb171d7a6c10936b36fe98759917253d63a8344
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f5229b390a6d5a3c821d1b16005e5c679c73020aad6a61f898cc4c752e570b7
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0325bb74df5b1b59a00cb2724a66404e2c614ac3f34121754f1585e92d219a4
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b74dbe92920694029d0c604f0890522acb2ece89f0518a5a9f420124b837b40
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ce7c8740a88cbc6ddc98aab33ea1a20f5a7107c2934056f8c8b8c22faebaa0d
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d323dcf810106f79f0588f9bc36a276f5414fbe7bb01853a65a2fee4eb10885
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:322362afbb95734a973ce1fa95a7b55ced779dedfe1b251ea70175bf54c1f2d8
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d3c27c38871f8ffbdcc74752d141b37109625c30769f8104121ce07941c513a
3
  size 1089994880
tokenizer_config.json CHANGED
@@ -202,7 +202,6 @@
202
  "extra_special_tokens": {},
203
  "model_max_length": 131072,
204
  "pad_token": "<|endoftext|>",
205
- "padding_side": "left",
206
  "split_special_tokens": false,
207
  "tokenizer_class": "Qwen2Tokenizer",
208
  "unk_token": null
 
202
  "extra_special_tokens": {},
203
  "model_max_length": 131072,
204
  "pad_token": "<|endoftext|>",
 
205
  "split_special_tokens": false,
206
  "tokenizer_class": "Qwen2Tokenizer",
207
  "unk_token": null
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.041315901221643234,
4
- "train_runtime": 6824.0967,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 1.099,
7
- "train_steps_per_second": 0.008
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.03965516161473318,
4
+ "train_runtime": 10204.823,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 1.102,
7
+ "train_steps_per_second": 0.009
8
  }
trainer_state.json CHANGED
@@ -1,189 +1,267 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9893390191897654,
5
  "eval_steps": 100,
6
- "global_step": 58,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 602.1573867797852,
13
  "epoch": 0.017057569296375266,
14
- "grad_norm": 0.49788331985473633,
15
  "kl": 0.0,
16
- "learning_rate": 5e-07,
17
- "loss": 0.0235,
18
- "reward": 0.6395089626312256,
19
- "reward_std": 0.34354935958981514,
20
- "rewards/accuracy_reward": 0.6339286044239998,
21
  "rewards/format_reward": 0.005580357392318547,
22
  "step": 1
23
  },
24
  {
25
- "completion_length": 602.9565010070801,
26
  "epoch": 0.08528784648187633,
27
- "grad_norm": 0.31691527366638184,
28
- "kl": 0.00025323033332824707,
29
- "learning_rate": 2.5e-06,
30
- "loss": 0.0201,
31
- "reward": 0.6130022583529353,
32
- "reward_std": 0.36007128469645977,
33
- "rewards/accuracy_reward": 0.6104910988360643,
34
- "rewards/format_reward": 0.002511160826543346,
35
  "step": 5
36
  },
37
  {
38
- "completion_length": 604.4129737854004,
39
  "epoch": 0.17057569296375266,
40
- "grad_norm": 1.7574856281280518,
41
- "kl": 0.004290962219238281,
42
- "learning_rate": 2.956412726139078e-06,
43
- "loss": 0.0526,
44
- "reward": 0.6930803835391999,
45
- "reward_std": 0.30544489361345767,
46
- "rewards/accuracy_reward": 0.6915178835391999,
47
- "rewards/format_reward": 0.0015625000698491931,
48
  "step": 10
49
  },
50
  {
51
- "completion_length": 600.6560562133789,
52
  "epoch": 0.255863539445629,
53
- "grad_norm": 0.1719939261674881,
54
- "kl": 0.015668106079101563,
55
- "learning_rate": 2.7836719084521715e-06,
56
- "loss": 0.0603,
57
- "reward": 0.7658482477068901,
58
- "reward_std": 0.2067408435046673,
59
- "rewards/accuracy_reward": 0.7656250327825547,
60
  "rewards/format_reward": 0.00022321429569274187,
61
  "step": 15
62
  },
63
  {
64
- "completion_length": 594.912075805664,
65
  "epoch": 0.3411513859275053,
66
- "grad_norm": 0.5162317752838135,
67
- "kl": 0.005500221252441406,
68
- "learning_rate": 2.4946839873611927e-06,
69
- "loss": 0.0549,
70
- "reward": 0.7725446730852127,
71
- "reward_std": 0.2103592725470662,
72
- "rewards/accuracy_reward": 0.7723214596509933,
73
- "rewards/format_reward": 0.00022321429569274187,
74
  "step": 20
75
  },
76
  {
77
- "completion_length": 602.0649864196778,
78
  "epoch": 0.42643923240938164,
79
- "grad_norm": 0.1503874659538269,
80
- "kl": 0.004841995239257812,
81
- "learning_rate": 2.1156192081791355e-06,
82
- "loss": 0.0425,
83
- "reward": 0.7611607491970063,
84
- "reward_std": 0.19463529847562314,
85
- "rewards/accuracy_reward": 0.7609375342726707,
86
- "rewards/format_reward": 0.00022321429569274187,
87
  "step": 25
88
  },
89
  {
90
- "completion_length": 600.5692260742187,
91
  "epoch": 0.511727078891258,
92
- "grad_norm": 0.111796073615551,
93
- "kl": 0.006829071044921875,
94
- "learning_rate": 1.6808050203829845e-06,
95
- "loss": 0.0364,
96
- "reward": 0.7482143223285675,
97
- "reward_std": 0.18839258402585984,
98
- "rewards/accuracy_reward": 0.7482143223285675,
99
  "rewards/format_reward": 0.0,
100
  "step": 30
101
  },
102
  {
103
- "completion_length": 595.0308326721191,
104
  "epoch": 0.5970149253731343,
105
- "grad_norm": 0.6281359791755676,
106
- "kl": 0.27039794921875,
107
- "learning_rate": 1.2296174432791415e-06,
108
- "loss": 0.0478,
109
- "reward": 0.7406250335276127,
110
- "reward_std": 0.18172951051965355,
111
- "rewards/accuracy_reward": 0.7401786051690579,
112
- "rewards/format_reward": 0.00044642859138548373,
113
  "step": 35
114
  },
115
  {
116
- "completion_length": 581.8551567077636,
117
  "epoch": 0.6823027718550106,
118
- "grad_norm": 0.29348960518836975,
119
- "kl": 0.045685958862304685,
120
- "learning_rate": 8.029152419343472e-07,
121
- "loss": 0.0408,
122
- "reward": 0.7723214700818062,
123
- "reward_std": 0.1774477436207235,
124
- "rewards/accuracy_reward": 0.7716518267989159,
125
- "rewards/format_reward": 0.0006696428870782256,
126
  "step": 40
127
  },
128
  {
129
- "completion_length": 597.0538185119628,
130
  "epoch": 0.767590618336887,
131
- "grad_norm": 0.13345518708229065,
132
- "kl": 0.01571235656738281,
133
- "learning_rate": 4.3933982822017883e-07,
134
- "loss": 0.0281,
135
- "reward": 0.7609375342726707,
136
- "reward_std": 0.17938947193324567,
137
- "rewards/accuracy_reward": 0.7598214626312256,
138
- "rewards/format_reward": 0.0011160714784637094,
139
  "step": 45
140
  },
141
  {
142
- "completion_length": 604.6993606567382,
143
  "epoch": 0.8528784648187633,
144
- "grad_norm": 0.24237532913684845,
145
- "kl": 0.005515289306640625,
146
- "learning_rate": 1.718159615201853e-07,
147
- "loss": 0.0334,
148
- "reward": 0.7473214641213417,
149
- "reward_std": 0.19217551834881305,
150
- "rewards/accuracy_reward": 0.7455357491970063,
151
- "rewards/format_reward": 0.001785714365541935,
152
  "step": 50
153
  },
154
  {
155
- "completion_length": 585.3790458679199,
156
  "epoch": 0.9381663113006397,
157
- "grad_norm": 0.21673214435577393,
158
- "kl": 0.0057525634765625,
159
- "learning_rate": 2.4570139579284723e-08,
160
- "loss": 0.0363,
161
- "reward": 0.7837053894996643,
162
- "reward_std": 0.18726392211392523,
163
- "rewards/accuracy_reward": 0.7819196790456772,
164
- "rewards/format_reward": 0.001785714365541935,
165
  "step": 55
166
  },
167
  {
168
- "completion_length": 580.9449704488119,
169
- "epoch": 0.9893390191897654,
170
- "kl": 0.0047308603922526045,
171
- "reward": 0.7645089675982794,
172
- "reward_std": 0.18809200543910265,
173
- "rewards/accuracy_reward": 0.763392892976602,
174
- "rewards/format_reward": 0.0011160714784637094,
175
- "step": 58,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  "total_flos": 0.0,
177
- "train_loss": 0.041315901221643234,
178
- "train_runtime": 6824.0967,
179
- "train_samples_per_second": 1.099,
180
- "train_steps_per_second": 0.008
181
  }
182
  ],
183
  "logging_steps": 5,
184
- "max_steps": 58,
185
  "num_input_tokens_seen": 0,
186
- "num_train_epochs": 1,
187
  "save_steps": 500,
188
  "stateful_callbacks": {
189
  "TrainerControl": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.4946695095948828,
5
  "eval_steps": 100,
6
+ "global_step": 87,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 605.1830520629883,
13
  "epoch": 0.017057569296375266,
14
+ "grad_norm": 0.5403730869293213,
15
  "kl": 0.0,
16
+ "learning_rate": 3.333333333333333e-07,
17
+ "loss": 0.0308,
18
+ "reward": 0.631696455180645,
19
+ "reward_std": 0.3498076871037483,
20
+ "rewards/accuracy_reward": 0.6261160969734192,
21
  "rewards/format_reward": 0.005580357392318547,
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 603.803879737854,
26
  "epoch": 0.08528784648187633,
27
+ "grad_norm": 1.4775316715240479,
28
+ "kl": 0.00015020370483398438,
29
+ "learning_rate": 1.6666666666666669e-06,
30
+ "loss": 0.0283,
31
+ "reward": 0.6054687770083547,
32
+ "reward_std": 0.35884954407811165,
33
+ "rewards/accuracy_reward": 0.6046317219734192,
34
+ "rewards/format_reward": 0.000837053608847782,
35
  "step": 5
36
  },
37
  {
38
+ "completion_length": 606.0585052490235,
39
  "epoch": 0.17057569296375266,
40
+ "grad_norm": 0.26707330346107483,
41
+ "kl": 0.001517653465270996,
42
+ "learning_rate": 2.9987834972573546e-06,
43
+ "loss": 0.0458,
44
+ "reward": 0.6533482439815999,
45
+ "reward_std": 0.32249790802598,
46
+ "rewards/accuracy_reward": 0.652455386519432,
47
+ "rewards/format_reward": 0.0008928571827709675,
48
  "step": 10
49
  },
50
  {
51
+ "completion_length": 607.5294906616211,
52
  "epoch": 0.255863539445629,
53
+ "grad_norm": 0.13552981615066528,
54
+ "kl": 0.006855583190917969,
55
+ "learning_rate": 2.956412726139078e-06,
56
+ "loss": 0.0701,
57
+ "reward": 0.7529018178582192,
58
+ "reward_std": 0.22657863702625036,
59
+ "rewards/accuracy_reward": 0.7526786029338837,
60
  "rewards/format_reward": 0.00022321429569274187,
61
  "step": 15
62
  },
63
  {
64
+ "completion_length": 592.2439987182618,
65
  "epoch": 0.3411513859275053,
66
+ "grad_norm": 2.5956568717956543,
67
+ "kl": 0.014976119995117188,
68
+ "learning_rate": 2.8551756519155732e-06,
69
+ "loss": 0.0501,
70
+ "reward": 0.7772321775555611,
71
+ "reward_std": 0.18385109901428223,
72
+ "rewards/accuracy_reward": 0.7772321775555611,
73
+ "rewards/format_reward": 0.0,
74
  "step": 20
75
  },
76
  {
77
+ "completion_length": 595.9457809448243,
78
  "epoch": 0.42643923240938164,
79
+ "grad_norm": 0.31570613384246826,
80
+ "kl": 0.004840660095214844,
81
+ "learning_rate": 2.699164145105252e-06,
82
+ "loss": 0.0432,
83
+ "reward": 0.7618303894996643,
84
+ "reward_std": 0.1812900934368372,
85
+ "rewards/accuracy_reward": 0.7618303894996643,
86
+ "rewards/format_reward": 0.0,
87
  "step": 25
88
  },
89
  {
90
+ "completion_length": 598.4087356567383,
91
  "epoch": 0.511727078891258,
92
+ "grad_norm": 0.09540420770645142,
93
+ "kl": 0.0038333892822265624,
94
+ "learning_rate": 2.4946839873611927e-06,
95
+ "loss": 0.0415,
96
+ "reward": 0.7622768223285675,
97
+ "reward_std": 0.18161814119666814,
98
+ "rewards/accuracy_reward": 0.7622768223285675,
99
  "rewards/format_reward": 0.0,
100
  "step": 30
101
  },
102
  {
103
+ "completion_length": 587.7656509399415,
104
  "epoch": 0.5970149253731343,
105
+ "grad_norm": 0.10136830061674118,
106
+ "kl": 0.005519866943359375,
107
+ "learning_rate": 2.25e-06,
108
+ "loss": 0.0319,
109
+ "reward": 0.7529018253087998,
110
+ "reward_std": 0.1641070661135018,
111
+ "rewards/accuracy_reward": 0.7529018253087998,
112
+ "rewards/format_reward": 0.0,
113
  "step": 35
114
  },
115
  {
116
+ "completion_length": 576.0547142028809,
117
  "epoch": 0.6823027718550106,
118
+ "grad_norm": 0.2535816729068756,
119
+ "kl": 0.005191802978515625,
120
+ "learning_rate": 1.975001990702209e-06,
121
+ "loss": 0.0346,
122
+ "reward": 0.7754464656114578,
123
+ "reward_std": 0.1725513377226889,
124
+ "rewards/accuracy_reward": 0.7754464656114578,
125
+ "rewards/format_reward": 0.0,
126
  "step": 40
127
  },
128
  {
129
+ "completion_length": 592.9451164245605,
130
  "epoch": 0.767590618336887,
131
+ "grad_norm": 0.12311802059412003,
132
+ "kl": 0.004965591430664063,
133
+ "learning_rate": 1.6808050203829845e-06,
134
+ "loss": 0.0432,
135
+ "reward": 0.7558036044239997,
136
+ "reward_std": 0.19019564976915718,
137
+ "rewards/accuracy_reward": 0.7555803909897805,
138
+ "rewards/format_reward": 0.00022321429569274187,
139
  "step": 45
140
  },
141
  {
142
+ "completion_length": 588.0194488525391,
143
  "epoch": 0.8528784648187633,
144
+ "grad_norm": 0.10202702134847641,
145
+ "kl": 0.005793380737304688,
146
+ "learning_rate": 1.3793001469249112e-06,
147
+ "loss": 0.0278,
148
+ "reward": 0.7582589641213417,
149
+ "reward_std": 0.17491137199103832,
150
+ "rewards/accuracy_reward": 0.7582589641213417,
151
+ "rewards/format_reward": 0.0,
152
  "step": 50
153
  },
154
  {
155
+ "completion_length": 577.1547119140625,
156
  "epoch": 0.9381663113006397,
157
+ "grad_norm": 0.09594480693340302,
158
+ "kl": 0.00527496337890625,
159
+ "learning_rate": 1.0826738041253211e-06,
160
+ "loss": 0.0349,
161
+ "reward": 0.7921875387430191,
162
+ "reward_std": 0.1892126789316535,
163
+ "rewards/accuracy_reward": 0.7917411103844643,
164
+ "rewards/format_reward": 0.00044642859138548373,
165
  "step": 55
166
  },
167
  {
168
+ "completion_length": 568.16053425182,
169
+ "epoch": 1.0341151385927505,
170
+ "grad_norm": 0.08510848134756088,
171
+ "kl": 0.005277807062322443,
172
+ "learning_rate": 8.029152419343472e-07,
173
+ "loss": 0.0384,
174
+ "reward": 0.7849026335911318,
175
+ "reward_std": 0.16940699830989947,
176
+ "rewards/accuracy_reward": 0.7849026335911318,
177
+ "rewards/format_reward": 0.0,
178
+ "step": 60
179
+ },
180
+ {
181
+ "completion_length": 560.2404289245605,
182
+ "epoch": 1.1194029850746268,
183
+ "grad_norm": 0.10467664897441864,
184
+ "kl": 0.006550979614257812,
185
+ "learning_rate": 5.513319366069343e-07,
186
+ "loss": 0.0334,
187
+ "reward": 0.7776786103844643,
188
+ "reward_std": 0.17760842395946383,
189
+ "rewards/accuracy_reward": 0.7776786103844643,
190
+ "rewards/format_reward": 0.0,
191
+ "step": 65
192
+ },
193
+ {
194
+ "completion_length": 555.7098434448242,
195
+ "epoch": 1.2046908315565032,
196
+ "grad_norm": 0.07688478380441666,
197
+ "kl": 0.005760955810546875,
198
+ "learning_rate": 3.380925572585183e-07,
199
+ "loss": 0.0404,
200
+ "reward": 0.784598246216774,
201
+ "reward_std": 0.16191664077341555,
202
+ "rewards/accuracy_reward": 0.784598246216774,
203
+ "rewards/format_reward": 0.0,
204
+ "step": 70
205
+ },
206
+ {
207
+ "completion_length": 572.1274841308593,
208
+ "epoch": 1.2899786780383795,
209
+ "grad_norm": 0.09251231700181961,
210
+ "kl": 0.0058074951171875,
211
+ "learning_rate": 1.718159615201853e-07,
212
+ "loss": 0.0414,
213
+ "reward": 0.7857143238186837,
214
+ "reward_std": 0.1789614163339138,
215
+ "rewards/accuracy_reward": 0.7854911088943481,
216
+ "rewards/format_reward": 0.00022321429569274187,
217
+ "step": 75
218
+ },
219
+ {
220
+ "completion_length": 555.3935531616211,
221
+ "epoch": 1.375266524520256,
222
+ "grad_norm": 0.09592189639806747,
223
+ "kl": 0.005047607421875,
224
+ "learning_rate": 5.922283255294164e-08,
225
+ "loss": 0.0362,
226
+ "reward": 0.7993303924798966,
227
+ "reward_std": 0.15318573899567128,
228
+ "rewards/accuracy_reward": 0.7993303924798966,
229
+ "rewards/format_reward": 0.0,
230
+ "step": 80
231
+ },
232
+ {
233
+ "completion_length": 563.471452331543,
234
+ "epoch": 1.4605543710021323,
235
+ "grad_norm": 0.08133210241794586,
236
+ "kl": 0.005420303344726563,
237
+ "learning_rate": 4.864037798685106e-09,
238
+ "loss": 0.0287,
239
+ "reward": 0.7917411044239998,
240
+ "reward_std": 0.16233704406768085,
241
+ "rewards/accuracy_reward": 0.7917411044239998,
242
+ "rewards/format_reward": 0.0,
243
+ "step": 85
244
+ },
245
+ {
246
+ "completion_length": 601.3694458007812,
247
+ "epoch": 1.4946695095948828,
248
+ "kl": 0.004954338073730469,
249
+ "reward": 0.7209821734577417,
250
+ "reward_std": 0.17686136066913605,
251
+ "rewards/accuracy_reward": 0.7209821734577417,
252
+ "rewards/format_reward": 0.0,
253
+ "step": 87,
254
  "total_flos": 0.0,
255
+ "train_loss": 0.03965516161473318,
256
+ "train_runtime": 10204.823,
257
+ "train_samples_per_second": 1.102,
258
+ "train_steps_per_second": 0.009
259
  }
260
  ],
261
  "logging_steps": 5,
262
+ "max_steps": 87,
263
  "num_input_tokens_seen": 0,
264
+ "num_train_epochs": 2,
265
  "save_steps": 500,
266
  "stateful_callbacks": {
267
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20161f39d1b21a3a3a053fb61dc4ba40dda709060888f19e8d7bf40e2ee58015
3
  size 7928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ba803b2ace34fcfc3c61960d2aa18dc3b43a1ad62eedcaa1dfdd55f3db3a693
3
  size 7928