Attila1011 commited on
Commit
e468f01
·
verified ·
1 Parent(s): 24b1806

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -38,3 +38,4 @@ checkpoints-v5.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs
38
  checkpoints-v5.2/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
39
  checkpoints-v5.3/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
40
  checkpoints-v5.4/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
38
  checkpoints-v5.2/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
39
  checkpoints-v5.3/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
40
  checkpoints-v5.4/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
41
+ checkpoints-v5.5/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.5/checkpoint-10240/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53c32f304e06a664ced6b5a01dc36f41c6efe354be2b757f556465a1c6d2dc5c
3
+ size 54599592
checkpoints-v5.5/checkpoint-10240/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dbe0c785ada6ae8ecb6cf96b30f8ee28b18ca8039146e1d8734df7733554bbf
3
+ size 19984661
checkpoints-v5.5/checkpoint-10240/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67bfcd99265e0a7f198a1da6183a2f5095b406251bd7dbc61a0ea3f3322685a9
3
+ size 54599624
checkpoints-v5.5/checkpoint-10240/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde09398522a16da97b8e3489802a486d243f6a2fd19b6085e0df696ca97cd63
3
+ size 76551435
checkpoints-v5.5/checkpoint-10240/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:056ea97ad2f35feda7dc1d25414ba26f8d40ede84bb42878aa5f2ad51b1b0166
3
+ size 14645
checkpoints-v5.5/checkpoint-10240/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1613ca5e69882f12c8eb8015e0d6aa8d2c1789f21a7954b955d73a0d7994fc88
3
+ size 1383
checkpoints-v5.5/checkpoint-10240/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20959e6dc06f08fff3842ea99d603e06a6a51caeb422e8680bff7397ce65afef
3
+ size 1465
checkpoints-v5.5/checkpoint-10240/trainer_state.json ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.3238735535587303,
6
+ "eval_steps": 1024,
7
+ "global_step": 10240,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.13239381989786023,
14
+ "grad_norm": 7.3368072509765625,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 13.833250999450684,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.13239381989786023,
21
+ "eval_bleu": 0.08843010093383347,
22
+ "eval_ce_clean_loss": 3.785046512154257,
23
+ "eval_ce_pred_loss": 6.1898151750017885,
24
+ "eval_flow_cos_loss": 0.39729490059955863,
25
+ "eval_flow_mse_loss": 1.2504585077808161,
26
+ "eval_loss": 9.467699330323821,
27
+ "flow/cos_sim": 0.6027051043358578,
28
+ "flow/improvement_ratio": 0.9944390654563904,
29
+ "flow/mag_ratio_mean": 0.6029561613775363,
30
+ "flow/mag_ratio_std": 0.06967356720357944,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.13239381989786023,
35
+ "eval_bleu": 0.08843010093383347,
36
+ "eval_ce_clean_loss": 3.785046512154257,
37
+ "eval_ce_pred_loss": 6.1898151750017885,
38
+ "eval_flow_cos_loss": 0.39729490059955863,
39
+ "eval_flow_mse_loss": 1.2504585077808161,
40
+ "eval_loss": 9.467699330323821,
41
+ "eval_runtime": 69.1426,
42
+ "eval_samples_per_second": 144.629,
43
+ "eval_steps_per_second": 2.271,
44
+ "flow/cos_sim": 0.6027051043358578,
45
+ "flow/improvement_ratio": 0.9944390654563904,
46
+ "flow/mag_ratio_mean": 0.6029561613775363,
47
+ "flow/mag_ratio_std": 0.06967356720357944,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.26478763979572045,
52
+ "grad_norm": 1.5976208448410034,
53
+ "learning_rate": 9.971175203561169e-05,
54
+ "loss": 6.556396484375,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.26478763979572045,
59
+ "eval_bleu": 0.2647666813502405,
60
+ "eval_ce_clean_loss": 0.6921462336922907,
61
+ "eval_ce_pred_loss": 4.159270631279915,
62
+ "eval_flow_cos_loss": 0.33016856043202103,
63
+ "eval_flow_mse_loss": 1.083006814786583,
64
+ "eval_loss": 4.769184565088551,
65
+ "flow/cos_sim": 0.6698314437441005,
66
+ "flow/improvement_ratio": 0.9946717445258122,
67
+ "flow/mag_ratio_mean": 0.6456947994839614,
68
+ "flow/mag_ratio_std": 0.08488734332239552,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.26478763979572045,
73
+ "eval_bleu": 0.2647666813502405,
74
+ "eval_ce_clean_loss": 0.6921462336922907,
75
+ "eval_ce_pred_loss": 4.159270631279915,
76
+ "eval_flow_cos_loss": 0.33016856043202103,
77
+ "eval_flow_mse_loss": 1.083006814786583,
78
+ "eval_loss": 4.769184565088551,
79
+ "eval_runtime": 67.8586,
80
+ "eval_samples_per_second": 147.365,
81
+ "eval_steps_per_second": 2.314,
82
+ "flow/cos_sim": 0.6698314437441005,
83
+ "flow/improvement_ratio": 0.9946717445258122,
84
+ "flow/mag_ratio_mean": 0.6456947994839614,
85
+ "flow/mag_ratio_std": 0.08488734332239552,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.3971814596935807,
90
+ "grad_norm": 1.0349175930023193,
91
+ "learning_rate": 9.885033161800567e-05,
92
+ "loss": 4.22868013381958,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.3971814596935807,
97
+ "eval_bleu": 0.3307477170153146,
98
+ "eval_ce_clean_loss": 0.21776276523140586,
99
+ "eval_ce_pred_loss": 3.5308286536271405,
100
+ "eval_flow_cos_loss": 0.3201853291244264,
101
+ "eval_flow_mse_loss": 1.0728299655732076,
102
+ "eval_loss": 3.8422190869689747,
103
+ "flow/cos_sim": 0.6798147354156349,
104
+ "flow/improvement_ratio": 0.9928556031482235,
105
+ "flow/mag_ratio_mean": 0.6495526474752243,
106
+ "flow/mag_ratio_std": 0.08990857878308388,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.3971814596935807,
111
+ "eval_bleu": 0.3307477170153146,
112
+ "eval_ce_clean_loss": 0.21776276523140586,
113
+ "eval_ce_pred_loss": 3.5308286536271405,
114
+ "eval_flow_cos_loss": 0.3201853291244264,
115
+ "eval_flow_mse_loss": 1.0728299655732076,
116
+ "eval_loss": 3.8422190869689747,
117
+ "eval_runtime": 67.9151,
118
+ "eval_samples_per_second": 147.243,
119
+ "eval_steps_per_second": 2.312,
120
+ "flow/cos_sim": 0.6798147354156349,
121
+ "flow/improvement_ratio": 0.9928556031482235,
122
+ "flow/mag_ratio_mean": 0.6495526474752243,
123
+ "flow/mag_ratio_std": 0.08990857878308388,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.5295752795914409,
128
+ "grad_norm": 2.284677028656006,
129
+ "learning_rate": 9.742400750550229e-05,
130
+ "loss": 3.7094979286193848,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.5295752795914409,
135
+ "eval_bleu": 0.3527590985954599,
136
+ "eval_ce_clean_loss": 0.10325595935818496,
137
+ "eval_ce_pred_loss": 3.3037764221240002,
138
+ "eval_flow_cos_loss": 0.29567099870390195,
139
+ "eval_flow_mse_loss": 1.0334580939286833,
140
+ "eval_loss": 3.5232752614719853,
141
+ "flow/cos_sim": 0.7043290168616423,
142
+ "flow/improvement_ratio": 0.9942314150227103,
143
+ "flow/mag_ratio_mean": 0.6710645951283206,
144
+ "flow/mag_ratio_std": 0.08415729860970929,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.5295752795914409,
149
+ "eval_bleu": 0.3527590985954599,
150
+ "eval_ce_clean_loss": 0.10325595935818496,
151
+ "eval_ce_pred_loss": 3.3037764221240002,
152
+ "eval_flow_cos_loss": 0.29567099870390195,
153
+ "eval_flow_mse_loss": 1.0334580939286833,
154
+ "eval_loss": 3.5232752614719853,
155
+ "eval_runtime": 67.527,
156
+ "eval_samples_per_second": 148.089,
157
+ "eval_steps_per_second": 2.325,
158
+ "flow/cos_sim": 0.7043290168616423,
159
+ "flow/improvement_ratio": 0.9942314150227103,
160
+ "flow/mag_ratio_mean": 0.6710645951283206,
161
+ "flow/mag_ratio_std": 0.08415729860970929,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.6619690994893012,
166
+ "grad_norm": 1.0625219345092773,
167
+ "learning_rate": 9.544981995345226e-05,
168
+ "loss": 3.4675893783569336,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.6619690994893012,
173
+ "eval_bleu": 0.37785369114831263,
174
+ "eval_ce_clean_loss": 0.0570975638404014,
175
+ "eval_ce_pred_loss": 3.1086114409622874,
176
+ "eval_flow_cos_loss": 0.2890773550340324,
177
+ "eval_flow_mse_loss": 1.0530509029983715,
178
+ "eval_loss": 3.3584457916818606,
179
+ "flow/cos_sim": 0.7109226474336757,
180
+ "flow/improvement_ratio": 0.9944721566643685,
181
+ "flow/mag_ratio_mean": 0.6796372863137798,
182
+ "flow/mag_ratio_std": 0.08220032687969268,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.6619690994893012,
187
+ "eval_bleu": 0.37785369114831263,
188
+ "eval_ce_clean_loss": 0.0570975638404014,
189
+ "eval_ce_pred_loss": 3.1086114409622874,
190
+ "eval_flow_cos_loss": 0.2890773550340324,
191
+ "eval_flow_mse_loss": 1.0530509029983715,
192
+ "eval_loss": 3.3584457916818606,
193
+ "eval_runtime": 67.5831,
194
+ "eval_samples_per_second": 147.966,
195
+ "eval_steps_per_second": 2.323,
196
+ "flow/cos_sim": 0.7109226474336757,
197
+ "flow/improvement_ratio": 0.9944721566643685,
198
+ "flow/mag_ratio_mean": 0.6796372863137798,
199
+ "flow/mag_ratio_std": 0.08220032687969268,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.7943629193871614,
204
+ "grad_norm": 1.390210747718811,
205
+ "learning_rate": 9.295057566334431e-05,
206
+ "loss": 3.3179638385772705,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.7943629193871614,
211
+ "eval_bleu": 0.40284924593911503,
212
+ "eval_ce_clean_loss": 0.03560329580999863,
213
+ "eval_ce_pred_loss": 2.935079864635589,
214
+ "eval_flow_cos_loss": 0.2695348363393431,
215
+ "eval_flow_mse_loss": 1.037700882383213,
216
+ "eval_loss": 3.195243762556914,
217
+ "flow/cos_sim": 0.7304651706841341,
218
+ "flow/improvement_ratio": 0.9945835288922498,
219
+ "flow/mag_ratio_mean": 0.7005888793119199,
220
+ "flow/mag_ratio_std": 0.0835665136480787,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.7943629193871614,
225
+ "eval_bleu": 0.40284924593911503,
226
+ "eval_ce_clean_loss": 0.03560329580999863,
227
+ "eval_ce_pred_loss": 2.935079864635589,
228
+ "eval_flow_cos_loss": 0.2695348363393431,
229
+ "eval_flow_mse_loss": 1.037700882383213,
230
+ "eval_loss": 3.195243762556914,
231
+ "eval_runtime": 67.6818,
232
+ "eval_samples_per_second": 147.75,
233
+ "eval_steps_per_second": 2.32,
234
+ "flow/cos_sim": 0.7304651706841341,
235
+ "flow/improvement_ratio": 0.9945835288922498,
236
+ "flow/mag_ratio_mean": 0.7005888793119199,
237
+ "flow/mag_ratio_std": 0.0835665136480787,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.9267567392850217,
242
+ "grad_norm": 1.7730882167816162,
243
+ "learning_rate": 8.995830349195804e-05,
244
+ "loss": 3.2147014141082764,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.9267567392850217,
249
+ "eval_bleu": 0.4108774093856199,
250
+ "eval_ce_clean_loss": 0.025004512588879105,
251
+ "eval_ce_pred_loss": 2.8478709542827243,
252
+ "eval_flow_cos_loss": 0.25457756676871307,
253
+ "eval_flow_mse_loss": 1.0318333833080948,
254
+ "eval_loss": 3.113991931745201,
255
+ "flow/cos_sim": 0.745422419469068,
256
+ "flow/improvement_ratio": 0.9949804590006542,
257
+ "flow/mag_ratio_mean": 0.7171509364607987,
258
+ "flow/mag_ratio_std": 0.08899391105600224,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.9267567392850217,
263
+ "eval_bleu": 0.4108774093856199,
264
+ "eval_ce_clean_loss": 0.025004512588879105,
265
+ "eval_ce_pred_loss": 2.8478709542827243,
266
+ "eval_flow_cos_loss": 0.25457756676871307,
267
+ "eval_flow_mse_loss": 1.0318333833080948,
268
+ "eval_loss": 3.113991931745201,
269
+ "eval_runtime": 67.7148,
270
+ "eval_samples_per_second": 147.678,
271
+ "eval_steps_per_second": 2.319,
272
+ "flow/cos_sim": 0.745422419469068,
273
+ "flow/improvement_ratio": 0.9949804590006542,
274
+ "flow/mag_ratio_mean": 0.7171509364607987,
275
+ "flow/mag_ratio_std": 0.08899391105600224,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 1.05908591376301,
280
+ "grad_norm": 2.208953619003296,
281
+ "learning_rate": 8.650172716103233e-05,
282
+ "loss": 3.119405508041382,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 1.05908591376301,
287
+ "eval_bleu": 0.4100243274079566,
288
+ "eval_ce_clean_loss": 0.018418982221632248,
289
+ "eval_ce_pred_loss": 2.801309669853016,
290
+ "eval_flow_cos_loss": 0.24036190160520518,
291
+ "eval_flow_mse_loss": 1.0228236701078475,
292
+ "eval_loss": 3.062249883724626,
293
+ "flow/cos_sim": 0.7596381280072935,
294
+ "flow/improvement_ratio": 0.9959618415042852,
295
+ "flow/mag_ratio_mean": 0.7364322915198697,
296
+ "flow/mag_ratio_std": 0.0917528191949152,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 1.05908591376301,
301
+ "eval_bleu": 0.4100243274079566,
302
+ "eval_ce_clean_loss": 0.018418982221632248,
303
+ "eval_ce_pred_loss": 2.801309669853016,
304
+ "eval_flow_cos_loss": 0.24036190160520518,
305
+ "eval_flow_mse_loss": 1.0228236701078475,
306
+ "eval_loss": 3.062249883724626,
307
+ "eval_runtime": 69.3159,
308
+ "eval_samples_per_second": 144.267,
309
+ "eval_steps_per_second": 2.265,
310
+ "flow/cos_sim": 0.7596381280072935,
311
+ "flow/improvement_ratio": 0.9959618415042852,
312
+ "flow/mag_ratio_mean": 0.7364322915198697,
313
+ "flow/mag_ratio_std": 0.0917528191949152,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 1.19147973366087,
318
+ "grad_norm": 1.0985896587371826,
319
+ "learning_rate": 8.263142386444264e-05,
320
+ "loss": 3.0675039291381836,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 1.19147973366087,
325
+ "eval_bleu": 0.42913809326340735,
326
+ "eval_ce_clean_loss": 0.014243369497904543,
327
+ "eval_ce_pred_loss": 2.679083149903899,
328
+ "eval_flow_cos_loss": 0.2306169035138598,
329
+ "eval_flow_mse_loss": 1.0239125793906534,
330
+ "eval_loss": 2.97116835861449,
331
+ "flow/cos_sim": 0.7693831127160674,
332
+ "flow/improvement_ratio": 0.9939722635184124,
333
+ "flow/mag_ratio_mean": 0.748531128190885,
334
+ "flow/mag_ratio_std": 0.0947970964821281,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 1.19147973366087,
339
+ "eval_bleu": 0.42913809326340735,
340
+ "eval_ce_clean_loss": 0.014243369497904543,
341
+ "eval_ce_pred_loss": 2.679083149903899,
342
+ "eval_flow_cos_loss": 0.2306169035138598,
343
+ "eval_flow_mse_loss": 1.0239125793906534,
344
+ "eval_loss": 2.97116835861449,
345
+ "eval_runtime": 67.1108,
346
+ "eval_samples_per_second": 149.007,
347
+ "eval_steps_per_second": 2.339,
348
+ "flow/cos_sim": 0.7693831127160674,
349
+ "flow/improvement_ratio": 0.9939722635184124,
350
+ "flow/mag_ratio_mean": 0.748531128190885,
351
+ "flow/mag_ratio_std": 0.0947970964821281,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 1.3238735535587303,
356
+ "grad_norm": 2.3166847229003906,
357
+ "learning_rate": 7.837697175482903e-05,
358
+ "loss": 3.002436399459839,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 1.3238735535587303,
363
+ "eval_bleu": 0.42768151912552244,
364
+ "eval_ce_clean_loss": 0.011440879840308883,
365
+ "eval_ce_pred_loss": 2.672361337455215,
366
+ "eval_flow_cos_loss": 0.2179829450739417,
367
+ "eval_flow_mse_loss": 1.0098259748926588,
368
+ "eval_loss": 2.946415506350766,
369
+ "flow/cos_sim": 0.782017103426016,
370
+ "flow/improvement_ratio": 0.9939965146362402,
371
+ "flow/mag_ratio_mean": 0.7625711093283003,
372
+ "flow/mag_ratio_std": 0.09684707447411908,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 1.3238735535587303,
377
+ "eval_bleu": 0.42768151912552244,
378
+ "eval_ce_clean_loss": 0.011440879840308883,
379
+ "eval_ce_pred_loss": 2.672361337455215,
380
+ "eval_flow_cos_loss": 0.2179829450739417,
381
+ "eval_flow_mse_loss": 1.0098259748926588,
382
+ "eval_loss": 2.946415506350766,
383
+ "eval_runtime": 67.7743,
384
+ "eval_samples_per_second": 147.549,
385
+ "eval_steps_per_second": 2.317,
386
+ "flow/cos_sim": 0.782017103426016,
387
+ "flow/improvement_ratio": 0.9939965146362402,
388
+ "flow/mag_ratio_mean": 0.7625711093283003,
389
+ "flow/mag_ratio_std": 0.09684707447411908,
390
+ "step": 10240
391
+ }
392
+ ],
393
+ "logging_steps": 1024,
394
+ "max_steps": 30940,
395
+ "num_input_tokens_seen": 0,
396
+ "num_train_epochs": 4,
397
+ "save_steps": 1024,
398
+ "stateful_callbacks": {
399
+ "TrainerControl": {
400
+ "args": {
401
+ "should_epoch_stop": false,
402
+ "should_evaluate": false,
403
+ "should_log": false,
404
+ "should_save": true,
405
+ "should_training_stop": false
406
+ },
407
+ "attributes": {}
408
+ }
409
+ },
410
+ "total_flos": 0.0,
411
+ "train_batch_size": 64,
412
+ "trial_name": null,
413
+ "trial_params": null
414
+ }
checkpoints-v5.5/checkpoint-10240/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7be4d80b8499f3b5f618b042dcec062719328222caddac0d4e4ce11d371480d
3
+ size 5137