Attila1011 commited on
Commit
909957c
·
verified ·
1 Parent(s): 9491fa6

Upload folder using huggingface_hub

Browse files
checkpoints-v1.0-discrete-conditional/checkpoint-6075/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f308d28a715af9add8b1de2802f5fe0d790cf13b3da275d2033da2142579491f
3
+ size 24241232
checkpoints-v1.0-discrete-conditional/checkpoint-6075/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5894db628b27dc538b3dd2f5afb2580f7e6b9b5ff294a45e847dee74fd85ae81
3
+ size 2090059
checkpoints-v1.0-discrete-conditional/checkpoint-6075/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ef57ee88dfae1c93d3f03788067008bcafcaf720d6594dbfbcb3217f35b3914
3
+ size 14645
checkpoints-v1.0-discrete-conditional/checkpoint-6075/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad5272f5bbb3e1f5e5a02b236aff7f184d3dfb0d9e4a9da7851f5c7531043fe1
3
+ size 1383
checkpoints-v1.0-discrete-conditional/checkpoint-6075/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eedfcc2552d8a73e9f34b1abb5aa7d8a44e2a78a10791d5cf93e6b506cdec12b
3
+ size 1465
checkpoints-v1.0-discrete-conditional/checkpoint-6075/trainer_state.json ADDED
@@ -0,0 +1,586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 256,
7
+ "global_step": 6075,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.12641975308641976,
14
+ "grad_norm": 0.492948055267334,
15
+ "learning_rate": 0.000498046875,
16
+ "loss": 1.2244964838027954,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.12641975308641976,
21
+ "eval_bleu": 0.023732509698572428,
22
+ "eval_loss": 1.1824145491530256,
23
+ "eval_mse_loss": 1.1824145491530256,
24
+ "step": 256
25
+ },
26
+ {
27
+ "epoch": 0.12641975308641976,
28
+ "eval_bleu": 0.023732509698572428,
29
+ "eval_loss": 1.1824145491530256,
30
+ "eval_mse_loss": 1.1824145491530256,
31
+ "eval_runtime": 7.5989,
32
+ "eval_samples_per_second": 344.523,
33
+ "eval_steps_per_second": 5.396,
34
+ "step": 256
35
+ },
36
+ {
37
+ "epoch": 0.2528395061728395,
38
+ "grad_norm": 0.5793251395225525,
39
+ "learning_rate": 0.000998046875,
40
+ "loss": 1.1349375247955322,
41
+ "step": 512
42
+ },
43
+ {
44
+ "epoch": 0.2528395061728395,
45
+ "eval_bleu": 0.022541652789025497,
46
+ "eval_loss": 1.072069642020435,
47
+ "eval_mse_loss": 1.072069642020435,
48
+ "step": 512
49
+ },
50
+ {
51
+ "epoch": 0.2528395061728395,
52
+ "eval_bleu": 0.022541652789025497,
53
+ "eval_loss": 1.072069642020435,
54
+ "eval_mse_loss": 1.072069642020435,
55
+ "eval_runtime": 10.3846,
56
+ "eval_samples_per_second": 252.104,
57
+ "eval_steps_per_second": 3.948,
58
+ "step": 512
59
+ },
60
+ {
61
+ "epoch": 0.37925925925925924,
62
+ "grad_norm": 1.0635820627212524,
63
+ "learning_rate": 0.0009948245115884234,
64
+ "loss": 1.0092785358428955,
65
+ "step": 768
66
+ },
67
+ {
68
+ "epoch": 0.37925925925925924,
69
+ "eval_bleu": 0.033273457001155124,
70
+ "eval_loss": 0.9512386612775849,
71
+ "eval_mse_loss": 0.9512386612775849,
72
+ "step": 768
73
+ },
74
+ {
75
+ "epoch": 0.37925925925925924,
76
+ "eval_bleu": 0.033273457001155124,
77
+ "eval_loss": 0.9512386612775849,
78
+ "eval_mse_loss": 0.9512386612775849,
79
+ "eval_runtime": 7.8124,
80
+ "eval_samples_per_second": 335.109,
81
+ "eval_steps_per_second": 5.248,
82
+ "step": 768
83
+ },
84
+ {
85
+ "epoch": 0.505679012345679,
86
+ "grad_norm": 1.4405001401901245,
87
+ "learning_rate": 0.0009793249077302685,
88
+ "loss": 0.9039018750190735,
89
+ "step": 1024
90
+ },
91
+ {
92
+ "epoch": 0.505679012345679,
93
+ "eval_bleu": 0.052676822222678464,
94
+ "eval_loss": 0.8639799516375471,
95
+ "eval_mse_loss": 0.8639799516375471,
96
+ "step": 1024
97
+ },
98
+ {
99
+ "epoch": 0.505679012345679,
100
+ "eval_bleu": 0.052676822222678464,
101
+ "eval_loss": 0.8639799516375471,
102
+ "eval_mse_loss": 0.8639799516375471,
103
+ "eval_runtime": 7.9883,
104
+ "eval_samples_per_second": 327.728,
105
+ "eval_steps_per_second": 5.132,
106
+ "step": 1024
107
+ },
108
+ {
109
+ "epoch": 0.6320987654320988,
110
+ "grad_norm": 1.0358325242996216,
111
+ "learning_rate": 0.0009538244979611361,
112
+ "loss": 0.8407024145126343,
113
+ "step": 1280
114
+ },
115
+ {
116
+ "epoch": 0.6320987654320988,
117
+ "eval_bleu": 0.06327640177191206,
118
+ "eval_loss": 0.8159576320066685,
119
+ "eval_mse_loss": 0.8159576320066685,
120
+ "step": 1280
121
+ },
122
+ {
123
+ "epoch": 0.6320987654320988,
124
+ "eval_bleu": 0.06327640177191206,
125
+ "eval_loss": 0.8159576320066685,
126
+ "eval_mse_loss": 0.8159576320066685,
127
+ "eval_runtime": 7.5298,
128
+ "eval_samples_per_second": 347.684,
129
+ "eval_steps_per_second": 5.445,
130
+ "step": 1280
131
+ },
132
+ {
133
+ "epoch": 0.7585185185185185,
134
+ "grad_norm": 1.162336826324463,
135
+ "learning_rate": 0.000918855331929685,
136
+ "loss": 0.7959167957305908,
137
+ "step": 1536
138
+ },
139
+ {
140
+ "epoch": 0.7585185185185185,
141
+ "eval_bleu": 0.08702573079240403,
142
+ "eval_loss": 0.7736199585402884,
143
+ "eval_mse_loss": 0.7736199585402884,
144
+ "step": 1536
145
+ },
146
+ {
147
+ "epoch": 0.7585185185185185,
148
+ "eval_bleu": 0.08702573079240403,
149
+ "eval_loss": 0.7736199585402884,
150
+ "eval_mse_loss": 0.7736199585402884,
151
+ "eval_runtime": 7.3212,
152
+ "eval_samples_per_second": 357.594,
153
+ "eval_steps_per_second": 5.6,
154
+ "step": 1536
155
+ },
156
+ {
157
+ "epoch": 0.8849382716049383,
158
+ "grad_norm": 1.069529414176941,
159
+ "learning_rate": 0.0008751470187939401,
160
+ "loss": 0.765856146812439,
161
+ "step": 1792
162
+ },
163
+ {
164
+ "epoch": 0.8849382716049383,
165
+ "eval_bleu": 0.09888737557922633,
166
+ "eval_loss": 0.7493347060389635,
167
+ "eval_mse_loss": 0.7493347060389635,
168
+ "step": 1792
169
+ },
170
+ {
171
+ "epoch": 0.8849382716049383,
172
+ "eval_bleu": 0.09888737557922633,
173
+ "eval_loss": 0.7493347060389635,
174
+ "eval_mse_loss": 0.7493347060389635,
175
+ "eval_runtime": 7.8836,
176
+ "eval_samples_per_second": 332.081,
177
+ "eval_steps_per_second": 5.201,
178
+ "step": 1792
179
+ },
180
+ {
181
+ "epoch": 1.011358024691358,
182
+ "grad_norm": 1.7033277750015259,
183
+ "learning_rate": 0.000823611504395474,
184
+ "loss": 0.7400754690170288,
185
+ "step": 2048
186
+ },
187
+ {
188
+ "epoch": 1.011358024691358,
189
+ "eval_bleu": 0.109763799561906,
190
+ "eval_loss": 0.7278894520387417,
191
+ "eval_mse_loss": 0.7278894520387417,
192
+ "step": 2048
193
+ },
194
+ {
195
+ "epoch": 1.011358024691358,
196
+ "eval_bleu": 0.109763799561906,
197
+ "eval_loss": 0.7278894520387417,
198
+ "eval_mse_loss": 0.7278894520387417,
199
+ "eval_runtime": 8.1701,
200
+ "eval_samples_per_second": 320.438,
201
+ "eval_steps_per_second": 5.018,
202
+ "step": 2048
203
+ },
204
+ {
205
+ "epoch": 1.1377777777777778,
206
+ "grad_norm": 1.650696039199829,
207
+ "learning_rate": 0.0007653240440959618,
208
+ "loss": 0.717435359954834,
209
+ "step": 2304
210
+ },
211
+ {
212
+ "epoch": 1.1377777777777778,
213
+ "eval_bleu": 0.10638359304769301,
214
+ "eval_loss": 0.7039981920544695,
215
+ "eval_mse_loss": 0.7039981920544695,
216
+ "step": 2304
217
+ },
218
+ {
219
+ "epoch": 1.1377777777777778,
220
+ "eval_bleu": 0.10638359304769301,
221
+ "eval_loss": 0.7039981920544695,
222
+ "eval_mse_loss": 0.7039981920544695,
223
+ "eval_runtime": 7.2661,
224
+ "eval_samples_per_second": 360.301,
225
+ "eval_steps_per_second": 5.643,
226
+ "step": 2304
227
+ },
228
+ {
229
+ "epoch": 1.2641975308641975,
230
+ "grad_norm": 1.51780366897583,
231
+ "learning_rate": 0.0007015007682656353,
232
+ "loss": 0.6996763944625854,
233
+ "step": 2560
234
+ },
235
+ {
236
+ "epoch": 1.2641975308641975,
237
+ "eval_bleu": 0.12510003883439522,
238
+ "eval_loss": 0.6899523284377121,
239
+ "eval_mse_loss": 0.6899523284377121,
240
+ "step": 2560
241
+ },
242
+ {
243
+ "epoch": 1.2641975308641975,
244
+ "eval_bleu": 0.12510003883439522,
245
+ "eval_loss": 0.6899523284377121,
246
+ "eval_mse_loss": 0.6899523284377121,
247
+ "eval_runtime": 7.9616,
248
+ "eval_samples_per_second": 328.827,
249
+ "eval_steps_per_second": 5.15,
250
+ "step": 2560
251
+ },
252
+ {
253
+ "epoch": 1.3906172839506172,
254
+ "grad_norm": 1.5419062376022339,
255
+ "learning_rate": 0.0006334733085052896,
256
+ "loss": 0.6830626726150513,
257
+ "step": 2816
258
+ },
259
+ {
260
+ "epoch": 1.3906172839506172,
261
+ "eval_bleu": 0.1434415539389988,
262
+ "eval_loss": 0.6733295452304002,
263
+ "eval_mse_loss": 0.6733295452304002,
264
+ "step": 2816
265
+ },
266
+ {
267
+ "epoch": 1.3906172839506172,
268
+ "eval_bleu": 0.1434415539389988,
269
+ "eval_loss": 0.6733295452304002,
270
+ "eval_mse_loss": 0.6733295452304002,
271
+ "eval_runtime": 7.8649,
272
+ "eval_samples_per_second": 332.873,
273
+ "eval_steps_per_second": 5.213,
274
+ "step": 2816
275
+ },
276
+ {
277
+ "epoch": 1.5170370370370372,
278
+ "grad_norm": 1.7831259965896606,
279
+ "learning_rate": 0.0005626610140094024,
280
+ "loss": 0.6682955026626587,
281
+ "step": 3072
282
+ },
283
+ {
284
+ "epoch": 1.5170370370370372,
285
+ "eval_bleu": 0.1462795204156182,
286
+ "eval_loss": 0.6550371501503921,
287
+ "eval_mse_loss": 0.6550371501503921,
288
+ "step": 3072
289
+ },
290
+ {
291
+ "epoch": 1.5170370370370372,
292
+ "eval_bleu": 0.1462795204156182,
293
+ "eval_loss": 0.6550371501503921,
294
+ "eval_mse_loss": 0.6550371501503921,
295
+ "eval_runtime": 8.3223,
296
+ "eval_samples_per_second": 314.575,
297
+ "eval_steps_per_second": 4.927,
298
+ "step": 3072
299
+ },
300
+ {
301
+ "epoch": 1.643456790123457,
302
+ "grad_norm": 1.8476954698562622,
303
+ "learning_rate": 0.0004905413377580782,
304
+ "loss": 0.655277669429779,
305
+ "step": 3328
306
+ },
307
+ {
308
+ "epoch": 1.643456790123457,
309
+ "eval_bleu": 0.15115082134880684,
310
+ "eval_loss": 0.6512059845575472,
311
+ "eval_mse_loss": 0.6512059845575472,
312
+ "step": 3328
313
+ },
314
+ {
315
+ "epoch": 1.643456790123457,
316
+ "eval_bleu": 0.15115082134880684,
317
+ "eval_loss": 0.6512059845575472,
318
+ "eval_mse_loss": 0.6512059845575472,
319
+ "eval_runtime": 7.1783,
320
+ "eval_samples_per_second": 364.712,
321
+ "eval_steps_per_second": 5.712,
322
+ "step": 3328
323
+ },
324
+ {
325
+ "epoch": 1.7698765432098766,
326
+ "grad_norm": 1.8581129312515259,
327
+ "learning_rate": 0.0004186190104108759,
328
+ "loss": 0.6462851166725159,
329
+ "step": 3584
330
+ },
331
+ {
332
+ "epoch": 1.7698765432098766,
333
+ "eval_bleu": 0.15871631534957106,
334
+ "eval_loss": 0.6409407970381946,
335
+ "eval_mse_loss": 0.6409407970381946,
336
+ "step": 3584
337
+ },
338
+ {
339
+ "epoch": 1.7698765432098766,
340
+ "eval_bleu": 0.15871631534957106,
341
+ "eval_loss": 0.6409407970381946,
342
+ "eval_mse_loss": 0.6409407970381946,
343
+ "eval_runtime": 7.8013,
344
+ "eval_samples_per_second": 335.586,
345
+ "eval_steps_per_second": 5.256,
346
+ "step": 3584
347
+ },
348
+ {
349
+ "epoch": 1.8962962962962964,
350
+ "grad_norm": 1.4670681953430176,
351
+ "learning_rate": 0.00034839464506939983,
352
+ "loss": 0.6346220374107361,
353
+ "step": 3840
354
+ },
355
+ {
356
+ "epoch": 1.8962962962962964,
357
+ "eval_bleu": 0.16468961242826577,
358
+ "eval_loss": 0.6350918848340105,
359
+ "eval_mse_loss": 0.6350918848340105,
360
+ "step": 3840
361
+ },
362
+ {
363
+ "epoch": 1.8962962962962964,
364
+ "eval_bleu": 0.16468961242826577,
365
+ "eval_loss": 0.6350918848340105,
366
+ "eval_mse_loss": 0.6350918848340105,
367
+ "eval_runtime": 7.8698,
368
+ "eval_samples_per_second": 332.666,
369
+ "eval_steps_per_second": 5.21,
370
+ "step": 3840
371
+ },
372
+ {
373
+ "epoch": 2.022716049382716,
374
+ "grad_norm": 1.6064578294754028,
375
+ "learning_rate": 0.0002813334279500873,
376
+ "loss": 0.6261877417564392,
377
+ "step": 4096
378
+ },
379
+ {
380
+ "epoch": 2.022716049382716,
381
+ "eval_bleu": 0.18248380444483625,
382
+ "eval_loss": 0.6185257740137053,
383
+ "eval_mse_loss": 0.6185257740137053,
384
+ "step": 4096
385
+ },
386
+ {
387
+ "epoch": 2.022716049382716,
388
+ "eval_bleu": 0.18248380444483625,
389
+ "eval_loss": 0.6185257740137053,
390
+ "eval_mse_loss": 0.6185257740137053,
391
+ "eval_runtime": 8.3116,
392
+ "eval_samples_per_second": 314.98,
393
+ "eval_steps_per_second": 4.933,
394
+ "step": 4096
395
+ },
396
+ {
397
+ "epoch": 2.149135802469136,
398
+ "grad_norm": 1.1933625936508179,
399
+ "learning_rate": 0.0002188345482161727,
400
+ "loss": 0.6169702410697937,
401
+ "step": 4352
402
+ },
403
+ {
404
+ "epoch": 2.149135802469136,
405
+ "eval_bleu": 0.17739074250184356,
406
+ "eval_loss": 0.619684124865183,
407
+ "eval_mse_loss": 0.619684124865183,
408
+ "step": 4352
409
+ },
410
+ {
411
+ "epoch": 2.149135802469136,
412
+ "eval_bleu": 0.17739074250184356,
413
+ "eval_loss": 0.619684124865183,
414
+ "eval_mse_loss": 0.619684124865183,
415
+ "eval_runtime": 7.0884,
416
+ "eval_samples_per_second": 369.337,
417
+ "eval_steps_per_second": 5.784,
418
+ "step": 4352
419
+ },
420
+ {
421
+ "epoch": 2.2755555555555556,
422
+ "grad_norm": 1.3906270265579224,
423
+ "learning_rate": 0.00016220200479571452,
424
+ "loss": 0.612635612487793,
425
+ "step": 4608
426
+ },
427
+ {
428
+ "epoch": 2.2755555555555556,
429
+ "eval_bleu": 0.19037059494496988,
430
+ "eval_loss": 0.6192231905169603,
431
+ "eval_mse_loss": 0.6192231905169603,
432
+ "step": 4608
433
+ },
434
+ {
435
+ "epoch": 2.2755555555555556,
436
+ "eval_bleu": 0.19037059494496988,
437
+ "eval_loss": 0.6192231905169603,
438
+ "eval_mse_loss": 0.6192231905169603,
439
+ "eval_runtime": 7.2603,
440
+ "eval_samples_per_second": 360.593,
441
+ "eval_steps_per_second": 5.647,
442
+ "step": 4608
443
+ },
444
+ {
445
+ "epoch": 2.4019753086419753,
446
+ "grad_norm": 1.3547346591949463,
447
+ "learning_rate": 0.00011261739928266108,
448
+ "loss": 0.6073014736175537,
449
+ "step": 4864
450
+ },
451
+ {
452
+ "epoch": 2.4019753086419753,
453
+ "eval_bleu": 0.19665228309802169,
454
+ "eval_loss": 0.6012938604122255,
455
+ "eval_mse_loss": 0.6012938604122255,
456
+ "step": 4864
457
+ },
458
+ {
459
+ "epoch": 2.4019753086419753,
460
+ "eval_bleu": 0.19665228309802169,
461
+ "eval_loss": 0.6012938604122255,
462
+ "eval_mse_loss": 0.6012938604122255,
463
+ "eval_runtime": 7.3255,
464
+ "eval_samples_per_second": 357.382,
465
+ "eval_steps_per_second": 5.597,
466
+ "step": 4864
467
+ },
468
+ {
469
+ "epoch": 2.528395061728395,
470
+ "grad_norm": 1.2325892448425293,
471
+ "learning_rate": 7.111528257956546e-05,
472
+ "loss": 0.6047106981277466,
473
+ "step": 5120
474
+ },
475
+ {
476
+ "epoch": 2.528395061728395,
477
+ "eval_bleu": 0.18764803384320677,
478
+ "eval_loss": 0.6008666448476838,
479
+ "eval_mse_loss": 0.6008666448476838,
480
+ "step": 5120
481
+ },
482
+ {
483
+ "epoch": 2.528395061728395,
484
+ "eval_bleu": 0.18764803384320677,
485
+ "eval_loss": 0.6008666448476838,
486
+ "eval_mse_loss": 0.6008666448476838,
487
+ "eval_runtime": 7.2141,
488
+ "eval_samples_per_second": 362.901,
489
+ "eval_steps_per_second": 5.683,
490
+ "step": 5120
491
+ },
492
+ {
493
+ "epoch": 2.6548148148148147,
494
+ "grad_norm": 1.2043205499649048,
495
+ "learning_rate": 3.856156965839863e-05,
496
+ "loss": 0.6008339524269104,
497
+ "step": 5376
498
+ },
499
+ {
500
+ "epoch": 2.6548148148148147,
501
+ "eval_bleu": 0.1778719937984465,
502
+ "eval_loss": 0.6019039372118508,
503
+ "eval_mse_loss": 0.6019039372118508,
504
+ "step": 5376
505
+ },
506
+ {
507
+ "epoch": 2.6548148148148147,
508
+ "eval_bleu": 0.1778719937984465,
509
+ "eval_loss": 0.6019039372118508,
510
+ "eval_mse_loss": 0.6019039372118508,
511
+ "eval_runtime": 7.2108,
512
+ "eval_samples_per_second": 363.066,
513
+ "eval_steps_per_second": 5.686,
514
+ "step": 5376
515
+ },
516
+ {
517
+ "epoch": 2.7812345679012345,
518
+ "grad_norm": 0.8030632734298706,
519
+ "learning_rate": 1.56354728015935e-05,
520
+ "loss": 0.5988247990608215,
521
+ "step": 5632
522
+ },
523
+ {
524
+ "epoch": 2.7812345679012345,
525
+ "eval_bleu": 0.1821845130406551,
526
+ "eval_loss": 0.6124805255634028,
527
+ "eval_mse_loss": 0.6124805255634028,
528
+ "step": 5632
529
+ },
530
+ {
531
+ "epoch": 2.7812345679012345,
532
+ "eval_bleu": 0.1821845130406551,
533
+ "eval_loss": 0.6124805255634028,
534
+ "eval_mse_loss": 0.6124805255634028,
535
+ "eval_runtime": 7.2547,
536
+ "eval_samples_per_second": 360.87,
537
+ "eval_steps_per_second": 5.652,
538
+ "step": 5632
539
+ },
540
+ {
541
+ "epoch": 2.907654320987654,
542
+ "grad_norm": 0.7116315364837646,
543
+ "learning_rate": 2.81533027462183e-06,
544
+ "loss": 0.5993514657020569,
545
+ "step": 5888
546
+ },
547
+ {
548
+ "epoch": 2.907654320987654,
549
+ "eval_bleu": 0.1971691623533449,
550
+ "eval_loss": 0.593609597624802,
551
+ "eval_mse_loss": 0.593609597624802,
552
+ "step": 5888
553
+ },
554
+ {
555
+ "epoch": 2.907654320987654,
556
+ "eval_bleu": 0.1971691623533449,
557
+ "eval_loss": 0.593609597624802,
558
+ "eval_mse_loss": 0.593609597624802,
559
+ "eval_runtime": 7.1929,
560
+ "eval_samples_per_second": 363.972,
561
+ "eval_steps_per_second": 5.7,
562
+ "step": 5888
563
+ }
564
+ ],
565
+ "logging_steps": 256,
566
+ "max_steps": 6075,
567
+ "num_input_tokens_seen": 0,
568
+ "num_train_epochs": 3,
569
+ "save_steps": 256,
570
+ "stateful_callbacks": {
571
+ "TrainerControl": {
572
+ "args": {
573
+ "should_epoch_stop": false,
574
+ "should_evaluate": false,
575
+ "should_log": false,
576
+ "should_save": true,
577
+ "should_training_stop": true
578
+ },
579
+ "attributes": {}
580
+ }
581
+ },
582
+ "total_flos": 0.0,
583
+ "train_batch_size": 64,
584
+ "trial_name": null,
585
+ "trial_params": null
586
+ }
checkpoints-v1.0-discrete-conditional/checkpoint-6075/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2792db1613dd46e912da1df5b55d8147d2f8e78f28906344920814707c74766f
3
+ size 5137