rootxhacker commited on
Commit
5651f82
·
verified ·
1 Parent(s): 4c8fe3f

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a9e090bf1b3b09dbe8cb6b79d3a7e692232fb851dc5e22f221471a638858e73
3
  size 36730224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56e8ec13f5b750bff43bf95c8f426d5c76de12c9614c601c59257c731d90aad9
3
  size 36730224
last-checkpoint/ar_diffusion_info.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30fc997f014f8771605ac175a0becd64846b6e365b7b344cbcd1952ce4ff7b9d
3
  size 1736
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcf5fd030c264f8615af2d6bd5dd8f07c7ac6052929ca71266901b45d38e573c
3
  size 1736
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d50e12c43bc31ea78337ba4c2a352e47dae2a51ea081d7c0d57df4add4243c5
3
  size 73588346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed69c86e6661b1f68a99aa8b123937781810ccc7aacd7d2f7ad96f9fb90dd267
3
  size 73588346
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b95749af35857001e40598a88328f44c58838ef1911894bdba44fd9cf3d356b1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa10217fea7d7bdff8c6b7812212a3b0fcc34c0411d6adbf4710eb28a9f096e8
3
  size 14244
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7532c7a02e2aeb7c936b6d9813b2c402b9be5b25d9e0bb18270e536f6014e58
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6d2c7b185157ea92508fe4deec82c591b54fb96c18b69b1ba12fb4fa0a5f624
3
  size 988
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:129303cede08862e45aff723e13523f2863b1a8c5dd6144e719bcbf05975af10
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65d8086826e16c0b4479d43a31b6b223fba2daaa33e58ffbb91a05247a535912
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1991 +1,99 @@
1
  {
2
- "best_global_step": 11500,
3
- "best_metric": 1.792478322982788,
4
- "best_model_checkpoint": "./ar-diffusion-checkpoints-progressive-attention/checkpoint-11500",
5
- "epoch": 0.8845473425121144,
6
  "eval_steps": 250,
7
- "global_step": 11500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.003845858010922237,
14
- "grad_norm": 12.771480560302734,
15
  "learning_rate": 1.84e-05,
16
- "loss": 12.0004,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.007691716021844474,
21
- "grad_norm": 5.099525451660156,
22
  "learning_rate": 3.8e-05,
23
- "loss": 5.8445,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.01153757403276671,
28
- "grad_norm": 7.0865478515625,
29
  "learning_rate": 5.8e-05,
30
- "loss": 4.5083,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.015383432043688947,
35
- "grad_norm": 4.829924583435059,
36
  "learning_rate": 7.800000000000001e-05,
37
- "loss": 3.8954,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.019229290054611183,
42
- "grad_norm": 4.646179676055908,
43
  "learning_rate": 9.8e-05,
44
- "loss": 3.8418,
45
  "step": 250
46
  },
47
  {
48
  "epoch": 0.019229290054611183,
49
- "eval_loss": 3.765653371810913,
50
- "eval_runtime": 17.5085,
51
- "eval_samples_per_second": 57.115,
52
- "eval_steps_per_second": 14.279,
53
  "step": 250
54
  },
55
  {
56
  "epoch": 0.02307514806553342,
57
- "grad_norm": 8.012557029724121,
58
  "learning_rate": 0.000118,
59
- "loss": 3.5471,
60
  "step": 300
61
  },
62
  {
63
  "epoch": 0.02692100607645566,
64
- "grad_norm": 4.200570106506348,
65
  "learning_rate": 0.000138,
66
- "loss": 3.398,
67
  "step": 350
68
  },
69
  {
70
  "epoch": 0.030766864087377895,
71
- "grad_norm": 5.796392440795898,
72
  "learning_rate": 0.00015800000000000002,
73
- "loss": 3.5043,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 0.03461272209830013,
78
- "grad_norm": 3.202223300933838,
79
  "learning_rate": 0.00017800000000000002,
80
- "loss": 3.4124,
81
  "step": 450
82
  },
83
  {
84
  "epoch": 0.038458580109222366,
85
- "grad_norm": 4.212994575500488,
86
  "learning_rate": 0.00019800000000000002,
87
- "loss": 3.3327,
88
  "step": 500
89
  },
90
  {
91
  "epoch": 0.038458580109222366,
92
- "eval_loss": 3.3062171936035156,
93
- "eval_runtime": 17.4889,
94
- "eval_samples_per_second": 57.179,
95
- "eval_steps_per_second": 14.295,
96
  "step": 500
97
- },
98
- {
99
- "epoch": 0.0423044381201446,
100
- "grad_norm": 3.885101079940796,
101
- "learning_rate": 0.0001997662519803652,
102
- "loss": 3.3463,
103
- "step": 550
104
- },
105
- {
106
- "epoch": 0.04615029613106684,
107
- "grad_norm": 5.273513317108154,
108
- "learning_rate": 0.0001995065319585487,
109
- "loss": 3.252,
110
- "step": 600
111
- },
112
- {
113
- "epoch": 0.04999615414198908,
114
- "grad_norm": 3.779219627380371,
115
- "learning_rate": 0.0001992468119367322,
116
- "loss": 3.1188,
117
- "step": 650
118
- },
119
- {
120
- "epoch": 0.05384201215291132,
121
- "grad_norm": 3.2600204944610596,
122
- "learning_rate": 0.00019898709191491573,
123
- "loss": 2.9879,
124
- "step": 700
125
- },
126
- {
127
- "epoch": 0.05768787016383355,
128
- "grad_norm": 7.561593055725098,
129
- "learning_rate": 0.00019872737189309924,
130
- "loss": 3.2548,
131
- "step": 750
132
- },
133
- {
134
- "epoch": 0.05768787016383355,
135
- "eval_loss": 3.1286704540252686,
136
- "eval_runtime": 17.5722,
137
- "eval_samples_per_second": 56.908,
138
- "eval_steps_per_second": 14.227,
139
- "step": 750
140
- },
141
- {
142
- "epoch": 0.06153372817475579,
143
- "grad_norm": 3.786135673522949,
144
- "learning_rate": 0.00019846765187128275,
145
- "loss": 3.1959,
146
- "step": 800
147
- },
148
- {
149
- "epoch": 0.06537958618567802,
150
- "grad_norm": 4.230893611907959,
151
- "learning_rate": 0.0001982079318494663,
152
- "loss": 3.1453,
153
- "step": 850
154
- },
155
- {
156
- "epoch": 0.06922544419660026,
157
- "grad_norm": 2.7769834995269775,
158
- "learning_rate": 0.0001979482118276498,
159
- "loss": 3.1789,
160
- "step": 900
161
- },
162
- {
163
- "epoch": 0.0730713022075225,
164
- "grad_norm": 1.746205449104309,
165
- "learning_rate": 0.00019768849180583334,
166
- "loss": 2.9569,
167
- "step": 950
168
- },
169
- {
170
- "epoch": 0.07691716021844473,
171
- "grad_norm": 3.5990424156188965,
172
- "learning_rate": 0.00019742877178401683,
173
- "loss": 3.0739,
174
- "step": 1000
175
- },
176
- {
177
- "epoch": 0.07691716021844473,
178
- "eval_loss": 3.0572922229766846,
179
- "eval_runtime": 17.4776,
180
- "eval_samples_per_second": 57.216,
181
- "eval_steps_per_second": 14.304,
182
- "step": 1000
183
- },
184
- {
185
- "epoch": 0.08076301822936698,
186
- "grad_norm": 2.302044153213501,
187
- "learning_rate": 0.00019716905176220034,
188
- "loss": 2.9218,
189
- "step": 1050
190
- },
191
- {
192
- "epoch": 0.0846088762402892,
193
- "grad_norm": 2.9099605083465576,
194
- "learning_rate": 0.00019690933174038388,
195
- "loss": 3.027,
196
- "step": 1100
197
- },
198
- {
199
- "epoch": 0.08845473425121145,
200
- "grad_norm": 2.307961940765381,
201
- "learning_rate": 0.0001966496117185674,
202
- "loss": 2.9845,
203
- "step": 1150
204
- },
205
- {
206
- "epoch": 0.09230059226213368,
207
- "grad_norm": 4.332247734069824,
208
- "learning_rate": 0.0001963898916967509,
209
- "loss": 2.9047,
210
- "step": 1200
211
- },
212
- {
213
- "epoch": 0.09614645027305592,
214
- "grad_norm": 3.9856581687927246,
215
- "learning_rate": 0.00019613017167493445,
216
- "loss": 3.0003,
217
- "step": 1250
218
- },
219
- {
220
- "epoch": 0.09614645027305592,
221
- "eval_loss": 2.8999271392822266,
222
- "eval_runtime": 17.4674,
223
- "eval_samples_per_second": 57.25,
224
- "eval_steps_per_second": 14.312,
225
- "step": 1250
226
- },
227
- {
228
- "epoch": 0.09999230828397816,
229
- "grad_norm": 8.007057189941406,
230
- "learning_rate": 0.00019587045165311796,
231
- "loss": 2.9034,
232
- "step": 1300
233
- },
234
- {
235
- "epoch": 0.10383816629490039,
236
- "grad_norm": 2.3496313095092773,
237
- "learning_rate": 0.00019561073163130145,
238
- "loss": 2.8742,
239
- "step": 1350
240
- },
241
- {
242
- "epoch": 0.10768402430582263,
243
- "grad_norm": 2.3679521083831787,
244
- "learning_rate": 0.000195351011609485,
245
- "loss": 2.8189,
246
- "step": 1400
247
- },
248
- {
249
- "epoch": 0.11152988231674486,
250
- "grad_norm": 5.371161937713623,
251
- "learning_rate": 0.0001950912915876685,
252
- "loss": 2.826,
253
- "step": 1450
254
- },
255
- {
256
- "epoch": 0.1153757403276671,
257
- "grad_norm": 4.442603588104248,
258
- "learning_rate": 0.000194831571565852,
259
- "loss": 2.9645,
260
- "step": 1500
261
- },
262
- {
263
- "epoch": 0.1153757403276671,
264
- "eval_loss": 2.864973783493042,
265
- "eval_runtime": 17.4467,
266
- "eval_samples_per_second": 57.317,
267
- "eval_steps_per_second": 14.329,
268
- "step": 1500
269
- },
270
- {
271
- "epoch": 0.11922159833858934,
272
- "grad_norm": 3.153311252593994,
273
- "learning_rate": 0.00019457185154403555,
274
- "loss": 2.8506,
275
- "step": 1550
276
- },
277
- {
278
- "epoch": 0.12306745634951158,
279
- "grad_norm": 4.94291877746582,
280
- "learning_rate": 0.00019431213152221907,
281
- "loss": 2.9362,
282
- "step": 1600
283
- },
284
- {
285
- "epoch": 0.12691331436043382,
286
- "grad_norm": 3.0833139419555664,
287
- "learning_rate": 0.00019405241150040255,
288
- "loss": 2.7926,
289
- "step": 1650
290
- },
291
- {
292
- "epoch": 0.13075917237135604,
293
- "grad_norm": 4.330313205718994,
294
- "learning_rate": 0.00019379788587902243,
295
- "loss": 3.0154,
296
- "step": 1700
297
- },
298
- {
299
- "epoch": 0.13460503038227828,
300
- "grad_norm": 3.247084379196167,
301
- "learning_rate": 0.00019353816585720594,
302
- "loss": 2.8993,
303
- "step": 1750
304
- },
305
- {
306
- "epoch": 0.13460503038227828,
307
- "eval_loss": 2.797525644302368,
308
- "eval_runtime": 17.3938,
309
- "eval_samples_per_second": 57.492,
310
- "eval_steps_per_second": 14.373,
311
- "step": 1750
312
- },
313
- {
314
- "epoch": 0.13845088839320052,
315
- "grad_norm": 1.2497533559799194,
316
- "learning_rate": 0.00019327844583538946,
317
- "loss": 2.8769,
318
- "step": 1800
319
- },
320
- {
321
- "epoch": 0.14229674640412276,
322
- "grad_norm": 3.2852976322174072,
323
- "learning_rate": 0.000193018725813573,
324
- "loss": 2.7027,
325
- "step": 1850
326
- },
327
- {
328
- "epoch": 0.146142604415045,
329
- "grad_norm": 3.199697732925415,
330
- "learning_rate": 0.0001927590057917565,
331
- "loss": 2.8646,
332
- "step": 1900
333
- },
334
- {
335
- "epoch": 0.14998846242596722,
336
- "grad_norm": 2.30946946144104,
337
- "learning_rate": 0.00019249928576994002,
338
- "loss": 2.7485,
339
- "step": 1950
340
- },
341
- {
342
- "epoch": 0.15383432043688947,
343
- "grad_norm": 2.7489562034606934,
344
- "learning_rate": 0.00019223956574812353,
345
- "loss": 2.8288,
346
- "step": 2000
347
- },
348
- {
349
- "epoch": 0.15383432043688947,
350
- "eval_loss": 2.732482671737671,
351
- "eval_runtime": 17.4426,
352
- "eval_samples_per_second": 57.331,
353
- "eval_steps_per_second": 14.333,
354
- "step": 2000
355
- },
356
- {
357
- "epoch": 0.1576801784478117,
358
- "grad_norm": 2.094149351119995,
359
- "learning_rate": 0.00019197984572630705,
360
- "loss": 2.8181,
361
- "step": 2050
362
- },
363
- {
364
- "epoch": 0.16152603645873395,
365
- "grad_norm": 2.880585193634033,
366
- "learning_rate": 0.00019172012570449056,
367
- "loss": 2.7495,
368
- "step": 2100
369
- },
370
- {
371
- "epoch": 0.16537189446965617,
372
- "grad_norm": 3.0462090969085693,
373
- "learning_rate": 0.0001914604056826741,
374
- "loss": 2.6552,
375
- "step": 2150
376
- },
377
- {
378
- "epoch": 0.1692177524805784,
379
- "grad_norm": 2.8088278770446777,
380
- "learning_rate": 0.0001912006856608576,
381
- "loss": 2.6424,
382
- "step": 2200
383
- },
384
- {
385
- "epoch": 0.17306361049150065,
386
- "grad_norm": 2.3041441440582275,
387
- "learning_rate": 0.00019094096563904112,
388
- "loss": 2.6002,
389
- "step": 2250
390
- },
391
- {
392
- "epoch": 0.17306361049150065,
393
- "eval_loss": 2.6283011436462402,
394
- "eval_runtime": 17.4951,
395
- "eval_samples_per_second": 57.159,
396
- "eval_steps_per_second": 14.29,
397
- "step": 2250
398
- },
399
- {
400
- "epoch": 0.1769094685024229,
401
- "grad_norm": 2.4782445430755615,
402
- "learning_rate": 0.00019068124561722464,
403
- "loss": 2.7373,
404
- "step": 2300
405
- },
406
- {
407
- "epoch": 0.18075532651334514,
408
- "grad_norm": 2.799548864364624,
409
- "learning_rate": 0.00019042152559540815,
410
- "loss": 2.6528,
411
- "step": 2350
412
- },
413
- {
414
- "epoch": 0.18460118452426735,
415
- "grad_norm": 3.158393144607544,
416
- "learning_rate": 0.00019016180557359166,
417
- "loss": 2.6495,
418
- "step": 2400
419
- },
420
- {
421
- "epoch": 0.1884470425351896,
422
- "grad_norm": 2.4797089099884033,
423
- "learning_rate": 0.0001899020855517752,
424
- "loss": 2.6815,
425
- "step": 2450
426
- },
427
- {
428
- "epoch": 0.19229290054611184,
429
- "grad_norm": 2.5662102699279785,
430
- "learning_rate": 0.00018964236552995872,
431
- "loss": 2.6435,
432
- "step": 2500
433
- },
434
- {
435
- "epoch": 0.19229290054611184,
436
- "eval_loss": 2.6090738773345947,
437
- "eval_runtime": 17.3946,
438
- "eval_samples_per_second": 57.489,
439
- "eval_steps_per_second": 14.372,
440
- "step": 2500
441
- },
442
- {
443
- "epoch": 0.19613875855703408,
444
- "grad_norm": 2.0812065601348877,
445
- "learning_rate": 0.00018938264550814223,
446
- "loss": 2.4975,
447
- "step": 2550
448
- },
449
- {
450
- "epoch": 0.19998461656795632,
451
- "grad_norm": 3.9404165744781494,
452
- "learning_rate": 0.00018912292548632574,
453
- "loss": 2.6395,
454
- "step": 2600
455
- },
456
- {
457
- "epoch": 0.20383047457887854,
458
- "grad_norm": 2.122506856918335,
459
- "learning_rate": 0.00018886320546450925,
460
- "loss": 2.59,
461
- "step": 2650
462
- },
463
- {
464
- "epoch": 0.20767633258980078,
465
- "grad_norm": 4.2281293869018555,
466
- "learning_rate": 0.0001886034854426928,
467
- "loss": 2.5563,
468
- "step": 2700
469
- },
470
- {
471
- "epoch": 0.21152219060072303,
472
- "grad_norm": 2.492363691329956,
473
- "learning_rate": 0.0001883437654208763,
474
- "loss": 2.4601,
475
- "step": 2750
476
- },
477
- {
478
- "epoch": 0.21152219060072303,
479
- "eval_loss": 2.576280355453491,
480
- "eval_runtime": 17.2831,
481
- "eval_samples_per_second": 57.86,
482
- "eval_steps_per_second": 14.465,
483
- "step": 2750
484
- },
485
- {
486
- "epoch": 0.21536804861164527,
487
- "grad_norm": 3.6676604747772217,
488
- "learning_rate": 0.00018808404539905982,
489
- "loss": 2.4723,
490
- "step": 2800
491
- },
492
- {
493
- "epoch": 0.21921390662256748,
494
- "grad_norm": 2.855665922164917,
495
- "learning_rate": 0.00018782432537724336,
496
- "loss": 2.5224,
497
- "step": 2850
498
- },
499
- {
500
- "epoch": 0.22305976463348973,
501
- "grad_norm": 2.6359572410583496,
502
- "learning_rate": 0.00018756460535542685,
503
- "loss": 2.6181,
504
- "step": 2900
505
- },
506
- {
507
- "epoch": 0.22690562264441197,
508
- "grad_norm": 2.5210607051849365,
509
- "learning_rate": 0.00018730488533361036,
510
- "loss": 2.4691,
511
- "step": 2950
512
- },
513
- {
514
- "epoch": 0.2307514806553342,
515
- "grad_norm": 3.2655303478240967,
516
- "learning_rate": 0.0001870451653117939,
517
- "loss": 2.5791,
518
- "step": 3000
519
- },
520
- {
521
- "epoch": 0.2307514806553342,
522
- "eval_loss": 2.5004756450653076,
523
- "eval_runtime": 17.3413,
524
- "eval_samples_per_second": 57.666,
525
- "eval_steps_per_second": 14.416,
526
- "step": 3000
527
- },
528
- {
529
- "epoch": 0.23459733866625646,
530
- "grad_norm": 3.1443817615509033,
531
- "learning_rate": 0.0001867854452899774,
532
- "loss": 2.5689,
533
- "step": 3050
534
- },
535
- {
536
- "epoch": 0.23844319667717867,
537
- "grad_norm": 2.394235610961914,
538
- "learning_rate": 0.00018652572526816092,
539
- "loss": 2.4802,
540
- "step": 3100
541
- },
542
- {
543
- "epoch": 0.2422890546881009,
544
- "grad_norm": 3.216654062271118,
545
- "learning_rate": 0.00018626600524634446,
546
- "loss": 2.5239,
547
- "step": 3150
548
- },
549
- {
550
- "epoch": 0.24613491269902316,
551
- "grad_norm": 2.438185691833496,
552
- "learning_rate": 0.00018600628522452798,
553
- "loss": 2.6504,
554
- "step": 3200
555
- },
556
- {
557
- "epoch": 0.2499807707099454,
558
- "grad_norm": 3.8275232315063477,
559
- "learning_rate": 0.00018574656520271146,
560
- "loss": 2.4848,
561
- "step": 3250
562
- },
563
- {
564
- "epoch": 0.2499807707099454,
565
- "eval_loss": 2.4598114490509033,
566
- "eval_runtime": 17.3676,
567
- "eval_samples_per_second": 57.578,
568
- "eval_steps_per_second": 14.395,
569
- "step": 3250
570
- },
571
- {
572
- "epoch": 0.25382662872086764,
573
- "grad_norm": 2.3598976135253906,
574
- "learning_rate": 0.000185486845180895,
575
- "loss": 2.3763,
576
- "step": 3300
577
- },
578
- {
579
- "epoch": 0.2576724867317899,
580
- "grad_norm": 2.642561912536621,
581
- "learning_rate": 0.00018522712515907852,
582
- "loss": 2.5,
583
- "step": 3350
584
- },
585
- {
586
- "epoch": 0.26151834474271207,
587
- "grad_norm": 1.7602814435958862,
588
- "learning_rate": 0.00018496740513726206,
589
- "loss": 2.4706,
590
- "step": 3400
591
- },
592
- {
593
- "epoch": 0.2653642027536343,
594
- "grad_norm": 1.542140245437622,
595
- "learning_rate": 0.00018470768511544557,
596
- "loss": 2.4107,
597
- "step": 3450
598
- },
599
- {
600
- "epoch": 0.26921006076455656,
601
- "grad_norm": 1.7666794061660767,
602
- "learning_rate": 0.00018444796509362908,
603
- "loss": 2.4018,
604
- "step": 3500
605
- },
606
- {
607
- "epoch": 0.26921006076455656,
608
- "eval_loss": 2.4329559803009033,
609
- "eval_runtime": 17.3412,
610
- "eval_samples_per_second": 57.666,
611
- "eval_steps_per_second": 14.417,
612
- "step": 3500
613
- },
614
- {
615
- "epoch": 0.2730559187754788,
616
- "grad_norm": 2.434197425842285,
617
- "learning_rate": 0.0001841882450718126,
618
- "loss": 2.4402,
619
- "step": 3550
620
- },
621
- {
622
- "epoch": 0.27690177678640104,
623
- "grad_norm": 2.393425226211548,
624
- "learning_rate": 0.0001839285250499961,
625
- "loss": 2.4952,
626
- "step": 3600
627
- },
628
- {
629
- "epoch": 0.2807476347973233,
630
- "grad_norm": 2.2749860286712646,
631
- "learning_rate": 0.00018366880502817962,
632
- "loss": 2.4638,
633
- "step": 3650
634
- },
635
- {
636
- "epoch": 0.28459349280824553,
637
- "grad_norm": 1.988981008529663,
638
- "learning_rate": 0.00018340908500636316,
639
- "loss": 2.3414,
640
- "step": 3700
641
- },
642
- {
643
- "epoch": 0.2884393508191678,
644
- "grad_norm": 2.1251513957977295,
645
- "learning_rate": 0.00018314936498454667,
646
- "loss": 2.4272,
647
- "step": 3750
648
- },
649
- {
650
- "epoch": 0.2884393508191678,
651
- "eval_loss": 2.4050426483154297,
652
- "eval_runtime": 17.2358,
653
- "eval_samples_per_second": 58.019,
654
- "eval_steps_per_second": 14.505,
655
- "step": 3750
656
- },
657
- {
658
- "epoch": 0.29228520883009,
659
- "grad_norm": 2.7036380767822266,
660
- "learning_rate": 0.00018288964496273019,
661
- "loss": 2.3573,
662
- "step": 3800
663
- },
664
- {
665
- "epoch": 0.2961310668410122,
666
- "grad_norm": 2.6849310398101807,
667
- "learning_rate": 0.0001826299249409137,
668
- "loss": 2.3911,
669
- "step": 3850
670
- },
671
- {
672
- "epoch": 0.29997692485193445,
673
- "grad_norm": 3.012932777404785,
674
- "learning_rate": 0.0001823702049190972,
675
- "loss": 2.451,
676
- "step": 3900
677
- },
678
- {
679
- "epoch": 0.3038227828628567,
680
- "grad_norm": 2.8903510570526123,
681
- "learning_rate": 0.00018211048489728072,
682
- "loss": 2.3306,
683
- "step": 3950
684
- },
685
- {
686
- "epoch": 0.30766864087377893,
687
- "grad_norm": 1.4737602472305298,
688
- "learning_rate": 0.00018185076487546426,
689
- "loss": 2.3145,
690
- "step": 4000
691
- },
692
- {
693
- "epoch": 0.30766864087377893,
694
- "eval_loss": 2.364640712738037,
695
- "eval_runtime": 17.2167,
696
- "eval_samples_per_second": 58.083,
697
- "eval_steps_per_second": 14.521,
698
- "step": 4000
699
- },
700
- {
701
- "epoch": 0.3115144988847012,
702
- "grad_norm": 1.781186819076538,
703
- "learning_rate": 0.00018159104485364778,
704
- "loss": 2.3993,
705
- "step": 4050
706
- },
707
- {
708
- "epoch": 0.3153603568956234,
709
- "grad_norm": 2.6136422157287598,
710
- "learning_rate": 0.0001813313248318313,
711
- "loss": 2.3217,
712
- "step": 4100
713
- },
714
- {
715
- "epoch": 0.31920621490654566,
716
- "grad_norm": 2.6466028690338135,
717
- "learning_rate": 0.00018107160481001483,
718
- "loss": 2.369,
719
- "step": 4150
720
- },
721
- {
722
- "epoch": 0.3230520729174679,
723
- "grad_norm": 2.442426919937134,
724
- "learning_rate": 0.00018081188478819832,
725
- "loss": 2.3943,
726
- "step": 4200
727
- },
728
- {
729
- "epoch": 0.32689793092839015,
730
- "grad_norm": 5.356844902038574,
731
- "learning_rate": 0.00018055216476638186,
732
- "loss": 2.39,
733
- "step": 4250
734
- },
735
- {
736
- "epoch": 0.32689793092839015,
737
- "eval_loss": 2.3410093784332275,
738
- "eval_runtime": 17.2633,
739
- "eval_samples_per_second": 57.926,
740
- "eval_steps_per_second": 14.482,
741
- "step": 4250
742
- },
743
- {
744
- "epoch": 0.33074378893931233,
745
- "grad_norm": 1.8614141941070557,
746
- "learning_rate": 0.00018029244474456537,
747
- "loss": 2.3852,
748
- "step": 4300
749
- },
750
- {
751
- "epoch": 0.3345896469502346,
752
- "grad_norm": 1.7005974054336548,
753
- "learning_rate": 0.00018003272472274888,
754
- "loss": 2.2891,
755
- "step": 4350
756
- },
757
- {
758
- "epoch": 0.3384355049611568,
759
- "grad_norm": 2.0309245586395264,
760
- "learning_rate": 0.00017977300470093242,
761
- "loss": 2.3093,
762
- "step": 4400
763
- },
764
- {
765
- "epoch": 0.34228136297207906,
766
- "grad_norm": 2.6562678813934326,
767
- "learning_rate": 0.00017951328467911593,
768
- "loss": 2.2637,
769
- "step": 4450
770
- },
771
- {
772
- "epoch": 0.3461272209830013,
773
- "grad_norm": 1.352026343345642,
774
- "learning_rate": 0.00017925356465729945,
775
- "loss": 2.2263,
776
- "step": 4500
777
- },
778
- {
779
- "epoch": 0.3461272209830013,
780
- "eval_loss": 2.2719833850860596,
781
- "eval_runtime": 17.4568,
782
- "eval_samples_per_second": 57.284,
783
- "eval_steps_per_second": 14.321,
784
- "step": 4500
785
- },
786
- {
787
- "epoch": 0.34997307899392355,
788
- "grad_norm": 2.6207292079925537,
789
- "learning_rate": 0.00017899384463548296,
790
- "loss": 2.325,
791
- "step": 4550
792
- },
793
- {
794
- "epoch": 0.3538189370048458,
795
- "grad_norm": 1.9380669593811035,
796
- "learning_rate": 0.00017873412461366647,
797
- "loss": 2.3208,
798
- "step": 4600
799
- },
800
- {
801
- "epoch": 0.35766479501576803,
802
- "grad_norm": 2.2287373542785645,
803
- "learning_rate": 0.00017847440459184999,
804
- "loss": 2.3115,
805
- "step": 4650
806
- },
807
- {
808
- "epoch": 0.3615106530266903,
809
- "grad_norm": 2.3176259994506836,
810
- "learning_rate": 0.00017821468457003353,
811
- "loss": 2.2203,
812
- "step": 4700
813
- },
814
- {
815
- "epoch": 0.36535651103761246,
816
- "grad_norm": 1.9587980508804321,
817
- "learning_rate": 0.00017795496454821704,
818
- "loss": 2.2162,
819
- "step": 4750
820
- },
821
- {
822
- "epoch": 0.36535651103761246,
823
- "eval_loss": 2.263962984085083,
824
- "eval_runtime": 17.2653,
825
- "eval_samples_per_second": 57.92,
826
- "eval_steps_per_second": 14.48,
827
- "step": 4750
828
- },
829
- {
830
- "epoch": 0.3692023690485347,
831
- "grad_norm": 1.8225115537643433,
832
- "learning_rate": 0.00017769524452640055,
833
- "loss": 2.2906,
834
- "step": 4800
835
- },
836
- {
837
- "epoch": 0.37304822705945695,
838
- "grad_norm": 2.2642672061920166,
839
- "learning_rate": 0.00017743552450458406,
840
- "loss": 2.2819,
841
- "step": 4850
842
- },
843
- {
844
- "epoch": 0.3768940850703792,
845
- "grad_norm": 2.1667556762695312,
846
- "learning_rate": 0.00017717580448276758,
847
- "loss": 2.2756,
848
- "step": 4900
849
- },
850
- {
851
- "epoch": 0.38073994308130144,
852
- "grad_norm": 2.112743854522705,
853
- "learning_rate": 0.0001769160844609511,
854
- "loss": 2.2583,
855
- "step": 4950
856
- },
857
- {
858
- "epoch": 0.3845858010922237,
859
- "grad_norm": 3.304570436477661,
860
- "learning_rate": 0.00017665636443913463,
861
- "loss": 2.2407,
862
- "step": 5000
863
- },
864
- {
865
- "epoch": 0.3845858010922237,
866
- "eval_loss": 2.229123830795288,
867
- "eval_runtime": 17.1378,
868
- "eval_samples_per_second": 58.351,
869
- "eval_steps_per_second": 14.588,
870
- "step": 5000
871
- },
872
- {
873
- "epoch": 0.3884316591031459,
874
- "grad_norm": 1.6103578805923462,
875
- "learning_rate": 0.00017639664441731814,
876
- "loss": 2.3529,
877
- "step": 5050
878
- },
879
- {
880
- "epoch": 0.39227751711406816,
881
- "grad_norm": 1.2587641477584839,
882
- "learning_rate": 0.00017613692439550165,
883
- "loss": 2.2583,
884
- "step": 5100
885
- },
886
- {
887
- "epoch": 0.3961233751249904,
888
- "grad_norm": 3.0143070220947266,
889
- "learning_rate": 0.00017587720437368517,
890
- "loss": 2.1529,
891
- "step": 5150
892
- },
893
- {
894
- "epoch": 0.39996923313591265,
895
- "grad_norm": 2.145265579223633,
896
- "learning_rate": 0.00017561748435186868,
897
- "loss": 2.2835,
898
- "step": 5200
899
- },
900
- {
901
- "epoch": 0.40381509114683484,
902
- "grad_norm": 1.7174979448318481,
903
- "learning_rate": 0.00017535776433005222,
904
- "loss": 2.2701,
905
- "step": 5250
906
- },
907
- {
908
- "epoch": 0.40381509114683484,
909
- "eval_loss": 2.215972661972046,
910
- "eval_runtime": 17.0477,
911
- "eval_samples_per_second": 58.659,
912
- "eval_steps_per_second": 14.665,
913
- "step": 5250
914
- },
915
- {
916
- "epoch": 0.4076609491577571,
917
- "grad_norm": 1.8635681867599487,
918
- "learning_rate": 0.00017509804430823573,
919
- "loss": 2.1989,
920
- "step": 5300
921
- },
922
- {
923
- "epoch": 0.4115068071686793,
924
- "grad_norm": 4.2349700927734375,
925
- "learning_rate": 0.00017483832428641925,
926
- "loss": 2.2203,
927
- "step": 5350
928
- },
929
- {
930
- "epoch": 0.41535266517960157,
931
- "grad_norm": 2.0099525451660156,
932
- "learning_rate": 0.00017457860426460279,
933
- "loss": 2.2021,
934
- "step": 5400
935
- },
936
- {
937
- "epoch": 0.4191985231905238,
938
- "grad_norm": 2.215400457382202,
939
- "learning_rate": 0.00017431888424278627,
940
- "loss": 2.1583,
941
- "step": 5450
942
- },
943
- {
944
- "epoch": 0.42304438120144605,
945
- "grad_norm": 2.2177133560180664,
946
- "learning_rate": 0.00017405916422096978,
947
- "loss": 2.2502,
948
- "step": 5500
949
- },
950
- {
951
- "epoch": 0.42304438120144605,
952
- "eval_loss": 2.16481876373291,
953
- "eval_runtime": 17.3392,
954
- "eval_samples_per_second": 57.673,
955
- "eval_steps_per_second": 14.418,
956
- "step": 5500
957
- },
958
- {
959
- "epoch": 0.4268902392123683,
960
- "grad_norm": 2.4246537685394287,
961
- "learning_rate": 0.00017379944419915332,
962
- "loss": 2.1506,
963
- "step": 5550
964
- },
965
- {
966
- "epoch": 0.43073609722329054,
967
- "grad_norm": 2.5737595558166504,
968
- "learning_rate": 0.00017353972417733684,
969
- "loss": 2.1155,
970
- "step": 5600
971
- },
972
- {
973
- "epoch": 0.4345819552342128,
974
- "grad_norm": 1.9406490325927734,
975
- "learning_rate": 0.00017328000415552035,
976
- "loss": 2.1716,
977
- "step": 5650
978
- },
979
- {
980
- "epoch": 0.43842781324513497,
981
- "grad_norm": 1.8545359373092651,
982
- "learning_rate": 0.0001730202841337039,
983
- "loss": 2.3107,
984
- "step": 5700
985
- },
986
- {
987
- "epoch": 0.4422736712560572,
988
- "grad_norm": 1.4506388902664185,
989
- "learning_rate": 0.0001727605641118874,
990
- "loss": 2.1839,
991
- "step": 5750
992
- },
993
- {
994
- "epoch": 0.4422736712560572,
995
- "eval_loss": 2.1573686599731445,
996
- "eval_runtime": 17.1406,
997
- "eval_samples_per_second": 58.341,
998
- "eval_steps_per_second": 14.585,
999
- "step": 5750
1000
- },
1001
- {
1002
- "epoch": 0.44611952926697945,
1003
- "grad_norm": 2.4454505443573,
1004
- "learning_rate": 0.0001725008440900709,
1005
- "loss": 2.2399,
1006
- "step": 5800
1007
- },
1008
- {
1009
- "epoch": 0.4499653872779017,
1010
- "grad_norm": 1.7306182384490967,
1011
- "learning_rate": 0.00017224112406825443,
1012
- "loss": 2.2319,
1013
- "step": 5850
1014
- },
1015
- {
1016
- "epoch": 0.45381124528882394,
1017
- "grad_norm": 1.8138234615325928,
1018
- "learning_rate": 0.00017198140404643794,
1019
- "loss": 2.1173,
1020
- "step": 5900
1021
- },
1022
- {
1023
- "epoch": 0.4576571032997462,
1024
- "grad_norm": 1.6768412590026855,
1025
- "learning_rate": 0.00017172168402462148,
1026
- "loss": 2.2145,
1027
- "step": 5950
1028
- },
1029
- {
1030
- "epoch": 0.4615029613106684,
1031
- "grad_norm": 2.1484451293945312,
1032
- "learning_rate": 0.000171461964002805,
1033
- "loss": 2.171,
1034
- "step": 6000
1035
- },
1036
- {
1037
- "epoch": 0.4615029613106684,
1038
- "eval_loss": 2.1329145431518555,
1039
- "eval_runtime": 17.1772,
1040
- "eval_samples_per_second": 58.217,
1041
- "eval_steps_per_second": 14.554,
1042
- "step": 6000
1043
- },
1044
- {
1045
- "epoch": 0.46534881932159067,
1046
- "grad_norm": 2.217759609222412,
1047
- "learning_rate": 0.0001712022439809885,
1048
- "loss": 2.1679,
1049
- "step": 6050
1050
- },
1051
- {
1052
- "epoch": 0.4691946773325129,
1053
- "grad_norm": 1.4925270080566406,
1054
- "learning_rate": 0.00017094252395917202,
1055
- "loss": 2.1634,
1056
- "step": 6100
1057
- },
1058
- {
1059
- "epoch": 0.4730405353434351,
1060
- "grad_norm": 1.500253438949585,
1061
- "learning_rate": 0.00017068280393735553,
1062
- "loss": 2.1947,
1063
- "step": 6150
1064
- },
1065
- {
1066
- "epoch": 0.47688639335435734,
1067
- "grad_norm": 1.7444700002670288,
1068
- "learning_rate": 0.00017042308391553905,
1069
- "loss": 2.1864,
1070
- "step": 6200
1071
- },
1072
- {
1073
- "epoch": 0.4807322513652796,
1074
- "grad_norm": 2.21281099319458,
1075
- "learning_rate": 0.00017016336389372259,
1076
- "loss": 2.1346,
1077
- "step": 6250
1078
- },
1079
- {
1080
- "epoch": 0.4807322513652796,
1081
- "eval_loss": 2.1062209606170654,
1082
- "eval_runtime": 17.175,
1083
- "eval_samples_per_second": 58.224,
1084
- "eval_steps_per_second": 14.556,
1085
- "step": 6250
1086
- },
1087
- {
1088
- "epoch": 0.4845781093762018,
1089
- "grad_norm": 2.066366672515869,
1090
- "learning_rate": 0.0001699036438719061,
1091
- "loss": 2.1034,
1092
- "step": 6300
1093
- },
1094
- {
1095
- "epoch": 0.48842396738712407,
1096
- "grad_norm": 2.236504554748535,
1097
- "learning_rate": 0.0001696439238500896,
1098
- "loss": 2.1293,
1099
- "step": 6350
1100
- },
1101
- {
1102
- "epoch": 0.4922698253980463,
1103
- "grad_norm": 2.368986129760742,
1104
- "learning_rate": 0.00016938420382827312,
1105
- "loss": 2.2147,
1106
- "step": 6400
1107
- },
1108
- {
1109
- "epoch": 0.49611568340896856,
1110
- "grad_norm": 3.3609116077423096,
1111
- "learning_rate": 0.00016912448380645664,
1112
- "loss": 2.1395,
1113
- "step": 6450
1114
- },
1115
- {
1116
- "epoch": 0.4999615414198908,
1117
- "grad_norm": 1.7160570621490479,
1118
- "learning_rate": 0.00016886476378464015,
1119
- "loss": 2.0777,
1120
- "step": 6500
1121
- },
1122
- {
1123
- "epoch": 0.4999615414198908,
1124
- "eval_loss": 2.0895941257476807,
1125
- "eval_runtime": 17.0411,
1126
- "eval_samples_per_second": 58.682,
1127
- "eval_steps_per_second": 14.67,
1128
- "step": 6500
1129
- },
1130
- {
1131
- "epoch": 0.503807399430813,
1132
- "grad_norm": 2.2851459980010986,
1133
- "learning_rate": 0.0001686050437628237,
1134
- "loss": 2.155,
1135
- "step": 6550
1136
- },
1137
- {
1138
- "epoch": 0.5076532574417353,
1139
- "grad_norm": 3.6158559322357178,
1140
- "learning_rate": 0.0001683453237410072,
1141
- "loss": 2.1105,
1142
- "step": 6600
1143
- },
1144
- {
1145
- "epoch": 0.5114991154526575,
1146
- "grad_norm": 1.608955979347229,
1147
- "learning_rate": 0.00016808560371919072,
1148
- "loss": 2.0908,
1149
- "step": 6650
1150
- },
1151
- {
1152
- "epoch": 0.5153449734635798,
1153
- "grad_norm": 2.7433218955993652,
1154
- "learning_rate": 0.00016782588369737426,
1155
- "loss": 2.04,
1156
- "step": 6700
1157
- },
1158
- {
1159
- "epoch": 0.519190831474502,
1160
- "grad_norm": 3.286970376968384,
1161
- "learning_rate": 0.00016756616367555774,
1162
- "loss": 2.1608,
1163
- "step": 6750
1164
- },
1165
- {
1166
- "epoch": 0.519190831474502,
1167
- "eval_loss": 2.0548338890075684,
1168
- "eval_runtime": 17.3257,
1169
- "eval_samples_per_second": 57.718,
1170
- "eval_steps_per_second": 14.429,
1171
- "step": 6750
1172
- },
1173
- {
1174
- "epoch": 0.5230366894854241,
1175
- "grad_norm": 1.7461440563201904,
1176
- "learning_rate": 0.00016730644365374128,
1177
- "loss": 2.0562,
1178
- "step": 6800
1179
- },
1180
- {
1181
- "epoch": 0.5268825474963464,
1182
- "grad_norm": 3.4528918266296387,
1183
- "learning_rate": 0.0001670467236319248,
1184
- "loss": 2.0983,
1185
- "step": 6850
1186
- },
1187
- {
1188
- "epoch": 0.5307284055072686,
1189
- "grad_norm": 2.23818302154541,
1190
- "learning_rate": 0.0001667870036101083,
1191
- "loss": 2.0187,
1192
- "step": 6900
1193
- },
1194
- {
1195
- "epoch": 0.5345742635181909,
1196
- "grad_norm": 1.7558523416519165,
1197
- "learning_rate": 0.00016652728358829185,
1198
- "loss": 2.0996,
1199
- "step": 6950
1200
- },
1201
- {
1202
- "epoch": 0.5384201215291131,
1203
- "grad_norm": 1.8939173221588135,
1204
- "learning_rate": 0.00016626756356647536,
1205
- "loss": 2.0499,
1206
- "step": 7000
1207
- },
1208
- {
1209
- "epoch": 0.5384201215291131,
1210
- "eval_loss": 2.0413780212402344,
1211
- "eval_runtime": 17.1018,
1212
- "eval_samples_per_second": 58.473,
1213
- "eval_steps_per_second": 14.618,
1214
- "step": 7000
1215
- },
1216
- {
1217
- "epoch": 0.5422659795400354,
1218
- "grad_norm": 2.8039333820343018,
1219
- "learning_rate": 0.00016600784354465885,
1220
- "loss": 2.0606,
1221
- "step": 7050
1222
- },
1223
- {
1224
- "epoch": 0.5461118375509576,
1225
- "grad_norm": 1.4562283754348755,
1226
- "learning_rate": 0.00016574812352284239,
1227
- "loss": 2.1611,
1228
- "step": 7100
1229
- },
1230
- {
1231
- "epoch": 0.5499576955618799,
1232
- "grad_norm": 1.4812073707580566,
1233
- "learning_rate": 0.0001654884035010259,
1234
- "loss": 2.125,
1235
- "step": 7150
1236
- },
1237
- {
1238
- "epoch": 0.5538035535728021,
1239
- "grad_norm": 2.279209613800049,
1240
- "learning_rate": 0.0001652286834792094,
1241
- "loss": 2.0865,
1242
- "step": 7200
1243
- },
1244
- {
1245
- "epoch": 0.5576494115837243,
1246
- "grad_norm": 3.1694416999816895,
1247
- "learning_rate": 0.00016496896345739295,
1248
- "loss": 2.1783,
1249
- "step": 7250
1250
- },
1251
- {
1252
- "epoch": 0.5576494115837243,
1253
- "eval_loss": 2.0270566940307617,
1254
- "eval_runtime": 16.9999,
1255
- "eval_samples_per_second": 58.824,
1256
- "eval_steps_per_second": 14.706,
1257
- "step": 7250
1258
- },
1259
- {
1260
- "epoch": 0.5614952695946466,
1261
- "grad_norm": 2.7421817779541016,
1262
- "learning_rate": 0.00016470924343557646,
1263
- "loss": 2.0691,
1264
- "step": 7300
1265
- },
1266
- {
1267
- "epoch": 0.5653411276055688,
1268
- "grad_norm": 1.6490452289581299,
1269
- "learning_rate": 0.00016444952341375998,
1270
- "loss": 2.0467,
1271
- "step": 7350
1272
- },
1273
- {
1274
- "epoch": 0.5691869856164911,
1275
- "grad_norm": 2.1511409282684326,
1276
- "learning_rate": 0.0001641898033919435,
1277
- "loss": 2.0369,
1278
- "step": 7400
1279
- },
1280
- {
1281
- "epoch": 0.5730328436274132,
1282
- "grad_norm": 1.9185343980789185,
1283
- "learning_rate": 0.000163930083370127,
1284
- "loss": 2.1182,
1285
- "step": 7450
1286
- },
1287
- {
1288
- "epoch": 0.5768787016383355,
1289
- "grad_norm": 1.5140857696533203,
1290
- "learning_rate": 0.00016367036334831052,
1291
- "loss": 2.1145,
1292
- "step": 7500
1293
- },
1294
- {
1295
- "epoch": 0.5768787016383355,
1296
- "eval_loss": 2.016103744506836,
1297
- "eval_runtime": 17.1749,
1298
- "eval_samples_per_second": 58.225,
1299
- "eval_steps_per_second": 14.556,
1300
- "step": 7500
1301
- },
1302
- {
1303
- "epoch": 0.5807245596492577,
1304
- "grad_norm": 1.6893503665924072,
1305
- "learning_rate": 0.00016341064332649406,
1306
- "loss": 2.0415,
1307
- "step": 7550
1308
- },
1309
- {
1310
- "epoch": 0.58457041766018,
1311
- "grad_norm": 2.099968433380127,
1312
- "learning_rate": 0.00016315092330467757,
1313
- "loss": 1.9203,
1314
- "step": 7600
1315
- },
1316
- {
1317
- "epoch": 0.5884162756711022,
1318
- "grad_norm": 2.0659477710723877,
1319
- "learning_rate": 0.00016289120328286108,
1320
- "loss": 2.1188,
1321
- "step": 7650
1322
- },
1323
- {
1324
- "epoch": 0.5922621336820244,
1325
- "grad_norm": 1.599091649055481,
1326
- "learning_rate": 0.0001626314832610446,
1327
- "loss": 1.9583,
1328
- "step": 7700
1329
- },
1330
- {
1331
- "epoch": 0.5961079916929467,
1332
- "grad_norm": 2.5325448513031006,
1333
- "learning_rate": 0.0001623717632392281,
1334
- "loss": 2.088,
1335
- "step": 7750
1336
- },
1337
- {
1338
- "epoch": 0.5961079916929467,
1339
- "eval_loss": 1.9883803129196167,
1340
- "eval_runtime": 17.4065,
1341
- "eval_samples_per_second": 57.45,
1342
- "eval_steps_per_second": 14.362,
1343
- "step": 7750
1344
- },
1345
- {
1346
- "epoch": 0.5999538497038689,
1347
- "grad_norm": 1.6605775356292725,
1348
- "learning_rate": 0.00016211204321741165,
1349
- "loss": 2.0767,
1350
- "step": 7800
1351
- },
1352
- {
1353
- "epoch": 0.6037997077147912,
1354
- "grad_norm": 1.4154255390167236,
1355
- "learning_rate": 0.00016185232319559516,
1356
- "loss": 1.9914,
1357
- "step": 7850
1358
- },
1359
- {
1360
- "epoch": 0.6076455657257134,
1361
- "grad_norm": 1.7516275644302368,
1362
- "learning_rate": 0.00016159260317377867,
1363
- "loss": 2.0046,
1364
- "step": 7900
1365
- },
1366
- {
1367
- "epoch": 0.6114914237366357,
1368
- "grad_norm": 1.6723463535308838,
1369
- "learning_rate": 0.0001613328831519622,
1370
- "loss": 2.0312,
1371
- "step": 7950
1372
- },
1373
- {
1374
- "epoch": 0.6153372817475579,
1375
- "grad_norm": 2.417466163635254,
1376
- "learning_rate": 0.0001610731631301457,
1377
- "loss": 2.1176,
1378
- "step": 8000
1379
- },
1380
- {
1381
- "epoch": 0.6153372817475579,
1382
- "eval_loss": 1.9759701490402222,
1383
- "eval_runtime": 17.031,
1384
- "eval_samples_per_second": 58.717,
1385
- "eval_steps_per_second": 14.679,
1386
- "step": 8000
1387
- },
1388
- {
1389
- "epoch": 0.6191831397584802,
1390
- "grad_norm": 2.4189867973327637,
1391
- "learning_rate": 0.0001608134431083292,
1392
- "loss": 1.9562,
1393
- "step": 8050
1394
- },
1395
- {
1396
- "epoch": 0.6230289977694023,
1397
- "grad_norm": 2.1052446365356445,
1398
- "learning_rate": 0.00016055372308651275,
1399
- "loss": 2.0396,
1400
- "step": 8100
1401
- },
1402
- {
1403
- "epoch": 0.6268748557803245,
1404
- "grad_norm": 2.675004482269287,
1405
- "learning_rate": 0.00016029400306469626,
1406
- "loss": 2.0172,
1407
- "step": 8150
1408
- },
1409
- {
1410
- "epoch": 0.6307207137912468,
1411
- "grad_norm": 3.2394967079162598,
1412
- "learning_rate": 0.00016003947744331611,
1413
- "loss": 2.0258,
1414
- "step": 8200
1415
- },
1416
- {
1417
- "epoch": 0.634566571802169,
1418
- "grad_norm": 1.610350489616394,
1419
- "learning_rate": 0.00015977975742149963,
1420
- "loss": 1.9348,
1421
- "step": 8250
1422
- },
1423
- {
1424
- "epoch": 0.634566571802169,
1425
- "eval_loss": 1.9506243467330933,
1426
- "eval_runtime": 17.1568,
1427
- "eval_samples_per_second": 58.286,
1428
- "eval_steps_per_second": 14.571,
1429
- "step": 8250
1430
- },
1431
- {
1432
- "epoch": 0.6384124298130913,
1433
- "grad_norm": 2.4209282398223877,
1434
- "learning_rate": 0.00015952003739968314,
1435
- "loss": 1.9787,
1436
- "step": 8300
1437
- },
1438
- {
1439
- "epoch": 0.6422582878240135,
1440
- "grad_norm": 1.3403830528259277,
1441
- "learning_rate": 0.00015926031737786665,
1442
- "loss": 1.9947,
1443
- "step": 8350
1444
- },
1445
- {
1446
- "epoch": 0.6461041458349358,
1447
- "grad_norm": 1.963592290878296,
1448
- "learning_rate": 0.0001590005973560502,
1449
- "loss": 1.9534,
1450
- "step": 8400
1451
- },
1452
- {
1453
- "epoch": 0.649950003845858,
1454
- "grad_norm": 1.5136828422546387,
1455
- "learning_rate": 0.0001587408773342337,
1456
- "loss": 1.9946,
1457
- "step": 8450
1458
- },
1459
- {
1460
- "epoch": 0.6537958618567803,
1461
- "grad_norm": 2.382208824157715,
1462
- "learning_rate": 0.00015848115731241722,
1463
- "loss": 2.1216,
1464
- "step": 8500
1465
- },
1466
- {
1467
- "epoch": 0.6537958618567803,
1468
- "eval_loss": 1.9431382417678833,
1469
- "eval_runtime": 17.244,
1470
- "eval_samples_per_second": 57.991,
1471
- "eval_steps_per_second": 14.498,
1472
- "step": 8500
1473
- },
1474
- {
1475
- "epoch": 0.6576417198677025,
1476
- "grad_norm": 1.2141226530075073,
1477
- "learning_rate": 0.00015822143729060076,
1478
- "loss": 1.9762,
1479
- "step": 8550
1480
- },
1481
- {
1482
- "epoch": 0.6614875778786247,
1483
- "grad_norm": 2.0325729846954346,
1484
- "learning_rate": 0.00015796171726878427,
1485
- "loss": 1.9871,
1486
- "step": 8600
1487
- },
1488
- {
1489
- "epoch": 0.665333435889547,
1490
- "grad_norm": 1.6352391242980957,
1491
- "learning_rate": 0.00015770199724696776,
1492
- "loss": 1.9289,
1493
- "step": 8650
1494
- },
1495
- {
1496
- "epoch": 0.6691792939004692,
1497
- "grad_norm": 1.546341896057129,
1498
- "learning_rate": 0.0001574422772251513,
1499
- "loss": 1.9499,
1500
- "step": 8700
1501
- },
1502
- {
1503
- "epoch": 0.6730251519113915,
1504
- "grad_norm": 2.183812141418457,
1505
- "learning_rate": 0.0001571825572033348,
1506
- "loss": 1.9727,
1507
- "step": 8750
1508
- },
1509
- {
1510
- "epoch": 0.6730251519113915,
1511
- "eval_loss": 1.9287711381912231,
1512
- "eval_runtime": 17.0409,
1513
- "eval_samples_per_second": 58.682,
1514
- "eval_steps_per_second": 14.671,
1515
- "step": 8750
1516
- },
1517
- {
1518
- "epoch": 0.6768710099223136,
1519
- "grad_norm": 2.3215548992156982,
1520
- "learning_rate": 0.00015692283718151832,
1521
- "loss": 2.0059,
1522
- "step": 8800
1523
- },
1524
- {
1525
- "epoch": 0.6807168679332359,
1526
- "grad_norm": 1.9137969017028809,
1527
- "learning_rate": 0.00015666311715970186,
1528
- "loss": 1.9883,
1529
- "step": 8850
1530
- },
1531
- {
1532
- "epoch": 0.6845627259441581,
1533
- "grad_norm": 2.4092469215393066,
1534
- "learning_rate": 0.00015640339713788538,
1535
- "loss": 1.9236,
1536
- "step": 8900
1537
- },
1538
- {
1539
- "epoch": 0.6884085839550804,
1540
- "grad_norm": 2.448526620864868,
1541
- "learning_rate": 0.0001561436771160689,
1542
- "loss": 1.9178,
1543
- "step": 8950
1544
- },
1545
- {
1546
- "epoch": 0.6922544419660026,
1547
- "grad_norm": 1.0038529634475708,
1548
- "learning_rate": 0.0001558839570942524,
1549
- "loss": 1.8992,
1550
- "step": 9000
1551
- },
1552
- {
1553
- "epoch": 0.6922544419660026,
1554
- "eval_loss": 1.902275800704956,
1555
- "eval_runtime": 16.9573,
1556
- "eval_samples_per_second": 58.972,
1557
- "eval_steps_per_second": 14.743,
1558
- "step": 9000
1559
- },
1560
- {
1561
- "epoch": 0.6961002999769248,
1562
- "grad_norm": 3.6892011165618896,
1563
- "learning_rate": 0.00015562423707243591,
1564
- "loss": 1.9551,
1565
- "step": 9050
1566
- },
1567
- {
1568
- "epoch": 0.6999461579878471,
1569
- "grad_norm": 1.524671196937561,
1570
- "learning_rate": 0.00015536451705061943,
1571
- "loss": 1.9109,
1572
- "step": 9100
1573
- },
1574
- {
1575
- "epoch": 0.7037920159987693,
1576
- "grad_norm": 1.5293575525283813,
1577
- "learning_rate": 0.00015510479702880297,
1578
- "loss": 1.8891,
1579
- "step": 9150
1580
- },
1581
- {
1582
- "epoch": 0.7076378740096916,
1583
- "grad_norm": 2.321476697921753,
1584
- "learning_rate": 0.00015484507700698648,
1585
- "loss": 1.9021,
1586
- "step": 9200
1587
- },
1588
- {
1589
- "epoch": 0.7114837320206138,
1590
- "grad_norm": 2.710942029953003,
1591
- "learning_rate": 0.00015458535698517,
1592
- "loss": 1.9414,
1593
- "step": 9250
1594
- },
1595
- {
1596
- "epoch": 0.7114837320206138,
1597
- "eval_loss": 1.9107917547225952,
1598
- "eval_runtime": 16.997,
1599
- "eval_samples_per_second": 58.834,
1600
- "eval_steps_per_second": 14.708,
1601
- "step": 9250
1602
- },
1603
- {
1604
- "epoch": 0.7153295900315361,
1605
- "grad_norm": 1.9385954141616821,
1606
- "learning_rate": 0.0001543256369633535,
1607
- "loss": 1.9401,
1608
- "step": 9300
1609
- },
1610
- {
1611
- "epoch": 0.7191754480424583,
1612
- "grad_norm": 2.589629650115967,
1613
- "learning_rate": 0.00015406591694153702,
1614
- "loss": 1.9054,
1615
- "step": 9350
1616
- },
1617
- {
1618
- "epoch": 0.7230213060533806,
1619
- "grad_norm": 1.6431207656860352,
1620
- "learning_rate": 0.00015380619691972056,
1621
- "loss": 1.9324,
1622
- "step": 9400
1623
- },
1624
- {
1625
- "epoch": 0.7268671640643027,
1626
- "grad_norm": 4.840892314910889,
1627
- "learning_rate": 0.00015354647689790407,
1628
- "loss": 1.993,
1629
- "step": 9450
1630
- },
1631
- {
1632
- "epoch": 0.7307130220752249,
1633
- "grad_norm": 0.9328492879867554,
1634
- "learning_rate": 0.00015328675687608758,
1635
- "loss": 1.8637,
1636
- "step": 9500
1637
- },
1638
- {
1639
- "epoch": 0.7307130220752249,
1640
- "eval_loss": 1.8725571632385254,
1641
- "eval_runtime": 16.9918,
1642
- "eval_samples_per_second": 58.852,
1643
- "eval_steps_per_second": 14.713,
1644
- "step": 9500
1645
- },
1646
- {
1647
- "epoch": 0.7345588800861472,
1648
- "grad_norm": 1.6844549179077148,
1649
- "learning_rate": 0.00015302703685427112,
1650
- "loss": 1.8262,
1651
- "step": 9550
1652
- },
1653
- {
1654
- "epoch": 0.7384047380970694,
1655
- "grad_norm": 2.512157678604126,
1656
- "learning_rate": 0.0001527673168324546,
1657
- "loss": 1.9319,
1658
- "step": 9600
1659
- },
1660
- {
1661
- "epoch": 0.7422505961079917,
1662
- "grad_norm": 1.3238016366958618,
1663
- "learning_rate": 0.00015250759681063812,
1664
- "loss": 1.9001,
1665
- "step": 9650
1666
- },
1667
- {
1668
- "epoch": 0.7460964541189139,
1669
- "grad_norm": 1.83181631565094,
1670
- "learning_rate": 0.00015224787678882166,
1671
- "loss": 1.8324,
1672
- "step": 9700
1673
- },
1674
- {
1675
- "epoch": 0.7499423121298362,
1676
- "grad_norm": 1.6106966733932495,
1677
- "learning_rate": 0.00015198815676700518,
1678
- "loss": 1.9535,
1679
- "step": 9750
1680
- },
1681
- {
1682
- "epoch": 0.7499423121298362,
1683
- "eval_loss": 1.873831033706665,
1684
- "eval_runtime": 17.0343,
1685
- "eval_samples_per_second": 58.705,
1686
- "eval_steps_per_second": 14.676,
1687
- "step": 9750
1688
- },
1689
- {
1690
- "epoch": 0.7537881701407584,
1691
- "grad_norm": 2.3586697578430176,
1692
- "learning_rate": 0.0001517284367451887,
1693
- "loss": 1.858,
1694
- "step": 9800
1695
- },
1696
- {
1697
- "epoch": 0.7576340281516807,
1698
- "grad_norm": 0.7499716877937317,
1699
- "learning_rate": 0.00015146871672337223,
1700
- "loss": 1.866,
1701
- "step": 9850
1702
- },
1703
- {
1704
- "epoch": 0.7614798861626029,
1705
- "grad_norm": 1.3573709726333618,
1706
- "learning_rate": 0.00015120899670155574,
1707
- "loss": 1.8636,
1708
- "step": 9900
1709
- },
1710
- {
1711
- "epoch": 0.7653257441735252,
1712
- "grad_norm": 2.271859884262085,
1713
- "learning_rate": 0.00015094927667973923,
1714
- "loss": 1.8625,
1715
- "step": 9950
1716
- },
1717
- {
1718
- "epoch": 0.7691716021844474,
1719
- "grad_norm": 1.8813310861587524,
1720
- "learning_rate": 0.00015068955665792277,
1721
- "loss": 1.9318,
1722
- "step": 10000
1723
- },
1724
- {
1725
- "epoch": 0.7691716021844474,
1726
- "eval_loss": 1.8548085689544678,
1727
- "eval_runtime": 17.0223,
1728
- "eval_samples_per_second": 58.746,
1729
- "eval_steps_per_second": 14.687,
1730
- "step": 10000
1731
- },
1732
- {
1733
- "epoch": 0.7730174601953695,
1734
- "grad_norm": 1.4667410850524902,
1735
- "learning_rate": 0.00015042983663610628,
1736
- "loss": 1.9318,
1737
- "step": 10050
1738
- },
1739
- {
1740
- "epoch": 0.7768633182062918,
1741
- "grad_norm": 2.0754499435424805,
1742
- "learning_rate": 0.00015017011661428982,
1743
- "loss": 1.753,
1744
- "step": 10100
1745
- },
1746
- {
1747
- "epoch": 0.780709176217214,
1748
- "grad_norm": 1.634293794631958,
1749
- "learning_rate": 0.00014991039659247333,
1750
- "loss": 1.9328,
1751
- "step": 10150
1752
- },
1753
- {
1754
- "epoch": 0.7845550342281363,
1755
- "grad_norm": 1.5003916025161743,
1756
- "learning_rate": 0.00014965067657065684,
1757
- "loss": 1.8237,
1758
- "step": 10200
1759
- },
1760
- {
1761
- "epoch": 0.7884008922390585,
1762
- "grad_norm": 1.7433470487594604,
1763
- "learning_rate": 0.00014939095654884036,
1764
- "loss": 1.859,
1765
- "step": 10250
1766
- },
1767
- {
1768
- "epoch": 0.7884008922390585,
1769
- "eval_loss": 1.842873215675354,
1770
- "eval_runtime": 16.9242,
1771
- "eval_samples_per_second": 59.087,
1772
- "eval_steps_per_second": 14.772,
1773
- "step": 10250
1774
- },
1775
- {
1776
- "epoch": 0.7922467502499808,
1777
- "grad_norm": 1.8703033924102783,
1778
- "learning_rate": 0.00014913123652702387,
1779
- "loss": 1.9019,
1780
- "step": 10300
1781
- },
1782
- {
1783
- "epoch": 0.796092608260903,
1784
- "grad_norm": 1.1216288805007935,
1785
- "learning_rate": 0.00014887151650520738,
1786
- "loss": 1.9812,
1787
- "step": 10350
1788
- },
1789
- {
1790
- "epoch": 0.7999384662718253,
1791
- "grad_norm": 3.2229816913604736,
1792
- "learning_rate": 0.00014861179648339092,
1793
- "loss": 1.7986,
1794
- "step": 10400
1795
- },
1796
- {
1797
- "epoch": 0.8037843242827475,
1798
- "grad_norm": 2.366506814956665,
1799
- "learning_rate": 0.00014835207646157444,
1800
- "loss": 1.8995,
1801
- "step": 10450
1802
- },
1803
- {
1804
- "epoch": 0.8076301822936697,
1805
- "grad_norm": 2.0333054065704346,
1806
- "learning_rate": 0.00014809235643975795,
1807
- "loss": 1.916,
1808
- "step": 10500
1809
- },
1810
- {
1811
- "epoch": 0.8076301822936697,
1812
- "eval_loss": 1.8220328092575073,
1813
- "eval_runtime": 17.063,
1814
- "eval_samples_per_second": 58.606,
1815
- "eval_steps_per_second": 14.652,
1816
- "step": 10500
1817
- },
1818
- {
1819
- "epoch": 0.811476040304592,
1820
- "grad_norm": 1.8630917072296143,
1821
- "learning_rate": 0.00014783263641794146,
1822
- "loss": 1.9692,
1823
- "step": 10550
1824
- },
1825
- {
1826
- "epoch": 0.8153218983155142,
1827
- "grad_norm": 1.744325876235962,
1828
- "learning_rate": 0.00014757291639612497,
1829
- "loss": 1.8893,
1830
- "step": 10600
1831
- },
1832
- {
1833
- "epoch": 0.8191677563264365,
1834
- "grad_norm": 1.8451564311981201,
1835
- "learning_rate": 0.0001473131963743085,
1836
- "loss": 1.7862,
1837
- "step": 10650
1838
- },
1839
- {
1840
- "epoch": 0.8230136143373586,
1841
- "grad_norm": 1.9895585775375366,
1842
- "learning_rate": 0.00014705347635249203,
1843
- "loss": 1.9978,
1844
- "step": 10700
1845
- },
1846
- {
1847
- "epoch": 0.8268594723482809,
1848
- "grad_norm": 2.574155330657959,
1849
- "learning_rate": 0.00014679375633067554,
1850
- "loss": 1.8705,
1851
- "step": 10750
1852
- },
1853
- {
1854
- "epoch": 0.8268594723482809,
1855
- "eval_loss": 1.8234485387802124,
1856
- "eval_runtime": 17.061,
1857
- "eval_samples_per_second": 58.613,
1858
- "eval_steps_per_second": 14.653,
1859
- "step": 10750
1860
- },
1861
- {
1862
- "epoch": 0.8307053303592031,
1863
- "grad_norm": 1.347955584526062,
1864
- "learning_rate": 0.00014653403630885905,
1865
- "loss": 1.7948,
1866
- "step": 10800
1867
- },
1868
- {
1869
- "epoch": 0.8345511883701254,
1870
- "grad_norm": 2.8867287635803223,
1871
- "learning_rate": 0.00014627431628704257,
1872
- "loss": 1.8986,
1873
- "step": 10850
1874
- },
1875
- {
1876
- "epoch": 0.8383970463810476,
1877
- "grad_norm": 2.692473888397217,
1878
- "learning_rate": 0.00014601459626522608,
1879
- "loss": 1.9209,
1880
- "step": 10900
1881
- },
1882
- {
1883
- "epoch": 0.8422429043919698,
1884
- "grad_norm": 1.7835667133331299,
1885
- "learning_rate": 0.00014575487624340962,
1886
- "loss": 1.8406,
1887
- "step": 10950
1888
- },
1889
- {
1890
- "epoch": 0.8460887624028921,
1891
- "grad_norm": 1.8864330053329468,
1892
- "learning_rate": 0.00014549515622159313,
1893
- "loss": 1.8001,
1894
- "step": 11000
1895
- },
1896
- {
1897
- "epoch": 0.8460887624028921,
1898
- "eval_loss": 1.807216763496399,
1899
- "eval_runtime": 17.2188,
1900
- "eval_samples_per_second": 58.076,
1901
- "eval_steps_per_second": 14.519,
1902
- "step": 11000
1903
- },
1904
- {
1905
- "epoch": 0.8499346204138143,
1906
- "grad_norm": 2.47654128074646,
1907
- "learning_rate": 0.00014523543619977664,
1908
- "loss": 1.8003,
1909
- "step": 11050
1910
- },
1911
- {
1912
- "epoch": 0.8537804784247366,
1913
- "grad_norm": 1.6507407426834106,
1914
- "learning_rate": 0.00014497571617796018,
1915
- "loss": 1.8999,
1916
- "step": 11100
1917
- },
1918
- {
1919
- "epoch": 0.8576263364356588,
1920
- "grad_norm": 1.4758163690567017,
1921
- "learning_rate": 0.0001447159961561437,
1922
- "loss": 1.837,
1923
- "step": 11150
1924
- },
1925
- {
1926
- "epoch": 0.8614721944465811,
1927
- "grad_norm": 2.2486917972564697,
1928
- "learning_rate": 0.00014445627613432718,
1929
- "loss": 1.8501,
1930
- "step": 11200
1931
- },
1932
- {
1933
- "epoch": 0.8653180524575033,
1934
- "grad_norm": 1.682785987854004,
1935
- "learning_rate": 0.00014419655611251072,
1936
- "loss": 1.8486,
1937
- "step": 11250
1938
- },
1939
- {
1940
- "epoch": 0.8653180524575033,
1941
- "eval_loss": 1.8015695810317993,
1942
- "eval_runtime": 16.9649,
1943
- "eval_samples_per_second": 58.945,
1944
- "eval_steps_per_second": 14.736,
1945
- "step": 11250
1946
- },
1947
- {
1948
- "epoch": 0.8691639104684256,
1949
- "grad_norm": 1.687892198562622,
1950
- "learning_rate": 0.00014393683609069424,
1951
- "loss": 1.8196,
1952
- "step": 11300
1953
- },
1954
- {
1955
- "epoch": 0.8730097684793477,
1956
- "grad_norm": 1.6149276494979858,
1957
- "learning_rate": 0.00014367711606887775,
1958
- "loss": 1.829,
1959
- "step": 11350
1960
- },
1961
- {
1962
- "epoch": 0.8768556264902699,
1963
- "grad_norm": 1.2599520683288574,
1964
- "learning_rate": 0.0001434173960470613,
1965
- "loss": 1.8398,
1966
- "step": 11400
1967
- },
1968
- {
1969
- "epoch": 0.8807014845011922,
1970
- "grad_norm": 2.5826971530914307,
1971
- "learning_rate": 0.0001431576760252448,
1972
- "loss": 1.8362,
1973
- "step": 11450
1974
- },
1975
- {
1976
- "epoch": 0.8845473425121144,
1977
- "grad_norm": 1.9814509153366089,
1978
- "learning_rate": 0.00014289795600342831,
1979
- "loss": 1.7922,
1980
- "step": 11500
1981
- },
1982
- {
1983
- "epoch": 0.8845473425121144,
1984
- "eval_loss": 1.792478322982788,
1985
- "eval_runtime": 16.8513,
1986
- "eval_samples_per_second": 59.343,
1987
- "eval_steps_per_second": 14.836,
1988
- "step": 11500
1989
  }
1990
  ],
1991
  "logging_steps": 50,
 
1
  {
2
+ "best_global_step": 500,
3
+ "best_metric": 3.3119897842407227,
4
+ "best_model_checkpoint": "./ar-diffusion-checkpoints-progressive-attention/checkpoint-500",
5
+ "epoch": 0.038458580109222366,
6
  "eval_steps": 250,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.003845858010922237,
14
+ "grad_norm": 13.142335891723633,
15
  "learning_rate": 1.84e-05,
16
+ "loss": 12.0117,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.007691716021844474,
21
+ "grad_norm": 5.153238296508789,
22
  "learning_rate": 3.8e-05,
23
+ "loss": 5.8408,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.01153757403276671,
28
+ "grad_norm": 7.471889019012451,
29
  "learning_rate": 5.8e-05,
30
+ "loss": 4.5173,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.015383432043688947,
35
+ "grad_norm": 4.756172180175781,
36
  "learning_rate": 7.800000000000001e-05,
37
+ "loss": 3.8946,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.019229290054611183,
42
+ "grad_norm": 4.0329108238220215,
43
  "learning_rate": 9.8e-05,
44
+ "loss": 3.842,
45
  "step": 250
46
  },
47
  {
48
  "epoch": 0.019229290054611183,
49
+ "eval_loss": 3.7754223346710205,
50
+ "eval_runtime": 18.0149,
51
+ "eval_samples_per_second": 55.51,
52
+ "eval_steps_per_second": 13.877,
53
  "step": 250
54
  },
55
  {
56
  "epoch": 0.02307514806553342,
57
+ "grad_norm": 7.410353660583496,
58
  "learning_rate": 0.000118,
59
+ "loss": 3.5501,
60
  "step": 300
61
  },
62
  {
63
  "epoch": 0.02692100607645566,
64
+ "grad_norm": 4.085294246673584,
65
  "learning_rate": 0.000138,
66
+ "loss": 3.4015,
67
  "step": 350
68
  },
69
  {
70
  "epoch": 0.030766864087377895,
71
+ "grad_norm": 5.78131103515625,
72
  "learning_rate": 0.00015800000000000002,
73
+ "loss": 3.4988,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 0.03461272209830013,
78
+ "grad_norm": 3.2009644508361816,
79
  "learning_rate": 0.00017800000000000002,
80
+ "loss": 3.4145,
81
  "step": 450
82
  },
83
  {
84
  "epoch": 0.038458580109222366,
85
+ "grad_norm": 4.380056858062744,
86
  "learning_rate": 0.00019800000000000002,
87
+ "loss": 3.3179,
88
  "step": 500
89
  },
90
  {
91
  "epoch": 0.038458580109222366,
92
+ "eval_loss": 3.3119897842407227,
93
+ "eval_runtime": 17.9539,
94
+ "eval_samples_per_second": 55.698,
95
+ "eval_steps_per_second": 13.925,
96
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  }
98
  ],
99
  "logging_steps": 50,
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:367750933f78aafb430c2f507a07ed55588492c9b7b6203a463a9c07c3d92fb6
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e44af272c49f1414968320d715761884d929c74fc3d44815afabe4a9422f97fb
3
  size 5432