irishprancer commited on
Commit
d96f111
·
verified ·
1 Parent(s): 1b42cd1

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24b0844f48802856ec7fd062b5709ea322f8eeb94234eec4d37c4a7382b9a77d
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76199f37550c237193e7b172a4297285627e555188041cbdb5f430cc437dd10c
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82d2ef26f0dbfc4a6f2ca1147a92ec246182a3349e536111f87633feba51cb62
3
  size 1054135994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c69a1783019f52033c5a249284550e08bebe3ffdf70a26f751e68f27100aba90
3
  size 1054135994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8f11fbb1dc348e5245b6e67b90b53d052aa55ce6bbd45d7369c3c11528ee140
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5feb56512e955691dc9bb9a1e37b9dd590e06a961d7d94560b679e2730b03194
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a8b759658b308282d06846d4dfda31388c652c687853c092da47be547d0736c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cf3f988e8fed2daa2e801eb1f19b681872781cf57f0fb7b896e859a12cfe2bb
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1752 +1,157 @@
1
  {
2
- "best_metric": 0.7166430950164795,
3
- "best_model_checkpoint": "./output/checkpoint-450",
4
- "epoch": 78.26086956521739,
5
  "eval_steps": 150,
6
- "global_step": 1800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.43478260869565216,
13
- "grad_norm": 1.5021616220474243,
14
  "learning_rate": 3e-06,
15
- "loss": 0.9065,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.8695652173913043,
20
- "grad_norm": 1.6870683431625366,
21
  "learning_rate": 6e-06,
22
- "loss": 0.9027,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 1.3043478260869565,
27
- "grad_norm": 1.7296483516693115,
28
  "learning_rate": 9e-06,
29
- "loss": 0.9001,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 1.7391304347826086,
34
- "grad_norm": 1.4536631107330322,
35
  "learning_rate": 1.2e-05,
36
- "loss": 0.9092,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 2.1739130434782608,
41
- "grad_norm": 1.3518139123916626,
42
  "learning_rate": 1.5e-05,
43
- "loss": 0.8358,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 2.608695652173913,
48
- "grad_norm": 2.031013250350952,
49
  "learning_rate": 1.8e-05,
50
- "loss": 0.8898,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 3.0434782608695654,
55
- "grad_norm": 1.4844363927841187,
56
  "learning_rate": 2.1e-05,
57
- "loss": 0.8913,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 3.4782608695652173,
62
- "grad_norm": 1.7294501066207886,
63
  "learning_rate": 2.4e-05,
64
- "loss": 0.8231,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 3.9130434782608696,
69
- "grad_norm": 1.423990249633789,
70
  "learning_rate": 2.7000000000000002e-05,
71
- "loss": 0.8527,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 4.3478260869565215,
76
- "grad_norm": 1.3655840158462524,
77
  "learning_rate": 3e-05,
78
- "loss": 0.8647,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 4.782608695652174,
83
- "grad_norm": 2.1975016593933105,
84
  "learning_rate": 2.999999702723963e-05,
85
- "loss": 0.8229,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 5.217391304347826,
90
- "grad_norm": 1.0727310180664062,
91
  "learning_rate": 2.9999988108959687e-05,
92
- "loss": 0.7648,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 5.6521739130434785,
97
- "grad_norm": 1.5595622062683105,
98
  "learning_rate": 2.9999973245163716e-05,
99
- "loss": 0.741,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 6.086956521739131,
104
- "grad_norm": 1.9067057371139526,
105
  "learning_rate": 2.99999524358576e-05,
106
- "loss": 0.7655,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 6.521739130434782,
111
- "grad_norm": 1.1219594478607178,
112
  "learning_rate": 2.9999925681049593e-05,
113
- "loss": 0.7858,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 6.521739130434782,
118
- "eval_loss": 0.7961059212684631,
119
- "eval_runtime": 0.4905,
120
- "eval_samples_per_second": 20.389,
121
- "eval_steps_per_second": 20.389,
122
  "step": 150
123
  },
124
  {
125
- "Start_State_loss": 0.8601926565170288,
126
- "Start_State_runtime": 0.4411,
127
- "Start_State_samples_per_second": 22.672,
128
- "Start_State_steps_per_second": 22.672,
129
  "epoch": 6.521739130434782,
130
  "step": 150
131
  },
132
  {
133
- "Raw_Model_loss": 0.7961059212684631,
134
- "Raw_Model_runtime": 0.4281,
135
- "Raw_Model_samples_per_second": 23.358,
136
- "Raw_Model_steps_per_second": 23.358,
137
  "epoch": 6.521739130434782,
138
  "step": 150
139
  },
140
  {
141
- "SWA_loss": 0.8601926565170288,
142
- "SWA_runtime": 0.4847,
143
- "SWA_samples_per_second": 20.629,
144
- "SWA_steps_per_second": 20.629,
145
  "epoch": 6.521739130434782,
146
  "step": 150
147
  },
148
  {
149
- "EMA_loss": 0.8603938221931458,
150
- "EMA_runtime": 0.454,
151
- "EMA_samples_per_second": 22.026,
152
- "EMA_steps_per_second": 22.026,
153
  "epoch": 6.521739130434782,
154
  "step": 150
155
- },
156
- {
157
- "epoch": 6.956521739130435,
158
- "grad_norm": 1.540475845336914,
159
- "learning_rate": 2.9999892980750297e-05,
160
- "loss": 0.6588,
161
- "step": 160
162
- },
163
- {
164
- "epoch": 7.391304347826087,
165
- "grad_norm": 1.3461687564849854,
166
- "learning_rate": 2.9999854334972675e-05,
167
- "loss": 0.7388,
168
- "step": 170
169
- },
170
- {
171
- "epoch": 7.826086956521739,
172
- "grad_norm": 1.7260714769363403,
173
- "learning_rate": 2.999980974373204e-05,
174
- "loss": 0.7293,
175
- "step": 180
176
- },
177
- {
178
- "epoch": 8.26086956521739,
179
- "grad_norm": 1.5399173498153687,
180
- "learning_rate": 2.9999759207046075e-05,
181
- "loss": 0.6241,
182
- "step": 190
183
- },
184
- {
185
- "epoch": 8.695652173913043,
186
- "grad_norm": 1.742655634880066,
187
- "learning_rate": 2.9999702724934804e-05,
188
- "loss": 0.6766,
189
- "step": 200
190
- },
191
- {
192
- "epoch": 9.130434782608695,
193
- "grad_norm": 1.0411487817764282,
194
- "learning_rate": 2.999964029742062e-05,
195
- "loss": 0.6524,
196
- "step": 210
197
- },
198
- {
199
- "epoch": 9.565217391304348,
200
- "grad_norm": 1.2126989364624023,
201
- "learning_rate": 2.9999571924528263e-05,
202
- "loss": 0.559,
203
- "step": 220
204
- },
205
- {
206
- "epoch": 10.0,
207
- "grad_norm": 1.5287680625915527,
208
- "learning_rate": 2.9999497606284837e-05,
209
- "loss": 0.7561,
210
- "step": 230
211
- },
212
- {
213
- "epoch": 10.434782608695652,
214
- "grad_norm": 1.4217031002044678,
215
- "learning_rate": 2.9999417342719796e-05,
216
- "loss": 0.712,
217
- "step": 240
218
- },
219
- {
220
- "epoch": 10.869565217391305,
221
- "grad_norm": 0.9795515537261963,
222
- "learning_rate": 2.9999331133864956e-05,
223
- "loss": 0.5899,
224
- "step": 250
225
- },
226
- {
227
- "epoch": 11.304347826086957,
228
- "grad_norm": 1.1940442323684692,
229
- "learning_rate": 2.9999238979754485e-05,
230
- "loss": 0.6546,
231
- "step": 260
232
- },
233
- {
234
- "epoch": 11.73913043478261,
235
- "grad_norm": 1.0492786169052124,
236
- "learning_rate": 2.999914088042492e-05,
237
- "loss": 0.6477,
238
- "step": 270
239
- },
240
- {
241
- "epoch": 12.173913043478262,
242
- "grad_norm": 1.31123685836792,
243
- "learning_rate": 2.9999036835915132e-05,
244
- "loss": 0.5938,
245
- "step": 280
246
- },
247
- {
248
- "epoch": 12.608695652173914,
249
- "grad_norm": 1.084313154220581,
250
- "learning_rate": 2.9998926846266365e-05,
251
- "loss": 0.6327,
252
- "step": 290
253
- },
254
- {
255
- "epoch": 13.043478260869565,
256
- "grad_norm": 1.387447476387024,
257
- "learning_rate": 2.9998810911522213e-05,
258
- "loss": 0.5806,
259
- "step": 300
260
- },
261
- {
262
- "epoch": 13.043478260869565,
263
- "eval_loss": 0.7309158444404602,
264
- "eval_runtime": 0.4907,
265
- "eval_samples_per_second": 20.379,
266
- "eval_steps_per_second": 20.379,
267
- "step": 300
268
- },
269
- {
270
- "Start_State_loss": 0.8601926565170288,
271
- "Start_State_runtime": 0.4,
272
- "Start_State_samples_per_second": 25.001,
273
- "Start_State_steps_per_second": 25.001,
274
- "epoch": 13.043478260869565,
275
- "step": 300
276
- },
277
- {
278
- "Raw_Model_loss": 0.7309158444404602,
279
- "Raw_Model_runtime": 0.4036,
280
- "Raw_Model_samples_per_second": 24.778,
281
- "Raw_Model_steps_per_second": 24.778,
282
- "epoch": 13.043478260869565,
283
- "step": 300
284
- },
285
- {
286
- "SWA_loss": 0.775052011013031,
287
- "SWA_runtime": 0.3988,
288
- "SWA_samples_per_second": 25.072,
289
- "SWA_steps_per_second": 25.072,
290
- "epoch": 13.043478260869565,
291
- "step": 300
292
- },
293
- {
294
- "EMA_loss": 0.8610683679580688,
295
- "EMA_runtime": 0.3974,
296
- "EMA_samples_per_second": 25.163,
297
- "EMA_steps_per_second": 25.163,
298
- "epoch": 13.043478260869565,
299
- "step": 300
300
- },
301
- {
302
- "epoch": 13.478260869565217,
303
- "grad_norm": 1.78074312210083,
304
- "learning_rate": 2.9998689031728636e-05,
305
- "loss": 0.5146,
306
- "step": 310
307
- },
308
- {
309
- "epoch": 13.91304347826087,
310
- "grad_norm": 1.5320613384246826,
311
- "learning_rate": 2.9998561206933938e-05,
312
- "loss": 0.6499,
313
- "step": 320
314
- },
315
- {
316
- "epoch": 14.347826086956522,
317
- "grad_norm": 1.4800745248794556,
318
- "learning_rate": 2.9998427437188786e-05,
319
- "loss": 0.5743,
320
- "step": 330
321
- },
322
- {
323
- "epoch": 14.782608695652174,
324
- "grad_norm": 1.3194001913070679,
325
- "learning_rate": 2.99982877225462e-05,
326
- "loss": 0.6013,
327
- "step": 340
328
- },
329
- {
330
- "epoch": 15.217391304347826,
331
- "grad_norm": 0.9749765992164612,
332
- "learning_rate": 2.9998142063061564e-05,
333
- "loss": 0.4989,
334
- "step": 350
335
- },
336
- {
337
- "epoch": 15.652173913043478,
338
- "grad_norm": 1.6396143436431885,
339
- "learning_rate": 2.9997990458792603e-05,
340
- "loss": 0.5624,
341
- "step": 360
342
- },
343
- {
344
- "epoch": 16.08695652173913,
345
- "grad_norm": 1.6343810558319092,
346
- "learning_rate": 2.9997832909799417e-05,
347
- "loss": 0.6672,
348
- "step": 370
349
- },
350
- {
351
- "epoch": 16.52173913043478,
352
- "grad_norm": 0.9520325660705566,
353
- "learning_rate": 2.9997669416144452e-05,
354
- "loss": 0.5129,
355
- "step": 380
356
- },
357
- {
358
- "epoch": 16.956521739130434,
359
- "grad_norm": 0.9396904706954956,
360
- "learning_rate": 2.999749997789251e-05,
361
- "loss": 0.5795,
362
- "step": 390
363
- },
364
- {
365
- "epoch": 17.391304347826086,
366
- "grad_norm": 1.112286925315857,
367
- "learning_rate": 2.9997324595110743e-05,
368
- "loss": 0.5178,
369
- "step": 400
370
- },
371
- {
372
- "epoch": 17.82608695652174,
373
- "grad_norm": 1.2840179204940796,
374
- "learning_rate": 2.9997143267868683e-05,
375
- "loss": 0.588,
376
- "step": 410
377
- },
378
- {
379
- "epoch": 18.26086956521739,
380
- "grad_norm": 1.1637977361679077,
381
- "learning_rate": 2.9996955996238192e-05,
382
- "loss": 0.5057,
383
- "step": 420
384
- },
385
- {
386
- "epoch": 18.695652173913043,
387
- "grad_norm": 1.201230525970459,
388
- "learning_rate": 2.9996762780293503e-05,
389
- "loss": 0.5316,
390
- "step": 430
391
- },
392
- {
393
- "epoch": 19.130434782608695,
394
- "grad_norm": 1.2127166986465454,
395
- "learning_rate": 2.9996563620111197e-05,
396
- "loss": 0.5336,
397
- "step": 440
398
- },
399
- {
400
- "epoch": 19.565217391304348,
401
- "grad_norm": 1.4246900081634521,
402
- "learning_rate": 2.9996358515770218e-05,
403
- "loss": 0.5677,
404
- "step": 450
405
- },
406
- {
407
- "epoch": 19.565217391304348,
408
- "eval_loss": 0.7166430950164795,
409
- "eval_runtime": 0.4285,
410
- "eval_samples_per_second": 23.338,
411
- "eval_steps_per_second": 23.338,
412
- "step": 450
413
- },
414
- {
415
- "Start_State_loss": 0.8601926565170288,
416
- "Start_State_runtime": 0.4149,
417
- "Start_State_samples_per_second": 24.102,
418
- "Start_State_steps_per_second": 24.102,
419
- "epoch": 19.565217391304348,
420
- "step": 450
421
- },
422
- {
423
- "Raw_Model_loss": 0.7166430950164795,
424
- "Raw_Model_runtime": 0.4174,
425
- "Raw_Model_samples_per_second": 23.957,
426
- "Raw_Model_steps_per_second": 23.957,
427
- "epoch": 19.565217391304348,
428
- "step": 450
429
- },
430
- {
431
- "SWA_loss": 0.7559167146682739,
432
- "SWA_runtime": 0.4126,
433
- "SWA_samples_per_second": 24.237,
434
- "SWA_steps_per_second": 24.237,
435
- "epoch": 19.565217391304348,
436
- "step": 450
437
- },
438
- {
439
- "EMA_loss": 0.8612796664237976,
440
- "EMA_runtime": 0.4269,
441
- "EMA_samples_per_second": 23.424,
442
- "EMA_steps_per_second": 23.424,
443
- "epoch": 19.565217391304348,
444
- "step": 450
445
- },
446
- {
447
- "epoch": 20.0,
448
- "grad_norm": 2.1295206546783447,
449
- "learning_rate": 2.9996147467351856e-05,
450
- "loss": 0.5146,
451
- "step": 460
452
- },
453
- {
454
- "epoch": 20.434782608695652,
455
- "grad_norm": 1.2770106792449951,
456
- "learning_rate": 2.9995930474939773e-05,
457
- "loss": 0.4784,
458
- "step": 470
459
- },
460
- {
461
- "epoch": 20.869565217391305,
462
- "grad_norm": 1.4706833362579346,
463
- "learning_rate": 2.9995707538619975e-05,
464
- "loss": 0.5705,
465
- "step": 480
466
- },
467
- {
468
- "epoch": 21.304347826086957,
469
- "grad_norm": 1.3234336376190186,
470
- "learning_rate": 2.9995478658480822e-05,
471
- "loss": 0.5167,
472
- "step": 490
473
- },
474
- {
475
- "epoch": 21.73913043478261,
476
- "grad_norm": 1.242873191833496,
477
- "learning_rate": 2.9995243834613043e-05,
478
- "loss": 0.5207,
479
- "step": 500
480
- },
481
- {
482
- "epoch": 22.17391304347826,
483
- "grad_norm": 1.77828049659729,
484
- "learning_rate": 2.9995003067109707e-05,
485
- "loss": 0.4832,
486
- "step": 510
487
- },
488
- {
489
- "epoch": 22.608695652173914,
490
- "grad_norm": 1.5282888412475586,
491
- "learning_rate": 2.9994756356066246e-05,
492
- "loss": 0.5615,
493
- "step": 520
494
- },
495
- {
496
- "epoch": 23.043478260869566,
497
- "grad_norm": 1.7345402240753174,
498
- "learning_rate": 2.999450370158046e-05,
499
- "loss": 0.4929,
500
- "step": 530
501
- },
502
- {
503
- "epoch": 23.47826086956522,
504
- "grad_norm": 1.3091520071029663,
505
- "learning_rate": 2.9994245103752478e-05,
506
- "loss": 0.4383,
507
- "step": 540
508
- },
509
- {
510
- "epoch": 23.91304347826087,
511
- "grad_norm": 1.2344285249710083,
512
- "learning_rate": 2.999398056268481e-05,
513
- "loss": 0.5264,
514
- "step": 550
515
- },
516
- {
517
- "epoch": 24.347826086956523,
518
- "grad_norm": 1.409712791442871,
519
- "learning_rate": 2.9993710078482306e-05,
520
- "loss": 0.5206,
521
- "step": 560
522
- },
523
- {
524
- "epoch": 24.782608695652176,
525
- "grad_norm": 0.9513388872146606,
526
- "learning_rate": 2.9993433651252185e-05,
527
- "loss": 0.443,
528
- "step": 570
529
- },
530
- {
531
- "epoch": 25.217391304347824,
532
- "grad_norm": 1.7172473669052124,
533
- "learning_rate": 2.9993151281104006e-05,
534
- "loss": 0.5329,
535
- "step": 580
536
- },
537
- {
538
- "epoch": 25.652173913043477,
539
- "grad_norm": 1.1298749446868896,
540
- "learning_rate": 2.9992862968149695e-05,
541
- "loss": 0.4737,
542
- "step": 590
543
- },
544
- {
545
- "epoch": 26.08695652173913,
546
- "grad_norm": 1.1705174446105957,
547
- "learning_rate": 2.9992568712503533e-05,
548
- "loss": 0.4611,
549
- "step": 600
550
- },
551
- {
552
- "epoch": 26.08695652173913,
553
- "eval_loss": 0.7199234366416931,
554
- "eval_runtime": 0.4236,
555
- "eval_samples_per_second": 23.605,
556
- "eval_steps_per_second": 23.605,
557
- "step": 600
558
- },
559
- {
560
- "Start_State_loss": 0.8601926565170288,
561
- "Start_State_runtime": 0.4141,
562
- "Start_State_samples_per_second": 24.146,
563
- "Start_State_steps_per_second": 24.146,
564
- "epoch": 26.08695652173913,
565
- "step": 600
566
- },
567
- {
568
- "Raw_Model_loss": 0.7199234366416931,
569
- "Raw_Model_runtime": 0.4101,
570
- "Raw_Model_samples_per_second": 24.387,
571
- "Raw_Model_steps_per_second": 24.387,
572
- "epoch": 26.08695652173913,
573
- "step": 600
574
- },
575
- {
576
- "SWA_loss": 0.7341524362564087,
577
- "SWA_runtime": 0.4412,
578
- "SWA_samples_per_second": 22.668,
579
- "SWA_steps_per_second": 22.668,
580
- "epoch": 26.08695652173913,
581
- "step": 600
582
- },
583
- {
584
- "EMA_loss": 0.8609703183174133,
585
- "EMA_runtime": 0.424,
586
- "EMA_samples_per_second": 23.585,
587
- "EMA_steps_per_second": 23.585,
588
- "epoch": 26.08695652173913,
589
- "step": 600
590
- },
591
- {
592
- "epoch": 26.52173913043478,
593
- "grad_norm": 1.0925955772399902,
594
- "learning_rate": 2.9992268514282142e-05,
595
- "loss": 0.5116,
596
- "step": 610
597
- },
598
- {
599
- "epoch": 26.956521739130434,
600
- "grad_norm": 1.343130111694336,
601
- "learning_rate": 2.999196237360452e-05,
602
- "loss": 0.4316,
603
- "step": 620
604
- },
605
- {
606
- "epoch": 27.391304347826086,
607
- "grad_norm": 1.203368902206421,
608
- "learning_rate": 2.9991650290592016e-05,
609
- "loss": 0.4756,
610
- "step": 630
611
- },
612
- {
613
- "epoch": 27.82608695652174,
614
- "grad_norm": 1.3935104608535767,
615
- "learning_rate": 2.999133226536832e-05,
616
- "loss": 0.501,
617
- "step": 640
618
- },
619
- {
620
- "epoch": 28.26086956521739,
621
- "grad_norm": 1.412856936454773,
622
- "learning_rate": 2.9991008298059493e-05,
623
- "loss": 0.4107,
624
- "step": 650
625
- },
626
- {
627
- "epoch": 28.695652173913043,
628
- "grad_norm": 1.5606491565704346,
629
- "learning_rate": 2.9990678388793944e-05,
630
- "loss": 0.5064,
631
- "step": 660
632
- },
633
- {
634
- "epoch": 29.130434782608695,
635
- "grad_norm": 1.315181016921997,
636
- "learning_rate": 2.999034253770244e-05,
637
- "loss": 0.4347,
638
- "step": 670
639
- },
640
- {
641
- "epoch": 29.565217391304348,
642
- "grad_norm": 1.077691674232483,
643
- "learning_rate": 2.9990000744918097e-05,
644
- "loss": 0.4705,
645
- "step": 680
646
- },
647
- {
648
- "epoch": 30.0,
649
- "grad_norm": 2.501568078994751,
650
- "learning_rate": 2.9989653010576392e-05,
651
- "loss": 0.4145,
652
- "step": 690
653
- },
654
- {
655
- "epoch": 30.434782608695652,
656
- "grad_norm": 1.3340367078781128,
657
- "learning_rate": 2.9989299334815158e-05,
658
- "loss": 0.4764,
659
- "step": 700
660
- },
661
- {
662
- "epoch": 30.869565217391305,
663
- "grad_norm": 1.6282958984375,
664
- "learning_rate": 2.9988939717774578e-05,
665
- "loss": 0.4118,
666
- "step": 710
667
- },
668
- {
669
- "epoch": 31.304347826086957,
670
- "grad_norm": 0.9019575119018555,
671
- "learning_rate": 2.9988574159597194e-05,
672
- "loss": 0.4244,
673
- "step": 720
674
- },
675
- {
676
- "epoch": 31.73913043478261,
677
- "grad_norm": 1.6408599615097046,
678
- "learning_rate": 2.9988202660427907e-05,
679
- "loss": 0.4821,
680
- "step": 730
681
- },
682
- {
683
- "epoch": 32.17391304347826,
684
- "grad_norm": 1.1973698139190674,
685
- "learning_rate": 2.9987825220413958e-05,
686
- "loss": 0.4385,
687
- "step": 740
688
- },
689
- {
690
- "epoch": 32.608695652173914,
691
- "grad_norm": 1.7692193984985352,
692
- "learning_rate": 2.998744183970496e-05,
693
- "loss": 0.4738,
694
- "step": 750
695
- },
696
- {
697
- "epoch": 32.608695652173914,
698
- "eval_loss": 0.731913149356842,
699
- "eval_runtime": 0.5751,
700
- "eval_samples_per_second": 17.389,
701
- "eval_steps_per_second": 17.389,
702
- "step": 750
703
- },
704
- {
705
- "Start_State_loss": 0.8601926565170288,
706
- "Start_State_runtime": 0.4166,
707
- "Start_State_samples_per_second": 24.004,
708
- "Start_State_steps_per_second": 24.004,
709
- "epoch": 32.608695652173914,
710
- "step": 750
711
- },
712
- {
713
- "Raw_Model_loss": 0.731913149356842,
714
- "Raw_Model_runtime": 0.402,
715
- "Raw_Model_samples_per_second": 24.875,
716
- "Raw_Model_steps_per_second": 24.875,
717
- "epoch": 32.608695652173914,
718
- "step": 750
719
- },
720
- {
721
- "SWA_loss": 0.7290045022964478,
722
- "SWA_runtime": 0.4054,
723
- "SWA_samples_per_second": 24.666,
724
- "SWA_steps_per_second": 24.666,
725
- "epoch": 32.608695652173914,
726
- "step": 750
727
- },
728
- {
729
- "EMA_loss": 0.8603373765945435,
730
- "EMA_runtime": 0.4026,
731
- "EMA_samples_per_second": 24.839,
732
- "EMA_steps_per_second": 24.839,
733
- "epoch": 32.608695652173914,
734
- "step": 750
735
- },
736
- {
737
- "epoch": 33.04347826086956,
738
- "grad_norm": 1.4949195384979248,
739
- "learning_rate": 2.998705251845287e-05,
740
- "loss": 0.43,
741
- "step": 760
742
- },
743
- {
744
- "epoch": 33.47826086956522,
745
- "grad_norm": 1.6518038511276245,
746
- "learning_rate": 2.9986657256812e-05,
747
- "loss": 0.4301,
748
- "step": 770
749
- },
750
- {
751
- "epoch": 33.91304347826087,
752
- "grad_norm": 1.2894669771194458,
753
- "learning_rate": 2.9986256054939022e-05,
754
- "loss": 0.408,
755
- "step": 780
756
- },
757
- {
758
- "epoch": 34.34782608695652,
759
- "grad_norm": 1.4762516021728516,
760
- "learning_rate": 2.9985848912992956e-05,
761
- "loss": 0.4029,
762
- "step": 790
763
- },
764
- {
765
- "epoch": 34.78260869565217,
766
- "grad_norm": 1.5660409927368164,
767
- "learning_rate": 2.9985435831135184e-05,
768
- "loss": 0.3832,
769
- "step": 800
770
- },
771
- {
772
- "epoch": 35.21739130434783,
773
- "grad_norm": 1.3075863122940063,
774
- "learning_rate": 2.9985016809529437e-05,
775
- "loss": 0.4744,
776
- "step": 810
777
- },
778
- {
779
- "epoch": 35.65217391304348,
780
- "grad_norm": 1.3411126136779785,
781
- "learning_rate": 2.9984591848341806e-05,
782
- "loss": 0.403,
783
- "step": 820
784
- },
785
- {
786
- "epoch": 36.08695652173913,
787
- "grad_norm": 1.0828583240509033,
788
- "learning_rate": 2.9984160947740723e-05,
789
- "loss": 0.4181,
790
- "step": 830
791
- },
792
- {
793
- "epoch": 36.52173913043478,
794
- "grad_norm": 1.1622037887573242,
795
- "learning_rate": 2.9983724107896993e-05,
796
- "loss": 0.3806,
797
- "step": 840
798
- },
799
- {
800
- "epoch": 36.95652173913044,
801
- "grad_norm": 1.4791110754013062,
802
- "learning_rate": 2.9983281328983757e-05,
803
- "loss": 0.4499,
804
- "step": 850
805
- },
806
- {
807
- "epoch": 37.391304347826086,
808
- "grad_norm": 1.8963046073913574,
809
- "learning_rate": 2.9982832611176523e-05,
810
- "loss": 0.4181,
811
- "step": 860
812
- },
813
- {
814
- "epoch": 37.82608695652174,
815
- "grad_norm": 1.270815372467041,
816
- "learning_rate": 2.998237795465315e-05,
817
- "loss": 0.3714,
818
- "step": 870
819
- },
820
- {
821
- "epoch": 38.26086956521739,
822
- "grad_norm": 1.264829397201538,
823
- "learning_rate": 2.9981917359593843e-05,
824
- "loss": 0.4013,
825
- "step": 880
826
- },
827
- {
828
- "epoch": 38.69565217391305,
829
- "grad_norm": 1.4431074857711792,
830
- "learning_rate": 2.9981450826181172e-05,
831
- "loss": 0.3552,
832
- "step": 890
833
- },
834
- {
835
- "epoch": 39.130434782608695,
836
- "grad_norm": 1.9556941986083984,
837
- "learning_rate": 2.9980978354600057e-05,
838
- "loss": 0.463,
839
- "step": 900
840
- },
841
- {
842
- "epoch": 39.130434782608695,
843
- "eval_loss": 0.7511647343635559,
844
- "eval_runtime": 0.4145,
845
- "eval_samples_per_second": 24.126,
846
- "eval_steps_per_second": 24.126,
847
- "step": 900
848
- },
849
- {
850
- "Start_State_loss": 0.8601926565170288,
851
- "Start_State_runtime": 0.4115,
852
- "Start_State_samples_per_second": 24.303,
853
- "Start_State_steps_per_second": 24.303,
854
- "epoch": 39.130434782608695,
855
- "step": 900
856
- },
857
- {
858
- "Raw_Model_loss": 0.7511647343635559,
859
- "Raw_Model_runtime": 0.399,
860
- "Raw_Model_samples_per_second": 25.063,
861
- "Raw_Model_steps_per_second": 25.063,
862
- "epoch": 39.130434782608695,
863
- "step": 900
864
- },
865
- {
866
- "SWA_loss": 0.7235903739929199,
867
- "SWA_runtime": 0.3941,
868
- "SWA_samples_per_second": 25.377,
869
- "SWA_steps_per_second": 25.377,
870
- "epoch": 39.130434782608695,
871
- "step": 900
872
- },
873
- {
874
- "EMA_loss": 0.8609917759895325,
875
- "EMA_runtime": 0.3995,
876
- "EMA_samples_per_second": 25.033,
877
- "EMA_steps_per_second": 25.033,
878
- "epoch": 39.130434782608695,
879
- "step": 900
880
- },
881
- {
882
- "epoch": 39.56521739130435,
883
- "grad_norm": 1.542538046836853,
884
- "learning_rate": 2.9980499945037765e-05,
885
- "loss": 0.3835,
886
- "step": 910
887
- },
888
- {
889
- "epoch": 40.0,
890
- "grad_norm": 3.0124218463897705,
891
- "learning_rate": 2.998001559768393e-05,
892
- "loss": 0.3867,
893
- "step": 920
894
- },
895
- {
896
- "epoch": 40.43478260869565,
897
- "grad_norm": 1.5339196920394897,
898
- "learning_rate": 2.9979525312730525e-05,
899
- "loss": 0.4492,
900
- "step": 930
901
- },
902
- {
903
- "epoch": 40.869565217391305,
904
- "grad_norm": 1.6727086305618286,
905
- "learning_rate": 2.9979029090371885e-05,
906
- "loss": 0.3412,
907
- "step": 940
908
- },
909
- {
910
- "epoch": 41.30434782608695,
911
- "grad_norm": 2.2182319164276123,
912
- "learning_rate": 2.99785269308047e-05,
913
- "loss": 0.3413,
914
- "step": 950
915
- },
916
- {
917
- "epoch": 41.73913043478261,
918
- "grad_norm": 1.5122953653335571,
919
- "learning_rate": 2.9978018834228007e-05,
920
- "loss": 0.365,
921
- "step": 960
922
- },
923
- {
924
- "epoch": 42.17391304347826,
925
- "grad_norm": 1.5070980787277222,
926
- "learning_rate": 2.9977504800843197e-05,
927
- "loss": 0.4346,
928
- "step": 970
929
- },
930
- {
931
- "epoch": 42.608695652173914,
932
- "grad_norm": 1.5313963890075684,
933
- "learning_rate": 2.9976984830854022e-05,
934
- "loss": 0.3752,
935
- "step": 980
936
- },
937
- {
938
- "epoch": 43.04347826086956,
939
- "grad_norm": 1.653640866279602,
940
- "learning_rate": 2.997645892446658e-05,
941
- "loss": 0.367,
942
- "step": 990
943
- },
944
- {
945
- "epoch": 43.47826086956522,
946
- "grad_norm": 1.4292306900024414,
947
- "learning_rate": 2.9975927081889322e-05,
948
- "loss": 0.391,
949
- "step": 1000
950
- },
951
- {
952
- "epoch": 43.91304347826087,
953
- "grad_norm": 1.1838629245758057,
954
- "learning_rate": 2.9975389303333047e-05,
955
- "loss": 0.3456,
956
- "step": 1010
957
- },
958
- {
959
- "epoch": 44.34782608695652,
960
- "grad_norm": 2.111812114715576,
961
- "learning_rate": 2.997484558901093e-05,
962
- "loss": 0.3922,
963
- "step": 1020
964
- },
965
- {
966
- "epoch": 44.78260869565217,
967
- "grad_norm": 1.6915301084518433,
968
- "learning_rate": 2.9974295939138465e-05,
969
- "loss": 0.3804,
970
- "step": 1030
971
- },
972
- {
973
- "epoch": 45.21739130434783,
974
- "grad_norm": 1.2465533018112183,
975
- "learning_rate": 2.9973740353933523e-05,
976
- "loss": 0.2648,
977
- "step": 1040
978
- },
979
- {
980
- "epoch": 45.65217391304348,
981
- "grad_norm": 1.68025541305542,
982
- "learning_rate": 2.997317883361632e-05,
983
- "loss": 0.3611,
984
- "step": 1050
985
- },
986
- {
987
- "epoch": 45.65217391304348,
988
- "eval_loss": 0.7759392261505127,
989
- "eval_runtime": 0.5365,
990
- "eval_samples_per_second": 18.64,
991
- "eval_steps_per_second": 18.64,
992
- "step": 1050
993
- },
994
- {
995
- "Start_State_loss": 0.8601926565170288,
996
- "Start_State_runtime": 0.5037,
997
- "Start_State_samples_per_second": 19.854,
998
- "Start_State_steps_per_second": 19.854,
999
- "epoch": 45.65217391304348,
1000
- "step": 1050
1001
- },
1002
- {
1003
- "Raw_Model_loss": 0.7759392261505127,
1004
- "Raw_Model_runtime": 0.4675,
1005
- "Raw_Model_samples_per_second": 21.389,
1006
- "Raw_Model_steps_per_second": 21.389,
1007
- "epoch": 45.65217391304348,
1008
- "step": 1050
1009
- },
1010
- {
1011
- "SWA_loss": 0.7227687835693359,
1012
- "SWA_runtime": 0.4756,
1013
- "SWA_samples_per_second": 21.025,
1014
- "SWA_steps_per_second": 21.025,
1015
- "epoch": 45.65217391304348,
1016
- "step": 1050
1017
- },
1018
- {
1019
- "EMA_loss": 0.8605559468269348,
1020
- "EMA_runtime": 0.4881,
1021
- "EMA_samples_per_second": 20.489,
1022
- "EMA_steps_per_second": 20.489,
1023
- "epoch": 45.65217391304348,
1024
- "step": 1050
1025
- },
1026
- {
1027
- "epoch": 46.08695652173913,
1028
- "grad_norm": 1.7922283411026,
1029
- "learning_rate": 2.997261137840943e-05,
1030
- "loss": 0.4104,
1031
- "step": 1060
1032
- },
1033
- {
1034
- "epoch": 46.52173913043478,
1035
- "grad_norm": 2.145780324935913,
1036
- "learning_rate": 2.9972037988537758e-05,
1037
- "loss": 0.3784,
1038
- "step": 1070
1039
- },
1040
- {
1041
- "epoch": 46.95652173913044,
1042
- "grad_norm": 1.9540642499923706,
1043
- "learning_rate": 2.9971458664228595e-05,
1044
- "loss": 0.3325,
1045
- "step": 1080
1046
- },
1047
- {
1048
- "epoch": 47.391304347826086,
1049
- "grad_norm": 2.150164842605591,
1050
- "learning_rate": 2.997087340571156e-05,
1051
- "loss": 0.3369,
1052
- "step": 1090
1053
- },
1054
- {
1055
- "epoch": 47.82608695652174,
1056
- "grad_norm": 1.539474606513977,
1057
- "learning_rate": 2.997028221321863e-05,
1058
- "loss": 0.3564,
1059
- "step": 1100
1060
- },
1061
- {
1062
- "epoch": 48.26086956521739,
1063
- "grad_norm": 2.3236191272735596,
1064
- "learning_rate": 2.9969685086984132e-05,
1065
- "loss": 0.3736,
1066
- "step": 1110
1067
- },
1068
- {
1069
- "epoch": 48.69565217391305,
1070
- "grad_norm": 1.6481757164001465,
1071
- "learning_rate": 2.9969082027244755e-05,
1072
- "loss": 0.2999,
1073
- "step": 1120
1074
- },
1075
- {
1076
- "epoch": 49.130434782608695,
1077
- "grad_norm": 1.8113096952438354,
1078
- "learning_rate": 2.996847303423953e-05,
1079
- "loss": 0.4149,
1080
- "step": 1130
1081
- },
1082
- {
1083
- "epoch": 49.56521739130435,
1084
- "grad_norm": 1.3106703758239746,
1085
- "learning_rate": 2.9967858108209838e-05,
1086
- "loss": 0.3714,
1087
- "step": 1140
1088
- },
1089
- {
1090
- "epoch": 50.0,
1091
- "grad_norm": 2.9416587352752686,
1092
- "learning_rate": 2.9967237249399417e-05,
1093
- "loss": 0.292,
1094
- "step": 1150
1095
- },
1096
- {
1097
- "epoch": 50.43478260869565,
1098
- "grad_norm": 1.5631065368652344,
1099
- "learning_rate": 2.996661045805436e-05,
1100
- "loss": 0.2963,
1101
- "step": 1160
1102
- },
1103
- {
1104
- "epoch": 50.869565217391305,
1105
- "grad_norm": 1.8589760065078735,
1106
- "learning_rate": 2.9965977734423106e-05,
1107
- "loss": 0.3415,
1108
- "step": 1170
1109
- },
1110
- {
1111
- "epoch": 51.30434782608695,
1112
- "grad_norm": 1.9185295104980469,
1113
- "learning_rate": 2.9965339078756445e-05,
1114
- "loss": 0.3539,
1115
- "step": 1180
1116
- },
1117
- {
1118
- "epoch": 51.73913043478261,
1119
- "grad_norm": 1.1838868856430054,
1120
- "learning_rate": 2.9964694491307514e-05,
1121
- "loss": 0.2803,
1122
- "step": 1190
1123
- },
1124
- {
1125
- "epoch": 52.17391304347826,
1126
- "grad_norm": 2.4929492473602295,
1127
- "learning_rate": 2.996404397233182e-05,
1128
- "loss": 0.4083,
1129
- "step": 1200
1130
- },
1131
- {
1132
- "epoch": 52.17391304347826,
1133
- "eval_loss": 0.8023056983947754,
1134
- "eval_runtime": 0.4133,
1135
- "eval_samples_per_second": 24.196,
1136
- "eval_steps_per_second": 24.196,
1137
- "step": 1200
1138
- },
1139
- {
1140
- "Start_State_loss": 0.8601926565170288,
1141
- "Start_State_runtime": 0.407,
1142
- "Start_State_samples_per_second": 24.569,
1143
- "Start_State_steps_per_second": 24.569,
1144
- "epoch": 52.17391304347826,
1145
- "step": 1200
1146
- },
1147
- {
1148
- "Raw_Model_loss": 0.8023056983947754,
1149
- "Raw_Model_runtime": 0.3981,
1150
- "Raw_Model_samples_per_second": 25.122,
1151
- "Raw_Model_steps_per_second": 25.122,
1152
- "epoch": 52.17391304347826,
1153
- "step": 1200
1154
- },
1155
- {
1156
- "SWA_loss": 0.7237697243690491,
1157
- "SWA_runtime": 0.4024,
1158
- "SWA_samples_per_second": 24.852,
1159
- "SWA_steps_per_second": 24.852,
1160
- "epoch": 52.17391304347826,
1161
- "step": 1200
1162
- },
1163
- {
1164
- "EMA_loss": 0.8598647117614746,
1165
- "EMA_runtime": 0.4326,
1166
- "EMA_samples_per_second": 23.115,
1167
- "EMA_steps_per_second": 23.115,
1168
- "epoch": 52.17391304347826,
1169
- "step": 1200
1170
- },
1171
- {
1172
- "epoch": 52.608695652173914,
1173
- "grad_norm": 1.6113795042037964,
1174
- "learning_rate": 1.4982021986165911e-06,
1175
- "loss": 0.2866,
1176
- "step": 1210
1177
- },
1178
- {
1179
- "epoch": 53.04347826086956,
1180
- "grad_norm": 1.7770823240280151,
1181
- "learning_rate": 2.9964043972331822e-06,
1182
- "loss": 0.365,
1183
- "step": 1220
1184
- },
1185
- {
1186
- "epoch": 53.47826086956522,
1187
- "grad_norm": 1.4132719039916992,
1188
- "learning_rate": 4.494606595849773e-06,
1189
- "loss": 0.2718,
1190
- "step": 1230
1191
- },
1192
- {
1193
- "epoch": 53.91304347826087,
1194
- "grad_norm": 1.9334650039672852,
1195
- "learning_rate": 5.9928087944663644e-06,
1196
- "loss": 0.3338,
1197
- "step": 1240
1198
- },
1199
- {
1200
- "epoch": 54.34782608695652,
1201
- "grad_norm": 1.9728986024856567,
1202
- "learning_rate": 7.491010993082955e-06,
1203
- "loss": 0.3853,
1204
- "step": 1250
1205
- },
1206
- {
1207
- "epoch": 54.78260869565217,
1208
- "grad_norm": 1.1599531173706055,
1209
- "learning_rate": 8.989213191699545e-06,
1210
- "loss": 0.2838,
1211
- "step": 1260
1212
- },
1213
- {
1214
- "epoch": 55.21739130434783,
1215
- "grad_norm": 1.558973789215088,
1216
- "learning_rate": 1.0487415390316136e-05,
1217
- "loss": 0.311,
1218
- "step": 1270
1219
- },
1220
- {
1221
- "epoch": 55.65217391304348,
1222
- "grad_norm": 1.7310874462127686,
1223
- "learning_rate": 1.1985617588932729e-05,
1224
- "loss": 0.3553,
1225
- "step": 1280
1226
- },
1227
- {
1228
- "epoch": 56.08695652173913,
1229
- "grad_norm": 2.2715365886688232,
1230
- "learning_rate": 1.348381978754932e-05,
1231
- "loss": 0.2844,
1232
- "step": 1290
1233
- },
1234
- {
1235
- "epoch": 56.52173913043478,
1236
- "grad_norm": 1.467916488647461,
1237
- "learning_rate": 1.498202198616591e-05,
1238
- "loss": 0.3391,
1239
- "step": 1300
1240
- },
1241
- {
1242
- "epoch": 56.95652173913044,
1243
- "grad_norm": 1.974404215812683,
1244
- "learning_rate": 1.4982020501567203e-05,
1245
- "loss": 0.3314,
1246
- "step": 1310
1247
- },
1248
- {
1249
- "epoch": 57.391304347826086,
1250
- "grad_norm": 1.4068485498428345,
1251
- "learning_rate": 1.4982016047771664e-05,
1252
- "loss": 0.3113,
1253
- "step": 1320
1254
- },
1255
- {
1256
- "epoch": 57.82608695652174,
1257
- "grad_norm": 2.7793936729431152,
1258
- "learning_rate": 1.4982008624781062e-05,
1259
- "loss": 0.3372,
1260
- "step": 1330
1261
- },
1262
- {
1263
- "epoch": 58.26086956521739,
1264
- "grad_norm": 1.4399445056915283,
1265
- "learning_rate": 1.4981998232598337e-05,
1266
- "loss": 0.3301,
1267
- "step": 1340
1268
- },
1269
- {
1270
- "epoch": 58.69565217391305,
1271
- "grad_norm": 1.8218740224838257,
1272
- "learning_rate": 1.4981984871227611e-05,
1273
- "loss": 0.3077,
1274
- "step": 1350
1275
- },
1276
- {
1277
- "epoch": 58.69565217391305,
1278
- "eval_loss": 0.8209422826766968,
1279
- "eval_runtime": 0.4642,
1280
- "eval_samples_per_second": 21.544,
1281
- "eval_steps_per_second": 21.544,
1282
- "step": 1350
1283
- },
1284
- {
1285
- "Start_State_loss": 0.8601926565170288,
1286
- "Start_State_runtime": 0.4587,
1287
- "Start_State_samples_per_second": 21.801,
1288
- "Start_State_steps_per_second": 21.801,
1289
- "epoch": 58.69565217391305,
1290
- "step": 1350
1291
- },
1292
- {
1293
- "Raw_Model_loss": 0.8209422826766968,
1294
- "Raw_Model_runtime": 0.5144,
1295
- "Raw_Model_samples_per_second": 19.438,
1296
- "Raw_Model_steps_per_second": 19.438,
1297
- "epoch": 58.69565217391305,
1298
- "step": 1350
1299
- },
1300
- {
1301
- "SWA_loss": 0.7251114249229431,
1302
- "SWA_runtime": 0.4605,
1303
- "SWA_samples_per_second": 21.718,
1304
- "SWA_steps_per_second": 21.718,
1305
- "epoch": 58.69565217391305,
1306
- "step": 1350
1307
- },
1308
- {
1309
- "EMA_loss": 0.8608489036560059,
1310
- "EMA_runtime": 0.4317,
1311
- "EMA_samples_per_second": 23.166,
1312
- "EMA_steps_per_second": 23.166,
1313
- "epoch": 58.69565217391305,
1314
- "step": 1350
1315
- },
1316
- {
1317
- "epoch": 59.130434782608695,
1318
- "grad_norm": 1.5807944536209106,
1319
- "learning_rate": 1.4981968540674177e-05,
1320
- "loss": 0.3206,
1321
- "step": 1360
1322
- },
1323
- {
1324
- "epoch": 59.56521739130435,
1325
- "grad_norm": 1.40355384349823,
1326
- "learning_rate": 1.4981949240944509e-05,
1327
- "loss": 0.3012,
1328
- "step": 1370
1329
- },
1330
- {
1331
- "epoch": 60.0,
1332
- "grad_norm": 1.6165056228637695,
1333
- "learning_rate": 1.4981926972046258e-05,
1334
- "loss": 0.3098,
1335
- "step": 1380
1336
- },
1337
- {
1338
- "epoch": 60.43478260869565,
1339
- "grad_norm": 1.9167027473449707,
1340
- "learning_rate": 1.498190173398825e-05,
1341
- "loss": 0.3171,
1342
- "step": 1390
1343
- },
1344
- {
1345
- "epoch": 60.869565217391305,
1346
- "grad_norm": 1.539297342300415,
1347
- "learning_rate": 1.4981873526780487e-05,
1348
- "loss": 0.3049,
1349
- "step": 1400
1350
- },
1351
- {
1352
- "epoch": 61.30434782608695,
1353
- "grad_norm": 1.4211211204528809,
1354
- "learning_rate": 1.4981842350434152e-05,
1355
- "loss": 0.3045,
1356
- "step": 1410
1357
- },
1358
- {
1359
- "epoch": 61.73913043478261,
1360
- "grad_norm": 1.4864341020584106,
1361
- "learning_rate": 1.49818082049616e-05,
1362
- "loss": 0.3207,
1363
- "step": 1420
1364
- },
1365
- {
1366
- "epoch": 62.17391304347826,
1367
- "grad_norm": 2.1776299476623535,
1368
- "learning_rate": 1.4981771090376367e-05,
1369
- "loss": 0.2862,
1370
- "step": 1430
1371
- },
1372
- {
1373
- "epoch": 62.608695652173914,
1374
- "grad_norm": 1.8853501081466675,
1375
- "learning_rate": 1.4981731006693164e-05,
1376
- "loss": 0.3212,
1377
- "step": 1440
1378
- },
1379
- {
1380
- "epoch": 63.04347826086956,
1381
- "grad_norm": 1.3142286539077759,
1382
- "learning_rate": 1.4981687953927875e-05,
1383
- "loss": 0.3127,
1384
- "step": 1450
1385
- },
1386
- {
1387
- "epoch": 63.47826086956522,
1388
- "grad_norm": 1.9734851121902466,
1389
- "learning_rate": 1.498164193209757e-05,
1390
- "loss": 0.3447,
1391
- "step": 1460
1392
- },
1393
- {
1394
- "epoch": 63.91304347826087,
1395
- "grad_norm": 1.655447006225586,
1396
- "learning_rate": 1.498159294122049e-05,
1397
- "loss": 0.2921,
1398
- "step": 1470
1399
- },
1400
- {
1401
- "epoch": 64.34782608695652,
1402
- "grad_norm": 1.7767964601516724,
1403
- "learning_rate": 1.4981540981316052e-05,
1404
- "loss": 0.269,
1405
- "step": 1480
1406
- },
1407
- {
1408
- "epoch": 64.78260869565217,
1409
- "grad_norm": 1.5196256637573242,
1410
- "learning_rate": 1.4981486052404848e-05,
1411
- "loss": 0.3583,
1412
- "step": 1490
1413
- },
1414
- {
1415
- "epoch": 65.21739130434783,
1416
- "grad_norm": 1.4027047157287598,
1417
- "learning_rate": 1.4981428154508652e-05,
1418
- "loss": 0.2693,
1419
- "step": 1500
1420
- },
1421
- {
1422
- "epoch": 65.21739130434783,
1423
- "eval_loss": 0.832839846611023,
1424
- "eval_runtime": 0.4275,
1425
- "eval_samples_per_second": 23.391,
1426
- "eval_steps_per_second": 23.391,
1427
- "step": 1500
1428
- },
1429
- {
1430
- "Start_State_loss": 0.8601926565170288,
1431
- "Start_State_runtime": 0.4059,
1432
- "Start_State_samples_per_second": 24.634,
1433
- "Start_State_steps_per_second": 24.634,
1434
- "epoch": 65.21739130434783,
1435
- "step": 1500
1436
- },
1437
- {
1438
- "Raw_Model_loss": 0.832839846611023,
1439
- "Raw_Model_runtime": 0.3946,
1440
- "Raw_Model_samples_per_second": 25.341,
1441
- "Raw_Model_steps_per_second": 25.341,
1442
- "epoch": 65.21739130434783,
1443
- "step": 1500
1444
- },
1445
- {
1446
- "SWA_loss": 0.7298181056976318,
1447
- "SWA_runtime": 0.3986,
1448
- "SWA_samples_per_second": 25.087,
1449
- "SWA_steps_per_second": 25.087,
1450
- "epoch": 65.21739130434783,
1451
- "step": 1500
1452
- },
1453
- {
1454
- "EMA_loss": 0.8607869148254395,
1455
- "EMA_runtime": 0.4036,
1456
- "EMA_samples_per_second": 24.774,
1457
- "EMA_steps_per_second": 24.774,
1458
- "epoch": 65.21739130434783,
1459
- "step": 1500
1460
- },
1461
- {
1462
- "epoch": 65.65217391304348,
1463
- "grad_norm": 2.2542121410369873,
1464
- "learning_rate": 1.4981367287650419e-05,
1465
- "loss": 0.3164,
1466
- "step": 1510
1467
- },
1468
- {
1469
- "epoch": 66.08695652173913,
1470
- "grad_norm": 1.7643301486968994,
1471
- "learning_rate": 1.4981303451854267e-05,
1472
- "loss": 0.2947,
1473
- "step": 1520
1474
- },
1475
- {
1476
- "epoch": 66.52173913043478,
1477
- "grad_norm": 1.7471901178359985,
1478
- "learning_rate": 1.4981236647145501e-05,
1479
- "loss": 0.3103,
1480
- "step": 1530
1481
- },
1482
- {
1483
- "epoch": 66.95652173913044,
1484
- "grad_norm": 2.057833194732666,
1485
- "learning_rate": 1.4981166873550601e-05,
1486
- "loss": 0.3051,
1487
- "step": 1540
1488
- },
1489
- {
1490
- "epoch": 67.3913043478261,
1491
- "grad_norm": 1.7425355911254883,
1492
- "learning_rate": 1.4981094131097224e-05,
1493
- "loss": 0.2713,
1494
- "step": 1550
1495
- },
1496
- {
1497
- "epoch": 67.82608695652173,
1498
- "grad_norm": 2.050690174102783,
1499
- "learning_rate": 1.49810184198142e-05,
1500
- "loss": 0.3439,
1501
- "step": 1560
1502
- },
1503
- {
1504
- "epoch": 68.26086956521739,
1505
- "grad_norm": 2.0778491497039795,
1506
- "learning_rate": 1.498093973973154e-05,
1507
- "loss": 0.2503,
1508
- "step": 1570
1509
- },
1510
- {
1511
- "epoch": 68.69565217391305,
1512
- "grad_norm": 1.8078017234802246,
1513
- "learning_rate": 1.4980858090880429e-05,
1514
- "loss": 0.2862,
1515
- "step": 1580
1516
- },
1517
- {
1518
- "epoch": 69.1304347826087,
1519
- "grad_norm": 1.9451018571853638,
1520
- "learning_rate": 1.4980773473293232e-05,
1521
- "loss": 0.368,
1522
- "step": 1590
1523
- },
1524
- {
1525
- "epoch": 69.56521739130434,
1526
- "grad_norm": 1.9795953035354614,
1527
- "learning_rate": 1.4980685887003486e-05,
1528
- "loss": 0.3073,
1529
- "step": 1600
1530
- },
1531
- {
1532
- "epoch": 70.0,
1533
- "grad_norm": 1.6645371913909912,
1534
- "learning_rate": 1.498059533204591e-05,
1535
- "loss": 0.2691,
1536
- "step": 1610
1537
- },
1538
- {
1539
- "epoch": 70.43478260869566,
1540
- "grad_norm": 2.21379017829895,
1541
- "learning_rate": 1.4980501808456398e-05,
1542
- "loss": 0.3142,
1543
- "step": 1620
1544
- },
1545
- {
1546
- "epoch": 70.8695652173913,
1547
- "grad_norm": 1.9500844478607178,
1548
- "learning_rate": 1.4980405316272018e-05,
1549
- "loss": 0.2996,
1550
- "step": 1630
1551
- },
1552
- {
1553
- "epoch": 71.30434782608695,
1554
- "grad_norm": 2.359870195388794,
1555
- "learning_rate": 1.4980305855531015e-05,
1556
- "loss": 0.2888,
1557
- "step": 1640
1558
- },
1559
- {
1560
- "epoch": 71.73913043478261,
1561
- "grad_norm": 1.8895881175994873,
1562
- "learning_rate": 1.4980203426272815e-05,
1563
- "loss": 0.2624,
1564
- "step": 1650
1565
- },
1566
- {
1567
- "epoch": 71.73913043478261,
1568
- "eval_loss": 0.847686767578125,
1569
- "eval_runtime": 0.5359,
1570
- "eval_samples_per_second": 18.659,
1571
- "eval_steps_per_second": 18.659,
1572
- "step": 1650
1573
- },
1574
- {
1575
- "Start_State_loss": 0.8601926565170288,
1576
- "Start_State_runtime": 0.3989,
1577
- "Start_State_samples_per_second": 25.067,
1578
- "Start_State_steps_per_second": 25.067,
1579
- "epoch": 71.73913043478261,
1580
- "step": 1650
1581
- },
1582
- {
1583
- "Raw_Model_loss": 0.847686767578125,
1584
- "Raw_Model_runtime": 0.4133,
1585
- "Raw_Model_samples_per_second": 24.198,
1586
- "Raw_Model_steps_per_second": 24.198,
1587
- "epoch": 71.73913043478261,
1588
- "step": 1650
1589
- },
1590
- {
1591
- "SWA_loss": 0.7314801216125488,
1592
- "SWA_runtime": 0.3914,
1593
- "SWA_samples_per_second": 25.548,
1594
- "SWA_steps_per_second": 25.548,
1595
- "epoch": 71.73913043478261,
1596
- "step": 1650
1597
- },
1598
- {
1599
- "EMA_loss": 0.8605908155441284,
1600
- "EMA_runtime": 0.3897,
1601
- "EMA_samples_per_second": 25.662,
1602
- "EMA_steps_per_second": 25.662,
1603
- "epoch": 71.73913043478261,
1604
- "step": 1650
1605
- },
1606
- {
1607
- "epoch": 72.17391304347827,
1608
- "grad_norm": 1.664617896080017,
1609
- "learning_rate": 1.4980098028538014e-05,
1610
- "loss": 0.3276,
1611
- "step": 1660
1612
- },
1613
- {
1614
- "epoch": 72.6086956521739,
1615
- "grad_norm": 2.0435194969177246,
1616
- "learning_rate": 1.4979989662368391e-05,
1617
- "loss": 0.2963,
1618
- "step": 1670
1619
- },
1620
- {
1621
- "epoch": 73.04347826086956,
1622
- "grad_norm": 1.906278133392334,
1623
- "learning_rate": 1.4979878327806899e-05,
1624
- "loss": 0.3093,
1625
- "step": 1680
1626
- },
1627
- {
1628
- "epoch": 73.47826086956522,
1629
- "grad_norm": 2.026448965072632,
1630
- "learning_rate": 1.4979764024897668e-05,
1631
- "loss": 0.2875,
1632
- "step": 1690
1633
- },
1634
- {
1635
- "epoch": 73.91304347826087,
1636
- "grad_norm": 1.8200604915618896,
1637
- "learning_rate": 1.4979646753686002e-05,
1638
- "loss": 0.2793,
1639
- "step": 1700
1640
- },
1641
- {
1642
- "epoch": 74.34782608695652,
1643
- "grad_norm": 1.414810061454773,
1644
- "learning_rate": 1.4979526514218385e-05,
1645
- "loss": 0.277,
1646
- "step": 1710
1647
- },
1648
- {
1649
- "epoch": 74.78260869565217,
1650
- "grad_norm": 1.4874234199523926,
1651
- "learning_rate": 1.4979403306542473e-05,
1652
- "loss": 0.3277,
1653
- "step": 1720
1654
- },
1655
- {
1656
- "epoch": 75.21739130434783,
1657
- "grad_norm": 1.5648179054260254,
1658
- "learning_rate": 1.4979277130707107e-05,
1659
- "loss": 0.2337,
1660
- "step": 1730
1661
- },
1662
- {
1663
- "epoch": 75.65217391304348,
1664
- "grad_norm": 1.6863374710083008,
1665
- "learning_rate": 1.4979147986762295e-05,
1666
- "loss": 0.3146,
1667
- "step": 1740
1668
- },
1669
- {
1670
- "epoch": 76.08695652173913,
1671
- "grad_norm": 1.7994861602783203,
1672
- "learning_rate": 1.4979015874759227e-05,
1673
- "loss": 0.2694,
1674
- "step": 1750
1675
- },
1676
- {
1677
- "epoch": 76.52173913043478,
1678
- "grad_norm": 1.8553599119186401,
1679
- "learning_rate": 1.4978880794750266e-05,
1680
- "loss": 0.2661,
1681
- "step": 1760
1682
- },
1683
- {
1684
- "epoch": 76.95652173913044,
1685
- "grad_norm": 1.3038052320480347,
1686
- "learning_rate": 1.4978742746788957e-05,
1687
- "loss": 0.3005,
1688
- "step": 1770
1689
- },
1690
- {
1691
- "epoch": 77.3913043478261,
1692
- "grad_norm": 1.8376268148422241,
1693
- "learning_rate": 1.4978601730930014e-05,
1694
- "loss": 0.2843,
1695
- "step": 1780
1696
- },
1697
- {
1698
- "epoch": 77.82608695652173,
1699
- "grad_norm": 1.8291127681732178,
1700
- "learning_rate": 1.4978457747229335e-05,
1701
- "loss": 0.2715,
1702
- "step": 1790
1703
- },
1704
- {
1705
- "epoch": 78.26086956521739,
1706
- "grad_norm": 1.933289885520935,
1707
- "learning_rate": 1.497831079574399e-05,
1708
- "loss": 0.3055,
1709
- "step": 1800
1710
- },
1711
- {
1712
- "epoch": 78.26086956521739,
1713
- "eval_loss": 0.8690454363822937,
1714
- "eval_runtime": 0.4165,
1715
- "eval_samples_per_second": 24.008,
1716
- "eval_steps_per_second": 24.008,
1717
- "step": 1800
1718
- },
1719
- {
1720
- "Start_State_loss": 0.8601926565170288,
1721
- "Start_State_runtime": 0.3983,
1722
- "Start_State_samples_per_second": 25.109,
1723
- "Start_State_steps_per_second": 25.109,
1724
- "epoch": 78.26086956521739,
1725
- "step": 1800
1726
- },
1727
- {
1728
- "Raw_Model_loss": 0.8690454363822937,
1729
- "Raw_Model_runtime": 0.4147,
1730
- "Raw_Model_samples_per_second": 24.115,
1731
- "Raw_Model_steps_per_second": 24.115,
1732
- "epoch": 78.26086956521739,
1733
- "step": 1800
1734
- },
1735
- {
1736
- "SWA_loss": 0.7372413873672485,
1737
- "SWA_runtime": 0.4087,
1738
- "SWA_samples_per_second": 24.465,
1739
- "SWA_steps_per_second": 24.465,
1740
- "epoch": 78.26086956521739,
1741
- "step": 1800
1742
- },
1743
- {
1744
- "EMA_loss": 0.8606707453727722,
1745
- "EMA_runtime": 0.4092,
1746
- "EMA_samples_per_second": 24.44,
1747
- "EMA_steps_per_second": 24.44,
1748
- "epoch": 78.26086956521739,
1749
- "step": 1800
1750
  }
1751
  ],
1752
  "logging_steps": 10,
@@ -1766,7 +171,7 @@
1766
  "attributes": {}
1767
  }
1768
  },
1769
- "total_flos": 4.631084552967782e+16,
1770
  "train_batch_size": 4,
1771
  "trial_name": null,
1772
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7964445352554321,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 6.521739130434782,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.43478260869565216,
13
+ "grad_norm": 1.5021672248840332,
14
  "learning_rate": 3e-06,
15
+ "loss": 0.9064,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.8695652173913043,
20
+ "grad_norm": 1.6870806217193604,
21
  "learning_rate": 6e-06,
22
+ "loss": 0.9026,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 1.3043478260869565,
27
+ "grad_norm": 1.7296316623687744,
28
  "learning_rate": 9e-06,
29
+ "loss": 0.8997,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 1.7391304347826086,
34
+ "grad_norm": 1.453599214553833,
35
  "learning_rate": 1.2e-05,
36
+ "loss": 0.9089,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 2.1739130434782608,
41
+ "grad_norm": 1.351836919784546,
42
  "learning_rate": 1.5e-05,
43
+ "loss": 0.8361,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 2.608695652173913,
48
+ "grad_norm": 2.0312016010284424,
49
  "learning_rate": 1.8e-05,
50
+ "loss": 0.8893,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 3.0434782608695654,
55
+ "grad_norm": 1.4920508861541748,
56
  "learning_rate": 2.1e-05,
57
+ "loss": 0.8911,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 3.4782608695652173,
62
+ "grad_norm": 1.7295340299606323,
63
  "learning_rate": 2.4e-05,
64
+ "loss": 0.8233,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 3.9130434782608696,
69
+ "grad_norm": 1.4239747524261475,
70
  "learning_rate": 2.7000000000000002e-05,
71
+ "loss": 0.8526,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 4.3478260869565215,
76
+ "grad_norm": 1.3656420707702637,
77
  "learning_rate": 3e-05,
78
+ "loss": 0.8646,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 4.782608695652174,
83
+ "grad_norm": 2.2135441303253174,
84
  "learning_rate": 2.999999702723963e-05,
85
+ "loss": 0.8228,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 5.217391304347826,
90
+ "grad_norm": 1.0727280378341675,
91
  "learning_rate": 2.9999988108959687e-05,
92
+ "loss": 0.7654,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 5.6521739130434785,
97
+ "grad_norm": 1.559777021408081,
98
  "learning_rate": 2.9999973245163716e-05,
99
+ "loss": 0.7413,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 6.086956521739131,
104
+ "grad_norm": 1.9066534042358398,
105
  "learning_rate": 2.99999524358576e-05,
106
+ "loss": 0.765,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 6.521739130434782,
111
+ "grad_norm": 1.1222983598709106,
112
  "learning_rate": 2.9999925681049593e-05,
113
+ "loss": 0.7855,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 6.521739130434782,
118
+ "eval_loss": 0.7964445352554321,
119
+ "eval_runtime": 0.4778,
120
+ "eval_samples_per_second": 20.929,
121
+ "eval_steps_per_second": 20.929,
122
  "step": 150
123
  },
124
  {
125
+ "Start_State_loss": 0.8603047132492065,
126
+ "Start_State_runtime": 0.4294,
127
+ "Start_State_samples_per_second": 23.291,
128
+ "Start_State_steps_per_second": 23.291,
129
  "epoch": 6.521739130434782,
130
  "step": 150
131
  },
132
  {
133
+ "Raw_Model_loss": 0.7964445352554321,
134
+ "Raw_Model_runtime": 0.409,
135
+ "Raw_Model_samples_per_second": 24.447,
136
+ "Raw_Model_steps_per_second": 24.447,
137
  "epoch": 6.521739130434782,
138
  "step": 150
139
  },
140
  {
141
+ "SWA_loss": 0.8603047132492065,
142
+ "SWA_runtime": 0.4233,
143
+ "SWA_samples_per_second": 23.622,
144
+ "SWA_steps_per_second": 23.622,
145
  "epoch": 6.521739130434782,
146
  "step": 150
147
  },
148
  {
149
+ "EMA_loss": 0.860609233379364,
150
+ "EMA_runtime": 0.4245,
151
+ "EMA_samples_per_second": 23.56,
152
+ "EMA_steps_per_second": 23.56,
153
  "epoch": 6.521739130434782,
154
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  }
156
  ],
157
  "logging_steps": 10,
 
171
  "attributes": {}
172
  }
173
  },
174
+ "total_flos": 3894839614291968.0,
175
  "train_batch_size": 4,
176
  "trial_name": null,
177
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04cc93848fede83aea266ada8d3a6a175e9ab0ace29af265b71333398b052846
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ba11e02e3c662a7d3d48a1857c9dd4de50cd9d610a43e2273c0cd737526dd29
3
  size 5368