TweedleDeepLearnings committed on
Commit
2f1c90d
·
verified ·
1 Parent(s): a40512b

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5101d174985af56a5ca8f66d4497a19fc1df92b1752fd206a9acc99b0f5016c5
3
  size 2101902096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2314f125d34ade15e993416f255766d0491d5e00c74946a6a9a7639c77297350
3
  size 2101902096
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:987d9f0f8403b666ca4e1990d4dc7891c8110a1c42e7d6aac6980dcbb286de39
3
  size 4071291450
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd958853885bc4ee006f0df661f3c058ae435618526f06ab5d4874778a025e20
3
  size 4071291450
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38fc59e25c61798df539d64abf9802bee7a84d13adb6854b0d457cfa6ed5ff0c
3
- size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd235718e3623137e071bdd8f7e045fcf547d46c7047e3798cd61cc469fbcec
3
+ size 14308
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b00ea15ea31930415de4593eb2a65802185be2077128d834e34240164dbbce43
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19c9bb74c7ccc0ce8938928022bb60a0f70baa72e09d1fb547a24fe2599bec65
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,690 +1,125 @@
1
  {
2
- "best_metric": 0.5061885118484497,
3
- "best_model_checkpoint": "./output/checkpoint-900",
4
- "epoch": 0.07205187735169322,
5
  "eval_steps": 150,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0008005764150188136,
13
- "grad_norm": 14.606176376342773,
14
  "learning_rate": 8.000000000000001e-06,
15
- "loss": 1.1075,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.0016011528300376272,
20
- "grad_norm": 11.753077507019043,
21
  "learning_rate": 1.6000000000000003e-05,
22
- "loss": 0.8446,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.0024017292450564404,
27
- "grad_norm": 9.326775550842285,
28
  "learning_rate": 2.4e-05,
29
- "loss": 0.6425,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.0032023056600752543,
34
- "grad_norm": 12.25445556640625,
35
  "learning_rate": 3.2000000000000005e-05,
36
- "loss": 0.6062,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.004002882075094067,
41
- "grad_norm": 8.398188591003418,
42
  "learning_rate": 4e-05,
43
- "loss": 0.5653,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.004803458490112881,
48
- "grad_norm": 10.263713836669922,
49
  "learning_rate": 4.8e-05,
50
- "loss": 0.5658,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.005604034905131695,
55
- "grad_norm": 9.32615852355957,
56
  "learning_rate": 5.6e-05,
57
- "loss": 0.5652,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.006404611320150509,
62
- "grad_norm": 8.419366836547852,
63
  "learning_rate": 6.400000000000001e-05,
64
- "loss": 0.5538,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.007205187735169322,
69
- "grad_norm": 9.608399391174316,
70
  "learning_rate": 7.2e-05,
71
- "loss": 0.5357,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.008005764150188135,
76
- "grad_norm": 10.52286434173584,
77
  "learning_rate": 8e-05,
78
- "loss": 0.5475,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.008806340565206948,
83
- "grad_norm": 8.929399490356445,
84
  "learning_rate": 7.999917787833465e-05,
85
- "loss": 0.554,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.009606916980225762,
90
- "grad_norm": 13.271443367004395,
91
  "learning_rate": 7.999671154713278e-05,
92
- "loss": 0.5339,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.010407493395244577,
97
- "grad_norm": 9.399694442749023,
98
  "learning_rate": 7.99926011077756e-05,
99
- "loss": 0.5338,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.01120806981026339,
104
- "grad_norm": 8.28138542175293,
105
  "learning_rate": 7.99868467292272e-05,
106
- "loss": 0.4835,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.012008646225282204,
111
- "grad_norm": 10.049982070922852,
112
  "learning_rate": 7.997944864802752e-05,
113
- "loss": 0.5392,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.012008646225282204,
118
- "eval_loss": 0.5402679443359375,
119
- "eval_runtime": 48.4496,
120
- "eval_samples_per_second": 10.32,
121
- "eval_steps_per_second": 10.32,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 0.012809222640301017,
126
- "grad_norm": 13.39472484588623,
127
- "learning_rate": 7.997040716828271e-05,
128
- "loss": 0.6016,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.01360979905531983,
133
- "grad_norm": 9.441704750061035,
134
- "learning_rate": 7.995972266165259e-05,
135
- "loss": 0.5175,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.014410375470338644,
140
- "grad_norm": 12.495186805725098,
141
- "learning_rate": 7.994739556733538e-05,
142
- "loss": 0.5198,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.015210951885357458,
147
- "grad_norm": 9.358332633972168,
148
- "learning_rate": 7.993342639204965e-05,
149
- "loss": 0.5557,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.01601152830037627,
154
- "grad_norm": 10.003859519958496,
155
- "learning_rate": 7.991781571001347e-05,
156
- "loss": 0.5767,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.016812104715395085,
161
- "grad_norm": 8.369383811950684,
162
- "learning_rate": 7.990056416292084e-05,
163
- "loss": 0.5524,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 0.017612681130413897,
168
- "grad_norm": 9.327197074890137,
169
- "learning_rate": 7.988167245991528e-05,
170
- "loss": 0.5463,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 0.01841325754543271,
175
- "grad_norm": 8.661181449890137,
176
- "learning_rate": 7.986114137756074e-05,
177
- "loss": 0.577,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 0.019213833960451523,
182
- "grad_norm": 8.12822151184082,
183
- "learning_rate": 7.983897175980957e-05,
184
- "loss": 0.5381,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 0.02001441037547034,
189
- "grad_norm": 8.482646942138672,
190
- "learning_rate": 7.981516451796794e-05,
191
- "loss": 0.4825,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 0.020814986790489154,
196
- "grad_norm": 6.139826774597168,
197
- "learning_rate": 7.97897206306583e-05,
198
- "loss": 0.5136,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 0.021615563205507966,
203
- "grad_norm": 8.262868881225586,
204
- "learning_rate": 7.976264114377922e-05,
205
- "loss": 0.5535,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 0.02241613962052678,
210
- "grad_norm": 8.162726402282715,
211
- "learning_rate": 7.973392717046233e-05,
212
- "loss": 0.4943,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 0.023216716035545593,
217
- "grad_norm": 7.8664870262146,
218
- "learning_rate": 7.97035798910266e-05,
219
- "loss": 0.5262,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 0.024017292450564408,
224
- "grad_norm": 8.503366470336914,
225
- "learning_rate": 7.967160055292984e-05,
226
- "loss": 0.5778,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 0.024017292450564408,
231
- "eval_loss": 0.5316260457038879,
232
- "eval_runtime": 51.9268,
233
- "eval_samples_per_second": 9.629,
234
- "eval_steps_per_second": 9.629,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 0.02481786886558322,
239
- "grad_norm": 7.278728485107422,
240
- "learning_rate": 7.96379904707174e-05,
241
- "loss": 0.4912,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 0.025618445280602035,
246
- "grad_norm": 9.4771089553833,
247
- "learning_rate": 7.960275102596809e-05,
248
- "loss": 0.5316,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.026419021695620847,
253
- "grad_norm": 7.614287376403809,
254
- "learning_rate": 7.956588366723745e-05,
255
- "loss": 0.5514,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 0.02721959811063966,
260
- "grad_norm": 7.349905014038086,
261
- "learning_rate": 7.952738990999824e-05,
262
- "loss": 0.5241,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 0.028020174525658473,
267
- "grad_norm": 8.33869743347168,
268
- "learning_rate": 7.948727133657802e-05,
269
- "loss": 0.5551,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 0.02882075094067729,
274
- "grad_norm": 8.975132942199707,
275
- "learning_rate": 7.94455295960942e-05,
276
- "loss": 0.5094,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 0.0296213273556961,
281
- "grad_norm": 7.986541748046875,
282
- "learning_rate": 7.940216640438628e-05,
283
- "loss": 0.5196,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 0.030421903770714916,
288
- "grad_norm": 6.738289833068848,
289
- "learning_rate": 7.93571835439452e-05,
290
- "loss": 0.5176,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 0.031222480185733727,
295
- "grad_norm": 8.22880744934082,
296
- "learning_rate": 7.931058286384016e-05,
297
- "loss": 0.5096,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 0.03202305660075254,
302
- "grad_norm": 7.914773941040039,
303
- "learning_rate": 7.926236627964262e-05,
304
- "loss": 0.578,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 0.032823633015771354,
309
- "grad_norm": 11.044404029846191,
310
- "learning_rate": 7.92125357733475e-05,
311
- "loss": 0.5168,
312
- "step": 410
313
- },
314
- {
315
- "epoch": 0.03362420943079017,
316
- "grad_norm": 9.762344360351562,
317
- "learning_rate": 7.916109339329173e-05,
318
- "loss": 0.5214,
319
- "step": 420
320
- },
321
- {
322
- "epoch": 0.034424785845808985,
323
- "grad_norm": 8.412607192993164,
324
- "learning_rate": 7.910804125407007e-05,
325
- "loss": 0.5388,
326
- "step": 430
327
- },
328
- {
329
- "epoch": 0.03522536226082779,
330
- "grad_norm": 6.477634429931641,
331
- "learning_rate": 7.905338153644818e-05,
332
- "loss": 0.4985,
333
- "step": 440
334
- },
335
- {
336
- "epoch": 0.03602593867584661,
337
- "grad_norm": 7.517724990844727,
338
- "learning_rate": 7.899711648727294e-05,
339
- "loss": 0.5023,
340
- "step": 450
341
- },
342
- {
343
- "epoch": 0.03602593867584661,
344
- "eval_loss": 0.5241909027099609,
345
- "eval_runtime": 48.1214,
346
- "eval_samples_per_second": 10.39,
347
- "eval_steps_per_second": 10.39,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 0.03682651509086542,
352
- "grad_norm": 8.395613670349121,
353
- "learning_rate": 7.89392484193802e-05,
354
- "loss": 0.5248,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 0.03762709150588424,
359
- "grad_norm": 7.838438510894775,
360
- "learning_rate": 7.887977971149952e-05,
361
- "loss": 0.5335,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 0.03842766792090305,
366
- "grad_norm": 9.784844398498535,
367
- "learning_rate": 7.881871280815659e-05,
368
- "loss": 0.5283,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.03922824433592186,
373
- "grad_norm": 6.710247039794922,
374
- "learning_rate": 7.875605021957262e-05,
375
- "loss": 0.5024,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 0.04002882075094068,
380
- "grad_norm": 6.177467346191406,
381
- "learning_rate": 7.869179452156118e-05,
382
- "loss": 0.5169,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 0.04082939716595949,
387
- "grad_norm": 9.919758796691895,
388
- "learning_rate": 7.862594835542236e-05,
389
- "loss": 0.4898,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 0.04162997358097831,
394
- "grad_norm": 6.045280456542969,
395
- "learning_rate": 7.855851442783414e-05,
396
- "loss": 0.5014,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 0.042430549995997116,
401
- "grad_norm": 10.190174102783203,
402
- "learning_rate": 7.848949551074116e-05,
403
- "loss": 0.5353,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 0.04323112641101593,
408
- "grad_norm": 7.281028747558594,
409
- "learning_rate": 7.841889444124078e-05,
410
- "loss": 0.5321,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 0.044031702826034747,
415
- "grad_norm": 9.220998764038086,
416
- "learning_rate": 7.834671412146643e-05,
417
- "loss": 0.5427,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 0.04483227924105356,
422
- "grad_norm": 9.340047836303711,
423
- "learning_rate": 7.827295751846836e-05,
424
- "loss": 0.5152,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 0.04563285565607237,
429
- "grad_norm": 9.684113502502441,
430
- "learning_rate": 7.819762766409162e-05,
431
- "loss": 0.5007,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 0.046433432071091185,
436
- "grad_norm": 8.104248046875,
437
- "learning_rate": 7.81207276548515e-05,
438
- "loss": 0.5161,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 0.04723400848611,
443
- "grad_norm": 8.265843391418457,
444
- "learning_rate": 7.804226065180615e-05,
445
- "loss": 0.4761,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 0.048034584901128816,
450
- "grad_norm": 7.592025279998779,
451
- "learning_rate": 7.796222988042676e-05,
452
- "loss": 0.5054,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 0.048034584901128816,
457
- "eval_loss": 0.5160062909126282,
458
- "eval_runtime": 53.8725,
459
- "eval_samples_per_second": 9.281,
460
- "eval_steps_per_second": 9.281,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 0.048835161316147624,
465
- "grad_norm": 5.990994453430176,
466
- "learning_rate": 7.788063863046486e-05,
467
- "loss": 0.5454,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 0.04963573773116644,
472
- "grad_norm": 10.633271217346191,
473
- "learning_rate": 7.779749025581717e-05,
474
- "loss": 0.5441,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 0.050436314146185254,
479
- "grad_norm": 7.758908271789551,
480
- "learning_rate": 7.771278817438773e-05,
481
- "loss": 0.4952,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 0.05123689056120407,
486
- "grad_norm": 6.998986721038818,
487
- "learning_rate": 7.762653586794731e-05,
488
- "loss": 0.5136,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.05203746697622288,
493
- "grad_norm": 8.175464630126953,
494
- "learning_rate": 7.753873688199042e-05,
495
- "loss": 0.5448,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 0.05283804339124169,
500
- "grad_norm": 8.736907005310059,
501
- "learning_rate": 7.74493948255895e-05,
502
- "loss": 0.5084,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 0.05363861980626051,
507
- "grad_norm": 7.661477088928223,
508
- "learning_rate": 7.735851337124654e-05,
509
- "loss": 0.5042,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 0.05443919622127932,
514
- "grad_norm": 6.164790630340576,
515
- "learning_rate": 7.726609625474218e-05,
516
- "loss": 0.502,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 0.05523977263629813,
521
- "grad_norm": 8.20608901977539,
522
- "learning_rate": 7.717214727498209e-05,
523
- "loss": 0.5068,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 0.05604034905131695,
528
- "grad_norm": 7.037665843963623,
529
- "learning_rate": 7.707667029384088e-05,
530
- "loss": 0.5195,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 0.05684092546633576,
535
- "grad_norm": 6.408506393432617,
536
- "learning_rate": 7.697966923600327e-05,
537
- "loss": 0.58,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 0.05764150188135458,
542
- "grad_norm": 7.004055023193359,
543
- "learning_rate": 7.688114808880283e-05,
544
- "loss": 0.5103,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 0.058442078296373386,
549
- "grad_norm": 8.525979995727539,
550
- "learning_rate": 7.678111090205804e-05,
551
- "loss": 0.5181,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 0.0592426547113922,
556
- "grad_norm": 9.37247371673584,
557
- "learning_rate": 7.667956178790582e-05,
558
- "loss": 0.4865,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 0.060043231126411016,
563
- "grad_norm": 9.634757995605469,
564
- "learning_rate": 7.65765049206325e-05,
565
- "loss": 0.4985,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 0.060043231126411016,
570
- "eval_loss": 0.5135068893432617,
571
- "eval_runtime": 48.1272,
572
- "eval_samples_per_second": 10.389,
573
- "eval_steps_per_second": 10.389,
574
- "step": 750
575
- },
576
- {
577
- "epoch": 0.06084380754142983,
578
- "grad_norm": 8.048176765441895,
579
- "learning_rate": 7.647194453650228e-05,
580
- "loss": 0.5367,
581
- "step": 760
582
- },
583
- {
584
- "epoch": 0.061644383956448646,
585
- "grad_norm": 8.86915111541748,
586
- "learning_rate": 7.6365884933583e-05,
587
- "loss": 0.5433,
588
- "step": 770
589
- },
590
- {
591
- "epoch": 0.062444960371467455,
592
- "grad_norm": 8.023998260498047,
593
- "learning_rate": 7.625833047156953e-05,
594
- "loss": 0.5164,
595
- "step": 780
596
- },
597
- {
598
- "epoch": 0.06324553678648627,
599
- "grad_norm": 7.905406475067139,
600
- "learning_rate": 7.614928557160454e-05,
601
- "loss": 0.5121,
602
- "step": 790
603
- },
604
- {
605
- "epoch": 0.06404611320150508,
606
- "grad_norm": 11.182598114013672,
607
- "learning_rate": 7.603875471609677e-05,
608
- "loss": 0.5484,
609
- "step": 800
610
- },
611
- {
612
- "epoch": 0.0648466896165239,
613
- "grad_norm": 9.298077583312988,
614
- "learning_rate": 7.592674244853676e-05,
615
- "loss": 0.5282,
616
- "step": 810
617
- },
618
- {
619
- "epoch": 0.06564726603154271,
620
- "grad_norm": 6.5173773765563965,
621
- "learning_rate": 7.581325337331013e-05,
622
- "loss": 0.5331,
623
- "step": 820
624
- },
625
- {
626
- "epoch": 0.06644784244656153,
627
- "grad_norm": 6.437970161437988,
628
- "learning_rate": 7.569829215550825e-05,
629
- "loss": 0.487,
630
- "step": 830
631
- },
632
- {
633
- "epoch": 0.06724841886158034,
634
- "grad_norm": 7.935225486755371,
635
- "learning_rate": 7.558186352073648e-05,
636
- "loss": 0.5303,
637
- "step": 840
638
- },
639
- {
640
- "epoch": 0.06804899527659915,
641
- "grad_norm": 7.8669633865356445,
642
- "learning_rate": 7.546397225492001e-05,
643
- "loss": 0.5428,
644
- "step": 850
645
- },
646
- {
647
- "epoch": 0.06884957169161797,
648
- "grad_norm": 7.498812675476074,
649
- "learning_rate": 7.534462320410702e-05,
650
- "loss": 0.4996,
651
- "step": 860
652
- },
653
- {
654
- "epoch": 0.06965014810663678,
655
- "grad_norm": 8.47286605834961,
656
- "learning_rate": 7.522382127426952e-05,
657
- "loss": 0.4899,
658
- "step": 870
659
- },
660
- {
661
- "epoch": 0.07045072452165559,
662
- "grad_norm": 8.583860397338867,
663
- "learning_rate": 7.510157143110172e-05,
664
- "loss": 0.5122,
665
- "step": 880
666
- },
667
- {
668
- "epoch": 0.07125130093667441,
669
- "grad_norm": 8.282376289367676,
670
- "learning_rate": 7.497787869981583e-05,
671
- "loss": 0.4667,
672
- "step": 890
673
- },
674
- {
675
- "epoch": 0.07205187735169322,
676
- "grad_norm": 6.540297508239746,
677
- "learning_rate": 7.485274816493558e-05,
678
- "loss": 0.5061,
679
- "step": 900
680
- },
681
- {
682
- "epoch": 0.07205187735169322,
683
- "eval_loss": 0.5061885118484497,
684
- "eval_runtime": 52.6411,
685
- "eval_samples_per_second": 9.498,
686
- "eval_steps_per_second": 9.498,
687
- "step": 900
688
  }
689
  ],
690
  "logging_steps": 10,
@@ -704,8 +139,8 @@
704
  "attributes": {}
705
  }
706
  },
707
- "total_flos": 9.25220521365504e+16,
708
- "train_batch_size": 4,
709
  "trial_name": null,
710
  "trial_params": null
711
  }
 
1
  {
2
+ "best_metric": 0.5219287276268005,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 0.006004563468235859,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.00040030423121572396,
13
+ "grad_norm": 19.109567642211914,
14
  "learning_rate": 8.000000000000001e-06,
15
+ "loss": 0.4779,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.0008006084624314479,
20
+ "grad_norm": 10.358642578125,
21
  "learning_rate": 1.6000000000000003e-05,
22
+ "loss": 0.5845,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.0012009126936471718,
27
+ "grad_norm": 11.2018404006958,
28
  "learning_rate": 2.4e-05,
29
+ "loss": 0.5179,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.0016012169248628958,
34
+ "grad_norm": 15.831893920898438,
35
  "learning_rate": 3.2000000000000005e-05,
36
+ "loss": 0.4929,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.0020015211560786197,
41
+ "grad_norm": 13.353575706481934,
42
  "learning_rate": 4e-05,
43
+ "loss": 0.44,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.0024018253872943435,
48
+ "grad_norm": 16.60424041748047,
49
  "learning_rate": 4.8e-05,
50
+ "loss": 0.4523,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.002802129618510068,
55
+ "grad_norm": 6.861016273498535,
56
  "learning_rate": 5.6e-05,
57
+ "loss": 0.5108,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.0032024338497257917,
62
+ "grad_norm": 7.793421268463135,
63
  "learning_rate": 6.400000000000001e-05,
64
+ "loss": 0.4571,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.0036027380809415155,
69
+ "grad_norm": 13.897448539733887,
70
  "learning_rate": 7.2e-05,
71
+ "loss": 0.5071,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.004003042312157239,
76
+ "grad_norm": 12.516758918762207,
77
  "learning_rate": 8e-05,
78
+ "loss": 0.5128,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.004403346543372964,
83
+ "grad_norm": 13.91842269897461,
84
  "learning_rate": 7.999917787833465e-05,
85
+ "loss": 0.4579,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.004803650774588687,
90
+ "grad_norm": 13.659226417541504,
91
  "learning_rate": 7.999671154713278e-05,
92
+ "loss": 0.5425,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.005203955005804411,
97
+ "grad_norm": 6.929769039154053,
98
  "learning_rate": 7.99926011077756e-05,
99
+ "loss": 0.5318,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.005604259237020136,
104
+ "grad_norm": 14.516855239868164,
105
  "learning_rate": 7.99868467292272e-05,
106
+ "loss": 0.6059,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.006004563468235859,
111
+ "grad_norm": 11.343602180480957,
112
  "learning_rate": 7.997944864802752e-05,
113
+ "loss": 0.5262,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.006004563468235859,
118
+ "eval_loss": 0.5219287276268005,
119
+ "eval_runtime": 52.4269,
120
+ "eval_samples_per_second": 9.537,
121
+ "eval_steps_per_second": 9.537,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 6573404135940096.0,
143
+ "train_batch_size": 2,
144
  "trial_name": null,
145
  "trial_params": null
146
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e774682193ad432810f3383530fec82e083d08f8f27130e1364d9e773b4cbea5
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0220a6fb174e4d99167eee5832c42de5c71879ba1f6fffd1d08d2f2f173cd453
3
  size 5496