efraimdahl commited on
Commit
eae8ab6
·
verified ·
1 Parent(s): f46cc01

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -7,7 +7,7 @@
7
  "grid_size": 48,
8
  "intermediate_dim": 3072,
9
  "nhead": 4,
10
- "nlayer": 2,
11
  "ntarget": 1,
12
  "torch_dtype": "float32",
13
  "transformers_version": "4.52.4"
 
7
  "grid_size": 48,
8
  "intermediate_dim": 3072,
9
  "nhead": 4,
10
+ "nlayer": 4,
11
  "ntarget": 1,
12
  "torch_dtype": "float32",
13
  "transformers_version": "4.52.4"
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:593f45feb57b6cf6b72deb4c2b645da0e09ef0560bb117ead2f8e48a01ba523c
3
- size 13132
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0de6c980717d1de9d5f5025dc7b99f9e288680030e17a953d05f9f168bf9b51f
3
+ size 113571648
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a83b182f0c5a15cc7cc60e05979466978862b18bd2943c1c557801b2452f1335
3
- size 29970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7594e1c89f2ef9e9f95d4a3bba880189c707c44cc11259ce7368ca28f4b59e8
3
+ size 227177722
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d6dbaea400350134c87609f2a6e7d568a19190ab54bccf06fc0c89f270eb6d2
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:097be1815f079414e8ed991ad49c6cfd1743fb95655b62a7bef9de98ee947e32
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5815cf5dbd0e8e9b0a43084f1040b0b93a4a2d0c9a55323787cef2a001436293
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b15aa4812dcef7f4211bf5dd0a9f6e03dda77ee314ae3cfe7bc5f3ef8d762a87
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,883 +1,28 @@
1
  {
2
- "best_global_step": 58,
3
- "best_metric": 0.32613325119018555,
4
- "best_model_checkpoint": "./results/checkpoint-58",
5
- "epoch": 58.0,
6
  "eval_steps": 500,
7
- "global_step": 58,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
- "grad_norm": 0.838614821434021,
15
  "learning_rate": 1e-05,
16
- "loss": 0.3287,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 1.0,
21
- "eval_loss": 0.328641802072525,
22
- "eval_runtime": 0.0155,
23
- "eval_samples_per_second": 644.821,
24
- "eval_steps_per_second": 128.964,
25
  "step": 1
26
- },
27
- {
28
- "epoch": 2.0,
29
- "grad_norm": 0.8384326100349426,
30
- "learning_rate": 9.9e-06,
31
- "loss": 0.3286,
32
- "step": 2
33
- },
34
- {
35
- "epoch": 2.0,
36
- "eval_loss": 0.3285800814628601,
37
- "eval_runtime": 0.0447,
38
- "eval_samples_per_second": 223.596,
39
- "eval_steps_per_second": 44.719,
40
- "step": 2
41
- },
42
- {
43
- "epoch": 3.0,
44
- "grad_norm": 0.8382521867752075,
45
- "learning_rate": 9.800000000000001e-06,
46
- "loss": 0.3286,
47
- "step": 3
48
- },
49
- {
50
- "epoch": 3.0,
51
- "eval_loss": 0.3285190165042877,
52
- "eval_runtime": 0.0147,
53
- "eval_samples_per_second": 681.358,
54
- "eval_steps_per_second": 136.272,
55
- "step": 3
56
- },
57
- {
58
- "epoch": 4.0,
59
- "grad_norm": 0.8380736112594604,
60
- "learning_rate": 9.7e-06,
61
- "loss": 0.3285,
62
- "step": 4
63
- },
64
- {
65
- "epoch": 4.0,
66
- "eval_loss": 0.3284585475921631,
67
- "eval_runtime": 0.015,
68
- "eval_samples_per_second": 664.929,
69
- "eval_steps_per_second": 132.986,
70
- "step": 4
71
- },
72
- {
73
- "epoch": 5.0,
74
- "grad_norm": 0.8378969430923462,
75
- "learning_rate": 9.600000000000001e-06,
76
- "loss": 0.3285,
77
- "step": 5
78
- },
79
- {
80
- "epoch": 5.0,
81
- "eval_loss": 0.32839876413345337,
82
- "eval_runtime": 0.0146,
83
- "eval_samples_per_second": 682.967,
84
- "eval_steps_per_second": 136.593,
85
- "step": 5
86
- },
87
- {
88
- "epoch": 6.0,
89
- "grad_norm": 0.8377220034599304,
90
- "learning_rate": 9.5e-06,
91
- "loss": 0.3284,
92
- "step": 6
93
- },
94
- {
95
- "epoch": 6.0,
96
- "eval_loss": 0.3283396065235138,
97
- "eval_runtime": 0.0171,
98
- "eval_samples_per_second": 584.458,
99
- "eval_steps_per_second": 116.892,
100
- "step": 6
101
- },
102
- {
103
- "epoch": 7.0,
104
- "grad_norm": 0.8375489711761475,
105
- "learning_rate": 9.4e-06,
106
- "loss": 0.3283,
107
- "step": 7
108
- },
109
- {
110
- "epoch": 7.0,
111
- "eval_loss": 0.328281044960022,
112
- "eval_runtime": 0.0139,
113
- "eval_samples_per_second": 719.089,
114
- "eval_steps_per_second": 143.818,
115
- "step": 7
116
- },
117
- {
118
- "epoch": 8.0,
119
- "grad_norm": 0.8373778462409973,
120
- "learning_rate": 9.3e-06,
121
- "loss": 0.3283,
122
- "step": 8
123
- },
124
- {
125
- "epoch": 8.0,
126
- "eval_loss": 0.3282231390476227,
127
- "eval_runtime": 0.0142,
128
- "eval_samples_per_second": 705.138,
129
- "eval_steps_per_second": 141.028,
130
- "step": 8
131
- },
132
- {
133
- "epoch": 9.0,
134
- "grad_norm": 0.8372084498405457,
135
- "learning_rate": 9.200000000000002e-06,
136
- "loss": 0.3282,
137
- "step": 9
138
- },
139
- {
140
- "epoch": 9.0,
141
- "eval_loss": 0.32816585898399353,
142
- "eval_runtime": 0.0144,
143
- "eval_samples_per_second": 695.135,
144
- "eval_steps_per_second": 139.027,
145
- "step": 9
146
- },
147
- {
148
- "epoch": 10.0,
149
- "grad_norm": 0.837040901184082,
150
- "learning_rate": 9.100000000000001e-06,
151
- "loss": 0.3282,
152
- "step": 10
153
- },
154
- {
155
- "epoch": 10.0,
156
- "eval_loss": 0.328109472990036,
157
- "eval_runtime": 0.0164,
158
- "eval_samples_per_second": 609.921,
159
- "eval_steps_per_second": 121.984,
160
- "step": 10
161
- },
162
- {
163
- "epoch": 11.0,
164
- "grad_norm": 0.8362537622451782,
165
- "learning_rate": 9e-06,
166
- "loss": 0.3281,
167
- "step": 11
168
- },
169
- {
170
- "epoch": 11.0,
171
- "eval_loss": 0.3280538320541382,
172
- "eval_runtime": 0.0139,
173
- "eval_samples_per_second": 720.72,
174
- "eval_steps_per_second": 144.144,
175
- "step": 11
176
- },
177
- {
178
- "epoch": 12.0,
179
- "grad_norm": 0.8360907435417175,
180
- "learning_rate": 8.900000000000001e-06,
181
- "loss": 0.3281,
182
- "step": 12
183
- },
184
- {
185
- "epoch": 12.0,
186
- "eval_loss": 0.3279987871646881,
187
- "eval_runtime": 0.0164,
188
- "eval_samples_per_second": 611.219,
189
- "eval_steps_per_second": 122.244,
190
- "step": 12
191
- },
192
- {
193
- "epoch": 13.0,
194
- "grad_norm": 0.8359295129776001,
195
- "learning_rate": 8.8e-06,
196
- "loss": 0.328,
197
- "step": 13
198
- },
199
- {
200
- "epoch": 13.0,
201
- "eval_loss": 0.32794439792633057,
202
- "eval_runtime": 0.0145,
203
- "eval_samples_per_second": 689.977,
204
- "eval_steps_per_second": 137.995,
205
- "step": 13
206
- },
207
- {
208
- "epoch": 14.0,
209
- "grad_norm": 0.8357701301574707,
210
- "learning_rate": 8.700000000000001e-06,
211
- "loss": 0.3279,
212
- "step": 14
213
- },
214
- {
215
- "epoch": 14.0,
216
- "eval_loss": 0.3278906047344208,
217
- "eval_runtime": 0.0146,
218
- "eval_samples_per_second": 686.398,
219
- "eval_steps_per_second": 137.28,
220
- "step": 14
221
- },
222
- {
223
- "epoch": 15.0,
224
- "grad_norm": 0.8356127142906189,
225
- "learning_rate": 8.6e-06,
226
- "loss": 0.3279,
227
- "step": 15
228
- },
229
- {
230
- "epoch": 15.0,
231
- "eval_loss": 0.3278374969959259,
232
- "eval_runtime": 0.0164,
233
- "eval_samples_per_second": 610.347,
234
- "eval_steps_per_second": 122.069,
235
- "step": 15
236
- },
237
- {
238
- "epoch": 16.0,
239
- "grad_norm": 0.8354570269584656,
240
- "learning_rate": 8.5e-06,
241
- "loss": 0.3278,
242
- "step": 16
243
- },
244
- {
245
- "epoch": 16.0,
246
- "eval_loss": 0.3277849853038788,
247
- "eval_runtime": 0.0141,
248
- "eval_samples_per_second": 708.402,
249
- "eval_steps_per_second": 141.68,
250
- "step": 16
251
- },
252
- {
253
- "epoch": 17.0,
254
- "grad_norm": 0.8353032469749451,
255
- "learning_rate": 8.400000000000001e-06,
256
- "loss": 0.3278,
257
- "step": 17
258
- },
259
- {
260
- "epoch": 17.0,
261
- "eval_loss": 0.3277330994606018,
262
- "eval_runtime": 0.0172,
263
- "eval_samples_per_second": 581.516,
264
- "eval_steps_per_second": 116.303,
265
- "step": 17
266
- },
267
- {
268
- "epoch": 18.0,
269
- "grad_norm": 0.8351512551307678,
270
- "learning_rate": 8.3e-06,
271
- "loss": 0.3277,
272
- "step": 18
273
- },
274
- {
275
- "epoch": 18.0,
276
- "eval_loss": 0.32768189907073975,
277
- "eval_runtime": 0.0189,
278
- "eval_samples_per_second": 529.918,
279
- "eval_steps_per_second": 105.984,
280
- "step": 18
281
- },
282
- {
283
- "epoch": 19.0,
284
- "grad_norm": 0.8350011706352234,
285
- "learning_rate": 8.2e-06,
286
- "loss": 0.3277,
287
- "step": 19
288
- },
289
- {
290
- "epoch": 19.0,
291
- "eval_loss": 0.3276313245296478,
292
- "eval_runtime": 0.0147,
293
- "eval_samples_per_second": 679.691,
294
- "eval_steps_per_second": 135.938,
295
- "step": 19
296
- },
297
- {
298
- "epoch": 20.0,
299
- "grad_norm": 0.8348528742790222,
300
- "learning_rate": 8.1e-06,
301
- "loss": 0.3276,
302
- "step": 20
303
- },
304
- {
305
- "epoch": 20.0,
306
- "eval_loss": 0.32758134603500366,
307
- "eval_runtime": 0.0141,
308
- "eval_samples_per_second": 707.672,
309
- "eval_steps_per_second": 141.534,
310
- "step": 20
311
- },
312
- {
313
- "epoch": 21.0,
314
- "grad_norm": 0.8347064852714539,
315
- "learning_rate": 8.000000000000001e-06,
316
- "loss": 0.3276,
317
- "step": 21
318
- },
319
- {
320
- "epoch": 21.0,
321
- "eval_loss": 0.32753199338912964,
322
- "eval_runtime": 0.0192,
323
- "eval_samples_per_second": 522.167,
324
- "eval_steps_per_second": 104.433,
325
- "step": 21
326
- },
327
- {
328
- "epoch": 22.0,
329
- "grad_norm": 0.834561824798584,
330
- "learning_rate": 7.9e-06,
331
- "loss": 0.3275,
332
- "step": 22
333
- },
334
- {
335
- "epoch": 22.0,
336
- "eval_loss": 0.3274833559989929,
337
- "eval_runtime": 0.0135,
338
- "eval_samples_per_second": 738.109,
339
- "eval_steps_per_second": 147.622,
340
- "step": 22
341
- },
342
- {
343
- "epoch": 23.0,
344
- "grad_norm": 0.8344190716743469,
345
- "learning_rate": 7.800000000000002e-06,
346
- "loss": 0.3275,
347
- "step": 23
348
- },
349
- {
350
- "epoch": 23.0,
351
- "eval_loss": 0.3274352252483368,
352
- "eval_runtime": 0.0144,
353
- "eval_samples_per_second": 694.812,
354
- "eval_steps_per_second": 138.962,
355
- "step": 23
356
- },
357
- {
358
- "epoch": 24.0,
359
- "grad_norm": 0.8342781662940979,
360
- "learning_rate": 7.7e-06,
361
- "loss": 0.3274,
362
- "step": 24
363
- },
364
- {
365
- "epoch": 24.0,
366
- "eval_loss": 0.32738780975341797,
367
- "eval_runtime": 0.0141,
368
- "eval_samples_per_second": 710.779,
369
- "eval_steps_per_second": 142.156,
370
- "step": 24
371
- },
372
- {
373
- "epoch": 25.0,
374
- "grad_norm": 0.8341390490531921,
375
- "learning_rate": 7.600000000000001e-06,
376
- "loss": 0.3274,
377
- "step": 25
378
- },
379
- {
380
- "epoch": 25.0,
381
- "eval_loss": 0.3273409605026245,
382
- "eval_runtime": 0.0141,
383
- "eval_samples_per_second": 709.072,
384
- "eval_steps_per_second": 141.814,
385
- "step": 25
386
- },
387
- {
388
- "epoch": 26.0,
389
- "grad_norm": 0.8340017795562744,
390
- "learning_rate": 7.500000000000001e-06,
391
- "loss": 0.3273,
392
- "step": 26
393
- },
394
- {
395
- "epoch": 26.0,
396
- "eval_loss": 0.32729482650756836,
397
- "eval_runtime": 0.0158,
398
- "eval_samples_per_second": 634.194,
399
- "eval_steps_per_second": 126.839,
400
- "step": 26
401
- },
402
- {
403
- "epoch": 27.0,
404
- "grad_norm": 0.8338663578033447,
405
- "learning_rate": 7.4e-06,
406
- "loss": 0.3273,
407
- "step": 27
408
- },
409
- {
410
- "epoch": 27.0,
411
- "eval_loss": 0.3272492289543152,
412
- "eval_runtime": 0.0137,
413
- "eval_samples_per_second": 729.495,
414
- "eval_steps_per_second": 145.899,
415
- "step": 27
416
- },
417
- {
418
- "epoch": 28.0,
419
- "grad_norm": 0.8337326645851135,
420
- "learning_rate": 7.3e-06,
421
- "loss": 0.3272,
422
- "step": 28
423
- },
424
- {
425
- "epoch": 28.0,
426
- "eval_loss": 0.32720428705215454,
427
- "eval_runtime": 0.0156,
428
- "eval_samples_per_second": 639.834,
429
- "eval_steps_per_second": 127.967,
430
- "step": 28
431
- },
432
- {
433
- "epoch": 29.0,
434
- "grad_norm": 0.8336009383201599,
435
- "learning_rate": 7.2000000000000005e-06,
436
- "loss": 0.3272,
437
- "step": 29
438
- },
439
- {
440
- "epoch": 29.0,
441
- "eval_loss": 0.32715997099876404,
442
- "eval_runtime": 0.0135,
443
- "eval_samples_per_second": 741.24,
444
- "eval_steps_per_second": 148.248,
445
- "step": 29
446
- },
447
- {
448
- "epoch": 30.0,
449
- "grad_norm": 0.8334709405899048,
450
- "learning_rate": 7.100000000000001e-06,
451
- "loss": 0.3272,
452
- "step": 30
453
- },
454
- {
455
- "epoch": 30.0,
456
- "eval_loss": 0.3271161913871765,
457
- "eval_runtime": 0.0157,
458
- "eval_samples_per_second": 638.908,
459
- "eval_steps_per_second": 127.782,
460
- "step": 30
461
- },
462
- {
463
- "epoch": 31.0,
464
- "grad_norm": 0.834907054901123,
465
- "learning_rate": 7e-06,
466
- "loss": 0.3271,
467
- "step": 31
468
- },
469
- {
470
- "epoch": 31.0,
471
- "eval_loss": 0.32707276940345764,
472
- "eval_runtime": 0.0147,
473
- "eval_samples_per_second": 678.679,
474
- "eval_steps_per_second": 135.736,
475
- "step": 31
476
- },
477
- {
478
- "epoch": 32.0,
479
- "grad_norm": 0.8347804546356201,
480
- "learning_rate": 6.9e-06,
481
- "loss": 0.3271,
482
- "step": 32
483
- },
484
- {
485
- "epoch": 32.0,
486
- "eval_loss": 0.32702988386154175,
487
- "eval_runtime": 0.0137,
488
- "eval_samples_per_second": 731.48,
489
- "eval_steps_per_second": 146.296,
490
- "step": 32
491
- },
492
- {
493
- "epoch": 33.0,
494
- "grad_norm": 0.8346555829048157,
495
- "learning_rate": 6.800000000000001e-06,
496
- "loss": 0.327,
497
- "step": 33
498
- },
499
- {
500
- "epoch": 33.0,
501
- "eval_loss": 0.326987624168396,
502
- "eval_runtime": 0.0154,
503
- "eval_samples_per_second": 647.809,
504
- "eval_steps_per_second": 129.562,
505
- "step": 33
506
- },
507
- {
508
- "epoch": 34.0,
509
- "grad_norm": 0.8345323801040649,
510
- "learning_rate": 6.700000000000001e-06,
511
- "loss": 0.327,
512
- "step": 34
513
- },
514
- {
515
- "epoch": 34.0,
516
- "eval_loss": 0.326945960521698,
517
- "eval_runtime": 0.015,
518
- "eval_samples_per_second": 668.532,
519
- "eval_steps_per_second": 133.706,
520
- "step": 34
521
- },
522
- {
523
- "epoch": 35.0,
524
- "grad_norm": 0.8344109654426575,
525
- "learning_rate": 6.600000000000001e-06,
526
- "loss": 0.3269,
527
- "step": 35
528
- },
529
- {
530
- "epoch": 35.0,
531
- "eval_loss": 0.32690495252609253,
532
- "eval_runtime": 0.0141,
533
- "eval_samples_per_second": 711.152,
534
- "eval_steps_per_second": 142.23,
535
- "step": 35
536
- },
537
- {
538
- "epoch": 36.0,
539
- "grad_norm": 0.8342913389205933,
540
- "learning_rate": 6.5000000000000004e-06,
541
- "loss": 0.3269,
542
- "step": 36
543
- },
544
- {
545
- "epoch": 36.0,
546
- "eval_loss": 0.3268645405769348,
547
- "eval_runtime": 0.0148,
548
- "eval_samples_per_second": 676.653,
549
- "eval_steps_per_second": 135.331,
550
- "step": 36
551
- },
552
- {
553
- "epoch": 37.0,
554
- "grad_norm": 0.8341735005378723,
555
- "learning_rate": 6.4000000000000006e-06,
556
- "loss": 0.3269,
557
- "step": 37
558
- },
559
- {
560
- "epoch": 37.0,
561
- "eval_loss": 0.32682472467422485,
562
- "eval_runtime": 0.0184,
563
- "eval_samples_per_second": 544.856,
564
- "eval_steps_per_second": 108.971,
565
- "step": 37
566
- },
567
- {
568
- "epoch": 38.0,
569
- "grad_norm": 0.8340575098991394,
570
- "learning_rate": 6.300000000000001e-06,
571
- "loss": 0.3268,
572
- "step": 38
573
- },
574
- {
575
- "epoch": 38.0,
576
- "eval_loss": 0.32678553462028503,
577
- "eval_runtime": 0.0135,
578
- "eval_samples_per_second": 738.577,
579
- "eval_steps_per_second": 147.715,
580
- "step": 38
581
- },
582
- {
583
- "epoch": 39.0,
584
- "grad_norm": 0.8339433073997498,
585
- "learning_rate": 6.200000000000001e-06,
586
- "loss": 0.3268,
587
- "step": 39
588
- },
589
- {
590
- "epoch": 39.0,
591
- "eval_loss": 0.32674694061279297,
592
- "eval_runtime": 0.0139,
593
- "eval_samples_per_second": 720.337,
594
- "eval_steps_per_second": 144.067,
595
- "step": 39
596
- },
597
- {
598
- "epoch": 40.0,
599
- "grad_norm": 0.8338308334350586,
600
- "learning_rate": 6.1e-06,
601
- "loss": 0.3267,
602
- "step": 40
603
- },
604
- {
605
- "epoch": 40.0,
606
- "eval_loss": 0.3267090320587158,
607
- "eval_runtime": 0.0146,
608
- "eval_samples_per_second": 685.064,
609
- "eval_steps_per_second": 137.013,
610
- "step": 40
611
- },
612
- {
613
- "epoch": 41.0,
614
- "grad_norm": 0.8337202668190002,
615
- "learning_rate": 6e-06,
616
- "loss": 0.3267,
617
- "step": 41
618
- },
619
- {
620
- "epoch": 41.0,
621
- "eval_loss": 0.32667168974876404,
622
- "eval_runtime": 0.0137,
623
- "eval_samples_per_second": 729.584,
624
- "eval_steps_per_second": 145.917,
625
- "step": 41
626
- },
627
- {
628
- "epoch": 42.0,
629
- "grad_norm": 0.8336114883422852,
630
- "learning_rate": 5.9e-06,
631
- "loss": 0.3267,
632
- "step": 42
633
- },
634
- {
635
- "epoch": 42.0,
636
- "eval_loss": 0.32663506269454956,
637
- "eval_runtime": 0.0139,
638
- "eval_samples_per_second": 721.154,
639
- "eval_steps_per_second": 144.231,
640
- "step": 42
641
- },
642
- {
643
- "epoch": 43.0,
644
- "grad_norm": 0.8335044384002686,
645
- "learning_rate": 5.8e-06,
646
- "loss": 0.3266,
647
- "step": 43
648
- },
649
- {
650
- "epoch": 43.0,
651
- "eval_loss": 0.32659897208213806,
652
- "eval_runtime": 0.0163,
653
- "eval_samples_per_second": 613.75,
654
- "eval_steps_per_second": 122.75,
655
- "step": 43
656
- },
657
- {
658
- "epoch": 44.0,
659
- "grad_norm": 0.8333994150161743,
660
- "learning_rate": 5.7e-06,
661
- "loss": 0.3266,
662
- "step": 44
663
- },
664
- {
665
- "epoch": 44.0,
666
- "eval_loss": 0.3265635371208191,
667
- "eval_runtime": 0.0137,
668
- "eval_samples_per_second": 730.842,
669
- "eval_steps_per_second": 146.168,
670
- "step": 44
671
- },
672
- {
673
- "epoch": 45.0,
674
- "grad_norm": 0.8332960605621338,
675
- "learning_rate": 5.600000000000001e-06,
676
- "loss": 0.3266,
677
- "step": 45
678
- },
679
- {
680
- "epoch": 45.0,
681
- "eval_loss": 0.32652872800827026,
682
- "eval_runtime": 0.015,
683
- "eval_samples_per_second": 667.819,
684
- "eval_steps_per_second": 133.564,
685
- "step": 45
686
- },
687
- {
688
- "epoch": 46.0,
689
- "grad_norm": 0.8331945538520813,
690
- "learning_rate": 5.500000000000001e-06,
691
- "loss": 0.3265,
692
- "step": 46
693
- },
694
- {
695
- "epoch": 46.0,
696
- "eval_loss": 0.3264945149421692,
697
- "eval_runtime": 0.0148,
698
- "eval_samples_per_second": 674.141,
699
- "eval_steps_per_second": 134.828,
700
- "step": 46
701
- },
702
- {
703
- "epoch": 47.0,
704
- "grad_norm": 0.8330948948860168,
705
- "learning_rate": 5.400000000000001e-06,
706
- "loss": 0.3265,
707
- "step": 47
708
- },
709
- {
710
- "epoch": 47.0,
711
- "eval_loss": 0.32646098732948303,
712
- "eval_runtime": 0.0147,
713
- "eval_samples_per_second": 678.086,
714
- "eval_steps_per_second": 135.617,
715
- "step": 47
716
- },
717
- {
718
- "epoch": 48.0,
719
- "grad_norm": 0.8329970240592957,
720
- "learning_rate": 5.300000000000001e-06,
721
- "loss": 0.3265,
722
- "step": 48
723
- },
724
- {
725
- "epoch": 48.0,
726
- "eval_loss": 0.32642805576324463,
727
- "eval_runtime": 0.0151,
728
- "eval_samples_per_second": 663.53,
729
- "eval_steps_per_second": 132.706,
730
- "step": 48
731
- },
732
- {
733
- "epoch": 49.0,
734
- "grad_norm": 0.8329010009765625,
735
- "learning_rate": 5.2e-06,
736
- "loss": 0.3264,
737
- "step": 49
738
- },
739
- {
740
- "epoch": 49.0,
741
- "eval_loss": 0.32639575004577637,
742
- "eval_runtime": 0.0141,
743
- "eval_samples_per_second": 707.1,
744
- "eval_steps_per_second": 141.42,
745
- "step": 49
746
- },
747
- {
748
- "epoch": 50.0,
749
- "grad_norm": 0.8328068852424622,
750
- "learning_rate": 5.1e-06,
751
- "loss": 0.3264,
752
- "step": 50
753
- },
754
- {
755
- "epoch": 50.0,
756
- "eval_loss": 0.32636409997940063,
757
- "eval_runtime": 0.0154,
758
- "eval_samples_per_second": 650.905,
759
- "eval_steps_per_second": 130.181,
760
- "step": 50
761
- },
762
- {
763
- "epoch": 51.0,
764
- "grad_norm": 0.8327144384384155,
765
- "learning_rate": 5e-06,
766
- "loss": 0.3264,
767
- "step": 51
768
- },
769
- {
770
- "epoch": 51.0,
771
- "eval_loss": 0.3263329863548279,
772
- "eval_runtime": 0.0156,
773
- "eval_samples_per_second": 639.142,
774
- "eval_steps_per_second": 127.828,
775
- "step": 51
776
- },
777
- {
778
- "epoch": 52.0,
779
- "grad_norm": 0.8326238989830017,
780
- "learning_rate": 4.9000000000000005e-06,
781
- "loss": 0.3263,
782
- "step": 52
783
- },
784
- {
785
- "epoch": 52.0,
786
- "eval_loss": 0.32630258798599243,
787
- "eval_runtime": 0.0155,
788
- "eval_samples_per_second": 645.069,
789
- "eval_steps_per_second": 129.014,
790
- "step": 52
791
- },
792
- {
793
- "epoch": 53.0,
794
- "grad_norm": 0.8325351476669312,
795
- "learning_rate": 4.800000000000001e-06,
796
- "loss": 0.3263,
797
- "step": 53
798
- },
799
- {
800
- "epoch": 53.0,
801
- "eval_loss": 0.32627278566360474,
802
- "eval_runtime": 0.0138,
803
- "eval_samples_per_second": 724.205,
804
- "eval_steps_per_second": 144.841,
805
- "step": 53
806
- },
807
- {
808
- "epoch": 54.0,
809
- "grad_norm": 0.8324483036994934,
810
- "learning_rate": 4.7e-06,
811
- "loss": 0.3263,
812
- "step": 54
813
- },
814
- {
815
- "epoch": 54.0,
816
- "eval_loss": 0.32624363899230957,
817
- "eval_runtime": 0.0156,
818
- "eval_samples_per_second": 642.933,
819
- "eval_steps_per_second": 128.587,
820
- "step": 54
821
- },
822
- {
823
- "epoch": 55.0,
824
- "grad_norm": 0.8323632478713989,
825
- "learning_rate": 4.600000000000001e-06,
826
- "loss": 0.3262,
827
- "step": 55
828
- },
829
- {
830
- "epoch": 55.0,
831
- "eval_loss": 0.32621514797210693,
832
- "eval_runtime": 0.0137,
833
- "eval_samples_per_second": 728.203,
834
- "eval_steps_per_second": 145.641,
835
- "step": 55
836
- },
837
- {
838
- "epoch": 56.0,
839
- "grad_norm": 0.8322799205780029,
840
- "learning_rate": 4.5e-06,
841
- "loss": 0.3262,
842
- "step": 56
843
- },
844
- {
845
- "epoch": 56.0,
846
- "eval_loss": 0.3261871933937073,
847
- "eval_runtime": 0.0148,
848
- "eval_samples_per_second": 674.466,
849
- "eval_steps_per_second": 134.893,
850
- "step": 56
851
- },
852
- {
853
- "epoch": 57.0,
854
- "grad_norm": 0.8321985602378845,
855
- "learning_rate": 4.4e-06,
856
- "loss": 0.3262,
857
- "step": 57
858
- },
859
- {
860
- "epoch": 57.0,
861
- "eval_loss": 0.32615989446640015,
862
- "eval_runtime": 0.0139,
863
- "eval_samples_per_second": 718.24,
864
- "eval_steps_per_second": 143.648,
865
- "step": 57
866
- },
867
- {
868
- "epoch": 58.0,
869
- "grad_norm": 0.8321189880371094,
870
- "learning_rate": 4.3e-06,
871
- "loss": 0.3262,
872
- "step": 58
873
- },
874
- {
875
- "epoch": 58.0,
876
- "eval_loss": 0.32613325119018555,
877
- "eval_runtime": 0.015,
878
- "eval_samples_per_second": 664.865,
879
- "eval_steps_per_second": 132.973,
880
- "step": 58
881
  }
882
  ],
883
  "logging_steps": 1,
 
1
  {
2
+ "best_global_step": 1,
3
+ "best_metric": 0.4375569820404053,
4
+ "best_model_checkpoint": "./results/checkpoint-1",
5
+ "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 1,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
+ "grad_norm": 54.102176666259766,
15
  "learning_rate": 1e-05,
16
+ "loss": 1.2727,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 1.0,
21
+ "eval_loss": 0.4375569820404053,
22
+ "eval_runtime": 0.0264,
23
+ "eval_samples_per_second": 378.107,
24
+ "eval_steps_per_second": 75.621,
25
  "step": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
27
  ],
28
  "logging_steps": 1,
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecd73cb550045f899afabe89c641c38d60c245246169ca4d469382eb3f211c73
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad0a5856f1dd671dce14167996dc77cbafeea2d708933604506fbaa750ea80db
3
  size 5304