efraimdahl commited on
Commit
7cb6117
·
verified ·
1 Parent(s): 480ed89

Training in progress, epoch 58, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:daf6a721ebf3a96f2b514a2cb1ed1307852739082e350442f1ff9b0bc1203cc8
3
  size 13132
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:593f45feb57b6cf6b72deb4c2b645da0e09ef0560bb117ead2f8e48a01ba523c
3
  size 13132
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb851a217d2d2b9d9d61040b4ea630d41fd2ad1896f1843bb23c2baaf77c21e1
3
  size 29970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a83b182f0c5a15cc7cc60e05979466978862b18bd2943c1c557801b2452f1335
3
  size 29970
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b66e3cc7c452b707ddac5caf0aa17618afb9bc1a0333600a22c4afb353f3165
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6dbaea400350134c87609f2a6e7d568a19190ab54bccf06fc0c89f270eb6d2
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b15aa4812dcef7f4211bf5dd0a9f6e03dda77ee314ae3cfe7bc5f3ef8d762a87
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5815cf5dbd0e8e9b0a43084f1040b0b93a4a2d0c9a55323787cef2a001436293
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 1,
3
- "best_metric": 0.328641802072525,
4
- "best_model_checkpoint": "./results/checkpoint-1",
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 1,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -23,6 +23,861 @@
23
  "eval_samples_per_second": 644.821,
24
  "eval_steps_per_second": 128.964,
25
  "step": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
27
  ],
28
  "logging_steps": 1,
 
1
  {
2
+ "best_global_step": 58,
3
+ "best_metric": 0.32613325119018555,
4
+ "best_model_checkpoint": "./results/checkpoint-58",
5
+ "epoch": 58.0,
6
  "eval_steps": 500,
7
+ "global_step": 58,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
23
  "eval_samples_per_second": 644.821,
24
  "eval_steps_per_second": 128.964,
25
  "step": 1
26
+ },
27
+ {
28
+ "epoch": 2.0,
29
+ "grad_norm": 0.8384326100349426,
30
+ "learning_rate": 9.9e-06,
31
+ "loss": 0.3286,
32
+ "step": 2
33
+ },
34
+ {
35
+ "epoch": 2.0,
36
+ "eval_loss": 0.3285800814628601,
37
+ "eval_runtime": 0.0447,
38
+ "eval_samples_per_second": 223.596,
39
+ "eval_steps_per_second": 44.719,
40
+ "step": 2
41
+ },
42
+ {
43
+ "epoch": 3.0,
44
+ "grad_norm": 0.8382521867752075,
45
+ "learning_rate": 9.800000000000001e-06,
46
+ "loss": 0.3286,
47
+ "step": 3
48
+ },
49
+ {
50
+ "epoch": 3.0,
51
+ "eval_loss": 0.3285190165042877,
52
+ "eval_runtime": 0.0147,
53
+ "eval_samples_per_second": 681.358,
54
+ "eval_steps_per_second": 136.272,
55
+ "step": 3
56
+ },
57
+ {
58
+ "epoch": 4.0,
59
+ "grad_norm": 0.8380736112594604,
60
+ "learning_rate": 9.7e-06,
61
+ "loss": 0.3285,
62
+ "step": 4
63
+ },
64
+ {
65
+ "epoch": 4.0,
66
+ "eval_loss": 0.3284585475921631,
67
+ "eval_runtime": 0.015,
68
+ "eval_samples_per_second": 664.929,
69
+ "eval_steps_per_second": 132.986,
70
+ "step": 4
71
+ },
72
+ {
73
+ "epoch": 5.0,
74
+ "grad_norm": 0.8378969430923462,
75
+ "learning_rate": 9.600000000000001e-06,
76
+ "loss": 0.3285,
77
+ "step": 5
78
+ },
79
+ {
80
+ "epoch": 5.0,
81
+ "eval_loss": 0.32839876413345337,
82
+ "eval_runtime": 0.0146,
83
+ "eval_samples_per_second": 682.967,
84
+ "eval_steps_per_second": 136.593,
85
+ "step": 5
86
+ },
87
+ {
88
+ "epoch": 6.0,
89
+ "grad_norm": 0.8377220034599304,
90
+ "learning_rate": 9.5e-06,
91
+ "loss": 0.3284,
92
+ "step": 6
93
+ },
94
+ {
95
+ "epoch": 6.0,
96
+ "eval_loss": 0.3283396065235138,
97
+ "eval_runtime": 0.0171,
98
+ "eval_samples_per_second": 584.458,
99
+ "eval_steps_per_second": 116.892,
100
+ "step": 6
101
+ },
102
+ {
103
+ "epoch": 7.0,
104
+ "grad_norm": 0.8375489711761475,
105
+ "learning_rate": 9.4e-06,
106
+ "loss": 0.3283,
107
+ "step": 7
108
+ },
109
+ {
110
+ "epoch": 7.0,
111
+ "eval_loss": 0.328281044960022,
112
+ "eval_runtime": 0.0139,
113
+ "eval_samples_per_second": 719.089,
114
+ "eval_steps_per_second": 143.818,
115
+ "step": 7
116
+ },
117
+ {
118
+ "epoch": 8.0,
119
+ "grad_norm": 0.8373778462409973,
120
+ "learning_rate": 9.3e-06,
121
+ "loss": 0.3283,
122
+ "step": 8
123
+ },
124
+ {
125
+ "epoch": 8.0,
126
+ "eval_loss": 0.3282231390476227,
127
+ "eval_runtime": 0.0142,
128
+ "eval_samples_per_second": 705.138,
129
+ "eval_steps_per_second": 141.028,
130
+ "step": 8
131
+ },
132
+ {
133
+ "epoch": 9.0,
134
+ "grad_norm": 0.8372084498405457,
135
+ "learning_rate": 9.200000000000002e-06,
136
+ "loss": 0.3282,
137
+ "step": 9
138
+ },
139
+ {
140
+ "epoch": 9.0,
141
+ "eval_loss": 0.32816585898399353,
142
+ "eval_runtime": 0.0144,
143
+ "eval_samples_per_second": 695.135,
144
+ "eval_steps_per_second": 139.027,
145
+ "step": 9
146
+ },
147
+ {
148
+ "epoch": 10.0,
149
+ "grad_norm": 0.837040901184082,
150
+ "learning_rate": 9.100000000000001e-06,
151
+ "loss": 0.3282,
152
+ "step": 10
153
+ },
154
+ {
155
+ "epoch": 10.0,
156
+ "eval_loss": 0.328109472990036,
157
+ "eval_runtime": 0.0164,
158
+ "eval_samples_per_second": 609.921,
159
+ "eval_steps_per_second": 121.984,
160
+ "step": 10
161
+ },
162
+ {
163
+ "epoch": 11.0,
164
+ "grad_norm": 0.8362537622451782,
165
+ "learning_rate": 9e-06,
166
+ "loss": 0.3281,
167
+ "step": 11
168
+ },
169
+ {
170
+ "epoch": 11.0,
171
+ "eval_loss": 0.3280538320541382,
172
+ "eval_runtime": 0.0139,
173
+ "eval_samples_per_second": 720.72,
174
+ "eval_steps_per_second": 144.144,
175
+ "step": 11
176
+ },
177
+ {
178
+ "epoch": 12.0,
179
+ "grad_norm": 0.8360907435417175,
180
+ "learning_rate": 8.900000000000001e-06,
181
+ "loss": 0.3281,
182
+ "step": 12
183
+ },
184
+ {
185
+ "epoch": 12.0,
186
+ "eval_loss": 0.3279987871646881,
187
+ "eval_runtime": 0.0164,
188
+ "eval_samples_per_second": 611.219,
189
+ "eval_steps_per_second": 122.244,
190
+ "step": 12
191
+ },
192
+ {
193
+ "epoch": 13.0,
194
+ "grad_norm": 0.8359295129776001,
195
+ "learning_rate": 8.8e-06,
196
+ "loss": 0.328,
197
+ "step": 13
198
+ },
199
+ {
200
+ "epoch": 13.0,
201
+ "eval_loss": 0.32794439792633057,
202
+ "eval_runtime": 0.0145,
203
+ "eval_samples_per_second": 689.977,
204
+ "eval_steps_per_second": 137.995,
205
+ "step": 13
206
+ },
207
+ {
208
+ "epoch": 14.0,
209
+ "grad_norm": 0.8357701301574707,
210
+ "learning_rate": 8.700000000000001e-06,
211
+ "loss": 0.3279,
212
+ "step": 14
213
+ },
214
+ {
215
+ "epoch": 14.0,
216
+ "eval_loss": 0.3278906047344208,
217
+ "eval_runtime": 0.0146,
218
+ "eval_samples_per_second": 686.398,
219
+ "eval_steps_per_second": 137.28,
220
+ "step": 14
221
+ },
222
+ {
223
+ "epoch": 15.0,
224
+ "grad_norm": 0.8356127142906189,
225
+ "learning_rate": 8.6e-06,
226
+ "loss": 0.3279,
227
+ "step": 15
228
+ },
229
+ {
230
+ "epoch": 15.0,
231
+ "eval_loss": 0.3278374969959259,
232
+ "eval_runtime": 0.0164,
233
+ "eval_samples_per_second": 610.347,
234
+ "eval_steps_per_second": 122.069,
235
+ "step": 15
236
+ },
237
+ {
238
+ "epoch": 16.0,
239
+ "grad_norm": 0.8354570269584656,
240
+ "learning_rate": 8.5e-06,
241
+ "loss": 0.3278,
242
+ "step": 16
243
+ },
244
+ {
245
+ "epoch": 16.0,
246
+ "eval_loss": 0.3277849853038788,
247
+ "eval_runtime": 0.0141,
248
+ "eval_samples_per_second": 708.402,
249
+ "eval_steps_per_second": 141.68,
250
+ "step": 16
251
+ },
252
+ {
253
+ "epoch": 17.0,
254
+ "grad_norm": 0.8353032469749451,
255
+ "learning_rate": 8.400000000000001e-06,
256
+ "loss": 0.3278,
257
+ "step": 17
258
+ },
259
+ {
260
+ "epoch": 17.0,
261
+ "eval_loss": 0.3277330994606018,
262
+ "eval_runtime": 0.0172,
263
+ "eval_samples_per_second": 581.516,
264
+ "eval_steps_per_second": 116.303,
265
+ "step": 17
266
+ },
267
+ {
268
+ "epoch": 18.0,
269
+ "grad_norm": 0.8351512551307678,
270
+ "learning_rate": 8.3e-06,
271
+ "loss": 0.3277,
272
+ "step": 18
273
+ },
274
+ {
275
+ "epoch": 18.0,
276
+ "eval_loss": 0.32768189907073975,
277
+ "eval_runtime": 0.0189,
278
+ "eval_samples_per_second": 529.918,
279
+ "eval_steps_per_second": 105.984,
280
+ "step": 18
281
+ },
282
+ {
283
+ "epoch": 19.0,
284
+ "grad_norm": 0.8350011706352234,
285
+ "learning_rate": 8.2e-06,
286
+ "loss": 0.3277,
287
+ "step": 19
288
+ },
289
+ {
290
+ "epoch": 19.0,
291
+ "eval_loss": 0.3276313245296478,
292
+ "eval_runtime": 0.0147,
293
+ "eval_samples_per_second": 679.691,
294
+ "eval_steps_per_second": 135.938,
295
+ "step": 19
296
+ },
297
+ {
298
+ "epoch": 20.0,
299
+ "grad_norm": 0.8348528742790222,
300
+ "learning_rate": 8.1e-06,
301
+ "loss": 0.3276,
302
+ "step": 20
303
+ },
304
+ {
305
+ "epoch": 20.0,
306
+ "eval_loss": 0.32758134603500366,
307
+ "eval_runtime": 0.0141,
308
+ "eval_samples_per_second": 707.672,
309
+ "eval_steps_per_second": 141.534,
310
+ "step": 20
311
+ },
312
+ {
313
+ "epoch": 21.0,
314
+ "grad_norm": 0.8347064852714539,
315
+ "learning_rate": 8.000000000000001e-06,
316
+ "loss": 0.3276,
317
+ "step": 21
318
+ },
319
+ {
320
+ "epoch": 21.0,
321
+ "eval_loss": 0.32753199338912964,
322
+ "eval_runtime": 0.0192,
323
+ "eval_samples_per_second": 522.167,
324
+ "eval_steps_per_second": 104.433,
325
+ "step": 21
326
+ },
327
+ {
328
+ "epoch": 22.0,
329
+ "grad_norm": 0.834561824798584,
330
+ "learning_rate": 7.9e-06,
331
+ "loss": 0.3275,
332
+ "step": 22
333
+ },
334
+ {
335
+ "epoch": 22.0,
336
+ "eval_loss": 0.3274833559989929,
337
+ "eval_runtime": 0.0135,
338
+ "eval_samples_per_second": 738.109,
339
+ "eval_steps_per_second": 147.622,
340
+ "step": 22
341
+ },
342
+ {
343
+ "epoch": 23.0,
344
+ "grad_norm": 0.8344190716743469,
345
+ "learning_rate": 7.800000000000002e-06,
346
+ "loss": 0.3275,
347
+ "step": 23
348
+ },
349
+ {
350
+ "epoch": 23.0,
351
+ "eval_loss": 0.3274352252483368,
352
+ "eval_runtime": 0.0144,
353
+ "eval_samples_per_second": 694.812,
354
+ "eval_steps_per_second": 138.962,
355
+ "step": 23
356
+ },
357
+ {
358
+ "epoch": 24.0,
359
+ "grad_norm": 0.8342781662940979,
360
+ "learning_rate": 7.7e-06,
361
+ "loss": 0.3274,
362
+ "step": 24
363
+ },
364
+ {
365
+ "epoch": 24.0,
366
+ "eval_loss": 0.32738780975341797,
367
+ "eval_runtime": 0.0141,
368
+ "eval_samples_per_second": 710.779,
369
+ "eval_steps_per_second": 142.156,
370
+ "step": 24
371
+ },
372
+ {
373
+ "epoch": 25.0,
374
+ "grad_norm": 0.8341390490531921,
375
+ "learning_rate": 7.600000000000001e-06,
376
+ "loss": 0.3274,
377
+ "step": 25
378
+ },
379
+ {
380
+ "epoch": 25.0,
381
+ "eval_loss": 0.3273409605026245,
382
+ "eval_runtime": 0.0141,
383
+ "eval_samples_per_second": 709.072,
384
+ "eval_steps_per_second": 141.814,
385
+ "step": 25
386
+ },
387
+ {
388
+ "epoch": 26.0,
389
+ "grad_norm": 0.8340017795562744,
390
+ "learning_rate": 7.500000000000001e-06,
391
+ "loss": 0.3273,
392
+ "step": 26
393
+ },
394
+ {
395
+ "epoch": 26.0,
396
+ "eval_loss": 0.32729482650756836,
397
+ "eval_runtime": 0.0158,
398
+ "eval_samples_per_second": 634.194,
399
+ "eval_steps_per_second": 126.839,
400
+ "step": 26
401
+ },
402
+ {
403
+ "epoch": 27.0,
404
+ "grad_norm": 0.8338663578033447,
405
+ "learning_rate": 7.4e-06,
406
+ "loss": 0.3273,
407
+ "step": 27
408
+ },
409
+ {
410
+ "epoch": 27.0,
411
+ "eval_loss": 0.3272492289543152,
412
+ "eval_runtime": 0.0137,
413
+ "eval_samples_per_second": 729.495,
414
+ "eval_steps_per_second": 145.899,
415
+ "step": 27
416
+ },
417
+ {
418
+ "epoch": 28.0,
419
+ "grad_norm": 0.8337326645851135,
420
+ "learning_rate": 7.3e-06,
421
+ "loss": 0.3272,
422
+ "step": 28
423
+ },
424
+ {
425
+ "epoch": 28.0,
426
+ "eval_loss": 0.32720428705215454,
427
+ "eval_runtime": 0.0156,
428
+ "eval_samples_per_second": 639.834,
429
+ "eval_steps_per_second": 127.967,
430
+ "step": 28
431
+ },
432
+ {
433
+ "epoch": 29.0,
434
+ "grad_norm": 0.8336009383201599,
435
+ "learning_rate": 7.2000000000000005e-06,
436
+ "loss": 0.3272,
437
+ "step": 29
438
+ },
439
+ {
440
+ "epoch": 29.0,
441
+ "eval_loss": 0.32715997099876404,
442
+ "eval_runtime": 0.0135,
443
+ "eval_samples_per_second": 741.24,
444
+ "eval_steps_per_second": 148.248,
445
+ "step": 29
446
+ },
447
+ {
448
+ "epoch": 30.0,
449
+ "grad_norm": 0.8334709405899048,
450
+ "learning_rate": 7.100000000000001e-06,
451
+ "loss": 0.3272,
452
+ "step": 30
453
+ },
454
+ {
455
+ "epoch": 30.0,
456
+ "eval_loss": 0.3271161913871765,
457
+ "eval_runtime": 0.0157,
458
+ "eval_samples_per_second": 638.908,
459
+ "eval_steps_per_second": 127.782,
460
+ "step": 30
461
+ },
462
+ {
463
+ "epoch": 31.0,
464
+ "grad_norm": 0.834907054901123,
465
+ "learning_rate": 7e-06,
466
+ "loss": 0.3271,
467
+ "step": 31
468
+ },
469
+ {
470
+ "epoch": 31.0,
471
+ "eval_loss": 0.32707276940345764,
472
+ "eval_runtime": 0.0147,
473
+ "eval_samples_per_second": 678.679,
474
+ "eval_steps_per_second": 135.736,
475
+ "step": 31
476
+ },
477
+ {
478
+ "epoch": 32.0,
479
+ "grad_norm": 0.8347804546356201,
480
+ "learning_rate": 6.9e-06,
481
+ "loss": 0.3271,
482
+ "step": 32
483
+ },
484
+ {
485
+ "epoch": 32.0,
486
+ "eval_loss": 0.32702988386154175,
487
+ "eval_runtime": 0.0137,
488
+ "eval_samples_per_second": 731.48,
489
+ "eval_steps_per_second": 146.296,
490
+ "step": 32
491
+ },
492
+ {
493
+ "epoch": 33.0,
494
+ "grad_norm": 0.8346555829048157,
495
+ "learning_rate": 6.800000000000001e-06,
496
+ "loss": 0.327,
497
+ "step": 33
498
+ },
499
+ {
500
+ "epoch": 33.0,
501
+ "eval_loss": 0.326987624168396,
502
+ "eval_runtime": 0.0154,
503
+ "eval_samples_per_second": 647.809,
504
+ "eval_steps_per_second": 129.562,
505
+ "step": 33
506
+ },
507
+ {
508
+ "epoch": 34.0,
509
+ "grad_norm": 0.8345323801040649,
510
+ "learning_rate": 6.700000000000001e-06,
511
+ "loss": 0.327,
512
+ "step": 34
513
+ },
514
+ {
515
+ "epoch": 34.0,
516
+ "eval_loss": 0.326945960521698,
517
+ "eval_runtime": 0.015,
518
+ "eval_samples_per_second": 668.532,
519
+ "eval_steps_per_second": 133.706,
520
+ "step": 34
521
+ },
522
+ {
523
+ "epoch": 35.0,
524
+ "grad_norm": 0.8344109654426575,
525
+ "learning_rate": 6.600000000000001e-06,
526
+ "loss": 0.3269,
527
+ "step": 35
528
+ },
529
+ {
530
+ "epoch": 35.0,
531
+ "eval_loss": 0.32690495252609253,
532
+ "eval_runtime": 0.0141,
533
+ "eval_samples_per_second": 711.152,
534
+ "eval_steps_per_second": 142.23,
535
+ "step": 35
536
+ },
537
+ {
538
+ "epoch": 36.0,
539
+ "grad_norm": 0.8342913389205933,
540
+ "learning_rate": 6.5000000000000004e-06,
541
+ "loss": 0.3269,
542
+ "step": 36
543
+ },
544
+ {
545
+ "epoch": 36.0,
546
+ "eval_loss": 0.3268645405769348,
547
+ "eval_runtime": 0.0148,
548
+ "eval_samples_per_second": 676.653,
549
+ "eval_steps_per_second": 135.331,
550
+ "step": 36
551
+ },
552
+ {
553
+ "epoch": 37.0,
554
+ "grad_norm": 0.8341735005378723,
555
+ "learning_rate": 6.4000000000000006e-06,
556
+ "loss": 0.3269,
557
+ "step": 37
558
+ },
559
+ {
560
+ "epoch": 37.0,
561
+ "eval_loss": 0.32682472467422485,
562
+ "eval_runtime": 0.0184,
563
+ "eval_samples_per_second": 544.856,
564
+ "eval_steps_per_second": 108.971,
565
+ "step": 37
566
+ },
567
+ {
568
+ "epoch": 38.0,
569
+ "grad_norm": 0.8340575098991394,
570
+ "learning_rate": 6.300000000000001e-06,
571
+ "loss": 0.3268,
572
+ "step": 38
573
+ },
574
+ {
575
+ "epoch": 38.0,
576
+ "eval_loss": 0.32678553462028503,
577
+ "eval_runtime": 0.0135,
578
+ "eval_samples_per_second": 738.577,
579
+ "eval_steps_per_second": 147.715,
580
+ "step": 38
581
+ },
582
+ {
583
+ "epoch": 39.0,
584
+ "grad_norm": 0.8339433073997498,
585
+ "learning_rate": 6.200000000000001e-06,
586
+ "loss": 0.3268,
587
+ "step": 39
588
+ },
589
+ {
590
+ "epoch": 39.0,
591
+ "eval_loss": 0.32674694061279297,
592
+ "eval_runtime": 0.0139,
593
+ "eval_samples_per_second": 720.337,
594
+ "eval_steps_per_second": 144.067,
595
+ "step": 39
596
+ },
597
+ {
598
+ "epoch": 40.0,
599
+ "grad_norm": 0.8338308334350586,
600
+ "learning_rate": 6.1e-06,
601
+ "loss": 0.3267,
602
+ "step": 40
603
+ },
604
+ {
605
+ "epoch": 40.0,
606
+ "eval_loss": 0.3267090320587158,
607
+ "eval_runtime": 0.0146,
608
+ "eval_samples_per_second": 685.064,
609
+ "eval_steps_per_second": 137.013,
610
+ "step": 40
611
+ },
612
+ {
613
+ "epoch": 41.0,
614
+ "grad_norm": 0.8337202668190002,
615
+ "learning_rate": 6e-06,
616
+ "loss": 0.3267,
617
+ "step": 41
618
+ },
619
+ {
620
+ "epoch": 41.0,
621
+ "eval_loss": 0.32667168974876404,
622
+ "eval_runtime": 0.0137,
623
+ "eval_samples_per_second": 729.584,
624
+ "eval_steps_per_second": 145.917,
625
+ "step": 41
626
+ },
627
+ {
628
+ "epoch": 42.0,
629
+ "grad_norm": 0.8336114883422852,
630
+ "learning_rate": 5.9e-06,
631
+ "loss": 0.3267,
632
+ "step": 42
633
+ },
634
+ {
635
+ "epoch": 42.0,
636
+ "eval_loss": 0.32663506269454956,
637
+ "eval_runtime": 0.0139,
638
+ "eval_samples_per_second": 721.154,
639
+ "eval_steps_per_second": 144.231,
640
+ "step": 42
641
+ },
642
+ {
643
+ "epoch": 43.0,
644
+ "grad_norm": 0.8335044384002686,
645
+ "learning_rate": 5.8e-06,
646
+ "loss": 0.3266,
647
+ "step": 43
648
+ },
649
+ {
650
+ "epoch": 43.0,
651
+ "eval_loss": 0.32659897208213806,
652
+ "eval_runtime": 0.0163,
653
+ "eval_samples_per_second": 613.75,
654
+ "eval_steps_per_second": 122.75,
655
+ "step": 43
656
+ },
657
+ {
658
+ "epoch": 44.0,
659
+ "grad_norm": 0.8333994150161743,
660
+ "learning_rate": 5.7e-06,
661
+ "loss": 0.3266,
662
+ "step": 44
663
+ },
664
+ {
665
+ "epoch": 44.0,
666
+ "eval_loss": 0.3265635371208191,
667
+ "eval_runtime": 0.0137,
668
+ "eval_samples_per_second": 730.842,
669
+ "eval_steps_per_second": 146.168,
670
+ "step": 44
671
+ },
672
+ {
673
+ "epoch": 45.0,
674
+ "grad_norm": 0.8332960605621338,
675
+ "learning_rate": 5.600000000000001e-06,
676
+ "loss": 0.3266,
677
+ "step": 45
678
+ },
679
+ {
680
+ "epoch": 45.0,
681
+ "eval_loss": 0.32652872800827026,
682
+ "eval_runtime": 0.015,
683
+ "eval_samples_per_second": 667.819,
684
+ "eval_steps_per_second": 133.564,
685
+ "step": 45
686
+ },
687
+ {
688
+ "epoch": 46.0,
689
+ "grad_norm": 0.8331945538520813,
690
+ "learning_rate": 5.500000000000001e-06,
691
+ "loss": 0.3265,
692
+ "step": 46
693
+ },
694
+ {
695
+ "epoch": 46.0,
696
+ "eval_loss": 0.3264945149421692,
697
+ "eval_runtime": 0.0148,
698
+ "eval_samples_per_second": 674.141,
699
+ "eval_steps_per_second": 134.828,
700
+ "step": 46
701
+ },
702
+ {
703
+ "epoch": 47.0,
704
+ "grad_norm": 0.8330948948860168,
705
+ "learning_rate": 5.400000000000001e-06,
706
+ "loss": 0.3265,
707
+ "step": 47
708
+ },
709
+ {
710
+ "epoch": 47.0,
711
+ "eval_loss": 0.32646098732948303,
712
+ "eval_runtime": 0.0147,
713
+ "eval_samples_per_second": 678.086,
714
+ "eval_steps_per_second": 135.617,
715
+ "step": 47
716
+ },
717
+ {
718
+ "epoch": 48.0,
719
+ "grad_norm": 0.8329970240592957,
720
+ "learning_rate": 5.300000000000001e-06,
721
+ "loss": 0.3265,
722
+ "step": 48
723
+ },
724
+ {
725
+ "epoch": 48.0,
726
+ "eval_loss": 0.32642805576324463,
727
+ "eval_runtime": 0.0151,
728
+ "eval_samples_per_second": 663.53,
729
+ "eval_steps_per_second": 132.706,
730
+ "step": 48
731
+ },
732
+ {
733
+ "epoch": 49.0,
734
+ "grad_norm": 0.8329010009765625,
735
+ "learning_rate": 5.2e-06,
736
+ "loss": 0.3264,
737
+ "step": 49
738
+ },
739
+ {
740
+ "epoch": 49.0,
741
+ "eval_loss": 0.32639575004577637,
742
+ "eval_runtime": 0.0141,
743
+ "eval_samples_per_second": 707.1,
744
+ "eval_steps_per_second": 141.42,
745
+ "step": 49
746
+ },
747
+ {
748
+ "epoch": 50.0,
749
+ "grad_norm": 0.8328068852424622,
750
+ "learning_rate": 5.1e-06,
751
+ "loss": 0.3264,
752
+ "step": 50
753
+ },
754
+ {
755
+ "epoch": 50.0,
756
+ "eval_loss": 0.32636409997940063,
757
+ "eval_runtime": 0.0154,
758
+ "eval_samples_per_second": 650.905,
759
+ "eval_steps_per_second": 130.181,
760
+ "step": 50
761
+ },
762
+ {
763
+ "epoch": 51.0,
764
+ "grad_norm": 0.8327144384384155,
765
+ "learning_rate": 5e-06,
766
+ "loss": 0.3264,
767
+ "step": 51
768
+ },
769
+ {
770
+ "epoch": 51.0,
771
+ "eval_loss": 0.3263329863548279,
772
+ "eval_runtime": 0.0156,
773
+ "eval_samples_per_second": 639.142,
774
+ "eval_steps_per_second": 127.828,
775
+ "step": 51
776
+ },
777
+ {
778
+ "epoch": 52.0,
779
+ "grad_norm": 0.8326238989830017,
780
+ "learning_rate": 4.9000000000000005e-06,
781
+ "loss": 0.3263,
782
+ "step": 52
783
+ },
784
+ {
785
+ "epoch": 52.0,
786
+ "eval_loss": 0.32630258798599243,
787
+ "eval_runtime": 0.0155,
788
+ "eval_samples_per_second": 645.069,
789
+ "eval_steps_per_second": 129.014,
790
+ "step": 52
791
+ },
792
+ {
793
+ "epoch": 53.0,
794
+ "grad_norm": 0.8325351476669312,
795
+ "learning_rate": 4.800000000000001e-06,
796
+ "loss": 0.3263,
797
+ "step": 53
798
+ },
799
+ {
800
+ "epoch": 53.0,
801
+ "eval_loss": 0.32627278566360474,
802
+ "eval_runtime": 0.0138,
803
+ "eval_samples_per_second": 724.205,
804
+ "eval_steps_per_second": 144.841,
805
+ "step": 53
806
+ },
807
+ {
808
+ "epoch": 54.0,
809
+ "grad_norm": 0.8324483036994934,
810
+ "learning_rate": 4.7e-06,
811
+ "loss": 0.3263,
812
+ "step": 54
813
+ },
814
+ {
815
+ "epoch": 54.0,
816
+ "eval_loss": 0.32624363899230957,
817
+ "eval_runtime": 0.0156,
818
+ "eval_samples_per_second": 642.933,
819
+ "eval_steps_per_second": 128.587,
820
+ "step": 54
821
+ },
822
+ {
823
+ "epoch": 55.0,
824
+ "grad_norm": 0.8323632478713989,
825
+ "learning_rate": 4.600000000000001e-06,
826
+ "loss": 0.3262,
827
+ "step": 55
828
+ },
829
+ {
830
+ "epoch": 55.0,
831
+ "eval_loss": 0.32621514797210693,
832
+ "eval_runtime": 0.0137,
833
+ "eval_samples_per_second": 728.203,
834
+ "eval_steps_per_second": 145.641,
835
+ "step": 55
836
+ },
837
+ {
838
+ "epoch": 56.0,
839
+ "grad_norm": 0.8322799205780029,
840
+ "learning_rate": 4.5e-06,
841
+ "loss": 0.3262,
842
+ "step": 56
843
+ },
844
+ {
845
+ "epoch": 56.0,
846
+ "eval_loss": 0.3261871933937073,
847
+ "eval_runtime": 0.0148,
848
+ "eval_samples_per_second": 674.466,
849
+ "eval_steps_per_second": 134.893,
850
+ "step": 56
851
+ },
852
+ {
853
+ "epoch": 57.0,
854
+ "grad_norm": 0.8321985602378845,
855
+ "learning_rate": 4.4e-06,
856
+ "loss": 0.3262,
857
+ "step": 57
858
+ },
859
+ {
860
+ "epoch": 57.0,
861
+ "eval_loss": 0.32615989446640015,
862
+ "eval_runtime": 0.0139,
863
+ "eval_samples_per_second": 718.24,
864
+ "eval_steps_per_second": 143.648,
865
+ "step": 57
866
+ },
867
+ {
868
+ "epoch": 58.0,
869
+ "grad_norm": 0.8321189880371094,
870
+ "learning_rate": 4.3e-06,
871
+ "loss": 0.3262,
872
+ "step": 58
873
+ },
874
+ {
875
+ "epoch": 58.0,
876
+ "eval_loss": 0.32613325119018555,
877
+ "eval_runtime": 0.015,
878
+ "eval_samples_per_second": 664.865,
879
+ "eval_steps_per_second": 132.973,
880
+ "step": 58
881
  }
882
  ],
883
  "logging_steps": 1,