Tnt3o5 commited on
Commit
fa0737f
·
verified ·
1 Parent(s): 33177c8

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +150 -364
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e60075e8ba6bb41ea77d1f033aa67d45b87967f0e798aef196694fe14a9d207d
3
  size 3165174664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10724b4cea4a470790946e7468c7181972c5a82dec88e4ad0d5382c5fc2548ea
3
  size 3165174664
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00c5a43579726adb9b7b0784bc7bed76becc75226e92e9ac2d4dca887beaf327
3
  size 6330693741
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5c40adecd4568ad788f53bf6de9d5d652371360956d13e030452c6527ec4637
3
  size 6330693741
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96e61ab708427870acd94294823064ef783185e22a315721c9f098e946d8b906
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78408af84cd72386bf33b0154204ec6e4daba2bdc3393a5bf3f607656332df74
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d4c71827701888b1efe5ab82a58befc77a15d11ca79ae9595253c1702d57d87
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58a8f6175f453fe2f4874f9c1c07779af8dcc8a21cd14a96604c82b3f4fc0b6a
3
  size 1465
trainer_state.json CHANGED
@@ -2,534 +2,320 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.9874861980125138,
6
- "eval_steps": 300,
7
- "global_step": 5400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0368052999631947,
14
- "grad_norm": 10.38057804107666,
15
- "learning_rate": 4.981781376518219e-05,
16
- "loss": 2.8223,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 0.0736105999263894,
21
- "grad_norm": 22.28519630432129,
22
- "learning_rate": 4.963378726536622e-05,
23
- "loss": 2.2838,
24
  "step": 200
25
  },
26
  {
27
  "epoch": 0.1104158998895841,
28
- "grad_norm": 7.458190441131592,
29
- "learning_rate": 4.944976076555024e-05,
30
- "loss": 2.222,
31
- "step": 300
32
- },
33
- {
34
- "epoch": 0.1104158998895841,
35
- "eval_loss": 2.0901951789855957,
36
- "eval_runtime": 662.284,
37
- "eval_samples_per_second": 8.206,
38
- "eval_steps_per_second": 1.027,
39
  "step": 300
40
  },
41
  {
42
  "epoch": 0.1472211998527788,
43
- "grad_norm": 6.754058837890625,
44
- "learning_rate": 4.926573426573427e-05,
45
- "loss": 2.2149,
46
  "step": 400
47
  },
48
  {
49
  "epoch": 0.1840264998159735,
50
- "grad_norm": 7.67477560043335,
51
- "learning_rate": 4.9081707765918294e-05,
52
- "loss": 2.1027,
53
  "step": 500
54
  },
55
  {
56
- "epoch": 0.2208317997791682,
57
- "grad_norm": 7.060691833496094,
58
- "learning_rate": 4.889768126610232e-05,
59
- "loss": 2.0664,
60
- "step": 600
 
61
  },
62
  {
63
  "epoch": 0.2208317997791682,
64
- "eval_loss": 2.0013794898986816,
65
- "eval_runtime": 664.5154,
66
- "eval_samples_per_second": 8.179,
67
- "eval_steps_per_second": 1.023,
68
  "step": 600
69
  },
70
  {
71
  "epoch": 0.2576370997423629,
72
- "grad_norm": 7.3001909255981445,
73
- "learning_rate": 4.871365476628635e-05,
74
- "loss": 2.0262,
75
  "step": 700
76
  },
77
  {
78
  "epoch": 0.2944423997055576,
79
- "grad_norm": 6.993441581726074,
80
- "learning_rate": 4.852962826647037e-05,
81
- "loss": 2.0506,
82
  "step": 800
83
  },
84
  {
85
  "epoch": 0.3312476996687523,
86
- "grad_norm": 6.8796796798706055,
87
- "learning_rate": 4.83456017666544e-05,
88
- "loss": 2.0443,
89
  "step": 900
90
  },
91
  {
92
- "epoch": 0.3312476996687523,
93
- "eval_loss": 1.9335800409317017,
94
- "eval_runtime": 663.2859,
95
- "eval_samples_per_second": 8.194,
96
- "eval_steps_per_second": 1.025,
97
- "step": 900
98
  },
99
  {
100
  "epoch": 0.368052999631947,
101
- "grad_norm": 8.510578155517578,
102
- "learning_rate": 4.8161575266838426e-05,
103
- "loss": 2.0214,
 
104
  "step": 1000
105
  },
106
  {
107
  "epoch": 0.4048582995951417,
108
- "grad_norm": 5.925055503845215,
109
- "learning_rate": 4.797754876702245e-05,
110
- "loss": 1.9283,
111
  "step": 1100
112
  },
113
  {
114
  "epoch": 0.4416635995583364,
115
- "grad_norm": 7.067410469055176,
116
- "learning_rate": 4.779352226720648e-05,
117
- "loss": 1.9905,
118
- "step": 1200
119
- },
120
- {
121
- "epoch": 0.4416635995583364,
122
- "eval_loss": 1.8977280855178833,
123
- "eval_runtime": 663.3777,
124
- "eval_samples_per_second": 8.193,
125
- "eval_steps_per_second": 1.025,
126
  "step": 1200
127
  },
128
  {
129
  "epoch": 0.4784688995215311,
130
- "grad_norm": 5.874550819396973,
131
- "learning_rate": 4.7609495767390503e-05,
132
- "loss": 1.9665,
133
  "step": 1300
134
  },
135
  {
136
  "epoch": 0.5152741994847257,
137
- "grad_norm": 8.402749061584473,
138
- "learning_rate": 4.7425469267574534e-05,
139
- "loss": 1.9342,
140
  "step": 1400
141
  },
142
  {
143
  "epoch": 0.5520794994479205,
144
- "grad_norm": 6.955828666687012,
145
- "learning_rate": 4.724144276775856e-05,
146
- "loss": 1.9158,
147
  "step": 1500
148
  },
149
  {
150
  "epoch": 0.5520794994479205,
151
- "eval_loss": 1.877929925918579,
152
- "eval_runtime": 661.5674,
153
- "eval_samples_per_second": 8.215,
154
- "eval_steps_per_second": 1.028,
155
  "step": 1500
156
  },
157
  {
158
  "epoch": 0.5888847994111152,
159
- "grad_norm": 5.033380031585693,
160
- "learning_rate": 4.705741626794258e-05,
161
- "loss": 1.9417,
162
  "step": 1600
163
  },
164
  {
165
  "epoch": 0.6256900993743099,
166
- "grad_norm": 6.041861534118652,
167
- "learning_rate": 4.687338976812661e-05,
168
- "loss": 1.9268,
169
  "step": 1700
170
  },
171
  {
172
  "epoch": 0.6624953993375046,
173
- "grad_norm": 6.134337902069092,
174
- "learning_rate": 4.6689363268310635e-05,
175
- "loss": 1.9385,
176
- "step": 1800
177
- },
178
- {
179
- "epoch": 0.6624953993375046,
180
- "eval_loss": 1.8483961820602417,
181
- "eval_runtime": 661.8464,
182
- "eval_samples_per_second": 8.212,
183
- "eval_steps_per_second": 1.027,
184
  "step": 1800
185
  },
186
  {
187
  "epoch": 0.6993006993006993,
188
- "grad_norm": 6.114970684051514,
189
- "learning_rate": 4.6505336768494665e-05,
190
- "loss": 1.9318,
191
  "step": 1900
192
  },
193
  {
194
  "epoch": 0.736105999263894,
195
- "grad_norm": 5.7990312576293945,
196
- "learning_rate": 4.632131026867869e-05,
197
- "loss": 1.8713,
198
  "step": 2000
199
  },
200
  {
201
- "epoch": 0.7729112992270887,
202
- "grad_norm": 6.675479412078857,
203
- "learning_rate": 4.613728376886272e-05,
204
- "loss": 1.8822,
205
- "step": 2100
 
206
  },
207
  {
208
  "epoch": 0.7729112992270887,
209
- "eval_loss": 1.8329029083251953,
210
- "eval_runtime": 664.1992,
211
- "eval_samples_per_second": 8.183,
212
- "eval_steps_per_second": 1.024,
213
  "step": 2100
214
  },
215
  {
216
  "epoch": 0.8097165991902834,
217
- "grad_norm": 6.004710674285889,
218
- "learning_rate": 4.595325726904674e-05,
219
- "loss": 1.8906,
220
  "step": 2200
221
  },
222
  {
223
  "epoch": 0.8465218991534781,
224
- "grad_norm": 6.8489460945129395,
225
- "learning_rate": 4.576923076923077e-05,
226
- "loss": 1.8894,
227
  "step": 2300
228
  },
229
  {
230
  "epoch": 0.8833271991166728,
231
- "grad_norm": 5.640189170837402,
232
- "learning_rate": 4.55852042694148e-05,
233
- "loss": 1.8616,
234
  "step": 2400
235
  },
236
  {
237
- "epoch": 0.8833271991166728,
238
- "eval_loss": 1.8137404918670654,
239
- "eval_runtime": 664.8043,
240
- "eval_samples_per_second": 8.175,
241
- "eval_steps_per_second": 1.023,
242
- "step": 2400
243
  },
244
  {
245
  "epoch": 0.9201324990798675,
246
- "grad_norm": 6.615390300750732,
247
- "learning_rate": 4.540117776959883e-05,
248
- "loss": 1.8822,
 
249
  "step": 2500
250
  },
251
  {
252
  "epoch": 0.9569377990430622,
253
- "grad_norm": 5.047032356262207,
254
- "learning_rate": 4.521715126978285e-05,
255
- "loss": 1.9137,
256
  "step": 2600
257
  },
258
  {
259
  "epoch": 0.9937430990062569,
260
- "grad_norm": 5.454547882080078,
261
- "learning_rate": 4.503312476996688e-05,
262
- "loss": 1.8511,
263
- "step": 2700
264
- },
265
- {
266
- "epoch": 0.9937430990062569,
267
- "eval_loss": 1.8019059896469116,
268
- "eval_runtime": 664.5344,
269
- "eval_samples_per_second": 8.179,
270
- "eval_steps_per_second": 1.023,
271
  "step": 2700
272
  },
273
  {
274
  "epoch": 1.0305483989694515,
275
- "grad_norm": 6.134690284729004,
276
- "learning_rate": 4.4849098270150904e-05,
277
- "loss": 1.4587,
278
  "step": 2800
279
  },
280
  {
281
  "epoch": 1.0673536989326462,
282
- "grad_norm": 7.562436103820801,
283
- "learning_rate": 4.4665071770334935e-05,
284
- "loss": 1.3743,
285
  "step": 2900
286
  },
287
  {
288
  "epoch": 1.104158998895841,
289
- "grad_norm": 7.055426120758057,
290
- "learning_rate": 4.448104527051896e-05,
291
- "loss": 1.3846,
292
  "step": 3000
293
  },
294
  {
295
  "epoch": 1.104158998895841,
296
- "eval_loss": 1.8608911037445068,
297
- "eval_runtime": 664.9166,
298
- "eval_samples_per_second": 8.174,
299
- "eval_steps_per_second": 1.023,
300
  "step": 3000
301
  },
302
  {
303
  "epoch": 1.1409642988590356,
304
- "grad_norm": 5.4267473220825195,
305
- "learning_rate": 4.429701877070299e-05,
306
- "loss": 1.4539,
307
  "step": 3100
308
  },
309
  {
310
  "epoch": 1.1777695988222303,
311
- "grad_norm": 5.971231937408447,
312
- "learning_rate": 4.411299227088701e-05,
313
- "loss": 1.3916,
314
  "step": 3200
315
  },
316
  {
317
  "epoch": 1.214574898785425,
318
- "grad_norm": 5.384959697723389,
319
- "learning_rate": 4.3928965771071036e-05,
320
- "loss": 1.345,
321
- "step": 3300
322
- },
323
- {
324
- "epoch": 1.214574898785425,
325
- "eval_loss": 1.8579978942871094,
326
- "eval_runtime": 666.0495,
327
- "eval_samples_per_second": 8.16,
328
- "eval_steps_per_second": 1.021,
329
  "step": 3300
330
  },
331
  {
332
  "epoch": 1.2513801987486197,
333
- "grad_norm": 5.7637457847595215,
334
- "learning_rate": 4.3744939271255066e-05,
335
- "loss": 1.3364,
336
  "step": 3400
337
  },
338
  {
339
  "epoch": 1.2881854987118144,
340
- "grad_norm": 6.541650295257568,
341
- "learning_rate": 4.356091277143909e-05,
342
- "loss": 1.3466,
343
  "step": 3500
344
  },
345
  {
346
- "epoch": 1.3249907986750091,
347
- "grad_norm": 5.789462089538574,
348
- "learning_rate": 4.337688627162312e-05,
349
- "loss": 1.3263,
350
- "step": 3600
 
351
  },
352
  {
353
  "epoch": 1.3249907986750091,
354
- "eval_loss": 1.8551760911941528,
355
- "eval_runtime": 665.6557,
356
- "eval_samples_per_second": 8.165,
357
- "eval_steps_per_second": 1.022,
358
  "step": 3600
359
- },
360
- {
361
- "epoch": 1.3617960986382038,
362
- "grad_norm": 5.502793788909912,
363
- "learning_rate": 4.3192859771807144e-05,
364
- "loss": 1.3387,
365
- "step": 3700
366
- },
367
- {
368
- "epoch": 1.3986013986013985,
369
- "grad_norm": 5.878416061401367,
370
- "learning_rate": 4.300883327199117e-05,
371
- "loss": 1.2795,
372
- "step": 3800
373
- },
374
- {
375
- "epoch": 1.4354066985645932,
376
- "grad_norm": 6.2468037605285645,
377
- "learning_rate": 4.28248067721752e-05,
378
- "loss": 1.2879,
379
- "step": 3900
380
- },
381
- {
382
- "epoch": 1.4354066985645932,
383
- "eval_loss": 1.8737432956695557,
384
- "eval_runtime": 665.8514,
385
- "eval_samples_per_second": 8.162,
386
- "eval_steps_per_second": 1.021,
387
- "step": 3900
388
- },
389
- {
390
- "epoch": 1.472211998527788,
391
- "grad_norm": 5.311323165893555,
392
- "learning_rate": 4.264078027235922e-05,
393
- "loss": 1.309,
394
- "step": 4000
395
- },
396
- {
397
- "epoch": 1.5090172984909827,
398
- "grad_norm": 6.0769758224487305,
399
- "learning_rate": 4.245675377254325e-05,
400
- "loss": 1.3078,
401
- "step": 4100
402
- },
403
- {
404
- "epoch": 1.5458225984541774,
405
- "grad_norm": 5.616523742675781,
406
- "learning_rate": 4.2272727272727275e-05,
407
- "loss": 1.2599,
408
- "step": 4200
409
- },
410
- {
411
- "epoch": 1.5458225984541774,
412
- "eval_loss": 1.8690847158432007,
413
- "eval_runtime": 665.6475,
414
- "eval_samples_per_second": 8.165,
415
- "eval_steps_per_second": 1.022,
416
- "step": 4200
417
- },
418
- {
419
- "epoch": 1.582627898417372,
420
- "grad_norm": 5.222440242767334,
421
- "learning_rate": 4.20887007729113e-05,
422
- "loss": 1.3208,
423
- "step": 4300
424
- },
425
- {
426
- "epoch": 1.6194331983805668,
427
- "grad_norm": 6.666055679321289,
428
- "learning_rate": 4.190467427309533e-05,
429
- "loss": 1.2911,
430
- "step": 4400
431
- },
432
- {
433
- "epoch": 1.6562384983437615,
434
- "grad_norm": 6.660887241363525,
435
- "learning_rate": 4.172064777327935e-05,
436
- "loss": 1.2618,
437
- "step": 4500
438
- },
439
- {
440
- "epoch": 1.6562384983437615,
441
- "eval_loss": 1.874595046043396,
442
- "eval_runtime": 665.9471,
443
- "eval_samples_per_second": 8.161,
444
- "eval_steps_per_second": 1.021,
445
- "step": 4500
446
- },
447
- {
448
- "epoch": 1.6930437983069562,
449
- "grad_norm": 6.1277971267700195,
450
- "learning_rate": 4.153662127346338e-05,
451
- "loss": 1.299,
452
- "step": 4600
453
- },
454
- {
455
- "epoch": 1.7298490982701509,
456
- "grad_norm": 6.3581862449646,
457
- "learning_rate": 4.135259477364741e-05,
458
- "loss": 1.2658,
459
- "step": 4700
460
- },
461
- {
462
- "epoch": 1.7666543982333456,
463
- "grad_norm": 5.905203819274902,
464
- "learning_rate": 4.116856827383143e-05,
465
- "loss": 1.2784,
466
- "step": 4800
467
- },
468
- {
469
- "epoch": 1.7666543982333456,
470
- "eval_loss": 1.8622459173202515,
471
- "eval_runtime": 667.7769,
472
- "eval_samples_per_second": 8.139,
473
- "eval_steps_per_second": 1.018,
474
- "step": 4800
475
- },
476
- {
477
- "epoch": 1.8034596981965403,
478
- "grad_norm": 5.387886047363281,
479
- "learning_rate": 4.098454177401546e-05,
480
- "loss": 1.2877,
481
- "step": 4900
482
- },
483
- {
484
- "epoch": 1.840264998159735,
485
- "grad_norm": 5.666168212890625,
486
- "learning_rate": 4.0800515274199484e-05,
487
- "loss": 1.2821,
488
- "step": 5000
489
- },
490
- {
491
- "epoch": 1.8770702981229297,
492
- "grad_norm": 4.707683086395264,
493
- "learning_rate": 4.0616488774383514e-05,
494
- "loss": 1.238,
495
- "step": 5100
496
- },
497
- {
498
- "epoch": 1.8770702981229297,
499
- "eval_loss": 1.8723958730697632,
500
- "eval_runtime": 665.7561,
501
- "eval_samples_per_second": 8.164,
502
- "eval_steps_per_second": 1.021,
503
- "step": 5100
504
- },
505
- {
506
- "epoch": 1.9138755980861244,
507
- "grad_norm": 5.434847831726074,
508
- "learning_rate": 4.043246227456754e-05,
509
- "loss": 1.2613,
510
- "step": 5200
511
- },
512
- {
513
- "epoch": 1.950680898049319,
514
- "grad_norm": 6.641855716705322,
515
- "learning_rate": 4.024843577475156e-05,
516
- "loss": 1.3182,
517
- "step": 5300
518
- },
519
- {
520
- "epoch": 1.9874861980125138,
521
- "grad_norm": 6.376593112945557,
522
- "learning_rate": 4.006440927493559e-05,
523
- "loss": 1.2729,
524
- "step": 5400
525
- },
526
- {
527
- "epoch": 1.9874861980125138,
528
- "eval_loss": 1.83621346950531,
529
- "eval_runtime": 664.7946,
530
- "eval_samples_per_second": 8.175,
531
- "eval_steps_per_second": 1.023,
532
- "step": 5400
533
  }
534
  ],
535
  "logging_steps": 100,
@@ -549,7 +335,7 @@
549
  "attributes": {}
550
  }
551
  },
552
- "total_flos": 1.501578129309696e+17,
553
  "train_batch_size": 8,
554
  "trial_name": null,
555
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.3249907986750091,
6
+ "eval_steps": 500,
7
+ "global_step": 3600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0368052999631947,
14
+ "grad_norm": 6.317627906799316,
15
+ "learning_rate": 3.991842559741558e-05,
16
+ "loss": 1.3232,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 0.0736105999263894,
21
+ "grad_norm": 6.4019927978515625,
22
+ "learning_rate": 3.9770967337294366e-05,
23
+ "loss": 1.1983,
24
  "step": 200
25
  },
26
  {
27
  "epoch": 0.1104158998895841,
28
+ "grad_norm": 5.58203649520874,
29
+ "learning_rate": 3.962350907717314e-05,
30
+ "loss": 1.1226,
 
 
 
 
 
 
 
 
31
  "step": 300
32
  },
33
  {
34
  "epoch": 0.1472211998527788,
35
+ "grad_norm": 4.505954742431641,
36
+ "learning_rate": 3.947605081705192e-05,
37
+ "loss": 0.7575,
38
  "step": 400
39
  },
40
  {
41
  "epoch": 0.1840264998159735,
42
+ "grad_norm": 5.4588165283203125,
43
+ "learning_rate": 3.93285925569307e-05,
44
+ "loss": 0.684,
45
  "step": 500
46
  },
47
  {
48
+ "epoch": 0.1840264998159735,
49
+ "eval_loss": 2.1012208461761475,
50
+ "eval_runtime": 663.3742,
51
+ "eval_samples_per_second": 8.193,
52
+ "eval_steps_per_second": 1.025,
53
+ "step": 500
54
  },
55
  {
56
  "epoch": 0.2208317997791682,
57
+ "grad_norm": 4.757272720336914,
58
+ "learning_rate": 3.918113429680948e-05,
59
+ "loss": 0.6552,
 
60
  "step": 600
61
  },
62
  {
63
  "epoch": 0.2576370997423629,
64
+ "grad_norm": 4.794644355773926,
65
+ "learning_rate": 3.903367603668826e-05,
66
+ "loss": 0.6429,
67
  "step": 700
68
  },
69
  {
70
  "epoch": 0.2944423997055576,
71
+ "grad_norm": 5.151235103607178,
72
+ "learning_rate": 3.888621777656704e-05,
73
+ "loss": 0.6567,
74
  "step": 800
75
  },
76
  {
77
  "epoch": 0.3312476996687523,
78
+ "grad_norm": 5.006649971008301,
79
+ "learning_rate": 3.8738759516445824e-05,
80
+ "loss": 0.6278,
81
  "step": 900
82
  },
83
  {
84
+ "epoch": 0.368052999631947,
85
+ "grad_norm": 6.711447715759277,
86
+ "learning_rate": 3.85913012563246e-05,
87
+ "loss": 0.637,
88
+ "step": 1000
 
89
  },
90
  {
91
  "epoch": 0.368052999631947,
92
+ "eval_loss": 2.1562345027923584,
93
+ "eval_runtime": 663.0039,
94
+ "eval_samples_per_second": 8.198,
95
+ "eval_steps_per_second": 1.026,
96
  "step": 1000
97
  },
98
  {
99
  "epoch": 0.4048582995951417,
100
+ "grad_norm": 5.702431678771973,
101
+ "learning_rate": 3.844384299620338e-05,
102
+ "loss": 0.5867,
103
  "step": 1100
104
  },
105
  {
106
  "epoch": 0.4416635995583364,
107
+ "grad_norm": 4.582945346832275,
108
+ "learning_rate": 3.829638473608216e-05,
109
+ "loss": 0.6156,
 
 
 
 
 
 
 
 
110
  "step": 1200
111
  },
112
  {
113
  "epoch": 0.4784688995215311,
114
+ "grad_norm": 4.968910217285156,
115
+ "learning_rate": 3.814892647596094e-05,
116
+ "loss": 0.6213,
117
  "step": 1300
118
  },
119
  {
120
  "epoch": 0.5152741994847257,
121
+ "grad_norm": 5.8628950119018555,
122
+ "learning_rate": 3.800146821583972e-05,
123
+ "loss": 0.6036,
124
  "step": 1400
125
  },
126
  {
127
  "epoch": 0.5520794994479205,
128
+ "grad_norm": 4.60435676574707,
129
+ "learning_rate": 3.7854009955718494e-05,
130
+ "loss": 0.5947,
131
  "step": 1500
132
  },
133
  {
134
  "epoch": 0.5520794994479205,
135
+ "eval_loss": 2.2223360538482666,
136
+ "eval_runtime": 662.8599,
137
+ "eval_samples_per_second": 8.199,
138
+ "eval_steps_per_second": 1.026,
139
  "step": 1500
140
  },
141
  {
142
  "epoch": 0.5888847994111152,
143
+ "grad_norm": 4.219241142272949,
144
+ "learning_rate": 3.7706551695597275e-05,
145
+ "loss": 0.61,
146
  "step": 1600
147
  },
148
  {
149
  "epoch": 0.6256900993743099,
150
+ "grad_norm": 4.9983344078063965,
151
+ "learning_rate": 3.7559093435476055e-05,
152
+ "loss": 0.5801,
153
  "step": 1700
154
  },
155
  {
156
  "epoch": 0.6624953993375046,
157
+ "grad_norm": 5.795677185058594,
158
+ "learning_rate": 3.7411635175354836e-05,
159
+ "loss": 0.6016,
 
 
 
 
 
 
 
 
160
  "step": 1800
161
  },
162
  {
163
  "epoch": 0.6993006993006993,
164
+ "grad_norm": 4.981507778167725,
165
+ "learning_rate": 3.726417691523362e-05,
166
+ "loss": 0.5839,
167
  "step": 1900
168
  },
169
  {
170
  "epoch": 0.736105999263894,
171
+ "grad_norm": 5.115480899810791,
172
+ "learning_rate": 3.71167186551124e-05,
173
+ "loss": 0.5946,
174
  "step": 2000
175
  },
176
  {
177
+ "epoch": 0.736105999263894,
178
+ "eval_loss": 2.1914401054382324,
179
+ "eval_runtime": 663.2825,
180
+ "eval_samples_per_second": 8.194,
181
+ "eval_steps_per_second": 1.025,
182
+ "step": 2000
183
  },
184
  {
185
  "epoch": 0.7729112992270887,
186
+ "grad_norm": 4.6113176345825195,
187
+ "learning_rate": 3.696926039499117e-05,
188
+ "loss": 0.5763,
 
189
  "step": 2100
190
  },
191
  {
192
  "epoch": 0.8097165991902834,
193
+ "grad_norm": 4.699350833892822,
194
+ "learning_rate": 3.682180213486995e-05,
195
+ "loss": 0.6007,
196
  "step": 2200
197
  },
198
  {
199
  "epoch": 0.8465218991534781,
200
+ "grad_norm": 4.8883233070373535,
201
+ "learning_rate": 3.667434387474873e-05,
202
+ "loss": 0.5741,
203
  "step": 2300
204
  },
205
  {
206
  "epoch": 0.8833271991166728,
207
+ "grad_norm": 5.460277557373047,
208
+ "learning_rate": 3.652688561462751e-05,
209
+ "loss": 0.5596,
210
  "step": 2400
211
  },
212
  {
213
+ "epoch": 0.9201324990798675,
214
+ "grad_norm": 4.431008338928223,
215
+ "learning_rate": 3.6379427354506294e-05,
216
+ "loss": 0.5831,
217
+ "step": 2500
 
218
  },
219
  {
220
  "epoch": 0.9201324990798675,
221
+ "eval_loss": 2.2241196632385254,
222
+ "eval_runtime": 662.9801,
223
+ "eval_samples_per_second": 8.198,
224
+ "eval_steps_per_second": 1.026,
225
  "step": 2500
226
  },
227
  {
228
  "epoch": 0.9569377990430622,
229
+ "grad_norm": 4.917581081390381,
230
+ "learning_rate": 3.6231969094385074e-05,
231
+ "loss": 0.5956,
232
  "step": 2600
233
  },
234
  {
235
  "epoch": 0.9937430990062569,
236
+ "grad_norm": 5.325926780700684,
237
+ "learning_rate": 3.6084510834263855e-05,
238
+ "loss": 0.6783,
 
 
 
 
 
 
 
 
239
  "step": 2700
240
  },
241
  {
242
  "epoch": 1.0305483989694515,
243
+ "grad_norm": 3.779780149459839,
244
+ "learning_rate": 3.593705257414263e-05,
245
+ "loss": 0.356,
246
  "step": 2800
247
  },
248
  {
249
  "epoch": 1.0673536989326462,
250
+ "grad_norm": 7.602641582489014,
251
+ "learning_rate": 3.5789594314021416e-05,
252
+ "loss": 0.2066,
253
  "step": 2900
254
  },
255
  {
256
  "epoch": 1.104158998895841,
257
+ "grad_norm": 5.734857082366943,
258
+ "learning_rate": 3.564213605390019e-05,
259
+ "loss": 0.2202,
260
  "step": 3000
261
  },
262
  {
263
  "epoch": 1.104158998895841,
264
+ "eval_loss": 2.732856512069702,
265
+ "eval_runtime": 663.1541,
266
+ "eval_samples_per_second": 8.196,
267
+ "eval_steps_per_second": 1.025,
268
  "step": 3000
269
  },
270
  {
271
  "epoch": 1.1409642988590356,
272
+ "grad_norm": 5.713295936584473,
273
+ "learning_rate": 3.549467779377897e-05,
274
+ "loss": 0.6849,
275
  "step": 3100
276
  },
277
  {
278
  "epoch": 1.1777695988222303,
279
+ "grad_norm": 8.02027702331543,
280
+ "learning_rate": 3.534721953365775e-05,
281
+ "loss": 0.6996,
282
  "step": 3200
283
  },
284
  {
285
  "epoch": 1.214574898785425,
286
+ "grad_norm": 7.312982082366943,
287
+ "learning_rate": 3.5199761273536525e-05,
288
+ "loss": 0.6721,
 
 
 
 
 
 
 
 
289
  "step": 3300
290
  },
291
  {
292
  "epoch": 1.2513801987486197,
293
+ "grad_norm": 6.537501811981201,
294
+ "learning_rate": 3.505230301341531e-05,
295
+ "loss": 0.6689,
296
  "step": 3400
297
  },
298
  {
299
  "epoch": 1.2881854987118144,
300
+ "grad_norm": 6.13110876083374,
301
+ "learning_rate": 3.4904844753294086e-05,
302
+ "loss": 0.6943,
303
  "step": 3500
304
  },
305
  {
306
+ "epoch": 1.2881854987118144,
307
+ "eval_loss": 2.2627553939819336,
308
+ "eval_runtime": 662.7328,
309
+ "eval_samples_per_second": 8.201,
310
+ "eval_steps_per_second": 1.026,
311
+ "step": 3500
312
  },
313
  {
314
  "epoch": 1.3249907986750091,
315
+ "grad_norm": 7.125514984130859,
316
+ "learning_rate": 3.4757386493172874e-05,
317
+ "loss": 0.6925,
 
318
  "step": 3600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  }
320
  ],
321
  "logging_steps": 100,
 
335
  "attributes": {}
336
  }
337
  },
338
+ "total_flos": 1.001052086206464e+17,
339
  "train_batch_size": 8,
340
  "trial_name": null,
341
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1a45af165c9b4b10ac29d54d577e579792b21aaaa713b168828df64c971d823
3
  size 5649
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:695ea3b402062a1144f41521e1d066a9e0db12391e8ff5cdd02d91cc262d45b5
3
  size 5649