fguryel committed on
Commit
931eaf5
·
verified ·
1 Parent(s): 34555e3

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -31,5 +31,5 @@
31
  "tie_word_embeddings": true,
32
  "transformers_version": "4.56.0",
33
  "use_cache": true,
34
- "vocab_size": 156939
35
  }
 
31
  "tie_word_embeddings": true,
32
  "transformers_version": "4.56.0",
33
  "use_cache": true,
34
+ "vocab_size": 156940
35
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fec0777b0697274e419843b43e7de2cb30a5fd61e41ce44dba71bf3de7bd5058
3
- size 4991031824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33a77d12adc3c90a6d03d30b2ebb684027e4f21c9ca19ce87864192c1cbdd7b8
3
+ size 4991037968
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4313244f0e8492462dafae0f065014006765bf6e54482ae970cf91017cc78878
3
  size 1610725592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b197f53c7eb5f953acb0917626be62d881b989a1678589016fddbdd2e620b8
3
  size 1610725592
model.safetensors.index.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "metadata": {
3
- "total_parameters": 3300864000,
4
- "total_size": 6601728000
5
  },
6
  "weight_map": {
7
  "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_parameters": 3300867072,
4
+ "total_size": 6601734144
5
  },
6
  "weight_map": {
7
  "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c16c13d4fe81d73e8b7fddc7ce07badbdd55c4ed7d56a00eba76c2709bcaadd9
3
- size 13203678103
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:709540f11f094ab7fcb18f525b097ef780a66646213b0f225b0cec2172f4c781
3
+ size 13203690391
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17cd930da9783ca70bad4b9cdeee6a06c0acea8f34645a333c93341f487f66a3
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab13e011ef5c6b4c442bc5f32b542c348311a0c4bff74117266a9be0164ed15b
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3159b8255e3ba63ecfbf9ad9882d37c8b55d7643e07b70fee54fef23e5ee0ce
3
  size 1465
trainer_state.json CHANGED
@@ -1,642 +1,97 @@
1
  {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 10.0,
6
  "eval_steps": 500,
7
- "global_step": 4460,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.11210762331838565,
14
- "grad_norm": 4.59375,
15
- "learning_rate": 9.14179104477612e-06,
16
- "loss": 5.4921,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.2242152466367713,
21
- "grad_norm": 4.9375,
22
- "learning_rate": 1.8470149253731344e-05,
23
- "loss": 5.4388,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.336322869955157,
28
- "grad_norm": 3.8125,
29
- "learning_rate": 2.7798507462686568e-05,
30
- "loss": 5.2504,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.4484304932735426,
35
- "grad_norm": 6.5,
36
- "learning_rate": 3.7126865671641795e-05,
37
- "loss": 5.1609,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.5605381165919282,
42
- "grad_norm": 6.4375,
43
- "learning_rate": 4.645522388059701e-05,
44
- "loss": 5.0546,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.672645739910314,
49
- "grad_norm": 6.96875,
50
- "learning_rate": 4.999325361589072e-05,
51
- "loss": 5.0152,
52
  "step": 300
53
  },
54
  {
55
- "epoch": 0.7847533632286996,
56
- "grad_norm": 6.03125,
57
- "learning_rate": 4.9953952730494324e-05,
58
- "loss": 5.0036,
59
  "step": 350
60
  },
61
  {
62
- "epoch": 0.8968609865470852,
63
- "grad_norm": 4.8125,
64
- "learning_rate": 4.987961816680492e-05,
65
- "loss": 4.9593,
66
  "step": 400
67
  },
68
  {
69
- "epoch": 1.0089686098654709,
70
- "grad_norm": 4.8125,
71
- "learning_rate": 4.977035428557125e-05,
72
- "loss": 4.9279,
73
  "step": 450
74
  },
75
  {
76
- "epoch": 1.1210762331838564,
77
- "grad_norm": 4.6875,
78
- "learning_rate": 4.9626314485964385e-05,
79
- "loss": 4.7683,
80
  "step": 500
81
  },
82
  {
83
- "epoch": 1.2331838565022422,
84
- "grad_norm": 4.34375,
85
- "learning_rate": 4.944770099021562e-05,
86
- "loss": 4.7472,
87
- "step": 550
88
- },
89
- {
90
- "epoch": 1.3452914798206277,
91
- "grad_norm": 5.15625,
92
- "learning_rate": 4.923476455971e-05,
93
- "loss": 4.7371,
94
- "step": 600
95
- },
96
- {
97
- "epoch": 1.4573991031390134,
98
- "grad_norm": 7.3125,
99
- "learning_rate": 4.898780414293411e-05,
100
- "loss": 4.7189,
101
- "step": 650
102
- },
103
- {
104
- "epoch": 1.5695067264573992,
105
- "grad_norm": 4.21875,
106
- "learning_rate": 4.870716645577244e-05,
107
- "loss": 4.7196,
108
- "step": 700
109
- },
110
- {
111
- "epoch": 1.6816143497757847,
112
- "grad_norm": 4.96875,
113
- "learning_rate": 4.839324549474148e-05,
114
- "loss": 4.7285,
115
- "step": 750
116
- },
117
- {
118
- "epoch": 1.7937219730941703,
119
- "grad_norm": 4.125,
120
- "learning_rate": 4.804648198384507e-05,
121
- "loss": 4.7366,
122
- "step": 800
123
- },
124
- {
125
- "epoch": 1.905829596412556,
126
- "grad_norm": 4.21875,
127
- "learning_rate": 4.7667362755827306e-05,
128
- "loss": 4.712,
129
- "step": 850
130
- },
131
- {
132
- "epoch": 2.0179372197309418,
133
- "grad_norm": 4.09375,
134
- "learning_rate": 4.725642006869207e-05,
135
- "loss": 4.6238,
136
- "step": 900
137
- },
138
- {
139
- "epoch": 2.1300448430493275,
140
- "grad_norm": 4.3125,
141
- "learning_rate": 4.68142308584484e-05,
142
- "loss": 4.3582,
143
- "step": 950
144
- },
145
- {
146
- "epoch": 2.242152466367713,
147
- "grad_norm": 6.625,
148
- "learning_rate": 4.634141592913097e-05,
149
- "loss": 4.3645,
150
- "step": 1000
151
- },
152
- {
153
- "epoch": 2.3542600896860986,
154
- "grad_norm": 4.15625,
155
- "learning_rate": 4.583863908123282e-05,
156
- "loss": 4.413,
157
- "step": 1050
158
- },
159
- {
160
- "epoch": 2.4663677130044843,
161
- "grad_norm": 4.21875,
162
- "learning_rate": 4.530660617977393e-05,
163
- "loss": 4.3569,
164
- "step": 1100
165
- },
166
- {
167
- "epoch": 2.57847533632287,
168
- "grad_norm": 4.125,
169
- "learning_rate": 4.474606416331397e-05,
170
- "loss": 4.3704,
171
- "step": 1150
172
- },
173
- {
174
- "epoch": 2.6905829596412554,
175
- "grad_norm": 4.15625,
176
- "learning_rate": 4.415779999530064e-05,
177
- "loss": 4.406,
178
- "step": 1200
179
- },
180
- {
181
- "epoch": 2.802690582959641,
182
- "grad_norm": 3.921875,
183
- "learning_rate": 4.354263955922568e-05,
184
- "loss": 4.3779,
185
- "step": 1250
186
- },
187
- {
188
- "epoch": 2.914798206278027,
189
- "grad_norm": 4.21875,
190
- "learning_rate": 4.290144649913973e-05,
191
- "loss": 4.3843,
192
- "step": 1300
193
- },
194
- {
195
- "epoch": 3.0269058295964126,
196
- "grad_norm": 5.65625,
197
- "learning_rate": 4.2235121007153975e-05,
198
- "loss": 4.246,
199
- "step": 1350
200
- },
201
- {
202
- "epoch": 3.1390134529147984,
203
- "grad_norm": 5.3125,
204
- "learning_rate": 4.1544598559630694e-05,
205
- "loss": 3.891,
206
- "step": 1400
207
- },
208
- {
209
- "epoch": 3.2511210762331837,
210
- "grad_norm": 5.9375,
211
- "learning_rate": 4.083084860383708e-05,
212
- "loss": 3.9163,
213
- "step": 1450
214
- },
215
- {
216
- "epoch": 3.3632286995515694,
217
- "grad_norm": 4.96875,
218
- "learning_rate": 4.009487319690626e-05,
219
- "loss": 3.9105,
220
- "step": 1500
221
- },
222
- {
223
- "epoch": 3.475336322869955,
224
- "grad_norm": 4.90625,
225
- "learning_rate": 3.9337705599016145e-05,
226
- "loss": 3.9388,
227
- "step": 1550
228
- },
229
- {
230
- "epoch": 3.587443946188341,
231
- "grad_norm": 4.78125,
232
- "learning_rate": 3.856040882276136e-05,
233
- "loss": 3.9035,
234
- "step": 1600
235
- },
236
- {
237
- "epoch": 3.6995515695067267,
238
- "grad_norm": 4.6875,
239
- "learning_rate": 3.776407414075477e-05,
240
- "loss": 3.9022,
241
- "step": 1650
242
- },
243
- {
244
- "epoch": 3.811659192825112,
245
- "grad_norm": 5.625,
246
- "learning_rate": 3.6949819553553814e-05,
247
- "loss": 3.9107,
248
- "step": 1700
249
- },
250
- {
251
- "epoch": 3.9237668161434978,
252
- "grad_norm": 4.71875,
253
- "learning_rate": 3.611878822006261e-05,
254
- "loss": 3.9135,
255
- "step": 1750
256
- },
257
- {
258
- "epoch": 4.0358744394618835,
259
- "grad_norm": 7.25,
260
- "learning_rate": 3.527214685261339e-05,
261
- "loss": 3.7243,
262
- "step": 1800
263
- },
264
- {
265
- "epoch": 4.147982062780269,
266
- "grad_norm": 6.21875,
267
- "learning_rate": 3.4411084078980574e-05,
268
- "loss": 3.3222,
269
- "step": 1850
270
- },
271
- {
272
- "epoch": 4.260089686098655,
273
- "grad_norm": 6.65625,
274
- "learning_rate": 3.353680877362694e-05,
275
- "loss": 3.3072,
276
- "step": 1900
277
- },
278
- {
279
- "epoch": 4.37219730941704,
280
- "grad_norm": 6.75,
281
- "learning_rate": 3.265054836052488e-05,
282
- "loss": 3.2928,
283
- "step": 1950
284
- },
285
- {
286
- "epoch": 4.484304932735426,
287
- "grad_norm": 7.15625,
288
- "learning_rate": 3.1753547089935345e-05,
289
- "loss": 3.2997,
290
- "step": 2000
291
- },
292
- {
293
- "epoch": 4.596412556053812,
294
- "grad_norm": 6.71875,
295
- "learning_rate": 3.084706429156379e-05,
296
- "loss": 3.3155,
297
- "step": 2050
298
- },
299
- {
300
- "epoch": 4.708520179372197,
301
- "grad_norm": 5.5625,
302
- "learning_rate": 2.9932372606545638e-05,
303
- "loss": 3.3295,
304
- "step": 2100
305
- },
306
- {
307
- "epoch": 4.820627802690583,
308
- "grad_norm": 6.65625,
309
- "learning_rate": 2.9010756200743363e-05,
310
- "loss": 3.3681,
311
- "step": 2150
312
- },
313
- {
314
- "epoch": 4.932735426008969,
315
- "grad_norm": 6.40625,
316
- "learning_rate": 2.808350896186362e-05,
317
- "loss": 3.3639,
318
- "step": 2200
319
- },
320
- {
321
- "epoch": 5.044843049327354,
322
- "grad_norm": 7.875,
323
- "learning_rate": 2.7151932682925563e-05,
324
- "loss": 3.115,
325
- "step": 2250
326
- },
327
- {
328
- "epoch": 5.15695067264574,
329
- "grad_norm": 10.5,
330
- "learning_rate": 2.6217335234630654e-05,
331
- "loss": 2.8213,
332
- "step": 2300
333
- },
334
- {
335
- "epoch": 5.2690582959641254,
336
- "grad_norm": 7.84375,
337
- "learning_rate": 2.5281028729199775e-05,
338
- "loss": 2.7705,
339
- "step": 2350
340
- },
341
- {
342
- "epoch": 5.381165919282511,
343
- "grad_norm": 9.6875,
344
- "learning_rate": 2.4344327678255555e-05,
345
- "loss": 2.7746,
346
- "step": 2400
347
- },
348
- {
349
- "epoch": 5.493273542600897,
350
- "grad_norm": 6.8125,
351
- "learning_rate": 2.34085471473361e-05,
352
- "loss": 2.7827,
353
- "step": 2450
354
- },
355
- {
356
- "epoch": 5.605381165919282,
357
- "grad_norm": 8.8125,
358
- "learning_rate": 2.2475000909631068e-05,
359
- "loss": 2.8412,
360
- "step": 2500
361
- },
362
- {
363
- "epoch": 5.7174887892376685,
364
- "grad_norm": 7.78125,
365
- "learning_rate": 2.1544999601532084e-05,
366
- "loss": 2.7982,
367
- "step": 2550
368
- },
369
- {
370
- "epoch": 5.829596412556054,
371
- "grad_norm": 8.1875,
372
- "learning_rate": 2.0619848882587013e-05,
373
- "loss": 2.8419,
374
- "step": 2600
375
- },
376
- {
377
- "epoch": 5.941704035874439,
378
- "grad_norm": 8.4375,
379
- "learning_rate": 1.9700847602441465e-05,
380
- "loss": 2.7266,
381
- "step": 2650
382
- },
383
- {
384
- "epoch": 6.053811659192825,
385
- "grad_norm": 8.25,
386
- "learning_rate": 1.878928597734082e-05,
387
- "loss": 2.6429,
388
- "step": 2700
389
- },
390
- {
391
- "epoch": 6.165919282511211,
392
- "grad_norm": 8.25,
393
- "learning_rate": 1.7886443778753052e-05,
394
- "loss": 2.4437,
395
- "step": 2750
396
- },
397
- {
398
- "epoch": 6.278026905829597,
399
- "grad_norm": 9.75,
400
- "learning_rate": 1.699358853665535e-05,
401
- "loss": 2.3789,
402
- "step": 2800
403
- },
404
- {
405
- "epoch": 6.390134529147982,
406
- "grad_norm": 8.75,
407
- "learning_rate": 1.6111973760006838e-05,
408
- "loss": 2.4337,
409
- "step": 2850
410
- },
411
- {
412
- "epoch": 6.502242152466367,
413
- "grad_norm": 7.6875,
414
- "learning_rate": 1.5242837176906038e-05,
415
- "loss": 2.4162,
416
- "step": 2900
417
- },
418
- {
419
- "epoch": 6.614349775784754,
420
- "grad_norm": 9.875,
421
- "learning_rate": 1.4387398996903488e-05,
422
- "loss": 2.4613,
423
- "step": 2950
424
- },
425
- {
426
- "epoch": 6.726457399103139,
427
- "grad_norm": 9.5,
428
- "learning_rate": 1.3546860197909212e-05,
429
- "loss": 2.4606,
430
- "step": 3000
431
- },
432
- {
433
- "epoch": 6.838565022421525,
434
- "grad_norm": 9.875,
435
- "learning_rate": 1.2722400840100257e-05,
436
- "loss": 2.4374,
437
- "step": 3050
438
- },
439
- {
440
- "epoch": 6.95067264573991,
441
- "grad_norm": 9.8125,
442
- "learning_rate": 1.1915178409195171e-05,
443
- "loss": 2.4009,
444
- "step": 3100
445
- },
446
- {
447
- "epoch": 7.062780269058296,
448
- "grad_norm": 8.875,
449
- "learning_rate": 1.1126326191421625e-05,
450
- "loss": 2.3286,
451
- "step": 3150
452
- },
453
- {
454
- "epoch": 7.174887892376682,
455
- "grad_norm": 8.25,
456
- "learning_rate": 1.035695168245843e-05,
457
- "loss": 2.2594,
458
- "step": 3200
459
- },
460
- {
461
- "epoch": 7.286995515695067,
462
- "grad_norm": 8.1875,
463
- "learning_rate": 9.608135032585758e-06,
464
- "loss": 2.2772,
465
- "step": 3250
466
- },
467
- {
468
- "epoch": 7.3991031390134525,
469
- "grad_norm": 8.25,
470
- "learning_rate": 8.88092753022657e-06,
471
- "loss": 2.2576,
472
- "step": 3300
473
- },
474
- {
475
- "epoch": 7.511210762331839,
476
- "grad_norm": 8.1875,
477
- "learning_rate": 8.176350126008015e-06,
478
- "loss": 2.2233,
479
- "step": 3350
480
- },
481
- {
482
- "epoch": 7.623318385650224,
483
- "grad_norm": 10.4375,
484
- "learning_rate": 7.49539199941511e-06,
485
- "loss": 2.2273,
486
- "step": 3400
487
- },
488
- {
489
- "epoch": 7.73542600896861,
490
- "grad_norm": 9.625,
491
- "learning_rate": 6.839009170049096e-06,
492
- "loss": 2.2692,
493
- "step": 3450
494
- },
495
- {
496
- "epoch": 7.8475336322869955,
497
- "grad_norm": 9.0,
498
- "learning_rate": 6.208123155439854e-06,
499
- "loss": 2.2847,
500
- "step": 3500
501
- },
502
- {
503
- "epoch": 7.959641255605381,
504
- "grad_norm": 8.875,
505
- "learning_rate": 5.603619677297028e-06,
506
- "loss": 2.251,
507
- "step": 3550
508
- },
509
- {
510
- "epoch": 8.071748878923767,
511
- "grad_norm": 8.6875,
512
- "learning_rate": 5.026347418016134e-06,
513
- "loss": 2.2404,
514
- "step": 3600
515
- },
516
- {
517
- "epoch": 8.183856502242152,
518
- "grad_norm": 8.4375,
519
- "learning_rate": 4.477116829185235e-06,
520
- "loss": 2.2465,
521
- "step": 3650
522
- },
523
- {
524
- "epoch": 8.295964125560538,
525
- "grad_norm": 8.9375,
526
- "learning_rate": 3.956698993765226e-06,
527
- "loss": 2.2267,
528
- "step": 3700
529
- },
530
- {
531
- "epoch": 8.408071748878923,
532
- "grad_norm": 8.625,
533
- "learning_rate": 3.4658245435410407e-06,
534
- "loss": 2.1912,
535
- "step": 3750
536
- },
537
- {
538
- "epoch": 8.52017937219731,
539
- "grad_norm": 9.5,
540
- "learning_rate": 3.0051826333634818e-06,
541
- "loss": 2.2076,
542
- "step": 3800
543
- },
544
- {
545
- "epoch": 8.632286995515695,
546
- "grad_norm": 8.5,
547
- "learning_rate": 2.5754199736220312e-06,
548
- "loss": 2.2122,
549
- "step": 3850
550
- },
551
- {
552
- "epoch": 8.74439461883408,
553
- "grad_norm": 9.3125,
554
- "learning_rate": 2.177139922306773e-06,
555
- "loss": 2.1805,
556
- "step": 3900
557
- },
558
- {
559
- "epoch": 8.856502242152466,
560
- "grad_norm": 9.125,
561
- "learning_rate": 1.810901637934137e-06,
562
- "loss": 2.1735,
563
- "step": 3950
564
- },
565
- {
566
- "epoch": 8.968609865470851,
567
- "grad_norm": 9.125,
568
- "learning_rate": 1.4772192945258528e-06,
569
- "loss": 2.186,
570
- "step": 4000
571
- },
572
- {
573
- "epoch": 9.080717488789238,
574
- "grad_norm": 9.0,
575
- "learning_rate": 1.1765613597430309e-06,
576
- "loss": 2.1942,
577
- "step": 4050
578
- },
579
- {
580
- "epoch": 9.192825112107624,
581
- "grad_norm": 8.4375,
582
- "learning_rate": 9.093499371889385e-07,
583
- "loss": 2.2041,
584
- "step": 4100
585
- },
586
- {
587
- "epoch": 9.304932735426009,
588
- "grad_norm": 8.75,
589
- "learning_rate": 6.75960173803819e-07,
590
- "loss": 2.1515,
591
- "step": 4150
592
- },
593
- {
594
- "epoch": 9.417040358744394,
595
- "grad_norm": 8.6875,
596
- "learning_rate": 4.767197331837364e-07,
597
- "loss": 2.1865,
598
- "step": 4200
599
- },
600
- {
601
- "epoch": 9.52914798206278,
602
- "grad_norm": 8.875,
603
- "learning_rate": 3.1190833556278276e-07,
604
- "loss": 2.1974,
605
- "step": 4250
606
- },
607
- {
608
- "epoch": 9.641255605381167,
609
- "grad_norm": 9.0625,
610
- "learning_rate": 1.8175736510465114e-07,
611
- "loss": 2.1827,
612
- "step": 4300
613
- },
614
- {
615
- "epoch": 9.753363228699552,
616
- "grad_norm": 8.875,
617
- "learning_rate": 8.644954505474812e-08,
618
- "loss": 2.1896,
619
- "step": 4350
620
- },
621
- {
622
- "epoch": 9.865470852017937,
623
- "grad_norm": 8.6875,
624
- "learning_rate": 2.611868120898919e-08,
625
- "loss": 2.2084,
626
- "step": 4400
627
- },
628
- {
629
- "epoch": 9.977578475336323,
630
- "grad_norm": 8.875,
631
- "learning_rate": 8.494740594333639e-10,
632
- "loss": 2.2342,
633
- "step": 4450
634
  }
635
  ],
636
  "logging_steps": 50,
637
- "max_steps": 4460,
638
  "num_input_tokens_seen": 0,
639
- "num_train_epochs": 10,
640
  "save_steps": 500,
641
  "stateful_callbacks": {
642
  "TrainerControl": {
@@ -645,12 +100,12 @@
645
  "should_evaluate": false,
646
  "should_log": false,
647
  "should_save": true,
648
- "should_training_stop": true
649
  },
650
  "attributes": {}
651
  }
652
  },
653
- "total_flos": 1.5271922632402944e+17,
654
  "train_batch_size": 1,
655
  "trial_name": null,
656
  "trial_params": null
 
1
  {
2
+ "best_global_step": 500,
3
+ "best_metric": 1.3274219036102295,
4
+ "best_model_checkpoint": "./orpheus-turkish-emotion-finetune/checkpoint-500",
5
+ "epoch": 2.4884735202492214,
6
  "eval_steps": 500,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.24922118380062305,
14
+ "grad_norm": 100.0,
15
+ "learning_rate": 1.218905472636816e-06,
16
+ "loss": 9.7783,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.4984423676012461,
21
+ "grad_norm": 74.5,
22
+ "learning_rate": 2.4626865671641794e-06,
23
+ "loss": 9.5155,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.7476635514018691,
28
+ "grad_norm": 74.0,
29
+ "learning_rate": 3.706467661691542e-06,
30
+ "loss": 9.0351,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.9968847352024922,
35
+ "grad_norm": 80.5,
36
+ "learning_rate": 4.950248756218906e-06,
37
+ "loss": 8.4023,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 1.2442367601246107,
42
+ "grad_norm": 80.0,
43
+ "learning_rate": 6.194029850746269e-06,
44
+ "loss": 7.3784,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 1.4934579439252336,
49
+ "grad_norm": 134.0,
50
+ "learning_rate": 7.437810945273633e-06,
51
+ "loss": 5.7507,
52
  "step": 300
53
  },
54
  {
55
+ "epoch": 1.7426791277258566,
56
+ "grad_norm": 65.0,
57
+ "learning_rate": 8.681592039800995e-06,
58
+ "loss": 3.8008,
59
  "step": 350
60
  },
61
  {
62
+ "epoch": 1.9919003115264797,
63
+ "grad_norm": 17.0,
64
+ "learning_rate": 9.925373134328359e-06,
65
+ "loss": 2.103,
66
  "step": 400
67
  },
68
  {
69
+ "epoch": 2.2392523364485983,
70
+ "grad_norm": 1.65625,
71
+ "learning_rate": 9.995836696556696e-06,
72
+ "loss": 1.4184,
73
  "step": 450
74
  },
75
  {
76
+ "epoch": 2.4884735202492214,
77
+ "grad_norm": 0.84765625,
78
+ "learning_rate": 9.982274873915892e-06,
79
+ "loss": 1.2978,
80
  "step": 500
81
  },
82
  {
83
+ "epoch": 2.4884735202492214,
84
+ "eval_loss": 1.3274219036102295,
85
+ "eval_runtime": 15.5223,
86
+ "eval_samples_per_second": 11.532,
87
+ "eval_steps_per_second": 1.482,
88
+ "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  }
90
  ],
91
  "logging_steps": 50,
92
+ "max_steps": 4020,
93
  "num_input_tokens_seen": 0,
94
+ "num_train_epochs": 20,
95
  "save_steps": 500,
96
  "stateful_callbacks": {
97
  "TrainerControl": {
 
100
  "should_evaluate": false,
101
  "should_log": false,
102
  "should_save": true,
103
+ "should_training_stop": false
104
  },
105
  "attributes": {}
106
  }
107
  },
108
+ "total_flos": 1.3833925120386662e+17,
109
  "train_batch_size": 1,
110
  "trial_name": null,
111
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:311df3e7bc623c2f9d68c0452bb813155325fd14e9332718300e0deeb7ad9750
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e91f58a9a988419f219f097fff3f0e1762f623e4b2b1a8cf942cacee3271dc13
3
  size 5777