NguyenTan commited on
Commit
3b522f6
·
verified ·
1 Parent(s): 8d70e67

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a13a9b809252e5aae629463259efb1628c1144763b5f9f5693935d107037a0ad
3
  size 1583480280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:872dafd4f7a04e803d7c539219fc84f71fe6a69c1babbc798b79455dc4e1cc48
3
  size 1583480280
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7fd3a75f72ae3055bcc4fa868ea1839ec1bf4df33d6b0043a5d9c1f7d92b4ce
3
  size 3166958572
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9ad0354ec70f7cee4872b10ddc2b723a017a3711c3e9c5a25eba5b7823716aa
3
  size 3166958572
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78dbe42c7dcb5b34be1b27933cf3f4508f8633769c61bd10f6ed99a6b2a7aa0b
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ef1ee026e99fb5f8e5a5e72bb4657b4cedd8651ee5b752f5c314d3a89fd9ad7
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc1875a9cdc0acada4ec00dae1b22790ec75c57a24a0a014b3248a1201d3a993
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14ae2a2128444abab378aa06c09a61a84665f758fcc19fc46f5789b0bc1b5665
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f6e34b111723c929f0db228e345f9eda4ba2b88d0c04b9e6d5ca2744e89af18
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68aec417c91400a5fbe9c98d7447dabd74ed3b0812272a5f21d640985e919bad
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 13000,
3
- "best_metric": 1.4730943441390991,
4
- "best_model_checkpoint": "hieptt/vietnamese-correction-finetuning/checkpoint-13000",
5
- "epoch": 0.349208907513364,
6
  "eval_steps": 1000,
7
- "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -82,959 +82,11 @@
82
  {
83
  "epoch": 0.02686222365487415,
84
  "eval_loss": 1.550229787826538,
85
- "eval_runtime": 3677.195,
86
  "eval_sacrebleu": 96.21455916515954,
87
- "eval_samples_per_second": 71.991,
88
- "eval_steps_per_second": 0.563,
89
- "step": 1000
90
- },
91
- {
92
- "epoch": 0.029548446020361566,
93
- "grad_norm": 0.43504711985588074,
94
- "learning_rate": 4.999949639906304e-05,
95
- "loss": 1.5642,
96
- "step": 1100
97
- },
98
- {
99
- "epoch": 0.03223466838584898,
100
- "grad_norm": 0.5235740542411804,
101
- "learning_rate": 4.999796521812822e-05,
102
- "loss": 1.5535,
103
- "step": 1200
104
- },
105
- {
106
- "epoch": 0.0349208907513364,
107
- "grad_norm": 0.40589195489883423,
108
- "learning_rate": 4.999540646875361e-05,
109
- "loss": 1.5474,
110
- "step": 1300
111
- },
112
- {
113
- "epoch": 0.03760711311682381,
114
- "grad_norm": 0.47619858384132385,
115
- "learning_rate": 4.9991820256119385e-05,
116
- "loss": 1.5467,
117
- "step": 1400
118
- },
119
- {
120
- "epoch": 0.040293335482311225,
121
- "grad_norm": 0.5160552263259888,
122
- "learning_rate": 4.9987206727640703e-05,
123
- "loss": 1.5404,
124
- "step": 1500
125
- },
126
- {
127
- "epoch": 0.04297955784779864,
128
- "grad_norm": 0.7639428973197937,
129
- "learning_rate": 4.998156607296163e-05,
130
- "loss": 1.5391,
131
- "step": 1600
132
- },
133
- {
134
- "epoch": 0.04566578021328606,
135
- "grad_norm": Infinity,
136
- "learning_rate": 4.997489852394741e-05,
137
- "loss": 1.5411,
138
- "step": 1700
139
- },
140
- {
141
- "epoch": 0.04835200257877347,
142
- "grad_norm": 11.356963157653809,
143
- "learning_rate": 4.996720435467485e-05,
144
- "loss": 3.6358,
145
- "step": 1800
146
- },
147
- {
148
- "epoch": 0.051038224944260885,
149
- "grad_norm": 0.5683807730674744,
150
- "learning_rate": 4.995848388142112e-05,
151
- "loss": 5.6818,
152
- "step": 1900
153
- },
154
- {
155
- "epoch": 0.0537244473097483,
156
- "grad_norm": 0.47999536991119385,
157
- "learning_rate": 4.994873746265073e-05,
158
- "loss": 1.8588,
159
- "step": 2000
160
- },
161
- {
162
- "epoch": 0.0537244473097483,
163
- "eval_loss": 1.5401405096054077,
164
- "eval_runtime": 3621.9751,
165
- "eval_sacrebleu": 96.05509046827605,
166
- "eval_samples_per_second": 73.089,
167
- "eval_steps_per_second": 0.571,
168
- "step": 2000
169
- },
170
- {
171
- "epoch": 0.05641066967523572,
172
- "grad_norm": 0.4521150290966034,
173
- "learning_rate": 4.993796549900076e-05,
174
- "loss": 1.5427,
175
- "step": 2100
176
- },
177
- {
178
- "epoch": 0.05909689204072313,
179
- "grad_norm": 0.4296003580093384,
180
- "learning_rate": 4.992616843326446e-05,
181
- "loss": 1.5634,
182
- "step": 2200
183
- },
184
- {
185
- "epoch": 0.061783114406210544,
186
- "grad_norm": 0.6067550182342529,
187
- "learning_rate": 4.991334675037299e-05,
188
- "loss": 1.534,
189
- "step": 2300
190
- },
191
- {
192
- "epoch": 0.06446933677169796,
193
- "grad_norm": 0.5774173140525818,
194
- "learning_rate": 4.989950097737552e-05,
195
- "loss": 1.542,
196
- "step": 2400
197
- },
198
- {
199
- "epoch": 0.06715555913718538,
200
- "grad_norm": 0.40837734937667847,
201
- "learning_rate": 4.988463168341755e-05,
202
- "loss": 1.5252,
203
- "step": 2500
204
- },
205
- {
206
- "epoch": 0.0698417815026728,
207
- "grad_norm": 0.5584208965301514,
208
- "learning_rate": 4.986873947971751e-05,
209
- "loss": 1.5311,
210
- "step": 2600
211
- },
212
- {
213
- "epoch": 0.0725280038681602,
214
- "grad_norm": 0.430813193321228,
215
- "learning_rate": 4.9851825019541644e-05,
216
- "loss": 1.5247,
217
- "step": 2700
218
- },
219
- {
220
- "epoch": 0.07521422623364762,
221
- "grad_norm": 0.3637406826019287,
222
- "learning_rate": 4.9833888998177165e-05,
223
- "loss": 1.5221,
224
- "step": 2800
225
- },
226
- {
227
- "epoch": 0.07790044859913503,
228
- "grad_norm": 0.530950129032135,
229
- "learning_rate": 4.981493215290366e-05,
230
- "loss": 1.5233,
231
- "step": 2900
232
- },
233
- {
234
- "epoch": 0.08058667096462245,
235
- "grad_norm": 1.674155831336975,
236
- "learning_rate": 4.979495526296279e-05,
237
- "loss": 1.5204,
238
- "step": 3000
239
- },
240
- {
241
- "epoch": 0.08058667096462245,
242
- "eval_loss": 1.5201951265335083,
243
- "eval_runtime": 3188.7059,
244
- "eval_sacrebleu": 96.36598044790144,
245
- "eval_samples_per_second": 83.02,
246
- "eval_steps_per_second": 0.649,
247
- "step": 3000
248
- },
249
- {
250
- "epoch": 0.08327289333010987,
251
- "grad_norm": 0.39490917325019836,
252
- "learning_rate": 4.977395914952627e-05,
253
- "loss": 1.5193,
254
- "step": 3100
255
- },
256
- {
257
- "epoch": 0.08595911569559728,
258
- "grad_norm": 0.37447476387023926,
259
- "learning_rate": 4.975194467566206e-05,
260
- "loss": 1.5185,
261
- "step": 3200
262
- },
263
- {
264
- "epoch": 0.0886453380610847,
265
- "grad_norm": 0.4466964900493622,
266
- "learning_rate": 4.9728912746298975e-05,
267
- "loss": 1.5143,
268
- "step": 3300
269
- },
270
- {
271
- "epoch": 0.09133156042657212,
272
- "grad_norm": 0.45723655819892883,
273
- "learning_rate": 4.9704864308189415e-05,
274
- "loss": 1.5119,
275
- "step": 3400
276
- },
277
- {
278
- "epoch": 0.09401778279205952,
279
- "grad_norm": 0.3492431938648224,
280
- "learning_rate": 4.967980034987048e-05,
281
- "loss": 1.5118,
282
- "step": 3500
283
- },
284
- {
285
- "epoch": 0.09670400515754694,
286
- "grad_norm": 0.6189600825309753,
287
- "learning_rate": 4.965372190162333e-05,
288
- "loss": 1.5079,
289
- "step": 3600
290
- },
291
- {
292
- "epoch": 0.09939022752303436,
293
- "grad_norm": 0.4467921555042267,
294
- "learning_rate": 4.962663003543083e-05,
295
- "loss": 1.5091,
296
- "step": 3700
297
- },
298
- {
299
- "epoch": 0.10207644988852177,
300
- "grad_norm": 0.4007367491722107,
301
- "learning_rate": 4.959852586493349e-05,
302
- "loss": 1.5111,
303
- "step": 3800
304
- },
305
- {
306
- "epoch": 0.10476267225400919,
307
- "grad_norm": 0.4409460127353668,
308
- "learning_rate": 4.9569410545383665e-05,
309
- "loss": 1.5099,
310
- "step": 3900
311
- },
312
- {
313
- "epoch": 0.1074488946194966,
314
- "grad_norm": 0.3737949728965759,
315
- "learning_rate": 4.953928527359812e-05,
316
- "loss": 1.5107,
317
- "step": 4000
318
- },
319
- {
320
- "epoch": 0.1074488946194966,
321
- "eval_loss": 1.5068457126617432,
322
- "eval_runtime": 3197.023,
323
- "eval_sacrebleu": 96.60403832626737,
324
- "eval_samples_per_second": 82.804,
325
- "eval_steps_per_second": 0.647,
326
- "step": 4000
327
- },
328
- {
329
- "epoch": 0.11013511698498402,
330
- "grad_norm": 0.3667586147785187,
331
- "learning_rate": 4.95081512879088e-05,
332
- "loss": 1.505,
333
- "step": 4100
334
- },
335
- {
336
- "epoch": 0.11282133935047144,
337
- "grad_norm": 0.3441518545150757,
338
- "learning_rate": 4.947600986811188e-05,
339
- "loss": 1.5082,
340
- "step": 4200
341
- },
342
- {
343
- "epoch": 0.11550756171595884,
344
- "grad_norm": 0.3704121708869934,
345
- "learning_rate": 4.9442862335415266e-05,
346
- "loss": 1.5026,
347
- "step": 4300
348
- },
349
- {
350
- "epoch": 0.11819378408144626,
351
- "grad_norm": 0.3782528042793274,
352
- "learning_rate": 4.940871005238418e-05,
353
- "loss": 1.5041,
354
- "step": 4400
355
- },
356
- {
357
- "epoch": 0.12088000644693368,
358
- "grad_norm": 0.3923707902431488,
359
- "learning_rate": 4.937355442288523e-05,
360
- "loss": 1.5008,
361
- "step": 4500
362
- },
363
- {
364
- "epoch": 0.12356622881242109,
365
- "grad_norm": 0.38463589549064636,
366
- "learning_rate": 4.933739689202862e-05,
367
- "loss": 1.5035,
368
- "step": 4600
369
- },
370
- {
371
- "epoch": 0.1262524511779085,
372
- "grad_norm": 0.44041115045547485,
373
- "learning_rate": 4.9300238946108856e-05,
374
- "loss": 1.5005,
375
- "step": 4700
376
- },
377
- {
378
- "epoch": 0.12893867354339592,
379
- "grad_norm": 0.41506507992744446,
380
- "learning_rate": 4.926208211254354e-05,
381
- "loss": 1.5014,
382
- "step": 4800
383
- },
384
- {
385
- "epoch": 0.13162489590888335,
386
- "grad_norm": 3.7683701515197754,
387
- "learning_rate": 4.922292795981065e-05,
388
- "loss": 3.7267,
389
- "step": 4900
390
- },
391
- {
392
- "epoch": 0.13431111827437076,
393
- "grad_norm": 3.7164273262023926,
394
- "learning_rate": 4.9182778097384055e-05,
395
- "loss": 5.7642,
396
- "step": 5000
397
- },
398
- {
399
- "epoch": 0.13431111827437076,
400
- "eval_loss": 4.501608371734619,
401
- "eval_runtime": 2795.6042,
402
- "eval_sacrebleu": 17.695398660139727,
403
- "eval_samples_per_second": 94.694,
404
- "eval_steps_per_second": 0.74,
405
- "step": 5000
406
- },
407
- {
408
- "epoch": 0.13699734063985816,
409
- "grad_norm": 7.479543209075928,
410
- "learning_rate": 4.914163417566734e-05,
411
- "loss": 2.1475,
412
- "step": 5100
413
- },
414
- {
415
- "epoch": 0.1396835630053456,
416
- "grad_norm": 0.3754720091819763,
417
- "learning_rate": 4.909949788592598e-05,
418
- "loss": 2.107,
419
- "step": 5200
420
- },
421
- {
422
- "epoch": 0.142369785370833,
423
- "grad_norm": 0.42401301860809326,
424
- "learning_rate": 4.9056370960217804e-05,
425
- "loss": 1.5074,
426
- "step": 5300
427
- },
428
- {
429
- "epoch": 0.1450560077363204,
430
- "grad_norm": 0.261708527803421,
431
- "learning_rate": 4.9012255171321814e-05,
432
- "loss": 1.5061,
433
- "step": 5400
434
- },
435
- {
436
- "epoch": 0.14774223010180781,
437
- "grad_norm": 0.35923367738723755,
438
- "learning_rate": 4.89671523326653e-05,
439
- "loss": 1.5086,
440
- "step": 5500
441
- },
442
- {
443
- "epoch": 0.15042845246729525,
444
- "grad_norm": 0.3558649718761444,
445
- "learning_rate": 4.892106429824931e-05,
446
- "loss": 1.5026,
447
- "step": 5600
448
- },
449
- {
450
- "epoch": 0.15311467483278265,
451
- "grad_norm": 0.3130311369895935,
452
- "learning_rate": 4.8873992962572413e-05,
453
- "loss": 1.5008,
454
- "step": 5700
455
- },
456
- {
457
- "epoch": 0.15580089719827006,
458
- "grad_norm": 0.40723615884780884,
459
- "learning_rate": 4.882594026055286e-05,
460
- "loss": 1.497,
461
- "step": 5800
462
- },
463
- {
464
- "epoch": 0.1584871195637575,
465
- "grad_norm": 0.3690826892852783,
466
- "learning_rate": 4.877690816744903e-05,
467
- "loss": 1.493,
468
- "step": 5900
469
- },
470
- {
471
- "epoch": 0.1611733419292449,
472
- "grad_norm": 0.4789445400238037,
473
- "learning_rate": 4.87268986987782e-05,
474
- "loss": 1.4936,
475
- "step": 6000
476
- },
477
- {
478
- "epoch": 0.1611733419292449,
479
- "eval_loss": 1.4912784099578857,
480
- "eval_runtime": 3279.389,
481
- "eval_sacrebleu": 96.84419951407857,
482
- "eval_samples_per_second": 80.724,
483
- "eval_steps_per_second": 0.631,
484
- "step": 6000
485
- },
486
- {
487
- "epoch": 0.1638595642947323,
488
- "grad_norm": 0.7140536308288574,
489
- "learning_rate": 4.8675913910233784e-05,
490
- "loss": 1.4966,
491
- "step": 6100
492
- },
493
- {
494
- "epoch": 0.16654578666021974,
495
- "grad_norm": 0.348763108253479,
496
- "learning_rate": 4.8623955897600724e-05,
497
- "loss": 1.4944,
498
- "step": 6200
499
- },
500
- {
501
- "epoch": 0.16923200902570715,
502
- "grad_norm": 0.33172258734703064,
503
- "learning_rate": 4.8571026796669416e-05,
504
- "loss": 1.4922,
505
- "step": 6300
506
- },
507
- {
508
- "epoch": 0.17191823139119455,
509
- "grad_norm": 0.42076775431632996,
510
- "learning_rate": 4.85171287831479e-05,
511
- "loss": 1.4935,
512
- "step": 6400
513
- },
514
- {
515
- "epoch": 0.174604453756682,
516
- "grad_norm": 0.4683443009853363,
517
- "learning_rate": 4.846226407257241e-05,
518
- "loss": 1.4956,
519
- "step": 6500
520
- },
521
- {
522
- "epoch": 0.1772906761221694,
523
- "grad_norm": 0.45604732632637024,
524
- "learning_rate": 4.840643492021628e-05,
525
- "loss": 1.4916,
526
- "step": 6600
527
- },
528
- {
529
- "epoch": 0.1799768984876568,
530
- "grad_norm": 0.4385334849357605,
531
- "learning_rate": 4.834964362099733e-05,
532
- "loss": 1.489,
533
- "step": 6700
534
- },
535
- {
536
- "epoch": 0.18266312085314423,
537
- "grad_norm": 0.48662105202674866,
538
- "learning_rate": 4.829189250938341e-05,
539
- "loss": 1.4893,
540
- "step": 6800
541
- },
542
- {
543
- "epoch": 0.18534934321863164,
544
- "grad_norm": 0.4047809839248657,
545
- "learning_rate": 4.8233183959296544e-05,
546
- "loss": 1.489,
547
- "step": 6900
548
- },
549
- {
550
- "epoch": 0.18803556558411905,
551
- "grad_norm": 0.31221145391464233,
552
- "learning_rate": 4.817352038401526e-05,
553
- "loss": 1.4888,
554
- "step": 7000
555
- },
556
- {
557
- "epoch": 0.18803556558411905,
558
- "eval_loss": 1.48853600025177,
559
- "eval_runtime": 3164.6016,
560
- "eval_sacrebleu": 96.86234024863172,
561
- "eval_samples_per_second": 83.652,
562
- "eval_steps_per_second": 0.654,
563
- "step": 7000
564
- },
565
- {
566
- "epoch": 0.19072178794960648,
567
- "grad_norm": 0.33513781428337097,
568
- "learning_rate": 4.8112904236075464e-05,
569
- "loss": 1.4882,
570
- "step": 7100
571
- },
572
- {
573
- "epoch": 0.1934080103150939,
574
- "grad_norm": 0.3263476490974426,
575
- "learning_rate": 4.8051338007169574e-05,
576
- "loss": 1.4931,
577
- "step": 7200
578
- },
579
- {
580
- "epoch": 0.1960942326805813,
581
- "grad_norm": 0.40779784321784973,
582
- "learning_rate": 4.798882422804413e-05,
583
- "loss": 1.4894,
584
- "step": 7300
585
- },
586
- {
587
- "epoch": 0.19878045504606873,
588
- "grad_norm": 0.3289216458797455,
589
- "learning_rate": 4.792536546839573e-05,
590
- "loss": 1.4872,
591
- "step": 7400
592
- },
593
- {
594
- "epoch": 0.20146667741155613,
595
- "grad_norm": 0.3560020625591278,
596
- "learning_rate": 4.786096433676545e-05,
597
- "loss": 1.4879,
598
- "step": 7500
599
- },
600
- {
601
- "epoch": 0.20415289977704354,
602
- "grad_norm": 0.34368178248405457,
603
- "learning_rate": 4.779562348043155e-05,
604
- "loss": 1.4888,
605
- "step": 7600
606
- },
607
- {
608
- "epoch": 0.20683912214253095,
609
- "grad_norm": 0.7768594622612,
610
- "learning_rate": 4.7729345585300716e-05,
611
- "loss": 1.4891,
612
- "step": 7700
613
- },
614
- {
615
- "epoch": 0.20952534450801838,
616
- "grad_norm": 0.32350119948387146,
617
- "learning_rate": 4.7662133375797635e-05,
618
- "loss": 1.4846,
619
- "step": 7800
620
- },
621
- {
622
- "epoch": 0.21221156687350579,
623
- "grad_norm": 0.3208902180194855,
624
- "learning_rate": 4.7593989614752974e-05,
625
- "loss": 1.4889,
626
- "step": 7900
627
- },
628
- {
629
- "epoch": 0.2148977892389932,
630
- "grad_norm": 0.2309061586856842,
631
- "learning_rate": 4.7524917103289863e-05,
632
- "loss": 1.4831,
633
- "step": 8000
634
- },
635
- {
636
- "epoch": 0.2148977892389932,
637
- "eval_loss": 1.4867380857467651,
638
- "eval_runtime": 3183.5875,
639
- "eval_sacrebleu": 96.88666750889345,
640
- "eval_samples_per_second": 83.153,
641
  "eval_steps_per_second": 0.65,
642
- "step": 8000
643
- },
644
- {
645
- "epoch": 0.21758401160448063,
646
- "grad_norm": 0.33346688747406006,
647
- "learning_rate": 4.74549186807087e-05,
648
- "loss": 1.4812,
649
- "step": 8100
650
- },
651
- {
652
- "epoch": 0.22027023396996803,
653
- "grad_norm": 0.5275943279266357,
654
- "learning_rate": 4.738399722437047e-05,
655
- "loss": 1.4874,
656
- "step": 8200
657
- },
658
- {
659
- "epoch": 0.22295645633545544,
660
- "grad_norm": 0.4352237284183502,
661
- "learning_rate": 4.731215564957847e-05,
662
- "loss": 1.4912,
663
- "step": 8300
664
- },
665
- {
666
- "epoch": 0.22564267870094287,
667
- "grad_norm": 0.3379707336425781,
668
- "learning_rate": 4.723939690945846e-05,
669
- "loss": 1.4852,
670
- "step": 8400
671
- },
672
- {
673
- "epoch": 0.22832890106643028,
674
- "grad_norm": 0.40677231550216675,
675
- "learning_rate": 4.7165723994837246e-05,
676
- "loss": 1.4857,
677
- "step": 8500
678
- },
679
- {
680
- "epoch": 0.23101512343191768,
681
- "grad_norm": 0.4737798273563385,
682
- "learning_rate": 4.709113993411981e-05,
683
- "loss": 1.4863,
684
- "step": 8600
685
- },
686
- {
687
- "epoch": 0.23370134579740512,
688
- "grad_norm": 0.33614835143089294,
689
- "learning_rate": 4.701564779316476e-05,
690
- "loss": 1.485,
691
- "step": 8700
692
- },
693
- {
694
- "epoch": 0.23638756816289253,
695
- "grad_norm": 0.4807875454425812,
696
- "learning_rate": 4.693925067515834e-05,
697
- "loss": 1.4875,
698
- "step": 8800
699
- },
700
- {
701
- "epoch": 0.23907379052837993,
702
- "grad_norm": 0.3510216772556305,
703
- "learning_rate": 4.686195172048682e-05,
704
- "loss": 1.4839,
705
- "step": 8900
706
- },
707
- {
708
- "epoch": 0.24176001289386737,
709
- "grad_norm": 0.24355977773666382,
710
- "learning_rate": 4.678375410660748e-05,
711
- "loss": 1.483,
712
- "step": 9000
713
- },
714
- {
715
- "epoch": 0.24176001289386737,
716
- "eval_loss": 1.4836901426315308,
717
- "eval_runtime": 3169.9641,
718
- "eval_sacrebleu": 96.9796698855444,
719
- "eval_samples_per_second": 83.511,
720
- "eval_steps_per_second": 0.653,
721
- "step": 9000
722
- },
723
- {
724
- "epoch": 0.24444623525935477,
725
- "grad_norm": 0.26222726702690125,
726
- "learning_rate": 4.6704661047917955e-05,
727
- "loss": 1.4855,
728
- "step": 9100
729
- },
730
- {
731
- "epoch": 0.24713245762484218,
732
- "grad_norm": 0.3217809796333313,
733
- "learning_rate": 4.6624675795624084e-05,
734
- "loss": 1.4823,
735
- "step": 9200
736
- },
737
- {
738
- "epoch": 0.2498186799903296,
739
- "grad_norm": 0.31736332178115845,
740
- "learning_rate": 4.6543801637606324e-05,
741
- "loss": 1.4791,
742
- "step": 9300
743
- },
744
- {
745
- "epoch": 0.252504902355817,
746
- "grad_norm": 0.42374491691589355,
747
- "learning_rate": 4.646204189828453e-05,
748
- "loss": 1.4793,
749
- "step": 9400
750
- },
751
- {
752
- "epoch": 0.25519112472130445,
753
- "grad_norm": 0.3236493468284607,
754
- "learning_rate": 4.637939993848137e-05,
755
- "loss": 1.4815,
756
- "step": 9500
757
- },
758
- {
759
- "epoch": 0.25787734708679183,
760
- "grad_norm": 0.3557586371898651,
761
- "learning_rate": 4.6295879155284125e-05,
762
- "loss": 1.4822,
763
- "step": 9600
764
- },
765
- {
766
- "epoch": 0.26056356945227926,
767
- "grad_norm": 0.2667744755744934,
768
- "learning_rate": 4.621148298190506e-05,
769
- "loss": 1.4804,
770
- "step": 9700
771
- },
772
- {
773
- "epoch": 0.2632497918177667,
774
- "grad_norm": 0.2917531132698059,
775
- "learning_rate": 4.61262148875403e-05,
776
- "loss": 1.4782,
777
- "step": 9800
778
- },
779
- {
780
- "epoch": 0.2659360141832541,
781
- "grad_norm": 0.32322603464126587,
782
- "learning_rate": 4.604007837722725e-05,
783
- "loss": 1.4815,
784
- "step": 9900
785
- },
786
- {
787
- "epoch": 0.2686222365487415,
788
- "grad_norm": 0.3380342125892639,
789
- "learning_rate": 4.595307699170045e-05,
790
- "loss": 1.4816,
791
- "step": 10000
792
- },
793
- {
794
- "epoch": 0.2686222365487415,
795
- "eval_loss": 1.4826488494873047,
796
- "eval_runtime": 3178.5973,
797
- "eval_sacrebleu": 97.05288129988465,
798
- "eval_samples_per_second": 83.284,
799
- "eval_steps_per_second": 0.651,
800
- "step": 10000
801
- },
802
- {
803
- "epoch": 0.27130845891422894,
804
- "grad_norm": 0.34705039858818054,
805
- "learning_rate": 4.586521430724612e-05,
806
- "loss": 1.4829,
807
- "step": 10100
808
- },
809
- {
810
- "epoch": 0.2739946812797163,
811
- "grad_norm": 0.3544181287288666,
812
- "learning_rate": 4.5776493935555065e-05,
813
- "loss": 1.481,
814
- "step": 10200
815
- },
816
- {
817
- "epoch": 0.27668090364520376,
818
- "grad_norm": 0.3179134726524353,
819
- "learning_rate": 4.568691952357428e-05,
820
- "loss": 1.4783,
821
- "step": 10300
822
- },
823
- {
824
- "epoch": 0.2793671260106912,
825
- "grad_norm": 0.32810860872268677,
826
- "learning_rate": 4.5596494753357005e-05,
827
- "loss": 1.4777,
828
- "step": 10400
829
- },
830
- {
831
- "epoch": 0.28205334837617857,
832
- "grad_norm": 0.36927270889282227,
833
- "learning_rate": 4.550522334191138e-05,
834
- "loss": 1.4776,
835
- "step": 10500
836
- },
837
- {
838
- "epoch": 0.284739570741666,
839
- "grad_norm": 0.2897135615348816,
840
- "learning_rate": 4.541310904104762e-05,
841
- "loss": 1.4768,
842
- "step": 10600
843
- },
844
- {
845
- "epoch": 0.28742579310715344,
846
- "grad_norm": 0.31894442439079285,
847
- "learning_rate": 4.532015563722387e-05,
848
- "loss": 1.4753,
849
- "step": 10700
850
- },
851
- {
852
- "epoch": 0.2901120154726408,
853
- "grad_norm": 0.3248063921928406,
854
- "learning_rate": 4.522636695139049e-05,
855
- "loss": 1.4755,
856
- "step": 10800
857
- },
858
- {
859
- "epoch": 0.29279823783812825,
860
- "grad_norm": 0.28601497411727905,
861
- "learning_rate": 4.513174683883299e-05,
862
- "loss": 1.4789,
863
- "step": 10900
864
- },
865
- {
866
- "epoch": 0.29548446020361563,
867
- "grad_norm": 0.263621985912323,
868
- "learning_rate": 4.50362991890136e-05,
869
- "loss": 1.4772,
870
- "step": 11000
871
- },
872
- {
873
- "epoch": 0.29548446020361563,
874
- "eval_loss": 1.483279824256897,
875
- "eval_runtime": 3185.3454,
876
- "eval_sacrebleu": 97.12672190258174,
877
- "eval_samples_per_second": 83.107,
878
- "eval_steps_per_second": 0.65,
879
- "step": 11000
880
- },
881
- {
882
- "epoch": 0.29817068256910306,
883
- "grad_norm": 0.32398489117622375,
884
- "learning_rate": 4.494002792541137e-05,
885
- "loss": 1.475,
886
- "step": 11100
887
- },
888
- {
889
- "epoch": 0.3008569049345905,
890
- "grad_norm": 0.28194740414619446,
891
- "learning_rate": 4.484293700536088e-05,
892
- "loss": 1.478,
893
- "step": 11200
894
- },
895
- {
896
- "epoch": 0.3035431273000779,
897
- "grad_norm": 0.25159361958503723,
898
- "learning_rate": 4.4745030419889555e-05,
899
- "loss": 1.4733,
900
- "step": 11300
901
- },
902
- {
903
- "epoch": 0.3062293496655653,
904
- "grad_norm": 0.33246248960494995,
905
- "learning_rate": 4.4646312193553675e-05,
906
- "loss": 1.4786,
907
- "step": 11400
908
- },
909
- {
910
- "epoch": 0.30891557203105274,
911
- "grad_norm": 0.2811319828033447,
912
- "learning_rate": 4.4546786384272855e-05,
913
- "loss": 1.4739,
914
- "step": 11500
915
- },
916
- {
917
- "epoch": 0.3116017943965401,
918
- "grad_norm": 0.3391018807888031,
919
- "learning_rate": 4.44464570831633e-05,
920
- "loss": 1.4758,
921
- "step": 11600
922
- },
923
- {
924
- "epoch": 0.31428801676202756,
925
- "grad_norm": 0.3480006158351898,
926
- "learning_rate": 4.434532841436962e-05,
927
- "loss": 1.4739,
928
- "step": 11700
929
- },
930
- {
931
- "epoch": 0.316974239127515,
932
- "grad_norm": 0.32334116101264954,
933
- "learning_rate": 4.42434045348953e-05,
934
- "loss": 1.4757,
935
- "step": 11800
936
- },
937
- {
938
- "epoch": 0.31966046149300237,
939
- "grad_norm": 0.35196009278297424,
940
- "learning_rate": 4.41406896344318e-05,
941
- "loss": 1.4759,
942
- "step": 11900
943
- },
944
- {
945
- "epoch": 0.3223466838584898,
946
- "grad_norm": 0.31370338797569275,
947
- "learning_rate": 4.4037187935186394e-05,
948
- "loss": 1.4735,
949
- "step": 12000
950
- },
951
- {
952
- "epoch": 0.3223466838584898,
953
- "eval_loss": 1.4803506135940552,
954
- "eval_runtime": 3210.9887,
955
- "eval_sacrebleu": 97.12202597988144,
956
- "eval_samples_per_second": 82.444,
957
- "eval_steps_per_second": 0.644,
958
- "step": 12000
959
- },
960
- {
961
- "epoch": 0.32503290622397724,
962
- "grad_norm": 0.36732399463653564,
963
- "learning_rate": 4.3932903691708534e-05,
964
- "loss": 1.4759,
965
- "step": 12100
966
- },
967
- {
968
- "epoch": 0.3277191285894646,
969
- "grad_norm": 0.23831887543201447,
970
- "learning_rate": 4.382784119071502e-05,
971
- "loss": 1.4719,
972
- "step": 12200
973
- },
974
- {
975
- "epoch": 0.33040535095495205,
976
- "grad_norm": 0.3706836700439453,
977
- "learning_rate": 4.372200475091376e-05,
978
- "loss": 1.4739,
979
- "step": 12300
980
- },
981
- {
982
- "epoch": 0.3330915733204395,
983
- "grad_norm": 0.3490886688232422,
984
- "learning_rate": 4.3615398722826276e-05,
985
- "loss": 1.474,
986
- "step": 12400
987
- },
988
- {
989
- "epoch": 0.33577779568592686,
990
- "grad_norm": 1.0285382270812988,
991
- "learning_rate": 4.350802748860882e-05,
992
- "loss": 1.4734,
993
- "step": 12500
994
- },
995
- {
996
- "epoch": 0.3384640180514143,
997
- "grad_norm": 0.26471439003944397,
998
- "learning_rate": 4.339989546187229e-05,
999
- "loss": 1.4745,
1000
- "step": 12600
1001
- },
1002
- {
1003
- "epoch": 0.34115024041690173,
1004
- "grad_norm": 0.30338284373283386,
1005
- "learning_rate": 4.329100708750079e-05,
1006
- "loss": 1.4727,
1007
- "step": 12700
1008
- },
1009
- {
1010
- "epoch": 0.3438364627823891,
1011
- "grad_norm": 0.290783166885376,
1012
- "learning_rate": 4.3181366841468896e-05,
1013
- "loss": 1.4722,
1014
- "step": 12800
1015
- },
1016
- {
1017
- "epoch": 0.34652268514787654,
1018
- "grad_norm": 0.2663359045982361,
1019
- "learning_rate": 4.3070979230657694e-05,
1020
- "loss": 1.4729,
1021
- "step": 12900
1022
- },
1023
- {
1024
- "epoch": 0.349208907513364,
1025
- "grad_norm": 1.0900615453720093,
1026
- "learning_rate": 4.29598487926695e-05,
1027
- "loss": 1.4707,
1028
- "step": 13000
1029
- },
1030
- {
1031
- "epoch": 0.349208907513364,
1032
- "eval_loss": 1.4730943441390991,
1033
- "eval_runtime": 3189.0879,
1034
- "eval_sacrebleu": 97.19666890806242,
1035
- "eval_samples_per_second": 83.01,
1036
- "eval_steps_per_second": 0.649,
1037
- "step": 13000
1038
  }
1039
  ],
1040
  "logging_steps": 100,
@@ -1063,7 +115,7 @@
1063
  "attributes": {}
1064
  }
1065
  },
1066
- "total_flos": 2.1543220310074982e+17,
1067
  "train_batch_size": 64,
1068
  "trial_name": null,
1069
  "trial_params": null
 
1
  {
2
+ "best_global_step": 1000,
3
+ "best_metric": 1.550229787826538,
4
+ "best_model_checkpoint": "hieptt/vietnamese-correction-finetuning/checkpoint-1000",
5
+ "epoch": 0.02686222365487415,
6
  "eval_steps": 1000,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
82
  {
83
  "epoch": 0.02686222365487415,
84
  "eval_loss": 1.550229787826538,
85
+ "eval_runtime": 3183.759,
86
  "eval_sacrebleu": 96.21455916515954,
87
+ "eval_samples_per_second": 83.149,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  "eval_steps_per_second": 0.65,
89
+ "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  }
91
  ],
92
  "logging_steps": 100,
 
115
  "attributes": {}
116
  }
117
  },
118
+ "total_flos": 1.6604548281925632e+16,
119
  "train_batch_size": 64,
120
  "trial_name": null,
121
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce0cdcde6ce6ad61a433903430eaa7d57da2bdffd8fa9982262860bcf6d0b1b1
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:522844a580fb08e9801add7b55973eec21ec1422f4a0aab9509ee801930f6aa1
3
  size 6033