kiritan commited on
Commit
b38bbd3
·
verified ·
1 Parent(s): e15b691

Training in progress, step 20000, checkpoint

Browse files
last-checkpoint/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ade9c3e43b2a1550492ecf4b91e9228af429dcf0d7b1c09aea81ebc7a5842d20
3
+ size 761059696
last-checkpoint/global_step20000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58f46b37e83d56bff8e8b49fc01d48e56f7c2f6034abd01b65de03f862980853
3
+ size 129965712
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step18000
 
1
+ global_step20000
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad61c1d6676decf968d7cc262cb88d3340a58571f59eb03dc41c8694daf8e28e
3
  size 181508256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b22de776648c8fc55dbdb37a34986669b21215c0d0cc7d4355ba0090a00314ad
3
  size 181508256
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76da6beb47ba6fea32e3903f5fb6715e6c7d9cfa9223676725c0a4f3ab456246
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:907910c4d615478ec9b347b176d82b2a1be77f33469156f9f4b3321b8fe69355
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:389fc41872de99e18419ed46bb961f8c27ddde2cc92d05129c78c005704b1713
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29c7a79b53a589de48d3b7a21df9c0d024be4dea79f68869f72fdc01ae3b212a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 96.5865157944366,
3
- "best_model_checkpoint": "./iteboshi_temp/checkpoint-17000",
4
- "epoch": 19.823788546255507,
5
  "eval_steps": 1000,
6
- "global_step": 18000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5227,6 +5227,586 @@
5227
  "eval_steps_per_second": 3.303,
5228
  "eval_wer": 96.61480433757662,
5229
  "step": 18000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5230
  }
5231
  ],
5232
  "logging_steps": 25,
@@ -5241,12 +5821,12 @@
5241
  "should_evaluate": false,
5242
  "should_log": false,
5243
  "should_save": true,
5244
- "should_training_stop": false
5245
  },
5246
  "attributes": {}
5247
  }
5248
  },
5249
- "total_flos": 3.0387073320631665e+19,
5250
  "train_batch_size": 4,
5251
  "trial_name": null,
5252
  "trial_params": null
 
1
  {
2
+ "best_metric": 96.57708628005658,
3
+ "best_model_checkpoint": "./iteboshi_temp/checkpoint-19000",
4
+ "epoch": 22.026431718061673,
5
  "eval_steps": 1000,
6
+ "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5227
  "eval_steps_per_second": 3.303,
5228
  "eval_wer": 96.61480433757662,
5229
  "step": 18000
5230
+ },
5231
+ {
5232
+ "epoch": 19.851321585903083,
5233
+ "grad_norm": 0.0205672699958086,
5234
+ "learning_rate": 2.025641025641026e-06,
5235
+ "loss": 0.0034,
5236
+ "step": 18025
5237
+ },
5238
+ {
5239
+ "epoch": 19.878854625550662,
5240
+ "grad_norm": 0.017052460461854935,
5241
+ "learning_rate": 2.0000000000000003e-06,
5242
+ "loss": 0.0024,
5243
+ "step": 18050
5244
+ },
5245
+ {
5246
+ "epoch": 19.90638766519824,
5247
+ "grad_norm": 0.023273777216672897,
5248
+ "learning_rate": 1.9743589743589744e-06,
5249
+ "loss": 0.0024,
5250
+ "step": 18075
5251
+ },
5252
+ {
5253
+ "epoch": 19.933920704845814,
5254
+ "grad_norm": 0.01854720339179039,
5255
+ "learning_rate": 1.948717948717949e-06,
5256
+ "loss": 0.0029,
5257
+ "step": 18100
5258
+ },
5259
+ {
5260
+ "epoch": 19.961453744493394,
5261
+ "grad_norm": 0.023288726806640625,
5262
+ "learning_rate": 1.9230769230769234e-06,
5263
+ "loss": 0.0025,
5264
+ "step": 18125
5265
+ },
5266
+ {
5267
+ "epoch": 19.98898678414097,
5268
+ "grad_norm": 0.019170600920915604,
5269
+ "learning_rate": 1.8974358974358975e-06,
5270
+ "loss": 0.0024,
5271
+ "step": 18150
5272
+ },
5273
+ {
5274
+ "epoch": 20.016519823788546,
5275
+ "grad_norm": 0.013864605687558651,
5276
+ "learning_rate": 1.871794871794872e-06,
5277
+ "loss": 0.0021,
5278
+ "step": 18175
5279
+ },
5280
+ {
5281
+ "epoch": 20.044052863436125,
5282
+ "grad_norm": 0.015261122956871986,
5283
+ "learning_rate": 1.8461538461538465e-06,
5284
+ "loss": 0.002,
5285
+ "step": 18200
5286
+ },
5287
+ {
5288
+ "epoch": 20.0715859030837,
5289
+ "grad_norm": 0.015079254284501076,
5290
+ "learning_rate": 1.8205128205128205e-06,
5291
+ "loss": 0.0024,
5292
+ "step": 18225
5293
+ },
5294
+ {
5295
+ "epoch": 20.099118942731277,
5296
+ "grad_norm": 0.013841504231095314,
5297
+ "learning_rate": 1.794871794871795e-06,
5298
+ "loss": 0.003,
5299
+ "step": 18250
5300
+ },
5301
+ {
5302
+ "epoch": 20.126651982378856,
5303
+ "grad_norm": 0.017009438946843147,
5304
+ "learning_rate": 1.7692307692307695e-06,
5305
+ "loss": 0.002,
5306
+ "step": 18275
5307
+ },
5308
+ {
5309
+ "epoch": 20.154185022026432,
5310
+ "grad_norm": 0.01796025224030018,
5311
+ "learning_rate": 1.7435897435897436e-06,
5312
+ "loss": 0.0019,
5313
+ "step": 18300
5314
+ },
5315
+ {
5316
+ "epoch": 20.181718061674008,
5317
+ "grad_norm": 0.020462974905967712,
5318
+ "learning_rate": 1.717948717948718e-06,
5319
+ "loss": 0.002,
5320
+ "step": 18325
5321
+ },
5322
+ {
5323
+ "epoch": 20.209251101321588,
5324
+ "grad_norm": 0.0168469101190567,
5325
+ "learning_rate": 1.6923076923076926e-06,
5326
+ "loss": 0.002,
5327
+ "step": 18350
5328
+ },
5329
+ {
5330
+ "epoch": 20.236784140969164,
5331
+ "grad_norm": 0.015358548611402512,
5332
+ "learning_rate": 1.6666666666666667e-06,
5333
+ "loss": 0.0019,
5334
+ "step": 18375
5335
+ },
5336
+ {
5337
+ "epoch": 20.26431718061674,
5338
+ "grad_norm": 0.01623690128326416,
5339
+ "learning_rate": 1.6410256410256412e-06,
5340
+ "loss": 0.0019,
5341
+ "step": 18400
5342
+ },
5343
+ {
5344
+ "epoch": 20.291850220264315,
5345
+ "grad_norm": 0.016147859394550323,
5346
+ "learning_rate": 1.6153846153846157e-06,
5347
+ "loss": 0.002,
5348
+ "step": 18425
5349
+ },
5350
+ {
5351
+ "epoch": 20.319383259911895,
5352
+ "grad_norm": 0.023021413013339043,
5353
+ "learning_rate": 1.5897435897435897e-06,
5354
+ "loss": 0.0023,
5355
+ "step": 18450
5356
+ },
5357
+ {
5358
+ "epoch": 20.34691629955947,
5359
+ "grad_norm": 0.0137328477576375,
5360
+ "learning_rate": 1.5641025641025642e-06,
5361
+ "loss": 0.0019,
5362
+ "step": 18475
5363
+ },
5364
+ {
5365
+ "epoch": 20.374449339207047,
5366
+ "grad_norm": 0.01765141263604164,
5367
+ "learning_rate": 1.5384615384615387e-06,
5368
+ "loss": 0.0022,
5369
+ "step": 18500
5370
+ },
5371
+ {
5372
+ "epoch": 20.401982378854626,
5373
+ "grad_norm": 0.015655307099223137,
5374
+ "learning_rate": 1.5128205128205128e-06,
5375
+ "loss": 0.0038,
5376
+ "step": 18525
5377
+ },
5378
+ {
5379
+ "epoch": 20.429515418502202,
5380
+ "grad_norm": 0.021192258223891258,
5381
+ "learning_rate": 1.4871794871794873e-06,
5382
+ "loss": 0.0021,
5383
+ "step": 18550
5384
+ },
5385
+ {
5386
+ "epoch": 20.457048458149778,
5387
+ "grad_norm": 0.014702214859426022,
5388
+ "learning_rate": 1.4615384615384618e-06,
5389
+ "loss": 0.0019,
5390
+ "step": 18575
5391
+ },
5392
+ {
5393
+ "epoch": 20.484581497797357,
5394
+ "grad_norm": 0.018568340688943863,
5395
+ "learning_rate": 1.4358974358974359e-06,
5396
+ "loss": 0.0018,
5397
+ "step": 18600
5398
+ },
5399
+ {
5400
+ "epoch": 20.512114537444933,
5401
+ "grad_norm": 0.020032202824950218,
5402
+ "learning_rate": 1.4102564102564104e-06,
5403
+ "loss": 0.002,
5404
+ "step": 18625
5405
+ },
5406
+ {
5407
+ "epoch": 20.53964757709251,
5408
+ "grad_norm": 0.01590747945010662,
5409
+ "learning_rate": 1.3846153846153848e-06,
5410
+ "loss": 0.002,
5411
+ "step": 18650
5412
+ },
5413
+ {
5414
+ "epoch": 20.56718061674009,
5415
+ "grad_norm": 0.014293953776359558,
5416
+ "learning_rate": 1.358974358974359e-06,
5417
+ "loss": 0.002,
5418
+ "step": 18675
5419
+ },
5420
+ {
5421
+ "epoch": 20.594713656387665,
5422
+ "grad_norm": 0.0199781134724617,
5423
+ "learning_rate": 1.3333333333333334e-06,
5424
+ "loss": 0.0019,
5425
+ "step": 18700
5426
+ },
5427
+ {
5428
+ "epoch": 20.62224669603524,
5429
+ "grad_norm": 0.018757140263915062,
5430
+ "learning_rate": 1.307692307692308e-06,
5431
+ "loss": 0.0022,
5432
+ "step": 18725
5433
+ },
5434
+ {
5435
+ "epoch": 20.64977973568282,
5436
+ "grad_norm": 0.021107446402311325,
5437
+ "learning_rate": 1.282051282051282e-06,
5438
+ "loss": 0.0029,
5439
+ "step": 18750
5440
+ },
5441
+ {
5442
+ "epoch": 20.677312775330396,
5443
+ "grad_norm": 0.018470246344804764,
5444
+ "learning_rate": 1.2564102564102565e-06,
5445
+ "loss": 0.0021,
5446
+ "step": 18775
5447
+ },
5448
+ {
5449
+ "epoch": 20.704845814977972,
5450
+ "grad_norm": 0.01821320876479149,
5451
+ "learning_rate": 1.230769230769231e-06,
5452
+ "loss": 0.0022,
5453
+ "step": 18800
5454
+ },
5455
+ {
5456
+ "epoch": 20.73237885462555,
5457
+ "grad_norm": 0.15323257446289062,
5458
+ "learning_rate": 1.2051282051282053e-06,
5459
+ "loss": 0.0024,
5460
+ "step": 18825
5461
+ },
5462
+ {
5463
+ "epoch": 20.759911894273127,
5464
+ "grad_norm": 0.015295284800231457,
5465
+ "learning_rate": 1.1794871794871795e-06,
5466
+ "loss": 0.002,
5467
+ "step": 18850
5468
+ },
5469
+ {
5470
+ "epoch": 20.787444933920703,
5471
+ "grad_norm": 0.015194980427622795,
5472
+ "learning_rate": 1.153846153846154e-06,
5473
+ "loss": 0.0018,
5474
+ "step": 18875
5475
+ },
5476
+ {
5477
+ "epoch": 20.814977973568283,
5478
+ "grad_norm": 0.05270170047879219,
5479
+ "learning_rate": 1.1282051282051283e-06,
5480
+ "loss": 0.0024,
5481
+ "step": 18900
5482
+ },
5483
+ {
5484
+ "epoch": 20.84251101321586,
5485
+ "grad_norm": 0.01960138976573944,
5486
+ "learning_rate": 1.1025641025641026e-06,
5487
+ "loss": 0.0021,
5488
+ "step": 18925
5489
+ },
5490
+ {
5491
+ "epoch": 20.870044052863435,
5492
+ "grad_norm": 0.02073553018271923,
5493
+ "learning_rate": 1.076923076923077e-06,
5494
+ "loss": 0.0019,
5495
+ "step": 18950
5496
+ },
5497
+ {
5498
+ "epoch": 20.897577092511014,
5499
+ "grad_norm": 0.01615351065993309,
5500
+ "learning_rate": 1.0512820512820514e-06,
5501
+ "loss": 0.002,
5502
+ "step": 18975
5503
+ },
5504
+ {
5505
+ "epoch": 20.92511013215859,
5506
+ "grad_norm": 0.021563587710261345,
5507
+ "learning_rate": 1.0256410256410257e-06,
5508
+ "loss": 0.0021,
5509
+ "step": 19000
5510
+ },
5511
+ {
5512
+ "epoch": 20.92511013215859,
5513
+ "eval_cer": 55.589054600896446,
5514
+ "eval_loss": 1.0507194995880127,
5515
+ "eval_runtime": 844.8487,
5516
+ "eval_samples_per_second": 12.524,
5517
+ "eval_steps_per_second": 3.132,
5518
+ "eval_wer": 96.57708628005658,
5519
+ "step": 19000
5520
+ },
5521
+ {
5522
+ "epoch": 20.952643171806166,
5523
+ "grad_norm": 0.016109561547636986,
5524
+ "learning_rate": 1.0000000000000002e-06,
5525
+ "loss": 0.002,
5526
+ "step": 19025
5527
+ },
5528
+ {
5529
+ "epoch": 20.980176211453745,
5530
+ "grad_norm": 0.016952887177467346,
5531
+ "learning_rate": 9.743589743589745e-07,
5532
+ "loss": 0.002,
5533
+ "step": 19050
5534
+ },
5535
+ {
5536
+ "epoch": 21.00770925110132,
5537
+ "grad_norm": 0.01466713659465313,
5538
+ "learning_rate": 9.487179487179487e-07,
5539
+ "loss": 0.002,
5540
+ "step": 19075
5541
+ },
5542
+ {
5543
+ "epoch": 21.035242290748897,
5544
+ "grad_norm": 0.01427449006587267,
5545
+ "learning_rate": 9.230769230769232e-07,
5546
+ "loss": 0.002,
5547
+ "step": 19100
5548
+ },
5549
+ {
5550
+ "epoch": 21.062775330396477,
5551
+ "grad_norm": 0.016093429177999496,
5552
+ "learning_rate": 8.974358974358975e-07,
5553
+ "loss": 0.0018,
5554
+ "step": 19125
5555
+ },
5556
+ {
5557
+ "epoch": 21.090308370044053,
5558
+ "grad_norm": 0.019426781684160233,
5559
+ "learning_rate": 8.717948717948718e-07,
5560
+ "loss": 0.0018,
5561
+ "step": 19150
5562
+ },
5563
+ {
5564
+ "epoch": 21.11784140969163,
5565
+ "grad_norm": 0.0124832633882761,
5566
+ "learning_rate": 8.461538461538463e-07,
5567
+ "loss": 0.0017,
5568
+ "step": 19175
5569
+ },
5570
+ {
5571
+ "epoch": 21.145374449339208,
5572
+ "grad_norm": 0.01551234070211649,
5573
+ "learning_rate": 8.205128205128206e-07,
5574
+ "loss": 0.0018,
5575
+ "step": 19200
5576
+ },
5577
+ {
5578
+ "epoch": 21.172907488986784,
5579
+ "grad_norm": 0.01290995441377163,
5580
+ "learning_rate": 7.948717948717949e-07,
5581
+ "loss": 0.0019,
5582
+ "step": 19225
5583
+ },
5584
+ {
5585
+ "epoch": 21.20044052863436,
5586
+ "grad_norm": 0.012107312679290771,
5587
+ "learning_rate": 7.692307692307694e-07,
5588
+ "loss": 0.0018,
5589
+ "step": 19250
5590
+ },
5591
+ {
5592
+ "epoch": 21.22797356828194,
5593
+ "grad_norm": 0.013243271969258785,
5594
+ "learning_rate": 7.435897435897436e-07,
5595
+ "loss": 0.0018,
5596
+ "step": 19275
5597
+ },
5598
+ {
5599
+ "epoch": 21.255506607929515,
5600
+ "grad_norm": 0.01567436195909977,
5601
+ "learning_rate": 7.179487179487179e-07,
5602
+ "loss": 0.0017,
5603
+ "step": 19300
5604
+ },
5605
+ {
5606
+ "epoch": 21.28303964757709,
5607
+ "grad_norm": 0.017800329253077507,
5608
+ "learning_rate": 6.923076923076924e-07,
5609
+ "loss": 0.0017,
5610
+ "step": 19325
5611
+ },
5612
+ {
5613
+ "epoch": 21.31057268722467,
5614
+ "grad_norm": 0.012769469991326332,
5615
+ "learning_rate": 6.666666666666667e-07,
5616
+ "loss": 0.0018,
5617
+ "step": 19350
5618
+ },
5619
+ {
5620
+ "epoch": 21.338105726872246,
5621
+ "grad_norm": 0.013936811126768589,
5622
+ "learning_rate": 6.41025641025641e-07,
5623
+ "loss": 0.0018,
5624
+ "step": 19375
5625
+ },
5626
+ {
5627
+ "epoch": 21.365638766519822,
5628
+ "grad_norm": 0.017832236364483833,
5629
+ "learning_rate": 6.153846153846155e-07,
5630
+ "loss": 0.0018,
5631
+ "step": 19400
5632
+ },
5633
+ {
5634
+ "epoch": 21.393171806167402,
5635
+ "grad_norm": 0.016330501064658165,
5636
+ "learning_rate": 5.897435897435898e-07,
5637
+ "loss": 0.0019,
5638
+ "step": 19425
5639
+ },
5640
+ {
5641
+ "epoch": 21.420704845814978,
5642
+ "grad_norm": 0.012162838131189346,
5643
+ "learning_rate": 5.641025641025642e-07,
5644
+ "loss": 0.0018,
5645
+ "step": 19450
5646
+ },
5647
+ {
5648
+ "epoch": 21.448237885462554,
5649
+ "grad_norm": 0.01499269250780344,
5650
+ "learning_rate": 5.384615384615386e-07,
5651
+ "loss": 0.0019,
5652
+ "step": 19475
5653
+ },
5654
+ {
5655
+ "epoch": 21.475770925110133,
5656
+ "grad_norm": 0.013169058598577976,
5657
+ "learning_rate": 5.128205128205128e-07,
5658
+ "loss": 0.0019,
5659
+ "step": 19500
5660
+ },
5661
+ {
5662
+ "epoch": 21.50330396475771,
5663
+ "grad_norm": 0.011718913912773132,
5664
+ "learning_rate": 4.871794871794872e-07,
5665
+ "loss": 0.0018,
5666
+ "step": 19525
5667
+ },
5668
+ {
5669
+ "epoch": 21.530837004405285,
5670
+ "grad_norm": 0.01436688657850027,
5671
+ "learning_rate": 4.615384615384616e-07,
5672
+ "loss": 0.0019,
5673
+ "step": 19550
5674
+ },
5675
+ {
5676
+ "epoch": 21.558370044052865,
5677
+ "grad_norm": 0.012899577617645264,
5678
+ "learning_rate": 4.358974358974359e-07,
5679
+ "loss": 0.0016,
5680
+ "step": 19575
5681
+ },
5682
+ {
5683
+ "epoch": 21.58590308370044,
5684
+ "grad_norm": 0.018741106614470482,
5685
+ "learning_rate": 4.102564102564103e-07,
5686
+ "loss": 0.0018,
5687
+ "step": 19600
5688
+ },
5689
+ {
5690
+ "epoch": 21.613436123348016,
5691
+ "grad_norm": 0.011879649944603443,
5692
+ "learning_rate": 3.846153846153847e-07,
5693
+ "loss": 0.0018,
5694
+ "step": 19625
5695
+ },
5696
+ {
5697
+ "epoch": 21.640969162995596,
5698
+ "grad_norm": 0.01298064086586237,
5699
+ "learning_rate": 3.5897435897435896e-07,
5700
+ "loss": 0.0018,
5701
+ "step": 19650
5702
+ },
5703
+ {
5704
+ "epoch": 21.66850220264317,
5705
+ "grad_norm": 0.0132521390914917,
5706
+ "learning_rate": 3.3333333333333335e-07,
5707
+ "loss": 0.0017,
5708
+ "step": 19675
5709
+ },
5710
+ {
5711
+ "epoch": 21.696035242290748,
5712
+ "grad_norm": 0.012232212349772453,
5713
+ "learning_rate": 3.0769230769230774e-07,
5714
+ "loss": 0.0022,
5715
+ "step": 19700
5716
+ },
5717
+ {
5718
+ "epoch": 21.723568281938327,
5719
+ "grad_norm": 0.0125159602612257,
5720
+ "learning_rate": 2.820512820512821e-07,
5721
+ "loss": 0.0021,
5722
+ "step": 19725
5723
+ },
5724
+ {
5725
+ "epoch": 21.751101321585903,
5726
+ "grad_norm": 0.012911227531731129,
5727
+ "learning_rate": 2.564102564102564e-07,
5728
+ "loss": 0.0018,
5729
+ "step": 19750
5730
+ },
5731
+ {
5732
+ "epoch": 21.77863436123348,
5733
+ "grad_norm": 0.016304660588502884,
5734
+ "learning_rate": 2.307692307692308e-07,
5735
+ "loss": 0.0018,
5736
+ "step": 19775
5737
+ },
5738
+ {
5739
+ "epoch": 21.80616740088106,
5740
+ "grad_norm": 0.0178163331001997,
5741
+ "learning_rate": 2.0512820512820514e-07,
5742
+ "loss": 0.0018,
5743
+ "step": 19800
5744
+ },
5745
+ {
5746
+ "epoch": 21.833700440528634,
5747
+ "grad_norm": 0.013485315255820751,
5748
+ "learning_rate": 1.7948717948717948e-07,
5749
+ "loss": 0.0017,
5750
+ "step": 19825
5751
+ },
5752
+ {
5753
+ "epoch": 21.86123348017621,
5754
+ "grad_norm": 0.021611526608467102,
5755
+ "learning_rate": 1.5384615384615387e-07,
5756
+ "loss": 0.0018,
5757
+ "step": 19850
5758
+ },
5759
+ {
5760
+ "epoch": 21.88876651982379,
5761
+ "grad_norm": 0.014628293924033642,
5762
+ "learning_rate": 1.282051282051282e-07,
5763
+ "loss": 0.0017,
5764
+ "step": 19875
5765
+ },
5766
+ {
5767
+ "epoch": 21.916299559471366,
5768
+ "grad_norm": 0.013321286998689175,
5769
+ "learning_rate": 1.0256410256410257e-07,
5770
+ "loss": 0.0017,
5771
+ "step": 19900
5772
+ },
5773
+ {
5774
+ "epoch": 21.94383259911894,
5775
+ "grad_norm": 0.016186168417334557,
5776
+ "learning_rate": 7.692307692307694e-08,
5777
+ "loss": 0.0018,
5778
+ "step": 19925
5779
+ },
5780
+ {
5781
+ "epoch": 21.97136563876652,
5782
+ "grad_norm": 0.015817852690815926,
5783
+ "learning_rate": 5.1282051282051286e-08,
5784
+ "loss": 0.0017,
5785
+ "step": 19950
5786
+ },
5787
+ {
5788
+ "epoch": 21.998898678414097,
5789
+ "grad_norm": 0.01383238285779953,
5790
+ "learning_rate": 2.5641025641025643e-08,
5791
+ "loss": 0.0018,
5792
+ "step": 19975
5793
+ },
5794
+ {
5795
+ "epoch": 22.026431718061673,
5796
+ "grad_norm": 0.0143059641122818,
5797
+ "learning_rate": 0.0,
5798
+ "loss": 0.0017,
5799
+ "step": 20000
5800
+ },
5801
+ {
5802
+ "epoch": 22.026431718061673,
5803
+ "eval_cer": 54.87888757694909,
5804
+ "eval_loss": 1.0545215606689453,
5805
+ "eval_runtime": 819.2896,
5806
+ "eval_samples_per_second": 12.915,
5807
+ "eval_steps_per_second": 3.23,
5808
+ "eval_wer": 96.57708628005658,
5809
+ "step": 20000
5810
  }
5811
  ],
5812
  "logging_steps": 25,
 
5821
  "should_evaluate": false,
5822
  "should_log": false,
5823
  "should_save": true,
5824
+ "should_training_stop": true
5825
  },
5826
  "attributes": {}
5827
  }
5828
  },
5829
+ "total_flos": 3.376341480070185e+19,
5830
  "train_batch_size": 4,
5831
  "trial_name": null,
5832
  "trial_params": null