Fanucci commited on
Commit
13f9e89
·
verified ·
1 Parent(s): f77b95b

Training in progress, step 1400, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f17745bedcf2691c98c5c38bed681bcbf85273eca7d4b073a3a5217298d9a2d3
3
  size 671149168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5210b8e7a636162c39378a8939d4c31c398430ff7e09e1eff0febb1fb0faf8f6
3
  size 671149168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cff726efcf2b6d6ea5f5b99a5d48fd9ea0c135385b1d0d2cca75befa63a0bc1a
3
  size 1342555602
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7af4714670ec3d95954fb016cb9b1f5d2c80a1f74089d52ff0460d7ffed9b6f4
3
  size 1342555602
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fd83e1e15f70d373e2aa254f4001309c2d36b697f5a068bcfbf1e9855d8a484
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd5cd67dee7267dcb537fb31aede3e18d0e3e45fa623251f8fd6ad78c9019d93
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e7f5cdb6cd5b1751a2a041a35a236dff4650191e9a4687c5450d12c2fac14d6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b24b043640c4c0fec08278af8607ef0c8db96b29dad4d3ed04b5e555adfd0924
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 3.1351470947265625,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-600",
4
- "epoch": 0.3839385698288274,
5
  "eval_steps": 200,
6
- "global_step": 1200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -8463,6 +8463,1414 @@
8463
  "eval_samples_per_second": 5.635,
8464
  "eval_steps_per_second": 1.409,
8465
  "step": 1200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8466
  }
8467
  ],
8468
  "logging_steps": 1,
@@ -8477,7 +9885,7 @@
8477
  "early_stopping_threshold": 0.0
8478
  },
8479
  "attributes": {
8480
- "early_stopping_patience_counter": 3
8481
  }
8482
  },
8483
  "TrainerControl": {
@@ -8491,7 +9899,7 @@
8491
  "attributes": {}
8492
  }
8493
  },
8494
- "total_flos": 1.1047304629384643e+18,
8495
  "train_batch_size": 4,
8496
  "trial_name": null,
8497
  "trial_params": null
 
1
  {
2
  "best_metric": 3.1351470947265625,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-600",
4
+ "epoch": 0.4479283314669653,
5
  "eval_steps": 200,
6
+ "global_step": 1400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
8463
  "eval_samples_per_second": 5.635,
8464
  "eval_steps_per_second": 1.409,
8465
  "step": 1200
8466
+ },
8467
+ {
8468
+ "epoch": 0.3842585186370181,
8469
+ "grad_norm": 27.103090286254883,
8470
+ "learning_rate": 0.0002740839244415924,
8471
+ "loss": 6.8103,
8472
+ "step": 1201
8473
+ },
8474
+ {
8475
+ "epoch": 0.3845784674452088,
8476
+ "grad_norm": 11.38796615600586,
8477
+ "learning_rate": 0.00027389965247590016,
8478
+ "loss": 6.7159,
8479
+ "step": 1202
8480
+ },
8481
+ {
8482
+ "epoch": 0.38489841625339943,
8483
+ "grad_norm": 9.166263580322266,
8484
+ "learning_rate": 0.0002737153127150736,
8485
+ "loss": 6.5407,
8486
+ "step": 1203
8487
+ },
8488
+ {
8489
+ "epoch": 0.38521836506159013,
8490
+ "grad_norm": 10.899460792541504,
8491
+ "learning_rate": 0.00027353090535469065,
8492
+ "loss": 6.6115,
8493
+ "step": 1204
8494
+ },
8495
+ {
8496
+ "epoch": 0.3855383138697808,
8497
+ "grad_norm": 12.029467582702637,
8498
+ "learning_rate": 0.00027334643059040035,
8499
+ "loss": 6.6065,
8500
+ "step": 1205
8501
+ },
8502
+ {
8503
+ "epoch": 0.3858582626779715,
8504
+ "grad_norm": 8.318588256835938,
8505
+ "learning_rate": 0.0002731618886179235,
8506
+ "loss": 6.8032,
8507
+ "step": 1206
8508
+ },
8509
+ {
8510
+ "epoch": 0.3861782114861622,
8511
+ "grad_norm": 8.223920822143555,
8512
+ "learning_rate": 0.00027297727963305227,
8513
+ "loss": 6.5092,
8514
+ "step": 1207
8515
+ },
8516
+ {
8517
+ "epoch": 0.3864981602943529,
8518
+ "grad_norm": 22.681262969970703,
8519
+ "learning_rate": 0.0002727926038316499,
8520
+ "loss": 6.8623,
8521
+ "step": 1208
8522
+ },
8523
+ {
8524
+ "epoch": 0.3868181091025436,
8525
+ "grad_norm": 10.376825332641602,
8526
+ "learning_rate": 0.0002726078614096504,
8527
+ "loss": 6.6216,
8528
+ "step": 1209
8529
+ },
8530
+ {
8531
+ "epoch": 0.3871380579107343,
8532
+ "grad_norm": 5.749302864074707,
8533
+ "learning_rate": 0.0002724230525630586,
8534
+ "loss": 6.6049,
8535
+ "step": 1210
8536
+ },
8537
+ {
8538
+ "epoch": 0.387458006718925,
8539
+ "grad_norm": 8.080632209777832,
8540
+ "learning_rate": 0.00027223817748794985,
8541
+ "loss": 6.6569,
8542
+ "step": 1211
8543
+ },
8544
+ {
8545
+ "epoch": 0.38777795552711564,
8546
+ "grad_norm": 12.558454513549805,
8547
+ "learning_rate": 0.00027205323638046947,
8548
+ "loss": 6.3948,
8549
+ "step": 1212
8550
+ },
8551
+ {
8552
+ "epoch": 0.38809790433530633,
8553
+ "grad_norm": 9.190141677856445,
8554
+ "learning_rate": 0.0002718682294368331,
8555
+ "loss": 6.6437,
8556
+ "step": 1213
8557
+ },
8558
+ {
8559
+ "epoch": 0.388417853143497,
8560
+ "grad_norm": 4.570112228393555,
8561
+ "learning_rate": 0.00027168315685332633,
8562
+ "loss": 6.5194,
8563
+ "step": 1214
8564
+ },
8565
+ {
8566
+ "epoch": 0.3887378019516877,
8567
+ "grad_norm": 9.28131103515625,
8568
+ "learning_rate": 0.0002714980188263041,
8569
+ "loss": 6.5532,
8570
+ "step": 1215
8571
+ },
8572
+ {
8573
+ "epoch": 0.3890577507598784,
8574
+ "grad_norm": 11.132205963134766,
8575
+ "learning_rate": 0.00027131281555219084,
8576
+ "loss": 6.6146,
8577
+ "step": 1216
8578
+ },
8579
+ {
8580
+ "epoch": 0.3893776995680691,
8581
+ "grad_norm": 7.617869853973389,
8582
+ "learning_rate": 0.00027112754722748037,
8583
+ "loss": 6.5418,
8584
+ "step": 1217
8585
+ },
8586
+ {
8587
+ "epoch": 0.3896976483762598,
8588
+ "grad_norm": 6.569556713104248,
8589
+ "learning_rate": 0.00027094221404873537,
8590
+ "loss": 6.5742,
8591
+ "step": 1218
8592
+ },
8593
+ {
8594
+ "epoch": 0.3900175971844505,
8595
+ "grad_norm": 9.705401420593262,
8596
+ "learning_rate": 0.0002707568162125875,
8597
+ "loss": 6.5249,
8598
+ "step": 1219
8599
+ },
8600
+ {
8601
+ "epoch": 0.3903375459926412,
8602
+ "grad_norm": 8.43759822845459,
8603
+ "learning_rate": 0.00027057135391573683,
8604
+ "loss": 6.7944,
8605
+ "step": 1220
8606
+ },
8607
+ {
8608
+ "epoch": 0.39065749480083184,
8609
+ "grad_norm": 9.743579864501953,
8610
+ "learning_rate": 0.00027038582735495196,
8611
+ "loss": 6.6869,
8612
+ "step": 1221
8613
+ },
8614
+ {
8615
+ "epoch": 0.39097744360902253,
8616
+ "grad_norm": 9.03455638885498,
8617
+ "learning_rate": 0.0002702002367270695,
8618
+ "loss": 6.4196,
8619
+ "step": 1222
8620
+ },
8621
+ {
8622
+ "epoch": 0.39129739241721323,
8623
+ "grad_norm": 11.69813060760498,
8624
+ "learning_rate": 0.0002700145822289942,
8625
+ "loss": 6.5824,
8626
+ "step": 1223
8627
+ },
8628
+ {
8629
+ "epoch": 0.3916173412254039,
8630
+ "grad_norm": 11.977036476135254,
8631
+ "learning_rate": 0.00026982886405769855,
8632
+ "loss": 6.8119,
8633
+ "step": 1224
8634
+ },
8635
+ {
8636
+ "epoch": 0.3919372900335946,
8637
+ "grad_norm": 7.943235874176025,
8638
+ "learning_rate": 0.00026964308241022255,
8639
+ "loss": 6.6036,
8640
+ "step": 1225
8641
+ },
8642
+ {
8643
+ "epoch": 0.3922572388417853,
8644
+ "grad_norm": 5.831643581390381,
8645
+ "learning_rate": 0.00026945723748367353,
8646
+ "loss": 6.747,
8647
+ "step": 1226
8648
+ },
8649
+ {
8650
+ "epoch": 0.392577187649976,
8651
+ "grad_norm": 13.803399085998535,
8652
+ "learning_rate": 0.00026927132947522604,
8653
+ "loss": 6.6117,
8654
+ "step": 1227
8655
+ },
8656
+ {
8657
+ "epoch": 0.3928971364581667,
8658
+ "grad_norm": 8.283905029296875,
8659
+ "learning_rate": 0.0002690853585821214,
8660
+ "loss": 6.6316,
8661
+ "step": 1228
8662
+ },
8663
+ {
8664
+ "epoch": 0.3932170852663574,
8665
+ "grad_norm": 8.696572303771973,
8666
+ "learning_rate": 0.00026889932500166785,
8667
+ "loss": 6.6446,
8668
+ "step": 1229
8669
+ },
8670
+ {
8671
+ "epoch": 0.3935370340745481,
8672
+ "grad_norm": 4.740365982055664,
8673
+ "learning_rate": 0.00026871322893124,
8674
+ "loss": 6.6858,
8675
+ "step": 1230
8676
+ },
8677
+ {
8678
+ "epoch": 0.39385698288273874,
8679
+ "grad_norm": 11.38948917388916,
8680
+ "learning_rate": 0.0002685270705682788,
8681
+ "loss": 6.748,
8682
+ "step": 1231
8683
+ },
8684
+ {
8685
+ "epoch": 0.39417693169092943,
8686
+ "grad_norm": 9.747567176818848,
8687
+ "learning_rate": 0.00026834085011029135,
8688
+ "loss": 6.4157,
8689
+ "step": 1232
8690
+ },
8691
+ {
8692
+ "epoch": 0.39449688049912013,
8693
+ "grad_norm": 7.497620582580566,
8694
+ "learning_rate": 0.0002681545677548505,
8695
+ "loss": 6.6276,
8696
+ "step": 1233
8697
+ },
8698
+ {
8699
+ "epoch": 0.3948168293073108,
8700
+ "grad_norm": 10.829628944396973,
8701
+ "learning_rate": 0.0002679682236995948,
8702
+ "loss": 6.6541,
8703
+ "step": 1234
8704
+ },
8705
+ {
8706
+ "epoch": 0.3951367781155015,
8707
+ "grad_norm": 6.678272247314453,
8708
+ "learning_rate": 0.0002677818181422284,
8709
+ "loss": 6.4207,
8710
+ "step": 1235
8711
+ },
8712
+ {
8713
+ "epoch": 0.3954567269236922,
8714
+ "grad_norm": 6.341800689697266,
8715
+ "learning_rate": 0.0002675953512805206,
8716
+ "loss": 6.695,
8717
+ "step": 1236
8718
+ },
8719
+ {
8720
+ "epoch": 0.3957766757318829,
8721
+ "grad_norm": 7.859994888305664,
8722
+ "learning_rate": 0.0002674088233123056,
8723
+ "loss": 6.3617,
8724
+ "step": 1237
8725
+ },
8726
+ {
8727
+ "epoch": 0.3960966245400736,
8728
+ "grad_norm": 10.069356918334961,
8729
+ "learning_rate": 0.0002672222344354828,
8730
+ "loss": 6.7558,
8731
+ "step": 1238
8732
+ },
8733
+ {
8734
+ "epoch": 0.3964165733482643,
8735
+ "grad_norm": 14.660593032836914,
8736
+ "learning_rate": 0.0002670355848480158,
8737
+ "loss": 6.6104,
8738
+ "step": 1239
8739
+ },
8740
+ {
8741
+ "epoch": 0.39673652215645494,
8742
+ "grad_norm": 5.288352012634277,
8743
+ "learning_rate": 0.000266848874747933,
8744
+ "loss": 6.5524,
8745
+ "step": 1240
8746
+ },
8747
+ {
8748
+ "epoch": 0.39705647096464564,
8749
+ "grad_norm": 6.08315372467041,
8750
+ "learning_rate": 0.0002666621043333266,
8751
+ "loss": 6.2126,
8752
+ "step": 1241
8753
+ },
8754
+ {
8755
+ "epoch": 0.39737641977283633,
8756
+ "grad_norm": 6.1647162437438965,
8757
+ "learning_rate": 0.00026647527380235314,
8758
+ "loss": 6.6435,
8759
+ "step": 1242
8760
+ },
8761
+ {
8762
+ "epoch": 0.397696368581027,
8763
+ "grad_norm": 5.414215087890625,
8764
+ "learning_rate": 0.0002662883833532328,
8765
+ "loss": 6.2672,
8766
+ "step": 1243
8767
+ },
8768
+ {
8769
+ "epoch": 0.3980163173892177,
8770
+ "grad_norm": 5.315961837768555,
8771
+ "learning_rate": 0.00026610143318424925,
8772
+ "loss": 6.3607,
8773
+ "step": 1244
8774
+ },
8775
+ {
8776
+ "epoch": 0.3983362661974084,
8777
+ "grad_norm": 5.834237098693848,
8778
+ "learning_rate": 0.0002659144234937497,
8779
+ "loss": 6.558,
8780
+ "step": 1245
8781
+ },
8782
+ {
8783
+ "epoch": 0.3986562150055991,
8784
+ "grad_norm": 9.148299217224121,
8785
+ "learning_rate": 0.0002657273544801444,
8786
+ "loss": 6.4416,
8787
+ "step": 1246
8788
+ },
8789
+ {
8790
+ "epoch": 0.3989761638137898,
8791
+ "grad_norm": 10.394495010375977,
8792
+ "learning_rate": 0.0002655402263419065,
8793
+ "loss": 6.5473,
8794
+ "step": 1247
8795
+ },
8796
+ {
8797
+ "epoch": 0.3992961126219805,
8798
+ "grad_norm": 9.015913009643555,
8799
+ "learning_rate": 0.000265353039277572,
8800
+ "loss": 6.4597,
8801
+ "step": 1248
8802
+ },
8803
+ {
8804
+ "epoch": 0.3996160614301712,
8805
+ "grad_norm": 10.161762237548828,
8806
+ "learning_rate": 0.00026516579348573934,
8807
+ "loss": 6.5067,
8808
+ "step": 1249
8809
+ },
8810
+ {
8811
+ "epoch": 0.39993601023836184,
8812
+ "grad_norm": 6.956129550933838,
8813
+ "learning_rate": 0.00026497848916506926,
8814
+ "loss": 6.4775,
8815
+ "step": 1250
8816
+ },
8817
+ {
8818
+ "epoch": 0.40025595904655253,
8819
+ "grad_norm": 5.3303937911987305,
8820
+ "learning_rate": 0.0002647911265142846,
8821
+ "loss": 6.4659,
8822
+ "step": 1251
8823
+ },
8824
+ {
8825
+ "epoch": 0.40057590785474323,
8826
+ "grad_norm": 5.570010662078857,
8827
+ "learning_rate": 0.00026460370573217016,
8828
+ "loss": 6.4517,
8829
+ "step": 1252
8830
+ },
8831
+ {
8832
+ "epoch": 0.4008958566629339,
8833
+ "grad_norm": 6.759359836578369,
8834
+ "learning_rate": 0.0002644162270175723,
8835
+ "loss": 6.4963,
8836
+ "step": 1253
8837
+ },
8838
+ {
8839
+ "epoch": 0.4012158054711246,
8840
+ "grad_norm": 14.08566665649414,
8841
+ "learning_rate": 0.0002642286905693989,
8842
+ "loss": 6.6086,
8843
+ "step": 1254
8844
+ },
8845
+ {
8846
+ "epoch": 0.4015357542793153,
8847
+ "grad_norm": 6.671782493591309,
8848
+ "learning_rate": 0.0002640410965866192,
8849
+ "loss": 6.3949,
8850
+ "step": 1255
8851
+ },
8852
+ {
8853
+ "epoch": 0.401855703087506,
8854
+ "grad_norm": 9.904322624206543,
8855
+ "learning_rate": 0.0002638534452682632,
8856
+ "loss": 6.5513,
8857
+ "step": 1256
8858
+ },
8859
+ {
8860
+ "epoch": 0.4021756518956967,
8861
+ "grad_norm": 11.320886611938477,
8862
+ "learning_rate": 0.00026366573681342213,
8863
+ "loss": 6.4079,
8864
+ "step": 1257
8865
+ },
8866
+ {
8867
+ "epoch": 0.4024956007038874,
8868
+ "grad_norm": 5.8289666175842285,
8869
+ "learning_rate": 0.00026347797142124745,
8870
+ "loss": 6.3216,
8871
+ "step": 1258
8872
+ },
8873
+ {
8874
+ "epoch": 0.40281554951207804,
8875
+ "grad_norm": 13.474091529846191,
8876
+ "learning_rate": 0.0002632901492909513,
8877
+ "loss": 6.4256,
8878
+ "step": 1259
8879
+ },
8880
+ {
8881
+ "epoch": 0.40313549832026874,
8882
+ "grad_norm": 16.653573989868164,
8883
+ "learning_rate": 0.0002631022706218058,
8884
+ "loss": 6.7427,
8885
+ "step": 1260
8886
+ },
8887
+ {
8888
+ "epoch": 0.40345544712845943,
8889
+ "grad_norm": 9.67142391204834,
8890
+ "learning_rate": 0.00026291433561314323,
8891
+ "loss": 6.5105,
8892
+ "step": 1261
8893
+ },
8894
+ {
8895
+ "epoch": 0.4037753959366501,
8896
+ "grad_norm": 7.529284477233887,
8897
+ "learning_rate": 0.0002627263444643557,
8898
+ "loss": 6.4653,
8899
+ "step": 1262
8900
+ },
8901
+ {
8902
+ "epoch": 0.4040953447448408,
8903
+ "grad_norm": 9.487723350524902,
8904
+ "learning_rate": 0.00026253829737489455,
8905
+ "loss": 6.2462,
8906
+ "step": 1263
8907
+ },
8908
+ {
8909
+ "epoch": 0.4044152935530315,
8910
+ "grad_norm": 8.2636079788208,
8911
+ "learning_rate": 0.0002623501945442708,
8912
+ "loss": 6.3761,
8913
+ "step": 1264
8914
+ },
8915
+ {
8916
+ "epoch": 0.4047352423612222,
8917
+ "grad_norm": 5.9740471839904785,
8918
+ "learning_rate": 0.00026216203617205453,
8919
+ "loss": 6.3738,
8920
+ "step": 1265
8921
+ },
8922
+ {
8923
+ "epoch": 0.4050551911694129,
8924
+ "grad_norm": 7.923804759979248,
8925
+ "learning_rate": 0.0002619738224578746,
8926
+ "loss": 6.1306,
8927
+ "step": 1266
8928
+ },
8929
+ {
8930
+ "epoch": 0.4053751399776036,
8931
+ "grad_norm": 6.172807216644287,
8932
+ "learning_rate": 0.0002617855536014188,
8933
+ "loss": 6.4058,
8934
+ "step": 1267
8935
+ },
8936
+ {
8937
+ "epoch": 0.4056950887857943,
8938
+ "grad_norm": 10.529424667358398,
8939
+ "learning_rate": 0.0002615972298024334,
8940
+ "loss": 6.286,
8941
+ "step": 1268
8942
+ },
8943
+ {
8944
+ "epoch": 0.40601503759398494,
8945
+ "grad_norm": 7.2672953605651855,
8946
+ "learning_rate": 0.0002614088512607227,
8947
+ "loss": 6.3429,
8948
+ "step": 1269
8949
+ },
8950
+ {
8951
+ "epoch": 0.40633498640217564,
8952
+ "grad_norm": 12.593116760253906,
8953
+ "learning_rate": 0.0002612204181761493,
8954
+ "loss": 6.5201,
8955
+ "step": 1270
8956
+ },
8957
+ {
8958
+ "epoch": 0.40665493521036633,
8959
+ "grad_norm": 12.00728988647461,
8960
+ "learning_rate": 0.00026103193074863377,
8961
+ "loss": 6.3686,
8962
+ "step": 1271
8963
+ },
8964
+ {
8965
+ "epoch": 0.406974884018557,
8966
+ "grad_norm": 6.9003777503967285,
8967
+ "learning_rate": 0.0002608433891781541,
8968
+ "loss": 6.3091,
8969
+ "step": 1272
8970
+ },
8971
+ {
8972
+ "epoch": 0.4072948328267477,
8973
+ "grad_norm": 11.43606948852539,
8974
+ "learning_rate": 0.0002606547936647458,
8975
+ "loss": 6.4177,
8976
+ "step": 1273
8977
+ },
8978
+ {
8979
+ "epoch": 0.4076147816349384,
8980
+ "grad_norm": 8.18825912475586,
8981
+ "learning_rate": 0.0002604661444085017,
8982
+ "loss": 6.379,
8983
+ "step": 1274
8984
+ },
8985
+ {
8986
+ "epoch": 0.4079347304431291,
8987
+ "grad_norm": 9.621562957763672,
8988
+ "learning_rate": 0.0002602774416095715,
8989
+ "loss": 6.4082,
8990
+ "step": 1275
8991
+ },
8992
+ {
8993
+ "epoch": 0.4082546792513198,
8994
+ "grad_norm": 10.449783325195312,
8995
+ "learning_rate": 0.000260088685468162,
8996
+ "loss": 6.2317,
8997
+ "step": 1276
8998
+ },
8999
+ {
9000
+ "epoch": 0.4085746280595105,
9001
+ "grad_norm": 14.534072875976562,
9002
+ "learning_rate": 0.0002598998761845361,
9003
+ "loss": 6.5101,
9004
+ "step": 1277
9005
+ },
9006
+ {
9007
+ "epoch": 0.40889457686770114,
9008
+ "grad_norm": 19.77684211730957,
9009
+ "learning_rate": 0.0002597110139590135,
9010
+ "loss": 6.4038,
9011
+ "step": 1278
9012
+ },
9013
+ {
9014
+ "epoch": 0.40921452567589184,
9015
+ "grad_norm": 9.12231159210205,
9016
+ "learning_rate": 0.00025952209899197,
9017
+ "loss": 6.2977,
9018
+ "step": 1279
9019
+ },
9020
+ {
9021
+ "epoch": 0.40953447448408253,
9022
+ "grad_norm": 9.007134437561035,
9023
+ "learning_rate": 0.0002593331314838372,
9024
+ "loss": 6.6574,
9025
+ "step": 1280
9026
+ },
9027
+ {
9028
+ "epoch": 0.40985442329227323,
9029
+ "grad_norm": 12.240474700927734,
9030
+ "learning_rate": 0.0002591441116351025,
9031
+ "loss": 6.6026,
9032
+ "step": 1281
9033
+ },
9034
+ {
9035
+ "epoch": 0.4101743721004639,
9036
+ "grad_norm": 18.05267333984375,
9037
+ "learning_rate": 0.000258955039646309,
9038
+ "loss": 6.2505,
9039
+ "step": 1282
9040
+ },
9041
+ {
9042
+ "epoch": 0.4104943209086546,
9043
+ "grad_norm": 12.290742874145508,
9044
+ "learning_rate": 0.000258765915718055,
9045
+ "loss": 6.5617,
9046
+ "step": 1283
9047
+ },
9048
+ {
9049
+ "epoch": 0.4108142697168453,
9050
+ "grad_norm": 15.071090698242188,
9051
+ "learning_rate": 0.0002585767400509937,
9052
+ "loss": 6.5613,
9053
+ "step": 1284
9054
+ },
9055
+ {
9056
+ "epoch": 0.411134218525036,
9057
+ "grad_norm": 8.901190757751465,
9058
+ "learning_rate": 0.00025838751284583346,
9059
+ "loss": 6.3584,
9060
+ "step": 1285
9061
+ },
9062
+ {
9063
+ "epoch": 0.4114541673332267,
9064
+ "grad_norm": 11.258878707885742,
9065
+ "learning_rate": 0.0002581982343033374,
9066
+ "loss": 6.5876,
9067
+ "step": 1286
9068
+ },
9069
+ {
9070
+ "epoch": 0.41177411614141735,
9071
+ "grad_norm": 9.922440528869629,
9072
+ "learning_rate": 0.00025800890462432277,
9073
+ "loss": 6.2798,
9074
+ "step": 1287
9075
+ },
9076
+ {
9077
+ "epoch": 0.41209406494960804,
9078
+ "grad_norm": 7.8548150062561035,
9079
+ "learning_rate": 0.0002578195240096614,
9080
+ "loss": 6.4563,
9081
+ "step": 1288
9082
+ },
9083
+ {
9084
+ "epoch": 0.41241401375779874,
9085
+ "grad_norm": 9.597755432128906,
9086
+ "learning_rate": 0.0002576300926602788,
9087
+ "loss": 6.2798,
9088
+ "step": 1289
9089
+ },
9090
+ {
9091
+ "epoch": 0.41273396256598943,
9092
+ "grad_norm": 5.551302433013916,
9093
+ "learning_rate": 0.0002574406107771548,
9094
+ "loss": 6.3571,
9095
+ "step": 1290
9096
+ },
9097
+ {
9098
+ "epoch": 0.4130539113741801,
9099
+ "grad_norm": 9.343033790588379,
9100
+ "learning_rate": 0.0002572510785613225,
9101
+ "loss": 6.375,
9102
+ "step": 1291
9103
+ },
9104
+ {
9105
+ "epoch": 0.4133738601823708,
9106
+ "grad_norm": 6.203455924987793,
9107
+ "learning_rate": 0.0002570614962138682,
9108
+ "loss": 6.032,
9109
+ "step": 1292
9110
+ },
9111
+ {
9112
+ "epoch": 0.4136938089905615,
9113
+ "grad_norm": 7.929701328277588,
9114
+ "learning_rate": 0.00025687186393593206,
9115
+ "loss": 6.3534,
9116
+ "step": 1293
9117
+ },
9118
+ {
9119
+ "epoch": 0.4140137577987522,
9120
+ "grad_norm": 12.085379600524902,
9121
+ "learning_rate": 0.0002566821819287065,
9122
+ "loss": 6.4062,
9123
+ "step": 1294
9124
+ },
9125
+ {
9126
+ "epoch": 0.4143337066069429,
9127
+ "grad_norm": 8.04161262512207,
9128
+ "learning_rate": 0.0002564924503934372,
9129
+ "loss": 6.4253,
9130
+ "step": 1295
9131
+ },
9132
+ {
9133
+ "epoch": 0.4146536554151336,
9134
+ "grad_norm": 11.36021614074707,
9135
+ "learning_rate": 0.00025630266953142214,
9136
+ "loss": 6.1811,
9137
+ "step": 1296
9138
+ },
9139
+ {
9140
+ "epoch": 0.41497360422332424,
9141
+ "grad_norm": 12.349037170410156,
9142
+ "learning_rate": 0.00025611283954401175,
9143
+ "loss": 6.4346,
9144
+ "step": 1297
9145
+ },
9146
+ {
9147
+ "epoch": 0.41529355303151494,
9148
+ "grad_norm": 11.792349815368652,
9149
+ "learning_rate": 0.00025592296063260835,
9150
+ "loss": 6.4392,
9151
+ "step": 1298
9152
+ },
9153
+ {
9154
+ "epoch": 0.41561350183970563,
9155
+ "grad_norm": 9.208128929138184,
9156
+ "learning_rate": 0.00025573303299866653,
9157
+ "loss": 6.3419,
9158
+ "step": 1299
9159
+ },
9160
+ {
9161
+ "epoch": 0.41593345064789633,
9162
+ "grad_norm": 14.85993766784668,
9163
+ "learning_rate": 0.0002555430568436923,
9164
+ "loss": 6.3156,
9165
+ "step": 1300
9166
+ },
9167
+ {
9168
+ "epoch": 0.416253399456087,
9169
+ "grad_norm": 14.815731048583984,
9170
+ "learning_rate": 0.0002553530323692432,
9171
+ "loss": 6.3872,
9172
+ "step": 1301
9173
+ },
9174
+ {
9175
+ "epoch": 0.4165733482642777,
9176
+ "grad_norm": 9.829910278320312,
9177
+ "learning_rate": 0.0002551629597769282,
9178
+ "loss": 6.4641,
9179
+ "step": 1302
9180
+ },
9181
+ {
9182
+ "epoch": 0.4168932970724684,
9183
+ "grad_norm": 16.248035430908203,
9184
+ "learning_rate": 0.000254972839268407,
9185
+ "loss": 6.2669,
9186
+ "step": 1303
9187
+ },
9188
+ {
9189
+ "epoch": 0.4172132458806591,
9190
+ "grad_norm": 11.95917797088623,
9191
+ "learning_rate": 0.00025478267104539053,
9192
+ "loss": 6.4028,
9193
+ "step": 1304
9194
+ },
9195
+ {
9196
+ "epoch": 0.4175331946888498,
9197
+ "grad_norm": 10.625663757324219,
9198
+ "learning_rate": 0.00025459245530964,
9199
+ "loss": 6.4577,
9200
+ "step": 1305
9201
+ },
9202
+ {
9203
+ "epoch": 0.41785314349704045,
9204
+ "grad_norm": 6.940323352813721,
9205
+ "learning_rate": 0.00025440219226296725,
9206
+ "loss": 6.3556,
9207
+ "step": 1306
9208
+ },
9209
+ {
9210
+ "epoch": 0.41817309230523114,
9211
+ "grad_norm": 10.21389102935791,
9212
+ "learning_rate": 0.0002542118821072342,
9213
+ "loss": 6.4131,
9214
+ "step": 1307
9215
+ },
9216
+ {
9217
+ "epoch": 0.41849304111342184,
9218
+ "grad_norm": 13.550383567810059,
9219
+ "learning_rate": 0.0002540215250443528,
9220
+ "loss": 6.4616,
9221
+ "step": 1308
9222
+ },
9223
+ {
9224
+ "epoch": 0.41881298992161253,
9225
+ "grad_norm": 9.647680282592773,
9226
+ "learning_rate": 0.0002538311212762847,
9227
+ "loss": 6.0977,
9228
+ "step": 1309
9229
+ },
9230
+ {
9231
+ "epoch": 0.41913293872980323,
9232
+ "grad_norm": 10.69679069519043,
9233
+ "learning_rate": 0.0002536406710050412,
9234
+ "loss": 6.1197,
9235
+ "step": 1310
9236
+ },
9237
+ {
9238
+ "epoch": 0.4194528875379939,
9239
+ "grad_norm": 15.03259563446045,
9240
+ "learning_rate": 0.0002534501744326829,
9241
+ "loss": 6.497,
9242
+ "step": 1311
9243
+ },
9244
+ {
9245
+ "epoch": 0.4197728363461846,
9246
+ "grad_norm": 14.628292083740234,
9247
+ "learning_rate": 0.00025325963176131946,
9248
+ "loss": 6.3991,
9249
+ "step": 1312
9250
+ },
9251
+ {
9252
+ "epoch": 0.4200927851543753,
9253
+ "grad_norm": 13.270496368408203,
9254
+ "learning_rate": 0.0002530690431931096,
9255
+ "loss": 6.7255,
9256
+ "step": 1313
9257
+ },
9258
+ {
9259
+ "epoch": 0.420412733962566,
9260
+ "grad_norm": 7.5467143058776855,
9261
+ "learning_rate": 0.00025287840893026064,
9262
+ "loss": 6.0528,
9263
+ "step": 1314
9264
+ },
9265
+ {
9266
+ "epoch": 0.4207326827707567,
9267
+ "grad_norm": 9.551958084106445,
9268
+ "learning_rate": 0.0002526877291750283,
9269
+ "loss": 6.1203,
9270
+ "step": 1315
9271
+ },
9272
+ {
9273
+ "epoch": 0.42105263157894735,
9274
+ "grad_norm": 9.613594055175781,
9275
+ "learning_rate": 0.0002524970041297166,
9276
+ "loss": 6.3185,
9277
+ "step": 1316
9278
+ },
9279
+ {
9280
+ "epoch": 0.42137258038713804,
9281
+ "grad_norm": 16.20762825012207,
9282
+ "learning_rate": 0.00025230623399667777,
9283
+ "loss": 6.2841,
9284
+ "step": 1317
9285
+ },
9286
+ {
9287
+ "epoch": 0.42169252919532874,
9288
+ "grad_norm": 19.544544219970703,
9289
+ "learning_rate": 0.0002521154189783118,
9290
+ "loss": 6.2749,
9291
+ "step": 1318
9292
+ },
9293
+ {
9294
+ "epoch": 0.42201247800351943,
9295
+ "grad_norm": 30.432388305664062,
9296
+ "learning_rate": 0.00025192455927706617,
9297
+ "loss": 6.417,
9298
+ "step": 1319
9299
+ },
9300
+ {
9301
+ "epoch": 0.4223324268117101,
9302
+ "grad_norm": 12.61909294128418,
9303
+ "learning_rate": 0.0002517336550954359,
9304
+ "loss": 6.5085,
9305
+ "step": 1320
9306
+ },
9307
+ {
9308
+ "epoch": 0.4226523756199008,
9309
+ "grad_norm": 12.66275691986084,
9310
+ "learning_rate": 0.0002515427066359632,
9311
+ "loss": 6.239,
9312
+ "step": 1321
9313
+ },
9314
+ {
9315
+ "epoch": 0.4229723244280915,
9316
+ "grad_norm": 32.47134780883789,
9317
+ "learning_rate": 0.0002513517141012371,
9318
+ "loss": 6.5225,
9319
+ "step": 1322
9320
+ },
9321
+ {
9322
+ "epoch": 0.4232922732362822,
9323
+ "grad_norm": 15.030749320983887,
9324
+ "learning_rate": 0.0002511606776938936,
9325
+ "loss": 6.2803,
9326
+ "step": 1323
9327
+ },
9328
+ {
9329
+ "epoch": 0.4236122220444729,
9330
+ "grad_norm": 10.074403762817383,
9331
+ "learning_rate": 0.00025096959761661524,
9332
+ "loss": 6.3504,
9333
+ "step": 1324
9334
+ },
9335
+ {
9336
+ "epoch": 0.42393217085266355,
9337
+ "grad_norm": 13.217743873596191,
9338
+ "learning_rate": 0.0002507784740721306,
9339
+ "loss": 6.2698,
9340
+ "step": 1325
9341
+ },
9342
+ {
9343
+ "epoch": 0.42425211966085424,
9344
+ "grad_norm": 16.90913200378418,
9345
+ "learning_rate": 0.0002505873072632148,
9346
+ "loss": 6.2857,
9347
+ "step": 1326
9348
+ },
9349
+ {
9350
+ "epoch": 0.42457206846904494,
9351
+ "grad_norm": 17.0783634185791,
9352
+ "learning_rate": 0.0002503960973926886,
9353
+ "loss": 6.2195,
9354
+ "step": 1327
9355
+ },
9356
+ {
9357
+ "epoch": 0.42489201727723563,
9358
+ "grad_norm": 7.002859115600586,
9359
+ "learning_rate": 0.00025020484466341844,
9360
+ "loss": 6.2902,
9361
+ "step": 1328
9362
+ },
9363
+ {
9364
+ "epoch": 0.42521196608542633,
9365
+ "grad_norm": 14.14289379119873,
9366
+ "learning_rate": 0.0002500135492783163,
9367
+ "loss": 6.3848,
9368
+ "step": 1329
9369
+ },
9370
+ {
9371
+ "epoch": 0.425531914893617,
9372
+ "grad_norm": 9.69235897064209,
9373
+ "learning_rate": 0.0002498222114403395,
9374
+ "loss": 6.1554,
9375
+ "step": 1330
9376
+ },
9377
+ {
9378
+ "epoch": 0.4258518637018077,
9379
+ "grad_norm": 30.017566680908203,
9380
+ "learning_rate": 0.0002496308313524902,
9381
+ "loss": 6.3937,
9382
+ "step": 1331
9383
+ },
9384
+ {
9385
+ "epoch": 0.4261718125099984,
9386
+ "grad_norm": 13.781780242919922,
9387
+ "learning_rate": 0.00024943940921781557,
9388
+ "loss": 6.1807,
9389
+ "step": 1332
9390
+ },
9391
+ {
9392
+ "epoch": 0.4264917613181891,
9393
+ "grad_norm": 10.11452579498291,
9394
+ "learning_rate": 0.0002492479452394072,
9395
+ "loss": 6.385,
9396
+ "step": 1333
9397
+ },
9398
+ {
9399
+ "epoch": 0.4268117101263798,
9400
+ "grad_norm": 18.071516036987305,
9401
+ "learning_rate": 0.00024905643962040133,
9402
+ "loss": 6.35,
9403
+ "step": 1334
9404
+ },
9405
+ {
9406
+ "epoch": 0.42713165893457045,
9407
+ "grad_norm": 12.902596473693848,
9408
+ "learning_rate": 0.00024886489256397825,
9409
+ "loss": 6.4579,
9410
+ "step": 1335
9411
+ },
9412
+ {
9413
+ "epoch": 0.42745160774276114,
9414
+ "grad_norm": 9.532163619995117,
9415
+ "learning_rate": 0.000248673304273362,
9416
+ "loss": 6.2096,
9417
+ "step": 1336
9418
+ },
9419
+ {
9420
+ "epoch": 0.42777155655095184,
9421
+ "grad_norm": 20.802745819091797,
9422
+ "learning_rate": 0.0002484816749518207,
9423
+ "loss": 6.4637,
9424
+ "step": 1337
9425
+ },
9426
+ {
9427
+ "epoch": 0.42809150535914253,
9428
+ "grad_norm": 11.478161811828613,
9429
+ "learning_rate": 0.00024829000480266594,
9430
+ "loss": 6.3374,
9431
+ "step": 1338
9432
+ },
9433
+ {
9434
+ "epoch": 0.4284114541673332,
9435
+ "grad_norm": 8.678462028503418,
9436
+ "learning_rate": 0.0002480982940292524,
9437
+ "loss": 6.2866,
9438
+ "step": 1339
9439
+ },
9440
+ {
9441
+ "epoch": 0.4287314029755239,
9442
+ "grad_norm": 9.180252075195312,
9443
+ "learning_rate": 0.0002479065428349782,
9444
+ "loss": 6.3384,
9445
+ "step": 1340
9446
+ },
9447
+ {
9448
+ "epoch": 0.4290513517837146,
9449
+ "grad_norm": 11.826372146606445,
9450
+ "learning_rate": 0.00024771475142328406,
9451
+ "loss": 6.3178,
9452
+ "step": 1341
9453
+ },
9454
+ {
9455
+ "epoch": 0.4293713005919053,
9456
+ "grad_norm": 8.149714469909668,
9457
+ "learning_rate": 0.00024752291999765344,
9458
+ "loss": 6.4101,
9459
+ "step": 1342
9460
+ },
9461
+ {
9462
+ "epoch": 0.429691249400096,
9463
+ "grad_norm": 8.667763710021973,
9464
+ "learning_rate": 0.0002473310487616123,
9465
+ "loss": 6.2569,
9466
+ "step": 1343
9467
+ },
9468
+ {
9469
+ "epoch": 0.43001119820828665,
9470
+ "grad_norm": 8.507345199584961,
9471
+ "learning_rate": 0.00024713913791872896,
9472
+ "loss": 6.0874,
9473
+ "step": 1344
9474
+ },
9475
+ {
9476
+ "epoch": 0.43033114701647734,
9477
+ "grad_norm": 12.065765380859375,
9478
+ "learning_rate": 0.00024694718767261336,
9479
+ "loss": 6.2609,
9480
+ "step": 1345
9481
+ },
9482
+ {
9483
+ "epoch": 0.43065109582466804,
9484
+ "grad_norm": 7.034212112426758,
9485
+ "learning_rate": 0.00024675519822691777,
9486
+ "loss": 6.3743,
9487
+ "step": 1346
9488
+ },
9489
+ {
9490
+ "epoch": 0.43097104463285874,
9491
+ "grad_norm": 8.030159950256348,
9492
+ "learning_rate": 0.0002465631697853357,
9493
+ "loss": 6.1686,
9494
+ "step": 1347
9495
+ },
9496
+ {
9497
+ "epoch": 0.43129099344104943,
9498
+ "grad_norm": 36.66036605834961,
9499
+ "learning_rate": 0.00024637110255160203,
9500
+ "loss": 6.2742,
9501
+ "step": 1348
9502
+ },
9503
+ {
9504
+ "epoch": 0.4316109422492401,
9505
+ "grad_norm": 12.500212669372559,
9506
+ "learning_rate": 0.00024617899672949305,
9507
+ "loss": 6.1638,
9508
+ "step": 1349
9509
+ },
9510
+ {
9511
+ "epoch": 0.4319308910574308,
9512
+ "grad_norm": 20.530799865722656,
9513
+ "learning_rate": 0.0002459868525228257,
9514
+ "loss": 6.5203,
9515
+ "step": 1350
9516
+ },
9517
+ {
9518
+ "epoch": 0.4322508398656215,
9519
+ "grad_norm": 6.235330104827881,
9520
+ "learning_rate": 0.0002457946701354578,
9521
+ "loss": 6.2691,
9522
+ "step": 1351
9523
+ },
9524
+ {
9525
+ "epoch": 0.4325707886738122,
9526
+ "grad_norm": 10.153731346130371,
9527
+ "learning_rate": 0.00024560244977128774,
9528
+ "loss": 6.3439,
9529
+ "step": 1352
9530
+ },
9531
+ {
9532
+ "epoch": 0.43289073748200285,
9533
+ "grad_norm": 9.818161964416504,
9534
+ "learning_rate": 0.000245410191634254,
9535
+ "loss": 6.1459,
9536
+ "step": 1353
9537
+ },
9538
+ {
9539
+ "epoch": 0.43321068629019355,
9540
+ "grad_norm": 16.71061134338379,
9541
+ "learning_rate": 0.0002452178959283353,
9542
+ "loss": 6.4604,
9543
+ "step": 1354
9544
+ },
9545
+ {
9546
+ "epoch": 0.43353063509838424,
9547
+ "grad_norm": 9.193108558654785,
9548
+ "learning_rate": 0.00024502556285755023,
9549
+ "loss": 6.3588,
9550
+ "step": 1355
9551
+ },
9552
+ {
9553
+ "epoch": 0.43385058390657494,
9554
+ "grad_norm": 9.433093070983887,
9555
+ "learning_rate": 0.00024483319262595687,
9556
+ "loss": 6.4425,
9557
+ "step": 1356
9558
+ },
9559
+ {
9560
+ "epoch": 0.43417053271476563,
9561
+ "grad_norm": 8.100132942199707,
9562
+ "learning_rate": 0.0002446407854376529,
9563
+ "loss": 6.434,
9564
+ "step": 1357
9565
+ },
9566
+ {
9567
+ "epoch": 0.43449048152295633,
9568
+ "grad_norm": 8.084160804748535,
9569
+ "learning_rate": 0.00024444834149677506,
9570
+ "loss": 6.4744,
9571
+ "step": 1358
9572
+ },
9573
+ {
9574
+ "epoch": 0.434810430331147,
9575
+ "grad_norm": 7.267811298370361,
9576
+ "learning_rate": 0.00024425586100749916,
9577
+ "loss": 6.0862,
9578
+ "step": 1359
9579
+ },
9580
+ {
9581
+ "epoch": 0.4351303791393377,
9582
+ "grad_norm": 6.129974365234375,
9583
+ "learning_rate": 0.0002440633441740398,
9584
+ "loss": 6.3388,
9585
+ "step": 1360
9586
+ },
9587
+ {
9588
+ "epoch": 0.4354503279475284,
9589
+ "grad_norm": 10.397089004516602,
9590
+ "learning_rate": 0.00024387079120065014,
9591
+ "loss": 6.349,
9592
+ "step": 1361
9593
+ },
9594
+ {
9595
+ "epoch": 0.4357702767557191,
9596
+ "grad_norm": 10.434456825256348,
9597
+ "learning_rate": 0.00024367820229162157,
9598
+ "loss": 6.1166,
9599
+ "step": 1362
9600
+ },
9601
+ {
9602
+ "epoch": 0.43609022556390975,
9603
+ "grad_norm": 8.7677001953125,
9604
+ "learning_rate": 0.00024348557765128384,
9605
+ "loss": 6.2898,
9606
+ "step": 1363
9607
+ },
9608
+ {
9609
+ "epoch": 0.43641017437210045,
9610
+ "grad_norm": 11.238868713378906,
9611
+ "learning_rate": 0.0002432929174840044,
9612
+ "loss": 6.1991,
9613
+ "step": 1364
9614
+ },
9615
+ {
9616
+ "epoch": 0.43673012318029114,
9617
+ "grad_norm": 7.583499431610107,
9618
+ "learning_rate": 0.00024310022199418833,
9619
+ "loss": 6.0778,
9620
+ "step": 1365
9621
+ },
9622
+ {
9623
+ "epoch": 0.43705007198848184,
9624
+ "grad_norm": 11.78762149810791,
9625
+ "learning_rate": 0.0002429074913862786,
9626
+ "loss": 6.3973,
9627
+ "step": 1366
9628
+ },
9629
+ {
9630
+ "epoch": 0.43737002079667253,
9631
+ "grad_norm": 7.4696431159973145,
9632
+ "learning_rate": 0.0002427147258647549,
9633
+ "loss": 6.0472,
9634
+ "step": 1367
9635
+ },
9636
+ {
9637
+ "epoch": 0.4376899696048632,
9638
+ "grad_norm": 11.547538757324219,
9639
+ "learning_rate": 0.00024252192563413435,
9640
+ "loss": 6.4374,
9641
+ "step": 1368
9642
+ },
9643
+ {
9644
+ "epoch": 0.4380099184130539,
9645
+ "grad_norm": 11.28888988494873,
9646
+ "learning_rate": 0.00024232909089897065,
9647
+ "loss": 6.3143,
9648
+ "step": 1369
9649
+ },
9650
+ {
9651
+ "epoch": 0.4383298672212446,
9652
+ "grad_norm": 12.933130264282227,
9653
+ "learning_rate": 0.00024213622186385436,
9654
+ "loss": 6.26,
9655
+ "step": 1370
9656
+ },
9657
+ {
9658
+ "epoch": 0.4386498160294353,
9659
+ "grad_norm": 10.536492347717285,
9660
+ "learning_rate": 0.00024194331873341222,
9661
+ "loss": 6.2753,
9662
+ "step": 1371
9663
+ },
9664
+ {
9665
+ "epoch": 0.43896976483762595,
9666
+ "grad_norm": 18.023189544677734,
9667
+ "learning_rate": 0.00024175038171230718,
9668
+ "loss": 6.4572,
9669
+ "step": 1372
9670
+ },
9671
+ {
9672
+ "epoch": 0.43928971364581665,
9673
+ "grad_norm": 15.396516799926758,
9674
+ "learning_rate": 0.00024155741100523824,
9675
+ "loss": 6.0828,
9676
+ "step": 1373
9677
+ },
9678
+ {
9679
+ "epoch": 0.43960966245400734,
9680
+ "grad_norm": 12.426717758178711,
9681
+ "learning_rate": 0.00024136440681694007,
9682
+ "loss": 6.3676,
9683
+ "step": 1374
9684
+ },
9685
+ {
9686
+ "epoch": 0.43992961126219804,
9687
+ "grad_norm": 10.206119537353516,
9688
+ "learning_rate": 0.00024117136935218283,
9689
+ "loss": 6.1777,
9690
+ "step": 1375
9691
+ },
9692
+ {
9693
+ "epoch": 0.44024956007038873,
9694
+ "grad_norm": 17.221784591674805,
9695
+ "learning_rate": 0.00024097829881577205,
9696
+ "loss": 6.2916,
9697
+ "step": 1376
9698
+ },
9699
+ {
9700
+ "epoch": 0.44056950887857943,
9701
+ "grad_norm": 72.50157165527344,
9702
+ "learning_rate": 0.0002407851954125484,
9703
+ "loss": 6.234,
9704
+ "step": 1377
9705
+ },
9706
+ {
9707
+ "epoch": 0.4408894576867701,
9708
+ "grad_norm": 7.173483848571777,
9709
+ "learning_rate": 0.0002405920593473872,
9710
+ "loss": 6.2403,
9711
+ "step": 1378
9712
+ },
9713
+ {
9714
+ "epoch": 0.4412094064949608,
9715
+ "grad_norm": 7.698493480682373,
9716
+ "learning_rate": 0.0002403988908251988,
9717
+ "loss": 6.2424,
9718
+ "step": 1379
9719
+ },
9720
+ {
9721
+ "epoch": 0.4415293553031515,
9722
+ "grad_norm": 11.574590682983398,
9723
+ "learning_rate": 0.00024020569005092749,
9724
+ "loss": 6.2996,
9725
+ "step": 1380
9726
+ },
9727
+ {
9728
+ "epoch": 0.4418493041113422,
9729
+ "grad_norm": 10.622515678405762,
9730
+ "learning_rate": 0.00024001245722955216,
9731
+ "loss": 6.158,
9732
+ "step": 1381
9733
+ },
9734
+ {
9735
+ "epoch": 0.44216925291953285,
9736
+ "grad_norm": 11.94666576385498,
9737
+ "learning_rate": 0.00023981919256608564,
9738
+ "loss": 6.165,
9739
+ "step": 1382
9740
+ },
9741
+ {
9742
+ "epoch": 0.44248920172772355,
9743
+ "grad_norm": 10.147541046142578,
9744
+ "learning_rate": 0.00023962589626557446,
9745
+ "loss": 6.2964,
9746
+ "step": 1383
9747
+ },
9748
+ {
9749
+ "epoch": 0.44280915053591424,
9750
+ "grad_norm": 9.275951385498047,
9751
+ "learning_rate": 0.00023943256853309862,
9752
+ "loss": 6.3666,
9753
+ "step": 1384
9754
+ },
9755
+ {
9756
+ "epoch": 0.44312909934410494,
9757
+ "grad_norm": 12.353401184082031,
9758
+ "learning_rate": 0.0002392392095737718,
9759
+ "loss": 6.2231,
9760
+ "step": 1385
9761
+ },
9762
+ {
9763
+ "epoch": 0.44344904815229563,
9764
+ "grad_norm": 145.6978759765625,
9765
+ "learning_rate": 0.0002390458195927404,
9766
+ "loss": 6.0643,
9767
+ "step": 1386
9768
+ },
9769
+ {
9770
+ "epoch": 0.44376899696048633,
9771
+ "grad_norm": 9.182119369506836,
9772
+ "learning_rate": 0.00023885239879518406,
9773
+ "loss": 6.2337,
9774
+ "step": 1387
9775
+ },
9776
+ {
9777
+ "epoch": 0.444088945768677,
9778
+ "grad_norm": 10.039813995361328,
9779
+ "learning_rate": 0.000238658947386315,
9780
+ "loss": 6.3979,
9781
+ "step": 1388
9782
+ },
9783
+ {
9784
+ "epoch": 0.4444088945768677,
9785
+ "grad_norm": 8.057585716247559,
9786
+ "learning_rate": 0.00023846546557137782,
9787
+ "loss": 6.1908,
9788
+ "step": 1389
9789
+ },
9790
+ {
9791
+ "epoch": 0.4447288433850584,
9792
+ "grad_norm": 8.275229454040527,
9793
+ "learning_rate": 0.00023827195355564958,
9794
+ "loss": 6.4919,
9795
+ "step": 1390
9796
+ },
9797
+ {
9798
+ "epoch": 0.44504879219324905,
9799
+ "grad_norm": 14.329585075378418,
9800
+ "learning_rate": 0.00023807841154443912,
9801
+ "loss": 6.3344,
9802
+ "step": 1391
9803
+ },
9804
+ {
9805
+ "epoch": 0.44536874100143975,
9806
+ "grad_norm": 17.71234893798828,
9807
+ "learning_rate": 0.00023788483974308738,
9808
+ "loss": 6.1686,
9809
+ "step": 1392
9810
+ },
9811
+ {
9812
+ "epoch": 0.44568868980963045,
9813
+ "grad_norm": 17.175947189331055,
9814
+ "learning_rate": 0.00023769123835696676,
9815
+ "loss": 6.2478,
9816
+ "step": 1393
9817
+ },
9818
+ {
9819
+ "epoch": 0.44600863861782114,
9820
+ "grad_norm": 12.383498191833496,
9821
+ "learning_rate": 0.00023749760759148104,
9822
+ "loss": 6.2889,
9823
+ "step": 1394
9824
+ },
9825
+ {
9826
+ "epoch": 0.44632858742601184,
9827
+ "grad_norm": 26.6887149810791,
9828
+ "learning_rate": 0.0002373039476520651,
9829
+ "loss": 6.2837,
9830
+ "step": 1395
9831
+ },
9832
+ {
9833
+ "epoch": 0.44664853623420253,
9834
+ "grad_norm": 35.60121536254883,
9835
+ "learning_rate": 0.00023711025874418508,
9836
+ "loss": 6.2465,
9837
+ "step": 1396
9838
+ },
9839
+ {
9840
+ "epoch": 0.4469684850423932,
9841
+ "grad_norm": 26.76144790649414,
9842
+ "learning_rate": 0.00023691654107333755,
9843
+ "loss": 6.4341,
9844
+ "step": 1397
9845
+ },
9846
+ {
9847
+ "epoch": 0.4472884338505839,
9848
+ "grad_norm": 37.42619323730469,
9849
+ "learning_rate": 0.0002367227948450496,
9850
+ "loss": 6.4073,
9851
+ "step": 1398
9852
+ },
9853
+ {
9854
+ "epoch": 0.4476083826587746,
9855
+ "grad_norm": 24.048145294189453,
9856
+ "learning_rate": 0.00023652902026487883,
9857
+ "loss": 6.1317,
9858
+ "step": 1399
9859
+ },
9860
+ {
9861
+ "epoch": 0.4479283314669653,
9862
+ "grad_norm": 28.50050163269043,
9863
+ "learning_rate": 0.0002363352175384128,
9864
+ "loss": 6.2892,
9865
+ "step": 1400
9866
+ },
9867
+ {
9868
+ "epoch": 0.4479283314669653,
9869
+ "eval_loss": 3.1516573429107666,
9870
+ "eval_runtime": 233.5122,
9871
+ "eval_samples_per_second": 5.636,
9872
+ "eval_steps_per_second": 1.409,
9873
+ "step": 1400
9874
  }
9875
  ],
9876
  "logging_steps": 1,
 
9885
  "early_stopping_threshold": 0.0
9886
  },
9887
  "attributes": {
9888
+ "early_stopping_patience_counter": 4
9889
  }
9890
  },
9891
  "TrainerControl": {
 
9899
  "attributes": {}
9900
  }
9901
  },
9902
+ "total_flos": 1.2911179536091054e+18,
9903
  "train_batch_size": 4,
9904
  "trial_name": null,
9905
  "trial_params": null