{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 875,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008,
"grad_norm": 3.64601993560791,
"learning_rate": 0.0,
"loss": 0.8628,
"step": 1
},
{
"epoch": 0.016,
"grad_norm": 3.9168810844421387,
"learning_rate": 1.7543859649122808e-07,
"loss": 0.9365,
"step": 2
},
{
"epoch": 0.024,
"grad_norm": 3.702859878540039,
"learning_rate": 3.5087719298245616e-07,
"loss": 0.7746,
"step": 3
},
{
"epoch": 0.032,
"grad_norm": 3.483750581741333,
"learning_rate": 5.263157894736843e-07,
"loss": 0.7739,
"step": 4
},
{
"epoch": 0.04,
"grad_norm": 3.805393934249878,
"learning_rate": 7.017543859649123e-07,
"loss": 0.9568,
"step": 5
},
{
"epoch": 0.048,
"grad_norm": 3.8995630741119385,
"learning_rate": 8.771929824561404e-07,
"loss": 1.0066,
"step": 6
},
{
"epoch": 0.056,
"grad_norm": 4.119105339050293,
"learning_rate": 1.0526315789473685e-06,
"loss": 0.9942,
"step": 7
},
{
"epoch": 0.064,
"grad_norm": 3.3201704025268555,
"learning_rate": 1.2280701754385965e-06,
"loss": 0.9177,
"step": 8
},
{
"epoch": 0.072,
"grad_norm": 3.132570266723633,
"learning_rate": 1.4035087719298246e-06,
"loss": 0.8407,
"step": 9
},
{
"epoch": 0.08,
"grad_norm": 3.132612466812134,
"learning_rate": 1.5789473684210526e-06,
"loss": 0.8996,
"step": 10
},
{
"epoch": 0.088,
"grad_norm": 2.6232998371124268,
"learning_rate": 1.7543859649122807e-06,
"loss": 0.9917,
"step": 11
},
{
"epoch": 0.096,
"grad_norm": 1.8322850465774536,
"learning_rate": 1.929824561403509e-06,
"loss": 0.9113,
"step": 12
},
{
"epoch": 0.104,
"grad_norm": 1.9032851457595825,
"learning_rate": 2.105263157894737e-06,
"loss": 0.9269,
"step": 13
},
{
"epoch": 0.112,
"grad_norm": 1.8699129819869995,
"learning_rate": 2.280701754385965e-06,
"loss": 0.8689,
"step": 14
},
{
"epoch": 0.12,
"grad_norm": 1.572948932647705,
"learning_rate": 2.456140350877193e-06,
"loss": 0.8004,
"step": 15
},
{
"epoch": 0.128,
"grad_norm": 1.5191115140914917,
"learning_rate": 2.631578947368421e-06,
"loss": 0.8864,
"step": 16
},
{
"epoch": 0.136,
"grad_norm": 1.55618417263031,
"learning_rate": 2.8070175438596493e-06,
"loss": 0.8495,
"step": 17
},
{
"epoch": 0.144,
"grad_norm": 1.5098791122436523,
"learning_rate": 2.9824561403508774e-06,
"loss": 0.8463,
"step": 18
},
{
"epoch": 0.152,
"grad_norm": 1.6831691265106201,
"learning_rate": 3.157894736842105e-06,
"loss": 0.9389,
"step": 19
},
{
"epoch": 0.16,
"grad_norm": 1.1799376010894775,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.7804,
"step": 20
},
{
"epoch": 0.168,
"grad_norm": 1.1570075750350952,
"learning_rate": 3.5087719298245615e-06,
"loss": 0.7694,
"step": 21
},
{
"epoch": 0.176,
"grad_norm": 1.3207712173461914,
"learning_rate": 3.6842105263157896e-06,
"loss": 0.821,
"step": 22
},
{
"epoch": 0.184,
"grad_norm": 1.26198410987854,
"learning_rate": 3.859649122807018e-06,
"loss": 0.7909,
"step": 23
},
{
"epoch": 0.192,
"grad_norm": 1.279601812362671,
"learning_rate": 4.035087719298246e-06,
"loss": 0.8096,
"step": 24
},
{
"epoch": 0.2,
"grad_norm": 1.336991548538208,
"learning_rate": 4.210526315789474e-06,
"loss": 0.7828,
"step": 25
},
{
"epoch": 0.208,
"grad_norm": 1.1203796863555908,
"learning_rate": 4.385964912280702e-06,
"loss": 0.7887,
"step": 26
},
{
"epoch": 0.216,
"grad_norm": 1.0610651969909668,
"learning_rate": 4.56140350877193e-06,
"loss": 0.7922,
"step": 27
},
{
"epoch": 0.224,
"grad_norm": 0.912101149559021,
"learning_rate": 4.736842105263158e-06,
"loss": 0.6419,
"step": 28
},
{
"epoch": 0.232,
"grad_norm": 1.065405249595642,
"learning_rate": 4.912280701754386e-06,
"loss": 0.7815,
"step": 29
},
{
"epoch": 0.24,
"grad_norm": 1.0197738409042358,
"learning_rate": 5.087719298245615e-06,
"loss": 0.6598,
"step": 30
},
{
"epoch": 0.248,
"grad_norm": 0.9232926368713379,
"learning_rate": 5.263157894736842e-06,
"loss": 0.7246,
"step": 31
},
{
"epoch": 0.256,
"grad_norm": 3.055030345916748,
"learning_rate": 5.438596491228071e-06,
"loss": 0.7377,
"step": 32
},
{
"epoch": 0.264,
"grad_norm": 1.2969378232955933,
"learning_rate": 5.6140350877192985e-06,
"loss": 0.7358,
"step": 33
},
{
"epoch": 0.272,
"grad_norm": 1.0571110248565674,
"learning_rate": 5.789473684210527e-06,
"loss": 0.7021,
"step": 34
},
{
"epoch": 0.28,
"grad_norm": 1.090430736541748,
"learning_rate": 5.964912280701755e-06,
"loss": 0.6808,
"step": 35
},
{
"epoch": 0.288,
"grad_norm": 1.427613377571106,
"learning_rate": 6.140350877192983e-06,
"loss": 0.7467,
"step": 36
},
{
"epoch": 0.296,
"grad_norm": 1.0251866579055786,
"learning_rate": 6.31578947368421e-06,
"loss": 0.7286,
"step": 37
},
{
"epoch": 0.304,
"grad_norm": 0.8874351382255554,
"learning_rate": 6.491228070175439e-06,
"loss": 0.6925,
"step": 38
},
{
"epoch": 0.312,
"grad_norm": 1.2552199363708496,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7958,
"step": 39
},
{
"epoch": 0.32,
"grad_norm": 1.1221239566802979,
"learning_rate": 6.842105263157896e-06,
"loss": 0.8124,
"step": 40
},
{
"epoch": 0.328,
"grad_norm": 1.0117762088775635,
"learning_rate": 7.017543859649123e-06,
"loss": 0.6543,
"step": 41
},
{
"epoch": 0.336,
"grad_norm": 0.852236270904541,
"learning_rate": 7.192982456140352e-06,
"loss": 0.5704,
"step": 42
},
{
"epoch": 0.344,
"grad_norm": 0.8768660426139832,
"learning_rate": 7.368421052631579e-06,
"loss": 0.6461,
"step": 43
},
{
"epoch": 0.352,
"grad_norm": 1.0183557271957397,
"learning_rate": 7.5438596491228074e-06,
"loss": 0.6764,
"step": 44
},
{
"epoch": 0.36,
"grad_norm": 1.1444123983383179,
"learning_rate": 7.719298245614036e-06,
"loss": 0.707,
"step": 45
},
{
"epoch": 0.368,
"grad_norm": 1.009420394897461,
"learning_rate": 7.894736842105265e-06,
"loss": 0.8975,
"step": 46
},
{
"epoch": 0.376,
"grad_norm": 1.5204353332519531,
"learning_rate": 8.070175438596492e-06,
"loss": 0.6668,
"step": 47
},
{
"epoch": 0.384,
"grad_norm": 1.0963656902313232,
"learning_rate": 8.24561403508772e-06,
"loss": 0.6778,
"step": 48
},
{
"epoch": 0.392,
"grad_norm": 1.2279527187347412,
"learning_rate": 8.421052631578948e-06,
"loss": 0.8483,
"step": 49
},
{
"epoch": 0.4,
"grad_norm": 1.1100308895111084,
"learning_rate": 8.596491228070176e-06,
"loss": 0.7917,
"step": 50
},
{
"epoch": 0.408,
"grad_norm": 1.068907380104065,
"learning_rate": 8.771929824561405e-06,
"loss": 0.671,
"step": 51
},
{
"epoch": 0.416,
"grad_norm": 0.919391930103302,
"learning_rate": 8.947368421052632e-06,
"loss": 0.698,
"step": 52
},
{
"epoch": 0.424,
"grad_norm": 1.0914137363433838,
"learning_rate": 9.12280701754386e-06,
"loss": 0.6729,
"step": 53
},
{
"epoch": 0.432,
"grad_norm": 0.9236669540405273,
"learning_rate": 9.298245614035088e-06,
"loss": 0.7039,
"step": 54
},
{
"epoch": 0.44,
"grad_norm": 1.1973854303359985,
"learning_rate": 9.473684210526315e-06,
"loss": 0.7047,
"step": 55
},
{
"epoch": 0.448,
"grad_norm": 1.0812108516693115,
"learning_rate": 9.649122807017545e-06,
"loss": 0.7041,
"step": 56
},
{
"epoch": 0.456,
"grad_norm": 1.142591953277588,
"learning_rate": 9.824561403508772e-06,
"loss": 0.7425,
"step": 57
},
{
"epoch": 0.464,
"grad_norm": 0.9331685304641724,
"learning_rate": 1e-05,
"loss": 0.6029,
"step": 58
},
{
"epoch": 0.472,
"grad_norm": 1.1456042528152466,
"learning_rate": 9.999978367986988e-06,
"loss": 0.716,
"step": 59
},
{
"epoch": 0.48,
"grad_norm": 1.048388123512268,
"learning_rate": 9.999913472135126e-06,
"loss": 0.6665,
"step": 60
},
{
"epoch": 0.488,
"grad_norm": 1.0044022798538208,
"learning_rate": 9.999805313005946e-06,
"loss": 0.6526,
"step": 61
},
{
"epoch": 0.496,
"grad_norm": 1.0086668729782104,
"learning_rate": 9.99965389153533e-06,
"loss": 0.7125,
"step": 62
},
{
"epoch": 0.504,
"grad_norm": 1.088757872581482,
"learning_rate": 9.999459209033495e-06,
"loss": 0.5943,
"step": 63
},
{
"epoch": 0.512,
"grad_norm": 1.180656909942627,
"learning_rate": 9.999221267184993e-06,
"loss": 0.7177,
"step": 64
},
{
"epoch": 0.52,
"grad_norm": 0.8537901639938354,
"learning_rate": 9.998940068048688e-06,
"loss": 0.5641,
"step": 65
},
{
"epoch": 0.528,
"grad_norm": 1.2915085554122925,
"learning_rate": 9.998615614057743e-06,
"loss": 0.674,
"step": 66
},
{
"epoch": 0.536,
"grad_norm": 1.312907338142395,
"learning_rate": 9.998247908019594e-06,
"loss": 0.823,
"step": 67
},
{
"epoch": 0.544,
"grad_norm": 1.0484812259674072,
"learning_rate": 9.997836953115927e-06,
"loss": 0.7765,
"step": 68
},
{
"epoch": 0.552,
"grad_norm": 1.0241776704788208,
"learning_rate": 9.997382752902658e-06,
"loss": 0.8049,
"step": 69
},
{
"epoch": 0.56,
"grad_norm": 1.282970905303955,
"learning_rate": 9.996885311309892e-06,
"loss": 0.5718,
"step": 70
},
{
"epoch": 0.568,
"grad_norm": 0.8525036573410034,
"learning_rate": 9.996344632641895e-06,
"loss": 0.6602,
"step": 71
},
{
"epoch": 0.576,
"grad_norm": 1.2436128854751587,
"learning_rate": 9.995760721577053e-06,
"loss": 0.6208,
"step": 72
},
{
"epoch": 0.584,
"grad_norm": 0.9938145875930786,
"learning_rate": 9.995133583167833e-06,
"loss": 0.7188,
"step": 73
},
{
"epoch": 0.592,
"grad_norm": 0.9896339774131775,
"learning_rate": 9.994463222840748e-06,
"loss": 0.6339,
"step": 74
},
{
"epoch": 0.6,
"grad_norm": 0.9466623663902283,
"learning_rate": 9.993749646396286e-06,
"loss": 0.5896,
"step": 75
},
{
"epoch": 0.608,
"grad_norm": 1.0780235528945923,
"learning_rate": 9.992992860008893e-06,
"loss": 0.7525,
"step": 76
},
{
"epoch": 0.616,
"grad_norm": 1.3074487447738647,
"learning_rate": 9.99219287022689e-06,
"loss": 0.6707,
"step": 77
},
{
"epoch": 0.624,
"grad_norm": 0.9674292802810669,
"learning_rate": 9.991349683972435e-06,
"loss": 0.6817,
"step": 78
},
{
"epoch": 0.632,
"grad_norm": 1.029198169708252,
"learning_rate": 9.990463308541452e-06,
"loss": 0.7652,
"step": 79
},
{
"epoch": 0.64,
"grad_norm": 1.1561387777328491,
"learning_rate": 9.989533751603578e-06,
"loss": 0.7302,
"step": 80
},
{
"epoch": 0.648,
"grad_norm": 0.9618710875511169,
"learning_rate": 9.988561021202083e-06,
"loss": 0.7046,
"step": 81
},
{
"epoch": 0.656,
"grad_norm": 1.0132874250411987,
"learning_rate": 9.987545125753818e-06,
"loss": 0.7053,
"step": 82
},
{
"epoch": 0.664,
"grad_norm": 1.0416361093521118,
"learning_rate": 9.986486074049131e-06,
"loss": 0.6851,
"step": 83
},
{
"epoch": 0.672,
"grad_norm": 0.9856860041618347,
"learning_rate": 9.985383875251783e-06,
"loss": 0.6858,
"step": 84
},
{
"epoch": 0.68,
"grad_norm": 0.9925334453582764,
"learning_rate": 9.98423853889889e-06,
"loss": 0.7338,
"step": 85
},
{
"epoch": 0.688,
"grad_norm": 0.9347037076950073,
"learning_rate": 9.983050074900824e-06,
"loss": 0.7625,
"step": 86
},
{
"epoch": 0.696,
"grad_norm": 1.2280950546264648,
"learning_rate": 9.98181849354113e-06,
"loss": 0.7229,
"step": 87
},
{
"epoch": 0.704,
"grad_norm": 0.9444807171821594,
"learning_rate": 9.980543805476447e-06,
"loss": 0.6652,
"step": 88
},
{
"epoch": 0.712,
"grad_norm": 1.0696436166763306,
"learning_rate": 9.979226021736396e-06,
"loss": 0.646,
"step": 89
},
{
"epoch": 0.72,
"grad_norm": 0.9732766151428223,
"learning_rate": 9.977865153723508e-06,
"loss": 0.7359,
"step": 90
},
{
"epoch": 0.728,
"grad_norm": 1.041569471359253,
"learning_rate": 9.976461213213104e-06,
"loss": 0.7169,
"step": 91
},
{
"epoch": 0.736,
"grad_norm": 1.0077918767929077,
"learning_rate": 9.975014212353212e-06,
"loss": 0.7185,
"step": 92
},
{
"epoch": 0.744,
"grad_norm": 0.9856661558151245,
"learning_rate": 9.973524163664447e-06,
"loss": 0.6106,
"step": 93
},
{
"epoch": 0.752,
"grad_norm": 1.1179556846618652,
"learning_rate": 9.971991080039912e-06,
"loss": 0.6851,
"step": 94
},
{
"epoch": 0.76,
"grad_norm": 0.7839189171791077,
"learning_rate": 9.970414974745077e-06,
"loss": 0.6788,
"step": 95
},
{
"epoch": 0.768,
"grad_norm": 0.9997370839118958,
"learning_rate": 9.968795861417676e-06,
"loss": 0.5586,
"step": 96
},
{
"epoch": 0.776,
"grad_norm": 0.9802690148353577,
"learning_rate": 9.967133754067581e-06,
"loss": 0.7048,
"step": 97
},
{
"epoch": 0.784,
"grad_norm": 1.0631524324417114,
"learning_rate": 9.965428667076687e-06,
"loss": 0.6596,
"step": 98
},
{
"epoch": 0.792,
"grad_norm": 0.9298393130302429,
"learning_rate": 9.963680615198774e-06,
"loss": 0.6907,
"step": 99
},
{
"epoch": 0.8,
"grad_norm": 1.0759201049804688,
"learning_rate": 9.961889613559396e-06,
"loss": 0.7354,
"step": 100
},
{
"epoch": 0.808,
"grad_norm": 0.7959829568862915,
"learning_rate": 9.960055677655743e-06,
"loss": 0.6363,
"step": 101
},
{
"epoch": 0.816,
"grad_norm": 0.9710596799850464,
"learning_rate": 9.958178823356503e-06,
"loss": 0.6114,
"step": 102
},
{
"epoch": 0.824,
"grad_norm": 0.9413867592811584,
"learning_rate": 9.956259066901733e-06,
"loss": 0.6466,
"step": 103
},
{
"epoch": 0.832,
"grad_norm": 1.4074207544326782,
"learning_rate": 9.954296424902709e-06,
"loss": 0.6742,
"step": 104
},
{
"epoch": 0.84,
"grad_norm": 0.9660438895225525,
"learning_rate": 9.95229091434179e-06,
"loss": 0.719,
"step": 105
},
{
"epoch": 0.848,
"grad_norm": 1.3223888874053955,
"learning_rate": 9.950242552572272e-06,
"loss": 0.6541,
"step": 106
},
{
"epoch": 0.856,
"grad_norm": 1.0226150751113892,
"learning_rate": 9.948151357318228e-06,
"loss": 0.5981,
"step": 107
},
{
"epoch": 0.864,
"grad_norm": 0.9748033285140991,
"learning_rate": 9.946017346674362e-06,
"loss": 0.6822,
"step": 108
},
{
"epoch": 0.872,
"grad_norm": 1.3557853698730469,
"learning_rate": 9.943840539105853e-06,
"loss": 0.7028,
"step": 109
},
{
"epoch": 0.88,
"grad_norm": 1.009817361831665,
"learning_rate": 9.941620953448195e-06,
"loss": 0.7124,
"step": 110
},
{
"epoch": 0.888,
"grad_norm": 0.8992280960083008,
"learning_rate": 9.939358608907026e-06,
"loss": 0.6255,
"step": 111
},
{
"epoch": 0.896,
"grad_norm": 0.9794321060180664,
"learning_rate": 9.937053525057977e-06,
"loss": 0.6736,
"step": 112
},
{
"epoch": 0.904,
"grad_norm": 0.9605726599693298,
"learning_rate": 9.934705721846487e-06,
"loss": 0.6872,
"step": 113
},
{
"epoch": 0.912,
"grad_norm": 1.006405234336853,
"learning_rate": 9.932315219587641e-06,
"loss": 0.6675,
"step": 114
},
{
"epoch": 0.92,
"grad_norm": 0.9819768667221069,
"learning_rate": 9.92988203896599e-06,
"loss": 0.6566,
"step": 115
},
{
"epoch": 0.928,
"grad_norm": 1.0273972749710083,
"learning_rate": 9.927406201035368e-06,
"loss": 0.6309,
"step": 116
},
{
"epoch": 0.936,
"grad_norm": 1.0762145519256592,
"learning_rate": 9.924887727218724e-06,
"loss": 0.7274,
"step": 117
},
{
"epoch": 0.944,
"grad_norm": 1.00732421875,
"learning_rate": 9.922326639307918e-06,
"loss": 0.7601,
"step": 118
},
{
"epoch": 0.952,
"grad_norm": 1.0192487239837646,
"learning_rate": 9.919722959463545e-06,
"loss": 0.6605,
"step": 119
},
{
"epoch": 0.96,
"grad_norm": 0.9694429636001587,
"learning_rate": 9.917076710214739e-06,
"loss": 0.7889,
"step": 120
},
{
"epoch": 0.968,
"grad_norm": 1.139436960220337,
"learning_rate": 9.914387914458983e-06,
"loss": 0.636,
"step": 121
},
{
"epoch": 0.976,
"grad_norm": 0.9674109220504761,
"learning_rate": 9.911656595461899e-06,
"loss": 0.7108,
"step": 122
},
{
"epoch": 0.984,
"grad_norm": 1.0006988048553467,
"learning_rate": 9.908882776857057e-06,
"loss": 0.7325,
"step": 123
},
{
"epoch": 0.992,
"grad_norm": 0.8345931768417358,
"learning_rate": 9.906066482645774e-06,
"loss": 0.5953,
"step": 124
},
{
"epoch": 1.0,
"grad_norm": 0.9422177076339722,
"learning_rate": 9.903207737196892e-06,
"loss": 0.7125,
"step": 125
},
{
"epoch": 1.008,
"grad_norm": 1.0644160509109497,
"learning_rate": 9.900306565246579e-06,
"loss": 0.6101,
"step": 126
},
{
"epoch": 1.016,
"grad_norm": 0.8198509812355042,
"learning_rate": 9.89736299189811e-06,
"loss": 0.5451,
"step": 127
},
{
"epoch": 1.024,
"grad_norm": 0.9468843340873718,
"learning_rate": 9.894377042621654e-06,
"loss": 0.6957,
"step": 128
},
{
"epoch": 1.032,
"grad_norm": 1.0026391744613647,
"learning_rate": 9.891348743254046e-06,
"loss": 0.6251,
"step": 129
},
{
"epoch": 1.04,
"grad_norm": 0.8523833155632019,
"learning_rate": 9.888278119998573e-06,
"loss": 0.5409,
"step": 130
},
{
"epoch": 1.048,
"grad_norm": 1.2016947269439697,
"learning_rate": 9.885165199424738e-06,
"loss": 0.5866,
"step": 131
},
{
"epoch": 1.056,
"grad_norm": 0.8879609107971191,
"learning_rate": 9.882010008468038e-06,
"loss": 0.5592,
"step": 132
},
{
"epoch": 1.064,
"grad_norm": 1.1025391817092896,
"learning_rate": 9.878812574429722e-06,
"loss": 0.567,
"step": 133
},
{
"epoch": 1.072,
"grad_norm": 0.9392365217208862,
"learning_rate": 9.875572924976568e-06,
"loss": 0.567,
"step": 134
},
{
"epoch": 1.08,
"grad_norm": 0.8918107748031616,
"learning_rate": 9.87229108814063e-06,
"loss": 0.4709,
"step": 135
},
{
"epoch": 1.088,
"grad_norm": 1.0703924894332886,
"learning_rate": 9.868967092319003e-06,
"loss": 0.5965,
"step": 136
},
{
"epoch": 1.096,
"grad_norm": 0.9928643703460693,
"learning_rate": 9.865600966273576e-06,
"loss": 0.5996,
"step": 137
},
{
"epoch": 1.104,
"grad_norm": 1.2148929834365845,
"learning_rate": 9.86219273913078e-06,
"loss": 0.611,
"step": 138
},
{
"epoch": 1.112,
"grad_norm": 0.9947288632392883,
"learning_rate": 9.858742440381343e-06,
"loss": 0.5379,
"step": 139
},
{
"epoch": 1.12,
"grad_norm": 1.332168459892273,
"learning_rate": 9.855250099880026e-06,
"loss": 0.7006,
"step": 140
},
{
"epoch": 1.1280000000000001,
"grad_norm": 2.9835128784179688,
"learning_rate": 9.851715747845372e-06,
"loss": 0.6709,
"step": 141
},
{
"epoch": 1.1360000000000001,
"grad_norm": 0.8667416572570801,
"learning_rate": 9.848139414859441e-06,
"loss": 0.6153,
"step": 142
},
{
"epoch": 1.144,
"grad_norm": 1.0628273487091064,
"learning_rate": 9.844521131867546e-06,
"loss": 0.6619,
"step": 143
},
{
"epoch": 1.152,
"grad_norm": 1.0005896091461182,
"learning_rate": 9.840860930177984e-06,
"loss": 0.5382,
"step": 144
},
{
"epoch": 1.16,
"grad_norm": 1.0219800472259521,
"learning_rate": 9.837158841461767e-06,
"loss": 0.6419,
"step": 145
},
{
"epoch": 1.168,
"grad_norm": 0.7993205189704895,
"learning_rate": 9.833414897752346e-06,
"loss": 0.499,
"step": 146
},
{
"epoch": 1.176,
"grad_norm": 0.7739636301994324,
"learning_rate": 9.829629131445342e-06,
"loss": 0.5477,
"step": 147
},
{
"epoch": 1.184,
"grad_norm": 0.8899552822113037,
"learning_rate": 9.825801575298248e-06,
"loss": 0.4854,
"step": 148
},
{
"epoch": 1.192,
"grad_norm": 0.8185807466506958,
"learning_rate": 9.821932262430164e-06,
"loss": 0.5946,
"step": 149
},
{
"epoch": 1.2,
"grad_norm": 0.9796144962310791,
"learning_rate": 9.818021226321502e-06,
"loss": 0.4688,
"step": 150
},
{
"epoch": 1.208,
"grad_norm": 0.9567016959190369,
"learning_rate": 9.814068500813692e-06,
"loss": 0.5279,
"step": 151
},
{
"epoch": 1.216,
"grad_norm": 0.8102734684944153,
"learning_rate": 9.8100741201089e-06,
"loss": 0.4962,
"step": 152
},
{
"epoch": 1.224,
"grad_norm": 1.0615909099578857,
"learning_rate": 9.806038118769724e-06,
"loss": 0.6779,
"step": 153
},
{
"epoch": 1.232,
"grad_norm": 0.9773924350738525,
"learning_rate": 9.801960531718898e-06,
"loss": 0.5115,
"step": 154
},
{
"epoch": 1.24,
"grad_norm": 1.0371609926223755,
"learning_rate": 9.797841394238987e-06,
"loss": 0.6258,
"step": 155
},
{
"epoch": 1.248,
"grad_norm": 1.1592023372650146,
"learning_rate": 9.793680741972084e-06,
"loss": 0.5728,
"step": 156
},
{
"epoch": 1.256,
"grad_norm": 0.8714562058448792,
"learning_rate": 9.789478610919508e-06,
"loss": 0.4935,
"step": 157
},
{
"epoch": 1.264,
"grad_norm": 0.9967408180236816,
"learning_rate": 9.785235037441473e-06,
"loss": 0.5136,
"step": 158
},
{
"epoch": 1.272,
"grad_norm": 0.7984183430671692,
"learning_rate": 9.780950058256802e-06,
"loss": 0.576,
"step": 159
},
{
"epoch": 1.28,
"grad_norm": 0.8621663451194763,
"learning_rate": 9.77662371044258e-06,
"loss": 0.5559,
"step": 160
},
{
"epoch": 1.288,
"grad_norm": 0.8362479209899902,
"learning_rate": 9.77225603143385e-06,
"loss": 0.6398,
"step": 161
},
{
"epoch": 1.296,
"grad_norm": 0.9502777457237244,
"learning_rate": 9.767847059023292e-06,
"loss": 0.5053,
"step": 162
},
{
"epoch": 1.304,
"grad_norm": 1.245893120765686,
"learning_rate": 9.763396831360884e-06,
"loss": 0.5379,
"step": 163
},
{
"epoch": 1.312,
"grad_norm": 0.9848424792289734,
"learning_rate": 9.75890538695358e-06,
"loss": 0.5523,
"step": 164
},
{
"epoch": 1.32,
"grad_norm": 0.9937321543693542,
"learning_rate": 9.75437276466497e-06,
"loss": 0.6296,
"step": 165
},
{
"epoch": 1.328,
"grad_norm": 1.045898199081421,
"learning_rate": 9.749799003714954e-06,
"loss": 0.6407,
"step": 166
},
{
"epoch": 1.336,
"grad_norm": 1.0084316730499268,
"learning_rate": 9.745184143679398e-06,
"loss": 0.626,
"step": 167
},
{
"epoch": 1.3439999999999999,
"grad_norm": 1.1556116342544556,
"learning_rate": 9.74052822448978e-06,
"loss": 0.6494,
"step": 168
},
{
"epoch": 1.3519999999999999,
"grad_norm": 0.9056588411331177,
"learning_rate": 9.735831286432869e-06,
"loss": 0.5156,
"step": 169
},
{
"epoch": 1.3599999999999999,
"grad_norm": 1.0791850090026855,
"learning_rate": 9.731093370150349e-06,
"loss": 0.6605,
"step": 170
},
{
"epoch": 1.3679999999999999,
"grad_norm": 0.8788161873817444,
"learning_rate": 9.72631451663849e-06,
"loss": 0.5369,
"step": 171
},
{
"epoch": 1.376,
"grad_norm": 1.5434342622756958,
"learning_rate": 9.721494767247779e-06,
"loss": 0.6006,
"step": 172
},
{
"epoch": 1.384,
"grad_norm": 0.8972302675247192,
"learning_rate": 9.71663416368257e-06,
"loss": 0.5413,
"step": 173
},
{
"epoch": 1.392,
"grad_norm": 1.001869559288025,
"learning_rate": 9.71173274800072e-06,
"loss": 0.5712,
"step": 174
},
{
"epoch": 1.4,
"grad_norm": 0.9920283555984497,
"learning_rate": 9.70679056261322e-06,
"loss": 0.5848,
"step": 175
},
{
"epoch": 1.408,
"grad_norm": 1.090257167816162,
"learning_rate": 9.70180765028384e-06,
"loss": 0.5776,
"step": 176
},
{
"epoch": 1.416,
"grad_norm": 0.7861718535423279,
"learning_rate": 9.696784054128749e-06,
"loss": 0.5172,
"step": 177
},
{
"epoch": 1.424,
"grad_norm": 0.9395256638526917,
"learning_rate": 9.691719817616148e-06,
"loss": 0.6224,
"step": 178
},
{
"epoch": 1.432,
"grad_norm": 0.7598348259925842,
"learning_rate": 9.686614984565888e-06,
"loss": 0.5102,
"step": 179
},
{
"epoch": 1.44,
"grad_norm": 0.8387635946273804,
"learning_rate": 9.681469599149093e-06,
"loss": 0.5538,
"step": 180
},
{
"epoch": 1.448,
"grad_norm": 0.779594361782074,
"learning_rate": 9.676283705887783e-06,
"loss": 0.4747,
"step": 181
},
{
"epoch": 1.456,
"grad_norm": 1.5272139310836792,
"learning_rate": 9.671057349654481e-06,
"loss": 0.5398,
"step": 182
},
{
"epoch": 1.464,
"grad_norm": 0.8969641923904419,
"learning_rate": 9.66579057567183e-06,
"loss": 0.5246,
"step": 183
},
{
"epoch": 1.472,
"grad_norm": 0.8700845241546631,
"learning_rate": 9.660483429512198e-06,
"loss": 0.5047,
"step": 184
},
{
"epoch": 1.48,
"grad_norm": 0.8970540761947632,
"learning_rate": 9.65513595709729e-06,
"loss": 0.5832,
"step": 185
},
{
"epoch": 1.488,
"grad_norm": 1.0614380836486816,
"learning_rate": 9.649748204697741e-06,
"loss": 0.5481,
"step": 186
},
{
"epoch": 1.496,
"grad_norm": 0.9343193173408508,
"learning_rate": 9.644320218932723e-06,
"loss": 0.6128,
"step": 187
},
{
"epoch": 1.504,
"grad_norm": 1.0007647275924683,
"learning_rate": 9.63885204676954e-06,
"loss": 0.5181,
"step": 188
},
{
"epoch": 1.512,
"grad_norm": 0.8584722876548767,
"learning_rate": 9.63334373552322e-06,
"loss": 0.6004,
"step": 189
},
{
"epoch": 1.52,
"grad_norm": 0.9545193910598755,
"learning_rate": 9.627795332856107e-06,
"loss": 0.5491,
"step": 190
},
{
"epoch": 1.528,
"grad_norm": 1.144181728363037,
"learning_rate": 9.622206886777448e-06,
"loss": 0.6506,
"step": 191
},
{
"epoch": 1.536,
"grad_norm": 1.1122850179672241,
"learning_rate": 9.616578445642982e-06,
"loss": 0.4933,
"step": 192
},
{
"epoch": 1.544,
"grad_norm": 1.247880458831787,
"learning_rate": 9.61091005815451e-06,
"loss": 0.5382,
"step": 193
},
{
"epoch": 1.552,
"grad_norm": 0.9312114119529724,
"learning_rate": 9.605201773359485e-06,
"loss": 0.5274,
"step": 194
},
{
"epoch": 1.56,
"grad_norm": 0.7915589213371277,
"learning_rate": 9.599453640650585e-06,
"loss": 0.5353,
"step": 195
},
{
"epoch": 1.568,
"grad_norm": 1.037292718887329,
"learning_rate": 9.59366570976528e-06,
"loss": 0.5941,
"step": 196
},
{
"epoch": 1.576,
"grad_norm": 0.8507972359657288,
"learning_rate": 9.587838030785413e-06,
"loss": 0.5754,
"step": 197
},
{
"epoch": 1.584,
"grad_norm": 0.770389199256897,
"learning_rate": 9.581970654136752e-06,
"loss": 0.5594,
"step": 198
},
{
"epoch": 1.592,
"grad_norm": 1.1132396459579468,
"learning_rate": 9.576063630588563e-06,
"loss": 0.6345,
"step": 199
},
{
"epoch": 1.6,
"grad_norm": 1.018099069595337,
"learning_rate": 9.570117011253173e-06,
"loss": 0.5882,
"step": 200
},
{
"epoch": 1.608,
"grad_norm": 1.043480634689331,
"learning_rate": 9.56413084758552e-06,
"loss": 0.5441,
"step": 201
},
{
"epoch": 1.616,
"grad_norm": 0.792169988155365,
"learning_rate": 9.55810519138271e-06,
"loss": 0.4992,
"step": 202
},
{
"epoch": 1.624,
"grad_norm": 0.9433432817459106,
"learning_rate": 9.552040094783575e-06,
"loss": 0.6049,
"step": 203
},
{
"epoch": 1.6320000000000001,
"grad_norm": 0.8830149173736572,
"learning_rate": 9.545935610268213e-06,
"loss": 0.5478,
"step": 204
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.8460153937339783,
"learning_rate": 9.53979179065754e-06,
"loss": 0.4875,
"step": 205
},
{
"epoch": 1.6480000000000001,
"grad_norm": 2.63915753364563,
"learning_rate": 9.533608689112827e-06,
"loss": 0.5341,
"step": 206
},
{
"epoch": 1.6560000000000001,
"grad_norm": 0.8142545223236084,
"learning_rate": 9.527386359135254e-06,
"loss": 0.5997,
"step": 207
},
{
"epoch": 1.6640000000000001,
"grad_norm": 1.2874032258987427,
"learning_rate": 9.521124854565425e-06,
"loss": 0.5931,
"step": 208
},
{
"epoch": 1.6720000000000002,
"grad_norm": 0.9045368432998657,
"learning_rate": 9.514824229582922e-06,
"loss": 0.5828,
"step": 209
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.7827978730201721,
"learning_rate": 9.508484538705823e-06,
"loss": 0.5094,
"step": 210
},
{
"epoch": 1.688,
"grad_norm": 0.9156500101089478,
"learning_rate": 9.50210583679024e-06,
"loss": 0.5887,
"step": 211
},
{
"epoch": 1.696,
"grad_norm": 0.93426513671875,
"learning_rate": 9.495688179029838e-06,
"loss": 0.5525,
"step": 212
},
{
"epoch": 1.704,
"grad_norm": 1.0492069721221924,
"learning_rate": 9.48923162095536e-06,
"loss": 0.6795,
"step": 213
},
{
"epoch": 1.712,
"grad_norm": 0.9157354235649109,
"learning_rate": 9.482736218434144e-06,
"loss": 0.5795,
"step": 214
},
{
"epoch": 1.72,
"grad_norm": 0.876832902431488,
"learning_rate": 9.476202027669644e-06,
"loss": 0.5398,
"step": 215
},
{
"epoch": 1.728,
"grad_norm": 0.8842900395393372,
"learning_rate": 9.469629105200937e-06,
"loss": 0.4838,
"step": 216
},
{
"epoch": 1.736,
"grad_norm": 0.9871936440467834,
"learning_rate": 9.463017507902245e-06,
"loss": 0.6217,
"step": 217
},
{
"epoch": 1.744,
"grad_norm": 1.0882078409194946,
"learning_rate": 9.45636729298243e-06,
"loss": 0.5909,
"step": 218
},
{
"epoch": 1.752,
"grad_norm": 0.9433255195617676,
"learning_rate": 9.449678517984503e-06,
"loss": 0.547,
"step": 219
},
{
"epoch": 1.76,
"grad_norm": 1.105050802230835,
"learning_rate": 9.442951240785135e-06,
"loss": 0.519,
"step": 220
},
{
"epoch": 1.768,
"grad_norm": 0.8703718781471252,
"learning_rate": 9.436185519594145e-06,
"loss": 0.677,
"step": 221
},
{
"epoch": 1.776,
"grad_norm": 0.9973863959312439,
"learning_rate": 9.429381412954e-06,
"loss": 0.6254,
"step": 222
},
{
"epoch": 1.784,
"grad_norm": 1.0442036390304565,
"learning_rate": 9.422538979739307e-06,
"loss": 0.629,
"step": 223
},
{
"epoch": 1.792,
"grad_norm": 0.8797324299812317,
"learning_rate": 9.415658279156312e-06,
"loss": 0.5605,
"step": 224
},
{
"epoch": 1.8,
"grad_norm": 0.7664978504180908,
"learning_rate": 9.408739370742372e-06,
"loss": 0.5078,
"step": 225
},
{
"epoch": 1.808,
"grad_norm": 0.8313367366790771,
"learning_rate": 9.401782314365458e-06,
"loss": 0.515,
"step": 226
},
{
"epoch": 1.8159999999999998,
"grad_norm": 0.8382720351219177,
"learning_rate": 9.39478717022362e-06,
"loss": 0.5904,
"step": 227
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.7888085246086121,
"learning_rate": 9.387753998844482e-06,
"loss": 0.4307,
"step": 228
},
{
"epoch": 1.8319999999999999,
"grad_norm": 0.8832230567932129,
"learning_rate": 9.380682861084703e-06,
"loss": 0.5782,
"step": 229
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.9309815764427185,
"learning_rate": 9.37357381812946e-06,
"loss": 0.5872,
"step": 230
},
{
"epoch": 1.8479999999999999,
"grad_norm": 0.91801917552948,
"learning_rate": 9.366426931491917e-06,
"loss": 0.5212,
"step": 231
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.8884289264678955,
"learning_rate": 9.359242263012693e-06,
"loss": 0.5056,
"step": 232
},
{
"epoch": 1.8639999999999999,
"grad_norm": 0.9634459018707275,
"learning_rate": 9.352019874859326e-06,
"loss": 0.4745,
"step": 233
},
{
"epoch": 1.8719999999999999,
"grad_norm": 0.8102062940597534,
"learning_rate": 9.344759829525734e-06,
"loss": 0.5154,
"step": 234
},
{
"epoch": 1.88,
"grad_norm": 0.8682310581207275,
"learning_rate": 9.33746218983167e-06,
"loss": 0.6459,
"step": 235
},
{
"epoch": 1.888,
"grad_norm": 0.8204552531242371,
"learning_rate": 9.330127018922195e-06,
"loss": 0.5901,
"step": 236
},
{
"epoch": 1.896,
"grad_norm": 0.8650975227355957,
"learning_rate": 9.32275438026711e-06,
"loss": 0.5927,
"step": 237
},
{
"epoch": 1.904,
"grad_norm": 0.9267044067382812,
"learning_rate": 9.315344337660422e-06,
"loss": 0.5608,
"step": 238
},
{
"epoch": 1.912,
"grad_norm": 0.8690655827522278,
"learning_rate": 9.307896955219787e-06,
"loss": 0.5985,
"step": 239
},
{
"epoch": 1.92,
"grad_norm": 0.9724792838096619,
"learning_rate": 9.300412297385954e-06,
"loss": 0.5905,
"step": 240
},
{
"epoch": 1.928,
"grad_norm": 0.9143782258033752,
"learning_rate": 9.29289042892221e-06,
"loss": 0.4937,
"step": 241
},
{
"epoch": 1.936,
"grad_norm": 0.8621585369110107,
"learning_rate": 9.285331414913816e-06,
"loss": 0.584,
"step": 242
},
{
"epoch": 1.944,
"grad_norm": 0.7874864339828491,
"learning_rate": 9.277735320767449e-06,
"loss": 0.4881,
"step": 243
},
{
"epoch": 1.952,
"grad_norm": 0.9059499502182007,
"learning_rate": 9.270102212210632e-06,
"loss": 0.4926,
"step": 244
},
{
"epoch": 1.96,
"grad_norm": 0.8558206558227539,
"learning_rate": 9.262432155291167e-06,
"loss": 0.5671,
"step": 245
},
{
"epoch": 1.968,
"grad_norm": 1.2835336923599243,
"learning_rate": 9.254725216376562e-06,
"loss": 0.5115,
"step": 246
},
{
"epoch": 1.976,
"grad_norm": 0.8753209710121155,
"learning_rate": 9.246981462153456e-06,
"loss": 0.4818,
"step": 247
},
{
"epoch": 1.984,
"grad_norm": 0.7964107394218445,
"learning_rate": 9.239200959627048e-06,
"loss": 0.5344,
"step": 248
},
{
"epoch": 1.992,
"grad_norm": 0.8038451671600342,
"learning_rate": 9.231383776120512e-06,
"loss": 0.5566,
"step": 249
},
{
"epoch": 2.0,
"grad_norm": 0.8682637214660645,
"learning_rate": 9.223529979274411e-06,
"loss": 0.5996,
"step": 250
},
{
"epoch": 2.008,
"grad_norm": 0.9511418342590332,
"learning_rate": 9.215639637046121e-06,
"loss": 0.5249,
"step": 251
},
{
"epoch": 2.016,
"grad_norm": 1.1337953805923462,
"learning_rate": 9.207712817709237e-06,
"loss": 0.421,
"step": 252
},
{
"epoch": 2.024,
"grad_norm": 0.8518550395965576,
"learning_rate": 9.19974958985298e-06,
"loss": 0.4252,
"step": 253
},
{
"epoch": 2.032,
"grad_norm": 0.9565451741218567,
"learning_rate": 9.191750022381613e-06,
"loss": 0.3497,
"step": 254
},
{
"epoch": 2.04,
"grad_norm": 0.8157749176025391,
"learning_rate": 9.183714184513832e-06,
"loss": 0.3978,
"step": 255
},
{
"epoch": 2.048,
"grad_norm": 1.450162649154663,
"learning_rate": 9.175642145782179e-06,
"loss": 0.4195,
"step": 256
},
{
"epoch": 2.056,
"grad_norm": 1.2736579179763794,
"learning_rate": 9.16753397603243e-06,
"loss": 0.4481,
"step": 257
},
{
"epoch": 2.064,
"grad_norm": 0.9117834568023682,
"learning_rate": 9.159389745423003e-06,
"loss": 0.4056,
"step": 258
},
{
"epoch": 2.072,
"grad_norm": 0.8460937738418579,
"learning_rate": 9.151209524424333e-06,
"loss": 0.3456,
"step": 259
},
{
"epoch": 2.08,
"grad_norm": 1.0585182905197144,
"learning_rate": 9.142993383818284e-06,
"loss": 0.3645,
"step": 260
},
{
"epoch": 2.088,
"grad_norm": 0.949852466583252,
"learning_rate": 9.134741394697517e-06,
"loss": 0.3891,
"step": 261
},
{
"epoch": 2.096,
"grad_norm": 0.8298773169517517,
"learning_rate": 9.126453628464889e-06,
"loss": 0.3439,
"step": 262
},
{
"epoch": 2.104,
"grad_norm": 0.8148176670074463,
"learning_rate": 9.118130156832823e-06,
"loss": 0.4309,
"step": 263
},
{
"epoch": 2.112,
"grad_norm": 0.9829700589179993,
"learning_rate": 9.109771051822702e-06,
"loss": 0.3994,
"step": 264
},
{
"epoch": 2.12,
"grad_norm": 0.8034675121307373,
"learning_rate": 9.10137638576423e-06,
"loss": 0.4744,
"step": 265
},
{
"epoch": 2.128,
"grad_norm": 0.8785883188247681,
"learning_rate": 9.09294623129482e-06,
"loss": 0.3681,
"step": 266
},
{
"epoch": 2.136,
"grad_norm": 1.2200959920883179,
"learning_rate": 9.084480661358954e-06,
"loss": 0.4163,
"step": 267
},
{
"epoch": 2.144,
"grad_norm": 0.8462724089622498,
"learning_rate": 9.07597974920756e-06,
"loss": 0.4157,
"step": 268
},
{
"epoch": 2.152,
"grad_norm": 0.8349795341491699,
"learning_rate": 9.067443568397378e-06,
"loss": 0.45,
"step": 269
},
{
"epoch": 2.16,
"grad_norm": 0.7713517546653748,
"learning_rate": 9.058872192790314e-06,
"loss": 0.2785,
"step": 270
},
{
"epoch": 2.168,
"grad_norm": 0.8653733134269714,
"learning_rate": 9.05026569655281e-06,
"loss": 0.4135,
"step": 271
},
{
"epoch": 2.176,
"grad_norm": 0.7914355397224426,
"learning_rate": 9.041624154155208e-06,
"loss": 0.3978,
"step": 272
},
{
"epoch": 2.184,
"grad_norm": 0.8875446915626526,
"learning_rate": 9.032947640371086e-06,
"loss": 0.4148,
"step": 273
},
{
"epoch": 2.192,
"grad_norm": 0.8357672095298767,
"learning_rate": 9.02423623027663e-06,
"loss": 0.4515,
"step": 274
},
{
"epoch": 2.2,
"grad_norm": 0.7554605007171631,
"learning_rate": 9.01548999924997e-06,
"loss": 0.4099,
"step": 275
},
{
"epoch": 2.208,
"grad_norm": 0.9491044878959656,
"learning_rate": 9.006709022970547e-06,
"loss": 0.3829,
"step": 276
},
{
"epoch": 2.216,
"grad_norm": 0.9047715663909912,
"learning_rate": 8.997893377418432e-06,
"loss": 0.4119,
"step": 277
},
{
"epoch": 2.224,
"grad_norm": 0.8157122731208801,
"learning_rate": 8.98904313887369e-06,
"loss": 0.3683,
"step": 278
},
{
"epoch": 2.232,
"grad_norm": 0.8922542929649353,
"learning_rate": 8.980158383915714e-06,
"loss": 0.5189,
"step": 279
},
{
"epoch": 2.24,
"grad_norm": 1.1166770458221436,
"learning_rate": 8.971239189422555e-06,
"loss": 0.4893,
"step": 280
},
{
"epoch": 2.248,
"grad_norm": 0.8188707828521729,
"learning_rate": 8.962285632570266e-06,
"loss": 0.375,
"step": 281
},
{
"epoch": 2.2560000000000002,
"grad_norm": 0.8818830251693726,
"learning_rate": 8.953297790832231e-06,
"loss": 0.4442,
"step": 282
},
{
"epoch": 2.2640000000000002,
"grad_norm": 0.9548737406730652,
"learning_rate": 8.944275741978495e-06,
"loss": 0.3671,
"step": 283
},
{
"epoch": 2.2720000000000002,
"grad_norm": 0.9230669140815735,
"learning_rate": 8.935219564075087e-06,
"loss": 0.3722,
"step": 284
},
{
"epoch": 2.2800000000000002,
"grad_norm": 0.9684281945228577,
"learning_rate": 8.92612933548335e-06,
"loss": 0.5036,
"step": 285
},
{
"epoch": 2.288,
"grad_norm": 1.3175219297409058,
"learning_rate": 8.917005134859263e-06,
"loss": 0.4662,
"step": 286
},
{
"epoch": 2.296,
"grad_norm": 0.7010194659233093,
"learning_rate": 8.907847041152757e-06,
"loss": 0.3565,
"step": 287
},
{
"epoch": 2.304,
"grad_norm": 0.7920108437538147,
"learning_rate": 8.89865513360703e-06,
"loss": 0.4084,
"step": 288
},
{
"epoch": 2.312,
"grad_norm": 1.3468918800354004,
"learning_rate": 8.889429491757872e-06,
"loss": 0.5612,
"step": 289
},
{
"epoch": 2.32,
"grad_norm": 0.7971108555793762,
"learning_rate": 8.88017019543296e-06,
"loss": 0.4659,
"step": 290
},
{
"epoch": 2.328,
"grad_norm": 0.6736342906951904,
"learning_rate": 8.870877324751186e-06,
"loss": 0.3444,
"step": 291
},
{
"epoch": 2.336,
"grad_norm": 0.743931233882904,
"learning_rate": 8.861550960121946e-06,
"loss": 0.4217,
"step": 292
},
{
"epoch": 2.344,
"grad_norm": 0.8505037426948547,
"learning_rate": 8.852191182244456e-06,
"loss": 0.3812,
"step": 293
},
{
"epoch": 2.352,
"grad_norm": 1.2008588314056396,
"learning_rate": 8.842798072107055e-06,
"loss": 0.464,
"step": 294
},
{
"epoch": 2.36,
"grad_norm": 0.942729651927948,
"learning_rate": 8.833371710986493e-06,
"loss": 0.4023,
"step": 295
},
{
"epoch": 2.368,
"grad_norm": 0.9422355890274048,
"learning_rate": 8.823912180447237e-06,
"loss": 0.4129,
"step": 296
},
{
"epoch": 2.376,
"grad_norm": 0.79082190990448,
"learning_rate": 8.81441956234076e-06,
"loss": 0.5188,
"step": 297
},
{
"epoch": 2.384,
"grad_norm": 0.953883171081543,
"learning_rate": 8.804893938804839e-06,
"loss": 0.4497,
"step": 298
},
{
"epoch": 2.392,
"grad_norm": 0.8168990015983582,
"learning_rate": 8.795335392262841e-06,
"loss": 0.4807,
"step": 299
},
{
"epoch": 2.4,
"grad_norm": 0.7993202209472656,
"learning_rate": 8.785744005423003e-06,
"loss": 0.4685,
"step": 300
},
{
"epoch": 2.408,
"grad_norm": 0.7926377058029175,
"learning_rate": 8.77611986127773e-06,
"loss": 0.3851,
"step": 301
},
{
"epoch": 2.416,
"grad_norm": 0.8498927354812622,
"learning_rate": 8.766463043102864e-06,
"loss": 0.4263,
"step": 302
},
{
"epoch": 2.424,
"grad_norm": 0.9551386833190918,
"learning_rate": 8.756773634456975e-06,
"loss": 0.4211,
"step": 303
},
{
"epoch": 2.432,
"grad_norm": 0.7406211495399475,
"learning_rate": 8.747051719180626e-06,
"loss": 0.4182,
"step": 304
},
{
"epoch": 2.44,
"grad_norm": 0.7700343728065491,
"learning_rate": 8.737297381395657e-06,
"loss": 0.373,
"step": 305
},
{
"epoch": 2.448,
"grad_norm": 0.7161412835121155,
"learning_rate": 8.727510705504453e-06,
"loss": 0.4379,
"step": 306
},
{
"epoch": 2.456,
"grad_norm": 1.073433756828308,
"learning_rate": 8.717691776189214e-06,
"loss": 0.3261,
"step": 307
},
{
"epoch": 2.464,
"grad_norm": 0.8888611197471619,
"learning_rate": 8.707840678411223e-06,
"loss": 0.3806,
"step": 308
},
{
"epoch": 2.472,
"grad_norm": 0.8704436421394348,
"learning_rate": 8.69795749741011e-06,
"loss": 0.3873,
"step": 309
},
{
"epoch": 2.48,
"grad_norm": 0.943534791469574,
"learning_rate": 8.688042318703111e-06,
"loss": 0.4458,
"step": 310
},
{
"epoch": 2.488,
"grad_norm": 0.7201529145240784,
"learning_rate": 8.678095228084343e-06,
"loss": 0.4377,
"step": 311
},
{
"epoch": 2.496,
"grad_norm": 0.8133971095085144,
"learning_rate": 8.66811631162404e-06,
"loss": 0.3975,
"step": 312
},
{
"epoch": 2.504,
"grad_norm": 0.7576972842216492,
"learning_rate": 8.65810565566782e-06,
"loss": 0.3578,
"step": 313
},
{
"epoch": 2.512,
"grad_norm": 0.8077313303947449,
"learning_rate": 8.648063346835943e-06,
"loss": 0.4251,
"step": 314
},
{
"epoch": 2.52,
"grad_norm": 0.8535168766975403,
"learning_rate": 8.637989472022548e-06,
"loss": 0.3954,
"step": 315
},
{
"epoch": 2.528,
"grad_norm": 0.8312145471572876,
"learning_rate": 8.627884118394913e-06,
"loss": 0.358,
"step": 316
},
{
"epoch": 2.536,
"grad_norm": 0.9409812688827515,
"learning_rate": 8.617747373392697e-06,
"loss": 0.3673,
"step": 317
},
{
"epoch": 2.544,
"grad_norm": 0.729206919670105,
"learning_rate": 8.607579324727175e-06,
"loss": 0.4074,
"step": 318
},
{
"epoch": 2.552,
"grad_norm": 0.865915834903717,
"learning_rate": 8.597380060380493e-06,
"loss": 0.4622,
"step": 319
},
{
"epoch": 2.56,
"grad_norm": 0.7116239070892334,
"learning_rate": 8.5871496686049e-06,
"loss": 0.4169,
"step": 320
},
{
"epoch": 2.568,
"grad_norm": 0.9631391763687134,
"learning_rate": 8.576888237921983e-06,
"loss": 0.4627,
"step": 321
},
{
"epoch": 2.576,
"grad_norm": 0.8730547428131104,
"learning_rate": 8.566595857121902e-06,
"loss": 0.4204,
"step": 322
},
{
"epoch": 2.584,
"grad_norm": 0.7882906198501587,
"learning_rate": 8.556272615262623e-06,
"loss": 0.3398,
"step": 323
},
{
"epoch": 2.592,
"grad_norm": 0.9142358899116516,
"learning_rate": 8.545918601669147e-06,
"loss": 0.4639,
"step": 324
},
{
"epoch": 2.6,
"grad_norm": 0.9243710041046143,
"learning_rate": 8.535533905932739e-06,
"loss": 0.4013,
"step": 325
},
{
"epoch": 2.608,
"grad_norm": 0.8279150128364563,
"learning_rate": 8.525118617910144e-06,
"loss": 0.4878,
"step": 326
},
{
"epoch": 2.616,
"grad_norm": 0.9344537854194641,
"learning_rate": 8.514672827722824e-06,
"loss": 0.3711,
"step": 327
},
{
"epoch": 2.624,
"grad_norm": 0.8455937504768372,
"learning_rate": 8.504196625756166e-06,
"loss": 0.3921,
"step": 328
},
{
"epoch": 2.632,
"grad_norm": 0.8718138933181763,
"learning_rate": 8.493690102658703e-06,
"loss": 0.376,
"step": 329
},
{
"epoch": 2.64,
"grad_norm": 0.8311028480529785,
"learning_rate": 8.483153349341336e-06,
"loss": 0.4188,
"step": 330
},
{
"epoch": 2.648,
"grad_norm": 0.8838032484054565,
"learning_rate": 8.472586456976534e-06,
"loss": 0.3689,
"step": 331
},
{
"epoch": 2.656,
"grad_norm": 1.0043684244155884,
"learning_rate": 8.461989516997565e-06,
"loss": 0.4979,
"step": 332
},
{
"epoch": 2.664,
"grad_norm": 0.855467677116394,
"learning_rate": 8.45136262109768e-06,
"loss": 0.4576,
"step": 333
},
{
"epoch": 2.672,
"grad_norm": 0.7641392946243286,
"learning_rate": 8.440705861229344e-06,
"loss": 0.4115,
"step": 334
},
{
"epoch": 2.68,
"grad_norm": 1.2907041311264038,
"learning_rate": 8.430019329603423e-06,
"loss": 0.4438,
"step": 335
},
{
"epoch": 2.6879999999999997,
"grad_norm": 0.8499248027801514,
"learning_rate": 8.41930311868839e-06,
"loss": 0.3627,
"step": 336
},
{
"epoch": 2.6959999999999997,
"grad_norm": 0.8722188472747803,
"learning_rate": 8.408557321209534e-06,
"loss": 0.3774,
"step": 337
},
{
"epoch": 2.7039999999999997,
"grad_norm": 0.862125039100647,
"learning_rate": 8.397782030148147e-06,
"loss": 0.3429,
"step": 338
},
{
"epoch": 2.7119999999999997,
"grad_norm": 0.7485009431838989,
"learning_rate": 8.386977338740724e-06,
"loss": 0.3594,
"step": 339
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.8689549565315247,
"learning_rate": 8.376143340478153e-06,
"loss": 0.4013,
"step": 340
},
{
"epoch": 2.7279999999999998,
"grad_norm": 1.5140808820724487,
"learning_rate": 8.365280129104912e-06,
"loss": 0.5017,
"step": 341
},
{
"epoch": 2.7359999999999998,
"grad_norm": 0.781104564666748,
"learning_rate": 8.354387798618254e-06,
"loss": 0.2975,
"step": 342
},
{
"epoch": 2.7439999999999998,
"grad_norm": 0.9789696335792542,
"learning_rate": 8.34346644326739e-06,
"loss": 0.3661,
"step": 343
},
{
"epoch": 2.752,
"grad_norm": 1.0142971277236938,
"learning_rate": 8.332516157552684e-06,
"loss": 0.3785,
"step": 344
},
{
"epoch": 2.76,
"grad_norm": 0.7328018546104431,
"learning_rate": 8.321537036224822e-06,
"loss": 0.4422,
"step": 345
},
{
"epoch": 2.768,
"grad_norm": 0.7889101505279541,
"learning_rate": 8.310529174284004e-06,
"loss": 0.4195,
"step": 346
},
{
"epoch": 2.776,
"grad_norm": 1.032447099685669,
"learning_rate": 8.299492666979114e-06,
"loss": 0.4659,
"step": 347
},
{
"epoch": 2.784,
"grad_norm": 0.8374775648117065,
"learning_rate": 8.288427609806899e-06,
"loss": 0.4177,
"step": 348
},
{
"epoch": 2.792,
"grad_norm": 0.8811750411987305,
"learning_rate": 8.277334098511147e-06,
"loss": 0.4335,
"step": 349
},
{
"epoch": 2.8,
"grad_norm": 1.1630526781082153,
"learning_rate": 8.266212229081846e-06,
"loss": 0.4981,
"step": 350
},
{
"epoch": 2.808,
"grad_norm": 0.7661845684051514,
"learning_rate": 8.255062097754371e-06,
"loss": 0.383,
"step": 351
},
{
"epoch": 2.816,
"grad_norm": 0.8048927187919617,
"learning_rate": 8.243883801008632e-06,
"loss": 0.3572,
"step": 352
},
{
"epoch": 2.824,
"grad_norm": 0.7940247058868408,
"learning_rate": 8.232677435568252e-06,
"loss": 0.3471,
"step": 353
},
{
"epoch": 2.832,
"grad_norm": 0.7869440913200378,
"learning_rate": 8.221443098399733e-06,
"loss": 0.3616,
"step": 354
},
{
"epoch": 2.84,
"grad_norm": 0.9467028379440308,
"learning_rate": 8.210180886711603e-06,
"loss": 0.4388,
"step": 355
},
{
"epoch": 2.848,
"grad_norm": 0.9510444402694702,
"learning_rate": 8.198890897953586e-06,
"loss": 0.3982,
"step": 356
},
{
"epoch": 2.856,
"grad_norm": 0.7974853515625,
"learning_rate": 8.187573229815757e-06,
"loss": 0.4713,
"step": 357
},
{
"epoch": 2.864,
"grad_norm": 0.7801048755645752,
"learning_rate": 8.176227980227693e-06,
"loss": 0.4379,
"step": 358
},
{
"epoch": 2.872,
"grad_norm": 0.8292092084884644,
"learning_rate": 8.164855247357628e-06,
"loss": 0.4051,
"step": 359
},
{
"epoch": 2.88,
"grad_norm": 1.026355504989624,
"learning_rate": 8.153455129611605e-06,
"loss": 0.4278,
"step": 360
},
{
"epoch": 2.888,
"grad_norm": 0.9270377159118652,
"learning_rate": 8.142027725632622e-06,
"loss": 0.5052,
"step": 361
},
{
"epoch": 2.896,
"grad_norm": 0.790385365486145,
"learning_rate": 8.130573134299782e-06,
"loss": 0.339,
"step": 362
},
{
"epoch": 2.904,
"grad_norm": 0.9593288898468018,
"learning_rate": 8.119091454727427e-06,
"loss": 0.4492,
"step": 363
},
{
"epoch": 2.912,
"grad_norm": 1.2080198526382446,
"learning_rate": 8.107582786264299e-06,
"loss": 0.5128,
"step": 364
},
{
"epoch": 2.92,
"grad_norm": 0.9071283340454102,
"learning_rate": 8.09604722849266e-06,
"loss": 0.4407,
"step": 365
},
{
"epoch": 2.928,
"grad_norm": 0.821433424949646,
"learning_rate": 8.084484881227449e-06,
"loss": 0.4552,
"step": 366
},
{
"epoch": 2.936,
"grad_norm": 0.7334813475608826,
"learning_rate": 8.072895844515398e-06,
"loss": 0.4174,
"step": 367
},
{
"epoch": 2.944,
"grad_norm": 0.9713824391365051,
"learning_rate": 8.061280218634192e-06,
"loss": 0.431,
"step": 368
},
{
"epoch": 2.952,
"grad_norm": 0.7867675423622131,
"learning_rate": 8.049638104091575e-06,
"loss": 0.4379,
"step": 369
},
{
"epoch": 2.96,
"grad_norm": 0.8193316459655762,
"learning_rate": 8.037969601624495e-06,
"loss": 0.4465,
"step": 370
},
{
"epoch": 2.968,
"grad_norm": 0.8776450753211975,
"learning_rate": 8.026274812198235e-06,
"loss": 0.3944,
"step": 371
},
{
"epoch": 2.976,
"grad_norm": 0.6917450428009033,
"learning_rate": 8.014553837005527e-06,
"loss": 0.3864,
"step": 372
},
{
"epoch": 2.984,
"grad_norm": 0.94938063621521,
"learning_rate": 8.002806777465685e-06,
"loss": 0.5753,
"step": 373
},
{
"epoch": 2.992,
"grad_norm": 0.7153763771057129,
"learning_rate": 7.99103373522373e-06,
"loss": 0.3676,
"step": 374
},
{
"epoch": 3.0,
"grad_norm": 0.9804494976997375,
"learning_rate": 7.9792348121495e-06,
"loss": 0.4525,
"step": 375
},
{
"epoch": 3.008,
"grad_norm": 0.8869629502296448,
"learning_rate": 7.967410110336782e-06,
"loss": 0.2091,
"step": 376
},
{
"epoch": 3.016,
"grad_norm": 1.2333577871322632,
"learning_rate": 7.955559732102414e-06,
"loss": 0.3921,
"step": 377
},
{
"epoch": 3.024,
"grad_norm": 1.1201192140579224,
"learning_rate": 7.943683779985412e-06,
"loss": 0.3377,
"step": 378
},
{
"epoch": 3.032,
"grad_norm": 0.7908293008804321,
"learning_rate": 7.931782356746076e-06,
"loss": 0.3317,
"step": 379
},
{
"epoch": 3.04,
"grad_norm": 1.140568733215332,
"learning_rate": 7.919855565365102e-06,
"loss": 0.2545,
"step": 380
},
{
"epoch": 3.048,
"grad_norm": 1.1746877431869507,
"learning_rate": 7.907903509042696e-06,
"loss": 0.328,
"step": 381
},
{
"epoch": 3.056,
"grad_norm": 0.8736027479171753,
"learning_rate": 7.895926291197667e-06,
"loss": 0.2423,
"step": 382
},
{
"epoch": 3.064,
"grad_norm": 2.1225597858428955,
"learning_rate": 7.883924015466554e-06,
"loss": 0.2731,
"step": 383
},
{
"epoch": 3.072,
"grad_norm": 0.8059532642364502,
"learning_rate": 7.871896785702707e-06,
"loss": 0.3011,
"step": 384
},
{
"epoch": 3.08,
"grad_norm": 0.8498049974441528,
"learning_rate": 7.859844705975405e-06,
"loss": 0.2865,
"step": 385
},
{
"epoch": 3.088,
"grad_norm": 2.4544568061828613,
"learning_rate": 7.847767880568944e-06,
"loss": 0.2806,
"step": 386
},
{
"epoch": 3.096,
"grad_norm": 0.7781065702438354,
"learning_rate": 7.835666413981744e-06,
"loss": 0.2607,
"step": 387
},
{
"epoch": 3.104,
"grad_norm": 0.8441874384880066,
"learning_rate": 7.823540410925434e-06,
"loss": 0.2531,
"step": 388
},
{
"epoch": 3.112,
"grad_norm": 0.7791491150856018,
"learning_rate": 7.811389976323963e-06,
"loss": 0.2228,
"step": 389
},
{
"epoch": 3.12,
"grad_norm": 1.0188428163528442,
"learning_rate": 7.799215215312667e-06,
"loss": 0.3195,
"step": 390
},
{
"epoch": 3.128,
"grad_norm": 0.872238039970398,
"learning_rate": 7.787016233237387e-06,
"loss": 0.1906,
"step": 391
},
{
"epoch": 3.136,
"grad_norm": 0.8092172145843506,
"learning_rate": 7.774793135653537e-06,
"loss": 0.2425,
"step": 392
},
{
"epoch": 3.144,
"grad_norm": 4.553264141082764,
"learning_rate": 7.7625460283252e-06,
"loss": 0.3214,
"step": 393
},
{
"epoch": 3.152,
"grad_norm": 0.9575441479682922,
"learning_rate": 7.750275017224208e-06,
"loss": 0.2925,
"step": 394
},
{
"epoch": 3.16,
"grad_norm": 1.2400014400482178,
"learning_rate": 7.737980208529232e-06,
"loss": 0.3511,
"step": 395
},
{
"epoch": 3.168,
"grad_norm": 0.8100807070732117,
"learning_rate": 7.725661708624855e-06,
"loss": 0.3122,
"step": 396
},
{
"epoch": 3.176,
"grad_norm": 0.6804743409156799,
"learning_rate": 7.713319624100657e-06,
"loss": 0.2015,
"step": 397
},
{
"epoch": 3.184,
"grad_norm": 0.9154347777366638,
"learning_rate": 7.700954061750295e-06,
"loss": 0.3348,
"step": 398
},
{
"epoch": 3.192,
"grad_norm": 0.92096346616745,
"learning_rate": 7.688565128570564e-06,
"loss": 0.3135,
"step": 399
},
{
"epoch": 3.2,
"grad_norm": 0.7802610993385315,
"learning_rate": 7.676152931760496e-06,
"loss": 0.2611,
"step": 400
},
{
"epoch": 3.208,
"grad_norm": 0.7140606045722961,
"learning_rate": 7.663717578720412e-06,
"loss": 0.2202,
"step": 401
},
{
"epoch": 3.216,
"grad_norm": 1.0909085273742676,
"learning_rate": 7.651259177050996e-06,
"loss": 0.3117,
"step": 402
},
{
"epoch": 3.224,
"grad_norm": 0.8841260671615601,
"learning_rate": 7.638777834552372e-06,
"loss": 0.2847,
"step": 403
},
{
"epoch": 3.232,
"grad_norm": 0.8838410377502441,
"learning_rate": 7.626273659223166e-06,
"loss": 0.3084,
"step": 404
},
{
"epoch": 3.24,
"grad_norm": 1.0443016290664673,
"learning_rate": 7.61374675925957e-06,
"loss": 0.293,
"step": 405
},
{
"epoch": 3.248,
"grad_norm": 0.8339405655860901,
"learning_rate": 7.601197243054411e-06,
"loss": 0.2314,
"step": 406
},
{
"epoch": 3.2560000000000002,
"grad_norm": 0.8528264760971069,
"learning_rate": 7.588625219196208e-06,
"loss": 0.2275,
"step": 407
},
{
"epoch": 3.2640000000000002,
"grad_norm": 0.7928086519241333,
"learning_rate": 7.576030796468233e-06,
"loss": 0.2629,
"step": 408
},
{
"epoch": 3.2720000000000002,
"grad_norm": 0.7202726006507874,
"learning_rate": 7.563414083847573e-06,
"loss": 0.2481,
"step": 409
},
{
"epoch": 3.2800000000000002,
"grad_norm": 0.8470425009727478,
"learning_rate": 7.5507751905041885e-06,
"loss": 0.2975,
"step": 410
},
{
"epoch": 3.288,
"grad_norm": 1.0719774961471558,
"learning_rate": 7.538114225799955e-06,
"loss": 0.327,
"step": 411
},
{
"epoch": 3.296,
"grad_norm": 0.797988772392273,
"learning_rate": 7.525431299287737e-06,
"loss": 0.3041,
"step": 412
},
{
"epoch": 3.304,
"grad_norm": 0.7145325541496277,
"learning_rate": 7.512726520710429e-06,
"loss": 0.2963,
"step": 413
},
{
"epoch": 3.312,
"grad_norm": 0.7283927202224731,
"learning_rate": 7.500000000000001e-06,
"loss": 0.235,
"step": 414
},
{
"epoch": 3.32,
"grad_norm": 0.7725811004638672,
"learning_rate": 7.4872518472765594e-06,
"loss": 0.2348,
"step": 415
},
{
"epoch": 3.328,
"grad_norm": 0.8668394088745117,
"learning_rate": 7.474482172847391e-06,
"loss": 0.26,
"step": 416
},
{
"epoch": 3.336,
"grad_norm": 0.87476646900177,
"learning_rate": 7.461691087205993e-06,
"loss": 0.2704,
"step": 417
},
{
"epoch": 3.344,
"grad_norm": 0.7493270039558411,
"learning_rate": 7.4488787010311425e-06,
"loss": 0.3159,
"step": 418
},
{
"epoch": 3.352,
"grad_norm": 0.8767033815383911,
"learning_rate": 7.436045125185923e-06,
"loss": 0.3183,
"step": 419
},
{
"epoch": 3.36,
"grad_norm": 0.805707573890686,
"learning_rate": 7.423190470716761e-06,
"loss": 0.289,
"step": 420
},
{
"epoch": 3.368,
"grad_norm": 0.859490692615509,
"learning_rate": 7.4103148488524824e-06,
"loss": 0.2513,
"step": 421
},
{
"epoch": 3.376,
"grad_norm": 0.7154383659362793,
"learning_rate": 7.3974183710033334e-06,
"loss": 0.2479,
"step": 422
},
{
"epoch": 3.384,
"grad_norm": 0.8182111978530884,
"learning_rate": 7.384501148760024e-06,
"loss": 0.2879,
"step": 423
},
{
"epoch": 3.392,
"grad_norm": 0.7623195648193359,
"learning_rate": 7.371563293892761e-06,
"loss": 0.2687,
"step": 424
},
{
"epoch": 3.4,
"grad_norm": 0.8968961238861084,
"learning_rate": 7.3586049183502875e-06,
"loss": 0.2893,
"step": 425
},
{
"epoch": 3.408,
"grad_norm": 0.81312495470047,
"learning_rate": 7.345626134258897e-06,
"loss": 0.2856,
"step": 426
},
{
"epoch": 3.416,
"grad_norm": 0.7374573945999146,
"learning_rate": 7.3326270539214826e-06,
"loss": 0.2818,
"step": 427
},
{
"epoch": 3.424,
"grad_norm": 0.6881022453308105,
"learning_rate": 7.319607789816555e-06,
"loss": 0.2353,
"step": 428
},
{
"epoch": 3.432,
"grad_norm": 0.8280784487724304,
"learning_rate": 7.306568454597269e-06,
"loss": 0.3473,
"step": 429
},
{
"epoch": 3.44,
"grad_norm": 0.7189634442329407,
"learning_rate": 7.293509161090453e-06,
"loss": 0.2186,
"step": 430
},
{
"epoch": 3.448,
"grad_norm": 0.85979163646698,
"learning_rate": 7.28043002229563e-06,
"loss": 0.2869,
"step": 431
},
{
"epoch": 3.456,
"grad_norm": 0.846011757850647,
"learning_rate": 7.2673311513840395e-06,
"loss": 0.2669,
"step": 432
},
{
"epoch": 3.464,
"grad_norm": 0.7689185738563538,
"learning_rate": 7.2542126616976596e-06,
"loss": 0.2488,
"step": 433
},
{
"epoch": 3.472,
"grad_norm": 0.9952272176742554,
"learning_rate": 7.241074666748228e-06,
"loss": 0.2867,
"step": 434
},
{
"epoch": 3.48,
"grad_norm": 0.9386110901832581,
"learning_rate": 7.227917280216254e-06,
"loss": 0.3069,
"step": 435
},
{
"epoch": 3.488,
"grad_norm": 0.9380288124084473,
"learning_rate": 7.214740615950041e-06,
"loss": 0.2969,
"step": 436
},
{
"epoch": 3.496,
"grad_norm": 0.944202184677124,
"learning_rate": 7.201544787964698e-06,
"loss": 0.2469,
"step": 437
},
{
"epoch": 3.504,
"grad_norm": 0.898861289024353,
"learning_rate": 7.188329910441154e-06,
"loss": 0.2336,
"step": 438
},
{
"epoch": 3.512,
"grad_norm": 1.0536229610443115,
"learning_rate": 7.175096097725169e-06,
"loss": 0.3452,
"step": 439
},
{
"epoch": 3.52,
"grad_norm": 0.9392815232276917,
"learning_rate": 7.161843464326349e-06,
"loss": 0.294,
"step": 440
},
{
"epoch": 3.528,
"grad_norm": 0.7755203247070312,
"learning_rate": 7.148572124917148e-06,
"loss": 0.2786,
"step": 441
},
{
"epoch": 3.536,
"grad_norm": 0.7829739451408386,
"learning_rate": 7.135282194331881e-06,
"loss": 0.2856,
"step": 442
},
{
"epoch": 3.544,
"grad_norm": 1.1139544248580933,
"learning_rate": 7.121973787565727e-06,
"loss": 0.3355,
"step": 443
},
{
"epoch": 3.552,
"grad_norm": 0.8333198428153992,
"learning_rate": 7.1086470197737405e-06,
"loss": 0.256,
"step": 444
},
{
"epoch": 3.56,
"grad_norm": 0.7802929282188416,
"learning_rate": 7.095302006269842e-06,
"loss": 0.2372,
"step": 445
},
{
"epoch": 3.568,
"grad_norm": 0.8746179342269897,
"learning_rate": 7.0819388625258385e-06,
"loss": 0.3049,
"step": 446
},
{
"epoch": 3.576,
"grad_norm": 0.7507904767990112,
"learning_rate": 7.06855770417041e-06,
"loss": 0.2701,
"step": 447
},
{
"epoch": 3.584,
"grad_norm": 0.8713604211807251,
"learning_rate": 7.05515864698811e-06,
"loss": 0.2763,
"step": 448
},
{
"epoch": 3.592,
"grad_norm": 0.9194297790527344,
"learning_rate": 7.041741806918372e-06,
"loss": 0.3081,
"step": 449
},
{
"epoch": 3.6,
"grad_norm": 0.769226610660553,
"learning_rate": 7.028307300054499e-06,
"loss": 0.3234,
"step": 450
},
{
"epoch": 3.608,
"grad_norm": 0.763100802898407,
"learning_rate": 7.014855242642662e-06,
"loss": 0.2972,
"step": 451
},
{
"epoch": 3.616,
"grad_norm": 0.7915220260620117,
"learning_rate": 7.0013857510808934e-06,
"loss": 0.247,
"step": 452
},
{
"epoch": 3.624,
"grad_norm": 0.8212165236473083,
"learning_rate": 6.987898941918082e-06,
"loss": 0.2414,
"step": 453
},
{
"epoch": 3.632,
"grad_norm": 0.658478856086731,
"learning_rate": 6.974394931852957e-06,
"loss": 0.2388,
"step": 454
},
{
"epoch": 3.64,
"grad_norm": 0.8130829334259033,
"learning_rate": 6.960873837733089e-06,
"loss": 0.2811,
"step": 455
},
{
"epoch": 3.648,
"grad_norm": 0.7456867098808289,
"learning_rate": 6.94733577655387e-06,
"loss": 0.2794,
"step": 456
},
{
"epoch": 3.656,
"grad_norm": 0.8971588611602783,
"learning_rate": 6.933780865457508e-06,
"loss": 0.3021,
"step": 457
},
{
"epoch": 3.664,
"grad_norm": 0.987177312374115,
"learning_rate": 6.920209221732007e-06,
"loss": 0.314,
"step": 458
},
{
"epoch": 3.672,
"grad_norm": 0.8147737383842468,
"learning_rate": 6.90662096281016e-06,
"loss": 0.3202,
"step": 459
},
{
"epoch": 3.68,
"grad_norm": 0.668489396572113,
"learning_rate": 6.893016206268518e-06,
"loss": 0.2737,
"step": 460
},
{
"epoch": 3.6879999999999997,
"grad_norm": 0.7580770254135132,
"learning_rate": 6.879395069826394e-06,
"loss": 0.2473,
"step": 461
},
{
"epoch": 3.6959999999999997,
"grad_norm": 1.1495063304901123,
"learning_rate": 6.865757671344827e-06,
"loss": 0.2972,
"step": 462
},
{
"epoch": 3.7039999999999997,
"grad_norm": 1.5628399848937988,
"learning_rate": 6.85210412882557e-06,
"loss": 0.2253,
"step": 463
},
{
"epoch": 3.7119999999999997,
"grad_norm": 0.7946319580078125,
"learning_rate": 6.838434560410064e-06,
"loss": 0.2689,
"step": 464
},
{
"epoch": 3.7199999999999998,
"grad_norm": 0.9710001349449158,
"learning_rate": 6.824749084378428e-06,
"loss": 0.4637,
"step": 465
},
{
"epoch": 3.7279999999999998,
"grad_norm": 0.9095158576965332,
"learning_rate": 6.811047819148413e-06,
"loss": 0.4004,
"step": 466
},
{
"epoch": 3.7359999999999998,
"grad_norm": 0.848308265209198,
"learning_rate": 6.7973308832744035e-06,
"loss": 0.2459,
"step": 467
},
{
"epoch": 3.7439999999999998,
"grad_norm": 0.9440385103225708,
"learning_rate": 6.783598395446371e-06,
"loss": 0.2605,
"step": 468
},
{
"epoch": 3.752,
"grad_norm": 0.7493250370025635,
"learning_rate": 6.769850474488859e-06,
"loss": 0.266,
"step": 469
},
{
"epoch": 3.76,
"grad_norm": 0.9245112538337708,
"learning_rate": 6.756087239359948e-06,
"loss": 0.3054,
"step": 470
},
{
"epoch": 3.768,
"grad_norm": 0.7723512649536133,
"learning_rate": 6.742308809150232e-06,
"loss": 0.2728,
"step": 471
},
{
"epoch": 3.776,
"grad_norm": 0.8170971870422363,
"learning_rate": 6.728515303081782e-06,
"loss": 0.3104,
"step": 472
},
{
"epoch": 3.784,
"grad_norm": 0.7314550280570984,
"learning_rate": 6.714706840507122e-06,
"loss": 0.2466,
"step": 473
},
{
"epoch": 3.792,
"grad_norm": 0.8251045942306519,
"learning_rate": 6.700883540908185e-06,
"loss": 0.22,
"step": 474
},
{
"epoch": 3.8,
"grad_norm": 0.8969084024429321,
"learning_rate": 6.687045523895292e-06,
"loss": 0.2554,
"step": 475
},
{
"epoch": 3.808,
"grad_norm": 0.9797028303146362,
"learning_rate": 6.673192909206109e-06,
"loss": 0.2945,
"step": 476
},
{
"epoch": 3.816,
"grad_norm": 0.6120610237121582,
"learning_rate": 6.6593258167046115e-06,
"loss": 0.2175,
"step": 477
},
{
"epoch": 3.824,
"grad_norm": 0.7815307974815369,
"learning_rate": 6.64544436638005e-06,
"loss": 0.2863,
"step": 478
},
{
"epoch": 3.832,
"grad_norm": 0.8799474835395813,
"learning_rate": 6.63154867834591e-06,
"loss": 0.2541,
"step": 479
},
{
"epoch": 3.84,
"grad_norm": 0.7827337980270386,
"learning_rate": 6.617638872838874e-06,
"loss": 0.2887,
"step": 480
},
{
"epoch": 3.848,
"grad_norm": 0.7949901223182678,
"learning_rate": 6.603715070217779e-06,
"loss": 0.2471,
"step": 481
},
{
"epoch": 3.856,
"grad_norm": 0.9216195940971375,
"learning_rate": 6.589777390962575e-06,
"loss": 0.2361,
"step": 482
},
{
"epoch": 3.864,
"grad_norm": 0.9386386275291443,
"learning_rate": 6.5758259556732896e-06,
"loss": 0.3444,
"step": 483
},
{
"epoch": 3.872,
"grad_norm": 0.9753653407096863,
"learning_rate": 6.561860885068972e-06,
"loss": 0.2515,
"step": 484
},
{
"epoch": 3.88,
"grad_norm": 0.8993600010871887,
"learning_rate": 6.547882299986658e-06,
"loss": 0.2978,
"step": 485
},
{
"epoch": 3.888,
"grad_norm": 1.270437479019165,
"learning_rate": 6.53389032138032e-06,
"loss": 0.2523,
"step": 486
},
{
"epoch": 3.896,
"grad_norm": 0.9538208842277527,
"learning_rate": 6.519885070319827e-06,
"loss": 0.2749,
"step": 487
},
{
"epoch": 3.904,
"grad_norm": 0.7965839505195618,
"learning_rate": 6.505866667989884e-06,
"loss": 0.316,
"step": 488
},
{
"epoch": 3.912,
"grad_norm": 0.8446824550628662,
"learning_rate": 6.491835235688999e-06,
"loss": 0.3438,
"step": 489
},
{
"epoch": 3.92,
"grad_norm": 0.7848514914512634,
"learning_rate": 6.477790894828422e-06,
"loss": 0.2165,
"step": 490
},
{
"epoch": 3.928,
"grad_norm": 0.722306489944458,
"learning_rate": 6.463733766931096e-06,
"loss": 0.2449,
"step": 491
},
{
"epoch": 3.936,
"grad_norm": 0.9112846255302429,
"learning_rate": 6.449663973630613e-06,
"loss": 0.2707,
"step": 492
},
{
"epoch": 3.944,
"grad_norm": 0.8071252703666687,
"learning_rate": 6.435581636670154e-06,
"loss": 0.2387,
"step": 493
},
{
"epoch": 3.952,
"grad_norm": 0.8816282749176025,
"learning_rate": 6.421486877901436e-06,
"loss": 0.2018,
"step": 494
},
{
"epoch": 3.96,
"grad_norm": 0.7326090335845947,
"learning_rate": 6.407379819283661e-06,
"loss": 0.2245,
"step": 495
},
{
"epoch": 3.968,
"grad_norm": 0.862907886505127,
"learning_rate": 6.393260582882462e-06,
"loss": 0.2872,
"step": 496
},
{
"epoch": 3.976,
"grad_norm": 0.7406136989593506,
"learning_rate": 6.379129290868837e-06,
"loss": 0.2779,
"step": 497
},
{
"epoch": 3.984,
"grad_norm": 1.0465444326400757,
"learning_rate": 6.364986065518106e-06,
"loss": 0.3433,
"step": 498
},
{
"epoch": 3.992,
"grad_norm": 0.7991510629653931,
"learning_rate": 6.350831029208844e-06,
"loss": 0.2742,
"step": 499
},
{
"epoch": 4.0,
"grad_norm": 0.9392898678779602,
"learning_rate": 6.336664304421818e-06,
"loss": 0.2667,
"step": 500
},
{
"epoch": 4.008,
"grad_norm": 1.1515122652053833,
"learning_rate": 6.322486013738942e-06,
"loss": 0.1675,
"step": 501
},
{
"epoch": 4.016,
"grad_norm": 1.152869462966919,
"learning_rate": 6.308296279842204e-06,
"loss": 0.1561,
"step": 502
},
{
"epoch": 4.024,
"grad_norm": 0.8425310850143433,
"learning_rate": 6.294095225512604e-06,
"loss": 0.1293,
"step": 503
},
{
"epoch": 4.032,
"grad_norm": 0.8141390681266785,
"learning_rate": 6.279882973629101e-06,
"loss": 0.1929,
"step": 504
},
{
"epoch": 4.04,
"grad_norm": 0.9013286828994751,
"learning_rate": 6.265659647167542e-06,
"loss": 0.1551,
"step": 505
},
{
"epoch": 4.048,
"grad_norm": 0.8535316586494446,
"learning_rate": 6.2514253691996e-06,
"loss": 0.1687,
"step": 506
},
{
"epoch": 4.056,
"grad_norm": 1.3743019104003906,
"learning_rate": 6.237180262891709e-06,
"loss": 0.212,
"step": 507
},
{
"epoch": 4.064,
"grad_norm": 1.4581520557403564,
"learning_rate": 6.222924451504001e-06,
"loss": 0.198,
"step": 508
},
{
"epoch": 4.072,
"grad_norm": 1.3407318592071533,
"learning_rate": 6.208658058389232e-06,
"loss": 0.1358,
"step": 509
},
{
"epoch": 4.08,
"grad_norm": 0.8731305003166199,
"learning_rate": 6.194381206991723e-06,
"loss": 0.1078,
"step": 510
},
{
"epoch": 4.088,
"grad_norm": 0.7974117398262024,
"learning_rate": 6.180094020846291e-06,
"loss": 0.1521,
"step": 511
},
{
"epoch": 4.096,
"grad_norm": 0.7402255535125732,
"learning_rate": 6.165796623577171e-06,
"loss": 0.105,
"step": 512
},
{
"epoch": 4.104,
"grad_norm": 0.7679521441459656,
"learning_rate": 6.15148913889696e-06,
"loss": 0.1433,
"step": 513
},
{
"epoch": 4.112,
"grad_norm": 0.8533539175987244,
"learning_rate": 6.1371716906055336e-06,
"loss": 0.203,
"step": 514
},
{
"epoch": 4.12,
"grad_norm": 0.8484000563621521,
"learning_rate": 6.122844402588982e-06,
"loss": 0.112,
"step": 515
},
{
"epoch": 4.128,
"grad_norm": 0.7334287762641907,
"learning_rate": 6.10850739881854e-06,
"loss": 0.1203,
"step": 516
},
{
"epoch": 4.136,
"grad_norm": 0.8749109506607056,
"learning_rate": 6.094160803349508e-06,
"loss": 0.1452,
"step": 517
},
{
"epoch": 4.144,
"grad_norm": 1.0502787828445435,
"learning_rate": 6.079804740320181e-06,
"loss": 0.1226,
"step": 518
},
{
"epoch": 4.152,
"grad_norm": 0.9388486742973328,
"learning_rate": 6.065439333950776e-06,
"loss": 0.1565,
"step": 519
},
{
"epoch": 4.16,
"grad_norm": 0.7821274995803833,
"learning_rate": 6.051064708542357e-06,
"loss": 0.0648,
"step": 520
},
{
"epoch": 4.168,
"grad_norm": 0.8100799322128296,
"learning_rate": 6.036680988475756e-06,
"loss": 0.1198,
"step": 521
},
{
"epoch": 4.176,
"grad_norm": 0.8949313759803772,
"learning_rate": 6.022288298210502e-06,
"loss": 0.18,
"step": 522
},
{
"epoch": 4.184,
"grad_norm": 0.770513117313385,
"learning_rate": 6.00788676228374e-06,
"loss": 0.1465,
"step": 523
},
{
"epoch": 4.192,
"grad_norm": 0.7669388055801392,
"learning_rate": 5.993476505309154e-06,
"loss": 0.1009,
"step": 524
},
{
"epoch": 4.2,
"grad_norm": 0.7349802851676941,
"learning_rate": 5.979057651975893e-06,
"loss": 0.097,
"step": 525
},
{
"epoch": 4.208,
"grad_norm": 0.788063108921051,
"learning_rate": 5.964630327047485e-06,
"loss": 0.1451,
"step": 526
},
{
"epoch": 4.216,
"grad_norm": 0.9150925278663635,
"learning_rate": 5.9501946553607615e-06,
"loss": 0.1764,
"step": 527
},
{
"epoch": 4.224,
"grad_norm": 0.8285714387893677,
"learning_rate": 5.935750761824777e-06,
"loss": 0.0965,
"step": 528
},
{
"epoch": 4.232,
"grad_norm": 0.9090161919593811,
"learning_rate": 5.921298771419731e-06,
"loss": 0.1792,
"step": 529
},
{
"epoch": 4.24,
"grad_norm": 0.7813971638679504,
"learning_rate": 5.906838809195879e-06,
"loss": 0.1492,
"step": 530
},
{
"epoch": 4.248,
"grad_norm": 0.7030081748962402,
"learning_rate": 5.8923710002724595e-06,
"loss": 0.0911,
"step": 531
},
{
"epoch": 4.256,
"grad_norm": 0.8696473240852356,
"learning_rate": 5.877895469836604e-06,
"loss": 0.1808,
"step": 532
},
{
"epoch": 4.264,
"grad_norm": 0.8898045420646667,
"learning_rate": 5.863412343142258e-06,
"loss": 0.2043,
"step": 533
},
{
"epoch": 4.272,
"grad_norm": 0.9002187848091125,
"learning_rate": 5.848921745509094e-06,
"loss": 0.1582,
"step": 534
},
{
"epoch": 4.28,
"grad_norm": 1.1970871686935425,
"learning_rate": 5.8344238023214305e-06,
"loss": 0.1485,
"step": 535
},
{
"epoch": 4.288,
"grad_norm": 0.8672990798950195,
"learning_rate": 5.819918639027149e-06,
"loss": 0.1837,
"step": 536
},
{
"epoch": 4.296,
"grad_norm": 0.8390870094299316,
"learning_rate": 5.805406381136598e-06,
"loss": 0.1557,
"step": 537
},
{
"epoch": 4.304,
"grad_norm": 1.0075196027755737,
"learning_rate": 5.790887154221521e-06,
"loss": 0.2385,
"step": 538
},
{
"epoch": 4.312,
"grad_norm": 0.9522439241409302,
"learning_rate": 5.776361083913959e-06,
"loss": 0.1226,
"step": 539
},
{
"epoch": 4.32,
"grad_norm": 0.9824661612510681,
"learning_rate": 5.7618282959051685e-06,
"loss": 0.1157,
"step": 540
},
{
"epoch": 4.328,
"grad_norm": 0.8090677261352539,
"learning_rate": 5.747288915944533e-06,
"loss": 0.1193,
"step": 541
},
{
"epoch": 4.336,
"grad_norm": 0.7453141212463379,
"learning_rate": 5.7327430698384775e-06,
"loss": 0.1051,
"step": 542
},
{
"epoch": 4.344,
"grad_norm": 1.0975877046585083,
"learning_rate": 5.718190883449373e-06,
"loss": 0.1913,
"step": 543
},
{
"epoch": 4.352,
"grad_norm": 0.7594937682151794,
"learning_rate": 5.703632482694453e-06,
"loss": 0.1369,
"step": 544
},
{
"epoch": 4.36,
"grad_norm": 0.7948636412620544,
"learning_rate": 5.689067993544726e-06,
"loss": 0.1818,
"step": 545
},
{
"epoch": 4.368,
"grad_norm": 0.896981954574585,
"learning_rate": 5.674497542023875e-06,
"loss": 0.1261,
"step": 546
},
{
"epoch": 4.376,
"grad_norm": 0.7178036570549011,
"learning_rate": 5.659921254207183e-06,
"loss": 0.12,
"step": 547
},
{
"epoch": 4.384,
"grad_norm": 0.673762857913971,
"learning_rate": 5.645339256220427e-06,
"loss": 0.1432,
"step": 548
},
{
"epoch": 4.392,
"grad_norm": 0.8115494847297668,
"learning_rate": 5.630751674238796e-06,
"loss": 0.115,
"step": 549
},
{
"epoch": 4.4,
"grad_norm": 0.7743411064147949,
"learning_rate": 5.616158634485793e-06,
"loss": 0.112,
"step": 550
},
{
"epoch": 4.408,
"grad_norm": 0.6997777223587036,
"learning_rate": 5.601560263232153e-06,
"loss": 0.1006,
"step": 551
},
{
"epoch": 4.416,
"grad_norm": 0.8114786148071289,
"learning_rate": 5.5869566867947344e-06,
"loss": 0.1308,
"step": 552
},
{
"epoch": 4.424,
"grad_norm": 0.7540757060050964,
"learning_rate": 5.572348031535442e-06,
"loss": 0.1558,
"step": 553
},
{
"epoch": 4.432,
"grad_norm": 0.8168392777442932,
"learning_rate": 5.557734423860122e-06,
"loss": 0.1635,
"step": 554
},
{
"epoch": 4.44,
"grad_norm": 0.998570442199707,
"learning_rate": 5.543115990217478e-06,
"loss": 0.1332,
"step": 555
},
{
"epoch": 4.448,
"grad_norm": 0.8248946070671082,
"learning_rate": 5.528492857097966e-06,
"loss": 0.1711,
"step": 556
},
{
"epoch": 4.456,
"grad_norm": 0.7837976813316345,
"learning_rate": 5.513865151032709e-06,
"loss": 0.1547,
"step": 557
},
{
"epoch": 4.464,
"grad_norm": 0.7378290295600891,
"learning_rate": 5.499232998592399e-06,
"loss": 0.0789,
"step": 558
},
{
"epoch": 4.4719999999999995,
"grad_norm": 0.6844831109046936,
"learning_rate": 5.484596526386198e-06,
"loss": 0.114,
"step": 559
},
{
"epoch": 4.48,
"grad_norm": 0.7904562950134277,
"learning_rate": 5.469955861060653e-06,
"loss": 0.1676,
"step": 560
},
{
"epoch": 4.4879999999999995,
"grad_norm": 0.7541800141334534,
"learning_rate": 5.455311129298586e-06,
"loss": 0.1196,
"step": 561
},
{
"epoch": 4.496,
"grad_norm": 0.7022758722305298,
"learning_rate": 5.44066245781801e-06,
"loss": 0.127,
"step": 562
},
{
"epoch": 4.504,
"grad_norm": 0.7572545409202576,
"learning_rate": 5.426009973371026e-06,
"loss": 0.1458,
"step": 563
},
{
"epoch": 4.5120000000000005,
"grad_norm": 0.7217088341712952,
"learning_rate": 5.4113538027427245e-06,
"loss": 0.1052,
"step": 564
},
{
"epoch": 4.52,
"grad_norm": 0.6230894923210144,
"learning_rate": 5.396694072750099e-06,
"loss": 0.103,
"step": 565
},
{
"epoch": 4.5280000000000005,
"grad_norm": 0.7158833146095276,
"learning_rate": 5.382030910240936e-06,
"loss": 0.0882,
"step": 566
},
{
"epoch": 4.536,
"grad_norm": 0.8185319900512695,
"learning_rate": 5.367364442092724e-06,
"loss": 0.125,
"step": 567
},
{
"epoch": 4.5440000000000005,
"grad_norm": 0.8623816967010498,
"learning_rate": 5.352694795211555e-06,
"loss": 0.1166,
"step": 568
},
{
"epoch": 4.552,
"grad_norm": 0.8295913338661194,
"learning_rate": 5.338022096531028e-06,
"loss": 0.1315,
"step": 569
},
{
"epoch": 4.5600000000000005,
"grad_norm": 1.077094316482544,
"learning_rate": 5.3233464730111426e-06,
"loss": 0.2167,
"step": 570
},
{
"epoch": 4.568,
"grad_norm": 0.7022574543952942,
"learning_rate": 5.308668051637213e-06,
"loss": 0.1212,
"step": 571
},
{
"epoch": 4.576,
"grad_norm": 1.0412487983703613,
"learning_rate": 5.29398695941876e-06,
"loss": 0.1534,
"step": 572
},
{
"epoch": 4.584,
"grad_norm": 1.1887091398239136,
"learning_rate": 5.279303323388413e-06,
"loss": 0.1918,
"step": 573
},
{
"epoch": 4.592,
"grad_norm": 0.7496406435966492,
"learning_rate": 5.2646172706008154e-06,
"loss": 0.1203,
"step": 574
},
{
"epoch": 4.6,
"grad_norm": 0.7063726186752319,
"learning_rate": 5.249928928131523e-06,
"loss": 0.121,
"step": 575
},
{
"epoch": 4.608,
"grad_norm": 0.8660956621170044,
"learning_rate": 5.235238423075899e-06,
"loss": 0.1118,
"step": 576
},
{
"epoch": 4.616,
"grad_norm": 0.8066213130950928,
"learning_rate": 5.220545882548024e-06,
"loss": 0.1414,
"step": 577
},
{
"epoch": 4.624,
"grad_norm": 0.7297463417053223,
"learning_rate": 5.20585143367959e-06,
"loss": 0.1391,
"step": 578
},
{
"epoch": 4.632,
"grad_norm": 0.7243335247039795,
"learning_rate": 5.191155203618796e-06,
"loss": 0.0997,
"step": 579
},
{
"epoch": 4.64,
"grad_norm": 0.8564410209655762,
"learning_rate": 5.176457319529264e-06,
"loss": 0.1378,
"step": 580
},
{
"epoch": 4.648,
"grad_norm": 0.8925532102584839,
"learning_rate": 5.161757908588917e-06,
"loss": 0.1611,
"step": 581
},
{
"epoch": 4.656,
"grad_norm": 0.7033802270889282,
"learning_rate": 5.147057097988898e-06,
"loss": 0.1161,
"step": 582
},
{
"epoch": 4.664,
"grad_norm": 0.7617799639701843,
"learning_rate": 5.132355014932455e-06,
"loss": 0.0811,
"step": 583
},
{
"epoch": 4.672,
"grad_norm": 0.7035624384880066,
"learning_rate": 5.1176517866338495e-06,
"loss": 0.1392,
"step": 584
},
{
"epoch": 4.68,
"grad_norm": 0.7635079026222229,
"learning_rate": 5.102947540317254e-06,
"loss": 0.116,
"step": 585
},
{
"epoch": 4.688,
"grad_norm": 0.6591924428939819,
"learning_rate": 5.088242403215644e-06,
"loss": 0.1264,
"step": 586
},
{
"epoch": 4.696,
"grad_norm": 0.7575790882110596,
"learning_rate": 5.073536502569708e-06,
"loss": 0.138,
"step": 587
},
{
"epoch": 4.704,
"grad_norm": 0.802493691444397,
"learning_rate": 5.058829965626742e-06,
"loss": 0.15,
"step": 588
},
{
"epoch": 4.712,
"grad_norm": 0.7997198104858398,
"learning_rate": 5.0441229196395416e-06,
"loss": 0.1249,
"step": 589
},
{
"epoch": 4.72,
"grad_norm": 0.8240690231323242,
"learning_rate": 5.029415491865311e-06,
"loss": 0.136,
"step": 590
},
{
"epoch": 4.728,
"grad_norm": 0.7805035710334778,
"learning_rate": 5.014707809564562e-06,
"loss": 0.135,
"step": 591
},
{
"epoch": 4.736,
"grad_norm": 0.7590795755386353,
"learning_rate": 5e-06,
"loss": 0.1646,
"step": 592
},
{
"epoch": 4.744,
"grad_norm": 0.738740086555481,
"learning_rate": 4.98529219043544e-06,
"loss": 0.1616,
"step": 593
},
{
"epoch": 4.752,
"grad_norm": 0.7487245798110962,
"learning_rate": 4.97058450813469e-06,
"loss": 0.1933,
"step": 594
},
{
"epoch": 4.76,
"grad_norm": 0.6358115673065186,
"learning_rate": 4.955877080360462e-06,
"loss": 0.1079,
"step": 595
},
{
"epoch": 4.768,
"grad_norm": 0.8972571492195129,
"learning_rate": 4.94117003437326e-06,
"loss": 0.2013,
"step": 596
},
{
"epoch": 4.776,
"grad_norm": 0.7692276835441589,
"learning_rate": 4.926463497430293e-06,
"loss": 0.1722,
"step": 597
},
{
"epoch": 4.784,
"grad_norm": 0.8051016926765442,
"learning_rate": 4.911757596784358e-06,
"loss": 0.1459,
"step": 598
},
{
"epoch": 4.792,
"grad_norm": 0.7161281108856201,
"learning_rate": 4.897052459682749e-06,
"loss": 0.1477,
"step": 599
},
{
"epoch": 4.8,
"grad_norm": 0.7656087279319763,
"learning_rate": 4.882348213366152e-06,
"loss": 0.1256,
"step": 600
},
{
"epoch": 4.808,
"grad_norm": 0.8391464948654175,
"learning_rate": 4.867644985067548e-06,
"loss": 0.1232,
"step": 601
},
{
"epoch": 4.816,
"grad_norm": 0.8359267115592957,
"learning_rate": 4.8529429020111035e-06,
"loss": 0.1453,
"step": 602
},
{
"epoch": 4.824,
"grad_norm": 1.7344919443130493,
"learning_rate": 4.838242091411085e-06,
"loss": 0.1262,
"step": 603
},
{
"epoch": 4.832,
"grad_norm": 0.8207628130912781,
"learning_rate": 4.823542680470738e-06,
"loss": 0.0926,
"step": 604
},
{
"epoch": 4.84,
"grad_norm": 0.7868751883506775,
"learning_rate": 4.808844796381205e-06,
"loss": 0.2016,
"step": 605
},
{
"epoch": 4.848,
"grad_norm": 0.700920820236206,
"learning_rate": 4.794148566320412e-06,
"loss": 0.1125,
"step": 606
},
{
"epoch": 4.856,
"grad_norm": 0.8076983094215393,
"learning_rate": 4.779454117451978e-06,
"loss": 0.1505,
"step": 607
},
{
"epoch": 4.864,
"grad_norm": 0.8895502686500549,
"learning_rate": 4.7647615769241e-06,
"loss": 0.1841,
"step": 608
},
{
"epoch": 4.872,
"grad_norm": 0.8726681470870972,
"learning_rate": 4.750071071868478e-06,
"loss": 0.1005,
"step": 609
},
{
"epoch": 4.88,
"grad_norm": 0.8028600215911865,
"learning_rate": 4.7353827293991845e-06,
"loss": 0.1587,
"step": 610
},
{
"epoch": 4.888,
"grad_norm": 0.8120298981666565,
"learning_rate": 4.720696676611589e-06,
"loss": 0.198,
"step": 611
},
{
"epoch": 4.896,
"grad_norm": 0.9092877507209778,
"learning_rate": 4.706013040581242e-06,
"loss": 0.1812,
"step": 612
},
{
"epoch": 4.904,
"grad_norm": 0.9110473394393921,
"learning_rate": 4.691331948362789e-06,
"loss": 0.1525,
"step": 613
},
{
"epoch": 4.912,
"grad_norm": 0.9524548053741455,
"learning_rate": 4.676653526988858e-06,
"loss": 0.1463,
"step": 614
},
{
"epoch": 4.92,
"grad_norm": 0.8919450044631958,
"learning_rate": 4.661977903468974e-06,
"loss": 0.1775,
"step": 615
},
{
"epoch": 4.928,
"grad_norm": 0.900175929069519,
"learning_rate": 4.647305204788445e-06,
"loss": 0.1803,
"step": 616
},
{
"epoch": 4.936,
"grad_norm": 1.5099406242370605,
"learning_rate": 4.632635557907277e-06,
"loss": 0.0989,
"step": 617
},
{
"epoch": 4.944,
"grad_norm": 0.7202231884002686,
"learning_rate": 4.617969089759066e-06,
"loss": 0.1328,
"step": 618
},
{
"epoch": 4.952,
"grad_norm": 0.7937277555465698,
"learning_rate": 4.603305927249902e-06,
"loss": 0.1041,
"step": 619
},
{
"epoch": 4.96,
"grad_norm": 0.8269131183624268,
"learning_rate": 4.588646197257278e-06,
"loss": 0.1296,
"step": 620
},
{
"epoch": 4.968,
"grad_norm": 0.7114303708076477,
"learning_rate": 4.573990026628976e-06,
"loss": 0.1493,
"step": 621
},
{
"epoch": 4.976,
"grad_norm": 0.7398365139961243,
"learning_rate": 4.559337542181993e-06,
"loss": 0.0922,
"step": 622
},
{
"epoch": 4.984,
"grad_norm": 0.8082178235054016,
"learning_rate": 4.544688870701416e-06,
"loss": 0.177,
"step": 623
},
{
"epoch": 4.992,
"grad_norm": 0.8239241242408752,
"learning_rate": 4.53004413893935e-06,
"loss": 0.1691,
"step": 624
},
{
"epoch": 5.0,
"grad_norm": 0.8424403071403503,
"learning_rate": 4.5154034736138035e-06,
"loss": 0.2033,
"step": 625
},
{
"epoch": 5.008,
"grad_norm": 0.8410422801971436,
"learning_rate": 4.500767001407604e-06,
"loss": 0.0632,
"step": 626
},
{
"epoch": 5.016,
"grad_norm": 0.5736352205276489,
"learning_rate": 4.486134848967292e-06,
"loss": 0.0541,
"step": 627
},
{
"epoch": 5.024,
"grad_norm": 0.6604498028755188,
"learning_rate": 4.471507142902036e-06,
"loss": 0.0966,
"step": 628
},
{
"epoch": 5.032,
"grad_norm": 0.6121773719787598,
"learning_rate": 4.4568840097825225e-06,
"loss": 0.0486,
"step": 629
},
{
"epoch": 5.04,
"grad_norm": 0.7156475782394409,
"learning_rate": 4.4422655761398785e-06,
"loss": 0.0371,
"step": 630
},
{
"epoch": 5.048,
"grad_norm": 0.9056649804115295,
"learning_rate": 4.427651968464559e-06,
"loss": 0.0338,
"step": 631
},
{
"epoch": 5.056,
"grad_norm": 0.9034331440925598,
"learning_rate": 4.413043313205266e-06,
"loss": 0.0593,
"step": 632
},
{
"epoch": 5.064,
"grad_norm": 0.8622063994407654,
"learning_rate": 4.3984397367678475e-06,
"loss": 0.0295,
"step": 633
},
{
"epoch": 5.072,
"grad_norm": 0.9271420836448669,
"learning_rate": 4.383841365514208e-06,
"loss": 0.0492,
"step": 634
},
{
"epoch": 5.08,
"grad_norm": 0.8091718554496765,
"learning_rate": 4.369248325761205e-06,
"loss": 0.0775,
"step": 635
},
{
"epoch": 5.088,
"grad_norm": 0.7170402407646179,
"learning_rate": 4.354660743779575e-06,
"loss": 0.0331,
"step": 636
},
{
"epoch": 5.096,
"grad_norm": 0.7927961945533752,
"learning_rate": 4.340078745792818e-06,
"loss": 0.0707,
"step": 637
},
{
"epoch": 5.104,
"grad_norm": 0.7635686993598938,
"learning_rate": 4.325502457976126e-06,
"loss": 0.0634,
"step": 638
},
{
"epoch": 5.112,
"grad_norm": 0.6322537064552307,
"learning_rate": 4.310932006455276e-06,
"loss": 0.0476,
"step": 639
},
{
"epoch": 5.12,
"grad_norm": 0.6096351742744446,
"learning_rate": 4.296367517305548e-06,
"loss": 0.0509,
"step": 640
},
{
"epoch": 5.128,
"grad_norm": 0.9142636060714722,
"learning_rate": 4.281809116550629e-06,
"loss": 0.0752,
"step": 641
},
{
"epoch": 5.136,
"grad_norm": 0.8606112003326416,
"learning_rate": 4.267256930161523e-06,
"loss": 0.0789,
"step": 642
},
{
"epoch": 5.144,
"grad_norm": 0.6482968330383301,
"learning_rate": 4.252711084055468e-06,
"loss": 0.0494,
"step": 643
},
{
"epoch": 5.152,
"grad_norm": 0.6635792255401611,
"learning_rate": 4.238171704094833e-06,
"loss": 0.0445,
"step": 644
},
{
"epoch": 5.16,
"grad_norm": 0.9713722467422485,
"learning_rate": 4.223638916086044e-06,
"loss": 0.0772,
"step": 645
},
{
"epoch": 5.168,
"grad_norm": 0.6606869101524353,
"learning_rate": 4.209112845778481e-06,
"loss": 0.0389,
"step": 646
},
{
"epoch": 5.176,
"grad_norm": 4.2783708572387695,
"learning_rate": 4.194593618863404e-06,
"loss": 0.0549,
"step": 647
},
{
"epoch": 5.184,
"grad_norm": 0.7662057280540466,
"learning_rate": 4.180081360972852e-06,
"loss": 0.0503,
"step": 648
},
{
"epoch": 5.192,
"grad_norm": 0.6912007331848145,
"learning_rate": 4.165576197678571e-06,
"loss": 0.0545,
"step": 649
},
{
"epoch": 5.2,
"grad_norm": 0.7346239686012268,
"learning_rate": 4.151078254490908e-06,
"loss": 0.0492,
"step": 650
},
{
"epoch": 5.208,
"grad_norm": 0.7986870408058167,
"learning_rate": 4.136587656857744e-06,
"loss": 0.0506,
"step": 651
},
{
"epoch": 5.216,
"grad_norm": 0.6396828293800354,
"learning_rate": 4.122104530163397e-06,
"loss": 0.0387,
"step": 652
},
{
"epoch": 5.224,
"grad_norm": 0.6466065645217896,
"learning_rate": 4.107628999727542e-06,
"loss": 0.0554,
"step": 653
},
{
"epoch": 5.232,
"grad_norm": 0.5441897511482239,
"learning_rate": 4.09316119080412e-06,
"loss": 0.0472,
"step": 654
},
{
"epoch": 5.24,
"grad_norm": 0.8399096131324768,
"learning_rate": 4.0787012285802695e-06,
"loss": 0.1051,
"step": 655
},
{
"epoch": 5.248,
"grad_norm": 0.687709391117096,
"learning_rate": 4.064249238175223e-06,
"loss": 0.0541,
"step": 656
},
{
"epoch": 5.256,
"grad_norm": 0.5483343601226807,
"learning_rate": 4.04980534463924e-06,
"loss": 0.0441,
"step": 657
},
{
"epoch": 5.264,
"grad_norm": 0.5988929271697998,
"learning_rate": 4.035369672952516e-06,
"loss": 0.047,
"step": 658
},
{
"epoch": 5.272,
"grad_norm": 0.6775379180908203,
"learning_rate": 4.020942348024108e-06,
"loss": 0.0575,
"step": 659
},
{
"epoch": 5.28,
"grad_norm": 0.5605219602584839,
"learning_rate": 4.0065234946908456e-06,
"loss": 0.0368,
"step": 660
},
{
"epoch": 5.288,
"grad_norm": 0.4988764822483063,
"learning_rate": 3.992113237716261e-06,
"loss": 0.0428,
"step": 661
},
{
"epoch": 5.296,
"grad_norm": 0.6318917870521545,
"learning_rate": 3.977711701789499e-06,
"loss": 0.0415,
"step": 662
},
{
"epoch": 5.304,
"grad_norm": 0.6449440717697144,
"learning_rate": 3.963319011524246e-06,
"loss": 0.0667,
"step": 663
},
{
"epoch": 5.312,
"grad_norm": 0.5455386638641357,
"learning_rate": 3.948935291457645e-06,
"loss": 0.0468,
"step": 664
},
{
"epoch": 5.32,
"grad_norm": 0.8036112785339355,
"learning_rate": 3.934560666049226e-06,
"loss": 0.0542,
"step": 665
},
{
"epoch": 5.328,
"grad_norm": 0.7656898498535156,
"learning_rate": 3.920195259679822e-06,
"loss": 0.0511,
"step": 666
},
{
"epoch": 5.336,
"grad_norm": 0.6783893704414368,
"learning_rate": 3.905839196650494e-06,
"loss": 0.0605,
"step": 667
},
{
"epoch": 5.344,
"grad_norm": 0.7056324481964111,
"learning_rate": 3.891492601181462e-06,
"loss": 0.0494,
"step": 668
},
{
"epoch": 5.352,
"grad_norm": 0.5731898546218872,
"learning_rate": 3.877155597411019e-06,
"loss": 0.0479,
"step": 669
},
{
"epoch": 5.36,
"grad_norm": 0.6330499053001404,
"learning_rate": 3.862828309394469e-06,
"loss": 0.0442,
"step": 670
},
{
"epoch": 5.368,
"grad_norm": 0.557161808013916,
"learning_rate": 3.8485108611030415e-06,
"loss": 0.0362,
"step": 671
},
{
"epoch": 5.376,
"grad_norm": 0.6822320222854614,
"learning_rate": 3.834203376422831e-06,
"loss": 0.0541,
"step": 672
},
{
"epoch": 5.384,
"grad_norm": 0.7877750396728516,
"learning_rate": 3.8199059791537105e-06,
"loss": 0.0754,
"step": 673
},
{
"epoch": 5.392,
"grad_norm": 0.7111517786979675,
"learning_rate": 3.805618793008279e-06,
"loss": 0.0603,
"step": 674
},
{
"epoch": 5.4,
"grad_norm": 0.5437308549880981,
"learning_rate": 3.7913419416107692e-06,
"loss": 0.0388,
"step": 675
},
{
"epoch": 5.408,
"grad_norm": 0.7042953968048096,
"learning_rate": 3.777075548496001e-06,
"loss": 0.0784,
"step": 676
},
{
"epoch": 5.416,
"grad_norm": 0.5526348352432251,
"learning_rate": 3.7628197371082916e-06,
"loss": 0.0321,
"step": 677
},
{
"epoch": 5.424,
"grad_norm": 0.8224428296089172,
"learning_rate": 3.7485746308004013e-06,
"loss": 0.0652,
"step": 678
},
{
"epoch": 5.432,
"grad_norm": 0.6871469020843506,
"learning_rate": 3.7343403528324574e-06,
"loss": 0.0433,
"step": 679
},
{
"epoch": 5.44,
"grad_norm": 0.7415713667869568,
"learning_rate": 3.7201170263709004e-06,
"loss": 0.0668,
"step": 680
},
{
"epoch": 5.448,
"grad_norm": 0.6520174145698547,
"learning_rate": 3.705904774487396e-06,
"loss": 0.034,
"step": 681
},
{
"epoch": 5.456,
"grad_norm": 0.928292989730835,
"learning_rate": 3.6917037201577977e-06,
"loss": 0.0594,
"step": 682
},
{
"epoch": 5.464,
"grad_norm": 0.6683514714241028,
"learning_rate": 3.6775139862610577e-06,
"loss": 0.0606,
"step": 683
},
{
"epoch": 5.4719999999999995,
"grad_norm": 0.6796740293502808,
"learning_rate": 3.6633356955781827e-06,
"loss": 0.0424,
"step": 684
},
{
"epoch": 5.48,
"grad_norm": 0.6822247505187988,
"learning_rate": 3.649168970791157e-06,
"loss": 0.0569,
"step": 685
},
{
"epoch": 5.4879999999999995,
"grad_norm": 0.6291232705116272,
"learning_rate": 3.635013934481895e-06,
"loss": 0.0478,
"step": 686
},
{
"epoch": 5.496,
"grad_norm": 0.890269935131073,
"learning_rate": 3.620870709131163e-06,
"loss": 0.0602,
"step": 687
},
{
"epoch": 5.504,
"grad_norm": 0.6990971565246582,
"learning_rate": 3.6067394171175397e-06,
"loss": 0.0314,
"step": 688
},
{
"epoch": 5.5120000000000005,
"grad_norm": 0.6838575601577759,
"learning_rate": 3.5926201807163384e-06,
"loss": 0.0319,
"step": 689
},
{
"epoch": 5.52,
"grad_norm": 0.6738036870956421,
"learning_rate": 3.578513122098566e-06,
"loss": 0.0846,
"step": 690
},
{
"epoch": 5.5280000000000005,
"grad_norm": 0.8007773756980896,
"learning_rate": 3.564418363329848e-06,
"loss": 0.0481,
"step": 691
},
{
"epoch": 5.536,
"grad_norm": 0.6452323794364929,
"learning_rate": 3.5503360263693887e-06,
"loss": 0.035,
"step": 692
},
{
"epoch": 5.5440000000000005,
"grad_norm": 0.641797661781311,
"learning_rate": 3.5362662330689067e-06,
"loss": 0.0543,
"step": 693
},
{
"epoch": 5.552,
"grad_norm": 0.6653102040290833,
"learning_rate": 3.5222091051715803e-06,
"loss": 0.0598,
"step": 694
},
{
"epoch": 5.5600000000000005,
"grad_norm": 0.8420442938804626,
"learning_rate": 3.5081647643110028e-06,
"loss": 0.0814,
"step": 695
},
{
"epoch": 5.568,
"grad_norm": 0.6558657884597778,
"learning_rate": 3.4941333320101173e-06,
"loss": 0.0621,
"step": 696
},
{
"epoch": 5.576,
"grad_norm": 0.47997379302978516,
"learning_rate": 3.480114929680176e-06,
"loss": 0.0301,
"step": 697
},
{
"epoch": 5.584,
"grad_norm": 0.6215079426765442,
"learning_rate": 3.466109678619681e-06,
"loss": 0.0385,
"step": 698
},
{
"epoch": 5.592,
"grad_norm": 0.7063211798667908,
"learning_rate": 3.4521177000133456e-06,
"loss": 0.0566,
"step": 699
},
{
"epoch": 5.6,
"grad_norm": 0.5929539799690247,
"learning_rate": 3.4381391149310294e-06,
"loss": 0.0397,
"step": 700
},
{
"epoch": 5.608,
"grad_norm": 0.7285615801811218,
"learning_rate": 3.4241740443267112e-06,
"loss": 0.0472,
"step": 701
},
{
"epoch": 5.616,
"grad_norm": 0.5883134603500366,
"learning_rate": 3.4102226090374246e-06,
"loss": 0.0443,
"step": 702
},
{
"epoch": 5.624,
"grad_norm": 0.6047627925872803,
"learning_rate": 3.3962849297822225e-06,
"loss": 0.05,
"step": 703
},
{
"epoch": 5.632,
"grad_norm": 0.7876625657081604,
"learning_rate": 3.3823611271611266e-06,
"loss": 0.0487,
"step": 704
},
{
"epoch": 5.64,
"grad_norm": 0.9311395287513733,
"learning_rate": 3.368451321654091e-06,
"loss": 0.0713,
"step": 705
},
{
"epoch": 5.648,
"grad_norm": 0.7349729537963867,
"learning_rate": 3.35455563361995e-06,
"loss": 0.0406,
"step": 706
},
{
"epoch": 5.656,
"grad_norm": 0.6107696890830994,
"learning_rate": 3.3406741832953893e-06,
"loss": 0.0489,
"step": 707
},
{
"epoch": 5.664,
"grad_norm": 0.7372275590896606,
"learning_rate": 3.3268070907938915e-06,
"loss": 0.0614,
"step": 708
},
{
"epoch": 5.672,
"grad_norm": 0.8170859217643738,
"learning_rate": 3.3129544761047093e-06,
"loss": 0.0545,
"step": 709
},
{
"epoch": 5.68,
"grad_norm": 1.0097273588180542,
"learning_rate": 3.2991164590918162e-06,
"loss": 0.1003,
"step": 710
},
{
"epoch": 5.688,
"grad_norm": 0.5728979110717773,
"learning_rate": 3.2852931594928804e-06,
"loss": 0.0458,
"step": 711
},
{
"epoch": 5.696,
"grad_norm": 0.6160666942596436,
"learning_rate": 3.271484696918218e-06,
"loss": 0.0472,
"step": 712
},
{
"epoch": 5.704,
"grad_norm": 0.5239155292510986,
"learning_rate": 3.2576911908497695e-06,
"loss": 0.0479,
"step": 713
},
{
"epoch": 5.712,
"grad_norm": 0.6103151440620422,
"learning_rate": 3.2439127606400546e-06,
"loss": 0.0638,
"step": 714
},
{
"epoch": 5.72,
"grad_norm": 0.5333082675933838,
"learning_rate": 3.2301495255111426e-06,
"loss": 0.054,
"step": 715
},
{
"epoch": 5.728,
"grad_norm": 0.6362142562866211,
"learning_rate": 3.2164016045536306e-06,
"loss": 0.0607,
"step": 716
},
{
"epoch": 5.736,
"grad_norm": 0.5961912274360657,
"learning_rate": 3.202669116725598e-06,
"loss": 0.0455,
"step": 717
},
{
"epoch": 5.744,
"grad_norm": 0.7223774194717407,
"learning_rate": 3.1889521808515888e-06,
"loss": 0.0661,
"step": 718
},
{
"epoch": 5.752,
"grad_norm": 0.574993908405304,
"learning_rate": 3.1752509156215738e-06,
"loss": 0.0393,
"step": 719
},
{
"epoch": 5.76,
"grad_norm": 0.5511513948440552,
"learning_rate": 3.1615654395899377e-06,
"loss": 0.0432,
"step": 720
},
{
"epoch": 5.768,
"grad_norm": 0.679321825504303,
"learning_rate": 3.1478958711744324e-06,
"loss": 0.0447,
"step": 721
},
{
"epoch": 5.776,
"grad_norm": 0.5888364911079407,
"learning_rate": 3.1342423286551756e-06,
"loss": 0.056,
"step": 722
},
{
"epoch": 5.784,
"grad_norm": 0.678205668926239,
"learning_rate": 3.120604930173608e-06,
"loss": 0.0383,
"step": 723
},
{
"epoch": 5.792,
"grad_norm": 0.7728886604309082,
"learning_rate": 3.1069837937314846e-06,
"loss": 0.0633,
"step": 724
},
{
"epoch": 5.8,
"grad_norm": 0.7190651297569275,
"learning_rate": 3.093379037189842e-06,
"loss": 0.0376,
"step": 725
},
{
"epoch": 5.808,
"grad_norm": 0.622719943523407,
"learning_rate": 3.0797907782679944e-06,
"loss": 0.0261,
"step": 726
},
{
"epoch": 5.816,
"grad_norm": 0.6227384209632874,
"learning_rate": 3.0662191345424925e-06,
"loss": 0.0658,
"step": 727
},
{
"epoch": 5.824,
"grad_norm": 0.8591876029968262,
"learning_rate": 3.0526642234461313e-06,
"loss": 0.0698,
"step": 728
},
{
"epoch": 5.832,
"grad_norm": 0.715391218662262,
"learning_rate": 3.039126162266912e-06,
"loss": 0.0562,
"step": 729
},
{
"epoch": 5.84,
"grad_norm": 0.6005394458770752,
"learning_rate": 3.0256050681470446e-06,
"loss": 0.0463,
"step": 730
},
{
"epoch": 5.848,
"grad_norm": 0.5069931149482727,
"learning_rate": 3.012101058081919e-06,
"loss": 0.0321,
"step": 731
},
{
"epoch": 5.856,
"grad_norm": 0.8508353233337402,
"learning_rate": 2.9986142489191074e-06,
"loss": 0.0985,
"step": 732
},
{
"epoch": 5.864,
"grad_norm": 0.7500941753387451,
"learning_rate": 2.9851447573573383e-06,
"loss": 0.0823,
"step": 733
},
{
"epoch": 5.872,
"grad_norm": 0.6298888325691223,
"learning_rate": 2.971692699945502e-06,
"loss": 0.0557,
"step": 734
},
{
"epoch": 5.88,
"grad_norm": 0.6049162745475769,
"learning_rate": 2.958258193081629e-06,
"loss": 0.0429,
"step": 735
},
{
"epoch": 5.888,
"grad_norm": 0.6358490586280823,
"learning_rate": 2.9448413530118912e-06,
"loss": 0.042,
"step": 736
},
{
"epoch": 5.896,
"grad_norm": 0.7583111524581909,
"learning_rate": 2.9314422958295906e-06,
"loss": 0.0559,
"step": 737
},
{
"epoch": 5.904,
"grad_norm": 0.5954725742340088,
"learning_rate": 2.9180611374741623e-06,
"loss": 0.0268,
"step": 738
},
{
"epoch": 5.912,
"grad_norm": 0.6205891966819763,
"learning_rate": 2.904697993730159e-06,
"loss": 0.0505,
"step": 739
},
{
"epoch": 5.92,
"grad_norm": 0.6735966205596924,
"learning_rate": 2.891352980226262e-06,
"loss": 0.0456,
"step": 740
},
{
"epoch": 5.928,
"grad_norm": 0.5929473042488098,
"learning_rate": 2.8780262124342755e-06,
"loss": 0.033,
"step": 741
},
{
"epoch": 5.936,
"grad_norm": 0.6799638271331787,
"learning_rate": 2.8647178056681197e-06,
"loss": 0.0542,
"step": 742
},
{
"epoch": 5.944,
"grad_norm": 0.6965285539627075,
"learning_rate": 2.8514278750828537e-06,
"loss": 0.0668,
"step": 743
},
{
"epoch": 5.952,
"grad_norm": 0.6502833962440491,
"learning_rate": 2.838156535673652e-06,
"loss": 0.0433,
"step": 744
},
{
"epoch": 5.96,
"grad_norm": 0.6269218325614929,
"learning_rate": 2.8249039022748315e-06,
"loss": 0.0746,
"step": 745
},
{
"epoch": 5.968,
"grad_norm": 0.8056422472000122,
"learning_rate": 2.8116700895588473e-06,
"loss": 0.0834,
"step": 746
},
{
"epoch": 5.976,
"grad_norm": 0.49353089928627014,
"learning_rate": 2.798455212035305e-06,
"loss": 0.0335,
"step": 747
},
{
"epoch": 5.984,
"grad_norm": 0.6638184785842896,
"learning_rate": 2.785259384049959e-06,
"loss": 0.0559,
"step": 748
},
{
"epoch": 5.992,
"grad_norm": 0.6694799661636353,
"learning_rate": 2.7720827197837475e-06,
"loss": 0.0519,
"step": 749
},
{
"epoch": 6.0,
"grad_norm": 0.6553838849067688,
"learning_rate": 2.7589253332517736e-06,
"loss": 0.0636,
"step": 750
},
{
"epoch": 6.008,
"grad_norm": 0.4020375609397888,
"learning_rate": 2.745787338302341e-06,
"loss": 0.0153,
"step": 751
},
{
"epoch": 6.016,
"grad_norm": 0.40127378702163696,
"learning_rate": 2.7326688486159613e-06,
"loss": 0.0217,
"step": 752
},
{
"epoch": 6.024,
"grad_norm": 0.46770235896110535,
"learning_rate": 2.7195699777043723e-06,
"loss": 0.019,
"step": 753
},
{
"epoch": 6.032,
"grad_norm": 0.3705905079841614,
"learning_rate": 2.706490838909547e-06,
"loss": 0.0136,
"step": 754
},
{
"epoch": 6.04,
"grad_norm": 0.4092421531677246,
"learning_rate": 2.6934315454027323e-06,
"loss": 0.0155,
"step": 755
},
{
"epoch": 6.048,
"grad_norm": 0.5061901211738586,
"learning_rate": 2.680392210183446e-06,
"loss": 0.0188,
"step": 756
},
{
"epoch": 6.056,
"grad_norm": 0.3923867642879486,
"learning_rate": 2.6673729460785174e-06,
"loss": 0.0162,
"step": 757
},
{
"epoch": 6.064,
"grad_norm": 0.35709553956985474,
"learning_rate": 2.6543738657411033e-06,
"loss": 0.015,
"step": 758
},
{
"epoch": 6.072,
"grad_norm": 1.4626656770706177,
"learning_rate": 2.6413950816497146e-06,
"loss": 0.0318,
"step": 759
},
{
"epoch": 6.08,
"grad_norm": 0.45306396484375,
"learning_rate": 2.628436706107238e-06,
"loss": 0.0153,
"step": 760
},
{
"epoch": 6.088,
"grad_norm": 0.4281376600265503,
"learning_rate": 2.6154988512399784e-06,
"loss": 0.0161,
"step": 761
},
{
"epoch": 6.096,
"grad_norm": 0.5948172211647034,
"learning_rate": 2.6025816289966703e-06,
"loss": 0.0222,
"step": 762
},
{
"epoch": 6.104,
"grad_norm": 0.41173240542411804,
"learning_rate": 2.5896851511475184e-06,
"loss": 0.0196,
"step": 763
},
{
"epoch": 6.112,
"grad_norm": 0.3081780672073364,
"learning_rate": 2.5768095292832412e-06,
"loss": 0.0114,
"step": 764
},
{
"epoch": 6.12,
"grad_norm": 0.3800240457057953,
"learning_rate": 2.5639548748140803e-06,
"loss": 0.0205,
"step": 765
},
{
"epoch": 6.128,
"grad_norm": 0.31587958335876465,
"learning_rate": 2.5511212989688587e-06,
"loss": 0.0064,
"step": 766
},
{
"epoch": 6.136,
"grad_norm": 0.3492489159107208,
"learning_rate": 2.5383089127940087e-06,
"loss": 0.0141,
"step": 767
},
{
"epoch": 6.144,
"grad_norm": 0.4143945574760437,
"learning_rate": 2.525517827152614e-06,
"loss": 0.0158,
"step": 768
},
{
"epoch": 6.152,
"grad_norm": 0.5510955452919006,
"learning_rate": 2.5127481527234397e-06,
"loss": 0.0236,
"step": 769
},
{
"epoch": 6.16,
"grad_norm": 0.45824673771858215,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.0171,
"step": 770
},
{
"epoch": 6.168,
"grad_norm": 0.40430498123168945,
"learning_rate": 2.487273479289574e-06,
"loss": 0.0211,
"step": 771
},
{
"epoch": 6.176,
"grad_norm": 0.36916109919548035,
"learning_rate": 2.4745687007122636e-06,
"loss": 0.0155,
"step": 772
},
{
"epoch": 6.184,
"grad_norm": 0.311632364988327,
"learning_rate": 2.4618857742000463e-06,
"loss": 0.0106,
"step": 773
},
{
"epoch": 6.192,
"grad_norm": 0.5978167057037354,
"learning_rate": 2.449224809495815e-06,
"loss": 0.027,
"step": 774
},
{
"epoch": 6.2,
"grad_norm": 0.4340705871582031,
"learning_rate": 2.436585916152426e-06,
"loss": 0.0209,
"step": 775
},
{
"epoch": 6.208,
"grad_norm": 0.5053939819335938,
"learning_rate": 2.423969203531768e-06,
"loss": 0.0165,
"step": 776
},
{
"epoch": 6.216,
"grad_norm": 0.43892624974250793,
"learning_rate": 2.411374780803793e-06,
"loss": 0.0212,
"step": 777
},
{
"epoch": 6.224,
"grad_norm": 0.49284055829048157,
"learning_rate": 2.3988027569455895e-06,
"loss": 0.0158,
"step": 778
},
{
"epoch": 6.232,
"grad_norm": 0.36235150694847107,
"learning_rate": 2.3862532407404306e-06,
"loss": 0.0115,
"step": 779
},
{
"epoch": 6.24,
"grad_norm": 0.6579159498214722,
"learning_rate": 2.373726340776837e-06,
"loss": 0.0233,
"step": 780
},
{
"epoch": 6.248,
"grad_norm": 0.5833991765975952,
"learning_rate": 2.361222165447628e-06,
"loss": 0.0253,
"step": 781
},
{
"epoch": 6.256,
"grad_norm": 0.44309303164482117,
"learning_rate": 2.348740822949006e-06,
"loss": 0.0219,
"step": 782
},
{
"epoch": 6.264,
"grad_norm": 0.41962531208992004,
"learning_rate": 2.33628242127959e-06,
"loss": 0.0188,
"step": 783
},
{
"epoch": 6.272,
"grad_norm": 0.4632411003112793,
"learning_rate": 2.323847068239504e-06,
"loss": 0.0189,
"step": 784
},
{
"epoch": 6.28,
"grad_norm": 0.6932832598686218,
"learning_rate": 2.3114348714294355e-06,
"loss": 0.0269,
"step": 785
},
{
"epoch": 6.288,
"grad_norm": 0.4063160717487335,
"learning_rate": 2.2990459382497086e-06,
"loss": 0.0132,
"step": 786
},
{
"epoch": 6.296,
"grad_norm": 0.3129899203777313,
"learning_rate": 2.2866803758993446e-06,
"loss": 0.0157,
"step": 787
},
{
"epoch": 6.304,
"grad_norm": 0.3941129148006439,
"learning_rate": 2.274338291375147e-06,
"loss": 0.0158,
"step": 788
},
{
"epoch": 6.312,
"grad_norm": 0.5480639338493347,
"learning_rate": 2.262019791470772e-06,
"loss": 0.0173,
"step": 789
},
{
"epoch": 6.32,
"grad_norm": 0.5814464688301086,
"learning_rate": 2.2497249827757933e-06,
"loss": 0.0247,
"step": 790
},
{
"epoch": 6.328,
"grad_norm": 0.48736658692359924,
"learning_rate": 2.2374539716748034e-06,
"loss": 0.0217,
"step": 791
},
{
"epoch": 6.336,
"grad_norm": 0.5674287676811218,
"learning_rate": 2.225206864346465e-06,
"loss": 0.0196,
"step": 792
},
{
"epoch": 6.344,
"grad_norm": 0.34811681509017944,
"learning_rate": 2.2129837667626147e-06,
"loss": 0.0096,
"step": 793
},
{
"epoch": 6.352,
"grad_norm": 0.3757005035877228,
"learning_rate": 2.2007847846873342e-06,
"loss": 0.0141,
"step": 794
},
{
"epoch": 6.36,
"grad_norm": 0.31448501348495483,
"learning_rate": 2.188610023676041e-06,
"loss": 0.0099,
"step": 795
},
{
"epoch": 6.368,
"grad_norm": 0.4611433446407318,
"learning_rate": 2.176459589074566e-06,
"loss": 0.0114,
"step": 796
},
{
"epoch": 6.376,
"grad_norm": 0.32864484190940857,
"learning_rate": 2.164333586018259e-06,
"loss": 0.0096,
"step": 797
},
{
"epoch": 6.384,
"grad_norm": 0.5892991423606873,
"learning_rate": 2.1522321194310577e-06,
"loss": 0.0191,
"step": 798
},
{
"epoch": 6.392,
"grad_norm": 0.40614140033721924,
"learning_rate": 2.1401552940245962e-06,
"loss": 0.0067,
"step": 799
},
{
"epoch": 6.4,
"grad_norm": 0.4774915874004364,
"learning_rate": 2.1281032142972933e-06,
"loss": 0.0205,
"step": 800
},
{
"epoch": 6.408,
"grad_norm": 0.3720727562904358,
"learning_rate": 2.1160759845334483e-06,
"loss": 0.0142,
"step": 801
},
{
"epoch": 6.416,
"grad_norm": 0.4921925663948059,
"learning_rate": 2.1040737088023323e-06,
"loss": 0.0243,
"step": 802
},
{
"epoch": 6.424,
"grad_norm": 0.5588891506195068,
"learning_rate": 2.0920964909573065e-06,
"loss": 0.0244,
"step": 803
},
{
"epoch": 6.432,
"grad_norm": 19.280241012573242,
"learning_rate": 2.080144434634898e-06,
"loss": 0.0224,
"step": 804
},
{
"epoch": 6.44,
"grad_norm": 0.3350463807582855,
"learning_rate": 2.068217643253925e-06,
"loss": 0.0116,
"step": 805
},
{
"epoch": 6.448,
"grad_norm": 0.380971759557724,
"learning_rate": 2.056316220014588e-06,
"loss": 0.0131,
"step": 806
},
{
"epoch": 6.456,
"grad_norm": 0.3962733745574951,
"learning_rate": 2.0444402678975876e-06,
"loss": 0.0148,
"step": 807
},
{
"epoch": 6.464,
"grad_norm": 0.5106214880943298,
"learning_rate": 2.0325898896632178e-06,
"loss": 0.0238,
"step": 808
},
{
"epoch": 6.4719999999999995,
"grad_norm": 0.5265573859214783,
"learning_rate": 2.0207651878505e-06,
"loss": 0.0152,
"step": 809
},
{
"epoch": 6.48,
"grad_norm": 0.46207869052886963,
"learning_rate": 2.0089662647762716e-06,
"loss": 0.0255,
"step": 810
},
{
"epoch": 6.4879999999999995,
"grad_norm": 0.3555239140987396,
"learning_rate": 1.997193222534316e-06,
"loss": 0.0166,
"step": 811
},
{
"epoch": 6.496,
"grad_norm": 0.4378393888473511,
"learning_rate": 1.9854461629944764e-06,
"loss": 0.0101,
"step": 812
},
{
"epoch": 6.504,
"grad_norm": 0.5582405924797058,
"learning_rate": 1.9737251878017678e-06,
"loss": 0.0314,
"step": 813
},
{
"epoch": 6.5120000000000005,
"grad_norm": 0.3416616916656494,
"learning_rate": 1.962030398375506e-06,
"loss": 0.0112,
"step": 814
},
{
"epoch": 6.52,
"grad_norm": 0.4626212418079376,
"learning_rate": 1.950361895908427e-06,
"loss": 0.0131,
"step": 815
},
{
"epoch": 6.5280000000000005,
"grad_norm": 0.4612177014350891,
"learning_rate": 1.9387197813658092e-06,
"loss": 0.0154,
"step": 816
},
{
"epoch": 6.536,
"grad_norm": 0.4659787118434906,
"learning_rate": 1.927104155484602e-06,
"loss": 0.0136,
"step": 817
},
{
"epoch": 6.5440000000000005,
"grad_norm": 0.530228316783905,
"learning_rate": 1.915515118772555e-06,
"loss": 0.0227,
"step": 818
},
{
"epoch": 6.552,
"grad_norm": 0.7053042650222778,
"learning_rate": 1.9039527715073424e-06,
"loss": 0.041,
"step": 819
},
{
"epoch": 6.5600000000000005,
"grad_norm": 0.43051308393478394,
"learning_rate": 1.8924172137357038e-06,
"loss": 0.0182,
"step": 820
},
{
"epoch": 6.568,
"grad_norm": 0.6602566838264465,
"learning_rate": 1.8809085452725744e-06,
"loss": 0.0323,
"step": 821
},
{
"epoch": 6.576,
"grad_norm": 0.42587170004844666,
"learning_rate": 1.8694268657002197e-06,
"loss": 0.0148,
"step": 822
},
{
"epoch": 6.584,
"grad_norm": 0.4337313175201416,
"learning_rate": 1.8579722743673773e-06,
"loss": 0.0125,
"step": 823
},
{
"epoch": 6.592,
"grad_norm": 0.5374398827552795,
"learning_rate": 1.8465448703883959e-06,
"loss": 0.027,
"step": 824
},
{
"epoch": 6.6,
"grad_norm": 0.40238505601882935,
"learning_rate": 1.8351447526423728e-06,
"loss": 0.0216,
"step": 825
},
{
"epoch": 6.608,
"grad_norm": 0.4797324538230896,
"learning_rate": 1.8237720197723075e-06,
"loss": 0.0183,
"step": 826
},
{
"epoch": 6.616,
"grad_norm": 0.43299341201782227,
"learning_rate": 1.812426770184243e-06,
"loss": 0.019,
"step": 827
},
{
"epoch": 6.624,
"grad_norm": 0.38361313939094543,
"learning_rate": 1.8011091020464138e-06,
"loss": 0.0118,
"step": 828
},
{
"epoch": 6.632,
"grad_norm": 0.4371749758720398,
"learning_rate": 1.789819113288397e-06,
"loss": 0.0185,
"step": 829
},
{
"epoch": 6.64,
"grad_norm": 0.3766806423664093,
"learning_rate": 1.7785569016002686e-06,
"loss": 0.0171,
"step": 830
},
{
"epoch": 6.648,
"grad_norm": 0.3783447742462158,
"learning_rate": 1.7673225644317487e-06,
"loss": 0.0188,
"step": 831
},
{
"epoch": 6.656,
"grad_norm": 0.3563772141933441,
"learning_rate": 1.75611619899137e-06,
"loss": 0.013,
"step": 832
},
{
"epoch": 6.664,
"grad_norm": 0.4619181752204895,
"learning_rate": 1.7449379022456297e-06,
"loss": 0.0172,
"step": 833
},
{
"epoch": 6.672,
"grad_norm": 0.4362376630306244,
"learning_rate": 1.7337877709181527e-06,
"loss": 0.0148,
"step": 834
},
{
"epoch": 6.68,
"grad_norm": 0.41033628582954407,
"learning_rate": 1.7226659014888548e-06,
"loss": 0.0168,
"step": 835
},
{
"epoch": 6.688,
"grad_norm": 0.44018396735191345,
"learning_rate": 1.711572390193102e-06,
"loss": 0.0125,
"step": 836
},
{
"epoch": 6.696,
"grad_norm": 0.38611117005348206,
"learning_rate": 1.7005073330208881e-06,
"loss": 0.0129,
"step": 837
},
{
"epoch": 6.704,
"grad_norm": 0.49103668332099915,
"learning_rate": 1.689470825715998e-06,
"loss": 0.0254,
"step": 838
},
{
"epoch": 6.712,
"grad_norm": 0.589324951171875,
"learning_rate": 1.6784629637751814e-06,
"loss": 0.0217,
"step": 839
},
{
"epoch": 6.72,
"grad_norm": 0.3026237189769745,
"learning_rate": 1.6674838424473172e-06,
"loss": 0.0127,
"step": 840
},
{
"epoch": 6.728,
"grad_norm": 0.6675847172737122,
"learning_rate": 1.6565335567326112e-06,
"loss": 0.0261,
"step": 841
},
{
"epoch": 6.736,
"grad_norm": 0.47200918197631836,
"learning_rate": 1.6456122013817477e-06,
"loss": 0.0194,
"step": 842
},
{
"epoch": 6.744,
"grad_norm": 0.5038536190986633,
"learning_rate": 1.6347198708950884e-06,
"loss": 0.0136,
"step": 843
},
{
"epoch": 6.752,
"grad_norm": 0.368230402469635,
"learning_rate": 1.6238566595218475e-06,
"loss": 0.0134,
"step": 844
},
{
"epoch": 6.76,
"grad_norm": 0.47902312874794006,
"learning_rate": 1.6130226612592787e-06,
"loss": 0.0135,
"step": 845
},
{
"epoch": 6.768,
"grad_norm": 0.4592016041278839,
"learning_rate": 1.6022179698518525e-06,
"loss": 0.0176,
"step": 846
},
{
"epoch": 6.776,
"grad_norm": 0.4968428611755371,
"learning_rate": 1.591442678790467e-06,
"loss": 0.0216,
"step": 847
},
{
"epoch": 6.784,
"grad_norm": 0.4512214660644531,
"learning_rate": 1.580696881311611e-06,
"loss": 0.0226,
"step": 848
},
{
"epoch": 6.792,
"grad_norm": 0.48782020807266235,
"learning_rate": 1.5699806703965787e-06,
"loss": 0.0218,
"step": 849
},
{
"epoch": 6.8,
"grad_norm": 0.4219365119934082,
"learning_rate": 1.5592941387706562e-06,
"loss": 0.0109,
"step": 850
},
{
"epoch": 6.808,
"grad_norm": 0.4016176462173462,
"learning_rate": 1.5486373789023206e-06,
"loss": 0.0066,
"step": 851
},
{
"epoch": 6.816,
"grad_norm": 0.3322366774082184,
"learning_rate": 1.538010483002435e-06,
"loss": 0.0147,
"step": 852
},
{
"epoch": 6.824,
"grad_norm": 0.3936358094215393,
"learning_rate": 1.5274135430234654e-06,
"loss": 0.0153,
"step": 853
},
{
"epoch": 6.832,
"grad_norm": 0.484785795211792,
"learning_rate": 1.5168466506586654e-06,
"loss": 0.0133,
"step": 854
},
{
"epoch": 6.84,
"grad_norm": 0.4145300090312958,
"learning_rate": 1.506309897341297e-06,
"loss": 0.0243,
"step": 855
},
{
"epoch": 6.848,
"grad_norm": 0.5303352475166321,
"learning_rate": 1.4958033742438348e-06,
"loss": 0.024,
"step": 856
},
{
"epoch": 6.856,
"grad_norm": 0.39044690132141113,
"learning_rate": 1.4853271722771772e-06,
"loss": 0.0193,
"step": 857
},
{
"epoch": 6.864,
"grad_norm": 0.3822794258594513,
"learning_rate": 1.4748813820898554e-06,
"loss": 0.0116,
"step": 858
},
{
"epoch": 6.872,
"grad_norm": 0.5295370221138,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0253,
"step": 859
},
{
"epoch": 6.88,
"grad_norm": 0.5559951663017273,
"learning_rate": 1.454081398330855e-06,
"loss": 0.0173,
"step": 860
},
{
"epoch": 6.888,
"grad_norm": 0.6219725012779236,
"learning_rate": 1.4437273847373778e-06,
"loss": 0.0246,
"step": 861
},
{
"epoch": 6.896,
"grad_norm": 0.4094639718532562,
"learning_rate": 1.4334041428781003e-06,
"loss": 0.0211,
"step": 862
},
{
"epoch": 6.904,
"grad_norm": 0.4057483375072479,
"learning_rate": 1.4231117620780188e-06,
"loss": 0.0167,
"step": 863
},
{
"epoch": 6.912,
"grad_norm": 0.41485121846199036,
"learning_rate": 1.4128503313951008e-06,
"loss": 0.0132,
"step": 864
},
{
"epoch": 6.92,
"grad_norm": 0.44596153497695923,
"learning_rate": 1.4026199396195078e-06,
"loss": 0.0192,
"step": 865
},
{
"epoch": 6.928,
"grad_norm": 0.5164179801940918,
"learning_rate": 1.3924206752728282e-06,
"loss": 0.0163,
"step": 866
},
{
"epoch": 6.936,
"grad_norm": 0.42679280042648315,
"learning_rate": 1.3822526266073044e-06,
"loss": 0.0116,
"step": 867
},
{
"epoch": 6.944,
"grad_norm": 0.5359206199645996,
"learning_rate": 1.3721158816050872e-06,
"loss": 0.0175,
"step": 868
},
{
"epoch": 6.952,
"grad_norm": 0.35650596022605896,
"learning_rate": 1.3620105279774532e-06,
"loss": 0.0119,
"step": 869
},
{
"epoch": 6.96,
"grad_norm": 0.4518095552921295,
"learning_rate": 1.3519366531640589e-06,
"loss": 0.0223,
"step": 870
},
{
"epoch": 6.968,
"grad_norm": 0.3648589253425598,
"learning_rate": 1.3418943443321807e-06,
"loss": 0.0146,
"step": 871
},
{
"epoch": 6.976,
"grad_norm": 0.41007596254348755,
"learning_rate": 1.3318836883759634e-06,
"loss": 0.0206,
"step": 872
},
{
"epoch": 6.984,
"grad_norm": 0.6869492530822754,
"learning_rate": 1.3219047719156575e-06,
"loss": 0.0231,
"step": 873
},
{
"epoch": 6.992,
"grad_norm": 0.42699146270751953,
"learning_rate": 1.3119576812968893e-06,
"loss": 0.0191,
"step": 874
},
{
"epoch": 7.0,
"grad_norm": 0.4389689266681671,
"learning_rate": 1.3020425025898926e-06,
"loss": 0.0178,
"step": 875
}
],
"logging_steps": 1.0,
"max_steps": 1125,
"num_input_tokens_seen": 0,
"num_train_epochs": 9,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.045822609337876e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}