{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1210,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008264462809917355,
"grad_norm": 35.12614822387695,
"learning_rate": 1.639344262295082e-07,
"loss": 1.2941,
"step": 1
},
{
"epoch": 0.001652892561983471,
"grad_norm": 36.89109420776367,
"learning_rate": 3.278688524590164e-07,
"loss": 1.4524,
"step": 2
},
{
"epoch": 0.0024793388429752068,
"grad_norm": 39.57857131958008,
"learning_rate": 4.918032786885246e-07,
"loss": 1.4642,
"step": 3
},
{
"epoch": 0.003305785123966942,
"grad_norm": 47.02167510986328,
"learning_rate": 6.557377049180328e-07,
"loss": 1.5826,
"step": 4
},
{
"epoch": 0.004132231404958678,
"grad_norm": 45.124114990234375,
"learning_rate": 8.196721311475409e-07,
"loss": 1.8503,
"step": 5
},
{
"epoch": 0.0049586776859504135,
"grad_norm": 374.71484375,
"learning_rate": 9.836065573770493e-07,
"loss": 4.8292,
"step": 6
},
{
"epoch": 0.005785123966942148,
"grad_norm": 376.0966796875,
"learning_rate": 1.1475409836065575e-06,
"loss": 4.7879,
"step": 7
},
{
"epoch": 0.006611570247933884,
"grad_norm": 374.0869445800781,
"learning_rate": 1.3114754098360657e-06,
"loss": 4.8364,
"step": 8
},
{
"epoch": 0.00743801652892562,
"grad_norm": 360.5589294433594,
"learning_rate": 1.4754098360655739e-06,
"loss": 4.3553,
"step": 9
},
{
"epoch": 0.008264462809917356,
"grad_norm": 349.61932373046875,
"learning_rate": 1.6393442622950819e-06,
"loss": 4.0703,
"step": 10
},
{
"epoch": 0.00909090909090909,
"grad_norm": 336.42889404296875,
"learning_rate": 1.8032786885245903e-06,
"loss": 3.8107,
"step": 11
},
{
"epoch": 0.009917355371900827,
"grad_norm": 198.629638671875,
"learning_rate": 1.9672131147540985e-06,
"loss": 2.7545,
"step": 12
},
{
"epoch": 0.010743801652892562,
"grad_norm": 147.47877502441406,
"learning_rate": 2.1311475409836067e-06,
"loss": 2.589,
"step": 13
},
{
"epoch": 0.011570247933884297,
"grad_norm": 90.01700592041016,
"learning_rate": 2.295081967213115e-06,
"loss": 2.5207,
"step": 14
},
{
"epoch": 0.012396694214876033,
"grad_norm": 43.3622932434082,
"learning_rate": 2.459016393442623e-06,
"loss": 2.3798,
"step": 15
},
{
"epoch": 0.013223140495867768,
"grad_norm": 33.97830581665039,
"learning_rate": 2.6229508196721314e-06,
"loss": 2.1262,
"step": 16
},
{
"epoch": 0.014049586776859505,
"grad_norm": 31.877763748168945,
"learning_rate": 2.786885245901639e-06,
"loss": 2.437,
"step": 17
},
{
"epoch": 0.01487603305785124,
"grad_norm": 44.69379425048828,
"learning_rate": 2.9508196721311478e-06,
"loss": 2.7615,
"step": 18
},
{
"epoch": 0.015702479338842976,
"grad_norm": 43.80733108520508,
"learning_rate": 3.114754098360656e-06,
"loss": 2.5191,
"step": 19
},
{
"epoch": 0.01652892561983471,
"grad_norm": 35.15926742553711,
"learning_rate": 3.2786885245901638e-06,
"loss": 2.3142,
"step": 20
},
{
"epoch": 0.017355371900826446,
"grad_norm": 33.019527435302734,
"learning_rate": 3.4426229508196724e-06,
"loss": 2.0956,
"step": 21
},
{
"epoch": 0.01818181818181818,
"grad_norm": 28.401559829711914,
"learning_rate": 3.6065573770491806e-06,
"loss": 1.7151,
"step": 22
},
{
"epoch": 0.019008264462809916,
"grad_norm": 25.92833137512207,
"learning_rate": 3.7704918032786884e-06,
"loss": 1.8749,
"step": 23
},
{
"epoch": 0.019834710743801654,
"grad_norm": 23.572309494018555,
"learning_rate": 3.934426229508197e-06,
"loss": 1.906,
"step": 24
},
{
"epoch": 0.02066115702479339,
"grad_norm": 22.781951904296875,
"learning_rate": 4.098360655737705e-06,
"loss": 1.8617,
"step": 25
},
{
"epoch": 0.021487603305785124,
"grad_norm": 7.054357528686523,
"learning_rate": 4.2622950819672135e-06,
"loss": 0.8932,
"step": 26
},
{
"epoch": 0.02231404958677686,
"grad_norm": 6.233171463012695,
"learning_rate": 4.426229508196722e-06,
"loss": 1.012,
"step": 27
},
{
"epoch": 0.023140495867768594,
"grad_norm": 5.4683051109313965,
"learning_rate": 4.59016393442623e-06,
"loss": 1.0207,
"step": 28
},
{
"epoch": 0.023966942148760332,
"grad_norm": 5.118521213531494,
"learning_rate": 4.754098360655738e-06,
"loss": 1.0721,
"step": 29
},
{
"epoch": 0.024793388429752067,
"grad_norm": 5.259830951690674,
"learning_rate": 4.918032786885246e-06,
"loss": 1.176,
"step": 30
},
{
"epoch": 0.0256198347107438,
"grad_norm": 31.10423469543457,
"learning_rate": 5.0819672131147545e-06,
"loss": 1.4009,
"step": 31
},
{
"epoch": 0.026446280991735537,
"grad_norm": 21.79800033569336,
"learning_rate": 5.245901639344263e-06,
"loss": 1.2526,
"step": 32
},
{
"epoch": 0.02727272727272727,
"grad_norm": 20.524797439575195,
"learning_rate": 5.409836065573772e-06,
"loss": 1.1859,
"step": 33
},
{
"epoch": 0.02809917355371901,
"grad_norm": 17.946226119995117,
"learning_rate": 5.573770491803278e-06,
"loss": 1.1133,
"step": 34
},
{
"epoch": 0.028925619834710745,
"grad_norm": 16.537263870239258,
"learning_rate": 5.737704918032787e-06,
"loss": 1.0933,
"step": 35
},
{
"epoch": 0.02975206611570248,
"grad_norm": 16.203887939453125,
"learning_rate": 5.9016393442622956e-06,
"loss": 1.051,
"step": 36
},
{
"epoch": 0.030578512396694214,
"grad_norm": 15.776374816894531,
"learning_rate": 6.065573770491804e-06,
"loss": 1.0397,
"step": 37
},
{
"epoch": 0.03140495867768595,
"grad_norm": 15.78660774230957,
"learning_rate": 6.229508196721312e-06,
"loss": 1.0247,
"step": 38
},
{
"epoch": 0.032231404958677684,
"grad_norm": 16.88052749633789,
"learning_rate": 6.393442622950821e-06,
"loss": 1.0407,
"step": 39
},
{
"epoch": 0.03305785123966942,
"grad_norm": 15.079468727111816,
"learning_rate": 6.5573770491803276e-06,
"loss": 0.9895,
"step": 40
},
{
"epoch": 0.033884297520661154,
"grad_norm": 16.4204044342041,
"learning_rate": 6.721311475409837e-06,
"loss": 0.9551,
"step": 41
},
{
"epoch": 0.03471074380165289,
"grad_norm": 13.154854774475098,
"learning_rate": 6.885245901639345e-06,
"loss": 1.0659,
"step": 42
},
{
"epoch": 0.03553719008264463,
"grad_norm": 14.134108543395996,
"learning_rate": 7.049180327868853e-06,
"loss": 1.1975,
"step": 43
},
{
"epoch": 0.03636363636363636,
"grad_norm": 11.584863662719727,
"learning_rate": 7.213114754098361e-06,
"loss": 1.1057,
"step": 44
},
{
"epoch": 0.0371900826446281,
"grad_norm": 9.628296852111816,
"learning_rate": 7.3770491803278695e-06,
"loss": 1.0823,
"step": 45
},
{
"epoch": 0.03801652892561983,
"grad_norm": 9.617013931274414,
"learning_rate": 7.540983606557377e-06,
"loss": 1.0447,
"step": 46
},
{
"epoch": 0.03884297520661157,
"grad_norm": 9.079089164733887,
"learning_rate": 7.704918032786886e-06,
"loss": 0.9984,
"step": 47
},
{
"epoch": 0.03966942148760331,
"grad_norm": 11.923172950744629,
"learning_rate": 7.868852459016394e-06,
"loss": 1.2281,
"step": 48
},
{
"epoch": 0.04049586776859504,
"grad_norm": 15.181923866271973,
"learning_rate": 8.032786885245902e-06,
"loss": 1.2481,
"step": 49
},
{
"epoch": 0.04132231404958678,
"grad_norm": 14.02413272857666,
"learning_rate": 8.19672131147541e-06,
"loss": 1.2116,
"step": 50
},
{
"epoch": 0.04214876033057851,
"grad_norm": 8.739473342895508,
"learning_rate": 8.360655737704919e-06,
"loss": 0.7494,
"step": 51
},
{
"epoch": 0.04297520661157025,
"grad_norm": 5.860500335693359,
"learning_rate": 8.524590163934427e-06,
"loss": 0.879,
"step": 52
},
{
"epoch": 0.043801652892561986,
"grad_norm": 4.310902118682861,
"learning_rate": 8.688524590163935e-06,
"loss": 0.7679,
"step": 53
},
{
"epoch": 0.04462809917355372,
"grad_norm": 3.633057117462158,
"learning_rate": 8.852459016393443e-06,
"loss": 0.8117,
"step": 54
},
{
"epoch": 0.045454545454545456,
"grad_norm": 3.607675313949585,
"learning_rate": 9.016393442622952e-06,
"loss": 0.8434,
"step": 55
},
{
"epoch": 0.04628099173553719,
"grad_norm": 7.013420581817627,
"learning_rate": 9.18032786885246e-06,
"loss": 0.9177,
"step": 56
},
{
"epoch": 0.047107438016528926,
"grad_norm": 10.00487995147705,
"learning_rate": 9.344262295081968e-06,
"loss": 0.8644,
"step": 57
},
{
"epoch": 0.047933884297520664,
"grad_norm": 10.111504554748535,
"learning_rate": 9.508196721311476e-06,
"loss": 0.8552,
"step": 58
},
{
"epoch": 0.048760330578512395,
"grad_norm": 10.907903671264648,
"learning_rate": 9.672131147540984e-06,
"loss": 0.8684,
"step": 59
},
{
"epoch": 0.049586776859504134,
"grad_norm": 9.939312934875488,
"learning_rate": 9.836065573770493e-06,
"loss": 0.8511,
"step": 60
},
{
"epoch": 0.050413223140495865,
"grad_norm": 10.330282211303711,
"learning_rate": 1e-05,
"loss": 0.853,
"step": 61
},
{
"epoch": 0.0512396694214876,
"grad_norm": 10.615926742553711,
"learning_rate": 9.99998131042498e-06,
"loss": 0.8483,
"step": 62
},
{
"epoch": 0.05206611570247934,
"grad_norm": 11.617918014526367,
"learning_rate": 9.99992524183964e-06,
"loss": 0.8302,
"step": 63
},
{
"epoch": 0.05289256198347107,
"grad_norm": 10.836939811706543,
"learning_rate": 9.99983179466314e-06,
"loss": 0.825,
"step": 64
},
{
"epoch": 0.05371900826446281,
"grad_norm": 10.899372100830078,
"learning_rate": 9.999700969594073e-06,
"loss": 0.821,
"step": 65
},
{
"epoch": 0.05454545454545454,
"grad_norm": 11.234271049499512,
"learning_rate": 9.999532767610465e-06,
"loss": 0.8093,
"step": 66
},
{
"epoch": 0.05537190082644628,
"grad_norm": 8.552489280700684,
"learning_rate": 9.999327189969768e-06,
"loss": 0.8969,
"step": 67
},
{
"epoch": 0.05619834710743802,
"grad_norm": 7.510222911834717,
"learning_rate": 9.999084238208843e-06,
"loss": 0.874,
"step": 68
},
{
"epoch": 0.05702479338842975,
"grad_norm": 7.619356632232666,
"learning_rate": 9.99880391414396e-06,
"loss": 0.9021,
"step": 69
},
{
"epoch": 0.05785123966942149,
"grad_norm": 6.904785633087158,
"learning_rate": 9.998486219870769e-06,
"loss": 0.8768,
"step": 70
},
{
"epoch": 0.05867768595041322,
"grad_norm": 8.251214981079102,
"learning_rate": 9.998131157764301e-06,
"loss": 0.8833,
"step": 71
},
{
"epoch": 0.05950413223140496,
"grad_norm": 7.453232765197754,
"learning_rate": 9.997738730478938e-06,
"loss": 0.8544,
"step": 72
},
{
"epoch": 0.0603305785123967,
"grad_norm": 10.829845428466797,
"learning_rate": 9.997308940948405e-06,
"loss": 0.929,
"step": 73
},
{
"epoch": 0.06115702479338843,
"grad_norm": 12.777321815490723,
"learning_rate": 9.996841792385728e-06,
"loss": 0.9576,
"step": 74
},
{
"epoch": 0.06198347107438017,
"grad_norm": 13.181741714477539,
"learning_rate": 9.996337288283236e-06,
"loss": 0.9274,
"step": 75
},
{
"epoch": 0.0628099173553719,
"grad_norm": 22.688045501708984,
"learning_rate": 9.995795432412513e-06,
"loss": 1.1367,
"step": 76
},
{
"epoch": 0.06363636363636363,
"grad_norm": 9.36829662322998,
"learning_rate": 9.995216228824383e-06,
"loss": 0.9509,
"step": 77
},
{
"epoch": 0.06446280991735537,
"grad_norm": 5.943204879760742,
"learning_rate": 9.994599681848873e-06,
"loss": 0.8382,
"step": 78
},
{
"epoch": 0.0652892561983471,
"grad_norm": 4.420541763305664,
"learning_rate": 9.993945796095183e-06,
"loss": 0.7509,
"step": 79
},
{
"epoch": 0.06611570247933884,
"grad_norm": 3.4656481742858887,
"learning_rate": 9.993254576451652e-06,
"loss": 0.7744,
"step": 80
},
{
"epoch": 0.06694214876033058,
"grad_norm": 7.592845916748047,
"learning_rate": 9.992526028085721e-06,
"loss": 0.6749,
"step": 81
},
{
"epoch": 0.06776859504132231,
"grad_norm": 13.7494535446167,
"learning_rate": 9.991760156443892e-06,
"loss": 0.697,
"step": 82
},
{
"epoch": 0.06859504132231405,
"grad_norm": 14.44515323638916,
"learning_rate": 9.990956967251692e-06,
"loss": 0.7142,
"step": 83
},
{
"epoch": 0.06942148760330578,
"grad_norm": 14.63835334777832,
"learning_rate": 9.990116466513628e-06,
"loss": 0.6709,
"step": 84
},
{
"epoch": 0.07024793388429752,
"grad_norm": 15.553050994873047,
"learning_rate": 9.989238660513141e-06,
"loss": 0.7016,
"step": 85
},
{
"epoch": 0.07107438016528926,
"grad_norm": 15.484620094299316,
"learning_rate": 9.988323555812558e-06,
"loss": 0.684,
"step": 86
},
{
"epoch": 0.07190082644628099,
"grad_norm": 15.376058578491211,
"learning_rate": 9.987371159253047e-06,
"loss": 0.6672,
"step": 87
},
{
"epoch": 0.07272727272727272,
"grad_norm": 15.391505241394043,
"learning_rate": 9.98638147795456e-06,
"loss": 0.6632,
"step": 88
},
{
"epoch": 0.07355371900826446,
"grad_norm": 16.409034729003906,
"learning_rate": 9.98535451931579e-06,
"loss": 0.6393,
"step": 89
},
{
"epoch": 0.0743801652892562,
"grad_norm": 15.590607643127441,
"learning_rate": 9.984290291014105e-06,
"loss": 0.6383,
"step": 90
},
{
"epoch": 0.07520661157024794,
"grad_norm": 10.495964050292969,
"learning_rate": 9.983188801005492e-06,
"loss": 0.7488,
"step": 91
},
{
"epoch": 0.07603305785123966,
"grad_norm": 9.75681209564209,
"learning_rate": 9.982050057524505e-06,
"loss": 0.7838,
"step": 92
},
{
"epoch": 0.0768595041322314,
"grad_norm": 9.70681381225586,
"learning_rate": 9.980874069084197e-06,
"loss": 0.7908,
"step": 93
},
{
"epoch": 0.07768595041322314,
"grad_norm": 9.451353073120117,
"learning_rate": 9.979660844476056e-06,
"loss": 0.7204,
"step": 94
},
{
"epoch": 0.07851239669421488,
"grad_norm": 8.870857238769531,
"learning_rate": 9.978410392769943e-06,
"loss": 0.7342,
"step": 95
},
{
"epoch": 0.07933884297520662,
"grad_norm": 8.990922927856445,
"learning_rate": 9.977122723314026e-06,
"loss": 0.6932,
"step": 96
},
{
"epoch": 0.08016528925619834,
"grad_norm": 9.571405410766602,
"learning_rate": 9.975797845734699e-06,
"loss": 0.7034,
"step": 97
},
{
"epoch": 0.08099173553719008,
"grad_norm": 15.28164005279541,
"learning_rate": 9.974435769936523e-06,
"loss": 0.8082,
"step": 98
},
{
"epoch": 0.08181818181818182,
"grad_norm": 15.263097763061523,
"learning_rate": 9.973036506102145e-06,
"loss": 0.7938,
"step": 99
},
{
"epoch": 0.08264462809917356,
"grad_norm": 14.575544357299805,
"learning_rate": 9.971600064692222e-06,
"loss": 0.7302,
"step": 100
},
{
"epoch": 0.0834710743801653,
"grad_norm": 9.692093849182129,
"learning_rate": 9.970126456445348e-06,
"loss": 0.7892,
"step": 101
},
{
"epoch": 0.08429752066115702,
"grad_norm": 6.960953235626221,
"learning_rate": 9.96861569237797e-06,
"loss": 0.8115,
"step": 102
},
{
"epoch": 0.08512396694214876,
"grad_norm": 6.310975074768066,
"learning_rate": 9.967067783784297e-06,
"loss": 0.87,
"step": 103
},
{
"epoch": 0.0859504132231405,
"grad_norm": 5.528263568878174,
"learning_rate": 9.965482742236234e-06,
"loss": 0.8227,
"step": 104
},
{
"epoch": 0.08677685950413223,
"grad_norm": 5.674769878387451,
"learning_rate": 9.963860579583284e-06,
"loss": 0.8033,
"step": 105
},
{
"epoch": 0.08760330578512397,
"grad_norm": 9.012608528137207,
"learning_rate": 9.962201307952455e-06,
"loss": 0.6766,
"step": 106
},
{
"epoch": 0.0884297520661157,
"grad_norm": 14.8609619140625,
"learning_rate": 9.960504939748184e-06,
"loss": 0.5175,
"step": 107
},
{
"epoch": 0.08925619834710743,
"grad_norm": 15.575093269348145,
"learning_rate": 9.95877148765223e-06,
"loss": 0.4785,
"step": 108
},
{
"epoch": 0.09008264462809917,
"grad_norm": 15.258967399597168,
"learning_rate": 9.957000964623585e-06,
"loss": 0.4621,
"step": 109
},
{
"epoch": 0.09090909090909091,
"grad_norm": 15.684150695800781,
"learning_rate": 9.955193383898376e-06,
"loss": 0.4981,
"step": 110
},
{
"epoch": 0.09173553719008265,
"grad_norm": 14.834244728088379,
"learning_rate": 9.953348758989774e-06,
"loss": 0.4395,
"step": 111
},
{
"epoch": 0.09256198347107437,
"grad_norm": 15.187931060791016,
"learning_rate": 9.951467103687879e-06,
"loss": 0.462,
"step": 112
},
{
"epoch": 0.09338842975206611,
"grad_norm": 16.399118423461914,
"learning_rate": 9.949548432059627e-06,
"loss": 0.4867,
"step": 113
},
{
"epoch": 0.09421487603305785,
"grad_norm": 15.177169799804688,
"learning_rate": 9.94759275844868e-06,
"loss": 0.4434,
"step": 114
},
{
"epoch": 0.09504132231404959,
"grad_norm": 14.918987274169922,
"learning_rate": 9.945600097475322e-06,
"loss": 0.4212,
"step": 115
},
{
"epoch": 0.09586776859504133,
"grad_norm": 15.301935195922852,
"learning_rate": 9.943570464036347e-06,
"loss": 0.4336,
"step": 116
},
{
"epoch": 0.09669421487603305,
"grad_norm": 14.648711204528809,
"learning_rate": 9.94150387330495e-06,
"loss": 0.4064,
"step": 117
},
{
"epoch": 0.09752066115702479,
"grad_norm": 9.166203498840332,
"learning_rate": 9.939400340730611e-06,
"loss": 0.6176,
"step": 118
},
{
"epoch": 0.09834710743801653,
"grad_norm": 8.874405860900879,
"learning_rate": 9.937259882038986e-06,
"loss": 0.5856,
"step": 119
},
{
"epoch": 0.09917355371900827,
"grad_norm": 9.233952522277832,
"learning_rate": 9.935082513231776e-06,
"loss": 0.6215,
"step": 120
},
{
"epoch": 0.1,
"grad_norm": 8.666190147399902,
"learning_rate": 9.932868250586619e-06,
"loss": 0.5833,
"step": 121
},
{
"epoch": 0.10082644628099173,
"grad_norm": 8.544061660766602,
"learning_rate": 9.93061711065697e-06,
"loss": 0.5457,
"step": 122
},
{
"epoch": 0.10165289256198347,
"grad_norm": 13.043021202087402,
"learning_rate": 9.928329110271967e-06,
"loss": 0.6034,
"step": 123
},
{
"epoch": 0.1024793388429752,
"grad_norm": 13.846759796142578,
"learning_rate": 9.926004266536314e-06,
"loss": 0.559,
"step": 124
},
{
"epoch": 0.10330578512396695,
"grad_norm": 13.205362319946289,
"learning_rate": 9.923642596830142e-06,
"loss": 0.4789,
"step": 125
},
{
"epoch": 0.10413223140495868,
"grad_norm": 4.691654205322266,
"learning_rate": 9.921244118808896e-06,
"loss": 0.5618,
"step": 126
},
{
"epoch": 0.10495867768595041,
"grad_norm": 4.73344612121582,
"learning_rate": 9.918808850403192e-06,
"loss": 0.6367,
"step": 127
},
{
"epoch": 0.10578512396694215,
"grad_norm": 4.595221042633057,
"learning_rate": 9.916336809818679e-06,
"loss": 0.6359,
"step": 128
},
{
"epoch": 0.10661157024793388,
"grad_norm": 4.814165115356445,
"learning_rate": 9.913828015535914e-06,
"loss": 0.6589,
"step": 129
},
{
"epoch": 0.10743801652892562,
"grad_norm": 4.562165260314941,
"learning_rate": 9.911282486310214e-06,
"loss": 0.6342,
"step": 130
},
{
"epoch": 0.10826446280991736,
"grad_norm": 4.110449314117432,
"learning_rate": 9.908700241171528e-06,
"loss": 0.5926,
"step": 131
},
{
"epoch": 0.10909090909090909,
"grad_norm": 15.924474716186523,
"learning_rate": 9.906081299424276e-06,
"loss": 0.3306,
"step": 132
},
{
"epoch": 0.10991735537190082,
"grad_norm": 16.74171257019043,
"learning_rate": 9.903425680647225e-06,
"loss": 0.3228,
"step": 133
},
{
"epoch": 0.11074380165289256,
"grad_norm": 16.331884384155273,
"learning_rate": 9.900733404693328e-06,
"loss": 0.2947,
"step": 134
},
{
"epoch": 0.1115702479338843,
"grad_norm": 14.321427345275879,
"learning_rate": 9.898004491689582e-06,
"loss": 0.2559,
"step": 135
},
{
"epoch": 0.11239669421487604,
"grad_norm": 13.67010498046875,
"learning_rate": 9.895238962036878e-06,
"loss": 0.2296,
"step": 136
},
{
"epoch": 0.11322314049586776,
"grad_norm": 13.051318168640137,
"learning_rate": 9.892436836409845e-06,
"loss": 0.2229,
"step": 137
},
{
"epoch": 0.1140495867768595,
"grad_norm": 13.087048530578613,
"learning_rate": 9.8895981357567e-06,
"loss": 0.2184,
"step": 138
},
{
"epoch": 0.11487603305785124,
"grad_norm": 12.81165885925293,
"learning_rate": 9.88672288129908e-06,
"loss": 0.1808,
"step": 139
},
{
"epoch": 0.11570247933884298,
"grad_norm": 12.778697967529297,
"learning_rate": 9.883811094531906e-06,
"loss": 0.1955,
"step": 140
},
{
"epoch": 0.11652892561983472,
"grad_norm": 12.421398162841797,
"learning_rate": 9.880862797223197e-06,
"loss": 0.1762,
"step": 141
},
{
"epoch": 0.11735537190082644,
"grad_norm": 12.256722450256348,
"learning_rate": 9.877878011413924e-06,
"loss": 0.1802,
"step": 142
},
{
"epoch": 0.11818181818181818,
"grad_norm": 10.989577293395996,
"learning_rate": 9.874856759417837e-06,
"loss": 0.2245,
"step": 143
},
{
"epoch": 0.11900826446280992,
"grad_norm": 7.684560775756836,
"learning_rate": 9.871799063821303e-06,
"loss": 0.4634,
"step": 144
},
{
"epoch": 0.11983471074380166,
"grad_norm": 6.932048320770264,
"learning_rate": 9.868704947483134e-06,
"loss": 0.454,
"step": 145
},
{
"epoch": 0.1206611570247934,
"grad_norm": 6.686558246612549,
"learning_rate": 9.86557443353442e-06,
"loss": 0.4499,
"step": 146
},
{
"epoch": 0.12148760330578512,
"grad_norm": 5.892101764678955,
"learning_rate": 9.862407545378348e-06,
"loss": 0.4378,
"step": 147
},
{
"epoch": 0.12231404958677686,
"grad_norm": 5.772195816040039,
"learning_rate": 9.859204306690038e-06,
"loss": 0.4666,
"step": 148
},
{
"epoch": 0.1231404958677686,
"grad_norm": 10.375298500061035,
"learning_rate": 9.855964741416355e-06,
"loss": 0.3677,
"step": 149
},
{
"epoch": 0.12396694214876033,
"grad_norm": 8.872918128967285,
"learning_rate": 9.852688873775741e-06,
"loss": 0.3167,
"step": 150
},
{
"epoch": 0.12479338842975207,
"grad_norm": 6.068610668182373,
"learning_rate": 9.849376728258024e-06,
"loss": 0.5171,
"step": 151
},
{
"epoch": 0.1256198347107438,
"grad_norm": 6.121888160705566,
"learning_rate": 9.846028329624242e-06,
"loss": 0.55,
"step": 152
},
{
"epoch": 0.12644628099173555,
"grad_norm": 5.789841651916504,
"learning_rate": 9.842643702906453e-06,
"loss": 0.5124,
"step": 153
},
{
"epoch": 0.12727272727272726,
"grad_norm": 5.199845314025879,
"learning_rate": 9.839222873407553e-06,
"loss": 0.5206,
"step": 154
},
{
"epoch": 0.128099173553719,
"grad_norm": 4.326947212219238,
"learning_rate": 9.835765866701079e-06,
"loss": 0.5173,
"step": 155
},
{
"epoch": 0.12892561983471074,
"grad_norm": 3.3200628757476807,
"learning_rate": 9.832272708631027e-06,
"loss": 0.3047,
"step": 156
},
{
"epoch": 0.12975206611570247,
"grad_norm": 6.605825424194336,
"learning_rate": 9.828743425311654e-06,
"loss": 0.1155,
"step": 157
},
{
"epoch": 0.1305785123966942,
"grad_norm": 5.040175914764404,
"learning_rate": 9.825178043127279e-06,
"loss": 0.0726,
"step": 158
},
{
"epoch": 0.13140495867768595,
"grad_norm": 4.458925724029541,
"learning_rate": 9.821576588732095e-06,
"loss": 0.0671,
"step": 159
},
{
"epoch": 0.1322314049586777,
"grad_norm": 4.0228776931762695,
"learning_rate": 9.817939089049964e-06,
"loss": 0.0618,
"step": 160
},
{
"epoch": 0.13305785123966943,
"grad_norm": 3.98579740524292,
"learning_rate": 9.814265571274215e-06,
"loss": 0.0447,
"step": 161
},
{
"epoch": 0.13388429752066117,
"grad_norm": 3.2810285091400146,
"learning_rate": 9.81055606286744e-06,
"loss": 0.0549,
"step": 162
},
{
"epoch": 0.1347107438016529,
"grad_norm": 2.91422176361084,
"learning_rate": 9.806810591561295e-06,
"loss": 0.0535,
"step": 163
},
{
"epoch": 0.13553719008264462,
"grad_norm": 2.663398265838623,
"learning_rate": 9.803029185356286e-06,
"loss": 0.0418,
"step": 164
},
{
"epoch": 0.13636363636363635,
"grad_norm": 4.249295234680176,
"learning_rate": 9.799211872521564e-06,
"loss": 0.0704,
"step": 165
},
{
"epoch": 0.1371900826446281,
"grad_norm": 4.342033863067627,
"learning_rate": 9.795358681594712e-06,
"loss": 0.071,
"step": 166
},
{
"epoch": 0.13801652892561983,
"grad_norm": 5.222247123718262,
"learning_rate": 9.791469641381526e-06,
"loss": 0.0443,
"step": 167
},
{
"epoch": 0.13884297520661157,
"grad_norm": 6.7554473876953125,
"learning_rate": 9.787544780955815e-06,
"loss": 0.5837,
"step": 168
},
{
"epoch": 0.1396694214876033,
"grad_norm": 5.858948707580566,
"learning_rate": 9.783584129659162e-06,
"loss": 0.5041,
"step": 169
},
{
"epoch": 0.14049586776859505,
"grad_norm": 3.3874192237854004,
"learning_rate": 9.779587717100729e-06,
"loss": 0.4407,
"step": 170
},
{
"epoch": 0.14132231404958678,
"grad_norm": 3.474137306213379,
"learning_rate": 9.775555573157016e-06,
"loss": 0.4318,
"step": 171
},
{
"epoch": 0.14214876033057852,
"grad_norm": 3.252941370010376,
"learning_rate": 9.771487727971642e-06,
"loss": 0.4322,
"step": 172
},
{
"epoch": 0.14297520661157026,
"grad_norm": 3.3677756786346436,
"learning_rate": 9.767384211955126e-06,
"loss": 0.4496,
"step": 173
},
{
"epoch": 0.14380165289256197,
"grad_norm": 6.583395957946777,
"learning_rate": 9.763245055784663e-06,
"loss": 0.3269,
"step": 174
},
{
"epoch": 0.1446280991735537,
"grad_norm": 4.727723121643066,
"learning_rate": 9.759070290403873e-06,
"loss": 0.2061,
"step": 175
},
{
"epoch": 0.14545454545454545,
"grad_norm": 6.042111873626709,
"learning_rate": 9.754859947022596e-06,
"loss": 0.5294,
"step": 176
},
{
"epoch": 0.14628099173553719,
"grad_norm": 6.286961555480957,
"learning_rate": 9.750614057116642e-06,
"loss": 0.5713,
"step": 177
},
{
"epoch": 0.14710743801652892,
"grad_norm": 6.675935745239258,
"learning_rate": 9.746332652427566e-06,
"loss": 0.5804,
"step": 178
},
{
"epoch": 0.14793388429752066,
"grad_norm": 6.587425231933594,
"learning_rate": 9.742015764962418e-06,
"loss": 0.547,
"step": 179
},
{
"epoch": 0.1487603305785124,
"grad_norm": 6.034268856048584,
"learning_rate": 9.737663426993514e-06,
"loss": 0.5808,
"step": 180
},
{
"epoch": 0.14958677685950414,
"grad_norm": 3.43208384513855,
"learning_rate": 9.733275671058195e-06,
"loss": 0.0556,
"step": 181
},
{
"epoch": 0.15041322314049588,
"grad_norm": 1.2211329936981201,
"learning_rate": 9.728852529958579e-06,
"loss": 0.0543,
"step": 182
},
{
"epoch": 0.15123966942148762,
"grad_norm": 4.956982135772705,
"learning_rate": 9.724394036761316e-06,
"loss": 0.0459,
"step": 183
},
{
"epoch": 0.15206611570247933,
"grad_norm": 2.556718587875366,
"learning_rate": 9.71990022479734e-06,
"loss": 0.0404,
"step": 184
},
{
"epoch": 0.15289256198347106,
"grad_norm": 1.8153376579284668,
"learning_rate": 9.715371127661632e-06,
"loss": 0.0317,
"step": 185
},
{
"epoch": 0.1537190082644628,
"grad_norm": 0.7108684778213501,
"learning_rate": 9.710806779212947e-06,
"loss": 0.0277,
"step": 186
},
{
"epoch": 0.15454545454545454,
"grad_norm": 2.305515766143799,
"learning_rate": 9.70620721357358e-06,
"loss": 0.0455,
"step": 187
},
{
"epoch": 0.15537190082644628,
"grad_norm": 1.716363787651062,
"learning_rate": 9.7015724651291e-06,
"loss": 0.0349,
"step": 188
},
{
"epoch": 0.15619834710743802,
"grad_norm": 5.611410140991211,
"learning_rate": 9.696902568528103e-06,
"loss": 0.0857,
"step": 189
},
{
"epoch": 0.15702479338842976,
"grad_norm": 2.8802907466888428,
"learning_rate": 9.69219755868194e-06,
"loss": 0.0507,
"step": 190
},
{
"epoch": 0.1578512396694215,
"grad_norm": 3.974024534225464,
"learning_rate": 9.68745747076446e-06,
"loss": 0.2486,
"step": 191
},
{
"epoch": 0.15867768595041323,
"grad_norm": 4.467517375946045,
"learning_rate": 9.682682340211763e-06,
"loss": 0.4189,
"step": 192
},
{
"epoch": 0.15950413223140497,
"grad_norm": 4.234643459320068,
"learning_rate": 9.677872202721906e-06,
"loss": 0.4185,
"step": 193
},
{
"epoch": 0.16033057851239668,
"grad_norm": 2.5838871002197266,
"learning_rate": 9.673027094254663e-06,
"loss": 0.3809,
"step": 194
},
{
"epoch": 0.16115702479338842,
"grad_norm": 3.291666030883789,
"learning_rate": 9.66814705103124e-06,
"loss": 0.4241,
"step": 195
},
{
"epoch": 0.16198347107438016,
"grad_norm": 2.974576473236084,
"learning_rate": 9.663232109534011e-06,
"loss": 0.3915,
"step": 196
},
{
"epoch": 0.1628099173553719,
"grad_norm": 2.9471116065979004,
"learning_rate": 9.658282306506242e-06,
"loss": 0.3889,
"step": 197
},
{
"epoch": 0.16363636363636364,
"grad_norm": 5.0516815185546875,
"learning_rate": 9.653297678951822e-06,
"loss": 0.3213,
"step": 198
},
{
"epoch": 0.16446280991735537,
"grad_norm": 6.269105434417725,
"learning_rate": 9.648278264134977e-06,
"loss": 0.2461,
"step": 199
},
{
"epoch": 0.1652892561983471,
"grad_norm": 7.234027862548828,
"learning_rate": 9.643224099579997e-06,
"loss": 0.2729,
"step": 200
},
{
"epoch": 0.16611570247933885,
"grad_norm": 3.872319221496582,
"learning_rate": 9.63813522307096e-06,
"loss": 0.4332,
"step": 201
},
{
"epoch": 0.1669421487603306,
"grad_norm": 3.694383382797241,
"learning_rate": 9.633011672651443e-06,
"loss": 0.4327,
"step": 202
},
{
"epoch": 0.16776859504132233,
"grad_norm": 4.5347819328308105,
"learning_rate": 9.627853486624234e-06,
"loss": 0.3973,
"step": 203
},
{
"epoch": 0.16859504132231404,
"grad_norm": 5.72538948059082,
"learning_rate": 9.622660703551059e-06,
"loss": 0.4392,
"step": 204
},
{
"epoch": 0.16942148760330578,
"grad_norm": 5.8939409255981445,
"learning_rate": 9.617433362252277e-06,
"loss": 0.4358,
"step": 205
},
{
"epoch": 0.17024793388429751,
"grad_norm": 63.848697662353516,
"learning_rate": 9.612171501806606e-06,
"loss": 0.5083,
"step": 206
},
{
"epoch": 0.17107438016528925,
"grad_norm": 9.372079849243164,
"learning_rate": 9.606875161550819e-06,
"loss": 0.2324,
"step": 207
},
{
"epoch": 0.171900826446281,
"grad_norm": 9.741740226745605,
"learning_rate": 9.601544381079457e-06,
"loss": 0.1739,
"step": 208
},
{
"epoch": 0.17272727272727273,
"grad_norm": 5.752349376678467,
"learning_rate": 9.596179200244527e-06,
"loss": 0.089,
"step": 209
},
{
"epoch": 0.17355371900826447,
"grad_norm": 1.3193517923355103,
"learning_rate": 9.59077965915521e-06,
"loss": 0.0507,
"step": 210
},
{
"epoch": 0.1743801652892562,
"grad_norm": 3.0285868644714355,
"learning_rate": 9.585345798177557e-06,
"loss": 0.0498,
"step": 211
},
{
"epoch": 0.17520661157024794,
"grad_norm": 9.561731338500977,
"learning_rate": 9.579877657934187e-06,
"loss": 0.0875,
"step": 212
},
{
"epoch": 0.17603305785123968,
"grad_norm": 1.7362544536590576,
"learning_rate": 9.574375279303989e-06,
"loss": 0.0471,
"step": 213
},
{
"epoch": 0.1768595041322314,
"grad_norm": 0.43394935131073,
"learning_rate": 9.56883870342181e-06,
"loss": 0.0437,
"step": 214
},
{
"epoch": 0.17768595041322313,
"grad_norm": 1.7832210063934326,
"learning_rate": 9.563267971678152e-06,
"loss": 0.0515,
"step": 215
},
{
"epoch": 0.17851239669421487,
"grad_norm": 0.40522071719169617,
"learning_rate": 9.557663125718855e-06,
"loss": 0.0301,
"step": 216
},
{
"epoch": 0.1793388429752066,
"grad_norm": 1.7870750427246094,
"learning_rate": 9.552024207444794e-06,
"loss": 0.0491,
"step": 217
},
{
"epoch": 0.18016528925619835,
"grad_norm": 3.0770976543426514,
"learning_rate": 9.546351259011569e-06,
"loss": 0.3519,
"step": 218
},
{
"epoch": 0.18099173553719008,
"grad_norm": 1.7763985395431519,
"learning_rate": 9.540644322829173e-06,
"loss": 0.4262,
"step": 219
},
{
"epoch": 0.18181818181818182,
"grad_norm": 2.929096221923828,
"learning_rate": 9.534903441561693e-06,
"loss": 0.4233,
"step": 220
},
{
"epoch": 0.18264462809917356,
"grad_norm": 2.5705196857452393,
"learning_rate": 9.52912865812698e-06,
"loss": 0.3956,
"step": 221
},
{
"epoch": 0.1834710743801653,
"grad_norm": 3.308143377304077,
"learning_rate": 9.523320015696336e-06,
"loss": 0.396,
"step": 222
},
{
"epoch": 0.18429752066115704,
"grad_norm": 3.0954201221466064,
"learning_rate": 9.517477557694182e-06,
"loss": 0.3624,
"step": 223
},
{
"epoch": 0.18512396694214875,
"grad_norm": 4.504317760467529,
"learning_rate": 9.51160132779774e-06,
"loss": 0.2108,
"step": 224
},
{
"epoch": 0.1859504132231405,
"grad_norm": 4.281580924987793,
"learning_rate": 9.50569136993671e-06,
"loss": 0.22,
"step": 225
},
{
"epoch": 0.18677685950413223,
"grad_norm": 5.512481689453125,
"learning_rate": 9.499747728292928e-06,
"loss": 0.4468,
"step": 226
},
{
"epoch": 0.18760330578512396,
"grad_norm": 6.550302982330322,
"learning_rate": 9.49377044730005e-06,
"loss": 0.4252,
"step": 227
},
{
"epoch": 0.1884297520661157,
"grad_norm": 6.840104103088379,
"learning_rate": 9.48775957164321e-06,
"loss": 0.431,
"step": 228
},
{
"epoch": 0.18925619834710744,
"grad_norm": 7.467294692993164,
"learning_rate": 9.481715146258699e-06,
"loss": 0.4236,
"step": 229
},
{
"epoch": 0.19008264462809918,
"grad_norm": 6.159290790557861,
"learning_rate": 9.475637216333611e-06,
"loss": 0.3231,
"step": 230
},
{
"epoch": 0.19090909090909092,
"grad_norm": 0.58745276927948,
"learning_rate": 9.469525827305514e-06,
"loss": 0.0298,
"step": 231
},
{
"epoch": 0.19173553719008266,
"grad_norm": 1.123795747756958,
"learning_rate": 9.463381024862116e-06,
"loss": 0.0211,
"step": 232
},
{
"epoch": 0.1925619834710744,
"grad_norm": 0.49130845069885254,
"learning_rate": 9.457202854940915e-06,
"loss": 0.0295,
"step": 233
},
{
"epoch": 0.1933884297520661,
"grad_norm": 0.43577098846435547,
"learning_rate": 9.450991363728858e-06,
"loss": 0.0192,
"step": 234
},
{
"epoch": 0.19421487603305784,
"grad_norm": 0.9045920372009277,
"learning_rate": 9.444746597662e-06,
"loss": 0.0313,
"step": 235
},
{
"epoch": 0.19504132231404958,
"grad_norm": 1.6747899055480957,
"learning_rate": 9.438468603425147e-06,
"loss": 0.0433,
"step": 236
},
{
"epoch": 0.19586776859504132,
"grad_norm": 3.9514214992523193,
"learning_rate": 9.432157427951521e-06,
"loss": 0.0764,
"step": 237
},
{
"epoch": 0.19669421487603306,
"grad_norm": 1.2828267812728882,
"learning_rate": 9.425813118422393e-06,
"loss": 0.0396,
"step": 238
},
{
"epoch": 0.1975206611570248,
"grad_norm": 1.3975738286972046,
"learning_rate": 9.419435722266745e-06,
"loss": 0.0446,
"step": 239
},
{
"epoch": 0.19834710743801653,
"grad_norm": 0.3374120891094208,
"learning_rate": 9.413025287160904e-06,
"loss": 0.0423,
"step": 240
},
{
"epoch": 0.19917355371900827,
"grad_norm": 1.6610523462295532,
"learning_rate": 9.406581861028199e-06,
"loss": 0.0446,
"step": 241
},
{
"epoch": 0.2,
"grad_norm": 2.999248504638672,
"learning_rate": 9.40010549203858e-06,
"loss": 0.1462,
"step": 242
},
{
"epoch": 0.20082644628099172,
"grad_norm": 3.1203508377075195,
"learning_rate": 9.393596228608289e-06,
"loss": 0.4149,
"step": 243
},
{
"epoch": 0.20165289256198346,
"grad_norm": 3.1315393447875977,
"learning_rate": 9.387054119399466e-06,
"loss": 0.4055,
"step": 244
},
{
"epoch": 0.2024793388429752,
"grad_norm": 1.9984904527664185,
"learning_rate": 9.38047921331981e-06,
"loss": 0.3894,
"step": 245
},
{
"epoch": 0.20330578512396694,
"grad_norm": 2.6335408687591553,
"learning_rate": 9.373871559522203e-06,
"loss": 0.3866,
"step": 246
},
{
"epoch": 0.20413223140495868,
"grad_norm": 4.331795692443848,
"learning_rate": 9.36723120740434e-06,
"loss": 0.4232,
"step": 247
},
{
"epoch": 0.2049586776859504,
"grad_norm": 3.634972333908081,
"learning_rate": 9.360558206608363e-06,
"loss": 0.3621,
"step": 248
},
{
"epoch": 0.20578512396694215,
"grad_norm": 7.504141330718994,
"learning_rate": 9.353852607020496e-06,
"loss": 0.2442,
"step": 249
},
{
"epoch": 0.2066115702479339,
"grad_norm": 4.689815521240234,
"learning_rate": 9.347114458770656e-06,
"loss": 0.2701,
"step": 250
},
{
"epoch": 0.20743801652892563,
"grad_norm": 4.537595748901367,
"learning_rate": 9.340343812232097e-06,
"loss": 0.2763,
"step": 251
},
{
"epoch": 0.20826446280991737,
"grad_norm": 5.105134010314941,
"learning_rate": 9.333540718021024e-06,
"loss": 0.3415,
"step": 252
},
{
"epoch": 0.20909090909090908,
"grad_norm": 5.4744672775268555,
"learning_rate": 9.326705226996207e-06,
"loss": 0.3494,
"step": 253
},
{
"epoch": 0.20991735537190082,
"grad_norm": 5.980065822601318,
"learning_rate": 9.319837390258619e-06,
"loss": 0.3448,
"step": 254
},
{
"epoch": 0.21074380165289255,
"grad_norm": 5.463247776031494,
"learning_rate": 9.31293725915104e-06,
"loss": 0.377,
"step": 255
},
{
"epoch": 0.2115702479338843,
"grad_norm": 2.519472360610962,
"learning_rate": 9.306004885257675e-06,
"loss": 0.195,
"step": 256
},
{
"epoch": 0.21239669421487603,
"grad_norm": 4.671911716461182,
"learning_rate": 9.299040320403775e-06,
"loss": 0.056,
"step": 257
},
{
"epoch": 0.21322314049586777,
"grad_norm": 4.731037616729736,
"learning_rate": 9.29204361665524e-06,
"loss": 0.051,
"step": 258
},
{
"epoch": 0.2140495867768595,
"grad_norm": 2.2982873916625977,
"learning_rate": 9.28501482631824e-06,
"loss": 0.0481,
"step": 259
},
{
"epoch": 0.21487603305785125,
"grad_norm": 1.7563531398773193,
"learning_rate": 9.277954001938819e-06,
"loss": 0.0406,
"step": 260
},
{
"epoch": 0.21570247933884298,
"grad_norm": 1.352148413658142,
"learning_rate": 9.270861196302494e-06,
"loss": 0.0319,
"step": 261
},
{
"epoch": 0.21652892561983472,
"grad_norm": 1.9197131395339966,
"learning_rate": 9.26373646243388e-06,
"loss": 0.0522,
"step": 262
},
{
"epoch": 0.21735537190082643,
"grad_norm": 0.9207364916801453,
"learning_rate": 9.256579853596273e-06,
"loss": 0.0385,
"step": 263
},
{
"epoch": 0.21818181818181817,
"grad_norm": 0.5068424940109253,
"learning_rate": 9.249391423291263e-06,
"loss": 0.0193,
"step": 264
},
{
"epoch": 0.2190082644628099,
"grad_norm": 0.37090161442756653,
"learning_rate": 9.242171225258336e-06,
"loss": 0.019,
"step": 265
},
{
"epoch": 0.21983471074380165,
"grad_norm": 3.5917792320251465,
"learning_rate": 9.234919313474463e-06,
"loss": 0.0792,
"step": 266
},
{
"epoch": 0.2206611570247934,
"grad_norm": 3.6245150566101074,
"learning_rate": 9.227635742153706e-06,
"loss": 0.0796,
"step": 267
},
{
"epoch": 0.22148760330578512,
"grad_norm": 4.045724391937256,
"learning_rate": 9.220320565746806e-06,
"loss": 0.3782,
"step": 268
},
{
"epoch": 0.22231404958677686,
"grad_norm": 2.375504493713379,
"learning_rate": 9.212973838940775e-06,
"loss": 0.3738,
"step": 269
},
{
"epoch": 0.2231404958677686,
"grad_norm": 3.42677903175354,
"learning_rate": 9.205595616658495e-06,
"loss": 0.384,
"step": 270
},
{
"epoch": 0.22396694214876034,
"grad_norm": 2.2642056941986084,
"learning_rate": 9.198185954058305e-06,
"loss": 0.3928,
"step": 271
},
{
"epoch": 0.22479338842975208,
"grad_norm": 1.6667097806930542,
"learning_rate": 9.190744906533578e-06,
"loss": 0.3812,
"step": 272
},
{
"epoch": 0.2256198347107438,
"grad_norm": 2.662670373916626,
"learning_rate": 9.183272529712324e-06,
"loss": 0.3177,
"step": 273
},
{
"epoch": 0.22644628099173553,
"grad_norm": 5.033318519592285,
"learning_rate": 9.175768879456759e-06,
"loss": 0.2818,
"step": 274
},
{
"epoch": 0.22727272727272727,
"grad_norm": 4.880688667297363,
"learning_rate": 9.168234011862899e-06,
"loss": 0.3027,
"step": 275
},
{
"epoch": 0.228099173553719,
"grad_norm": 4.188443660736084,
"learning_rate": 9.160667983260133e-06,
"loss": 0.2788,
"step": 276
},
{
"epoch": 0.22892561983471074,
"grad_norm": 4.407473564147949,
"learning_rate": 9.153070850210803e-06,
"loss": 0.3406,
"step": 277
},
{
"epoch": 0.22975206611570248,
"grad_norm": 4.778233051300049,
"learning_rate": 9.145442669509787e-06,
"loss": 0.3387,
"step": 278
},
{
"epoch": 0.23057851239669422,
"grad_norm": 5.0951948165893555,
"learning_rate": 9.137783498184065e-06,
"loss": 0.265,
"step": 279
},
{
"epoch": 0.23140495867768596,
"grad_norm": 4.966414928436279,
"learning_rate": 9.130093393492302e-06,
"loss": 0.328,
"step": 280
},
{
"epoch": 0.2322314049586777,
"grad_norm": 2.2104508876800537,
"learning_rate": 9.122372412924409e-06,
"loss": 0.1631,
"step": 281
},
{
"epoch": 0.23305785123966943,
"grad_norm": 1.512752652168274,
"learning_rate": 9.11462061420113e-06,
"loss": 0.0115,
"step": 282
},
{
"epoch": 0.23388429752066114,
"grad_norm": 0.32136648893356323,
"learning_rate": 9.106838055273589e-06,
"loss": 0.0283,
"step": 283
},
{
"epoch": 0.23471074380165288,
"grad_norm": 2.3555853366851807,
"learning_rate": 9.099024794322874e-06,
"loss": 0.0563,
"step": 284
},
{
"epoch": 0.23553719008264462,
"grad_norm": 0.3243228495121002,
"learning_rate": 9.091180889759602e-06,
"loss": 0.0285,
"step": 285
},
{
"epoch": 0.23636363636363636,
"grad_norm": 0.7489374876022339,
"learning_rate": 9.083306400223465e-06,
"loss": 0.0369,
"step": 286
},
{
"epoch": 0.2371900826446281,
"grad_norm": 1.0486401319503784,
"learning_rate": 9.07540138458281e-06,
"loss": 0.021,
"step": 287
},
{
"epoch": 0.23801652892561984,
"grad_norm": 2.7049810886383057,
"learning_rate": 9.067465901934187e-06,
"loss": 0.0607,
"step": 288
},
{
"epoch": 0.23884297520661157,
"grad_norm": 0.3670910596847534,
"learning_rate": 9.059500011601919e-06,
"loss": 0.0366,
"step": 289
},
{
"epoch": 0.2396694214876033,
"grad_norm": 0.8055985569953918,
"learning_rate": 9.051503773137647e-06,
"loss": 0.03,
"step": 290
},
{
"epoch": 0.24049586776859505,
"grad_norm": 1.4885363578796387,
"learning_rate": 9.043477246319888e-06,
"loss": 0.0497,
"step": 291
},
{
"epoch": 0.2413223140495868,
"grad_norm": 1.1092013120651245,
"learning_rate": 9.035420491153596e-06,
"loss": 0.03,
"step": 292
},
{
"epoch": 0.2421487603305785,
"grad_norm": 2.9742510318756104,
"learning_rate": 9.0273335678697e-06,
"loss": 0.3492,
"step": 293
},
{
"epoch": 0.24297520661157024,
"grad_norm": 5.270508766174316,
"learning_rate": 9.019216536924667e-06,
"loss": 0.4273,
"step": 294
},
{
"epoch": 0.24380165289256198,
"grad_norm": 3.1606504917144775,
"learning_rate": 9.011069459000035e-06,
"loss": 0.4114,
"step": 295
},
{
"epoch": 0.24462809917355371,
"grad_norm": 1.8900291919708252,
"learning_rate": 9.002892395001978e-06,
"loss": 0.3572,
"step": 296
},
{
"epoch": 0.24545454545454545,
"grad_norm": 2.238032817840576,
"learning_rate": 8.994685406060837e-06,
"loss": 0.3574,
"step": 297
},
{
"epoch": 0.2462809917355372,
"grad_norm": 92.95574951171875,
"learning_rate": 8.986448553530665e-06,
"loss": 0.3824,
"step": 298
},
{
"epoch": 0.24710743801652893,
"grad_norm": 5.979159355163574,
"learning_rate": 8.978181898988769e-06,
"loss": 0.3624,
"step": 299
},
{
"epoch": 0.24793388429752067,
"grad_norm": 6.447382926940918,
"learning_rate": 8.969885504235257e-06,
"loss": 0.2805,
"step": 300
},
{
"epoch": 0.2487603305785124,
"grad_norm": 3.386467456817627,
"learning_rate": 8.961559431292562e-06,
"loss": 0.3139,
"step": 301
},
{
"epoch": 0.24958677685950414,
"grad_norm": 3.9054369926452637,
"learning_rate": 8.953203742404992e-06,
"loss": 0.295,
"step": 302
},
{
"epoch": 0.25041322314049586,
"grad_norm": 4.136813163757324,
"learning_rate": 8.944818500038257e-06,
"loss": 0.3177,
"step": 303
},
{
"epoch": 0.2512396694214876,
"grad_norm": 4.501168251037598,
"learning_rate": 8.936403766879003e-06,
"loss": 0.3008,
"step": 304
},
{
"epoch": 0.25206611570247933,
"grad_norm": 3.8915162086486816,
"learning_rate": 8.927959605834347e-06,
"loss": 0.3187,
"step": 305
},
{
"epoch": 0.2528925619834711,
"grad_norm": 1.2882620096206665,
"learning_rate": 8.919486080031396e-06,
"loss": 0.0303,
"step": 306
},
{
"epoch": 0.2537190082644628,
"grad_norm": 0.6544731855392456,
"learning_rate": 8.910983252816794e-06,
"loss": 0.042,
"step": 307
},
{
"epoch": 0.2545454545454545,
"grad_norm": 3.0191237926483154,
"learning_rate": 8.902451187756228e-06,
"loss": 0.0595,
"step": 308
},
{
"epoch": 0.2553719008264463,
"grad_norm": 0.4694916009902954,
"learning_rate": 8.893889948633968e-06,
"loss": 0.0364,
"step": 309
},
{
"epoch": 0.256198347107438,
"grad_norm": 2.781498670578003,
"learning_rate": 8.885299599452381e-06,
"loss": 0.0192,
"step": 310
},
{
"epoch": 0.25702479338842976,
"grad_norm": 2.27006459236145,
"learning_rate": 8.87668020443146e-06,
"loss": 0.055,
"step": 311
},
{
"epoch": 0.2578512396694215,
"grad_norm": 1.341349482536316,
"learning_rate": 8.868031828008335e-06,
"loss": 0.0481,
"step": 312
},
{
"epoch": 0.25867768595041324,
"grad_norm": 0.654566764831543,
"learning_rate": 8.859354534836797e-06,
"loss": 0.0345,
"step": 313
},
{
"epoch": 0.25950413223140495,
"grad_norm": 0.23261475563049316,
"learning_rate": 8.850648389786816e-06,
"loss": 0.0412,
"step": 314
},
{
"epoch": 0.2603305785123967,
"grad_norm": 0.9180927872657776,
"learning_rate": 8.841913457944053e-06,
"loss": 0.0364,
"step": 315
},
{
"epoch": 0.2611570247933884,
"grad_norm": 1.1271486282348633,
"learning_rate": 8.833149804609372e-06,
"loss": 0.1013,
"step": 316
},
{
"epoch": 0.2619834710743802,
"grad_norm": 2.27287220954895,
"learning_rate": 8.824357495298357e-06,
"loss": 0.382,
"step": 317
},
{
"epoch": 0.2628099173553719,
"grad_norm": 1.9736340045928955,
"learning_rate": 8.815536595740817e-06,
"loss": 0.3783,
"step": 318
},
{
"epoch": 0.2636363636363636,
"grad_norm": 1.994823932647705,
"learning_rate": 8.806687171880298e-06,
"loss": 0.3282,
"step": 319
},
{
"epoch": 0.2644628099173554,
"grad_norm": 2.0439600944519043,
"learning_rate": 8.797809289873587e-06,
"loss": 0.3704,
"step": 320
},
{
"epoch": 0.2652892561983471,
"grad_norm": 2.0953407287597656,
"learning_rate": 8.788903016090222e-06,
"loss": 0.3623,
"step": 321
},
{
"epoch": 0.26611570247933886,
"grad_norm": 2.216618061065674,
"learning_rate": 8.779968417111991e-06,
"loss": 0.3766,
"step": 322
},
{
"epoch": 0.26694214876033057,
"grad_norm": 3.6057472229003906,
"learning_rate": 8.77100555973244e-06,
"loss": 0.2765,
"step": 323
},
{
"epoch": 0.26776859504132233,
"grad_norm": 5.107512474060059,
"learning_rate": 8.762014510956364e-06,
"loss": 0.1966,
"step": 324
},
{
"epoch": 0.26859504132231404,
"grad_norm": 4.891761302947998,
"learning_rate": 8.752995337999316e-06,
"loss": 0.2237,
"step": 325
},
{
"epoch": 0.2694214876033058,
"grad_norm": 3.0789847373962402,
"learning_rate": 8.7439481082871e-06,
"loss": 0.2562,
"step": 326
},
{
"epoch": 0.2702479338842975,
"grad_norm": 3.5382838249206543,
"learning_rate": 8.734872889455268e-06,
"loss": 0.3514,
"step": 327
},
{
"epoch": 0.27107438016528923,
"grad_norm": 3.313929319381714,
"learning_rate": 8.725769749348612e-06,
"loss": 0.2989,
"step": 328
},
{
"epoch": 0.271900826446281,
"grad_norm": 3.3161957263946533,
"learning_rate": 8.716638756020661e-06,
"loss": 0.2757,
"step": 329
},
{
"epoch": 0.2727272727272727,
"grad_norm": 2.9451427459716797,
"learning_rate": 8.70747997773317e-06,
"loss": 0.2653,
"step": 330
},
{
"epoch": 0.2735537190082645,
"grad_norm": 1.3499524593353271,
"learning_rate": 8.698293482955605e-06,
"loss": 0.1402,
"step": 331
},
{
"epoch": 0.2743801652892562,
"grad_norm": 0.2829076647758484,
"learning_rate": 8.689079340364644e-06,
"loss": 0.0275,
"step": 332
},
{
"epoch": 0.27520661157024795,
"grad_norm": 1.4667352437973022,
"learning_rate": 8.679837618843646e-06,
"loss": 0.0432,
"step": 333
},
{
"epoch": 0.27603305785123966,
"grad_norm": 0.31224325299263,
"learning_rate": 8.670568387482153e-06,
"loss": 0.0289,
"step": 334
},
{
"epoch": 0.2768595041322314,
"grad_norm": 0.32508960366249084,
"learning_rate": 8.661271715575364e-06,
"loss": 0.0285,
"step": 335
},
{
"epoch": 0.27768595041322314,
"grad_norm": 1.3324801921844482,
"learning_rate": 8.651947672623613e-06,
"loss": 0.0428,
"step": 336
},
{
"epoch": 0.2785123966942149,
"grad_norm": 2.0843026638031006,
"learning_rate": 8.642596328331864e-06,
"loss": 0.0513,
"step": 337
},
{
"epoch": 0.2793388429752066,
"grad_norm": 2.3382153511047363,
"learning_rate": 8.633217752609177e-06,
"loss": 0.0531,
"step": 338
},
{
"epoch": 0.2801652892561983,
"grad_norm": 1.6505120992660522,
"learning_rate": 8.62381201556819e-06,
"loss": 0.0308,
"step": 339
},
{
"epoch": 0.2809917355371901,
"grad_norm": 0.4468664824962616,
"learning_rate": 8.614379187524593e-06,
"loss": 0.0395,
"step": 340
},
{
"epoch": 0.2818181818181818,
"grad_norm": 0.6637183427810669,
"learning_rate": 8.604919338996604e-06,
"loss": 0.0419,
"step": 341
},
{
"epoch": 0.28264462809917357,
"grad_norm": 1.43661367893219,
"learning_rate": 8.595432540704446e-06,
"loss": 0.0762,
"step": 342
},
{
"epoch": 0.2834710743801653,
"grad_norm": 6.637901306152344,
"learning_rate": 8.585918863569806e-06,
"loss": 0.4814,
"step": 343
},
{
"epoch": 0.28429752066115704,
"grad_norm": 3.622053861618042,
"learning_rate": 8.576378378715322e-06,
"loss": 0.4324,
"step": 344
},
{
"epoch": 0.28512396694214875,
"grad_norm": 3.3516077995300293,
"learning_rate": 8.566811157464032e-06,
"loss": 0.3595,
"step": 345
},
{
"epoch": 0.2859504132231405,
"grad_norm": 2.02554988861084,
"learning_rate": 8.55721727133886e-06,
"loss": 0.358,
"step": 346
},
{
"epoch": 0.28677685950413223,
"grad_norm": 1.4861034154891968,
"learning_rate": 8.547596792062064e-06,
"loss": 0.3432,
"step": 347
},
{
"epoch": 0.28760330578512394,
"grad_norm": 2.7921857833862305,
"learning_rate": 8.537949791554714e-06,
"loss": 0.348,
"step": 348
},
{
"epoch": 0.2884297520661157,
"grad_norm": 2.5079729557037354,
"learning_rate": 8.528276341936146e-06,
"loss": 0.1366,
"step": 349
},
{
"epoch": 0.2892561983471074,
"grad_norm": 2.7155003547668457,
"learning_rate": 8.518576515523423e-06,
"loss": 0.0867,
"step": 350
},
{
"epoch": 0.2900826446280992,
"grad_norm": 3.2388157844543457,
"learning_rate": 8.5088503848308e-06,
"loss": 0.3211,
"step": 351
},
{
"epoch": 0.2909090909090909,
"grad_norm": 3.136592388153076,
"learning_rate": 8.499098022569177e-06,
"loss": 0.3195,
"step": 352
},
{
"epoch": 0.29173553719008266,
"grad_norm": 2.751568555831909,
"learning_rate": 8.489319501645555e-06,
"loss": 0.3294,
"step": 353
},
{
"epoch": 0.29256198347107437,
"grad_norm": 2.539821147918701,
"learning_rate": 8.479514895162495e-06,
"loss": 0.3024,
"step": 354
},
{
"epoch": 0.29338842975206614,
"grad_norm": 2.4179089069366455,
"learning_rate": 8.469684276417568e-06,
"loss": 0.2638,
"step": 355
},
{
"epoch": 0.29421487603305785,
"grad_norm": 1.5124740600585938,
"learning_rate": 8.459827718902809e-06,
"loss": 0.1416,
"step": 356
},
{
"epoch": 0.2950413223140496,
"grad_norm": 0.4755104184150696,
"learning_rate": 8.449945296304168e-06,
"loss": 0.0341,
"step": 357
},
{
"epoch": 0.2958677685950413,
"grad_norm": 1.8088937997817993,
"learning_rate": 8.440037082500953e-06,
"loss": 0.0441,
"step": 358
},
{
"epoch": 0.29669421487603304,
"grad_norm": 0.7109810709953308,
"learning_rate": 8.430103151565288e-06,
"loss": 0.0354,
"step": 359
},
{
"epoch": 0.2975206611570248,
"grad_norm": 1.2697234153747559,
"learning_rate": 8.420143577761551e-06,
"loss": 0.0484,
"step": 360
},
{
"epoch": 0.2983471074380165,
"grad_norm": 3.0426182746887207,
"learning_rate": 8.410158435545825e-06,
"loss": 0.0266,
"step": 361
},
{
"epoch": 0.2991735537190083,
"grad_norm": 1.5594433546066284,
"learning_rate": 8.400147799565334e-06,
"loss": 0.0401,
"step": 362
},
{
"epoch": 0.3,
"grad_norm": 0.2366088181734085,
"learning_rate": 8.390111744657892e-06,
"loss": 0.0389,
"step": 363
},
{
"epoch": 0.30082644628099175,
"grad_norm": 2.7218143939971924,
"learning_rate": 8.380050345851338e-06,
"loss": 0.0249,
"step": 364
},
{
"epoch": 0.30165289256198347,
"grad_norm": 1.1584035158157349,
"learning_rate": 8.369963678362978e-06,
"loss": 0.0298,
"step": 365
},
{
"epoch": 0.30247933884297523,
"grad_norm": 1.1925641298294067,
"learning_rate": 8.359851817599027e-06,
"loss": 0.0384,
"step": 366
},
{
"epoch": 0.30330578512396694,
"grad_norm": 12.5698823928833,
"learning_rate": 8.349714839154035e-06,
"loss": 0.697,
"step": 367
},
{
"epoch": 0.30413223140495865,
"grad_norm": 7.502256393432617,
"learning_rate": 8.33955281881033e-06,
"loss": 0.5348,
"step": 368
},
{
"epoch": 0.3049586776859504,
"grad_norm": 4.12613582611084,
"learning_rate": 8.329365832537448e-06,
"loss": 0.4117,
"step": 369
},
{
"epoch": 0.30578512396694213,
"grad_norm": 3.4762847423553467,
"learning_rate": 8.319153956491567e-06,
"loss": 0.3757,
"step": 370
},
{
"epoch": 0.3066115702479339,
"grad_norm": 2.2697415351867676,
"learning_rate": 8.30891726701494e-06,
"loss": 0.3732,
"step": 371
},
{
"epoch": 0.3074380165289256,
"grad_norm": 1.9594049453735352,
"learning_rate": 8.298655840635312e-06,
"loss": 0.3448,
"step": 372
},
{
"epoch": 0.3082644628099174,
"grad_norm": 3.9964828491210938,
"learning_rate": 8.288369754065362e-06,
"loss": 0.2701,
"step": 373
},
{
"epoch": 0.3090909090909091,
"grad_norm": 13.04822826385498,
"learning_rate": 8.27805908420213e-06,
"loss": 0.2767,
"step": 374
},
{
"epoch": 0.30991735537190085,
"grad_norm": 3.762925386428833,
"learning_rate": 8.267723908126429e-06,
"loss": 0.1487,
"step": 375
},
{
"epoch": 0.31074380165289256,
"grad_norm": 3.9099831581115723,
"learning_rate": 8.257364303102275e-06,
"loss": 0.3413,
"step": 376
},
{
"epoch": 0.31157024793388427,
"grad_norm": 3.8278796672821045,
"learning_rate": 8.246980346576318e-06,
"loss": 0.413,
"step": 377
},
{
"epoch": 0.31239669421487604,
"grad_norm": 3.016062021255493,
"learning_rate": 8.236572116177249e-06,
"loss": 0.3472,
"step": 378
},
{
"epoch": 0.31322314049586775,
"grad_norm": 2.7905495166778564,
"learning_rate": 8.226139689715233e-06,
"loss": 0.3401,
"step": 379
},
{
"epoch": 0.3140495867768595,
"grad_norm": 2.2101759910583496,
"learning_rate": 8.215683145181312e-06,
"loss": 0.3386,
"step": 380
},
{
"epoch": 0.3148760330578512,
"grad_norm": 2.5770349502563477,
"learning_rate": 8.205202560746839e-06,
"loss": 0.2139,
"step": 381
},
{
"epoch": 0.315702479338843,
"grad_norm": 4.9263176918029785,
"learning_rate": 8.19469801476288e-06,
"loss": 0.0844,
"step": 382
},
{
"epoch": 0.3165289256198347,
"grad_norm": 4.758224964141846,
"learning_rate": 8.184169585759637e-06,
"loss": 0.0798,
"step": 383
},
{
"epoch": 0.31735537190082647,
"grad_norm": 2.611168384552002,
"learning_rate": 8.173617352445853e-06,
"loss": 0.0515,
"step": 384
},
{
"epoch": 0.3181818181818182,
"grad_norm": 0.4771171510219574,
"learning_rate": 8.16304139370823e-06,
"loss": 0.0329,
"step": 385
},
{
"epoch": 0.31900826446280994,
"grad_norm": 0.5978302955627441,
"learning_rate": 8.152441788610843e-06,
"loss": 0.0378,
"step": 386
},
{
"epoch": 0.31983471074380165,
"grad_norm": 1.672168493270874,
"learning_rate": 8.14181861639453e-06,
"loss": 0.0294,
"step": 387
},
{
"epoch": 0.32066115702479336,
"grad_norm": 1.892396092414856,
"learning_rate": 8.131171956476328e-06,
"loss": 0.0346,
"step": 388
},
{
"epoch": 0.32148760330578513,
"grad_norm": 1.5014523267745972,
"learning_rate": 8.120501888448853e-06,
"loss": 0.0335,
"step": 389
},
{
"epoch": 0.32231404958677684,
"grad_norm": 0.538657009601593,
"learning_rate": 8.109808492079718e-06,
"loss": 0.0443,
"step": 390
},
{
"epoch": 0.3231404958677686,
"grad_norm": 1.1847418546676636,
"learning_rate": 8.09909184731094e-06,
"loss": 0.0446,
"step": 391
},
{
"epoch": 0.3239669421487603,
"grad_norm": 1.2937194108963013,
"learning_rate": 8.088352034258331e-06,
"loss": 0.0664,
"step": 392
},
{
"epoch": 0.3247933884297521,
"grad_norm": 3.5774083137512207,
"learning_rate": 8.07758913321091e-06,
"loss": 0.4166,
"step": 393
},
{
"epoch": 0.3256198347107438,
"grad_norm": 2.8276569843292236,
"learning_rate": 8.066803224630295e-06,
"loss": 0.3945,
"step": 394
},
{
"epoch": 0.32644628099173556,
"grad_norm": 2.874321460723877,
"learning_rate": 8.055994389150103e-06,
"loss": 0.3916,
"step": 395
},
{
"epoch": 0.32727272727272727,
"grad_norm": 2.9290878772735596,
"learning_rate": 8.045162707575354e-06,
"loss": 0.3915,
"step": 396
},
{
"epoch": 0.328099173553719,
"grad_norm": 2.229264736175537,
"learning_rate": 8.034308260881854e-06,
"loss": 0.376,
"step": 397
},
{
"epoch": 0.32892561983471075,
"grad_norm": 2.684750556945801,
"learning_rate": 8.023431130215605e-06,
"loss": 0.2773,
"step": 398
},
{
"epoch": 0.32975206611570246,
"grad_norm": 4.131373882293701,
"learning_rate": 8.012531396892185e-06,
"loss": 0.1928,
"step": 399
},
{
"epoch": 0.3305785123966942,
"grad_norm": 4.172529220581055,
"learning_rate": 8.00160914239615e-06,
"loss": 0.1928,
"step": 400
},
{
"epoch": 0.33140495867768593,
"grad_norm": 3.6504626274108887,
"learning_rate": 7.990664448380412e-06,
"loss": 0.3092,
"step": 401
},
{
"epoch": 0.3322314049586777,
"grad_norm": 3.3628780841827393,
"learning_rate": 7.979697396665649e-06,
"loss": 0.3061,
"step": 402
},
{
"epoch": 0.3330578512396694,
"grad_norm": 2.5712125301361084,
"learning_rate": 7.968708069239672e-06,
"loss": 0.3091,
"step": 403
},
{
"epoch": 0.3338842975206612,
"grad_norm": 2.4374077320098877,
"learning_rate": 7.957696548256828e-06,
"loss": 0.3066,
"step": 404
},
{
"epoch": 0.3347107438016529,
"grad_norm": 1.896844506263733,
"learning_rate": 7.946662916037373e-06,
"loss": 0.2612,
"step": 405
},
{
"epoch": 0.33553719008264465,
"grad_norm": 2.1051249504089355,
"learning_rate": 7.935607255066867e-06,
"loss": 0.2337,
"step": 406
},
{
"epoch": 0.33636363636363636,
"grad_norm": 2.1041512489318848,
"learning_rate": 7.924529647995549e-06,
"loss": 0.0331,
"step": 407
},
{
"epoch": 0.3371900826446281,
"grad_norm": 0.5802683234214783,
"learning_rate": 7.91343017763773e-06,
"loss": 0.0364,
"step": 408
},
{
"epoch": 0.33801652892561984,
"grad_norm": 1.2897307872772217,
"learning_rate": 7.902308926971166e-06,
"loss": 0.0326,
"step": 409
},
{
"epoch": 0.33884297520661155,
"grad_norm": 0.9003844857215881,
"learning_rate": 7.891165979136429e-06,
"loss": 0.0304,
"step": 410
},
{
"epoch": 0.3396694214876033,
"grad_norm": 1.9420896768569946,
"learning_rate": 7.880001417436309e-06,
"loss": 0.0398,
"step": 411
},
{
"epoch": 0.34049586776859503,
"grad_norm": 0.36469870805740356,
"learning_rate": 7.868815325335168e-06,
"loss": 0.0266,
"step": 412
},
{
"epoch": 0.3413223140495868,
"grad_norm": 2.06129789352417,
"learning_rate": 7.857607786458333e-06,
"loss": 0.0349,
"step": 413
},
{
"epoch": 0.3421487603305785,
"grad_norm": 0.9527754187583923,
"learning_rate": 7.846378884591453e-06,
"loss": 0.0349,
"step": 414
},
{
"epoch": 0.34297520661157027,
"grad_norm": 0.9972708225250244,
"learning_rate": 7.835128703679896e-06,
"loss": 0.0303,
"step": 415
},
{
"epoch": 0.343801652892562,
"grad_norm": 0.874220609664917,
"learning_rate": 7.823857327828099e-06,
"loss": 0.0369,
"step": 416
},
{
"epoch": 0.3446280991735537,
"grad_norm": 1.980294942855835,
"learning_rate": 7.812564841298951e-06,
"loss": 0.2549,
"step": 417
},
{
"epoch": 0.34545454545454546,
"grad_norm": 2.404195785522461,
"learning_rate": 7.801251328513164e-06,
"loss": 0.356,
"step": 418
},
{
"epoch": 0.34628099173553717,
"grad_norm": 1.9718644618988037,
"learning_rate": 7.789916874048635e-06,
"loss": 0.3155,
"step": 419
},
{
"epoch": 0.34710743801652894,
"grad_norm": 1.9457886219024658,
"learning_rate": 7.778561562639818e-06,
"loss": 0.3523,
"step": 420
},
{
"epoch": 0.34793388429752065,
"grad_norm": 1.6577038764953613,
"learning_rate": 7.767185479177092e-06,
"loss": 0.3394,
"step": 421
},
{
"epoch": 0.3487603305785124,
"grad_norm": 2.429166316986084,
"learning_rate": 7.755788708706124e-06,
"loss": 0.3466,
"step": 422
},
{
"epoch": 0.3495867768595041,
"grad_norm": 2.4249932765960693,
"learning_rate": 7.744371336427232e-06,
"loss": 0.3099,
"step": 423
},
{
"epoch": 0.3504132231404959,
"grad_norm": 4.010801792144775,
"learning_rate": 7.732933447694748e-06,
"loss": 0.1718,
"step": 424
},
{
"epoch": 0.3512396694214876,
"grad_norm": 4.168816089630127,
"learning_rate": 7.721475128016388e-06,
"loss": 0.1702,
"step": 425
},
{
"epoch": 0.35206611570247937,
"grad_norm": 4.439835548400879,
"learning_rate": 7.709996463052595e-06,
"loss": 0.3233,
"step": 426
},
{
"epoch": 0.3528925619834711,
"grad_norm": 2.153748035430908,
"learning_rate": 7.698497538615928e-06,
"loss": 0.2434,
"step": 427
},
{
"epoch": 0.3537190082644628,
"grad_norm": 2.1145215034484863,
"learning_rate": 7.68697844067038e-06,
"loss": 0.2965,
"step": 428
},
{
"epoch": 0.35454545454545455,
"grad_norm": 2.3973615169525146,
"learning_rate": 7.675439255330778e-06,
"loss": 0.3022,
"step": 429
},
{
"epoch": 0.35537190082644626,
"grad_norm": 1.741687297821045,
"learning_rate": 7.663880068862106e-06,
"loss": 0.268,
"step": 430
},
{
"epoch": 0.35619834710743803,
"grad_norm": 1.714586615562439,
"learning_rate": 7.652300967678873e-06,
"loss": 0.2504,
"step": 431
},
{
"epoch": 0.35702479338842974,
"grad_norm": 4.339986324310303,
"learning_rate": 7.64070203834448e-06,
"loss": 0.0494,
"step": 432
},
{
"epoch": 0.3578512396694215,
"grad_norm": 1.6370694637298584,
"learning_rate": 7.629083367570547e-06,
"loss": 0.0485,
"step": 433
},
{
"epoch": 0.3586776859504132,
"grad_norm": 1.0893810987472534,
"learning_rate": 7.617445042216278e-06,
"loss": 0.0538,
"step": 434
},
{
"epoch": 0.359504132231405,
"grad_norm": 0.8672933578491211,
"learning_rate": 7.605787149287819e-06,
"loss": 0.0374,
"step": 435
},
{
"epoch": 0.3603305785123967,
"grad_norm": 1.5140106678009033,
"learning_rate": 7.594109775937595e-06,
"loss": 0.0381,
"step": 436
},
{
"epoch": 0.3611570247933884,
"grad_norm": 0.7423667311668396,
"learning_rate": 7.582413009463664e-06,
"loss": 0.0254,
"step": 437
},
{
"epoch": 0.36198347107438017,
"grad_norm": 1.1572518348693848,
"learning_rate": 7.570696937309063e-06,
"loss": 0.0403,
"step": 438
},
{
"epoch": 0.3628099173553719,
"grad_norm": 0.2780013382434845,
"learning_rate": 7.558961647061156e-06,
"loss": 0.028,
"step": 439
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.9868566393852234,
"learning_rate": 7.54720722645098e-06,
"loss": 0.0397,
"step": 440
},
{
"epoch": 0.36446280991735536,
"grad_norm": 0.16371887922286987,
"learning_rate": 7.535433763352582e-06,
"loss": 0.0299,
"step": 441
},
{
"epoch": 0.3652892561983471,
"grad_norm": 0.5445317029953003,
"learning_rate": 7.5236413457823745e-06,
"loss": 0.0297,
"step": 442
},
{
"epoch": 0.36611570247933883,
"grad_norm": 1.8789058923721313,
"learning_rate": 7.5118300618984626e-06,
"loss": 0.1932,
"step": 443
},
{
"epoch": 0.3669421487603306,
"grad_norm": 3.1089491844177246,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3663,
"step": 444
},
{
"epoch": 0.3677685950413223,
"grad_norm": 2.2975223064422607,
"learning_rate": 7.488151248526518e-06,
"loss": 0.3487,
"step": 445
},
{
"epoch": 0.3685950413223141,
"grad_norm": 1.9807140827178955,
"learning_rate": 7.476283896057267e-06,
"loss": 0.3603,
"step": 446
},
{
"epoch": 0.3694214876033058,
"grad_norm": 2.565408945083618,
"learning_rate": 7.464398031310557e-06,
"loss": 0.3417,
"step": 447
},
{
"epoch": 0.3702479338842975,
"grad_norm": 1.854609727859497,
"learning_rate": 7.452493743143092e-06,
"loss": 0.3216,
"step": 448
},
{
"epoch": 0.37107438016528926,
"grad_norm": 3.2579827308654785,
"learning_rate": 7.440571120549309e-06,
"loss": 0.2039,
"step": 449
},
{
"epoch": 0.371900826446281,
"grad_norm": 3.3291921615600586,
"learning_rate": 7.428630252660705e-06,
"loss": 0.1672,
"step": 450
},
{
"epoch": 0.37272727272727274,
"grad_norm": 3.157149314880371,
"learning_rate": 7.416671228745181e-06,
"loss": 0.2773,
"step": 451
},
{
"epoch": 0.37355371900826445,
"grad_norm": 3.1166951656341553,
"learning_rate": 7.404694138206365e-06,
"loss": 0.3119,
"step": 452
},
{
"epoch": 0.3743801652892562,
"grad_norm": 2.574794292449951,
"learning_rate": 7.392699070582951e-06,
"loss": 0.2579,
"step": 453
},
{
"epoch": 0.3752066115702479,
"grad_norm": 2.3687548637390137,
"learning_rate": 7.380686115548024e-06,
"loss": 0.2731,
"step": 454
},
{
"epoch": 0.3760330578512397,
"grad_norm": 2.1777918338775635,
"learning_rate": 7.368655362908394e-06,
"loss": 0.2474,
"step": 455
},
{
"epoch": 0.3768595041322314,
"grad_norm": 2.160825252532959,
"learning_rate": 7.356606902603924e-06,
"loss": 0.0972,
"step": 456
},
{
"epoch": 0.3776859504132231,
"grad_norm": 2.5094244480133057,
"learning_rate": 7.344540824706855e-06,
"loss": 0.0494,
"step": 457
},
{
"epoch": 0.3785123966942149,
"grad_norm": 2.2549567222595215,
"learning_rate": 7.332457219421132e-06,
"loss": 0.0236,
"step": 458
},
{
"epoch": 0.3793388429752066,
"grad_norm": 1.3993778228759766,
"learning_rate": 7.320356177081737e-06,
"loss": 0.0475,
"step": 459
},
{
"epoch": 0.38016528925619836,
"grad_norm": 0.7870197296142578,
"learning_rate": 7.3082377881540025e-06,
"loss": 0.0415,
"step": 460
},
{
"epoch": 0.38099173553719007,
"grad_norm": 2.217742681503296,
"learning_rate": 7.296102143232948e-06,
"loss": 0.0312,
"step": 461
},
{
"epoch": 0.38181818181818183,
"grad_norm": 2.3126964569091797,
"learning_rate": 7.283949333042586e-06,
"loss": 0.0421,
"step": 462
},
{
"epoch": 0.38264462809917354,
"grad_norm": 0.3075319826602936,
"learning_rate": 7.271779448435265e-06,
"loss": 0.0286,
"step": 463
},
{
"epoch": 0.3834710743801653,
"grad_norm": 0.5313308835029602,
"learning_rate": 7.259592580390973e-06,
"loss": 0.0358,
"step": 464
},
{
"epoch": 0.384297520661157,
"grad_norm": 1.1876845359802246,
"learning_rate": 7.247388820016662e-06,
"loss": 0.0233,
"step": 465
},
{
"epoch": 0.3851239669421488,
"grad_norm": 1.1241205930709839,
"learning_rate": 7.235168258545569e-06,
"loss": 0.022,
"step": 466
},
{
"epoch": 0.3859504132231405,
"grad_norm": 2.0038743019104004,
"learning_rate": 7.222930987336537e-06,
"loss": 0.3091,
"step": 467
},
{
"epoch": 0.3867768595041322,
"grad_norm": 2.349743366241455,
"learning_rate": 7.2106770978733245e-06,
"loss": 0.3662,
"step": 468
},
{
"epoch": 0.387603305785124,
"grad_norm": 2.8394381999969482,
"learning_rate": 7.198406681763925e-06,
"loss": 0.3628,
"step": 469
},
{
"epoch": 0.3884297520661157,
"grad_norm": 1.7794013023376465,
"learning_rate": 7.186119830739883e-06,
"loss": 0.3424,
"step": 470
},
{
"epoch": 0.38925619834710745,
"grad_norm": 2.329651117324829,
"learning_rate": 7.173816636655611e-06,
"loss": 0.3216,
"step": 471
},
{
"epoch": 0.39008264462809916,
"grad_norm": 1.83986496925354,
"learning_rate": 7.161497191487693e-06,
"loss": 0.3321,
"step": 472
},
{
"epoch": 0.39090909090909093,
"grad_norm": 5.495357513427734,
"learning_rate": 7.149161587334209e-06,
"loss": 0.2219,
"step": 473
},
{
"epoch": 0.39173553719008264,
"grad_norm": 4.695106506347656,
"learning_rate": 7.136809916414039e-06,
"loss": 0.1977,
"step": 474
},
{
"epoch": 0.3925619834710744,
"grad_norm": 3.4635348320007324,
"learning_rate": 7.124442271066174e-06,
"loss": 0.1271,
"step": 475
},
{
"epoch": 0.3933884297520661,
"grad_norm": 2.077944278717041,
"learning_rate": 7.112058743749029e-06,
"loss": 0.2443,
"step": 476
},
{
"epoch": 0.3942148760330578,
"grad_norm": 2.1625986099243164,
"learning_rate": 7.099659427039748e-06,
"loss": 0.2568,
"step": 477
},
{
"epoch": 0.3950413223140496,
"grad_norm": 1.8367973566055298,
"learning_rate": 7.087244413633516e-06,
"loss": 0.2249,
"step": 478
},
{
"epoch": 0.3958677685950413,
"grad_norm": 1.918242335319519,
"learning_rate": 7.074813796342862e-06,
"loss": 0.2373,
"step": 479
},
{
"epoch": 0.39669421487603307,
"grad_norm": 1.9061918258666992,
"learning_rate": 7.062367668096968e-06,
"loss": 0.2482,
"step": 480
},
{
"epoch": 0.3975206611570248,
"grad_norm": 2.2743890285491943,
"learning_rate": 7.049906121940974e-06,
"loss": 0.0971,
"step": 481
},
{
"epoch": 0.39834710743801655,
"grad_norm": 3.9464023113250732,
"learning_rate": 7.037429251035279e-06,
"loss": 0.0721,
"step": 482
},
{
"epoch": 0.39917355371900826,
"grad_norm": 2.1820228099823,
"learning_rate": 7.024937148654851e-06,
"loss": 0.0372,
"step": 483
},
{
"epoch": 0.4,
"grad_norm": 1.9354735612869263,
"learning_rate": 7.012429908188523e-06,
"loss": 0.0299,
"step": 484
},
{
"epoch": 0.40082644628099173,
"grad_norm": 0.6434481739997864,
"learning_rate": 6.999907623138296e-06,
"loss": 0.024,
"step": 485
},
{
"epoch": 0.40165289256198344,
"grad_norm": 0.2833828032016754,
"learning_rate": 6.987370387118649e-06,
"loss": 0.0389,
"step": 486
},
{
"epoch": 0.4024793388429752,
"grad_norm": 0.3582015931606293,
"learning_rate": 6.9748182938558225e-06,
"loss": 0.0277,
"step": 487
},
{
"epoch": 0.4033057851239669,
"grad_norm": 1.452950119972229,
"learning_rate": 6.962251437187136e-06,
"loss": 0.0389,
"step": 488
},
{
"epoch": 0.4041322314049587,
"grad_norm": 0.47023990750312805,
"learning_rate": 6.94966991106027e-06,
"loss": 0.0217,
"step": 489
},
{
"epoch": 0.4049586776859504,
"grad_norm": 0.49028366804122925,
"learning_rate": 6.937073809532581e-06,
"loss": 0.034,
"step": 490
},
{
"epoch": 0.40578512396694216,
"grad_norm": 1.0381807088851929,
"learning_rate": 6.924463226770376e-06,
"loss": 0.0393,
"step": 491
},
{
"epoch": 0.4066115702479339,
"grad_norm": 3.151425838470459,
"learning_rate": 6.9118382570482316e-06,
"loss": 0.3636,
"step": 492
},
{
"epoch": 0.40743801652892564,
"grad_norm": 3.0906147956848145,
"learning_rate": 6.899198994748274e-06,
"loss": 0.3854,
"step": 493
},
{
"epoch": 0.40826446280991735,
"grad_norm": 3.547203779220581,
"learning_rate": 6.886545534359481e-06,
"loss": 0.3912,
"step": 494
},
{
"epoch": 0.4090909090909091,
"grad_norm": 2.5733325481414795,
"learning_rate": 6.873877970476971e-06,
"loss": 0.3503,
"step": 495
},
{
"epoch": 0.4099173553719008,
"grad_norm": 2.8231985569000244,
"learning_rate": 6.861196397801297e-06,
"loss": 0.3479,
"step": 496
},
{
"epoch": 0.41074380165289254,
"grad_norm": 1.7183728218078613,
"learning_rate": 6.848500911137741e-06,
"loss": 0.3072,
"step": 497
},
{
"epoch": 0.4115702479338843,
"grad_norm": 4.216372489929199,
"learning_rate": 6.835791605395606e-06,
"loss": 0.1683,
"step": 498
},
{
"epoch": 0.412396694214876,
"grad_norm": 3.251147985458374,
"learning_rate": 6.823068575587496e-06,
"loss": 0.1218,
"step": 499
},
{
"epoch": 0.4132231404958678,
"grad_norm": 3.297546863555908,
"learning_rate": 6.810331916828623e-06,
"loss": 0.1202,
"step": 500
},
{
"epoch": 0.4140495867768595,
"grad_norm": 1.731645107269287,
"learning_rate": 6.797581724336081e-06,
"loss": 0.2167,
"step": 501
},
{
"epoch": 0.41487603305785126,
"grad_norm": 1.6075807809829712,
"learning_rate": 6.784818093428144e-06,
"loss": 0.2393,
"step": 502
},
{
"epoch": 0.41570247933884297,
"grad_norm": 1.5941764116287231,
"learning_rate": 6.772041119523545e-06,
"loss": 0.2669,
"step": 503
},
{
"epoch": 0.41652892561983473,
"grad_norm": 1.527315616607666,
"learning_rate": 6.759250898140768e-06,
"loss": 0.2266,
"step": 504
},
{
"epoch": 0.41735537190082644,
"grad_norm": 0.7751933336257935,
"learning_rate": 6.746447524897335e-06,
"loss": 0.1128,
"step": 505
},
{
"epoch": 0.41818181818181815,
"grad_norm": 1.885209321975708,
"learning_rate": 6.733631095509088e-06,
"loss": 0.0168,
"step": 506
},
{
"epoch": 0.4190082644628099,
"grad_norm": 0.5803923606872559,
"learning_rate": 6.720801705789475e-06,
"loss": 0.0353,
"step": 507
},
{
"epoch": 0.41983471074380163,
"grad_norm": 1.829307198524475,
"learning_rate": 6.70795945164883e-06,
"loss": 0.0259,
"step": 508
},
{
"epoch": 0.4206611570247934,
"grad_norm": 0.3436581790447235,
"learning_rate": 6.695104429093665e-06,
"loss": 0.0191,
"step": 509
},
{
"epoch": 0.4214876033057851,
"grad_norm": 0.5622700452804565,
"learning_rate": 6.682236734225944e-06,
"loss": 0.0169,
"step": 510
},
{
"epoch": 0.4223140495867769,
"grad_norm": 1.0006561279296875,
"learning_rate": 6.6693564632423626e-06,
"loss": 0.0232,
"step": 511
},
{
"epoch": 0.4231404958677686,
"grad_norm": 1.1930370330810547,
"learning_rate": 6.65646371243364e-06,
"loss": 0.0313,
"step": 512
},
{
"epoch": 0.42396694214876035,
"grad_norm": 0.5112296938896179,
"learning_rate": 6.643558578183787e-06,
"loss": 0.0195,
"step": 513
},
{
"epoch": 0.42479338842975206,
"grad_norm": 0.444097638130188,
"learning_rate": 6.630641156969397e-06,
"loss": 0.0252,
"step": 514
},
{
"epoch": 0.4256198347107438,
"grad_norm": 0.6078004240989685,
"learning_rate": 6.617711545358913e-06,
"loss": 0.0214,
"step": 515
},
{
"epoch": 0.42644628099173554,
"grad_norm": 1.4289065599441528,
"learning_rate": 6.604769840011913e-06,
"loss": 0.166,
"step": 516
},
{
"epoch": 0.42727272727272725,
"grad_norm": 3.11472749710083,
"learning_rate": 6.591816137678388e-06,
"loss": 0.3878,
"step": 517
},
{
"epoch": 0.428099173553719,
"grad_norm": 2.7481777667999268,
"learning_rate": 6.578850535198015e-06,
"loss": 0.3846,
"step": 518
},
{
"epoch": 0.4289256198347107,
"grad_norm": 2.9360082149505615,
"learning_rate": 6.565873129499431e-06,
"loss": 0.3568,
"step": 519
},
{
"epoch": 0.4297520661157025,
"grad_norm": 2.3579049110412598,
"learning_rate": 6.552884017599517e-06,
"loss": 0.3653,
"step": 520
},
{
"epoch": 0.4305785123966942,
"grad_norm": 2.295041561126709,
"learning_rate": 6.539883296602663e-06,
"loss": 0.3499,
"step": 521
},
{
"epoch": 0.43140495867768597,
"grad_norm": 2.4445390701293945,
"learning_rate": 6.526871063700056e-06,
"loss": 0.3592,
"step": 522
},
{
"epoch": 0.4322314049586777,
"grad_norm": 3.2918050289154053,
"learning_rate": 6.513847416168929e-06,
"loss": 0.2361,
"step": 523
},
{
"epoch": 0.43305785123966944,
"grad_norm": 4.195345401763916,
"learning_rate": 6.500812451371862e-06,
"loss": 0.1436,
"step": 524
},
{
"epoch": 0.43388429752066116,
"grad_norm": 3.297032117843628,
"learning_rate": 6.487766266756034e-06,
"loss": 0.1245,
"step": 525
},
{
"epoch": 0.43471074380165287,
"grad_norm": 2.270808696746826,
"learning_rate": 6.474708959852504e-06,
"loss": 0.1972,
"step": 526
},
{
"epoch": 0.43553719008264463,
"grad_norm": 1.552821159362793,
"learning_rate": 6.461640628275479e-06,
"loss": 0.182,
"step": 527
},
{
"epoch": 0.43636363636363634,
"grad_norm": 1.368065357208252,
"learning_rate": 6.4485613697215835e-06,
"loss": 0.1916,
"step": 528
},
{
"epoch": 0.4371900826446281,
"grad_norm": 1.2759222984313965,
"learning_rate": 6.435471281969133e-06,
"loss": 0.1963,
"step": 529
},
{
"epoch": 0.4380165289256198,
"grad_norm": 1.387843132019043,
"learning_rate": 6.422370462877396e-06,
"loss": 0.1878,
"step": 530
},
{
"epoch": 0.4388429752066116,
"grad_norm": 1.299856185913086,
"learning_rate": 6.409259010385871e-06,
"loss": 0.2002,
"step": 531
},
{
"epoch": 0.4396694214876033,
"grad_norm": 2.0802321434020996,
"learning_rate": 6.396137022513545e-06,
"loss": 0.094,
"step": 532
},
{
"epoch": 0.44049586776859506,
"grad_norm": 1.1367005109786987,
"learning_rate": 6.383004597358173e-06,
"loss": 0.0304,
"step": 533
},
{
"epoch": 0.4413223140495868,
"grad_norm": 2.08708119392395,
"learning_rate": 6.369861833095531e-06,
"loss": 0.0341,
"step": 534
},
{
"epoch": 0.44214876033057854,
"grad_norm": 0.6207039952278137,
"learning_rate": 6.3567088279786885e-06,
"loss": 0.0213,
"step": 535
},
{
"epoch": 0.44297520661157025,
"grad_norm": 1.192789077758789,
"learning_rate": 6.343545680337278e-06,
"loss": 0.0459,
"step": 536
},
{
"epoch": 0.44380165289256196,
"grad_norm": 0.8067427277565002,
"learning_rate": 6.330372488576754e-06,
"loss": 0.0433,
"step": 537
},
{
"epoch": 0.4446280991735537,
"grad_norm": 1.9640798568725586,
"learning_rate": 6.317189351177657e-06,
"loss": 0.0272,
"step": 538
},
{
"epoch": 0.44545454545454544,
"grad_norm": 0.7341052889823914,
"learning_rate": 6.303996366694882e-06,
"loss": 0.0336,
"step": 539
},
{
"epoch": 0.4462809917355372,
"grad_norm": 2.3869376182556152,
"learning_rate": 6.29079363375694e-06,
"loss": 0.0281,
"step": 540
},
{
"epoch": 0.4471074380165289,
"grad_norm": 0.5138152837753296,
"learning_rate": 6.277581251065217e-06,
"loss": 0.0137,
"step": 541
},
{
"epoch": 0.4479338842975207,
"grad_norm": 0.961467444896698,
"learning_rate": 6.264359317393238e-06,
"loss": 0.0315,
"step": 542
},
{
"epoch": 0.4487603305785124,
"grad_norm": 3.061131000518799,
"learning_rate": 6.251127931585933e-06,
"loss": 0.326,
"step": 543
},
{
"epoch": 0.44958677685950416,
"grad_norm": 2.2784407138824463,
"learning_rate": 6.237887192558894e-06,
"loss": 0.3668,
"step": 544
},
{
"epoch": 0.45041322314049587,
"grad_norm": 3.0723907947540283,
"learning_rate": 6.224637199297633e-06,
"loss": 0.3496,
"step": 545
},
{
"epoch": 0.4512396694214876,
"grad_norm": 2.5650062561035156,
"learning_rate": 6.211378050856851e-06,
"loss": 0.3412,
"step": 546
},
{
"epoch": 0.45206611570247934,
"grad_norm": 2.559699058532715,
"learning_rate": 6.198109846359682e-06,
"loss": 0.3237,
"step": 547
},
{
"epoch": 0.45289256198347105,
"grad_norm": 2.000291347503662,
"learning_rate": 6.184832684996972e-06,
"loss": 0.1759,
"step": 548
},
{
"epoch": 0.4537190082644628,
"grad_norm": 3.068333625793457,
"learning_rate": 6.171546666026522e-06,
"loss": 0.0822,
"step": 549
},
{
"epoch": 0.45454545454545453,
"grad_norm": 2.272804021835327,
"learning_rate": 6.15825188877235e-06,
"loss": 0.0864,
"step": 550
},
{
"epoch": 0.4553719008264463,
"grad_norm": 1.6852322816848755,
"learning_rate": 6.14494845262395e-06,
"loss": 0.2012,
"step": 551
},
{
"epoch": 0.456198347107438,
"grad_norm": 1.5493943691253662,
"learning_rate": 6.13163645703555e-06,
"loss": 0.1797,
"step": 552
},
{
"epoch": 0.4570247933884298,
"grad_norm": 1.5561745166778564,
"learning_rate": 6.118316001525368e-06,
"loss": 0.2018,
"step": 553
},
{
"epoch": 0.4578512396694215,
"grad_norm": 1.6809978485107422,
"learning_rate": 6.104987185674863e-06,
"loss": 0.2751,
"step": 554
},
{
"epoch": 0.45867768595041325,
"grad_norm": 1.4814178943634033,
"learning_rate": 6.091650109127994e-06,
"loss": 0.2003,
"step": 555
},
{
"epoch": 0.45950413223140496,
"grad_norm": 1.6107033491134644,
"learning_rate": 6.078304871590485e-06,
"loss": 0.084,
"step": 556
},
{
"epoch": 0.46033057851239667,
"grad_norm": 2.6868271827697754,
"learning_rate": 6.064951572829057e-06,
"loss": 0.0216,
"step": 557
},
{
"epoch": 0.46115702479338844,
"grad_norm": 1.9302111864089966,
"learning_rate": 6.051590312670703e-06,
"loss": 0.0299,
"step": 558
},
{
"epoch": 0.46198347107438015,
"grad_norm": 1.8653727769851685,
"learning_rate": 6.038221191001935e-06,
"loss": 0.0292,
"step": 559
},
{
"epoch": 0.4628099173553719,
"grad_norm": 3.6712772846221924,
"learning_rate": 6.024844307768032e-06,
"loss": 0.053,
"step": 560
},
{
"epoch": 0.4636363636363636,
"grad_norm": 1.1558465957641602,
"learning_rate": 6.011459762972299e-06,
"loss": 0.0361,
"step": 561
},
{
"epoch": 0.4644628099173554,
"grad_norm": 1.1140830516815186,
"learning_rate": 5.998067656675318e-06,
"loss": 0.0198,
"step": 562
},
{
"epoch": 0.4652892561983471,
"grad_norm": 1.9024033546447754,
"learning_rate": 5.984668088994199e-06,
"loss": 0.0231,
"step": 563
},
{
"epoch": 0.46611570247933887,
"grad_norm": 1.3747286796569824,
"learning_rate": 5.9712611601018325e-06,
"loss": 0.0303,
"step": 564
},
{
"epoch": 0.4669421487603306,
"grad_norm": 0.42153114080429077,
"learning_rate": 5.95784697022614e-06,
"loss": 0.0126,
"step": 565
},
{
"epoch": 0.4677685950413223,
"grad_norm": 0.48093897104263306,
"learning_rate": 5.944425619649324e-06,
"loss": 0.0097,
"step": 566
},
{
"epoch": 0.46859504132231405,
"grad_norm": 0.18579694628715515,
"learning_rate": 5.93099720870712e-06,
"loss": 0.0073,
"step": 567
},
{
"epoch": 0.46942148760330576,
"grad_norm": 1.9562228918075562,
"learning_rate": 5.917561837788046e-06,
"loss": 0.312,
"step": 568
},
{
"epoch": 0.47024793388429753,
"grad_norm": 2.367879867553711,
"learning_rate": 5.9041196073326515e-06,
"loss": 0.3458,
"step": 569
},
{
"epoch": 0.47107438016528924,
"grad_norm": 3.0242204666137695,
"learning_rate": 5.8906706178327645e-06,
"loss": 0.3189,
"step": 570
},
{
"epoch": 0.471900826446281,
"grad_norm": 2.0471577644348145,
"learning_rate": 5.877214969830746e-06,
"loss": 0.3315,
"step": 571
},
{
"epoch": 0.4727272727272727,
"grad_norm": 1.714192509651184,
"learning_rate": 5.863752763918732e-06,
"loss": 0.3171,
"step": 572
},
{
"epoch": 0.4735537190082645,
"grad_norm": 1.4090819358825684,
"learning_rate": 5.850284100737888e-06,
"loss": 0.0729,
"step": 573
},
{
"epoch": 0.4743801652892562,
"grad_norm": 2.7457051277160645,
"learning_rate": 5.836809080977644e-06,
"loss": 0.0727,
"step": 574
},
{
"epoch": 0.47520661157024796,
"grad_norm": 3.2199480533599854,
"learning_rate": 5.823327805374965e-06,
"loss": 0.0987,
"step": 575
},
{
"epoch": 0.47603305785123967,
"grad_norm": 1.385063648223877,
"learning_rate": 5.809840374713571e-06,
"loss": 0.1563,
"step": 576
},
{
"epoch": 0.4768595041322314,
"grad_norm": 1.3922051191329956,
"learning_rate": 5.7963468898232026e-06,
"loss": 0.1555,
"step": 577
},
{
"epoch": 0.47768595041322315,
"grad_norm": 1.4770236015319824,
"learning_rate": 5.782847451578858e-06,
"loss": 0.1544,
"step": 578
},
{
"epoch": 0.47851239669421486,
"grad_norm": 1.353073239326477,
"learning_rate": 5.769342160900043e-06,
"loss": 0.1651,
"step": 579
},
{
"epoch": 0.4793388429752066,
"grad_norm": 1.4431830644607544,
"learning_rate": 5.755831118750016e-06,
"loss": 0.1884,
"step": 580
},
{
"epoch": 0.48016528925619834,
"grad_norm": 3.3073036670684814,
"learning_rate": 5.742314426135029e-06,
"loss": 0.0547,
"step": 581
},
{
"epoch": 0.4809917355371901,
"grad_norm": 2.851544141769409,
"learning_rate": 5.72879218410358e-06,
"loss": 0.0537,
"step": 582
},
{
"epoch": 0.4818181818181818,
"grad_norm": 4.026706695556641,
"learning_rate": 5.715264493745652e-06,
"loss": 0.0868,
"step": 583
},
{
"epoch": 0.4826446280991736,
"grad_norm": 0.9649466276168823,
"learning_rate": 5.701731456191958e-06,
"loss": 0.0381,
"step": 584
},
{
"epoch": 0.4834710743801653,
"grad_norm": 2.706048011779785,
"learning_rate": 5.688193172613185e-06,
"loss": 0.0601,
"step": 585
},
{
"epoch": 0.484297520661157,
"grad_norm": 1.5482417345046997,
"learning_rate": 5.6746497442192425e-06,
"loss": 0.0287,
"step": 586
},
{
"epoch": 0.48512396694214877,
"grad_norm": 2.6702229976654053,
"learning_rate": 5.661101272258498e-06,
"loss": 0.0711,
"step": 587
},
{
"epoch": 0.4859504132231405,
"grad_norm": 0.5972803235054016,
"learning_rate": 5.6475478580170214e-06,
"loss": 0.0175,
"step": 588
},
{
"epoch": 0.48677685950413224,
"grad_norm": 0.9942319989204407,
"learning_rate": 5.633989602817837e-06,
"loss": 0.0137,
"step": 589
},
{
"epoch": 0.48760330578512395,
"grad_norm": 0.47811320424079895,
"learning_rate": 5.620426608020156e-06,
"loss": 0.0154,
"step": 590
},
{
"epoch": 0.4884297520661157,
"grad_norm": 1.1594573259353638,
"learning_rate": 5.606858975018621e-06,
"loss": 0.0263,
"step": 591
},
{
"epoch": 0.48925619834710743,
"grad_norm": 1.1246296167373657,
"learning_rate": 5.593286805242549e-06,
"loss": 0.1428,
"step": 592
},
{
"epoch": 0.4900826446280992,
"grad_norm": 1.6759182214736938,
"learning_rate": 5.5797102001551754e-06,
"loss": 0.3093,
"step": 593
},
{
"epoch": 0.4909090909090909,
"grad_norm": 1.9099022150039673,
"learning_rate": 5.56612926125289e-06,
"loss": 0.2813,
"step": 594
},
{
"epoch": 0.49173553719008267,
"grad_norm": 2.161968469619751,
"learning_rate": 5.552544090064487e-06,
"loss": 0.2854,
"step": 595
},
{
"epoch": 0.4925619834710744,
"grad_norm": 1.4985226392745972,
"learning_rate": 5.538954788150395e-06,
"loss": 0.3048,
"step": 596
},
{
"epoch": 0.4933884297520661,
"grad_norm": 1.5315203666687012,
"learning_rate": 5.525361457101923e-06,
"loss": 0.3304,
"step": 597
},
{
"epoch": 0.49421487603305786,
"grad_norm": 2.184929609298706,
"learning_rate": 5.5117641985405055e-06,
"loss": 0.1432,
"step": 598
},
{
"epoch": 0.49504132231404957,
"grad_norm": 4.860492706298828,
"learning_rate": 5.498163114116935e-06,
"loss": 0.1237,
"step": 599
},
{
"epoch": 0.49586776859504134,
"grad_norm": 2.90765380859375,
"learning_rate": 5.484558305510609e-06,
"loss": 0.0469,
"step": 600
},
{
"epoch": 0.49669421487603305,
"grad_norm": 1.3841009140014648,
"learning_rate": 5.47094987442876e-06,
"loss": 0.1817,
"step": 601
},
{
"epoch": 0.4975206611570248,
"grad_norm": 1.580209732055664,
"learning_rate": 5.4573379226057086e-06,
"loss": 0.1466,
"step": 602
},
{
"epoch": 0.4983471074380165,
"grad_norm": 1.6098315715789795,
"learning_rate": 5.4437225518020905e-06,
"loss": 0.1735,
"step": 603
},
{
"epoch": 0.4991735537190083,
"grad_norm": 1.4930258989334106,
"learning_rate": 5.430103863804107e-06,
"loss": 0.1803,
"step": 604
},
{
"epoch": 0.5,
"grad_norm": 1.2915822267532349,
"learning_rate": 5.416481960422748e-06,
"loss": 0.1534,
"step": 605
},
{
"epoch": 0.5008264462809917,
"grad_norm": 3.545429229736328,
"learning_rate": 5.402856943493053e-06,
"loss": 0.0449,
"step": 606
},
{
"epoch": 0.5016528925619834,
"grad_norm": 3.1343140602111816,
"learning_rate": 5.389228914873334e-06,
"loss": 0.0363,
"step": 607
},
{
"epoch": 0.5024793388429752,
"grad_norm": 3.4123377799987793,
"learning_rate": 5.37559797644441e-06,
"loss": 0.0515,
"step": 608
},
{
"epoch": 0.503305785123967,
"grad_norm": 4.493556976318359,
"learning_rate": 5.361964230108863e-06,
"loss": 0.0548,
"step": 609
},
{
"epoch": 0.5041322314049587,
"grad_norm": 3.2441487312316895,
"learning_rate": 5.348327777790262e-06,
"loss": 0.0565,
"step": 610
},
{
"epoch": 0.5049586776859504,
"grad_norm": 2.693488597869873,
"learning_rate": 5.334688721432409e-06,
"loss": 0.0302,
"step": 611
},
{
"epoch": 0.5057851239669422,
"grad_norm": 2.91070556640625,
"learning_rate": 5.321047162998568e-06,
"loss": 0.0402,
"step": 612
},
{
"epoch": 0.5066115702479339,
"grad_norm": 0.9602301716804504,
"learning_rate": 5.307403204470711e-06,
"loss": 0.017,
"step": 613
},
{
"epoch": 0.5074380165289256,
"grad_norm": 1.2631757259368896,
"learning_rate": 5.293756947848755e-06,
"loss": 0.016,
"step": 614
},
{
"epoch": 0.5082644628099173,
"grad_norm": 0.46631762385368347,
"learning_rate": 5.280108495149792e-06,
"loss": 0.0139,
"step": 615
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.5036705136299133,
"learning_rate": 5.266457948407336e-06,
"loss": 0.0292,
"step": 616
},
{
"epoch": 0.5099173553719009,
"grad_norm": 1.4803212881088257,
"learning_rate": 5.252805409670554e-06,
"loss": 0.18,
"step": 617
},
{
"epoch": 0.5107438016528926,
"grad_norm": 2.6057372093200684,
"learning_rate": 5.239150981003502e-06,
"loss": 0.3133,
"step": 618
},
{
"epoch": 0.5115702479338843,
"grad_norm": 1.715753436088562,
"learning_rate": 5.2254947644843735e-06,
"loss": 0.3183,
"step": 619
},
{
"epoch": 0.512396694214876,
"grad_norm": 2.21403431892395,
"learning_rate": 5.211836862204716e-06,
"loss": 0.3027,
"step": 620
},
{
"epoch": 0.5132231404958678,
"grad_norm": 2.1005899906158447,
"learning_rate": 5.198177376268686e-06,
"loss": 0.3099,
"step": 621
},
{
"epoch": 0.5140495867768595,
"grad_norm": 2.234964370727539,
"learning_rate": 5.18451640879228e-06,
"loss": 0.2785,
"step": 622
},
{
"epoch": 0.5148760330578512,
"grad_norm": 1.8259047269821167,
"learning_rate": 5.1708540619025695e-06,
"loss": 0.313,
"step": 623
},
{
"epoch": 0.515702479338843,
"grad_norm": 4.017728805541992,
"learning_rate": 5.157190437736935e-06,
"loss": 0.1181,
"step": 624
},
{
"epoch": 0.5165289256198347,
"grad_norm": 3.953747272491455,
"learning_rate": 5.14352563844231e-06,
"loss": 0.1302,
"step": 625
},
{
"epoch": 0.5173553719008265,
"grad_norm": 1.2031114101409912,
"learning_rate": 5.12985976617441e-06,
"loss": 0.1742,
"step": 626
},
{
"epoch": 0.5181818181818182,
"grad_norm": 1.241931676864624,
"learning_rate": 5.1161929230969735e-06,
"loss": 0.1661,
"step": 627
},
{
"epoch": 0.5190082644628099,
"grad_norm": 1.359976053237915,
"learning_rate": 5.1025252113809945e-06,
"loss": 0.1765,
"step": 628
},
{
"epoch": 0.5198347107438016,
"grad_norm": 1.1254280805587769,
"learning_rate": 5.088856733203964e-06,
"loss": 0.1451,
"step": 629
},
{
"epoch": 0.5206611570247934,
"grad_norm": 1.0555247068405151,
"learning_rate": 5.075187590749101e-06,
"loss": 0.136,
"step": 630
},
{
"epoch": 0.5214876033057851,
"grad_norm": 1.1446833610534668,
"learning_rate": 5.061517886204592e-06,
"loss": 0.1514,
"step": 631
},
{
"epoch": 0.5223140495867769,
"grad_norm": 1.4794870615005493,
"learning_rate": 5.047847721762821e-06,
"loss": 0.0363,
"step": 632
},
{
"epoch": 0.5231404958677686,
"grad_norm": 1.28078293800354,
"learning_rate": 5.034177199619617e-06,
"loss": 0.0114,
"step": 633
},
{
"epoch": 0.5239669421487604,
"grad_norm": 2.1705374717712402,
"learning_rate": 5.02050642197348e-06,
"loss": 0.0521,
"step": 634
},
{
"epoch": 0.5247933884297521,
"grad_norm": 0.7478830814361572,
"learning_rate": 5.0068354910248165e-06,
"loss": 0.0124,
"step": 635
},
{
"epoch": 0.5256198347107438,
"grad_norm": 2.4068710803985596,
"learning_rate": 4.993164508975184e-06,
"loss": 0.0518,
"step": 636
},
{
"epoch": 0.5264462809917355,
"grad_norm": 0.6064580082893372,
"learning_rate": 4.979493578026523e-06,
"loss": 0.0089,
"step": 637
},
{
"epoch": 0.5272727272727272,
"grad_norm": 1.025275468826294,
"learning_rate": 4.9658228003803835e-06,
"loss": 0.0148,
"step": 638
},
{
"epoch": 0.528099173553719,
"grad_norm": 1.3405190706253052,
"learning_rate": 4.95215227823718e-06,
"loss": 0.0366,
"step": 639
},
{
"epoch": 0.5289256198347108,
"grad_norm": 0.8973997831344604,
"learning_rate": 4.9384821137954106e-06,
"loss": 0.019,
"step": 640
},
{
"epoch": 0.5297520661157025,
"grad_norm": 6.026678085327148,
"learning_rate": 4.924812409250899e-06,
"loss": 0.1754,
"step": 641
},
{
"epoch": 0.5305785123966942,
"grad_norm": 3.6006736755371094,
"learning_rate": 4.911143266796038e-06,
"loss": 0.3595,
"step": 642
},
{
"epoch": 0.531404958677686,
"grad_norm": 1.8200002908706665,
"learning_rate": 4.897474788619007e-06,
"loss": 0.3225,
"step": 643
},
{
"epoch": 0.5322314049586777,
"grad_norm": 2.401529312133789,
"learning_rate": 4.883807076903029e-06,
"loss": 0.3139,
"step": 644
},
{
"epoch": 0.5330578512396694,
"grad_norm": 2.6860947608947754,
"learning_rate": 4.870140233825592e-06,
"loss": 0.2982,
"step": 645
},
{
"epoch": 0.5338842975206611,
"grad_norm": 2.095231533050537,
"learning_rate": 4.856474361557692e-06,
"loss": 0.358,
"step": 646
},
{
"epoch": 0.5347107438016528,
"grad_norm": 2.205946922302246,
"learning_rate": 4.842809562263066e-06,
"loss": 0.2445,
"step": 647
},
{
"epoch": 0.5355371900826447,
"grad_norm": 3.7042648792266846,
"learning_rate": 4.829145938097431e-06,
"loss": 0.1261,
"step": 648
},
{
"epoch": 0.5363636363636364,
"grad_norm": 3.2817203998565674,
"learning_rate": 4.815483591207721e-06,
"loss": 0.0743,
"step": 649
},
{
"epoch": 0.5371900826446281,
"grad_norm": 3.1411526203155518,
"learning_rate": 4.801822623731316e-06,
"loss": 0.118,
"step": 650
},
{
"epoch": 0.5380165289256198,
"grad_norm": 1.0231245756149292,
"learning_rate": 4.788163137795286e-06,
"loss": 0.1141,
"step": 651
},
{
"epoch": 0.5388429752066116,
"grad_norm": 1.2480162382125854,
"learning_rate": 4.774505235515628e-06,
"loss": 0.1867,
"step": 652
},
{
"epoch": 0.5396694214876033,
"grad_norm": 1.3538585901260376,
"learning_rate": 4.760849018996499e-06,
"loss": 0.1798,
"step": 653
},
{
"epoch": 0.540495867768595,
"grad_norm": 1.2146517038345337,
"learning_rate": 4.747194590329449e-06,
"loss": 0.189,
"step": 654
},
{
"epoch": 0.5413223140495868,
"grad_norm": 1.1301908493041992,
"learning_rate": 4.733542051592665e-06,
"loss": 0.1696,
"step": 655
},
{
"epoch": 0.5421487603305785,
"grad_norm": 1.6396489143371582,
"learning_rate": 4.719891504850209e-06,
"loss": 0.1014,
"step": 656
},
{
"epoch": 0.5429752066115703,
"grad_norm": 1.99111807346344,
"learning_rate": 4.706243052151248e-06,
"loss": 0.0498,
"step": 657
},
{
"epoch": 0.543801652892562,
"grad_norm": 2.122771739959717,
"learning_rate": 4.69259679552929e-06,
"loss": 0.0399,
"step": 658
},
{
"epoch": 0.5446280991735537,
"grad_norm": 1.1913613080978394,
"learning_rate": 4.6789528370014335e-06,
"loss": 0.0445,
"step": 659
},
{
"epoch": 0.5454545454545454,
"grad_norm": 1.2134239673614502,
"learning_rate": 4.665311278567593e-06,
"loss": 0.0373,
"step": 660
},
{
"epoch": 0.5462809917355372,
"grad_norm": 1.2265779972076416,
"learning_rate": 4.651672222209738e-06,
"loss": 0.0203,
"step": 661
},
{
"epoch": 0.547107438016529,
"grad_norm": 0.5269374847412109,
"learning_rate": 4.638035769891139e-06,
"loss": 0.0216,
"step": 662
},
{
"epoch": 0.5479338842975207,
"grad_norm": 1.136293888092041,
"learning_rate": 4.624402023555592e-06,
"loss": 0.0124,
"step": 663
},
{
"epoch": 0.5487603305785124,
"grad_norm": 0.42652571201324463,
"learning_rate": 4.6107710851266695e-06,
"loss": 0.0102,
"step": 664
},
{
"epoch": 0.5495867768595041,
"grad_norm": 0.3615831732749939,
"learning_rate": 4.597143056506947e-06,
"loss": 0.0024,
"step": 665
},
{
"epoch": 0.5504132231404959,
"grad_norm": 0.703677237033844,
"learning_rate": 4.583518039577253e-06,
"loss": 0.0113,
"step": 666
},
{
"epoch": 0.5512396694214876,
"grad_norm": 1.7550630569458008,
"learning_rate": 4.5698961361958955e-06,
"loss": 0.032,
"step": 667
},
{
"epoch": 0.5520661157024793,
"grad_norm": 1.8069919347763062,
"learning_rate": 4.556277448197909e-06,
"loss": 0.181,
"step": 668
},
{
"epoch": 0.552892561983471,
"grad_norm": 2.0002801418304443,
"learning_rate": 4.542662077394292e-06,
"loss": 0.3471,
"step": 669
},
{
"epoch": 0.5537190082644629,
"grad_norm": 1.8980375528335571,
"learning_rate": 4.529050125571241e-06,
"loss": 0.3243,
"step": 670
},
{
"epoch": 0.5545454545454546,
"grad_norm": 2.2224647998809814,
"learning_rate": 4.5154416944893935e-06,
"loss": 0.3145,
"step": 671
},
{
"epoch": 0.5553719008264463,
"grad_norm": 2.437152624130249,
"learning_rate": 4.5018368858830646e-06,
"loss": 0.3099,
"step": 672
},
{
"epoch": 0.556198347107438,
"grad_norm": 1.8314472436904907,
"learning_rate": 4.488235801459495e-06,
"loss": 0.2213,
"step": 673
},
{
"epoch": 0.5570247933884298,
"grad_norm": 3.841614007949829,
"learning_rate": 4.474638542898078e-06,
"loss": 0.1219,
"step": 674
},
{
"epoch": 0.5578512396694215,
"grad_norm": 3.001781463623047,
"learning_rate": 4.461045211849605e-06,
"loss": 0.0805,
"step": 675
},
{
"epoch": 0.5586776859504132,
"grad_norm": 0.8312858939170837,
"learning_rate": 4.447455909935513e-06,
"loss": 0.1252,
"step": 676
},
{
"epoch": 0.5595041322314049,
"grad_norm": 1.0251758098602295,
"learning_rate": 4.4338707387471105e-06,
"loss": 0.1715,
"step": 677
},
{
"epoch": 0.5603305785123966,
"grad_norm": 0.9629795551300049,
"learning_rate": 4.420289799844825e-06,
"loss": 0.1367,
"step": 678
},
{
"epoch": 0.5611570247933885,
"grad_norm": 1.0852874517440796,
"learning_rate": 4.4067131947574515e-06,
"loss": 0.1713,
"step": 679
},
{
"epoch": 0.5619834710743802,
"grad_norm": 0.9580336213111877,
"learning_rate": 4.393141024981381e-06,
"loss": 0.1245,
"step": 680
},
{
"epoch": 0.5628099173553719,
"grad_norm": 1.374263882637024,
"learning_rate": 4.379573391979846e-06,
"loss": 0.0813,
"step": 681
},
{
"epoch": 0.5636363636363636,
"grad_norm": 1.5036835670471191,
"learning_rate": 4.3660103971821635e-06,
"loss": 0.0224,
"step": 682
},
{
"epoch": 0.5644628099173554,
"grad_norm": 2.0704779624938965,
"learning_rate": 4.352452141982979e-06,
"loss": 0.0343,
"step": 683
},
{
"epoch": 0.5652892561983471,
"grad_norm": 1.2086342573165894,
"learning_rate": 4.338898727741505e-06,
"loss": 0.0152,
"step": 684
},
{
"epoch": 0.5661157024793388,
"grad_norm": 2.1616499423980713,
"learning_rate": 4.3253502557807575e-06,
"loss": 0.0417,
"step": 685
},
{
"epoch": 0.5669421487603306,
"grad_norm": 1.2505360841751099,
"learning_rate": 4.311806827386816e-06,
"loss": 0.0314,
"step": 686
},
{
"epoch": 0.5677685950413223,
"grad_norm": 0.4623047113418579,
"learning_rate": 4.298268543808043e-06,
"loss": 0.0254,
"step": 687
},
{
"epoch": 0.5685950413223141,
"grad_norm": 1.0211182832717896,
"learning_rate": 4.28473550625435e-06,
"loss": 0.0271,
"step": 688
},
{
"epoch": 0.5694214876033058,
"grad_norm": 0.7633079886436462,
"learning_rate": 4.27120781589642e-06,
"loss": 0.0095,
"step": 689
},
{
"epoch": 0.5702479338842975,
"grad_norm": 0.45265504717826843,
"learning_rate": 4.257685573864971e-06,
"loss": 0.0121,
"step": 690
},
{
"epoch": 0.5710743801652892,
"grad_norm": 1.9126603603363037,
"learning_rate": 4.244168881249986e-06,
"loss": 0.0217,
"step": 691
},
{
"epoch": 0.571900826446281,
"grad_norm": 3.278047561645508,
"learning_rate": 4.2306578390999576e-06,
"loss": 0.1539,
"step": 692
},
{
"epoch": 0.5727272727272728,
"grad_norm": 1.7494442462921143,
"learning_rate": 4.217152548421143e-06,
"loss": 0.3445,
"step": 693
},
{
"epoch": 0.5735537190082645,
"grad_norm": 1.7223403453826904,
"learning_rate": 4.203653110176798e-06,
"loss": 0.2945,
"step": 694
},
{
"epoch": 0.5743801652892562,
"grad_norm": 1.5911810398101807,
"learning_rate": 4.190159625286428e-06,
"loss": 0.254,
"step": 695
},
{
"epoch": 0.5752066115702479,
"grad_norm": 1.535965919494629,
"learning_rate": 4.176672194625036e-06,
"loss": 0.3021,
"step": 696
},
{
"epoch": 0.5760330578512397,
"grad_norm": 1.6443074941635132,
"learning_rate": 4.163190919022357e-06,
"loss": 0.2366,
"step": 697
},
{
"epoch": 0.5768595041322314,
"grad_norm": 1.516901969909668,
"learning_rate": 4.149715899262116e-06,
"loss": 0.0365,
"step": 698
},
{
"epoch": 0.5776859504132231,
"grad_norm": 2.6840264797210693,
"learning_rate": 4.136247236081269e-06,
"loss": 0.0598,
"step": 699
},
{
"epoch": 0.5785123966942148,
"grad_norm": 3.295374631881714,
"learning_rate": 4.122785030169256e-06,
"loss": 0.0969,
"step": 700
},
{
"epoch": 0.5793388429752067,
"grad_norm": 1.1777970790863037,
"learning_rate": 4.109329382167237e-06,
"loss": 0.1785,
"step": 701
},
{
"epoch": 0.5801652892561984,
"grad_norm": 1.1650125980377197,
"learning_rate": 4.095880392667349e-06,
"loss": 0.2139,
"step": 702
},
{
"epoch": 0.5809917355371901,
"grad_norm": 0.9443373084068298,
"learning_rate": 4.082438162211955e-06,
"loss": 0.1594,
"step": 703
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.9858490824699402,
"learning_rate": 4.069002791292882e-06,
"loss": 0.137,
"step": 704
},
{
"epoch": 0.5826446280991735,
"grad_norm": 1.3087631464004517,
"learning_rate": 4.055574380350678e-06,
"loss": 0.1357,
"step": 705
},
{
"epoch": 0.5834710743801653,
"grad_norm": 1.2239716053009033,
"learning_rate": 4.042153029773861e-06,
"loss": 0.0234,
"step": 706
},
{
"epoch": 0.584297520661157,
"grad_norm": 0.531774640083313,
"learning_rate": 4.028738839898168e-06,
"loss": 0.0145,
"step": 707
},
{
"epoch": 0.5851239669421487,
"grad_norm": 0.7383906841278076,
"learning_rate": 4.015331911005803e-06,
"loss": 0.027,
"step": 708
},
{
"epoch": 0.5859504132231405,
"grad_norm": 1.476829171180725,
"learning_rate": 4.001932343324683e-06,
"loss": 0.0265,
"step": 709
},
{
"epoch": 0.5867768595041323,
"grad_norm": 1.3487377166748047,
"learning_rate": 3.988540237027702e-06,
"loss": 0.0147,
"step": 710
},
{
"epoch": 0.587603305785124,
"grad_norm": 0.5895293354988098,
"learning_rate": 3.97515569223197e-06,
"loss": 0.0211,
"step": 711
},
{
"epoch": 0.5884297520661157,
"grad_norm": 0.6942650079727173,
"learning_rate": 3.961778808998066e-06,
"loss": 0.0299,
"step": 712
},
{
"epoch": 0.5892561983471074,
"grad_norm": 0.28293517231941223,
"learning_rate": 3.948409687329298e-06,
"loss": 0.0135,
"step": 713
},
{
"epoch": 0.5900826446280992,
"grad_norm": 0.49668827652931213,
"learning_rate": 3.935048427170944e-06,
"loss": 0.0089,
"step": 714
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.7324496507644653,
"learning_rate": 3.921695128409517e-06,
"loss": 0.0153,
"step": 715
},
{
"epoch": 0.5917355371900826,
"grad_norm": 0.3670904338359833,
"learning_rate": 3.908349890872005e-06,
"loss": 0.0147,
"step": 716
},
{
"epoch": 0.5925619834710744,
"grad_norm": 0.8715825080871582,
"learning_rate": 3.895012814325139e-06,
"loss": 0.0209,
"step": 717
},
{
"epoch": 0.5933884297520661,
"grad_norm": 2.0958237648010254,
"learning_rate": 3.8816839984746334e-06,
"loss": 0.3306,
"step": 718
},
{
"epoch": 0.5942148760330579,
"grad_norm": 1.6693692207336426,
"learning_rate": 3.86836354296445e-06,
"loss": 0.3009,
"step": 719
},
{
"epoch": 0.5950413223140496,
"grad_norm": 2.8003673553466797,
"learning_rate": 3.8550515473760515e-06,
"loss": 0.327,
"step": 720
},
{
"epoch": 0.5958677685950413,
"grad_norm": 2.277968406677246,
"learning_rate": 3.841748111227652e-06,
"loss": 0.2983,
"step": 721
},
{
"epoch": 0.596694214876033,
"grad_norm": 1.5424673557281494,
"learning_rate": 3.8284533339734804e-06,
"loss": 0.2829,
"step": 722
},
{
"epoch": 0.5975206611570248,
"grad_norm": 1.9213181734085083,
"learning_rate": 3.815167315003029e-06,
"loss": 0.2612,
"step": 723
},
{
"epoch": 0.5983471074380166,
"grad_norm": 2.348574638366699,
"learning_rate": 3.8018901536403198e-06,
"loss": 0.0426,
"step": 724
},
{
"epoch": 0.5991735537190083,
"grad_norm": 1.7470982074737549,
"learning_rate": 3.788621949143152e-06,
"loss": 0.0467,
"step": 725
},
{
"epoch": 0.6,
"grad_norm": 0.8803451061248779,
"learning_rate": 3.775362800702367e-06,
"loss": 0.1494,
"step": 726
},
{
"epoch": 0.6008264462809917,
"grad_norm": 0.9105450510978699,
"learning_rate": 3.762112807441108e-06,
"loss": 0.1478,
"step": 727
},
{
"epoch": 0.6016528925619835,
"grad_norm": 0.8626505732536316,
"learning_rate": 3.7488720684140684e-06,
"loss": 0.1531,
"step": 728
},
{
"epoch": 0.6024793388429752,
"grad_norm": 0.8685311675071716,
"learning_rate": 3.735640682606764e-06,
"loss": 0.1463,
"step": 729
},
{
"epoch": 0.6033057851239669,
"grad_norm": 1.0221821069717407,
"learning_rate": 3.7224187489347847e-06,
"loss": 0.1823,
"step": 730
},
{
"epoch": 0.6041322314049586,
"grad_norm": 1.0731297731399536,
"learning_rate": 3.709206366243061e-06,
"loss": 0.158,
"step": 731
},
{
"epoch": 0.6049586776859505,
"grad_norm": 2.116424322128296,
"learning_rate": 3.6960036333051184e-06,
"loss": 0.1465,
"step": 732
},
{
"epoch": 0.6057851239669422,
"grad_norm": 1.1477619409561157,
"learning_rate": 3.682810648822343e-06,
"loss": 0.01,
"step": 733
},
{
"epoch": 0.6066115702479339,
"grad_norm": 0.518548846244812,
"learning_rate": 3.669627511423247e-06,
"loss": 0.0069,
"step": 734
},
{
"epoch": 0.6074380165289256,
"grad_norm": 0.8379459381103516,
"learning_rate": 3.6564543196627237e-06,
"loss": 0.0213,
"step": 735
},
{
"epoch": 0.6082644628099173,
"grad_norm": 1.508975625038147,
"learning_rate": 3.6432911720213127e-06,
"loss": 0.0225,
"step": 736
},
{
"epoch": 0.6090909090909091,
"grad_norm": 1.7515640258789062,
"learning_rate": 3.630138166904471e-06,
"loss": 0.0278,
"step": 737
},
{
"epoch": 0.6099173553719008,
"grad_norm": 0.838191032409668,
"learning_rate": 3.6169954026418285e-06,
"loss": 0.014,
"step": 738
},
{
"epoch": 0.6107438016528925,
"grad_norm": 0.3030209243297577,
"learning_rate": 3.6038629774864563e-06,
"loss": 0.0106,
"step": 739
},
{
"epoch": 0.6115702479338843,
"grad_norm": 0.4401758015155792,
"learning_rate": 3.5907409896141308e-06,
"loss": 0.0063,
"step": 740
},
{
"epoch": 0.6123966942148761,
"grad_norm": 0.3650299906730652,
"learning_rate": 3.5776295371226056e-06,
"loss": 0.0159,
"step": 741
},
{
"epoch": 0.6132231404958678,
"grad_norm": 1.193688988685608,
"learning_rate": 3.56452871803087e-06,
"loss": 0.0933,
"step": 742
},
{
"epoch": 0.6140495867768595,
"grad_norm": 2.326871156692505,
"learning_rate": 3.551438630278417e-06,
"loss": 0.3112,
"step": 743
},
{
"epoch": 0.6148760330578512,
"grad_norm": 5.446589946746826,
"learning_rate": 3.5383593717245223e-06,
"loss": 0.3338,
"step": 744
},
{
"epoch": 0.6157024793388429,
"grad_norm": 2.3617842197418213,
"learning_rate": 3.525291040147498e-06,
"loss": 0.3336,
"step": 745
},
{
"epoch": 0.6165289256198347,
"grad_norm": 1.975474238395691,
"learning_rate": 3.512233733243967e-06,
"loss": 0.2887,
"step": 746
},
{
"epoch": 0.6173553719008265,
"grad_norm": 1.5796037912368774,
"learning_rate": 3.49918754862814e-06,
"loss": 0.3291,
"step": 747
},
{
"epoch": 0.6181818181818182,
"grad_norm": 2.0289130210876465,
"learning_rate": 3.486152583831072e-06,
"loss": 0.192,
"step": 748
},
{
"epoch": 0.6190082644628099,
"grad_norm": 2.0254485607147217,
"learning_rate": 3.4731289362999473e-06,
"loss": 0.056,
"step": 749
},
{
"epoch": 0.6198347107438017,
"grad_norm": 1.9933934211730957,
"learning_rate": 3.460116703397336e-06,
"loss": 0.054,
"step": 750
},
{
"epoch": 0.6206611570247934,
"grad_norm": 0.851510763168335,
"learning_rate": 3.447115982400485e-06,
"loss": 0.135,
"step": 751
},
{
"epoch": 0.6214876033057851,
"grad_norm": 0.8434444665908813,
"learning_rate": 3.4341268705005708e-06,
"loss": 0.1234,
"step": 752
},
{
"epoch": 0.6223140495867768,
"grad_norm": 0.9533182382583618,
"learning_rate": 3.421149464801986e-06,
"loss": 0.1712,
"step": 753
},
{
"epoch": 0.6231404958677685,
"grad_norm": 0.8647515177726746,
"learning_rate": 3.4081838623216124e-06,
"loss": 0.1288,
"step": 754
},
{
"epoch": 0.6239669421487604,
"grad_norm": 0.8543617129325867,
"learning_rate": 3.3952301599880876e-06,
"loss": 0.1472,
"step": 755
},
{
"epoch": 0.6247933884297521,
"grad_norm": 1.66751229763031,
"learning_rate": 3.3822884546410884e-06,
"loss": 0.1636,
"step": 756
},
{
"epoch": 0.6256198347107438,
"grad_norm": 3.0977776050567627,
"learning_rate": 3.3693588430306035e-06,
"loss": 0.0359,
"step": 757
},
{
"epoch": 0.6264462809917355,
"grad_norm": 0.6407830119132996,
"learning_rate": 3.356441421816214e-06,
"loss": 0.0233,
"step": 758
},
{
"epoch": 0.6272727272727273,
"grad_norm": 0.7153797149658203,
"learning_rate": 3.3435362875663625e-06,
"loss": 0.0211,
"step": 759
},
{
"epoch": 0.628099173553719,
"grad_norm": 0.627882719039917,
"learning_rate": 3.330643536757638e-06,
"loss": 0.0287,
"step": 760
},
{
"epoch": 0.6289256198347107,
"grad_norm": 1.0231915712356567,
"learning_rate": 3.3177632657740575e-06,
"loss": 0.0142,
"step": 761
},
{
"epoch": 0.6297520661157024,
"grad_norm": 0.9797623157501221,
"learning_rate": 3.3048955709063364e-06,
"loss": 0.0101,
"step": 762
},
{
"epoch": 0.6305785123966943,
"grad_norm": 0.7965518236160278,
"learning_rate": 3.2920405483511702e-06,
"loss": 0.0072,
"step": 763
},
{
"epoch": 0.631404958677686,
"grad_norm": 0.5437694787979126,
"learning_rate": 3.2791982942105265e-06,
"loss": 0.0126,
"step": 764
},
{
"epoch": 0.6322314049586777,
"grad_norm": 0.9789305329322815,
"learning_rate": 3.266368904490914e-06,
"loss": 0.0067,
"step": 765
},
{
"epoch": 0.6330578512396694,
"grad_norm": 1.1776361465454102,
"learning_rate": 3.253552475102668e-06,
"loss": 0.0187,
"step": 766
},
{
"epoch": 0.6338842975206611,
"grad_norm": 0.7138229608535767,
"learning_rate": 3.240749101859234e-06,
"loss": 0.0196,
"step": 767
},
{
"epoch": 0.6347107438016529,
"grad_norm": 2.1212470531463623,
"learning_rate": 3.227958880476457e-06,
"loss": 0.2941,
"step": 768
},
{
"epoch": 0.6355371900826446,
"grad_norm": 1.6883107423782349,
"learning_rate": 3.215181906571858e-06,
"loss": 0.3001,
"step": 769
},
{
"epoch": 0.6363636363636364,
"grad_norm": 2.616964817047119,
"learning_rate": 3.2024182756639188e-06,
"loss": 0.3375,
"step": 770
},
{
"epoch": 0.6371900826446281,
"grad_norm": 2.5359930992126465,
"learning_rate": 3.189668083171379e-06,
"loss": 0.2928,
"step": 771
},
{
"epoch": 0.6380165289256199,
"grad_norm": 2.0412468910217285,
"learning_rate": 3.1769314244125056e-06,
"loss": 0.2874,
"step": 772
},
{
"epoch": 0.6388429752066116,
"grad_norm": 3.327785015106201,
"learning_rate": 3.1642083946043977e-06,
"loss": 0.1906,
"step": 773
},
{
"epoch": 0.6396694214876033,
"grad_norm": 0.8958473801612854,
"learning_rate": 3.15149908886226e-06,
"loss": 0.0149,
"step": 774
},
{
"epoch": 0.640495867768595,
"grad_norm": 1.8769047260284424,
"learning_rate": 3.1388036021987047e-06,
"loss": 0.0437,
"step": 775
},
{
"epoch": 0.6413223140495867,
"grad_norm": 0.7752872109413147,
"learning_rate": 3.126122029523031e-06,
"loss": 0.1335,
"step": 776
},
{
"epoch": 0.6421487603305785,
"grad_norm": 0.7727296352386475,
"learning_rate": 3.1134544656405198e-06,
"loss": 0.13,
"step": 777
},
{
"epoch": 0.6429752066115703,
"grad_norm": 0.8350527286529541,
"learning_rate": 3.100801005251727e-06,
"loss": 0.1258,
"step": 778
},
{
"epoch": 0.643801652892562,
"grad_norm": 0.9546838402748108,
"learning_rate": 3.0881617429517697e-06,
"loss": 0.1615,
"step": 779
},
{
"epoch": 0.6446280991735537,
"grad_norm": 0.9062439799308777,
"learning_rate": 3.075536773229624e-06,
"loss": 0.1642,
"step": 780
},
{
"epoch": 0.6454545454545455,
"grad_norm": 1.0381230115890503,
"learning_rate": 3.0629261904674206e-06,
"loss": 0.1706,
"step": 781
},
{
"epoch": 0.6462809917355372,
"grad_norm": 1.2149626016616821,
"learning_rate": 3.0503300889397302e-06,
"loss": 0.0749,
"step": 782
},
{
"epoch": 0.6471074380165289,
"grad_norm": 1.1183955669403076,
"learning_rate": 3.0377485628128657e-06,
"loss": 0.0098,
"step": 783
},
{
"epoch": 0.6479338842975206,
"grad_norm": 0.5233092904090881,
"learning_rate": 3.025181706144178e-06,
"loss": 0.0132,
"step": 784
},
{
"epoch": 0.6487603305785123,
"grad_norm": 1.2450796365737915,
"learning_rate": 3.012629612881354e-06,
"loss": 0.0112,
"step": 785
},
{
"epoch": 0.6495867768595042,
"grad_norm": 0.20728905498981476,
"learning_rate": 3.000092376861705e-06,
"loss": 0.0027,
"step": 786
},
{
"epoch": 0.6504132231404959,
"grad_norm": 1.2542493343353271,
"learning_rate": 2.987570091811479e-06,
"loss": 0.0158,
"step": 787
},
{
"epoch": 0.6512396694214876,
"grad_norm": 0.8259776830673218,
"learning_rate": 2.97506285134515e-06,
"loss": 0.0336,
"step": 788
},
{
"epoch": 0.6520661157024793,
"grad_norm": 1.7150229215621948,
"learning_rate": 2.9625707489647227e-06,
"loss": 0.0198,
"step": 789
},
{
"epoch": 0.6528925619834711,
"grad_norm": 0.2723279297351837,
"learning_rate": 2.9500938780590276e-06,
"loss": 0.019,
"step": 790
},
{
"epoch": 0.6537190082644628,
"grad_norm": 0.4087146520614624,
"learning_rate": 2.937632331903032e-06,
"loss": 0.004,
"step": 791
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.5747569799423218,
"learning_rate": 2.92518620365714e-06,
"loss": 0.0078,
"step": 792
},
{
"epoch": 0.6553719008264463,
"grad_norm": 1.2564964294433594,
"learning_rate": 2.9127555863664857e-06,
"loss": 0.2254,
"step": 793
},
{
"epoch": 0.656198347107438,
"grad_norm": 1.4814064502716064,
"learning_rate": 2.900340572960253e-06,
"loss": 0.2538,
"step": 794
},
{
"epoch": 0.6570247933884298,
"grad_norm": 1.4104158878326416,
"learning_rate": 2.887941256250972e-06,
"loss": 0.2711,
"step": 795
},
{
"epoch": 0.6578512396694215,
"grad_norm": 1.837869644165039,
"learning_rate": 2.8755577289338267e-06,
"loss": 0.2782,
"step": 796
},
{
"epoch": 0.6586776859504132,
"grad_norm": 2.8386220932006836,
"learning_rate": 2.863190083585962e-06,
"loss": 0.2859,
"step": 797
},
{
"epoch": 0.6595041322314049,
"grad_norm": 3.301816940307617,
"learning_rate": 2.8508384126657906e-06,
"loss": 0.111,
"step": 798
},
{
"epoch": 0.6603305785123967,
"grad_norm": 4.864030838012695,
"learning_rate": 2.838502808512309e-06,
"loss": 0.126,
"step": 799
},
{
"epoch": 0.6611570247933884,
"grad_norm": 1.8764886856079102,
"learning_rate": 2.8261833633443914e-06,
"loss": 0.049,
"step": 800
},
{
"epoch": 0.6619834710743802,
"grad_norm": 0.9268789291381836,
"learning_rate": 2.813880169260117e-06,
"loss": 0.1589,
"step": 801
},
{
"epoch": 0.6628099173553719,
"grad_norm": 1.0193241834640503,
"learning_rate": 2.801593318236078e-06,
"loss": 0.1732,
"step": 802
},
{
"epoch": 0.6636363636363637,
"grad_norm": 0.9190545678138733,
"learning_rate": 2.7893229021266776e-06,
"loss": 0.1394,
"step": 803
},
{
"epoch": 0.6644628099173554,
"grad_norm": 0.995844841003418,
"learning_rate": 2.7770690126634643e-06,
"loss": 0.1421,
"step": 804
},
{
"epoch": 0.6652892561983471,
"grad_norm": 0.8262742757797241,
"learning_rate": 2.764831741454432e-06,
"loss": 0.0787,
"step": 805
},
{
"epoch": 0.6661157024793388,
"grad_norm": 0.6985448002815247,
"learning_rate": 2.7526111799833397e-06,
"loss": 0.0165,
"step": 806
},
{
"epoch": 0.6669421487603305,
"grad_norm": 0.6556599736213684,
"learning_rate": 2.740407419609028e-06,
"loss": 0.0217,
"step": 807
},
{
"epoch": 0.6677685950413224,
"grad_norm": 1.64823317527771,
"learning_rate": 2.7282205515647348e-06,
"loss": 0.0196,
"step": 808
},
{
"epoch": 0.6685950413223141,
"grad_norm": 2.291562795639038,
"learning_rate": 2.7160506669574137e-06,
"loss": 0.0332,
"step": 809
},
{
"epoch": 0.6694214876033058,
"grad_norm": 3.0333375930786133,
"learning_rate": 2.703897856767056e-06,
"loss": 0.0383,
"step": 810
},
{
"epoch": 0.6702479338842975,
"grad_norm": 0.9240926504135132,
"learning_rate": 2.6917622118459975e-06,
"loss": 0.0086,
"step": 811
},
{
"epoch": 0.6710743801652893,
"grad_norm": 0.9838902950286865,
"learning_rate": 2.6796438229182643e-06,
"loss": 0.0105,
"step": 812
},
{
"epoch": 0.671900826446281,
"grad_norm": 0.4214411973953247,
"learning_rate": 2.66754278057887e-06,
"loss": 0.0045,
"step": 813
},
{
"epoch": 0.6727272727272727,
"grad_norm": 2.1211740970611572,
"learning_rate": 2.655459175293146e-06,
"loss": 0.0305,
"step": 814
},
{
"epoch": 0.6735537190082644,
"grad_norm": 1.3328880071640015,
"learning_rate": 2.6433930973960775e-06,
"loss": 0.0122,
"step": 815
},
{
"epoch": 0.6743801652892562,
"grad_norm": 1.5663409233093262,
"learning_rate": 2.631344637091607e-06,
"loss": 0.0109,
"step": 816
},
{
"epoch": 0.675206611570248,
"grad_norm": 1.9309957027435303,
"learning_rate": 2.6193138844519785e-06,
"loss": 0.2505,
"step": 817
},
{
"epoch": 0.6760330578512397,
"grad_norm": 1.5996431112289429,
"learning_rate": 2.6073009294170514e-06,
"loss": 0.2626,
"step": 818
},
{
"epoch": 0.6768595041322314,
"grad_norm": 2.8478901386260986,
"learning_rate": 2.5953058617936368e-06,
"loss": 0.298,
"step": 819
},
{
"epoch": 0.6776859504132231,
"grad_norm": 2.0270745754241943,
"learning_rate": 2.58332877125482e-06,
"loss": 0.2915,
"step": 820
},
{
"epoch": 0.6785123966942149,
"grad_norm": 1.7036795616149902,
"learning_rate": 2.5713697473392953e-06,
"loss": 0.2774,
"step": 821
},
{
"epoch": 0.6793388429752066,
"grad_norm": 1.6781381368637085,
"learning_rate": 2.5594288794506917e-06,
"loss": 0.1863,
"step": 822
},
{
"epoch": 0.6801652892561983,
"grad_norm": 3.5772781372070312,
"learning_rate": 2.5475062568569077e-06,
"loss": 0.0304,
"step": 823
},
{
"epoch": 0.6809917355371901,
"grad_norm": 3.3236935138702393,
"learning_rate": 2.5356019686894457e-06,
"loss": 0.0711,
"step": 824
},
{
"epoch": 0.6818181818181818,
"grad_norm": 2.2396740913391113,
"learning_rate": 2.523716103942734e-06,
"loss": 0.0637,
"step": 825
},
{
"epoch": 0.6826446280991736,
"grad_norm": 0.7092673182487488,
"learning_rate": 2.511848751473485e-06,
"loss": 0.1196,
"step": 826
},
{
"epoch": 0.6834710743801653,
"grad_norm": 0.7840614914894104,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.1372,
"step": 827
},
{
"epoch": 0.684297520661157,
"grad_norm": 0.8343893885612488,
"learning_rate": 2.488169938101536e-06,
"loss": 0.1246,
"step": 828
},
{
"epoch": 0.6851239669421487,
"grad_norm": 1.0356327295303345,
"learning_rate": 2.476358654217627e-06,
"loss": 0.1976,
"step": 829
},
{
"epoch": 0.6859504132231405,
"grad_norm": 0.8545620441436768,
"learning_rate": 2.4645662366474186e-06,
"loss": 0.1451,
"step": 830
},
{
"epoch": 0.6867768595041323,
"grad_norm": 1.6567367315292358,
"learning_rate": 2.4527927735490213e-06,
"loss": 0.132,
"step": 831
},
{
"epoch": 0.687603305785124,
"grad_norm": 0.3666848838329315,
"learning_rate": 2.4410383529388448e-06,
"loss": 0.0204,
"step": 832
},
{
"epoch": 0.6884297520661157,
"grad_norm": 0.686599850654602,
"learning_rate": 2.429303062690938e-06,
"loss": 0.0091,
"step": 833
},
{
"epoch": 0.6892561983471074,
"grad_norm": 0.48144328594207764,
"learning_rate": 2.417586990536339e-06,
"loss": 0.0137,
"step": 834
},
{
"epoch": 0.6900826446280992,
"grad_norm": 0.20945820212364197,
"learning_rate": 2.405890224062406e-06,
"loss": 0.0022,
"step": 835
},
{
"epoch": 0.6909090909090909,
"grad_norm": 1.2613632678985596,
"learning_rate": 2.3942128507121816e-06,
"loss": 0.022,
"step": 836
},
{
"epoch": 0.6917355371900826,
"grad_norm": 1.0795994997024536,
"learning_rate": 2.3825549577837243e-06,
"loss": 0.0352,
"step": 837
},
{
"epoch": 0.6925619834710743,
"grad_norm": 0.9106585383415222,
"learning_rate": 2.370916632429455e-06,
"loss": 0.0107,
"step": 838
},
{
"epoch": 0.6933884297520662,
"grad_norm": 0.4586517810821533,
"learning_rate": 2.3592979616555194e-06,
"loss": 0.0095,
"step": 839
},
{
"epoch": 0.6942148760330579,
"grad_norm": 0.6390123963356018,
"learning_rate": 2.347699032321127e-06,
"loss": 0.0123,
"step": 840
},
{
"epoch": 0.6950413223140496,
"grad_norm": 0.6808966994285583,
"learning_rate": 2.336119931137897e-06,
"loss": 0.0163,
"step": 841
},
{
"epoch": 0.6958677685950413,
"grad_norm": 1.1223820447921753,
"learning_rate": 2.324560744669224e-06,
"loss": 0.0118,
"step": 842
},
{
"epoch": 0.6966942148760331,
"grad_norm": 1.7861708402633667,
"learning_rate": 2.31302155932962e-06,
"loss": 0.2214,
"step": 843
},
{
"epoch": 0.6975206611570248,
"grad_norm": 1.836301565170288,
"learning_rate": 2.3015024613840742e-06,
"loss": 0.2559,
"step": 844
},
{
"epoch": 0.6983471074380165,
"grad_norm": 2.2409980297088623,
"learning_rate": 2.2900035369474045e-06,
"loss": 0.2709,
"step": 845
},
{
"epoch": 0.6991735537190082,
"grad_norm": 1.363874912261963,
"learning_rate": 2.2785248719836145e-06,
"loss": 0.248,
"step": 846
},
{
"epoch": 0.7,
"grad_norm": 1.4939771890640259,
"learning_rate": 2.2670665523052534e-06,
"loss": 0.2561,
"step": 847
},
{
"epoch": 0.7008264462809918,
"grad_norm": 2.0174720287323,
"learning_rate": 2.25562866357277e-06,
"loss": 0.1054,
"step": 848
},
{
"epoch": 0.7016528925619835,
"grad_norm": 3.05000638961792,
"learning_rate": 2.244211291293877e-06,
"loss": 0.0344,
"step": 849
},
{
"epoch": 0.7024793388429752,
"grad_norm": 2.6590240001678467,
"learning_rate": 2.2328145208229096e-06,
"loss": 0.0729,
"step": 850
},
{
"epoch": 0.7033057851239669,
"grad_norm": 0.8622211813926697,
"learning_rate": 2.2214384373601843e-06,
"loss": 0.1666,
"step": 851
},
{
"epoch": 0.7041322314049587,
"grad_norm": 0.7983608245849609,
"learning_rate": 2.210083125951366e-06,
"loss": 0.1374,
"step": 852
},
{
"epoch": 0.7049586776859504,
"grad_norm": 0.7725859880447388,
"learning_rate": 2.1987486714868384e-06,
"loss": 0.1202,
"step": 853
},
{
"epoch": 0.7057851239669422,
"grad_norm": 0.8879849910736084,
"learning_rate": 2.1874351587010505e-06,
"loss": 0.1384,
"step": 854
},
{
"epoch": 0.7066115702479339,
"grad_norm": 0.9821515083312988,
"learning_rate": 2.1761426721719015e-06,
"loss": 0.0418,
"step": 855
},
{
"epoch": 0.7074380165289256,
"grad_norm": 0.5421770215034485,
"learning_rate": 2.164871296320106e-06,
"loss": 0.0135,
"step": 856
},
{
"epoch": 0.7082644628099174,
"grad_norm": 0.7403706908226013,
"learning_rate": 2.1536211154085473e-06,
"loss": 0.015,
"step": 857
},
{
"epoch": 0.7090909090909091,
"grad_norm": 0.5398378372192383,
"learning_rate": 2.142392213541669e-06,
"loss": 0.0092,
"step": 858
},
{
"epoch": 0.7099173553719008,
"grad_norm": 0.1735723912715912,
"learning_rate": 2.1311846746648325e-06,
"loss": 0.0027,
"step": 859
},
{
"epoch": 0.7107438016528925,
"grad_norm": 0.6800421476364136,
"learning_rate": 2.119998582563692e-06,
"loss": 0.0072,
"step": 860
},
{
"epoch": 0.7115702479338843,
"grad_norm": 0.5147267580032349,
"learning_rate": 2.108834020863573e-06,
"loss": 0.008,
"step": 861
},
{
"epoch": 0.7123966942148761,
"grad_norm": 1.2354320287704468,
"learning_rate": 2.097691073028836e-06,
"loss": 0.0126,
"step": 862
},
{
"epoch": 0.7132231404958678,
"grad_norm": 0.2006523609161377,
"learning_rate": 2.0865698223622693e-06,
"loss": 0.0036,
"step": 863
},
{
"epoch": 0.7140495867768595,
"grad_norm": 0.3145323097705841,
"learning_rate": 2.075470352004453e-06,
"loss": 0.0082,
"step": 864
},
{
"epoch": 0.7148760330578512,
"grad_norm": 0.9217435717582703,
"learning_rate": 2.064392744933135e-06,
"loss": 0.0183,
"step": 865
},
{
"epoch": 0.715702479338843,
"grad_norm": 1.0004950761795044,
"learning_rate": 2.05333708396263e-06,
"loss": 0.012,
"step": 866
},
{
"epoch": 0.7165289256198347,
"grad_norm": 1.3667855262756348,
"learning_rate": 2.042303451743174e-06,
"loss": 0.1209,
"step": 867
},
{
"epoch": 0.7173553719008264,
"grad_norm": 2.370497703552246,
"learning_rate": 2.0312919307603286e-06,
"loss": 0.2518,
"step": 868
},
{
"epoch": 0.7181818181818181,
"grad_norm": 2.145073652267456,
"learning_rate": 2.0203026033343525e-06,
"loss": 0.2709,
"step": 869
},
{
"epoch": 0.71900826446281,
"grad_norm": 1.861162543296814,
"learning_rate": 2.009335551619589e-06,
"loss": 0.271,
"step": 870
},
{
"epoch": 0.7198347107438017,
"grad_norm": 1.8809741735458374,
"learning_rate": 1.998390857603853e-06,
"loss": 0.2814,
"step": 871
},
{
"epoch": 0.7206611570247934,
"grad_norm": 1.8260799646377563,
"learning_rate": 1.9874686031078156e-06,
"loss": 0.2913,
"step": 872
},
{
"epoch": 0.7214876033057851,
"grad_norm": 2.0237462520599365,
"learning_rate": 1.976568869784396e-06,
"loss": 0.1477,
"step": 873
},
{
"epoch": 0.7223140495867768,
"grad_norm": 2.680821180343628,
"learning_rate": 1.965691739118146e-06,
"loss": 0.059,
"step": 874
},
{
"epoch": 0.7231404958677686,
"grad_norm": 1.6376444101333618,
"learning_rate": 1.9548372924246495e-06,
"loss": 0.0202,
"step": 875
},
{
"epoch": 0.7239669421487603,
"grad_norm": 0.8022472262382507,
"learning_rate": 1.9440056108498974e-06,
"loss": 0.1588,
"step": 876
},
{
"epoch": 0.724793388429752,
"grad_norm": 0.8381442427635193,
"learning_rate": 1.9331967753697077e-06,
"loss": 0.141,
"step": 877
},
{
"epoch": 0.7256198347107438,
"grad_norm": 0.7840932011604309,
"learning_rate": 1.922410866789092e-06,
"loss": 0.1124,
"step": 878
},
{
"epoch": 0.7264462809917356,
"grad_norm": 0.9165108799934387,
"learning_rate": 1.9116479657416687e-06,
"loss": 0.1586,
"step": 879
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.8586753606796265,
"learning_rate": 1.9009081526890622e-06,
"loss": 0.1463,
"step": 880
},
{
"epoch": 0.728099173553719,
"grad_norm": 1.2581459283828735,
"learning_rate": 1.8901915079202836e-06,
"loss": 0.0756,
"step": 881
},
{
"epoch": 0.7289256198347107,
"grad_norm": 0.4948238432407379,
"learning_rate": 1.8794981115511478e-06,
"loss": 0.0035,
"step": 882
},
{
"epoch": 0.7297520661157024,
"grad_norm": 0.6490508913993835,
"learning_rate": 1.8688280435236732e-06,
"loss": 0.0134,
"step": 883
},
{
"epoch": 0.7305785123966942,
"grad_norm": 0.5945568680763245,
"learning_rate": 1.8581813836054697e-06,
"loss": 0.005,
"step": 884
},
{
"epoch": 0.731404958677686,
"grad_norm": 1.2948330640792847,
"learning_rate": 1.8475582113891587e-06,
"loss": 0.0122,
"step": 885
},
{
"epoch": 0.7322314049586777,
"grad_norm": 1.1504799127578735,
"learning_rate": 1.8369586062917693e-06,
"loss": 0.0123,
"step": 886
},
{
"epoch": 0.7330578512396694,
"grad_norm": 0.7296923995018005,
"learning_rate": 1.826382647554148e-06,
"loss": 0.0086,
"step": 887
},
{
"epoch": 0.7338842975206612,
"grad_norm": 1.6546300649642944,
"learning_rate": 1.8158304142403653e-06,
"loss": 0.0222,
"step": 888
},
{
"epoch": 0.7347107438016529,
"grad_norm": 2.178463935852051,
"learning_rate": 1.8053019852371195e-06,
"loss": 0.0289,
"step": 889
},
{
"epoch": 0.7355371900826446,
"grad_norm": 0.14165276288986206,
"learning_rate": 1.7947974392531615e-06,
"loss": 0.0032,
"step": 890
},
{
"epoch": 0.7363636363636363,
"grad_norm": 1.420249104499817,
"learning_rate": 1.7843168548186895e-06,
"loss": 0.0178,
"step": 891
},
{
"epoch": 0.7371900826446282,
"grad_norm": 1.1662765741348267,
"learning_rate": 1.7738603102847696e-06,
"loss": 0.1101,
"step": 892
},
{
"epoch": 0.7380165289256199,
"grad_norm": 2.345207452774048,
"learning_rate": 1.7634278838227525e-06,
"loss": 0.279,
"step": 893
},
{
"epoch": 0.7388429752066116,
"grad_norm": 2.788140058517456,
"learning_rate": 1.7530196534236842e-06,
"loss": 0.2897,
"step": 894
},
{
"epoch": 0.7396694214876033,
"grad_norm": 1.8566540479660034,
"learning_rate": 1.7426356968977265e-06,
"loss": 0.2754,
"step": 895
},
{
"epoch": 0.740495867768595,
"grad_norm": 2.0167226791381836,
"learning_rate": 1.7322760918735738e-06,
"loss": 0.2636,
"step": 896
},
{
"epoch": 0.7413223140495868,
"grad_norm": 1.5432149171829224,
"learning_rate": 1.7219409157978706e-06,
"loss": 0.236,
"step": 897
},
{
"epoch": 0.7421487603305785,
"grad_norm": 2.1682021617889404,
"learning_rate": 1.711630245934638e-06,
"loss": 0.2504,
"step": 898
},
{
"epoch": 0.7429752066115702,
"grad_norm": 3.0678889751434326,
"learning_rate": 1.7013441593646895e-06,
"loss": 0.0833,
"step": 899
},
{
"epoch": 0.743801652892562,
"grad_norm": 1.4966950416564941,
"learning_rate": 1.6910827329850614e-06,
"loss": 0.0471,
"step": 900
},
{
"epoch": 0.7446280991735538,
"grad_norm": 0.6166396737098694,
"learning_rate": 1.6808460435084316e-06,
"loss": 0.0792,
"step": 901
},
{
"epoch": 0.7454545454545455,
"grad_norm": 0.9144138693809509,
"learning_rate": 1.6706341674625538e-06,
"loss": 0.1568,
"step": 902
},
{
"epoch": 0.7462809917355372,
"grad_norm": 0.7173328995704651,
"learning_rate": 1.6604471811896705e-06,
"loss": 0.0997,
"step": 903
},
{
"epoch": 0.7471074380165289,
"grad_norm": 0.8290011882781982,
"learning_rate": 1.6502851608459668e-06,
"loss": 0.1203,
"step": 904
},
{
"epoch": 0.7479338842975206,
"grad_norm": 0.8501728773117065,
"learning_rate": 1.640148182400975e-06,
"loss": 0.1433,
"step": 905
},
{
"epoch": 0.7487603305785124,
"grad_norm": 1.6242462396621704,
"learning_rate": 1.630036321637022e-06,
"loss": 0.0286,
"step": 906
},
{
"epoch": 0.7495867768595041,
"grad_norm": 1.0156506299972534,
"learning_rate": 1.6199496541486647e-06,
"loss": 0.0133,
"step": 907
},
{
"epoch": 0.7504132231404959,
"grad_norm": 0.5060174465179443,
"learning_rate": 1.6098882553421101e-06,
"loss": 0.0111,
"step": 908
},
{
"epoch": 0.7512396694214876,
"grad_norm": 0.5839444398880005,
"learning_rate": 1.5998522004346672e-06,
"loss": 0.0251,
"step": 909
},
{
"epoch": 0.7520661157024794,
"grad_norm": 0.8593085408210754,
"learning_rate": 1.589841564454176e-06,
"loss": 0.0094,
"step": 910
},
{
"epoch": 0.7528925619834711,
"grad_norm": 0.6602495908737183,
"learning_rate": 1.5798564222384493e-06,
"loss": 0.0119,
"step": 911
},
{
"epoch": 0.7537190082644628,
"grad_norm": 0.5076149702072144,
"learning_rate": 1.5698968484347132e-06,
"loss": 0.0066,
"step": 912
},
{
"epoch": 0.7545454545454545,
"grad_norm": 0.5409338474273682,
"learning_rate": 1.5599629174990482e-06,
"loss": 0.0094,
"step": 913
},
{
"epoch": 0.7553719008264462,
"grad_norm": 0.554561197757721,
"learning_rate": 1.5500547036958336e-06,
"loss": 0.0048,
"step": 914
},
{
"epoch": 0.756198347107438,
"grad_norm": 0.26180991530418396,
"learning_rate": 1.5401722810971926e-06,
"loss": 0.0049,
"step": 915
},
{
"epoch": 0.7570247933884298,
"grad_norm": 2.6749250888824463,
"learning_rate": 1.5303157235824323e-06,
"loss": 0.019,
"step": 916
},
{
"epoch": 0.7578512396694215,
"grad_norm": 2.0448601245880127,
"learning_rate": 1.5204851048375052e-06,
"loss": 0.1908,
"step": 917
},
{
"epoch": 0.7586776859504132,
"grad_norm": 2.299800157546997,
"learning_rate": 1.510680498354447e-06,
"loss": 0.2845,
"step": 918
},
{
"epoch": 0.759504132231405,
"grad_norm": 1.8261854648590088,
"learning_rate": 1.5009019774308249e-06,
"loss": 0.2633,
"step": 919
},
{
"epoch": 0.7603305785123967,
"grad_norm": 1.7616630792617798,
"learning_rate": 1.4911496151692013e-06,
"loss": 0.2646,
"step": 920
},
{
"epoch": 0.7611570247933884,
"grad_norm": 1.765621542930603,
"learning_rate": 1.4814234844765784e-06,
"loss": 0.2744,
"step": 921
},
{
"epoch": 0.7619834710743801,
"grad_norm": 1.6501365900039673,
"learning_rate": 1.471723658063856e-06,
"loss": 0.2514,
"step": 922
},
{
"epoch": 0.7628099173553718,
"grad_norm": 2.1452829837799072,
"learning_rate": 1.462050208445287e-06,
"loss": 0.0825,
"step": 923
},
{
"epoch": 0.7636363636363637,
"grad_norm": 1.7787621021270752,
"learning_rate": 1.4524032079379369e-06,
"loss": 0.0451,
"step": 924
},
{
"epoch": 0.7644628099173554,
"grad_norm": 1.823784589767456,
"learning_rate": 1.4427827286611412e-06,
"loss": 0.035,
"step": 925
},
{
"epoch": 0.7652892561983471,
"grad_norm": 0.8847120404243469,
"learning_rate": 1.4331888425359697e-06,
"loss": 0.1645,
"step": 926
},
{
"epoch": 0.7661157024793388,
"grad_norm": 1.283545732498169,
"learning_rate": 1.4236216212846787e-06,
"loss": 0.185,
"step": 927
},
{
"epoch": 0.7669421487603306,
"grad_norm": 0.8690438866615295,
"learning_rate": 1.4140811364301931e-06,
"loss": 0.157,
"step": 928
},
{
"epoch": 0.7677685950413223,
"grad_norm": 0.7596165537834167,
"learning_rate": 1.4045674592955561e-06,
"loss": 0.1042,
"step": 929
},
{
"epoch": 0.768595041322314,
"grad_norm": 0.80385422706604,
"learning_rate": 1.3950806610033956e-06,
"loss": 0.1306,
"step": 930
},
{
"epoch": 0.7694214876033058,
"grad_norm": 1.0708730220794678,
"learning_rate": 1.385620812475409e-06,
"loss": 0.0881,
"step": 931
},
{
"epoch": 0.7702479338842976,
"grad_norm": 0.5643999576568604,
"learning_rate": 1.3761879844318116e-06,
"loss": 0.0059,
"step": 932
},
{
"epoch": 0.7710743801652893,
"grad_norm": 0.5080968141555786,
"learning_rate": 1.3667822473908221e-06,
"loss": 0.0099,
"step": 933
},
{
"epoch": 0.771900826446281,
"grad_norm": 0.19685262441635132,
"learning_rate": 1.3574036716681366e-06,
"loss": 0.0019,
"step": 934
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.4858970046043396,
"learning_rate": 1.348052327376388e-06,
"loss": 0.0092,
"step": 935
},
{
"epoch": 0.7735537190082644,
"grad_norm": 1.647731065750122,
"learning_rate": 1.3387282844246385e-06,
"loss": 0.0213,
"step": 936
},
{
"epoch": 0.7743801652892562,
"grad_norm": 1.358784794807434,
"learning_rate": 1.3294316125178474e-06,
"loss": 0.0074,
"step": 937
},
{
"epoch": 0.775206611570248,
"grad_norm": 0.38342639803886414,
"learning_rate": 1.3201623811563545e-06,
"loss": 0.008,
"step": 938
},
{
"epoch": 0.7760330578512397,
"grad_norm": 0.24300101399421692,
"learning_rate": 1.3109206596353574e-06,
"loss": 0.0039,
"step": 939
},
{
"epoch": 0.7768595041322314,
"grad_norm": 0.7706601023674011,
"learning_rate": 1.301706517044395e-06,
"loss": 0.0249,
"step": 940
},
{
"epoch": 0.7776859504132232,
"grad_norm": 1.2098647356033325,
"learning_rate": 1.292520022266831e-06,
"loss": 0.0131,
"step": 941
},
{
"epoch": 0.7785123966942149,
"grad_norm": 0.957886815071106,
"learning_rate": 1.2833612439793403e-06,
"loss": 0.0317,
"step": 942
},
{
"epoch": 0.7793388429752066,
"grad_norm": 1.9836186170578003,
"learning_rate": 1.2742302506513894e-06,
"loss": 0.2457,
"step": 943
},
{
"epoch": 0.7801652892561983,
"grad_norm": 2.8069849014282227,
"learning_rate": 1.2651271105447322e-06,
"loss": 0.2876,
"step": 944
},
{
"epoch": 0.78099173553719,
"grad_norm": 1.618880271911621,
"learning_rate": 1.2560518917129017e-06,
"loss": 0.2439,
"step": 945
},
{
"epoch": 0.7818181818181819,
"grad_norm": 1.7697207927703857,
"learning_rate": 1.247004662000686e-06,
"loss": 0.2439,
"step": 946
},
{
"epoch": 0.7826446280991736,
"grad_norm": 1.9872509241104126,
"learning_rate": 1.2379854890436377e-06,
"loss": 0.2743,
"step": 947
},
{
"epoch": 0.7834710743801653,
"grad_norm": 1.743913173675537,
"learning_rate": 1.2289944402675618e-06,
"loss": 0.2494,
"step": 948
},
{
"epoch": 0.784297520661157,
"grad_norm": 2.9703633785247803,
"learning_rate": 1.2200315828880094e-06,
"loss": 0.0574,
"step": 949
},
{
"epoch": 0.7851239669421488,
"grad_norm": 0.9676649570465088,
"learning_rate": 1.2110969839097798e-06,
"loss": 0.018,
"step": 950
},
{
"epoch": 0.7859504132231405,
"grad_norm": 0.8991837501525879,
"learning_rate": 1.2021907101264147e-06,
"loss": 0.1845,
"step": 951
},
{
"epoch": 0.7867768595041322,
"grad_norm": 0.6820972561836243,
"learning_rate": 1.1933128281197042e-06,
"loss": 0.1181,
"step": 952
},
{
"epoch": 0.7876033057851239,
"grad_norm": 0.9404390454292297,
"learning_rate": 1.1844634042591858e-06,
"loss": 0.1792,
"step": 953
},
{
"epoch": 0.7884297520661157,
"grad_norm": 0.8390997648239136,
"learning_rate": 1.175642504701644e-06,
"loss": 0.1439,
"step": 954
},
{
"epoch": 0.7892561983471075,
"grad_norm": 0.9683129191398621,
"learning_rate": 1.166850195390628e-06,
"loss": 0.1708,
"step": 955
},
{
"epoch": 0.7900826446280992,
"grad_norm": 0.9276797771453857,
"learning_rate": 1.158086542055949e-06,
"loss": 0.1648,
"step": 956
},
{
"epoch": 0.7909090909090909,
"grad_norm": 0.8429524302482605,
"learning_rate": 1.1493516102131836e-06,
"loss": 0.0208,
"step": 957
},
{
"epoch": 0.7917355371900826,
"grad_norm": 0.1765168458223343,
"learning_rate": 1.1406454651632042e-06,
"loss": 0.0015,
"step": 958
},
{
"epoch": 0.7925619834710744,
"grad_norm": 0.7258001565933228,
"learning_rate": 1.1319681719916663e-06,
"loss": 0.0114,
"step": 959
},
{
"epoch": 0.7933884297520661,
"grad_norm": 0.8360604047775269,
"learning_rate": 1.123319795568541e-06,
"loss": 0.0189,
"step": 960
},
{
"epoch": 0.7942148760330578,
"grad_norm": 0.9019877910614014,
"learning_rate": 1.1147004005476192e-06,
"loss": 0.0078,
"step": 961
},
{
"epoch": 0.7950413223140496,
"grad_norm": 1.007905125617981,
"learning_rate": 1.1061100513660332e-06,
"loss": 0.0078,
"step": 962
},
{
"epoch": 0.7958677685950413,
"grad_norm": 0.0849403589963913,
"learning_rate": 1.0975488122437732e-06,
"loss": 0.0008,
"step": 963
},
{
"epoch": 0.7966942148760331,
"grad_norm": 1.677506923675537,
"learning_rate": 1.089016747183208e-06,
"loss": 0.03,
"step": 964
},
{
"epoch": 0.7975206611570248,
"grad_norm": 0.5890437960624695,
"learning_rate": 1.0805139199686049e-06,
"loss": 0.0051,
"step": 965
},
{
"epoch": 0.7983471074380165,
"grad_norm": 0.7736407518386841,
"learning_rate": 1.072040394165655e-06,
"loss": 0.012,
"step": 966
},
{
"epoch": 0.7991735537190082,
"grad_norm": 1.1192368268966675,
"learning_rate": 1.063596233120997e-06,
"loss": 0.1202,
"step": 967
},
{
"epoch": 0.8,
"grad_norm": 1.3015234470367432,
"learning_rate": 1.0551814999617432e-06,
"loss": 0.2413,
"step": 968
},
{
"epoch": 0.8008264462809918,
"grad_norm": 1.6623808145523071,
"learning_rate": 1.0467962575950097e-06,
"loss": 0.2739,
"step": 969
},
{
"epoch": 0.8016528925619835,
"grad_norm": 1.513471007347107,
"learning_rate": 1.03844056870744e-06,
"loss": 0.2316,
"step": 970
},
{
"epoch": 0.8024793388429752,
"grad_norm": 4.34433126449585,
"learning_rate": 1.0301144957647442e-06,
"loss": 0.2644,
"step": 971
},
{
"epoch": 0.8033057851239669,
"grad_norm": 1.4260059595108032,
"learning_rate": 1.021818101011232e-06,
"loss": 0.2076,
"step": 972
},
{
"epoch": 0.8041322314049587,
"grad_norm": 2.9094693660736084,
"learning_rate": 1.013551446469337e-06,
"loss": 0.09,
"step": 973
},
{
"epoch": 0.8049586776859504,
"grad_norm": 1.9255399703979492,
"learning_rate": 1.005314593939164e-06,
"loss": 0.0309,
"step": 974
},
{
"epoch": 0.8057851239669421,
"grad_norm": 1.6190732717514038,
"learning_rate": 9.97107604998022e-07,
"loss": 0.0455,
"step": 975
},
{
"epoch": 0.8066115702479338,
"grad_norm": 0.6468827724456787,
"learning_rate": 9.889305409999656e-07,
"loss": 0.0902,
"step": 976
},
{
"epoch": 0.8074380165289257,
"grad_norm": 0.7833928465843201,
"learning_rate": 9.807834630753366e-07,
"loss": 0.1279,
"step": 977
},
{
"epoch": 0.8082644628099174,
"grad_norm": 0.7632892727851868,
"learning_rate": 9.726664321303008e-07,
"loss": 0.1217,
"step": 978
},
{
"epoch": 0.8090909090909091,
"grad_norm": 0.7852186560630798,
"learning_rate": 9.64579508846405e-07,
"loss": 0.1429,
"step": 979
},
{
"epoch": 0.8099173553719008,
"grad_norm": 0.6835703253746033,
"learning_rate": 9.565227536801136e-07,
"loss": 0.0575,
"step": 980
},
{
"epoch": 0.8107438016528926,
"grad_norm": 1.7434837818145752,
"learning_rate": 9.484962268623549e-07,
"loss": 0.0147,
"step": 981
},
{
"epoch": 0.8115702479338843,
"grad_norm": 2.6885852813720703,
"learning_rate": 9.40499988398082e-07,
"loss": 0.0219,
"step": 982
},
{
"epoch": 0.812396694214876,
"grad_norm": 0.2245260328054428,
"learning_rate": 9.325340980658149e-07,
"loss": 0.0041,
"step": 983
},
{
"epoch": 0.8132231404958677,
"grad_norm": 0.39877408742904663,
"learning_rate": 9.245986154171915e-07,
"loss": 0.0042,
"step": 984
},
{
"epoch": 0.8140495867768595,
"grad_norm": 0.32143980264663696,
"learning_rate": 9.166935997765364e-07,
"loss": 0.003,
"step": 985
},
{
"epoch": 0.8148760330578513,
"grad_norm": 1.1524460315704346,
"learning_rate": 9.088191102403992e-07,
"loss": 0.0187,
"step": 986
},
{
"epoch": 0.815702479338843,
"grad_norm": 1.9803694486618042,
"learning_rate": 9.00975205677126e-07,
"loss": 0.0281,
"step": 987
},
{
"epoch": 0.8165289256198347,
"grad_norm": 0.14439859986305237,
"learning_rate": 8.93161944726414e-07,
"loss": 0.0033,
"step": 988
},
{
"epoch": 0.8173553719008264,
"grad_norm": 0.32128700613975525,
"learning_rate": 8.853793857988735e-07,
"loss": 0.005,
"step": 989
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.31499621272087097,
"learning_rate": 8.776275870755924e-07,
"loss": 0.0027,
"step": 990
},
{
"epoch": 0.8190082644628099,
"grad_norm": 0.6206768155097961,
"learning_rate": 8.699066065077005e-07,
"loss": 0.0078,
"step": 991
},
{
"epoch": 0.8198347107438017,
"grad_norm": 1.6296873092651367,
"learning_rate": 8.622165018159356e-07,
"loss": 0.04,
"step": 992
},
{
"epoch": 0.8206611570247934,
"grad_norm": 1.5795947313308716,
"learning_rate": 8.54557330490215e-07,
"loss": 0.2485,
"step": 993
},
{
"epoch": 0.8214876033057851,
"grad_norm": 1.4903229475021362,
"learning_rate": 8.469291497891979e-07,
"loss": 0.2361,
"step": 994
},
{
"epoch": 0.8223140495867769,
"grad_norm": 2.2532012462615967,
"learning_rate": 8.393320167398672e-07,
"loss": 0.2507,
"step": 995
},
{
"epoch": 0.8231404958677686,
"grad_norm": 1.4450434446334839,
"learning_rate": 8.317659881371021e-07,
"loss": 0.2469,
"step": 996
},
{
"epoch": 0.8239669421487603,
"grad_norm": 1.5326659679412842,
"learning_rate": 8.242311205432418e-07,
"loss": 0.2538,
"step": 997
},
{
"epoch": 0.824793388429752,
"grad_norm": 3.020674705505371,
"learning_rate": 8.167274702876765e-07,
"loss": 0.1389,
"step": 998
},
{
"epoch": 0.8256198347107438,
"grad_norm": 1.6760151386260986,
"learning_rate": 8.092550934664228e-07,
"loss": 0.0222,
"step": 999
},
{
"epoch": 0.8264462809917356,
"grad_norm": 1.6025710105895996,
"learning_rate": 8.018140459416962e-07,
"loss": 0.0443,
"step": 1000
},
{
"epoch": 0.8272727272727273,
"grad_norm": 0.74948650598526,
"learning_rate": 7.944043833415044e-07,
"loss": 0.127,
"step": 1001
},
{
"epoch": 0.828099173553719,
"grad_norm": 0.7373968362808228,
"learning_rate": 7.870261610592256e-07,
"loss": 0.1218,
"step": 1002
},
{
"epoch": 0.8289256198347107,
"grad_norm": 0.8300755620002747,
"learning_rate": 7.796794342531949e-07,
"loss": 0.1385,
"step": 1003
},
{
"epoch": 0.8297520661157025,
"grad_norm": 0.9344066977500916,
"learning_rate": 7.723642578462948e-07,
"loss": 0.1622,
"step": 1004
},
{
"epoch": 0.8305785123966942,
"grad_norm": 0.7487844228744507,
"learning_rate": 7.650806865255361e-07,
"loss": 0.1309,
"step": 1005
},
{
"epoch": 0.8314049586776859,
"grad_norm": 1.4011222124099731,
"learning_rate": 7.57828774741664e-07,
"loss": 0.0729,
"step": 1006
},
{
"epoch": 0.8322314049586776,
"grad_norm": 1.5047969818115234,
"learning_rate": 7.506085767087385e-07,
"loss": 0.0214,
"step": 1007
},
{
"epoch": 0.8330578512396695,
"grad_norm": 0.8907939791679382,
"learning_rate": 7.434201464037288e-07,
"loss": 0.0071,
"step": 1008
},
{
"epoch": 0.8338842975206612,
"grad_norm": 0.5046177506446838,
"learning_rate": 7.362635375661225e-07,
"loss": 0.0095,
"step": 1009
},
{
"epoch": 0.8347107438016529,
"grad_norm": 0.3181898295879364,
"learning_rate": 7.291388036975073e-07,
"loss": 0.003,
"step": 1010
},
{
"epoch": 0.8355371900826446,
"grad_norm": 1.6783201694488525,
"learning_rate": 7.220459980611838e-07,
"loss": 0.0267,
"step": 1011
},
{
"epoch": 0.8363636363636363,
"grad_norm": 1.0363799333572388,
"learning_rate": 7.149851736817609e-07,
"loss": 0.0146,
"step": 1012
},
{
"epoch": 0.8371900826446281,
"grad_norm": 1.1079469919204712,
"learning_rate": 7.079563833447617e-07,
"loss": 0.0426,
"step": 1013
},
{
"epoch": 0.8380165289256198,
"grad_norm": 0.05618565157055855,
"learning_rate": 7.009596795962275e-07,
"loss": 0.0007,
"step": 1014
},
{
"epoch": 0.8388429752066116,
"grad_norm": 1.1225967407226562,
"learning_rate": 6.939951147423269e-07,
"loss": 0.0136,
"step": 1015
},
{
"epoch": 0.8396694214876033,
"grad_norm": 1.6481060981750488,
"learning_rate": 6.870627408489616e-07,
"loss": 0.1562,
"step": 1016
},
{
"epoch": 0.8404958677685951,
"grad_norm": 2.299081802368164,
"learning_rate": 6.801626097413816e-07,
"loss": 0.2741,
"step": 1017
},
{
"epoch": 0.8413223140495868,
"grad_norm": 1.3912934064865112,
"learning_rate": 6.732947730037936e-07,
"loss": 0.2398,
"step": 1018
},
{
"epoch": 0.8421487603305785,
"grad_norm": 2.0209543704986572,
"learning_rate": 6.664592819789778e-07,
"loss": 0.2426,
"step": 1019
},
{
"epoch": 0.8429752066115702,
"grad_norm": 1.496512770652771,
"learning_rate": 6.596561877679037e-07,
"loss": 0.2415,
"step": 1020
},
{
"epoch": 0.843801652892562,
"grad_norm": 1.5533783435821533,
"learning_rate": 6.52885541229345e-07,
"loss": 0.2283,
"step": 1021
},
{
"epoch": 0.8446280991735537,
"grad_norm": 1.572188377380371,
"learning_rate": 6.461473929795053e-07,
"loss": 0.2244,
"step": 1022
},
{
"epoch": 0.8454545454545455,
"grad_norm": 2.724888563156128,
"learning_rate": 6.394417933916375e-07,
"loss": 0.1038,
"step": 1023
},
{
"epoch": 0.8462809917355372,
"grad_norm": 0.945579469203949,
"learning_rate": 6.327687925956616e-07,
"loss": 0.0157,
"step": 1024
},
{
"epoch": 0.8471074380165289,
"grad_norm": 2.450042247772217,
"learning_rate": 6.261284404777979e-07,
"loss": 0.0571,
"step": 1025
},
{
"epoch": 0.8479338842975207,
"grad_norm": 0.7468518614768982,
"learning_rate": 6.1952078668019e-07,
"loss": 0.1228,
"step": 1026
},
{
"epoch": 0.8487603305785124,
"grad_norm": 0.7207930088043213,
"learning_rate": 6.12945880600535e-07,
"loss": 0.1161,
"step": 1027
},
{
"epoch": 0.8495867768595041,
"grad_norm": 0.823848307132721,
"learning_rate": 6.064037713917131e-07,
"loss": 0.1232,
"step": 1028
},
{
"epoch": 0.8504132231404958,
"grad_norm": 0.7894586324691772,
"learning_rate": 5.998945079614199e-07,
"loss": 0.1236,
"step": 1029
},
{
"epoch": 0.8512396694214877,
"grad_norm": 0.7570236921310425,
"learning_rate": 5.93418138971803e-07,
"loss": 0.1066,
"step": 1030
},
{
"epoch": 0.8520661157024794,
"grad_norm": 0.8137221932411194,
"learning_rate": 5.869747128390963e-07,
"loss": 0.0593,
"step": 1031
},
{
"epoch": 0.8528925619834711,
"grad_norm": 0.5652878880500793,
"learning_rate": 5.80564277733256e-07,
"loss": 0.0083,
"step": 1032
},
{
"epoch": 0.8537190082644628,
"grad_norm": 0.9958022236824036,
"learning_rate": 5.741868815776081e-07,
"loss": 0.0204,
"step": 1033
},
{
"epoch": 0.8545454545454545,
"grad_norm": 0.49832838773727417,
"learning_rate": 5.678425720484815e-07,
"loss": 0.0112,
"step": 1034
},
{
"epoch": 0.8553719008264463,
"grad_norm": 0.7005866169929504,
"learning_rate": 5.615313965748531e-07,
"loss": 0.0111,
"step": 1035
},
{
"epoch": 0.856198347107438,
"grad_norm": 0.30058953166007996,
"learning_rate": 5.552534023380024e-07,
"loss": 0.0027,
"step": 1036
},
{
"epoch": 0.8570247933884297,
"grad_norm": 1.8628147840499878,
"learning_rate": 5.490086362711433e-07,
"loss": 0.0089,
"step": 1037
},
{
"epoch": 0.8578512396694215,
"grad_norm": 0.12764735519886017,
"learning_rate": 5.427971450590868e-07,
"loss": 0.002,
"step": 1038
},
{
"epoch": 0.8586776859504133,
"grad_norm": 0.5221362113952637,
"learning_rate": 5.366189751378858e-07,
"loss": 0.005,
"step": 1039
},
{
"epoch": 0.859504132231405,
"grad_norm": 0.3846019208431244,
"learning_rate": 5.304741726944873e-07,
"loss": 0.0044,
"step": 1040
},
{
"epoch": 0.8603305785123967,
"grad_norm": 0.1011054664850235,
"learning_rate": 5.243627836663906e-07,
"loss": 0.0009,
"step": 1041
},
{
"epoch": 0.8611570247933884,
"grad_norm": 2.067995309829712,
"learning_rate": 5.18284853741301e-07,
"loss": 0.2141,
"step": 1042
},
{
"epoch": 0.8619834710743801,
"grad_norm": 1.9596076011657715,
"learning_rate": 5.122404283567889e-07,
"loss": 0.253,
"step": 1043
},
{
"epoch": 0.8628099173553719,
"grad_norm": 2.1681344509124756,
"learning_rate": 5.062295526999522e-07,
"loss": 0.2268,
"step": 1044
},
{
"epoch": 0.8636363636363636,
"grad_norm": 2.6632518768310547,
"learning_rate": 5.002522717070751e-07,
"loss": 0.2322,
"step": 1045
},
{
"epoch": 0.8644628099173554,
"grad_norm": 2.1616790294647217,
"learning_rate": 4.943086300632921e-07,
"loss": 0.2325,
"step": 1046
},
{
"epoch": 0.8652892561983471,
"grad_norm": 1.9508146047592163,
"learning_rate": 4.88398672202261e-07,
"loss": 0.2253,
"step": 1047
},
{
"epoch": 0.8661157024793389,
"grad_norm": 1.3169227838516235,
"learning_rate": 4.8252244230582e-07,
"loss": 0.0584,
"step": 1048
},
{
"epoch": 0.8669421487603306,
"grad_norm": 1.065989375114441,
"learning_rate": 4.766799843036651e-07,
"loss": 0.015,
"step": 1049
},
{
"epoch": 0.8677685950413223,
"grad_norm": 2.5509748458862305,
"learning_rate": 4.7087134187302095e-07,
"loss": 0.0435,
"step": 1050
},
{
"epoch": 0.868595041322314,
"grad_norm": 0.6893365383148193,
"learning_rate": 4.6509655843830827e-07,
"loss": 0.1141,
"step": 1051
},
{
"epoch": 0.8694214876033057,
"grad_norm": 0.8257653713226318,
"learning_rate": 4.5935567717082796e-07,
"loss": 0.1358,
"step": 1052
},
{
"epoch": 0.8702479338842976,
"grad_norm": 0.8547446131706238,
"learning_rate": 4.536487409884327e-07,
"loss": 0.137,
"step": 1053
},
{
"epoch": 0.8710743801652893,
"grad_norm": 0.694800615310669,
"learning_rate": 4.4797579255520585e-07,
"loss": 0.1082,
"step": 1054
},
{
"epoch": 0.871900826446281,
"grad_norm": 0.8044619560241699,
"learning_rate": 4.423368742811468e-07,
"loss": 0.1431,
"step": 1055
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.7153838276863098,
"learning_rate": 4.3673202832184956e-07,
"loss": 0.071,
"step": 1056
},
{
"epoch": 0.8735537190082645,
"grad_norm": 1.5677008628845215,
"learning_rate": 4.311612965781903e-07,
"loss": 0.0297,
"step": 1057
},
{
"epoch": 0.8743801652892562,
"grad_norm": 0.2722497582435608,
"learning_rate": 4.2562472069601236e-07,
"loss": 0.0031,
"step": 1058
},
{
"epoch": 0.8752066115702479,
"grad_norm": 0.06443261355161667,
"learning_rate": 4.2012234206581346e-07,
"loss": 0.0013,
"step": 1059
},
{
"epoch": 0.8760330578512396,
"grad_norm": 0.1994827389717102,
"learning_rate": 4.1465420182244476e-07,
"loss": 0.0017,
"step": 1060
},
{
"epoch": 0.8768595041322315,
"grad_norm": 0.4848335087299347,
"learning_rate": 4.0922034084479146e-07,
"loss": 0.0037,
"step": 1061
},
{
"epoch": 0.8776859504132232,
"grad_norm": 0.6287811994552612,
"learning_rate": 4.0382079975547383e-07,
"loss": 0.0077,
"step": 1062
},
{
"epoch": 0.8785123966942149,
"grad_norm": 0.4892734885215759,
"learning_rate": 3.984556189205441e-07,
"loss": 0.0053,
"step": 1063
},
{
"epoch": 0.8793388429752066,
"grad_norm": 0.6679121255874634,
"learning_rate": 3.9312483844918146e-07,
"loss": 0.0065,
"step": 1064
},
{
"epoch": 0.8801652892561983,
"grad_norm": 0.36687126755714417,
"learning_rate": 3.87828498193395e-07,
"loss": 0.0064,
"step": 1065
},
{
"epoch": 0.8809917355371901,
"grad_norm": 0.9448103308677673,
"learning_rate": 3.8256663774772383e-07,
"loss": 0.0244,
"step": 1066
},
{
"epoch": 0.8818181818181818,
"grad_norm": 0.7889144420623779,
"learning_rate": 3.773392964489425e-07,
"loss": 0.0392,
"step": 1067
},
{
"epoch": 0.8826446280991735,
"grad_norm": 2.57269287109375,
"learning_rate": 3.721465133757662e-07,
"loss": 0.2697,
"step": 1068
},
{
"epoch": 0.8834710743801653,
"grad_norm": 1.8215274810791016,
"learning_rate": 3.669883273485575e-07,
"loss": 0.2545,
"step": 1069
},
{
"epoch": 0.8842975206611571,
"grad_norm": 1.8314193487167358,
"learning_rate": 3.6186477692903955e-07,
"loss": 0.2476,
"step": 1070
},
{
"epoch": 0.8851239669421488,
"grad_norm": 2.515815019607544,
"learning_rate": 3.5677590042000277e-07,
"loss": 0.2235,
"step": 1071
},
{
"epoch": 0.8859504132231405,
"grad_norm": 2.0202386379241943,
"learning_rate": 3.5172173586502543e-07,
"loss": 0.2506,
"step": 1072
},
{
"epoch": 0.8867768595041322,
"grad_norm": 1.6547911167144775,
"learning_rate": 3.46702321048179e-07,
"loss": 0.1791,
"step": 1073
},
{
"epoch": 0.8876033057851239,
"grad_norm": 2.1609323024749756,
"learning_rate": 3.417176934937588e-07,
"loss": 0.0519,
"step": 1074
},
{
"epoch": 0.8884297520661157,
"grad_norm": 1.0010566711425781,
"learning_rate": 3.3676789046599045e-07,
"loss": 0.0121,
"step": 1075
},
{
"epoch": 0.8892561983471075,
"grad_norm": 0.6641173362731934,
"learning_rate": 3.318529489687605e-07,
"loss": 0.121,
"step": 1076
},
{
"epoch": 0.8900826446280992,
"grad_norm": 0.7665372490882874,
"learning_rate": 3.2697290574533857e-07,
"loss": 0.1216,
"step": 1077
},
{
"epoch": 0.8909090909090909,
"grad_norm": 0.8700223565101624,
"learning_rate": 3.2212779727809504e-07,
"loss": 0.1393,
"step": 1078
},
{
"epoch": 0.8917355371900827,
"grad_norm": 0.8250924944877625,
"learning_rate": 3.173176597882388e-07,
"loss": 0.1224,
"step": 1079
},
{
"epoch": 0.8925619834710744,
"grad_norm": 0.8952280282974243,
"learning_rate": 3.1254252923553994e-07,
"loss": 0.0562,
"step": 1080
},
{
"epoch": 0.8933884297520661,
"grad_norm": 0.15683647990226746,
"learning_rate": 3.0780244131806193e-07,
"loss": 0.0017,
"step": 1081
},
{
"epoch": 0.8942148760330578,
"grad_norm": 0.30751723051071167,
"learning_rate": 3.030974314718971e-07,
"loss": 0.0047,
"step": 1082
},
{
"epoch": 0.8950413223140495,
"grad_norm": 1.5253392457962036,
"learning_rate": 2.9842753487089927e-07,
"loss": 0.0208,
"step": 1083
},
{
"epoch": 0.8958677685950414,
"grad_norm": 0.7621084451675415,
"learning_rate": 2.937927864264206e-07,
"loss": 0.0094,
"step": 1084
},
{
"epoch": 0.8966942148760331,
"grad_norm": 1.3640185594558716,
"learning_rate": 2.891932207870546e-07,
"loss": 0.0441,
"step": 1085
},
{
"epoch": 0.8975206611570248,
"grad_norm": 0.8421722650527954,
"learning_rate": 2.8462887233836945e-07,
"loss": 0.0141,
"step": 1086
},
{
"epoch": 0.8983471074380165,
"grad_norm": 1.5583269596099854,
"learning_rate": 2.800997752026596e-07,
"loss": 0.0277,
"step": 1087
},
{
"epoch": 0.8991735537190083,
"grad_norm": 0.26696208119392395,
"learning_rate": 2.756059632386865e-07,
"loss": 0.0022,
"step": 1088
},
{
"epoch": 0.9,
"grad_norm": 0.37935540080070496,
"learning_rate": 2.7114747004142237e-07,
"loss": 0.0112,
"step": 1089
},
{
"epoch": 0.9008264462809917,
"grad_norm": 0.48069116473197937,
"learning_rate": 2.667243289418059e-07,
"loss": 0.0044,
"step": 1090
},
{
"epoch": 0.9016528925619834,
"grad_norm": 0.2301221340894699,
"learning_rate": 2.6233657300648696e-07,
"loss": 0.0032,
"step": 1091
},
{
"epoch": 0.9024793388429752,
"grad_norm": 1.219067096710205,
"learning_rate": 2.5798423503758385e-07,
"loss": 0.0745,
"step": 1092
},
{
"epoch": 0.903305785123967,
"grad_norm": 1.8919721841812134,
"learning_rate": 2.5366734757243496e-07,
"loss": 0.2512,
"step": 1093
},
{
"epoch": 0.9041322314049587,
"grad_norm": 2.0167336463928223,
"learning_rate": 2.4938594288335725e-07,
"loss": 0.2505,
"step": 1094
},
{
"epoch": 0.9049586776859504,
"grad_norm": 2.345390796661377,
"learning_rate": 2.451400529774045e-07,
"loss": 0.2476,
"step": 1095
},
{
"epoch": 0.9057851239669421,
"grad_norm": 1.298659086227417,
"learning_rate": 2.4092970959612885e-07,
"loss": 0.2136,
"step": 1096
},
{
"epoch": 0.9066115702479339,
"grad_norm": 3.467742919921875,
"learning_rate": 2.3675494421533884e-07,
"loss": 0.2128,
"step": 1097
},
{
"epoch": 0.9074380165289256,
"grad_norm": 1.8462551832199097,
"learning_rate": 2.3261578804487318e-07,
"loss": 0.228,
"step": 1098
},
{
"epoch": 0.9082644628099174,
"grad_norm": 2.1398022174835205,
"learning_rate": 2.2851227202836002e-07,
"loss": 0.0267,
"step": 1099
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.4924956560134888,
"learning_rate": 2.2444442684298572e-07,
"loss": 0.0198,
"step": 1100
},
{
"epoch": 0.9099173553719008,
"grad_norm": 0.5752552151679993,
"learning_rate": 2.2041228289927108e-07,
"loss": 0.0939,
"step": 1101
},
{
"epoch": 0.9107438016528926,
"grad_norm": 0.7368069887161255,
"learning_rate": 2.1641587034083756e-07,
"loss": 0.1024,
"step": 1102
},
{
"epoch": 0.9115702479338843,
"grad_norm": 0.7973672151565552,
"learning_rate": 2.1245521904418643e-07,
"loss": 0.1332,
"step": 1103
},
{
"epoch": 0.912396694214876,
"grad_norm": 0.8172441720962524,
"learning_rate": 2.0853035861847448e-07,
"loss": 0.1282,
"step": 1104
},
{
"epoch": 0.9132231404958677,
"grad_norm": 1.035966396331787,
"learning_rate": 2.0464131840528978e-07,
"loss": 0.152,
"step": 1105
},
{
"epoch": 0.9140495867768595,
"grad_norm": 0.4986271262168884,
"learning_rate": 2.0078812747843623e-07,
"loss": 0.0042,
"step": 1106
},
{
"epoch": 0.9148760330578513,
"grad_norm": 0.4449491500854492,
"learning_rate": 1.9697081464371437e-07,
"loss": 0.0032,
"step": 1107
},
{
"epoch": 0.915702479338843,
"grad_norm": 1.5130363702774048,
"learning_rate": 1.9318940843870594e-07,
"loss": 0.0267,
"step": 1108
},
{
"epoch": 0.9165289256198347,
"grad_norm": 0.3523405194282532,
"learning_rate": 1.894439371325607e-07,
"loss": 0.0044,
"step": 1109
},
{
"epoch": 0.9173553719008265,
"grad_norm": 0.5672593116760254,
"learning_rate": 1.8573442872578617e-07,
"loss": 0.0059,
"step": 1110
},
{
"epoch": 0.9181818181818182,
"grad_norm": 0.6381686925888062,
"learning_rate": 1.8206091095003543e-07,
"loss": 0.0057,
"step": 1111
},
{
"epoch": 0.9190082644628099,
"grad_norm": 0.9101113677024841,
"learning_rate": 1.7842341126790508e-07,
"loss": 0.0104,
"step": 1112
},
{
"epoch": 0.9198347107438016,
"grad_norm": 0.34019139409065247,
"learning_rate": 1.7482195687272163e-07,
"loss": 0.0035,
"step": 1113
},
{
"epoch": 0.9206611570247933,
"grad_norm": 1.2434287071228027,
"learning_rate": 1.7125657468834656e-07,
"loss": 0.0246,
"step": 1114
},
{
"epoch": 0.9214876033057852,
"grad_norm": 0.4532994329929352,
"learning_rate": 1.6772729136897314e-07,
"loss": 0.0095,
"step": 1115
},
{
"epoch": 0.9223140495867769,
"grad_norm": 1.2381685972213745,
"learning_rate": 1.6423413329892168e-07,
"loss": 0.0225,
"step": 1116
},
{
"epoch": 0.9231404958677686,
"grad_norm": 0.33696889877319336,
"learning_rate": 1.6077712659244792e-07,
"loss": 0.0024,
"step": 1117
},
{
"epoch": 0.9239669421487603,
"grad_norm": 1.3697483539581299,
"learning_rate": 1.573562970935466e-07,
"loss": 0.1577,
"step": 1118
},
{
"epoch": 0.9247933884297521,
"grad_norm": 1.9480425119400024,
"learning_rate": 1.5397167037575823e-07,
"loss": 0.2501,
"step": 1119
},
{
"epoch": 0.9256198347107438,
"grad_norm": 1.6084264516830444,
"learning_rate": 1.5062327174197645e-07,
"loss": 0.239,
"step": 1120
},
{
"epoch": 0.9264462809917355,
"grad_norm": 1.976704478263855,
"learning_rate": 1.473111262242599e-07,
"loss": 0.2445,
"step": 1121
},
{
"epoch": 0.9272727272727272,
"grad_norm": 2.031085968017578,
"learning_rate": 1.4403525858364576e-07,
"loss": 0.218,
"step": 1122
},
{
"epoch": 0.928099173553719,
"grad_norm": 1.7920631170272827,
"learning_rate": 1.4079569330996412e-07,
"loss": 0.1471,
"step": 1123
},
{
"epoch": 0.9289256198347108,
"grad_norm": 1.3517217636108398,
"learning_rate": 1.3759245462165282e-07,
"loss": 0.0195,
"step": 1124
},
{
"epoch": 0.9297520661157025,
"grad_norm": 2.0068581104278564,
"learning_rate": 1.344255664655808e-07,
"loss": 0.0391,
"step": 1125
},
{
"epoch": 0.9305785123966942,
"grad_norm": 0.8138121962547302,
"learning_rate": 1.3129505251686603e-07,
"loss": 0.1505,
"step": 1126
},
{
"epoch": 0.9314049586776859,
"grad_norm": 0.7338080406188965,
"learning_rate": 1.2820093617869733e-07,
"loss": 0.1178,
"step": 1127
},
{
"epoch": 0.9322314049586777,
"grad_norm": 0.7904165983200073,
"learning_rate": 1.25143240582164e-07,
"loss": 0.1352,
"step": 1128
},
{
"epoch": 0.9330578512396694,
"grad_norm": 0.8109901547431946,
"learning_rate": 1.2212198858607694e-07,
"loss": 0.1341,
"step": 1129
},
{
"epoch": 0.9338842975206612,
"grad_norm": 0.9228838086128235,
"learning_rate": 1.191372027768034e-07,
"loss": 0.1339,
"step": 1130
},
{
"epoch": 0.9347107438016529,
"grad_norm": 0.7603052854537964,
"learning_rate": 1.1618890546809425e-07,
"loss": 0.1033,
"step": 1131
},
{
"epoch": 0.9355371900826446,
"grad_norm": 0.24581223726272583,
"learning_rate": 1.1327711870091963e-07,
"loss": 0.0024,
"step": 1132
},
{
"epoch": 0.9363636363636364,
"grad_norm": 0.3206579387187958,
"learning_rate": 1.1040186424330191e-07,
"loss": 0.0037,
"step": 1133
},
{
"epoch": 0.9371900826446281,
"grad_norm": 1.1099294424057007,
"learning_rate": 1.0756316359015528e-07,
"loss": 0.0281,
"step": 1134
},
{
"epoch": 0.9380165289256198,
"grad_norm": 0.13626572489738464,
"learning_rate": 1.0476103796312254e-07,
"loss": 0.0023,
"step": 1135
},
{
"epoch": 0.9388429752066115,
"grad_norm": 0.4995996057987213,
"learning_rate": 1.0199550831041904e-07,
"loss": 0.0095,
"step": 1136
},
{
"epoch": 0.9396694214876034,
"grad_norm": 0.7871412634849548,
"learning_rate": 9.926659530667294e-08,
"loss": 0.0049,
"step": 1137
},
{
"epoch": 0.9404958677685951,
"grad_norm": 2.2005467414855957,
"learning_rate": 9.657431935277629e-08,
"loss": 0.0149,
"step": 1138
},
{
"epoch": 0.9413223140495868,
"grad_norm": 0.9672695398330688,
"learning_rate": 9.391870057572527e-08,
"loss": 0.0098,
"step": 1139
},
{
"epoch": 0.9421487603305785,
"grad_norm": 0.46287235617637634,
"learning_rate": 9.129975882847363e-08,
"loss": 0.0023,
"step": 1140
},
{
"epoch": 0.9429752066115702,
"grad_norm": 0.8012919425964355,
"learning_rate": 8.871751368978554e-08,
"loss": 0.0055,
"step": 1141
},
{
"epoch": 0.943801652892562,
"grad_norm": 1.724612832069397,
"learning_rate": 8.617198446408736e-08,
"loss": 0.0535,
"step": 1142
},
{
"epoch": 0.9446280991735537,
"grad_norm": 1.8171989917755127,
"learning_rate": 8.366319018132229e-08,
"loss": 0.2232,
"step": 1143
},
{
"epoch": 0.9454545454545454,
"grad_norm": 2.0113537311553955,
"learning_rate": 8.119114959680929e-08,
"loss": 0.2322,
"step": 1144
},
{
"epoch": 0.9462809917355371,
"grad_norm": 4.474810600280762,
"learning_rate": 7.875588119110377e-08,
"loss": 0.2435,
"step": 1145
},
{
"epoch": 0.947107438016529,
"grad_norm": 2.493525266647339,
"learning_rate": 7.635740316985885e-08,
"loss": 0.2691,
"step": 1146
},
{
"epoch": 0.9479338842975207,
"grad_norm": 1.6824287176132202,
"learning_rate": 7.399573346368871e-08,
"loss": 0.2223,
"step": 1147
},
{
"epoch": 0.9487603305785124,
"grad_norm": 1.6271603107452393,
"learning_rate": 7.167088972803326e-08,
"loss": 0.1699,
"step": 1148
},
{
"epoch": 0.9495867768595041,
"grad_norm": 1.3928841352462769,
"learning_rate": 6.938288934303038e-08,
"loss": 0.0138,
"step": 1149
},
{
"epoch": 0.9504132231404959,
"grad_norm": 1.9528048038482666,
"learning_rate": 6.713174941338163e-08,
"loss": 0.0388,
"step": 1150
},
{
"epoch": 0.9512396694214876,
"grad_norm": 0.7475579380989075,
"learning_rate": 6.491748676822618e-08,
"loss": 0.1071,
"step": 1151
},
{
"epoch": 0.9520661157024793,
"grad_norm": 0.855401873588562,
"learning_rate": 6.274011796101598e-08,
"loss": 0.14,
"step": 1152
},
{
"epoch": 0.952892561983471,
"grad_norm": 0.7454251646995544,
"learning_rate": 6.05996592693886e-08,
"loss": 0.1126,
"step": 1153
},
{
"epoch": 0.9537190082644628,
"grad_norm": 0.8808317184448242,
"learning_rate": 5.849612669505067e-08,
"loss": 0.1207,
"step": 1154
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.48413702845573425,
"learning_rate": 5.642953596365408e-08,
"loss": 0.052,
"step": 1155
},
{
"epoch": 0.9553719008264463,
"grad_norm": 0.7997721433639526,
"learning_rate": 5.439990252467886e-08,
"loss": 0.0074,
"step": 1156
},
{
"epoch": 0.956198347107438,
"grad_norm": 1.7531133890151978,
"learning_rate": 5.2407241551320485e-08,
"loss": 0.0149,
"step": 1157
},
{
"epoch": 0.9570247933884297,
"grad_norm": 2.015205144882202,
"learning_rate": 5.0451567940373316e-08,
"loss": 0.023,
"step": 1158
},
{
"epoch": 0.9578512396694215,
"grad_norm": 0.574540376663208,
"learning_rate": 4.853289631212066e-08,
"loss": 0.0048,
"step": 1159
},
{
"epoch": 0.9586776859504132,
"grad_norm": 0.636974573135376,
"learning_rate": 4.6651241010226e-08,
"loss": 0.0141,
"step": 1160
},
{
"epoch": 0.959504132231405,
"grad_norm": 0.5378000736236572,
"learning_rate": 4.4806616101624176e-08,
"loss": 0.0081,
"step": 1161
},
{
"epoch": 0.9603305785123967,
"grad_norm": 0.7162144184112549,
"learning_rate": 4.299903537641703e-08,
"loss": 0.0133,
"step": 1162
},
{
"epoch": 0.9611570247933884,
"grad_norm": 0.23031413555145264,
"learning_rate": 4.122851234777181e-08,
"loss": 0.0021,
"step": 1163
},
{
"epoch": 0.9619834710743802,
"grad_norm": 1.0872267484664917,
"learning_rate": 3.949506025181626e-08,
"loss": 0.0097,
"step": 1164
},
{
"epoch": 0.9628099173553719,
"grad_norm": 0.7706602215766907,
"learning_rate": 3.779869204754427e-08,
"loss": 0.0175,
"step": 1165
},
{
"epoch": 0.9636363636363636,
"grad_norm": 0.4708734154701233,
"learning_rate": 3.613942041671703e-08,
"loss": 0.0054,
"step": 1166
},
{
"epoch": 0.9644628099173553,
"grad_norm": 1.5092543363571167,
"learning_rate": 3.451725776376647e-08,
"loss": 0.232,
"step": 1167
},
{
"epoch": 0.9652892561983472,
"grad_norm": 1.9864728450775146,
"learning_rate": 3.2932216215704195e-08,
"loss": 0.2257,
"step": 1168
},
{
"epoch": 0.9661157024793389,
"grad_norm": 3.1599490642547607,
"learning_rate": 3.138430762203215e-08,
"loss": 0.2442,
"step": 1169
},
{
"epoch": 0.9669421487603306,
"grad_norm": 1.5999311208724976,
"learning_rate": 2.9873543554652106e-08,
"loss": 0.2185,
"step": 1170
},
{
"epoch": 0.9677685950413223,
"grad_norm": 1.9940649271011353,
"learning_rate": 2.8399935307778516e-08,
"loss": 0.2622,
"step": 1171
},
{
"epoch": 0.968595041322314,
"grad_norm": 3.0313358306884766,
"learning_rate": 2.6963493897856906e-08,
"loss": 0.2309,
"step": 1172
},
{
"epoch": 0.9694214876033058,
"grad_norm": 2.1844117641448975,
"learning_rate": 2.5564230063478413e-08,
"loss": 0.1205,
"step": 1173
},
{
"epoch": 0.9702479338842975,
"grad_norm": 1.7357497215270996,
"learning_rate": 2.420215426530259e-08,
"loss": 0.0307,
"step": 1174
},
{
"epoch": 0.9710743801652892,
"grad_norm": 1.7013660669326782,
"learning_rate": 2.287727668597528e-08,
"loss": 0.0221,
"step": 1175
},
{
"epoch": 0.971900826446281,
"grad_norm": 0.6760327816009521,
"learning_rate": 2.1589607230056432e-08,
"loss": 0.1024,
"step": 1176
},
{
"epoch": 0.9727272727272728,
"grad_norm": 0.7643543481826782,
"learning_rate": 2.0339155523945164e-08,
"loss": 0.128,
"step": 1177
},
{
"epoch": 0.9735537190082645,
"grad_norm": 0.8929775357246399,
"learning_rate": 1.912593091580428e-08,
"loss": 0.1308,
"step": 1178
},
{
"epoch": 0.9743801652892562,
"grad_norm": 0.7659059762954712,
"learning_rate": 1.794994247549586e-08,
"loss": 0.1352,
"step": 1179
},
{
"epoch": 0.9752066115702479,
"grad_norm": 1.4182361364364624,
"learning_rate": 1.681119899450856e-08,
"loss": 0.0921,
"step": 1180
},
{
"epoch": 0.9760330578512396,
"grad_norm": 1.379634976387024,
"learning_rate": 1.5709708985895965e-08,
"loss": 0.0155,
"step": 1181
},
{
"epoch": 0.9768595041322314,
"grad_norm": 0.48459118604660034,
"learning_rate": 1.4645480684210011e-08,
"loss": 0.0034,
"step": 1182
},
{
"epoch": 0.9776859504132231,
"grad_norm": 0.9793571829795837,
"learning_rate": 1.3618522045439897e-08,
"loss": 0.0076,
"step": 1183
},
{
"epoch": 0.9785123966942149,
"grad_norm": 0.23963485658168793,
"learning_rate": 1.2628840746954362e-08,
"loss": 0.002,
"step": 1184
},
{
"epoch": 0.9793388429752066,
"grad_norm": 0.8089652061462402,
"learning_rate": 1.1676444187442848e-08,
"loss": 0.0082,
"step": 1185
},
{
"epoch": 0.9801652892561984,
"grad_norm": 0.12997634708881378,
"learning_rate": 1.0761339486859424e-08,
"loss": 0.0014,
"step": 1186
},
{
"epoch": 0.9809917355371901,
"grad_norm": 1.119914174079895,
"learning_rate": 9.883533486371721e-09,
"loss": 0.014,
"step": 1187
},
{
"epoch": 0.9818181818181818,
"grad_norm": 0.23493412137031555,
"learning_rate": 9.0430327483082e-09,
"loss": 0.0092,
"step": 1188
},
{
"epoch": 0.9826446280991735,
"grad_norm": 1.9524502754211426,
"learning_rate": 8.239843556108739e-09,
"loss": 0.0103,
"step": 1189
},
{
"epoch": 0.9834710743801653,
"grad_norm": 0.958412230014801,
"learning_rate": 7.473971914280786e-09,
"loss": 0.0131,
"step": 1190
},
{
"epoch": 0.984297520661157,
"grad_norm": 0.4178364872932434,
"learning_rate": 6.7454235483488395e-09,
"loss": 0.0058,
"step": 1191
},
{
"epoch": 0.9851239669421488,
"grad_norm": 1.409676194190979,
"learning_rate": 6.054203904817812e-09,
"loss": 0.0928,
"step": 1192
},
{
"epoch": 0.9859504132231405,
"grad_norm": 1.602060317993164,
"learning_rate": 5.400318151127515e-09,
"loss": 0.2004,
"step": 1193
},
{
"epoch": 0.9867768595041322,
"grad_norm": 1.7008346319198608,
"learning_rate": 4.783771175617124e-09,
"loss": 0.239,
"step": 1194
},
{
"epoch": 0.987603305785124,
"grad_norm": 2.2031989097595215,
"learning_rate": 4.204567587486885e-09,
"loss": 0.2539,
"step": 1195
},
{
"epoch": 0.9884297520661157,
"grad_norm": 2.037137508392334,
"learning_rate": 3.6627117167642447e-09,
"loss": 0.2378,
"step": 1196
},
{
"epoch": 0.9892561983471074,
"grad_norm": 2.461138963699341,
"learning_rate": 3.158207614272213e-09,
"loss": 0.2525,
"step": 1197
},
{
"epoch": 0.9900826446280991,
"grad_norm": 1.2552220821380615,
"learning_rate": 2.6910590515966117e-09,
"loss": 0.0538,
"step": 1198
},
{
"epoch": 0.990909090909091,
"grad_norm": 6.865564346313477,
"learning_rate": 2.2612695210616486e-09,
"loss": 0.0893,
"step": 1199
},
{
"epoch": 0.9917355371900827,
"grad_norm": 2.34413480758667,
"learning_rate": 1.8688422357004963e-09,
"loss": 0.0229,
"step": 1200
},
{
"epoch": 0.9925619834710744,
"grad_norm": 0.7144143581390381,
"learning_rate": 1.5137801292325338e-09,
"loss": 0.1281,
"step": 1201
},
{
"epoch": 0.9933884297520661,
"grad_norm": 0.8506456613540649,
"learning_rate": 1.1960858560416954e-09,
"loss": 0.1282,
"step": 1202
},
{
"epoch": 0.9942148760330578,
"grad_norm": 1.2273684740066528,
"learning_rate": 9.157617911570438e-10,
"loss": 0.0568,
"step": 1203
},
{
"epoch": 0.9950413223140496,
"grad_norm": 0.12330927699804306,
"learning_rate": 6.728100302327844e-10,
"loss": 0.0013,
"step": 1204
},
{
"epoch": 0.9958677685950413,
"grad_norm": 1.1877310276031494,
"learning_rate": 4.67232389535499e-10,
"loss": 0.0118,
"step": 1205
},
{
"epoch": 0.996694214876033,
"grad_norm": 0.7564953565597534,
"learning_rate": 2.99030405928602e-10,
"loss": 0.0049,
"step": 1206
},
{
"epoch": 0.9975206611570248,
"grad_norm": 1.135886311531067,
"learning_rate": 1.6820533686179308e-10,
"loss": 0.0813,
"step": 1207
},
{
"epoch": 0.9983471074380166,
"grad_norm": 2.0573935508728027,
"learning_rate": 7.475816036051075e-11,
"loss": 0.2733,
"step": 1208
},
{
"epoch": 0.9991735537190083,
"grad_norm": 1.533027172088623,
"learning_rate": 1.8689575020380825e-11,
"loss": 0.2453,
"step": 1209
},
{
"epoch": 1.0,
"grad_norm": 1.0230895280838013,
"learning_rate": 0.0,
"loss": 0.0737,
"step": 1210
},
{
"epoch": 1.0,
"step": 1210,
"total_flos": 56759944347648.0,
"train_loss": 0.2560723489493878,
"train_runtime": 6836.3212,
"train_samples_per_second": 2.831,
"train_steps_per_second": 0.177
}
],
"logging_steps": 1.0,
"max_steps": 1210,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 56759944347648.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}