openthoughts3_100k_lrlower / trainer_state.json
sedrickkeh's picture
End of training
89ea042 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.993178717598909,
"eval_steps": 500,
"global_step": 915,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005457025920873124,
"grad_norm": 8.40548507464911,
"learning_rate": 4.347826086956522e-07,
"loss": 1.4358,
"step": 1
},
{
"epoch": 0.010914051841746248,
"grad_norm": 8.27768117898077,
"learning_rate": 8.695652173913044e-07,
"loss": 1.4226,
"step": 2
},
{
"epoch": 0.01637107776261937,
"grad_norm": 8.34817859423518,
"learning_rate": 1.3043478260869566e-06,
"loss": 1.4277,
"step": 3
},
{
"epoch": 0.021828103683492497,
"grad_norm": 8.241415349829516,
"learning_rate": 1.7391304347826088e-06,
"loss": 1.4422,
"step": 4
},
{
"epoch": 0.027285129604365622,
"grad_norm": 7.847219720997312,
"learning_rate": 2.173913043478261e-06,
"loss": 1.4021,
"step": 5
},
{
"epoch": 0.03274215552523874,
"grad_norm": 6.490709031031551,
"learning_rate": 2.6086956521739132e-06,
"loss": 1.385,
"step": 6
},
{
"epoch": 0.03819918144611187,
"grad_norm": 5.970054048772479,
"learning_rate": 3.043478260869566e-06,
"loss": 1.3497,
"step": 7
},
{
"epoch": 0.04365620736698499,
"grad_norm": 3.386624409374888,
"learning_rate": 3.4782608695652175e-06,
"loss": 1.3161,
"step": 8
},
{
"epoch": 0.04911323328785812,
"grad_norm": 2.764488361103231,
"learning_rate": 3.91304347826087e-06,
"loss": 1.3038,
"step": 9
},
{
"epoch": 0.054570259208731244,
"grad_norm": 4.202620684729429,
"learning_rate": 4.347826086956522e-06,
"loss": 1.302,
"step": 10
},
{
"epoch": 0.06002728512960437,
"grad_norm": 6.086560136767305,
"learning_rate": 4.782608695652174e-06,
"loss": 1.3128,
"step": 11
},
{
"epoch": 0.06548431105047749,
"grad_norm": 6.073370326241115,
"learning_rate": 5.2173913043478265e-06,
"loss": 1.3088,
"step": 12
},
{
"epoch": 0.07094133697135062,
"grad_norm": 5.6470857345830945,
"learning_rate": 5.652173913043479e-06,
"loss": 1.2941,
"step": 13
},
{
"epoch": 0.07639836289222374,
"grad_norm": 4.138470108037422,
"learning_rate": 6.086956521739132e-06,
"loss": 1.2619,
"step": 14
},
{
"epoch": 0.08185538881309687,
"grad_norm": 3.7821082022725294,
"learning_rate": 6.521739130434783e-06,
"loss": 1.2451,
"step": 15
},
{
"epoch": 0.08731241473396999,
"grad_norm": 2.805651263629899,
"learning_rate": 6.956521739130435e-06,
"loss": 1.229,
"step": 16
},
{
"epoch": 0.0927694406548431,
"grad_norm": 1.760678215208644,
"learning_rate": 7.391304347826087e-06,
"loss": 1.1933,
"step": 17
},
{
"epoch": 0.09822646657571624,
"grad_norm": 1.471427773413438,
"learning_rate": 7.82608695652174e-06,
"loss": 1.2011,
"step": 18
},
{
"epoch": 0.10368349249658936,
"grad_norm": 1.6385745213912601,
"learning_rate": 8.260869565217392e-06,
"loss": 1.1761,
"step": 19
},
{
"epoch": 0.10914051841746249,
"grad_norm": 1.8107444245592996,
"learning_rate": 8.695652173913044e-06,
"loss": 1.1718,
"step": 20
},
{
"epoch": 0.1145975443383356,
"grad_norm": 1.5828548709190915,
"learning_rate": 9.130434782608697e-06,
"loss": 1.1686,
"step": 21
},
{
"epoch": 0.12005457025920874,
"grad_norm": 1.0833660495626252,
"learning_rate": 9.565217391304349e-06,
"loss": 1.1517,
"step": 22
},
{
"epoch": 0.12551159618008187,
"grad_norm": 1.0067497148092934,
"learning_rate": 1e-05,
"loss": 1.1596,
"step": 23
},
{
"epoch": 0.13096862210095497,
"grad_norm": 1.1511016542466355,
"learning_rate": 1.0434782608695653e-05,
"loss": 1.1185,
"step": 24
},
{
"epoch": 0.1364256480218281,
"grad_norm": 1.1129912962754631,
"learning_rate": 1.0869565217391305e-05,
"loss": 1.1044,
"step": 25
},
{
"epoch": 0.14188267394270124,
"grad_norm": 0.982424004614232,
"learning_rate": 1.1304347826086957e-05,
"loss": 1.1154,
"step": 26
},
{
"epoch": 0.14733969986357434,
"grad_norm": 0.8935088021503061,
"learning_rate": 1.1739130434782611e-05,
"loss": 1.0855,
"step": 27
},
{
"epoch": 0.15279672578444747,
"grad_norm": 0.9536160463503343,
"learning_rate": 1.2173913043478263e-05,
"loss": 1.0844,
"step": 28
},
{
"epoch": 0.1582537517053206,
"grad_norm": 0.7998848521277707,
"learning_rate": 1.2608695652173915e-05,
"loss": 1.0866,
"step": 29
},
{
"epoch": 0.16371077762619374,
"grad_norm": 0.7673685776489387,
"learning_rate": 1.3043478260869566e-05,
"loss": 1.1062,
"step": 30
},
{
"epoch": 0.16916780354706684,
"grad_norm": 0.9634656988616612,
"learning_rate": 1.3478260869565218e-05,
"loss": 1.0713,
"step": 31
},
{
"epoch": 0.17462482946793997,
"grad_norm": 0.7656758207809929,
"learning_rate": 1.391304347826087e-05,
"loss": 1.0804,
"step": 32
},
{
"epoch": 0.1800818553888131,
"grad_norm": 0.8925856020905516,
"learning_rate": 1.4347826086956522e-05,
"loss": 1.0791,
"step": 33
},
{
"epoch": 0.1855388813096862,
"grad_norm": 0.9604475518815286,
"learning_rate": 1.4782608695652174e-05,
"loss": 1.0743,
"step": 34
},
{
"epoch": 0.19099590723055934,
"grad_norm": 0.6673627257965253,
"learning_rate": 1.5217391304347828e-05,
"loss": 1.0827,
"step": 35
},
{
"epoch": 0.19645293315143247,
"grad_norm": 0.6466823830941191,
"learning_rate": 1.565217391304348e-05,
"loss": 1.074,
"step": 36
},
{
"epoch": 0.2019099590723056,
"grad_norm": 0.8141432611114343,
"learning_rate": 1.6086956521739132e-05,
"loss": 1.0935,
"step": 37
},
{
"epoch": 0.2073669849931787,
"grad_norm": 0.7303516492961905,
"learning_rate": 1.6521739130434785e-05,
"loss": 1.0733,
"step": 38
},
{
"epoch": 0.21282401091405184,
"grad_norm": 0.6688305682070583,
"learning_rate": 1.6956521739130437e-05,
"loss": 1.0424,
"step": 39
},
{
"epoch": 0.21828103683492497,
"grad_norm": 0.6750314584251758,
"learning_rate": 1.739130434782609e-05,
"loss": 1.039,
"step": 40
},
{
"epoch": 0.22373806275579808,
"grad_norm": 0.6414610193667182,
"learning_rate": 1.782608695652174e-05,
"loss": 1.0673,
"step": 41
},
{
"epoch": 0.2291950886766712,
"grad_norm": 0.6004774591873056,
"learning_rate": 1.8260869565217393e-05,
"loss": 1.0472,
"step": 42
},
{
"epoch": 0.23465211459754434,
"grad_norm": 1.001192284108013,
"learning_rate": 1.8695652173913045e-05,
"loss": 1.0668,
"step": 43
},
{
"epoch": 0.24010914051841747,
"grad_norm": 1.2347279701133878,
"learning_rate": 1.9130434782608697e-05,
"loss": 1.0515,
"step": 44
},
{
"epoch": 0.24556616643929058,
"grad_norm": 0.7545298666968991,
"learning_rate": 1.956521739130435e-05,
"loss": 1.0118,
"step": 45
},
{
"epoch": 0.25102319236016374,
"grad_norm": 1.7910029989682172,
"learning_rate": 2e-05,
"loss": 1.0403,
"step": 46
},
{
"epoch": 0.25648021828103684,
"grad_norm": 0.6646975296811077,
"learning_rate": 2.0434782608695657e-05,
"loss": 1.0404,
"step": 47
},
{
"epoch": 0.26193724420190995,
"grad_norm": 1.8544620284122977,
"learning_rate": 2.0869565217391306e-05,
"loss": 1.0452,
"step": 48
},
{
"epoch": 0.2673942701227831,
"grad_norm": 0.9159008525844143,
"learning_rate": 2.1304347826086958e-05,
"loss": 1.0352,
"step": 49
},
{
"epoch": 0.2728512960436562,
"grad_norm": 1.8651550264621868,
"learning_rate": 2.173913043478261e-05,
"loss": 1.0478,
"step": 50
},
{
"epoch": 0.2783083219645293,
"grad_norm": 1.2948016317604922,
"learning_rate": 2.2173913043478262e-05,
"loss": 1.0309,
"step": 51
},
{
"epoch": 0.2837653478854025,
"grad_norm": 1.5798033957703332,
"learning_rate": 2.2608695652173914e-05,
"loss": 1.0342,
"step": 52
},
{
"epoch": 0.2892223738062756,
"grad_norm": 1.6098370993374367,
"learning_rate": 2.3043478260869567e-05,
"loss": 1.0329,
"step": 53
},
{
"epoch": 0.2946793997271487,
"grad_norm": 1.2453365308049091,
"learning_rate": 2.3478260869565222e-05,
"loss": 1.0392,
"step": 54
},
{
"epoch": 0.30013642564802184,
"grad_norm": 1.5844016545843662,
"learning_rate": 2.391304347826087e-05,
"loss": 1.0125,
"step": 55
},
{
"epoch": 0.30559345156889495,
"grad_norm": 1.4627626883931912,
"learning_rate": 2.4347826086956526e-05,
"loss": 1.0499,
"step": 56
},
{
"epoch": 0.31105047748976805,
"grad_norm": 1.4143561739785215,
"learning_rate": 2.4782608695652175e-05,
"loss": 1.0281,
"step": 57
},
{
"epoch": 0.3165075034106412,
"grad_norm": 1.4070850348598627,
"learning_rate": 2.521739130434783e-05,
"loss": 1.0195,
"step": 58
},
{
"epoch": 0.3219645293315143,
"grad_norm": 1.078315144035433,
"learning_rate": 2.565217391304348e-05,
"loss": 1.0216,
"step": 59
},
{
"epoch": 0.3274215552523875,
"grad_norm": 1.1643154896766894,
"learning_rate": 2.608695652173913e-05,
"loss": 1.0097,
"step": 60
},
{
"epoch": 0.3328785811732606,
"grad_norm": 1.2461170132940949,
"learning_rate": 2.6521739130434784e-05,
"loss": 1.0209,
"step": 61
},
{
"epoch": 0.3383356070941337,
"grad_norm": 2.008669853014052,
"learning_rate": 2.6956521739130436e-05,
"loss": 1.0392,
"step": 62
},
{
"epoch": 0.34379263301500684,
"grad_norm": 1.1644314565358613,
"learning_rate": 2.739130434782609e-05,
"loss": 1.0117,
"step": 63
},
{
"epoch": 0.34924965893587995,
"grad_norm": 1.799658123734067,
"learning_rate": 2.782608695652174e-05,
"loss": 1.0083,
"step": 64
},
{
"epoch": 0.35470668485675305,
"grad_norm": 1.3978781458275338,
"learning_rate": 2.8260869565217396e-05,
"loss": 1.0082,
"step": 65
},
{
"epoch": 0.3601637107776262,
"grad_norm": 2.0511544765413583,
"learning_rate": 2.8695652173913044e-05,
"loss": 1.0248,
"step": 66
},
{
"epoch": 0.3656207366984993,
"grad_norm": 1.2539457961483822,
"learning_rate": 2.91304347826087e-05,
"loss": 1.021,
"step": 67
},
{
"epoch": 0.3710777626193724,
"grad_norm": 2.2336673036604777,
"learning_rate": 2.956521739130435e-05,
"loss": 0.9919,
"step": 68
},
{
"epoch": 0.3765347885402456,
"grad_norm": 1.9545303539529588,
"learning_rate": 3.0000000000000004e-05,
"loss": 1.0026,
"step": 69
},
{
"epoch": 0.3819918144611187,
"grad_norm": 1.715545568872597,
"learning_rate": 3.0434782608695656e-05,
"loss": 1.015,
"step": 70
},
{
"epoch": 0.3874488403819918,
"grad_norm": 1.9877889103452786,
"learning_rate": 3.086956521739131e-05,
"loss": 1.0161,
"step": 71
},
{
"epoch": 0.39290586630286495,
"grad_norm": 1.403829736078723,
"learning_rate": 3.130434782608696e-05,
"loss": 1.0094,
"step": 72
},
{
"epoch": 0.39836289222373805,
"grad_norm": 1.2911183724867115,
"learning_rate": 3.173913043478261e-05,
"loss": 1.0188,
"step": 73
},
{
"epoch": 0.4038199181446112,
"grad_norm": 1.725635724089668,
"learning_rate": 3.2173913043478265e-05,
"loss": 1.0233,
"step": 74
},
{
"epoch": 0.4092769440654843,
"grad_norm": 1.5524172183602378,
"learning_rate": 3.260869565217392e-05,
"loss": 1.0153,
"step": 75
},
{
"epoch": 0.4147339699863574,
"grad_norm": 1.6937536176639882,
"learning_rate": 3.304347826086957e-05,
"loss": 1.0087,
"step": 76
},
{
"epoch": 0.4201909959072306,
"grad_norm": 1.031762197478065,
"learning_rate": 3.347826086956522e-05,
"loss": 0.996,
"step": 77
},
{
"epoch": 0.4256480218281037,
"grad_norm": 1.2214301629403135,
"learning_rate": 3.391304347826087e-05,
"loss": 0.9952,
"step": 78
},
{
"epoch": 0.4311050477489768,
"grad_norm": 1.5276490268514877,
"learning_rate": 3.4347826086956526e-05,
"loss": 1.0261,
"step": 79
},
{
"epoch": 0.43656207366984995,
"grad_norm": 1.5857383204051316,
"learning_rate": 3.478260869565218e-05,
"loss": 1.0023,
"step": 80
},
{
"epoch": 0.44201909959072305,
"grad_norm": 1.7303601564744104,
"learning_rate": 3.521739130434783e-05,
"loss": 0.9994,
"step": 81
},
{
"epoch": 0.44747612551159616,
"grad_norm": 1.0635597436256417,
"learning_rate": 3.565217391304348e-05,
"loss": 1.0098,
"step": 82
},
{
"epoch": 0.4529331514324693,
"grad_norm": 3.079665829601424,
"learning_rate": 3.6086956521739134e-05,
"loss": 1.0082,
"step": 83
},
{
"epoch": 0.4583901773533424,
"grad_norm": 1.8491839328742012,
"learning_rate": 3.6521739130434786e-05,
"loss": 0.9895,
"step": 84
},
{
"epoch": 0.4638472032742155,
"grad_norm": 3.00445118120071,
"learning_rate": 3.695652173913044e-05,
"loss": 1.0142,
"step": 85
},
{
"epoch": 0.4693042291950887,
"grad_norm": 2.9140568598663514,
"learning_rate": 3.739130434782609e-05,
"loss": 1.0107,
"step": 86
},
{
"epoch": 0.4747612551159618,
"grad_norm": 2.411767008345035,
"learning_rate": 3.782608695652174e-05,
"loss": 1.017,
"step": 87
},
{
"epoch": 0.48021828103683495,
"grad_norm": 2.6524907076537505,
"learning_rate": 3.8260869565217395e-05,
"loss": 1.0203,
"step": 88
},
{
"epoch": 0.48567530695770805,
"grad_norm": 2.133767717181375,
"learning_rate": 3.869565217391305e-05,
"loss": 0.9971,
"step": 89
},
{
"epoch": 0.49113233287858116,
"grad_norm": 2.281609780939416,
"learning_rate": 3.91304347826087e-05,
"loss": 1.0142,
"step": 90
},
{
"epoch": 0.4965893587994543,
"grad_norm": 1.6150193464799012,
"learning_rate": 3.956521739130435e-05,
"loss": 0.9959,
"step": 91
},
{
"epoch": 0.5020463847203275,
"grad_norm": 2.1774944953837663,
"learning_rate": 4e-05,
"loss": 1.0178,
"step": 92
},
{
"epoch": 0.5075034106412005,
"grad_norm": 1.8522257265922395,
"learning_rate": 3.9999854286581316e-05,
"loss": 0.9939,
"step": 93
},
{
"epoch": 0.5129604365620737,
"grad_norm": 2.0976106058434145,
"learning_rate": 3.999941714844849e-05,
"loss": 0.9989,
"step": 94
},
{
"epoch": 0.5184174624829468,
"grad_norm": 2.033288450566452,
"learning_rate": 3.999868859197122e-05,
"loss": 1.014,
"step": 95
},
{
"epoch": 0.5238744884038199,
"grad_norm": 1.64201358795121,
"learning_rate": 3.999766862776556e-05,
"loss": 0.9962,
"step": 96
},
{
"epoch": 0.529331514324693,
"grad_norm": 1.9801453443085917,
"learning_rate": 3.999635727069373e-05,
"loss": 0.9898,
"step": 97
},
{
"epoch": 0.5347885402455662,
"grad_norm": 1.6558664282997086,
"learning_rate": 3.9994754539863984e-05,
"loss": 0.9937,
"step": 98
},
{
"epoch": 0.5402455661664393,
"grad_norm": 1.702235161068369,
"learning_rate": 3.999286045863026e-05,
"loss": 0.9922,
"step": 99
},
{
"epoch": 0.5457025920873124,
"grad_norm": 1.6798920660304613,
"learning_rate": 3.999067505459185e-05,
"loss": 0.9922,
"step": 100
},
{
"epoch": 0.5511596180081856,
"grad_norm": 1.14927571951454,
"learning_rate": 3.998819835959304e-05,
"loss": 0.959,
"step": 101
},
{
"epoch": 0.5566166439290586,
"grad_norm": 1.994460504643915,
"learning_rate": 3.998543040972259e-05,
"loss": 0.9896,
"step": 102
},
{
"epoch": 0.5620736698499318,
"grad_norm": 2.225475662267806,
"learning_rate": 3.998237124531324e-05,
"loss": 0.9838,
"step": 103
},
{
"epoch": 0.567530695770805,
"grad_norm": 0.9844824291875355,
"learning_rate": 3.9979020910941135e-05,
"loss": 0.9851,
"step": 104
},
{
"epoch": 0.572987721691678,
"grad_norm": 2.4932772876759817,
"learning_rate": 3.9975379455425126e-05,
"loss": 0.9843,
"step": 105
},
{
"epoch": 0.5784447476125512,
"grad_norm": 1.8029107577553645,
"learning_rate": 3.9971446931826116e-05,
"loss": 0.9991,
"step": 106
},
{
"epoch": 0.5839017735334243,
"grad_norm": 2.290841965389965,
"learning_rate": 3.996722339744625e-05,
"loss": 1.0061,
"step": 107
},
{
"epoch": 0.5893587994542974,
"grad_norm": 1.8729223351387532,
"learning_rate": 3.9962708913828086e-05,
"loss": 0.9968,
"step": 108
},
{
"epoch": 0.5948158253751705,
"grad_norm": 2.1962460148515826,
"learning_rate": 3.995790354675372e-05,
"loss": 1.0082,
"step": 109
},
{
"epoch": 0.6002728512960437,
"grad_norm": 1.9722134818162933,
"learning_rate": 3.995280736624378e-05,
"loss": 0.9975,
"step": 110
},
{
"epoch": 0.6057298772169167,
"grad_norm": 2.0118864615891394,
"learning_rate": 3.994742044655647e-05,
"loss": 0.9889,
"step": 111
},
{
"epoch": 0.6111869031377899,
"grad_norm": 2.090834428592416,
"learning_rate": 3.994174286618643e-05,
"loss": 1.0247,
"step": 112
},
{
"epoch": 0.616643929058663,
"grad_norm": 1.5840918067308427,
"learning_rate": 3.993577470786363e-05,
"loss": 0.9859,
"step": 113
},
{
"epoch": 0.6221009549795361,
"grad_norm": 1.4515746682829112,
"learning_rate": 3.9929516058552143e-05,
"loss": 0.9761,
"step": 114
},
{
"epoch": 0.6275579809004093,
"grad_norm": 1.504559871894639,
"learning_rate": 3.992296700944889e-05,
"loss": 0.975,
"step": 115
},
{
"epoch": 0.6330150068212824,
"grad_norm": 1.5927280628411824,
"learning_rate": 3.99161276559823e-05,
"loss": 0.9979,
"step": 116
},
{
"epoch": 0.6384720327421555,
"grad_norm": 1.60127479724257,
"learning_rate": 3.990899809781093e-05,
"loss": 0.9743,
"step": 117
},
{
"epoch": 0.6439290586630286,
"grad_norm": 1.991280239304608,
"learning_rate": 3.990157843882202e-05,
"loss": 0.981,
"step": 118
},
{
"epoch": 0.6493860845839018,
"grad_norm": 1.1853516129644286,
"learning_rate": 3.989386878712994e-05,
"loss": 0.9767,
"step": 119
},
{
"epoch": 0.654843110504775,
"grad_norm": 1.3174203496686017,
"learning_rate": 3.9885869255074674e-05,
"loss": 0.9904,
"step": 120
},
{
"epoch": 0.660300136425648,
"grad_norm": 1.5305788456446745,
"learning_rate": 3.987757995922014e-05,
"loss": 0.9925,
"step": 121
},
{
"epoch": 0.6657571623465212,
"grad_norm": 1.881540433498657,
"learning_rate": 3.9869001020352484e-05,
"loss": 1.0114,
"step": 122
},
{
"epoch": 0.6712141882673943,
"grad_norm": 1.1981839977942124,
"learning_rate": 3.9860132563478384e-05,
"loss": 0.9883,
"step": 123
},
{
"epoch": 0.6766712141882674,
"grad_norm": 2.0059502915759264,
"learning_rate": 3.985097471782313e-05,
"loss": 0.9939,
"step": 124
},
{
"epoch": 0.6821282401091405,
"grad_norm": 1.4590850776551136,
"learning_rate": 3.984152761682884e-05,
"loss": 0.9831,
"step": 125
},
{
"epoch": 0.6875852660300137,
"grad_norm": 1.1239926129461735,
"learning_rate": 3.983179139815245e-05,
"loss": 1.0005,
"step": 126
},
{
"epoch": 0.6930422919508867,
"grad_norm": 2.1543984625500836,
"learning_rate": 3.982176620366372e-05,
"loss": 0.9639,
"step": 127
},
{
"epoch": 0.6984993178717599,
"grad_norm": 1.8429479992055005,
"learning_rate": 3.98114521794432e-05,
"loss": 0.9941,
"step": 128
},
{
"epoch": 0.703956343792633,
"grad_norm": 1.0868587312281466,
"learning_rate": 3.9800849475780054e-05,
"loss": 1.0049,
"step": 129
},
{
"epoch": 0.7094133697135061,
"grad_norm": 2.118679896726006,
"learning_rate": 3.97899582471699e-05,
"loss": 0.9724,
"step": 130
},
{
"epoch": 0.7148703956343793,
"grad_norm": 1.1508635163751133,
"learning_rate": 3.977877865231256e-05,
"loss": 0.9917,
"step": 131
},
{
"epoch": 0.7203274215552524,
"grad_norm": 2.0430027109665905,
"learning_rate": 3.976731085410974e-05,
"loss": 0.9862,
"step": 132
},
{
"epoch": 0.7257844474761255,
"grad_norm": 1.4515209852628121,
"learning_rate": 3.975555501966263e-05,
"loss": 0.9895,
"step": 133
},
{
"epoch": 0.7312414733969986,
"grad_norm": 1.6287273596496654,
"learning_rate": 3.974351132026952e-05,
"loss": 0.9763,
"step": 134
},
{
"epoch": 0.7366984993178718,
"grad_norm": 1.5473296478105147,
"learning_rate": 3.973117993142327e-05,
"loss": 0.9817,
"step": 135
},
{
"epoch": 0.7421555252387448,
"grad_norm": 1.403531094420758,
"learning_rate": 3.9718561032808774e-05,
"loss": 0.9928,
"step": 136
},
{
"epoch": 0.747612551159618,
"grad_norm": 1.2777940256720086,
"learning_rate": 3.97056548083003e-05,
"loss": 0.9654,
"step": 137
},
{
"epoch": 0.7530695770804912,
"grad_norm": 2.2605652171854183,
"learning_rate": 3.9692461445958876e-05,
"loss": 0.98,
"step": 138
},
{
"epoch": 0.7585266030013642,
"grad_norm": 1.5555447148375732,
"learning_rate": 3.967898113802948e-05,
"loss": 0.973,
"step": 139
},
{
"epoch": 0.7639836289222374,
"grad_norm": 1.5712062419592667,
"learning_rate": 3.9665214080938294e-05,
"loss": 0.9837,
"step": 140
},
{
"epoch": 0.7694406548431105,
"grad_norm": 1.87211562183804,
"learning_rate": 3.9651160475289805e-05,
"loss": 1.0069,
"step": 141
},
{
"epoch": 0.7748976807639836,
"grad_norm": 1.684924662180551,
"learning_rate": 3.963682052586392e-05,
"loss": 0.9854,
"step": 142
},
{
"epoch": 0.7803547066848567,
"grad_norm": 1.1116249825439455,
"learning_rate": 3.962219444161294e-05,
"loss": 0.9808,
"step": 143
},
{
"epoch": 0.7858117326057299,
"grad_norm": 1.794929259692561,
"learning_rate": 3.960728243565853e-05,
"loss": 0.9826,
"step": 144
},
{
"epoch": 0.791268758526603,
"grad_norm": 1.4024768691530294,
"learning_rate": 3.959208472528863e-05,
"loss": 0.97,
"step": 145
},
{
"epoch": 0.7967257844474761,
"grad_norm": 1.5359858110261895,
"learning_rate": 3.957660153195428e-05,
"loss": 1.0029,
"step": 146
},
{
"epoch": 0.8021828103683493,
"grad_norm": 1.8187808557656198,
"learning_rate": 3.956083308126638e-05,
"loss": 0.9576,
"step": 147
},
{
"epoch": 0.8076398362892224,
"grad_norm": 1.4399907834108585,
"learning_rate": 3.954477960299241e-05,
"loss": 0.9612,
"step": 148
},
{
"epoch": 0.8130968622100955,
"grad_norm": 1.1465593393044229,
"learning_rate": 3.95284413310531e-05,
"loss": 0.9936,
"step": 149
},
{
"epoch": 0.8185538881309686,
"grad_norm": 1.458918663058527,
"learning_rate": 3.9511818503518985e-05,
"loss": 0.9813,
"step": 150
},
{
"epoch": 0.8240109140518418,
"grad_norm": 2.6076181813742476,
"learning_rate": 3.949491136260698e-05,
"loss": 0.9798,
"step": 151
},
{
"epoch": 0.8294679399727148,
"grad_norm": 1.0393193084437864,
"learning_rate": 3.9477720154676806e-05,
"loss": 0.9722,
"step": 152
},
{
"epoch": 0.834924965893588,
"grad_norm": 3.9060717284201085,
"learning_rate": 3.9460245130227435e-05,
"loss": 0.9727,
"step": 153
},
{
"epoch": 0.8403819918144612,
"grad_norm": 3.3082527760716767,
"learning_rate": 3.9442486543893424e-05,
"loss": 0.9794,
"step": 154
},
{
"epoch": 0.8458390177353342,
"grad_norm": 2.4057404986106485,
"learning_rate": 3.94244446544412e-05,
"loss": 0.9837,
"step": 155
},
{
"epoch": 0.8512960436562074,
"grad_norm": 1.8476216743035543,
"learning_rate": 3.94061197247653e-05,
"loss": 0.978,
"step": 156
},
{
"epoch": 0.8567530695770805,
"grad_norm": 2.400979620356147,
"learning_rate": 3.9387512021884555e-05,
"loss": 0.981,
"step": 157
},
{
"epoch": 0.8622100954979536,
"grad_norm": 2.1733630004298643,
"learning_rate": 3.936862181693815e-05,
"loss": 0.9776,
"step": 158
},
{
"epoch": 0.8676671214188267,
"grad_norm": 1.8102603434505127,
"learning_rate": 3.934944938518172e-05,
"loss": 0.9937,
"step": 159
},
{
"epoch": 0.8731241473396999,
"grad_norm": 2.0904632664136913,
"learning_rate": 3.932999500598333e-05,
"loss": 0.9577,
"step": 160
},
{
"epoch": 0.878581173260573,
"grad_norm": 1.72487012815194,
"learning_rate": 3.931025896281939e-05,
"loss": 0.9885,
"step": 161
},
{
"epoch": 0.8840381991814461,
"grad_norm": 1.859132027046651,
"learning_rate": 3.929024154327052e-05,
"loss": 0.9768,
"step": 162
},
{
"epoch": 0.8894952251023193,
"grad_norm": 2.043990751240127,
"learning_rate": 3.926994303901739e-05,
"loss": 0.988,
"step": 163
},
{
"epoch": 0.8949522510231923,
"grad_norm": 1.2949639926877792,
"learning_rate": 3.9249363745836453e-05,
"loss": 0.9803,
"step": 164
},
{
"epoch": 0.9004092769440655,
"grad_norm": 1.95004872308144,
"learning_rate": 3.922850396359562e-05,
"loss": 0.9681,
"step": 165
},
{
"epoch": 0.9058663028649386,
"grad_norm": 1.5438513810678176,
"learning_rate": 3.92073639962499e-05,
"loss": 0.9832,
"step": 166
},
{
"epoch": 0.9113233287858117,
"grad_norm": 0.8915095612184046,
"learning_rate": 3.9185944151837006e-05,
"loss": 0.9933,
"step": 167
},
{
"epoch": 0.9167803547066848,
"grad_norm": 1.7381086459322714,
"learning_rate": 3.9164244742472795e-05,
"loss": 0.9923,
"step": 168
},
{
"epoch": 0.922237380627558,
"grad_norm": 1.5006202521018344,
"learning_rate": 3.914226608434678e-05,
"loss": 0.9803,
"step": 169
},
{
"epoch": 0.927694406548431,
"grad_norm": 1.7809759035226784,
"learning_rate": 3.912000849771751e-05,
"loss": 0.9845,
"step": 170
},
{
"epoch": 0.9331514324693042,
"grad_norm": 1.416880011606568,
"learning_rate": 3.909747230690789e-05,
"loss": 0.9813,
"step": 171
},
{
"epoch": 0.9386084583901774,
"grad_norm": 1.2752605112134887,
"learning_rate": 3.907465784030045e-05,
"loss": 0.979,
"step": 172
},
{
"epoch": 0.9440654843110505,
"grad_norm": 1.8931991472592369,
"learning_rate": 3.90515654303326e-05,
"loss": 0.9651,
"step": 173
},
{
"epoch": 0.9495225102319236,
"grad_norm": 1.0457088342185985,
"learning_rate": 3.902819541349171e-05,
"loss": 0.9575,
"step": 174
},
{
"epoch": 0.9549795361527967,
"grad_norm": 1.9658747343963177,
"learning_rate": 3.900454813031032e-05,
"loss": 0.9709,
"step": 175
},
{
"epoch": 0.9604365620736699,
"grad_norm": 1.5573294008142207,
"learning_rate": 3.898062392536106e-05,
"loss": 0.9852,
"step": 176
},
{
"epoch": 0.965893587994543,
"grad_norm": 1.7467537921928091,
"learning_rate": 3.895642314725169e-05,
"loss": 0.9671,
"step": 177
},
{
"epoch": 0.9713506139154161,
"grad_norm": 1.6127230465883864,
"learning_rate": 3.893194614862005e-05,
"loss": 0.969,
"step": 178
},
{
"epoch": 0.9768076398362893,
"grad_norm": 1.6603200328670693,
"learning_rate": 3.890719328612882e-05,
"loss": 0.9795,
"step": 179
},
{
"epoch": 0.9822646657571623,
"grad_norm": 1.6320378665613324,
"learning_rate": 3.888216492046045e-05,
"loss": 0.9553,
"step": 180
},
{
"epoch": 0.9877216916780355,
"grad_norm": 0.928699164443798,
"learning_rate": 3.88568614163118e-05,
"loss": 0.9844,
"step": 181
},
{
"epoch": 0.9931787175989086,
"grad_norm": 1.2989789969103307,
"learning_rate": 3.883128314238888e-05,
"loss": 0.9633,
"step": 182
},
{
"epoch": 0.9986357435197817,
"grad_norm": 1.5050415954099332,
"learning_rate": 3.880543047140146e-05,
"loss": 0.9832,
"step": 183
},
{
"epoch": 1.004092769440655,
"grad_norm": 3.1493232961865725,
"learning_rate": 3.877930378005766e-05,
"loss": 1.6761,
"step": 184
},
{
"epoch": 1.009549795361528,
"grad_norm": 1.045095816055446,
"learning_rate": 3.8752903449058414e-05,
"loss": 0.9363,
"step": 185
},
{
"epoch": 1.015006821282401,
"grad_norm": 1.7070253819059258,
"learning_rate": 3.872622986309198e-05,
"loss": 0.9788,
"step": 186
},
{
"epoch": 1.0204638472032743,
"grad_norm": 1.5326319060129026,
"learning_rate": 3.8699283410828304e-05,
"loss": 0.9738,
"step": 187
},
{
"epoch": 1.0259208731241474,
"grad_norm": 1.1087556695241214,
"learning_rate": 3.867206448491335e-05,
"loss": 0.974,
"step": 188
},
{
"epoch": 1.0313778990450204,
"grad_norm": 1.4845940458146507,
"learning_rate": 3.8644573481963386e-05,
"loss": 0.9676,
"step": 189
},
{
"epoch": 1.0368349249658937,
"grad_norm": 1.4362719095357956,
"learning_rate": 3.861681080255922e-05,
"loss": 0.9382,
"step": 190
},
{
"epoch": 1.0422919508867667,
"grad_norm": 1.4674385107699772,
"learning_rate": 3.858877685124034e-05,
"loss": 0.94,
"step": 191
},
{
"epoch": 1.0477489768076398,
"grad_norm": 1.084446006406934,
"learning_rate": 3.8560472036499044e-05,
"loss": 0.9548,
"step": 192
},
{
"epoch": 1.053206002728513,
"grad_norm": 1.7424024173389683,
"learning_rate": 3.8531896770774454e-05,
"loss": 0.966,
"step": 193
},
{
"epoch": 1.058663028649386,
"grad_norm": 1.7927777941962322,
"learning_rate": 3.8503051470446544e-05,
"loss": 0.9371,
"step": 194
},
{
"epoch": 1.0641200545702592,
"grad_norm": 0.8728719723252784,
"learning_rate": 3.847393655583004e-05,
"loss": 0.9778,
"step": 195
},
{
"epoch": 1.0695770804911324,
"grad_norm": 1.5459212237514233,
"learning_rate": 3.844455245116832e-05,
"loss": 0.9714,
"step": 196
},
{
"epoch": 1.0750341064120055,
"grad_norm": 1.723318009783005,
"learning_rate": 3.8414899584627223e-05,
"loss": 0.9483,
"step": 197
},
{
"epoch": 1.0804911323328785,
"grad_norm": 1.6105441502277638,
"learning_rate": 3.838497838828879e-05,
"loss": 0.9529,
"step": 198
},
{
"epoch": 1.0859481582537518,
"grad_norm": 1.235861043156412,
"learning_rate": 3.835478929814502e-05,
"loss": 0.9714,
"step": 199
},
{
"epoch": 1.0914051841746248,
"grad_norm": 1.5553009472910362,
"learning_rate": 3.8324332754091447e-05,
"loss": 0.9499,
"step": 200
},
{
"epoch": 1.096862210095498,
"grad_norm": 1.9631947357338404,
"learning_rate": 3.82936091999208e-05,
"loss": 0.9481,
"step": 201
},
{
"epoch": 1.1023192360163712,
"grad_norm": 0.771286766088072,
"learning_rate": 3.826261908331649e-05,
"loss": 0.9528,
"step": 202
},
{
"epoch": 1.1077762619372442,
"grad_norm": 1.8335561541725196,
"learning_rate": 3.8231362855846105e-05,
"loss": 0.9498,
"step": 203
},
{
"epoch": 1.1132332878581173,
"grad_norm": 1.8424106742867963,
"learning_rate": 3.8199840972954806e-05,
"loss": 0.9476,
"step": 204
},
{
"epoch": 1.1186903137789905,
"grad_norm": 0.7950788375956499,
"learning_rate": 3.816805389395873e-05,
"loss": 0.9422,
"step": 205
},
{
"epoch": 1.1241473396998636,
"grad_norm": 1.6715342615720261,
"learning_rate": 3.813600208203828e-05,
"loss": 0.9652,
"step": 206
},
{
"epoch": 1.1296043656207366,
"grad_norm": 1.0978850847460873,
"learning_rate": 3.810368600423136e-05,
"loss": 0.9578,
"step": 207
},
{
"epoch": 1.13506139154161,
"grad_norm": 2.252408921193313,
"learning_rate": 3.8071106131426586e-05,
"loss": 0.9667,
"step": 208
},
{
"epoch": 1.140518417462483,
"grad_norm": 1.1643241254847931,
"learning_rate": 3.803826293835642e-05,
"loss": 0.9514,
"step": 209
},
{
"epoch": 1.145975443383356,
"grad_norm": 1.9506655247258313,
"learning_rate": 3.8005156903590265e-05,
"loss": 0.9436,
"step": 210
},
{
"epoch": 1.1514324693042293,
"grad_norm": 1.6736581284768521,
"learning_rate": 3.797178850952747e-05,
"loss": 0.9563,
"step": 211
},
{
"epoch": 1.1568894952251023,
"grad_norm": 1.698498967382254,
"learning_rate": 3.79381582423903e-05,
"loss": 0.96,
"step": 212
},
{
"epoch": 1.1623465211459754,
"grad_norm": 1.4463473539957177,
"learning_rate": 3.790426659221689e-05,
"loss": 0.9583,
"step": 213
},
{
"epoch": 1.1678035470668486,
"grad_norm": 1.996119225700199,
"learning_rate": 3.7870114052854056e-05,
"loss": 0.9686,
"step": 214
},
{
"epoch": 1.1732605729877217,
"grad_norm": 1.2453858458138212,
"learning_rate": 3.783570112195013e-05,
"loss": 0.9476,
"step": 215
},
{
"epoch": 1.1787175989085947,
"grad_norm": 1.9429791252993835,
"learning_rate": 3.780102830094768e-05,
"loss": 0.9633,
"step": 216
},
{
"epoch": 1.184174624829468,
"grad_norm": 1.7144005781733527,
"learning_rate": 3.7766096095076236e-05,
"loss": 0.9452,
"step": 217
},
{
"epoch": 1.189631650750341,
"grad_norm": 1.2919220781788054,
"learning_rate": 3.7730905013344925e-05,
"loss": 0.9505,
"step": 218
},
{
"epoch": 1.195088676671214,
"grad_norm": 1.7283120463695893,
"learning_rate": 3.7695455568535015e-05,
"loss": 0.9583,
"step": 219
},
{
"epoch": 1.2005457025920874,
"grad_norm": 1.2984823063070836,
"learning_rate": 3.76597482771925e-05,
"loss": 0.925,
"step": 220
},
{
"epoch": 1.2060027285129604,
"grad_norm": 1.2101553255952835,
"learning_rate": 3.7623783659620515e-05,
"loss": 0.9671,
"step": 221
},
{
"epoch": 1.2114597544338335,
"grad_norm": 1.9193420409227075,
"learning_rate": 3.7587562239871804e-05,
"loss": 0.9713,
"step": 222
},
{
"epoch": 1.2169167803547067,
"grad_norm": 1.145139436855805,
"learning_rate": 3.755108454574107e-05,
"loss": 0.9688,
"step": 223
},
{
"epoch": 1.2223738062755798,
"grad_norm": 2.3369999491203814,
"learning_rate": 3.751435110875724e-05,
"loss": 0.966,
"step": 224
},
{
"epoch": 1.2278308321964528,
"grad_norm": 1.6283559400501786,
"learning_rate": 3.7477362464175794e-05,
"loss": 0.9629,
"step": 225
},
{
"epoch": 1.233287858117326,
"grad_norm": 2.1896432971296447,
"learning_rate": 3.7440119150970924e-05,
"loss": 0.967,
"step": 226
},
{
"epoch": 1.2387448840381992,
"grad_norm": 1.4314027126167852,
"learning_rate": 3.7402621711827675e-05,
"loss": 0.9391,
"step": 227
},
{
"epoch": 1.2442019099590724,
"grad_norm": 2.448680005865948,
"learning_rate": 3.7364870693134044e-05,
"loss": 0.9791,
"step": 228
},
{
"epoch": 1.2496589358799455,
"grad_norm": 1.988787930308905,
"learning_rate": 3.732686664497304e-05,
"loss": 0.9678,
"step": 229
},
{
"epoch": 1.2551159618008185,
"grad_norm": 2.063824899367631,
"learning_rate": 3.7288610121114634e-05,
"loss": 0.9617,
"step": 230
},
{
"epoch": 1.2605729877216918,
"grad_norm": 1.7243515110002714,
"learning_rate": 3.725010167900772e-05,
"loss": 0.9533,
"step": 231
},
{
"epoch": 1.2660300136425648,
"grad_norm": 1.8647332677788166,
"learning_rate": 3.721134187977197e-05,
"loss": 0.9563,
"step": 232
},
{
"epoch": 1.271487039563438,
"grad_norm": 1.636320006353433,
"learning_rate": 3.7172331288189667e-05,
"loss": 0.9568,
"step": 233
},
{
"epoch": 1.2769440654843112,
"grad_norm": 1.7187722452357803,
"learning_rate": 3.713307047269748e-05,
"loss": 0.9538,
"step": 234
},
{
"epoch": 1.2824010914051842,
"grad_norm": 1.5589845753526528,
"learning_rate": 3.7093560005378175e-05,
"loss": 0.9426,
"step": 235
},
{
"epoch": 1.2878581173260573,
"grad_norm": 1.8373924763108647,
"learning_rate": 3.705380046195228e-05,
"loss": 0.9244,
"step": 236
},
{
"epoch": 1.2933151432469305,
"grad_norm": 1.3882254378197982,
"learning_rate": 3.701379242176969e-05,
"loss": 0.9498,
"step": 237
},
{
"epoch": 1.2987721691678036,
"grad_norm": 1.7021142374331253,
"learning_rate": 3.697353646780124e-05,
"loss": 0.9434,
"step": 238
},
{
"epoch": 1.3042291950886766,
"grad_norm": 1.3543258636289206,
"learning_rate": 3.693303318663019e-05,
"loss": 0.9543,
"step": 239
},
{
"epoch": 1.30968622100955,
"grad_norm": 1.6810213439521031,
"learning_rate": 3.689228316844371e-05,
"loss": 0.9462,
"step": 240
},
{
"epoch": 1.315143246930423,
"grad_norm": 1.3377038870303093,
"learning_rate": 3.685128700702423e-05,
"loss": 0.9576,
"step": 241
},
{
"epoch": 1.320600272851296,
"grad_norm": 1.5727626762086575,
"learning_rate": 3.681004529974085e-05,
"loss": 0.9583,
"step": 242
},
{
"epoch": 1.3260572987721693,
"grad_norm": 1.2786793127927039,
"learning_rate": 3.676855864754057e-05,
"loss": 0.9357,
"step": 243
},
{
"epoch": 1.3315143246930423,
"grad_norm": 1.648396462433026,
"learning_rate": 3.67268276549396e-05,
"loss": 0.9735,
"step": 244
},
{
"epoch": 1.3369713506139154,
"grad_norm": 1.2216794004695668,
"learning_rate": 3.668485293001448e-05,
"loss": 0.9741,
"step": 245
},
{
"epoch": 1.3424283765347886,
"grad_norm": 1.5971696430835944,
"learning_rate": 3.664263508439329e-05,
"loss": 0.9484,
"step": 246
},
{
"epoch": 1.3478854024556617,
"grad_norm": 1.3024833094157782,
"learning_rate": 3.660017473324669e-05,
"loss": 0.9406,
"step": 247
},
{
"epoch": 1.3533424283765347,
"grad_norm": 1.5316788751229022,
"learning_rate": 3.655747249527897e-05,
"loss": 0.9601,
"step": 248
},
{
"epoch": 1.358799454297408,
"grad_norm": 1.5547319797496317,
"learning_rate": 3.6514528992719044e-05,
"loss": 0.9474,
"step": 249
},
{
"epoch": 1.364256480218281,
"grad_norm": 1.206667830823351,
"learning_rate": 3.6471344851311356e-05,
"loss": 0.9502,
"step": 250
},
{
"epoch": 1.369713506139154,
"grad_norm": 1.2600525155706597,
"learning_rate": 3.64279207003068e-05,
"loss": 0.9452,
"step": 251
},
{
"epoch": 1.3751705320600274,
"grad_norm": 1.3484101306757132,
"learning_rate": 3.638425717245353e-05,
"loss": 0.9502,
"step": 252
},
{
"epoch": 1.3806275579809004,
"grad_norm": 1.2235801669480915,
"learning_rate": 3.634035490398774e-05,
"loss": 0.9384,
"step": 253
},
{
"epoch": 1.3860845839017735,
"grad_norm": 1.5485793543675035,
"learning_rate": 3.629621453462438e-05,
"loss": 0.959,
"step": 254
},
{
"epoch": 1.3915416098226467,
"grad_norm": 1.4002101413586943,
"learning_rate": 3.625183670754787e-05,
"loss": 0.9472,
"step": 255
},
{
"epoch": 1.3969986357435198,
"grad_norm": 0.9434127178746972,
"learning_rate": 3.6207222069402696e-05,
"loss": 0.9455,
"step": 256
},
{
"epoch": 1.4024556616643928,
"grad_norm": 0.9858801112297753,
"learning_rate": 3.6162371270284004e-05,
"loss": 0.9436,
"step": 257
},
{
"epoch": 1.407912687585266,
"grad_norm": 1.3469345939907027,
"learning_rate": 3.611728496372813e-05,
"loss": 0.9368,
"step": 258
},
{
"epoch": 1.4133697135061392,
"grad_norm": 1.8149253369471827,
"learning_rate": 3.6071963806703054e-05,
"loss": 0.9427,
"step": 259
},
{
"epoch": 1.4188267394270122,
"grad_norm": 0.7473132379864426,
"learning_rate": 3.6026408459598844e-05,
"loss": 0.9638,
"step": 260
},
{
"epoch": 1.4242837653478855,
"grad_norm": 1.6128737568835454,
"learning_rate": 3.598061958621804e-05,
"loss": 0.9557,
"step": 261
},
{
"epoch": 1.4297407912687585,
"grad_norm": 1.4020351576310623,
"learning_rate": 3.593459785376597e-05,
"loss": 0.9421,
"step": 262
},
{
"epoch": 1.4351978171896316,
"grad_norm": 1.2945719219835932,
"learning_rate": 3.5888343932841035e-05,
"loss": 0.9532,
"step": 263
},
{
"epoch": 1.4406548431105048,
"grad_norm": 1.2851599043172512,
"learning_rate": 3.584185849742492e-05,
"loss": 0.9307,
"step": 264
},
{
"epoch": 1.446111869031378,
"grad_norm": 1.2427656903613609,
"learning_rate": 3.579514222487281e-05,
"loss": 0.9538,
"step": 265
},
{
"epoch": 1.451568894952251,
"grad_norm": 1.2877332071545373,
"learning_rate": 3.5748195795903474e-05,
"loss": 0.9339,
"step": 266
},
{
"epoch": 1.4570259208731242,
"grad_norm": 1.198006739181478,
"learning_rate": 3.5701019894589376e-05,
"loss": 0.9512,
"step": 267
},
{
"epoch": 1.4624829467939973,
"grad_norm": 1.5795004337836194,
"learning_rate": 3.565361520834671e-05,
"loss": 0.9448,
"step": 268
},
{
"epoch": 1.4679399727148703,
"grad_norm": 1.1556792865151078,
"learning_rate": 3.5605982427925356e-05,
"loss": 0.9332,
"step": 269
},
{
"epoch": 1.4733969986357436,
"grad_norm": 0.5930547881100959,
"learning_rate": 3.555812224739884e-05,
"loss": 0.9613,
"step": 270
},
{
"epoch": 1.4788540245566166,
"grad_norm": 1.4579608488740115,
"learning_rate": 3.5510035364154236e-05,
"loss": 0.957,
"step": 271
},
{
"epoch": 1.4843110504774897,
"grad_norm": 0.9399997272018373,
"learning_rate": 3.5461722478881935e-05,
"loss": 0.9362,
"step": 272
},
{
"epoch": 1.489768076398363,
"grad_norm": 1.181780640902133,
"learning_rate": 3.541318429556552e-05,
"loss": 0.9304,
"step": 273
},
{
"epoch": 1.495225102319236,
"grad_norm": 2.438002638433228,
"learning_rate": 3.5364421521471443e-05,
"loss": 0.9539,
"step": 274
},
{
"epoch": 1.500682128240109,
"grad_norm": 0.9264166142215685,
"learning_rate": 3.531543486713877e-05,
"loss": 0.9592,
"step": 275
},
{
"epoch": 1.5061391541609823,
"grad_norm": 4.380791625672203,
"learning_rate": 3.5266225046368765e-05,
"loss": 0.9625,
"step": 276
},
{
"epoch": 1.5115961800818554,
"grad_norm": 4.119745847530299,
"learning_rate": 3.521679277621457e-05,
"loss": 0.9811,
"step": 277
},
{
"epoch": 1.5170532060027284,
"grad_norm": 1.3888384210153164,
"learning_rate": 3.5167138776970686e-05,
"loss": 0.9344,
"step": 278
},
{
"epoch": 1.5225102319236017,
"grad_norm": 3.242363274569884,
"learning_rate": 3.5117263772162515e-05,
"loss": 0.9699,
"step": 279
},
{
"epoch": 1.5279672578444747,
"grad_norm": 2.131900747816542,
"learning_rate": 3.5067168488535794e-05,
"loss": 0.9899,
"step": 280
},
{
"epoch": 1.5334242837653478,
"grad_norm": 3.1589070088722515,
"learning_rate": 3.501685365604604e-05,
"loss": 0.9546,
"step": 281
},
{
"epoch": 1.538881309686221,
"grad_norm": 2.6438273574397404,
"learning_rate": 3.496632000784787e-05,
"loss": 0.9694,
"step": 282
},
{
"epoch": 1.544338335607094,
"grad_norm": 2.0669427502395594,
"learning_rate": 3.4915568280284335e-05,
"loss": 0.9452,
"step": 283
},
{
"epoch": 1.5497953615279672,
"grad_norm": 2.1718089915480014,
"learning_rate": 3.4864599212876234e-05,
"loss": 0.9454,
"step": 284
},
{
"epoch": 1.5552523874488404,
"grad_norm": 2.0439265869282193,
"learning_rate": 3.481341354831125e-05,
"loss": 0.9266,
"step": 285
},
{
"epoch": 1.5607094133697135,
"grad_norm": 1.7375290887295285,
"learning_rate": 3.476201203243322e-05,
"loss": 0.9461,
"step": 286
},
{
"epoch": 1.5661664392905865,
"grad_norm": 1.7370946125028597,
"learning_rate": 3.4710395414231195e-05,
"loss": 0.9657,
"step": 287
},
{
"epoch": 1.5716234652114598,
"grad_norm": 1.403531131584409,
"learning_rate": 3.465856444582856e-05,
"loss": 0.9495,
"step": 288
},
{
"epoch": 1.5770804911323328,
"grad_norm": 1.4819115235994536,
"learning_rate": 3.460651988247208e-05,
"loss": 0.9617,
"step": 289
},
{
"epoch": 1.5825375170532059,
"grad_norm": 1.761856728208756,
"learning_rate": 3.4554262482520875e-05,
"loss": 0.921,
"step": 290
},
{
"epoch": 1.5879945429740792,
"grad_norm": 1.0191878209582437,
"learning_rate": 3.4501793007435394e-05,
"loss": 0.9447,
"step": 291
},
{
"epoch": 1.5934515688949522,
"grad_norm": 2.274348027783054,
"learning_rate": 3.444911222176629e-05,
"loss": 0.9497,
"step": 292
},
{
"epoch": 1.5989085948158253,
"grad_norm": 1.5339383301336882,
"learning_rate": 3.43962208931433e-05,
"loss": 0.9669,
"step": 293
},
{
"epoch": 1.6043656207366985,
"grad_norm": 2.550276251211631,
"learning_rate": 3.434311979226406e-05,
"loss": 0.956,
"step": 294
},
{
"epoch": 1.6098226466575716,
"grad_norm": 1.7875909094899942,
"learning_rate": 3.428980969288287e-05,
"loss": 0.9495,
"step": 295
},
{
"epoch": 1.6152796725784446,
"grad_norm": 2.823228050378481,
"learning_rate": 3.42362913717994e-05,
"loss": 0.9362,
"step": 296
},
{
"epoch": 1.620736698499318,
"grad_norm": 2.4678216750780857,
"learning_rate": 3.41825656088474e-05,
"loss": 0.9386,
"step": 297
},
{
"epoch": 1.626193724420191,
"grad_norm": 1.9114157924579258,
"learning_rate": 3.4128633186883346e-05,
"loss": 0.9576,
"step": 298
},
{
"epoch": 1.631650750341064,
"grad_norm": 1.8379349077219813,
"learning_rate": 3.407449489177499e-05,
"loss": 0.9479,
"step": 299
},
{
"epoch": 1.6371077762619373,
"grad_norm": 1.852909901213652,
"learning_rate": 3.4020151512389924e-05,
"loss": 0.9279,
"step": 300
},
{
"epoch": 1.6425648021828103,
"grad_norm": 1.3420457124335345,
"learning_rate": 3.396560384058413e-05,
"loss": 0.9298,
"step": 301
},
{
"epoch": 1.6480218281036834,
"grad_norm": 2.1617773929000172,
"learning_rate": 3.391085267119037e-05,
"loss": 0.9225,
"step": 302
},
{
"epoch": 1.6534788540245566,
"grad_norm": 1.316355967958462,
"learning_rate": 3.3855898802006644e-05,
"loss": 0.9342,
"step": 303
},
{
"epoch": 1.65893587994543,
"grad_norm": 2.453815979459407,
"learning_rate": 3.380074303378458e-05,
"loss": 0.9394,
"step": 304
},
{
"epoch": 1.6643929058663027,
"grad_norm": 1.748815933891966,
"learning_rate": 3.374538617021773e-05,
"loss": 0.9315,
"step": 305
},
{
"epoch": 1.669849931787176,
"grad_norm": 2.5597232277901973,
"learning_rate": 3.3689829017929875e-05,
"loss": 0.9573,
"step": 306
},
{
"epoch": 1.6753069577080493,
"grad_norm": 2.368134432470627,
"learning_rate": 3.363407238646327e-05,
"loss": 0.9494,
"step": 307
},
{
"epoch": 1.680763983628922,
"grad_norm": 1.724634315811694,
"learning_rate": 3.357811708826686e-05,
"loss": 0.9407,
"step": 308
},
{
"epoch": 1.6862210095497954,
"grad_norm": 1.8226179705374004,
"learning_rate": 3.352196393868442e-05,
"loss": 0.9495,
"step": 309
},
{
"epoch": 1.6916780354706686,
"grad_norm": 1.6945951192803632,
"learning_rate": 3.34656137559427e-05,
"loss": 0.9402,
"step": 310
},
{
"epoch": 1.6971350613915415,
"grad_norm": 1.402641679011377,
"learning_rate": 3.3409067361139464e-05,
"loss": 0.9191,
"step": 311
},
{
"epoch": 1.7025920873124147,
"grad_norm": 1.3467589645615918,
"learning_rate": 3.3352325578231565e-05,
"loss": 0.9636,
"step": 312
},
{
"epoch": 1.708049113233288,
"grad_norm": 1.25752862289665,
"learning_rate": 3.329538923402293e-05,
"loss": 0.9554,
"step": 313
},
{
"epoch": 1.7135061391541608,
"grad_norm": 0.986547181961436,
"learning_rate": 3.323825915815248e-05,
"loss": 0.9305,
"step": 314
},
{
"epoch": 1.718963165075034,
"grad_norm": 1.4979513167093783,
"learning_rate": 3.31809361830821e-05,
"loss": 0.9567,
"step": 315
},
{
"epoch": 1.7244201909959074,
"grad_norm": 0.7937925026119881,
"learning_rate": 3.312342114408444e-05,
"loss": 0.9458,
"step": 316
},
{
"epoch": 1.7298772169167802,
"grad_norm": 1.8876612539551143,
"learning_rate": 3.30657148792308e-05,
"loss": 0.9649,
"step": 317
},
{
"epoch": 1.7353342428376535,
"grad_norm": 1.226595551778844,
"learning_rate": 3.3007818229378896e-05,
"loss": 0.9643,
"step": 318
},
{
"epoch": 1.7407912687585267,
"grad_norm": 2.213786521631912,
"learning_rate": 3.29497320381606e-05,
"loss": 0.9584,
"step": 319
},
{
"epoch": 1.7462482946793996,
"grad_norm": 2.1570819482352235,
"learning_rate": 3.2891457151969675e-05,
"loss": 0.9531,
"step": 320
},
{
"epoch": 1.7517053206002728,
"grad_norm": 1.4381280543608101,
"learning_rate": 3.2832994419949393e-05,
"loss": 0.9421,
"step": 321
},
{
"epoch": 1.7571623465211461,
"grad_norm": 1.737184951842976,
"learning_rate": 3.277434469398022e-05,
"loss": 0.9416,
"step": 322
},
{
"epoch": 1.762619372442019,
"grad_norm": 1.632657953412784,
"learning_rate": 3.2715508828667366e-05,
"loss": 0.9321,
"step": 323
},
{
"epoch": 1.7680763983628922,
"grad_norm": 1.488744578094212,
"learning_rate": 3.265648768132834e-05,
"loss": 0.9365,
"step": 324
},
{
"epoch": 1.7735334242837655,
"grad_norm": 1.6336003571844502,
"learning_rate": 3.2597282111980444e-05,
"loss": 0.9515,
"step": 325
},
{
"epoch": 1.7789904502046383,
"grad_norm": 1.4154118064331849,
"learning_rate": 3.253789298332828e-05,
"loss": 0.9641,
"step": 326
},
{
"epoch": 1.7844474761255116,
"grad_norm": 1.5366612747550772,
"learning_rate": 3.2478321160751134e-05,
"loss": 0.9456,
"step": 327
},
{
"epoch": 1.7899045020463848,
"grad_norm": 1.3450928156923279,
"learning_rate": 3.241856751229041e-05,
"loss": 0.9486,
"step": 328
},
{
"epoch": 1.795361527967258,
"grad_norm": 1.2765561802175178,
"learning_rate": 3.2358632908636955e-05,
"loss": 0.9567,
"step": 329
},
{
"epoch": 1.800818553888131,
"grad_norm": 1.162610958798,
"learning_rate": 3.229851822311834e-05,
"loss": 0.9288,
"step": 330
},
{
"epoch": 1.8062755798090042,
"grad_norm": 1.1625836925107373,
"learning_rate": 3.223822433168623e-05,
"loss": 0.9263,
"step": 331
},
{
"epoch": 1.8117326057298773,
"grad_norm": 0.8071251992329053,
"learning_rate": 3.217775211290351e-05,
"loss": 0.9482,
"step": 332
},
{
"epoch": 1.8171896316507503,
"grad_norm": 0.89790755928994,
"learning_rate": 3.211710244793156e-05,
"loss": 0.9173,
"step": 333
},
{
"epoch": 1.8226466575716236,
"grad_norm": 0.8356390239967052,
"learning_rate": 3.205627622051738e-05,
"loss": 0.9504,
"step": 334
},
{
"epoch": 1.8281036834924966,
"grad_norm": 0.6998885337784212,
"learning_rate": 3.199527431698073e-05,
"loss": 0.9459,
"step": 335
},
{
"epoch": 1.8335607094133697,
"grad_norm": 0.8727569735519537,
"learning_rate": 3.19340976262012e-05,
"loss": 0.9435,
"step": 336
},
{
"epoch": 1.839017735334243,
"grad_norm": 0.6362860972023866,
"learning_rate": 3.187274703960526e-05,
"loss": 0.9406,
"step": 337
},
{
"epoch": 1.844474761255116,
"grad_norm": 0.8761738610839735,
"learning_rate": 3.181122345115329e-05,
"loss": 0.9353,
"step": 338
},
{
"epoch": 1.849931787175989,
"grad_norm": 0.7208261657101167,
"learning_rate": 3.174952775732651e-05,
"loss": 0.9368,
"step": 339
},
{
"epoch": 1.8553888130968623,
"grad_norm": 0.8342099154714143,
"learning_rate": 3.1687660857114e-05,
"loss": 0.9515,
"step": 340
},
{
"epoch": 1.8608458390177354,
"grad_norm": 0.7588834066746923,
"learning_rate": 3.1625623651999485e-05,
"loss": 0.946,
"step": 341
},
{
"epoch": 1.8663028649386084,
"grad_norm": 0.7261790084313842,
"learning_rate": 3.1563417045948295e-05,
"loss": 0.9332,
"step": 342
},
{
"epoch": 1.8717598908594817,
"grad_norm": 0.5170313983982283,
"learning_rate": 3.150104194539417e-05,
"loss": 0.9305,
"step": 343
},
{
"epoch": 1.8772169167803547,
"grad_norm": 0.7727261576998418,
"learning_rate": 3.1438499259226e-05,
"loss": 0.9437,
"step": 344
},
{
"epoch": 1.8826739427012278,
"grad_norm": 1.0590324797396327,
"learning_rate": 3.137578989877466e-05,
"loss": 0.9496,
"step": 345
},
{
"epoch": 1.888130968622101,
"grad_norm": 0.7511992016971163,
"learning_rate": 3.131291477779968e-05,
"loss": 0.9556,
"step": 346
},
{
"epoch": 1.893587994542974,
"grad_norm": 1.081487500255035,
"learning_rate": 3.124987481247594e-05,
"loss": 0.9479,
"step": 347
},
{
"epoch": 1.8990450204638472,
"grad_norm": 1.4968005117001788,
"learning_rate": 3.118667092138033e-05,
"loss": 0.9214,
"step": 348
},
{
"epoch": 1.9045020463847204,
"grad_norm": 0.6464116981961434,
"learning_rate": 3.112330402547834e-05,
"loss": 0.9599,
"step": 349
},
{
"epoch": 1.9099590723055935,
"grad_norm": 1.1571751705071633,
"learning_rate": 3.10597750481107e-05,
"loss": 0.9438,
"step": 350
},
{
"epoch": 1.9154160982264665,
"grad_norm": 1.383173192553895,
"learning_rate": 3.099608491497983e-05,
"loss": 0.9369,
"step": 351
},
{
"epoch": 1.9208731241473398,
"grad_norm": 1.0142077195831358,
"learning_rate": 3.093223455413645e-05,
"loss": 0.9181,
"step": 352
},
{
"epoch": 1.9263301500682128,
"grad_norm": 1.146163334987763,
"learning_rate": 3.0868224895965996e-05,
"loss": 0.9396,
"step": 353
},
{
"epoch": 1.931787175989086,
"grad_norm": 0.6987837846263671,
"learning_rate": 3.080405687317507e-05,
"loss": 0.9303,
"step": 354
},
{
"epoch": 1.9372442019099592,
"grad_norm": 1.3380093833598752,
"learning_rate": 3.073973142077788e-05,
"loss": 0.9462,
"step": 355
},
{
"epoch": 1.9427012278308322,
"grad_norm": 0.6049244168030435,
"learning_rate": 3.067524947608258e-05,
"loss": 0.9187,
"step": 356
},
{
"epoch": 1.9481582537517053,
"grad_norm": 0.8098504286256158,
"learning_rate": 3.061061197867763e-05,
"loss": 0.9162,
"step": 357
},
{
"epoch": 1.9536152796725785,
"grad_norm": 0.7357777980477844,
"learning_rate": 3.05458198704181e-05,
"loss": 0.9344,
"step": 358
},
{
"epoch": 1.9590723055934516,
"grad_norm": 0.5713529931575109,
"learning_rate": 3.0480874095411946e-05,
"loss": 0.9515,
"step": 359
},
{
"epoch": 1.9645293315143246,
"grad_norm": 0.8373330331353604,
"learning_rate": 3.0415775600006267e-05,
"loss": 0.9546,
"step": 360
},
{
"epoch": 1.969986357435198,
"grad_norm": 0.6868147137493235,
"learning_rate": 3.035052533277349e-05,
"loss": 0.907,
"step": 361
},
{
"epoch": 1.975443383356071,
"grad_norm": 0.47372940490854243,
"learning_rate": 3.0285124244497576e-05,
"loss": 0.9246,
"step": 362
},
{
"epoch": 1.980900409276944,
"grad_norm": 0.6977343075907223,
"learning_rate": 3.0219573288160128e-05,
"loss": 0.9562,
"step": 363
},
{
"epoch": 1.9863574351978173,
"grad_norm": 0.6563089786155916,
"learning_rate": 3.0153873418926543e-05,
"loss": 0.9344,
"step": 364
},
{
"epoch": 1.9918144611186903,
"grad_norm": 0.7033335661318982,
"learning_rate": 3.0088025594132086e-05,
"loss": 0.9479,
"step": 365
},
{
"epoch": 1.9972714870395634,
"grad_norm": 1.1633808323873716,
"learning_rate": 3.0022030773267908e-05,
"loss": 0.935,
"step": 366
},
{
"epoch": 2.0027285129604366,
"grad_norm": 2.256649667221531,
"learning_rate": 2.9955889917967114e-05,
"loss": 1.6487,
"step": 367
},
{
"epoch": 2.00818553888131,
"grad_norm": 0.9257803693615221,
"learning_rate": 2.9889603991990718e-05,
"loss": 0.9194,
"step": 368
},
{
"epoch": 2.0136425648021827,
"grad_norm": 0.8374064842179173,
"learning_rate": 2.9823173961213614e-05,
"loss": 0.936,
"step": 369
},
{
"epoch": 2.019099590723056,
"grad_norm": 0.6888393857507884,
"learning_rate": 2.9756600793610477e-05,
"loss": 0.9069,
"step": 370
},
{
"epoch": 2.0245566166439293,
"grad_norm": 0.6078836940762362,
"learning_rate": 2.9689885459241705e-05,
"loss": 0.9181,
"step": 371
},
{
"epoch": 2.030013642564802,
"grad_norm": 0.6540715623371649,
"learning_rate": 2.9623028930239234e-05,
"loss": 0.9365,
"step": 372
},
{
"epoch": 2.0354706684856754,
"grad_norm": 0.6022481328295576,
"learning_rate": 2.955603218079241e-05,
"loss": 0.923,
"step": 373
},
{
"epoch": 2.0409276944065486,
"grad_norm": 0.7165752848226464,
"learning_rate": 2.9488896187133767e-05,
"loss": 0.9181,
"step": 374
},
{
"epoch": 2.0463847203274215,
"grad_norm": 0.8352826439816641,
"learning_rate": 2.942162192752483e-05,
"loss": 0.9236,
"step": 375
},
{
"epoch": 2.0518417462482947,
"grad_norm": 1.124128627018019,
"learning_rate": 2.935421038224182e-05,
"loss": 0.919,
"step": 376
},
{
"epoch": 2.057298772169168,
"grad_norm": 1.0339665065551706,
"learning_rate": 2.9286662533561423e-05,
"loss": 0.9367,
"step": 377
},
{
"epoch": 2.062755798090041,
"grad_norm": 1.2298783039098067,
"learning_rate": 2.9218979365746426e-05,
"loss": 0.9456,
"step": 378
},
{
"epoch": 2.068212824010914,
"grad_norm": 0.8183361526417724,
"learning_rate": 2.9151161865031414e-05,
"loss": 0.9444,
"step": 379
},
{
"epoch": 2.0736698499317874,
"grad_norm": 0.484619834541414,
"learning_rate": 2.908321101960837e-05,
"loss": 0.9085,
"step": 380
},
{
"epoch": 2.07912687585266,
"grad_norm": 0.3810542728807868,
"learning_rate": 2.9015127819612292e-05,
"loss": 0.8991,
"step": 381
},
{
"epoch": 2.0845839017735335,
"grad_norm": 0.4925827663184475,
"learning_rate": 2.894691325710677e-05,
"loss": 0.9218,
"step": 382
},
{
"epoch": 2.0900409276944067,
"grad_norm": 0.7465936328564935,
"learning_rate": 2.8878568326069494e-05,
"loss": 0.93,
"step": 383
},
{
"epoch": 2.0954979536152796,
"grad_norm": 1.0199914288512335,
"learning_rate": 2.8810094022377842e-05,
"loss": 0.9388,
"step": 384
},
{
"epoch": 2.100954979536153,
"grad_norm": 1.4039532764332685,
"learning_rate": 2.8741491343794296e-05,
"loss": 0.9205,
"step": 385
},
{
"epoch": 2.106412005457026,
"grad_norm": 0.6570765199675046,
"learning_rate": 2.867276128995193e-05,
"loss": 0.9472,
"step": 386
},
{
"epoch": 2.111869031377899,
"grad_norm": 0.47805545813863976,
"learning_rate": 2.860390486233987e-05,
"loss": 0.9213,
"step": 387
},
{
"epoch": 2.117326057298772,
"grad_norm": 0.9100198379548127,
"learning_rate": 2.8534923064288652e-05,
"loss": 0.9185,
"step": 388
},
{
"epoch": 2.1227830832196455,
"grad_norm": 1.359999448910369,
"learning_rate": 2.8465816900955635e-05,
"loss": 0.9103,
"step": 389
},
{
"epoch": 2.1282401091405183,
"grad_norm": 0.7267473662850902,
"learning_rate": 2.8396587379310366e-05,
"loss": 0.9263,
"step": 390
},
{
"epoch": 2.1336971350613916,
"grad_norm": 0.6852106225837414,
"learning_rate": 2.8327235508119854e-05,
"loss": 0.9056,
"step": 391
},
{
"epoch": 2.139154160982265,
"grad_norm": 0.6935707651834161,
"learning_rate": 2.8257762297933927e-05,
"loss": 0.9279,
"step": 392
},
{
"epoch": 2.1446111869031377,
"grad_norm": 0.8762210438590792,
"learning_rate": 2.81881687610705e-05,
"loss": 0.9069,
"step": 393
},
{
"epoch": 2.150068212824011,
"grad_norm": 1.1906568951863223,
"learning_rate": 2.8118455911600767e-05,
"loss": 0.929,
"step": 394
},
{
"epoch": 2.155525238744884,
"grad_norm": 0.980254177026494,
"learning_rate": 2.8048624765334502e-05,
"loss": 0.9323,
"step": 395
},
{
"epoch": 2.160982264665757,
"grad_norm": 1.0373134164423028,
"learning_rate": 2.7978676339805208e-05,
"loss": 0.9208,
"step": 396
},
{
"epoch": 2.1664392905866303,
"grad_norm": 1.0207154812500114,
"learning_rate": 2.79086116542553e-05,
"loss": 0.9096,
"step": 397
},
{
"epoch": 2.1718963165075036,
"grad_norm": 1.1988463269854843,
"learning_rate": 2.783843172962128e-05,
"loss": 0.9402,
"step": 398
},
{
"epoch": 2.1773533424283764,
"grad_norm": 0.7969790707530212,
"learning_rate": 2.7768137588518807e-05,
"loss": 0.908,
"step": 399
},
{
"epoch": 2.1828103683492497,
"grad_norm": 0.4748645421435369,
"learning_rate": 2.769773025522785e-05,
"loss": 0.914,
"step": 400
},
{
"epoch": 2.188267394270123,
"grad_norm": 0.45121822491331515,
"learning_rate": 2.7627210755677733e-05,
"loss": 0.9307,
"step": 401
},
{
"epoch": 2.193724420190996,
"grad_norm": 0.8118676469523863,
"learning_rate": 2.7556580117432185e-05,
"loss": 0.9102,
"step": 402
},
{
"epoch": 2.199181446111869,
"grad_norm": 1.1207703065447276,
"learning_rate": 2.7485839369674384e-05,
"loss": 0.9231,
"step": 403
},
{
"epoch": 2.2046384720327423,
"grad_norm": 0.9740106870010401,
"learning_rate": 2.7414989543191964e-05,
"loss": 0.9087,
"step": 404
},
{
"epoch": 2.210095497953615,
"grad_norm": 0.9634686443072049,
"learning_rate": 2.734403167036195e-05,
"loss": 0.9082,
"step": 405
},
{
"epoch": 2.2155525238744884,
"grad_norm": 0.9832162277660468,
"learning_rate": 2.727296678513577e-05,
"loss": 0.9241,
"step": 406
},
{
"epoch": 2.2210095497953617,
"grad_norm": 1.0746452821377297,
"learning_rate": 2.720179592302417e-05,
"loss": 0.9407,
"step": 407
},
{
"epoch": 2.2264665757162345,
"grad_norm": 0.8835118585227068,
"learning_rate": 2.71305201210821e-05,
"loss": 0.906,
"step": 408
},
{
"epoch": 2.231923601637108,
"grad_norm": 0.806040386235616,
"learning_rate": 2.7059140417893645e-05,
"loss": 0.9142,
"step": 409
},
{
"epoch": 2.237380627557981,
"grad_norm": 0.7956258201623788,
"learning_rate": 2.6987657853556864e-05,
"loss": 0.8814,
"step": 410
},
{
"epoch": 2.242837653478854,
"grad_norm": 0.7155012234587093,
"learning_rate": 2.6916073469668633e-05,
"loss": 0.9408,
"step": 411
},
{
"epoch": 2.248294679399727,
"grad_norm": 0.745980798963711,
"learning_rate": 2.6844388309309494e-05,
"loss": 0.9334,
"step": 412
},
{
"epoch": 2.2537517053206004,
"grad_norm": 0.8718383779341066,
"learning_rate": 2.6772603417028408e-05,
"loss": 0.9244,
"step": 413
},
{
"epoch": 2.2592087312414733,
"grad_norm": 0.8697224939003284,
"learning_rate": 2.6700719838827595e-05,
"loss": 0.9132,
"step": 414
},
{
"epoch": 2.2646657571623465,
"grad_norm": 0.7800957792385944,
"learning_rate": 2.662873862214724e-05,
"loss": 0.9253,
"step": 415
},
{
"epoch": 2.27012278308322,
"grad_norm": 0.8009973379664055,
"learning_rate": 2.655666081585027e-05,
"loss": 0.9,
"step": 416
},
{
"epoch": 2.2755798090040926,
"grad_norm": 0.8649005822972493,
"learning_rate": 2.6484487470207035e-05,
"loss": 0.9204,
"step": 417
},
{
"epoch": 2.281036834924966,
"grad_norm": 0.8818657424466958,
"learning_rate": 2.641221963688002e-05,
"loss": 0.9155,
"step": 418
},
{
"epoch": 2.286493860845839,
"grad_norm": 0.5647385759805507,
"learning_rate": 2.633985836890854e-05,
"loss": 0.9206,
"step": 419
},
{
"epoch": 2.291950886766712,
"grad_norm": 0.5034679857244327,
"learning_rate": 2.6267404720693375e-05,
"loss": 0.9204,
"step": 420
},
{
"epoch": 2.2974079126875853,
"grad_norm": 0.710256150433762,
"learning_rate": 2.6194859747981385e-05,
"loss": 0.9191,
"step": 421
},
{
"epoch": 2.3028649386084585,
"grad_norm": 0.5706543763177601,
"learning_rate": 2.6122224507850182e-05,
"loss": 0.9185,
"step": 422
},
{
"epoch": 2.3083219645293314,
"grad_norm": 0.6833880125599795,
"learning_rate": 2.604950005869268e-05,
"loss": 0.9213,
"step": 423
},
{
"epoch": 2.3137789904502046,
"grad_norm": 0.8483843690019908,
"learning_rate": 2.5976687460201683e-05,
"loss": 0.9126,
"step": 424
},
{
"epoch": 2.319236016371078,
"grad_norm": 0.8129051361925009,
"learning_rate": 2.5903787773354463e-05,
"loss": 0.9188,
"step": 425
},
{
"epoch": 2.3246930422919507,
"grad_norm": 0.5996381128568273,
"learning_rate": 2.583080206039728e-05,
"loss": 0.9096,
"step": 426
},
{
"epoch": 2.330150068212824,
"grad_norm": 0.41863958371356735,
"learning_rate": 2.57577313848299e-05,
"loss": 0.9432,
"step": 427
},
{
"epoch": 2.3356070941336973,
"grad_norm": 0.34060059093315503,
"learning_rate": 2.5684576811390125e-05,
"loss": 0.9137,
"step": 428
},
{
"epoch": 2.34106412005457,
"grad_norm": 0.5069480306429284,
"learning_rate": 2.5611339406038257e-05,
"loss": 0.9124,
"step": 429
},
{
"epoch": 2.3465211459754434,
"grad_norm": 0.5427881229277935,
"learning_rate": 2.5538020235941552e-05,
"loss": 0.9166,
"step": 430
},
{
"epoch": 2.3519781718963166,
"grad_norm": 0.543245106400598,
"learning_rate": 2.5464620369458724e-05,
"loss": 0.9197,
"step": 431
},
{
"epoch": 2.3574351978171895,
"grad_norm": 0.5487542346479996,
"learning_rate": 2.5391140876124305e-05,
"loss": 0.9203,
"step": 432
},
{
"epoch": 2.3628922237380627,
"grad_norm": 0.504474417772234,
"learning_rate": 2.531758282663311e-05,
"loss": 0.9139,
"step": 433
},
{
"epoch": 2.368349249658936,
"grad_norm": 0.3570671212002871,
"learning_rate": 2.524394729282464e-05,
"loss": 0.9227,
"step": 434
},
{
"epoch": 2.373806275579809,
"grad_norm": 0.33080967390463245,
"learning_rate": 2.5170235347667425e-05,
"loss": 0.9298,
"step": 435
},
{
"epoch": 2.379263301500682,
"grad_norm": 0.2629370339698779,
"learning_rate": 2.5096448065243415e-05,
"loss": 0.9222,
"step": 436
},
{
"epoch": 2.3847203274215554,
"grad_norm": 0.32467107495565267,
"learning_rate": 2.5022586520732334e-05,
"loss": 0.9092,
"step": 437
},
{
"epoch": 2.390177353342428,
"grad_norm": 0.27556269692287366,
"learning_rate": 2.494865179039599e-05,
"loss": 0.8993,
"step": 438
},
{
"epoch": 2.3956343792633015,
"grad_norm": 0.279539516282507,
"learning_rate": 2.4874644951562618e-05,
"loss": 0.9019,
"step": 439
},
{
"epoch": 2.4010914051841747,
"grad_norm": 0.33354360728490134,
"learning_rate": 2.4800567082611165e-05,
"loss": 0.9152,
"step": 440
},
{
"epoch": 2.4065484311050476,
"grad_norm": 0.33169175944263035,
"learning_rate": 2.4726419262955595e-05,
"loss": 0.9091,
"step": 441
},
{
"epoch": 2.412005457025921,
"grad_norm": 0.3587055937970976,
"learning_rate": 2.465220257302913e-05,
"loss": 0.9202,
"step": 442
},
{
"epoch": 2.417462482946794,
"grad_norm": 0.40441219606068757,
"learning_rate": 2.4577918094268523e-05,
"loss": 0.9226,
"step": 443
},
{
"epoch": 2.422919508867667,
"grad_norm": 0.4865996215311924,
"learning_rate": 2.4503566909098318e-05,
"loss": 0.9093,
"step": 444
},
{
"epoch": 2.42837653478854,
"grad_norm": 0.38008820904475854,
"learning_rate": 2.4429150100915054e-05,
"loss": 0.9322,
"step": 445
},
{
"epoch": 2.4338335607094135,
"grad_norm": 0.41170329827458135,
"learning_rate": 2.435466875407148e-05,
"loss": 0.9324,
"step": 446
},
{
"epoch": 2.4392905866302863,
"grad_norm": 0.3622800817675993,
"learning_rate": 2.4280123953860767e-05,
"loss": 0.9001,
"step": 447
},
{
"epoch": 2.4447476125511596,
"grad_norm": 0.2682950261173189,
"learning_rate": 2.4205516786500684e-05,
"loss": 0.9314,
"step": 448
},
{
"epoch": 2.450204638472033,
"grad_norm": 0.2805378098796358,
"learning_rate": 2.4130848339117766e-05,
"loss": 0.9341,
"step": 449
},
{
"epoch": 2.4556616643929057,
"grad_norm": 0.26782126481321455,
"learning_rate": 2.4056119699731495e-05,
"loss": 0.9077,
"step": 450
},
{
"epoch": 2.461118690313779,
"grad_norm": 0.37285051812558306,
"learning_rate": 2.3981331957238414e-05,
"loss": 0.9235,
"step": 451
},
{
"epoch": 2.466575716234652,
"grad_norm": 0.3129713500376212,
"learning_rate": 2.3906486201396287e-05,
"loss": 0.9213,
"step": 452
},
{
"epoch": 2.472032742155525,
"grad_norm": 0.36665287480858777,
"learning_rate": 2.3831583522808224e-05,
"loss": 0.917,
"step": 453
},
{
"epoch": 2.4774897680763983,
"grad_norm": 0.3443704371520464,
"learning_rate": 2.375662501290675e-05,
"loss": 0.9189,
"step": 454
},
{
"epoch": 2.4829467939972716,
"grad_norm": 0.31197899443616667,
"learning_rate": 2.368161176393793e-05,
"loss": 0.9127,
"step": 455
},
{
"epoch": 2.488403819918145,
"grad_norm": 0.35012014939390956,
"learning_rate": 2.360654486894548e-05,
"loss": 0.9113,
"step": 456
},
{
"epoch": 2.4938608458390177,
"grad_norm": 0.35258642719846595,
"learning_rate": 2.3531425421754782e-05,
"loss": 0.9137,
"step": 457
},
{
"epoch": 2.499317871759891,
"grad_norm": 0.4818508820401416,
"learning_rate": 2.3456254516956973e-05,
"loss": 0.9322,
"step": 458
},
{
"epoch": 2.504774897680764,
"grad_norm": 0.41831055845919374,
"learning_rate": 2.3381033249893007e-05,
"loss": 0.9358,
"step": 459
},
{
"epoch": 2.510231923601637,
"grad_norm": 0.46003166070829415,
"learning_rate": 2.3305762716637696e-05,
"loss": 0.9134,
"step": 460
},
{
"epoch": 2.5156889495225103,
"grad_norm": 0.34405667621405894,
"learning_rate": 2.32304440139837e-05,
"loss": 0.914,
"step": 461
},
{
"epoch": 2.5211459754433836,
"grad_norm": 0.30837605247167627,
"learning_rate": 2.315507823942559e-05,
"loss": 0.8906,
"step": 462
},
{
"epoch": 2.5266030013642564,
"grad_norm": 0.35159469224889583,
"learning_rate": 2.3079666491143827e-05,
"loss": 0.9291,
"step": 463
},
{
"epoch": 2.5320600272851297,
"grad_norm": 0.3797916060475412,
"learning_rate": 2.3004209867988783e-05,
"loss": 0.9087,
"step": 464
},
{
"epoch": 2.5375170532060025,
"grad_norm": 0.40916286067612617,
"learning_rate": 2.2928709469464705e-05,
"loss": 0.9158,
"step": 465
},
{
"epoch": 2.542974079126876,
"grad_norm": 0.28077601639148303,
"learning_rate": 2.2853166395713715e-05,
"loss": 0.908,
"step": 466
},
{
"epoch": 2.548431105047749,
"grad_norm": 0.30535476691189556,
"learning_rate": 2.2777581747499767e-05,
"loss": 0.9288,
"step": 467
},
{
"epoch": 2.5538881309686223,
"grad_norm": 0.2741959984551279,
"learning_rate": 2.2701956626192603e-05,
"loss": 0.9123,
"step": 468
},
{
"epoch": 2.559345156889495,
"grad_norm": 0.29160243799401836,
"learning_rate": 2.262629213375173e-05,
"loss": 0.9153,
"step": 469
},
{
"epoch": 2.5648021828103684,
"grad_norm": 0.31211888825075323,
"learning_rate": 2.255058937271032e-05,
"loss": 0.9019,
"step": 470
},
{
"epoch": 2.5702592087312413,
"grad_norm": 0.24605091808209184,
"learning_rate": 2.2474849446159193e-05,
"loss": 0.9041,
"step": 471
},
{
"epoch": 2.5757162346521145,
"grad_norm": 0.296940058046894,
"learning_rate": 2.2399073457730723e-05,
"loss": 0.8933,
"step": 472
},
{
"epoch": 2.581173260572988,
"grad_norm": 0.39017704428903854,
"learning_rate": 2.2323262511582726e-05,
"loss": 0.9219,
"step": 473
},
{
"epoch": 2.586630286493861,
"grad_norm": 0.26845683489444067,
"learning_rate": 2.2247417712382423e-05,
"loss": 0.9072,
"step": 474
},
{
"epoch": 2.592087312414734,
"grad_norm": 0.29710964002091833,
"learning_rate": 2.217154016529031e-05,
"loss": 0.9254,
"step": 475
},
{
"epoch": 2.597544338335607,
"grad_norm": 0.2773002611218211,
"learning_rate": 2.2095630975944068e-05,
"loss": 0.9196,
"step": 476
},
{
"epoch": 2.60300136425648,
"grad_norm": 0.27685282385866905,
"learning_rate": 2.2019691250442442e-05,
"loss": 0.9048,
"step": 477
},
{
"epoch": 2.6084583901773533,
"grad_norm": 0.35014690047193237,
"learning_rate": 2.1943722095329138e-05,
"loss": 0.9113,
"step": 478
},
{
"epoch": 2.6139154160982265,
"grad_norm": 0.2596786590850847,
"learning_rate": 2.1867724617576685e-05,
"loss": 0.9161,
"step": 479
},
{
"epoch": 2.6193724420191,
"grad_norm": 0.3426543130719377,
"learning_rate": 2.1791699924570313e-05,
"loss": 0.8926,
"step": 480
},
{
"epoch": 2.6248294679399726,
"grad_norm": 0.3078282469487072,
"learning_rate": 2.1715649124091814e-05,
"loss": 0.9183,
"step": 481
},
{
"epoch": 2.630286493860846,
"grad_norm": 0.22901258390983542,
"learning_rate": 2.16395733243034e-05,
"loss": 0.9344,
"step": 482
},
{
"epoch": 2.6357435197817187,
"grad_norm": 0.371108470895669,
"learning_rate": 2.156347363373156e-05,
"loss": 0.9192,
"step": 483
},
{
"epoch": 2.641200545702592,
"grad_norm": 0.3675376564769477,
"learning_rate": 2.14873511612509e-05,
"loss": 0.914,
"step": 484
},
{
"epoch": 2.6466575716234653,
"grad_norm": 0.47791366315200284,
"learning_rate": 2.141120701606799e-05,
"loss": 0.9078,
"step": 485
},
{
"epoch": 2.6521145975443385,
"grad_norm": 0.4222978650582422,
"learning_rate": 2.1335042307705206e-05,
"loss": 0.9099,
"step": 486
},
{
"epoch": 2.6575716234652114,
"grad_norm": 0.3556115683063452,
"learning_rate": 2.125885814598454e-05,
"loss": 0.9064,
"step": 487
},
{
"epoch": 2.6630286493860846,
"grad_norm": 0.356222691019892,
"learning_rate": 2.1182655641011468e-05,
"loss": 0.9109,
"step": 488
},
{
"epoch": 2.6684856753069575,
"grad_norm": 0.2950967727936582,
"learning_rate": 2.1106435903158734e-05,
"loss": 0.907,
"step": 489
},
{
"epoch": 2.6739427012278307,
"grad_norm": 0.2589049008249365,
"learning_rate": 2.10302000430502e-05,
"loss": 0.9167,
"step": 490
},
{
"epoch": 2.679399727148704,
"grad_norm": 0.2679428400644797,
"learning_rate": 2.0953949171544646e-05,
"loss": 0.9029,
"step": 491
},
{
"epoch": 2.6848567530695773,
"grad_norm": 0.30000226534532,
"learning_rate": 2.0877684399719596e-05,
"loss": 0.902,
"step": 492
},
{
"epoch": 2.69031377899045,
"grad_norm": 0.31357462517216056,
"learning_rate": 2.0801406838855095e-05,
"loss": 0.9151,
"step": 493
},
{
"epoch": 2.6957708049113234,
"grad_norm": 0.2692910544239183,
"learning_rate": 2.0725117600417572e-05,
"loss": 0.9218,
"step": 494
},
{
"epoch": 2.701227830832196,
"grad_norm": 0.30151763927530156,
"learning_rate": 2.0648817796043598e-05,
"loss": 0.9198,
"step": 495
},
{
"epoch": 2.7066848567530695,
"grad_norm": 0.2758793028048215,
"learning_rate": 2.0572508537523705e-05,
"loss": 0.8979,
"step": 496
},
{
"epoch": 2.7121418826739427,
"grad_norm": 0.2812105414991479,
"learning_rate": 2.0496190936786196e-05,
"loss": 0.9131,
"step": 497
},
{
"epoch": 2.717598908594816,
"grad_norm": 0.2963610249601614,
"learning_rate": 2.041986610588091e-05,
"loss": 0.9377,
"step": 498
},
{
"epoch": 2.723055934515689,
"grad_norm": 0.3097919911404899,
"learning_rate": 2.0343535156963057e-05,
"loss": 0.9262,
"step": 499
},
{
"epoch": 2.728512960436562,
"grad_norm": 0.34847730033316476,
"learning_rate": 2.026719920227699e-05,
"loss": 0.8998,
"step": 500
},
{
"epoch": 2.733969986357435,
"grad_norm": 0.30531935495612433,
"learning_rate": 2.0190859354139994e-05,
"loss": 0.9269,
"step": 501
},
{
"epoch": 2.739427012278308,
"grad_norm": 0.2669945648424582,
"learning_rate": 2.0114516724926103e-05,
"loss": 0.9455,
"step": 502
},
{
"epoch": 2.7448840381991815,
"grad_norm": 0.2785334692894501,
"learning_rate": 2.0038172427049862e-05,
"loss": 0.912,
"step": 503
},
{
"epoch": 2.7503410641200547,
"grad_norm": 0.3445461005907961,
"learning_rate": 1.9961827572950138e-05,
"loss": 0.9163,
"step": 504
},
{
"epoch": 2.7557980900409276,
"grad_norm": 0.39296279811877044,
"learning_rate": 1.98854832750739e-05,
"loss": 0.9369,
"step": 505
},
{
"epoch": 2.761255115961801,
"grad_norm": 0.39702351389810686,
"learning_rate": 1.9809140645860013e-05,
"loss": 0.891,
"step": 506
},
{
"epoch": 2.7667121418826737,
"grad_norm": 0.2512865215587987,
"learning_rate": 1.9732800797723018e-05,
"loss": 0.9115,
"step": 507
},
{
"epoch": 2.772169167803547,
"grad_norm": 0.2820633130771331,
"learning_rate": 1.965646484303695e-05,
"loss": 0.9212,
"step": 508
},
{
"epoch": 2.77762619372442,
"grad_norm": 0.32145777353057775,
"learning_rate": 1.9580133894119098e-05,
"loss": 0.9207,
"step": 509
},
{
"epoch": 2.7830832196452935,
"grad_norm": 0.33762112618327617,
"learning_rate": 1.9503809063213807e-05,
"loss": 0.8845,
"step": 510
},
{
"epoch": 2.7885402455661663,
"grad_norm": 0.24634508212661455,
"learning_rate": 1.9427491462476295e-05,
"loss": 0.9156,
"step": 511
},
{
"epoch": 2.7939972714870396,
"grad_norm": 0.3457860742517539,
"learning_rate": 1.9351182203956405e-05,
"loss": 0.9106,
"step": 512
},
{
"epoch": 2.799454297407913,
"grad_norm": 0.3810319883859794,
"learning_rate": 1.927488239958243e-05,
"loss": 0.8924,
"step": 513
},
{
"epoch": 2.8049113233287857,
"grad_norm": 0.37285981835585597,
"learning_rate": 1.919859316114491e-05,
"loss": 0.906,
"step": 514
},
{
"epoch": 2.810368349249659,
"grad_norm": 0.24108156149639062,
"learning_rate": 1.9122315600280418e-05,
"loss": 0.9175,
"step": 515
},
{
"epoch": 2.815825375170532,
"grad_norm": 0.3943374958725155,
"learning_rate": 1.904605082845536e-05,
"loss": 0.9078,
"step": 516
},
{
"epoch": 2.821282401091405,
"grad_norm": 0.3145717053046707,
"learning_rate": 1.89697999569498e-05,
"loss": 0.9135,
"step": 517
},
{
"epoch": 2.8267394270122783,
"grad_norm": 0.22533549622277005,
"learning_rate": 1.8893564096841273e-05,
"loss": 0.909,
"step": 518
},
{
"epoch": 2.8321964529331516,
"grad_norm": 0.23525731404627342,
"learning_rate": 1.881734435898854e-05,
"loss": 0.9299,
"step": 519
},
{
"epoch": 2.8376534788540244,
"grad_norm": 0.2512060708918993,
"learning_rate": 1.8741141854015468e-05,
"loss": 0.8893,
"step": 520
},
{
"epoch": 2.8431105047748977,
"grad_norm": 0.19994216173059465,
"learning_rate": 1.8664957692294808e-05,
"loss": 0.9221,
"step": 521
},
{
"epoch": 2.848567530695771,
"grad_norm": 0.20556264949760783,
"learning_rate": 1.858879298393202e-05,
"loss": 0.9316,
"step": 522
},
{
"epoch": 2.854024556616644,
"grad_norm": 0.20256542941627978,
"learning_rate": 1.8512648838749105e-05,
"loss": 0.9093,
"step": 523
},
{
"epoch": 2.859481582537517,
"grad_norm": 0.22020875876934895,
"learning_rate": 1.8436526366268444e-05,
"loss": 0.9049,
"step": 524
},
{
"epoch": 2.8649386084583903,
"grad_norm": 0.20768534379511697,
"learning_rate": 1.8360426675696606e-05,
"loss": 0.9144,
"step": 525
},
{
"epoch": 2.870395634379263,
"grad_norm": 0.28896251352128466,
"learning_rate": 1.828435087590819e-05,
"loss": 0.9145,
"step": 526
},
{
"epoch": 2.8758526603001364,
"grad_norm": 0.3131376106100284,
"learning_rate": 1.8208300075429693e-05,
"loss": 0.9308,
"step": 527
},
{
"epoch": 2.8813096862210097,
"grad_norm": 0.24876481284966392,
"learning_rate": 1.8132275382423325e-05,
"loss": 0.9115,
"step": 528
},
{
"epoch": 2.8867667121418825,
"grad_norm": 0.2530867014542135,
"learning_rate": 1.8056277904670865e-05,
"loss": 0.8851,
"step": 529
},
{
"epoch": 2.892223738062756,
"grad_norm": 0.2592890449900578,
"learning_rate": 1.798030874955756e-05,
"loss": 0.9058,
"step": 530
},
{
"epoch": 2.897680763983629,
"grad_norm": 0.22039748569474332,
"learning_rate": 1.7904369024055942e-05,
"loss": 0.9176,
"step": 531
},
{
"epoch": 2.903137789904502,
"grad_norm": 0.2209833356939442,
"learning_rate": 1.7828459834709694e-05,
"loss": 0.917,
"step": 532
},
{
"epoch": 2.908594815825375,
"grad_norm": 0.23766546854501655,
"learning_rate": 1.7752582287617583e-05,
"loss": 0.8989,
"step": 533
},
{
"epoch": 2.9140518417462484,
"grad_norm": 0.2376537458371181,
"learning_rate": 1.767673748841728e-05,
"loss": 0.8946,
"step": 534
},
{
"epoch": 2.9195088676671213,
"grad_norm": 0.262071528071461,
"learning_rate": 1.7600926542269277e-05,
"loss": 0.9231,
"step": 535
},
{
"epoch": 2.9249658935879945,
"grad_norm": 0.29376545282596106,
"learning_rate": 1.7525150553840806e-05,
"loss": 0.8938,
"step": 536
},
{
"epoch": 2.930422919508868,
"grad_norm": 0.3134884408737219,
"learning_rate": 1.7449410627289687e-05,
"loss": 0.9168,
"step": 537
},
{
"epoch": 2.9358799454297406,
"grad_norm": 0.2712354478643755,
"learning_rate": 1.7373707866248278e-05,
"loss": 0.933,
"step": 538
},
{
"epoch": 2.941336971350614,
"grad_norm": 0.24553201691764942,
"learning_rate": 1.7298043373807404e-05,
"loss": 0.9159,
"step": 539
},
{
"epoch": 2.946793997271487,
"grad_norm": 0.3030078675065205,
"learning_rate": 1.7222418252500243e-05,
"loss": 0.9062,
"step": 540
},
{
"epoch": 2.9522510231923604,
"grad_norm": 0.23890406347684276,
"learning_rate": 1.7146833604286295e-05,
"loss": 0.8945,
"step": 541
},
{
"epoch": 2.9577080491132333,
"grad_norm": 0.2670091183635565,
"learning_rate": 1.7071290530535298e-05,
"loss": 0.909,
"step": 542
},
{
"epoch": 2.9631650750341065,
"grad_norm": 0.23126297362235826,
"learning_rate": 1.6995790132011223e-05,
"loss": 0.9143,
"step": 543
},
{
"epoch": 2.9686221009549794,
"grad_norm": 0.31050871509494943,
"learning_rate": 1.6920333508856176e-05,
"loss": 0.8994,
"step": 544
},
{
"epoch": 2.9740791268758526,
"grad_norm": 0.22661046923902323,
"learning_rate": 1.6844921760574417e-05,
"loss": 0.9294,
"step": 545
},
{
"epoch": 2.979536152796726,
"grad_norm": 0.3118001086032258,
"learning_rate": 1.676955598601631e-05,
"loss": 0.9041,
"step": 546
},
{
"epoch": 2.984993178717599,
"grad_norm": 0.23665950368215852,
"learning_rate": 1.6694237283362314e-05,
"loss": 0.9038,
"step": 547
},
{
"epoch": 2.990450204638472,
"grad_norm": 0.24492951232429386,
"learning_rate": 1.6618966750106996e-05,
"loss": 0.916,
"step": 548
},
{
"epoch": 2.9959072305593453,
"grad_norm": 0.25300337782976023,
"learning_rate": 1.6543745483043037e-05,
"loss": 0.9083,
"step": 549
},
{
"epoch": 3.001364256480218,
"grad_norm": 0.584103118759897,
"learning_rate": 1.6468574578245225e-05,
"loss": 1.6082,
"step": 550
},
{
"epoch": 3.0068212824010914,
"grad_norm": 0.671101312579536,
"learning_rate": 1.639345513105452e-05,
"loss": 0.8859,
"step": 551
},
{
"epoch": 3.0122783083219646,
"grad_norm": 0.3787017346934449,
"learning_rate": 1.6318388236062072e-05,
"loss": 0.8951,
"step": 552
},
{
"epoch": 3.0177353342428375,
"grad_norm": 0.42606514302989157,
"learning_rate": 1.624337498709326e-05,
"loss": 0.8877,
"step": 553
},
{
"epoch": 3.0231923601637107,
"grad_norm": 0.35542176787821733,
"learning_rate": 1.616841647719178e-05,
"loss": 0.8895,
"step": 554
},
{
"epoch": 3.028649386084584,
"grad_norm": 0.35418748629561114,
"learning_rate": 1.6093513798603713e-05,
"loss": 0.8968,
"step": 555
},
{
"epoch": 3.034106412005457,
"grad_norm": 0.4142394476010708,
"learning_rate": 1.6018668042761593e-05,
"loss": 0.8855,
"step": 556
},
{
"epoch": 3.03956343792633,
"grad_norm": 0.26285840734342447,
"learning_rate": 1.594388030026851e-05,
"loss": 0.8685,
"step": 557
},
{
"epoch": 3.0450204638472034,
"grad_norm": 0.3399484818274934,
"learning_rate": 1.586915166088224e-05,
"loss": 0.908,
"step": 558
},
{
"epoch": 3.050477489768076,
"grad_norm": 0.3389204352265327,
"learning_rate": 1.5794483213499326e-05,
"loss": 0.8911,
"step": 559
},
{
"epoch": 3.0559345156889495,
"grad_norm": 0.33188066961256374,
"learning_rate": 1.5719876046139243e-05,
"loss": 0.9147,
"step": 560
},
{
"epoch": 3.0613915416098227,
"grad_norm": 0.3377610682449399,
"learning_rate": 1.564533124592852e-05,
"loss": 0.8949,
"step": 561
},
{
"epoch": 3.0668485675306956,
"grad_norm": 0.2957318174966501,
"learning_rate": 1.557084989908495e-05,
"loss": 0.8986,
"step": 562
},
{
"epoch": 3.072305593451569,
"grad_norm": 0.430673617485615,
"learning_rate": 1.5496433090901685e-05,
"loss": 0.8949,
"step": 563
},
{
"epoch": 3.077762619372442,
"grad_norm": 0.2554433088355423,
"learning_rate": 1.5422081905731484e-05,
"loss": 0.8882,
"step": 564
},
{
"epoch": 3.083219645293315,
"grad_norm": 0.32618011312611783,
"learning_rate": 1.534779742697088e-05,
"loss": 0.9174,
"step": 565
},
{
"epoch": 3.088676671214188,
"grad_norm": 0.31352014509777587,
"learning_rate": 1.5273580737044416e-05,
"loss": 0.8918,
"step": 566
},
{
"epoch": 3.0941336971350615,
"grad_norm": 0.2557790089027306,
"learning_rate": 1.5199432917388835e-05,
"loss": 0.9007,
"step": 567
},
{
"epoch": 3.0995907230559343,
"grad_norm": 0.27540644472124487,
"learning_rate": 1.5125355048437389e-05,
"loss": 0.884,
"step": 568
},
{
"epoch": 3.1050477489768076,
"grad_norm": 0.34235171994492863,
"learning_rate": 1.5051348209604016e-05,
"loss": 0.8686,
"step": 569
},
{
"epoch": 3.110504774897681,
"grad_norm": 0.25008950788915946,
"learning_rate": 1.4977413479267675e-05,
"loss": 0.9026,
"step": 570
},
{
"epoch": 3.1159618008185537,
"grad_norm": 0.3964129492366135,
"learning_rate": 1.4903551934756592e-05,
"loss": 0.8992,
"step": 571
},
{
"epoch": 3.121418826739427,
"grad_norm": 0.40686134783523276,
"learning_rate": 1.4829764652332585e-05,
"loss": 0.9209,
"step": 572
},
{
"epoch": 3.1268758526603,
"grad_norm": 0.2670447009105334,
"learning_rate": 1.4756052707175361e-05,
"loss": 0.9153,
"step": 573
},
{
"epoch": 3.132332878581173,
"grad_norm": 0.4402126815582449,
"learning_rate": 1.4682417173366892e-05,
"loss": 0.907,
"step": 574
},
{
"epoch": 3.1377899045020463,
"grad_norm": 0.23815050001596294,
"learning_rate": 1.4608859123875703e-05,
"loss": 0.9038,
"step": 575
},
{
"epoch": 3.1432469304229196,
"grad_norm": 0.3030117101013267,
"learning_rate": 1.4535379630541284e-05,
"loss": 0.9065,
"step": 576
},
{
"epoch": 3.148703956343793,
"grad_norm": 0.37381110214711166,
"learning_rate": 1.4461979764058454e-05,
"loss": 0.9096,
"step": 577
},
{
"epoch": 3.1541609822646657,
"grad_norm": 0.24485627188888226,
"learning_rate": 1.4388660593961756e-05,
"loss": 0.8858,
"step": 578
},
{
"epoch": 3.159618008185539,
"grad_norm": 0.25282112926237954,
"learning_rate": 1.4315423188609878e-05,
"loss": 0.8905,
"step": 579
},
{
"epoch": 3.1650750341064118,
"grad_norm": 0.24907017187679334,
"learning_rate": 1.4242268615170106e-05,
"loss": 0.9068,
"step": 580
},
{
"epoch": 3.170532060027285,
"grad_norm": 0.2129960819490356,
"learning_rate": 1.4169197939602723e-05,
"loss": 0.8912,
"step": 581
},
{
"epoch": 3.1759890859481583,
"grad_norm": 0.24279078285844446,
"learning_rate": 1.409621222664554e-05,
"loss": 0.8838,
"step": 582
},
{
"epoch": 3.1814461118690316,
"grad_norm": 0.23381673434042413,
"learning_rate": 1.4023312539798322e-05,
"loss": 0.8896,
"step": 583
},
{
"epoch": 3.1869031377899044,
"grad_norm": 0.22227554143448716,
"learning_rate": 1.3950499941307332e-05,
"loss": 0.8826,
"step": 584
},
{
"epoch": 3.1923601637107777,
"grad_norm": 0.22806009027283225,
"learning_rate": 1.3877775492149828e-05,
"loss": 0.899,
"step": 585
},
{
"epoch": 3.197817189631651,
"grad_norm": 0.25047196400087585,
"learning_rate": 1.3805140252018618e-05,
"loss": 0.8954,
"step": 586
},
{
"epoch": 3.203274215552524,
"grad_norm": 0.2118062936691214,
"learning_rate": 1.373259527930663e-05,
"loss": 0.8966,
"step": 587
},
{
"epoch": 3.208731241473397,
"grad_norm": 0.2730005282503477,
"learning_rate": 1.366014163109146e-05,
"loss": 0.8795,
"step": 588
},
{
"epoch": 3.2141882673942703,
"grad_norm": 0.29613230912460564,
"learning_rate": 1.3587780363119986e-05,
"loss": 0.8796,
"step": 589
},
{
"epoch": 3.219645293315143,
"grad_norm": 0.23990776796738883,
"learning_rate": 1.3515512529792978e-05,
"loss": 0.9071,
"step": 590
},
{
"epoch": 3.2251023192360164,
"grad_norm": 0.2538388076227864,
"learning_rate": 1.3443339184149739e-05,
"loss": 0.9036,
"step": 591
},
{
"epoch": 3.2305593451568897,
"grad_norm": 0.24743496996389577,
"learning_rate": 1.337126137785276e-05,
"loss": 0.8861,
"step": 592
},
{
"epoch": 3.2360163710777625,
"grad_norm": 0.20121450134982874,
"learning_rate": 1.329928016117241e-05,
"loss": 0.8939,
"step": 593
},
{
"epoch": 3.241473396998636,
"grad_norm": 0.2869931420078408,
"learning_rate": 1.3227396582971594e-05,
"loss": 0.8906,
"step": 594
},
{
"epoch": 3.246930422919509,
"grad_norm": 0.1908364191371087,
"learning_rate": 1.3155611690690515e-05,
"loss": 0.886,
"step": 595
},
{
"epoch": 3.252387448840382,
"grad_norm": 0.3472699144561854,
"learning_rate": 1.3083926530331372e-05,
"loss": 0.9158,
"step": 596
},
{
"epoch": 3.257844474761255,
"grad_norm": 0.22549962507966057,
"learning_rate": 1.3012342146443144e-05,
"loss": 0.8764,
"step": 597
},
{
"epoch": 3.2633015006821284,
"grad_norm": 0.26789532061692434,
"learning_rate": 1.2940859582106357e-05,
"loss": 0.8841,
"step": 598
},
{
"epoch": 3.2687585266030013,
"grad_norm": 0.2522357843484046,
"learning_rate": 1.2869479878917904e-05,
"loss": 0.8819,
"step": 599
},
{
"epoch": 3.2742155525238745,
"grad_norm": 0.21493911054710754,
"learning_rate": 1.2798204076975835e-05,
"loss": 0.92,
"step": 600
},
{
"epoch": 3.279672578444748,
"grad_norm": 0.2945646091669156,
"learning_rate": 1.2727033214864233e-05,
"loss": 0.8838,
"step": 601
},
{
"epoch": 3.2851296043656206,
"grad_norm": 0.2829300287180026,
"learning_rate": 1.265596832963806e-05,
"loss": 0.8755,
"step": 602
},
{
"epoch": 3.290586630286494,
"grad_norm": 0.2536303900570064,
"learning_rate": 1.2585010456808046e-05,
"loss": 0.8904,
"step": 603
},
{
"epoch": 3.296043656207367,
"grad_norm": 0.3585519781803995,
"learning_rate": 1.2514160630325617e-05,
"loss": 0.8922,
"step": 604
},
{
"epoch": 3.30150068212824,
"grad_norm": 0.2792945795336993,
"learning_rate": 1.2443419882567821e-05,
"loss": 0.8771,
"step": 605
},
{
"epoch": 3.3069577080491133,
"grad_norm": 0.35260384633142106,
"learning_rate": 1.2372789244322272e-05,
"loss": 0.901,
"step": 606
},
{
"epoch": 3.3124147339699865,
"grad_norm": 0.31364366488160306,
"learning_rate": 1.2302269744772155e-05,
"loss": 0.8818,
"step": 607
},
{
"epoch": 3.3178717598908594,
"grad_norm": 0.23743622737062894,
"learning_rate": 1.22318624114812e-05,
"loss": 0.9072,
"step": 608
},
{
"epoch": 3.3233287858117326,
"grad_norm": 0.3642214485244677,
"learning_rate": 1.216156827037873e-05,
"loss": 0.8833,
"step": 609
},
{
"epoch": 3.328785811732606,
"grad_norm": 0.2925427624739931,
"learning_rate": 1.2091388345744703e-05,
"loss": 0.911,
"step": 610
},
{
"epoch": 3.3342428376534787,
"grad_norm": 0.2377203948239386,
"learning_rate": 1.2021323660194798e-05,
"loss": 0.8965,
"step": 611
},
{
"epoch": 3.339699863574352,
"grad_norm": 0.2706687731608815,
"learning_rate": 1.1951375234665501e-05,
"loss": 0.9036,
"step": 612
},
{
"epoch": 3.3451568894952253,
"grad_norm": 0.2679343617436159,
"learning_rate": 1.1881544088399237e-05,
"loss": 0.8939,
"step": 613
},
{
"epoch": 3.350613915416098,
"grad_norm": 0.22617857543228842,
"learning_rate": 1.1811831238929508e-05,
"loss": 0.9021,
"step": 614
},
{
"epoch": 3.3560709413369714,
"grad_norm": 0.2904617911241792,
"learning_rate": 1.1742237702066074e-05,
"loss": 0.8863,
"step": 615
},
{
"epoch": 3.3615279672578446,
"grad_norm": 0.22733511585309843,
"learning_rate": 1.1672764491880153e-05,
"loss": 0.9143,
"step": 616
},
{
"epoch": 3.3669849931787175,
"grad_norm": 0.256013923198982,
"learning_rate": 1.1603412620689637e-05,
"loss": 0.899,
"step": 617
},
{
"epoch": 3.3724420190995907,
"grad_norm": 0.25205210893149643,
"learning_rate": 1.1534183099044363e-05,
"loss": 0.8853,
"step": 618
},
{
"epoch": 3.377899045020464,
"grad_norm": 0.23143271683735414,
"learning_rate": 1.1465076935711355e-05,
"loss": 0.8947,
"step": 619
},
{
"epoch": 3.383356070941337,
"grad_norm": 0.22370756793978866,
"learning_rate": 1.1396095137660134e-05,
"loss": 0.8785,
"step": 620
},
{
"epoch": 3.38881309686221,
"grad_norm": 0.21290283764682943,
"learning_rate": 1.1327238710048075e-05,
"loss": 0.9032,
"step": 621
},
{
"epoch": 3.3942701227830834,
"grad_norm": 0.2685069204258351,
"learning_rate": 1.1258508656205715e-05,
"loss": 0.8941,
"step": 622
},
{
"epoch": 3.399727148703956,
"grad_norm": 0.20912948755324795,
"learning_rate": 1.118990597762216e-05,
"loss": 0.8913,
"step": 623
},
{
"epoch": 3.4051841746248295,
"grad_norm": 0.24827347077451523,
"learning_rate": 1.1121431673930509e-05,
"loss": 0.883,
"step": 624
},
{
"epoch": 3.4106412005457027,
"grad_norm": 0.22274674891516377,
"learning_rate": 1.1053086742893244e-05,
"loss": 0.9017,
"step": 625
},
{
"epoch": 3.4160982264665756,
"grad_norm": 0.23575151807168895,
"learning_rate": 1.0984872180387715e-05,
"loss": 0.8988,
"step": 626
},
{
"epoch": 3.421555252387449,
"grad_norm": 0.21353314466163129,
"learning_rate": 1.0916788980391633e-05,
"loss": 0.9098,
"step": 627
},
{
"epoch": 3.427012278308322,
"grad_norm": 0.22040517357317185,
"learning_rate": 1.0848838134968589e-05,
"loss": 0.884,
"step": 628
},
{
"epoch": 3.432469304229195,
"grad_norm": 0.22910802159215685,
"learning_rate": 1.0781020634253579e-05,
"loss": 0.8833,
"step": 629
},
{
"epoch": 3.437926330150068,
"grad_norm": 0.21849412085599912,
"learning_rate": 1.0713337466438578e-05,
"loss": 0.8839,
"step": 630
},
{
"epoch": 3.4433833560709415,
"grad_norm": 0.21965410678288466,
"learning_rate": 1.0645789617758181e-05,
"loss": 0.9005,
"step": 631
},
{
"epoch": 3.4488403819918143,
"grad_norm": 0.204035562242123,
"learning_rate": 1.057837807247518e-05,
"loss": 0.892,
"step": 632
},
{
"epoch": 3.4542974079126876,
"grad_norm": 0.16983377384281073,
"learning_rate": 1.0511103812866238e-05,
"loss": 0.8812,
"step": 633
},
{
"epoch": 3.459754433833561,
"grad_norm": 0.2042102923266645,
"learning_rate": 1.0443967819207602e-05,
"loss": 0.88,
"step": 634
},
{
"epoch": 3.4652114597544337,
"grad_norm": 0.18518985041839892,
"learning_rate": 1.0376971069760774e-05,
"loss": 0.9172,
"step": 635
},
{
"epoch": 3.470668485675307,
"grad_norm": 0.19653140995159937,
"learning_rate": 1.0310114540758298e-05,
"loss": 0.895,
"step": 636
},
{
"epoch": 3.47612551159618,
"grad_norm": 0.22830479434165665,
"learning_rate": 1.0243399206389527e-05,
"loss": 0.9044,
"step": 637
},
{
"epoch": 3.481582537517053,
"grad_norm": 0.19206764620071587,
"learning_rate": 1.0176826038786394e-05,
"loss": 0.8818,
"step": 638
},
{
"epoch": 3.4870395634379263,
"grad_norm": 0.21389623128712906,
"learning_rate": 1.011039600800928e-05,
"loss": 0.8956,
"step": 639
},
{
"epoch": 3.4924965893587996,
"grad_norm": 0.21993143291851755,
"learning_rate": 1.004411008203289e-05,
"loss": 0.8927,
"step": 640
},
{
"epoch": 3.4979536152796724,
"grad_norm": 0.1894006892821513,
"learning_rate": 9.977969226732099e-06,
"loss": 0.8771,
"step": 641
},
{
"epoch": 3.5034106412005457,
"grad_norm": 0.19959640202420684,
"learning_rate": 9.911974405867917e-06,
"loss": 0.8912,
"step": 642
},
{
"epoch": 3.508867667121419,
"grad_norm": 0.14759174219062646,
"learning_rate": 9.846126581073457e-06,
"loss": 0.8992,
"step": 643
},
{
"epoch": 3.5143246930422922,
"grad_norm": 0.20035668476318763,
"learning_rate": 9.780426711839877e-06,
"loss": 0.9006,
"step": 644
},
{
"epoch": 3.519781718963165,
"grad_norm": 0.16797091670116737,
"learning_rate": 9.714875755502429e-06,
"loss": 0.8873,
"step": 645
},
{
"epoch": 3.5252387448840383,
"grad_norm": 0.189909496119316,
"learning_rate": 9.649474667226513e-06,
"loss": 0.9186,
"step": 646
},
{
"epoch": 3.530695770804911,
"grad_norm": 0.1662855707845877,
"learning_rate": 9.58422439999374e-06,
"loss": 0.9061,
"step": 647
},
{
"epoch": 3.5361527967257844,
"grad_norm": 0.1877435970889167,
"learning_rate": 9.519125904588059e-06,
"loss": 0.9124,
"step": 648
},
{
"epoch": 3.5416098226466577,
"grad_norm": 0.18966972578830213,
"learning_rate": 9.45418012958191e-06,
"loss": 0.9002,
"step": 649
},
{
"epoch": 3.547066848567531,
"grad_norm": 0.18521500133290328,
"learning_rate": 9.389388021322381e-06,
"loss": 0.8921,
"step": 650
},
{
"epoch": 3.552523874488404,
"grad_norm": 0.20655179032846327,
"learning_rate": 9.32475052391742e-06,
"loss": 0.8975,
"step": 651
},
{
"epoch": 3.557980900409277,
"grad_norm": 0.1819692294620117,
"learning_rate": 9.26026857922212e-06,
"loss": 0.9082,
"step": 652
},
{
"epoch": 3.56343792633015,
"grad_norm": 0.18675168504713038,
"learning_rate": 9.19594312682493e-06,
"loss": 0.9045,
"step": 653
},
{
"epoch": 3.568894952251023,
"grad_norm": 0.16349611233292402,
"learning_rate": 9.131775104034009e-06,
"loss": 0.8907,
"step": 654
},
{
"epoch": 3.5743519781718964,
"grad_norm": 0.17657868890026518,
"learning_rate": 9.067765445863545e-06,
"loss": 0.8777,
"step": 655
},
{
"epoch": 3.5798090040927697,
"grad_norm": 0.1520862113066698,
"learning_rate": 9.00391508502017e-06,
"loss": 0.8761,
"step": 656
},
{
"epoch": 3.5852660300136425,
"grad_norm": 0.16877815138189672,
"learning_rate": 8.940224951889304e-06,
"loss": 0.869,
"step": 657
},
{
"epoch": 3.590723055934516,
"grad_norm": 0.16925000281087574,
"learning_rate": 8.876695974521659e-06,
"loss": 0.9011,
"step": 658
},
{
"epoch": 3.5961800818553886,
"grad_norm": 0.16759697258423073,
"learning_rate": 8.813329078619679e-06,
"loss": 0.9045,
"step": 659
},
{
"epoch": 3.601637107776262,
"grad_norm": 0.1896922083229097,
"learning_rate": 8.750125187524068e-06,
"loss": 0.86,
"step": 660
},
{
"epoch": 3.607094133697135,
"grad_norm": 0.17884520359215278,
"learning_rate": 8.687085222200323e-06,
"loss": 0.9095,
"step": 661
},
{
"epoch": 3.6125511596180084,
"grad_norm": 0.176877762158684,
"learning_rate": 8.624210101225343e-06,
"loss": 0.8985,
"step": 662
},
{
"epoch": 3.6180081855388813,
"grad_norm": 0.2002369650449839,
"learning_rate": 8.561500740774008e-06,
"loss": 0.8929,
"step": 663
},
{
"epoch": 3.6234652114597545,
"grad_norm": 0.17592875629565122,
"learning_rate": 8.498958054605837e-06,
"loss": 0.8778,
"step": 664
},
{
"epoch": 3.6289222373806274,
"grad_norm": 0.21757591177018767,
"learning_rate": 8.436582954051707e-06,
"loss": 0.9046,
"step": 665
},
{
"epoch": 3.6343792633015006,
"grad_norm": 0.16964570321715836,
"learning_rate": 8.374376348000523e-06,
"loss": 0.8766,
"step": 666
},
{
"epoch": 3.639836289222374,
"grad_norm": 0.20816910485872794,
"learning_rate": 8.312339142886003e-06,
"loss": 0.8948,
"step": 667
},
{
"epoch": 3.645293315143247,
"grad_norm": 0.21318859663355175,
"learning_rate": 8.250472242673486e-06,
"loss": 0.9035,
"step": 668
},
{
"epoch": 3.65075034106412,
"grad_norm": 0.17223582052559827,
"learning_rate": 8.188776548846717e-06,
"loss": 0.8914,
"step": 669
},
{
"epoch": 3.6562073669849933,
"grad_norm": 0.20492759686497783,
"learning_rate": 8.127252960394744e-06,
"loss": 0.8871,
"step": 670
},
{
"epoch": 3.661664392905866,
"grad_norm": 0.17660213480793235,
"learning_rate": 8.065902373798808e-06,
"loss": 0.8658,
"step": 671
},
{
"epoch": 3.6671214188267394,
"grad_norm": 0.18013543727863568,
"learning_rate": 8.004725683019276e-06,
"loss": 0.9016,
"step": 672
},
{
"epoch": 3.6725784447476126,
"grad_norm": 0.1844280666804985,
"learning_rate": 7.943723779482628e-06,
"loss": 0.9034,
"step": 673
},
{
"epoch": 3.678035470668486,
"grad_norm": 0.14933482527632957,
"learning_rate": 7.882897552068447e-06,
"loss": 0.9044,
"step": 674
},
{
"epoch": 3.6834924965893587,
"grad_norm": 0.180577120421336,
"learning_rate": 7.822247887096499e-06,
"loss": 0.8987,
"step": 675
},
{
"epoch": 3.688949522510232,
"grad_norm": 0.18976867015358279,
"learning_rate": 7.761775668313775e-06,
"loss": 0.9055,
"step": 676
},
{
"epoch": 3.694406548431105,
"grad_norm": 0.14380655448071636,
"learning_rate": 7.70148177688166e-06,
"loss": 0.8819,
"step": 677
},
{
"epoch": 3.699863574351978,
"grad_norm": 0.1605511243289739,
"learning_rate": 7.641367091363056e-06,
"loss": 0.8765,
"step": 678
},
{
"epoch": 3.7053206002728514,
"grad_norm": 0.16966229691015783,
"learning_rate": 7.581432487709595e-06,
"loss": 0.8956,
"step": 679
},
{
"epoch": 3.7107776261937246,
"grad_norm": 0.15825612639259118,
"learning_rate": 7.521678839248867e-06,
"loss": 0.8757,
"step": 680
},
{
"epoch": 3.7162346521145975,
"grad_norm": 0.15905765650755102,
"learning_rate": 7.462107016671727e-06,
"loss": 0.9021,
"step": 681
},
{
"epoch": 3.7216916780354707,
"grad_norm": 0.1678589543544254,
"learning_rate": 7.402717888019561e-06,
"loss": 0.9037,
"step": 682
},
{
"epoch": 3.7271487039563436,
"grad_norm": 0.16250907925377683,
"learning_rate": 7.343512318671668e-06,
"loss": 0.8996,
"step": 683
},
{
"epoch": 3.732605729877217,
"grad_norm": 0.1796362073897607,
"learning_rate": 7.284491171332637e-06,
"loss": 0.9044,
"step": 684
},
{
"epoch": 3.73806275579809,
"grad_norm": 0.15668011051829173,
"learning_rate": 7.225655306019783e-06,
"loss": 0.888,
"step": 685
},
{
"epoch": 3.7435197817189634,
"grad_norm": 0.1668930240876366,
"learning_rate": 7.167005580050608e-06,
"loss": 0.9017,
"step": 686
},
{
"epoch": 3.748976807639836,
"grad_norm": 0.18870659107182658,
"learning_rate": 7.108542848030333e-06,
"loss": 0.8767,
"step": 687
},
{
"epoch": 3.7544338335607095,
"grad_norm": 0.15696986217820777,
"learning_rate": 7.050267961839407e-06,
"loss": 0.8909,
"step": 688
},
{
"epoch": 3.7598908594815823,
"grad_norm": 0.18431028719776638,
"learning_rate": 6.992181770621109e-06,
"loss": 0.8868,
"step": 689
},
{
"epoch": 3.7653478854024556,
"grad_norm": 0.16154837397895874,
"learning_rate": 6.934285120769206e-06,
"loss": 0.8994,
"step": 690
},
{
"epoch": 3.770804911323329,
"grad_norm": 0.1608522865427035,
"learning_rate": 6.87657885591557e-06,
"loss": 0.9054,
"step": 691
},
{
"epoch": 3.776261937244202,
"grad_norm": 0.17546410153871858,
"learning_rate": 6.819063816917904e-06,
"loss": 0.8771,
"step": 692
},
{
"epoch": 3.781718963165075,
"grad_norm": 0.17779343503619688,
"learning_rate": 6.761740841847517e-06,
"loss": 0.8828,
"step": 693
},
{
"epoch": 3.787175989085948,
"grad_norm": 0.1620894791729856,
"learning_rate": 6.704610765977073e-06,
"loss": 0.8896,
"step": 694
},
{
"epoch": 3.792633015006821,
"grad_norm": 0.16551990476440234,
"learning_rate": 6.647674421768435e-06,
"loss": 0.8885,
"step": 695
},
{
"epoch": 3.7980900409276943,
"grad_norm": 0.17247511398164073,
"learning_rate": 6.590932638860543e-06,
"loss": 0.9229,
"step": 696
},
{
"epoch": 3.8035470668485676,
"grad_norm": 0.17229017961388754,
"learning_rate": 6.5343862440573095e-06,
"loss": 0.8809,
"step": 697
},
{
"epoch": 3.809004092769441,
"grad_norm": 0.15732362181652573,
"learning_rate": 6.478036061315587e-06,
"loss": 0.903,
"step": 698
},
{
"epoch": 3.8144611186903137,
"grad_norm": 0.14793725507686076,
"learning_rate": 6.421882911733146e-06,
"loss": 0.9084,
"step": 699
},
{
"epoch": 3.819918144611187,
"grad_norm": 0.18160474710129887,
"learning_rate": 6.365927613536737e-06,
"loss": 0.8833,
"step": 700
},
{
"epoch": 3.8253751705320598,
"grad_norm": 0.16205271433369595,
"learning_rate": 6.310170982070132e-06,
"loss": 0.903,
"step": 701
},
{
"epoch": 3.830832196452933,
"grad_norm": 0.1755196814184644,
"learning_rate": 6.254613829782274e-06,
"loss": 0.8866,
"step": 702
},
{
"epoch": 3.8362892223738063,
"grad_norm": 0.16947891319556294,
"learning_rate": 6.199256966215423e-06,
"loss": 0.9072,
"step": 703
},
{
"epoch": 3.8417462482946796,
"grad_norm": 0.1598029992685231,
"learning_rate": 6.1441011979933615e-06,
"loss": 0.8965,
"step": 704
},
{
"epoch": 3.8472032742155524,
"grad_norm": 0.17633255200544773,
"learning_rate": 6.089147328809637e-06,
"loss": 0.9213,
"step": 705
},
{
"epoch": 3.8526603001364257,
"grad_norm": 0.14858434315925467,
"learning_rate": 6.034396159415874e-06,
"loss": 0.9057,
"step": 706
},
{
"epoch": 3.8581173260572985,
"grad_norm": 0.1359593564440916,
"learning_rate": 5.979848487610078e-06,
"loss": 0.9002,
"step": 707
},
{
"epoch": 3.863574351978172,
"grad_norm": 0.1546596886497959,
"learning_rate": 5.92550510822502e-06,
"loss": 0.881,
"step": 708
},
{
"epoch": 3.869031377899045,
"grad_norm": 0.1553240834204749,
"learning_rate": 5.871366813116661e-06,
"loss": 0.9015,
"step": 709
},
{
"epoch": 3.8744884038199183,
"grad_norm": 0.14118959880699977,
"learning_rate": 5.817434391152605e-06,
"loss": 0.8907,
"step": 710
},
{
"epoch": 3.879945429740791,
"grad_norm": 0.14059851937404533,
"learning_rate": 5.763708628200609e-06,
"loss": 0.8891,
"step": 711
},
{
"epoch": 3.8854024556616644,
"grad_norm": 0.15427945771110663,
"learning_rate": 5.710190307117138e-06,
"loss": 0.8951,
"step": 712
},
{
"epoch": 3.8908594815825372,
"grad_norm": 0.1445538887040146,
"learning_rate": 5.656880207735938e-06,
"loss": 0.8877,
"step": 713
},
{
"epoch": 3.8963165075034105,
"grad_norm": 0.15649585838748734,
"learning_rate": 5.603779106856699e-06,
"loss": 0.9074,
"step": 714
},
{
"epoch": 3.901773533424284,
"grad_norm": 0.13648774675224182,
"learning_rate": 5.550887778233713e-06,
"loss": 0.8941,
"step": 715
},
{
"epoch": 3.907230559345157,
"grad_norm": 0.15565409065858304,
"learning_rate": 5.498206992564612e-06,
"loss": 0.9173,
"step": 716
},
{
"epoch": 3.91268758526603,
"grad_norm": 0.13922969052192785,
"learning_rate": 5.4457375174791325e-06,
"loss": 0.8893,
"step": 717
},
{
"epoch": 3.918144611186903,
"grad_norm": 0.15294676839534935,
"learning_rate": 5.3934801175279276e-06,
"loss": 0.9154,
"step": 718
},
{
"epoch": 3.923601637107776,
"grad_norm": 0.15092879808147422,
"learning_rate": 5.341435554171448e-06,
"loss": 0.8827,
"step": 719
},
{
"epoch": 3.9290586630286493,
"grad_norm": 0.14825666022997366,
"learning_rate": 5.289604585768813e-06,
"loss": 0.8848,
"step": 720
},
{
"epoch": 3.9345156889495225,
"grad_norm": 0.1606715610763504,
"learning_rate": 5.237987967566787e-06,
"loss": 0.8772,
"step": 721
},
{
"epoch": 3.939972714870396,
"grad_norm": 0.16522816411905664,
"learning_rate": 5.1865864516887535e-06,
"loss": 0.8976,
"step": 722
},
{
"epoch": 3.9454297407912686,
"grad_norm": 0.15958019587002623,
"learning_rate": 5.1354007871237765e-06,
"loss": 0.906,
"step": 723
},
{
"epoch": 3.950886766712142,
"grad_norm": 0.150449287740693,
"learning_rate": 5.084431719715668e-06,
"loss": 0.8925,
"step": 724
},
{
"epoch": 3.956343792633015,
"grad_norm": 0.1654448721490872,
"learning_rate": 5.033679992152143e-06,
"loss": 0.8949,
"step": 725
},
{
"epoch": 3.961800818553888,
"grad_norm": 0.15862344300369557,
"learning_rate": 4.983146343953964e-06,
"loss": 0.8802,
"step": 726
},
{
"epoch": 3.9672578444747613,
"grad_norm": 0.13976420034767134,
"learning_rate": 4.932831511464206e-06,
"loss": 0.887,
"step": 727
},
{
"epoch": 3.9727148703956345,
"grad_norm": 0.18682370943191948,
"learning_rate": 4.88273622783749e-06,
"loss": 0.8953,
"step": 728
},
{
"epoch": 3.9781718963165074,
"grad_norm": 0.142893917159586,
"learning_rate": 4.83286122302932e-06,
"loss": 0.8823,
"step": 729
},
{
"epoch": 3.9836289222373806,
"grad_norm": 0.1501981132875881,
"learning_rate": 4.783207223785431e-06,
"loss": 0.8964,
"step": 730
},
{
"epoch": 3.989085948158254,
"grad_norm": 0.15657458729040308,
"learning_rate": 4.733774953631238e-06,
"loss": 0.8979,
"step": 731
},
{
"epoch": 3.9945429740791267,
"grad_norm": 0.13982230103959686,
"learning_rate": 4.68456513286124e-06,
"loss": 0.8923,
"step": 732
},
{
"epoch": 4.0,
"grad_norm": 0.27858828514063777,
"learning_rate": 4.6355784785285615e-06,
"loss": 1.5566,
"step": 733
},
{
"epoch": 4.005457025920873,
"grad_norm": 0.17089905607735426,
"learning_rate": 4.586815704434488e-06,
"loss": 0.887,
"step": 734
},
{
"epoch": 4.0109140518417465,
"grad_norm": 0.14705477825085042,
"learning_rate": 4.538277521118071e-06,
"loss": 0.8841,
"step": 735
},
{
"epoch": 4.01637107776262,
"grad_norm": 0.1636792606316968,
"learning_rate": 4.489964635845769e-06,
"loss": 0.8899,
"step": 736
},
{
"epoch": 4.021828103683492,
"grad_norm": 0.15198479944975976,
"learning_rate": 4.44187775260116e-06,
"loss": 0.8881,
"step": 737
},
{
"epoch": 4.0272851296043655,
"grad_norm": 0.13291652839803894,
"learning_rate": 4.3940175720746494e-06,
"loss": 0.8696,
"step": 738
},
{
"epoch": 4.032742155525239,
"grad_norm": 0.1490103061507369,
"learning_rate": 4.346384791653298e-06,
"loss": 0.8984,
"step": 739
},
{
"epoch": 4.038199181446112,
"grad_norm": 0.17175563522601708,
"learning_rate": 4.2989801054106305e-06,
"loss": 0.8665,
"step": 740
},
{
"epoch": 4.043656207366985,
"grad_norm": 0.1499319026668514,
"learning_rate": 4.251804204096535e-06,
"loss": 0.8779,
"step": 741
},
{
"epoch": 4.0491132332878585,
"grad_norm": 0.16227165418614628,
"learning_rate": 4.204857775127198e-06,
"loss": 0.8755,
"step": 742
},
{
"epoch": 4.054570259208731,
"grad_norm": 0.1581981145043867,
"learning_rate": 4.1581415025750795e-06,
"loss": 0.8895,
"step": 743
},
{
"epoch": 4.060027285129604,
"grad_norm": 0.15513935379525345,
"learning_rate": 4.111656067158971e-06,
"loss": 0.8974,
"step": 744
},
{
"epoch": 4.0654843110504775,
"grad_norm": 0.14535697871945671,
"learning_rate": 4.065402146234034e-06,
"loss": 0.8485,
"step": 745
},
{
"epoch": 4.070941336971351,
"grad_norm": 0.1297532212724062,
"learning_rate": 4.019380413781968e-06,
"loss": 0.885,
"step": 746
},
{
"epoch": 4.076398362892224,
"grad_norm": 0.1488778393601588,
"learning_rate": 3.973591540401165e-06,
"loss": 0.9015,
"step": 747
},
{
"epoch": 4.081855388813097,
"grad_norm": 0.13978030494695767,
"learning_rate": 3.928036193296958e-06,
"loss": 0.8887,
"step": 748
},
{
"epoch": 4.08731241473397,
"grad_norm": 0.14411923483228978,
"learning_rate": 3.882715036271874e-06,
"loss": 0.8734,
"step": 749
},
{
"epoch": 4.092769440654843,
"grad_norm": 0.139081525574305,
"learning_rate": 3.837628729715994e-06,
"loss": 0.8781,
"step": 750
},
{
"epoch": 4.098226466575716,
"grad_norm": 0.14858634817778646,
"learning_rate": 3.7927779305973066e-06,
"loss": 0.8708,
"step": 751
},
{
"epoch": 4.1036834924965895,
"grad_norm": 0.1364174899816674,
"learning_rate": 3.7481632924521383e-06,
"loss": 0.8741,
"step": 752
},
{
"epoch": 4.109140518417463,
"grad_norm": 0.13932957227692389,
"learning_rate": 3.7037854653756287e-06,
"loss": 0.8921,
"step": 753
},
{
"epoch": 4.114597544338336,
"grad_norm": 0.14304788451278092,
"learning_rate": 3.65964509601227e-06,
"loss": 0.8765,
"step": 754
},
{
"epoch": 4.120054570259208,
"grad_norm": 0.1629674318472855,
"learning_rate": 3.6157428275464713e-06,
"loss": 0.8865,
"step": 755
},
{
"epoch": 4.125511596180082,
"grad_norm": 0.1363245824129024,
"learning_rate": 3.572079299693201e-06,
"loss": 0.9084,
"step": 756
},
{
"epoch": 4.130968622100955,
"grad_norm": 0.1508822391111529,
"learning_rate": 3.528655148688649e-06,
"loss": 0.8851,
"step": 757
},
{
"epoch": 4.136425648021828,
"grad_norm": 0.14303462818362056,
"learning_rate": 3.485471007280965e-06,
"loss": 0.8758,
"step": 758
},
{
"epoch": 4.1418826739427015,
"grad_norm": 0.14526482585748274,
"learning_rate": 3.4425275047210337e-06,
"loss": 0.8888,
"step": 759
},
{
"epoch": 4.147339699863575,
"grad_norm": 0.13337908689426514,
"learning_rate": 3.399825266753316e-06,
"loss": 0.8996,
"step": 760
},
{
"epoch": 4.152796725784447,
"grad_norm": 0.12506301079113333,
"learning_rate": 3.357364915606711e-06,
"loss": 0.8817,
"step": 761
},
{
"epoch": 4.15825375170532,
"grad_norm": 0.13808109230175336,
"learning_rate": 3.3151470699855226e-06,
"loss": 0.8784,
"step": 762
},
{
"epoch": 4.163710777626194,
"grad_norm": 0.1322332471365764,
"learning_rate": 3.2731723450604047e-06,
"loss": 0.8905,
"step": 763
},
{
"epoch": 4.169167803547067,
"grad_norm": 0.13389925768742733,
"learning_rate": 3.23144135245943e-06,
"loss": 0.8952,
"step": 764
},
{
"epoch": 4.17462482946794,
"grad_norm": 0.13683886240243162,
"learning_rate": 3.1899547002591548e-06,
"loss": 0.8755,
"step": 765
},
{
"epoch": 4.1800818553888135,
"grad_norm": 0.12558097802450152,
"learning_rate": 3.148712992975773e-06,
"loss": 0.8579,
"step": 766
},
{
"epoch": 4.185538881309686,
"grad_norm": 0.13782990446140714,
"learning_rate": 3.107716831556298e-06,
"loss": 0.8929,
"step": 767
},
{
"epoch": 4.190995907230559,
"grad_norm": 0.13743942655956906,
"learning_rate": 3.0669668133698114e-06,
"loss": 0.8627,
"step": 768
},
{
"epoch": 4.196452933151432,
"grad_norm": 0.12989089669107465,
"learning_rate": 3.026463532198767e-06,
"loss": 0.8799,
"step": 769
},
{
"epoch": 4.201909959072306,
"grad_norm": 0.15662525453225684,
"learning_rate": 2.9862075782303155e-06,
"loss": 0.8731,
"step": 770
},
{
"epoch": 4.207366984993179,
"grad_norm": 0.1350418186415897,
"learning_rate": 2.946199538047727e-06,
"loss": 0.8602,
"step": 771
},
{
"epoch": 4.212824010914052,
"grad_norm": 0.12498595424209477,
"learning_rate": 2.9064399946218304e-06,
"loss": 0.868,
"step": 772
},
{
"epoch": 4.218281036834925,
"grad_norm": 0.21098257229096243,
"learning_rate": 2.866929527302522e-06,
"loss": 0.8883,
"step": 773
},
{
"epoch": 4.223738062755798,
"grad_norm": 0.133123941008207,
"learning_rate": 2.8276687118103384e-06,
"loss": 0.8878,
"step": 774
},
{
"epoch": 4.229195088676671,
"grad_norm": 0.1418691768230737,
"learning_rate": 2.7886581202280338e-06,
"loss": 0.8978,
"step": 775
},
{
"epoch": 4.234652114597544,
"grad_norm": 0.14622777292208364,
"learning_rate": 2.749898320992286e-06,
"loss": 0.8855,
"step": 776
},
{
"epoch": 4.240109140518418,
"grad_norm": 0.13868949813718004,
"learning_rate": 2.711389878885371e-06,
"loss": 0.8782,
"step": 777
},
{
"epoch": 4.245566166439291,
"grad_norm": 0.12620162322262743,
"learning_rate": 2.673133355026969e-06,
"loss": 0.8742,
"step": 778
},
{
"epoch": 4.251023192360163,
"grad_norm": 0.1271015484185532,
"learning_rate": 2.6351293068659643e-06,
"loss": 0.8748,
"step": 779
},
{
"epoch": 4.256480218281037,
"grad_norm": 0.18196702435356202,
"learning_rate": 2.597378288172332e-06,
"loss": 0.8851,
"step": 780
},
{
"epoch": 4.26193724420191,
"grad_norm": 0.16872955546686272,
"learning_rate": 2.559880849029079e-06,
"loss": 0.8802,
"step": 781
},
{
"epoch": 4.267394270122783,
"grad_norm": 0.13072466845715314,
"learning_rate": 2.5226375358242085e-06,
"loss": 0.8877,
"step": 782
},
{
"epoch": 4.272851296043656,
"grad_norm": 0.14754876950071485,
"learning_rate": 2.485648891242767e-06,
"loss": 0.8904,
"step": 783
},
{
"epoch": 4.27830832196453,
"grad_norm": 0.1590434780138768,
"learning_rate": 2.448915454258942e-06,
"loss": 0.9032,
"step": 784
},
{
"epoch": 4.283765347885402,
"grad_norm": 0.14151048099974572,
"learning_rate": 2.412437760128199e-06,
"loss": 0.8918,
"step": 785
},
{
"epoch": 4.289222373806275,
"grad_norm": 0.13620855260975054,
"learning_rate": 2.376216340379489e-06,
"loss": 0.8845,
"step": 786
},
{
"epoch": 4.294679399727149,
"grad_norm": 0.1901936677421411,
"learning_rate": 2.3402517228075073e-06,
"loss": 0.8851,
"step": 787
},
{
"epoch": 4.300136425648022,
"grad_norm": 0.19671986170174766,
"learning_rate": 2.3045444314649856e-06,
"loss": 0.8678,
"step": 788
},
{
"epoch": 4.305593451568895,
"grad_norm": 0.14899305081412742,
"learning_rate": 2.2690949866550803e-06,
"loss": 0.8893,
"step": 789
},
{
"epoch": 4.311050477489768,
"grad_norm": 0.16143293820225038,
"learning_rate": 2.2339039049237687e-06,
"loss": 0.9024,
"step": 790
},
{
"epoch": 4.316507503410641,
"grad_norm": 0.17932782006553405,
"learning_rate": 2.19897169905233e-06,
"loss": 0.8929,
"step": 791
},
{
"epoch": 4.321964529331514,
"grad_norm": 0.14806731199839362,
"learning_rate": 2.164298878049882e-06,
"loss": 0.8662,
"step": 792
},
{
"epoch": 4.327421555252387,
"grad_norm": 0.1289649779602983,
"learning_rate": 2.1298859471459443e-06,
"loss": 0.8813,
"step": 793
},
{
"epoch": 4.332878581173261,
"grad_norm": 0.19801253886238948,
"learning_rate": 2.0957334077831115e-06,
"loss": 0.9005,
"step": 794
},
{
"epoch": 4.338335607094134,
"grad_norm": 0.19694630717701755,
"learning_rate": 2.0618417576097016e-06,
"loss": 0.9052,
"step": 795
},
{
"epoch": 4.343792633015007,
"grad_norm": 0.13211759110481675,
"learning_rate": 2.028211490472538e-06,
"loss": 0.8727,
"step": 796
},
{
"epoch": 4.34924965893588,
"grad_norm": 0.16942182286893248,
"learning_rate": 1.99484309640974e-06,
"loss": 0.8939,
"step": 797
},
{
"epoch": 4.354706684856753,
"grad_norm": 0.15525627631122169,
"learning_rate": 1.9617370616435827e-06,
"loss": 0.8769,
"step": 798
},
{
"epoch": 4.360163710777626,
"grad_norm": 0.14568495391925143,
"learning_rate": 1.9288938685734206e-06,
"loss": 0.8801,
"step": 799
},
{
"epoch": 4.365620736698499,
"grad_norm": 0.1482638689148959,
"learning_rate": 1.8963139957686439e-06,
"loss": 0.8865,
"step": 800
},
{
"epoch": 4.371077762619373,
"grad_norm": 0.1698206452526069,
"learning_rate": 1.863997917961724e-06,
"loss": 0.8756,
"step": 801
},
{
"epoch": 4.376534788540246,
"grad_norm": 0.15989049182819062,
"learning_rate": 1.8319461060412735e-06,
"loss": 0.8827,
"step": 802
},
{
"epoch": 4.381991814461118,
"grad_norm": 0.12598713099232536,
"learning_rate": 1.8001590270452007e-06,
"loss": 0.8955,
"step": 803
},
{
"epoch": 4.387448840381992,
"grad_norm": 0.13029377486709406,
"learning_rate": 1.7686371441539041e-06,
"loss": 0.8964,
"step": 804
},
{
"epoch": 4.392905866302865,
"grad_norm": 0.15126612488881352,
"learning_rate": 1.7373809166835131e-06,
"loss": 0.8838,
"step": 805
},
{
"epoch": 4.398362892223738,
"grad_norm": 0.13445739913334448,
"learning_rate": 1.7063908000791984e-06,
"loss": 0.8958,
"step": 806
},
{
"epoch": 4.403819918144611,
"grad_norm": 0.16664103966071625,
"learning_rate": 1.6756672459085565e-06,
"loss": 0.8826,
"step": 807
},
{
"epoch": 4.409276944065485,
"grad_norm": 0.1762539743129894,
"learning_rate": 1.645210701854989e-06,
"loss": 0.8785,
"step": 808
},
{
"epoch": 4.414733969986357,
"grad_norm": 0.15934966334590775,
"learning_rate": 1.615021611711216e-06,
"loss": 0.8854,
"step": 809
},
{
"epoch": 4.42019099590723,
"grad_norm": 0.124968014984558,
"learning_rate": 1.5851004153727845e-06,
"loss": 0.8788,
"step": 810
},
{
"epoch": 4.425648021828104,
"grad_norm": 0.1385331750299138,
"learning_rate": 1.5554475488316812e-06,
"loss": 0.8916,
"step": 811
},
{
"epoch": 4.431105047748977,
"grad_norm": 0.13707927403446576,
"learning_rate": 1.5260634441699585e-06,
"loss": 0.8742,
"step": 812
},
{
"epoch": 4.43656207366985,
"grad_norm": 0.12627498054063097,
"learning_rate": 1.496948529553457e-06,
"loss": 0.887,
"step": 813
},
{
"epoch": 4.442019099590723,
"grad_norm": 0.1486808085420501,
"learning_rate": 1.468103229225546e-06,
"loss": 0.8808,
"step": 814
},
{
"epoch": 4.447476125511596,
"grad_norm": 0.14605688992873062,
"learning_rate": 1.4395279635009595e-06,
"loss": 0.8708,
"step": 815
},
{
"epoch": 4.452933151432469,
"grad_norm": 0.13906895719296147,
"learning_rate": 1.4112231487596618e-06,
"loss": 0.8649,
"step": 816
},
{
"epoch": 4.458390177353342,
"grad_norm": 0.11788044087197277,
"learning_rate": 1.3831891974407862e-06,
"loss": 0.8783,
"step": 817
},
{
"epoch": 4.463847203274216,
"grad_norm": 0.11778162515868901,
"learning_rate": 1.3554265180366177e-06,
"loss": 0.91,
"step": 818
},
{
"epoch": 4.469304229195089,
"grad_norm": 0.14670430911084376,
"learning_rate": 1.3279355150866536e-06,
"loss": 0.8694,
"step": 819
},
{
"epoch": 4.474761255115962,
"grad_norm": 0.12231810737886735,
"learning_rate": 1.3007165891716978e-06,
"loss": 0.8519,
"step": 820
},
{
"epoch": 4.480218281036835,
"grad_norm": 0.1271770535078628,
"learning_rate": 1.2737701369080213e-06,
"loss": 0.9097,
"step": 821
},
{
"epoch": 4.485675306957708,
"grad_norm": 0.1385648556146423,
"learning_rate": 1.2470965509415911e-06,
"loss": 0.8968,
"step": 822
},
{
"epoch": 4.491132332878581,
"grad_norm": 0.152826890949677,
"learning_rate": 1.2206962199423478e-06,
"loss": 0.8831,
"step": 823
},
{
"epoch": 4.496589358799454,
"grad_norm": 0.12805457871619716,
"learning_rate": 1.1945695285985437e-06,
"loss": 0.9114,
"step": 824
},
{
"epoch": 4.502046384720328,
"grad_norm": 0.1200667730156898,
"learning_rate": 1.1687168576111251e-06,
"loss": 0.897,
"step": 825
},
{
"epoch": 4.507503410641201,
"grad_norm": 0.14528129682089547,
"learning_rate": 1.1431385836882058e-06,
"loss": 0.8645,
"step": 826
},
{
"epoch": 4.512960436562073,
"grad_norm": 0.11909725664621254,
"learning_rate": 1.1178350795395553e-06,
"loss": 0.875,
"step": 827
},
{
"epoch": 4.5184174624829465,
"grad_norm": 0.140122744341455,
"learning_rate": 1.0928067138711817e-06,
"loss": 0.8825,
"step": 828
},
{
"epoch": 4.52387448840382,
"grad_norm": 0.15162334185887835,
"learning_rate": 1.06805385137996e-06,
"loss": 0.8794,
"step": 829
},
{
"epoch": 4.529331514324693,
"grad_norm": 0.14150098143714812,
"learning_rate": 1.0435768527483114e-06,
"loss": 0.8937,
"step": 830
},
{
"epoch": 4.534788540245566,
"grad_norm": 0.1260117766174468,
"learning_rate": 1.019376074638949e-06,
"loss": 0.8815,
"step": 831
},
{
"epoch": 4.54024556616644,
"grad_norm": 0.12409915203852431,
"learning_rate": 9.954518696896854e-07,
"loss": 0.8834,
"step": 832
},
{
"epoch": 4.545702592087313,
"grad_norm": 0.136871178123947,
"learning_rate": 9.718045865082914e-07,
"loss": 0.8793,
"step": 833
},
{
"epoch": 4.551159618008185,
"grad_norm": 0.14107625789727535,
"learning_rate": 9.484345696674135e-07,
"loss": 0.9022,
"step": 834
},
{
"epoch": 4.5566166439290585,
"grad_norm": 0.15144418956800026,
"learning_rate": 9.253421596995538e-07,
"loss": 0.8668,
"step": 835
},
{
"epoch": 4.562073669849932,
"grad_norm": 0.14652192158608265,
"learning_rate": 9.025276930921168e-07,
"loss": 0.8952,
"step": 836
},
{
"epoch": 4.567530695770805,
"grad_norm": 0.1414596872944082,
"learning_rate": 8.799915022824912e-07,
"loss": 0.89,
"step": 837
},
{
"epoch": 4.572987721691678,
"grad_norm": 0.11762183453991368,
"learning_rate": 8.577339156532228e-07,
"loss": 0.8891,
"step": 838
},
{
"epoch": 4.578444747612551,
"grad_norm": 0.11696302332812643,
"learning_rate": 8.35755257527211e-07,
"loss": 0.8865,
"step": 839
},
{
"epoch": 4.583901773533424,
"grad_norm": 0.14096367257161824,
"learning_rate": 8.140558481629978e-07,
"loss": 0.883,
"step": 840
},
{
"epoch": 4.589358799454297,
"grad_norm": 0.13025076800848065,
"learning_rate": 7.92636003750098e-07,
"loss": 0.861,
"step": 841
},
{
"epoch": 4.5948158253751705,
"grad_norm": 0.13357496841366284,
"learning_rate": 7.714960364043844e-07,
"loss": 0.8917,
"step": 842
},
{
"epoch": 4.600272851296044,
"grad_norm": 0.12340312706447396,
"learning_rate": 7.506362541635482e-07,
"loss": 0.8899,
"step": 843
},
{
"epoch": 4.605729877216917,
"grad_norm": 0.14740826905136978,
"learning_rate": 7.300569609826103e-07,
"loss": 0.9164,
"step": 844
},
{
"epoch": 4.61118690313779,
"grad_norm": 0.12349556901637186,
"learning_rate": 7.097584567294858e-07,
"loss": 0.9002,
"step": 845
},
{
"epoch": 4.616643929058663,
"grad_norm": 0.12502508254095465,
"learning_rate": 6.897410371806202e-07,
"loss": 0.8966,
"step": 846
},
{
"epoch": 4.622100954979536,
"grad_norm": 0.1211951289522415,
"learning_rate": 6.70004994016673e-07,
"loss": 0.8834,
"step": 847
},
{
"epoch": 4.627557980900409,
"grad_norm": 0.12281440993768762,
"learning_rate": 6.505506148182816e-07,
"loss": 0.8871,
"step": 848
},
{
"epoch": 4.6330150068212825,
"grad_norm": 0.12502962964548078,
"learning_rate": 6.313781830618549e-07,
"loss": 0.8767,
"step": 849
},
{
"epoch": 4.638472032742156,
"grad_norm": 0.12943087930152467,
"learning_rate": 6.124879781154458e-07,
"loss": 0.875,
"step": 850
},
{
"epoch": 4.643929058663028,
"grad_norm": 0.14031219133585143,
"learning_rate": 5.938802752346972e-07,
"loss": 0.8927,
"step": 851
},
{
"epoch": 4.6493860845839015,
"grad_norm": 0.12542619019610873,
"learning_rate": 5.755553455588025e-07,
"loss": 0.8876,
"step": 852
},
{
"epoch": 4.654843110504775,
"grad_norm": 0.12638050340038925,
"learning_rate": 5.575134561065798e-07,
"loss": 0.8665,
"step": 853
},
{
"epoch": 4.660300136425648,
"grad_norm": 0.12333899862571804,
"learning_rate": 5.397548697725686e-07,
"loss": 0.8903,
"step": 854
},
{
"epoch": 4.665757162346521,
"grad_norm": 0.1346635192449993,
"learning_rate": 5.22279845323197e-07,
"loss": 0.8725,
"step": 855
},
{
"epoch": 4.6712141882673945,
"grad_norm": 0.13647426558512074,
"learning_rate": 5.050886373930231e-07,
"loss": 0.8875,
"step": 856
},
{
"epoch": 4.676671214188268,
"grad_norm": 0.11566165671981071,
"learning_rate": 4.881814964810172e-07,
"loss": 0.8749,
"step": 857
},
{
"epoch": 4.68212824010914,
"grad_norm": 0.12363326959711636,
"learning_rate": 4.715586689469054e-07,
"loss": 0.8769,
"step": 858
},
{
"epoch": 4.6875852660300135,
"grad_norm": 0.11764168862039581,
"learning_rate": 4.552203970075941e-07,
"loss": 0.8918,
"step": 859
},
{
"epoch": 4.693042291950887,
"grad_norm": 0.11426043356029422,
"learning_rate": 4.391669187336267e-07,
"loss": 0.89,
"step": 860
},
{
"epoch": 4.69849931787176,
"grad_norm": 0.11108719415747546,
"learning_rate": 4.2339846804572596e-07,
"loss": 0.8804,
"step": 861
},
{
"epoch": 4.703956343792633,
"grad_norm": 0.12913366570368975,
"learning_rate": 4.079152747113746e-07,
"loss": 0.8803,
"step": 862
},
{
"epoch": 4.709413369713506,
"grad_norm": 0.11303274704940805,
"learning_rate": 3.9271756434147825e-07,
"loss": 0.8707,
"step": 863
},
{
"epoch": 4.714870395634379,
"grad_norm": 0.12668292386761498,
"learning_rate": 3.778055583870677e-07,
"loss": 0.8615,
"step": 864
},
{
"epoch": 4.720327421555252,
"grad_norm": 0.12160210451896335,
"learning_rate": 3.631794741360839e-07,
"loss": 0.8749,
"step": 865
},
{
"epoch": 4.7257844474761255,
"grad_norm": 0.12255220583052599,
"learning_rate": 3.4883952471019833e-07,
"loss": 0.8656,
"step": 866
},
{
"epoch": 4.731241473396999,
"grad_norm": 0.13097406090149366,
"learning_rate": 3.347859190617153e-07,
"loss": 0.9104,
"step": 867
},
{
"epoch": 4.736698499317872,
"grad_norm": 0.13879798939381358,
"learning_rate": 3.210188619705257e-07,
"loss": 0.8932,
"step": 868
},
{
"epoch": 4.742155525238745,
"grad_norm": 0.10869652399193062,
"learning_rate": 3.0753855404112907e-07,
"loss": 0.8617,
"step": 869
},
{
"epoch": 4.747612551159618,
"grad_norm": 0.14405688232051542,
"learning_rate": 2.943451916997009e-07,
"loss": 0.8849,
"step": 870
},
{
"epoch": 4.753069577080491,
"grad_norm": 0.10950574849744894,
"learning_rate": 2.814389671912321e-07,
"loss": 0.8894,
"step": 871
},
{
"epoch": 4.758526603001364,
"grad_norm": 0.12549196376105284,
"learning_rate": 2.6882006857672946e-07,
"loss": 0.8666,
"step": 872
},
{
"epoch": 4.7639836289222375,
"grad_norm": 0.12049040986742628,
"learning_rate": 2.564886797304844e-07,
"loss": 0.8925,
"step": 873
},
{
"epoch": 4.769440654843111,
"grad_norm": 0.13628451786758633,
"learning_rate": 2.444449803373772e-07,
"loss": 0.8736,
"step": 874
},
{
"epoch": 4.774897680763983,
"grad_norm": 0.11819888541785435,
"learning_rate": 2.3268914589026582e-07,
"loss": 0.876,
"step": 875
},
{
"epoch": 4.780354706684856,
"grad_norm": 0.13249771798161303,
"learning_rate": 2.212213476874392e-07,
"loss": 0.8721,
"step": 876
},
{
"epoch": 4.78581173260573,
"grad_norm": 0.16019402654757406,
"learning_rate": 2.100417528301013e-07,
"loss": 0.8574,
"step": 877
},
{
"epoch": 4.791268758526603,
"grad_norm": 0.12709978273924896,
"learning_rate": 1.9915052421995095e-07,
"loss": 0.8788,
"step": 878
},
{
"epoch": 4.796725784447476,
"grad_norm": 0.1269380086439239,
"learning_rate": 1.8854782055680588e-07,
"loss": 0.8856,
"step": 879
},
{
"epoch": 4.8021828103683495,
"grad_norm": 0.12816104723044472,
"learning_rate": 1.7823379633628236e-07,
"loss": 0.8682,
"step": 880
},
{
"epoch": 4.807639836289223,
"grad_norm": 0.11897350073492513,
"learning_rate": 1.6820860184755705e-07,
"loss": 0.8893,
"step": 881
},
{
"epoch": 4.813096862210095,
"grad_norm": 0.1676441632472798,
"learning_rate": 1.584723831711621e-07,
"loss": 0.8827,
"step": 882
},
{
"epoch": 4.818553888130968,
"grad_norm": 0.11260862448875701,
"learning_rate": 1.4902528217687339e-07,
"loss": 0.8668,
"step": 883
},
{
"epoch": 4.824010914051842,
"grad_norm": 0.11073774182348436,
"learning_rate": 1.398674365216235e-07,
"loss": 0.8985,
"step": 884
},
{
"epoch": 4.829467939972715,
"grad_norm": 0.11457097434689421,
"learning_rate": 1.309989796475164e-07,
"loss": 0.8671,
"step": 885
},
{
"epoch": 4.834924965893588,
"grad_norm": 0.11278616103798808,
"learning_rate": 1.22420040779867e-07,
"loss": 0.8627,
"step": 886
},
{
"epoch": 4.8403819918144615,
"grad_norm": 0.11658568907087213,
"learning_rate": 1.1413074492532927e-07,
"loss": 0.8698,
"step": 887
},
{
"epoch": 4.845839017735334,
"grad_norm": 0.12477104191258748,
"learning_rate": 1.06131212870062e-07,
"loss": 0.8972,
"step": 888
},
{
"epoch": 4.851296043656207,
"grad_norm": 0.11569142614675672,
"learning_rate": 9.842156117798817e-08,
"loss": 0.8808,
"step": 889
},
{
"epoch": 4.85675306957708,
"grad_norm": 0.15639110653049954,
"learning_rate": 9.10019021890718e-08,
"loss": 0.8757,
"step": 890
},
{
"epoch": 4.862210095497954,
"grad_norm": 0.1414742721261049,
"learning_rate": 8.387234401770361e-08,
"loss": 0.884,
"step": 891
},
{
"epoch": 4.867667121418827,
"grad_norm": 0.11635396600358816,
"learning_rate": 7.703299055111357e-08,
"loss": 0.9047,
"step": 892
},
{
"epoch": 4.8731241473397,
"grad_norm": 0.1168433074137953,
"learning_rate": 7.048394144785863e-08,
"loss": 0.8669,
"step": 893
},
{
"epoch": 4.878581173260573,
"grad_norm": 0.11125431182410457,
"learning_rate": 6.422529213637063e-08,
"loss": 0.8713,
"step": 894
},
{
"epoch": 4.884038199181446,
"grad_norm": 0.11391519788296704,
"learning_rate": 5.8257133813570675e-08,
"loss": 0.8851,
"step": 895
},
{
"epoch": 4.889495225102319,
"grad_norm": 0.1050397723513658,
"learning_rate": 5.257955344353471e-08,
"loss": 0.8742,
"step": 896
},
{
"epoch": 4.894952251023192,
"grad_norm": 0.110886347004846,
"learning_rate": 4.71926337562234e-08,
"loss": 0.8835,
"step": 897
},
{
"epoch": 4.900409276944066,
"grad_norm": 0.13022649928545438,
"learning_rate": 4.2096453246287526e-08,
"loss": 0.8798,
"step": 898
},
{
"epoch": 4.905866302864939,
"grad_norm": 0.11447299895739564,
"learning_rate": 3.729108617191557e-08,
"loss": 0.8915,
"step": 899
},
{
"epoch": 4.911323328785811,
"grad_norm": 0.1226591978951474,
"learning_rate": 3.277660255375237e-08,
"loss": 0.9051,
"step": 900
},
{
"epoch": 4.916780354706685,
"grad_norm": 0.11938229502321866,
"learning_rate": 2.855306817388659e-08,
"loss": 0.8961,
"step": 901
},
{
"epoch": 4.922237380627558,
"grad_norm": 0.13559091937945114,
"learning_rate": 2.462054457487595e-08,
"loss": 0.8778,
"step": 902
},
{
"epoch": 4.927694406548431,
"grad_norm": 0.12486180567731954,
"learning_rate": 2.097908905887014e-08,
"loss": 0.8877,
"step": 903
},
{
"epoch": 4.933151432469304,
"grad_norm": 0.12698382896355306,
"learning_rate": 1.7628754686760397e-08,
"loss": 0.8837,
"step": 904
},
{
"epoch": 4.938608458390178,
"grad_norm": 0.10840982827247776,
"learning_rate": 1.4569590277413447e-08,
"loss": 0.8738,
"step": 905
},
{
"epoch": 4.94406548431105,
"grad_norm": 0.11341918799763352,
"learning_rate": 1.1801640406963188e-08,
"loss": 0.8731,
"step": 906
},
{
"epoch": 4.949522510231923,
"grad_norm": 0.11594299163597076,
"learning_rate": 9.32494540815121e-09,
"loss": 0.8704,
"step": 907
},
{
"epoch": 4.954979536152797,
"grad_norm": 0.10946767908645595,
"learning_rate": 7.13954136974504e-09,
"loss": 0.8916,
"step": 908
},
{
"epoch": 4.96043656207367,
"grad_norm": 0.1131154133991995,
"learning_rate": 5.245460136018565e-09,
"loss": 0.8931,
"step": 909
},
{
"epoch": 4.965893587994543,
"grad_norm": 0.11676272543288499,
"learning_rate": 3.6427293062724077e-09,
"loss": 0.8906,
"step": 910
},
{
"epoch": 4.971350613915416,
"grad_norm": 0.11538578325055797,
"learning_rate": 2.3313722344497914e-09,
"loss": 0.8779,
"step": 911
},
{
"epoch": 4.97680763983629,
"grad_norm": 0.1264648783936699,
"learning_rate": 1.3114080287790488e-09,
"loss": 0.8652,
"step": 912
},
{
"epoch": 4.982264665757162,
"grad_norm": 0.11491863538487673,
"learning_rate": 5.828515515116096e-10,
"loss": 0.8722,
"step": 913
},
{
"epoch": 4.987721691678035,
"grad_norm": 0.1086827877742504,
"learning_rate": 1.457134186866327e-10,
"loss": 0.9013,
"step": 914
},
{
"epoch": 4.993178717598909,
"grad_norm": 0.11831005135790107,
"learning_rate": 0.0,
"loss": 0.8894,
"step": 915
},
{
"epoch": 4.993178717598909,
"step": 915,
"total_flos": 1.883960626772548e+19,
"train_loss": 0.9399711781512192,
"train_runtime": 49360.7108,
"train_samples_per_second": 9.497,
"train_steps_per_second": 0.019
}
],
"logging_steps": 1.0,
"max_steps": 915,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.883960626772548e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}