reasoning_hp_ablations_bsz256 / trainer_state.json
sedrickkeh's picture
End of training
3dc16e9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9983155530600785,
"eval_steps": 500,
"global_step": 1335,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022459292532285235,
"grad_norm": 5.7204437255859375,
"learning_rate": 7.462686567164179e-08,
"loss": 0.8234,
"step": 1
},
{
"epoch": 0.004491858506457047,
"grad_norm": 5.842348575592041,
"learning_rate": 1.4925373134328358e-07,
"loss": 0.8425,
"step": 2
},
{
"epoch": 0.00673778775968557,
"grad_norm": 6.015719413757324,
"learning_rate": 2.2388059701492537e-07,
"loss": 0.886,
"step": 3
},
{
"epoch": 0.008983717012914094,
"grad_norm": 5.857845783233643,
"learning_rate": 2.9850746268656716e-07,
"loss": 0.8574,
"step": 4
},
{
"epoch": 0.011229646266142616,
"grad_norm": 5.838263988494873,
"learning_rate": 3.7313432835820895e-07,
"loss": 0.8673,
"step": 5
},
{
"epoch": 0.01347557551937114,
"grad_norm": 5.752118110656738,
"learning_rate": 4.4776119402985074e-07,
"loss": 0.8437,
"step": 6
},
{
"epoch": 0.015721504772599662,
"grad_norm": 5.738947868347168,
"learning_rate": 5.223880597014925e-07,
"loss": 0.8768,
"step": 7
},
{
"epoch": 0.017967434025828188,
"grad_norm": 5.7029314041137695,
"learning_rate": 5.970149253731343e-07,
"loss": 0.8455,
"step": 8
},
{
"epoch": 0.02021336327905671,
"grad_norm": 5.459065914154053,
"learning_rate": 6.716417910447762e-07,
"loss": 0.8427,
"step": 9
},
{
"epoch": 0.022459292532285232,
"grad_norm": 5.500589370727539,
"learning_rate": 7.462686567164179e-07,
"loss": 0.8739,
"step": 10
},
{
"epoch": 0.024705221785513758,
"grad_norm": 5.371927738189697,
"learning_rate": 8.208955223880598e-07,
"loss": 0.8339,
"step": 11
},
{
"epoch": 0.02695115103874228,
"grad_norm": 4.383749008178711,
"learning_rate": 8.955223880597015e-07,
"loss": 0.8146,
"step": 12
},
{
"epoch": 0.029197080291970802,
"grad_norm": 4.2294511795043945,
"learning_rate": 9.701492537313434e-07,
"loss": 0.8006,
"step": 13
},
{
"epoch": 0.031443009545199324,
"grad_norm": 3.9959418773651123,
"learning_rate": 1.044776119402985e-06,
"loss": 0.7986,
"step": 14
},
{
"epoch": 0.033688938798427846,
"grad_norm": 4.030126094818115,
"learning_rate": 1.119402985074627e-06,
"loss": 0.808,
"step": 15
},
{
"epoch": 0.035934868051656375,
"grad_norm": 2.3749942779541016,
"learning_rate": 1.1940298507462686e-06,
"loss": 0.7639,
"step": 16
},
{
"epoch": 0.0381807973048849,
"grad_norm": 2.266770362854004,
"learning_rate": 1.2686567164179105e-06,
"loss": 0.7588,
"step": 17
},
{
"epoch": 0.04042672655811342,
"grad_norm": 2.169877767562866,
"learning_rate": 1.3432835820895524e-06,
"loss": 0.7664,
"step": 18
},
{
"epoch": 0.04267265581134194,
"grad_norm": 2.0016181468963623,
"learning_rate": 1.417910447761194e-06,
"loss": 0.7452,
"step": 19
},
{
"epoch": 0.044918585064570464,
"grad_norm": 1.9403204917907715,
"learning_rate": 1.4925373134328358e-06,
"loss": 0.7691,
"step": 20
},
{
"epoch": 0.047164514317798986,
"grad_norm": 1.8641579151153564,
"learning_rate": 1.5671641791044779e-06,
"loss": 0.7745,
"step": 21
},
{
"epoch": 0.049410443571027515,
"grad_norm": 1.87736177444458,
"learning_rate": 1.6417910447761196e-06,
"loss": 0.72,
"step": 22
},
{
"epoch": 0.05165637282425604,
"grad_norm": 2.7966866493225098,
"learning_rate": 1.7164179104477613e-06,
"loss": 0.7302,
"step": 23
},
{
"epoch": 0.05390230207748456,
"grad_norm": 2.9194653034210205,
"learning_rate": 1.791044776119403e-06,
"loss": 0.7144,
"step": 24
},
{
"epoch": 0.05614823133071308,
"grad_norm": 2.9114489555358887,
"learning_rate": 1.865671641791045e-06,
"loss": 0.7111,
"step": 25
},
{
"epoch": 0.058394160583941604,
"grad_norm": 2.698354482650757,
"learning_rate": 1.9402985074626867e-06,
"loss": 0.7052,
"step": 26
},
{
"epoch": 0.060640089837170126,
"grad_norm": 2.5505008697509766,
"learning_rate": 2.0149253731343284e-06,
"loss": 0.7071,
"step": 27
},
{
"epoch": 0.06288601909039865,
"grad_norm": 2.1805033683776855,
"learning_rate": 2.08955223880597e-06,
"loss": 0.7041,
"step": 28
},
{
"epoch": 0.06513194834362718,
"grad_norm": 1.668395757675171,
"learning_rate": 2.1641791044776118e-06,
"loss": 0.6815,
"step": 29
},
{
"epoch": 0.06737787759685569,
"grad_norm": 1.138392448425293,
"learning_rate": 2.238805970149254e-06,
"loss": 0.6862,
"step": 30
},
{
"epoch": 0.06962380685008422,
"grad_norm": 1.057366132736206,
"learning_rate": 2.3134328358208956e-06,
"loss": 0.6672,
"step": 31
},
{
"epoch": 0.07186973610331275,
"grad_norm": 1.0795561075210571,
"learning_rate": 2.3880597014925373e-06,
"loss": 0.6714,
"step": 32
},
{
"epoch": 0.07411566535654127,
"grad_norm": 1.074954628944397,
"learning_rate": 2.4626865671641794e-06,
"loss": 0.6706,
"step": 33
},
{
"epoch": 0.0763615946097698,
"grad_norm": 1.0592774152755737,
"learning_rate": 2.537313432835821e-06,
"loss": 0.6721,
"step": 34
},
{
"epoch": 0.07860752386299831,
"grad_norm": 0.8358403444290161,
"learning_rate": 2.6119402985074627e-06,
"loss": 0.6425,
"step": 35
},
{
"epoch": 0.08085345311622684,
"grad_norm": 0.8717141151428223,
"learning_rate": 2.686567164179105e-06,
"loss": 0.6505,
"step": 36
},
{
"epoch": 0.08309938236945537,
"grad_norm": 0.7979157567024231,
"learning_rate": 2.7611940298507465e-06,
"loss": 0.6386,
"step": 37
},
{
"epoch": 0.08534531162268388,
"grad_norm": 0.6813825368881226,
"learning_rate": 2.835820895522388e-06,
"loss": 0.6352,
"step": 38
},
{
"epoch": 0.08759124087591241,
"grad_norm": 0.6683703064918518,
"learning_rate": 2.9104477611940303e-06,
"loss": 0.644,
"step": 39
},
{
"epoch": 0.08983717012914093,
"grad_norm": 0.6825925707817078,
"learning_rate": 2.9850746268656716e-06,
"loss": 0.6352,
"step": 40
},
{
"epoch": 0.09208309938236946,
"grad_norm": 0.7220752239227295,
"learning_rate": 3.0597014925373137e-06,
"loss": 0.6207,
"step": 41
},
{
"epoch": 0.09432902863559797,
"grad_norm": 0.7097088694572449,
"learning_rate": 3.1343283582089558e-06,
"loss": 0.6177,
"step": 42
},
{
"epoch": 0.0965749578888265,
"grad_norm": 0.6021708250045776,
"learning_rate": 3.208955223880597e-06,
"loss": 0.6188,
"step": 43
},
{
"epoch": 0.09882088714205503,
"grad_norm": 0.5546464920043945,
"learning_rate": 3.283582089552239e-06,
"loss": 0.6101,
"step": 44
},
{
"epoch": 0.10106681639528355,
"grad_norm": 0.5791826248168945,
"learning_rate": 3.3582089552238813e-06,
"loss": 0.6079,
"step": 45
},
{
"epoch": 0.10331274564851207,
"grad_norm": 0.6221138834953308,
"learning_rate": 3.4328358208955225e-06,
"loss": 0.6047,
"step": 46
},
{
"epoch": 0.10555867490174059,
"grad_norm": 0.5765758752822876,
"learning_rate": 3.5074626865671646e-06,
"loss": 0.5965,
"step": 47
},
{
"epoch": 0.10780460415496912,
"grad_norm": 0.47714346647262573,
"learning_rate": 3.582089552238806e-06,
"loss": 0.5977,
"step": 48
},
{
"epoch": 0.11005053340819765,
"grad_norm": 0.5033997893333435,
"learning_rate": 3.656716417910448e-06,
"loss": 0.6066,
"step": 49
},
{
"epoch": 0.11229646266142616,
"grad_norm": 0.4991725981235504,
"learning_rate": 3.73134328358209e-06,
"loss": 0.5852,
"step": 50
},
{
"epoch": 0.11454239191465469,
"grad_norm": 0.4966943562030792,
"learning_rate": 3.8059701492537314e-06,
"loss": 0.5846,
"step": 51
},
{
"epoch": 0.11678832116788321,
"grad_norm": 0.4513320326805115,
"learning_rate": 3.8805970149253735e-06,
"loss": 0.5637,
"step": 52
},
{
"epoch": 0.11903425042111174,
"grad_norm": 0.47153928875923157,
"learning_rate": 3.955223880597015e-06,
"loss": 0.5814,
"step": 53
},
{
"epoch": 0.12128017967434025,
"grad_norm": 0.5067244172096252,
"learning_rate": 4.029850746268657e-06,
"loss": 0.588,
"step": 54
},
{
"epoch": 0.12352610892756878,
"grad_norm": 0.4318973124027252,
"learning_rate": 4.104477611940299e-06,
"loss": 0.5852,
"step": 55
},
{
"epoch": 0.1257720381807973,
"grad_norm": 0.41859719157218933,
"learning_rate": 4.17910447761194e-06,
"loss": 0.5788,
"step": 56
},
{
"epoch": 0.12801796743402583,
"grad_norm": 0.4497435986995697,
"learning_rate": 4.253731343283583e-06,
"loss": 0.5746,
"step": 57
},
{
"epoch": 0.13026389668725435,
"grad_norm": 0.407840371131897,
"learning_rate": 4.3283582089552236e-06,
"loss": 0.5848,
"step": 58
},
{
"epoch": 0.13250982594048288,
"grad_norm": 0.3589821457862854,
"learning_rate": 4.402985074626866e-06,
"loss": 0.5799,
"step": 59
},
{
"epoch": 0.13475575519371139,
"grad_norm": 0.4474234879016876,
"learning_rate": 4.477611940298508e-06,
"loss": 0.5768,
"step": 60
},
{
"epoch": 0.13700168444693991,
"grad_norm": 0.38281363248825073,
"learning_rate": 4.5522388059701495e-06,
"loss": 0.5684,
"step": 61
},
{
"epoch": 0.13924761370016844,
"grad_norm": 0.34512782096862793,
"learning_rate": 4.626865671641791e-06,
"loss": 0.5835,
"step": 62
},
{
"epoch": 0.14149354295339697,
"grad_norm": 0.32510608434677124,
"learning_rate": 4.701492537313434e-06,
"loss": 0.5811,
"step": 63
},
{
"epoch": 0.1437394722066255,
"grad_norm": 0.40574586391448975,
"learning_rate": 4.7761194029850745e-06,
"loss": 0.5594,
"step": 64
},
{
"epoch": 0.145985401459854,
"grad_norm": 0.3952745497226715,
"learning_rate": 4.850746268656717e-06,
"loss": 0.5674,
"step": 65
},
{
"epoch": 0.14823133071308253,
"grad_norm": 0.3393004834651947,
"learning_rate": 4.925373134328359e-06,
"loss": 0.5471,
"step": 66
},
{
"epoch": 0.15047725996631106,
"grad_norm": 0.3402893543243408,
"learning_rate": 5e-06,
"loss": 0.5689,
"step": 67
},
{
"epoch": 0.1527231892195396,
"grad_norm": 0.31731945276260376,
"learning_rate": 5.074626865671642e-06,
"loss": 0.5588,
"step": 68
},
{
"epoch": 0.15496911847276812,
"grad_norm": 0.2877805829048157,
"learning_rate": 5.149253731343285e-06,
"loss": 0.5567,
"step": 69
},
{
"epoch": 0.15721504772599662,
"grad_norm": 0.3303472399711609,
"learning_rate": 5.2238805970149255e-06,
"loss": 0.5624,
"step": 70
},
{
"epoch": 0.15946097697922515,
"grad_norm": 0.3219895660877228,
"learning_rate": 5.298507462686567e-06,
"loss": 0.5522,
"step": 71
},
{
"epoch": 0.16170690623245368,
"grad_norm": 0.29180029034614563,
"learning_rate": 5.37313432835821e-06,
"loss": 0.544,
"step": 72
},
{
"epoch": 0.1639528354856822,
"grad_norm": 0.30961552262306213,
"learning_rate": 5.447761194029851e-06,
"loss": 0.5462,
"step": 73
},
{
"epoch": 0.16619876473891074,
"grad_norm": 0.3001321852207184,
"learning_rate": 5.522388059701493e-06,
"loss": 0.5479,
"step": 74
},
{
"epoch": 0.16844469399213924,
"grad_norm": 0.29555678367614746,
"learning_rate": 5.597014925373134e-06,
"loss": 0.5646,
"step": 75
},
{
"epoch": 0.17069062324536777,
"grad_norm": 0.344656765460968,
"learning_rate": 5.671641791044776e-06,
"loss": 0.5475,
"step": 76
},
{
"epoch": 0.1729365524985963,
"grad_norm": 0.3049803078174591,
"learning_rate": 5.746268656716418e-06,
"loss": 0.5457,
"step": 77
},
{
"epoch": 0.17518248175182483,
"grad_norm": 0.2782682180404663,
"learning_rate": 5.820895522388061e-06,
"loss": 0.5558,
"step": 78
},
{
"epoch": 0.17742841100505333,
"grad_norm": 0.33001065254211426,
"learning_rate": 5.895522388059702e-06,
"loss": 0.5692,
"step": 79
},
{
"epoch": 0.17967434025828186,
"grad_norm": 0.26358768343925476,
"learning_rate": 5.970149253731343e-06,
"loss": 0.5419,
"step": 80
},
{
"epoch": 0.18192026951151039,
"grad_norm": 0.2817039489746094,
"learning_rate": 6.044776119402986e-06,
"loss": 0.5661,
"step": 81
},
{
"epoch": 0.18416619876473891,
"grad_norm": 0.2643490135669708,
"learning_rate": 6.119402985074627e-06,
"loss": 0.5362,
"step": 82
},
{
"epoch": 0.18641212801796744,
"grad_norm": 0.2636040151119232,
"learning_rate": 6.194029850746269e-06,
"loss": 0.5394,
"step": 83
},
{
"epoch": 0.18865805727119594,
"grad_norm": 0.251675546169281,
"learning_rate": 6.2686567164179116e-06,
"loss": 0.5379,
"step": 84
},
{
"epoch": 0.19090398652442447,
"grad_norm": 0.26983481645584106,
"learning_rate": 6.343283582089553e-06,
"loss": 0.5389,
"step": 85
},
{
"epoch": 0.193149915777653,
"grad_norm": 0.2974947690963745,
"learning_rate": 6.417910447761194e-06,
"loss": 0.5342,
"step": 86
},
{
"epoch": 0.19539584503088153,
"grad_norm": 0.3126147389411926,
"learning_rate": 6.492537313432837e-06,
"loss": 0.537,
"step": 87
},
{
"epoch": 0.19764177428411006,
"grad_norm": 0.27590620517730713,
"learning_rate": 6.567164179104478e-06,
"loss": 0.5507,
"step": 88
},
{
"epoch": 0.19988770353733856,
"grad_norm": 0.32750827074050903,
"learning_rate": 6.64179104477612e-06,
"loss": 0.5361,
"step": 89
},
{
"epoch": 0.2021336327905671,
"grad_norm": 0.2821713984012604,
"learning_rate": 6.7164179104477625e-06,
"loss": 0.5273,
"step": 90
},
{
"epoch": 0.20437956204379562,
"grad_norm": 0.3005189597606659,
"learning_rate": 6.791044776119403e-06,
"loss": 0.5436,
"step": 91
},
{
"epoch": 0.20662549129702415,
"grad_norm": 0.28068017959594727,
"learning_rate": 6.865671641791045e-06,
"loss": 0.5305,
"step": 92
},
{
"epoch": 0.20887142055025268,
"grad_norm": 0.28698408603668213,
"learning_rate": 6.9402985074626876e-06,
"loss": 0.5388,
"step": 93
},
{
"epoch": 0.21111734980348118,
"grad_norm": 0.3307916820049286,
"learning_rate": 7.014925373134329e-06,
"loss": 0.5191,
"step": 94
},
{
"epoch": 0.2133632790567097,
"grad_norm": 0.2854793667793274,
"learning_rate": 7.089552238805971e-06,
"loss": 0.5222,
"step": 95
},
{
"epoch": 0.21560920830993824,
"grad_norm": 0.3629694879055023,
"learning_rate": 7.164179104477612e-06,
"loss": 0.5451,
"step": 96
},
{
"epoch": 0.21785513756316677,
"grad_norm": 0.313763827085495,
"learning_rate": 7.238805970149254e-06,
"loss": 0.5322,
"step": 97
},
{
"epoch": 0.2201010668163953,
"grad_norm": 0.30298125743865967,
"learning_rate": 7.313432835820896e-06,
"loss": 0.5089,
"step": 98
},
{
"epoch": 0.2223469960696238,
"grad_norm": 0.34473463892936707,
"learning_rate": 7.3880597014925385e-06,
"loss": 0.5444,
"step": 99
},
{
"epoch": 0.22459292532285233,
"grad_norm": 0.2840663194656372,
"learning_rate": 7.46268656716418e-06,
"loss": 0.5433,
"step": 100
},
{
"epoch": 0.22683885457608086,
"grad_norm": 0.32824480533599854,
"learning_rate": 7.537313432835821e-06,
"loss": 0.5149,
"step": 101
},
{
"epoch": 0.22908478382930939,
"grad_norm": 0.31232303380966187,
"learning_rate": 7.611940298507463e-06,
"loss": 0.5415,
"step": 102
},
{
"epoch": 0.2313307130825379,
"grad_norm": 0.2765471935272217,
"learning_rate": 7.686567164179105e-06,
"loss": 0.5208,
"step": 103
},
{
"epoch": 0.23357664233576642,
"grad_norm": 0.31149113178253174,
"learning_rate": 7.761194029850747e-06,
"loss": 0.5417,
"step": 104
},
{
"epoch": 0.23582257158899494,
"grad_norm": 0.3036503195762634,
"learning_rate": 7.835820895522389e-06,
"loss": 0.5259,
"step": 105
},
{
"epoch": 0.23806850084222347,
"grad_norm": 0.2747598886489868,
"learning_rate": 7.91044776119403e-06,
"loss": 0.5257,
"step": 106
},
{
"epoch": 0.240314430095452,
"grad_norm": 0.27585095167160034,
"learning_rate": 7.985074626865672e-06,
"loss": 0.5304,
"step": 107
},
{
"epoch": 0.2425603593486805,
"grad_norm": 0.3225706219673157,
"learning_rate": 8.059701492537314e-06,
"loss": 0.533,
"step": 108
},
{
"epoch": 0.24480628860190903,
"grad_norm": 0.30163803696632385,
"learning_rate": 8.134328358208955e-06,
"loss": 0.5128,
"step": 109
},
{
"epoch": 0.24705221785513756,
"grad_norm": 0.30006369948387146,
"learning_rate": 8.208955223880599e-06,
"loss": 0.5087,
"step": 110
},
{
"epoch": 0.2492981471083661,
"grad_norm": 0.36344826221466064,
"learning_rate": 8.283582089552239e-06,
"loss": 0.5229,
"step": 111
},
{
"epoch": 0.2515440763615946,
"grad_norm": 0.3036467730998993,
"learning_rate": 8.35820895522388e-06,
"loss": 0.5232,
"step": 112
},
{
"epoch": 0.2537900056148231,
"grad_norm": 0.3324042856693268,
"learning_rate": 8.432835820895524e-06,
"loss": 0.5257,
"step": 113
},
{
"epoch": 0.25603593486805165,
"grad_norm": 0.3443598449230194,
"learning_rate": 8.507462686567165e-06,
"loss": 0.5173,
"step": 114
},
{
"epoch": 0.2582818641212802,
"grad_norm": 0.3419680595397949,
"learning_rate": 8.582089552238807e-06,
"loss": 0.514,
"step": 115
},
{
"epoch": 0.2605277933745087,
"grad_norm": 0.3660188615322113,
"learning_rate": 8.656716417910447e-06,
"loss": 0.5137,
"step": 116
},
{
"epoch": 0.26277372262773724,
"grad_norm": 0.322307825088501,
"learning_rate": 8.73134328358209e-06,
"loss": 0.5221,
"step": 117
},
{
"epoch": 0.26501965188096577,
"grad_norm": 0.3525477945804596,
"learning_rate": 8.805970149253732e-06,
"loss": 0.5302,
"step": 118
},
{
"epoch": 0.2672655811341943,
"grad_norm": 0.39976975321769714,
"learning_rate": 8.880597014925374e-06,
"loss": 0.5115,
"step": 119
},
{
"epoch": 0.26951151038742277,
"grad_norm": 0.30590498447418213,
"learning_rate": 8.955223880597016e-06,
"loss": 0.5251,
"step": 120
},
{
"epoch": 0.2717574396406513,
"grad_norm": 0.3515385389328003,
"learning_rate": 9.029850746268657e-06,
"loss": 0.5154,
"step": 121
},
{
"epoch": 0.27400336889387983,
"grad_norm": 0.37321946024894714,
"learning_rate": 9.104477611940299e-06,
"loss": 0.5075,
"step": 122
},
{
"epoch": 0.27624929814710836,
"grad_norm": 0.3113161623477936,
"learning_rate": 9.17910447761194e-06,
"loss": 0.5172,
"step": 123
},
{
"epoch": 0.2784952274003369,
"grad_norm": 0.35777148604393005,
"learning_rate": 9.253731343283582e-06,
"loss": 0.5187,
"step": 124
},
{
"epoch": 0.2807411566535654,
"grad_norm": 0.2908802926540375,
"learning_rate": 9.328358208955226e-06,
"loss": 0.5181,
"step": 125
},
{
"epoch": 0.28298708590679394,
"grad_norm": 0.3901764452457428,
"learning_rate": 9.402985074626867e-06,
"loss": 0.5323,
"step": 126
},
{
"epoch": 0.2852330151600225,
"grad_norm": 0.3103543519973755,
"learning_rate": 9.477611940298507e-06,
"loss": 0.5035,
"step": 127
},
{
"epoch": 0.287478944413251,
"grad_norm": 0.32105693221092224,
"learning_rate": 9.552238805970149e-06,
"loss": 0.5166,
"step": 128
},
{
"epoch": 0.28972487366647953,
"grad_norm": 0.3075639605522156,
"learning_rate": 9.626865671641792e-06,
"loss": 0.5238,
"step": 129
},
{
"epoch": 0.291970802919708,
"grad_norm": 0.31366583704948425,
"learning_rate": 9.701492537313434e-06,
"loss": 0.5054,
"step": 130
},
{
"epoch": 0.29421673217293653,
"grad_norm": 0.31075215339660645,
"learning_rate": 9.776119402985076e-06,
"loss": 0.5093,
"step": 131
},
{
"epoch": 0.29646266142616506,
"grad_norm": 0.3048778474330902,
"learning_rate": 9.850746268656717e-06,
"loss": 0.4938,
"step": 132
},
{
"epoch": 0.2987085906793936,
"grad_norm": 0.3239855468273163,
"learning_rate": 9.925373134328359e-06,
"loss": 0.5204,
"step": 133
},
{
"epoch": 0.3009545199326221,
"grad_norm": 0.30303385853767395,
"learning_rate": 1e-05,
"loss": 0.5097,
"step": 134
},
{
"epoch": 0.30320044918585065,
"grad_norm": 0.3344568908214569,
"learning_rate": 9.999982893802117e-06,
"loss": 0.5095,
"step": 135
},
{
"epoch": 0.3054463784390792,
"grad_norm": 0.3649601340293884,
"learning_rate": 9.999931575325515e-06,
"loss": 0.502,
"step": 136
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.3643246591091156,
"learning_rate": 9.999846044921342e-06,
"loss": 0.5145,
"step": 137
},
{
"epoch": 0.30993823694553624,
"grad_norm": 0.31586742401123047,
"learning_rate": 9.999726303174833e-06,
"loss": 0.5064,
"step": 138
},
{
"epoch": 0.3121841661987647,
"grad_norm": 0.39719676971435547,
"learning_rate": 9.999572350905325e-06,
"loss": 0.5018,
"step": 139
},
{
"epoch": 0.31443009545199324,
"grad_norm": 0.3038526177406311,
"learning_rate": 9.999384189166227e-06,
"loss": 0.5147,
"step": 140
},
{
"epoch": 0.31667602470522177,
"grad_norm": 0.38959312438964844,
"learning_rate": 9.999161819245036e-06,
"loss": 0.499,
"step": 141
},
{
"epoch": 0.3189219539584503,
"grad_norm": 0.29253315925598145,
"learning_rate": 9.998905242663313e-06,
"loss": 0.5097,
"step": 142
},
{
"epoch": 0.32116788321167883,
"grad_norm": 0.2925349175930023,
"learning_rate": 9.998614461176676e-06,
"loss": 0.5084,
"step": 143
},
{
"epoch": 0.32341381246490736,
"grad_norm": 0.317200630903244,
"learning_rate": 9.998289476774792e-06,
"loss": 0.5341,
"step": 144
},
{
"epoch": 0.3256597417181359,
"grad_norm": 0.3577384352684021,
"learning_rate": 9.997930291681362e-06,
"loss": 0.4969,
"step": 145
},
{
"epoch": 0.3279056709713644,
"grad_norm": 0.31183212995529175,
"learning_rate": 9.997536908354101e-06,
"loss": 0.514,
"step": 146
},
{
"epoch": 0.33015160022459294,
"grad_norm": 0.35079729557037354,
"learning_rate": 9.997109329484725e-06,
"loss": 0.5114,
"step": 147
},
{
"epoch": 0.3323975294778215,
"grad_norm": 0.3374541401863098,
"learning_rate": 9.996647557998935e-06,
"loss": 0.5103,
"step": 148
},
{
"epoch": 0.33464345873104995,
"grad_norm": 0.3685130774974823,
"learning_rate": 9.996151597056391e-06,
"loss": 0.507,
"step": 149
},
{
"epoch": 0.3368893879842785,
"grad_norm": 0.3940074145793915,
"learning_rate": 9.9956214500507e-06,
"loss": 0.5236,
"step": 150
},
{
"epoch": 0.339135317237507,
"grad_norm": 0.36086133122444153,
"learning_rate": 9.995057120609376e-06,
"loss": 0.4958,
"step": 151
},
{
"epoch": 0.34138124649073553,
"grad_norm": 0.3247486650943756,
"learning_rate": 9.994458612593835e-06,
"loss": 0.5065,
"step": 152
},
{
"epoch": 0.34362717574396406,
"grad_norm": 0.3979952335357666,
"learning_rate": 9.993825930099355e-06,
"loss": 0.5075,
"step": 153
},
{
"epoch": 0.3458731049971926,
"grad_norm": 0.3016234040260315,
"learning_rate": 9.993159077455053e-06,
"loss": 0.5206,
"step": 154
},
{
"epoch": 0.3481190342504211,
"grad_norm": 0.3936152458190918,
"learning_rate": 9.992458059223852e-06,
"loss": 0.4939,
"step": 155
},
{
"epoch": 0.35036496350364965,
"grad_norm": 0.34936875104904175,
"learning_rate": 9.991722880202457e-06,
"loss": 0.4979,
"step": 156
},
{
"epoch": 0.3526108927568782,
"grad_norm": 0.3858884572982788,
"learning_rate": 9.990953545421314e-06,
"loss": 0.5087,
"step": 157
},
{
"epoch": 0.35485682201010665,
"grad_norm": 0.3864620327949524,
"learning_rate": 9.990150060144582e-06,
"loss": 0.5127,
"step": 158
},
{
"epoch": 0.3571027512633352,
"grad_norm": 0.37431252002716064,
"learning_rate": 9.98931242987009e-06,
"loss": 0.5209,
"step": 159
},
{
"epoch": 0.3593486805165637,
"grad_norm": 0.3798641562461853,
"learning_rate": 9.988440660329308e-06,
"loss": 0.4979,
"step": 160
},
{
"epoch": 0.36159460976979224,
"grad_norm": 0.3777308464050293,
"learning_rate": 9.9875347574873e-06,
"loss": 0.5266,
"step": 161
},
{
"epoch": 0.36384053902302077,
"grad_norm": 0.4040112793445587,
"learning_rate": 9.986594727542684e-06,
"loss": 0.4973,
"step": 162
},
{
"epoch": 0.3660864682762493,
"grad_norm": 0.3550376892089844,
"learning_rate": 9.985620576927601e-06,
"loss": 0.5111,
"step": 163
},
{
"epoch": 0.36833239752947783,
"grad_norm": 0.3775559663772583,
"learning_rate": 9.984612312307653e-06,
"loss": 0.5175,
"step": 164
},
{
"epoch": 0.37057832678270636,
"grad_norm": 0.42534682154655457,
"learning_rate": 9.98356994058187e-06,
"loss": 0.4971,
"step": 165
},
{
"epoch": 0.3728242560359349,
"grad_norm": 0.34163060784339905,
"learning_rate": 9.98249346888266e-06,
"loss": 0.5151,
"step": 166
},
{
"epoch": 0.3750701852891634,
"grad_norm": 0.3757290244102478,
"learning_rate": 9.981382904575754e-06,
"loss": 0.5018,
"step": 167
},
{
"epoch": 0.3773161145423919,
"grad_norm": 0.39487966895103455,
"learning_rate": 9.98023825526017e-06,
"loss": 0.5074,
"step": 168
},
{
"epoch": 0.3795620437956204,
"grad_norm": 0.4693453013896942,
"learning_rate": 9.979059528768146e-06,
"loss": 0.5118,
"step": 169
},
{
"epoch": 0.38180797304884895,
"grad_norm": 0.3660091161727905,
"learning_rate": 9.977846733165092e-06,
"loss": 0.5019,
"step": 170
},
{
"epoch": 0.3840539023020775,
"grad_norm": 0.41689541935920715,
"learning_rate": 9.976599876749537e-06,
"loss": 0.4806,
"step": 171
},
{
"epoch": 0.386299831555306,
"grad_norm": 0.3330056071281433,
"learning_rate": 9.975318968053071e-06,
"loss": 0.5003,
"step": 172
},
{
"epoch": 0.38854576080853453,
"grad_norm": 0.3664453625679016,
"learning_rate": 9.974004015840284e-06,
"loss": 0.4913,
"step": 173
},
{
"epoch": 0.39079169006176306,
"grad_norm": 0.35467249155044556,
"learning_rate": 9.972655029108711e-06,
"loss": 0.491,
"step": 174
},
{
"epoch": 0.3930376193149916,
"grad_norm": 0.3422984480857849,
"learning_rate": 9.971272017088762e-06,
"loss": 0.4964,
"step": 175
},
{
"epoch": 0.3952835485682201,
"grad_norm": 0.30467477440834045,
"learning_rate": 9.969854989243672e-06,
"loss": 0.4958,
"step": 176
},
{
"epoch": 0.39752947782144865,
"grad_norm": 0.3571237623691559,
"learning_rate": 9.968403955269422e-06,
"loss": 0.5043,
"step": 177
},
{
"epoch": 0.3997754070746771,
"grad_norm": 0.40359944105148315,
"learning_rate": 9.966918925094682e-06,
"loss": 0.502,
"step": 178
},
{
"epoch": 0.40202133632790565,
"grad_norm": 0.2942937910556793,
"learning_rate": 9.96539990888074e-06,
"loss": 0.5113,
"step": 179
},
{
"epoch": 0.4042672655811342,
"grad_norm": 0.3451298773288727,
"learning_rate": 9.963846917021433e-06,
"loss": 0.4895,
"step": 180
},
{
"epoch": 0.4065131948343627,
"grad_norm": 0.32071009278297424,
"learning_rate": 9.962259960143076e-06,
"loss": 0.4917,
"step": 181
},
{
"epoch": 0.40875912408759124,
"grad_norm": 0.29624050855636597,
"learning_rate": 9.96063904910439e-06,
"loss": 0.516,
"step": 182
},
{
"epoch": 0.41100505334081977,
"grad_norm": 0.3379235863685608,
"learning_rate": 9.958984194996419e-06,
"loss": 0.4936,
"step": 183
},
{
"epoch": 0.4132509825940483,
"grad_norm": 0.3338676989078522,
"learning_rate": 9.957295409142474e-06,
"loss": 0.494,
"step": 184
},
{
"epoch": 0.41549691184727683,
"grad_norm": 0.3495246469974518,
"learning_rate": 9.955572703098035e-06,
"loss": 0.4887,
"step": 185
},
{
"epoch": 0.41774284110050536,
"grad_norm": 0.33925801515579224,
"learning_rate": 9.95381608865068e-06,
"loss": 0.5041,
"step": 186
},
{
"epoch": 0.41998877035373383,
"grad_norm": 0.3868575692176819,
"learning_rate": 9.952025577820009e-06,
"loss": 0.4985,
"step": 187
},
{
"epoch": 0.42223469960696236,
"grad_norm": 0.34473907947540283,
"learning_rate": 9.950201182857555e-06,
"loss": 0.5065,
"step": 188
},
{
"epoch": 0.4244806288601909,
"grad_norm": 0.3982524573802948,
"learning_rate": 9.948342916246702e-06,
"loss": 0.5017,
"step": 189
},
{
"epoch": 0.4267265581134194,
"grad_norm": 0.40433281660079956,
"learning_rate": 9.9464507907026e-06,
"loss": 0.5036,
"step": 190
},
{
"epoch": 0.42897248736664795,
"grad_norm": 0.29866451025009155,
"learning_rate": 9.94452481917208e-06,
"loss": 0.4861,
"step": 191
},
{
"epoch": 0.4312184166198765,
"grad_norm": 0.37620702385902405,
"learning_rate": 9.94256501483356e-06,
"loss": 0.4792,
"step": 192
},
{
"epoch": 0.433464345873105,
"grad_norm": 0.30438610911369324,
"learning_rate": 9.940571391096962e-06,
"loss": 0.504,
"step": 193
},
{
"epoch": 0.43571027512633353,
"grad_norm": 0.32881197333335876,
"learning_rate": 9.938543961603616e-06,
"loss": 0.5008,
"step": 194
},
{
"epoch": 0.43795620437956206,
"grad_norm": 0.31999659538269043,
"learning_rate": 9.936482740226163e-06,
"loss": 0.4868,
"step": 195
},
{
"epoch": 0.4402021336327906,
"grad_norm": 0.3441828489303589,
"learning_rate": 9.93438774106847e-06,
"loss": 0.5055,
"step": 196
},
{
"epoch": 0.44244806288601907,
"grad_norm": 0.29661545157432556,
"learning_rate": 9.932258978465523e-06,
"loss": 0.4673,
"step": 197
},
{
"epoch": 0.4446939921392476,
"grad_norm": 0.38478636741638184,
"learning_rate": 9.930096466983337e-06,
"loss": 0.4869,
"step": 198
},
{
"epoch": 0.4469399213924761,
"grad_norm": 0.3225785493850708,
"learning_rate": 9.92790022141885e-06,
"loss": 0.4814,
"step": 199
},
{
"epoch": 0.44918585064570465,
"grad_norm": 0.3994785249233246,
"learning_rate": 9.925670256799829e-06,
"loss": 0.4929,
"step": 200
},
{
"epoch": 0.4514317798989332,
"grad_norm": 0.3152889311313629,
"learning_rate": 9.923406588384759e-06,
"loss": 0.4843,
"step": 201
},
{
"epoch": 0.4536777091521617,
"grad_norm": 0.38969138264656067,
"learning_rate": 9.921109231662744e-06,
"loss": 0.513,
"step": 202
},
{
"epoch": 0.45592363840539024,
"grad_norm": 0.38721248507499695,
"learning_rate": 9.9187782023534e-06,
"loss": 0.4894,
"step": 203
},
{
"epoch": 0.45816956765861877,
"grad_norm": 0.38004323840141296,
"learning_rate": 9.916413516406746e-06,
"loss": 0.4987,
"step": 204
},
{
"epoch": 0.4604154969118473,
"grad_norm": 0.40154218673706055,
"learning_rate": 9.914015190003096e-06,
"loss": 0.4848,
"step": 205
},
{
"epoch": 0.4626614261650758,
"grad_norm": 0.37615618109703064,
"learning_rate": 9.911583239552949e-06,
"loss": 0.5083,
"step": 206
},
{
"epoch": 0.4649073554183043,
"grad_norm": 0.4611421227455139,
"learning_rate": 9.909117681696874e-06,
"loss": 0.4799,
"step": 207
},
{
"epoch": 0.46715328467153283,
"grad_norm": 0.49794813990592957,
"learning_rate": 9.906618533305401e-06,
"loss": 0.4892,
"step": 208
},
{
"epoch": 0.46939921392476136,
"grad_norm": 0.40189069509506226,
"learning_rate": 9.904085811478901e-06,
"loss": 0.4797,
"step": 209
},
{
"epoch": 0.4716451431779899,
"grad_norm": 0.37438878417015076,
"learning_rate": 9.901519533547468e-06,
"loss": 0.4826,
"step": 210
},
{
"epoch": 0.4738910724312184,
"grad_norm": 0.3949896991252899,
"learning_rate": 9.898919717070808e-06,
"loss": 0.4995,
"step": 211
},
{
"epoch": 0.47613700168444695,
"grad_norm": 0.3877430856227875,
"learning_rate": 9.896286379838109e-06,
"loss": 0.4787,
"step": 212
},
{
"epoch": 0.4783829309376755,
"grad_norm": 0.3562919497489929,
"learning_rate": 9.893619539867926e-06,
"loss": 0.5,
"step": 213
},
{
"epoch": 0.480628860190904,
"grad_norm": 0.34773513674736023,
"learning_rate": 9.890919215408059e-06,
"loss": 0.4755,
"step": 214
},
{
"epoch": 0.48287478944413254,
"grad_norm": 0.42745330929756165,
"learning_rate": 9.888185424935418e-06,
"loss": 0.4921,
"step": 215
},
{
"epoch": 0.485120718697361,
"grad_norm": 0.34176507592201233,
"learning_rate": 9.885418187155909e-06,
"loss": 0.4995,
"step": 216
},
{
"epoch": 0.48736664795058954,
"grad_norm": 0.4287734031677246,
"learning_rate": 9.882617521004298e-06,
"loss": 0.4962,
"step": 217
},
{
"epoch": 0.48961257720381807,
"grad_norm": 0.4167402684688568,
"learning_rate": 9.879783445644086e-06,
"loss": 0.4956,
"step": 218
},
{
"epoch": 0.4918585064570466,
"grad_norm": 0.40856555104255676,
"learning_rate": 9.876915980467373e-06,
"loss": 0.491,
"step": 219
},
{
"epoch": 0.4941044357102751,
"grad_norm": 0.43443533778190613,
"learning_rate": 9.874015145094733e-06,
"loss": 0.4948,
"step": 220
},
{
"epoch": 0.49635036496350365,
"grad_norm": 0.4324890971183777,
"learning_rate": 9.871080959375067e-06,
"loss": 0.5015,
"step": 221
},
{
"epoch": 0.4985962942167322,
"grad_norm": 0.4211356043815613,
"learning_rate": 9.868113443385483e-06,
"loss": 0.491,
"step": 222
},
{
"epoch": 0.5008422234699607,
"grad_norm": 0.34874603152275085,
"learning_rate": 9.865112617431146e-06,
"loss": 0.4802,
"step": 223
},
{
"epoch": 0.5030881527231892,
"grad_norm": 0.41246911883354187,
"learning_rate": 9.862078502045145e-06,
"loss": 0.4851,
"step": 224
},
{
"epoch": 0.5053340819764177,
"grad_norm": 0.3335956931114197,
"learning_rate": 9.85901111798835e-06,
"loss": 0.495,
"step": 225
},
{
"epoch": 0.5075800112296462,
"grad_norm": 0.4276493191719055,
"learning_rate": 9.855910486249276e-06,
"loss": 0.5064,
"step": 226
},
{
"epoch": 0.5098259404828748,
"grad_norm": 0.3431427776813507,
"learning_rate": 9.852776628043928e-06,
"loss": 0.5033,
"step": 227
},
{
"epoch": 0.5120718697361033,
"grad_norm": 0.368875652551651,
"learning_rate": 9.849609564815668e-06,
"loss": 0.4892,
"step": 228
},
{
"epoch": 0.5143177989893318,
"grad_norm": 0.4343670904636383,
"learning_rate": 9.846409318235056e-06,
"loss": 0.4877,
"step": 229
},
{
"epoch": 0.5165637282425604,
"grad_norm": 0.358761191368103,
"learning_rate": 9.843175910199715e-06,
"loss": 0.4766,
"step": 230
},
{
"epoch": 0.5188096574957889,
"grad_norm": 0.4277135133743286,
"learning_rate": 9.839909362834174e-06,
"loss": 0.4981,
"step": 231
},
{
"epoch": 0.5210555867490174,
"grad_norm": 0.37256282567977905,
"learning_rate": 9.836609698489714e-06,
"loss": 0.5042,
"step": 232
},
{
"epoch": 0.523301516002246,
"grad_norm": 0.3928300142288208,
"learning_rate": 9.833276939744217e-06,
"loss": 0.4798,
"step": 233
},
{
"epoch": 0.5255474452554745,
"grad_norm": 0.36464980244636536,
"learning_rate": 9.829911109402017e-06,
"loss": 0.4999,
"step": 234
},
{
"epoch": 0.527793374508703,
"grad_norm": 0.4434768855571747,
"learning_rate": 9.82651223049374e-06,
"loss": 0.4933,
"step": 235
},
{
"epoch": 0.5300393037619315,
"grad_norm": 0.3624848425388336,
"learning_rate": 9.82308032627614e-06,
"loss": 0.4999,
"step": 236
},
{
"epoch": 0.5322852330151601,
"grad_norm": 0.41842374205589294,
"learning_rate": 9.819615420231954e-06,
"loss": 0.4871,
"step": 237
},
{
"epoch": 0.5345311622683886,
"grad_norm": 0.40757784247398376,
"learning_rate": 9.816117536069724e-06,
"loss": 0.4846,
"step": 238
},
{
"epoch": 0.5367770915216171,
"grad_norm": 0.5392343401908875,
"learning_rate": 9.812586697723658e-06,
"loss": 0.4878,
"step": 239
},
{
"epoch": 0.5390230207748455,
"grad_norm": 0.38242799043655396,
"learning_rate": 9.809022929353436e-06,
"loss": 0.4855,
"step": 240
},
{
"epoch": 0.5412689500280741,
"grad_norm": 0.42983102798461914,
"learning_rate": 9.805426255344071e-06,
"loss": 0.4909,
"step": 241
},
{
"epoch": 0.5435148792813026,
"grad_norm": 0.408312052488327,
"learning_rate": 9.801796700305732e-06,
"loss": 0.4954,
"step": 242
},
{
"epoch": 0.5457608085345311,
"grad_norm": 0.3748157024383545,
"learning_rate": 9.798134289073571e-06,
"loss": 0.4844,
"step": 243
},
{
"epoch": 0.5480067377877597,
"grad_norm": 0.39674103260040283,
"learning_rate": 9.794439046707562e-06,
"loss": 0.4893,
"step": 244
},
{
"epoch": 0.5502526670409882,
"grad_norm": 0.3584100604057312,
"learning_rate": 9.790710998492325e-06,
"loss": 0.4663,
"step": 245
},
{
"epoch": 0.5524985962942167,
"grad_norm": 0.33988258242607117,
"learning_rate": 9.786950169936948e-06,
"loss": 0.4744,
"step": 246
},
{
"epoch": 0.5547445255474452,
"grad_norm": 0.4141857624053955,
"learning_rate": 9.783156586774826e-06,
"loss": 0.491,
"step": 247
},
{
"epoch": 0.5569904548006738,
"grad_norm": 0.344392329454422,
"learning_rate": 9.779330274963473e-06,
"loss": 0.5052,
"step": 248
},
{
"epoch": 0.5592363840539023,
"grad_norm": 0.3439772129058838,
"learning_rate": 9.775471260684346e-06,
"loss": 0.4859,
"step": 249
},
{
"epoch": 0.5614823133071308,
"grad_norm": 0.31984543800354004,
"learning_rate": 9.771579570342668e-06,
"loss": 0.509,
"step": 250
},
{
"epoch": 0.5637282425603594,
"grad_norm": 0.3450314402580261,
"learning_rate": 9.767655230567252e-06,
"loss": 0.4793,
"step": 251
},
{
"epoch": 0.5659741718135879,
"grad_norm": 0.3397728502750397,
"learning_rate": 9.763698268210312e-06,
"loss": 0.4749,
"step": 252
},
{
"epoch": 0.5682201010668164,
"grad_norm": 0.31943392753601074,
"learning_rate": 9.759708710347275e-06,
"loss": 0.4718,
"step": 253
},
{
"epoch": 0.570466030320045,
"grad_norm": 0.3831331431865692,
"learning_rate": 9.755686584276614e-06,
"loss": 0.484,
"step": 254
},
{
"epoch": 0.5727119595732735,
"grad_norm": 0.27558228373527527,
"learning_rate": 9.751631917519637e-06,
"loss": 0.4838,
"step": 255
},
{
"epoch": 0.574957888826502,
"grad_norm": 0.392098069190979,
"learning_rate": 9.747544737820322e-06,
"loss": 0.4844,
"step": 256
},
{
"epoch": 0.5772038180797305,
"grad_norm": 0.29363974928855896,
"learning_rate": 9.743425073145109e-06,
"loss": 0.4993,
"step": 257
},
{
"epoch": 0.5794497473329591,
"grad_norm": 0.3312382400035858,
"learning_rate": 9.739272951682716e-06,
"loss": 0.4812,
"step": 258
},
{
"epoch": 0.5816956765861875,
"grad_norm": 0.34420520067214966,
"learning_rate": 9.735088401843948e-06,
"loss": 0.4744,
"step": 259
},
{
"epoch": 0.583941605839416,
"grad_norm": 0.29115816950798035,
"learning_rate": 9.730871452261502e-06,
"loss": 0.4755,
"step": 260
},
{
"epoch": 0.5861875350926445,
"grad_norm": 0.3523911237716675,
"learning_rate": 9.726622131789766e-06,
"loss": 0.4918,
"step": 261
},
{
"epoch": 0.5884334643458731,
"grad_norm": 0.3150189220905304,
"learning_rate": 9.722340469504628e-06,
"loss": 0.4846,
"step": 262
},
{
"epoch": 0.5906793935991016,
"grad_norm": 0.3749271333217621,
"learning_rate": 9.718026494703269e-06,
"loss": 0.48,
"step": 263
},
{
"epoch": 0.5929253228523301,
"grad_norm": 0.30882978439331055,
"learning_rate": 9.713680236903979e-06,
"loss": 0.4632,
"step": 264
},
{
"epoch": 0.5951712521055587,
"grad_norm": 0.378319650888443,
"learning_rate": 9.70930172584593e-06,
"loss": 0.4876,
"step": 265
},
{
"epoch": 0.5974171813587872,
"grad_norm": 0.2804391384124756,
"learning_rate": 9.704890991488994e-06,
"loss": 0.4682,
"step": 266
},
{
"epoch": 0.5996631106120157,
"grad_norm": 0.3175744414329529,
"learning_rate": 9.70044806401353e-06,
"loss": 0.4932,
"step": 267
},
{
"epoch": 0.6019090398652442,
"grad_norm": 0.3088872730731964,
"learning_rate": 9.695972973820176e-06,
"loss": 0.4758,
"step": 268
},
{
"epoch": 0.6041549691184728,
"grad_norm": 0.2943213880062103,
"learning_rate": 9.691465751529645e-06,
"loss": 0.4995,
"step": 269
},
{
"epoch": 0.6064008983717013,
"grad_norm": 0.3486208915710449,
"learning_rate": 9.68692642798251e-06,
"loss": 0.4686,
"step": 270
},
{
"epoch": 0.6086468276249298,
"grad_norm": 0.37442758679389954,
"learning_rate": 9.682355034238997e-06,
"loss": 0.4918,
"step": 271
},
{
"epoch": 0.6108927568781584,
"grad_norm": 0.5018337368965149,
"learning_rate": 9.677751601578773e-06,
"loss": 0.4793,
"step": 272
},
{
"epoch": 0.6131386861313869,
"grad_norm": 0.3704725205898285,
"learning_rate": 9.67311616150073e-06,
"loss": 0.482,
"step": 273
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.3328251540660858,
"learning_rate": 9.668448745722772e-06,
"loss": 0.4815,
"step": 274
},
{
"epoch": 0.617630544637844,
"grad_norm": 0.374489963054657,
"learning_rate": 9.663749386181593e-06,
"loss": 0.4765,
"step": 275
},
{
"epoch": 0.6198764738910725,
"grad_norm": 0.3103203773498535,
"learning_rate": 9.65901811503246e-06,
"loss": 0.4773,
"step": 276
},
{
"epoch": 0.622122403144301,
"grad_norm": 0.45630261301994324,
"learning_rate": 9.654254964649e-06,
"loss": 0.4814,
"step": 277
},
{
"epoch": 0.6243683323975294,
"grad_norm": 0.32191282510757446,
"learning_rate": 9.649459967622972e-06,
"loss": 0.4876,
"step": 278
},
{
"epoch": 0.626614261650758,
"grad_norm": 0.4367053210735321,
"learning_rate": 9.644633156764038e-06,
"loss": 0.4826,
"step": 279
},
{
"epoch": 0.6288601909039865,
"grad_norm": 0.3019036650657654,
"learning_rate": 9.639774565099555e-06,
"loss": 0.4707,
"step": 280
},
{
"epoch": 0.631106120157215,
"grad_norm": 0.3420458137989044,
"learning_rate": 9.634884225874335e-06,
"loss": 0.4989,
"step": 281
},
{
"epoch": 0.6333520494104435,
"grad_norm": 0.39009857177734375,
"learning_rate": 9.629962172550419e-06,
"loss": 0.4756,
"step": 282
},
{
"epoch": 0.6355979786636721,
"grad_norm": 0.32310977578163147,
"learning_rate": 9.625008438806857e-06,
"loss": 0.4722,
"step": 283
},
{
"epoch": 0.6378439079169006,
"grad_norm": 0.36968040466308594,
"learning_rate": 9.620023058539467e-06,
"loss": 0.4797,
"step": 284
},
{
"epoch": 0.6400898371701291,
"grad_norm": 0.34846970438957214,
"learning_rate": 9.615006065860611e-06,
"loss": 0.471,
"step": 285
},
{
"epoch": 0.6423357664233577,
"grad_norm": 0.3717726767063141,
"learning_rate": 9.609957495098957e-06,
"loss": 0.4669,
"step": 286
},
{
"epoch": 0.6445816956765862,
"grad_norm": 0.3212599456310272,
"learning_rate": 9.604877380799244e-06,
"loss": 0.4702,
"step": 287
},
{
"epoch": 0.6468276249298147,
"grad_norm": 0.3086533844470978,
"learning_rate": 9.59976575772205e-06,
"loss": 0.4734,
"step": 288
},
{
"epoch": 0.6490735541830432,
"grad_norm": 0.37244805693626404,
"learning_rate": 9.594622660843547e-06,
"loss": 0.4867,
"step": 289
},
{
"epoch": 0.6513194834362718,
"grad_norm": 0.327836275100708,
"learning_rate": 9.58944812535527e-06,
"loss": 0.4903,
"step": 290
},
{
"epoch": 0.6535654126895003,
"grad_norm": 0.3715110421180725,
"learning_rate": 9.58424218666387e-06,
"loss": 0.4847,
"step": 291
},
{
"epoch": 0.6558113419427288,
"grad_norm": 0.4033578932285309,
"learning_rate": 9.579004880390872e-06,
"loss": 0.4785,
"step": 292
},
{
"epoch": 0.6580572711959574,
"grad_norm": 0.4034516513347626,
"learning_rate": 9.573736242372436e-06,
"loss": 0.4707,
"step": 293
},
{
"epoch": 0.6603032004491859,
"grad_norm": 0.3724893033504486,
"learning_rate": 9.56843630865911e-06,
"loss": 0.4895,
"step": 294
},
{
"epoch": 0.6625491297024144,
"grad_norm": 0.3855060935020447,
"learning_rate": 9.563105115515579e-06,
"loss": 0.4751,
"step": 295
},
{
"epoch": 0.664795058955643,
"grad_norm": 0.3687981963157654,
"learning_rate": 9.557742699420419e-06,
"loss": 0.4779,
"step": 296
},
{
"epoch": 0.6670409882088714,
"grad_norm": 0.31713899970054626,
"learning_rate": 9.552349097065851e-06,
"loss": 0.4889,
"step": 297
},
{
"epoch": 0.6692869174620999,
"grad_norm": 0.3856634199619293,
"learning_rate": 9.546924345357488e-06,
"loss": 0.4747,
"step": 298
},
{
"epoch": 0.6715328467153284,
"grad_norm": 0.3156067728996277,
"learning_rate": 9.54146848141408e-06,
"loss": 0.4772,
"step": 299
},
{
"epoch": 0.673778775968557,
"grad_norm": 0.33510684967041016,
"learning_rate": 9.53598154256726e-06,
"loss": 0.472,
"step": 300
},
{
"epoch": 0.6760247052217855,
"grad_norm": 0.42198294401168823,
"learning_rate": 9.530463566361296e-06,
"loss": 0.4947,
"step": 301
},
{
"epoch": 0.678270634475014,
"grad_norm": 0.32931357622146606,
"learning_rate": 9.524914590552825e-06,
"loss": 0.4862,
"step": 302
},
{
"epoch": 0.6805165637282425,
"grad_norm": 0.33701708912849426,
"learning_rate": 9.519334653110597e-06,
"loss": 0.5042,
"step": 303
},
{
"epoch": 0.6827624929814711,
"grad_norm": 0.3782896101474762,
"learning_rate": 9.513723792215217e-06,
"loss": 0.4858,
"step": 304
},
{
"epoch": 0.6850084222346996,
"grad_norm": 0.3276413381099701,
"learning_rate": 9.508082046258884e-06,
"loss": 0.473,
"step": 305
},
{
"epoch": 0.6872543514879281,
"grad_norm": 0.3396032452583313,
"learning_rate": 9.502409453845127e-06,
"loss": 0.4978,
"step": 306
},
{
"epoch": 0.6895002807411567,
"grad_norm": 0.38355326652526855,
"learning_rate": 9.496706053788545e-06,
"loss": 0.4695,
"step": 307
},
{
"epoch": 0.6917462099943852,
"grad_norm": 0.3016837537288666,
"learning_rate": 9.490971885114529e-06,
"loss": 0.4868,
"step": 308
},
{
"epoch": 0.6939921392476137,
"grad_norm": 0.3403872549533844,
"learning_rate": 9.48520698705901e-06,
"loss": 0.4964,
"step": 309
},
{
"epoch": 0.6962380685008422,
"grad_norm": 0.33010175824165344,
"learning_rate": 9.479411399068183e-06,
"loss": 0.4675,
"step": 310
},
{
"epoch": 0.6984839977540708,
"grad_norm": 0.36622872948646545,
"learning_rate": 9.473585160798239e-06,
"loss": 0.489,
"step": 311
},
{
"epoch": 0.7007299270072993,
"grad_norm": 0.37846261262893677,
"learning_rate": 9.46772831211509e-06,
"loss": 0.4702,
"step": 312
},
{
"epoch": 0.7029758562605278,
"grad_norm": 0.2969339191913605,
"learning_rate": 9.461840893094103e-06,
"loss": 0.4824,
"step": 313
},
{
"epoch": 0.7052217855137564,
"grad_norm": 0.42460620403289795,
"learning_rate": 9.45592294401982e-06,
"loss": 0.4654,
"step": 314
},
{
"epoch": 0.7074677147669849,
"grad_norm": 0.31242653727531433,
"learning_rate": 9.449974505385682e-06,
"loss": 0.4732,
"step": 315
},
{
"epoch": 0.7097136440202133,
"grad_norm": 0.3350578546524048,
"learning_rate": 9.44399561789376e-06,
"loss": 0.4834,
"step": 316
},
{
"epoch": 0.7119595732734418,
"grad_norm": 0.3971409499645233,
"learning_rate": 9.437986322454462e-06,
"loss": 0.485,
"step": 317
},
{
"epoch": 0.7142055025266704,
"grad_norm": 0.3148505389690399,
"learning_rate": 9.43194666018627e-06,
"loss": 0.4965,
"step": 318
},
{
"epoch": 0.7164514317798989,
"grad_norm": 0.3623645603656769,
"learning_rate": 9.425876672415448e-06,
"loss": 0.4847,
"step": 319
},
{
"epoch": 0.7186973610331274,
"grad_norm": 0.33705249428749084,
"learning_rate": 9.419776400675758e-06,
"loss": 0.4834,
"step": 320
},
{
"epoch": 0.720943290286356,
"grad_norm": 0.3334520161151886,
"learning_rate": 9.413645886708185e-06,
"loss": 0.4728,
"step": 321
},
{
"epoch": 0.7231892195395845,
"grad_norm": 0.3809893727302551,
"learning_rate": 9.40748517246064e-06,
"loss": 0.4738,
"step": 322
},
{
"epoch": 0.725435148792813,
"grad_norm": 0.3264145851135254,
"learning_rate": 9.401294300087682e-06,
"loss": 0.4776,
"step": 323
},
{
"epoch": 0.7276810780460415,
"grad_norm": 0.3935585916042328,
"learning_rate": 9.39507331195023e-06,
"loss": 0.473,
"step": 324
},
{
"epoch": 0.7299270072992701,
"grad_norm": 0.38635513186454773,
"learning_rate": 9.388822250615264e-06,
"loss": 0.4649,
"step": 325
},
{
"epoch": 0.7321729365524986,
"grad_norm": 0.41219913959503174,
"learning_rate": 9.382541158855538e-06,
"loss": 0.4593,
"step": 326
},
{
"epoch": 0.7344188658057271,
"grad_norm": 0.35313233733177185,
"learning_rate": 9.376230079649295e-06,
"loss": 0.4695,
"step": 327
},
{
"epoch": 0.7366647950589557,
"grad_norm": 0.48907920718193054,
"learning_rate": 9.369889056179961e-06,
"loss": 0.4615,
"step": 328
},
{
"epoch": 0.7389107243121842,
"grad_norm": 0.32115358114242554,
"learning_rate": 9.363518131835857e-06,
"loss": 0.4806,
"step": 329
},
{
"epoch": 0.7411566535654127,
"grad_norm": 0.4651142358779907,
"learning_rate": 9.357117350209901e-06,
"loss": 0.4823,
"step": 330
},
{
"epoch": 0.7434025828186412,
"grad_norm": 0.37610235810279846,
"learning_rate": 9.350686755099307e-06,
"loss": 0.476,
"step": 331
},
{
"epoch": 0.7456485120718698,
"grad_norm": 0.3762288987636566,
"learning_rate": 9.344226390505288e-06,
"loss": 0.4878,
"step": 332
},
{
"epoch": 0.7478944413250983,
"grad_norm": 0.34318727254867554,
"learning_rate": 9.337736300632754e-06,
"loss": 0.4823,
"step": 333
},
{
"epoch": 0.7501403705783268,
"grad_norm": 0.3277176320552826,
"learning_rate": 9.331216529890009e-06,
"loss": 0.492,
"step": 334
},
{
"epoch": 0.7523862998315554,
"grad_norm": 0.3363962471485138,
"learning_rate": 9.324667122888452e-06,
"loss": 0.477,
"step": 335
},
{
"epoch": 0.7546322290847838,
"grad_norm": 0.34611254930496216,
"learning_rate": 9.318088124442259e-06,
"loss": 0.4622,
"step": 336
},
{
"epoch": 0.7568781583380123,
"grad_norm": 0.4872119724750519,
"learning_rate": 9.311479579568091e-06,
"loss": 0.4704,
"step": 337
},
{
"epoch": 0.7591240875912408,
"grad_norm": 0.30356013774871826,
"learning_rate": 9.30484153348478e-06,
"loss": 0.4826,
"step": 338
},
{
"epoch": 0.7613700168444694,
"grad_norm": 0.3759292662143707,
"learning_rate": 9.298174031613019e-06,
"loss": 0.4771,
"step": 339
},
{
"epoch": 0.7636159460976979,
"grad_norm": 0.4052506387233734,
"learning_rate": 9.291477119575048e-06,
"loss": 0.4747,
"step": 340
},
{
"epoch": 0.7658618753509264,
"grad_norm": 0.40775245428085327,
"learning_rate": 9.28475084319435e-06,
"loss": 0.4963,
"step": 341
},
{
"epoch": 0.768107804604155,
"grad_norm": 0.34407731890678406,
"learning_rate": 9.277995248495328e-06,
"loss": 0.472,
"step": 342
},
{
"epoch": 0.7703537338573835,
"grad_norm": 0.4342804253101349,
"learning_rate": 9.271210381703e-06,
"loss": 0.4633,
"step": 343
},
{
"epoch": 0.772599663110612,
"grad_norm": 0.325330913066864,
"learning_rate": 9.264396289242676e-06,
"loss": 0.4859,
"step": 344
},
{
"epoch": 0.7748455923638405,
"grad_norm": 0.4626711905002594,
"learning_rate": 9.25755301773964e-06,
"loss": 0.457,
"step": 345
},
{
"epoch": 0.7770915216170691,
"grad_norm": 0.34164246916770935,
"learning_rate": 9.250680614018837e-06,
"loss": 0.4748,
"step": 346
},
{
"epoch": 0.7793374508702976,
"grad_norm": 0.3387359082698822,
"learning_rate": 9.243779125104544e-06,
"loss": 0.4862,
"step": 347
},
{
"epoch": 0.7815833801235261,
"grad_norm": 0.40897244215011597,
"learning_rate": 9.236848598220055e-06,
"loss": 0.4739,
"step": 348
},
{
"epoch": 0.7838293093767547,
"grad_norm": 0.37918272614479065,
"learning_rate": 9.229889080787357e-06,
"loss": 0.4717,
"step": 349
},
{
"epoch": 0.7860752386299832,
"grad_norm": 0.4629786014556885,
"learning_rate": 9.222900620426802e-06,
"loss": 0.4939,
"step": 350
},
{
"epoch": 0.7883211678832117,
"grad_norm": 0.42090147733688354,
"learning_rate": 9.215883264956786e-06,
"loss": 0.4776,
"step": 351
},
{
"epoch": 0.7905670971364402,
"grad_norm": 0.3530665338039398,
"learning_rate": 9.208837062393416e-06,
"loss": 0.4875,
"step": 352
},
{
"epoch": 0.7928130263896688,
"grad_norm": 0.4339233338832855,
"learning_rate": 9.201762060950185e-06,
"loss": 0.4484,
"step": 353
},
{
"epoch": 0.7950589556428973,
"grad_norm": 0.3293563425540924,
"learning_rate": 9.194658309037647e-06,
"loss": 0.4757,
"step": 354
},
{
"epoch": 0.7973048848961257,
"grad_norm": 0.3879033923149109,
"learning_rate": 9.187525855263071e-06,
"loss": 0.4816,
"step": 355
},
{
"epoch": 0.7995508141493542,
"grad_norm": 0.36516231298446655,
"learning_rate": 9.180364748430127e-06,
"loss": 0.4598,
"step": 356
},
{
"epoch": 0.8017967434025828,
"grad_norm": 0.3673107326030731,
"learning_rate": 9.173175037538539e-06,
"loss": 0.4731,
"step": 357
},
{
"epoch": 0.8040426726558113,
"grad_norm": 0.38570478558540344,
"learning_rate": 9.165956771783751e-06,
"loss": 0.4744,
"step": 358
},
{
"epoch": 0.8062886019090398,
"grad_norm": 0.42901894450187683,
"learning_rate": 9.1587100005566e-06,
"loss": 0.4842,
"step": 359
},
{
"epoch": 0.8085345311622684,
"grad_norm": 0.39992624521255493,
"learning_rate": 9.151434773442963e-06,
"loss": 0.475,
"step": 360
},
{
"epoch": 0.8107804604154969,
"grad_norm": 0.4681251347064972,
"learning_rate": 9.144131140223434e-06,
"loss": 0.4886,
"step": 361
},
{
"epoch": 0.8130263896687254,
"grad_norm": 0.35085222125053406,
"learning_rate": 9.136799150872967e-06,
"loss": 0.4861,
"step": 362
},
{
"epoch": 0.815272318921954,
"grad_norm": 0.42589834332466125,
"learning_rate": 9.129438855560551e-06,
"loss": 0.4668,
"step": 363
},
{
"epoch": 0.8175182481751825,
"grad_norm": 0.38507068157196045,
"learning_rate": 9.122050304648849e-06,
"loss": 0.4766,
"step": 364
},
{
"epoch": 0.819764177428411,
"grad_norm": 0.375751256942749,
"learning_rate": 9.114633548693868e-06,
"loss": 0.4816,
"step": 365
},
{
"epoch": 0.8220101066816395,
"grad_norm": 0.503512442111969,
"learning_rate": 9.107188638444606e-06,
"loss": 0.4746,
"step": 366
},
{
"epoch": 0.8242560359348681,
"grad_norm": 0.34955278038978577,
"learning_rate": 9.099715624842707e-06,
"loss": 0.4734,
"step": 367
},
{
"epoch": 0.8265019651880966,
"grad_norm": 0.37166303396224976,
"learning_rate": 9.09221455902211e-06,
"loss": 0.4635,
"step": 368
},
{
"epoch": 0.8287478944413251,
"grad_norm": 0.32505786418914795,
"learning_rate": 9.0846854923087e-06,
"loss": 0.4716,
"step": 369
},
{
"epoch": 0.8309938236945537,
"grad_norm": 0.3304513096809387,
"learning_rate": 9.077128476219963e-06,
"loss": 0.4648,
"step": 370
},
{
"epoch": 0.8332397529477822,
"grad_norm": 0.32548874616622925,
"learning_rate": 9.06954356246462e-06,
"loss": 0.4628,
"step": 371
},
{
"epoch": 0.8354856822010107,
"grad_norm": 0.351330041885376,
"learning_rate": 9.061930802942286e-06,
"loss": 0.4848,
"step": 372
},
{
"epoch": 0.8377316114542392,
"grad_norm": 0.3573990762233734,
"learning_rate": 9.054290249743113e-06,
"loss": 0.4762,
"step": 373
},
{
"epoch": 0.8399775407074677,
"grad_norm": 0.32974398136138916,
"learning_rate": 9.046621955147423e-06,
"loss": 0.4751,
"step": 374
},
{
"epoch": 0.8422234699606962,
"grad_norm": 0.31952598690986633,
"learning_rate": 9.03892597162536e-06,
"loss": 0.4652,
"step": 375
},
{
"epoch": 0.8444693992139247,
"grad_norm": 0.33405670523643494,
"learning_rate": 9.031202351836539e-06,
"loss": 0.4712,
"step": 376
},
{
"epoch": 0.8467153284671532,
"grad_norm": 0.41173166036605835,
"learning_rate": 9.02345114862966e-06,
"loss": 0.4644,
"step": 377
},
{
"epoch": 0.8489612577203818,
"grad_norm": 0.3065979480743408,
"learning_rate": 9.01567241504217e-06,
"loss": 0.4685,
"step": 378
},
{
"epoch": 0.8512071869736103,
"grad_norm": 0.38998886942863464,
"learning_rate": 9.007866204299896e-06,
"loss": 0.4836,
"step": 379
},
{
"epoch": 0.8534531162268388,
"grad_norm": 0.3278312683105469,
"learning_rate": 9.000032569816668e-06,
"loss": 0.482,
"step": 380
},
{
"epoch": 0.8556990454800674,
"grad_norm": 0.389222115278244,
"learning_rate": 8.992171565193968e-06,
"loss": 0.4642,
"step": 381
},
{
"epoch": 0.8579449747332959,
"grad_norm": 0.3489379584789276,
"learning_rate": 8.984283244220558e-06,
"loss": 0.4961,
"step": 382
},
{
"epoch": 0.8601909039865244,
"grad_norm": 0.38780078291893005,
"learning_rate": 8.976367660872104e-06,
"loss": 0.4858,
"step": 383
},
{
"epoch": 0.862436833239753,
"grad_norm": 0.3673154413700104,
"learning_rate": 8.968424869310828e-06,
"loss": 0.4691,
"step": 384
},
{
"epoch": 0.8646827624929815,
"grad_norm": 0.36734986305236816,
"learning_rate": 8.960454923885111e-06,
"loss": 0.4622,
"step": 385
},
{
"epoch": 0.86692869174621,
"grad_norm": 0.3670867085456848,
"learning_rate": 8.95245787912914e-06,
"loss": 0.4835,
"step": 386
},
{
"epoch": 0.8691746209994385,
"grad_norm": 0.33945947885513306,
"learning_rate": 8.944433789762523e-06,
"loss": 0.4756,
"step": 387
},
{
"epoch": 0.8714205502526671,
"grad_norm": 0.37823382019996643,
"learning_rate": 8.93638271068993e-06,
"loss": 0.4927,
"step": 388
},
{
"epoch": 0.8736664795058956,
"grad_norm": 0.3298521935939789,
"learning_rate": 8.9283046970007e-06,
"loss": 0.4639,
"step": 389
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.33418142795562744,
"learning_rate": 8.92019980396847e-06,
"loss": 0.4559,
"step": 390
},
{
"epoch": 0.8781583380123527,
"grad_norm": 0.32573068141937256,
"learning_rate": 8.912068087050807e-06,
"loss": 0.4599,
"step": 391
},
{
"epoch": 0.8804042672655812,
"grad_norm": 0.2992747724056244,
"learning_rate": 8.90390960188881e-06,
"loss": 0.4699,
"step": 392
},
{
"epoch": 0.8826501965188096,
"grad_norm": 0.419653981924057,
"learning_rate": 8.895724404306745e-06,
"loss": 0.4644,
"step": 393
},
{
"epoch": 0.8848961257720381,
"grad_norm": 0.34604114294052124,
"learning_rate": 8.887512550311655e-06,
"loss": 0.4758,
"step": 394
},
{
"epoch": 0.8871420550252667,
"grad_norm": 0.30816447734832764,
"learning_rate": 8.879274096092983e-06,
"loss": 0.4709,
"step": 395
},
{
"epoch": 0.8893879842784952,
"grad_norm": 0.3544372320175171,
"learning_rate": 8.871009098022176e-06,
"loss": 0.4903,
"step": 396
},
{
"epoch": 0.8916339135317237,
"grad_norm": 0.3021892011165619,
"learning_rate": 8.862717612652316e-06,
"loss": 0.4576,
"step": 397
},
{
"epoch": 0.8938798427849522,
"grad_norm": 0.33287468552589417,
"learning_rate": 8.854399696717713e-06,
"loss": 0.4823,
"step": 398
},
{
"epoch": 0.8961257720381808,
"grad_norm": 0.2934684455394745,
"learning_rate": 8.846055407133539e-06,
"loss": 0.4619,
"step": 399
},
{
"epoch": 0.8983717012914093,
"grad_norm": 0.37255221605300903,
"learning_rate": 8.837684800995417e-06,
"loss": 0.4567,
"step": 400
},
{
"epoch": 0.9006176305446378,
"grad_norm": 0.3295063376426697,
"learning_rate": 8.829287935579046e-06,
"loss": 0.4667,
"step": 401
},
{
"epoch": 0.9028635597978664,
"grad_norm": 0.38328802585601807,
"learning_rate": 8.820864868339804e-06,
"loss": 0.4735,
"step": 402
},
{
"epoch": 0.9051094890510949,
"grad_norm": 0.36380237340927124,
"learning_rate": 8.812415656912353e-06,
"loss": 0.4918,
"step": 403
},
{
"epoch": 0.9073554183043234,
"grad_norm": 0.3465980887413025,
"learning_rate": 8.803940359110246e-06,
"loss": 0.4798,
"step": 404
},
{
"epoch": 0.909601347557552,
"grad_norm": 0.35272216796875,
"learning_rate": 8.79543903292553e-06,
"loss": 0.4724,
"step": 405
},
{
"epoch": 0.9118472768107805,
"grad_norm": 0.38653409481048584,
"learning_rate": 8.786911736528352e-06,
"loss": 0.4559,
"step": 406
},
{
"epoch": 0.914093206064009,
"grad_norm": 0.35222503542900085,
"learning_rate": 8.778358528266562e-06,
"loss": 0.4586,
"step": 407
},
{
"epoch": 0.9163391353172375,
"grad_norm": 0.31955739855766296,
"learning_rate": 8.769779466665309e-06,
"loss": 0.4748,
"step": 408
},
{
"epoch": 0.9185850645704661,
"grad_norm": 0.30488333106040955,
"learning_rate": 8.761174610426642e-06,
"loss": 0.467,
"step": 409
},
{
"epoch": 0.9208309938236946,
"grad_norm": 0.268274188041687,
"learning_rate": 8.75254401842911e-06,
"loss": 0.481,
"step": 410
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.35750773549079895,
"learning_rate": 8.74388774972736e-06,
"loss": 0.4931,
"step": 411
},
{
"epoch": 0.9253228523301515,
"grad_norm": 0.27234843373298645,
"learning_rate": 8.73520586355173e-06,
"loss": 0.4709,
"step": 412
},
{
"epoch": 0.9275687815833801,
"grad_norm": 0.31700101494789124,
"learning_rate": 8.726498419307844e-06,
"loss": 0.4618,
"step": 413
},
{
"epoch": 0.9298147108366086,
"grad_norm": 0.27126544713974,
"learning_rate": 8.71776547657621e-06,
"loss": 0.4663,
"step": 414
},
{
"epoch": 0.9320606400898371,
"grad_norm": 0.27281293272972107,
"learning_rate": 8.709007095111805e-06,
"loss": 0.4641,
"step": 415
},
{
"epoch": 0.9343065693430657,
"grad_norm": 0.32400500774383545,
"learning_rate": 8.70022333484367e-06,
"loss": 0.4703,
"step": 416
},
{
"epoch": 0.9365524985962942,
"grad_norm": 0.34988343715667725,
"learning_rate": 8.691414255874506e-06,
"loss": 0.4912,
"step": 417
},
{
"epoch": 0.9387984278495227,
"grad_norm": 0.2996358573436737,
"learning_rate": 8.682579918480247e-06,
"loss": 0.4605,
"step": 418
},
{
"epoch": 0.9410443571027512,
"grad_norm": 0.3629034757614136,
"learning_rate": 8.673720383109666e-06,
"loss": 0.4881,
"step": 419
},
{
"epoch": 0.9432902863559798,
"grad_norm": 0.3697206377983093,
"learning_rate": 8.664835710383949e-06,
"loss": 0.4693,
"step": 420
},
{
"epoch": 0.9455362156092083,
"grad_norm": 0.2857604920864105,
"learning_rate": 8.655925961096284e-06,
"loss": 0.46,
"step": 421
},
{
"epoch": 0.9477821448624368,
"grad_norm": 0.3731415569782257,
"learning_rate": 8.64699119621144e-06,
"loss": 0.4781,
"step": 422
},
{
"epoch": 0.9500280741156654,
"grad_norm": 0.2709653675556183,
"learning_rate": 8.638031476865366e-06,
"loss": 0.4582,
"step": 423
},
{
"epoch": 0.9522740033688939,
"grad_norm": 0.3546141982078552,
"learning_rate": 8.629046864364751e-06,
"loss": 0.468,
"step": 424
},
{
"epoch": 0.9545199326221224,
"grad_norm": 0.30327171087265015,
"learning_rate": 8.62003742018662e-06,
"loss": 0.4668,
"step": 425
},
{
"epoch": 0.956765861875351,
"grad_norm": 0.3272528052330017,
"learning_rate": 8.611003205977905e-06,
"loss": 0.4579,
"step": 426
},
{
"epoch": 0.9590117911285795,
"grad_norm": 0.3644426167011261,
"learning_rate": 8.601944283555033e-06,
"loss": 0.4644,
"step": 427
},
{
"epoch": 0.961257720381808,
"grad_norm": 0.3664405941963196,
"learning_rate": 8.592860714903488e-06,
"loss": 0.4789,
"step": 428
},
{
"epoch": 0.9635036496350365,
"grad_norm": 0.4094981551170349,
"learning_rate": 8.583752562177401e-06,
"loss": 0.4533,
"step": 429
},
{
"epoch": 0.9657495788882651,
"grad_norm": 0.3394399881362915,
"learning_rate": 8.574619887699115e-06,
"loss": 0.452,
"step": 430
},
{
"epoch": 0.9679955081414935,
"grad_norm": 0.3262495696544647,
"learning_rate": 8.565462753958767e-06,
"loss": 0.47,
"step": 431
},
{
"epoch": 0.970241437394722,
"grad_norm": 0.3226722776889801,
"learning_rate": 8.556281223613851e-06,
"loss": 0.459,
"step": 432
},
{
"epoch": 0.9724873666479505,
"grad_norm": 0.28685227036476135,
"learning_rate": 8.5470753594888e-06,
"loss": 0.4404,
"step": 433
},
{
"epoch": 0.9747332959011791,
"grad_norm": 0.32768598198890686,
"learning_rate": 8.537845224574546e-06,
"loss": 0.4849,
"step": 434
},
{
"epoch": 0.9769792251544076,
"grad_norm": 0.2841854393482208,
"learning_rate": 8.528590882028094e-06,
"loss": 0.4686,
"step": 435
},
{
"epoch": 0.9792251544076361,
"grad_norm": 0.29862478375434875,
"learning_rate": 8.519312395172093e-06,
"loss": 0.4707,
"step": 436
},
{
"epoch": 0.9814710836608647,
"grad_norm": 0.30814310908317566,
"learning_rate": 8.510009827494392e-06,
"loss": 0.477,
"step": 437
},
{
"epoch": 0.9837170129140932,
"grad_norm": 0.3032066822052002,
"learning_rate": 8.500683242647617e-06,
"loss": 0.4638,
"step": 438
},
{
"epoch": 0.9859629421673217,
"grad_norm": 0.3458973169326782,
"learning_rate": 8.491332704448734e-06,
"loss": 0.4756,
"step": 439
},
{
"epoch": 0.9882088714205502,
"grad_norm": 0.30614790320396423,
"learning_rate": 8.481958276878602e-06,
"loss": 0.4856,
"step": 440
},
{
"epoch": 0.9904548006737788,
"grad_norm": 0.3345167338848114,
"learning_rate": 8.472560024081546e-06,
"loss": 0.4613,
"step": 441
},
{
"epoch": 0.9927007299270073,
"grad_norm": 0.3257136344909668,
"learning_rate": 8.463138010364918e-06,
"loss": 0.4786,
"step": 442
},
{
"epoch": 0.9949466591802358,
"grad_norm": 0.3315941393375397,
"learning_rate": 8.453692300198648e-06,
"loss": 0.4654,
"step": 443
},
{
"epoch": 0.9971925884334644,
"grad_norm": 0.32225826382637024,
"learning_rate": 8.444222958214812e-06,
"loss": 0.4765,
"step": 444
},
{
"epoch": 0.9994385176866929,
"grad_norm": 0.3224077820777893,
"learning_rate": 8.434730049207184e-06,
"loss": 0.4593,
"step": 445
},
{
"epoch": 1.0016844469399213,
"grad_norm": 0.6709184646606445,
"learning_rate": 8.425213638130798e-06,
"loss": 0.7572,
"step": 446
},
{
"epoch": 1.00393037619315,
"grad_norm": 0.4798668920993805,
"learning_rate": 8.415673790101495e-06,
"loss": 0.472,
"step": 447
},
{
"epoch": 1.0061763054463784,
"grad_norm": 0.37248560786247253,
"learning_rate": 8.40611057039549e-06,
"loss": 0.448,
"step": 448
},
{
"epoch": 1.008422234699607,
"grad_norm": 0.45663711428642273,
"learning_rate": 8.396524044448913e-06,
"loss": 0.4495,
"step": 449
},
{
"epoch": 1.0106681639528354,
"grad_norm": 0.38295912742614746,
"learning_rate": 8.386914277857365e-06,
"loss": 0.4203,
"step": 450
},
{
"epoch": 1.012914093206064,
"grad_norm": 0.44765421748161316,
"learning_rate": 8.37728133637548e-06,
"loss": 0.4774,
"step": 451
},
{
"epoch": 1.0151600224592925,
"grad_norm": 0.35290607810020447,
"learning_rate": 8.367625285916454e-06,
"loss": 0.4205,
"step": 452
},
{
"epoch": 1.0174059517125211,
"grad_norm": 0.4127921462059021,
"learning_rate": 8.357946192551611e-06,
"loss": 0.4512,
"step": 453
},
{
"epoch": 1.0196518809657495,
"grad_norm": 0.4858415126800537,
"learning_rate": 8.348244122509949e-06,
"loss": 0.4631,
"step": 454
},
{
"epoch": 1.0218978102189782,
"grad_norm": 0.40491798520088196,
"learning_rate": 8.338519142177679e-06,
"loss": 0.4365,
"step": 455
},
{
"epoch": 1.0241437394722066,
"grad_norm": 0.34673023223876953,
"learning_rate": 8.328771318097773e-06,
"loss": 0.4477,
"step": 456
},
{
"epoch": 1.0263896687254352,
"grad_norm": 0.40387821197509766,
"learning_rate": 8.319000716969518e-06,
"loss": 0.4611,
"step": 457
},
{
"epoch": 1.0286355979786637,
"grad_norm": 0.34297940135002136,
"learning_rate": 8.309207405648047e-06,
"loss": 0.4474,
"step": 458
},
{
"epoch": 1.0308815272318923,
"grad_norm": 0.3807845711708069,
"learning_rate": 8.299391451143887e-06,
"loss": 0.469,
"step": 459
},
{
"epoch": 1.0331274564851207,
"grad_norm": 0.3148818612098694,
"learning_rate": 8.289552920622505e-06,
"loss": 0.4526,
"step": 460
},
{
"epoch": 1.0353733857383491,
"grad_norm": 0.34133604168891907,
"learning_rate": 8.27969188140384e-06,
"loss": 0.4645,
"step": 461
},
{
"epoch": 1.0376193149915778,
"grad_norm": 0.3762519657611847,
"learning_rate": 8.269808400961845e-06,
"loss": 0.4483,
"step": 462
},
{
"epoch": 1.0398652442448062,
"grad_norm": 0.46112120151519775,
"learning_rate": 8.259902546924032e-06,
"loss": 0.4667,
"step": 463
},
{
"epoch": 1.0421111734980348,
"grad_norm": 4.490077972412109,
"learning_rate": 8.249974387071e-06,
"loss": 0.4467,
"step": 464
},
{
"epoch": 1.0443571027512633,
"grad_norm": 4.129928112030029,
"learning_rate": 8.240023989335975e-06,
"loss": 0.463,
"step": 465
},
{
"epoch": 1.046603032004492,
"grad_norm": 0.6177784204483032,
"learning_rate": 8.230051421804346e-06,
"loss": 0.4552,
"step": 466
},
{
"epoch": 1.0488489612577203,
"grad_norm": 1.1695165634155273,
"learning_rate": 8.220056752713198e-06,
"loss": 0.4519,
"step": 467
},
{
"epoch": 1.051094890510949,
"grad_norm": 0.5390977263450623,
"learning_rate": 8.210040050450846e-06,
"loss": 0.473,
"step": 468
},
{
"epoch": 1.0533408197641774,
"grad_norm": 0.4115554392337799,
"learning_rate": 8.20000138355637e-06,
"loss": 0.446,
"step": 469
},
{
"epoch": 1.055586749017406,
"grad_norm": 0.4782909154891968,
"learning_rate": 8.189940820719136e-06,
"loss": 0.4574,
"step": 470
},
{
"epoch": 1.0578326782706344,
"grad_norm": 0.4880026876926422,
"learning_rate": 8.179858430778334e-06,
"loss": 0.4549,
"step": 471
},
{
"epoch": 1.060078607523863,
"grad_norm": 0.4663502275943756,
"learning_rate": 8.169754282722508e-06,
"loss": 0.4533,
"step": 472
},
{
"epoch": 1.0623245367770915,
"grad_norm": 0.4719676077365875,
"learning_rate": 8.159628445689083e-06,
"loss": 0.4507,
"step": 473
},
{
"epoch": 1.0645704660303201,
"grad_norm": 0.37671101093292236,
"learning_rate": 8.149480988963884e-06,
"loss": 0.4445,
"step": 474
},
{
"epoch": 1.0668163952835485,
"grad_norm": 0.4894201457500458,
"learning_rate": 8.139311981980675e-06,
"loss": 0.4425,
"step": 475
},
{
"epoch": 1.0690623245367772,
"grad_norm": 1.3329061269760132,
"learning_rate": 8.129121494320673e-06,
"loss": 0.4334,
"step": 476
},
{
"epoch": 1.0713082537900056,
"grad_norm": 0.4755379557609558,
"learning_rate": 8.118909595712077e-06,
"loss": 0.4596,
"step": 477
},
{
"epoch": 1.073554183043234,
"grad_norm": 0.3152107894420624,
"learning_rate": 8.108676356029593e-06,
"loss": 0.4773,
"step": 478
},
{
"epoch": 1.0758001122964627,
"grad_norm": 0.40582582354545593,
"learning_rate": 8.098421845293946e-06,
"loss": 0.436,
"step": 479
},
{
"epoch": 1.078046041549691,
"grad_norm": 0.333881676197052,
"learning_rate": 8.088146133671415e-06,
"loss": 0.4441,
"step": 480
},
{
"epoch": 1.0802919708029197,
"grad_norm": 0.36508119106292725,
"learning_rate": 8.077849291473339e-06,
"loss": 0.445,
"step": 481
},
{
"epoch": 1.0825379000561481,
"grad_norm": 0.40846577286720276,
"learning_rate": 8.067531389155652e-06,
"loss": 0.4652,
"step": 482
},
{
"epoch": 1.0847838293093768,
"grad_norm": 0.29027220606803894,
"learning_rate": 8.057192497318383e-06,
"loss": 0.432,
"step": 483
},
{
"epoch": 1.0870297585626052,
"grad_norm": 0.3959558606147766,
"learning_rate": 8.046832686705179e-06,
"loss": 0.475,
"step": 484
},
{
"epoch": 1.0892756878158338,
"grad_norm": 0.2976958453655243,
"learning_rate": 8.036452028202837e-06,
"loss": 0.437,
"step": 485
},
{
"epoch": 1.0915216170690623,
"grad_norm": 0.26725515723228455,
"learning_rate": 8.026050592840788e-06,
"loss": 0.4279,
"step": 486
},
{
"epoch": 1.093767546322291,
"grad_norm": 0.3430537283420563,
"learning_rate": 8.015628451790642e-06,
"loss": 0.4596,
"step": 487
},
{
"epoch": 1.0960134755755193,
"grad_norm": 0.28370511531829834,
"learning_rate": 8.00518567636568e-06,
"loss": 0.4457,
"step": 488
},
{
"epoch": 1.098259404828748,
"grad_norm": 0.3284716308116913,
"learning_rate": 7.994722338020375e-06,
"loss": 0.4424,
"step": 489
},
{
"epoch": 1.1005053340819764,
"grad_norm": 0.30496740341186523,
"learning_rate": 7.984238508349901e-06,
"loss": 0.4534,
"step": 490
},
{
"epoch": 1.102751263335205,
"grad_norm": 0.3204284608364105,
"learning_rate": 7.973734259089644e-06,
"loss": 0.4559,
"step": 491
},
{
"epoch": 1.1049971925884334,
"grad_norm": 0.28355643153190613,
"learning_rate": 7.963209662114714e-06,
"loss": 0.4683,
"step": 492
},
{
"epoch": 1.107243121841662,
"grad_norm": 0.2843816578388214,
"learning_rate": 7.952664789439443e-06,
"loss": 0.4605,
"step": 493
},
{
"epoch": 1.1094890510948905,
"grad_norm": 0.2756952941417694,
"learning_rate": 7.942099713216902e-06,
"loss": 0.4218,
"step": 494
},
{
"epoch": 1.1117349803481191,
"grad_norm": 0.27619650959968567,
"learning_rate": 7.931514505738408e-06,
"loss": 0.4309,
"step": 495
},
{
"epoch": 1.1139809096013475,
"grad_norm": 0.31005722284317017,
"learning_rate": 7.92090923943302e-06,
"loss": 0.4478,
"step": 496
},
{
"epoch": 1.1162268388545762,
"grad_norm": 0.26537370681762695,
"learning_rate": 7.910283986867051e-06,
"loss": 0.4721,
"step": 497
},
{
"epoch": 1.1184727681078046,
"grad_norm": 0.3197883665561676,
"learning_rate": 7.89963882074357e-06,
"loss": 0.4371,
"step": 498
},
{
"epoch": 1.120718697361033,
"grad_norm": 0.27182987332344055,
"learning_rate": 7.888973813901909e-06,
"loss": 0.454,
"step": 499
},
{
"epoch": 1.1229646266142617,
"grad_norm": 0.36007192730903625,
"learning_rate": 7.87828903931715e-06,
"loss": 0.4666,
"step": 500
},
{
"epoch": 1.12521055586749,
"grad_norm": 0.2985324263572693,
"learning_rate": 7.867584570099642e-06,
"loss": 0.4463,
"step": 501
},
{
"epoch": 1.1274564851207187,
"grad_norm": 0.30184683203697205,
"learning_rate": 7.856860479494492e-06,
"loss": 0.4582,
"step": 502
},
{
"epoch": 1.1297024143739471,
"grad_norm": 0.2989865839481354,
"learning_rate": 7.846116840881069e-06,
"loss": 0.4557,
"step": 503
},
{
"epoch": 1.1319483436271758,
"grad_norm": 0.2534805238246918,
"learning_rate": 7.835353727772491e-06,
"loss": 0.4058,
"step": 504
},
{
"epoch": 1.1341942728804042,
"grad_norm": 0.35043448209762573,
"learning_rate": 7.82457121381514e-06,
"loss": 0.49,
"step": 505
},
{
"epoch": 1.1364402021336328,
"grad_norm": 0.2577075660228729,
"learning_rate": 7.81376937278814e-06,
"loss": 0.4293,
"step": 506
},
{
"epoch": 1.1386861313868613,
"grad_norm": 0.3364856541156769,
"learning_rate": 7.802948278602866e-06,
"loss": 0.4755,
"step": 507
},
{
"epoch": 1.14093206064009,
"grad_norm": 0.282972514629364,
"learning_rate": 7.792108005302426e-06,
"loss": 0.4537,
"step": 508
},
{
"epoch": 1.1431779898933183,
"grad_norm": 0.26607781648635864,
"learning_rate": 7.781248627061166e-06,
"loss": 0.4228,
"step": 509
},
{
"epoch": 1.145423919146547,
"grad_norm": 0.3014846742153168,
"learning_rate": 7.770370218184156e-06,
"loss": 0.4455,
"step": 510
},
{
"epoch": 1.1476698483997754,
"grad_norm": 0.27567797899246216,
"learning_rate": 7.75947285310668e-06,
"loss": 0.482,
"step": 511
},
{
"epoch": 1.149915777653004,
"grad_norm": 0.2605037987232208,
"learning_rate": 7.748556606393732e-06,
"loss": 0.4284,
"step": 512
},
{
"epoch": 1.1521617069062324,
"grad_norm": 0.3069257140159607,
"learning_rate": 7.737621552739501e-06,
"loss": 0.4571,
"step": 513
},
{
"epoch": 1.154407636159461,
"grad_norm": 0.3215087354183197,
"learning_rate": 7.726667766966866e-06,
"loss": 0.4502,
"step": 514
},
{
"epoch": 1.1566535654126895,
"grad_norm": 0.31216177344322205,
"learning_rate": 7.71569532402688e-06,
"loss": 0.4565,
"step": 515
},
{
"epoch": 1.158899494665918,
"grad_norm": 0.3760012984275818,
"learning_rate": 7.70470429899825e-06,
"loss": 0.4362,
"step": 516
},
{
"epoch": 1.1611454239191465,
"grad_norm": 0.33376315236091614,
"learning_rate": 7.69369476708684e-06,
"loss": 0.4724,
"step": 517
},
{
"epoch": 1.1633913531723752,
"grad_norm": 0.2877935469150543,
"learning_rate": 7.682666803625138e-06,
"loss": 0.4453,
"step": 518
},
{
"epoch": 1.1656372824256036,
"grad_norm": 0.33166879415512085,
"learning_rate": 7.671620484071758e-06,
"loss": 0.4585,
"step": 519
},
{
"epoch": 1.167883211678832,
"grad_norm": 0.2634395360946655,
"learning_rate": 7.66055588401091e-06,
"loss": 0.4302,
"step": 520
},
{
"epoch": 1.1701291409320607,
"grad_norm": 0.28289881348609924,
"learning_rate": 7.649473079151888e-06,
"loss": 0.4303,
"step": 521
},
{
"epoch": 1.172375070185289,
"grad_norm": 0.29282352328300476,
"learning_rate": 7.638372145328554e-06,
"loss": 0.4395,
"step": 522
},
{
"epoch": 1.1746209994385177,
"grad_norm": 0.27824363112449646,
"learning_rate": 7.627253158498819e-06,
"loss": 0.445,
"step": 523
},
{
"epoch": 1.1768669286917461,
"grad_norm": 0.3538764715194702,
"learning_rate": 7.616116194744114e-06,
"loss": 0.4612,
"step": 524
},
{
"epoch": 1.1791128579449748,
"grad_norm": 0.26989635825157166,
"learning_rate": 7.604961330268885e-06,
"loss": 0.4544,
"step": 525
},
{
"epoch": 1.1813587871982032,
"grad_norm": 0.32161369919776917,
"learning_rate": 7.593788641400057e-06,
"loss": 0.4405,
"step": 526
},
{
"epoch": 1.1836047164514318,
"grad_norm": 0.27198460698127747,
"learning_rate": 7.582598204586522e-06,
"loss": 0.4759,
"step": 527
},
{
"epoch": 1.1858506457046603,
"grad_norm": 0.365715891122818,
"learning_rate": 7.571390096398611e-06,
"loss": 0.4433,
"step": 528
},
{
"epoch": 1.188096574957889,
"grad_norm": 0.2920300364494324,
"learning_rate": 7.56016439352757e-06,
"loss": 0.4536,
"step": 529
},
{
"epoch": 1.1903425042111173,
"grad_norm": 0.3396730422973633,
"learning_rate": 7.548921172785038e-06,
"loss": 0.4604,
"step": 530
},
{
"epoch": 1.192588433464346,
"grad_norm": 0.3063504695892334,
"learning_rate": 7.537660511102516e-06,
"loss": 0.4371,
"step": 531
},
{
"epoch": 1.1948343627175744,
"grad_norm": 0.30634409189224243,
"learning_rate": 7.526382485530848e-06,
"loss": 0.4547,
"step": 532
},
{
"epoch": 1.197080291970803,
"grad_norm": 0.28994691371917725,
"learning_rate": 7.51508717323969e-06,
"loss": 0.4474,
"step": 533
},
{
"epoch": 1.1993262212240314,
"grad_norm": 0.31030574440956116,
"learning_rate": 7.5037746515169795e-06,
"loss": 0.4382,
"step": 534
},
{
"epoch": 1.20157215047726,
"grad_norm": 0.29604753851890564,
"learning_rate": 7.492444997768412e-06,
"loss": 0.4641,
"step": 535
},
{
"epoch": 1.2038180797304885,
"grad_norm": 0.305606484413147,
"learning_rate": 7.481098289516906e-06,
"loss": 0.45,
"step": 536
},
{
"epoch": 1.206064008983717,
"grad_norm": 0.28157690167427063,
"learning_rate": 7.469734604402076e-06,
"loss": 0.447,
"step": 537
},
{
"epoch": 1.2083099382369455,
"grad_norm": 0.31427818536758423,
"learning_rate": 7.4583540201797015e-06,
"loss": 0.4486,
"step": 538
},
{
"epoch": 1.210555867490174,
"grad_norm": 0.3320254683494568,
"learning_rate": 7.446956614721191e-06,
"loss": 0.4491,
"step": 539
},
{
"epoch": 1.2128017967434026,
"grad_norm": 0.2562301456928253,
"learning_rate": 7.435542466013057e-06,
"loss": 0.4262,
"step": 540
},
{
"epoch": 1.215047725996631,
"grad_norm": 0.2971283495426178,
"learning_rate": 7.424111652156369e-06,
"loss": 0.4471,
"step": 541
},
{
"epoch": 1.2172936552498597,
"grad_norm": 0.3181101977825165,
"learning_rate": 7.412664251366239e-06,
"loss": 0.4607,
"step": 542
},
{
"epoch": 1.219539584503088,
"grad_norm": 0.3226609230041504,
"learning_rate": 7.401200341971263e-06,
"loss": 0.4556,
"step": 543
},
{
"epoch": 1.2217855137563167,
"grad_norm": 0.3116491734981537,
"learning_rate": 7.389720002413003e-06,
"loss": 0.4349,
"step": 544
},
{
"epoch": 1.2240314430095451,
"grad_norm": 0.33195728063583374,
"learning_rate": 7.378223311245447e-06,
"loss": 0.4371,
"step": 545
},
{
"epoch": 1.2262773722627738,
"grad_norm": 0.27619820833206177,
"learning_rate": 7.3667103471344585e-06,
"loss": 0.4381,
"step": 546
},
{
"epoch": 1.2285233015160022,
"grad_norm": 0.29046374559402466,
"learning_rate": 7.355181188857258e-06,
"loss": 0.4515,
"step": 547
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.31919410824775696,
"learning_rate": 7.343635915301872e-06,
"loss": 0.4488,
"step": 548
},
{
"epoch": 1.2330151600224593,
"grad_norm": 0.3814048171043396,
"learning_rate": 7.33207460546659e-06,
"loss": 0.4749,
"step": 549
},
{
"epoch": 1.235261089275688,
"grad_norm": 0.3012455403804779,
"learning_rate": 7.3204973384594365e-06,
"loss": 0.4498,
"step": 550
},
{
"epoch": 1.2375070185289163,
"grad_norm": 0.35009750723838806,
"learning_rate": 7.3089041934976216e-06,
"loss": 0.469,
"step": 551
},
{
"epoch": 1.239752947782145,
"grad_norm": 0.24640639126300812,
"learning_rate": 7.297295249906992e-06,
"loss": 0.4148,
"step": 552
},
{
"epoch": 1.2419988770353734,
"grad_norm": 0.315764844417572,
"learning_rate": 7.285670587121508e-06,
"loss": 0.4464,
"step": 553
},
{
"epoch": 1.2442448062886018,
"grad_norm": 0.2749885618686676,
"learning_rate": 7.274030284682679e-06,
"loss": 0.4436,
"step": 554
},
{
"epoch": 1.2464907355418304,
"grad_norm": 0.2588658630847931,
"learning_rate": 7.262374422239033e-06,
"loss": 0.4639,
"step": 555
},
{
"epoch": 1.248736664795059,
"grad_norm": 0.3450206518173218,
"learning_rate": 7.250703079545566e-06,
"loss": 0.4403,
"step": 556
},
{
"epoch": 1.2509825940482875,
"grad_norm": 0.264999657869339,
"learning_rate": 7.2390163364631945e-06,
"loss": 0.4634,
"step": 557
},
{
"epoch": 1.253228523301516,
"grad_norm": 0.28712841868400574,
"learning_rate": 7.22731427295822e-06,
"loss": 0.4049,
"step": 558
},
{
"epoch": 1.2554744525547445,
"grad_norm": 0.2988751530647278,
"learning_rate": 7.215596969101762e-06,
"loss": 0.4507,
"step": 559
},
{
"epoch": 1.2577203818079732,
"grad_norm": 0.29097434878349304,
"learning_rate": 7.2038645050692315e-06,
"loss": 0.4418,
"step": 560
},
{
"epoch": 1.2599663110612016,
"grad_norm": 0.2874724268913269,
"learning_rate": 7.192116961139769e-06,
"loss": 0.4603,
"step": 561
},
{
"epoch": 1.26221224031443,
"grad_norm": 0.2682914435863495,
"learning_rate": 7.180354417695696e-06,
"loss": 0.4487,
"step": 562
},
{
"epoch": 1.2644581695676587,
"grad_norm": 0.29097360372543335,
"learning_rate": 7.168576955221975e-06,
"loss": 0.4323,
"step": 563
},
{
"epoch": 1.266704098820887,
"grad_norm": 0.28627073764801025,
"learning_rate": 7.1567846543056445e-06,
"loss": 0.4651,
"step": 564
},
{
"epoch": 1.2689500280741157,
"grad_norm": 0.29652172327041626,
"learning_rate": 7.144977595635278e-06,
"loss": 0.4369,
"step": 565
},
{
"epoch": 1.2711959573273441,
"grad_norm": 0.2619209885597229,
"learning_rate": 7.133155860000429e-06,
"loss": 0.4486,
"step": 566
},
{
"epoch": 1.2734418865805728,
"grad_norm": 0.28887611627578735,
"learning_rate": 7.121319528291077e-06,
"loss": 0.4568,
"step": 567
},
{
"epoch": 1.2756878158338012,
"grad_norm": 0.27822646498680115,
"learning_rate": 7.109468681497076e-06,
"loss": 0.4434,
"step": 568
},
{
"epoch": 1.2779337450870298,
"grad_norm": 0.3062475323677063,
"learning_rate": 7.097603400707595e-06,
"loss": 0.4635,
"step": 569
},
{
"epoch": 1.2801796743402583,
"grad_norm": 0.27848997712135315,
"learning_rate": 7.0857237671105735e-06,
"loss": 0.4504,
"step": 570
},
{
"epoch": 1.2824256035934867,
"grad_norm": 0.2792271375656128,
"learning_rate": 7.0738298619921565e-06,
"loss": 0.4364,
"step": 571
},
{
"epoch": 1.2846715328467153,
"grad_norm": 0.28332486748695374,
"learning_rate": 7.06192176673614e-06,
"loss": 0.4722,
"step": 572
},
{
"epoch": 1.286917462099944,
"grad_norm": 0.2763806879520416,
"learning_rate": 7.0499995628234195e-06,
"loss": 0.4313,
"step": 573
},
{
"epoch": 1.2891633913531724,
"grad_norm": 0.2765560746192932,
"learning_rate": 7.038063331831425e-06,
"loss": 0.4414,
"step": 574
},
{
"epoch": 1.2914093206064008,
"grad_norm": 0.2661452889442444,
"learning_rate": 7.026113155433569e-06,
"loss": 0.4559,
"step": 575
},
{
"epoch": 1.2936552498596294,
"grad_norm": 0.2632508873939514,
"learning_rate": 7.0141491153986856e-06,
"loss": 0.4591,
"step": 576
},
{
"epoch": 1.295901179112858,
"grad_norm": 0.24122297763824463,
"learning_rate": 7.002171293590467e-06,
"loss": 0.4396,
"step": 577
},
{
"epoch": 1.2981471083660865,
"grad_norm": 0.2598783075809479,
"learning_rate": 6.990179771966911e-06,
"loss": 0.4138,
"step": 578
},
{
"epoch": 1.300393037619315,
"grad_norm": 0.2668991982936859,
"learning_rate": 6.978174632579754e-06,
"loss": 0.4601,
"step": 579
},
{
"epoch": 1.3026389668725435,
"grad_norm": 0.2742937505245209,
"learning_rate": 6.966155957573911e-06,
"loss": 0.4214,
"step": 580
},
{
"epoch": 1.304884896125772,
"grad_norm": 0.31684938073158264,
"learning_rate": 6.954123829186917e-06,
"loss": 0.4655,
"step": 581
},
{
"epoch": 1.3071308253790006,
"grad_norm": 0.2928871810436249,
"learning_rate": 6.9420783297483575e-06,
"loss": 0.4494,
"step": 582
},
{
"epoch": 1.309376754632229,
"grad_norm": 0.32177117466926575,
"learning_rate": 6.930019541679314e-06,
"loss": 0.441,
"step": 583
},
{
"epoch": 1.3116226838854577,
"grad_norm": 0.3396602272987366,
"learning_rate": 6.917947547491789e-06,
"loss": 0.4638,
"step": 584
},
{
"epoch": 1.313868613138686,
"grad_norm": 0.3194241225719452,
"learning_rate": 6.9058624297881525e-06,
"loss": 0.4381,
"step": 585
},
{
"epoch": 1.3161145423919147,
"grad_norm": 0.3782861828804016,
"learning_rate": 6.893764271260572e-06,
"loss": 0.4582,
"step": 586
},
{
"epoch": 1.3183604716451431,
"grad_norm": 0.2568625807762146,
"learning_rate": 6.881653154690445e-06,
"loss": 0.4211,
"step": 587
},
{
"epoch": 1.3206064008983718,
"grad_norm": 0.3422069847583771,
"learning_rate": 6.869529162947831e-06,
"loss": 0.4402,
"step": 588
},
{
"epoch": 1.3228523301516002,
"grad_norm": 0.30332496762275696,
"learning_rate": 6.857392378990895e-06,
"loss": 0.4683,
"step": 589
},
{
"epoch": 1.3250982594048288,
"grad_norm": 0.32135963439941406,
"learning_rate": 6.845242885865324e-06,
"loss": 0.4586,
"step": 590
},
{
"epoch": 1.3273441886580573,
"grad_norm": 0.32824966311454773,
"learning_rate": 6.833080766703776e-06,
"loss": 0.458,
"step": 591
},
{
"epoch": 1.3295901179112857,
"grad_norm": 0.3076978921890259,
"learning_rate": 6.820906104725293e-06,
"loss": 0.4597,
"step": 592
},
{
"epoch": 1.3318360471645143,
"grad_norm": 0.2813679873943329,
"learning_rate": 6.808718983234748e-06,
"loss": 0.4311,
"step": 593
},
{
"epoch": 1.334081976417743,
"grad_norm": 0.3031136095523834,
"learning_rate": 6.796519485622267e-06,
"loss": 0.4575,
"step": 594
},
{
"epoch": 1.3363279056709714,
"grad_norm": 0.30593588948249817,
"learning_rate": 6.7843076953626555e-06,
"loss": 0.4269,
"step": 595
},
{
"epoch": 1.3385738349241998,
"grad_norm": 0.29532647132873535,
"learning_rate": 6.7720836960148376e-06,
"loss": 0.437,
"step": 596
},
{
"epoch": 1.3408197641774284,
"grad_norm": 0.2953149378299713,
"learning_rate": 6.7598475712212695e-06,
"loss": 0.4429,
"step": 597
},
{
"epoch": 1.343065693430657,
"grad_norm": 0.2667207419872284,
"learning_rate": 6.747599404707382e-06,
"loss": 0.4464,
"step": 598
},
{
"epoch": 1.3453116226838855,
"grad_norm": 0.35777759552001953,
"learning_rate": 6.735339280281001e-06,
"loss": 0.4632,
"step": 599
},
{
"epoch": 1.347557551937114,
"grad_norm": 0.26391759514808655,
"learning_rate": 6.72306728183177e-06,
"loss": 0.4384,
"step": 600
},
{
"epoch": 1.3498034811903425,
"grad_norm": 0.3116670846939087,
"learning_rate": 6.710783493330583e-06,
"loss": 0.4627,
"step": 601
},
{
"epoch": 1.352049410443571,
"grad_norm": 0.2874084413051605,
"learning_rate": 6.698487998829007e-06,
"loss": 0.4705,
"step": 602
},
{
"epoch": 1.3542953396967996,
"grad_norm": 0.2724873125553131,
"learning_rate": 6.686180882458705e-06,
"loss": 0.4129,
"step": 603
},
{
"epoch": 1.356541268950028,
"grad_norm": 0.3315389156341553,
"learning_rate": 6.673862228430867e-06,
"loss": 0.4471,
"step": 604
},
{
"epoch": 1.3587871982032567,
"grad_norm": 0.32733264565467834,
"learning_rate": 6.661532121035624e-06,
"loss": 0.4529,
"step": 605
},
{
"epoch": 1.361033127456485,
"grad_norm": 0.31259867548942566,
"learning_rate": 6.649190644641482e-06,
"loss": 0.4225,
"step": 606
},
{
"epoch": 1.3632790567097137,
"grad_norm": 0.3450546860694885,
"learning_rate": 6.636837883694735e-06,
"loss": 0.4468,
"step": 607
},
{
"epoch": 1.3655249859629421,
"grad_norm": 0.33732178807258606,
"learning_rate": 6.624473922718888e-06,
"loss": 0.4607,
"step": 608
},
{
"epoch": 1.3677709152161706,
"grad_norm": 0.2904933989048004,
"learning_rate": 6.6120988463140925e-06,
"loss": 0.4242,
"step": 609
},
{
"epoch": 1.3700168444693992,
"grad_norm": 0.30185356736183167,
"learning_rate": 6.599712739156546e-06,
"loss": 0.4398,
"step": 610
},
{
"epoch": 1.3722627737226278,
"grad_norm": 0.2974070906639099,
"learning_rate": 6.587315685997931e-06,
"loss": 0.4482,
"step": 611
},
{
"epoch": 1.3745087029758563,
"grad_norm": 0.3085421919822693,
"learning_rate": 6.574907771664826e-06,
"loss": 0.4338,
"step": 612
},
{
"epoch": 1.3767546322290847,
"grad_norm": 0.2998266816139221,
"learning_rate": 6.5624890810581225e-06,
"loss": 0.4387,
"step": 613
},
{
"epoch": 1.3790005614823133,
"grad_norm": 0.39851927757263184,
"learning_rate": 6.5500596991524556e-06,
"loss": 0.4531,
"step": 614
},
{
"epoch": 1.381246490735542,
"grad_norm": 0.2550167143344879,
"learning_rate": 6.537619710995611e-06,
"loss": 0.4192,
"step": 615
},
{
"epoch": 1.3834924199887704,
"grad_norm": 0.4163671135902405,
"learning_rate": 6.525169201707946e-06,
"loss": 0.4707,
"step": 616
},
{
"epoch": 1.3857383492419988,
"grad_norm": 0.3337157666683197,
"learning_rate": 6.512708256481814e-06,
"loss": 0.4429,
"step": 617
},
{
"epoch": 1.3879842784952274,
"grad_norm": 0.43529441952705383,
"learning_rate": 6.500236960580973e-06,
"loss": 0.4496,
"step": 618
},
{
"epoch": 1.3902302077484558,
"grad_norm": 0.26580479741096497,
"learning_rate": 6.487755399340005e-06,
"loss": 0.4069,
"step": 619
},
{
"epoch": 1.3924761370016845,
"grad_norm": 0.3973635137081146,
"learning_rate": 6.475263658163729e-06,
"loss": 0.4457,
"step": 620
},
{
"epoch": 1.394722066254913,
"grad_norm": 0.42304566502571106,
"learning_rate": 6.462761822526627e-06,
"loss": 0.4589,
"step": 621
},
{
"epoch": 1.3969679955081415,
"grad_norm": 0.3066060543060303,
"learning_rate": 6.450249977972247e-06,
"loss": 0.4118,
"step": 622
},
{
"epoch": 1.39921392476137,
"grad_norm": 0.4160257577896118,
"learning_rate": 6.437728210112626e-06,
"loss": 0.4471,
"step": 623
},
{
"epoch": 1.4014598540145986,
"grad_norm": 0.34768301248550415,
"learning_rate": 6.4251966046277e-06,
"loss": 0.4369,
"step": 624
},
{
"epoch": 1.403705783267827,
"grad_norm": 0.34642931818962097,
"learning_rate": 6.412655247264718e-06,
"loss": 0.4467,
"step": 625
},
{
"epoch": 1.4059517125210557,
"grad_norm": 0.3499101400375366,
"learning_rate": 6.4001042238376534e-06,
"loss": 0.4241,
"step": 626
},
{
"epoch": 1.408197641774284,
"grad_norm": 0.40661197900772095,
"learning_rate": 6.387543620226626e-06,
"loss": 0.4675,
"step": 627
},
{
"epoch": 1.4104435710275127,
"grad_norm": 0.3330638110637665,
"learning_rate": 6.374973522377303e-06,
"loss": 0.4507,
"step": 628
},
{
"epoch": 1.4126895002807411,
"grad_norm": 0.3860412538051605,
"learning_rate": 6.362394016300315e-06,
"loss": 0.4555,
"step": 629
},
{
"epoch": 1.4149354295339696,
"grad_norm": 0.3007884621620178,
"learning_rate": 6.3498051880706726e-06,
"loss": 0.4482,
"step": 630
},
{
"epoch": 1.4171813587871982,
"grad_norm": 0.3595775365829468,
"learning_rate": 6.337207123827169e-06,
"loss": 0.4325,
"step": 631
},
{
"epoch": 1.4194272880404268,
"grad_norm": 0.3329215943813324,
"learning_rate": 6.324599909771798e-06,
"loss": 0.4644,
"step": 632
},
{
"epoch": 1.4216732172936553,
"grad_norm": 0.2800936698913574,
"learning_rate": 6.311983632169157e-06,
"loss": 0.429,
"step": 633
},
{
"epoch": 1.4239191465468837,
"grad_norm": 0.3583846688270569,
"learning_rate": 6.299358377345864e-06,
"loss": 0.4461,
"step": 634
},
{
"epoch": 1.4261650758001123,
"grad_norm": 0.3122238218784332,
"learning_rate": 6.2867242316899615e-06,
"loss": 0.4805,
"step": 635
},
{
"epoch": 1.428411005053341,
"grad_norm": 0.325324684381485,
"learning_rate": 6.2740812816503264e-06,
"loss": 0.4169,
"step": 636
},
{
"epoch": 1.4306569343065694,
"grad_norm": 0.28409814834594727,
"learning_rate": 6.261429613736082e-06,
"loss": 0.4567,
"step": 637
},
{
"epoch": 1.4329028635597978,
"grad_norm": 0.29375067353248596,
"learning_rate": 6.248769314516002e-06,
"loss": 0.4465,
"step": 638
},
{
"epoch": 1.4351487928130264,
"grad_norm": 0.3233538866043091,
"learning_rate": 6.2361004706179195e-06,
"loss": 0.4702,
"step": 639
},
{
"epoch": 1.4373947220662548,
"grad_norm": 0.2539404332637787,
"learning_rate": 6.223423168728136e-06,
"loss": 0.4403,
"step": 640
},
{
"epoch": 1.4396406513194835,
"grad_norm": 0.26419639587402344,
"learning_rate": 6.210737495590825e-06,
"loss": 0.4324,
"step": 641
},
{
"epoch": 1.441886580572712,
"grad_norm": 0.25423571467399597,
"learning_rate": 6.198043538007441e-06,
"loss": 0.4401,
"step": 642
},
{
"epoch": 1.4441325098259405,
"grad_norm": 0.3024260997772217,
"learning_rate": 6.185341382836121e-06,
"loss": 0.4618,
"step": 643
},
{
"epoch": 1.446378439079169,
"grad_norm": 0.27369245886802673,
"learning_rate": 6.1726311169911e-06,
"loss": 0.4423,
"step": 644
},
{
"epoch": 1.4486243683323976,
"grad_norm": 0.2825721204280853,
"learning_rate": 6.159912827442107e-06,
"loss": 0.4416,
"step": 645
},
{
"epoch": 1.450870297585626,
"grad_norm": 0.29679155349731445,
"learning_rate": 6.147186601213773e-06,
"loss": 0.4949,
"step": 646
},
{
"epoch": 1.4531162268388544,
"grad_norm": 0.30457913875579834,
"learning_rate": 6.134452525385035e-06,
"loss": 0.4387,
"step": 647
},
{
"epoch": 1.455362156092083,
"grad_norm": 0.26383036375045776,
"learning_rate": 6.12171068708854e-06,
"loss": 0.4454,
"step": 648
},
{
"epoch": 1.4576080853453117,
"grad_norm": 0.3353641629219055,
"learning_rate": 6.108961173510052e-06,
"loss": 0.4302,
"step": 649
},
{
"epoch": 1.4598540145985401,
"grad_norm": 0.2700467109680176,
"learning_rate": 6.096204071887854e-06,
"loss": 0.4459,
"step": 650
},
{
"epoch": 1.4620999438517686,
"grad_norm": 0.2580196261405945,
"learning_rate": 6.083439469512146e-06,
"loss": 0.4426,
"step": 651
},
{
"epoch": 1.4643458731049972,
"grad_norm": 0.2723543643951416,
"learning_rate": 6.0706674537244535e-06,
"loss": 0.4379,
"step": 652
},
{
"epoch": 1.4665918023582258,
"grad_norm": 0.2748951017856598,
"learning_rate": 6.057888111917028e-06,
"loss": 0.4498,
"step": 653
},
{
"epoch": 1.4688377316114543,
"grad_norm": 0.2623066008090973,
"learning_rate": 6.0451015315322515e-06,
"loss": 0.4373,
"step": 654
},
{
"epoch": 1.4710836608646827,
"grad_norm": 0.2672736644744873,
"learning_rate": 6.032307800062032e-06,
"loss": 0.4409,
"step": 655
},
{
"epoch": 1.4733295901179113,
"grad_norm": 0.2850876450538635,
"learning_rate": 6.019507005047209e-06,
"loss": 0.4612,
"step": 656
},
{
"epoch": 1.4755755193711397,
"grad_norm": 0.30435261130332947,
"learning_rate": 6.0066992340769606e-06,
"loss": 0.4716,
"step": 657
},
{
"epoch": 1.4778214486243684,
"grad_norm": 0.24608232080936432,
"learning_rate": 5.993884574788186e-06,
"loss": 0.4315,
"step": 658
},
{
"epoch": 1.4800673778775968,
"grad_norm": 0.2793516516685486,
"learning_rate": 5.981063114864928e-06,
"loss": 0.4404,
"step": 659
},
{
"epoch": 1.4823133071308254,
"grad_norm": 0.2838444113731384,
"learning_rate": 5.96823494203776e-06,
"loss": 0.4339,
"step": 660
},
{
"epoch": 1.4845592363840538,
"grad_norm": 0.2751578092575073,
"learning_rate": 5.955400144083183e-06,
"loss": 0.4555,
"step": 661
},
{
"epoch": 1.4868051656372825,
"grad_norm": 0.312559574842453,
"learning_rate": 5.942558808823039e-06,
"loss": 0.4512,
"step": 662
},
{
"epoch": 1.489051094890511,
"grad_norm": 0.2821672260761261,
"learning_rate": 5.929711024123894e-06,
"loss": 0.4523,
"step": 663
},
{
"epoch": 1.4912970241437395,
"grad_norm": 0.2883569896221161,
"learning_rate": 5.916856877896447e-06,
"loss": 0.425,
"step": 664
},
{
"epoch": 1.493542953396968,
"grad_norm": 0.2930947244167328,
"learning_rate": 5.903996458094928e-06,
"loss": 0.4528,
"step": 665
},
{
"epoch": 1.4957888826501966,
"grad_norm": 0.2596952021121979,
"learning_rate": 5.89112985271649e-06,
"loss": 0.448,
"step": 666
},
{
"epoch": 1.498034811903425,
"grad_norm": 0.2668738067150116,
"learning_rate": 5.878257149800609e-06,
"loss": 0.4581,
"step": 667
},
{
"epoch": 1.5002807411566534,
"grad_norm": 0.2872879207134247,
"learning_rate": 5.865378437428491e-06,
"loss": 0.4565,
"step": 668
},
{
"epoch": 1.502526670409882,
"grad_norm": 0.27810871601104736,
"learning_rate": 5.8524938037224555e-06,
"loss": 0.4348,
"step": 669
},
{
"epoch": 1.5047725996631107,
"grad_norm": 0.29902833700180054,
"learning_rate": 5.83960333684534e-06,
"loss": 0.4692,
"step": 670
},
{
"epoch": 1.5070185289163391,
"grad_norm": 0.271638959646225,
"learning_rate": 5.826707124999893e-06,
"loss": 0.4315,
"step": 671
},
{
"epoch": 1.5092644581695676,
"grad_norm": 0.301960825920105,
"learning_rate": 5.813805256428177e-06,
"loss": 0.4393,
"step": 672
},
{
"epoch": 1.5115103874227962,
"grad_norm": 0.28544798493385315,
"learning_rate": 5.800897819410961e-06,
"loss": 0.4597,
"step": 673
},
{
"epoch": 1.5137563166760248,
"grad_norm": 0.2677849531173706,
"learning_rate": 5.787984902267111e-06,
"loss": 0.4204,
"step": 674
},
{
"epoch": 1.5160022459292533,
"grad_norm": 0.2877887189388275,
"learning_rate": 5.775066593352994e-06,
"loss": 0.4491,
"step": 675
},
{
"epoch": 1.5182481751824817,
"grad_norm": 0.27290868759155273,
"learning_rate": 5.762142981061869e-06,
"loss": 0.4318,
"step": 676
},
{
"epoch": 1.5204941044357103,
"grad_norm": 0.2793848514556885,
"learning_rate": 5.749214153823284e-06,
"loss": 0.4631,
"step": 677
},
{
"epoch": 1.522740033688939,
"grad_norm": 0.27665579319000244,
"learning_rate": 5.736280200102471e-06,
"loss": 0.441,
"step": 678
},
{
"epoch": 1.5249859629421674,
"grad_norm": 0.26563090085983276,
"learning_rate": 5.723341208399737e-06,
"loss": 0.4231,
"step": 679
},
{
"epoch": 1.5272318921953958,
"grad_norm": 0.28303608298301697,
"learning_rate": 5.7103972672498645e-06,
"loss": 0.4622,
"step": 680
},
{
"epoch": 1.5294778214486242,
"grad_norm": 0.2486550211906433,
"learning_rate": 5.697448465221499e-06,
"loss": 0.4509,
"step": 681
},
{
"epoch": 1.5317237507018528,
"grad_norm": 0.26522529125213623,
"learning_rate": 5.684494890916551e-06,
"loss": 0.4512,
"step": 682
},
{
"epoch": 1.5339696799550815,
"grad_norm": 0.2896977365016937,
"learning_rate": 5.6715366329695805e-06,
"loss": 0.4344,
"step": 683
},
{
"epoch": 1.53621560920831,
"grad_norm": 0.28568655252456665,
"learning_rate": 5.658573780047197e-06,
"loss": 0.4713,
"step": 684
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.2812296152114868,
"learning_rate": 5.645606420847454e-06,
"loss": 0.4279,
"step": 685
},
{
"epoch": 1.540707467714767,
"grad_norm": 0.2628013789653778,
"learning_rate": 5.632634644099235e-06,
"loss": 0.4428,
"step": 686
},
{
"epoch": 1.5429533969679956,
"grad_norm": 0.27226313948631287,
"learning_rate": 5.6196585385616505e-06,
"loss": 0.4366,
"step": 687
},
{
"epoch": 1.545199326221224,
"grad_norm": 0.2939417362213135,
"learning_rate": 5.606678193023436e-06,
"loss": 0.4775,
"step": 688
},
{
"epoch": 1.5474452554744524,
"grad_norm": 0.3119971752166748,
"learning_rate": 5.593693696302333e-06,
"loss": 0.4658,
"step": 689
},
{
"epoch": 1.549691184727681,
"grad_norm": 0.23466715216636658,
"learning_rate": 5.580705137244488e-06,
"loss": 0.4282,
"step": 690
},
{
"epoch": 1.5519371139809097,
"grad_norm": 0.32123836874961853,
"learning_rate": 5.567712604723846e-06,
"loss": 0.4383,
"step": 691
},
{
"epoch": 1.5541830432341381,
"grad_norm": 0.28472721576690674,
"learning_rate": 5.5547161876415435e-06,
"loss": 0.444,
"step": 692
},
{
"epoch": 1.5564289724873666,
"grad_norm": 0.3107893168926239,
"learning_rate": 5.54171597492529e-06,
"loss": 0.4578,
"step": 693
},
{
"epoch": 1.5586749017405952,
"grad_norm": 0.29814159870147705,
"learning_rate": 5.52871205552877e-06,
"loss": 0.4509,
"step": 694
},
{
"epoch": 1.5609208309938238,
"grad_norm": 0.2612503170967102,
"learning_rate": 5.515704518431033e-06,
"loss": 0.4284,
"step": 695
},
{
"epoch": 1.5631667602470523,
"grad_norm": 0.2933982014656067,
"learning_rate": 5.50269345263588e-06,
"loss": 0.4382,
"step": 696
},
{
"epoch": 1.5654126895002807,
"grad_norm": 0.24303555488586426,
"learning_rate": 5.489678947171255e-06,
"loss": 0.4317,
"step": 697
},
{
"epoch": 1.5676586187535093,
"grad_norm": 0.25020086765289307,
"learning_rate": 5.4766610910886445e-06,
"loss": 0.4396,
"step": 698
},
{
"epoch": 1.5699045480067377,
"grad_norm": 0.2751081883907318,
"learning_rate": 5.4636399734624534e-06,
"loss": 0.4557,
"step": 699
},
{
"epoch": 1.5721504772599664,
"grad_norm": 0.26188722252845764,
"learning_rate": 5.450615683389408e-06,
"loss": 0.4092,
"step": 700
},
{
"epoch": 1.5743964065131948,
"grad_norm": 0.31535235047340393,
"learning_rate": 5.437588309987945e-06,
"loss": 0.4918,
"step": 701
},
{
"epoch": 1.5766423357664232,
"grad_norm": 0.2722760736942291,
"learning_rate": 5.424557942397593e-06,
"loss": 0.4208,
"step": 702
},
{
"epoch": 1.5788882650196518,
"grad_norm": 0.3277275562286377,
"learning_rate": 5.411524669778369e-06,
"loss": 0.4578,
"step": 703
},
{
"epoch": 1.5811341942728805,
"grad_norm": 0.24588078260421753,
"learning_rate": 5.398488581310172e-06,
"loss": 0.4456,
"step": 704
},
{
"epoch": 1.583380123526109,
"grad_norm": 0.2953939139842987,
"learning_rate": 5.385449766192164e-06,
"loss": 0.4503,
"step": 705
},
{
"epoch": 1.5856260527793373,
"grad_norm": 0.2831403613090515,
"learning_rate": 5.372408313642168e-06,
"loss": 0.4403,
"step": 706
},
{
"epoch": 1.587871982032566,
"grad_norm": 0.2721308767795563,
"learning_rate": 5.359364312896047e-06,
"loss": 0.4243,
"step": 707
},
{
"epoch": 1.5901179112857946,
"grad_norm": 0.283263623714447,
"learning_rate": 5.346317853207108e-06,
"loss": 0.4658,
"step": 708
},
{
"epoch": 1.592363840539023,
"grad_norm": 0.2844542860984802,
"learning_rate": 5.333269023845478e-06,
"loss": 0.4366,
"step": 709
},
{
"epoch": 1.5946097697922514,
"grad_norm": 0.2929394245147705,
"learning_rate": 5.320217914097498e-06,
"loss": 0.4604,
"step": 710
},
{
"epoch": 1.59685569904548,
"grad_norm": 0.2344074845314026,
"learning_rate": 5.307164613265119e-06,
"loss": 0.4172,
"step": 711
},
{
"epoch": 1.5991016282987087,
"grad_norm": 0.2725594639778137,
"learning_rate": 5.294109210665275e-06,
"loss": 0.4322,
"step": 712
},
{
"epoch": 1.6013475575519371,
"grad_norm": 0.27773675322532654,
"learning_rate": 5.281051795629289e-06,
"loss": 0.454,
"step": 713
},
{
"epoch": 1.6035934868051656,
"grad_norm": 0.28249219059944153,
"learning_rate": 5.26799245750225e-06,
"loss": 0.4216,
"step": 714
},
{
"epoch": 1.6058394160583942,
"grad_norm": 0.3116042912006378,
"learning_rate": 5.254931285642406e-06,
"loss": 0.4531,
"step": 715
},
{
"epoch": 1.6080853453116228,
"grad_norm": 0.2770937383174896,
"learning_rate": 5.2418683694205574e-06,
"loss": 0.4509,
"step": 716
},
{
"epoch": 1.6103312745648513,
"grad_norm": 0.3380868136882782,
"learning_rate": 5.228803798219432e-06,
"loss": 0.4492,
"step": 717
},
{
"epoch": 1.6125772038180797,
"grad_norm": 0.2792224586009979,
"learning_rate": 5.215737661433087e-06,
"loss": 0.4572,
"step": 718
},
{
"epoch": 1.614823133071308,
"grad_norm": 0.2865675389766693,
"learning_rate": 5.20267004846629e-06,
"loss": 0.441,
"step": 719
},
{
"epoch": 1.6170690623245367,
"grad_norm": 0.29234838485717773,
"learning_rate": 5.189601048733912e-06,
"loss": 0.4337,
"step": 720
},
{
"epoch": 1.6193149915777654,
"grad_norm": 0.2698359489440918,
"learning_rate": 5.17653075166031e-06,
"loss": 0.45,
"step": 721
},
{
"epoch": 1.6215609208309938,
"grad_norm": 0.3000829517841339,
"learning_rate": 5.16345924667872e-06,
"loss": 0.4387,
"step": 722
},
{
"epoch": 1.6238068500842222,
"grad_norm": 0.3248939514160156,
"learning_rate": 5.150386623230643e-06,
"loss": 0.4733,
"step": 723
},
{
"epoch": 1.6260527793374508,
"grad_norm": 0.24670802056789398,
"learning_rate": 5.137312970765232e-06,
"loss": 0.4398,
"step": 724
},
{
"epoch": 1.6282987085906795,
"grad_norm": 0.28131037950515747,
"learning_rate": 5.12423837873868e-06,
"loss": 0.4413,
"step": 725
},
{
"epoch": 1.630544637843908,
"grad_norm": 0.2791185677051544,
"learning_rate": 5.1111629366136115e-06,
"loss": 0.4213,
"step": 726
},
{
"epoch": 1.6327905670971363,
"grad_norm": 0.2748776972293854,
"learning_rate": 5.0980867338584675e-06,
"loss": 0.4322,
"step": 727
},
{
"epoch": 1.635036496350365,
"grad_norm": 0.22908659279346466,
"learning_rate": 5.08500985994689e-06,
"loss": 0.4316,
"step": 728
},
{
"epoch": 1.6372824256035936,
"grad_norm": 0.27406492829322815,
"learning_rate": 5.071932404357119e-06,
"loss": 0.4355,
"step": 729
},
{
"epoch": 1.639528354856822,
"grad_norm": 0.2680008113384247,
"learning_rate": 5.058854456571372e-06,
"loss": 0.4607,
"step": 730
},
{
"epoch": 1.6417742841100504,
"grad_norm": 0.24551571905612946,
"learning_rate": 5.045776106075232e-06,
"loss": 0.4165,
"step": 731
},
{
"epoch": 1.644020213363279,
"grad_norm": 0.2642151117324829,
"learning_rate": 5.032697442357039e-06,
"loss": 0.4548,
"step": 732
},
{
"epoch": 1.6462661426165077,
"grad_norm": 0.2644287049770355,
"learning_rate": 5.019618554907279e-06,
"loss": 0.4169,
"step": 733
},
{
"epoch": 1.6485120718697361,
"grad_norm": 0.24862082302570343,
"learning_rate": 5.0065395332179666e-06,
"loss": 0.4308,
"step": 734
},
{
"epoch": 1.6507580011229646,
"grad_norm": 0.2720666825771332,
"learning_rate": 4.993460466782034e-06,
"loss": 0.465,
"step": 735
},
{
"epoch": 1.6530039303761932,
"grad_norm": 0.355563759803772,
"learning_rate": 4.9803814450927214e-06,
"loss": 0.4572,
"step": 736
},
{
"epoch": 1.6552498596294218,
"grad_norm": 0.22143852710723877,
"learning_rate": 4.967302557642962e-06,
"loss": 0.4203,
"step": 737
},
{
"epoch": 1.6574957888826503,
"grad_norm": 0.30551275610923767,
"learning_rate": 4.954223893924771e-06,
"loss": 0.4391,
"step": 738
},
{
"epoch": 1.6597417181358787,
"grad_norm": 0.23899058997631073,
"learning_rate": 4.94114554342863e-06,
"loss": 0.4523,
"step": 739
},
{
"epoch": 1.661987647389107,
"grad_norm": 0.23506562411785126,
"learning_rate": 4.928067595642882e-06,
"loss": 0.444,
"step": 740
},
{
"epoch": 1.6642335766423357,
"grad_norm": 0.2739086151123047,
"learning_rate": 4.91499014005311e-06,
"loss": 0.4283,
"step": 741
},
{
"epoch": 1.6664795058955644,
"grad_norm": 0.22991512715816498,
"learning_rate": 4.901913266141534e-06,
"loss": 0.4277,
"step": 742
},
{
"epoch": 1.6687254351487928,
"grad_norm": 0.28890857100486755,
"learning_rate": 4.888837063386391e-06,
"loss": 0.4633,
"step": 743
},
{
"epoch": 1.6709713644020212,
"grad_norm": 0.25182008743286133,
"learning_rate": 4.875761621261322e-06,
"loss": 0.4591,
"step": 744
},
{
"epoch": 1.6732172936552498,
"grad_norm": 0.259389191865921,
"learning_rate": 4.862687029234769e-06,
"loss": 0.4407,
"step": 745
},
{
"epoch": 1.6754632229084785,
"grad_norm": 0.24826925992965698,
"learning_rate": 4.849613376769358e-06,
"loss": 0.4582,
"step": 746
},
{
"epoch": 1.677709152161707,
"grad_norm": 0.31528979539871216,
"learning_rate": 4.83654075332128e-06,
"loss": 0.4321,
"step": 747
},
{
"epoch": 1.6799550814149353,
"grad_norm": 0.24880996346473694,
"learning_rate": 4.8234692483396915e-06,
"loss": 0.4298,
"step": 748
},
{
"epoch": 1.682201010668164,
"grad_norm": 0.2553097903728485,
"learning_rate": 4.81039895126609e-06,
"loss": 0.4359,
"step": 749
},
{
"epoch": 1.6844469399213926,
"grad_norm": 0.2735806107521057,
"learning_rate": 4.797329951533712e-06,
"loss": 0.4513,
"step": 750
},
{
"epoch": 1.686692869174621,
"grad_norm": 0.2573295831680298,
"learning_rate": 4.784262338566915e-06,
"loss": 0.4431,
"step": 751
},
{
"epoch": 1.6889387984278494,
"grad_norm": 0.25200626254081726,
"learning_rate": 4.77119620178057e-06,
"loss": 0.453,
"step": 752
},
{
"epoch": 1.691184727681078,
"grad_norm": 0.24043521285057068,
"learning_rate": 4.758131630579446e-06,
"loss": 0.4097,
"step": 753
},
{
"epoch": 1.6934306569343067,
"grad_norm": 0.27149125933647156,
"learning_rate": 4.745068714357595e-06,
"loss": 0.4415,
"step": 754
},
{
"epoch": 1.6956765861875351,
"grad_norm": 0.2776370942592621,
"learning_rate": 4.7320075424977515e-06,
"loss": 0.4653,
"step": 755
},
{
"epoch": 1.6979225154407636,
"grad_norm": 0.29149848222732544,
"learning_rate": 4.718948204370713e-06,
"loss": 0.4206,
"step": 756
},
{
"epoch": 1.700168444693992,
"grad_norm": 0.27004140615463257,
"learning_rate": 4.705890789334726e-06,
"loss": 0.4717,
"step": 757
},
{
"epoch": 1.7024143739472206,
"grad_norm": 0.27363502979278564,
"learning_rate": 4.692835386734884e-06,
"loss": 0.4262,
"step": 758
},
{
"epoch": 1.7046603032004493,
"grad_norm": 0.27881062030792236,
"learning_rate": 4.679782085902503e-06,
"loss": 0.4562,
"step": 759
},
{
"epoch": 1.7069062324536777,
"grad_norm": 0.2494436502456665,
"learning_rate": 4.6667309761545245e-06,
"loss": 0.4537,
"step": 760
},
{
"epoch": 1.709152161706906,
"grad_norm": 0.2262820154428482,
"learning_rate": 4.6536821467928926e-06,
"loss": 0.3919,
"step": 761
},
{
"epoch": 1.7113980909601347,
"grad_norm": 0.25715264678001404,
"learning_rate": 4.6406356871039534e-06,
"loss": 0.4665,
"step": 762
},
{
"epoch": 1.7136440202133634,
"grad_norm": 0.26350539922714233,
"learning_rate": 4.627591686357835e-06,
"loss": 0.4623,
"step": 763
},
{
"epoch": 1.7158899494665918,
"grad_norm": 0.23280011117458344,
"learning_rate": 4.6145502338078365e-06,
"loss": 0.4195,
"step": 764
},
{
"epoch": 1.7181358787198202,
"grad_norm": 0.25985339283943176,
"learning_rate": 4.60151141868983e-06,
"loss": 0.4449,
"step": 765
},
{
"epoch": 1.7203818079730488,
"grad_norm": 0.2784518599510193,
"learning_rate": 4.5884753302216315e-06,
"loss": 0.491,
"step": 766
},
{
"epoch": 1.7226277372262775,
"grad_norm": 0.2532546818256378,
"learning_rate": 4.575442057602408e-06,
"loss": 0.4271,
"step": 767
},
{
"epoch": 1.724873666479506,
"grad_norm": 0.270094633102417,
"learning_rate": 4.562411690012057e-06,
"loss": 0.4388,
"step": 768
},
{
"epoch": 1.7271195957327343,
"grad_norm": 0.2802513837814331,
"learning_rate": 4.549384316610593e-06,
"loss": 0.4443,
"step": 769
},
{
"epoch": 1.729365524985963,
"grad_norm": 0.2635841965675354,
"learning_rate": 4.536360026537548e-06,
"loss": 0.4262,
"step": 770
},
{
"epoch": 1.7316114542391916,
"grad_norm": 0.25495946407318115,
"learning_rate": 4.523338908911358e-06,
"loss": 0.4558,
"step": 771
},
{
"epoch": 1.73385738349242,
"grad_norm": 0.25492119789123535,
"learning_rate": 4.510321052828745e-06,
"loss": 0.4478,
"step": 772
},
{
"epoch": 1.7361033127456484,
"grad_norm": 0.2536661922931671,
"learning_rate": 4.497306547364123e-06,
"loss": 0.473,
"step": 773
},
{
"epoch": 1.738349241998877,
"grad_norm": 0.23842228949069977,
"learning_rate": 4.484295481568968e-06,
"loss": 0.434,
"step": 774
},
{
"epoch": 1.7405951712521057,
"grad_norm": 0.26309531927108765,
"learning_rate": 4.471287944471231e-06,
"loss": 0.4383,
"step": 775
},
{
"epoch": 1.7428411005053341,
"grad_norm": 0.2441006749868393,
"learning_rate": 4.458284025074711e-06,
"loss": 0.4548,
"step": 776
},
{
"epoch": 1.7450870297585626,
"grad_norm": 0.2809121608734131,
"learning_rate": 4.4452838123584565e-06,
"loss": 0.4373,
"step": 777
},
{
"epoch": 1.747332959011791,
"grad_norm": 0.2502027153968811,
"learning_rate": 4.432287395276155e-06,
"loss": 0.4721,
"step": 778
},
{
"epoch": 1.7495788882650196,
"grad_norm": 0.2655166685581207,
"learning_rate": 4.419294862755515e-06,
"loss": 0.4245,
"step": 779
},
{
"epoch": 1.7518248175182483,
"grad_norm": 0.2757239043712616,
"learning_rate": 4.406306303697669e-06,
"loss": 0.4414,
"step": 780
},
{
"epoch": 1.7540707467714767,
"grad_norm": 0.23585571348667145,
"learning_rate": 4.393321806976565e-06,
"loss": 0.4397,
"step": 781
},
{
"epoch": 1.756316676024705,
"grad_norm": 0.25489094853401184,
"learning_rate": 4.380341461438349e-06,
"loss": 0.4496,
"step": 782
},
{
"epoch": 1.7585626052779337,
"grad_norm": 0.2948884665966034,
"learning_rate": 4.3673653559007676e-06,
"loss": 0.4521,
"step": 783
},
{
"epoch": 1.7608085345311624,
"grad_norm": 0.26162976026535034,
"learning_rate": 4.354393579152547e-06,
"loss": 0.409,
"step": 784
},
{
"epoch": 1.7630544637843908,
"grad_norm": 0.27988922595977783,
"learning_rate": 4.3414262199528045e-06,
"loss": 0.4661,
"step": 785
},
{
"epoch": 1.7653003930376192,
"grad_norm": 0.3011482059955597,
"learning_rate": 4.328463367030421e-06,
"loss": 0.4586,
"step": 786
},
{
"epoch": 1.7675463222908478,
"grad_norm": 0.27512040734291077,
"learning_rate": 4.315505109083451e-06,
"loss": 0.4452,
"step": 787
},
{
"epoch": 1.7697922515440765,
"grad_norm": 0.22836817800998688,
"learning_rate": 4.302551534778504e-06,
"loss": 0.4213,
"step": 788
},
{
"epoch": 1.772038180797305,
"grad_norm": 0.3237468898296356,
"learning_rate": 4.289602732750138e-06,
"loss": 0.4307,
"step": 789
},
{
"epoch": 1.7742841100505333,
"grad_norm": 0.2781298756599426,
"learning_rate": 4.276658791600264e-06,
"loss": 0.428,
"step": 790
},
{
"epoch": 1.776530039303762,
"grad_norm": 0.26471009850502014,
"learning_rate": 4.26371979989753e-06,
"loss": 0.4424,
"step": 791
},
{
"epoch": 1.7787759685569906,
"grad_norm": 0.25274160504341125,
"learning_rate": 4.250785846176716e-06,
"loss": 0.4272,
"step": 792
},
{
"epoch": 1.781021897810219,
"grad_norm": 0.2389991134405136,
"learning_rate": 4.237857018938132e-06,
"loss": 0.4469,
"step": 793
},
{
"epoch": 1.7832678270634474,
"grad_norm": 0.2341649830341339,
"learning_rate": 4.224933406647008e-06,
"loss": 0.4175,
"step": 794
},
{
"epoch": 1.7855137563166759,
"grad_norm": 0.2746540606021881,
"learning_rate": 4.212015097732891e-06,
"loss": 0.4406,
"step": 795
},
{
"epoch": 1.7877596855699045,
"grad_norm": 0.2597159445285797,
"learning_rate": 4.1991021805890394e-06,
"loss": 0.4579,
"step": 796
},
{
"epoch": 1.7900056148231331,
"grad_norm": 0.2421720176935196,
"learning_rate": 4.186194743571823e-06,
"loss": 0.4247,
"step": 797
},
{
"epoch": 1.7922515440763616,
"grad_norm": 0.25346839427948,
"learning_rate": 4.173292875000108e-06,
"loss": 0.4471,
"step": 798
},
{
"epoch": 1.79449747332959,
"grad_norm": 0.2318015843629837,
"learning_rate": 4.1603966631546634e-06,
"loss": 0.4357,
"step": 799
},
{
"epoch": 1.7967434025828186,
"grad_norm": 0.23157362639904022,
"learning_rate": 4.147506196277546e-06,
"loss": 0.4507,
"step": 800
},
{
"epoch": 1.7989893318360473,
"grad_norm": 0.2407248169183731,
"learning_rate": 4.13462156257151e-06,
"loss": 0.4502,
"step": 801
},
{
"epoch": 1.8012352610892757,
"grad_norm": 0.24326087534427643,
"learning_rate": 4.121742850199391e-06,
"loss": 0.4505,
"step": 802
},
{
"epoch": 1.803481190342504,
"grad_norm": 0.23502765595912933,
"learning_rate": 4.108870147283512e-06,
"loss": 0.4407,
"step": 803
},
{
"epoch": 1.8057271195957327,
"grad_norm": 0.28090357780456543,
"learning_rate": 4.0960035419050745e-06,
"loss": 0.4359,
"step": 804
},
{
"epoch": 1.8079730488489614,
"grad_norm": 0.22931216657161713,
"learning_rate": 4.083143122103554e-06,
"loss": 0.4145,
"step": 805
},
{
"epoch": 1.8102189781021898,
"grad_norm": 0.24223902821540833,
"learning_rate": 4.070288975876107e-06,
"loss": 0.4556,
"step": 806
},
{
"epoch": 1.8124649073554182,
"grad_norm": 0.2725001871585846,
"learning_rate": 4.0574411911769625e-06,
"loss": 0.4639,
"step": 807
},
{
"epoch": 1.8147108366086468,
"grad_norm": 0.24160481989383698,
"learning_rate": 4.044599855916817e-06,
"loss": 0.4609,
"step": 808
},
{
"epoch": 1.8169567658618755,
"grad_norm": 0.23829206824302673,
"learning_rate": 4.031765057962243e-06,
"loss": 0.427,
"step": 809
},
{
"epoch": 1.819202695115104,
"grad_norm": 0.2611043155193329,
"learning_rate": 4.018936885135074e-06,
"loss": 0.4584,
"step": 810
},
{
"epoch": 1.8214486243683323,
"grad_norm": 0.2420017123222351,
"learning_rate": 4.006115425211816e-06,
"loss": 0.4084,
"step": 811
},
{
"epoch": 1.823694553621561,
"grad_norm": 0.2647510766983032,
"learning_rate": 3.993300765923042e-06,
"loss": 0.453,
"step": 812
},
{
"epoch": 1.8259404828747896,
"grad_norm": 0.218390554189682,
"learning_rate": 3.980492994952792e-06,
"loss": 0.4203,
"step": 813
},
{
"epoch": 1.828186412128018,
"grad_norm": 0.3060971200466156,
"learning_rate": 3.967692199937971e-06,
"loss": 0.4673,
"step": 814
},
{
"epoch": 1.8304323413812464,
"grad_norm": 0.2392362505197525,
"learning_rate": 3.95489846846775e-06,
"loss": 0.436,
"step": 815
},
{
"epoch": 1.8326782706344749,
"grad_norm": 0.22931107878684998,
"learning_rate": 3.9421118880829735e-06,
"loss": 0.4058,
"step": 816
},
{
"epoch": 1.8349241998877035,
"grad_norm": 0.30072271823883057,
"learning_rate": 3.929332546275547e-06,
"loss": 0.4499,
"step": 817
},
{
"epoch": 1.8371701291409321,
"grad_norm": 0.22911213338375092,
"learning_rate": 3.916560530487854e-06,
"loss": 0.4453,
"step": 818
},
{
"epoch": 1.8394160583941606,
"grad_norm": 0.2482576072216034,
"learning_rate": 3.9037959281121474e-06,
"loss": 0.4288,
"step": 819
},
{
"epoch": 1.841661987647389,
"grad_norm": 0.24556680023670197,
"learning_rate": 3.891038826489949e-06,
"loss": 0.4389,
"step": 820
},
{
"epoch": 1.8439079169006176,
"grad_norm": 0.22505217790603638,
"learning_rate": 3.878289312911462e-06,
"loss": 0.4505,
"step": 821
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.2489365190267563,
"learning_rate": 3.865547474614967e-06,
"loss": 0.4563,
"step": 822
},
{
"epoch": 1.8483997754070747,
"grad_norm": 0.2253488451242447,
"learning_rate": 3.852813398786228e-06,
"loss": 0.4099,
"step": 823
},
{
"epoch": 1.850645704660303,
"grad_norm": 0.2778521478176117,
"learning_rate": 3.840087172557894e-06,
"loss": 0.4527,
"step": 824
},
{
"epoch": 1.8528916339135317,
"grad_norm": 0.22189773619174957,
"learning_rate": 3.8273688830089005e-06,
"loss": 0.4205,
"step": 825
},
{
"epoch": 1.8551375631667604,
"grad_norm": 0.2973972260951996,
"learning_rate": 3.8146586171638803e-06,
"loss": 0.4554,
"step": 826
},
{
"epoch": 1.8573834924199888,
"grad_norm": 0.21712501347064972,
"learning_rate": 3.801956461992561e-06,
"loss": 0.4249,
"step": 827
},
{
"epoch": 1.8596294216732172,
"grad_norm": 0.22984138131141663,
"learning_rate": 3.7892625044091747e-06,
"loss": 0.4383,
"step": 828
},
{
"epoch": 1.8618753509264458,
"grad_norm": 0.23754611611366272,
"learning_rate": 3.776576831271865e-06,
"loss": 0.4669,
"step": 829
},
{
"epoch": 1.8641212801796745,
"grad_norm": 0.2339145392179489,
"learning_rate": 3.7638995293820817e-06,
"loss": 0.4167,
"step": 830
},
{
"epoch": 1.866367209432903,
"grad_norm": 0.2435954064130783,
"learning_rate": 3.7512306854839993e-06,
"loss": 0.4558,
"step": 831
},
{
"epoch": 1.8686131386861313,
"grad_norm": 0.23407921195030212,
"learning_rate": 3.73857038626392e-06,
"loss": 0.4524,
"step": 832
},
{
"epoch": 1.87085906793936,
"grad_norm": 0.22388103604316711,
"learning_rate": 3.725918718349675e-06,
"loss": 0.4319,
"step": 833
},
{
"epoch": 1.8731049971925884,
"grad_norm": 0.2634325623512268,
"learning_rate": 3.713275768310041e-06,
"loss": 0.4501,
"step": 834
},
{
"epoch": 1.875350926445817,
"grad_norm": 0.2393648326396942,
"learning_rate": 3.7006416226541375e-06,
"loss": 0.4365,
"step": 835
},
{
"epoch": 1.8775968556990454,
"grad_norm": 0.24255381524562836,
"learning_rate": 3.6880163678308443e-06,
"loss": 0.4521,
"step": 836
},
{
"epoch": 1.8798427849522739,
"grad_norm": 0.2714441120624542,
"learning_rate": 3.6754000902282026e-06,
"loss": 0.4426,
"step": 837
},
{
"epoch": 1.8820887142055025,
"grad_norm": 0.24428033828735352,
"learning_rate": 3.6627928761728315e-06,
"loss": 0.4381,
"step": 838
},
{
"epoch": 1.8843346434587311,
"grad_norm": 0.2361423224210739,
"learning_rate": 3.65019481192933e-06,
"loss": 0.4353,
"step": 839
},
{
"epoch": 1.8865805727119596,
"grad_norm": 0.28072136640548706,
"learning_rate": 3.637605983699687e-06,
"loss": 0.4555,
"step": 840
},
{
"epoch": 1.888826501965188,
"grad_norm": 0.2483406364917755,
"learning_rate": 3.6250264776226995e-06,
"loss": 0.4162,
"step": 841
},
{
"epoch": 1.8910724312184166,
"grad_norm": 0.2832973301410675,
"learning_rate": 3.612456379773376e-06,
"loss": 0.4573,
"step": 842
},
{
"epoch": 1.8933183604716453,
"grad_norm": 0.2808990776538849,
"learning_rate": 3.599895776162347e-06,
"loss": 0.4322,
"step": 843
},
{
"epoch": 1.8955642897248737,
"grad_norm": 0.24022577702999115,
"learning_rate": 3.5873447527352852e-06,
"loss": 0.4376,
"step": 844
},
{
"epoch": 1.897810218978102,
"grad_norm": 0.2281453162431717,
"learning_rate": 3.574803395372301e-06,
"loss": 0.44,
"step": 845
},
{
"epoch": 1.9000561482313307,
"grad_norm": 0.26453354954719543,
"learning_rate": 3.562271789887375e-06,
"loss": 0.4462,
"step": 846
},
{
"epoch": 1.9023020774845594,
"grad_norm": 0.23726797103881836,
"learning_rate": 3.5497500220277535e-06,
"loss": 0.4221,
"step": 847
},
{
"epoch": 1.9045480067377878,
"grad_norm": 0.24078302085399628,
"learning_rate": 3.537238177473375e-06,
"loss": 0.4357,
"step": 848
},
{
"epoch": 1.9067939359910162,
"grad_norm": 0.2368990182876587,
"learning_rate": 3.524736341836272e-06,
"loss": 0.4401,
"step": 849
},
{
"epoch": 1.9090398652442448,
"grad_norm": 0.22038300335407257,
"learning_rate": 3.5122446006599988e-06,
"loss": 0.4522,
"step": 850
},
{
"epoch": 1.9112857944974735,
"grad_norm": 0.23867258429527283,
"learning_rate": 3.499763039419028e-06,
"loss": 0.4319,
"step": 851
},
{
"epoch": 1.913531723750702,
"grad_norm": 0.2527855336666107,
"learning_rate": 3.4872917435181862e-06,
"loss": 0.444,
"step": 852
},
{
"epoch": 1.9157776530039303,
"grad_norm": 0.23553407192230225,
"learning_rate": 3.474830798292054e-06,
"loss": 0.4408,
"step": 853
},
{
"epoch": 1.9180235822571587,
"grad_norm": 0.26365795731544495,
"learning_rate": 3.462380289004391e-06,
"loss": 0.4466,
"step": 854
},
{
"epoch": 1.9202695115103874,
"grad_norm": 0.2614414393901825,
"learning_rate": 3.4499403008475474e-06,
"loss": 0.4437,
"step": 855
},
{
"epoch": 1.922515440763616,
"grad_norm": 0.25481751561164856,
"learning_rate": 3.437510918941879e-06,
"loss": 0.4401,
"step": 856
},
{
"epoch": 1.9247613700168444,
"grad_norm": 0.24284076690673828,
"learning_rate": 3.4250922283351762e-06,
"loss": 0.439,
"step": 857
},
{
"epoch": 1.9270072992700729,
"grad_norm": 0.21187108755111694,
"learning_rate": 3.4126843140020697e-06,
"loss": 0.4261,
"step": 858
},
{
"epoch": 1.9292532285233015,
"grad_norm": 0.2579561769962311,
"learning_rate": 3.400287260843454e-06,
"loss": 0.4456,
"step": 859
},
{
"epoch": 1.9314991577765301,
"grad_norm": 0.24703598022460938,
"learning_rate": 3.3879011536859095e-06,
"loss": 0.4333,
"step": 860
},
{
"epoch": 1.9337450870297586,
"grad_norm": 0.2076679766178131,
"learning_rate": 3.3755260772811135e-06,
"loss": 0.4251,
"step": 861
},
{
"epoch": 1.935991016282987,
"grad_norm": 0.25392812490463257,
"learning_rate": 3.3631621163052673e-06,
"loss": 0.4618,
"step": 862
},
{
"epoch": 1.9382369455362156,
"grad_norm": 0.23472177982330322,
"learning_rate": 3.350809355358518e-06,
"loss": 0.4396,
"step": 863
},
{
"epoch": 1.9404828747894443,
"grad_norm": 0.2323472946882248,
"learning_rate": 3.3384678789643754e-06,
"loss": 0.4465,
"step": 864
},
{
"epoch": 1.9427288040426727,
"grad_norm": 0.21967169642448425,
"learning_rate": 3.3261377715691355e-06,
"loss": 0.4525,
"step": 865
},
{
"epoch": 1.944974733295901,
"grad_norm": 0.2311394363641739,
"learning_rate": 3.313819117541297e-06,
"loss": 0.4151,
"step": 866
},
{
"epoch": 1.9472206625491297,
"grad_norm": 0.2546962797641754,
"learning_rate": 3.3015120011709955e-06,
"loss": 0.4423,
"step": 867
},
{
"epoch": 1.9494665918023584,
"grad_norm": 0.23539777100086212,
"learning_rate": 3.289216506669419e-06,
"loss": 0.44,
"step": 868
},
{
"epoch": 1.9517125210555868,
"grad_norm": 0.21500107645988464,
"learning_rate": 3.2769327181682307e-06,
"loss": 0.4144,
"step": 869
},
{
"epoch": 1.9539584503088152,
"grad_norm": 0.24959760904312134,
"learning_rate": 3.264660719719001e-06,
"loss": 0.4371,
"step": 870
},
{
"epoch": 1.9562043795620438,
"grad_norm": 0.2544858753681183,
"learning_rate": 3.2524005952926195e-06,
"loss": 0.4499,
"step": 871
},
{
"epoch": 1.9584503088152723,
"grad_norm": 0.23261022567749023,
"learning_rate": 3.2401524287787317e-06,
"loss": 0.4587,
"step": 872
},
{
"epoch": 1.960696238068501,
"grad_norm": 0.25908032059669495,
"learning_rate": 3.2279163039851637e-06,
"loss": 0.4268,
"step": 873
},
{
"epoch": 1.9629421673217293,
"grad_norm": 0.24189579486846924,
"learning_rate": 3.2156923046373444e-06,
"loss": 0.4386,
"step": 874
},
{
"epoch": 1.9651880965749577,
"grad_norm": 0.2793926000595093,
"learning_rate": 3.2034805143777353e-06,
"loss": 0.4601,
"step": 875
},
{
"epoch": 1.9674340258281864,
"grad_norm": 0.24393576383590698,
"learning_rate": 3.191281016765253e-06,
"loss": 0.4385,
"step": 876
},
{
"epoch": 1.969679955081415,
"grad_norm": 0.2667008340358734,
"learning_rate": 3.179093895274709e-06,
"loss": 0.4381,
"step": 877
},
{
"epoch": 1.9719258843346434,
"grad_norm": 0.2440071552991867,
"learning_rate": 3.1669192332962264e-06,
"loss": 0.4057,
"step": 878
},
{
"epoch": 1.9741718135878719,
"grad_norm": 0.2677282392978668,
"learning_rate": 3.1547571141346756e-06,
"loss": 0.4554,
"step": 879
},
{
"epoch": 1.9764177428411005,
"grad_norm": 0.23887000977993011,
"learning_rate": 3.142607621009107e-06,
"loss": 0.4177,
"step": 880
},
{
"epoch": 1.9786636720943291,
"grad_norm": 0.2643425762653351,
"learning_rate": 3.1304708370521695e-06,
"loss": 0.4624,
"step": 881
},
{
"epoch": 1.9809096013475576,
"grad_norm": 0.2131706178188324,
"learning_rate": 3.118346845309556e-06,
"loss": 0.413,
"step": 882
},
{
"epoch": 1.983155530600786,
"grad_norm": 0.2590519189834595,
"learning_rate": 3.1062357287394284e-06,
"loss": 0.4617,
"step": 883
},
{
"epoch": 1.9854014598540146,
"grad_norm": 0.23261459171772003,
"learning_rate": 3.094137570211847e-06,
"loss": 0.434,
"step": 884
},
{
"epoch": 1.9876473891072433,
"grad_norm": 0.22255754470825195,
"learning_rate": 3.082052452508213e-06,
"loss": 0.4351,
"step": 885
},
{
"epoch": 1.9898933183604717,
"grad_norm": 0.22782853245735168,
"learning_rate": 3.0699804583206882e-06,
"loss": 0.4316,
"step": 886
},
{
"epoch": 1.9921392476137,
"grad_norm": 0.2501652240753174,
"learning_rate": 3.057921670251644e-06,
"loss": 0.455,
"step": 887
},
{
"epoch": 1.9943851768669287,
"grad_norm": 0.2316114753484726,
"learning_rate": 3.045876170813084e-06,
"loss": 0.4451,
"step": 888
},
{
"epoch": 1.9966311061201574,
"grad_norm": 0.22861182689666748,
"learning_rate": 3.0338440424260897e-06,
"loss": 0.4362,
"step": 889
},
{
"epoch": 1.9988770353733858,
"grad_norm": 0.24283848702907562,
"learning_rate": 3.021825367420248e-06,
"loss": 0.4348,
"step": 890
},
{
"epoch": 2.001122964626614,
"grad_norm": 0.5297620892524719,
"learning_rate": 3.0098202280330907e-06,
"loss": 0.7405,
"step": 891
},
{
"epoch": 2.0033688938798426,
"grad_norm": 0.25243425369262695,
"learning_rate": 2.997828706409534e-06,
"loss": 0.4301,
"step": 892
},
{
"epoch": 2.0056148231330715,
"grad_norm": 0.3185347318649292,
"learning_rate": 2.985850884601316e-06,
"loss": 0.4232,
"step": 893
},
{
"epoch": 2.0078607523863,
"grad_norm": 0.26330360770225525,
"learning_rate": 2.9738868445664314e-06,
"loss": 0.4381,
"step": 894
},
{
"epoch": 2.0101066816395283,
"grad_norm": 0.23436835408210754,
"learning_rate": 2.961936668168577e-06,
"loss": 0.4191,
"step": 895
},
{
"epoch": 2.0123526108927567,
"grad_norm": 0.31119340658187866,
"learning_rate": 2.950000437176582e-06,
"loss": 0.4259,
"step": 896
},
{
"epoch": 2.0145985401459856,
"grad_norm": 0.2767098844051361,
"learning_rate": 2.9380782332638614e-06,
"loss": 0.4282,
"step": 897
},
{
"epoch": 2.016844469399214,
"grad_norm": 0.22678621113300323,
"learning_rate": 2.9261701380078443e-06,
"loss": 0.4133,
"step": 898
},
{
"epoch": 2.0190903986524424,
"grad_norm": 0.274517297744751,
"learning_rate": 2.9142762328894273e-06,
"loss": 0.4231,
"step": 899
},
{
"epoch": 2.021336327905671,
"grad_norm": 0.29050254821777344,
"learning_rate": 2.9023965992924076e-06,
"loss": 0.4387,
"step": 900
},
{
"epoch": 2.0235822571588993,
"grad_norm": 0.24458545446395874,
"learning_rate": 2.8905313185029267e-06,
"loss": 0.4048,
"step": 901
},
{
"epoch": 2.025828186412128,
"grad_norm": 0.26588353514671326,
"learning_rate": 2.878680471708924e-06,
"loss": 0.4159,
"step": 902
},
{
"epoch": 2.0280741156653566,
"grad_norm": 0.24073943495750427,
"learning_rate": 2.8668441399995712e-06,
"loss": 0.4311,
"step": 903
},
{
"epoch": 2.030320044918585,
"grad_norm": 0.2562435567378998,
"learning_rate": 2.8550224043647236e-06,
"loss": 0.4232,
"step": 904
},
{
"epoch": 2.0325659741718134,
"grad_norm": 0.2863386273384094,
"learning_rate": 2.843215345694359e-06,
"loss": 0.4466,
"step": 905
},
{
"epoch": 2.0348119034250423,
"grad_norm": 0.24601112306118011,
"learning_rate": 2.831423044778027e-06,
"loss": 0.4139,
"step": 906
},
{
"epoch": 2.0370578326782707,
"grad_norm": 0.22661253809928894,
"learning_rate": 2.8196455823043047e-06,
"loss": 0.4174,
"step": 907
},
{
"epoch": 2.039303761931499,
"grad_norm": 0.25296610593795776,
"learning_rate": 2.8078830388602318e-06,
"loss": 0.432,
"step": 908
},
{
"epoch": 2.0415496911847275,
"grad_norm": 0.2235630601644516,
"learning_rate": 2.7961354949307677e-06,
"loss": 0.4261,
"step": 909
},
{
"epoch": 2.0437956204379564,
"grad_norm": 0.2354028970003128,
"learning_rate": 2.784403030898239e-06,
"loss": 0.4229,
"step": 910
},
{
"epoch": 2.046041549691185,
"grad_norm": 0.2226496785879135,
"learning_rate": 2.772685727041783e-06,
"loss": 0.4193,
"step": 911
},
{
"epoch": 2.048287478944413,
"grad_norm": 0.23907402157783508,
"learning_rate": 2.760983663536806e-06,
"loss": 0.4494,
"step": 912
},
{
"epoch": 2.0505334081976416,
"grad_norm": 0.23263433575630188,
"learning_rate": 2.7492969204544356e-06,
"loss": 0.41,
"step": 913
},
{
"epoch": 2.0527793374508705,
"grad_norm": 0.24426434934139252,
"learning_rate": 2.7376255777609674e-06,
"loss": 0.4356,
"step": 914
},
{
"epoch": 2.055025266704099,
"grad_norm": 0.2100609987974167,
"learning_rate": 2.7259697153173207e-06,
"loss": 0.4112,
"step": 915
},
{
"epoch": 2.0572711959573273,
"grad_norm": 0.2561478018760681,
"learning_rate": 2.7143294128784934e-06,
"loss": 0.4565,
"step": 916
},
{
"epoch": 2.0595171252105557,
"grad_norm": 0.23428645730018616,
"learning_rate": 2.7027047500930098e-06,
"loss": 0.4124,
"step": 917
},
{
"epoch": 2.0617630544637846,
"grad_norm": 0.22505903244018555,
"learning_rate": 2.6910958065023805e-06,
"loss": 0.4285,
"step": 918
},
{
"epoch": 2.064008983717013,
"grad_norm": 0.2354445606470108,
"learning_rate": 2.6795026615405635e-06,
"loss": 0.4326,
"step": 919
},
{
"epoch": 2.0662549129702414,
"grad_norm": 0.22063247859477997,
"learning_rate": 2.6679253945334096e-06,
"loss": 0.4098,
"step": 920
},
{
"epoch": 2.06850084222347,
"grad_norm": 0.25319838523864746,
"learning_rate": 2.65636408469813e-06,
"loss": 0.427,
"step": 921
},
{
"epoch": 2.0707467714766983,
"grad_norm": 0.2321866899728775,
"learning_rate": 2.6448188111427426e-06,
"loss": 0.3939,
"step": 922
},
{
"epoch": 2.072992700729927,
"grad_norm": 0.22791002690792084,
"learning_rate": 2.633289652865544e-06,
"loss": 0.4375,
"step": 923
},
{
"epoch": 2.0752386299831556,
"grad_norm": 0.21649421751499176,
"learning_rate": 2.6217766887545558e-06,
"loss": 0.434,
"step": 924
},
{
"epoch": 2.077484559236384,
"grad_norm": 0.2443019300699234,
"learning_rate": 2.6102799975869976e-06,
"loss": 0.407,
"step": 925
},
{
"epoch": 2.0797304884896124,
"grad_norm": 0.2476467788219452,
"learning_rate": 2.5987996580287397e-06,
"loss": 0.4238,
"step": 926
},
{
"epoch": 2.0819764177428413,
"grad_norm": 0.21861916780471802,
"learning_rate": 2.5873357486337626e-06,
"loss": 0.4365,
"step": 927
},
{
"epoch": 2.0842223469960697,
"grad_norm": 0.23719200491905212,
"learning_rate": 2.5758883478436304e-06,
"loss": 0.3978,
"step": 928
},
{
"epoch": 2.086468276249298,
"grad_norm": 0.26309382915496826,
"learning_rate": 2.564457533986944e-06,
"loss": 0.4282,
"step": 929
},
{
"epoch": 2.0887142055025265,
"grad_norm": 0.23072056472301483,
"learning_rate": 2.5530433852788095e-06,
"loss": 0.44,
"step": 930
},
{
"epoch": 2.0909601347557554,
"grad_norm": 0.23523831367492676,
"learning_rate": 2.541645979820301e-06,
"loss": 0.4089,
"step": 931
},
{
"epoch": 2.093206064008984,
"grad_norm": 0.20197081565856934,
"learning_rate": 2.5302653955979257e-06,
"loss": 0.3742,
"step": 932
},
{
"epoch": 2.095451993262212,
"grad_norm": 0.24736276268959045,
"learning_rate": 2.518901710483095e-06,
"loss": 0.447,
"step": 933
},
{
"epoch": 2.0976979225154406,
"grad_norm": 0.23071594536304474,
"learning_rate": 2.5075550022315885e-06,
"loss": 0.4195,
"step": 934
},
{
"epoch": 2.0999438517686695,
"grad_norm": 0.21248017251491547,
"learning_rate": 2.4962253484830197e-06,
"loss": 0.4163,
"step": 935
},
{
"epoch": 2.102189781021898,
"grad_norm": 0.20917271077632904,
"learning_rate": 2.4849128267603106e-06,
"loss": 0.4017,
"step": 936
},
{
"epoch": 2.1044357102751263,
"grad_norm": 0.23887436091899872,
"learning_rate": 2.4736175144691543e-06,
"loss": 0.443,
"step": 937
},
{
"epoch": 2.1066816395283547,
"grad_norm": 0.23489055037498474,
"learning_rate": 2.4623394888974863e-06,
"loss": 0.4361,
"step": 938
},
{
"epoch": 2.108927568781583,
"grad_norm": 0.21189194917678833,
"learning_rate": 2.451078827214964e-06,
"loss": 0.3941,
"step": 939
},
{
"epoch": 2.111173498034812,
"grad_norm": 0.22420427203178406,
"learning_rate": 2.4398356064724298e-06,
"loss": 0.4345,
"step": 940
},
{
"epoch": 2.1134194272880404,
"grad_norm": 0.2321353554725647,
"learning_rate": 2.4286099036013904e-06,
"loss": 0.4527,
"step": 941
},
{
"epoch": 2.115665356541269,
"grad_norm": 0.224471315741539,
"learning_rate": 2.417401795413478e-06,
"loss": 0.395,
"step": 942
},
{
"epoch": 2.1179112857944973,
"grad_norm": 0.24702583253383636,
"learning_rate": 2.4062113585999452e-06,
"loss": 0.4491,
"step": 943
},
{
"epoch": 2.120157215047726,
"grad_norm": 0.21472668647766113,
"learning_rate": 2.395038669731117e-06,
"loss": 0.4342,
"step": 944
},
{
"epoch": 2.1224031443009546,
"grad_norm": 0.22108450531959534,
"learning_rate": 2.3838838052558867e-06,
"loss": 0.4183,
"step": 945
},
{
"epoch": 2.124649073554183,
"grad_norm": 0.2732450067996979,
"learning_rate": 2.372746841501184e-06,
"loss": 0.4166,
"step": 946
},
{
"epoch": 2.1268950028074114,
"grad_norm": 0.21384459733963013,
"learning_rate": 2.3616278546714464e-06,
"loss": 0.4534,
"step": 947
},
{
"epoch": 2.1291409320606403,
"grad_norm": 0.20551139116287231,
"learning_rate": 2.350526920848113e-06,
"loss": 0.4005,
"step": 948
},
{
"epoch": 2.1313868613138687,
"grad_norm": 0.24042649567127228,
"learning_rate": 2.339444115989093e-06,
"loss": 0.4474,
"step": 949
},
{
"epoch": 2.133632790567097,
"grad_norm": 0.2255227416753769,
"learning_rate": 2.3283795159282443e-06,
"loss": 0.4203,
"step": 950
},
{
"epoch": 2.1358787198203255,
"grad_norm": 0.20983435213565826,
"learning_rate": 2.3173331963748646e-06,
"loss": 0.4102,
"step": 951
},
{
"epoch": 2.1381246490735544,
"grad_norm": 0.2264542430639267,
"learning_rate": 2.306305232913163e-06,
"loss": 0.4155,
"step": 952
},
{
"epoch": 2.140370578326783,
"grad_norm": 0.24919871985912323,
"learning_rate": 2.2952957010017506e-06,
"loss": 0.4175,
"step": 953
},
{
"epoch": 2.142616507580011,
"grad_norm": 0.21375016868114471,
"learning_rate": 2.2843046759731206e-06,
"loss": 0.4367,
"step": 954
},
{
"epoch": 2.1448624368332396,
"grad_norm": 0.2692919671535492,
"learning_rate": 2.273332233033134e-06,
"loss": 0.4246,
"step": 955
},
{
"epoch": 2.147108366086468,
"grad_norm": 0.22732344269752502,
"learning_rate": 2.2623784472605016e-06,
"loss": 0.4229,
"step": 956
},
{
"epoch": 2.149354295339697,
"grad_norm": 0.23208336532115936,
"learning_rate": 2.2514433936062714e-06,
"loss": 0.4367,
"step": 957
},
{
"epoch": 2.1516002245929253,
"grad_norm": 0.24797451496124268,
"learning_rate": 2.2405271468933224e-06,
"loss": 0.4062,
"step": 958
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.2095116823911667,
"learning_rate": 2.2296297818158458e-06,
"loss": 0.4208,
"step": 959
},
{
"epoch": 2.156092083099382,
"grad_norm": 0.2208539843559265,
"learning_rate": 2.218751372938834e-06,
"loss": 0.4243,
"step": 960
},
{
"epoch": 2.158338012352611,
"grad_norm": 0.2577050030231476,
"learning_rate": 2.2078919946975753e-06,
"loss": 0.438,
"step": 961
},
{
"epoch": 2.1605839416058394,
"grad_norm": 0.23505088686943054,
"learning_rate": 2.1970517213971367e-06,
"loss": 0.4164,
"step": 962
},
{
"epoch": 2.162829870859068,
"grad_norm": 0.212454691529274,
"learning_rate": 2.186230627211861e-06,
"loss": 0.4239,
"step": 963
},
{
"epoch": 2.1650758001122963,
"grad_norm": 0.22702592611312866,
"learning_rate": 2.175428786184861e-06,
"loss": 0.4094,
"step": 964
},
{
"epoch": 2.167321729365525,
"grad_norm": 0.2175099104642868,
"learning_rate": 2.1646462722275085e-06,
"loss": 0.411,
"step": 965
},
{
"epoch": 2.1695676586187536,
"grad_norm": 0.22848990559577942,
"learning_rate": 2.1538831591189317e-06,
"loss": 0.4353,
"step": 966
},
{
"epoch": 2.171813587871982,
"grad_norm": 0.22559164464473724,
"learning_rate": 2.1431395205055085e-06,
"loss": 0.4398,
"step": 967
},
{
"epoch": 2.1740595171252104,
"grad_norm": 0.19590629637241364,
"learning_rate": 2.1324154299003597e-06,
"loss": 0.4266,
"step": 968
},
{
"epoch": 2.1763054463784393,
"grad_norm": 0.2409408986568451,
"learning_rate": 2.121710960682851e-06,
"loss": 0.4286,
"step": 969
},
{
"epoch": 2.1785513756316677,
"grad_norm": 0.2229623794555664,
"learning_rate": 2.111026186098092e-06,
"loss": 0.4492,
"step": 970
},
{
"epoch": 2.180797304884896,
"grad_norm": 0.20039457082748413,
"learning_rate": 2.1003611792564288e-06,
"loss": 0.4213,
"step": 971
},
{
"epoch": 2.1830432341381245,
"grad_norm": 0.2124020755290985,
"learning_rate": 2.0897160131329508e-06,
"loss": 0.4235,
"step": 972
},
{
"epoch": 2.1852891633913534,
"grad_norm": 0.23414523899555206,
"learning_rate": 2.079090760566982e-06,
"loss": 0.4318,
"step": 973
},
{
"epoch": 2.187535092644582,
"grad_norm": 0.24739769101142883,
"learning_rate": 2.0684854942615946e-06,
"loss": 0.4196,
"step": 974
},
{
"epoch": 2.18978102189781,
"grad_norm": 0.22191846370697021,
"learning_rate": 2.0579002867830987e-06,
"loss": 0.4345,
"step": 975
},
{
"epoch": 2.1920269511510386,
"grad_norm": 0.21731607615947723,
"learning_rate": 2.0473352105605583e-06,
"loss": 0.4056,
"step": 976
},
{
"epoch": 2.1942728804042675,
"grad_norm": 0.2345353066921234,
"learning_rate": 2.0367903378852876e-06,
"loss": 0.428,
"step": 977
},
{
"epoch": 2.196518809657496,
"grad_norm": 0.23607279360294342,
"learning_rate": 2.0262657409103565e-06,
"loss": 0.4514,
"step": 978
},
{
"epoch": 2.1987647389107243,
"grad_norm": 0.21260501444339752,
"learning_rate": 2.0157614916501e-06,
"loss": 0.411,
"step": 979
},
{
"epoch": 2.2010106681639527,
"grad_norm": 0.2566327750682831,
"learning_rate": 2.0052776619796265e-06,
"loss": 0.4125,
"step": 980
},
{
"epoch": 2.203256597417181,
"grad_norm": 0.23026920855045319,
"learning_rate": 1.9948143236343226e-06,
"loss": 0.4223,
"step": 981
},
{
"epoch": 2.20550252667041,
"grad_norm": 0.21213333308696747,
"learning_rate": 1.9843715482093613e-06,
"loss": 0.4035,
"step": 982
},
{
"epoch": 2.2077484559236384,
"grad_norm": 0.22886443138122559,
"learning_rate": 1.9739494071592143e-06,
"loss": 0.4215,
"step": 983
},
{
"epoch": 2.209994385176867,
"grad_norm": 0.2419017106294632,
"learning_rate": 1.9635479717971656e-06,
"loss": 0.4185,
"step": 984
},
{
"epoch": 2.2122403144300953,
"grad_norm": 0.22518743574619293,
"learning_rate": 1.953167313294821e-06,
"loss": 0.4334,
"step": 985
},
{
"epoch": 2.214486243683324,
"grad_norm": 0.23835636675357819,
"learning_rate": 1.9428075026816186e-06,
"loss": 0.432,
"step": 986
},
{
"epoch": 2.2167321729365526,
"grad_norm": 0.23428522050380707,
"learning_rate": 1.9324686108443487e-06,
"loss": 0.4255,
"step": 987
},
{
"epoch": 2.218978102189781,
"grad_norm": 0.2030942142009735,
"learning_rate": 1.9221507085266617e-06,
"loss": 0.4117,
"step": 988
},
{
"epoch": 2.2212240314430094,
"grad_norm": 0.2084757536649704,
"learning_rate": 1.9118538663285874e-06,
"loss": 0.4233,
"step": 989
},
{
"epoch": 2.2234699606962383,
"grad_norm": 0.209101602435112,
"learning_rate": 1.9015781547060552e-06,
"loss": 0.3821,
"step": 990
},
{
"epoch": 2.2257158899494667,
"grad_norm": 0.22239622473716736,
"learning_rate": 1.8913236439704085e-06,
"loss": 0.4382,
"step": 991
},
{
"epoch": 2.227961819202695,
"grad_norm": 0.21578405797481537,
"learning_rate": 1.881090404287924e-06,
"loss": 0.415,
"step": 992
},
{
"epoch": 2.2302077484559235,
"grad_norm": 0.2138473093509674,
"learning_rate": 1.8708785056793276e-06,
"loss": 0.4217,
"step": 993
},
{
"epoch": 2.2324536777091524,
"grad_norm": 0.20428405702114105,
"learning_rate": 1.8606880180193265e-06,
"loss": 0.405,
"step": 994
},
{
"epoch": 2.234699606962381,
"grad_norm": 0.20017048716545105,
"learning_rate": 1.850519011036117e-06,
"loss": 0.4286,
"step": 995
},
{
"epoch": 2.236945536215609,
"grad_norm": 0.21324189007282257,
"learning_rate": 1.840371554310918e-06,
"loss": 0.4253,
"step": 996
},
{
"epoch": 2.2391914654688376,
"grad_norm": 0.21719501912593842,
"learning_rate": 1.8302457172774929e-06,
"loss": 0.4175,
"step": 997
},
{
"epoch": 2.241437394722066,
"grad_norm": 0.19842517375946045,
"learning_rate": 1.8201415692216673e-06,
"loss": 0.4131,
"step": 998
},
{
"epoch": 2.243683323975295,
"grad_norm": 0.19860929250717163,
"learning_rate": 1.8100591792808652e-06,
"loss": 0.4093,
"step": 999
},
{
"epoch": 2.2459292532285233,
"grad_norm": 0.20503376424312592,
"learning_rate": 1.7999986164436312e-06,
"loss": 0.4346,
"step": 1000
},
{
"epoch": 2.2481751824817517,
"grad_norm": 0.2113645225763321,
"learning_rate": 1.7899599495491532e-06,
"loss": 0.4244,
"step": 1001
},
{
"epoch": 2.25042111173498,
"grad_norm": 0.21410761773586273,
"learning_rate": 1.7799432472868038e-06,
"loss": 0.4226,
"step": 1002
},
{
"epoch": 2.252667040988209,
"grad_norm": 0.2089909315109253,
"learning_rate": 1.769948578195656e-06,
"loss": 0.4032,
"step": 1003
},
{
"epoch": 2.2549129702414374,
"grad_norm": 0.20202669501304626,
"learning_rate": 1.7599760106640263e-06,
"loss": 0.4177,
"step": 1004
},
{
"epoch": 2.257158899494666,
"grad_norm": 0.21100208163261414,
"learning_rate": 1.7500256129290005e-06,
"loss": 0.4445,
"step": 1005
},
{
"epoch": 2.2594048287478943,
"grad_norm": 0.22142833471298218,
"learning_rate": 1.740097453075969e-06,
"loss": 0.4252,
"step": 1006
},
{
"epoch": 2.261650758001123,
"grad_norm": 0.20687736570835114,
"learning_rate": 1.7301915990381568e-06,
"loss": 0.4301,
"step": 1007
},
{
"epoch": 2.2638966872543516,
"grad_norm": 0.21982485055923462,
"learning_rate": 1.7203081185961624e-06,
"loss": 0.4307,
"step": 1008
},
{
"epoch": 2.26614261650758,
"grad_norm": 0.21791280806064606,
"learning_rate": 1.7104470793774959e-06,
"loss": 0.4517,
"step": 1009
},
{
"epoch": 2.2683885457608084,
"grad_norm": 0.20038799941539764,
"learning_rate": 1.700608548856113e-06,
"loss": 0.4109,
"step": 1010
},
{
"epoch": 2.2706344750140373,
"grad_norm": 0.22775229811668396,
"learning_rate": 1.6907925943519532e-06,
"loss": 0.4219,
"step": 1011
},
{
"epoch": 2.2728804042672657,
"grad_norm": 0.21923872828483582,
"learning_rate": 1.6809992830304827e-06,
"loss": 0.4278,
"step": 1012
},
{
"epoch": 2.275126333520494,
"grad_norm": 0.20975294709205627,
"learning_rate": 1.671228681902229e-06,
"loss": 0.4241,
"step": 1013
},
{
"epoch": 2.2773722627737225,
"grad_norm": 0.20294855535030365,
"learning_rate": 1.6614808578223235e-06,
"loss": 0.407,
"step": 1014
},
{
"epoch": 2.279618192026951,
"grad_norm": 0.2194415181875229,
"learning_rate": 1.6517558774900517e-06,
"loss": 0.4304,
"step": 1015
},
{
"epoch": 2.28186412128018,
"grad_norm": 0.21036003530025482,
"learning_rate": 1.642053807448389e-06,
"loss": 0.43,
"step": 1016
},
{
"epoch": 2.284110050533408,
"grad_norm": 0.2032628208398819,
"learning_rate": 1.6323747140835484e-06,
"loss": 0.4567,
"step": 1017
},
{
"epoch": 2.2863559797866366,
"grad_norm": 0.2219427227973938,
"learning_rate": 1.6227186636245218e-06,
"loss": 0.418,
"step": 1018
},
{
"epoch": 2.2886019090398655,
"grad_norm": 0.227565735578537,
"learning_rate": 1.613085722142636e-06,
"loss": 0.4321,
"step": 1019
},
{
"epoch": 2.290847838293094,
"grad_norm": 0.19647327065467834,
"learning_rate": 1.60347595555109e-06,
"loss": 0.4233,
"step": 1020
},
{
"epoch": 2.2930937675463223,
"grad_norm": 0.20968946814537048,
"learning_rate": 1.593889429604511e-06,
"loss": 0.4558,
"step": 1021
},
{
"epoch": 2.2953396967995507,
"grad_norm": 0.22316963970661163,
"learning_rate": 1.5843262098985051e-06,
"loss": 0.425,
"step": 1022
},
{
"epoch": 2.297585626052779,
"grad_norm": 0.22763703763484955,
"learning_rate": 1.5747863618692044e-06,
"loss": 0.4291,
"step": 1023
},
{
"epoch": 2.299831555306008,
"grad_norm": 0.198600634932518,
"learning_rate": 1.5652699507928166e-06,
"loss": 0.4273,
"step": 1024
},
{
"epoch": 2.3020774845592364,
"grad_norm": 0.20251289010047913,
"learning_rate": 1.5557770417851886e-06,
"loss": 0.4173,
"step": 1025
},
{
"epoch": 2.304323413812465,
"grad_norm": 0.2171899527311325,
"learning_rate": 1.5463076998013533e-06,
"loss": 0.4439,
"step": 1026
},
{
"epoch": 2.3065693430656933,
"grad_norm": 0.22569715976715088,
"learning_rate": 1.5368619896350828e-06,
"loss": 0.4294,
"step": 1027
},
{
"epoch": 2.308815272318922,
"grad_norm": 0.2233586460351944,
"learning_rate": 1.527439975918455e-06,
"loss": 0.439,
"step": 1028
},
{
"epoch": 2.3110612015721506,
"grad_norm": 0.2036871761083603,
"learning_rate": 1.5180417231214001e-06,
"loss": 0.3983,
"step": 1029
},
{
"epoch": 2.313307130825379,
"grad_norm": 0.21388086676597595,
"learning_rate": 1.5086672955512672e-06,
"loss": 0.4523,
"step": 1030
},
{
"epoch": 2.3155530600786074,
"grad_norm": 0.21578197181224823,
"learning_rate": 1.4993167573523821e-06,
"loss": 0.4105,
"step": 1031
},
{
"epoch": 2.317798989331836,
"grad_norm": 0.20119976997375488,
"learning_rate": 1.4899901725056093e-06,
"loss": 0.401,
"step": 1032
},
{
"epoch": 2.3200449185850647,
"grad_norm": 0.19066974520683289,
"learning_rate": 1.4806876048279095e-06,
"loss": 0.416,
"step": 1033
},
{
"epoch": 2.322290847838293,
"grad_norm": 0.205108642578125,
"learning_rate": 1.471409117971907e-06,
"loss": 0.4203,
"step": 1034
},
{
"epoch": 2.3245367770915215,
"grad_norm": 0.2170393466949463,
"learning_rate": 1.462154775425455e-06,
"loss": 0.4322,
"step": 1035
},
{
"epoch": 2.3267827063447504,
"grad_norm": 0.19994419813156128,
"learning_rate": 1.4529246405112002e-06,
"loss": 0.4364,
"step": 1036
},
{
"epoch": 2.329028635597979,
"grad_norm": 0.1924624890089035,
"learning_rate": 1.4437187763861487e-06,
"loss": 0.4234,
"step": 1037
},
{
"epoch": 2.331274564851207,
"grad_norm": 0.21115237474441528,
"learning_rate": 1.4345372460412348e-06,
"loss": 0.398,
"step": 1038
},
{
"epoch": 2.3335204941044356,
"grad_norm": 0.21371133625507355,
"learning_rate": 1.425380112300887e-06,
"loss": 0.4537,
"step": 1039
},
{
"epoch": 2.335766423357664,
"grad_norm": 0.19634070992469788,
"learning_rate": 1.416247437822601e-06,
"loss": 0.4035,
"step": 1040
},
{
"epoch": 2.338012352610893,
"grad_norm": 0.21442176401615143,
"learning_rate": 1.4071392850965126e-06,
"loss": 0.4507,
"step": 1041
},
{
"epoch": 2.3402582818641213,
"grad_norm": 0.20288068056106567,
"learning_rate": 1.398055716444967e-06,
"loss": 0.4034,
"step": 1042
},
{
"epoch": 2.3425042111173497,
"grad_norm": 0.20702247321605682,
"learning_rate": 1.388996794022095e-06,
"loss": 0.4336,
"step": 1043
},
{
"epoch": 2.344750140370578,
"grad_norm": 0.23960836231708527,
"learning_rate": 1.3799625798133814e-06,
"loss": 0.4464,
"step": 1044
},
{
"epoch": 2.346996069623807,
"grad_norm": 0.20227837562561035,
"learning_rate": 1.3709531356352512e-06,
"loss": 0.4085,
"step": 1045
},
{
"epoch": 2.3492419988770354,
"grad_norm": 0.21481740474700928,
"learning_rate": 1.3619685231346358e-06,
"loss": 0.4478,
"step": 1046
},
{
"epoch": 2.351487928130264,
"grad_norm": 0.19349761307239532,
"learning_rate": 1.3530088037885608e-06,
"loss": 0.4202,
"step": 1047
},
{
"epoch": 2.3537338573834923,
"grad_norm": 0.21037468314170837,
"learning_rate": 1.3440740389037198e-06,
"loss": 0.4613,
"step": 1048
},
{
"epoch": 2.3559797866367207,
"grad_norm": 0.19390463829040527,
"learning_rate": 1.3351642896160522e-06,
"loss": 0.3749,
"step": 1049
},
{
"epoch": 2.3582257158899496,
"grad_norm": 0.19581280648708344,
"learning_rate": 1.3262796168903342e-06,
"loss": 0.4362,
"step": 1050
},
{
"epoch": 2.360471645143178,
"grad_norm": 0.21136844158172607,
"learning_rate": 1.317420081519754e-06,
"loss": 0.4499,
"step": 1051
},
{
"epoch": 2.3627175743964064,
"grad_norm": 0.19537141919136047,
"learning_rate": 1.3085857441254956e-06,
"loss": 0.4137,
"step": 1052
},
{
"epoch": 2.3649635036496353,
"grad_norm": 0.20818866789340973,
"learning_rate": 1.2997766651563316e-06,
"loss": 0.4093,
"step": 1053
},
{
"epoch": 2.3672094329028637,
"grad_norm": 0.21565309166908264,
"learning_rate": 1.2909929048881976e-06,
"loss": 0.4468,
"step": 1054
},
{
"epoch": 2.369455362156092,
"grad_norm": 0.19632428884506226,
"learning_rate": 1.2822345234237915e-06,
"loss": 0.4116,
"step": 1055
},
{
"epoch": 2.3717012914093205,
"grad_norm": 0.2132972776889801,
"learning_rate": 1.2735015806921563e-06,
"loss": 0.4218,
"step": 1056
},
{
"epoch": 2.373947220662549,
"grad_norm": 0.19294115900993347,
"learning_rate": 1.264794136448272e-06,
"loss": 0.4081,
"step": 1057
},
{
"epoch": 2.376193149915778,
"grad_norm": 0.21431930363178253,
"learning_rate": 1.2561122502726424e-06,
"loss": 0.4433,
"step": 1058
},
{
"epoch": 2.378439079169006,
"grad_norm": 0.21879686415195465,
"learning_rate": 1.247455981570892e-06,
"loss": 0.4266,
"step": 1059
},
{
"epoch": 2.3806850084222346,
"grad_norm": 0.2212316393852234,
"learning_rate": 1.2388253895733598e-06,
"loss": 0.4508,
"step": 1060
},
{
"epoch": 2.382930937675463,
"grad_norm": 0.19899475574493408,
"learning_rate": 1.2302205333346923e-06,
"loss": 0.4252,
"step": 1061
},
{
"epoch": 2.385176866928692,
"grad_norm": 0.24420271813869476,
"learning_rate": 1.2216414717334378e-06,
"loss": 0.436,
"step": 1062
},
{
"epoch": 2.3874227961819203,
"grad_norm": 0.19686606526374817,
"learning_rate": 1.213088263471649e-06,
"loss": 0.3885,
"step": 1063
},
{
"epoch": 2.3896687254351487,
"grad_norm": 0.19978255033493042,
"learning_rate": 1.2045609670744729e-06,
"loss": 0.4507,
"step": 1064
},
{
"epoch": 2.391914654688377,
"grad_norm": 0.20148004591464996,
"learning_rate": 1.1960596408897562e-06,
"loss": 0.4287,
"step": 1065
},
{
"epoch": 2.394160583941606,
"grad_norm": 0.1934734582901001,
"learning_rate": 1.1875843430876484e-06,
"loss": 0.4145,
"step": 1066
},
{
"epoch": 2.3964065131948344,
"grad_norm": 0.19930601119995117,
"learning_rate": 1.1791351316601962e-06,
"loss": 0.4234,
"step": 1067
},
{
"epoch": 2.398652442448063,
"grad_norm": 0.2159959226846695,
"learning_rate": 1.1707120644209557e-06,
"loss": 0.4407,
"step": 1068
},
{
"epoch": 2.4008983717012913,
"grad_norm": 0.20033979415893555,
"learning_rate": 1.162315199004585e-06,
"loss": 0.4142,
"step": 1069
},
{
"epoch": 2.40314430095452,
"grad_norm": 0.21157881617546082,
"learning_rate": 1.153944592866464e-06,
"loss": 0.4211,
"step": 1070
},
{
"epoch": 2.4053902302077486,
"grad_norm": 0.19438238441944122,
"learning_rate": 1.1456003032822882e-06,
"loss": 0.4439,
"step": 1071
},
{
"epoch": 2.407636159460977,
"grad_norm": 0.19469432532787323,
"learning_rate": 1.1372823873476857e-06,
"loss": 0.4035,
"step": 1072
},
{
"epoch": 2.4098820887142054,
"grad_norm": 0.19877132773399353,
"learning_rate": 1.128990901977825e-06,
"loss": 0.4334,
"step": 1073
},
{
"epoch": 2.412128017967434,
"grad_norm": 0.1978437304496765,
"learning_rate": 1.1207259039070183e-06,
"loss": 0.4471,
"step": 1074
},
{
"epoch": 2.4143739472206627,
"grad_norm": 0.1895316243171692,
"learning_rate": 1.1124874496883454e-06,
"loss": 0.4103,
"step": 1075
},
{
"epoch": 2.416619876473891,
"grad_norm": 0.18528953194618225,
"learning_rate": 1.104275595693256e-06,
"loss": 0.4137,
"step": 1076
},
{
"epoch": 2.4188658057271195,
"grad_norm": 0.20490433275699615,
"learning_rate": 1.096090398111192e-06,
"loss": 0.4598,
"step": 1077
},
{
"epoch": 2.421111734980348,
"grad_norm": 0.19539935886859894,
"learning_rate": 1.087931912949195e-06,
"loss": 0.3935,
"step": 1078
},
{
"epoch": 2.423357664233577,
"grad_norm": 0.20176726579666138,
"learning_rate": 1.0798001960315313e-06,
"loss": 0.4247,
"step": 1079
},
{
"epoch": 2.425603593486805,
"grad_norm": 0.194259911775589,
"learning_rate": 1.071695302999302e-06,
"loss": 0.425,
"step": 1080
},
{
"epoch": 2.4278495227400336,
"grad_norm": 0.2146841585636139,
"learning_rate": 1.0636172893100704e-06,
"loss": 0.4366,
"step": 1081
},
{
"epoch": 2.430095451993262,
"grad_norm": 0.20084460079669952,
"learning_rate": 1.0555662102374764e-06,
"loss": 0.4355,
"step": 1082
},
{
"epoch": 2.432341381246491,
"grad_norm": 0.1886490285396576,
"learning_rate": 1.0475421208708626e-06,
"loss": 0.4014,
"step": 1083
},
{
"epoch": 2.4345873104997193,
"grad_norm": 0.19774523377418518,
"learning_rate": 1.0395450761148911e-06,
"loss": 0.4074,
"step": 1084
},
{
"epoch": 2.4368332397529477,
"grad_norm": 0.2086760252714157,
"learning_rate": 1.031575130689173e-06,
"loss": 0.4258,
"step": 1085
},
{
"epoch": 2.439079169006176,
"grad_norm": 0.20382975041866302,
"learning_rate": 1.0236323391278958e-06,
"loss": 0.4247,
"step": 1086
},
{
"epoch": 2.441325098259405,
"grad_norm": 0.20429746806621552,
"learning_rate": 1.0157167557794433e-06,
"loss": 0.4215,
"step": 1087
},
{
"epoch": 2.4435710275126334,
"grad_norm": 0.18974192440509796,
"learning_rate": 1.0078284348060318e-06,
"loss": 0.4119,
"step": 1088
},
{
"epoch": 2.445816956765862,
"grad_norm": 0.21000362932682037,
"learning_rate": 9.999674301833328e-07,
"loss": 0.4524,
"step": 1089
},
{
"epoch": 2.4480628860190903,
"grad_norm": 0.1856634020805359,
"learning_rate": 9.921337957001059e-07,
"loss": 0.3757,
"step": 1090
},
{
"epoch": 2.4503088152723187,
"grad_norm": 0.1969255656003952,
"learning_rate": 9.843275849578305e-07,
"loss": 0.4292,
"step": 1091
},
{
"epoch": 2.4525547445255476,
"grad_norm": 0.21311074495315552,
"learning_rate": 9.765488513703414e-07,
"loss": 0.426,
"step": 1092
},
{
"epoch": 2.454800673778776,
"grad_norm": 0.19789327681064606,
"learning_rate": 9.68797648163462e-07,
"loss": 0.429,
"step": 1093
},
{
"epoch": 2.4570466030320044,
"grad_norm": 0.19123998284339905,
"learning_rate": 9.610740283746395e-07,
"loss": 0.4161,
"step": 1094
},
{
"epoch": 2.4592925322852333,
"grad_norm": 0.1999826729297638,
"learning_rate": 9.533780448525792e-07,
"loss": 0.4232,
"step": 1095
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.20449966192245483,
"learning_rate": 9.457097502568896e-07,
"loss": 0.4478,
"step": 1096
},
{
"epoch": 2.46378439079169,
"grad_norm": 0.2035766839981079,
"learning_rate": 9.380691970577144e-07,
"loss": 0.4434,
"step": 1097
},
{
"epoch": 2.4660303200449185,
"grad_norm": 0.200229674577713,
"learning_rate": 9.304564375353814e-07,
"loss": 0.3788,
"step": 1098
},
{
"epoch": 2.468276249298147,
"grad_norm": 0.19415318965911865,
"learning_rate": 9.228715237800395e-07,
"loss": 0.4382,
"step": 1099
},
{
"epoch": 2.470522178551376,
"grad_norm": 0.21206416189670563,
"learning_rate": 9.153145076913006e-07,
"loss": 0.4445,
"step": 1100
},
{
"epoch": 2.472768107804604,
"grad_norm": 0.19867388904094696,
"learning_rate": 9.077854409778913e-07,
"loss": 0.4104,
"step": 1101
},
{
"epoch": 2.4750140370578326,
"grad_norm": 0.202217236161232,
"learning_rate": 9.002843751572943e-07,
"loss": 0.4641,
"step": 1102
},
{
"epoch": 2.477259966311061,
"grad_norm": 0.1925583928823471,
"learning_rate": 8.928113615553946e-07,
"loss": 0.4218,
"step": 1103
},
{
"epoch": 2.47950589556429,
"grad_norm": 0.20704378187656403,
"learning_rate": 8.853664513061333e-07,
"loss": 0.4178,
"step": 1104
},
{
"epoch": 2.4817518248175183,
"grad_norm": 0.1998777687549591,
"learning_rate": 8.779496953511519e-07,
"loss": 0.4299,
"step": 1105
},
{
"epoch": 2.4839977540707467,
"grad_norm": 0.2032717913389206,
"learning_rate": 8.705611444394496e-07,
"loss": 0.4173,
"step": 1106
},
{
"epoch": 2.486243683323975,
"grad_norm": 0.21000362932682037,
"learning_rate": 8.632008491270316e-07,
"loss": 0.4336,
"step": 1107
},
{
"epoch": 2.4884896125772036,
"grad_norm": 0.19644078612327576,
"learning_rate": 8.558688597765668e-07,
"loss": 0.4197,
"step": 1108
},
{
"epoch": 2.4907355418304324,
"grad_norm": 0.19872646033763885,
"learning_rate": 8.485652265570376e-07,
"loss": 0.427,
"step": 1109
},
{
"epoch": 2.492981471083661,
"grad_norm": 0.1902010142803192,
"learning_rate": 8.412899994434015e-07,
"loss": 0.4204,
"step": 1110
},
{
"epoch": 2.4952274003368893,
"grad_norm": 0.19192348420619965,
"learning_rate": 8.340432282162492e-07,
"loss": 0.4235,
"step": 1111
},
{
"epoch": 2.497473329590118,
"grad_norm": 0.20328937470912933,
"learning_rate": 8.268249624614622e-07,
"loss": 0.4191,
"step": 1112
},
{
"epoch": 2.4997192588433466,
"grad_norm": 0.19253182411193848,
"learning_rate": 8.19635251569873e-07,
"loss": 0.3998,
"step": 1113
},
{
"epoch": 2.501965188096575,
"grad_norm": 0.20604483783245087,
"learning_rate": 8.1247414473693e-07,
"loss": 0.4568,
"step": 1114
},
{
"epoch": 2.5042111173498034,
"grad_norm": 0.188734769821167,
"learning_rate": 8.053416909623557e-07,
"loss": 0.421,
"step": 1115
},
{
"epoch": 2.506457046603032,
"grad_norm": 0.19376307725906372,
"learning_rate": 7.982379390498157e-07,
"loss": 0.3927,
"step": 1116
},
{
"epoch": 2.5087029758562607,
"grad_norm": 0.20353421568870544,
"learning_rate": 7.911629376065849e-07,
"loss": 0.4049,
"step": 1117
},
{
"epoch": 2.510948905109489,
"grad_norm": 0.20604628324508667,
"learning_rate": 7.841167350432144e-07,
"loss": 0.4351,
"step": 1118
},
{
"epoch": 2.5131948343627175,
"grad_norm": 0.19524161517620087,
"learning_rate": 7.770993795731984e-07,
"loss": 0.4201,
"step": 1119
},
{
"epoch": 2.5154407636159464,
"grad_norm": 0.18067501485347748,
"learning_rate": 7.701109192126438e-07,
"loss": 0.4176,
"step": 1120
},
{
"epoch": 2.517686692869175,
"grad_norm": 0.2033979743719101,
"learning_rate": 7.631514017799451e-07,
"loss": 0.4368,
"step": 1121
},
{
"epoch": 2.519932622122403,
"grad_norm": 0.19285623729228973,
"learning_rate": 7.56220874895458e-07,
"loss": 0.3991,
"step": 1122
},
{
"epoch": 2.5221785513756316,
"grad_norm": 0.2072119414806366,
"learning_rate": 7.493193859811643e-07,
"loss": 0.439,
"step": 1123
},
{
"epoch": 2.52442448062886,
"grad_norm": 0.1932649165391922,
"learning_rate": 7.424469822603613e-07,
"loss": 0.4049,
"step": 1124
},
{
"epoch": 2.5266704098820885,
"grad_norm": 0.19807684421539307,
"learning_rate": 7.356037107573255e-07,
"loss": 0.417,
"step": 1125
},
{
"epoch": 2.5289163391353173,
"grad_norm": 0.19324353337287903,
"learning_rate": 7.287896182970011e-07,
"loss": 0.4432,
"step": 1126
},
{
"epoch": 2.5311622683885457,
"grad_norm": 0.18392372131347656,
"learning_rate": 7.220047515046729e-07,
"loss": 0.3841,
"step": 1127
},
{
"epoch": 2.533408197641774,
"grad_norm": 0.209947869181633,
"learning_rate": 7.152491568056524e-07,
"loss": 0.4411,
"step": 1128
},
{
"epoch": 2.535654126895003,
"grad_norm": 0.2061583697795868,
"learning_rate": 7.085228804249538e-07,
"loss": 0.4309,
"step": 1129
},
{
"epoch": 2.5379000561482314,
"grad_norm": 0.20045273005962372,
"learning_rate": 7.018259683869827e-07,
"loss": 0.4388,
"step": 1130
},
{
"epoch": 2.54014598540146,
"grad_norm": 0.22213779389858246,
"learning_rate": 6.9515846651522e-07,
"loss": 0.4372,
"step": 1131
},
{
"epoch": 2.5423919146546883,
"grad_norm": 0.20819616317749023,
"learning_rate": 6.885204204319096e-07,
"loss": 0.4334,
"step": 1132
},
{
"epoch": 2.5446378439079167,
"grad_norm": 0.1961192786693573,
"learning_rate": 6.819118755577419e-07,
"loss": 0.4276,
"step": 1133
},
{
"epoch": 2.5468837731611456,
"grad_norm": 0.19298788905143738,
"learning_rate": 6.753328771115503e-07,
"loss": 0.4254,
"step": 1134
},
{
"epoch": 2.549129702414374,
"grad_norm": 0.17879649996757507,
"learning_rate": 6.687834701099921e-07,
"loss": 0.3883,
"step": 1135
},
{
"epoch": 2.5513756316676024,
"grad_norm": 0.19765320420265198,
"learning_rate": 6.622636993672477e-07,
"loss": 0.4365,
"step": 1136
},
{
"epoch": 2.5536215609208313,
"grad_norm": 0.19494295120239258,
"learning_rate": 6.557736094947137e-07,
"loss": 0.4137,
"step": 1137
},
{
"epoch": 2.5558674901740597,
"grad_norm": 0.21241888403892517,
"learning_rate": 6.493132449006939e-07,
"loss": 0.4415,
"step": 1138
},
{
"epoch": 2.558113419427288,
"grad_norm": 0.19840067625045776,
"learning_rate": 6.428826497900992e-07,
"loss": 0.459,
"step": 1139
},
{
"epoch": 2.5603593486805165,
"grad_norm": 0.19528187811374664,
"learning_rate": 6.364818681641438e-07,
"loss": 0.4057,
"step": 1140
},
{
"epoch": 2.562605277933745,
"grad_norm": 0.20458662509918213,
"learning_rate": 6.301109438200403e-07,
"loss": 0.442,
"step": 1141
},
{
"epoch": 2.5648512071869733,
"grad_norm": 0.2054254561662674,
"learning_rate": 6.237699203507058e-07,
"loss": 0.4237,
"step": 1142
},
{
"epoch": 2.567097136440202,
"grad_norm": 0.2081318199634552,
"learning_rate": 6.174588411444621e-07,
"loss": 0.4252,
"step": 1143
},
{
"epoch": 2.5693430656934306,
"grad_norm": 0.19667313992977142,
"learning_rate": 6.111777493847365e-07,
"loss": 0.4285,
"step": 1144
},
{
"epoch": 2.571588994946659,
"grad_norm": 0.1907162368297577,
"learning_rate": 6.0492668804977e-07,
"loss": 0.4135,
"step": 1145
},
{
"epoch": 2.573834924199888,
"grad_norm": 0.1859651803970337,
"learning_rate": 5.987056999123175e-07,
"loss": 0.4227,
"step": 1146
},
{
"epoch": 2.5760808534531163,
"grad_norm": 0.20672091841697693,
"learning_rate": 5.925148275393621e-07,
"loss": 0.424,
"step": 1147
},
{
"epoch": 2.5783267827063447,
"grad_norm": 0.19329291582107544,
"learning_rate": 5.863541132918171e-07,
"loss": 0.4062,
"step": 1148
},
{
"epoch": 2.580572711959573,
"grad_norm": 0.2025369554758072,
"learning_rate": 5.802235993242428e-07,
"loss": 0.458,
"step": 1149
},
{
"epoch": 2.5828186412128016,
"grad_norm": 0.20467379689216614,
"learning_rate": 5.741233275845537e-07,
"loss": 0.4513,
"step": 1150
},
{
"epoch": 2.5850645704660304,
"grad_norm": 0.2135162204504013,
"learning_rate": 5.680533398137305e-07,
"loss": 0.3892,
"step": 1151
},
{
"epoch": 2.587310499719259,
"grad_norm": 0.19036920368671417,
"learning_rate": 5.620136775455387e-07,
"loss": 0.4193,
"step": 1152
},
{
"epoch": 2.5895564289724873,
"grad_norm": 0.17486929893493652,
"learning_rate": 5.560043821062421e-07,
"loss": 0.39,
"step": 1153
},
{
"epoch": 2.591802358225716,
"grad_norm": 0.19415879249572754,
"learning_rate": 5.50025494614318e-07,
"loss": 0.4548,
"step": 1154
},
{
"epoch": 2.5940482874789446,
"grad_norm": 0.1955343335866928,
"learning_rate": 5.440770559801817e-07,
"loss": 0.4209,
"step": 1155
},
{
"epoch": 2.596294216732173,
"grad_norm": 0.22082816064357758,
"learning_rate": 5.381591069058973e-07,
"loss": 0.4281,
"step": 1156
},
{
"epoch": 2.5985401459854014,
"grad_norm": 0.19918213784694672,
"learning_rate": 5.322716878849104e-07,
"loss": 0.4192,
"step": 1157
},
{
"epoch": 2.60078607523863,
"grad_norm": 0.19099506735801697,
"learning_rate": 5.264148392017621e-07,
"loss": 0.4085,
"step": 1158
},
{
"epoch": 2.6030320044918582,
"grad_norm": 0.20114244520664215,
"learning_rate": 5.205886009318184e-07,
"loss": 0.4239,
"step": 1159
},
{
"epoch": 2.605277933745087,
"grad_norm": 0.19720801711082458,
"learning_rate": 5.147930129409928e-07,
"loss": 0.4299,
"step": 1160
},
{
"epoch": 2.6075238629983155,
"grad_norm": 0.19777406752109528,
"learning_rate": 5.090281148854737e-07,
"loss": 0.431,
"step": 1161
},
{
"epoch": 2.609769792251544,
"grad_norm": 0.19977416098117828,
"learning_rate": 5.032939462114572e-07,
"loss": 0.4257,
"step": 1162
},
{
"epoch": 2.612015721504773,
"grad_norm": 0.20614181458950043,
"learning_rate": 4.975905461548725e-07,
"loss": 0.437,
"step": 1163
},
{
"epoch": 2.614261650758001,
"grad_norm": 0.1861875206232071,
"learning_rate": 4.919179537411161e-07,
"loss": 0.4164,
"step": 1164
},
{
"epoch": 2.6165075800112296,
"grad_norm": 0.19667655229568481,
"learning_rate": 4.862762077847844e-07,
"loss": 0.4375,
"step": 1165
},
{
"epoch": 2.618753509264458,
"grad_norm": 0.18777360022068024,
"learning_rate": 4.806653468894051e-07,
"loss": 0.4238,
"step": 1166
},
{
"epoch": 2.6209994385176865,
"grad_norm": 0.18164758384227753,
"learning_rate": 4.750854094471757e-07,
"loss": 0.3991,
"step": 1167
},
{
"epoch": 2.6232453677709153,
"grad_norm": 0.1905893087387085,
"learning_rate": 4.695364336387037e-07,
"loss": 0.4175,
"step": 1168
},
{
"epoch": 2.6254912970241437,
"grad_norm": 0.19531551003456116,
"learning_rate": 4.6401845743273945e-07,
"loss": 0.4588,
"step": 1169
},
{
"epoch": 2.627737226277372,
"grad_norm": 0.1983010172843933,
"learning_rate": 4.585315185859218e-07,
"loss": 0.4121,
"step": 1170
},
{
"epoch": 2.629983155530601,
"grad_norm": 0.18379969894886017,
"learning_rate": 4.53075654642513e-07,
"loss": 0.4074,
"step": 1171
},
{
"epoch": 2.6322290847838294,
"grad_norm": 0.1939253956079483,
"learning_rate": 4.476509029341497e-07,
"loss": 0.4521,
"step": 1172
},
{
"epoch": 2.634475014037058,
"grad_norm": 0.19147953391075134,
"learning_rate": 4.422573005795827e-07,
"loss": 0.4376,
"step": 1173
},
{
"epoch": 2.6367209432902863,
"grad_norm": 0.19624711573123932,
"learning_rate": 4.368948844844223e-07,
"loss": 0.4182,
"step": 1174
},
{
"epoch": 2.6389668725435147,
"grad_norm": 0.18469464778900146,
"learning_rate": 4.3156369134089103e-07,
"loss": 0.4354,
"step": 1175
},
{
"epoch": 2.6412128017967436,
"grad_norm": 0.19770587980747223,
"learning_rate": 4.262637576275641e-07,
"loss": 0.4047,
"step": 1176
},
{
"epoch": 2.643458731049972,
"grad_norm": 0.1849193125963211,
"learning_rate": 4.209951196091294e-07,
"loss": 0.4088,
"step": 1177
},
{
"epoch": 2.6457046603032004,
"grad_norm": 0.2000664472579956,
"learning_rate": 4.1575781333613176e-07,
"loss": 0.4539,
"step": 1178
},
{
"epoch": 2.647950589556429,
"grad_norm": 0.19104914367198944,
"learning_rate": 4.1055187464473125e-07,
"loss": 0.4097,
"step": 1179
},
{
"epoch": 2.6501965188096577,
"grad_norm": 0.19243937730789185,
"learning_rate": 4.0537733915645474e-07,
"loss": 0.4218,
"step": 1180
},
{
"epoch": 2.652442448062886,
"grad_norm": 0.19347138702869415,
"learning_rate": 4.00234242277952e-07,
"loss": 0.4278,
"step": 1181
},
{
"epoch": 2.6546883773161145,
"grad_norm": 0.18277958035469055,
"learning_rate": 3.951226192007568e-07,
"loss": 0.4373,
"step": 1182
},
{
"epoch": 2.656934306569343,
"grad_norm": 0.18322674930095673,
"learning_rate": 3.900425049010437e-07,
"loss": 0.4349,
"step": 1183
},
{
"epoch": 2.6591802358225713,
"grad_norm": 0.19357453286647797,
"learning_rate": 3.8499393413938937e-07,
"loss": 0.4287,
"step": 1184
},
{
"epoch": 2.6614261650758,
"grad_norm": 0.19136710464954376,
"learning_rate": 3.799769414605342e-07,
"loss": 0.4154,
"step": 1185
},
{
"epoch": 2.6636720943290286,
"grad_norm": 0.18795958161354065,
"learning_rate": 3.7499156119314537e-07,
"loss": 0.4077,
"step": 1186
},
{
"epoch": 2.665918023582257,
"grad_norm": 0.18998844921588898,
"learning_rate": 3.700378274495825e-07,
"loss": 0.4125,
"step": 1187
},
{
"epoch": 2.668163952835486,
"grad_norm": 0.194740891456604,
"learning_rate": 3.6511577412566665e-07,
"loss": 0.439,
"step": 1188
},
{
"epoch": 2.6704098820887143,
"grad_norm": 0.18627774715423584,
"learning_rate": 3.602254349004447e-07,
"loss": 0.4139,
"step": 1189
},
{
"epoch": 2.6726558113419427,
"grad_norm": 0.20535503327846527,
"learning_rate": 3.553668432359625e-07,
"loss": 0.441,
"step": 1190
},
{
"epoch": 2.674901740595171,
"grad_norm": 0.18549248576164246,
"learning_rate": 3.5054003237702916e-07,
"loss": 0.441,
"step": 1191
},
{
"epoch": 2.6771476698483996,
"grad_norm": 0.17974409461021423,
"learning_rate": 3.45745035351e-07,
"loss": 0.3985,
"step": 1192
},
{
"epoch": 2.6793935991016284,
"grad_norm": 0.194856658577919,
"learning_rate": 3.4098188496754057e-07,
"loss": 0.4406,
"step": 1193
},
{
"epoch": 2.681639528354857,
"grad_norm": 0.1955060213804245,
"learning_rate": 3.362506138184085e-07,
"loss": 0.4168,
"step": 1194
},
{
"epoch": 2.6838854576080853,
"grad_norm": 0.18493853509426117,
"learning_rate": 3.3155125427722814e-07,
"loss": 0.4128,
"step": 1195
},
{
"epoch": 2.686131386861314,
"grad_norm": 0.19132456183433533,
"learning_rate": 3.268838384992695e-07,
"loss": 0.4372,
"step": 1196
},
{
"epoch": 2.6883773161145426,
"grad_norm": 0.18947117030620575,
"learning_rate": 3.2224839842122713e-07,
"loss": 0.4166,
"step": 1197
},
{
"epoch": 2.690623245367771,
"grad_norm": 0.17782782018184662,
"learning_rate": 3.1764496576100425e-07,
"loss": 0.3997,
"step": 1198
},
{
"epoch": 2.6928691746209994,
"grad_norm": 0.19115474820137024,
"learning_rate": 3.1307357201749157e-07,
"loss": 0.4568,
"step": 1199
},
{
"epoch": 2.695115103874228,
"grad_norm": 0.18287594616413116,
"learning_rate": 3.0853424847035573e-07,
"loss": 0.4024,
"step": 1200
},
{
"epoch": 2.6973610331274562,
"grad_norm": 0.20194946229457855,
"learning_rate": 3.040270261798245e-07,
"loss": 0.4233,
"step": 1201
},
{
"epoch": 2.699606962380685,
"grad_norm": 0.18246972560882568,
"learning_rate": 2.995519359864707e-07,
"loss": 0.4282,
"step": 1202
},
{
"epoch": 2.7018528916339135,
"grad_norm": 0.17514237761497498,
"learning_rate": 2.9510900851100646e-07,
"loss": 0.4079,
"step": 1203
},
{
"epoch": 2.704098820887142,
"grad_norm": 0.18999601900577545,
"learning_rate": 2.90698274154072e-07,
"loss": 0.4401,
"step": 1204
},
{
"epoch": 2.706344750140371,
"grad_norm": 0.1866077333688736,
"learning_rate": 2.863197630960224e-07,
"loss": 0.4019,
"step": 1205
},
{
"epoch": 2.708590679393599,
"grad_norm": 0.18696747720241547,
"learning_rate": 2.81973505296731e-07,
"loss": 0.4247,
"step": 1206
},
{
"epoch": 2.7108366086468276,
"grad_norm": 0.1890602558851242,
"learning_rate": 2.776595304953739e-07,
"loss": 0.4345,
"step": 1207
},
{
"epoch": 2.713082537900056,
"grad_norm": 0.21192647516727448,
"learning_rate": 2.7337786821023503e-07,
"loss": 0.4338,
"step": 1208
},
{
"epoch": 2.7153284671532845,
"grad_norm": 0.19118010997772217,
"learning_rate": 2.691285477384986e-07,
"loss": 0.4223,
"step": 1209
},
{
"epoch": 2.7175743964065133,
"grad_norm": 0.1966598927974701,
"learning_rate": 2.6491159815605294e-07,
"loss": 0.4268,
"step": 1210
},
{
"epoch": 2.7198203256597417,
"grad_norm": 0.18716298043727875,
"learning_rate": 2.6072704831728633e-07,
"loss": 0.4214,
"step": 1211
},
{
"epoch": 2.72206625491297,
"grad_norm": 0.19453807175159454,
"learning_rate": 2.5657492685489283e-07,
"loss": 0.4527,
"step": 1212
},
{
"epoch": 2.724312184166199,
"grad_norm": 0.18477098643779755,
"learning_rate": 2.5245526217967887e-07,
"loss": 0.3948,
"step": 1213
},
{
"epoch": 2.7265581134194274,
"grad_norm": 0.19278430938720703,
"learning_rate": 2.4836808248036305e-07,
"loss": 0.4088,
"step": 1214
},
{
"epoch": 2.728804042672656,
"grad_norm": 0.18528202176094055,
"learning_rate": 2.443134157233873e-07,
"loss": 0.4136,
"step": 1215
},
{
"epoch": 2.7310499719258843,
"grad_norm": 0.18257422745227814,
"learning_rate": 2.40291289652726e-07,
"loss": 0.4362,
"step": 1216
},
{
"epoch": 2.7332959011791127,
"grad_norm": 0.18132422864437103,
"learning_rate": 2.363017317896904e-07,
"loss": 0.402,
"step": 1217
},
{
"epoch": 2.735541830432341,
"grad_norm": 0.17809224128723145,
"learning_rate": 2.323447694327491e-07,
"loss": 0.4177,
"step": 1218
},
{
"epoch": 2.73778775968557,
"grad_norm": 0.19087287783622742,
"learning_rate": 2.284204296573328e-07,
"loss": 0.4402,
"step": 1219
},
{
"epoch": 2.7400336889387984,
"grad_norm": 0.17470994591712952,
"learning_rate": 2.2452873931565534e-07,
"loss": 0.4098,
"step": 1220
},
{
"epoch": 2.742279618192027,
"grad_norm": 0.18862077593803406,
"learning_rate": 2.2066972503652807e-07,
"loss": 0.4231,
"step": 1221
},
{
"epoch": 2.7445255474452557,
"grad_norm": 0.19681653380393982,
"learning_rate": 2.1684341322517343e-07,
"loss": 0.4438,
"step": 1222
},
{
"epoch": 2.746771476698484,
"grad_norm": 0.193466454744339,
"learning_rate": 2.130498300630518e-07,
"loss": 0.4131,
"step": 1223
},
{
"epoch": 2.7490174059517125,
"grad_norm": 0.1851184368133545,
"learning_rate": 2.092890015076765e-07,
"loss": 0.4047,
"step": 1224
},
{
"epoch": 2.751263335204941,
"grad_norm": 0.192936971783638,
"learning_rate": 2.0556095329243853e-07,
"loss": 0.4201,
"step": 1225
},
{
"epoch": 2.7535092644581693,
"grad_norm": 0.19225548207759857,
"learning_rate": 2.0186571092642992e-07,
"loss": 0.4182,
"step": 1226
},
{
"epoch": 2.755755193711398,
"grad_norm": 0.1829329878091812,
"learning_rate": 1.9820329969426954e-07,
"loss": 0.4217,
"step": 1227
},
{
"epoch": 2.7580011229646266,
"grad_norm": 0.18259641528129578,
"learning_rate": 1.9457374465592927e-07,
"loss": 0.4343,
"step": 1228
},
{
"epoch": 2.760247052217855,
"grad_norm": 0.1908586025238037,
"learning_rate": 1.9097707064656523e-07,
"loss": 0.4135,
"step": 1229
},
{
"epoch": 2.762492981471084,
"grad_norm": 0.18594199419021606,
"learning_rate": 1.8741330227634412e-07,
"loss": 0.4226,
"step": 1230
},
{
"epoch": 2.7647389107243123,
"grad_norm": 0.19256974756717682,
"learning_rate": 1.8388246393027552e-07,
"loss": 0.4267,
"step": 1231
},
{
"epoch": 2.7669848399775407,
"grad_norm": 0.2004840224981308,
"learning_rate": 1.8038457976804812e-07,
"loss": 0.4255,
"step": 1232
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.19169549643993378,
"learning_rate": 1.76919673723861e-07,
"loss": 0.3957,
"step": 1233
},
{
"epoch": 2.7714766984839976,
"grad_norm": 0.20605385303497314,
"learning_rate": 1.7348776950626146e-07,
"loss": 0.4388,
"step": 1234
},
{
"epoch": 2.7737226277372264,
"grad_norm": 0.18344692885875702,
"learning_rate": 1.7008889059798306e-07,
"loss": 0.422,
"step": 1235
},
{
"epoch": 2.775968556990455,
"grad_norm": 0.17997150123119354,
"learning_rate": 1.66723060255784e-07,
"loss": 0.4304,
"step": 1236
},
{
"epoch": 2.7782144862436833,
"grad_norm": 0.18032418191432953,
"learning_rate": 1.633903015102878e-07,
"loss": 0.4372,
"step": 1237
},
{
"epoch": 2.7804604154969117,
"grad_norm": 0.1851246953010559,
"learning_rate": 1.600906371658262e-07,
"loss": 0.4099,
"step": 1238
},
{
"epoch": 2.7827063447501406,
"grad_norm": 0.19547966122627258,
"learning_rate": 1.568240898002843e-07,
"loss": 0.4284,
"step": 1239
},
{
"epoch": 2.784952274003369,
"grad_norm": 0.20962592959403992,
"learning_rate": 1.5359068176494462e-07,
"loss": 0.4296,
"step": 1240
},
{
"epoch": 2.7871982032565974,
"grad_norm": 0.17490459978580475,
"learning_rate": 1.5039043518433383e-07,
"loss": 0.3977,
"step": 1241
},
{
"epoch": 2.789444132509826,
"grad_norm": 0.1864641159772873,
"learning_rate": 1.4722337195607228e-07,
"loss": 0.3936,
"step": 1242
},
{
"epoch": 2.7916900617630542,
"grad_norm": 0.19050495326519012,
"learning_rate": 1.4408951375072477e-07,
"loss": 0.4443,
"step": 1243
},
{
"epoch": 2.793935991016283,
"grad_norm": 0.1971900761127472,
"learning_rate": 1.4098888201165005e-07,
"loss": 0.453,
"step": 1244
},
{
"epoch": 2.7961819202695115,
"grad_norm": 0.19153332710266113,
"learning_rate": 1.3792149795485655e-07,
"loss": 0.4088,
"step": 1245
},
{
"epoch": 2.79842784952274,
"grad_norm": 0.19257591664791107,
"learning_rate": 1.348873825688557e-07,
"loss": 0.4256,
"step": 1246
},
{
"epoch": 2.800673778775969,
"grad_norm": 0.18633553385734558,
"learning_rate": 1.3188655661451833e-07,
"loss": 0.4187,
"step": 1247
},
{
"epoch": 2.802919708029197,
"grad_norm": 0.18796589970588684,
"learning_rate": 1.2891904062493355e-07,
"loss": 0.44,
"step": 1248
},
{
"epoch": 2.8051656372824256,
"grad_norm": 0.19225618243217468,
"learning_rate": 1.259848549052689e-07,
"loss": 0.4402,
"step": 1249
},
{
"epoch": 2.807411566535654,
"grad_norm": 0.185172900557518,
"learning_rate": 1.2308401953262716e-07,
"loss": 0.4066,
"step": 1250
},
{
"epoch": 2.8096574957888825,
"grad_norm": 0.17786043882369995,
"learning_rate": 1.2021655435591472e-07,
"loss": 0.4176,
"step": 1251
},
{
"epoch": 2.8119034250421113,
"grad_norm": 0.19384820759296417,
"learning_rate": 1.1738247899570287e-07,
"loss": 0.4345,
"step": 1252
},
{
"epoch": 2.8141493542953397,
"grad_norm": 0.18932607769966125,
"learning_rate": 1.145818128440923e-07,
"loss": 0.4076,
"step": 1253
},
{
"epoch": 2.816395283548568,
"grad_norm": 0.20454899966716766,
"learning_rate": 1.1181457506458271e-07,
"loss": 0.4638,
"step": 1254
},
{
"epoch": 2.8186412128017966,
"grad_norm": 0.17342036962509155,
"learning_rate": 1.0908078459194227e-07,
"loss": 0.3839,
"step": 1255
},
{
"epoch": 2.8208871420550254,
"grad_norm": 0.20639826357364655,
"learning_rate": 1.0638046013207337e-07,
"loss": 0.4386,
"step": 1256
},
{
"epoch": 2.823133071308254,
"grad_norm": 0.20575068891048431,
"learning_rate": 1.0371362016189158e-07,
"loss": 0.4155,
"step": 1257
},
{
"epoch": 2.8253790005614823,
"grad_norm": 0.1837739795446396,
"learning_rate": 1.0108028292919237e-07,
"loss": 0.4209,
"step": 1258
},
{
"epoch": 2.8276249298147107,
"grad_norm": 0.1831589937210083,
"learning_rate": 9.848046645253184e-08,
"loss": 0.4171,
"step": 1259
},
{
"epoch": 2.829870859067939,
"grad_norm": 0.20222926139831543,
"learning_rate": 9.591418852109957e-08,
"loss": 0.4118,
"step": 1260
},
{
"epoch": 2.832116788321168,
"grad_norm": 0.1949760466814041,
"learning_rate": 9.338146669459925e-08,
"loss": 0.4126,
"step": 1261
},
{
"epoch": 2.8343627175743964,
"grad_norm": 0.1802796870470047,
"learning_rate": 9.088231830312655e-08,
"loss": 0.4435,
"step": 1262
},
{
"epoch": 2.836608646827625,
"grad_norm": 0.19096410274505615,
"learning_rate": 8.841676044705261e-08,
"loss": 0.4398,
"step": 1263
},
{
"epoch": 2.8388545760808537,
"grad_norm": 0.1780502051115036,
"learning_rate": 8.598480999690573e-08,
"loss": 0.4135,
"step": 1264
},
{
"epoch": 2.841100505334082,
"grad_norm": 0.18318232893943787,
"learning_rate": 8.358648359325539e-08,
"loss": 0.4294,
"step": 1265
},
{
"epoch": 2.8433464345873105,
"grad_norm": 0.186601459980011,
"learning_rate": 8.122179764660121e-08,
"loss": 0.4435,
"step": 1266
},
{
"epoch": 2.845592363840539,
"grad_norm": 0.19515588879585266,
"learning_rate": 7.889076833725695e-08,
"loss": 0.4068,
"step": 1267
},
{
"epoch": 2.8478382930937673,
"grad_norm": 0.1878891885280609,
"learning_rate": 7.659341161524225e-08,
"loss": 0.4538,
"step": 1268
},
{
"epoch": 2.850084222346996,
"grad_norm": 0.18124721944332123,
"learning_rate": 7.432974320017216e-08,
"loss": 0.4121,
"step": 1269
},
{
"epoch": 2.8523301516002246,
"grad_norm": 0.19087855517864227,
"learning_rate": 7.209977858115058e-08,
"loss": 0.4439,
"step": 1270
},
{
"epoch": 2.854576080853453,
"grad_norm": 0.17772875726222992,
"learning_rate": 6.990353301666475e-08,
"loss": 0.4262,
"step": 1271
},
{
"epoch": 2.856822010106682,
"grad_norm": 0.1787647157907486,
"learning_rate": 6.774102153447814e-08,
"loss": 0.4057,
"step": 1272
},
{
"epoch": 2.8590679393599103,
"grad_norm": 0.20238277316093445,
"learning_rate": 6.561225893153112e-08,
"loss": 0.4361,
"step": 1273
},
{
"epoch": 2.8613138686131387,
"grad_norm": 0.1737276315689087,
"learning_rate": 6.351725977383704e-08,
"loss": 0.3966,
"step": 1274
},
{
"epoch": 2.863559797866367,
"grad_norm": 0.18557517230510712,
"learning_rate": 6.14560383963847e-08,
"loss": 0.438,
"step": 1275
},
{
"epoch": 2.8658057271195956,
"grad_norm": 0.19200386106967926,
"learning_rate": 5.94286089030377e-08,
"loss": 0.4359,
"step": 1276
},
{
"epoch": 2.868051656372824,
"grad_norm": 0.17796795070171356,
"learning_rate": 5.743498516644019e-08,
"loss": 0.4051,
"step": 1277
},
{
"epoch": 2.870297585626053,
"grad_norm": 0.19527527689933777,
"learning_rate": 5.547518082792136e-08,
"loss": 0.4301,
"step": 1278
},
{
"epoch": 2.8725435148792813,
"grad_norm": 0.1781342327594757,
"learning_rate": 5.354920929740048e-08,
"loss": 0.4249,
"step": 1279
},
{
"epoch": 2.8747894441325097,
"grad_norm": 0.1727127581834793,
"learning_rate": 5.1657083753299256e-08,
"loss": 0.4137,
"step": 1280
},
{
"epoch": 2.8770353733857386,
"grad_norm": 0.18090222775936127,
"learning_rate": 4.979881714244628e-08,
"loss": 0.4256,
"step": 1281
},
{
"epoch": 2.879281302638967,
"grad_norm": 0.18660210072994232,
"learning_rate": 4.797442217999215e-08,
"loss": 0.4152,
"step": 1282
},
{
"epoch": 2.8815272318921954,
"grad_norm": 0.19095072150230408,
"learning_rate": 4.618391134932121e-08,
"loss": 0.4072,
"step": 1283
},
{
"epoch": 2.883773161145424,
"grad_norm": 0.1802065372467041,
"learning_rate": 4.442729690196657e-08,
"loss": 0.4397,
"step": 1284
},
{
"epoch": 2.8860190903986522,
"grad_norm": 0.1801634430885315,
"learning_rate": 4.270459085752687e-08,
"loss": 0.4234,
"step": 1285
},
{
"epoch": 2.888265019651881,
"grad_norm": 0.17258504033088684,
"learning_rate": 4.101580500358082e-08,
"loss": 0.4047,
"step": 1286
},
{
"epoch": 2.8905109489051095,
"grad_norm": 0.194522425532341,
"learning_rate": 3.936095089561165e-08,
"loss": 0.4544,
"step": 1287
},
{
"epoch": 2.892756878158338,
"grad_norm": 0.18839098513126373,
"learning_rate": 3.774003985692387e-08,
"loss": 0.4202,
"step": 1288
},
{
"epoch": 2.895002807411567,
"grad_norm": 0.18398089706897736,
"learning_rate": 3.615308297856668e-08,
"loss": 0.4098,
"step": 1289
},
{
"epoch": 2.897248736664795,
"grad_norm": 0.1946476548910141,
"learning_rate": 3.4600091119260106e-08,
"loss": 0.449,
"step": 1290
},
{
"epoch": 2.8994946659180236,
"grad_norm": 0.186300590634346,
"learning_rate": 3.308107490531842e-08,
"loss": 0.4285,
"step": 1291
},
{
"epoch": 2.901740595171252,
"grad_norm": 0.18534432351589203,
"learning_rate": 3.159604473057909e-08,
"loss": 0.4392,
"step": 1292
},
{
"epoch": 2.9039865244244805,
"grad_norm": 0.18315456807613373,
"learning_rate": 3.0145010756328364e-08,
"loss": 0.4178,
"step": 1293
},
{
"epoch": 2.906232453677709,
"grad_norm": 0.1906488984823227,
"learning_rate": 2.8727982911238017e-08,
"loss": 0.4339,
"step": 1294
},
{
"epoch": 2.9084783829309377,
"grad_norm": 0.18358033895492554,
"learning_rate": 2.73449708912904e-08,
"loss": 0.4031,
"step": 1295
},
{
"epoch": 2.910724312184166,
"grad_norm": 0.19111478328704834,
"learning_rate": 2.599598415971627e-08,
"loss": 0.423,
"step": 1296
},
{
"epoch": 2.9129702414373946,
"grad_norm": 0.17649492621421814,
"learning_rate": 2.4681031946929834e-08,
"loss": 0.4165,
"step": 1297
},
{
"epoch": 2.9152161706906234,
"grad_norm": 0.190648153424263,
"learning_rate": 2.340012325046326e-08,
"loss": 0.408,
"step": 1298
},
{
"epoch": 2.917462099943852,
"grad_norm": 0.17262622714042664,
"learning_rate": 2.2153266834908927e-08,
"loss": 0.4148,
"step": 1299
},
{
"epoch": 2.9197080291970803,
"grad_norm": 0.18755358457565308,
"learning_rate": 2.0940471231855052e-08,
"loss": 0.4272,
"step": 1300
},
{
"epoch": 2.9219539584503087,
"grad_norm": 0.19861868023872375,
"learning_rate": 1.9761744739830723e-08,
"loss": 0.4661,
"step": 1301
},
{
"epoch": 2.924199887703537,
"grad_norm": 0.18785429000854492,
"learning_rate": 1.86170954242465e-08,
"loss": 0.4185,
"step": 1302
},
{
"epoch": 2.926445816956766,
"grad_norm": 0.1750560849905014,
"learning_rate": 1.750653111734224e-08,
"loss": 0.4075,
"step": 1303
},
{
"epoch": 2.9286917462099944,
"grad_norm": 0.18948881328105927,
"learning_rate": 1.643005941813103e-08,
"loss": 0.4398,
"step": 1304
},
{
"epoch": 2.930937675463223,
"grad_norm": 0.17896808683872223,
"learning_rate": 1.538768769234811e-08,
"loss": 0.4188,
"step": 1305
},
{
"epoch": 2.9331836047164517,
"grad_norm": 0.17966261506080627,
"learning_rate": 1.4379423072399812e-08,
"loss": 0.4168,
"step": 1306
},
{
"epoch": 2.93542953396968,
"grad_norm": 0.1799083948135376,
"learning_rate": 1.3405272457315822e-08,
"loss": 0.4184,
"step": 1307
},
{
"epoch": 2.9376754632229085,
"grad_norm": 0.18236926198005676,
"learning_rate": 1.2465242512701425e-08,
"loss": 0.3994,
"step": 1308
},
{
"epoch": 2.939921392476137,
"grad_norm": 0.18373502790927887,
"learning_rate": 1.155933967069256e-08,
"loss": 0.4164,
"step": 1309
},
{
"epoch": 2.9421673217293653,
"grad_norm": 0.18677166104316711,
"learning_rate": 1.068757012990973e-08,
"loss": 0.4351,
"step": 1310
},
{
"epoch": 2.944413250982594,
"grad_norm": 0.17726090550422668,
"learning_rate": 9.8499398554186e-09,
"loss": 0.4065,
"step": 1311
},
{
"epoch": 2.9466591802358226,
"grad_norm": 0.1915796846151352,
"learning_rate": 9.046454578686136e-09,
"loss": 0.4182,
"step": 1312
},
{
"epoch": 2.948905109489051,
"grad_norm": 0.1905493140220642,
"learning_rate": 8.277119797543975e-09,
"loss": 0.4316,
"step": 1313
},
{
"epoch": 2.9511510387422795,
"grad_norm": 0.17604538798332214,
"learning_rate": 7.541940776149559e-09,
"loss": 0.4251,
"step": 1314
},
{
"epoch": 2.9533969679955083,
"grad_norm": 0.1901237815618515,
"learning_rate": 6.840922544948947e-09,
"loss": 0.436,
"step": 1315
},
{
"epoch": 2.9556428972487367,
"grad_norm": 0.1710127294063568,
"learning_rate": 6.174069900646285e-09,
"loss": 0.3952,
"step": 1316
},
{
"epoch": 2.957888826501965,
"grad_norm": 0.18744228780269623,
"learning_rate": 5.541387406165499e-09,
"loss": 0.4292,
"step": 1317
},
{
"epoch": 2.9601347557551936,
"grad_norm": 0.1872478723526001,
"learning_rate": 4.942879390624766e-09,
"loss": 0.452,
"step": 1318
},
{
"epoch": 2.962380685008422,
"grad_norm": 0.18257497251033783,
"learning_rate": 4.378549949301536e-09,
"loss": 0.4161,
"step": 1319
},
{
"epoch": 2.964626614261651,
"grad_norm": 0.1956602931022644,
"learning_rate": 3.848402943608664e-09,
"loss": 0.4329,
"step": 1320
},
{
"epoch": 2.9668725435148793,
"grad_norm": 0.18602769076824188,
"learning_rate": 3.352442001066103e-09,
"loss": 0.4274,
"step": 1321
},
{
"epoch": 2.9691184727681077,
"grad_norm": 0.17961065471172333,
"learning_rate": 2.8906705152759175e-09,
"loss": 0.3989,
"step": 1322
},
{
"epoch": 2.9713644020213366,
"grad_norm": 0.1952294111251831,
"learning_rate": 2.4630916459000844e-09,
"loss": 0.4381,
"step": 1323
},
{
"epoch": 2.973610331274565,
"grad_norm": 0.19028258323669434,
"learning_rate": 2.069708318638286e-09,
"loss": 0.422,
"step": 1324
},
{
"epoch": 2.9758562605277934,
"grad_norm": 0.182929128408432,
"learning_rate": 1.7105232252079274e-09,
"loss": 0.3971,
"step": 1325
},
{
"epoch": 2.978102189781022,
"grad_norm": 0.19286341965198517,
"learning_rate": 1.3855388233247057e-09,
"loss": 0.4152,
"step": 1326
},
{
"epoch": 2.9803481190342502,
"grad_norm": 0.1891261488199234,
"learning_rate": 1.0947573366881791e-09,
"loss": 0.4364,
"step": 1327
},
{
"epoch": 2.982594048287479,
"grad_norm": 0.18247157335281372,
"learning_rate": 8.381807549645571e-10,
"loss": 0.3987,
"step": 1328
},
{
"epoch": 2.9848399775407075,
"grad_norm": 0.19202907383441925,
"learning_rate": 6.158108337733782e-10,
"loss": 0.4559,
"step": 1329
},
{
"epoch": 2.987085906793936,
"grad_norm": 0.17795370519161224,
"learning_rate": 4.2764909467696293e-10,
"loss": 0.4122,
"step": 1330
},
{
"epoch": 2.9893318360471643,
"grad_norm": 0.19224773347377777,
"learning_rate": 2.736968251670913e-10,
"loss": 0.4322,
"step": 1331
},
{
"epoch": 2.991577765300393,
"grad_norm": 0.19133056700229645,
"learning_rate": 1.5395507866000637e-10,
"loss": 0.4018,
"step": 1332
},
{
"epoch": 2.9938236945536216,
"grad_norm": 0.1867966204881668,
"learning_rate": 6.842467448531231e-11,
"loss": 0.4393,
"step": 1333
},
{
"epoch": 2.99606962380685,
"grad_norm": 0.17837318778038025,
"learning_rate": 1.7106197883753894e-11,
"loss": 0.4112,
"step": 1334
},
{
"epoch": 2.9983155530600785,
"grad_norm": 0.185623437166214,
"learning_rate": 0.0,
"loss": 0.4139,
"step": 1335
},
{
"epoch": 2.9983155530600785,
"step": 1335,
"total_flos": 4617447504347136.0,
"train_loss": 0.46386746891428915,
"train_runtime": 117690.8189,
"train_samples_per_second": 2.905,
"train_steps_per_second": 0.011
}
],
"logging_steps": 1.0,
"max_steps": 1335,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4617447504347136.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}