{
"best_metric": 0.6652334928512573,
"best_model_checkpoint": "/l/users/visionlanguage/mostafa_ciai/hf_checkpoints_code_ciai_gemma2/checkpoint-1700",
"epoch": 5.994075260208167,
"eval_steps": 50,
"global_step": 1752,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006832132372564718,
"grad_norm": 93.82548522949219,
"learning_rate": 2.777777777777778e-06,
"loss": 208.4052,
"step": 2
},
{
"epoch": 0.013664264745129436,
"grad_norm": 65.51689147949219,
"learning_rate": 5.555555555555556e-06,
"loss": 194.4831,
"step": 4
},
{
"epoch": 0.020496397117694156,
"grad_norm": 30.816993713378906,
"learning_rate": 8.333333333333334e-06,
"loss": 159.6516,
"step": 6
},
{
"epoch": 0.027328529490258872,
"grad_norm": 30.113662719726562,
"learning_rate": 1.1111111111111112e-05,
"loss": 145.5557,
"step": 8
},
{
"epoch": 0.03416066186282359,
"grad_norm": 22.37295150756836,
"learning_rate": 1.388888888888889e-05,
"loss": 128.5444,
"step": 10
},
{
"epoch": 0.04099279423538831,
"grad_norm": 22.287870407104492,
"learning_rate": 1.6666666666666667e-05,
"loss": 116.2723,
"step": 12
},
{
"epoch": 0.04782492660795303,
"grad_norm": 16.027904510498047,
"learning_rate": 1.9444444444444445e-05,
"loss": 107.5451,
"step": 14
},
{
"epoch": 0.054657058980517745,
"grad_norm": 17.97212791442871,
"learning_rate": 2.2222222222222223e-05,
"loss": 100.7136,
"step": 16
},
{
"epoch": 0.061489191353082465,
"grad_norm": 15.427449226379395,
"learning_rate": 2.5e-05,
"loss": 96.4422,
"step": 18
},
{
"epoch": 0.06832132372564718,
"grad_norm": 11.836018562316895,
"learning_rate": 2.777777777777778e-05,
"loss": 89.9874,
"step": 20
},
{
"epoch": 0.0751534560982119,
"grad_norm": 13.170073509216309,
"learning_rate": 3.055555555555556e-05,
"loss": 90.5263,
"step": 22
},
{
"epoch": 0.08198558847077662,
"grad_norm": 12.781464576721191,
"learning_rate": 3.3333333333333335e-05,
"loss": 87.3144,
"step": 24
},
{
"epoch": 0.08881772084334134,
"grad_norm": 11.460458755493164,
"learning_rate": 3.611111111111111e-05,
"loss": 85.6209,
"step": 26
},
{
"epoch": 0.09564985321590606,
"grad_norm": 10.382000923156738,
"learning_rate": 3.888888888888889e-05,
"loss": 88.2803,
"step": 28
},
{
"epoch": 0.10248198558847077,
"grad_norm": 10.578895568847656,
"learning_rate": 4.166666666666667e-05,
"loss": 80.589,
"step": 30
},
{
"epoch": 0.10931411796103549,
"grad_norm": 10.231274604797363,
"learning_rate": 4.4444444444444447e-05,
"loss": 83.0791,
"step": 32
},
{
"epoch": 0.11614625033360021,
"grad_norm": 13.121459007263184,
"learning_rate": 4.722222222222222e-05,
"loss": 81.0775,
"step": 34
},
{
"epoch": 0.12297838270616493,
"grad_norm": 11.594988822937012,
"learning_rate": 5e-05,
"loss": 79.3985,
"step": 36
},
{
"epoch": 0.12981051507872965,
"grad_norm": 10.554534912109375,
"learning_rate": 4.9999832415172185e-05,
"loss": 78.9732,
"step": 38
},
{
"epoch": 0.13664264745129437,
"grad_norm": 9.661481857299805,
"learning_rate": 4.9999329662935534e-05,
"loss": 77.5229,
"step": 40
},
{
"epoch": 0.1434747798238591,
"grad_norm": 11.10251235961914,
"learning_rate": 4.9998491750030315e-05,
"loss": 77.7747,
"step": 42
},
{
"epoch": 0.1503069121964238,
"grad_norm": 9.058899879455566,
"learning_rate": 4.999731868769027e-05,
"loss": 79.2141,
"step": 44
},
{
"epoch": 0.15713904456898853,
"grad_norm": 9.254643440246582,
"learning_rate": 4.999581049164237e-05,
"loss": 77.5962,
"step": 46
},
{
"epoch": 0.16397117694155325,
"grad_norm": 10.37578010559082,
"learning_rate": 4.99939671821067e-05,
"loss": 76.6356,
"step": 48
},
{
"epoch": 0.17080330931411797,
"grad_norm": 9.983922004699707,
"learning_rate": 4.999178878379611e-05,
"loss": 76.0763,
"step": 50
},
{
"epoch": 0.17080330931411797,
"eval_loss": 1.20554518699646,
"eval_runtime": 119.3115,
"eval_samples_per_second": 33.065,
"eval_steps_per_second": 8.272,
"step": 50
},
{
"epoch": 0.1776354416866827,
"grad_norm": 9.109485626220703,
"learning_rate": 4.998927532591592e-05,
"loss": 75.2524,
"step": 52
},
{
"epoch": 0.1844675740592474,
"grad_norm": 8.939992904663086,
"learning_rate": 4.9986426842163515e-05,
"loss": 75.8614,
"step": 54
},
{
"epoch": 0.19129970643181213,
"grad_norm": 8.342733383178711,
"learning_rate": 4.9983243370727914e-05,
"loss": 72.864,
"step": 56
},
{
"epoch": 0.19813183880437685,
"grad_norm": 7.625518321990967,
"learning_rate": 4.9979724954289244e-05,
"loss": 75.7165,
"step": 58
},
{
"epoch": 0.20496397117694154,
"grad_norm": 6.545467853546143,
"learning_rate": 4.9975871640018154e-05,
"loss": 72.337,
"step": 60
},
{
"epoch": 0.21179610354950626,
"grad_norm": 8.73936939239502,
"learning_rate": 4.99716834795752e-05,
"loss": 73.0804,
"step": 62
},
{
"epoch": 0.21862823592207098,
"grad_norm": 7.599481105804443,
"learning_rate": 4.996716052911017e-05,
"loss": 71.3494,
"step": 64
},
{
"epoch": 0.2254603682946357,
"grad_norm": 8.88508415222168,
"learning_rate": 4.996230284926128e-05,
"loss": 73.4886,
"step": 66
},
{
"epoch": 0.23229250066720042,
"grad_norm": 7.141696453094482,
"learning_rate": 4.99571105051544e-05,
"loss": 73.0934,
"step": 68
},
{
"epoch": 0.23912463303976514,
"grad_norm": 8.946745872497559,
"learning_rate": 4.99515835664022e-05,
"loss": 70.5761,
"step": 70
},
{
"epoch": 0.24595676541232986,
"grad_norm": 7.428682804107666,
"learning_rate": 4.994572210710315e-05,
"loss": 69.8488,
"step": 72
},
{
"epoch": 0.2527888977848946,
"grad_norm": 10.490913391113281,
"learning_rate": 4.993952620584058e-05,
"loss": 72.1602,
"step": 74
},
{
"epoch": 0.2596210301574593,
"grad_norm": 6.010617733001709,
"learning_rate": 4.993299594568163e-05,
"loss": 70.0962,
"step": 76
},
{
"epoch": 0.26645316253002405,
"grad_norm": 5.207183361053467,
"learning_rate": 4.992613141417608e-05,
"loss": 70.6436,
"step": 78
},
{
"epoch": 0.27328529490258874,
"grad_norm": 7.816757678985596,
"learning_rate": 4.9918932703355256e-05,
"loss": 68.9464,
"step": 80
},
{
"epoch": 0.28011742727515343,
"grad_norm": 6.2263383865356445,
"learning_rate": 4.9911399909730714e-05,
"loss": 68.8249,
"step": 82
},
{
"epoch": 0.2869495596477182,
"grad_norm": 6.726258754730225,
"learning_rate": 4.990353313429303e-05,
"loss": 68.7637,
"step": 84
},
{
"epoch": 0.29378169202028287,
"grad_norm": 5.4038543701171875,
"learning_rate": 4.989533248251037e-05,
"loss": 68.7726,
"step": 86
},
{
"epoch": 0.3006138243928476,
"grad_norm": 9.256815910339355,
"learning_rate": 4.988679806432712e-05,
"loss": 68.2967,
"step": 88
},
{
"epoch": 0.3074459567654123,
"grad_norm": 7.765486717224121,
"learning_rate": 4.98779299941624e-05,
"loss": 70.6181,
"step": 90
},
{
"epoch": 0.31427808913797706,
"grad_norm": 7.625786304473877,
"learning_rate": 4.9868728390908526e-05,
"loss": 68.5738,
"step": 92
},
{
"epoch": 0.32111022151054175,
"grad_norm": 7.776100158691406,
"learning_rate": 4.985919337792944e-05,
"loss": 65.0074,
"step": 94
},
{
"epoch": 0.3279423538831065,
"grad_norm": 6.496335029602051,
"learning_rate": 4.9849325083059e-05,
"loss": 66.7343,
"step": 96
},
{
"epoch": 0.3347744862556712,
"grad_norm": 6.616697311401367,
"learning_rate": 4.983912363859935e-05,
"loss": 69.292,
"step": 98
},
{
"epoch": 0.34160661862823594,
"grad_norm": 7.259242057800293,
"learning_rate": 4.982858918131906e-05,
"loss": 66.8941,
"step": 100
},
{
"epoch": 0.34160661862823594,
"eval_loss": 1.0700218677520752,
"eval_runtime": 119.6843,
"eval_samples_per_second": 32.962,
"eval_steps_per_second": 8.247,
"step": 100
},
{
"epoch": 0.34843875100080063,
"grad_norm": 7.206521987915039,
"learning_rate": 4.981772185245135e-05,
"loss": 68.3145,
"step": 102
},
{
"epoch": 0.3552708833733654,
"grad_norm": 6.332549095153809,
"learning_rate": 4.980652179769218e-05,
"loss": 67.5062,
"step": 104
},
{
"epoch": 0.36210301574593007,
"grad_norm": 8.422966957092285,
"learning_rate": 4.979498916719828e-05,
"loss": 69.0426,
"step": 106
},
{
"epoch": 0.3689351481184948,
"grad_norm": 4.5074357986450195,
"learning_rate": 4.978312411558518e-05,
"loss": 66.0764,
"step": 108
},
{
"epoch": 0.3757672804910595,
"grad_norm": 6.847994327545166,
"learning_rate": 4.977092680192507e-05,
"loss": 68.0597,
"step": 110
},
{
"epoch": 0.38259941286362426,
"grad_norm": 9.010295867919922,
"learning_rate": 4.9758397389744734e-05,
"loss": 66.7856,
"step": 112
},
{
"epoch": 0.38943154523618895,
"grad_norm": 8.793087005615234,
"learning_rate": 4.9745536047023324e-05,
"loss": 66.6415,
"step": 114
},
{
"epoch": 0.3962636776087537,
"grad_norm": 6.820159912109375,
"learning_rate": 4.973234294619011e-05,
"loss": 66.8668,
"step": 116
},
{
"epoch": 0.4030958099813184,
"grad_norm": 10.739355087280273,
"learning_rate": 4.971881826412218e-05,
"loss": 64.5842,
"step": 118
},
{
"epoch": 0.4099279423538831,
"grad_norm": 6.451905727386475,
"learning_rate": 4.9704962182142044e-05,
"loss": 64.2948,
"step": 120
},
{
"epoch": 0.4167600747264478,
"grad_norm": 6.998046398162842,
"learning_rate": 4.9690774886015244e-05,
"loss": 66.095,
"step": 122
},
{
"epoch": 0.4235922070990125,
"grad_norm": 6.946700096130371,
"learning_rate": 4.967625656594782e-05,
"loss": 66.6205,
"step": 124
},
{
"epoch": 0.43042433947157727,
"grad_norm": 7.656089782714844,
"learning_rate": 4.966140741658379e-05,
"loss": 65.2253,
"step": 126
},
{
"epoch": 0.43725647184414196,
"grad_norm": 8.242254257202148,
"learning_rate": 4.9646227637002515e-05,
"loss": 65.4466,
"step": 128
},
{
"epoch": 0.4440886042167067,
"grad_norm": 6.5599894523620605,
"learning_rate": 4.963071743071607e-05,
"loss": 64.5302,
"step": 130
},
{
"epoch": 0.4509207365892714,
"grad_norm": 5.671536922454834,
"learning_rate": 4.961487700566646e-05,
"loss": 64.9711,
"step": 132
},
{
"epoch": 0.45775286896183615,
"grad_norm": 6.317226886749268,
"learning_rate": 4.9598706574222886e-05,
"loss": 66.1428,
"step": 134
},
{
"epoch": 0.46458500133440084,
"grad_norm": 7.731470584869385,
"learning_rate": 4.958220635317886e-05,
"loss": 65.6398,
"step": 136
},
{
"epoch": 0.4714171337069656,
"grad_norm": 7.070956230163574,
"learning_rate": 4.956537656374933e-05,
"loss": 64.027,
"step": 138
},
{
"epoch": 0.4782492660795303,
"grad_norm": 5.216205596923828,
"learning_rate": 4.9548217431567665e-05,
"loss": 64.9929,
"step": 140
},
{
"epoch": 0.485081398452095,
"grad_norm": 6.5882344245910645,
"learning_rate": 4.95307291866827e-05,
"loss": 66.2789,
"step": 142
},
{
"epoch": 0.4919135308246597,
"grad_norm": 5.5962934494018555,
"learning_rate": 4.95129120635556e-05,
"loss": 65.4516,
"step": 144
},
{
"epoch": 0.49874566319722446,
"grad_norm": 7.341054916381836,
"learning_rate": 4.949476630105669e-05,
"loss": 64.339,
"step": 146
},
{
"epoch": 0.5055777955697892,
"grad_norm": 7.5083441734313965,
"learning_rate": 4.9476292142462374e-05,
"loss": 62.7076,
"step": 148
},
{
"epoch": 0.5124099279423538,
"grad_norm": 5.081834316253662,
"learning_rate": 4.945748983545172e-05,
"loss": 64.2066,
"step": 150
},
{
"epoch": 0.5124099279423538,
"eval_loss": 0.9920685291290283,
"eval_runtime": 120.1858,
"eval_samples_per_second": 32.824,
"eval_steps_per_second": 8.212,
"step": 150
},
{
"epoch": 0.5192420603149186,
"grad_norm": 6.279696464538574,
"learning_rate": 4.943835963210324e-05,
"loss": 63.3412,
"step": 152
},
{
"epoch": 0.5260741926874833,
"grad_norm": 6.806802749633789,
"learning_rate": 4.941890178889149e-05,
"loss": 63.2038,
"step": 154
},
{
"epoch": 0.5329063250600481,
"grad_norm": 8.012312889099121,
"learning_rate": 4.939911656668361e-05,
"loss": 63.4725,
"step": 156
},
{
"epoch": 0.5397384574326127,
"grad_norm": 6.68613338470459,
"learning_rate": 4.937900423073585e-05,
"loss": 62.8267,
"step": 158
},
{
"epoch": 0.5465705898051775,
"grad_norm": 6.391062259674072,
"learning_rate": 4.9358565050689985e-05,
"loss": 63.4099,
"step": 160
},
{
"epoch": 0.5534027221777422,
"grad_norm": 6.4117817878723145,
"learning_rate": 4.933779930056975e-05,
"loss": 62.475,
"step": 162
},
{
"epoch": 0.5602348545503069,
"grad_norm": 10.238900184631348,
"learning_rate": 4.93167072587771e-05,
"loss": 62.3929,
"step": 164
},
{
"epoch": 0.5670669869228716,
"grad_norm": 6.800478935241699,
"learning_rate": 4.929528920808854e-05,
"loss": 63.4465,
"step": 166
},
{
"epoch": 0.5738991192954364,
"grad_norm": 6.688059329986572,
"learning_rate": 4.92735454356513e-05,
"loss": 62.3017,
"step": 168
},
{
"epoch": 0.5807312516680011,
"grad_norm": 5.010741710662842,
"learning_rate": 4.925147623297949e-05,
"loss": 61.5306,
"step": 170
},
{
"epoch": 0.5875633840405657,
"grad_norm": 6.061219215393066,
"learning_rate": 4.922908189595018e-05,
"loss": 63.5529,
"step": 172
},
{
"epoch": 0.5943955164131305,
"grad_norm": 7.6835126876831055,
"learning_rate": 4.920636272479946e-05,
"loss": 64.4077,
"step": 174
},
{
"epoch": 0.6012276487856952,
"grad_norm": 5.945671558380127,
"learning_rate": 4.9183319024118415e-05,
"loss": 64.3411,
"step": 176
},
{
"epoch": 0.60805978115826,
"grad_norm": 4.983694076538086,
"learning_rate": 4.915995110284901e-05,
"loss": 63.5529,
"step": 178
},
{
"epoch": 0.6148919135308246,
"grad_norm": 5.736062049865723,
"learning_rate": 4.9136259274279955e-05,
"loss": 63.7282,
"step": 180
},
{
"epoch": 0.6217240459033894,
"grad_norm": 6.8453545570373535,
"learning_rate": 4.911224385604255e-05,
"loss": 63.5027,
"step": 182
},
{
"epoch": 0.6285561782759541,
"grad_norm": 5.9253668785095215,
"learning_rate": 4.908790517010636e-05,
"loss": 60.5142,
"step": 184
},
{
"epoch": 0.6353883106485189,
"grad_norm": 5.743585586547852,
"learning_rate": 4.906324354277495e-05,
"loss": 62.4935,
"step": 186
},
{
"epoch": 0.6422204430210835,
"grad_norm": 4.686921119689941,
"learning_rate": 4.903825930468149e-05,
"loss": 60.8045,
"step": 188
},
{
"epoch": 0.6490525753936482,
"grad_norm": 5.350888729095459,
"learning_rate": 4.901295279078431e-05,
"loss": 62.3775,
"step": 190
},
{
"epoch": 0.655884707766213,
"grad_norm": 5.417562961578369,
"learning_rate": 4.898732434036244e-05,
"loss": 60.1095,
"step": 192
},
{
"epoch": 0.6627168401387777,
"grad_norm": 5.238453388214111,
"learning_rate": 4.896137429701102e-05,
"loss": 62.8943,
"step": 194
},
{
"epoch": 0.6695489725113424,
"grad_norm": 6.252527713775635,
"learning_rate": 4.893510300863676e-05,
"loss": 61.1666,
"step": 196
},
{
"epoch": 0.6763811048839071,
"grad_norm": 5.860842704772949,
"learning_rate": 4.890851082745319e-05,
"loss": 62.6643,
"step": 198
},
{
"epoch": 0.6832132372564719,
"grad_norm": 6.3946099281311035,
"learning_rate": 4.8881598109976004e-05,
"loss": 61.939,
"step": 200
},
{
"epoch": 0.6832132372564719,
"eval_loss": 0.9664058685302734,
"eval_runtime": 119.3157,
"eval_samples_per_second": 33.064,
"eval_steps_per_second": 8.272,
"step": 200
},
{
"epoch": 0.6900453696290365,
"grad_norm": 5.909948825836182,
"learning_rate": 4.885436521701824e-05,
"loss": 63.9172,
"step": 202
},
{
"epoch": 0.6968775020016013,
"grad_norm": 6.600235462188721,
"learning_rate": 4.8826812513685487e-05,
"loss": 60.6396,
"step": 204
},
{
"epoch": 0.703709634374166,
"grad_norm": 5.97224235534668,
"learning_rate": 4.8798940369370944e-05,
"loss": 61.1365,
"step": 206
},
{
"epoch": 0.7105417667467308,
"grad_norm": 5.521954536437988,
"learning_rate": 4.877074915775049e-05,
"loss": 61.9178,
"step": 208
},
{
"epoch": 0.7173738991192954,
"grad_norm": 4.756962299346924,
"learning_rate": 4.8742239256777674e-05,
"loss": 60.0003,
"step": 210
},
{
"epoch": 0.7242060314918601,
"grad_norm": 7.966216564178467,
"learning_rate": 4.8713411048678635e-05,
"loss": 60.3937,
"step": 212
},
{
"epoch": 0.7310381638644249,
"grad_norm": 5.864863872528076,
"learning_rate": 4.868426491994702e-05,
"loss": 60.5208,
"step": 214
},
{
"epoch": 0.7378702962369896,
"grad_norm": 4.952422142028809,
"learning_rate": 4.865480126133872e-05,
"loss": 61.4458,
"step": 216
},
{
"epoch": 0.7447024286095543,
"grad_norm": 4.522135257720947,
"learning_rate": 4.862502046786671e-05,
"loss": 62.5035,
"step": 218
},
{
"epoch": 0.751534560982119,
"grad_norm": 4.29464054107666,
"learning_rate": 4.859492293879574e-05,
"loss": 61.5825,
"step": 220
},
{
"epoch": 0.7583666933546838,
"grad_norm": 5.789974212646484,
"learning_rate": 4.856450907763693e-05,
"loss": 59.9352,
"step": 222
},
{
"epoch": 0.7651988257272485,
"grad_norm": 6.44216251373291,
"learning_rate": 4.853377929214243e-05,
"loss": 59.1637,
"step": 224
},
{
"epoch": 0.7720309580998131,
"grad_norm": 4.520390033721924,
"learning_rate": 4.85027339942999e-05,
"loss": 60.4813,
"step": 226
},
{
"epoch": 0.7788630904723779,
"grad_norm": 6.058870315551758,
"learning_rate": 4.8471373600326996e-05,
"loss": 60.2968,
"step": 228
},
{
"epoch": 0.7856952228449426,
"grad_norm": 5.945502281188965,
"learning_rate": 4.843969853066584e-05,
"loss": 58.2098,
"step": 230
},
{
"epoch": 0.7925273552175074,
"grad_norm": 4.318876266479492,
"learning_rate": 4.8407709209977305e-05,
"loss": 58.4711,
"step": 232
},
{
"epoch": 0.799359487590072,
"grad_norm": 5.385821342468262,
"learning_rate": 4.837540606713538e-05,
"loss": 59.5379,
"step": 234
},
{
"epoch": 0.8061916199626368,
"grad_norm": 6.59214973449707,
"learning_rate": 4.834278953522138e-05,
"loss": 58.4163,
"step": 236
},
{
"epoch": 0.8130237523352015,
"grad_norm": 5.087238311767578,
"learning_rate": 4.8309860051518204e-05,
"loss": 60.5546,
"step": 238
},
{
"epoch": 0.8198558847077662,
"grad_norm": 6.804642200469971,
"learning_rate": 4.8276618057504376e-05,
"loss": 59.0874,
"step": 240
},
{
"epoch": 0.8266880170803309,
"grad_norm": 5.035391330718994,
"learning_rate": 4.824306399884822e-05,
"loss": 59.9545,
"step": 242
},
{
"epoch": 0.8335201494528957,
"grad_norm": 5.837290287017822,
"learning_rate": 4.8209198325401815e-05,
"loss": 59.5963,
"step": 244
},
{
"epoch": 0.8403522818254604,
"grad_norm": 4.17293643951416,
"learning_rate": 4.817502149119502e-05,
"loss": 59.7065,
"step": 246
},
{
"epoch": 0.847184414198025,
"grad_norm": 4.964944362640381,
"learning_rate": 4.8140533954429327e-05,
"loss": 59.5358,
"step": 248
},
{
"epoch": 0.8540165465705898,
"grad_norm": 6.021297931671143,
"learning_rate": 4.810573617747178e-05,
"loss": 60.6391,
"step": 250
},
{
"epoch": 0.8540165465705898,
"eval_loss": 0.9407148361206055,
"eval_runtime": 119.9595,
"eval_samples_per_second": 32.886,
"eval_steps_per_second": 8.228,
"step": 250
},
{
"epoch": 0.8608486789431545,
"grad_norm": 5.707021713256836,
"learning_rate": 4.8070628626848735e-05,
"loss": 61.5872,
"step": 252
},
{
"epoch": 0.8676808113157193,
"grad_norm": 4.725375652313232,
"learning_rate": 4.803521177323962e-05,
"loss": 59.2192,
"step": 254
},
{
"epoch": 0.8745129436882839,
"grad_norm": 23.445714950561523,
"learning_rate": 4.799948609147061e-05,
"loss": 60.1762,
"step": 256
},
{
"epoch": 0.8813450760608487,
"grad_norm": 5.503020286560059,
"learning_rate": 4.796345206050829e-05,
"loss": 62.2226,
"step": 258
},
{
"epoch": 0.8881772084334134,
"grad_norm": 6.558228015899658,
"learning_rate": 4.792711016345321e-05,
"loss": 62.089,
"step": 260
},
{
"epoch": 0.8950093408059782,
"grad_norm": 8.109895706176758,
"learning_rate": 4.7890460887533417e-05,
"loss": 60.7872,
"step": 262
},
{
"epoch": 0.9018414731785428,
"grad_norm": 5.230234622955322,
"learning_rate": 4.785350472409792e-05,
"loss": 57.9312,
"step": 264
},
{
"epoch": 0.9086736055511075,
"grad_norm": 6.669562339782715,
"learning_rate": 4.7816242168610093e-05,
"loss": 61.7966,
"step": 266
},
{
"epoch": 0.9155057379236723,
"grad_norm": 5.428192615509033,
"learning_rate": 4.777867372064105e-05,
"loss": 58.4551,
"step": 268
},
{
"epoch": 0.922337870296237,
"grad_norm": 5.6168131828308105,
"learning_rate": 4.774079988386296e-05,
"loss": 59.9015,
"step": 270
},
{
"epoch": 0.9291700026688017,
"grad_norm": 5.785460948944092,
"learning_rate": 4.770262116604224e-05,
"loss": 59.723,
"step": 272
},
{
"epoch": 0.9360021350413664,
"grad_norm": 8.77035140991211,
"learning_rate": 4.76641380790328e-05,
"loss": 60.8996,
"step": 274
},
{
"epoch": 0.9428342674139312,
"grad_norm": 4.000178813934326,
"learning_rate": 4.762535113876917e-05,
"loss": 59.2908,
"step": 276
},
{
"epoch": 0.9496663997864959,
"grad_norm": 5.8565826416015625,
"learning_rate": 4.758626086525956e-05,
"loss": 59.296,
"step": 278
},
{
"epoch": 0.9564985321590606,
"grad_norm": 6.792466163635254,
"learning_rate": 4.754686778257891e-05,
"loss": 58.351,
"step": 280
},
{
"epoch": 0.9633306645316253,
"grad_norm": 6.484628677368164,
"learning_rate": 4.750717241886185e-05,
"loss": 58.46,
"step": 282
},
{
"epoch": 0.97016279690419,
"grad_norm": 5.421430587768555,
"learning_rate": 4.7467175306295655e-05,
"loss": 59.0205,
"step": 284
},
{
"epoch": 0.9769949292767547,
"grad_norm": 4.550335884094238,
"learning_rate": 4.7426876981113044e-05,
"loss": 60.8234,
"step": 286
},
{
"epoch": 0.9838270616493194,
"grad_norm": 5.412383079528809,
"learning_rate": 4.738627798358506e-05,
"loss": 57.3651,
"step": 288
},
{
"epoch": 0.9906591940218842,
"grad_norm": 5.225856781005859,
"learning_rate": 4.7345378858013776e-05,
"loss": 58.8522,
"step": 290
},
{
"epoch": 0.9974913263944489,
"grad_norm": 3.856189250946045,
"learning_rate": 4.730418015272503e-05,
"loss": 59.7945,
"step": 292
},
{
"epoch": 1.0034160661862823,
"grad_norm": 6.19010066986084,
"learning_rate": 4.726268242006106e-05,
"loss": 50.2722,
"step": 294
},
{
"epoch": 1.0102481985588472,
"grad_norm": 5.333181858062744,
"learning_rate": 4.722088621637309e-05,
"loss": 58.7285,
"step": 296
},
{
"epoch": 1.0170803309314118,
"grad_norm": 5.93973970413208,
"learning_rate": 4.717879210201389e-05,
"loss": 57.2823,
"step": 298
},
{
"epoch": 1.0239124633039765,
"grad_norm": 4.59360408782959,
"learning_rate": 4.713640064133025e-05,
"loss": 58.4687,
"step": 300
},
{
"epoch": 1.0239124633039765,
"eval_loss": 0.9195547699928284,
"eval_runtime": 119.3076,
"eval_samples_per_second": 33.066,
"eval_steps_per_second": 8.273,
"step": 300
},
{
"epoch": 1.0307445956765413,
"grad_norm": 5.437332630157471,
"learning_rate": 4.7093712402655427e-05,
"loss": 57.7491,
"step": 302
},
{
"epoch": 1.037576728049106,
"grad_norm": 4.938009738922119,
"learning_rate": 4.7050727958301506e-05,
"loss": 58.2642,
"step": 304
},
{
"epoch": 1.0444088604216706,
"grad_norm": 5.104777812957764,
"learning_rate": 4.7007447884551745e-05,
"loss": 56.1312,
"step": 306
},
{
"epoch": 1.0512409927942354,
"grad_norm": 5.78248405456543,
"learning_rate": 4.6963872761652835e-05,
"loss": 56.9488,
"step": 308
},
{
"epoch": 1.0580731251668,
"grad_norm": 4.8224287033081055,
"learning_rate": 4.692000317380715e-05,
"loss": 56.6993,
"step": 310
},
{
"epoch": 1.064905257539365,
"grad_norm": 4.517540454864502,
"learning_rate": 4.687583970916487e-05,
"loss": 58.8636,
"step": 312
},
{
"epoch": 1.0717373899119296,
"grad_norm": 5.353949069976807,
"learning_rate": 4.683138295981611e-05,
"loss": 58.6762,
"step": 314
},
{
"epoch": 1.0785695222844942,
"grad_norm": 6.164919376373291,
"learning_rate": 4.678663352178301e-05,
"loss": 57.9218,
"step": 316
},
{
"epoch": 1.085401654657059,
"grad_norm": 4.577470302581787,
"learning_rate": 4.674159199501173e-05,
"loss": 58.1644,
"step": 318
},
{
"epoch": 1.0922337870296237,
"grad_norm": 6.5861592292785645,
"learning_rate": 4.6696258983364385e-05,
"loss": 57.3447,
"step": 320
},
{
"epoch": 1.0990659194021883,
"grad_norm": 4.327467918395996,
"learning_rate": 4.665063509461097e-05,
"loss": 57.2627,
"step": 322
},
{
"epoch": 1.1058980517747532,
"grad_norm": 7.534716606140137,
"learning_rate": 4.660472094042121e-05,
"loss": 57.2099,
"step": 324
},
{
"epoch": 1.1127301841473178,
"grad_norm": 5.549008369445801,
"learning_rate": 4.655851713635635e-05,
"loss": 58.4564,
"step": 326
},
{
"epoch": 1.1195623165198825,
"grad_norm": 4.385070323944092,
"learning_rate": 4.651202430186092e-05,
"loss": 57.0019,
"step": 328
},
{
"epoch": 1.1263944488924473,
"grad_norm": 4.763044357299805,
"learning_rate": 4.6465243060254415e-05,
"loss": 55.7849,
"step": 330
},
{
"epoch": 1.133226581265012,
"grad_norm": 3.9461379051208496,
"learning_rate": 4.641817403872293e-05,
"loss": 56.2399,
"step": 332
},
{
"epoch": 1.1400587136375768,
"grad_norm": 4.946137428283691,
"learning_rate": 4.637081786831079e-05,
"loss": 56.7089,
"step": 334
},
{
"epoch": 1.1468908460101415,
"grad_norm": 5.664731025695801,
"learning_rate": 4.6323175183912024e-05,
"loss": 57.1022,
"step": 336
},
{
"epoch": 1.153722978382706,
"grad_norm": 5.261230945587158,
"learning_rate": 4.627524662426194e-05,
"loss": 56.3552,
"step": 338
},
{
"epoch": 1.160555110755271,
"grad_norm": 4.166741847991943,
"learning_rate": 4.6227032831928484e-05,
"loss": 56.888,
"step": 340
},
{
"epoch": 1.1673872431278356,
"grad_norm": 6.015218734741211,
"learning_rate": 4.6178534453303666e-05,
"loss": 57.3006,
"step": 342
},
{
"epoch": 1.1742193755004002,
"grad_norm": 6.349710941314697,
"learning_rate": 4.6129752138594874e-05,
"loss": 57.0208,
"step": 344
},
{
"epoch": 1.181051507872965,
"grad_norm": 5.403022766113281,
"learning_rate": 4.608068654181617e-05,
"loss": 57.0645,
"step": 346
},
{
"epoch": 1.1878836402455297,
"grad_norm": 6.523670673370361,
"learning_rate": 4.6031338320779534e-05,
"loss": 58.2164,
"step": 348
},
{
"epoch": 1.1947157726180944,
"grad_norm": 6.369359970092773,
"learning_rate": 4.5981708137086e-05,
"loss": 56.7965,
"step": 350
},
{
"epoch": 1.1947157726180944,
"eval_loss": 0.8986765146255493,
"eval_runtime": 119.0222,
"eval_samples_per_second": 33.145,
"eval_steps_per_second": 8.293,
"step": 350
},
{
"epoch": 1.2015479049906592,
"grad_norm": 5.050749778747559,
"learning_rate": 4.5931796656116846e-05,
"loss": 56.7828,
"step": 352
},
{
"epoch": 1.2083800373632239,
"grad_norm": 5.341484069824219,
"learning_rate": 4.588160454702462e-05,
"loss": 57.4058,
"step": 354
},
{
"epoch": 1.2152121697357887,
"grad_norm": 4.554074287414551,
"learning_rate": 4.5831132482724195e-05,
"loss": 57.6257,
"step": 356
},
{
"epoch": 1.2220443021083534,
"grad_norm": 4.951889514923096,
"learning_rate": 4.578038113988376e-05,
"loss": 56.0608,
"step": 358
},
{
"epoch": 1.228876434480918,
"grad_norm": 4.2526421546936035,
"learning_rate": 4.572935119891571e-05,
"loss": 55.8586,
"step": 360
},
{
"epoch": 1.2357085668534828,
"grad_norm": 4.805353164672852,
"learning_rate": 4.5678043343967554e-05,
"loss": 59.2427,
"step": 362
},
{
"epoch": 1.2425406992260475,
"grad_norm": 4.9927978515625,
"learning_rate": 4.5626458262912745e-05,
"loss": 55.1494,
"step": 364
},
{
"epoch": 1.2493728315986123,
"grad_norm": 5.778275012969971,
"learning_rate": 4.557459664734141e-05,
"loss": 55.9791,
"step": 366
},
{
"epoch": 1.256204963971177,
"grad_norm": 4.41555643081665,
"learning_rate": 4.552245919255117e-05,
"loss": 57.3123,
"step": 368
},
{
"epoch": 1.2630370963437416,
"grad_norm": 5.230330944061279,
"learning_rate": 4.5470046597537735e-05,
"loss": 55.9031,
"step": 370
},
{
"epoch": 1.2698692287163063,
"grad_norm": 3.9548189640045166,
"learning_rate": 4.541735956498554e-05,
"loss": 56.6997,
"step": 372
},
{
"epoch": 1.2767013610888711,
"grad_norm": 5.017361640930176,
"learning_rate": 4.5364398801258396e-05,
"loss": 57.3268,
"step": 374
},
{
"epoch": 1.2835334934614357,
"grad_norm": 5.562941074371338,
"learning_rate": 4.5311165016389916e-05,
"loss": 55.6271,
"step": 376
},
{
"epoch": 1.2903656258340006,
"grad_norm": 6.675297737121582,
"learning_rate": 4.525765892407409e-05,
"loss": 55.9593,
"step": 378
},
{
"epoch": 1.2971977582065652,
"grad_norm": 6.47582483291626,
"learning_rate": 4.5203881241655644e-05,
"loss": 57.0788,
"step": 380
},
{
"epoch": 1.3040298905791299,
"grad_norm": 5.157675743103027,
"learning_rate": 4.514983269012049e-05,
"loss": 56.3623,
"step": 382
},
{
"epoch": 1.3108620229516947,
"grad_norm": 8.075702667236328,
"learning_rate": 4.509551399408598e-05,
"loss": 55.6531,
"step": 384
},
{
"epoch": 1.3176941553242594,
"grad_norm": 3.849310874938965,
"learning_rate": 4.504092588179128e-05,
"loss": 58.7546,
"step": 386
},
{
"epoch": 1.3245262876968242,
"grad_norm": 3.6027579307556152,
"learning_rate": 4.498606908508754e-05,
"loss": 57.7153,
"step": 388
},
{
"epoch": 1.3313584200693889,
"grad_norm": 5.139729976654053,
"learning_rate": 4.4930944339428085e-05,
"loss": 56.4532,
"step": 390
},
{
"epoch": 1.3381905524419535,
"grad_norm": 5.337704181671143,
"learning_rate": 4.487555238385862e-05,
"loss": 54.2958,
"step": 392
},
{
"epoch": 1.3450226848145181,
"grad_norm": 3.3229618072509766,
"learning_rate": 4.481989396100724e-05,
"loss": 54.2046,
"step": 394
},
{
"epoch": 1.351854817187083,
"grad_norm": 5.2183074951171875,
"learning_rate": 4.476396981707453e-05,
"loss": 56.0147,
"step": 396
},
{
"epoch": 1.3586869495596476,
"grad_norm": 5.028941631317139,
"learning_rate": 4.470778070182353e-05,
"loss": 54.3446,
"step": 398
},
{
"epoch": 1.3655190819322125,
"grad_norm": 6.347212791442871,
"learning_rate": 4.465132736856969e-05,
"loss": 56.7659,
"step": 400
},
{
"epoch": 1.3655190819322125,
"eval_loss": 0.8771227598190308,
"eval_runtime": 118.9477,
"eval_samples_per_second": 33.166,
"eval_steps_per_second": 8.298,
"step": 400
},
{
"epoch": 1.3723512143047771,
"grad_norm": 9.381309509277344,
"learning_rate": 4.459461057417078e-05,
"loss": 56.8099,
"step": 402
},
{
"epoch": 1.3791833466773418,
"grad_norm": 5.657813549041748,
"learning_rate": 4.453763107901675e-05,
"loss": 56.3326,
"step": 404
},
{
"epoch": 1.3860154790499066,
"grad_norm": 4.476396083831787,
"learning_rate": 4.4480389647019505e-05,
"loss": 57.3978,
"step": 406
},
{
"epoch": 1.3928476114224713,
"grad_norm": 5.402798652648926,
"learning_rate": 4.442288704560268e-05,
"loss": 55.7143,
"step": 408
},
{
"epoch": 1.3996797437950361,
"grad_norm": 4.367002010345459,
"learning_rate": 4.436512404569136e-05,
"loss": 55.7044,
"step": 410
},
{
"epoch": 1.4065118761676008,
"grad_norm": 5.653073310852051,
"learning_rate": 4.430710142170176e-05,
"loss": 55.7266,
"step": 412
},
{
"epoch": 1.4133440085401654,
"grad_norm": 7.221829414367676,
"learning_rate": 4.424881995153076e-05,
"loss": 56.4174,
"step": 414
},
{
"epoch": 1.4201761409127303,
"grad_norm": 5.465057373046875,
"learning_rate": 4.419028041654559e-05,
"loss": 56.9093,
"step": 416
},
{
"epoch": 1.427008273285295,
"grad_norm": 8.383552551269531,
"learning_rate": 4.4131483601573285e-05,
"loss": 56.0841,
"step": 418
},
{
"epoch": 1.4338404056578598,
"grad_norm": 4.208652973175049,
"learning_rate": 4.4072430294890174e-05,
"loss": 57.5786,
"step": 420
},
{
"epoch": 1.4406725380304244,
"grad_norm": 5.773376941680908,
"learning_rate": 4.4013121288211307e-05,
"loss": 55.8851,
"step": 422
},
{
"epoch": 1.447504670402989,
"grad_norm": 5.354812145233154,
"learning_rate": 4.3953557376679856e-05,
"loss": 55.1571,
"step": 424
},
{
"epoch": 1.4543368027755537,
"grad_norm": 4.6360039710998535,
"learning_rate": 4.389373935885646e-05,
"loss": 54.0095,
"step": 426
},
{
"epoch": 1.4611689351481185,
"grad_norm": 7.125521183013916,
"learning_rate": 4.383366803670849e-05,
"loss": 56.645,
"step": 428
},
{
"epoch": 1.4680010675206832,
"grad_norm": 6.071737766265869,
"learning_rate": 4.377334421559932e-05,
"loss": 55.3209,
"step": 430
},
{
"epoch": 1.474833199893248,
"grad_norm": 4.569766998291016,
"learning_rate": 4.371276870427753e-05,
"loss": 54.6604,
"step": 432
},
{
"epoch": 1.4816653322658127,
"grad_norm": 5.426764965057373,
"learning_rate": 4.365194231486604e-05,
"loss": 56.4116,
"step": 434
},
{
"epoch": 1.4884974646383773,
"grad_norm": 5.6092023849487305,
"learning_rate": 4.359086586285127e-05,
"loss": 56.0268,
"step": 436
},
{
"epoch": 1.4953295970109421,
"grad_norm": 6.140939712524414,
"learning_rate": 4.3529540167072126e-05,
"loss": 54.886,
"step": 438
},
{
"epoch": 1.5021617293835068,
"grad_norm": 4.043739318847656,
"learning_rate": 4.346796604970912e-05,
"loss": 56.6431,
"step": 440
},
{
"epoch": 1.5089938617560716,
"grad_norm": 3.8898212909698486,
"learning_rate": 4.340614433627328e-05,
"loss": 55.6492,
"step": 442
},
{
"epoch": 1.5158259941286363,
"grad_norm": 6.158950328826904,
"learning_rate": 4.3344075855595104e-05,
"loss": 55.6869,
"step": 444
},
{
"epoch": 1.522658126501201,
"grad_norm": 3.874180316925049,
"learning_rate": 4.328176143981343e-05,
"loss": 53.7981,
"step": 446
},
{
"epoch": 1.5294902588737656,
"grad_norm": 4.068581581115723,
"learning_rate": 4.321920192436433e-05,
"loss": 54.6618,
"step": 448
},
{
"epoch": 1.5363223912463304,
"grad_norm": 4.552149295806885,
"learning_rate": 4.315639814796983e-05,
"loss": 55.1642,
"step": 450
},
{
"epoch": 1.5363223912463304,
"eval_loss": 0.8704175353050232,
"eval_runtime": 119.5049,
"eval_samples_per_second": 33.011,
"eval_steps_per_second": 8.259,
"step": 450
},
{
"epoch": 1.5431545236188953,
"grad_norm": 4.1831374168396,
"learning_rate": 4.309335095262676e-05,
"loss": 53.2926,
"step": 452
},
{
"epoch": 1.54998665599146,
"grad_norm": 4.456052780151367,
"learning_rate": 4.303006118359537e-05,
"loss": 53.6038,
"step": 454
},
{
"epoch": 1.5568187883640245,
"grad_norm": 17.7099609375,
"learning_rate": 4.296652968938807e-05,
"loss": 54.9325,
"step": 456
},
{
"epoch": 1.5636509207365892,
"grad_norm": 8.005233764648438,
"learning_rate": 4.2902757321758016e-05,
"loss": 53.7884,
"step": 458
},
{
"epoch": 1.570483053109154,
"grad_norm": 5.034004211425781,
"learning_rate": 4.283874493568772e-05,
"loss": 53.2575,
"step": 460
},
{
"epoch": 1.5773151854817187,
"grad_norm": 4.005930423736572,
"learning_rate": 4.2774493389377545e-05,
"loss": 55.4554,
"step": 462
},
{
"epoch": 1.5841473178542835,
"grad_norm": 5.812296390533447,
"learning_rate": 4.271000354423426e-05,
"loss": 56.7008,
"step": 464
},
{
"epoch": 1.5909794502268482,
"grad_norm": 6.425695896148682,
"learning_rate": 4.2645276264859394e-05,
"loss": 56.8804,
"step": 466
},
{
"epoch": 1.5978115825994128,
"grad_norm": 4.44102144241333,
"learning_rate": 4.258031241903778e-05,
"loss": 54.2011,
"step": 468
},
{
"epoch": 1.6046437149719774,
"grad_norm": 4.444553852081299,
"learning_rate": 4.251511287772579e-05,
"loss": 54.9826,
"step": 470
},
{
"epoch": 1.6114758473445423,
"grad_norm": 3.8157808780670166,
"learning_rate": 4.2449678515039747e-05,
"loss": 55.2601,
"step": 472
},
{
"epoch": 1.6183079797171072,
"grad_norm": 6.47904634475708,
"learning_rate": 4.238401020824416e-05,
"loss": 54.5978,
"step": 474
},
{
"epoch": 1.6251401120896718,
"grad_norm": 5.010526180267334,
"learning_rate": 4.231810883773999e-05,
"loss": 56.0995,
"step": 476
},
{
"epoch": 1.6319722444622364,
"grad_norm": 5.843505382537842,
"learning_rate": 4.2251975287052804e-05,
"loss": 54.0241,
"step": 478
},
{
"epoch": 1.638804376834801,
"grad_norm": 4.549996852874756,
"learning_rate": 4.218561044282099e-05,
"loss": 56.3071,
"step": 480
},
{
"epoch": 1.645636509207366,
"grad_norm": 4.20985221862793,
"learning_rate": 4.211901519478382e-05,
"loss": 54.3977,
"step": 482
},
{
"epoch": 1.6524686415799306,
"grad_norm": 5.491010665893555,
"learning_rate": 4.2052190435769554e-05,
"loss": 53.1375,
"step": 484
},
{
"epoch": 1.6593007739524954,
"grad_norm": 4.417302131652832,
"learning_rate": 4.198513706168345e-05,
"loss": 53.959,
"step": 486
},
{
"epoch": 1.66613290632506,
"grad_norm": 5.39029598236084,
"learning_rate": 4.191785597149577e-05,
"loss": 54.5638,
"step": 488
},
{
"epoch": 1.6729650386976247,
"grad_norm": 4.233526229858398,
"learning_rate": 4.1850348067229696e-05,
"loss": 54.6384,
"step": 490
},
{
"epoch": 1.6797971710701893,
"grad_norm": 6.301634311676025,
"learning_rate": 4.178261425394926e-05,
"loss": 55.1738,
"step": 492
},
{
"epoch": 1.6866293034427542,
"grad_norm": 5.9507246017456055,
"learning_rate": 4.171465543974723e-05,
"loss": 54.7009,
"step": 494
},
{
"epoch": 1.693461435815319,
"grad_norm": 5.033243656158447,
"learning_rate": 4.1646472535732895e-05,
"loss": 54.3154,
"step": 496
},
{
"epoch": 1.7002935681878837,
"grad_norm": 4.675721168518066,
"learning_rate": 4.157806645601988e-05,
"loss": 54.1507,
"step": 498
},
{
"epoch": 1.7071257005604483,
"grad_norm": 3.5945537090301514,
"learning_rate": 4.1509438117713866e-05,
"loss": 52.2103,
"step": 500
},
{
"epoch": 1.7071257005604483,
"eval_loss": 0.8516557216644287,
"eval_runtime": 119.4754,
"eval_samples_per_second": 33.019,
"eval_steps_per_second": 8.261,
"step": 500
},
{
"epoch": 1.713957832933013,
"grad_norm": 4.187085151672363,
"learning_rate": 4.144058844090032e-05,
"loss": 54.1474,
"step": 502
},
{
"epoch": 1.7207899653055778,
"grad_norm": 3.818648099899292,
"learning_rate": 4.137151834863213e-05,
"loss": 55.5711,
"step": 504
},
{
"epoch": 1.7276220976781427,
"grad_norm": 5.919620513916016,
"learning_rate": 4.130222876691726e-05,
"loss": 54.3803,
"step": 506
},
{
"epoch": 1.7344542300507073,
"grad_norm": 5.772305011749268,
"learning_rate": 4.123272062470633e-05,
"loss": 53.9454,
"step": 508
},
{
"epoch": 1.741286362423272,
"grad_norm": 4.569563865661621,
"learning_rate": 4.116299485388014e-05,
"loss": 53.5009,
"step": 510
},
{
"epoch": 1.7481184947958366,
"grad_norm": 4.183293342590332,
"learning_rate": 4.109305238923718e-05,
"loss": 52.9927,
"step": 512
},
{
"epoch": 1.7549506271684012,
"grad_norm": 4.4316301345825195,
"learning_rate": 4.102289416848114e-05,
"loss": 54.5023,
"step": 514
},
{
"epoch": 1.761782759540966,
"grad_norm": 14.234251976013184,
"learning_rate": 4.095252113220827e-05,
"loss": 53.1473,
"step": 516
},
{
"epoch": 1.768614891913531,
"grad_norm": 4.889795780181885,
"learning_rate": 4.088193422389484e-05,
"loss": 53.7265,
"step": 518
},
{
"epoch": 1.7754470242860956,
"grad_norm": 3.02785325050354,
"learning_rate": 4.0811134389884433e-05,
"loss": 52.5917,
"step": 520
},
{
"epoch": 1.7822791566586602,
"grad_norm": 5.794788360595703,
"learning_rate": 4.0740122579375286e-05,
"loss": 55.4619,
"step": 522
},
{
"epoch": 1.7891112890312248,
"grad_norm": 4.442338466644287,
"learning_rate": 4.066889974440757e-05,
"loss": 53.7709,
"step": 524
},
{
"epoch": 1.7959434214037897,
"grad_norm": 4.7714715003967285,
"learning_rate": 4.0597466839850595e-05,
"loss": 54.16,
"step": 526
},
{
"epoch": 1.8027755537763546,
"grad_norm": 4.7263569831848145,
"learning_rate": 4.0525824823390045e-05,
"loss": 55.9749,
"step": 528
},
{
"epoch": 1.8096076861489192,
"grad_norm": 4.258271217346191,
"learning_rate": 4.045397465551513e-05,
"loss": 52.5445,
"step": 530
},
{
"epoch": 1.8164398185214838,
"grad_norm": 4.56829309463501,
"learning_rate": 4.038191729950569e-05,
"loss": 53.8703,
"step": 532
},
{
"epoch": 1.8232719508940485,
"grad_norm": 8.888167381286621,
"learning_rate": 4.030965372141927e-05,
"loss": 52.7209,
"step": 534
},
{
"epoch": 1.8301040832666133,
"grad_norm": 4.5087175369262695,
"learning_rate": 4.0237184890078245e-05,
"loss": 54.591,
"step": 536
},
{
"epoch": 1.836936215639178,
"grad_norm": 4.460638523101807,
"learning_rate": 4.0164511777056725e-05,
"loss": 54.8662,
"step": 538
},
{
"epoch": 1.8437683480117428,
"grad_norm": 3.5958664417266846,
"learning_rate": 4.009163535666761e-05,
"loss": 53.423,
"step": 540
},
{
"epoch": 1.8506004803843075,
"grad_norm": 4.3935418128967285,
"learning_rate": 4.001855660594948e-05,
"loss": 53.9048,
"step": 542
},
{
"epoch": 1.857432612756872,
"grad_norm": 5.473939895629883,
"learning_rate": 3.994527650465352e-05,
"loss": 52.9295,
"step": 544
},
{
"epoch": 1.8642647451294367,
"grad_norm": 4.8625922203063965,
"learning_rate": 3.98717960352304e-05,
"loss": 51.8002,
"step": 546
},
{
"epoch": 1.8710968775020016,
"grad_norm": 4.244052886962891,
"learning_rate": 3.979811618281706e-05,
"loss": 53.6904,
"step": 548
},
{
"epoch": 1.8779290098745665,
"grad_norm": 4.050732612609863,
"learning_rate": 3.972423793522352e-05,
"loss": 54.7441,
"step": 550
},
{
"epoch": 1.8779290098745665,
"eval_loss": 0.8419561982154846,
"eval_runtime": 119.6757,
"eval_samples_per_second": 32.964,
"eval_steps_per_second": 8.247,
"step": 550
},
{
"epoch": 1.884761142247131,
"grad_norm": 5.255309104919434,
"learning_rate": 3.9650162282919655e-05,
"loss": 53.6842,
"step": 552
},
{
"epoch": 1.8915932746196957,
"grad_norm": 5.483623504638672,
"learning_rate": 3.957589021902191e-05,
"loss": 54.0004,
"step": 554
},
{
"epoch": 1.8984254069922604,
"grad_norm": 4.224212169647217,
"learning_rate": 3.9501422739279956e-05,
"loss": 51.7289,
"step": 556
},
{
"epoch": 1.9052575393648252,
"grad_norm": 5.061962127685547,
"learning_rate": 3.942676084206338e-05,
"loss": 53.4457,
"step": 558
},
{
"epoch": 1.9120896717373899,
"grad_norm": 3.8694398403167725,
"learning_rate": 3.9351905528348285e-05,
"loss": 51.8595,
"step": 560
},
{
"epoch": 1.9189218041099547,
"grad_norm": 4.149620056152344,
"learning_rate": 3.927685780170385e-05,
"loss": 51.8196,
"step": 562
},
{
"epoch": 1.9257539364825194,
"grad_norm": 6.877647399902344,
"learning_rate": 3.920161866827889e-05,
"loss": 52.7279,
"step": 564
},
{
"epoch": 1.932586068855084,
"grad_norm": 4.069815635681152,
"learning_rate": 3.9126189136788416e-05,
"loss": 51.1502,
"step": 566
},
{
"epoch": 1.9394182012276486,
"grad_norm": 6.629972457885742,
"learning_rate": 3.90505702185e-05,
"loss": 52.6793,
"step": 568
},
{
"epoch": 1.9462503336002135,
"grad_norm": 4.475677013397217,
"learning_rate": 3.897476292722034e-05,
"loss": 51.4329,
"step": 570
},
{
"epoch": 1.9530824659727783,
"grad_norm": 5.370522499084473,
"learning_rate": 3.889876827928156e-05,
"loss": 53.1101,
"step": 572
},
{
"epoch": 1.959914598345343,
"grad_norm": 5.481414794921875,
"learning_rate": 3.882258729352768e-05,
"loss": 53.3684,
"step": 574
},
{
"epoch": 1.9667467307179076,
"grad_norm": 6.393594741821289,
"learning_rate": 3.874622099130087e-05,
"loss": 52.7341,
"step": 576
},
{
"epoch": 1.9735788630904723,
"grad_norm": 3.9178807735443115,
"learning_rate": 3.866967039642784e-05,
"loss": 51.5249,
"step": 578
},
{
"epoch": 1.9804109954630371,
"grad_norm": 9.721770286560059,
"learning_rate": 3.859293653520604e-05,
"loss": 51.2705,
"step": 580
},
{
"epoch": 1.987243127835602,
"grad_norm": 4.619483470916748,
"learning_rate": 3.851602043638994e-05,
"loss": 51.7596,
"step": 582
},
{
"epoch": 1.9940752602081666,
"grad_norm": 4.899592399597168,
"learning_rate": 3.843892313117724e-05,
"loss": 54.7586,
"step": 584
},
{
"epoch": 2.0,
"grad_norm": 3.8423385620117188,
"learning_rate": 3.8361645653195026e-05,
"loss": 44.9497,
"step": 586
},
{
"epoch": 2.0068321323725646,
"grad_norm": 4.93556022644043,
"learning_rate": 3.8284189038485936e-05,
"loss": 53.1383,
"step": 588
},
{
"epoch": 2.0136642647451293,
"grad_norm": 6.575899124145508,
"learning_rate": 3.8206554325494225e-05,
"loss": 52.1373,
"step": 590
},
{
"epoch": 2.0204963971176944,
"grad_norm": 3.5134201049804688,
"learning_rate": 3.812874255505191e-05,
"loss": 50.8711,
"step": 592
},
{
"epoch": 2.027328529490259,
"grad_norm": 4.761475086212158,
"learning_rate": 3.805075477036476e-05,
"loss": 52.0756,
"step": 594
},
{
"epoch": 2.0341606618628236,
"grad_norm": 3.7381017208099365,
"learning_rate": 3.797259201699833e-05,
"loss": 51.0594,
"step": 596
},
{
"epoch": 2.0409927942353883,
"grad_norm": 5.102145671844482,
"learning_rate": 3.789425534286394e-05,
"loss": 52.1454,
"step": 598
},
{
"epoch": 2.047824926607953,
"grad_norm": 4.762547969818115,
"learning_rate": 3.781574579820464e-05,
"loss": 50.3373,
"step": 600
},
{
"epoch": 2.047824926607953,
"eval_loss": 0.8283991813659668,
"eval_runtime": 119.5704,
"eval_samples_per_second": 32.993,
"eval_steps_per_second": 8.255,
"step": 600
},
{
"epoch": 2.0546570589805175,
"grad_norm": 4.646745681762695,
"learning_rate": 3.773706443558111e-05,
"loss": 51.0792,
"step": 602
},
{
"epoch": 2.0614891913530826,
"grad_norm": 5.648324012756348,
"learning_rate": 3.765821230985758e-05,
"loss": 50.6017,
"step": 604
},
{
"epoch": 2.0683213237256473,
"grad_norm": 4.703359603881836,
"learning_rate": 3.75791904781876e-05,
"loss": 52.4212,
"step": 606
},
{
"epoch": 2.075153456098212,
"grad_norm": 4.082385540008545,
"learning_rate": 3.7500000000000003e-05,
"loss": 51.9666,
"step": 608
},
{
"epoch": 2.0819855884707765,
"grad_norm": 4.6461687088012695,
"learning_rate": 3.74206419369846e-05,
"loss": 51.6205,
"step": 610
},
{
"epoch": 2.088817720843341,
"grad_norm": 3.9972918033599854,
"learning_rate": 3.7341117353077966e-05,
"loss": 52.6521,
"step": 612
},
{
"epoch": 2.0956498532159062,
"grad_norm": 5.636791229248047,
"learning_rate": 3.726142731444921e-05,
"loss": 52.6811,
"step": 614
},
{
"epoch": 2.102481985588471,
"grad_norm": 6.055325508117676,
"learning_rate": 3.718157288948563e-05,
"loss": 51.2952,
"step": 616
},
{
"epoch": 2.1093141179610355,
"grad_norm": 5.317610740661621,
"learning_rate": 3.710155514877844e-05,
"loss": 52.4443,
"step": 618
},
{
"epoch": 2.1161462503336,
"grad_norm": 4.979522705078125,
"learning_rate": 3.702137516510838e-05,
"loss": 51.3593,
"step": 620
},
{
"epoch": 2.122978382706165,
"grad_norm": 7.410902500152588,
"learning_rate": 3.694103401343136e-05,
"loss": 51.5919,
"step": 622
},
{
"epoch": 2.12981051507873,
"grad_norm": 4.962103366851807,
"learning_rate": 3.686053277086401e-05,
"loss": 51.272,
"step": 624
},
{
"epoch": 2.1366426474512945,
"grad_norm": 4.0044426918029785,
"learning_rate": 3.6779872516669295e-05,
"loss": 51.6362,
"step": 626
},
{
"epoch": 2.143474779823859,
"grad_norm": 5.016703128814697,
"learning_rate": 3.669905433224199e-05,
"loss": 51.7369,
"step": 628
},
{
"epoch": 2.150306912196424,
"grad_norm": 4.700343132019043,
"learning_rate": 3.6618079301094216e-05,
"loss": 50.9454,
"step": 630
},
{
"epoch": 2.1571390445689884,
"grad_norm": 8.11246395111084,
"learning_rate": 3.653694850884091e-05,
"loss": 50.4605,
"step": 632
},
{
"epoch": 2.163971176941553,
"grad_norm": 3.8724536895751953,
"learning_rate": 3.645566304318526e-05,
"loss": 52.4849,
"step": 634
},
{
"epoch": 2.170803309314118,
"grad_norm": 3.699873208999634,
"learning_rate": 3.637422399390413e-05,
"loss": 49.8017,
"step": 636
},
{
"epoch": 2.1776354416866828,
"grad_norm": 4.757104873657227,
"learning_rate": 3.6292632452833436e-05,
"loss": 52.0966,
"step": 638
},
{
"epoch": 2.1844675740592474,
"grad_norm": 5.273576736450195,
"learning_rate": 3.621088951385353e-05,
"loss": 49.5201,
"step": 640
},
{
"epoch": 2.191299706431812,
"grad_norm": 4.152122497558594,
"learning_rate": 3.612899627287452e-05,
"loss": 51.121,
"step": 642
},
{
"epoch": 2.1981318388043767,
"grad_norm": 4.448339939117432,
"learning_rate": 3.604695382782159e-05,
"loss": 51.5833,
"step": 644
},
{
"epoch": 2.2049639711769418,
"grad_norm": 3.272676706314087,
"learning_rate": 3.596476327862024e-05,
"loss": 50.4036,
"step": 646
},
{
"epoch": 2.2117961035495064,
"grad_norm": 4.293691158294678,
"learning_rate": 3.588242572718162e-05,
"loss": 50.4138,
"step": 648
},
{
"epoch": 2.218628235922071,
"grad_norm": 6.384798049926758,
"learning_rate": 3.579994227738767e-05,
"loss": 49.0042,
"step": 650
},
{
"epoch": 2.218628235922071,
"eval_loss": 0.8110712170600891,
"eval_runtime": 119.0744,
"eval_samples_per_second": 33.131,
"eval_steps_per_second": 8.289,
"step": 650
},
{
"epoch": 2.2254603682946357,
"grad_norm": 4.501573085784912,
"learning_rate": 3.5717314035076355e-05,
"loss": 49.7713,
"step": 652
},
{
"epoch": 2.2322925006672003,
"grad_norm": 4.808114051818848,
"learning_rate": 3.5634542108026876e-05,
"loss": 50.6265,
"step": 654
},
{
"epoch": 2.239124633039765,
"grad_norm": 5.616351127624512,
"learning_rate": 3.5551627605944745e-05,
"loss": 52.1332,
"step": 656
},
{
"epoch": 2.24595676541233,
"grad_norm": 7.0716071128845215,
"learning_rate": 3.5468571640446994e-05,
"loss": 50.7825,
"step": 658
},
{
"epoch": 2.2527888977848947,
"grad_norm": 4.64641809463501,
"learning_rate": 3.5385375325047166e-05,
"loss": 50.3092,
"step": 660
},
{
"epoch": 2.2596210301574593,
"grad_norm": 4.058784008026123,
"learning_rate": 3.5302039775140486e-05,
"loss": 51.7827,
"step": 662
},
{
"epoch": 2.266453162530024,
"grad_norm": 4.011864185333252,
"learning_rate": 3.521856610798887e-05,
"loss": 51.4194,
"step": 664
},
{
"epoch": 2.2732852949025886,
"grad_norm": 3.89857816696167,
"learning_rate": 3.513495544270592e-05,
"loss": 50.7032,
"step": 666
},
{
"epoch": 2.2801174272751537,
"grad_norm": 4.966712951660156,
"learning_rate": 3.505120890024195e-05,
"loss": 49.925,
"step": 668
},
{
"epoch": 2.2869495596477183,
"grad_norm": 4.181141376495361,
"learning_rate": 3.496732760336895e-05,
"loss": 49.5112,
"step": 670
},
{
"epoch": 2.293781692020283,
"grad_norm": 4.761594772338867,
"learning_rate": 3.4883312676665536e-05,
"loss": 49.6545,
"step": 672
},
{
"epoch": 2.3006138243928476,
"grad_norm": 3.97501802444458,
"learning_rate": 3.479916524650188e-05,
"loss": 51.1862,
"step": 674
},
{
"epoch": 2.307445956765412,
"grad_norm": 5.200672149658203,
"learning_rate": 3.4714886441024574e-05,
"loss": 49.9163,
"step": 676
},
{
"epoch": 2.314278089137977,
"grad_norm": 4.147047519683838,
"learning_rate": 3.4630477390141556e-05,
"loss": 48.6138,
"step": 678
},
{
"epoch": 2.321110221510542,
"grad_norm": 4.9791693687438965,
"learning_rate": 3.4545939225506934e-05,
"loss": 51.4538,
"step": 680
},
{
"epoch": 2.3279423538831066,
"grad_norm": 4.929348945617676,
"learning_rate": 3.4461273080505793e-05,
"loss": 51.2735,
"step": 682
},
{
"epoch": 2.334774486255671,
"grad_norm": 4.98499059677124,
"learning_rate": 3.437648009023905e-05,
"loss": 48.5889,
"step": 684
},
{
"epoch": 2.341606618628236,
"grad_norm": 4.354183673858643,
"learning_rate": 3.4291561391508185e-05,
"loss": 51.7768,
"step": 686
},
{
"epoch": 2.3484387510008005,
"grad_norm": 3.482697010040283,
"learning_rate": 3.420651812280006e-05,
"loss": 48.9966,
"step": 688
},
{
"epoch": 2.3552708833733655,
"grad_norm": 4.613458156585693,
"learning_rate": 3.4121351424271594e-05,
"loss": 50.8534,
"step": 690
},
{
"epoch": 2.36210301574593,
"grad_norm": 3.93235182762146,
"learning_rate": 3.4036062437734484e-05,
"loss": 50.9164,
"step": 692
},
{
"epoch": 2.368935148118495,
"grad_norm": 5.348623275756836,
"learning_rate": 3.395065230663996e-05,
"loss": 49.6679,
"step": 694
},
{
"epoch": 2.3757672804910595,
"grad_norm": 5.050134181976318,
"learning_rate": 3.386512217606339e-05,
"loss": 48.0534,
"step": 696
},
{
"epoch": 2.382599412863624,
"grad_norm": 3.7587573528289795,
"learning_rate": 3.3779473192688954e-05,
"loss": 50.3013,
"step": 698
},
{
"epoch": 2.3894315452361887,
"grad_norm": 5.177303314208984,
"learning_rate": 3.369370650479425e-05,
"loss": 48.8704,
"step": 700
},
{
"epoch": 2.3894315452361887,
"eval_loss": 0.7940448522567749,
"eval_runtime": 119.8708,
"eval_samples_per_second": 32.91,
"eval_steps_per_second": 8.234,
"step": 700
},
{
"epoch": 2.396263677608754,
"grad_norm": 4.268886089324951,
"learning_rate": 3.360782326223493e-05,
"loss": 50.0788,
"step": 702
},
{
"epoch": 2.4030958099813184,
"grad_norm": 4.847851276397705,
"learning_rate": 3.3521824616429285e-05,
"loss": 50.5298,
"step": 704
},
{
"epoch": 2.409927942353883,
"grad_norm": 4.221863746643066,
"learning_rate": 3.3435711720342764e-05,
"loss": 51.0571,
"step": 706
},
{
"epoch": 2.4167600747264477,
"grad_norm": 5.5122528076171875,
"learning_rate": 3.3349485728472535e-05,
"loss": 48.3266,
"step": 708
},
{
"epoch": 2.4235922070990124,
"grad_norm": 3.7766902446746826,
"learning_rate": 3.326314779683207e-05,
"loss": 49.9334,
"step": 710
},
{
"epoch": 2.4304243394715774,
"grad_norm": 4.093820571899414,
"learning_rate": 3.3176699082935545e-05,
"loss": 48.4746,
"step": 712
},
{
"epoch": 2.437256471844142,
"grad_norm": 4.116121292114258,
"learning_rate": 3.3090140745782396e-05,
"loss": 48.5131,
"step": 714
},
{
"epoch": 2.4440886042167067,
"grad_norm": 5.181516647338867,
"learning_rate": 3.300347394584172e-05,
"loss": 50.4981,
"step": 716
},
{
"epoch": 2.4509207365892713,
"grad_norm": 4.464053630828857,
"learning_rate": 3.2916699845036816e-05,
"loss": 50.2301,
"step": 718
},
{
"epoch": 2.457752868961836,
"grad_norm": 4.229206562042236,
"learning_rate": 3.282981960672948e-05,
"loss": 50.1858,
"step": 720
},
{
"epoch": 2.4645850013344006,
"grad_norm": 3.8356049060821533,
"learning_rate": 3.2742834395704486e-05,
"loss": 48.9147,
"step": 722
},
{
"epoch": 2.4714171337069657,
"grad_norm": 3.9584670066833496,
"learning_rate": 3.265574537815398e-05,
"loss": 48.6574,
"step": 724
},
{
"epoch": 2.4782492660795303,
"grad_norm": 4.802350997924805,
"learning_rate": 3.25685537216618e-05,
"loss": 48.9724,
"step": 726
},
{
"epoch": 2.485081398452095,
"grad_norm": 4.078526020050049,
"learning_rate": 3.248126059518785e-05,
"loss": 47.7639,
"step": 728
},
{
"epoch": 2.4919135308246596,
"grad_norm": 3.8187856674194336,
"learning_rate": 3.2393867169052385e-05,
"loss": 48.2195,
"step": 730
},
{
"epoch": 2.4987456631972247,
"grad_norm": 5.273796081542969,
"learning_rate": 3.230637461492043e-05,
"loss": 49.7512,
"step": 732
},
{
"epoch": 2.5055777955697893,
"grad_norm": 4.126491069793701,
"learning_rate": 3.221878410578593e-05,
"loss": 49.0844,
"step": 734
},
{
"epoch": 2.512409927942354,
"grad_norm": 4.665433406829834,
"learning_rate": 3.213109681595612e-05,
"loss": 48.7829,
"step": 736
},
{
"epoch": 2.5192420603149186,
"grad_norm": 4.897470951080322,
"learning_rate": 3.2043313921035743e-05,
"loss": 49.5252,
"step": 738
},
{
"epoch": 2.5260741926874832,
"grad_norm": 5.257498264312744,
"learning_rate": 3.195543659791132e-05,
"loss": 50.4767,
"step": 740
},
{
"epoch": 2.532906325060048,
"grad_norm": 3.754957914352417,
"learning_rate": 3.186746602473533e-05,
"loss": 49.4055,
"step": 742
},
{
"epoch": 2.5397384574326125,
"grad_norm": 3.994774341583252,
"learning_rate": 3.177940338091043e-05,
"loss": 49.3039,
"step": 744
},
{
"epoch": 2.5465705898051776,
"grad_norm": 4.923650741577148,
"learning_rate": 3.169124984707367e-05,
"loss": 48.6568,
"step": 746
},
{
"epoch": 2.5534027221777422,
"grad_norm": 6.377063274383545,
"learning_rate": 3.160300660508064e-05,
"loss": 48.7655,
"step": 748
},
{
"epoch": 2.560234854550307,
"grad_norm": 3.7124524116516113,
"learning_rate": 3.151467483798961e-05,
"loss": 48.0997,
"step": 750
},
{
"epoch": 2.560234854550307,
"eval_loss": 0.7798339128494263,
"eval_runtime": 119.2173,
"eval_samples_per_second": 33.091,
"eval_steps_per_second": 8.279,
"step": 750
},
{
"epoch": 2.5670669869228715,
"grad_norm": 4.752464294433594,
"learning_rate": 3.14262557300457e-05,
"loss": 48.422,
"step": 752
},
{
"epoch": 2.5738991192954366,
"grad_norm": 4.635769844055176,
"learning_rate": 3.1337750466665e-05,
"loss": 48.9177,
"step": 754
},
{
"epoch": 2.580731251668001,
"grad_norm": 4.357526779174805,
"learning_rate": 3.124916023441865e-05,
"loss": 49.4801,
"step": 756
},
{
"epoch": 2.587563384040566,
"grad_norm": 16.189651489257812,
"learning_rate": 3.116048622101694e-05,
"loss": 49.275,
"step": 758
},
{
"epoch": 2.5943955164131305,
"grad_norm": 3.983285903930664,
"learning_rate": 3.107172961529343e-05,
"loss": 47.968,
"step": 760
},
{
"epoch": 2.601227648785695,
"grad_norm": 4.357701301574707,
"learning_rate": 3.098289160718895e-05,
"loss": 47.8592,
"step": 762
},
{
"epoch": 2.6080597811582598,
"grad_norm": 3.9686052799224854,
"learning_rate": 3.0893973387735687e-05,
"loss": 49.5191,
"step": 764
},
{
"epoch": 2.6148919135308244,
"grad_norm": 3.9062581062316895,
"learning_rate": 3.0804976149041195e-05,
"loss": 48.5485,
"step": 766
},
{
"epoch": 2.6217240459033895,
"grad_norm": 4.7290143966674805,
"learning_rate": 3.071590108427244e-05,
"loss": 49.2073,
"step": 768
},
{
"epoch": 2.628556178275954,
"grad_norm": 4.57703161239624,
"learning_rate": 3.062674938763976e-05,
"loss": 49.7624,
"step": 770
},
{
"epoch": 2.6353883106485188,
"grad_norm": 4.4061737060546875,
"learning_rate": 3.0537522254380905e-05,
"loss": 49.0566,
"step": 772
},
{
"epoch": 2.6422204430210834,
"grad_norm": 4.166697978973389,
"learning_rate": 3.044822088074496e-05,
"loss": 49.3193,
"step": 774
},
{
"epoch": 2.6490525753936485,
"grad_norm": 3.5513172149658203,
"learning_rate": 3.0358846463976372e-05,
"loss": 48.9675,
"step": 776
},
{
"epoch": 2.655884707766213,
"grad_norm": 4.9701995849609375,
"learning_rate": 3.026940020229882e-05,
"loss": 49.6229,
"step": 778
},
{
"epoch": 2.6627168401387777,
"grad_norm": 4.223094463348389,
"learning_rate": 3.017988329489923e-05,
"loss": 47.1613,
"step": 780
},
{
"epoch": 2.6695489725113424,
"grad_norm": 4.849906921386719,
"learning_rate": 3.0090296941911633e-05,
"loss": 47.5764,
"step": 782
},
{
"epoch": 2.676381104883907,
"grad_norm": 3.507953643798828,
"learning_rate": 3.0000642344401113e-05,
"loss": 47.1944,
"step": 784
},
{
"epoch": 2.6832132372564717,
"grad_norm": 4.040694713592529,
"learning_rate": 2.9910920704347696e-05,
"loss": 48.6472,
"step": 786
},
{
"epoch": 2.6900453696290363,
"grad_norm": 5.141117095947266,
"learning_rate": 2.9821133224630226e-05,
"loss": 47.177,
"step": 788
},
{
"epoch": 2.6968775020016014,
"grad_norm": 4.463181018829346,
"learning_rate": 2.9731281109010256e-05,
"loss": 47.4283,
"step": 790
},
{
"epoch": 2.703709634374166,
"grad_norm": 3.586456060409546,
"learning_rate": 2.9641365562115887e-05,
"loss": 48.9784,
"step": 792
},
{
"epoch": 2.7105417667467306,
"grad_norm": 3.9780969619750977,
"learning_rate": 2.9551387789425638e-05,
"loss": 48.601,
"step": 794
},
{
"epoch": 2.7173738991192953,
"grad_norm": 4.445759296417236,
"learning_rate": 2.9461348997252265e-05,
"loss": 49.9106,
"step": 796
},
{
"epoch": 2.7242060314918604,
"grad_norm": 4.416858673095703,
"learning_rate": 2.9371250392726614e-05,
"loss": 48.3298,
"step": 798
},
{
"epoch": 2.731038163864425,
"grad_norm": 4.36728572845459,
"learning_rate": 2.9281093183781403e-05,
"loss": 48.6063,
"step": 800
},
{
"epoch": 2.731038163864425,
"eval_loss": 0.7699871063232422,
"eval_runtime": 119.5951,
"eval_samples_per_second": 32.986,
"eval_steps_per_second": 8.253,
"step": 800
},
{
"epoch": 2.7378702962369896,
"grad_norm": 5.540378570556641,
"learning_rate": 2.919087857913508e-05,
"loss": 49.4323,
"step": 802
},
{
"epoch": 2.7447024286095543,
"grad_norm": 3.73681640625,
"learning_rate": 2.9100607788275545e-05,
"loss": 49.0439,
"step": 804
},
{
"epoch": 2.751534560982119,
"grad_norm": 4.437684535980225,
"learning_rate": 2.9010282021444008e-05,
"loss": 48.8682,
"step": 806
},
{
"epoch": 2.7583666933546835,
"grad_norm": 4.933871746063232,
"learning_rate": 2.891990248961871e-05,
"loss": 48.0791,
"step": 808
},
{
"epoch": 2.7651988257272486,
"grad_norm": 4.351380825042725,
"learning_rate": 2.8829470404498697e-05,
"loss": 47.0584,
"step": 810
},
{
"epoch": 2.7720309580998133,
"grad_norm": 4.953640937805176,
"learning_rate": 2.8738986978487625e-05,
"loss": 50.0531,
"step": 812
},
{
"epoch": 2.778863090472378,
"grad_norm": 3.676950216293335,
"learning_rate": 2.8648453424677434e-05,
"loss": 46.9994,
"step": 814
},
{
"epoch": 2.7856952228449425,
"grad_norm": 4.177380084991455,
"learning_rate": 2.8557870956832132e-05,
"loss": 48.3932,
"step": 816
},
{
"epoch": 2.7925273552175076,
"grad_norm": 4.177119731903076,
"learning_rate": 2.846724078937149e-05,
"loss": 48.2385,
"step": 818
},
{
"epoch": 2.7993594875900722,
"grad_norm": 4.261831283569336,
"learning_rate": 2.8376564137354795e-05,
"loss": 48.813,
"step": 820
},
{
"epoch": 2.806191619962637,
"grad_norm": 3.7779037952423096,
"learning_rate": 2.8285842216464543e-05,
"loss": 48.801,
"step": 822
},
{
"epoch": 2.8130237523352015,
"grad_norm": 5.378250598907471,
"learning_rate": 2.8195076242990122e-05,
"loss": 45.9584,
"step": 824
},
{
"epoch": 2.819855884707766,
"grad_norm": 3.5369153022766113,
"learning_rate": 2.8104267433811533e-05,
"loss": 46.97,
"step": 826
},
{
"epoch": 2.826688017080331,
"grad_norm": 3.493602991104126,
"learning_rate": 2.8013417006383076e-05,
"loss": 46.7352,
"step": 828
},
{
"epoch": 2.8335201494528954,
"grad_norm": 5.41981840133667,
"learning_rate": 2.7922526178717017e-05,
"loss": 48.4586,
"step": 830
},
{
"epoch": 2.8403522818254605,
"grad_norm": 4.6053948402404785,
"learning_rate": 2.783159616936723e-05,
"loss": 46.5008,
"step": 832
},
{
"epoch": 2.847184414198025,
"grad_norm": 4.136333465576172,
"learning_rate": 2.774062819741293e-05,
"loss": 47.3448,
"step": 834
},
{
"epoch": 2.85401654657059,
"grad_norm": 3.927877187728882,
"learning_rate": 2.764962348244228e-05,
"loss": 46.7369,
"step": 836
},
{
"epoch": 2.8608486789431544,
"grad_norm": 4.283491611480713,
"learning_rate": 2.7558583244536007e-05,
"loss": 48.098,
"step": 838
},
{
"epoch": 2.8676808113157195,
"grad_norm": 3.802030563354492,
"learning_rate": 2.7467508704251137e-05,
"loss": 48.2908,
"step": 840
},
{
"epoch": 2.874512943688284,
"grad_norm": 5.212815761566162,
"learning_rate": 2.7376401082604564e-05,
"loss": 47.8921,
"step": 842
},
{
"epoch": 2.8813450760608488,
"grad_norm": 4.39296293258667,
"learning_rate": 2.7285261601056698e-05,
"loss": 48.2491,
"step": 844
},
{
"epoch": 2.8881772084334134,
"grad_norm": 5.428844928741455,
"learning_rate": 2.7194091481495076e-05,
"loss": 49.1209,
"step": 846
},
{
"epoch": 2.895009340805978,
"grad_norm": 3.9836559295654297,
"learning_rate": 2.7102891946217994e-05,
"loss": 47.0515,
"step": 848
},
{
"epoch": 2.9018414731785427,
"grad_norm": 3.1067824363708496,
"learning_rate": 2.7011664217918154e-05,
"loss": 46.0087,
"step": 850
},
{
"epoch": 2.9018414731785427,
"eval_loss": 0.760260820388794,
"eval_runtime": 119.6698,
"eval_samples_per_second": 32.966,
"eval_steps_per_second": 8.248,
"step": 850
},
{
"epoch": 2.9086736055511073,
"grad_norm": 4.688024997711182,
"learning_rate": 2.6920409519666174e-05,
"loss": 47.0489,
"step": 852
},
{
"epoch": 2.9155057379236724,
"grad_norm": 4.777935981750488,
"learning_rate": 2.6829129074894304e-05,
"loss": 48.1153,
"step": 854
},
{
"epoch": 2.922337870296237,
"grad_norm": 4.912516117095947,
"learning_rate": 2.6737824107379948e-05,
"loss": 48.0798,
"step": 856
},
{
"epoch": 2.9291700026688017,
"grad_norm": 4.066973686218262,
"learning_rate": 2.6646495841229287e-05,
"loss": 46.9194,
"step": 858
},
{
"epoch": 2.9360021350413663,
"grad_norm": 4.499208927154541,
"learning_rate": 2.655514550086086e-05,
"loss": 48.3087,
"step": 860
},
{
"epoch": 2.9428342674139314,
"grad_norm": 4.891952991485596,
"learning_rate": 2.6463774310989154e-05,
"loss": 46.8565,
"step": 862
},
{
"epoch": 2.949666399786496,
"grad_norm": 3.8262720108032227,
"learning_rate": 2.637238349660819e-05,
"loss": 46.7596,
"step": 864
},
{
"epoch": 2.9564985321590607,
"grad_norm": 5.6072492599487305,
"learning_rate": 2.6280974282975063e-05,
"loss": 45.254,
"step": 866
},
{
"epoch": 2.9633306645316253,
"grad_norm": 3.9889800548553467,
"learning_rate": 2.6189547895593562e-05,
"loss": 46.754,
"step": 868
},
{
"epoch": 2.97016279690419,
"grad_norm": 3.7260525226593018,
"learning_rate": 2.6098105560197722e-05,
"loss": 46.6516,
"step": 870
},
{
"epoch": 2.9769949292767546,
"grad_norm": 4.090394973754883,
"learning_rate": 2.600664850273538e-05,
"loss": 47.2404,
"step": 872
},
{
"epoch": 2.983827061649319,
"grad_norm": 3.6287267208099365,
"learning_rate": 2.5915177949351765e-05,
"loss": 46.3821,
"step": 874
},
{
"epoch": 2.9906591940218843,
"grad_norm": 3.5229976177215576,
"learning_rate": 2.582369512637302e-05,
"loss": 46.8471,
"step": 876
},
{
"epoch": 2.997491326394449,
"grad_norm": 3.532615900039673,
"learning_rate": 2.5732201260289806e-05,
"loss": 47.0364,
"step": 878
},
{
"epoch": 3.0034160661862823,
"grad_norm": 3.482403039932251,
"learning_rate": 2.564069757774082e-05,
"loss": 40.3241,
"step": 880
},
{
"epoch": 3.010248198558847,
"grad_norm": 3.94649600982666,
"learning_rate": 2.554918530549637e-05,
"loss": 46.7226,
"step": 882
},
{
"epoch": 3.0170803309314116,
"grad_norm": 4.395301818847656,
"learning_rate": 2.545766567044194e-05,
"loss": 45.266,
"step": 884
},
{
"epoch": 3.0239124633039767,
"grad_norm": 4.813998699188232,
"learning_rate": 2.5366139899561696e-05,
"loss": 46.8651,
"step": 886
},
{
"epoch": 3.0307445956765413,
"grad_norm": 5.5799174308776855,
"learning_rate": 2.527460921992209e-05,
"loss": 46.5727,
"step": 888
},
{
"epoch": 3.037576728049106,
"grad_norm": 6.693199634552002,
"learning_rate": 2.518307485865538e-05,
"loss": 47.987,
"step": 890
},
{
"epoch": 3.0444088604216706,
"grad_norm": 6.33953332901001,
"learning_rate": 2.509153804294318e-05,
"loss": 45.7221,
"step": 892
},
{
"epoch": 3.051240992794235,
"grad_norm": 4.887784957885742,
"learning_rate": 2.5e-05,
"loss": 44.5186,
"step": 894
},
{
"epoch": 3.0580731251668003,
"grad_norm": 4.337290287017822,
"learning_rate": 2.490846195705683e-05,
"loss": 46.394,
"step": 896
},
{
"epoch": 3.064905257539365,
"grad_norm": 3.7094030380249023,
"learning_rate": 2.4816925141344623e-05,
"loss": 45.122,
"step": 898
},
{
"epoch": 3.0717373899119296,
"grad_norm": 3.71903920173645,
"learning_rate": 2.4725390780077908e-05,
"loss": 44.7121,
"step": 900
},
{
"epoch": 3.0717373899119296,
"eval_loss": 0.7495905160903931,
"eval_runtime": 119.7503,
"eval_samples_per_second": 32.944,
"eval_steps_per_second": 8.242,
"step": 900
},
{
"epoch": 3.078569522284494,
"grad_norm": 4.690406799316406,
"learning_rate": 2.4633860100438316e-05,
"loss": 45.6299,
"step": 902
},
{
"epoch": 3.085401654657059,
"grad_norm": 4.29756498336792,
"learning_rate": 2.4542334329558077e-05,
"loss": 48.2504,
"step": 904
},
{
"epoch": 3.092233787029624,
"grad_norm": 5.62404727935791,
"learning_rate": 2.4450814694503636e-05,
"loss": 47.6091,
"step": 906
},
{
"epoch": 3.0990659194021886,
"grad_norm": 3.726529836654663,
"learning_rate": 2.435930242225919e-05,
"loss": 46.4755,
"step": 908
},
{
"epoch": 3.105898051774753,
"grad_norm": 6.04416036605835,
"learning_rate": 2.4267798739710203e-05,
"loss": 46.9715,
"step": 910
},
{
"epoch": 3.112730184147318,
"grad_norm": 3.8375885486602783,
"learning_rate": 2.4176304873626985e-05,
"loss": 47.9794,
"step": 912
},
{
"epoch": 3.1195623165198825,
"grad_norm": 3.296687602996826,
"learning_rate": 2.4084822050648237e-05,
"loss": 45.0776,
"step": 914
},
{
"epoch": 3.126394448892447,
"grad_norm": 3.546963930130005,
"learning_rate": 2.399335149726463e-05,
"loss": 44.6584,
"step": 916
},
{
"epoch": 3.133226581265012,
"grad_norm": 3.896601676940918,
"learning_rate": 2.390189443980229e-05,
"loss": 47.0284,
"step": 918
},
{
"epoch": 3.140058713637577,
"grad_norm": 3.570570468902588,
"learning_rate": 2.3810452104406444e-05,
"loss": 46.4413,
"step": 920
},
{
"epoch": 3.1468908460101415,
"grad_norm": 4.160488605499268,
"learning_rate": 2.3719025717024946e-05,
"loss": 47.1564,
"step": 922
},
{
"epoch": 3.153722978382706,
"grad_norm": 5.714613914489746,
"learning_rate": 2.3627616503391814e-05,
"loss": 48.2275,
"step": 924
},
{
"epoch": 3.1605551107552707,
"grad_norm": 4.362124919891357,
"learning_rate": 2.3536225689010845e-05,
"loss": 47.0592,
"step": 926
},
{
"epoch": 3.167387243127836,
"grad_norm": 6.478647708892822,
"learning_rate": 2.3444854499139142e-05,
"loss": 47.4139,
"step": 928
},
{
"epoch": 3.1742193755004005,
"grad_norm": 3.713979721069336,
"learning_rate": 2.3353504158770722e-05,
"loss": 47.7301,
"step": 930
},
{
"epoch": 3.181051507872965,
"grad_norm": 3.875537872314453,
"learning_rate": 2.3262175892620065e-05,
"loss": 45.6112,
"step": 932
},
{
"epoch": 3.1878836402455297,
"grad_norm": 5.328731536865234,
"learning_rate": 2.3170870925105702e-05,
"loss": 46.6125,
"step": 934
},
{
"epoch": 3.1947157726180944,
"grad_norm": 5.152383327484131,
"learning_rate": 2.307959048033383e-05,
"loss": 45.6076,
"step": 936
},
{
"epoch": 3.201547904990659,
"grad_norm": 4.689112186431885,
"learning_rate": 2.2988335782081855e-05,
"loss": 45.648,
"step": 938
},
{
"epoch": 3.208380037363224,
"grad_norm": 3.3412325382232666,
"learning_rate": 2.2897108053782e-05,
"loss": 44.4993,
"step": 940
},
{
"epoch": 3.2152121697357887,
"grad_norm": 11.583976745605469,
"learning_rate": 2.280590851850493e-05,
"loss": 46.3174,
"step": 942
},
{
"epoch": 3.2220443021083534,
"grad_norm": 4.012174606323242,
"learning_rate": 2.271473839894331e-05,
"loss": 46.3054,
"step": 944
},
{
"epoch": 3.228876434480918,
"grad_norm": 6.315187931060791,
"learning_rate": 2.2623598917395438e-05,
"loss": 44.3273,
"step": 946
},
{
"epoch": 3.2357085668534826,
"grad_norm": 5.612927436828613,
"learning_rate": 2.253249129574887e-05,
"loss": 46.8669,
"step": 948
},
{
"epoch": 3.2425406992260477,
"grad_norm": 3.7026705741882324,
"learning_rate": 2.2441416755463995e-05,
"loss": 46.4012,
"step": 950
},
{
"epoch": 3.2425406992260477,
"eval_loss": 0.7383518218994141,
"eval_runtime": 118.6959,
"eval_samples_per_second": 33.236,
"eval_steps_per_second": 8.315,
"step": 950
},
{
"epoch": 3.2493728315986123,
"grad_norm": 4.251457214355469,
"learning_rate": 2.2350376517557727e-05,
"loss": 47.1319,
"step": 952
},
{
"epoch": 3.256204963971177,
"grad_norm": 4.500071048736572,
"learning_rate": 2.2259371802587068e-05,
"loss": 47.0883,
"step": 954
},
{
"epoch": 3.2630370963437416,
"grad_norm": 4.684493064880371,
"learning_rate": 2.216840383063277e-05,
"loss": 45.0587,
"step": 956
},
{
"epoch": 3.2698692287163063,
"grad_norm": 3.853529453277588,
"learning_rate": 2.2077473821282996e-05,
"loss": 46.3262,
"step": 958
},
{
"epoch": 3.276701361088871,
"grad_norm": 5.501523971557617,
"learning_rate": 2.1986582993616926e-05,
"loss": 44.8375,
"step": 960
},
{
"epoch": 3.283533493461436,
"grad_norm": 15.540706634521484,
"learning_rate": 2.1895732566188476e-05,
"loss": 45.117,
"step": 962
},
{
"epoch": 3.2903656258340006,
"grad_norm": 2.6855862140655518,
"learning_rate": 2.1804923757009884e-05,
"loss": 45.9567,
"step": 964
},
{
"epoch": 3.2971977582065652,
"grad_norm": 4.529240131378174,
"learning_rate": 2.1714157783535463e-05,
"loss": 44.7532,
"step": 966
},
{
"epoch": 3.30402989057913,
"grad_norm": 4.690282344818115,
"learning_rate": 2.1623435862645204e-05,
"loss": 45.8376,
"step": 968
},
{
"epoch": 3.3108620229516945,
"grad_norm": 5.309507846832275,
"learning_rate": 2.153275921062851e-05,
"loss": 46.1757,
"step": 970
},
{
"epoch": 3.3176941553242596,
"grad_norm": 4.278385639190674,
"learning_rate": 2.1442129043167874e-05,
"loss": 46.6388,
"step": 972
},
{
"epoch": 3.3245262876968242,
"grad_norm": 4.2424516677856445,
"learning_rate": 2.1351546575322572e-05,
"loss": 45.1695,
"step": 974
},
{
"epoch": 3.331358420069389,
"grad_norm": 3.695155143737793,
"learning_rate": 2.126101302151238e-05,
"loss": 45.9417,
"step": 976
},
{
"epoch": 3.3381905524419535,
"grad_norm": 4.2003374099731445,
"learning_rate": 2.1170529595501305e-05,
"loss": 44.4002,
"step": 978
},
{
"epoch": 3.345022684814518,
"grad_norm": 4.378734588623047,
"learning_rate": 2.1080097510381298e-05,
"loss": 45.4517,
"step": 980
},
{
"epoch": 3.351854817187083,
"grad_norm": 3.96730637550354,
"learning_rate": 2.098971797855599e-05,
"loss": 43.9996,
"step": 982
},
{
"epoch": 3.358686949559648,
"grad_norm": 3.6162188053131104,
"learning_rate": 2.089939221172446e-05,
"loss": 43.9178,
"step": 984
},
{
"epoch": 3.3655190819322125,
"grad_norm": 4.3834099769592285,
"learning_rate": 2.0809121420864923e-05,
"loss": 46.2701,
"step": 986
},
{
"epoch": 3.372351214304777,
"grad_norm": 4.271561145782471,
"learning_rate": 2.07189068162186e-05,
"loss": 45.7546,
"step": 988
},
{
"epoch": 3.3791833466773418,
"grad_norm": 3.5791757106781006,
"learning_rate": 2.0628749607273396e-05,
"loss": 45.3079,
"step": 990
},
{
"epoch": 3.3860154790499064,
"grad_norm": 4.5101318359375,
"learning_rate": 2.0538651002747744e-05,
"loss": 46.5476,
"step": 992
},
{
"epoch": 3.3928476114224715,
"grad_norm": 5.944687366485596,
"learning_rate": 2.0448612210574365e-05,
"loss": 44.0355,
"step": 994
},
{
"epoch": 3.399679743795036,
"grad_norm": 4.936254501342773,
"learning_rate": 2.0358634437884112e-05,
"loss": 46.0717,
"step": 996
},
{
"epoch": 3.4065118761676008,
"grad_norm": 4.114757537841797,
"learning_rate": 2.0268718890989753e-05,
"loss": 44.5295,
"step": 998
},
{
"epoch": 3.4133440085401654,
"grad_norm": 8.12585735321045,
"learning_rate": 2.0178866775369777e-05,
"loss": 45.0747,
"step": 1000
},
{
"epoch": 3.4133440085401654,
"eval_loss": 0.7275528907775879,
"eval_runtime": 119.5885,
"eval_samples_per_second": 32.988,
"eval_steps_per_second": 8.253,
"step": 1000
},
{
"epoch": 3.4304243394715774,
"grad_norm": 4.9336113929748535,
"learning_rate": 2.0089079295652306e-05,
"loss": 45.5736,
"step": 1002
},
{
"epoch": 3.437256471844142,
"grad_norm": 5.042412757873535,
"learning_rate": 1.9999357655598893e-05,
"loss": 45.6651,
"step": 1004
},
{
"epoch": 3.4440886042167067,
"grad_norm": 3.9377660751342773,
"learning_rate": 1.9909703058088376e-05,
"loss": 44.5559,
"step": 1006
},
{
"epoch": 3.4509207365892713,
"grad_norm": 4.054321765899658,
"learning_rate": 1.9820116705100777e-05,
"loss": 45.1868,
"step": 1008
},
{
"epoch": 3.457752868961836,
"grad_norm": 4.860738277435303,
"learning_rate": 1.9730599797701177e-05,
"loss": 44.6737,
"step": 1010
},
{
"epoch": 3.4645850013344006,
"grad_norm": 3.950925827026367,
"learning_rate": 1.9641153536023644e-05,
"loss": 43.7733,
"step": 1012
},
{
"epoch": 3.4714171337069657,
"grad_norm": 3.831669569015503,
"learning_rate": 1.9551779119255043e-05,
"loss": 43.7403,
"step": 1014
},
{
"epoch": 3.4782492660795303,
"grad_norm": 4.114947319030762,
"learning_rate": 1.9462477745619108e-05,
"loss": 45.5074,
"step": 1016
},
{
"epoch": 3.485081398452095,
"grad_norm": 3.405243158340454,
"learning_rate": 1.9373250612360246e-05,
"loss": 46.4417,
"step": 1018
},
{
"epoch": 3.4919135308246596,
"grad_norm": 4.80495023727417,
"learning_rate": 1.928409891572757e-05,
"loss": 44.9758,
"step": 1020
},
{
"epoch": 3.4987456631972247,
"grad_norm": 4.239831447601318,
"learning_rate": 1.919502385095881e-05,
"loss": 44.6174,
"step": 1022
},
{
"epoch": 3.5055777955697893,
"grad_norm": 4.724026203155518,
"learning_rate": 1.9106026612264316e-05,
"loss": 44.7325,
"step": 1024
},
{
"epoch": 3.512409927942354,
"grad_norm": 3.4634554386138916,
"learning_rate": 1.9017108392811065e-05,
"loss": 43.7796,
"step": 1026
},
{
"epoch": 3.5192420603149186,
"grad_norm": 4.715716361999512,
"learning_rate": 1.8928270384706584e-05,
"loss": 45.2777,
"step": 1028
},
{
"epoch": 3.5260741926874832,
"grad_norm": 5.100541114807129,
"learning_rate": 1.8839513778983066e-05,
"loss": 46.4359,
"step": 1030
},
{
"epoch": 3.532906325060048,
"grad_norm": 4.475189685821533,
"learning_rate": 1.875083976558136e-05,
"loss": 44.0298,
"step": 1032
},
{
"epoch": 3.5397384574326125,
"grad_norm": 4.431650161743164,
"learning_rate": 1.8662249533335003e-05,
"loss": 44.2631,
"step": 1034
},
{
"epoch": 3.5465705898051776,
"grad_norm": 4.561038970947266,
"learning_rate": 1.8573744269954298e-05,
"loss": 43.9968,
"step": 1036
},
{
"epoch": 3.5534027221777422,
"grad_norm": 3.4181675910949707,
"learning_rate": 1.848532516201039e-05,
"loss": 43.372,
"step": 1038
},
{
"epoch": 3.560234854550307,
"grad_norm": 4.05961799621582,
"learning_rate": 1.8396993394919372e-05,
"loss": 43.5887,
"step": 1040
},
{
"epoch": 3.5670669869228715,
"grad_norm": 4.183586597442627,
"learning_rate": 1.8308750152926337e-05,
"loss": 43.1976,
"step": 1042
},
{
"epoch": 3.5738991192954366,
"grad_norm": 4.6883745193481445,
"learning_rate": 1.8220596619089576e-05,
"loss": 44.4463,
"step": 1044
},
{
"epoch": 3.580731251668001,
"grad_norm": 4.490588665008545,
"learning_rate": 1.8132533975264682e-05,
"loss": 44.3332,
"step": 1046
},
{
"epoch": 3.587563384040566,
"grad_norm": 4.937854766845703,
"learning_rate": 1.8044563402088684e-05,
"loss": 45.1199,
"step": 1048
},
{
"epoch": 3.5943955164131305,
"grad_norm": 3.8182907104492188,
"learning_rate": 1.795668607896426e-05,
"loss": 45.2035,
"step": 1050
},
{
"epoch": 3.5943955164131305,
"eval_loss": 0.7135393619537354,
"eval_runtime": 130.7813,
"eval_samples_per_second": 30.165,
"eval_steps_per_second": 7.547,
"step": 1050
},
{
"epoch": 3.601227648785695,
"grad_norm": 3.3739826679229736,
"learning_rate": 1.7868903184043887e-05,
"loss": 43.5257,
"step": 1052
},
{
"epoch": 3.6080597811582598,
"grad_norm": 3.8119192123413086,
"learning_rate": 1.7781215894214078e-05,
"loss": 44.9718,
"step": 1054
},
{
"epoch": 3.6148919135308244,
"grad_norm": 3.6780483722686768,
"learning_rate": 1.7693625385079577e-05,
"loss": 44.496,
"step": 1056
},
{
"epoch": 3.6217240459033895,
"grad_norm": 4.625596523284912,
"learning_rate": 1.7606132830947614e-05,
"loss": 43.6496,
"step": 1058
},
{
"epoch": 3.628556178275954,
"grad_norm": 5.467988967895508,
"learning_rate": 1.7518739404812155e-05,
"loss": 45.3773,
"step": 1060
},
{
"epoch": 3.6353883106485188,
"grad_norm": 3.7848103046417236,
"learning_rate": 1.7431446278338197e-05,
"loss": 43.6622,
"step": 1062
},
{
"epoch": 3.6422204430210834,
"grad_norm": 6.2495222091674805,
"learning_rate": 1.7344254621846016e-05,
"loss": 44.7325,
"step": 1064
},
{
"epoch": 3.6490525753936485,
"grad_norm": 4.541433811187744,
"learning_rate": 1.7257165604295513e-05,
"loss": 45.7111,
"step": 1066
},
{
"epoch": 3.655884707766213,
"grad_norm": 3.6900789737701416,
"learning_rate": 1.7170180393270532e-05,
"loss": 46.2799,
"step": 1068
},
{
"epoch": 3.6627168401387777,
"grad_norm": 3.999112129211426,
"learning_rate": 1.7083300154963193e-05,
"loss": 44.9348,
"step": 1070
},
{
"epoch": 3.6695489725113424,
"grad_norm": 4.940526008605957,
"learning_rate": 1.699652605415828e-05,
"loss": 45.9208,
"step": 1072
},
{
"epoch": 3.676381104883907,
"grad_norm": 3.8536486625671387,
"learning_rate": 1.6909859254217613e-05,
"loss": 45.3559,
"step": 1074
},
{
"epoch": 3.6832132372564717,
"grad_norm": 5.941255569458008,
"learning_rate": 1.682330091706446e-05,
"loss": 44.2183,
"step": 1076
},
{
"epoch": 3.6900453696290363,
"grad_norm": 4.6851091384887695,
"learning_rate": 1.6736852203167935e-05,
"loss": 45.0132,
"step": 1078
},
{
"epoch": 3.6968775020016014,
"grad_norm": 6.338913917541504,
"learning_rate": 1.6650514271527468e-05,
"loss": 44.5087,
"step": 1080
},
{
"epoch": 3.703709634374166,
"grad_norm": 6.134509086608887,
"learning_rate": 1.6564288279657252e-05,
"loss": 44.5929,
"step": 1082
},
{
"epoch": 3.7105417667467306,
"grad_norm": 3.0185976028442383,
"learning_rate": 1.647817538357072e-05,
"loss": 44.4708,
"step": 1084
},
{
"epoch": 3.7173738991192953,
"grad_norm": 4.479791641235352,
"learning_rate": 1.639217673776507e-05,
"loss": 44.4799,
"step": 1086
},
{
"epoch": 3.7242060314918604,
"grad_norm": 3.9354395866394043,
"learning_rate": 1.630629349520576e-05,
"loss": 43.3393,
"step": 1088
},
{
"epoch": 3.731038163864425,
"grad_norm": 4.530430316925049,
"learning_rate": 1.622052680731105e-05,
"loss": 43.1996,
"step": 1090
},
{
"epoch": 3.7378702962369896,
"grad_norm": 4.594604015350342,
"learning_rate": 1.613487782393661e-05,
"loss": 43.6473,
"step": 1092
},
{
"epoch": 3.7447024286095543,
"grad_norm": 4.38798713684082,
"learning_rate": 1.604934769336004e-05,
"loss": 43.1229,
"step": 1094
},
{
"epoch": 3.751534560982119,
"grad_norm": 4.350236415863037,
"learning_rate": 1.5963937562265525e-05,
"loss": 44.7883,
"step": 1096
},
{
"epoch": 3.7583666933546835,
"grad_norm": 4.064984321594238,
"learning_rate": 1.587864857572842e-05,
"loss": 44.1865,
"step": 1098
},
{
"epoch": 3.7651988257272486,
"grad_norm": 4.607226848602295,
"learning_rate": 1.5793481877199946e-05,
"loss": 44.6176,
"step": 1100
},
{
"epoch": 3.7651988257272486,
"eval_loss": 0.7090520858764648,
"eval_runtime": 136.3013,
"eval_samples_per_second": 28.943,
"eval_steps_per_second": 7.241,
"step": 1100
},
{
"epoch": 3.7720309580998133,
"grad_norm": 4.4557719230651855,
"learning_rate": 1.5708438608491814e-05,
"loss": 42.0453,
"step": 1102
},
{
"epoch": 3.778863090472378,
"grad_norm": 5.199422359466553,
"learning_rate": 1.5623519909760954e-05,
"loss": 42.589,
"step": 1104
},
{
"epoch": 3.7856952228449425,
"grad_norm": 3.632471799850464,
"learning_rate": 1.5538726919494206e-05,
"loss": 43.7924,
"step": 1106
},
{
"epoch": 3.7925273552175076,
"grad_norm": 4.203450679779053,
"learning_rate": 1.5454060774493068e-05,
"loss": 45.02,
"step": 1108
},
{
"epoch": 3.7993594875900722,
"grad_norm": 5.149316310882568,
"learning_rate": 1.5369522609858446e-05,
"loss": 44.2724,
"step": 1110
},
{
"epoch": 3.806191619962637,
"grad_norm": 3.5306341648101807,
"learning_rate": 1.528511355897543e-05,
"loss": 44.2268,
"step": 1112
},
{
"epoch": 3.8130237523352015,
"grad_norm": 4.296536445617676,
"learning_rate": 1.5200834753498128e-05,
"loss": 44.0479,
"step": 1114
},
{
"epoch": 3.819855884707766,
"grad_norm": 2.969525098800659,
"learning_rate": 1.5116687323334467e-05,
"loss": 43.5543,
"step": 1116
},
{
"epoch": 3.826688017080331,
"grad_norm": 4.044551849365234,
"learning_rate": 1.5032672396631056e-05,
"loss": 45.7925,
"step": 1118
},
{
"epoch": 3.8335201494528954,
"grad_norm": 5.003629207611084,
"learning_rate": 1.4948791099758052e-05,
"loss": 44.2037,
"step": 1120
},
{
"epoch": 3.8403522818254605,
"grad_norm": 3.4248318672180176,
"learning_rate": 1.486504455729408e-05,
"loss": 43.9243,
"step": 1122
},
{
"epoch": 3.847184414198025,
"grad_norm": 4.228148937225342,
"learning_rate": 1.4781433892011131e-05,
"loss": 44.7779,
"step": 1124
},
{
"epoch": 3.85401654657059,
"grad_norm": 4.345002174377441,
"learning_rate": 1.4697960224859513e-05,
"loss": 43.0617,
"step": 1126
},
{
"epoch": 3.8608486789431544,
"grad_norm": 4.824610233306885,
"learning_rate": 1.4614624674952842e-05,
"loss": 43.2687,
"step": 1128
},
{
"epoch": 3.8676808113157195,
"grad_norm": 5.528540134429932,
"learning_rate": 1.4531428359553017e-05,
"loss": 43.5145,
"step": 1130
},
{
"epoch": 3.874512943688284,
"grad_norm": 3.7578537464141846,
"learning_rate": 1.4448372394055249e-05,
"loss": 43.2377,
"step": 1132
},
{
"epoch": 3.8813450760608488,
"grad_norm": 3.191563367843628,
"learning_rate": 1.436545789197313e-05,
"loss": 43.493,
"step": 1134
},
{
"epoch": 3.8881772084334134,
"grad_norm": 3.1072089672088623,
"learning_rate": 1.4282685964923642e-05,
"loss": 44.5567,
"step": 1136
},
{
"epoch": 3.895009340805978,
"grad_norm": 4.651160717010498,
"learning_rate": 1.4200057722612336e-05,
"loss": 42.7739,
"step": 1138
},
{
"epoch": 3.9018414731785427,
"grad_norm": 3.203441858291626,
"learning_rate": 1.4117574272818388e-05,
"loss": 43.1438,
"step": 1140
},
{
"epoch": 3.9086736055511073,
"grad_norm": 4.5728349685668945,
"learning_rate": 1.4035236721379757e-05,
"loss": 44.305,
"step": 1142
},
{
"epoch": 3.9155057379236724,
"grad_norm": 6.874294757843018,
"learning_rate": 1.3953046172178414e-05,
"loss": 42.8162,
"step": 1144
},
{
"epoch": 3.922337870296237,
"grad_norm": 5.198761463165283,
"learning_rate": 1.387100372712548e-05,
"loss": 44.2441,
"step": 1146
},
{
"epoch": 3.9291700026688017,
"grad_norm": 3.9007508754730225,
"learning_rate": 1.378911048614647e-05,
"loss": 43.0147,
"step": 1148
},
{
"epoch": 3.9360021350413663,
"grad_norm": 3.7035725116729736,
"learning_rate": 1.3707367547166569e-05,
"loss": 45.0733,
"step": 1150
},
{
"epoch": 3.9360021350413663,
"eval_loss": 0.7048025131225586,
"eval_runtime": 132.7997,
"eval_samples_per_second": 29.706,
"eval_steps_per_second": 7.432,
"step": 1150
},
{
"epoch": 3.9428342674139314,
"grad_norm": 5.101466655731201,
"learning_rate": 1.3625776006095881e-05,
"loss": 42.4982,
"step": 1152
},
{
"epoch": 3.949666399786496,
"grad_norm": 4.983183860778809,
"learning_rate": 1.354433695681474e-05,
"loss": 43.3568,
"step": 1154
},
{
"epoch": 3.9564985321590607,
"grad_norm": 3.6875593662261963,
"learning_rate": 1.3463051491159096e-05,
"loss": 45.16,
"step": 1156
},
{
"epoch": 3.9633306645316253,
"grad_norm": 4.482807636260986,
"learning_rate": 1.3381920698905787e-05,
"loss": 42.8545,
"step": 1158
},
{
"epoch": 3.97016279690419,
"grad_norm": 3.858903646469116,
"learning_rate": 1.3300945667758014e-05,
"loss": 42.5779,
"step": 1160
},
{
"epoch": 3.9769949292767546,
"grad_norm": 5.07602596282959,
"learning_rate": 1.3220127483330713e-05,
"loss": 43.8678,
"step": 1162
},
{
"epoch": 3.983827061649319,
"grad_norm": 5.183884620666504,
"learning_rate": 1.3139467229135999e-05,
"loss": 44.2575,
"step": 1164
},
{
"epoch": 3.9906591940218843,
"grad_norm": 5.44564962387085,
"learning_rate": 1.3058965986568648e-05,
"loss": 42.0898,
"step": 1166
},
{
"epoch": 3.997491326394449,
"grad_norm": 3.4175875186920166,
"learning_rate": 1.2978624834891628e-05,
"loss": 43.526,
"step": 1168
},
{
"epoch": 4.006832132372565,
"grad_norm": 5.1483588218688965,
"learning_rate": 1.2898444851221565e-05,
"loss": 60.1634,
"step": 1170
},
{
"epoch": 4.013664264745129,
"grad_norm": 4.452287673950195,
"learning_rate": 1.281842711051438e-05,
"loss": 41.7569,
"step": 1172
},
{
"epoch": 4.020496397117694,
"grad_norm": 4.024214267730713,
"learning_rate": 1.2738572685550799e-05,
"loss": 44.7667,
"step": 1174
},
{
"epoch": 4.0273285294902585,
"grad_norm": 5.533107757568359,
"learning_rate": 1.2658882646922034e-05,
"loss": 43.7144,
"step": 1176
},
{
"epoch": 4.034160661862823,
"grad_norm": 4.520675182342529,
"learning_rate": 1.2579358063015418e-05,
"loss": 43.3862,
"step": 1178
},
{
"epoch": 4.040992794235389,
"grad_norm": 4.086079120635986,
"learning_rate": 1.2500000000000006e-05,
"loss": 44.268,
"step": 1180
},
{
"epoch": 4.047824926607953,
"grad_norm": 3.335569381713867,
"learning_rate": 1.2420809521812404e-05,
"loss": 43.1871,
"step": 1182
},
{
"epoch": 4.054657058980518,
"grad_norm": 4.651849746704102,
"learning_rate": 1.2341787690142437e-05,
"loss": 43.4785,
"step": 1184
},
{
"epoch": 4.061489191353083,
"grad_norm": 3.9412457942962646,
"learning_rate": 1.2262935564418886e-05,
"loss": 42.1075,
"step": 1186
},
{
"epoch": 4.068321323725647,
"grad_norm": 5.621413230895996,
"learning_rate": 1.2184254201795365e-05,
"loss": 44.5849,
"step": 1188
},
{
"epoch": 4.075153456098212,
"grad_norm": 4.291881084442139,
"learning_rate": 1.2105744657136064e-05,
"loss": 42.9562,
"step": 1190
},
{
"epoch": 4.0819855884707765,
"grad_norm": 3.730132818222046,
"learning_rate": 1.2027407983001681e-05,
"loss": 44.0838,
"step": 1192
},
{
"epoch": 4.088817720843341,
"grad_norm": 3.540987968444824,
"learning_rate": 1.1949245229635245e-05,
"loss": 43.4705,
"step": 1194
},
{
"epoch": 4.095649853215906,
"grad_norm": 3.0649805068969727,
"learning_rate": 1.1871257444948098e-05,
"loss": 43.0996,
"step": 1196
},
{
"epoch": 4.10248198558847,
"grad_norm": 3.2024762630462646,
"learning_rate": 1.1793445674505776e-05,
"loss": 42.772,
"step": 1198
},
{
"epoch": 4.109314117961035,
"grad_norm": 3.462251663208008,
"learning_rate": 1.1715810961514073e-05,
"loss": 43.2502,
"step": 1200
},
{
"epoch": 4.109314117961035,
"eval_loss": 0.7009151577949524,
"eval_runtime": 133.1765,
"eval_samples_per_second": 29.622,
"eval_steps_per_second": 7.411,
"step": 1200
},
{
"epoch": 4.116146250333601,
"grad_norm": 4.633735656738281,
"learning_rate": 1.1638354346804971e-05,
"loss": 42.8239,
"step": 1202
},
{
"epoch": 4.122978382706165,
"grad_norm": 3.758700132369995,
"learning_rate": 1.1561076868822756e-05,
"loss": 43.3475,
"step": 1204
},
{
"epoch": 4.12981051507873,
"grad_norm": 4.143715858459473,
"learning_rate": 1.148397956361007e-05,
"loss": 44.0,
"step": 1206
},
{
"epoch": 4.1366426474512945,
"grad_norm": 5.201571941375732,
"learning_rate": 1.1407063464793966e-05,
"loss": 42.5036,
"step": 1208
},
{
"epoch": 4.143474779823859,
"grad_norm": 3.4282047748565674,
"learning_rate": 1.133032960357216e-05,
"loss": 43.0577,
"step": 1210
},
{
"epoch": 4.150306912196424,
"grad_norm": 4.114802837371826,
"learning_rate": 1.1253779008699131e-05,
"loss": 43.3517,
"step": 1212
},
{
"epoch": 4.157139044568988,
"grad_norm": 3.979163408279419,
"learning_rate": 1.1177412706472321e-05,
"loss": 42.5044,
"step": 1214
},
{
"epoch": 4.163971176941553,
"grad_norm": 4.363109588623047,
"learning_rate": 1.1101231720718442e-05,
"loss": 43.8954,
"step": 1216
},
{
"epoch": 4.170803309314118,
"grad_norm": 4.6219401359558105,
"learning_rate": 1.1025237072779663e-05,
"loss": 43.413,
"step": 1218
},
{
"epoch": 4.177635441686682,
"grad_norm": 4.945540904998779,
"learning_rate": 1.09494297815e-05,
"loss": 43.9628,
"step": 1220
},
{
"epoch": 4.184467574059248,
"grad_norm": 4.4585747718811035,
"learning_rate": 1.0873810863211595e-05,
"loss": 42.6454,
"step": 1222
},
{
"epoch": 4.1912997064318125,
"grad_norm": 4.659883499145508,
"learning_rate": 1.0798381331721109e-05,
"loss": 42.5656,
"step": 1224
},
{
"epoch": 4.198131838804377,
"grad_norm": 4.411434650421143,
"learning_rate": 1.0723142198296155e-05,
"loss": 41.2252,
"step": 1226
},
{
"epoch": 4.204963971176942,
"grad_norm": 4.985414028167725,
"learning_rate": 1.0648094471651724e-05,
"loss": 42.05,
"step": 1228
},
{
"epoch": 4.211796103549506,
"grad_norm": 5.09487771987915,
"learning_rate": 1.0573239157936619e-05,
"loss": 42.9917,
"step": 1230
},
{
"epoch": 4.218628235922071,
"grad_norm": 4.299539089202881,
"learning_rate": 1.049857726072005e-05,
"loss": 42.7934,
"step": 1232
},
{
"epoch": 4.225460368294636,
"grad_norm": 4.075766086578369,
"learning_rate": 1.0424109780978103e-05,
"loss": 41.0067,
"step": 1234
},
{
"epoch": 4.2322925006672,
"grad_norm": 4.9132232666015625,
"learning_rate": 1.034983771708035e-05,
"loss": 43.6556,
"step": 1236
},
{
"epoch": 4.239124633039765,
"grad_norm": 4.45914888381958,
"learning_rate": 1.0275762064776492e-05,
"loss": 42.588,
"step": 1238
},
{
"epoch": 4.24595676541233,
"grad_norm": 3.7621419429779053,
"learning_rate": 1.020188381718295e-05,
"loss": 41.7435,
"step": 1240
},
{
"epoch": 4.252788897784894,
"grad_norm": 2.9593658447265625,
"learning_rate": 1.0128203964769601e-05,
"loss": 43.7138,
"step": 1242
},
{
"epoch": 4.25962103015746,
"grad_norm": 4.333788871765137,
"learning_rate": 1.0054723495346482e-05,
"loss": 42.7332,
"step": 1244
},
{
"epoch": 4.266453162530024,
"grad_norm": 4.040637493133545,
"learning_rate": 9.981443394050525e-06,
"loss": 43.0547,
"step": 1246
},
{
"epoch": 4.273285294902589,
"grad_norm": 5.255796432495117,
"learning_rate": 9.908364643332399e-06,
"loss": 42.1078,
"step": 1248
},
{
"epoch": 4.280117427275154,
"grad_norm": 3.434884786605835,
"learning_rate": 9.835488222943285e-06,
"loss": 42.6684,
"step": 1250
},
{
"epoch": 4.280117427275154,
"eval_loss": 0.6948874592781067,
"eval_runtime": 138.5111,
"eval_samples_per_second": 28.481,
"eval_steps_per_second": 7.126,
"step": 1250
},
{
"epoch": 4.286949559647718,
"grad_norm": 4.761016368865967,
"learning_rate": 9.762815109921761e-06,
"loss": 43.8,
"step": 1252
},
{
"epoch": 4.293781692020283,
"grad_norm": 5.999067783355713,
"learning_rate": 9.690346278580726e-06,
"loss": 42.8654,
"step": 1254
},
{
"epoch": 4.300613824392848,
"grad_norm": 4.777903079986572,
"learning_rate": 9.618082700494319e-06,
"loss": 42.3409,
"step": 1256
},
{
"epoch": 4.307445956765412,
"grad_norm": 4.543084144592285,
"learning_rate": 9.546025344484869e-06,
"loss": 43.6205,
"step": 1258
},
{
"epoch": 4.314278089137977,
"grad_norm": 3.6853065490722656,
"learning_rate": 9.474175176609956e-06,
"loss": 43.9045,
"step": 1260
},
{
"epoch": 4.3211102215105415,
"grad_norm": 4.3578338623046875,
"learning_rate": 9.402533160149416e-06,
"loss": 41.781,
"step": 1262
},
{
"epoch": 4.327942353883106,
"grad_norm": 4.191073894500732,
"learning_rate": 9.331100255592437e-06,
"loss": 42.5713,
"step": 1264
},
{
"epoch": 4.334774486255672,
"grad_norm": 5.591835021972656,
"learning_rate": 9.259877420624721e-06,
"loss": 42.9316,
"step": 1266
},
{
"epoch": 4.341606618628236,
"grad_norm": 4.916292667388916,
"learning_rate": 9.18886561011557e-06,
"loss": 42.9316,
"step": 1268
},
{
"epoch": 4.348438751000801,
"grad_norm": 3.4310858249664307,
"learning_rate": 9.118065776105159e-06,
"loss": 42.0445,
"step": 1270
},
{
"epoch": 4.3552708833733655,
"grad_norm": 3.6645348072052,
"learning_rate": 9.047478867791732e-06,
"loss": 41.5698,
"step": 1272
},
{
"epoch": 4.36210301574593,
"grad_norm": 4.118466854095459,
"learning_rate": 8.977105831518864e-06,
"loss": 41.7493,
"step": 1274
},
{
"epoch": 4.368935148118495,
"grad_norm": 4.731881141662598,
"learning_rate": 8.906947610762825e-06,
"loss": 41.2277,
"step": 1276
},
{
"epoch": 4.3757672804910595,
"grad_norm": 4.580758571624756,
"learning_rate": 8.837005146119872e-06,
"loss": 42.3467,
"step": 1278
},
{
"epoch": 4.382599412863624,
"grad_norm": 5.310960292816162,
"learning_rate": 8.767279375293672e-06,
"loss": 43.1447,
"step": 1280
},
{
"epoch": 4.389431545236189,
"grad_norm": 4.382359027862549,
"learning_rate": 8.697771233082744e-06,
"loss": 42.4424,
"step": 1282
},
{
"epoch": 4.396263677608753,
"grad_norm": 3.6488263607025146,
"learning_rate": 8.628481651367876e-06,
"loss": 43.8516,
"step": 1284
},
{
"epoch": 4.403095809981318,
"grad_norm": 3.2983975410461426,
"learning_rate": 8.55941155909968e-06,
"loss": 43.3322,
"step": 1286
},
{
"epoch": 4.4099279423538835,
"grad_norm": 3.5116684436798096,
"learning_rate": 8.490561882286136e-06,
"loss": 41.4651,
"step": 1288
},
{
"epoch": 4.416760074726448,
"grad_norm": 3.5123932361602783,
"learning_rate": 8.421933543980126e-06,
"loss": 43.1034,
"step": 1290
},
{
"epoch": 4.423592207099013,
"grad_norm": 4.123583793640137,
"learning_rate": 8.353527464267104e-06,
"loss": 43.566,
"step": 1292
},
{
"epoch": 4.430424339471577,
"grad_norm": 3.6427931785583496,
"learning_rate": 8.285344560252777e-06,
"loss": 42.0333,
"step": 1294
},
{
"epoch": 4.437256471844142,
"grad_norm": 3.8917388916015625,
"learning_rate": 8.217385746050742e-06,
"loss": 42.0382,
"step": 1296
},
{
"epoch": 4.444088604216707,
"grad_norm": 4.964122772216797,
"learning_rate": 8.149651932770308e-06,
"loss": 43.6584,
"step": 1298
},
{
"epoch": 4.450920736589271,
"grad_norm": 4.227240085601807,
"learning_rate": 8.082144028504233e-06,
"loss": 42.4086,
"step": 1300
},
{
"epoch": 4.450920736589271,
"eval_loss": 0.6897044777870178,
"eval_runtime": 131.8148,
"eval_samples_per_second": 29.928,
"eval_steps_per_second": 7.488,
"step": 1300
},
{
"epoch": 4.457752868961836,
"grad_norm": 4.605757713317871,
"learning_rate": 8.014862938316542e-06,
"loss": 43.7962,
"step": 1302
},
{
"epoch": 4.464585001334401,
"grad_norm": 4.2398176193237305,
"learning_rate": 7.947809564230445e-06,
"loss": 42.3544,
"step": 1304
},
{
"epoch": 4.471417133706965,
"grad_norm": 5.234216213226318,
"learning_rate": 7.880984805216185e-06,
"loss": 41.9833,
"step": 1306
},
{
"epoch": 4.47824926607953,
"grad_norm": 3.9220240116119385,
"learning_rate": 7.814389557179017e-06,
"loss": 42.0345,
"step": 1308
},
{
"epoch": 4.485081398452095,
"grad_norm": 5.44996976852417,
"learning_rate": 7.748024712947205e-06,
"loss": 42.0309,
"step": 1310
},
{
"epoch": 4.49191353082466,
"grad_norm": 5.07472038269043,
"learning_rate": 7.681891162260015e-06,
"loss": 42.6996,
"step": 1312
},
{
"epoch": 4.498745663197225,
"grad_norm": 3.818120241165161,
"learning_rate": 7.615989791755834e-06,
"loss": 42.8775,
"step": 1314
},
{
"epoch": 4.505577795569789,
"grad_norm": 4.252802848815918,
"learning_rate": 7.5503214849602516e-06,
"loss": 42.4118,
"step": 1316
},
{
"epoch": 4.512409927942354,
"grad_norm": 4.17697286605835,
"learning_rate": 7.484887122274215e-06,
"loss": 41.2153,
"step": 1318
},
{
"epoch": 4.519242060314919,
"grad_norm": 3.7324466705322266,
"learning_rate": 7.419687580962223e-06,
"loss": 42.3343,
"step": 1320
},
{
"epoch": 4.526074192687483,
"grad_norm": 3.870089054107666,
"learning_rate": 7.354723735140609e-06,
"loss": 42.0028,
"step": 1322
},
{
"epoch": 4.532906325060048,
"grad_norm": 3.6424801349639893,
"learning_rate": 7.289996455765749e-06,
"loss": 43.5842,
"step": 1324
},
{
"epoch": 4.5397384574326125,
"grad_norm": 4.695961952209473,
"learning_rate": 7.225506610622456e-06,
"loss": 42.0951,
"step": 1326
},
{
"epoch": 4.546570589805177,
"grad_norm": 4.842666149139404,
"learning_rate": 7.161255064312283e-06,
"loss": 43.8668,
"step": 1328
},
{
"epoch": 4.553402722177742,
"grad_norm": 4.4085822105407715,
"learning_rate": 7.0972426782419884e-06,
"loss": 43.7836,
"step": 1330
},
{
"epoch": 4.560234854550307,
"grad_norm": 3.606607437133789,
"learning_rate": 7.033470310611945e-06,
"loss": 41.4304,
"step": 1332
},
{
"epoch": 4.567066986922872,
"grad_norm": 4.789222717285156,
"learning_rate": 6.969938816404639e-06,
"loss": 41.6355,
"step": 1334
},
{
"epoch": 4.573899119295437,
"grad_norm": 4.463109493255615,
"learning_rate": 6.906649047373246e-06,
"loss": 43.4969,
"step": 1336
},
{
"epoch": 4.580731251668001,
"grad_norm": 4.483322620391846,
"learning_rate": 6.843601852030171e-06,
"loss": 42.4094,
"step": 1338
},
{
"epoch": 4.587563384040566,
"grad_norm": 4.021024703979492,
"learning_rate": 6.780798075635675e-06,
"loss": 42.2893,
"step": 1340
},
{
"epoch": 4.5943955164131305,
"grad_norm": 3.9479868412017822,
"learning_rate": 6.718238560186571e-06,
"loss": 40.8073,
"step": 1342
},
{
"epoch": 4.601227648785695,
"grad_norm": 4.778145790100098,
"learning_rate": 6.655924144404907e-06,
"loss": 42.0845,
"step": 1344
},
{
"epoch": 4.60805978115826,
"grad_norm": 3.555271863937378,
"learning_rate": 6.593855663726722e-06,
"loss": 41.1015,
"step": 1346
},
{
"epoch": 4.614891913530824,
"grad_norm": 4.007204532623291,
"learning_rate": 6.532033950290886e-06,
"loss": 42.9137,
"step": 1348
},
{
"epoch": 4.621724045903389,
"grad_norm": 4.328546524047852,
"learning_rate": 6.470459832927881e-06,
"loss": 41.274,
"step": 1350
},
{
"epoch": 4.621724045903389,
"eval_loss": 0.6830974221229553,
"eval_runtime": 135.2812,
"eval_samples_per_second": 29.161,
"eval_steps_per_second": 7.296,
"step": 1350
},
{
"epoch": 4.628556178275954,
"grad_norm": 4.948083877563477,
"learning_rate": 6.409134137148737e-06,
"loss": 43.0462,
"step": 1352
},
{
"epoch": 4.635388310648519,
"grad_norm": 4.637773036956787,
"learning_rate": 6.3480576851339625e-06,
"loss": 42.6268,
"step": 1354
},
{
"epoch": 4.642220443021084,
"grad_norm": 3.72841215133667,
"learning_rate": 6.28723129572247e-06,
"loss": 41.0574,
"step": 1356
},
{
"epoch": 4.6490525753936485,
"grad_norm": 4.539714813232422,
"learning_rate": 6.226655784400684e-06,
"loss": 43.5752,
"step": 1358
},
{
"epoch": 4.655884707766213,
"grad_norm": 5.519583225250244,
"learning_rate": 6.166331963291519e-06,
"loss": 43.3111,
"step": 1360
},
{
"epoch": 4.662716840138778,
"grad_norm": 4.942199230194092,
"learning_rate": 6.106260641143546e-06,
"loss": 43.6514,
"step": 1362
},
{
"epoch": 4.669548972511342,
"grad_norm": 5.164299011230469,
"learning_rate": 6.046442623320145e-06,
"loss": 40.8611,
"step": 1364
},
{
"epoch": 4.676381104883907,
"grad_norm": 4.309698581695557,
"learning_rate": 5.986878711788702e-06,
"loss": 41.3937,
"step": 1366
},
{
"epoch": 4.683213237256472,
"grad_norm": 4.105101585388184,
"learning_rate": 5.927569705109828e-06,
"loss": 40.3001,
"step": 1368
},
{
"epoch": 4.690045369629036,
"grad_norm": 3.571514368057251,
"learning_rate": 5.868516398426716e-06,
"loss": 41.6858,
"step": 1370
},
{
"epoch": 4.696877502001601,
"grad_norm": 5.120858192443848,
"learning_rate": 5.809719583454415e-06,
"loss": 41.4156,
"step": 1372
},
{
"epoch": 4.703709634374166,
"grad_norm": 4.679799556732178,
"learning_rate": 5.751180048469243e-06,
"loss": 43.1858,
"step": 1374
},
{
"epoch": 4.710541766746731,
"grad_norm": 3.0465521812438965,
"learning_rate": 5.692898578298253e-06,
"loss": 41.213,
"step": 1376
},
{
"epoch": 4.717373899119296,
"grad_norm": 4.835347652435303,
"learning_rate": 5.634875954308638e-06,
"loss": 44.0938,
"step": 1378
},
{
"epoch": 4.72420603149186,
"grad_norm": 6.645193099975586,
"learning_rate": 5.577112954397321e-06,
"loss": 41.7528,
"step": 1380
},
{
"epoch": 4.731038163864425,
"grad_norm": 4.592052936553955,
"learning_rate": 5.519610352980501e-06,
"loss": 42.566,
"step": 1382
},
{
"epoch": 4.73787029623699,
"grad_norm": 3.7620317935943604,
"learning_rate": 5.462368920983249e-06,
"loss": 41.7184,
"step": 1384
},
{
"epoch": 4.744702428609554,
"grad_norm": 4.0445027351379395,
"learning_rate": 5.405389425829219e-06,
"loss": 41.6249,
"step": 1386
},
{
"epoch": 4.751534560982119,
"grad_norm": 3.744433641433716,
"learning_rate": 5.348672631430318e-06,
"loss": 43.0626,
"step": 1388
},
{
"epoch": 4.7583666933546835,
"grad_norm": 3.12141489982605,
"learning_rate": 5.292219298176476e-06,
"loss": 42.1533,
"step": 1390
},
{
"epoch": 4.765198825727248,
"grad_norm": 6.73304557800293,
"learning_rate": 5.236030182925475e-06,
"loss": 41.6015,
"step": 1392
},
{
"epoch": 4.772030958099813,
"grad_norm": 4.076465129852295,
"learning_rate": 5.1801060389927606e-06,
"loss": 43.2645,
"step": 1394
},
{
"epoch": 4.7788630904723775,
"grad_norm": 4.178272247314453,
"learning_rate": 5.124447616141381e-06,
"loss": 43.0354,
"step": 1396
},
{
"epoch": 4.785695222844943,
"grad_norm": 4.555927276611328,
"learning_rate": 5.06905566057192e-06,
"loss": 42.1086,
"step": 1398
},
{
"epoch": 4.792527355217508,
"grad_norm": 4.799075126647949,
"learning_rate": 5.013930914912476e-06,
"loss": 40.7555,
"step": 1400
},
{
"epoch": 4.792527355217508,
"eval_loss": 0.6814665198326111,
"eval_runtime": 134.9461,
"eval_samples_per_second": 29.234,
"eval_steps_per_second": 7.314,
"step": 1400
},
{
"epoch": 4.799359487590072,
"grad_norm": 3.7408673763275146,
"learning_rate": 4.959074118208726e-06,
"loss": 40.9295,
"step": 1402
},
{
"epoch": 4.806191619962637,
"grad_norm": 3.9520747661590576,
"learning_rate": 4.9044860059140275e-06,
"loss": 43.4186,
"step": 1404
},
{
"epoch": 4.8130237523352015,
"grad_norm": 4.115049839019775,
"learning_rate": 4.850167309879519e-06,
"loss": 42.2491,
"step": 1406
},
{
"epoch": 4.819855884707766,
"grad_norm": 5.181631088256836,
"learning_rate": 4.796118758344354e-06,
"loss": 41.583,
"step": 1408
},
{
"epoch": 4.826688017080331,
"grad_norm": 3.838186740875244,
"learning_rate": 4.742341075925916e-06,
"loss": 43.3278,
"step": 1410
},
{
"epoch": 4.833520149452895,
"grad_norm": 3.6494245529174805,
"learning_rate": 4.6888349836100825e-06,
"loss": 41.3961,
"step": 1412
},
{
"epoch": 4.84035228182546,
"grad_norm": 4.139842510223389,
"learning_rate": 4.6356011987416075e-06,
"loss": 43.4135,
"step": 1414
},
{
"epoch": 4.847184414198025,
"grad_norm": 4.385437965393066,
"learning_rate": 4.58264043501446e-06,
"loss": 42.1478,
"step": 1416
},
{
"epoch": 4.854016546570589,
"grad_norm": 3.691343307495117,
"learning_rate": 4.52995340246227e-06,
"loss": 42.4175,
"step": 1418
},
{
"epoch": 4.860848678943155,
"grad_norm": 4.149899482727051,
"learning_rate": 4.477540807448832e-06,
"loss": 42.4116,
"step": 1420
},
{
"epoch": 4.8676808113157195,
"grad_norm": 3.8960561752319336,
"learning_rate": 4.425403352658591e-06,
"loss": 41.2306,
"step": 1422
},
{
"epoch": 4.874512943688284,
"grad_norm": 3.6276168823242188,
"learning_rate": 4.373541737087264e-06,
"loss": 42.7317,
"step": 1424
},
{
"epoch": 4.881345076060849,
"grad_norm": 4.214303016662598,
"learning_rate": 4.32195665603245e-06,
"loss": 41.6166,
"step": 1426
},
{
"epoch": 4.888177208433413,
"grad_norm": 4.3136210441589355,
"learning_rate": 4.270648801084296e-06,
"loss": 42.3309,
"step": 1428
},
{
"epoch": 4.895009340805978,
"grad_norm": 5.340824604034424,
"learning_rate": 4.219618860116242e-06,
"loss": 40.6249,
"step": 1430
},
{
"epoch": 4.901841473178543,
"grad_norm": 3.750943183898926,
"learning_rate": 4.1688675172758064e-06,
"loss": 42.0754,
"step": 1432
},
{
"epoch": 4.908673605551107,
"grad_norm": 3.8021140098571777,
"learning_rate": 4.118395452975382e-06,
"loss": 42.8221,
"step": 1434
},
{
"epoch": 4.915505737923672,
"grad_norm": 5.09911584854126,
"learning_rate": 4.068203343883159e-06,
"loss": 42.3164,
"step": 1436
},
{
"epoch": 4.9223378702962375,
"grad_norm": 3.590981960296631,
"learning_rate": 4.018291862914001e-06,
"loss": 41.0773,
"step": 1438
},
{
"epoch": 4.929170002668801,
"grad_norm": 4.474262714385986,
"learning_rate": 3.968661679220468e-06,
"loss": 41.1827,
"step": 1440
},
{
"epoch": 4.936002135041367,
"grad_norm": 3.780853748321533,
"learning_rate": 3.919313458183838e-06,
"loss": 41.9009,
"step": 1442
},
{
"epoch": 4.942834267413931,
"grad_norm": 4.165524482727051,
"learning_rate": 3.8702478614051355e-06,
"loss": 41.6988,
"step": 1444
},
{
"epoch": 4.949666399786496,
"grad_norm": 4.537020683288574,
"learning_rate": 3.821465546696337e-06,
"loss": 42.6527,
"step": 1446
},
{
"epoch": 4.956498532159061,
"grad_norm": 5.992898941040039,
"learning_rate": 3.772967168071517e-06,
"loss": 42.3257,
"step": 1448
},
{
"epoch": 4.963330664531625,
"grad_norm": 5.681396007537842,
"learning_rate": 3.7247533757380603e-06,
"loss": 42.5366,
"step": 1450
},
{
"epoch": 4.963330664531625,
"eval_loss": 0.6770752668380737,
"eval_runtime": 133.8871,
"eval_samples_per_second": 29.465,
"eval_steps_per_second": 7.372,
"step": 1450
},
{
"epoch": 4.97016279690419,
"grad_norm": 4.46541166305542,
"learning_rate": 3.6768248160879787e-06,
"loss": 41.0476,
"step": 1452
},
{
"epoch": 4.976994929276755,
"grad_norm": 4.15000057220459,
"learning_rate": 3.6291821316892184e-06,
"loss": 40.7134,
"step": 1454
},
{
"epoch": 4.983827061649319,
"grad_norm": 4.230960369110107,
"learning_rate": 3.5818259612770744e-06,
"loss": 43.5967,
"step": 1456
},
{
"epoch": 4.990659194021884,
"grad_norm": 4.932849884033203,
"learning_rate": 3.53475693974559e-06,
"loss": 43.2516,
"step": 1458
},
{
"epoch": 4.997491326394449,
"grad_norm": 4.316704273223877,
"learning_rate": 3.487975698139084e-06,
"loss": 42.3811,
"step": 1460
},
{
"epoch": 5.003416066186283,
"grad_norm": 4.146729469299316,
"learning_rate": 3.4414828636436525e-06,
"loss": 36.1288,
"step": 1462
},
{
"epoch": 5.010248198558847,
"grad_norm": 5.610274791717529,
"learning_rate": 3.3952790595787987e-06,
"loss": 40.6556,
"step": 1464
},
{
"epoch": 5.017080330931412,
"grad_norm": 6.292807102203369,
"learning_rate": 3.3493649053890326e-06,
"loss": 42.2675,
"step": 1466
},
{
"epoch": 5.023912463303977,
"grad_norm": 4.371929168701172,
"learning_rate": 3.3037410166356143e-06,
"loss": 41.1544,
"step": 1468
},
{
"epoch": 5.030744595676541,
"grad_norm": 3.275562047958374,
"learning_rate": 3.258408004988278e-06,
"loss": 42.7401,
"step": 1470
},
{
"epoch": 5.037576728049106,
"grad_norm": 5.2857666015625,
"learning_rate": 3.2133664782169948e-06,
"loss": 39.4961,
"step": 1472
},
{
"epoch": 5.044408860421671,
"grad_norm": 3.9162814617156982,
"learning_rate": 3.168617040183897e-06,
"loss": 42.7691,
"step": 1474
},
{
"epoch": 5.051240992794235,
"grad_norm": 4.741237640380859,
"learning_rate": 3.1241602908351404e-06,
"loss": 39.9539,
"step": 1476
},
{
"epoch": 5.0580731251668,
"grad_norm": 4.904325008392334,
"learning_rate": 3.079996826192849e-06,
"loss": 40.999,
"step": 1478
},
{
"epoch": 5.0649052575393645,
"grad_norm": 3.9396679401397705,
"learning_rate": 3.036127238347164e-06,
"loss": 41.8233,
"step": 1480
},
{
"epoch": 5.071737389911929,
"grad_norm": 3.5699760913848877,
"learning_rate": 2.992552115448258e-06,
"loss": 41.4895,
"step": 1482
},
{
"epoch": 5.078569522284495,
"grad_norm": 4.227250099182129,
"learning_rate": 2.9492720416985e-06,
"loss": 41.7825,
"step": 1484
},
{
"epoch": 5.085401654657059,
"grad_norm": 3.8788514137268066,
"learning_rate": 2.9062875973445813e-06,
"loss": 41.4301,
"step": 1486
},
{
"epoch": 5.092233787029624,
"grad_norm": 3.7242729663848877,
"learning_rate": 2.8635993586697553e-06,
"loss": 40.2917,
"step": 1488
},
{
"epoch": 5.099065919402189,
"grad_norm": 5.645269870758057,
"learning_rate": 2.821207897986114e-06,
"loss": 41.1435,
"step": 1490
},
{
"epoch": 5.105898051774753,
"grad_norm": 3.9231839179992676,
"learning_rate": 2.779113783626916e-06,
"loss": 41.5506,
"step": 1492
},
{
"epoch": 5.112730184147318,
"grad_norm": 4.276205062866211,
"learning_rate": 2.7373175799389415e-06,
"loss": 40.4141,
"step": 1494
},
{
"epoch": 5.1195623165198825,
"grad_norm": 6.223433971405029,
"learning_rate": 2.6958198472749717e-06,
"loss": 42.1149,
"step": 1496
},
{
"epoch": 5.126394448892447,
"grad_norm": 4.167882442474365,
"learning_rate": 2.65462114198623e-06,
"loss": 40.7711,
"step": 1498
},
{
"epoch": 5.133226581265012,
"grad_norm": 3.588376998901367,
"learning_rate": 2.6137220164149435e-06,
"loss": 42.5513,
"step": 1500
},
{
"epoch": 5.133226581265012,
"eval_loss": 0.6761642694473267,
"eval_runtime": 137.9512,
"eval_samples_per_second": 28.597,
"eval_steps_per_second": 7.155,
"step": 1500
},
{
"epoch": 5.140058713637576,
"grad_norm": 4.149092674255371,
"learning_rate": 2.573123018886961e-06,
"loss": 40.5633,
"step": 1502
},
{
"epoch": 5.146890846010141,
"grad_norm": 3.9322760105133057,
"learning_rate": 2.5328246937043526e-06,
"loss": 41.3711,
"step": 1504
},
{
"epoch": 5.1537229783827065,
"grad_norm": 4.557422161102295,
"learning_rate": 2.492827581138149e-06,
"loss": 39.5696,
"step": 1506
},
{
"epoch": 5.160555110755271,
"grad_norm": 3.772927761077881,
"learning_rate": 2.4531322174210975e-06,
"loss": 42.9544,
"step": 1508
},
{
"epoch": 5.167387243127836,
"grad_norm": 4.051291465759277,
"learning_rate": 2.4137391347404476e-06,
"loss": 40.978,
"step": 1510
},
{
"epoch": 5.1742193755004005,
"grad_norm": 3.6557424068450928,
"learning_rate": 2.37464886123083e-06,
"loss": 41.606,
"step": 1512
},
{
"epoch": 5.181051507872965,
"grad_norm": 4.801413536071777,
"learning_rate": 2.3358619209672e-06,
"loss": 41.5917,
"step": 1514
},
{
"epoch": 5.18788364024553,
"grad_norm": 4.2001423835754395,
"learning_rate": 2.2973788339577613e-06,
"loss": 43.0596,
"step": 1516
},
{
"epoch": 5.194715772618094,
"grad_norm": 5.291867256164551,
"learning_rate": 2.2592001161370392e-06,
"loss": 40.3588,
"step": 1518
},
{
"epoch": 5.201547904990659,
"grad_norm": 3.7930984497070312,
"learning_rate": 2.2213262793589484e-06,
"loss": 42.0758,
"step": 1520
},
{
"epoch": 5.208380037363224,
"grad_norm": 4.888052940368652,
"learning_rate": 2.1837578313899098e-06,
"loss": 39.7415,
"step": 1522
},
{
"epoch": 5.215212169735788,
"grad_norm": 4.963688850402832,
"learning_rate": 2.1464952759020855e-06,
"loss": 42.05,
"step": 1524
},
{
"epoch": 5.222044302108353,
"grad_norm": 4.556923866271973,
"learning_rate": 2.109539112466588e-06,
"loss": 40.5828,
"step": 1526
},
{
"epoch": 5.228876434480918,
"grad_norm": 3.550285577774048,
"learning_rate": 2.0728898365467903e-06,
"loss": 41.4201,
"step": 1528
},
{
"epoch": 5.235708566853483,
"grad_norm": 4.290851593017578,
"learning_rate": 2.0365479394917147e-06,
"loss": 41.1988,
"step": 1530
},
{
"epoch": 5.242540699226048,
"grad_norm": 4.436618804931641,
"learning_rate": 2.0005139085293945e-06,
"loss": 41.1016,
"step": 1532
},
{
"epoch": 5.249372831598612,
"grad_norm": 6.221188068389893,
"learning_rate": 1.9647882267603862e-06,
"loss": 42.1538,
"step": 1534
},
{
"epoch": 5.256204963971177,
"grad_norm": 4.712629795074463,
"learning_rate": 1.9293713731512673e-06,
"loss": 41.1176,
"step": 1536
},
{
"epoch": 5.263037096343742,
"grad_norm": 4.693170070648193,
"learning_rate": 1.894263822528225e-06,
"loss": 41.3687,
"step": 1538
},
{
"epoch": 5.269869228716306,
"grad_norm": 4.854535102844238,
"learning_rate": 1.8594660455706763e-06,
"loss": 41.6856,
"step": 1540
},
{
"epoch": 5.276701361088871,
"grad_norm": 3.5167202949523926,
"learning_rate": 1.8249785088049893e-06,
"loss": 42.5848,
"step": 1542
},
{
"epoch": 5.2835334934614355,
"grad_norm": 4.029543399810791,
"learning_rate": 1.790801674598186e-06,
"loss": 41.8932,
"step": 1544
},
{
"epoch": 5.290365625834,
"grad_norm": 4.217826843261719,
"learning_rate": 1.7569360011517848e-06,
"loss": 41.478,
"step": 1546
},
{
"epoch": 5.297197758206565,
"grad_norm": 3.8237998485565186,
"learning_rate": 1.7233819424956248e-06,
"loss": 42.5394,
"step": 1548
},
{
"epoch": 5.30402989057913,
"grad_norm": 5.044140338897705,
"learning_rate": 1.6901399484818004e-06,
"loss": 41.0466,
"step": 1550
},
{
"epoch": 5.30402989057913,
"eval_loss": 0.6723917722702026,
"eval_runtime": 132.3674,
"eval_samples_per_second": 29.803,
"eval_steps_per_second": 7.457,
"step": 1550
},
{
"epoch": 5.310862022951695,
"grad_norm": 4.023882865905762,
"learning_rate": 1.6572104647786247e-06,
"loss": 40.4515,
"step": 1552
},
{
"epoch": 5.31769415532426,
"grad_norm": 5.667575836181641,
"learning_rate": 1.624593932864632e-06,
"loss": 42.2196,
"step": 1554
},
{
"epoch": 5.324526287696824,
"grad_norm": 3.771815299987793,
"learning_rate": 1.5922907900227018e-06,
"loss": 41.1018,
"step": 1556
},
{
"epoch": 5.331358420069389,
"grad_norm": 4.044847011566162,
"learning_rate": 1.5603014693341662e-06,
"loss": 40.8528,
"step": 1558
},
{
"epoch": 5.3381905524419535,
"grad_norm": 4.64625358581543,
"learning_rate": 1.5286263996730026e-06,
"loss": 41.612,
"step": 1560
},
{
"epoch": 5.345022684814518,
"grad_norm": 5.102336406707764,
"learning_rate": 1.497266005700107e-06,
"loss": 40.965,
"step": 1562
},
{
"epoch": 5.351854817187083,
"grad_norm": 3.1535797119140625,
"learning_rate": 1.4662207078575684e-06,
"loss": 40.5264,
"step": 1564
},
{
"epoch": 5.358686949559647,
"grad_norm": 3.740694522857666,
"learning_rate": 1.4354909223630669e-06,
"loss": 41.5863,
"step": 1566
},
{
"epoch": 5.365519081932212,
"grad_norm": 4.79527473449707,
"learning_rate": 1.40507706120426e-06,
"loss": 41.3632,
"step": 1568
},
{
"epoch": 5.372351214304777,
"grad_norm": 4.936699867248535,
"learning_rate": 1.3749795321332887e-06,
"loss": 41.898,
"step": 1570
},
{
"epoch": 5.379183346677342,
"grad_norm": 6.228104114532471,
"learning_rate": 1.3451987386612851e-06,
"loss": 41.3327,
"step": 1572
},
{
"epoch": 5.386015479049907,
"grad_norm": 3.9607808589935303,
"learning_rate": 1.3157350800529878e-06,
"loss": 39.3806,
"step": 1574
},
{
"epoch": 5.3928476114224715,
"grad_norm": 3.2485790252685547,
"learning_rate": 1.286588951321363e-06,
"loss": 39.292,
"step": 1576
},
{
"epoch": 5.399679743795036,
"grad_norm": 4.702234745025635,
"learning_rate": 1.2577607432223276e-06,
"loss": 40.3127,
"step": 1578
},
{
"epoch": 5.406511876167601,
"grad_norm": 4.465649127960205,
"learning_rate": 1.2292508422495158e-06,
"loss": 41.7889,
"step": 1580
},
{
"epoch": 5.413344008540165,
"grad_norm": 4.618641376495361,
"learning_rate": 1.2010596306290589e-06,
"loss": 41.2257,
"step": 1582
},
{
"epoch": 5.42017614091273,
"grad_norm": 4.093713283538818,
"learning_rate": 1.1731874863145143e-06,
"loss": 41.7067,
"step": 1584
},
{
"epoch": 5.427008273285295,
"grad_norm": 5.642305374145508,
"learning_rate": 1.145634782981761e-06,
"loss": 41.1947,
"step": 1586
},
{
"epoch": 5.433840405657859,
"grad_norm": 3.9637906551361084,
"learning_rate": 1.1184018900240011e-06,
"loss": 41.5425,
"step": 1588
},
{
"epoch": 5.440672538030424,
"grad_norm": 4.328593730926514,
"learning_rate": 1.0914891725468141e-06,
"loss": 41.7915,
"step": 1590
},
{
"epoch": 5.4475046704029895,
"grad_norm": 4.559619903564453,
"learning_rate": 1.06489699136324e-06,
"loss": 39.5462,
"step": 1592
},
{
"epoch": 5.454336802775554,
"grad_norm": 4.174973011016846,
"learning_rate": 1.0386257029889768e-06,
"loss": 40.6458,
"step": 1594
},
{
"epoch": 5.461168935148119,
"grad_norm": 3.249431610107422,
"learning_rate": 1.0126756596375686e-06,
"loss": 41.4128,
"step": 1596
},
{
"epoch": 5.468001067520683,
"grad_norm": 4.598479747772217,
"learning_rate": 9.87047209215694e-07,
"loss": 41.7854,
"step": 1598
},
{
"epoch": 5.474833199893248,
"grad_norm": 3.558709144592285,
"learning_rate": 9.617406953185138e-07,
"loss": 41.9632,
"step": 1600
},
{
"epoch": 5.474833199893248,
"eval_loss": 0.6698766350746155,
"eval_runtime": 133.9539,
"eval_samples_per_second": 29.45,
"eval_steps_per_second": 7.368,
"step": 1600
},
{
"epoch": 5.481665332265813,
"grad_norm": 5.397751331329346,
"learning_rate": 9.36756457225052e-07,
"loss": 40.2635,
"step": 1602
},
{
"epoch": 5.488497464638377,
"grad_norm": 5.443418502807617,
"learning_rate": 9.120948298936421e-07,
"loss": 40.6923,
"step": 1604
},
{
"epoch": 5.495329597010942,
"grad_norm": 3.991673707962036,
"learning_rate": 8.87756143957455e-07,
"loss": 40.0543,
"step": 1606
},
{
"epoch": 5.502161729383507,
"grad_norm": 4.649523735046387,
"learning_rate": 8.637407257200497e-07,
"loss": 41.3534,
"step": 1608
},
{
"epoch": 5.508993861756071,
"grad_norm": 4.675793170928955,
"learning_rate": 8.400488971509968e-07,
"loss": 39.8315,
"step": 1610
},
{
"epoch": 5.515825994128637,
"grad_norm": 3.273359775543213,
"learning_rate": 8.166809758815896e-07,
"loss": 39.9979,
"step": 1612
},
{
"epoch": 5.5226581265012005,
"grad_norm": 4.165469169616699,
"learning_rate": 7.936372752005399e-07,
"loss": 39.3362,
"step": 1614
},
{
"epoch": 5.529490258873766,
"grad_norm": 4.015806674957275,
"learning_rate": 7.709181040498254e-07,
"loss": 40.7772,
"step": 1616
},
{
"epoch": 5.536322391246331,
"grad_norm": 6.13747501373291,
"learning_rate": 7.485237670205175e-07,
"loss": 40.8463,
"step": 1618
},
{
"epoch": 5.543154523618895,
"grad_norm": 3.6014761924743652,
"learning_rate": 7.264545643486997e-07,
"loss": 40.231,
"step": 1620
},
{
"epoch": 5.54998665599146,
"grad_norm": 4.055222034454346,
"learning_rate": 7.047107919114588e-07,
"loss": 42.5435,
"step": 1622
},
{
"epoch": 5.5568187883640245,
"grad_norm": 5.444411277770996,
"learning_rate": 6.832927412229018e-07,
"loss": 41.0914,
"step": 1624
},
{
"epoch": 5.563650920736589,
"grad_norm": 3.4832520484924316,
"learning_rate": 6.622006994302543e-07,
"loss": 42.297,
"step": 1626
},
{
"epoch": 5.570483053109154,
"grad_norm": 5.123753547668457,
"learning_rate": 6.41434949310013e-07,
"loss": 40.4283,
"step": 1628
},
{
"epoch": 5.5773151854817185,
"grad_norm": 5.2065277099609375,
"learning_rate": 6.209957692641544e-07,
"loss": 40.5581,
"step": 1630
},
{
"epoch": 5.584147317854283,
"grad_norm": 4.573667049407959,
"learning_rate": 6.008834333163876e-07,
"loss": 39.4126,
"step": 1632
},
{
"epoch": 5.590979450226849,
"grad_norm": 5.208593368530273,
"learning_rate": 5.810982111085106e-07,
"loss": 40.7202,
"step": 1634
},
{
"epoch": 5.597811582599413,
"grad_norm": 4.341737747192383,
"learning_rate": 5.616403678967624e-07,
"loss": 40.9683,
"step": 1636
},
{
"epoch": 5.604643714971978,
"grad_norm": 4.836015701293945,
"learning_rate": 5.42510164548285e-07,
"loss": 40.4273,
"step": 1638
},
{
"epoch": 5.6114758473445425,
"grad_norm": 4.308472633361816,
"learning_rate": 5.237078575376336e-07,
"loss": 41.0492,
"step": 1640
},
{
"epoch": 5.618307979717107,
"grad_norm": 4.316090106964111,
"learning_rate": 5.052336989433082e-07,
"loss": 40.6806,
"step": 1642
},
{
"epoch": 5.625140112089672,
"grad_norm": 3.6825830936431885,
"learning_rate": 4.870879364444109e-07,
"loss": 40.5467,
"step": 1644
},
{
"epoch": 5.631972244462236,
"grad_norm": 5.199794769287109,
"learning_rate": 4.692708133172991e-07,
"loss": 39.4587,
"step": 1646
},
{
"epoch": 5.638804376834801,
"grad_norm": 3.3388471603393555,
"learning_rate": 4.517825684323324e-07,
"loss": 39.1098,
"step": 1648
},
{
"epoch": 5.645636509207366,
"grad_norm": 4.200729846954346,
"learning_rate": 4.346234362506724e-07,
"loss": 40.122,
"step": 1650
},
{
"epoch": 5.645636509207366,
"eval_loss": 0.6662212014198303,
"eval_runtime": 137.6293,
"eval_samples_per_second": 28.664,
"eval_steps_per_second": 7.171,
"step": 1650
},
{
"epoch": 5.65246864157993,
"grad_norm": 3.9246127605438232,
"learning_rate": 4.1779364682113796e-07,
"loss": 40.0725,
"step": 1652
},
{
"epoch": 5.659300773952495,
"grad_norm": 4.904084205627441,
"learning_rate": 4.012934257771134e-07,
"loss": 40.0188,
"step": 1654
},
{
"epoch": 5.6661329063250605,
"grad_norm": 4.436688423156738,
"learning_rate": 3.851229943335394e-07,
"loss": 39.9216,
"step": 1656
},
{
"epoch": 5.672965038697625,
"grad_norm": 4.027088642120361,
"learning_rate": 3.6928256928393247e-07,
"loss": 41.4124,
"step": 1658
},
{
"epoch": 5.67979717107019,
"grad_norm": 3.796221971511841,
"learning_rate": 3.537723629974815e-07,
"loss": 39.8851,
"step": 1660
},
{
"epoch": 5.686629303442754,
"grad_norm": 4.7540130615234375,
"learning_rate": 3.3859258341621125e-07,
"loss": 40.1716,
"step": 1662
},
{
"epoch": 5.693461435815319,
"grad_norm": 4.521333694458008,
"learning_rate": 3.237434340521789e-07,
"loss": 41.4182,
"step": 1664
},
{
"epoch": 5.700293568187884,
"grad_norm": 4.776477336883545,
"learning_rate": 3.0922511398475683e-07,
"loss": 41.2698,
"step": 1666
},
{
"epoch": 5.707125700560448,
"grad_norm": 4.749114990234375,
"learning_rate": 2.9503781785795713e-07,
"loss": 42.4175,
"step": 1668
},
{
"epoch": 5.713957832933013,
"grad_norm": 4.831925392150879,
"learning_rate": 2.8118173587782516e-07,
"loss": 40.593,
"step": 1670
},
{
"epoch": 5.720789965305578,
"grad_norm": 4.17523193359375,
"learning_rate": 2.6765705380989437e-07,
"loss": 39.8755,
"step": 1672
},
{
"epoch": 5.727622097678142,
"grad_norm": 4.183824062347412,
"learning_rate": 2.544639529766829e-07,
"loss": 40.7682,
"step": 1674
},
{
"epoch": 5.734454230050707,
"grad_norm": 4.203549385070801,
"learning_rate": 2.416026102552732e-07,
"loss": 40.1932,
"step": 1676
},
{
"epoch": 5.741286362423272,
"grad_norm": 4.252909183502197,
"learning_rate": 2.290731980749361e-07,
"loss": 41.4024,
"step": 1678
},
{
"epoch": 5.748118494795837,
"grad_norm": 4.110680103302002,
"learning_rate": 2.168758844148272e-07,
"loss": 40.8089,
"step": 1680
},
{
"epoch": 5.754950627168402,
"grad_norm": 4.860687732696533,
"learning_rate": 2.050108328017164e-07,
"loss": 41.278,
"step": 1682
},
{
"epoch": 5.761782759540966,
"grad_norm": 7.037466526031494,
"learning_rate": 1.93478202307823e-07,
"loss": 42.0162,
"step": 1684
},
{
"epoch": 5.768614891913531,
"grad_norm": 4.048498630523682,
"learning_rate": 1.8227814754865068e-07,
"loss": 41.2187,
"step": 1686
},
{
"epoch": 5.775447024286096,
"grad_norm": 3.721379518508911,
"learning_rate": 1.7141081868094212e-07,
"loss": 41.8383,
"step": 1688
},
{
"epoch": 5.78227915665866,
"grad_norm": 6.793107509613037,
"learning_rate": 1.6087636140065532e-07,
"loss": 40.5894,
"step": 1690
},
{
"epoch": 5.789111289031225,
"grad_norm": 4.424513339996338,
"learning_rate": 1.5067491694100154e-07,
"loss": 41.2666,
"step": 1692
},
{
"epoch": 5.7959434214037895,
"grad_norm": 4.707203388214111,
"learning_rate": 1.4080662207056894e-07,
"loss": 41.2405,
"step": 1694
},
{
"epoch": 5.802775553776354,
"grad_norm": 2.994469165802002,
"learning_rate": 1.3127160909147672e-07,
"loss": 42.6466,
"step": 1696
},
{
"epoch": 5.809607686148919,
"grad_norm": 3.029481887817383,
"learning_rate": 1.220700058376073e-07,
"loss": 40.642,
"step": 1698
},
{
"epoch": 5.816439818521484,
"grad_norm": 3.4690332412719727,
"learning_rate": 1.1320193567288529e-07,
"loss": 41.02,
"step": 1700
},
{
"epoch": 5.816439818521484,
"eval_loss": 0.6652334928512573,
"eval_runtime": 134.4616,
"eval_samples_per_second": 29.339,
"eval_steps_per_second": 7.34,
"step": 1700
},
{
"epoch": 5.823271950894049,
"grad_norm": 5.008721828460693,
"learning_rate": 1.0466751748963444e-07,
"loss": 40.1855,
"step": 1702
},
{
"epoch": 5.830104083266614,
"grad_norm": 5.638387680053711,
"learning_rate": 9.646686570697061e-08,
"loss": 40.6194,
"step": 1704
},
{
"epoch": 5.836936215639178,
"grad_norm": 5.234898567199707,
"learning_rate": 8.860009026928629e-08,
"loss": 40.6608,
"step": 1706
},
{
"epoch": 5.843768348011743,
"grad_norm": 4.212846279144287,
"learning_rate": 8.106729664475176e-08,
"loss": 41.4097,
"step": 1708
},
{
"epoch": 5.8506004803843075,
"grad_norm": 3.5884008407592773,
"learning_rate": 7.386858582392187e-08,
"loss": 39.4515,
"step": 1710
},
{
"epoch": 5.857432612756872,
"grad_norm": 4.441662788391113,
"learning_rate": 6.700405431837587e-08,
"loss": 41.8026,
"step": 1712
},
{
"epoch": 5.864264745129437,
"grad_norm": 5.290170192718506,
"learning_rate": 6.047379415941856e-08,
"loss": 40.8839,
"step": 1714
},
{
"epoch": 5.871096877502001,
"grad_norm": 3.4507861137390137,
"learning_rate": 5.4277892896853476e-08,
"loss": 40.574,
"step": 1716
},
{
"epoch": 5.877929009874566,
"grad_norm": 3.869871139526367,
"learning_rate": 4.8416433597803234e-08,
"loss": 41.8288,
"step": 1718
},
{
"epoch": 5.884761142247131,
"grad_norm": 4.644185543060303,
"learning_rate": 4.2889494845599344e-08,
"loss": 41.318,
"step": 1720
},
{
"epoch": 5.891593274619696,
"grad_norm": 3.191018581390381,
"learning_rate": 3.769715073872748e-08,
"loss": 41.1112,
"step": 1722
},
{
"epoch": 5.898425406992261,
"grad_norm": 3.394134998321533,
"learning_rate": 3.283947088983663e-08,
"loss": 41.9932,
"step": 1724
},
{
"epoch": 5.9052575393648254,
"grad_norm": 4.62444543838501,
"learning_rate": 2.831652042480093e-08,
"loss": 39.9583,
"step": 1726
},
{
"epoch": 5.91208967173739,
"grad_norm": 4.27966833114624,
"learning_rate": 2.4128359981850924e-08,
"loss": 39.915,
"step": 1728
},
{
"epoch": 5.918921804109955,
"grad_norm": 3.7036333084106445,
"learning_rate": 2.0275045710760334e-08,
"loss": 40.0384,
"step": 1730
},
{
"epoch": 5.925753936482519,
"grad_norm": 5.249677658081055,
"learning_rate": 1.6756629272085545e-08,
"loss": 40.1564,
"step": 1732
},
{
"epoch": 5.932586068855084,
"grad_norm": 4.477707862854004,
"learning_rate": 1.3573157836485606e-08,
"loss": 40.6008,
"step": 1734
},
{
"epoch": 5.939418201227649,
"grad_norm": 4.939481258392334,
"learning_rate": 1.0724674084083841e-08,
"loss": 40.9639,
"step": 1736
},
{
"epoch": 5.946250333600213,
"grad_norm": 2.9428999423980713,
"learning_rate": 8.211216203890537e-09,
"loss": 40.9722,
"step": 1738
},
{
"epoch": 5.953082465972778,
"grad_norm": 4.589330673217773,
"learning_rate": 6.032817893297793e-09,
"loss": 41.4832,
"step": 1740
},
{
"epoch": 5.9599145983453425,
"grad_norm": 5.4429450035095215,
"learning_rate": 4.1895083576271035e-09,
"loss": 41.8059,
"step": 1742
},
{
"epoch": 5.966746730717908,
"grad_norm": 3.5152432918548584,
"learning_rate": 2.681312309735229e-09,
"loss": 41.2228,
"step": 1744
},
{
"epoch": 5.973578863090473,
"grad_norm": 4.573424339294434,
"learning_rate": 1.5082499696839059e-09,
"loss": 41.9849,
"step": 1746
},
{
"epoch": 5.980410995463037,
"grad_norm": 4.099581718444824,
"learning_rate": 6.703370644706164e-10,
"loss": 40.6948,
"step": 1748
},
{
"epoch": 5.987243127835602,
"grad_norm": 4.090056896209717,
"learning_rate": 1.6758482781209507e-10,
"loss": 40.9226,
"step": 1750
},
{
"epoch": 5.987243127835602,
"eval_loss": 0.6658891439437866,
"eval_runtime": 134.1369,
"eval_samples_per_second": 29.41,
"eval_steps_per_second": 7.358,
"step": 1750
},
{
"epoch": 5.994075260208167,
"grad_norm": 4.494061470031738,
"learning_rate": 0.0,
"loss": 41.0993,
"step": 1752
}
],
"logging_steps": 2,
"max_steps": 1752,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.616163439072248e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}