{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00796812749003984, "grad_norm": 0.8635491728782654, "learning_rate": 5.882352941176471e-08, "loss": 2.092526435852051, "step": 2 }, { "epoch": 0.01593625498007968, "grad_norm": 0.65386563539505, "learning_rate": 1.764705882352941e-07, "loss": 2.009572982788086, "step": 4 }, { "epoch": 0.02390438247011952, "grad_norm": 2.1573679447174072, "learning_rate": 2.9411764705882356e-07, "loss": 2.2031428813934326, "step": 6 }, { "epoch": 0.03187250996015936, "grad_norm": 0.7024685740470886, "learning_rate": 4.11764705882353e-07, "loss": 1.9222521781921387, "step": 8 }, { "epoch": 0.0398406374501992, "grad_norm": 2.466196298599243, "learning_rate": 5.294117647058824e-07, "loss": 1.5961482524871826, "step": 10 }, { "epoch": 0.04780876494023904, "grad_norm": 0.8721945285797119, "learning_rate": 6.470588235294118e-07, "loss": 1.8626258373260498, "step": 12 }, { "epoch": 0.055776892430278883, "grad_norm": 0.9898734092712402, "learning_rate": 7.647058823529412e-07, "loss": 2.907916784286499, "step": 14 }, { "epoch": 0.06374501992031872, "grad_norm": 3.7050821781158447, "learning_rate": 8.823529411764706e-07, "loss": 5.169388771057129, "step": 16 }, { "epoch": 0.07171314741035857, "grad_norm": 1.2959784269332886, "learning_rate": 1e-06, "loss": 1.9087082147598267, "step": 18 }, { "epoch": 0.0796812749003984, "grad_norm": 0.5755355358123779, "learning_rate": 1.1176470588235294e-06, "loss": 1.741716980934143, "step": 20 }, { "epoch": 0.08764940239043825, "grad_norm": 0.5503631234169006, "learning_rate": 1.2352941176470588e-06, "loss": 2.108546733856201, "step": 22 }, { "epoch": 0.09561752988047809, "grad_norm": 1.9359296560287476, "learning_rate": 1.3529411764705883e-06, "loss": 1.5568987131118774, "step": 24 }, { "epoch": 0.10358565737051793, "grad_norm": 0.3511880934238434, "learning_rate": 1.4705882352941175e-06, "loss": 1.6464097499847412, "step": 26 }, { "epoch": 0.11155378486055777, "grad_norm": 1.7009022235870361, "learning_rate": 1.5882352941176472e-06, "loss": 1.681158185005188, "step": 28 }, { "epoch": 0.11952191235059761, "grad_norm": 0.9740716814994812, "learning_rate": 1.7058823529411764e-06, "loss": 1.4671032428741455, "step": 30 }, { "epoch": 0.12749003984063745, "grad_norm": 1.555565595626831, "learning_rate": 1.8235294117647058e-06, "loss": 1.2685446739196777, "step": 32 }, { "epoch": 0.13545816733067728, "grad_norm": 0.8954079151153564, "learning_rate": 1.9411764705882353e-06, "loss": 1.3833661079406738, "step": 34 }, { "epoch": 0.14342629482071714, "grad_norm": 0.5755887627601624, "learning_rate": 2.058823529411765e-06, "loss": 1.5257431268692017, "step": 36 }, { "epoch": 0.15139442231075698, "grad_norm": 2.27645206451416, "learning_rate": 2.176470588235294e-06, "loss": 1.2891892194747925, "step": 38 }, { "epoch": 0.1593625498007968, "grad_norm": 0.37568527460098267, "learning_rate": 2.2941176470588234e-06, "loss": 1.424462080001831, "step": 40 }, { "epoch": 0.16733067729083664, "grad_norm": 2.9606502056121826, "learning_rate": 2.411764705882353e-06, "loss": 0.9945810437202454, "step": 42 }, { "epoch": 0.1752988047808765, "grad_norm": 0.474932461977005, "learning_rate": 2.5294117647058823e-06, "loss": 1.1145226955413818, "step": 44 }, { "epoch": 0.18326693227091634, "grad_norm": 0.6873041391372681, "learning_rate": 2.647058823529412e-06, "loss": 0.9955132603645325, "step": 46 }, { "epoch": 0.19123505976095617, "grad_norm": 1.075629711151123, "learning_rate": 2.764705882352941e-06, "loss": 1.2583197355270386, "step": 48 }, { "epoch": 0.199203187250996, "grad_norm": 0.3792019784450531, "learning_rate": 2.882352941176471e-06, "loss": 1.4063794612884521, "step": 50 }, { "epoch": 0.20717131474103587, "grad_norm": 0.45150187611579895, "learning_rate": 3e-06, "loss": 1.3480579853057861, "step": 52 }, { "epoch": 0.2151394422310757, "grad_norm": 37.446346282958984, "learning_rate": 2.999970658917326e-06, "loss": 1.171717882156372, "step": 54 }, { "epoch": 0.22310756972111553, "grad_norm": 0.5553145408630371, "learning_rate": 2.9998826369447094e-06, "loss": 1.567508339881897, "step": 56 }, { "epoch": 0.23107569721115537, "grad_norm": 0.5614340901374817, "learning_rate": 2.9997359379083137e-06, "loss": 1.3946313858032227, "step": 58 }, { "epoch": 0.23904382470119523, "grad_norm": 0.4262371063232422, "learning_rate": 2.9995305681848922e-06, "loss": 0.8638534545898438, "step": 60 }, { "epoch": 0.24701195219123506, "grad_norm": 0.37590524554252625, "learning_rate": 2.9992665367015114e-06, "loss": 1.3967615365982056, "step": 62 }, { "epoch": 0.2549800796812749, "grad_norm": 0.8519444465637207, "learning_rate": 2.998943854935163e-06, "loss": 1.1325318813323975, "step": 64 }, { "epoch": 0.26294820717131473, "grad_norm": 0.3111265301704407, "learning_rate": 2.9985625369122664e-06, "loss": 1.312957525253296, "step": 66 }, { "epoch": 0.27091633466135456, "grad_norm": 1.5971601009368896, "learning_rate": 2.998122599208055e-06, "loss": 0.9911661148071289, "step": 68 }, { "epoch": 0.2788844621513944, "grad_norm": 0.5279662013053894, "learning_rate": 2.9976240609458617e-06, "loss": 1.567615032196045, "step": 70 }, { "epoch": 0.2868525896414343, "grad_norm": 2.7161941528320312, "learning_rate": 2.9970669437962822e-06, "loss": 0.8429480791091919, "step": 72 }, { "epoch": 0.2948207171314741, "grad_norm": 1.455247163772583, "learning_rate": 2.9964512719762347e-06, "loss": 1.1409013271331787, "step": 74 }, { "epoch": 0.30278884462151395, "grad_norm": 0.41458117961883545, "learning_rate": 2.9957770722479088e-06, "loss": 1.4755173921585083, "step": 76 }, { "epoch": 0.3107569721115538, "grad_norm": 2.052013635635376, "learning_rate": 2.9950443739176006e-06, "loss": 0.9957228899002075, "step": 78 }, { "epoch": 0.3187250996015936, "grad_norm": 0.35525253415107727, "learning_rate": 2.99425320883444e-06, "loss": 0.8386018872261047, "step": 80 }, { "epoch": 0.32669322709163345, "grad_norm": 0.39254093170166016, "learning_rate": 2.993403611389005e-06, "loss": 1.3378560543060303, "step": 82 }, { "epoch": 0.3346613545816733, "grad_norm": 0.6873935461044312, "learning_rate": 2.992495618511827e-06, "loss": 0.8897277116775513, "step": 84 }, { "epoch": 0.3426294820717131, "grad_norm": 0.6063620448112488, "learning_rate": 2.991529269671786e-06, "loss": 1.3212584257125854, "step": 86 }, { "epoch": 0.350597609561753, "grad_norm": 0.685740053653717, "learning_rate": 2.9905046068743946e-06, "loss": 0.8896841406822205, "step": 88 }, { "epoch": 0.35856573705179284, "grad_norm": 0.3957422375679016, "learning_rate": 2.9894216746599727e-06, "loss": 1.3377217054367065, "step": 90 }, { "epoch": 0.3665338645418327, "grad_norm": 2.463545799255371, "learning_rate": 2.9882805201017116e-06, "loss": 1.133660912513733, "step": 92 }, { "epoch": 0.3745019920318725, "grad_norm": 0.4274525046348572, "learning_rate": 2.9870811928036256e-06, "loss": 1.2548623085021973, "step": 94 }, { "epoch": 0.38247011952191234, "grad_norm": 0.9313490986824036, "learning_rate": 2.985823744898399e-06, "loss": 1.1074374914169312, "step": 96 }, { "epoch": 0.3904382470119522, "grad_norm": 0.2799544632434845, "learning_rate": 2.984508231045117e-06, "loss": 1.2744643688201904, "step": 98 }, { "epoch": 0.398406374501992, "grad_norm": 0.5394402146339417, "learning_rate": 2.9831347084268923e-06, "loss": 0.9088782072067261, "step": 100 }, { "epoch": 0.4063745019920319, "grad_norm": 0.5486662983894348, "learning_rate": 2.981703236748378e-06, "loss": 1.2874795198440552, "step": 102 }, { "epoch": 0.41434262948207173, "grad_norm": 0.40334078669548035, "learning_rate": 2.9802138782331712e-06, "loss": 0.776800274848938, "step": 104 }, { "epoch": 0.42231075697211157, "grad_norm": 1.1005347967147827, "learning_rate": 2.978666697621112e-06, "loss": 0.733248770236969, "step": 106 }, { "epoch": 0.4302788844621514, "grad_norm": 0.7740170955657959, "learning_rate": 2.9770617621654656e-06, "loss": 0.9902894496917725, "step": 108 }, { "epoch": 0.43824701195219123, "grad_norm": 0.4173789918422699, "learning_rate": 2.9753991416300007e-06, "loss": 1.2939473390579224, "step": 110 }, { "epoch": 0.44621513944223107, "grad_norm": 1.0325746536254883, "learning_rate": 2.9736789082859568e-06, "loss": 0.8199123740196228, "step": 112 }, { "epoch": 0.4541832669322709, "grad_norm": 0.8412153720855713, "learning_rate": 2.9719011369089025e-06, "loss": 1.1337602138519287, "step": 114 }, { "epoch": 0.46215139442231074, "grad_norm": 0.42725688219070435, "learning_rate": 2.970065904775485e-06, "loss": 1.2960267066955566, "step": 116 }, { "epoch": 0.4701195219123506, "grad_norm": 0.4095574617385864, "learning_rate": 2.968173291660071e-06, "loss": 1.36135995388031, "step": 118 }, { "epoch": 0.47808764940239046, "grad_norm": 0.5657700300216675, "learning_rate": 2.9662233798312805e-06, "loss": 1.2439430952072144, "step": 120 }, { "epoch": 0.4860557768924303, "grad_norm": 0.4052128791809082, "learning_rate": 2.9642162540484077e-06, "loss": 0.9588472247123718, "step": 122 }, { "epoch": 0.4940239043824701, "grad_norm": 1.1219152212142944, "learning_rate": 2.96215200155774e-06, "loss": 1.0325348377227783, "step": 124 }, { "epoch": 0.50199203187251, "grad_norm": 1.6725101470947266, "learning_rate": 2.9600307120887623e-06, "loss": 0.9328906536102295, "step": 126 }, { "epoch": 0.5099601593625498, "grad_norm": 0.5302041172981262, "learning_rate": 2.9578524778502605e-06, "loss": 1.2722545862197876, "step": 128 }, { "epoch": 0.5179282868525896, "grad_norm": 0.4805465042591095, "learning_rate": 2.9556173935263094e-06, "loss": 0.9367498159408569, "step": 130 }, { "epoch": 0.5258964143426295, "grad_norm": 0.34964123368263245, "learning_rate": 2.9533255562721594e-06, "loss": 0.8116304278373718, "step": 132 }, { "epoch": 0.5338645418326693, "grad_norm": 0.3873235881328583, "learning_rate": 2.950977065710012e-06, "loss": 0.8979853391647339, "step": 134 }, { "epoch": 0.5418326693227091, "grad_norm": 0.49073103070259094, "learning_rate": 2.9485720239246913e-06, "loss": 1.2764328718185425, "step": 136 }, { "epoch": 0.549800796812749, "grad_norm": 0.9829900860786438, "learning_rate": 2.946110535459204e-06, "loss": 0.9340943694114685, "step": 138 }, { "epoch": 0.5577689243027888, "grad_norm": 0.6929047703742981, "learning_rate": 2.9435927073101974e-06, "loss": 1.0783215761184692, "step": 140 }, { "epoch": 0.5657370517928287, "grad_norm": 0.33450594544410706, "learning_rate": 2.9410186489233063e-06, "loss": 1.015251636505127, "step": 142 }, { "epoch": 0.5737051792828686, "grad_norm": 0.4002437889575958, "learning_rate": 2.9383884721883973e-06, "loss": 1.2262054681777954, "step": 144 }, { "epoch": 0.5816733067729084, "grad_norm": 0.5956969261169434, "learning_rate": 2.9357022914347046e-06, "loss": 1.382331132888794, "step": 146 }, { "epoch": 0.5896414342629482, "grad_norm": 0.49758458137512207, "learning_rate": 2.9329602234258606e-06, "loss": 0.9430940747261047, "step": 148 }, { "epoch": 0.5976095617529881, "grad_norm": 0.6348124146461487, "learning_rate": 2.9301623873548187e-06, "loss": 0.9293842315673828, "step": 150 }, { "epoch": 0.6055776892430279, "grad_norm": 0.8025413155555725, "learning_rate": 2.9273089048386757e-06, "loss": 0.9237926602363586, "step": 152 }, { "epoch": 0.6135458167330677, "grad_norm": 9.821154594421387, "learning_rate": 2.9243998999133803e-06, "loss": 1.1090776920318604, "step": 154 }, { "epoch": 0.6215139442231076, "grad_norm": 1.79253351688385, "learning_rate": 2.921435499028347e-06, "loss": 0.8607668280601501, "step": 156 }, { "epoch": 0.6294820717131474, "grad_norm": 0.6355870366096497, "learning_rate": 2.918415831040955e-06, "loss": 1.1218218803405762, "step": 158 }, { "epoch": 0.6374501992031872, "grad_norm": 0.4850366711616516, "learning_rate": 2.91534102721095e-06, "loss": 1.0082756280899048, "step": 160 }, { "epoch": 0.6454183266932271, "grad_norm": 0.22921444475650787, "learning_rate": 2.9122112211947373e-06, "loss": 0.7417871952056885, "step": 162 }, { "epoch": 0.6533864541832669, "grad_norm": 0.2860549986362457, "learning_rate": 2.9090265490395713e-06, "loss": 1.1341336965560913, "step": 164 }, { "epoch": 0.6613545816733067, "grad_norm": 0.9676735401153564, "learning_rate": 2.9057871491776436e-06, "loss": 0.7781538963317871, "step": 166 }, { "epoch": 0.6693227091633466, "grad_norm": 0.3801421523094177, "learning_rate": 2.9024931624200637e-06, "loss": 1.2313846349716187, "step": 168 }, { "epoch": 0.6772908366533864, "grad_norm": 1.8267383575439453, "learning_rate": 2.899144731950739e-06, "loss": 0.8328909873962402, "step": 170 }, { "epoch": 0.6852589641434262, "grad_norm": 0.43613070249557495, "learning_rate": 2.895742003320152e-06, "loss": 0.8186226487159729, "step": 172 }, { "epoch": 0.6932270916334662, "grad_norm": 0.6106582283973694, "learning_rate": 2.8922851244390312e-06, "loss": 1.2231653928756714, "step": 174 }, { "epoch": 0.701195219123506, "grad_norm": 0.9366831183433533, "learning_rate": 2.888774245571924e-06, "loss": 0.7547324299812317, "step": 176 }, { "epoch": 0.7091633466135459, "grad_norm": 1.2000120878219604, "learning_rate": 2.8852095193306633e-06, "loss": 0.8711674809455872, "step": 178 }, { "epoch": 0.7171314741035857, "grad_norm": 0.8977218270301819, "learning_rate": 2.8815911006677326e-06, "loss": 1.0363984107971191, "step": 180 }, { "epoch": 0.7250996015936255, "grad_norm": 0.23066113889217377, "learning_rate": 2.877919146869535e-06, "loss": 0.4867554008960724, "step": 182 }, { "epoch": 0.7330677290836654, "grad_norm": 0.45316410064697266, "learning_rate": 2.874193817549551e-06, "loss": 1.250420331954956, "step": 184 }, { "epoch": 0.7410358565737052, "grad_norm": 0.40230974555015564, "learning_rate": 2.870415274641405e-06, "loss": 1.2383266687393188, "step": 186 }, { "epoch": 0.749003984063745, "grad_norm": 0.46227118372917175, "learning_rate": 2.866583682391821e-06, "loss": 1.0794142484664917, "step": 188 }, { "epoch": 0.7569721115537849, "grad_norm": 1.3957983255386353, "learning_rate": 2.8626992073534888e-06, "loss": 0.8541224598884583, "step": 190 }, { "epoch": 0.7649402390438247, "grad_norm": 0.618249237537384, "learning_rate": 2.858762018377821e-06, "loss": 1.187286615371704, "step": 192 }, { "epoch": 0.7729083665338645, "grad_norm": 0.6879091858863831, "learning_rate": 2.8547722866076125e-06, "loss": 0.8607790470123291, "step": 194 }, { "epoch": 0.7808764940239044, "grad_norm": 0.6802543997764587, "learning_rate": 2.850730185469604e-06, "loss": 0.5753005743026733, "step": 196 }, { "epoch": 0.7888446215139442, "grad_norm": 2.574225664138794, "learning_rate": 2.8466358906669423e-06, "loss": 0.7759158611297607, "step": 198 }, { "epoch": 0.796812749003984, "grad_norm": 1.023215651512146, "learning_rate": 2.842489580171541e-06, "loss": 1.0175625085830688, "step": 200 }, { "epoch": 0.8047808764940239, "grad_norm": 0.2602544128894806, "learning_rate": 2.838291434216347e-06, "loss": 0.95418781042099, "step": 202 }, { "epoch": 0.8127490039840638, "grad_norm": 0.4951968193054199, "learning_rate": 2.8340416352875057e-06, "loss": 1.4974547624588013, "step": 204 }, { "epoch": 0.8207171314741036, "grad_norm": 0.4747754633426666, "learning_rate": 2.8297403681164256e-06, "loss": 1.2074265480041504, "step": 206 }, { "epoch": 0.8286852589641435, "grad_norm": 0.5524476766586304, "learning_rate": 2.825387819671754e-06, "loss": 0.5993782877922058, "step": 208 }, { "epoch": 0.8366533864541833, "grad_norm": 0.6101752519607544, "learning_rate": 2.820984179151243e-06, "loss": 1.2540825605392456, "step": 210 }, { "epoch": 0.8446215139442231, "grad_norm": 0.4768051505088806, "learning_rate": 2.816529637973531e-06, "loss": 1.2111059427261353, "step": 212 }, { "epoch": 0.852589641434263, "grad_norm": 0.4470526874065399, "learning_rate": 2.8120243897698197e-06, "loss": 0.936885416507721, "step": 214 }, { "epoch": 0.8605577689243028, "grad_norm": 0.8324171900749207, "learning_rate": 2.807468630375457e-06, "loss": 1.4127908945083618, "step": 216 }, { "epoch": 0.8685258964143426, "grad_norm": 0.8571661710739136, "learning_rate": 2.802862557821425e-06, "loss": 0.9163059592247009, "step": 218 }, { "epoch": 0.8764940239043825, "grad_norm": 0.656310498714447, "learning_rate": 2.7982063723257324e-06, "loss": 1.2317224740982056, "step": 220 }, { "epoch": 0.8844621513944223, "grad_norm": 0.94222491979599, "learning_rate": 2.7935002762847104e-06, "loss": 1.4826358556747437, "step": 222 }, { "epoch": 0.8924302788844621, "grad_norm": 0.6392826437950134, "learning_rate": 2.7887444742642153e-06, "loss": 0.7295237183570862, "step": 224 }, { "epoch": 0.900398406374502, "grad_norm": 0.40648454427719116, "learning_rate": 2.783939172990736e-06, "loss": 0.9684391021728516, "step": 226 }, { "epoch": 0.9083665338645418, "grad_norm": 1.8654367923736572, "learning_rate": 2.7790845813424085e-06, "loss": 1.076025128364563, "step": 228 }, { "epoch": 0.9163346613545816, "grad_norm": 0.5885425806045532, "learning_rate": 2.7741809103399372e-06, "loss": 1.1793254613876343, "step": 230 }, { "epoch": 0.9243027888446215, "grad_norm": 0.4949474334716797, "learning_rate": 2.76922837313742e-06, "loss": 1.3203648328781128, "step": 232 }, { "epoch": 0.9322709163346613, "grad_norm": 0.6972324252128601, "learning_rate": 2.7642271850130845e-06, "loss": 0.8468176126480103, "step": 234 }, { "epoch": 0.9402390438247012, "grad_norm": 0.3574720323085785, "learning_rate": 2.7591775633599295e-06, "loss": 0.8511747121810913, "step": 236 }, { "epoch": 0.9482071713147411, "grad_norm": 1.726371169090271, "learning_rate": 2.7540797276762748e-06, "loss": 0.43165379762649536, "step": 238 }, { "epoch": 0.9561752988047809, "grad_norm": 1.3261985778808594, "learning_rate": 2.7489338995562223e-06, "loss": 0.8357299566268921, "step": 240 }, { "epoch": 0.9641434262948207, "grad_norm": 0.796555757522583, "learning_rate": 2.743740302680021e-06, "loss": 1.1258844137191772, "step": 242 }, { "epoch": 0.9721115537848606, "grad_norm": 0.3952471911907196, "learning_rate": 2.738499162804346e-06, "loss": 1.2129818201065063, "step": 244 }, { "epoch": 0.9800796812749004, "grad_norm": 3.2808635234832764, "learning_rate": 2.733210707752483e-06, "loss": 0.9465621113777161, "step": 246 }, { "epoch": 0.9880478087649402, "grad_norm": 0.36960551142692566, "learning_rate": 2.7278751674044277e-06, "loss": 0.9761192202568054, "step": 248 }, { "epoch": 0.9960159362549801, "grad_norm": 0.40931713581085205, "learning_rate": 2.7224927736868926e-06, "loss": 0.9702551364898682, "step": 250 }, { "epoch": 1.00398406374502, "grad_norm": 0.870992124080658, "learning_rate": 2.7170637605632236e-06, "loss": 0.9026790857315063, "step": 252 }, { "epoch": 1.0119521912350598, "grad_norm": 0.5800741910934448, "learning_rate": 2.7115883640232326e-06, "loss": 0.696426272392273, "step": 254 }, { "epoch": 1.0199203187250996, "grad_norm": 0.5435763001441956, "learning_rate": 2.706066822072938e-06, "loss": 1.1814464330673218, "step": 256 }, { "epoch": 1.0278884462151394, "grad_norm": 0.7086870074272156, "learning_rate": 2.70049937472422e-06, "loss": 1.1569794416427612, "step": 258 }, { "epoch": 1.0358565737051793, "grad_norm": 0.6994827389717102, "learning_rate": 2.694886263984387e-06, "loss": 1.092099905014038, "step": 260 }, { "epoch": 1.043824701195219, "grad_norm": 0.5085208415985107, "learning_rate": 2.6892277338456545e-06, "loss": 1.0143777132034302, "step": 262 }, { "epoch": 1.051792828685259, "grad_norm": 0.7572088837623596, "learning_rate": 2.6835240302745432e-06, "loss": 1.1160310506820679, "step": 264 }, { "epoch": 1.0597609561752988, "grad_norm": 0.969849705696106, "learning_rate": 2.6777754012011822e-06, "loss": 0.6508786678314209, "step": 266 }, { "epoch": 1.0677290836653386, "grad_norm": 0.40249520540237427, "learning_rate": 2.6719820965085373e-06, "loss": 0.9302061796188354, "step": 268 }, { "epoch": 1.0756972111553784, "grad_norm": 0.39779043197631836, "learning_rate": 2.6661443680215436e-06, "loss": 0.5909485220909119, "step": 270 }, { "epoch": 1.0836653386454183, "grad_norm": 0.4452027976512909, "learning_rate": 2.6602624694961634e-06, "loss": 1.2358585596084595, "step": 272 }, { "epoch": 1.091633466135458, "grad_norm": 1.504374623298645, "learning_rate": 2.6543366566083536e-06, "loss": 0.602242648601532, "step": 274 }, { "epoch": 1.099601593625498, "grad_norm": 0.4246741831302643, "learning_rate": 2.6483671869429515e-06, "loss": 0.925746500492096, "step": 276 }, { "epoch": 1.1075697211155378, "grad_norm": 0.4856192171573639, "learning_rate": 2.6423543199824814e-06, "loss": 0.8335383534431458, "step": 278 }, { "epoch": 1.1155378486055776, "grad_norm": 0.46481937170028687, "learning_rate": 2.6362983170958708e-06, "loss": 1.0746686458587646, "step": 280 }, { "epoch": 1.1235059760956174, "grad_norm": 0.40899351239204407, "learning_rate": 2.6301994415270927e-06, "loss": 1.1849327087402344, "step": 282 }, { "epoch": 1.1314741035856573, "grad_norm": 1.2962706089019775, "learning_rate": 2.62405795838372e-06, "loss": 1.1970727443695068, "step": 284 }, { "epoch": 1.139442231075697, "grad_norm": 1.8368911743164062, "learning_rate": 2.617874134625405e-06, "loss": 0.3501308858394623, "step": 286 }, { "epoch": 1.1474103585657371, "grad_norm": 0.2166207730770111, "learning_rate": 2.6116482390522715e-06, "loss": 0.7515382170677185, "step": 288 }, { "epoch": 1.155378486055777, "grad_norm": 0.823740541934967, "learning_rate": 2.605380542293234e-06, "loss": 0.9569700360298157, "step": 290 }, { "epoch": 1.1633466135458168, "grad_norm": 0.6585810780525208, "learning_rate": 2.5990713167942306e-06, "loss": 0.5356516242027283, "step": 292 }, { "epoch": 1.1713147410358566, "grad_norm": 0.4426569640636444, "learning_rate": 2.5927208368063825e-06, "loss": 1.1944938898086548, "step": 294 }, { "epoch": 1.1792828685258965, "grad_norm": 0.3524056673049927, "learning_rate": 2.586329378374074e-06, "loss": 1.2448694705963135, "step": 296 }, { "epoch": 1.1872509960159363, "grad_norm": 0.3419857621192932, "learning_rate": 2.5798972193229485e-06, "loss": 1.384804129600525, "step": 298 }, { "epoch": 1.1952191235059761, "grad_norm": 1.195278286933899, "learning_rate": 2.573424639247837e-06, "loss": 0.6737868189811707, "step": 300 }, { "epoch": 1.203187250996016, "grad_norm": 0.5209558606147766, "learning_rate": 2.5669119195006016e-06, "loss": 0.9476598501205444, "step": 302 }, { "epoch": 1.2111553784860558, "grad_norm": 0.6349844336509705, "learning_rate": 2.560359343177907e-06, "loss": 0.6187256574630737, "step": 304 }, { "epoch": 1.2191235059760956, "grad_norm": 0.6629199385643005, "learning_rate": 2.553767195108914e-06, "loss": 1.175504207611084, "step": 306 }, { "epoch": 1.2270916334661355, "grad_norm": 2.9705917835235596, "learning_rate": 2.547135761842899e-06, "loss": 1.0829224586486816, "step": 308 }, { "epoch": 1.2350597609561753, "grad_norm": 0.5714198350906372, "learning_rate": 2.5404653316367983e-06, "loss": 1.107988953590393, "step": 310 }, { "epoch": 1.2430278884462151, "grad_norm": 0.5963674783706665, "learning_rate": 2.533756194442676e-06, "loss": 1.066701054573059, "step": 312 }, { "epoch": 1.250996015936255, "grad_norm": 0.4868275225162506, "learning_rate": 2.527008641895124e-06, "loss": 0.8929526209831238, "step": 314 }, { "epoch": 1.2589641434262948, "grad_norm": 0.5656901001930237, "learning_rate": 2.5202229672985834e-06, "loss": 0.7201364040374756, "step": 316 }, { "epoch": 1.2669322709163346, "grad_norm": 1.8498417139053345, "learning_rate": 2.513399465614593e-06, "loss": 0.7000647187232971, "step": 318 }, { "epoch": 1.2749003984063745, "grad_norm": 1.8695796728134155, "learning_rate": 2.5065384334489707e-06, "loss": 0.6135216355323792, "step": 320 }, { "epoch": 1.2828685258964143, "grad_norm": 1.0598129034042358, "learning_rate": 2.499640169038919e-06, "loss": 0.9898048639297485, "step": 322 }, { "epoch": 1.2908366533864541, "grad_norm": 0.43019378185272217, "learning_rate": 2.4927049722400632e-06, "loss": 0.9241266250610352, "step": 324 }, { "epoch": 1.298804780876494, "grad_norm": 8.146377563476562, "learning_rate": 2.4857331445134148e-06, "loss": 1.3060351610183716, "step": 326 }, { "epoch": 1.3067729083665338, "grad_norm": 0.3936547338962555, "learning_rate": 2.4787249889122664e-06, "loss": 0.7562607526779175, "step": 328 }, { "epoch": 1.3147410358565736, "grad_norm": 0.45019447803497314, "learning_rate": 2.4716808100690243e-06, "loss": 0.9096888303756714, "step": 330 }, { "epoch": 1.3227091633466135, "grad_norm": 0.6943168044090271, "learning_rate": 2.4646009141819594e-06, "loss": 1.000761866569519, "step": 332 }, { "epoch": 1.3306772908366533, "grad_norm": 1.2407184839248657, "learning_rate": 2.4574856090019033e-06, "loss": 0.7026602029800415, "step": 334 }, { "epoch": 1.3386454183266931, "grad_norm": 1.955567479133606, "learning_rate": 2.4503352038188665e-06, "loss": 0.8961894512176514, "step": 336 }, { "epoch": 1.3466135458167332, "grad_norm": 0.6083151698112488, "learning_rate": 2.4431500094485963e-06, "loss": 1.196513056755066, "step": 338 }, { "epoch": 1.354581673306773, "grad_norm": 0.3481491804122925, "learning_rate": 2.435930338219066e-06, "loss": 1.1471824645996094, "step": 340 }, { "epoch": 1.3625498007968129, "grad_norm": 0.3202623128890991, "learning_rate": 2.4286765039568996e-06, "loss": 1.1803168058395386, "step": 342 }, { "epoch": 1.3705179282868527, "grad_norm": 0.5573057532310486, "learning_rate": 2.4213888219737273e-06, "loss": 1.145288109779358, "step": 344 }, { "epoch": 1.3784860557768925, "grad_norm": 0.7530761361122131, "learning_rate": 2.4140676090524813e-06, "loss": 0.7112540006637573, "step": 346 }, { "epoch": 1.3864541832669324, "grad_norm": 0.31881389021873474, "learning_rate": 2.4067131834336265e-06, "loss": 1.1765650510787964, "step": 348 }, { "epoch": 1.3944223107569722, "grad_norm": 0.5150675773620605, "learning_rate": 2.3993258648013265e-06, "loss": 0.7162399291992188, "step": 350 }, { "epoch": 1.402390438247012, "grad_norm": 0.4000639021396637, "learning_rate": 2.391905974269546e-06, "loss": 1.1982700824737549, "step": 352 }, { "epoch": 1.4103585657370519, "grad_norm": 0.5254287719726562, "learning_rate": 2.3844538343680954e-06, "loss": 1.1246976852416992, "step": 354 }, { "epoch": 1.4183266932270917, "grad_norm": 0.7500051259994507, "learning_rate": 2.376969769028608e-06, "loss": 0.9707033634185791, "step": 356 }, { "epoch": 1.4262948207171315, "grad_norm": 0.4242282211780548, "learning_rate": 2.3694541035704623e-06, "loss": 1.1708297729492188, "step": 358 }, { "epoch": 1.4342629482071714, "grad_norm": 0.5363942384719849, "learning_rate": 2.361907164686638e-06, "loss": 0.6365261077880859, "step": 360 }, { "epoch": 1.4422310756972112, "grad_norm": 0.34740063548088074, "learning_rate": 2.3543292804295164e-06, "loss": 0.7796180844306946, "step": 362 }, { "epoch": 1.450199203187251, "grad_norm": 0.8936453461647034, "learning_rate": 2.3467207801966217e-06, "loss": 1.069722056388855, "step": 364 }, { "epoch": 1.4581673306772909, "grad_norm": 0.5224515199661255, "learning_rate": 2.339081994716301e-06, "loss": 1.1437441110610962, "step": 366 }, { "epoch": 1.4661354581673307, "grad_norm": 0.3716191351413727, "learning_rate": 2.3314132560333486e-06, "loss": 0.6178401708602905, "step": 368 }, { "epoch": 1.4741035856573705, "grad_norm": 0.38474348187446594, "learning_rate": 2.3237148974945732e-06, "loss": 0.6917285323143005, "step": 370 }, { "epoch": 1.4820717131474104, "grad_norm": 0.6348185539245605, "learning_rate": 2.315987253734307e-06, "loss": 1.248483657836914, "step": 372 }, { "epoch": 1.4900398406374502, "grad_norm": 0.5666549205780029, "learning_rate": 2.308230660659861e-06, "loss": 1.1805977821350098, "step": 374 }, { "epoch": 1.49800796812749, "grad_norm": 0.3927704393863678, "learning_rate": 2.300445455436921e-06, "loss": 1.144189715385437, "step": 376 }, { "epoch": 1.5059760956175299, "grad_norm": 1.197426676750183, "learning_rate": 2.292631976474895e-06, "loss": 0.9748780131340027, "step": 378 }, { "epoch": 1.5139442231075697, "grad_norm": 0.5384941101074219, "learning_rate": 2.284790563412201e-06, "loss": 1.1320137977600098, "step": 380 }, { "epoch": 1.5219123505976095, "grad_norm": 0.40527671575546265, "learning_rate": 2.2769215571015054e-06, "loss": 0.756669282913208, "step": 382 }, { "epoch": 1.5298804780876494, "grad_norm": 0.5759975910186768, "learning_rate": 2.2690252995949015e-06, "loss": 0.7979004979133606, "step": 384 }, { "epoch": 1.5378486055776892, "grad_norm": 0.7272588014602661, "learning_rate": 2.2611021341290494e-06, "loss": 1.049770474433899, "step": 386 }, { "epoch": 1.545816733067729, "grad_norm": 0.495037317276001, "learning_rate": 2.2531524051102493e-06, "loss": 1.1562248468399048, "step": 388 }, { "epoch": 1.5537848605577689, "grad_norm": 0.6938880681991577, "learning_rate": 2.245176458099474e-06, "loss": 0.5849276781082153, "step": 390 }, { "epoch": 1.5617529880478087, "grad_norm": 0.6492685675621033, "learning_rate": 2.237174639797346e-06, "loss": 0.6089207530021667, "step": 392 }, { "epoch": 1.5697211155378485, "grad_norm": 0.7078109383583069, "learning_rate": 2.2291472980290696e-06, "loss": 1.174211025238037, "step": 394 }, { "epoch": 1.5776892430278884, "grad_norm": 0.5934572219848633, "learning_rate": 2.221094781729308e-06, "loss": 1.045981526374817, "step": 396 }, { "epoch": 1.5856573705179282, "grad_norm": 0.40928709506988525, "learning_rate": 2.2130174409270204e-06, "loss": 1.1423792839050293, "step": 398 }, { "epoch": 1.593625498007968, "grad_norm": 0.9985561370849609, "learning_rate": 2.204915626730242e-06, "loss": 0.6377730369567871, "step": 400 }, { "epoch": 1.6015936254980079, "grad_norm": 0.6911800503730774, "learning_rate": 2.1967896913108245e-06, "loss": 1.1229146718978882, "step": 402 }, { "epoch": 1.6095617529880477, "grad_norm": 0.45020508766174316, "learning_rate": 2.1886399878891288e-06, "loss": 1.2710224390029907, "step": 404 }, { "epoch": 1.6175298804780875, "grad_norm": 1.1965842247009277, "learning_rate": 2.1804668707186685e-06, "loss": 0.7574101686477661, "step": 406 }, { "epoch": 1.6254980079681274, "grad_norm": 0.3679886758327484, "learning_rate": 2.1722706950707116e-06, "loss": 0.5999529957771301, "step": 408 }, { "epoch": 1.6334661354581672, "grad_norm": 0.3366493284702301, "learning_rate": 2.1640518172188405e-06, "loss": 0.8446294665336609, "step": 410 }, { "epoch": 1.641434262948207, "grad_norm": 0.5908185243606567, "learning_rate": 2.1558105944234613e-06, "loss": 0.5261144638061523, "step": 412 }, { "epoch": 1.6494023904382469, "grad_norm": 0.41685259342193604, "learning_rate": 2.147547384916277e-06, "loss": 0.8532723188400269, "step": 414 }, { "epoch": 1.6573705179282867, "grad_norm": 0.45444801449775696, "learning_rate": 2.1392625478847147e-06, "loss": 0.7636860609054565, "step": 416 }, { "epoch": 1.6653386454183265, "grad_norm": 0.7797396779060364, "learning_rate": 2.130956443456313e-06, "loss": 1.0055797100067139, "step": 418 }, { "epoch": 1.6733067729083664, "grad_norm": 0.48384949564933777, "learning_rate": 2.1226294326830663e-06, "loss": 1.1412853002548218, "step": 420 }, { "epoch": 1.6812749003984062, "grad_norm": 0.5391873121261597, "learning_rate": 2.1142818775257326e-06, "loss": 1.0306801795959473, "step": 422 }, { "epoch": 1.6892430278884463, "grad_norm": 0.4576083719730377, "learning_rate": 2.105914140838099e-06, "loss": 0.6449341773986816, "step": 424 }, { "epoch": 1.697211155378486, "grad_norm": 0.40177881717681885, "learning_rate": 2.0975265863512077e-06, "loss": 1.2080342769622803, "step": 426 }, { "epoch": 1.705179282868526, "grad_norm": 0.41376203298568726, "learning_rate": 2.0891195786575484e-06, "loss": 1.1480873823165894, "step": 428 }, { "epoch": 1.7131474103585658, "grad_norm": 0.5125391483306885, "learning_rate": 2.080693483195205e-06, "loss": 0.7938195466995239, "step": 430 }, { "epoch": 1.7211155378486056, "grad_norm": 4.272192478179932, "learning_rate": 2.072248666231976e-06, "loss": 0.9956310987472534, "step": 432 }, { "epoch": 1.7290836653386454, "grad_norm": 0.7369029521942139, "learning_rate": 2.0637854948494514e-06, "loss": 0.6916837096214294, "step": 434 }, { "epoch": 1.7370517928286853, "grad_norm": 0.5697792768478394, "learning_rate": 2.0553043369270544e-06, "loss": 0.37542012333869934, "step": 436 }, { "epoch": 1.745019920318725, "grad_norm": 0.8396166563034058, "learning_rate": 2.0468055611260523e-06, "loss": 0.7680933475494385, "step": 438 }, { "epoch": 1.752988047808765, "grad_norm": 0.7971356511116028, "learning_rate": 2.038289536873533e-06, "loss": 0.6482405066490173, "step": 440 }, { "epoch": 1.7609561752988048, "grad_norm": 0.6245846152305603, "learning_rate": 2.029756634346343e-06, "loss": 1.559065818786621, "step": 442 }, { "epoch": 1.7689243027888446, "grad_norm": 0.3909813165664673, "learning_rate": 2.021207224454998e-06, "loss": 0.8277990818023682, "step": 444 }, { "epoch": 1.7768924302788844, "grad_norm": 0.4216088652610779, "learning_rate": 2.0126416788275607e-06, "loss": 1.0609842538833618, "step": 446 }, { "epoch": 1.7848605577689243, "grad_norm": 0.5274325013160706, "learning_rate": 2.0040603697934875e-06, "loss": 1.1131477355957031, "step": 448 }, { "epoch": 1.792828685258964, "grad_norm": 0.40741127729415894, "learning_rate": 1.995463670367441e-06, "loss": 0.8610782623291016, "step": 450 }, { "epoch": 1.800796812749004, "grad_norm": 0.3124333620071411, "learning_rate": 1.986851954233079e-06, "loss": 0.5843238830566406, "step": 452 }, { "epoch": 1.8087649402390438, "grad_norm": 0.5109219551086426, "learning_rate": 1.9782255957268082e-06, "loss": 0.9527801275253296, "step": 454 }, { "epoch": 1.8167330677290838, "grad_norm": 0.6287890672683716, "learning_rate": 1.969584969821516e-06, "loss": 0.684226930141449, "step": 456 }, { "epoch": 1.8247011952191237, "grad_norm": 0.6896864175796509, "learning_rate": 1.9609304521102664e-06, "loss": 1.0557795763015747, "step": 458 }, { "epoch": 1.8326693227091635, "grad_norm": 0.5463947057723999, "learning_rate": 1.9522624187899774e-06, "loss": 0.730643630027771, "step": 460 }, { "epoch": 1.8406374501992033, "grad_norm": 0.5176675319671631, "learning_rate": 1.943581246645068e-06, "loss": 0.792547345161438, "step": 462 }, { "epoch": 1.8486055776892432, "grad_norm": 0.33698317408561707, "learning_rate": 1.9348873130310776e-06, "loss": 0.11931800842285156, "step": 464 }, { "epoch": 1.856573705179283, "grad_norm": 0.4697937071323395, "learning_rate": 1.926180995858266e-06, "loss": 1.1590396165847778, "step": 466 }, { "epoch": 1.8645418326693228, "grad_norm": 0.6702647805213928, "learning_rate": 1.9174626735751844e-06, "loss": 0.5630046129226685, "step": 468 }, { "epoch": 1.8725099601593627, "grad_norm": 0.5800269246101379, "learning_rate": 1.9087327251522246e-06, "loss": 1.14718759059906, "step": 470 }, { "epoch": 1.8804780876494025, "grad_norm": 1.194881796836853, "learning_rate": 1.8999915300651478e-06, "loss": 1.059720516204834, "step": 472 }, { "epoch": 1.8884462151394423, "grad_norm": 0.7230931520462036, "learning_rate": 1.8912394682785866e-06, "loss": 1.2374215126037598, "step": 474 }, { "epoch": 1.8964143426294822, "grad_norm": 0.41785743832588196, "learning_rate": 1.8824769202295325e-06, "loss": 0.9771887063980103, "step": 476 }, { "epoch": 1.904382470119522, "grad_norm": 0.36748501658439636, "learning_rate": 1.8737042668107945e-06, "loss": 0.8857436180114746, "step": 478 }, { "epoch": 1.9123505976095618, "grad_norm": 2.2149970531463623, "learning_rate": 1.8649218893544465e-06, "loss": 0.6210463047027588, "step": 480 }, { "epoch": 1.9203187250996017, "grad_norm": 0.5847256779670715, "learning_rate": 1.8561301696152485e-06, "loss": 0.758573591709137, "step": 482 }, { "epoch": 1.9282868525896415, "grad_norm": 1.2704209089279175, "learning_rate": 1.847329489754052e-06, "loss": 1.1186835765838623, "step": 484 }, { "epoch": 1.9362549800796813, "grad_norm": 0.3778979778289795, "learning_rate": 1.8385202323211921e-06, "loss": 1.2423903942108154, "step": 486 }, { "epoch": 1.9442231075697212, "grad_norm": 0.33054208755493164, "learning_rate": 1.8297027802398551e-06, "loss": 1.1193066835403442, "step": 488 }, { "epoch": 1.952191235059761, "grad_norm": 0.29046395421028137, "learning_rate": 1.8208775167894336e-06, "loss": 1.1333787441253662, "step": 490 }, { "epoch": 1.9601593625498008, "grad_norm": 0.4164031147956848, "learning_rate": 1.8120448255888684e-06, "loss": 0.8040657639503479, "step": 492 }, { "epoch": 1.9681274900398407, "grad_norm": 2.26190185546875, "learning_rate": 1.8032050905799704e-06, "loss": 0.6147015690803528, "step": 494 }, { "epoch": 1.9760956175298805, "grad_norm": 0.3113175332546234, "learning_rate": 1.7943586960107338e-06, "loss": 1.0835860967636108, "step": 496 }, { "epoch": 1.9840637450199203, "grad_norm": 0.9167985916137695, "learning_rate": 1.785506026418631e-06, "loss": 0.8612701296806335, "step": 498 }, { "epoch": 1.9920318725099602, "grad_norm": 0.40828704833984375, "learning_rate": 1.7766474666139e-06, "loss": 1.1791200637817383, "step": 500 }, { "epoch": 2.0, "grad_norm": 1.0138927698135376, "learning_rate": 1.7677834016628158e-06, "loss": 0.616978108882904, "step": 502 }, { "epoch": 2.00796812749004, "grad_norm": 0.34795933961868286, "learning_rate": 1.7589142168709526e-06, "loss": 0.7058793306350708, "step": 504 }, { "epoch": 2.0159362549800797, "grad_norm": 0.6295813322067261, "learning_rate": 1.7500402977664356e-06, "loss": 0.6776050925254822, "step": 506 }, { "epoch": 2.0239043824701195, "grad_norm": 0.09192030876874924, "learning_rate": 1.741162030083181e-06, "loss": 0.5675944685935974, "step": 508 }, { "epoch": 2.0318725099601593, "grad_norm": 0.3637365400791168, "learning_rate": 1.7322797997441324e-06, "loss": 0.9497091770172119, "step": 510 }, { "epoch": 2.039840637450199, "grad_norm": 0.9041894674301147, "learning_rate": 1.7233939928444803e-06, "loss": 0.8332242369651794, "step": 512 }, { "epoch": 2.047808764940239, "grad_norm": 0.5637346506118774, "learning_rate": 1.7145049956348851e-06, "loss": 0.2743958532810211, "step": 514 }, { "epoch": 2.055776892430279, "grad_norm": 1.7006338834762573, "learning_rate": 1.7056131945046828e-06, "loss": 0.932583212852478, "step": 516 }, { "epoch": 2.0637450199203187, "grad_norm": 0.5010867118835449, "learning_rate": 1.6967189759650917e-06, "loss": 0.607980489730835, "step": 518 }, { "epoch": 2.0717131474103585, "grad_norm": 1.4660816192626953, "learning_rate": 1.6878227266324096e-06, "loss": 0.9215792417526245, "step": 520 }, { "epoch": 2.0796812749003983, "grad_norm": 0.5361967086791992, "learning_rate": 1.6789248332112101e-06, "loss": 1.0473201274871826, "step": 522 }, { "epoch": 2.087649402390438, "grad_norm": 0.8016868233680725, "learning_rate": 1.6700256824775327e-06, "loss": 1.0217143297195435, "step": 524 }, { "epoch": 2.095617529880478, "grad_norm": 1.3889236450195312, "learning_rate": 1.6611256612620702e-06, "loss": 0.4960322380065918, "step": 526 }, { "epoch": 2.103585657370518, "grad_norm": 0.5141186714172363, "learning_rate": 1.6522251564333527e-06, "loss": 0.8312227725982666, "step": 528 }, { "epoch": 2.1115537848605577, "grad_norm": 0.3196059763431549, "learning_rate": 1.6433245548809335e-06, "loss": 0.7172638773918152, "step": 530 }, { "epoch": 2.1195219123505975, "grad_norm": 0.35435977578163147, "learning_rate": 1.6344242434985692e-06, "loss": 0.44665032625198364, "step": 532 }, { "epoch": 2.1274900398406373, "grad_norm": 0.41664063930511475, "learning_rate": 1.6255246091674037e-06, "loss": 0.8600306510925293, "step": 534 }, { "epoch": 2.135458167330677, "grad_norm": 14.959745407104492, "learning_rate": 1.61662603873915e-06, "loss": 0.6659224629402161, "step": 536 }, { "epoch": 2.143426294820717, "grad_norm": 0.8924800753593445, "learning_rate": 1.607728919019277e-06, "loss": 0.6953543424606323, "step": 538 }, { "epoch": 2.151394422310757, "grad_norm": 0.5326530337333679, "learning_rate": 1.5988336367501924e-06, "loss": 1.0279146432876587, "step": 540 }, { "epoch": 2.1593625498007967, "grad_norm": 1.0475772619247437, "learning_rate": 1.5899405785944315e-06, "loss": 0.9707013368606567, "step": 542 }, { "epoch": 2.1673306772908365, "grad_norm": 0.4958283305168152, "learning_rate": 1.5810501311178543e-06, "loss": 0.9405574798583984, "step": 544 }, { "epoch": 2.1752988047808763, "grad_norm": 0.49418023228645325, "learning_rate": 1.5721626807728383e-06, "loss": 0.7293884754180908, "step": 546 }, { "epoch": 2.183266932270916, "grad_norm": 0.5774815678596497, "learning_rate": 1.5632786138814786e-06, "loss": 1.044211983680725, "step": 548 }, { "epoch": 2.191235059760956, "grad_norm": 0.4045267105102539, "learning_rate": 1.5543983166187998e-06, "loss": 1.1075928211212158, "step": 550 }, { "epoch": 2.199203187250996, "grad_norm": 1.0713002681732178, "learning_rate": 1.5455221749959674e-06, "loss": 0.7906201481819153, "step": 552 }, { "epoch": 2.2071713147410357, "grad_norm": 1.4011352062225342, "learning_rate": 1.5366505748435069e-06, "loss": 0.529036819934845, "step": 554 }, { "epoch": 2.2151394422310755, "grad_norm": 0.5856610536575317, "learning_rate": 1.5277839017945342e-06, "loss": 0.6787835359573364, "step": 556 }, { "epoch": 2.2231075697211153, "grad_norm": 5.952252388000488, "learning_rate": 1.5189225412679937e-06, "loss": 0.8648924231529236, "step": 558 }, { "epoch": 2.231075697211155, "grad_norm": 0.513113260269165, "learning_rate": 1.5100668784519027e-06, "loss": 0.9975270628929138, "step": 560 }, { "epoch": 2.239043824701195, "grad_norm": 0.4902089536190033, "learning_rate": 1.5012172982866095e-06, "loss": 1.0877983570098877, "step": 562 }, { "epoch": 2.247011952191235, "grad_norm": 1.1996077299118042, "learning_rate": 1.4923741854480581e-06, "loss": 0.8970789909362793, "step": 564 }, { "epoch": 2.2549800796812747, "grad_norm": 0.8139443397521973, "learning_rate": 1.4835379243310724e-06, "loss": 0.7339902520179749, "step": 566 }, { "epoch": 2.2629482071713145, "grad_norm": 0.3560384511947632, "learning_rate": 1.4747088990326413e-06, "loss": 0.8087087869644165, "step": 568 }, { "epoch": 2.2709163346613543, "grad_norm": 0.4162435531616211, "learning_rate": 1.4658874933352252e-06, "loss": 1.0762511491775513, "step": 570 }, { "epoch": 2.278884462151394, "grad_norm": 0.9825732707977295, "learning_rate": 1.4570740906900752e-06, "loss": 0.6623214483261108, "step": 572 }, { "epoch": 2.2868525896414345, "grad_norm": 0.3757762908935547, "learning_rate": 1.448269074200563e-06, "loss": 0.7657751441001892, "step": 574 }, { "epoch": 2.2948207171314743, "grad_norm": 0.5589696168899536, "learning_rate": 1.4394728266055265e-06, "loss": 0.3244088888168335, "step": 576 }, { "epoch": 2.302788844621514, "grad_norm": 0.7052986025810242, "learning_rate": 1.4306857302626383e-06, "loss": 0.6619569659233093, "step": 578 }, { "epoch": 2.310756972111554, "grad_norm": 0.425970196723938, "learning_rate": 1.4219081671317795e-06, "loss": 0.6385777592658997, "step": 580 }, { "epoch": 2.318725099601594, "grad_norm": 0.6843652129173279, "learning_rate": 1.4131405187584408e-06, "loss": 0.7549704909324646, "step": 582 }, { "epoch": 2.3266932270916336, "grad_norm": 0.3465102016925812, "learning_rate": 1.4043831662571323e-06, "loss": 0.8688426613807678, "step": 584 }, { "epoch": 2.3346613545816735, "grad_norm": 0.3654971122741699, "learning_rate": 1.3956364902948247e-06, "loss": 0.6648116707801819, "step": 586 }, { "epoch": 2.3426294820717133, "grad_norm": 0.744594395160675, "learning_rate": 1.3869008710743948e-06, "loss": 0.9290102124214172, "step": 588 }, { "epoch": 2.350597609561753, "grad_norm": 0.4022054672241211, "learning_rate": 1.378176688318103e-06, "loss": 1.0422040224075317, "step": 590 }, { "epoch": 2.358565737051793, "grad_norm": 0.5497289896011353, "learning_rate": 1.3694643212510864e-06, "loss": 1.1029999256134033, "step": 592 }, { "epoch": 2.366533864541833, "grad_norm": 0.5594106912612915, "learning_rate": 1.3607641485848747e-06, "loss": 0.7941989898681641, "step": 594 }, { "epoch": 2.3745019920318726, "grad_norm": 0.5195335149765015, "learning_rate": 1.352076548500928e-06, "loss": 1.055949091911316, "step": 596 }, { "epoch": 2.3824701195219125, "grad_norm": 0.7671335935592651, "learning_rate": 1.343401898634197e-06, "loss": 0.747549831867218, "step": 598 }, { "epoch": 2.3904382470119523, "grad_norm": 5.928802490234375, "learning_rate": 1.3347405760567109e-06, "loss": 1.0375477075576782, "step": 600 }, { "epoch": 2.398406374501992, "grad_norm": 1.452104926109314, "learning_rate": 1.326092957261183e-06, "loss": 1.0605626106262207, "step": 602 }, { "epoch": 2.406374501992032, "grad_norm": 0.45336440205574036, "learning_rate": 1.317459418144647e-06, "loss": 0.6427274942398071, "step": 604 }, { "epoch": 2.414342629482072, "grad_norm": 0.9673005938529968, "learning_rate": 1.308840333992118e-06, "loss": 0.2522476315498352, "step": 606 }, { "epoch": 2.4223107569721116, "grad_norm": 0.3540583848953247, "learning_rate": 1.3002360794602787e-06, "loss": 0.6453299522399902, "step": 608 }, { "epoch": 2.4302788844621515, "grad_norm": 0.20231011509895325, "learning_rate": 1.2916470285611936e-06, "loss": 0.9828154444694519, "step": 610 }, { "epoch": 2.4382470119521913, "grad_norm": 0.3596932291984558, "learning_rate": 1.283073554646051e-06, "loss": 0.7582497000694275, "step": 612 }, { "epoch": 2.446215139442231, "grad_norm": 0.47144603729248047, "learning_rate": 1.274516030388936e-06, "loss": 0.6129385232925415, "step": 614 }, { "epoch": 2.454183266932271, "grad_norm": 0.6285446286201477, "learning_rate": 1.2659748277706292e-06, "loss": 0.9842470288276672, "step": 616 }, { "epoch": 2.462151394422311, "grad_norm": 0.7511582970619202, "learning_rate": 1.257450318062436e-06, "loss": 0.8260526657104492, "step": 618 }, { "epoch": 2.4701195219123506, "grad_norm": 1.097211480140686, "learning_rate": 1.2489428718100534e-06, "loss": 0.9079670310020447, "step": 620 }, { "epoch": 2.4780876494023905, "grad_norm": 0.4593997895717621, "learning_rate": 1.2404528588174562e-06, "loss": 0.9143153429031372, "step": 622 }, { "epoch": 2.4860557768924303, "grad_norm": 0.5682267546653748, "learning_rate": 1.2319806481308265e-06, "loss": 0.8137405514717102, "step": 624 }, { "epoch": 2.49402390438247, "grad_norm": 3.3198330402374268, "learning_rate": 1.2235266080225118e-06, "loss": 0.705787718296051, "step": 626 }, { "epoch": 2.50199203187251, "grad_norm": 0.6641273498535156, "learning_rate": 1.2150911059750159e-06, "loss": 1.0320712327957153, "step": 628 }, { "epoch": 2.50996015936255, "grad_norm": 0.8049502372741699, "learning_rate": 1.2066745086650239e-06, "loss": 0.5817446708679199, "step": 630 }, { "epoch": 2.5179282868525896, "grad_norm": 1.7005666494369507, "learning_rate": 1.1982771819474656e-06, "loss": 0.49497243762016296, "step": 632 }, { "epoch": 2.5258964143426295, "grad_norm": 0.48148971796035767, "learning_rate": 1.1898994908396118e-06, "loss": 1.0635162591934204, "step": 634 }, { "epoch": 2.5338645418326693, "grad_norm": 1.0913236141204834, "learning_rate": 1.1815417995052062e-06, "loss": 0.9964134693145752, "step": 636 }, { "epoch": 2.541832669322709, "grad_norm": 1.4037988185882568, "learning_rate": 1.173204471238638e-06, "loss": 1.0414139032363892, "step": 638 }, { "epoch": 2.549800796812749, "grad_norm": 0.8760337233543396, "learning_rate": 1.1648878684491478e-06, "loss": 0.8619127869606018, "step": 640 }, { "epoch": 2.557768924302789, "grad_norm": 0.4692465662956238, "learning_rate": 1.1565923526450775e-06, "loss": 1.0320566892623901, "step": 642 }, { "epoch": 2.5657370517928286, "grad_norm": 0.36346501111984253, "learning_rate": 1.148318284418153e-06, "loss": 1.2459909915924072, "step": 644 }, { "epoch": 2.5737051792828685, "grad_norm": 0.4973103106021881, "learning_rate": 1.1400660234278099e-06, "loss": 0.5828899145126343, "step": 646 }, { "epoch": 2.5816733067729083, "grad_norm": 0.5622968077659607, "learning_rate": 1.1318359283855633e-06, "loss": 1.0787078142166138, "step": 648 }, { "epoch": 2.589641434262948, "grad_norm": 0.41192811727523804, "learning_rate": 1.1236283570394122e-06, "loss": 0.8327584862709045, "step": 650 }, { "epoch": 2.597609561752988, "grad_norm": 4.0215840339660645, "learning_rate": 1.1154436661582873e-06, "loss": 0.997661828994751, "step": 652 }, { "epoch": 2.605577689243028, "grad_norm": 0.40926310420036316, "learning_rate": 1.1072822115165488e-06, "loss": 0.6634190678596497, "step": 654 }, { "epoch": 2.6135458167330676, "grad_norm": 0.556077241897583, "learning_rate": 1.0991443478785146e-06, "loss": 0.38381195068359375, "step": 656 }, { "epoch": 2.6215139442231075, "grad_norm": 0.8958475589752197, "learning_rate": 1.0910304289830445e-06, "loss": 0.5694887042045593, "step": 658 }, { "epoch": 2.6294820717131473, "grad_norm": 0.31501585245132446, "learning_rate": 1.08294080752816e-06, "loss": 0.9161617159843445, "step": 660 }, { "epoch": 2.637450199203187, "grad_norm": 0.8538661003112793, "learning_rate": 1.074875835155716e-06, "loss": 1.10606050491333, "step": 662 }, { "epoch": 2.645418326693227, "grad_norm": 0.8182650804519653, "learning_rate": 1.0668358624361148e-06, "loss": 0.672410249710083, "step": 664 }, { "epoch": 2.653386454183267, "grad_norm": 1.2558993101119995, "learning_rate": 1.0588212388530662e-06, "loss": 0.969312310218811, "step": 666 }, { "epoch": 2.6613545816733066, "grad_norm": 0.24255388975143433, "learning_rate": 1.050832312788396e-06, "loss": 0.5811322331428528, "step": 668 }, { "epoch": 2.6693227091633465, "grad_norm": 0.8314858675003052, "learning_rate": 1.042869431506906e-06, "loss": 0.7844950556755066, "step": 670 }, { "epoch": 2.6772908366533863, "grad_norm": 0.1939828097820282, "learning_rate": 1.034932941141274e-06, "loss": 0.44791269302368164, "step": 672 }, { "epoch": 2.685258964143426, "grad_norm": 0.3819931745529175, "learning_rate": 1.0270231866770115e-06, "loss": 1.0550973415374756, "step": 674 }, { "epoch": 2.6932270916334664, "grad_norm": 0.8612610101699829, "learning_rate": 1.019140511937465e-06, "loss": 0.7458590269088745, "step": 676 }, { "epoch": 2.7011952191235062, "grad_norm": 0.5282925963401794, "learning_rate": 1.011285259568875e-06, "loss": 0.6542291641235352, "step": 678 }, { "epoch": 2.709163346613546, "grad_norm": 0.45834335684776306, "learning_rate": 1.0034577710254754e-06, "loss": 1.0501289367675781, "step": 680 }, { "epoch": 2.717131474103586, "grad_norm": 0.4280013144016266, "learning_rate": 9.956583865546576e-07, "loss": 1.0795950889587402, "step": 682 }, { "epoch": 2.7250996015936257, "grad_norm": 0.4833294451236725, "learning_rate": 9.878874451821777e-07, "loss": 0.7288949489593506, "step": 684 }, { "epoch": 2.7330677290836656, "grad_norm": 1.0351470708847046, "learning_rate": 9.801452846974161e-07, "loss": 0.5564233660697937, "step": 686 }, { "epoch": 2.7410358565737054, "grad_norm": 0.41580677032470703, "learning_rate": 9.724322416387011e-07, "loss": 1.0649595260620117, "step": 688 }, { "epoch": 2.7490039840637452, "grad_norm": 1.2443572282791138, "learning_rate": 9.647486512786754e-07, "loss": 0.5668457746505737, "step": 690 }, { "epoch": 2.756972111553785, "grad_norm": 0.47028157114982605, "learning_rate": 9.570948476097252e-07, "loss": 1.165488839149475, "step": 692 }, { "epoch": 2.764940239043825, "grad_norm": 1.4404064416885376, "learning_rate": 9.494711633294586e-07, "loss": 0.9548084735870361, "step": 694 }, { "epoch": 2.7729083665338647, "grad_norm": 2.1831867694854736, "learning_rate": 9.41877929826247e-07, "loss": 0.8034579753875732, "step": 696 }, { "epoch": 2.7808764940239046, "grad_norm": 0.36618274450302124, "learning_rate": 9.343154771648201e-07, "loss": 0.9508860111236572, "step": 698 }, { "epoch": 2.7888446215139444, "grad_norm": 0.451259970664978, "learning_rate": 9.267841340719161e-07, "loss": 1.0815647840499878, "step": 700 }, { "epoch": 2.7968127490039842, "grad_norm": 0.457039475440979, "learning_rate": 9.192842279219954e-07, "loss": 0.7261126041412354, "step": 702 }, { "epoch": 2.804780876494024, "grad_norm": 0.9726790189743042, "learning_rate": 9.118160847230074e-07, "loss": 1.3043347597122192, "step": 704 }, { "epoch": 2.812749003984064, "grad_norm": 0.2558709681034088, "learning_rate": 9.043800291022225e-07, "loss": 1.290697455406189, "step": 706 }, { "epoch": 2.8207171314741037, "grad_norm": 0.5993645787239075, "learning_rate": 8.969763842921183e-07, "loss": 1.1300464868545532, "step": 708 }, { "epoch": 2.8286852589641436, "grad_norm": 0.21556946635246277, "learning_rate": 8.89605472116331e-07, "loss": 0.5522478818893433, "step": 710 }, { "epoch": 2.8366533864541834, "grad_norm": 0.5506062507629395, "learning_rate": 8.822676129756673e-07, "loss": 1.087594985961914, "step": 712 }, { "epoch": 2.8446215139442232, "grad_norm": 0.34245195984840393, "learning_rate": 8.749631258341722e-07, "loss": 0.6451638340950012, "step": 714 }, { "epoch": 2.852589641434263, "grad_norm": 0.330020934343338, "learning_rate": 8.676923282052703e-07, "loss": 1.0628772974014282, "step": 716 }, { "epoch": 2.860557768924303, "grad_norm": 1.4934767484664917, "learning_rate": 8.604555361379602e-07, "loss": 0.6799197196960449, "step": 718 }, { "epoch": 2.8685258964143427, "grad_norm": 0.47209084033966064, "learning_rate": 8.532530642030793e-07, "loss": 0.6012248396873474, "step": 720 }, { "epoch": 2.8764940239043826, "grad_norm": 1.5172817707061768, "learning_rate": 8.46085225479626e-07, "loss": 0.8212974667549133, "step": 722 }, { "epoch": 2.8844621513944224, "grad_norm": 0.3270165026187897, "learning_rate": 8.389523315411531e-07, "loss": 0.6892848610877991, "step": 724 }, { "epoch": 2.8924302788844622, "grad_norm": 0.34172943234443665, "learning_rate": 8.318546924422257e-07, "loss": 1.040250539779663, "step": 726 }, { "epoch": 2.900398406374502, "grad_norm": 0.3343369960784912, "learning_rate": 8.247926167049404e-07, "loss": 0.591463029384613, "step": 728 }, { "epoch": 2.908366533864542, "grad_norm": 0.37521132826805115, "learning_rate": 8.177664113055171e-07, "loss": 1.0062123537063599, "step": 730 }, { "epoch": 2.9163346613545817, "grad_norm": 0.31653112173080444, "learning_rate": 8.107763816609526e-07, "loss": 0.48367780447006226, "step": 732 }, { "epoch": 2.9243027888446216, "grad_norm": 0.6464031338691711, "learning_rate": 8.038228316157484e-07, "loss": 1.0752416849136353, "step": 734 }, { "epoch": 2.9322709163346614, "grad_norm": 1.5205926895141602, "learning_rate": 7.96906063428699e-07, "loss": 0.4515492022037506, "step": 736 }, { "epoch": 2.9402390438247012, "grad_norm": 1.017087459564209, "learning_rate": 7.900263777597554e-07, "loss": 0.9680182337760925, "step": 738 }, { "epoch": 2.948207171314741, "grad_norm": 0.33580952882766724, "learning_rate": 7.831840736569573e-07, "loss": 1.2022464275360107, "step": 740 }, { "epoch": 2.956175298804781, "grad_norm": 0.9194537997245789, "learning_rate": 7.763794485434306e-07, "loss": 0.5004313588142395, "step": 742 }, { "epoch": 2.9641434262948207, "grad_norm": 0.4175858199596405, "learning_rate": 7.696127982044607e-07, "loss": 0.9552463889122009, "step": 744 }, { "epoch": 2.9721115537848606, "grad_norm": 0.7210242748260498, "learning_rate": 7.628844167746361e-07, "loss": 0.9815269112586975, "step": 746 }, { "epoch": 2.9800796812749004, "grad_norm": 1.3052399158477783, "learning_rate": 7.561945967250626e-07, "loss": 0.5944569706916809, "step": 748 }, { "epoch": 2.9880478087649402, "grad_norm": 0.40495985746383667, "learning_rate": 7.495436288506475e-07, "loss": 0.1485452651977539, "step": 750 }, { "epoch": 2.99601593625498, "grad_norm": 0.40999165177345276, "learning_rate": 7.429318022574623e-07, "loss": 1.0614784955978394, "step": 752 }, { "epoch": 3.00398406374502, "grad_norm": 0.4904988408088684, "learning_rate": 7.363594043501752e-07, "loss": 0.5685489177703857, "step": 754 }, { "epoch": 3.0119521912350598, "grad_norm": 0.6486766338348389, "learning_rate": 7.298267208195567e-07, "loss": 0.6701474785804749, "step": 756 }, { "epoch": 3.0199203187250996, "grad_norm": 0.38344699144363403, "learning_rate": 7.233340356300632e-07, "loss": 0.4652033746242523, "step": 758 }, { "epoch": 3.0278884462151394, "grad_norm": 0.3574877679347992, "learning_rate": 7.16881631007491e-07, "loss": 0.7689741849899292, "step": 760 }, { "epoch": 3.0358565737051793, "grad_norm": 0.7075855731964111, "learning_rate": 7.1046978742671e-07, "loss": 0.571841835975647, "step": 762 }, { "epoch": 3.043824701195219, "grad_norm": 0.4493393898010254, "learning_rate": 7.040987835994727e-07, "loss": 1.018668532371521, "step": 764 }, { "epoch": 3.051792828685259, "grad_norm": 1.3887406587600708, "learning_rate": 6.977688964622978e-07, "loss": 0.19836187362670898, "step": 766 }, { "epoch": 3.0597609561752988, "grad_norm": 0.565453290939331, "learning_rate": 6.914804011644326e-07, "loss": 0.7899960875511169, "step": 768 }, { "epoch": 3.0677290836653386, "grad_norm": 0.6862173676490784, "learning_rate": 6.852335710558922e-07, "loss": 0.9098981618881226, "step": 770 }, { "epoch": 3.0756972111553784, "grad_norm": 0.5152872204780579, "learning_rate": 6.790286776755779e-07, "loss": 1.0554358959197998, "step": 772 }, { "epoch": 3.0836653386454183, "grad_norm": 1.018154263496399, "learning_rate": 6.728659907394755e-07, "loss": 0.9298610091209412, "step": 774 }, { "epoch": 3.091633466135458, "grad_norm": 0.14560498297214508, "learning_rate": 6.667457781289271e-07, "loss": 0.60512375831604, "step": 776 }, { "epoch": 3.099601593625498, "grad_norm": 0.48420849442481995, "learning_rate": 6.606683058789922e-07, "loss": 0.6643580198287964, "step": 778 }, { "epoch": 3.1075697211155378, "grad_norm": 0.40179142355918884, "learning_rate": 6.546338381668782e-07, "loss": 1.0011398792266846, "step": 780 }, { "epoch": 3.1155378486055776, "grad_norm": 0.8947471380233765, "learning_rate": 6.486426373004613e-07, "loss": 0.41442957520484924, "step": 782 }, { "epoch": 3.1235059760956174, "grad_norm": 0.4902297556400299, "learning_rate": 6.42694963706882e-07, "loss": 1.1005361080169678, "step": 784 }, { "epoch": 3.1314741035856573, "grad_norm": 0.35919007658958435, "learning_rate": 6.367910759212253e-07, "loss": 0.8759674429893494, "step": 786 }, { "epoch": 3.139442231075697, "grad_norm": 1.3167163133621216, "learning_rate": 6.309312305752845e-07, "loss": 0.718060314655304, "step": 788 }, { "epoch": 3.147410358565737, "grad_norm": 0.24750226736068726, "learning_rate": 6.251156823864013e-07, "loss": 0.6157811284065247, "step": 790 }, { "epoch": 3.1553784860557768, "grad_norm": 1.525368094444275, "learning_rate": 6.19344684146399e-07, "loss": 0.8752233982086182, "step": 792 }, { "epoch": 3.1633466135458166, "grad_norm": 0.071062371134758, "learning_rate": 6.136184867105907e-07, "loss": 0.3096681237220764, "step": 794 }, { "epoch": 3.1713147410358564, "grad_norm": 0.767979085445404, "learning_rate": 6.079373389868767e-07, "loss": 0.7163103818893433, "step": 796 }, { "epoch": 3.1792828685258963, "grad_norm": 0.4465477168560028, "learning_rate": 6.023014879249236e-07, "loss": 0.5860901474952698, "step": 798 }, { "epoch": 3.187250996015936, "grad_norm": 1.0776124000549316, "learning_rate": 5.967111785054299e-07, "loss": 0.9502109289169312, "step": 800 }, { "epoch": 3.195219123505976, "grad_norm": 0.3106825649738312, "learning_rate": 5.91166653729479e-07, "loss": 0.7821487188339233, "step": 802 }, { "epoch": 3.2031872509960158, "grad_norm": 5.8575263023376465, "learning_rate": 5.85668154607974e-07, "loss": 0.7681574821472168, "step": 804 }, { "epoch": 3.2111553784860556, "grad_norm": 1.7226320505142212, "learning_rate": 5.802159201511634e-07, "loss": 1.3093531131744385, "step": 806 }, { "epoch": 3.2191235059760954, "grad_norm": 0.9542534947395325, "learning_rate": 5.748101873582492e-07, "loss": 0.5746437907218933, "step": 808 }, { "epoch": 3.2270916334661353, "grad_norm": 0.6066504716873169, "learning_rate": 5.69451191207088e-07, "loss": 0.8110766410827637, "step": 810 }, { "epoch": 3.235059760956175, "grad_norm": 0.8804686665534973, "learning_rate": 5.641391646439746e-07, "loss": 0.42266717553138733, "step": 812 }, { "epoch": 3.243027888446215, "grad_norm": 0.3170583248138428, "learning_rate": 5.588743385735169e-07, "loss": 0.9301043748855591, "step": 814 }, { "epoch": 3.2509960159362548, "grad_norm": 3.740297317504883, "learning_rate": 5.536569418486005e-07, "loss": 0.4588174819946289, "step": 816 }, { "epoch": 3.2589641434262946, "grad_norm": 0.40485963225364685, "learning_rate": 5.484872012604372e-07, "loss": 1.029528260231018, "step": 818 }, { "epoch": 3.2669322709163344, "grad_norm": 1.3435243368148804, "learning_rate": 5.433653415287097e-07, "loss": 0.6181462407112122, "step": 820 }, { "epoch": 3.2749003984063743, "grad_norm": 0.7431517243385315, "learning_rate": 5.38291585291804e-07, "loss": 0.8793559074401855, "step": 822 }, { "epoch": 3.2828685258964145, "grad_norm": 0.7615469098091125, "learning_rate": 5.332661530971281e-07, "loss": 0.6390237808227539, "step": 824 }, { "epoch": 3.2908366533864544, "grad_norm": 0.2795827388763428, "learning_rate": 5.282892633915299e-07, "loss": 1.0185526609420776, "step": 826 }, { "epoch": 3.298804780876494, "grad_norm": 0.4049527943134308, "learning_rate": 5.23361132511797e-07, "loss": 1.022945523262024, "step": 828 }, { "epoch": 3.306772908366534, "grad_norm": 0.4463924169540405, "learning_rate": 5.184819746752567e-07, "loss": 0.9758146405220032, "step": 830 }, { "epoch": 3.314741035856574, "grad_norm": 1.3453718423843384, "learning_rate": 5.136520019704622e-07, "loss": 0.6447398662567139, "step": 832 }, { "epoch": 3.3227091633466137, "grad_norm": 0.7485759258270264, "learning_rate": 5.088714243479742e-07, "loss": 0.3966885209083557, "step": 834 }, { "epoch": 3.3306772908366535, "grad_norm": 2.261605739593506, "learning_rate": 5.041404496112345e-07, "loss": 0.6890445351600647, "step": 836 }, { "epoch": 3.3386454183266934, "grad_norm": 0.7466967701911926, "learning_rate": 4.994592834075328e-07, "loss": 0.9410973191261292, "step": 838 }, { "epoch": 3.346613545816733, "grad_norm": 1.0626827478408813, "learning_rate": 4.948281292190692e-07, "loss": 0.24332815408706665, "step": 840 }, { "epoch": 3.354581673306773, "grad_norm": 0.5983356833457947, "learning_rate": 4.90247188354107e-07, "loss": 0.8691646456718445, "step": 842 }, { "epoch": 3.362549800796813, "grad_norm": 0.5014941692352295, "learning_rate": 4.857166599382236e-07, "loss": 0.7781885862350464, "step": 844 }, { "epoch": 3.3705179282868527, "grad_norm": 0.3870190382003784, "learning_rate": 4.812367409056543e-07, "loss": 1.034981608390808, "step": 846 }, { "epoch": 3.3784860557768925, "grad_norm": 0.40831807255744934, "learning_rate": 4.7680762599073167e-07, "loss": 1.0040619373321533, "step": 848 }, { "epoch": 3.3864541832669324, "grad_norm": 1.0064787864685059, "learning_rate": 4.72429507719422e-07, "loss": 1.1909987926483154, "step": 850 }, { "epoch": 3.394422310756972, "grad_norm": 0.34172093868255615, "learning_rate": 4.681025764009543e-07, "loss": 1.0179778337478638, "step": 852 }, { "epoch": 3.402390438247012, "grad_norm": 0.6938625574111938, "learning_rate": 4.638270201195511e-07, "loss": 0.6021140813827515, "step": 854 }, { "epoch": 3.410358565737052, "grad_norm": 0.6337777972221375, "learning_rate": 4.5960302472624914e-07, "loss": 0.9684332013130188, "step": 856 }, { "epoch": 3.4183266932270917, "grad_norm": 0.36189448833465576, "learning_rate": 4.554307738308239e-07, "loss": 1.1349108219146729, "step": 858 }, { "epoch": 3.4262948207171315, "grad_norm": 1.1650807857513428, "learning_rate": 4.513104487938059e-07, "loss": 1.068455696105957, "step": 860 }, { "epoch": 3.4342629482071714, "grad_norm": 0.4551082253456116, "learning_rate": 4.4724222871859915e-07, "loss": 0.7791970372200012, "step": 862 }, { "epoch": 3.442231075697211, "grad_norm": 2.264080047607422, "learning_rate": 4.4322629044369557e-07, "loss": 0.7574501633644104, "step": 864 }, { "epoch": 3.450199203187251, "grad_norm": 0.4233779013156891, "learning_rate": 4.392628085349856e-07, "loss": 0.947033166885376, "step": 866 }, { "epoch": 3.458167330677291, "grad_norm": 0.6929165720939636, "learning_rate": 4.3535195527817413e-07, "loss": 0.8224045634269714, "step": 868 }, { "epoch": 3.4661354581673307, "grad_norm": 0.4947459399700165, "learning_rate": 4.3149390067128893e-07, "loss": 0.5549885630607605, "step": 870 }, { "epoch": 3.4741035856573705, "grad_norm": 0.18183910846710205, "learning_rate": 4.2768881241729196e-07, "loss": 0.6329528093338013, "step": 872 }, { "epoch": 3.4820717131474104, "grad_norm": 0.28913554549217224, "learning_rate": 4.239368559167891e-07, "loss": 0.760714590549469, "step": 874 }, { "epoch": 3.49003984063745, "grad_norm": 0.44314852356910706, "learning_rate": 4.2023819426084043e-07, "loss": 0.6140300631523132, "step": 876 }, { "epoch": 3.49800796812749, "grad_norm": 0.6128693222999573, "learning_rate": 4.165929882238723e-07, "loss": 0.5876585245132446, "step": 878 }, { "epoch": 3.50597609561753, "grad_norm": 0.18565566837787628, "learning_rate": 4.130013962566869e-07, "loss": 0.5421848297119141, "step": 880 }, { "epoch": 3.5139442231075697, "grad_norm": 0.5768609046936035, "learning_rate": 4.094635744795763e-07, "loss": 0.8825575709342957, "step": 882 }, { "epoch": 3.5219123505976095, "grad_norm": 0.9254052042961121, "learning_rate": 4.059796766755343e-07, "loss": 0.308013379573822, "step": 884 }, { "epoch": 3.5298804780876494, "grad_norm": 1.7089602947235107, "learning_rate": 4.0254985428357405e-07, "loss": 0.9074363708496094, "step": 886 }, { "epoch": 3.537848605577689, "grad_norm": 0.46798571944236755, "learning_rate": 3.9917425639214304e-07, "loss": 0.9869987368583679, "step": 888 }, { "epoch": 3.545816733067729, "grad_norm": 1.8204545974731445, "learning_rate": 3.9585302973264424e-07, "loss": 0.5235753059387207, "step": 890 }, { "epoch": 3.553784860557769, "grad_norm": 0.8664906620979309, "learning_rate": 3.9258631867305723e-07, "loss": 1.2101613283157349, "step": 892 }, { "epoch": 3.5617529880478087, "grad_norm": 0.4034668207168579, "learning_rate": 3.893742652116616e-07, "loss": 1.0782972574234009, "step": 894 }, { "epoch": 3.5697211155378485, "grad_norm": 0.37115031480789185, "learning_rate": 3.8621700897086686e-07, "loss": 0.25802165269851685, "step": 896 }, { "epoch": 3.5776892430278884, "grad_norm": 0.4131646156311035, "learning_rate": 3.8311468719114215e-07, "loss": 0.6915317177772522, "step": 898 }, { "epoch": 3.585657370517928, "grad_norm": 0.5455676913261414, "learning_rate": 3.8006743472504945e-07, "loss": 0.3316478133201599, "step": 900 }, { "epoch": 3.593625498007968, "grad_norm": 0.9060930609703064, "learning_rate": 3.7707538403138413e-07, "loss": 0.11103206127882004, "step": 902 }, { "epoch": 3.601593625498008, "grad_norm": 0.5972309708595276, "learning_rate": 3.7413866516941513e-07, "loss": 0.8609724640846252, "step": 904 }, { "epoch": 3.6095617529880477, "grad_norm": 2.9889116287231445, "learning_rate": 3.712574057932332e-07, "loss": 0.7271422147750854, "step": 906 }, { "epoch": 3.6175298804780875, "grad_norm": 0.5200643539428711, "learning_rate": 3.684317311461999e-07, "loss": 0.9715712070465088, "step": 908 }, { "epoch": 3.6254980079681274, "grad_norm": 1.3307124376296997, "learning_rate": 3.656617640555063e-07, "loss": 0.9659046530723572, "step": 910 }, { "epoch": 3.633466135458167, "grad_norm": 0.22726085782051086, "learning_rate": 3.6294762492683114e-07, "loss": 0.06467059254646301, "step": 912 }, { "epoch": 3.641434262948207, "grad_norm": 0.4885714054107666, "learning_rate": 3.6028943173910846e-07, "loss": 0.8727531433105469, "step": 914 }, { "epoch": 3.649402390438247, "grad_norm": 1.1886482238769531, "learning_rate": 3.5768730003939934e-07, "loss": 0.8834037184715271, "step": 916 }, { "epoch": 3.6573705179282867, "grad_norm": 6.707517147064209, "learning_rate": 3.551413429378685e-07, "loss": 0.28755173087120056, "step": 918 }, { "epoch": 3.6653386454183265, "grad_norm": 0.6478546857833862, "learning_rate": 3.526516711028687e-07, "loss": 1.0173288583755493, "step": 920 }, { "epoch": 3.6733067729083664, "grad_norm": 0.3290223479270935, "learning_rate": 3.502183927561286e-07, "loss": 0.20666202902793884, "step": 922 }, { "epoch": 3.681274900398406, "grad_norm": 0.8621473908424377, "learning_rate": 3.478416136680499e-07, "loss": 1.1904857158660889, "step": 924 }, { "epoch": 3.6892430278884465, "grad_norm": 6.629832744598389, "learning_rate": 3.455214371531096e-07, "loss": 0.448214590549469, "step": 926 }, { "epoch": 3.6972111553784863, "grad_norm": 0.4641083776950836, "learning_rate": 3.432579640653678e-07, "loss": 0.6058546304702759, "step": 928 }, { "epoch": 3.705179282868526, "grad_norm": 0.16589830815792084, "learning_rate": 3.4105129279408574e-07, "loss": 0.9167323112487793, "step": 930 }, { "epoch": 3.713147410358566, "grad_norm": 0.8476380109786987, "learning_rate": 3.389015192594471e-07, "loss": 1.018453598022461, "step": 932 }, { "epoch": 3.721115537848606, "grad_norm": 1.218807578086853, "learning_rate": 3.3680873690839e-07, "loss": 0.6338685154914856, "step": 934 }, { "epoch": 3.7290836653386457, "grad_norm": 0.4406608045101166, "learning_rate": 3.347730367105437e-07, "loss": 0.9283577799797058, "step": 936 }, { "epoch": 3.7370517928286855, "grad_norm": 1.626468539237976, "learning_rate": 3.327945071542754e-07, "loss": 0.9253164529800415, "step": 938 }, { "epoch": 3.7450199203187253, "grad_norm": 0.49745914340019226, "learning_rate": 3.308732342428437e-07, "loss": 0.8693242073059082, "step": 940 }, { "epoch": 3.752988047808765, "grad_norm": 1.1181275844573975, "learning_rate": 3.2900930149065883e-07, "loss": 0.9114285111427307, "step": 942 }, { "epoch": 3.760956175298805, "grad_norm": 0.4131788909435272, "learning_rate": 3.2720278991965424e-07, "loss": 0.5458086729049683, "step": 944 }, { "epoch": 3.768924302788845, "grad_norm": 0.3882419466972351, "learning_rate": 3.2545377805576414e-07, "loss": 1.0222781896591187, "step": 946 }, { "epoch": 3.7768924302788847, "grad_norm": 0.4516999423503876, "learning_rate": 3.2376234192550955e-07, "loss": 0.6500365138053894, "step": 948 }, { "epoch": 3.7848605577689245, "grad_norm": 0.7559425234794617, "learning_rate": 3.221285550526936e-07, "loss": 0.9952846765518188, "step": 950 }, { "epoch": 3.7928286852589643, "grad_norm": 0.43123456835746765, "learning_rate": 3.205524884552062e-07, "loss": 1.0094038248062134, "step": 952 }, { "epoch": 3.800796812749004, "grad_norm": 0.43853896856307983, "learning_rate": 3.1903421064193714e-07, "loss": 0.6855465769767761, "step": 954 }, { "epoch": 3.808764940239044, "grad_norm": 0.9686547517776489, "learning_rate": 3.1757378760979694e-07, "loss": 0.8955079317092896, "step": 956 }, { "epoch": 3.816733067729084, "grad_norm": 1.594388484954834, "learning_rate": 3.161712828408494e-07, "loss": 0.6716915965080261, "step": 958 }, { "epoch": 3.8247011952191237, "grad_norm": 1.1490174531936646, "learning_rate": 3.1482675729955115e-07, "loss": 0.8457735776901245, "step": 960 }, { "epoch": 3.8326693227091635, "grad_norm": 0.44997137784957886, "learning_rate": 3.135402694301026e-07, "loss": 0.6334148645401001, "step": 962 }, { "epoch": 3.8406374501992033, "grad_norm": 0.7384113073348999, "learning_rate": 3.123118751539064e-07, "loss": 1.0639876127243042, "step": 964 }, { "epoch": 3.848605577689243, "grad_norm": 0.7392749786376953, "learning_rate": 3.111416278671374e-07, "loss": 0.40739232301712036, "step": 966 }, { "epoch": 3.856573705179283, "grad_norm": 0.3943544030189514, "learning_rate": 3.1002957843842143e-07, "loss": 0.5916213393211365, "step": 968 }, { "epoch": 3.864541832669323, "grad_norm": 1.1247427463531494, "learning_rate": 3.0897577520662403e-07, "loss": 0.8500593304634094, "step": 970 }, { "epoch": 3.8725099601593627, "grad_norm": 0.3284965753555298, "learning_rate": 3.0798026397874904e-07, "loss": 0.631747841835022, "step": 972 }, { "epoch": 3.8804780876494025, "grad_norm": 0.27052104473114014, "learning_rate": 3.070430880279484e-07, "loss": 0.5036011934280396, "step": 974 }, { "epoch": 3.8884462151394423, "grad_norm": 0.4196729063987732, "learning_rate": 3.0616428809163936e-07, "loss": 1.0151411294937134, "step": 976 }, { "epoch": 3.896414342629482, "grad_norm": 0.604816198348999, "learning_rate": 3.053439023697359e-07, "loss": 0.5701797008514404, "step": 978 }, { "epoch": 3.904382470119522, "grad_norm": 0.21591706573963165, "learning_rate": 3.0458196652298623e-07, "loss": 0.46466773748397827, "step": 980 }, { "epoch": 3.912350597609562, "grad_norm": 0.5833430886268616, "learning_rate": 3.038785136714242e-07, "loss": 0.5528253316879272, "step": 982 }, { "epoch": 3.9203187250996017, "grad_norm": 0.3425147235393524, "learning_rate": 3.0323357439292857e-07, "loss": 0.9604100584983826, "step": 984 }, { "epoch": 3.9282868525896415, "grad_norm": 0.6084439754486084, "learning_rate": 3.026471767218946e-07, "loss": 0.1784566193819046, "step": 986 }, { "epoch": 3.9362549800796813, "grad_norm": 0.6684430241584778, "learning_rate": 3.0211934614801484e-07, "loss": 0.6319288611412048, "step": 988 }, { "epoch": 3.944223107569721, "grad_norm": 0.1623208075761795, "learning_rate": 3.016501056151714e-07, "loss": 0.6323052048683167, "step": 990 }, { "epoch": 3.952191235059761, "grad_norm": 0.598673939704895, "learning_rate": 3.01239475520439e-07, "loss": 1.1039389371871948, "step": 992 }, { "epoch": 3.960159362549801, "grad_norm": 0.10632996261119843, "learning_rate": 3.008874737131976e-07, "loss": 0.30895280838012695, "step": 994 }, { "epoch": 3.9681274900398407, "grad_norm": 0.7597146034240723, "learning_rate": 3.00594115494357e-07, "loss": 0.9472017884254456, "step": 996 }, { "epoch": 3.9760956175298805, "grad_norm": 0.3815767765045166, "learning_rate": 3.0035941361569174e-07, "loss": 0.6094083189964294, "step": 998 }, { "epoch": 3.9840637450199203, "grad_norm": 1.381508708000183, "learning_rate": 3.0018337827928646e-07, "loss": 0.5018728971481323, "step": 1000 }, { "epoch": 3.99203187250996, "grad_norm": 0.7763594388961792, "learning_rate": 3.0006601713709283e-07, "loss": 1.0228667259216309, "step": 1002 }, { "epoch": 4.0, "grad_norm": 0.1577903777360916, "learning_rate": 3.000073352905969e-07, "loss": 0.34115365147590637, "step": 1004 }, { "epoch": 4.0, "step": 1004, "total_flos": 4.038502240003031e+18, "train_loss": 0.9239894755600221, "train_runtime": 10281.2731, "train_samples_per_second": 5.859, "train_steps_per_second": 0.098 } ], "logging_steps": 2, "max_steps": 1004, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.038502240003031e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }