diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,53844 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 7686, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00039032006245121, + "grad_norm": 7.774693349857314, + "learning_rate": 1.3003901170351107e-08, + "loss": 1.1956, + "step": 1 + }, + { + "epoch": 0.00078064012490242, + "grad_norm": 7.341735052151711, + "learning_rate": 2.6007802340702215e-08, + "loss": 1.1918, + "step": 2 + }, + { + "epoch": 0.00117096018735363, + "grad_norm": 7.4185754765524745, + "learning_rate": 3.901170351105332e-08, + "loss": 1.1572, + "step": 3 + }, + { + "epoch": 0.00156128024980484, + "grad_norm": 7.455222090899191, + "learning_rate": 5.201560468140443e-08, + "loss": 1.1563, + "step": 4 + }, + { + "epoch": 0.00195160031225605, + "grad_norm": 7.7150032950907, + "learning_rate": 6.501950585175553e-08, + "loss": 1.1933, + "step": 5 + }, + { + "epoch": 0.00234192037470726, + "grad_norm": 7.471988896467603, + "learning_rate": 7.802340702210664e-08, + "loss": 1.1652, + "step": 6 + }, + { + "epoch": 0.00273224043715847, + "grad_norm": 7.397704527949466, + "learning_rate": 9.102730819245775e-08, + "loss": 1.1761, + "step": 7 + }, + { + "epoch": 0.00312256049960968, + "grad_norm": 7.292730142485289, + "learning_rate": 1.0403120936280886e-07, + "loss": 1.1585, + "step": 8 + }, + { + "epoch": 0.00351288056206089, + "grad_norm": 7.437007624053886, + "learning_rate": 1.1703511053315997e-07, + "loss": 1.2242, + "step": 9 + }, + { + "epoch": 0.0039032006245121, + "grad_norm": 7.709174435998227, + "learning_rate": 1.3003901170351105e-07, + "loss": 1.2002, + "step": 10 + }, + { + "epoch": 0.00429352068696331, + "grad_norm": 7.667001204610757, + "learning_rate": 1.4304291287386218e-07, + "loss": 1.2002, + "step": 11 + }, + { + "epoch": 0.00468384074941452, + "grad_norm": 7.482086265490851, + "learning_rate": 1.5604681404421327e-07, + "loss": 1.1979, + "step": 12 + }, + { + "epoch": 0.00507416081186573, + "grad_norm": 7.5545596591394215, + "learning_rate": 1.690507152145644e-07, + "loss": 1.1688, + "step": 13 + }, + { + "epoch": 0.00546448087431694, + "grad_norm": 7.453280670842921, + "learning_rate": 1.820546163849155e-07, + "loss": 1.1606, + "step": 14 + }, + { + "epoch": 0.00585480093676815, + "grad_norm": 7.4641125723792685, + "learning_rate": 1.9505851755526662e-07, + "loss": 1.1874, + "step": 15 + }, + { + "epoch": 0.00624512099921936, + "grad_norm": 7.661378996142737, + "learning_rate": 2.0806241872561772e-07, + "loss": 1.2172, + "step": 16 + }, + { + "epoch": 0.00663544106167057, + "grad_norm": 7.563342243768422, + "learning_rate": 2.210663198959688e-07, + "loss": 1.1903, + "step": 17 + }, + { + "epoch": 0.00702576112412178, + "grad_norm": 7.188288495216868, + "learning_rate": 2.3407022106631994e-07, + "loss": 1.1543, + "step": 18 + }, + { + "epoch": 0.00741608118657299, + "grad_norm": 6.936656698146042, + "learning_rate": 2.4707412223667104e-07, + "loss": 1.1397, + "step": 19 + }, + { + "epoch": 0.0078064012490242, + "grad_norm": 6.927763913497842, + "learning_rate": 2.600780234070221e-07, + "loss": 1.1816, + "step": 20 + }, + { + "epoch": 0.00819672131147541, + "grad_norm": 7.247568111653991, + "learning_rate": 2.7308192457737323e-07, + "loss": 1.1692, + "step": 21 + }, + { + "epoch": 0.00858704137392662, + "grad_norm": 6.760709643379415, + "learning_rate": 2.8608582574772435e-07, + "loss": 1.15, + "step": 22 + }, + { + "epoch": 0.00897736143637783, + "grad_norm": 6.75915104051349, + "learning_rate": 2.990897269180754e-07, + "loss": 1.17, + "step": 23 + }, + { + "epoch": 0.00936768149882904, + "grad_norm": 6.680467822768465, + "learning_rate": 3.1209362808842655e-07, + "loss": 1.143, + "step": 24 + }, + { + "epoch": 0.00975800156128025, + "grad_norm": 6.7023084292296105, + "learning_rate": 3.2509752925877767e-07, + "loss": 1.1357, + "step": 25 + }, + { + "epoch": 0.01014832162373146, + "grad_norm": 5.792687640288269, + "learning_rate": 3.381014304291288e-07, + "loss": 1.1214, + "step": 26 + }, + { + "epoch": 0.01053864168618267, + "grad_norm": 5.681041122863223, + "learning_rate": 3.5110533159947987e-07, + "loss": 1.1388, + "step": 27 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 5.394916559915833, + "learning_rate": 3.64109232769831e-07, + "loss": 1.0907, + "step": 28 + }, + { + "epoch": 0.01131928181108509, + "grad_norm": 5.4456228554581045, + "learning_rate": 3.7711313394018206e-07, + "loss": 1.1277, + "step": 29 + }, + { + "epoch": 0.0117096018735363, + "grad_norm": 5.362336549365938, + "learning_rate": 3.9011703511053324e-07, + "loss": 1.1148, + "step": 30 + }, + { + "epoch": 0.01209992193598751, + "grad_norm": 5.531349025954174, + "learning_rate": 4.031209362808843e-07, + "loss": 1.1594, + "step": 31 + }, + { + "epoch": 0.01249024199843872, + "grad_norm": 5.197159468510705, + "learning_rate": 4.1612483745123543e-07, + "loss": 1.1204, + "step": 32 + }, + { + "epoch": 0.01288056206088993, + "grad_norm": 4.909086820151505, + "learning_rate": 4.291287386215865e-07, + "loss": 1.087, + "step": 33 + }, + { + "epoch": 0.01327088212334114, + "grad_norm": 5.1603285367345855, + "learning_rate": 4.421326397919376e-07, + "loss": 1.1192, + "step": 34 + }, + { + "epoch": 0.01366120218579235, + "grad_norm": 5.0618558285352, + "learning_rate": 4.551365409622887e-07, + "loss": 1.1156, + "step": 35 + }, + { + "epoch": 0.01405152224824356, + "grad_norm": 3.869763915911114, + "learning_rate": 4.681404421326399e-07, + "loss": 1.0793, + "step": 36 + }, + { + "epoch": 0.01444184231069477, + "grad_norm": 3.495965781611872, + "learning_rate": 4.81144343302991e-07, + "loss": 1.0754, + "step": 37 + }, + { + "epoch": 0.01483216237314598, + "grad_norm": 3.2731948180050594, + "learning_rate": 4.941482444733421e-07, + "loss": 1.0538, + "step": 38 + }, + { + "epoch": 0.01522248243559719, + "grad_norm": 3.446368500624966, + "learning_rate": 5.071521456436931e-07, + "loss": 1.0627, + "step": 39 + }, + { + "epoch": 0.0156128024980484, + "grad_norm": 3.2559077814895026, + "learning_rate": 5.201560468140442e-07, + "loss": 1.0231, + "step": 40 + }, + { + "epoch": 0.01600312256049961, + "grad_norm": 3.207430068481201, + "learning_rate": 5.331599479843954e-07, + "loss": 1.0439, + "step": 41 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 3.027781158120358, + "learning_rate": 5.461638491547465e-07, + "loss": 1.0255, + "step": 42 + }, + { + "epoch": 0.01678376268540203, + "grad_norm": 3.0301616549501866, + "learning_rate": 5.591677503250976e-07, + "loss": 1.0483, + "step": 43 + }, + { + "epoch": 0.01717408274785324, + "grad_norm": 2.9031297667164693, + "learning_rate": 5.721716514954487e-07, + "loss": 1.0478, + "step": 44 + }, + { + "epoch": 0.01756440281030445, + "grad_norm": 2.750608825452148, + "learning_rate": 5.851755526657997e-07, + "loss": 1.0255, + "step": 45 + }, + { + "epoch": 0.01795472287275566, + "grad_norm": 2.8048268193108323, + "learning_rate": 5.981794538361509e-07, + "loss": 1.0389, + "step": 46 + }, + { + "epoch": 0.01834504293520687, + "grad_norm": 2.558961960804959, + "learning_rate": 6.11183355006502e-07, + "loss": 1.0388, + "step": 47 + }, + { + "epoch": 0.01873536299765808, + "grad_norm": 2.2383033688239786, + "learning_rate": 6.241872561768531e-07, + "loss": 1.0185, + "step": 48 + }, + { + "epoch": 0.01912568306010929, + "grad_norm": 1.993772166100342, + "learning_rate": 6.371911573472042e-07, + "loss": 0.9802, + "step": 49 + }, + { + "epoch": 0.0195160031225605, + "grad_norm": 1.9833841009506403, + "learning_rate": 6.501950585175553e-07, + "loss": 0.9882, + "step": 50 + }, + { + "epoch": 0.01990632318501171, + "grad_norm": 2.188927239036972, + "learning_rate": 6.631989596879064e-07, + "loss": 0.9883, + "step": 51 + }, + { + "epoch": 0.02029664324746292, + "grad_norm": 2.2383448396222794, + "learning_rate": 6.762028608582576e-07, + "loss": 0.996, + "step": 52 + }, + { + "epoch": 0.02068696330991413, + "grad_norm": 2.1929873628503675, + "learning_rate": 6.892067620286086e-07, + "loss": 0.9951, + "step": 53 + }, + { + "epoch": 0.02107728337236534, + "grad_norm": 2.135081031329436, + "learning_rate": 7.022106631989597e-07, + "loss": 0.9548, + "step": 54 + }, + { + "epoch": 0.02146760343481655, + "grad_norm": 2.027651109908756, + "learning_rate": 7.15214564369311e-07, + "loss": 0.952, + "step": 55 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 1.8497743547497913, + "learning_rate": 7.28218465539662e-07, + "loss": 0.9468, + "step": 56 + }, + { + "epoch": 0.02224824355971897, + "grad_norm": 1.7666063690901244, + "learning_rate": 7.412223667100131e-07, + "loss": 0.938, + "step": 57 + }, + { + "epoch": 0.02263856362217018, + "grad_norm": 1.640651475634029, + "learning_rate": 7.542262678803641e-07, + "loss": 0.9211, + "step": 58 + }, + { + "epoch": 0.02302888368462139, + "grad_norm": 1.4293380449019428, + "learning_rate": 7.672301690507153e-07, + "loss": 0.9619, + "step": 59 + }, + { + "epoch": 0.0234192037470726, + "grad_norm": 1.3754001458202052, + "learning_rate": 7.802340702210665e-07, + "loss": 0.915, + "step": 60 + }, + { + "epoch": 0.023809523809523808, + "grad_norm": 1.4030897702096046, + "learning_rate": 7.932379713914175e-07, + "loss": 0.9065, + "step": 61 + }, + { + "epoch": 0.02419984387197502, + "grad_norm": 1.348272298750543, + "learning_rate": 8.062418725617686e-07, + "loss": 0.9704, + "step": 62 + }, + { + "epoch": 0.02459016393442623, + "grad_norm": 1.4923159572280071, + "learning_rate": 8.192457737321196e-07, + "loss": 0.9259, + "step": 63 + }, + { + "epoch": 0.02498048399687744, + "grad_norm": 1.3993679260888567, + "learning_rate": 8.322496749024709e-07, + "loss": 0.909, + "step": 64 + }, + { + "epoch": 0.02537080405932865, + "grad_norm": 1.2994369852205145, + "learning_rate": 8.452535760728219e-07, + "loss": 0.8655, + "step": 65 + }, + { + "epoch": 0.02576112412177986, + "grad_norm": 1.2303738223286331, + "learning_rate": 8.58257477243173e-07, + "loss": 0.8939, + "step": 66 + }, + { + "epoch": 0.02615144418423107, + "grad_norm": 1.2738556598020436, + "learning_rate": 8.712613784135241e-07, + "loss": 0.9171, + "step": 67 + }, + { + "epoch": 0.02654176424668228, + "grad_norm": 1.0793794335276672, + "learning_rate": 8.842652795838752e-07, + "loss": 0.8719, + "step": 68 + }, + { + "epoch": 0.026932084309133488, + "grad_norm": 1.0430516377206107, + "learning_rate": 8.972691807542264e-07, + "loss": 0.8948, + "step": 69 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 1.0270907915789589, + "learning_rate": 9.102730819245774e-07, + "loss": 0.8858, + "step": 70 + }, + { + "epoch": 0.02771272443403591, + "grad_norm": 1.0942539022863693, + "learning_rate": 9.232769830949285e-07, + "loss": 0.8786, + "step": 71 + }, + { + "epoch": 0.02810304449648712, + "grad_norm": 1.0595228085938952, + "learning_rate": 9.362808842652798e-07, + "loss": 0.8315, + "step": 72 + }, + { + "epoch": 0.02849336455893833, + "grad_norm": 1.1295261370605592, + "learning_rate": 9.492847854356308e-07, + "loss": 0.854, + "step": 73 + }, + { + "epoch": 0.02888368462138954, + "grad_norm": 1.051106604576063, + "learning_rate": 9.62288686605982e-07, + "loss": 0.8638, + "step": 74 + }, + { + "epoch": 0.02927400468384075, + "grad_norm": 0.9935850795866511, + "learning_rate": 9.75292587776333e-07, + "loss": 0.8717, + "step": 75 + }, + { + "epoch": 0.02966432474629196, + "grad_norm": 0.8931275766496176, + "learning_rate": 9.882964889466841e-07, + "loss": 0.8464, + "step": 76 + }, + { + "epoch": 0.030054644808743168, + "grad_norm": 0.8834623729300599, + "learning_rate": 1.0013003901170352e-06, + "loss": 0.8877, + "step": 77 + }, + { + "epoch": 0.03044496487119438, + "grad_norm": 0.8209751826184144, + "learning_rate": 1.0143042912873862e-06, + "loss": 0.8759, + "step": 78 + }, + { + "epoch": 0.03083528493364559, + "grad_norm": 0.8563710383722243, + "learning_rate": 1.0273081924577374e-06, + "loss": 0.8484, + "step": 79 + }, + { + "epoch": 0.0312256049960968, + "grad_norm": 0.8717917783569616, + "learning_rate": 1.0403120936280884e-06, + "loss": 0.8754, + "step": 80 + }, + { + "epoch": 0.03161592505854801, + "grad_norm": 0.8956377986212831, + "learning_rate": 1.0533159947984397e-06, + "loss": 0.8322, + "step": 81 + }, + { + "epoch": 0.03200624512099922, + "grad_norm": 0.7877943942224008, + "learning_rate": 1.0663198959687909e-06, + "loss": 0.8105, + "step": 82 + }, + { + "epoch": 0.03239656518345043, + "grad_norm": 0.7424202707483722, + "learning_rate": 1.079323797139142e-06, + "loss": 0.8152, + "step": 83 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.7283913531679918, + "learning_rate": 1.092327698309493e-06, + "loss": 0.8518, + "step": 84 + }, + { + "epoch": 0.03317720530835285, + "grad_norm": 0.7133810840891972, + "learning_rate": 1.105331599479844e-06, + "loss": 0.836, + "step": 85 + }, + { + "epoch": 0.03356752537080406, + "grad_norm": 0.6794784318308018, + "learning_rate": 1.1183355006501952e-06, + "loss": 0.8286, + "step": 86 + }, + { + "epoch": 0.03395784543325527, + "grad_norm": 0.6868893324487937, + "learning_rate": 1.1313394018205462e-06, + "loss": 0.8194, + "step": 87 + }, + { + "epoch": 0.03434816549570648, + "grad_norm": 0.6543833957371573, + "learning_rate": 1.1443433029908974e-06, + "loss": 0.8088, + "step": 88 + }, + { + "epoch": 0.034738485558157686, + "grad_norm": 0.7069672287389849, + "learning_rate": 1.1573472041612484e-06, + "loss": 0.8356, + "step": 89 + }, + { + "epoch": 0.0351288056206089, + "grad_norm": 0.6857274384098961, + "learning_rate": 1.1703511053315995e-06, + "loss": 0.8341, + "step": 90 + }, + { + "epoch": 0.03551912568306011, + "grad_norm": 0.7129936050868295, + "learning_rate": 1.1833550065019507e-06, + "loss": 0.788, + "step": 91 + }, + { + "epoch": 0.03590944574551132, + "grad_norm": 0.643339831883813, + "learning_rate": 1.1963589076723017e-06, + "loss": 0.8071, + "step": 92 + }, + { + "epoch": 0.03629976580796253, + "grad_norm": 0.6561426009286604, + "learning_rate": 1.209362808842653e-06, + "loss": 0.8221, + "step": 93 + }, + { + "epoch": 0.03669008587041374, + "grad_norm": 0.6522773456339812, + "learning_rate": 1.222366710013004e-06, + "loss": 0.8187, + "step": 94 + }, + { + "epoch": 0.03708040593286495, + "grad_norm": 0.6812496545107823, + "learning_rate": 1.2353706111833552e-06, + "loss": 0.8303, + "step": 95 + }, + { + "epoch": 0.03747072599531616, + "grad_norm": 0.66364834124179, + "learning_rate": 1.2483745123537062e-06, + "loss": 0.8013, + "step": 96 + }, + { + "epoch": 0.03786104605776737, + "grad_norm": 0.6745102161256815, + "learning_rate": 1.2613784135240572e-06, + "loss": 0.8129, + "step": 97 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 0.6777358661367667, + "learning_rate": 1.2743823146944084e-06, + "loss": 0.7993, + "step": 98 + }, + { + "epoch": 0.03864168618266979, + "grad_norm": 0.587082548500208, + "learning_rate": 1.2873862158647597e-06, + "loss": 0.7935, + "step": 99 + }, + { + "epoch": 0.039032006245121, + "grad_norm": 0.6176633446334342, + "learning_rate": 1.3003901170351107e-06, + "loss": 0.7814, + "step": 100 + }, + { + "epoch": 0.03942232630757221, + "grad_norm": 0.6170928333735564, + "learning_rate": 1.3133940182054617e-06, + "loss": 0.7895, + "step": 101 + }, + { + "epoch": 0.03981264637002342, + "grad_norm": 0.6130127302316493, + "learning_rate": 1.3263979193758127e-06, + "loss": 0.8209, + "step": 102 + }, + { + "epoch": 0.04020296643247463, + "grad_norm": 0.5988440548080599, + "learning_rate": 1.339401820546164e-06, + "loss": 0.7835, + "step": 103 + }, + { + "epoch": 0.04059328649492584, + "grad_norm": 0.6145717121686602, + "learning_rate": 1.3524057217165152e-06, + "loss": 0.8088, + "step": 104 + }, + { + "epoch": 0.040983606557377046, + "grad_norm": 0.6350517897203343, + "learning_rate": 1.3654096228868662e-06, + "loss": 0.7976, + "step": 105 + }, + { + "epoch": 0.04137392661982826, + "grad_norm": 0.6464532924708237, + "learning_rate": 1.3784135240572172e-06, + "loss": 0.7524, + "step": 106 + }, + { + "epoch": 0.04176424668227947, + "grad_norm": 0.6271968660489734, + "learning_rate": 1.3914174252275682e-06, + "loss": 0.7895, + "step": 107 + }, + { + "epoch": 0.04215456674473068, + "grad_norm": 0.6629457459917407, + "learning_rate": 1.4044213263979195e-06, + "loss": 0.7711, + "step": 108 + }, + { + "epoch": 0.04254488680718189, + "grad_norm": 0.6288069327325093, + "learning_rate": 1.4174252275682707e-06, + "loss": 0.7703, + "step": 109 + }, + { + "epoch": 0.0429352068696331, + "grad_norm": 0.638545386539147, + "learning_rate": 1.430429128738622e-06, + "loss": 0.7963, + "step": 110 + }, + { + "epoch": 0.04332552693208431, + "grad_norm": 0.5857137840958062, + "learning_rate": 1.4434330299089727e-06, + "loss": 0.8171, + "step": 111 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 0.6226268300654109, + "learning_rate": 1.456436931079324e-06, + "loss": 0.7482, + "step": 112 + }, + { + "epoch": 0.04410616705698673, + "grad_norm": 0.5884403830375232, + "learning_rate": 1.469440832249675e-06, + "loss": 0.7708, + "step": 113 + }, + { + "epoch": 0.04449648711943794, + "grad_norm": 0.6098848305707365, + "learning_rate": 1.4824447334200262e-06, + "loss": 0.7927, + "step": 114 + }, + { + "epoch": 0.04488680718188915, + "grad_norm": 0.6055859006528693, + "learning_rate": 1.4954486345903774e-06, + "loss": 0.8178, + "step": 115 + }, + { + "epoch": 0.04527712724434036, + "grad_norm": 0.5698535258047667, + "learning_rate": 1.5084525357607283e-06, + "loss": 0.7687, + "step": 116 + }, + { + "epoch": 0.04566744730679157, + "grad_norm": 0.5740358633434488, + "learning_rate": 1.5214564369310795e-06, + "loss": 0.7463, + "step": 117 + }, + { + "epoch": 0.04605776736924278, + "grad_norm": 0.5680513713384371, + "learning_rate": 1.5344603381014305e-06, + "loss": 0.7747, + "step": 118 + }, + { + "epoch": 0.04644808743169399, + "grad_norm": 0.5911038638675987, + "learning_rate": 1.5474642392717817e-06, + "loss": 0.7545, + "step": 119 + }, + { + "epoch": 0.0468384074941452, + "grad_norm": 0.5744205403914666, + "learning_rate": 1.560468140442133e-06, + "loss": 0.7715, + "step": 120 + }, + { + "epoch": 0.047228727556596406, + "grad_norm": 0.6182449894602959, + "learning_rate": 1.5734720416124838e-06, + "loss": 0.7528, + "step": 121 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.6023712067030459, + "learning_rate": 1.586475942782835e-06, + "loss": 0.8018, + "step": 122 + }, + { + "epoch": 0.04800936768149883, + "grad_norm": 0.5708665801622054, + "learning_rate": 1.599479843953186e-06, + "loss": 0.7772, + "step": 123 + }, + { + "epoch": 0.04839968774395004, + "grad_norm": 0.7203805938028879, + "learning_rate": 1.6124837451235372e-06, + "loss": 0.7596, + "step": 124 + }, + { + "epoch": 0.04879000780640125, + "grad_norm": 0.5893607338470993, + "learning_rate": 1.6254876462938883e-06, + "loss": 0.7739, + "step": 125 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 0.5456776175367258, + "learning_rate": 1.6384915474642393e-06, + "loss": 0.7612, + "step": 126 + }, + { + "epoch": 0.04957064793130367, + "grad_norm": 0.6531236252458318, + "learning_rate": 1.6514954486345905e-06, + "loss": 0.7762, + "step": 127 + }, + { + "epoch": 0.04996096799375488, + "grad_norm": 0.5984502172825853, + "learning_rate": 1.6644993498049417e-06, + "loss": 0.7647, + "step": 128 + }, + { + "epoch": 0.05035128805620609, + "grad_norm": 0.6069676304986832, + "learning_rate": 1.6775032509752928e-06, + "loss": 0.7504, + "step": 129 + }, + { + "epoch": 0.0507416081186573, + "grad_norm": 0.6307245658898882, + "learning_rate": 1.6905071521456438e-06, + "loss": 0.718, + "step": 130 + }, + { + "epoch": 0.05113192818110851, + "grad_norm": 0.5544934067055614, + "learning_rate": 1.7035110533159948e-06, + "loss": 0.7803, + "step": 131 + }, + { + "epoch": 0.05152224824355972, + "grad_norm": 0.6527049197871475, + "learning_rate": 1.716514954486346e-06, + "loss": 0.7468, + "step": 132 + }, + { + "epoch": 0.05191256830601093, + "grad_norm": 0.6431701041708983, + "learning_rate": 1.7295188556566973e-06, + "loss": 0.7894, + "step": 133 + }, + { + "epoch": 0.05230288836846214, + "grad_norm": 0.606337100746365, + "learning_rate": 1.7425227568270483e-06, + "loss": 0.7545, + "step": 134 + }, + { + "epoch": 0.05269320843091335, + "grad_norm": 0.5586639890693371, + "learning_rate": 1.7555266579973993e-06, + "loss": 0.7797, + "step": 135 + }, + { + "epoch": 0.05308352849336456, + "grad_norm": 0.6112007735479679, + "learning_rate": 1.7685305591677503e-06, + "loss": 0.7749, + "step": 136 + }, + { + "epoch": 0.053473848555815766, + "grad_norm": 0.5827085689874612, + "learning_rate": 1.7815344603381015e-06, + "loss": 0.7989, + "step": 137 + }, + { + "epoch": 0.053864168618266976, + "grad_norm": 0.647856654269655, + "learning_rate": 1.7945383615084528e-06, + "loss": 0.7683, + "step": 138 + }, + { + "epoch": 0.05425448868071819, + "grad_norm": 0.5970221945701759, + "learning_rate": 1.807542262678804e-06, + "loss": 0.7757, + "step": 139 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.6015763931015161, + "learning_rate": 1.8205461638491548e-06, + "loss": 0.7671, + "step": 140 + }, + { + "epoch": 0.05503512880562061, + "grad_norm": 0.6658196472608588, + "learning_rate": 1.833550065019506e-06, + "loss": 0.8043, + "step": 141 + }, + { + "epoch": 0.05542544886807182, + "grad_norm": 0.6298727563792206, + "learning_rate": 1.846553966189857e-06, + "loss": 0.8096, + "step": 142 + }, + { + "epoch": 0.05581576893052303, + "grad_norm": 0.7062329378225403, + "learning_rate": 1.8595578673602083e-06, + "loss": 0.7907, + "step": 143 + }, + { + "epoch": 0.05620608899297424, + "grad_norm": 0.7207283540024165, + "learning_rate": 1.8725617685305595e-06, + "loss": 0.7812, + "step": 144 + }, + { + "epoch": 0.05659640905542545, + "grad_norm": 0.6494560165297838, + "learning_rate": 1.8855656697009103e-06, + "loss": 0.7815, + "step": 145 + }, + { + "epoch": 0.05698672911787666, + "grad_norm": 0.6158628601700363, + "learning_rate": 1.8985695708712615e-06, + "loss": 0.8102, + "step": 146 + }, + { + "epoch": 0.05737704918032787, + "grad_norm": 0.7874258743003025, + "learning_rate": 1.9115734720416126e-06, + "loss": 0.7621, + "step": 147 + }, + { + "epoch": 0.05776736924277908, + "grad_norm": 0.7246845979353058, + "learning_rate": 1.924577373211964e-06, + "loss": 0.7655, + "step": 148 + }, + { + "epoch": 0.05815768930523029, + "grad_norm": 0.6445598404553244, + "learning_rate": 1.937581274382315e-06, + "loss": 0.7652, + "step": 149 + }, + { + "epoch": 0.0585480093676815, + "grad_norm": 0.6231737277461263, + "learning_rate": 1.950585175552666e-06, + "loss": 0.7695, + "step": 150 + }, + { + "epoch": 0.05893832943013271, + "grad_norm": 0.5841697519049796, + "learning_rate": 1.963589076723017e-06, + "loss": 0.7569, + "step": 151 + }, + { + "epoch": 0.05932864949258392, + "grad_norm": 0.6545458945870789, + "learning_rate": 1.9765929778933683e-06, + "loss": 0.7321, + "step": 152 + }, + { + "epoch": 0.059718969555035126, + "grad_norm": 0.6745094972356865, + "learning_rate": 1.9895968790637195e-06, + "loss": 0.7464, + "step": 153 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 0.586610685661979, + "learning_rate": 2.0026007802340703e-06, + "loss": 0.758, + "step": 154 + }, + { + "epoch": 0.06049960967993755, + "grad_norm": 0.6321774198110939, + "learning_rate": 2.0156046814044216e-06, + "loss": 0.7379, + "step": 155 + }, + { + "epoch": 0.06088992974238876, + "grad_norm": 0.622248234817513, + "learning_rate": 2.0286085825747724e-06, + "loss": 0.7455, + "step": 156 + }, + { + "epoch": 0.06128024980483997, + "grad_norm": 0.6554089813372727, + "learning_rate": 2.0416124837451236e-06, + "loss": 0.7384, + "step": 157 + }, + { + "epoch": 0.06167056986729118, + "grad_norm": 0.5912328180188098, + "learning_rate": 2.054616384915475e-06, + "loss": 0.7579, + "step": 158 + }, + { + "epoch": 0.06206088992974239, + "grad_norm": 0.6261342635005578, + "learning_rate": 2.067620286085826e-06, + "loss": 0.759, + "step": 159 + }, + { + "epoch": 0.0624512099921936, + "grad_norm": 0.7691129202181822, + "learning_rate": 2.080624187256177e-06, + "loss": 0.7469, + "step": 160 + }, + { + "epoch": 0.06284153005464481, + "grad_norm": 0.6381042515708099, + "learning_rate": 2.093628088426528e-06, + "loss": 0.7579, + "step": 161 + }, + { + "epoch": 0.06323185011709602, + "grad_norm": 0.6054638902225097, + "learning_rate": 2.1066319895968793e-06, + "loss": 0.7769, + "step": 162 + }, + { + "epoch": 0.06362217017954723, + "grad_norm": 0.7389641820062236, + "learning_rate": 2.1196358907672305e-06, + "loss": 0.7226, + "step": 163 + }, + { + "epoch": 0.06401249024199844, + "grad_norm": 0.6897470298306234, + "learning_rate": 2.1326397919375818e-06, + "loss": 0.7838, + "step": 164 + }, + { + "epoch": 0.06440281030444965, + "grad_norm": 0.5850533172378384, + "learning_rate": 2.1456436931079326e-06, + "loss": 0.8032, + "step": 165 + }, + { + "epoch": 0.06479313036690086, + "grad_norm": 0.7211914593993862, + "learning_rate": 2.158647594278284e-06, + "loss": 0.7385, + "step": 166 + }, + { + "epoch": 0.06518345042935207, + "grad_norm": 0.5880200103691484, + "learning_rate": 2.1716514954486346e-06, + "loss": 0.7751, + "step": 167 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.6489599444859556, + "learning_rate": 2.184655396618986e-06, + "loss": 0.7411, + "step": 168 + }, + { + "epoch": 0.06596409055425449, + "grad_norm": 0.633969460592738, + "learning_rate": 2.1976592977893367e-06, + "loss": 0.7347, + "step": 169 + }, + { + "epoch": 0.0663544106167057, + "grad_norm": 0.5691546645933182, + "learning_rate": 2.210663198959688e-06, + "loss": 0.7094, + "step": 170 + }, + { + "epoch": 0.06674473067915691, + "grad_norm": 0.7058595333959363, + "learning_rate": 2.223667100130039e-06, + "loss": 0.7457, + "step": 171 + }, + { + "epoch": 0.06713505074160812, + "grad_norm": 0.6293304680862625, + "learning_rate": 2.2366710013003903e-06, + "loss": 0.6959, + "step": 172 + }, + { + "epoch": 0.06752537080405933, + "grad_norm": 0.5688149259242599, + "learning_rate": 2.2496749024707416e-06, + "loss": 0.7517, + "step": 173 + }, + { + "epoch": 0.06791569086651054, + "grad_norm": 0.665561508123818, + "learning_rate": 2.2626788036410924e-06, + "loss": 0.7353, + "step": 174 + }, + { + "epoch": 0.06830601092896176, + "grad_norm": 0.577405121571261, + "learning_rate": 2.2756827048114436e-06, + "loss": 0.7364, + "step": 175 + }, + { + "epoch": 0.06869633099141297, + "grad_norm": 0.5873843345387071, + "learning_rate": 2.288686605981795e-06, + "loss": 0.7821, + "step": 176 + }, + { + "epoch": 0.06908665105386416, + "grad_norm": 0.5831624629206842, + "learning_rate": 2.301690507152146e-06, + "loss": 0.7602, + "step": 177 + }, + { + "epoch": 0.06947697111631537, + "grad_norm": 0.6513283927027634, + "learning_rate": 2.314694408322497e-06, + "loss": 0.7329, + "step": 178 + }, + { + "epoch": 0.06986729117876658, + "grad_norm": 0.6214088758542421, + "learning_rate": 2.327698309492848e-06, + "loss": 0.7328, + "step": 179 + }, + { + "epoch": 0.0702576112412178, + "grad_norm": 0.5626791220919358, + "learning_rate": 2.340702210663199e-06, + "loss": 0.7512, + "step": 180 + }, + { + "epoch": 0.070647931303669, + "grad_norm": 0.5765560394024087, + "learning_rate": 2.35370611183355e-06, + "loss": 0.7385, + "step": 181 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 0.6332930972031392, + "learning_rate": 2.3667100130039014e-06, + "loss": 0.732, + "step": 182 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 0.6252016738486855, + "learning_rate": 2.3797139141742526e-06, + "loss": 0.6989, + "step": 183 + }, + { + "epoch": 0.07181889149102264, + "grad_norm": 0.6800908944669956, + "learning_rate": 2.3927178153446034e-06, + "loss": 0.7536, + "step": 184 + }, + { + "epoch": 0.07220921155347385, + "grad_norm": 0.5991138859566338, + "learning_rate": 2.4057217165149546e-06, + "loss": 0.7464, + "step": 185 + }, + { + "epoch": 0.07259953161592506, + "grad_norm": 0.6588859841794689, + "learning_rate": 2.418725617685306e-06, + "loss": 0.729, + "step": 186 + }, + { + "epoch": 0.07298985167837627, + "grad_norm": 0.6492495312112719, + "learning_rate": 2.431729518855657e-06, + "loss": 0.753, + "step": 187 + }, + { + "epoch": 0.07338017174082748, + "grad_norm": 0.695400984552008, + "learning_rate": 2.444733420026008e-06, + "loss": 0.7101, + "step": 188 + }, + { + "epoch": 0.07377049180327869, + "grad_norm": 0.6931133003035723, + "learning_rate": 2.457737321196359e-06, + "loss": 0.7376, + "step": 189 + }, + { + "epoch": 0.0741608118657299, + "grad_norm": 0.5966426849923425, + "learning_rate": 2.4707412223667104e-06, + "loss": 0.7404, + "step": 190 + }, + { + "epoch": 0.07455113192818111, + "grad_norm": 0.5974814023066117, + "learning_rate": 2.483745123537061e-06, + "loss": 0.7544, + "step": 191 + }, + { + "epoch": 0.07494145199063232, + "grad_norm": 0.6729376878834193, + "learning_rate": 2.4967490247074124e-06, + "loss": 0.7561, + "step": 192 + }, + { + "epoch": 0.07533177205308353, + "grad_norm": 0.6542105122725128, + "learning_rate": 2.5097529258777636e-06, + "loss": 0.761, + "step": 193 + }, + { + "epoch": 0.07572209211553474, + "grad_norm": 0.5527775424543073, + "learning_rate": 2.5227568270481144e-06, + "loss": 0.7277, + "step": 194 + }, + { + "epoch": 0.07611241217798595, + "grad_norm": 0.6313246622780431, + "learning_rate": 2.535760728218466e-06, + "loss": 0.7524, + "step": 195 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 0.6241793568846384, + "learning_rate": 2.548764629388817e-06, + "loss": 0.6914, + "step": 196 + }, + { + "epoch": 0.07689305230288837, + "grad_norm": 0.6021388918664159, + "learning_rate": 2.5617685305591677e-06, + "loss": 0.7225, + "step": 197 + }, + { + "epoch": 0.07728337236533958, + "grad_norm": 0.7127607929792589, + "learning_rate": 2.5747724317295194e-06, + "loss": 0.7375, + "step": 198 + }, + { + "epoch": 0.07767369242779079, + "grad_norm": 0.7275815753943083, + "learning_rate": 2.58777633289987e-06, + "loss": 0.7447, + "step": 199 + }, + { + "epoch": 0.078064012490242, + "grad_norm": 0.7179381930982617, + "learning_rate": 2.6007802340702214e-06, + "loss": 0.7525, + "step": 200 + }, + { + "epoch": 0.07845433255269321, + "grad_norm": 0.6841131609214134, + "learning_rate": 2.613784135240572e-06, + "loss": 0.7309, + "step": 201 + }, + { + "epoch": 0.07884465261514442, + "grad_norm": 0.6085852485573722, + "learning_rate": 2.6267880364109234e-06, + "loss": 0.767, + "step": 202 + }, + { + "epoch": 0.07923497267759563, + "grad_norm": 0.6740447192069621, + "learning_rate": 2.6397919375812747e-06, + "loss": 0.7219, + "step": 203 + }, + { + "epoch": 0.07962529274004684, + "grad_norm": 0.7222017341186872, + "learning_rate": 2.6527958387516255e-06, + "loss": 0.7593, + "step": 204 + }, + { + "epoch": 0.08001561280249805, + "grad_norm": 0.6973644202213456, + "learning_rate": 2.665799739921977e-06, + "loss": 0.7384, + "step": 205 + }, + { + "epoch": 0.08040593286494926, + "grad_norm": 0.6186280176046488, + "learning_rate": 2.678803641092328e-06, + "loss": 0.7834, + "step": 206 + }, + { + "epoch": 0.08079625292740047, + "grad_norm": 0.5626831502720813, + "learning_rate": 2.6918075422626787e-06, + "loss": 0.7475, + "step": 207 + }, + { + "epoch": 0.08118657298985169, + "grad_norm": 0.6890376680013068, + "learning_rate": 2.7048114434330304e-06, + "loss": 0.7482, + "step": 208 + }, + { + "epoch": 0.08157689305230288, + "grad_norm": 0.6529542651291037, + "learning_rate": 2.717815344603381e-06, + "loss": 0.7063, + "step": 209 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.6886458114504092, + "learning_rate": 2.7308192457737324e-06, + "loss": 0.7215, + "step": 210 + }, + { + "epoch": 0.0823575331772053, + "grad_norm": 0.6597890626874188, + "learning_rate": 2.7438231469440836e-06, + "loss": 0.7196, + "step": 211 + }, + { + "epoch": 0.08274785323965651, + "grad_norm": 0.6153015501938158, + "learning_rate": 2.7568270481144345e-06, + "loss": 0.7528, + "step": 212 + }, + { + "epoch": 0.08313817330210772, + "grad_norm": 0.6519089612174288, + "learning_rate": 2.7698309492847857e-06, + "loss": 0.7439, + "step": 213 + }, + { + "epoch": 0.08352849336455893, + "grad_norm": 0.5762315739561609, + "learning_rate": 2.7828348504551365e-06, + "loss": 0.747, + "step": 214 + }, + { + "epoch": 0.08391881342701014, + "grad_norm": 0.5885660929017345, + "learning_rate": 2.795838751625488e-06, + "loss": 0.694, + "step": 215 + }, + { + "epoch": 0.08430913348946135, + "grad_norm": 0.6784264112045082, + "learning_rate": 2.808842652795839e-06, + "loss": 0.7329, + "step": 216 + }, + { + "epoch": 0.08469945355191257, + "grad_norm": 0.5727161640143155, + "learning_rate": 2.8218465539661898e-06, + "loss": 0.7253, + "step": 217 + }, + { + "epoch": 0.08508977361436378, + "grad_norm": 0.5678862524098921, + "learning_rate": 2.8348504551365414e-06, + "loss": 0.7305, + "step": 218 + }, + { + "epoch": 0.08548009367681499, + "grad_norm": 0.6081070805113071, + "learning_rate": 2.8478543563068922e-06, + "loss": 0.6973, + "step": 219 + }, + { + "epoch": 0.0858704137392662, + "grad_norm": 0.7137709004130273, + "learning_rate": 2.860858257477244e-06, + "loss": 0.7385, + "step": 220 + }, + { + "epoch": 0.0862607338017174, + "grad_norm": 0.5868153981312745, + "learning_rate": 2.8738621586475947e-06, + "loss": 0.7395, + "step": 221 + }, + { + "epoch": 0.08665105386416862, + "grad_norm": 0.6805604152282677, + "learning_rate": 2.8868660598179455e-06, + "loss": 0.7323, + "step": 222 + }, + { + "epoch": 0.08704137392661983, + "grad_norm": 0.6416116104842844, + "learning_rate": 2.8998699609882967e-06, + "loss": 0.7605, + "step": 223 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.6229471000765433, + "learning_rate": 2.912873862158648e-06, + "loss": 0.7121, + "step": 224 + }, + { + "epoch": 0.08782201405152225, + "grad_norm": 0.6192393169093633, + "learning_rate": 2.925877763328999e-06, + "loss": 0.7083, + "step": 225 + }, + { + "epoch": 0.08821233411397346, + "grad_norm": 0.5507524222291859, + "learning_rate": 2.93888166449935e-06, + "loss": 0.6872, + "step": 226 + }, + { + "epoch": 0.08860265417642467, + "grad_norm": 0.5638906404524082, + "learning_rate": 2.9518855656697008e-06, + "loss": 0.7383, + "step": 227 + }, + { + "epoch": 0.08899297423887588, + "grad_norm": 0.606351416329974, + "learning_rate": 2.9648894668400524e-06, + "loss": 0.7232, + "step": 228 + }, + { + "epoch": 0.08938329430132709, + "grad_norm": 0.5580625594651178, + "learning_rate": 2.9778933680104032e-06, + "loss": 0.7349, + "step": 229 + }, + { + "epoch": 0.0897736143637783, + "grad_norm": 0.6094625505936122, + "learning_rate": 2.990897269180755e-06, + "loss": 0.7189, + "step": 230 + }, + { + "epoch": 0.09016393442622951, + "grad_norm": 0.6222849253418435, + "learning_rate": 3.0039011703511057e-06, + "loss": 0.6922, + "step": 231 + }, + { + "epoch": 0.09055425448868072, + "grad_norm": 0.6022756461220734, + "learning_rate": 3.0169050715214565e-06, + "loss": 0.7052, + "step": 232 + }, + { + "epoch": 0.09094457455113193, + "grad_norm": 0.6346755996556356, + "learning_rate": 3.029908972691808e-06, + "loss": 0.7465, + "step": 233 + }, + { + "epoch": 0.09133489461358314, + "grad_norm": 0.6060653487074289, + "learning_rate": 3.042912873862159e-06, + "loss": 0.7474, + "step": 234 + }, + { + "epoch": 0.09172521467603435, + "grad_norm": 0.6697759336608017, + "learning_rate": 3.05591677503251e-06, + "loss": 0.7172, + "step": 235 + }, + { + "epoch": 0.09211553473848556, + "grad_norm": 0.6805464954759589, + "learning_rate": 3.068920676202861e-06, + "loss": 0.7583, + "step": 236 + }, + { + "epoch": 0.09250585480093677, + "grad_norm": 0.5190228679343878, + "learning_rate": 3.0819245773732122e-06, + "loss": 0.7117, + "step": 237 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.677099477146376, + "learning_rate": 3.0949284785435635e-06, + "loss": 0.7439, + "step": 238 + }, + { + "epoch": 0.0932864949258392, + "grad_norm": 0.6241169909239689, + "learning_rate": 3.1079323797139143e-06, + "loss": 0.695, + "step": 239 + }, + { + "epoch": 0.0936768149882904, + "grad_norm": 0.6313902552104987, + "learning_rate": 3.120936280884266e-06, + "loss": 0.7358, + "step": 240 + }, + { + "epoch": 0.0940671350507416, + "grad_norm": 0.6297348815237372, + "learning_rate": 3.1339401820546167e-06, + "loss": 0.7259, + "step": 241 + }, + { + "epoch": 0.09445745511319281, + "grad_norm": 0.7616576017311608, + "learning_rate": 3.1469440832249675e-06, + "loss": 0.6948, + "step": 242 + }, + { + "epoch": 0.09484777517564402, + "grad_norm": 0.6290244022218098, + "learning_rate": 3.159947984395319e-06, + "loss": 0.7608, + "step": 243 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.5964973087263065, + "learning_rate": 3.17295188556567e-06, + "loss": 0.7266, + "step": 244 + }, + { + "epoch": 0.09562841530054644, + "grad_norm": 0.652782430543401, + "learning_rate": 3.185955786736021e-06, + "loss": 0.7223, + "step": 245 + }, + { + "epoch": 0.09601873536299765, + "grad_norm": 0.6377538625777347, + "learning_rate": 3.198959687906372e-06, + "loss": 0.7452, + "step": 246 + }, + { + "epoch": 0.09640905542544886, + "grad_norm": 0.6468617942005843, + "learning_rate": 3.2119635890767233e-06, + "loss": 0.7335, + "step": 247 + }, + { + "epoch": 0.09679937548790007, + "grad_norm": 0.5621000613767603, + "learning_rate": 3.2249674902470745e-06, + "loss": 0.6973, + "step": 248 + }, + { + "epoch": 0.09718969555035128, + "grad_norm": 0.6971382126279495, + "learning_rate": 3.2379713914174253e-06, + "loss": 0.678, + "step": 249 + }, + { + "epoch": 0.0975800156128025, + "grad_norm": 0.6363359237995663, + "learning_rate": 3.2509752925877765e-06, + "loss": 0.7605, + "step": 250 + }, + { + "epoch": 0.0979703356752537, + "grad_norm": 0.6012117905363288, + "learning_rate": 3.2639791937581278e-06, + "loss": 0.7228, + "step": 251 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.6181623498654948, + "learning_rate": 3.2769830949284786e-06, + "loss": 0.7543, + "step": 252 + }, + { + "epoch": 0.09875097580015613, + "grad_norm": 0.5894770221937677, + "learning_rate": 3.2899869960988302e-06, + "loss": 0.7099, + "step": 253 + }, + { + "epoch": 0.09914129586260734, + "grad_norm": 0.7273862597615569, + "learning_rate": 3.302990897269181e-06, + "loss": 0.7027, + "step": 254 + }, + { + "epoch": 0.09953161592505855, + "grad_norm": 0.6537340418560127, + "learning_rate": 3.315994798439532e-06, + "loss": 0.7251, + "step": 255 + }, + { + "epoch": 0.09992193598750976, + "grad_norm": 0.6260603235644657, + "learning_rate": 3.3289986996098835e-06, + "loss": 0.6993, + "step": 256 + }, + { + "epoch": 0.10031225604996097, + "grad_norm": 0.7150791708718877, + "learning_rate": 3.3420026007802343e-06, + "loss": 0.6844, + "step": 257 + }, + { + "epoch": 0.10070257611241218, + "grad_norm": 0.6387045523383148, + "learning_rate": 3.3550065019505855e-06, + "loss": 0.7167, + "step": 258 + }, + { + "epoch": 0.10109289617486339, + "grad_norm": 0.6503347452558628, + "learning_rate": 3.3680104031209363e-06, + "loss": 0.6981, + "step": 259 + }, + { + "epoch": 0.1014832162373146, + "grad_norm": 0.6395649666939841, + "learning_rate": 3.3810143042912876e-06, + "loss": 0.7668, + "step": 260 + }, + { + "epoch": 0.10187353629976581, + "grad_norm": 0.6643632364094122, + "learning_rate": 3.3940182054616388e-06, + "loss": 0.6923, + "step": 261 + }, + { + "epoch": 0.10226385636221702, + "grad_norm": 0.6469196335282177, + "learning_rate": 3.4070221066319896e-06, + "loss": 0.7202, + "step": 262 + }, + { + "epoch": 0.10265417642466823, + "grad_norm": 0.6359838777903309, + "learning_rate": 3.4200260078023412e-06, + "loss": 0.6971, + "step": 263 + }, + { + "epoch": 0.10304449648711944, + "grad_norm": 0.613410512659893, + "learning_rate": 3.433029908972692e-06, + "loss": 0.7639, + "step": 264 + }, + { + "epoch": 0.10343481654957065, + "grad_norm": 0.6833328010333188, + "learning_rate": 3.446033810143043e-06, + "loss": 0.7153, + "step": 265 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.6703628703101671, + "learning_rate": 3.4590377113133945e-06, + "loss": 0.7306, + "step": 266 + }, + { + "epoch": 0.10421545667447307, + "grad_norm": 0.5958430751751655, + "learning_rate": 3.4720416124837453e-06, + "loss": 0.701, + "step": 267 + }, + { + "epoch": 0.10460577673692428, + "grad_norm": 0.7317774485695615, + "learning_rate": 3.4850455136540965e-06, + "loss": 0.7142, + "step": 268 + }, + { + "epoch": 0.1049960967993755, + "grad_norm": 0.6263322906368243, + "learning_rate": 3.4980494148244478e-06, + "loss": 0.7234, + "step": 269 + }, + { + "epoch": 0.1053864168618267, + "grad_norm": 0.7092638556537828, + "learning_rate": 3.5110533159947986e-06, + "loss": 0.669, + "step": 270 + }, + { + "epoch": 0.10577673692427791, + "grad_norm": 0.710119336584125, + "learning_rate": 3.52405721716515e-06, + "loss": 0.7224, + "step": 271 + }, + { + "epoch": 0.10616705698672912, + "grad_norm": 0.6471738158854472, + "learning_rate": 3.5370611183355006e-06, + "loss": 0.7146, + "step": 272 + }, + { + "epoch": 0.10655737704918032, + "grad_norm": 0.7387719670341235, + "learning_rate": 3.5500650195058523e-06, + "loss": 0.7313, + "step": 273 + }, + { + "epoch": 0.10694769711163153, + "grad_norm": 0.6421778445231306, + "learning_rate": 3.563068920676203e-06, + "loss": 0.7532, + "step": 274 + }, + { + "epoch": 0.10733801717408274, + "grad_norm": 0.6566508020084547, + "learning_rate": 3.576072821846554e-06, + "loss": 0.7226, + "step": 275 + }, + { + "epoch": 0.10772833723653395, + "grad_norm": 0.6570650677222086, + "learning_rate": 3.5890767230169055e-06, + "loss": 0.7028, + "step": 276 + }, + { + "epoch": 0.10811865729898516, + "grad_norm": 0.7215915527188655, + "learning_rate": 3.6020806241872563e-06, + "loss": 0.7076, + "step": 277 + }, + { + "epoch": 0.10850897736143637, + "grad_norm": 0.6781217679934435, + "learning_rate": 3.615084525357608e-06, + "loss": 0.7094, + "step": 278 + }, + { + "epoch": 0.10889929742388758, + "grad_norm": 0.642435452810082, + "learning_rate": 3.628088426527959e-06, + "loss": 0.7224, + "step": 279 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.6577405003495118, + "learning_rate": 3.6410923276983096e-06, + "loss": 0.7313, + "step": 280 + }, + { + "epoch": 0.10967993754879, + "grad_norm": 0.673621310542849, + "learning_rate": 3.654096228868661e-06, + "loss": 0.7143, + "step": 281 + }, + { + "epoch": 0.11007025761124122, + "grad_norm": 0.750977493140457, + "learning_rate": 3.667100130039012e-06, + "loss": 0.7416, + "step": 282 + }, + { + "epoch": 0.11046057767369243, + "grad_norm": 0.7025866457402488, + "learning_rate": 3.6801040312093633e-06, + "loss": 0.704, + "step": 283 + }, + { + "epoch": 0.11085089773614364, + "grad_norm": 0.6687582273500943, + "learning_rate": 3.693107932379714e-06, + "loss": 0.6818, + "step": 284 + }, + { + "epoch": 0.11124121779859485, + "grad_norm": 0.5928782224652829, + "learning_rate": 3.706111833550065e-06, + "loss": 0.7382, + "step": 285 + }, + { + "epoch": 0.11163153786104606, + "grad_norm": 0.6908741762102655, + "learning_rate": 3.7191157347204166e-06, + "loss": 0.7191, + "step": 286 + }, + { + "epoch": 0.11202185792349727, + "grad_norm": 0.7031532727320664, + "learning_rate": 3.7321196358907674e-06, + "loss": 0.7048, + "step": 287 + }, + { + "epoch": 0.11241217798594848, + "grad_norm": 0.6408485619331441, + "learning_rate": 3.745123537061119e-06, + "loss": 0.7073, + "step": 288 + }, + { + "epoch": 0.11280249804839969, + "grad_norm": 0.6925199204910064, + "learning_rate": 3.75812743823147e-06, + "loss": 0.7148, + "step": 289 + }, + { + "epoch": 0.1131928181108509, + "grad_norm": 0.6314795193256784, + "learning_rate": 3.7711313394018206e-06, + "loss": 0.7156, + "step": 290 + }, + { + "epoch": 0.11358313817330211, + "grad_norm": 0.6188864028584836, + "learning_rate": 3.7841352405721723e-06, + "loss": 0.6876, + "step": 291 + }, + { + "epoch": 0.11397345823575332, + "grad_norm": 0.6994461059356449, + "learning_rate": 3.797139141742523e-06, + "loss": 0.6872, + "step": 292 + }, + { + "epoch": 0.11436377829820453, + "grad_norm": 0.690454391927327, + "learning_rate": 3.8101430429128743e-06, + "loss": 0.6842, + "step": 293 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 0.6669861081534306, + "learning_rate": 3.823146944083225e-06, + "loss": 0.7507, + "step": 294 + }, + { + "epoch": 0.11514441842310695, + "grad_norm": 0.6903124399481024, + "learning_rate": 3.836150845253576e-06, + "loss": 0.7414, + "step": 295 + }, + { + "epoch": 0.11553473848555816, + "grad_norm": 0.7216066010382123, + "learning_rate": 3.849154746423928e-06, + "loss": 0.6833, + "step": 296 + }, + { + "epoch": 0.11592505854800937, + "grad_norm": 0.7455098333535937, + "learning_rate": 3.862158647594279e-06, + "loss": 0.7311, + "step": 297 + }, + { + "epoch": 0.11631537861046058, + "grad_norm": 0.6789863974722274, + "learning_rate": 3.87516254876463e-06, + "loss": 0.678, + "step": 298 + }, + { + "epoch": 0.11670569867291179, + "grad_norm": 0.6797425346521829, + "learning_rate": 3.8881664499349804e-06, + "loss": 0.6963, + "step": 299 + }, + { + "epoch": 0.117096018735363, + "grad_norm": 0.7156522549969554, + "learning_rate": 3.901170351105332e-06, + "loss": 0.698, + "step": 300 + }, + { + "epoch": 0.11748633879781421, + "grad_norm": 0.6983176760996448, + "learning_rate": 3.914174252275683e-06, + "loss": 0.7113, + "step": 301 + }, + { + "epoch": 0.11787665886026542, + "grad_norm": 0.7252898553024566, + "learning_rate": 3.927178153446034e-06, + "loss": 0.6898, + "step": 302 + }, + { + "epoch": 0.11826697892271663, + "grad_norm": 0.6811151024820145, + "learning_rate": 3.940182054616385e-06, + "loss": 0.6936, + "step": 303 + }, + { + "epoch": 0.11865729898516784, + "grad_norm": 0.756884859319761, + "learning_rate": 3.953185955786737e-06, + "loss": 0.7289, + "step": 304 + }, + { + "epoch": 0.11904761904761904, + "grad_norm": 0.6619497892116, + "learning_rate": 3.966189856957087e-06, + "loss": 0.7096, + "step": 305 + }, + { + "epoch": 0.11943793911007025, + "grad_norm": 0.6761635512503945, + "learning_rate": 3.979193758127439e-06, + "loss": 0.6985, + "step": 306 + }, + { + "epoch": 0.11982825917252146, + "grad_norm": 0.6597022587610625, + "learning_rate": 3.992197659297789e-06, + "loss": 0.7115, + "step": 307 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.6444156916019805, + "learning_rate": 4.005201560468141e-06, + "loss": 0.723, + "step": 308 + }, + { + "epoch": 0.12060889929742388, + "grad_norm": 0.7507008670413251, + "learning_rate": 4.018205461638492e-06, + "loss": 0.7332, + "step": 309 + }, + { + "epoch": 0.1209992193598751, + "grad_norm": 0.6111596891507522, + "learning_rate": 4.031209362808843e-06, + "loss": 0.7098, + "step": 310 + }, + { + "epoch": 0.1213895394223263, + "grad_norm": 0.8522425789194061, + "learning_rate": 4.044213263979194e-06, + "loss": 0.724, + "step": 311 + }, + { + "epoch": 0.12177985948477751, + "grad_norm": 0.8461212977878206, + "learning_rate": 4.057217165149545e-06, + "loss": 0.7103, + "step": 312 + }, + { + "epoch": 0.12217017954722872, + "grad_norm": 0.6023050901918927, + "learning_rate": 4.070221066319897e-06, + "loss": 0.7387, + "step": 313 + }, + { + "epoch": 0.12256049960967993, + "grad_norm": 0.7139568226487635, + "learning_rate": 4.083224967490247e-06, + "loss": 0.7151, + "step": 314 + }, + { + "epoch": 0.12295081967213115, + "grad_norm": 0.7196190975999267, + "learning_rate": 4.096228868660598e-06, + "loss": 0.7141, + "step": 315 + }, + { + "epoch": 0.12334113973458236, + "grad_norm": 0.7161733655872731, + "learning_rate": 4.10923276983095e-06, + "loss": 0.6937, + "step": 316 + }, + { + "epoch": 0.12373145979703357, + "grad_norm": 0.67100226624258, + "learning_rate": 4.122236671001301e-06, + "loss": 0.7106, + "step": 317 + }, + { + "epoch": 0.12412177985948478, + "grad_norm": 0.7284269132752709, + "learning_rate": 4.135240572171652e-06, + "loss": 0.7471, + "step": 318 + }, + { + "epoch": 0.12451209992193599, + "grad_norm": 0.7527257349409056, + "learning_rate": 4.148244473342003e-06, + "loss": 0.7265, + "step": 319 + }, + { + "epoch": 0.1249024199843872, + "grad_norm": 0.5907495554594145, + "learning_rate": 4.161248374512354e-06, + "loss": 0.6987, + "step": 320 + }, + { + "epoch": 0.1252927400468384, + "grad_norm": 0.7157834816351011, + "learning_rate": 4.174252275682705e-06, + "loss": 0.6959, + "step": 321 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.7071835051954661, + "learning_rate": 4.187256176853056e-06, + "loss": 0.7025, + "step": 322 + }, + { + "epoch": 0.12607338017174083, + "grad_norm": 0.7467412035963182, + "learning_rate": 4.200260078023407e-06, + "loss": 0.7298, + "step": 323 + }, + { + "epoch": 0.12646370023419204, + "grad_norm": 0.6617500331975821, + "learning_rate": 4.213263979193759e-06, + "loss": 0.7079, + "step": 324 + }, + { + "epoch": 0.12685402029664325, + "grad_norm": 0.7157383251399088, + "learning_rate": 4.226267880364109e-06, + "loss": 0.6937, + "step": 325 + }, + { + "epoch": 0.12724434035909446, + "grad_norm": 0.7177595900950386, + "learning_rate": 4.239271781534461e-06, + "loss": 0.6966, + "step": 326 + }, + { + "epoch": 0.12763466042154567, + "grad_norm": 0.6714117846881208, + "learning_rate": 4.2522756827048115e-06, + "loss": 0.6812, + "step": 327 + }, + { + "epoch": 0.12802498048399688, + "grad_norm": 0.7649804663273799, + "learning_rate": 4.2652795838751636e-06, + "loss": 0.7115, + "step": 328 + }, + { + "epoch": 0.1284153005464481, + "grad_norm": 0.7537952111348616, + "learning_rate": 4.278283485045514e-06, + "loss": 0.7405, + "step": 329 + }, + { + "epoch": 0.1288056206088993, + "grad_norm": 0.6128210018640711, + "learning_rate": 4.291287386215865e-06, + "loss": 0.6874, + "step": 330 + }, + { + "epoch": 0.1291959406713505, + "grad_norm": 0.909845527782022, + "learning_rate": 4.304291287386216e-06, + "loss": 0.7067, + "step": 331 + }, + { + "epoch": 0.12958626073380172, + "grad_norm": 0.6655002614249049, + "learning_rate": 4.317295188556568e-06, + "loss": 0.7255, + "step": 332 + }, + { + "epoch": 0.12997658079625293, + "grad_norm": 0.6705299634873925, + "learning_rate": 4.330299089726919e-06, + "loss": 0.6954, + "step": 333 + }, + { + "epoch": 0.13036690085870414, + "grad_norm": 0.7135184810521537, + "learning_rate": 4.343302990897269e-06, + "loss": 0.6892, + "step": 334 + }, + { + "epoch": 0.13075722092115535, + "grad_norm": 0.7023441704096097, + "learning_rate": 4.3563068920676205e-06, + "loss": 0.7036, + "step": 335 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.7691796293486216, + "learning_rate": 4.369310793237972e-06, + "loss": 0.7269, + "step": 336 + }, + { + "epoch": 0.13153786104605777, + "grad_norm": 0.6836663912027772, + "learning_rate": 4.382314694408323e-06, + "loss": 0.6692, + "step": 337 + }, + { + "epoch": 0.13192818110850899, + "grad_norm": 0.7003927330441498, + "learning_rate": 4.395318595578673e-06, + "loss": 0.7104, + "step": 338 + }, + { + "epoch": 0.1323185011709602, + "grad_norm": 0.676887716409683, + "learning_rate": 4.408322496749025e-06, + "loss": 0.7187, + "step": 339 + }, + { + "epoch": 0.1327088212334114, + "grad_norm": 0.7801335186950082, + "learning_rate": 4.421326397919376e-06, + "loss": 0.73, + "step": 340 + }, + { + "epoch": 0.13309914129586262, + "grad_norm": 0.7662015014499308, + "learning_rate": 4.434330299089728e-06, + "loss": 0.7033, + "step": 341 + }, + { + "epoch": 0.13348946135831383, + "grad_norm": 0.7299188085097765, + "learning_rate": 4.447334200260078e-06, + "loss": 0.7126, + "step": 342 + }, + { + "epoch": 0.13387978142076504, + "grad_norm": 0.671434895179899, + "learning_rate": 4.4603381014304295e-06, + "loss": 0.7297, + "step": 343 + }, + { + "epoch": 0.13427010148321625, + "grad_norm": 0.7851962482169288, + "learning_rate": 4.473342002600781e-06, + "loss": 0.7299, + "step": 344 + }, + { + "epoch": 0.13466042154566746, + "grad_norm": 0.6304613994508618, + "learning_rate": 4.486345903771132e-06, + "loss": 0.7129, + "step": 345 + }, + { + "epoch": 0.13505074160811867, + "grad_norm": 0.6241703597103262, + "learning_rate": 4.499349804941483e-06, + "loss": 0.6805, + "step": 346 + }, + { + "epoch": 0.13544106167056988, + "grad_norm": 0.6333105803372931, + "learning_rate": 4.5123537061118335e-06, + "loss": 0.7038, + "step": 347 + }, + { + "epoch": 0.1358313817330211, + "grad_norm": 0.6037893309917401, + "learning_rate": 4.525357607282185e-06, + "loss": 0.7321, + "step": 348 + }, + { + "epoch": 0.1362217017954723, + "grad_norm": 0.7036955654219083, + "learning_rate": 4.538361508452536e-06, + "loss": 0.6864, + "step": 349 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.698635320346689, + "learning_rate": 4.551365409622887e-06, + "loss": 0.6788, + "step": 350 + }, + { + "epoch": 0.13700234192037472, + "grad_norm": 0.7141648121093565, + "learning_rate": 4.5643693107932384e-06, + "loss": 0.694, + "step": 351 + }, + { + "epoch": 0.13739266198282593, + "grad_norm": 0.6843589186448138, + "learning_rate": 4.57737321196359e-06, + "loss": 0.6777, + "step": 352 + }, + { + "epoch": 0.1377829820452771, + "grad_norm": 0.6328281069972636, + "learning_rate": 4.59037711313394e-06, + "loss": 0.698, + "step": 353 + }, + { + "epoch": 0.13817330210772832, + "grad_norm": 0.6859452071887971, + "learning_rate": 4.603381014304292e-06, + "loss": 0.6931, + "step": 354 + }, + { + "epoch": 0.13856362217017953, + "grad_norm": 0.6787660626902047, + "learning_rate": 4.6163849154746425e-06, + "loss": 0.7042, + "step": 355 + }, + { + "epoch": 0.13895394223263074, + "grad_norm": 0.6966577109193091, + "learning_rate": 4.629388816644994e-06, + "loss": 0.7107, + "step": 356 + }, + { + "epoch": 0.13934426229508196, + "grad_norm": 0.671555255877273, + "learning_rate": 4.642392717815345e-06, + "loss": 0.7093, + "step": 357 + }, + { + "epoch": 0.13973458235753317, + "grad_norm": 0.7694361121864779, + "learning_rate": 4.655396618985696e-06, + "loss": 0.6781, + "step": 358 + }, + { + "epoch": 0.14012490241998438, + "grad_norm": 0.6329118104308662, + "learning_rate": 4.6684005201560474e-06, + "loss": 0.7165, + "step": 359 + }, + { + "epoch": 0.1405152224824356, + "grad_norm": 0.6540239633300746, + "learning_rate": 4.681404421326398e-06, + "loss": 0.7177, + "step": 360 + }, + { + "epoch": 0.1409055425448868, + "grad_norm": 0.7195059157783287, + "learning_rate": 4.69440832249675e-06, + "loss": 0.7477, + "step": 361 + }, + { + "epoch": 0.141295862607338, + "grad_norm": 0.6673971669319033, + "learning_rate": 4.7074122236671e-06, + "loss": 0.697, + "step": 362 + }, + { + "epoch": 0.14168618266978922, + "grad_norm": 0.6467746698599964, + "learning_rate": 4.7204161248374515e-06, + "loss": 0.6551, + "step": 363 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.6648465465284018, + "learning_rate": 4.733420026007803e-06, + "loss": 0.6889, + "step": 364 + }, + { + "epoch": 0.14246682279469164, + "grad_norm": 0.6787160195776223, + "learning_rate": 4.746423927178154e-06, + "loss": 0.7058, + "step": 365 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.6548071578167808, + "learning_rate": 4.759427828348505e-06, + "loss": 0.7111, + "step": 366 + }, + { + "epoch": 0.14324746291959406, + "grad_norm": 0.8290462738164065, + "learning_rate": 4.7724317295188564e-06, + "loss": 0.7252, + "step": 367 + }, + { + "epoch": 0.14363778298204527, + "grad_norm": 0.6261832762886341, + "learning_rate": 4.785435630689207e-06, + "loss": 0.6935, + "step": 368 + }, + { + "epoch": 0.14402810304449648, + "grad_norm": 0.6612128895548671, + "learning_rate": 4.798439531859558e-06, + "loss": 0.6772, + "step": 369 + }, + { + "epoch": 0.1444184231069477, + "grad_norm": 0.70080538799635, + "learning_rate": 4.811443433029909e-06, + "loss": 0.672, + "step": 370 + }, + { + "epoch": 0.1448087431693989, + "grad_norm": 0.658142543472327, + "learning_rate": 4.8244473342002605e-06, + "loss": 0.7097, + "step": 371 + }, + { + "epoch": 0.1451990632318501, + "grad_norm": 0.7985198909444492, + "learning_rate": 4.837451235370612e-06, + "loss": 0.7331, + "step": 372 + }, + { + "epoch": 0.14558938329430132, + "grad_norm": 0.6380495041891001, + "learning_rate": 4.850455136540962e-06, + "loss": 0.6931, + "step": 373 + }, + { + "epoch": 0.14597970335675253, + "grad_norm": 0.6612892303117277, + "learning_rate": 4.863459037711314e-06, + "loss": 0.6664, + "step": 374 + }, + { + "epoch": 0.14637002341920374, + "grad_norm": 0.6977556569819648, + "learning_rate": 4.8764629388816646e-06, + "loss": 0.7102, + "step": 375 + }, + { + "epoch": 0.14676034348165495, + "grad_norm": 0.6623511530531614, + "learning_rate": 4.889466840052016e-06, + "loss": 0.6995, + "step": 376 + }, + { + "epoch": 0.14715066354410616, + "grad_norm": 0.742897974878356, + "learning_rate": 4.902470741222367e-06, + "loss": 0.7182, + "step": 377 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 0.6041954433809472, + "learning_rate": 4.915474642392718e-06, + "loss": 0.6975, + "step": 378 + }, + { + "epoch": 0.14793130366900858, + "grad_norm": 0.6669879116486299, + "learning_rate": 4.9284785435630695e-06, + "loss": 0.6986, + "step": 379 + }, + { + "epoch": 0.1483216237314598, + "grad_norm": 0.6474473153464095, + "learning_rate": 4.941482444733421e-06, + "loss": 0.6981, + "step": 380 + }, + { + "epoch": 0.148711943793911, + "grad_norm": 0.6157495935729166, + "learning_rate": 4.954486345903772e-06, + "loss": 0.7181, + "step": 381 + }, + { + "epoch": 0.14910226385636222, + "grad_norm": 0.7420541130937764, + "learning_rate": 4.967490247074122e-06, + "loss": 0.6874, + "step": 382 + }, + { + "epoch": 0.14949258391881343, + "grad_norm": 0.5658611995468326, + "learning_rate": 4.9804941482444736e-06, + "loss": 0.6915, + "step": 383 + }, + { + "epoch": 0.14988290398126464, + "grad_norm": 0.6612141224030277, + "learning_rate": 4.993498049414825e-06, + "loss": 0.6957, + "step": 384 + }, + { + "epoch": 0.15027322404371585, + "grad_norm": 0.6753904290979974, + "learning_rate": 5.006501950585176e-06, + "loss": 0.6804, + "step": 385 + }, + { + "epoch": 0.15066354410616706, + "grad_norm": 0.6868861718552405, + "learning_rate": 5.019505851755527e-06, + "loss": 0.6809, + "step": 386 + }, + { + "epoch": 0.15105386416861827, + "grad_norm": 0.6673296222884219, + "learning_rate": 5.0325097529258785e-06, + "loss": 0.6702, + "step": 387 + }, + { + "epoch": 0.15144418423106948, + "grad_norm": 0.7310043864485083, + "learning_rate": 5.045513654096229e-06, + "loss": 0.7132, + "step": 388 + }, + { + "epoch": 0.1518345042935207, + "grad_norm": 0.820034028236232, + "learning_rate": 5.05851755526658e-06, + "loss": 0.6936, + "step": 389 + }, + { + "epoch": 0.1522248243559719, + "grad_norm": 0.5952965447687819, + "learning_rate": 5.071521456436932e-06, + "loss": 0.6973, + "step": 390 + }, + { + "epoch": 0.1526151444184231, + "grad_norm": 0.7015575213288309, + "learning_rate": 5.0845253576072826e-06, + "loss": 0.706, + "step": 391 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.6797218254481311, + "learning_rate": 5.097529258777634e-06, + "loss": 0.6962, + "step": 392 + }, + { + "epoch": 0.15339578454332553, + "grad_norm": 0.6857506902309259, + "learning_rate": 5.110533159947985e-06, + "loss": 0.7025, + "step": 393 + }, + { + "epoch": 0.15378610460577674, + "grad_norm": 0.6935276141606291, + "learning_rate": 5.123537061118335e-06, + "loss": 0.7232, + "step": 394 + }, + { + "epoch": 0.15417642466822795, + "grad_norm": 0.7762242877198263, + "learning_rate": 5.1365409622886875e-06, + "loss": 0.6898, + "step": 395 + }, + { + "epoch": 0.15456674473067916, + "grad_norm": 0.6147991427771626, + "learning_rate": 5.149544863459039e-06, + "loss": 0.7119, + "step": 396 + }, + { + "epoch": 0.15495706479313037, + "grad_norm": 0.7795681145619372, + "learning_rate": 5.162548764629389e-06, + "loss": 0.7169, + "step": 397 + }, + { + "epoch": 0.15534738485558158, + "grad_norm": 0.7455857999951102, + "learning_rate": 5.17555266579974e-06, + "loss": 0.6704, + "step": 398 + }, + { + "epoch": 0.1557377049180328, + "grad_norm": 0.7777014329149001, + "learning_rate": 5.188556566970091e-06, + "loss": 0.7177, + "step": 399 + }, + { + "epoch": 0.156128024980484, + "grad_norm": 0.6916685967017158, + "learning_rate": 5.201560468140443e-06, + "loss": 0.7054, + "step": 400 + }, + { + "epoch": 0.15651834504293521, + "grad_norm": 0.7428014642490663, + "learning_rate": 5.214564369310794e-06, + "loss": 0.6711, + "step": 401 + }, + { + "epoch": 0.15690866510538642, + "grad_norm": 0.8345490220024014, + "learning_rate": 5.227568270481144e-06, + "loss": 0.7271, + "step": 402 + }, + { + "epoch": 0.15729898516783763, + "grad_norm": 0.6641144501313596, + "learning_rate": 5.240572171651496e-06, + "loss": 0.6889, + "step": 403 + }, + { + "epoch": 0.15768930523028885, + "grad_norm": 0.8108067083287169, + "learning_rate": 5.253576072821847e-06, + "loss": 0.703, + "step": 404 + }, + { + "epoch": 0.15807962529274006, + "grad_norm": 0.7770548959491698, + "learning_rate": 5.266579973992199e-06, + "loss": 0.6991, + "step": 405 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.7946305248007705, + "learning_rate": 5.279583875162549e-06, + "loss": 0.676, + "step": 406 + }, + { + "epoch": 0.15886026541764248, + "grad_norm": 0.7300238067297642, + "learning_rate": 5.2925877763329005e-06, + "loss": 0.6966, + "step": 407 + }, + { + "epoch": 0.1592505854800937, + "grad_norm": 0.7299065180437201, + "learning_rate": 5.305591677503251e-06, + "loss": 0.6554, + "step": 408 + }, + { + "epoch": 0.1596409055425449, + "grad_norm": 0.6457333698748261, + "learning_rate": 5.318595578673602e-06, + "loss": 0.6952, + "step": 409 + }, + { + "epoch": 0.1600312256049961, + "grad_norm": 0.8093165517829157, + "learning_rate": 5.331599479843954e-06, + "loss": 0.6659, + "step": 410 + }, + { + "epoch": 0.16042154566744732, + "grad_norm": 0.6650829664441164, + "learning_rate": 5.344603381014305e-06, + "loss": 0.6672, + "step": 411 + }, + { + "epoch": 0.16081186572989853, + "grad_norm": 0.8133116344184572, + "learning_rate": 5.357607282184656e-06, + "loss": 0.6953, + "step": 412 + }, + { + "epoch": 0.16120218579234974, + "grad_norm": 0.6553192226364065, + "learning_rate": 5.370611183355007e-06, + "loss": 0.7341, + "step": 413 + }, + { + "epoch": 0.16159250585480095, + "grad_norm": 0.6888577437634752, + "learning_rate": 5.3836150845253575e-06, + "loss": 0.7013, + "step": 414 + }, + { + "epoch": 0.16198282591725216, + "grad_norm": 0.7030196146127015, + "learning_rate": 5.3966189856957095e-06, + "loss": 0.7061, + "step": 415 + }, + { + "epoch": 0.16237314597970337, + "grad_norm": 0.6609822437136145, + "learning_rate": 5.409622886866061e-06, + "loss": 0.7154, + "step": 416 + }, + { + "epoch": 0.16276346604215455, + "grad_norm": 0.6058052496879919, + "learning_rate": 5.422626788036411e-06, + "loss": 0.728, + "step": 417 + }, + { + "epoch": 0.16315378610460576, + "grad_norm": 0.6362907684915222, + "learning_rate": 5.435630689206762e-06, + "loss": 0.6908, + "step": 418 + }, + { + "epoch": 0.16354410616705697, + "grad_norm": 0.638519265366252, + "learning_rate": 5.448634590377113e-06, + "loss": 0.7087, + "step": 419 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.6559231617247335, + "learning_rate": 5.461638491547465e-06, + "loss": 0.6682, + "step": 420 + }, + { + "epoch": 0.1643247462919594, + "grad_norm": 0.7209094740086468, + "learning_rate": 5.474642392717816e-06, + "loss": 0.7375, + "step": 421 + }, + { + "epoch": 0.1647150663544106, + "grad_norm": 0.5940186689594091, + "learning_rate": 5.487646293888167e-06, + "loss": 0.697, + "step": 422 + }, + { + "epoch": 0.16510538641686182, + "grad_norm": 0.6855382043341716, + "learning_rate": 5.500650195058518e-06, + "loss": 0.692, + "step": 423 + }, + { + "epoch": 0.16549570647931303, + "grad_norm": 0.6252237427399469, + "learning_rate": 5.513654096228869e-06, + "loss": 0.6809, + "step": 424 + }, + { + "epoch": 0.16588602654176424, + "grad_norm": 0.6054363630451259, + "learning_rate": 5.526657997399221e-06, + "loss": 0.6602, + "step": 425 + }, + { + "epoch": 0.16627634660421545, + "grad_norm": 0.6781486041858845, + "learning_rate": 5.539661898569571e-06, + "loss": 0.677, + "step": 426 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.5844033011727807, + "learning_rate": 5.552665799739923e-06, + "loss": 0.6613, + "step": 427 + }, + { + "epoch": 0.16705698672911787, + "grad_norm": 0.7093676546402286, + "learning_rate": 5.565669700910273e-06, + "loss": 0.6908, + "step": 428 + }, + { + "epoch": 0.16744730679156908, + "grad_norm": 0.7612608171840209, + "learning_rate": 5.578673602080624e-06, + "loss": 0.652, + "step": 429 + }, + { + "epoch": 0.1678376268540203, + "grad_norm": 0.6828828953362, + "learning_rate": 5.591677503250976e-06, + "loss": 0.7365, + "step": 430 + }, + { + "epoch": 0.1682279469164715, + "grad_norm": 0.6196031955944801, + "learning_rate": 5.6046814044213275e-06, + "loss": 0.6825, + "step": 431 + }, + { + "epoch": 0.1686182669789227, + "grad_norm": 0.7947668287514607, + "learning_rate": 5.617685305591678e-06, + "loss": 0.7066, + "step": 432 + }, + { + "epoch": 0.16900858704137392, + "grad_norm": 0.6343514825339208, + "learning_rate": 5.630689206762029e-06, + "loss": 0.673, + "step": 433 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.8656877612717806, + "learning_rate": 5.6436931079323795e-06, + "loss": 0.7242, + "step": 434 + }, + { + "epoch": 0.16978922716627634, + "grad_norm": 0.7857330534717563, + "learning_rate": 5.656697009102732e-06, + "loss": 0.7037, + "step": 435 + }, + { + "epoch": 0.17017954722872755, + "grad_norm": 0.6518620057402611, + "learning_rate": 5.669700910273083e-06, + "loss": 0.7081, + "step": 436 + }, + { + "epoch": 0.17056986729117876, + "grad_norm": 0.7758421713371635, + "learning_rate": 5.682704811443433e-06, + "loss": 0.6809, + "step": 437 + }, + { + "epoch": 0.17096018735362997, + "grad_norm": 0.7642941206412913, + "learning_rate": 5.6957087126137844e-06, + "loss": 0.6849, + "step": 438 + }, + { + "epoch": 0.17135050741608118, + "grad_norm": 0.7231543413672256, + "learning_rate": 5.708712613784136e-06, + "loss": 0.6844, + "step": 439 + }, + { + "epoch": 0.1717408274785324, + "grad_norm": 0.6800260978786455, + "learning_rate": 5.721716514954488e-06, + "loss": 0.6851, + "step": 440 + }, + { + "epoch": 0.1721311475409836, + "grad_norm": 0.6776831720904736, + "learning_rate": 5.734720416124838e-06, + "loss": 0.6645, + "step": 441 + }, + { + "epoch": 0.1725214676034348, + "grad_norm": 0.8088430759936531, + "learning_rate": 5.747724317295189e-06, + "loss": 0.6844, + "step": 442 + }, + { + "epoch": 0.17291178766588602, + "grad_norm": 0.6964147005106012, + "learning_rate": 5.76072821846554e-06, + "loss": 0.728, + "step": 443 + }, + { + "epoch": 0.17330210772833723, + "grad_norm": 0.6667444255739151, + "learning_rate": 5.773732119635891e-06, + "loss": 0.7117, + "step": 444 + }, + { + "epoch": 0.17369242779078845, + "grad_norm": 0.7278341819389187, + "learning_rate": 5.786736020806243e-06, + "loss": 0.7043, + "step": 445 + }, + { + "epoch": 0.17408274785323966, + "grad_norm": 0.6891410778633429, + "learning_rate": 5.799739921976593e-06, + "loss": 0.6984, + "step": 446 + }, + { + "epoch": 0.17447306791569087, + "grad_norm": 0.7170451890765701, + "learning_rate": 5.812743823146945e-06, + "loss": 0.6875, + "step": 447 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.6956732229950382, + "learning_rate": 5.825747724317296e-06, + "loss": 0.6552, + "step": 448 + }, + { + "epoch": 0.1752537080405933, + "grad_norm": 0.7135345799483731, + "learning_rate": 5.838751625487646e-06, + "loss": 0.6803, + "step": 449 + }, + { + "epoch": 0.1756440281030445, + "grad_norm": 0.7102351695488849, + "learning_rate": 5.851755526657998e-06, + "loss": 0.7023, + "step": 450 + }, + { + "epoch": 0.1760343481654957, + "grad_norm": 0.6997371851017635, + "learning_rate": 5.8647594278283496e-06, + "loss": 0.6749, + "step": 451 + }, + { + "epoch": 0.17642466822794692, + "grad_norm": 0.6546888789651015, + "learning_rate": 5.8777633289987e-06, + "loss": 0.7172, + "step": 452 + }, + { + "epoch": 0.17681498829039813, + "grad_norm": 0.7642250188146371, + "learning_rate": 5.890767230169051e-06, + "loss": 0.6679, + "step": 453 + }, + { + "epoch": 0.17720530835284934, + "grad_norm": 0.7394098881617022, + "learning_rate": 5.9037711313394016e-06, + "loss": 0.6999, + "step": 454 + }, + { + "epoch": 0.17759562841530055, + "grad_norm": 0.5967586984617868, + "learning_rate": 5.916775032509754e-06, + "loss": 0.6518, + "step": 455 + }, + { + "epoch": 0.17798594847775176, + "grad_norm": 0.6888627642917361, + "learning_rate": 5.929778933680105e-06, + "loss": 0.6747, + "step": 456 + }, + { + "epoch": 0.17837626854020297, + "grad_norm": 0.6840028769850457, + "learning_rate": 5.942782834850456e-06, + "loss": 0.7167, + "step": 457 + }, + { + "epoch": 0.17876658860265418, + "grad_norm": 0.7058007813929633, + "learning_rate": 5.9557867360208065e-06, + "loss": 0.7129, + "step": 458 + }, + { + "epoch": 0.1791569086651054, + "grad_norm": 0.6207311129485039, + "learning_rate": 5.968790637191158e-06, + "loss": 0.6581, + "step": 459 + }, + { + "epoch": 0.1795472287275566, + "grad_norm": 0.7404154014295216, + "learning_rate": 5.98179453836151e-06, + "loss": 0.6896, + "step": 460 + }, + { + "epoch": 0.1799375487900078, + "grad_norm": 0.6271720330543054, + "learning_rate": 5.99479843953186e-06, + "loss": 0.7043, + "step": 461 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.7055926066087143, + "learning_rate": 6.007802340702211e-06, + "loss": 0.7215, + "step": 462 + }, + { + "epoch": 0.18071818891491023, + "grad_norm": 0.6462738644930203, + "learning_rate": 6.020806241872562e-06, + "loss": 0.6875, + "step": 463 + }, + { + "epoch": 0.18110850897736144, + "grad_norm": 0.7702093005284204, + "learning_rate": 6.033810143042913e-06, + "loss": 0.6826, + "step": 464 + }, + { + "epoch": 0.18149882903981265, + "grad_norm": 0.6778824067331849, + "learning_rate": 6.046814044213265e-06, + "loss": 0.6908, + "step": 465 + }, + { + "epoch": 0.18188914910226386, + "grad_norm": 0.7410012890422012, + "learning_rate": 6.059817945383616e-06, + "loss": 0.6729, + "step": 466 + }, + { + "epoch": 0.18227946916471507, + "grad_norm": 0.7014829480626635, + "learning_rate": 6.072821846553967e-06, + "loss": 0.6951, + "step": 467 + }, + { + "epoch": 0.18266978922716628, + "grad_norm": 0.6695910615788945, + "learning_rate": 6.085825747724318e-06, + "loss": 0.699, + "step": 468 + }, + { + "epoch": 0.1830601092896175, + "grad_norm": 0.773745351150194, + "learning_rate": 6.098829648894668e-06, + "loss": 0.6923, + "step": 469 + }, + { + "epoch": 0.1834504293520687, + "grad_norm": 0.7737245363076634, + "learning_rate": 6.11183355006502e-06, + "loss": 0.6886, + "step": 470 + }, + { + "epoch": 0.18384074941451992, + "grad_norm": 0.6926336383504857, + "learning_rate": 6.124837451235372e-06, + "loss": 0.6872, + "step": 471 + }, + { + "epoch": 0.18423106947697113, + "grad_norm": 0.8427437524211503, + "learning_rate": 6.137841352405722e-06, + "loss": 0.6829, + "step": 472 + }, + { + "epoch": 0.18462138953942234, + "grad_norm": 0.7647482129798104, + "learning_rate": 6.150845253576073e-06, + "loss": 0.7092, + "step": 473 + }, + { + "epoch": 0.18501170960187355, + "grad_norm": 0.6757724537474735, + "learning_rate": 6.1638491547464245e-06, + "loss": 0.6836, + "step": 474 + }, + { + "epoch": 0.18540202966432476, + "grad_norm": 0.753008958374865, + "learning_rate": 6.176853055916776e-06, + "loss": 0.697, + "step": 475 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 0.6943172590100756, + "learning_rate": 6.189856957087127e-06, + "loss": 0.6988, + "step": 476 + }, + { + "epoch": 0.18618266978922718, + "grad_norm": 0.6708570122111148, + "learning_rate": 6.202860858257478e-06, + "loss": 0.7204, + "step": 477 + }, + { + "epoch": 0.1865729898516784, + "grad_norm": 0.7885183999652517, + "learning_rate": 6.2158647594278285e-06, + "loss": 0.6737, + "step": 478 + }, + { + "epoch": 0.1869633099141296, + "grad_norm": 0.6329726308874478, + "learning_rate": 6.22886866059818e-06, + "loss": 0.6546, + "step": 479 + }, + { + "epoch": 0.1873536299765808, + "grad_norm": 0.8132270878793747, + "learning_rate": 6.241872561768532e-06, + "loss": 0.6943, + "step": 480 + }, + { + "epoch": 0.187743950039032, + "grad_norm": 0.6842213620815537, + "learning_rate": 6.254876462938882e-06, + "loss": 0.6831, + "step": 481 + }, + { + "epoch": 0.1881342701014832, + "grad_norm": 0.8812732666749945, + "learning_rate": 6.2678803641092335e-06, + "loss": 0.6706, + "step": 482 + }, + { + "epoch": 0.1885245901639344, + "grad_norm": 0.7169509439207383, + "learning_rate": 6.280884265279585e-06, + "loss": 0.6698, + "step": 483 + }, + { + "epoch": 0.18891491022638562, + "grad_norm": 0.7154077859407564, + "learning_rate": 6.293888166449935e-06, + "loss": 0.6886, + "step": 484 + }, + { + "epoch": 0.18930523028883683, + "grad_norm": 0.8780669092042084, + "learning_rate": 6.306892067620286e-06, + "loss": 0.6983, + "step": 485 + }, + { + "epoch": 0.18969555035128804, + "grad_norm": 0.6840090357234536, + "learning_rate": 6.319895968790638e-06, + "loss": 0.6437, + "step": 486 + }, + { + "epoch": 0.19008587041373926, + "grad_norm": 0.75958473094997, + "learning_rate": 6.332899869960989e-06, + "loss": 0.6717, + "step": 487 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.7626727741463003, + "learning_rate": 6.34590377113134e-06, + "loss": 0.6924, + "step": 488 + }, + { + "epoch": 0.19086651053864168, + "grad_norm": 0.6393276363382827, + "learning_rate": 6.35890767230169e-06, + "loss": 0.6961, + "step": 489 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.7066063828436654, + "learning_rate": 6.371911573472042e-06, + "loss": 0.7118, + "step": 490 + }, + { + "epoch": 0.1916471506635441, + "grad_norm": 0.6700251316529224, + "learning_rate": 6.384915474642394e-06, + "loss": 0.7041, + "step": 491 + }, + { + "epoch": 0.1920374707259953, + "grad_norm": 0.8431909756717771, + "learning_rate": 6.397919375812744e-06, + "loss": 0.7084, + "step": 492 + }, + { + "epoch": 0.19242779078844652, + "grad_norm": 0.658974937463215, + "learning_rate": 6.410923276983095e-06, + "loss": 0.7086, + "step": 493 + }, + { + "epoch": 0.19281811085089773, + "grad_norm": 0.6689378610398877, + "learning_rate": 6.4239271781534465e-06, + "loss": 0.6919, + "step": 494 + }, + { + "epoch": 0.19320843091334894, + "grad_norm": 0.6438557217090913, + "learning_rate": 6.436931079323797e-06, + "loss": 0.6976, + "step": 495 + }, + { + "epoch": 0.19359875097580015, + "grad_norm": 0.7025755977338289, + "learning_rate": 6.449934980494149e-06, + "loss": 0.6868, + "step": 496 + }, + { + "epoch": 0.19398907103825136, + "grad_norm": 0.6038494411514885, + "learning_rate": 6.4629388816645e-06, + "loss": 0.7095, + "step": 497 + }, + { + "epoch": 0.19437939110070257, + "grad_norm": 0.6192735119436898, + "learning_rate": 6.475942782834851e-06, + "loss": 0.6836, + "step": 498 + }, + { + "epoch": 0.19476971116315378, + "grad_norm": 0.6661957304274418, + "learning_rate": 6.488946684005202e-06, + "loss": 0.6718, + "step": 499 + }, + { + "epoch": 0.195160031225605, + "grad_norm": 0.8541975731960628, + "learning_rate": 6.501950585175553e-06, + "loss": 0.7023, + "step": 500 + }, + { + "epoch": 0.1955503512880562, + "grad_norm": 0.7019624355393748, + "learning_rate": 6.514954486345904e-06, + "loss": 0.6552, + "step": 501 + }, + { + "epoch": 0.1959406713505074, + "grad_norm": 0.7757777329975457, + "learning_rate": 6.5279583875162555e-06, + "loss": 0.706, + "step": 502 + }, + { + "epoch": 0.19633099141295862, + "grad_norm": 0.637645100551664, + "learning_rate": 6.540962288686607e-06, + "loss": 0.6727, + "step": 503 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 0.6339423576323707, + "learning_rate": 6.553966189856957e-06, + "loss": 0.6876, + "step": 504 + }, + { + "epoch": 0.19711163153786104, + "grad_norm": 0.6169127300833769, + "learning_rate": 6.566970091027308e-06, + "loss": 0.6736, + "step": 505 + }, + { + "epoch": 0.19750195160031225, + "grad_norm": 0.7862111582584854, + "learning_rate": 6.5799739921976604e-06, + "loss": 0.7125, + "step": 506 + }, + { + "epoch": 0.19789227166276346, + "grad_norm": 0.682364583269862, + "learning_rate": 6.592977893368011e-06, + "loss": 0.6831, + "step": 507 + }, + { + "epoch": 0.19828259172521467, + "grad_norm": 0.6392359430148575, + "learning_rate": 6.605981794538362e-06, + "loss": 0.6856, + "step": 508 + }, + { + "epoch": 0.19867291178766588, + "grad_norm": 0.8264187949340407, + "learning_rate": 6.618985695708713e-06, + "loss": 0.6538, + "step": 509 + }, + { + "epoch": 0.1990632318501171, + "grad_norm": 0.6567683281662096, + "learning_rate": 6.631989596879064e-06, + "loss": 0.6535, + "step": 510 + }, + { + "epoch": 0.1994535519125683, + "grad_norm": 0.7325848416555424, + "learning_rate": 6.644993498049416e-06, + "loss": 0.6982, + "step": 511 + }, + { + "epoch": 0.19984387197501952, + "grad_norm": 0.6949649170398536, + "learning_rate": 6.657997399219767e-06, + "loss": 0.6794, + "step": 512 + }, + { + "epoch": 0.20023419203747073, + "grad_norm": 0.7070547142881645, + "learning_rate": 6.671001300390117e-06, + "loss": 0.675, + "step": 513 + }, + { + "epoch": 0.20062451209992194, + "grad_norm": 0.7900133870864973, + "learning_rate": 6.6840052015604686e-06, + "loss": 0.6869, + "step": 514 + }, + { + "epoch": 0.20101483216237315, + "grad_norm": 0.7203436590836644, + "learning_rate": 6.697009102730819e-06, + "loss": 0.6806, + "step": 515 + }, + { + "epoch": 0.20140515222482436, + "grad_norm": 0.7097422738658729, + "learning_rate": 6.710013003901171e-06, + "loss": 0.6723, + "step": 516 + }, + { + "epoch": 0.20179547228727557, + "grad_norm": 0.8589940431048299, + "learning_rate": 6.723016905071522e-06, + "loss": 0.6918, + "step": 517 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.7729632184216302, + "learning_rate": 6.736020806241873e-06, + "loss": 0.663, + "step": 518 + }, + { + "epoch": 0.202576112412178, + "grad_norm": 0.7710184473525479, + "learning_rate": 6.749024707412224e-06, + "loss": 0.6802, + "step": 519 + }, + { + "epoch": 0.2029664324746292, + "grad_norm": 0.6583416730789263, + "learning_rate": 6.762028608582575e-06, + "loss": 0.6637, + "step": 520 + }, + { + "epoch": 0.2033567525370804, + "grad_norm": 0.6731241837343928, + "learning_rate": 6.775032509752927e-06, + "loss": 0.7002, + "step": 521 + }, + { + "epoch": 0.20374707259953162, + "grad_norm": 0.7351016738937797, + "learning_rate": 6.7880364109232776e-06, + "loss": 0.7087, + "step": 522 + }, + { + "epoch": 0.20413739266198283, + "grad_norm": 0.6684219507758046, + "learning_rate": 6.801040312093629e-06, + "loss": 0.6942, + "step": 523 + }, + { + "epoch": 0.20452771272443404, + "grad_norm": 0.6798528895830548, + "learning_rate": 6.814044213263979e-06, + "loss": 0.6677, + "step": 524 + }, + { + "epoch": 0.20491803278688525, + "grad_norm": 0.6922298937810374, + "learning_rate": 6.82704811443433e-06, + "loss": 0.6842, + "step": 525 + }, + { + "epoch": 0.20530835284933646, + "grad_norm": 0.6033244997300722, + "learning_rate": 6.8400520156046825e-06, + "loss": 0.6873, + "step": 526 + }, + { + "epoch": 0.20569867291178767, + "grad_norm": 0.6976679604255536, + "learning_rate": 6.853055916775033e-06, + "loss": 0.6869, + "step": 527 + }, + { + "epoch": 0.20608899297423888, + "grad_norm": 0.6790011216657255, + "learning_rate": 6.866059817945384e-06, + "loss": 0.6865, + "step": 528 + }, + { + "epoch": 0.2064793130366901, + "grad_norm": 0.6881196756067273, + "learning_rate": 6.879063719115735e-06, + "loss": 0.6902, + "step": 529 + }, + { + "epoch": 0.2068696330991413, + "grad_norm": 0.7151649558986662, + "learning_rate": 6.892067620286086e-06, + "loss": 0.7039, + "step": 530 + }, + { + "epoch": 0.20725995316159251, + "grad_norm": 0.6963054399524627, + "learning_rate": 6.905071521456438e-06, + "loss": 0.716, + "step": 531 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.6294448516491707, + "learning_rate": 6.918075422626789e-06, + "loss": 0.7051, + "step": 532 + }, + { + "epoch": 0.20804059328649493, + "grad_norm": 0.6552179409639254, + "learning_rate": 6.931079323797139e-06, + "loss": 0.7078, + "step": 533 + }, + { + "epoch": 0.20843091334894615, + "grad_norm": 0.6637244508676176, + "learning_rate": 6.944083224967491e-06, + "loss": 0.7033, + "step": 534 + }, + { + "epoch": 0.20882123341139736, + "grad_norm": 0.7191066394821087, + "learning_rate": 6.957087126137841e-06, + "loss": 0.6467, + "step": 535 + }, + { + "epoch": 0.20921155347384857, + "grad_norm": 0.6931703632710252, + "learning_rate": 6.970091027308193e-06, + "loss": 0.6728, + "step": 536 + }, + { + "epoch": 0.20960187353629978, + "grad_norm": 0.7084017483825084, + "learning_rate": 6.983094928478544e-06, + "loss": 0.6812, + "step": 537 + }, + { + "epoch": 0.209992193598751, + "grad_norm": 0.6181772303871396, + "learning_rate": 6.9960988296488955e-06, + "loss": 0.6508, + "step": 538 + }, + { + "epoch": 0.2103825136612022, + "grad_norm": 0.6378845167979263, + "learning_rate": 7.009102730819246e-06, + "loss": 0.6544, + "step": 539 + }, + { + "epoch": 0.2107728337236534, + "grad_norm": 0.742163162266714, + "learning_rate": 7.022106631989597e-06, + "loss": 0.7027, + "step": 540 + }, + { + "epoch": 0.21116315378610462, + "grad_norm": 0.664192883142808, + "learning_rate": 7.035110533159949e-06, + "loss": 0.716, + "step": 541 + }, + { + "epoch": 0.21155347384855583, + "grad_norm": 0.6369618663469849, + "learning_rate": 7.0481144343303e-06, + "loss": 0.6511, + "step": 542 + }, + { + "epoch": 0.21194379391100704, + "grad_norm": 0.8310086343564795, + "learning_rate": 7.061118335500651e-06, + "loss": 0.651, + "step": 543 + }, + { + "epoch": 0.21233411397345825, + "grad_norm": 0.5752503019422206, + "learning_rate": 7.074122236671001e-06, + "loss": 0.6784, + "step": 544 + }, + { + "epoch": 0.21272443403590943, + "grad_norm": 0.8068532630145403, + "learning_rate": 7.0871261378413525e-06, + "loss": 0.6362, + "step": 545 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 0.7952047471767161, + "learning_rate": 7.1001300390117045e-06, + "loss": 0.6932, + "step": 546 + }, + { + "epoch": 0.21350507416081185, + "grad_norm": 0.6751585312331592, + "learning_rate": 7.113133940182056e-06, + "loss": 0.7029, + "step": 547 + }, + { + "epoch": 0.21389539422326306, + "grad_norm": 0.7876249183234859, + "learning_rate": 7.126137841352406e-06, + "loss": 0.6933, + "step": 548 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 0.7556694888842841, + "learning_rate": 7.139141742522757e-06, + "loss": 0.7096, + "step": 549 + }, + { + "epoch": 0.21467603434816548, + "grad_norm": 0.6814827600067608, + "learning_rate": 7.152145643693108e-06, + "loss": 0.6638, + "step": 550 + }, + { + "epoch": 0.2150663544106167, + "grad_norm": 0.8522912901109524, + "learning_rate": 7.16514954486346e-06, + "loss": 0.7049, + "step": 551 + }, + { + "epoch": 0.2154566744730679, + "grad_norm": 0.8219791628667992, + "learning_rate": 7.178153446033811e-06, + "loss": 0.6762, + "step": 552 + }, + { + "epoch": 0.21584699453551912, + "grad_norm": 0.6789278455197132, + "learning_rate": 7.1911573472041614e-06, + "loss": 0.6643, + "step": 553 + }, + { + "epoch": 0.21623731459797033, + "grad_norm": 0.8344774837554031, + "learning_rate": 7.204161248374513e-06, + "loss": 0.6861, + "step": 554 + }, + { + "epoch": 0.21662763466042154, + "grad_norm": 0.7728697555685538, + "learning_rate": 7.217165149544864e-06, + "loss": 0.6933, + "step": 555 + }, + { + "epoch": 0.21701795472287275, + "grad_norm": 0.8348391417478368, + "learning_rate": 7.230169050715216e-06, + "loss": 0.7213, + "step": 556 + }, + { + "epoch": 0.21740827478532396, + "grad_norm": 0.7529576748448621, + "learning_rate": 7.243172951885566e-06, + "loss": 0.6706, + "step": 557 + }, + { + "epoch": 0.21779859484777517, + "grad_norm": 0.8067140765203192, + "learning_rate": 7.256176853055918e-06, + "loss": 0.6711, + "step": 558 + }, + { + "epoch": 0.21818891491022638, + "grad_norm": 0.7813767482943029, + "learning_rate": 7.269180754226268e-06, + "loss": 0.711, + "step": 559 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.7336613450554639, + "learning_rate": 7.282184655396619e-06, + "loss": 0.681, + "step": 560 + }, + { + "epoch": 0.2189695550351288, + "grad_norm": 0.8574035869178992, + "learning_rate": 7.295188556566971e-06, + "loss": 0.7032, + "step": 561 + }, + { + "epoch": 0.21935987509758, + "grad_norm": 0.8849126341628132, + "learning_rate": 7.308192457737322e-06, + "loss": 0.6613, + "step": 562 + }, + { + "epoch": 0.21975019516003122, + "grad_norm": 0.8244168127476741, + "learning_rate": 7.321196358907673e-06, + "loss": 0.6504, + "step": 563 + }, + { + "epoch": 0.22014051522248243, + "grad_norm": 0.9521052539832859, + "learning_rate": 7.334200260078024e-06, + "loss": 0.6694, + "step": 564 + }, + { + "epoch": 0.22053083528493364, + "grad_norm": 0.6430048151482227, + "learning_rate": 7.3472041612483745e-06, + "loss": 0.699, + "step": 565 + }, + { + "epoch": 0.22092115534738485, + "grad_norm": 0.7794835440003823, + "learning_rate": 7.360208062418727e-06, + "loss": 0.6955, + "step": 566 + }, + { + "epoch": 0.22131147540983606, + "grad_norm": 0.6700933656412009, + "learning_rate": 7.373211963589078e-06, + "loss": 0.6772, + "step": 567 + }, + { + "epoch": 0.22170179547228727, + "grad_norm": 0.7812332289865539, + "learning_rate": 7.386215864759428e-06, + "loss": 0.6793, + "step": 568 + }, + { + "epoch": 0.22209211553473848, + "grad_norm": 0.8661533151569701, + "learning_rate": 7.3992197659297794e-06, + "loss": 0.7011, + "step": 569 + }, + { + "epoch": 0.2224824355971897, + "grad_norm": 0.7384852638393012, + "learning_rate": 7.41222366710013e-06, + "loss": 0.6401, + "step": 570 + }, + { + "epoch": 0.2228727556596409, + "grad_norm": 0.7830865092596122, + "learning_rate": 7.425227568270482e-06, + "loss": 0.6804, + "step": 571 + }, + { + "epoch": 0.2232630757220921, + "grad_norm": 0.9018813684885101, + "learning_rate": 7.438231469440833e-06, + "loss": 0.712, + "step": 572 + }, + { + "epoch": 0.22365339578454332, + "grad_norm": 0.6365139323837024, + "learning_rate": 7.451235370611184e-06, + "loss": 0.6563, + "step": 573 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.931714961619935, + "learning_rate": 7.464239271781535e-06, + "loss": 0.6616, + "step": 574 + }, + { + "epoch": 0.22443403590944574, + "grad_norm": 1.0458267270642925, + "learning_rate": 7.477243172951886e-06, + "loss": 0.6733, + "step": 575 + }, + { + "epoch": 0.22482435597189696, + "grad_norm": 0.7148218543145226, + "learning_rate": 7.490247074122238e-06, + "loss": 0.7068, + "step": 576 + }, + { + "epoch": 0.22521467603434817, + "grad_norm": 0.8687018916375785, + "learning_rate": 7.503250975292588e-06, + "loss": 0.6928, + "step": 577 + }, + { + "epoch": 0.22560499609679938, + "grad_norm": 0.8257968666153, + "learning_rate": 7.51625487646294e-06, + "loss": 0.6685, + "step": 578 + }, + { + "epoch": 0.2259953161592506, + "grad_norm": 0.6747225390328841, + "learning_rate": 7.52925877763329e-06, + "loss": 0.6479, + "step": 579 + }, + { + "epoch": 0.2263856362217018, + "grad_norm": 0.7415971655558029, + "learning_rate": 7.542262678803641e-06, + "loss": 0.6795, + "step": 580 + }, + { + "epoch": 0.226775956284153, + "grad_norm": 0.7455507186445675, + "learning_rate": 7.555266579973993e-06, + "loss": 0.6409, + "step": 581 + }, + { + "epoch": 0.22716627634660422, + "grad_norm": 0.7532276228654232, + "learning_rate": 7.5682704811443446e-06, + "loss": 0.6619, + "step": 582 + }, + { + "epoch": 0.22755659640905543, + "grad_norm": 0.8031874493082442, + "learning_rate": 7.581274382314695e-06, + "loss": 0.6827, + "step": 583 + }, + { + "epoch": 0.22794691647150664, + "grad_norm": 0.8211754697124102, + "learning_rate": 7.594278283485046e-06, + "loss": 0.6614, + "step": 584 + }, + { + "epoch": 0.22833723653395785, + "grad_norm": 0.9613272349978974, + "learning_rate": 7.6072821846553966e-06, + "loss": 0.6701, + "step": 585 + }, + { + "epoch": 0.22872755659640906, + "grad_norm": 0.8327054370636566, + "learning_rate": 7.620286085825749e-06, + "loss": 0.6686, + "step": 586 + }, + { + "epoch": 0.22911787665886027, + "grad_norm": 0.7984978954285141, + "learning_rate": 7.633289986996099e-06, + "loss": 0.6609, + "step": 587 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.8536021170930879, + "learning_rate": 7.64629388816645e-06, + "loss": 0.7148, + "step": 588 + }, + { + "epoch": 0.2298985167837627, + "grad_norm": 0.8456999282848603, + "learning_rate": 7.659297789336801e-06, + "loss": 0.6739, + "step": 589 + }, + { + "epoch": 0.2302888368462139, + "grad_norm": 0.7227232353260842, + "learning_rate": 7.672301690507153e-06, + "loss": 0.7187, + "step": 590 + }, + { + "epoch": 0.2306791569086651, + "grad_norm": 1.0104341665487977, + "learning_rate": 7.685305591677504e-06, + "loss": 0.6859, + "step": 591 + }, + { + "epoch": 0.23106947697111632, + "grad_norm": 0.6706668696204728, + "learning_rate": 7.698309492847855e-06, + "loss": 0.6998, + "step": 592 + }, + { + "epoch": 0.23145979703356753, + "grad_norm": 0.7840325631858982, + "learning_rate": 7.711313394018206e-06, + "loss": 0.6596, + "step": 593 + }, + { + "epoch": 0.23185011709601874, + "grad_norm": 0.8466355270727574, + "learning_rate": 7.724317295188558e-06, + "loss": 0.6553, + "step": 594 + }, + { + "epoch": 0.23224043715846995, + "grad_norm": 0.7860469897427367, + "learning_rate": 7.737321196358907e-06, + "loss": 0.6678, + "step": 595 + }, + { + "epoch": 0.23263075722092116, + "grad_norm": 0.9066289325406865, + "learning_rate": 7.75032509752926e-06, + "loss": 0.6795, + "step": 596 + }, + { + "epoch": 0.23302107728337237, + "grad_norm": 0.7678425695736883, + "learning_rate": 7.763328998699611e-06, + "loss": 0.6906, + "step": 597 + }, + { + "epoch": 0.23341139734582358, + "grad_norm": 0.6783622013197042, + "learning_rate": 7.776332899869961e-06, + "loss": 0.649, + "step": 598 + }, + { + "epoch": 0.2338017174082748, + "grad_norm": 0.7026792125688368, + "learning_rate": 7.789336801040312e-06, + "loss": 0.6535, + "step": 599 + }, + { + "epoch": 0.234192037470726, + "grad_norm": 0.752007376190357, + "learning_rate": 7.802340702210663e-06, + "loss": 0.6951, + "step": 600 + }, + { + "epoch": 0.23458235753317722, + "grad_norm": 0.8625059450755407, + "learning_rate": 7.815344603381016e-06, + "loss": 0.6856, + "step": 601 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.7355670907672346, + "learning_rate": 7.828348504551366e-06, + "loss": 0.6673, + "step": 602 + }, + { + "epoch": 0.23536299765807964, + "grad_norm": 0.724531415276145, + "learning_rate": 7.841352405721717e-06, + "loss": 0.6808, + "step": 603 + }, + { + "epoch": 0.23575331772053085, + "grad_norm": 0.7033345622813093, + "learning_rate": 7.854356306892068e-06, + "loss": 0.6624, + "step": 604 + }, + { + "epoch": 0.23614363778298206, + "grad_norm": 0.834400146081329, + "learning_rate": 7.86736020806242e-06, + "loss": 0.6597, + "step": 605 + }, + { + "epoch": 0.23653395784543327, + "grad_norm": 0.6954843287234742, + "learning_rate": 7.88036410923277e-06, + "loss": 0.6683, + "step": 606 + }, + { + "epoch": 0.23692427790788448, + "grad_norm": 0.8370164086213431, + "learning_rate": 7.893368010403122e-06, + "loss": 0.7034, + "step": 607 + }, + { + "epoch": 0.2373145979703357, + "grad_norm": 0.700282501575733, + "learning_rate": 7.906371911573473e-06, + "loss": 0.649, + "step": 608 + }, + { + "epoch": 0.23770491803278687, + "grad_norm": 0.8043043566093842, + "learning_rate": 7.919375812743824e-06, + "loss": 0.6187, + "step": 609 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.7712335861628556, + "learning_rate": 7.932379713914174e-06, + "loss": 0.6839, + "step": 610 + }, + { + "epoch": 0.2384855581576893, + "grad_norm": 0.6806250810150324, + "learning_rate": 7.945383615084527e-06, + "loss": 0.72, + "step": 611 + }, + { + "epoch": 0.2388758782201405, + "grad_norm": 0.7308517764052085, + "learning_rate": 7.958387516254878e-06, + "loss": 0.6856, + "step": 612 + }, + { + "epoch": 0.2392661982825917, + "grad_norm": 0.722567191086853, + "learning_rate": 7.971391417425228e-06, + "loss": 0.6955, + "step": 613 + }, + { + "epoch": 0.23965651834504292, + "grad_norm": 0.6970117787376635, + "learning_rate": 7.984395318595579e-06, + "loss": 0.6824, + "step": 614 + }, + { + "epoch": 0.24004683840749413, + "grad_norm": 0.6186780938477237, + "learning_rate": 7.99739921976593e-06, + "loss": 0.6775, + "step": 615 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.8741924581544108, + "learning_rate": 8.010403120936281e-06, + "loss": 0.6891, + "step": 616 + }, + { + "epoch": 0.24082747853239655, + "grad_norm": 0.5953233634966614, + "learning_rate": 8.023407022106633e-06, + "loss": 0.674, + "step": 617 + }, + { + "epoch": 0.24121779859484777, + "grad_norm": 0.6515178420429902, + "learning_rate": 8.036410923276984e-06, + "loss": 0.6391, + "step": 618 + }, + { + "epoch": 0.24160811865729898, + "grad_norm": 0.8120246445767475, + "learning_rate": 8.049414824447335e-06, + "loss": 0.69, + "step": 619 + }, + { + "epoch": 0.2419984387197502, + "grad_norm": 0.6325361019907849, + "learning_rate": 8.062418725617686e-06, + "loss": 0.6813, + "step": 620 + }, + { + "epoch": 0.2423887587822014, + "grad_norm": 0.8300174342195809, + "learning_rate": 8.075422626788037e-06, + "loss": 0.6885, + "step": 621 + }, + { + "epoch": 0.2427790788446526, + "grad_norm": 0.6632809402323055, + "learning_rate": 8.088426527958389e-06, + "loss": 0.6838, + "step": 622 + }, + { + "epoch": 0.24316939890710382, + "grad_norm": 0.6553484966485561, + "learning_rate": 8.10143042912874e-06, + "loss": 0.6597, + "step": 623 + }, + { + "epoch": 0.24355971896955503, + "grad_norm": 0.7163490373419914, + "learning_rate": 8.11443433029909e-06, + "loss": 0.6818, + "step": 624 + }, + { + "epoch": 0.24395003903200624, + "grad_norm": 0.7104530018773281, + "learning_rate": 8.12743823146944e-06, + "loss": 0.6821, + "step": 625 + }, + { + "epoch": 0.24434035909445745, + "grad_norm": 0.7237359496978976, + "learning_rate": 8.140442132639794e-06, + "loss": 0.7025, + "step": 626 + }, + { + "epoch": 0.24473067915690866, + "grad_norm": 0.6346962134195924, + "learning_rate": 8.153446033810145e-06, + "loss": 0.6495, + "step": 627 + }, + { + "epoch": 0.24512099921935987, + "grad_norm": 0.7436348403887936, + "learning_rate": 8.166449934980494e-06, + "loss": 0.6665, + "step": 628 + }, + { + "epoch": 0.24551131928181108, + "grad_norm": 0.8109631183737521, + "learning_rate": 8.179453836150846e-06, + "loss": 0.676, + "step": 629 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 0.6830968903005815, + "learning_rate": 8.192457737321197e-06, + "loss": 0.6932, + "step": 630 + }, + { + "epoch": 0.2462919594067135, + "grad_norm": 0.6248161130375992, + "learning_rate": 8.205461638491548e-06, + "loss": 0.6445, + "step": 631 + }, + { + "epoch": 0.2466822794691647, + "grad_norm": 0.8527442503260286, + "learning_rate": 8.2184655396619e-06, + "loss": 0.6922, + "step": 632 + }, + { + "epoch": 0.24707259953161592, + "grad_norm": 0.7086527904259882, + "learning_rate": 8.23146944083225e-06, + "loss": 0.6477, + "step": 633 + }, + { + "epoch": 0.24746291959406713, + "grad_norm": 0.7235766265995587, + "learning_rate": 8.244473342002602e-06, + "loss": 0.6603, + "step": 634 + }, + { + "epoch": 0.24785323965651834, + "grad_norm": 0.6326559078928267, + "learning_rate": 8.257477243172953e-06, + "loss": 0.6557, + "step": 635 + }, + { + "epoch": 0.24824355971896955, + "grad_norm": 0.6479495682871395, + "learning_rate": 8.270481144343304e-06, + "loss": 0.681, + "step": 636 + }, + { + "epoch": 0.24863387978142076, + "grad_norm": 0.6418525298353058, + "learning_rate": 8.283485045513655e-06, + "loss": 0.6946, + "step": 637 + }, + { + "epoch": 0.24902419984387197, + "grad_norm": 0.6614067676690253, + "learning_rate": 8.296488946684007e-06, + "loss": 0.6665, + "step": 638 + }, + { + "epoch": 0.24941451990632318, + "grad_norm": 0.7151489091754751, + "learning_rate": 8.309492847854356e-06, + "loss": 0.7116, + "step": 639 + }, + { + "epoch": 0.2498048399687744, + "grad_norm": 0.6643808891330738, + "learning_rate": 8.322496749024707e-06, + "loss": 0.6683, + "step": 640 + }, + { + "epoch": 0.2501951600312256, + "grad_norm": 0.7099929183586365, + "learning_rate": 8.33550065019506e-06, + "loss": 0.6808, + "step": 641 + }, + { + "epoch": 0.2505854800936768, + "grad_norm": 0.6581897349392635, + "learning_rate": 8.34850455136541e-06, + "loss": 0.6789, + "step": 642 + }, + { + "epoch": 0.250975800156128, + "grad_norm": 0.821463699173952, + "learning_rate": 8.361508452535761e-06, + "loss": 0.6877, + "step": 643 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.7133631330833353, + "learning_rate": 8.374512353706112e-06, + "loss": 0.6882, + "step": 644 + }, + { + "epoch": 0.25175644028103045, + "grad_norm": 0.7229141809926442, + "learning_rate": 8.387516254876464e-06, + "loss": 0.6864, + "step": 645 + }, + { + "epoch": 0.25214676034348166, + "grad_norm": 0.8050161111002901, + "learning_rate": 8.400520156046815e-06, + "loss": 0.679, + "step": 646 + }, + { + "epoch": 0.25253708040593287, + "grad_norm": 0.6783588451512171, + "learning_rate": 8.413524057217166e-06, + "loss": 0.6398, + "step": 647 + }, + { + "epoch": 0.2529274004683841, + "grad_norm": 0.788911059656061, + "learning_rate": 8.426527958387517e-06, + "loss": 0.6608, + "step": 648 + }, + { + "epoch": 0.2533177205308353, + "grad_norm": 0.6880810256580481, + "learning_rate": 8.439531859557868e-06, + "loss": 0.6919, + "step": 649 + }, + { + "epoch": 0.2537080405932865, + "grad_norm": 0.7359752181468586, + "learning_rate": 8.452535760728218e-06, + "loss": 0.7104, + "step": 650 + }, + { + "epoch": 0.2540983606557377, + "grad_norm": 0.7965424349723658, + "learning_rate": 8.465539661898571e-06, + "loss": 0.6644, + "step": 651 + }, + { + "epoch": 0.2544886807181889, + "grad_norm": 0.7066870597276812, + "learning_rate": 8.478543563068922e-06, + "loss": 0.6712, + "step": 652 + }, + { + "epoch": 0.25487900078064013, + "grad_norm": 0.8170357778131482, + "learning_rate": 8.491547464239272e-06, + "loss": 0.674, + "step": 653 + }, + { + "epoch": 0.25526932084309134, + "grad_norm": 0.8043138996508209, + "learning_rate": 8.504551365409623e-06, + "loss": 0.6686, + "step": 654 + }, + { + "epoch": 0.25565964090554255, + "grad_norm": 0.7727984691084482, + "learning_rate": 8.517555266579974e-06, + "loss": 0.6502, + "step": 655 + }, + { + "epoch": 0.25604996096799376, + "grad_norm": 0.8816690402347234, + "learning_rate": 8.530559167750327e-06, + "loss": 0.6706, + "step": 656 + }, + { + "epoch": 0.25644028103044497, + "grad_norm": 0.7028012659329752, + "learning_rate": 8.543563068920677e-06, + "loss": 0.6551, + "step": 657 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.7106147812533635, + "learning_rate": 8.556566970091028e-06, + "loss": 0.6793, + "step": 658 + }, + { + "epoch": 0.2572209211553474, + "grad_norm": 0.7047229227861583, + "learning_rate": 8.569570871261379e-06, + "loss": 0.6856, + "step": 659 + }, + { + "epoch": 0.2576112412177986, + "grad_norm": 0.6688575751212004, + "learning_rate": 8.58257477243173e-06, + "loss": 0.6486, + "step": 660 + }, + { + "epoch": 0.2580015612802498, + "grad_norm": 0.7871674093947428, + "learning_rate": 8.595578673602082e-06, + "loss": 0.6789, + "step": 661 + }, + { + "epoch": 0.258391881342701, + "grad_norm": 0.6978733988031697, + "learning_rate": 8.608582574772433e-06, + "loss": 0.6622, + "step": 662 + }, + { + "epoch": 0.25878220140515223, + "grad_norm": 0.7290731839563414, + "learning_rate": 8.621586475942784e-06, + "loss": 0.7041, + "step": 663 + }, + { + "epoch": 0.25917252146760345, + "grad_norm": 0.6205305842577415, + "learning_rate": 8.634590377113135e-06, + "loss": 0.6611, + "step": 664 + }, + { + "epoch": 0.25956284153005466, + "grad_norm": 0.7218937820968184, + "learning_rate": 8.647594278283485e-06, + "loss": 0.667, + "step": 665 + }, + { + "epoch": 0.25995316159250587, + "grad_norm": 0.764913576264976, + "learning_rate": 8.660598179453838e-06, + "loss": 0.6644, + "step": 666 + }, + { + "epoch": 0.2603434816549571, + "grad_norm": 0.7181828582830394, + "learning_rate": 8.673602080624189e-06, + "loss": 0.6718, + "step": 667 + }, + { + "epoch": 0.2607338017174083, + "grad_norm": 0.7302278232141141, + "learning_rate": 8.686605981794538e-06, + "loss": 0.6897, + "step": 668 + }, + { + "epoch": 0.2611241217798595, + "grad_norm": 0.8461754707338556, + "learning_rate": 8.69960988296489e-06, + "loss": 0.6658, + "step": 669 + }, + { + "epoch": 0.2615144418423107, + "grad_norm": 0.9274647480804067, + "learning_rate": 8.712613784135241e-06, + "loss": 0.6472, + "step": 670 + }, + { + "epoch": 0.2619047619047619, + "grad_norm": 0.7974520800732972, + "learning_rate": 8.725617685305592e-06, + "loss": 0.6553, + "step": 671 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 1.1236754848584405, + "learning_rate": 8.738621586475943e-06, + "loss": 0.6909, + "step": 672 + }, + { + "epoch": 0.26268540202966434, + "grad_norm": 1.0884915367727503, + "learning_rate": 8.751625487646295e-06, + "loss": 0.6844, + "step": 673 + }, + { + "epoch": 0.26307572209211555, + "grad_norm": 0.8083812051530752, + "learning_rate": 8.764629388816646e-06, + "loss": 0.6706, + "step": 674 + }, + { + "epoch": 0.26346604215456676, + "grad_norm": 1.0840313940527262, + "learning_rate": 8.777633289986997e-06, + "loss": 0.6579, + "step": 675 + }, + { + "epoch": 0.26385636221701797, + "grad_norm": 1.2234315984058606, + "learning_rate": 8.790637191157347e-06, + "loss": 0.7061, + "step": 676 + }, + { + "epoch": 0.2642466822794692, + "grad_norm": 0.7994595393657027, + "learning_rate": 8.8036410923277e-06, + "loss": 0.7021, + "step": 677 + }, + { + "epoch": 0.2646370023419204, + "grad_norm": 0.9066863551396919, + "learning_rate": 8.81664499349805e-06, + "loss": 0.6658, + "step": 678 + }, + { + "epoch": 0.2650273224043716, + "grad_norm": 1.1388395385788173, + "learning_rate": 8.8296488946684e-06, + "loss": 0.6818, + "step": 679 + }, + { + "epoch": 0.2654176424668228, + "grad_norm": 0.8410528383822053, + "learning_rate": 8.842652795838752e-06, + "loss": 0.6879, + "step": 680 + }, + { + "epoch": 0.265807962529274, + "grad_norm": 0.8803951080146859, + "learning_rate": 8.855656697009103e-06, + "loss": 0.6972, + "step": 681 + }, + { + "epoch": 0.26619828259172523, + "grad_norm": 1.0723513306286374, + "learning_rate": 8.868660598179456e-06, + "loss": 0.6626, + "step": 682 + }, + { + "epoch": 0.26658860265417644, + "grad_norm": 0.620861601331339, + "learning_rate": 8.881664499349805e-06, + "loss": 0.6808, + "step": 683 + }, + { + "epoch": 0.26697892271662765, + "grad_norm": 0.9974555310831961, + "learning_rate": 8.894668400520156e-06, + "loss": 0.6555, + "step": 684 + }, + { + "epoch": 0.26736924277907886, + "grad_norm": 0.8036070660452134, + "learning_rate": 8.907672301690508e-06, + "loss": 0.6844, + "step": 685 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.8161201318662384, + "learning_rate": 8.920676202860859e-06, + "loss": 0.6655, + "step": 686 + }, + { + "epoch": 0.2681498829039813, + "grad_norm": 0.843969866892246, + "learning_rate": 8.93368010403121e-06, + "loss": 0.6844, + "step": 687 + }, + { + "epoch": 0.2685402029664325, + "grad_norm": 0.6885819236397781, + "learning_rate": 8.946684005201561e-06, + "loss": 0.6428, + "step": 688 + }, + { + "epoch": 0.2689305230288837, + "grad_norm": 0.8540782213941597, + "learning_rate": 8.959687906371913e-06, + "loss": 0.6577, + "step": 689 + }, + { + "epoch": 0.2693208430913349, + "grad_norm": 0.7987465248921904, + "learning_rate": 8.972691807542264e-06, + "loss": 0.6878, + "step": 690 + }, + { + "epoch": 0.2697111631537861, + "grad_norm": 0.8167900358057786, + "learning_rate": 8.985695708712613e-06, + "loss": 0.682, + "step": 691 + }, + { + "epoch": 0.27010148321623734, + "grad_norm": 0.9419094579017145, + "learning_rate": 8.998699609882966e-06, + "loss": 0.6834, + "step": 692 + }, + { + "epoch": 0.27049180327868855, + "grad_norm": 0.942785962002652, + "learning_rate": 9.011703511053318e-06, + "loss": 0.672, + "step": 693 + }, + { + "epoch": 0.27088212334113976, + "grad_norm": 0.7789571039584774, + "learning_rate": 9.024707412223667e-06, + "loss": 0.6723, + "step": 694 + }, + { + "epoch": 0.27127244340359097, + "grad_norm": 0.7383432384141249, + "learning_rate": 9.037711313394018e-06, + "loss": 0.7011, + "step": 695 + }, + { + "epoch": 0.2716627634660422, + "grad_norm": 0.8028314552493424, + "learning_rate": 9.05071521456437e-06, + "loss": 0.6522, + "step": 696 + }, + { + "epoch": 0.2720530835284934, + "grad_norm": 0.7488642417751655, + "learning_rate": 9.06371911573472e-06, + "loss": 0.6812, + "step": 697 + }, + { + "epoch": 0.2724434035909446, + "grad_norm": 0.8018467685438169, + "learning_rate": 9.076723016905072e-06, + "loss": 0.6821, + "step": 698 + }, + { + "epoch": 0.2728337236533958, + "grad_norm": 0.7618321276042095, + "learning_rate": 9.089726918075423e-06, + "loss": 0.6523, + "step": 699 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.7675180584634727, + "learning_rate": 9.102730819245774e-06, + "loss": 0.6691, + "step": 700 + }, + { + "epoch": 0.27361436377829823, + "grad_norm": 0.7276266460441546, + "learning_rate": 9.115734720416126e-06, + "loss": 0.6773, + "step": 701 + }, + { + "epoch": 0.27400468384074944, + "grad_norm": 0.6845719210252115, + "learning_rate": 9.128738621586477e-06, + "loss": 0.6648, + "step": 702 + }, + { + "epoch": 0.27439500390320065, + "grad_norm": 0.703863384661785, + "learning_rate": 9.141742522756828e-06, + "loss": 0.6933, + "step": 703 + }, + { + "epoch": 0.27478532396565186, + "grad_norm": 0.6720235214848851, + "learning_rate": 9.15474642392718e-06, + "loss": 0.6972, + "step": 704 + }, + { + "epoch": 0.275175644028103, + "grad_norm": 0.8346772672611394, + "learning_rate": 9.167750325097529e-06, + "loss": 0.6581, + "step": 705 + }, + { + "epoch": 0.2755659640905542, + "grad_norm": 0.6900995063843406, + "learning_rate": 9.18075422626788e-06, + "loss": 0.6795, + "step": 706 + }, + { + "epoch": 0.27595628415300544, + "grad_norm": 0.7765018319357422, + "learning_rate": 9.193758127438233e-06, + "loss": 0.6355, + "step": 707 + }, + { + "epoch": 0.27634660421545665, + "grad_norm": 0.7034132568796836, + "learning_rate": 9.206762028608584e-06, + "loss": 0.7211, + "step": 708 + }, + { + "epoch": 0.27673692427790786, + "grad_norm": 0.7544968078292716, + "learning_rate": 9.219765929778934e-06, + "loss": 0.6881, + "step": 709 + }, + { + "epoch": 0.27712724434035907, + "grad_norm": 0.763556466748633, + "learning_rate": 9.232769830949285e-06, + "loss": 0.6824, + "step": 710 + }, + { + "epoch": 0.2775175644028103, + "grad_norm": 0.8745268517604093, + "learning_rate": 9.245773732119636e-06, + "loss": 0.6684, + "step": 711 + }, + { + "epoch": 0.2779078844652615, + "grad_norm": 0.7387298163328265, + "learning_rate": 9.258777633289987e-06, + "loss": 0.6631, + "step": 712 + }, + { + "epoch": 0.2782982045277127, + "grad_norm": 0.8562093932538155, + "learning_rate": 9.271781534460339e-06, + "loss": 0.6653, + "step": 713 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.7652671988471709, + "learning_rate": 9.28478543563069e-06, + "loss": 0.6728, + "step": 714 + }, + { + "epoch": 0.2790788446526151, + "grad_norm": 0.8069309075478169, + "learning_rate": 9.297789336801041e-06, + "loss": 0.6392, + "step": 715 + }, + { + "epoch": 0.27946916471506633, + "grad_norm": 0.8319395734357022, + "learning_rate": 9.310793237971392e-06, + "loss": 0.663, + "step": 716 + }, + { + "epoch": 0.27985948477751754, + "grad_norm": 0.7954992441680032, + "learning_rate": 9.323797139141744e-06, + "loss": 0.674, + "step": 717 + }, + { + "epoch": 0.28024980483996875, + "grad_norm": 0.9110136026633773, + "learning_rate": 9.336801040312095e-06, + "loss": 0.6584, + "step": 718 + }, + { + "epoch": 0.28064012490241996, + "grad_norm": 0.7756842708934657, + "learning_rate": 9.349804941482446e-06, + "loss": 0.7066, + "step": 719 + }, + { + "epoch": 0.2810304449648712, + "grad_norm": 0.8328085193883903, + "learning_rate": 9.362808842652796e-06, + "loss": 0.656, + "step": 720 + }, + { + "epoch": 0.2814207650273224, + "grad_norm": 0.9308448720752176, + "learning_rate": 9.375812743823147e-06, + "loss": 0.6457, + "step": 721 + }, + { + "epoch": 0.2818110850897736, + "grad_norm": 0.8957826646723507, + "learning_rate": 9.3888166449935e-06, + "loss": 0.6583, + "step": 722 + }, + { + "epoch": 0.2822014051522248, + "grad_norm": 0.7521836784198118, + "learning_rate": 9.40182054616385e-06, + "loss": 0.6408, + "step": 723 + }, + { + "epoch": 0.282591725214676, + "grad_norm": 0.8403097006244593, + "learning_rate": 9.4148244473342e-06, + "loss": 0.6834, + "step": 724 + }, + { + "epoch": 0.2829820452771272, + "grad_norm": 0.7391340426222961, + "learning_rate": 9.427828348504552e-06, + "loss": 0.6657, + "step": 725 + }, + { + "epoch": 0.28337236533957844, + "grad_norm": 0.7778288718804628, + "learning_rate": 9.440832249674903e-06, + "loss": 0.7108, + "step": 726 + }, + { + "epoch": 0.28376268540202965, + "grad_norm": 0.8262945054416448, + "learning_rate": 9.453836150845254e-06, + "loss": 0.6842, + "step": 727 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.7038967276252367, + "learning_rate": 9.466840052015605e-06, + "loss": 0.6849, + "step": 728 + }, + { + "epoch": 0.28454332552693207, + "grad_norm": 0.9899538037334142, + "learning_rate": 9.479843953185957e-06, + "loss": 0.6986, + "step": 729 + }, + { + "epoch": 0.2849336455893833, + "grad_norm": 0.7780627667063574, + "learning_rate": 9.492847854356308e-06, + "loss": 0.662, + "step": 730 + }, + { + "epoch": 0.2853239656518345, + "grad_norm": 0.7172320038847236, + "learning_rate": 9.505851755526657e-06, + "loss": 0.6896, + "step": 731 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.7896422219213572, + "learning_rate": 9.51885565669701e-06, + "loss": 0.6702, + "step": 732 + }, + { + "epoch": 0.2861046057767369, + "grad_norm": 0.7327105451129098, + "learning_rate": 9.531859557867362e-06, + "loss": 0.6662, + "step": 733 + }, + { + "epoch": 0.2864949258391881, + "grad_norm": 0.8262219841844781, + "learning_rate": 9.544863459037713e-06, + "loss": 0.6303, + "step": 734 + }, + { + "epoch": 0.28688524590163933, + "grad_norm": 0.850687136221709, + "learning_rate": 9.557867360208062e-06, + "loss": 0.6591, + "step": 735 + }, + { + "epoch": 0.28727556596409054, + "grad_norm": 0.8096919711109073, + "learning_rate": 9.570871261378414e-06, + "loss": 0.6448, + "step": 736 + }, + { + "epoch": 0.28766588602654175, + "grad_norm": 0.8126069097299119, + "learning_rate": 9.583875162548767e-06, + "loss": 0.6613, + "step": 737 + }, + { + "epoch": 0.28805620608899296, + "grad_norm": 0.8786589357711668, + "learning_rate": 9.596879063719116e-06, + "loss": 0.6703, + "step": 738 + }, + { + "epoch": 0.28844652615144417, + "grad_norm": 0.6949444990987567, + "learning_rate": 9.609882964889467e-06, + "loss": 0.6669, + "step": 739 + }, + { + "epoch": 0.2888368462138954, + "grad_norm": 0.9588727721661081, + "learning_rate": 9.622886866059819e-06, + "loss": 0.7049, + "step": 740 + }, + { + "epoch": 0.2892271662763466, + "grad_norm": 0.6783852643655878, + "learning_rate": 9.63589076723017e-06, + "loss": 0.6496, + "step": 741 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.791266286394137, + "learning_rate": 9.648894668400521e-06, + "loss": 0.6577, + "step": 742 + }, + { + "epoch": 0.290007806401249, + "grad_norm": 0.7061896974404378, + "learning_rate": 9.661898569570872e-06, + "loss": 0.6591, + "step": 743 + }, + { + "epoch": 0.2903981264637002, + "grad_norm": 0.7542703540067072, + "learning_rate": 9.674902470741223e-06, + "loss": 0.6656, + "step": 744 + }, + { + "epoch": 0.29078844652615143, + "grad_norm": 0.7321907998135061, + "learning_rate": 9.687906371911575e-06, + "loss": 0.6733, + "step": 745 + }, + { + "epoch": 0.29117876658860264, + "grad_norm": 0.7797109473465602, + "learning_rate": 9.700910273081924e-06, + "loss": 0.6711, + "step": 746 + }, + { + "epoch": 0.29156908665105385, + "grad_norm": 0.8511820133456166, + "learning_rate": 9.713914174252277e-06, + "loss": 0.7042, + "step": 747 + }, + { + "epoch": 0.29195940671350507, + "grad_norm": 0.7423592214316685, + "learning_rate": 9.726918075422628e-06, + "loss": 0.6951, + "step": 748 + }, + { + "epoch": 0.2923497267759563, + "grad_norm": 0.8542924450234887, + "learning_rate": 9.739921976592978e-06, + "loss": 0.6638, + "step": 749 + }, + { + "epoch": 0.2927400468384075, + "grad_norm": 0.7894658716616224, + "learning_rate": 9.752925877763329e-06, + "loss": 0.6802, + "step": 750 + }, + { + "epoch": 0.2931303669008587, + "grad_norm": 0.8198827533291279, + "learning_rate": 9.76592977893368e-06, + "loss": 0.6525, + "step": 751 + }, + { + "epoch": 0.2935206869633099, + "grad_norm": 0.960930075618875, + "learning_rate": 9.778933680104032e-06, + "loss": 0.7057, + "step": 752 + }, + { + "epoch": 0.2939110070257611, + "grad_norm": 0.7303015585380406, + "learning_rate": 9.791937581274383e-06, + "loss": 0.6679, + "step": 753 + }, + { + "epoch": 0.29430132708821233, + "grad_norm": 0.9099989734770564, + "learning_rate": 9.804941482444734e-06, + "loss": 0.6569, + "step": 754 + }, + { + "epoch": 0.29469164715066354, + "grad_norm": 0.6545903147326112, + "learning_rate": 9.817945383615085e-06, + "loss": 0.6595, + "step": 755 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.8452470263610842, + "learning_rate": 9.830949284785437e-06, + "loss": 0.6678, + "step": 756 + }, + { + "epoch": 0.29547228727556596, + "grad_norm": 0.6800792680302047, + "learning_rate": 9.843953185955788e-06, + "loss": 0.6438, + "step": 757 + }, + { + "epoch": 0.29586260733801717, + "grad_norm": 0.7960083734464294, + "learning_rate": 9.856957087126139e-06, + "loss": 0.6797, + "step": 758 + }, + { + "epoch": 0.2962529274004684, + "grad_norm": 0.7802263816492322, + "learning_rate": 9.86996098829649e-06, + "loss": 0.704, + "step": 759 + }, + { + "epoch": 0.2966432474629196, + "grad_norm": 0.737302177621471, + "learning_rate": 9.882964889466841e-06, + "loss": 0.6961, + "step": 760 + }, + { + "epoch": 0.2970335675253708, + "grad_norm": 0.8599313100285872, + "learning_rate": 9.895968790637191e-06, + "loss": 0.652, + "step": 761 + }, + { + "epoch": 0.297423887587822, + "grad_norm": 0.7656742911473399, + "learning_rate": 9.908972691807544e-06, + "loss": 0.6844, + "step": 762 + }, + { + "epoch": 0.2978142076502732, + "grad_norm": 1.0131583034534577, + "learning_rate": 9.921976592977895e-06, + "loss": 0.6735, + "step": 763 + }, + { + "epoch": 0.29820452771272443, + "grad_norm": 0.7544054825432824, + "learning_rate": 9.934980494148245e-06, + "loss": 0.6794, + "step": 764 + }, + { + "epoch": 0.29859484777517564, + "grad_norm": 0.8157810050886745, + "learning_rate": 9.947984395318596e-06, + "loss": 0.6978, + "step": 765 + }, + { + "epoch": 0.29898516783762685, + "grad_norm": 0.9090448333900956, + "learning_rate": 9.960988296488947e-06, + "loss": 0.6509, + "step": 766 + }, + { + "epoch": 0.29937548790007806, + "grad_norm": 0.8213023541244195, + "learning_rate": 9.973992197659298e-06, + "loss": 0.6665, + "step": 767 + }, + { + "epoch": 0.2997658079625293, + "grad_norm": 0.6996692984331491, + "learning_rate": 9.98699609882965e-06, + "loss": 0.6505, + "step": 768 + }, + { + "epoch": 0.3001561280249805, + "grad_norm": 0.838776413467148, + "learning_rate": 1e-05, + "loss": 0.6973, + "step": 769 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.7228428851279085, + "learning_rate": 9.999999484291612e-06, + "loss": 0.6768, + "step": 770 + }, + { + "epoch": 0.3009367681498829, + "grad_norm": 0.7463945683258781, + "learning_rate": 9.999997937166554e-06, + "loss": 0.6689, + "step": 771 + }, + { + "epoch": 0.3013270882123341, + "grad_norm": 0.8511500999599148, + "learning_rate": 9.999995358625148e-06, + "loss": 0.6897, + "step": 772 + }, + { + "epoch": 0.3017174082747853, + "grad_norm": 0.6855159065450326, + "learning_rate": 9.99999174866792e-06, + "loss": 0.6757, + "step": 773 + }, + { + "epoch": 0.30210772833723654, + "grad_norm": 0.8924752786650155, + "learning_rate": 9.99998710729562e-06, + "loss": 0.6865, + "step": 774 + }, + { + "epoch": 0.30249804839968775, + "grad_norm": 0.8933224280691139, + "learning_rate": 9.999981434509205e-06, + "loss": 0.6665, + "step": 775 + }, + { + "epoch": 0.30288836846213896, + "grad_norm": 0.7572820549284284, + "learning_rate": 9.999974730309842e-06, + "loss": 0.6703, + "step": 776 + }, + { + "epoch": 0.30327868852459017, + "grad_norm": 0.6856555170564465, + "learning_rate": 9.999966994698916e-06, + "loss": 0.6931, + "step": 777 + }, + { + "epoch": 0.3036690085870414, + "grad_norm": 0.8576192349999363, + "learning_rate": 9.999958227678022e-06, + "loss": 0.6797, + "step": 778 + }, + { + "epoch": 0.3040593286494926, + "grad_norm": 0.7654476448659187, + "learning_rate": 9.99994842924897e-06, + "loss": 0.6489, + "step": 779 + }, + { + "epoch": 0.3044496487119438, + "grad_norm": 0.7006547997072806, + "learning_rate": 9.99993759941378e-06, + "loss": 0.624, + "step": 780 + }, + { + "epoch": 0.304839968774395, + "grad_norm": 0.8178937038265984, + "learning_rate": 9.999925738174686e-06, + "loss": 0.6945, + "step": 781 + }, + { + "epoch": 0.3052302888368462, + "grad_norm": 0.6512563684589656, + "learning_rate": 9.999912845534134e-06, + "loss": 0.6766, + "step": 782 + }, + { + "epoch": 0.30562060889929743, + "grad_norm": 0.7452144285457596, + "learning_rate": 9.999898921494785e-06, + "loss": 0.6608, + "step": 783 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.8884379132747806, + "learning_rate": 9.999883966059512e-06, + "loss": 0.6959, + "step": 784 + }, + { + "epoch": 0.30640124902419985, + "grad_norm": 0.7510154813958844, + "learning_rate": 9.999867979231399e-06, + "loss": 0.6708, + "step": 785 + }, + { + "epoch": 0.30679156908665106, + "grad_norm": 0.6972664855793584, + "learning_rate": 9.999850961013743e-06, + "loss": 0.6637, + "step": 786 + }, + { + "epoch": 0.30718188914910227, + "grad_norm": 0.8576522188438295, + "learning_rate": 9.999832911410056e-06, + "loss": 0.6601, + "step": 787 + }, + { + "epoch": 0.3075722092115535, + "grad_norm": 0.8697329252074613, + "learning_rate": 9.99981383042406e-06, + "loss": 0.6721, + "step": 788 + }, + { + "epoch": 0.3079625292740047, + "grad_norm": 0.6555605596986726, + "learning_rate": 9.999793718059692e-06, + "loss": 0.6586, + "step": 789 + }, + { + "epoch": 0.3083528493364559, + "grad_norm": 0.7214953542864366, + "learning_rate": 9.9997725743211e-06, + "loss": 0.6666, + "step": 790 + }, + { + "epoch": 0.3087431693989071, + "grad_norm": 0.9285311177124776, + "learning_rate": 9.999750399212647e-06, + "loss": 0.6665, + "step": 791 + }, + { + "epoch": 0.3091334894613583, + "grad_norm": 0.8875517468347851, + "learning_rate": 9.999727192738907e-06, + "loss": 0.6694, + "step": 792 + }, + { + "epoch": 0.30952380952380953, + "grad_norm": 0.7807575050194152, + "learning_rate": 9.999702954904667e-06, + "loss": 0.6735, + "step": 793 + }, + { + "epoch": 0.30991412958626074, + "grad_norm": 0.9319924832472173, + "learning_rate": 9.999677685714925e-06, + "loss": 0.6782, + "step": 794 + }, + { + "epoch": 0.31030444964871196, + "grad_norm": 0.8782762679298809, + "learning_rate": 9.999651385174895e-06, + "loss": 0.6862, + "step": 795 + }, + { + "epoch": 0.31069476971116317, + "grad_norm": 0.762956115594051, + "learning_rate": 9.999624053290003e-06, + "loss": 0.6459, + "step": 796 + }, + { + "epoch": 0.3110850897736144, + "grad_norm": 0.8671999325254464, + "learning_rate": 9.999595690065887e-06, + "loss": 0.6814, + "step": 797 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.7166236764627013, + "learning_rate": 9.999566295508398e-06, + "loss": 0.6805, + "step": 798 + }, + { + "epoch": 0.3118657298985168, + "grad_norm": 0.7575716127705618, + "learning_rate": 9.999535869623598e-06, + "loss": 0.6735, + "step": 799 + }, + { + "epoch": 0.312256049960968, + "grad_norm": 0.7113670727616235, + "learning_rate": 9.999504412417764e-06, + "loss": 0.6651, + "step": 800 + }, + { + "epoch": 0.3126463700234192, + "grad_norm": 0.7292364782556201, + "learning_rate": 9.999471923897386e-06, + "loss": 0.6556, + "step": 801 + }, + { + "epoch": 0.31303669008587043, + "grad_norm": 0.7004928905812525, + "learning_rate": 9.999438404069166e-06, + "loss": 0.6463, + "step": 802 + }, + { + "epoch": 0.31342701014832164, + "grad_norm": 0.6312423117334854, + "learning_rate": 9.999403852940019e-06, + "loss": 0.6817, + "step": 803 + }, + { + "epoch": 0.31381733021077285, + "grad_norm": 0.8268699767520921, + "learning_rate": 9.999368270517069e-06, + "loss": 0.6857, + "step": 804 + }, + { + "epoch": 0.31420765027322406, + "grad_norm": 0.6679121571788984, + "learning_rate": 9.999331656807661e-06, + "loss": 0.6555, + "step": 805 + }, + { + "epoch": 0.31459797033567527, + "grad_norm": 0.7041821979140221, + "learning_rate": 9.999294011819343e-06, + "loss": 0.6742, + "step": 806 + }, + { + "epoch": 0.3149882903981265, + "grad_norm": 0.7279324186055312, + "learning_rate": 9.999255335559882e-06, + "loss": 0.6537, + "step": 807 + }, + { + "epoch": 0.3153786104605777, + "grad_norm": 0.8236091834965913, + "learning_rate": 9.99921562803726e-06, + "loss": 0.6898, + "step": 808 + }, + { + "epoch": 0.3157689305230289, + "grad_norm": 0.7228308464496336, + "learning_rate": 9.999174889259662e-06, + "loss": 0.6777, + "step": 809 + }, + { + "epoch": 0.3161592505854801, + "grad_norm": 0.9153101519580048, + "learning_rate": 9.999133119235496e-06, + "loss": 0.6393, + "step": 810 + }, + { + "epoch": 0.3165495706479313, + "grad_norm": 0.7777809545782576, + "learning_rate": 9.999090317973376e-06, + "loss": 0.6686, + "step": 811 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.8349413950968604, + "learning_rate": 9.999046485482134e-06, + "loss": 0.6481, + "step": 812 + }, + { + "epoch": 0.31733021077283374, + "grad_norm": 0.8414666078705616, + "learning_rate": 9.99900162177081e-06, + "loss": 0.6578, + "step": 813 + }, + { + "epoch": 0.31772053083528495, + "grad_norm": 0.6429260258254902, + "learning_rate": 9.998955726848658e-06, + "loss": 0.7088, + "step": 814 + }, + { + "epoch": 0.31811085089773616, + "grad_norm": 0.737496527203629, + "learning_rate": 9.998908800725146e-06, + "loss": 0.67, + "step": 815 + }, + { + "epoch": 0.3185011709601874, + "grad_norm": 0.8581969405111726, + "learning_rate": 9.998860843409954e-06, + "loss": 0.6662, + "step": 816 + }, + { + "epoch": 0.3188914910226386, + "grad_norm": 0.7099424006824423, + "learning_rate": 9.998811854912978e-06, + "loss": 0.6387, + "step": 817 + }, + { + "epoch": 0.3192818110850898, + "grad_norm": 0.8437594605397141, + "learning_rate": 9.998761835244317e-06, + "loss": 0.6452, + "step": 818 + }, + { + "epoch": 0.319672131147541, + "grad_norm": 0.8869333583983848, + "learning_rate": 9.998710784414295e-06, + "loss": 0.6734, + "step": 819 + }, + { + "epoch": 0.3200624512099922, + "grad_norm": 0.7797738278527432, + "learning_rate": 9.99865870243344e-06, + "loss": 0.651, + "step": 820 + }, + { + "epoch": 0.3204527712724434, + "grad_norm": 0.795623907382445, + "learning_rate": 9.998605589312499e-06, + "loss": 0.6632, + "step": 821 + }, + { + "epoch": 0.32084309133489464, + "grad_norm": 0.8532802214391453, + "learning_rate": 9.99855144506242e-06, + "loss": 0.6865, + "step": 822 + }, + { + "epoch": 0.32123341139734585, + "grad_norm": 0.6815359645800407, + "learning_rate": 9.998496269694382e-06, + "loss": 0.6509, + "step": 823 + }, + { + "epoch": 0.32162373145979706, + "grad_norm": 0.8327546656849829, + "learning_rate": 9.998440063219763e-06, + "loss": 0.6578, + "step": 824 + }, + { + "epoch": 0.32201405152224827, + "grad_norm": 0.6513004183763217, + "learning_rate": 9.998382825650156e-06, + "loss": 0.6665, + "step": 825 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.6783661763347043, + "learning_rate": 9.99832455699737e-06, + "loss": 0.6628, + "step": 826 + }, + { + "epoch": 0.3227946916471507, + "grad_norm": 0.7669504213109285, + "learning_rate": 9.998265257273423e-06, + "loss": 0.6561, + "step": 827 + }, + { + "epoch": 0.3231850117096019, + "grad_norm": 0.7070216972139682, + "learning_rate": 9.99820492649055e-06, + "loss": 0.6332, + "step": 828 + }, + { + "epoch": 0.3235753317720531, + "grad_norm": 0.6924888523114988, + "learning_rate": 9.998143564661191e-06, + "loss": 0.6751, + "step": 829 + }, + { + "epoch": 0.3239656518345043, + "grad_norm": 0.7622790613250262, + "learning_rate": 9.99808117179801e-06, + "loss": 0.6429, + "step": 830 + }, + { + "epoch": 0.32435597189695553, + "grad_norm": 0.8079610681102074, + "learning_rate": 9.998017747913878e-06, + "loss": 0.6874, + "step": 831 + }, + { + "epoch": 0.32474629195940674, + "grad_norm": 0.6352831150540168, + "learning_rate": 9.997953293021874e-06, + "loss": 0.6918, + "step": 832 + }, + { + "epoch": 0.3251366120218579, + "grad_norm": 0.8691382198016786, + "learning_rate": 9.997887807135294e-06, + "loss": 0.6523, + "step": 833 + }, + { + "epoch": 0.3255269320843091, + "grad_norm": 0.6607141396676791, + "learning_rate": 9.997821290267649e-06, + "loss": 0.7115, + "step": 834 + }, + { + "epoch": 0.3259172521467603, + "grad_norm": 0.8554932081705774, + "learning_rate": 9.997753742432661e-06, + "loss": 0.6763, + "step": 835 + }, + { + "epoch": 0.3263075722092115, + "grad_norm": 0.6605188335776898, + "learning_rate": 9.997685163644261e-06, + "loss": 0.6621, + "step": 836 + }, + { + "epoch": 0.32669789227166274, + "grad_norm": 0.7728734225689453, + "learning_rate": 9.9976155539166e-06, + "loss": 0.6236, + "step": 837 + }, + { + "epoch": 0.32708821233411395, + "grad_norm": 0.6841663014337549, + "learning_rate": 9.997544913264033e-06, + "loss": 0.6523, + "step": 838 + }, + { + "epoch": 0.32747853239656516, + "grad_norm": 0.7867020973587643, + "learning_rate": 9.997473241701133e-06, + "loss": 0.6732, + "step": 839 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.6953692971271762, + "learning_rate": 9.997400539242684e-06, + "loss": 0.6672, + "step": 840 + }, + { + "epoch": 0.3282591725214676, + "grad_norm": 0.7675140601020735, + "learning_rate": 9.997326805903686e-06, + "loss": 0.6383, + "step": 841 + }, + { + "epoch": 0.3286494925839188, + "grad_norm": 0.8412024300092079, + "learning_rate": 9.997252041699348e-06, + "loss": 0.6645, + "step": 842 + }, + { + "epoch": 0.32903981264637, + "grad_norm": 0.7080982793592954, + "learning_rate": 9.997176246645092e-06, + "loss": 0.6414, + "step": 843 + }, + { + "epoch": 0.3294301327088212, + "grad_norm": 0.7676288822946971, + "learning_rate": 9.997099420756552e-06, + "loss": 0.694, + "step": 844 + }, + { + "epoch": 0.3298204527712724, + "grad_norm": 0.7326025894545982, + "learning_rate": 9.997021564049579e-06, + "loss": 0.6835, + "step": 845 + }, + { + "epoch": 0.33021077283372363, + "grad_norm": 0.6440940844622942, + "learning_rate": 9.996942676540232e-06, + "loss": 0.6732, + "step": 846 + }, + { + "epoch": 0.33060109289617484, + "grad_norm": 0.6697736468480807, + "learning_rate": 9.996862758244784e-06, + "loss": 0.649, + "step": 847 + }, + { + "epoch": 0.33099141295862605, + "grad_norm": 0.6787820883096058, + "learning_rate": 9.99678180917972e-06, + "loss": 0.6746, + "step": 848 + }, + { + "epoch": 0.33138173302107726, + "grad_norm": 0.6752998736971415, + "learning_rate": 9.996699829361739e-06, + "loss": 0.6608, + "step": 849 + }, + { + "epoch": 0.3317720530835285, + "grad_norm": 0.69109453702427, + "learning_rate": 9.996616818807752e-06, + "loss": 0.6427, + "step": 850 + }, + { + "epoch": 0.3321623731459797, + "grad_norm": 0.632415369040589, + "learning_rate": 9.996532777534884e-06, + "loss": 0.6536, + "step": 851 + }, + { + "epoch": 0.3325526932084309, + "grad_norm": 0.7518533904716654, + "learning_rate": 9.996447705560471e-06, + "loss": 0.6911, + "step": 852 + }, + { + "epoch": 0.3329430132708821, + "grad_norm": 0.6990914281844794, + "learning_rate": 9.99636160290206e-06, + "loss": 0.6977, + "step": 853 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7481810550190204, + "learning_rate": 9.996274469577414e-06, + "loss": 0.645, + "step": 854 + }, + { + "epoch": 0.3337236533957845, + "grad_norm": 0.7268578205331664, + "learning_rate": 9.996186305604508e-06, + "loss": 0.658, + "step": 855 + }, + { + "epoch": 0.33411397345823574, + "grad_norm": 0.8910211998359182, + "learning_rate": 9.996097111001526e-06, + "loss": 0.6757, + "step": 856 + }, + { + "epoch": 0.33450429352068695, + "grad_norm": 0.6822139457604639, + "learning_rate": 9.99600688578687e-06, + "loss": 0.6559, + "step": 857 + }, + { + "epoch": 0.33489461358313816, + "grad_norm": 0.8450960188441312, + "learning_rate": 9.995915629979152e-06, + "loss": 0.6232, + "step": 858 + }, + { + "epoch": 0.33528493364558937, + "grad_norm": 0.8634073217467227, + "learning_rate": 9.995823343597194e-06, + "loss": 0.678, + "step": 859 + }, + { + "epoch": 0.3356752537080406, + "grad_norm": 0.7671342430336847, + "learning_rate": 9.995730026660035e-06, + "loss": 0.6783, + "step": 860 + }, + { + "epoch": 0.3360655737704918, + "grad_norm": 0.9058264970466078, + "learning_rate": 9.995635679186925e-06, + "loss": 0.6376, + "step": 861 + }, + { + "epoch": 0.336455893832943, + "grad_norm": 0.8580973051732236, + "learning_rate": 9.995540301197325e-06, + "loss": 0.7033, + "step": 862 + }, + { + "epoch": 0.3368462138953942, + "grad_norm": 0.8546291941423848, + "learning_rate": 9.995443892710912e-06, + "loss": 0.7137, + "step": 863 + }, + { + "epoch": 0.3372365339578454, + "grad_norm": 0.7281557985260477, + "learning_rate": 9.995346453747572e-06, + "loss": 0.7094, + "step": 864 + }, + { + "epoch": 0.33762685402029663, + "grad_norm": 0.814653498236247, + "learning_rate": 9.995247984327405e-06, + "loss": 0.6861, + "step": 865 + }, + { + "epoch": 0.33801717408274784, + "grad_norm": 0.6516486981939469, + "learning_rate": 9.995148484470723e-06, + "loss": 0.6724, + "step": 866 + }, + { + "epoch": 0.33840749414519905, + "grad_norm": 0.6591392026246321, + "learning_rate": 9.995047954198053e-06, + "loss": 0.674, + "step": 867 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.7920427447713971, + "learning_rate": 9.994946393530132e-06, + "loss": 0.6894, + "step": 868 + }, + { + "epoch": 0.33918813427010147, + "grad_norm": 0.723138240050923, + "learning_rate": 9.994843802487909e-06, + "loss": 0.6594, + "step": 869 + }, + { + "epoch": 0.3395784543325527, + "grad_norm": 0.7639758930597941, + "learning_rate": 9.994740181092548e-06, + "loss": 0.6848, + "step": 870 + }, + { + "epoch": 0.3399687743950039, + "grad_norm": 0.902114721352076, + "learning_rate": 9.994635529365424e-06, + "loss": 0.6887, + "step": 871 + }, + { + "epoch": 0.3403590944574551, + "grad_norm": 0.9174043681118116, + "learning_rate": 9.994529847328125e-06, + "loss": 0.7082, + "step": 872 + }, + { + "epoch": 0.3407494145199063, + "grad_norm": 0.7216666387729243, + "learning_rate": 9.994423135002452e-06, + "loss": 0.6732, + "step": 873 + }, + { + "epoch": 0.3411397345823575, + "grad_norm": 1.0065587198053019, + "learning_rate": 9.994315392410418e-06, + "loss": 0.6353, + "step": 874 + }, + { + "epoch": 0.34153005464480873, + "grad_norm": 0.9477974186452182, + "learning_rate": 9.994206619574246e-06, + "loss": 0.6624, + "step": 875 + }, + { + "epoch": 0.34192037470725994, + "grad_norm": 0.6781719395986392, + "learning_rate": 9.994096816516378e-06, + "loss": 0.6572, + "step": 876 + }, + { + "epoch": 0.34231069476971115, + "grad_norm": 0.7476325522587134, + "learning_rate": 9.993985983259462e-06, + "loss": 0.658, + "step": 877 + }, + { + "epoch": 0.34270101483216237, + "grad_norm": 0.836031432407664, + "learning_rate": 9.993874119826361e-06, + "loss": 0.6536, + "step": 878 + }, + { + "epoch": 0.3430913348946136, + "grad_norm": 0.6246195446357209, + "learning_rate": 9.993761226240153e-06, + "loss": 0.6477, + "step": 879 + }, + { + "epoch": 0.3434816549570648, + "grad_norm": 0.72410898634099, + "learning_rate": 9.993647302524121e-06, + "loss": 0.6571, + "step": 880 + }, + { + "epoch": 0.343871975019516, + "grad_norm": 0.7003030109553813, + "learning_rate": 9.993532348701773e-06, + "loss": 0.6603, + "step": 881 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 0.6487213421868889, + "learning_rate": 9.993416364796814e-06, + "loss": 0.659, + "step": 882 + }, + { + "epoch": 0.3446526151444184, + "grad_norm": 0.7501956465893225, + "learning_rate": 9.993299350833176e-06, + "loss": 0.6189, + "step": 883 + }, + { + "epoch": 0.3450429352068696, + "grad_norm": 0.6794108239835803, + "learning_rate": 9.993181306834994e-06, + "loss": 0.6345, + "step": 884 + }, + { + "epoch": 0.34543325526932084, + "grad_norm": 0.7055729316933672, + "learning_rate": 9.993062232826618e-06, + "loss": 0.6562, + "step": 885 + }, + { + "epoch": 0.34582357533177205, + "grad_norm": 0.7018554569472613, + "learning_rate": 9.992942128832612e-06, + "loss": 0.6887, + "step": 886 + }, + { + "epoch": 0.34621389539422326, + "grad_norm": 0.711876077199199, + "learning_rate": 9.992820994877752e-06, + "loss": 0.648, + "step": 887 + }, + { + "epoch": 0.34660421545667447, + "grad_norm": 0.7811689619850084, + "learning_rate": 9.992698830987026e-06, + "loss": 0.6916, + "step": 888 + }, + { + "epoch": 0.3469945355191257, + "grad_norm": 0.6431557029250105, + "learning_rate": 9.992575637185633e-06, + "loss": 0.6347, + "step": 889 + }, + { + "epoch": 0.3473848555815769, + "grad_norm": 0.8106965861306187, + "learning_rate": 9.992451413498987e-06, + "loss": 0.6941, + "step": 890 + }, + { + "epoch": 0.3477751756440281, + "grad_norm": 0.7242065987027447, + "learning_rate": 9.992326159952713e-06, + "loss": 0.6336, + "step": 891 + }, + { + "epoch": 0.3481654957064793, + "grad_norm": 0.916890612930302, + "learning_rate": 9.992199876572647e-06, + "loss": 0.6849, + "step": 892 + }, + { + "epoch": 0.3485558157689305, + "grad_norm": 0.7444692946735774, + "learning_rate": 9.992072563384841e-06, + "loss": 0.6697, + "step": 893 + }, + { + "epoch": 0.34894613583138173, + "grad_norm": 0.8352542041043615, + "learning_rate": 9.991944220415558e-06, + "loss": 0.6341, + "step": 894 + }, + { + "epoch": 0.34933645589383294, + "grad_norm": 0.7474813540441411, + "learning_rate": 9.991814847691273e-06, + "loss": 0.6314, + "step": 895 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.8188414019237984, + "learning_rate": 9.99168444523867e-06, + "loss": 0.6443, + "step": 896 + }, + { + "epoch": 0.35011709601873536, + "grad_norm": 0.7261879006416049, + "learning_rate": 9.991553013084654e-06, + "loss": 0.6894, + "step": 897 + }, + { + "epoch": 0.3505074160811866, + "grad_norm": 0.731629332370852, + "learning_rate": 9.991420551256334e-06, + "loss": 0.6743, + "step": 898 + }, + { + "epoch": 0.3508977361436378, + "grad_norm": 0.6614489804762611, + "learning_rate": 9.991287059781036e-06, + "loss": 0.6702, + "step": 899 + }, + { + "epoch": 0.351288056206089, + "grad_norm": 0.7441904283386087, + "learning_rate": 9.991152538686297e-06, + "loss": 0.6827, + "step": 900 + }, + { + "epoch": 0.3516783762685402, + "grad_norm": 0.7523824416980169, + "learning_rate": 9.991016987999867e-06, + "loss": 0.666, + "step": 901 + }, + { + "epoch": 0.3520686963309914, + "grad_norm": 0.5715482006083324, + "learning_rate": 9.990880407749704e-06, + "loss": 0.6833, + "step": 902 + }, + { + "epoch": 0.3524590163934426, + "grad_norm": 0.716055742121867, + "learning_rate": 9.990742797963987e-06, + "loss": 0.6606, + "step": 903 + }, + { + "epoch": 0.35284933645589384, + "grad_norm": 0.7295529091638431, + "learning_rate": 9.9906041586711e-06, + "loss": 0.6612, + "step": 904 + }, + { + "epoch": 0.35323965651834505, + "grad_norm": 0.6606851553944014, + "learning_rate": 9.990464489899644e-06, + "loss": 0.634, + "step": 905 + }, + { + "epoch": 0.35362997658079626, + "grad_norm": 0.6995737345284105, + "learning_rate": 9.990323791678427e-06, + "loss": 0.6641, + "step": 906 + }, + { + "epoch": 0.35402029664324747, + "grad_norm": 0.6181119978223077, + "learning_rate": 9.990182064036477e-06, + "loss": 0.6502, + "step": 907 + }, + { + "epoch": 0.3544106167056987, + "grad_norm": 0.7095277276193984, + "learning_rate": 9.990039307003027e-06, + "loss": 0.6807, + "step": 908 + }, + { + "epoch": 0.3548009367681499, + "grad_norm": 0.7973031334056785, + "learning_rate": 9.989895520607527e-06, + "loss": 0.693, + "step": 909 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.6919992073808248, + "learning_rate": 9.989750704879635e-06, + "loss": 0.6783, + "step": 910 + }, + { + "epoch": 0.3555815768930523, + "grad_norm": 0.6289498827638074, + "learning_rate": 9.989604859849228e-06, + "loss": 0.6641, + "step": 911 + }, + { + "epoch": 0.3559718969555035, + "grad_norm": 0.7286914211243523, + "learning_rate": 9.989457985546387e-06, + "loss": 0.6987, + "step": 912 + }, + { + "epoch": 0.35636221701795473, + "grad_norm": 0.7179359238407973, + "learning_rate": 9.989310082001416e-06, + "loss": 0.6601, + "step": 913 + }, + { + "epoch": 0.35675253708040594, + "grad_norm": 0.6634715905687889, + "learning_rate": 9.989161149244818e-06, + "loss": 0.675, + "step": 914 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.754551591990038, + "learning_rate": 9.989011187307318e-06, + "loss": 0.6424, + "step": 915 + }, + { + "epoch": 0.35753317720530836, + "grad_norm": 0.8269721730954461, + "learning_rate": 9.988860196219854e-06, + "loss": 0.6581, + "step": 916 + }, + { + "epoch": 0.35792349726775957, + "grad_norm": 0.7859221511573097, + "learning_rate": 9.988708176013568e-06, + "loss": 0.6527, + "step": 917 + }, + { + "epoch": 0.3583138173302108, + "grad_norm": 0.7187578371853772, + "learning_rate": 9.988555126719823e-06, + "loss": 0.6607, + "step": 918 + }, + { + "epoch": 0.358704137392662, + "grad_norm": 0.6692616721492546, + "learning_rate": 9.98840104837019e-06, + "loss": 0.6505, + "step": 919 + }, + { + "epoch": 0.3590944574551132, + "grad_norm": 0.726837334158033, + "learning_rate": 9.988245940996448e-06, + "loss": 0.6707, + "step": 920 + }, + { + "epoch": 0.3594847775175644, + "grad_norm": 0.6858328964897445, + "learning_rate": 9.9880898046306e-06, + "loss": 0.6557, + "step": 921 + }, + { + "epoch": 0.3598750975800156, + "grad_norm": 0.6360614050883492, + "learning_rate": 9.98793263930485e-06, + "loss": 0.6329, + "step": 922 + }, + { + "epoch": 0.36026541764246683, + "grad_norm": 0.6872781347436464, + "learning_rate": 9.987774445051619e-06, + "loss": 0.6486, + "step": 923 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.6997243790643336, + "learning_rate": 9.987615221903542e-06, + "loss": 0.6337, + "step": 924 + }, + { + "epoch": 0.36104605776736926, + "grad_norm": 0.6734325824013773, + "learning_rate": 9.987454969893461e-06, + "loss": 0.6792, + "step": 925 + }, + { + "epoch": 0.36143637782982047, + "grad_norm": 0.7301679171028675, + "learning_rate": 9.987293689054437e-06, + "loss": 0.6389, + "step": 926 + }, + { + "epoch": 0.3618266978922717, + "grad_norm": 0.7348317356172129, + "learning_rate": 9.987131379419736e-06, + "loss": 0.6632, + "step": 927 + }, + { + "epoch": 0.3622170179547229, + "grad_norm": 0.597453215172695, + "learning_rate": 9.986968041022843e-06, + "loss": 0.6694, + "step": 928 + }, + { + "epoch": 0.3626073380171741, + "grad_norm": 0.7131096255309883, + "learning_rate": 9.986803673897447e-06, + "loss": 0.65, + "step": 929 + }, + { + "epoch": 0.3629976580796253, + "grad_norm": 1.0238125736489228, + "learning_rate": 9.98663827807746e-06, + "loss": 0.6631, + "step": 930 + }, + { + "epoch": 0.3633879781420765, + "grad_norm": 0.818613775219595, + "learning_rate": 9.986471853596998e-06, + "loss": 0.6877, + "step": 931 + }, + { + "epoch": 0.36377829820452773, + "grad_norm": 0.7326900506005257, + "learning_rate": 9.98630440049039e-06, + "loss": 0.635, + "step": 932 + }, + { + "epoch": 0.36416861826697894, + "grad_norm": 0.7730630094127525, + "learning_rate": 9.98613591879218e-06, + "loss": 0.6672, + "step": 933 + }, + { + "epoch": 0.36455893832943015, + "grad_norm": 0.7111913788258493, + "learning_rate": 9.985966408537127e-06, + "loss": 0.6707, + "step": 934 + }, + { + "epoch": 0.36494925839188136, + "grad_norm": 0.6564588403343145, + "learning_rate": 9.985795869760191e-06, + "loss": 0.6659, + "step": 935 + }, + { + "epoch": 0.36533957845433257, + "grad_norm": 0.6410978902634978, + "learning_rate": 9.985624302496555e-06, + "loss": 0.6545, + "step": 936 + }, + { + "epoch": 0.3657298985167838, + "grad_norm": 0.6427196475124386, + "learning_rate": 9.985451706781612e-06, + "loss": 0.6303, + "step": 937 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 0.617599905367673, + "learning_rate": 9.985278082650962e-06, + "loss": 0.6698, + "step": 938 + }, + { + "epoch": 0.3665105386416862, + "grad_norm": 0.7709418946502231, + "learning_rate": 9.985103430140424e-06, + "loss": 0.7061, + "step": 939 + }, + { + "epoch": 0.3669008587041374, + "grad_norm": 0.8021013581833265, + "learning_rate": 9.984927749286023e-06, + "loss": 0.6724, + "step": 940 + }, + { + "epoch": 0.3672911787665886, + "grad_norm": 0.6492336730907584, + "learning_rate": 9.984751040124e-06, + "loss": 0.6616, + "step": 941 + }, + { + "epoch": 0.36768149882903983, + "grad_norm": 0.9259940799643579, + "learning_rate": 9.98457330269081e-06, + "loss": 0.6429, + "step": 942 + }, + { + "epoch": 0.36807181889149104, + "grad_norm": 0.7062157137679965, + "learning_rate": 9.984394537023114e-06, + "loss": 0.6326, + "step": 943 + }, + { + "epoch": 0.36846213895394225, + "grad_norm": 0.8947706815661135, + "learning_rate": 9.98421474315779e-06, + "loss": 0.6774, + "step": 944 + }, + { + "epoch": 0.36885245901639346, + "grad_norm": 0.7832145682881375, + "learning_rate": 9.984033921131923e-06, + "loss": 0.6553, + "step": 945 + }, + { + "epoch": 0.3692427790788447, + "grad_norm": 0.8114935230737282, + "learning_rate": 9.98385207098282e-06, + "loss": 0.6255, + "step": 946 + }, + { + "epoch": 0.3696330991412959, + "grad_norm": 0.8628106032732564, + "learning_rate": 9.983669192747988e-06, + "loss": 0.6656, + "step": 947 + }, + { + "epoch": 0.3700234192037471, + "grad_norm": 0.6909125818682901, + "learning_rate": 9.983485286465153e-06, + "loss": 0.6807, + "step": 948 + }, + { + "epoch": 0.3704137392661983, + "grad_norm": 0.9297731234719954, + "learning_rate": 9.983300352172254e-06, + "loss": 0.6535, + "step": 949 + }, + { + "epoch": 0.3708040593286495, + "grad_norm": 0.8749647220810091, + "learning_rate": 9.983114389907437e-06, + "loss": 0.633, + "step": 950 + }, + { + "epoch": 0.3711943793911007, + "grad_norm": 0.7916621402165683, + "learning_rate": 9.982927399709067e-06, + "loss": 0.682, + "step": 951 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 1.0554548229281602, + "learning_rate": 9.982739381615712e-06, + "loss": 0.6863, + "step": 952 + }, + { + "epoch": 0.37197501951600315, + "grad_norm": 0.7762403285157347, + "learning_rate": 9.982550335666162e-06, + "loss": 0.6589, + "step": 953 + }, + { + "epoch": 0.37236533957845436, + "grad_norm": 1.0958602603891685, + "learning_rate": 9.98236026189941e-06, + "loss": 0.6742, + "step": 954 + }, + { + "epoch": 0.37275565964090557, + "grad_norm": 0.9610880373004056, + "learning_rate": 9.982169160354664e-06, + "loss": 0.6535, + "step": 955 + }, + { + "epoch": 0.3731459797033568, + "grad_norm": 0.8947011352318408, + "learning_rate": 9.981977031071351e-06, + "loss": 0.6453, + "step": 956 + }, + { + "epoch": 0.373536299765808, + "grad_norm": 0.8538692185038268, + "learning_rate": 9.981783874089099e-06, + "loss": 0.6579, + "step": 957 + }, + { + "epoch": 0.3739266198282592, + "grad_norm": 0.728923917514734, + "learning_rate": 9.981589689447754e-06, + "loss": 0.6401, + "step": 958 + }, + { + "epoch": 0.3743169398907104, + "grad_norm": 1.008909956373156, + "learning_rate": 9.981394477187374e-06, + "loss": 0.6597, + "step": 959 + }, + { + "epoch": 0.3747072599531616, + "grad_norm": 0.7210604723654161, + "learning_rate": 9.981198237348228e-06, + "loss": 0.6432, + "step": 960 + }, + { + "epoch": 0.3750975800156128, + "grad_norm": 0.870276285108072, + "learning_rate": 9.981000969970797e-06, + "loss": 0.6933, + "step": 961 + }, + { + "epoch": 0.375487900078064, + "grad_norm": 0.7506500201273586, + "learning_rate": 9.980802675095774e-06, + "loss": 0.6602, + "step": 962 + }, + { + "epoch": 0.3758782201405152, + "grad_norm": 0.7721722725924652, + "learning_rate": 9.980603352764063e-06, + "loss": 0.6425, + "step": 963 + }, + { + "epoch": 0.3762685402029664, + "grad_norm": 0.9088154623166647, + "learning_rate": 9.980403003016782e-06, + "loss": 0.6517, + "step": 964 + }, + { + "epoch": 0.3766588602654176, + "grad_norm": 0.741015779074416, + "learning_rate": 9.98020162589526e-06, + "loss": 0.6256, + "step": 965 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.9654484121349824, + "learning_rate": 9.979999221441036e-06, + "loss": 0.6442, + "step": 966 + }, + { + "epoch": 0.37743950039032004, + "grad_norm": 0.78422074193994, + "learning_rate": 9.979795789695864e-06, + "loss": 0.674, + "step": 967 + }, + { + "epoch": 0.37782982045277125, + "grad_norm": 0.771233250081318, + "learning_rate": 9.979591330701707e-06, + "loss": 0.6336, + "step": 968 + }, + { + "epoch": 0.37822014051522246, + "grad_norm": 0.946071968766693, + "learning_rate": 9.979385844500746e-06, + "loss": 0.6829, + "step": 969 + }, + { + "epoch": 0.37861046057767367, + "grad_norm": 0.7344390801726837, + "learning_rate": 9.979179331135364e-06, + "loss": 0.6547, + "step": 970 + }, + { + "epoch": 0.3790007806401249, + "grad_norm": 0.7369699145534009, + "learning_rate": 9.978971790648164e-06, + "loss": 0.6388, + "step": 971 + }, + { + "epoch": 0.3793911007025761, + "grad_norm": 0.8502924784003716, + "learning_rate": 9.978763223081958e-06, + "loss": 0.6539, + "step": 972 + }, + { + "epoch": 0.3797814207650273, + "grad_norm": 0.794246850219419, + "learning_rate": 9.97855362847977e-06, + "loss": 0.6416, + "step": 973 + }, + { + "epoch": 0.3801717408274785, + "grad_norm": 0.6936208409668412, + "learning_rate": 9.978343006884835e-06, + "loss": 0.7105, + "step": 974 + }, + { + "epoch": 0.3805620608899297, + "grad_norm": 0.8300695260202947, + "learning_rate": 9.9781313583406e-06, + "loss": 0.6601, + "step": 975 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.6766591314415691, + "learning_rate": 9.977918682890728e-06, + "loss": 0.6637, + "step": 976 + }, + { + "epoch": 0.38134270101483214, + "grad_norm": 0.7949680183150085, + "learning_rate": 9.977704980579086e-06, + "loss": 0.6132, + "step": 977 + }, + { + "epoch": 0.38173302107728335, + "grad_norm": 0.7438553479457072, + "learning_rate": 9.977490251449762e-06, + "loss": 0.6372, + "step": 978 + }, + { + "epoch": 0.38212334113973456, + "grad_norm": 0.8244053584289949, + "learning_rate": 9.977274495547047e-06, + "loss": 0.6562, + "step": 979 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.8374580085660149, + "learning_rate": 9.977057712915448e-06, + "loss": 0.675, + "step": 980 + }, + { + "epoch": 0.382903981264637, + "grad_norm": 0.8343554896082509, + "learning_rate": 9.976839903599687e-06, + "loss": 0.6711, + "step": 981 + }, + { + "epoch": 0.3832943013270882, + "grad_norm": 0.7596759257142047, + "learning_rate": 9.976621067644693e-06, + "loss": 0.6634, + "step": 982 + }, + { + "epoch": 0.3836846213895394, + "grad_norm": 0.7729706693104199, + "learning_rate": 9.976401205095607e-06, + "loss": 0.6936, + "step": 983 + }, + { + "epoch": 0.3840749414519906, + "grad_norm": 0.8897856176720594, + "learning_rate": 9.976180315997784e-06, + "loss": 0.6859, + "step": 984 + }, + { + "epoch": 0.3844652615144418, + "grad_norm": 0.8938719953521139, + "learning_rate": 9.975958400396788e-06, + "loss": 0.656, + "step": 985 + }, + { + "epoch": 0.38485558157689304, + "grad_norm": 0.7863205833137813, + "learning_rate": 9.975735458338399e-06, + "loss": 0.6613, + "step": 986 + }, + { + "epoch": 0.38524590163934425, + "grad_norm": 0.828943833669335, + "learning_rate": 9.975511489868604e-06, + "loss": 0.65, + "step": 987 + }, + { + "epoch": 0.38563622170179546, + "grad_norm": 0.7351071642393291, + "learning_rate": 9.975286495033608e-06, + "loss": 0.6311, + "step": 988 + }, + { + "epoch": 0.38602654176424667, + "grad_norm": 0.7059002282368293, + "learning_rate": 9.975060473879817e-06, + "loss": 0.6804, + "step": 989 + }, + { + "epoch": 0.3864168618266979, + "grad_norm": 0.6850461528337494, + "learning_rate": 9.974833426453862e-06, + "loss": 0.6542, + "step": 990 + }, + { + "epoch": 0.3868071818891491, + "grad_norm": 0.6697840444307754, + "learning_rate": 9.974605352802575e-06, + "loss": 0.6344, + "step": 991 + }, + { + "epoch": 0.3871975019516003, + "grad_norm": 0.8601351063116123, + "learning_rate": 9.974376252973007e-06, + "loss": 0.668, + "step": 992 + }, + { + "epoch": 0.3875878220140515, + "grad_norm": 0.8863834482388102, + "learning_rate": 9.974146127012412e-06, + "loss": 0.6663, + "step": 993 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.7028355090508427, + "learning_rate": 9.973914974968266e-06, + "loss": 0.6773, + "step": 994 + }, + { + "epoch": 0.38836846213895393, + "grad_norm": 0.8829968527303439, + "learning_rate": 9.973682796888253e-06, + "loss": 0.6611, + "step": 995 + }, + { + "epoch": 0.38875878220140514, + "grad_norm": 0.9453961430831636, + "learning_rate": 9.973449592820263e-06, + "loss": 0.6437, + "step": 996 + }, + { + "epoch": 0.38914910226385635, + "grad_norm": 0.7359433493057086, + "learning_rate": 9.973215362812405e-06, + "loss": 0.6683, + "step": 997 + }, + { + "epoch": 0.38953942232630756, + "grad_norm": 0.910855479308881, + "learning_rate": 9.972980106912994e-06, + "loss": 0.6569, + "step": 998 + }, + { + "epoch": 0.38992974238875877, + "grad_norm": 0.7769806277366309, + "learning_rate": 9.972743825170564e-06, + "loss": 0.6297, + "step": 999 + }, + { + "epoch": 0.39032006245121, + "grad_norm": 0.827263172271414, + "learning_rate": 9.972506517633851e-06, + "loss": 0.6638, + "step": 1000 + }, + { + "epoch": 0.3907103825136612, + "grad_norm": 0.8562230381916206, + "learning_rate": 9.97226818435181e-06, + "loss": 0.6812, + "step": 1001 + }, + { + "epoch": 0.3911007025761124, + "grad_norm": 0.704058072990509, + "learning_rate": 9.972028825373605e-06, + "loss": 0.6484, + "step": 1002 + }, + { + "epoch": 0.3914910226385636, + "grad_norm": 0.7311600036299489, + "learning_rate": 9.971788440748612e-06, + "loss": 0.6263, + "step": 1003 + }, + { + "epoch": 0.3918813427010148, + "grad_norm": 0.6663246006167494, + "learning_rate": 9.97154703052642e-06, + "loss": 0.6713, + "step": 1004 + }, + { + "epoch": 0.39227166276346603, + "grad_norm": 0.6747615057222769, + "learning_rate": 9.971304594756825e-06, + "loss": 0.6545, + "step": 1005 + }, + { + "epoch": 0.39266198282591724, + "grad_norm": 0.6350642521339785, + "learning_rate": 9.97106113348984e-06, + "loss": 0.6731, + "step": 1006 + }, + { + "epoch": 0.39305230288836845, + "grad_norm": 0.6265724560970402, + "learning_rate": 9.970816646775683e-06, + "loss": 0.5976, + "step": 1007 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.6414602918981417, + "learning_rate": 9.97057113466479e-06, + "loss": 0.6958, + "step": 1008 + }, + { + "epoch": 0.3938329430132709, + "grad_norm": 0.6176756443355969, + "learning_rate": 9.970324597207807e-06, + "loss": 0.6381, + "step": 1009 + }, + { + "epoch": 0.3942232630757221, + "grad_norm": 0.6714661613329094, + "learning_rate": 9.97007703445559e-06, + "loss": 0.6616, + "step": 1010 + }, + { + "epoch": 0.3946135831381733, + "grad_norm": 0.5972216511056553, + "learning_rate": 9.969828446459207e-06, + "loss": 0.6614, + "step": 1011 + }, + { + "epoch": 0.3950039032006245, + "grad_norm": 0.7168512626627481, + "learning_rate": 9.969578833269938e-06, + "loss": 0.6491, + "step": 1012 + }, + { + "epoch": 0.3953942232630757, + "grad_norm": 0.6933552093243694, + "learning_rate": 9.969328194939272e-06, + "loss": 0.6843, + "step": 1013 + }, + { + "epoch": 0.3957845433255269, + "grad_norm": 0.6359161130704287, + "learning_rate": 9.969076531518912e-06, + "loss": 0.661, + "step": 1014 + }, + { + "epoch": 0.39617486338797814, + "grad_norm": 0.6773879266841489, + "learning_rate": 9.968823843060776e-06, + "loss": 0.6366, + "step": 1015 + }, + { + "epoch": 0.39656518345042935, + "grad_norm": 0.6468509012977908, + "learning_rate": 9.968570129616984e-06, + "loss": 0.6329, + "step": 1016 + }, + { + "epoch": 0.39695550351288056, + "grad_norm": 0.658756265348862, + "learning_rate": 9.968315391239875e-06, + "loss": 0.6705, + "step": 1017 + }, + { + "epoch": 0.39734582357533177, + "grad_norm": 0.639235990637636, + "learning_rate": 9.968059627981999e-06, + "loss": 0.659, + "step": 1018 + }, + { + "epoch": 0.397736143637783, + "grad_norm": 0.680149411884182, + "learning_rate": 9.967802839896112e-06, + "loss": 0.6176, + "step": 1019 + }, + { + "epoch": 0.3981264637002342, + "grad_norm": 0.6842943009983518, + "learning_rate": 9.96754502703519e-06, + "loss": 0.6598, + "step": 1020 + }, + { + "epoch": 0.3985167837626854, + "grad_norm": 0.6500061710873568, + "learning_rate": 9.967286189452412e-06, + "loss": 0.6684, + "step": 1021 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 0.6389813566058936, + "learning_rate": 9.967026327201171e-06, + "loss": 0.6656, + "step": 1022 + }, + { + "epoch": 0.3992974238875878, + "grad_norm": 0.6853051813338732, + "learning_rate": 9.966765440335075e-06, + "loss": 0.6439, + "step": 1023 + }, + { + "epoch": 0.39968774395003903, + "grad_norm": 0.6741264225612728, + "learning_rate": 9.96650352890794e-06, + "loss": 0.6277, + "step": 1024 + }, + { + "epoch": 0.40007806401249024, + "grad_norm": 0.8355803655529512, + "learning_rate": 9.966240592973793e-06, + "loss": 0.6467, + "step": 1025 + }, + { + "epoch": 0.40046838407494145, + "grad_norm": 0.8208615855049008, + "learning_rate": 9.965976632586875e-06, + "loss": 0.6383, + "step": 1026 + }, + { + "epoch": 0.40085870413739266, + "grad_norm": 0.8377518473458557, + "learning_rate": 9.965711647801633e-06, + "loss": 0.6419, + "step": 1027 + }, + { + "epoch": 0.4012490241998439, + "grad_norm": 0.7252165801030576, + "learning_rate": 9.965445638672733e-06, + "loss": 0.6707, + "step": 1028 + }, + { + "epoch": 0.4016393442622951, + "grad_norm": 0.8035120627497945, + "learning_rate": 9.965178605255047e-06, + "loss": 0.655, + "step": 1029 + }, + { + "epoch": 0.4020296643247463, + "grad_norm": 0.8169215253611208, + "learning_rate": 9.964910547603659e-06, + "loss": 0.6382, + "step": 1030 + }, + { + "epoch": 0.4024199843871975, + "grad_norm": 0.6824463725981876, + "learning_rate": 9.964641465773865e-06, + "loss": 0.6632, + "step": 1031 + }, + { + "epoch": 0.4028103044496487, + "grad_norm": 0.8359475791766394, + "learning_rate": 9.964371359821173e-06, + "loss": 0.7232, + "step": 1032 + }, + { + "epoch": 0.4032006245120999, + "grad_norm": 0.7112122410203303, + "learning_rate": 9.964100229801297e-06, + "loss": 0.6578, + "step": 1033 + }, + { + "epoch": 0.40359094457455114, + "grad_norm": 0.7585076253809897, + "learning_rate": 9.963828075770173e-06, + "loss": 0.6823, + "step": 1034 + }, + { + "epoch": 0.40398126463700235, + "grad_norm": 0.6614226252690157, + "learning_rate": 9.963554897783939e-06, + "loss": 0.6353, + "step": 1035 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.9196654280317685, + "learning_rate": 9.963280695898948e-06, + "loss": 0.6525, + "step": 1036 + }, + { + "epoch": 0.40476190476190477, + "grad_norm": 0.6557370888615075, + "learning_rate": 9.96300547017176e-06, + "loss": 0.6523, + "step": 1037 + }, + { + "epoch": 0.405152224824356, + "grad_norm": 0.6865691015724926, + "learning_rate": 9.962729220659152e-06, + "loss": 0.6406, + "step": 1038 + }, + { + "epoch": 0.4055425448868072, + "grad_norm": 0.8086089351819827, + "learning_rate": 9.962451947418111e-06, + "loss": 0.6851, + "step": 1039 + }, + { + "epoch": 0.4059328649492584, + "grad_norm": 0.6739897181173845, + "learning_rate": 9.96217365050583e-06, + "loss": 0.6761, + "step": 1040 + }, + { + "epoch": 0.4063231850117096, + "grad_norm": 0.8719295289969257, + "learning_rate": 9.961894329979722e-06, + "loss": 0.6533, + "step": 1041 + }, + { + "epoch": 0.4067135050741608, + "grad_norm": 0.6937155427435232, + "learning_rate": 9.9616139858974e-06, + "loss": 0.626, + "step": 1042 + }, + { + "epoch": 0.40710382513661203, + "grad_norm": 0.7808911929939567, + "learning_rate": 9.9613326183167e-06, + "loss": 0.675, + "step": 1043 + }, + { + "epoch": 0.40749414519906324, + "grad_norm": 0.681937214727104, + "learning_rate": 9.961050227295664e-06, + "loss": 0.65, + "step": 1044 + }, + { + "epoch": 0.40788446526151445, + "grad_norm": 0.7569056890424037, + "learning_rate": 9.960766812892539e-06, + "loss": 0.6631, + "step": 1045 + }, + { + "epoch": 0.40827478532396566, + "grad_norm": 0.7572509169562316, + "learning_rate": 9.960482375165791e-06, + "loss": 0.6638, + "step": 1046 + }, + { + "epoch": 0.40866510538641687, + "grad_norm": 0.6090662538223893, + "learning_rate": 9.960196914174097e-06, + "loss": 0.6841, + "step": 1047 + }, + { + "epoch": 0.4090554254488681, + "grad_norm": 0.7393708341662586, + "learning_rate": 9.95991042997634e-06, + "loss": 0.6369, + "step": 1048 + }, + { + "epoch": 0.4094457455113193, + "grad_norm": 0.6565730530858391, + "learning_rate": 9.959622922631617e-06, + "loss": 0.6433, + "step": 1049 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.6984391554771562, + "learning_rate": 9.95933439219924e-06, + "loss": 0.6617, + "step": 1050 + }, + { + "epoch": 0.4102263856362217, + "grad_norm": 0.7462261737077062, + "learning_rate": 9.959044838738722e-06, + "loss": 0.63, + "step": 1051 + }, + { + "epoch": 0.4106167056986729, + "grad_norm": 0.6538933235714344, + "learning_rate": 9.9587542623098e-06, + "loss": 0.6614, + "step": 1052 + }, + { + "epoch": 0.41100702576112413, + "grad_norm": 0.7085687787020671, + "learning_rate": 9.958462662972407e-06, + "loss": 0.6549, + "step": 1053 + }, + { + "epoch": 0.41139734582357534, + "grad_norm": 0.6668703617145805, + "learning_rate": 9.958170040786703e-06, + "loss": 0.6926, + "step": 1054 + }, + { + "epoch": 0.41178766588602655, + "grad_norm": 0.7064695397752162, + "learning_rate": 9.957876395813046e-06, + "loss": 0.6594, + "step": 1055 + }, + { + "epoch": 0.41217798594847777, + "grad_norm": 0.6209377848346703, + "learning_rate": 9.95758172811201e-06, + "loss": 0.6472, + "step": 1056 + }, + { + "epoch": 0.412568306010929, + "grad_norm": 0.6771376038834885, + "learning_rate": 9.957286037744383e-06, + "loss": 0.6526, + "step": 1057 + }, + { + "epoch": 0.4129586260733802, + "grad_norm": 0.7093940393477729, + "learning_rate": 9.95698932477116e-06, + "loss": 0.6382, + "step": 1058 + }, + { + "epoch": 0.4133489461358314, + "grad_norm": 0.7190452434597155, + "learning_rate": 9.956691589253546e-06, + "loss": 0.6729, + "step": 1059 + }, + { + "epoch": 0.4137392661982826, + "grad_norm": 0.6873690642025138, + "learning_rate": 9.95639283125296e-06, + "loss": 0.6583, + "step": 1060 + }, + { + "epoch": 0.4141295862607338, + "grad_norm": 0.6399171148110735, + "learning_rate": 9.956093050831032e-06, + "loss": 0.6131, + "step": 1061 + }, + { + "epoch": 0.41451990632318503, + "grad_norm": 0.6608667188596068, + "learning_rate": 9.955792248049603e-06, + "loss": 0.6525, + "step": 1062 + }, + { + "epoch": 0.41491022638563624, + "grad_norm": 0.7028670459773788, + "learning_rate": 9.95549042297072e-06, + "loss": 0.6243, + "step": 1063 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.6202023557975803, + "learning_rate": 9.955187575656644e-06, + "loss": 0.6357, + "step": 1064 + }, + { + "epoch": 0.41569086651053866, + "grad_norm": 0.7279523638281133, + "learning_rate": 9.95488370616985e-06, + "loss": 0.6788, + "step": 1065 + }, + { + "epoch": 0.41608118657298987, + "grad_norm": 0.7857113605112602, + "learning_rate": 9.954578814573021e-06, + "loss": 0.6786, + "step": 1066 + }, + { + "epoch": 0.4164715066354411, + "grad_norm": 0.6976857645435346, + "learning_rate": 9.954272900929051e-06, + "loss": 0.6584, + "step": 1067 + }, + { + "epoch": 0.4168618266978923, + "grad_norm": 0.6755532787232612, + "learning_rate": 9.953965965301045e-06, + "loss": 0.6636, + "step": 1068 + }, + { + "epoch": 0.4172521467603435, + "grad_norm": 0.7087309602599289, + "learning_rate": 9.953658007752318e-06, + "loss": 0.6696, + "step": 1069 + }, + { + "epoch": 0.4176424668227947, + "grad_norm": 0.7677078917597703, + "learning_rate": 9.953349028346395e-06, + "loss": 0.6611, + "step": 1070 + }, + { + "epoch": 0.4180327868852459, + "grad_norm": 0.6871039248584069, + "learning_rate": 9.953039027147017e-06, + "loss": 0.6584, + "step": 1071 + }, + { + "epoch": 0.41842310694769713, + "grad_norm": 0.855154381922114, + "learning_rate": 9.952728004218131e-06, + "loss": 0.6375, + "step": 1072 + }, + { + "epoch": 0.41881342701014834, + "grad_norm": 0.7510707297571994, + "learning_rate": 9.952415959623893e-06, + "loss": 0.6643, + "step": 1073 + }, + { + "epoch": 0.41920374707259955, + "grad_norm": 0.7686397195480191, + "learning_rate": 9.952102893428675e-06, + "loss": 0.6361, + "step": 1074 + }, + { + "epoch": 0.41959406713505076, + "grad_norm": 0.6983348635360835, + "learning_rate": 9.951788805697056e-06, + "loss": 0.6305, + "step": 1075 + }, + { + "epoch": 0.419984387197502, + "grad_norm": 0.6585980252925762, + "learning_rate": 9.95147369649383e-06, + "loss": 0.6615, + "step": 1076 + }, + { + "epoch": 0.4203747072599532, + "grad_norm": 0.808431611513896, + "learning_rate": 9.951157565883996e-06, + "loss": 0.6125, + "step": 1077 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.8224664029325658, + "learning_rate": 9.950840413932766e-06, + "loss": 0.6403, + "step": 1078 + }, + { + "epoch": 0.4211553473848556, + "grad_norm": 0.703554234056007, + "learning_rate": 9.950522240705565e-06, + "loss": 0.6388, + "step": 1079 + }, + { + "epoch": 0.4215456674473068, + "grad_norm": 0.8810988683794522, + "learning_rate": 9.950203046268026e-06, + "loss": 0.6967, + "step": 1080 + }, + { + "epoch": 0.421935987509758, + "grad_norm": 0.8262835865031622, + "learning_rate": 9.949882830685996e-06, + "loss": 0.625, + "step": 1081 + }, + { + "epoch": 0.42232630757220924, + "grad_norm": 0.6559436277197671, + "learning_rate": 9.949561594025526e-06, + "loss": 0.6334, + "step": 1082 + }, + { + "epoch": 0.42271662763466045, + "grad_norm": 0.8564698336810679, + "learning_rate": 9.949239336352883e-06, + "loss": 0.6611, + "step": 1083 + }, + { + "epoch": 0.42310694769711166, + "grad_norm": 0.6935397622741132, + "learning_rate": 9.948916057734546e-06, + "loss": 0.6523, + "step": 1084 + }, + { + "epoch": 0.42349726775956287, + "grad_norm": 0.6995392063365319, + "learning_rate": 9.948591758237198e-06, + "loss": 0.6541, + "step": 1085 + }, + { + "epoch": 0.4238875878220141, + "grad_norm": 0.953223429984497, + "learning_rate": 9.948266437927741e-06, + "loss": 0.6376, + "step": 1086 + }, + { + "epoch": 0.4242779078844653, + "grad_norm": 0.8058207280569994, + "learning_rate": 9.947940096873279e-06, + "loss": 0.6846, + "step": 1087 + }, + { + "epoch": 0.4246682279469165, + "grad_norm": 0.7688066757242621, + "learning_rate": 9.947612735141132e-06, + "loss": 0.6922, + "step": 1088 + }, + { + "epoch": 0.42505854800936765, + "grad_norm": 0.6834681433569832, + "learning_rate": 9.947284352798831e-06, + "loss": 0.6519, + "step": 1089 + }, + { + "epoch": 0.42544886807181886, + "grad_norm": 0.7708987581645261, + "learning_rate": 9.946954949914117e-06, + "loss": 0.6152, + "step": 1090 + }, + { + "epoch": 0.4258391881342701, + "grad_norm": 0.6194503847628355, + "learning_rate": 9.946624526554933e-06, + "loss": 0.6431, + "step": 1091 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.7416391069354705, + "learning_rate": 9.946293082789448e-06, + "loss": 0.6327, + "step": 1092 + }, + { + "epoch": 0.4266198282591725, + "grad_norm": 0.621700370359625, + "learning_rate": 9.94596061868603e-06, + "loss": 0.6491, + "step": 1093 + }, + { + "epoch": 0.4270101483216237, + "grad_norm": 0.7779955236727596, + "learning_rate": 9.945627134313261e-06, + "loss": 0.6371, + "step": 1094 + }, + { + "epoch": 0.4274004683840749, + "grad_norm": 0.6536724366584644, + "learning_rate": 9.945292629739932e-06, + "loss": 0.6641, + "step": 1095 + }, + { + "epoch": 0.4277907884465261, + "grad_norm": 0.7406571287991791, + "learning_rate": 9.944957105035048e-06, + "loss": 0.6825, + "step": 1096 + }, + { + "epoch": 0.42818110850897734, + "grad_norm": 0.7957520847211289, + "learning_rate": 9.944620560267821e-06, + "loss": 0.6708, + "step": 1097 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.6733383814139053, + "learning_rate": 9.944282995507675e-06, + "loss": 0.6583, + "step": 1098 + }, + { + "epoch": 0.42896174863387976, + "grad_norm": 0.7367336066998705, + "learning_rate": 9.943944410824242e-06, + "loss": 0.6805, + "step": 1099 + }, + { + "epoch": 0.42935206869633097, + "grad_norm": 0.718799145163475, + "learning_rate": 9.943604806287371e-06, + "loss": 0.6616, + "step": 1100 + }, + { + "epoch": 0.4297423887587822, + "grad_norm": 0.5731469491050711, + "learning_rate": 9.943264181967112e-06, + "loss": 0.6128, + "step": 1101 + }, + { + "epoch": 0.4301327088212334, + "grad_norm": 0.7428679787636487, + "learning_rate": 9.94292253793373e-06, + "loss": 0.6848, + "step": 1102 + }, + { + "epoch": 0.4305230288836846, + "grad_norm": 0.7088337933617601, + "learning_rate": 9.942579874257705e-06, + "loss": 0.6495, + "step": 1103 + }, + { + "epoch": 0.4309133489461358, + "grad_norm": 0.8247150173840526, + "learning_rate": 9.94223619100972e-06, + "loss": 0.6805, + "step": 1104 + }, + { + "epoch": 0.431303669008587, + "grad_norm": 0.717687192947493, + "learning_rate": 9.941891488260671e-06, + "loss": 0.6528, + "step": 1105 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 0.6883281624535892, + "learning_rate": 9.941545766081666e-06, + "loss": 0.6404, + "step": 1106 + }, + { + "epoch": 0.43208430913348944, + "grad_norm": 0.7228712516611936, + "learning_rate": 9.94119902454402e-06, + "loss": 0.6937, + "step": 1107 + }, + { + "epoch": 0.43247462919594065, + "grad_norm": 0.835362579007804, + "learning_rate": 9.94085126371926e-06, + "loss": 0.6529, + "step": 1108 + }, + { + "epoch": 0.43286494925839186, + "grad_norm": 0.6287313128458193, + "learning_rate": 9.940502483679125e-06, + "loss": 0.616, + "step": 1109 + }, + { + "epoch": 0.4332552693208431, + "grad_norm": 0.7670654037330542, + "learning_rate": 9.940152684495558e-06, + "loss": 0.679, + "step": 1110 + }, + { + "epoch": 0.4336455893832943, + "grad_norm": 0.8282060948138599, + "learning_rate": 9.939801866240724e-06, + "loss": 0.6221, + "step": 1111 + }, + { + "epoch": 0.4340359094457455, + "grad_norm": 0.7055866201944823, + "learning_rate": 9.939450028986986e-06, + "loss": 0.6478, + "step": 1112 + }, + { + "epoch": 0.4344262295081967, + "grad_norm": 0.7748332340906642, + "learning_rate": 9.939097172806921e-06, + "loss": 0.6551, + "step": 1113 + }, + { + "epoch": 0.4348165495706479, + "grad_norm": 0.9554576613166923, + "learning_rate": 9.938743297773324e-06, + "loss": 0.6435, + "step": 1114 + }, + { + "epoch": 0.4352068696330991, + "grad_norm": 0.7083120060098785, + "learning_rate": 9.938388403959184e-06, + "loss": 0.6607, + "step": 1115 + }, + { + "epoch": 0.43559718969555034, + "grad_norm": 0.8646262941958603, + "learning_rate": 9.93803249143772e-06, + "loss": 0.6757, + "step": 1116 + }, + { + "epoch": 0.43598750975800155, + "grad_norm": 0.8156769651303645, + "learning_rate": 9.937675560282342e-06, + "loss": 0.6601, + "step": 1117 + }, + { + "epoch": 0.43637782982045276, + "grad_norm": 0.6796271937263917, + "learning_rate": 9.937317610566684e-06, + "loss": 0.658, + "step": 1118 + }, + { + "epoch": 0.43676814988290397, + "grad_norm": 0.715084073023934, + "learning_rate": 9.936958642364583e-06, + "loss": 0.6359, + "step": 1119 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.8602784761604669, + "learning_rate": 9.936598655750089e-06, + "loss": 0.6832, + "step": 1120 + }, + { + "epoch": 0.4375487900078064, + "grad_norm": 0.7964853344644717, + "learning_rate": 9.936237650797461e-06, + "loss": 0.6961, + "step": 1121 + }, + { + "epoch": 0.4379391100702576, + "grad_norm": 0.6542507663584805, + "learning_rate": 9.935875627581169e-06, + "loss": 0.6132, + "step": 1122 + }, + { + "epoch": 0.4383294301327088, + "grad_norm": 1.1746523095954333, + "learning_rate": 9.935512586175891e-06, + "loss": 0.6882, + "step": 1123 + }, + { + "epoch": 0.43871975019516, + "grad_norm": 0.9702189038206456, + "learning_rate": 9.935148526656517e-06, + "loss": 0.666, + "step": 1124 + }, + { + "epoch": 0.43911007025761123, + "grad_norm": 0.7667127308457101, + "learning_rate": 9.934783449098148e-06, + "loss": 0.639, + "step": 1125 + }, + { + "epoch": 0.43950039032006244, + "grad_norm": 0.9417932700941425, + "learning_rate": 9.934417353576092e-06, + "loss": 0.6673, + "step": 1126 + }, + { + "epoch": 0.43989071038251365, + "grad_norm": 0.9068569618102791, + "learning_rate": 9.934050240165867e-06, + "loss": 0.6518, + "step": 1127 + }, + { + "epoch": 0.44028103044496486, + "grad_norm": 0.691827939156401, + "learning_rate": 9.933682108943203e-06, + "loss": 0.6466, + "step": 1128 + }, + { + "epoch": 0.44067135050741607, + "grad_norm": 0.9814423000929126, + "learning_rate": 9.933312959984043e-06, + "loss": 0.6461, + "step": 1129 + }, + { + "epoch": 0.4410616705698673, + "grad_norm": 1.0600749096028779, + "learning_rate": 9.932942793364532e-06, + "loss": 0.6649, + "step": 1130 + }, + { + "epoch": 0.4414519906323185, + "grad_norm": 0.7185243176248681, + "learning_rate": 9.93257160916103e-06, + "loss": 0.655, + "step": 1131 + }, + { + "epoch": 0.4418423106947697, + "grad_norm": 0.9387610803068602, + "learning_rate": 9.932199407450107e-06, + "loss": 0.6367, + "step": 1132 + }, + { + "epoch": 0.4422326307572209, + "grad_norm": 0.9394884413883758, + "learning_rate": 9.931826188308543e-06, + "loss": 0.6712, + "step": 1133 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 0.6848352736880978, + "learning_rate": 9.931451951813324e-06, + "loss": 0.635, + "step": 1134 + }, + { + "epoch": 0.44301327088212333, + "grad_norm": 1.0472447406274732, + "learning_rate": 9.931076698041652e-06, + "loss": 0.6684, + "step": 1135 + }, + { + "epoch": 0.44340359094457454, + "grad_norm": 0.7426811141913181, + "learning_rate": 9.930700427070934e-06, + "loss": 0.622, + "step": 1136 + }, + { + "epoch": 0.44379391100702575, + "grad_norm": 0.7762438804467773, + "learning_rate": 9.930323138978788e-06, + "loss": 0.6699, + "step": 1137 + }, + { + "epoch": 0.44418423106947696, + "grad_norm": 0.8074050747219473, + "learning_rate": 9.929944833843042e-06, + "loss": 0.6611, + "step": 1138 + }, + { + "epoch": 0.4445745511319282, + "grad_norm": 0.643342352850604, + "learning_rate": 9.929565511741735e-06, + "loss": 0.647, + "step": 1139 + }, + { + "epoch": 0.4449648711943794, + "grad_norm": 0.7564326584480215, + "learning_rate": 9.929185172753115e-06, + "loss": 0.6439, + "step": 1140 + }, + { + "epoch": 0.4453551912568306, + "grad_norm": 0.616191385756841, + "learning_rate": 9.92880381695564e-06, + "loss": 0.6231, + "step": 1141 + }, + { + "epoch": 0.4457455113192818, + "grad_norm": 0.7044943548786881, + "learning_rate": 9.928421444427974e-06, + "loss": 0.6654, + "step": 1142 + }, + { + "epoch": 0.446135831381733, + "grad_norm": 0.6512240375722023, + "learning_rate": 9.928038055248999e-06, + "loss": 0.6733, + "step": 1143 + }, + { + "epoch": 0.4465261514441842, + "grad_norm": 0.6168222537597935, + "learning_rate": 9.9276536494978e-06, + "loss": 0.6468, + "step": 1144 + }, + { + "epoch": 0.44691647150663544, + "grad_norm": 0.614241803121342, + "learning_rate": 9.927268227253671e-06, + "loss": 0.6423, + "step": 1145 + }, + { + "epoch": 0.44730679156908665, + "grad_norm": 0.5802320051158063, + "learning_rate": 9.926881788596122e-06, + "loss": 0.6535, + "step": 1146 + }, + { + "epoch": 0.44769711163153786, + "grad_norm": 0.8389156234066591, + "learning_rate": 9.926494333604866e-06, + "loss": 0.6576, + "step": 1147 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.6381062173621527, + "learning_rate": 9.926105862359829e-06, + "loss": 0.6465, + "step": 1148 + }, + { + "epoch": 0.4484777517564403, + "grad_norm": 0.7638421634698968, + "learning_rate": 9.925716374941148e-06, + "loss": 0.6713, + "step": 1149 + }, + { + "epoch": 0.4488680718188915, + "grad_norm": 0.626155803546183, + "learning_rate": 9.925325871429166e-06, + "loss": 0.6339, + "step": 1150 + }, + { + "epoch": 0.4492583918813427, + "grad_norm": 0.7788940082912861, + "learning_rate": 9.924934351904437e-06, + "loss": 0.6798, + "step": 1151 + }, + { + "epoch": 0.4496487119437939, + "grad_norm": 0.8024541475693107, + "learning_rate": 9.924541816447726e-06, + "loss": 0.6808, + "step": 1152 + }, + { + "epoch": 0.4500390320062451, + "grad_norm": 0.6768632317439451, + "learning_rate": 9.924148265140008e-06, + "loss": 0.6394, + "step": 1153 + }, + { + "epoch": 0.45042935206869633, + "grad_norm": 0.6711447626611808, + "learning_rate": 9.923753698062462e-06, + "loss": 0.6537, + "step": 1154 + }, + { + "epoch": 0.45081967213114754, + "grad_norm": 0.7004031738952841, + "learning_rate": 9.923358115296486e-06, + "loss": 0.6717, + "step": 1155 + }, + { + "epoch": 0.45120999219359875, + "grad_norm": 0.8687236728564197, + "learning_rate": 9.922961516923677e-06, + "loss": 0.651, + "step": 1156 + }, + { + "epoch": 0.45160031225604996, + "grad_norm": 0.657623993208644, + "learning_rate": 9.92256390302585e-06, + "loss": 0.6513, + "step": 1157 + }, + { + "epoch": 0.4519906323185012, + "grad_norm": 0.6819529969349393, + "learning_rate": 9.922165273685025e-06, + "loss": 0.6157, + "step": 1158 + }, + { + "epoch": 0.4523809523809524, + "grad_norm": 0.6248137375588845, + "learning_rate": 9.92176562898343e-06, + "loss": 0.611, + "step": 1159 + }, + { + "epoch": 0.4527712724434036, + "grad_norm": 0.6962524829538963, + "learning_rate": 9.92136496900351e-06, + "loss": 0.6395, + "step": 1160 + }, + { + "epoch": 0.4531615925058548, + "grad_norm": 0.6674878346490835, + "learning_rate": 9.920963293827912e-06, + "loss": 0.657, + "step": 1161 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 0.6361759159981762, + "learning_rate": 9.920560603539495e-06, + "loss": 0.6754, + "step": 1162 + }, + { + "epoch": 0.4539422326307572, + "grad_norm": 0.7623742230206734, + "learning_rate": 9.920156898221327e-06, + "loss": 0.646, + "step": 1163 + }, + { + "epoch": 0.45433255269320844, + "grad_norm": 0.5995999063306315, + "learning_rate": 9.919752177956686e-06, + "loss": 0.6652, + "step": 1164 + }, + { + "epoch": 0.45472287275565965, + "grad_norm": 0.6783094901559136, + "learning_rate": 9.91934644282906e-06, + "loss": 0.6471, + "step": 1165 + }, + { + "epoch": 0.45511319281811086, + "grad_norm": 0.6495632329072739, + "learning_rate": 9.918939692922142e-06, + "loss": 0.6514, + "step": 1166 + }, + { + "epoch": 0.45550351288056207, + "grad_norm": 0.6892277249319105, + "learning_rate": 9.918531928319841e-06, + "loss": 0.6154, + "step": 1167 + }, + { + "epoch": 0.4558938329430133, + "grad_norm": 0.6521328303174038, + "learning_rate": 9.918123149106273e-06, + "loss": 0.6599, + "step": 1168 + }, + { + "epoch": 0.4562841530054645, + "grad_norm": 0.6151949120871293, + "learning_rate": 9.917713355365758e-06, + "loss": 0.6847, + "step": 1169 + }, + { + "epoch": 0.4566744730679157, + "grad_norm": 0.7177245293360964, + "learning_rate": 9.917302547182835e-06, + "loss": 0.641, + "step": 1170 + }, + { + "epoch": 0.4570647931303669, + "grad_norm": 0.5945257773317708, + "learning_rate": 9.916890724642244e-06, + "loss": 0.6672, + "step": 1171 + }, + { + "epoch": 0.4574551131928181, + "grad_norm": 0.8296148219769276, + "learning_rate": 9.916477887828935e-06, + "loss": 0.6627, + "step": 1172 + }, + { + "epoch": 0.45784543325526933, + "grad_norm": 0.7103206018029505, + "learning_rate": 9.916064036828072e-06, + "loss": 0.634, + "step": 1173 + }, + { + "epoch": 0.45823575331772054, + "grad_norm": 0.6869991243687734, + "learning_rate": 9.915649171725026e-06, + "loss": 0.6478, + "step": 1174 + }, + { + "epoch": 0.45862607338017175, + "grad_norm": 0.7558855737779243, + "learning_rate": 9.915233292605375e-06, + "loss": 0.5872, + "step": 1175 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.5700396556747882, + "learning_rate": 9.91481639955491e-06, + "loss": 0.6421, + "step": 1176 + }, + { + "epoch": 0.45940671350507417, + "grad_norm": 0.7446105378695252, + "learning_rate": 9.914398492659628e-06, + "loss": 0.646, + "step": 1177 + }, + { + "epoch": 0.4597970335675254, + "grad_norm": 0.6190059562725773, + "learning_rate": 9.913979572005736e-06, + "loss": 0.6447, + "step": 1178 + }, + { + "epoch": 0.4601873536299766, + "grad_norm": 0.7466688360379381, + "learning_rate": 9.913559637679651e-06, + "loss": 0.6451, + "step": 1179 + }, + { + "epoch": 0.4605776736924278, + "grad_norm": 0.5662271994643387, + "learning_rate": 9.913138689767996e-06, + "loss": 0.6303, + "step": 1180 + }, + { + "epoch": 0.460967993754879, + "grad_norm": 0.6485204598345378, + "learning_rate": 9.91271672835761e-06, + "loss": 0.6573, + "step": 1181 + }, + { + "epoch": 0.4613583138173302, + "grad_norm": 0.6440564062360075, + "learning_rate": 9.912293753535534e-06, + "loss": 0.6588, + "step": 1182 + }, + { + "epoch": 0.46174863387978143, + "grad_norm": 0.6548696447443639, + "learning_rate": 9.91186976538902e-06, + "loss": 0.6497, + "step": 1183 + }, + { + "epoch": 0.46213895394223264, + "grad_norm": 0.6437950551966798, + "learning_rate": 9.91144476400553e-06, + "loss": 0.6595, + "step": 1184 + }, + { + "epoch": 0.46252927400468385, + "grad_norm": 0.6425255579917644, + "learning_rate": 9.911018749472736e-06, + "loss": 0.6429, + "step": 1185 + }, + { + "epoch": 0.46291959406713507, + "grad_norm": 0.6554798115336973, + "learning_rate": 9.910591721878517e-06, + "loss": 0.6402, + "step": 1186 + }, + { + "epoch": 0.4633099141295863, + "grad_norm": 0.655363399613879, + "learning_rate": 9.91016368131096e-06, + "loss": 0.6679, + "step": 1187 + }, + { + "epoch": 0.4637002341920375, + "grad_norm": 0.6948575082524654, + "learning_rate": 9.909734627858367e-06, + "loss": 0.6355, + "step": 1188 + }, + { + "epoch": 0.4640905542544887, + "grad_norm": 0.6030978786953219, + "learning_rate": 9.90930456160924e-06, + "loss": 0.6884, + "step": 1189 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.6750519933422459, + "learning_rate": 9.908873482652298e-06, + "loss": 0.6391, + "step": 1190 + }, + { + "epoch": 0.4648711943793911, + "grad_norm": 0.647205675316269, + "learning_rate": 9.908441391076462e-06, + "loss": 0.6683, + "step": 1191 + }, + { + "epoch": 0.4652615144418423, + "grad_norm": 0.6437035050409425, + "learning_rate": 9.908008286970869e-06, + "loss": 0.6393, + "step": 1192 + }, + { + "epoch": 0.46565183450429354, + "grad_norm": 0.7184709667012962, + "learning_rate": 9.907574170424857e-06, + "loss": 0.708, + "step": 1193 + }, + { + "epoch": 0.46604215456674475, + "grad_norm": 0.6924731510310134, + "learning_rate": 9.907139041527981e-06, + "loss": 0.6381, + "step": 1194 + }, + { + "epoch": 0.46643247462919596, + "grad_norm": 0.6947448341560989, + "learning_rate": 9.906702900369999e-06, + "loss": 0.6379, + "step": 1195 + }, + { + "epoch": 0.46682279469164717, + "grad_norm": 0.7450562085323833, + "learning_rate": 9.90626574704088e-06, + "loss": 0.6757, + "step": 1196 + }, + { + "epoch": 0.4672131147540984, + "grad_norm": 0.6662408992123613, + "learning_rate": 9.9058275816308e-06, + "loss": 0.6985, + "step": 1197 + }, + { + "epoch": 0.4676034348165496, + "grad_norm": 0.7993314364655048, + "learning_rate": 9.905388404230147e-06, + "loss": 0.6416, + "step": 1198 + }, + { + "epoch": 0.4679937548790008, + "grad_norm": 0.6783614767335301, + "learning_rate": 9.904948214929517e-06, + "loss": 0.6665, + "step": 1199 + }, + { + "epoch": 0.468384074941452, + "grad_norm": 0.6899669413725308, + "learning_rate": 9.90450701381971e-06, + "loss": 0.6475, + "step": 1200 + }, + { + "epoch": 0.4687743950039032, + "grad_norm": 0.9122633046780254, + "learning_rate": 9.904064800991742e-06, + "loss": 0.6665, + "step": 1201 + }, + { + "epoch": 0.46916471506635443, + "grad_norm": 0.6342489939427243, + "learning_rate": 9.903621576536833e-06, + "loss": 0.6599, + "step": 1202 + }, + { + "epoch": 0.46955503512880564, + "grad_norm": 0.7330305177437443, + "learning_rate": 9.903177340546412e-06, + "loss": 0.655, + "step": 1203 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 0.963012985626061, + "learning_rate": 9.902732093112118e-06, + "loss": 0.638, + "step": 1204 + }, + { + "epoch": 0.47033567525370806, + "grad_norm": 0.7382048337622679, + "learning_rate": 9.902285834325799e-06, + "loss": 0.6683, + "step": 1205 + }, + { + "epoch": 0.4707259953161593, + "grad_norm": 0.8307633154305122, + "learning_rate": 9.901838564279509e-06, + "loss": 0.6592, + "step": 1206 + }, + { + "epoch": 0.4711163153786105, + "grad_norm": 1.0064927910492587, + "learning_rate": 9.901390283065515e-06, + "loss": 0.6746, + "step": 1207 + }, + { + "epoch": 0.4715066354410617, + "grad_norm": 0.7600371532836676, + "learning_rate": 9.900940990776286e-06, + "loss": 0.6232, + "step": 1208 + }, + { + "epoch": 0.4718969555035129, + "grad_norm": 0.9951508356529716, + "learning_rate": 9.900490687504507e-06, + "loss": 0.64, + "step": 1209 + }, + { + "epoch": 0.4722872755659641, + "grad_norm": 0.7953097641099303, + "learning_rate": 9.900039373343067e-06, + "loss": 0.6805, + "step": 1210 + }, + { + "epoch": 0.4726775956284153, + "grad_norm": 0.8749724591729937, + "learning_rate": 9.899587048385065e-06, + "loss": 0.676, + "step": 1211 + }, + { + "epoch": 0.47306791569086654, + "grad_norm": 0.79237509229852, + "learning_rate": 9.899133712723807e-06, + "loss": 0.6478, + "step": 1212 + }, + { + "epoch": 0.47345823575331775, + "grad_norm": 0.8910727974866116, + "learning_rate": 9.898679366452809e-06, + "loss": 0.679, + "step": 1213 + }, + { + "epoch": 0.47384855581576896, + "grad_norm": 0.9261202306764873, + "learning_rate": 9.898224009665796e-06, + "loss": 0.6231, + "step": 1214 + }, + { + "epoch": 0.47423887587822017, + "grad_norm": 0.8584495474652457, + "learning_rate": 9.8977676424567e-06, + "loss": 0.6612, + "step": 1215 + }, + { + "epoch": 0.4746291959406714, + "grad_norm": 1.027035243039402, + "learning_rate": 9.897310264919661e-06, + "loss": 0.6634, + "step": 1216 + }, + { + "epoch": 0.47501951600312253, + "grad_norm": 0.7423283277129837, + "learning_rate": 9.89685187714903e-06, + "loss": 0.6296, + "step": 1217 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 1.0183928026852935, + "learning_rate": 9.896392479239363e-06, + "loss": 0.6811, + "step": 1218 + }, + { + "epoch": 0.47580015612802495, + "grad_norm": 0.8783209606224188, + "learning_rate": 9.895932071285428e-06, + "loss": 0.625, + "step": 1219 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.765549725491269, + "learning_rate": 9.895470653382198e-06, + "loss": 0.6553, + "step": 1220 + }, + { + "epoch": 0.4765807962529274, + "grad_norm": 0.8604108206780998, + "learning_rate": 9.895008225624855e-06, + "loss": 0.6419, + "step": 1221 + }, + { + "epoch": 0.4769711163153786, + "grad_norm": 0.7918214971153391, + "learning_rate": 9.894544788108792e-06, + "loss": 0.6526, + "step": 1222 + }, + { + "epoch": 0.4773614363778298, + "grad_norm": 0.7445603355271593, + "learning_rate": 9.89408034092961e-06, + "loss": 0.6403, + "step": 1223 + }, + { + "epoch": 0.477751756440281, + "grad_norm": 0.8251309933726761, + "learning_rate": 9.893614884183114e-06, + "loss": 0.628, + "step": 1224 + }, + { + "epoch": 0.4781420765027322, + "grad_norm": 0.7581686176672242, + "learning_rate": 9.893148417965321e-06, + "loss": 0.6513, + "step": 1225 + }, + { + "epoch": 0.4785323965651834, + "grad_norm": 0.7214375555544682, + "learning_rate": 9.892680942372455e-06, + "loss": 0.6647, + "step": 1226 + }, + { + "epoch": 0.47892271662763464, + "grad_norm": 0.7326140745229647, + "learning_rate": 9.892212457500948e-06, + "loss": 0.6267, + "step": 1227 + }, + { + "epoch": 0.47931303669008585, + "grad_norm": 0.7172327476894826, + "learning_rate": 9.89174296344744e-06, + "loss": 0.6337, + "step": 1228 + }, + { + "epoch": 0.47970335675253706, + "grad_norm": 0.6780661696117066, + "learning_rate": 9.891272460308781e-06, + "loss": 0.6103, + "step": 1229 + }, + { + "epoch": 0.48009367681498827, + "grad_norm": 0.8262610988027267, + "learning_rate": 9.890800948182029e-06, + "loss": 0.6722, + "step": 1230 + }, + { + "epoch": 0.4804839968774395, + "grad_norm": 0.7033110211231285, + "learning_rate": 9.890328427164445e-06, + "loss": 0.6084, + "step": 1231 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.8904687726674162, + "learning_rate": 9.889854897353508e-06, + "loss": 0.6653, + "step": 1232 + }, + { + "epoch": 0.4812646370023419, + "grad_norm": 0.6736202389453889, + "learning_rate": 9.889380358846896e-06, + "loss": 0.6285, + "step": 1233 + }, + { + "epoch": 0.4816549570647931, + "grad_norm": 0.7198127910027691, + "learning_rate": 9.888904811742499e-06, + "loss": 0.649, + "step": 1234 + }, + { + "epoch": 0.4820452771272443, + "grad_norm": 0.6822274297838667, + "learning_rate": 9.888428256138415e-06, + "loss": 0.6771, + "step": 1235 + }, + { + "epoch": 0.48243559718969553, + "grad_norm": 0.6527939202010297, + "learning_rate": 9.887950692132946e-06, + "loss": 0.7082, + "step": 1236 + }, + { + "epoch": 0.48282591725214674, + "grad_norm": 0.7657781112297227, + "learning_rate": 9.88747211982461e-06, + "loss": 0.6613, + "step": 1237 + }, + { + "epoch": 0.48321623731459795, + "grad_norm": 0.666563049454593, + "learning_rate": 9.886992539312125e-06, + "loss": 0.6772, + "step": 1238 + }, + { + "epoch": 0.48360655737704916, + "grad_norm": 0.7281177639312056, + "learning_rate": 9.886511950694424e-06, + "loss": 0.6454, + "step": 1239 + }, + { + "epoch": 0.4839968774395004, + "grad_norm": 0.6649660309605164, + "learning_rate": 9.886030354070643e-06, + "loss": 0.6095, + "step": 1240 + }, + { + "epoch": 0.4843871975019516, + "grad_norm": 0.7041430254978536, + "learning_rate": 9.885547749540125e-06, + "loss": 0.6626, + "step": 1241 + }, + { + "epoch": 0.4847775175644028, + "grad_norm": 0.6446836599904726, + "learning_rate": 9.885064137202427e-06, + "loss": 0.6534, + "step": 1242 + }, + { + "epoch": 0.485167837626854, + "grad_norm": 0.7404023028602947, + "learning_rate": 9.884579517157309e-06, + "loss": 0.6794, + "step": 1243 + }, + { + "epoch": 0.4855581576893052, + "grad_norm": 0.623464933254875, + "learning_rate": 9.884093889504738e-06, + "loss": 0.657, + "step": 1244 + }, + { + "epoch": 0.4859484777517564, + "grad_norm": 0.702714998308793, + "learning_rate": 9.883607254344894e-06, + "loss": 0.6433, + "step": 1245 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.6643978279902333, + "learning_rate": 9.883119611778158e-06, + "loss": 0.6159, + "step": 1246 + }, + { + "epoch": 0.48672911787665885, + "grad_norm": 0.6511522327372755, + "learning_rate": 9.882630961905126e-06, + "loss": 0.6707, + "step": 1247 + }, + { + "epoch": 0.48711943793911006, + "grad_norm": 0.7970734839607446, + "learning_rate": 9.882141304826597e-06, + "loss": 0.6734, + "step": 1248 + }, + { + "epoch": 0.48750975800156127, + "grad_norm": 0.6706314407321865, + "learning_rate": 9.881650640643578e-06, + "loss": 0.6231, + "step": 1249 + }, + { + "epoch": 0.4879000780640125, + "grad_norm": 0.6405280244328236, + "learning_rate": 9.881158969457287e-06, + "loss": 0.6435, + "step": 1250 + }, + { + "epoch": 0.4882903981264637, + "grad_norm": 0.6574427056657463, + "learning_rate": 9.880666291369148e-06, + "loss": 0.6746, + "step": 1251 + }, + { + "epoch": 0.4886807181889149, + "grad_norm": 0.5921399073286959, + "learning_rate": 9.880172606480788e-06, + "loss": 0.6563, + "step": 1252 + }, + { + "epoch": 0.4890710382513661, + "grad_norm": 0.7168134815737204, + "learning_rate": 9.87967791489405e-06, + "loss": 0.6661, + "step": 1253 + }, + { + "epoch": 0.4894613583138173, + "grad_norm": 0.6813584701983523, + "learning_rate": 9.87918221671098e-06, + "loss": 0.6428, + "step": 1254 + }, + { + "epoch": 0.48985167837626853, + "grad_norm": 0.6860305294369304, + "learning_rate": 9.878685512033834e-06, + "loss": 0.6549, + "step": 1255 + }, + { + "epoch": 0.49024199843871974, + "grad_norm": 0.6690879146131726, + "learning_rate": 9.878187800965069e-06, + "loss": 0.6372, + "step": 1256 + }, + { + "epoch": 0.49063231850117095, + "grad_norm": 0.6896397808326036, + "learning_rate": 9.877689083607356e-06, + "loss": 0.6584, + "step": 1257 + }, + { + "epoch": 0.49102263856362216, + "grad_norm": 0.647858388605142, + "learning_rate": 9.877189360063574e-06, + "loss": 0.6413, + "step": 1258 + }, + { + "epoch": 0.49141295862607337, + "grad_norm": 0.6775686233807674, + "learning_rate": 9.87668863043681e-06, + "loss": 0.6522, + "step": 1259 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.6989277664761286, + "learning_rate": 9.876186894830351e-06, + "loss": 0.661, + "step": 1260 + }, + { + "epoch": 0.4921935987509758, + "grad_norm": 0.7403158825835204, + "learning_rate": 9.875684153347697e-06, + "loss": 0.6908, + "step": 1261 + }, + { + "epoch": 0.492583918813427, + "grad_norm": 0.6858448873906561, + "learning_rate": 9.875180406092559e-06, + "loss": 0.6652, + "step": 1262 + }, + { + "epoch": 0.4929742388758782, + "grad_norm": 0.6846776574322141, + "learning_rate": 9.874675653168851e-06, + "loss": 0.6527, + "step": 1263 + }, + { + "epoch": 0.4933645589383294, + "grad_norm": 0.7668615878997613, + "learning_rate": 9.874169894680691e-06, + "loss": 0.661, + "step": 1264 + }, + { + "epoch": 0.49375487900078063, + "grad_norm": 0.6173472432166038, + "learning_rate": 9.873663130732412e-06, + "loss": 0.6596, + "step": 1265 + }, + { + "epoch": 0.49414519906323184, + "grad_norm": 0.7200162080753519, + "learning_rate": 9.873155361428551e-06, + "loss": 0.6629, + "step": 1266 + }, + { + "epoch": 0.49453551912568305, + "grad_norm": 0.6177195099950867, + "learning_rate": 9.872646586873852e-06, + "loss": 0.6328, + "step": 1267 + }, + { + "epoch": 0.49492583918813426, + "grad_norm": 0.7815317119629518, + "learning_rate": 9.872136807173266e-06, + "loss": 0.626, + "step": 1268 + }, + { + "epoch": 0.4953161592505855, + "grad_norm": 0.7055583445033033, + "learning_rate": 9.871626022431953e-06, + "loss": 0.6722, + "step": 1269 + }, + { + "epoch": 0.4957064793130367, + "grad_norm": 0.7630530896075964, + "learning_rate": 9.871114232755278e-06, + "loss": 0.6306, + "step": 1270 + }, + { + "epoch": 0.4960967993754879, + "grad_norm": 0.68159231066142, + "learning_rate": 9.870601438248815e-06, + "loss": 0.6546, + "step": 1271 + }, + { + "epoch": 0.4964871194379391, + "grad_norm": 0.7251245405969032, + "learning_rate": 9.870087639018347e-06, + "loss": 0.6444, + "step": 1272 + }, + { + "epoch": 0.4968774395003903, + "grad_norm": 0.7537818324761605, + "learning_rate": 9.86957283516986e-06, + "loss": 0.6402, + "step": 1273 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.7956473134807367, + "learning_rate": 9.86905702680955e-06, + "loss": 0.6286, + "step": 1274 + }, + { + "epoch": 0.49765807962529274, + "grad_norm": 0.6753052737630041, + "learning_rate": 9.868540214043821e-06, + "loss": 0.6601, + "step": 1275 + }, + { + "epoch": 0.49804839968774395, + "grad_norm": 0.729246439344332, + "learning_rate": 9.868022396979282e-06, + "loss": 0.6627, + "step": 1276 + }, + { + "epoch": 0.49843871975019516, + "grad_norm": 0.6573328663247276, + "learning_rate": 9.86750357572275e-06, + "loss": 0.6218, + "step": 1277 + }, + { + "epoch": 0.49882903981264637, + "grad_norm": 0.8141283123833158, + "learning_rate": 9.866983750381247e-06, + "loss": 0.6462, + "step": 1278 + }, + { + "epoch": 0.4992193598750976, + "grad_norm": 0.7947881903235686, + "learning_rate": 9.866462921062008e-06, + "loss": 0.6366, + "step": 1279 + }, + { + "epoch": 0.4996096799375488, + "grad_norm": 0.736467869512019, + "learning_rate": 9.865941087872469e-06, + "loss": 0.6794, + "step": 1280 + }, + { + "epoch": 0.5, + "grad_norm": 0.7000731438779173, + "learning_rate": 9.865418250920276e-06, + "loss": 0.6918, + "step": 1281 + }, + { + "epoch": 0.5003903200624512, + "grad_norm": 0.7152559026313989, + "learning_rate": 9.864894410313281e-06, + "loss": 0.664, + "step": 1282 + }, + { + "epoch": 0.5007806401249024, + "grad_norm": 0.7261744996344561, + "learning_rate": 9.864369566159546e-06, + "loss": 0.6263, + "step": 1283 + }, + { + "epoch": 0.5011709601873536, + "grad_norm": 0.6074589680398874, + "learning_rate": 9.863843718567336e-06, + "loss": 0.6411, + "step": 1284 + }, + { + "epoch": 0.5015612802498048, + "grad_norm": 0.750228146035507, + "learning_rate": 9.863316867645124e-06, + "loss": 0.653, + "step": 1285 + }, + { + "epoch": 0.501951600312256, + "grad_norm": 0.7183168905076819, + "learning_rate": 9.862789013501593e-06, + "loss": 0.6592, + "step": 1286 + }, + { + "epoch": 0.5023419203747073, + "grad_norm": 0.808243540672532, + "learning_rate": 9.862260156245626e-06, + "loss": 0.6707, + "step": 1287 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 0.6434781813131493, + "learning_rate": 9.861730295986322e-06, + "loss": 0.6067, + "step": 1288 + }, + { + "epoch": 0.5031225604996097, + "grad_norm": 0.8052554856374805, + "learning_rate": 9.86119943283298e-06, + "loss": 0.6803, + "step": 1289 + }, + { + "epoch": 0.5035128805620609, + "grad_norm": 0.7381770868860791, + "learning_rate": 9.860667566895108e-06, + "loss": 0.6263, + "step": 1290 + }, + { + "epoch": 0.5039032006245121, + "grad_norm": 0.6663672823238199, + "learning_rate": 9.860134698282424e-06, + "loss": 0.6501, + "step": 1291 + }, + { + "epoch": 0.5042935206869633, + "grad_norm": 0.7698584134977466, + "learning_rate": 9.859600827104845e-06, + "loss": 0.6661, + "step": 1292 + }, + { + "epoch": 0.5046838407494145, + "grad_norm": 0.7091871586077781, + "learning_rate": 9.859065953472504e-06, + "loss": 0.6569, + "step": 1293 + }, + { + "epoch": 0.5050741608118657, + "grad_norm": 0.8339884028598527, + "learning_rate": 9.858530077495736e-06, + "loss": 0.6398, + "step": 1294 + }, + { + "epoch": 0.505464480874317, + "grad_norm": 0.633667973750821, + "learning_rate": 9.85799319928508e-06, + "loss": 0.6179, + "step": 1295 + }, + { + "epoch": 0.5058548009367682, + "grad_norm": 0.7662020764295334, + "learning_rate": 9.857455318951288e-06, + "loss": 0.6565, + "step": 1296 + }, + { + "epoch": 0.5062451209992194, + "grad_norm": 0.7343358941390401, + "learning_rate": 9.856916436605317e-06, + "loss": 0.6425, + "step": 1297 + }, + { + "epoch": 0.5066354410616706, + "grad_norm": 0.568500897262529, + "learning_rate": 9.856376552358327e-06, + "loss": 0.6334, + "step": 1298 + }, + { + "epoch": 0.5070257611241218, + "grad_norm": 0.6934013507441773, + "learning_rate": 9.855835666321687e-06, + "loss": 0.6364, + "step": 1299 + }, + { + "epoch": 0.507416081186573, + "grad_norm": 0.6280907667842899, + "learning_rate": 9.855293778606974e-06, + "loss": 0.6757, + "step": 1300 + }, + { + "epoch": 0.5078064012490242, + "grad_norm": 0.7815985222345991, + "learning_rate": 9.854750889325967e-06, + "loss": 0.6144, + "step": 1301 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.6667144817024037, + "learning_rate": 9.854206998590663e-06, + "loss": 0.6735, + "step": 1302 + }, + { + "epoch": 0.5085870413739266, + "grad_norm": 0.7478669135775072, + "learning_rate": 9.85366210651325e-06, + "loss": 0.6172, + "step": 1303 + }, + { + "epoch": 0.5089773614363778, + "grad_norm": 0.7570413265733981, + "learning_rate": 9.853116213206132e-06, + "loss": 0.6782, + "step": 1304 + }, + { + "epoch": 0.509367681498829, + "grad_norm": 0.8704166115073233, + "learning_rate": 9.85256931878192e-06, + "loss": 0.6294, + "step": 1305 + }, + { + "epoch": 0.5097580015612803, + "grad_norm": 0.6776200844380668, + "learning_rate": 9.852021423353426e-06, + "loss": 0.6433, + "step": 1306 + }, + { + "epoch": 0.5101483216237315, + "grad_norm": 0.7436410858588935, + "learning_rate": 9.851472527033673e-06, + "loss": 0.6319, + "step": 1307 + }, + { + "epoch": 0.5105386416861827, + "grad_norm": 0.7124731620864458, + "learning_rate": 9.850922629935893e-06, + "loss": 0.6367, + "step": 1308 + }, + { + "epoch": 0.5109289617486339, + "grad_norm": 0.7320580551112839, + "learning_rate": 9.850371732173513e-06, + "loss": 0.6591, + "step": 1309 + }, + { + "epoch": 0.5113192818110851, + "grad_norm": 0.7062925893055012, + "learning_rate": 9.849819833860181e-06, + "loss": 0.6318, + "step": 1310 + }, + { + "epoch": 0.5117096018735363, + "grad_norm": 0.7644001561866354, + "learning_rate": 9.849266935109741e-06, + "loss": 0.6836, + "step": 1311 + }, + { + "epoch": 0.5120999219359875, + "grad_norm": 0.726669213032489, + "learning_rate": 9.848713036036248e-06, + "loss": 0.6549, + "step": 1312 + }, + { + "epoch": 0.5124902419984387, + "grad_norm": 0.7506315720451878, + "learning_rate": 9.84815813675396e-06, + "loss": 0.6272, + "step": 1313 + }, + { + "epoch": 0.5128805620608899, + "grad_norm": 0.7221307874817255, + "learning_rate": 9.847602237377345e-06, + "loss": 0.6569, + "step": 1314 + }, + { + "epoch": 0.5132708821233412, + "grad_norm": 0.9018502384810726, + "learning_rate": 9.847045338021077e-06, + "loss": 0.6749, + "step": 1315 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.6933006625452695, + "learning_rate": 9.846487438800034e-06, + "loss": 0.666, + "step": 1316 + }, + { + "epoch": 0.5140515222482436, + "grad_norm": 0.911414721724961, + "learning_rate": 9.845928539829301e-06, + "loss": 0.6276, + "step": 1317 + }, + { + "epoch": 0.5144418423106948, + "grad_norm": 0.7593809310593691, + "learning_rate": 9.84536864122417e-06, + "loss": 0.6301, + "step": 1318 + }, + { + "epoch": 0.514832162373146, + "grad_norm": 0.8727515878389559, + "learning_rate": 9.844807743100138e-06, + "loss": 0.6661, + "step": 1319 + }, + { + "epoch": 0.5152224824355972, + "grad_norm": 0.7041219723486103, + "learning_rate": 9.844245845572911e-06, + "loss": 0.6578, + "step": 1320 + }, + { + "epoch": 0.5156128024980484, + "grad_norm": 0.7848321639074358, + "learning_rate": 9.843682948758395e-06, + "loss": 0.652, + "step": 1321 + }, + { + "epoch": 0.5160031225604996, + "grad_norm": 0.7155665498835818, + "learning_rate": 9.843119052772712e-06, + "loss": 0.6628, + "step": 1322 + }, + { + "epoch": 0.5163934426229508, + "grad_norm": 0.636253291452418, + "learning_rate": 9.842554157732179e-06, + "loss": 0.636, + "step": 1323 + }, + { + "epoch": 0.516783762685402, + "grad_norm": 0.6922158939539012, + "learning_rate": 9.841988263753326e-06, + "loss": 0.6172, + "step": 1324 + }, + { + "epoch": 0.5171740827478533, + "grad_norm": 0.6608990547368324, + "learning_rate": 9.84142137095289e-06, + "loss": 0.6639, + "step": 1325 + }, + { + "epoch": 0.5175644028103045, + "grad_norm": 0.6815403503281324, + "learning_rate": 9.84085347944781e-06, + "loss": 0.637, + "step": 1326 + }, + { + "epoch": 0.5179547228727557, + "grad_norm": 0.8030828907333094, + "learning_rate": 9.84028458935523e-06, + "loss": 0.6302, + "step": 1327 + }, + { + "epoch": 0.5183450429352069, + "grad_norm": 0.7225636449720649, + "learning_rate": 9.839714700792507e-06, + "loss": 0.6779, + "step": 1328 + }, + { + "epoch": 0.5187353629976581, + "grad_norm": 0.6750122390360663, + "learning_rate": 9.839143813877197e-06, + "loss": 0.6561, + "step": 1329 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.6441948512725876, + "learning_rate": 9.838571928727063e-06, + "loss": 0.6238, + "step": 1330 + }, + { + "epoch": 0.5195160031225605, + "grad_norm": 0.620022498696003, + "learning_rate": 9.83799904546008e-06, + "loss": 0.6301, + "step": 1331 + }, + { + "epoch": 0.5199063231850117, + "grad_norm": 0.6443357749783615, + "learning_rate": 9.83742516419442e-06, + "loss": 0.6255, + "step": 1332 + }, + { + "epoch": 0.5202966432474629, + "grad_norm": 0.6680808821954568, + "learning_rate": 9.836850285048468e-06, + "loss": 0.6631, + "step": 1333 + }, + { + "epoch": 0.5206869633099142, + "grad_norm": 0.6755986138862573, + "learning_rate": 9.83627440814081e-06, + "loss": 0.6765, + "step": 1334 + }, + { + "epoch": 0.5210772833723654, + "grad_norm": 0.8110458139538853, + "learning_rate": 9.835697533590238e-06, + "loss": 0.6596, + "step": 1335 + }, + { + "epoch": 0.5214676034348166, + "grad_norm": 0.6561854736415428, + "learning_rate": 9.835119661515758e-06, + "loss": 0.6342, + "step": 1336 + }, + { + "epoch": 0.5218579234972678, + "grad_norm": 0.7746269335540916, + "learning_rate": 9.834540792036568e-06, + "loss": 0.6665, + "step": 1337 + }, + { + "epoch": 0.522248243559719, + "grad_norm": 0.7358844879655608, + "learning_rate": 9.833960925272085e-06, + "loss": 0.6665, + "step": 1338 + }, + { + "epoch": 0.5226385636221702, + "grad_norm": 0.7817805429812794, + "learning_rate": 9.833380061341921e-06, + "loss": 0.6764, + "step": 1339 + }, + { + "epoch": 0.5230288836846214, + "grad_norm": 0.9496402027629065, + "learning_rate": 9.832798200365904e-06, + "loss": 0.6728, + "step": 1340 + }, + { + "epoch": 0.5234192037470726, + "grad_norm": 0.7120629237479286, + "learning_rate": 9.832215342464058e-06, + "loss": 0.6823, + "step": 1341 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 1.0070095997790538, + "learning_rate": 9.831631487756619e-06, + "loss": 0.6367, + "step": 1342 + }, + { + "epoch": 0.524199843871975, + "grad_norm": 0.6327664026421633, + "learning_rate": 9.831046636364024e-06, + "loss": 0.6591, + "step": 1343 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.8768657490836521, + "learning_rate": 9.830460788406921e-06, + "loss": 0.6285, + "step": 1344 + }, + { + "epoch": 0.5249804839968775, + "grad_norm": 0.7800092717662417, + "learning_rate": 9.82987394400616e-06, + "loss": 0.6524, + "step": 1345 + }, + { + "epoch": 0.5253708040593287, + "grad_norm": 0.8796962799882911, + "learning_rate": 9.829286103282796e-06, + "loss": 0.6546, + "step": 1346 + }, + { + "epoch": 0.5257611241217799, + "grad_norm": 0.7701334945442844, + "learning_rate": 9.828697266358092e-06, + "loss": 0.6546, + "step": 1347 + }, + { + "epoch": 0.5261514441842311, + "grad_norm": 0.7047274225643076, + "learning_rate": 9.828107433353514e-06, + "loss": 0.633, + "step": 1348 + }, + { + "epoch": 0.5265417642466823, + "grad_norm": 0.8666461816965285, + "learning_rate": 9.827516604390735e-06, + "loss": 0.6794, + "step": 1349 + }, + { + "epoch": 0.5269320843091335, + "grad_norm": 0.6040958994012282, + "learning_rate": 9.826924779591633e-06, + "loss": 0.6339, + "step": 1350 + }, + { + "epoch": 0.5273224043715847, + "grad_norm": 0.9785690240907673, + "learning_rate": 9.826331959078295e-06, + "loss": 0.6857, + "step": 1351 + }, + { + "epoch": 0.5277127244340359, + "grad_norm": 0.6573759659117311, + "learning_rate": 9.825738142973004e-06, + "loss": 0.6134, + "step": 1352 + }, + { + "epoch": 0.5281030444964872, + "grad_norm": 0.7206834739151265, + "learning_rate": 9.825143331398262e-06, + "loss": 0.6384, + "step": 1353 + }, + { + "epoch": 0.5284933645589384, + "grad_norm": 0.9324522230636795, + "learning_rate": 9.824547524476759e-06, + "loss": 0.6745, + "step": 1354 + }, + { + "epoch": 0.5288836846213896, + "grad_norm": 0.6903385337984599, + "learning_rate": 9.82395072233141e-06, + "loss": 0.6411, + "step": 1355 + }, + { + "epoch": 0.5292740046838408, + "grad_norm": 0.9030564904409809, + "learning_rate": 9.823352925085319e-06, + "loss": 0.6397, + "step": 1356 + }, + { + "epoch": 0.529664324746292, + "grad_norm": 0.7316766869470459, + "learning_rate": 9.822754132861803e-06, + "loss": 0.6863, + "step": 1357 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.7224439494813566, + "learning_rate": 9.822154345784383e-06, + "loss": 0.6395, + "step": 1358 + }, + { + "epoch": 0.5304449648711944, + "grad_norm": 0.6546181647970263, + "learning_rate": 9.821553563976785e-06, + "loss": 0.6283, + "step": 1359 + }, + { + "epoch": 0.5308352849336456, + "grad_norm": 0.5896011014163807, + "learning_rate": 9.820951787562943e-06, + "loss": 0.6473, + "step": 1360 + }, + { + "epoch": 0.5312256049960968, + "grad_norm": 0.6979041126004271, + "learning_rate": 9.820349016666988e-06, + "loss": 0.6451, + "step": 1361 + }, + { + "epoch": 0.531615925058548, + "grad_norm": 0.6991581829099272, + "learning_rate": 9.819745251413267e-06, + "loss": 0.6571, + "step": 1362 + }, + { + "epoch": 0.5320062451209993, + "grad_norm": 0.6279474998093125, + "learning_rate": 9.819140491926322e-06, + "loss": 0.6222, + "step": 1363 + }, + { + "epoch": 0.5323965651834505, + "grad_norm": 0.7609617600965668, + "learning_rate": 9.81853473833091e-06, + "loss": 0.666, + "step": 1364 + }, + { + "epoch": 0.5327868852459017, + "grad_norm": 0.6742066771345225, + "learning_rate": 9.817927990751984e-06, + "loss": 0.671, + "step": 1365 + }, + { + "epoch": 0.5331772053083529, + "grad_norm": 0.6659988354146797, + "learning_rate": 9.817320249314705e-06, + "loss": 0.6394, + "step": 1366 + }, + { + "epoch": 0.5335675253708041, + "grad_norm": 0.648143385094163, + "learning_rate": 9.816711514144444e-06, + "loss": 0.6675, + "step": 1367 + }, + { + "epoch": 0.5339578454332553, + "grad_norm": 0.5854669680202992, + "learning_rate": 9.816101785366772e-06, + "loss": 0.6425, + "step": 1368 + }, + { + "epoch": 0.5343481654957065, + "grad_norm": 0.640831797038339, + "learning_rate": 9.815491063107463e-06, + "loss": 0.598, + "step": 1369 + }, + { + "epoch": 0.5347384855581577, + "grad_norm": 0.669338834480623, + "learning_rate": 9.814879347492501e-06, + "loss": 0.6475, + "step": 1370 + }, + { + "epoch": 0.5351288056206089, + "grad_norm": 0.7333406758718006, + "learning_rate": 9.814266638648074e-06, + "loss": 0.622, + "step": 1371 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.7414120350435387, + "learning_rate": 9.81365293670057e-06, + "loss": 0.6433, + "step": 1372 + }, + { + "epoch": 0.5359094457455114, + "grad_norm": 0.7815659159921364, + "learning_rate": 9.813038241776586e-06, + "loss": 0.6561, + "step": 1373 + }, + { + "epoch": 0.5362997658079626, + "grad_norm": 0.7496765551376688, + "learning_rate": 9.812422554002929e-06, + "loss": 0.6371, + "step": 1374 + }, + { + "epoch": 0.5366900858704138, + "grad_norm": 0.6103904201551715, + "learning_rate": 9.811805873506598e-06, + "loss": 0.642, + "step": 1375 + }, + { + "epoch": 0.537080405932865, + "grad_norm": 0.7240905130574726, + "learning_rate": 9.811188200414808e-06, + "loss": 0.6339, + "step": 1376 + }, + { + "epoch": 0.5374707259953162, + "grad_norm": 0.5695951954017683, + "learning_rate": 9.810569534854973e-06, + "loss": 0.6639, + "step": 1377 + }, + { + "epoch": 0.5378610460577674, + "grad_norm": 0.6011927110452449, + "learning_rate": 9.809949876954715e-06, + "loss": 0.6227, + "step": 1378 + }, + { + "epoch": 0.5382513661202186, + "grad_norm": 0.8326144393241124, + "learning_rate": 9.809329226841858e-06, + "loss": 0.6406, + "step": 1379 + }, + { + "epoch": 0.5386416861826698, + "grad_norm": 0.8100627886598286, + "learning_rate": 9.80870758464443e-06, + "loss": 0.6912, + "step": 1380 + }, + { + "epoch": 0.539032006245121, + "grad_norm": 0.6803465599810076, + "learning_rate": 9.808084950490668e-06, + "loss": 0.6233, + "step": 1381 + }, + { + "epoch": 0.5394223263075723, + "grad_norm": 0.7245940887933998, + "learning_rate": 9.807461324509012e-06, + "loss": 0.6644, + "step": 1382 + }, + { + "epoch": 0.5398126463700235, + "grad_norm": 0.9029160307602949, + "learning_rate": 9.806836706828102e-06, + "loss": 0.6715, + "step": 1383 + }, + { + "epoch": 0.5402029664324747, + "grad_norm": 0.7022939690386806, + "learning_rate": 9.80621109757679e-06, + "loss": 0.6456, + "step": 1384 + }, + { + "epoch": 0.5405932864949259, + "grad_norm": 0.8014187368616991, + "learning_rate": 9.805584496884126e-06, + "loss": 0.6411, + "step": 1385 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.7998724587341527, + "learning_rate": 9.80495690487937e-06, + "loss": 0.6599, + "step": 1386 + }, + { + "epoch": 0.5413739266198283, + "grad_norm": 0.7651858827651061, + "learning_rate": 9.80432832169198e-06, + "loss": 0.6426, + "step": 1387 + }, + { + "epoch": 0.5417642466822795, + "grad_norm": 0.7510530199969381, + "learning_rate": 9.803698747451626e-06, + "loss": 0.6354, + "step": 1388 + }, + { + "epoch": 0.5421545667447307, + "grad_norm": 0.8328518907739371, + "learning_rate": 9.803068182288177e-06, + "loss": 0.6087, + "step": 1389 + }, + { + "epoch": 0.5425448868071819, + "grad_norm": 0.7860946261016047, + "learning_rate": 9.802436626331707e-06, + "loss": 0.5982, + "step": 1390 + }, + { + "epoch": 0.5429352068696331, + "grad_norm": 0.7013424726556686, + "learning_rate": 9.801804079712498e-06, + "loss": 0.6242, + "step": 1391 + }, + { + "epoch": 0.5433255269320844, + "grad_norm": 0.9704795752985428, + "learning_rate": 9.801170542561032e-06, + "loss": 0.6389, + "step": 1392 + }, + { + "epoch": 0.5437158469945356, + "grad_norm": 0.6290234042674205, + "learning_rate": 9.800536015007996e-06, + "loss": 0.6407, + "step": 1393 + }, + { + "epoch": 0.5441061670569868, + "grad_norm": 0.8346243709447517, + "learning_rate": 9.799900497184285e-06, + "loss": 0.6536, + "step": 1394 + }, + { + "epoch": 0.544496487119438, + "grad_norm": 0.7107361405932743, + "learning_rate": 9.799263989220997e-06, + "loss": 0.6196, + "step": 1395 + }, + { + "epoch": 0.5448868071818892, + "grad_norm": 0.7432756071873455, + "learning_rate": 9.798626491249428e-06, + "loss": 0.6396, + "step": 1396 + }, + { + "epoch": 0.5452771272443404, + "grad_norm": 0.7555269415203532, + "learning_rate": 9.797988003401089e-06, + "loss": 0.6223, + "step": 1397 + }, + { + "epoch": 0.5456674473067916, + "grad_norm": 0.6585920132066937, + "learning_rate": 9.797348525807684e-06, + "loss": 0.6604, + "step": 1398 + }, + { + "epoch": 0.5460577673692428, + "grad_norm": 0.7312337336559325, + "learning_rate": 9.79670805860113e-06, + "loss": 0.6474, + "step": 1399 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.7921699099068046, + "learning_rate": 9.796066601913543e-06, + "loss": 0.6257, + "step": 1400 + }, + { + "epoch": 0.5468384074941453, + "grad_norm": 0.6798044015764112, + "learning_rate": 9.795424155877247e-06, + "loss": 0.6572, + "step": 1401 + }, + { + "epoch": 0.5472287275565965, + "grad_norm": 0.7748294197801475, + "learning_rate": 9.794780720624766e-06, + "loss": 0.6514, + "step": 1402 + }, + { + "epoch": 0.5476190476190477, + "grad_norm": 0.9226793385968958, + "learning_rate": 9.79413629628883e-06, + "loss": 0.6861, + "step": 1403 + }, + { + "epoch": 0.5480093676814989, + "grad_norm": 0.661206594635777, + "learning_rate": 9.793490883002374e-06, + "loss": 0.6571, + "step": 1404 + }, + { + "epoch": 0.5483996877439501, + "grad_norm": 0.8982140164060817, + "learning_rate": 9.792844480898537e-06, + "loss": 0.6414, + "step": 1405 + }, + { + "epoch": 0.5487900078064013, + "grad_norm": 0.8085651542320587, + "learning_rate": 9.792197090110658e-06, + "loss": 0.6218, + "step": 1406 + }, + { + "epoch": 0.5491803278688525, + "grad_norm": 0.7163765037424801, + "learning_rate": 9.791548710772286e-06, + "loss": 0.6361, + "step": 1407 + }, + { + "epoch": 0.5495706479313037, + "grad_norm": 0.8455306026072767, + "learning_rate": 9.790899343017168e-06, + "loss": 0.6708, + "step": 1408 + }, + { + "epoch": 0.5499609679937549, + "grad_norm": 0.6401473242170871, + "learning_rate": 9.79024898697926e-06, + "loss": 0.6618, + "step": 1409 + }, + { + "epoch": 0.550351288056206, + "grad_norm": 0.6194297007702185, + "learning_rate": 9.789597642792718e-06, + "loss": 0.655, + "step": 1410 + }, + { + "epoch": 0.5507416081186572, + "grad_norm": 0.7243272775188045, + "learning_rate": 9.788945310591905e-06, + "loss": 0.6737, + "step": 1411 + }, + { + "epoch": 0.5511319281811085, + "grad_norm": 0.6164139608040946, + "learning_rate": 9.788291990511385e-06, + "loss": 0.6805, + "step": 1412 + }, + { + "epoch": 0.5515222482435597, + "grad_norm": 0.6160048849582255, + "learning_rate": 9.787637682685927e-06, + "loss": 0.6422, + "step": 1413 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.5937240729466031, + "learning_rate": 9.786982387250506e-06, + "loss": 0.6162, + "step": 1414 + }, + { + "epoch": 0.5523028883684621, + "grad_norm": 0.6070940178551394, + "learning_rate": 9.786326104340296e-06, + "loss": 0.6312, + "step": 1415 + }, + { + "epoch": 0.5526932084309133, + "grad_norm": 0.6022028642139745, + "learning_rate": 9.785668834090676e-06, + "loss": 0.677, + "step": 1416 + }, + { + "epoch": 0.5530835284933645, + "grad_norm": 0.681925005016477, + "learning_rate": 9.785010576637234e-06, + "loss": 0.6523, + "step": 1417 + }, + { + "epoch": 0.5534738485558157, + "grad_norm": 0.591751393489822, + "learning_rate": 9.784351332115757e-06, + "loss": 0.6427, + "step": 1418 + }, + { + "epoch": 0.5538641686182669, + "grad_norm": 0.7217042995586292, + "learning_rate": 9.783691100662234e-06, + "loss": 0.6939, + "step": 1419 + }, + { + "epoch": 0.5542544886807181, + "grad_norm": 0.6168357025288911, + "learning_rate": 9.78302988241286e-06, + "loss": 0.6601, + "step": 1420 + }, + { + "epoch": 0.5546448087431693, + "grad_norm": 0.7330839237285041, + "learning_rate": 9.782367677504034e-06, + "loss": 0.6391, + "step": 1421 + }, + { + "epoch": 0.5550351288056206, + "grad_norm": 0.7651697245281839, + "learning_rate": 9.781704486072358e-06, + "loss": 0.6677, + "step": 1422 + }, + { + "epoch": 0.5554254488680718, + "grad_norm": 0.7588183217304799, + "learning_rate": 9.781040308254639e-06, + "loss": 0.6846, + "step": 1423 + }, + { + "epoch": 0.555815768930523, + "grad_norm": 0.7503290994048835, + "learning_rate": 9.780375144187881e-06, + "loss": 0.6367, + "step": 1424 + }, + { + "epoch": 0.5562060889929742, + "grad_norm": 0.6759991185105176, + "learning_rate": 9.7797089940093e-06, + "loss": 0.6717, + "step": 1425 + }, + { + "epoch": 0.5565964090554254, + "grad_norm": 0.8810176231512005, + "learning_rate": 9.77904185785631e-06, + "loss": 0.647, + "step": 1426 + }, + { + "epoch": 0.5569867291178766, + "grad_norm": 0.6361029183717318, + "learning_rate": 9.778373735866533e-06, + "loss": 0.6164, + "step": 1427 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.7797861491170399, + "learning_rate": 9.77770462817779e-06, + "loss": 0.681, + "step": 1428 + }, + { + "epoch": 0.557767369242779, + "grad_norm": 0.6871465560743134, + "learning_rate": 9.777034534928104e-06, + "loss": 0.6392, + "step": 1429 + }, + { + "epoch": 0.5581576893052302, + "grad_norm": 0.7865049943060157, + "learning_rate": 9.776363456255707e-06, + "loss": 0.6658, + "step": 1430 + }, + { + "epoch": 0.5585480093676815, + "grad_norm": 0.6676403810817921, + "learning_rate": 9.775691392299031e-06, + "loss": 0.6415, + "step": 1431 + }, + { + "epoch": 0.5589383294301327, + "grad_norm": 0.737467252162859, + "learning_rate": 9.775018343196711e-06, + "loss": 0.6392, + "step": 1432 + }, + { + "epoch": 0.5593286494925839, + "grad_norm": 0.6280614377974709, + "learning_rate": 9.774344309087585e-06, + "loss": 0.6335, + "step": 1433 + }, + { + "epoch": 0.5597189695550351, + "grad_norm": 0.6011992063723735, + "learning_rate": 9.773669290110698e-06, + "loss": 0.6212, + "step": 1434 + }, + { + "epoch": 0.5601092896174863, + "grad_norm": 0.6063026237060782, + "learning_rate": 9.772993286405292e-06, + "loss": 0.6293, + "step": 1435 + }, + { + "epoch": 0.5604996096799375, + "grad_norm": 0.6074427700385785, + "learning_rate": 9.772316298110818e-06, + "loss": 0.6438, + "step": 1436 + }, + { + "epoch": 0.5608899297423887, + "grad_norm": 0.5945160808609506, + "learning_rate": 9.771638325366924e-06, + "loss": 0.6348, + "step": 1437 + }, + { + "epoch": 0.5612802498048399, + "grad_norm": 0.6751795882866476, + "learning_rate": 9.77095936831347e-06, + "loss": 0.6577, + "step": 1438 + }, + { + "epoch": 0.5616705698672911, + "grad_norm": 0.6455286918247477, + "learning_rate": 9.770279427090505e-06, + "loss": 0.6462, + "step": 1439 + }, + { + "epoch": 0.5620608899297423, + "grad_norm": 0.6574874631928069, + "learning_rate": 9.769598501838298e-06, + "loss": 0.6876, + "step": 1440 + }, + { + "epoch": 0.5624512099921936, + "grad_norm": 0.6139175116534165, + "learning_rate": 9.768916592697308e-06, + "loss": 0.6686, + "step": 1441 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.5945118810787069, + "learning_rate": 9.768233699808204e-06, + "loss": 0.6418, + "step": 1442 + }, + { + "epoch": 0.563231850117096, + "grad_norm": 0.7037258327820479, + "learning_rate": 9.767549823311852e-06, + "loss": 0.6393, + "step": 1443 + }, + { + "epoch": 0.5636221701795472, + "grad_norm": 0.5666918040934836, + "learning_rate": 9.766864963349327e-06, + "loss": 0.6267, + "step": 1444 + }, + { + "epoch": 0.5640124902419984, + "grad_norm": 0.8582093571739079, + "learning_rate": 9.766179120061905e-06, + "loss": 0.6536, + "step": 1445 + }, + { + "epoch": 0.5644028103044496, + "grad_norm": 0.6325980026915725, + "learning_rate": 9.765492293591062e-06, + "loss": 0.6475, + "step": 1446 + }, + { + "epoch": 0.5647931303669008, + "grad_norm": 0.6642100594222224, + "learning_rate": 9.76480448407848e-06, + "loss": 0.6553, + "step": 1447 + }, + { + "epoch": 0.565183450429352, + "grad_norm": 0.8739394635612628, + "learning_rate": 9.76411569166604e-06, + "loss": 0.6823, + "step": 1448 + }, + { + "epoch": 0.5655737704918032, + "grad_norm": 0.6946984504047451, + "learning_rate": 9.763425916495833e-06, + "loss": 0.6695, + "step": 1449 + }, + { + "epoch": 0.5659640905542545, + "grad_norm": 0.7482629931573141, + "learning_rate": 9.762735158710145e-06, + "loss": 0.6188, + "step": 1450 + }, + { + "epoch": 0.5663544106167057, + "grad_norm": 0.8091879834595553, + "learning_rate": 9.76204341845147e-06, + "loss": 0.7057, + "step": 1451 + }, + { + "epoch": 0.5667447306791569, + "grad_norm": 0.8086713794695899, + "learning_rate": 9.761350695862499e-06, + "loss": 0.6888, + "step": 1452 + }, + { + "epoch": 0.5671350507416081, + "grad_norm": 0.7700675286531284, + "learning_rate": 9.760656991086132e-06, + "loss": 0.6701, + "step": 1453 + }, + { + "epoch": 0.5675253708040593, + "grad_norm": 0.6987621390131932, + "learning_rate": 9.759962304265467e-06, + "loss": 0.6445, + "step": 1454 + }, + { + "epoch": 0.5679156908665105, + "grad_norm": 0.7783550326416729, + "learning_rate": 9.75926663554381e-06, + "loss": 0.6544, + "step": 1455 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.610609520059818, + "learning_rate": 9.75856998506466e-06, + "loss": 0.6198, + "step": 1456 + }, + { + "epoch": 0.5686963309914129, + "grad_norm": 0.8308380209210813, + "learning_rate": 9.75787235297173e-06, + "loss": 0.6123, + "step": 1457 + }, + { + "epoch": 0.5690866510538641, + "grad_norm": 0.6671069531834529, + "learning_rate": 9.757173739408927e-06, + "loss": 0.6555, + "step": 1458 + }, + { + "epoch": 0.5694769711163153, + "grad_norm": 0.9071871397538143, + "learning_rate": 9.756474144520363e-06, + "loss": 0.6723, + "step": 1459 + }, + { + "epoch": 0.5698672911787666, + "grad_norm": 0.7331560208420156, + "learning_rate": 9.755773568450354e-06, + "loss": 0.6432, + "step": 1460 + }, + { + "epoch": 0.5702576112412178, + "grad_norm": 0.7263356967661908, + "learning_rate": 9.755072011343417e-06, + "loss": 0.6109, + "step": 1461 + }, + { + "epoch": 0.570647931303669, + "grad_norm": 0.6983256935988061, + "learning_rate": 9.754369473344272e-06, + "loss": 0.6369, + "step": 1462 + }, + { + "epoch": 0.5710382513661202, + "grad_norm": 0.7844914689329663, + "learning_rate": 9.753665954597838e-06, + "loss": 0.6329, + "step": 1463 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.6525237032450146, + "learning_rate": 9.752961455249243e-06, + "loss": 0.6314, + "step": 1464 + }, + { + "epoch": 0.5718188914910226, + "grad_norm": 0.7388715557285592, + "learning_rate": 9.752255975443811e-06, + "loss": 0.6453, + "step": 1465 + }, + { + "epoch": 0.5722092115534738, + "grad_norm": 0.8878527866520851, + "learning_rate": 9.751549515327075e-06, + "loss": 0.5945, + "step": 1466 + }, + { + "epoch": 0.572599531615925, + "grad_norm": 0.7427501720449236, + "learning_rate": 9.750842075044759e-06, + "loss": 0.6314, + "step": 1467 + }, + { + "epoch": 0.5729898516783762, + "grad_norm": 0.6124611646041173, + "learning_rate": 9.7501336547428e-06, + "loss": 0.6375, + "step": 1468 + }, + { + "epoch": 0.5733801717408274, + "grad_norm": 0.6910826047234885, + "learning_rate": 9.749424254567335e-06, + "loss": 0.6243, + "step": 1469 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.7984972948646545, + "learning_rate": 9.7487138746647e-06, + "loss": 0.6801, + "step": 1470 + }, + { + "epoch": 0.5741608118657299, + "grad_norm": 0.6976553830577972, + "learning_rate": 9.748002515181432e-06, + "loss": 0.6349, + "step": 1471 + }, + { + "epoch": 0.5745511319281811, + "grad_norm": 0.6931545276975197, + "learning_rate": 9.747290176264275e-06, + "loss": 0.6183, + "step": 1472 + }, + { + "epoch": 0.5749414519906323, + "grad_norm": 0.6419061715101997, + "learning_rate": 9.746576858060173e-06, + "loss": 0.6257, + "step": 1473 + }, + { + "epoch": 0.5753317720530835, + "grad_norm": 0.657923363996024, + "learning_rate": 9.74586256071627e-06, + "loss": 0.6438, + "step": 1474 + }, + { + "epoch": 0.5757220921155347, + "grad_norm": 0.662836238926329, + "learning_rate": 9.745147284379917e-06, + "loss": 0.6913, + "step": 1475 + }, + { + "epoch": 0.5761124121779859, + "grad_norm": 0.6581421674812276, + "learning_rate": 9.74443102919866e-06, + "loss": 0.6672, + "step": 1476 + }, + { + "epoch": 0.5765027322404371, + "grad_norm": 0.8429300591407916, + "learning_rate": 9.743713795320251e-06, + "loss": 0.658, + "step": 1477 + }, + { + "epoch": 0.5768930523028883, + "grad_norm": 0.7143687460314213, + "learning_rate": 9.742995582892644e-06, + "loss": 0.6501, + "step": 1478 + }, + { + "epoch": 0.5772833723653396, + "grad_norm": 0.6273844079701525, + "learning_rate": 9.742276392063997e-06, + "loss": 0.644, + "step": 1479 + }, + { + "epoch": 0.5776736924277908, + "grad_norm": 0.7837855068285405, + "learning_rate": 9.741556222982663e-06, + "loss": 0.653, + "step": 1480 + }, + { + "epoch": 0.578064012490242, + "grad_norm": 0.7253473617929086, + "learning_rate": 9.740835075797203e-06, + "loss": 0.6514, + "step": 1481 + }, + { + "epoch": 0.5784543325526932, + "grad_norm": 0.7555391571315344, + "learning_rate": 9.740112950656378e-06, + "loss": 0.6831, + "step": 1482 + }, + { + "epoch": 0.5788446526151444, + "grad_norm": 0.719161444000427, + "learning_rate": 9.739389847709147e-06, + "loss": 0.6518, + "step": 1483 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 0.7433954590872618, + "learning_rate": 9.738665767104678e-06, + "loss": 0.627, + "step": 1484 + }, + { + "epoch": 0.5796252927400468, + "grad_norm": 0.6580779887972934, + "learning_rate": 9.737940708992337e-06, + "loss": 0.6471, + "step": 1485 + }, + { + "epoch": 0.580015612802498, + "grad_norm": 0.6606990251087581, + "learning_rate": 9.737214673521687e-06, + "loss": 0.6483, + "step": 1486 + }, + { + "epoch": 0.5804059328649492, + "grad_norm": 0.6772432840293967, + "learning_rate": 9.736487660842502e-06, + "loss": 0.6043, + "step": 1487 + }, + { + "epoch": 0.5807962529274004, + "grad_norm": 0.650162801806175, + "learning_rate": 9.73575967110475e-06, + "loss": 0.6357, + "step": 1488 + }, + { + "epoch": 0.5811865729898517, + "grad_norm": 0.697630377442622, + "learning_rate": 9.735030704458603e-06, + "loss": 0.6345, + "step": 1489 + }, + { + "epoch": 0.5815768930523029, + "grad_norm": 0.7800180326445895, + "learning_rate": 9.734300761054437e-06, + "loss": 0.6169, + "step": 1490 + }, + { + "epoch": 0.5819672131147541, + "grad_norm": 0.744108165960641, + "learning_rate": 9.733569841042825e-06, + "loss": 0.656, + "step": 1491 + }, + { + "epoch": 0.5823575331772053, + "grad_norm": 0.6690868866015282, + "learning_rate": 9.732837944574543e-06, + "loss": 0.628, + "step": 1492 + }, + { + "epoch": 0.5827478532396565, + "grad_norm": 0.7116015091409277, + "learning_rate": 9.732105071800572e-06, + "loss": 0.6355, + "step": 1493 + }, + { + "epoch": 0.5831381733021077, + "grad_norm": 0.8571052301957891, + "learning_rate": 9.731371222872089e-06, + "loss": 0.6688, + "step": 1494 + }, + { + "epoch": 0.5835284933645589, + "grad_norm": 0.6716828711773692, + "learning_rate": 9.730636397940475e-06, + "loss": 0.6465, + "step": 1495 + }, + { + "epoch": 0.5839188134270101, + "grad_norm": 0.8213421930184953, + "learning_rate": 9.729900597157313e-06, + "loss": 0.6188, + "step": 1496 + }, + { + "epoch": 0.5843091334894613, + "grad_norm": 0.7188246845963165, + "learning_rate": 9.729163820674385e-06, + "loss": 0.655, + "step": 1497 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.718098065485115, + "learning_rate": 9.728426068643678e-06, + "loss": 0.6969, + "step": 1498 + }, + { + "epoch": 0.5850897736143638, + "grad_norm": 0.8344438744152622, + "learning_rate": 9.727687341217376e-06, + "loss": 0.6418, + "step": 1499 + }, + { + "epoch": 0.585480093676815, + "grad_norm": 0.7765436290106769, + "learning_rate": 9.726947638547868e-06, + "loss": 0.674, + "step": 1500 + }, + { + "epoch": 0.5858704137392662, + "grad_norm": 0.6945521081629474, + "learning_rate": 9.72620696078774e-06, + "loss": 0.6772, + "step": 1501 + }, + { + "epoch": 0.5862607338017174, + "grad_norm": 0.7780858990550741, + "learning_rate": 9.725465308089786e-06, + "loss": 0.6573, + "step": 1502 + }, + { + "epoch": 0.5866510538641686, + "grad_norm": 0.5998439909062542, + "learning_rate": 9.724722680606991e-06, + "loss": 0.6362, + "step": 1503 + }, + { + "epoch": 0.5870413739266198, + "grad_norm": 0.7165034453530325, + "learning_rate": 9.723979078492549e-06, + "loss": 0.6633, + "step": 1504 + }, + { + "epoch": 0.587431693989071, + "grad_norm": 0.6618453563696077, + "learning_rate": 9.723234501899852e-06, + "loss": 0.6771, + "step": 1505 + }, + { + "epoch": 0.5878220140515222, + "grad_norm": 0.6658395129004385, + "learning_rate": 9.722488950982497e-06, + "loss": 0.6287, + "step": 1506 + }, + { + "epoch": 0.5882123341139734, + "grad_norm": 0.7541219067977798, + "learning_rate": 9.721742425894275e-06, + "loss": 0.6431, + "step": 1507 + }, + { + "epoch": 0.5886026541764247, + "grad_norm": 0.7057542509691123, + "learning_rate": 9.720994926789184e-06, + "loss": 0.6802, + "step": 1508 + }, + { + "epoch": 0.5889929742388759, + "grad_norm": 0.5910817860768396, + "learning_rate": 9.720246453821418e-06, + "loss": 0.6254, + "step": 1509 + }, + { + "epoch": 0.5893832943013271, + "grad_norm": 0.7572355781827721, + "learning_rate": 9.719497007145378e-06, + "loss": 0.6521, + "step": 1510 + }, + { + "epoch": 0.5897736143637783, + "grad_norm": 0.6501110826125561, + "learning_rate": 9.71874658691566e-06, + "loss": 0.6663, + "step": 1511 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.5963347150363904, + "learning_rate": 9.717995193287063e-06, + "loss": 0.6394, + "step": 1512 + }, + { + "epoch": 0.5905542544886807, + "grad_norm": 0.6092961784547792, + "learning_rate": 9.717242826414589e-06, + "loss": 0.6506, + "step": 1513 + }, + { + "epoch": 0.5909445745511319, + "grad_norm": 0.7808956534069575, + "learning_rate": 9.716489486453435e-06, + "loss": 0.6139, + "step": 1514 + }, + { + "epoch": 0.5913348946135831, + "grad_norm": 0.6006058163311688, + "learning_rate": 9.715735173559007e-06, + "loss": 0.6281, + "step": 1515 + }, + { + "epoch": 0.5917252146760343, + "grad_norm": 0.7180023323437029, + "learning_rate": 9.714979887886907e-06, + "loss": 0.6447, + "step": 1516 + }, + { + "epoch": 0.5921155347384855, + "grad_norm": 0.7966066716924297, + "learning_rate": 9.714223629592933e-06, + "loss": 0.6506, + "step": 1517 + }, + { + "epoch": 0.5925058548009368, + "grad_norm": 0.7026022049478844, + "learning_rate": 9.713466398833093e-06, + "loss": 0.6472, + "step": 1518 + }, + { + "epoch": 0.592896174863388, + "grad_norm": 0.677045165376596, + "learning_rate": 9.71270819576359e-06, + "loss": 0.6682, + "step": 1519 + }, + { + "epoch": 0.5932864949258392, + "grad_norm": 0.6870978138564494, + "learning_rate": 9.711949020540827e-06, + "loss": 0.6649, + "step": 1520 + }, + { + "epoch": 0.5936768149882904, + "grad_norm": 0.6408807142659402, + "learning_rate": 9.711188873321411e-06, + "loss": 0.6501, + "step": 1521 + }, + { + "epoch": 0.5940671350507416, + "grad_norm": 0.6162355852763841, + "learning_rate": 9.710427754262148e-06, + "loss": 0.6641, + "step": 1522 + }, + { + "epoch": 0.5944574551131928, + "grad_norm": 0.7105946343392583, + "learning_rate": 9.709665663520043e-06, + "loss": 0.6519, + "step": 1523 + }, + { + "epoch": 0.594847775175644, + "grad_norm": 0.7011574220627581, + "learning_rate": 9.708902601252304e-06, + "loss": 0.6261, + "step": 1524 + }, + { + "epoch": 0.5952380952380952, + "grad_norm": 0.6328449344625334, + "learning_rate": 9.708138567616336e-06, + "loss": 0.6791, + "step": 1525 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.7147089455554102, + "learning_rate": 9.707373562769748e-06, + "loss": 0.645, + "step": 1526 + }, + { + "epoch": 0.5960187353629977, + "grad_norm": 0.6297547993873618, + "learning_rate": 9.706607586870347e-06, + "loss": 0.6943, + "step": 1527 + }, + { + "epoch": 0.5964090554254489, + "grad_norm": 0.7230924820363219, + "learning_rate": 9.705840640076143e-06, + "loss": 0.6494, + "step": 1528 + }, + { + "epoch": 0.5967993754879001, + "grad_norm": 0.6508529651348919, + "learning_rate": 9.705072722545341e-06, + "loss": 0.6355, + "step": 1529 + }, + { + "epoch": 0.5971896955503513, + "grad_norm": 0.6218750959658119, + "learning_rate": 9.704303834436352e-06, + "loss": 0.6424, + "step": 1530 + }, + { + "epoch": 0.5975800156128025, + "grad_norm": 0.6930496522612423, + "learning_rate": 9.703533975907783e-06, + "loss": 0.6673, + "step": 1531 + }, + { + "epoch": 0.5979703356752537, + "grad_norm": 0.6157188994635318, + "learning_rate": 9.702763147118447e-06, + "loss": 0.6201, + "step": 1532 + }, + { + "epoch": 0.5983606557377049, + "grad_norm": 0.6904906560001571, + "learning_rate": 9.701991348227349e-06, + "loss": 0.639, + "step": 1533 + }, + { + "epoch": 0.5987509758001561, + "grad_norm": 0.7398363556565897, + "learning_rate": 9.7012185793937e-06, + "loss": 0.6325, + "step": 1534 + }, + { + "epoch": 0.5991412958626073, + "grad_norm": 0.595366480006965, + "learning_rate": 9.700444840776907e-06, + "loss": 0.6703, + "step": 1535 + }, + { + "epoch": 0.5995316159250585, + "grad_norm": 0.638668711701579, + "learning_rate": 9.699670132536583e-06, + "loss": 0.6679, + "step": 1536 + }, + { + "epoch": 0.5999219359875098, + "grad_norm": 0.622599198987328, + "learning_rate": 9.698894454832535e-06, + "loss": 0.6257, + "step": 1537 + }, + { + "epoch": 0.600312256049961, + "grad_norm": 0.6672377253042933, + "learning_rate": 9.698117807824776e-06, + "loss": 0.6272, + "step": 1538 + }, + { + "epoch": 0.6007025761124122, + "grad_norm": 0.6753433381147862, + "learning_rate": 9.69734019167351e-06, + "loss": 0.622, + "step": 1539 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.6224290503229791, + "learning_rate": 9.696561606539148e-06, + "loss": 0.6863, + "step": 1540 + }, + { + "epoch": 0.6014832162373146, + "grad_norm": 0.6615748674583621, + "learning_rate": 9.695782052582301e-06, + "loss": 0.605, + "step": 1541 + }, + { + "epoch": 0.6018735362997658, + "grad_norm": 0.6953270186050963, + "learning_rate": 9.695001529963777e-06, + "loss": 0.681, + "step": 1542 + }, + { + "epoch": 0.602263856362217, + "grad_norm": 0.6029986148312858, + "learning_rate": 9.694220038844586e-06, + "loss": 0.6249, + "step": 1543 + }, + { + "epoch": 0.6026541764246682, + "grad_norm": 0.6271028904561906, + "learning_rate": 9.693437579385934e-06, + "loss": 0.6308, + "step": 1544 + }, + { + "epoch": 0.6030444964871194, + "grad_norm": 0.7542082834083487, + "learning_rate": 9.692654151749231e-06, + "loss": 0.6393, + "step": 1545 + }, + { + "epoch": 0.6034348165495707, + "grad_norm": 0.5981489794679964, + "learning_rate": 9.691869756096084e-06, + "loss": 0.6026, + "step": 1546 + }, + { + "epoch": 0.6038251366120219, + "grad_norm": 0.7053790534196105, + "learning_rate": 9.691084392588303e-06, + "loss": 0.6536, + "step": 1547 + }, + { + "epoch": 0.6042154566744731, + "grad_norm": 0.7434591393141691, + "learning_rate": 9.690298061387894e-06, + "loss": 0.6599, + "step": 1548 + }, + { + "epoch": 0.6046057767369243, + "grad_norm": 0.6359709014507051, + "learning_rate": 9.689510762657065e-06, + "loss": 0.6273, + "step": 1549 + }, + { + "epoch": 0.6049960967993755, + "grad_norm": 0.7134398258768297, + "learning_rate": 9.688722496558218e-06, + "loss": 0.6764, + "step": 1550 + }, + { + "epoch": 0.6053864168618267, + "grad_norm": 0.5671880946279514, + "learning_rate": 9.687933263253965e-06, + "loss": 0.6427, + "step": 1551 + }, + { + "epoch": 0.6057767369242779, + "grad_norm": 0.7359910265016659, + "learning_rate": 9.687143062907111e-06, + "loss": 0.6381, + "step": 1552 + }, + { + "epoch": 0.6061670569867291, + "grad_norm": 0.637809535309541, + "learning_rate": 9.686351895680659e-06, + "loss": 0.6404, + "step": 1553 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.6571664006251425, + "learning_rate": 9.685559761737812e-06, + "loss": 0.6194, + "step": 1554 + }, + { + "epoch": 0.6069476971116315, + "grad_norm": 0.747655792812154, + "learning_rate": 9.68476666124198e-06, + "loss": 0.6715, + "step": 1555 + }, + { + "epoch": 0.6073380171740828, + "grad_norm": 0.6775992653496848, + "learning_rate": 9.68397259435676e-06, + "loss": 0.6385, + "step": 1556 + }, + { + "epoch": 0.607728337236534, + "grad_norm": 0.6992270228326594, + "learning_rate": 9.68317756124596e-06, + "loss": 0.6344, + "step": 1557 + }, + { + "epoch": 0.6081186572989852, + "grad_norm": 0.8749551291635426, + "learning_rate": 9.682381562073578e-06, + "loss": 0.6418, + "step": 1558 + }, + { + "epoch": 0.6085089773614364, + "grad_norm": 0.7664506173374676, + "learning_rate": 9.681584597003817e-06, + "loss": 0.6394, + "step": 1559 + }, + { + "epoch": 0.6088992974238876, + "grad_norm": 0.9314161709562381, + "learning_rate": 9.68078666620108e-06, + "loss": 0.6197, + "step": 1560 + }, + { + "epoch": 0.6092896174863388, + "grad_norm": 0.6598708788255839, + "learning_rate": 9.679987769829963e-06, + "loss": 0.6539, + "step": 1561 + }, + { + "epoch": 0.60967993754879, + "grad_norm": 0.9127289906811288, + "learning_rate": 9.679187908055266e-06, + "loss": 0.6609, + "step": 1562 + }, + { + "epoch": 0.6100702576112412, + "grad_norm": 0.8978128356852333, + "learning_rate": 9.67838708104199e-06, + "loss": 0.6533, + "step": 1563 + }, + { + "epoch": 0.6104605776736924, + "grad_norm": 0.6432838648759134, + "learning_rate": 9.677585288955326e-06, + "loss": 0.657, + "step": 1564 + }, + { + "epoch": 0.6108508977361436, + "grad_norm": 0.8706865977157188, + "learning_rate": 9.676782531960678e-06, + "loss": 0.6529, + "step": 1565 + }, + { + "epoch": 0.6112412177985949, + "grad_norm": 0.7210595447363926, + "learning_rate": 9.675978810223636e-06, + "loss": 0.6204, + "step": 1566 + }, + { + "epoch": 0.6116315378610461, + "grad_norm": 0.7244228174194935, + "learning_rate": 9.675174123909998e-06, + "loss": 0.6331, + "step": 1567 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 0.76596959961112, + "learning_rate": 9.674368473185753e-06, + "loss": 0.6653, + "step": 1568 + }, + { + "epoch": 0.6124121779859485, + "grad_norm": 0.7249735989125577, + "learning_rate": 9.673561858217099e-06, + "loss": 0.6339, + "step": 1569 + }, + { + "epoch": 0.6128024980483997, + "grad_norm": 0.6440554365268031, + "learning_rate": 9.672754279170422e-06, + "loss": 0.627, + "step": 1570 + }, + { + "epoch": 0.6131928181108509, + "grad_norm": 0.7571342506103224, + "learning_rate": 9.671945736212316e-06, + "loss": 0.6635, + "step": 1571 + }, + { + "epoch": 0.6135831381733021, + "grad_norm": 0.6417799901455808, + "learning_rate": 9.671136229509567e-06, + "loss": 0.6374, + "step": 1572 + }, + { + "epoch": 0.6139734582357533, + "grad_norm": 0.5972724732499756, + "learning_rate": 9.670325759229164e-06, + "loss": 0.6642, + "step": 1573 + }, + { + "epoch": 0.6143637782982045, + "grad_norm": 0.7516650301723881, + "learning_rate": 9.669514325538293e-06, + "loss": 0.656, + "step": 1574 + }, + { + "epoch": 0.6147540983606558, + "grad_norm": 0.6120896440108335, + "learning_rate": 9.668701928604343e-06, + "loss": 0.6171, + "step": 1575 + }, + { + "epoch": 0.615144418423107, + "grad_norm": 0.6781197769640274, + "learning_rate": 9.667888568594893e-06, + "loss": 0.625, + "step": 1576 + }, + { + "epoch": 0.6155347384855582, + "grad_norm": 0.6648779493733017, + "learning_rate": 9.667074245677725e-06, + "loss": 0.6033, + "step": 1577 + }, + { + "epoch": 0.6159250585480094, + "grad_norm": 0.6256899679434871, + "learning_rate": 9.666258960020826e-06, + "loss": 0.6172, + "step": 1578 + }, + { + "epoch": 0.6163153786104606, + "grad_norm": 0.6525760833980242, + "learning_rate": 9.665442711792372e-06, + "loss": 0.6487, + "step": 1579 + }, + { + "epoch": 0.6167056986729118, + "grad_norm": 0.6739138733741602, + "learning_rate": 9.66462550116074e-06, + "loss": 0.6216, + "step": 1580 + }, + { + "epoch": 0.617096018735363, + "grad_norm": 0.6245785355199724, + "learning_rate": 9.663807328294512e-06, + "loss": 0.6502, + "step": 1581 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.609548021995916, + "learning_rate": 9.662988193362458e-06, + "loss": 0.6236, + "step": 1582 + }, + { + "epoch": 0.6178766588602654, + "grad_norm": 0.6007740546890595, + "learning_rate": 9.662168096533554e-06, + "loss": 0.6454, + "step": 1583 + }, + { + "epoch": 0.6182669789227166, + "grad_norm": 0.7664162776966905, + "learning_rate": 9.661347037976973e-06, + "loss": 0.6524, + "step": 1584 + }, + { + "epoch": 0.6186572989851679, + "grad_norm": 0.5996355891270907, + "learning_rate": 9.660525017862087e-06, + "loss": 0.6661, + "step": 1585 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.809113565524489, + "learning_rate": 9.659702036358463e-06, + "loss": 0.6736, + "step": 1586 + }, + { + "epoch": 0.6194379391100703, + "grad_norm": 0.6857002731828905, + "learning_rate": 9.658878093635867e-06, + "loss": 0.6188, + "step": 1587 + }, + { + "epoch": 0.6198282591725215, + "grad_norm": 0.6469439248585537, + "learning_rate": 9.658053189864268e-06, + "loss": 0.632, + "step": 1588 + }, + { + "epoch": 0.6202185792349727, + "grad_norm": 0.7614950442024371, + "learning_rate": 9.657227325213828e-06, + "loss": 0.6942, + "step": 1589 + }, + { + "epoch": 0.6206088992974239, + "grad_norm": 0.6474167903540365, + "learning_rate": 9.65640049985491e-06, + "loss": 0.623, + "step": 1590 + }, + { + "epoch": 0.6209992193598751, + "grad_norm": 0.604407197499567, + "learning_rate": 9.655572713958072e-06, + "loss": 0.6282, + "step": 1591 + }, + { + "epoch": 0.6213895394223263, + "grad_norm": 0.6912782447182168, + "learning_rate": 9.654743967694075e-06, + "loss": 0.6556, + "step": 1592 + }, + { + "epoch": 0.6217798594847775, + "grad_norm": 0.7315258837874398, + "learning_rate": 9.653914261233876e-06, + "loss": 0.6412, + "step": 1593 + }, + { + "epoch": 0.6221701795472288, + "grad_norm": 0.699315563200887, + "learning_rate": 9.653083594748627e-06, + "loss": 0.6032, + "step": 1594 + }, + { + "epoch": 0.62256049960968, + "grad_norm": 0.6569752947360402, + "learning_rate": 9.652251968409682e-06, + "loss": 0.6271, + "step": 1595 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 0.8120292170241081, + "learning_rate": 9.651419382388593e-06, + "loss": 0.6389, + "step": 1596 + }, + { + "epoch": 0.6233411397345824, + "grad_norm": 0.5801032111520792, + "learning_rate": 9.650585836857108e-06, + "loss": 0.6494, + "step": 1597 + }, + { + "epoch": 0.6237314597970336, + "grad_norm": 0.6835563575122534, + "learning_rate": 9.649751331987172e-06, + "loss": 0.6341, + "step": 1598 + }, + { + "epoch": 0.6241217798594848, + "grad_norm": 0.5989874969716578, + "learning_rate": 9.64891586795093e-06, + "loss": 0.635, + "step": 1599 + }, + { + "epoch": 0.624512099921936, + "grad_norm": 0.6497613638513645, + "learning_rate": 9.648079444920724e-06, + "loss": 0.67, + "step": 1600 + }, + { + "epoch": 0.6249024199843872, + "grad_norm": 0.6727408760519833, + "learning_rate": 9.647242063069097e-06, + "loss": 0.6667, + "step": 1601 + }, + { + "epoch": 0.6252927400468384, + "grad_norm": 0.5504120300493694, + "learning_rate": 9.646403722568784e-06, + "loss": 0.657, + "step": 1602 + }, + { + "epoch": 0.6256830601092896, + "grad_norm": 0.6143531693571899, + "learning_rate": 9.645564423592721e-06, + "loss": 0.6499, + "step": 1603 + }, + { + "epoch": 0.6260733801717409, + "grad_norm": 0.6657930506538223, + "learning_rate": 9.644724166314043e-06, + "loss": 0.6489, + "step": 1604 + }, + { + "epoch": 0.6264637002341921, + "grad_norm": 0.6129184995913936, + "learning_rate": 9.64388295090608e-06, + "loss": 0.6531, + "step": 1605 + }, + { + "epoch": 0.6268540202966433, + "grad_norm": 0.6755739340197771, + "learning_rate": 9.643040777542361e-06, + "loss": 0.6954, + "step": 1606 + }, + { + "epoch": 0.6272443403590945, + "grad_norm": 0.5790077752340528, + "learning_rate": 9.642197646396611e-06, + "loss": 0.6564, + "step": 1607 + }, + { + "epoch": 0.6276346604215457, + "grad_norm": 0.5805870103659557, + "learning_rate": 9.641353557642754e-06, + "loss": 0.6499, + "step": 1608 + }, + { + "epoch": 0.6280249804839969, + "grad_norm": 0.5938742535834672, + "learning_rate": 9.640508511454916e-06, + "loss": 0.6253, + "step": 1609 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.6411899053591705, + "learning_rate": 9.63966250800741e-06, + "loss": 0.6221, + "step": 1610 + }, + { + "epoch": 0.6288056206088993, + "grad_norm": 0.6600215885394517, + "learning_rate": 9.638815547474756e-06, + "loss": 0.6082, + "step": 1611 + }, + { + "epoch": 0.6291959406713505, + "grad_norm": 0.6198913105731542, + "learning_rate": 9.637967630031666e-06, + "loss": 0.6333, + "step": 1612 + }, + { + "epoch": 0.6295862607338018, + "grad_norm": 0.6856416294215855, + "learning_rate": 9.637118755853053e-06, + "loss": 0.6126, + "step": 1613 + }, + { + "epoch": 0.629976580796253, + "grad_norm": 0.6096782098931433, + "learning_rate": 9.636268925114024e-06, + "loss": 0.6121, + "step": 1614 + }, + { + "epoch": 0.6303669008587042, + "grad_norm": 0.6190057811470093, + "learning_rate": 9.635418137989885e-06, + "loss": 0.6411, + "step": 1615 + }, + { + "epoch": 0.6307572209211554, + "grad_norm": 0.720290824642611, + "learning_rate": 9.63456639465614e-06, + "loss": 0.6586, + "step": 1616 + }, + { + "epoch": 0.6311475409836066, + "grad_norm": 0.6821984609686629, + "learning_rate": 9.633713695288488e-06, + "loss": 0.5953, + "step": 1617 + }, + { + "epoch": 0.6315378610460578, + "grad_norm": 0.6271328116614349, + "learning_rate": 9.63286004006283e-06, + "loss": 0.6511, + "step": 1618 + }, + { + "epoch": 0.631928181108509, + "grad_norm": 0.7037699256366994, + "learning_rate": 9.632005429155259e-06, + "loss": 0.6788, + "step": 1619 + }, + { + "epoch": 0.6323185011709602, + "grad_norm": 0.7396333118763302, + "learning_rate": 9.631149862742067e-06, + "loss": 0.6484, + "step": 1620 + }, + { + "epoch": 0.6327088212334114, + "grad_norm": 0.6138288479144999, + "learning_rate": 9.630293340999741e-06, + "loss": 0.6649, + "step": 1621 + }, + { + "epoch": 0.6330991412958626, + "grad_norm": 0.803593579992274, + "learning_rate": 9.62943586410497e-06, + "loss": 0.643, + "step": 1622 + }, + { + "epoch": 0.6334894613583139, + "grad_norm": 0.7067299182212102, + "learning_rate": 9.628577432234638e-06, + "loss": 0.6364, + "step": 1623 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.7218664004335805, + "learning_rate": 9.62771804556582e-06, + "loss": 0.6544, + "step": 1624 + }, + { + "epoch": 0.6342701014832163, + "grad_norm": 0.7432177965330146, + "learning_rate": 9.6268577042758e-06, + "loss": 0.6698, + "step": 1625 + }, + { + "epoch": 0.6346604215456675, + "grad_norm": 0.8422900866756128, + "learning_rate": 9.625996408542049e-06, + "loss": 0.6766, + "step": 1626 + }, + { + "epoch": 0.6350507416081187, + "grad_norm": 0.7323259207392483, + "learning_rate": 9.625134158542235e-06, + "loss": 0.6476, + "step": 1627 + }, + { + "epoch": 0.6354410616705699, + "grad_norm": 0.6759330625622756, + "learning_rate": 9.624270954454231e-06, + "loss": 0.6337, + "step": 1628 + }, + { + "epoch": 0.6358313817330211, + "grad_norm": 0.6315144048624655, + "learning_rate": 9.623406796456098e-06, + "loss": 0.6686, + "step": 1629 + }, + { + "epoch": 0.6362217017954723, + "grad_norm": 0.6327917889164743, + "learning_rate": 9.622541684726098e-06, + "loss": 0.6041, + "step": 1630 + }, + { + "epoch": 0.6366120218579235, + "grad_norm": 0.707937306271008, + "learning_rate": 9.621675619442692e-06, + "loss": 0.6681, + "step": 1631 + }, + { + "epoch": 0.6370023419203747, + "grad_norm": 0.6033554700808894, + "learning_rate": 9.62080860078453e-06, + "loss": 0.659, + "step": 1632 + }, + { + "epoch": 0.637392661982826, + "grad_norm": 0.6195690672946383, + "learning_rate": 9.619940628930468e-06, + "loss": 0.6294, + "step": 1633 + }, + { + "epoch": 0.6377829820452772, + "grad_norm": 0.6320135647764119, + "learning_rate": 9.619071704059552e-06, + "loss": 0.6416, + "step": 1634 + }, + { + "epoch": 0.6381733021077284, + "grad_norm": 0.6423320317002922, + "learning_rate": 9.618201826351027e-06, + "loss": 0.6704, + "step": 1635 + }, + { + "epoch": 0.6385636221701796, + "grad_norm": 0.6345719088010536, + "learning_rate": 9.617330995984334e-06, + "loss": 0.6509, + "step": 1636 + }, + { + "epoch": 0.6389539422326308, + "grad_norm": 0.6720209885647632, + "learning_rate": 9.61645921313911e-06, + "loss": 0.6558, + "step": 1637 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 0.7294163943992554, + "learning_rate": 9.615586477995193e-06, + "loss": 0.6453, + "step": 1638 + }, + { + "epoch": 0.6397345823575332, + "grad_norm": 0.6253502410850295, + "learning_rate": 9.61471279073261e-06, + "loss": 0.64, + "step": 1639 + }, + { + "epoch": 0.6401249024199844, + "grad_norm": 0.621060007719819, + "learning_rate": 9.613838151531589e-06, + "loss": 0.6444, + "step": 1640 + }, + { + "epoch": 0.6405152224824356, + "grad_norm": 0.719953584772847, + "learning_rate": 9.612962560572554e-06, + "loss": 0.6672, + "step": 1641 + }, + { + "epoch": 0.6409055425448869, + "grad_norm": 0.6339207204529904, + "learning_rate": 9.612086018036124e-06, + "loss": 0.6444, + "step": 1642 + }, + { + "epoch": 0.6412958626073381, + "grad_norm": 0.7182906425126857, + "learning_rate": 9.611208524103117e-06, + "loss": 0.6406, + "step": 1643 + }, + { + "epoch": 0.6416861826697893, + "grad_norm": 0.68847893536307, + "learning_rate": 9.610330078954544e-06, + "loss": 0.6244, + "step": 1644 + }, + { + "epoch": 0.6420765027322405, + "grad_norm": 0.6508414220182472, + "learning_rate": 9.609450682771612e-06, + "loss": 0.6356, + "step": 1645 + }, + { + "epoch": 0.6424668227946917, + "grad_norm": 0.7038231980217251, + "learning_rate": 9.608570335735731e-06, + "loss": 0.641, + "step": 1646 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.6240292989256406, + "learning_rate": 9.607689038028496e-06, + "loss": 0.635, + "step": 1647 + }, + { + "epoch": 0.6432474629195941, + "grad_norm": 0.6962599069465971, + "learning_rate": 9.606806789831707e-06, + "loss": 0.6621, + "step": 1648 + }, + { + "epoch": 0.6436377829820453, + "grad_norm": 0.7359924004961663, + "learning_rate": 9.605923591327358e-06, + "loss": 0.6459, + "step": 1649 + }, + { + "epoch": 0.6440281030444965, + "grad_norm": 0.6316492237739717, + "learning_rate": 9.605039442697635e-06, + "loss": 0.6581, + "step": 1650 + }, + { + "epoch": 0.6444184231069477, + "grad_norm": 0.7337358382040049, + "learning_rate": 9.604154344124925e-06, + "loss": 0.6279, + "step": 1651 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 0.7053918322018848, + "learning_rate": 9.60326829579181e-06, + "loss": 0.6351, + "step": 1652 + }, + { + "epoch": 0.6451990632318502, + "grad_norm": 0.6412218416244364, + "learning_rate": 9.602381297881067e-06, + "loss": 0.6476, + "step": 1653 + }, + { + "epoch": 0.6455893832943014, + "grad_norm": 0.6268159632207266, + "learning_rate": 9.601493350575667e-06, + "loss": 0.6326, + "step": 1654 + }, + { + "epoch": 0.6459797033567526, + "grad_norm": 0.6987459946822979, + "learning_rate": 9.60060445405878e-06, + "loss": 0.6267, + "step": 1655 + }, + { + "epoch": 0.6463700234192038, + "grad_norm": 0.6324988016139047, + "learning_rate": 9.599714608513769e-06, + "loss": 0.6303, + "step": 1656 + }, + { + "epoch": 0.646760343481655, + "grad_norm": 0.6329591595445258, + "learning_rate": 9.598823814124196e-06, + "loss": 0.6556, + "step": 1657 + }, + { + "epoch": 0.6471506635441062, + "grad_norm": 0.7540034708693687, + "learning_rate": 9.597932071073819e-06, + "loss": 0.6502, + "step": 1658 + }, + { + "epoch": 0.6475409836065574, + "grad_norm": 0.6284079444889268, + "learning_rate": 9.597039379546585e-06, + "loss": 0.6566, + "step": 1659 + }, + { + "epoch": 0.6479313036690086, + "grad_norm": 0.8082220037788251, + "learning_rate": 9.596145739726644e-06, + "loss": 0.6528, + "step": 1660 + }, + { + "epoch": 0.6483216237314599, + "grad_norm": 0.7221402068313482, + "learning_rate": 9.59525115179834e-06, + "loss": 0.61, + "step": 1661 + }, + { + "epoch": 0.6487119437939111, + "grad_norm": 0.7088979202218326, + "learning_rate": 9.59435561594621e-06, + "loss": 0.646, + "step": 1662 + }, + { + "epoch": 0.6491022638563623, + "grad_norm": 0.8208106343098182, + "learning_rate": 9.59345913235499e-06, + "loss": 0.6635, + "step": 1663 + }, + { + "epoch": 0.6494925839188135, + "grad_norm": 0.5940471518525618, + "learning_rate": 9.592561701209607e-06, + "loss": 0.6588, + "step": 1664 + }, + { + "epoch": 0.6498829039812647, + "grad_norm": 0.7187725576522147, + "learning_rate": 9.591663322695186e-06, + "loss": 0.6565, + "step": 1665 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.7196430343312775, + "learning_rate": 9.590763996997051e-06, + "loss": 0.6304, + "step": 1666 + }, + { + "epoch": 0.650663544106167, + "grad_norm": 0.7103755194700481, + "learning_rate": 9.589863724300716e-06, + "loss": 0.6621, + "step": 1667 + }, + { + "epoch": 0.6510538641686182, + "grad_norm": 0.6634074150921295, + "learning_rate": 9.58896250479189e-06, + "loss": 0.6264, + "step": 1668 + }, + { + "epoch": 0.6514441842310694, + "grad_norm": 0.626721705769957, + "learning_rate": 9.588060338656484e-06, + "loss": 0.6575, + "step": 1669 + }, + { + "epoch": 0.6518345042935206, + "grad_norm": 0.6109958860239235, + "learning_rate": 9.587157226080596e-06, + "loss": 0.6188, + "step": 1670 + }, + { + "epoch": 0.6522248243559718, + "grad_norm": 0.5900794937655, + "learning_rate": 9.586253167250526e-06, + "loss": 0.6121, + "step": 1671 + }, + { + "epoch": 0.652615144418423, + "grad_norm": 0.7352427712614962, + "learning_rate": 9.585348162352762e-06, + "loss": 0.6218, + "step": 1672 + }, + { + "epoch": 0.6530054644808743, + "grad_norm": 0.6956652592852199, + "learning_rate": 9.584442211573996e-06, + "loss": 0.6174, + "step": 1673 + }, + { + "epoch": 0.6533957845433255, + "grad_norm": 0.6396123219315988, + "learning_rate": 9.583535315101111e-06, + "loss": 0.5937, + "step": 1674 + }, + { + "epoch": 0.6537861046057767, + "grad_norm": 0.7485574146951514, + "learning_rate": 9.58262747312118e-06, + "loss": 0.671, + "step": 1675 + }, + { + "epoch": 0.6541764246682279, + "grad_norm": 0.596812089553428, + "learning_rate": 9.581718685821479e-06, + "loss": 0.6234, + "step": 1676 + }, + { + "epoch": 0.6545667447306791, + "grad_norm": 0.7230577970054521, + "learning_rate": 9.580808953389475e-06, + "loss": 0.6347, + "step": 1677 + }, + { + "epoch": 0.6549570647931303, + "grad_norm": 0.7516856069000185, + "learning_rate": 9.57989827601283e-06, + "loss": 0.6341, + "step": 1678 + }, + { + "epoch": 0.6553473848555815, + "grad_norm": 0.6740071428568815, + "learning_rate": 9.578986653879406e-06, + "loss": 0.6397, + "step": 1679 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.6028000553903727, + "learning_rate": 9.578074087177247e-06, + "loss": 0.6309, + "step": 1680 + }, + { + "epoch": 0.656128024980484, + "grad_norm": 0.6582182930424578, + "learning_rate": 9.577160576094608e-06, + "loss": 0.6342, + "step": 1681 + }, + { + "epoch": 0.6565183450429352, + "grad_norm": 0.6468139092573623, + "learning_rate": 9.576246120819925e-06, + "loss": 0.6255, + "step": 1682 + }, + { + "epoch": 0.6569086651053864, + "grad_norm": 0.6424909108789603, + "learning_rate": 9.575330721541841e-06, + "loss": 0.6028, + "step": 1683 + }, + { + "epoch": 0.6572989851678376, + "grad_norm": 0.626119545648994, + "learning_rate": 9.574414378449184e-06, + "loss": 0.6796, + "step": 1684 + }, + { + "epoch": 0.6576893052302888, + "grad_norm": 0.7377423113119141, + "learning_rate": 9.57349709173098e-06, + "loss": 0.6724, + "step": 1685 + }, + { + "epoch": 0.65807962529274, + "grad_norm": 0.7069272885256723, + "learning_rate": 9.57257886157645e-06, + "loss": 0.6511, + "step": 1686 + }, + { + "epoch": 0.6584699453551912, + "grad_norm": 0.6653832509754567, + "learning_rate": 9.571659688175014e-06, + "loss": 0.6081, + "step": 1687 + }, + { + "epoch": 0.6588602654176424, + "grad_norm": 0.7483239461252392, + "learning_rate": 9.570739571716277e-06, + "loss": 0.6297, + "step": 1688 + }, + { + "epoch": 0.6592505854800936, + "grad_norm": 0.7084032386898481, + "learning_rate": 9.569818512390046e-06, + "loss": 0.6306, + "step": 1689 + }, + { + "epoch": 0.6596409055425448, + "grad_norm": 0.6766654108790037, + "learning_rate": 9.56889651038632e-06, + "loss": 0.6393, + "step": 1690 + }, + { + "epoch": 0.660031225604996, + "grad_norm": 0.6665842933172498, + "learning_rate": 9.56797356589529e-06, + "loss": 0.6351, + "step": 1691 + }, + { + "epoch": 0.6604215456674473, + "grad_norm": 0.7673524305352679, + "learning_rate": 9.567049679107348e-06, + "loss": 0.6764, + "step": 1692 + }, + { + "epoch": 0.6608118657298985, + "grad_norm": 0.7323740173213676, + "learning_rate": 9.566124850213074e-06, + "loss": 0.6732, + "step": 1693 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.7406234739982549, + "learning_rate": 9.565199079403247e-06, + "loss": 0.6523, + "step": 1694 + }, + { + "epoch": 0.6615925058548009, + "grad_norm": 0.7841575383032283, + "learning_rate": 9.564272366868836e-06, + "loss": 0.6851, + "step": 1695 + }, + { + "epoch": 0.6619828259172521, + "grad_norm": 0.734620394640746, + "learning_rate": 9.563344712801008e-06, + "loss": 0.6573, + "step": 1696 + }, + { + "epoch": 0.6623731459797033, + "grad_norm": 0.8399433824196544, + "learning_rate": 9.56241611739112e-06, + "loss": 0.6393, + "step": 1697 + }, + { + "epoch": 0.6627634660421545, + "grad_norm": 0.5855884615047114, + "learning_rate": 9.561486580830729e-06, + "loss": 0.6476, + "step": 1698 + }, + { + "epoch": 0.6631537861046057, + "grad_norm": 0.7389380931630365, + "learning_rate": 9.56055610331158e-06, + "loss": 0.6516, + "step": 1699 + }, + { + "epoch": 0.663544106167057, + "grad_norm": 0.6430876149261908, + "learning_rate": 9.559624685025619e-06, + "loss": 0.6057, + "step": 1700 + }, + { + "epoch": 0.6639344262295082, + "grad_norm": 0.7060975394416072, + "learning_rate": 9.558692326164978e-06, + "loss": 0.6563, + "step": 1701 + }, + { + "epoch": 0.6643247462919594, + "grad_norm": 0.6664513960942082, + "learning_rate": 9.557759026921988e-06, + "loss": 0.6205, + "step": 1702 + }, + { + "epoch": 0.6647150663544106, + "grad_norm": 0.6538351135366454, + "learning_rate": 9.556824787489176e-06, + "loss": 0.6065, + "step": 1703 + }, + { + "epoch": 0.6651053864168618, + "grad_norm": 0.5583295257060403, + "learning_rate": 9.555889608059257e-06, + "loss": 0.6429, + "step": 1704 + }, + { + "epoch": 0.665495706479313, + "grad_norm": 0.6084285414114242, + "learning_rate": 9.554953488825142e-06, + "loss": 0.6677, + "step": 1705 + }, + { + "epoch": 0.6658860265417642, + "grad_norm": 0.655259954141303, + "learning_rate": 9.55401642997994e-06, + "loss": 0.6548, + "step": 1706 + }, + { + "epoch": 0.6662763466042154, + "grad_norm": 0.6233669411422376, + "learning_rate": 9.553078431716948e-06, + "loss": 0.6698, + "step": 1707 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5733034086054033, + "learning_rate": 9.552139494229664e-06, + "loss": 0.6102, + "step": 1708 + }, + { + "epoch": 0.6670569867291178, + "grad_norm": 0.7728890678272685, + "learning_rate": 9.55119961771177e-06, + "loss": 0.6391, + "step": 1709 + }, + { + "epoch": 0.667447306791569, + "grad_norm": 0.6999387408205825, + "learning_rate": 9.550258802357149e-06, + "loss": 0.6302, + "step": 1710 + }, + { + "epoch": 0.6678376268540203, + "grad_norm": 0.5891359848360022, + "learning_rate": 9.549317048359874e-06, + "loss": 0.6177, + "step": 1711 + }, + { + "epoch": 0.6682279469164715, + "grad_norm": 0.7296717542182005, + "learning_rate": 9.548374355914216e-06, + "loss": 0.6285, + "step": 1712 + }, + { + "epoch": 0.6686182669789227, + "grad_norm": 0.8317816213565555, + "learning_rate": 9.547430725214632e-06, + "loss": 0.6168, + "step": 1713 + }, + { + "epoch": 0.6690085870413739, + "grad_norm": 0.6745003370037761, + "learning_rate": 9.546486156455784e-06, + "loss": 0.6281, + "step": 1714 + }, + { + "epoch": 0.6693989071038251, + "grad_norm": 0.6956553626474725, + "learning_rate": 9.545540649832516e-06, + "loss": 0.6119, + "step": 1715 + }, + { + "epoch": 0.6697892271662763, + "grad_norm": 0.9752189495583892, + "learning_rate": 9.544594205539872e-06, + "loss": 0.6588, + "step": 1716 + }, + { + "epoch": 0.6701795472287275, + "grad_norm": 0.7362157909591373, + "learning_rate": 9.543646823773088e-06, + "loss": 0.6168, + "step": 1717 + }, + { + "epoch": 0.6705698672911787, + "grad_norm": 0.6311908412846092, + "learning_rate": 9.542698504727593e-06, + "loss": 0.6334, + "step": 1718 + }, + { + "epoch": 0.6709601873536299, + "grad_norm": 0.7972980141188093, + "learning_rate": 9.541749248599006e-06, + "loss": 0.6428, + "step": 1719 + }, + { + "epoch": 0.6713505074160812, + "grad_norm": 0.7236482370949058, + "learning_rate": 9.540799055583148e-06, + "loss": 0.632, + "step": 1720 + }, + { + "epoch": 0.6717408274785324, + "grad_norm": 0.5988534625154209, + "learning_rate": 9.539847925876024e-06, + "loss": 0.6635, + "step": 1721 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.7379112681777997, + "learning_rate": 9.538895859673837e-06, + "loss": 0.6563, + "step": 1722 + }, + { + "epoch": 0.6725214676034348, + "grad_norm": 0.6595424495234893, + "learning_rate": 9.537942857172986e-06, + "loss": 0.6316, + "step": 1723 + }, + { + "epoch": 0.672911787665886, + "grad_norm": 0.7064539411419225, + "learning_rate": 9.536988918570056e-06, + "loss": 0.6474, + "step": 1724 + }, + { + "epoch": 0.6733021077283372, + "grad_norm": 0.755126595537668, + "learning_rate": 9.536034044061828e-06, + "loss": 0.583, + "step": 1725 + }, + { + "epoch": 0.6736924277907884, + "grad_norm": 0.7617168372221331, + "learning_rate": 9.53507823384528e-06, + "loss": 0.6148, + "step": 1726 + }, + { + "epoch": 0.6740827478532396, + "grad_norm": 0.8581077108450628, + "learning_rate": 9.534121488117576e-06, + "loss": 0.6601, + "step": 1727 + }, + { + "epoch": 0.6744730679156908, + "grad_norm": 0.6320885704993267, + "learning_rate": 9.53316380707608e-06, + "loss": 0.619, + "step": 1728 + }, + { + "epoch": 0.674863387978142, + "grad_norm": 0.7244906687756147, + "learning_rate": 9.532205190918345e-06, + "loss": 0.6412, + "step": 1729 + }, + { + "epoch": 0.6752537080405933, + "grad_norm": 0.7787373268532731, + "learning_rate": 9.531245639842115e-06, + "loss": 0.6591, + "step": 1730 + }, + { + "epoch": 0.6756440281030445, + "grad_norm": 0.7613743817769886, + "learning_rate": 9.530285154045332e-06, + "loss": 0.647, + "step": 1731 + }, + { + "epoch": 0.6760343481654957, + "grad_norm": 0.6618508186650279, + "learning_rate": 9.529323733726127e-06, + "loss": 0.6106, + "step": 1732 + }, + { + "epoch": 0.6764246682279469, + "grad_norm": 0.6932487628757663, + "learning_rate": 9.528361379082826e-06, + "loss": 0.6319, + "step": 1733 + }, + { + "epoch": 0.6768149882903981, + "grad_norm": 0.6662579016372546, + "learning_rate": 9.527398090313947e-06, + "loss": 0.6866, + "step": 1734 + }, + { + "epoch": 0.6772053083528493, + "grad_norm": 0.6649411912238008, + "learning_rate": 9.526433867618197e-06, + "loss": 0.6266, + "step": 1735 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.7624893314186534, + "learning_rate": 9.525468711194485e-06, + "loss": 0.6326, + "step": 1736 + }, + { + "epoch": 0.6779859484777517, + "grad_norm": 0.5903912615024534, + "learning_rate": 9.524502621241901e-06, + "loss": 0.5949, + "step": 1737 + }, + { + "epoch": 0.6783762685402029, + "grad_norm": 0.783508224867891, + "learning_rate": 9.523535597959736e-06, + "loss": 0.6543, + "step": 1738 + }, + { + "epoch": 0.6787665886026542, + "grad_norm": 0.5994206104990663, + "learning_rate": 9.522567641547473e-06, + "loss": 0.6432, + "step": 1739 + }, + { + "epoch": 0.6791569086651054, + "grad_norm": 0.678000505918972, + "learning_rate": 9.521598752204781e-06, + "loss": 0.649, + "step": 1740 + }, + { + "epoch": 0.6795472287275566, + "grad_norm": 0.7269900839556197, + "learning_rate": 9.520628930131528e-06, + "loss": 0.6679, + "step": 1741 + }, + { + "epoch": 0.6799375487900078, + "grad_norm": 0.618723113140445, + "learning_rate": 9.519658175527771e-06, + "loss": 0.6109, + "step": 1742 + }, + { + "epoch": 0.680327868852459, + "grad_norm": 0.719613005569482, + "learning_rate": 9.518686488593762e-06, + "loss": 0.6456, + "step": 1743 + }, + { + "epoch": 0.6807181889149102, + "grad_norm": 0.75609805138792, + "learning_rate": 9.517713869529943e-06, + "loss": 0.6537, + "step": 1744 + }, + { + "epoch": 0.6811085089773614, + "grad_norm": 0.5904284627688705, + "learning_rate": 9.516740318536948e-06, + "loss": 0.6562, + "step": 1745 + }, + { + "epoch": 0.6814988290398126, + "grad_norm": 0.7365450772472609, + "learning_rate": 9.515765835815607e-06, + "loss": 0.6502, + "step": 1746 + }, + { + "epoch": 0.6818891491022638, + "grad_norm": 0.7345820269455311, + "learning_rate": 9.514790421566938e-06, + "loss": 0.6512, + "step": 1747 + }, + { + "epoch": 0.682279469164715, + "grad_norm": 0.6166210148034419, + "learning_rate": 9.513814075992152e-06, + "loss": 0.641, + "step": 1748 + }, + { + "epoch": 0.6826697892271663, + "grad_norm": 0.7311675817897852, + "learning_rate": 9.512836799292656e-06, + "loss": 0.6255, + "step": 1749 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.6958935798454053, + "learning_rate": 9.511858591670042e-06, + "loss": 0.6137, + "step": 1750 + }, + { + "epoch": 0.6834504293520687, + "grad_norm": 0.7700956834813463, + "learning_rate": 9.510879453326099e-06, + "loss": 0.6197, + "step": 1751 + }, + { + "epoch": 0.6838407494145199, + "grad_norm": 0.6620766468045293, + "learning_rate": 9.509899384462809e-06, + "loss": 0.618, + "step": 1752 + }, + { + "epoch": 0.6842310694769711, + "grad_norm": 0.6973595881944455, + "learning_rate": 9.50891838528234e-06, + "loss": 0.6517, + "step": 1753 + }, + { + "epoch": 0.6846213895394223, + "grad_norm": 0.641305651001025, + "learning_rate": 9.507936455987061e-06, + "loss": 0.6374, + "step": 1754 + }, + { + "epoch": 0.6850117096018735, + "grad_norm": 0.6016767007674151, + "learning_rate": 9.506953596779523e-06, + "loss": 0.6385, + "step": 1755 + }, + { + "epoch": 0.6854020296643247, + "grad_norm": 0.6559077059979015, + "learning_rate": 9.505969807862476e-06, + "loss": 0.6276, + "step": 1756 + }, + { + "epoch": 0.6857923497267759, + "grad_norm": 0.6626869663232711, + "learning_rate": 9.504985089438858e-06, + "loss": 0.6193, + "step": 1757 + }, + { + "epoch": 0.6861826697892272, + "grad_norm": 0.6509153829950336, + "learning_rate": 9.503999441711802e-06, + "loss": 0.6422, + "step": 1758 + }, + { + "epoch": 0.6865729898516784, + "grad_norm": 0.5439550477367983, + "learning_rate": 9.503012864884629e-06, + "loss": 0.663, + "step": 1759 + }, + { + "epoch": 0.6869633099141296, + "grad_norm": 0.6282007833308836, + "learning_rate": 9.502025359160853e-06, + "loss": 0.6658, + "step": 1760 + }, + { + "epoch": 0.6873536299765808, + "grad_norm": 0.6332802397104195, + "learning_rate": 9.501036924744183e-06, + "loss": 0.6086, + "step": 1761 + }, + { + "epoch": 0.687743950039032, + "grad_norm": 0.6090865436299466, + "learning_rate": 9.500047561838513e-06, + "loss": 0.6457, + "step": 1762 + }, + { + "epoch": 0.6881342701014832, + "grad_norm": 0.5880106669168129, + "learning_rate": 9.49905727064793e-06, + "loss": 0.6483, + "step": 1763 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.6395642988938637, + "learning_rate": 9.498066051376723e-06, + "loss": 0.6645, + "step": 1764 + }, + { + "epoch": 0.6889149102263856, + "grad_norm": 0.5833596462143243, + "learning_rate": 9.497073904229357e-06, + "loss": 0.6401, + "step": 1765 + }, + { + "epoch": 0.6893052302888368, + "grad_norm": 0.7263386068489649, + "learning_rate": 9.496080829410499e-06, + "loss": 0.6565, + "step": 1766 + }, + { + "epoch": 0.689695550351288, + "grad_norm": 0.5396713719933197, + "learning_rate": 9.495086827125e-06, + "loss": 0.6618, + "step": 1767 + }, + { + "epoch": 0.6900858704137393, + "grad_norm": 0.5558442488137867, + "learning_rate": 9.49409189757791e-06, + "loss": 0.6303, + "step": 1768 + }, + { + "epoch": 0.6904761904761905, + "grad_norm": 0.6705877131463266, + "learning_rate": 9.493096040974465e-06, + "loss": 0.6322, + "step": 1769 + }, + { + "epoch": 0.6908665105386417, + "grad_norm": 0.628764452853665, + "learning_rate": 9.492099257520092e-06, + "loss": 0.643, + "step": 1770 + }, + { + "epoch": 0.6912568306010929, + "grad_norm": 0.6752951664972308, + "learning_rate": 9.491101547420413e-06, + "loss": 0.6411, + "step": 1771 + }, + { + "epoch": 0.6916471506635441, + "grad_norm": 0.623958412820234, + "learning_rate": 9.490102910881238e-06, + "loss": 0.6438, + "step": 1772 + }, + { + "epoch": 0.6920374707259953, + "grad_norm": 0.7333291317351759, + "learning_rate": 9.489103348108571e-06, + "loss": 0.6622, + "step": 1773 + }, + { + "epoch": 0.6924277907884465, + "grad_norm": 0.6587729978305435, + "learning_rate": 9.488102859308603e-06, + "loss": 0.6259, + "step": 1774 + }, + { + "epoch": 0.6928181108508977, + "grad_norm": 0.6575081772153563, + "learning_rate": 9.487101444687719e-06, + "loss": 0.6332, + "step": 1775 + }, + { + "epoch": 0.6932084309133489, + "grad_norm": 0.6828445406573012, + "learning_rate": 9.486099104452492e-06, + "loss": 0.655, + "step": 1776 + }, + { + "epoch": 0.6935987509758001, + "grad_norm": 0.7479010714094043, + "learning_rate": 9.485095838809692e-06, + "loss": 0.6536, + "step": 1777 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 0.5164328726024126, + "learning_rate": 9.484091647966273e-06, + "loss": 0.647, + "step": 1778 + }, + { + "epoch": 0.6943793911007026, + "grad_norm": 0.7114209073802363, + "learning_rate": 9.483086532129385e-06, + "loss": 0.6654, + "step": 1779 + }, + { + "epoch": 0.6947697111631538, + "grad_norm": 0.632203130655845, + "learning_rate": 9.482080491506364e-06, + "loss": 0.6456, + "step": 1780 + }, + { + "epoch": 0.695160031225605, + "grad_norm": 0.6117148028672073, + "learning_rate": 9.481073526304741e-06, + "loss": 0.6289, + "step": 1781 + }, + { + "epoch": 0.6955503512880562, + "grad_norm": 0.6089494745334076, + "learning_rate": 9.480065636732237e-06, + "loss": 0.6148, + "step": 1782 + }, + { + "epoch": 0.6959406713505074, + "grad_norm": 0.6623675181236651, + "learning_rate": 9.479056822996761e-06, + "loss": 0.6472, + "step": 1783 + }, + { + "epoch": 0.6963309914129586, + "grad_norm": 0.7314518241846623, + "learning_rate": 9.478047085306418e-06, + "loss": 0.6356, + "step": 1784 + }, + { + "epoch": 0.6967213114754098, + "grad_norm": 0.5686280730095316, + "learning_rate": 9.477036423869496e-06, + "loss": 0.6139, + "step": 1785 + }, + { + "epoch": 0.697111631537861, + "grad_norm": 0.6887773198910344, + "learning_rate": 9.47602483889448e-06, + "loss": 0.6356, + "step": 1786 + }, + { + "epoch": 0.6975019516003123, + "grad_norm": 0.6823382105808613, + "learning_rate": 9.475012330590042e-06, + "loss": 0.6199, + "step": 1787 + }, + { + "epoch": 0.6978922716627635, + "grad_norm": 0.6424600235363055, + "learning_rate": 9.473998899165044e-06, + "loss": 0.6487, + "step": 1788 + }, + { + "epoch": 0.6982825917252147, + "grad_norm": 0.7169833352897839, + "learning_rate": 9.472984544828543e-06, + "loss": 0.6104, + "step": 1789 + }, + { + "epoch": 0.6986729117876659, + "grad_norm": 0.6823219396107513, + "learning_rate": 9.471969267789783e-06, + "loss": 0.6429, + "step": 1790 + }, + { + "epoch": 0.6990632318501171, + "grad_norm": 0.7689301652068217, + "learning_rate": 9.470953068258199e-06, + "loss": 0.6403, + "step": 1791 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.6121899649976874, + "learning_rate": 9.469935946443414e-06, + "loss": 0.6257, + "step": 1792 + }, + { + "epoch": 0.6998438719750195, + "grad_norm": 0.5759871844146137, + "learning_rate": 9.468917902555244e-06, + "loss": 0.6368, + "step": 1793 + }, + { + "epoch": 0.7002341920374707, + "grad_norm": 0.6754151933422672, + "learning_rate": 9.467898936803695e-06, + "loss": 0.6164, + "step": 1794 + }, + { + "epoch": 0.7006245120999219, + "grad_norm": 0.7030924120130165, + "learning_rate": 9.466879049398965e-06, + "loss": 0.6577, + "step": 1795 + }, + { + "epoch": 0.7010148321623731, + "grad_norm": 0.593145245362292, + "learning_rate": 9.465858240551435e-06, + "loss": 0.6301, + "step": 1796 + }, + { + "epoch": 0.7014051522248244, + "grad_norm": 0.6313542327044396, + "learning_rate": 9.464836510471685e-06, + "loss": 0.6427, + "step": 1797 + }, + { + "epoch": 0.7017954722872756, + "grad_norm": 0.5955553101613024, + "learning_rate": 9.463813859370478e-06, + "loss": 0.688, + "step": 1798 + }, + { + "epoch": 0.7021857923497268, + "grad_norm": 0.6067148230768519, + "learning_rate": 9.462790287458772e-06, + "loss": 0.6337, + "step": 1799 + }, + { + "epoch": 0.702576112412178, + "grad_norm": 0.549892103417726, + "learning_rate": 9.46176579494771e-06, + "loss": 0.5867, + "step": 1800 + }, + { + "epoch": 0.7029664324746292, + "grad_norm": 0.6293732301924209, + "learning_rate": 9.460740382048632e-06, + "loss": 0.6428, + "step": 1801 + }, + { + "epoch": 0.7033567525370804, + "grad_norm": 0.599144002823495, + "learning_rate": 9.459714048973062e-06, + "loss": 0.646, + "step": 1802 + }, + { + "epoch": 0.7037470725995316, + "grad_norm": 0.6148672808852186, + "learning_rate": 9.458686795932711e-06, + "loss": 0.6104, + "step": 1803 + }, + { + "epoch": 0.7041373926619828, + "grad_norm": 0.635487153789627, + "learning_rate": 9.457658623139492e-06, + "loss": 0.6379, + "step": 1804 + }, + { + "epoch": 0.704527712724434, + "grad_norm": 0.6149075309143952, + "learning_rate": 9.456629530805495e-06, + "loss": 0.613, + "step": 1805 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.576736204991023, + "learning_rate": 9.455599519143006e-06, + "loss": 0.5815, + "step": 1806 + }, + { + "epoch": 0.7053083528493365, + "grad_norm": 0.6582716130377548, + "learning_rate": 9.454568588364496e-06, + "loss": 0.6481, + "step": 1807 + }, + { + "epoch": 0.7056986729117877, + "grad_norm": 0.675546549260172, + "learning_rate": 9.453536738682634e-06, + "loss": 0.6582, + "step": 1808 + }, + { + "epoch": 0.7060889929742389, + "grad_norm": 0.6017754073947366, + "learning_rate": 9.452503970310271e-06, + "loss": 0.6716, + "step": 1809 + }, + { + "epoch": 0.7064793130366901, + "grad_norm": 0.6458903310571893, + "learning_rate": 9.45147028346045e-06, + "loss": 0.6616, + "step": 1810 + }, + { + "epoch": 0.7068696330991413, + "grad_norm": 0.7027273446508724, + "learning_rate": 9.450435678346403e-06, + "loss": 0.6173, + "step": 1811 + }, + { + "epoch": 0.7072599531615925, + "grad_norm": 0.6141649201917408, + "learning_rate": 9.449400155181553e-06, + "loss": 0.6315, + "step": 1812 + }, + { + "epoch": 0.7076502732240437, + "grad_norm": 0.7818849411023592, + "learning_rate": 9.448363714179509e-06, + "loss": 0.6315, + "step": 1813 + }, + { + "epoch": 0.7080405932864949, + "grad_norm": 0.6638498749039716, + "learning_rate": 9.447326355554073e-06, + "loss": 0.6387, + "step": 1814 + }, + { + "epoch": 0.7084309133489461, + "grad_norm": 0.679728777077161, + "learning_rate": 9.446288079519236e-06, + "loss": 0.6732, + "step": 1815 + }, + { + "epoch": 0.7088212334113974, + "grad_norm": 0.691582913876053, + "learning_rate": 9.445248886289176e-06, + "loss": 0.6117, + "step": 1816 + }, + { + "epoch": 0.7092115534738486, + "grad_norm": 0.672543694810942, + "learning_rate": 9.444208776078261e-06, + "loss": 0.5918, + "step": 1817 + }, + { + "epoch": 0.7096018735362998, + "grad_norm": 0.6283311071272124, + "learning_rate": 9.443167749101047e-06, + "loss": 0.5976, + "step": 1818 + }, + { + "epoch": 0.709992193598751, + "grad_norm": 0.7405229277163593, + "learning_rate": 9.442125805572284e-06, + "loss": 0.6298, + "step": 1819 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.7882839810632828, + "learning_rate": 9.441082945706906e-06, + "loss": 0.6626, + "step": 1820 + }, + { + "epoch": 0.7107728337236534, + "grad_norm": 0.6679260205709749, + "learning_rate": 9.440039169720035e-06, + "loss": 0.6106, + "step": 1821 + }, + { + "epoch": 0.7111631537861046, + "grad_norm": 0.7512846166063322, + "learning_rate": 9.438994477826989e-06, + "loss": 0.6477, + "step": 1822 + }, + { + "epoch": 0.7115534738485558, + "grad_norm": 0.8596838352772075, + "learning_rate": 9.437948870243266e-06, + "loss": 0.6398, + "step": 1823 + }, + { + "epoch": 0.711943793911007, + "grad_norm": 0.6606698891222536, + "learning_rate": 9.436902347184561e-06, + "loss": 0.6485, + "step": 1824 + }, + { + "epoch": 0.7123341139734582, + "grad_norm": 0.8384123570930737, + "learning_rate": 9.435854908866754e-06, + "loss": 0.6367, + "step": 1825 + }, + { + "epoch": 0.7127244340359095, + "grad_norm": 0.5493846440244269, + "learning_rate": 9.434806555505912e-06, + "loss": 0.6064, + "step": 1826 + }, + { + "epoch": 0.7131147540983607, + "grad_norm": 0.7214607457058891, + "learning_rate": 9.433757287318295e-06, + "loss": 0.6498, + "step": 1827 + }, + { + "epoch": 0.7135050741608119, + "grad_norm": 0.6225760994117607, + "learning_rate": 9.432707104520348e-06, + "loss": 0.6587, + "step": 1828 + }, + { + "epoch": 0.7138953942232631, + "grad_norm": 0.693929226959227, + "learning_rate": 9.431656007328708e-06, + "loss": 0.6015, + "step": 1829 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.6185141542825905, + "learning_rate": 9.430603995960198e-06, + "loss": 0.6355, + "step": 1830 + }, + { + "epoch": 0.7146760343481655, + "grad_norm": 0.5941663759602849, + "learning_rate": 9.429551070631829e-06, + "loss": 0.6605, + "step": 1831 + }, + { + "epoch": 0.7150663544106167, + "grad_norm": 0.6636753994889699, + "learning_rate": 9.428497231560805e-06, + "loss": 0.6523, + "step": 1832 + }, + { + "epoch": 0.7154566744730679, + "grad_norm": 0.6709047693283124, + "learning_rate": 9.427442478964511e-06, + "loss": 0.6374, + "step": 1833 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.6444971607484858, + "learning_rate": 9.426386813060528e-06, + "loss": 0.6457, + "step": 1834 + }, + { + "epoch": 0.7162373145979704, + "grad_norm": 0.7168079136811071, + "learning_rate": 9.425330234066625e-06, + "loss": 0.6335, + "step": 1835 + }, + { + "epoch": 0.7166276346604216, + "grad_norm": 0.6084548735938831, + "learning_rate": 9.42427274220075e-06, + "loss": 0.622, + "step": 1836 + }, + { + "epoch": 0.7170179547228728, + "grad_norm": 0.7087454134151238, + "learning_rate": 9.42321433768105e-06, + "loss": 0.6458, + "step": 1837 + }, + { + "epoch": 0.717408274785324, + "grad_norm": 0.739508468984233, + "learning_rate": 9.422155020725857e-06, + "loss": 0.6284, + "step": 1838 + }, + { + "epoch": 0.7177985948477752, + "grad_norm": 0.7009959883324874, + "learning_rate": 9.421094791553689e-06, + "loss": 0.6181, + "step": 1839 + }, + { + "epoch": 0.7181889149102264, + "grad_norm": 0.7193968986002001, + "learning_rate": 9.420033650383254e-06, + "loss": 0.6656, + "step": 1840 + }, + { + "epoch": 0.7185792349726776, + "grad_norm": 0.6001692357697758, + "learning_rate": 9.418971597433446e-06, + "loss": 0.6551, + "step": 1841 + }, + { + "epoch": 0.7189695550351288, + "grad_norm": 0.7440750746284365, + "learning_rate": 9.417908632923352e-06, + "loss": 0.6616, + "step": 1842 + }, + { + "epoch": 0.71935987509758, + "grad_norm": 0.6598983343413182, + "learning_rate": 9.41684475707224e-06, + "loss": 0.6639, + "step": 1843 + }, + { + "epoch": 0.7197501951600312, + "grad_norm": 0.6024626927041579, + "learning_rate": 9.415779970099575e-06, + "loss": 0.6626, + "step": 1844 + }, + { + "epoch": 0.7201405152224825, + "grad_norm": 0.7088344351733439, + "learning_rate": 9.414714272225e-06, + "loss": 0.5933, + "step": 1845 + }, + { + "epoch": 0.7205308352849337, + "grad_norm": 0.6472377847136225, + "learning_rate": 9.413647663668355e-06, + "loss": 0.6362, + "step": 1846 + }, + { + "epoch": 0.7209211553473849, + "grad_norm": 0.8639597049864751, + "learning_rate": 9.412580144649658e-06, + "loss": 0.6265, + "step": 1847 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.7546610535823619, + "learning_rate": 9.411511715389128e-06, + "loss": 0.6485, + "step": 1848 + }, + { + "epoch": 0.7217017954722873, + "grad_norm": 0.8001048839669463, + "learning_rate": 9.410442376107156e-06, + "loss": 0.6498, + "step": 1849 + }, + { + "epoch": 0.7220921155347385, + "grad_norm": 0.8526278981748496, + "learning_rate": 9.409372127024334e-06, + "loss": 0.6156, + "step": 1850 + }, + { + "epoch": 0.7224824355971897, + "grad_norm": 0.6977377871335383, + "learning_rate": 9.408300968361436e-06, + "loss": 0.6561, + "step": 1851 + }, + { + "epoch": 0.7228727556596409, + "grad_norm": 0.8728046062560363, + "learning_rate": 9.407228900339424e-06, + "loss": 0.6591, + "step": 1852 + }, + { + "epoch": 0.7232630757220921, + "grad_norm": 0.5907586035666804, + "learning_rate": 9.406155923179446e-06, + "loss": 0.633, + "step": 1853 + }, + { + "epoch": 0.7236533957845434, + "grad_norm": 0.855651722605101, + "learning_rate": 9.405082037102842e-06, + "loss": 0.6263, + "step": 1854 + }, + { + "epoch": 0.7240437158469946, + "grad_norm": 0.7314737595182386, + "learning_rate": 9.404007242331135e-06, + "loss": 0.6556, + "step": 1855 + }, + { + "epoch": 0.7244340359094458, + "grad_norm": 0.7072108924622685, + "learning_rate": 9.402931539086038e-06, + "loss": 0.6597, + "step": 1856 + }, + { + "epoch": 0.724824355971897, + "grad_norm": 0.8709503118575581, + "learning_rate": 9.40185492758945e-06, + "loss": 0.6317, + "step": 1857 + }, + { + "epoch": 0.7252146760343482, + "grad_norm": 0.6475393852575893, + "learning_rate": 9.400777408063459e-06, + "loss": 0.6431, + "step": 1858 + }, + { + "epoch": 0.7256049960967994, + "grad_norm": 0.759308714178517, + "learning_rate": 9.399698980730339e-06, + "loss": 0.6154, + "step": 1859 + }, + { + "epoch": 0.7259953161592506, + "grad_norm": 0.677082512144071, + "learning_rate": 9.39861964581255e-06, + "loss": 0.6269, + "step": 1860 + }, + { + "epoch": 0.7263856362217018, + "grad_norm": 0.937149434333803, + "learning_rate": 9.397539403532743e-06, + "loss": 0.6447, + "step": 1861 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.8128335606112279, + "learning_rate": 9.396458254113754e-06, + "loss": 0.6526, + "step": 1862 + }, + { + "epoch": 0.7271662763466042, + "grad_norm": 0.7147416970782791, + "learning_rate": 9.395376197778604e-06, + "loss": 0.6443, + "step": 1863 + }, + { + "epoch": 0.7275565964090555, + "grad_norm": 0.7840066348919151, + "learning_rate": 9.394293234750506e-06, + "loss": 0.6146, + "step": 1864 + }, + { + "epoch": 0.7279469164715067, + "grad_norm": 0.7654891599581575, + "learning_rate": 9.393209365252856e-06, + "loss": 0.6368, + "step": 1865 + }, + { + "epoch": 0.7283372365339579, + "grad_norm": 0.6569647096065848, + "learning_rate": 9.392124589509237e-06, + "loss": 0.6565, + "step": 1866 + }, + { + "epoch": 0.7287275565964091, + "grad_norm": 0.6908100462807079, + "learning_rate": 9.391038907743422e-06, + "loss": 0.6527, + "step": 1867 + }, + { + "epoch": 0.7291178766588603, + "grad_norm": 0.6931774416921832, + "learning_rate": 9.389952320179367e-06, + "loss": 0.6247, + "step": 1868 + }, + { + "epoch": 0.7295081967213115, + "grad_norm": 0.9270308186404411, + "learning_rate": 9.38886482704122e-06, + "loss": 0.6193, + "step": 1869 + }, + { + "epoch": 0.7298985167837627, + "grad_norm": 0.6789088729511253, + "learning_rate": 9.38777642855331e-06, + "loss": 0.6469, + "step": 1870 + }, + { + "epoch": 0.7302888368462139, + "grad_norm": 0.8297479628533555, + "learning_rate": 9.386687124940157e-06, + "loss": 0.6172, + "step": 1871 + }, + { + "epoch": 0.7306791569086651, + "grad_norm": 0.8018071432637677, + "learning_rate": 9.385596916426465e-06, + "loss": 0.6347, + "step": 1872 + }, + { + "epoch": 0.7310694769711163, + "grad_norm": 0.6314976399120099, + "learning_rate": 9.384505803237127e-06, + "loss": 0.6638, + "step": 1873 + }, + { + "epoch": 0.7314597970335676, + "grad_norm": 0.7306101901759637, + "learning_rate": 9.383413785597222e-06, + "loss": 0.6347, + "step": 1874 + }, + { + "epoch": 0.7318501170960188, + "grad_norm": 0.8275265072377699, + "learning_rate": 9.382320863732013e-06, + "loss": 0.6392, + "step": 1875 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.6496810614920415, + "learning_rate": 9.381227037866953e-06, + "loss": 0.6231, + "step": 1876 + }, + { + "epoch": 0.7326307572209212, + "grad_norm": 0.9037858148788311, + "learning_rate": 9.38013230822768e-06, + "loss": 0.6296, + "step": 1877 + }, + { + "epoch": 0.7330210772833724, + "grad_norm": 0.6174945922972059, + "learning_rate": 9.379036675040019e-06, + "loss": 0.632, + "step": 1878 + }, + { + "epoch": 0.7334113973458236, + "grad_norm": 0.6450025668163221, + "learning_rate": 9.37794013852998e-06, + "loss": 0.6464, + "step": 1879 + }, + { + "epoch": 0.7338017174082748, + "grad_norm": 0.7313341746638125, + "learning_rate": 9.376842698923759e-06, + "loss": 0.6251, + "step": 1880 + }, + { + "epoch": 0.734192037470726, + "grad_norm": 0.7283633948405743, + "learning_rate": 9.375744356447742e-06, + "loss": 0.6387, + "step": 1881 + }, + { + "epoch": 0.7345823575331772, + "grad_norm": 0.6721704101351321, + "learning_rate": 9.374645111328498e-06, + "loss": 0.643, + "step": 1882 + }, + { + "epoch": 0.7349726775956285, + "grad_norm": 0.9261721787735199, + "learning_rate": 9.373544963792783e-06, + "loss": 0.6575, + "step": 1883 + }, + { + "epoch": 0.7353629976580797, + "grad_norm": 0.7444150819132431, + "learning_rate": 9.372443914067537e-06, + "loss": 0.6392, + "step": 1884 + }, + { + "epoch": 0.7357533177205309, + "grad_norm": 0.6886254521238113, + "learning_rate": 9.37134196237989e-06, + "loss": 0.6414, + "step": 1885 + }, + { + "epoch": 0.7361436377829821, + "grad_norm": 0.8402017559357748, + "learning_rate": 9.370239108957157e-06, + "loss": 0.6244, + "step": 1886 + }, + { + "epoch": 0.7365339578454333, + "grad_norm": 0.7862700797944869, + "learning_rate": 9.36913535402684e-06, + "loss": 0.6256, + "step": 1887 + }, + { + "epoch": 0.7369242779078845, + "grad_norm": 0.7542502909154024, + "learning_rate": 9.36803069781662e-06, + "loss": 0.6154, + "step": 1888 + }, + { + "epoch": 0.7373145979703357, + "grad_norm": 0.758137699201683, + "learning_rate": 9.366925140554372e-06, + "loss": 0.6614, + "step": 1889 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.6932987000588837, + "learning_rate": 9.365818682468156e-06, + "loss": 0.6464, + "step": 1890 + }, + { + "epoch": 0.7380952380952381, + "grad_norm": 0.6938154563385323, + "learning_rate": 9.364711323786213e-06, + "loss": 0.6163, + "step": 1891 + }, + { + "epoch": 0.7384855581576893, + "grad_norm": 0.6984990385583468, + "learning_rate": 9.363603064736974e-06, + "loss": 0.6475, + "step": 1892 + }, + { + "epoch": 0.7388758782201406, + "grad_norm": 0.6975276092499798, + "learning_rate": 9.362493905549054e-06, + "loss": 0.6285, + "step": 1893 + }, + { + "epoch": 0.7392661982825918, + "grad_norm": 0.787254810029544, + "learning_rate": 9.361383846451256e-06, + "loss": 0.6058, + "step": 1894 + }, + { + "epoch": 0.739656518345043, + "grad_norm": 0.7076932136659377, + "learning_rate": 9.36027288767256e-06, + "loss": 0.6622, + "step": 1895 + }, + { + "epoch": 0.7400468384074942, + "grad_norm": 0.7227766826837352, + "learning_rate": 9.359161029442147e-06, + "loss": 0.6272, + "step": 1896 + }, + { + "epoch": 0.7404371584699454, + "grad_norm": 0.7049907996519693, + "learning_rate": 9.358048271989371e-06, + "loss": 0.6323, + "step": 1897 + }, + { + "epoch": 0.7408274785323966, + "grad_norm": 0.9190894676191464, + "learning_rate": 9.356934615543776e-06, + "loss": 0.6748, + "step": 1898 + }, + { + "epoch": 0.7412177985948478, + "grad_norm": 0.6525130105938945, + "learning_rate": 9.355820060335088e-06, + "loss": 0.6447, + "step": 1899 + }, + { + "epoch": 0.741608118657299, + "grad_norm": 0.8472195195380973, + "learning_rate": 9.354704606593224e-06, + "loss": 0.6709, + "step": 1900 + }, + { + "epoch": 0.7419984387197502, + "grad_norm": 0.583430495814751, + "learning_rate": 9.353588254548283e-06, + "loss": 0.6123, + "step": 1901 + }, + { + "epoch": 0.7423887587822015, + "grad_norm": 0.8806361251240208, + "learning_rate": 9.35247100443055e-06, + "loss": 0.6729, + "step": 1902 + }, + { + "epoch": 0.7427790788446527, + "grad_norm": 0.7410160809464605, + "learning_rate": 9.351352856470495e-06, + "loss": 0.6228, + "step": 1903 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.7341955239194726, + "learning_rate": 9.350233810898773e-06, + "loss": 0.6593, + "step": 1904 + }, + { + "epoch": 0.7435597189695551, + "grad_norm": 0.8400992566595644, + "learning_rate": 9.349113867946225e-06, + "loss": 0.6624, + "step": 1905 + }, + { + "epoch": 0.7439500390320063, + "grad_norm": 0.6543563464360145, + "learning_rate": 9.347993027843876e-06, + "loss": 0.6109, + "step": 1906 + }, + { + "epoch": 0.7443403590944575, + "grad_norm": 0.7347520071827194, + "learning_rate": 9.346871290822936e-06, + "loss": 0.6521, + "step": 1907 + }, + { + "epoch": 0.7447306791569087, + "grad_norm": 0.728092318480875, + "learning_rate": 9.345748657114803e-06, + "loss": 0.6593, + "step": 1908 + }, + { + "epoch": 0.7451209992193599, + "grad_norm": 0.7863681216066386, + "learning_rate": 9.344625126951056e-06, + "loss": 0.6829, + "step": 1909 + }, + { + "epoch": 0.7455113192818111, + "grad_norm": 0.7889909812498396, + "learning_rate": 9.34350070056346e-06, + "loss": 0.6277, + "step": 1910 + }, + { + "epoch": 0.7459016393442623, + "grad_norm": 0.6814270696901514, + "learning_rate": 9.342375378183967e-06, + "loss": 0.6542, + "step": 1911 + }, + { + "epoch": 0.7462919594067136, + "grad_norm": 0.5987178320921239, + "learning_rate": 9.34124916004471e-06, + "loss": 0.6036, + "step": 1912 + }, + { + "epoch": 0.7466822794691648, + "grad_norm": 0.777593308188967, + "learning_rate": 9.340122046378012e-06, + "loss": 0.7012, + "step": 1913 + }, + { + "epoch": 0.747072599531616, + "grad_norm": 0.5488785079120966, + "learning_rate": 9.338994037416376e-06, + "loss": 0.6586, + "step": 1914 + }, + { + "epoch": 0.7474629195940672, + "grad_norm": 0.6435270028195585, + "learning_rate": 9.337865133392492e-06, + "loss": 0.6367, + "step": 1915 + }, + { + "epoch": 0.7478532396565184, + "grad_norm": 0.7349206322743461, + "learning_rate": 9.336735334539234e-06, + "loss": 0.6738, + "step": 1916 + }, + { + "epoch": 0.7482435597189696, + "grad_norm": 0.6004129687471832, + "learning_rate": 9.335604641089661e-06, + "loss": 0.6157, + "step": 1917 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 0.6107261710829632, + "learning_rate": 9.334473053277015e-06, + "loss": 0.6537, + "step": 1918 + }, + { + "epoch": 0.749024199843872, + "grad_norm": 0.714251326500555, + "learning_rate": 9.333340571334725e-06, + "loss": 0.627, + "step": 1919 + }, + { + "epoch": 0.7494145199063232, + "grad_norm": 0.6075128050103841, + "learning_rate": 9.332207195496403e-06, + "loss": 0.6085, + "step": 1920 + }, + { + "epoch": 0.7498048399687745, + "grad_norm": 0.679971270593722, + "learning_rate": 9.331072925995845e-06, + "loss": 0.6267, + "step": 1921 + }, + { + "epoch": 0.7501951600312255, + "grad_norm": 0.6910550070511534, + "learning_rate": 9.329937763067032e-06, + "loss": 0.6519, + "step": 1922 + }, + { + "epoch": 0.7505854800936768, + "grad_norm": 0.5587593200253811, + "learning_rate": 9.328801706944129e-06, + "loss": 0.601, + "step": 1923 + }, + { + "epoch": 0.750975800156128, + "grad_norm": 0.7415311995585224, + "learning_rate": 9.327664757861488e-06, + "loss": 0.6674, + "step": 1924 + }, + { + "epoch": 0.7513661202185792, + "grad_norm": 0.6012830958182455, + "learning_rate": 9.32652691605364e-06, + "loss": 0.6059, + "step": 1925 + }, + { + "epoch": 0.7517564402810304, + "grad_norm": 0.6163245444565996, + "learning_rate": 9.325388181755301e-06, + "loss": 0.6547, + "step": 1926 + }, + { + "epoch": 0.7521467603434816, + "grad_norm": 0.67913070383299, + "learning_rate": 9.324248555201378e-06, + "loss": 0.6848, + "step": 1927 + }, + { + "epoch": 0.7525370804059328, + "grad_norm": 0.7578583527211247, + "learning_rate": 9.323108036626954e-06, + "loss": 0.6399, + "step": 1928 + }, + { + "epoch": 0.752927400468384, + "grad_norm": 0.6403099138859577, + "learning_rate": 9.321966626267298e-06, + "loss": 0.6215, + "step": 1929 + }, + { + "epoch": 0.7533177205308352, + "grad_norm": 0.6436422334394597, + "learning_rate": 9.320824324357867e-06, + "loss": 0.6135, + "step": 1930 + }, + { + "epoch": 0.7537080405932864, + "grad_norm": 0.7142077879640972, + "learning_rate": 9.319681131134296e-06, + "loss": 0.6219, + "step": 1931 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.6142087836086559, + "learning_rate": 9.318537046832408e-06, + "loss": 0.6359, + "step": 1932 + }, + { + "epoch": 0.7544886807181889, + "grad_norm": 0.7051184888832129, + "learning_rate": 9.31739207168821e-06, + "loss": 0.6833, + "step": 1933 + }, + { + "epoch": 0.7548790007806401, + "grad_norm": 0.5554573203677564, + "learning_rate": 9.316246205937889e-06, + "loss": 0.6154, + "step": 1934 + }, + { + "epoch": 0.7552693208430913, + "grad_norm": 0.6557560142276136, + "learning_rate": 9.315099449817818e-06, + "loss": 0.6103, + "step": 1935 + }, + { + "epoch": 0.7556596409055425, + "grad_norm": 0.649522947890456, + "learning_rate": 9.313951803564555e-06, + "loss": 0.6436, + "step": 1936 + }, + { + "epoch": 0.7560499609679937, + "grad_norm": 0.6692369288191116, + "learning_rate": 9.312803267414839e-06, + "loss": 0.6204, + "step": 1937 + }, + { + "epoch": 0.7564402810304449, + "grad_norm": 0.734208231013788, + "learning_rate": 9.311653841605596e-06, + "loss": 0.6224, + "step": 1938 + }, + { + "epoch": 0.7568306010928961, + "grad_norm": 0.7262060952664219, + "learning_rate": 9.310503526373932e-06, + "loss": 0.6682, + "step": 1939 + }, + { + "epoch": 0.7572209211553473, + "grad_norm": 0.7772280447467476, + "learning_rate": 9.309352321957138e-06, + "loss": 0.6422, + "step": 1940 + }, + { + "epoch": 0.7576112412177985, + "grad_norm": 0.5857427936673577, + "learning_rate": 9.308200228592688e-06, + "loss": 0.5826, + "step": 1941 + }, + { + "epoch": 0.7580015612802498, + "grad_norm": 0.5677932180858779, + "learning_rate": 9.307047246518239e-06, + "loss": 0.6782, + "step": 1942 + }, + { + "epoch": 0.758391881342701, + "grad_norm": 0.6646412452869893, + "learning_rate": 9.305893375971634e-06, + "loss": 0.6076, + "step": 1943 + }, + { + "epoch": 0.7587822014051522, + "grad_norm": 0.5817923428168855, + "learning_rate": 9.304738617190899e-06, + "loss": 0.6291, + "step": 1944 + }, + { + "epoch": 0.7591725214676034, + "grad_norm": 0.6871063821186446, + "learning_rate": 9.303582970414236e-06, + "loss": 0.6771, + "step": 1945 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.6617684853171546, + "learning_rate": 9.302426435880038e-06, + "loss": 0.6792, + "step": 1946 + }, + { + "epoch": 0.7599531615925058, + "grad_norm": 0.572480534332415, + "learning_rate": 9.301269013826882e-06, + "loss": 0.6652, + "step": 1947 + }, + { + "epoch": 0.760343481654957, + "grad_norm": 0.6479954526812353, + "learning_rate": 9.30011070449352e-06, + "loss": 0.6567, + "step": 1948 + }, + { + "epoch": 0.7607338017174082, + "grad_norm": 0.6907007166817017, + "learning_rate": 9.298951508118895e-06, + "loss": 0.6528, + "step": 1949 + }, + { + "epoch": 0.7611241217798594, + "grad_norm": 0.6689328001754982, + "learning_rate": 9.297791424942128e-06, + "loss": 0.6562, + "step": 1950 + }, + { + "epoch": 0.7615144418423107, + "grad_norm": 0.6382793614100063, + "learning_rate": 9.296630455202527e-06, + "loss": 0.6248, + "step": 1951 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.6213928016928383, + "learning_rate": 9.29546859913958e-06, + "loss": 0.6106, + "step": 1952 + }, + { + "epoch": 0.7622950819672131, + "grad_norm": 0.6454183685225952, + "learning_rate": 9.294305856992957e-06, + "loss": 0.6343, + "step": 1953 + }, + { + "epoch": 0.7626854020296643, + "grad_norm": 0.5853624952551866, + "learning_rate": 9.293142229002515e-06, + "loss": 0.6715, + "step": 1954 + }, + { + "epoch": 0.7630757220921155, + "grad_norm": 0.633277502416259, + "learning_rate": 9.291977715408288e-06, + "loss": 0.6439, + "step": 1955 + }, + { + "epoch": 0.7634660421545667, + "grad_norm": 0.5394049918656397, + "learning_rate": 9.2908123164505e-06, + "loss": 0.6171, + "step": 1956 + }, + { + "epoch": 0.7638563622170179, + "grad_norm": 0.629040361766177, + "learning_rate": 9.28964603236955e-06, + "loss": 0.6166, + "step": 1957 + }, + { + "epoch": 0.7642466822794691, + "grad_norm": 0.6758810965192319, + "learning_rate": 9.288478863406024e-06, + "loss": 0.612, + "step": 1958 + }, + { + "epoch": 0.7646370023419203, + "grad_norm": 0.5988433749949397, + "learning_rate": 9.287310809800688e-06, + "loss": 0.6287, + "step": 1959 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.5971869415958885, + "learning_rate": 9.286141871794495e-06, + "loss": 0.6237, + "step": 1960 + }, + { + "epoch": 0.7654176424668228, + "grad_norm": 0.7297159348487169, + "learning_rate": 9.284972049628575e-06, + "loss": 0.6338, + "step": 1961 + }, + { + "epoch": 0.765807962529274, + "grad_norm": 0.8429672407705882, + "learning_rate": 9.283801343544244e-06, + "loss": 0.635, + "step": 1962 + }, + { + "epoch": 0.7661982825917252, + "grad_norm": 0.5908571275600821, + "learning_rate": 9.282629753783e-06, + "loss": 0.6188, + "step": 1963 + }, + { + "epoch": 0.7665886026541764, + "grad_norm": 0.8840606617858761, + "learning_rate": 9.28145728058652e-06, + "loss": 0.671, + "step": 1964 + }, + { + "epoch": 0.7669789227166276, + "grad_norm": 0.7264372571150101, + "learning_rate": 9.280283924196666e-06, + "loss": 0.6809, + "step": 1965 + }, + { + "epoch": 0.7673692427790788, + "grad_norm": 0.632551808571059, + "learning_rate": 9.279109684855484e-06, + "loss": 0.6214, + "step": 1966 + }, + { + "epoch": 0.76775956284153, + "grad_norm": 0.6924254236775775, + "learning_rate": 9.2779345628052e-06, + "loss": 0.6115, + "step": 1967 + }, + { + "epoch": 0.7681498829039812, + "grad_norm": 0.6878955889952915, + "learning_rate": 9.276758558288219e-06, + "loss": 0.6431, + "step": 1968 + }, + { + "epoch": 0.7685402029664324, + "grad_norm": 0.6242803402237084, + "learning_rate": 9.275581671547136e-06, + "loss": 0.6174, + "step": 1969 + }, + { + "epoch": 0.7689305230288837, + "grad_norm": 0.6652028199737711, + "learning_rate": 9.274403902824717e-06, + "loss": 0.6173, + "step": 1970 + }, + { + "epoch": 0.7693208430913349, + "grad_norm": 0.6153173702345387, + "learning_rate": 9.273225252363924e-06, + "loss": 0.6217, + "step": 1971 + }, + { + "epoch": 0.7697111631537861, + "grad_norm": 0.6038009140826115, + "learning_rate": 9.272045720407885e-06, + "loss": 0.6286, + "step": 1972 + }, + { + "epoch": 0.7701014832162373, + "grad_norm": 0.6838515666149666, + "learning_rate": 9.270865307199923e-06, + "loss": 0.645, + "step": 1973 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.6121128860064806, + "learning_rate": 9.269684012983536e-06, + "loss": 0.6512, + "step": 1974 + }, + { + "epoch": 0.7708821233411397, + "grad_norm": 0.6339666328879883, + "learning_rate": 9.268501838002403e-06, + "loss": 0.6424, + "step": 1975 + }, + { + "epoch": 0.7712724434035909, + "grad_norm": 0.6653294129924138, + "learning_rate": 9.267318782500391e-06, + "loss": 0.6285, + "step": 1976 + }, + { + "epoch": 0.7716627634660421, + "grad_norm": 0.6860737186047955, + "learning_rate": 9.266134846721541e-06, + "loss": 0.6682, + "step": 1977 + }, + { + "epoch": 0.7720530835284933, + "grad_norm": 0.5870926720713282, + "learning_rate": 9.264950030910084e-06, + "loss": 0.615, + "step": 1978 + }, + { + "epoch": 0.7724434035909445, + "grad_norm": 0.6813933739178923, + "learning_rate": 9.263764335310424e-06, + "loss": 0.6497, + "step": 1979 + }, + { + "epoch": 0.7728337236533958, + "grad_norm": 0.5886873228074495, + "learning_rate": 9.26257776016715e-06, + "loss": 0.5983, + "step": 1980 + }, + { + "epoch": 0.773224043715847, + "grad_norm": 0.595586774058366, + "learning_rate": 9.261390305725035e-06, + "loss": 0.6495, + "step": 1981 + }, + { + "epoch": 0.7736143637782982, + "grad_norm": 0.7312529249916498, + "learning_rate": 9.26020197222903e-06, + "loss": 0.629, + "step": 1982 + }, + { + "epoch": 0.7740046838407494, + "grad_norm": 0.6055553973367829, + "learning_rate": 9.259012759924269e-06, + "loss": 0.6639, + "step": 1983 + }, + { + "epoch": 0.7743950039032006, + "grad_norm": 0.7258969103872989, + "learning_rate": 9.257822669056065e-06, + "loss": 0.6361, + "step": 1984 + }, + { + "epoch": 0.7747853239656518, + "grad_norm": 0.7236778610312472, + "learning_rate": 9.256631699869914e-06, + "loss": 0.6008, + "step": 1985 + }, + { + "epoch": 0.775175644028103, + "grad_norm": 0.694812231221316, + "learning_rate": 9.255439852611496e-06, + "loss": 0.6109, + "step": 1986 + }, + { + "epoch": 0.7755659640905542, + "grad_norm": 0.7081525151117591, + "learning_rate": 9.254247127526667e-06, + "loss": 0.6567, + "step": 1987 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.739535428701647, + "learning_rate": 9.253053524861466e-06, + "loss": 0.5924, + "step": 1988 + }, + { + "epoch": 0.7763466042154566, + "grad_norm": 0.7140245299046704, + "learning_rate": 9.251859044862116e-06, + "loss": 0.6515, + "step": 1989 + }, + { + "epoch": 0.7767369242779079, + "grad_norm": 0.7661180971007099, + "learning_rate": 9.250663687775015e-06, + "loss": 0.6367, + "step": 1990 + }, + { + "epoch": 0.7771272443403591, + "grad_norm": 0.8110749030565252, + "learning_rate": 9.249467453846745e-06, + "loss": 0.6119, + "step": 1991 + }, + { + "epoch": 0.7775175644028103, + "grad_norm": 0.6117312568355753, + "learning_rate": 9.248270343324072e-06, + "loss": 0.6591, + "step": 1992 + }, + { + "epoch": 0.7779078844652615, + "grad_norm": 0.8454117392891569, + "learning_rate": 9.247072356453941e-06, + "loss": 0.6783, + "step": 1993 + }, + { + "epoch": 0.7782982045277127, + "grad_norm": 0.7576300748827909, + "learning_rate": 9.245873493483472e-06, + "loss": 0.6553, + "step": 1994 + }, + { + "epoch": 0.7786885245901639, + "grad_norm": 0.6305430746638105, + "learning_rate": 9.244673754659975e-06, + "loss": 0.6258, + "step": 1995 + }, + { + "epoch": 0.7790788446526151, + "grad_norm": 0.8034437060534954, + "learning_rate": 9.243473140230934e-06, + "loss": 0.6047, + "step": 1996 + }, + { + "epoch": 0.7794691647150663, + "grad_norm": 0.6997813607683488, + "learning_rate": 9.242271650444015e-06, + "loss": 0.6245, + "step": 1997 + }, + { + "epoch": 0.7798594847775175, + "grad_norm": 0.8916656913780517, + "learning_rate": 9.241069285547067e-06, + "loss": 0.6442, + "step": 1998 + }, + { + "epoch": 0.7802498048399688, + "grad_norm": 0.6651242693933651, + "learning_rate": 9.239866045788117e-06, + "loss": 0.6757, + "step": 1999 + }, + { + "epoch": 0.78064012490242, + "grad_norm": 0.8112837939976242, + "learning_rate": 9.238661931415374e-06, + "loss": 0.6023, + "step": 2000 + }, + { + "epoch": 0.7810304449648712, + "grad_norm": 0.7978834429731541, + "learning_rate": 9.237456942677225e-06, + "loss": 0.6413, + "step": 2001 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.5698238313201813, + "learning_rate": 9.236251079822241e-06, + "loss": 0.6555, + "step": 2002 + }, + { + "epoch": 0.7818110850897736, + "grad_norm": 0.7519740528018363, + "learning_rate": 9.235044343099172e-06, + "loss": 0.6403, + "step": 2003 + }, + { + "epoch": 0.7822014051522248, + "grad_norm": 0.6003529956687571, + "learning_rate": 9.233836732756944e-06, + "loss": 0.668, + "step": 2004 + }, + { + "epoch": 0.782591725214676, + "grad_norm": 0.7944821824680367, + "learning_rate": 9.232628249044671e-06, + "loss": 0.6769, + "step": 2005 + }, + { + "epoch": 0.7829820452771272, + "grad_norm": 0.654095061151763, + "learning_rate": 9.231418892211642e-06, + "loss": 0.6362, + "step": 2006 + }, + { + "epoch": 0.7833723653395784, + "grad_norm": 0.7577326143785637, + "learning_rate": 9.230208662507327e-06, + "loss": 0.6321, + "step": 2007 + }, + { + "epoch": 0.7837626854020296, + "grad_norm": 0.7432224630757169, + "learning_rate": 9.228997560181375e-06, + "loss": 0.6425, + "step": 2008 + }, + { + "epoch": 0.7841530054644809, + "grad_norm": 0.7453193084677334, + "learning_rate": 9.227785585483617e-06, + "loss": 0.646, + "step": 2009 + }, + { + "epoch": 0.7845433255269321, + "grad_norm": 0.7197382584989217, + "learning_rate": 9.226572738664064e-06, + "loss": 0.6695, + "step": 2010 + }, + { + "epoch": 0.7849336455893833, + "grad_norm": 0.5696620651722945, + "learning_rate": 9.225359019972906e-06, + "loss": 0.588, + "step": 2011 + }, + { + "epoch": 0.7853239656518345, + "grad_norm": 0.8507685533283451, + "learning_rate": 9.22414442966051e-06, + "loss": 0.6689, + "step": 2012 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.6531086030069001, + "learning_rate": 9.222928967977432e-06, + "loss": 0.6099, + "step": 2013 + }, + { + "epoch": 0.7861046057767369, + "grad_norm": 0.622949670643963, + "learning_rate": 9.221712635174396e-06, + "loss": 0.6235, + "step": 2014 + }, + { + "epoch": 0.7864949258391881, + "grad_norm": 0.6991737627808249, + "learning_rate": 9.220495431502313e-06, + "loss": 0.5918, + "step": 2015 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.5898522159157326, + "learning_rate": 9.219277357212271e-06, + "loss": 0.6458, + "step": 2016 + }, + { + "epoch": 0.7872755659640905, + "grad_norm": 0.645193130327917, + "learning_rate": 9.21805841255554e-06, + "loss": 0.6063, + "step": 2017 + }, + { + "epoch": 0.7876658860265418, + "grad_norm": 0.7224991485645037, + "learning_rate": 9.216838597783567e-06, + "loss": 0.6568, + "step": 2018 + }, + { + "epoch": 0.788056206088993, + "grad_norm": 0.6712462327605643, + "learning_rate": 9.21561791314798e-06, + "loss": 0.622, + "step": 2019 + }, + { + "epoch": 0.7884465261514442, + "grad_norm": 0.7444661243253069, + "learning_rate": 9.214396358900586e-06, + "loss": 0.5964, + "step": 2020 + }, + { + "epoch": 0.7888368462138954, + "grad_norm": 0.6322102400370322, + "learning_rate": 9.21317393529337e-06, + "loss": 0.621, + "step": 2021 + }, + { + "epoch": 0.7892271662763466, + "grad_norm": 0.6327307168884955, + "learning_rate": 9.2119506425785e-06, + "loss": 0.633, + "step": 2022 + }, + { + "epoch": 0.7896174863387978, + "grad_norm": 0.7113381468362807, + "learning_rate": 9.210726481008319e-06, + "loss": 0.6489, + "step": 2023 + }, + { + "epoch": 0.790007806401249, + "grad_norm": 0.6550149614476684, + "learning_rate": 9.209501450835351e-06, + "loss": 0.6461, + "step": 2024 + }, + { + "epoch": 0.7903981264637002, + "grad_norm": 0.6695344051482762, + "learning_rate": 9.208275552312302e-06, + "loss": 0.6364, + "step": 2025 + }, + { + "epoch": 0.7907884465261514, + "grad_norm": 0.6148144252974748, + "learning_rate": 9.20704878569205e-06, + "loss": 0.6436, + "step": 2026 + }, + { + "epoch": 0.7911787665886026, + "grad_norm": 0.6393198694718103, + "learning_rate": 9.205821151227661e-06, + "loss": 0.6009, + "step": 2027 + }, + { + "epoch": 0.7915690866510539, + "grad_norm": 0.6590346584988269, + "learning_rate": 9.204592649172373e-06, + "loss": 0.617, + "step": 2028 + }, + { + "epoch": 0.7919594067135051, + "grad_norm": 0.7930143446950428, + "learning_rate": 9.203363279779607e-06, + "loss": 0.6273, + "step": 2029 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.6430920537402776, + "learning_rate": 9.202133043302958e-06, + "loss": 0.6105, + "step": 2030 + }, + { + "epoch": 0.7927400468384075, + "grad_norm": 0.6052811175336416, + "learning_rate": 9.200901939996209e-06, + "loss": 0.6347, + "step": 2031 + }, + { + "epoch": 0.7931303669008587, + "grad_norm": 0.7794128324579368, + "learning_rate": 9.19966997011331e-06, + "loss": 0.6504, + "step": 2032 + }, + { + "epoch": 0.7935206869633099, + "grad_norm": 0.8358865233305913, + "learning_rate": 9.1984371339084e-06, + "loss": 0.6549, + "step": 2033 + }, + { + "epoch": 0.7939110070257611, + "grad_norm": 0.5991360824337791, + "learning_rate": 9.197203431635793e-06, + "loss": 0.6188, + "step": 2034 + }, + { + "epoch": 0.7943013270882123, + "grad_norm": 0.8195144126649002, + "learning_rate": 9.195968863549978e-06, + "loss": 0.6639, + "step": 2035 + }, + { + "epoch": 0.7946916471506635, + "grad_norm": 0.6515184792619958, + "learning_rate": 9.194733429905628e-06, + "loss": 0.6337, + "step": 2036 + }, + { + "epoch": 0.7950819672131147, + "grad_norm": 0.5530546944473322, + "learning_rate": 9.193497130957593e-06, + "loss": 0.6185, + "step": 2037 + }, + { + "epoch": 0.795472287275566, + "grad_norm": 0.7671609775959981, + "learning_rate": 9.1922599669609e-06, + "loss": 0.6364, + "step": 2038 + }, + { + "epoch": 0.7958626073380172, + "grad_norm": 0.7135347572961019, + "learning_rate": 9.191021938170755e-06, + "loss": 0.6287, + "step": 2039 + }, + { + "epoch": 0.7962529274004684, + "grad_norm": 0.6556393075956645, + "learning_rate": 9.18978304484254e-06, + "loss": 0.5844, + "step": 2040 + }, + { + "epoch": 0.7966432474629196, + "grad_norm": 0.5937142448429493, + "learning_rate": 9.188543287231823e-06, + "loss": 0.679, + "step": 2041 + }, + { + "epoch": 0.7970335675253708, + "grad_norm": 0.730345940817164, + "learning_rate": 9.187302665594344e-06, + "loss": 0.641, + "step": 2042 + }, + { + "epoch": 0.797423887587822, + "grad_norm": 0.7261817563493277, + "learning_rate": 9.186061180186022e-06, + "loss": 0.6457, + "step": 2043 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 0.7493656579268638, + "learning_rate": 9.184818831262955e-06, + "loss": 0.6354, + "step": 2044 + }, + { + "epoch": 0.7982045277127244, + "grad_norm": 0.6698626155677748, + "learning_rate": 9.183575619081417e-06, + "loss": 0.6236, + "step": 2045 + }, + { + "epoch": 0.7985948477751756, + "grad_norm": 0.7332556093195691, + "learning_rate": 9.182331543897865e-06, + "loss": 0.6449, + "step": 2046 + }, + { + "epoch": 0.7989851678376269, + "grad_norm": 0.7024543044809826, + "learning_rate": 9.18108660596893e-06, + "loss": 0.6213, + "step": 2047 + }, + { + "epoch": 0.7993754879000781, + "grad_norm": 0.6927896525722939, + "learning_rate": 9.17984080555142e-06, + "loss": 0.6422, + "step": 2048 + }, + { + "epoch": 0.7997658079625293, + "grad_norm": 0.6513192970507589, + "learning_rate": 9.178594142902325e-06, + "loss": 0.6403, + "step": 2049 + }, + { + "epoch": 0.8001561280249805, + "grad_norm": 0.6650331299062961, + "learning_rate": 9.17734661827881e-06, + "loss": 0.6267, + "step": 2050 + }, + { + "epoch": 0.8005464480874317, + "grad_norm": 0.7363716200719183, + "learning_rate": 9.17609823193822e-06, + "loss": 0.6174, + "step": 2051 + }, + { + "epoch": 0.8009367681498829, + "grad_norm": 0.6429444561382724, + "learning_rate": 9.174848984138074e-06, + "loss": 0.6148, + "step": 2052 + }, + { + "epoch": 0.8013270882123341, + "grad_norm": 0.7145595623575155, + "learning_rate": 9.173598875136073e-06, + "loss": 0.6446, + "step": 2053 + }, + { + "epoch": 0.8017174082747853, + "grad_norm": 0.6817685792105965, + "learning_rate": 9.172347905190093e-06, + "loss": 0.629, + "step": 2054 + }, + { + "epoch": 0.8021077283372365, + "grad_norm": 0.6705851714910898, + "learning_rate": 9.171096074558186e-06, + "loss": 0.6416, + "step": 2055 + }, + { + "epoch": 0.8024980483996877, + "grad_norm": 0.6272043295560191, + "learning_rate": 9.16984338349859e-06, + "loss": 0.6187, + "step": 2056 + }, + { + "epoch": 0.802888368462139, + "grad_norm": 0.6955456750899077, + "learning_rate": 9.168589832269706e-06, + "loss": 0.646, + "step": 2057 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.6768950694860443, + "learning_rate": 9.167335421130126e-06, + "loss": 0.6139, + "step": 2058 + }, + { + "epoch": 0.8036690085870414, + "grad_norm": 0.7398725009470846, + "learning_rate": 9.166080150338615e-06, + "loss": 0.6475, + "step": 2059 + }, + { + "epoch": 0.8040593286494926, + "grad_norm": 0.6588782022873546, + "learning_rate": 9.16482402015411e-06, + "loss": 0.6396, + "step": 2060 + }, + { + "epoch": 0.8044496487119438, + "grad_norm": 0.7010805194798871, + "learning_rate": 9.163567030835734e-06, + "loss": 0.6278, + "step": 2061 + }, + { + "epoch": 0.804839968774395, + "grad_norm": 0.7796382855628481, + "learning_rate": 9.162309182642782e-06, + "loss": 0.6276, + "step": 2062 + }, + { + "epoch": 0.8052302888368462, + "grad_norm": 0.7100978645090487, + "learning_rate": 9.161050475834725e-06, + "loss": 0.6561, + "step": 2063 + }, + { + "epoch": 0.8056206088992974, + "grad_norm": 0.6743882056719561, + "learning_rate": 9.159790910671216e-06, + "loss": 0.6177, + "step": 2064 + }, + { + "epoch": 0.8060109289617486, + "grad_norm": 0.6525575287253302, + "learning_rate": 9.158530487412081e-06, + "loss": 0.6324, + "step": 2065 + }, + { + "epoch": 0.8064012490241999, + "grad_norm": 0.7620230528431045, + "learning_rate": 9.157269206317324e-06, + "loss": 0.631, + "step": 2066 + }, + { + "epoch": 0.8067915690866511, + "grad_norm": 0.6606098572208835, + "learning_rate": 9.156007067647125e-06, + "loss": 0.6151, + "step": 2067 + }, + { + "epoch": 0.8071818891491023, + "grad_norm": 0.7063226316440421, + "learning_rate": 9.154744071661848e-06, + "loss": 0.6826, + "step": 2068 + }, + { + "epoch": 0.8075722092115535, + "grad_norm": 0.7958557850202301, + "learning_rate": 9.15348021862202e-06, + "loss": 0.6544, + "step": 2069 + }, + { + "epoch": 0.8079625292740047, + "grad_norm": 0.605622756641058, + "learning_rate": 9.152215508788357e-06, + "loss": 0.6264, + "step": 2070 + }, + { + "epoch": 0.8083528493364559, + "grad_norm": 0.6046816916982467, + "learning_rate": 9.15094994242175e-06, + "loss": 0.6146, + "step": 2071 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.6030254727304765, + "learning_rate": 9.14968351978326e-06, + "loss": 0.6229, + "step": 2072 + }, + { + "epoch": 0.8091334894613583, + "grad_norm": 0.6835035564317847, + "learning_rate": 9.148416241134131e-06, + "loss": 0.6304, + "step": 2073 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.6095175397809655, + "learning_rate": 9.147148106735781e-06, + "loss": 0.5885, + "step": 2074 + }, + { + "epoch": 0.8099141295862607, + "grad_norm": 0.6678982552607695, + "learning_rate": 9.145879116849805e-06, + "loss": 0.6409, + "step": 2075 + }, + { + "epoch": 0.810304449648712, + "grad_norm": 0.7255004461191683, + "learning_rate": 9.144609271737975e-06, + "loss": 0.6178, + "step": 2076 + }, + { + "epoch": 0.8106947697111632, + "grad_norm": 0.6717547870706643, + "learning_rate": 9.14333857166224e-06, + "loss": 0.6176, + "step": 2077 + }, + { + "epoch": 0.8110850897736144, + "grad_norm": 0.6155656898268689, + "learning_rate": 9.142067016884719e-06, + "loss": 0.658, + "step": 2078 + }, + { + "epoch": 0.8114754098360656, + "grad_norm": 0.6207359364747785, + "learning_rate": 9.140794607667717e-06, + "loss": 0.6345, + "step": 2079 + }, + { + "epoch": 0.8118657298985168, + "grad_norm": 0.7093237020065059, + "learning_rate": 9.139521344273713e-06, + "loss": 0.6465, + "step": 2080 + }, + { + "epoch": 0.812256049960968, + "grad_norm": 0.5608263432546076, + "learning_rate": 9.138247226965355e-06, + "loss": 0.6885, + "step": 2081 + }, + { + "epoch": 0.8126463700234192, + "grad_norm": 0.6045137551488203, + "learning_rate": 9.136972256005477e-06, + "loss": 0.6411, + "step": 2082 + }, + { + "epoch": 0.8130366900858704, + "grad_norm": 0.6332518567947367, + "learning_rate": 9.135696431657079e-06, + "loss": 0.6387, + "step": 2083 + }, + { + "epoch": 0.8134270101483216, + "grad_norm": 0.6372821025518699, + "learning_rate": 9.134419754183346e-06, + "loss": 0.6564, + "step": 2084 + }, + { + "epoch": 0.8138173302107728, + "grad_norm": 0.6570178557106511, + "learning_rate": 9.133142223847634e-06, + "loss": 0.6207, + "step": 2085 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 0.601064728951908, + "learning_rate": 9.131863840913477e-06, + "loss": 0.6127, + "step": 2086 + }, + { + "epoch": 0.8145979703356753, + "grad_norm": 0.6170059127766012, + "learning_rate": 9.130584605644583e-06, + "loss": 0.6557, + "step": 2087 + }, + { + "epoch": 0.8149882903981265, + "grad_norm": 0.595292753014305, + "learning_rate": 9.129304518304837e-06, + "loss": 0.609, + "step": 2088 + }, + { + "epoch": 0.8153786104605777, + "grad_norm": 0.6066648493737643, + "learning_rate": 9.128023579158303e-06, + "loss": 0.6572, + "step": 2089 + }, + { + "epoch": 0.8157689305230289, + "grad_norm": 0.5692349900060388, + "learning_rate": 9.126741788469214e-06, + "loss": 0.6075, + "step": 2090 + }, + { + "epoch": 0.8161592505854801, + "grad_norm": 0.5805563878396222, + "learning_rate": 9.125459146501982e-06, + "loss": 0.6504, + "step": 2091 + }, + { + "epoch": 0.8165495706479313, + "grad_norm": 0.6293561647082437, + "learning_rate": 9.124175653521196e-06, + "loss": 0.645, + "step": 2092 + }, + { + "epoch": 0.8169398907103825, + "grad_norm": 0.7121413191226222, + "learning_rate": 9.122891309791618e-06, + "loss": 0.6648, + "step": 2093 + }, + { + "epoch": 0.8173302107728337, + "grad_norm": 0.5891495435943841, + "learning_rate": 9.121606115578188e-06, + "loss": 0.6691, + "step": 2094 + }, + { + "epoch": 0.817720530835285, + "grad_norm": 0.5649715914865339, + "learning_rate": 9.12032007114602e-06, + "loss": 0.656, + "step": 2095 + }, + { + "epoch": 0.8181108508977362, + "grad_norm": 0.6509287650475474, + "learning_rate": 9.119033176760403e-06, + "loss": 0.6149, + "step": 2096 + }, + { + "epoch": 0.8185011709601874, + "grad_norm": 0.5706631939947863, + "learning_rate": 9.117745432686804e-06, + "loss": 0.6506, + "step": 2097 + }, + { + "epoch": 0.8188914910226386, + "grad_norm": 0.7254622231959921, + "learning_rate": 9.11645683919086e-06, + "loss": 0.6157, + "step": 2098 + }, + { + "epoch": 0.8192818110850898, + "grad_norm": 0.548269077205834, + "learning_rate": 9.115167396538386e-06, + "loss": 0.6458, + "step": 2099 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.7478152068074134, + "learning_rate": 9.113877104995376e-06, + "loss": 0.6912, + "step": 2100 + }, + { + "epoch": 0.8200624512099922, + "grad_norm": 0.6135828985846851, + "learning_rate": 9.112585964827993e-06, + "loss": 0.6496, + "step": 2101 + }, + { + "epoch": 0.8204527712724434, + "grad_norm": 0.6227227975698568, + "learning_rate": 9.11129397630258e-06, + "loss": 0.6466, + "step": 2102 + }, + { + "epoch": 0.8208430913348946, + "grad_norm": 0.7001995184665339, + "learning_rate": 9.110001139685652e-06, + "loss": 0.6808, + "step": 2103 + }, + { + "epoch": 0.8212334113973458, + "grad_norm": 0.7389876004452514, + "learning_rate": 9.108707455243897e-06, + "loss": 0.61, + "step": 2104 + }, + { + "epoch": 0.8216237314597971, + "grad_norm": 0.5871133003786219, + "learning_rate": 9.107412923244182e-06, + "loss": 0.634, + "step": 2105 + }, + { + "epoch": 0.8220140515222483, + "grad_norm": 0.6703402655065281, + "learning_rate": 9.106117543953551e-06, + "loss": 0.6189, + "step": 2106 + }, + { + "epoch": 0.8224043715846995, + "grad_norm": 0.5954937906283521, + "learning_rate": 9.104821317639216e-06, + "loss": 0.6457, + "step": 2107 + }, + { + "epoch": 0.8227946916471507, + "grad_norm": 0.6631238801515144, + "learning_rate": 9.103524244568565e-06, + "loss": 0.6193, + "step": 2108 + }, + { + "epoch": 0.8231850117096019, + "grad_norm": 0.7002601390972766, + "learning_rate": 9.102226325009166e-06, + "loss": 0.6568, + "step": 2109 + }, + { + "epoch": 0.8235753317720531, + "grad_norm": 0.6189639891732845, + "learning_rate": 9.100927559228757e-06, + "loss": 0.6373, + "step": 2110 + }, + { + "epoch": 0.8239656518345043, + "grad_norm": 0.7698513998953698, + "learning_rate": 9.099627947495252e-06, + "loss": 0.6576, + "step": 2111 + }, + { + "epoch": 0.8243559718969555, + "grad_norm": 0.6886472749658592, + "learning_rate": 9.098327490076737e-06, + "loss": 0.6485, + "step": 2112 + }, + { + "epoch": 0.8247462919594067, + "grad_norm": 0.7198725386009984, + "learning_rate": 9.097026187241479e-06, + "loss": 0.6278, + "step": 2113 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.7197185917312771, + "learning_rate": 9.095724039257911e-06, + "loss": 0.6389, + "step": 2114 + }, + { + "epoch": 0.8255269320843092, + "grad_norm": 0.6296372532227732, + "learning_rate": 9.094421046394649e-06, + "loss": 0.6121, + "step": 2115 + }, + { + "epoch": 0.8259172521467604, + "grad_norm": 0.6072704685116671, + "learning_rate": 9.093117208920474e-06, + "loss": 0.6189, + "step": 2116 + }, + { + "epoch": 0.8263075722092116, + "grad_norm": 0.698750588269684, + "learning_rate": 9.091812527104347e-06, + "loss": 0.6392, + "step": 2117 + }, + { + "epoch": 0.8266978922716628, + "grad_norm": 0.6498811019259246, + "learning_rate": 9.090507001215405e-06, + "loss": 0.5992, + "step": 2118 + }, + { + "epoch": 0.827088212334114, + "grad_norm": 0.5696856559496221, + "learning_rate": 9.089200631522953e-06, + "loss": 0.661, + "step": 2119 + }, + { + "epoch": 0.8274785323965652, + "grad_norm": 0.6744492608516519, + "learning_rate": 9.087893418296476e-06, + "loss": 0.6507, + "step": 2120 + }, + { + "epoch": 0.8278688524590164, + "grad_norm": 0.6360525520261241, + "learning_rate": 9.086585361805628e-06, + "loss": 0.6308, + "step": 2121 + }, + { + "epoch": 0.8282591725214676, + "grad_norm": 0.6053718703608811, + "learning_rate": 9.08527646232024e-06, + "loss": 0.5912, + "step": 2122 + }, + { + "epoch": 0.8286494925839188, + "grad_norm": 0.643100377522141, + "learning_rate": 9.083966720110317e-06, + "loss": 0.6037, + "step": 2123 + }, + { + "epoch": 0.8290398126463701, + "grad_norm": 0.6308841284256215, + "learning_rate": 9.082656135446037e-06, + "loss": 0.6318, + "step": 2124 + }, + { + "epoch": 0.8294301327088213, + "grad_norm": 0.841281659781262, + "learning_rate": 9.081344708597753e-06, + "loss": 0.6295, + "step": 2125 + }, + { + "epoch": 0.8298204527712725, + "grad_norm": 0.6065732747545067, + "learning_rate": 9.080032439835987e-06, + "loss": 0.6259, + "step": 2126 + }, + { + "epoch": 0.8302107728337237, + "grad_norm": 0.6300247543516114, + "learning_rate": 9.078719329431437e-06, + "loss": 0.6428, + "step": 2127 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 0.6564211529246362, + "learning_rate": 9.077405377654984e-06, + "loss": 0.634, + "step": 2128 + }, + { + "epoch": 0.8309914129586261, + "grad_norm": 0.8179649805731338, + "learning_rate": 9.076090584777666e-06, + "loss": 0.6317, + "step": 2129 + }, + { + "epoch": 0.8313817330210773, + "grad_norm": 0.6418231804928785, + "learning_rate": 9.074774951070708e-06, + "loss": 0.6088, + "step": 2130 + }, + { + "epoch": 0.8317720530835285, + "grad_norm": 0.7346464363558057, + "learning_rate": 9.0734584768055e-06, + "loss": 0.6241, + "step": 2131 + }, + { + "epoch": 0.8321623731459797, + "grad_norm": 0.7994461375626001, + "learning_rate": 9.072141162253611e-06, + "loss": 0.6371, + "step": 2132 + }, + { + "epoch": 0.832552693208431, + "grad_norm": 0.5748594957198815, + "learning_rate": 9.070823007686779e-06, + "loss": 0.6493, + "step": 2133 + }, + { + "epoch": 0.8329430132708822, + "grad_norm": 0.723074086574827, + "learning_rate": 9.069504013376921e-06, + "loss": 0.63, + "step": 2134 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.8281818966199571, + "learning_rate": 9.068184179596118e-06, + "loss": 0.6509, + "step": 2135 + }, + { + "epoch": 0.8337236533957846, + "grad_norm": 0.700107252057107, + "learning_rate": 9.066863506616637e-06, + "loss": 0.6278, + "step": 2136 + }, + { + "epoch": 0.8341139734582358, + "grad_norm": 0.6857913497727615, + "learning_rate": 9.065541994710904e-06, + "loss": 0.6261, + "step": 2137 + }, + { + "epoch": 0.834504293520687, + "grad_norm": 0.5752586959179327, + "learning_rate": 9.064219644151528e-06, + "loss": 0.6534, + "step": 2138 + }, + { + "epoch": 0.8348946135831382, + "grad_norm": 0.7607087965006856, + "learning_rate": 9.062896455211288e-06, + "loss": 0.5899, + "step": 2139 + }, + { + "epoch": 0.8352849336455894, + "grad_norm": 0.6941492299707012, + "learning_rate": 9.061572428163135e-06, + "loss": 0.5971, + "step": 2140 + }, + { + "epoch": 0.8356752537080406, + "grad_norm": 0.6511325591731809, + "learning_rate": 9.060247563280195e-06, + "loss": 0.6292, + "step": 2141 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 0.6167659921119163, + "learning_rate": 9.058921860835764e-06, + "loss": 0.6226, + "step": 2142 + }, + { + "epoch": 0.836455893832943, + "grad_norm": 0.6151528950654468, + "learning_rate": 9.057595321103313e-06, + "loss": 0.6549, + "step": 2143 + }, + { + "epoch": 0.8368462138953943, + "grad_norm": 0.6729825101777159, + "learning_rate": 9.056267944356485e-06, + "loss": 0.6553, + "step": 2144 + }, + { + "epoch": 0.8372365339578455, + "grad_norm": 0.5507627253044262, + "learning_rate": 9.054939730869098e-06, + "loss": 0.6204, + "step": 2145 + }, + { + "epoch": 0.8376268540202967, + "grad_norm": 0.6452807527411697, + "learning_rate": 9.053610680915134e-06, + "loss": 0.6172, + "step": 2146 + }, + { + "epoch": 0.8380171740827479, + "grad_norm": 0.7050380075910323, + "learning_rate": 9.052280794768761e-06, + "loss": 0.6465, + "step": 2147 + }, + { + "epoch": 0.8384074941451991, + "grad_norm": 0.59952264692152, + "learning_rate": 9.050950072704309e-06, + "loss": 0.6413, + "step": 2148 + }, + { + "epoch": 0.8387978142076503, + "grad_norm": 0.7230835283532364, + "learning_rate": 9.049618514996286e-06, + "loss": 0.6307, + "step": 2149 + }, + { + "epoch": 0.8391881342701015, + "grad_norm": 0.700393716701158, + "learning_rate": 9.048286121919367e-06, + "loss": 0.6291, + "step": 2150 + }, + { + "epoch": 0.8395784543325527, + "grad_norm": 0.6361271324649979, + "learning_rate": 9.046952893748404e-06, + "loss": 0.6098, + "step": 2151 + }, + { + "epoch": 0.839968774395004, + "grad_norm": 0.5628302588924573, + "learning_rate": 9.04561883075842e-06, + "loss": 0.6397, + "step": 2152 + }, + { + "epoch": 0.8403590944574552, + "grad_norm": 0.7801986468766695, + "learning_rate": 9.044283933224609e-06, + "loss": 0.6498, + "step": 2153 + }, + { + "epoch": 0.8407494145199064, + "grad_norm": 0.5941652124002584, + "learning_rate": 9.04294820142234e-06, + "loss": 0.6678, + "step": 2154 + }, + { + "epoch": 0.8411397345823576, + "grad_norm": 0.7023939382750661, + "learning_rate": 9.04161163562715e-06, + "loss": 0.6057, + "step": 2155 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.6353112095682042, + "learning_rate": 9.040274236114752e-06, + "loss": 0.6416, + "step": 2156 + }, + { + "epoch": 0.84192037470726, + "grad_norm": 0.5813126597737853, + "learning_rate": 9.03893600316103e-06, + "loss": 0.6611, + "step": 2157 + }, + { + "epoch": 0.8423106947697112, + "grad_norm": 0.7792178191062874, + "learning_rate": 9.037596937042036e-06, + "loss": 0.6346, + "step": 2158 + }, + { + "epoch": 0.8427010148321624, + "grad_norm": 0.5892566568409848, + "learning_rate": 9.036257038034e-06, + "loss": 0.6087, + "step": 2159 + }, + { + "epoch": 0.8430913348946136, + "grad_norm": 0.595129211531277, + "learning_rate": 9.03491630641332e-06, + "loss": 0.6431, + "step": 2160 + }, + { + "epoch": 0.8434816549570648, + "grad_norm": 0.6707658574599852, + "learning_rate": 9.033574742456565e-06, + "loss": 0.6381, + "step": 2161 + }, + { + "epoch": 0.843871975019516, + "grad_norm": 0.6450010745565734, + "learning_rate": 9.03223234644048e-06, + "loss": 0.6197, + "step": 2162 + }, + { + "epoch": 0.8442622950819673, + "grad_norm": 0.5524305191108829, + "learning_rate": 9.030889118641977e-06, + "loss": 0.6093, + "step": 2163 + }, + { + "epoch": 0.8446526151444185, + "grad_norm": 0.6882430315936797, + "learning_rate": 9.029545059338143e-06, + "loss": 0.6402, + "step": 2164 + }, + { + "epoch": 0.8450429352068697, + "grad_norm": 0.5648547738129145, + "learning_rate": 9.028200168806234e-06, + "loss": 0.6243, + "step": 2165 + }, + { + "epoch": 0.8454332552693209, + "grad_norm": 0.677180854351524, + "learning_rate": 9.026854447323679e-06, + "loss": 0.6014, + "step": 2166 + }, + { + "epoch": 0.8458235753317721, + "grad_norm": 0.6346166055600188, + "learning_rate": 9.025507895168077e-06, + "loss": 0.6224, + "step": 2167 + }, + { + "epoch": 0.8462138953942233, + "grad_norm": 0.6324895721974788, + "learning_rate": 9.0241605126172e-06, + "loss": 0.6453, + "step": 2168 + }, + { + "epoch": 0.8466042154566745, + "grad_norm": 0.5193842434245057, + "learning_rate": 9.022812299948992e-06, + "loss": 0.5839, + "step": 2169 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.6893975496947163, + "learning_rate": 9.021463257441565e-06, + "loss": 0.6441, + "step": 2170 + }, + { + "epoch": 0.847384855581577, + "grad_norm": 0.6888714380395704, + "learning_rate": 9.020113385373204e-06, + "loss": 0.6193, + "step": 2171 + }, + { + "epoch": 0.8477751756440282, + "grad_norm": 0.5563750357543524, + "learning_rate": 9.018762684022365e-06, + "loss": 0.6487, + "step": 2172 + }, + { + "epoch": 0.8481654957064794, + "grad_norm": 0.6271868473405428, + "learning_rate": 9.017411153667677e-06, + "loss": 0.6347, + "step": 2173 + }, + { + "epoch": 0.8485558157689306, + "grad_norm": 0.6797845548958795, + "learning_rate": 9.016058794587937e-06, + "loss": 0.6508, + "step": 2174 + }, + { + "epoch": 0.8489461358313818, + "grad_norm": 0.6923587667944557, + "learning_rate": 9.014705607062114e-06, + "loss": 0.6369, + "step": 2175 + }, + { + "epoch": 0.849336455893833, + "grad_norm": 0.6089252338307684, + "learning_rate": 9.013351591369349e-06, + "loss": 0.6551, + "step": 2176 + }, + { + "epoch": 0.8497267759562842, + "grad_norm": 0.6705077357774675, + "learning_rate": 9.01199674778895e-06, + "loss": 0.664, + "step": 2177 + }, + { + "epoch": 0.8501170960187353, + "grad_norm": 0.6616066351071106, + "learning_rate": 9.010641076600403e-06, + "loss": 0.644, + "step": 2178 + }, + { + "epoch": 0.8505074160811865, + "grad_norm": 0.5975659438846541, + "learning_rate": 9.009284578083357e-06, + "loss": 0.6082, + "step": 2179 + }, + { + "epoch": 0.8508977361436377, + "grad_norm": 0.7185239125774578, + "learning_rate": 9.007927252517636e-06, + "loss": 0.6363, + "step": 2180 + }, + { + "epoch": 0.8512880562060889, + "grad_norm": 0.7347743416050434, + "learning_rate": 9.006569100183235e-06, + "loss": 0.6553, + "step": 2181 + }, + { + "epoch": 0.8516783762685401, + "grad_norm": 0.5428656421634759, + "learning_rate": 9.005210121360316e-06, + "loss": 0.6345, + "step": 2182 + }, + { + "epoch": 0.8520686963309914, + "grad_norm": 0.7286739198741934, + "learning_rate": 9.003850316329216e-06, + "loss": 0.6878, + "step": 2183 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.7662936113127137, + "learning_rate": 9.00248968537044e-06, + "loss": 0.6632, + "step": 2184 + }, + { + "epoch": 0.8528493364558938, + "grad_norm": 0.6277822181584108, + "learning_rate": 9.001128228764662e-06, + "loss": 0.6412, + "step": 2185 + }, + { + "epoch": 0.853239656518345, + "grad_norm": 0.6882560981872353, + "learning_rate": 8.99976594679273e-06, + "loss": 0.6414, + "step": 2186 + }, + { + "epoch": 0.8536299765807962, + "grad_norm": 0.6602991797973137, + "learning_rate": 8.998402839735656e-06, + "loss": 0.6495, + "step": 2187 + }, + { + "epoch": 0.8540202966432474, + "grad_norm": 0.6237117779179574, + "learning_rate": 8.997038907874629e-06, + "loss": 0.6287, + "step": 2188 + }, + { + "epoch": 0.8544106167056986, + "grad_norm": 0.5551739397502097, + "learning_rate": 8.995674151491006e-06, + "loss": 0.6463, + "step": 2189 + }, + { + "epoch": 0.8548009367681498, + "grad_norm": 0.746016230216196, + "learning_rate": 8.994308570866313e-06, + "loss": 0.6417, + "step": 2190 + }, + { + "epoch": 0.855191256830601, + "grad_norm": 0.5721886202945357, + "learning_rate": 8.992942166282246e-06, + "loss": 0.6292, + "step": 2191 + }, + { + "epoch": 0.8555815768930523, + "grad_norm": 0.636758756107473, + "learning_rate": 8.99157493802067e-06, + "loss": 0.6325, + "step": 2192 + }, + { + "epoch": 0.8559718969555035, + "grad_norm": 0.7126016464113711, + "learning_rate": 8.990206886363627e-06, + "loss": 0.6357, + "step": 2193 + }, + { + "epoch": 0.8563622170179547, + "grad_norm": 0.6129362710711354, + "learning_rate": 8.988838011593318e-06, + "loss": 0.6402, + "step": 2194 + }, + { + "epoch": 0.8567525370804059, + "grad_norm": 0.8297088230071441, + "learning_rate": 8.98746831399212e-06, + "loss": 0.6341, + "step": 2195 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.6666954113252036, + "learning_rate": 8.986097793842579e-06, + "loss": 0.6876, + "step": 2196 + }, + { + "epoch": 0.8575331772053083, + "grad_norm": 0.7404828506658659, + "learning_rate": 8.984726451427412e-06, + "loss": 0.6356, + "step": 2197 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.7775596079630407, + "learning_rate": 8.983354287029503e-06, + "loss": 0.6538, + "step": 2198 + }, + { + "epoch": 0.8583138173302107, + "grad_norm": 0.595243546470406, + "learning_rate": 8.981981300931907e-06, + "loss": 0.6527, + "step": 2199 + }, + { + "epoch": 0.8587041373926619, + "grad_norm": 0.7439561816662377, + "learning_rate": 8.980607493417848e-06, + "loss": 0.6365, + "step": 2200 + }, + { + "epoch": 0.8590944574551131, + "grad_norm": 0.56847925813601, + "learning_rate": 8.979232864770718e-06, + "loss": 0.6425, + "step": 2201 + }, + { + "epoch": 0.8594847775175644, + "grad_norm": 0.5758400480555921, + "learning_rate": 8.977857415274082e-06, + "loss": 0.595, + "step": 2202 + }, + { + "epoch": 0.8598750975800156, + "grad_norm": 0.7891091309959608, + "learning_rate": 8.976481145211675e-06, + "loss": 0.6451, + "step": 2203 + }, + { + "epoch": 0.8602654176424668, + "grad_norm": 0.5842040382997697, + "learning_rate": 8.975104054867392e-06, + "loss": 0.6593, + "step": 2204 + }, + { + "epoch": 0.860655737704918, + "grad_norm": 0.7475459876558762, + "learning_rate": 8.97372614452531e-06, + "loss": 0.6306, + "step": 2205 + }, + { + "epoch": 0.8610460577673692, + "grad_norm": 0.6081416342473784, + "learning_rate": 8.972347414469664e-06, + "loss": 0.6373, + "step": 2206 + }, + { + "epoch": 0.8614363778298204, + "grad_norm": 0.6224303349071021, + "learning_rate": 8.970967864984868e-06, + "loss": 0.6268, + "step": 2207 + }, + { + "epoch": 0.8618266978922716, + "grad_norm": 0.6712155979911865, + "learning_rate": 8.969587496355494e-06, + "loss": 0.6359, + "step": 2208 + }, + { + "epoch": 0.8622170179547228, + "grad_norm": 0.7229887499652747, + "learning_rate": 8.968206308866295e-06, + "loss": 0.6657, + "step": 2209 + }, + { + "epoch": 0.862607338017174, + "grad_norm": 0.6437878072073459, + "learning_rate": 8.966824302802184e-06, + "loss": 0.6141, + "step": 2210 + }, + { + "epoch": 0.8629976580796253, + "grad_norm": 0.6980213030247582, + "learning_rate": 8.965441478448247e-06, + "loss": 0.6049, + "step": 2211 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 0.6735215492075837, + "learning_rate": 8.964057836089736e-06, + "loss": 0.6254, + "step": 2212 + }, + { + "epoch": 0.8637782982045277, + "grad_norm": 0.7712448448922414, + "learning_rate": 8.962673376012075e-06, + "loss": 0.6783, + "step": 2213 + }, + { + "epoch": 0.8641686182669789, + "grad_norm": 0.7518669023701874, + "learning_rate": 8.961288098500854e-06, + "loss": 0.6512, + "step": 2214 + }, + { + "epoch": 0.8645589383294301, + "grad_norm": 0.699075398008806, + "learning_rate": 8.959902003841832e-06, + "loss": 0.6369, + "step": 2215 + }, + { + "epoch": 0.8649492583918813, + "grad_norm": 0.6164531009486857, + "learning_rate": 8.95851509232094e-06, + "loss": 0.642, + "step": 2216 + }, + { + "epoch": 0.8653395784543325, + "grad_norm": 0.7912526919819736, + "learning_rate": 8.957127364224274e-06, + "loss": 0.6056, + "step": 2217 + }, + { + "epoch": 0.8657298985167837, + "grad_norm": 0.7181857692561887, + "learning_rate": 8.955738819838098e-06, + "loss": 0.613, + "step": 2218 + }, + { + "epoch": 0.8661202185792349, + "grad_norm": 0.629067913809127, + "learning_rate": 8.954349459448845e-06, + "loss": 0.5881, + "step": 2219 + }, + { + "epoch": 0.8665105386416861, + "grad_norm": 1.0545188603001099, + "learning_rate": 8.952959283343119e-06, + "loss": 0.6498, + "step": 2220 + }, + { + "epoch": 0.8669008587041374, + "grad_norm": 0.6462630293843467, + "learning_rate": 8.951568291807689e-06, + "loss": 0.6288, + "step": 2221 + }, + { + "epoch": 0.8672911787665886, + "grad_norm": 0.8452304044412502, + "learning_rate": 8.950176485129491e-06, + "loss": 0.643, + "step": 2222 + }, + { + "epoch": 0.8676814988290398, + "grad_norm": 0.8081332495246685, + "learning_rate": 8.948783863595636e-06, + "loss": 0.6283, + "step": 2223 + }, + { + "epoch": 0.868071818891491, + "grad_norm": 0.6698698466716861, + "learning_rate": 8.9473904274934e-06, + "loss": 0.6459, + "step": 2224 + }, + { + "epoch": 0.8684621389539422, + "grad_norm": 0.8052798892729859, + "learning_rate": 8.945996177110216e-06, + "loss": 0.6001, + "step": 2225 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 0.5659223193136297, + "learning_rate": 8.944601112733706e-06, + "loss": 0.5812, + "step": 2226 + }, + { + "epoch": 0.8692427790788446, + "grad_norm": 0.7622540682026467, + "learning_rate": 8.943205234651641e-06, + "loss": 0.6178, + "step": 2227 + }, + { + "epoch": 0.8696330991412958, + "grad_norm": 0.5489733479591743, + "learning_rate": 8.941808543151971e-06, + "loss": 0.6534, + "step": 2228 + }, + { + "epoch": 0.870023419203747, + "grad_norm": 0.5905741727707969, + "learning_rate": 8.94041103852281e-06, + "loss": 0.5969, + "step": 2229 + }, + { + "epoch": 0.8704137392661982, + "grad_norm": 0.559271056148922, + "learning_rate": 8.939012721052437e-06, + "loss": 0.6369, + "step": 2230 + }, + { + "epoch": 0.8708040593286495, + "grad_norm": 0.5815300505559927, + "learning_rate": 8.937613591029305e-06, + "loss": 0.6223, + "step": 2231 + }, + { + "epoch": 0.8711943793911007, + "grad_norm": 0.5672338315902488, + "learning_rate": 8.936213648742031e-06, + "loss": 0.6507, + "step": 2232 + }, + { + "epoch": 0.8715846994535519, + "grad_norm": 0.5977781269297618, + "learning_rate": 8.934812894479397e-06, + "loss": 0.6324, + "step": 2233 + }, + { + "epoch": 0.8719750195160031, + "grad_norm": 0.6381483969271777, + "learning_rate": 8.93341132853036e-06, + "loss": 0.6275, + "step": 2234 + }, + { + "epoch": 0.8723653395784543, + "grad_norm": 0.5654107332447501, + "learning_rate": 8.932008951184032e-06, + "loss": 0.5982, + "step": 2235 + }, + { + "epoch": 0.8727556596409055, + "grad_norm": 0.5459283713384164, + "learning_rate": 8.930605762729709e-06, + "loss": 0.6384, + "step": 2236 + }, + { + "epoch": 0.8731459797033567, + "grad_norm": 0.5937019512363494, + "learning_rate": 8.92920176345684e-06, + "loss": 0.6296, + "step": 2237 + }, + { + "epoch": 0.8735362997658079, + "grad_norm": 0.5849216519249925, + "learning_rate": 8.927796953655048e-06, + "loss": 0.6325, + "step": 2238 + }, + { + "epoch": 0.8739266198282591, + "grad_norm": 0.6036946074521437, + "learning_rate": 8.926391333614121e-06, + "loss": 0.597, + "step": 2239 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.7063379911505743, + "learning_rate": 8.924984903624018e-06, + "loss": 0.6713, + "step": 2240 + }, + { + "epoch": 0.8747072599531616, + "grad_norm": 0.6398463120974799, + "learning_rate": 8.923577663974859e-06, + "loss": 0.6313, + "step": 2241 + }, + { + "epoch": 0.8750975800156128, + "grad_norm": 0.5983771114106218, + "learning_rate": 8.922169614956935e-06, + "loss": 0.6341, + "step": 2242 + }, + { + "epoch": 0.875487900078064, + "grad_norm": 0.8222658355982152, + "learning_rate": 8.920760756860703e-06, + "loss": 0.6391, + "step": 2243 + }, + { + "epoch": 0.8758782201405152, + "grad_norm": 0.6622127270314245, + "learning_rate": 8.919351089976789e-06, + "loss": 0.6391, + "step": 2244 + }, + { + "epoch": 0.8762685402029664, + "grad_norm": 0.6657427746809286, + "learning_rate": 8.917940614595979e-06, + "loss": 0.6388, + "step": 2245 + }, + { + "epoch": 0.8766588602654176, + "grad_norm": 0.6783469190442335, + "learning_rate": 8.916529331009235e-06, + "loss": 0.6272, + "step": 2246 + }, + { + "epoch": 0.8770491803278688, + "grad_norm": 0.6664700918744969, + "learning_rate": 8.91511723950768e-06, + "loss": 0.6223, + "step": 2247 + }, + { + "epoch": 0.87743950039032, + "grad_norm": 0.5762868468812764, + "learning_rate": 8.913704340382606e-06, + "loss": 0.6495, + "step": 2248 + }, + { + "epoch": 0.8778298204527712, + "grad_norm": 0.7131146018932732, + "learning_rate": 8.912290633925469e-06, + "loss": 0.6343, + "step": 2249 + }, + { + "epoch": 0.8782201405152225, + "grad_norm": 0.6885987783842422, + "learning_rate": 8.91087612042789e-06, + "loss": 0.5884, + "step": 2250 + }, + { + "epoch": 0.8786104605776737, + "grad_norm": 0.6176264386267658, + "learning_rate": 8.909460800181668e-06, + "loss": 0.6766, + "step": 2251 + }, + { + "epoch": 0.8790007806401249, + "grad_norm": 0.63552261674698, + "learning_rate": 8.908044673478753e-06, + "loss": 0.627, + "step": 2252 + }, + { + "epoch": 0.8793911007025761, + "grad_norm": 0.6781680329265599, + "learning_rate": 8.906627740611271e-06, + "loss": 0.6581, + "step": 2253 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.5953069189469609, + "learning_rate": 8.90521000187151e-06, + "loss": 0.6575, + "step": 2254 + }, + { + "epoch": 0.8801717408274785, + "grad_norm": 0.5638202969198125, + "learning_rate": 8.90379145755193e-06, + "loss": 0.6104, + "step": 2255 + }, + { + "epoch": 0.8805620608899297, + "grad_norm": 0.641100811849981, + "learning_rate": 8.902372107945147e-06, + "loss": 0.6586, + "step": 2256 + }, + { + "epoch": 0.8809523809523809, + "grad_norm": 0.5637460880532857, + "learning_rate": 8.900951953343953e-06, + "loss": 0.6242, + "step": 2257 + }, + { + "epoch": 0.8813427010148321, + "grad_norm": 0.7388332254423146, + "learning_rate": 8.899530994041303e-06, + "loss": 0.6062, + "step": 2258 + }, + { + "epoch": 0.8817330210772834, + "grad_norm": 0.619220485298252, + "learning_rate": 8.898109230330315e-06, + "loss": 0.6376, + "step": 2259 + }, + { + "epoch": 0.8821233411397346, + "grad_norm": 0.6691323079322126, + "learning_rate": 8.896686662504278e-06, + "loss": 0.6108, + "step": 2260 + }, + { + "epoch": 0.8825136612021858, + "grad_norm": 0.7406214447099446, + "learning_rate": 8.89526329085664e-06, + "loss": 0.6414, + "step": 2261 + }, + { + "epoch": 0.882903981264637, + "grad_norm": 0.6793028988412385, + "learning_rate": 8.89383911568102e-06, + "loss": 0.6687, + "step": 2262 + }, + { + "epoch": 0.8832943013270882, + "grad_norm": 0.6410652052011068, + "learning_rate": 8.892414137271204e-06, + "loss": 0.6144, + "step": 2263 + }, + { + "epoch": 0.8836846213895394, + "grad_norm": 0.706586879121124, + "learning_rate": 8.890988355921141e-06, + "loss": 0.6391, + "step": 2264 + }, + { + "epoch": 0.8840749414519906, + "grad_norm": 0.6904076222381111, + "learning_rate": 8.889561771924944e-06, + "loss": 0.6019, + "step": 2265 + }, + { + "epoch": 0.8844652615144418, + "grad_norm": 0.6714503448179678, + "learning_rate": 8.888134385576894e-06, + "loss": 0.6289, + "step": 2266 + }, + { + "epoch": 0.884855581576893, + "grad_norm": 0.646022577661473, + "learning_rate": 8.886706197171439e-06, + "loss": 0.6317, + "step": 2267 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.6689012212952377, + "learning_rate": 8.885277207003189e-06, + "loss": 0.6281, + "step": 2268 + }, + { + "epoch": 0.8856362217017955, + "grad_norm": 0.7029215911381783, + "learning_rate": 8.88384741536692e-06, + "loss": 0.6509, + "step": 2269 + }, + { + "epoch": 0.8860265417642467, + "grad_norm": 0.7487735362391077, + "learning_rate": 8.882416822557573e-06, + "loss": 0.6633, + "step": 2270 + }, + { + "epoch": 0.8864168618266979, + "grad_norm": 0.7421885754135716, + "learning_rate": 8.880985428870261e-06, + "loss": 0.6426, + "step": 2271 + }, + { + "epoch": 0.8868071818891491, + "grad_norm": 0.6284714891812797, + "learning_rate": 8.87955323460025e-06, + "loss": 0.6303, + "step": 2272 + }, + { + "epoch": 0.8871975019516003, + "grad_norm": 0.7101868365205908, + "learning_rate": 8.878120240042985e-06, + "loss": 0.6243, + "step": 2273 + }, + { + "epoch": 0.8875878220140515, + "grad_norm": 0.6657606690236898, + "learning_rate": 8.876686445494064e-06, + "loss": 0.6004, + "step": 2274 + }, + { + "epoch": 0.8879781420765027, + "grad_norm": 0.7269472992295111, + "learning_rate": 8.875251851249256e-06, + "loss": 0.616, + "step": 2275 + }, + { + "epoch": 0.8883684621389539, + "grad_norm": 0.5812582592268739, + "learning_rate": 8.873816457604491e-06, + "loss": 0.6682, + "step": 2276 + }, + { + "epoch": 0.8887587822014051, + "grad_norm": 0.700920636626721, + "learning_rate": 8.872380264855873e-06, + "loss": 0.6334, + "step": 2277 + }, + { + "epoch": 0.8891491022638564, + "grad_norm": 0.6345248091685078, + "learning_rate": 8.87094327329966e-06, + "loss": 0.6554, + "step": 2278 + }, + { + "epoch": 0.8895394223263076, + "grad_norm": 0.6491948163655946, + "learning_rate": 8.869505483232282e-06, + "loss": 0.6452, + "step": 2279 + }, + { + "epoch": 0.8899297423887588, + "grad_norm": 0.581302251972323, + "learning_rate": 8.868066894950329e-06, + "loss": 0.6038, + "step": 2280 + }, + { + "epoch": 0.89032006245121, + "grad_norm": 0.5662237213507013, + "learning_rate": 8.86662750875056e-06, + "loss": 0.6089, + "step": 2281 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.6391809369699456, + "learning_rate": 8.865187324929894e-06, + "loss": 0.5762, + "step": 2282 + }, + { + "epoch": 0.8911007025761124, + "grad_norm": 0.5453343009961606, + "learning_rate": 8.863746343785417e-06, + "loss": 0.6071, + "step": 2283 + }, + { + "epoch": 0.8914910226385636, + "grad_norm": 0.5597081209469605, + "learning_rate": 8.862304565614383e-06, + "loss": 0.6024, + "step": 2284 + }, + { + "epoch": 0.8918813427010148, + "grad_norm": 0.7161131951068582, + "learning_rate": 8.860861990714204e-06, + "loss": 0.6026, + "step": 2285 + }, + { + "epoch": 0.892271662763466, + "grad_norm": 0.5841977182550848, + "learning_rate": 8.859418619382458e-06, + "loss": 0.623, + "step": 2286 + }, + { + "epoch": 0.8926619828259172, + "grad_norm": 0.5420460264276612, + "learning_rate": 8.857974451916893e-06, + "loss": 0.6525, + "step": 2287 + }, + { + "epoch": 0.8930523028883685, + "grad_norm": 0.66348783530068, + "learning_rate": 8.856529488615411e-06, + "loss": 0.6155, + "step": 2288 + }, + { + "epoch": 0.8934426229508197, + "grad_norm": 0.5604052327668131, + "learning_rate": 8.855083729776089e-06, + "loss": 0.6163, + "step": 2289 + }, + { + "epoch": 0.8938329430132709, + "grad_norm": 0.6688304394773911, + "learning_rate": 8.853637175697158e-06, + "loss": 0.6442, + "step": 2290 + }, + { + "epoch": 0.8942232630757221, + "grad_norm": 0.6093963167476271, + "learning_rate": 8.852189826677024e-06, + "loss": 0.6434, + "step": 2291 + }, + { + "epoch": 0.8946135831381733, + "grad_norm": 0.6021354377644929, + "learning_rate": 8.850741683014245e-06, + "loss": 0.6313, + "step": 2292 + }, + { + "epoch": 0.8950039032006245, + "grad_norm": 0.6640201400247183, + "learning_rate": 8.849292745007554e-06, + "loss": 0.6281, + "step": 2293 + }, + { + "epoch": 0.8953942232630757, + "grad_norm": 0.6304763075036358, + "learning_rate": 8.84784301295584e-06, + "loss": 0.6614, + "step": 2294 + }, + { + "epoch": 0.8957845433255269, + "grad_norm": 0.5952412781692343, + "learning_rate": 8.846392487158157e-06, + "loss": 0.5817, + "step": 2295 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 0.6762592718464694, + "learning_rate": 8.844941167913727e-06, + "loss": 0.6292, + "step": 2296 + }, + { + "epoch": 0.8965651834504293, + "grad_norm": 0.6598214443760806, + "learning_rate": 8.843489055521933e-06, + "loss": 0.6548, + "step": 2297 + }, + { + "epoch": 0.8969555035128806, + "grad_norm": 0.5882283570207303, + "learning_rate": 8.842036150282321e-06, + "loss": 0.6594, + "step": 2298 + }, + { + "epoch": 0.8973458235753318, + "grad_norm": 0.6251897475106943, + "learning_rate": 8.840582452494601e-06, + "loss": 0.6313, + "step": 2299 + }, + { + "epoch": 0.897736143637783, + "grad_norm": 0.7505602095445375, + "learning_rate": 8.839127962458647e-06, + "loss": 0.6641, + "step": 2300 + }, + { + "epoch": 0.8981264637002342, + "grad_norm": 0.5926452298943417, + "learning_rate": 8.837672680474496e-06, + "loss": 0.6308, + "step": 2301 + }, + { + "epoch": 0.8985167837626854, + "grad_norm": 0.8195728898084587, + "learning_rate": 8.83621660684235e-06, + "loss": 0.6317, + "step": 2302 + }, + { + "epoch": 0.8989071038251366, + "grad_norm": 0.6396840328848871, + "learning_rate": 8.834759741862568e-06, + "loss": 0.6005, + "step": 2303 + }, + { + "epoch": 0.8992974238875878, + "grad_norm": 0.6758079620942838, + "learning_rate": 8.833302085835682e-06, + "loss": 0.6069, + "step": 2304 + }, + { + "epoch": 0.899687743950039, + "grad_norm": 0.7636834353942352, + "learning_rate": 8.83184363906238e-06, + "loss": 0.6363, + "step": 2305 + }, + { + "epoch": 0.9000780640124902, + "grad_norm": 0.6713756828628334, + "learning_rate": 8.830384401843515e-06, + "loss": 0.6074, + "step": 2306 + }, + { + "epoch": 0.9004683840749415, + "grad_norm": 0.7892795881276362, + "learning_rate": 8.828924374480105e-06, + "loss": 0.6269, + "step": 2307 + }, + { + "epoch": 0.9008587041373927, + "grad_norm": 0.6412873713095169, + "learning_rate": 8.827463557273329e-06, + "loss": 0.6579, + "step": 2308 + }, + { + "epoch": 0.9012490241998439, + "grad_norm": 0.643016974776236, + "learning_rate": 8.826001950524525e-06, + "loss": 0.6245, + "step": 2309 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.704532599747877, + "learning_rate": 8.824539554535205e-06, + "loss": 0.6328, + "step": 2310 + }, + { + "epoch": 0.9020296643247463, + "grad_norm": 0.8031155300024702, + "learning_rate": 8.823076369607033e-06, + "loss": 0.6313, + "step": 2311 + }, + { + "epoch": 0.9024199843871975, + "grad_norm": 0.5836526051549599, + "learning_rate": 8.821612396041838e-06, + "loss": 0.6096, + "step": 2312 + }, + { + "epoch": 0.9028103044496487, + "grad_norm": 0.7120110123847702, + "learning_rate": 8.820147634141618e-06, + "loss": 0.6072, + "step": 2313 + }, + { + "epoch": 0.9032006245120999, + "grad_norm": 0.6224476363841157, + "learning_rate": 8.818682084208527e-06, + "loss": 0.6023, + "step": 2314 + }, + { + "epoch": 0.9035909445745511, + "grad_norm": 0.6701027870017388, + "learning_rate": 8.817215746544882e-06, + "loss": 0.5744, + "step": 2315 + }, + { + "epoch": 0.9039812646370023, + "grad_norm": 0.785541174489812, + "learning_rate": 8.815748621453166e-06, + "loss": 0.6623, + "step": 2316 + }, + { + "epoch": 0.9043715846994536, + "grad_norm": 0.7820076990619585, + "learning_rate": 8.814280709236023e-06, + "loss": 0.6318, + "step": 2317 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.8301012465786665, + "learning_rate": 8.812812010196257e-06, + "loss": 0.6585, + "step": 2318 + }, + { + "epoch": 0.905152224824356, + "grad_norm": 0.6795266266801518, + "learning_rate": 8.811342524636835e-06, + "loss": 0.6485, + "step": 2319 + }, + { + "epoch": 0.9055425448868072, + "grad_norm": 0.8595180160509333, + "learning_rate": 8.809872252860892e-06, + "loss": 0.6541, + "step": 2320 + }, + { + "epoch": 0.9059328649492584, + "grad_norm": 0.8024362322769937, + "learning_rate": 8.808401195171715e-06, + "loss": 0.6709, + "step": 2321 + }, + { + "epoch": 0.9063231850117096, + "grad_norm": 0.7385734884086933, + "learning_rate": 8.806929351872765e-06, + "loss": 0.6134, + "step": 2322 + }, + { + "epoch": 0.9067135050741608, + "grad_norm": 0.8287176314677838, + "learning_rate": 8.805456723267651e-06, + "loss": 0.6584, + "step": 2323 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 0.710823137430647, + "learning_rate": 8.803983309660159e-06, + "loss": 0.64, + "step": 2324 + }, + { + "epoch": 0.9074941451990632, + "grad_norm": 0.8701555249438384, + "learning_rate": 8.802509111354226e-06, + "loss": 0.6422, + "step": 2325 + }, + { + "epoch": 0.9078844652615145, + "grad_norm": 0.6854695294079272, + "learning_rate": 8.801034128653956e-06, + "loss": 0.6331, + "step": 2326 + }, + { + "epoch": 0.9082747853239657, + "grad_norm": 0.7314312159744193, + "learning_rate": 8.799558361863612e-06, + "loss": 0.6701, + "step": 2327 + }, + { + "epoch": 0.9086651053864169, + "grad_norm": 0.6597807877211089, + "learning_rate": 8.798081811287623e-06, + "loss": 0.615, + "step": 2328 + }, + { + "epoch": 0.9090554254488681, + "grad_norm": 0.6447904630914802, + "learning_rate": 8.79660447723057e-06, + "loss": 0.6173, + "step": 2329 + }, + { + "epoch": 0.9094457455113193, + "grad_norm": 0.7513791600002289, + "learning_rate": 8.795126359997211e-06, + "loss": 0.6614, + "step": 2330 + }, + { + "epoch": 0.9098360655737705, + "grad_norm": 0.6729478189486753, + "learning_rate": 8.793647459892455e-06, + "loss": 0.6715, + "step": 2331 + }, + { + "epoch": 0.9102263856362217, + "grad_norm": 0.6976236690777164, + "learning_rate": 8.792167777221368e-06, + "loss": 0.6443, + "step": 2332 + }, + { + "epoch": 0.9106167056986729, + "grad_norm": 0.7064975585262241, + "learning_rate": 8.79068731228919e-06, + "loss": 0.6302, + "step": 2333 + }, + { + "epoch": 0.9110070257611241, + "grad_norm": 0.6303557053437642, + "learning_rate": 8.789206065401315e-06, + "loss": 0.6225, + "step": 2334 + }, + { + "epoch": 0.9113973458235753, + "grad_norm": 0.7582556510097629, + "learning_rate": 8.787724036863299e-06, + "loss": 0.6173, + "step": 2335 + }, + { + "epoch": 0.9117876658860266, + "grad_norm": 0.7330116319012463, + "learning_rate": 8.78624122698086e-06, + "loss": 0.6138, + "step": 2336 + }, + { + "epoch": 0.9121779859484778, + "grad_norm": 0.6107511151841641, + "learning_rate": 8.784757636059878e-06, + "loss": 0.6388, + "step": 2337 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.7913432059592349, + "learning_rate": 8.783273264406394e-06, + "loss": 0.602, + "step": 2338 + }, + { + "epoch": 0.9129586260733802, + "grad_norm": 0.6493370497697616, + "learning_rate": 8.781788112326603e-06, + "loss": 0.6578, + "step": 2339 + }, + { + "epoch": 0.9133489461358314, + "grad_norm": 0.653362241096989, + "learning_rate": 8.780302180126876e-06, + "loss": 0.6184, + "step": 2340 + }, + { + "epoch": 0.9137392661982826, + "grad_norm": 0.6392809303881302, + "learning_rate": 8.778815468113728e-06, + "loss": 0.6224, + "step": 2341 + }, + { + "epoch": 0.9141295862607338, + "grad_norm": 0.7095452365001299, + "learning_rate": 8.777327976593849e-06, + "loss": 0.6155, + "step": 2342 + }, + { + "epoch": 0.914519906323185, + "grad_norm": 0.8193866165978759, + "learning_rate": 8.775839705874082e-06, + "loss": 0.6262, + "step": 2343 + }, + { + "epoch": 0.9149102263856362, + "grad_norm": 0.6578062896446141, + "learning_rate": 8.774350656261427e-06, + "loss": 0.6162, + "step": 2344 + }, + { + "epoch": 0.9153005464480874, + "grad_norm": 0.7217746490002632, + "learning_rate": 8.772860828063061e-06, + "loss": 0.6496, + "step": 2345 + }, + { + "epoch": 0.9156908665105387, + "grad_norm": 0.7306518462177092, + "learning_rate": 8.771370221586303e-06, + "loss": 0.6113, + "step": 2346 + }, + { + "epoch": 0.9160811865729899, + "grad_norm": 0.7099196356492832, + "learning_rate": 8.769878837138639e-06, + "loss": 0.6473, + "step": 2347 + }, + { + "epoch": 0.9164715066354411, + "grad_norm": 0.6416738355394414, + "learning_rate": 8.768386675027722e-06, + "loss": 0.6425, + "step": 2348 + }, + { + "epoch": 0.9168618266978923, + "grad_norm": 0.7899718047022954, + "learning_rate": 8.766893735561358e-06, + "loss": 0.643, + "step": 2349 + }, + { + "epoch": 0.9172521467603435, + "grad_norm": 0.6512573213394172, + "learning_rate": 8.765400019047517e-06, + "loss": 0.6239, + "step": 2350 + }, + { + "epoch": 0.9176424668227947, + "grad_norm": 0.7611849074220737, + "learning_rate": 8.763905525794324e-06, + "loss": 0.6667, + "step": 2351 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.7490259395908131, + "learning_rate": 8.762410256110073e-06, + "loss": 0.6183, + "step": 2352 + }, + { + "epoch": 0.9184231069476971, + "grad_norm": 0.6073930593657646, + "learning_rate": 8.760914210303208e-06, + "loss": 0.6054, + "step": 2353 + }, + { + "epoch": 0.9188134270101483, + "grad_norm": 0.7066324708046332, + "learning_rate": 8.759417388682343e-06, + "loss": 0.6646, + "step": 2354 + }, + { + "epoch": 0.9192037470725996, + "grad_norm": 0.652950122091572, + "learning_rate": 8.757919791556245e-06, + "loss": 0.6098, + "step": 2355 + }, + { + "epoch": 0.9195940671350508, + "grad_norm": 0.6547540573189381, + "learning_rate": 8.756421419233843e-06, + "loss": 0.6589, + "step": 2356 + }, + { + "epoch": 0.919984387197502, + "grad_norm": 0.5965223353064963, + "learning_rate": 8.75492227202423e-06, + "loss": 0.611, + "step": 2357 + }, + { + "epoch": 0.9203747072599532, + "grad_norm": 0.5951758282763305, + "learning_rate": 8.753422350236648e-06, + "loss": 0.6637, + "step": 2358 + }, + { + "epoch": 0.9207650273224044, + "grad_norm": 0.631709188048072, + "learning_rate": 8.751921654180514e-06, + "loss": 0.6165, + "step": 2359 + }, + { + "epoch": 0.9211553473848556, + "grad_norm": 0.6674171143530749, + "learning_rate": 8.750420184165391e-06, + "loss": 0.6707, + "step": 2360 + }, + { + "epoch": 0.9215456674473068, + "grad_norm": 0.5699117015799282, + "learning_rate": 8.748917940501009e-06, + "loss": 0.6212, + "step": 2361 + }, + { + "epoch": 0.921935987509758, + "grad_norm": 0.6133038345152716, + "learning_rate": 8.747414923497258e-06, + "loss": 0.6414, + "step": 2362 + }, + { + "epoch": 0.9223263075722092, + "grad_norm": 0.5434797809611723, + "learning_rate": 8.74591113346418e-06, + "loss": 0.6396, + "step": 2363 + }, + { + "epoch": 0.9227166276346604, + "grad_norm": 0.6141671980800543, + "learning_rate": 8.744406570711988e-06, + "loss": 0.6624, + "step": 2364 + }, + { + "epoch": 0.9231069476971117, + "grad_norm": 0.5708882837556286, + "learning_rate": 8.742901235551042e-06, + "loss": 0.6377, + "step": 2365 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.5837904553586929, + "learning_rate": 8.741395128291875e-06, + "loss": 0.5922, + "step": 2366 + }, + { + "epoch": 0.9238875878220141, + "grad_norm": 0.5543726030334832, + "learning_rate": 8.739888249245165e-06, + "loss": 0.6313, + "step": 2367 + }, + { + "epoch": 0.9242779078844653, + "grad_norm": 0.6114035838820514, + "learning_rate": 8.73838059872176e-06, + "loss": 0.6203, + "step": 2368 + }, + { + "epoch": 0.9246682279469165, + "grad_norm": 0.6448092468482169, + "learning_rate": 8.736872177032662e-06, + "loss": 0.6458, + "step": 2369 + }, + { + "epoch": 0.9250585480093677, + "grad_norm": 0.5965244498669232, + "learning_rate": 8.735362984489033e-06, + "loss": 0.6099, + "step": 2370 + }, + { + "epoch": 0.9254488680718189, + "grad_norm": 0.6414770760961345, + "learning_rate": 8.733853021402196e-06, + "loss": 0.6623, + "step": 2371 + }, + { + "epoch": 0.9258391881342701, + "grad_norm": 0.6742906477727244, + "learning_rate": 8.732342288083628e-06, + "loss": 0.6402, + "step": 2372 + }, + { + "epoch": 0.9262295081967213, + "grad_norm": 0.6835930992557546, + "learning_rate": 8.730830784844972e-06, + "loss": 0.6449, + "step": 2373 + }, + { + "epoch": 0.9266198282591726, + "grad_norm": 0.6102677146543014, + "learning_rate": 8.729318511998024e-06, + "loss": 0.6432, + "step": 2374 + }, + { + "epoch": 0.9270101483216238, + "grad_norm": 0.6385024283597064, + "learning_rate": 8.72780546985474e-06, + "loss": 0.6369, + "step": 2375 + }, + { + "epoch": 0.927400468384075, + "grad_norm": 0.6488241930139989, + "learning_rate": 8.726291658727237e-06, + "loss": 0.6581, + "step": 2376 + }, + { + "epoch": 0.9277907884465262, + "grad_norm": 0.559972278824147, + "learning_rate": 8.724777078927786e-06, + "loss": 0.6244, + "step": 2377 + }, + { + "epoch": 0.9281811085089774, + "grad_norm": 0.6127721813815493, + "learning_rate": 8.723261730768824e-06, + "loss": 0.6429, + "step": 2378 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.6684620802265879, + "learning_rate": 8.72174561456294e-06, + "loss": 0.6209, + "step": 2379 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.5856064844465972, + "learning_rate": 8.720228730622882e-06, + "loss": 0.6528, + "step": 2380 + }, + { + "epoch": 0.929352068696331, + "grad_norm": 0.6139969601984101, + "learning_rate": 8.71871107926156e-06, + "loss": 0.6387, + "step": 2381 + }, + { + "epoch": 0.9297423887587822, + "grad_norm": 0.7305392772598028, + "learning_rate": 8.71719266079204e-06, + "loss": 0.6171, + "step": 2382 + }, + { + "epoch": 0.9301327088212334, + "grad_norm": 0.582088029800543, + "learning_rate": 8.715673475527546e-06, + "loss": 0.6186, + "step": 2383 + }, + { + "epoch": 0.9305230288836847, + "grad_norm": 0.6623816530425768, + "learning_rate": 8.71415352378146e-06, + "loss": 0.6223, + "step": 2384 + }, + { + "epoch": 0.9309133489461359, + "grad_norm": 0.6400174928329623, + "learning_rate": 8.712632805867325e-06, + "loss": 0.6214, + "step": 2385 + }, + { + "epoch": 0.9313036690085871, + "grad_norm": 0.6049266317271216, + "learning_rate": 8.711111322098837e-06, + "loss": 0.5994, + "step": 2386 + }, + { + "epoch": 0.9316939890710383, + "grad_norm": 0.6505726760811507, + "learning_rate": 8.709589072789855e-06, + "loss": 0.625, + "step": 2387 + }, + { + "epoch": 0.9320843091334895, + "grad_norm": 0.6292586447585591, + "learning_rate": 8.70806605825439e-06, + "loss": 0.6085, + "step": 2388 + }, + { + "epoch": 0.9324746291959407, + "grad_norm": 0.5771447278818664, + "learning_rate": 8.70654227880662e-06, + "loss": 0.6377, + "step": 2389 + }, + { + "epoch": 0.9328649492583919, + "grad_norm": 0.7907896042064735, + "learning_rate": 8.705017734760872e-06, + "loss": 0.6616, + "step": 2390 + }, + { + "epoch": 0.9332552693208431, + "grad_norm": 0.576433251080246, + "learning_rate": 8.703492426431634e-06, + "loss": 0.5887, + "step": 2391 + }, + { + "epoch": 0.9336455893832943, + "grad_norm": 0.667579301814579, + "learning_rate": 8.701966354133553e-06, + "loss": 0.6228, + "step": 2392 + }, + { + "epoch": 0.9340359094457455, + "grad_norm": 0.5981395956705299, + "learning_rate": 8.700439518181432e-06, + "loss": 0.5888, + "step": 2393 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.5768241049226974, + "learning_rate": 8.698911918890231e-06, + "loss": 0.6236, + "step": 2394 + }, + { + "epoch": 0.934816549570648, + "grad_norm": 0.7005818950441703, + "learning_rate": 8.69738355657507e-06, + "loss": 0.6487, + "step": 2395 + }, + { + "epoch": 0.9352068696330992, + "grad_norm": 0.6404334844995092, + "learning_rate": 8.69585443155122e-06, + "loss": 0.6417, + "step": 2396 + }, + { + "epoch": 0.9355971896955504, + "grad_norm": 0.6262153943480089, + "learning_rate": 8.694324544134121e-06, + "loss": 0.6456, + "step": 2397 + }, + { + "epoch": 0.9359875097580016, + "grad_norm": 0.6581601002735055, + "learning_rate": 8.692793894639358e-06, + "loss": 0.644, + "step": 2398 + }, + { + "epoch": 0.9363778298204528, + "grad_norm": 0.596656413113607, + "learning_rate": 8.691262483382682e-06, + "loss": 0.6375, + "step": 2399 + }, + { + "epoch": 0.936768149882904, + "grad_norm": 0.6223845749362819, + "learning_rate": 8.689730310679996e-06, + "loss": 0.6091, + "step": 2400 + }, + { + "epoch": 0.9371584699453552, + "grad_norm": 0.5930298365038712, + "learning_rate": 8.68819737684736e-06, + "loss": 0.627, + "step": 2401 + }, + { + "epoch": 0.9375487900078064, + "grad_norm": 0.6768651219625447, + "learning_rate": 8.686663682200995e-06, + "loss": 0.6428, + "step": 2402 + }, + { + "epoch": 0.9379391100702577, + "grad_norm": 0.6541953077512614, + "learning_rate": 8.685129227057278e-06, + "loss": 0.6461, + "step": 2403 + }, + { + "epoch": 0.9383294301327089, + "grad_norm": 0.6073987778474133, + "learning_rate": 8.683594011732739e-06, + "loss": 0.6109, + "step": 2404 + }, + { + "epoch": 0.9387197501951601, + "grad_norm": 0.7588668659774773, + "learning_rate": 8.682058036544067e-06, + "loss": 0.6486, + "step": 2405 + }, + { + "epoch": 0.9391100702576113, + "grad_norm": 0.6269546109129939, + "learning_rate": 8.680521301808109e-06, + "loss": 0.6219, + "step": 2406 + }, + { + "epoch": 0.9395003903200625, + "grad_norm": 0.6045152927653076, + "learning_rate": 8.678983807841869e-06, + "loss": 0.6423, + "step": 2407 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 0.6223296925751867, + "learning_rate": 8.677445554962506e-06, + "loss": 0.653, + "step": 2408 + }, + { + "epoch": 0.9402810304449649, + "grad_norm": 0.554764728297825, + "learning_rate": 8.675906543487334e-06, + "loss": 0.637, + "step": 2409 + }, + { + "epoch": 0.9406713505074161, + "grad_norm": 0.6523881989428372, + "learning_rate": 8.67436677373383e-06, + "loss": 0.6702, + "step": 2410 + }, + { + "epoch": 0.9410616705698673, + "grad_norm": 0.6737365390187525, + "learning_rate": 8.672826246019617e-06, + "loss": 0.6545, + "step": 2411 + }, + { + "epoch": 0.9414519906323185, + "grad_norm": 0.604559046198771, + "learning_rate": 8.671284960662482e-06, + "loss": 0.6297, + "step": 2412 + }, + { + "epoch": 0.9418423106947698, + "grad_norm": 0.6074361638475905, + "learning_rate": 8.669742917980369e-06, + "loss": 0.6304, + "step": 2413 + }, + { + "epoch": 0.942232630757221, + "grad_norm": 0.5857290741161745, + "learning_rate": 8.668200118291374e-06, + "loss": 0.6304, + "step": 2414 + }, + { + "epoch": 0.9426229508196722, + "grad_norm": 0.5768840979438418, + "learning_rate": 8.666656561913752e-06, + "loss": 0.6303, + "step": 2415 + }, + { + "epoch": 0.9430132708821234, + "grad_norm": 0.6062349441021987, + "learning_rate": 8.66511224916591e-06, + "loss": 0.6174, + "step": 2416 + }, + { + "epoch": 0.9434035909445746, + "grad_norm": 0.5541508594941087, + "learning_rate": 8.663567180366417e-06, + "loss": 0.6214, + "step": 2417 + }, + { + "epoch": 0.9437939110070258, + "grad_norm": 0.562214773184354, + "learning_rate": 8.662021355833994e-06, + "loss": 0.6126, + "step": 2418 + }, + { + "epoch": 0.944184231069477, + "grad_norm": 0.5673324677249885, + "learning_rate": 8.660474775887521e-06, + "loss": 0.6084, + "step": 2419 + }, + { + "epoch": 0.9445745511319282, + "grad_norm": 0.5475049048594036, + "learning_rate": 8.658927440846027e-06, + "loss": 0.6358, + "step": 2420 + }, + { + "epoch": 0.9449648711943794, + "grad_norm": 0.5765245216296633, + "learning_rate": 8.657379351028706e-06, + "loss": 0.6256, + "step": 2421 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.5766451327607038, + "learning_rate": 8.6558305067549e-06, + "loss": 0.6344, + "step": 2422 + }, + { + "epoch": 0.9457455113192819, + "grad_norm": 0.5818622245662324, + "learning_rate": 8.654280908344112e-06, + "loss": 0.6338, + "step": 2423 + }, + { + "epoch": 0.9461358313817331, + "grad_norm": 0.5111843052647809, + "learning_rate": 8.652730556115997e-06, + "loss": 0.6106, + "step": 2424 + }, + { + "epoch": 0.9465261514441843, + "grad_norm": 0.5011546772742616, + "learning_rate": 8.651179450390366e-06, + "loss": 0.6293, + "step": 2425 + }, + { + "epoch": 0.9469164715066355, + "grad_norm": 0.6323985944501513, + "learning_rate": 8.64962759148719e-06, + "loss": 0.6304, + "step": 2426 + }, + { + "epoch": 0.9473067915690867, + "grad_norm": 0.5131512199834335, + "learning_rate": 8.648074979726588e-06, + "loss": 0.5921, + "step": 2427 + }, + { + "epoch": 0.9476971116315379, + "grad_norm": 0.5789982578005937, + "learning_rate": 8.646521615428837e-06, + "loss": 0.6011, + "step": 2428 + }, + { + "epoch": 0.9480874316939891, + "grad_norm": 0.595367209913602, + "learning_rate": 8.644967498914375e-06, + "loss": 0.6228, + "step": 2429 + }, + { + "epoch": 0.9484777517564403, + "grad_norm": 0.6351230581251839, + "learning_rate": 8.64341263050379e-06, + "loss": 0.6438, + "step": 2430 + }, + { + "epoch": 0.9488680718188915, + "grad_norm": 0.5981467237787822, + "learning_rate": 8.641857010517818e-06, + "loss": 0.6044, + "step": 2431 + }, + { + "epoch": 0.9492583918813428, + "grad_norm": 0.7445691963136913, + "learning_rate": 8.640300639277367e-06, + "loss": 0.6358, + "step": 2432 + }, + { + "epoch": 0.949648711943794, + "grad_norm": 0.5261380264071341, + "learning_rate": 8.638743517103486e-06, + "loss": 0.5958, + "step": 2433 + }, + { + "epoch": 0.9500390320062451, + "grad_norm": 0.5929527758121894, + "learning_rate": 8.63718564431738e-06, + "loss": 0.6002, + "step": 2434 + }, + { + "epoch": 0.9504293520686963, + "grad_norm": 0.8132966643562352, + "learning_rate": 8.63562702124042e-06, + "loss": 0.6743, + "step": 2435 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.5617374896603106, + "learning_rate": 8.634067648194118e-06, + "loss": 0.6441, + "step": 2436 + }, + { + "epoch": 0.9512099921935987, + "grad_norm": 0.7187629799315687, + "learning_rate": 8.632507525500148e-06, + "loss": 0.6505, + "step": 2437 + }, + { + "epoch": 0.9516003122560499, + "grad_norm": 0.7601737022210648, + "learning_rate": 8.630946653480338e-06, + "loss": 0.6186, + "step": 2438 + }, + { + "epoch": 0.9519906323185011, + "grad_norm": 0.6657922616685251, + "learning_rate": 8.62938503245667e-06, + "loss": 0.6192, + "step": 2439 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.7074440490149125, + "learning_rate": 8.627822662751281e-06, + "loss": 0.6535, + "step": 2440 + }, + { + "epoch": 0.9527712724434035, + "grad_norm": 0.6370628487452942, + "learning_rate": 8.62625954468646e-06, + "loss": 0.648, + "step": 2441 + }, + { + "epoch": 0.9531615925058547, + "grad_norm": 0.6835137705779321, + "learning_rate": 8.624695678584656e-06, + "loss": 0.64, + "step": 2442 + }, + { + "epoch": 0.953551912568306, + "grad_norm": 0.6119248674451863, + "learning_rate": 8.623131064768463e-06, + "loss": 0.6204, + "step": 2443 + }, + { + "epoch": 0.9539422326307572, + "grad_norm": 0.6398587631551529, + "learning_rate": 8.621565703560638e-06, + "loss": 0.6508, + "step": 2444 + }, + { + "epoch": 0.9543325526932084, + "grad_norm": 0.5648970519531343, + "learning_rate": 8.61999959528409e-06, + "loss": 0.6454, + "step": 2445 + }, + { + "epoch": 0.9547228727556596, + "grad_norm": 0.6421027917620032, + "learning_rate": 8.618432740261879e-06, + "loss": 0.6573, + "step": 2446 + }, + { + "epoch": 0.9551131928181108, + "grad_norm": 0.5367424541343423, + "learning_rate": 8.61686513881722e-06, + "loss": 0.6393, + "step": 2447 + }, + { + "epoch": 0.955503512880562, + "grad_norm": 0.5323746856287865, + "learning_rate": 8.615296791273488e-06, + "loss": 0.6371, + "step": 2448 + }, + { + "epoch": 0.9558938329430132, + "grad_norm": 0.5541646729526936, + "learning_rate": 8.613727697954202e-06, + "loss": 0.6011, + "step": 2449 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.5776436052569262, + "learning_rate": 8.612157859183043e-06, + "loss": 0.6237, + "step": 2450 + }, + { + "epoch": 0.9566744730679156, + "grad_norm": 0.6133587963492434, + "learning_rate": 8.61058727528384e-06, + "loss": 0.6748, + "step": 2451 + }, + { + "epoch": 0.9570647931303669, + "grad_norm": 0.569544537527333, + "learning_rate": 8.609015946580582e-06, + "loss": 0.6182, + "step": 2452 + }, + { + "epoch": 0.9574551131928181, + "grad_norm": 0.5931706644007292, + "learning_rate": 8.607443873397403e-06, + "loss": 0.6161, + "step": 2453 + }, + { + "epoch": 0.9578454332552693, + "grad_norm": 0.6879997730652382, + "learning_rate": 8.6058710560586e-06, + "loss": 0.6374, + "step": 2454 + }, + { + "epoch": 0.9582357533177205, + "grad_norm": 0.5908403795320154, + "learning_rate": 8.604297494888615e-06, + "loss": 0.6464, + "step": 2455 + }, + { + "epoch": 0.9586260733801717, + "grad_norm": 0.6373982465470188, + "learning_rate": 8.60272319021205e-06, + "loss": 0.6375, + "step": 2456 + }, + { + "epoch": 0.9590163934426229, + "grad_norm": 0.594177066863741, + "learning_rate": 8.60114814235366e-06, + "loss": 0.6237, + "step": 2457 + }, + { + "epoch": 0.9594067135050741, + "grad_norm": 0.6358963081338592, + "learning_rate": 8.599572351638345e-06, + "loss": 0.6061, + "step": 2458 + }, + { + "epoch": 0.9597970335675253, + "grad_norm": 0.6314533593836329, + "learning_rate": 8.597995818391171e-06, + "loss": 0.6212, + "step": 2459 + }, + { + "epoch": 0.9601873536299765, + "grad_norm": 0.6184547892321745, + "learning_rate": 8.596418542937349e-06, + "loss": 0.6401, + "step": 2460 + }, + { + "epoch": 0.9605776736924277, + "grad_norm": 0.7607530241963509, + "learning_rate": 8.594840525602239e-06, + "loss": 0.6475, + "step": 2461 + }, + { + "epoch": 0.960967993754879, + "grad_norm": 0.7032982152168282, + "learning_rate": 8.593261766711367e-06, + "loss": 0.6351, + "step": 2462 + }, + { + "epoch": 0.9613583138173302, + "grad_norm": 0.8070338301919394, + "learning_rate": 8.5916822665904e-06, + "loss": 0.6256, + "step": 2463 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.569884719475257, + "learning_rate": 8.590102025565165e-06, + "loss": 0.6259, + "step": 2464 + }, + { + "epoch": 0.9621389539422326, + "grad_norm": 0.7674278819266752, + "learning_rate": 8.58852104396164e-06, + "loss": 0.6765, + "step": 2465 + }, + { + "epoch": 0.9625292740046838, + "grad_norm": 0.7434239442142461, + "learning_rate": 8.586939322105953e-06, + "loss": 0.6425, + "step": 2466 + }, + { + "epoch": 0.962919594067135, + "grad_norm": 0.6761533159155683, + "learning_rate": 8.585356860324387e-06, + "loss": 0.6159, + "step": 2467 + }, + { + "epoch": 0.9633099141295862, + "grad_norm": 0.7564863424900112, + "learning_rate": 8.58377365894338e-06, + "loss": 0.6274, + "step": 2468 + }, + { + "epoch": 0.9637002341920374, + "grad_norm": 0.5272822269235024, + "learning_rate": 8.582189718289517e-06, + "loss": 0.5876, + "step": 2469 + }, + { + "epoch": 0.9640905542544886, + "grad_norm": 0.7758468735633869, + "learning_rate": 8.58060503868954e-06, + "loss": 0.6259, + "step": 2470 + }, + { + "epoch": 0.9644808743169399, + "grad_norm": 0.6931460576320373, + "learning_rate": 8.579019620470342e-06, + "loss": 0.6227, + "step": 2471 + }, + { + "epoch": 0.9648711943793911, + "grad_norm": 0.6887158738210711, + "learning_rate": 8.577433463958969e-06, + "loss": 0.5861, + "step": 2472 + }, + { + "epoch": 0.9652615144418423, + "grad_norm": 0.780541458227247, + "learning_rate": 8.575846569482618e-06, + "loss": 0.6429, + "step": 2473 + }, + { + "epoch": 0.9656518345042935, + "grad_norm": 0.6942467417271703, + "learning_rate": 8.574258937368639e-06, + "loss": 0.6547, + "step": 2474 + }, + { + "epoch": 0.9660421545667447, + "grad_norm": 0.6949319946014338, + "learning_rate": 8.572670567944532e-06, + "loss": 0.6478, + "step": 2475 + }, + { + "epoch": 0.9664324746291959, + "grad_norm": 0.8397560002275122, + "learning_rate": 8.571081461537956e-06, + "loss": 0.6461, + "step": 2476 + }, + { + "epoch": 0.9668227946916471, + "grad_norm": 0.6918430759631403, + "learning_rate": 8.569491618476712e-06, + "loss": 0.6387, + "step": 2477 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.6713788938267679, + "learning_rate": 8.567901039088764e-06, + "loss": 0.6348, + "step": 2478 + }, + { + "epoch": 0.9676034348165495, + "grad_norm": 0.6430984338991972, + "learning_rate": 8.566309723702217e-06, + "loss": 0.64, + "step": 2479 + }, + { + "epoch": 0.9679937548790007, + "grad_norm": 0.6175121099395556, + "learning_rate": 8.564717672645336e-06, + "loss": 0.6171, + "step": 2480 + }, + { + "epoch": 0.968384074941452, + "grad_norm": 0.707535571928273, + "learning_rate": 8.56312488624653e-06, + "loss": 0.642, + "step": 2481 + }, + { + "epoch": 0.9687743950039032, + "grad_norm": 0.6283633930853836, + "learning_rate": 8.56153136483437e-06, + "loss": 0.6171, + "step": 2482 + }, + { + "epoch": 0.9691647150663544, + "grad_norm": 0.7542753346135206, + "learning_rate": 8.559937108737569e-06, + "loss": 0.6316, + "step": 2483 + }, + { + "epoch": 0.9695550351288056, + "grad_norm": 0.6754121469090508, + "learning_rate": 8.558342118284997e-06, + "loss": 0.625, + "step": 2484 + }, + { + "epoch": 0.9699453551912568, + "grad_norm": 0.6243007625802056, + "learning_rate": 8.556746393805675e-06, + "loss": 0.6447, + "step": 2485 + }, + { + "epoch": 0.970335675253708, + "grad_norm": 0.5970319877436843, + "learning_rate": 8.555149935628773e-06, + "loss": 0.6424, + "step": 2486 + }, + { + "epoch": 0.9707259953161592, + "grad_norm": 0.576197636259531, + "learning_rate": 8.553552744083613e-06, + "loss": 0.6353, + "step": 2487 + }, + { + "epoch": 0.9711163153786104, + "grad_norm": 0.6352343459702465, + "learning_rate": 8.551954819499673e-06, + "loss": 0.6273, + "step": 2488 + }, + { + "epoch": 0.9715066354410616, + "grad_norm": 0.7074113187176208, + "learning_rate": 8.550356162206572e-06, + "loss": 0.6348, + "step": 2489 + }, + { + "epoch": 0.9718969555035128, + "grad_norm": 0.6009495537599685, + "learning_rate": 8.548756772534092e-06, + "loss": 0.6414, + "step": 2490 + }, + { + "epoch": 0.9722872755659641, + "grad_norm": 0.6837871709889732, + "learning_rate": 8.547156650812159e-06, + "loss": 0.5847, + "step": 2491 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.5451197368994618, + "learning_rate": 8.545555797370848e-06, + "loss": 0.6202, + "step": 2492 + }, + { + "epoch": 0.9730679156908665, + "grad_norm": 0.6589560729153833, + "learning_rate": 8.543954212540393e-06, + "loss": 0.6444, + "step": 2493 + }, + { + "epoch": 0.9734582357533177, + "grad_norm": 0.669531942966189, + "learning_rate": 8.54235189665117e-06, + "loss": 0.6209, + "step": 2494 + }, + { + "epoch": 0.9738485558157689, + "grad_norm": 0.609160286194824, + "learning_rate": 8.540748850033715e-06, + "loss": 0.6205, + "step": 2495 + }, + { + "epoch": 0.9742388758782201, + "grad_norm": 0.7031236992333072, + "learning_rate": 8.539145073018706e-06, + "loss": 0.6531, + "step": 2496 + }, + { + "epoch": 0.9746291959406713, + "grad_norm": 0.5615559679542361, + "learning_rate": 8.537540565936976e-06, + "loss": 0.6314, + "step": 2497 + }, + { + "epoch": 0.9750195160031225, + "grad_norm": 0.6297137862621984, + "learning_rate": 8.535935329119512e-06, + "loss": 0.6402, + "step": 2498 + }, + { + "epoch": 0.9754098360655737, + "grad_norm": 0.5876969131919015, + "learning_rate": 8.534329362897443e-06, + "loss": 0.6277, + "step": 2499 + }, + { + "epoch": 0.975800156128025, + "grad_norm": 0.6250865415978033, + "learning_rate": 8.532722667602056e-06, + "loss": 0.6595, + "step": 2500 + }, + { + "epoch": 0.9761904761904762, + "grad_norm": 0.6628554295532381, + "learning_rate": 8.531115243564781e-06, + "loss": 0.5873, + "step": 2501 + }, + { + "epoch": 0.9765807962529274, + "grad_norm": 0.5986528679361922, + "learning_rate": 8.529507091117209e-06, + "loss": 0.6312, + "step": 2502 + }, + { + "epoch": 0.9769711163153786, + "grad_norm": 0.6383550106364533, + "learning_rate": 8.52789821059107e-06, + "loss": 0.6371, + "step": 2503 + }, + { + "epoch": 0.9773614363778298, + "grad_norm": 0.7522045109573912, + "learning_rate": 8.526288602318253e-06, + "loss": 0.6728, + "step": 2504 + }, + { + "epoch": 0.977751756440281, + "grad_norm": 0.7797486650544031, + "learning_rate": 8.524678266630791e-06, + "loss": 0.6809, + "step": 2505 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.5895909380057899, + "learning_rate": 8.52306720386087e-06, + "loss": 0.6606, + "step": 2506 + }, + { + "epoch": 0.9785323965651834, + "grad_norm": 0.6671543103018884, + "learning_rate": 8.521455414340826e-06, + "loss": 0.6423, + "step": 2507 + }, + { + "epoch": 0.9789227166276346, + "grad_norm": 0.6990464991575851, + "learning_rate": 8.519842898403143e-06, + "loss": 0.6183, + "step": 2508 + }, + { + "epoch": 0.9793130366900858, + "grad_norm": 0.5960832279759833, + "learning_rate": 8.51822965638046e-06, + "loss": 0.6223, + "step": 2509 + }, + { + "epoch": 0.9797033567525371, + "grad_norm": 0.615706240570792, + "learning_rate": 8.516615688605557e-06, + "loss": 0.6408, + "step": 2510 + }, + { + "epoch": 0.9800936768149883, + "grad_norm": 0.610600958371442, + "learning_rate": 8.51500099541137e-06, + "loss": 0.6131, + "step": 2511 + }, + { + "epoch": 0.9804839968774395, + "grad_norm": 0.5896035182712951, + "learning_rate": 8.513385577130985e-06, + "loss": 0.628, + "step": 2512 + }, + { + "epoch": 0.9808743169398907, + "grad_norm": 0.7280104042059062, + "learning_rate": 8.511769434097637e-06, + "loss": 0.6344, + "step": 2513 + }, + { + "epoch": 0.9812646370023419, + "grad_norm": 0.6705271076067092, + "learning_rate": 8.510152566644705e-06, + "loss": 0.6331, + "step": 2514 + }, + { + "epoch": 0.9816549570647931, + "grad_norm": 0.8256603439084056, + "learning_rate": 8.508534975105724e-06, + "loss": 0.6146, + "step": 2515 + }, + { + "epoch": 0.9820452771272443, + "grad_norm": 0.6867051208981859, + "learning_rate": 8.50691665981438e-06, + "loss": 0.5903, + "step": 2516 + }, + { + "epoch": 0.9824355971896955, + "grad_norm": 0.6648976194070582, + "learning_rate": 8.505297621104498e-06, + "loss": 0.6145, + "step": 2517 + }, + { + "epoch": 0.9828259172521467, + "grad_norm": 0.8070311972289701, + "learning_rate": 8.503677859310063e-06, + "loss": 0.6147, + "step": 2518 + }, + { + "epoch": 0.983216237314598, + "grad_norm": 0.6326070093156481, + "learning_rate": 8.502057374765203e-06, + "loss": 0.5801, + "step": 2519 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.6906645035960532, + "learning_rate": 8.500436167804197e-06, + "loss": 0.6158, + "step": 2520 + }, + { + "epoch": 0.9839968774395004, + "grad_norm": 0.6359385478946927, + "learning_rate": 8.498814238761475e-06, + "loss": 0.5939, + "step": 2521 + }, + { + "epoch": 0.9843871975019516, + "grad_norm": 0.6756703415609745, + "learning_rate": 8.497191587971611e-06, + "loss": 0.6205, + "step": 2522 + }, + { + "epoch": 0.9847775175644028, + "grad_norm": 0.6720387751749701, + "learning_rate": 8.495568215769335e-06, + "loss": 0.6217, + "step": 2523 + }, + { + "epoch": 0.985167837626854, + "grad_norm": 0.6202374721723793, + "learning_rate": 8.493944122489518e-06, + "loss": 0.6361, + "step": 2524 + }, + { + "epoch": 0.9855581576893052, + "grad_norm": 0.7640665419976446, + "learning_rate": 8.492319308467185e-06, + "loss": 0.6688, + "step": 2525 + }, + { + "epoch": 0.9859484777517564, + "grad_norm": 0.7159924784437828, + "learning_rate": 8.490693774037507e-06, + "loss": 0.603, + "step": 2526 + }, + { + "epoch": 0.9863387978142076, + "grad_norm": 0.7275740227841041, + "learning_rate": 8.489067519535804e-06, + "loss": 0.6485, + "step": 2527 + }, + { + "epoch": 0.9867291178766588, + "grad_norm": 0.584486726979699, + "learning_rate": 8.48744054529755e-06, + "loss": 0.649, + "step": 2528 + }, + { + "epoch": 0.9871194379391101, + "grad_norm": 0.659478251201769, + "learning_rate": 8.485812851658357e-06, + "loss": 0.6324, + "step": 2529 + }, + { + "epoch": 0.9875097580015613, + "grad_norm": 0.7054131710249247, + "learning_rate": 8.484184438953993e-06, + "loss": 0.6294, + "step": 2530 + }, + { + "epoch": 0.9879000780640125, + "grad_norm": 0.6167445008739071, + "learning_rate": 8.482555307520374e-06, + "loss": 0.6327, + "step": 2531 + }, + { + "epoch": 0.9882903981264637, + "grad_norm": 0.6423588207147682, + "learning_rate": 8.480925457693562e-06, + "loss": 0.6065, + "step": 2532 + }, + { + "epoch": 0.9886807181889149, + "grad_norm": 0.7224926020007633, + "learning_rate": 8.479294889809767e-06, + "loss": 0.6447, + "step": 2533 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.6082017912991172, + "learning_rate": 8.477663604205348e-06, + "loss": 0.6362, + "step": 2534 + }, + { + "epoch": 0.9894613583138173, + "grad_norm": 0.6770087236696671, + "learning_rate": 8.476031601216813e-06, + "loss": 0.649, + "step": 2535 + }, + { + "epoch": 0.9898516783762685, + "grad_norm": 0.7952629340420133, + "learning_rate": 8.474398881180817e-06, + "loss": 0.6255, + "step": 2536 + }, + { + "epoch": 0.9902419984387197, + "grad_norm": 0.6391643846645374, + "learning_rate": 8.472765444434161e-06, + "loss": 0.6595, + "step": 2537 + }, + { + "epoch": 0.990632318501171, + "grad_norm": 0.7220460623865957, + "learning_rate": 8.4711312913138e-06, + "loss": 0.6582, + "step": 2538 + }, + { + "epoch": 0.9910226385636222, + "grad_norm": 0.7931911223451318, + "learning_rate": 8.469496422156826e-06, + "loss": 0.6588, + "step": 2539 + }, + { + "epoch": 0.9914129586260734, + "grad_norm": 0.6822110606683401, + "learning_rate": 8.467860837300493e-06, + "loss": 0.6138, + "step": 2540 + }, + { + "epoch": 0.9918032786885246, + "grad_norm": 0.6663978177463863, + "learning_rate": 8.466224537082188e-06, + "loss": 0.5535, + "step": 2541 + }, + { + "epoch": 0.9921935987509758, + "grad_norm": 0.7887685590142887, + "learning_rate": 8.464587521839456e-06, + "loss": 0.6585, + "step": 2542 + }, + { + "epoch": 0.992583918813427, + "grad_norm": 0.6456431056950451, + "learning_rate": 8.462949791909988e-06, + "loss": 0.6229, + "step": 2543 + }, + { + "epoch": 0.9929742388758782, + "grad_norm": 0.5538543644907534, + "learning_rate": 8.461311347631617e-06, + "loss": 0.6213, + "step": 2544 + }, + { + "epoch": 0.9933645589383294, + "grad_norm": 0.7047664333003233, + "learning_rate": 8.459672189342328e-06, + "loss": 0.644, + "step": 2545 + }, + { + "epoch": 0.9937548790007806, + "grad_norm": 0.6721598426224119, + "learning_rate": 8.458032317380252e-06, + "loss": 0.6369, + "step": 2546 + }, + { + "epoch": 0.9941451990632318, + "grad_norm": 0.5992711171443605, + "learning_rate": 8.456391732083666e-06, + "loss": 0.6518, + "step": 2547 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.6676717235677019, + "learning_rate": 8.454750433790997e-06, + "loss": 0.5914, + "step": 2548 + }, + { + "epoch": 0.9949258391881343, + "grad_norm": 0.6477371560711316, + "learning_rate": 8.453108422840818e-06, + "loss": 0.6271, + "step": 2549 + }, + { + "epoch": 0.9953161592505855, + "grad_norm": 0.6353238456184518, + "learning_rate": 8.451465699571848e-06, + "loss": 0.6328, + "step": 2550 + }, + { + "epoch": 0.9957064793130367, + "grad_norm": 0.661881189271769, + "learning_rate": 8.449822264322952e-06, + "loss": 0.6189, + "step": 2551 + }, + { + "epoch": 0.9960967993754879, + "grad_norm": 0.7332622218174761, + "learning_rate": 8.448178117433146e-06, + "loss": 0.6362, + "step": 2552 + }, + { + "epoch": 0.9964871194379391, + "grad_norm": 0.6234497677530066, + "learning_rate": 8.446533259241587e-06, + "loss": 0.664, + "step": 2553 + }, + { + "epoch": 0.9968774395003903, + "grad_norm": 0.7634522904390684, + "learning_rate": 8.444887690087583e-06, + "loss": 0.5925, + "step": 2554 + }, + { + "epoch": 0.9972677595628415, + "grad_norm": 0.6276039839589634, + "learning_rate": 8.443241410310593e-06, + "loss": 0.643, + "step": 2555 + }, + { + "epoch": 0.9976580796252927, + "grad_norm": 0.5557465956851223, + "learning_rate": 8.441594420250207e-06, + "loss": 0.6013, + "step": 2556 + }, + { + "epoch": 0.998048399687744, + "grad_norm": 0.6802292439380574, + "learning_rate": 8.439946720246179e-06, + "loss": 0.6004, + "step": 2557 + }, + { + "epoch": 0.9984387197501952, + "grad_norm": 0.5754033612661524, + "learning_rate": 8.438298310638398e-06, + "loss": 0.6177, + "step": 2558 + }, + { + "epoch": 0.9988290398126464, + "grad_norm": 0.5968701056684068, + "learning_rate": 8.436649191766906e-06, + "loss": 0.6212, + "step": 2559 + }, + { + "epoch": 0.9992193598750976, + "grad_norm": 0.6377870509697271, + "learning_rate": 8.434999363971889e-06, + "loss": 0.6359, + "step": 2560 + }, + { + "epoch": 0.9996096799375488, + "grad_norm": 0.5826305139222132, + "learning_rate": 8.433348827593679e-06, + "loss": 0.6282, + "step": 2561 + }, + { + "epoch": 1.0, + "grad_norm": 0.5414685901368327, + "learning_rate": 8.431697582972751e-06, + "loss": 0.6, + "step": 2562 + }, + { + "epoch": 1.000390320062451, + "grad_norm": 0.5735736818435042, + "learning_rate": 8.430045630449733e-06, + "loss": 0.6098, + "step": 2563 + }, + { + "epoch": 1.0007806401249024, + "grad_norm": 0.5469644625292589, + "learning_rate": 8.428392970365395e-06, + "loss": 0.6227, + "step": 2564 + }, + { + "epoch": 1.0011709601873535, + "grad_norm": 0.5240173521195385, + "learning_rate": 8.42673960306065e-06, + "loss": 0.5519, + "step": 2565 + }, + { + "epoch": 1.0015612802498048, + "grad_norm": 0.5575111098997619, + "learning_rate": 8.425085528876565e-06, + "loss": 0.5925, + "step": 2566 + }, + { + "epoch": 1.001951600312256, + "grad_norm": 0.6253455297548962, + "learning_rate": 8.423430748154343e-06, + "loss": 0.6171, + "step": 2567 + }, + { + "epoch": 1.0023419203747073, + "grad_norm": 0.6534611213220819, + "learning_rate": 8.421775261235338e-06, + "loss": 0.6392, + "step": 2568 + }, + { + "epoch": 1.0027322404371584, + "grad_norm": 0.5818593571318597, + "learning_rate": 8.420119068461056e-06, + "loss": 0.5939, + "step": 2569 + }, + { + "epoch": 1.0031225604996097, + "grad_norm": 0.6054869562201858, + "learning_rate": 8.418462170173136e-06, + "loss": 0.6059, + "step": 2570 + }, + { + "epoch": 1.0035128805620608, + "grad_norm": 0.6397960953435471, + "learning_rate": 8.416804566713368e-06, + "loss": 0.5783, + "step": 2571 + }, + { + "epoch": 1.003903200624512, + "grad_norm": 0.5629600541610282, + "learning_rate": 8.415146258423692e-06, + "loss": 0.6184, + "step": 2572 + }, + { + "epoch": 1.0042935206869632, + "grad_norm": 0.6356537098624855, + "learning_rate": 8.413487245646186e-06, + "loss": 0.5814, + "step": 2573 + }, + { + "epoch": 1.0046838407494145, + "grad_norm": 0.5901371294753143, + "learning_rate": 8.41182752872308e-06, + "loss": 0.589, + "step": 2574 + }, + { + "epoch": 1.0050741608118656, + "grad_norm": 0.629261265259108, + "learning_rate": 8.410167107996742e-06, + "loss": 0.5837, + "step": 2575 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.6811325810580702, + "learning_rate": 8.408505983809691e-06, + "loss": 0.6015, + "step": 2576 + }, + { + "epoch": 1.005854800936768, + "grad_norm": 0.5803837409480928, + "learning_rate": 8.406844156504591e-06, + "loss": 0.6027, + "step": 2577 + }, + { + "epoch": 1.0062451209992194, + "grad_norm": 0.8018723982894118, + "learning_rate": 8.405181626424248e-06, + "loss": 0.5894, + "step": 2578 + }, + { + "epoch": 1.0066354410616705, + "grad_norm": 0.5578491374461718, + "learning_rate": 8.403518393911614e-06, + "loss": 0.6023, + "step": 2579 + }, + { + "epoch": 1.0070257611241218, + "grad_norm": 0.675413753335709, + "learning_rate": 8.401854459309785e-06, + "loss": 0.6341, + "step": 2580 + }, + { + "epoch": 1.0074160811865729, + "grad_norm": 0.7075989629964949, + "learning_rate": 8.400189822962003e-06, + "loss": 0.5708, + "step": 2581 + }, + { + "epoch": 1.0078064012490242, + "grad_norm": 0.5752066038358904, + "learning_rate": 8.398524485211662e-06, + "loss": 0.6199, + "step": 2582 + }, + { + "epoch": 1.0081967213114753, + "grad_norm": 0.7206668099740958, + "learning_rate": 8.39685844640228e-06, + "loss": 0.6309, + "step": 2583 + }, + { + "epoch": 1.0085870413739266, + "grad_norm": 0.7340942305602384, + "learning_rate": 8.395191706877545e-06, + "loss": 0.5842, + "step": 2584 + }, + { + "epoch": 1.0089773614363777, + "grad_norm": 0.5993309785113736, + "learning_rate": 8.393524266981272e-06, + "loss": 0.5854, + "step": 2585 + }, + { + "epoch": 1.009367681498829, + "grad_norm": 0.5934373973176266, + "learning_rate": 8.391856127057427e-06, + "loss": 0.6221, + "step": 2586 + }, + { + "epoch": 1.0097580015612801, + "grad_norm": 0.5535176920962185, + "learning_rate": 8.390187287450122e-06, + "loss": 0.6016, + "step": 2587 + }, + { + "epoch": 1.0101483216237315, + "grad_norm": 0.6230658499508502, + "learning_rate": 8.388517748503606e-06, + "loss": 0.5776, + "step": 2588 + }, + { + "epoch": 1.0105386416861826, + "grad_norm": 0.6117598605814547, + "learning_rate": 8.386847510562281e-06, + "loss": 0.6237, + "step": 2589 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.6703929836631117, + "learning_rate": 8.385176573970687e-06, + "loss": 0.6035, + "step": 2590 + }, + { + "epoch": 1.011319281811085, + "grad_norm": 0.571232179184294, + "learning_rate": 8.383504939073514e-06, + "loss": 0.6038, + "step": 2591 + }, + { + "epoch": 1.0117096018735363, + "grad_norm": 0.6509277920743077, + "learning_rate": 8.381832606215588e-06, + "loss": 0.642, + "step": 2592 + }, + { + "epoch": 1.0120999219359874, + "grad_norm": 0.6391238229924188, + "learning_rate": 8.380159575741883e-06, + "loss": 0.6364, + "step": 2593 + }, + { + "epoch": 1.0124902419984387, + "grad_norm": 0.5788557413720539, + "learning_rate": 8.378485847997525e-06, + "loss": 0.5786, + "step": 2594 + }, + { + "epoch": 1.0128805620608898, + "grad_norm": 0.7205160201244885, + "learning_rate": 8.376811423327766e-06, + "loss": 0.6341, + "step": 2595 + }, + { + "epoch": 1.0132708821233412, + "grad_norm": 0.5780463178536892, + "learning_rate": 8.37513630207802e-06, + "loss": 0.5624, + "step": 2596 + }, + { + "epoch": 1.0136612021857923, + "grad_norm": 0.7140208506124561, + "learning_rate": 8.373460484593832e-06, + "loss": 0.5744, + "step": 2597 + }, + { + "epoch": 1.0140515222482436, + "grad_norm": 0.6999645813666561, + "learning_rate": 8.371783971220896e-06, + "loss": 0.588, + "step": 2598 + }, + { + "epoch": 1.0144418423106947, + "grad_norm": 0.6208889636700369, + "learning_rate": 8.37010676230505e-06, + "loss": 0.5932, + "step": 2599 + }, + { + "epoch": 1.014832162373146, + "grad_norm": 0.8134107706876385, + "learning_rate": 8.368428858192275e-06, + "loss": 0.5897, + "step": 2600 + }, + { + "epoch": 1.015222482435597, + "grad_norm": 0.837335554579531, + "learning_rate": 8.366750259228691e-06, + "loss": 0.6317, + "step": 2601 + }, + { + "epoch": 1.0156128024980484, + "grad_norm": 0.7776240462494279, + "learning_rate": 8.36507096576057e-06, + "loss": 0.6487, + "step": 2602 + }, + { + "epoch": 1.0160031225604995, + "grad_norm": 0.8180035024480445, + "learning_rate": 8.363390978134318e-06, + "loss": 0.5872, + "step": 2603 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.6444104612361308, + "learning_rate": 8.36171029669649e-06, + "loss": 0.6335, + "step": 2604 + }, + { + "epoch": 1.016783762685402, + "grad_norm": 0.6483481591841106, + "learning_rate": 8.360028921793784e-06, + "loss": 0.6044, + "step": 2605 + }, + { + "epoch": 1.0171740827478533, + "grad_norm": 0.8691145173263812, + "learning_rate": 8.358346853773038e-06, + "loss": 0.6088, + "step": 2606 + }, + { + "epoch": 1.0175644028103044, + "grad_norm": 0.6374192470191258, + "learning_rate": 8.356664092981235e-06, + "loss": 0.5927, + "step": 2607 + }, + { + "epoch": 1.0179547228727557, + "grad_norm": 0.7326997353969618, + "learning_rate": 8.3549806397655e-06, + "loss": 0.6114, + "step": 2608 + }, + { + "epoch": 1.0183450429352068, + "grad_norm": 0.6386614267684714, + "learning_rate": 8.353296494473104e-06, + "loss": 0.5885, + "step": 2609 + }, + { + "epoch": 1.018735362997658, + "grad_norm": 0.6958332625527218, + "learning_rate": 8.351611657451454e-06, + "loss": 0.5919, + "step": 2610 + }, + { + "epoch": 1.0191256830601092, + "grad_norm": 0.7794300233236293, + "learning_rate": 8.349926129048108e-06, + "loss": 0.6112, + "step": 2611 + }, + { + "epoch": 1.0195160031225605, + "grad_norm": 0.6231067023593518, + "learning_rate": 8.34823990961076e-06, + "loss": 0.5973, + "step": 2612 + }, + { + "epoch": 1.0199063231850116, + "grad_norm": 0.8240299277182256, + "learning_rate": 8.346552999487249e-06, + "loss": 0.6126, + "step": 2613 + }, + { + "epoch": 1.020296643247463, + "grad_norm": 0.6950330761914324, + "learning_rate": 8.344865399025557e-06, + "loss": 0.5873, + "step": 2614 + }, + { + "epoch": 1.020686963309914, + "grad_norm": 0.6081699170347625, + "learning_rate": 8.343177108573808e-06, + "loss": 0.634, + "step": 2615 + }, + { + "epoch": 1.0210772833723654, + "grad_norm": 0.7901846925499826, + "learning_rate": 8.341488128480267e-06, + "loss": 0.6013, + "step": 2616 + }, + { + "epoch": 1.0214676034348165, + "grad_norm": 0.6689372335188036, + "learning_rate": 8.339798459093345e-06, + "loss": 0.6159, + "step": 2617 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.6768924369865582, + "learning_rate": 8.33810810076159e-06, + "loss": 0.605, + "step": 2618 + }, + { + "epoch": 1.0222482435597189, + "grad_norm": 0.7343406225828368, + "learning_rate": 8.336417053833698e-06, + "loss": 0.5915, + "step": 2619 + }, + { + "epoch": 1.0226385636221702, + "grad_norm": 0.623277512875933, + "learning_rate": 8.3347253186585e-06, + "loss": 0.6008, + "step": 2620 + }, + { + "epoch": 1.0230288836846213, + "grad_norm": 0.7405670905300733, + "learning_rate": 8.333032895584973e-06, + "loss": 0.622, + "step": 2621 + }, + { + "epoch": 1.0234192037470726, + "grad_norm": 0.7110760476257815, + "learning_rate": 8.331339784962242e-06, + "loss": 0.6318, + "step": 2622 + }, + { + "epoch": 1.0238095238095237, + "grad_norm": 0.727375270005385, + "learning_rate": 8.32964598713956e-06, + "loss": 0.6023, + "step": 2623 + }, + { + "epoch": 1.024199843871975, + "grad_norm": 0.6466098080529559, + "learning_rate": 8.32795150246633e-06, + "loss": 0.5924, + "step": 2624 + }, + { + "epoch": 1.0245901639344261, + "grad_norm": 0.539621117207915, + "learning_rate": 8.326256331292102e-06, + "loss": 0.6037, + "step": 2625 + }, + { + "epoch": 1.0249804839968775, + "grad_norm": 0.5678851031920995, + "learning_rate": 8.324560473966558e-06, + "loss": 0.6128, + "step": 2626 + }, + { + "epoch": 1.0253708040593286, + "grad_norm": 0.6097508224073924, + "learning_rate": 8.322863930839522e-06, + "loss": 0.6161, + "step": 2627 + }, + { + "epoch": 1.0257611241217799, + "grad_norm": 0.5969758754026481, + "learning_rate": 8.321166702260967e-06, + "loss": 0.5941, + "step": 2628 + }, + { + "epoch": 1.026151444184231, + "grad_norm": 0.6027128163643457, + "learning_rate": 8.319468788581002e-06, + "loss": 0.6073, + "step": 2629 + }, + { + "epoch": 1.0265417642466823, + "grad_norm": 0.5922034489012268, + "learning_rate": 8.317770190149877e-06, + "loss": 0.626, + "step": 2630 + }, + { + "epoch": 1.0269320843091334, + "grad_norm": 0.630625147926999, + "learning_rate": 8.316070907317988e-06, + "loss": 0.6054, + "step": 2631 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 0.5435763361464413, + "learning_rate": 8.314370940435862e-06, + "loss": 0.5926, + "step": 2632 + }, + { + "epoch": 1.0277127244340358, + "grad_norm": 0.583742900118849, + "learning_rate": 8.31267028985418e-06, + "loss": 0.6248, + "step": 2633 + }, + { + "epoch": 1.0281030444964872, + "grad_norm": 0.6170966376632341, + "learning_rate": 8.310968955923757e-06, + "loss": 0.6168, + "step": 2634 + }, + { + "epoch": 1.0284933645589383, + "grad_norm": 0.5934012206180886, + "learning_rate": 8.309266938995549e-06, + "loss": 0.6084, + "step": 2635 + }, + { + "epoch": 1.0288836846213896, + "grad_norm": 0.6173662794849367, + "learning_rate": 8.307564239420652e-06, + "loss": 0.5927, + "step": 2636 + }, + { + "epoch": 1.0292740046838407, + "grad_norm": 0.6495923239555385, + "learning_rate": 8.305860857550305e-06, + "loss": 0.6346, + "step": 2637 + }, + { + "epoch": 1.029664324746292, + "grad_norm": 0.6297438377700051, + "learning_rate": 8.304156793735892e-06, + "loss": 0.5865, + "step": 2638 + }, + { + "epoch": 1.030054644808743, + "grad_norm": 0.7291723716413925, + "learning_rate": 8.302452048328926e-06, + "loss": 0.5877, + "step": 2639 + }, + { + "epoch": 1.0304449648711944, + "grad_norm": 0.6051263882960964, + "learning_rate": 8.300746621681073e-06, + "loss": 0.6202, + "step": 2640 + }, + { + "epoch": 1.0308352849336455, + "grad_norm": 0.5895378917020183, + "learning_rate": 8.299040514144133e-06, + "loss": 0.6059, + "step": 2641 + }, + { + "epoch": 1.0312256049960968, + "grad_norm": 0.5598489934424362, + "learning_rate": 8.297333726070045e-06, + "loss": 0.6206, + "step": 2642 + }, + { + "epoch": 1.031615925058548, + "grad_norm": 0.5319346532742208, + "learning_rate": 8.295626257810893e-06, + "loss": 0.6127, + "step": 2643 + }, + { + "epoch": 1.0320062451209993, + "grad_norm": 0.5795492917678597, + "learning_rate": 8.293918109718901e-06, + "loss": 0.6238, + "step": 2644 + }, + { + "epoch": 1.0323965651834504, + "grad_norm": 0.608971149763031, + "learning_rate": 8.292209282146429e-06, + "loss": 0.6303, + "step": 2645 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.5519670066537823, + "learning_rate": 8.29049977544598e-06, + "loss": 0.6231, + "step": 2646 + }, + { + "epoch": 1.0331772053083528, + "grad_norm": 0.631734970578478, + "learning_rate": 8.288789589970197e-06, + "loss": 0.6369, + "step": 2647 + }, + { + "epoch": 1.033567525370804, + "grad_norm": 0.6008195293882322, + "learning_rate": 8.287078726071862e-06, + "loss": 0.611, + "step": 2648 + }, + { + "epoch": 1.0339578454332552, + "grad_norm": 0.6507339019701048, + "learning_rate": 8.2853671841039e-06, + "loss": 0.6228, + "step": 2649 + }, + { + "epoch": 1.0343481654957065, + "grad_norm": 0.5757776566965516, + "learning_rate": 8.283654964419372e-06, + "loss": 0.598, + "step": 2650 + }, + { + "epoch": 1.0347384855581576, + "grad_norm": 0.6362775135310011, + "learning_rate": 8.28194206737148e-06, + "loss": 0.5938, + "step": 2651 + }, + { + "epoch": 1.035128805620609, + "grad_norm": 0.6492273204596462, + "learning_rate": 8.280228493313566e-06, + "loss": 0.6137, + "step": 2652 + }, + { + "epoch": 1.03551912568306, + "grad_norm": 0.8499990308423243, + "learning_rate": 8.278514242599115e-06, + "loss": 0.6063, + "step": 2653 + }, + { + "epoch": 1.0359094457455114, + "grad_norm": 0.5110431691013735, + "learning_rate": 8.276799315581744e-06, + "loss": 0.6512, + "step": 2654 + }, + { + "epoch": 1.0362997658079625, + "grad_norm": 0.7768542501690953, + "learning_rate": 8.275083712615217e-06, + "loss": 0.6112, + "step": 2655 + }, + { + "epoch": 1.0366900858704138, + "grad_norm": 0.6955731430849937, + "learning_rate": 8.273367434053432e-06, + "loss": 0.6451, + "step": 2656 + }, + { + "epoch": 1.0370804059328649, + "grad_norm": 0.6191724345963165, + "learning_rate": 8.271650480250433e-06, + "loss": 0.6028, + "step": 2657 + }, + { + "epoch": 1.0374707259953162, + "grad_norm": 0.6989780747029889, + "learning_rate": 8.269932851560392e-06, + "loss": 0.6104, + "step": 2658 + }, + { + "epoch": 1.0378610460577673, + "grad_norm": 0.6155175638423132, + "learning_rate": 8.268214548337634e-06, + "loss": 0.5961, + "step": 2659 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.6894663016040131, + "learning_rate": 8.266495570936613e-06, + "loss": 0.6059, + "step": 2660 + }, + { + "epoch": 1.0386416861826697, + "grad_norm": 0.5536721044166608, + "learning_rate": 8.264775919711924e-06, + "loss": 0.6105, + "step": 2661 + }, + { + "epoch": 1.039032006245121, + "grad_norm": 0.6369779857315486, + "learning_rate": 8.263055595018307e-06, + "loss": 0.5941, + "step": 2662 + }, + { + "epoch": 1.0394223263075721, + "grad_norm": 0.7122081923778091, + "learning_rate": 8.261334597210632e-06, + "loss": 0.6223, + "step": 2663 + }, + { + "epoch": 1.0398126463700235, + "grad_norm": 0.6169043884504563, + "learning_rate": 8.259612926643915e-06, + "loss": 0.5716, + "step": 2664 + }, + { + "epoch": 1.0402029664324746, + "grad_norm": 0.6390829908550161, + "learning_rate": 8.257890583673304e-06, + "loss": 0.608, + "step": 2665 + }, + { + "epoch": 1.0405932864949259, + "grad_norm": 0.6182830073077941, + "learning_rate": 8.256167568654096e-06, + "loss": 0.6201, + "step": 2666 + }, + { + "epoch": 1.040983606557377, + "grad_norm": 0.6647818693137881, + "learning_rate": 8.254443881941716e-06, + "loss": 0.6174, + "step": 2667 + }, + { + "epoch": 1.0413739266198283, + "grad_norm": 0.5793837204790384, + "learning_rate": 8.252719523891733e-06, + "loss": 0.6112, + "step": 2668 + }, + { + "epoch": 1.0417642466822794, + "grad_norm": 0.6125273624170545, + "learning_rate": 8.250994494859851e-06, + "loss": 0.5879, + "step": 2669 + }, + { + "epoch": 1.0421545667447307, + "grad_norm": 0.6743902372975027, + "learning_rate": 8.249268795201919e-06, + "loss": 0.6239, + "step": 2670 + }, + { + "epoch": 1.0425448868071818, + "grad_norm": 0.6504173526690483, + "learning_rate": 8.247542425273918e-06, + "loss": 0.5865, + "step": 2671 + }, + { + "epoch": 1.0429352068696331, + "grad_norm": 0.6712212084190329, + "learning_rate": 8.245815385431969e-06, + "loss": 0.5627, + "step": 2672 + }, + { + "epoch": 1.0433255269320842, + "grad_norm": 0.6842860287880004, + "learning_rate": 8.24408767603233e-06, + "loss": 0.605, + "step": 2673 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.6370274235829604, + "learning_rate": 8.242359297431403e-06, + "loss": 0.6025, + "step": 2674 + }, + { + "epoch": 1.0441061670569867, + "grad_norm": 0.5593891742047279, + "learning_rate": 8.240630249985722e-06, + "loss": 0.6088, + "step": 2675 + }, + { + "epoch": 1.044496487119438, + "grad_norm": 0.8366592203011893, + "learning_rate": 8.23890053405196e-06, + "loss": 0.6366, + "step": 2676 + }, + { + "epoch": 1.044886807181889, + "grad_norm": 0.6050283410422215, + "learning_rate": 8.237170149986927e-06, + "loss": 0.6113, + "step": 2677 + }, + { + "epoch": 1.0452771272443404, + "grad_norm": 0.6810690980714466, + "learning_rate": 8.235439098147575e-06, + "loss": 0.6427, + "step": 2678 + }, + { + "epoch": 1.0456674473067915, + "grad_norm": 0.660507731222617, + "learning_rate": 8.233707378890991e-06, + "loss": 0.602, + "step": 2679 + }, + { + "epoch": 1.0460577673692428, + "grad_norm": 0.6412141812810112, + "learning_rate": 8.2319749925744e-06, + "loss": 0.6395, + "step": 2680 + }, + { + "epoch": 1.046448087431694, + "grad_norm": 0.5771896829791726, + "learning_rate": 8.23024193955516e-06, + "loss": 0.5891, + "step": 2681 + }, + { + "epoch": 1.0468384074941453, + "grad_norm": 0.500363929187475, + "learning_rate": 8.228508220190776e-06, + "loss": 0.5916, + "step": 2682 + }, + { + "epoch": 1.0472287275565964, + "grad_norm": 0.6869662121707011, + "learning_rate": 8.226773834838887e-06, + "loss": 0.5934, + "step": 2683 + }, + { + "epoch": 1.0476190476190477, + "grad_norm": 0.6415840814814434, + "learning_rate": 8.225038783857262e-06, + "loss": 0.6277, + "step": 2684 + }, + { + "epoch": 1.0480093676814988, + "grad_norm": 0.6944297516851815, + "learning_rate": 8.223303067603817e-06, + "loss": 0.6249, + "step": 2685 + }, + { + "epoch": 1.04839968774395, + "grad_norm": 0.5786640891080724, + "learning_rate": 8.2215666864366e-06, + "loss": 0.5522, + "step": 2686 + }, + { + "epoch": 1.0487900078064012, + "grad_norm": 0.5844012160798868, + "learning_rate": 8.219829640713798e-06, + "loss": 0.6018, + "step": 2687 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.7339057327042744, + "learning_rate": 8.218091930793735e-06, + "loss": 0.6035, + "step": 2688 + }, + { + "epoch": 1.0495706479313036, + "grad_norm": 0.7513181740703512, + "learning_rate": 8.216353557034871e-06, + "loss": 0.5762, + "step": 2689 + }, + { + "epoch": 1.049960967993755, + "grad_norm": 0.6306071217440676, + "learning_rate": 8.214614519795804e-06, + "loss": 0.5915, + "step": 2690 + }, + { + "epoch": 1.050351288056206, + "grad_norm": 0.7785924453199476, + "learning_rate": 8.212874819435268e-06, + "loss": 0.6351, + "step": 2691 + }, + { + "epoch": 1.0507416081186574, + "grad_norm": 0.6325233696242637, + "learning_rate": 8.211134456312134e-06, + "loss": 0.5834, + "step": 2692 + }, + { + "epoch": 1.0511319281811085, + "grad_norm": 0.6496548291323299, + "learning_rate": 8.209393430785411e-06, + "loss": 0.5965, + "step": 2693 + }, + { + "epoch": 1.0515222482435598, + "grad_norm": 0.6342708740918751, + "learning_rate": 8.207651743214244e-06, + "loss": 0.6119, + "step": 2694 + }, + { + "epoch": 1.0519125683060109, + "grad_norm": 0.6381367490095938, + "learning_rate": 8.205909393957912e-06, + "loss": 0.587, + "step": 2695 + }, + { + "epoch": 1.0523028883684622, + "grad_norm": 0.644255960233616, + "learning_rate": 8.204166383375833e-06, + "loss": 0.574, + "step": 2696 + }, + { + "epoch": 1.0526932084309133, + "grad_norm": 0.646172728705021, + "learning_rate": 8.202422711827564e-06, + "loss": 0.6272, + "step": 2697 + }, + { + "epoch": 1.0530835284933646, + "grad_norm": 0.613608368905599, + "learning_rate": 8.20067837967279e-06, + "loss": 0.6241, + "step": 2698 + }, + { + "epoch": 1.0534738485558157, + "grad_norm": 0.6169828893554751, + "learning_rate": 8.198933387271343e-06, + "loss": 0.6046, + "step": 2699 + }, + { + "epoch": 1.053864168618267, + "grad_norm": 0.5440596641719239, + "learning_rate": 8.197187734983185e-06, + "loss": 0.5773, + "step": 2700 + }, + { + "epoch": 1.0542544886807181, + "grad_norm": 0.5817037453139434, + "learning_rate": 8.195441423168413e-06, + "loss": 0.6024, + "step": 2701 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 0.5926059691781935, + "learning_rate": 8.193694452187263e-06, + "loss": 0.6352, + "step": 2702 + }, + { + "epoch": 1.0550351288056206, + "grad_norm": 0.6063540483855295, + "learning_rate": 8.191946822400105e-06, + "loss": 0.6436, + "step": 2703 + }, + { + "epoch": 1.0554254488680719, + "grad_norm": 0.6159882029362203, + "learning_rate": 8.190198534167446e-06, + "loss": 0.609, + "step": 2704 + }, + { + "epoch": 1.055815768930523, + "grad_norm": 0.5977364060136929, + "learning_rate": 8.188449587849932e-06, + "loss": 0.581, + "step": 2705 + }, + { + "epoch": 1.0562060889929743, + "grad_norm": 0.6327265123572955, + "learning_rate": 8.18669998380834e-06, + "loss": 0.5997, + "step": 2706 + }, + { + "epoch": 1.0565964090554254, + "grad_norm": 0.5884311409288032, + "learning_rate": 8.184949722403581e-06, + "loss": 0.6116, + "step": 2707 + }, + { + "epoch": 1.0569867291178767, + "grad_norm": 0.5079400374527842, + "learning_rate": 8.183198803996709e-06, + "loss": 0.5661, + "step": 2708 + }, + { + "epoch": 1.0573770491803278, + "grad_norm": 0.672316031509117, + "learning_rate": 8.181447228948904e-06, + "loss": 0.5962, + "step": 2709 + }, + { + "epoch": 1.0577673692427791, + "grad_norm": 0.6181549366454743, + "learning_rate": 8.179694997621495e-06, + "loss": 0.6201, + "step": 2710 + }, + { + "epoch": 1.0581576893052302, + "grad_norm": 0.5694015079731608, + "learning_rate": 8.17794211037593e-06, + "loss": 0.6102, + "step": 2711 + }, + { + "epoch": 1.0585480093676816, + "grad_norm": 0.5536598429949574, + "learning_rate": 8.176188567573803e-06, + "loss": 0.591, + "step": 2712 + }, + { + "epoch": 1.0589383294301327, + "grad_norm": 0.6789711903121344, + "learning_rate": 8.174434369576845e-06, + "loss": 0.6146, + "step": 2713 + }, + { + "epoch": 1.059328649492584, + "grad_norm": 0.5974409321619221, + "learning_rate": 8.172679516746913e-06, + "loss": 0.6126, + "step": 2714 + }, + { + "epoch": 1.059718969555035, + "grad_norm": 0.5601923048287601, + "learning_rate": 8.170924009446005e-06, + "loss": 0.5624, + "step": 2715 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.8213118949045277, + "learning_rate": 8.169167848036255e-06, + "loss": 0.6029, + "step": 2716 + }, + { + "epoch": 1.0604996096799375, + "grad_norm": 0.5581976790295049, + "learning_rate": 8.167411032879926e-06, + "loss": 0.5875, + "step": 2717 + }, + { + "epoch": 1.0608899297423888, + "grad_norm": 0.631611497840197, + "learning_rate": 8.165653564339421e-06, + "loss": 0.5807, + "step": 2718 + }, + { + "epoch": 1.06128024980484, + "grad_norm": 0.714136024168171, + "learning_rate": 8.16389544277728e-06, + "loss": 0.6041, + "step": 2719 + }, + { + "epoch": 1.0616705698672912, + "grad_norm": 0.5883754452474532, + "learning_rate": 8.162136668556171e-06, + "loss": 0.6213, + "step": 2720 + }, + { + "epoch": 1.0620608899297423, + "grad_norm": 0.5426907658525509, + "learning_rate": 8.1603772420389e-06, + "loss": 0.5863, + "step": 2721 + }, + { + "epoch": 1.0624512099921937, + "grad_norm": 0.6182972210684209, + "learning_rate": 8.158617163588406e-06, + "loss": 0.5947, + "step": 2722 + }, + { + "epoch": 1.0628415300546448, + "grad_norm": 0.6069332538582041, + "learning_rate": 8.156856433567771e-06, + "loss": 0.6105, + "step": 2723 + }, + { + "epoch": 1.063231850117096, + "grad_norm": 0.6228319875845831, + "learning_rate": 8.155095052340194e-06, + "loss": 0.5644, + "step": 2724 + }, + { + "epoch": 1.0636221701795472, + "grad_norm": 0.5996331843554504, + "learning_rate": 8.153333020269025e-06, + "loss": 0.6036, + "step": 2725 + }, + { + "epoch": 1.0640124902419985, + "grad_norm": 0.5554908715504342, + "learning_rate": 8.151570337717741e-06, + "loss": 0.6021, + "step": 2726 + }, + { + "epoch": 1.0644028103044496, + "grad_norm": 0.7100279581018947, + "learning_rate": 8.149807005049953e-06, + "loss": 0.5722, + "step": 2727 + }, + { + "epoch": 1.064793130366901, + "grad_norm": 0.5563759518716875, + "learning_rate": 8.14804302262941e-06, + "loss": 0.6328, + "step": 2728 + }, + { + "epoch": 1.065183450429352, + "grad_norm": 0.6545063816835485, + "learning_rate": 8.146278390819989e-06, + "loss": 0.5722, + "step": 2729 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.78849864756309, + "learning_rate": 8.144513109985704e-06, + "loss": 0.6317, + "step": 2730 + }, + { + "epoch": 1.0659640905542545, + "grad_norm": 0.7905423665980855, + "learning_rate": 8.142747180490704e-06, + "loss": 0.6039, + "step": 2731 + }, + { + "epoch": 1.0663544106167058, + "grad_norm": 0.7005958479787774, + "learning_rate": 8.140980602699273e-06, + "loss": 0.6078, + "step": 2732 + }, + { + "epoch": 1.0667447306791569, + "grad_norm": 0.6252508312519695, + "learning_rate": 8.139213376975824e-06, + "loss": 0.5696, + "step": 2733 + }, + { + "epoch": 1.0671350507416082, + "grad_norm": 0.732843993987985, + "learning_rate": 8.137445503684906e-06, + "loss": 0.6113, + "step": 2734 + }, + { + "epoch": 1.0675253708040593, + "grad_norm": 0.6015159509720102, + "learning_rate": 8.135676983191203e-06, + "loss": 0.6026, + "step": 2735 + }, + { + "epoch": 1.0679156908665106, + "grad_norm": 0.6519660379050151, + "learning_rate": 8.13390781585953e-06, + "loss": 0.6238, + "step": 2736 + }, + { + "epoch": 1.0683060109289617, + "grad_norm": 0.7210920117404579, + "learning_rate": 8.132138002054841e-06, + "loss": 0.6249, + "step": 2737 + }, + { + "epoch": 1.068696330991413, + "grad_norm": 0.5797290744653473, + "learning_rate": 8.130367542142214e-06, + "loss": 0.6452, + "step": 2738 + }, + { + "epoch": 1.0690866510538641, + "grad_norm": 0.7844848983879442, + "learning_rate": 8.128596436486868e-06, + "loss": 0.5933, + "step": 2739 + }, + { + "epoch": 1.0694769711163155, + "grad_norm": 0.761650162464446, + "learning_rate": 8.126824685454152e-06, + "loss": 0.6022, + "step": 2740 + }, + { + "epoch": 1.0698672911787666, + "grad_norm": 0.648920796466771, + "learning_rate": 8.12505228940955e-06, + "loss": 0.5905, + "step": 2741 + }, + { + "epoch": 1.0702576112412179, + "grad_norm": 0.9536627186086968, + "learning_rate": 8.123279248718673e-06, + "loss": 0.6102, + "step": 2742 + }, + { + "epoch": 1.070647931303669, + "grad_norm": 0.6063460216685128, + "learning_rate": 8.121505563747276e-06, + "loss": 0.5853, + "step": 2743 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.704055149113891, + "learning_rate": 8.11973123486124e-06, + "loss": 0.5679, + "step": 2744 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.8817009113022631, + "learning_rate": 8.117956262426574e-06, + "loss": 0.5961, + "step": 2745 + }, + { + "epoch": 1.0718188914910227, + "grad_norm": 0.5366846123141006, + "learning_rate": 8.116180646809431e-06, + "loss": 0.5687, + "step": 2746 + }, + { + "epoch": 1.0722092115534738, + "grad_norm": 0.6809485230324154, + "learning_rate": 8.114404388376088e-06, + "loss": 0.6396, + "step": 2747 + }, + { + "epoch": 1.0725995316159251, + "grad_norm": 0.6430895893090234, + "learning_rate": 8.112627487492958e-06, + "loss": 0.6044, + "step": 2748 + }, + { + "epoch": 1.0729898516783762, + "grad_norm": 0.6829057233999504, + "learning_rate": 8.110849944526586e-06, + "loss": 0.635, + "step": 2749 + }, + { + "epoch": 1.0733801717408276, + "grad_norm": 0.5718785020350567, + "learning_rate": 8.109071759843652e-06, + "loss": 0.5796, + "step": 2750 + }, + { + "epoch": 1.0737704918032787, + "grad_norm": 0.6221190713269011, + "learning_rate": 8.107292933810962e-06, + "loss": 0.634, + "step": 2751 + }, + { + "epoch": 1.07416081186573, + "grad_norm": 0.6861665788424801, + "learning_rate": 8.10551346679546e-06, + "loss": 0.6275, + "step": 2752 + }, + { + "epoch": 1.074551131928181, + "grad_norm": 0.6286261774810583, + "learning_rate": 8.103733359164221e-06, + "loss": 0.589, + "step": 2753 + }, + { + "epoch": 1.0749414519906324, + "grad_norm": 0.686093153659611, + "learning_rate": 8.101952611284454e-06, + "loss": 0.6317, + "step": 2754 + }, + { + "epoch": 1.0753317720530835, + "grad_norm": 0.7365384066993225, + "learning_rate": 8.10017122352349e-06, + "loss": 0.6214, + "step": 2755 + }, + { + "epoch": 1.0757220921155348, + "grad_norm": 0.6898626062708372, + "learning_rate": 8.098389196248808e-06, + "loss": 0.6214, + "step": 2756 + }, + { + "epoch": 1.076112412177986, + "grad_norm": 0.652074727688427, + "learning_rate": 8.096606529828003e-06, + "loss": 0.6328, + "step": 2757 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 0.6187416715603464, + "learning_rate": 8.094823224628816e-06, + "loss": 0.5982, + "step": 2758 + }, + { + "epoch": 1.0768930523028883, + "grad_norm": 0.6857627305726194, + "learning_rate": 8.093039281019112e-06, + "loss": 0.5837, + "step": 2759 + }, + { + "epoch": 1.0772833723653397, + "grad_norm": 0.7003480408420876, + "learning_rate": 8.091254699366886e-06, + "loss": 0.5925, + "step": 2760 + }, + { + "epoch": 1.0776736924277908, + "grad_norm": 0.6326984751584476, + "learning_rate": 8.089469480040268e-06, + "loss": 0.5753, + "step": 2761 + }, + { + "epoch": 1.078064012490242, + "grad_norm": 0.6114630222178749, + "learning_rate": 8.08768362340752e-06, + "loss": 0.6274, + "step": 2762 + }, + { + "epoch": 1.0784543325526932, + "grad_norm": 0.5472173471108541, + "learning_rate": 8.085897129837035e-06, + "loss": 0.5952, + "step": 2763 + }, + { + "epoch": 1.0788446526151445, + "grad_norm": 0.6416160243126485, + "learning_rate": 8.084109999697337e-06, + "loss": 0.58, + "step": 2764 + }, + { + "epoch": 1.0792349726775956, + "grad_norm": 0.6156870200246816, + "learning_rate": 8.082322233357078e-06, + "loss": 0.6546, + "step": 2765 + }, + { + "epoch": 1.079625292740047, + "grad_norm": 0.5852847378781293, + "learning_rate": 8.080533831185049e-06, + "loss": 0.6013, + "step": 2766 + }, + { + "epoch": 1.080015612802498, + "grad_norm": 0.5809393744271711, + "learning_rate": 8.078744793550167e-06, + "loss": 0.5708, + "step": 2767 + }, + { + "epoch": 1.0804059328649493, + "grad_norm": 0.5731463419289184, + "learning_rate": 8.076955120821475e-06, + "loss": 0.5957, + "step": 2768 + }, + { + "epoch": 1.0807962529274004, + "grad_norm": 0.6235937319793867, + "learning_rate": 8.07516481336816e-06, + "loss": 0.6161, + "step": 2769 + }, + { + "epoch": 1.0811865729898518, + "grad_norm": 0.56702144988605, + "learning_rate": 8.073373871559529e-06, + "loss": 0.5533, + "step": 2770 + }, + { + "epoch": 1.0815768930523029, + "grad_norm": 0.6590987516479617, + "learning_rate": 8.071582295765023e-06, + "loss": 0.568, + "step": 2771 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.640035805794957, + "learning_rate": 8.069790086354217e-06, + "loss": 0.6071, + "step": 2772 + }, + { + "epoch": 1.0823575331772053, + "grad_norm": 0.6328758088872585, + "learning_rate": 8.06799724369681e-06, + "loss": 0.5926, + "step": 2773 + }, + { + "epoch": 1.0827478532396566, + "grad_norm": 0.6866857212755192, + "learning_rate": 8.066203768162639e-06, + "loss": 0.623, + "step": 2774 + }, + { + "epoch": 1.0831381733021077, + "grad_norm": 0.650739359749017, + "learning_rate": 8.064409660121668e-06, + "loss": 0.5706, + "step": 2775 + }, + { + "epoch": 1.083528493364559, + "grad_norm": 0.6334446951024988, + "learning_rate": 8.06261491994399e-06, + "loss": 0.617, + "step": 2776 + }, + { + "epoch": 1.0839188134270101, + "grad_norm": 0.6150387518278623, + "learning_rate": 8.060819547999832e-06, + "loss": 0.6229, + "step": 2777 + }, + { + "epoch": 1.0843091334894615, + "grad_norm": 0.6661147517727185, + "learning_rate": 8.059023544659546e-06, + "loss": 0.6178, + "step": 2778 + }, + { + "epoch": 1.0846994535519126, + "grad_norm": 0.6721391331641209, + "learning_rate": 8.057226910293619e-06, + "loss": 0.6174, + "step": 2779 + }, + { + "epoch": 1.0850897736143639, + "grad_norm": 0.6668623277008869, + "learning_rate": 8.055429645272669e-06, + "loss": 0.6028, + "step": 2780 + }, + { + "epoch": 1.085480093676815, + "grad_norm": 0.5456799842546958, + "learning_rate": 8.05363174996744e-06, + "loss": 0.5685, + "step": 2781 + }, + { + "epoch": 1.0858704137392663, + "grad_norm": 0.7047971631849429, + "learning_rate": 8.051833224748808e-06, + "loss": 0.6068, + "step": 2782 + }, + { + "epoch": 1.0862607338017174, + "grad_norm": 0.7359881655449445, + "learning_rate": 8.050034069987778e-06, + "loss": 0.6139, + "step": 2783 + }, + { + "epoch": 1.0866510538641687, + "grad_norm": 0.6407664120976159, + "learning_rate": 8.048234286055488e-06, + "loss": 0.6118, + "step": 2784 + }, + { + "epoch": 1.0870413739266198, + "grad_norm": 0.6792210194610642, + "learning_rate": 8.046433873323202e-06, + "loss": 0.5914, + "step": 2785 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.5367922193990075, + "learning_rate": 8.044632832162315e-06, + "loss": 0.5474, + "step": 2786 + }, + { + "epoch": 1.0878220140515222, + "grad_norm": 0.6291963576507155, + "learning_rate": 8.042831162944352e-06, + "loss": 0.6141, + "step": 2787 + }, + { + "epoch": 1.0882123341139736, + "grad_norm": 0.6796776193313235, + "learning_rate": 8.041028866040966e-06, + "loss": 0.6375, + "step": 2788 + }, + { + "epoch": 1.0886026541764247, + "grad_norm": 0.5777297167062536, + "learning_rate": 8.039225941823945e-06, + "loss": 0.5878, + "step": 2789 + }, + { + "epoch": 1.088992974238876, + "grad_norm": 0.5652488898250615, + "learning_rate": 8.037422390665197e-06, + "loss": 0.6235, + "step": 2790 + }, + { + "epoch": 1.089383294301327, + "grad_norm": 0.5498723708730333, + "learning_rate": 8.035618212936769e-06, + "loss": 0.5869, + "step": 2791 + }, + { + "epoch": 1.0897736143637784, + "grad_norm": 0.5922043616214855, + "learning_rate": 8.03381340901083e-06, + "loss": 0.6162, + "step": 2792 + }, + { + "epoch": 1.0901639344262295, + "grad_norm": 0.6631187959539592, + "learning_rate": 8.032007979259682e-06, + "loss": 0.6102, + "step": 2793 + }, + { + "epoch": 1.0905542544886808, + "grad_norm": 0.5598814470267901, + "learning_rate": 8.030201924055756e-06, + "loss": 0.5641, + "step": 2794 + }, + { + "epoch": 1.090944574551132, + "grad_norm": 0.5376234172626219, + "learning_rate": 8.028395243771609e-06, + "loss": 0.6016, + "step": 2795 + }, + { + "epoch": 1.0913348946135832, + "grad_norm": 0.5825277656852179, + "learning_rate": 8.02658793877993e-06, + "loss": 0.6076, + "step": 2796 + }, + { + "epoch": 1.0917252146760343, + "grad_norm": 0.653960161447144, + "learning_rate": 8.024780009453538e-06, + "loss": 0.6098, + "step": 2797 + }, + { + "epoch": 1.0921155347384857, + "grad_norm": 0.5820773525104211, + "learning_rate": 8.022971456165375e-06, + "loss": 0.5918, + "step": 2798 + }, + { + "epoch": 1.0925058548009368, + "grad_norm": 0.5655148967806489, + "learning_rate": 8.02116227928852e-06, + "loss": 0.5981, + "step": 2799 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.6415959563335083, + "learning_rate": 8.019352479196171e-06, + "loss": 0.63, + "step": 2800 + }, + { + "epoch": 1.0932864949258392, + "grad_norm": 0.5718196011672402, + "learning_rate": 8.017542056261662e-06, + "loss": 0.5832, + "step": 2801 + }, + { + "epoch": 1.0936768149882905, + "grad_norm": 0.5661827845774202, + "learning_rate": 8.015731010858456e-06, + "loss": 0.5838, + "step": 2802 + }, + { + "epoch": 1.0940671350507416, + "grad_norm": 0.5416388682132698, + "learning_rate": 8.013919343360136e-06, + "loss": 0.562, + "step": 2803 + }, + { + "epoch": 1.0944574551131927, + "grad_norm": 0.650732393393629, + "learning_rate": 8.012107054140423e-06, + "loss": 0.5752, + "step": 2804 + }, + { + "epoch": 1.094847775175644, + "grad_norm": 0.6803530978325353, + "learning_rate": 8.01029414357316e-06, + "loss": 0.6141, + "step": 2805 + }, + { + "epoch": 1.0952380952380953, + "grad_norm": 0.5634120786751005, + "learning_rate": 8.008480612032321e-06, + "loss": 0.5805, + "step": 2806 + }, + { + "epoch": 1.0956284153005464, + "grad_norm": 0.6230887342808001, + "learning_rate": 8.006666459892008e-06, + "loss": 0.6173, + "step": 2807 + }, + { + "epoch": 1.0960187353629975, + "grad_norm": 0.6447747613860877, + "learning_rate": 8.00485168752645e-06, + "loss": 0.6014, + "step": 2808 + }, + { + "epoch": 1.0964090554254489, + "grad_norm": 0.5532170720251071, + "learning_rate": 8.003036295310003e-06, + "loss": 0.6467, + "step": 2809 + }, + { + "epoch": 1.0967993754879002, + "grad_norm": 0.6564540864410617, + "learning_rate": 8.001220283617153e-06, + "loss": 0.6088, + "step": 2810 + }, + { + "epoch": 1.0971896955503513, + "grad_norm": 0.5804063608598337, + "learning_rate": 7.999403652822514e-06, + "loss": 0.5955, + "step": 2811 + }, + { + "epoch": 1.0975800156128024, + "grad_norm": 0.5579369418746841, + "learning_rate": 7.997586403300826e-06, + "loss": 0.572, + "step": 2812 + }, + { + "epoch": 1.0979703356752537, + "grad_norm": 0.6590116163080894, + "learning_rate": 7.995768535426956e-06, + "loss": 0.632, + "step": 2813 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 0.6815136198050101, + "learning_rate": 7.993950049575903e-06, + "loss": 0.6002, + "step": 2814 + }, + { + "epoch": 1.0987509758001561, + "grad_norm": 0.6434805322258087, + "learning_rate": 7.992130946122786e-06, + "loss": 0.6079, + "step": 2815 + }, + { + "epoch": 1.0991412958626072, + "grad_norm": 0.6684586778516908, + "learning_rate": 7.990311225442861e-06, + "loss": 0.5873, + "step": 2816 + }, + { + "epoch": 1.0995316159250585, + "grad_norm": 0.6074421680716695, + "learning_rate": 7.988490887911502e-06, + "loss": 0.5533, + "step": 2817 + }, + { + "epoch": 1.0999219359875099, + "grad_norm": 0.5232297995007065, + "learning_rate": 7.986669933904217e-06, + "loss": 0.6176, + "step": 2818 + }, + { + "epoch": 1.100312256049961, + "grad_norm": 0.5771103474391668, + "learning_rate": 7.984848363796636e-06, + "loss": 0.5792, + "step": 2819 + }, + { + "epoch": 1.100702576112412, + "grad_norm": 0.6994584652802065, + "learning_rate": 7.98302617796452e-06, + "loss": 0.6251, + "step": 2820 + }, + { + "epoch": 1.1010928961748634, + "grad_norm": 0.5977854824852916, + "learning_rate": 7.981203376783756e-06, + "loss": 0.6194, + "step": 2821 + }, + { + "epoch": 1.1014832162373147, + "grad_norm": 0.5629237874563607, + "learning_rate": 7.979379960630356e-06, + "loss": 0.656, + "step": 2822 + }, + { + "epoch": 1.1018735362997658, + "grad_norm": 0.5810626767648026, + "learning_rate": 7.97755592988046e-06, + "loss": 0.5871, + "step": 2823 + }, + { + "epoch": 1.102263856362217, + "grad_norm": 0.659437903021274, + "learning_rate": 7.97573128491034e-06, + "loss": 0.6281, + "step": 2824 + }, + { + "epoch": 1.1026541764246682, + "grad_norm": 0.5357192424558075, + "learning_rate": 7.973906026096385e-06, + "loss": 0.5991, + "step": 2825 + }, + { + "epoch": 1.1030444964871196, + "grad_norm": 0.7168115168663394, + "learning_rate": 7.972080153815117e-06, + "loss": 0.6061, + "step": 2826 + }, + { + "epoch": 1.1034348165495707, + "grad_norm": 0.5679333109623892, + "learning_rate": 7.970253668443183e-06, + "loss": 0.6049, + "step": 2827 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.661998549888375, + "learning_rate": 7.968426570357354e-06, + "loss": 0.5957, + "step": 2828 + }, + { + "epoch": 1.104215456674473, + "grad_norm": 0.6173449878884489, + "learning_rate": 7.966598859934535e-06, + "loss": 0.5898, + "step": 2829 + }, + { + "epoch": 1.1046057767369244, + "grad_norm": 0.6647310760399378, + "learning_rate": 7.964770537551748e-06, + "loss": 0.6152, + "step": 2830 + }, + { + "epoch": 1.1049960967993755, + "grad_norm": 0.6105801882237882, + "learning_rate": 7.96294160358615e-06, + "loss": 0.6289, + "step": 2831 + }, + { + "epoch": 1.1053864168618266, + "grad_norm": 0.6062285760120447, + "learning_rate": 7.961112058415014e-06, + "loss": 0.6228, + "step": 2832 + }, + { + "epoch": 1.105776736924278, + "grad_norm": 0.6460573829066623, + "learning_rate": 7.959281902415749e-06, + "loss": 0.597, + "step": 2833 + }, + { + "epoch": 1.1061670569867292, + "grad_norm": 0.6535250656370987, + "learning_rate": 7.957451135965885e-06, + "loss": 0.6498, + "step": 2834 + }, + { + "epoch": 1.1065573770491803, + "grad_norm": 0.5756029721579315, + "learning_rate": 7.955619759443077e-06, + "loss": 0.5862, + "step": 2835 + }, + { + "epoch": 1.1069476971116314, + "grad_norm": 0.6554613848689752, + "learning_rate": 7.953787773225108e-06, + "loss": 0.6083, + "step": 2836 + }, + { + "epoch": 1.1073380171740828, + "grad_norm": 0.6666087001956194, + "learning_rate": 7.951955177689887e-06, + "loss": 0.5982, + "step": 2837 + }, + { + "epoch": 1.1077283372365339, + "grad_norm": 0.5016012075696646, + "learning_rate": 7.950121973215448e-06, + "loss": 0.5981, + "step": 2838 + }, + { + "epoch": 1.1081186572989852, + "grad_norm": 0.6617294265624267, + "learning_rate": 7.94828816017995e-06, + "loss": 0.6159, + "step": 2839 + }, + { + "epoch": 1.1085089773614363, + "grad_norm": 0.6981093334693143, + "learning_rate": 7.946453738961676e-06, + "loss": 0.583, + "step": 2840 + }, + { + "epoch": 1.1088992974238876, + "grad_norm": 0.5499980922995117, + "learning_rate": 7.944618709939041e-06, + "loss": 0.6235, + "step": 2841 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 0.5784897648324265, + "learning_rate": 7.942783073490579e-06, + "loss": 0.5943, + "step": 2842 + }, + { + "epoch": 1.10967993754879, + "grad_norm": 0.5811808387136612, + "learning_rate": 7.940946829994949e-06, + "loss": 0.606, + "step": 2843 + }, + { + "epoch": 1.1100702576112411, + "grad_norm": 0.5881595194728506, + "learning_rate": 7.939109979830942e-06, + "loss": 0.6186, + "step": 2844 + }, + { + "epoch": 1.1104605776736924, + "grad_norm": 0.5748311430696554, + "learning_rate": 7.937272523377465e-06, + "loss": 0.6407, + "step": 2845 + }, + { + "epoch": 1.1108508977361435, + "grad_norm": 0.6023422503893084, + "learning_rate": 7.935434461013558e-06, + "loss": 0.6032, + "step": 2846 + }, + { + "epoch": 1.1112412177985949, + "grad_norm": 0.5396360467779491, + "learning_rate": 7.933595793118381e-06, + "loss": 0.5831, + "step": 2847 + }, + { + "epoch": 1.111631537861046, + "grad_norm": 0.6222283601747461, + "learning_rate": 7.931756520071222e-06, + "loss": 0.5856, + "step": 2848 + }, + { + "epoch": 1.1120218579234973, + "grad_norm": 0.6721975043029943, + "learning_rate": 7.92991664225149e-06, + "loss": 0.619, + "step": 2849 + }, + { + "epoch": 1.1124121779859484, + "grad_norm": 0.6103126924064243, + "learning_rate": 7.928076160038722e-06, + "loss": 0.5797, + "step": 2850 + }, + { + "epoch": 1.1128024980483997, + "grad_norm": 0.7462803891506685, + "learning_rate": 7.926235073812583e-06, + "loss": 0.6241, + "step": 2851 + }, + { + "epoch": 1.1131928181108508, + "grad_norm": 0.6290311138561433, + "learning_rate": 7.924393383952851e-06, + "loss": 0.5937, + "step": 2852 + }, + { + "epoch": 1.1135831381733021, + "grad_norm": 0.6215204373563228, + "learning_rate": 7.922551090839444e-06, + "loss": 0.6042, + "step": 2853 + }, + { + "epoch": 1.1139734582357532, + "grad_norm": 0.655613470712437, + "learning_rate": 7.920708194852388e-06, + "loss": 0.6306, + "step": 2854 + }, + { + "epoch": 1.1143637782982045, + "grad_norm": 0.5801789571963822, + "learning_rate": 7.918864696371849e-06, + "loss": 0.5998, + "step": 2855 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.5948864639026573, + "learning_rate": 7.917020595778105e-06, + "loss": 0.5873, + "step": 2856 + }, + { + "epoch": 1.115144418423107, + "grad_norm": 0.6636384528479615, + "learning_rate": 7.915175893451567e-06, + "loss": 0.5969, + "step": 2857 + }, + { + "epoch": 1.115534738485558, + "grad_norm": 0.5576712383910929, + "learning_rate": 7.913330589772763e-06, + "loss": 0.5972, + "step": 2858 + }, + { + "epoch": 1.1159250585480094, + "grad_norm": 0.6568177105774811, + "learning_rate": 7.911484685122352e-06, + "loss": 0.6007, + "step": 2859 + }, + { + "epoch": 1.1163153786104605, + "grad_norm": 0.6485695366514186, + "learning_rate": 7.909638179881111e-06, + "loss": 0.6233, + "step": 2860 + }, + { + "epoch": 1.1167056986729118, + "grad_norm": 0.6109015655940186, + "learning_rate": 7.907791074429943e-06, + "loss": 0.6319, + "step": 2861 + }, + { + "epoch": 1.117096018735363, + "grad_norm": 0.7378241432049031, + "learning_rate": 7.905943369149878e-06, + "loss": 0.6128, + "step": 2862 + }, + { + "epoch": 1.1174863387978142, + "grad_norm": 0.6765122741969181, + "learning_rate": 7.904095064422062e-06, + "loss": 0.5866, + "step": 2863 + }, + { + "epoch": 1.1178766588602653, + "grad_norm": 0.759407848606515, + "learning_rate": 7.902246160627775e-06, + "loss": 0.6335, + "step": 2864 + }, + { + "epoch": 1.1182669789227166, + "grad_norm": 0.7150027024009924, + "learning_rate": 7.90039665814841e-06, + "loss": 0.6166, + "step": 2865 + }, + { + "epoch": 1.1186572989851677, + "grad_norm": 0.5661999263407718, + "learning_rate": 7.898546557365492e-06, + "loss": 0.6275, + "step": 2866 + }, + { + "epoch": 1.119047619047619, + "grad_norm": 0.7084999253362061, + "learning_rate": 7.896695858660666e-06, + "loss": 0.5962, + "step": 2867 + }, + { + "epoch": 1.1194379391100702, + "grad_norm": 0.6592842129967112, + "learning_rate": 7.894844562415699e-06, + "loss": 0.5849, + "step": 2868 + }, + { + "epoch": 1.1198282591725215, + "grad_norm": 0.639655910286868, + "learning_rate": 7.892992669012482e-06, + "loss": 0.5805, + "step": 2869 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.5926976017527374, + "learning_rate": 7.891140178833032e-06, + "loss": 0.6057, + "step": 2870 + }, + { + "epoch": 1.120608899297424, + "grad_norm": 0.571518483898804, + "learning_rate": 7.889287092259484e-06, + "loss": 0.602, + "step": 2871 + }, + { + "epoch": 1.120999219359875, + "grad_norm": 0.7090765615207146, + "learning_rate": 7.8874334096741e-06, + "loss": 0.6072, + "step": 2872 + }, + { + "epoch": 1.1213895394223263, + "grad_norm": 0.5423856224455847, + "learning_rate": 7.885579131459268e-06, + "loss": 0.5688, + "step": 2873 + }, + { + "epoch": 1.1217798594847774, + "grad_norm": 0.5971224571897814, + "learning_rate": 7.883724257997489e-06, + "loss": 0.6188, + "step": 2874 + }, + { + "epoch": 1.1221701795472288, + "grad_norm": 0.5880238269242107, + "learning_rate": 7.881868789671394e-06, + "loss": 0.5828, + "step": 2875 + }, + { + "epoch": 1.1225604996096799, + "grad_norm": 0.6089165980447081, + "learning_rate": 7.880012726863736e-06, + "loss": 0.5928, + "step": 2876 + }, + { + "epoch": 1.1229508196721312, + "grad_norm": 0.6136779465563043, + "learning_rate": 7.87815606995739e-06, + "loss": 0.6112, + "step": 2877 + }, + { + "epoch": 1.1233411397345823, + "grad_norm": 0.5698086176376378, + "learning_rate": 7.876298819335353e-06, + "loss": 0.6058, + "step": 2878 + }, + { + "epoch": 1.1237314597970336, + "grad_norm": 0.7491092506210999, + "learning_rate": 7.874440975380746e-06, + "loss": 0.614, + "step": 2879 + }, + { + "epoch": 1.1241217798594847, + "grad_norm": 0.5416936628872654, + "learning_rate": 7.872582538476809e-06, + "loss": 0.6518, + "step": 2880 + }, + { + "epoch": 1.124512099921936, + "grad_norm": 0.6178492303114929, + "learning_rate": 7.870723509006908e-06, + "loss": 0.6429, + "step": 2881 + }, + { + "epoch": 1.1249024199843871, + "grad_norm": 0.5954245737999687, + "learning_rate": 7.86886388735453e-06, + "loss": 0.5858, + "step": 2882 + }, + { + "epoch": 1.1252927400468384, + "grad_norm": 0.5907466401101079, + "learning_rate": 7.867003673903285e-06, + "loss": 0.5781, + "step": 2883 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 0.6231886943575523, + "learning_rate": 7.865142869036902e-06, + "loss": 0.5657, + "step": 2884 + }, + { + "epoch": 1.1260733801717409, + "grad_norm": 0.599437028693805, + "learning_rate": 7.863281473139233e-06, + "loss": 0.5717, + "step": 2885 + }, + { + "epoch": 1.126463700234192, + "grad_norm": 0.5574363908506685, + "learning_rate": 7.861419486594258e-06, + "loss": 0.6084, + "step": 2886 + }, + { + "epoch": 1.1268540202966433, + "grad_norm": 0.5294768588814427, + "learning_rate": 7.859556909786067e-06, + "loss": 0.5799, + "step": 2887 + }, + { + "epoch": 1.1272443403590944, + "grad_norm": 0.5547378363087058, + "learning_rate": 7.857693743098886e-06, + "loss": 0.5642, + "step": 2888 + }, + { + "epoch": 1.1276346604215457, + "grad_norm": 0.5916363202545925, + "learning_rate": 7.85582998691705e-06, + "loss": 0.5861, + "step": 2889 + }, + { + "epoch": 1.1280249804839968, + "grad_norm": 0.5658518525830526, + "learning_rate": 7.85396564162502e-06, + "loss": 0.5783, + "step": 2890 + }, + { + "epoch": 1.1284153005464481, + "grad_norm": 0.5725593171003198, + "learning_rate": 7.852100707607386e-06, + "loss": 0.6248, + "step": 2891 + }, + { + "epoch": 1.1288056206088992, + "grad_norm": 0.5632478483003014, + "learning_rate": 7.850235185248847e-06, + "loss": 0.6162, + "step": 2892 + }, + { + "epoch": 1.1291959406713505, + "grad_norm": 0.6362456856657538, + "learning_rate": 7.848369074934233e-06, + "loss": 0.628, + "step": 2893 + }, + { + "epoch": 1.1295862607338016, + "grad_norm": 0.5749406877361197, + "learning_rate": 7.846502377048486e-06, + "loss": 0.5926, + "step": 2894 + }, + { + "epoch": 1.129976580796253, + "grad_norm": 0.5959363005802144, + "learning_rate": 7.84463509197668e-06, + "loss": 0.614, + "step": 2895 + }, + { + "epoch": 1.130366900858704, + "grad_norm": 0.5652175577574119, + "learning_rate": 7.842767220104002e-06, + "loss": 0.6272, + "step": 2896 + }, + { + "epoch": 1.1307572209211554, + "grad_norm": 0.6563449480814958, + "learning_rate": 7.840898761815765e-06, + "loss": 0.6185, + "step": 2897 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.5612560767609178, + "learning_rate": 7.839029717497398e-06, + "loss": 0.6144, + "step": 2898 + }, + { + "epoch": 1.1315378610460578, + "grad_norm": 0.6340263853459938, + "learning_rate": 7.837160087534457e-06, + "loss": 0.6447, + "step": 2899 + }, + { + "epoch": 1.131928181108509, + "grad_norm": 0.6086370136257476, + "learning_rate": 7.835289872312613e-06, + "loss": 0.5874, + "step": 2900 + }, + { + "epoch": 1.1323185011709602, + "grad_norm": 0.6401356526929214, + "learning_rate": 7.833419072217662e-06, + "loss": 0.6136, + "step": 2901 + }, + { + "epoch": 1.1327088212334113, + "grad_norm": 0.6258829194508927, + "learning_rate": 7.831547687635517e-06, + "loss": 0.6075, + "step": 2902 + }, + { + "epoch": 1.1330991412958626, + "grad_norm": 0.642122607616973, + "learning_rate": 7.829675718952215e-06, + "loss": 0.6181, + "step": 2903 + }, + { + "epoch": 1.1334894613583137, + "grad_norm": 0.5727740397391565, + "learning_rate": 7.827803166553913e-06, + "loss": 0.6362, + "step": 2904 + }, + { + "epoch": 1.133879781420765, + "grad_norm": 0.6395288182437557, + "learning_rate": 7.825930030826884e-06, + "loss": 0.621, + "step": 2905 + }, + { + "epoch": 1.1342701014832162, + "grad_norm": 0.5821759487627737, + "learning_rate": 7.824056312157528e-06, + "loss": 0.5769, + "step": 2906 + }, + { + "epoch": 1.1346604215456675, + "grad_norm": 0.5459189204516255, + "learning_rate": 7.82218201093236e-06, + "loss": 0.5721, + "step": 2907 + }, + { + "epoch": 1.1350507416081186, + "grad_norm": 0.5263387124227344, + "learning_rate": 7.820307127538018e-06, + "loss": 0.5894, + "step": 2908 + }, + { + "epoch": 1.13544106167057, + "grad_norm": 0.7376790611205345, + "learning_rate": 7.818431662361262e-06, + "loss": 0.5564, + "step": 2909 + }, + { + "epoch": 1.135831381733021, + "grad_norm": 0.5575924910217647, + "learning_rate": 7.81655561578896e-06, + "loss": 0.6039, + "step": 2910 + }, + { + "epoch": 1.1362217017954723, + "grad_norm": 0.6226137695040205, + "learning_rate": 7.81467898820812e-06, + "loss": 0.6201, + "step": 2911 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 0.6660698019690484, + "learning_rate": 7.812801780005855e-06, + "loss": 0.6099, + "step": 2912 + }, + { + "epoch": 1.1370023419203747, + "grad_norm": 0.6087812911009365, + "learning_rate": 7.810923991569399e-06, + "loss": 0.601, + "step": 2913 + }, + { + "epoch": 1.1373926619828258, + "grad_norm": 0.5665945862650201, + "learning_rate": 7.809045623286112e-06, + "loss": 0.5569, + "step": 2914 + }, + { + "epoch": 1.1377829820452772, + "grad_norm": 0.573564883855874, + "learning_rate": 7.807166675543469e-06, + "loss": 0.5896, + "step": 2915 + }, + { + "epoch": 1.1381733021077283, + "grad_norm": 0.5915515545280341, + "learning_rate": 7.805287148729064e-06, + "loss": 0.6284, + "step": 2916 + }, + { + "epoch": 1.1385636221701796, + "grad_norm": 0.5424159541612685, + "learning_rate": 7.803407043230617e-06, + "loss": 0.6092, + "step": 2917 + }, + { + "epoch": 1.1389539422326307, + "grad_norm": 0.6431156243144914, + "learning_rate": 7.801526359435957e-06, + "loss": 0.6173, + "step": 2918 + }, + { + "epoch": 1.139344262295082, + "grad_norm": 0.6082001368620293, + "learning_rate": 7.79964509773304e-06, + "loss": 0.5737, + "step": 2919 + }, + { + "epoch": 1.139734582357533, + "grad_norm": 0.6051207926638577, + "learning_rate": 7.797763258509938e-06, + "loss": 0.6209, + "step": 2920 + }, + { + "epoch": 1.1401249024199844, + "grad_norm": 0.6037149725861797, + "learning_rate": 7.795880842154845e-06, + "loss": 0.5535, + "step": 2921 + }, + { + "epoch": 1.1405152224824355, + "grad_norm": 0.6763789969261352, + "learning_rate": 7.793997849056072e-06, + "loss": 0.5888, + "step": 2922 + }, + { + "epoch": 1.1409055425448869, + "grad_norm": 0.6003009515361875, + "learning_rate": 7.792114279602048e-06, + "loss": 0.5927, + "step": 2923 + }, + { + "epoch": 1.141295862607338, + "grad_norm": 0.6883758984191279, + "learning_rate": 7.79023013418132e-06, + "loss": 0.5983, + "step": 2924 + }, + { + "epoch": 1.1416861826697893, + "grad_norm": 0.7738471673614025, + "learning_rate": 7.788345413182561e-06, + "loss": 0.6266, + "step": 2925 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 0.5775557589887438, + "learning_rate": 7.786460116994554e-06, + "loss": 0.5993, + "step": 2926 + }, + { + "epoch": 1.1424668227946917, + "grad_norm": 0.5371955766347614, + "learning_rate": 7.784574246006205e-06, + "loss": 0.5838, + "step": 2927 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.6901466866341217, + "learning_rate": 7.782687800606538e-06, + "loss": 0.5678, + "step": 2928 + }, + { + "epoch": 1.1432474629195941, + "grad_norm": 0.6598583256022627, + "learning_rate": 7.780800781184694e-06, + "loss": 0.5907, + "step": 2929 + }, + { + "epoch": 1.1436377829820452, + "grad_norm": 0.5806456557609528, + "learning_rate": 7.778913188129936e-06, + "loss": 0.5992, + "step": 2930 + }, + { + "epoch": 1.1440281030444965, + "grad_norm": 0.53112681105482, + "learning_rate": 7.777025021831641e-06, + "loss": 0.5968, + "step": 2931 + }, + { + "epoch": 1.1444184231069476, + "grad_norm": 0.6837886317477885, + "learning_rate": 7.775136282679307e-06, + "loss": 0.5766, + "step": 2932 + }, + { + "epoch": 1.144808743169399, + "grad_norm": 0.5667698378971207, + "learning_rate": 7.77324697106255e-06, + "loss": 0.6057, + "step": 2933 + }, + { + "epoch": 1.14519906323185, + "grad_norm": 0.5509529252784973, + "learning_rate": 7.771357087371103e-06, + "loss": 0.5984, + "step": 2934 + }, + { + "epoch": 1.1455893832943014, + "grad_norm": 0.6978123376825518, + "learning_rate": 7.769466631994817e-06, + "loss": 0.6009, + "step": 2935 + }, + { + "epoch": 1.1459797033567525, + "grad_norm": 0.56401068441855, + "learning_rate": 7.767575605323663e-06, + "loss": 0.6338, + "step": 2936 + }, + { + "epoch": 1.1463700234192038, + "grad_norm": 0.5703682493625982, + "learning_rate": 7.765684007747726e-06, + "loss": 0.6458, + "step": 2937 + }, + { + "epoch": 1.146760343481655, + "grad_norm": 0.7038036332109454, + "learning_rate": 7.763791839657214e-06, + "loss": 0.5954, + "step": 2938 + }, + { + "epoch": 1.1471506635441062, + "grad_norm": 0.6458939263533094, + "learning_rate": 7.761899101442448e-06, + "loss": 0.6422, + "step": 2939 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.5996295228254075, + "learning_rate": 7.760005793493867e-06, + "loss": 0.6204, + "step": 2940 + }, + { + "epoch": 1.1479313036690086, + "grad_norm": 0.5944855067944514, + "learning_rate": 7.758111916202033e-06, + "loss": 0.612, + "step": 2941 + }, + { + "epoch": 1.1483216237314597, + "grad_norm": 0.6707219526835076, + "learning_rate": 7.756217469957618e-06, + "loss": 0.6224, + "step": 2942 + }, + { + "epoch": 1.148711943793911, + "grad_norm": 0.6742952042089724, + "learning_rate": 7.754322455151416e-06, + "loss": 0.6306, + "step": 2943 + }, + { + "epoch": 1.1491022638563622, + "grad_norm": 0.6224003048521553, + "learning_rate": 7.752426872174337e-06, + "loss": 0.6129, + "step": 2944 + }, + { + "epoch": 1.1494925839188135, + "grad_norm": 0.552564561918056, + "learning_rate": 7.750530721417406e-06, + "loss": 0.5903, + "step": 2945 + }, + { + "epoch": 1.1498829039812646, + "grad_norm": 0.6108661639311159, + "learning_rate": 7.74863400327177e-06, + "loss": 0.6176, + "step": 2946 + }, + { + "epoch": 1.150273224043716, + "grad_norm": 0.6370147160583662, + "learning_rate": 7.74673671812869e-06, + "loss": 0.5862, + "step": 2947 + }, + { + "epoch": 1.150663544106167, + "grad_norm": 0.6224018099298814, + "learning_rate": 7.744838866379545e-06, + "loss": 0.5675, + "step": 2948 + }, + { + "epoch": 1.1510538641686183, + "grad_norm": 0.6168241537908796, + "learning_rate": 7.742940448415828e-06, + "loss": 0.5876, + "step": 2949 + }, + { + "epoch": 1.1514441842310694, + "grad_norm": 0.7980009619170124, + "learning_rate": 7.74104146462915e-06, + "loss": 0.6063, + "step": 2950 + }, + { + "epoch": 1.1518345042935207, + "grad_norm": 0.5780416149782285, + "learning_rate": 7.739141915411246e-06, + "loss": 0.6377, + "step": 2951 + }, + { + "epoch": 1.1522248243559718, + "grad_norm": 0.6330148730080103, + "learning_rate": 7.737241801153955e-06, + "loss": 0.6161, + "step": 2952 + }, + { + "epoch": 1.1526151444184232, + "grad_norm": 0.6872427334159431, + "learning_rate": 7.735341122249242e-06, + "loss": 0.6192, + "step": 2953 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.682808699320093, + "learning_rate": 7.733439879089187e-06, + "loss": 0.6101, + "step": 2954 + }, + { + "epoch": 1.1533957845433256, + "grad_norm": 0.6185912882475079, + "learning_rate": 7.731538072065978e-06, + "loss": 0.6263, + "step": 2955 + }, + { + "epoch": 1.1537861046057767, + "grad_norm": 0.7467075658743008, + "learning_rate": 7.729635701571934e-06, + "loss": 0.5984, + "step": 2956 + }, + { + "epoch": 1.154176424668228, + "grad_norm": 0.5930244169461844, + "learning_rate": 7.727732767999479e-06, + "loss": 0.589, + "step": 2957 + }, + { + "epoch": 1.154566744730679, + "grad_norm": 0.631123857483374, + "learning_rate": 7.725829271741154e-06, + "loss": 0.5935, + "step": 2958 + }, + { + "epoch": 1.1549570647931304, + "grad_norm": 0.6087711598084385, + "learning_rate": 7.723925213189623e-06, + "loss": 0.5972, + "step": 2959 + }, + { + "epoch": 1.1553473848555815, + "grad_norm": 0.6382003011200008, + "learning_rate": 7.72202059273766e-06, + "loss": 0.6444, + "step": 2960 + }, + { + "epoch": 1.1557377049180328, + "grad_norm": 0.7425374914106692, + "learning_rate": 7.720115410778155e-06, + "loss": 0.5881, + "step": 2961 + }, + { + "epoch": 1.156128024980484, + "grad_norm": 0.6518250185891016, + "learning_rate": 7.718209667704117e-06, + "loss": 0.5792, + "step": 2962 + }, + { + "epoch": 1.1565183450429353, + "grad_norm": 0.630097641306336, + "learning_rate": 7.716303363908669e-06, + "loss": 0.6145, + "step": 2963 + }, + { + "epoch": 1.1569086651053864, + "grad_norm": 0.6139975407403035, + "learning_rate": 7.71439649978505e-06, + "loss": 0.5902, + "step": 2964 + }, + { + "epoch": 1.1572989851678377, + "grad_norm": 0.6383320067581476, + "learning_rate": 7.712489075726612e-06, + "loss": 0.5987, + "step": 2965 + }, + { + "epoch": 1.1576893052302888, + "grad_norm": 0.597764781521976, + "learning_rate": 7.71058109212683e-06, + "loss": 0.5909, + "step": 2966 + }, + { + "epoch": 1.1580796252927401, + "grad_norm": 0.5203327765679054, + "learning_rate": 7.708672549379281e-06, + "loss": 0.5899, + "step": 2967 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 0.6068750112436492, + "learning_rate": 7.706763447877674e-06, + "loss": 0.6026, + "step": 2968 + }, + { + "epoch": 1.1588602654176425, + "grad_norm": 0.583817014173765, + "learning_rate": 7.704853788015821e-06, + "loss": 0.6234, + "step": 2969 + }, + { + "epoch": 1.1592505854800936, + "grad_norm": 0.6396204664776356, + "learning_rate": 7.702943570187652e-06, + "loss": 0.5953, + "step": 2970 + }, + { + "epoch": 1.159640905542545, + "grad_norm": 0.6432423254871711, + "learning_rate": 7.701032794787214e-06, + "loss": 0.5984, + "step": 2971 + }, + { + "epoch": 1.160031225604996, + "grad_norm": 0.590412004692754, + "learning_rate": 7.699121462208669e-06, + "loss": 0.5954, + "step": 2972 + }, + { + "epoch": 1.1604215456674474, + "grad_norm": 0.6308359155606329, + "learning_rate": 7.697209572846295e-06, + "loss": 0.6053, + "step": 2973 + }, + { + "epoch": 1.1608118657298985, + "grad_norm": 0.5245242233970736, + "learning_rate": 7.695297127094477e-06, + "loss": 0.6229, + "step": 2974 + }, + { + "epoch": 1.1612021857923498, + "grad_norm": 0.5655935740025233, + "learning_rate": 7.693384125347726e-06, + "loss": 0.6281, + "step": 2975 + }, + { + "epoch": 1.161592505854801, + "grad_norm": 0.5284481125537341, + "learning_rate": 7.691470568000662e-06, + "loss": 0.5754, + "step": 2976 + }, + { + "epoch": 1.1619828259172522, + "grad_norm": 0.6075095671459311, + "learning_rate": 7.689556455448018e-06, + "loss": 0.5981, + "step": 2977 + }, + { + "epoch": 1.1623731459797033, + "grad_norm": 0.6129487284212596, + "learning_rate": 7.687641788084646e-06, + "loss": 0.6089, + "step": 2978 + }, + { + "epoch": 1.1627634660421546, + "grad_norm": 0.5549425878416759, + "learning_rate": 7.685726566305506e-06, + "loss": 0.5928, + "step": 2979 + }, + { + "epoch": 1.1631537861046057, + "grad_norm": 0.7111972543723238, + "learning_rate": 7.68381079050568e-06, + "loss": 0.5965, + "step": 2980 + }, + { + "epoch": 1.163544106167057, + "grad_norm": 0.7561829408358823, + "learning_rate": 7.681894461080357e-06, + "loss": 0.6019, + "step": 2981 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.5898304416158981, + "learning_rate": 7.679977578424849e-06, + "loss": 0.6374, + "step": 2982 + }, + { + "epoch": 1.1643247462919595, + "grad_norm": 0.6260769128076517, + "learning_rate": 7.678060142934575e-06, + "loss": 0.616, + "step": 2983 + }, + { + "epoch": 1.1647150663544106, + "grad_norm": 0.6073483899554967, + "learning_rate": 7.676142155005067e-06, + "loss": 0.5726, + "step": 2984 + }, + { + "epoch": 1.165105386416862, + "grad_norm": 0.6327169828610186, + "learning_rate": 7.674223615031976e-06, + "loss": 0.6108, + "step": 2985 + }, + { + "epoch": 1.165495706479313, + "grad_norm": 0.6419788262195614, + "learning_rate": 7.672304523411067e-06, + "loss": 0.609, + "step": 2986 + }, + { + "epoch": 1.1658860265417643, + "grad_norm": 0.8517836316711944, + "learning_rate": 7.670384880538215e-06, + "loss": 0.5746, + "step": 2987 + }, + { + "epoch": 1.1662763466042154, + "grad_norm": 0.5616085626943554, + "learning_rate": 7.668464686809409e-06, + "loss": 0.5986, + "step": 2988 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.6879980450235788, + "learning_rate": 7.666543942620754e-06, + "loss": 0.5869, + "step": 2989 + }, + { + "epoch": 1.1670569867291178, + "grad_norm": 0.8657759238078335, + "learning_rate": 7.664622648368469e-06, + "loss": 0.5745, + "step": 2990 + }, + { + "epoch": 1.1674473067915692, + "grad_norm": 0.6704080202658077, + "learning_rate": 7.662700804448885e-06, + "loss": 0.6047, + "step": 2991 + }, + { + "epoch": 1.1678376268540203, + "grad_norm": 0.5849535706335038, + "learning_rate": 7.660778411258443e-06, + "loss": 0.6234, + "step": 2992 + }, + { + "epoch": 1.1682279469164716, + "grad_norm": 0.6809611332108721, + "learning_rate": 7.658855469193704e-06, + "loss": 0.5819, + "step": 2993 + }, + { + "epoch": 1.1686182669789227, + "grad_norm": 0.5640137813361673, + "learning_rate": 7.656931978651336e-06, + "loss": 0.6207, + "step": 2994 + }, + { + "epoch": 1.169008587041374, + "grad_norm": 0.6775238153100882, + "learning_rate": 7.655007940028127e-06, + "loss": 0.615, + "step": 2995 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.663262478755297, + "learning_rate": 7.65308335372097e-06, + "loss": 0.583, + "step": 2996 + }, + { + "epoch": 1.1697892271662764, + "grad_norm": 0.6521818605518619, + "learning_rate": 7.651158220126879e-06, + "loss": 0.5625, + "step": 2997 + }, + { + "epoch": 1.1701795472287275, + "grad_norm": 0.6489471712740348, + "learning_rate": 7.649232539642975e-06, + "loss": 0.6359, + "step": 2998 + }, + { + "epoch": 1.1705698672911788, + "grad_norm": 0.6237404745358796, + "learning_rate": 7.647306312666493e-06, + "loss": 0.6089, + "step": 2999 + }, + { + "epoch": 1.17096018735363, + "grad_norm": 0.5923011482446896, + "learning_rate": 7.645379539594784e-06, + "loss": 0.597, + "step": 3000 + }, + { + "epoch": 1.1713505074160813, + "grad_norm": 0.6238671656436513, + "learning_rate": 7.643452220825307e-06, + "loss": 0.6002, + "step": 3001 + }, + { + "epoch": 1.1717408274785324, + "grad_norm": 0.5582043011878092, + "learning_rate": 7.641524356755636e-06, + "loss": 0.5788, + "step": 3002 + }, + { + "epoch": 1.1721311475409837, + "grad_norm": 0.5154846483173354, + "learning_rate": 7.63959594778346e-06, + "loss": 0.6105, + "step": 3003 + }, + { + "epoch": 1.1725214676034348, + "grad_norm": 0.6403601523403984, + "learning_rate": 7.637666994306574e-06, + "loss": 0.6049, + "step": 3004 + }, + { + "epoch": 1.172911787665886, + "grad_norm": 0.5598059064505915, + "learning_rate": 7.63573749672289e-06, + "loss": 0.6183, + "step": 3005 + }, + { + "epoch": 1.1733021077283372, + "grad_norm": 0.7399718579872668, + "learning_rate": 7.633807455430433e-06, + "loss": 0.6298, + "step": 3006 + }, + { + "epoch": 1.1736924277907885, + "grad_norm": 0.5506369361119876, + "learning_rate": 7.631876870827338e-06, + "loss": 0.599, + "step": 3007 + }, + { + "epoch": 1.1740827478532396, + "grad_norm": 0.6034432300233499, + "learning_rate": 7.62994574331185e-06, + "loss": 0.6062, + "step": 3008 + }, + { + "epoch": 1.174473067915691, + "grad_norm": 0.5335487190743905, + "learning_rate": 7.628014073282331e-06, + "loss": 0.6196, + "step": 3009 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.5433329126586753, + "learning_rate": 7.626081861137251e-06, + "loss": 0.6157, + "step": 3010 + }, + { + "epoch": 1.1752537080405934, + "grad_norm": 0.5565620908474601, + "learning_rate": 7.624149107275194e-06, + "loss": 0.6369, + "step": 3011 + }, + { + "epoch": 1.1756440281030445, + "grad_norm": 0.5166606502782137, + "learning_rate": 7.622215812094853e-06, + "loss": 0.577, + "step": 3012 + }, + { + "epoch": 1.1760343481654958, + "grad_norm": 0.5742165828579158, + "learning_rate": 7.620281975995038e-06, + "loss": 0.5959, + "step": 3013 + }, + { + "epoch": 1.176424668227947, + "grad_norm": 0.589408972993451, + "learning_rate": 7.618347599374665e-06, + "loss": 0.6289, + "step": 3014 + }, + { + "epoch": 1.1768149882903982, + "grad_norm": 0.5172299056699102, + "learning_rate": 7.616412682632765e-06, + "loss": 0.5822, + "step": 3015 + }, + { + "epoch": 1.1772053083528493, + "grad_norm": 0.5231405077736289, + "learning_rate": 7.614477226168476e-06, + "loss": 0.5931, + "step": 3016 + }, + { + "epoch": 1.1775956284153006, + "grad_norm": 0.6258920991217487, + "learning_rate": 7.612541230381054e-06, + "loss": 0.606, + "step": 3017 + }, + { + "epoch": 1.1779859484777517, + "grad_norm": 0.5929744522770722, + "learning_rate": 7.610604695669862e-06, + "loss": 0.5926, + "step": 3018 + }, + { + "epoch": 1.178376268540203, + "grad_norm": 0.5437894546932514, + "learning_rate": 7.608667622434374e-06, + "loss": 0.6197, + "step": 3019 + }, + { + "epoch": 1.1787665886026542, + "grad_norm": 0.6261993465620281, + "learning_rate": 7.606730011074177e-06, + "loss": 0.6236, + "step": 3020 + }, + { + "epoch": 1.1791569086651055, + "grad_norm": 0.5578370523866865, + "learning_rate": 7.604791861988965e-06, + "loss": 0.5888, + "step": 3021 + }, + { + "epoch": 1.1795472287275566, + "grad_norm": 0.6215806819863187, + "learning_rate": 7.60285317557855e-06, + "loss": 0.6007, + "step": 3022 + }, + { + "epoch": 1.179937548790008, + "grad_norm": 0.5957604593368773, + "learning_rate": 7.6009139522428496e-06, + "loss": 0.6154, + "step": 3023 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.5671270578105111, + "learning_rate": 7.5989741923818916e-06, + "loss": 0.5989, + "step": 3024 + }, + { + "epoch": 1.1807181889149103, + "grad_norm": 0.5684694973630773, + "learning_rate": 7.597033896395816e-06, + "loss": 0.5785, + "step": 3025 + }, + { + "epoch": 1.1811085089773614, + "grad_norm": 0.5420539817350496, + "learning_rate": 7.595093064684877e-06, + "loss": 0.6281, + "step": 3026 + }, + { + "epoch": 1.1814988290398127, + "grad_norm": 0.5648845147511118, + "learning_rate": 7.593151697649432e-06, + "loss": 0.5986, + "step": 3027 + }, + { + "epoch": 1.1818891491022638, + "grad_norm": 0.558291024748444, + "learning_rate": 7.591209795689955e-06, + "loss": 0.6043, + "step": 3028 + }, + { + "epoch": 1.1822794691647152, + "grad_norm": 0.5620506897013345, + "learning_rate": 7.589267359207027e-06, + "loss": 0.6204, + "step": 3029 + }, + { + "epoch": 1.1826697892271663, + "grad_norm": 0.5299186145715798, + "learning_rate": 7.58732438860134e-06, + "loss": 0.5952, + "step": 3030 + }, + { + "epoch": 1.1830601092896176, + "grad_norm": 0.5140840184476753, + "learning_rate": 7.5853808842736975e-06, + "loss": 0.6079, + "step": 3031 + }, + { + "epoch": 1.1834504293520687, + "grad_norm": 0.6223644420073262, + "learning_rate": 7.583436846625013e-06, + "loss": 0.6064, + "step": 3032 + }, + { + "epoch": 1.18384074941452, + "grad_norm": 0.6815392765066843, + "learning_rate": 7.581492276056307e-06, + "loss": 0.609, + "step": 3033 + }, + { + "epoch": 1.184231069476971, + "grad_norm": 0.5460871495682947, + "learning_rate": 7.579547172968713e-06, + "loss": 0.6217, + "step": 3034 + }, + { + "epoch": 1.1846213895394224, + "grad_norm": 0.7354517493098623, + "learning_rate": 7.577601537763472e-06, + "loss": 0.6511, + "step": 3035 + }, + { + "epoch": 1.1850117096018735, + "grad_norm": 0.6559141395695551, + "learning_rate": 7.575655370841939e-06, + "loss": 0.6453, + "step": 3036 + }, + { + "epoch": 1.1854020296643248, + "grad_norm": 0.6504134899287973, + "learning_rate": 7.5737086726055745e-06, + "loss": 0.6372, + "step": 3037 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 0.6796314564155265, + "learning_rate": 7.5717614434559494e-06, + "loss": 0.595, + "step": 3038 + }, + { + "epoch": 1.1861826697892273, + "grad_norm": 0.7693903818502597, + "learning_rate": 7.5698136837947445e-06, + "loss": 0.6199, + "step": 3039 + }, + { + "epoch": 1.1865729898516784, + "grad_norm": 0.671648393600854, + "learning_rate": 7.567865394023751e-06, + "loss": 0.614, + "step": 3040 + }, + { + "epoch": 1.1869633099141297, + "grad_norm": 0.5241556400373468, + "learning_rate": 7.565916574544869e-06, + "loss": 0.6119, + "step": 3041 + }, + { + "epoch": 1.1873536299765808, + "grad_norm": 0.6647550910177492, + "learning_rate": 7.563967225760106e-06, + "loss": 0.579, + "step": 3042 + }, + { + "epoch": 1.187743950039032, + "grad_norm": 0.8418931748171886, + "learning_rate": 7.562017348071582e-06, + "loss": 0.6411, + "step": 3043 + }, + { + "epoch": 1.1881342701014832, + "grad_norm": 0.5611761081364592, + "learning_rate": 7.560066941881521e-06, + "loss": 0.6216, + "step": 3044 + }, + { + "epoch": 1.1885245901639343, + "grad_norm": 0.6785464344631902, + "learning_rate": 7.558116007592265e-06, + "loss": 0.5982, + "step": 3045 + }, + { + "epoch": 1.1889149102263856, + "grad_norm": 0.8549729560437903, + "learning_rate": 7.556164545606254e-06, + "loss": 0.6123, + "step": 3046 + }, + { + "epoch": 1.189305230288837, + "grad_norm": 0.7991919373488265, + "learning_rate": 7.554212556326044e-06, + "loss": 0.6018, + "step": 3047 + }, + { + "epoch": 1.189695550351288, + "grad_norm": 0.6701138172777193, + "learning_rate": 7.552260040154299e-06, + "loss": 0.6126, + "step": 3048 + }, + { + "epoch": 1.1900858704137391, + "grad_norm": 0.728843444361987, + "learning_rate": 7.550306997493789e-06, + "loss": 0.5748, + "step": 3049 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.6909025368529652, + "learning_rate": 7.548353428747394e-06, + "loss": 0.5895, + "step": 3050 + }, + { + "epoch": 1.1908665105386418, + "grad_norm": 0.6724345194465784, + "learning_rate": 7.546399334318106e-06, + "loss": 0.5993, + "step": 3051 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.5876511959439061, + "learning_rate": 7.544444714609017e-06, + "loss": 0.5937, + "step": 3052 + }, + { + "epoch": 1.191647150663544, + "grad_norm": 0.798759343516843, + "learning_rate": 7.542489570023337e-06, + "loss": 0.6057, + "step": 3053 + }, + { + "epoch": 1.1920374707259953, + "grad_norm": 0.7873619854901208, + "learning_rate": 7.540533900964377e-06, + "loss": 0.5924, + "step": 3054 + }, + { + "epoch": 1.1924277907884466, + "grad_norm": 0.5863519020582928, + "learning_rate": 7.5385777078355606e-06, + "loss": 0.5736, + "step": 3055 + }, + { + "epoch": 1.1928181108508977, + "grad_norm": 0.7912644323082935, + "learning_rate": 7.5366209910404174e-06, + "loss": 0.6017, + "step": 3056 + }, + { + "epoch": 1.1932084309133488, + "grad_norm": 0.5771403454167815, + "learning_rate": 7.5346637509825845e-06, + "loss": 0.5976, + "step": 3057 + }, + { + "epoch": 1.1935987509758001, + "grad_norm": 0.7183504767102175, + "learning_rate": 7.532705988065809e-06, + "loss": 0.5709, + "step": 3058 + }, + { + "epoch": 1.1939890710382515, + "grad_norm": 0.6732149388424877, + "learning_rate": 7.5307477026939455e-06, + "loss": 0.6219, + "step": 3059 + }, + { + "epoch": 1.1943793911007026, + "grad_norm": 0.5738251642521368, + "learning_rate": 7.528788895270953e-06, + "loss": 0.5908, + "step": 3060 + }, + { + "epoch": 1.1947697111631537, + "grad_norm": 0.6589202351013489, + "learning_rate": 7.526829566200904e-06, + "loss": 0.6107, + "step": 3061 + }, + { + "epoch": 1.195160031225605, + "grad_norm": 0.5952777931821104, + "learning_rate": 7.5248697158879735e-06, + "loss": 0.5691, + "step": 3062 + }, + { + "epoch": 1.1955503512880563, + "grad_norm": 0.5288749349599463, + "learning_rate": 7.522909344736447e-06, + "loss": 0.5756, + "step": 3063 + }, + { + "epoch": 1.1959406713505074, + "grad_norm": 0.6129786123409874, + "learning_rate": 7.520948453150716e-06, + "loss": 0.586, + "step": 3064 + }, + { + "epoch": 1.1963309914129585, + "grad_norm": 0.586286766219649, + "learning_rate": 7.518987041535279e-06, + "loss": 0.6143, + "step": 3065 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.5869490394013429, + "learning_rate": 7.517025110294745e-06, + "loss": 0.6153, + "step": 3066 + }, + { + "epoch": 1.1971116315378612, + "grad_norm": 0.5976829798884096, + "learning_rate": 7.515062659833825e-06, + "loss": 0.6122, + "step": 3067 + }, + { + "epoch": 1.1975019516003123, + "grad_norm": 0.6449447177657057, + "learning_rate": 7.513099690557341e-06, + "loss": 0.6032, + "step": 3068 + }, + { + "epoch": 1.1978922716627634, + "grad_norm": 0.5410192958276691, + "learning_rate": 7.511136202870222e-06, + "loss": 0.6347, + "step": 3069 + }, + { + "epoch": 1.1982825917252147, + "grad_norm": 0.6765787324905772, + "learning_rate": 7.509172197177499e-06, + "loss": 0.5731, + "step": 3070 + }, + { + "epoch": 1.198672911787666, + "grad_norm": 0.5657356576835665, + "learning_rate": 7.50720767388432e-06, + "loss": 0.5954, + "step": 3071 + }, + { + "epoch": 1.199063231850117, + "grad_norm": 0.5644771362018035, + "learning_rate": 7.505242633395928e-06, + "loss": 0.6558, + "step": 3072 + }, + { + "epoch": 1.1994535519125682, + "grad_norm": 0.6290749966804193, + "learning_rate": 7.503277076117681e-06, + "loss": 0.6157, + "step": 3073 + }, + { + "epoch": 1.1998438719750195, + "grad_norm": 0.5361440918595212, + "learning_rate": 7.501311002455038e-06, + "loss": 0.6047, + "step": 3074 + }, + { + "epoch": 1.2002341920374708, + "grad_norm": 0.5873045013064859, + "learning_rate": 7.499344412813569e-06, + "loss": 0.5603, + "step": 3075 + }, + { + "epoch": 1.200624512099922, + "grad_norm": 0.5896691457693221, + "learning_rate": 7.497377307598952e-06, + "loss": 0.5818, + "step": 3076 + }, + { + "epoch": 1.201014832162373, + "grad_norm": 0.6295033829493382, + "learning_rate": 7.495409687216963e-06, + "loss": 0.5744, + "step": 3077 + }, + { + "epoch": 1.2014051522248244, + "grad_norm": 0.5548317485984058, + "learning_rate": 7.4934415520734895e-06, + "loss": 0.6054, + "step": 3078 + }, + { + "epoch": 1.2017954722872757, + "grad_norm": 0.6741607390366846, + "learning_rate": 7.491472902574528e-06, + "loss": 0.6051, + "step": 3079 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.5765587283275838, + "learning_rate": 7.489503739126177e-06, + "loss": 0.5973, + "step": 3080 + }, + { + "epoch": 1.2025761124121779, + "grad_norm": 0.5137149162738732, + "learning_rate": 7.487534062134642e-06, + "loss": 0.5571, + "step": 3081 + }, + { + "epoch": 1.2029664324746292, + "grad_norm": 0.7111824771253847, + "learning_rate": 7.485563872006234e-06, + "loss": 0.6136, + "step": 3082 + }, + { + "epoch": 1.2033567525370805, + "grad_norm": 0.5587512907256966, + "learning_rate": 7.483593169147371e-06, + "loss": 0.5682, + "step": 3083 + }, + { + "epoch": 1.2037470725995316, + "grad_norm": 0.6157971613693167, + "learning_rate": 7.4816219539645765e-06, + "loss": 0.5892, + "step": 3084 + }, + { + "epoch": 1.2041373926619827, + "grad_norm": 0.5835787516372368, + "learning_rate": 7.479650226864479e-06, + "loss": 0.6192, + "step": 3085 + }, + { + "epoch": 1.204527712724434, + "grad_norm": 0.5836565117040342, + "learning_rate": 7.477677988253813e-06, + "loss": 0.6192, + "step": 3086 + }, + { + "epoch": 1.2049180327868854, + "grad_norm": 0.6145337193031034, + "learning_rate": 7.475705238539418e-06, + "loss": 0.5837, + "step": 3087 + }, + { + "epoch": 1.2053083528493365, + "grad_norm": 0.6095263177159117, + "learning_rate": 7.473731978128241e-06, + "loss": 0.5694, + "step": 3088 + }, + { + "epoch": 1.2056986729117876, + "grad_norm": 0.694110777675652, + "learning_rate": 7.4717582074273305e-06, + "loss": 0.6057, + "step": 3089 + }, + { + "epoch": 1.2060889929742389, + "grad_norm": 0.6090387606630989, + "learning_rate": 7.469783926843845e-06, + "loss": 0.644, + "step": 3090 + }, + { + "epoch": 1.2064793130366902, + "grad_norm": 0.5556005877244626, + "learning_rate": 7.467809136785044e-06, + "loss": 0.5587, + "step": 3091 + }, + { + "epoch": 1.2068696330991413, + "grad_norm": 0.5726354996102258, + "learning_rate": 7.465833837658294e-06, + "loss": 0.5858, + "step": 3092 + }, + { + "epoch": 1.2072599531615924, + "grad_norm": 0.554559299706258, + "learning_rate": 7.463858029871066e-06, + "loss": 0.6096, + "step": 3093 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.6191643353719616, + "learning_rate": 7.461881713830938e-06, + "loss": 0.5759, + "step": 3094 + }, + { + "epoch": 1.208040593286495, + "grad_norm": 0.594451791805185, + "learning_rate": 7.459904889945589e-06, + "loss": 0.631, + "step": 3095 + }, + { + "epoch": 1.2084309133489461, + "grad_norm": 0.6049163876433891, + "learning_rate": 7.457927558622808e-06, + "loss": 0.6332, + "step": 3096 + }, + { + "epoch": 1.2088212334113972, + "grad_norm": 0.5420021325947401, + "learning_rate": 7.455949720270481e-06, + "loss": 0.5894, + "step": 3097 + }, + { + "epoch": 1.2092115534738486, + "grad_norm": 0.5679050847719738, + "learning_rate": 7.453971375296607e-06, + "loss": 0.574, + "step": 3098 + }, + { + "epoch": 1.2096018735362999, + "grad_norm": 0.666336663527396, + "learning_rate": 7.451992524109284e-06, + "loss": 0.5737, + "step": 3099 + }, + { + "epoch": 1.209992193598751, + "grad_norm": 0.6090473635904504, + "learning_rate": 7.450013167116715e-06, + "loss": 0.5583, + "step": 3100 + }, + { + "epoch": 1.210382513661202, + "grad_norm": 0.586654825657528, + "learning_rate": 7.44803330472721e-06, + "loss": 0.5909, + "step": 3101 + }, + { + "epoch": 1.2107728337236534, + "grad_norm": 0.755688451408154, + "learning_rate": 7.446052937349183e-06, + "loss": 0.6297, + "step": 3102 + }, + { + "epoch": 1.2111631537861047, + "grad_norm": 0.5864937353860564, + "learning_rate": 7.444072065391148e-06, + "loss": 0.5686, + "step": 3103 + }, + { + "epoch": 1.2115534738485558, + "grad_norm": 0.6903642202636204, + "learning_rate": 7.442090689261726e-06, + "loss": 0.5981, + "step": 3104 + }, + { + "epoch": 1.211943793911007, + "grad_norm": 0.6705202247370745, + "learning_rate": 7.4401088093696446e-06, + "loss": 0.5852, + "step": 3105 + }, + { + "epoch": 1.2123341139734582, + "grad_norm": 0.7151691018315935, + "learning_rate": 7.438126426123731e-06, + "loss": 0.6082, + "step": 3106 + }, + { + "epoch": 1.2127244340359093, + "grad_norm": 0.6426824029039483, + "learning_rate": 7.436143539932917e-06, + "loss": 0.6064, + "step": 3107 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.6857672559296252, + "learning_rate": 7.43416015120624e-06, + "loss": 0.6322, + "step": 3108 + }, + { + "epoch": 1.2135050741608118, + "grad_norm": 0.6232256623929091, + "learning_rate": 7.432176260352839e-06, + "loss": 0.5875, + "step": 3109 + }, + { + "epoch": 1.213895394223263, + "grad_norm": 0.6784672067953829, + "learning_rate": 7.4301918677819595e-06, + "loss": 0.621, + "step": 3110 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.6560010837622948, + "learning_rate": 7.428206973902949e-06, + "loss": 0.5936, + "step": 3111 + }, + { + "epoch": 1.2146760343481655, + "grad_norm": 0.7015350074556413, + "learning_rate": 7.426221579125255e-06, + "loss": 0.6101, + "step": 3112 + }, + { + "epoch": 1.2150663544106166, + "grad_norm": 0.743064270684337, + "learning_rate": 7.424235683858434e-06, + "loss": 0.6345, + "step": 3113 + }, + { + "epoch": 1.215456674473068, + "grad_norm": 0.5898358843285368, + "learning_rate": 7.422249288512142e-06, + "loss": 0.6066, + "step": 3114 + }, + { + "epoch": 1.215846994535519, + "grad_norm": 0.619801635338872, + "learning_rate": 7.4202623934961396e-06, + "loss": 0.6297, + "step": 3115 + }, + { + "epoch": 1.2162373145979704, + "grad_norm": 0.5786158943978738, + "learning_rate": 7.4182749992202915e-06, + "loss": 0.6123, + "step": 3116 + }, + { + "epoch": 1.2166276346604215, + "grad_norm": 0.7003524209610302, + "learning_rate": 7.416287106094562e-06, + "loss": 0.6257, + "step": 3117 + }, + { + "epoch": 1.2170179547228728, + "grad_norm": 0.6112244748209402, + "learning_rate": 7.41429871452902e-06, + "loss": 0.6068, + "step": 3118 + }, + { + "epoch": 1.2174082747853239, + "grad_norm": 0.565626362985611, + "learning_rate": 7.412309824933841e-06, + "loss": 0.6099, + "step": 3119 + }, + { + "epoch": 1.2177985948477752, + "grad_norm": 0.6234864720924781, + "learning_rate": 7.4103204377192965e-06, + "loss": 0.5997, + "step": 3120 + }, + { + "epoch": 1.2181889149102263, + "grad_norm": 0.6793236520461968, + "learning_rate": 7.408330553295765e-06, + "loss": 0.6154, + "step": 3121 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.643162555779693, + "learning_rate": 7.406340172073726e-06, + "loss": 0.6324, + "step": 3122 + }, + { + "epoch": 1.2189695550351287, + "grad_norm": 0.6345601385717501, + "learning_rate": 7.404349294463763e-06, + "loss": 0.6165, + "step": 3123 + }, + { + "epoch": 1.21935987509758, + "grad_norm": 0.6234484308744711, + "learning_rate": 7.402357920876562e-06, + "loss": 0.6016, + "step": 3124 + }, + { + "epoch": 1.2197501951600311, + "grad_norm": 0.7012412970465419, + "learning_rate": 7.400366051722906e-06, + "loss": 0.5748, + "step": 3125 + }, + { + "epoch": 1.2201405152224825, + "grad_norm": 0.6884077263132277, + "learning_rate": 7.398373687413688e-06, + "loss": 0.6182, + "step": 3126 + }, + { + "epoch": 1.2205308352849336, + "grad_norm": 0.5328672857684159, + "learning_rate": 7.3963808283599e-06, + "loss": 0.5681, + "step": 3127 + }, + { + "epoch": 1.2209211553473849, + "grad_norm": 0.7729539799469929, + "learning_rate": 7.394387474972633e-06, + "loss": 0.5923, + "step": 3128 + }, + { + "epoch": 1.221311475409836, + "grad_norm": 0.526754198354073, + "learning_rate": 7.392393627663085e-06, + "loss": 0.6148, + "step": 3129 + }, + { + "epoch": 1.2217017954722873, + "grad_norm": 0.6390217977423318, + "learning_rate": 7.390399286842552e-06, + "loss": 0.5747, + "step": 3130 + }, + { + "epoch": 1.2220921155347384, + "grad_norm": 0.6747267503022145, + "learning_rate": 7.388404452922434e-06, + "loss": 0.6058, + "step": 3131 + }, + { + "epoch": 1.2224824355971897, + "grad_norm": 0.559377632490461, + "learning_rate": 7.386409126314231e-06, + "loss": 0.6149, + "step": 3132 + }, + { + "epoch": 1.2228727556596408, + "grad_norm": 0.6177853759387093, + "learning_rate": 7.384413307429549e-06, + "loss": 0.5985, + "step": 3133 + }, + { + "epoch": 1.2232630757220921, + "grad_norm": 0.6408194890552893, + "learning_rate": 7.382416996680089e-06, + "loss": 0.6219, + "step": 3134 + }, + { + "epoch": 1.2236533957845432, + "grad_norm": 0.5960316470309169, + "learning_rate": 7.380420194477655e-06, + "loss": 0.5933, + "step": 3135 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.5534483129504711, + "learning_rate": 7.378422901234159e-06, + "loss": 0.5889, + "step": 3136 + }, + { + "epoch": 1.2244340359094457, + "grad_norm": 0.632948812536631, + "learning_rate": 7.376425117361607e-06, + "loss": 0.5977, + "step": 3137 + }, + { + "epoch": 1.224824355971897, + "grad_norm": 0.6475133730188631, + "learning_rate": 7.374426843272108e-06, + "loss": 0.6463, + "step": 3138 + }, + { + "epoch": 1.225214676034348, + "grad_norm": 0.547485786100261, + "learning_rate": 7.372428079377872e-06, + "loss": 0.5811, + "step": 3139 + }, + { + "epoch": 1.2256049960967994, + "grad_norm": 0.6111549075147, + "learning_rate": 7.3704288260912135e-06, + "loss": 0.6069, + "step": 3140 + }, + { + "epoch": 1.2259953161592505, + "grad_norm": 0.5744100671294804, + "learning_rate": 7.368429083824542e-06, + "loss": 0.6134, + "step": 3141 + }, + { + "epoch": 1.2263856362217018, + "grad_norm": 0.4905925830745188, + "learning_rate": 7.366428852990374e-06, + "loss": 0.5842, + "step": 3142 + }, + { + "epoch": 1.226775956284153, + "grad_norm": 0.5835128651055045, + "learning_rate": 7.364428134001321e-06, + "loss": 0.6131, + "step": 3143 + }, + { + "epoch": 1.2271662763466042, + "grad_norm": 0.562073549212506, + "learning_rate": 7.362426927270101e-06, + "loss": 0.6047, + "step": 3144 + }, + { + "epoch": 1.2275565964090553, + "grad_norm": 0.5580213556772974, + "learning_rate": 7.360425233209526e-06, + "loss": 0.5785, + "step": 3145 + }, + { + "epoch": 1.2279469164715067, + "grad_norm": 0.547476808014735, + "learning_rate": 7.358423052232516e-06, + "loss": 0.604, + "step": 3146 + }, + { + "epoch": 1.2283372365339578, + "grad_norm": 0.5342733513702704, + "learning_rate": 7.3564203847520855e-06, + "loss": 0.5836, + "step": 3147 + }, + { + "epoch": 1.228727556596409, + "grad_norm": 0.5166396913471111, + "learning_rate": 7.354417231181351e-06, + "loss": 0.5894, + "step": 3148 + }, + { + "epoch": 1.2291178766588602, + "grad_norm": 0.5526574620150732, + "learning_rate": 7.352413591933532e-06, + "loss": 0.6019, + "step": 3149 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.5507144911877024, + "learning_rate": 7.3504094674219415e-06, + "loss": 0.5825, + "step": 3150 + }, + { + "epoch": 1.2298985167837626, + "grad_norm": 0.5446807366240057, + "learning_rate": 7.348404858060003e-06, + "loss": 0.6041, + "step": 3151 + }, + { + "epoch": 1.230288836846214, + "grad_norm": 0.5716896124328857, + "learning_rate": 7.346399764261231e-06, + "loss": 0.6104, + "step": 3152 + }, + { + "epoch": 1.230679156908665, + "grad_norm": 0.5440702145864615, + "learning_rate": 7.344394186439242e-06, + "loss": 0.5973, + "step": 3153 + }, + { + "epoch": 1.2310694769711163, + "grad_norm": 0.4740188399609929, + "learning_rate": 7.342388125007755e-06, + "loss": 0.6017, + "step": 3154 + }, + { + "epoch": 1.2314597970335674, + "grad_norm": 0.5857372592446882, + "learning_rate": 7.340381580380585e-06, + "loss": 0.5793, + "step": 3155 + }, + { + "epoch": 1.2318501170960188, + "grad_norm": 0.5896156625582434, + "learning_rate": 7.338374552971652e-06, + "loss": 0.6328, + "step": 3156 + }, + { + "epoch": 1.2322404371584699, + "grad_norm": 0.5669309030058791, + "learning_rate": 7.33636704319497e-06, + "loss": 0.5915, + "step": 3157 + }, + { + "epoch": 1.2326307572209212, + "grad_norm": 0.6725181053617076, + "learning_rate": 7.334359051464654e-06, + "loss": 0.6375, + "step": 3158 + }, + { + "epoch": 1.2330210772833723, + "grad_norm": 0.5526395801271748, + "learning_rate": 7.332350578194921e-06, + "loss": 0.6354, + "step": 3159 + }, + { + "epoch": 1.2334113973458236, + "grad_norm": 0.572790322504409, + "learning_rate": 7.330341623800087e-06, + "loss": 0.6259, + "step": 3160 + }, + { + "epoch": 1.2338017174082747, + "grad_norm": 0.6545842180655345, + "learning_rate": 7.328332188694562e-06, + "loss": 0.6113, + "step": 3161 + }, + { + "epoch": 1.234192037470726, + "grad_norm": 0.5370588548753531, + "learning_rate": 7.326322273292863e-06, + "loss": 0.6151, + "step": 3162 + }, + { + "epoch": 1.2345823575331771, + "grad_norm": 0.6783036969492593, + "learning_rate": 7.324311878009598e-06, + "loss": 0.636, + "step": 3163 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 0.5728558690023722, + "learning_rate": 7.322301003259483e-06, + "loss": 0.5876, + "step": 3164 + }, + { + "epoch": 1.2353629976580796, + "grad_norm": 0.5177020671880787, + "learning_rate": 7.320289649457324e-06, + "loss": 0.6361, + "step": 3165 + }, + { + "epoch": 1.2357533177205309, + "grad_norm": 0.5922545151943792, + "learning_rate": 7.31827781701803e-06, + "loss": 0.6245, + "step": 3166 + }, + { + "epoch": 1.236143637782982, + "grad_norm": 0.5292885179223873, + "learning_rate": 7.31626550635661e-06, + "loss": 0.6181, + "step": 3167 + }, + { + "epoch": 1.2365339578454333, + "grad_norm": 0.6562062422575474, + "learning_rate": 7.3142527178881715e-06, + "loss": 0.6072, + "step": 3168 + }, + { + "epoch": 1.2369242779078844, + "grad_norm": 0.6785572986973297, + "learning_rate": 7.312239452027917e-06, + "loss": 0.593, + "step": 3169 + }, + { + "epoch": 1.2373145979703357, + "grad_norm": 0.5317758480884819, + "learning_rate": 7.310225709191151e-06, + "loss": 0.6208, + "step": 3170 + }, + { + "epoch": 1.2377049180327868, + "grad_norm": 0.503577663801203, + "learning_rate": 7.308211489793273e-06, + "loss": 0.5925, + "step": 3171 + }, + { + "epoch": 1.2380952380952381, + "grad_norm": 0.7107850176107731, + "learning_rate": 7.3061967942497846e-06, + "loss": 0.6515, + "step": 3172 + }, + { + "epoch": 1.2384855581576892, + "grad_norm": 0.5887458987405917, + "learning_rate": 7.304181622976285e-06, + "loss": 0.5848, + "step": 3173 + }, + { + "epoch": 1.2388758782201406, + "grad_norm": 0.5104173232443878, + "learning_rate": 7.3021659763884675e-06, + "loss": 0.615, + "step": 3174 + }, + { + "epoch": 1.2392661982825917, + "grad_norm": 0.6334036478835495, + "learning_rate": 7.300149854902129e-06, + "loss": 0.5893, + "step": 3175 + }, + { + "epoch": 1.239656518345043, + "grad_norm": 0.5734398704174976, + "learning_rate": 7.298133258933161e-06, + "loss": 0.5784, + "step": 3176 + }, + { + "epoch": 1.240046838407494, + "grad_norm": 0.6471941498357626, + "learning_rate": 7.296116188897554e-06, + "loss": 0.5892, + "step": 3177 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.6645795842213965, + "learning_rate": 7.294098645211395e-06, + "loss": 0.5845, + "step": 3178 + }, + { + "epoch": 1.2408274785323965, + "grad_norm": 0.5272227242412507, + "learning_rate": 7.292080628290871e-06, + "loss": 0.6199, + "step": 3179 + }, + { + "epoch": 1.2412177985948478, + "grad_norm": 0.6878963622531699, + "learning_rate": 7.290062138552263e-06, + "loss": 0.6226, + "step": 3180 + }, + { + "epoch": 1.241608118657299, + "grad_norm": 0.5317482515182544, + "learning_rate": 7.288043176411956e-06, + "loss": 0.5809, + "step": 3181 + }, + { + "epoch": 1.2419984387197502, + "grad_norm": 0.5682816442601979, + "learning_rate": 7.286023742286424e-06, + "loss": 0.6003, + "step": 3182 + }, + { + "epoch": 1.2423887587822013, + "grad_norm": 0.7038586553804557, + "learning_rate": 7.284003836592245e-06, + "loss": 0.6142, + "step": 3183 + }, + { + "epoch": 1.2427790788446527, + "grad_norm": 0.5296444518433364, + "learning_rate": 7.281983459746091e-06, + "loss": 0.5864, + "step": 3184 + }, + { + "epoch": 1.2431693989071038, + "grad_norm": 0.5686595072813486, + "learning_rate": 7.279962612164734e-06, + "loss": 0.6087, + "step": 3185 + }, + { + "epoch": 1.243559718969555, + "grad_norm": 0.5882814172767726, + "learning_rate": 7.277941294265039e-06, + "loss": 0.5969, + "step": 3186 + }, + { + "epoch": 1.2439500390320062, + "grad_norm": 0.5657684782503578, + "learning_rate": 7.27591950646397e-06, + "loss": 0.596, + "step": 3187 + }, + { + "epoch": 1.2443403590944575, + "grad_norm": 0.559345766557325, + "learning_rate": 7.273897249178589e-06, + "loss": 0.6212, + "step": 3188 + }, + { + "epoch": 1.2447306791569086, + "grad_norm": 0.5372421465736801, + "learning_rate": 7.271874522826055e-06, + "loss": 0.5947, + "step": 3189 + }, + { + "epoch": 1.24512099921936, + "grad_norm": 0.5265489306581331, + "learning_rate": 7.269851327823621e-06, + "loss": 0.5905, + "step": 3190 + }, + { + "epoch": 1.245511319281811, + "grad_norm": 0.598734685927946, + "learning_rate": 7.267827664588639e-06, + "loss": 0.5942, + "step": 3191 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 0.5315173145934506, + "learning_rate": 7.265803533538558e-06, + "loss": 0.6223, + "step": 3192 + }, + { + "epoch": 1.2462919594067134, + "grad_norm": 0.5299689297378777, + "learning_rate": 7.2637789350909215e-06, + "loss": 0.6013, + "step": 3193 + }, + { + "epoch": 1.2466822794691648, + "grad_norm": 0.5489278695942409, + "learning_rate": 7.2617538696633716e-06, + "loss": 0.5698, + "step": 3194 + }, + { + "epoch": 1.2470725995316159, + "grad_norm": 0.5430406744876611, + "learning_rate": 7.259728337673645e-06, + "loss": 0.5903, + "step": 3195 + }, + { + "epoch": 1.2474629195940672, + "grad_norm": 0.5318304733490451, + "learning_rate": 7.257702339539574e-06, + "loss": 0.5981, + "step": 3196 + }, + { + "epoch": 1.2478532396565183, + "grad_norm": 0.5488896464167449, + "learning_rate": 7.2556758756790906e-06, + "loss": 0.6066, + "step": 3197 + }, + { + "epoch": 1.2482435597189696, + "grad_norm": 0.5996341580352704, + "learning_rate": 7.253648946510218e-06, + "loss": 0.6117, + "step": 3198 + }, + { + "epoch": 1.2486338797814207, + "grad_norm": 0.6158812990707521, + "learning_rate": 7.25162155245108e-06, + "loss": 0.5932, + "step": 3199 + }, + { + "epoch": 1.249024199843872, + "grad_norm": 0.6512603355220593, + "learning_rate": 7.249593693919894e-06, + "loss": 0.6179, + "step": 3200 + }, + { + "epoch": 1.2494145199063231, + "grad_norm": 0.7036802922847742, + "learning_rate": 7.247565371334973e-06, + "loss": 0.6034, + "step": 3201 + }, + { + "epoch": 1.2498048399687745, + "grad_norm": 0.6075881504375567, + "learning_rate": 7.245536585114727e-06, + "loss": 0.6255, + "step": 3202 + }, + { + "epoch": 1.2501951600312255, + "grad_norm": 0.6602461485648379, + "learning_rate": 7.243507335677658e-06, + "loss": 0.6529, + "step": 3203 + }, + { + "epoch": 1.2505854800936769, + "grad_norm": 0.6067694203226238, + "learning_rate": 7.2414776234423704e-06, + "loss": 0.6182, + "step": 3204 + }, + { + "epoch": 1.250975800156128, + "grad_norm": 0.6091040942059842, + "learning_rate": 7.239447448827557e-06, + "loss": 0.6369, + "step": 3205 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.5740506607464183, + "learning_rate": 7.237416812252009e-06, + "loss": 0.6003, + "step": 3206 + }, + { + "epoch": 1.2517564402810304, + "grad_norm": 0.5118007897487808, + "learning_rate": 7.235385714134615e-06, + "loss": 0.5965, + "step": 3207 + }, + { + "epoch": 1.2521467603434817, + "grad_norm": 0.5563925956016544, + "learning_rate": 7.233354154894356e-06, + "loss": 0.5773, + "step": 3208 + }, + { + "epoch": 1.2525370804059328, + "grad_norm": 0.5630325170668427, + "learning_rate": 7.231322134950308e-06, + "loss": 0.6065, + "step": 3209 + }, + { + "epoch": 1.2529274004683841, + "grad_norm": 0.6461503697411869, + "learning_rate": 7.229289654721644e-06, + "loss": 0.6, + "step": 3210 + }, + { + "epoch": 1.2533177205308352, + "grad_norm": 0.5247190712788884, + "learning_rate": 7.227256714627631e-06, + "loss": 0.6016, + "step": 3211 + }, + { + "epoch": 1.2537080405932866, + "grad_norm": 0.6666117580728707, + "learning_rate": 7.22522331508763e-06, + "loss": 0.6133, + "step": 3212 + }, + { + "epoch": 1.2540983606557377, + "grad_norm": 0.6517628128311472, + "learning_rate": 7.223189456521096e-06, + "loss": 0.6528, + "step": 3213 + }, + { + "epoch": 1.254488680718189, + "grad_norm": 0.5616557723733148, + "learning_rate": 7.221155139347582e-06, + "loss": 0.6277, + "step": 3214 + }, + { + "epoch": 1.25487900078064, + "grad_norm": 0.6459895276501818, + "learning_rate": 7.2191203639867336e-06, + "loss": 0.6013, + "step": 3215 + }, + { + "epoch": 1.2552693208430914, + "grad_norm": 0.6115218937096603, + "learning_rate": 7.2170851308582925e-06, + "loss": 0.5918, + "step": 3216 + }, + { + "epoch": 1.2556596409055425, + "grad_norm": 0.5736732705786357, + "learning_rate": 7.21504944038209e-06, + "loss": 0.6226, + "step": 3217 + }, + { + "epoch": 1.2560499609679938, + "grad_norm": 0.5584499150856689, + "learning_rate": 7.213013292978059e-06, + "loss": 0.5807, + "step": 3218 + }, + { + "epoch": 1.256440281030445, + "grad_norm": 0.6431376954677109, + "learning_rate": 7.210976689066219e-06, + "loss": 0.6109, + "step": 3219 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.6189144991600732, + "learning_rate": 7.2089396290666905e-06, + "loss": 0.6546, + "step": 3220 + }, + { + "epoch": 1.2572209211553473, + "grad_norm": 0.6045140647525119, + "learning_rate": 7.2069021133996834e-06, + "loss": 0.6122, + "step": 3221 + }, + { + "epoch": 1.2576112412177987, + "grad_norm": 0.5788250698538588, + "learning_rate": 7.204864142485504e-06, + "loss": 0.5892, + "step": 3222 + }, + { + "epoch": 1.2580015612802498, + "grad_norm": 0.5932699622085863, + "learning_rate": 7.20282571674455e-06, + "loss": 0.5854, + "step": 3223 + }, + { + "epoch": 1.258391881342701, + "grad_norm": 0.5542120401047209, + "learning_rate": 7.200786836597317e-06, + "loss": 0.5852, + "step": 3224 + }, + { + "epoch": 1.2587822014051522, + "grad_norm": 0.5756983522129937, + "learning_rate": 7.1987475024643925e-06, + "loss": 0.6011, + "step": 3225 + }, + { + "epoch": 1.2591725214676035, + "grad_norm": 0.7240730592300444, + "learning_rate": 7.1967077147664554e-06, + "loss": 0.6086, + "step": 3226 + }, + { + "epoch": 1.2595628415300546, + "grad_norm": 0.5333352762482199, + "learning_rate": 7.194667473924281e-06, + "loss": 0.5787, + "step": 3227 + }, + { + "epoch": 1.259953161592506, + "grad_norm": 0.5477132410665342, + "learning_rate": 7.192626780358736e-06, + "loss": 0.6166, + "step": 3228 + }, + { + "epoch": 1.260343481654957, + "grad_norm": 0.5261637673888634, + "learning_rate": 7.190585634490781e-06, + "loss": 0.6395, + "step": 3229 + }, + { + "epoch": 1.2607338017174083, + "grad_norm": 0.5907398546566748, + "learning_rate": 7.188544036741474e-06, + "loss": 0.6129, + "step": 3230 + }, + { + "epoch": 1.2611241217798594, + "grad_norm": 0.5552624408216204, + "learning_rate": 7.186501987531959e-06, + "loss": 0.6017, + "step": 3231 + }, + { + "epoch": 1.2615144418423108, + "grad_norm": 0.5270692831772371, + "learning_rate": 7.184459487283477e-06, + "loss": 0.5873, + "step": 3232 + }, + { + "epoch": 1.2619047619047619, + "grad_norm": 0.5669789657360289, + "learning_rate": 7.1824165364173635e-06, + "loss": 0.619, + "step": 3233 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.5798585851663524, + "learning_rate": 7.180373135355044e-06, + "loss": 0.6117, + "step": 3234 + }, + { + "epoch": 1.2626854020296643, + "grad_norm": 0.5266265177102177, + "learning_rate": 7.17832928451804e-06, + "loss": 0.606, + "step": 3235 + }, + { + "epoch": 1.2630757220921156, + "grad_norm": 0.5451530398904719, + "learning_rate": 7.176284984327962e-06, + "loss": 0.5901, + "step": 3236 + }, + { + "epoch": 1.2634660421545667, + "grad_norm": 0.5658758751337841, + "learning_rate": 7.174240235206514e-06, + "loss": 0.6262, + "step": 3237 + }, + { + "epoch": 1.263856362217018, + "grad_norm": 0.5098532447604955, + "learning_rate": 7.172195037575496e-06, + "loss": 0.5734, + "step": 3238 + }, + { + "epoch": 1.2642466822794691, + "grad_norm": 0.5440056142699148, + "learning_rate": 7.170149391856798e-06, + "loss": 0.616, + "step": 3239 + }, + { + "epoch": 1.2646370023419204, + "grad_norm": 0.5588436413564667, + "learning_rate": 7.168103298472402e-06, + "loss": 0.6391, + "step": 3240 + }, + { + "epoch": 1.2650273224043715, + "grad_norm": 0.5114669148057133, + "learning_rate": 7.166056757844383e-06, + "loss": 0.5851, + "step": 3241 + }, + { + "epoch": 1.2654176424668229, + "grad_norm": 0.5633267393877841, + "learning_rate": 7.164009770394909e-06, + "loss": 0.5927, + "step": 3242 + }, + { + "epoch": 1.265807962529274, + "grad_norm": 0.5105346020803668, + "learning_rate": 7.161962336546239e-06, + "loss": 0.6165, + "step": 3243 + }, + { + "epoch": 1.2661982825917253, + "grad_norm": 0.5310451354674719, + "learning_rate": 7.159914456720723e-06, + "loss": 0.5921, + "step": 3244 + }, + { + "epoch": 1.2665886026541764, + "grad_norm": 0.5796586797315718, + "learning_rate": 7.157866131340807e-06, + "loss": 0.6072, + "step": 3245 + }, + { + "epoch": 1.2669789227166277, + "grad_norm": 0.5066801843732596, + "learning_rate": 7.155817360829025e-06, + "loss": 0.5841, + "step": 3246 + }, + { + "epoch": 1.2673692427790788, + "grad_norm": 0.70709380181881, + "learning_rate": 7.153768145608005e-06, + "loss": 0.5892, + "step": 3247 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 0.7115728887972775, + "learning_rate": 7.151718486100467e-06, + "loss": 0.5844, + "step": 3248 + }, + { + "epoch": 1.2681498829039812, + "grad_norm": 0.5558343464161578, + "learning_rate": 7.149668382729218e-06, + "loss": 0.6193, + "step": 3249 + }, + { + "epoch": 1.2685402029664326, + "grad_norm": 0.5351021398822722, + "learning_rate": 7.1476178359171634e-06, + "loss": 0.5914, + "step": 3250 + }, + { + "epoch": 1.2689305230288837, + "grad_norm": 0.6743618516679267, + "learning_rate": 7.145566846087296e-06, + "loss": 0.6154, + "step": 3251 + }, + { + "epoch": 1.269320843091335, + "grad_norm": 0.5484312330237201, + "learning_rate": 7.143515413662702e-06, + "loss": 0.5785, + "step": 3252 + }, + { + "epoch": 1.269711163153786, + "grad_norm": 0.5956733994937977, + "learning_rate": 7.141463539066554e-06, + "loss": 0.5705, + "step": 3253 + }, + { + "epoch": 1.2701014832162374, + "grad_norm": 0.6257681057892369, + "learning_rate": 7.139411222722124e-06, + "loss": 0.6292, + "step": 3254 + }, + { + "epoch": 1.2704918032786885, + "grad_norm": 0.6516857970084964, + "learning_rate": 7.137358465052767e-06, + "loss": 0.6171, + "step": 3255 + }, + { + "epoch": 1.2708821233411398, + "grad_norm": 0.5545465371862349, + "learning_rate": 7.135305266481936e-06, + "loss": 0.6351, + "step": 3256 + }, + { + "epoch": 1.271272443403591, + "grad_norm": 0.5917876455735747, + "learning_rate": 7.133251627433171e-06, + "loss": 0.6071, + "step": 3257 + }, + { + "epoch": 1.2716627634660422, + "grad_norm": 0.49399261213800655, + "learning_rate": 7.131197548330102e-06, + "loss": 0.6037, + "step": 3258 + }, + { + "epoch": 1.2720530835284933, + "grad_norm": 0.5656261378756557, + "learning_rate": 7.129143029596451e-06, + "loss": 0.5968, + "step": 3259 + }, + { + "epoch": 1.2724434035909447, + "grad_norm": 0.5451332141100951, + "learning_rate": 7.127088071656034e-06, + "loss": 0.6138, + "step": 3260 + }, + { + "epoch": 1.2728337236533958, + "grad_norm": 0.5306692855065147, + "learning_rate": 7.125032674932753e-06, + "loss": 0.5807, + "step": 3261 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 0.5149797840803001, + "learning_rate": 7.1229768398506e-06, + "loss": 0.6309, + "step": 3262 + }, + { + "epoch": 1.2736143637782982, + "grad_norm": 0.5210619147494343, + "learning_rate": 7.120920566833663e-06, + "loss": 0.6022, + "step": 3263 + }, + { + "epoch": 1.2740046838407495, + "grad_norm": 0.5025952660950137, + "learning_rate": 7.118863856306115e-06, + "loss": 0.6553, + "step": 3264 + }, + { + "epoch": 1.2743950039032006, + "grad_norm": 0.6027348110997378, + "learning_rate": 7.116806708692221e-06, + "loss": 0.605, + "step": 3265 + }, + { + "epoch": 1.274785323965652, + "grad_norm": 0.5768239707513484, + "learning_rate": 7.114749124416339e-06, + "loss": 0.5916, + "step": 3266 + }, + { + "epoch": 1.275175644028103, + "grad_norm": 0.5985201070573596, + "learning_rate": 7.11269110390291e-06, + "loss": 0.6101, + "step": 3267 + }, + { + "epoch": 1.2755659640905543, + "grad_norm": 0.7582396215659358, + "learning_rate": 7.110632647576472e-06, + "loss": 0.6182, + "step": 3268 + }, + { + "epoch": 1.2759562841530054, + "grad_norm": 0.5444761100797573, + "learning_rate": 7.10857375586165e-06, + "loss": 0.6039, + "step": 3269 + }, + { + "epoch": 1.2763466042154565, + "grad_norm": 0.5265399262272812, + "learning_rate": 7.10651442918316e-06, + "loss": 0.6108, + "step": 3270 + }, + { + "epoch": 1.2767369242779079, + "grad_norm": 0.6541369743107875, + "learning_rate": 7.104454667965804e-06, + "loss": 0.5952, + "step": 3271 + }, + { + "epoch": 1.2771272443403592, + "grad_norm": 0.6677246274435857, + "learning_rate": 7.10239447263448e-06, + "loss": 0.6026, + "step": 3272 + }, + { + "epoch": 1.2775175644028103, + "grad_norm": 0.5801155746018489, + "learning_rate": 7.100333843614169e-06, + "loss": 0.5839, + "step": 3273 + }, + { + "epoch": 1.2779078844652614, + "grad_norm": 0.5264795734324117, + "learning_rate": 7.098272781329947e-06, + "loss": 0.6061, + "step": 3274 + }, + { + "epoch": 1.2782982045277127, + "grad_norm": 0.5429473584113353, + "learning_rate": 7.096211286206973e-06, + "loss": 0.5875, + "step": 3275 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.6686251212821928, + "learning_rate": 7.094149358670504e-06, + "loss": 0.5797, + "step": 3276 + }, + { + "epoch": 1.2790788446526151, + "grad_norm": 0.5617952268853514, + "learning_rate": 7.092086999145877e-06, + "loss": 0.593, + "step": 3277 + }, + { + "epoch": 1.2794691647150662, + "grad_norm": 0.5945486829030824, + "learning_rate": 7.090024208058527e-06, + "loss": 0.5992, + "step": 3278 + }, + { + "epoch": 1.2798594847775175, + "grad_norm": 0.555953952798084, + "learning_rate": 7.087960985833971e-06, + "loss": 0.5945, + "step": 3279 + }, + { + "epoch": 1.2802498048399689, + "grad_norm": 0.5160345488442986, + "learning_rate": 7.085897332897815e-06, + "loss": 0.5966, + "step": 3280 + }, + { + "epoch": 1.28064012490242, + "grad_norm": 0.5526831424916577, + "learning_rate": 7.08383324967576e-06, + "loss": 0.5737, + "step": 3281 + }, + { + "epoch": 1.281030444964871, + "grad_norm": 0.6983425806785124, + "learning_rate": 7.08176873659359e-06, + "loss": 0.643, + "step": 3282 + }, + { + "epoch": 1.2814207650273224, + "grad_norm": 0.63572102798643, + "learning_rate": 7.079703794077182e-06, + "loss": 0.6346, + "step": 3283 + }, + { + "epoch": 1.2818110850897737, + "grad_norm": 0.5612231211392584, + "learning_rate": 7.077638422552496e-06, + "loss": 0.6297, + "step": 3284 + }, + { + "epoch": 1.2822014051522248, + "grad_norm": 0.7560952286329391, + "learning_rate": 7.0755726224455856e-06, + "loss": 0.6308, + "step": 3285 + }, + { + "epoch": 1.282591725214676, + "grad_norm": 0.5888817053632985, + "learning_rate": 7.07350639418259e-06, + "loss": 0.5835, + "step": 3286 + }, + { + "epoch": 1.2829820452771272, + "grad_norm": 0.5171767562902424, + "learning_rate": 7.07143973818974e-06, + "loss": 0.5883, + "step": 3287 + }, + { + "epoch": 1.2833723653395785, + "grad_norm": 0.6469050095247976, + "learning_rate": 7.069372654893349e-06, + "loss": 0.598, + "step": 3288 + }, + { + "epoch": 1.2837626854020296, + "grad_norm": 0.6354238044333713, + "learning_rate": 7.067305144719825e-06, + "loss": 0.573, + "step": 3289 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.5886886483971381, + "learning_rate": 7.065237208095659e-06, + "loss": 0.6199, + "step": 3290 + }, + { + "epoch": 1.284543325526932, + "grad_norm": 0.733463474356648, + "learning_rate": 7.063168845447432e-06, + "loss": 0.6436, + "step": 3291 + }, + { + "epoch": 1.2849336455893834, + "grad_norm": 0.5912178988667521, + "learning_rate": 7.061100057201814e-06, + "loss": 0.605, + "step": 3292 + }, + { + "epoch": 1.2853239656518345, + "grad_norm": 0.625696890556578, + "learning_rate": 7.05903084378556e-06, + "loss": 0.6117, + "step": 3293 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.7839168512329576, + "learning_rate": 7.056961205625516e-06, + "loss": 0.6343, + "step": 3294 + }, + { + "epoch": 1.286104605776737, + "grad_norm": 0.6445345161810021, + "learning_rate": 7.0548911431486125e-06, + "loss": 0.6401, + "step": 3295 + }, + { + "epoch": 1.2864949258391882, + "grad_norm": 0.7758676085686699, + "learning_rate": 7.05282065678187e-06, + "loss": 0.6443, + "step": 3296 + }, + { + "epoch": 1.2868852459016393, + "grad_norm": 0.6638424055986537, + "learning_rate": 7.0507497469523945e-06, + "loss": 0.6355, + "step": 3297 + }, + { + "epoch": 1.2872755659640904, + "grad_norm": 0.6112436725634248, + "learning_rate": 7.0486784140873806e-06, + "loss": 0.5899, + "step": 3298 + }, + { + "epoch": 1.2876658860265418, + "grad_norm": 0.7899975564687213, + "learning_rate": 7.0466066586141106e-06, + "loss": 0.6267, + "step": 3299 + }, + { + "epoch": 1.288056206088993, + "grad_norm": 0.5756984722092892, + "learning_rate": 7.044534480959951e-06, + "loss": 0.6032, + "step": 3300 + }, + { + "epoch": 1.2884465261514442, + "grad_norm": 0.5658694423687713, + "learning_rate": 7.0424618815523595e-06, + "loss": 0.6337, + "step": 3301 + }, + { + "epoch": 1.2888368462138953, + "grad_norm": 0.7706742999618887, + "learning_rate": 7.040388860818878e-06, + "loss": 0.6081, + "step": 3302 + }, + { + "epoch": 1.2892271662763466, + "grad_norm": 0.5641950604305896, + "learning_rate": 7.038315419187136e-06, + "loss": 0.5908, + "step": 3303 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.5532149751698658, + "learning_rate": 7.036241557084851e-06, + "loss": 0.6556, + "step": 3304 + }, + { + "epoch": 1.290007806401249, + "grad_norm": 0.7071186435709149, + "learning_rate": 7.0341672749398246e-06, + "loss": 0.6325, + "step": 3305 + }, + { + "epoch": 1.2903981264637001, + "grad_norm": 0.6561168504901209, + "learning_rate": 7.03209257317995e-06, + "loss": 0.6004, + "step": 3306 + }, + { + "epoch": 1.2907884465261514, + "grad_norm": 0.5654261892605972, + "learning_rate": 7.030017452233199e-06, + "loss": 0.579, + "step": 3307 + }, + { + "epoch": 1.2911787665886028, + "grad_norm": 0.6403369840194548, + "learning_rate": 7.027941912527637e-06, + "loss": 0.6051, + "step": 3308 + }, + { + "epoch": 1.2915690866510539, + "grad_norm": 0.6609764062653287, + "learning_rate": 7.025865954491415e-06, + "loss": 0.6253, + "step": 3309 + }, + { + "epoch": 1.291959406713505, + "grad_norm": 0.5217328855753544, + "learning_rate": 7.023789578552766e-06, + "loss": 0.5873, + "step": 3310 + }, + { + "epoch": 1.2923497267759563, + "grad_norm": 0.5174512777336484, + "learning_rate": 7.021712785140011e-06, + "loss": 0.6192, + "step": 3311 + }, + { + "epoch": 1.2927400468384076, + "grad_norm": 0.6626333292927432, + "learning_rate": 7.01963557468156e-06, + "loss": 0.5838, + "step": 3312 + }, + { + "epoch": 1.2931303669008587, + "grad_norm": 0.5197068004710197, + "learning_rate": 7.0175579476059085e-06, + "loss": 0.5788, + "step": 3313 + }, + { + "epoch": 1.2935206869633098, + "grad_norm": 0.6640585880878018, + "learning_rate": 7.015479904341633e-06, + "loss": 0.6324, + "step": 3314 + }, + { + "epoch": 1.2939110070257611, + "grad_norm": 0.6735454683766768, + "learning_rate": 7.013401445317401e-06, + "loss": 0.6123, + "step": 3315 + }, + { + "epoch": 1.2943013270882124, + "grad_norm": 0.5704798084958981, + "learning_rate": 7.011322570961962e-06, + "loss": 0.6408, + "step": 3316 + }, + { + "epoch": 1.2946916471506635, + "grad_norm": 0.6067222008963484, + "learning_rate": 7.009243281704156e-06, + "loss": 0.5972, + "step": 3317 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.5813439667416307, + "learning_rate": 7.0071635779729045e-06, + "loss": 0.5923, + "step": 3318 + }, + { + "epoch": 1.295472287275566, + "grad_norm": 0.5370185649781396, + "learning_rate": 7.005083460197215e-06, + "loss": 0.6222, + "step": 3319 + }, + { + "epoch": 1.2958626073380173, + "grad_norm": 0.6232819279316675, + "learning_rate": 7.0030029288061815e-06, + "loss": 0.6386, + "step": 3320 + }, + { + "epoch": 1.2962529274004684, + "grad_norm": 0.5608306992539573, + "learning_rate": 7.000921984228985e-06, + "loss": 0.5943, + "step": 3321 + }, + { + "epoch": 1.2966432474629195, + "grad_norm": 0.4955304406483203, + "learning_rate": 6.9988406268948865e-06, + "loss": 0.581, + "step": 3322 + }, + { + "epoch": 1.2970335675253708, + "grad_norm": 0.668279534925229, + "learning_rate": 6.996758857233238e-06, + "loss": 0.6437, + "step": 3323 + }, + { + "epoch": 1.2974238875878221, + "grad_norm": 0.5512118788653239, + "learning_rate": 6.994676675673473e-06, + "loss": 0.6249, + "step": 3324 + }, + { + "epoch": 1.2978142076502732, + "grad_norm": 0.49912672818553383, + "learning_rate": 6.99259408264511e-06, + "loss": 0.561, + "step": 3325 + }, + { + "epoch": 1.2982045277127243, + "grad_norm": 0.694139679899493, + "learning_rate": 6.990511078577754e-06, + "loss": 0.592, + "step": 3326 + }, + { + "epoch": 1.2985948477751756, + "grad_norm": 0.6675808154429688, + "learning_rate": 6.988427663901095e-06, + "loss": 0.5689, + "step": 3327 + }, + { + "epoch": 1.298985167837627, + "grad_norm": 0.5365977372471384, + "learning_rate": 6.986343839044905e-06, + "loss": 0.6189, + "step": 3328 + }, + { + "epoch": 1.299375487900078, + "grad_norm": 0.5566538753729461, + "learning_rate": 6.984259604439046e-06, + "loss": 0.5818, + "step": 3329 + }, + { + "epoch": 1.2997658079625292, + "grad_norm": 0.6563824991494852, + "learning_rate": 6.982174960513456e-06, + "loss": 0.6301, + "step": 3330 + }, + { + "epoch": 1.3001561280249805, + "grad_norm": 0.6334798524586399, + "learning_rate": 6.9800899076981654e-06, + "loss": 0.5956, + "step": 3331 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.6038403956432834, + "learning_rate": 6.9780044464232846e-06, + "loss": 0.5771, + "step": 3332 + }, + { + "epoch": 1.300936768149883, + "grad_norm": 0.5413839864919179, + "learning_rate": 6.975918577119009e-06, + "loss": 0.5954, + "step": 3333 + }, + { + "epoch": 1.301327088212334, + "grad_norm": 0.6908198317607895, + "learning_rate": 6.973832300215621e-06, + "loss": 0.5742, + "step": 3334 + }, + { + "epoch": 1.3017174082747853, + "grad_norm": 0.730753456091065, + "learning_rate": 6.971745616143482e-06, + "loss": 0.5687, + "step": 3335 + }, + { + "epoch": 1.3021077283372366, + "grad_norm": 0.5514827260636749, + "learning_rate": 6.969658525333044e-06, + "loss": 0.6022, + "step": 3336 + }, + { + "epoch": 1.3024980483996877, + "grad_norm": 0.9280321253157198, + "learning_rate": 6.967571028214836e-06, + "loss": 0.5967, + "step": 3337 + }, + { + "epoch": 1.3028883684621388, + "grad_norm": 0.6819378422554091, + "learning_rate": 6.965483125219474e-06, + "loss": 0.5949, + "step": 3338 + }, + { + "epoch": 1.3032786885245902, + "grad_norm": 0.8607532096497443, + "learning_rate": 6.963394816777659e-06, + "loss": 0.6422, + "step": 3339 + }, + { + "epoch": 1.3036690085870415, + "grad_norm": 0.8960474177406167, + "learning_rate": 6.961306103320174e-06, + "loss": 0.6102, + "step": 3340 + }, + { + "epoch": 1.3040593286494926, + "grad_norm": 0.5436962212504732, + "learning_rate": 6.959216985277885e-06, + "loss": 0.5853, + "step": 3341 + }, + { + "epoch": 1.3044496487119437, + "grad_norm": 0.8213718573960739, + "learning_rate": 6.957127463081742e-06, + "loss": 0.605, + "step": 3342 + }, + { + "epoch": 1.304839968774395, + "grad_norm": 0.736566883880225, + "learning_rate": 6.95503753716278e-06, + "loss": 0.5861, + "step": 3343 + }, + { + "epoch": 1.3052302888368463, + "grad_norm": 0.6172407794026662, + "learning_rate": 6.9529472079521165e-06, + "loss": 0.597, + "step": 3344 + }, + { + "epoch": 1.3056206088992974, + "grad_norm": 0.7040821219340203, + "learning_rate": 6.9508564758809486e-06, + "loss": 0.5985, + "step": 3345 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.7043403094532745, + "learning_rate": 6.948765341380563e-06, + "loss": 0.6205, + "step": 3346 + }, + { + "epoch": 1.3064012490241999, + "grad_norm": 0.6565084861282839, + "learning_rate": 6.946673804882322e-06, + "loss": 0.59, + "step": 3347 + }, + { + "epoch": 1.3067915690866512, + "grad_norm": 0.8360844695364045, + "learning_rate": 6.944581866817679e-06, + "loss": 0.6186, + "step": 3348 + }, + { + "epoch": 1.3071818891491023, + "grad_norm": 0.5663204718094356, + "learning_rate": 6.9424895276181645e-06, + "loss": 0.5735, + "step": 3349 + }, + { + "epoch": 1.3075722092115534, + "grad_norm": 0.7873087245304773, + "learning_rate": 6.940396787715392e-06, + "loss": 0.63, + "step": 3350 + }, + { + "epoch": 1.3079625292740047, + "grad_norm": 0.6327281458806903, + "learning_rate": 6.9383036475410596e-06, + "loss": 0.6057, + "step": 3351 + }, + { + "epoch": 1.308352849336456, + "grad_norm": 0.5353845622599505, + "learning_rate": 6.936210107526946e-06, + "loss": 0.5957, + "step": 3352 + }, + { + "epoch": 1.3087431693989071, + "grad_norm": 0.7806076661006417, + "learning_rate": 6.934116168104918e-06, + "loss": 0.5661, + "step": 3353 + }, + { + "epoch": 1.3091334894613582, + "grad_norm": 0.6545844023818945, + "learning_rate": 6.932021829706916e-06, + "loss": 0.5795, + "step": 3354 + }, + { + "epoch": 1.3095238095238095, + "grad_norm": 0.6390379853772584, + "learning_rate": 6.929927092764969e-06, + "loss": 0.5443, + "step": 3355 + }, + { + "epoch": 1.3099141295862609, + "grad_norm": 0.8735674548982686, + "learning_rate": 6.9278319577111856e-06, + "loss": 0.5733, + "step": 3356 + }, + { + "epoch": 1.310304449648712, + "grad_norm": 0.6569910376925553, + "learning_rate": 6.925736424977757e-06, + "loss": 0.5913, + "step": 3357 + }, + { + "epoch": 1.310694769711163, + "grad_norm": 0.7758002249991222, + "learning_rate": 6.923640494996959e-06, + "loss": 0.6243, + "step": 3358 + }, + { + "epoch": 1.3110850897736144, + "grad_norm": 0.7610053473648314, + "learning_rate": 6.921544168201144e-06, + "loss": 0.5995, + "step": 3359 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.6737639291425356, + "learning_rate": 6.919447445022752e-06, + "loss": 0.5858, + "step": 3360 + }, + { + "epoch": 1.3118657298985168, + "grad_norm": 0.7761112757333397, + "learning_rate": 6.917350325894301e-06, + "loss": 0.5761, + "step": 3361 + }, + { + "epoch": 1.312256049960968, + "grad_norm": 0.675542626313467, + "learning_rate": 6.915252811248392e-06, + "loss": 0.5736, + "step": 3362 + }, + { + "epoch": 1.3126463700234192, + "grad_norm": 0.6321937676959329, + "learning_rate": 6.9131549015177065e-06, + "loss": 0.5713, + "step": 3363 + }, + { + "epoch": 1.3130366900858705, + "grad_norm": 0.6449502778091927, + "learning_rate": 6.911056597135008e-06, + "loss": 0.5657, + "step": 3364 + }, + { + "epoch": 1.3134270101483216, + "grad_norm": 0.6049346995504612, + "learning_rate": 6.908957898533144e-06, + "loss": 0.6194, + "step": 3365 + }, + { + "epoch": 1.3138173302107727, + "grad_norm": 0.6241214248033432, + "learning_rate": 6.906858806145039e-06, + "loss": 0.5725, + "step": 3366 + }, + { + "epoch": 1.314207650273224, + "grad_norm": 0.6657075841299187, + "learning_rate": 6.904759320403704e-06, + "loss": 0.5963, + "step": 3367 + }, + { + "epoch": 1.3145979703356754, + "grad_norm": 0.5823506260544357, + "learning_rate": 6.9026594417422225e-06, + "loss": 0.6141, + "step": 3368 + }, + { + "epoch": 1.3149882903981265, + "grad_norm": 0.6578946095684668, + "learning_rate": 6.900559170593768e-06, + "loss": 0.5887, + "step": 3369 + }, + { + "epoch": 1.3153786104605776, + "grad_norm": 0.563612809779574, + "learning_rate": 6.898458507391591e-06, + "loss": 0.6143, + "step": 3370 + }, + { + "epoch": 1.315768930523029, + "grad_norm": 0.6409753361331266, + "learning_rate": 6.896357452569025e-06, + "loss": 0.61, + "step": 3371 + }, + { + "epoch": 1.3161592505854802, + "grad_norm": 0.6910942101166533, + "learning_rate": 6.89425600655948e-06, + "loss": 0.5992, + "step": 3372 + }, + { + "epoch": 1.3165495706479313, + "grad_norm": 0.6096224832523286, + "learning_rate": 6.89215416979645e-06, + "loss": 0.608, + "step": 3373 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.6544018317911785, + "learning_rate": 6.890051942713509e-06, + "loss": 0.622, + "step": 3374 + }, + { + "epoch": 1.3173302107728337, + "grad_norm": 0.7709705064364786, + "learning_rate": 6.887949325744314e-06, + "loss": 0.6239, + "step": 3375 + }, + { + "epoch": 1.317720530835285, + "grad_norm": 0.5724903968522331, + "learning_rate": 6.885846319322597e-06, + "loss": 0.6009, + "step": 3376 + }, + { + "epoch": 1.3181108508977362, + "grad_norm": 0.688637035270791, + "learning_rate": 6.883742923882172e-06, + "loss": 0.6043, + "step": 3377 + }, + { + "epoch": 1.3185011709601873, + "grad_norm": 0.6674363144350102, + "learning_rate": 6.881639139856938e-06, + "loss": 0.6186, + "step": 3378 + }, + { + "epoch": 1.3188914910226386, + "grad_norm": 0.5889948642217617, + "learning_rate": 6.879534967680869e-06, + "loss": 0.6166, + "step": 3379 + }, + { + "epoch": 1.31928181108509, + "grad_norm": 0.6946280692381754, + "learning_rate": 6.8774304077880205e-06, + "loss": 0.6318, + "step": 3380 + }, + { + "epoch": 1.319672131147541, + "grad_norm": 0.6188287039298077, + "learning_rate": 6.875325460612527e-06, + "loss": 0.5748, + "step": 3381 + }, + { + "epoch": 1.320062451209992, + "grad_norm": 0.5286939820836324, + "learning_rate": 6.873220126588607e-06, + "loss": 0.5688, + "step": 3382 + }, + { + "epoch": 1.3204527712724434, + "grad_norm": 0.6840314746804762, + "learning_rate": 6.8711144061505534e-06, + "loss": 0.6384, + "step": 3383 + }, + { + "epoch": 1.3208430913348947, + "grad_norm": 0.5614474980681207, + "learning_rate": 6.869008299732743e-06, + "loss": 0.6317, + "step": 3384 + }, + { + "epoch": 1.3212334113973458, + "grad_norm": 0.6240479958146707, + "learning_rate": 6.866901807769629e-06, + "loss": 0.6096, + "step": 3385 + }, + { + "epoch": 1.321623731459797, + "grad_norm": 0.689079536021484, + "learning_rate": 6.864794930695746e-06, + "loss": 0.5907, + "step": 3386 + }, + { + "epoch": 1.3220140515222483, + "grad_norm": 0.594911737037677, + "learning_rate": 6.862687668945709e-06, + "loss": 0.6186, + "step": 3387 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.6645039187474948, + "learning_rate": 6.860580022954209e-06, + "loss": 0.6013, + "step": 3388 + }, + { + "epoch": 1.3227946916471507, + "grad_norm": 0.7198893802529207, + "learning_rate": 6.858471993156019e-06, + "loss": 0.6164, + "step": 3389 + }, + { + "epoch": 1.3231850117096018, + "grad_norm": 0.5071931930798311, + "learning_rate": 6.8563635799859915e-06, + "loss": 0.643, + "step": 3390 + }, + { + "epoch": 1.323575331772053, + "grad_norm": 0.6793537601317489, + "learning_rate": 6.8542547838790565e-06, + "loss": 0.5981, + "step": 3391 + }, + { + "epoch": 1.3239656518345044, + "grad_norm": 0.624651782050871, + "learning_rate": 6.852145605270224e-06, + "loss": 0.5999, + "step": 3392 + }, + { + "epoch": 1.3243559718969555, + "grad_norm": 0.5295997339782363, + "learning_rate": 6.85003604459458e-06, + "loss": 0.6135, + "step": 3393 + }, + { + "epoch": 1.3247462919594066, + "grad_norm": 0.6353125842376246, + "learning_rate": 6.847926102287294e-06, + "loss": 0.5987, + "step": 3394 + }, + { + "epoch": 1.325136612021858, + "grad_norm": 0.5877573690168816, + "learning_rate": 6.845815778783612e-06, + "loss": 0.6263, + "step": 3395 + }, + { + "epoch": 1.325526932084309, + "grad_norm": 0.5202269602768801, + "learning_rate": 6.843705074518858e-06, + "loss": 0.6373, + "step": 3396 + }, + { + "epoch": 1.3259172521467604, + "grad_norm": 0.5573992309434146, + "learning_rate": 6.8415939899284345e-06, + "loss": 0.6013, + "step": 3397 + }, + { + "epoch": 1.3263075722092115, + "grad_norm": 0.587489055180228, + "learning_rate": 6.839482525447826e-06, + "loss": 0.6224, + "step": 3398 + }, + { + "epoch": 1.3266978922716628, + "grad_norm": 0.5438045178699663, + "learning_rate": 6.8373706815125886e-06, + "loss": 0.5975, + "step": 3399 + }, + { + "epoch": 1.327088212334114, + "grad_norm": 0.5758815960794353, + "learning_rate": 6.835258458558361e-06, + "loss": 0.6109, + "step": 3400 + }, + { + "epoch": 1.3274785323965652, + "grad_norm": 0.6453549053941108, + "learning_rate": 6.833145857020863e-06, + "loss": 0.6069, + "step": 3401 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.5721155321571141, + "learning_rate": 6.831032877335887e-06, + "loss": 0.6192, + "step": 3402 + }, + { + "epoch": 1.3282591725214676, + "grad_norm": 0.5982189942721444, + "learning_rate": 6.8289195199393055e-06, + "loss": 0.6031, + "step": 3403 + }, + { + "epoch": 1.3286494925839187, + "grad_norm": 0.6307796109894478, + "learning_rate": 6.8268057852670675e-06, + "loss": 0.5809, + "step": 3404 + }, + { + "epoch": 1.32903981264637, + "grad_norm": 0.5887553714584898, + "learning_rate": 6.824691673755203e-06, + "loss": 0.5771, + "step": 3405 + }, + { + "epoch": 1.3294301327088212, + "grad_norm": 0.6379556726142998, + "learning_rate": 6.822577185839821e-06, + "loss": 0.6309, + "step": 3406 + }, + { + "epoch": 1.3298204527712725, + "grad_norm": 0.6300017695983374, + "learning_rate": 6.8204623219571e-06, + "loss": 0.5968, + "step": 3407 + }, + { + "epoch": 1.3302107728337236, + "grad_norm": 0.6913880090004133, + "learning_rate": 6.818347082543302e-06, + "loss": 0.622, + "step": 3408 + }, + { + "epoch": 1.330601092896175, + "grad_norm": 0.7282224403760472, + "learning_rate": 6.8162314680347684e-06, + "loss": 0.621, + "step": 3409 + }, + { + "epoch": 1.330991412958626, + "grad_norm": 0.5497840398597543, + "learning_rate": 6.814115478867913e-06, + "loss": 0.5841, + "step": 3410 + }, + { + "epoch": 1.3313817330210773, + "grad_norm": 0.6592184663805131, + "learning_rate": 6.811999115479232e-06, + "loss": 0.6132, + "step": 3411 + }, + { + "epoch": 1.3317720530835284, + "grad_norm": 0.7025832449922894, + "learning_rate": 6.809882378305291e-06, + "loss": 0.6497, + "step": 3412 + }, + { + "epoch": 1.3321623731459797, + "grad_norm": 0.53883951239187, + "learning_rate": 6.807765267782742e-06, + "loss": 0.5794, + "step": 3413 + }, + { + "epoch": 1.3325526932084308, + "grad_norm": 0.6808097933489857, + "learning_rate": 6.805647784348308e-06, + "loss": 0.6183, + "step": 3414 + }, + { + "epoch": 1.3329430132708822, + "grad_norm": 0.5476458344010109, + "learning_rate": 6.803529928438791e-06, + "loss": 0.5639, + "step": 3415 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.5820352368081337, + "learning_rate": 6.8014117004910685e-06, + "loss": 0.5828, + "step": 3416 + }, + { + "epoch": 1.3337236533957846, + "grad_norm": 0.5889978440424701, + "learning_rate": 6.799293100942097e-06, + "loss": 0.5956, + "step": 3417 + }, + { + "epoch": 1.3341139734582357, + "grad_norm": 0.5816021550306159, + "learning_rate": 6.797174130228906e-06, + "loss": 0.636, + "step": 3418 + }, + { + "epoch": 1.334504293520687, + "grad_norm": 0.5288106338819487, + "learning_rate": 6.795054788788607e-06, + "loss": 0.5831, + "step": 3419 + }, + { + "epoch": 1.334894613583138, + "grad_norm": 0.5652298847680007, + "learning_rate": 6.792935077058385e-06, + "loss": 0.6007, + "step": 3420 + }, + { + "epoch": 1.3352849336455894, + "grad_norm": 0.5789077703012749, + "learning_rate": 6.790814995475497e-06, + "loss": 0.57, + "step": 3421 + }, + { + "epoch": 1.3356752537080405, + "grad_norm": 0.5435950732947099, + "learning_rate": 6.788694544477283e-06, + "loss": 0.614, + "step": 3422 + }, + { + "epoch": 1.3360655737704918, + "grad_norm": 0.5854267544151999, + "learning_rate": 6.786573724501158e-06, + "loss": 0.5958, + "step": 3423 + }, + { + "epoch": 1.336455893832943, + "grad_norm": 0.554695410441138, + "learning_rate": 6.784452535984613e-06, + "loss": 0.5815, + "step": 3424 + }, + { + "epoch": 1.3368462138953943, + "grad_norm": 0.5887675516159903, + "learning_rate": 6.782330979365209e-06, + "loss": 0.6344, + "step": 3425 + }, + { + "epoch": 1.3372365339578454, + "grad_norm": 0.5324435373464711, + "learning_rate": 6.780209055080592e-06, + "loss": 0.5906, + "step": 3426 + }, + { + "epoch": 1.3376268540202967, + "grad_norm": 0.5243248086332812, + "learning_rate": 6.778086763568477e-06, + "loss": 0.6022, + "step": 3427 + }, + { + "epoch": 1.3380171740827478, + "grad_norm": 0.514767847199567, + "learning_rate": 6.77596410526666e-06, + "loss": 0.593, + "step": 3428 + }, + { + "epoch": 1.338407494145199, + "grad_norm": 0.5774929580500836, + "learning_rate": 6.773841080613007e-06, + "loss": 0.6023, + "step": 3429 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.5372022085267417, + "learning_rate": 6.771717690045465e-06, + "loss": 0.6001, + "step": 3430 + }, + { + "epoch": 1.3391881342701015, + "grad_norm": 0.6873995807076734, + "learning_rate": 6.769593934002053e-06, + "loss": 0.5891, + "step": 3431 + }, + { + "epoch": 1.3395784543325526, + "grad_norm": 0.5707246484004358, + "learning_rate": 6.767469812920867e-06, + "loss": 0.5862, + "step": 3432 + }, + { + "epoch": 1.339968774395004, + "grad_norm": 0.6090893622431055, + "learning_rate": 6.76534532724008e-06, + "loss": 0.6399, + "step": 3433 + }, + { + "epoch": 1.340359094457455, + "grad_norm": 0.595553972266652, + "learning_rate": 6.763220477397934e-06, + "loss": 0.6355, + "step": 3434 + }, + { + "epoch": 1.3407494145199064, + "grad_norm": 0.6349613471267259, + "learning_rate": 6.761095263832751e-06, + "loss": 0.6393, + "step": 3435 + }, + { + "epoch": 1.3411397345823575, + "grad_norm": 0.6599989805638113, + "learning_rate": 6.758969686982929e-06, + "loss": 0.6264, + "step": 3436 + }, + { + "epoch": 1.3415300546448088, + "grad_norm": 0.5891206245165824, + "learning_rate": 6.75684374728694e-06, + "loss": 0.6267, + "step": 3437 + }, + { + "epoch": 1.3419203747072599, + "grad_norm": 0.6152584699093596, + "learning_rate": 6.754717445183325e-06, + "loss": 0.5848, + "step": 3438 + }, + { + "epoch": 1.3423106947697112, + "grad_norm": 0.5847428055490967, + "learning_rate": 6.7525907811107104e-06, + "loss": 0.6172, + "step": 3439 + }, + { + "epoch": 1.3427010148321623, + "grad_norm": 0.5871851615682521, + "learning_rate": 6.750463755507788e-06, + "loss": 0.6062, + "step": 3440 + }, + { + "epoch": 1.3430913348946136, + "grad_norm": 0.5899587245485655, + "learning_rate": 6.74833636881333e-06, + "loss": 0.6013, + "step": 3441 + }, + { + "epoch": 1.3434816549570647, + "grad_norm": 0.5252188510915122, + "learning_rate": 6.74620862146618e-06, + "loss": 0.5883, + "step": 3442 + }, + { + "epoch": 1.343871975019516, + "grad_norm": 0.5833365570251932, + "learning_rate": 6.744080513905257e-06, + "loss": 0.582, + "step": 3443 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 0.5412309933633178, + "learning_rate": 6.741952046569553e-06, + "loss": 0.6079, + "step": 3444 + }, + { + "epoch": 1.3446526151444185, + "grad_norm": 0.5337413844379284, + "learning_rate": 6.739823219898136e-06, + "loss": 0.6126, + "step": 3445 + }, + { + "epoch": 1.3450429352068696, + "grad_norm": 0.5027576079586086, + "learning_rate": 6.737694034330149e-06, + "loss": 0.6223, + "step": 3446 + }, + { + "epoch": 1.345433255269321, + "grad_norm": 0.6478921404783714, + "learning_rate": 6.735564490304805e-06, + "loss": 0.6113, + "step": 3447 + }, + { + "epoch": 1.345823575331772, + "grad_norm": 0.7278713242398419, + "learning_rate": 6.733434588261395e-06, + "loss": 0.6306, + "step": 3448 + }, + { + "epoch": 1.3462138953942233, + "grad_norm": 0.5188653354654704, + "learning_rate": 6.731304328639281e-06, + "loss": 0.5942, + "step": 3449 + }, + { + "epoch": 1.3466042154566744, + "grad_norm": 0.6537578174936567, + "learning_rate": 6.729173711877905e-06, + "loss": 0.6048, + "step": 3450 + }, + { + "epoch": 1.3469945355191257, + "grad_norm": 0.5808557906349239, + "learning_rate": 6.727042738416771e-06, + "loss": 0.6112, + "step": 3451 + }, + { + "epoch": 1.3473848555815768, + "grad_norm": 0.5712283576673434, + "learning_rate": 6.724911408695466e-06, + "loss": 0.571, + "step": 3452 + }, + { + "epoch": 1.3477751756440282, + "grad_norm": 0.5805452746513928, + "learning_rate": 6.722779723153649e-06, + "loss": 0.6164, + "step": 3453 + }, + { + "epoch": 1.3481654957064793, + "grad_norm": 0.6197566165843439, + "learning_rate": 6.72064768223105e-06, + "loss": 0.6026, + "step": 3454 + }, + { + "epoch": 1.3485558157689306, + "grad_norm": 0.5179766670578485, + "learning_rate": 6.718515286367475e-06, + "loss": 0.5574, + "step": 3455 + }, + { + "epoch": 1.3489461358313817, + "grad_norm": 0.5814755193100841, + "learning_rate": 6.716382536002801e-06, + "loss": 0.6046, + "step": 3456 + }, + { + "epoch": 1.349336455893833, + "grad_norm": 0.48163720824802464, + "learning_rate": 6.714249431576978e-06, + "loss": 0.6238, + "step": 3457 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 0.6495214631468693, + "learning_rate": 6.712115973530032e-06, + "loss": 0.5623, + "step": 3458 + }, + { + "epoch": 1.3501170960187354, + "grad_norm": 0.5474728920514906, + "learning_rate": 6.7099821623020575e-06, + "loss": 0.614, + "step": 3459 + }, + { + "epoch": 1.3505074160811865, + "grad_norm": 0.5540287064030744, + "learning_rate": 6.707847998333225e-06, + "loss": 0.5841, + "step": 3460 + }, + { + "epoch": 1.3508977361436378, + "grad_norm": 0.594089046817321, + "learning_rate": 6.705713482063776e-06, + "loss": 0.5686, + "step": 3461 + }, + { + "epoch": 1.351288056206089, + "grad_norm": 0.5225443636077121, + "learning_rate": 6.703578613934029e-06, + "loss": 0.6291, + "step": 3462 + }, + { + "epoch": 1.3516783762685403, + "grad_norm": 0.5998073002595337, + "learning_rate": 6.701443394384368e-06, + "loss": 0.5895, + "step": 3463 + }, + { + "epoch": 1.3520686963309914, + "grad_norm": 0.6035689716941322, + "learning_rate": 6.6993078238552564e-06, + "loss": 0.6205, + "step": 3464 + }, + { + "epoch": 1.3524590163934427, + "grad_norm": 0.5812610179714447, + "learning_rate": 6.697171902787225e-06, + "loss": 0.6184, + "step": 3465 + }, + { + "epoch": 1.3528493364558938, + "grad_norm": 0.6532457940012708, + "learning_rate": 6.695035631620879e-06, + "loss": 0.5988, + "step": 3466 + }, + { + "epoch": 1.353239656518345, + "grad_norm": 0.5812656915418845, + "learning_rate": 6.692899010796895e-06, + "loss": 0.5951, + "step": 3467 + }, + { + "epoch": 1.3536299765807962, + "grad_norm": 0.6715678232201974, + "learning_rate": 6.690762040756025e-06, + "loss": 0.5879, + "step": 3468 + }, + { + "epoch": 1.3540202966432475, + "grad_norm": 0.6440967454909936, + "learning_rate": 6.688624721939087e-06, + "loss": 0.6053, + "step": 3469 + }, + { + "epoch": 1.3544106167056986, + "grad_norm": 0.5585006828653598, + "learning_rate": 6.686487054786975e-06, + "loss": 0.6034, + "step": 3470 + }, + { + "epoch": 1.35480093676815, + "grad_norm": 0.6425426254264924, + "learning_rate": 6.684349039740655e-06, + "loss": 0.5483, + "step": 3471 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.5588297811598236, + "learning_rate": 6.682210677241165e-06, + "loss": 0.6015, + "step": 3472 + }, + { + "epoch": 1.3555815768930524, + "grad_norm": 0.5177275340200665, + "learning_rate": 6.6800719677296124e-06, + "loss": 0.5679, + "step": 3473 + }, + { + "epoch": 1.3559718969555035, + "grad_norm": 0.6311326571332837, + "learning_rate": 6.677932911647178e-06, + "loss": 0.6079, + "step": 3474 + }, + { + "epoch": 1.3563622170179548, + "grad_norm": 0.5595098789530327, + "learning_rate": 6.675793509435111e-06, + "loss": 0.61, + "step": 3475 + }, + { + "epoch": 1.3567525370804059, + "grad_norm": 0.5571160385209016, + "learning_rate": 6.6736537615347374e-06, + "loss": 0.5989, + "step": 3476 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.5719085536389756, + "learning_rate": 6.67151366838745e-06, + "loss": 0.6094, + "step": 3477 + }, + { + "epoch": 1.3575331772053083, + "grad_norm": 0.5980480949263521, + "learning_rate": 6.669373230434715e-06, + "loss": 0.5975, + "step": 3478 + }, + { + "epoch": 1.3579234972677596, + "grad_norm": 0.6089158183760163, + "learning_rate": 6.667232448118068e-06, + "loss": 0.5923, + "step": 3479 + }, + { + "epoch": 1.3583138173302107, + "grad_norm": 0.6547569572304424, + "learning_rate": 6.6650913218791195e-06, + "loss": 0.5958, + "step": 3480 + }, + { + "epoch": 1.358704137392662, + "grad_norm": 0.6615389699332616, + "learning_rate": 6.662949852159546e-06, + "loss": 0.5904, + "step": 3481 + }, + { + "epoch": 1.3590944574551131, + "grad_norm": 0.5894428091718429, + "learning_rate": 6.660808039401096e-06, + "loss": 0.6192, + "step": 3482 + }, + { + "epoch": 1.3594847775175645, + "grad_norm": 0.5650615884804461, + "learning_rate": 6.658665884045592e-06, + "loss": 0.5969, + "step": 3483 + }, + { + "epoch": 1.3598750975800156, + "grad_norm": 0.6050506771971774, + "learning_rate": 6.656523386534925e-06, + "loss": 0.6293, + "step": 3484 + }, + { + "epoch": 1.360265417642467, + "grad_norm": 0.5467217557769013, + "learning_rate": 6.654380547311054e-06, + "loss": 0.6284, + "step": 3485 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 0.6239758094979012, + "learning_rate": 6.652237366816015e-06, + "loss": 0.5814, + "step": 3486 + }, + { + "epoch": 1.3610460577673693, + "grad_norm": 0.60606846994118, + "learning_rate": 6.650093845491908e-06, + "loss": 0.5896, + "step": 3487 + }, + { + "epoch": 1.3614363778298204, + "grad_norm": 0.4918328725545111, + "learning_rate": 6.647949983780906e-06, + "loss": 0.6201, + "step": 3488 + }, + { + "epoch": 1.3618266978922717, + "grad_norm": 0.5700665761200276, + "learning_rate": 6.645805782125253e-06, + "loss": 0.6013, + "step": 3489 + }, + { + "epoch": 1.3622170179547228, + "grad_norm": 0.6094594991538016, + "learning_rate": 6.64366124096726e-06, + "loss": 0.6212, + "step": 3490 + }, + { + "epoch": 1.3626073380171742, + "grad_norm": 0.6181199829738304, + "learning_rate": 6.641516360749312e-06, + "loss": 0.5899, + "step": 3491 + }, + { + "epoch": 1.3629976580796253, + "grad_norm": 0.5547634753355281, + "learning_rate": 6.639371141913862e-06, + "loss": 0.5648, + "step": 3492 + }, + { + "epoch": 1.3633879781420766, + "grad_norm": 0.6142950838816762, + "learning_rate": 6.637225584903432e-06, + "loss": 0.5989, + "step": 3493 + }, + { + "epoch": 1.3637782982045277, + "grad_norm": 0.5897840143795284, + "learning_rate": 6.635079690160615e-06, + "loss": 0.5843, + "step": 3494 + }, + { + "epoch": 1.364168618266979, + "grad_norm": 0.6747272680673417, + "learning_rate": 6.632933458128074e-06, + "loss": 0.6338, + "step": 3495 + }, + { + "epoch": 1.36455893832943, + "grad_norm": 0.5496268345647579, + "learning_rate": 6.630786889248539e-06, + "loss": 0.6168, + "step": 3496 + }, + { + "epoch": 1.3649492583918814, + "grad_norm": 0.6264056870691564, + "learning_rate": 6.628639983964814e-06, + "loss": 0.6238, + "step": 3497 + }, + { + "epoch": 1.3653395784543325, + "grad_norm": 0.6525014209955801, + "learning_rate": 6.626492742719768e-06, + "loss": 0.5882, + "step": 3498 + }, + { + "epoch": 1.3657298985167838, + "grad_norm": 0.5631012896565804, + "learning_rate": 6.624345165956344e-06, + "loss": 0.628, + "step": 3499 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.5570979603282608, + "learning_rate": 6.622197254117547e-06, + "loss": 0.5936, + "step": 3500 + }, + { + "epoch": 1.3665105386416863, + "grad_norm": 0.6341132756900651, + "learning_rate": 6.620049007646459e-06, + "loss": 0.5739, + "step": 3501 + }, + { + "epoch": 1.3669008587041374, + "grad_norm": 0.6830711316370064, + "learning_rate": 6.617900426986224e-06, + "loss": 0.5957, + "step": 3502 + }, + { + "epoch": 1.3672911787665887, + "grad_norm": 0.6948280920459541, + "learning_rate": 6.615751512580063e-06, + "loss": 0.5898, + "step": 3503 + }, + { + "epoch": 1.3676814988290398, + "grad_norm": 0.5614501430426688, + "learning_rate": 6.6136022648712575e-06, + "loss": 0.5638, + "step": 3504 + }, + { + "epoch": 1.368071818891491, + "grad_norm": 0.5906556992419758, + "learning_rate": 6.611452684303163e-06, + "loss": 0.6027, + "step": 3505 + }, + { + "epoch": 1.3684621389539422, + "grad_norm": 0.5697730381574002, + "learning_rate": 6.609302771319202e-06, + "loss": 0.6367, + "step": 3506 + }, + { + "epoch": 1.3688524590163935, + "grad_norm": 0.7018387994018382, + "learning_rate": 6.607152526362866e-06, + "loss": 0.6037, + "step": 3507 + }, + { + "epoch": 1.3692427790788446, + "grad_norm": 0.7578717410804207, + "learning_rate": 6.605001949877716e-06, + "loss": 0.5857, + "step": 3508 + }, + { + "epoch": 1.369633099141296, + "grad_norm": 0.569364127239859, + "learning_rate": 6.602851042307376e-06, + "loss": 0.6232, + "step": 3509 + }, + { + "epoch": 1.370023419203747, + "grad_norm": 0.6151054531881349, + "learning_rate": 6.600699804095547e-06, + "loss": 0.5973, + "step": 3510 + }, + { + "epoch": 1.3704137392661984, + "grad_norm": 0.540870124483734, + "learning_rate": 6.598548235685991e-06, + "loss": 0.6076, + "step": 3511 + }, + { + "epoch": 1.3708040593286495, + "grad_norm": 0.5888121302719279, + "learning_rate": 6.596396337522543e-06, + "loss": 0.6332, + "step": 3512 + }, + { + "epoch": 1.3711943793911008, + "grad_norm": 0.5865049953454945, + "learning_rate": 6.5942441100491e-06, + "loss": 0.6205, + "step": 3513 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 0.5673246637227493, + "learning_rate": 6.592091553709634e-06, + "loss": 0.6081, + "step": 3514 + }, + { + "epoch": 1.3719750195160032, + "grad_norm": 0.5046838237271181, + "learning_rate": 6.58993866894818e-06, + "loss": 0.5671, + "step": 3515 + }, + { + "epoch": 1.3723653395784543, + "grad_norm": 0.651408689253293, + "learning_rate": 6.587785456208844e-06, + "loss": 0.5741, + "step": 3516 + }, + { + "epoch": 1.3727556596409056, + "grad_norm": 0.5836552139108641, + "learning_rate": 6.585631915935796e-06, + "loss": 0.549, + "step": 3517 + }, + { + "epoch": 1.3731459797033567, + "grad_norm": 0.5436853361389077, + "learning_rate": 6.583478048573277e-06, + "loss": 0.5727, + "step": 3518 + }, + { + "epoch": 1.373536299765808, + "grad_norm": 0.6660947968987392, + "learning_rate": 6.581323854565592e-06, + "loss": 0.5629, + "step": 3519 + }, + { + "epoch": 1.3739266198282591, + "grad_norm": 0.6111786143891706, + "learning_rate": 6.579169334357118e-06, + "loss": 0.6391, + "step": 3520 + }, + { + "epoch": 1.3743169398907105, + "grad_norm": 0.6194840028124595, + "learning_rate": 6.577014488392295e-06, + "loss": 0.5985, + "step": 3521 + }, + { + "epoch": 1.3747072599531616, + "grad_norm": 0.5497598158233193, + "learning_rate": 6.574859317115632e-06, + "loss": 0.6103, + "step": 3522 + }, + { + "epoch": 1.3750975800156127, + "grad_norm": 0.5110838166315951, + "learning_rate": 6.572703820971704e-06, + "loss": 0.6105, + "step": 3523 + }, + { + "epoch": 1.375487900078064, + "grad_norm": 0.5796704029721921, + "learning_rate": 6.570548000405157e-06, + "loss": 0.594, + "step": 3524 + }, + { + "epoch": 1.3758782201405153, + "grad_norm": 0.6399824776941425, + "learning_rate": 6.568391855860698e-06, + "loss": 0.5909, + "step": 3525 + }, + { + "epoch": 1.3762685402029664, + "grad_norm": 0.6204164761992315, + "learning_rate": 6.5662353877831045e-06, + "loss": 0.6031, + "step": 3526 + }, + { + "epoch": 1.3766588602654175, + "grad_norm": 0.5593921017194039, + "learning_rate": 6.5640785966172205e-06, + "loss": 0.6099, + "step": 3527 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.5427547400925485, + "learning_rate": 6.5619214828079555e-06, + "loss": 0.5865, + "step": 3528 + }, + { + "epoch": 1.3774395003903201, + "grad_norm": 0.6289260298895814, + "learning_rate": 6.5597640468002866e-06, + "loss": 0.6052, + "step": 3529 + }, + { + "epoch": 1.3778298204527712, + "grad_norm": 0.6443725348874638, + "learning_rate": 6.557606289039257e-06, + "loss": 0.6252, + "step": 3530 + }, + { + "epoch": 1.3782201405152223, + "grad_norm": 0.7342589608295345, + "learning_rate": 6.555448209969977e-06, + "loss": 0.6183, + "step": 3531 + }, + { + "epoch": 1.3786104605776737, + "grad_norm": 0.5686706057279788, + "learning_rate": 6.55328981003762e-06, + "loss": 0.6166, + "step": 3532 + }, + { + "epoch": 1.379000780640125, + "grad_norm": 0.5616714450336425, + "learning_rate": 6.551131089687429e-06, + "loss": 0.6173, + "step": 3533 + }, + { + "epoch": 1.379391100702576, + "grad_norm": 0.5532370336155561, + "learning_rate": 6.548972049364715e-06, + "loss": 0.6335, + "step": 3534 + }, + { + "epoch": 1.3797814207650272, + "grad_norm": 0.6178535339939235, + "learning_rate": 6.546812689514847e-06, + "loss": 0.6306, + "step": 3535 + }, + { + "epoch": 1.3801717408274785, + "grad_norm": 0.5860393713010372, + "learning_rate": 6.544653010583268e-06, + "loss": 0.6078, + "step": 3536 + }, + { + "epoch": 1.3805620608899298, + "grad_norm": 0.5530642455123598, + "learning_rate": 6.542493013015485e-06, + "loss": 0.605, + "step": 3537 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 0.543208865472969, + "learning_rate": 6.540332697257065e-06, + "loss": 0.6245, + "step": 3538 + }, + { + "epoch": 1.381342701014832, + "grad_norm": 0.6502039105974108, + "learning_rate": 6.538172063753651e-06, + "loss": 0.6268, + "step": 3539 + }, + { + "epoch": 1.3817330210772834, + "grad_norm": 0.5874953883713425, + "learning_rate": 6.536011112950942e-06, + "loss": 0.6344, + "step": 3540 + }, + { + "epoch": 1.3821233411397347, + "grad_norm": 0.5478332462712303, + "learning_rate": 6.533849845294706e-06, + "loss": 0.5986, + "step": 3541 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 0.5455788593706794, + "learning_rate": 6.5316882612307785e-06, + "loss": 0.6013, + "step": 3542 + }, + { + "epoch": 1.3829039812646369, + "grad_norm": 0.5367684359972811, + "learning_rate": 6.529526361205058e-06, + "loss": 0.6029, + "step": 3543 + }, + { + "epoch": 1.3832943013270882, + "grad_norm": 0.5367227229428695, + "learning_rate": 6.527364145663507e-06, + "loss": 0.5913, + "step": 3544 + }, + { + "epoch": 1.3836846213895395, + "grad_norm": 0.5073577395177022, + "learning_rate": 6.525201615052155e-06, + "loss": 0.6117, + "step": 3545 + }, + { + "epoch": 1.3840749414519906, + "grad_norm": 0.5640954248548151, + "learning_rate": 6.5230387698170975e-06, + "loss": 0.5935, + "step": 3546 + }, + { + "epoch": 1.3844652615144417, + "grad_norm": 0.5524341916155528, + "learning_rate": 6.520875610404492e-06, + "loss": 0.5836, + "step": 3547 + }, + { + "epoch": 1.384855581576893, + "grad_norm": 0.4682174104002333, + "learning_rate": 6.518712137260564e-06, + "loss": 0.6257, + "step": 3548 + }, + { + "epoch": 1.3852459016393444, + "grad_norm": 0.5993448340186834, + "learning_rate": 6.516548350831597e-06, + "loss": 0.6106, + "step": 3549 + }, + { + "epoch": 1.3856362217017955, + "grad_norm": 0.5530017567862663, + "learning_rate": 6.514384251563951e-06, + "loss": 0.5662, + "step": 3550 + }, + { + "epoch": 1.3860265417642466, + "grad_norm": 0.5357370784013119, + "learning_rate": 6.512219839904039e-06, + "loss": 0.5829, + "step": 3551 + }, + { + "epoch": 1.3864168618266979, + "grad_norm": 0.5553141112736013, + "learning_rate": 6.510055116298347e-06, + "loss": 0.527, + "step": 3552 + }, + { + "epoch": 1.3868071818891492, + "grad_norm": 0.6231878320194207, + "learning_rate": 6.507890081193417e-06, + "loss": 0.6042, + "step": 3553 + }, + { + "epoch": 1.3871975019516003, + "grad_norm": 0.5208260113758046, + "learning_rate": 6.505724735035863e-06, + "loss": 0.6471, + "step": 3554 + }, + { + "epoch": 1.3875878220140514, + "grad_norm": 0.5540840910319087, + "learning_rate": 6.503559078272359e-06, + "loss": 0.6303, + "step": 3555 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.5353760485358399, + "learning_rate": 6.501393111349643e-06, + "loss": 0.5927, + "step": 3556 + }, + { + "epoch": 1.388368462138954, + "grad_norm": 0.5121094791824073, + "learning_rate": 6.499226834714518e-06, + "loss": 0.5706, + "step": 3557 + }, + { + "epoch": 1.3887587822014051, + "grad_norm": 0.5426817604902516, + "learning_rate": 6.497060248813851e-06, + "loss": 0.6145, + "step": 3558 + }, + { + "epoch": 1.3891491022638562, + "grad_norm": 0.5381571328805067, + "learning_rate": 6.4948933540945734e-06, + "loss": 0.5836, + "step": 3559 + }, + { + "epoch": 1.3895394223263076, + "grad_norm": 0.537881978917245, + "learning_rate": 6.49272615100368e-06, + "loss": 0.5906, + "step": 3560 + }, + { + "epoch": 1.3899297423887589, + "grad_norm": 0.49486791870013164, + "learning_rate": 6.490558639988227e-06, + "loss": 0.6011, + "step": 3561 + }, + { + "epoch": 1.39032006245121, + "grad_norm": 0.5226220606272283, + "learning_rate": 6.4883908214953365e-06, + "loss": 0.5777, + "step": 3562 + }, + { + "epoch": 1.390710382513661, + "grad_norm": 0.49223239347035064, + "learning_rate": 6.4862226959721945e-06, + "loss": 0.6274, + "step": 3563 + }, + { + "epoch": 1.3911007025761124, + "grad_norm": 0.6103064624091685, + "learning_rate": 6.484054263866046e-06, + "loss": 0.5978, + "step": 3564 + }, + { + "epoch": 1.3914910226385637, + "grad_norm": 0.593262560585127, + "learning_rate": 6.481885525624208e-06, + "loss": 0.6185, + "step": 3565 + }, + { + "epoch": 1.3918813427010148, + "grad_norm": 0.5717836127027316, + "learning_rate": 6.479716481694049e-06, + "loss": 0.5915, + "step": 3566 + }, + { + "epoch": 1.392271662763466, + "grad_norm": 0.5936957262535214, + "learning_rate": 6.47754713252301e-06, + "loss": 0.6025, + "step": 3567 + }, + { + "epoch": 1.3926619828259172, + "grad_norm": 0.55162012497258, + "learning_rate": 6.475377478558591e-06, + "loss": 0.5997, + "step": 3568 + }, + { + "epoch": 1.3930523028883686, + "grad_norm": 0.6703959744998865, + "learning_rate": 6.473207520248357e-06, + "loss": 0.5963, + "step": 3569 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.8252281673436102, + "learning_rate": 6.4710372580399315e-06, + "loss": 0.6453, + "step": 3570 + }, + { + "epoch": 1.3938329430132708, + "grad_norm": 0.6020740155248877, + "learning_rate": 6.468866692381006e-06, + "loss": 0.5984, + "step": 3571 + }, + { + "epoch": 1.394223263075722, + "grad_norm": 0.6654104219192403, + "learning_rate": 6.466695823719329e-06, + "loss": 0.6246, + "step": 3572 + }, + { + "epoch": 1.3946135831381734, + "grad_norm": 0.6630616185668926, + "learning_rate": 6.464524652502717e-06, + "loss": 0.5932, + "step": 3573 + }, + { + "epoch": 1.3950039032006245, + "grad_norm": 0.6779434899456249, + "learning_rate": 6.462353179179048e-06, + "loss": 0.6029, + "step": 3574 + }, + { + "epoch": 1.3953942232630756, + "grad_norm": 0.6067437087684683, + "learning_rate": 6.460181404196256e-06, + "loss": 0.6403, + "step": 3575 + }, + { + "epoch": 1.395784543325527, + "grad_norm": 0.6355391292657896, + "learning_rate": 6.458009328002346e-06, + "loss": 0.6369, + "step": 3576 + }, + { + "epoch": 1.3961748633879782, + "grad_norm": 0.5614981385569233, + "learning_rate": 6.45583695104538e-06, + "loss": 0.5949, + "step": 3577 + }, + { + "epoch": 1.3965651834504293, + "grad_norm": 0.5464994566803788, + "learning_rate": 6.453664273773483e-06, + "loss": 0.5806, + "step": 3578 + }, + { + "epoch": 1.3969555035128804, + "grad_norm": 0.577675067684953, + "learning_rate": 6.451491296634842e-06, + "loss": 0.6078, + "step": 3579 + }, + { + "epoch": 1.3973458235753318, + "grad_norm": 0.5825684983561421, + "learning_rate": 6.449318020077706e-06, + "loss": 0.6088, + "step": 3580 + }, + { + "epoch": 1.397736143637783, + "grad_norm": 0.6047481806692765, + "learning_rate": 6.447144444550386e-06, + "loss": 0.6399, + "step": 3581 + }, + { + "epoch": 1.3981264637002342, + "grad_norm": 0.5362149951568149, + "learning_rate": 6.444970570501255e-06, + "loss": 0.5764, + "step": 3582 + }, + { + "epoch": 1.3985167837626853, + "grad_norm": 0.5122977558327204, + "learning_rate": 6.442796398378748e-06, + "loss": 0.6178, + "step": 3583 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.5675112212473431, + "learning_rate": 6.440621928631358e-06, + "loss": 0.6196, + "step": 3584 + }, + { + "epoch": 1.399297423887588, + "grad_norm": 0.5425090662647534, + "learning_rate": 6.438447161707643e-06, + "loss": 0.5785, + "step": 3585 + }, + { + "epoch": 1.399687743950039, + "grad_norm": 0.5338193398152676, + "learning_rate": 6.436272098056221e-06, + "loss": 0.6338, + "step": 3586 + }, + { + "epoch": 1.4000780640124901, + "grad_norm": 0.5508150057920057, + "learning_rate": 6.434096738125772e-06, + "loss": 0.6167, + "step": 3587 + }, + { + "epoch": 1.4004683840749415, + "grad_norm": 0.647781273697351, + "learning_rate": 6.431921082365036e-06, + "loss": 0.5974, + "step": 3588 + }, + { + "epoch": 1.4008587041373928, + "grad_norm": 0.508808163806291, + "learning_rate": 6.4297451312228145e-06, + "loss": 0.5821, + "step": 3589 + }, + { + "epoch": 1.4012490241998439, + "grad_norm": 0.5050324990972834, + "learning_rate": 6.42756888514797e-06, + "loss": 0.6203, + "step": 3590 + }, + { + "epoch": 1.401639344262295, + "grad_norm": 0.516270538778929, + "learning_rate": 6.425392344589427e-06, + "loss": 0.606, + "step": 3591 + }, + { + "epoch": 1.4020296643247463, + "grad_norm": 0.5615137831958358, + "learning_rate": 6.423215509996168e-06, + "loss": 0.5991, + "step": 3592 + }, + { + "epoch": 1.4024199843871976, + "grad_norm": 0.5432227507179883, + "learning_rate": 6.421038381817239e-06, + "loss": 0.596, + "step": 3593 + }, + { + "epoch": 1.4028103044496487, + "grad_norm": 0.5434417996042936, + "learning_rate": 6.418860960501744e-06, + "loss": 0.6352, + "step": 3594 + }, + { + "epoch": 1.4032006245120998, + "grad_norm": 0.5553902566679453, + "learning_rate": 6.416683246498849e-06, + "loss": 0.593, + "step": 3595 + }, + { + "epoch": 1.4035909445745511, + "grad_norm": 0.5432447567576701, + "learning_rate": 6.414505240257782e-06, + "loss": 0.5736, + "step": 3596 + }, + { + "epoch": 1.4039812646370025, + "grad_norm": 0.5152319770069973, + "learning_rate": 6.412326942227827e-06, + "loss": 0.5666, + "step": 3597 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 0.5324128829111031, + "learning_rate": 6.410148352858333e-06, + "loss": 0.5933, + "step": 3598 + }, + { + "epoch": 1.4047619047619047, + "grad_norm": 0.5606496338492294, + "learning_rate": 6.4079694725987054e-06, + "loss": 0.6234, + "step": 3599 + }, + { + "epoch": 1.405152224824356, + "grad_norm": 0.5023748157075314, + "learning_rate": 6.4057903018984114e-06, + "loss": 0.5809, + "step": 3600 + }, + { + "epoch": 1.4055425448868073, + "grad_norm": 0.5767919978575154, + "learning_rate": 6.403610841206976e-06, + "loss": 0.6115, + "step": 3601 + }, + { + "epoch": 1.4059328649492584, + "grad_norm": 0.5399926553058768, + "learning_rate": 6.401431090973987e-06, + "loss": 0.6107, + "step": 3602 + }, + { + "epoch": 1.4063231850117095, + "grad_norm": 0.5585250203972596, + "learning_rate": 6.399251051649091e-06, + "loss": 0.6021, + "step": 3603 + }, + { + "epoch": 1.4067135050741608, + "grad_norm": 0.5746163597555639, + "learning_rate": 6.397070723681993e-06, + "loss": 0.626, + "step": 3604 + }, + { + "epoch": 1.4071038251366121, + "grad_norm": 0.5794769222963696, + "learning_rate": 6.3948901075224586e-06, + "loss": 0.5648, + "step": 3605 + }, + { + "epoch": 1.4074941451990632, + "grad_norm": 0.571647307237817, + "learning_rate": 6.392709203620313e-06, + "loss": 0.5697, + "step": 3606 + }, + { + "epoch": 1.4078844652615143, + "grad_norm": 0.5950714578221322, + "learning_rate": 6.39052801242544e-06, + "loss": 0.6005, + "step": 3607 + }, + { + "epoch": 1.4082747853239657, + "grad_norm": 0.5836287686164929, + "learning_rate": 6.388346534387783e-06, + "loss": 0.6557, + "step": 3608 + }, + { + "epoch": 1.408665105386417, + "grad_norm": 0.6639594076962524, + "learning_rate": 6.386164769957346e-06, + "loss": 0.6225, + "step": 3609 + }, + { + "epoch": 1.409055425448868, + "grad_norm": 0.5860633548046411, + "learning_rate": 6.3839827195841875e-06, + "loss": 0.5875, + "step": 3610 + }, + { + "epoch": 1.4094457455113192, + "grad_norm": 0.5621972851346502, + "learning_rate": 6.38180038371843e-06, + "loss": 0.6025, + "step": 3611 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.6475869031575655, + "learning_rate": 6.379617762810254e-06, + "loss": 0.5934, + "step": 3612 + }, + { + "epoch": 1.4102263856362218, + "grad_norm": 0.6219041031503607, + "learning_rate": 6.377434857309898e-06, + "loss": 0.5939, + "step": 3613 + }, + { + "epoch": 1.410616705698673, + "grad_norm": 0.549601842892472, + "learning_rate": 6.375251667667657e-06, + "loss": 0.6137, + "step": 3614 + }, + { + "epoch": 1.411007025761124, + "grad_norm": 0.5682201114755769, + "learning_rate": 6.3730681943338865e-06, + "loss": 0.6115, + "step": 3615 + }, + { + "epoch": 1.4113973458235753, + "grad_norm": 0.7930999034839674, + "learning_rate": 6.370884437759004e-06, + "loss": 0.5834, + "step": 3616 + }, + { + "epoch": 1.4117876658860267, + "grad_norm": 0.5962520719637499, + "learning_rate": 6.3687003983934795e-06, + "loss": 0.5965, + "step": 3617 + }, + { + "epoch": 1.4121779859484778, + "grad_norm": 0.5524876527706853, + "learning_rate": 6.366516076687844e-06, + "loss": 0.6204, + "step": 3618 + }, + { + "epoch": 1.4125683060109289, + "grad_norm": 0.5773121720663861, + "learning_rate": 6.364331473092689e-06, + "loss": 0.5871, + "step": 3619 + }, + { + "epoch": 1.4129586260733802, + "grad_norm": 0.7913586399906767, + "learning_rate": 6.362146588058658e-06, + "loss": 0.6154, + "step": 3620 + }, + { + "epoch": 1.4133489461358315, + "grad_norm": 0.5612520900125783, + "learning_rate": 6.359961422036459e-06, + "loss": 0.6272, + "step": 3621 + }, + { + "epoch": 1.4137392661982826, + "grad_norm": 0.621269274269059, + "learning_rate": 6.357775975476857e-06, + "loss": 0.5745, + "step": 3622 + }, + { + "epoch": 1.4141295862607337, + "grad_norm": 0.584629026796217, + "learning_rate": 6.355590248830669e-06, + "loss": 0.5971, + "step": 3623 + }, + { + "epoch": 1.414519906323185, + "grad_norm": 0.5061121835744683, + "learning_rate": 6.3534042425487765e-06, + "loss": 0.5785, + "step": 3624 + }, + { + "epoch": 1.4149102263856363, + "grad_norm": 0.5434627667880405, + "learning_rate": 6.3512179570821165e-06, + "loss": 0.594, + "step": 3625 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.5869009384195872, + "learning_rate": 6.349031392881682e-06, + "loss": 0.5704, + "step": 3626 + }, + { + "epoch": 1.4156908665105385, + "grad_norm": 0.5675544447594729, + "learning_rate": 6.3468445503985264e-06, + "loss": 0.573, + "step": 3627 + }, + { + "epoch": 1.4160811865729899, + "grad_norm": 0.5166477565600986, + "learning_rate": 6.3446574300837575e-06, + "loss": 0.6147, + "step": 3628 + }, + { + "epoch": 1.4164715066354412, + "grad_norm": 0.5668637518917297, + "learning_rate": 6.342470032388541e-06, + "loss": 0.6232, + "step": 3629 + }, + { + "epoch": 1.4168618266978923, + "grad_norm": 0.5366969474499831, + "learning_rate": 6.340282357764104e-06, + "loss": 0.5957, + "step": 3630 + }, + { + "epoch": 1.4172521467603434, + "grad_norm": 0.5323629489607988, + "learning_rate": 6.338094406661726e-06, + "loss": 0.6075, + "step": 3631 + }, + { + "epoch": 1.4176424668227947, + "grad_norm": 0.5310049255245733, + "learning_rate": 6.3359061795327425e-06, + "loss": 0.625, + "step": 3632 + }, + { + "epoch": 1.418032786885246, + "grad_norm": 0.5534667889129058, + "learning_rate": 6.33371767682855e-06, + "loss": 0.5961, + "step": 3633 + }, + { + "epoch": 1.4184231069476971, + "grad_norm": 0.635046871962103, + "learning_rate": 6.331528899000601e-06, + "loss": 0.5982, + "step": 3634 + }, + { + "epoch": 1.4188134270101482, + "grad_norm": 0.5510401990500129, + "learning_rate": 6.329339846500403e-06, + "loss": 0.5914, + "step": 3635 + }, + { + "epoch": 1.4192037470725996, + "grad_norm": 0.564645155901486, + "learning_rate": 6.327150519779523e-06, + "loss": 0.6242, + "step": 3636 + }, + { + "epoch": 1.4195940671350509, + "grad_norm": 0.5631711908157359, + "learning_rate": 6.3249609192895775e-06, + "loss": 0.6264, + "step": 3637 + }, + { + "epoch": 1.419984387197502, + "grad_norm": 0.5815161908682058, + "learning_rate": 6.32277104548225e-06, + "loss": 0.6072, + "step": 3638 + }, + { + "epoch": 1.420374707259953, + "grad_norm": 0.6075736646817239, + "learning_rate": 6.320580898809273e-06, + "loss": 0.6062, + "step": 3639 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.5210182166008722, + "learning_rate": 6.318390479722438e-06, + "loss": 0.598, + "step": 3640 + }, + { + "epoch": 1.4211553473848557, + "grad_norm": 0.5309785832632632, + "learning_rate": 6.3161997886735905e-06, + "loss": 0.5654, + "step": 3641 + }, + { + "epoch": 1.4215456674473068, + "grad_norm": 0.5848405564779622, + "learning_rate": 6.3140088261146345e-06, + "loss": 0.6219, + "step": 3642 + }, + { + "epoch": 1.421935987509758, + "grad_norm": 0.6422076668770224, + "learning_rate": 6.311817592497529e-06, + "loss": 0.5648, + "step": 3643 + }, + { + "epoch": 1.4223263075722092, + "grad_norm": 0.5568903621460352, + "learning_rate": 6.30962608827429e-06, + "loss": 0.6051, + "step": 3644 + }, + { + "epoch": 1.4227166276346606, + "grad_norm": 0.5620645682409277, + "learning_rate": 6.307434313896986e-06, + "loss": 0.6021, + "step": 3645 + }, + { + "epoch": 1.4231069476971117, + "grad_norm": 0.5849282111855616, + "learning_rate": 6.305242269817746e-06, + "loss": 0.6029, + "step": 3646 + }, + { + "epoch": 1.4234972677595628, + "grad_norm": 0.504194673947021, + "learning_rate": 6.3030499564887516e-06, + "loss": 0.5826, + "step": 3647 + }, + { + "epoch": 1.423887587822014, + "grad_norm": 0.5660677417503815, + "learning_rate": 6.300857374362239e-06, + "loss": 0.6003, + "step": 3648 + }, + { + "epoch": 1.4242779078844654, + "grad_norm": 0.6705170320027538, + "learning_rate": 6.298664523890504e-06, + "loss": 0.6464, + "step": 3649 + }, + { + "epoch": 1.4246682279469165, + "grad_norm": 0.5561760825227821, + "learning_rate": 6.296471405525892e-06, + "loss": 0.5883, + "step": 3650 + }, + { + "epoch": 1.4250585480093676, + "grad_norm": 0.6199650769662413, + "learning_rate": 6.294278019720812e-06, + "loss": 0.6064, + "step": 3651 + }, + { + "epoch": 1.425448868071819, + "grad_norm": 0.7275544414965225, + "learning_rate": 6.292084366927717e-06, + "loss": 0.589, + "step": 3652 + }, + { + "epoch": 1.42583918813427, + "grad_norm": 0.5640573677570243, + "learning_rate": 6.289890447599124e-06, + "loss": 0.6101, + "step": 3653 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.5391513373306501, + "learning_rate": 6.287696262187601e-06, + "loss": 0.5965, + "step": 3654 + }, + { + "epoch": 1.4266198282591724, + "grad_norm": 0.48019715464812934, + "learning_rate": 6.285501811145774e-06, + "loss": 0.6613, + "step": 3655 + }, + { + "epoch": 1.4270101483216238, + "grad_norm": 0.5954994606797486, + "learning_rate": 6.283307094926319e-06, + "loss": 0.5845, + "step": 3656 + }, + { + "epoch": 1.4274004683840749, + "grad_norm": 0.5471882357910736, + "learning_rate": 6.2811121139819705e-06, + "loss": 0.6052, + "step": 3657 + }, + { + "epoch": 1.4277907884465262, + "grad_norm": 0.6332473769637342, + "learning_rate": 6.278916868765518e-06, + "loss": 0.6015, + "step": 3658 + }, + { + "epoch": 1.4281811085089773, + "grad_norm": 0.5610324881529762, + "learning_rate": 6.2767213597298015e-06, + "loss": 0.5923, + "step": 3659 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.5757571262590606, + "learning_rate": 6.27452558732772e-06, + "loss": 0.5907, + "step": 3660 + }, + { + "epoch": 1.4289617486338797, + "grad_norm": 0.5340317118845, + "learning_rate": 6.272329552012224e-06, + "loss": 0.5421, + "step": 3661 + }, + { + "epoch": 1.429352068696331, + "grad_norm": 0.6630790295379674, + "learning_rate": 6.270133254236319e-06, + "loss": 0.5944, + "step": 3662 + }, + { + "epoch": 1.4297423887587821, + "grad_norm": 0.6235951381678715, + "learning_rate": 6.267936694453065e-06, + "loss": 0.6007, + "step": 3663 + }, + { + "epoch": 1.4301327088212334, + "grad_norm": 0.6802070928352595, + "learning_rate": 6.265739873115575e-06, + "loss": 0.5937, + "step": 3664 + }, + { + "epoch": 1.4305230288836845, + "grad_norm": 0.5888627109044526, + "learning_rate": 6.263542790677019e-06, + "loss": 0.5953, + "step": 3665 + }, + { + "epoch": 1.4309133489461359, + "grad_norm": 0.568126812829432, + "learning_rate": 6.261345447590614e-06, + "loss": 0.5994, + "step": 3666 + }, + { + "epoch": 1.431303669008587, + "grad_norm": 0.6212000033483323, + "learning_rate": 6.259147844309641e-06, + "loss": 0.5518, + "step": 3667 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 0.6626344319372782, + "learning_rate": 6.256949981287424e-06, + "loss": 0.5917, + "step": 3668 + }, + { + "epoch": 1.4320843091334894, + "grad_norm": 0.543507918470677, + "learning_rate": 6.254751858977347e-06, + "loss": 0.5819, + "step": 3669 + }, + { + "epoch": 1.4324746291959407, + "grad_norm": 0.5330235078699445, + "learning_rate": 6.2525534778328474e-06, + "loss": 0.6256, + "step": 3670 + }, + { + "epoch": 1.4328649492583918, + "grad_norm": 0.6729451939201959, + "learning_rate": 6.2503548383074156e-06, + "loss": 0.6015, + "step": 3671 + }, + { + "epoch": 1.4332552693208431, + "grad_norm": 0.6688045858805544, + "learning_rate": 6.24815594085459e-06, + "loss": 0.6406, + "step": 3672 + }, + { + "epoch": 1.4336455893832942, + "grad_norm": 0.5108480080978893, + "learning_rate": 6.245956785927971e-06, + "loss": 0.6046, + "step": 3673 + }, + { + "epoch": 1.4340359094457455, + "grad_norm": 0.5869507877462896, + "learning_rate": 6.243757373981203e-06, + "loss": 0.582, + "step": 3674 + }, + { + "epoch": 1.4344262295081966, + "grad_norm": 0.6272128414135468, + "learning_rate": 6.241557705467993e-06, + "loss": 0.5863, + "step": 3675 + }, + { + "epoch": 1.434816549570648, + "grad_norm": 0.6676117623285577, + "learning_rate": 6.239357780842094e-06, + "loss": 0.5986, + "step": 3676 + }, + { + "epoch": 1.435206869633099, + "grad_norm": 0.619186965288521, + "learning_rate": 6.2371576005573135e-06, + "loss": 0.5996, + "step": 3677 + }, + { + "epoch": 1.4355971896955504, + "grad_norm": 0.68725718006897, + "learning_rate": 6.234957165067511e-06, + "loss": 0.5859, + "step": 3678 + }, + { + "epoch": 1.4359875097580015, + "grad_norm": 0.6257653193915542, + "learning_rate": 6.2327564748266035e-06, + "loss": 0.6106, + "step": 3679 + }, + { + "epoch": 1.4363778298204528, + "grad_norm": 0.6378871874982052, + "learning_rate": 6.230555530288552e-06, + "loss": 0.6014, + "step": 3680 + }, + { + "epoch": 1.436768149882904, + "grad_norm": 0.6686756746403961, + "learning_rate": 6.228354331907377e-06, + "loss": 0.5831, + "step": 3681 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.5342296660964533, + "learning_rate": 6.226152880137151e-06, + "loss": 0.6363, + "step": 3682 + }, + { + "epoch": 1.4375487900078063, + "grad_norm": 0.5568806132026763, + "learning_rate": 6.223951175431994e-06, + "loss": 0.5757, + "step": 3683 + }, + { + "epoch": 1.4379391100702577, + "grad_norm": 0.6119857315673959, + "learning_rate": 6.221749218246081e-06, + "loss": 0.5825, + "step": 3684 + }, + { + "epoch": 1.4383294301327088, + "grad_norm": 0.5009547728243099, + "learning_rate": 6.219547009033641e-06, + "loss": 0.5712, + "step": 3685 + }, + { + "epoch": 1.43871975019516, + "grad_norm": 0.5586207513152418, + "learning_rate": 6.2173445482489514e-06, + "loss": 0.5913, + "step": 3686 + }, + { + "epoch": 1.4391100702576112, + "grad_norm": 0.5311477615859944, + "learning_rate": 6.215141836346345e-06, + "loss": 0.5639, + "step": 3687 + }, + { + "epoch": 1.4395003903200625, + "grad_norm": 0.5666603816422165, + "learning_rate": 6.212938873780203e-06, + "loss": 0.6078, + "step": 3688 + }, + { + "epoch": 1.4398907103825136, + "grad_norm": 0.5514633686654767, + "learning_rate": 6.21073566100496e-06, + "loss": 0.5901, + "step": 3689 + }, + { + "epoch": 1.440281030444965, + "grad_norm": 0.5797659228946934, + "learning_rate": 6.208532198475103e-06, + "loss": 0.6303, + "step": 3690 + }, + { + "epoch": 1.440671350507416, + "grad_norm": 0.6301113406884216, + "learning_rate": 6.206328486645169e-06, + "loss": 0.6098, + "step": 3691 + }, + { + "epoch": 1.4410616705698673, + "grad_norm": 0.6454878813178749, + "learning_rate": 6.204124525969748e-06, + "loss": 0.5716, + "step": 3692 + }, + { + "epoch": 1.4414519906323184, + "grad_norm": 0.5231530633290473, + "learning_rate": 6.20192031690348e-06, + "loss": 0.6036, + "step": 3693 + }, + { + "epoch": 1.4418423106947698, + "grad_norm": 0.5360814993944408, + "learning_rate": 6.199715859901055e-06, + "loss": 0.6333, + "step": 3694 + }, + { + "epoch": 1.4422326307572209, + "grad_norm": 0.5232491642210968, + "learning_rate": 6.197511155417217e-06, + "loss": 0.5966, + "step": 3695 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.5923706809928931, + "learning_rate": 6.195306203906761e-06, + "loss": 0.5737, + "step": 3696 + }, + { + "epoch": 1.4430132708821233, + "grad_norm": 0.558188321271828, + "learning_rate": 6.1931010058245326e-06, + "loss": 0.6169, + "step": 3697 + }, + { + "epoch": 1.4434035909445746, + "grad_norm": 0.5266854847262482, + "learning_rate": 6.190895561625423e-06, + "loss": 0.611, + "step": 3698 + }, + { + "epoch": 1.4437939110070257, + "grad_norm": 0.5249395387601563, + "learning_rate": 6.188689871764382e-06, + "loss": 0.5871, + "step": 3699 + }, + { + "epoch": 1.444184231069477, + "grad_norm": 0.6065394027354974, + "learning_rate": 6.186483936696406e-06, + "loss": 0.6148, + "step": 3700 + }, + { + "epoch": 1.4445745511319281, + "grad_norm": 0.4820949492589036, + "learning_rate": 6.184277756876542e-06, + "loss": 0.586, + "step": 3701 + }, + { + "epoch": 1.4449648711943794, + "grad_norm": 0.5290160576468681, + "learning_rate": 6.1820713327598905e-06, + "loss": 0.5862, + "step": 3702 + }, + { + "epoch": 1.4453551912568305, + "grad_norm": 0.5137222327393384, + "learning_rate": 6.179864664801597e-06, + "loss": 0.5822, + "step": 3703 + }, + { + "epoch": 1.4457455113192819, + "grad_norm": 0.5789982359316812, + "learning_rate": 6.177657753456861e-06, + "loss": 0.6259, + "step": 3704 + }, + { + "epoch": 1.446135831381733, + "grad_norm": 0.5193194693862013, + "learning_rate": 6.175450599180934e-06, + "loss": 0.6063, + "step": 3705 + }, + { + "epoch": 1.4465261514441843, + "grad_norm": 0.5582513481077018, + "learning_rate": 6.173243202429113e-06, + "loss": 0.6122, + "step": 3706 + }, + { + "epoch": 1.4469164715066354, + "grad_norm": 0.5153848694649978, + "learning_rate": 6.171035563656749e-06, + "loss": 0.6002, + "step": 3707 + }, + { + "epoch": 1.4473067915690867, + "grad_norm": 0.5536815380788742, + "learning_rate": 6.168827683319238e-06, + "loss": 0.6314, + "step": 3708 + }, + { + "epoch": 1.4476971116315378, + "grad_norm": 0.6026271027955635, + "learning_rate": 6.166619561872031e-06, + "loss": 0.5968, + "step": 3709 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.5178329558069282, + "learning_rate": 6.164411199770628e-06, + "loss": 0.5712, + "step": 3710 + }, + { + "epoch": 1.4484777517564402, + "grad_norm": 0.5520143543245957, + "learning_rate": 6.162202597470574e-06, + "loss": 0.6495, + "step": 3711 + }, + { + "epoch": 1.4488680718188915, + "grad_norm": 0.6425278546986223, + "learning_rate": 6.15999375542747e-06, + "loss": 0.6224, + "step": 3712 + }, + { + "epoch": 1.4492583918813426, + "grad_norm": 0.5311336428941619, + "learning_rate": 6.157784674096961e-06, + "loss": 0.6423, + "step": 3713 + }, + { + "epoch": 1.449648711943794, + "grad_norm": 0.5560848126508755, + "learning_rate": 6.155575353934747e-06, + "loss": 0.5751, + "step": 3714 + }, + { + "epoch": 1.450039032006245, + "grad_norm": 0.5949979654257042, + "learning_rate": 6.153365795396571e-06, + "loss": 0.5997, + "step": 3715 + }, + { + "epoch": 1.4504293520686964, + "grad_norm": 0.5099186349508724, + "learning_rate": 6.151155998938228e-06, + "loss": 0.6182, + "step": 3716 + }, + { + "epoch": 1.4508196721311475, + "grad_norm": 0.5687201077314642, + "learning_rate": 6.1489459650155645e-06, + "loss": 0.6184, + "step": 3717 + }, + { + "epoch": 1.4512099921935988, + "grad_norm": 0.547532385774831, + "learning_rate": 6.146735694084471e-06, + "loss": 0.619, + "step": 3718 + }, + { + "epoch": 1.45160031225605, + "grad_norm": 0.5177247446496858, + "learning_rate": 6.144525186600893e-06, + "loss": 0.5736, + "step": 3719 + }, + { + "epoch": 1.4519906323185012, + "grad_norm": 0.5253050449529134, + "learning_rate": 6.142314443020817e-06, + "loss": 0.6618, + "step": 3720 + }, + { + "epoch": 1.4523809523809523, + "grad_norm": 0.5964446127068701, + "learning_rate": 6.140103463800286e-06, + "loss": 0.5766, + "step": 3721 + }, + { + "epoch": 1.4527712724434036, + "grad_norm": 0.5772082815399507, + "learning_rate": 6.137892249395388e-06, + "loss": 0.604, + "step": 3722 + }, + { + "epoch": 1.4531615925058547, + "grad_norm": 0.4976054786762314, + "learning_rate": 6.135680800262257e-06, + "loss": 0.6162, + "step": 3723 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 0.6415147418760372, + "learning_rate": 6.133469116857082e-06, + "loss": 0.6084, + "step": 3724 + }, + { + "epoch": 1.4539422326307572, + "grad_norm": 0.6050082560781845, + "learning_rate": 6.131257199636093e-06, + "loss": 0.6111, + "step": 3725 + }, + { + "epoch": 1.4543325526932085, + "grad_norm": 0.5524243510362309, + "learning_rate": 6.129045049055575e-06, + "loss": 0.575, + "step": 3726 + }, + { + "epoch": 1.4547228727556596, + "grad_norm": 0.6745469369694056, + "learning_rate": 6.1268326655718546e-06, + "loss": 0.6115, + "step": 3727 + }, + { + "epoch": 1.455113192818111, + "grad_norm": 0.6109362081476973, + "learning_rate": 6.124620049641313e-06, + "loss": 0.6122, + "step": 3728 + }, + { + "epoch": 1.455503512880562, + "grad_norm": 0.5439829955582727, + "learning_rate": 6.122407201720373e-06, + "loss": 0.6408, + "step": 3729 + }, + { + "epoch": 1.4558938329430133, + "grad_norm": 0.5628357829549068, + "learning_rate": 6.12019412226551e-06, + "loss": 0.6113, + "step": 3730 + }, + { + "epoch": 1.4562841530054644, + "grad_norm": 0.594498471220016, + "learning_rate": 6.117980811733245e-06, + "loss": 0.5707, + "step": 3731 + }, + { + "epoch": 1.4566744730679158, + "grad_norm": 0.583782542203666, + "learning_rate": 6.115767270580147e-06, + "loss": 0.6198, + "step": 3732 + }, + { + "epoch": 1.4570647931303669, + "grad_norm": 0.6549005170617502, + "learning_rate": 6.113553499262833e-06, + "loss": 0.6162, + "step": 3733 + }, + { + "epoch": 1.4574551131928182, + "grad_norm": 0.608227162796037, + "learning_rate": 6.111339498237966e-06, + "loss": 0.6284, + "step": 3734 + }, + { + "epoch": 1.4578454332552693, + "grad_norm": 0.5886256117090193, + "learning_rate": 6.109125267962259e-06, + "loss": 0.6224, + "step": 3735 + }, + { + "epoch": 1.4582357533177206, + "grad_norm": 0.6169529028580305, + "learning_rate": 6.10691080889247e-06, + "loss": 0.5911, + "step": 3736 + }, + { + "epoch": 1.4586260733801717, + "grad_norm": 0.7892687339687121, + "learning_rate": 6.1046961214854075e-06, + "loss": 0.5928, + "step": 3737 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.7037081537700958, + "learning_rate": 6.102481206197921e-06, + "loss": 0.5908, + "step": 3738 + }, + { + "epoch": 1.4594067135050741, + "grad_norm": 0.5560203943219121, + "learning_rate": 6.100266063486912e-06, + "loss": 0.5375, + "step": 3739 + }, + { + "epoch": 1.4597970335675254, + "grad_norm": 0.6038326955844844, + "learning_rate": 6.098050693809328e-06, + "loss": 0.5913, + "step": 3740 + }, + { + "epoch": 1.4601873536299765, + "grad_norm": 0.6935252238164497, + "learning_rate": 6.095835097622163e-06, + "loss": 0.6126, + "step": 3741 + }, + { + "epoch": 1.4605776736924279, + "grad_norm": 0.5869191980057091, + "learning_rate": 6.0936192753824575e-06, + "loss": 0.5951, + "step": 3742 + }, + { + "epoch": 1.460967993754879, + "grad_norm": 0.5346270829042572, + "learning_rate": 6.091403227547299e-06, + "loss": 0.5834, + "step": 3743 + }, + { + "epoch": 1.4613583138173303, + "grad_norm": 0.6163245935170556, + "learning_rate": 6.089186954573819e-06, + "loss": 0.6157, + "step": 3744 + }, + { + "epoch": 1.4617486338797814, + "grad_norm": 0.5736938370717479, + "learning_rate": 6.086970456919203e-06, + "loss": 0.5938, + "step": 3745 + }, + { + "epoch": 1.4621389539422327, + "grad_norm": 0.5328429429917836, + "learning_rate": 6.0847537350406704e-06, + "loss": 0.6144, + "step": 3746 + }, + { + "epoch": 1.4625292740046838, + "grad_norm": 0.5739452069878188, + "learning_rate": 6.0825367893955e-06, + "loss": 0.5929, + "step": 3747 + }, + { + "epoch": 1.4629195940671351, + "grad_norm": 0.5432752136610371, + "learning_rate": 6.080319620441009e-06, + "loss": 0.5882, + "step": 3748 + }, + { + "epoch": 1.4633099141295862, + "grad_norm": 0.5471989244889144, + "learning_rate": 6.078102228634559e-06, + "loss": 0.606, + "step": 3749 + }, + { + "epoch": 1.4637002341920375, + "grad_norm": 0.5605344175886902, + "learning_rate": 6.075884614433567e-06, + "loss": 0.6259, + "step": 3750 + }, + { + "epoch": 1.4640905542544886, + "grad_norm": 0.5260365540181527, + "learning_rate": 6.073666778295486e-06, + "loss": 0.5732, + "step": 3751 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.6004654931989583, + "learning_rate": 6.071448720677819e-06, + "loss": 0.616, + "step": 3752 + }, + { + "epoch": 1.464871194379391, + "grad_norm": 0.5856129697279722, + "learning_rate": 6.069230442038115e-06, + "loss": 0.5857, + "step": 3753 + }, + { + "epoch": 1.4652615144418424, + "grad_norm": 0.5472559006017561, + "learning_rate": 6.06701194283397e-06, + "loss": 0.5907, + "step": 3754 + }, + { + "epoch": 1.4656518345042935, + "grad_norm": 0.5204015574099149, + "learning_rate": 6.06479322352302e-06, + "loss": 0.6246, + "step": 3755 + }, + { + "epoch": 1.4660421545667448, + "grad_norm": 0.7147693351826288, + "learning_rate": 6.062574284562951e-06, + "loss": 0.5655, + "step": 3756 + }, + { + "epoch": 1.466432474629196, + "grad_norm": 0.7975522960540379, + "learning_rate": 6.0603551264114925e-06, + "loss": 0.6126, + "step": 3757 + }, + { + "epoch": 1.4668227946916472, + "grad_norm": 0.5388355976633233, + "learning_rate": 6.058135749526423e-06, + "loss": 0.6008, + "step": 3758 + }, + { + "epoch": 1.4672131147540983, + "grad_norm": 0.6362885582312188, + "learning_rate": 6.05591615436556e-06, + "loss": 0.6354, + "step": 3759 + }, + { + "epoch": 1.4676034348165496, + "grad_norm": 0.6932678912177443, + "learning_rate": 6.053696341386769e-06, + "loss": 0.6028, + "step": 3760 + }, + { + "epoch": 1.4679937548790007, + "grad_norm": 0.5096805296841003, + "learning_rate": 6.051476311047962e-06, + "loss": 0.5942, + "step": 3761 + }, + { + "epoch": 1.468384074941452, + "grad_norm": 0.6391501809682351, + "learning_rate": 6.049256063807094e-06, + "loss": 0.6069, + "step": 3762 + }, + { + "epoch": 1.4687743950039032, + "grad_norm": 0.5863621277329875, + "learning_rate": 6.0470356001221644e-06, + "loss": 0.6073, + "step": 3763 + }, + { + "epoch": 1.4691647150663545, + "grad_norm": 0.5374925464249085, + "learning_rate": 6.0448149204512196e-06, + "loss": 0.6379, + "step": 3764 + }, + { + "epoch": 1.4695550351288056, + "grad_norm": 0.5497443425537423, + "learning_rate": 6.042594025252345e-06, + "loss": 0.6121, + "step": 3765 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.5094325784423043, + "learning_rate": 6.040372914983678e-06, + "loss": 0.6218, + "step": 3766 + }, + { + "epoch": 1.470335675253708, + "grad_norm": 0.5100633193560229, + "learning_rate": 6.038151590103395e-06, + "loss": 0.6321, + "step": 3767 + }, + { + "epoch": 1.4707259953161593, + "grad_norm": 0.5732816234888031, + "learning_rate": 6.035930051069721e-06, + "loss": 0.5858, + "step": 3768 + }, + { + "epoch": 1.4711163153786104, + "grad_norm": 0.6227688255227062, + "learning_rate": 6.033708298340917e-06, + "loss": 0.6477, + "step": 3769 + }, + { + "epoch": 1.4715066354410617, + "grad_norm": 0.6084195532588783, + "learning_rate": 6.031486332375298e-06, + "loss": 0.6137, + "step": 3770 + }, + { + "epoch": 1.4718969555035128, + "grad_norm": 0.520427944827441, + "learning_rate": 6.029264153631218e-06, + "loss": 0.5958, + "step": 3771 + }, + { + "epoch": 1.4722872755659642, + "grad_norm": 0.5432507308445803, + "learning_rate": 6.0270417625670756e-06, + "loss": 0.5991, + "step": 3772 + }, + { + "epoch": 1.4726775956284153, + "grad_norm": 0.5173240872365742, + "learning_rate": 6.0248191596413116e-06, + "loss": 0.6165, + "step": 3773 + }, + { + "epoch": 1.4730679156908666, + "grad_norm": 0.6455362625778776, + "learning_rate": 6.022596345312412e-06, + "loss": 0.5862, + "step": 3774 + }, + { + "epoch": 1.4734582357533177, + "grad_norm": 0.5499367997919729, + "learning_rate": 6.020373320038909e-06, + "loss": 0.6063, + "step": 3775 + }, + { + "epoch": 1.473848555815769, + "grad_norm": 0.5745790556250207, + "learning_rate": 6.0181500842793735e-06, + "loss": 0.5623, + "step": 3776 + }, + { + "epoch": 1.4742388758782201, + "grad_norm": 0.5434030122983877, + "learning_rate": 6.015926638492423e-06, + "loss": 0.599, + "step": 3777 + }, + { + "epoch": 1.4746291959406714, + "grad_norm": 0.5473472279526951, + "learning_rate": 6.013702983136715e-06, + "loss": 0.5997, + "step": 3778 + }, + { + "epoch": 1.4750195160031225, + "grad_norm": 0.5490091856660688, + "learning_rate": 6.0114791186709575e-06, + "loss": 0.5912, + "step": 3779 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.6039087384248543, + "learning_rate": 6.0092550455538915e-06, + "loss": 0.6113, + "step": 3780 + }, + { + "epoch": 1.475800156128025, + "grad_norm": 0.6947088715814279, + "learning_rate": 6.007030764244309e-06, + "loss": 0.5899, + "step": 3781 + }, + { + "epoch": 1.4761904761904763, + "grad_norm": 0.5552541920905845, + "learning_rate": 6.004806275201043e-06, + "loss": 0.5949, + "step": 3782 + }, + { + "epoch": 1.4765807962529274, + "grad_norm": 0.7936792635786705, + "learning_rate": 6.002581578882965e-06, + "loss": 0.6078, + "step": 3783 + }, + { + "epoch": 1.4769711163153785, + "grad_norm": 0.7476155811838551, + "learning_rate": 6.000356675748995e-06, + "loss": 0.5794, + "step": 3784 + }, + { + "epoch": 1.4773614363778298, + "grad_norm": 0.5465945054818033, + "learning_rate": 5.998131566258095e-06, + "loss": 0.6004, + "step": 3785 + }, + { + "epoch": 1.4777517564402811, + "grad_norm": 0.6244913230538301, + "learning_rate": 5.995906250869267e-06, + "loss": 0.6007, + "step": 3786 + }, + { + "epoch": 1.4781420765027322, + "grad_norm": 0.7521396743054427, + "learning_rate": 5.993680730041555e-06, + "loss": 0.5723, + "step": 3787 + }, + { + "epoch": 1.4785323965651833, + "grad_norm": 0.5847087389220801, + "learning_rate": 5.991455004234047e-06, + "loss": 0.6092, + "step": 3788 + }, + { + "epoch": 1.4789227166276346, + "grad_norm": 0.5925962761939538, + "learning_rate": 5.9892290739058756e-06, + "loss": 0.5675, + "step": 3789 + }, + { + "epoch": 1.479313036690086, + "grad_norm": 0.6175559840454629, + "learning_rate": 5.987002939516212e-06, + "loss": 0.5978, + "step": 3790 + }, + { + "epoch": 1.479703356752537, + "grad_norm": 0.5752987144611593, + "learning_rate": 5.98477660152427e-06, + "loss": 0.5794, + "step": 3791 + }, + { + "epoch": 1.4800936768149882, + "grad_norm": 0.5423679769079683, + "learning_rate": 5.982550060389306e-06, + "loss": 0.6142, + "step": 3792 + }, + { + "epoch": 1.4804839968774395, + "grad_norm": 0.5894742606162707, + "learning_rate": 5.980323316570621e-06, + "loss": 0.6308, + "step": 3793 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.6422170423467036, + "learning_rate": 5.9780963705275505e-06, + "loss": 0.5778, + "step": 3794 + }, + { + "epoch": 1.481264637002342, + "grad_norm": 0.620259185502586, + "learning_rate": 5.975869222719482e-06, + "loss": 0.6225, + "step": 3795 + }, + { + "epoch": 1.481654957064793, + "grad_norm": 0.56042539440192, + "learning_rate": 5.973641873605833e-06, + "loss": 0.6291, + "step": 3796 + }, + { + "epoch": 1.4820452771272443, + "grad_norm": 0.571012518257135, + "learning_rate": 5.971414323646072e-06, + "loss": 0.5911, + "step": 3797 + }, + { + "epoch": 1.4824355971896956, + "grad_norm": 0.6192851244389006, + "learning_rate": 5.969186573299706e-06, + "loss": 0.6113, + "step": 3798 + }, + { + "epoch": 1.4828259172521467, + "grad_norm": 0.5265546100104855, + "learning_rate": 5.966958623026282e-06, + "loss": 0.619, + "step": 3799 + }, + { + "epoch": 1.4832162373145978, + "grad_norm": 0.5685798321260401, + "learning_rate": 5.964730473285388e-06, + "loss": 0.6215, + "step": 3800 + }, + { + "epoch": 1.4836065573770492, + "grad_norm": 0.63558310110698, + "learning_rate": 5.962502124536655e-06, + "loss": 0.5981, + "step": 3801 + }, + { + "epoch": 1.4839968774395005, + "grad_norm": 0.5857471702760756, + "learning_rate": 5.960273577239756e-06, + "loss": 0.6214, + "step": 3802 + }, + { + "epoch": 1.4843871975019516, + "grad_norm": 0.6218344462980412, + "learning_rate": 5.958044831854401e-06, + "loss": 0.6065, + "step": 3803 + }, + { + "epoch": 1.4847775175644027, + "grad_norm": 0.713428951629485, + "learning_rate": 5.955815888840343e-06, + "loss": 0.6372, + "step": 3804 + }, + { + "epoch": 1.485167837626854, + "grad_norm": 0.6092400608522215, + "learning_rate": 5.953586748657376e-06, + "loss": 0.6105, + "step": 3805 + }, + { + "epoch": 1.4855581576893053, + "grad_norm": 0.5196177651743255, + "learning_rate": 5.951357411765336e-06, + "loss": 0.6024, + "step": 3806 + }, + { + "epoch": 1.4859484777517564, + "grad_norm": 0.6113848929344639, + "learning_rate": 5.949127878624098e-06, + "loss": 0.5662, + "step": 3807 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.5783457724806619, + "learning_rate": 5.946898149693575e-06, + "loss": 0.5825, + "step": 3808 + }, + { + "epoch": 1.4867291178766588, + "grad_norm": 0.6797760721295242, + "learning_rate": 5.944668225433725e-06, + "loss": 0.6169, + "step": 3809 + }, + { + "epoch": 1.4871194379391102, + "grad_norm": 0.5894026993285586, + "learning_rate": 5.942438106304545e-06, + "loss": 0.6351, + "step": 3810 + }, + { + "epoch": 1.4875097580015613, + "grad_norm": 0.5650674712364551, + "learning_rate": 5.94020779276607e-06, + "loss": 0.5796, + "step": 3811 + }, + { + "epoch": 1.4879000780640124, + "grad_norm": 0.5593074273568963, + "learning_rate": 5.937977285278377e-06, + "loss": 0.5881, + "step": 3812 + }, + { + "epoch": 1.4882903981264637, + "grad_norm": 0.5671072533742836, + "learning_rate": 5.93574658430158e-06, + "loss": 0.5845, + "step": 3813 + }, + { + "epoch": 1.488680718188915, + "grad_norm": 0.5210112803262186, + "learning_rate": 5.9335156902958415e-06, + "loss": 0.6041, + "step": 3814 + }, + { + "epoch": 1.489071038251366, + "grad_norm": 0.5805320619672338, + "learning_rate": 5.931284603721352e-06, + "loss": 0.6206, + "step": 3815 + }, + { + "epoch": 1.4894613583138172, + "grad_norm": 0.5256612321293331, + "learning_rate": 5.929053325038351e-06, + "loss": 0.6176, + "step": 3816 + }, + { + "epoch": 1.4898516783762685, + "grad_norm": 0.553169261745752, + "learning_rate": 5.926821854707112e-06, + "loss": 0.6132, + "step": 3817 + }, + { + "epoch": 1.4902419984387199, + "grad_norm": 0.614602866163592, + "learning_rate": 5.924590193187951e-06, + "loss": 0.5743, + "step": 3818 + }, + { + "epoch": 1.490632318501171, + "grad_norm": 0.5643045419047472, + "learning_rate": 5.922358340941224e-06, + "loss": 0.5815, + "step": 3819 + }, + { + "epoch": 1.491022638563622, + "grad_norm": 0.5360142576887903, + "learning_rate": 5.9201262984273245e-06, + "loss": 0.5999, + "step": 3820 + }, + { + "epoch": 1.4914129586260734, + "grad_norm": 0.6017097591290869, + "learning_rate": 5.917894066106683e-06, + "loss": 0.6234, + "step": 3821 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 0.6083793398894081, + "learning_rate": 5.915661644439774e-06, + "loss": 0.5905, + "step": 3822 + }, + { + "epoch": 1.4921935987509758, + "grad_norm": 0.5943830909024655, + "learning_rate": 5.913429033887109e-06, + "loss": 0.6241, + "step": 3823 + }, + { + "epoch": 1.492583918813427, + "grad_norm": 0.6164665776866881, + "learning_rate": 5.911196234909238e-06, + "loss": 0.5935, + "step": 3824 + }, + { + "epoch": 1.4929742388758782, + "grad_norm": 0.6504943111263938, + "learning_rate": 5.9089632479667524e-06, + "loss": 0.6457, + "step": 3825 + }, + { + "epoch": 1.4933645589383295, + "grad_norm": 0.5009933225772354, + "learning_rate": 5.906730073520276e-06, + "loss": 0.615, + "step": 3826 + }, + { + "epoch": 1.4937548790007806, + "grad_norm": 0.5556224714744252, + "learning_rate": 5.9044967120304795e-06, + "loss": 0.5783, + "step": 3827 + }, + { + "epoch": 1.4941451990632317, + "grad_norm": 0.6214552116277352, + "learning_rate": 5.902263163958065e-06, + "loss": 0.5943, + "step": 3828 + }, + { + "epoch": 1.494535519125683, + "grad_norm": 0.5373253179737436, + "learning_rate": 5.900029429763779e-06, + "loss": 0.6029, + "step": 3829 + }, + { + "epoch": 1.4949258391881344, + "grad_norm": 0.5398222190066717, + "learning_rate": 5.8977955099084015e-06, + "loss": 0.5966, + "step": 3830 + }, + { + "epoch": 1.4953161592505855, + "grad_norm": 0.7694836715957373, + "learning_rate": 5.895561404852754e-06, + "loss": 0.6288, + "step": 3831 + }, + { + "epoch": 1.4957064793130366, + "grad_norm": 0.5029419633347836, + "learning_rate": 5.893327115057695e-06, + "loss": 0.6088, + "step": 3832 + }, + { + "epoch": 1.496096799375488, + "grad_norm": 0.5907967619352585, + "learning_rate": 5.891092640984122e-06, + "loss": 0.6083, + "step": 3833 + }, + { + "epoch": 1.4964871194379392, + "grad_norm": 0.5652047870465942, + "learning_rate": 5.88885798309297e-06, + "loss": 0.5854, + "step": 3834 + }, + { + "epoch": 1.4968774395003903, + "grad_norm": 0.5671690800976611, + "learning_rate": 5.886623141845209e-06, + "loss": 0.582, + "step": 3835 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 0.5482013872731412, + "learning_rate": 5.884388117701852e-06, + "loss": 0.6066, + "step": 3836 + }, + { + "epoch": 1.4976580796252927, + "grad_norm": 0.47953195140191884, + "learning_rate": 5.882152911123947e-06, + "loss": 0.5829, + "step": 3837 + }, + { + "epoch": 1.498048399687744, + "grad_norm": 0.5221343421556336, + "learning_rate": 5.879917522572582e-06, + "loss": 0.5836, + "step": 3838 + }, + { + "epoch": 1.4984387197501952, + "grad_norm": 0.560674268651418, + "learning_rate": 5.877681952508875e-06, + "loss": 0.6199, + "step": 3839 + }, + { + "epoch": 1.4988290398126463, + "grad_norm": 0.6117410461509469, + "learning_rate": 5.8754462013939905e-06, + "loss": 0.6477, + "step": 3840 + }, + { + "epoch": 1.4992193598750976, + "grad_norm": 0.5359632924560187, + "learning_rate": 5.873210269689127e-06, + "loss": 0.5986, + "step": 3841 + }, + { + "epoch": 1.499609679937549, + "grad_norm": 0.5441306981751359, + "learning_rate": 5.87097415785552e-06, + "loss": 0.6328, + "step": 3842 + }, + { + "epoch": 1.5, + "grad_norm": 0.526211854772209, + "learning_rate": 5.86873786635444e-06, + "loss": 0.5992, + "step": 3843 + }, + { + "epoch": 1.500390320062451, + "grad_norm": 0.5528289968383348, + "learning_rate": 5.866501395647201e-06, + "loss": 0.6233, + "step": 3844 + }, + { + "epoch": 1.5007806401249024, + "grad_norm": 0.5178545806610428, + "learning_rate": 5.8642647461951426e-06, + "loss": 0.6026, + "step": 3845 + }, + { + "epoch": 1.5011709601873537, + "grad_norm": 0.5377861928721093, + "learning_rate": 5.862027918459655e-06, + "loss": 0.6063, + "step": 3846 + }, + { + "epoch": 1.5015612802498048, + "grad_norm": 0.5980923074024119, + "learning_rate": 5.8597909129021564e-06, + "loss": 0.6179, + "step": 3847 + }, + { + "epoch": 1.501951600312256, + "grad_norm": 0.5131065656683546, + "learning_rate": 5.857553729984104e-06, + "loss": 0.5952, + "step": 3848 + }, + { + "epoch": 1.5023419203747073, + "grad_norm": 0.4983844442047605, + "learning_rate": 5.855316370166989e-06, + "loss": 0.5756, + "step": 3849 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.5234860542254588, + "learning_rate": 5.853078833912343e-06, + "loss": 0.5944, + "step": 3850 + }, + { + "epoch": 1.5031225604996097, + "grad_norm": 0.5748754666928754, + "learning_rate": 5.8508411216817365e-06, + "loss": 0.576, + "step": 3851 + }, + { + "epoch": 1.5035128805620608, + "grad_norm": 0.5847868603113998, + "learning_rate": 5.848603233936765e-06, + "loss": 0.5602, + "step": 3852 + }, + { + "epoch": 1.503903200624512, + "grad_norm": 0.6177487397210764, + "learning_rate": 5.846365171139072e-06, + "loss": 0.6202, + "step": 3853 + }, + { + "epoch": 1.5042935206869634, + "grad_norm": 0.5557388078942243, + "learning_rate": 5.844126933750333e-06, + "loss": 0.6303, + "step": 3854 + }, + { + "epoch": 1.5046838407494145, + "grad_norm": 0.5997271646331896, + "learning_rate": 5.841888522232256e-06, + "loss": 0.6149, + "step": 3855 + }, + { + "epoch": 1.5050741608118656, + "grad_norm": 0.6276615711184668, + "learning_rate": 5.839649937046592e-06, + "loss": 0.6193, + "step": 3856 + }, + { + "epoch": 1.505464480874317, + "grad_norm": 0.5183027858799659, + "learning_rate": 5.8374111786551214e-06, + "loss": 0.5691, + "step": 3857 + }, + { + "epoch": 1.5058548009367683, + "grad_norm": 0.5042287111294557, + "learning_rate": 5.8351722475196635e-06, + "loss": 0.5995, + "step": 3858 + }, + { + "epoch": 1.5062451209992194, + "grad_norm": 0.5735664960282642, + "learning_rate": 5.8329331441020705e-06, + "loss": 0.631, + "step": 3859 + }, + { + "epoch": 1.5066354410616705, + "grad_norm": 0.5557795205426794, + "learning_rate": 5.830693868864236e-06, + "loss": 0.635, + "step": 3860 + }, + { + "epoch": 1.5070257611241218, + "grad_norm": 0.5718446225144587, + "learning_rate": 5.828454422268082e-06, + "loss": 0.5811, + "step": 3861 + }, + { + "epoch": 1.507416081186573, + "grad_norm": 0.5628881443126097, + "learning_rate": 5.82621480477557e-06, + "loss": 0.6017, + "step": 3862 + }, + { + "epoch": 1.5078064012490242, + "grad_norm": 0.6061257391711289, + "learning_rate": 5.823975016848695e-06, + "loss": 0.6157, + "step": 3863 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.513247447824854, + "learning_rate": 5.8217350589494905e-06, + "loss": 0.5912, + "step": 3864 + }, + { + "epoch": 1.5085870413739266, + "grad_norm": 0.5175478533932873, + "learning_rate": 5.819494931540021e-06, + "loss": 0.6335, + "step": 3865 + }, + { + "epoch": 1.508977361436378, + "grad_norm": 0.494163777867591, + "learning_rate": 5.817254635082387e-06, + "loss": 0.6092, + "step": 3866 + }, + { + "epoch": 1.509367681498829, + "grad_norm": 0.5704055287785397, + "learning_rate": 5.815014170038724e-06, + "loss": 0.5975, + "step": 3867 + }, + { + "epoch": 1.5097580015612801, + "grad_norm": 0.548281703625449, + "learning_rate": 5.812773536871205e-06, + "loss": 0.628, + "step": 3868 + }, + { + "epoch": 1.5101483216237315, + "grad_norm": 0.5306464941117033, + "learning_rate": 5.810532736042033e-06, + "loss": 0.5752, + "step": 3869 + }, + { + "epoch": 1.5105386416861828, + "grad_norm": 0.5377806998657093, + "learning_rate": 5.808291768013449e-06, + "loss": 0.6077, + "step": 3870 + }, + { + "epoch": 1.510928961748634, + "grad_norm": 0.5642484189096736, + "learning_rate": 5.806050633247726e-06, + "loss": 0.5905, + "step": 3871 + }, + { + "epoch": 1.511319281811085, + "grad_norm": 0.5938367853282707, + "learning_rate": 5.8038093322071756e-06, + "loss": 0.6041, + "step": 3872 + }, + { + "epoch": 1.5117096018735363, + "grad_norm": 0.5305150920615304, + "learning_rate": 5.801567865354139e-06, + "loss": 0.5417, + "step": 3873 + }, + { + "epoch": 1.5120999219359876, + "grad_norm": 0.6361040650307003, + "learning_rate": 5.799326233150992e-06, + "loss": 0.6085, + "step": 3874 + }, + { + "epoch": 1.5124902419984387, + "grad_norm": 0.5157100342043215, + "learning_rate": 5.797084436060149e-06, + "loss": 0.5866, + "step": 3875 + }, + { + "epoch": 1.5128805620608898, + "grad_norm": 0.5892326239640684, + "learning_rate": 5.794842474544055e-06, + "loss": 0.5819, + "step": 3876 + }, + { + "epoch": 1.5132708821233412, + "grad_norm": 0.5716218471104224, + "learning_rate": 5.792600349065187e-06, + "loss": 0.568, + "step": 3877 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 0.5133799948156536, + "learning_rate": 5.790358060086061e-06, + "loss": 0.5606, + "step": 3878 + }, + { + "epoch": 1.5140515222482436, + "grad_norm": 0.6104983072084288, + "learning_rate": 5.7881156080692225e-06, + "loss": 0.5816, + "step": 3879 + }, + { + "epoch": 1.5144418423106947, + "grad_norm": 0.540189479101971, + "learning_rate": 5.78587299347725e-06, + "loss": 0.584, + "step": 3880 + }, + { + "epoch": 1.514832162373146, + "grad_norm": 0.5572559692173429, + "learning_rate": 5.783630216772761e-06, + "loss": 0.5678, + "step": 3881 + }, + { + "epoch": 1.5152224824355973, + "grad_norm": 0.6280357539815457, + "learning_rate": 5.781387278418403e-06, + "loss": 0.5931, + "step": 3882 + }, + { + "epoch": 1.5156128024980484, + "grad_norm": 0.5813883756575547, + "learning_rate": 5.779144178876855e-06, + "loss": 0.5789, + "step": 3883 + }, + { + "epoch": 1.5160031225604995, + "grad_norm": 0.6546656576905278, + "learning_rate": 5.776900918610829e-06, + "loss": 0.6261, + "step": 3884 + }, + { + "epoch": 1.5163934426229508, + "grad_norm": 0.6192153843267889, + "learning_rate": 5.7746574980830776e-06, + "loss": 0.5958, + "step": 3885 + }, + { + "epoch": 1.5167837626854022, + "grad_norm": 0.6555738772051765, + "learning_rate": 5.772413917756376e-06, + "loss": 0.6326, + "step": 3886 + }, + { + "epoch": 1.5171740827478533, + "grad_norm": 0.5622937480509548, + "learning_rate": 5.770170178093542e-06, + "loss": 0.6066, + "step": 3887 + }, + { + "epoch": 1.5175644028103044, + "grad_norm": 0.5139632254929136, + "learning_rate": 5.76792627955742e-06, + "loss": 0.6264, + "step": 3888 + }, + { + "epoch": 1.5179547228727557, + "grad_norm": 0.6289583580175353, + "learning_rate": 5.765682222610887e-06, + "loss": 0.5661, + "step": 3889 + }, + { + "epoch": 1.518345042935207, + "grad_norm": 0.5378948058062414, + "learning_rate": 5.763438007716858e-06, + "loss": 0.5801, + "step": 3890 + }, + { + "epoch": 1.518735362997658, + "grad_norm": 0.5372928460685281, + "learning_rate": 5.761193635338273e-06, + "loss": 0.5741, + "step": 3891 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 0.6434038065442842, + "learning_rate": 5.758949105938112e-06, + "loss": 0.5929, + "step": 3892 + }, + { + "epoch": 1.5195160031225605, + "grad_norm": 0.5965417282057136, + "learning_rate": 5.756704419979383e-06, + "loss": 0.5981, + "step": 3893 + }, + { + "epoch": 1.5199063231850118, + "grad_norm": 0.4866147633551361, + "learning_rate": 5.754459577925127e-06, + "loss": 0.5929, + "step": 3894 + }, + { + "epoch": 1.520296643247463, + "grad_norm": 0.533619060852416, + "learning_rate": 5.752214580238419e-06, + "loss": 0.5706, + "step": 3895 + }, + { + "epoch": 1.520686963309914, + "grad_norm": 0.519368078224415, + "learning_rate": 5.749969427382363e-06, + "loss": 0.6328, + "step": 3896 + }, + { + "epoch": 1.5210772833723654, + "grad_norm": 0.5446938602871723, + "learning_rate": 5.747724119820095e-06, + "loss": 0.5962, + "step": 3897 + }, + { + "epoch": 1.5214676034348167, + "grad_norm": 0.6441622612733331, + "learning_rate": 5.745478658014789e-06, + "loss": 0.6192, + "step": 3898 + }, + { + "epoch": 1.5218579234972678, + "grad_norm": 0.5138865502260831, + "learning_rate": 5.743233042429644e-06, + "loss": 0.5942, + "step": 3899 + }, + { + "epoch": 1.5222482435597189, + "grad_norm": 0.8047293241847924, + "learning_rate": 5.7409872735278934e-06, + "loss": 0.6046, + "step": 3900 + }, + { + "epoch": 1.5226385636221702, + "grad_norm": 0.6558946845688753, + "learning_rate": 5.738741351772802e-06, + "loss": 0.6139, + "step": 3901 + }, + { + "epoch": 1.5230288836846215, + "grad_norm": 0.5284979823730843, + "learning_rate": 5.736495277627665e-06, + "loss": 0.5748, + "step": 3902 + }, + { + "epoch": 1.5234192037470726, + "grad_norm": 0.5238760564822752, + "learning_rate": 5.734249051555811e-06, + "loss": 0.6077, + "step": 3903 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.6050313514715712, + "learning_rate": 5.732002674020601e-06, + "loss": 0.6017, + "step": 3904 + }, + { + "epoch": 1.524199843871975, + "grad_norm": 0.4701543424692831, + "learning_rate": 5.729756145485422e-06, + "loss": 0.6012, + "step": 3905 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.6864746140450865, + "learning_rate": 5.727509466413697e-06, + "loss": 0.6196, + "step": 3906 + }, + { + "epoch": 1.5249804839968775, + "grad_norm": 0.61192443634151, + "learning_rate": 5.725262637268879e-06, + "loss": 0.6073, + "step": 3907 + }, + { + "epoch": 1.5253708040593286, + "grad_norm": 0.5228753712664773, + "learning_rate": 5.7230156585144505e-06, + "loss": 0.5896, + "step": 3908 + }, + { + "epoch": 1.5257611241217799, + "grad_norm": 0.6449071534139555, + "learning_rate": 5.7207685306139264e-06, + "loss": 0.6204, + "step": 3909 + }, + { + "epoch": 1.5261514441842312, + "grad_norm": 0.6119435082701729, + "learning_rate": 5.7185212540308496e-06, + "loss": 0.6013, + "step": 3910 + }, + { + "epoch": 1.5265417642466823, + "grad_norm": 0.5755344527223407, + "learning_rate": 5.7162738292288e-06, + "loss": 0.6287, + "step": 3911 + }, + { + "epoch": 1.5269320843091334, + "grad_norm": 0.5917968690048527, + "learning_rate": 5.71402625667138e-06, + "loss": 0.591, + "step": 3912 + }, + { + "epoch": 1.5273224043715847, + "grad_norm": 0.6006383827442419, + "learning_rate": 5.71177853682223e-06, + "loss": 0.5908, + "step": 3913 + }, + { + "epoch": 1.527712724434036, + "grad_norm": 0.7703779360044676, + "learning_rate": 5.7095306701450135e-06, + "loss": 0.5972, + "step": 3914 + }, + { + "epoch": 1.5281030444964872, + "grad_norm": 0.5342638338903166, + "learning_rate": 5.70728265710343e-06, + "loss": 0.636, + "step": 3915 + }, + { + "epoch": 1.5284933645589383, + "grad_norm": 0.6115619448630416, + "learning_rate": 5.705034498161207e-06, + "loss": 0.6331, + "step": 3916 + }, + { + "epoch": 1.5288836846213896, + "grad_norm": 0.5740487561407881, + "learning_rate": 5.702786193782103e-06, + "loss": 0.5572, + "step": 3917 + }, + { + "epoch": 1.529274004683841, + "grad_norm": 0.6025151659984013, + "learning_rate": 5.7005377444299035e-06, + "loss": 0.6036, + "step": 3918 + }, + { + "epoch": 1.529664324746292, + "grad_norm": 0.5810925323939582, + "learning_rate": 5.698289150568429e-06, + "loss": 0.6024, + "step": 3919 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.5598653338919098, + "learning_rate": 5.696040412661526e-06, + "loss": 0.5958, + "step": 3920 + }, + { + "epoch": 1.5304449648711944, + "grad_norm": 0.5686876019652932, + "learning_rate": 5.6937915311730695e-06, + "loss": 0.6101, + "step": 3921 + }, + { + "epoch": 1.5308352849336457, + "grad_norm": 0.6288677394140496, + "learning_rate": 5.6915425065669694e-06, + "loss": 0.5922, + "step": 3922 + }, + { + "epoch": 1.5312256049960968, + "grad_norm": 0.6338537335890871, + "learning_rate": 5.689293339307162e-06, + "loss": 0.5933, + "step": 3923 + }, + { + "epoch": 1.531615925058548, + "grad_norm": 0.5509459015045831, + "learning_rate": 5.68704402985761e-06, + "loss": 0.5999, + "step": 3924 + }, + { + "epoch": 1.5320062451209993, + "grad_norm": 0.6233607363760911, + "learning_rate": 5.68479457868231e-06, + "loss": 0.5941, + "step": 3925 + }, + { + "epoch": 1.5323965651834506, + "grad_norm": 0.5439689749920326, + "learning_rate": 5.682544986245289e-06, + "loss": 0.6042, + "step": 3926 + }, + { + "epoch": 1.5327868852459017, + "grad_norm": 0.6123245354448804, + "learning_rate": 5.680295253010596e-06, + "loss": 0.5911, + "step": 3927 + }, + { + "epoch": 1.5331772053083528, + "grad_norm": 0.5843384715320022, + "learning_rate": 5.6780453794423164e-06, + "loss": 0.5817, + "step": 3928 + }, + { + "epoch": 1.533567525370804, + "grad_norm": 0.6140775273008628, + "learning_rate": 5.675795366004561e-06, + "loss": 0.6221, + "step": 3929 + }, + { + "epoch": 1.5339578454332554, + "grad_norm": 0.5246470540504269, + "learning_rate": 5.673545213161469e-06, + "loss": 0.6184, + "step": 3930 + }, + { + "epoch": 1.5343481654957065, + "grad_norm": 0.5906269923386475, + "learning_rate": 5.6712949213772125e-06, + "loss": 0.5869, + "step": 3931 + }, + { + "epoch": 1.5347384855581576, + "grad_norm": 0.5896205729323316, + "learning_rate": 5.669044491115986e-06, + "loss": 0.5941, + "step": 3932 + }, + { + "epoch": 1.535128805620609, + "grad_norm": 0.638034517706058, + "learning_rate": 5.666793922842016e-06, + "loss": 0.6038, + "step": 3933 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 0.657363982954416, + "learning_rate": 5.664543217019561e-06, + "loss": 0.6194, + "step": 3934 + }, + { + "epoch": 1.5359094457455114, + "grad_norm": 0.5895144110243511, + "learning_rate": 5.662292374112901e-06, + "loss": 0.6008, + "step": 3935 + }, + { + "epoch": 1.5362997658079625, + "grad_norm": 0.5012180500231739, + "learning_rate": 5.660041394586347e-06, + "loss": 0.6011, + "step": 3936 + }, + { + "epoch": 1.5366900858704138, + "grad_norm": 0.5894055019436882, + "learning_rate": 5.657790278904239e-06, + "loss": 0.5903, + "step": 3937 + }, + { + "epoch": 1.537080405932865, + "grad_norm": 0.5918174575733399, + "learning_rate": 5.655539027530947e-06, + "loss": 0.6197, + "step": 3938 + }, + { + "epoch": 1.5374707259953162, + "grad_norm": 0.5706993361728232, + "learning_rate": 5.6532876409308644e-06, + "loss": 0.6033, + "step": 3939 + }, + { + "epoch": 1.5378610460577673, + "grad_norm": 0.6581075083766713, + "learning_rate": 5.651036119568416e-06, + "loss": 0.6136, + "step": 3940 + }, + { + "epoch": 1.5382513661202186, + "grad_norm": 0.5921727562506817, + "learning_rate": 5.64878446390805e-06, + "loss": 0.5968, + "step": 3941 + }, + { + "epoch": 1.53864168618267, + "grad_norm": 0.6111962062971082, + "learning_rate": 5.64653267441425e-06, + "loss": 0.5841, + "step": 3942 + }, + { + "epoch": 1.539032006245121, + "grad_norm": 0.6618518850483246, + "learning_rate": 5.64428075155152e-06, + "loss": 0.5871, + "step": 3943 + }, + { + "epoch": 1.5394223263075721, + "grad_norm": 0.5306142104052024, + "learning_rate": 5.6420286957843965e-06, + "loss": 0.6317, + "step": 3944 + }, + { + "epoch": 1.5398126463700235, + "grad_norm": 0.6560391630855769, + "learning_rate": 5.639776507577437e-06, + "loss": 0.6213, + "step": 3945 + }, + { + "epoch": 1.5402029664324748, + "grad_norm": 0.6701295765030284, + "learning_rate": 5.637524187395234e-06, + "loss": 0.5477, + "step": 3946 + }, + { + "epoch": 1.5405932864949259, + "grad_norm": 0.5536636206159461, + "learning_rate": 5.635271735702403e-06, + "loss": 0.5975, + "step": 3947 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.688781801956459, + "learning_rate": 5.6330191529635865e-06, + "loss": 0.5869, + "step": 3948 + }, + { + "epoch": 1.5413739266198283, + "grad_norm": 0.6347945240345919, + "learning_rate": 5.630766439643455e-06, + "loss": 0.5956, + "step": 3949 + }, + { + "epoch": 1.5417642466822796, + "grad_norm": 0.5690189120886868, + "learning_rate": 5.628513596206706e-06, + "loss": 0.5704, + "step": 3950 + }, + { + "epoch": 1.5421545667447307, + "grad_norm": 0.6480008609128335, + "learning_rate": 5.6262606231180635e-06, + "loss": 0.6375, + "step": 3951 + }, + { + "epoch": 1.5425448868071818, + "grad_norm": 0.6918701103498487, + "learning_rate": 5.624007520842279e-06, + "loss": 0.606, + "step": 3952 + }, + { + "epoch": 1.5429352068696331, + "grad_norm": 0.5377823193616348, + "learning_rate": 5.62175428984413e-06, + "loss": 0.6004, + "step": 3953 + }, + { + "epoch": 1.5433255269320845, + "grad_norm": 0.6821733071717542, + "learning_rate": 5.6195009305884185e-06, + "loss": 0.5922, + "step": 3954 + }, + { + "epoch": 1.5437158469945356, + "grad_norm": 0.6650576066511171, + "learning_rate": 5.617247443539978e-06, + "loss": 0.586, + "step": 3955 + }, + { + "epoch": 1.5441061670569867, + "grad_norm": 0.6009711152720938, + "learning_rate": 5.614993829163663e-06, + "loss": 0.5838, + "step": 3956 + }, + { + "epoch": 1.544496487119438, + "grad_norm": 0.6220552764334445, + "learning_rate": 5.6127400879243585e-06, + "loss": 0.5819, + "step": 3957 + }, + { + "epoch": 1.5448868071818893, + "grad_norm": 0.5033003762062037, + "learning_rate": 5.610486220286972e-06, + "loss": 0.5875, + "step": 3958 + }, + { + "epoch": 1.5452771272443404, + "grad_norm": 0.6850666618297072, + "learning_rate": 5.608232226716439e-06, + "loss": 0.5945, + "step": 3959 + }, + { + "epoch": 1.5456674473067915, + "grad_norm": 0.6160161496920381, + "learning_rate": 5.605978107677722e-06, + "loss": 0.5983, + "step": 3960 + }, + { + "epoch": 1.5460577673692428, + "grad_norm": 0.5982995956952398, + "learning_rate": 5.6037238636358084e-06, + "loss": 0.5769, + "step": 3961 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.6920417231247634, + "learning_rate": 5.60146949505571e-06, + "loss": 0.6029, + "step": 3962 + }, + { + "epoch": 1.5468384074941453, + "grad_norm": 0.6964001183805971, + "learning_rate": 5.599215002402466e-06, + "loss": 0.5746, + "step": 3963 + }, + { + "epoch": 1.5472287275565964, + "grad_norm": 0.5663343511467903, + "learning_rate": 5.59696038614114e-06, + "loss": 0.5879, + "step": 3964 + }, + { + "epoch": 1.5476190476190477, + "grad_norm": 0.6012212292915015, + "learning_rate": 5.594705646736824e-06, + "loss": 0.5669, + "step": 3965 + }, + { + "epoch": 1.548009367681499, + "grad_norm": 0.5499718322976243, + "learning_rate": 5.592450784654632e-06, + "loss": 0.5731, + "step": 3966 + }, + { + "epoch": 1.54839968774395, + "grad_norm": 0.5880387465180908, + "learning_rate": 5.590195800359704e-06, + "loss": 0.5885, + "step": 3967 + }, + { + "epoch": 1.5487900078064012, + "grad_norm": 0.5836488638544456, + "learning_rate": 5.587940694317205e-06, + "loss": 0.5647, + "step": 3968 + }, + { + "epoch": 1.5491803278688525, + "grad_norm": 0.7173354416214659, + "learning_rate": 5.585685466992328e-06, + "loss": 0.5973, + "step": 3969 + }, + { + "epoch": 1.5495706479313038, + "grad_norm": 0.6082860314623978, + "learning_rate": 5.583430118850288e-06, + "loss": 0.608, + "step": 3970 + }, + { + "epoch": 1.549960967993755, + "grad_norm": 0.5087095447339471, + "learning_rate": 5.581174650356326e-06, + "loss": 0.5876, + "step": 3971 + }, + { + "epoch": 1.550351288056206, + "grad_norm": 0.6983299301706983, + "learning_rate": 5.578919061975707e-06, + "loss": 0.5868, + "step": 3972 + }, + { + "epoch": 1.5507416081186571, + "grad_norm": 0.6042428199340583, + "learning_rate": 5.576663354173721e-06, + "loss": 0.5867, + "step": 3973 + }, + { + "epoch": 1.5511319281811085, + "grad_norm": 0.5926654084993047, + "learning_rate": 5.5744075274156825e-06, + "loss": 0.6291, + "step": 3974 + }, + { + "epoch": 1.5515222482435598, + "grad_norm": 0.5817789634496682, + "learning_rate": 5.572151582166934e-06, + "loss": 0.6281, + "step": 3975 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.4873695232203616, + "learning_rate": 5.569895518892836e-06, + "loss": 0.6076, + "step": 3976 + }, + { + "epoch": 1.552302888368462, + "grad_norm": 0.56216380461261, + "learning_rate": 5.567639338058779e-06, + "loss": 0.5732, + "step": 3977 + }, + { + "epoch": 1.5526932084309133, + "grad_norm": 0.5978932798630681, + "learning_rate": 5.565383040130173e-06, + "loss": 0.5858, + "step": 3978 + }, + { + "epoch": 1.5530835284933646, + "grad_norm": 0.5776905608380382, + "learning_rate": 5.563126625572459e-06, + "loss": 0.5732, + "step": 3979 + }, + { + "epoch": 1.5534738485558157, + "grad_norm": 0.5653906559759272, + "learning_rate": 5.560870094851094e-06, + "loss": 0.5566, + "step": 3980 + }, + { + "epoch": 1.5538641686182668, + "grad_norm": 0.5417434707587222, + "learning_rate": 5.558613448431564e-06, + "loss": 0.5894, + "step": 3981 + }, + { + "epoch": 1.5542544886807181, + "grad_norm": 0.547130770043846, + "learning_rate": 5.556356686779378e-06, + "loss": 0.5787, + "step": 3982 + }, + { + "epoch": 1.5546448087431695, + "grad_norm": 0.5450347767513569, + "learning_rate": 5.5540998103600685e-06, + "loss": 0.6462, + "step": 3983 + }, + { + "epoch": 1.5550351288056206, + "grad_norm": 0.5473429967905106, + "learning_rate": 5.55184281963919e-06, + "loss": 0.6064, + "step": 3984 + }, + { + "epoch": 1.5554254488680717, + "grad_norm": 0.524759242313439, + "learning_rate": 5.549585715082323e-06, + "loss": 0.6091, + "step": 3985 + }, + { + "epoch": 1.555815768930523, + "grad_norm": 0.5753262023997092, + "learning_rate": 5.547328497155071e-06, + "loss": 0.6112, + "step": 3986 + }, + { + "epoch": 1.5562060889929743, + "grad_norm": 0.5506084524652957, + "learning_rate": 5.545071166323062e-06, + "loss": 0.5843, + "step": 3987 + }, + { + "epoch": 1.5565964090554254, + "grad_norm": 0.5778570315097146, + "learning_rate": 5.542813723051942e-06, + "loss": 0.6064, + "step": 3988 + }, + { + "epoch": 1.5569867291178765, + "grad_norm": 0.59007710513793, + "learning_rate": 5.5405561678073874e-06, + "loss": 0.5752, + "step": 3989 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.595178425898181, + "learning_rate": 5.5382985010550905e-06, + "loss": 0.6094, + "step": 3990 + }, + { + "epoch": 1.5577673692427791, + "grad_norm": 0.5474829017305036, + "learning_rate": 5.536040723260774e-06, + "loss": 0.5555, + "step": 3991 + }, + { + "epoch": 1.5581576893052302, + "grad_norm": 0.5394201671716585, + "learning_rate": 5.5337828348901804e-06, + "loss": 0.5987, + "step": 3992 + }, + { + "epoch": 1.5585480093676813, + "grad_norm": 0.5278841583715722, + "learning_rate": 5.5315248364090705e-06, + "loss": 0.6147, + "step": 3993 + }, + { + "epoch": 1.5589383294301327, + "grad_norm": 0.6454186997164877, + "learning_rate": 5.529266728283234e-06, + "loss": 0.6083, + "step": 3994 + }, + { + "epoch": 1.559328649492584, + "grad_norm": 0.580676569622907, + "learning_rate": 5.527008510978481e-06, + "loss": 0.6228, + "step": 3995 + }, + { + "epoch": 1.559718969555035, + "grad_norm": 0.5413597736273646, + "learning_rate": 5.524750184960644e-06, + "loss": 0.5786, + "step": 3996 + }, + { + "epoch": 1.5601092896174862, + "grad_norm": 0.6025154559643144, + "learning_rate": 5.52249175069558e-06, + "loss": 0.6307, + "step": 3997 + }, + { + "epoch": 1.5604996096799375, + "grad_norm": 0.6370297696298987, + "learning_rate": 5.520233208649163e-06, + "loss": 0.5779, + "step": 3998 + }, + { + "epoch": 1.5608899297423888, + "grad_norm": 0.49477777496902814, + "learning_rate": 5.517974559287293e-06, + "loss": 0.5984, + "step": 3999 + }, + { + "epoch": 1.56128024980484, + "grad_norm": 0.6164560529266764, + "learning_rate": 5.515715803075895e-06, + "loss": 0.6093, + "step": 4000 + }, + { + "epoch": 1.561670569867291, + "grad_norm": 0.5556829327191337, + "learning_rate": 5.513456940480911e-06, + "loss": 0.5889, + "step": 4001 + }, + { + "epoch": 1.5620608899297423, + "grad_norm": 0.7889089671773689, + "learning_rate": 5.511197971968306e-06, + "loss": 0.6119, + "step": 4002 + }, + { + "epoch": 1.5624512099921937, + "grad_norm": 0.5352527265399651, + "learning_rate": 5.508938898004069e-06, + "loss": 0.6076, + "step": 4003 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.5732931791312132, + "learning_rate": 5.5066797190542075e-06, + "loss": 0.5965, + "step": 4004 + }, + { + "epoch": 1.5632318501170959, + "grad_norm": 0.6389163533507598, + "learning_rate": 5.504420435584753e-06, + "loss": 0.592, + "step": 4005 + }, + { + "epoch": 1.5636221701795472, + "grad_norm": 0.6074970261179259, + "learning_rate": 5.50216104806176e-06, + "loss": 0.5874, + "step": 4006 + }, + { + "epoch": 1.5640124902419985, + "grad_norm": 0.5859565211561536, + "learning_rate": 5.499901556951302e-06, + "loss": 0.5881, + "step": 4007 + }, + { + "epoch": 1.5644028103044496, + "grad_norm": 0.6297199614787431, + "learning_rate": 5.497641962719471e-06, + "loss": 0.616, + "step": 4008 + }, + { + "epoch": 1.5647931303669007, + "grad_norm": 0.5519296746787621, + "learning_rate": 5.495382265832387e-06, + "loss": 0.5962, + "step": 4009 + }, + { + "epoch": 1.565183450429352, + "grad_norm": 0.5467924593029515, + "learning_rate": 5.493122466756188e-06, + "loss": 0.6179, + "step": 4010 + }, + { + "epoch": 1.5655737704918034, + "grad_norm": 0.6583851772169099, + "learning_rate": 5.490862565957031e-06, + "loss": 0.6033, + "step": 4011 + }, + { + "epoch": 1.5659640905542545, + "grad_norm": 0.5124384969194934, + "learning_rate": 5.488602563901098e-06, + "loss": 0.5314, + "step": 4012 + }, + { + "epoch": 1.5663544106167056, + "grad_norm": 0.49694768360523023, + "learning_rate": 5.486342461054588e-06, + "loss": 0.6102, + "step": 4013 + }, + { + "epoch": 1.5667447306791569, + "grad_norm": 0.6557438640345249, + "learning_rate": 5.484082257883724e-06, + "loss": 0.6051, + "step": 4014 + }, + { + "epoch": 1.5671350507416082, + "grad_norm": 0.47236339462582744, + "learning_rate": 5.481821954854747e-06, + "loss": 0.6161, + "step": 4015 + }, + { + "epoch": 1.5675253708040593, + "grad_norm": 0.5772383482824849, + "learning_rate": 5.479561552433921e-06, + "loss": 0.5834, + "step": 4016 + }, + { + "epoch": 1.5679156908665104, + "grad_norm": 0.5216422398118779, + "learning_rate": 5.477301051087528e-06, + "loss": 0.5616, + "step": 4017 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.5126662751838545, + "learning_rate": 5.475040451281874e-06, + "loss": 0.5733, + "step": 4018 + }, + { + "epoch": 1.568696330991413, + "grad_norm": 0.5724208017549299, + "learning_rate": 5.4727797534832815e-06, + "loss": 0.6017, + "step": 4019 + }, + { + "epoch": 1.5690866510538641, + "grad_norm": 0.5692773306779784, + "learning_rate": 5.470518958158095e-06, + "loss": 0.5796, + "step": 4020 + }, + { + "epoch": 1.5694769711163152, + "grad_norm": 0.5951966829025397, + "learning_rate": 5.468258065772679e-06, + "loss": 0.5864, + "step": 4021 + }, + { + "epoch": 1.5698672911787666, + "grad_norm": 0.5828424119752116, + "learning_rate": 5.465997076793419e-06, + "loss": 0.6029, + "step": 4022 + }, + { + "epoch": 1.5702576112412179, + "grad_norm": 0.5157663345483665, + "learning_rate": 5.463735991686718e-06, + "loss": 0.576, + "step": 4023 + }, + { + "epoch": 1.570647931303669, + "grad_norm": 0.5456394257986802, + "learning_rate": 5.461474810919001e-06, + "loss": 0.6039, + "step": 4024 + }, + { + "epoch": 1.57103825136612, + "grad_norm": 0.5276155678405111, + "learning_rate": 5.45921353495671e-06, + "loss": 0.5685, + "step": 4025 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.5235146415127986, + "learning_rate": 5.4569521642663115e-06, + "loss": 0.6283, + "step": 4026 + }, + { + "epoch": 1.5718188914910227, + "grad_norm": 0.529559857111214, + "learning_rate": 5.454690699314286e-06, + "loss": 0.6071, + "step": 4027 + }, + { + "epoch": 1.5722092115534738, + "grad_norm": 0.671352295280833, + "learning_rate": 5.45242914056714e-06, + "loss": 0.6265, + "step": 4028 + }, + { + "epoch": 1.572599531615925, + "grad_norm": 0.5854740195540183, + "learning_rate": 5.4501674884913915e-06, + "loss": 0.5862, + "step": 4029 + }, + { + "epoch": 1.5729898516783762, + "grad_norm": 0.5651175262992241, + "learning_rate": 5.447905743553583e-06, + "loss": 0.612, + "step": 4030 + }, + { + "epoch": 1.5733801717408276, + "grad_norm": 0.6327855573189489, + "learning_rate": 5.445643906220275e-06, + "loss": 0.5892, + "step": 4031 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.5462311720821345, + "learning_rate": 5.443381976958048e-06, + "loss": 0.6049, + "step": 4032 + }, + { + "epoch": 1.5741608118657298, + "grad_norm": 0.647366848052416, + "learning_rate": 5.4411199562334985e-06, + "loss": 0.6014, + "step": 4033 + }, + { + "epoch": 1.574551131928181, + "grad_norm": 0.602445794105454, + "learning_rate": 5.438857844513243e-06, + "loss": 0.5762, + "step": 4034 + }, + { + "epoch": 1.5749414519906324, + "grad_norm": 0.5326305132166874, + "learning_rate": 5.436595642263921e-06, + "loss": 0.5739, + "step": 4035 + }, + { + "epoch": 1.5753317720530835, + "grad_norm": 0.6371852021883037, + "learning_rate": 5.434333349952183e-06, + "loss": 0.5909, + "step": 4036 + }, + { + "epoch": 1.5757220921155346, + "grad_norm": 0.6806210126379858, + "learning_rate": 5.432070968044707e-06, + "loss": 0.578, + "step": 4037 + }, + { + "epoch": 1.576112412177986, + "grad_norm": 0.5469617805271585, + "learning_rate": 5.42980849700818e-06, + "loss": 0.6427, + "step": 4038 + }, + { + "epoch": 1.5765027322404372, + "grad_norm": 0.5862963388317293, + "learning_rate": 5.427545937309315e-06, + "loss": 0.5599, + "step": 4039 + }, + { + "epoch": 1.5768930523028883, + "grad_norm": 0.6421021900305298, + "learning_rate": 5.425283289414838e-06, + "loss": 0.613, + "step": 4040 + }, + { + "epoch": 1.5772833723653394, + "grad_norm": 0.5960911837639289, + "learning_rate": 5.423020553791498e-06, + "loss": 0.5962, + "step": 4041 + }, + { + "epoch": 1.5776736924277908, + "grad_norm": 0.5231854651766273, + "learning_rate": 5.420757730906059e-06, + "loss": 0.6263, + "step": 4042 + }, + { + "epoch": 1.578064012490242, + "grad_norm": 0.6671753112443294, + "learning_rate": 5.4184948212253045e-06, + "loss": 0.604, + "step": 4043 + }, + { + "epoch": 1.5784543325526932, + "grad_norm": 0.5731454155544736, + "learning_rate": 5.416231825216032e-06, + "loss": 0.5944, + "step": 4044 + }, + { + "epoch": 1.5788446526151443, + "grad_norm": 0.4771334360993246, + "learning_rate": 5.413968743345064e-06, + "loss": 0.6025, + "step": 4045 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.5154046762293552, + "learning_rate": 5.411705576079233e-06, + "loss": 0.6018, + "step": 4046 + }, + { + "epoch": 1.579625292740047, + "grad_norm": 0.4897781311524188, + "learning_rate": 5.409442323885396e-06, + "loss": 0.6148, + "step": 4047 + }, + { + "epoch": 1.580015612802498, + "grad_norm": 0.5300343776900004, + "learning_rate": 5.4071789872304215e-06, + "loss": 0.6141, + "step": 4048 + }, + { + "epoch": 1.5804059328649491, + "grad_norm": 0.5600396750360762, + "learning_rate": 5.404915566581201e-06, + "loss": 0.5984, + "step": 4049 + }, + { + "epoch": 1.5807962529274004, + "grad_norm": 0.6076135548139558, + "learning_rate": 5.402652062404638e-06, + "loss": 0.5767, + "step": 4050 + }, + { + "epoch": 1.5811865729898518, + "grad_norm": 0.523960770884025, + "learning_rate": 5.400388475167655e-06, + "loss": 0.5884, + "step": 4051 + }, + { + "epoch": 1.5815768930523029, + "grad_norm": 0.5576790846526509, + "learning_rate": 5.398124805337197e-06, + "loss": 0.6233, + "step": 4052 + }, + { + "epoch": 1.581967213114754, + "grad_norm": 0.5941961915378983, + "learning_rate": 5.395861053380216e-06, + "loss": 0.6198, + "step": 4053 + }, + { + "epoch": 1.5823575331772053, + "grad_norm": 0.5591650580591905, + "learning_rate": 5.39359721976369e-06, + "loss": 0.5936, + "step": 4054 + }, + { + "epoch": 1.5827478532396566, + "grad_norm": 0.5856613602260174, + "learning_rate": 5.391333304954608e-06, + "loss": 0.6019, + "step": 4055 + }, + { + "epoch": 1.5831381733021077, + "grad_norm": 0.5544166406242972, + "learning_rate": 5.389069309419979e-06, + "loss": 0.5761, + "step": 4056 + }, + { + "epoch": 1.5835284933645588, + "grad_norm": 0.541048489394923, + "learning_rate": 5.386805233626827e-06, + "loss": 0.6201, + "step": 4057 + }, + { + "epoch": 1.5839188134270101, + "grad_norm": 0.6100720329371896, + "learning_rate": 5.384541078042196e-06, + "loss": 0.5884, + "step": 4058 + }, + { + "epoch": 1.5843091334894615, + "grad_norm": 0.600947916239602, + "learning_rate": 5.382276843133138e-06, + "loss": 0.5709, + "step": 4059 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.5468484890050453, + "learning_rate": 5.380012529366732e-06, + "loss": 0.603, + "step": 4060 + }, + { + "epoch": 1.5850897736143637, + "grad_norm": 0.5773792070618078, + "learning_rate": 5.377748137210066e-06, + "loss": 0.6085, + "step": 4061 + }, + { + "epoch": 1.585480093676815, + "grad_norm": 0.5951110931341175, + "learning_rate": 5.375483667130247e-06, + "loss": 0.6216, + "step": 4062 + }, + { + "epoch": 1.5858704137392663, + "grad_norm": 0.5467260115242991, + "learning_rate": 5.373219119594397e-06, + "loss": 0.62, + "step": 4063 + }, + { + "epoch": 1.5862607338017174, + "grad_norm": 0.5092798578593374, + "learning_rate": 5.3709544950696554e-06, + "loss": 0.647, + "step": 4064 + }, + { + "epoch": 1.5866510538641685, + "grad_norm": 0.5172043810433692, + "learning_rate": 5.368689794023176e-06, + "loss": 0.5956, + "step": 4065 + }, + { + "epoch": 1.5870413739266198, + "grad_norm": 0.5698842030751754, + "learning_rate": 5.3664250169221284e-06, + "loss": 0.5805, + "step": 4066 + }, + { + "epoch": 1.5874316939890711, + "grad_norm": 0.5732724073789807, + "learning_rate": 5.364160164233698e-06, + "loss": 0.5858, + "step": 4067 + }, + { + "epoch": 1.5878220140515222, + "grad_norm": 0.5364138031786311, + "learning_rate": 5.36189523642509e-06, + "loss": 0.6597, + "step": 4068 + }, + { + "epoch": 1.5882123341139733, + "grad_norm": 0.5380683903679866, + "learning_rate": 5.359630233963515e-06, + "loss": 0.6311, + "step": 4069 + }, + { + "epoch": 1.5886026541764247, + "grad_norm": 0.5307407457749478, + "learning_rate": 5.357365157316211e-06, + "loss": 0.595, + "step": 4070 + }, + { + "epoch": 1.588992974238876, + "grad_norm": 0.5986507546346617, + "learning_rate": 5.355100006950422e-06, + "loss": 0.6133, + "step": 4071 + }, + { + "epoch": 1.589383294301327, + "grad_norm": 0.5393048269568845, + "learning_rate": 5.352834783333414e-06, + "loss": 0.596, + "step": 4072 + }, + { + "epoch": 1.5897736143637782, + "grad_norm": 0.5839205558242939, + "learning_rate": 5.350569486932461e-06, + "loss": 0.6026, + "step": 4073 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.5221017127022302, + "learning_rate": 5.3483041182148575e-06, + "loss": 0.5971, + "step": 4074 + }, + { + "epoch": 1.5905542544886808, + "grad_norm": 0.5772796113688058, + "learning_rate": 5.346038677647913e-06, + "loss": 0.6274, + "step": 4075 + }, + { + "epoch": 1.590944574551132, + "grad_norm": 0.6090102282336624, + "learning_rate": 5.34377316569895e-06, + "loss": 0.6036, + "step": 4076 + }, + { + "epoch": 1.591334894613583, + "grad_norm": 0.5335004952874073, + "learning_rate": 5.3415075828353034e-06, + "loss": 0.597, + "step": 4077 + }, + { + "epoch": 1.5917252146760343, + "grad_norm": 0.46894682044958025, + "learning_rate": 5.339241929524328e-06, + "loss": 0.5969, + "step": 4078 + }, + { + "epoch": 1.5921155347384857, + "grad_norm": 0.5238285355454492, + "learning_rate": 5.336976206233388e-06, + "loss": 0.5911, + "step": 4079 + }, + { + "epoch": 1.5925058548009368, + "grad_norm": 0.6883875481650708, + "learning_rate": 5.334710413429867e-06, + "loss": 0.6293, + "step": 4080 + }, + { + "epoch": 1.5928961748633879, + "grad_norm": 0.49432471677039774, + "learning_rate": 5.332444551581157e-06, + "loss": 0.586, + "step": 4081 + }, + { + "epoch": 1.5932864949258392, + "grad_norm": 0.5322765140877861, + "learning_rate": 5.330178621154671e-06, + "loss": 0.5866, + "step": 4082 + }, + { + "epoch": 1.5936768149882905, + "grad_norm": 0.5497667793100334, + "learning_rate": 5.3279126226178314e-06, + "loss": 0.6213, + "step": 4083 + }, + { + "epoch": 1.5940671350507416, + "grad_norm": 0.5227824763368737, + "learning_rate": 5.3256465564380755e-06, + "loss": 0.5724, + "step": 4084 + }, + { + "epoch": 1.5944574551131927, + "grad_norm": 0.5995372276743436, + "learning_rate": 5.3233804230828555e-06, + "loss": 0.5837, + "step": 4085 + }, + { + "epoch": 1.594847775175644, + "grad_norm": 0.6118970626202713, + "learning_rate": 5.321114223019635e-06, + "loss": 0.5865, + "step": 4086 + }, + { + "epoch": 1.5952380952380953, + "grad_norm": 0.5303246206251468, + "learning_rate": 5.318847956715897e-06, + "loss": 0.6181, + "step": 4087 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.6051042355612033, + "learning_rate": 5.316581624639132e-06, + "loss": 0.6161, + "step": 4088 + }, + { + "epoch": 1.5960187353629975, + "grad_norm": 0.5536013201710144, + "learning_rate": 5.314315227256849e-06, + "loss": 0.608, + "step": 4089 + }, + { + "epoch": 1.5964090554254489, + "grad_norm": 0.5838655508984462, + "learning_rate": 5.312048765036564e-06, + "loss": 0.6177, + "step": 4090 + }, + { + "epoch": 1.5967993754879002, + "grad_norm": 0.5385905928706498, + "learning_rate": 5.3097822384458135e-06, + "loss": 0.6122, + "step": 4091 + }, + { + "epoch": 1.5971896955503513, + "grad_norm": 0.5094345900929523, + "learning_rate": 5.307515647952143e-06, + "loss": 0.6168, + "step": 4092 + }, + { + "epoch": 1.5975800156128024, + "grad_norm": 0.5977814083524605, + "learning_rate": 5.305248994023113e-06, + "loss": 0.594, + "step": 4093 + }, + { + "epoch": 1.5979703356752537, + "grad_norm": 0.6415093470148604, + "learning_rate": 5.3029822771262976e-06, + "loss": 0.5831, + "step": 4094 + }, + { + "epoch": 1.598360655737705, + "grad_norm": 0.5281499781728157, + "learning_rate": 5.300715497729281e-06, + "loss": 0.6149, + "step": 4095 + }, + { + "epoch": 1.5987509758001561, + "grad_norm": 0.5592887256896552, + "learning_rate": 5.298448656299662e-06, + "loss": 0.5934, + "step": 4096 + }, + { + "epoch": 1.5991412958626072, + "grad_norm": 0.6701892967464846, + "learning_rate": 5.296181753305052e-06, + "loss": 0.6179, + "step": 4097 + }, + { + "epoch": 1.5995316159250585, + "grad_norm": 0.6303249808988206, + "learning_rate": 5.293914789213078e-06, + "loss": 0.606, + "step": 4098 + }, + { + "epoch": 1.5999219359875099, + "grad_norm": 0.5770769887846664, + "learning_rate": 5.291647764491374e-06, + "loss": 0.6022, + "step": 4099 + }, + { + "epoch": 1.600312256049961, + "grad_norm": 0.6489597879994786, + "learning_rate": 5.289380679607592e-06, + "loss": 0.6122, + "step": 4100 + }, + { + "epoch": 1.600702576112412, + "grad_norm": 0.718926400432554, + "learning_rate": 5.2871135350293914e-06, + "loss": 0.6497, + "step": 4101 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.559007012543396, + "learning_rate": 5.284846331224448e-06, + "loss": 0.5979, + "step": 4102 + }, + { + "epoch": 1.6014832162373147, + "grad_norm": 0.5595360581215122, + "learning_rate": 5.282579068660448e-06, + "loss": 0.6192, + "step": 4103 + }, + { + "epoch": 1.6018735362997658, + "grad_norm": 0.6125302995880773, + "learning_rate": 5.280311747805088e-06, + "loss": 0.606, + "step": 4104 + }, + { + "epoch": 1.602263856362217, + "grad_norm": 0.7153383676458832, + "learning_rate": 5.2780443691260805e-06, + "loss": 0.5777, + "step": 4105 + }, + { + "epoch": 1.6026541764246682, + "grad_norm": 0.5941951887453523, + "learning_rate": 5.275776933091148e-06, + "loss": 0.6096, + "step": 4106 + }, + { + "epoch": 1.6030444964871196, + "grad_norm": 0.5612701128281987, + "learning_rate": 5.273509440168025e-06, + "loss": 0.602, + "step": 4107 + }, + { + "epoch": 1.6034348165495707, + "grad_norm": 0.7305934024464684, + "learning_rate": 5.271241890824457e-06, + "loss": 0.598, + "step": 4108 + }, + { + "epoch": 1.6038251366120218, + "grad_norm": 0.6929705279748594, + "learning_rate": 5.268974285528201e-06, + "loss": 0.5941, + "step": 4109 + }, + { + "epoch": 1.604215456674473, + "grad_norm": 0.5699334201569561, + "learning_rate": 5.2667066247470265e-06, + "loss": 0.5852, + "step": 4110 + }, + { + "epoch": 1.6046057767369244, + "grad_norm": 0.8080066966201118, + "learning_rate": 5.264438908948717e-06, + "loss": 0.6141, + "step": 4111 + }, + { + "epoch": 1.6049960967993755, + "grad_norm": 0.5275622600147507, + "learning_rate": 5.26217113860106e-06, + "loss": 0.6274, + "step": 4112 + }, + { + "epoch": 1.6053864168618266, + "grad_norm": 0.6599509607915125, + "learning_rate": 5.2599033141718615e-06, + "loss": 0.5811, + "step": 4113 + }, + { + "epoch": 1.605776736924278, + "grad_norm": 0.6399481627858203, + "learning_rate": 5.257635436128934e-06, + "loss": 0.6016, + "step": 4114 + }, + { + "epoch": 1.6061670569867292, + "grad_norm": 0.6284071548085789, + "learning_rate": 5.255367504940106e-06, + "loss": 0.5756, + "step": 4115 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.7136763208900975, + "learning_rate": 5.253099521073212e-06, + "loss": 0.614, + "step": 4116 + }, + { + "epoch": 1.6069476971116314, + "grad_norm": 0.7423785083872183, + "learning_rate": 5.250831484996101e-06, + "loss": 0.63, + "step": 4117 + }, + { + "epoch": 1.6073380171740828, + "grad_norm": 0.5507619205396027, + "learning_rate": 5.248563397176627e-06, + "loss": 0.5935, + "step": 4118 + }, + { + "epoch": 1.607728337236534, + "grad_norm": 0.5435289888685421, + "learning_rate": 5.246295258082662e-06, + "loss": 0.5868, + "step": 4119 + }, + { + "epoch": 1.6081186572989852, + "grad_norm": 0.6275000692590844, + "learning_rate": 5.244027068182087e-06, + "loss": 0.6053, + "step": 4120 + }, + { + "epoch": 1.6085089773614363, + "grad_norm": 0.6607235636566637, + "learning_rate": 5.241758827942787e-06, + "loss": 0.5817, + "step": 4121 + }, + { + "epoch": 1.6088992974238876, + "grad_norm": 0.5874712409283623, + "learning_rate": 5.239490537832666e-06, + "loss": 0.5633, + "step": 4122 + }, + { + "epoch": 1.609289617486339, + "grad_norm": 0.6483769463611836, + "learning_rate": 5.237222198319633e-06, + "loss": 0.5919, + "step": 4123 + }, + { + "epoch": 1.60967993754879, + "grad_norm": 0.5891258286354152, + "learning_rate": 5.234953809871609e-06, + "loss": 0.6184, + "step": 4124 + }, + { + "epoch": 1.6100702576112411, + "grad_norm": 0.6066465221671847, + "learning_rate": 5.232685372956525e-06, + "loss": 0.6023, + "step": 4125 + }, + { + "epoch": 1.6104605776736924, + "grad_norm": 0.5144401913491381, + "learning_rate": 5.230416888042321e-06, + "loss": 0.5769, + "step": 4126 + }, + { + "epoch": 1.6108508977361438, + "grad_norm": 0.5535742428353131, + "learning_rate": 5.228148355596949e-06, + "loss": 0.605, + "step": 4127 + }, + { + "epoch": 1.6112412177985949, + "grad_norm": 0.6695641818850098, + "learning_rate": 5.225879776088367e-06, + "loss": 0.59, + "step": 4128 + }, + { + "epoch": 1.611631537861046, + "grad_norm": 0.6484748306243561, + "learning_rate": 5.2236111499845495e-06, + "loss": 0.5789, + "step": 4129 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.6516873417012682, + "learning_rate": 5.221342477753472e-06, + "loss": 0.5844, + "step": 4130 + }, + { + "epoch": 1.6124121779859486, + "grad_norm": 0.5250990642919906, + "learning_rate": 5.219073759863128e-06, + "loss": 0.6113, + "step": 4131 + }, + { + "epoch": 1.6128024980483997, + "grad_norm": 0.6855874355471325, + "learning_rate": 5.216804996781511e-06, + "loss": 0.5791, + "step": 4132 + }, + { + "epoch": 1.6131928181108508, + "grad_norm": 0.5579578104508754, + "learning_rate": 5.2145361889766334e-06, + "loss": 0.5944, + "step": 4133 + }, + { + "epoch": 1.6135831381733021, + "grad_norm": 0.5651903191092567, + "learning_rate": 5.21226733691651e-06, + "loss": 0.6032, + "step": 4134 + }, + { + "epoch": 1.6139734582357534, + "grad_norm": 0.5896007257063236, + "learning_rate": 5.2099984410691685e-06, + "loss": 0.578, + "step": 4135 + }, + { + "epoch": 1.6143637782982045, + "grad_norm": 0.5828840887058042, + "learning_rate": 5.207729501902644e-06, + "loss": 0.5652, + "step": 4136 + }, + { + "epoch": 1.6147540983606556, + "grad_norm": 0.5794501463067947, + "learning_rate": 5.205460519884981e-06, + "loss": 0.6283, + "step": 4137 + }, + { + "epoch": 1.615144418423107, + "grad_norm": 0.530399625894126, + "learning_rate": 5.203191495484232e-06, + "loss": 0.6104, + "step": 4138 + }, + { + "epoch": 1.6155347384855583, + "grad_norm": 0.5528316298951033, + "learning_rate": 5.20092242916846e-06, + "loss": 0.6078, + "step": 4139 + }, + { + "epoch": 1.6159250585480094, + "grad_norm": 0.5717020902481508, + "learning_rate": 5.198653321405734e-06, + "loss": 0.6167, + "step": 4140 + }, + { + "epoch": 1.6163153786104605, + "grad_norm": 0.5145404040971531, + "learning_rate": 5.196384172664136e-06, + "loss": 0.5997, + "step": 4141 + }, + { + "epoch": 1.6167056986729118, + "grad_norm": 0.5574555166643679, + "learning_rate": 5.194114983411752e-06, + "loss": 0.6118, + "step": 4142 + }, + { + "epoch": 1.6170960187353631, + "grad_norm": 0.520257629693206, + "learning_rate": 5.191845754116678e-06, + "loss": 0.5787, + "step": 4143 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.6549766575101695, + "learning_rate": 5.189576485247016e-06, + "loss": 0.6184, + "step": 4144 + }, + { + "epoch": 1.6178766588602653, + "grad_norm": 0.568492075994628, + "learning_rate": 5.187307177270882e-06, + "loss": 0.6169, + "step": 4145 + }, + { + "epoch": 1.6182669789227166, + "grad_norm": 0.6222485177399365, + "learning_rate": 5.185037830656396e-06, + "loss": 0.6092, + "step": 4146 + }, + { + "epoch": 1.618657298985168, + "grad_norm": 0.6238068947800193, + "learning_rate": 5.1827684458716845e-06, + "loss": 0.5844, + "step": 4147 + }, + { + "epoch": 1.619047619047619, + "grad_norm": 0.6138792748262198, + "learning_rate": 5.180499023384886e-06, + "loss": 0.6475, + "step": 4148 + }, + { + "epoch": 1.6194379391100702, + "grad_norm": 0.6261338762633802, + "learning_rate": 5.178229563664141e-06, + "loss": 0.6184, + "step": 4149 + }, + { + "epoch": 1.6198282591725215, + "grad_norm": 0.5235291773272877, + "learning_rate": 5.175960067177605e-06, + "loss": 0.5918, + "step": 4150 + }, + { + "epoch": 1.6202185792349728, + "grad_norm": 0.5229641029575658, + "learning_rate": 5.1736905343934365e-06, + "loss": 0.6155, + "step": 4151 + }, + { + "epoch": 1.620608899297424, + "grad_norm": 0.528326803004885, + "learning_rate": 5.171420965779801e-06, + "loss": 0.5593, + "step": 4152 + }, + { + "epoch": 1.620999219359875, + "grad_norm": 0.5537293442554394, + "learning_rate": 5.169151361804872e-06, + "loss": 0.6253, + "step": 4153 + }, + { + "epoch": 1.6213895394223263, + "grad_norm": 0.58719396674014, + "learning_rate": 5.166881722936835e-06, + "loss": 0.6285, + "step": 4154 + }, + { + "epoch": 1.6217798594847777, + "grad_norm": 0.5347712754550149, + "learning_rate": 5.164612049643877e-06, + "loss": 0.5641, + "step": 4155 + }, + { + "epoch": 1.6221701795472288, + "grad_norm": 0.5202392552248372, + "learning_rate": 5.162342342394191e-06, + "loss": 0.5771, + "step": 4156 + }, + { + "epoch": 1.6225604996096799, + "grad_norm": 0.5763200497604365, + "learning_rate": 5.160072601655981e-06, + "loss": 0.6057, + "step": 4157 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 0.5418038639639728, + "learning_rate": 5.157802827897459e-06, + "loss": 0.5705, + "step": 4158 + }, + { + "epoch": 1.6233411397345825, + "grad_norm": 0.5970207010009798, + "learning_rate": 5.155533021586841e-06, + "loss": 0.584, + "step": 4159 + }, + { + "epoch": 1.6237314597970336, + "grad_norm": 0.5820261230912993, + "learning_rate": 5.153263183192349e-06, + "loss": 0.5958, + "step": 4160 + }, + { + "epoch": 1.6241217798594847, + "grad_norm": 0.5506002650413667, + "learning_rate": 5.150993313182213e-06, + "loss": 0.6193, + "step": 4161 + }, + { + "epoch": 1.624512099921936, + "grad_norm": 0.4984977362509724, + "learning_rate": 5.148723412024669e-06, + "loss": 0.6189, + "step": 4162 + }, + { + "epoch": 1.6249024199843873, + "grad_norm": 0.5434934779930509, + "learning_rate": 5.146453480187963e-06, + "loss": 0.5822, + "step": 4163 + }, + { + "epoch": 1.6252927400468384, + "grad_norm": 0.5818298372050732, + "learning_rate": 5.1441835181403385e-06, + "loss": 0.6086, + "step": 4164 + }, + { + "epoch": 1.6256830601092895, + "grad_norm": 0.5265130153705277, + "learning_rate": 5.1419135263500575e-06, + "loss": 0.5891, + "step": 4165 + }, + { + "epoch": 1.6260733801717409, + "grad_norm": 0.5477342730748691, + "learning_rate": 5.139643505285375e-06, + "loss": 0.5693, + "step": 4166 + }, + { + "epoch": 1.6264637002341922, + "grad_norm": 0.5306091370985518, + "learning_rate": 5.1373734554145634e-06, + "loss": 0.5846, + "step": 4167 + }, + { + "epoch": 1.6268540202966433, + "grad_norm": 0.5712086453962995, + "learning_rate": 5.135103377205894e-06, + "loss": 0.5786, + "step": 4168 + }, + { + "epoch": 1.6272443403590944, + "grad_norm": 0.7253366020796083, + "learning_rate": 5.132833271127648e-06, + "loss": 0.5727, + "step": 4169 + }, + { + "epoch": 1.6276346604215457, + "grad_norm": 0.5176151102574154, + "learning_rate": 5.130563137648108e-06, + "loss": 0.5923, + "step": 4170 + }, + { + "epoch": 1.628024980483997, + "grad_norm": 0.5263765251021179, + "learning_rate": 5.128292977235565e-06, + "loss": 0.5727, + "step": 4171 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.5050172484008677, + "learning_rate": 5.126022790358317e-06, + "loss": 0.6004, + "step": 4172 + }, + { + "epoch": 1.6288056206088992, + "grad_norm": 0.5327078999078004, + "learning_rate": 5.1237525774846665e-06, + "loss": 0.5952, + "step": 4173 + }, + { + "epoch": 1.6291959406713505, + "grad_norm": 0.5277294464264112, + "learning_rate": 5.121482339082917e-06, + "loss": 0.5941, + "step": 4174 + }, + { + "epoch": 1.6295862607338019, + "grad_norm": 0.5178061059424178, + "learning_rate": 5.119212075621383e-06, + "loss": 0.547, + "step": 4175 + }, + { + "epoch": 1.629976580796253, + "grad_norm": 0.5978530413005504, + "learning_rate": 5.116941787568382e-06, + "loss": 0.6174, + "step": 4176 + }, + { + "epoch": 1.630366900858704, + "grad_norm": 0.5876011004061235, + "learning_rate": 5.114671475392239e-06, + "loss": 0.6045, + "step": 4177 + }, + { + "epoch": 1.6307572209211554, + "grad_norm": 0.5248431541716521, + "learning_rate": 5.1124011395612775e-06, + "loss": 0.6254, + "step": 4178 + }, + { + "epoch": 1.6311475409836067, + "grad_norm": 0.6187659518403549, + "learning_rate": 5.110130780543831e-06, + "loss": 0.5698, + "step": 4179 + }, + { + "epoch": 1.6315378610460578, + "grad_norm": 0.5890998558134525, + "learning_rate": 5.10786039880824e-06, + "loss": 0.5928, + "step": 4180 + }, + { + "epoch": 1.631928181108509, + "grad_norm": 0.5582700145707259, + "learning_rate": 5.105589994822842e-06, + "loss": 0.5754, + "step": 4181 + }, + { + "epoch": 1.6323185011709602, + "grad_norm": 0.5022253184373825, + "learning_rate": 5.103319569055988e-06, + "loss": 0.5778, + "step": 4182 + }, + { + "epoch": 1.6327088212334115, + "grad_norm": 0.5305959250026792, + "learning_rate": 5.101049121976024e-06, + "loss": 0.5894, + "step": 4183 + }, + { + "epoch": 1.6330991412958626, + "grad_norm": 0.5114033915448756, + "learning_rate": 5.09877865405131e-06, + "loss": 0.5776, + "step": 4184 + }, + { + "epoch": 1.6334894613583137, + "grad_norm": 0.6880195213446725, + "learning_rate": 5.096508165750201e-06, + "loss": 0.5971, + "step": 4185 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.5141564123036043, + "learning_rate": 5.094237657541067e-06, + "loss": 0.5949, + "step": 4186 + }, + { + "epoch": 1.6342701014832164, + "grad_norm": 0.49239894441114956, + "learning_rate": 5.0919671298922715e-06, + "loss": 0.5924, + "step": 4187 + }, + { + "epoch": 1.6346604215456675, + "grad_norm": 0.6259493875945998, + "learning_rate": 5.089696583272187e-06, + "loss": 0.5603, + "step": 4188 + }, + { + "epoch": 1.6350507416081186, + "grad_norm": 0.5730963462464376, + "learning_rate": 5.08742601814919e-06, + "loss": 0.6427, + "step": 4189 + }, + { + "epoch": 1.63544106167057, + "grad_norm": 0.4978043234178859, + "learning_rate": 5.085155434991662e-06, + "loss": 0.5691, + "step": 4190 + }, + { + "epoch": 1.6358313817330212, + "grad_norm": 0.6495907000829725, + "learning_rate": 5.082884834267986e-06, + "loss": 0.569, + "step": 4191 + }, + { + "epoch": 1.6362217017954723, + "grad_norm": 0.5919008108023146, + "learning_rate": 5.0806142164465465e-06, + "loss": 0.6096, + "step": 4192 + }, + { + "epoch": 1.6366120218579234, + "grad_norm": 0.515226615896691, + "learning_rate": 5.078343581995737e-06, + "loss": 0.6451, + "step": 4193 + }, + { + "epoch": 1.6370023419203747, + "grad_norm": 0.5407473718075295, + "learning_rate": 5.07607293138395e-06, + "loss": 0.5776, + "step": 4194 + }, + { + "epoch": 1.637392661982826, + "grad_norm": 0.6922152800244703, + "learning_rate": 5.073802265079585e-06, + "loss": 0.6193, + "step": 4195 + }, + { + "epoch": 1.6377829820452772, + "grad_norm": 0.6357901132824514, + "learning_rate": 5.07153158355104e-06, + "loss": 0.5978, + "step": 4196 + }, + { + "epoch": 1.6381733021077283, + "grad_norm": 0.4987470612289364, + "learning_rate": 5.06926088726672e-06, + "loss": 0.5903, + "step": 4197 + }, + { + "epoch": 1.6385636221701796, + "grad_norm": 0.611883286143685, + "learning_rate": 5.066990176695031e-06, + "loss": 0.6121, + "step": 4198 + }, + { + "epoch": 1.638953942232631, + "grad_norm": 0.6684742000804238, + "learning_rate": 5.064719452304385e-06, + "loss": 0.5623, + "step": 4199 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.548835804224495, + "learning_rate": 5.062448714563192e-06, + "loss": 0.5945, + "step": 4200 + }, + { + "epoch": 1.639734582357533, + "grad_norm": 0.6177172735184091, + "learning_rate": 5.060177963939869e-06, + "loss": 0.6525, + "step": 4201 + }, + { + "epoch": 1.6401249024199844, + "grad_norm": 0.6930443100610558, + "learning_rate": 5.057907200902833e-06, + "loss": 0.6117, + "step": 4202 + }, + { + "epoch": 1.6405152224824358, + "grad_norm": 0.5082627863696824, + "learning_rate": 5.055636425920506e-06, + "loss": 0.6217, + "step": 4203 + }, + { + "epoch": 1.6409055425448869, + "grad_norm": 0.5710962126454973, + "learning_rate": 5.053365639461312e-06, + "loss": 0.6003, + "step": 4204 + }, + { + "epoch": 1.641295862607338, + "grad_norm": 0.6008426810954155, + "learning_rate": 5.051094841993672e-06, + "loss": 0.608, + "step": 4205 + }, + { + "epoch": 1.6416861826697893, + "grad_norm": 0.5565933853774423, + "learning_rate": 5.048824033986018e-06, + "loss": 0.618, + "step": 4206 + }, + { + "epoch": 1.6420765027322406, + "grad_norm": 0.5262886376823777, + "learning_rate": 5.046553215906778e-06, + "loss": 0.5955, + "step": 4207 + }, + { + "epoch": 1.6424668227946917, + "grad_norm": 0.5582341871715214, + "learning_rate": 5.044282388224385e-06, + "loss": 0.5933, + "step": 4208 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.5946027423022442, + "learning_rate": 5.042011551407272e-06, + "loss": 0.5909, + "step": 4209 + }, + { + "epoch": 1.6432474629195941, + "grad_norm": 0.6309494964810369, + "learning_rate": 5.039740705923876e-06, + "loss": 0.6388, + "step": 4210 + }, + { + "epoch": 1.6436377829820454, + "grad_norm": 0.6109382397326903, + "learning_rate": 5.037469852242633e-06, + "loss": 0.5698, + "step": 4211 + }, + { + "epoch": 1.6440281030444965, + "grad_norm": 0.6659395173422699, + "learning_rate": 5.035198990831983e-06, + "loss": 0.599, + "step": 4212 + }, + { + "epoch": 1.6444184231069476, + "grad_norm": 0.5997051600729175, + "learning_rate": 5.032928122160369e-06, + "loss": 0.5652, + "step": 4213 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.6337088170116657, + "learning_rate": 5.030657246696229e-06, + "loss": 0.6487, + "step": 4214 + }, + { + "epoch": 1.6451990632318503, + "grad_norm": 0.5777770830772085, + "learning_rate": 5.02838636490801e-06, + "loss": 0.6045, + "step": 4215 + }, + { + "epoch": 1.6455893832943014, + "grad_norm": 0.6198316283701133, + "learning_rate": 5.026115477264156e-06, + "loss": 0.5964, + "step": 4216 + }, + { + "epoch": 1.6459797033567525, + "grad_norm": 0.5535883020675038, + "learning_rate": 5.023844584233115e-06, + "loss": 0.5767, + "step": 4217 + }, + { + "epoch": 1.6463700234192038, + "grad_norm": 0.5758531279620833, + "learning_rate": 5.021573686283332e-06, + "loss": 0.5936, + "step": 4218 + }, + { + "epoch": 1.6467603434816551, + "grad_norm": 0.5808080320821788, + "learning_rate": 5.019302783883257e-06, + "loss": 0.5979, + "step": 4219 + }, + { + "epoch": 1.6471506635441062, + "grad_norm": 0.5809172640734338, + "learning_rate": 5.017031877501339e-06, + "loss": 0.6299, + "step": 4220 + }, + { + "epoch": 1.6475409836065573, + "grad_norm": 0.5709478641893139, + "learning_rate": 5.014760967606029e-06, + "loss": 0.6016, + "step": 4221 + }, + { + "epoch": 1.6479313036690086, + "grad_norm": 0.7080278913253141, + "learning_rate": 5.012490054665776e-06, + "loss": 0.6363, + "step": 4222 + }, + { + "epoch": 1.64832162373146, + "grad_norm": 0.6007325629983483, + "learning_rate": 5.010219139149033e-06, + "loss": 0.5996, + "step": 4223 + }, + { + "epoch": 1.648711943793911, + "grad_norm": 0.5401442676722691, + "learning_rate": 5.00794822152425e-06, + "loss": 0.5798, + "step": 4224 + }, + { + "epoch": 1.6491022638563622, + "grad_norm": 0.5496706701909614, + "learning_rate": 5.0056773022598835e-06, + "loss": 0.5756, + "step": 4225 + }, + { + "epoch": 1.6494925839188135, + "grad_norm": 0.551969708208509, + "learning_rate": 5.003406381824384e-06, + "loss": 0.5997, + "step": 4226 + }, + { + "epoch": 1.6498829039812648, + "grad_norm": 0.537325580816158, + "learning_rate": 5.001135460686204e-06, + "loss": 0.5906, + "step": 4227 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.5967170514365525, + "learning_rate": 4.998864539313798e-06, + "loss": 0.5963, + "step": 4228 + }, + { + "epoch": 1.650663544106167, + "grad_norm": 0.5253358519783191, + "learning_rate": 4.996593618175617e-06, + "loss": 0.5776, + "step": 4229 + }, + { + "epoch": 1.651053864168618, + "grad_norm": 0.5502273963889185, + "learning_rate": 4.994322697740118e-06, + "loss": 0.6223, + "step": 4230 + }, + { + "epoch": 1.6514441842310694, + "grad_norm": 0.5329464020281974, + "learning_rate": 4.992051778475751e-06, + "loss": 0.5915, + "step": 4231 + }, + { + "epoch": 1.6518345042935207, + "grad_norm": 0.714140094365162, + "learning_rate": 4.989780860850969e-06, + "loss": 0.6246, + "step": 4232 + }, + { + "epoch": 1.6522248243559718, + "grad_norm": 0.5841093636212291, + "learning_rate": 4.987509945334226e-06, + "loss": 0.603, + "step": 4233 + }, + { + "epoch": 1.652615144418423, + "grad_norm": 0.4980712171100845, + "learning_rate": 4.985239032393972e-06, + "loss": 0.5754, + "step": 4234 + }, + { + "epoch": 1.6530054644808743, + "grad_norm": 0.5604047069187787, + "learning_rate": 4.982968122498662e-06, + "loss": 0.5953, + "step": 4235 + }, + { + "epoch": 1.6533957845433256, + "grad_norm": 0.5792742130145744, + "learning_rate": 4.980697216116744e-06, + "loss": 0.6068, + "step": 4236 + }, + { + "epoch": 1.6537861046057767, + "grad_norm": 0.6229989749726113, + "learning_rate": 4.97842631371667e-06, + "loss": 0.6077, + "step": 4237 + }, + { + "epoch": 1.6541764246682278, + "grad_norm": 0.6209725241545867, + "learning_rate": 4.976155415766887e-06, + "loss": 0.6152, + "step": 4238 + }, + { + "epoch": 1.654566744730679, + "grad_norm": 0.6463497816556782, + "learning_rate": 4.973884522735845e-06, + "loss": 0.5925, + "step": 4239 + }, + { + "epoch": 1.6549570647931304, + "grad_norm": 0.624313995212008, + "learning_rate": 4.9716136350919916e-06, + "loss": 0.597, + "step": 4240 + }, + { + "epoch": 1.6553473848555815, + "grad_norm": 0.6437025908597256, + "learning_rate": 4.969342753303773e-06, + "loss": 0.5976, + "step": 4241 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.5164203861714655, + "learning_rate": 4.967071877839633e-06, + "loss": 0.618, + "step": 4242 + }, + { + "epoch": 1.656128024980484, + "grad_norm": 0.5393681149620421, + "learning_rate": 4.964801009168018e-06, + "loss": 0.5988, + "step": 4243 + }, + { + "epoch": 1.6565183450429353, + "grad_norm": 0.5359644699686491, + "learning_rate": 4.962530147757368e-06, + "loss": 0.6379, + "step": 4244 + }, + { + "epoch": 1.6569086651053864, + "grad_norm": 0.5833315946489096, + "learning_rate": 4.960259294076125e-06, + "loss": 0.6089, + "step": 4245 + }, + { + "epoch": 1.6572989851678375, + "grad_norm": 0.5357786220012647, + "learning_rate": 4.957988448592729e-06, + "loss": 0.5861, + "step": 4246 + }, + { + "epoch": 1.6576893052302888, + "grad_norm": 0.5316581930934896, + "learning_rate": 4.955717611775617e-06, + "loss": 0.6072, + "step": 4247 + }, + { + "epoch": 1.6580796252927401, + "grad_norm": 0.6028741036807383, + "learning_rate": 4.953446784093224e-06, + "loss": 0.6076, + "step": 4248 + }, + { + "epoch": 1.6584699453551912, + "grad_norm": 0.5615353731152257, + "learning_rate": 4.951175966013983e-06, + "loss": 0.6098, + "step": 4249 + }, + { + "epoch": 1.6588602654176423, + "grad_norm": 0.5446713445787105, + "learning_rate": 4.9489051580063295e-06, + "loss": 0.6458, + "step": 4250 + }, + { + "epoch": 1.6592505854800936, + "grad_norm": 0.5865649970824602, + "learning_rate": 4.94663436053869e-06, + "loss": 0.5849, + "step": 4251 + }, + { + "epoch": 1.659640905542545, + "grad_norm": 0.6271738541304359, + "learning_rate": 4.944363574079494e-06, + "loss": 0.6344, + "step": 4252 + }, + { + "epoch": 1.660031225604996, + "grad_norm": 0.5311470687242281, + "learning_rate": 4.942092799097167e-06, + "loss": 0.6194, + "step": 4253 + }, + { + "epoch": 1.6604215456674472, + "grad_norm": 0.526656083075295, + "learning_rate": 4.939822036060134e-06, + "loss": 0.6265, + "step": 4254 + }, + { + "epoch": 1.6608118657298985, + "grad_norm": 0.6419255631175109, + "learning_rate": 4.93755128543681e-06, + "loss": 0.6084, + "step": 4255 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.5572717793901067, + "learning_rate": 4.935280547695618e-06, + "loss": 0.5838, + "step": 4256 + }, + { + "epoch": 1.661592505854801, + "grad_norm": 0.5222399455476184, + "learning_rate": 4.933009823304971e-06, + "loss": 0.5754, + "step": 4257 + }, + { + "epoch": 1.661982825917252, + "grad_norm": 0.5583628669989025, + "learning_rate": 4.930739112733281e-06, + "loss": 0.594, + "step": 4258 + }, + { + "epoch": 1.6623731459797033, + "grad_norm": 0.6949714862961451, + "learning_rate": 4.928468416448961e-06, + "loss": 0.6288, + "step": 4259 + }, + { + "epoch": 1.6627634660421546, + "grad_norm": 0.5918734283233386, + "learning_rate": 4.926197734920417e-06, + "loss": 0.6359, + "step": 4260 + }, + { + "epoch": 1.6631537861046057, + "grad_norm": 0.5564894814352483, + "learning_rate": 4.92392706861605e-06, + "loss": 0.5788, + "step": 4261 + }, + { + "epoch": 1.6635441061670568, + "grad_norm": 0.6363743181453669, + "learning_rate": 4.921656418004263e-06, + "loss": 0.5632, + "step": 4262 + }, + { + "epoch": 1.6639344262295082, + "grad_norm": 0.555297346570851, + "learning_rate": 4.919385783553455e-06, + "loss": 0.6035, + "step": 4263 + }, + { + "epoch": 1.6643247462919595, + "grad_norm": 0.6765378097689188, + "learning_rate": 4.917115165732017e-06, + "loss": 0.5931, + "step": 4264 + }, + { + "epoch": 1.6647150663544106, + "grad_norm": 0.5333362188887223, + "learning_rate": 4.914844565008339e-06, + "loss": 0.5953, + "step": 4265 + }, + { + "epoch": 1.6651053864168617, + "grad_norm": 0.604914843022089, + "learning_rate": 4.9125739818508104e-06, + "loss": 0.5821, + "step": 4266 + }, + { + "epoch": 1.665495706479313, + "grad_norm": 0.6003815229397108, + "learning_rate": 4.910303416727814e-06, + "loss": 0.6221, + "step": 4267 + }, + { + "epoch": 1.6658860265417643, + "grad_norm": 0.5494746752553734, + "learning_rate": 4.90803287010773e-06, + "loss": 0.6057, + "step": 4268 + }, + { + "epoch": 1.6662763466042154, + "grad_norm": 0.5690491431691221, + "learning_rate": 4.905762342458935e-06, + "loss": 0.6258, + "step": 4269 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.6094035212266773, + "learning_rate": 4.903491834249799e-06, + "loss": 0.6082, + "step": 4270 + }, + { + "epoch": 1.6670569867291178, + "grad_norm": 0.5723776076626421, + "learning_rate": 4.901221345948691e-06, + "loss": 0.6244, + "step": 4271 + }, + { + "epoch": 1.6674473067915692, + "grad_norm": 0.5160827160516444, + "learning_rate": 4.898950878023978e-06, + "loss": 0.6366, + "step": 4272 + }, + { + "epoch": 1.6678376268540203, + "grad_norm": 0.590644805086583, + "learning_rate": 4.896680430944014e-06, + "loss": 0.6015, + "step": 4273 + }, + { + "epoch": 1.6682279469164714, + "grad_norm": 0.5648033821951687, + "learning_rate": 4.8944100051771585e-06, + "loss": 0.6018, + "step": 4274 + }, + { + "epoch": 1.6686182669789227, + "grad_norm": 0.5161471738990043, + "learning_rate": 4.892139601191761e-06, + "loss": 0.5771, + "step": 4275 + }, + { + "epoch": 1.669008587041374, + "grad_norm": 0.5218562351745799, + "learning_rate": 4.889869219456169e-06, + "loss": 0.6235, + "step": 4276 + }, + { + "epoch": 1.669398907103825, + "grad_norm": 0.5194247360230946, + "learning_rate": 4.887598860438723e-06, + "loss": 0.6155, + "step": 4277 + }, + { + "epoch": 1.6697892271662762, + "grad_norm": 0.5518040111627246, + "learning_rate": 4.885328524607762e-06, + "loss": 0.6241, + "step": 4278 + }, + { + "epoch": 1.6701795472287275, + "grad_norm": 0.5795003932591306, + "learning_rate": 4.883058212431618e-06, + "loss": 0.5995, + "step": 4279 + }, + { + "epoch": 1.6705698672911788, + "grad_norm": 0.5434634179208443, + "learning_rate": 4.8807879243786175e-06, + "loss": 0.6154, + "step": 4280 + }, + { + "epoch": 1.67096018735363, + "grad_norm": 0.5322803903177991, + "learning_rate": 4.878517660917086e-06, + "loss": 0.6042, + "step": 4281 + }, + { + "epoch": 1.671350507416081, + "grad_norm": 0.5440060537514222, + "learning_rate": 4.876247422515337e-06, + "loss": 0.6106, + "step": 4282 + }, + { + "epoch": 1.6717408274785324, + "grad_norm": 0.5071700696176992, + "learning_rate": 4.873977209641684e-06, + "loss": 0.6262, + "step": 4283 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 0.5703680627140495, + "learning_rate": 4.871707022764437e-06, + "loss": 0.6388, + "step": 4284 + }, + { + "epoch": 1.6725214676034348, + "grad_norm": 0.5304115927274609, + "learning_rate": 4.869436862351894e-06, + "loss": 0.5911, + "step": 4285 + }, + { + "epoch": 1.6729117876658859, + "grad_norm": 0.5039927588670476, + "learning_rate": 4.8671667288723535e-06, + "loss": 0.6286, + "step": 4286 + }, + { + "epoch": 1.6733021077283372, + "grad_norm": 0.5387628133962163, + "learning_rate": 4.864896622794106e-06, + "loss": 0.601, + "step": 4287 + }, + { + "epoch": 1.6736924277907885, + "grad_norm": 0.5496415299278908, + "learning_rate": 4.8626265445854365e-06, + "loss": 0.6125, + "step": 4288 + }, + { + "epoch": 1.6740827478532396, + "grad_norm": 0.5293865505708107, + "learning_rate": 4.860356494714626e-06, + "loss": 0.6418, + "step": 4289 + }, + { + "epoch": 1.6744730679156907, + "grad_norm": 0.4833755333482534, + "learning_rate": 4.858086473649945e-06, + "loss": 0.6491, + "step": 4290 + }, + { + "epoch": 1.674863387978142, + "grad_norm": 0.5259512356249716, + "learning_rate": 4.855816481859662e-06, + "loss": 0.6235, + "step": 4291 + }, + { + "epoch": 1.6752537080405934, + "grad_norm": 0.5954457600606109, + "learning_rate": 4.853546519812039e-06, + "loss": 0.5624, + "step": 4292 + }, + { + "epoch": 1.6756440281030445, + "grad_norm": 0.612355402759095, + "learning_rate": 4.851276587975332e-06, + "loss": 0.6401, + "step": 4293 + }, + { + "epoch": 1.6760343481654956, + "grad_norm": 0.542019702686455, + "learning_rate": 4.849006686817789e-06, + "loss": 0.5981, + "step": 4294 + }, + { + "epoch": 1.676424668227947, + "grad_norm": 0.6026900572198409, + "learning_rate": 4.846736816807652e-06, + "loss": 0.6024, + "step": 4295 + }, + { + "epoch": 1.6768149882903982, + "grad_norm": 0.6138448231656916, + "learning_rate": 4.84446697841316e-06, + "loss": 0.584, + "step": 4296 + }, + { + "epoch": 1.6772053083528493, + "grad_norm": 0.5790298731181888, + "learning_rate": 4.8421971721025404e-06, + "loss": 0.6209, + "step": 4297 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.6003808128347394, + "learning_rate": 4.839927398344021e-06, + "loss": 0.6012, + "step": 4298 + }, + { + "epoch": 1.6779859484777517, + "grad_norm": 0.6400969120969889, + "learning_rate": 4.837657657605812e-06, + "loss": 0.5865, + "step": 4299 + }, + { + "epoch": 1.678376268540203, + "grad_norm": 0.5198084397369448, + "learning_rate": 4.8353879503561266e-06, + "loss": 0.5555, + "step": 4300 + }, + { + "epoch": 1.6787665886026542, + "grad_norm": 0.4989696739116217, + "learning_rate": 4.833118277063166e-06, + "loss": 0.6348, + "step": 4301 + }, + { + "epoch": 1.6791569086651053, + "grad_norm": 0.4930597511218413, + "learning_rate": 4.8308486381951285e-06, + "loss": 0.616, + "step": 4302 + }, + { + "epoch": 1.6795472287275566, + "grad_norm": 0.5742900665288363, + "learning_rate": 4.8285790342202e-06, + "loss": 0.5925, + "step": 4303 + }, + { + "epoch": 1.679937548790008, + "grad_norm": 0.5772433849615963, + "learning_rate": 4.826309465606564e-06, + "loss": 0.6348, + "step": 4304 + }, + { + "epoch": 1.680327868852459, + "grad_norm": 0.4857832886578406, + "learning_rate": 4.824039932822396e-06, + "loss": 0.6041, + "step": 4305 + }, + { + "epoch": 1.68071818891491, + "grad_norm": 0.4905190576374374, + "learning_rate": 4.82177043633586e-06, + "loss": 0.6011, + "step": 4306 + }, + { + "epoch": 1.6811085089773614, + "grad_norm": 0.562385150120262, + "learning_rate": 4.8195009766151165e-06, + "loss": 0.5938, + "step": 4307 + }, + { + "epoch": 1.6814988290398127, + "grad_norm": 0.541663284281629, + "learning_rate": 4.817231554128317e-06, + "loss": 0.5734, + "step": 4308 + }, + { + "epoch": 1.6818891491022638, + "grad_norm": 0.4989800075092417, + "learning_rate": 4.814962169343605e-06, + "loss": 0.6179, + "step": 4309 + }, + { + "epoch": 1.682279469164715, + "grad_norm": 0.5521011436698336, + "learning_rate": 4.812692822729119e-06, + "loss": 0.5888, + "step": 4310 + }, + { + "epoch": 1.6826697892271663, + "grad_norm": 0.5312748360029554, + "learning_rate": 4.8104235147529844e-06, + "loss": 0.59, + "step": 4311 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.47029595243889394, + "learning_rate": 4.808154245883324e-06, + "loss": 0.6125, + "step": 4312 + }, + { + "epoch": 1.6834504293520687, + "grad_norm": 0.4624694573448564, + "learning_rate": 4.805885016588249e-06, + "loss": 0.5721, + "step": 4313 + }, + { + "epoch": 1.6838407494145198, + "grad_norm": 0.5326105099104067, + "learning_rate": 4.803615827335864e-06, + "loss": 0.5965, + "step": 4314 + }, + { + "epoch": 1.684231069476971, + "grad_norm": 0.47559909690789376, + "learning_rate": 4.801346678594265e-06, + "loss": 0.5476, + "step": 4315 + }, + { + "epoch": 1.6846213895394224, + "grad_norm": 0.6327599176906487, + "learning_rate": 4.7990775708315425e-06, + "loss": 0.5761, + "step": 4316 + }, + { + "epoch": 1.6850117096018735, + "grad_norm": 0.6018907354918898, + "learning_rate": 4.79680850451577e-06, + "loss": 0.6082, + "step": 4317 + }, + { + "epoch": 1.6854020296643246, + "grad_norm": 0.5774413605873695, + "learning_rate": 4.794539480115022e-06, + "loss": 0.5859, + "step": 4318 + }, + { + "epoch": 1.685792349726776, + "grad_norm": 0.4879852619626666, + "learning_rate": 4.792270498097358e-06, + "loss": 0.586, + "step": 4319 + }, + { + "epoch": 1.6861826697892273, + "grad_norm": 0.6022287147163617, + "learning_rate": 4.790001558930833e-06, + "loss": 0.613, + "step": 4320 + }, + { + "epoch": 1.6865729898516784, + "grad_norm": 0.5119909729130757, + "learning_rate": 4.787732663083491e-06, + "loss": 0.5882, + "step": 4321 + }, + { + "epoch": 1.6869633099141295, + "grad_norm": 0.5670328549324728, + "learning_rate": 4.785463811023367e-06, + "loss": 0.5619, + "step": 4322 + }, + { + "epoch": 1.6873536299765808, + "grad_norm": 0.5461738861950309, + "learning_rate": 4.7831950032184904e-06, + "loss": 0.6435, + "step": 4323 + }, + { + "epoch": 1.687743950039032, + "grad_norm": 0.5279665493803283, + "learning_rate": 4.780926240136874e-06, + "loss": 0.573, + "step": 4324 + }, + { + "epoch": 1.6881342701014832, + "grad_norm": 0.5052817634227288, + "learning_rate": 4.7786575222465285e-06, + "loss": 0.6208, + "step": 4325 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.4758051811299715, + "learning_rate": 4.776388850015451e-06, + "loss": 0.5819, + "step": 4326 + }, + { + "epoch": 1.6889149102263856, + "grad_norm": 0.5342027246133211, + "learning_rate": 4.7741202239116335e-06, + "loss": 0.6325, + "step": 4327 + }, + { + "epoch": 1.689305230288837, + "grad_norm": 0.5387369637302784, + "learning_rate": 4.7718516444030525e-06, + "loss": 0.6071, + "step": 4328 + }, + { + "epoch": 1.689695550351288, + "grad_norm": 0.6158882859555411, + "learning_rate": 4.76958311195768e-06, + "loss": 0.5867, + "step": 4329 + }, + { + "epoch": 1.6900858704137391, + "grad_norm": 0.5570266432519985, + "learning_rate": 4.767314627043476e-06, + "loss": 0.5624, + "step": 4330 + }, + { + "epoch": 1.6904761904761905, + "grad_norm": 0.6018586310209276, + "learning_rate": 4.765046190128392e-06, + "loss": 0.6, + "step": 4331 + }, + { + "epoch": 1.6908665105386418, + "grad_norm": 0.5455392605619709, + "learning_rate": 4.762777801680368e-06, + "loss": 0.5773, + "step": 4332 + }, + { + "epoch": 1.6912568306010929, + "grad_norm": 0.7825829898713653, + "learning_rate": 4.760509462167335e-06, + "loss": 0.6108, + "step": 4333 + }, + { + "epoch": 1.691647150663544, + "grad_norm": 0.5291113045186381, + "learning_rate": 4.758241172057215e-06, + "loss": 0.6395, + "step": 4334 + }, + { + "epoch": 1.6920374707259953, + "grad_norm": 0.5407792217763384, + "learning_rate": 4.755972931817916e-06, + "loss": 0.604, + "step": 4335 + }, + { + "epoch": 1.6924277907884466, + "grad_norm": 0.5483318972701812, + "learning_rate": 4.7537047419173384e-06, + "loss": 0.5846, + "step": 4336 + }, + { + "epoch": 1.6928181108508977, + "grad_norm": 0.5186725648015452, + "learning_rate": 4.751436602823374e-06, + "loss": 0.6001, + "step": 4337 + }, + { + "epoch": 1.6932084309133488, + "grad_norm": 0.5772613819362966, + "learning_rate": 4.749168515003901e-06, + "loss": 0.5926, + "step": 4338 + }, + { + "epoch": 1.6935987509758001, + "grad_norm": 0.523226303658878, + "learning_rate": 4.746900478926789e-06, + "loss": 0.6164, + "step": 4339 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.6476180633888934, + "learning_rate": 4.744632495059895e-06, + "loss": 0.6315, + "step": 4340 + }, + { + "epoch": 1.6943793911007026, + "grad_norm": 0.5896954369717907, + "learning_rate": 4.742364563871066e-06, + "loss": 0.6048, + "step": 4341 + }, + { + "epoch": 1.6947697111631537, + "grad_norm": 0.5506082907410655, + "learning_rate": 4.740096685828139e-06, + "loss": 0.6308, + "step": 4342 + }, + { + "epoch": 1.695160031225605, + "grad_norm": 0.5881177148137569, + "learning_rate": 4.7378288613989425e-06, + "loss": 0.608, + "step": 4343 + }, + { + "epoch": 1.6955503512880563, + "grad_norm": 0.5803263560595256, + "learning_rate": 4.735561091051286e-06, + "loss": 0.6184, + "step": 4344 + }, + { + "epoch": 1.6959406713505074, + "grad_norm": 0.589523263792334, + "learning_rate": 4.733293375252975e-06, + "loss": 0.5681, + "step": 4345 + }, + { + "epoch": 1.6963309914129585, + "grad_norm": 0.564603755109098, + "learning_rate": 4.7310257144718e-06, + "loss": 0.617, + "step": 4346 + }, + { + "epoch": 1.6967213114754098, + "grad_norm": 0.5559103017234958, + "learning_rate": 4.728758109175545e-06, + "loss": 0.5703, + "step": 4347 + }, + { + "epoch": 1.6971116315378612, + "grad_norm": 0.6741972720968868, + "learning_rate": 4.726490559831976e-06, + "loss": 0.6124, + "step": 4348 + }, + { + "epoch": 1.6975019516003123, + "grad_norm": 0.5563163898682933, + "learning_rate": 4.724223066908852e-06, + "loss": 0.606, + "step": 4349 + }, + { + "epoch": 1.6978922716627634, + "grad_norm": 0.515979446465048, + "learning_rate": 4.7219556308739195e-06, + "loss": 0.589, + "step": 4350 + }, + { + "epoch": 1.6982825917252147, + "grad_norm": 0.5115672114430998, + "learning_rate": 4.719688252194914e-06, + "loss": 0.6418, + "step": 4351 + }, + { + "epoch": 1.698672911787666, + "grad_norm": 0.5004164956657363, + "learning_rate": 4.717420931339555e-06, + "loss": 0.6166, + "step": 4352 + }, + { + "epoch": 1.699063231850117, + "grad_norm": 0.5229836220600291, + "learning_rate": 4.715153668775554e-06, + "loss": 0.591, + "step": 4353 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 0.5448343783761842, + "learning_rate": 4.71288646497061e-06, + "loss": 0.6156, + "step": 4354 + }, + { + "epoch": 1.6998438719750195, + "grad_norm": 0.5384284445708268, + "learning_rate": 4.710619320392409e-06, + "loss": 0.5789, + "step": 4355 + }, + { + "epoch": 1.7002341920374708, + "grad_norm": 0.5210462597243886, + "learning_rate": 4.708352235508627e-06, + "loss": 0.6026, + "step": 4356 + }, + { + "epoch": 1.700624512099922, + "grad_norm": 0.5434700468574846, + "learning_rate": 4.706085210786923e-06, + "loss": 0.5874, + "step": 4357 + }, + { + "epoch": 1.701014832162373, + "grad_norm": 0.5678803131037428, + "learning_rate": 4.703818246694948e-06, + "loss": 0.6026, + "step": 4358 + }, + { + "epoch": 1.7014051522248244, + "grad_norm": 0.6750568996862284, + "learning_rate": 4.701551343700339e-06, + "loss": 0.5954, + "step": 4359 + }, + { + "epoch": 1.7017954722872757, + "grad_norm": 0.5914558079794555, + "learning_rate": 4.699284502270723e-06, + "loss": 0.6032, + "step": 4360 + }, + { + "epoch": 1.7021857923497268, + "grad_norm": 0.6249717449581568, + "learning_rate": 4.697017722873705e-06, + "loss": 0.6094, + "step": 4361 + }, + { + "epoch": 1.7025761124121779, + "grad_norm": 0.6505005764861123, + "learning_rate": 4.6947510059768886e-06, + "loss": 0.6444, + "step": 4362 + }, + { + "epoch": 1.7029664324746292, + "grad_norm": 0.5448468104854136, + "learning_rate": 4.692484352047859e-06, + "loss": 0.5799, + "step": 4363 + }, + { + "epoch": 1.7033567525370805, + "grad_norm": 0.6002154732381748, + "learning_rate": 4.690217761554188e-06, + "loss": 0.6037, + "step": 4364 + }, + { + "epoch": 1.7037470725995316, + "grad_norm": 0.697147404492426, + "learning_rate": 4.687951234963438e-06, + "loss": 0.6343, + "step": 4365 + }, + { + "epoch": 1.7041373926619827, + "grad_norm": 0.48752462617290476, + "learning_rate": 4.685684772743153e-06, + "loss": 0.5835, + "step": 4366 + }, + { + "epoch": 1.704527712724434, + "grad_norm": 0.5370828296104501, + "learning_rate": 4.683418375360868e-06, + "loss": 0.617, + "step": 4367 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.6331724436160057, + "learning_rate": 4.681152043284103e-06, + "loss": 0.6143, + "step": 4368 + }, + { + "epoch": 1.7053083528493365, + "grad_norm": 0.5584309124636976, + "learning_rate": 4.678885776980366e-06, + "loss": 0.6027, + "step": 4369 + }, + { + "epoch": 1.7056986729117876, + "grad_norm": 0.5317070621933051, + "learning_rate": 4.676619576917147e-06, + "loss": 0.5839, + "step": 4370 + }, + { + "epoch": 1.7060889929742389, + "grad_norm": 0.5340078144212712, + "learning_rate": 4.674353443561926e-06, + "loss": 0.5969, + "step": 4371 + }, + { + "epoch": 1.7064793130366902, + "grad_norm": 0.5363399039925364, + "learning_rate": 4.672087377382169e-06, + "loss": 0.5746, + "step": 4372 + }, + { + "epoch": 1.7068696330991413, + "grad_norm": 0.6216064354583017, + "learning_rate": 4.66982137884533e-06, + "loss": 0.5848, + "step": 4373 + }, + { + "epoch": 1.7072599531615924, + "grad_norm": 0.5927727609805298, + "learning_rate": 4.6675554484188436e-06, + "loss": 0.5891, + "step": 4374 + }, + { + "epoch": 1.7076502732240437, + "grad_norm": 0.5830636040896863, + "learning_rate": 4.665289586570134e-06, + "loss": 0.5706, + "step": 4375 + }, + { + "epoch": 1.708040593286495, + "grad_norm": 0.5228494088312392, + "learning_rate": 4.6630237937666126e-06, + "loss": 0.6324, + "step": 4376 + }, + { + "epoch": 1.7084309133489461, + "grad_norm": 0.5218357962552835, + "learning_rate": 4.660758070475673e-06, + "loss": 0.5942, + "step": 4377 + }, + { + "epoch": 1.7088212334113972, + "grad_norm": 0.4592544156953792, + "learning_rate": 4.658492417164698e-06, + "loss": 0.582, + "step": 4378 + }, + { + "epoch": 1.7092115534738486, + "grad_norm": 0.5204521855445388, + "learning_rate": 4.656226834301053e-06, + "loss": 0.5826, + "step": 4379 + }, + { + "epoch": 1.7096018735362999, + "grad_norm": 0.5960983757985822, + "learning_rate": 4.653961322352088e-06, + "loss": 0.6092, + "step": 4380 + }, + { + "epoch": 1.709992193598751, + "grad_norm": 0.4933089533596694, + "learning_rate": 4.651695881785143e-06, + "loss": 0.5925, + "step": 4381 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.49752949594650336, + "learning_rate": 4.649430513067541e-06, + "loss": 0.5821, + "step": 4382 + }, + { + "epoch": 1.7107728337236534, + "grad_norm": 0.6196197100324713, + "learning_rate": 4.647165216666588e-06, + "loss": 0.5667, + "step": 4383 + }, + { + "epoch": 1.7111631537861047, + "grad_norm": 0.5385361413817906, + "learning_rate": 4.644899993049579e-06, + "loss": 0.6034, + "step": 4384 + }, + { + "epoch": 1.7115534738485558, + "grad_norm": 0.5510204826449279, + "learning_rate": 4.64263484268379e-06, + "loss": 0.6091, + "step": 4385 + }, + { + "epoch": 1.711943793911007, + "grad_norm": 0.521490020103371, + "learning_rate": 4.640369766036485e-06, + "loss": 0.5851, + "step": 4386 + }, + { + "epoch": 1.7123341139734582, + "grad_norm": 0.5604891794412856, + "learning_rate": 4.638104763574913e-06, + "loss": 0.6089, + "step": 4387 + }, + { + "epoch": 1.7127244340359096, + "grad_norm": 0.46759061602583357, + "learning_rate": 4.635839835766303e-06, + "loss": 0.6099, + "step": 4388 + }, + { + "epoch": 1.7131147540983607, + "grad_norm": 0.5423108193743893, + "learning_rate": 4.633574983077873e-06, + "loss": 0.6039, + "step": 4389 + }, + { + "epoch": 1.7135050741608118, + "grad_norm": 0.5610260970542236, + "learning_rate": 4.631310205976826e-06, + "loss": 0.5869, + "step": 4390 + }, + { + "epoch": 1.713895394223263, + "grad_norm": 0.5149704492084337, + "learning_rate": 4.629045504930346e-06, + "loss": 0.6524, + "step": 4391 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.49518849818537347, + "learning_rate": 4.626780880405604e-06, + "loss": 0.6, + "step": 4392 + }, + { + "epoch": 1.7146760343481655, + "grad_norm": 0.6019983548495857, + "learning_rate": 4.624516332869754e-06, + "loss": 0.6073, + "step": 4393 + }, + { + "epoch": 1.7150663544106166, + "grad_norm": 0.5777307351033126, + "learning_rate": 4.622251862789934e-06, + "loss": 0.6583, + "step": 4394 + }, + { + "epoch": 1.715456674473068, + "grad_norm": 0.5348144178809338, + "learning_rate": 4.61998747063327e-06, + "loss": 0.5773, + "step": 4395 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 0.5051815162316673, + "learning_rate": 4.617723156866863e-06, + "loss": 0.6299, + "step": 4396 + }, + { + "epoch": 1.7162373145979704, + "grad_norm": 0.5550799838936872, + "learning_rate": 4.615458921957807e-06, + "loss": 0.6386, + "step": 4397 + }, + { + "epoch": 1.7166276346604215, + "grad_norm": 0.5766006983483974, + "learning_rate": 4.613194766373174e-06, + "loss": 0.6138, + "step": 4398 + }, + { + "epoch": 1.7170179547228728, + "grad_norm": 0.6174952327116996, + "learning_rate": 4.610930690580022e-06, + "loss": 0.5724, + "step": 4399 + }, + { + "epoch": 1.717408274785324, + "grad_norm": 0.5484780642956215, + "learning_rate": 4.608666695045393e-06, + "loss": 0.5867, + "step": 4400 + }, + { + "epoch": 1.7177985948477752, + "grad_norm": 0.5758600975966156, + "learning_rate": 4.606402780236311e-06, + "loss": 0.5929, + "step": 4401 + }, + { + "epoch": 1.7181889149102263, + "grad_norm": 0.5532298100144862, + "learning_rate": 4.604138946619785e-06, + "loss": 0.6016, + "step": 4402 + }, + { + "epoch": 1.7185792349726776, + "grad_norm": 0.5448341454207121, + "learning_rate": 4.6018751946628045e-06, + "loss": 0.6021, + "step": 4403 + }, + { + "epoch": 1.718969555035129, + "grad_norm": 0.5377374622307952, + "learning_rate": 4.599611524832346e-06, + "loss": 0.58, + "step": 4404 + }, + { + "epoch": 1.71935987509758, + "grad_norm": 0.5305016561960975, + "learning_rate": 4.597347937595364e-06, + "loss": 0.6328, + "step": 4405 + }, + { + "epoch": 1.7197501951600311, + "grad_norm": 0.5576696962224132, + "learning_rate": 4.595084433418802e-06, + "loss": 0.6079, + "step": 4406 + }, + { + "epoch": 1.7201405152224825, + "grad_norm": 0.5936223325275047, + "learning_rate": 4.59282101276958e-06, + "loss": 0.5708, + "step": 4407 + }, + { + "epoch": 1.7205308352849338, + "grad_norm": 0.5838904681153585, + "learning_rate": 4.590557676114605e-06, + "loss": 0.623, + "step": 4408 + }, + { + "epoch": 1.7209211553473849, + "grad_norm": 0.6445021515711691, + "learning_rate": 4.5882944239207676e-06, + "loss": 0.5902, + "step": 4409 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.49586835647836913, + "learning_rate": 4.5860312566549366e-06, + "loss": 0.6099, + "step": 4410 + }, + { + "epoch": 1.7217017954722873, + "grad_norm": 0.5163783641025141, + "learning_rate": 4.583768174783968e-06, + "loss": 0.5841, + "step": 4411 + }, + { + "epoch": 1.7220921155347386, + "grad_norm": 0.5177871958921795, + "learning_rate": 4.581505178774696e-06, + "loss": 0.6055, + "step": 4412 + }, + { + "epoch": 1.7224824355971897, + "grad_norm": 0.5620988391559848, + "learning_rate": 4.579242269093943e-06, + "loss": 0.6239, + "step": 4413 + }, + { + "epoch": 1.7228727556596408, + "grad_norm": 0.5536419183749426, + "learning_rate": 4.576979446208504e-06, + "loss": 0.6237, + "step": 4414 + }, + { + "epoch": 1.7232630757220921, + "grad_norm": 0.5480891195880315, + "learning_rate": 4.574716710585164e-06, + "loss": 0.6152, + "step": 4415 + }, + { + "epoch": 1.7236533957845435, + "grad_norm": 0.4865784254542794, + "learning_rate": 4.572454062690688e-06, + "loss": 0.6128, + "step": 4416 + }, + { + "epoch": 1.7240437158469946, + "grad_norm": 0.5590653552380063, + "learning_rate": 4.570191502991821e-06, + "loss": 0.603, + "step": 4417 + }, + { + "epoch": 1.7244340359094457, + "grad_norm": 0.5974514906149917, + "learning_rate": 4.567929031955295e-06, + "loss": 0.6044, + "step": 4418 + }, + { + "epoch": 1.724824355971897, + "grad_norm": 0.5260802518060518, + "learning_rate": 4.565666650047818e-06, + "loss": 0.6107, + "step": 4419 + }, + { + "epoch": 1.7252146760343483, + "grad_norm": 0.650859945814554, + "learning_rate": 4.563404357736081e-06, + "loss": 0.6001, + "step": 4420 + }, + { + "epoch": 1.7256049960967994, + "grad_norm": 0.5907334435073056, + "learning_rate": 4.561142155486758e-06, + "loss": 0.6059, + "step": 4421 + }, + { + "epoch": 1.7259953161592505, + "grad_norm": 0.5287197935991305, + "learning_rate": 4.558880043766504e-06, + "loss": 0.5872, + "step": 4422 + }, + { + "epoch": 1.7263856362217018, + "grad_norm": 0.5005766929988991, + "learning_rate": 4.556618023041954e-06, + "loss": 0.5893, + "step": 4423 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.5259654375043591, + "learning_rate": 4.554356093779726e-06, + "loss": 0.583, + "step": 4424 + }, + { + "epoch": 1.7271662763466042, + "grad_norm": 0.6196168017478325, + "learning_rate": 4.552094256446418e-06, + "loss": 0.561, + "step": 4425 + }, + { + "epoch": 1.7275565964090553, + "grad_norm": 0.4831605974285316, + "learning_rate": 4.549832511508609e-06, + "loss": 0.569, + "step": 4426 + }, + { + "epoch": 1.7279469164715067, + "grad_norm": 0.5437249313153604, + "learning_rate": 4.547570859432861e-06, + "loss": 0.6066, + "step": 4427 + }, + { + "epoch": 1.728337236533958, + "grad_norm": 0.5188934156375344, + "learning_rate": 4.545309300685714e-06, + "loss": 0.5878, + "step": 4428 + }, + { + "epoch": 1.728727556596409, + "grad_norm": 0.4883460417987744, + "learning_rate": 4.5430478357336884e-06, + "loss": 0.5609, + "step": 4429 + }, + { + "epoch": 1.7291178766588602, + "grad_norm": 0.4978086937567886, + "learning_rate": 4.5407864650432895e-06, + "loss": 0.6234, + "step": 4430 + }, + { + "epoch": 1.7295081967213115, + "grad_norm": 0.5263548483835899, + "learning_rate": 4.538525189081002e-06, + "loss": 0.5983, + "step": 4431 + }, + { + "epoch": 1.7298985167837628, + "grad_norm": 0.4983608840515264, + "learning_rate": 4.5362640083132844e-06, + "loss": 0.6065, + "step": 4432 + }, + { + "epoch": 1.730288836846214, + "grad_norm": 0.5831907230706186, + "learning_rate": 4.534002923206583e-06, + "loss": 0.6028, + "step": 4433 + }, + { + "epoch": 1.730679156908665, + "grad_norm": 0.5980058095557214, + "learning_rate": 4.531741934227322e-06, + "loss": 0.6177, + "step": 4434 + }, + { + "epoch": 1.7310694769711163, + "grad_norm": 0.5590999269598805, + "learning_rate": 4.529481041841906e-06, + "loss": 0.5988, + "step": 4435 + }, + { + "epoch": 1.7314597970335677, + "grad_norm": 0.5504254726328422, + "learning_rate": 4.52722024651672e-06, + "loss": 0.6014, + "step": 4436 + }, + { + "epoch": 1.7318501170960188, + "grad_norm": 0.5909734493035507, + "learning_rate": 4.524959548718127e-06, + "loss": 0.611, + "step": 4437 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.5483040392365164, + "learning_rate": 4.522698948912472e-06, + "loss": 0.6017, + "step": 4438 + }, + { + "epoch": 1.7326307572209212, + "grad_norm": 0.5393023979207586, + "learning_rate": 4.520438447566081e-06, + "loss": 0.6153, + "step": 4439 + }, + { + "epoch": 1.7330210772833725, + "grad_norm": 0.4829729265510464, + "learning_rate": 4.518178045145255e-06, + "loss": 0.5966, + "step": 4440 + }, + { + "epoch": 1.7334113973458236, + "grad_norm": 0.6232956964039117, + "learning_rate": 4.515917742116279e-06, + "loss": 0.5835, + "step": 4441 + }, + { + "epoch": 1.7338017174082747, + "grad_norm": 0.613737610099097, + "learning_rate": 4.513657538945414e-06, + "loss": 0.6169, + "step": 4442 + }, + { + "epoch": 1.734192037470726, + "grad_norm": 0.6030320512158988, + "learning_rate": 4.511397436098904e-06, + "loss": 0.5788, + "step": 4443 + }, + { + "epoch": 1.7345823575331774, + "grad_norm": 0.6253903802919041, + "learning_rate": 4.50913743404297e-06, + "loss": 0.6212, + "step": 4444 + }, + { + "epoch": 1.7349726775956285, + "grad_norm": 0.4954680298297113, + "learning_rate": 4.506877533243813e-06, + "loss": 0.5697, + "step": 4445 + }, + { + "epoch": 1.7353629976580796, + "grad_norm": 0.5393100962174524, + "learning_rate": 4.504617734167614e-06, + "loss": 0.6018, + "step": 4446 + }, + { + "epoch": 1.7357533177205309, + "grad_norm": 0.5576636609189312, + "learning_rate": 4.50235803728053e-06, + "loss": 0.5733, + "step": 4447 + }, + { + "epoch": 1.7361436377829822, + "grad_norm": 0.5624648624768317, + "learning_rate": 4.500098443048702e-06, + "loss": 0.605, + "step": 4448 + }, + { + "epoch": 1.7365339578454333, + "grad_norm": 0.49722037696513777, + "learning_rate": 4.497838951938241e-06, + "loss": 0.5903, + "step": 4449 + }, + { + "epoch": 1.7369242779078844, + "grad_norm": 0.6224876035929839, + "learning_rate": 4.4955795644152475e-06, + "loss": 0.5917, + "step": 4450 + }, + { + "epoch": 1.7373145979703357, + "grad_norm": 0.49169637069470234, + "learning_rate": 4.493320280945794e-06, + "loss": 0.5768, + "step": 4451 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.5536045244199286, + "learning_rate": 4.491061101995932e-06, + "loss": 0.5514, + "step": 4452 + }, + { + "epoch": 1.7380952380952381, + "grad_norm": 0.5339782411415659, + "learning_rate": 4.488802028031695e-06, + "loss": 0.574, + "step": 4453 + }, + { + "epoch": 1.7384855581576892, + "grad_norm": 0.5715031148011412, + "learning_rate": 4.48654305951909e-06, + "loss": 0.6267, + "step": 4454 + }, + { + "epoch": 1.7388758782201406, + "grad_norm": 0.4766982371652581, + "learning_rate": 4.484284196924106e-06, + "loss": 0.6137, + "step": 4455 + }, + { + "epoch": 1.7392661982825919, + "grad_norm": 0.5191284692381402, + "learning_rate": 4.482025440712706e-06, + "loss": 0.6194, + "step": 4456 + }, + { + "epoch": 1.739656518345043, + "grad_norm": 0.5475213713967565, + "learning_rate": 4.47976679135084e-06, + "loss": 0.595, + "step": 4457 + }, + { + "epoch": 1.740046838407494, + "grad_norm": 0.5225068122439913, + "learning_rate": 4.477508249304423e-06, + "loss": 0.6157, + "step": 4458 + }, + { + "epoch": 1.7404371584699454, + "grad_norm": 0.53132751378563, + "learning_rate": 4.475249815039357e-06, + "loss": 0.5604, + "step": 4459 + }, + { + "epoch": 1.7408274785323967, + "grad_norm": 0.5741888121098606, + "learning_rate": 4.472991489021521e-06, + "loss": 0.5746, + "step": 4460 + }, + { + "epoch": 1.7412177985948478, + "grad_norm": 0.5424668140075379, + "learning_rate": 4.470733271716768e-06, + "loss": 0.5878, + "step": 4461 + }, + { + "epoch": 1.741608118657299, + "grad_norm": 0.5692509955549504, + "learning_rate": 4.468475163590931e-06, + "loss": 0.6058, + "step": 4462 + }, + { + "epoch": 1.7419984387197502, + "grad_norm": 0.5095870129566307, + "learning_rate": 4.466217165109822e-06, + "loss": 0.597, + "step": 4463 + }, + { + "epoch": 1.7423887587822016, + "grad_norm": 0.5031846254086833, + "learning_rate": 4.463959276739226e-06, + "loss": 0.5776, + "step": 4464 + }, + { + "epoch": 1.7427790788446527, + "grad_norm": 0.5638027524598096, + "learning_rate": 4.4617014989449094e-06, + "loss": 0.6151, + "step": 4465 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.5268264446916266, + "learning_rate": 4.459443832192616e-06, + "loss": 0.5819, + "step": 4466 + }, + { + "epoch": 1.743559718969555, + "grad_norm": 0.6077064278990938, + "learning_rate": 4.45718627694806e-06, + "loss": 0.5855, + "step": 4467 + }, + { + "epoch": 1.7439500390320064, + "grad_norm": 0.5248677545703491, + "learning_rate": 4.45492883367694e-06, + "loss": 0.6049, + "step": 4468 + }, + { + "epoch": 1.7443403590944575, + "grad_norm": 0.47750323830061603, + "learning_rate": 4.4526715028449305e-06, + "loss": 0.5609, + "step": 4469 + }, + { + "epoch": 1.7447306791569086, + "grad_norm": 0.6396111415340578, + "learning_rate": 4.4504142849176775e-06, + "loss": 0.6231, + "step": 4470 + }, + { + "epoch": 1.74512099921936, + "grad_norm": 0.5611531786248409, + "learning_rate": 4.448157180360812e-06, + "loss": 0.5933, + "step": 4471 + }, + { + "epoch": 1.7455113192818112, + "grad_norm": 0.5460736303303368, + "learning_rate": 4.445900189639933e-06, + "loss": 0.6358, + "step": 4472 + }, + { + "epoch": 1.7459016393442623, + "grad_norm": 0.5649997170041732, + "learning_rate": 4.4436433132206224e-06, + "loss": 0.6423, + "step": 4473 + }, + { + "epoch": 1.7462919594067134, + "grad_norm": 0.616977513884348, + "learning_rate": 4.441386551568437e-06, + "loss": 0.5748, + "step": 4474 + }, + { + "epoch": 1.7466822794691648, + "grad_norm": 0.525753954612684, + "learning_rate": 4.439129905148908e-06, + "loss": 0.5822, + "step": 4475 + }, + { + "epoch": 1.747072599531616, + "grad_norm": 0.5226596782039441, + "learning_rate": 4.436873374427543e-06, + "loss": 0.6062, + "step": 4476 + }, + { + "epoch": 1.7474629195940672, + "grad_norm": 0.5118861873672661, + "learning_rate": 4.434616959869828e-06, + "loss": 0.6127, + "step": 4477 + }, + { + "epoch": 1.7478532396565183, + "grad_norm": 0.5599572482223947, + "learning_rate": 4.432360661941223e-06, + "loss": 0.627, + "step": 4478 + }, + { + "epoch": 1.7482435597189696, + "grad_norm": 0.6060757102151018, + "learning_rate": 4.4301044811071655e-06, + "loss": 0.6091, + "step": 4479 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.5366845728991658, + "learning_rate": 4.427848417833067e-06, + "loss": 0.5939, + "step": 4480 + }, + { + "epoch": 1.749024199843872, + "grad_norm": 0.5982236215530278, + "learning_rate": 4.4255924725843175e-06, + "loss": 0.5872, + "step": 4481 + }, + { + "epoch": 1.7494145199063231, + "grad_norm": 0.6250198092275848, + "learning_rate": 4.423336645826281e-06, + "loss": 0.6271, + "step": 4482 + }, + { + "epoch": 1.7498048399687745, + "grad_norm": 0.5373337987395942, + "learning_rate": 4.421080938024295e-06, + "loss": 0.5936, + "step": 4483 + }, + { + "epoch": 1.7501951600312255, + "grad_norm": 0.574034708652969, + "learning_rate": 4.418825349643676e-06, + "loss": 0.6269, + "step": 4484 + }, + { + "epoch": 1.7505854800936769, + "grad_norm": 0.5281182831112651, + "learning_rate": 4.416569881149713e-06, + "loss": 0.6049, + "step": 4485 + }, + { + "epoch": 1.750975800156128, + "grad_norm": 0.6628912009383762, + "learning_rate": 4.414314533007673e-06, + "loss": 0.6015, + "step": 4486 + }, + { + "epoch": 1.751366120218579, + "grad_norm": 0.5189212498999587, + "learning_rate": 4.4120593056827955e-06, + "loss": 0.5964, + "step": 4487 + }, + { + "epoch": 1.7517564402810304, + "grad_norm": 0.5423115864864503, + "learning_rate": 4.409804199640297e-06, + "loss": 0.5813, + "step": 4488 + }, + { + "epoch": 1.7521467603434817, + "grad_norm": 0.5204779442032708, + "learning_rate": 4.407549215345369e-06, + "loss": 0.6439, + "step": 4489 + }, + { + "epoch": 1.7525370804059328, + "grad_norm": 0.5050779388588885, + "learning_rate": 4.405294353263177e-06, + "loss": 0.6062, + "step": 4490 + }, + { + "epoch": 1.752927400468384, + "grad_norm": 0.6174013424423191, + "learning_rate": 4.40303961385886e-06, + "loss": 0.5461, + "step": 4491 + }, + { + "epoch": 1.7533177205308352, + "grad_norm": 0.485529243255616, + "learning_rate": 4.400784997597537e-06, + "loss": 0.6068, + "step": 4492 + }, + { + "epoch": 1.7537080405932866, + "grad_norm": 0.47203877563573365, + "learning_rate": 4.398530504944293e-06, + "loss": 0.5807, + "step": 4493 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.541270058280939, + "learning_rate": 4.396276136364194e-06, + "loss": 0.6159, + "step": 4494 + }, + { + "epoch": 1.7544886807181888, + "grad_norm": 0.5997618238269831, + "learning_rate": 4.39402189232228e-06, + "loss": 0.6193, + "step": 4495 + }, + { + "epoch": 1.75487900078064, + "grad_norm": 0.5498161117034054, + "learning_rate": 4.391767773283563e-06, + "loss": 0.5872, + "step": 4496 + }, + { + "epoch": 1.7552693208430914, + "grad_norm": 0.472944754626112, + "learning_rate": 4.38951377971303e-06, + "loss": 0.6271, + "step": 4497 + }, + { + "epoch": 1.7556596409055425, + "grad_norm": 0.5954564701548846, + "learning_rate": 4.387259912075643e-06, + "loss": 0.5642, + "step": 4498 + }, + { + "epoch": 1.7560499609679936, + "grad_norm": 0.6029299621599525, + "learning_rate": 4.385006170836338e-06, + "loss": 0.6281, + "step": 4499 + }, + { + "epoch": 1.756440281030445, + "grad_norm": 0.4685638755452714, + "learning_rate": 4.382752556460023e-06, + "loss": 0.597, + "step": 4500 + }, + { + "epoch": 1.7568306010928962, + "grad_norm": 0.47803921691331397, + "learning_rate": 4.380499069411583e-06, + "loss": 0.6026, + "step": 4501 + }, + { + "epoch": 1.7572209211553473, + "grad_norm": 0.4922832924274862, + "learning_rate": 4.378245710155872e-06, + "loss": 0.585, + "step": 4502 + }, + { + "epoch": 1.7576112412177984, + "grad_norm": 0.5025924844038757, + "learning_rate": 4.375992479157723e-06, + "loss": 0.5804, + "step": 4503 + }, + { + "epoch": 1.7580015612802498, + "grad_norm": 0.5787187080771219, + "learning_rate": 4.373739376881938e-06, + "loss": 0.5704, + "step": 4504 + }, + { + "epoch": 1.758391881342701, + "grad_norm": 0.5046406613369471, + "learning_rate": 4.371486403793295e-06, + "loss": 0.5929, + "step": 4505 + }, + { + "epoch": 1.7587822014051522, + "grad_norm": 0.5886644233806194, + "learning_rate": 4.369233560356546e-06, + "loss": 0.6101, + "step": 4506 + }, + { + "epoch": 1.7591725214676033, + "grad_norm": 0.5057554960588989, + "learning_rate": 4.366980847036415e-06, + "loss": 0.5924, + "step": 4507 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.53775630253733, + "learning_rate": 4.364728264297598e-06, + "loss": 0.5966, + "step": 4508 + }, + { + "epoch": 1.759953161592506, + "grad_norm": 0.5963687740284846, + "learning_rate": 4.362475812604766e-06, + "loss": 0.5764, + "step": 4509 + }, + { + "epoch": 1.760343481654957, + "grad_norm": 0.5542141620383045, + "learning_rate": 4.3602234924225655e-06, + "loss": 0.5804, + "step": 4510 + }, + { + "epoch": 1.7607338017174081, + "grad_norm": 0.5113139726888548, + "learning_rate": 4.357971304215607e-06, + "loss": 0.6219, + "step": 4511 + }, + { + "epoch": 1.7611241217798594, + "grad_norm": 0.4831385873148574, + "learning_rate": 4.355719248448482e-06, + "loss": 0.6391, + "step": 4512 + }, + { + "epoch": 1.7615144418423108, + "grad_norm": 0.5141189051271341, + "learning_rate": 4.353467325585752e-06, + "loss": 0.6092, + "step": 4513 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 0.5383938712321672, + "learning_rate": 4.351215536091951e-06, + "loss": 0.603, + "step": 4514 + }, + { + "epoch": 1.762295081967213, + "grad_norm": 0.5605746155319983, + "learning_rate": 4.348963880431586e-06, + "loss": 0.6111, + "step": 4515 + }, + { + "epoch": 1.7626854020296643, + "grad_norm": 0.519772421551826, + "learning_rate": 4.346712359069137e-06, + "loss": 0.6117, + "step": 4516 + }, + { + "epoch": 1.7630757220921156, + "grad_norm": 0.5507756668787547, + "learning_rate": 4.344460972469054e-06, + "loss": 0.6278, + "step": 4517 + }, + { + "epoch": 1.7634660421545667, + "grad_norm": 0.5239159936898425, + "learning_rate": 4.342209721095761e-06, + "loss": 0.5762, + "step": 4518 + }, + { + "epoch": 1.7638563622170178, + "grad_norm": 0.49259414682224173, + "learning_rate": 4.339958605413654e-06, + "loss": 0.6281, + "step": 4519 + }, + { + "epoch": 1.7642466822794691, + "grad_norm": 0.4492817025439685, + "learning_rate": 4.337707625887102e-06, + "loss": 0.6195, + "step": 4520 + }, + { + "epoch": 1.7646370023419204, + "grad_norm": 0.5817556287170957, + "learning_rate": 4.33545678298044e-06, + "loss": 0.5989, + "step": 4521 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.4943533552101555, + "learning_rate": 4.3332060771579846e-06, + "loss": 0.5785, + "step": 4522 + }, + { + "epoch": 1.7654176424668226, + "grad_norm": 0.50841758787534, + "learning_rate": 4.330955508884016e-06, + "loss": 0.6356, + "step": 4523 + }, + { + "epoch": 1.765807962529274, + "grad_norm": 0.5522184919316122, + "learning_rate": 4.328705078622789e-06, + "loss": 0.5811, + "step": 4524 + }, + { + "epoch": 1.7661982825917253, + "grad_norm": 0.5828385919659995, + "learning_rate": 4.326454786838531e-06, + "loss": 0.579, + "step": 4525 + }, + { + "epoch": 1.7665886026541764, + "grad_norm": 0.5194547616880056, + "learning_rate": 4.32420463399544e-06, + "loss": 0.6124, + "step": 4526 + }, + { + "epoch": 1.7669789227166275, + "grad_norm": 0.47972801902218437, + "learning_rate": 4.321954620557684e-06, + "loss": 0.614, + "step": 4527 + }, + { + "epoch": 1.7673692427790788, + "grad_norm": 0.579381331699885, + "learning_rate": 4.319704746989407e-06, + "loss": 0.5683, + "step": 4528 + }, + { + "epoch": 1.7677595628415301, + "grad_norm": 0.5197890246947291, + "learning_rate": 4.317455013754714e-06, + "loss": 0.5963, + "step": 4529 + }, + { + "epoch": 1.7681498829039812, + "grad_norm": 0.515252039841962, + "learning_rate": 4.315205421317691e-06, + "loss": 0.6144, + "step": 4530 + }, + { + "epoch": 1.7685402029664323, + "grad_norm": 0.4789271670346705, + "learning_rate": 4.3129559701423915e-06, + "loss": 0.6051, + "step": 4531 + }, + { + "epoch": 1.7689305230288837, + "grad_norm": 0.5376673383886501, + "learning_rate": 4.31070666069284e-06, + "loss": 0.6213, + "step": 4532 + }, + { + "epoch": 1.769320843091335, + "grad_norm": 0.5015888293209433, + "learning_rate": 4.308457493433031e-06, + "loss": 0.6093, + "step": 4533 + }, + { + "epoch": 1.769711163153786, + "grad_norm": 0.4926889817299165, + "learning_rate": 4.306208468826931e-06, + "loss": 0.6001, + "step": 4534 + }, + { + "epoch": 1.7701014832162372, + "grad_norm": 0.49132003957484804, + "learning_rate": 4.303959587338475e-06, + "loss": 0.5989, + "step": 4535 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.602819179180311, + "learning_rate": 4.301710849431573e-06, + "loss": 0.5883, + "step": 4536 + }, + { + "epoch": 1.7708821233411398, + "grad_norm": 0.5048260344980746, + "learning_rate": 4.299462255570098e-06, + "loss": 0.609, + "step": 4537 + }, + { + "epoch": 1.771272443403591, + "grad_norm": 0.45980339257771363, + "learning_rate": 4.297213806217899e-06, + "loss": 0.6105, + "step": 4538 + }, + { + "epoch": 1.771662763466042, + "grad_norm": 0.6164234375013472, + "learning_rate": 4.294965501838795e-06, + "loss": 0.6156, + "step": 4539 + }, + { + "epoch": 1.7720530835284933, + "grad_norm": 0.56757121199655, + "learning_rate": 4.292717342896572e-06, + "loss": 0.6125, + "step": 4540 + }, + { + "epoch": 1.7724434035909447, + "grad_norm": 0.5161141605061762, + "learning_rate": 4.290469329854988e-06, + "loss": 0.6024, + "step": 4541 + }, + { + "epoch": 1.7728337236533958, + "grad_norm": 0.5124652213652805, + "learning_rate": 4.288221463177772e-06, + "loss": 0.5924, + "step": 4542 + }, + { + "epoch": 1.7732240437158469, + "grad_norm": 0.5680462924568278, + "learning_rate": 4.28597374332862e-06, + "loss": 0.5825, + "step": 4543 + }, + { + "epoch": 1.7736143637782982, + "grad_norm": 0.5245815947527385, + "learning_rate": 4.283726170771201e-06, + "loss": 0.5909, + "step": 4544 + }, + { + "epoch": 1.7740046838407495, + "grad_norm": 0.5778604873325077, + "learning_rate": 4.281478745969152e-06, + "loss": 0.6269, + "step": 4545 + }, + { + "epoch": 1.7743950039032006, + "grad_norm": 0.5218259323208692, + "learning_rate": 4.279231469386076e-06, + "loss": 0.5694, + "step": 4546 + }, + { + "epoch": 1.7747853239656517, + "grad_norm": 0.4850680139545695, + "learning_rate": 4.276984341485552e-06, + "loss": 0.6355, + "step": 4547 + }, + { + "epoch": 1.775175644028103, + "grad_norm": 0.6394708057185605, + "learning_rate": 4.274737362731122e-06, + "loss": 0.6117, + "step": 4548 + }, + { + "epoch": 1.7755659640905543, + "grad_norm": 0.568552755997075, + "learning_rate": 4.272490533586304e-06, + "loss": 0.6055, + "step": 4549 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.4626311304493895, + "learning_rate": 4.270243854514579e-06, + "loss": 0.61, + "step": 4550 + }, + { + "epoch": 1.7763466042154565, + "grad_norm": 0.47023480232409104, + "learning_rate": 4.2679973259794005e-06, + "loss": 0.5929, + "step": 4551 + }, + { + "epoch": 1.7767369242779079, + "grad_norm": 0.48441187822603765, + "learning_rate": 4.2657509484441886e-06, + "loss": 0.5651, + "step": 4552 + }, + { + "epoch": 1.7771272443403592, + "grad_norm": 0.501045085130341, + "learning_rate": 4.263504722372335e-06, + "loss": 0.5463, + "step": 4553 + }, + { + "epoch": 1.7775175644028103, + "grad_norm": 0.5572665641502417, + "learning_rate": 4.261258648227201e-06, + "loss": 0.6047, + "step": 4554 + }, + { + "epoch": 1.7779078844652614, + "grad_norm": 0.5013350191030715, + "learning_rate": 4.259012726472109e-06, + "loss": 0.5653, + "step": 4555 + }, + { + "epoch": 1.7782982045277127, + "grad_norm": 0.5668714487823654, + "learning_rate": 4.256766957570358e-06, + "loss": 0.5746, + "step": 4556 + }, + { + "epoch": 1.778688524590164, + "grad_norm": 0.5851591035085381, + "learning_rate": 4.254521341985213e-06, + "loss": 0.6249, + "step": 4557 + }, + { + "epoch": 1.7790788446526151, + "grad_norm": 0.5065034450254111, + "learning_rate": 4.252275880179906e-06, + "loss": 0.6164, + "step": 4558 + }, + { + "epoch": 1.7794691647150662, + "grad_norm": 0.5478705316245133, + "learning_rate": 4.25003057261764e-06, + "loss": 0.5641, + "step": 4559 + }, + { + "epoch": 1.7798594847775175, + "grad_norm": 0.49386114645016566, + "learning_rate": 4.247785419761583e-06, + "loss": 0.6022, + "step": 4560 + }, + { + "epoch": 1.7802498048399689, + "grad_norm": 0.4887726690261751, + "learning_rate": 4.245540422074873e-06, + "loss": 0.6116, + "step": 4561 + }, + { + "epoch": 1.78064012490242, + "grad_norm": 0.5076553885578294, + "learning_rate": 4.243295580020619e-06, + "loss": 0.5878, + "step": 4562 + }, + { + "epoch": 1.781030444964871, + "grad_norm": 0.5780910606063929, + "learning_rate": 4.241050894061891e-06, + "loss": 0.6003, + "step": 4563 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.5530124416463351, + "learning_rate": 4.238806364661729e-06, + "loss": 0.6198, + "step": 4564 + }, + { + "epoch": 1.7818110850897737, + "grad_norm": 0.5673598057455532, + "learning_rate": 4.236561992283145e-06, + "loss": 0.5952, + "step": 4565 + }, + { + "epoch": 1.7822014051522248, + "grad_norm": 0.5515688112158112, + "learning_rate": 4.234317777389115e-06, + "loss": 0.5924, + "step": 4566 + }, + { + "epoch": 1.782591725214676, + "grad_norm": 0.5488529547200658, + "learning_rate": 4.232073720442582e-06, + "loss": 0.5569, + "step": 4567 + }, + { + "epoch": 1.7829820452771272, + "grad_norm": 0.6261165919702744, + "learning_rate": 4.2298298219064585e-06, + "loss": 0.5978, + "step": 4568 + }, + { + "epoch": 1.7833723653395785, + "grad_norm": 0.5552947261314819, + "learning_rate": 4.227586082243624e-06, + "loss": 0.5907, + "step": 4569 + }, + { + "epoch": 1.7837626854020296, + "grad_norm": 0.5575661190632667, + "learning_rate": 4.225342501916923e-06, + "loss": 0.5975, + "step": 4570 + }, + { + "epoch": 1.7841530054644807, + "grad_norm": 0.4913422285807874, + "learning_rate": 4.223099081389171e-06, + "loss": 0.6013, + "step": 4571 + }, + { + "epoch": 1.784543325526932, + "grad_norm": 0.6118092914103314, + "learning_rate": 4.220855821123149e-06, + "loss": 0.5876, + "step": 4572 + }, + { + "epoch": 1.7849336455893834, + "grad_norm": 0.6210216746062175, + "learning_rate": 4.2186127215815995e-06, + "loss": 0.5904, + "step": 4573 + }, + { + "epoch": 1.7853239656518345, + "grad_norm": 0.5347997637959718, + "learning_rate": 4.21636978322724e-06, + "loss": 0.6241, + "step": 4574 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.5205227376464578, + "learning_rate": 4.2141270065227506e-06, + "loss": 0.5766, + "step": 4575 + }, + { + "epoch": 1.786104605776737, + "grad_norm": 0.5189052502345662, + "learning_rate": 4.211884391930779e-06, + "loss": 0.5946, + "step": 4576 + }, + { + "epoch": 1.7864949258391882, + "grad_norm": 0.6057762520763441, + "learning_rate": 4.20964193991394e-06, + "loss": 0.5725, + "step": 4577 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.4987929841584391, + "learning_rate": 4.2073996509348135e-06, + "loss": 0.5763, + "step": 4578 + }, + { + "epoch": 1.7872755659640904, + "grad_norm": 0.5410103522009615, + "learning_rate": 4.205157525455946e-06, + "loss": 0.5668, + "step": 4579 + }, + { + "epoch": 1.7876658860265418, + "grad_norm": 0.5280915425891055, + "learning_rate": 4.202915563939852e-06, + "loss": 0.5597, + "step": 4580 + }, + { + "epoch": 1.788056206088993, + "grad_norm": 0.6405961668764727, + "learning_rate": 4.200673766849009e-06, + "loss": 0.5995, + "step": 4581 + }, + { + "epoch": 1.7884465261514442, + "grad_norm": 0.5277189382222914, + "learning_rate": 4.198432134645863e-06, + "loss": 0.609, + "step": 4582 + }, + { + "epoch": 1.7888368462138953, + "grad_norm": 0.5744525903655688, + "learning_rate": 4.196190667792827e-06, + "loss": 0.5856, + "step": 4583 + }, + { + "epoch": 1.7892271662763466, + "grad_norm": 0.5610777390319261, + "learning_rate": 4.193949366752275e-06, + "loss": 0.6026, + "step": 4584 + }, + { + "epoch": 1.789617486338798, + "grad_norm": 0.5571023017960018, + "learning_rate": 4.191708231986554e-06, + "loss": 0.6066, + "step": 4585 + }, + { + "epoch": 1.790007806401249, + "grad_norm": 0.5713857342683251, + "learning_rate": 4.189467263957968e-06, + "loss": 0.6112, + "step": 4586 + }, + { + "epoch": 1.7903981264637001, + "grad_norm": 0.5911133921031062, + "learning_rate": 4.187226463128796e-06, + "loss": 0.651, + "step": 4587 + }, + { + "epoch": 1.7907884465261514, + "grad_norm": 0.49657857933998784, + "learning_rate": 4.184985829961276e-06, + "loss": 0.5875, + "step": 4588 + }, + { + "epoch": 1.7911787665886028, + "grad_norm": 0.5315299406925197, + "learning_rate": 4.182745364917616e-06, + "loss": 0.6283, + "step": 4589 + }, + { + "epoch": 1.7915690866510539, + "grad_norm": 0.5330691808656378, + "learning_rate": 4.1805050684599815e-06, + "loss": 0.5848, + "step": 4590 + }, + { + "epoch": 1.791959406713505, + "grad_norm": 0.5752779793811721, + "learning_rate": 4.178264941050511e-06, + "loss": 0.6187, + "step": 4591 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 0.46612473580095326, + "learning_rate": 4.176024983151306e-06, + "loss": 0.5538, + "step": 4592 + }, + { + "epoch": 1.7927400468384076, + "grad_norm": 0.5258393058941218, + "learning_rate": 4.173785195224432e-06, + "loss": 0.6137, + "step": 4593 + }, + { + "epoch": 1.7931303669008587, + "grad_norm": 0.5533638372229328, + "learning_rate": 4.17154557773192e-06, + "loss": 0.6178, + "step": 4594 + }, + { + "epoch": 1.7935206869633098, + "grad_norm": 0.5319939024510263, + "learning_rate": 4.169306131135765e-06, + "loss": 0.6347, + "step": 4595 + }, + { + "epoch": 1.7939110070257611, + "grad_norm": 0.6049765255719229, + "learning_rate": 4.16706685589793e-06, + "loss": 0.55, + "step": 4596 + }, + { + "epoch": 1.7943013270882124, + "grad_norm": 0.5208699675512196, + "learning_rate": 4.164827752480338e-06, + "loss": 0.5993, + "step": 4597 + }, + { + "epoch": 1.7946916471506635, + "grad_norm": 0.5923346558577397, + "learning_rate": 4.16258882134488e-06, + "loss": 0.5871, + "step": 4598 + }, + { + "epoch": 1.7950819672131146, + "grad_norm": 0.5073547574661337, + "learning_rate": 4.16035006295341e-06, + "loss": 0.5789, + "step": 4599 + }, + { + "epoch": 1.795472287275566, + "grad_norm": 0.5144285906989928, + "learning_rate": 4.158111477767745e-06, + "loss": 0.5489, + "step": 4600 + }, + { + "epoch": 1.7958626073380173, + "grad_norm": 0.5589879823266137, + "learning_rate": 4.155873066249669e-06, + "loss": 0.5773, + "step": 4601 + }, + { + "epoch": 1.7962529274004684, + "grad_norm": 0.5305601819374817, + "learning_rate": 4.1536348288609294e-06, + "loss": 0.5868, + "step": 4602 + }, + { + "epoch": 1.7966432474629195, + "grad_norm": 0.5663372178279747, + "learning_rate": 4.151396766063236e-06, + "loss": 0.5885, + "step": 4603 + }, + { + "epoch": 1.7970335675253708, + "grad_norm": 0.5601307004947966, + "learning_rate": 4.149158878318266e-06, + "loss": 0.6002, + "step": 4604 + }, + { + "epoch": 1.7974238875878221, + "grad_norm": 0.5031309915648524, + "learning_rate": 4.146921166087657e-06, + "loss": 0.5848, + "step": 4605 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 0.5342523023714103, + "learning_rate": 4.144683629833012e-06, + "loss": 0.5952, + "step": 4606 + }, + { + "epoch": 1.7982045277127243, + "grad_norm": 0.5526299004421872, + "learning_rate": 4.1424462700158995e-06, + "loss": 0.6317, + "step": 4607 + }, + { + "epoch": 1.7985948477751756, + "grad_norm": 0.4804369208703573, + "learning_rate": 4.140209087097845e-06, + "loss": 0.5891, + "step": 4608 + }, + { + "epoch": 1.798985167837627, + "grad_norm": 0.55838267509627, + "learning_rate": 4.137972081540346e-06, + "loss": 0.6046, + "step": 4609 + }, + { + "epoch": 1.799375487900078, + "grad_norm": 0.5831897030893941, + "learning_rate": 4.135735253804858e-06, + "loss": 0.5991, + "step": 4610 + }, + { + "epoch": 1.7997658079625292, + "grad_norm": 0.5466989864100853, + "learning_rate": 4.133498604352801e-06, + "loss": 0.5984, + "step": 4611 + }, + { + "epoch": 1.8001561280249805, + "grad_norm": 0.5178024216695838, + "learning_rate": 4.131262133645561e-06, + "loss": 0.5784, + "step": 4612 + }, + { + "epoch": 1.8005464480874318, + "grad_norm": 0.5429628720743681, + "learning_rate": 4.129025842144481e-06, + "loss": 0.5872, + "step": 4613 + }, + { + "epoch": 1.800936768149883, + "grad_norm": 0.645979604711227, + "learning_rate": 4.126789730310874e-06, + "loss": 0.6258, + "step": 4614 + }, + { + "epoch": 1.801327088212334, + "grad_norm": 0.5589702992733907, + "learning_rate": 4.1245537986060095e-06, + "loss": 0.5908, + "step": 4615 + }, + { + "epoch": 1.8017174082747853, + "grad_norm": 0.535503201055124, + "learning_rate": 4.122318047491127e-06, + "loss": 0.5979, + "step": 4616 + }, + { + "epoch": 1.8021077283372366, + "grad_norm": 0.4920360584047805, + "learning_rate": 4.120082477427422e-06, + "loss": 0.5872, + "step": 4617 + }, + { + "epoch": 1.8024980483996877, + "grad_norm": 0.5466543916194436, + "learning_rate": 4.117847088876054e-06, + "loss": 0.5952, + "step": 4618 + }, + { + "epoch": 1.8028883684621388, + "grad_norm": 0.5147563198213293, + "learning_rate": 4.115611882298149e-06, + "loss": 0.6174, + "step": 4619 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.5229246430671575, + "learning_rate": 4.113376858154792e-06, + "loss": 0.5796, + "step": 4620 + }, + { + "epoch": 1.8036690085870415, + "grad_norm": 0.5438242797069046, + "learning_rate": 4.111142016907032e-06, + "loss": 0.5555, + "step": 4621 + }, + { + "epoch": 1.8040593286494926, + "grad_norm": 0.5237654275582778, + "learning_rate": 4.108907359015879e-06, + "loss": 0.5694, + "step": 4622 + }, + { + "epoch": 1.8044496487119437, + "grad_norm": 0.5315329825492192, + "learning_rate": 4.106672884942306e-06, + "loss": 0.636, + "step": 4623 + }, + { + "epoch": 1.804839968774395, + "grad_norm": 0.5394308178157515, + "learning_rate": 4.104438595147247e-06, + "loss": 0.5907, + "step": 4624 + }, + { + "epoch": 1.8052302888368463, + "grad_norm": 0.6071981516792139, + "learning_rate": 4.102204490091601e-06, + "loss": 0.5859, + "step": 4625 + }, + { + "epoch": 1.8056206088992974, + "grad_norm": 0.5115324484697965, + "learning_rate": 4.099970570236223e-06, + "loss": 0.6055, + "step": 4626 + }, + { + "epoch": 1.8060109289617485, + "grad_norm": 0.5363385472639792, + "learning_rate": 4.0977368360419366e-06, + "loss": 0.6147, + "step": 4627 + }, + { + "epoch": 1.8064012490241999, + "grad_norm": 0.5753884241226932, + "learning_rate": 4.095503287969522e-06, + "loss": 0.6003, + "step": 4628 + }, + { + "epoch": 1.8067915690866512, + "grad_norm": 0.5792257134940149, + "learning_rate": 4.093269926479725e-06, + "loss": 0.6051, + "step": 4629 + }, + { + "epoch": 1.8071818891491023, + "grad_norm": 0.5040353319497751, + "learning_rate": 4.091036752033249e-06, + "loss": 0.5841, + "step": 4630 + }, + { + "epoch": 1.8075722092115534, + "grad_norm": 0.5285295744129122, + "learning_rate": 4.088803765090762e-06, + "loss": 0.5992, + "step": 4631 + }, + { + "epoch": 1.8079625292740047, + "grad_norm": 0.5906241084891009, + "learning_rate": 4.086570966112891e-06, + "loss": 0.6176, + "step": 4632 + }, + { + "epoch": 1.808352849336456, + "grad_norm": 0.5381039589953762, + "learning_rate": 4.084338355560229e-06, + "loss": 0.5988, + "step": 4633 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.5468546645535357, + "learning_rate": 4.0821059338933196e-06, + "loss": 0.5985, + "step": 4634 + }, + { + "epoch": 1.8091334894613582, + "grad_norm": 0.5582915355480468, + "learning_rate": 4.079873701572679e-06, + "loss": 0.5702, + "step": 4635 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.5247113233111182, + "learning_rate": 4.0776416590587775e-06, + "loss": 0.5878, + "step": 4636 + }, + { + "epoch": 1.8099141295862609, + "grad_norm": 0.5368419411354358, + "learning_rate": 4.07540980681205e-06, + "loss": 0.5956, + "step": 4637 + }, + { + "epoch": 1.810304449648712, + "grad_norm": 0.541177023901852, + "learning_rate": 4.07317814529289e-06, + "loss": 0.5882, + "step": 4638 + }, + { + "epoch": 1.810694769711163, + "grad_norm": 0.5761621887379828, + "learning_rate": 4.07094667496165e-06, + "loss": 0.5644, + "step": 4639 + }, + { + "epoch": 1.8110850897736144, + "grad_norm": 0.4986528648128595, + "learning_rate": 4.0687153962786485e-06, + "loss": 0.5938, + "step": 4640 + }, + { + "epoch": 1.8114754098360657, + "grad_norm": 0.563297271371124, + "learning_rate": 4.066484309704159e-06, + "loss": 0.6288, + "step": 4641 + }, + { + "epoch": 1.8118657298985168, + "grad_norm": 0.5478632638781766, + "learning_rate": 4.064253415698421e-06, + "loss": 0.573, + "step": 4642 + }, + { + "epoch": 1.812256049960968, + "grad_norm": 0.49495582941262556, + "learning_rate": 4.062022714721627e-06, + "loss": 0.5966, + "step": 4643 + }, + { + "epoch": 1.8126463700234192, + "grad_norm": 0.5673406075712288, + "learning_rate": 4.059792207233933e-06, + "loss": 0.6157, + "step": 4644 + }, + { + "epoch": 1.8130366900858705, + "grad_norm": 0.5735623477593939, + "learning_rate": 4.057561893695456e-06, + "loss": 0.598, + "step": 4645 + }, + { + "epoch": 1.8134270101483216, + "grad_norm": 0.5201265789770584, + "learning_rate": 4.0553317745662765e-06, + "loss": 0.5852, + "step": 4646 + }, + { + "epoch": 1.8138173302107727, + "grad_norm": 0.5181995468603513, + "learning_rate": 4.053101850306427e-06, + "loss": 0.5941, + "step": 4647 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 0.5127506707593613, + "learning_rate": 4.0508721213759035e-06, + "loss": 0.6239, + "step": 4648 + }, + { + "epoch": 1.8145979703356754, + "grad_norm": 0.5271659304732093, + "learning_rate": 4.048642588234665e-06, + "loss": 0.6013, + "step": 4649 + }, + { + "epoch": 1.8149882903981265, + "grad_norm": 0.47636409927867346, + "learning_rate": 4.046413251342624e-06, + "loss": 0.6316, + "step": 4650 + }, + { + "epoch": 1.8153786104605776, + "grad_norm": 0.5724448964622985, + "learning_rate": 4.04418411115966e-06, + "loss": 0.5914, + "step": 4651 + }, + { + "epoch": 1.815768930523029, + "grad_norm": 0.4918994418123288, + "learning_rate": 4.041955168145601e-06, + "loss": 0.5871, + "step": 4652 + }, + { + "epoch": 1.8161592505854802, + "grad_norm": 0.5539272656714055, + "learning_rate": 4.039726422760246e-06, + "loss": 0.5838, + "step": 4653 + }, + { + "epoch": 1.8165495706479313, + "grad_norm": 0.5246123722762733, + "learning_rate": 4.0374978754633456e-06, + "loss": 0.6017, + "step": 4654 + }, + { + "epoch": 1.8169398907103824, + "grad_norm": 0.49563720890361956, + "learning_rate": 4.035269526714613e-06, + "loss": 0.5876, + "step": 4655 + }, + { + "epoch": 1.8173302107728337, + "grad_norm": 0.500332806572326, + "learning_rate": 4.033041376973719e-06, + "loss": 0.6226, + "step": 4656 + }, + { + "epoch": 1.817720530835285, + "grad_norm": 0.6782727279241889, + "learning_rate": 4.0308134267002945e-06, + "loss": 0.5881, + "step": 4657 + }, + { + "epoch": 1.8181108508977362, + "grad_norm": 0.5731415056352117, + "learning_rate": 4.028585676353928e-06, + "loss": 0.6096, + "step": 4658 + }, + { + "epoch": 1.8185011709601873, + "grad_norm": 0.5440462028728036, + "learning_rate": 4.0263581263941685e-06, + "loss": 0.6032, + "step": 4659 + }, + { + "epoch": 1.8188914910226386, + "grad_norm": 0.6195909319326732, + "learning_rate": 4.024130777280521e-06, + "loss": 0.5827, + "step": 4660 + }, + { + "epoch": 1.81928181108509, + "grad_norm": 0.6658937263235899, + "learning_rate": 4.02190362947245e-06, + "loss": 0.604, + "step": 4661 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.5578101941771763, + "learning_rate": 4.019676683429381e-06, + "loss": 0.6261, + "step": 4662 + }, + { + "epoch": 1.820062451209992, + "grad_norm": 0.5467159277064574, + "learning_rate": 4.0174499396106945e-06, + "loss": 0.6224, + "step": 4663 + }, + { + "epoch": 1.8204527712724434, + "grad_norm": 0.5025014486160846, + "learning_rate": 4.0152233984757315e-06, + "loss": 0.6188, + "step": 4664 + }, + { + "epoch": 1.8208430913348947, + "grad_norm": 0.6155022652849859, + "learning_rate": 4.012997060483789e-06, + "loss": 0.5812, + "step": 4665 + }, + { + "epoch": 1.8212334113973458, + "grad_norm": 0.5717948994433534, + "learning_rate": 4.010770926094125e-06, + "loss": 0.5881, + "step": 4666 + }, + { + "epoch": 1.821623731459797, + "grad_norm": 0.5663545295252486, + "learning_rate": 4.008544995765953e-06, + "loss": 0.6493, + "step": 4667 + }, + { + "epoch": 1.8220140515222483, + "grad_norm": 0.4744776716790382, + "learning_rate": 4.006319269958446e-06, + "loss": 0.6031, + "step": 4668 + }, + { + "epoch": 1.8224043715846996, + "grad_norm": 0.5614380017666132, + "learning_rate": 4.0040937491307366e-06, + "loss": 0.5765, + "step": 4669 + }, + { + "epoch": 1.8227946916471507, + "grad_norm": 0.60333299758144, + "learning_rate": 4.001868433741907e-06, + "loss": 0.5782, + "step": 4670 + }, + { + "epoch": 1.8231850117096018, + "grad_norm": 0.559618321124399, + "learning_rate": 3.999643324251006e-06, + "loss": 0.5979, + "step": 4671 + }, + { + "epoch": 1.823575331772053, + "grad_norm": 0.48845078470721837, + "learning_rate": 3.997418421117036e-06, + "loss": 0.5914, + "step": 4672 + }, + { + "epoch": 1.8239656518345044, + "grad_norm": 0.6029947964023078, + "learning_rate": 3.99519372479896e-06, + "loss": 0.6085, + "step": 4673 + }, + { + "epoch": 1.8243559718969555, + "grad_norm": 0.48259001893377784, + "learning_rate": 3.9929692357556915e-06, + "loss": 0.5797, + "step": 4674 + }, + { + "epoch": 1.8247462919594066, + "grad_norm": 0.568761861154036, + "learning_rate": 3.990744954446109e-06, + "loss": 0.5719, + "step": 4675 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.5914828641838717, + "learning_rate": 3.988520881329044e-06, + "loss": 0.5806, + "step": 4676 + }, + { + "epoch": 1.8255269320843093, + "grad_norm": 0.5625924664432, + "learning_rate": 3.986297016863286e-06, + "loss": 0.6072, + "step": 4677 + }, + { + "epoch": 1.8259172521467604, + "grad_norm": 0.5713176848524548, + "learning_rate": 3.9840733615075785e-06, + "loss": 0.6195, + "step": 4678 + }, + { + "epoch": 1.8263075722092115, + "grad_norm": 0.5522481343590697, + "learning_rate": 3.981849915720629e-06, + "loss": 0.5764, + "step": 4679 + }, + { + "epoch": 1.8266978922716628, + "grad_norm": 0.5550531388326752, + "learning_rate": 3.979626679961093e-06, + "loss": 0.619, + "step": 4680 + }, + { + "epoch": 1.8270882123341141, + "grad_norm": 0.5547951807293848, + "learning_rate": 3.9774036546875885e-06, + "loss": 0.5931, + "step": 4681 + }, + { + "epoch": 1.8274785323965652, + "grad_norm": 0.47027140429595465, + "learning_rate": 3.97518084035869e-06, + "loss": 0.6014, + "step": 4682 + }, + { + "epoch": 1.8278688524590163, + "grad_norm": 0.5392846315051808, + "learning_rate": 3.972958237432925e-06, + "loss": 0.6094, + "step": 4683 + }, + { + "epoch": 1.8282591725214676, + "grad_norm": 0.6018757517021456, + "learning_rate": 3.970735846368783e-06, + "loss": 0.6315, + "step": 4684 + }, + { + "epoch": 1.828649492583919, + "grad_norm": 0.518748793886597, + "learning_rate": 3.968513667624702e-06, + "loss": 0.6174, + "step": 4685 + }, + { + "epoch": 1.82903981264637, + "grad_norm": 0.507704321987457, + "learning_rate": 3.9662917016590856e-06, + "loss": 0.5895, + "step": 4686 + }, + { + "epoch": 1.8294301327088212, + "grad_norm": 0.6213309473103353, + "learning_rate": 3.9640699489302824e-06, + "loss": 0.6233, + "step": 4687 + }, + { + "epoch": 1.8298204527712725, + "grad_norm": 0.6553344017527014, + "learning_rate": 3.961848409896606e-06, + "loss": 0.6051, + "step": 4688 + }, + { + "epoch": 1.8302107728337238, + "grad_norm": 0.4904510422510162, + "learning_rate": 3.959627085016323e-06, + "loss": 0.5999, + "step": 4689 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.5938972380614114, + "learning_rate": 3.957405974747656e-06, + "loss": 0.5872, + "step": 4690 + }, + { + "epoch": 1.830991412958626, + "grad_norm": 0.5277039858361319, + "learning_rate": 3.955185079548782e-06, + "loss": 0.574, + "step": 4691 + }, + { + "epoch": 1.8313817330210773, + "grad_norm": 0.5998057512594982, + "learning_rate": 3.952964399877836e-06, + "loss": 0.6037, + "step": 4692 + }, + { + "epoch": 1.8317720530835286, + "grad_norm": 0.5295685809242895, + "learning_rate": 3.950743936192907e-06, + "loss": 0.5665, + "step": 4693 + }, + { + "epoch": 1.8321623731459797, + "grad_norm": 0.5100255612339105, + "learning_rate": 3.948523688952039e-06, + "loss": 0.638, + "step": 4694 + }, + { + "epoch": 1.8325526932084308, + "grad_norm": 0.5437394224576308, + "learning_rate": 3.946303658613232e-06, + "loss": 0.6134, + "step": 4695 + }, + { + "epoch": 1.8329430132708822, + "grad_norm": 0.5303079189181384, + "learning_rate": 3.944083845634443e-06, + "loss": 0.6376, + "step": 4696 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.4825314112478202, + "learning_rate": 3.9418642504735795e-06, + "loss": 0.6033, + "step": 4697 + }, + { + "epoch": 1.8337236533957846, + "grad_norm": 0.6157691516845678, + "learning_rate": 3.939644873588508e-06, + "loss": 0.5806, + "step": 4698 + }, + { + "epoch": 1.8341139734582357, + "grad_norm": 0.5397733812688599, + "learning_rate": 3.937425715437051e-06, + "loss": 0.5851, + "step": 4699 + }, + { + "epoch": 1.834504293520687, + "grad_norm": 0.512285829755758, + "learning_rate": 3.935206776476982e-06, + "loss": 0.5816, + "step": 4700 + }, + { + "epoch": 1.8348946135831383, + "grad_norm": 0.5078035677500506, + "learning_rate": 3.932988057166031e-06, + "loss": 0.6109, + "step": 4701 + }, + { + "epoch": 1.8352849336455894, + "grad_norm": 0.5345732866863696, + "learning_rate": 3.930769557961884e-06, + "loss": 0.5977, + "step": 4702 + }, + { + "epoch": 1.8356752537080405, + "grad_norm": 0.5562611385977926, + "learning_rate": 3.928551279322181e-06, + "loss": 0.6455, + "step": 4703 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.5254487710647755, + "learning_rate": 3.926333221704516e-06, + "loss": 0.5924, + "step": 4704 + }, + { + "epoch": 1.8364558938329432, + "grad_norm": 0.530183846367373, + "learning_rate": 3.924115385566435e-06, + "loss": 0.5962, + "step": 4705 + }, + { + "epoch": 1.8368462138953943, + "grad_norm": 0.5454647499333958, + "learning_rate": 3.9218977713654415e-06, + "loss": 0.5928, + "step": 4706 + }, + { + "epoch": 1.8372365339578454, + "grad_norm": 0.49522646964353845, + "learning_rate": 3.9196803795589935e-06, + "loss": 0.6, + "step": 4707 + }, + { + "epoch": 1.8376268540202967, + "grad_norm": 0.5630086471471427, + "learning_rate": 3.917463210604501e-06, + "loss": 0.5967, + "step": 4708 + }, + { + "epoch": 1.838017174082748, + "grad_norm": 0.5178923234132051, + "learning_rate": 3.915246264959331e-06, + "loss": 0.6325, + "step": 4709 + }, + { + "epoch": 1.838407494145199, + "grad_norm": 0.5036498677116389, + "learning_rate": 3.9130295430808e-06, + "loss": 0.5759, + "step": 4710 + }, + { + "epoch": 1.8387978142076502, + "grad_norm": 0.47836359038401755, + "learning_rate": 3.910813045426181e-06, + "loss": 0.5632, + "step": 4711 + }, + { + "epoch": 1.8391881342701015, + "grad_norm": 0.48030470594558133, + "learning_rate": 3.908596772452702e-06, + "loss": 0.5907, + "step": 4712 + }, + { + "epoch": 1.8395784543325528, + "grad_norm": 0.6060213310518253, + "learning_rate": 3.906380724617544e-06, + "loss": 0.5668, + "step": 4713 + }, + { + "epoch": 1.839968774395004, + "grad_norm": 0.49003224617848734, + "learning_rate": 3.904164902377839e-06, + "loss": 0.5829, + "step": 4714 + }, + { + "epoch": 1.840359094457455, + "grad_norm": 0.5551739195264409, + "learning_rate": 3.901949306190673e-06, + "loss": 0.6151, + "step": 4715 + }, + { + "epoch": 1.8407494145199064, + "grad_norm": 0.5067579591105593, + "learning_rate": 3.899733936513089e-06, + "loss": 0.6013, + "step": 4716 + }, + { + "epoch": 1.8411397345823577, + "grad_norm": 0.520798836559416, + "learning_rate": 3.897518793802081e-06, + "loss": 0.5871, + "step": 4717 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.5263832600502902, + "learning_rate": 3.895303878514594e-06, + "loss": 0.6036, + "step": 4718 + }, + { + "epoch": 1.8419203747072599, + "grad_norm": 0.4750938130367356, + "learning_rate": 3.8930891911075295e-06, + "loss": 0.5611, + "step": 4719 + }, + { + "epoch": 1.8423106947697112, + "grad_norm": 0.5234650678517248, + "learning_rate": 3.890874732037741e-06, + "loss": 0.6032, + "step": 4720 + }, + { + "epoch": 1.8427010148321625, + "grad_norm": 0.5717979270213203, + "learning_rate": 3.888660501762036e-06, + "loss": 0.5944, + "step": 4721 + }, + { + "epoch": 1.8430913348946136, + "grad_norm": 0.5392187901587464, + "learning_rate": 3.88644650073717e-06, + "loss": 0.5937, + "step": 4722 + }, + { + "epoch": 1.8434816549570647, + "grad_norm": 0.4988018090976729, + "learning_rate": 3.884232729419855e-06, + "loss": 0.6179, + "step": 4723 + }, + { + "epoch": 1.843871975019516, + "grad_norm": 0.5319602155796996, + "learning_rate": 3.882019188266757e-06, + "loss": 0.6614, + "step": 4724 + }, + { + "epoch": 1.8442622950819674, + "grad_norm": 0.5840383692893362, + "learning_rate": 3.879805877734491e-06, + "loss": 0.5995, + "step": 4725 + }, + { + "epoch": 1.8446526151444185, + "grad_norm": 0.5430895047750235, + "learning_rate": 3.877592798279629e-06, + "loss": 0.5868, + "step": 4726 + }, + { + "epoch": 1.8450429352068696, + "grad_norm": 0.5143010945909541, + "learning_rate": 3.8753799503586885e-06, + "loss": 0.6016, + "step": 4727 + }, + { + "epoch": 1.845433255269321, + "grad_norm": 0.6961546416452078, + "learning_rate": 3.873167334428145e-06, + "loss": 0.6068, + "step": 4728 + }, + { + "epoch": 1.8458235753317722, + "grad_norm": 0.5152237241594044, + "learning_rate": 3.870954950944426e-06, + "loss": 0.5932, + "step": 4729 + }, + { + "epoch": 1.8462138953942233, + "grad_norm": 0.5351819056802892, + "learning_rate": 3.868742800363909e-06, + "loss": 0.6129, + "step": 4730 + }, + { + "epoch": 1.8466042154566744, + "grad_norm": 0.5222856017703467, + "learning_rate": 3.86653088314292e-06, + "loss": 0.6011, + "step": 4731 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.5857642817916165, + "learning_rate": 3.864319199737744e-06, + "loss": 0.6063, + "step": 4732 + }, + { + "epoch": 1.847384855581577, + "grad_norm": 0.49499759164552326, + "learning_rate": 3.862107750604615e-06, + "loss": 0.5892, + "step": 4733 + }, + { + "epoch": 1.8477751756440282, + "grad_norm": 0.48704926964794254, + "learning_rate": 3.859896536199715e-06, + "loss": 0.5989, + "step": 4734 + }, + { + "epoch": 1.8481654957064793, + "grad_norm": 0.523018975336465, + "learning_rate": 3.857685556979184e-06, + "loss": 0.6044, + "step": 4735 + }, + { + "epoch": 1.8485558157689306, + "grad_norm": 0.6179225361260349, + "learning_rate": 3.855474813399109e-06, + "loss": 0.6225, + "step": 4736 + }, + { + "epoch": 1.848946135831382, + "grad_norm": 0.7181886739766444, + "learning_rate": 3.8532643059155296e-06, + "loss": 0.5655, + "step": 4737 + }, + { + "epoch": 1.849336455893833, + "grad_norm": 0.5069888050707428, + "learning_rate": 3.851054034984436e-06, + "loss": 0.5882, + "step": 4738 + }, + { + "epoch": 1.849726775956284, + "grad_norm": 0.5328843388304019, + "learning_rate": 3.8488440010617734e-06, + "loss": 0.5851, + "step": 4739 + }, + { + "epoch": 1.8501170960187352, + "grad_norm": 0.6606606766820022, + "learning_rate": 3.846634204603431e-06, + "loss": 0.6149, + "step": 4740 + }, + { + "epoch": 1.8505074160811865, + "grad_norm": 0.6191540203605604, + "learning_rate": 3.844424646065254e-06, + "loss": 0.5613, + "step": 4741 + }, + { + "epoch": 1.8508977361436378, + "grad_norm": 0.4979093253604655, + "learning_rate": 3.8422153259030394e-06, + "loss": 0.5973, + "step": 4742 + }, + { + "epoch": 1.851288056206089, + "grad_norm": 0.5117624285611841, + "learning_rate": 3.8400062445725315e-06, + "loss": 0.5473, + "step": 4743 + }, + { + "epoch": 1.85167837626854, + "grad_norm": 0.5828505179705129, + "learning_rate": 3.8377974025294265e-06, + "loss": 0.5908, + "step": 4744 + }, + { + "epoch": 1.8520686963309914, + "grad_norm": 0.48614518154289454, + "learning_rate": 3.835588800229373e-06, + "loss": 0.5704, + "step": 4745 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.4666605304349077, + "learning_rate": 3.83338043812797e-06, + "loss": 0.5852, + "step": 4746 + }, + { + "epoch": 1.8528493364558938, + "grad_norm": 0.532752917534684, + "learning_rate": 3.831172316680763e-06, + "loss": 0.5781, + "step": 4747 + }, + { + "epoch": 1.8532396565183449, + "grad_norm": 0.4799978503280369, + "learning_rate": 3.8289644363432544e-06, + "loss": 0.5513, + "step": 4748 + }, + { + "epoch": 1.8536299765807962, + "grad_norm": 0.5158696715237101, + "learning_rate": 3.826756797570889e-06, + "loss": 0.5542, + "step": 4749 + }, + { + "epoch": 1.8540202966432475, + "grad_norm": 0.6055496759551439, + "learning_rate": 3.824549400819067e-06, + "loss": 0.5845, + "step": 4750 + }, + { + "epoch": 1.8544106167056986, + "grad_norm": 0.6834901172958872, + "learning_rate": 3.8223422465431395e-06, + "loss": 0.595, + "step": 4751 + }, + { + "epoch": 1.8548009367681497, + "grad_norm": 0.5616840076040475, + "learning_rate": 3.820135335198405e-06, + "loss": 0.5679, + "step": 4752 + }, + { + "epoch": 1.855191256830601, + "grad_norm": 0.564918919145635, + "learning_rate": 3.81792866724011e-06, + "loss": 0.6034, + "step": 4753 + }, + { + "epoch": 1.8555815768930524, + "grad_norm": 0.5880963062957936, + "learning_rate": 3.815722243123458e-06, + "loss": 0.5754, + "step": 4754 + }, + { + "epoch": 1.8559718969555035, + "grad_norm": 0.5978077440572956, + "learning_rate": 3.8135160633035956e-06, + "loss": 0.5933, + "step": 4755 + }, + { + "epoch": 1.8563622170179546, + "grad_norm": 0.5587216158635337, + "learning_rate": 3.811310128235619e-06, + "loss": 0.5998, + "step": 4756 + }, + { + "epoch": 1.8567525370804059, + "grad_norm": 0.5368568557279262, + "learning_rate": 3.809104438374579e-06, + "loss": 0.5778, + "step": 4757 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.6090168590922183, + "learning_rate": 3.8068989941754695e-06, + "loss": 0.6113, + "step": 4758 + }, + { + "epoch": 1.8575331772053083, + "grad_norm": 0.5199888013510796, + "learning_rate": 3.8046937960932402e-06, + "loss": 0.6262, + "step": 4759 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.4897854676853437, + "learning_rate": 3.8024888445827834e-06, + "loss": 0.6308, + "step": 4760 + }, + { + "epoch": 1.8583138173302107, + "grad_norm": 0.5745776469058079, + "learning_rate": 3.8002841400989465e-06, + "loss": 0.614, + "step": 4761 + }, + { + "epoch": 1.858704137392662, + "grad_norm": 0.5621961724948833, + "learning_rate": 3.7980796830965217e-06, + "loss": 0.6003, + "step": 4762 + }, + { + "epoch": 1.8590944574551131, + "grad_norm": 0.513899204362579, + "learning_rate": 3.7958754740302528e-06, + "loss": 0.6117, + "step": 4763 + }, + { + "epoch": 1.8594847775175642, + "grad_norm": 0.4825361454074641, + "learning_rate": 3.793671513354831e-06, + "loss": 0.6381, + "step": 4764 + }, + { + "epoch": 1.8598750975800156, + "grad_norm": 0.5678359982301362, + "learning_rate": 3.7914678015248996e-06, + "loss": 0.5922, + "step": 4765 + }, + { + "epoch": 1.860265417642467, + "grad_norm": 0.4561731904342218, + "learning_rate": 3.789264338995042e-06, + "loss": 0.565, + "step": 4766 + }, + { + "epoch": 1.860655737704918, + "grad_norm": 0.5316938862877455, + "learning_rate": 3.7870611262197995e-06, + "loss": 0.6015, + "step": 4767 + }, + { + "epoch": 1.861046057767369, + "grad_norm": 0.48197846042956816, + "learning_rate": 3.784858163653657e-06, + "loss": 0.5775, + "step": 4768 + }, + { + "epoch": 1.8614363778298204, + "grad_norm": 0.542882300378475, + "learning_rate": 3.7826554517510494e-06, + "loss": 0.5977, + "step": 4769 + }, + { + "epoch": 1.8618266978922717, + "grad_norm": 0.5211826245661507, + "learning_rate": 3.78045299096636e-06, + "loss": 0.6014, + "step": 4770 + }, + { + "epoch": 1.8622170179547228, + "grad_norm": 0.5134924829296807, + "learning_rate": 3.7782507817539194e-06, + "loss": 0.615, + "step": 4771 + }, + { + "epoch": 1.862607338017174, + "grad_norm": 0.4904411209739106, + "learning_rate": 3.7760488245680084e-06, + "loss": 0.562, + "step": 4772 + }, + { + "epoch": 1.8629976580796253, + "grad_norm": 0.5240974002987802, + "learning_rate": 3.7738471198628506e-06, + "loss": 0.6104, + "step": 4773 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.5297559446492532, + "learning_rate": 3.771645668092624e-06, + "loss": 0.6095, + "step": 4774 + }, + { + "epoch": 1.8637782982045277, + "grad_norm": 0.5079601649897761, + "learning_rate": 3.7694444697114498e-06, + "loss": 0.6311, + "step": 4775 + }, + { + "epoch": 1.8641686182669788, + "grad_norm": 0.5571953260591556, + "learning_rate": 3.7672435251734e-06, + "loss": 0.6253, + "step": 4776 + }, + { + "epoch": 1.86455893832943, + "grad_norm": 0.48854734586766335, + "learning_rate": 3.7650428349324902e-06, + "loss": 0.6155, + "step": 4777 + }, + { + "epoch": 1.8649492583918814, + "grad_norm": 0.47231473936298307, + "learning_rate": 3.762842399442688e-06, + "loss": 0.5955, + "step": 4778 + }, + { + "epoch": 1.8653395784543325, + "grad_norm": 0.5285887656272283, + "learning_rate": 3.7606422191579073e-06, + "loss": 0.5904, + "step": 4779 + }, + { + "epoch": 1.8657298985167836, + "grad_norm": 0.5625165121648449, + "learning_rate": 3.7584422945320076e-06, + "loss": 0.615, + "step": 4780 + }, + { + "epoch": 1.866120218579235, + "grad_norm": 0.5305996361698521, + "learning_rate": 3.756242626018797e-06, + "loss": 0.5938, + "step": 4781 + }, + { + "epoch": 1.8665105386416863, + "grad_norm": 0.5274297539375741, + "learning_rate": 3.7540432140720307e-06, + "loss": 0.6173, + "step": 4782 + }, + { + "epoch": 1.8669008587041374, + "grad_norm": 0.5434355455602825, + "learning_rate": 3.7518440591454123e-06, + "loss": 0.5826, + "step": 4783 + }, + { + "epoch": 1.8672911787665885, + "grad_norm": 0.5356039346458411, + "learning_rate": 3.749645161692588e-06, + "loss": 0.5613, + "step": 4784 + }, + { + "epoch": 1.8676814988290398, + "grad_norm": 0.4997469407134545, + "learning_rate": 3.7474465221671542e-06, + "loss": 0.6073, + "step": 4785 + }, + { + "epoch": 1.868071818891491, + "grad_norm": 0.4718767936061445, + "learning_rate": 3.7452481410226537e-06, + "loss": 0.6033, + "step": 4786 + }, + { + "epoch": 1.8684621389539422, + "grad_norm": 0.5144827902448392, + "learning_rate": 3.7430500187125774e-06, + "loss": 0.5826, + "step": 4787 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.5378327744139852, + "learning_rate": 3.740852155690361e-06, + "loss": 0.6295, + "step": 4788 + }, + { + "epoch": 1.8692427790788446, + "grad_norm": 0.47074642369945074, + "learning_rate": 3.7386545524093865e-06, + "loss": 0.6071, + "step": 4789 + }, + { + "epoch": 1.869633099141296, + "grad_norm": 0.46348462154014075, + "learning_rate": 3.736457209322983e-06, + "loss": 0.5908, + "step": 4790 + }, + { + "epoch": 1.870023419203747, + "grad_norm": 0.5192687180740418, + "learning_rate": 3.7342601268844248e-06, + "loss": 0.5924, + "step": 4791 + }, + { + "epoch": 1.8704137392661981, + "grad_norm": 0.5378583900097961, + "learning_rate": 3.7320633055469364e-06, + "loss": 0.5728, + "step": 4792 + }, + { + "epoch": 1.8708040593286495, + "grad_norm": 0.5610673572357999, + "learning_rate": 3.729866745763683e-06, + "loss": 0.6204, + "step": 4793 + }, + { + "epoch": 1.8711943793911008, + "grad_norm": 0.539504021197984, + "learning_rate": 3.727670447987778e-06, + "loss": 0.6045, + "step": 4794 + }, + { + "epoch": 1.8715846994535519, + "grad_norm": 0.5202142576416834, + "learning_rate": 3.725474412672282e-06, + "loss": 0.6169, + "step": 4795 + }, + { + "epoch": 1.871975019516003, + "grad_norm": 0.5191529121771328, + "learning_rate": 3.7232786402701997e-06, + "loss": 0.6299, + "step": 4796 + }, + { + "epoch": 1.8723653395784543, + "grad_norm": 0.61812327626999, + "learning_rate": 3.7210831312344834e-06, + "loss": 0.5601, + "step": 4797 + }, + { + "epoch": 1.8727556596409056, + "grad_norm": 0.6141634917619186, + "learning_rate": 3.71888788601803e-06, + "loss": 0.5966, + "step": 4798 + }, + { + "epoch": 1.8731459797033567, + "grad_norm": 0.5261561374965925, + "learning_rate": 3.716692905073682e-06, + "loss": 0.6187, + "step": 4799 + }, + { + "epoch": 1.8735362997658078, + "grad_norm": 0.487044829775128, + "learning_rate": 3.7144981888542276e-06, + "loss": 0.6224, + "step": 4800 + }, + { + "epoch": 1.8739266198282591, + "grad_norm": 0.5212845032105451, + "learning_rate": 3.712303737812401e-06, + "loss": 0.5937, + "step": 4801 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 0.5587571468642105, + "learning_rate": 3.7101095524008784e-06, + "loss": 0.6125, + "step": 4802 + }, + { + "epoch": 1.8747072599531616, + "grad_norm": 0.5340418443269986, + "learning_rate": 3.707915633072285e-06, + "loss": 0.5816, + "step": 4803 + }, + { + "epoch": 1.8750975800156127, + "grad_norm": 0.5344508825714652, + "learning_rate": 3.7057219802791897e-06, + "loss": 0.5966, + "step": 4804 + }, + { + "epoch": 1.875487900078064, + "grad_norm": 0.5396471444236836, + "learning_rate": 3.7035285944741084e-06, + "loss": 0.6066, + "step": 4805 + }, + { + "epoch": 1.8758782201405153, + "grad_norm": 0.5020869525600229, + "learning_rate": 3.701335476109497e-06, + "loss": 0.5927, + "step": 4806 + }, + { + "epoch": 1.8762685402029664, + "grad_norm": 0.48915202803195296, + "learning_rate": 3.6991426256377614e-06, + "loss": 0.563, + "step": 4807 + }, + { + "epoch": 1.8766588602654175, + "grad_norm": 0.5571133744793823, + "learning_rate": 3.6969500435112493e-06, + "loss": 0.5871, + "step": 4808 + }, + { + "epoch": 1.8770491803278688, + "grad_norm": 0.5607501513294768, + "learning_rate": 3.694757730182254e-06, + "loss": 0.611, + "step": 4809 + }, + { + "epoch": 1.8774395003903201, + "grad_norm": 0.4884924688669651, + "learning_rate": 3.6925656861030156e-06, + "loss": 0.6119, + "step": 4810 + }, + { + "epoch": 1.8778298204527712, + "grad_norm": 0.509317050235721, + "learning_rate": 3.6903739117257126e-06, + "loss": 0.6038, + "step": 4811 + }, + { + "epoch": 1.8782201405152223, + "grad_norm": 0.6108062719304718, + "learning_rate": 3.688182407502473e-06, + "loss": 0.582, + "step": 4812 + }, + { + "epoch": 1.8786104605776737, + "grad_norm": 0.565426269728047, + "learning_rate": 3.685991173885367e-06, + "loss": 0.6015, + "step": 4813 + }, + { + "epoch": 1.879000780640125, + "grad_norm": 0.4992557234590083, + "learning_rate": 3.6838002113264116e-06, + "loss": 0.6346, + "step": 4814 + }, + { + "epoch": 1.879391100702576, + "grad_norm": 0.604987038403352, + "learning_rate": 3.6816095202775636e-06, + "loss": 0.5744, + "step": 4815 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.45841823029954815, + "learning_rate": 3.6794191011907277e-06, + "loss": 0.5768, + "step": 4816 + }, + { + "epoch": 1.8801717408274785, + "grad_norm": 0.6146786060208325, + "learning_rate": 3.6772289545177497e-06, + "loss": 0.6293, + "step": 4817 + }, + { + "epoch": 1.8805620608899298, + "grad_norm": 0.6031151355339176, + "learning_rate": 3.675039080710424e-06, + "loss": 0.591, + "step": 4818 + }, + { + "epoch": 1.880952380952381, + "grad_norm": 0.5176286763518538, + "learning_rate": 3.6728494802204807e-06, + "loss": 0.57, + "step": 4819 + }, + { + "epoch": 1.881342701014832, + "grad_norm": 0.5006401021109785, + "learning_rate": 3.6706601534995983e-06, + "loss": 0.5819, + "step": 4820 + }, + { + "epoch": 1.8817330210772834, + "grad_norm": 0.6168294339554241, + "learning_rate": 3.6684711009994e-06, + "loss": 0.6123, + "step": 4821 + }, + { + "epoch": 1.8821233411397347, + "grad_norm": 0.5949031457096934, + "learning_rate": 3.6662823231714516e-06, + "loss": 0.6286, + "step": 4822 + }, + { + "epoch": 1.8825136612021858, + "grad_norm": 0.5156229678577624, + "learning_rate": 3.664093820467259e-06, + "loss": 0.6025, + "step": 4823 + }, + { + "epoch": 1.8829039812646369, + "grad_norm": 0.55766245063658, + "learning_rate": 3.6619055933382765e-06, + "loss": 0.602, + "step": 4824 + }, + { + "epoch": 1.8832943013270882, + "grad_norm": 0.5670186421886095, + "learning_rate": 3.6597176422358967e-06, + "loss": 0.5466, + "step": 4825 + }, + { + "epoch": 1.8836846213895395, + "grad_norm": 0.5008088865644453, + "learning_rate": 3.6575299676114584e-06, + "loss": 0.5825, + "step": 4826 + }, + { + "epoch": 1.8840749414519906, + "grad_norm": 0.5111187622983636, + "learning_rate": 3.6553425699162454e-06, + "loss": 0.6005, + "step": 4827 + }, + { + "epoch": 1.8844652615144417, + "grad_norm": 0.5633471049352448, + "learning_rate": 3.6531554496014765e-06, + "loss": 0.6104, + "step": 4828 + }, + { + "epoch": 1.884855581576893, + "grad_norm": 0.5293178776016566, + "learning_rate": 3.6509686071183203e-06, + "loss": 0.5919, + "step": 4829 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.534566214102344, + "learning_rate": 3.6487820429178855e-06, + "loss": 0.5851, + "step": 4830 + }, + { + "epoch": 1.8856362217017955, + "grad_norm": 0.5565258454091279, + "learning_rate": 3.646595757451225e-06, + "loss": 0.5895, + "step": 4831 + }, + { + "epoch": 1.8860265417642466, + "grad_norm": 0.5240994560364961, + "learning_rate": 3.644409751169332e-06, + "loss": 0.6031, + "step": 4832 + }, + { + "epoch": 1.8864168618266979, + "grad_norm": 0.5198753021018487, + "learning_rate": 3.642224024523145e-06, + "loss": 0.5871, + "step": 4833 + }, + { + "epoch": 1.8868071818891492, + "grad_norm": 0.4855078221454396, + "learning_rate": 3.6400385779635406e-06, + "loss": 0.6095, + "step": 4834 + }, + { + "epoch": 1.8871975019516003, + "grad_norm": 0.5533152312265127, + "learning_rate": 3.6378534119413433e-06, + "loss": 0.6291, + "step": 4835 + }, + { + "epoch": 1.8875878220140514, + "grad_norm": 0.5155985248438705, + "learning_rate": 3.635668526907314e-06, + "loss": 0.6094, + "step": 4836 + }, + { + "epoch": 1.8879781420765027, + "grad_norm": 0.5330102701625636, + "learning_rate": 3.6334839233121573e-06, + "loss": 0.5981, + "step": 4837 + }, + { + "epoch": 1.888368462138954, + "grad_norm": 0.494231390446832, + "learning_rate": 3.6312996016065218e-06, + "loss": 0.5741, + "step": 4838 + }, + { + "epoch": 1.8887587822014051, + "grad_norm": 0.5271590432191412, + "learning_rate": 3.629115562240998e-06, + "loss": 0.5993, + "step": 4839 + }, + { + "epoch": 1.8891491022638562, + "grad_norm": 0.518970376700467, + "learning_rate": 3.6269318056661148e-06, + "loss": 0.5439, + "step": 4840 + }, + { + "epoch": 1.8895394223263076, + "grad_norm": 0.46479706184780833, + "learning_rate": 3.624748332332345e-06, + "loss": 0.5891, + "step": 4841 + }, + { + "epoch": 1.8899297423887589, + "grad_norm": 0.5535317493995935, + "learning_rate": 3.622565142690104e-06, + "loss": 0.5946, + "step": 4842 + }, + { + "epoch": 1.89032006245121, + "grad_norm": 0.521924924554862, + "learning_rate": 3.6203822371897467e-06, + "loss": 0.5749, + "step": 4843 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.5270422189487612, + "learning_rate": 3.61819961628157e-06, + "loss": 0.5783, + "step": 4844 + }, + { + "epoch": 1.8911007025761124, + "grad_norm": 0.5171707484487071, + "learning_rate": 3.6160172804158154e-06, + "loss": 0.5836, + "step": 4845 + }, + { + "epoch": 1.8914910226385637, + "grad_norm": 0.5644360692516442, + "learning_rate": 3.613835230042657e-06, + "loss": 0.6026, + "step": 4846 + }, + { + "epoch": 1.8918813427010148, + "grad_norm": 0.5058028522352895, + "learning_rate": 3.6116534656122186e-06, + "loss": 0.5875, + "step": 4847 + }, + { + "epoch": 1.892271662763466, + "grad_norm": 0.4659244350216782, + "learning_rate": 3.6094719875745616e-06, + "loss": 0.5916, + "step": 4848 + }, + { + "epoch": 1.8926619828259172, + "grad_norm": 0.54162931143637, + "learning_rate": 3.607290796379688e-06, + "loss": 0.5715, + "step": 4849 + }, + { + "epoch": 1.8930523028883686, + "grad_norm": 0.5877459561202163, + "learning_rate": 3.605109892477542e-06, + "loss": 0.5862, + "step": 4850 + }, + { + "epoch": 1.8934426229508197, + "grad_norm": 0.5236478840963817, + "learning_rate": 3.6029292763180072e-06, + "loss": 0.5661, + "step": 4851 + }, + { + "epoch": 1.8938329430132708, + "grad_norm": 0.5653172209497094, + "learning_rate": 3.6007489483509107e-06, + "loss": 0.6031, + "step": 4852 + }, + { + "epoch": 1.894223263075722, + "grad_norm": 0.49296484502974713, + "learning_rate": 3.5985689090260136e-06, + "loss": 0.5926, + "step": 4853 + }, + { + "epoch": 1.8946135831381734, + "grad_norm": 0.5137658352601521, + "learning_rate": 3.5963891587930255e-06, + "loss": 0.5948, + "step": 4854 + }, + { + "epoch": 1.8950039032006245, + "grad_norm": 0.4969747560342441, + "learning_rate": 3.5942096981015907e-06, + "loss": 0.5999, + "step": 4855 + }, + { + "epoch": 1.8953942232630756, + "grad_norm": 0.5328647289420247, + "learning_rate": 3.5920305274012966e-06, + "loss": 0.6014, + "step": 4856 + }, + { + "epoch": 1.895784543325527, + "grad_norm": 0.5504009544479729, + "learning_rate": 3.5898516471416677e-06, + "loss": 0.5961, + "step": 4857 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 0.5889045784429411, + "learning_rate": 3.587673057772173e-06, + "loss": 0.6024, + "step": 4858 + }, + { + "epoch": 1.8965651834504293, + "grad_norm": 0.4909299698889199, + "learning_rate": 3.5854947597422185e-06, + "loss": 0.6194, + "step": 4859 + }, + { + "epoch": 1.8969555035128804, + "grad_norm": 0.6412756259262484, + "learning_rate": 3.583316753501151e-06, + "loss": 0.6023, + "step": 4860 + }, + { + "epoch": 1.8973458235753318, + "grad_norm": 0.513417788033491, + "learning_rate": 3.581139039498257e-06, + "loss": 0.5602, + "step": 4861 + }, + { + "epoch": 1.897736143637783, + "grad_norm": 0.5119987147470083, + "learning_rate": 3.578961618182764e-06, + "loss": 0.6301, + "step": 4862 + }, + { + "epoch": 1.8981264637002342, + "grad_norm": 0.47999393810318486, + "learning_rate": 3.576784490003834e-06, + "loss": 0.5904, + "step": 4863 + }, + { + "epoch": 1.8985167837626853, + "grad_norm": 0.5142120934856912, + "learning_rate": 3.574607655410575e-06, + "loss": 0.6054, + "step": 4864 + }, + { + "epoch": 1.8989071038251366, + "grad_norm": 0.526807748518709, + "learning_rate": 3.572431114852032e-06, + "loss": 0.5893, + "step": 4865 + }, + { + "epoch": 1.899297423887588, + "grad_norm": 0.5368336863526059, + "learning_rate": 3.570254868777187e-06, + "loss": 0.5925, + "step": 4866 + }, + { + "epoch": 1.899687743950039, + "grad_norm": 0.494977381799225, + "learning_rate": 3.568078917634966e-06, + "loss": 0.5823, + "step": 4867 + }, + { + "epoch": 1.9000780640124901, + "grad_norm": 0.5033880620652196, + "learning_rate": 3.5659032618742294e-06, + "loss": 0.5691, + "step": 4868 + }, + { + "epoch": 1.9004683840749415, + "grad_norm": 0.5694937608905076, + "learning_rate": 3.563727901943781e-06, + "loss": 0.5981, + "step": 4869 + }, + { + "epoch": 1.9008587041373928, + "grad_norm": 0.5010259585680049, + "learning_rate": 3.5615528382923582e-06, + "loss": 0.5678, + "step": 4870 + }, + { + "epoch": 1.9012490241998439, + "grad_norm": 0.5002792614379524, + "learning_rate": 3.559378071368644e-06, + "loss": 0.6039, + "step": 4871 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.5335716832175667, + "learning_rate": 3.557203601621254e-06, + "loss": 0.6125, + "step": 4872 + }, + { + "epoch": 1.9020296643247463, + "grad_norm": 0.4712548660434192, + "learning_rate": 3.555029429498746e-06, + "loss": 0.5739, + "step": 4873 + }, + { + "epoch": 1.9024199843871976, + "grad_norm": 0.5114553118925627, + "learning_rate": 3.5528555554496145e-06, + "loss": 0.5833, + "step": 4874 + }, + { + "epoch": 1.9028103044496487, + "grad_norm": 0.4631732997194734, + "learning_rate": 3.5506819799222945e-06, + "loss": 0.5859, + "step": 4875 + }, + { + "epoch": 1.9032006245120998, + "grad_norm": 0.611439509770734, + "learning_rate": 3.5485087033651588e-06, + "loss": 0.5614, + "step": 4876 + }, + { + "epoch": 1.9035909445745511, + "grad_norm": 0.5806263730378998, + "learning_rate": 3.5463357262265187e-06, + "loss": 0.5892, + "step": 4877 + }, + { + "epoch": 1.9039812646370025, + "grad_norm": 0.5227449267423517, + "learning_rate": 3.5441630489546206e-06, + "loss": 0.5876, + "step": 4878 + }, + { + "epoch": 1.9043715846994536, + "grad_norm": 0.5389595446969898, + "learning_rate": 3.5419906719976543e-06, + "loss": 0.6098, + "step": 4879 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.6129952018495025, + "learning_rate": 3.5398185958037455e-06, + "loss": 0.5891, + "step": 4880 + }, + { + "epoch": 1.905152224824356, + "grad_norm": 0.5605412187267595, + "learning_rate": 3.5376468208209547e-06, + "loss": 0.5473, + "step": 4881 + }, + { + "epoch": 1.9055425448868073, + "grad_norm": 0.5229336170353944, + "learning_rate": 3.5354753474972836e-06, + "loss": 0.6121, + "step": 4882 + }, + { + "epoch": 1.9059328649492584, + "grad_norm": 0.5252280863763964, + "learning_rate": 3.533304176280672e-06, + "loss": 0.6028, + "step": 4883 + }, + { + "epoch": 1.9063231850117095, + "grad_norm": 0.5901013221882412, + "learning_rate": 3.5311333076189958e-06, + "loss": 0.6422, + "step": 4884 + }, + { + "epoch": 1.9067135050741608, + "grad_norm": 0.5447467919133747, + "learning_rate": 3.52896274196007e-06, + "loss": 0.5863, + "step": 4885 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.5477828353528366, + "learning_rate": 3.5267924797516445e-06, + "loss": 0.5636, + "step": 4886 + }, + { + "epoch": 1.9074941451990632, + "grad_norm": 0.543121511496558, + "learning_rate": 3.524622521441409e-06, + "loss": 0.5583, + "step": 4887 + }, + { + "epoch": 1.9078844652615143, + "grad_norm": 0.5242618811841152, + "learning_rate": 3.5224528674769905e-06, + "loss": 0.5691, + "step": 4888 + }, + { + "epoch": 1.9082747853239657, + "grad_norm": 0.4819079276736222, + "learning_rate": 3.5202835183059535e-06, + "loss": 0.5676, + "step": 4889 + }, + { + "epoch": 1.908665105386417, + "grad_norm": 0.48229513901170384, + "learning_rate": 3.518114474375795e-06, + "loss": 0.5799, + "step": 4890 + }, + { + "epoch": 1.909055425448868, + "grad_norm": 0.5077219617383061, + "learning_rate": 3.515945736133955e-06, + "loss": 0.5925, + "step": 4891 + }, + { + "epoch": 1.9094457455113192, + "grad_norm": 0.5062060276675987, + "learning_rate": 3.5137773040278076e-06, + "loss": 0.5844, + "step": 4892 + }, + { + "epoch": 1.9098360655737705, + "grad_norm": 0.4910718968499953, + "learning_rate": 3.511609178504665e-06, + "loss": 0.6083, + "step": 4893 + }, + { + "epoch": 1.9102263856362218, + "grad_norm": 0.5353472930199222, + "learning_rate": 3.5094413600117737e-06, + "loss": 0.5745, + "step": 4894 + }, + { + "epoch": 1.910616705698673, + "grad_norm": 0.5401272320701783, + "learning_rate": 3.507273848996321e-06, + "loss": 0.5824, + "step": 4895 + }, + { + "epoch": 1.911007025761124, + "grad_norm": 0.5785752084985246, + "learning_rate": 3.505106645905426e-06, + "loss": 0.6207, + "step": 4896 + }, + { + "epoch": 1.9113973458235753, + "grad_norm": 0.5379247477603719, + "learning_rate": 3.5029397511861485e-06, + "loss": 0.6033, + "step": 4897 + }, + { + "epoch": 1.9117876658860267, + "grad_norm": 0.582581914362792, + "learning_rate": 3.5007731652854844e-06, + "loss": 0.5884, + "step": 4898 + }, + { + "epoch": 1.9121779859484778, + "grad_norm": 0.46057401074777815, + "learning_rate": 3.4986068886503594e-06, + "loss": 0.5358, + "step": 4899 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.5357617559624316, + "learning_rate": 3.496440921727643e-06, + "loss": 0.5836, + "step": 4900 + }, + { + "epoch": 1.9129586260733802, + "grad_norm": 0.47666750690281334, + "learning_rate": 3.4942752649641376e-06, + "loss": 0.613, + "step": 4901 + }, + { + "epoch": 1.9133489461358315, + "grad_norm": 0.5396433854615064, + "learning_rate": 3.492109918806584e-06, + "loss": 0.6422, + "step": 4902 + }, + { + "epoch": 1.9137392661982826, + "grad_norm": 0.614476404906973, + "learning_rate": 3.4899448837016543e-06, + "loss": 0.5745, + "step": 4903 + }, + { + "epoch": 1.9141295862607337, + "grad_norm": 0.48175139433047437, + "learning_rate": 3.4877801600959603e-06, + "loss": 0.6191, + "step": 4904 + }, + { + "epoch": 1.914519906323185, + "grad_norm": 0.48555569351441347, + "learning_rate": 3.4856157484360493e-06, + "loss": 0.5979, + "step": 4905 + }, + { + "epoch": 1.9149102263856363, + "grad_norm": 0.5253512347524831, + "learning_rate": 3.4834516491684046e-06, + "loss": 0.6021, + "step": 4906 + }, + { + "epoch": 1.9153005464480874, + "grad_norm": 0.5380286962255444, + "learning_rate": 3.4812878627394397e-06, + "loss": 0.5975, + "step": 4907 + }, + { + "epoch": 1.9156908665105385, + "grad_norm": 0.521390426461917, + "learning_rate": 3.47912438959551e-06, + "loss": 0.5871, + "step": 4908 + }, + { + "epoch": 1.9160811865729899, + "grad_norm": 0.5653323387067075, + "learning_rate": 3.4769612301829046e-06, + "loss": 0.5984, + "step": 4909 + }, + { + "epoch": 1.9164715066354412, + "grad_norm": 0.4888883437959263, + "learning_rate": 3.474798384947846e-06, + "loss": 0.6438, + "step": 4910 + }, + { + "epoch": 1.9168618266978923, + "grad_norm": 0.48973591136012, + "learning_rate": 3.4726358543364947e-06, + "loss": 0.6015, + "step": 4911 + }, + { + "epoch": 1.9172521467603434, + "grad_norm": 0.4883416309300962, + "learning_rate": 3.4704736387949433e-06, + "loss": 0.5687, + "step": 4912 + }, + { + "epoch": 1.9176424668227947, + "grad_norm": 0.5766935906404682, + "learning_rate": 3.468311738769221e-06, + "loss": 0.5528, + "step": 4913 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.5578394903395298, + "learning_rate": 3.466150154705293e-06, + "loss": 0.6029, + "step": 4914 + }, + { + "epoch": 1.9184231069476971, + "grad_norm": 0.5342666581592846, + "learning_rate": 3.46398888704906e-06, + "loss": 0.6141, + "step": 4915 + }, + { + "epoch": 1.9188134270101482, + "grad_norm": 0.4963440754162548, + "learning_rate": 3.4618279362463504e-06, + "loss": 0.6044, + "step": 4916 + }, + { + "epoch": 1.9192037470725996, + "grad_norm": 0.5387033899517951, + "learning_rate": 3.459667302742935e-06, + "loss": 0.5623, + "step": 4917 + }, + { + "epoch": 1.9195940671350509, + "grad_norm": 0.5187302121141231, + "learning_rate": 3.4575069869845166e-06, + "loss": 0.6031, + "step": 4918 + }, + { + "epoch": 1.919984387197502, + "grad_norm": 0.5172264941312857, + "learning_rate": 3.4553469894167337e-06, + "loss": 0.5965, + "step": 4919 + }, + { + "epoch": 1.920374707259953, + "grad_norm": 0.516851457547784, + "learning_rate": 3.4531873104851544e-06, + "loss": 0.6003, + "step": 4920 + }, + { + "epoch": 1.9207650273224044, + "grad_norm": 0.5526553775061435, + "learning_rate": 3.4510279506352875e-06, + "loss": 0.6335, + "step": 4921 + }, + { + "epoch": 1.9211553473848557, + "grad_norm": 0.625684144406033, + "learning_rate": 3.448868910312571e-06, + "loss": 0.5943, + "step": 4922 + }, + { + "epoch": 1.9215456674473068, + "grad_norm": 0.5273580240315298, + "learning_rate": 3.446710189962381e-06, + "loss": 0.5877, + "step": 4923 + }, + { + "epoch": 1.921935987509758, + "grad_norm": 0.5497022531416789, + "learning_rate": 3.4445517900300263e-06, + "loss": 0.6063, + "step": 4924 + }, + { + "epoch": 1.9223263075722092, + "grad_norm": 0.5194110097914834, + "learning_rate": 3.4423937109607447e-06, + "loss": 0.6108, + "step": 4925 + }, + { + "epoch": 1.9227166276346606, + "grad_norm": 0.5916912802552843, + "learning_rate": 3.4402359531997147e-06, + "loss": 0.5818, + "step": 4926 + }, + { + "epoch": 1.9231069476971117, + "grad_norm": 0.5327923288229232, + "learning_rate": 3.438078517192046e-06, + "loss": 0.5897, + "step": 4927 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.4971545248372209, + "learning_rate": 3.4359214033827808e-06, + "loss": 0.5765, + "step": 4928 + }, + { + "epoch": 1.923887587822014, + "grad_norm": 0.5614022485052289, + "learning_rate": 3.4337646122168968e-06, + "loss": 0.6236, + "step": 4929 + }, + { + "epoch": 1.9242779078844654, + "grad_norm": 0.6009895571723075, + "learning_rate": 3.4316081441393032e-06, + "loss": 0.6054, + "step": 4930 + }, + { + "epoch": 1.9246682279469165, + "grad_norm": 0.5242867104253565, + "learning_rate": 3.4294519995948437e-06, + "loss": 0.5688, + "step": 4931 + }, + { + "epoch": 1.9250585480093676, + "grad_norm": 0.4888643977497633, + "learning_rate": 3.4272961790282964e-06, + "loss": 0.5824, + "step": 4932 + }, + { + "epoch": 1.925448868071819, + "grad_norm": 0.5280993492789289, + "learning_rate": 3.42514068288437e-06, + "loss": 0.5945, + "step": 4933 + }, + { + "epoch": 1.9258391881342702, + "grad_norm": 0.590420167261279, + "learning_rate": 3.422985511607707e-06, + "loss": 0.6125, + "step": 4934 + }, + { + "epoch": 1.9262295081967213, + "grad_norm": 0.5379509074027782, + "learning_rate": 3.4208306656428837e-06, + "loss": 0.5821, + "step": 4935 + }, + { + "epoch": 1.9266198282591724, + "grad_norm": 0.589261505743676, + "learning_rate": 3.41867614543441e-06, + "loss": 0.5946, + "step": 4936 + }, + { + "epoch": 1.9270101483216238, + "grad_norm": 0.5004384119930411, + "learning_rate": 3.4165219514267245e-06, + "loss": 0.5926, + "step": 4937 + }, + { + "epoch": 1.927400468384075, + "grad_norm": 0.48761940463109477, + "learning_rate": 3.4143680840642052e-06, + "loss": 0.5939, + "step": 4938 + }, + { + "epoch": 1.9277907884465262, + "grad_norm": 0.5572806461320264, + "learning_rate": 3.4122145437911568e-06, + "loss": 0.6141, + "step": 4939 + }, + { + "epoch": 1.9281811085089773, + "grad_norm": 0.5568497846566862, + "learning_rate": 3.41006133105182e-06, + "loss": 0.6318, + "step": 4940 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.5418024754256936, + "learning_rate": 3.4079084462903667e-06, + "loss": 0.5965, + "step": 4941 + }, + { + "epoch": 1.92896174863388, + "grad_norm": 0.5283736829958918, + "learning_rate": 3.4057558899509025e-06, + "loss": 0.5946, + "step": 4942 + }, + { + "epoch": 1.929352068696331, + "grad_norm": 0.4968400289477708, + "learning_rate": 3.40360366247746e-06, + "loss": 0.5979, + "step": 4943 + }, + { + "epoch": 1.9297423887587821, + "grad_norm": 0.5561769049655824, + "learning_rate": 3.40145176431401e-06, + "loss": 0.615, + "step": 4944 + }, + { + "epoch": 1.9301327088212334, + "grad_norm": 0.6490329128295425, + "learning_rate": 3.3993001959044543e-06, + "loss": 0.6046, + "step": 4945 + }, + { + "epoch": 1.9305230288836848, + "grad_norm": 0.5592690823022212, + "learning_rate": 3.397148957692624e-06, + "loss": 0.5945, + "step": 4946 + }, + { + "epoch": 1.9309133489461359, + "grad_norm": 0.4947199734892771, + "learning_rate": 3.3949980501222857e-06, + "loss": 0.5999, + "step": 4947 + }, + { + "epoch": 1.931303669008587, + "grad_norm": 0.5165452291942829, + "learning_rate": 3.392847473637135e-06, + "loss": 0.5623, + "step": 4948 + }, + { + "epoch": 1.9316939890710383, + "grad_norm": 0.5291451855088531, + "learning_rate": 3.3906972286807986e-06, + "loss": 0.6048, + "step": 4949 + }, + { + "epoch": 1.9320843091334896, + "grad_norm": 0.6187614526663465, + "learning_rate": 3.3885473156968375e-06, + "loss": 0.5438, + "step": 4950 + }, + { + "epoch": 1.9324746291959407, + "grad_norm": 0.4608713155707274, + "learning_rate": 3.386397735128744e-06, + "loss": 0.5565, + "step": 4951 + }, + { + "epoch": 1.9328649492583918, + "grad_norm": 0.5616145370692451, + "learning_rate": 3.38424848741994e-06, + "loss": 0.5847, + "step": 4952 + }, + { + "epoch": 1.9332552693208431, + "grad_norm": 0.5765199556990753, + "learning_rate": 3.3820995730137772e-06, + "loss": 0.6304, + "step": 4953 + }, + { + "epoch": 1.9336455893832944, + "grad_norm": 0.5013963339766859, + "learning_rate": 3.3799509923535434e-06, + "loss": 0.5865, + "step": 4954 + }, + { + "epoch": 1.9340359094457455, + "grad_norm": 0.49167476251493447, + "learning_rate": 3.377802745882455e-06, + "loss": 0.5856, + "step": 4955 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.5022775580280043, + "learning_rate": 3.3756548340436578e-06, + "loss": 0.5681, + "step": 4956 + }, + { + "epoch": 1.934816549570648, + "grad_norm": 0.4988745949568484, + "learning_rate": 3.373507257280232e-06, + "loss": 0.5865, + "step": 4957 + }, + { + "epoch": 1.9352068696330993, + "grad_norm": 0.5342065147856839, + "learning_rate": 3.371360016035186e-06, + "loss": 0.6208, + "step": 4958 + }, + { + "epoch": 1.9355971896955504, + "grad_norm": 0.46846933582785527, + "learning_rate": 3.3692131107514624e-06, + "loss": 0.6096, + "step": 4959 + }, + { + "epoch": 1.9359875097580015, + "grad_norm": 0.498325439178535, + "learning_rate": 3.3670665418719285e-06, + "loss": 0.5936, + "step": 4960 + }, + { + "epoch": 1.9363778298204528, + "grad_norm": 0.4762396606444344, + "learning_rate": 3.364920309839387e-06, + "loss": 0.5956, + "step": 4961 + }, + { + "epoch": 1.9367681498829041, + "grad_norm": 0.48608176599931946, + "learning_rate": 3.3627744150965697e-06, + "loss": 0.5804, + "step": 4962 + }, + { + "epoch": 1.9371584699453552, + "grad_norm": 0.5180353607642473, + "learning_rate": 3.360628858086139e-06, + "loss": 0.616, + "step": 4963 + }, + { + "epoch": 1.9375487900078063, + "grad_norm": 0.49386951950481756, + "learning_rate": 3.358483639250688e-06, + "loss": 0.5655, + "step": 4964 + }, + { + "epoch": 1.9379391100702577, + "grad_norm": 0.5850892961810731, + "learning_rate": 3.3563387590327413e-06, + "loss": 0.5954, + "step": 4965 + }, + { + "epoch": 1.938329430132709, + "grad_norm": 0.6111952806852405, + "learning_rate": 3.3541942178747475e-06, + "loss": 0.6291, + "step": 4966 + }, + { + "epoch": 1.93871975019516, + "grad_norm": 0.5577942567276432, + "learning_rate": 3.352050016219094e-06, + "loss": 0.6112, + "step": 4967 + }, + { + "epoch": 1.9391100702576112, + "grad_norm": 0.5953400923111515, + "learning_rate": 3.3499061545080923e-06, + "loss": 0.6107, + "step": 4968 + }, + { + "epoch": 1.9395003903200625, + "grad_norm": 0.6122324888293239, + "learning_rate": 3.3477626331839864e-06, + "loss": 0.6165, + "step": 4969 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.5462006790082127, + "learning_rate": 3.345619452688946e-06, + "loss": 0.6068, + "step": 4970 + }, + { + "epoch": 1.940281030444965, + "grad_norm": 0.5874373936830051, + "learning_rate": 3.3434766134650766e-06, + "loss": 0.5959, + "step": 4971 + }, + { + "epoch": 1.940671350507416, + "grad_norm": 0.498445415822853, + "learning_rate": 3.3413341159544086e-06, + "loss": 0.6326, + "step": 4972 + }, + { + "epoch": 1.9410616705698673, + "grad_norm": 0.5270002494302582, + "learning_rate": 3.3391919605989054e-06, + "loss": 0.5522, + "step": 4973 + }, + { + "epoch": 1.9414519906323187, + "grad_norm": 0.4910303071333285, + "learning_rate": 3.337050147840456e-06, + "loss": 0.5941, + "step": 4974 + }, + { + "epoch": 1.9418423106947698, + "grad_norm": 0.5783626917018142, + "learning_rate": 3.3349086781208817e-06, + "loss": 0.6428, + "step": 4975 + }, + { + "epoch": 1.9422326307572209, + "grad_norm": 0.7171785728291639, + "learning_rate": 3.3327675518819318e-06, + "loss": 0.6111, + "step": 4976 + }, + { + "epoch": 1.9426229508196722, + "grad_norm": 0.5263752501643835, + "learning_rate": 3.3306267695652873e-06, + "loss": 0.5932, + "step": 4977 + }, + { + "epoch": 1.9430132708821235, + "grad_norm": 0.5391000501225015, + "learning_rate": 3.3284863316125523e-06, + "loss": 0.5749, + "step": 4978 + }, + { + "epoch": 1.9434035909445746, + "grad_norm": 0.4670902686380007, + "learning_rate": 3.3263462384652647e-06, + "loss": 0.613, + "step": 4979 + }, + { + "epoch": 1.9437939110070257, + "grad_norm": 0.5185636564029791, + "learning_rate": 3.32420649056489e-06, + "loss": 0.5788, + "step": 4980 + }, + { + "epoch": 1.944184231069477, + "grad_norm": 0.6121893203683391, + "learning_rate": 3.322067088352824e-06, + "loss": 0.5827, + "step": 4981 + }, + { + "epoch": 1.9445745511319283, + "grad_norm": 0.5061630347776995, + "learning_rate": 3.319928032270389e-06, + "loss": 0.6158, + "step": 4982 + }, + { + "epoch": 1.9449648711943794, + "grad_norm": 0.5436494422182835, + "learning_rate": 3.3177893227588355e-06, + "loss": 0.595, + "step": 4983 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 0.5196069761636833, + "learning_rate": 3.3156509602593444e-06, + "loss": 0.6193, + "step": 4984 + }, + { + "epoch": 1.9457455113192819, + "grad_norm": 0.5075848146033656, + "learning_rate": 3.3135129452130254e-06, + "loss": 0.6063, + "step": 4985 + }, + { + "epoch": 1.9461358313817332, + "grad_norm": 0.5945511068668706, + "learning_rate": 3.311375278060916e-06, + "loss": 0.5897, + "step": 4986 + }, + { + "epoch": 1.9465261514441843, + "grad_norm": 0.5701619924856142, + "learning_rate": 3.3092379592439783e-06, + "loss": 0.5694, + "step": 4987 + }, + { + "epoch": 1.9469164715066354, + "grad_norm": 0.6183837091441778, + "learning_rate": 3.3071009892031064e-06, + "loss": 0.6213, + "step": 4988 + }, + { + "epoch": 1.9473067915690867, + "grad_norm": 0.5189466558011996, + "learning_rate": 3.304964368379123e-06, + "loss": 0.5951, + "step": 4989 + }, + { + "epoch": 1.947697111631538, + "grad_norm": 0.5029124268438356, + "learning_rate": 3.302828097212777e-06, + "loss": 0.5845, + "step": 4990 + }, + { + "epoch": 1.9480874316939891, + "grad_norm": 0.4893559124334553, + "learning_rate": 3.3006921761447452e-06, + "loss": 0.607, + "step": 4991 + }, + { + "epoch": 1.9484777517564402, + "grad_norm": 0.5233913114551146, + "learning_rate": 3.2985566056156325e-06, + "loss": 0.63, + "step": 4992 + }, + { + "epoch": 1.9488680718188915, + "grad_norm": 0.5737297409247031, + "learning_rate": 3.296421386065972e-06, + "loss": 0.578, + "step": 4993 + }, + { + "epoch": 1.9492583918813429, + "grad_norm": 0.5574379157016719, + "learning_rate": 3.294286517936224e-06, + "loss": 0.6043, + "step": 4994 + }, + { + "epoch": 1.949648711943794, + "grad_norm": 0.528863914068902, + "learning_rate": 3.2921520016667787e-06, + "loss": 0.6334, + "step": 4995 + }, + { + "epoch": 1.950039032006245, + "grad_norm": 0.5052832405712294, + "learning_rate": 3.290017837697945e-06, + "loss": 0.5789, + "step": 4996 + }, + { + "epoch": 1.9504293520686962, + "grad_norm": 0.5842293771806024, + "learning_rate": 3.287884026469971e-06, + "loss": 0.6157, + "step": 4997 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.6121244649727577, + "learning_rate": 3.285750568423023e-06, + "loss": 0.5956, + "step": 4998 + }, + { + "epoch": 1.9512099921935988, + "grad_norm": 0.546594743464118, + "learning_rate": 3.2836174639972006e-06, + "loss": 0.5996, + "step": 4999 + }, + { + "epoch": 1.95160031225605, + "grad_norm": 0.5775367332822804, + "learning_rate": 3.2814847136325254e-06, + "loss": 0.5681, + "step": 5000 + }, + { + "epoch": 1.951990632318501, + "grad_norm": 0.5051924927437028, + "learning_rate": 3.27935231776895e-06, + "loss": 0.6097, + "step": 5001 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.5826533925233189, + "learning_rate": 3.277220276846351e-06, + "loss": 0.5769, + "step": 5002 + }, + { + "epoch": 1.9527712724434036, + "grad_norm": 0.5191252940220916, + "learning_rate": 3.2750885913045367e-06, + "loss": 0.6, + "step": 5003 + }, + { + "epoch": 1.9531615925058547, + "grad_norm": 0.5016538752266475, + "learning_rate": 3.2729572615832323e-06, + "loss": 0.5909, + "step": 5004 + }, + { + "epoch": 1.9535519125683058, + "grad_norm": 0.5296935140848184, + "learning_rate": 3.2708262881220987e-06, + "loss": 0.6338, + "step": 5005 + }, + { + "epoch": 1.9539422326307572, + "grad_norm": 0.4725145048899566, + "learning_rate": 3.2686956713607198e-06, + "loss": 0.5849, + "step": 5006 + }, + { + "epoch": 1.9543325526932085, + "grad_norm": 0.5897957817340923, + "learning_rate": 3.266565411738607e-06, + "loss": 0.6127, + "step": 5007 + }, + { + "epoch": 1.9547228727556596, + "grad_norm": 0.5227720055352724, + "learning_rate": 3.2644355096951968e-06, + "loss": 0.6047, + "step": 5008 + }, + { + "epoch": 1.9551131928181107, + "grad_norm": 0.5582687692771743, + "learning_rate": 3.262305965669853e-06, + "loss": 0.5626, + "step": 5009 + }, + { + "epoch": 1.955503512880562, + "grad_norm": 0.4366608341004959, + "learning_rate": 3.260176780101865e-06, + "loss": 0.5897, + "step": 5010 + }, + { + "epoch": 1.9558938329430133, + "grad_norm": 0.4683655602022488, + "learning_rate": 3.2580479534304475e-06, + "loss": 0.5916, + "step": 5011 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 0.5860728508955639, + "learning_rate": 3.255919486094745e-06, + "loss": 0.5959, + "step": 5012 + }, + { + "epoch": 1.9566744730679155, + "grad_norm": 0.47826290342733246, + "learning_rate": 3.2537913785338217e-06, + "loss": 0.6144, + "step": 5013 + }, + { + "epoch": 1.9570647931303669, + "grad_norm": 0.4784617585995019, + "learning_rate": 3.2516636311866704e-06, + "loss": 0.6226, + "step": 5014 + }, + { + "epoch": 1.9574551131928182, + "grad_norm": 0.5376252985950034, + "learning_rate": 3.249536244492213e-06, + "loss": 0.5708, + "step": 5015 + }, + { + "epoch": 1.9578454332552693, + "grad_norm": 0.5681971787932719, + "learning_rate": 3.247409218889291e-06, + "loss": 0.6235, + "step": 5016 + }, + { + "epoch": 1.9582357533177204, + "grad_norm": 0.5029132726131385, + "learning_rate": 3.2452825548166754e-06, + "loss": 0.6441, + "step": 5017 + }, + { + "epoch": 1.9586260733801717, + "grad_norm": 0.5452584365647788, + "learning_rate": 3.2431562527130622e-06, + "loss": 0.628, + "step": 5018 + }, + { + "epoch": 1.959016393442623, + "grad_norm": 0.49476375716998017, + "learning_rate": 3.2410303130170712e-06, + "loss": 0.6067, + "step": 5019 + }, + { + "epoch": 1.9594067135050741, + "grad_norm": 0.5425524928082697, + "learning_rate": 3.2389047361672495e-06, + "loss": 0.5734, + "step": 5020 + }, + { + "epoch": 1.9597970335675252, + "grad_norm": 0.5268703525023692, + "learning_rate": 3.2367795226020694e-06, + "loss": 0.5702, + "step": 5021 + }, + { + "epoch": 1.9601873536299765, + "grad_norm": 0.5911114285931762, + "learning_rate": 3.2346546727599224e-06, + "loss": 0.5923, + "step": 5022 + }, + { + "epoch": 1.9605776736924279, + "grad_norm": 0.48978852604198936, + "learning_rate": 3.2325301870791335e-06, + "loss": 0.6025, + "step": 5023 + }, + { + "epoch": 1.960967993754879, + "grad_norm": 0.5262045767744443, + "learning_rate": 3.230406065997948e-06, + "loss": 0.5612, + "step": 5024 + }, + { + "epoch": 1.96135831381733, + "grad_norm": 0.49523233903069563, + "learning_rate": 3.2282823099545364e-06, + "loss": 0.5884, + "step": 5025 + }, + { + "epoch": 1.9617486338797814, + "grad_norm": 0.5556689024371486, + "learning_rate": 3.226158919386994e-06, + "loss": 0.5626, + "step": 5026 + }, + { + "epoch": 1.9621389539422327, + "grad_norm": 0.47648122332807225, + "learning_rate": 3.2240358947333416e-06, + "loss": 0.6185, + "step": 5027 + }, + { + "epoch": 1.9625292740046838, + "grad_norm": 0.5637461180879428, + "learning_rate": 3.221913236431524e-06, + "loss": 0.5792, + "step": 5028 + }, + { + "epoch": 1.962919594067135, + "grad_norm": 0.5031762545268024, + "learning_rate": 3.2197909449194096e-06, + "loss": 0.5826, + "step": 5029 + }, + { + "epoch": 1.9633099141295862, + "grad_norm": 0.5166121655448654, + "learning_rate": 3.2176690206347927e-06, + "loss": 0.6108, + "step": 5030 + }, + { + "epoch": 1.9637002341920375, + "grad_norm": 0.5349748884125999, + "learning_rate": 3.2155474640153887e-06, + "loss": 0.5647, + "step": 5031 + }, + { + "epoch": 1.9640905542544886, + "grad_norm": 0.5416481098191669, + "learning_rate": 3.213426275498843e-06, + "loss": 0.5948, + "step": 5032 + }, + { + "epoch": 1.9644808743169397, + "grad_norm": 0.48385380031702085, + "learning_rate": 3.2113054555227176e-06, + "loss": 0.5989, + "step": 5033 + }, + { + "epoch": 1.964871194379391, + "grad_norm": 0.4837527462825018, + "learning_rate": 3.2091850045245043e-06, + "loss": 0.5847, + "step": 5034 + }, + { + "epoch": 1.9652615144418424, + "grad_norm": 0.5401872857352672, + "learning_rate": 3.2070649229416175e-06, + "loss": 0.5644, + "step": 5035 + }, + { + "epoch": 1.9656518345042935, + "grad_norm": 0.4409893937806396, + "learning_rate": 3.2049452112113934e-06, + "loss": 0.5707, + "step": 5036 + }, + { + "epoch": 1.9660421545667446, + "grad_norm": 0.5394400916432204, + "learning_rate": 3.202825869771094e-06, + "loss": 0.5706, + "step": 5037 + }, + { + "epoch": 1.966432474629196, + "grad_norm": 0.5393739519299972, + "learning_rate": 3.200706899057904e-06, + "loss": 0.5391, + "step": 5038 + }, + { + "epoch": 1.9668227946916472, + "grad_norm": 0.6060922322835992, + "learning_rate": 3.198588299508934e-06, + "loss": 0.5646, + "step": 5039 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.5462959467867483, + "learning_rate": 3.1964700715612116e-06, + "loss": 0.5803, + "step": 5040 + }, + { + "epoch": 1.9676034348165494, + "grad_norm": 0.5005603391769181, + "learning_rate": 3.194352215651694e-06, + "loss": 0.615, + "step": 5041 + }, + { + "epoch": 1.9679937548790007, + "grad_norm": 0.5407105559515379, + "learning_rate": 3.1922347322172596e-06, + "loss": 0.5991, + "step": 5042 + }, + { + "epoch": 1.968384074941452, + "grad_norm": 0.532327585577645, + "learning_rate": 3.1901176216947098e-06, + "loss": 0.5964, + "step": 5043 + }, + { + "epoch": 1.9687743950039032, + "grad_norm": 0.5282006021822357, + "learning_rate": 3.18800088452077e-06, + "loss": 0.5726, + "step": 5044 + }, + { + "epoch": 1.9691647150663543, + "grad_norm": 0.5028594345535119, + "learning_rate": 3.1858845211320875e-06, + "loss": 0.5864, + "step": 5045 + }, + { + "epoch": 1.9695550351288056, + "grad_norm": 0.4855963985248474, + "learning_rate": 3.1837685319652324e-06, + "loss": 0.6366, + "step": 5046 + }, + { + "epoch": 1.969945355191257, + "grad_norm": 0.46467515685307303, + "learning_rate": 3.181652917456699e-06, + "loss": 0.6051, + "step": 5047 + }, + { + "epoch": 1.970335675253708, + "grad_norm": 0.4556528878695618, + "learning_rate": 3.179537678042902e-06, + "loss": 0.5962, + "step": 5048 + }, + { + "epoch": 1.970725995316159, + "grad_norm": 0.5187839871042578, + "learning_rate": 3.1774228141601814e-06, + "loss": 0.599, + "step": 5049 + }, + { + "epoch": 1.9711163153786104, + "grad_norm": 0.4777208437740578, + "learning_rate": 3.1753083262447974e-06, + "loss": 0.5892, + "step": 5050 + }, + { + "epoch": 1.9715066354410617, + "grad_norm": 0.5134407601846772, + "learning_rate": 3.1731942147329337e-06, + "loss": 0.6091, + "step": 5051 + }, + { + "epoch": 1.9718969555035128, + "grad_norm": 0.5177766718153304, + "learning_rate": 3.1710804800606966e-06, + "loss": 0.5792, + "step": 5052 + }, + { + "epoch": 1.972287275565964, + "grad_norm": 0.485448568426937, + "learning_rate": 3.1689671226641145e-06, + "loss": 0.5452, + "step": 5053 + }, + { + "epoch": 1.9726775956284153, + "grad_norm": 0.4716048229125042, + "learning_rate": 3.166854142979138e-06, + "loss": 0.6101, + "step": 5054 + }, + { + "epoch": 1.9730679156908666, + "grad_norm": 0.48023073517677123, + "learning_rate": 3.164741541441639e-06, + "loss": 0.5697, + "step": 5055 + }, + { + "epoch": 1.9734582357533177, + "grad_norm": 0.5546954687327366, + "learning_rate": 3.162629318487415e-06, + "loss": 0.5611, + "step": 5056 + }, + { + "epoch": 1.9738485558157688, + "grad_norm": 0.5204961935022526, + "learning_rate": 3.160517474552177e-06, + "loss": 0.6174, + "step": 5057 + }, + { + "epoch": 1.9742388758782201, + "grad_norm": 0.5485720622022164, + "learning_rate": 3.1584060100715663e-06, + "loss": 0.5865, + "step": 5058 + }, + { + "epoch": 1.9746291959406714, + "grad_norm": 0.561938241854932, + "learning_rate": 3.156294925481144e-06, + "loss": 0.5956, + "step": 5059 + }, + { + "epoch": 1.9750195160031225, + "grad_norm": 0.4775940284269636, + "learning_rate": 3.1541842212163886e-06, + "loss": 0.5858, + "step": 5060 + }, + { + "epoch": 1.9754098360655736, + "grad_norm": 0.4829047627704255, + "learning_rate": 3.152073897712706e-06, + "loss": 0.5559, + "step": 5061 + }, + { + "epoch": 1.975800156128025, + "grad_norm": 0.524190856639467, + "learning_rate": 3.149963955405421e-06, + "loss": 0.6188, + "step": 5062 + }, + { + "epoch": 1.9761904761904763, + "grad_norm": 0.545368720274054, + "learning_rate": 3.147854394729778e-06, + "loss": 0.5869, + "step": 5063 + }, + { + "epoch": 1.9765807962529274, + "grad_norm": 0.5240898578786535, + "learning_rate": 3.145745216120944e-06, + "loss": 0.5785, + "step": 5064 + }, + { + "epoch": 1.9769711163153785, + "grad_norm": 0.4962395326524517, + "learning_rate": 3.143636420014009e-06, + "loss": 0.5718, + "step": 5065 + }, + { + "epoch": 1.9773614363778298, + "grad_norm": 0.48903168925607166, + "learning_rate": 3.1415280068439824e-06, + "loss": 0.5685, + "step": 5066 + }, + { + "epoch": 1.9777517564402811, + "grad_norm": 0.5002610416219971, + "learning_rate": 3.1394199770457926e-06, + "loss": 0.5719, + "step": 5067 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 0.533044212493449, + "learning_rate": 3.137312331054293e-06, + "loss": 0.5916, + "step": 5068 + }, + { + "epoch": 1.9785323965651833, + "grad_norm": 0.5094669100633438, + "learning_rate": 3.1352050693042552e-06, + "loss": 0.6171, + "step": 5069 + }, + { + "epoch": 1.9789227166276346, + "grad_norm": 0.5348741300969068, + "learning_rate": 3.1330981922303726e-06, + "loss": 0.6166, + "step": 5070 + }, + { + "epoch": 1.979313036690086, + "grad_norm": 0.5336670148220736, + "learning_rate": 3.1309917002672585e-06, + "loss": 0.5933, + "step": 5071 + }, + { + "epoch": 1.979703356752537, + "grad_norm": 0.5064994595284839, + "learning_rate": 3.1288855938494465e-06, + "loss": 0.5791, + "step": 5072 + }, + { + "epoch": 1.9800936768149882, + "grad_norm": 0.47850731435022037, + "learning_rate": 3.1267798734113938e-06, + "loss": 0.5943, + "step": 5073 + }, + { + "epoch": 1.9804839968774395, + "grad_norm": 0.46220016379414586, + "learning_rate": 3.1246745393874744e-06, + "loss": 0.5943, + "step": 5074 + }, + { + "epoch": 1.9808743169398908, + "grad_norm": 0.5341325376357235, + "learning_rate": 3.122569592211982e-06, + "loss": 0.5746, + "step": 5075 + }, + { + "epoch": 1.981264637002342, + "grad_norm": 0.4778300556558636, + "learning_rate": 3.120465032319133e-06, + "loss": 0.5722, + "step": 5076 + }, + { + "epoch": 1.981654957064793, + "grad_norm": 0.49879213811981354, + "learning_rate": 3.118360860143063e-06, + "loss": 0.5787, + "step": 5077 + }, + { + "epoch": 1.9820452771272443, + "grad_norm": 0.5587458571578963, + "learning_rate": 3.116257076117829e-06, + "loss": 0.5811, + "step": 5078 + }, + { + "epoch": 1.9824355971896956, + "grad_norm": 0.4768824757266268, + "learning_rate": 3.114153680677405e-06, + "loss": 0.5973, + "step": 5079 + }, + { + "epoch": 1.9828259172521467, + "grad_norm": 0.5039449043179405, + "learning_rate": 3.1120506742556873e-06, + "loss": 0.6158, + "step": 5080 + }, + { + "epoch": 1.9832162373145978, + "grad_norm": 0.641563815648241, + "learning_rate": 3.1099480572864905e-06, + "loss": 0.5662, + "step": 5081 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 0.5219243600255463, + "learning_rate": 3.1078458302035498e-06, + "loss": 0.5549, + "step": 5082 + }, + { + "epoch": 1.9839968774395005, + "grad_norm": 0.49968439238386647, + "learning_rate": 3.105743993440522e-06, + "loss": 0.5805, + "step": 5083 + }, + { + "epoch": 1.9843871975019516, + "grad_norm": 0.5028565382901788, + "learning_rate": 3.1036425474309772e-06, + "loss": 0.6074, + "step": 5084 + }, + { + "epoch": 1.9847775175644027, + "grad_norm": 0.49391694547080284, + "learning_rate": 3.10154149260841e-06, + "loss": 0.5973, + "step": 5085 + }, + { + "epoch": 1.985167837626854, + "grad_norm": 0.5215135822759702, + "learning_rate": 3.0994408294062333e-06, + "loss": 0.5975, + "step": 5086 + }, + { + "epoch": 1.9855581576893053, + "grad_norm": 0.5757235247596899, + "learning_rate": 3.097340558257779e-06, + "loss": 0.5782, + "step": 5087 + }, + { + "epoch": 1.9859484777517564, + "grad_norm": 0.5396800315852412, + "learning_rate": 3.095240679596299e-06, + "loss": 0.5995, + "step": 5088 + }, + { + "epoch": 1.9863387978142075, + "grad_norm": 0.5302345149043153, + "learning_rate": 3.093141193854961e-06, + "loss": 0.5899, + "step": 5089 + }, + { + "epoch": 1.9867291178766588, + "grad_norm": 0.5802219452166234, + "learning_rate": 3.091042101466856e-06, + "loss": 0.5741, + "step": 5090 + }, + { + "epoch": 1.9871194379391102, + "grad_norm": 0.4969084107702309, + "learning_rate": 3.0889434028649925e-06, + "loss": 0.6262, + "step": 5091 + }, + { + "epoch": 1.9875097580015613, + "grad_norm": 0.5082800634341965, + "learning_rate": 3.0868450984822956e-06, + "loss": 0.5935, + "step": 5092 + }, + { + "epoch": 1.9879000780640124, + "grad_norm": 0.47485314174282084, + "learning_rate": 3.08474718875161e-06, + "loss": 0.6142, + "step": 5093 + }, + { + "epoch": 1.9882903981264637, + "grad_norm": 0.5417645969992595, + "learning_rate": 3.0826496741057e-06, + "loss": 0.5882, + "step": 5094 + }, + { + "epoch": 1.988680718188915, + "grad_norm": 0.46436165606885643, + "learning_rate": 3.0805525549772497e-06, + "loss": 0.5981, + "step": 5095 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 0.49288432288974926, + "learning_rate": 3.0784558317988565e-06, + "loss": 0.5904, + "step": 5096 + }, + { + "epoch": 1.9894613583138172, + "grad_norm": 0.4942825951800972, + "learning_rate": 3.0763595050030424e-06, + "loss": 0.6039, + "step": 5097 + }, + { + "epoch": 1.9898516783762685, + "grad_norm": 0.5708883418927566, + "learning_rate": 3.074263575022244e-06, + "loss": 0.5563, + "step": 5098 + }, + { + "epoch": 1.9902419984387199, + "grad_norm": 0.5107321775465435, + "learning_rate": 3.0721680422888157e-06, + "loss": 0.5983, + "step": 5099 + }, + { + "epoch": 1.990632318501171, + "grad_norm": 0.5617224413954153, + "learning_rate": 3.0700729072350344e-06, + "loss": 0.5787, + "step": 5100 + }, + { + "epoch": 1.991022638563622, + "grad_norm": 0.5097389232686307, + "learning_rate": 3.067978170293087e-06, + "loss": 0.582, + "step": 5101 + }, + { + "epoch": 1.9914129586260734, + "grad_norm": 0.4754134537991157, + "learning_rate": 3.0658838318950843e-06, + "loss": 0.6132, + "step": 5102 + }, + { + "epoch": 1.9918032786885247, + "grad_norm": 0.5134234281056047, + "learning_rate": 3.0637898924730547e-06, + "loss": 0.6252, + "step": 5103 + }, + { + "epoch": 1.9921935987509758, + "grad_norm": 0.4426964484111848, + "learning_rate": 3.061696352458942e-06, + "loss": 0.6082, + "step": 5104 + }, + { + "epoch": 1.992583918813427, + "grad_norm": 0.5162687697552266, + "learning_rate": 3.05960321228461e-06, + "loss": 0.6153, + "step": 5105 + }, + { + "epoch": 1.9929742388758782, + "grad_norm": 0.5170707888563608, + "learning_rate": 3.057510472381837e-06, + "loss": 0.6506, + "step": 5106 + }, + { + "epoch": 1.9933645589383295, + "grad_norm": 0.48592931250804133, + "learning_rate": 3.055418133182321e-06, + "loss": 0.6152, + "step": 5107 + }, + { + "epoch": 1.9937548790007806, + "grad_norm": 0.49492572238054217, + "learning_rate": 3.0533261951176786e-06, + "loss": 0.6236, + "step": 5108 + }, + { + "epoch": 1.9941451990632317, + "grad_norm": 0.46763065087289896, + "learning_rate": 3.0512346586194397e-06, + "loss": 0.5985, + "step": 5109 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.5835849859467953, + "learning_rate": 3.0491435241190527e-06, + "loss": 0.6004, + "step": 5110 + }, + { + "epoch": 1.9949258391881344, + "grad_norm": 0.5214295223526024, + "learning_rate": 3.0470527920478856e-06, + "loss": 0.6026, + "step": 5111 + }, + { + "epoch": 1.9953161592505855, + "grad_norm": 0.4740020420035232, + "learning_rate": 3.0449624628372214e-06, + "loss": 0.567, + "step": 5112 + }, + { + "epoch": 1.9957064793130366, + "grad_norm": 0.43432160998243147, + "learning_rate": 3.0428725369182595e-06, + "loss": 0.6099, + "step": 5113 + }, + { + "epoch": 1.996096799375488, + "grad_norm": 0.5050237798449388, + "learning_rate": 3.040783014722117e-06, + "loss": 0.6076, + "step": 5114 + }, + { + "epoch": 1.9964871194379392, + "grad_norm": 0.4887981204016288, + "learning_rate": 3.0386938966798275e-06, + "loss": 0.616, + "step": 5115 + }, + { + "epoch": 1.9968774395003903, + "grad_norm": 0.44110403546617477, + "learning_rate": 3.036605183222342e-06, + "loss": 0.5838, + "step": 5116 + }, + { + "epoch": 1.9972677595628414, + "grad_norm": 0.46148786006443254, + "learning_rate": 3.0345168747805264e-06, + "loss": 0.584, + "step": 5117 + }, + { + "epoch": 1.9976580796252927, + "grad_norm": 0.49104322334502964, + "learning_rate": 3.0324289717851662e-06, + "loss": 0.6103, + "step": 5118 + }, + { + "epoch": 1.998048399687744, + "grad_norm": 0.49796309464847155, + "learning_rate": 3.030341474666958e-06, + "loss": 0.5669, + "step": 5119 + }, + { + "epoch": 1.9984387197501952, + "grad_norm": 0.5019089000603645, + "learning_rate": 3.0282543838565185e-06, + "loss": 0.6127, + "step": 5120 + }, + { + "epoch": 1.9988290398126463, + "grad_norm": 0.46788154169126284, + "learning_rate": 3.026167699784381e-06, + "loss": 0.5918, + "step": 5121 + }, + { + "epoch": 1.9992193598750976, + "grad_norm": 0.4963054110002408, + "learning_rate": 3.0240814228809923e-06, + "loss": 0.6174, + "step": 5122 + }, + { + "epoch": 1.999609679937549, + "grad_norm": 0.4588040595375725, + "learning_rate": 3.0219955535767175e-06, + "loss": 0.5654, + "step": 5123 + }, + { + "epoch": 2.0, + "grad_norm": 0.460783903620839, + "learning_rate": 3.0199100923018367e-06, + "loss": 0.5799, + "step": 5124 + }, + { + "epoch": 2.000390320062451, + "grad_norm": 0.4888780607064845, + "learning_rate": 3.017825039486546e-06, + "loss": 0.5587, + "step": 5125 + }, + { + "epoch": 2.000780640124902, + "grad_norm": 0.48519051116271644, + "learning_rate": 3.0157403955609555e-06, + "loss": 0.556, + "step": 5126 + }, + { + "epoch": 2.0011709601873537, + "grad_norm": 0.5164481015852015, + "learning_rate": 3.013656160955095e-06, + "loss": 0.5098, + "step": 5127 + }, + { + "epoch": 2.001561280249805, + "grad_norm": 0.5664179783712595, + "learning_rate": 3.0115723360989056e-06, + "loss": 0.6019, + "step": 5128 + }, + { + "epoch": 2.001951600312256, + "grad_norm": 0.5064742544808374, + "learning_rate": 3.0094889214222477e-06, + "loss": 0.5841, + "step": 5129 + }, + { + "epoch": 2.002341920374707, + "grad_norm": 0.47746076821576056, + "learning_rate": 3.007405917354892e-06, + "loss": 0.5585, + "step": 5130 + }, + { + "epoch": 2.0027322404371586, + "grad_norm": 0.4740672686544043, + "learning_rate": 3.005323324326529e-06, + "loss": 0.5996, + "step": 5131 + }, + { + "epoch": 2.0031225604996097, + "grad_norm": 0.571173046992003, + "learning_rate": 3.0032411427667633e-06, + "loss": 0.5819, + "step": 5132 + }, + { + "epoch": 2.003512880562061, + "grad_norm": 0.5394720797507054, + "learning_rate": 3.0011593731051147e-06, + "loss": 0.5951, + "step": 5133 + }, + { + "epoch": 2.003903200624512, + "grad_norm": 0.5106024159621855, + "learning_rate": 2.9990780157710166e-06, + "loss": 0.557, + "step": 5134 + }, + { + "epoch": 2.0042935206869634, + "grad_norm": 0.510868231494023, + "learning_rate": 2.996997071193819e-06, + "loss": 0.5525, + "step": 5135 + }, + { + "epoch": 2.0046838407494145, + "grad_norm": 0.4754184396937899, + "learning_rate": 2.994916539802788e-06, + "loss": 0.5673, + "step": 5136 + }, + { + "epoch": 2.0050741608118656, + "grad_norm": 0.5124088175523149, + "learning_rate": 2.9928364220270976e-06, + "loss": 0.5787, + "step": 5137 + }, + { + "epoch": 2.0054644808743167, + "grad_norm": 0.45836536776473064, + "learning_rate": 2.990756718295846e-06, + "loss": 0.5743, + "step": 5138 + }, + { + "epoch": 2.0058548009367683, + "grad_norm": 0.44688764726663205, + "learning_rate": 2.9886774290380394e-06, + "loss": 0.5592, + "step": 5139 + }, + { + "epoch": 2.0062451209992194, + "grad_norm": 0.4768121647278504, + "learning_rate": 2.986598554682601e-06, + "loss": 0.5708, + "step": 5140 + }, + { + "epoch": 2.0066354410616705, + "grad_norm": 0.5247695464785508, + "learning_rate": 2.9845200956583675e-06, + "loss": 0.5586, + "step": 5141 + }, + { + "epoch": 2.0070257611241216, + "grad_norm": 0.4874378496288254, + "learning_rate": 2.982442052394093e-06, + "loss": 0.5658, + "step": 5142 + }, + { + "epoch": 2.007416081186573, + "grad_norm": 0.6221165897058882, + "learning_rate": 2.98036442531844e-06, + "loss": 0.5802, + "step": 5143 + }, + { + "epoch": 2.007806401249024, + "grad_norm": 0.4684528313025157, + "learning_rate": 2.9782872148599908e-06, + "loss": 0.6005, + "step": 5144 + }, + { + "epoch": 2.0081967213114753, + "grad_norm": 0.4794115262309572, + "learning_rate": 2.9762104214472376e-06, + "loss": 0.541, + "step": 5145 + }, + { + "epoch": 2.0085870413739264, + "grad_norm": 0.4536032503387319, + "learning_rate": 2.9741340455085876e-06, + "loss": 0.552, + "step": 5146 + }, + { + "epoch": 2.008977361436378, + "grad_norm": 0.44328608858218355, + "learning_rate": 2.9720580874723644e-06, + "loss": 0.5551, + "step": 5147 + }, + { + "epoch": 2.009367681498829, + "grad_norm": 0.44522351583136566, + "learning_rate": 2.9699825477668027e-06, + "loss": 0.5894, + "step": 5148 + }, + { + "epoch": 2.00975800156128, + "grad_norm": 0.45959242152085483, + "learning_rate": 2.9679074268200524e-06, + "loss": 0.5943, + "step": 5149 + }, + { + "epoch": 2.0101483216237312, + "grad_norm": 0.4811492270681199, + "learning_rate": 2.965832725060176e-06, + "loss": 0.5721, + "step": 5150 + }, + { + "epoch": 2.010538641686183, + "grad_norm": 0.4896817835822205, + "learning_rate": 2.96375844291515e-06, + "loss": 0.6055, + "step": 5151 + }, + { + "epoch": 2.010928961748634, + "grad_norm": 0.48276879180577414, + "learning_rate": 2.961684580812865e-06, + "loss": 0.5539, + "step": 5152 + }, + { + "epoch": 2.011319281811085, + "grad_norm": 0.5003692440592393, + "learning_rate": 2.9596111391811243e-06, + "loss": 0.5622, + "step": 5153 + }, + { + "epoch": 2.011709601873536, + "grad_norm": 0.4412248742785421, + "learning_rate": 2.9575381184476426e-06, + "loss": 0.5317, + "step": 5154 + }, + { + "epoch": 2.0120999219359876, + "grad_norm": 0.496395613270732, + "learning_rate": 2.9554655190400506e-06, + "loss": 0.5812, + "step": 5155 + }, + { + "epoch": 2.0124902419984387, + "grad_norm": 0.5404381883025023, + "learning_rate": 2.9533933413858915e-06, + "loss": 0.5659, + "step": 5156 + }, + { + "epoch": 2.01288056206089, + "grad_norm": 0.5144583579242866, + "learning_rate": 2.95132158591262e-06, + "loss": 0.5567, + "step": 5157 + }, + { + "epoch": 2.013270882123341, + "grad_norm": 0.45501526230626077, + "learning_rate": 2.9492502530476068e-06, + "loss": 0.6168, + "step": 5158 + }, + { + "epoch": 2.0136612021857925, + "grad_norm": 0.537969223298896, + "learning_rate": 2.947179343218131e-06, + "loss": 0.5659, + "step": 5159 + }, + { + "epoch": 2.0140515222482436, + "grad_norm": 0.5002308984923627, + "learning_rate": 2.945108856851388e-06, + "loss": 0.5815, + "step": 5160 + }, + { + "epoch": 2.0144418423106947, + "grad_norm": 0.46595368575885404, + "learning_rate": 2.9430387943744842e-06, + "loss": 0.5714, + "step": 5161 + }, + { + "epoch": 2.0148321623731458, + "grad_norm": 0.4728993859426223, + "learning_rate": 2.9409691562144414e-06, + "loss": 0.5656, + "step": 5162 + }, + { + "epoch": 2.0152224824355973, + "grad_norm": 0.5471243172610872, + "learning_rate": 2.938899942798188e-06, + "loss": 0.5668, + "step": 5163 + }, + { + "epoch": 2.0156128024980484, + "grad_norm": 0.5047826762648648, + "learning_rate": 2.9368311545525697e-06, + "loss": 0.5785, + "step": 5164 + }, + { + "epoch": 2.0160031225604995, + "grad_norm": 0.6485936752809139, + "learning_rate": 2.9347627919043433e-06, + "loss": 0.5836, + "step": 5165 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.5449050442117856, + "learning_rate": 2.9326948552801764e-06, + "loss": 0.5743, + "step": 5166 + }, + { + "epoch": 2.016783762685402, + "grad_norm": 0.4620114308621424, + "learning_rate": 2.9306273451066515e-06, + "loss": 0.5558, + "step": 5167 + }, + { + "epoch": 2.0171740827478533, + "grad_norm": 0.4836077403179827, + "learning_rate": 2.9285602618102615e-06, + "loss": 0.5819, + "step": 5168 + }, + { + "epoch": 2.0175644028103044, + "grad_norm": 0.5537547426582179, + "learning_rate": 2.92649360581741e-06, + "loss": 0.5486, + "step": 5169 + }, + { + "epoch": 2.0179547228727555, + "grad_norm": 0.5145461393874331, + "learning_rate": 2.9244273775544153e-06, + "loss": 0.5634, + "step": 5170 + }, + { + "epoch": 2.018345042935207, + "grad_norm": 0.49152036497136287, + "learning_rate": 2.922361577447506e-06, + "loss": 0.6051, + "step": 5171 + }, + { + "epoch": 2.018735362997658, + "grad_norm": 0.5241359490937871, + "learning_rate": 2.9202962059228203e-06, + "loss": 0.5713, + "step": 5172 + }, + { + "epoch": 2.019125683060109, + "grad_norm": 0.5326603409986032, + "learning_rate": 2.91823126340641e-06, + "loss": 0.5564, + "step": 5173 + }, + { + "epoch": 2.0195160031225603, + "grad_norm": 0.4932581372269859, + "learning_rate": 2.916166750324242e-06, + "loss": 0.5572, + "step": 5174 + }, + { + "epoch": 2.019906323185012, + "grad_norm": 0.5090974723063758, + "learning_rate": 2.9141026671021854e-06, + "loss": 0.5705, + "step": 5175 + }, + { + "epoch": 2.020296643247463, + "grad_norm": 0.5661622993892144, + "learning_rate": 2.9120390141660317e-06, + "loss": 0.5449, + "step": 5176 + }, + { + "epoch": 2.020686963309914, + "grad_norm": 0.5871217769858396, + "learning_rate": 2.909975791941473e-06, + "loss": 0.5554, + "step": 5177 + }, + { + "epoch": 2.021077283372365, + "grad_norm": 0.4839055741121923, + "learning_rate": 2.9079130008541225e-06, + "loss": 0.5531, + "step": 5178 + }, + { + "epoch": 2.0214676034348167, + "grad_norm": 0.4612616647545535, + "learning_rate": 2.905850641329496e-06, + "loss": 0.5408, + "step": 5179 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.44182143296279036, + "learning_rate": 2.9037887137930287e-06, + "loss": 0.5492, + "step": 5180 + }, + { + "epoch": 2.022248243559719, + "grad_norm": 0.5657564756446796, + "learning_rate": 2.901727218670055e-06, + "loss": 0.5662, + "step": 5181 + }, + { + "epoch": 2.02263856362217, + "grad_norm": 0.4974358934490873, + "learning_rate": 2.8996661563858333e-06, + "loss": 0.5907, + "step": 5182 + }, + { + "epoch": 2.0230288836846215, + "grad_norm": 0.47558001446893144, + "learning_rate": 2.8976055273655233e-06, + "loss": 0.5659, + "step": 5183 + }, + { + "epoch": 2.0234192037470726, + "grad_norm": 0.48878437652027085, + "learning_rate": 2.895545332034197e-06, + "loss": 0.5815, + "step": 5184 + }, + { + "epoch": 2.0238095238095237, + "grad_norm": 0.5124872559204784, + "learning_rate": 2.893485570816843e-06, + "loss": 0.5594, + "step": 5185 + }, + { + "epoch": 2.024199843871975, + "grad_norm": 0.47166843300746497, + "learning_rate": 2.8914262441383504e-06, + "loss": 0.5401, + "step": 5186 + }, + { + "epoch": 2.0245901639344264, + "grad_norm": 0.4806453128756065, + "learning_rate": 2.8893673524235296e-06, + "loss": 0.5411, + "step": 5187 + }, + { + "epoch": 2.0249804839968775, + "grad_norm": 0.5658268096813331, + "learning_rate": 2.8873088960970924e-06, + "loss": 0.5913, + "step": 5188 + }, + { + "epoch": 2.0253708040593286, + "grad_norm": 0.4683219671025996, + "learning_rate": 2.885250875583665e-06, + "loss": 0.5344, + "step": 5189 + }, + { + "epoch": 2.0257611241217797, + "grad_norm": 0.48391651080156944, + "learning_rate": 2.8831932913077797e-06, + "loss": 0.6077, + "step": 5190 + }, + { + "epoch": 2.026151444184231, + "grad_norm": 0.49846607249518676, + "learning_rate": 2.8811361436938866e-06, + "loss": 0.615, + "step": 5191 + }, + { + "epoch": 2.0265417642466823, + "grad_norm": 0.5094908691831074, + "learning_rate": 2.8790794331663373e-06, + "loss": 0.5458, + "step": 5192 + }, + { + "epoch": 2.0269320843091334, + "grad_norm": 0.47627065913864103, + "learning_rate": 2.877023160149401e-06, + "loss": 0.5878, + "step": 5193 + }, + { + "epoch": 2.0273224043715845, + "grad_norm": 0.45507536215665806, + "learning_rate": 2.8749673250672476e-06, + "loss": 0.5942, + "step": 5194 + }, + { + "epoch": 2.027712724434036, + "grad_norm": 0.5204816793497139, + "learning_rate": 2.8729119283439667e-06, + "loss": 0.5617, + "step": 5195 + }, + { + "epoch": 2.028103044496487, + "grad_norm": 0.5255963840782616, + "learning_rate": 2.8708569704035492e-06, + "loss": 0.6078, + "step": 5196 + }, + { + "epoch": 2.0284933645589383, + "grad_norm": 0.479313028581225, + "learning_rate": 2.868802451669901e-06, + "loss": 0.5734, + "step": 5197 + }, + { + "epoch": 2.0288836846213893, + "grad_norm": 0.586988772668649, + "learning_rate": 2.8667483725668304e-06, + "loss": 0.5789, + "step": 5198 + }, + { + "epoch": 2.029274004683841, + "grad_norm": 0.5265907166351842, + "learning_rate": 2.8646947335180654e-06, + "loss": 0.5846, + "step": 5199 + }, + { + "epoch": 2.029664324746292, + "grad_norm": 0.5183999679505495, + "learning_rate": 2.862641534947235e-06, + "loss": 0.579, + "step": 5200 + }, + { + "epoch": 2.030054644808743, + "grad_norm": 0.504699014500216, + "learning_rate": 2.8605887772778777e-06, + "loss": 0.5472, + "step": 5201 + }, + { + "epoch": 2.030444964871194, + "grad_norm": 0.5912159913604103, + "learning_rate": 2.858536460933448e-06, + "loss": 0.5456, + "step": 5202 + }, + { + "epoch": 2.0308352849336457, + "grad_norm": 0.48795034600729137, + "learning_rate": 2.8564845863373003e-06, + "loss": 0.5751, + "step": 5203 + }, + { + "epoch": 2.031225604996097, + "grad_norm": 0.5491064545664027, + "learning_rate": 2.8544331539127056e-06, + "loss": 0.5658, + "step": 5204 + }, + { + "epoch": 2.031615925058548, + "grad_norm": 0.5234989788677229, + "learning_rate": 2.8523821640828365e-06, + "loss": 0.5659, + "step": 5205 + }, + { + "epoch": 2.032006245120999, + "grad_norm": 0.650109181921497, + "learning_rate": 2.8503316172707853e-06, + "loss": 0.5261, + "step": 5206 + }, + { + "epoch": 2.0323965651834506, + "grad_norm": 0.526955717647086, + "learning_rate": 2.8482815138995355e-06, + "loss": 0.6003, + "step": 5207 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 0.521355122392613, + "learning_rate": 2.8462318543919964e-06, + "loss": 0.5928, + "step": 5208 + }, + { + "epoch": 2.0331772053083528, + "grad_norm": 0.48206221293041396, + "learning_rate": 2.8441826391709753e-06, + "loss": 0.6018, + "step": 5209 + }, + { + "epoch": 2.033567525370804, + "grad_norm": 0.533279931918352, + "learning_rate": 2.842133868659195e-06, + "loss": 0.5898, + "step": 5210 + }, + { + "epoch": 2.0339578454332554, + "grad_norm": 0.5556761254801376, + "learning_rate": 2.8400855432792772e-06, + "loss": 0.5639, + "step": 5211 + }, + { + "epoch": 2.0343481654957065, + "grad_norm": 0.5905409255039588, + "learning_rate": 2.8380376634537633e-06, + "loss": 0.5581, + "step": 5212 + }, + { + "epoch": 2.0347384855581576, + "grad_norm": 0.5669910687269056, + "learning_rate": 2.8359902296050933e-06, + "loss": 0.5861, + "step": 5213 + }, + { + "epoch": 2.0351288056206087, + "grad_norm": 0.526198458173336, + "learning_rate": 2.8339432421556178e-06, + "loss": 0.5551, + "step": 5214 + }, + { + "epoch": 2.0355191256830603, + "grad_norm": 0.6078130761837498, + "learning_rate": 2.8318967015275996e-06, + "loss": 0.6039, + "step": 5215 + }, + { + "epoch": 2.0359094457455114, + "grad_norm": 0.5459077287563577, + "learning_rate": 2.8298506081432042e-06, + "loss": 0.5725, + "step": 5216 + }, + { + "epoch": 2.0362997658079625, + "grad_norm": 0.5632485486362528, + "learning_rate": 2.8278049624245065e-06, + "loss": 0.5777, + "step": 5217 + }, + { + "epoch": 2.0366900858704136, + "grad_norm": 0.546208577073993, + "learning_rate": 2.8257597647934876e-06, + "loss": 0.5377, + "step": 5218 + }, + { + "epoch": 2.037080405932865, + "grad_norm": 0.5543908948406605, + "learning_rate": 2.8237150156720416e-06, + "loss": 0.5856, + "step": 5219 + }, + { + "epoch": 2.037470725995316, + "grad_norm": 0.5830848439473834, + "learning_rate": 2.821670715481961e-06, + "loss": 0.5923, + "step": 5220 + }, + { + "epoch": 2.0378610460577673, + "grad_norm": 0.4717904559617981, + "learning_rate": 2.8196268646449573e-06, + "loss": 0.5877, + "step": 5221 + }, + { + "epoch": 2.0382513661202184, + "grad_norm": 0.48128979934000554, + "learning_rate": 2.8175834635826365e-06, + "loss": 0.5672, + "step": 5222 + }, + { + "epoch": 2.03864168618267, + "grad_norm": 0.5395610804332451, + "learning_rate": 2.8155405127165238e-06, + "loss": 0.5581, + "step": 5223 + }, + { + "epoch": 2.039032006245121, + "grad_norm": 0.5006634944451724, + "learning_rate": 2.813498012468043e-06, + "loss": 0.5769, + "step": 5224 + }, + { + "epoch": 2.039422326307572, + "grad_norm": 0.5707574396971057, + "learning_rate": 2.811455963258528e-06, + "loss": 0.5458, + "step": 5225 + }, + { + "epoch": 2.0398126463700232, + "grad_norm": 0.4748396085175528, + "learning_rate": 2.809414365509219e-06, + "loss": 0.5172, + "step": 5226 + }, + { + "epoch": 2.040202966432475, + "grad_norm": 0.47509177723292084, + "learning_rate": 2.8073732196412657e-06, + "loss": 0.5809, + "step": 5227 + }, + { + "epoch": 2.040593286494926, + "grad_norm": 0.48452201243731124, + "learning_rate": 2.8053325260757193e-06, + "loss": 0.5252, + "step": 5228 + }, + { + "epoch": 2.040983606557377, + "grad_norm": 0.5307322069380759, + "learning_rate": 2.8032922852335454e-06, + "loss": 0.5859, + "step": 5229 + }, + { + "epoch": 2.041373926619828, + "grad_norm": 0.5258593724968176, + "learning_rate": 2.801252497535609e-06, + "loss": 0.5776, + "step": 5230 + }, + { + "epoch": 2.0417642466822796, + "grad_norm": 0.4833309100491742, + "learning_rate": 2.7992131634026827e-06, + "loss": 0.5802, + "step": 5231 + }, + { + "epoch": 2.0421545667447307, + "grad_norm": 0.44908558951797756, + "learning_rate": 2.7971742832554504e-06, + "loss": 0.581, + "step": 5232 + }, + { + "epoch": 2.042544886807182, + "grad_norm": 0.48789646503947354, + "learning_rate": 2.795135857514499e-06, + "loss": 0.5446, + "step": 5233 + }, + { + "epoch": 2.042935206869633, + "grad_norm": 0.4972179633789077, + "learning_rate": 2.79309788660032e-06, + "loss": 0.5443, + "step": 5234 + }, + { + "epoch": 2.0433255269320845, + "grad_norm": 0.5241181669298183, + "learning_rate": 2.7910603709333116e-06, + "loss": 0.5717, + "step": 5235 + }, + { + "epoch": 2.0437158469945356, + "grad_norm": 0.5587961238794099, + "learning_rate": 2.7890233109337835e-06, + "loss": 0.5668, + "step": 5236 + }, + { + "epoch": 2.0441061670569867, + "grad_norm": 0.541181668701847, + "learning_rate": 2.7869867070219427e-06, + "loss": 0.5921, + "step": 5237 + }, + { + "epoch": 2.0444964871194378, + "grad_norm": 0.47637659374414015, + "learning_rate": 2.784950559617911e-06, + "loss": 0.5977, + "step": 5238 + }, + { + "epoch": 2.0448868071818893, + "grad_norm": 0.41913668683113814, + "learning_rate": 2.782914869141708e-06, + "loss": 0.5714, + "step": 5239 + }, + { + "epoch": 2.0452771272443404, + "grad_norm": 0.4674639349500011, + "learning_rate": 2.780879636013266e-06, + "loss": 0.5602, + "step": 5240 + }, + { + "epoch": 2.0456674473067915, + "grad_norm": 0.5443377647908381, + "learning_rate": 2.778844860652419e-06, + "loss": 0.5686, + "step": 5241 + }, + { + "epoch": 2.0460577673692426, + "grad_norm": 0.531112064788374, + "learning_rate": 2.7768105434789066e-06, + "loss": 0.574, + "step": 5242 + }, + { + "epoch": 2.046448087431694, + "grad_norm": 0.5128816686542388, + "learning_rate": 2.7747766849123724e-06, + "loss": 0.5852, + "step": 5243 + }, + { + "epoch": 2.0468384074941453, + "grad_norm": 0.5192821174441153, + "learning_rate": 2.7727432853723713e-06, + "loss": 0.5671, + "step": 5244 + }, + { + "epoch": 2.0472287275565964, + "grad_norm": 0.5077960537033102, + "learning_rate": 2.7707103452783564e-06, + "loss": 0.5321, + "step": 5245 + }, + { + "epoch": 2.0476190476190474, + "grad_norm": 0.475706449102311, + "learning_rate": 2.768677865049693e-06, + "loss": 0.5485, + "step": 5246 + }, + { + "epoch": 2.048009367681499, + "grad_norm": 0.5776923642189706, + "learning_rate": 2.766645845105646e-06, + "loss": 0.5847, + "step": 5247 + }, + { + "epoch": 2.04839968774395, + "grad_norm": 0.5541585050438127, + "learning_rate": 2.764614285865386e-06, + "loss": 0.5944, + "step": 5248 + }, + { + "epoch": 2.048790007806401, + "grad_norm": 0.5108935240300206, + "learning_rate": 2.7625831877479925e-06, + "loss": 0.5644, + "step": 5249 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.5309551290386214, + "learning_rate": 2.7605525511724464e-06, + "loss": 0.6013, + "step": 5250 + }, + { + "epoch": 2.049570647931304, + "grad_norm": 0.5516992166650423, + "learning_rate": 2.7585223765576338e-06, + "loss": 0.5791, + "step": 5251 + }, + { + "epoch": 2.049960967993755, + "grad_norm": 0.5205307842578744, + "learning_rate": 2.7564926643223433e-06, + "loss": 0.5969, + "step": 5252 + }, + { + "epoch": 2.050351288056206, + "grad_norm": 0.5495193165091997, + "learning_rate": 2.754463414885276e-06, + "loss": 0.5542, + "step": 5253 + }, + { + "epoch": 2.050741608118657, + "grad_norm": 0.49892803741531694, + "learning_rate": 2.752434628665028e-06, + "loss": 0.5651, + "step": 5254 + }, + { + "epoch": 2.0511319281811087, + "grad_norm": 0.4842296374404343, + "learning_rate": 2.7504063060801066e-06, + "loss": 0.5618, + "step": 5255 + }, + { + "epoch": 2.0515222482435598, + "grad_norm": 0.5027376437947775, + "learning_rate": 2.7483784475489194e-06, + "loss": 0.5472, + "step": 5256 + }, + { + "epoch": 2.051912568306011, + "grad_norm": 0.46268780688195527, + "learning_rate": 2.746351053489782e-06, + "loss": 0.5649, + "step": 5257 + }, + { + "epoch": 2.052302888368462, + "grad_norm": 0.5169681697056718, + "learning_rate": 2.7443241243209094e-06, + "loss": 0.5742, + "step": 5258 + }, + { + "epoch": 2.0526932084309135, + "grad_norm": 0.493973818054348, + "learning_rate": 2.742297660460428e-06, + "loss": 0.5653, + "step": 5259 + }, + { + "epoch": 2.0530835284933646, + "grad_norm": 0.45305894887523784, + "learning_rate": 2.7402716623263565e-06, + "loss": 0.6036, + "step": 5260 + }, + { + "epoch": 2.0534738485558157, + "grad_norm": 0.4679591172365459, + "learning_rate": 2.7382461303366305e-06, + "loss": 0.5338, + "step": 5261 + }, + { + "epoch": 2.053864168618267, + "grad_norm": 0.45291451285145135, + "learning_rate": 2.736221064909078e-06, + "loss": 0.5896, + "step": 5262 + }, + { + "epoch": 2.0542544886807184, + "grad_norm": 0.44012792647764526, + "learning_rate": 2.734196466461443e-06, + "loss": 0.5682, + "step": 5263 + }, + { + "epoch": 2.0546448087431695, + "grad_norm": 0.4692602521353422, + "learning_rate": 2.732172335411363e-06, + "loss": 0.576, + "step": 5264 + }, + { + "epoch": 2.0550351288056206, + "grad_norm": 0.4575232861201318, + "learning_rate": 2.7301486721763805e-06, + "loss": 0.5873, + "step": 5265 + }, + { + "epoch": 2.0554254488680717, + "grad_norm": 0.5092746867483823, + "learning_rate": 2.7281254771739473e-06, + "loss": 0.5757, + "step": 5266 + }, + { + "epoch": 2.055815768930523, + "grad_norm": 0.46104551746093403, + "learning_rate": 2.7261027508214115e-06, + "loss": 0.5821, + "step": 5267 + }, + { + "epoch": 2.0562060889929743, + "grad_norm": 0.44855401525682703, + "learning_rate": 2.724080493536033e-06, + "loss": 0.5785, + "step": 5268 + }, + { + "epoch": 2.0565964090554254, + "grad_norm": 0.465334521780885, + "learning_rate": 2.7220587057349635e-06, + "loss": 0.5633, + "step": 5269 + }, + { + "epoch": 2.0569867291178765, + "grad_norm": 0.5091513127102073, + "learning_rate": 2.720037387835269e-06, + "loss": 0.5463, + "step": 5270 + }, + { + "epoch": 2.057377049180328, + "grad_norm": 0.5056179245685276, + "learning_rate": 2.7180165402539095e-06, + "loss": 0.6058, + "step": 5271 + }, + { + "epoch": 2.057767369242779, + "grad_norm": 0.46325635834445383, + "learning_rate": 2.715996163407756e-06, + "loss": 0.5185, + "step": 5272 + }, + { + "epoch": 2.0581576893052302, + "grad_norm": 0.4457581123410588, + "learning_rate": 2.713976257713576e-06, + "loss": 0.5584, + "step": 5273 + }, + { + "epoch": 2.0585480093676813, + "grad_norm": 0.43444766903593096, + "learning_rate": 2.7119568235880456e-06, + "loss": 0.5409, + "step": 5274 + }, + { + "epoch": 2.058938329430133, + "grad_norm": 0.560712625362602, + "learning_rate": 2.709937861447736e-06, + "loss": 0.588, + "step": 5275 + }, + { + "epoch": 2.059328649492584, + "grad_norm": 0.4555096027081851, + "learning_rate": 2.70791937170913e-06, + "loss": 0.5789, + "step": 5276 + }, + { + "epoch": 2.059718969555035, + "grad_norm": 0.449025227860206, + "learning_rate": 2.705901354788606e-06, + "loss": 0.5259, + "step": 5277 + }, + { + "epoch": 2.060109289617486, + "grad_norm": 0.5012298155786727, + "learning_rate": 2.703883811102448e-06, + "loss": 0.5954, + "step": 5278 + }, + { + "epoch": 2.0604996096799377, + "grad_norm": 0.4406375408197075, + "learning_rate": 2.7018667410668397e-06, + "loss": 0.532, + "step": 5279 + }, + { + "epoch": 2.060889929742389, + "grad_norm": 0.4951594031855124, + "learning_rate": 2.699850145097872e-06, + "loss": 0.574, + "step": 5280 + }, + { + "epoch": 2.06128024980484, + "grad_norm": 0.4784760067389319, + "learning_rate": 2.6978340236115346e-06, + "loss": 0.5422, + "step": 5281 + }, + { + "epoch": 2.061670569867291, + "grad_norm": 0.427760114936516, + "learning_rate": 2.6958183770237167e-06, + "loss": 0.5974, + "step": 5282 + }, + { + "epoch": 2.0620608899297426, + "grad_norm": 0.4641235226409872, + "learning_rate": 2.6938032057502167e-06, + "loss": 0.5843, + "step": 5283 + }, + { + "epoch": 2.0624512099921937, + "grad_norm": 0.4788650963674393, + "learning_rate": 2.6917885102067275e-06, + "loss": 0.5732, + "step": 5284 + }, + { + "epoch": 2.0628415300546448, + "grad_norm": 0.4854037368363087, + "learning_rate": 2.6897742908088527e-06, + "loss": 0.5635, + "step": 5285 + }, + { + "epoch": 2.063231850117096, + "grad_norm": 0.48237878241824583, + "learning_rate": 2.6877605479720847e-06, + "loss": 0.5714, + "step": 5286 + }, + { + "epoch": 2.0636221701795474, + "grad_norm": 0.4907034694610485, + "learning_rate": 2.68574728211183e-06, + "loss": 0.5455, + "step": 5287 + }, + { + "epoch": 2.0640124902419985, + "grad_norm": 0.44111680377807566, + "learning_rate": 2.6837344936433896e-06, + "loss": 0.5574, + "step": 5288 + }, + { + "epoch": 2.0644028103044496, + "grad_norm": 0.4659511841468538, + "learning_rate": 2.6817221829819707e-06, + "loss": 0.5767, + "step": 5289 + }, + { + "epoch": 2.0647931303669007, + "grad_norm": 0.471646229578987, + "learning_rate": 2.679710350542677e-06, + "loss": 0.5671, + "step": 5290 + }, + { + "epoch": 2.0651834504293523, + "grad_norm": 0.45108886351926075, + "learning_rate": 2.6776989967405185e-06, + "loss": 0.5752, + "step": 5291 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 0.462837414992868, + "learning_rate": 2.6756881219904005e-06, + "loss": 0.5745, + "step": 5292 + }, + { + "epoch": 2.0659640905542545, + "grad_norm": 0.5141785653433583, + "learning_rate": 2.673677726707138e-06, + "loss": 0.5847, + "step": 5293 + }, + { + "epoch": 2.0663544106167056, + "grad_norm": 0.4614610106553128, + "learning_rate": 2.671667811305439e-06, + "loss": 0.5873, + "step": 5294 + }, + { + "epoch": 2.066744730679157, + "grad_norm": 0.457460683593674, + "learning_rate": 2.6696583761999157e-06, + "loss": 0.5755, + "step": 5295 + }, + { + "epoch": 2.067135050741608, + "grad_norm": 0.4321329076663598, + "learning_rate": 2.667649421805079e-06, + "loss": 0.5535, + "step": 5296 + }, + { + "epoch": 2.0675253708040593, + "grad_norm": 0.4494630081563005, + "learning_rate": 2.6656409485353475e-06, + "loss": 0.5539, + "step": 5297 + }, + { + "epoch": 2.0679156908665104, + "grad_norm": 0.49581268406058115, + "learning_rate": 2.6636329568050336e-06, + "loss": 0.5686, + "step": 5298 + }, + { + "epoch": 2.068306010928962, + "grad_norm": 0.4808345880991333, + "learning_rate": 2.66162544702835e-06, + "loss": 0.5774, + "step": 5299 + }, + { + "epoch": 2.068696330991413, + "grad_norm": 0.4570692463312925, + "learning_rate": 2.6596184196194165e-06, + "loss": 0.5758, + "step": 5300 + }, + { + "epoch": 2.069086651053864, + "grad_norm": 0.5605885658677858, + "learning_rate": 2.6576118749922464e-06, + "loss": 0.5556, + "step": 5301 + }, + { + "epoch": 2.0694769711163152, + "grad_norm": 0.570154765358765, + "learning_rate": 2.6556058135607594e-06, + "loss": 0.6039, + "step": 5302 + }, + { + "epoch": 2.0698672911787668, + "grad_norm": 0.5031705164261093, + "learning_rate": 2.6536002357387715e-06, + "loss": 0.5521, + "step": 5303 + }, + { + "epoch": 2.070257611241218, + "grad_norm": 0.49031304446708485, + "learning_rate": 2.651595141939999e-06, + "loss": 0.5935, + "step": 5304 + }, + { + "epoch": 2.070647931303669, + "grad_norm": 0.5719794191827111, + "learning_rate": 2.649590532578058e-06, + "loss": 0.5352, + "step": 5305 + }, + { + "epoch": 2.07103825136612, + "grad_norm": 0.5054271050329394, + "learning_rate": 2.6475864080664706e-06, + "loss": 0.6049, + "step": 5306 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.557223083058217, + "learning_rate": 2.645582768818649e-06, + "loss": 0.5978, + "step": 5307 + }, + { + "epoch": 2.0718188914910227, + "grad_norm": 0.5164740732399471, + "learning_rate": 2.643579615247916e-06, + "loss": 0.5565, + "step": 5308 + }, + { + "epoch": 2.072209211553474, + "grad_norm": 0.520549049691333, + "learning_rate": 2.641576947767484e-06, + "loss": 0.6057, + "step": 5309 + }, + { + "epoch": 2.072599531615925, + "grad_norm": 0.47133572116630784, + "learning_rate": 2.6395747667904743e-06, + "loss": 0.5431, + "step": 5310 + }, + { + "epoch": 2.0729898516783765, + "grad_norm": 0.49081650735511095, + "learning_rate": 2.637573072729901e-06, + "loss": 0.5673, + "step": 5311 + }, + { + "epoch": 2.0733801717408276, + "grad_norm": 0.5022552297731313, + "learning_rate": 2.6355718659986808e-06, + "loss": 0.5415, + "step": 5312 + }, + { + "epoch": 2.0737704918032787, + "grad_norm": 0.49215234919265516, + "learning_rate": 2.633571147009629e-06, + "loss": 0.5427, + "step": 5313 + }, + { + "epoch": 2.0741608118657298, + "grad_norm": 0.4598163367665486, + "learning_rate": 2.6315709161754594e-06, + "loss": 0.5636, + "step": 5314 + }, + { + "epoch": 2.0745511319281813, + "grad_norm": 0.4984425397589958, + "learning_rate": 2.6295711739087894e-06, + "loss": 0.5656, + "step": 5315 + }, + { + "epoch": 2.0749414519906324, + "grad_norm": 0.47287909589059174, + "learning_rate": 2.6275719206221286e-06, + "loss": 0.5588, + "step": 5316 + }, + { + "epoch": 2.0753317720530835, + "grad_norm": 0.5182320654161361, + "learning_rate": 2.6255731567278943e-06, + "loss": 0.6008, + "step": 5317 + }, + { + "epoch": 2.0757220921155346, + "grad_norm": 0.4690214166589259, + "learning_rate": 2.6235748826383934e-06, + "loss": 0.5565, + "step": 5318 + }, + { + "epoch": 2.076112412177986, + "grad_norm": 0.478683107535826, + "learning_rate": 2.6215770987658416e-06, + "loss": 0.5418, + "step": 5319 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.4573723707756871, + "learning_rate": 2.619579805522344e-06, + "loss": 0.5719, + "step": 5320 + }, + { + "epoch": 2.0768930523028883, + "grad_norm": 0.5107173067369979, + "learning_rate": 2.6175830033199146e-06, + "loss": 0.6035, + "step": 5321 + }, + { + "epoch": 2.0772833723653394, + "grad_norm": 0.4863152092656877, + "learning_rate": 2.6155866925704526e-06, + "loss": 0.5677, + "step": 5322 + }, + { + "epoch": 2.077673692427791, + "grad_norm": 0.5102047915299404, + "learning_rate": 2.6135908736857694e-06, + "loss": 0.5288, + "step": 5323 + }, + { + "epoch": 2.078064012490242, + "grad_norm": 0.4560549965621697, + "learning_rate": 2.6115955470775668e-06, + "loss": 0.5504, + "step": 5324 + }, + { + "epoch": 2.078454332552693, + "grad_norm": 0.4972693930041616, + "learning_rate": 2.6096007131574498e-06, + "loss": 0.5541, + "step": 5325 + }, + { + "epoch": 2.0788446526151443, + "grad_norm": 0.5072460685827503, + "learning_rate": 2.607606372336917e-06, + "loss": 0.5822, + "step": 5326 + }, + { + "epoch": 2.079234972677596, + "grad_norm": 0.5097307318173261, + "learning_rate": 2.6056125250273677e-06, + "loss": 0.5428, + "step": 5327 + }, + { + "epoch": 2.079625292740047, + "grad_norm": 0.47891993007088524, + "learning_rate": 2.603619171640102e-06, + "loss": 0.5613, + "step": 5328 + }, + { + "epoch": 2.080015612802498, + "grad_norm": 0.5060795412331157, + "learning_rate": 2.601626312586314e-06, + "loss": 0.5564, + "step": 5329 + }, + { + "epoch": 2.080405932864949, + "grad_norm": 0.4998707910820925, + "learning_rate": 2.599633948277097e-06, + "loss": 0.5578, + "step": 5330 + }, + { + "epoch": 2.0807962529274007, + "grad_norm": 0.5103719883360033, + "learning_rate": 2.5976420791234413e-06, + "loss": 0.5391, + "step": 5331 + }, + { + "epoch": 2.0811865729898518, + "grad_norm": 0.48135259157557414, + "learning_rate": 2.5956507055362394e-06, + "loss": 0.6272, + "step": 5332 + }, + { + "epoch": 2.081576893052303, + "grad_norm": 0.4534465238769467, + "learning_rate": 2.5936598279262753e-06, + "loss": 0.5643, + "step": 5333 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 0.43557688060476724, + "learning_rate": 2.5916694467042376e-06, + "loss": 0.5514, + "step": 5334 + }, + { + "epoch": 2.0823575331772055, + "grad_norm": 0.4802021027321246, + "learning_rate": 2.5896795622807047e-06, + "loss": 0.561, + "step": 5335 + }, + { + "epoch": 2.0827478532396566, + "grad_norm": 0.5046245100488201, + "learning_rate": 2.5876901750661607e-06, + "loss": 0.6055, + "step": 5336 + }, + { + "epoch": 2.0831381733021077, + "grad_norm": 0.4507174933050022, + "learning_rate": 2.5857012854709795e-06, + "loss": 0.5497, + "step": 5337 + }, + { + "epoch": 2.083528493364559, + "grad_norm": 0.42917498122083436, + "learning_rate": 2.5837128939054414e-06, + "loss": 0.5655, + "step": 5338 + }, + { + "epoch": 2.0839188134270104, + "grad_norm": 0.4652221868256101, + "learning_rate": 2.58172500077971e-06, + "loss": 0.6015, + "step": 5339 + }, + { + "epoch": 2.0843091334894615, + "grad_norm": 0.4539638621673985, + "learning_rate": 2.5797376065038617e-06, + "loss": 0.5605, + "step": 5340 + }, + { + "epoch": 2.0846994535519126, + "grad_norm": 0.48685008298799, + "learning_rate": 2.5777507114878585e-06, + "loss": 0.5611, + "step": 5341 + }, + { + "epoch": 2.0850897736143637, + "grad_norm": 0.49139453267565, + "learning_rate": 2.5757643161415673e-06, + "loss": 0.5407, + "step": 5342 + }, + { + "epoch": 2.085480093676815, + "grad_norm": 0.45517637578682585, + "learning_rate": 2.5737784208747473e-06, + "loss": 0.5498, + "step": 5343 + }, + { + "epoch": 2.0858704137392663, + "grad_norm": 0.5188428800736968, + "learning_rate": 2.5717930260970526e-06, + "loss": 0.5556, + "step": 5344 + }, + { + "epoch": 2.0862607338017174, + "grad_norm": 0.4428247822196962, + "learning_rate": 2.5698081322180413e-06, + "loss": 0.6057, + "step": 5345 + }, + { + "epoch": 2.0866510538641685, + "grad_norm": 0.4834760081723496, + "learning_rate": 2.5678237396471607e-06, + "loss": 0.5666, + "step": 5346 + }, + { + "epoch": 2.08704137392662, + "grad_norm": 0.4788430131772422, + "learning_rate": 2.5658398487937637e-06, + "loss": 0.55, + "step": 5347 + }, + { + "epoch": 2.087431693989071, + "grad_norm": 0.4490285133349317, + "learning_rate": 2.5638564600670856e-06, + "loss": 0.5857, + "step": 5348 + }, + { + "epoch": 2.0878220140515222, + "grad_norm": 0.4913461517595399, + "learning_rate": 2.5618735738762723e-06, + "loss": 0.582, + "step": 5349 + }, + { + "epoch": 2.0882123341139733, + "grad_norm": 0.46245830033229546, + "learning_rate": 2.559891190630357e-06, + "loss": 0.575, + "step": 5350 + }, + { + "epoch": 2.088602654176425, + "grad_norm": 0.47473098958078647, + "learning_rate": 2.5579093107382754e-06, + "loss": 0.5848, + "step": 5351 + }, + { + "epoch": 2.088992974238876, + "grad_norm": 0.4594444784031994, + "learning_rate": 2.5559279346088526e-06, + "loss": 0.5561, + "step": 5352 + }, + { + "epoch": 2.089383294301327, + "grad_norm": 0.4565616397364262, + "learning_rate": 2.553947062650819e-06, + "loss": 0.5524, + "step": 5353 + }, + { + "epoch": 2.089773614363778, + "grad_norm": 0.4963586924716345, + "learning_rate": 2.5519666952727894e-06, + "loss": 0.5691, + "step": 5354 + }, + { + "epoch": 2.0901639344262297, + "grad_norm": 0.4919502567407409, + "learning_rate": 2.5499868328832854e-06, + "loss": 0.5555, + "step": 5355 + }, + { + "epoch": 2.090554254488681, + "grad_norm": 0.44746502824494283, + "learning_rate": 2.548007475890718e-06, + "loss": 0.5579, + "step": 5356 + }, + { + "epoch": 2.090944574551132, + "grad_norm": 0.4668758691833219, + "learning_rate": 2.546028624703395e-06, + "loss": 0.5851, + "step": 5357 + }, + { + "epoch": 2.091334894613583, + "grad_norm": 0.4549868704233953, + "learning_rate": 2.5440502797295195e-06, + "loss": 0.6133, + "step": 5358 + }, + { + "epoch": 2.0917252146760346, + "grad_norm": 0.4830187758975573, + "learning_rate": 2.542072441377194e-06, + "loss": 0.5815, + "step": 5359 + }, + { + "epoch": 2.0921155347384857, + "grad_norm": 0.4437366414922574, + "learning_rate": 2.5400951100544124e-06, + "loss": 0.5679, + "step": 5360 + }, + { + "epoch": 2.0925058548009368, + "grad_norm": 0.47010152467577554, + "learning_rate": 2.538118286169063e-06, + "loss": 0.585, + "step": 5361 + }, + { + "epoch": 2.092896174863388, + "grad_norm": 0.48388838853187144, + "learning_rate": 2.536141970128935e-06, + "loss": 0.5806, + "step": 5362 + }, + { + "epoch": 2.0932864949258394, + "grad_norm": 0.48515587327597276, + "learning_rate": 2.5341661623417067e-06, + "loss": 0.5691, + "step": 5363 + }, + { + "epoch": 2.0936768149882905, + "grad_norm": 0.4753496975992107, + "learning_rate": 2.532190863214957e-06, + "loss": 0.5542, + "step": 5364 + }, + { + "epoch": 2.0940671350507416, + "grad_norm": 0.4557458609844277, + "learning_rate": 2.530216073156157e-06, + "loss": 0.5931, + "step": 5365 + }, + { + "epoch": 2.0944574551131927, + "grad_norm": 0.468666365022335, + "learning_rate": 2.528241792572672e-06, + "loss": 0.5774, + "step": 5366 + }, + { + "epoch": 2.0948477751756442, + "grad_norm": 0.4526761273236207, + "learning_rate": 2.5262680218717606e-06, + "loss": 0.5857, + "step": 5367 + }, + { + "epoch": 2.0952380952380953, + "grad_norm": 0.4860040802547078, + "learning_rate": 2.524294761460584e-06, + "loss": 0.6053, + "step": 5368 + }, + { + "epoch": 2.0956284153005464, + "grad_norm": 0.420831632410599, + "learning_rate": 2.5223220117461876e-06, + "loss": 0.5742, + "step": 5369 + }, + { + "epoch": 2.0960187353629975, + "grad_norm": 0.47083472463746906, + "learning_rate": 2.5203497731355226e-06, + "loss": 0.5354, + "step": 5370 + }, + { + "epoch": 2.0964090554254486, + "grad_norm": 0.5657211683992072, + "learning_rate": 2.5183780460354234e-06, + "loss": 0.5633, + "step": 5371 + }, + { + "epoch": 2.0967993754879, + "grad_norm": 0.5235865753126993, + "learning_rate": 2.5164068308526295e-06, + "loss": 0.5806, + "step": 5372 + }, + { + "epoch": 2.0971896955503513, + "grad_norm": 0.5992224685626367, + "learning_rate": 2.5144361279937677e-06, + "loss": 0.587, + "step": 5373 + }, + { + "epoch": 2.0975800156128024, + "grad_norm": 0.4700325046899599, + "learning_rate": 2.5124659378653603e-06, + "loss": 0.5933, + "step": 5374 + }, + { + "epoch": 2.097970335675254, + "grad_norm": 0.5280793980106797, + "learning_rate": 2.5104962608738237e-06, + "loss": 0.5595, + "step": 5375 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 0.5586883901200911, + "learning_rate": 2.5085270974254737e-06, + "loss": 0.5686, + "step": 5376 + }, + { + "epoch": 2.098750975800156, + "grad_norm": 0.458738059119159, + "learning_rate": 2.5065584479265126e-06, + "loss": 0.6002, + "step": 5377 + }, + { + "epoch": 2.0991412958626072, + "grad_norm": 0.4794046855879846, + "learning_rate": 2.5045903127830397e-06, + "loss": 0.5596, + "step": 5378 + }, + { + "epoch": 2.0995316159250583, + "grad_norm": 0.4553027109363169, + "learning_rate": 2.5026226924010503e-06, + "loss": 0.5583, + "step": 5379 + }, + { + "epoch": 2.09992193598751, + "grad_norm": 0.5459141813321594, + "learning_rate": 2.5006555871864302e-06, + "loss": 0.5809, + "step": 5380 + }, + { + "epoch": 2.100312256049961, + "grad_norm": 0.5482086516310136, + "learning_rate": 2.498688997544963e-06, + "loss": 0.5655, + "step": 5381 + }, + { + "epoch": 2.100702576112412, + "grad_norm": 0.4821663745945907, + "learning_rate": 2.4967229238823216e-06, + "loss": 0.5997, + "step": 5382 + }, + { + "epoch": 2.1010928961748636, + "grad_norm": 0.4560102643996648, + "learning_rate": 2.4947573666040746e-06, + "loss": 0.5456, + "step": 5383 + }, + { + "epoch": 2.1014832162373147, + "grad_norm": 0.5294055171945276, + "learning_rate": 2.492792326115682e-06, + "loss": 0.5569, + "step": 5384 + }, + { + "epoch": 2.101873536299766, + "grad_norm": 0.49913517677119834, + "learning_rate": 2.490827802822502e-06, + "loss": 0.57, + "step": 5385 + }, + { + "epoch": 2.102263856362217, + "grad_norm": 0.4524450294113604, + "learning_rate": 2.4888637971297793e-06, + "loss": 0.595, + "step": 5386 + }, + { + "epoch": 2.102654176424668, + "grad_norm": 0.48404679931816746, + "learning_rate": 2.4869003094426603e-06, + "loss": 0.5512, + "step": 5387 + }, + { + "epoch": 2.1030444964871196, + "grad_norm": 0.5207187137680298, + "learning_rate": 2.4849373401661752e-06, + "loss": 0.5621, + "step": 5388 + }, + { + "epoch": 2.1034348165495707, + "grad_norm": 0.5501371043491545, + "learning_rate": 2.482974889705256e-06, + "loss": 0.5502, + "step": 5389 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 0.4790561405930936, + "learning_rate": 2.481012958464722e-06, + "loss": 0.5514, + "step": 5390 + }, + { + "epoch": 2.1042154566744733, + "grad_norm": 0.5144538828954042, + "learning_rate": 2.4790515468492864e-06, + "loss": 0.6069, + "step": 5391 + }, + { + "epoch": 2.1046057767369244, + "grad_norm": 0.45353074600466387, + "learning_rate": 2.477090655263554e-06, + "loss": 0.5858, + "step": 5392 + }, + { + "epoch": 2.1049960967993755, + "grad_norm": 0.49411865727677756, + "learning_rate": 2.4751302841120277e-06, + "loss": 0.583, + "step": 5393 + }, + { + "epoch": 2.1053864168618266, + "grad_norm": 0.47167587562274915, + "learning_rate": 2.473170433799098e-06, + "loss": 0.5763, + "step": 5394 + }, + { + "epoch": 2.1057767369242777, + "grad_norm": 0.46646157252879505, + "learning_rate": 2.471211104729048e-06, + "loss": 0.583, + "step": 5395 + }, + { + "epoch": 2.1061670569867292, + "grad_norm": 0.4697372132524933, + "learning_rate": 2.4692522973060566e-06, + "loss": 0.5563, + "step": 5396 + }, + { + "epoch": 2.1065573770491803, + "grad_norm": 0.49139296897131696, + "learning_rate": 2.4672940119341914e-06, + "loss": 0.5613, + "step": 5397 + }, + { + "epoch": 2.1069476971116314, + "grad_norm": 0.49332318778198586, + "learning_rate": 2.4653362490174167e-06, + "loss": 0.592, + "step": 5398 + }, + { + "epoch": 2.1073380171740825, + "grad_norm": 0.42106463756456874, + "learning_rate": 2.4633790089595825e-06, + "loss": 0.5526, + "step": 5399 + }, + { + "epoch": 2.107728337236534, + "grad_norm": 0.4458574025926755, + "learning_rate": 2.4614222921644415e-06, + "loss": 0.5807, + "step": 5400 + }, + { + "epoch": 2.108118657298985, + "grad_norm": 0.45183083823275455, + "learning_rate": 2.459466099035624e-06, + "loss": 0.5963, + "step": 5401 + }, + { + "epoch": 2.1085089773614363, + "grad_norm": 0.47770927563196813, + "learning_rate": 2.457510429976665e-06, + "loss": 0.5629, + "step": 5402 + }, + { + "epoch": 2.1088992974238874, + "grad_norm": 0.4874325243851591, + "learning_rate": 2.455555285390983e-06, + "loss": 0.6043, + "step": 5403 + }, + { + "epoch": 2.109289617486339, + "grad_norm": 0.4961655426245628, + "learning_rate": 2.4536006656818957e-06, + "loss": 0.5437, + "step": 5404 + }, + { + "epoch": 2.10967993754879, + "grad_norm": 0.4789615701340569, + "learning_rate": 2.451646571252605e-06, + "loss": 0.5719, + "step": 5405 + }, + { + "epoch": 2.110070257611241, + "grad_norm": 0.44799308850593056, + "learning_rate": 2.4496930025062117e-06, + "loss": 0.5569, + "step": 5406 + }, + { + "epoch": 2.110460577673692, + "grad_norm": 0.43371146911786534, + "learning_rate": 2.4477399598457023e-06, + "loss": 0.5709, + "step": 5407 + }, + { + "epoch": 2.1108508977361438, + "grad_norm": 0.4891352082868995, + "learning_rate": 2.445787443673956e-06, + "loss": 0.5936, + "step": 5408 + }, + { + "epoch": 2.111241217798595, + "grad_norm": 0.4918152401072901, + "learning_rate": 2.443835454393748e-06, + "loss": 0.548, + "step": 5409 + }, + { + "epoch": 2.111631537861046, + "grad_norm": 0.46955868751971525, + "learning_rate": 2.4418839924077373e-06, + "loss": 0.5755, + "step": 5410 + }, + { + "epoch": 2.112021857923497, + "grad_norm": 0.4619692843892155, + "learning_rate": 2.4399330581184804e-06, + "loss": 0.5552, + "step": 5411 + }, + { + "epoch": 2.1124121779859486, + "grad_norm": 0.557978219805194, + "learning_rate": 2.43798265192842e-06, + "loss": 0.6051, + "step": 5412 + }, + { + "epoch": 2.1128024980483997, + "grad_norm": 0.5272427904323662, + "learning_rate": 2.436032774239896e-06, + "loss": 0.5657, + "step": 5413 + }, + { + "epoch": 2.113192818110851, + "grad_norm": 0.4788740463507892, + "learning_rate": 2.434083425455132e-06, + "loss": 0.5477, + "step": 5414 + }, + { + "epoch": 2.113583138173302, + "grad_norm": 0.46131010363573555, + "learning_rate": 2.43213460597625e-06, + "loss": 0.5781, + "step": 5415 + }, + { + "epoch": 2.1139734582357534, + "grad_norm": 0.4546751493550195, + "learning_rate": 2.430186316205255e-06, + "loss": 0.59, + "step": 5416 + }, + { + "epoch": 2.1143637782982045, + "grad_norm": 0.45578303854472846, + "learning_rate": 2.4282385565440514e-06, + "loss": 0.5846, + "step": 5417 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 0.47299233855986506, + "learning_rate": 2.4262913273944268e-06, + "loss": 0.5237, + "step": 5418 + }, + { + "epoch": 2.1151444184231067, + "grad_norm": 0.48313972063280175, + "learning_rate": 2.4243446291580625e-06, + "loss": 0.56, + "step": 5419 + }, + { + "epoch": 2.1155347384855583, + "grad_norm": 0.5006818451138663, + "learning_rate": 2.422398462236528e-06, + "loss": 0.5863, + "step": 5420 + }, + { + "epoch": 2.1159250585480094, + "grad_norm": 0.4781915341481003, + "learning_rate": 2.420452827031289e-06, + "loss": 0.557, + "step": 5421 + }, + { + "epoch": 2.1163153786104605, + "grad_norm": 0.4681146924987043, + "learning_rate": 2.4185077239436938e-06, + "loss": 0.5894, + "step": 5422 + }, + { + "epoch": 2.1167056986729116, + "grad_norm": 0.5061829296001719, + "learning_rate": 2.416563153374988e-06, + "loss": 0.5671, + "step": 5423 + }, + { + "epoch": 2.117096018735363, + "grad_norm": 0.43475526111465224, + "learning_rate": 2.4146191157263038e-06, + "loss": 0.6201, + "step": 5424 + }, + { + "epoch": 2.1174863387978142, + "grad_norm": 0.5324524099376255, + "learning_rate": 2.4126756113986607e-06, + "loss": 0.6042, + "step": 5425 + }, + { + "epoch": 2.1178766588602653, + "grad_norm": 0.5137006133409238, + "learning_rate": 2.4107326407929743e-06, + "loss": 0.5913, + "step": 5426 + }, + { + "epoch": 2.1182669789227164, + "grad_norm": 0.4857794098991958, + "learning_rate": 2.4087902043100477e-06, + "loss": 0.5885, + "step": 5427 + }, + { + "epoch": 2.118657298985168, + "grad_norm": 0.4457585887260656, + "learning_rate": 2.4068483023505705e-06, + "loss": 0.6042, + "step": 5428 + }, + { + "epoch": 2.119047619047619, + "grad_norm": 0.486913648242336, + "learning_rate": 2.404906935315125e-06, + "loss": 0.5674, + "step": 5429 + }, + { + "epoch": 2.11943793911007, + "grad_norm": 0.5525422804303567, + "learning_rate": 2.4029661036041853e-06, + "loss": 0.6364, + "step": 5430 + }, + { + "epoch": 2.1198282591725213, + "grad_norm": 0.519240569876777, + "learning_rate": 2.4010258076181097e-06, + "loss": 0.5854, + "step": 5431 + }, + { + "epoch": 2.120218579234973, + "grad_norm": 0.41863750231877, + "learning_rate": 2.399086047757152e-06, + "loss": 0.6038, + "step": 5432 + }, + { + "epoch": 2.120608899297424, + "grad_norm": 0.46550158028981065, + "learning_rate": 2.3971468244214496e-06, + "loss": 0.5722, + "step": 5433 + }, + { + "epoch": 2.120999219359875, + "grad_norm": 0.5217477472354711, + "learning_rate": 2.3952081380110348e-06, + "loss": 0.5679, + "step": 5434 + }, + { + "epoch": 2.121389539422326, + "grad_norm": 0.48803111704011753, + "learning_rate": 2.393269988925825e-06, + "loss": 0.6095, + "step": 5435 + }, + { + "epoch": 2.1217798594847777, + "grad_norm": 0.5052715247920225, + "learning_rate": 2.3913323775656278e-06, + "loss": 0.5596, + "step": 5436 + }, + { + "epoch": 2.1221701795472288, + "grad_norm": 0.4923817527466385, + "learning_rate": 2.389395304330139e-06, + "loss": 0.5606, + "step": 5437 + }, + { + "epoch": 2.12256049960968, + "grad_norm": 0.516728333441592, + "learning_rate": 2.387458769618947e-06, + "loss": 0.5998, + "step": 5438 + }, + { + "epoch": 2.122950819672131, + "grad_norm": 0.4316822049939585, + "learning_rate": 2.385522773831524e-06, + "loss": 0.585, + "step": 5439 + }, + { + "epoch": 2.1233411397345825, + "grad_norm": 0.4733167091051911, + "learning_rate": 2.383587317367237e-06, + "loss": 0.5355, + "step": 5440 + }, + { + "epoch": 2.1237314597970336, + "grad_norm": 0.463947320091251, + "learning_rate": 2.381652400625337e-06, + "loss": 0.5854, + "step": 5441 + }, + { + "epoch": 2.1241217798594847, + "grad_norm": 0.5546159299648863, + "learning_rate": 2.3797180240049628e-06, + "loss": 0.5812, + "step": 5442 + }, + { + "epoch": 2.124512099921936, + "grad_norm": 0.4608548643064967, + "learning_rate": 2.377784187905148e-06, + "loss": 0.5883, + "step": 5443 + }, + { + "epoch": 2.1249024199843873, + "grad_norm": 0.46506658626127934, + "learning_rate": 2.375850892724809e-06, + "loss": 0.5698, + "step": 5444 + }, + { + "epoch": 2.1252927400468384, + "grad_norm": 0.4982151960933462, + "learning_rate": 2.3739181388627524e-06, + "loss": 0.6323, + "step": 5445 + }, + { + "epoch": 2.1256830601092895, + "grad_norm": 0.4695103859384918, + "learning_rate": 2.3719859267176714e-06, + "loss": 0.5902, + "step": 5446 + }, + { + "epoch": 2.1260733801717406, + "grad_norm": 0.4979055681813701, + "learning_rate": 2.3700542566881525e-06, + "loss": 0.5698, + "step": 5447 + }, + { + "epoch": 2.126463700234192, + "grad_norm": 0.49843938360187307, + "learning_rate": 2.3681231291726636e-06, + "loss": 0.5783, + "step": 5448 + }, + { + "epoch": 2.1268540202966433, + "grad_norm": 0.4956799633704548, + "learning_rate": 2.366192544569568e-06, + "loss": 0.5319, + "step": 5449 + }, + { + "epoch": 2.1272443403590944, + "grad_norm": 0.49865259426227637, + "learning_rate": 2.3642625032771094e-06, + "loss": 0.578, + "step": 5450 + }, + { + "epoch": 2.1276346604215455, + "grad_norm": 0.47100591326589364, + "learning_rate": 2.362333005693427e-06, + "loss": 0.5459, + "step": 5451 + }, + { + "epoch": 2.128024980483997, + "grad_norm": 0.47450844810103854, + "learning_rate": 2.36040405221654e-06, + "loss": 0.5655, + "step": 5452 + }, + { + "epoch": 2.128415300546448, + "grad_norm": 0.45253792796986236, + "learning_rate": 2.358475643244365e-06, + "loss": 0.561, + "step": 5453 + }, + { + "epoch": 2.128805620608899, + "grad_norm": 0.4770008427475352, + "learning_rate": 2.3565477791746938e-06, + "loss": 0.5563, + "step": 5454 + }, + { + "epoch": 2.1291959406713503, + "grad_norm": 0.4634249406604743, + "learning_rate": 2.3546204604052176e-06, + "loss": 0.5509, + "step": 5455 + }, + { + "epoch": 2.129586260733802, + "grad_norm": 0.5052435854505176, + "learning_rate": 2.3526936873335086e-06, + "loss": 0.5531, + "step": 5456 + }, + { + "epoch": 2.129976580796253, + "grad_norm": 0.5188891230276136, + "learning_rate": 2.350767460357026e-06, + "loss": 0.5907, + "step": 5457 + }, + { + "epoch": 2.130366900858704, + "grad_norm": 0.42613698152814, + "learning_rate": 2.3488417798731227e-06, + "loss": 0.5747, + "step": 5458 + }, + { + "epoch": 2.130757220921155, + "grad_norm": 0.5123517910263997, + "learning_rate": 2.34691664627903e-06, + "loss": 0.5538, + "step": 5459 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.5389634316095219, + "learning_rate": 2.344992059971875e-06, + "loss": 0.5577, + "step": 5460 + }, + { + "epoch": 2.131537861046058, + "grad_norm": 0.49935859032497226, + "learning_rate": 2.3430680213486637e-06, + "loss": 0.556, + "step": 5461 + }, + { + "epoch": 2.131928181108509, + "grad_norm": 0.43669135753239263, + "learning_rate": 2.3411445308063e-06, + "loss": 0.5852, + "step": 5462 + }, + { + "epoch": 2.13231850117096, + "grad_norm": 0.48045188878018585, + "learning_rate": 2.3392215887415587e-06, + "loss": 0.5233, + "step": 5463 + }, + { + "epoch": 2.1327088212334115, + "grad_norm": 0.5223089780292333, + "learning_rate": 2.3372991955511184e-06, + "loss": 0.6134, + "step": 5464 + }, + { + "epoch": 2.1330991412958626, + "grad_norm": 0.5419759360942199, + "learning_rate": 2.335377351631531e-06, + "loss": 0.5681, + "step": 5465 + }, + { + "epoch": 2.1334894613583137, + "grad_norm": 0.504904399296293, + "learning_rate": 2.3334560573792462e-06, + "loss": 0.5523, + "step": 5466 + }, + { + "epoch": 2.133879781420765, + "grad_norm": 0.47294772452855155, + "learning_rate": 2.3315353131905906e-06, + "loss": 0.5541, + "step": 5467 + }, + { + "epoch": 2.1342701014832164, + "grad_norm": 0.44114541525117495, + "learning_rate": 2.3296151194617863e-06, + "loss": 0.5563, + "step": 5468 + }, + { + "epoch": 2.1346604215456675, + "grad_norm": 0.44842647652355644, + "learning_rate": 2.327695476588934e-06, + "loss": 0.5676, + "step": 5469 + }, + { + "epoch": 2.1350507416081186, + "grad_norm": 0.5314809278966147, + "learning_rate": 2.325776384968025e-06, + "loss": 0.5469, + "step": 5470 + }, + { + "epoch": 2.1354410616705697, + "grad_norm": 0.5232399007176368, + "learning_rate": 2.323857844994934e-06, + "loss": 0.5484, + "step": 5471 + }, + { + "epoch": 2.1358313817330212, + "grad_norm": 0.5433002148743864, + "learning_rate": 2.3219398570654276e-06, + "loss": 0.5479, + "step": 5472 + }, + { + "epoch": 2.1362217017954723, + "grad_norm": 0.4444992133962343, + "learning_rate": 2.320022421575153e-06, + "loss": 0.5845, + "step": 5473 + }, + { + "epoch": 2.1366120218579234, + "grad_norm": 0.4720280133981236, + "learning_rate": 2.3181055389196434e-06, + "loss": 0.5563, + "step": 5474 + }, + { + "epoch": 2.1370023419203745, + "grad_norm": 0.49344964033282673, + "learning_rate": 2.316189209494323e-06, + "loss": 0.5339, + "step": 5475 + }, + { + "epoch": 2.137392661982826, + "grad_norm": 0.46014516872386585, + "learning_rate": 2.314273433694495e-06, + "loss": 0.5382, + "step": 5476 + }, + { + "epoch": 2.137782982045277, + "grad_norm": 0.5263189110557267, + "learning_rate": 2.3123582119153565e-06, + "loss": 0.5913, + "step": 5477 + }, + { + "epoch": 2.1381733021077283, + "grad_norm": 0.5112839382541358, + "learning_rate": 2.3104435445519816e-06, + "loss": 0.5548, + "step": 5478 + }, + { + "epoch": 2.1385636221701794, + "grad_norm": 0.51482367121576, + "learning_rate": 2.30852943199934e-06, + "loss": 0.5512, + "step": 5479 + }, + { + "epoch": 2.138953942232631, + "grad_norm": 0.5187100703287076, + "learning_rate": 2.3066158746522745e-06, + "loss": 0.5599, + "step": 5480 + }, + { + "epoch": 2.139344262295082, + "grad_norm": 0.4474201527786867, + "learning_rate": 2.3047028729055244e-06, + "loss": 0.5131, + "step": 5481 + }, + { + "epoch": 2.139734582357533, + "grad_norm": 0.49422572591085406, + "learning_rate": 2.302790427153707e-06, + "loss": 0.5797, + "step": 5482 + }, + { + "epoch": 2.140124902419984, + "grad_norm": 0.5395110245427822, + "learning_rate": 2.300878537791332e-06, + "loss": 0.559, + "step": 5483 + }, + { + "epoch": 2.1405152224824358, + "grad_norm": 0.49408069089758666, + "learning_rate": 2.298967205212786e-06, + "loss": 0.5792, + "step": 5484 + }, + { + "epoch": 2.140905542544887, + "grad_norm": 0.44518230057197006, + "learning_rate": 2.2970564298123497e-06, + "loss": 0.5704, + "step": 5485 + }, + { + "epoch": 2.141295862607338, + "grad_norm": 0.5042714420010557, + "learning_rate": 2.2951462119841814e-06, + "loss": 0.5206, + "step": 5486 + }, + { + "epoch": 2.141686182669789, + "grad_norm": 0.47771621770626976, + "learning_rate": 2.2932365521223268e-06, + "loss": 0.5682, + "step": 5487 + }, + { + "epoch": 2.1420765027322406, + "grad_norm": 0.4243623504822693, + "learning_rate": 2.291327450620719e-06, + "loss": 0.5496, + "step": 5488 + }, + { + "epoch": 2.1424668227946917, + "grad_norm": 0.46976563981223407, + "learning_rate": 2.2894189078731734e-06, + "loss": 0.5718, + "step": 5489 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.4517190097037544, + "learning_rate": 2.2875109242733896e-06, + "loss": 0.5714, + "step": 5490 + }, + { + "epoch": 2.143247462919594, + "grad_norm": 0.4764301359429094, + "learning_rate": 2.2856035002149513e-06, + "loss": 0.5265, + "step": 5491 + }, + { + "epoch": 2.1436377829820454, + "grad_norm": 0.5176525381106164, + "learning_rate": 2.2836966360913325e-06, + "loss": 0.5404, + "step": 5492 + }, + { + "epoch": 2.1440281030444965, + "grad_norm": 0.4519214084753595, + "learning_rate": 2.281790332295883e-06, + "loss": 0.585, + "step": 5493 + }, + { + "epoch": 2.1444184231069476, + "grad_norm": 0.4367351180034545, + "learning_rate": 2.2798845892218463e-06, + "loss": 0.5729, + "step": 5494 + }, + { + "epoch": 2.1448087431693987, + "grad_norm": 0.4599077001071128, + "learning_rate": 2.2779794072623405e-06, + "loss": 0.5802, + "step": 5495 + }, + { + "epoch": 2.1451990632318503, + "grad_norm": 0.4472362297069662, + "learning_rate": 2.276074786810378e-06, + "loss": 0.5387, + "step": 5496 + }, + { + "epoch": 2.1455893832943014, + "grad_norm": 0.5137036565920793, + "learning_rate": 2.2741707282588472e-06, + "loss": 0.5771, + "step": 5497 + }, + { + "epoch": 2.1459797033567525, + "grad_norm": 0.430919509638387, + "learning_rate": 2.2722672320005247e-06, + "loss": 0.5774, + "step": 5498 + }, + { + "epoch": 2.1463700234192036, + "grad_norm": 0.44970887242320406, + "learning_rate": 2.2703642984280674e-06, + "loss": 0.5605, + "step": 5499 + }, + { + "epoch": 2.146760343481655, + "grad_norm": 0.47064789242044325, + "learning_rate": 2.2684619279340233e-06, + "loss": 0.5377, + "step": 5500 + }, + { + "epoch": 2.147150663544106, + "grad_norm": 0.49934249114901624, + "learning_rate": 2.266560120910815e-06, + "loss": 0.5509, + "step": 5501 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 0.5269759815765354, + "learning_rate": 2.2646588777507582e-06, + "loss": 0.5496, + "step": 5502 + }, + { + "epoch": 2.1479313036690084, + "grad_norm": 0.47013797350801406, + "learning_rate": 2.262758198846046e-06, + "loss": 0.5555, + "step": 5503 + }, + { + "epoch": 2.14832162373146, + "grad_norm": 0.48234401830688706, + "learning_rate": 2.2608580845887546e-06, + "loss": 0.5662, + "step": 5504 + }, + { + "epoch": 2.148711943793911, + "grad_norm": 0.5184881399563919, + "learning_rate": 2.258958535370849e-06, + "loss": 0.581, + "step": 5505 + }, + { + "epoch": 2.149102263856362, + "grad_norm": 0.438719184043676, + "learning_rate": 2.2570595515841748e-06, + "loss": 0.5801, + "step": 5506 + }, + { + "epoch": 2.1494925839188133, + "grad_norm": 0.4502882932020716, + "learning_rate": 2.2551611336204584e-06, + "loss": 0.5577, + "step": 5507 + }, + { + "epoch": 2.149882903981265, + "grad_norm": 0.47579308457584757, + "learning_rate": 2.2532632818713112e-06, + "loss": 0.5697, + "step": 5508 + }, + { + "epoch": 2.150273224043716, + "grad_norm": 0.42938222219496586, + "learning_rate": 2.251365996728232e-06, + "loss": 0.5453, + "step": 5509 + }, + { + "epoch": 2.150663544106167, + "grad_norm": 0.46710760307830584, + "learning_rate": 2.249469278582595e-06, + "loss": 0.5649, + "step": 5510 + }, + { + "epoch": 2.151053864168618, + "grad_norm": 0.5262239132772905, + "learning_rate": 2.2475731278256656e-06, + "loss": 0.5825, + "step": 5511 + }, + { + "epoch": 2.1514441842310696, + "grad_norm": 0.44474422241311545, + "learning_rate": 2.245677544848585e-06, + "loss": 0.5584, + "step": 5512 + }, + { + "epoch": 2.1518345042935207, + "grad_norm": 0.4518595593672603, + "learning_rate": 2.2437825300423834e-06, + "loss": 0.5357, + "step": 5513 + }, + { + "epoch": 2.152224824355972, + "grad_norm": 0.5603401062645683, + "learning_rate": 2.2418880837979683e-06, + "loss": 0.5792, + "step": 5514 + }, + { + "epoch": 2.152615144418423, + "grad_norm": 0.4544156326905815, + "learning_rate": 2.239994206506134e-06, + "loss": 0.5572, + "step": 5515 + }, + { + "epoch": 2.1530054644808745, + "grad_norm": 0.5091334546068177, + "learning_rate": 2.238100898557553e-06, + "loss": 0.548, + "step": 5516 + }, + { + "epoch": 2.1533957845433256, + "grad_norm": 0.45651696335214403, + "learning_rate": 2.2362081603427877e-06, + "loss": 0.564, + "step": 5517 + }, + { + "epoch": 2.1537861046057767, + "grad_norm": 0.49417935713936073, + "learning_rate": 2.234315992252274e-06, + "loss": 0.5619, + "step": 5518 + }, + { + "epoch": 2.154176424668228, + "grad_norm": 0.43721622372414864, + "learning_rate": 2.2324243946763386e-06, + "loss": 0.5724, + "step": 5519 + }, + { + "epoch": 2.1545667447306793, + "grad_norm": 0.466092774408227, + "learning_rate": 2.2305333680051843e-06, + "loss": 0.5686, + "step": 5520 + }, + { + "epoch": 2.1549570647931304, + "grad_norm": 0.47378639013698576, + "learning_rate": 2.2286429126288978e-06, + "loss": 0.5707, + "step": 5521 + }, + { + "epoch": 2.1553473848555815, + "grad_norm": 0.48357512584558765, + "learning_rate": 2.226753028937451e-06, + "loss": 0.5624, + "step": 5522 + }, + { + "epoch": 2.1557377049180326, + "grad_norm": 0.4824672964392724, + "learning_rate": 2.2248637173206944e-06, + "loss": 0.5834, + "step": 5523 + }, + { + "epoch": 2.156128024980484, + "grad_norm": 0.5011525745197969, + "learning_rate": 2.222974978168361e-06, + "loss": 0.5312, + "step": 5524 + }, + { + "epoch": 2.1565183450429353, + "grad_norm": 0.4624153553962062, + "learning_rate": 2.221086811870066e-06, + "loss": 0.5689, + "step": 5525 + }, + { + "epoch": 2.1569086651053864, + "grad_norm": 0.4921967724546974, + "learning_rate": 2.219199218815308e-06, + "loss": 0.5871, + "step": 5526 + }, + { + "epoch": 2.1572989851678375, + "grad_norm": 0.46745188875208965, + "learning_rate": 2.217312199393463e-06, + "loss": 0.5861, + "step": 5527 + }, + { + "epoch": 2.157689305230289, + "grad_norm": 0.46477169681735137, + "learning_rate": 2.2154257539937964e-06, + "loss": 0.5544, + "step": 5528 + }, + { + "epoch": 2.15807962529274, + "grad_norm": 0.5544642852069046, + "learning_rate": 2.213539883005446e-06, + "loss": 0.5894, + "step": 5529 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 0.5089994830383786, + "learning_rate": 2.2116545868174395e-06, + "loss": 0.5735, + "step": 5530 + }, + { + "epoch": 2.1588602654176423, + "grad_norm": 0.4674882366215814, + "learning_rate": 2.2097698658186784e-06, + "loss": 0.58, + "step": 5531 + }, + { + "epoch": 2.159250585480094, + "grad_norm": 0.5045012499811528, + "learning_rate": 2.2078857203979547e-06, + "loss": 0.5521, + "step": 5532 + }, + { + "epoch": 2.159640905542545, + "grad_norm": 0.4887871158667781, + "learning_rate": 2.2060021509439294e-06, + "loss": 0.5399, + "step": 5533 + }, + { + "epoch": 2.160031225604996, + "grad_norm": 0.46029652873658755, + "learning_rate": 2.204119157845156e-06, + "loss": 0.5431, + "step": 5534 + }, + { + "epoch": 2.160421545667447, + "grad_norm": 0.45193976675574044, + "learning_rate": 2.202236741490062e-06, + "loss": 0.5452, + "step": 5535 + }, + { + "epoch": 2.1608118657298987, + "grad_norm": 0.5681202741613527, + "learning_rate": 2.2003549022669617e-06, + "loss": 0.5584, + "step": 5536 + }, + { + "epoch": 2.16120218579235, + "grad_norm": 0.4979782372049303, + "learning_rate": 2.1984736405640455e-06, + "loss": 0.5956, + "step": 5537 + }, + { + "epoch": 2.161592505854801, + "grad_norm": 0.5005475921094515, + "learning_rate": 2.1965929567693843e-06, + "loss": 0.5716, + "step": 5538 + }, + { + "epoch": 2.161982825917252, + "grad_norm": 0.4968585049679739, + "learning_rate": 2.1947128512709367e-06, + "loss": 0.5638, + "step": 5539 + }, + { + "epoch": 2.1623731459797035, + "grad_norm": 0.4814176070839852, + "learning_rate": 2.1928333244565315e-06, + "loss": 0.567, + "step": 5540 + }, + { + "epoch": 2.1627634660421546, + "grad_norm": 0.511026236431742, + "learning_rate": 2.1909543767138908e-06, + "loss": 0.5583, + "step": 5541 + }, + { + "epoch": 2.1631537861046057, + "grad_norm": 0.5128997134205808, + "learning_rate": 2.1890760084306025e-06, + "loss": 0.581, + "step": 5542 + }, + { + "epoch": 2.163544106167057, + "grad_norm": 0.4891253402324652, + "learning_rate": 2.1871982199941477e-06, + "loss": 0.5397, + "step": 5543 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.45558083800992943, + "learning_rate": 2.1853210117918807e-06, + "loss": 0.5271, + "step": 5544 + }, + { + "epoch": 2.1643247462919595, + "grad_norm": 0.4677495971824132, + "learning_rate": 2.18344438421104e-06, + "loss": 0.5627, + "step": 5545 + }, + { + "epoch": 2.1647150663544106, + "grad_norm": 0.47025602023333607, + "learning_rate": 2.18156833763874e-06, + "loss": 0.5816, + "step": 5546 + }, + { + "epoch": 2.1651053864168617, + "grad_norm": 0.5111147086208283, + "learning_rate": 2.1796928724619824e-06, + "loss": 0.5289, + "step": 5547 + }, + { + "epoch": 2.165495706479313, + "grad_norm": 0.5213387873162769, + "learning_rate": 2.1778179890676395e-06, + "loss": 0.5925, + "step": 5548 + }, + { + "epoch": 2.1658860265417643, + "grad_norm": 0.4800673656120551, + "learning_rate": 2.175943687842472e-06, + "loss": 0.5777, + "step": 5549 + }, + { + "epoch": 2.1662763466042154, + "grad_norm": 0.49235756712387735, + "learning_rate": 2.1740699691731165e-06, + "loss": 0.563, + "step": 5550 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.5029336975066565, + "learning_rate": 2.172196833446089e-06, + "loss": 0.5544, + "step": 5551 + }, + { + "epoch": 2.167056986729118, + "grad_norm": 0.47695858543632436, + "learning_rate": 2.1703242810477846e-06, + "loss": 0.5504, + "step": 5552 + }, + { + "epoch": 2.167447306791569, + "grad_norm": 0.5295198231368284, + "learning_rate": 2.168452312364484e-06, + "loss": 0.5688, + "step": 5553 + }, + { + "epoch": 2.1678376268540203, + "grad_norm": 0.45034246834230773, + "learning_rate": 2.16658092778234e-06, + "loss": 0.5835, + "step": 5554 + }, + { + "epoch": 2.1682279469164714, + "grad_norm": 0.4650015566354948, + "learning_rate": 2.1647101276873878e-06, + "loss": 0.5803, + "step": 5555 + }, + { + "epoch": 2.168618266978923, + "grad_norm": 0.5136698063698049, + "learning_rate": 2.1628399124655448e-06, + "loss": 0.5717, + "step": 5556 + }, + { + "epoch": 2.169008587041374, + "grad_norm": 0.4824810685461044, + "learning_rate": 2.160970282502602e-06, + "loss": 0.5603, + "step": 5557 + }, + { + "epoch": 2.169398907103825, + "grad_norm": 0.4495503629348701, + "learning_rate": 2.1591012381842363e-06, + "loss": 0.6166, + "step": 5558 + }, + { + "epoch": 2.169789227166276, + "grad_norm": 0.47008703373963445, + "learning_rate": 2.157232779895999e-06, + "loss": 0.6095, + "step": 5559 + }, + { + "epoch": 2.1701795472287277, + "grad_norm": 0.4924541044741657, + "learning_rate": 2.1553649080233225e-06, + "loss": 0.5397, + "step": 5560 + }, + { + "epoch": 2.170569867291179, + "grad_norm": 0.4786182692177823, + "learning_rate": 2.1534976229515147e-06, + "loss": 0.5579, + "step": 5561 + }, + { + "epoch": 2.17096018735363, + "grad_norm": 0.42637308107427957, + "learning_rate": 2.15163092506577e-06, + "loss": 0.5622, + "step": 5562 + }, + { + "epoch": 2.171350507416081, + "grad_norm": 0.45495420668203795, + "learning_rate": 2.149764814751153e-06, + "loss": 0.6306, + "step": 5563 + }, + { + "epoch": 2.1717408274785326, + "grad_norm": 0.44539589309684036, + "learning_rate": 2.1478992923926144e-06, + "loss": 0.5883, + "step": 5564 + }, + { + "epoch": 2.1721311475409837, + "grad_norm": 0.4836547060320631, + "learning_rate": 2.1460343583749774e-06, + "loss": 0.5699, + "step": 5565 + }, + { + "epoch": 2.172521467603435, + "grad_norm": 0.44979236024452596, + "learning_rate": 2.144170013082951e-06, + "loss": 0.5516, + "step": 5566 + }, + { + "epoch": 2.172911787665886, + "grad_norm": 0.4519101956938627, + "learning_rate": 2.1423062569011156e-06, + "loss": 0.5644, + "step": 5567 + }, + { + "epoch": 2.1733021077283374, + "grad_norm": 0.453125956413647, + "learning_rate": 2.140443090213934e-06, + "loss": 0.5242, + "step": 5568 + }, + { + "epoch": 2.1736924277907885, + "grad_norm": 0.5101198717402552, + "learning_rate": 2.1385805134057437e-06, + "loss": 0.5604, + "step": 5569 + }, + { + "epoch": 2.1740827478532396, + "grad_norm": 0.4449337148141187, + "learning_rate": 2.1367185268607683e-06, + "loss": 0.5544, + "step": 5570 + }, + { + "epoch": 2.1744730679156907, + "grad_norm": 0.47731928493667264, + "learning_rate": 2.1348571309631012e-06, + "loss": 0.533, + "step": 5571 + }, + { + "epoch": 2.1748633879781423, + "grad_norm": 0.5208851055850517, + "learning_rate": 2.132996326096717e-06, + "loss": 0.5936, + "step": 5572 + }, + { + "epoch": 2.1752537080405934, + "grad_norm": 0.4718255728448843, + "learning_rate": 2.1311361126454712e-06, + "loss": 0.5996, + "step": 5573 + }, + { + "epoch": 2.1756440281030445, + "grad_norm": 0.46495739538476205, + "learning_rate": 2.1292764909930925e-06, + "loss": 0.5813, + "step": 5574 + }, + { + "epoch": 2.1760343481654956, + "grad_norm": 0.49221263944654287, + "learning_rate": 2.1274174615231922e-06, + "loss": 0.578, + "step": 5575 + }, + { + "epoch": 2.176424668227947, + "grad_norm": 0.47749086872553254, + "learning_rate": 2.1255590246192565e-06, + "loss": 0.5405, + "step": 5576 + }, + { + "epoch": 2.176814988290398, + "grad_norm": 0.4704615505927714, + "learning_rate": 2.123701180664649e-06, + "loss": 0.5489, + "step": 5577 + }, + { + "epoch": 2.1772053083528493, + "grad_norm": 0.4766683748703389, + "learning_rate": 2.1218439300426113e-06, + "loss": 0.5766, + "step": 5578 + }, + { + "epoch": 2.1775956284153004, + "grad_norm": 0.5101041764039139, + "learning_rate": 2.119987273136266e-06, + "loss": 0.5759, + "step": 5579 + }, + { + "epoch": 2.177985948477752, + "grad_norm": 0.49749957889784485, + "learning_rate": 2.1181312103286073e-06, + "loss": 0.5696, + "step": 5580 + }, + { + "epoch": 2.178376268540203, + "grad_norm": 0.5200169985228411, + "learning_rate": 2.116275742002513e-06, + "loss": 0.6355, + "step": 5581 + }, + { + "epoch": 2.178766588602654, + "grad_norm": 0.47712194173148015, + "learning_rate": 2.1144208685407326e-06, + "loss": 0.5673, + "step": 5582 + }, + { + "epoch": 2.1791569086651053, + "grad_norm": 0.45582190463024896, + "learning_rate": 2.112566590325899e-06, + "loss": 0.5829, + "step": 5583 + }, + { + "epoch": 2.179547228727557, + "grad_norm": 0.5361169550646414, + "learning_rate": 2.110712907740517e-06, + "loss": 0.5953, + "step": 5584 + }, + { + "epoch": 2.179937548790008, + "grad_norm": 0.526342055639446, + "learning_rate": 2.10885982116697e-06, + "loss": 0.5887, + "step": 5585 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 0.4349009335722423, + "learning_rate": 2.1070073309875206e-06, + "loss": 0.5816, + "step": 5586 + }, + { + "epoch": 2.18071818891491, + "grad_norm": 0.4733239340548701, + "learning_rate": 2.1051554375843026e-06, + "loss": 0.5599, + "step": 5587 + }, + { + "epoch": 2.1811085089773616, + "grad_norm": 0.47672637095914466, + "learning_rate": 2.1033041413393357e-06, + "loss": 0.5465, + "step": 5588 + }, + { + "epoch": 2.1814988290398127, + "grad_norm": 0.4655702560586244, + "learning_rate": 2.1014534426345077e-06, + "loss": 0.5894, + "step": 5589 + }, + { + "epoch": 2.181889149102264, + "grad_norm": 0.5062006644513388, + "learning_rate": 2.099603341851591e-06, + "loss": 0.5704, + "step": 5590 + }, + { + "epoch": 2.182279469164715, + "grad_norm": 0.48661930856302654, + "learning_rate": 2.0977538393722257e-06, + "loss": 0.5569, + "step": 5591 + }, + { + "epoch": 2.1826697892271665, + "grad_norm": 0.4476741748808881, + "learning_rate": 2.0959049355779385e-06, + "loss": 0.597, + "step": 5592 + }, + { + "epoch": 2.1830601092896176, + "grad_norm": 0.4796706025634717, + "learning_rate": 2.0940566308501225e-06, + "loss": 0.557, + "step": 5593 + }, + { + "epoch": 2.1834504293520687, + "grad_norm": 0.49194735545170537, + "learning_rate": 2.092208925570059e-06, + "loss": 0.5768, + "step": 5594 + }, + { + "epoch": 2.1838407494145198, + "grad_norm": 0.4657406113803745, + "learning_rate": 2.0903618201188907e-06, + "loss": 0.5328, + "step": 5595 + }, + { + "epoch": 2.1842310694769713, + "grad_norm": 0.43395441525738493, + "learning_rate": 2.08851531487765e-06, + "loss": 0.5384, + "step": 5596 + }, + { + "epoch": 2.1846213895394224, + "grad_norm": 0.4958703563604627, + "learning_rate": 2.0866694102272374e-06, + "loss": 0.5509, + "step": 5597 + }, + { + "epoch": 2.1850117096018735, + "grad_norm": 0.49042746902663975, + "learning_rate": 2.0848241065484347e-06, + "loss": 0.5576, + "step": 5598 + }, + { + "epoch": 2.1854020296643246, + "grad_norm": 0.4200915799916064, + "learning_rate": 2.082979404221897e-06, + "loss": 0.6008, + "step": 5599 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 0.551141682949422, + "learning_rate": 2.081135303628153e-06, + "loss": 0.5928, + "step": 5600 + }, + { + "epoch": 2.1861826697892273, + "grad_norm": 0.4880113538785467, + "learning_rate": 2.079291805147613e-06, + "loss": 0.5548, + "step": 5601 + }, + { + "epoch": 2.1865729898516784, + "grad_norm": 0.5062449086349111, + "learning_rate": 2.0774489091605577e-06, + "loss": 0.5421, + "step": 5602 + }, + { + "epoch": 2.1869633099141295, + "grad_norm": 0.4389239927681722, + "learning_rate": 2.075606616047151e-06, + "loss": 0.5555, + "step": 5603 + }, + { + "epoch": 2.187353629976581, + "grad_norm": 0.5099468353453238, + "learning_rate": 2.073764926187419e-06, + "loss": 0.5623, + "step": 5604 + }, + { + "epoch": 2.187743950039032, + "grad_norm": 0.48740482019850756, + "learning_rate": 2.071923839961279e-06, + "loss": 0.567, + "step": 5605 + }, + { + "epoch": 2.188134270101483, + "grad_norm": 0.4776832577594766, + "learning_rate": 2.070083357748511e-06, + "loss": 0.5748, + "step": 5606 + }, + { + "epoch": 2.1885245901639343, + "grad_norm": 0.4838909802597691, + "learning_rate": 2.0682434799287803e-06, + "loss": 0.5562, + "step": 5607 + }, + { + "epoch": 2.1889149102263854, + "grad_norm": 0.506914309886522, + "learning_rate": 2.066404206881619e-06, + "loss": 0.5623, + "step": 5608 + }, + { + "epoch": 2.189305230288837, + "grad_norm": 0.50167145394925, + "learning_rate": 2.0645655389864434e-06, + "loss": 0.6014, + "step": 5609 + }, + { + "epoch": 2.189695550351288, + "grad_norm": 0.4653258027918453, + "learning_rate": 2.062727476622535e-06, + "loss": 0.5706, + "step": 5610 + }, + { + "epoch": 2.190085870413739, + "grad_norm": 0.5216765995695947, + "learning_rate": 2.0608900201690613e-06, + "loss": 0.5555, + "step": 5611 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.4565555957942967, + "learning_rate": 2.059053170005052e-06, + "loss": 0.5341, + "step": 5612 + }, + { + "epoch": 2.190866510538642, + "grad_norm": 0.4678708680563365, + "learning_rate": 2.057216926509424e-06, + "loss": 0.6014, + "step": 5613 + }, + { + "epoch": 2.191256830601093, + "grad_norm": 0.4481937619072425, + "learning_rate": 2.0553812900609605e-06, + "loss": 0.554, + "step": 5614 + }, + { + "epoch": 2.191647150663544, + "grad_norm": 0.4944561420145759, + "learning_rate": 2.0535462610383256e-06, + "loss": 0.5928, + "step": 5615 + }, + { + "epoch": 2.192037470725995, + "grad_norm": 0.47660306653818074, + "learning_rate": 2.051711839820054e-06, + "loss": 0.5648, + "step": 5616 + }, + { + "epoch": 2.1924277907884466, + "grad_norm": 0.44765718551669686, + "learning_rate": 2.0498780267845543e-06, + "loss": 0.5642, + "step": 5617 + }, + { + "epoch": 2.1928181108508977, + "grad_norm": 0.4669582766304815, + "learning_rate": 2.048044822310115e-06, + "loss": 0.5629, + "step": 5618 + }, + { + "epoch": 2.193208430913349, + "grad_norm": 0.46658601629308777, + "learning_rate": 2.046212226774893e-06, + "loss": 0.5346, + "step": 5619 + }, + { + "epoch": 2.1935987509758004, + "grad_norm": 0.4540192234130548, + "learning_rate": 2.0443802405569265e-06, + "loss": 0.6104, + "step": 5620 + }, + { + "epoch": 2.1939890710382515, + "grad_norm": 0.4946651778248044, + "learning_rate": 2.0425488640341174e-06, + "loss": 0.5808, + "step": 5621 + }, + { + "epoch": 2.1943793911007026, + "grad_norm": 0.4752383445382785, + "learning_rate": 2.040718097584253e-06, + "loss": 0.5784, + "step": 5622 + }, + { + "epoch": 2.1947697111631537, + "grad_norm": 0.44464969209714966, + "learning_rate": 2.0388879415849862e-06, + "loss": 0.5742, + "step": 5623 + }, + { + "epoch": 2.1951600312256048, + "grad_norm": 0.4500307676793741, + "learning_rate": 2.0370583964138516e-06, + "loss": 0.6055, + "step": 5624 + }, + { + "epoch": 2.1955503512880563, + "grad_norm": 0.48305333056383787, + "learning_rate": 2.035229462448251e-06, + "loss": 0.5729, + "step": 5625 + }, + { + "epoch": 2.1959406713505074, + "grad_norm": 0.48169357366071813, + "learning_rate": 2.033401140065466e-06, + "loss": 0.5612, + "step": 5626 + }, + { + "epoch": 2.1963309914129585, + "grad_norm": 0.501339379279477, + "learning_rate": 2.0315734296426454e-06, + "loss": 0.6185, + "step": 5627 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 0.5531708116344162, + "learning_rate": 2.0297463315568194e-06, + "loss": 0.5644, + "step": 5628 + }, + { + "epoch": 2.197111631537861, + "grad_norm": 0.5244038544379247, + "learning_rate": 2.027919846184885e-06, + "loss": 0.5513, + "step": 5629 + }, + { + "epoch": 2.1975019516003123, + "grad_norm": 0.5076312997095682, + "learning_rate": 2.0260939739036177e-06, + "loss": 0.5511, + "step": 5630 + }, + { + "epoch": 2.1978922716627634, + "grad_norm": 0.6365316060482279, + "learning_rate": 2.0242687150896614e-06, + "loss": 0.5771, + "step": 5631 + }, + { + "epoch": 2.1982825917252145, + "grad_norm": 0.7023027669985143, + "learning_rate": 2.0224440701195404e-06, + "loss": 0.603, + "step": 5632 + }, + { + "epoch": 2.198672911787666, + "grad_norm": 0.5123544625982133, + "learning_rate": 2.020620039369646e-06, + "loss": 0.5635, + "step": 5633 + }, + { + "epoch": 2.199063231850117, + "grad_norm": 0.4887728370671005, + "learning_rate": 2.018796623216245e-06, + "loss": 0.5837, + "step": 5634 + }, + { + "epoch": 2.199453551912568, + "grad_norm": 0.5248564387873009, + "learning_rate": 2.016973822035482e-06, + "loss": 0.5657, + "step": 5635 + }, + { + "epoch": 2.1998438719750197, + "grad_norm": 0.5299812136778866, + "learning_rate": 2.0151516362033647e-06, + "loss": 0.5841, + "step": 5636 + }, + { + "epoch": 2.200234192037471, + "grad_norm": 0.46180501310481387, + "learning_rate": 2.0133300660957845e-06, + "loss": 0.5615, + "step": 5637 + }, + { + "epoch": 2.200624512099922, + "grad_norm": 0.46680556325575256, + "learning_rate": 2.011509112088499e-06, + "loss": 0.5582, + "step": 5638 + }, + { + "epoch": 2.201014832162373, + "grad_norm": 0.4763622900585284, + "learning_rate": 2.0096887745571407e-06, + "loss": 0.554, + "step": 5639 + }, + { + "epoch": 2.201405152224824, + "grad_norm": 0.5284869312110967, + "learning_rate": 2.0078690538772137e-06, + "loss": 0.5566, + "step": 5640 + }, + { + "epoch": 2.2017954722872757, + "grad_norm": 0.4625022222682762, + "learning_rate": 2.0060499504240993e-06, + "loss": 0.5672, + "step": 5641 + }, + { + "epoch": 2.202185792349727, + "grad_norm": 0.45597164881433544, + "learning_rate": 2.0042314645730445e-06, + "loss": 0.5434, + "step": 5642 + }, + { + "epoch": 2.202576112412178, + "grad_norm": 0.5817542776287247, + "learning_rate": 2.002413596699176e-06, + "loss": 0.5537, + "step": 5643 + }, + { + "epoch": 2.2029664324746294, + "grad_norm": 0.6784498838975772, + "learning_rate": 2.0005963471774864e-06, + "loss": 0.5825, + "step": 5644 + }, + { + "epoch": 2.2033567525370805, + "grad_norm": 0.47343896655710044, + "learning_rate": 1.998779716382848e-06, + "loss": 0.5714, + "step": 5645 + }, + { + "epoch": 2.2037470725995316, + "grad_norm": 0.5361089298584294, + "learning_rate": 1.996963704689999e-06, + "loss": 0.6087, + "step": 5646 + }, + { + "epoch": 2.2041373926619827, + "grad_norm": 0.5185937948478302, + "learning_rate": 1.9951483124735525e-06, + "loss": 0.5836, + "step": 5647 + }, + { + "epoch": 2.204527712724434, + "grad_norm": 0.4556756093517954, + "learning_rate": 1.993333540107993e-06, + "loss": 0.585, + "step": 5648 + }, + { + "epoch": 2.2049180327868854, + "grad_norm": 0.4893097834418867, + "learning_rate": 1.99151938796768e-06, + "loss": 0.5874, + "step": 5649 + }, + { + "epoch": 2.2053083528493365, + "grad_norm": 0.5389742028669559, + "learning_rate": 1.9897058564268418e-06, + "loss": 0.518, + "step": 5650 + }, + { + "epoch": 2.2056986729117876, + "grad_norm": 0.4848245267872112, + "learning_rate": 1.987892945859578e-06, + "loss": 0.5394, + "step": 5651 + }, + { + "epoch": 2.206088992974239, + "grad_norm": 0.5911571751358162, + "learning_rate": 1.9860806566398657e-06, + "loss": 0.5739, + "step": 5652 + }, + { + "epoch": 2.20647931303669, + "grad_norm": 0.6198853673837459, + "learning_rate": 1.984268989141545e-06, + "loss": 0.5681, + "step": 5653 + }, + { + "epoch": 2.2068696330991413, + "grad_norm": 0.5029585487093406, + "learning_rate": 1.982457943738338e-06, + "loss": 0.5752, + "step": 5654 + }, + { + "epoch": 2.2072599531615924, + "grad_norm": 0.5043618369686216, + "learning_rate": 1.9806475208038306e-06, + "loss": 0.5534, + "step": 5655 + }, + { + "epoch": 2.2076502732240435, + "grad_norm": 0.5032814494783923, + "learning_rate": 1.9788377207114827e-06, + "loss": 0.6302, + "step": 5656 + }, + { + "epoch": 2.208040593286495, + "grad_norm": 0.4835010091750912, + "learning_rate": 1.9770285438346254e-06, + "loss": 0.577, + "step": 5657 + }, + { + "epoch": 2.208430913348946, + "grad_norm": 0.5265054569752213, + "learning_rate": 1.9752199905464638e-06, + "loss": 0.5986, + "step": 5658 + }, + { + "epoch": 2.2088212334113972, + "grad_norm": 0.5805696335150308, + "learning_rate": 1.97341206122007e-06, + "loss": 0.5548, + "step": 5659 + }, + { + "epoch": 2.209211553473849, + "grad_norm": 0.5767491454207071, + "learning_rate": 1.9716047562283925e-06, + "loss": 0.5644, + "step": 5660 + }, + { + "epoch": 2.2096018735363, + "grad_norm": 0.5738807610090794, + "learning_rate": 1.969798075944245e-06, + "loss": 0.6068, + "step": 5661 + }, + { + "epoch": 2.209992193598751, + "grad_norm": 0.49270428168530866, + "learning_rate": 1.9679920207403186e-06, + "loss": 0.5697, + "step": 5662 + }, + { + "epoch": 2.210382513661202, + "grad_norm": 0.47073049512231185, + "learning_rate": 1.9661865909891716e-06, + "loss": 0.6066, + "step": 5663 + }, + { + "epoch": 2.210772833723653, + "grad_norm": 0.49013404133406885, + "learning_rate": 1.964381787063234e-06, + "loss": 0.5616, + "step": 5664 + }, + { + "epoch": 2.2111631537861047, + "grad_norm": 0.5212604498836351, + "learning_rate": 1.962577609334804e-06, + "loss": 0.5452, + "step": 5665 + }, + { + "epoch": 2.211553473848556, + "grad_norm": 0.49694083312663573, + "learning_rate": 1.9607740581760575e-06, + "loss": 0.5261, + "step": 5666 + }, + { + "epoch": 2.211943793911007, + "grad_norm": 0.5322313869205763, + "learning_rate": 1.9589711339590357e-06, + "loss": 0.5796, + "step": 5667 + }, + { + "epoch": 2.2123341139734585, + "grad_norm": 0.5155282860391803, + "learning_rate": 1.9571688370556497e-06, + "loss": 0.5908, + "step": 5668 + }, + { + "epoch": 2.2127244340359096, + "grad_norm": 0.47591789896033126, + "learning_rate": 1.955367167837687e-06, + "loss": 0.5444, + "step": 5669 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.45062865798983825, + "learning_rate": 1.953566126676799e-06, + "loss": 0.5936, + "step": 5670 + }, + { + "epoch": 2.2135050741608118, + "grad_norm": 0.524393424316526, + "learning_rate": 1.9517657139445134e-06, + "loss": 0.553, + "step": 5671 + }, + { + "epoch": 2.213895394223263, + "grad_norm": 0.4527679449122978, + "learning_rate": 1.9499659300122212e-06, + "loss": 0.5443, + "step": 5672 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.4412849665404672, + "learning_rate": 1.9481667752511945e-06, + "loss": 0.5864, + "step": 5673 + }, + { + "epoch": 2.2146760343481655, + "grad_norm": 0.45252392295915717, + "learning_rate": 1.9463682500325616e-06, + "loss": 0.5415, + "step": 5674 + }, + { + "epoch": 2.2150663544106166, + "grad_norm": 0.6025643732363165, + "learning_rate": 1.944570354727333e-06, + "loss": 0.5874, + "step": 5675 + }, + { + "epoch": 2.2154566744730677, + "grad_norm": 0.6391876724109996, + "learning_rate": 1.9427730897063814e-06, + "loss": 0.6032, + "step": 5676 + }, + { + "epoch": 2.2158469945355193, + "grad_norm": 0.4318682639815748, + "learning_rate": 1.9409764553404564e-06, + "loss": 0.5665, + "step": 5677 + }, + { + "epoch": 2.2162373145979704, + "grad_norm": 0.4702923266454191, + "learning_rate": 1.93918045200017e-06, + "loss": 0.5579, + "step": 5678 + }, + { + "epoch": 2.2166276346604215, + "grad_norm": 0.6845892062283387, + "learning_rate": 1.9373850800560113e-06, + "loss": 0.5684, + "step": 5679 + }, + { + "epoch": 2.2170179547228726, + "grad_norm": 0.5275312993536201, + "learning_rate": 1.9355903398783333e-06, + "loss": 0.5617, + "step": 5680 + }, + { + "epoch": 2.217408274785324, + "grad_norm": 0.4574744721814201, + "learning_rate": 1.9337962318373603e-06, + "loss": 0.558, + "step": 5681 + }, + { + "epoch": 2.217798594847775, + "grad_norm": 0.6236356016375448, + "learning_rate": 1.9320027563031905e-06, + "loss": 0.5709, + "step": 5682 + }, + { + "epoch": 2.2181889149102263, + "grad_norm": 0.5380050775936013, + "learning_rate": 1.930209913645785e-06, + "loss": 0.5407, + "step": 5683 + }, + { + "epoch": 2.2185792349726774, + "grad_norm": 0.5402315770108891, + "learning_rate": 1.9284177042349787e-06, + "loss": 0.5726, + "step": 5684 + }, + { + "epoch": 2.218969555035129, + "grad_norm": 0.4964229640696921, + "learning_rate": 1.926626128440473e-06, + "loss": 0.6159, + "step": 5685 + }, + { + "epoch": 2.21935987509758, + "grad_norm": 0.5350879147222656, + "learning_rate": 1.9248351866318422e-06, + "loss": 0.5916, + "step": 5686 + }, + { + "epoch": 2.219750195160031, + "grad_norm": 0.45432511388960467, + "learning_rate": 1.923044879178525e-06, + "loss": 0.5567, + "step": 5687 + }, + { + "epoch": 2.2201405152224822, + "grad_norm": 0.4331096919559844, + "learning_rate": 1.9212552064498364e-06, + "loss": 0.5529, + "step": 5688 + }, + { + "epoch": 2.220530835284934, + "grad_norm": 0.44037156748594597, + "learning_rate": 1.9194661688149512e-06, + "loss": 0.5724, + "step": 5689 + }, + { + "epoch": 2.220921155347385, + "grad_norm": 0.4735685972648729, + "learning_rate": 1.9176777666429225e-06, + "loss": 0.6264, + "step": 5690 + }, + { + "epoch": 2.221311475409836, + "grad_norm": 0.475562103684175, + "learning_rate": 1.9158900003026655e-06, + "loss": 0.6003, + "step": 5691 + }, + { + "epoch": 2.221701795472287, + "grad_norm": 0.45840466196808544, + "learning_rate": 1.914102870162967e-06, + "loss": 0.5423, + "step": 5692 + }, + { + "epoch": 2.2220921155347386, + "grad_norm": 0.4842446184983579, + "learning_rate": 1.912316376592481e-06, + "loss": 0.5393, + "step": 5693 + }, + { + "epoch": 2.2224824355971897, + "grad_norm": 0.43664145408841504, + "learning_rate": 1.910530519959734e-06, + "loss": 0.5706, + "step": 5694 + }, + { + "epoch": 2.222872755659641, + "grad_norm": 0.45828722419317847, + "learning_rate": 1.9087453006331152e-06, + "loss": 0.546, + "step": 5695 + }, + { + "epoch": 2.223263075722092, + "grad_norm": 0.415739437594793, + "learning_rate": 1.9069607189808893e-06, + "loss": 0.6098, + "step": 5696 + }, + { + "epoch": 2.2236533957845435, + "grad_norm": 0.46327582972865083, + "learning_rate": 1.9051767753711841e-06, + "loss": 0.5762, + "step": 5697 + }, + { + "epoch": 2.2240437158469946, + "grad_norm": 0.4564051956973459, + "learning_rate": 1.9033934701719959e-06, + "loss": 0.626, + "step": 5698 + }, + { + "epoch": 2.2244340359094457, + "grad_norm": 0.4501532776588237, + "learning_rate": 1.9016108037511938e-06, + "loss": 0.5623, + "step": 5699 + }, + { + "epoch": 2.2248243559718968, + "grad_norm": 0.4796399873286543, + "learning_rate": 1.8998287764765104e-06, + "loss": 0.5961, + "step": 5700 + }, + { + "epoch": 2.2252146760343483, + "grad_norm": 0.4536818072885757, + "learning_rate": 1.8980473887155493e-06, + "loss": 0.6025, + "step": 5701 + }, + { + "epoch": 2.2256049960967994, + "grad_norm": 0.5640739906204434, + "learning_rate": 1.896266640835779e-06, + "loss": 0.5592, + "step": 5702 + }, + { + "epoch": 2.2259953161592505, + "grad_norm": 0.4747697552539691, + "learning_rate": 1.8944865332045404e-06, + "loss": 0.5903, + "step": 5703 + }, + { + "epoch": 2.2263856362217016, + "grad_norm": 0.5114004919196877, + "learning_rate": 1.8927070661890384e-06, + "loss": 0.5959, + "step": 5704 + }, + { + "epoch": 2.226775956284153, + "grad_norm": 0.48917635981950347, + "learning_rate": 1.8909282401563495e-06, + "loss": 0.5796, + "step": 5705 + }, + { + "epoch": 2.2271662763466042, + "grad_norm": 0.45173656706600357, + "learning_rate": 1.8891500554734133e-06, + "loss": 0.5708, + "step": 5706 + }, + { + "epoch": 2.2275565964090553, + "grad_norm": 0.4299697476227519, + "learning_rate": 1.887372512507043e-06, + "loss": 0.5802, + "step": 5707 + }, + { + "epoch": 2.2279469164715064, + "grad_norm": 0.5145694016696635, + "learning_rate": 1.8855956116239137e-06, + "loss": 0.5679, + "step": 5708 + }, + { + "epoch": 2.228337236533958, + "grad_norm": 0.4580372996274341, + "learning_rate": 1.883819353190572e-06, + "loss": 0.575, + "step": 5709 + }, + { + "epoch": 2.228727556596409, + "grad_norm": 0.47114514856031053, + "learning_rate": 1.8820437375734268e-06, + "loss": 0.5224, + "step": 5710 + }, + { + "epoch": 2.22911787665886, + "grad_norm": 0.45000726094562804, + "learning_rate": 1.880268765138763e-06, + "loss": 0.5567, + "step": 5711 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 0.5024888102506272, + "learning_rate": 1.8784944362527235e-06, + "loss": 0.5672, + "step": 5712 + }, + { + "epoch": 2.229898516783763, + "grad_norm": 0.4979475756191486, + "learning_rate": 1.8767207512813267e-06, + "loss": 0.55, + "step": 5713 + }, + { + "epoch": 2.230288836846214, + "grad_norm": 0.5182307162733859, + "learning_rate": 1.8749477105904528e-06, + "loss": 0.5704, + "step": 5714 + }, + { + "epoch": 2.230679156908665, + "grad_norm": 0.47341614863455045, + "learning_rate": 1.8731753145458487e-06, + "loss": 0.5652, + "step": 5715 + }, + { + "epoch": 2.231069476971116, + "grad_norm": 0.5111937722606126, + "learning_rate": 1.8714035635131333e-06, + "loss": 0.5418, + "step": 5716 + }, + { + "epoch": 2.2314597970335677, + "grad_norm": 0.5630525756963662, + "learning_rate": 1.8696324578577873e-06, + "loss": 0.5545, + "step": 5717 + }, + { + "epoch": 2.2318501170960188, + "grad_norm": 0.442596520090446, + "learning_rate": 1.8678619979451618e-06, + "loss": 0.5667, + "step": 5718 + }, + { + "epoch": 2.23224043715847, + "grad_norm": 0.4501536168207353, + "learning_rate": 1.86609218414047e-06, + "loss": 0.5644, + "step": 5719 + }, + { + "epoch": 2.232630757220921, + "grad_norm": 0.5337391431838748, + "learning_rate": 1.8643230168087989e-06, + "loss": 0.5874, + "step": 5720 + }, + { + "epoch": 2.2330210772833725, + "grad_norm": 0.52381997053679, + "learning_rate": 1.8625544963150955e-06, + "loss": 0.6094, + "step": 5721 + }, + { + "epoch": 2.2334113973458236, + "grad_norm": 0.4818067834890378, + "learning_rate": 1.8607866230241782e-06, + "loss": 0.5438, + "step": 5722 + }, + { + "epoch": 2.2338017174082747, + "grad_norm": 0.6431286119915547, + "learning_rate": 1.859019397300728e-06, + "loss": 0.5868, + "step": 5723 + }, + { + "epoch": 2.234192037470726, + "grad_norm": 0.4831787111879555, + "learning_rate": 1.8572528195092964e-06, + "loss": 0.5646, + "step": 5724 + }, + { + "epoch": 2.2345823575331774, + "grad_norm": 0.5020083811496753, + "learning_rate": 1.855486890014298e-06, + "loss": 0.556, + "step": 5725 + }, + { + "epoch": 2.2349726775956285, + "grad_norm": 0.542413942115242, + "learning_rate": 1.8537216091800143e-06, + "loss": 0.5438, + "step": 5726 + }, + { + "epoch": 2.2353629976580796, + "grad_norm": 0.49959607911277865, + "learning_rate": 1.8519569773705914e-06, + "loss": 0.5612, + "step": 5727 + }, + { + "epoch": 2.2357533177205307, + "grad_norm": 0.5196667540193117, + "learning_rate": 1.8501929949500475e-06, + "loss": 0.5889, + "step": 5728 + }, + { + "epoch": 2.236143637782982, + "grad_norm": 0.5050002751682179, + "learning_rate": 1.8484296622822606e-06, + "loss": 0.6007, + "step": 5729 + }, + { + "epoch": 2.2365339578454333, + "grad_norm": 0.4711026247085566, + "learning_rate": 1.8466669797309755e-06, + "loss": 0.5815, + "step": 5730 + }, + { + "epoch": 2.2369242779078844, + "grad_norm": 0.5573546028201071, + "learning_rate": 1.8449049476598075e-06, + "loss": 0.6162, + "step": 5731 + }, + { + "epoch": 2.2373145979703355, + "grad_norm": 0.4634023164880004, + "learning_rate": 1.8431435664322312e-06, + "loss": 0.5678, + "step": 5732 + }, + { + "epoch": 2.237704918032787, + "grad_norm": 0.45894347023393867, + "learning_rate": 1.8413828364115937e-06, + "loss": 0.5706, + "step": 5733 + }, + { + "epoch": 2.238095238095238, + "grad_norm": 0.4432886655134036, + "learning_rate": 1.8396227579611003e-06, + "loss": 0.5805, + "step": 5734 + }, + { + "epoch": 2.2384855581576892, + "grad_norm": 0.47420987326380026, + "learning_rate": 1.8378633314438316e-06, + "loss": 0.5812, + "step": 5735 + }, + { + "epoch": 2.2388758782201403, + "grad_norm": 0.44996848282267093, + "learning_rate": 1.8361045572227216e-06, + "loss": 0.5628, + "step": 5736 + }, + { + "epoch": 2.239266198282592, + "grad_norm": 0.46438729527954864, + "learning_rate": 1.83434643566058e-06, + "loss": 0.5672, + "step": 5737 + }, + { + "epoch": 2.239656518345043, + "grad_norm": 0.48628622146316025, + "learning_rate": 1.8325889671200753e-06, + "loss": 0.5298, + "step": 5738 + }, + { + "epoch": 2.240046838407494, + "grad_norm": 0.47254187741455633, + "learning_rate": 1.830832151963748e-06, + "loss": 0.5718, + "step": 5739 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 0.554564410536171, + "learning_rate": 1.829075990553995e-06, + "loss": 0.5644, + "step": 5740 + }, + { + "epoch": 2.2408274785323967, + "grad_norm": 0.4705173335939437, + "learning_rate": 1.8273204832530883e-06, + "loss": 0.5314, + "step": 5741 + }, + { + "epoch": 2.241217798594848, + "grad_norm": 0.4701591493459644, + "learning_rate": 1.8255656304231567e-06, + "loss": 0.5979, + "step": 5742 + }, + { + "epoch": 2.241608118657299, + "grad_norm": 0.49419548129697616, + "learning_rate": 1.8238114324261963e-06, + "loss": 0.5979, + "step": 5743 + }, + { + "epoch": 2.24199843871975, + "grad_norm": 0.5148471284446007, + "learning_rate": 1.8220578896240716e-06, + "loss": 0.5898, + "step": 5744 + }, + { + "epoch": 2.2423887587822016, + "grad_norm": 0.48890517913154685, + "learning_rate": 1.820305002378508e-06, + "loss": 0.596, + "step": 5745 + }, + { + "epoch": 2.2427790788446527, + "grad_norm": 0.5348421508001383, + "learning_rate": 1.8185527710510976e-06, + "loss": 0.5451, + "step": 5746 + }, + { + "epoch": 2.2431693989071038, + "grad_norm": 0.48202406330939895, + "learning_rate": 1.816801196003294e-06, + "loss": 0.5681, + "step": 5747 + }, + { + "epoch": 2.243559718969555, + "grad_norm": 0.49808211074791187, + "learning_rate": 1.8150502775964212e-06, + "loss": 0.5566, + "step": 5748 + }, + { + "epoch": 2.2439500390320064, + "grad_norm": 0.5224827480435921, + "learning_rate": 1.8133000161916619e-06, + "loss": 0.5546, + "step": 5749 + }, + { + "epoch": 2.2443403590944575, + "grad_norm": 0.55759923884315, + "learning_rate": 1.8115504121500687e-06, + "loss": 0.6009, + "step": 5750 + }, + { + "epoch": 2.2447306791569086, + "grad_norm": 0.47956888692228117, + "learning_rate": 1.809801465832553e-06, + "loss": 0.5691, + "step": 5751 + }, + { + "epoch": 2.2451209992193597, + "grad_norm": 0.5376203261395321, + "learning_rate": 1.8080531775998977e-06, + "loss": 0.575, + "step": 5752 + }, + { + "epoch": 2.2455113192818112, + "grad_norm": 0.51176096106565, + "learning_rate": 1.806305547812739e-06, + "loss": 0.5785, + "step": 5753 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 0.5555520017436283, + "learning_rate": 1.8045585768315893e-06, + "loss": 0.5943, + "step": 5754 + }, + { + "epoch": 2.2462919594067134, + "grad_norm": 0.5437500609444966, + "learning_rate": 1.8028122650168156e-06, + "loss": 0.5171, + "step": 5755 + }, + { + "epoch": 2.2466822794691645, + "grad_norm": 0.4894510556183693, + "learning_rate": 1.8010666127286576e-06, + "loss": 0.5855, + "step": 5756 + }, + { + "epoch": 2.247072599531616, + "grad_norm": 0.486925700751911, + "learning_rate": 1.799321620327209e-06, + "loss": 0.5787, + "step": 5757 + }, + { + "epoch": 2.247462919594067, + "grad_norm": 0.42953132536447053, + "learning_rate": 1.7975772881724378e-06, + "loss": 0.5795, + "step": 5758 + }, + { + "epoch": 2.2478532396565183, + "grad_norm": 0.5209790741097315, + "learning_rate": 1.795833616624168e-06, + "loss": 0.5866, + "step": 5759 + }, + { + "epoch": 2.2482435597189694, + "grad_norm": 0.5558615436913361, + "learning_rate": 1.7940906060420888e-06, + "loss": 0.5771, + "step": 5760 + }, + { + "epoch": 2.248633879781421, + "grad_norm": 0.5894202865951002, + "learning_rate": 1.7923482567857574e-06, + "loss": 0.5701, + "step": 5761 + }, + { + "epoch": 2.249024199843872, + "grad_norm": 0.5244218248665418, + "learning_rate": 1.7906065692145901e-06, + "loss": 0.5668, + "step": 5762 + }, + { + "epoch": 2.249414519906323, + "grad_norm": 0.6580372119389456, + "learning_rate": 1.7888655436878676e-06, + "loss": 0.5514, + "step": 5763 + }, + { + "epoch": 2.2498048399687742, + "grad_norm": 0.573959207629765, + "learning_rate": 1.7871251805647333e-06, + "loss": 0.5868, + "step": 5764 + }, + { + "epoch": 2.2501951600312258, + "grad_norm": 0.4561340220075381, + "learning_rate": 1.7853854802041982e-06, + "loss": 0.5813, + "step": 5765 + }, + { + "epoch": 2.250585480093677, + "grad_norm": 0.4985642432461105, + "learning_rate": 1.7836464429651295e-06, + "loss": 0.5651, + "step": 5766 + }, + { + "epoch": 2.250975800156128, + "grad_norm": 0.558554512885521, + "learning_rate": 1.7819080692062663e-06, + "loss": 0.599, + "step": 5767 + }, + { + "epoch": 2.251366120218579, + "grad_norm": 0.530815305644436, + "learning_rate": 1.780170359286202e-06, + "loss": 0.5416, + "step": 5768 + }, + { + "epoch": 2.2517564402810306, + "grad_norm": 0.48881205504069825, + "learning_rate": 1.7784333135634008e-06, + "loss": 0.5957, + "step": 5769 + }, + { + "epoch": 2.2521467603434817, + "grad_norm": 0.5043528835904605, + "learning_rate": 1.7766969323961846e-06, + "loss": 0.5538, + "step": 5770 + }, + { + "epoch": 2.252537080405933, + "grad_norm": 0.6112078452244387, + "learning_rate": 1.7749612161427404e-06, + "loss": 0.5958, + "step": 5771 + }, + { + "epoch": 2.252927400468384, + "grad_norm": 0.6299774925845829, + "learning_rate": 1.773226165161115e-06, + "loss": 0.5846, + "step": 5772 + }, + { + "epoch": 2.2533177205308355, + "grad_norm": 0.42883955215041797, + "learning_rate": 1.7714917798092246e-06, + "loss": 0.5878, + "step": 5773 + }, + { + "epoch": 2.2537080405932866, + "grad_norm": 0.5388916981776528, + "learning_rate": 1.7697580604448406e-06, + "loss": 0.5235, + "step": 5774 + }, + { + "epoch": 2.2540983606557377, + "grad_norm": 0.5060877429807584, + "learning_rate": 1.7680250074256027e-06, + "loss": 0.5699, + "step": 5775 + }, + { + "epoch": 2.2544886807181888, + "grad_norm": 0.5873832591450039, + "learning_rate": 1.766292621109011e-06, + "loss": 0.5756, + "step": 5776 + }, + { + "epoch": 2.2548790007806403, + "grad_norm": 0.6007842751882785, + "learning_rate": 1.7645609018524251e-06, + "loss": 0.5787, + "step": 5777 + }, + { + "epoch": 2.2552693208430914, + "grad_norm": 0.5473318038224295, + "learning_rate": 1.7628298500130741e-06, + "loss": 0.5835, + "step": 5778 + }, + { + "epoch": 2.2556596409055425, + "grad_norm": 0.6100842679152094, + "learning_rate": 1.7610994659480423e-06, + "loss": 0.5806, + "step": 5779 + }, + { + "epoch": 2.2560499609679936, + "grad_norm": 0.6943865421051651, + "learning_rate": 1.7593697500142803e-06, + "loss": 0.5894, + "step": 5780 + }, + { + "epoch": 2.256440281030445, + "grad_norm": 0.5821152494808316, + "learning_rate": 1.757640702568597e-06, + "loss": 0.6014, + "step": 5781 + }, + { + "epoch": 2.2568306010928962, + "grad_norm": 0.5442894396783627, + "learning_rate": 1.75591232396767e-06, + "loss": 0.6072, + "step": 5782 + }, + { + "epoch": 2.2572209211553473, + "grad_norm": 0.664125123163137, + "learning_rate": 1.754184614568032e-06, + "loss": 0.5503, + "step": 5783 + }, + { + "epoch": 2.2576112412177984, + "grad_norm": 0.5742589284563918, + "learning_rate": 1.7524575747260836e-06, + "loss": 0.5392, + "step": 5784 + }, + { + "epoch": 2.25800156128025, + "grad_norm": 0.5103550250599498, + "learning_rate": 1.7507312047980807e-06, + "loss": 0.5918, + "step": 5785 + }, + { + "epoch": 2.258391881342701, + "grad_norm": 0.5678790482551735, + "learning_rate": 1.7490055051401489e-06, + "loss": 0.5458, + "step": 5786 + }, + { + "epoch": 2.258782201405152, + "grad_norm": 0.49286484937579533, + "learning_rate": 1.7472804761082674e-06, + "loss": 0.5589, + "step": 5787 + }, + { + "epoch": 2.2591725214676033, + "grad_norm": 0.5227086460049717, + "learning_rate": 1.7455561180582864e-06, + "loss": 0.5779, + "step": 5788 + }, + { + "epoch": 2.259562841530055, + "grad_norm": 0.492183851686698, + "learning_rate": 1.743832431345905e-06, + "loss": 0.5827, + "step": 5789 + }, + { + "epoch": 2.259953161592506, + "grad_norm": 0.5570912088106931, + "learning_rate": 1.742109416326696e-06, + "loss": 0.5824, + "step": 5790 + }, + { + "epoch": 2.260343481654957, + "grad_norm": 0.49137879871685475, + "learning_rate": 1.7403870733560862e-06, + "loss": 0.5419, + "step": 5791 + }, + { + "epoch": 2.260733801717408, + "grad_norm": 0.44622540988140275, + "learning_rate": 1.7386654027893695e-06, + "loss": 0.5942, + "step": 5792 + }, + { + "epoch": 2.2611241217798597, + "grad_norm": 0.4683197052834554, + "learning_rate": 1.7369444049816953e-06, + "loss": 0.577, + "step": 5793 + }, + { + "epoch": 2.2615144418423108, + "grad_norm": 0.4527739387540101, + "learning_rate": 1.735224080288076e-06, + "loss": 0.551, + "step": 5794 + }, + { + "epoch": 2.261904761904762, + "grad_norm": 0.47900516375206187, + "learning_rate": 1.733504429063389e-06, + "loss": 0.564, + "step": 5795 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 0.47815316676556036, + "learning_rate": 1.7317854516623683e-06, + "loss": 0.5395, + "step": 5796 + }, + { + "epoch": 2.2626854020296645, + "grad_norm": 0.4474073597835644, + "learning_rate": 1.7300671484396097e-06, + "loss": 0.5754, + "step": 5797 + }, + { + "epoch": 2.2630757220921156, + "grad_norm": 0.46285651929513183, + "learning_rate": 1.7283495197495693e-06, + "loss": 0.5514, + "step": 5798 + }, + { + "epoch": 2.2634660421545667, + "grad_norm": 0.4786496519993068, + "learning_rate": 1.7266325659465688e-06, + "loss": 0.5346, + "step": 5799 + }, + { + "epoch": 2.263856362217018, + "grad_norm": 0.446903291621599, + "learning_rate": 1.7249162873847837e-06, + "loss": 0.5171, + "step": 5800 + }, + { + "epoch": 2.2642466822794693, + "grad_norm": 0.45142033331484965, + "learning_rate": 1.7232006844182568e-06, + "loss": 0.5442, + "step": 5801 + }, + { + "epoch": 2.2646370023419204, + "grad_norm": 0.45447050023790725, + "learning_rate": 1.721485757400886e-06, + "loss": 0.5337, + "step": 5802 + }, + { + "epoch": 2.2650273224043715, + "grad_norm": 0.6089168599543772, + "learning_rate": 1.719771506686434e-06, + "loss": 0.5421, + "step": 5803 + }, + { + "epoch": 2.2654176424668226, + "grad_norm": 0.4618579061190652, + "learning_rate": 1.7180579326285202e-06, + "loss": 0.5855, + "step": 5804 + }, + { + "epoch": 2.265807962529274, + "grad_norm": 0.46787203802497707, + "learning_rate": 1.7163450355806311e-06, + "loss": 0.5969, + "step": 5805 + }, + { + "epoch": 2.2661982825917253, + "grad_norm": 0.4529788262473759, + "learning_rate": 1.7146328158961011e-06, + "loss": 0.5648, + "step": 5806 + }, + { + "epoch": 2.2665886026541764, + "grad_norm": 0.45665636366570533, + "learning_rate": 1.7129212739281392e-06, + "loss": 0.5359, + "step": 5807 + }, + { + "epoch": 2.2669789227166275, + "grad_norm": 0.5053466325350003, + "learning_rate": 1.7112104100298044e-06, + "loss": 0.56, + "step": 5808 + }, + { + "epoch": 2.267369242779079, + "grad_norm": 0.5589013213708659, + "learning_rate": 1.7095002245540215e-06, + "loss": 0.5791, + "step": 5809 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.5204600992480888, + "learning_rate": 1.707790717853573e-06, + "loss": 0.6074, + "step": 5810 + }, + { + "epoch": 2.2681498829039812, + "grad_norm": 0.5237866256039585, + "learning_rate": 1.7060818902810993e-06, + "loss": 0.5745, + "step": 5811 + }, + { + "epoch": 2.2685402029664323, + "grad_norm": 0.48438765577206133, + "learning_rate": 1.704373742189107e-06, + "loss": 0.5617, + "step": 5812 + }, + { + "epoch": 2.268930523028884, + "grad_norm": 0.4630324328337833, + "learning_rate": 1.702666273929955e-06, + "loss": 0.5446, + "step": 5813 + }, + { + "epoch": 2.269320843091335, + "grad_norm": 0.568442657794646, + "learning_rate": 1.7009594858558698e-06, + "loss": 0.5851, + "step": 5814 + }, + { + "epoch": 2.269711163153786, + "grad_norm": 0.4745813002963367, + "learning_rate": 1.699253378318928e-06, + "loss": 0.5915, + "step": 5815 + }, + { + "epoch": 2.270101483216237, + "grad_norm": 0.46817660606473505, + "learning_rate": 1.6975479516710759e-06, + "loss": 0.5462, + "step": 5816 + }, + { + "epoch": 2.2704918032786887, + "grad_norm": 0.5428348491096823, + "learning_rate": 1.6958432062641101e-06, + "loss": 0.5522, + "step": 5817 + }, + { + "epoch": 2.27088212334114, + "grad_norm": 0.5096713674115892, + "learning_rate": 1.694139142449696e-06, + "loss": 0.5909, + "step": 5818 + }, + { + "epoch": 2.271272443403591, + "grad_norm": 0.5061423447318442, + "learning_rate": 1.6924357605793496e-06, + "loss": 0.5679, + "step": 5819 + }, + { + "epoch": 2.271662763466042, + "grad_norm": 0.4802495223557712, + "learning_rate": 1.6907330610044536e-06, + "loss": 0.5497, + "step": 5820 + }, + { + "epoch": 2.2720530835284936, + "grad_norm": 0.5561079403189368, + "learning_rate": 1.6890310440762436e-06, + "loss": 0.5536, + "step": 5821 + }, + { + "epoch": 2.2724434035909447, + "grad_norm": 0.4867064344029436, + "learning_rate": 1.6873297101458202e-06, + "loss": 0.5467, + "step": 5822 + }, + { + "epoch": 2.2728337236533958, + "grad_norm": 0.5349959285867288, + "learning_rate": 1.6856290595641388e-06, + "loss": 0.5148, + "step": 5823 + }, + { + "epoch": 2.273224043715847, + "grad_norm": 0.4814081600414478, + "learning_rate": 1.6839290926820156e-06, + "loss": 0.5746, + "step": 5824 + }, + { + "epoch": 2.2736143637782984, + "grad_norm": 0.6531652416400314, + "learning_rate": 1.6822298098501233e-06, + "loss": 0.6274, + "step": 5825 + }, + { + "epoch": 2.2740046838407495, + "grad_norm": 0.5026132195924022, + "learning_rate": 1.6805312114189991e-06, + "loss": 0.5763, + "step": 5826 + }, + { + "epoch": 2.2743950039032006, + "grad_norm": 0.4646308123723113, + "learning_rate": 1.6788332977390342e-06, + "loss": 0.5881, + "step": 5827 + }, + { + "epoch": 2.2747853239656517, + "grad_norm": 0.46584486851570295, + "learning_rate": 1.6771360691604783e-06, + "loss": 0.5639, + "step": 5828 + }, + { + "epoch": 2.275175644028103, + "grad_norm": 0.47672513623530155, + "learning_rate": 1.675439526033444e-06, + "loss": 0.5852, + "step": 5829 + }, + { + "epoch": 2.2755659640905543, + "grad_norm": 0.4640498920415771, + "learning_rate": 1.6737436687078974e-06, + "loss": 0.596, + "step": 5830 + }, + { + "epoch": 2.2759562841530054, + "grad_norm": 0.4609496336515621, + "learning_rate": 1.6720484975336688e-06, + "loss": 0.5809, + "step": 5831 + }, + { + "epoch": 2.2763466042154565, + "grad_norm": 0.5020869314503034, + "learning_rate": 1.6703540128604417e-06, + "loss": 0.5802, + "step": 5832 + }, + { + "epoch": 2.276736924277908, + "grad_norm": 0.4474562274270164, + "learning_rate": 1.6686602150377607e-06, + "loss": 0.5546, + "step": 5833 + }, + { + "epoch": 2.277127244340359, + "grad_norm": 0.4778820889365382, + "learning_rate": 1.6669671044150264e-06, + "loss": 0.5705, + "step": 5834 + }, + { + "epoch": 2.2775175644028103, + "grad_norm": 0.6621915442328896, + "learning_rate": 1.6652746813415022e-06, + "loss": 0.5572, + "step": 5835 + }, + { + "epoch": 2.2779078844652614, + "grad_norm": 0.4839592461773058, + "learning_rate": 1.6635829461663033e-06, + "loss": 0.5672, + "step": 5836 + }, + { + "epoch": 2.2782982045277125, + "grad_norm": 0.4531307101904444, + "learning_rate": 1.6618918992384108e-06, + "loss": 0.5585, + "step": 5837 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.44810266771430596, + "learning_rate": 1.6602015409066553e-06, + "loss": 0.5811, + "step": 5838 + }, + { + "epoch": 2.279078844652615, + "grad_norm": 0.5636184684611638, + "learning_rate": 1.6585118715197335e-06, + "loss": 0.5624, + "step": 5839 + }, + { + "epoch": 2.279469164715066, + "grad_norm": 0.4730289798845888, + "learning_rate": 1.656822891426194e-06, + "loss": 0.5663, + "step": 5840 + }, + { + "epoch": 2.2798594847775178, + "grad_norm": 0.5761122290393199, + "learning_rate": 1.6551346009744451e-06, + "loss": 0.5713, + "step": 5841 + }, + { + "epoch": 2.280249804839969, + "grad_norm": 0.49034746597724105, + "learning_rate": 1.6534470005127524e-06, + "loss": 0.5179, + "step": 5842 + }, + { + "epoch": 2.28064012490242, + "grad_norm": 0.4919902960007687, + "learning_rate": 1.6517600903892416e-06, + "loss": 0.5286, + "step": 5843 + }, + { + "epoch": 2.281030444964871, + "grad_norm": 0.44810883239128096, + "learning_rate": 1.6500738709518937e-06, + "loss": 0.5388, + "step": 5844 + }, + { + "epoch": 2.281420765027322, + "grad_norm": 0.49495545712237554, + "learning_rate": 1.6483883425485458e-06, + "loss": 0.5901, + "step": 5845 + }, + { + "epoch": 2.2818110850897737, + "grad_norm": 0.4908982802793948, + "learning_rate": 1.6467035055268976e-06, + "loss": 0.5415, + "step": 5846 + }, + { + "epoch": 2.282201405152225, + "grad_norm": 0.4645327857965251, + "learning_rate": 1.6450193602344994e-06, + "loss": 0.5302, + "step": 5847 + }, + { + "epoch": 2.282591725214676, + "grad_norm": 0.48848843873309966, + "learning_rate": 1.643335907018766e-06, + "loss": 0.5635, + "step": 5848 + }, + { + "epoch": 2.2829820452771274, + "grad_norm": 0.45290868353694846, + "learning_rate": 1.6416531462269635e-06, + "loss": 0.5679, + "step": 5849 + }, + { + "epoch": 2.2833723653395785, + "grad_norm": 0.4756700051366912, + "learning_rate": 1.6399710782062183e-06, + "loss": 0.5602, + "step": 5850 + }, + { + "epoch": 2.2837626854020296, + "grad_norm": 0.5349308162829153, + "learning_rate": 1.6382897033035106e-06, + "loss": 0.5757, + "step": 5851 + }, + { + "epoch": 2.2841530054644807, + "grad_norm": 0.48512172600012005, + "learning_rate": 1.636609021865684e-06, + "loss": 0.5203, + "step": 5852 + }, + { + "epoch": 2.284543325526932, + "grad_norm": 0.4433165240906844, + "learning_rate": 1.6349290342394313e-06, + "loss": 0.5113, + "step": 5853 + }, + { + "epoch": 2.2849336455893834, + "grad_norm": 0.5280745705967272, + "learning_rate": 1.63324974077131e-06, + "loss": 0.5465, + "step": 5854 + }, + { + "epoch": 2.2853239656518345, + "grad_norm": 0.44633473453895867, + "learning_rate": 1.6315711418077262e-06, + "loss": 0.528, + "step": 5855 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.4557662716888802, + "learning_rate": 1.6298932376949505e-06, + "loss": 0.5844, + "step": 5856 + }, + { + "epoch": 2.286104605776737, + "grad_norm": 0.4858031538328135, + "learning_rate": 1.6282160287791054e-06, + "loss": 0.5891, + "step": 5857 + }, + { + "epoch": 2.2864949258391882, + "grad_norm": 0.4813909481252333, + "learning_rate": 1.6265395154061702e-06, + "loss": 0.6025, + "step": 5858 + }, + { + "epoch": 2.2868852459016393, + "grad_norm": 0.5266908599466043, + "learning_rate": 1.6248636979219833e-06, + "loss": 0.5615, + "step": 5859 + }, + { + "epoch": 2.2872755659640904, + "grad_norm": 0.5019051105759545, + "learning_rate": 1.623188576672235e-06, + "loss": 0.5784, + "step": 5860 + }, + { + "epoch": 2.2876658860265415, + "grad_norm": 0.5341537759819535, + "learning_rate": 1.621514152002478e-06, + "loss": 0.563, + "step": 5861 + }, + { + "epoch": 2.288056206088993, + "grad_norm": 0.5823658477567092, + "learning_rate": 1.6198404242581162e-06, + "loss": 0.5729, + "step": 5862 + }, + { + "epoch": 2.288446526151444, + "grad_norm": 0.6307947460782896, + "learning_rate": 1.618167393784414e-06, + "loss": 0.5961, + "step": 5863 + }, + { + "epoch": 2.2888368462138953, + "grad_norm": 0.6223432685048418, + "learning_rate": 1.6164950609264874e-06, + "loss": 0.5578, + "step": 5864 + }, + { + "epoch": 2.289227166276347, + "grad_norm": 0.5165699352864462, + "learning_rate": 1.6148234260293127e-06, + "loss": 0.575, + "step": 5865 + }, + { + "epoch": 2.289617486338798, + "grad_norm": 0.5661667508971827, + "learning_rate": 1.6131524894377187e-06, + "loss": 0.5817, + "step": 5866 + }, + { + "epoch": 2.290007806401249, + "grad_norm": 0.6210549531521649, + "learning_rate": 1.6114822514963957e-06, + "loss": 0.5647, + "step": 5867 + }, + { + "epoch": 2.2903981264637, + "grad_norm": 0.6502044042829275, + "learning_rate": 1.6098127125498798e-06, + "loss": 0.5888, + "step": 5868 + }, + { + "epoch": 2.290788446526151, + "grad_norm": 0.498919065745118, + "learning_rate": 1.6081438729425736e-06, + "loss": 0.5562, + "step": 5869 + }, + { + "epoch": 2.2911787665886028, + "grad_norm": 0.5484650976230471, + "learning_rate": 1.606475733018728e-06, + "loss": 0.5837, + "step": 5870 + }, + { + "epoch": 2.291569086651054, + "grad_norm": 0.5546531237036475, + "learning_rate": 1.6048082931224562e-06, + "loss": 0.5623, + "step": 5871 + }, + { + "epoch": 2.291959406713505, + "grad_norm": 0.5662753180522038, + "learning_rate": 1.6031415535977207e-06, + "loss": 0.5242, + "step": 5872 + }, + { + "epoch": 2.2923497267759565, + "grad_norm": 0.5174167504371999, + "learning_rate": 1.601475514788341e-06, + "loss": 0.5667, + "step": 5873 + }, + { + "epoch": 2.2927400468384076, + "grad_norm": 0.5244174385327544, + "learning_rate": 1.5998101770379965e-06, + "loss": 0.553, + "step": 5874 + }, + { + "epoch": 2.2931303669008587, + "grad_norm": 0.5078164190082612, + "learning_rate": 1.5981455406902158e-06, + "loss": 0.558, + "step": 5875 + }, + { + "epoch": 2.29352068696331, + "grad_norm": 0.6015401148140646, + "learning_rate": 1.5964816060883892e-06, + "loss": 0.5415, + "step": 5876 + }, + { + "epoch": 2.293911007025761, + "grad_norm": 0.49581614828836174, + "learning_rate": 1.594818373575754e-06, + "loss": 0.5617, + "step": 5877 + }, + { + "epoch": 2.2943013270882124, + "grad_norm": 0.43567892939079067, + "learning_rate": 1.5931558434954108e-06, + "loss": 0.5995, + "step": 5878 + }, + { + "epoch": 2.2946916471506635, + "grad_norm": 0.4907862872432407, + "learning_rate": 1.5914940161903093e-06, + "loss": 0.5485, + "step": 5879 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.5244759439965576, + "learning_rate": 1.5898328920032597e-06, + "loss": 0.5425, + "step": 5880 + }, + { + "epoch": 2.295472287275566, + "grad_norm": 0.5122709604639996, + "learning_rate": 1.5881724712769214e-06, + "loss": 0.5504, + "step": 5881 + }, + { + "epoch": 2.2958626073380173, + "grad_norm": 0.45852429125142663, + "learning_rate": 1.586512754353815e-06, + "loss": 0.5646, + "step": 5882 + }, + { + "epoch": 2.2962529274004684, + "grad_norm": 0.44322276801540617, + "learning_rate": 1.5848537415763082e-06, + "loss": 0.5969, + "step": 5883 + }, + { + "epoch": 2.2966432474629195, + "grad_norm": 0.49017412044582614, + "learning_rate": 1.5831954332866323e-06, + "loss": 0.5487, + "step": 5884 + }, + { + "epoch": 2.2970335675253706, + "grad_norm": 0.5373254053907687, + "learning_rate": 1.5815378298268663e-06, + "loss": 0.5818, + "step": 5885 + }, + { + "epoch": 2.297423887587822, + "grad_norm": 0.49702800167842487, + "learning_rate": 1.5798809315389462e-06, + "loss": 0.5559, + "step": 5886 + }, + { + "epoch": 2.297814207650273, + "grad_norm": 0.5054093958428524, + "learning_rate": 1.578224738764661e-06, + "loss": 0.5898, + "step": 5887 + }, + { + "epoch": 2.2982045277127243, + "grad_norm": 0.5280632984221424, + "learning_rate": 1.5765692518456593e-06, + "loss": 0.5311, + "step": 5888 + }, + { + "epoch": 2.298594847775176, + "grad_norm": 0.5094188250360576, + "learning_rate": 1.5749144711234381e-06, + "loss": 0.5941, + "step": 5889 + }, + { + "epoch": 2.298985167837627, + "grad_norm": 0.43930784376729787, + "learning_rate": 1.5732603969393506e-06, + "loss": 0.5819, + "step": 5890 + }, + { + "epoch": 2.299375487900078, + "grad_norm": 0.45139490087430273, + "learning_rate": 1.571607029634607e-06, + "loss": 0.59, + "step": 5891 + }, + { + "epoch": 2.299765807962529, + "grad_norm": 0.5233204936058902, + "learning_rate": 1.5699543695502667e-06, + "loss": 0.5957, + "step": 5892 + }, + { + "epoch": 2.3001561280249803, + "grad_norm": 0.5208594235532789, + "learning_rate": 1.568302417027251e-06, + "loss": 0.5299, + "step": 5893 + }, + { + "epoch": 2.300546448087432, + "grad_norm": 0.5354022226849251, + "learning_rate": 1.5666511724063226e-06, + "loss": 0.5573, + "step": 5894 + }, + { + "epoch": 2.300936768149883, + "grad_norm": 0.4481483719551311, + "learning_rate": 1.5650006360281122e-06, + "loss": 0.5531, + "step": 5895 + }, + { + "epoch": 2.301327088212334, + "grad_norm": 0.49522651221379704, + "learning_rate": 1.563350808233094e-06, + "loss": 0.5677, + "step": 5896 + }, + { + "epoch": 2.3017174082747855, + "grad_norm": 0.4854599411460824, + "learning_rate": 1.5617016893616033e-06, + "loss": 0.5495, + "step": 5897 + }, + { + "epoch": 2.3021077283372366, + "grad_norm": 0.49556229438545774, + "learning_rate": 1.5600532797538226e-06, + "loss": 0.5383, + "step": 5898 + }, + { + "epoch": 2.3024980483996877, + "grad_norm": 0.5310588813622489, + "learning_rate": 1.5584055797497944e-06, + "loss": 0.6085, + "step": 5899 + }, + { + "epoch": 2.302888368462139, + "grad_norm": 0.5575114043267462, + "learning_rate": 1.5567585896894089e-06, + "loss": 0.5236, + "step": 5900 + }, + { + "epoch": 2.30327868852459, + "grad_norm": 0.5144637313745093, + "learning_rate": 1.555112309912416e-06, + "loss": 0.549, + "step": 5901 + }, + { + "epoch": 2.3036690085870415, + "grad_norm": 0.5370871820596382, + "learning_rate": 1.5534667407584137e-06, + "loss": 0.5787, + "step": 5902 + }, + { + "epoch": 2.3040593286494926, + "grad_norm": 0.5057160501555219, + "learning_rate": 1.5518218825668563e-06, + "loss": 0.5623, + "step": 5903 + }, + { + "epoch": 2.3044496487119437, + "grad_norm": 0.49267969940142853, + "learning_rate": 1.550177735677048e-06, + "loss": 0.5502, + "step": 5904 + }, + { + "epoch": 2.3048399687743952, + "grad_norm": 0.4889744401453936, + "learning_rate": 1.548534300428154e-06, + "loss": 0.5801, + "step": 5905 + }, + { + "epoch": 2.3052302888368463, + "grad_norm": 0.46344761418133745, + "learning_rate": 1.5468915771591835e-06, + "loss": 0.6392, + "step": 5906 + }, + { + "epoch": 2.3056206088992974, + "grad_norm": 0.49408014588543264, + "learning_rate": 1.5452495662090033e-06, + "loss": 0.5666, + "step": 5907 + }, + { + "epoch": 2.3060109289617485, + "grad_norm": 0.49161491354362385, + "learning_rate": 1.5436082679163357e-06, + "loss": 0.5734, + "step": 5908 + }, + { + "epoch": 2.3064012490241996, + "grad_norm": 0.49963278269286654, + "learning_rate": 1.5419676826197493e-06, + "loss": 0.5473, + "step": 5909 + }, + { + "epoch": 2.306791569086651, + "grad_norm": 0.4866051428047873, + "learning_rate": 1.5403278106576736e-06, + "loss": 0.571, + "step": 5910 + }, + { + "epoch": 2.3071818891491023, + "grad_norm": 0.509272567866743, + "learning_rate": 1.5386886523683853e-06, + "loss": 0.5731, + "step": 5911 + }, + { + "epoch": 2.3075722092115534, + "grad_norm": 0.5296027289950506, + "learning_rate": 1.5370502080900147e-06, + "loss": 0.5557, + "step": 5912 + }, + { + "epoch": 2.307962529274005, + "grad_norm": 0.5385991404195577, + "learning_rate": 1.535412478160544e-06, + "loss": 0.5394, + "step": 5913 + }, + { + "epoch": 2.308352849336456, + "grad_norm": 0.5119817276802606, + "learning_rate": 1.5337754629178137e-06, + "loss": 0.5844, + "step": 5914 + }, + { + "epoch": 2.308743169398907, + "grad_norm": 0.4844238506898867, + "learning_rate": 1.5321391626995091e-06, + "loss": 0.5763, + "step": 5915 + }, + { + "epoch": 2.309133489461358, + "grad_norm": 0.474940476762392, + "learning_rate": 1.5305035778431753e-06, + "loss": 0.5765, + "step": 5916 + }, + { + "epoch": 2.3095238095238093, + "grad_norm": 0.4999675104588522, + "learning_rate": 1.5288687086862024e-06, + "loss": 0.5952, + "step": 5917 + }, + { + "epoch": 2.309914129586261, + "grad_norm": 0.44432421125359706, + "learning_rate": 1.5272345555658403e-06, + "loss": 0.5634, + "step": 5918 + }, + { + "epoch": 2.310304449648712, + "grad_norm": 0.5751805501489624, + "learning_rate": 1.5256011188191856e-06, + "loss": 0.6092, + "step": 5919 + }, + { + "epoch": 2.310694769711163, + "grad_norm": 0.5582401411872336, + "learning_rate": 1.5239683987831894e-06, + "loss": 0.557, + "step": 5920 + }, + { + "epoch": 2.3110850897736146, + "grad_norm": 0.49362269345687876, + "learning_rate": 1.522336395794653e-06, + "loss": 0.594, + "step": 5921 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.5172154338265695, + "learning_rate": 1.5207051101902347e-06, + "loss": 0.5695, + "step": 5922 + }, + { + "epoch": 2.311865729898517, + "grad_norm": 0.49976599756492596, + "learning_rate": 1.5190745423064402e-06, + "loss": 0.5597, + "step": 5923 + }, + { + "epoch": 2.312256049960968, + "grad_norm": 0.5250476410555094, + "learning_rate": 1.517444692479626e-06, + "loss": 0.5808, + "step": 5924 + }, + { + "epoch": 2.312646370023419, + "grad_norm": 0.4843432640802855, + "learning_rate": 1.5158155610460074e-06, + "loss": 0.5742, + "step": 5925 + }, + { + "epoch": 2.3130366900858705, + "grad_norm": 0.47652387173279237, + "learning_rate": 1.5141871483416431e-06, + "loss": 0.5894, + "step": 5926 + }, + { + "epoch": 2.3134270101483216, + "grad_norm": 0.5151130538704204, + "learning_rate": 1.5125594547024514e-06, + "loss": 0.5534, + "step": 5927 + }, + { + "epoch": 2.3138173302107727, + "grad_norm": 0.49325923401511684, + "learning_rate": 1.5109324804641945e-06, + "loss": 0.5529, + "step": 5928 + }, + { + "epoch": 2.3142076502732243, + "grad_norm": 0.5010856044002678, + "learning_rate": 1.509306225962495e-06, + "loss": 0.569, + "step": 5929 + }, + { + "epoch": 2.3145979703356754, + "grad_norm": 0.5327115454178155, + "learning_rate": 1.5076806915328169e-06, + "loss": 0.5559, + "step": 5930 + }, + { + "epoch": 2.3149882903981265, + "grad_norm": 0.529845772501604, + "learning_rate": 1.5060558775104839e-06, + "loss": 0.5603, + "step": 5931 + }, + { + "epoch": 2.3153786104605776, + "grad_norm": 0.5401804189190244, + "learning_rate": 1.5044317842306655e-06, + "loss": 0.5651, + "step": 5932 + }, + { + "epoch": 2.3157689305230287, + "grad_norm": 0.5208688000661635, + "learning_rate": 1.5028084120283892e-06, + "loss": 0.5831, + "step": 5933 + }, + { + "epoch": 2.3161592505854802, + "grad_norm": 0.5038805891896387, + "learning_rate": 1.5011857612385256e-06, + "loss": 0.5742, + "step": 5934 + }, + { + "epoch": 2.3165495706479313, + "grad_norm": 0.48729307495043767, + "learning_rate": 1.4995638321958034e-06, + "loss": 0.5651, + "step": 5935 + }, + { + "epoch": 2.3169398907103824, + "grad_norm": 0.5084724540334962, + "learning_rate": 1.497942625234799e-06, + "loss": 0.5835, + "step": 5936 + }, + { + "epoch": 2.317330210772834, + "grad_norm": 0.5056247488332906, + "learning_rate": 1.4963221406899397e-06, + "loss": 0.5751, + "step": 5937 + }, + { + "epoch": 2.317720530835285, + "grad_norm": 0.5239974714787239, + "learning_rate": 1.4947023788955034e-06, + "loss": 0.5887, + "step": 5938 + }, + { + "epoch": 2.318110850897736, + "grad_norm": 0.5860613214527818, + "learning_rate": 1.4930833401856227e-06, + "loss": 0.5388, + "step": 5939 + }, + { + "epoch": 2.3185011709601873, + "grad_norm": 0.5287966303809436, + "learning_rate": 1.4914650248942768e-06, + "loss": 0.5887, + "step": 5940 + }, + { + "epoch": 2.3188914910226384, + "grad_norm": 0.4604463610392349, + "learning_rate": 1.4898474333552966e-06, + "loss": 0.5922, + "step": 5941 + }, + { + "epoch": 2.31928181108509, + "grad_norm": 0.5893609240076134, + "learning_rate": 1.4882305659023654e-06, + "loss": 0.5849, + "step": 5942 + }, + { + "epoch": 2.319672131147541, + "grad_norm": 0.5031928053076274, + "learning_rate": 1.4866144228690148e-06, + "loss": 0.5453, + "step": 5943 + }, + { + "epoch": 2.320062451209992, + "grad_norm": 0.5203312428410528, + "learning_rate": 1.4849990045886309e-06, + "loss": 0.5444, + "step": 5944 + }, + { + "epoch": 2.3204527712724436, + "grad_norm": 0.4543570017339714, + "learning_rate": 1.483384311394444e-06, + "loss": 0.5871, + "step": 5945 + }, + { + "epoch": 2.3208430913348947, + "grad_norm": 0.46174325187247034, + "learning_rate": 1.4817703436195434e-06, + "loss": 0.5599, + "step": 5946 + }, + { + "epoch": 2.321233411397346, + "grad_norm": 0.43556160367990665, + "learning_rate": 1.4801571015968575e-06, + "loss": 0.5653, + "step": 5947 + }, + { + "epoch": 2.321623731459797, + "grad_norm": 0.4643365610623425, + "learning_rate": 1.4785445856591756e-06, + "loss": 0.589, + "step": 5948 + }, + { + "epoch": 2.322014051522248, + "grad_norm": 0.4803513649905727, + "learning_rate": 1.4769327961391306e-06, + "loss": 0.5783, + "step": 5949 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.5109392890138058, + "learning_rate": 1.4753217333692105e-06, + "loss": 0.5464, + "step": 5950 + }, + { + "epoch": 2.3227946916471507, + "grad_norm": 0.4789161418324131, + "learning_rate": 1.4737113976817475e-06, + "loss": 0.5857, + "step": 5951 + }, + { + "epoch": 2.323185011709602, + "grad_norm": 0.5827917095051919, + "learning_rate": 1.4721017894089307e-06, + "loss": 0.565, + "step": 5952 + }, + { + "epoch": 2.3235753317720533, + "grad_norm": 0.6048582900882588, + "learning_rate": 1.470492908882793e-06, + "loss": 0.5535, + "step": 5953 + }, + { + "epoch": 2.3239656518345044, + "grad_norm": 0.4513248506223982, + "learning_rate": 1.4688847564352189e-06, + "loss": 0.5333, + "step": 5954 + }, + { + "epoch": 2.3243559718969555, + "grad_norm": 0.49722356972046744, + "learning_rate": 1.467277332397946e-06, + "loss": 0.581, + "step": 5955 + }, + { + "epoch": 2.3247462919594066, + "grad_norm": 0.5304237368416842, + "learning_rate": 1.465670637102558e-06, + "loss": 0.5516, + "step": 5956 + }, + { + "epoch": 2.3251366120218577, + "grad_norm": 0.4977748254062779, + "learning_rate": 1.4640646708804902e-06, + "loss": 0.5807, + "step": 5957 + }, + { + "epoch": 2.3255269320843093, + "grad_norm": 0.4706755678045364, + "learning_rate": 1.4624594340630233e-06, + "loss": 0.5594, + "step": 5958 + }, + { + "epoch": 2.3259172521467604, + "grad_norm": 0.5305651290150463, + "learning_rate": 1.460854926981295e-06, + "loss": 0.5507, + "step": 5959 + }, + { + "epoch": 2.3263075722092115, + "grad_norm": 0.4928342220057301, + "learning_rate": 1.459251149966286e-06, + "loss": 0.5825, + "step": 5960 + }, + { + "epoch": 2.3266978922716626, + "grad_norm": 0.43655493833170694, + "learning_rate": 1.4576481033488305e-06, + "loss": 0.5591, + "step": 5961 + }, + { + "epoch": 2.327088212334114, + "grad_norm": 0.4851345363729905, + "learning_rate": 1.4560457874596084e-06, + "loss": 0.5718, + "step": 5962 + }, + { + "epoch": 2.327478532396565, + "grad_norm": 0.6211841056902079, + "learning_rate": 1.4544442026291532e-06, + "loss": 0.5964, + "step": 5963 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 0.48592600152950244, + "learning_rate": 1.4528433491878434e-06, + "loss": 0.5668, + "step": 5964 + }, + { + "epoch": 2.3282591725214674, + "grad_norm": 0.4920628767460698, + "learning_rate": 1.45124322746591e-06, + "loss": 0.5722, + "step": 5965 + }, + { + "epoch": 2.328649492583919, + "grad_norm": 0.47012123580417176, + "learning_rate": 1.449643837793428e-06, + "loss": 0.5617, + "step": 5966 + }, + { + "epoch": 2.32903981264637, + "grad_norm": 0.5296069165113751, + "learning_rate": 1.4480451805003294e-06, + "loss": 0.5577, + "step": 5967 + }, + { + "epoch": 2.329430132708821, + "grad_norm": 0.5031753968968249, + "learning_rate": 1.4464472559163861e-06, + "loss": 0.5678, + "step": 5968 + }, + { + "epoch": 2.3298204527712723, + "grad_norm": 0.45097782165866207, + "learning_rate": 1.4448500643712277e-06, + "loss": 0.5339, + "step": 5969 + }, + { + "epoch": 2.330210772833724, + "grad_norm": 0.45375321156651904, + "learning_rate": 1.4432536061943264e-06, + "loss": 0.5729, + "step": 5970 + }, + { + "epoch": 2.330601092896175, + "grad_norm": 0.4392547203718948, + "learning_rate": 1.4416578817150028e-06, + "loss": 0.5654, + "step": 5971 + }, + { + "epoch": 2.330991412958626, + "grad_norm": 0.4449755696061769, + "learning_rate": 1.4400628912624321e-06, + "loss": 0.5636, + "step": 5972 + }, + { + "epoch": 2.331381733021077, + "grad_norm": 0.4221517254012169, + "learning_rate": 1.4384686351656323e-06, + "loss": 0.5952, + "step": 5973 + }, + { + "epoch": 2.3317720530835286, + "grad_norm": 0.616946508505415, + "learning_rate": 1.436875113753472e-06, + "loss": 0.5657, + "step": 5974 + }, + { + "epoch": 2.3321623731459797, + "grad_norm": 0.4570414387941985, + "learning_rate": 1.4352823273546667e-06, + "loss": 0.6112, + "step": 5975 + }, + { + "epoch": 2.332552693208431, + "grad_norm": 0.4401474670664395, + "learning_rate": 1.433690276297785e-06, + "loss": 0.5737, + "step": 5976 + }, + { + "epoch": 2.332943013270882, + "grad_norm": 0.451550662361036, + "learning_rate": 1.4320989609112368e-06, + "loss": 0.5628, + "step": 5977 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.48196331158484035, + "learning_rate": 1.4305083815232878e-06, + "loss": 0.5779, + "step": 5978 + }, + { + "epoch": 2.3337236533957846, + "grad_norm": 0.4571132111973396, + "learning_rate": 1.428918538462044e-06, + "loss": 0.566, + "step": 5979 + }, + { + "epoch": 2.3341139734582357, + "grad_norm": 0.41703340279456486, + "learning_rate": 1.427329432055468e-06, + "loss": 0.578, + "step": 5980 + }, + { + "epoch": 2.334504293520687, + "grad_norm": 0.5042178106971998, + "learning_rate": 1.4257410626313634e-06, + "loss": 0.5832, + "step": 5981 + }, + { + "epoch": 2.3348946135831383, + "grad_norm": 0.4274287206944984, + "learning_rate": 1.4241534305173848e-06, + "loss": 0.5223, + "step": 5982 + }, + { + "epoch": 2.3352849336455894, + "grad_norm": 0.4619813220042311, + "learning_rate": 1.422566536041033e-06, + "loss": 0.5706, + "step": 5983 + }, + { + "epoch": 2.3356752537080405, + "grad_norm": 0.47919353458048486, + "learning_rate": 1.42098037952966e-06, + "loss": 0.5857, + "step": 5984 + }, + { + "epoch": 2.3360655737704916, + "grad_norm": 0.47039450320179615, + "learning_rate": 1.4193949613104612e-06, + "loss": 0.5868, + "step": 5985 + }, + { + "epoch": 2.336455893832943, + "grad_norm": 0.47938256928492423, + "learning_rate": 1.417810281710485e-06, + "loss": 0.5786, + "step": 5986 + }, + { + "epoch": 2.3368462138953943, + "grad_norm": 0.5259805483369058, + "learning_rate": 1.416226341056623e-06, + "loss": 0.5351, + "step": 5987 + }, + { + "epoch": 2.3372365339578454, + "grad_norm": 0.4518732129684163, + "learning_rate": 1.4146431396756139e-06, + "loss": 0.5808, + "step": 5988 + }, + { + "epoch": 2.3376268540202965, + "grad_norm": 0.45034760206893365, + "learning_rate": 1.4130606778940486e-06, + "loss": 0.5611, + "step": 5989 + }, + { + "epoch": 2.338017174082748, + "grad_norm": 0.5033620636480309, + "learning_rate": 1.4114789560383618e-06, + "loss": 0.5575, + "step": 5990 + }, + { + "epoch": 2.338407494145199, + "grad_norm": 0.5582606597411862, + "learning_rate": 1.4098979744348363e-06, + "loss": 0.5875, + "step": 5991 + }, + { + "epoch": 2.33879781420765, + "grad_norm": 0.45775810865025846, + "learning_rate": 1.4083177334096003e-06, + "loss": 0.5402, + "step": 5992 + }, + { + "epoch": 2.3391881342701013, + "grad_norm": 0.4442919882910085, + "learning_rate": 1.4067382332886348e-06, + "loss": 0.5632, + "step": 5993 + }, + { + "epoch": 2.339578454332553, + "grad_norm": 0.4856742779585048, + "learning_rate": 1.4051594743977608e-06, + "loss": 0.5558, + "step": 5994 + }, + { + "epoch": 2.339968774395004, + "grad_norm": 0.4921644901955821, + "learning_rate": 1.4035814570626537e-06, + "loss": 0.5885, + "step": 5995 + }, + { + "epoch": 2.340359094457455, + "grad_norm": 0.44964999183441295, + "learning_rate": 1.4020041816088288e-06, + "loss": 0.5822, + "step": 5996 + }, + { + "epoch": 2.340749414519906, + "grad_norm": 0.4757703013494539, + "learning_rate": 1.4004276483616546e-06, + "loss": 0.5575, + "step": 5997 + }, + { + "epoch": 2.3411397345823577, + "grad_norm": 0.5098308400237315, + "learning_rate": 1.3988518576463423e-06, + "loss": 0.5564, + "step": 5998 + }, + { + "epoch": 2.341530054644809, + "grad_norm": 0.40117154664578086, + "learning_rate": 1.3972768097879513e-06, + "loss": 0.5618, + "step": 5999 + }, + { + "epoch": 2.34192037470726, + "grad_norm": 0.4504539050554778, + "learning_rate": 1.395702505111386e-06, + "loss": 0.5258, + "step": 6000 + }, + { + "epoch": 2.342310694769711, + "grad_norm": 0.493554158324879, + "learning_rate": 1.394128943941403e-06, + "loss": 0.5531, + "step": 6001 + }, + { + "epoch": 2.3427010148321625, + "grad_norm": 0.5090169912727136, + "learning_rate": 1.3925561266025995e-06, + "loss": 0.5416, + "step": 6002 + }, + { + "epoch": 2.3430913348946136, + "grad_norm": 0.4850125088966389, + "learning_rate": 1.3909840534194202e-06, + "loss": 0.5968, + "step": 6003 + }, + { + "epoch": 2.3434816549570647, + "grad_norm": 0.4702197264586837, + "learning_rate": 1.3894127247161615e-06, + "loss": 0.5678, + "step": 6004 + }, + { + "epoch": 2.343871975019516, + "grad_norm": 0.49061263414143574, + "learning_rate": 1.3878421408169578e-06, + "loss": 0.5669, + "step": 6005 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.44226010706928054, + "learning_rate": 1.3862723020457991e-06, + "loss": 0.5303, + "step": 6006 + }, + { + "epoch": 2.3446526151444185, + "grad_norm": 0.5046997234178235, + "learning_rate": 1.3847032087265123e-06, + "loss": 0.596, + "step": 6007 + }, + { + "epoch": 2.3450429352068696, + "grad_norm": 0.5681322493975696, + "learning_rate": 1.3831348611827811e-06, + "loss": 0.5793, + "step": 6008 + }, + { + "epoch": 2.3454332552693207, + "grad_norm": 0.427080197119233, + "learning_rate": 1.381567259738123e-06, + "loss": 0.5599, + "step": 6009 + }, + { + "epoch": 2.345823575331772, + "grad_norm": 0.617686570281674, + "learning_rate": 1.3800004047159128e-06, + "loss": 0.6463, + "step": 6010 + }, + { + "epoch": 2.3462138953942233, + "grad_norm": 0.44581490532747675, + "learning_rate": 1.3784342964393627e-06, + "loss": 0.5382, + "step": 6011 + }, + { + "epoch": 2.3466042154566744, + "grad_norm": 0.45042204933013075, + "learning_rate": 1.376868935231539e-06, + "loss": 0.5384, + "step": 6012 + }, + { + "epoch": 2.3469945355191255, + "grad_norm": 0.49809496264356595, + "learning_rate": 1.3753043214153455e-06, + "loss": 0.5846, + "step": 6013 + }, + { + "epoch": 2.347384855581577, + "grad_norm": 0.48062669872068364, + "learning_rate": 1.3737404553135402e-06, + "loss": 0.5794, + "step": 6014 + }, + { + "epoch": 2.347775175644028, + "grad_norm": 0.4375670254782194, + "learning_rate": 1.3721773372487207e-06, + "loss": 0.5465, + "step": 6015 + }, + { + "epoch": 2.3481654957064793, + "grad_norm": 0.44850759239372834, + "learning_rate": 1.37061496754333e-06, + "loss": 0.5743, + "step": 6016 + }, + { + "epoch": 2.3485558157689304, + "grad_norm": 0.49993054999089415, + "learning_rate": 1.369053346519663e-06, + "loss": 0.5803, + "step": 6017 + }, + { + "epoch": 2.348946135831382, + "grad_norm": 0.47564757827837284, + "learning_rate": 1.367492474499854e-06, + "loss": 0.5543, + "step": 6018 + }, + { + "epoch": 2.349336455893833, + "grad_norm": 0.48433812500562734, + "learning_rate": 1.3659323518058854e-06, + "loss": 0.593, + "step": 6019 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 0.46429864558424905, + "learning_rate": 1.3643729787595821e-06, + "loss": 0.5777, + "step": 6020 + }, + { + "epoch": 2.350117096018735, + "grad_norm": 0.5057039856853726, + "learning_rate": 1.3628143556826207e-06, + "loss": 0.5699, + "step": 6021 + }, + { + "epoch": 2.3505074160811867, + "grad_norm": 0.5518721124692717, + "learning_rate": 1.3612564828965163e-06, + "loss": 0.5889, + "step": 6022 + }, + { + "epoch": 2.350897736143638, + "grad_norm": 0.5058805567978824, + "learning_rate": 1.3596993607226344e-06, + "loss": 0.5719, + "step": 6023 + }, + { + "epoch": 2.351288056206089, + "grad_norm": 0.4459769958696561, + "learning_rate": 1.3581429894821807e-06, + "loss": 0.5801, + "step": 6024 + }, + { + "epoch": 2.35167837626854, + "grad_norm": 0.4706524867560989, + "learning_rate": 1.3565873694962123e-06, + "loss": 0.5325, + "step": 6025 + }, + { + "epoch": 2.3520686963309916, + "grad_norm": 0.5399903318661486, + "learning_rate": 1.3550325010856253e-06, + "loss": 0.6185, + "step": 6026 + }, + { + "epoch": 2.3524590163934427, + "grad_norm": 0.48793583044274447, + "learning_rate": 1.3534783845711636e-06, + "loss": 0.5834, + "step": 6027 + }, + { + "epoch": 2.352849336455894, + "grad_norm": 0.6150751306473116, + "learning_rate": 1.3519250202734136e-06, + "loss": 0.5865, + "step": 6028 + }, + { + "epoch": 2.353239656518345, + "grad_norm": 0.4922762197681992, + "learning_rate": 1.3503724085128123e-06, + "loss": 0.5365, + "step": 6029 + }, + { + "epoch": 2.3536299765807964, + "grad_norm": 0.5446323489527771, + "learning_rate": 1.348820549609634e-06, + "loss": 0.5513, + "step": 6030 + }, + { + "epoch": 2.3540202966432475, + "grad_norm": 0.465468560535383, + "learning_rate": 1.3472694438840045e-06, + "loss": 0.584, + "step": 6031 + }, + { + "epoch": 2.3544106167056986, + "grad_norm": 0.4577595177707856, + "learning_rate": 1.3457190916558904e-06, + "loss": 0.5833, + "step": 6032 + }, + { + "epoch": 2.3548009367681497, + "grad_norm": 0.4907618548791279, + "learning_rate": 1.3441694932451006e-06, + "loss": 0.6169, + "step": 6033 + }, + { + "epoch": 2.3551912568306013, + "grad_norm": 0.4520602190814143, + "learning_rate": 1.3426206489712957e-06, + "loss": 0.5729, + "step": 6034 + }, + { + "epoch": 2.3555815768930524, + "grad_norm": 0.45678369177604794, + "learning_rate": 1.3410725591539741e-06, + "loss": 0.57, + "step": 6035 + }, + { + "epoch": 2.3559718969555035, + "grad_norm": 0.4749263108534114, + "learning_rate": 1.3395252241124818e-06, + "loss": 0.5688, + "step": 6036 + }, + { + "epoch": 2.3563622170179546, + "grad_norm": 0.4435976296720254, + "learning_rate": 1.3379786441660064e-06, + "loss": 0.5297, + "step": 6037 + }, + { + "epoch": 2.356752537080406, + "grad_norm": 0.4798073047959737, + "learning_rate": 1.336432819633584e-06, + "loss": 0.5591, + "step": 6038 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.5962202861230641, + "learning_rate": 1.3348877508340907e-06, + "loss": 0.6121, + "step": 6039 + }, + { + "epoch": 2.3575331772053083, + "grad_norm": 0.4527039676969128, + "learning_rate": 1.3333434380862504e-06, + "loss": 0.5726, + "step": 6040 + }, + { + "epoch": 2.3579234972677594, + "grad_norm": 0.4532917828605115, + "learning_rate": 1.3317998817086265e-06, + "loss": 0.5475, + "step": 6041 + }, + { + "epoch": 2.358313817330211, + "grad_norm": 0.4580777435823649, + "learning_rate": 1.330257082019632e-06, + "loss": 0.53, + "step": 6042 + }, + { + "epoch": 2.358704137392662, + "grad_norm": 0.4362826007707503, + "learning_rate": 1.3287150393375192e-06, + "loss": 0.5593, + "step": 6043 + }, + { + "epoch": 2.359094457455113, + "grad_norm": 0.4839729368162287, + "learning_rate": 1.3271737539803857e-06, + "loss": 0.5566, + "step": 6044 + }, + { + "epoch": 2.3594847775175642, + "grad_norm": 0.4785053921646384, + "learning_rate": 1.3256332262661726e-06, + "loss": 0.5494, + "step": 6045 + }, + { + "epoch": 2.359875097580016, + "grad_norm": 0.47097993564043417, + "learning_rate": 1.3240934565126668e-06, + "loss": 0.5644, + "step": 6046 + }, + { + "epoch": 2.360265417642467, + "grad_norm": 0.43626819836259495, + "learning_rate": 1.3225544450374944e-06, + "loss": 0.5707, + "step": 6047 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 0.45454705542493645, + "learning_rate": 1.3210161921581315e-06, + "loss": 0.5257, + "step": 6048 + }, + { + "epoch": 2.361046057767369, + "grad_norm": 0.4783244250491717, + "learning_rate": 1.3194786981918918e-06, + "loss": 0.5498, + "step": 6049 + }, + { + "epoch": 2.3614363778298206, + "grad_norm": 0.4728235612530795, + "learning_rate": 1.3179419634559337e-06, + "loss": 0.5458, + "step": 6050 + }, + { + "epoch": 2.3618266978922717, + "grad_norm": 0.45610356347942055, + "learning_rate": 1.3164059882672632e-06, + "loss": 0.5475, + "step": 6051 + }, + { + "epoch": 2.362217017954723, + "grad_norm": 0.47067166983697684, + "learning_rate": 1.3148707729427241e-06, + "loss": 0.586, + "step": 6052 + }, + { + "epoch": 2.362607338017174, + "grad_norm": 0.5354702322855927, + "learning_rate": 1.3133363177990065e-06, + "loss": 0.5635, + "step": 6053 + }, + { + "epoch": 2.3629976580796255, + "grad_norm": 0.4179299573824139, + "learning_rate": 1.311802623152641e-06, + "loss": 0.5492, + "step": 6054 + }, + { + "epoch": 2.3633879781420766, + "grad_norm": 0.4456336648189365, + "learning_rate": 1.3102696893200067e-06, + "loss": 0.5599, + "step": 6055 + }, + { + "epoch": 2.3637782982045277, + "grad_norm": 0.4685044866319979, + "learning_rate": 1.3087375166173189e-06, + "loss": 0.5682, + "step": 6056 + }, + { + "epoch": 2.3641686182669788, + "grad_norm": 0.4725534192090465, + "learning_rate": 1.3072061053606427e-06, + "loss": 0.5564, + "step": 6057 + }, + { + "epoch": 2.3645589383294303, + "grad_norm": 0.549269378304914, + "learning_rate": 1.3056754558658797e-06, + "loss": 0.5637, + "step": 6058 + }, + { + "epoch": 2.3649492583918814, + "grad_norm": 0.49577103101391523, + "learning_rate": 1.3041455684487803e-06, + "loss": 0.5065, + "step": 6059 + }, + { + "epoch": 2.3653395784543325, + "grad_norm": 0.4423968724144631, + "learning_rate": 1.3026164434249317e-06, + "loss": 0.5693, + "step": 6060 + }, + { + "epoch": 2.3657298985167836, + "grad_norm": 0.5819486330878487, + "learning_rate": 1.3010880811097714e-06, + "loss": 0.6121, + "step": 6061 + }, + { + "epoch": 2.366120218579235, + "grad_norm": 0.4818338553904948, + "learning_rate": 1.2995604818185698e-06, + "loss": 0.5609, + "step": 6062 + }, + { + "epoch": 2.3665105386416863, + "grad_norm": 0.5314466202982946, + "learning_rate": 1.2980336458664483e-06, + "loss": 0.5679, + "step": 6063 + }, + { + "epoch": 2.3669008587041374, + "grad_norm": 0.47740495081236467, + "learning_rate": 1.2965075735683664e-06, + "loss": 0.602, + "step": 6064 + }, + { + "epoch": 2.3672911787665885, + "grad_norm": 0.5244451848829592, + "learning_rate": 1.294982265239129e-06, + "loss": 0.58, + "step": 6065 + }, + { + "epoch": 2.36768149882904, + "grad_norm": 0.4949285260547684, + "learning_rate": 1.2934577211933819e-06, + "loss": 0.5778, + "step": 6066 + }, + { + "epoch": 2.368071818891491, + "grad_norm": 0.46655476102024135, + "learning_rate": 1.2919339417456101e-06, + "loss": 0.564, + "step": 6067 + }, + { + "epoch": 2.368462138953942, + "grad_norm": 0.5327202379122161, + "learning_rate": 1.2904109272101473e-06, + "loss": 0.5567, + "step": 6068 + }, + { + "epoch": 2.3688524590163933, + "grad_norm": 0.5025277757143956, + "learning_rate": 1.2888886779011634e-06, + "loss": 0.5667, + "step": 6069 + }, + { + "epoch": 2.369242779078845, + "grad_norm": 0.4955932108409256, + "learning_rate": 1.2873671941326777e-06, + "loss": 0.5591, + "step": 6070 + }, + { + "epoch": 2.369633099141296, + "grad_norm": 0.5046550406777693, + "learning_rate": 1.2858464762185407e-06, + "loss": 0.5599, + "step": 6071 + }, + { + "epoch": 2.370023419203747, + "grad_norm": 0.4606338082408184, + "learning_rate": 1.2843265244724561e-06, + "loss": 0.5594, + "step": 6072 + }, + { + "epoch": 2.370413739266198, + "grad_norm": 0.4715643980406402, + "learning_rate": 1.282807339207961e-06, + "loss": 0.5616, + "step": 6073 + }, + { + "epoch": 2.3708040593286497, + "grad_norm": 0.5501710313157938, + "learning_rate": 1.281288920738441e-06, + "loss": 0.5658, + "step": 6074 + }, + { + "epoch": 2.371194379391101, + "grad_norm": 0.4078672408811309, + "learning_rate": 1.2797712693771186e-06, + "loss": 0.5506, + "step": 6075 + }, + { + "epoch": 2.371584699453552, + "grad_norm": 0.44861476530635575, + "learning_rate": 1.2782543854370615e-06, + "loss": 0.5722, + "step": 6076 + }, + { + "epoch": 2.371975019516003, + "grad_norm": 0.5749446050953657, + "learning_rate": 1.2767382692311765e-06, + "loss": 0.6036, + "step": 6077 + }, + { + "epoch": 2.3723653395784545, + "grad_norm": 0.5012621648140505, + "learning_rate": 1.2752229210722156e-06, + "loss": 0.6032, + "step": 6078 + }, + { + "epoch": 2.3727556596409056, + "grad_norm": 0.47248548112471683, + "learning_rate": 1.2737083412727652e-06, + "loss": 0.6051, + "step": 6079 + }, + { + "epoch": 2.3731459797033567, + "grad_norm": 0.4502443226543596, + "learning_rate": 1.272194530145262e-06, + "loss": 0.6058, + "step": 6080 + }, + { + "epoch": 2.373536299765808, + "grad_norm": 0.44229350450207566, + "learning_rate": 1.2706814880019773e-06, + "loss": 0.5312, + "step": 6081 + }, + { + "epoch": 2.3739266198282594, + "grad_norm": 0.5747842803432875, + "learning_rate": 1.269169215155029e-06, + "loss": 0.5339, + "step": 6082 + }, + { + "epoch": 2.3743169398907105, + "grad_norm": 0.4597428516535293, + "learning_rate": 1.267657711916373e-06, + "loss": 0.5786, + "step": 6083 + }, + { + "epoch": 2.3747072599531616, + "grad_norm": 0.45893973340852545, + "learning_rate": 1.2661469785978053e-06, + "loss": 0.5917, + "step": 6084 + }, + { + "epoch": 2.3750975800156127, + "grad_norm": 0.47702900547089916, + "learning_rate": 1.2646370155109678e-06, + "loss": 0.5746, + "step": 6085 + }, + { + "epoch": 2.375487900078064, + "grad_norm": 0.4516847740706726, + "learning_rate": 1.2631278229673383e-06, + "loss": 0.5265, + "step": 6086 + }, + { + "epoch": 2.3758782201405153, + "grad_norm": 0.4471742960239163, + "learning_rate": 1.2616194012782424e-06, + "loss": 0.5504, + "step": 6087 + }, + { + "epoch": 2.3762685402029664, + "grad_norm": 0.49550204669757225, + "learning_rate": 1.2601117507548367e-06, + "loss": 0.5695, + "step": 6088 + }, + { + "epoch": 2.3766588602654175, + "grad_norm": 0.548762256756403, + "learning_rate": 1.258604871708128e-06, + "loss": 0.5401, + "step": 6089 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.51762869226802, + "learning_rate": 1.257098764448958e-06, + "loss": 0.5461, + "step": 6090 + }, + { + "epoch": 2.37743950039032, + "grad_norm": 0.4816169195435717, + "learning_rate": 1.2555934292880146e-06, + "loss": 0.5801, + "step": 6091 + }, + { + "epoch": 2.3778298204527712, + "grad_norm": 0.44619353555900193, + "learning_rate": 1.2540888665358204e-06, + "loss": 0.5738, + "step": 6092 + }, + { + "epoch": 2.3782201405152223, + "grad_norm": 0.4496180024804028, + "learning_rate": 1.2525850765027447e-06, + "loss": 0.5424, + "step": 6093 + }, + { + "epoch": 2.378610460577674, + "grad_norm": 0.4446522764138723, + "learning_rate": 1.251082059498991e-06, + "loss": 0.5606, + "step": 6094 + }, + { + "epoch": 2.379000780640125, + "grad_norm": 0.4795963827867393, + "learning_rate": 1.2495798158346095e-06, + "loss": 0.5544, + "step": 6095 + }, + { + "epoch": 2.379391100702576, + "grad_norm": 0.4570313025065977, + "learning_rate": 1.248078345819488e-06, + "loss": 0.5784, + "step": 6096 + }, + { + "epoch": 2.379781420765027, + "grad_norm": 0.4332261419282525, + "learning_rate": 1.2465776497633524e-06, + "loss": 0.54, + "step": 6097 + }, + { + "epoch": 2.3801717408274783, + "grad_norm": 0.5628161444346407, + "learning_rate": 1.245077727975772e-06, + "loss": 0.5631, + "step": 6098 + }, + { + "epoch": 2.38056206088993, + "grad_norm": 0.5212173669364258, + "learning_rate": 1.2435785807661577e-06, + "loss": 0.5438, + "step": 6099 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.46335780750749794, + "learning_rate": 1.2420802084437573e-06, + "loss": 0.5722, + "step": 6100 + }, + { + "epoch": 2.381342701014832, + "grad_norm": 0.4656613511325347, + "learning_rate": 1.2405826113176583e-06, + "loss": 0.5648, + "step": 6101 + }, + { + "epoch": 2.3817330210772836, + "grad_norm": 0.47986112431941197, + "learning_rate": 1.2390857896967934e-06, + "loss": 0.5879, + "step": 6102 + }, + { + "epoch": 2.3821233411397347, + "grad_norm": 0.431420102366906, + "learning_rate": 1.2375897438899288e-06, + "loss": 0.5816, + "step": 6103 + }, + { + "epoch": 2.3825136612021858, + "grad_norm": 0.5105277140651069, + "learning_rate": 1.236094474205677e-06, + "loss": 0.5713, + "step": 6104 + }, + { + "epoch": 2.382903981264637, + "grad_norm": 0.4606816137045219, + "learning_rate": 1.2345999809524855e-06, + "loss": 0.5658, + "step": 6105 + }, + { + "epoch": 2.383294301327088, + "grad_norm": 0.4475045563049736, + "learning_rate": 1.2331062644386438e-06, + "loss": 0.5773, + "step": 6106 + }, + { + "epoch": 2.3836846213895395, + "grad_norm": 0.45561061602914726, + "learning_rate": 1.2316133249722789e-06, + "loss": 0.5421, + "step": 6107 + }, + { + "epoch": 2.3840749414519906, + "grad_norm": 0.4343909608046524, + "learning_rate": 1.2301211628613624e-06, + "loss": 0.5919, + "step": 6108 + }, + { + "epoch": 2.3844652615144417, + "grad_norm": 0.4204082200084702, + "learning_rate": 1.2286297784136998e-06, + "loss": 0.5553, + "step": 6109 + }, + { + "epoch": 2.3848555815768933, + "grad_norm": 0.5376483731999568, + "learning_rate": 1.2271391719369412e-06, + "loss": 0.5962, + "step": 6110 + }, + { + "epoch": 2.3852459016393444, + "grad_norm": 0.4397660807483906, + "learning_rate": 1.2256493437385714e-06, + "loss": 0.5557, + "step": 6111 + }, + { + "epoch": 2.3856362217017955, + "grad_norm": 0.42182590216645044, + "learning_rate": 1.2241602941259206e-06, + "loss": 0.5313, + "step": 6112 + }, + { + "epoch": 2.3860265417642466, + "grad_norm": 0.4791048676013429, + "learning_rate": 1.2226720234061523e-06, + "loss": 0.602, + "step": 6113 + }, + { + "epoch": 2.3864168618266977, + "grad_norm": 0.43027345871732064, + "learning_rate": 1.221184531886273e-06, + "loss": 0.573, + "step": 6114 + }, + { + "epoch": 2.386807181889149, + "grad_norm": 0.4342377621905379, + "learning_rate": 1.2196978198731258e-06, + "loss": 0.5938, + "step": 6115 + }, + { + "epoch": 2.3871975019516003, + "grad_norm": 0.45607469158551583, + "learning_rate": 1.2182118876733972e-06, + "loss": 0.5681, + "step": 6116 + }, + { + "epoch": 2.3875878220140514, + "grad_norm": 0.4624372220013145, + "learning_rate": 1.2167267355936092e-06, + "loss": 0.5285, + "step": 6117 + }, + { + "epoch": 2.387978142076503, + "grad_norm": 0.4767265429454263, + "learning_rate": 1.2152423639401224e-06, + "loss": 0.5605, + "step": 6118 + }, + { + "epoch": 2.388368462138954, + "grad_norm": 0.4731822340021871, + "learning_rate": 1.2137587730191408e-06, + "loss": 0.5446, + "step": 6119 + }, + { + "epoch": 2.388758782201405, + "grad_norm": 0.4348818889193963, + "learning_rate": 1.212275963136701e-06, + "loss": 0.5321, + "step": 6120 + }, + { + "epoch": 2.3891491022638562, + "grad_norm": 0.5896252546143436, + "learning_rate": 1.2107939345986858e-06, + "loss": 0.6029, + "step": 6121 + }, + { + "epoch": 2.3895394223263073, + "grad_norm": 0.46877778991601954, + "learning_rate": 1.2093126877108114e-06, + "loss": 0.5308, + "step": 6122 + }, + { + "epoch": 2.389929742388759, + "grad_norm": 0.47457914343659857, + "learning_rate": 1.2078322227786343e-06, + "loss": 0.5615, + "step": 6123 + }, + { + "epoch": 2.39032006245121, + "grad_norm": 0.4570749433379305, + "learning_rate": 1.2063525401075482e-06, + "loss": 0.5549, + "step": 6124 + }, + { + "epoch": 2.390710382513661, + "grad_norm": 0.44041720239233656, + "learning_rate": 1.20487364000279e-06, + "loss": 0.5644, + "step": 6125 + }, + { + "epoch": 2.3911007025761126, + "grad_norm": 0.4319373281075183, + "learning_rate": 1.203395522769429e-06, + "loss": 0.5637, + "step": 6126 + }, + { + "epoch": 2.3914910226385637, + "grad_norm": 0.4986525592197612, + "learning_rate": 1.2019181887123793e-06, + "loss": 0.5821, + "step": 6127 + }, + { + "epoch": 2.391881342701015, + "grad_norm": 0.441046459838049, + "learning_rate": 1.2004416381363882e-06, + "loss": 0.5475, + "step": 6128 + }, + { + "epoch": 2.392271662763466, + "grad_norm": 0.4379821587276557, + "learning_rate": 1.198965871346045e-06, + "loss": 0.5642, + "step": 6129 + }, + { + "epoch": 2.392661982825917, + "grad_norm": 0.52833414989806, + "learning_rate": 1.1974908886457747e-06, + "loss": 0.5521, + "step": 6130 + }, + { + "epoch": 2.3930523028883686, + "grad_norm": 0.5271795953742305, + "learning_rate": 1.196016690339843e-06, + "loss": 0.5686, + "step": 6131 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 0.47309810986396555, + "learning_rate": 1.1945432767323506e-06, + "loss": 0.5876, + "step": 6132 + }, + { + "epoch": 2.3938329430132708, + "grad_norm": 0.49328969313715537, + "learning_rate": 1.193070648127238e-06, + "loss": 0.5575, + "step": 6133 + }, + { + "epoch": 2.3942232630757223, + "grad_norm": 0.5365051485726412, + "learning_rate": 1.1915988048282867e-06, + "loss": 0.5529, + "step": 6134 + }, + { + "epoch": 2.3946135831381734, + "grad_norm": 0.4201654178568057, + "learning_rate": 1.19012774713911e-06, + "loss": 0.5677, + "step": 6135 + }, + { + "epoch": 2.3950039032006245, + "grad_norm": 0.46138612689129227, + "learning_rate": 1.1886574753631664e-06, + "loss": 0.5813, + "step": 6136 + }, + { + "epoch": 2.3953942232630756, + "grad_norm": 0.43654754846463084, + "learning_rate": 1.1871879898037447e-06, + "loss": 0.568, + "step": 6137 + }, + { + "epoch": 2.3957845433255267, + "grad_norm": 0.5621006140033853, + "learning_rate": 1.1857192907639786e-06, + "loss": 0.5791, + "step": 6138 + }, + { + "epoch": 2.3961748633879782, + "grad_norm": 0.4689793290354928, + "learning_rate": 1.1842513785468335e-06, + "loss": 0.5624, + "step": 6139 + }, + { + "epoch": 2.3965651834504293, + "grad_norm": 0.5752248940572379, + "learning_rate": 1.1827842534551194e-06, + "loss": 0.6128, + "step": 6140 + }, + { + "epoch": 2.3969555035128804, + "grad_norm": 0.4445835409125924, + "learning_rate": 1.1813179157914738e-06, + "loss": 0.5735, + "step": 6141 + }, + { + "epoch": 2.397345823575332, + "grad_norm": 0.4548017293110441, + "learning_rate": 1.1798523658583828e-06, + "loss": 0.5529, + "step": 6142 + }, + { + "epoch": 2.397736143637783, + "grad_norm": 0.4814135686991288, + "learning_rate": 1.1783876039581616e-06, + "loss": 0.5717, + "step": 6143 + }, + { + "epoch": 2.398126463700234, + "grad_norm": 0.4718929766191592, + "learning_rate": 1.1769236303929687e-06, + "loss": 0.5811, + "step": 6144 + }, + { + "epoch": 2.3985167837626853, + "grad_norm": 0.49797787298526874, + "learning_rate": 1.1754604454647973e-06, + "loss": 0.5901, + "step": 6145 + }, + { + "epoch": 2.3989071038251364, + "grad_norm": 0.5088202973197211, + "learning_rate": 1.1739980494754749e-06, + "loss": 0.5659, + "step": 6146 + }, + { + "epoch": 2.399297423887588, + "grad_norm": 0.5519677520736203, + "learning_rate": 1.1725364427266739e-06, + "loss": 0.5666, + "step": 6147 + }, + { + "epoch": 2.399687743950039, + "grad_norm": 0.4742372971138614, + "learning_rate": 1.1710756255198958e-06, + "loss": 0.5607, + "step": 6148 + }, + { + "epoch": 2.40007806401249, + "grad_norm": 0.4345750587431082, + "learning_rate": 1.1696155981564866e-06, + "loss": 0.5737, + "step": 6149 + }, + { + "epoch": 2.4004683840749417, + "grad_norm": 0.47121747822226406, + "learning_rate": 1.1681563609376212e-06, + "loss": 0.5538, + "step": 6150 + }, + { + "epoch": 2.4008587041373928, + "grad_norm": 0.42977054621187066, + "learning_rate": 1.1666979141643198e-06, + "loss": 0.5637, + "step": 6151 + }, + { + "epoch": 2.401249024199844, + "grad_norm": 0.41992294849665396, + "learning_rate": 1.1652402581374322e-06, + "loss": 0.5591, + "step": 6152 + }, + { + "epoch": 2.401639344262295, + "grad_norm": 0.448483327377864, + "learning_rate": 1.1637833931576525e-06, + "loss": 0.5587, + "step": 6153 + }, + { + "epoch": 2.402029664324746, + "grad_norm": 0.4550757684274438, + "learning_rate": 1.1623273195255037e-06, + "loss": 0.562, + "step": 6154 + }, + { + "epoch": 2.4024199843871976, + "grad_norm": 0.43434950822640944, + "learning_rate": 1.1608720375413534e-06, + "loss": 0.5055, + "step": 6155 + }, + { + "epoch": 2.4028103044496487, + "grad_norm": 0.453415779324178, + "learning_rate": 1.1594175475053986e-06, + "loss": 0.5462, + "step": 6156 + }, + { + "epoch": 2.4032006245121, + "grad_norm": 0.436716372146526, + "learning_rate": 1.1579638497176794e-06, + "loss": 0.5659, + "step": 6157 + }, + { + "epoch": 2.4035909445745514, + "grad_norm": 0.4171589879139257, + "learning_rate": 1.1565109444780675e-06, + "loss": 0.58, + "step": 6158 + }, + { + "epoch": 2.4039812646370025, + "grad_norm": 0.466091902526949, + "learning_rate": 1.1550588320862742e-06, + "loss": 0.5929, + "step": 6159 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.47205422676025943, + "learning_rate": 1.153607512841844e-06, + "loss": 0.5713, + "step": 6160 + }, + { + "epoch": 2.4047619047619047, + "grad_norm": 0.4899567860589847, + "learning_rate": 1.152156987044163e-06, + "loss": 0.6161, + "step": 6161 + }, + { + "epoch": 2.4051522248243558, + "grad_norm": 0.46213824257255337, + "learning_rate": 1.1507072549924487e-06, + "loss": 0.6019, + "step": 6162 + }, + { + "epoch": 2.4055425448868073, + "grad_norm": 0.44986245976198236, + "learning_rate": 1.1492583169857553e-06, + "loss": 0.5502, + "step": 6163 + }, + { + "epoch": 2.4059328649492584, + "grad_norm": 0.4267350099535311, + "learning_rate": 1.147810173322978e-06, + "loss": 0.596, + "step": 6164 + }, + { + "epoch": 2.4063231850117095, + "grad_norm": 0.4412080940818654, + "learning_rate": 1.1463628243028414e-06, + "loss": 0.5677, + "step": 6165 + }, + { + "epoch": 2.406713505074161, + "grad_norm": 0.5231452743317183, + "learning_rate": 1.1449162702239125e-06, + "loss": 0.5789, + "step": 6166 + }, + { + "epoch": 2.407103825136612, + "grad_norm": 0.44109165113920185, + "learning_rate": 1.14347051138459e-06, + "loss": 0.612, + "step": 6167 + }, + { + "epoch": 2.4074941451990632, + "grad_norm": 0.5179471991328883, + "learning_rate": 1.1420255480831094e-06, + "loss": 0.5837, + "step": 6168 + }, + { + "epoch": 2.4078844652615143, + "grad_norm": 0.535220998735822, + "learning_rate": 1.140581380617542e-06, + "loss": 0.5859, + "step": 6169 + }, + { + "epoch": 2.4082747853239654, + "grad_norm": 0.4477957759676631, + "learning_rate": 1.139138009285798e-06, + "loss": 0.575, + "step": 6170 + }, + { + "epoch": 2.408665105386417, + "grad_norm": 0.4919661621611092, + "learning_rate": 1.1376954343856173e-06, + "loss": 0.5868, + "step": 6171 + }, + { + "epoch": 2.409055425448868, + "grad_norm": 0.45444200833975346, + "learning_rate": 1.1362536562145831e-06, + "loss": 0.5838, + "step": 6172 + }, + { + "epoch": 2.409445745511319, + "grad_norm": 0.460757061708276, + "learning_rate": 1.1348126750701066e-06, + "loss": 0.5882, + "step": 6173 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.44105062491242975, + "learning_rate": 1.1333724912494415e-06, + "loss": 0.5899, + "step": 6174 + }, + { + "epoch": 2.410226385636222, + "grad_norm": 0.5417932211478904, + "learning_rate": 1.1319331050496724e-06, + "loss": 0.5424, + "step": 6175 + }, + { + "epoch": 2.410616705698673, + "grad_norm": 0.430113970355694, + "learning_rate": 1.13049451676772e-06, + "loss": 0.5405, + "step": 6176 + }, + { + "epoch": 2.411007025761124, + "grad_norm": 0.4991407083397599, + "learning_rate": 1.1290567267003405e-06, + "loss": 0.5854, + "step": 6177 + }, + { + "epoch": 2.411397345823575, + "grad_norm": 0.5145943240396021, + "learning_rate": 1.127619735144128e-06, + "loss": 0.5634, + "step": 6178 + }, + { + "epoch": 2.4117876658860267, + "grad_norm": 0.4327321613490909, + "learning_rate": 1.1261835423955097e-06, + "loss": 0.579, + "step": 6179 + }, + { + "epoch": 2.4121779859484778, + "grad_norm": 0.45627105422404124, + "learning_rate": 1.124748148750746e-06, + "loss": 0.5985, + "step": 6180 + }, + { + "epoch": 2.412568306010929, + "grad_norm": 0.4745226124910538, + "learning_rate": 1.1233135545059382e-06, + "loss": 0.6102, + "step": 6181 + }, + { + "epoch": 2.4129586260733804, + "grad_norm": 0.48599993456772206, + "learning_rate": 1.1218797599570153e-06, + "loss": 0.5434, + "step": 6182 + }, + { + "epoch": 2.4133489461358315, + "grad_norm": 0.482560071740547, + "learning_rate": 1.1204467653997492e-06, + "loss": 0.5495, + "step": 6183 + }, + { + "epoch": 2.4137392661982826, + "grad_norm": 0.5023399054566746, + "learning_rate": 1.1190145711297407e-06, + "loss": 0.5714, + "step": 6184 + }, + { + "epoch": 2.4141295862607337, + "grad_norm": 0.5788234243855425, + "learning_rate": 1.117583177442428e-06, + "loss": 0.6095, + "step": 6185 + }, + { + "epoch": 2.414519906323185, + "grad_norm": 0.45136964771078614, + "learning_rate": 1.1161525846330823e-06, + "loss": 0.5355, + "step": 6186 + }, + { + "epoch": 2.4149102263856363, + "grad_norm": 0.49612491499882105, + "learning_rate": 1.1147227929968136e-06, + "loss": 0.5433, + "step": 6187 + }, + { + "epoch": 2.4153005464480874, + "grad_norm": 0.5044283841633401, + "learning_rate": 1.1132938028285617e-06, + "loss": 0.5624, + "step": 6188 + }, + { + "epoch": 2.4156908665105385, + "grad_norm": 0.5088079955761827, + "learning_rate": 1.1118656144231066e-06, + "loss": 0.5882, + "step": 6189 + }, + { + "epoch": 2.41608118657299, + "grad_norm": 0.48025832203366, + "learning_rate": 1.1104382280750564e-06, + "loss": 0.5817, + "step": 6190 + }, + { + "epoch": 2.416471506635441, + "grad_norm": 0.5716834479570809, + "learning_rate": 1.10901164407886e-06, + "loss": 0.5619, + "step": 6191 + }, + { + "epoch": 2.4168618266978923, + "grad_norm": 0.5698929916036626, + "learning_rate": 1.1075858627287966e-06, + "loss": 0.5921, + "step": 6192 + }, + { + "epoch": 2.4172521467603434, + "grad_norm": 0.4657415248955799, + "learning_rate": 1.1061608843189815e-06, + "loss": 0.5228, + "step": 6193 + }, + { + "epoch": 2.4176424668227945, + "grad_norm": 0.4178419551280907, + "learning_rate": 1.1047367091433619e-06, + "loss": 0.5663, + "step": 6194 + }, + { + "epoch": 2.418032786885246, + "grad_norm": 0.4444829853020771, + "learning_rate": 1.1033133374957244e-06, + "loss": 0.5799, + "step": 6195 + }, + { + "epoch": 2.418423106947697, + "grad_norm": 0.49496692025801975, + "learning_rate": 1.1018907696696868e-06, + "loss": 0.5545, + "step": 6196 + }, + { + "epoch": 2.4188134270101482, + "grad_norm": 0.4390453477455385, + "learning_rate": 1.1004690059586981e-06, + "loss": 0.6169, + "step": 6197 + }, + { + "epoch": 2.4192037470725998, + "grad_norm": 0.45512216734032024, + "learning_rate": 1.0990480466560478e-06, + "loss": 0.5544, + "step": 6198 + }, + { + "epoch": 2.419594067135051, + "grad_norm": 0.4969078379358089, + "learning_rate": 1.0976278920548533e-06, + "loss": 0.5895, + "step": 6199 + }, + { + "epoch": 2.419984387197502, + "grad_norm": 0.5576221661291246, + "learning_rate": 1.0962085424480722e-06, + "loss": 0.5706, + "step": 6200 + }, + { + "epoch": 2.420374707259953, + "grad_norm": 0.4994307139766459, + "learning_rate": 1.0947899981284892e-06, + "loss": 0.5637, + "step": 6201 + }, + { + "epoch": 2.420765027322404, + "grad_norm": 0.50017475308011, + "learning_rate": 1.0933722593887314e-06, + "loss": 0.5867, + "step": 6202 + }, + { + "epoch": 2.4211553473848557, + "grad_norm": 0.4457277365599909, + "learning_rate": 1.0919553265212484e-06, + "loss": 0.6311, + "step": 6203 + }, + { + "epoch": 2.421545667447307, + "grad_norm": 0.4402349432369259, + "learning_rate": 1.090539199818334e-06, + "loss": 0.6009, + "step": 6204 + }, + { + "epoch": 2.421935987509758, + "grad_norm": 0.4198667964991758, + "learning_rate": 1.0891238795721093e-06, + "loss": 0.5797, + "step": 6205 + }, + { + "epoch": 2.4223263075722095, + "grad_norm": 0.45140571703317073, + "learning_rate": 1.0877093660745337e-06, + "loss": 0.5493, + "step": 6206 + }, + { + "epoch": 2.4227166276346606, + "grad_norm": 0.45515797011237497, + "learning_rate": 1.0862956596173952e-06, + "loss": 0.5606, + "step": 6207 + }, + { + "epoch": 2.4231069476971117, + "grad_norm": 0.4743028516727171, + "learning_rate": 1.0848827604923207e-06, + "loss": 0.5593, + "step": 6208 + }, + { + "epoch": 2.4234972677595628, + "grad_norm": 0.48330545509346673, + "learning_rate": 1.083470668990766e-06, + "loss": 0.59, + "step": 6209 + }, + { + "epoch": 2.423887587822014, + "grad_norm": 0.4545716188521356, + "learning_rate": 1.0820593854040213e-06, + "loss": 0.5639, + "step": 6210 + }, + { + "epoch": 2.4242779078844654, + "grad_norm": 0.4558709576294524, + "learning_rate": 1.0806489100232132e-06, + "loss": 0.5737, + "step": 6211 + }, + { + "epoch": 2.4246682279469165, + "grad_norm": 0.4510384391000308, + "learning_rate": 1.0792392431392983e-06, + "loss": 0.5672, + "step": 6212 + }, + { + "epoch": 2.4250585480093676, + "grad_norm": 0.4496041654590867, + "learning_rate": 1.077830385043067e-06, + "loss": 0.5604, + "step": 6213 + }, + { + "epoch": 2.4254488680718187, + "grad_norm": 0.4287761701208263, + "learning_rate": 1.0764223360251424e-06, + "loss": 0.5685, + "step": 6214 + }, + { + "epoch": 2.4258391881342702, + "grad_norm": 0.48880915874638986, + "learning_rate": 1.0750150963759837e-06, + "loss": 0.5553, + "step": 6215 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.4609173239553444, + "learning_rate": 1.0736086663858786e-06, + "loss": 0.5488, + "step": 6216 + }, + { + "epoch": 2.4266198282591724, + "grad_norm": 0.5809877521126247, + "learning_rate": 1.0722030463449534e-06, + "loss": 0.5372, + "step": 6217 + }, + { + "epoch": 2.4270101483216235, + "grad_norm": 0.583185927865647, + "learning_rate": 1.0707982365431602e-06, + "loss": 0.5698, + "step": 6218 + }, + { + "epoch": 2.427400468384075, + "grad_norm": 0.48184799906043296, + "learning_rate": 1.0693942372702931e-06, + "loss": 0.5803, + "step": 6219 + }, + { + "epoch": 2.427790788446526, + "grad_norm": 0.4521953293741898, + "learning_rate": 1.067991048815968e-06, + "loss": 0.573, + "step": 6220 + }, + { + "epoch": 2.4281811085089773, + "grad_norm": 0.4244393264671535, + "learning_rate": 1.0665886714696433e-06, + "loss": 0.5818, + "step": 6221 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.5008810679915229, + "learning_rate": 1.065187105520603e-06, + "loss": 0.578, + "step": 6222 + }, + { + "epoch": 2.42896174863388, + "grad_norm": 0.4593412663637594, + "learning_rate": 1.0637863512579705e-06, + "loss": 0.5393, + "step": 6223 + }, + { + "epoch": 2.429352068696331, + "grad_norm": 0.5914919927350348, + "learning_rate": 1.0623864089706947e-06, + "loss": 0.528, + "step": 6224 + }, + { + "epoch": 2.429742388758782, + "grad_norm": 0.45761174674949096, + "learning_rate": 1.0609872789475634e-06, + "loss": 0.5512, + "step": 6225 + }, + { + "epoch": 2.4301327088212332, + "grad_norm": 0.44590205460327387, + "learning_rate": 1.059588961477192e-06, + "loss": 0.5799, + "step": 6226 + }, + { + "epoch": 2.4305230288836848, + "grad_norm": 0.4698674909448196, + "learning_rate": 1.0581914568480295e-06, + "loss": 0.5855, + "step": 6227 + }, + { + "epoch": 2.430913348946136, + "grad_norm": 0.5053070439681412, + "learning_rate": 1.0567947653483596e-06, + "loss": 0.5467, + "step": 6228 + }, + { + "epoch": 2.431303669008587, + "grad_norm": 0.4742723938994407, + "learning_rate": 1.0553988872662963e-06, + "loss": 0.5738, + "step": 6229 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.4478244961425325, + "learning_rate": 1.0540038228897847e-06, + "loss": 0.5888, + "step": 6230 + }, + { + "epoch": 2.4320843091334896, + "grad_norm": 0.4454621888212595, + "learning_rate": 1.0526095725066033e-06, + "loss": 0.5298, + "step": 6231 + }, + { + "epoch": 2.4324746291959407, + "grad_norm": 0.4347655222590159, + "learning_rate": 1.0512161364043643e-06, + "loss": 0.5756, + "step": 6232 + }, + { + "epoch": 2.432864949258392, + "grad_norm": 0.4590320115924758, + "learning_rate": 1.0498235148705087e-06, + "loss": 0.5286, + "step": 6233 + }, + { + "epoch": 2.433255269320843, + "grad_norm": 0.44476627211569414, + "learning_rate": 1.0484317081923134e-06, + "loss": 0.6026, + "step": 6234 + }, + { + "epoch": 2.4336455893832944, + "grad_norm": 0.4559031512116291, + "learning_rate": 1.0470407166568819e-06, + "loss": 0.5553, + "step": 6235 + }, + { + "epoch": 2.4340359094457455, + "grad_norm": 0.4430044368527261, + "learning_rate": 1.0456505405511558e-06, + "loss": 0.57, + "step": 6236 + }, + { + "epoch": 2.4344262295081966, + "grad_norm": 0.43739255400835414, + "learning_rate": 1.044261180161904e-06, + "loss": 0.5478, + "step": 6237 + }, + { + "epoch": 2.4348165495706477, + "grad_norm": 0.432959419531625, + "learning_rate": 1.042872635775728e-06, + "loss": 0.5663, + "step": 6238 + }, + { + "epoch": 2.4352068696330993, + "grad_norm": 0.43992414119104245, + "learning_rate": 1.04148490767906e-06, + "loss": 0.5978, + "step": 6239 + }, + { + "epoch": 2.4355971896955504, + "grad_norm": 0.45101874582243073, + "learning_rate": 1.040097996158168e-06, + "loss": 0.5602, + "step": 6240 + }, + { + "epoch": 2.4359875097580015, + "grad_norm": 0.49196757356623083, + "learning_rate": 1.038711901499147e-06, + "loss": 0.562, + "step": 6241 + }, + { + "epoch": 2.4363778298204526, + "grad_norm": 0.4337305869539818, + "learning_rate": 1.0373266239879264e-06, + "loss": 0.5525, + "step": 6242 + }, + { + "epoch": 2.436768149882904, + "grad_norm": 0.4315544669585591, + "learning_rate": 1.0359421639102657e-06, + "loss": 0.5773, + "step": 6243 + }, + { + "epoch": 2.4371584699453552, + "grad_norm": 0.4428306360043646, + "learning_rate": 1.0345585215517545e-06, + "loss": 0.5515, + "step": 6244 + }, + { + "epoch": 2.4375487900078063, + "grad_norm": 0.44379584065665223, + "learning_rate": 1.0331756971978175e-06, + "loss": 0.5657, + "step": 6245 + }, + { + "epoch": 2.4379391100702574, + "grad_norm": 0.4720999827472813, + "learning_rate": 1.0317936911337068e-06, + "loss": 0.5904, + "step": 6246 + }, + { + "epoch": 2.438329430132709, + "grad_norm": 0.5004779026875527, + "learning_rate": 1.0304125036445072e-06, + "loss": 0.5591, + "step": 6247 + }, + { + "epoch": 2.43871975019516, + "grad_norm": 0.5400803072747261, + "learning_rate": 1.0290321350151344e-06, + "loss": 0.5651, + "step": 6248 + }, + { + "epoch": 2.439110070257611, + "grad_norm": 0.46709896100184023, + "learning_rate": 1.027652585530337e-06, + "loss": 0.5342, + "step": 6249 + }, + { + "epoch": 2.4395003903200623, + "grad_norm": 0.46543746885943954, + "learning_rate": 1.0262738554746914e-06, + "loss": 0.5715, + "step": 6250 + }, + { + "epoch": 2.439890710382514, + "grad_norm": 0.4432621292057772, + "learning_rate": 1.0248959451326084e-06, + "loss": 0.5983, + "step": 6251 + }, + { + "epoch": 2.440281030444965, + "grad_norm": 0.47306589104576935, + "learning_rate": 1.0235188547883262e-06, + "loss": 0.6135, + "step": 6252 + }, + { + "epoch": 2.440671350507416, + "grad_norm": 0.43452340161008635, + "learning_rate": 1.0221425847259176e-06, + "loss": 0.5422, + "step": 6253 + }, + { + "epoch": 2.441061670569867, + "grad_norm": 0.42806208031214504, + "learning_rate": 1.0207671352292814e-06, + "loss": 0.5353, + "step": 6254 + }, + { + "epoch": 2.4414519906323187, + "grad_norm": 0.5034164853990347, + "learning_rate": 1.0193925065821548e-06, + "loss": 0.5631, + "step": 6255 + }, + { + "epoch": 2.4418423106947698, + "grad_norm": 0.45958172938384323, + "learning_rate": 1.0180186990680947e-06, + "loss": 0.5256, + "step": 6256 + }, + { + "epoch": 2.442232630757221, + "grad_norm": 0.45125231677273164, + "learning_rate": 1.0166457129704987e-06, + "loss": 0.6037, + "step": 6257 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.46339201782446865, + "learning_rate": 1.0152735485725884e-06, + "loss": 0.5814, + "step": 6258 + }, + { + "epoch": 2.4430132708821235, + "grad_norm": 0.5754604554729303, + "learning_rate": 1.0139022061574222e-06, + "loss": 0.5934, + "step": 6259 + }, + { + "epoch": 2.4434035909445746, + "grad_norm": 0.5650552174239943, + "learning_rate": 1.0125316860078827e-06, + "loss": 0.5644, + "step": 6260 + }, + { + "epoch": 2.4437939110070257, + "grad_norm": 0.4533574692134742, + "learning_rate": 1.0111619884066837e-06, + "loss": 0.539, + "step": 6261 + }, + { + "epoch": 2.444184231069477, + "grad_norm": 0.4408896623629238, + "learning_rate": 1.0097931136363754e-06, + "loss": 0.5757, + "step": 6262 + }, + { + "epoch": 2.4445745511319283, + "grad_norm": 0.5086924577711169, + "learning_rate": 1.0084250619793307e-06, + "loss": 0.5776, + "step": 6263 + }, + { + "epoch": 2.4449648711943794, + "grad_norm": 0.47292605691765793, + "learning_rate": 1.0070578337177572e-06, + "loss": 0.544, + "step": 6264 + }, + { + "epoch": 2.4453551912568305, + "grad_norm": 0.4240643657517619, + "learning_rate": 1.0056914291336888e-06, + "loss": 0.5462, + "step": 6265 + }, + { + "epoch": 2.4457455113192816, + "grad_norm": 0.4591061696099431, + "learning_rate": 1.0043258485089963e-06, + "loss": 0.5444, + "step": 6266 + }, + { + "epoch": 2.446135831381733, + "grad_norm": 0.5118427804719972, + "learning_rate": 1.0029610921253724e-06, + "loss": 0.5685, + "step": 6267 + }, + { + "epoch": 2.4465261514441843, + "grad_norm": 0.5870829482549141, + "learning_rate": 1.001597160264346e-06, + "loss": 0.5887, + "step": 6268 + }, + { + "epoch": 2.4469164715066354, + "grad_norm": 0.4404610645390417, + "learning_rate": 1.0002340532072719e-06, + "loss": 0.5254, + "step": 6269 + }, + { + "epoch": 2.4473067915690865, + "grad_norm": 0.46180025727436397, + "learning_rate": 9.988717712353384e-07, + "loss": 0.5688, + "step": 6270 + }, + { + "epoch": 2.447697111631538, + "grad_norm": 0.4680597075268176, + "learning_rate": 9.975103146295605e-07, + "loss": 0.5682, + "step": 6271 + }, + { + "epoch": 2.448087431693989, + "grad_norm": 0.429590064234782, + "learning_rate": 9.961496836707847e-07, + "loss": 0.5587, + "step": 6272 + }, + { + "epoch": 2.4484777517564402, + "grad_norm": 0.4415274341374149, + "learning_rate": 9.947898786396836e-07, + "loss": 0.5586, + "step": 6273 + }, + { + "epoch": 2.4488680718188913, + "grad_norm": 0.4456373183646598, + "learning_rate": 9.934308998167664e-07, + "loss": 0.5394, + "step": 6274 + }, + { + "epoch": 2.449258391881343, + "grad_norm": 0.5372271765575864, + "learning_rate": 9.920727474823655e-07, + "loss": 0.5601, + "step": 6275 + }, + { + "epoch": 2.449648711943794, + "grad_norm": 0.4645508876093968, + "learning_rate": 9.907154219166443e-07, + "loss": 0.5883, + "step": 6276 + }, + { + "epoch": 2.450039032006245, + "grad_norm": 0.4545334144445091, + "learning_rate": 9.893589233995992e-07, + "loss": 0.5923, + "step": 6277 + }, + { + "epoch": 2.450429352068696, + "grad_norm": 0.4370632638272949, + "learning_rate": 9.880032522110506e-07, + "loss": 0.5504, + "step": 6278 + }, + { + "epoch": 2.4508196721311477, + "grad_norm": 0.5060526466244848, + "learning_rate": 9.866484086306538e-07, + "loss": 0.5654, + "step": 6279 + }, + { + "epoch": 2.451209992193599, + "grad_norm": 0.42433801079428707, + "learning_rate": 9.852943929378873e-07, + "loss": 0.5766, + "step": 6280 + }, + { + "epoch": 2.45160031225605, + "grad_norm": 0.47798300897274787, + "learning_rate": 9.839412054120655e-07, + "loss": 0.5451, + "step": 6281 + }, + { + "epoch": 2.451990632318501, + "grad_norm": 0.48709544656553044, + "learning_rate": 9.825888463323246e-07, + "loss": 0.5835, + "step": 6282 + }, + { + "epoch": 2.4523809523809526, + "grad_norm": 0.5507852236823224, + "learning_rate": 9.812373159776367e-07, + "loss": 0.5467, + "step": 6283 + }, + { + "epoch": 2.4527712724434036, + "grad_norm": 0.43399587833523406, + "learning_rate": 9.79886614626797e-07, + "loss": 0.5984, + "step": 6284 + }, + { + "epoch": 2.4531615925058547, + "grad_norm": 0.4510542239402631, + "learning_rate": 9.785367425584369e-07, + "loss": 0.5974, + "step": 6285 + }, + { + "epoch": 2.453551912568306, + "grad_norm": 0.43192547276761767, + "learning_rate": 9.771877000510087e-07, + "loss": 0.5866, + "step": 6286 + }, + { + "epoch": 2.4539422326307574, + "grad_norm": 0.46722783612089513, + "learning_rate": 9.758394873828003e-07, + "loss": 0.5982, + "step": 6287 + }, + { + "epoch": 2.4543325526932085, + "grad_norm": 0.5084102018910813, + "learning_rate": 9.744921048319245e-07, + "loss": 0.566, + "step": 6288 + }, + { + "epoch": 2.4547228727556596, + "grad_norm": 0.4853368209492644, + "learning_rate": 9.731455526763216e-07, + "loss": 0.5534, + "step": 6289 + }, + { + "epoch": 2.4551131928181107, + "grad_norm": 0.4697956495354043, + "learning_rate": 9.717998311937671e-07, + "loss": 0.5213, + "step": 6290 + }, + { + "epoch": 2.4555035128805622, + "grad_norm": 0.52341216003825, + "learning_rate": 9.704549406618585e-07, + "loss": 0.5827, + "step": 6291 + }, + { + "epoch": 2.4558938329430133, + "grad_norm": 0.571242541378557, + "learning_rate": 9.691108813580247e-07, + "loss": 0.5507, + "step": 6292 + }, + { + "epoch": 2.4562841530054644, + "grad_norm": 0.4942947345952569, + "learning_rate": 9.67767653559521e-07, + "loss": 0.5711, + "step": 6293 + }, + { + "epoch": 2.4566744730679155, + "grad_norm": 0.41158034459648996, + "learning_rate": 9.664252575434362e-07, + "loss": 0.5855, + "step": 6294 + }, + { + "epoch": 2.457064793130367, + "grad_norm": 0.4667727771704432, + "learning_rate": 9.650836935866808e-07, + "loss": 0.5612, + "step": 6295 + }, + { + "epoch": 2.457455113192818, + "grad_norm": 0.45787168551252033, + "learning_rate": 9.637429619660011e-07, + "loss": 0.565, + "step": 6296 + }, + { + "epoch": 2.4578454332552693, + "grad_norm": 0.47591839229599825, + "learning_rate": 9.62403062957964e-07, + "loss": 0.5876, + "step": 6297 + }, + { + "epoch": 2.4582357533177204, + "grad_norm": 0.4554874771329395, + "learning_rate": 9.610639968389711e-07, + "loss": 0.5317, + "step": 6298 + }, + { + "epoch": 2.458626073380172, + "grad_norm": 0.4575547173674276, + "learning_rate": 9.597257638852486e-07, + "loss": 0.5696, + "step": 6299 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.5079357353670232, + "learning_rate": 9.583883643728513e-07, + "loss": 0.5618, + "step": 6300 + }, + { + "epoch": 2.459406713505074, + "grad_norm": 0.43341227172201463, + "learning_rate": 9.570517985776612e-07, + "loss": 0.5494, + "step": 6301 + }, + { + "epoch": 2.459797033567525, + "grad_norm": 0.4150778612407476, + "learning_rate": 9.557160667753922e-07, + "loss": 0.5387, + "step": 6302 + }, + { + "epoch": 2.4601873536299768, + "grad_norm": 0.45334467737263784, + "learning_rate": 9.543811692415811e-07, + "loss": 0.5419, + "step": 6303 + }, + { + "epoch": 2.460577673692428, + "grad_norm": 0.457043322273831, + "learning_rate": 9.530471062515973e-07, + "loss": 0.6045, + "step": 6304 + }, + { + "epoch": 2.460967993754879, + "grad_norm": 0.41603924579884033, + "learning_rate": 9.517138780806345e-07, + "loss": 0.5542, + "step": 6305 + }, + { + "epoch": 2.46135831381733, + "grad_norm": 0.45734133793006004, + "learning_rate": 9.503814850037152e-07, + "loss": 0.5574, + "step": 6306 + }, + { + "epoch": 2.4617486338797816, + "grad_norm": 0.42798189677134413, + "learning_rate": 9.49049927295691e-07, + "loss": 0.5491, + "step": 6307 + }, + { + "epoch": 2.4621389539422327, + "grad_norm": 0.49579772414282947, + "learning_rate": 9.477192052312395e-07, + "loss": 0.5683, + "step": 6308 + }, + { + "epoch": 2.462529274004684, + "grad_norm": 0.6197566548459025, + "learning_rate": 9.463893190848666e-07, + "loss": 0.5354, + "step": 6309 + }, + { + "epoch": 2.462919594067135, + "grad_norm": 0.4444541371746607, + "learning_rate": 9.450602691309046e-07, + "loss": 0.5853, + "step": 6310 + }, + { + "epoch": 2.4633099141295864, + "grad_norm": 0.44528015135063204, + "learning_rate": 9.437320556435164e-07, + "loss": 0.5611, + "step": 6311 + }, + { + "epoch": 2.4637002341920375, + "grad_norm": 0.48803441912505513, + "learning_rate": 9.424046788966878e-07, + "loss": 0.5634, + "step": 6312 + }, + { + "epoch": 2.4640905542544886, + "grad_norm": 0.5279330362441664, + "learning_rate": 9.410781391642377e-07, + "loss": 0.5531, + "step": 6313 + }, + { + "epoch": 2.4644808743169397, + "grad_norm": 0.5068474795503078, + "learning_rate": 9.397524367198063e-07, + "loss": 0.5493, + "step": 6314 + }, + { + "epoch": 2.4648711943793913, + "grad_norm": 0.44342702195453576, + "learning_rate": 9.384275718368663e-07, + "loss": 0.5708, + "step": 6315 + }, + { + "epoch": 2.4652615144418424, + "grad_norm": 0.4367383665486524, + "learning_rate": 9.371035447887139e-07, + "loss": 0.533, + "step": 6316 + }, + { + "epoch": 2.4656518345042935, + "grad_norm": 0.46803400130464407, + "learning_rate": 9.357803558484741e-07, + "loss": 0.567, + "step": 6317 + }, + { + "epoch": 2.4660421545667446, + "grad_norm": 0.43651571184935234, + "learning_rate": 9.344580052890972e-07, + "loss": 0.5482, + "step": 6318 + }, + { + "epoch": 2.466432474629196, + "grad_norm": 0.5592742346623735, + "learning_rate": 9.331364933833654e-07, + "loss": 0.5608, + "step": 6319 + }, + { + "epoch": 2.4668227946916472, + "grad_norm": 0.5731412092278205, + "learning_rate": 9.318158204038813e-07, + "loss": 0.5803, + "step": 6320 + }, + { + "epoch": 2.4672131147540983, + "grad_norm": 0.4863728255193237, + "learning_rate": 9.304959866230806e-07, + "loss": 0.5685, + "step": 6321 + }, + { + "epoch": 2.4676034348165494, + "grad_norm": 0.41655025196120543, + "learning_rate": 9.291769923132216e-07, + "loss": 0.5445, + "step": 6322 + }, + { + "epoch": 2.467993754879001, + "grad_norm": 0.45335711399298867, + "learning_rate": 9.278588377463899e-07, + "loss": 0.5581, + "step": 6323 + }, + { + "epoch": 2.468384074941452, + "grad_norm": 0.4554350307714874, + "learning_rate": 9.265415231945013e-07, + "loss": 0.5904, + "step": 6324 + }, + { + "epoch": 2.468774395003903, + "grad_norm": 0.47473173835970406, + "learning_rate": 9.252250489292941e-07, + "loss": 0.5744, + "step": 6325 + }, + { + "epoch": 2.4691647150663543, + "grad_norm": 0.43692790879675364, + "learning_rate": 9.23909415222336e-07, + "loss": 0.5659, + "step": 6326 + }, + { + "epoch": 2.469555035128806, + "grad_norm": 0.428827597989663, + "learning_rate": 9.225946223450177e-07, + "loss": 0.5711, + "step": 6327 + }, + { + "epoch": 2.469945355191257, + "grad_norm": 0.4440227105767596, + "learning_rate": 9.212806705685634e-07, + "loss": 0.5791, + "step": 6328 + }, + { + "epoch": 2.470335675253708, + "grad_norm": 0.4444698315395411, + "learning_rate": 9.199675601640152e-07, + "loss": 0.543, + "step": 6329 + }, + { + "epoch": 2.470725995316159, + "grad_norm": 0.43711197142431574, + "learning_rate": 9.186552914022495e-07, + "loss": 0.5675, + "step": 6330 + }, + { + "epoch": 2.4711163153786107, + "grad_norm": 0.4387806664712142, + "learning_rate": 9.173438645539628e-07, + "loss": 0.589, + "step": 6331 + }, + { + "epoch": 2.4715066354410617, + "grad_norm": 0.4463668917775501, + "learning_rate": 9.160332798896832e-07, + "loss": 0.5439, + "step": 6332 + }, + { + "epoch": 2.471896955503513, + "grad_norm": 0.472082353984748, + "learning_rate": 9.147235376797592e-07, + "loss": 0.5904, + "step": 6333 + }, + { + "epoch": 2.472287275565964, + "grad_norm": 0.46987071031308797, + "learning_rate": 9.134146381943737e-07, + "loss": 0.5646, + "step": 6334 + }, + { + "epoch": 2.4726775956284155, + "grad_norm": 0.42586628602093074, + "learning_rate": 9.121065817035252e-07, + "loss": 0.5515, + "step": 6335 + }, + { + "epoch": 2.4730679156908666, + "grad_norm": 0.45030890812929325, + "learning_rate": 9.107993684770483e-07, + "loss": 0.5377, + "step": 6336 + }, + { + "epoch": 2.4734582357533177, + "grad_norm": 0.46867985436833853, + "learning_rate": 9.094929987845958e-07, + "loss": 0.5838, + "step": 6337 + }, + { + "epoch": 2.473848555815769, + "grad_norm": 0.45958023818683025, + "learning_rate": 9.081874728956536e-07, + "loss": 0.5728, + "step": 6338 + }, + { + "epoch": 2.4742388758782203, + "grad_norm": 0.3979629270840823, + "learning_rate": 9.068827910795286e-07, + "loss": 0.5676, + "step": 6339 + }, + { + "epoch": 2.4746291959406714, + "grad_norm": 0.4638552512731786, + "learning_rate": 9.055789536053528e-07, + "loss": 0.5507, + "step": 6340 + }, + { + "epoch": 2.4750195160031225, + "grad_norm": 0.455955768529014, + "learning_rate": 9.042759607420893e-07, + "loss": 0.5579, + "step": 6341 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.4777721005090741, + "learning_rate": 9.029738127585213e-07, + "loss": 0.56, + "step": 6342 + }, + { + "epoch": 2.4758001561280247, + "grad_norm": 0.433578562359298, + "learning_rate": 9.016725099232643e-07, + "loss": 0.5481, + "step": 6343 + }, + { + "epoch": 2.4761904761904763, + "grad_norm": 0.5307400671284643, + "learning_rate": 9.003720525047499e-07, + "loss": 0.5481, + "step": 6344 + }, + { + "epoch": 2.4765807962529274, + "grad_norm": 0.49341672840587075, + "learning_rate": 8.990724407712448e-07, + "loss": 0.566, + "step": 6345 + }, + { + "epoch": 2.4769711163153785, + "grad_norm": 0.43054677430352367, + "learning_rate": 8.977736749908345e-07, + "loss": 0.6112, + "step": 6346 + }, + { + "epoch": 2.47736143637783, + "grad_norm": 0.4348920565412537, + "learning_rate": 8.964757554314358e-07, + "loss": 0.5542, + "step": 6347 + }, + { + "epoch": 2.477751756440281, + "grad_norm": 0.43854203426578997, + "learning_rate": 8.95178682360785e-07, + "loss": 0.5687, + "step": 6348 + }, + { + "epoch": 2.478142076502732, + "grad_norm": 0.45629136890744293, + "learning_rate": 8.938824560464493e-07, + "loss": 0.5631, + "step": 6349 + }, + { + "epoch": 2.4785323965651833, + "grad_norm": 0.44346765109427255, + "learning_rate": 8.925870767558165e-07, + "loss": 0.5943, + "step": 6350 + }, + { + "epoch": 2.4789227166276344, + "grad_norm": 0.4469370476723377, + "learning_rate": 8.912925447561033e-07, + "loss": 0.5894, + "step": 6351 + }, + { + "epoch": 2.479313036690086, + "grad_norm": 0.4860290757813373, + "learning_rate": 8.8999886031435e-07, + "loss": 0.5735, + "step": 6352 + }, + { + "epoch": 2.479703356752537, + "grad_norm": 0.45379527696474903, + "learning_rate": 8.88706023697421e-07, + "loss": 0.6106, + "step": 6353 + }, + { + "epoch": 2.480093676814988, + "grad_norm": 0.4227811295690577, + "learning_rate": 8.874140351720067e-07, + "loss": 0.6022, + "step": 6354 + }, + { + "epoch": 2.4804839968774397, + "grad_norm": 0.4860948106133061, + "learning_rate": 8.861228950046247e-07, + "loss": 0.5624, + "step": 6355 + }, + { + "epoch": 2.480874316939891, + "grad_norm": 0.42279479722272784, + "learning_rate": 8.84832603461615e-07, + "loss": 0.5464, + "step": 6356 + }, + { + "epoch": 2.481264637002342, + "grad_norm": 0.45288062198687035, + "learning_rate": 8.835431608091416e-07, + "loss": 0.558, + "step": 6357 + }, + { + "epoch": 2.481654957064793, + "grad_norm": 0.4530885467277741, + "learning_rate": 8.822545673131982e-07, + "loss": 0.5547, + "step": 6358 + }, + { + "epoch": 2.482045277127244, + "grad_norm": 0.430981745484027, + "learning_rate": 8.809668232395968e-07, + "loss": 0.6223, + "step": 6359 + }, + { + "epoch": 2.4824355971896956, + "grad_norm": 0.44656396355988337, + "learning_rate": 8.796799288539814e-07, + "loss": 0.6007, + "step": 6360 + }, + { + "epoch": 2.4828259172521467, + "grad_norm": 0.5220571959970403, + "learning_rate": 8.783938844218126e-07, + "loss": 0.5491, + "step": 6361 + }, + { + "epoch": 2.483216237314598, + "grad_norm": 0.4379756254369437, + "learning_rate": 8.771086902083837e-07, + "loss": 0.5755, + "step": 6362 + }, + { + "epoch": 2.4836065573770494, + "grad_norm": 0.4284410225483686, + "learning_rate": 8.758243464788052e-07, + "loss": 0.5661, + "step": 6363 + }, + { + "epoch": 2.4839968774395005, + "grad_norm": 0.4388716108737195, + "learning_rate": 8.745408534980199e-07, + "loss": 0.5723, + "step": 6364 + }, + { + "epoch": 2.4843871975019516, + "grad_norm": 0.43391248359090195, + "learning_rate": 8.732582115307875e-07, + "loss": 0.5498, + "step": 6365 + }, + { + "epoch": 2.4847775175644027, + "grad_norm": 0.43867597059418156, + "learning_rate": 8.719764208416981e-07, + "loss": 0.5875, + "step": 6366 + }, + { + "epoch": 2.485167837626854, + "grad_norm": 0.5125901774561445, + "learning_rate": 8.706954816951618e-07, + "loss": 0.5562, + "step": 6367 + }, + { + "epoch": 2.4855581576893053, + "grad_norm": 0.48902687482785445, + "learning_rate": 8.694153943554179e-07, + "loss": 0.5704, + "step": 6368 + }, + { + "epoch": 2.4859484777517564, + "grad_norm": 0.46023289313905247, + "learning_rate": 8.681361590865245e-07, + "loss": 0.5202, + "step": 6369 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.44428132335804843, + "learning_rate": 8.668577761523677e-07, + "loss": 0.5724, + "step": 6370 + }, + { + "epoch": 2.486729117876659, + "grad_norm": 0.4443407866353283, + "learning_rate": 8.655802458166551e-07, + "loss": 0.5489, + "step": 6371 + }, + { + "epoch": 2.48711943793911, + "grad_norm": 0.49482731141674646, + "learning_rate": 8.643035683429229e-07, + "loss": 0.5761, + "step": 6372 + }, + { + "epoch": 2.4875097580015613, + "grad_norm": 0.6163889876095676, + "learning_rate": 8.630277439945261e-07, + "loss": 0.5746, + "step": 6373 + }, + { + "epoch": 2.4879000780640124, + "grad_norm": 0.46372603711101273, + "learning_rate": 8.617527730346453e-07, + "loss": 0.5464, + "step": 6374 + }, + { + "epoch": 2.4882903981264635, + "grad_norm": 0.43927000955204454, + "learning_rate": 8.604786557262884e-07, + "loss": 0.5418, + "step": 6375 + }, + { + "epoch": 2.488680718188915, + "grad_norm": 0.4500584759438139, + "learning_rate": 8.592053923322819e-07, + "loss": 0.5931, + "step": 6376 + }, + { + "epoch": 2.489071038251366, + "grad_norm": 0.42760995270573815, + "learning_rate": 8.579329831152816e-07, + "loss": 0.5142, + "step": 6377 + }, + { + "epoch": 2.489461358313817, + "grad_norm": 0.5003032458709668, + "learning_rate": 8.566614283377633e-07, + "loss": 0.5329, + "step": 6378 + }, + { + "epoch": 2.4898516783762688, + "grad_norm": 0.4318600539300043, + "learning_rate": 8.553907282620266e-07, + "loss": 0.5734, + "step": 6379 + }, + { + "epoch": 2.49024199843872, + "grad_norm": 0.43802081753404865, + "learning_rate": 8.541208831501957e-07, + "loss": 0.5685, + "step": 6380 + }, + { + "epoch": 2.490632318501171, + "grad_norm": 0.4727655263782488, + "learning_rate": 8.528518932642199e-07, + "loss": 0.5708, + "step": 6381 + }, + { + "epoch": 2.491022638563622, + "grad_norm": 0.5516583970855343, + "learning_rate": 8.515837588658694e-07, + "loss": 0.5815, + "step": 6382 + }, + { + "epoch": 2.491412958626073, + "grad_norm": 0.5412088521039891, + "learning_rate": 8.503164802167407e-07, + "loss": 0.565, + "step": 6383 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.4492818924953163, + "learning_rate": 8.490500575782501e-07, + "loss": 0.5318, + "step": 6384 + }, + { + "epoch": 2.492193598750976, + "grad_norm": 0.45883590543570124, + "learning_rate": 8.477844912116423e-07, + "loss": 0.5805, + "step": 6385 + }, + { + "epoch": 2.492583918813427, + "grad_norm": 0.4793347808285976, + "learning_rate": 8.465197813779808e-07, + "loss": 0.5944, + "step": 6386 + }, + { + "epoch": 2.4929742388758784, + "grad_norm": 0.440926529755657, + "learning_rate": 8.45255928338155e-07, + "loss": 0.5422, + "step": 6387 + }, + { + "epoch": 2.4933645589383295, + "grad_norm": 0.4407667597106852, + "learning_rate": 8.439929323528745e-07, + "loss": 0.5565, + "step": 6388 + }, + { + "epoch": 2.4937548790007806, + "grad_norm": 0.503070391689201, + "learning_rate": 8.427307936826778e-07, + "loss": 0.5295, + "step": 6389 + }, + { + "epoch": 2.4941451990632317, + "grad_norm": 0.4280456960302173, + "learning_rate": 8.414695125879208e-07, + "loss": 0.6119, + "step": 6390 + }, + { + "epoch": 2.494535519125683, + "grad_norm": 0.4638342700514624, + "learning_rate": 8.402090893287846e-07, + "loss": 0.5068, + "step": 6391 + }, + { + "epoch": 2.4949258391881344, + "grad_norm": 0.469521710637382, + "learning_rate": 8.389495241652756e-07, + "loss": 0.5568, + "step": 6392 + }, + { + "epoch": 2.4953161592505855, + "grad_norm": 0.4936665484909949, + "learning_rate": 8.376908173572184e-07, + "loss": 0.5422, + "step": 6393 + }, + { + "epoch": 2.4957064793130366, + "grad_norm": 0.48842103339336995, + "learning_rate": 8.364329691642658e-07, + "loss": 0.5491, + "step": 6394 + }, + { + "epoch": 2.496096799375488, + "grad_norm": 0.42869214441834996, + "learning_rate": 8.351759798458892e-07, + "loss": 0.5332, + "step": 6395 + }, + { + "epoch": 2.496487119437939, + "grad_norm": 0.4406077879723682, + "learning_rate": 8.33919849661387e-07, + "loss": 0.5873, + "step": 6396 + }, + { + "epoch": 2.4968774395003903, + "grad_norm": 0.4526442846818694, + "learning_rate": 8.32664578869874e-07, + "loss": 0.5615, + "step": 6397 + }, + { + "epoch": 2.4972677595628414, + "grad_norm": 0.49661971534779703, + "learning_rate": 8.314101677302955e-07, + "loss": 0.5698, + "step": 6398 + }, + { + "epoch": 2.4976580796252925, + "grad_norm": 0.48066735676813194, + "learning_rate": 8.301566165014125e-07, + "loss": 0.5584, + "step": 6399 + }, + { + "epoch": 2.498048399687744, + "grad_norm": 0.47288682561857237, + "learning_rate": 8.28903925441814e-07, + "loss": 0.5751, + "step": 6400 + }, + { + "epoch": 2.498438719750195, + "grad_norm": 0.4776627646598498, + "learning_rate": 8.276520948099093e-07, + "loss": 0.5618, + "step": 6401 + }, + { + "epoch": 2.4988290398126463, + "grad_norm": 0.5135154985377142, + "learning_rate": 8.264011248639275e-07, + "loss": 0.558, + "step": 6402 + }, + { + "epoch": 2.499219359875098, + "grad_norm": 0.4827583508799122, + "learning_rate": 8.251510158619269e-07, + "loss": 0.5789, + "step": 6403 + }, + { + "epoch": 2.499609679937549, + "grad_norm": 0.4386042488290802, + "learning_rate": 8.239017680617817e-07, + "loss": 0.5512, + "step": 6404 + }, + { + "epoch": 2.5, + "grad_norm": 0.48910942747354463, + "learning_rate": 8.226533817211912e-07, + "loss": 0.5282, + "step": 6405 + }, + { + "epoch": 2.500390320062451, + "grad_norm": 0.4865010814365337, + "learning_rate": 8.214058570976763e-07, + "loss": 0.5803, + "step": 6406 + }, + { + "epoch": 2.500780640124902, + "grad_norm": 0.4939009033703554, + "learning_rate": 8.201591944485821e-07, + "loss": 0.5613, + "step": 6407 + }, + { + "epoch": 2.5011709601873537, + "grad_norm": 0.5009706561934028, + "learning_rate": 8.189133940310717e-07, + "loss": 0.5483, + "step": 6408 + }, + { + "epoch": 2.501561280249805, + "grad_norm": 0.46358613308749314, + "learning_rate": 8.176684561021365e-07, + "loss": 0.5692, + "step": 6409 + }, + { + "epoch": 2.501951600312256, + "grad_norm": 0.44288520812502985, + "learning_rate": 8.164243809185829e-07, + "loss": 0.5779, + "step": 6410 + }, + { + "epoch": 2.5023419203747075, + "grad_norm": 0.5087034425411349, + "learning_rate": 8.151811687370465e-07, + "loss": 0.5821, + "step": 6411 + }, + { + "epoch": 2.5027322404371586, + "grad_norm": 0.4661672139860173, + "learning_rate": 8.139388198139781e-07, + "loss": 0.5836, + "step": 6412 + }, + { + "epoch": 2.5031225604996097, + "grad_norm": 0.449006891540154, + "learning_rate": 8.126973344056571e-07, + "loss": 0.5604, + "step": 6413 + }, + { + "epoch": 2.503512880562061, + "grad_norm": 0.45007295390416785, + "learning_rate": 8.114567127681771e-07, + "loss": 0.5637, + "step": 6414 + }, + { + "epoch": 2.503903200624512, + "grad_norm": 0.4084786951628341, + "learning_rate": 8.102169551574606e-07, + "loss": 0.5711, + "step": 6415 + }, + { + "epoch": 2.5042935206869634, + "grad_norm": 0.4779116927944054, + "learning_rate": 8.08978061829247e-07, + "loss": 0.5604, + "step": 6416 + }, + { + "epoch": 2.5046838407494145, + "grad_norm": 0.4883649615282539, + "learning_rate": 8.077400330391017e-07, + "loss": 0.5632, + "step": 6417 + }, + { + "epoch": 2.5050741608118656, + "grad_norm": 0.41580986715205764, + "learning_rate": 8.065028690424081e-07, + "loss": 0.5569, + "step": 6418 + }, + { + "epoch": 2.505464480874317, + "grad_norm": 0.44236041431596546, + "learning_rate": 8.052665700943718e-07, + "loss": 0.6019, + "step": 6419 + }, + { + "epoch": 2.5058548009367683, + "grad_norm": 0.4301740634362286, + "learning_rate": 8.040311364500225e-07, + "loss": 0.5579, + "step": 6420 + }, + { + "epoch": 2.5062451209992194, + "grad_norm": 0.4054545200614153, + "learning_rate": 8.027965683642069e-07, + "loss": 0.5775, + "step": 6421 + }, + { + "epoch": 2.5066354410616705, + "grad_norm": 0.44217535206677056, + "learning_rate": 8.015628660916008e-07, + "loss": 0.5602, + "step": 6422 + }, + { + "epoch": 2.5070257611241216, + "grad_norm": 0.4658688029546191, + "learning_rate": 8.003300298866906e-07, + "loss": 0.5506, + "step": 6423 + }, + { + "epoch": 2.507416081186573, + "grad_norm": 0.43886480032223446, + "learning_rate": 7.990980600037934e-07, + "loss": 0.5886, + "step": 6424 + }, + { + "epoch": 2.507806401249024, + "grad_norm": 0.4632745080312664, + "learning_rate": 7.978669566970421e-07, + "loss": 0.595, + "step": 6425 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 0.41759496508434835, + "learning_rate": 7.966367202203956e-07, + "loss": 0.5555, + "step": 6426 + }, + { + "epoch": 2.508587041373927, + "grad_norm": 0.4765555815414146, + "learning_rate": 7.954073508276278e-07, + "loss": 0.6075, + "step": 6427 + }, + { + "epoch": 2.508977361436378, + "grad_norm": 0.47299199052701196, + "learning_rate": 7.941788487723401e-07, + "loss": 0.5962, + "step": 6428 + }, + { + "epoch": 2.509367681498829, + "grad_norm": 0.4531291735207859, + "learning_rate": 7.929512143079499e-07, + "loss": 0.545, + "step": 6429 + }, + { + "epoch": 2.50975800156128, + "grad_norm": 0.42861105787233256, + "learning_rate": 7.917244476876995e-07, + "loss": 0.5619, + "step": 6430 + }, + { + "epoch": 2.5101483216237312, + "grad_norm": 0.4454465804761266, + "learning_rate": 7.904985491646494e-07, + "loss": 0.5407, + "step": 6431 + }, + { + "epoch": 2.510538641686183, + "grad_norm": 0.46131515894800773, + "learning_rate": 7.892735189916829e-07, + "loss": 0.5507, + "step": 6432 + }, + { + "epoch": 2.510928961748634, + "grad_norm": 0.4827779258501412, + "learning_rate": 7.880493574215009e-07, + "loss": 0.5694, + "step": 6433 + }, + { + "epoch": 2.511319281811085, + "grad_norm": 0.5453665758564739, + "learning_rate": 7.868260647066306e-07, + "loss": 0.5923, + "step": 6434 + }, + { + "epoch": 2.5117096018735365, + "grad_norm": 0.5043411380462414, + "learning_rate": 7.856036410994161e-07, + "loss": 0.5533, + "step": 6435 + }, + { + "epoch": 2.5120999219359876, + "grad_norm": 0.45663689012051034, + "learning_rate": 7.843820868520208e-07, + "loss": 0.5889, + "step": 6436 + }, + { + "epoch": 2.5124902419984387, + "grad_norm": 0.4145889640411252, + "learning_rate": 7.831614022164341e-07, + "loss": 0.6015, + "step": 6437 + }, + { + "epoch": 2.51288056206089, + "grad_norm": 0.4550577685054731, + "learning_rate": 7.819415874444608e-07, + "loss": 0.573, + "step": 6438 + }, + { + "epoch": 2.513270882123341, + "grad_norm": 0.44487017948382757, + "learning_rate": 7.807226427877301e-07, + "loss": 0.5807, + "step": 6439 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.4348932480349821, + "learning_rate": 7.795045684976893e-07, + "loss": 0.5755, + "step": 6440 + }, + { + "epoch": 2.5140515222482436, + "grad_norm": 0.42742457865916195, + "learning_rate": 7.782873648256068e-07, + "loss": 0.5429, + "step": 6441 + }, + { + "epoch": 2.5144418423106947, + "grad_norm": 0.4279004151246514, + "learning_rate": 7.770710320225699e-07, + "loss": 0.5557, + "step": 6442 + }, + { + "epoch": 2.514832162373146, + "grad_norm": 0.44111438680619636, + "learning_rate": 7.758555703394904e-07, + "loss": 0.5585, + "step": 6443 + }, + { + "epoch": 2.5152224824355973, + "grad_norm": 0.42849047113271965, + "learning_rate": 7.746409800270954e-07, + "loss": 0.5751, + "step": 6444 + }, + { + "epoch": 2.5156128024980484, + "grad_norm": 0.4338979510106038, + "learning_rate": 7.734272613359373e-07, + "loss": 0.5645, + "step": 6445 + }, + { + "epoch": 2.5160031225604995, + "grad_norm": 0.39336875234226065, + "learning_rate": 7.722144145163834e-07, + "loss": 0.5473, + "step": 6446 + }, + { + "epoch": 2.5163934426229506, + "grad_norm": 0.4055266300533513, + "learning_rate": 7.710024398186261e-07, + "loss": 0.5788, + "step": 6447 + }, + { + "epoch": 2.516783762685402, + "grad_norm": 0.4704464676623626, + "learning_rate": 7.697913374926746e-07, + "loss": 0.559, + "step": 6448 + }, + { + "epoch": 2.5171740827478533, + "grad_norm": 0.5059275736361141, + "learning_rate": 7.685811077883598e-07, + "loss": 0.5334, + "step": 6449 + }, + { + "epoch": 2.5175644028103044, + "grad_norm": 0.5900660975941315, + "learning_rate": 7.673717509553291e-07, + "loss": 0.548, + "step": 6450 + }, + { + "epoch": 2.517954722872756, + "grad_norm": 0.4469510559139454, + "learning_rate": 7.661632672430569e-07, + "loss": 0.5677, + "step": 6451 + }, + { + "epoch": 2.518345042935207, + "grad_norm": 0.4714576104592064, + "learning_rate": 7.649556569008304e-07, + "loss": 0.554, + "step": 6452 + }, + { + "epoch": 2.518735362997658, + "grad_norm": 0.4637618048076054, + "learning_rate": 7.637489201777598e-07, + "loss": 0.5808, + "step": 6453 + }, + { + "epoch": 2.519125683060109, + "grad_norm": 0.5086569826351347, + "learning_rate": 7.625430573227765e-07, + "loss": 0.571, + "step": 6454 + }, + { + "epoch": 2.5195160031225603, + "grad_norm": 0.47926731629634395, + "learning_rate": 7.613380685846273e-07, + "loss": 0.5672, + "step": 6455 + }, + { + "epoch": 2.519906323185012, + "grad_norm": 0.4727675459658486, + "learning_rate": 7.601339542118841e-07, + "loss": 0.5195, + "step": 6456 + }, + { + "epoch": 2.520296643247463, + "grad_norm": 0.5116563351900626, + "learning_rate": 7.589307144529345e-07, + "loss": 0.5589, + "step": 6457 + }, + { + "epoch": 2.520686963309914, + "grad_norm": 0.4645999243093367, + "learning_rate": 7.577283495559867e-07, + "loss": 0.5786, + "step": 6458 + }, + { + "epoch": 2.5210772833723656, + "grad_norm": 0.43239168540646766, + "learning_rate": 7.565268597690673e-07, + "loss": 0.5793, + "step": 6459 + }, + { + "epoch": 2.5214676034348167, + "grad_norm": 0.44720496877957655, + "learning_rate": 7.553262453400261e-07, + "loss": 0.5948, + "step": 6460 + }, + { + "epoch": 2.521857923497268, + "grad_norm": 0.4910184472737486, + "learning_rate": 7.541265065165276e-07, + "loss": 0.5983, + "step": 6461 + }, + { + "epoch": 2.522248243559719, + "grad_norm": 0.4237775318029961, + "learning_rate": 7.529276435460603e-07, + "loss": 0.5361, + "step": 6462 + }, + { + "epoch": 2.52263856362217, + "grad_norm": 0.4778603521340726, + "learning_rate": 7.517296566759269e-07, + "loss": 0.5564, + "step": 6463 + }, + { + "epoch": 2.5230288836846215, + "grad_norm": 0.45378586963320006, + "learning_rate": 7.505325461532553e-07, + "loss": 0.5431, + "step": 6464 + }, + { + "epoch": 2.5234192037470726, + "grad_norm": 0.4500888803269447, + "learning_rate": 7.493363122249875e-07, + "loss": 0.6036, + "step": 6465 + }, + { + "epoch": 2.5238095238095237, + "grad_norm": 0.46951985211989367, + "learning_rate": 7.481409551378866e-07, + "loss": 0.5665, + "step": 6466 + }, + { + "epoch": 2.5241998438719753, + "grad_norm": 0.451928977492355, + "learning_rate": 7.469464751385347e-07, + "loss": 0.5692, + "step": 6467 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.4269332679459648, + "learning_rate": 7.457528724733349e-07, + "loss": 0.5748, + "step": 6468 + }, + { + "epoch": 2.5249804839968775, + "grad_norm": 0.42242386357225803, + "learning_rate": 7.445601473885062e-07, + "loss": 0.5359, + "step": 6469 + }, + { + "epoch": 2.5253708040593286, + "grad_norm": 0.43115846921705053, + "learning_rate": 7.433683001300867e-07, + "loss": 0.5442, + "step": 6470 + }, + { + "epoch": 2.5257611241217797, + "grad_norm": 0.4390538820397604, + "learning_rate": 7.421773309439373e-07, + "loss": 0.5229, + "step": 6471 + }, + { + "epoch": 2.526151444184231, + "grad_norm": 0.4051884398252167, + "learning_rate": 7.409872400757323e-07, + "loss": 0.5841, + "step": 6472 + }, + { + "epoch": 2.5265417642466823, + "grad_norm": 0.6061077896106773, + "learning_rate": 7.39798027770971e-07, + "loss": 0.586, + "step": 6473 + }, + { + "epoch": 2.5269320843091334, + "grad_norm": 0.50436083257864, + "learning_rate": 7.386096942749649e-07, + "loss": 0.5903, + "step": 6474 + }, + { + "epoch": 2.527322404371585, + "grad_norm": 0.47053651366771326, + "learning_rate": 7.374222398328512e-07, + "loss": 0.5371, + "step": 6475 + }, + { + "epoch": 2.527712724434036, + "grad_norm": 0.46407558787784514, + "learning_rate": 7.362356646895774e-07, + "loss": 0.5923, + "step": 6476 + }, + { + "epoch": 2.528103044496487, + "grad_norm": 0.45513355381720966, + "learning_rate": 7.350499690899177e-07, + "loss": 0.5822, + "step": 6477 + }, + { + "epoch": 2.5284933645589383, + "grad_norm": 0.4778718079957872, + "learning_rate": 7.338651532784586e-07, + "loss": 0.5893, + "step": 6478 + }, + { + "epoch": 2.5288836846213893, + "grad_norm": 0.5180396949386107, + "learning_rate": 7.3268121749961e-07, + "loss": 0.5743, + "step": 6479 + }, + { + "epoch": 2.529274004683841, + "grad_norm": 0.4682941642143957, + "learning_rate": 7.314981619975975e-07, + "loss": 0.583, + "step": 6480 + }, + { + "epoch": 2.529664324746292, + "grad_norm": 0.5037303658627934, + "learning_rate": 7.303159870164661e-07, + "loss": 0.556, + "step": 6481 + }, + { + "epoch": 2.530054644808743, + "grad_norm": 0.4460072901290168, + "learning_rate": 7.291346928000786e-07, + "loss": 0.5602, + "step": 6482 + }, + { + "epoch": 2.5304449648711946, + "grad_norm": 0.43075079640437497, + "learning_rate": 7.279542795921152e-07, + "loss": 0.5839, + "step": 6483 + }, + { + "epoch": 2.5308352849336457, + "grad_norm": 0.4235051350082845, + "learning_rate": 7.267747476360775e-07, + "loss": 0.5518, + "step": 6484 + }, + { + "epoch": 2.531225604996097, + "grad_norm": 0.42727016505400733, + "learning_rate": 7.255960971752829e-07, + "loss": 0.555, + "step": 6485 + }, + { + "epoch": 2.531615925058548, + "grad_norm": 0.4363235355078226, + "learning_rate": 7.244183284528661e-07, + "loss": 0.6228, + "step": 6486 + }, + { + "epoch": 2.532006245120999, + "grad_norm": 0.43499882581573324, + "learning_rate": 7.232414417117812e-07, + "loss": 0.5337, + "step": 6487 + }, + { + "epoch": 2.5323965651834506, + "grad_norm": 0.6095862001584131, + "learning_rate": 7.220654371948016e-07, + "loss": 0.5813, + "step": 6488 + }, + { + "epoch": 2.5327868852459017, + "grad_norm": 0.4280656153283205, + "learning_rate": 7.208903151445163e-07, + "loss": 0.5635, + "step": 6489 + }, + { + "epoch": 2.5331772053083528, + "grad_norm": 0.45162575416421913, + "learning_rate": 7.19716075803335e-07, + "loss": 0.5383, + "step": 6490 + }, + { + "epoch": 2.5335675253708043, + "grad_norm": 0.43038097029124656, + "learning_rate": 7.185427194134814e-07, + "loss": 0.6012, + "step": 6491 + }, + { + "epoch": 2.5339578454332554, + "grad_norm": 0.42296209036625204, + "learning_rate": 7.173702462170024e-07, + "loss": 0.5835, + "step": 6492 + }, + { + "epoch": 2.5343481654957065, + "grad_norm": 0.4811598362223058, + "learning_rate": 7.161986564557577e-07, + "loss": 0.5652, + "step": 6493 + }, + { + "epoch": 2.5347384855581576, + "grad_norm": 0.5035212132324195, + "learning_rate": 7.150279503714269e-07, + "loss": 0.5542, + "step": 6494 + }, + { + "epoch": 2.5351288056206087, + "grad_norm": 0.4005979148693518, + "learning_rate": 7.138581282055063e-07, + "loss": 0.5548, + "step": 6495 + }, + { + "epoch": 2.5355191256830603, + "grad_norm": 0.43558607842138564, + "learning_rate": 7.126891901993127e-07, + "loss": 0.5753, + "step": 6496 + }, + { + "epoch": 2.5359094457455114, + "grad_norm": 0.4487464823374877, + "learning_rate": 7.115211365939773e-07, + "loss": 0.5617, + "step": 6497 + }, + { + "epoch": 2.5362997658079625, + "grad_norm": 0.4497166782704212, + "learning_rate": 7.103539676304511e-07, + "loss": 0.5846, + "step": 6498 + }, + { + "epoch": 2.536690085870414, + "grad_norm": 0.6286909616763701, + "learning_rate": 7.09187683549501e-07, + "loss": 0.5957, + "step": 6499 + }, + { + "epoch": 2.537080405932865, + "grad_norm": 0.5036509856457461, + "learning_rate": 7.080222845917112e-07, + "loss": 0.5608, + "step": 6500 + }, + { + "epoch": 2.537470725995316, + "grad_norm": 0.4545437454064145, + "learning_rate": 7.068577709974856e-07, + "loss": 0.5404, + "step": 6501 + }, + { + "epoch": 2.5378610460577673, + "grad_norm": 0.5043560417836014, + "learning_rate": 7.056941430070435e-07, + "loss": 0.5869, + "step": 6502 + }, + { + "epoch": 2.5382513661202184, + "grad_norm": 0.43432050086611773, + "learning_rate": 7.045314008604221e-07, + "loss": 0.5304, + "step": 6503 + }, + { + "epoch": 2.53864168618267, + "grad_norm": 0.43447670476526634, + "learning_rate": 7.033695447974736e-07, + "loss": 0.5771, + "step": 6504 + }, + { + "epoch": 2.539032006245121, + "grad_norm": 0.42620309628564035, + "learning_rate": 7.022085750578733e-07, + "loss": 0.5494, + "step": 6505 + }, + { + "epoch": 2.539422326307572, + "grad_norm": 0.46644235356185154, + "learning_rate": 7.010484918811061e-07, + "loss": 0.5991, + "step": 6506 + }, + { + "epoch": 2.5398126463700237, + "grad_norm": 0.44542898707919937, + "learning_rate": 6.998892955064817e-07, + "loss": 0.5548, + "step": 6507 + }, + { + "epoch": 2.540202966432475, + "grad_norm": 0.46019373748524195, + "learning_rate": 6.987309861731195e-07, + "loss": 0.5515, + "step": 6508 + }, + { + "epoch": 2.540593286494926, + "grad_norm": 0.4157238086558384, + "learning_rate": 6.975735641199621e-07, + "loss": 0.5414, + "step": 6509 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.4498107203432258, + "learning_rate": 6.964170295857658e-07, + "loss": 0.5317, + "step": 6510 + }, + { + "epoch": 2.541373926619828, + "grad_norm": 0.4492677692301102, + "learning_rate": 6.952613828091037e-07, + "loss": 0.5522, + "step": 6511 + }, + { + "epoch": 2.5417642466822796, + "grad_norm": 0.43544410503668995, + "learning_rate": 6.941066240283656e-07, + "loss": 0.5112, + "step": 6512 + }, + { + "epoch": 2.5421545667447307, + "grad_norm": 0.4374540951188568, + "learning_rate": 6.929527534817609e-07, + "loss": 0.5778, + "step": 6513 + }, + { + "epoch": 2.542544886807182, + "grad_norm": 0.45950715575750023, + "learning_rate": 6.917997714073127e-07, + "loss": 0.5406, + "step": 6514 + }, + { + "epoch": 2.5429352068696334, + "grad_norm": 0.43622406252574836, + "learning_rate": 6.906476780428634e-07, + "loss": 0.6072, + "step": 6515 + }, + { + "epoch": 2.5433255269320845, + "grad_norm": 0.5044698545385707, + "learning_rate": 6.894964736260695e-07, + "loss": 0.5809, + "step": 6516 + }, + { + "epoch": 2.5437158469945356, + "grad_norm": 0.42770518626387954, + "learning_rate": 6.883461583944046e-07, + "loss": 0.5181, + "step": 6517 + }, + { + "epoch": 2.5441061670569867, + "grad_norm": 0.45620417391802065, + "learning_rate": 6.871967325851613e-07, + "loss": 0.5449, + "step": 6518 + }, + { + "epoch": 2.5444964871194378, + "grad_norm": 0.44653923636269155, + "learning_rate": 6.86048196435447e-07, + "loss": 0.5411, + "step": 6519 + }, + { + "epoch": 2.5448868071818893, + "grad_norm": 0.4267406050353522, + "learning_rate": 6.849005501821843e-07, + "loss": 0.582, + "step": 6520 + }, + { + "epoch": 2.5452771272443404, + "grad_norm": 0.4294197814916062, + "learning_rate": 6.83753794062113e-07, + "loss": 0.5637, + "step": 6521 + }, + { + "epoch": 2.5456674473067915, + "grad_norm": 0.4530216996757444, + "learning_rate": 6.82607928311792e-07, + "loss": 0.5521, + "step": 6522 + }, + { + "epoch": 2.546057767369243, + "grad_norm": 0.6119956679086795, + "learning_rate": 6.814629531675926e-07, + "loss": 0.5799, + "step": 6523 + }, + { + "epoch": 2.546448087431694, + "grad_norm": 0.4329813454525157, + "learning_rate": 6.803188688657053e-07, + "loss": 0.5201, + "step": 6524 + }, + { + "epoch": 2.5468384074941453, + "grad_norm": 0.438729737287272, + "learning_rate": 6.791756756421341e-07, + "loss": 0.6125, + "step": 6525 + }, + { + "epoch": 2.5472287275565964, + "grad_norm": 0.4471084081123204, + "learning_rate": 6.780333737327027e-07, + "loss": 0.5531, + "step": 6526 + }, + { + "epoch": 2.5476190476190474, + "grad_norm": 0.4689018921463512, + "learning_rate": 6.76891963373047e-07, + "loss": 0.5379, + "step": 6527 + }, + { + "epoch": 2.548009367681499, + "grad_norm": 0.48307096464041843, + "learning_rate": 6.757514447986241e-07, + "loss": 0.5376, + "step": 6528 + }, + { + "epoch": 2.54839968774395, + "grad_norm": 0.47567065305313694, + "learning_rate": 6.746118182446992e-07, + "loss": 0.5408, + "step": 6529 + }, + { + "epoch": 2.548790007806401, + "grad_norm": 0.5190313450761627, + "learning_rate": 6.734730839463627e-07, + "loss": 0.5401, + "step": 6530 + }, + { + "epoch": 2.5491803278688527, + "grad_norm": 0.4846058914347089, + "learning_rate": 6.723352421385132e-07, + "loss": 0.5623, + "step": 6531 + }, + { + "epoch": 2.549570647931304, + "grad_norm": 0.4334028108187198, + "learning_rate": 6.711982930558713e-07, + "loss": 0.5897, + "step": 6532 + }, + { + "epoch": 2.549960967993755, + "grad_norm": 0.45234915266187614, + "learning_rate": 6.700622369329696e-07, + "loss": 0.5812, + "step": 6533 + }, + { + "epoch": 2.550351288056206, + "grad_norm": 0.47158186714812056, + "learning_rate": 6.689270740041564e-07, + "loss": 0.5685, + "step": 6534 + }, + { + "epoch": 2.550741608118657, + "grad_norm": 0.4668767300628664, + "learning_rate": 6.677928045035986e-07, + "loss": 0.5574, + "step": 6535 + }, + { + "epoch": 2.5511319281811087, + "grad_norm": 0.4398398038740151, + "learning_rate": 6.666594286652756e-07, + "loss": 0.559, + "step": 6536 + }, + { + "epoch": 2.5515222482435598, + "grad_norm": 0.44103391623236937, + "learning_rate": 6.655269467229869e-07, + "loss": 0.5779, + "step": 6537 + }, + { + "epoch": 2.551912568306011, + "grad_norm": 0.4349221370373394, + "learning_rate": 6.643953589103408e-07, + "loss": 0.5597, + "step": 6538 + }, + { + "epoch": 2.552302888368462, + "grad_norm": 0.5813932275744527, + "learning_rate": 6.632646654607672e-07, + "loss": 0.5629, + "step": 6539 + }, + { + "epoch": 2.552693208430913, + "grad_norm": 0.40479093363402496, + "learning_rate": 6.621348666075084e-07, + "loss": 0.5871, + "step": 6540 + }, + { + "epoch": 2.5530835284933646, + "grad_norm": 0.4616843741721254, + "learning_rate": 6.610059625836246e-07, + "loss": 0.5159, + "step": 6541 + }, + { + "epoch": 2.5534738485558157, + "grad_norm": 0.4373340107965263, + "learning_rate": 6.598779536219879e-07, + "loss": 0.5362, + "step": 6542 + }, + { + "epoch": 2.553864168618267, + "grad_norm": 0.42790164092408156, + "learning_rate": 6.5875083995529e-07, + "loss": 0.5916, + "step": 6543 + }, + { + "epoch": 2.5542544886807184, + "grad_norm": 0.48084657643160317, + "learning_rate": 6.576246218160348e-07, + "loss": 0.5449, + "step": 6544 + }, + { + "epoch": 2.5546448087431695, + "grad_norm": 0.4378115237903379, + "learning_rate": 6.564992994365416e-07, + "loss": 0.573, + "step": 6545 + }, + { + "epoch": 2.5550351288056206, + "grad_norm": 0.42390324549867797, + "learning_rate": 6.553748730489451e-07, + "loss": 0.5748, + "step": 6546 + }, + { + "epoch": 2.5554254488680717, + "grad_norm": 0.4161404773899109, + "learning_rate": 6.542513428851982e-07, + "loss": 0.5744, + "step": 6547 + }, + { + "epoch": 2.5558157689305228, + "grad_norm": 0.49613334616173, + "learning_rate": 6.531287091770649e-07, + "loss": 0.5698, + "step": 6548 + }, + { + "epoch": 2.5562060889929743, + "grad_norm": 0.504999149114274, + "learning_rate": 6.520069721561251e-07, + "loss": 0.5481, + "step": 6549 + }, + { + "epoch": 2.5565964090554254, + "grad_norm": 0.49640346833906596, + "learning_rate": 6.508861320537763e-07, + "loss": 0.5589, + "step": 6550 + }, + { + "epoch": 2.5569867291178765, + "grad_norm": 0.43573756988159584, + "learning_rate": 6.497661891012269e-07, + "loss": 0.559, + "step": 6551 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 0.4224763940809357, + "learning_rate": 6.486471435295055e-07, + "loss": 0.6062, + "step": 6552 + }, + { + "epoch": 2.557767369242779, + "grad_norm": 0.4703489024902764, + "learning_rate": 6.475289955694498e-07, + "loss": 0.5083, + "step": 6553 + }, + { + "epoch": 2.5581576893052302, + "grad_norm": 0.4500070159252445, + "learning_rate": 6.464117454517182e-07, + "loss": 0.5801, + "step": 6554 + }, + { + "epoch": 2.5585480093676813, + "grad_norm": 0.4666692450967314, + "learning_rate": 6.452953934067768e-07, + "loss": 0.5634, + "step": 6555 + }, + { + "epoch": 2.5589383294301324, + "grad_norm": 0.4529250994851492, + "learning_rate": 6.441799396649134e-07, + "loss": 0.5777, + "step": 6556 + }, + { + "epoch": 2.559328649492584, + "grad_norm": 0.4502302029031409, + "learning_rate": 6.430653844562256e-07, + "loss": 0.5775, + "step": 6557 + }, + { + "epoch": 2.559718969555035, + "grad_norm": 0.4332424219351853, + "learning_rate": 6.419517280106297e-07, + "loss": 0.5409, + "step": 6558 + }, + { + "epoch": 2.560109289617486, + "grad_norm": 0.4671468753517293, + "learning_rate": 6.408389705578527e-07, + "loss": 0.5764, + "step": 6559 + }, + { + "epoch": 2.5604996096799377, + "grad_norm": 0.43910627508887246, + "learning_rate": 6.39727112327439e-07, + "loss": 0.5576, + "step": 6560 + }, + { + "epoch": 2.560889929742389, + "grad_norm": 0.41903780054130874, + "learning_rate": 6.386161535487468e-07, + "loss": 0.5849, + "step": 6561 + }, + { + "epoch": 2.56128024980484, + "grad_norm": 0.4464175334611642, + "learning_rate": 6.375060944509465e-07, + "loss": 0.5542, + "step": 6562 + }, + { + "epoch": 2.561670569867291, + "grad_norm": 0.4395953399217729, + "learning_rate": 6.363969352630267e-07, + "loss": 0.5872, + "step": 6563 + }, + { + "epoch": 2.562060889929742, + "grad_norm": 0.4655066151998905, + "learning_rate": 6.352886762137883e-07, + "loss": 0.5482, + "step": 6564 + }, + { + "epoch": 2.5624512099921937, + "grad_norm": 0.4413739441915263, + "learning_rate": 6.341813175318456e-07, + "loss": 0.5813, + "step": 6565 + }, + { + "epoch": 2.5628415300546448, + "grad_norm": 0.4038467580670441, + "learning_rate": 6.330748594456282e-07, + "loss": 0.604, + "step": 6566 + }, + { + "epoch": 2.563231850117096, + "grad_norm": 0.49036949910785244, + "learning_rate": 6.319693021833818e-07, + "loss": 0.5535, + "step": 6567 + }, + { + "epoch": 2.5636221701795474, + "grad_norm": 0.5654921835944716, + "learning_rate": 6.308646459731616e-07, + "loss": 0.5851, + "step": 6568 + }, + { + "epoch": 2.5640124902419985, + "grad_norm": 0.444900624332623, + "learning_rate": 6.297608910428432e-07, + "loss": 0.5656, + "step": 6569 + }, + { + "epoch": 2.5644028103044496, + "grad_norm": 0.4195075451997835, + "learning_rate": 6.286580376201101e-07, + "loss": 0.5624, + "step": 6570 + }, + { + "epoch": 2.5647931303669007, + "grad_norm": 0.4512543012384848, + "learning_rate": 6.275560859324642e-07, + "loss": 0.5365, + "step": 6571 + }, + { + "epoch": 2.565183450429352, + "grad_norm": 0.4580099861041835, + "learning_rate": 6.264550362072197e-07, + "loss": 0.5777, + "step": 6572 + }, + { + "epoch": 2.5655737704918034, + "grad_norm": 0.40621838020832474, + "learning_rate": 6.253548886715044e-07, + "loss": 0.5711, + "step": 6573 + }, + { + "epoch": 2.5659640905542545, + "grad_norm": 0.4432222238928616, + "learning_rate": 6.24255643552259e-07, + "loss": 0.5574, + "step": 6574 + }, + { + "epoch": 2.5663544106167056, + "grad_norm": 0.442079548594655, + "learning_rate": 6.231573010762421e-07, + "loss": 0.5721, + "step": 6575 + }, + { + "epoch": 2.566744730679157, + "grad_norm": 0.49385330773954583, + "learning_rate": 6.220598614700213e-07, + "loss": 0.5527, + "step": 6576 + }, + { + "epoch": 2.567135050741608, + "grad_norm": 0.46849282785711405, + "learning_rate": 6.209633249599823e-07, + "loss": 0.5707, + "step": 6577 + }, + { + "epoch": 2.5675253708040593, + "grad_norm": 0.42250766544012947, + "learning_rate": 6.198676917723206e-07, + "loss": 0.5847, + "step": 6578 + }, + { + "epoch": 2.5679156908665104, + "grad_norm": 0.41072149657076285, + "learning_rate": 6.187729621330468e-07, + "loss": 0.5633, + "step": 6579 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.39826688486107187, + "learning_rate": 6.176791362679874e-07, + "loss": 0.6003, + "step": 6580 + }, + { + "epoch": 2.568696330991413, + "grad_norm": 0.4362389883629815, + "learning_rate": 6.165862144027795e-07, + "loss": 0.5391, + "step": 6581 + }, + { + "epoch": 2.569086651053864, + "grad_norm": 0.4637452335060108, + "learning_rate": 6.154941967628742e-07, + "loss": 0.553, + "step": 6582 + }, + { + "epoch": 2.5694769711163152, + "grad_norm": 0.43223336471529444, + "learning_rate": 6.144030835735354e-07, + "loss": 0.5905, + "step": 6583 + }, + { + "epoch": 2.5698672911787668, + "grad_norm": 0.5851685040711734, + "learning_rate": 6.133128750598444e-07, + "loss": 0.5528, + "step": 6584 + }, + { + "epoch": 2.570257611241218, + "grad_norm": 0.46423097833431204, + "learning_rate": 6.122235714466907e-07, + "loss": 0.5836, + "step": 6585 + }, + { + "epoch": 2.570647931303669, + "grad_norm": 0.4567195177878257, + "learning_rate": 6.111351729587816e-07, + "loss": 0.5335, + "step": 6586 + }, + { + "epoch": 2.57103825136612, + "grad_norm": 0.41726227522846737, + "learning_rate": 6.100476798206334e-07, + "loss": 0.5559, + "step": 6587 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.48092468903144464, + "learning_rate": 6.089610922565797e-07, + "loss": 0.5602, + "step": 6588 + }, + { + "epoch": 2.5718188914910227, + "grad_norm": 0.4943578632778904, + "learning_rate": 6.078754104907647e-07, + "loss": 0.5337, + "step": 6589 + }, + { + "epoch": 2.572209211553474, + "grad_norm": 0.43946485152758813, + "learning_rate": 6.067906347471463e-07, + "loss": 0.551, + "step": 6590 + }, + { + "epoch": 2.572599531615925, + "grad_norm": 0.4222600952724034, + "learning_rate": 6.057067652494952e-07, + "loss": 0.5841, + "step": 6591 + }, + { + "epoch": 2.5729898516783765, + "grad_norm": 0.4861707840032372, + "learning_rate": 6.046238022213974e-07, + "loss": 0.5666, + "step": 6592 + }, + { + "epoch": 2.5733801717408276, + "grad_norm": 0.4051475577458495, + "learning_rate": 6.035417458862474e-07, + "loss": 0.5791, + "step": 6593 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 0.4613035368719072, + "learning_rate": 6.02460596467258e-07, + "loss": 0.5848, + "step": 6594 + }, + { + "epoch": 2.5741608118657298, + "grad_norm": 0.4266144541872911, + "learning_rate": 6.013803541874518e-07, + "loss": 0.5518, + "step": 6595 + }, + { + "epoch": 2.574551131928181, + "grad_norm": 0.462799996042729, + "learning_rate": 6.003010192696629e-07, + "loss": 0.5664, + "step": 6596 + }, + { + "epoch": 2.5749414519906324, + "grad_norm": 0.49984745308119816, + "learning_rate": 5.99222591936543e-07, + "loss": 0.5907, + "step": 6597 + }, + { + "epoch": 2.5753317720530835, + "grad_norm": 0.47357626555554855, + "learning_rate": 5.98145072410552e-07, + "loss": 0.5853, + "step": 6598 + }, + { + "epoch": 2.5757220921155346, + "grad_norm": 0.4168555932766435, + "learning_rate": 5.970684609139643e-07, + "loss": 0.5842, + "step": 6599 + }, + { + "epoch": 2.576112412177986, + "grad_norm": 0.4784144710348097, + "learning_rate": 5.959927576688662e-07, + "loss": 0.5614, + "step": 6600 + }, + { + "epoch": 2.5765027322404372, + "grad_norm": 0.6464675248488808, + "learning_rate": 5.949179628971597e-07, + "loss": 0.5679, + "step": 6601 + }, + { + "epoch": 2.5768930523028883, + "grad_norm": 0.45350232118391187, + "learning_rate": 5.938440768205544e-07, + "loss": 0.5819, + "step": 6602 + }, + { + "epoch": 2.5772833723653394, + "grad_norm": 0.4237911495720786, + "learning_rate": 5.927710996605773e-07, + "loss": 0.564, + "step": 6603 + }, + { + "epoch": 2.5776736924277905, + "grad_norm": 0.48577573040098465, + "learning_rate": 5.91699031638564e-07, + "loss": 0.5478, + "step": 6604 + }, + { + "epoch": 2.578064012490242, + "grad_norm": 0.4960489762444249, + "learning_rate": 5.906278729756659e-07, + "loss": 0.5832, + "step": 6605 + }, + { + "epoch": 2.578454332552693, + "grad_norm": 0.49323076993933124, + "learning_rate": 5.895576238928435e-07, + "loss": 0.5302, + "step": 6606 + }, + { + "epoch": 2.5788446526151443, + "grad_norm": 0.42604617413415213, + "learning_rate": 5.884882846108747e-07, + "loss": 0.5888, + "step": 6607 + }, + { + "epoch": 2.579234972677596, + "grad_norm": 0.44275916051838704, + "learning_rate": 5.874198553503418e-07, + "loss": 0.5459, + "step": 6608 + }, + { + "epoch": 2.579625292740047, + "grad_norm": 0.43301868980129515, + "learning_rate": 5.86352336331647e-07, + "loss": 0.5598, + "step": 6609 + }, + { + "epoch": 2.580015612802498, + "grad_norm": 0.5007431051817091, + "learning_rate": 5.852857277749996e-07, + "loss": 0.5451, + "step": 6610 + }, + { + "epoch": 2.580405932864949, + "grad_norm": 0.47322685290143307, + "learning_rate": 5.842200299004259e-07, + "loss": 0.5949, + "step": 6611 + }, + { + "epoch": 2.5807962529274002, + "grad_norm": 0.4936840985785919, + "learning_rate": 5.831552429277604e-07, + "loss": 0.5906, + "step": 6612 + }, + { + "epoch": 2.5811865729898518, + "grad_norm": 0.44334608096952827, + "learning_rate": 5.820913670766493e-07, + "loss": 0.6017, + "step": 6613 + }, + { + "epoch": 2.581576893052303, + "grad_norm": 0.5079835560832854, + "learning_rate": 5.810284025665553e-07, + "loss": 0.5717, + "step": 6614 + }, + { + "epoch": 2.581967213114754, + "grad_norm": 0.46862172900864707, + "learning_rate": 5.799663496167473e-07, + "loss": 0.5608, + "step": 6615 + }, + { + "epoch": 2.5823575331772055, + "grad_norm": 0.4519293382057057, + "learning_rate": 5.789052084463131e-07, + "loss": 0.5652, + "step": 6616 + }, + { + "epoch": 2.5827478532396566, + "grad_norm": 0.48357506052782634, + "learning_rate": 5.778449792741442e-07, + "loss": 0.5429, + "step": 6617 + }, + { + "epoch": 2.5831381733021077, + "grad_norm": 0.45894386249909197, + "learning_rate": 5.767856623189511e-07, + "loss": 0.5541, + "step": 6618 + }, + { + "epoch": 2.583528493364559, + "grad_norm": 0.42902434765562736, + "learning_rate": 5.757272577992507e-07, + "loss": 0.5598, + "step": 6619 + }, + { + "epoch": 2.58391881342701, + "grad_norm": 0.4198599845557855, + "learning_rate": 5.746697659333771e-07, + "loss": 0.5778, + "step": 6620 + }, + { + "epoch": 2.5843091334894615, + "grad_norm": 0.4660160859660187, + "learning_rate": 5.736131869394717e-07, + "loss": 0.5345, + "step": 6621 + }, + { + "epoch": 2.5846994535519126, + "grad_norm": 0.42364737549696124, + "learning_rate": 5.7255752103549e-07, + "loss": 0.5979, + "step": 6622 + }, + { + "epoch": 2.5850897736143637, + "grad_norm": 0.4790544210395985, + "learning_rate": 5.715027684391966e-07, + "loss": 0.5804, + "step": 6623 + }, + { + "epoch": 2.585480093676815, + "grad_norm": 0.49537557626085693, + "learning_rate": 5.704489293681715e-07, + "loss": 0.5526, + "step": 6624 + }, + { + "epoch": 2.5858704137392663, + "grad_norm": 0.4795465879239039, + "learning_rate": 5.693960040398034e-07, + "loss": 0.5773, + "step": 6625 + }, + { + "epoch": 2.5862607338017174, + "grad_norm": 0.4565419543400443, + "learning_rate": 5.68343992671293e-07, + "loss": 0.5622, + "step": 6626 + }, + { + "epoch": 2.5866510538641685, + "grad_norm": 0.4298915230038055, + "learning_rate": 5.672928954796519e-07, + "loss": 0.5459, + "step": 6627 + }, + { + "epoch": 2.5870413739266196, + "grad_norm": 0.4697351003745166, + "learning_rate": 5.662427126817061e-07, + "loss": 0.6109, + "step": 6628 + }, + { + "epoch": 2.587431693989071, + "grad_norm": 0.505225759479983, + "learning_rate": 5.651934444940893e-07, + "loss": 0.5862, + "step": 6629 + }, + { + "epoch": 2.5878220140515222, + "grad_norm": 0.485419270097663, + "learning_rate": 5.641450911332469e-07, + "loss": 0.5392, + "step": 6630 + }, + { + "epoch": 2.5882123341139733, + "grad_norm": 0.449247624846803, + "learning_rate": 5.630976528154402e-07, + "loss": 0.589, + "step": 6631 + }, + { + "epoch": 2.588602654176425, + "grad_norm": 0.45067603462320954, + "learning_rate": 5.620511297567349e-07, + "loss": 0.5448, + "step": 6632 + }, + { + "epoch": 2.588992974238876, + "grad_norm": 0.5387835635541046, + "learning_rate": 5.610055221730132e-07, + "loss": 0.5781, + "step": 6633 + }, + { + "epoch": 2.589383294301327, + "grad_norm": 0.4251936898337902, + "learning_rate": 5.59960830279967e-07, + "loss": 0.5577, + "step": 6634 + }, + { + "epoch": 2.589773614363778, + "grad_norm": 0.4829391946145222, + "learning_rate": 5.589170542930972e-07, + "loss": 0.5955, + "step": 6635 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.4523179693433049, + "learning_rate": 5.578741944277172e-07, + "loss": 0.5727, + "step": 6636 + }, + { + "epoch": 2.590554254488681, + "grad_norm": 0.4526013847084935, + "learning_rate": 5.568322508989537e-07, + "loss": 0.5315, + "step": 6637 + }, + { + "epoch": 2.590944574551132, + "grad_norm": 0.4184650499571489, + "learning_rate": 5.557912239217401e-07, + "loss": 0.5819, + "step": 6638 + }, + { + "epoch": 2.591334894613583, + "grad_norm": 0.5403295860761292, + "learning_rate": 5.547511137108247e-07, + "loss": 0.572, + "step": 6639 + }, + { + "epoch": 2.5917252146760346, + "grad_norm": 0.5229575013989555, + "learning_rate": 5.537119204807634e-07, + "loss": 0.5749, + "step": 6640 + }, + { + "epoch": 2.5921155347384857, + "grad_norm": 0.41803243871275486, + "learning_rate": 5.526736444459263e-07, + "loss": 0.561, + "step": 6641 + }, + { + "epoch": 2.5925058548009368, + "grad_norm": 0.4178306044577266, + "learning_rate": 5.516362858204915e-07, + "loss": 0.5804, + "step": 6642 + }, + { + "epoch": 2.592896174863388, + "grad_norm": 0.44281899179111983, + "learning_rate": 5.505998448184485e-07, + "loss": 0.5916, + "step": 6643 + }, + { + "epoch": 2.593286494925839, + "grad_norm": 0.44340185354103, + "learning_rate": 5.495643216535973e-07, + "loss": 0.5997, + "step": 6644 + }, + { + "epoch": 2.5936768149882905, + "grad_norm": 0.4846190599831029, + "learning_rate": 5.485297165395509e-07, + "loss": 0.5679, + "step": 6645 + }, + { + "epoch": 2.5940671350507416, + "grad_norm": 0.5023743770267167, + "learning_rate": 5.474960296897303e-07, + "loss": 0.5917, + "step": 6646 + }, + { + "epoch": 2.5944574551131927, + "grad_norm": 0.4538397449039807, + "learning_rate": 5.464632613173659e-07, + "loss": 0.5563, + "step": 6647 + }, + { + "epoch": 2.5948477751756442, + "grad_norm": 0.4631168830223657, + "learning_rate": 5.454314116355042e-07, + "loss": 0.5754, + "step": 6648 + }, + { + "epoch": 2.5952380952380953, + "grad_norm": 0.4688659412119375, + "learning_rate": 5.444004808569953e-07, + "loss": 0.5787, + "step": 6649 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.5326065414857609, + "learning_rate": 5.433704691945057e-07, + "loss": 0.5318, + "step": 6650 + }, + { + "epoch": 2.5960187353629975, + "grad_norm": 0.5258583047347288, + "learning_rate": 5.423413768605091e-07, + "loss": 0.5741, + "step": 6651 + }, + { + "epoch": 2.5964090554254486, + "grad_norm": 0.5155614090221113, + "learning_rate": 5.413132040672886e-07, + "loss": 0.5595, + "step": 6652 + }, + { + "epoch": 2.5967993754879, + "grad_norm": 0.43768970433894333, + "learning_rate": 5.402859510269398e-07, + "loss": 0.5613, + "step": 6653 + }, + { + "epoch": 2.5971896955503513, + "grad_norm": 0.45418930880794683, + "learning_rate": 5.392596179513687e-07, + "loss": 0.5622, + "step": 6654 + }, + { + "epoch": 2.5975800156128024, + "grad_norm": 0.4274911510901408, + "learning_rate": 5.382342050522899e-07, + "loss": 0.5223, + "step": 6655 + }, + { + "epoch": 2.597970335675254, + "grad_norm": 0.4646680592507667, + "learning_rate": 5.372097125412296e-07, + "loss": 0.5353, + "step": 6656 + }, + { + "epoch": 2.598360655737705, + "grad_norm": 0.47404548591958273, + "learning_rate": 5.361861406295227e-07, + "loss": 0.5625, + "step": 6657 + }, + { + "epoch": 2.598750975800156, + "grad_norm": 0.44034304942930186, + "learning_rate": 5.351634895283165e-07, + "loss": 0.5969, + "step": 6658 + }, + { + "epoch": 2.5991412958626072, + "grad_norm": 0.4365419000192421, + "learning_rate": 5.34141759448566e-07, + "loss": 0.5249, + "step": 6659 + }, + { + "epoch": 2.5995316159250583, + "grad_norm": 0.4384416444613834, + "learning_rate": 5.331209506010371e-07, + "loss": 0.5693, + "step": 6660 + }, + { + "epoch": 2.59992193598751, + "grad_norm": 0.4089350017061847, + "learning_rate": 5.321010631963053e-07, + "loss": 0.5846, + "step": 6661 + }, + { + "epoch": 2.600312256049961, + "grad_norm": 0.43381269531109296, + "learning_rate": 5.310820974447573e-07, + "loss": 0.5727, + "step": 6662 + }, + { + "epoch": 2.600702576112412, + "grad_norm": 0.45175762393531405, + "learning_rate": 5.300640535565882e-07, + "loss": 0.5639, + "step": 6663 + }, + { + "epoch": 2.6010928961748636, + "grad_norm": 0.4340271962337928, + "learning_rate": 5.290469317418029e-07, + "loss": 0.5699, + "step": 6664 + }, + { + "epoch": 2.6014832162373147, + "grad_norm": 0.5982695722835439, + "learning_rate": 5.28030732210218e-07, + "loss": 0.5646, + "step": 6665 + }, + { + "epoch": 2.601873536299766, + "grad_norm": 0.4188649842245143, + "learning_rate": 5.270154551714574e-07, + "loss": 0.5838, + "step": 6666 + }, + { + "epoch": 2.602263856362217, + "grad_norm": 0.4510834625133253, + "learning_rate": 5.260011008349569e-07, + "loss": 0.5984, + "step": 6667 + }, + { + "epoch": 2.602654176424668, + "grad_norm": 0.44144156602712353, + "learning_rate": 5.249876694099598e-07, + "loss": 0.629, + "step": 6668 + }, + { + "epoch": 2.6030444964871196, + "grad_norm": 0.4735749052442121, + "learning_rate": 5.239751611055227e-07, + "loss": 0.5492, + "step": 6669 + }, + { + "epoch": 2.6034348165495707, + "grad_norm": 0.4246378534119827, + "learning_rate": 5.229635761305051e-07, + "loss": 0.6323, + "step": 6670 + }, + { + "epoch": 2.6038251366120218, + "grad_norm": 0.41603747627276527, + "learning_rate": 5.219529146935836e-07, + "loss": 0.5294, + "step": 6671 + }, + { + "epoch": 2.6042154566744733, + "grad_norm": 0.4106290512152015, + "learning_rate": 5.209431770032386e-07, + "loss": 0.5652, + "step": 6672 + }, + { + "epoch": 2.6046057767369244, + "grad_norm": 0.4808261251134444, + "learning_rate": 5.19934363267764e-07, + "loss": 0.5827, + "step": 6673 + }, + { + "epoch": 2.6049960967993755, + "grad_norm": 0.4192171616359125, + "learning_rate": 5.189264736952598e-07, + "loss": 0.5867, + "step": 6674 + }, + { + "epoch": 2.6053864168618266, + "grad_norm": 0.43549442269054056, + "learning_rate": 5.179195084936372e-07, + "loss": 0.5461, + "step": 6675 + }, + { + "epoch": 2.6057767369242777, + "grad_norm": 0.4014979335581682, + "learning_rate": 5.169134678706172e-07, + "loss": 0.5765, + "step": 6676 + }, + { + "epoch": 2.6061670569867292, + "grad_norm": 0.44422382129348553, + "learning_rate": 5.159083520337277e-07, + "loss": 0.5815, + "step": 6677 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 0.47538983873061097, + "learning_rate": 5.149041611903105e-07, + "loss": 0.5752, + "step": 6678 + }, + { + "epoch": 2.6069476971116314, + "grad_norm": 0.48899908404621667, + "learning_rate": 5.139008955475089e-07, + "loss": 0.5522, + "step": 6679 + }, + { + "epoch": 2.607338017174083, + "grad_norm": 0.46358601925107656, + "learning_rate": 5.128985553122839e-07, + "loss": 0.5956, + "step": 6680 + }, + { + "epoch": 2.607728337236534, + "grad_norm": 0.4299262086745361, + "learning_rate": 5.118971406913986e-07, + "loss": 0.5969, + "step": 6681 + }, + { + "epoch": 2.608118657298985, + "grad_norm": 0.4230445171606793, + "learning_rate": 5.108966518914305e-07, + "loss": 0.5502, + "step": 6682 + }, + { + "epoch": 2.6085089773614363, + "grad_norm": 0.4264642093623739, + "learning_rate": 5.098970891187615e-07, + "loss": 0.5767, + "step": 6683 + }, + { + "epoch": 2.6088992974238874, + "grad_norm": 0.4583972002025202, + "learning_rate": 5.088984525795876e-07, + "loss": 0.5698, + "step": 6684 + }, + { + "epoch": 2.609289617486339, + "grad_norm": 0.4589317922251266, + "learning_rate": 5.079007424799082e-07, + "loss": 0.5654, + "step": 6685 + }, + { + "epoch": 2.60967993754879, + "grad_norm": 0.46113308883018345, + "learning_rate": 5.069039590255375e-07, + "loss": 0.5713, + "step": 6686 + }, + { + "epoch": 2.610070257611241, + "grad_norm": 0.47541442803683664, + "learning_rate": 5.059081024220913e-07, + "loss": 0.5585, + "step": 6687 + }, + { + "epoch": 2.6104605776736927, + "grad_norm": 0.4418495746561603, + "learning_rate": 5.049131728750012e-07, + "loss": 0.5945, + "step": 6688 + }, + { + "epoch": 2.6108508977361438, + "grad_norm": 0.41921312098162, + "learning_rate": 5.039191705895025e-07, + "loss": 0.5416, + "step": 6689 + }, + { + "epoch": 2.611241217798595, + "grad_norm": 0.40931409773837146, + "learning_rate": 5.029260957706439e-07, + "loss": 0.5582, + "step": 6690 + }, + { + "epoch": 2.611631537861046, + "grad_norm": 0.4205043066817088, + "learning_rate": 5.019339486232783e-07, + "loss": 0.5546, + "step": 6691 + }, + { + "epoch": 2.612021857923497, + "grad_norm": 0.43833205892671767, + "learning_rate": 5.009427293520691e-07, + "loss": 0.5232, + "step": 6692 + }, + { + "epoch": 2.6124121779859486, + "grad_norm": 0.44676470916804456, + "learning_rate": 4.999524381614895e-07, + "loss": 0.5354, + "step": 6693 + }, + { + "epoch": 2.6128024980483997, + "grad_norm": 0.410001840766815, + "learning_rate": 4.989630752558189e-07, + "loss": 0.5942, + "step": 6694 + }, + { + "epoch": 2.613192818110851, + "grad_norm": 0.5529229684122939, + "learning_rate": 4.979746408391484e-07, + "loss": 0.5712, + "step": 6695 + }, + { + "epoch": 2.6135831381733023, + "grad_norm": 0.42848042356135546, + "learning_rate": 4.969871351153727e-07, + "loss": 0.5288, + "step": 6696 + }, + { + "epoch": 2.6139734582357534, + "grad_norm": 0.4169611366890269, + "learning_rate": 4.960005582881994e-07, + "loss": 0.5621, + "step": 6697 + }, + { + "epoch": 2.6143637782982045, + "grad_norm": 0.4590608221825824, + "learning_rate": 4.950149105611424e-07, + "loss": 0.6131, + "step": 6698 + }, + { + "epoch": 2.6147540983606556, + "grad_norm": 0.4973162433166865, + "learning_rate": 4.940301921375256e-07, + "loss": 0.5893, + "step": 6699 + }, + { + "epoch": 2.6151444184231067, + "grad_norm": 0.4538573505469639, + "learning_rate": 4.930464032204779e-07, + "loss": 0.5883, + "step": 6700 + }, + { + "epoch": 2.6155347384855583, + "grad_norm": 0.42060061111207603, + "learning_rate": 4.920635440129412e-07, + "loss": 0.583, + "step": 6701 + }, + { + "epoch": 2.6159250585480094, + "grad_norm": 0.5284675746052826, + "learning_rate": 4.9108161471766e-07, + "loss": 0.5434, + "step": 6702 + }, + { + "epoch": 2.6163153786104605, + "grad_norm": 0.5309054235899077, + "learning_rate": 4.901006155371929e-07, + "loss": 0.5561, + "step": 6703 + }, + { + "epoch": 2.616705698672912, + "grad_norm": 0.4357643336633218, + "learning_rate": 4.891205466739024e-07, + "loss": 0.578, + "step": 6704 + }, + { + "epoch": 2.617096018735363, + "grad_norm": 0.4318040886390468, + "learning_rate": 4.881414083299607e-07, + "loss": 0.531, + "step": 6705 + }, + { + "epoch": 2.6174863387978142, + "grad_norm": 0.428694645980145, + "learning_rate": 4.871632007073457e-07, + "loss": 0.5688, + "step": 6706 + }, + { + "epoch": 2.6178766588602653, + "grad_norm": 0.4216751041085676, + "learning_rate": 4.861859240078487e-07, + "loss": 0.6004, + "step": 6707 + }, + { + "epoch": 2.6182669789227164, + "grad_norm": 0.40925268325332353, + "learning_rate": 4.852095784330635e-07, + "loss": 0.5448, + "step": 6708 + }, + { + "epoch": 2.618657298985168, + "grad_norm": 0.438231652175031, + "learning_rate": 4.84234164184394e-07, + "loss": 0.5586, + "step": 6709 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 0.4111450235346726, + "learning_rate": 4.832596814630531e-07, + "loss": 0.5794, + "step": 6710 + }, + { + "epoch": 2.61943793911007, + "grad_norm": 0.49437923866522265, + "learning_rate": 4.822861304700582e-07, + "loss": 0.5901, + "step": 6711 + }, + { + "epoch": 2.6198282591725217, + "grad_norm": 0.4532518416529962, + "learning_rate": 4.813135114062395e-07, + "loss": 0.5782, + "step": 6712 + }, + { + "epoch": 2.620218579234973, + "grad_norm": 0.49846928648110195, + "learning_rate": 4.803418244722303e-07, + "loss": 0.5358, + "step": 6713 + }, + { + "epoch": 2.620608899297424, + "grad_norm": 0.4094474677818032, + "learning_rate": 4.793710698684745e-07, + "loss": 0.5805, + "step": 6714 + }, + { + "epoch": 2.620999219359875, + "grad_norm": 0.4189546993802807, + "learning_rate": 4.784012477952205e-07, + "loss": 0.6043, + "step": 6715 + }, + { + "epoch": 2.621389539422326, + "grad_norm": 0.447666950350116, + "learning_rate": 4.774323584525287e-07, + "loss": 0.5906, + "step": 6716 + }, + { + "epoch": 2.6217798594847777, + "grad_norm": 0.55755364811507, + "learning_rate": 4.7646440204026366e-07, + "loss": 0.5666, + "step": 6717 + }, + { + "epoch": 2.6221701795472288, + "grad_norm": 0.6500607192604877, + "learning_rate": 4.7549737875809963e-07, + "loss": 0.5715, + "step": 6718 + }, + { + "epoch": 2.62256049960968, + "grad_norm": 0.4649747214325881, + "learning_rate": 4.745312888055159e-07, + "loss": 0.564, + "step": 6719 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.441619075341673, + "learning_rate": 4.7356613238180304e-07, + "loss": 0.5654, + "step": 6720 + }, + { + "epoch": 2.6233411397345825, + "grad_norm": 0.4348529543400768, + "learning_rate": 4.726019096860551e-07, + "loss": 0.5635, + "step": 6721 + }, + { + "epoch": 2.6237314597970336, + "grad_norm": 0.4545113059792999, + "learning_rate": 4.7163862091717525e-07, + "loss": 0.5752, + "step": 6722 + }, + { + "epoch": 2.6241217798594847, + "grad_norm": 0.4758629797389707, + "learning_rate": 4.7067626627387395e-07, + "loss": 0.5877, + "step": 6723 + }, + { + "epoch": 2.624512099921936, + "grad_norm": 0.46473353086509855, + "learning_rate": 4.6971484595466965e-07, + "loss": 0.523, + "step": 6724 + }, + { + "epoch": 2.6249024199843873, + "grad_norm": 0.433631001414026, + "learning_rate": 4.68754360157887e-07, + "loss": 0.5251, + "step": 6725 + }, + { + "epoch": 2.6252927400468384, + "grad_norm": 0.4643252994906244, + "learning_rate": 4.6779480908165695e-07, + "loss": 0.6003, + "step": 6726 + }, + { + "epoch": 2.6256830601092895, + "grad_norm": 0.4025639361067176, + "learning_rate": 4.668361929239212e-07, + "loss": 0.5815, + "step": 6727 + }, + { + "epoch": 2.626073380171741, + "grad_norm": 0.45306882393148606, + "learning_rate": 4.658785118824238e-07, + "loss": 0.5746, + "step": 6728 + }, + { + "epoch": 2.626463700234192, + "grad_norm": 0.43755052104234915, + "learning_rate": 4.6492176615472117e-07, + "loss": 0.5452, + "step": 6729 + }, + { + "epoch": 2.6268540202966433, + "grad_norm": 0.4370785808493946, + "learning_rate": 4.639659559381726e-07, + "loss": 0.5871, + "step": 6730 + }, + { + "epoch": 2.6272443403590944, + "grad_norm": 0.37585134728695213, + "learning_rate": 4.63011081429946e-07, + "loss": 0.6251, + "step": 6731 + }, + { + "epoch": 2.6276346604215455, + "grad_norm": 0.48683267038118744, + "learning_rate": 4.6205714282701486e-07, + "loss": 0.5165, + "step": 6732 + }, + { + "epoch": 2.628024980483997, + "grad_norm": 0.5203452295015443, + "learning_rate": 4.611041403261629e-07, + "loss": 0.5537, + "step": 6733 + }, + { + "epoch": 2.628415300546448, + "grad_norm": 0.4172504044007265, + "learning_rate": 4.601520741239768e-07, + "loss": 0.5319, + "step": 6734 + }, + { + "epoch": 2.628805620608899, + "grad_norm": 0.4345026583063145, + "learning_rate": 4.5920094441685384e-07, + "loss": 0.5547, + "step": 6735 + }, + { + "epoch": 2.6291959406713508, + "grad_norm": 0.44014309587049244, + "learning_rate": 4.582507514009943e-07, + "loss": 0.5295, + "step": 6736 + }, + { + "epoch": 2.629586260733802, + "grad_norm": 0.4310855274631763, + "learning_rate": 4.573014952724092e-07, + "loss": 0.5334, + "step": 6737 + }, + { + "epoch": 2.629976580796253, + "grad_norm": 0.42864599176273216, + "learning_rate": 4.563531762269136e-07, + "loss": 0.5653, + "step": 6738 + }, + { + "epoch": 2.630366900858704, + "grad_norm": 0.5020551806622258, + "learning_rate": 4.554057944601292e-07, + "loss": 0.5696, + "step": 6739 + }, + { + "epoch": 2.630757220921155, + "grad_norm": 0.42136223524001604, + "learning_rate": 4.5445935016748433e-07, + "loss": 0.5839, + "step": 6740 + }, + { + "epoch": 2.6311475409836067, + "grad_norm": 0.40894771943540786, + "learning_rate": 4.5351384354421714e-07, + "loss": 0.5546, + "step": 6741 + }, + { + "epoch": 2.631537861046058, + "grad_norm": 0.43172366632760656, + "learning_rate": 4.5256927478536826e-07, + "loss": 0.5535, + "step": 6742 + }, + { + "epoch": 2.631928181108509, + "grad_norm": 0.42341720122758014, + "learning_rate": 4.5162564408578633e-07, + "loss": 0.5844, + "step": 6743 + }, + { + "epoch": 2.6323185011709604, + "grad_norm": 0.4161638183648445, + "learning_rate": 4.5068295164012786e-07, + "loss": 0.5661, + "step": 6744 + }, + { + "epoch": 2.6327088212334115, + "grad_norm": 0.4533407544923187, + "learning_rate": 4.4974119764285285e-07, + "loss": 0.5855, + "step": 6745 + }, + { + "epoch": 2.6330991412958626, + "grad_norm": 0.4140965970100676, + "learning_rate": 4.48800382288232e-07, + "loss": 0.5793, + "step": 6746 + }, + { + "epoch": 2.6334894613583137, + "grad_norm": 0.505510166311169, + "learning_rate": 4.478605057703367e-07, + "loss": 0.5492, + "step": 6747 + }, + { + "epoch": 2.633879781420765, + "grad_norm": 0.6628596906777953, + "learning_rate": 4.469215682830519e-07, + "loss": 0.5775, + "step": 6748 + }, + { + "epoch": 2.6342701014832164, + "grad_norm": 0.42198000501112715, + "learning_rate": 4.4598357002006045e-07, + "loss": 0.5446, + "step": 6749 + }, + { + "epoch": 2.6346604215456675, + "grad_norm": 0.46123434324659673, + "learning_rate": 4.450465111748592e-07, + "loss": 0.5696, + "step": 6750 + }, + { + "epoch": 2.6350507416081186, + "grad_norm": 0.42440154106769357, + "learning_rate": 4.4411039194074466e-07, + "loss": 0.5645, + "step": 6751 + }, + { + "epoch": 2.63544106167057, + "grad_norm": 0.4448831185416889, + "learning_rate": 4.4317521251082574e-07, + "loss": 0.5683, + "step": 6752 + }, + { + "epoch": 2.6358313817330212, + "grad_norm": 0.43581274730332425, + "learning_rate": 4.4224097307801195e-07, + "loss": 0.5491, + "step": 6753 + }, + { + "epoch": 2.6362217017954723, + "grad_norm": 0.43914843976972506, + "learning_rate": 4.4130767383502303e-07, + "loss": 0.5465, + "step": 6754 + }, + { + "epoch": 2.6366120218579234, + "grad_norm": 0.4088552065330349, + "learning_rate": 4.403753149743828e-07, + "loss": 0.6084, + "step": 6755 + }, + { + "epoch": 2.6370023419203745, + "grad_norm": 0.4423796257050409, + "learning_rate": 4.394438966884196e-07, + "loss": 0.5774, + "step": 6756 + }, + { + "epoch": 2.637392661982826, + "grad_norm": 0.4364403089642391, + "learning_rate": 4.385134191692719e-07, + "loss": 0.5543, + "step": 6757 + }, + { + "epoch": 2.637782982045277, + "grad_norm": 0.5303317679038696, + "learning_rate": 4.375838826088813e-07, + "loss": 0.5963, + "step": 6758 + }, + { + "epoch": 2.6381733021077283, + "grad_norm": 0.3971055877064221, + "learning_rate": 4.366552871989949e-07, + "loss": 0.6007, + "step": 6759 + }, + { + "epoch": 2.63856362217018, + "grad_norm": 0.4316389402607404, + "learning_rate": 4.3572763313116553e-07, + "loss": 0.5643, + "step": 6760 + }, + { + "epoch": 2.638953942232631, + "grad_norm": 0.4210405859911962, + "learning_rate": 4.3480092059675515e-07, + "loss": 0.6007, + "step": 6761 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.4659705806484662, + "learning_rate": 4.33875149786927e-07, + "loss": 0.5645, + "step": 6762 + }, + { + "epoch": 2.639734582357533, + "grad_norm": 0.43576165258898547, + "learning_rate": 4.329503208926539e-07, + "loss": 0.5377, + "step": 6763 + }, + { + "epoch": 2.640124902419984, + "grad_norm": 0.4037208381850155, + "learning_rate": 4.320264341047109e-07, + "loss": 0.5729, + "step": 6764 + }, + { + "epoch": 2.6405152224824358, + "grad_norm": 0.4185287268243863, + "learning_rate": 4.311034896136829e-07, + "loss": 0.5561, + "step": 6765 + }, + { + "epoch": 2.640905542544887, + "grad_norm": 0.4188259155811069, + "learning_rate": 4.3018148760995584e-07, + "loss": 0.566, + "step": 6766 + }, + { + "epoch": 2.641295862607338, + "grad_norm": 0.4103042118131761, + "learning_rate": 4.2926042828372484e-07, + "loss": 0.5563, + "step": 6767 + }, + { + "epoch": 2.6416861826697895, + "grad_norm": 0.4982004862340884, + "learning_rate": 4.283403118249868e-07, + "loss": 0.5348, + "step": 6768 + }, + { + "epoch": 2.6420765027322406, + "grad_norm": 0.49139985702035366, + "learning_rate": 4.274211384235494e-07, + "loss": 0.536, + "step": 6769 + }, + { + "epoch": 2.6424668227946917, + "grad_norm": 0.4397040809656545, + "learning_rate": 4.265029082690203e-07, + "loss": 0.581, + "step": 6770 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.4293426248235209, + "learning_rate": 4.255856215508175e-07, + "loss": 0.5413, + "step": 6771 + }, + { + "epoch": 2.643247462919594, + "grad_norm": 0.4104275400603477, + "learning_rate": 4.246692784581602e-07, + "loss": 0.5544, + "step": 6772 + }, + { + "epoch": 2.6436377829820454, + "grad_norm": 0.4304582778547859, + "learning_rate": 4.237538791800744e-07, + "loss": 0.5646, + "step": 6773 + }, + { + "epoch": 2.6440281030444965, + "grad_norm": 0.4383107119076112, + "learning_rate": 4.2283942390539343e-07, + "loss": 0.5446, + "step": 6774 + }, + { + "epoch": 2.6444184231069476, + "grad_norm": 0.4134192858004631, + "learning_rate": 4.2192591282275377e-07, + "loss": 0.6003, + "step": 6775 + }, + { + "epoch": 2.644808743169399, + "grad_norm": 0.43577557212807444, + "learning_rate": 4.2101334612059674e-07, + "loss": 0.5612, + "step": 6776 + }, + { + "epoch": 2.6451990632318503, + "grad_norm": 0.4340529346798434, + "learning_rate": 4.201017239871691e-07, + "loss": 0.5688, + "step": 6777 + }, + { + "epoch": 2.6455893832943014, + "grad_norm": 0.4216931295798369, + "learning_rate": 4.1919104661052533e-07, + "loss": 0.5709, + "step": 6778 + }, + { + "epoch": 2.6459797033567525, + "grad_norm": 0.4443651842450562, + "learning_rate": 4.182813141785208e-07, + "loss": 0.5416, + "step": 6779 + }, + { + "epoch": 2.6463700234192036, + "grad_norm": 0.4278457756298061, + "learning_rate": 4.1737252687882034e-07, + "loss": 0.5566, + "step": 6780 + }, + { + "epoch": 2.646760343481655, + "grad_norm": 0.4243762283854319, + "learning_rate": 4.164646848988896e-07, + "loss": 0.569, + "step": 6781 + }, + { + "epoch": 2.647150663544106, + "grad_norm": 0.41732121326115534, + "learning_rate": 4.155577884260037e-07, + "loss": 0.5644, + "step": 6782 + }, + { + "epoch": 2.6475409836065573, + "grad_norm": 0.4041708629249281, + "learning_rate": 4.1465183764723806e-07, + "loss": 0.5947, + "step": 6783 + }, + { + "epoch": 2.647931303669009, + "grad_norm": 0.4285225939145941, + "learning_rate": 4.1374683274947646e-07, + "loss": 0.5564, + "step": 6784 + }, + { + "epoch": 2.64832162373146, + "grad_norm": 0.453586487412865, + "learning_rate": 4.1284277391940517e-07, + "loss": 0.5388, + "step": 6785 + }, + { + "epoch": 2.648711943793911, + "grad_norm": 0.4359575167943618, + "learning_rate": 4.119396613435178e-07, + "loss": 0.6034, + "step": 6786 + }, + { + "epoch": 2.649102263856362, + "grad_norm": 0.44084764917929803, + "learning_rate": 4.110374952081103e-07, + "loss": 0.5711, + "step": 6787 + }, + { + "epoch": 2.6494925839188133, + "grad_norm": 0.4208168092263393, + "learning_rate": 4.1013627569928603e-07, + "loss": 0.5638, + "step": 6788 + }, + { + "epoch": 2.649882903981265, + "grad_norm": 0.4537476486232837, + "learning_rate": 4.092360030029502e-07, + "loss": 0.5482, + "step": 6789 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.48540382206465554, + "learning_rate": 4.083366773048142e-07, + "loss": 0.5878, + "step": 6790 + }, + { + "epoch": 2.650663544106167, + "grad_norm": 0.442596065280918, + "learning_rate": 4.0743829879039463e-07, + "loss": 0.5959, + "step": 6791 + }, + { + "epoch": 2.651053864168618, + "grad_norm": 0.42756538171559333, + "learning_rate": 4.065408676450122e-07, + "loss": 0.5503, + "step": 6792 + }, + { + "epoch": 2.651444184231069, + "grad_norm": 0.43755079561861954, + "learning_rate": 4.0564438405379103e-07, + "loss": 0.557, + "step": 6793 + }, + { + "epoch": 2.6518345042935207, + "grad_norm": 0.42894203424324356, + "learning_rate": 4.0474884820166036e-07, + "loss": 0.5475, + "step": 6794 + }, + { + "epoch": 2.652224824355972, + "grad_norm": 0.4167331387498183, + "learning_rate": 4.038542602733564e-07, + "loss": 0.5622, + "step": 6795 + }, + { + "epoch": 2.652615144418423, + "grad_norm": 0.4420359740040687, + "learning_rate": 4.0296062045341587e-07, + "loss": 0.5841, + "step": 6796 + }, + { + "epoch": 2.6530054644808745, + "grad_norm": 0.42487183899242165, + "learning_rate": 4.020679289261831e-07, + "loss": 0.5812, + "step": 6797 + }, + { + "epoch": 2.6533957845433256, + "grad_norm": 0.43479466767267133, + "learning_rate": 4.01176185875804e-07, + "loss": 0.5861, + "step": 6798 + }, + { + "epoch": 2.6537861046057767, + "grad_norm": 0.43631934367506886, + "learning_rate": 4.00285391486232e-07, + "loss": 0.5758, + "step": 6799 + }, + { + "epoch": 2.654176424668228, + "grad_norm": 0.4198693574602889, + "learning_rate": 3.993955459412213e-07, + "loss": 0.5804, + "step": 6800 + }, + { + "epoch": 2.654566744730679, + "grad_norm": 0.4748144879320864, + "learning_rate": 3.985066494243356e-07, + "loss": 0.5506, + "step": 6801 + }, + { + "epoch": 2.6549570647931304, + "grad_norm": 0.4117690216862816, + "learning_rate": 3.9761870211893485e-07, + "loss": 0.5878, + "step": 6802 + }, + { + "epoch": 2.6553473848555815, + "grad_norm": 0.4311119704563589, + "learning_rate": 3.9673170420819087e-07, + "loss": 0.5313, + "step": 6803 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 0.4062655027220686, + "learning_rate": 3.958456558750762e-07, + "loss": 0.5677, + "step": 6804 + }, + { + "epoch": 2.656128024980484, + "grad_norm": 0.39871304360279375, + "learning_rate": 3.9496055730236624e-07, + "loss": 0.5307, + "step": 6805 + }, + { + "epoch": 2.6565183450429353, + "grad_norm": 0.4439946349591874, + "learning_rate": 3.940764086726445e-07, + "loss": 0.5821, + "step": 6806 + }, + { + "epoch": 2.6569086651053864, + "grad_norm": 0.40983434489067894, + "learning_rate": 3.9319321016829336e-07, + "loss": 0.5625, + "step": 6807 + }, + { + "epoch": 2.6572989851678375, + "grad_norm": 0.4450664600710099, + "learning_rate": 3.9231096197150485e-07, + "loss": 0.6086, + "step": 6808 + }, + { + "epoch": 2.6576893052302886, + "grad_norm": 0.4560635323359226, + "learning_rate": 3.914296642642701e-07, + "loss": 0.5626, + "step": 6809 + }, + { + "epoch": 2.65807962529274, + "grad_norm": 0.4200325611083806, + "learning_rate": 3.9054931722838806e-07, + "loss": 0.5313, + "step": 6810 + }, + { + "epoch": 2.658469945355191, + "grad_norm": 0.4425294974257425, + "learning_rate": 3.896699210454574e-07, + "loss": 0.5691, + "step": 6811 + }, + { + "epoch": 2.6588602654176423, + "grad_norm": 0.41584192927374763, + "learning_rate": 3.8879147589688415e-07, + "loss": 0.561, + "step": 6812 + }, + { + "epoch": 2.659250585480094, + "grad_norm": 0.3802291855151377, + "learning_rate": 3.879139819638761e-07, + "loss": 0.5941, + "step": 6813 + }, + { + "epoch": 2.659640905542545, + "grad_norm": 0.45725455753240485, + "learning_rate": 3.8703743942744674e-07, + "loss": 0.559, + "step": 6814 + }, + { + "epoch": 2.660031225604996, + "grad_norm": 0.4346558131380635, + "learning_rate": 3.8616184846841154e-07, + "loss": 0.5841, + "step": 6815 + }, + { + "epoch": 2.660421545667447, + "grad_norm": 0.5569586659330915, + "learning_rate": 3.8528720926739096e-07, + "loss": 0.574, + "step": 6816 + }, + { + "epoch": 2.6608118657298983, + "grad_norm": 0.43724836614753254, + "learning_rate": 3.844135220048084e-07, + "loss": 0.5825, + "step": 6817 + }, + { + "epoch": 2.66120218579235, + "grad_norm": 0.46895510708840943, + "learning_rate": 3.8354078686088924e-07, + "loss": 0.5758, + "step": 6818 + }, + { + "epoch": 2.661592505854801, + "grad_norm": 0.43475234687316466, + "learning_rate": 3.826690040156672e-07, + "loss": 0.5768, + "step": 6819 + }, + { + "epoch": 2.661982825917252, + "grad_norm": 0.4460367882331678, + "learning_rate": 3.8179817364897455e-07, + "loss": 0.5993, + "step": 6820 + }, + { + "epoch": 2.6623731459797035, + "grad_norm": 0.4719520762185989, + "learning_rate": 3.809282959404498e-07, + "loss": 0.5952, + "step": 6821 + }, + { + "epoch": 2.6627634660421546, + "grad_norm": 0.451603618558961, + "learning_rate": 3.8005937106953275e-07, + "loss": 0.5554, + "step": 6822 + }, + { + "epoch": 2.6631537861046057, + "grad_norm": 0.43583416916155443, + "learning_rate": 3.7919139921547056e-07, + "loss": 0.5286, + "step": 6823 + }, + { + "epoch": 2.663544106167057, + "grad_norm": 0.44454591923839953, + "learning_rate": 3.783243805573095e-07, + "loss": 0.5698, + "step": 6824 + }, + { + "epoch": 2.663934426229508, + "grad_norm": 0.43030862860756736, + "learning_rate": 3.774583152739025e-07, + "loss": 0.5362, + "step": 6825 + }, + { + "epoch": 2.6643247462919595, + "grad_norm": 0.47658119484423117, + "learning_rate": 3.765932035439024e-07, + "loss": 0.5675, + "step": 6826 + }, + { + "epoch": 2.6647150663544106, + "grad_norm": 0.4091225761767305, + "learning_rate": 3.757290455457713e-07, + "loss": 0.5619, + "step": 6827 + }, + { + "epoch": 2.6651053864168617, + "grad_norm": 0.4149687486880883, + "learning_rate": 3.748658414577655e-07, + "loss": 0.5935, + "step": 6828 + }, + { + "epoch": 2.665495706479313, + "grad_norm": 0.427401184038789, + "learning_rate": 3.7400359145795317e-07, + "loss": 0.5416, + "step": 6829 + }, + { + "epoch": 2.6658860265417643, + "grad_norm": 0.4332309197735108, + "learning_rate": 3.7314229572420034e-07, + "loss": 0.5426, + "step": 6830 + }, + { + "epoch": 2.6662763466042154, + "grad_norm": 0.4493496961322827, + "learning_rate": 3.7228195443417934e-07, + "loss": 0.5774, + "step": 6831 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.47204820570480377, + "learning_rate": 3.714225677653632e-07, + "loss": 0.5675, + "step": 6832 + }, + { + "epoch": 2.6670569867291176, + "grad_norm": 0.409153084986195, + "learning_rate": 3.7056413589503014e-07, + "loss": 0.6028, + "step": 6833 + }, + { + "epoch": 2.667447306791569, + "grad_norm": 0.4296647946738126, + "learning_rate": 3.697066590002596e-07, + "loss": 0.556, + "step": 6834 + }, + { + "epoch": 2.6678376268540203, + "grad_norm": 0.45198773598423725, + "learning_rate": 3.688501372579345e-07, + "loss": 0.5746, + "step": 6835 + }, + { + "epoch": 2.6682279469164714, + "grad_norm": 0.444609273327409, + "learning_rate": 3.679945708447419e-07, + "loss": 0.5895, + "step": 6836 + }, + { + "epoch": 2.668618266978923, + "grad_norm": 0.42244519455285384, + "learning_rate": 3.6713995993717054e-07, + "loss": 0.5499, + "step": 6837 + }, + { + "epoch": 2.669008587041374, + "grad_norm": 0.42975753049980814, + "learning_rate": 3.662863047115123e-07, + "loss": 0.5848, + "step": 6838 + }, + { + "epoch": 2.669398907103825, + "grad_norm": 0.42373950091070073, + "learning_rate": 3.654336053438617e-07, + "loss": 0.5461, + "step": 6839 + }, + { + "epoch": 2.669789227166276, + "grad_norm": 0.44174710512884857, + "learning_rate": 3.6458186201011715e-07, + "loss": 0.6177, + "step": 6840 + }, + { + "epoch": 2.6701795472287273, + "grad_norm": 0.42511476702755197, + "learning_rate": 3.637310748859779e-07, + "loss": 0.5778, + "step": 6841 + }, + { + "epoch": 2.670569867291179, + "grad_norm": 0.4596418584726654, + "learning_rate": 3.6288124414694925e-07, + "loss": 0.5807, + "step": 6842 + }, + { + "epoch": 2.67096018735363, + "grad_norm": 0.42874579706459054, + "learning_rate": 3.6203236996833536e-07, + "loss": 0.5716, + "step": 6843 + }, + { + "epoch": 2.671350507416081, + "grad_norm": 0.43010064778837936, + "learning_rate": 3.6118445252524557e-07, + "loss": 0.5258, + "step": 6844 + }, + { + "epoch": 2.6717408274785326, + "grad_norm": 0.4236353103029881, + "learning_rate": 3.603374919925917e-07, + "loss": 0.5455, + "step": 6845 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.4687029691906713, + "learning_rate": 3.594914885450862e-07, + "loss": 0.5583, + "step": 6846 + }, + { + "epoch": 2.672521467603435, + "grad_norm": 0.43499433827318845, + "learning_rate": 3.586464423572461e-07, + "loss": 0.5463, + "step": 6847 + }, + { + "epoch": 2.672911787665886, + "grad_norm": 0.4338425940655585, + "learning_rate": 3.578023536033909e-07, + "loss": 0.5473, + "step": 6848 + }, + { + "epoch": 2.673302107728337, + "grad_norm": 0.4564420221570762, + "learning_rate": 3.569592224576407e-07, + "loss": 0.5638, + "step": 6849 + }, + { + "epoch": 2.6736924277907885, + "grad_norm": 0.43604395017089387, + "learning_rate": 3.561170490939214e-07, + "loss": 0.6025, + "step": 6850 + }, + { + "epoch": 2.6740827478532396, + "grad_norm": 0.4300639595201962, + "learning_rate": 3.5527583368595853e-07, + "loss": 0.622, + "step": 6851 + }, + { + "epoch": 2.6744730679156907, + "grad_norm": 0.4257287600504841, + "learning_rate": 3.544355764072793e-07, + "loss": 0.548, + "step": 6852 + }, + { + "epoch": 2.6748633879781423, + "grad_norm": 0.4529107613824038, + "learning_rate": 3.5359627743121683e-07, + "loss": 0.5339, + "step": 6853 + }, + { + "epoch": 2.6752537080405934, + "grad_norm": 0.4384440935553144, + "learning_rate": 3.527579369309042e-07, + "loss": 0.571, + "step": 6854 + }, + { + "epoch": 2.6756440281030445, + "grad_norm": 0.4468453840157635, + "learning_rate": 3.5192055507927645e-07, + "loss": 0.5302, + "step": 6855 + }, + { + "epoch": 2.6760343481654956, + "grad_norm": 0.4148026256823106, + "learning_rate": 3.5108413204907153e-07, + "loss": 0.6185, + "step": 6856 + }, + { + "epoch": 2.6764246682279467, + "grad_norm": 0.4390853878528908, + "learning_rate": 3.502486680128303e-07, + "loss": 0.5544, + "step": 6857 + }, + { + "epoch": 2.676814988290398, + "grad_norm": 0.45356668347962126, + "learning_rate": 3.494141631428932e-07, + "loss": 0.589, + "step": 6858 + }, + { + "epoch": 2.6772053083528493, + "grad_norm": 0.4553255843255499, + "learning_rate": 3.485806176114076e-07, + "loss": 0.5477, + "step": 6859 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.4513033191614186, + "learning_rate": 3.4774803159031757e-07, + "loss": 0.5591, + "step": 6860 + }, + { + "epoch": 2.677985948477752, + "grad_norm": 0.4269889896495436, + "learning_rate": 3.469164052513735e-07, + "loss": 0.5626, + "step": 6861 + }, + { + "epoch": 2.678376268540203, + "grad_norm": 0.4198955683209995, + "learning_rate": 3.4608573876612483e-07, + "loss": 0.5638, + "step": 6862 + }, + { + "epoch": 2.678766588602654, + "grad_norm": 0.42255510894677284, + "learning_rate": 3.452560323059262e-07, + "loss": 0.5782, + "step": 6863 + }, + { + "epoch": 2.6791569086651053, + "grad_norm": 0.4524917844144863, + "learning_rate": 3.44427286041929e-07, + "loss": 0.5292, + "step": 6864 + }, + { + "epoch": 2.6795472287275564, + "grad_norm": 0.4325397740354813, + "learning_rate": 3.435995001450926e-07, + "loss": 0.5643, + "step": 6865 + }, + { + "epoch": 2.679937548790008, + "grad_norm": 0.4465142179446072, + "learning_rate": 3.4277267478617316e-07, + "loss": 0.5715, + "step": 6866 + }, + { + "epoch": 2.680327868852459, + "grad_norm": 0.43801525181361445, + "learning_rate": 3.4194681013573315e-07, + "loss": 0.5954, + "step": 6867 + }, + { + "epoch": 2.68071818891491, + "grad_norm": 0.42296013410122674, + "learning_rate": 3.411219063641341e-07, + "loss": 0.5859, + "step": 6868 + }, + { + "epoch": 2.6811085089773616, + "grad_norm": 0.4397586950064144, + "learning_rate": 3.4029796364153875e-07, + "loss": 0.5287, + "step": 6869 + }, + { + "epoch": 2.6814988290398127, + "grad_norm": 0.43638295697726537, + "learning_rate": 3.39474982137914e-07, + "loss": 0.5368, + "step": 6870 + }, + { + "epoch": 2.681889149102264, + "grad_norm": 0.3945941051495252, + "learning_rate": 3.386529620230272e-07, + "loss": 0.5457, + "step": 6871 + }, + { + "epoch": 2.682279469164715, + "grad_norm": 0.41709440720641716, + "learning_rate": 3.3783190346644677e-07, + "loss": 0.5732, + "step": 6872 + }, + { + "epoch": 2.682669789227166, + "grad_norm": 0.4213199292324738, + "learning_rate": 3.370118066375433e-07, + "loss": 0.5991, + "step": 6873 + }, + { + "epoch": 2.6830601092896176, + "grad_norm": 0.4169906876899543, + "learning_rate": 3.361926717054903e-07, + "loss": 0.5756, + "step": 6874 + }, + { + "epoch": 2.6834504293520687, + "grad_norm": 0.44128793111766407, + "learning_rate": 3.3537449883926044e-07, + "loss": 0.6144, + "step": 6875 + }, + { + "epoch": 2.6838407494145198, + "grad_norm": 0.4403886113724794, + "learning_rate": 3.345572882076298e-07, + "loss": 0.553, + "step": 6876 + }, + { + "epoch": 2.6842310694769713, + "grad_norm": 0.4912330937547876, + "learning_rate": 3.337410399791746e-07, + "loss": 0.5865, + "step": 6877 + }, + { + "epoch": 2.6846213895394224, + "grad_norm": 0.4507145795843411, + "learning_rate": 3.329257543222747e-07, + "loss": 0.5657, + "step": 6878 + }, + { + "epoch": 2.6850117096018735, + "grad_norm": 0.4636976643742351, + "learning_rate": 3.3211143140510825e-07, + "loss": 0.6034, + "step": 6879 + }, + { + "epoch": 2.6854020296643246, + "grad_norm": 0.41511834645725726, + "learning_rate": 3.312980713956593e-07, + "loss": 0.5345, + "step": 6880 + }, + { + "epoch": 2.6857923497267757, + "grad_norm": 0.44626235706249306, + "learning_rate": 3.304856744617063e-07, + "loss": 0.527, + "step": 6881 + }, + { + "epoch": 2.6861826697892273, + "grad_norm": 0.4257698662398038, + "learning_rate": 3.29674240770837e-07, + "loss": 0.5194, + "step": 6882 + }, + { + "epoch": 2.6865729898516784, + "grad_norm": 0.44232643163962826, + "learning_rate": 3.288637704904335e-07, + "loss": 0.5839, + "step": 6883 + }, + { + "epoch": 2.6869633099141295, + "grad_norm": 0.45962115349353605, + "learning_rate": 3.2805426378768546e-07, + "loss": 0.5288, + "step": 6884 + }, + { + "epoch": 2.687353629976581, + "grad_norm": 0.4522298471084577, + "learning_rate": 3.2724572082957873e-07, + "loss": 0.5622, + "step": 6885 + }, + { + "epoch": 2.687743950039032, + "grad_norm": 0.4367144475993292, + "learning_rate": 3.264381417829021e-07, + "loss": 0.5605, + "step": 6886 + }, + { + "epoch": 2.688134270101483, + "grad_norm": 0.4195894654606244, + "learning_rate": 3.256315268142468e-07, + "loss": 0.5894, + "step": 6887 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.4666778893337769, + "learning_rate": 3.24825876090003e-07, + "loss": 0.5798, + "step": 6888 + }, + { + "epoch": 2.6889149102263854, + "grad_norm": 0.42907850888840454, + "learning_rate": 3.240211897763651e-07, + "loss": 0.6003, + "step": 6889 + }, + { + "epoch": 2.689305230288837, + "grad_norm": 0.43280446851826077, + "learning_rate": 3.2321746803932354e-07, + "loss": 0.5666, + "step": 6890 + }, + { + "epoch": 2.689695550351288, + "grad_norm": 0.43589750991393783, + "learning_rate": 3.224147110446746e-07, + "loss": 0.5802, + "step": 6891 + }, + { + "epoch": 2.690085870413739, + "grad_norm": 0.4760497730520407, + "learning_rate": 3.216129189580125e-07, + "loss": 0.593, + "step": 6892 + }, + { + "epoch": 2.6904761904761907, + "grad_norm": 0.41192474848218413, + "learning_rate": 3.2081209194473496e-07, + "loss": 0.6152, + "step": 6893 + }, + { + "epoch": 2.690866510538642, + "grad_norm": 0.42407410414369034, + "learning_rate": 3.200122301700381e-07, + "loss": 0.5441, + "step": 6894 + }, + { + "epoch": 2.691256830601093, + "grad_norm": 0.41412285853166714, + "learning_rate": 3.192133337989217e-07, + "loss": 0.5279, + "step": 6895 + }, + { + "epoch": 2.691647150663544, + "grad_norm": 0.4569139964547431, + "learning_rate": 3.184154029961822e-07, + "loss": 0.5473, + "step": 6896 + }, + { + "epoch": 2.692037470725995, + "grad_norm": 0.41349969498232286, + "learning_rate": 3.1761843792642244e-07, + "loss": 0.6016, + "step": 6897 + }, + { + "epoch": 2.6924277907884466, + "grad_norm": 0.45623158779659, + "learning_rate": 3.168224387540414e-07, + "loss": 0.5562, + "step": 6898 + }, + { + "epoch": 2.6928181108508977, + "grad_norm": 0.40818944643326094, + "learning_rate": 3.1602740564324054e-07, + "loss": 0.5486, + "step": 6899 + }, + { + "epoch": 2.693208430913349, + "grad_norm": 0.43302939785151784, + "learning_rate": 3.152333387580209e-07, + "loss": 0.5387, + "step": 6900 + }, + { + "epoch": 2.6935987509758004, + "grad_norm": 0.5705441278349974, + "learning_rate": 3.144402382621875e-07, + "loss": 0.541, + "step": 6901 + }, + { + "epoch": 2.6939890710382515, + "grad_norm": 0.4143902676296767, + "learning_rate": 3.136481043193429e-07, + "loss": 0.5638, + "step": 6902 + }, + { + "epoch": 2.6943793911007026, + "grad_norm": 0.4133505935848114, + "learning_rate": 3.1285693709288967e-07, + "loss": 0.5828, + "step": 6903 + }, + { + "epoch": 2.6947697111631537, + "grad_norm": 0.4267957713263853, + "learning_rate": 3.1206673674603505e-07, + "loss": 0.5965, + "step": 6904 + }, + { + "epoch": 2.6951600312256048, + "grad_norm": 0.4194837116919296, + "learning_rate": 3.1127750344178143e-07, + "loss": 0.5668, + "step": 6905 + }, + { + "epoch": 2.6955503512880563, + "grad_norm": 0.4156407154179881, + "learning_rate": 3.1048923734293744e-07, + "loss": 0.5822, + "step": 6906 + }, + { + "epoch": 2.6959406713505074, + "grad_norm": 0.4733773105148701, + "learning_rate": 3.0970193861210695e-07, + "loss": 0.5912, + "step": 6907 + }, + { + "epoch": 2.6963309914129585, + "grad_norm": 0.4529479003128122, + "learning_rate": 3.089156074116978e-07, + "loss": 0.5712, + "step": 6908 + }, + { + "epoch": 2.69672131147541, + "grad_norm": 0.4323600836692073, + "learning_rate": 3.081302439039163e-07, + "loss": 0.5853, + "step": 6909 + }, + { + "epoch": 2.697111631537861, + "grad_norm": 0.40445040259804127, + "learning_rate": 3.073458482507702e-07, + "loss": 0.5642, + "step": 6910 + }, + { + "epoch": 2.6975019516003123, + "grad_norm": 0.40630408517781785, + "learning_rate": 3.065624206140666e-07, + "loss": 0.5907, + "step": 6911 + }, + { + "epoch": 2.6978922716627634, + "grad_norm": 0.4276465799044966, + "learning_rate": 3.057799611554152e-07, + "loss": 0.574, + "step": 6912 + }, + { + "epoch": 2.6982825917252145, + "grad_norm": 0.42207103297434206, + "learning_rate": 3.0499847003622294e-07, + "loss": 0.5383, + "step": 6913 + }, + { + "epoch": 2.698672911787666, + "grad_norm": 0.427490301516547, + "learning_rate": 3.042179474176993e-07, + "loss": 0.5956, + "step": 6914 + }, + { + "epoch": 2.699063231850117, + "grad_norm": 0.4683411662477159, + "learning_rate": 3.034383934608526e-07, + "loss": 0.5938, + "step": 6915 + }, + { + "epoch": 2.699453551912568, + "grad_norm": 0.49795506977479737, + "learning_rate": 3.02659808326492e-07, + "loss": 0.534, + "step": 6916 + }, + { + "epoch": 2.6998438719750197, + "grad_norm": 0.5549765299501876, + "learning_rate": 3.018821921752257e-07, + "loss": 0.5816, + "step": 6917 + }, + { + "epoch": 2.700234192037471, + "grad_norm": 0.4544382354114439, + "learning_rate": 3.0110554516746494e-07, + "loss": 0.584, + "step": 6918 + }, + { + "epoch": 2.700624512099922, + "grad_norm": 0.43208352368295133, + "learning_rate": 3.0032986746341806e-07, + "loss": 0.6114, + "step": 6919 + }, + { + "epoch": 2.701014832162373, + "grad_norm": 0.3915964130880352, + "learning_rate": 2.995551592230933e-07, + "loss": 0.6177, + "step": 6920 + }, + { + "epoch": 2.701405152224824, + "grad_norm": 0.40465741513593134, + "learning_rate": 2.987814206063022e-07, + "loss": 0.5747, + "step": 6921 + }, + { + "epoch": 2.7017954722872757, + "grad_norm": 0.4693680530972925, + "learning_rate": 2.9800865177265216e-07, + "loss": 0.5835, + "step": 6922 + }, + { + "epoch": 2.702185792349727, + "grad_norm": 0.4437380369673776, + "learning_rate": 2.9723685288155393e-07, + "loss": 0.5577, + "step": 6923 + }, + { + "epoch": 2.702576112412178, + "grad_norm": 0.4223770998522566, + "learning_rate": 2.9646602409221683e-07, + "loss": 0.5433, + "step": 6924 + }, + { + "epoch": 2.7029664324746294, + "grad_norm": 0.38831487683731525, + "learning_rate": 2.9569616556364923e-07, + "loss": 0.585, + "step": 6925 + }, + { + "epoch": 2.7033567525370805, + "grad_norm": 0.42843755585211696, + "learning_rate": 2.949272774546602e-07, + "loss": 0.5333, + "step": 6926 + }, + { + "epoch": 2.7037470725995316, + "grad_norm": 0.4549121134639762, + "learning_rate": 2.9415935992385903e-07, + "loss": 0.5598, + "step": 6927 + }, + { + "epoch": 2.7041373926619827, + "grad_norm": 0.4200100355514212, + "learning_rate": 2.933924131296534e-07, + "loss": 0.583, + "step": 6928 + }, + { + "epoch": 2.704527712724434, + "grad_norm": 0.4307096744128078, + "learning_rate": 2.9262643723025287e-07, + "loss": 0.5629, + "step": 6929 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.4294387060682502, + "learning_rate": 2.918614323836644e-07, + "loss": 0.5854, + "step": 6930 + }, + { + "epoch": 2.7053083528493365, + "grad_norm": 0.4469146074320945, + "learning_rate": 2.9109739874769725e-07, + "loss": 0.536, + "step": 6931 + }, + { + "epoch": 2.7056986729117876, + "grad_norm": 0.4310667249218359, + "learning_rate": 2.9033433647995765e-07, + "loss": 0.5804, + "step": 6932 + }, + { + "epoch": 2.706088992974239, + "grad_norm": 0.44326738082344513, + "learning_rate": 2.895722457378536e-07, + "loss": 0.5759, + "step": 6933 + }, + { + "epoch": 2.70647931303669, + "grad_norm": 0.4346914571333172, + "learning_rate": 2.888111266785898e-07, + "loss": 0.5362, + "step": 6934 + }, + { + "epoch": 2.7068696330991413, + "grad_norm": 0.4197961922754012, + "learning_rate": 2.8805097945917407e-07, + "loss": 0.6008, + "step": 6935 + }, + { + "epoch": 2.7072599531615924, + "grad_norm": 0.44415231691995516, + "learning_rate": 2.8729180423641257e-07, + "loss": 0.5646, + "step": 6936 + }, + { + "epoch": 2.7076502732240435, + "grad_norm": 0.4566563952144857, + "learning_rate": 2.865336011669084e-07, + "loss": 0.5618, + "step": 6937 + }, + { + "epoch": 2.708040593286495, + "grad_norm": 0.43666074266590793, + "learning_rate": 2.857763704070682e-07, + "loss": 0.5624, + "step": 6938 + }, + { + "epoch": 2.708430913348946, + "grad_norm": 0.4062203822211292, + "learning_rate": 2.8502011211309523e-07, + "loss": 0.6083, + "step": 6939 + }, + { + "epoch": 2.7088212334113972, + "grad_norm": 0.45287176309732197, + "learning_rate": 2.8426482644099306e-07, + "loss": 0.593, + "step": 6940 + }, + { + "epoch": 2.709211553473849, + "grad_norm": 0.4152355010111029, + "learning_rate": 2.835105135465649e-07, + "loss": 0.5726, + "step": 6941 + }, + { + "epoch": 2.7096018735363, + "grad_norm": 0.4426448809186541, + "learning_rate": 2.827571735854134e-07, + "loss": 0.5228, + "step": 6942 + }, + { + "epoch": 2.709992193598751, + "grad_norm": 0.412626498221177, + "learning_rate": 2.820048067129383e-07, + "loss": 0.568, + "step": 6943 + }, + { + "epoch": 2.710382513661202, + "grad_norm": 0.4315703381337252, + "learning_rate": 2.8125341308434193e-07, + "loss": 0.5809, + "step": 6944 + }, + { + "epoch": 2.710772833723653, + "grad_norm": 0.410636753842893, + "learning_rate": 2.805029928546232e-07, + "loss": 0.5368, + "step": 6945 + }, + { + "epoch": 2.7111631537861047, + "grad_norm": 0.38952145550797107, + "learning_rate": 2.7975354617858276e-07, + "loss": 0.5995, + "step": 6946 + }, + { + "epoch": 2.711553473848556, + "grad_norm": 0.42977086792019437, + "learning_rate": 2.7900507321081795e-07, + "loss": 0.5491, + "step": 6947 + }, + { + "epoch": 2.711943793911007, + "grad_norm": 0.4465080292983717, + "learning_rate": 2.7825757410572585e-07, + "loss": 0.5653, + "step": 6948 + }, + { + "epoch": 2.7123341139734585, + "grad_norm": 0.4383042021387224, + "learning_rate": 2.7751104901750425e-07, + "loss": 0.583, + "step": 6949 + }, + { + "epoch": 2.7127244340359096, + "grad_norm": 0.41032538018205056, + "learning_rate": 2.7676549810014776e-07, + "loss": 0.6044, + "step": 6950 + }, + { + "epoch": 2.7131147540983607, + "grad_norm": 0.427286324538416, + "learning_rate": 2.7602092150745274e-07, + "loss": 0.5841, + "step": 6951 + }, + { + "epoch": 2.7135050741608118, + "grad_norm": 0.4471592784335077, + "learning_rate": 2.752773193930108e-07, + "loss": 0.5784, + "step": 6952 + }, + { + "epoch": 2.713895394223263, + "grad_norm": 0.42853288024356384, + "learning_rate": 2.74534691910216e-07, + "loss": 0.5594, + "step": 6953 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.4308305659810399, + "learning_rate": 2.7379303921225955e-07, + "loss": 0.5629, + "step": 6954 + }, + { + "epoch": 2.7146760343481655, + "grad_norm": 0.419044350342241, + "learning_rate": 2.7305236145213297e-07, + "loss": 0.5764, + "step": 6955 + }, + { + "epoch": 2.7150663544106166, + "grad_norm": 0.4611242661256175, + "learning_rate": 2.7231265878262413e-07, + "loss": 0.5509, + "step": 6956 + }, + { + "epoch": 2.715456674473068, + "grad_norm": 0.4339345192609268, + "learning_rate": 2.7157393135632316e-07, + "loss": 0.593, + "step": 6957 + }, + { + "epoch": 2.7158469945355193, + "grad_norm": 0.4324359189554161, + "learning_rate": 2.7083617932561536e-07, + "loss": 0.6031, + "step": 6958 + }, + { + "epoch": 2.7162373145979704, + "grad_norm": 0.4122847889558488, + "learning_rate": 2.7009940284268843e-07, + "loss": 0.5943, + "step": 6959 + }, + { + "epoch": 2.7166276346604215, + "grad_norm": 0.4276711343185608, + "learning_rate": 2.6936360205952695e-07, + "loss": 0.5526, + "step": 6960 + }, + { + "epoch": 2.7170179547228726, + "grad_norm": 0.43082762363586974, + "learning_rate": 2.6862877712791336e-07, + "loss": 0.5904, + "step": 6961 + }, + { + "epoch": 2.717408274785324, + "grad_norm": 0.43969897809238506, + "learning_rate": 2.678949281994292e-07, + "loss": 0.5271, + "step": 6962 + }, + { + "epoch": 2.717798594847775, + "grad_norm": 0.4434182608525866, + "learning_rate": 2.6716205542545783e-07, + "loss": 0.5898, + "step": 6963 + }, + { + "epoch": 2.7181889149102263, + "grad_norm": 0.4905380043284353, + "learning_rate": 2.664301589571766e-07, + "loss": 0.5473, + "step": 6964 + }, + { + "epoch": 2.718579234972678, + "grad_norm": 0.4795346039998401, + "learning_rate": 2.6569923894556373e-07, + "loss": 0.5784, + "step": 6965 + }, + { + "epoch": 2.718969555035129, + "grad_norm": 0.41281198705791267, + "learning_rate": 2.6496929554139737e-07, + "loss": 0.5526, + "step": 6966 + }, + { + "epoch": 2.71935987509758, + "grad_norm": 0.42343372573815125, + "learning_rate": 2.642403288952505e-07, + "loss": 0.5491, + "step": 6967 + }, + { + "epoch": 2.719750195160031, + "grad_norm": 0.42375182993823945, + "learning_rate": 2.635123391574995e-07, + "loss": 0.5424, + "step": 6968 + }, + { + "epoch": 2.7201405152224822, + "grad_norm": 0.42866962371112044, + "learning_rate": 2.627853264783131e-07, + "loss": 0.5526, + "step": 6969 + }, + { + "epoch": 2.720530835284934, + "grad_norm": 0.40344131918937737, + "learning_rate": 2.6205929100766525e-07, + "loss": 0.5831, + "step": 6970 + }, + { + "epoch": 2.720921155347385, + "grad_norm": 0.437707838746698, + "learning_rate": 2.613342328953222e-07, + "loss": 0.627, + "step": 6971 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.4423817469265871, + "learning_rate": 2.6061015229085383e-07, + "loss": 0.5525, + "step": 6972 + }, + { + "epoch": 2.7217017954722875, + "grad_norm": 0.4258154495842107, + "learning_rate": 2.59887049343624e-07, + "loss": 0.553, + "step": 6973 + }, + { + "epoch": 2.7220921155347386, + "grad_norm": 0.4311260455349996, + "learning_rate": 2.5916492420279824e-07, + "loss": 0.5623, + "step": 6974 + }, + { + "epoch": 2.7224824355971897, + "grad_norm": 0.46057405610845137, + "learning_rate": 2.5844377701733757e-07, + "loss": 0.5111, + "step": 6975 + }, + { + "epoch": 2.722872755659641, + "grad_norm": 0.4363644274900781, + "learning_rate": 2.5772360793600403e-07, + "loss": 0.5597, + "step": 6976 + }, + { + "epoch": 2.723263075722092, + "grad_norm": 0.4139516132991161, + "learning_rate": 2.57004417107356e-07, + "loss": 0.5125, + "step": 6977 + }, + { + "epoch": 2.7236533957845435, + "grad_norm": 0.41231222600565953, + "learning_rate": 2.562862046797504e-07, + "loss": 0.6128, + "step": 6978 + }, + { + "epoch": 2.7240437158469946, + "grad_norm": 0.4201633556013868, + "learning_rate": 2.555689708013415e-07, + "loss": 0.5631, + "step": 6979 + }, + { + "epoch": 2.7244340359094457, + "grad_norm": 0.4148198387493906, + "learning_rate": 2.5485271562008475e-07, + "loss": 0.5558, + "step": 6980 + }, + { + "epoch": 2.724824355971897, + "grad_norm": 0.43383798505531246, + "learning_rate": 2.541374392837309e-07, + "loss": 0.5376, + "step": 6981 + }, + { + "epoch": 2.7252146760343483, + "grad_norm": 0.4743730557952281, + "learning_rate": 2.5342314193982807e-07, + "loss": 0.5989, + "step": 6982 + }, + { + "epoch": 2.7256049960967994, + "grad_norm": 0.42648403023391007, + "learning_rate": 2.527098237357262e-07, + "loss": 0.5819, + "step": 6983 + }, + { + "epoch": 2.7259953161592505, + "grad_norm": 0.39920533840395367, + "learning_rate": 2.519974848185691e-07, + "loss": 0.5548, + "step": 6984 + }, + { + "epoch": 2.7263856362217016, + "grad_norm": 0.444045301830954, + "learning_rate": 2.5128612533530217e-07, + "loss": 0.5685, + "step": 6985 + }, + { + "epoch": 2.726775956284153, + "grad_norm": 0.47406604104200306, + "learning_rate": 2.5057574543266626e-07, + "loss": 0.6119, + "step": 6986 + }, + { + "epoch": 2.7271662763466042, + "grad_norm": 0.3935936764975918, + "learning_rate": 2.498663452572003e-07, + "loss": 0.5872, + "step": 6987 + }, + { + "epoch": 2.7275565964090553, + "grad_norm": 0.4472510644631426, + "learning_rate": 2.491579249552423e-07, + "loss": 0.5479, + "step": 6988 + }, + { + "epoch": 2.727946916471507, + "grad_norm": 0.4797291914011001, + "learning_rate": 2.4845048467292743e-07, + "loss": 0.5241, + "step": 6989 + }, + { + "epoch": 2.728337236533958, + "grad_norm": 0.43642869239174775, + "learning_rate": 2.477440245561885e-07, + "loss": 0.5527, + "step": 6990 + }, + { + "epoch": 2.728727556596409, + "grad_norm": 0.410210117676124, + "learning_rate": 2.4703854475075784e-07, + "loss": 0.5663, + "step": 6991 + }, + { + "epoch": 2.72911787665886, + "grad_norm": 0.4324163508508241, + "learning_rate": 2.463340454021618e-07, + "loss": 0.5492, + "step": 6992 + }, + { + "epoch": 2.7295081967213113, + "grad_norm": 0.44732381744614613, + "learning_rate": 2.456305266557296e-07, + "loss": 0.5474, + "step": 6993 + }, + { + "epoch": 2.729898516783763, + "grad_norm": 0.4327266765382876, + "learning_rate": 2.4492798865658417e-07, + "loss": 0.5776, + "step": 6994 + }, + { + "epoch": 2.730288836846214, + "grad_norm": 0.4102341396499819, + "learning_rate": 2.4422643154964734e-07, + "loss": 0.5655, + "step": 6995 + }, + { + "epoch": 2.730679156908665, + "grad_norm": 0.45638570510869875, + "learning_rate": 2.435258554796377e-07, + "loss": 0.5639, + "step": 6996 + }, + { + "epoch": 2.7310694769711166, + "grad_norm": 0.4509290355961545, + "learning_rate": 2.4282626059107473e-07, + "loss": 0.5309, + "step": 6997 + }, + { + "epoch": 2.7314597970335677, + "grad_norm": 0.4299648560295519, + "learning_rate": 2.421276470282713e-07, + "loss": 0.5797, + "step": 6998 + }, + { + "epoch": 2.7318501170960188, + "grad_norm": 0.39954182601118027, + "learning_rate": 2.414300149353399e-07, + "loss": 0.5688, + "step": 6999 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 0.4300782993139912, + "learning_rate": 2.4073336445619155e-07, + "loss": 0.5436, + "step": 7000 + }, + { + "epoch": 2.732630757220921, + "grad_norm": 0.42963287328917166, + "learning_rate": 2.400376957345324e-07, + "loss": 0.6004, + "step": 7001 + }, + { + "epoch": 2.7330210772833725, + "grad_norm": 0.46547920822436506, + "learning_rate": 2.3934300891386875e-07, + "loss": 0.5281, + "step": 7002 + }, + { + "epoch": 2.7334113973458236, + "grad_norm": 0.42334926776861903, + "learning_rate": 2.3864930413750156e-07, + "loss": 0.5663, + "step": 7003 + }, + { + "epoch": 2.7338017174082747, + "grad_norm": 0.46645570904221906, + "learning_rate": 2.379565815485324e-07, + "loss": 0.5126, + "step": 7004 + }, + { + "epoch": 2.7341920374707263, + "grad_norm": 0.4358586546329259, + "learning_rate": 2.372648412898554e-07, + "loss": 0.53, + "step": 7005 + }, + { + "epoch": 2.7345823575331774, + "grad_norm": 0.41298528414263835, + "learning_rate": 2.36574083504168e-07, + "loss": 0.582, + "step": 7006 + }, + { + "epoch": 2.7349726775956285, + "grad_norm": 0.41971854821133325, + "learning_rate": 2.3588430833395958e-07, + "loss": 0.529, + "step": 7007 + }, + { + "epoch": 2.7353629976580796, + "grad_norm": 0.44932955508611117, + "learning_rate": 2.3519551592152134e-07, + "loss": 0.5567, + "step": 7008 + }, + { + "epoch": 2.7357533177205307, + "grad_norm": 0.42400503043001686, + "learning_rate": 2.345077064089385e-07, + "loss": 0.5929, + "step": 7009 + }, + { + "epoch": 2.736143637782982, + "grad_norm": 0.4258684284486235, + "learning_rate": 2.3382087993809587e-07, + "loss": 0.5977, + "step": 7010 + }, + { + "epoch": 2.7365339578454333, + "grad_norm": 0.42845153744398123, + "learning_rate": 2.3313503665067295e-07, + "loss": 0.5806, + "step": 7011 + }, + { + "epoch": 2.7369242779078844, + "grad_norm": 0.42945714144613595, + "learning_rate": 2.3245017668814873e-07, + "loss": 0.5785, + "step": 7012 + }, + { + "epoch": 2.737314597970336, + "grad_norm": 0.44050946945001357, + "learning_rate": 2.3176630019179747e-07, + "loss": 0.5845, + "step": 7013 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.4065664014903383, + "learning_rate": 2.3108340730269297e-07, + "loss": 0.5724, + "step": 7014 + }, + { + "epoch": 2.738095238095238, + "grad_norm": 0.4472689647073738, + "learning_rate": 2.3040149816170365e-07, + "loss": 0.5978, + "step": 7015 + }, + { + "epoch": 2.7384855581576892, + "grad_norm": 0.3886848426851043, + "learning_rate": 2.2972057290949533e-07, + "loss": 0.5361, + "step": 7016 + }, + { + "epoch": 2.7388758782201403, + "grad_norm": 0.4274957122056357, + "learning_rate": 2.2904063168653344e-07, + "loss": 0.5651, + "step": 7017 + }, + { + "epoch": 2.739266198282592, + "grad_norm": 0.41815984290403085, + "learning_rate": 2.2836167463307634e-07, + "loss": 0.5618, + "step": 7018 + }, + { + "epoch": 2.739656518345043, + "grad_norm": 0.5133826389726771, + "learning_rate": 2.2768370188918365e-07, + "loss": 0.5753, + "step": 7019 + }, + { + "epoch": 2.740046838407494, + "grad_norm": 0.4414750406581839, + "learning_rate": 2.2700671359470848e-07, + "loss": 0.568, + "step": 7020 + }, + { + "epoch": 2.7404371584699456, + "grad_norm": 0.417294550270596, + "learning_rate": 2.2633070988930362e-07, + "loss": 0.5373, + "step": 7021 + }, + { + "epoch": 2.7408274785323967, + "grad_norm": 0.4334061596009458, + "learning_rate": 2.256556909124158e-07, + "loss": 0.5633, + "step": 7022 + }, + { + "epoch": 2.741217798594848, + "grad_norm": 0.42275974999114074, + "learning_rate": 2.2498165680329086e-07, + "loss": 0.5848, + "step": 7023 + }, + { + "epoch": 2.741608118657299, + "grad_norm": 0.4192380417354878, + "learning_rate": 2.243086077009704e-07, + "loss": 0.5452, + "step": 7024 + }, + { + "epoch": 2.74199843871975, + "grad_norm": 0.443187852954117, + "learning_rate": 2.2363654374429443e-07, + "loss": 0.5407, + "step": 7025 + }, + { + "epoch": 2.7423887587822016, + "grad_norm": 0.4771240882898918, + "learning_rate": 2.229654650718971e-07, + "loss": 0.5575, + "step": 7026 + }, + { + "epoch": 2.7427790788446527, + "grad_norm": 0.43118129268923555, + "learning_rate": 2.2229537182221207e-07, + "loss": 0.5862, + "step": 7027 + }, + { + "epoch": 2.7431693989071038, + "grad_norm": 0.48940853476417395, + "learning_rate": 2.2162626413346765e-07, + "loss": 0.5688, + "step": 7028 + }, + { + "epoch": 2.7435597189695553, + "grad_norm": 0.44695062809560593, + "learning_rate": 2.209581421436896e-07, + "loss": 0.5526, + "step": 7029 + }, + { + "epoch": 2.7439500390320064, + "grad_norm": 0.4126867065994017, + "learning_rate": 2.2029100599070097e-07, + "loss": 0.6122, + "step": 7030 + }, + { + "epoch": 2.7443403590944575, + "grad_norm": 0.4328393374581392, + "learning_rate": 2.1962485581212056e-07, + "loss": 0.5811, + "step": 7031 + }, + { + "epoch": 2.7447306791569086, + "grad_norm": 0.4133406538397421, + "learning_rate": 2.1895969174536346e-07, + "loss": 0.5415, + "step": 7032 + }, + { + "epoch": 2.7451209992193597, + "grad_norm": 0.44954726780773013, + "learning_rate": 2.1829551392764215e-07, + "loss": 0.5838, + "step": 7033 + }, + { + "epoch": 2.7455113192818112, + "grad_norm": 0.4222344533168174, + "learning_rate": 2.1763232249596643e-07, + "loss": 0.576, + "step": 7034 + }, + { + "epoch": 2.7459016393442623, + "grad_norm": 0.388863774474158, + "learning_rate": 2.1697011758714025e-07, + "loss": 0.5911, + "step": 7035 + }, + { + "epoch": 2.7462919594067134, + "grad_norm": 0.4285087946295275, + "learning_rate": 2.163088993377671e-07, + "loss": 0.5675, + "step": 7036 + }, + { + "epoch": 2.746682279469165, + "grad_norm": 0.47758963029697277, + "learning_rate": 2.1564866788424343e-07, + "loss": 0.6016, + "step": 7037 + }, + { + "epoch": 2.747072599531616, + "grad_norm": 0.43543773731028906, + "learning_rate": 2.149894233627653e-07, + "loss": 0.5292, + "step": 7038 + }, + { + "epoch": 2.747462919594067, + "grad_norm": 0.4430397896825959, + "learning_rate": 2.1433116590932391e-07, + "loss": 0.5539, + "step": 7039 + }, + { + "epoch": 2.7478532396565183, + "grad_norm": 0.4280930078278778, + "learning_rate": 2.1367389565970619e-07, + "loss": 0.6006, + "step": 7040 + }, + { + "epoch": 2.7482435597189694, + "grad_norm": 0.43433885585268467, + "learning_rate": 2.1301761274949594e-07, + "loss": 0.5436, + "step": 7041 + }, + { + "epoch": 2.748633879781421, + "grad_norm": 0.42317932659592333, + "learning_rate": 2.123623173140743e-07, + "loss": 0.5735, + "step": 7042 + }, + { + "epoch": 2.749024199843872, + "grad_norm": 0.4259632885131084, + "learning_rate": 2.1170800948861647e-07, + "loss": 0.6016, + "step": 7043 + }, + { + "epoch": 2.749414519906323, + "grad_norm": 0.4067696303371335, + "learning_rate": 2.110546894080967e-07, + "loss": 0.5565, + "step": 7044 + }, + { + "epoch": 2.7498048399687747, + "grad_norm": 0.4114162577887116, + "learning_rate": 2.1040235720728387e-07, + "loss": 0.5949, + "step": 7045 + }, + { + "epoch": 2.7501951600312253, + "grad_norm": 0.4330502999543734, + "learning_rate": 2.097510130207414e-07, + "loss": 0.5506, + "step": 7046 + }, + { + "epoch": 2.750585480093677, + "grad_norm": 0.4235367581366137, + "learning_rate": 2.0910065698283354e-07, + "loss": 0.5882, + "step": 7047 + }, + { + "epoch": 2.750975800156128, + "grad_norm": 0.4124090355371111, + "learning_rate": 2.0845128922771574e-07, + "loss": 0.6176, + "step": 7048 + }, + { + "epoch": 2.751366120218579, + "grad_norm": 0.43498987910322257, + "learning_rate": 2.0780290988934305e-07, + "loss": 0.5534, + "step": 7049 + }, + { + "epoch": 2.7517564402810306, + "grad_norm": 0.4209001075272867, + "learning_rate": 2.0715551910146402e-07, + "loss": 0.5678, + "step": 7050 + }, + { + "epoch": 2.7521467603434817, + "grad_norm": 0.40815684944365554, + "learning_rate": 2.0650911699762634e-07, + "loss": 0.5359, + "step": 7051 + }, + { + "epoch": 2.752537080405933, + "grad_norm": 0.432622056960024, + "learning_rate": 2.0586370371116993e-07, + "loss": 0.5781, + "step": 7052 + }, + { + "epoch": 2.752927400468384, + "grad_norm": 0.4661836434779087, + "learning_rate": 2.05219279375235e-07, + "loss": 0.563, + "step": 7053 + }, + { + "epoch": 2.753317720530835, + "grad_norm": 0.3819238506987994, + "learning_rate": 2.0457584412275356e-07, + "loss": 0.5568, + "step": 7054 + }, + { + "epoch": 2.7537080405932866, + "grad_norm": 0.4178552955794845, + "learning_rate": 2.0393339808645717e-07, + "loss": 0.5713, + "step": 7055 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 0.3887266950794338, + "learning_rate": 2.0329194139887098e-07, + "loss": 0.5623, + "step": 7056 + }, + { + "epoch": 2.7544886807181888, + "grad_norm": 0.4373075805581917, + "learning_rate": 2.0265147419231746e-07, + "loss": 0.5401, + "step": 7057 + }, + { + "epoch": 2.7548790007806403, + "grad_norm": 0.4509711845837282, + "learning_rate": 2.0201199659891257e-07, + "loss": 0.5806, + "step": 7058 + }, + { + "epoch": 2.7552693208430914, + "grad_norm": 0.4082005188343103, + "learning_rate": 2.013735087505725e-07, + "loss": 0.5741, + "step": 7059 + }, + { + "epoch": 2.7556596409055425, + "grad_norm": 0.45418594468612505, + "learning_rate": 2.0073601077900407e-07, + "loss": 0.5517, + "step": 7060 + }, + { + "epoch": 2.7560499609679936, + "grad_norm": 0.4426677407570882, + "learning_rate": 2.0009950281571488e-07, + "loss": 0.5851, + "step": 7061 + }, + { + "epoch": 2.7564402810304447, + "grad_norm": 0.44139914197321545, + "learning_rate": 1.9946398499200493e-07, + "loss": 0.6068, + "step": 7062 + }, + { + "epoch": 2.7568306010928962, + "grad_norm": 0.45788980252616973, + "learning_rate": 1.9882945743896985e-07, + "loss": 0.5741, + "step": 7063 + }, + { + "epoch": 2.7572209211553473, + "grad_norm": 0.4193501451987997, + "learning_rate": 1.9819592028750335e-07, + "loss": 0.5793, + "step": 7064 + }, + { + "epoch": 2.7576112412177984, + "grad_norm": 0.4261381054895974, + "learning_rate": 1.9756337366829416e-07, + "loss": 0.6159, + "step": 7065 + }, + { + "epoch": 2.75800156128025, + "grad_norm": 0.4186755727914019, + "learning_rate": 1.9693181771182513e-07, + "loss": 0.6076, + "step": 7066 + }, + { + "epoch": 2.758391881342701, + "grad_norm": 0.44738644075288675, + "learning_rate": 1.9630125254837484e-07, + "loss": 0.5354, + "step": 7067 + }, + { + "epoch": 2.758782201405152, + "grad_norm": 0.4031800580102608, + "learning_rate": 1.9567167830802036e-07, + "loss": 0.5572, + "step": 7068 + }, + { + "epoch": 2.7591725214676033, + "grad_norm": 0.4595144111580625, + "learning_rate": 1.9504309512063113e-07, + "loss": 0.5385, + "step": 7069 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.4280002040319136, + "learning_rate": 1.9441550311587398e-07, + "loss": 0.5652, + "step": 7070 + }, + { + "epoch": 2.759953161592506, + "grad_norm": 0.47697299923236264, + "learning_rate": 1.9378890242321035e-07, + "loss": 0.5554, + "step": 7071 + }, + { + "epoch": 2.760343481654957, + "grad_norm": 0.42867973272601173, + "learning_rate": 1.9316329317189798e-07, + "loss": 0.5304, + "step": 7072 + }, + { + "epoch": 2.760733801717408, + "grad_norm": 0.4450079638300356, + "learning_rate": 1.925386754909886e-07, + "loss": 0.5954, + "step": 7073 + }, + { + "epoch": 2.7611241217798597, + "grad_norm": 0.428837341012005, + "learning_rate": 1.9191504950933248e-07, + "loss": 0.5722, + "step": 7074 + }, + { + "epoch": 2.7615144418423108, + "grad_norm": 0.41857893322720685, + "learning_rate": 1.9129241535557063e-07, + "loss": 0.5548, + "step": 7075 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.44669125681381966, + "learning_rate": 1.906707731581442e-07, + "loss": 0.5176, + "step": 7076 + }, + { + "epoch": 2.762295081967213, + "grad_norm": 0.46073161478928276, + "learning_rate": 1.900501230452867e-07, + "loss": 0.5629, + "step": 7077 + }, + { + "epoch": 2.762685402029664, + "grad_norm": 0.46999616297951247, + "learning_rate": 1.8943046514502794e-07, + "loss": 0.5502, + "step": 7078 + }, + { + "epoch": 2.7630757220921156, + "grad_norm": 0.46045328035875593, + "learning_rate": 1.8881179958519346e-07, + "loss": 0.5503, + "step": 7079 + }, + { + "epoch": 2.7634660421545667, + "grad_norm": 0.4841660847299451, + "learning_rate": 1.8819412649340286e-07, + "loss": 0.5396, + "step": 7080 + }, + { + "epoch": 2.763856362217018, + "grad_norm": 0.41791327634909226, + "learning_rate": 1.8757744599707307e-07, + "loss": 0.5303, + "step": 7081 + }, + { + "epoch": 2.7642466822794693, + "grad_norm": 0.4483629835108345, + "learning_rate": 1.8696175822341344e-07, + "loss": 0.522, + "step": 7082 + }, + { + "epoch": 2.7646370023419204, + "grad_norm": 0.41746886247607545, + "learning_rate": 1.863470632994324e-07, + "loss": 0.5449, + "step": 7083 + }, + { + "epoch": 2.7650273224043715, + "grad_norm": 0.4078357649046756, + "learning_rate": 1.8573336135192853e-07, + "loss": 0.5856, + "step": 7084 + }, + { + "epoch": 2.7654176424668226, + "grad_norm": 0.42781164014082573, + "learning_rate": 1.851206525075e-07, + "loss": 0.5568, + "step": 7085 + }, + { + "epoch": 2.7658079625292737, + "grad_norm": 0.4192872809591976, + "learning_rate": 1.845089368925379e-07, + "loss": 0.5362, + "step": 7086 + }, + { + "epoch": 2.7661982825917253, + "grad_norm": 0.42199825113705824, + "learning_rate": 1.8389821463322966e-07, + "loss": 0.5821, + "step": 7087 + }, + { + "epoch": 2.7665886026541764, + "grad_norm": 0.46154455495414554, + "learning_rate": 1.8328848585555559e-07, + "loss": 0.5671, + "step": 7088 + }, + { + "epoch": 2.7669789227166275, + "grad_norm": 0.43024296344500673, + "learning_rate": 1.8267975068529453e-07, + "loss": 0.555, + "step": 7089 + }, + { + "epoch": 2.767369242779079, + "grad_norm": 0.4255328763426191, + "learning_rate": 1.820720092480177e-07, + "loss": 0.5669, + "step": 7090 + }, + { + "epoch": 2.76775956284153, + "grad_norm": 0.40446059584625604, + "learning_rate": 1.8146526166909095e-07, + "loss": 0.6118, + "step": 7091 + }, + { + "epoch": 2.7681498829039812, + "grad_norm": 0.4095853298794386, + "learning_rate": 1.8085950807367803e-07, + "loss": 0.5771, + "step": 7092 + }, + { + "epoch": 2.7685402029664323, + "grad_norm": 0.4095638055083592, + "learning_rate": 1.8025474858673507e-07, + "loss": 0.5172, + "step": 7093 + }, + { + "epoch": 2.7689305230288834, + "grad_norm": 0.46345678169855953, + "learning_rate": 1.796509833330129e-07, + "loss": 0.5822, + "step": 7094 + }, + { + "epoch": 2.769320843091335, + "grad_norm": 0.43391645658164496, + "learning_rate": 1.7904821243705905e-07, + "loss": 0.5576, + "step": 7095 + }, + { + "epoch": 2.769711163153786, + "grad_norm": 0.4431686826925146, + "learning_rate": 1.7844643602321577e-07, + "loss": 0.588, + "step": 7096 + }, + { + "epoch": 2.770101483216237, + "grad_norm": 0.4425303315837552, + "learning_rate": 1.7784565421561817e-07, + "loss": 0.5676, + "step": 7097 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 0.475986024792364, + "learning_rate": 1.772458671381988e-07, + "loss": 0.5705, + "step": 7098 + }, + { + "epoch": 2.77088212334114, + "grad_norm": 0.46893510676278943, + "learning_rate": 1.766470749146826e-07, + "loss": 0.5512, + "step": 7099 + }, + { + "epoch": 2.771272443403591, + "grad_norm": 0.44876339418170086, + "learning_rate": 1.7604927766859126e-07, + "loss": 0.528, + "step": 7100 + }, + { + "epoch": 2.771662763466042, + "grad_norm": 0.43700101362584776, + "learning_rate": 1.7545247552324063e-07, + "loss": 0.5473, + "step": 7101 + }, + { + "epoch": 2.772053083528493, + "grad_norm": 0.4489562802621727, + "learning_rate": 1.7485666860174055e-07, + "loss": 0.5697, + "step": 7102 + }, + { + "epoch": 2.7724434035909447, + "grad_norm": 0.41912346574054615, + "learning_rate": 1.742618570269955e-07, + "loss": 0.5929, + "step": 7103 + }, + { + "epoch": 2.7728337236533958, + "grad_norm": 0.45379690107439047, + "learning_rate": 1.7366804092170674e-07, + "loss": 0.6011, + "step": 7104 + }, + { + "epoch": 2.773224043715847, + "grad_norm": 0.4567101863440189, + "learning_rate": 1.7307522040836632e-07, + "loss": 0.5665, + "step": 7105 + }, + { + "epoch": 2.7736143637782984, + "grad_norm": 0.4318394768374515, + "learning_rate": 1.7248339560926585e-07, + "loss": 0.5434, + "step": 7106 + }, + { + "epoch": 2.7740046838407495, + "grad_norm": 0.4121049886780035, + "learning_rate": 1.7189256664648768e-07, + "loss": 0.5319, + "step": 7107 + }, + { + "epoch": 2.7743950039032006, + "grad_norm": 0.44430372025209125, + "learning_rate": 1.713027336419093e-07, + "loss": 0.5601, + "step": 7108 + }, + { + "epoch": 2.7747853239656517, + "grad_norm": 0.47504265048514027, + "learning_rate": 1.7071389671720505e-07, + "loss": 0.5616, + "step": 7109 + }, + { + "epoch": 2.775175644028103, + "grad_norm": 0.3886273175856571, + "learning_rate": 1.7012605599384113e-07, + "loss": 0.5904, + "step": 7110 + }, + { + "epoch": 2.7755659640905543, + "grad_norm": 0.4332132141720235, + "learning_rate": 1.6953921159307997e-07, + "loss": 0.5324, + "step": 7111 + }, + { + "epoch": 2.7759562841530054, + "grad_norm": 0.3980429641182636, + "learning_rate": 1.6895336363597636e-07, + "loss": 0.5936, + "step": 7112 + }, + { + "epoch": 2.7763466042154565, + "grad_norm": 0.4268923053628702, + "learning_rate": 1.6836851224338258e-07, + "loss": 0.5583, + "step": 7113 + }, + { + "epoch": 2.776736924277908, + "grad_norm": 0.4182301570266979, + "learning_rate": 1.6778465753594264e-07, + "loss": 0.576, + "step": 7114 + }, + { + "epoch": 2.777127244340359, + "grad_norm": 0.4239363483377318, + "learning_rate": 1.672017996340969e-07, + "loss": 0.5757, + "step": 7115 + }, + { + "epoch": 2.7775175644028103, + "grad_norm": 0.4230955304607276, + "learning_rate": 1.66619938658078e-07, + "loss": 0.5582, + "step": 7116 + }, + { + "epoch": 2.7779078844652614, + "grad_norm": 0.4385625715485527, + "learning_rate": 1.6603907472791613e-07, + "loss": 0.5423, + "step": 7117 + }, + { + "epoch": 2.7782982045277125, + "grad_norm": 0.4475532337580822, + "learning_rate": 1.6545920796343262e-07, + "loss": 0.6174, + "step": 7118 + }, + { + "epoch": 2.778688524590164, + "grad_norm": 0.5152089323085329, + "learning_rate": 1.6488033848424457e-07, + "loss": 0.5911, + "step": 7119 + }, + { + "epoch": 2.779078844652615, + "grad_norm": 0.48070252571118477, + "learning_rate": 1.6430246640976255e-07, + "loss": 0.5762, + "step": 7120 + }, + { + "epoch": 2.779469164715066, + "grad_norm": 0.4395555990482159, + "learning_rate": 1.637255918591929e-07, + "loss": 0.5791, + "step": 7121 + }, + { + "epoch": 2.7798594847775178, + "grad_norm": 0.40520001273364203, + "learning_rate": 1.6314971495153375e-07, + "loss": 0.6236, + "step": 7122 + }, + { + "epoch": 2.780249804839969, + "grad_norm": 0.4313242322707033, + "learning_rate": 1.625748358055812e-07, + "loss": 0.5731, + "step": 7123 + }, + { + "epoch": 2.78064012490242, + "grad_norm": 0.4170925978040551, + "learning_rate": 1.6200095453992147e-07, + "loss": 0.5947, + "step": 7124 + }, + { + "epoch": 2.781030444964871, + "grad_norm": 0.41965817859998555, + "learning_rate": 1.6142807127293657e-07, + "loss": 0.5857, + "step": 7125 + }, + { + "epoch": 2.781420765027322, + "grad_norm": 0.43531824230702515, + "learning_rate": 1.6085618612280472e-07, + "loss": 0.5571, + "step": 7126 + }, + { + "epoch": 2.7818110850897737, + "grad_norm": 0.4378089673900418, + "learning_rate": 1.6028529920749436e-07, + "loss": 0.548, + "step": 7127 + }, + { + "epoch": 2.782201405152225, + "grad_norm": 0.42674371816952283, + "learning_rate": 1.597154106447707e-07, + "loss": 0.5651, + "step": 7128 + }, + { + "epoch": 2.782591725214676, + "grad_norm": 0.4417617568712184, + "learning_rate": 1.5914652055219192e-07, + "loss": 0.558, + "step": 7129 + }, + { + "epoch": 2.7829820452771274, + "grad_norm": 0.3895730979768377, + "learning_rate": 1.585786290471114e-07, + "loss": 0.6065, + "step": 7130 + }, + { + "epoch": 2.7833723653395785, + "grad_norm": 0.4706584571195325, + "learning_rate": 1.580117362466743e-07, + "loss": 0.5478, + "step": 7131 + }, + { + "epoch": 2.7837626854020296, + "grad_norm": 0.41004938154756415, + "learning_rate": 1.5744584226782266e-07, + "loss": 0.5732, + "step": 7132 + }, + { + "epoch": 2.7841530054644807, + "grad_norm": 0.4677696020508303, + "learning_rate": 1.5688094722728976e-07, + "loss": 0.5448, + "step": 7133 + }, + { + "epoch": 2.784543325526932, + "grad_norm": 0.4130874863786054, + "learning_rate": 1.5631705124160512e-07, + "loss": 0.5516, + "step": 7134 + }, + { + "epoch": 2.7849336455893834, + "grad_norm": 0.4575794674703957, + "learning_rate": 1.5575415442709075e-07, + "loss": 0.5252, + "step": 7135 + }, + { + "epoch": 2.7853239656518345, + "grad_norm": 0.4325342772143382, + "learning_rate": 1.5519225689986317e-07, + "loss": 0.5521, + "step": 7136 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.4433376673673986, + "learning_rate": 1.546313587758308e-07, + "loss": 0.5539, + "step": 7137 + }, + { + "epoch": 2.786104605776737, + "grad_norm": 0.4166030939200683, + "learning_rate": 1.5407146017069996e-07, + "loss": 0.5809, + "step": 7138 + }, + { + "epoch": 2.7864949258391882, + "grad_norm": 0.40488381812420243, + "learning_rate": 1.5351256119996661e-07, + "loss": 0.5912, + "step": 7139 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.46128538800128394, + "learning_rate": 1.529546619789235e-07, + "loss": 0.5706, + "step": 7140 + }, + { + "epoch": 2.7872755659640904, + "grad_norm": 0.4117099995410046, + "learning_rate": 1.5239776262265527e-07, + "loss": 0.5793, + "step": 7141 + }, + { + "epoch": 2.7876658860265415, + "grad_norm": 0.43349914484610685, + "learning_rate": 1.5184186324604054e-07, + "loss": 0.5489, + "step": 7142 + }, + { + "epoch": 2.788056206088993, + "grad_norm": 0.41718114213030616, + "learning_rate": 1.5128696396375376e-07, + "loss": 0.5332, + "step": 7143 + }, + { + "epoch": 2.788446526151444, + "grad_norm": 0.41667311603763096, + "learning_rate": 1.5073306489025942e-07, + "loss": 0.5824, + "step": 7144 + }, + { + "epoch": 2.7888368462138953, + "grad_norm": 0.46751514911527986, + "learning_rate": 1.5018016613982e-07, + "loss": 0.5811, + "step": 7145 + }, + { + "epoch": 2.789227166276347, + "grad_norm": 0.3996319133160042, + "learning_rate": 1.4962826782648653e-07, + "loss": 0.5546, + "step": 7146 + }, + { + "epoch": 2.789617486338798, + "grad_norm": 0.4335259021046352, + "learning_rate": 1.4907737006410906e-07, + "loss": 0.5962, + "step": 7147 + }, + { + "epoch": 2.790007806401249, + "grad_norm": 0.40197042704226543, + "learning_rate": 1.4852747296632609e-07, + "loss": 0.5913, + "step": 7148 + }, + { + "epoch": 2.7903981264637, + "grad_norm": 0.39458000418266187, + "learning_rate": 1.4797857664657466e-07, + "loss": 0.602, + "step": 7149 + }, + { + "epoch": 2.790788446526151, + "grad_norm": 0.4420461672802297, + "learning_rate": 1.4743068121808147e-07, + "loss": 0.5822, + "step": 7150 + }, + { + "epoch": 2.7911787665886028, + "grad_norm": 0.43122355994057415, + "learning_rate": 1.4688378679386884e-07, + "loss": 0.5556, + "step": 7151 + }, + { + "epoch": 2.791569086651054, + "grad_norm": 0.4208704978391195, + "learning_rate": 1.4633789348675098e-07, + "loss": 0.5436, + "step": 7152 + }, + { + "epoch": 2.791959406713505, + "grad_norm": 0.4444583501268702, + "learning_rate": 1.4579300140933893e-07, + "loss": 0.5942, + "step": 7153 + }, + { + "epoch": 2.7923497267759565, + "grad_norm": 0.4393566072556816, + "learning_rate": 1.4524911067403225e-07, + "loss": 0.5378, + "step": 7154 + }, + { + "epoch": 2.7927400468384076, + "grad_norm": 0.442075625812866, + "learning_rate": 1.4470622139302782e-07, + "loss": 0.5363, + "step": 7155 + }, + { + "epoch": 2.7931303669008587, + "grad_norm": 0.4449515165655583, + "learning_rate": 1.441643336783144e-07, + "loss": 0.5526, + "step": 7156 + }, + { + "epoch": 2.79352068696331, + "grad_norm": 0.48386946224543703, + "learning_rate": 1.436234476416748e-07, + "loss": 0.5805, + "step": 7157 + }, + { + "epoch": 2.793911007025761, + "grad_norm": 0.46779393846006784, + "learning_rate": 1.430835633946842e-07, + "loss": 0.5405, + "step": 7158 + }, + { + "epoch": 2.7943013270882124, + "grad_norm": 0.4039413267464233, + "learning_rate": 1.4254468104871188e-07, + "loss": 0.6025, + "step": 7159 + }, + { + "epoch": 2.7946916471506635, + "grad_norm": 0.3826786380921076, + "learning_rate": 1.4200680071492055e-07, + "loss": 0.5423, + "step": 7160 + }, + { + "epoch": 2.7950819672131146, + "grad_norm": 0.45637761577628194, + "learning_rate": 1.4146992250426539e-07, + "loss": 0.5695, + "step": 7161 + }, + { + "epoch": 2.795472287275566, + "grad_norm": 0.43214798040542873, + "learning_rate": 1.409340465274972e-07, + "loss": 0.5539, + "step": 7162 + }, + { + "epoch": 2.7958626073380173, + "grad_norm": 0.43418416659807235, + "learning_rate": 1.4039917289515538e-07, + "loss": 0.5682, + "step": 7163 + }, + { + "epoch": 2.7962529274004684, + "grad_norm": 0.437691048104566, + "learning_rate": 1.3986530171757828e-07, + "loss": 0.5801, + "step": 7164 + }, + { + "epoch": 2.7966432474629195, + "grad_norm": 0.4203906750241195, + "learning_rate": 1.393324331048923e-07, + "loss": 0.5544, + "step": 7165 + }, + { + "epoch": 2.7970335675253706, + "grad_norm": 0.4270586630271635, + "learning_rate": 1.3880056716702118e-07, + "loss": 0.5628, + "step": 7166 + }, + { + "epoch": 2.797423887587822, + "grad_norm": 0.41880498615482764, + "learning_rate": 1.3826970401367878e-07, + "loss": 0.5644, + "step": 7167 + }, + { + "epoch": 2.797814207650273, + "grad_norm": 0.4466850346145754, + "learning_rate": 1.3773984375437477e-07, + "loss": 0.5704, + "step": 7168 + }, + { + "epoch": 2.7982045277127243, + "grad_norm": 0.42760987847004267, + "learning_rate": 1.3721098649840837e-07, + "loss": 0.5728, + "step": 7169 + }, + { + "epoch": 2.798594847775176, + "grad_norm": 0.4384070738495563, + "learning_rate": 1.366831323548762e-07, + "loss": 0.5473, + "step": 7170 + }, + { + "epoch": 2.798985167837627, + "grad_norm": 0.44514727648459407, + "learning_rate": 1.3615628143266446e-07, + "loss": 0.5914, + "step": 7171 + }, + { + "epoch": 2.799375487900078, + "grad_norm": 0.4166956973746386, + "learning_rate": 1.356304338404546e-07, + "loss": 0.5612, + "step": 7172 + }, + { + "epoch": 2.799765807962529, + "grad_norm": 0.434790747940769, + "learning_rate": 1.351055896867187e-07, + "loss": 0.5368, + "step": 7173 + }, + { + "epoch": 2.8001561280249803, + "grad_norm": 0.43338058852042877, + "learning_rate": 1.345817490797252e-07, + "loss": 0.5297, + "step": 7174 + }, + { + "epoch": 2.800546448087432, + "grad_norm": 0.42591996607214433, + "learning_rate": 1.3405891212753264e-07, + "loss": 0.5744, + "step": 7175 + }, + { + "epoch": 2.800936768149883, + "grad_norm": 0.4500690096677793, + "learning_rate": 1.3353707893799362e-07, + "loss": 0.5416, + "step": 7176 + }, + { + "epoch": 2.801327088212334, + "grad_norm": 0.44776314506389203, + "learning_rate": 1.3301624961875427e-07, + "loss": 0.5511, + "step": 7177 + }, + { + "epoch": 2.8017174082747855, + "grad_norm": 0.4530852904246233, + "learning_rate": 1.3249642427725196e-07, + "loss": 0.5908, + "step": 7178 + }, + { + "epoch": 2.8021077283372366, + "grad_norm": 0.43630256665020956, + "learning_rate": 1.3197760302071928e-07, + "loss": 0.5842, + "step": 7179 + }, + { + "epoch": 2.8024980483996877, + "grad_norm": 0.4188721015473865, + "learning_rate": 1.3145978595617947e-07, + "loss": 0.5743, + "step": 7180 + }, + { + "epoch": 2.802888368462139, + "grad_norm": 0.4311557779373229, + "learning_rate": 1.309429731904499e-07, + "loss": 0.5917, + "step": 7181 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.4517043715777347, + "learning_rate": 1.304271648301403e-07, + "loss": 0.5894, + "step": 7182 + }, + { + "epoch": 2.8036690085870415, + "grad_norm": 0.45089877366044223, + "learning_rate": 1.2991236098165384e-07, + "loss": 0.5825, + "step": 7183 + }, + { + "epoch": 2.8040593286494926, + "grad_norm": 0.40660593634353137, + "learning_rate": 1.2939856175118503e-07, + "loss": 0.5961, + "step": 7184 + }, + { + "epoch": 2.8044496487119437, + "grad_norm": 0.43368119351721934, + "learning_rate": 1.2888576724472302e-07, + "loss": 0.5589, + "step": 7185 + }, + { + "epoch": 2.8048399687743952, + "grad_norm": 0.4459687580021802, + "learning_rate": 1.2837397756804815e-07, + "loss": 0.5878, + "step": 7186 + }, + { + "epoch": 2.8052302888368463, + "grad_norm": 0.4712009330415801, + "learning_rate": 1.2786319282673488e-07, + "loss": 0.5537, + "step": 7187 + }, + { + "epoch": 2.8056206088992974, + "grad_norm": 0.4594874946038633, + "learning_rate": 1.273534131261489e-07, + "loss": 0.5547, + "step": 7188 + }, + { + "epoch": 2.8060109289617485, + "grad_norm": 0.4460922915895374, + "learning_rate": 1.2684463857145002e-07, + "loss": 0.5368, + "step": 7189 + }, + { + "epoch": 2.8064012490241996, + "grad_norm": 0.4387813902551512, + "learning_rate": 1.2633686926758814e-07, + "loss": 0.553, + "step": 7190 + }, + { + "epoch": 2.806791569086651, + "grad_norm": 0.4744509658847511, + "learning_rate": 1.2583010531931005e-07, + "loss": 0.5585, + "step": 7191 + }, + { + "epoch": 2.8071818891491023, + "grad_norm": 0.4312672011273255, + "learning_rate": 1.2532434683115158e-07, + "loss": 0.5719, + "step": 7192 + }, + { + "epoch": 2.8075722092115534, + "grad_norm": 0.4346177849191625, + "learning_rate": 1.2481959390744147e-07, + "loss": 0.6094, + "step": 7193 + }, + { + "epoch": 2.807962529274005, + "grad_norm": 0.4273651421919471, + "learning_rate": 1.2431584665230313e-07, + "loss": 0.5761, + "step": 7194 + }, + { + "epoch": 2.808352849336456, + "grad_norm": 0.44141243586510764, + "learning_rate": 1.2381310516965062e-07, + "loss": 0.5914, + "step": 7195 + }, + { + "epoch": 2.808743169398907, + "grad_norm": 0.43579427430187695, + "learning_rate": 1.2331136956319157e-07, + "loss": 0.5633, + "step": 7196 + }, + { + "epoch": 2.809133489461358, + "grad_norm": 0.4379400251414754, + "learning_rate": 1.2281063993642538e-07, + "loss": 0.5442, + "step": 7197 + }, + { + "epoch": 2.8095238095238093, + "grad_norm": 0.42751906514891663, + "learning_rate": 1.2231091639264448e-07, + "loss": 0.6001, + "step": 7198 + }, + { + "epoch": 2.809914129586261, + "grad_norm": 0.41791925672515035, + "learning_rate": 1.2181219903493247e-07, + "loss": 0.5489, + "step": 7199 + }, + { + "epoch": 2.810304449648712, + "grad_norm": 0.426050084122952, + "learning_rate": 1.2131448796616817e-07, + "loss": 0.5508, + "step": 7200 + }, + { + "epoch": 2.810694769711163, + "grad_norm": 0.3914437884667763, + "learning_rate": 1.208177832890195e-07, + "loss": 0.562, + "step": 7201 + }, + { + "epoch": 2.8110850897736146, + "grad_norm": 0.4361686652032966, + "learning_rate": 1.2032208510594945e-07, + "loss": 0.5453, + "step": 7202 + }, + { + "epoch": 2.8114754098360657, + "grad_norm": 0.45988145047508433, + "learning_rate": 1.1982739351921124e-07, + "loss": 0.5611, + "step": 7203 + }, + { + "epoch": 2.811865729898517, + "grad_norm": 0.42433294415861855, + "learning_rate": 1.1933370863085325e-07, + "loss": 0.5648, + "step": 7204 + }, + { + "epoch": 2.812256049960968, + "grad_norm": 0.4147655131377057, + "learning_rate": 1.188410305427129e-07, + "loss": 0.6016, + "step": 7205 + }, + { + "epoch": 2.812646370023419, + "grad_norm": 0.4054405962032617, + "learning_rate": 1.1834935935642223e-07, + "loss": 0.5775, + "step": 7206 + }, + { + "epoch": 2.8130366900858705, + "grad_norm": 0.42046929755541673, + "learning_rate": 1.1785869517340398e-07, + "loss": 0.5537, + "step": 7207 + }, + { + "epoch": 2.8134270101483216, + "grad_norm": 0.46472150107684157, + "learning_rate": 1.1736903809487499e-07, + "loss": 0.6044, + "step": 7208 + }, + { + "epoch": 2.8138173302107727, + "grad_norm": 0.44365877312967517, + "learning_rate": 1.1688038822184278e-07, + "loss": 0.5626, + "step": 7209 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.39888058734819604, + "learning_rate": 1.1639274565510783e-07, + "loss": 0.5613, + "step": 7210 + }, + { + "epoch": 2.8145979703356754, + "grad_norm": 0.44419922570586967, + "learning_rate": 1.1590611049526302e-07, + "loss": 0.5641, + "step": 7211 + }, + { + "epoch": 2.8149882903981265, + "grad_norm": 0.4211814441701766, + "learning_rate": 1.1542048284269247e-07, + "loss": 0.6136, + "step": 7212 + }, + { + "epoch": 2.8153786104605776, + "grad_norm": 0.42329183528715686, + "learning_rate": 1.149358627975733e-07, + "loss": 0.5376, + "step": 7213 + }, + { + "epoch": 2.8157689305230287, + "grad_norm": 0.39468948820089605, + "learning_rate": 1.144522504598744e-07, + "loss": 0.5302, + "step": 7214 + }, + { + "epoch": 2.8161592505854802, + "grad_norm": 0.39811511485425655, + "learning_rate": 1.1396964592935877e-07, + "loss": 0.5559, + "step": 7215 + }, + { + "epoch": 2.8165495706479313, + "grad_norm": 0.41841396282512033, + "learning_rate": 1.1348804930557678e-07, + "loss": 0.5661, + "step": 7216 + }, + { + "epoch": 2.8169398907103824, + "grad_norm": 0.4388679284004303, + "learning_rate": 1.1300746068787505e-07, + "loss": 0.5707, + "step": 7217 + }, + { + "epoch": 2.817330210772834, + "grad_norm": 0.4388956806072438, + "learning_rate": 1.1252788017539151e-07, + "loss": 0.5588, + "step": 7218 + }, + { + "epoch": 2.817720530835285, + "grad_norm": 0.41477109813678575, + "learning_rate": 1.1204930786705537e-07, + "loss": 0.5841, + "step": 7219 + }, + { + "epoch": 2.818110850897736, + "grad_norm": 0.43181059797267884, + "learning_rate": 1.1157174386158764e-07, + "loss": 0.5584, + "step": 7220 + }, + { + "epoch": 2.8185011709601873, + "grad_norm": 0.4330835608679367, + "learning_rate": 1.1109518825750176e-07, + "loss": 0.5924, + "step": 7221 + }, + { + "epoch": 2.8188914910226384, + "grad_norm": 0.4310089479449416, + "learning_rate": 1.1061964115310463e-07, + "loss": 0.5722, + "step": 7222 + }, + { + "epoch": 2.81928181108509, + "grad_norm": 0.40791492113810146, + "learning_rate": 1.101451026464917e-07, + "loss": 0.5564, + "step": 7223 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.46185699095984833, + "learning_rate": 1.0967157283555463e-07, + "loss": 0.5828, + "step": 7224 + }, + { + "epoch": 2.820062451209992, + "grad_norm": 0.40706425611368613, + "learning_rate": 1.0919905181797253e-07, + "loss": 0.5259, + "step": 7225 + }, + { + "epoch": 2.8204527712724436, + "grad_norm": 0.44723710010054085, + "learning_rate": 1.0872753969121964e-07, + "loss": 0.5823, + "step": 7226 + }, + { + "epoch": 2.8208430913348947, + "grad_norm": 0.44891343600668804, + "learning_rate": 1.0825703655256092e-07, + "loss": 0.5774, + "step": 7227 + }, + { + "epoch": 2.821233411397346, + "grad_norm": 0.4056684281434897, + "learning_rate": 1.0778754249905377e-07, + "loss": 0.5995, + "step": 7228 + }, + { + "epoch": 2.821623731459797, + "grad_norm": 0.46778341057002826, + "learning_rate": 1.0731905762754624e-07, + "loss": 0.6056, + "step": 7229 + }, + { + "epoch": 2.822014051522248, + "grad_norm": 0.4345020579168768, + "learning_rate": 1.0685158203467994e-07, + "loss": 0.6027, + "step": 7230 + }, + { + "epoch": 2.8224043715846996, + "grad_norm": 0.429422226470218, + "learning_rate": 1.0638511581688604e-07, + "loss": 0.5347, + "step": 7231 + }, + { + "epoch": 2.8227946916471507, + "grad_norm": 0.43510813622415734, + "learning_rate": 1.0591965907038982e-07, + "loss": 0.5921, + "step": 7232 + }, + { + "epoch": 2.823185011709602, + "grad_norm": 0.42403344223956413, + "learning_rate": 1.0545521189120722e-07, + "loss": 0.5296, + "step": 7233 + }, + { + "epoch": 2.8235753317720533, + "grad_norm": 0.44264725371646296, + "learning_rate": 1.0499177437514551e-07, + "loss": 0.5452, + "step": 7234 + }, + { + "epoch": 2.8239656518345044, + "grad_norm": 0.4127334420045309, + "learning_rate": 1.0452934661780323e-07, + "loss": 0.577, + "step": 7235 + }, + { + "epoch": 2.8243559718969555, + "grad_norm": 0.42017213831437417, + "learning_rate": 1.0406792871457349e-07, + "loss": 0.5527, + "step": 7236 + }, + { + "epoch": 2.8247462919594066, + "grad_norm": 0.4125006284399899, + "learning_rate": 1.0360752076063851e-07, + "loss": 0.5647, + "step": 7237 + }, + { + "epoch": 2.8251366120218577, + "grad_norm": 0.4256270124735977, + "learning_rate": 1.031481228509712e-07, + "loss": 0.577, + "step": 7238 + }, + { + "epoch": 2.8255269320843093, + "grad_norm": 0.41393507813955327, + "learning_rate": 1.026897350803402e-07, + "loss": 0.582, + "step": 7239 + }, + { + "epoch": 2.8259172521467604, + "grad_norm": 0.4165184262353585, + "learning_rate": 1.0223235754330096e-07, + "loss": 0.5737, + "step": 7240 + }, + { + "epoch": 2.8263075722092115, + "grad_norm": 0.44915101735430285, + "learning_rate": 1.0177599033420471e-07, + "loss": 0.5963, + "step": 7241 + }, + { + "epoch": 2.826697892271663, + "grad_norm": 0.40842558083001185, + "learning_rate": 1.0132063354719168e-07, + "loss": 0.5642, + "step": 7242 + }, + { + "epoch": 2.827088212334114, + "grad_norm": 0.44867084548325664, + "learning_rate": 1.0086628727619396e-07, + "loss": 0.5726, + "step": 7243 + }, + { + "epoch": 2.827478532396565, + "grad_norm": 0.4545593650300866, + "learning_rate": 1.0041295161493603e-07, + "loss": 0.5479, + "step": 7244 + }, + { + "epoch": 2.8278688524590163, + "grad_norm": 0.43548158002640464, + "learning_rate": 9.996062665693362e-08, + "loss": 0.5922, + "step": 7245 + }, + { + "epoch": 2.8282591725214674, + "grad_norm": 0.4192522818917228, + "learning_rate": 9.950931249549323e-08, + "loss": 0.555, + "step": 7246 + }, + { + "epoch": 2.828649492583919, + "grad_norm": 0.40267948287725613, + "learning_rate": 9.90590092237148e-08, + "loss": 0.537, + "step": 7247 + }, + { + "epoch": 2.82903981264637, + "grad_norm": 0.3965828875606547, + "learning_rate": 9.860971693448685e-08, + "loss": 0.5534, + "step": 7248 + }, + { + "epoch": 2.829430132708821, + "grad_norm": 0.42294611405507476, + "learning_rate": 9.816143572049185e-08, + "loss": 0.5322, + "step": 7249 + }, + { + "epoch": 2.8298204527712727, + "grad_norm": 0.3976445564076963, + "learning_rate": 9.771416567420256e-08, + "loss": 0.5749, + "step": 7250 + }, + { + "epoch": 2.830210772833724, + "grad_norm": 0.38182859861444524, + "learning_rate": 9.72679068878829e-08, + "loss": 0.5998, + "step": 7251 + }, + { + "epoch": 2.830601092896175, + "grad_norm": 0.3937708144049471, + "learning_rate": 9.682265945358871e-08, + "loss": 0.5851, + "step": 7252 + }, + { + "epoch": 2.830991412958626, + "grad_norm": 0.4116656554819177, + "learning_rate": 9.637842346316817e-08, + "loss": 0.6002, + "step": 7253 + }, + { + "epoch": 2.831381733021077, + "grad_norm": 0.4285522349850381, + "learning_rate": 9.59351990082591e-08, + "loss": 0.5121, + "step": 7254 + }, + { + "epoch": 2.8317720530835286, + "grad_norm": 0.43313346164396377, + "learning_rate": 9.549298618029057e-08, + "loss": 0.5896, + "step": 7255 + }, + { + "epoch": 2.8321623731459797, + "grad_norm": 0.4205155102168748, + "learning_rate": 9.505178507048463e-08, + "loss": 0.5707, + "step": 7256 + }, + { + "epoch": 2.832552693208431, + "grad_norm": 0.4070949518950773, + "learning_rate": 9.461159576985291e-08, + "loss": 0.5607, + "step": 7257 + }, + { + "epoch": 2.8329430132708824, + "grad_norm": 0.3988837903443515, + "learning_rate": 9.417241836920055e-08, + "loss": 0.5753, + "step": 7258 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.43273776258772967, + "learning_rate": 9.373425295912176e-08, + "loss": 0.5704, + "step": 7259 + }, + { + "epoch": 2.8337236533957846, + "grad_norm": 0.4070598520795719, + "learning_rate": 9.3297099630002e-08, + "loss": 0.5633, + "step": 7260 + }, + { + "epoch": 2.8341139734582357, + "grad_norm": 0.46954266810692674, + "learning_rate": 9.28609584720197e-08, + "loss": 0.5736, + "step": 7261 + }, + { + "epoch": 2.834504293520687, + "grad_norm": 0.39259411920307064, + "learning_rate": 9.242582957514346e-08, + "loss": 0.6045, + "step": 7262 + }, + { + "epoch": 2.8348946135831383, + "grad_norm": 0.42312666909092883, + "learning_rate": 9.199171302913257e-08, + "loss": 0.5721, + "step": 7263 + }, + { + "epoch": 2.8352849336455894, + "grad_norm": 0.3900866936106901, + "learning_rate": 9.155860892353874e-08, + "loss": 0.6084, + "step": 7264 + }, + { + "epoch": 2.8356752537080405, + "grad_norm": 0.4828365616454378, + "learning_rate": 9.112651734770328e-08, + "loss": 0.5916, + "step": 7265 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.43178107864948084, + "learning_rate": 9.069543839076045e-08, + "loss": 0.5738, + "step": 7266 + }, + { + "epoch": 2.836455893832943, + "grad_norm": 0.41856672346014945, + "learning_rate": 9.026537214163466e-08, + "loss": 0.5785, + "step": 7267 + }, + { + "epoch": 2.8368462138953943, + "grad_norm": 0.3896826343798208, + "learning_rate": 8.983631868904052e-08, + "loss": 0.5861, + "step": 7268 + }, + { + "epoch": 2.8372365339578454, + "grad_norm": 0.4059171984053738, + "learning_rate": 8.9408278121485e-08, + "loss": 0.5463, + "step": 7269 + }, + { + "epoch": 2.8376268540202965, + "grad_norm": 0.4334002283399777, + "learning_rate": 8.898125052726581e-08, + "loss": 0.5837, + "step": 7270 + }, + { + "epoch": 2.838017174082748, + "grad_norm": 0.4378392722513999, + "learning_rate": 8.855523599447192e-08, + "loss": 0.5884, + "step": 7271 + }, + { + "epoch": 2.838407494145199, + "grad_norm": 0.43095017023968324, + "learning_rate": 8.813023461098191e-08, + "loss": 0.5524, + "step": 7272 + }, + { + "epoch": 2.83879781420765, + "grad_norm": 0.4153707231345609, + "learning_rate": 8.770624646446846e-08, + "loss": 0.5658, + "step": 7273 + }, + { + "epoch": 2.8391881342701017, + "grad_norm": 0.43341232103976607, + "learning_rate": 8.728327164239104e-08, + "loss": 0.6172, + "step": 7274 + }, + { + "epoch": 2.839578454332553, + "grad_norm": 0.4026276377998231, + "learning_rate": 8.686131023200428e-08, + "loss": 0.5696, + "step": 7275 + }, + { + "epoch": 2.839968774395004, + "grad_norm": 0.4472791257015232, + "learning_rate": 8.644036232035024e-08, + "loss": 0.5625, + "step": 7276 + }, + { + "epoch": 2.840359094457455, + "grad_norm": 0.4083170896731862, + "learning_rate": 8.602042799426558e-08, + "loss": 0.6051, + "step": 7277 + }, + { + "epoch": 2.840749414519906, + "grad_norm": 0.43971223073381, + "learning_rate": 8.56015073403732e-08, + "loss": 0.5679, + "step": 7278 + }, + { + "epoch": 2.8411397345823577, + "grad_norm": 0.44764046286151415, + "learning_rate": 8.518360044509066e-08, + "loss": 0.552, + "step": 7279 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.41139121126700284, + "learning_rate": 8.476670739462511e-08, + "loss": 0.5547, + "step": 7280 + }, + { + "epoch": 2.84192037470726, + "grad_norm": 0.4099007119704445, + "learning_rate": 8.435082827497499e-08, + "loss": 0.5523, + "step": 7281 + }, + { + "epoch": 2.8423106947697114, + "grad_norm": 0.4169242456096108, + "learning_rate": 8.393596317192832e-08, + "loss": 0.6004, + "step": 7282 + }, + { + "epoch": 2.8427010148321625, + "grad_norm": 0.43554215876100816, + "learning_rate": 8.35221121710661e-08, + "loss": 0.5641, + "step": 7283 + }, + { + "epoch": 2.8430913348946136, + "grad_norm": 0.4363228379443541, + "learning_rate": 8.310927535775837e-08, + "loss": 0.5697, + "step": 7284 + }, + { + "epoch": 2.8434816549570647, + "grad_norm": 0.43390077608815675, + "learning_rate": 8.26974528171659e-08, + "loss": 0.57, + "step": 7285 + }, + { + "epoch": 2.843871975019516, + "grad_norm": 0.4227578338938979, + "learning_rate": 8.228664463424185e-08, + "loss": 0.5582, + "step": 7286 + }, + { + "epoch": 2.8442622950819674, + "grad_norm": 0.42738842054967463, + "learning_rate": 8.187685089372843e-08, + "loss": 0.5956, + "step": 7287 + }, + { + "epoch": 2.8446526151444185, + "grad_norm": 0.412341641880661, + "learning_rate": 8.146807168015969e-08, + "loss": 0.581, + "step": 7288 + }, + { + "epoch": 2.8450429352068696, + "grad_norm": 0.3999063048071383, + "learning_rate": 8.106030707785873e-08, + "loss": 0.5654, + "step": 7289 + }, + { + "epoch": 2.845433255269321, + "grad_norm": 0.397766041048863, + "learning_rate": 8.06535571709427e-08, + "loss": 0.593, + "step": 7290 + }, + { + "epoch": 2.845823575331772, + "grad_norm": 0.4095380055942889, + "learning_rate": 8.024782204331505e-08, + "loss": 0.5718, + "step": 7291 + }, + { + "epoch": 2.8462138953942233, + "grad_norm": 0.45140230028690076, + "learning_rate": 7.984310177867438e-08, + "loss": 0.5893, + "step": 7292 + }, + { + "epoch": 2.8466042154566744, + "grad_norm": 0.4255081673902413, + "learning_rate": 7.943939646050613e-08, + "loss": 0.5775, + "step": 7293 + }, + { + "epoch": 2.8469945355191255, + "grad_norm": 0.3907952230467076, + "learning_rate": 7.903670617208925e-08, + "loss": 0.5623, + "step": 7294 + }, + { + "epoch": 2.847384855581577, + "grad_norm": 0.44980183076933444, + "learning_rate": 7.863503099649062e-08, + "loss": 0.5394, + "step": 7295 + }, + { + "epoch": 2.847775175644028, + "grad_norm": 0.4169230998964158, + "learning_rate": 7.82343710165706e-08, + "loss": 0.5235, + "step": 7296 + }, + { + "epoch": 2.8481654957064793, + "grad_norm": 0.44632228509891847, + "learning_rate": 7.783472631497702e-08, + "loss": 0.551, + "step": 7297 + }, + { + "epoch": 2.848555815768931, + "grad_norm": 0.4243680139884923, + "learning_rate": 7.743609697415167e-08, + "loss": 0.5664, + "step": 7298 + }, + { + "epoch": 2.848946135831382, + "grad_norm": 0.41074129402170834, + "learning_rate": 7.703848307632378e-08, + "loss": 0.6067, + "step": 7299 + }, + { + "epoch": 2.849336455893833, + "grad_norm": 0.4235887356112223, + "learning_rate": 7.664188470351552e-08, + "loss": 0.5392, + "step": 7300 + }, + { + "epoch": 2.849726775956284, + "grad_norm": 0.39391334170278053, + "learning_rate": 7.624630193753813e-08, + "loss": 0.5887, + "step": 7301 + }, + { + "epoch": 2.850117096018735, + "grad_norm": 0.4149563718779686, + "learning_rate": 7.585173485999297e-08, + "loss": 0.5671, + "step": 7302 + }, + { + "epoch": 2.8505074160811867, + "grad_norm": 0.4138227561479417, + "learning_rate": 7.545818355227442e-08, + "loss": 0.5621, + "step": 7303 + }, + { + "epoch": 2.850897736143638, + "grad_norm": 0.4221063018394153, + "learning_rate": 7.506564809556416e-08, + "loss": 0.5344, + "step": 7304 + }, + { + "epoch": 2.851288056206089, + "grad_norm": 0.428958432412772, + "learning_rate": 7.467412857083578e-08, + "loss": 0.569, + "step": 7305 + }, + { + "epoch": 2.85167837626854, + "grad_norm": 0.44355348176732334, + "learning_rate": 7.428362505885356e-08, + "loss": 0.5604, + "step": 7306 + }, + { + "epoch": 2.852068696330991, + "grad_norm": 0.415416915550216, + "learning_rate": 7.389413764017194e-08, + "loss": 0.5766, + "step": 7307 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.4233132075240474, + "learning_rate": 7.350566639513557e-08, + "loss": 0.5494, + "step": 7308 + }, + { + "epoch": 2.852849336455894, + "grad_norm": 0.4272894220632814, + "learning_rate": 7.311821140387975e-08, + "loss": 0.5609, + "step": 7309 + }, + { + "epoch": 2.853239656518345, + "grad_norm": 0.3690117175273333, + "learning_rate": 7.273177274632948e-08, + "loss": 0.6216, + "step": 7310 + }, + { + "epoch": 2.8536299765807964, + "grad_norm": 0.4850739306878166, + "learning_rate": 7.234635050220151e-08, + "loss": 0.5436, + "step": 7311 + }, + { + "epoch": 2.8540202966432475, + "grad_norm": 0.41994771439977857, + "learning_rate": 7.196194475100171e-08, + "loss": 0.5666, + "step": 7312 + }, + { + "epoch": 2.8544106167056986, + "grad_norm": 0.42862248381546664, + "learning_rate": 7.157855557202608e-08, + "loss": 0.5585, + "step": 7313 + }, + { + "epoch": 2.8548009367681497, + "grad_norm": 0.43749931845489787, + "learning_rate": 7.119618304436193e-08, + "loss": 0.5652, + "step": 7314 + }, + { + "epoch": 2.855191256830601, + "grad_norm": 0.45665528434697367, + "learning_rate": 7.081482724688616e-08, + "loss": 0.5323, + "step": 7315 + }, + { + "epoch": 2.8555815768930524, + "grad_norm": 0.474997501208428, + "learning_rate": 7.043448825826582e-08, + "loss": 0.5378, + "step": 7316 + }, + { + "epoch": 2.8559718969555035, + "grad_norm": 0.44428642424877185, + "learning_rate": 7.005516615695929e-08, + "loss": 0.5659, + "step": 7317 + }, + { + "epoch": 2.8563622170179546, + "grad_norm": 0.4116131057138872, + "learning_rate": 6.967686102121396e-08, + "loss": 0.6049, + "step": 7318 + }, + { + "epoch": 2.856752537080406, + "grad_norm": 0.4407084857549753, + "learning_rate": 6.929957292906741e-08, + "loss": 0.5286, + "step": 7319 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.4446744725902647, + "learning_rate": 6.892330195834906e-08, + "loss": 0.6167, + "step": 7320 + }, + { + "epoch": 2.8575331772053083, + "grad_norm": 0.42672855067243926, + "learning_rate": 6.854804818667627e-08, + "loss": 0.5761, + "step": 7321 + }, + { + "epoch": 2.8579234972677594, + "grad_norm": 0.45963097932228686, + "learning_rate": 6.817381169145821e-08, + "loss": 0.5589, + "step": 7322 + }, + { + "epoch": 2.8583138173302105, + "grad_norm": 0.41629961744245003, + "learning_rate": 6.780059254989313e-08, + "loss": 0.5255, + "step": 7323 + }, + { + "epoch": 2.858704137392662, + "grad_norm": 0.4808691012787334, + "learning_rate": 6.742839083897112e-08, + "loss": 0.5635, + "step": 7324 + }, + { + "epoch": 2.859094457455113, + "grad_norm": 0.406458104527266, + "learning_rate": 6.705720663546911e-08, + "loss": 0.5737, + "step": 7325 + }, + { + "epoch": 2.8594847775175642, + "grad_norm": 0.41880515353786757, + "learning_rate": 6.668704001595861e-08, + "loss": 0.6007, + "step": 7326 + }, + { + "epoch": 2.859875097580016, + "grad_norm": 0.39124955993363814, + "learning_rate": 6.631789105679687e-08, + "loss": 0.5779, + "step": 7327 + }, + { + "epoch": 2.860265417642467, + "grad_norm": 0.4571217738121692, + "learning_rate": 6.59497598341341e-08, + "loss": 0.5722, + "step": 7328 + }, + { + "epoch": 2.860655737704918, + "grad_norm": 0.4067879280897085, + "learning_rate": 6.558264642390955e-08, + "loss": 0.5986, + "step": 7329 + }, + { + "epoch": 2.861046057767369, + "grad_norm": 0.4174494059662485, + "learning_rate": 6.521655090185319e-08, + "loss": 0.5727, + "step": 7330 + }, + { + "epoch": 2.86143637782982, + "grad_norm": 0.47085790248529985, + "learning_rate": 6.485147334348296e-08, + "loss": 0.5776, + "step": 7331 + }, + { + "epoch": 2.8618266978922717, + "grad_norm": 0.4274921579066053, + "learning_rate": 6.448741382410972e-08, + "loss": 0.5641, + "step": 7332 + }, + { + "epoch": 2.862217017954723, + "grad_norm": 0.4524610270858827, + "learning_rate": 6.412437241883173e-08, + "loss": 0.5358, + "step": 7333 + }, + { + "epoch": 2.862607338017174, + "grad_norm": 0.4726806842004813, + "learning_rate": 6.376234920253965e-08, + "loss": 0.5801, + "step": 7334 + }, + { + "epoch": 2.8629976580796255, + "grad_norm": 0.4152186652007084, + "learning_rate": 6.340134424991207e-08, + "loss": 0.5553, + "step": 7335 + }, + { + "epoch": 2.8633879781420766, + "grad_norm": 0.4034843113995616, + "learning_rate": 6.30413576354183e-08, + "loss": 0.5649, + "step": 7336 + }, + { + "epoch": 2.8637782982045277, + "grad_norm": 0.43708903348623496, + "learning_rate": 6.268238943331784e-08, + "loss": 0.5931, + "step": 7337 + }, + { + "epoch": 2.8641686182669788, + "grad_norm": 0.43411718714684855, + "learning_rate": 6.232443971765923e-08, + "loss": 0.5714, + "step": 7338 + }, + { + "epoch": 2.86455893832943, + "grad_norm": 0.48835079673680476, + "learning_rate": 6.196750856228229e-08, + "loss": 0.5631, + "step": 7339 + }, + { + "epoch": 2.8649492583918814, + "grad_norm": 0.41575584031333207, + "learning_rate": 6.16115960408159e-08, + "loss": 0.5613, + "step": 7340 + }, + { + "epoch": 2.8653395784543325, + "grad_norm": 0.4109394386320095, + "learning_rate": 6.125670222667857e-08, + "loss": 0.6039, + "step": 7341 + }, + { + "epoch": 2.8657298985167836, + "grad_norm": 0.3956645649921057, + "learning_rate": 6.090282719307894e-08, + "loss": 0.6091, + "step": 7342 + }, + { + "epoch": 2.866120218579235, + "grad_norm": 0.408175007706102, + "learning_rate": 6.054997101301585e-08, + "loss": 0.5709, + "step": 7343 + }, + { + "epoch": 2.8665105386416863, + "grad_norm": 0.41013021602401506, + "learning_rate": 6.019813375927718e-08, + "loss": 0.563, + "step": 7344 + }, + { + "epoch": 2.8669008587041374, + "grad_norm": 0.4281724269819442, + "learning_rate": 5.984731550444156e-08, + "loss": 0.5659, + "step": 7345 + }, + { + "epoch": 2.8672911787665885, + "grad_norm": 0.4383424632441661, + "learning_rate": 5.949751632087719e-08, + "loss": 0.5925, + "step": 7346 + }, + { + "epoch": 2.8676814988290396, + "grad_norm": 0.4471560279401305, + "learning_rate": 5.914873628074136e-08, + "loss": 0.546, + "step": 7347 + }, + { + "epoch": 2.868071818891491, + "grad_norm": 0.4199366489579412, + "learning_rate": 5.880097545598151e-08, + "loss": 0.5561, + "step": 7348 + }, + { + "epoch": 2.868462138953942, + "grad_norm": 0.4372110853010439, + "learning_rate": 5.845423391833527e-08, + "loss": 0.5987, + "step": 7349 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.4431722836105209, + "learning_rate": 5.81085117393293e-08, + "loss": 0.5895, + "step": 7350 + }, + { + "epoch": 2.869242779078845, + "grad_norm": 0.44270119151722903, + "learning_rate": 5.7763808990280445e-08, + "loss": 0.5414, + "step": 7351 + }, + { + "epoch": 2.869633099141296, + "grad_norm": 0.42431454045202727, + "learning_rate": 5.7420125742295164e-08, + "loss": 0.5551, + "step": 7352 + }, + { + "epoch": 2.870023419203747, + "grad_norm": 0.42024917225338915, + "learning_rate": 5.7077462066270075e-08, + "loss": 0.5951, + "step": 7353 + }, + { + "epoch": 2.870413739266198, + "grad_norm": 0.42817408336790147, + "learning_rate": 5.67358180328903e-08, + "loss": 0.5578, + "step": 7354 + }, + { + "epoch": 2.8708040593286492, + "grad_norm": 0.39956689445911475, + "learning_rate": 5.6395193712631134e-08, + "loss": 0.559, + "step": 7355 + }, + { + "epoch": 2.871194379391101, + "grad_norm": 0.44015188671152594, + "learning_rate": 5.6055589175758596e-08, + "loss": 0.5884, + "step": 7356 + }, + { + "epoch": 2.871584699453552, + "grad_norm": 0.42983027362481824, + "learning_rate": 5.571700449232664e-08, + "loss": 0.5758, + "step": 7357 + }, + { + "epoch": 2.871975019516003, + "grad_norm": 0.40516111229420826, + "learning_rate": 5.537943973218052e-08, + "loss": 0.5812, + "step": 7358 + }, + { + "epoch": 2.8723653395784545, + "grad_norm": 0.4770711526781047, + "learning_rate": 5.5042894964952855e-08, + "loss": 0.5779, + "step": 7359 + }, + { + "epoch": 2.8727556596409056, + "grad_norm": 0.4591166603239273, + "learning_rate": 5.470737026006867e-08, + "loss": 0.5799, + "step": 7360 + }, + { + "epoch": 2.8731459797033567, + "grad_norm": 0.4128896524583373, + "learning_rate": 5.4372865686740386e-08, + "loss": 0.58, + "step": 7361 + }, + { + "epoch": 2.873536299765808, + "grad_norm": 0.4344080250812791, + "learning_rate": 5.403938131397113e-08, + "loss": 0.5584, + "step": 7362 + }, + { + "epoch": 2.873926619828259, + "grad_norm": 0.41140488141332837, + "learning_rate": 5.370691721055255e-08, + "loss": 0.5835, + "step": 7363 + }, + { + "epoch": 2.8743169398907105, + "grad_norm": 0.43420961492069265, + "learning_rate": 5.337547344506699e-08, + "loss": 0.5444, + "step": 7364 + }, + { + "epoch": 2.8747072599531616, + "grad_norm": 0.42461359682510297, + "learning_rate": 5.30450500858859e-08, + "loss": 0.5643, + "step": 7365 + }, + { + "epoch": 2.8750975800156127, + "grad_norm": 0.4477818474077431, + "learning_rate": 5.271564720116917e-08, + "loss": 0.5672, + "step": 7366 + }, + { + "epoch": 2.875487900078064, + "grad_norm": 0.42055221400020515, + "learning_rate": 5.238726485886858e-08, + "loss": 0.592, + "step": 7367 + }, + { + "epoch": 2.8758782201405153, + "grad_norm": 0.446523539482072, + "learning_rate": 5.2059903126722154e-08, + "loss": 0.5887, + "step": 7368 + }, + { + "epoch": 2.8762685402029664, + "grad_norm": 0.3922085007126456, + "learning_rate": 5.173356207226088e-08, + "loss": 0.5759, + "step": 7369 + }, + { + "epoch": 2.8766588602654175, + "grad_norm": 0.426443523747311, + "learning_rate": 5.140824176280257e-08, + "loss": 0.5788, + "step": 7370 + }, + { + "epoch": 2.8770491803278686, + "grad_norm": 0.44872761961470276, + "learning_rate": 5.108394226545577e-08, + "loss": 0.5657, + "step": 7371 + }, + { + "epoch": 2.87743950039032, + "grad_norm": 0.4425522133426709, + "learning_rate": 5.076066364711752e-08, + "loss": 0.5504, + "step": 7372 + }, + { + "epoch": 2.8778298204527712, + "grad_norm": 0.45029560953981695, + "learning_rate": 5.043840597447558e-08, + "loss": 0.5575, + "step": 7373 + }, + { + "epoch": 2.8782201405152223, + "grad_norm": 0.4106267153479887, + "learning_rate": 5.011716931400623e-08, + "loss": 0.5547, + "step": 7374 + }, + { + "epoch": 2.878610460577674, + "grad_norm": 0.480971343356328, + "learning_rate": 4.97969537319748e-08, + "loss": 0.5516, + "step": 7375 + }, + { + "epoch": 2.879000780640125, + "grad_norm": 0.4519861199646894, + "learning_rate": 4.9477759294435654e-08, + "loss": 0.5468, + "step": 7376 + }, + { + "epoch": 2.879391100702576, + "grad_norm": 0.41778516544769273, + "learning_rate": 4.915958606723559e-08, + "loss": 0.5662, + "step": 7377 + }, + { + "epoch": 2.879781420765027, + "grad_norm": 0.39955370820804925, + "learning_rate": 4.8842434116005974e-08, + "loss": 0.5353, + "step": 7378 + }, + { + "epoch": 2.8801717408274783, + "grad_norm": 0.41725043273208995, + "learning_rate": 4.852630350617171e-08, + "loss": 0.5781, + "step": 7379 + }, + { + "epoch": 2.88056206088993, + "grad_norm": 0.42326264860016377, + "learning_rate": 4.821119430294452e-08, + "loss": 0.5625, + "step": 7380 + }, + { + "epoch": 2.880952380952381, + "grad_norm": 0.4335352737241384, + "learning_rate": 4.789710657132629e-08, + "loss": 0.5502, + "step": 7381 + }, + { + "epoch": 2.881342701014832, + "grad_norm": 0.4168947742749504, + "learning_rate": 4.758404037610853e-08, + "loss": 0.5491, + "step": 7382 + }, + { + "epoch": 2.8817330210772836, + "grad_norm": 0.43404416285397146, + "learning_rate": 4.727199578187125e-08, + "loss": 0.553, + "step": 7383 + }, + { + "epoch": 2.8821233411397347, + "grad_norm": 0.45597829285568475, + "learning_rate": 4.696097285298351e-08, + "loss": 0.572, + "step": 7384 + }, + { + "epoch": 2.8825136612021858, + "grad_norm": 0.40656163513859706, + "learning_rate": 4.665097165360455e-08, + "loss": 0.568, + "step": 7385 + }, + { + "epoch": 2.882903981264637, + "grad_norm": 0.39228393654778526, + "learning_rate": 4.634199224768321e-08, + "loss": 0.5517, + "step": 7386 + }, + { + "epoch": 2.883294301327088, + "grad_norm": 0.40886982449683834, + "learning_rate": 4.6034034698955175e-08, + "loss": 0.5158, + "step": 7387 + }, + { + "epoch": 2.8836846213895395, + "grad_norm": 0.4222248674037214, + "learning_rate": 4.572709907094908e-08, + "loss": 0.5457, + "step": 7388 + }, + { + "epoch": 2.8840749414519906, + "grad_norm": 0.44957609668452486, + "learning_rate": 4.542118542697871e-08, + "loss": 0.5192, + "step": 7389 + }, + { + "epoch": 2.8844652615144417, + "grad_norm": 0.42831570545396763, + "learning_rate": 4.511629383015026e-08, + "loss": 0.5801, + "step": 7390 + }, + { + "epoch": 2.8848555815768933, + "grad_norm": 0.4403058174445071, + "learning_rate": 4.481242434335675e-08, + "loss": 0.548, + "step": 7391 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 0.4527736008628337, + "learning_rate": 4.450957702928249e-08, + "loss": 0.6039, + "step": 7392 + }, + { + "epoch": 2.8856362217017955, + "grad_norm": 0.4121534005189016, + "learning_rate": 4.420775195039861e-08, + "loss": 0.6012, + "step": 7393 + }, + { + "epoch": 2.8860265417642466, + "grad_norm": 0.41399045781586963, + "learning_rate": 4.3906949168968095e-08, + "loss": 0.5969, + "step": 7394 + }, + { + "epoch": 2.8864168618266977, + "grad_norm": 0.4352474767876685, + "learning_rate": 4.360716874703963e-08, + "loss": 0.5774, + "step": 7395 + }, + { + "epoch": 2.886807181889149, + "grad_norm": 0.42996225174449604, + "learning_rate": 4.3308410746454864e-08, + "loss": 0.5085, + "step": 7396 + }, + { + "epoch": 2.8871975019516003, + "grad_norm": 0.4072320818230402, + "learning_rate": 4.3010675228841726e-08, + "loss": 0.5472, + "step": 7397 + }, + { + "epoch": 2.8875878220140514, + "grad_norm": 0.41264541847373, + "learning_rate": 4.2713962255617744e-08, + "loss": 0.5971, + "step": 7398 + }, + { + "epoch": 2.887978142076503, + "grad_norm": 0.4227336253896172, + "learning_rate": 4.2418271887990637e-08, + "loss": 0.586, + "step": 7399 + }, + { + "epoch": 2.888368462138954, + "grad_norm": 0.41773272953775487, + "learning_rate": 4.212360418695549e-08, + "loss": 0.5511, + "step": 7400 + }, + { + "epoch": 2.888758782201405, + "grad_norm": 0.45205367652060774, + "learning_rate": 4.182995921329869e-08, + "loss": 0.5644, + "step": 7401 + }, + { + "epoch": 2.8891491022638562, + "grad_norm": 0.40958696420355994, + "learning_rate": 4.153733702759288e-08, + "loss": 0.5682, + "step": 7402 + }, + { + "epoch": 2.8895394223263073, + "grad_norm": 0.42116352910822363, + "learning_rate": 4.124573769020146e-08, + "loss": 0.5369, + "step": 7403 + }, + { + "epoch": 2.889929742388759, + "grad_norm": 0.39237391815224376, + "learning_rate": 4.095516126127741e-08, + "loss": 0.5652, + "step": 7404 + }, + { + "epoch": 2.89032006245121, + "grad_norm": 0.3870879919542313, + "learning_rate": 4.066560780076112e-08, + "loss": 0.5658, + "step": 7405 + }, + { + "epoch": 2.890710382513661, + "grad_norm": 0.42478335165780834, + "learning_rate": 4.037707736838203e-08, + "loss": 0.568, + "step": 7406 + }, + { + "epoch": 2.8911007025761126, + "grad_norm": 0.37918261022715416, + "learning_rate": 4.008957002366087e-08, + "loss": 0.5703, + "step": 7407 + }, + { + "epoch": 2.8914910226385637, + "grad_norm": 0.4048203673350868, + "learning_rate": 3.980308582590409e-08, + "loss": 0.5993, + "step": 7408 + }, + { + "epoch": 2.891881342701015, + "grad_norm": 0.40079044744084824, + "learning_rate": 3.95176248342094e-08, + "loss": 0.5488, + "step": 7409 + }, + { + "epoch": 2.892271662763466, + "grad_norm": 0.4711141000176919, + "learning_rate": 3.923318710746249e-08, + "loss": 0.536, + "step": 7410 + }, + { + "epoch": 2.892661982825917, + "grad_norm": 0.4199553464699045, + "learning_rate": 3.894977270433753e-08, + "loss": 0.5605, + "step": 7411 + }, + { + "epoch": 2.8930523028883686, + "grad_norm": 0.4343049234104764, + "learning_rate": 3.866738168329887e-08, + "loss": 0.5793, + "step": 7412 + }, + { + "epoch": 2.8934426229508197, + "grad_norm": 0.45339191177291926, + "learning_rate": 3.838601410259934e-08, + "loss": 0.5884, + "step": 7413 + }, + { + "epoch": 2.8938329430132708, + "grad_norm": 0.3994186614941842, + "learning_rate": 3.810567002027976e-08, + "loss": 0.5886, + "step": 7414 + }, + { + "epoch": 2.8942232630757223, + "grad_norm": 0.4709447863291356, + "learning_rate": 3.782634949417052e-08, + "loss": 0.5559, + "step": 7415 + }, + { + "epoch": 2.8946135831381734, + "grad_norm": 0.39479007959941625, + "learning_rate": 3.754805258189109e-08, + "loss": 0.5619, + "step": 7416 + }, + { + "epoch": 2.8950039032006245, + "grad_norm": 0.4611199414211965, + "learning_rate": 3.7270779340848886e-08, + "loss": 0.5742, + "step": 7417 + }, + { + "epoch": 2.8953942232630756, + "grad_norm": 0.40979112647947236, + "learning_rate": 3.699452982824203e-08, + "loss": 0.609, + "step": 7418 + }, + { + "epoch": 2.8957845433255267, + "grad_norm": 0.44014505812165244, + "learning_rate": 3.6719304101054396e-08, + "loss": 0.5409, + "step": 7419 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.4625992787437147, + "learning_rate": 3.6445102216062235e-08, + "loss": 0.5213, + "step": 7420 + }, + { + "epoch": 2.8965651834504293, + "grad_norm": 0.4137173882006882, + "learning_rate": 3.617192422982752e-08, + "loss": 0.5697, + "step": 7421 + }, + { + "epoch": 2.8969555035128804, + "grad_norm": 0.4446456448716277, + "learning_rate": 3.589977019870294e-08, + "loss": 0.5399, + "step": 7422 + }, + { + "epoch": 2.897345823575332, + "grad_norm": 0.4082444685972132, + "learning_rate": 3.562864017882917e-08, + "loss": 0.5814, + "step": 7423 + }, + { + "epoch": 2.897736143637783, + "grad_norm": 0.45997308229469114, + "learning_rate": 3.5358534226136445e-08, + "loss": 0.542, + "step": 7424 + }, + { + "epoch": 2.898126463700234, + "grad_norm": 0.44866342482329485, + "learning_rate": 3.508945239634187e-08, + "loss": 0.5399, + "step": 7425 + }, + { + "epoch": 2.8985167837626853, + "grad_norm": 0.38446507097941246, + "learning_rate": 3.482139474495383e-08, + "loss": 0.54, + "step": 7426 + }, + { + "epoch": 2.8989071038251364, + "grad_norm": 0.4082347453333291, + "learning_rate": 3.4554361327266974e-08, + "loss": 0.5681, + "step": 7427 + }, + { + "epoch": 2.899297423887588, + "grad_norm": 0.47320166436126504, + "learning_rate": 3.428835219836724e-08, + "loss": 0.5517, + "step": 7428 + }, + { + "epoch": 2.899687743950039, + "grad_norm": 0.4168263388949418, + "learning_rate": 3.4023367413126285e-08, + "loss": 0.553, + "step": 7429 + }, + { + "epoch": 2.90007806401249, + "grad_norm": 0.4580093937396885, + "learning_rate": 3.3759407026207615e-08, + "loss": 0.591, + "step": 7430 + }, + { + "epoch": 2.9004683840749417, + "grad_norm": 0.40706012305133954, + "learning_rate": 3.349647109206044e-08, + "loss": 0.5782, + "step": 7431 + }, + { + "epoch": 2.9008587041373928, + "grad_norm": 0.40009927393273953, + "learning_rate": 3.323455966492528e-08, + "loss": 0.5455, + "step": 7432 + }, + { + "epoch": 2.901249024199844, + "grad_norm": 0.434953926619092, + "learning_rate": 3.2973672798829456e-08, + "loss": 0.5902, + "step": 7433 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.418947680182882, + "learning_rate": 3.2713810547589373e-08, + "loss": 0.5517, + "step": 7434 + }, + { + "epoch": 2.902029664324746, + "grad_norm": 0.43606968747184754, + "learning_rate": 3.245497296481104e-08, + "loss": 0.5917, + "step": 7435 + }, + { + "epoch": 2.9024199843871976, + "grad_norm": 0.41662382811568915, + "learning_rate": 3.219716010388785e-08, + "loss": 0.6034, + "step": 7436 + }, + { + "epoch": 2.9028103044496487, + "grad_norm": 0.4109236996849712, + "learning_rate": 3.194037201800226e-08, + "loss": 0.5637, + "step": 7437 + }, + { + "epoch": 2.9032006245121, + "grad_norm": 0.3985757759831452, + "learning_rate": 3.1684608760125246e-08, + "loss": 0.5447, + "step": 7438 + }, + { + "epoch": 2.9035909445745514, + "grad_norm": 0.396936583996782, + "learning_rate": 3.142987038301737e-08, + "loss": 0.556, + "step": 7439 + }, + { + "epoch": 2.9039812646370025, + "grad_norm": 0.4322392139344802, + "learning_rate": 3.1176156939225486e-08, + "loss": 0.5649, + "step": 7440 + }, + { + "epoch": 2.9043715846994536, + "grad_norm": 0.41021058748968897, + "learning_rate": 3.0923468481087737e-08, + "loss": 0.5747, + "step": 7441 + }, + { + "epoch": 2.9047619047619047, + "grad_norm": 0.46282715021658144, + "learning_rate": 3.0671805060729085e-08, + "loss": 0.5757, + "step": 7442 + }, + { + "epoch": 2.9051522248243558, + "grad_norm": 0.46927513468988497, + "learning_rate": 3.0421166730063565e-08, + "loss": 0.5825, + "step": 7443 + }, + { + "epoch": 2.9055425448868073, + "grad_norm": 0.401879824108979, + "learning_rate": 3.017155354079371e-08, + "loss": 0.5523, + "step": 7444 + }, + { + "epoch": 2.9059328649492584, + "grad_norm": 0.46134658365827985, + "learning_rate": 2.992296554441054e-08, + "loss": 0.5791, + "step": 7445 + }, + { + "epoch": 2.9063231850117095, + "grad_norm": 0.41825865708563437, + "learning_rate": 2.967540279219305e-08, + "loss": 0.5879, + "step": 7446 + }, + { + "epoch": 2.906713505074161, + "grad_norm": 0.4289863269364715, + "learning_rate": 2.942886533521039e-08, + "loss": 0.5163, + "step": 7447 + }, + { + "epoch": 2.907103825136612, + "grad_norm": 0.5106442765471182, + "learning_rate": 2.9183353224318535e-08, + "loss": 0.569, + "step": 7448 + }, + { + "epoch": 2.9074941451990632, + "grad_norm": 0.4509923210383809, + "learning_rate": 2.8938866510162533e-08, + "loss": 0.5825, + "step": 7449 + }, + { + "epoch": 2.9078844652615143, + "grad_norm": 0.414174757196291, + "learning_rate": 2.8695405243175933e-08, + "loss": 0.5845, + "step": 7450 + }, + { + "epoch": 2.9082747853239654, + "grad_norm": 0.4171385194189795, + "learning_rate": 2.8452969473580227e-08, + "loss": 0.5481, + "step": 7451 + }, + { + "epoch": 2.908665105386417, + "grad_norm": 0.394185566440298, + "learning_rate": 2.8211559251387077e-08, + "loss": 0.6074, + "step": 7452 + }, + { + "epoch": 2.909055425448868, + "grad_norm": 0.40452048272401714, + "learning_rate": 2.797117462639498e-08, + "loss": 0.5768, + "step": 7453 + }, + { + "epoch": 2.909445745511319, + "grad_norm": 0.4558093924566635, + "learning_rate": 2.773181564819094e-08, + "loss": 0.5076, + "step": 7454 + }, + { + "epoch": 2.9098360655737707, + "grad_norm": 0.40504790382741146, + "learning_rate": 2.7493482366150457e-08, + "loss": 0.5838, + "step": 7455 + }, + { + "epoch": 2.910226385636222, + "grad_norm": 0.4271243290122523, + "learning_rate": 2.72561748294381e-08, + "loss": 0.5557, + "step": 7456 + }, + { + "epoch": 2.910616705698673, + "grad_norm": 0.39731582431747925, + "learning_rate": 2.7019893087006922e-08, + "loss": 0.562, + "step": 7457 + }, + { + "epoch": 2.911007025761124, + "grad_norm": 0.4147832685534338, + "learning_rate": 2.6784637187597384e-08, + "loss": 0.5486, + "step": 7458 + }, + { + "epoch": 2.911397345823575, + "grad_norm": 0.42618346188654416, + "learning_rate": 2.6550407179738446e-08, + "loss": 0.5802, + "step": 7459 + }, + { + "epoch": 2.9117876658860267, + "grad_norm": 0.39285132087969776, + "learning_rate": 2.6317203111748678e-08, + "loss": 0.6143, + "step": 7460 + }, + { + "epoch": 2.9121779859484778, + "grad_norm": 0.4426523000157984, + "learning_rate": 2.6085025031734046e-08, + "loss": 0.5754, + "step": 7461 + }, + { + "epoch": 2.912568306010929, + "grad_norm": 0.41003957292969423, + "learning_rate": 2.5853872987589012e-08, + "loss": 0.5499, + "step": 7462 + }, + { + "epoch": 2.9129586260733804, + "grad_norm": 0.4443128776766625, + "learning_rate": 2.5623747026995437e-08, + "loss": 0.5867, + "step": 7463 + }, + { + "epoch": 2.9133489461358315, + "grad_norm": 0.41802766115370077, + "learning_rate": 2.5394647197425903e-08, + "loss": 0.5937, + "step": 7464 + }, + { + "epoch": 2.9137392661982826, + "grad_norm": 0.4293724369251687, + "learning_rate": 2.5166573546139272e-08, + "loss": 0.5598, + "step": 7465 + }, + { + "epoch": 2.9141295862607337, + "grad_norm": 0.42908842121458973, + "learning_rate": 2.4939526120183465e-08, + "loss": 0.5197, + "step": 7466 + }, + { + "epoch": 2.914519906323185, + "grad_norm": 0.4104595436399852, + "learning_rate": 2.471350496639435e-08, + "loss": 0.5577, + "step": 7467 + }, + { + "epoch": 2.9149102263856363, + "grad_norm": 0.4636692174213094, + "learning_rate": 2.4488510131396305e-08, + "loss": 0.568, + "step": 7468 + }, + { + "epoch": 2.9153005464480874, + "grad_norm": 0.40868658651971074, + "learning_rate": 2.4264541661602194e-08, + "loss": 0.5572, + "step": 7469 + }, + { + "epoch": 2.9156908665105385, + "grad_norm": 0.4300687799304892, + "learning_rate": 2.4041599603212838e-08, + "loss": 0.5975, + "step": 7470 + }, + { + "epoch": 2.91608118657299, + "grad_norm": 0.4158913460792399, + "learning_rate": 2.3819684002217568e-08, + "loss": 0.5758, + "step": 7471 + }, + { + "epoch": 2.916471506635441, + "grad_norm": 0.4345979520354453, + "learning_rate": 2.3598794904394206e-08, + "loss": 0.5449, + "step": 7472 + }, + { + "epoch": 2.9168618266978923, + "grad_norm": 0.3871853160533512, + "learning_rate": 2.337893235530797e-08, + "loss": 0.5678, + "step": 7473 + }, + { + "epoch": 2.9172521467603434, + "grad_norm": 0.4317144610488142, + "learning_rate": 2.3160096400313135e-08, + "loss": 0.5587, + "step": 7474 + }, + { + "epoch": 2.9176424668227945, + "grad_norm": 0.4359631897538031, + "learning_rate": 2.2942287084551927e-08, + "loss": 0.6035, + "step": 7475 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 0.49530725806127707, + "learning_rate": 2.272550445295396e-08, + "loss": 0.5539, + "step": 7476 + }, + { + "epoch": 2.918423106947697, + "grad_norm": 0.4194143230703045, + "learning_rate": 2.2509748550239574e-08, + "loss": 0.5364, + "step": 7477 + }, + { + "epoch": 2.9188134270101482, + "grad_norm": 0.4039019340927007, + "learning_rate": 2.2295019420914277e-08, + "loss": 0.5915, + "step": 7478 + }, + { + "epoch": 2.9192037470725998, + "grad_norm": 0.39525854741274163, + "learning_rate": 2.2081317109273748e-08, + "loss": 0.561, + "step": 7479 + }, + { + "epoch": 2.919594067135051, + "grad_norm": 0.40501136174766006, + "learning_rate": 2.1868641659400502e-08, + "loss": 0.5707, + "step": 7480 + }, + { + "epoch": 2.919984387197502, + "grad_norm": 0.4225662292129232, + "learning_rate": 2.165699311516667e-08, + "loss": 0.58, + "step": 7481 + }, + { + "epoch": 2.920374707259953, + "grad_norm": 0.4165209197539455, + "learning_rate": 2.144637152023177e-08, + "loss": 0.5712, + "step": 7482 + }, + { + "epoch": 2.920765027322404, + "grad_norm": 0.41334068087760073, + "learning_rate": 2.1236776918042712e-08, + "loss": 0.5655, + "step": 7483 + }, + { + "epoch": 2.9211553473848557, + "grad_norm": 0.436875164424581, + "learning_rate": 2.102820935183658e-08, + "loss": 0.5663, + "step": 7484 + }, + { + "epoch": 2.921545667447307, + "grad_norm": 0.4434049248326744, + "learning_rate": 2.082066886463674e-08, + "loss": 0.5315, + "step": 7485 + }, + { + "epoch": 2.921935987509758, + "grad_norm": 0.39098226126640023, + "learning_rate": 2.061415549925505e-08, + "loss": 0.5872, + "step": 7486 + }, + { + "epoch": 2.9223263075722095, + "grad_norm": 0.4602165558064196, + "learning_rate": 2.0408669298292438e-08, + "loss": 0.5532, + "step": 7487 + }, + { + "epoch": 2.9227166276346606, + "grad_norm": 0.41665054527824585, + "learning_rate": 2.0204210304137216e-08, + "loss": 0.602, + "step": 7488 + }, + { + "epoch": 2.9231069476971117, + "grad_norm": 0.4153937972262796, + "learning_rate": 2.0000778558965096e-08, + "loss": 0.5901, + "step": 7489 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.44587392390131425, + "learning_rate": 1.979837410474139e-08, + "loss": 0.5357, + "step": 7490 + }, + { + "epoch": 2.923887587822014, + "grad_norm": 0.42964022183681466, + "learning_rate": 1.959699698321882e-08, + "loss": 0.5859, + "step": 7491 + }, + { + "epoch": 2.9242779078844654, + "grad_norm": 0.44704897231355356, + "learning_rate": 1.9396647235937482e-08, + "loss": 0.596, + "step": 7492 + }, + { + "epoch": 2.9246682279469165, + "grad_norm": 0.3958842525331377, + "learning_rate": 1.91973249042271e-08, + "loss": 0.586, + "step": 7493 + }, + { + "epoch": 2.9250585480093676, + "grad_norm": 0.42400265389276165, + "learning_rate": 1.8999030029203115e-08, + "loss": 0.5709, + "step": 7494 + }, + { + "epoch": 2.925448868071819, + "grad_norm": 0.4180134523154202, + "learning_rate": 1.8801762651772248e-08, + "loss": 0.5308, + "step": 7495 + }, + { + "epoch": 2.9258391881342702, + "grad_norm": 0.447275588638761, + "learning_rate": 1.8605522812626397e-08, + "loss": 0.5495, + "step": 7496 + }, + { + "epoch": 2.9262295081967213, + "grad_norm": 0.43047653947344305, + "learning_rate": 1.841031055224707e-08, + "loss": 0.6074, + "step": 7497 + }, + { + "epoch": 2.9266198282591724, + "grad_norm": 0.45796944394886735, + "learning_rate": 1.8216125910902606e-08, + "loss": 0.5883, + "step": 7498 + }, + { + "epoch": 2.9270101483216235, + "grad_norm": 0.4188324136575753, + "learning_rate": 1.8022968928650963e-08, + "loss": 0.5657, + "step": 7499 + }, + { + "epoch": 2.927400468384075, + "grad_norm": 0.4262775089422376, + "learning_rate": 1.7830839645336384e-08, + "loss": 0.5582, + "step": 7500 + }, + { + "epoch": 2.927790788446526, + "grad_norm": 0.4289175357852278, + "learning_rate": 1.7639738100592164e-08, + "loss": 0.532, + "step": 7501 + }, + { + "epoch": 2.9281811085089773, + "grad_norm": 0.42384169347065004, + "learning_rate": 1.74496643338401e-08, + "loss": 0.5494, + "step": 7502 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.4309399801422536, + "learning_rate": 1.7260618384288276e-08, + "loss": 0.6077, + "step": 7503 + }, + { + "epoch": 2.92896174863388, + "grad_norm": 0.4465074338517167, + "learning_rate": 1.7072600290933826e-08, + "loss": 0.5477, + "step": 7504 + }, + { + "epoch": 2.929352068696331, + "grad_norm": 0.430759276652429, + "learning_rate": 1.68856100925624e-08, + "loss": 0.5614, + "step": 7505 + }, + { + "epoch": 2.929742388758782, + "grad_norm": 0.39863584082470116, + "learning_rate": 1.6699647827746468e-08, + "loss": 0.5883, + "step": 7506 + }, + { + "epoch": 2.9301327088212332, + "grad_norm": 0.4053070802383408, + "learning_rate": 1.651471353484757e-08, + "loss": 0.6029, + "step": 7507 + }, + { + "epoch": 2.9305230288836848, + "grad_norm": 0.40553912064689945, + "learning_rate": 1.6330807252013524e-08, + "loss": 0.5992, + "step": 7508 + }, + { + "epoch": 2.930913348946136, + "grad_norm": 0.4407170605337565, + "learning_rate": 1.6147929017181763e-08, + "loss": 0.5769, + "step": 7509 + }, + { + "epoch": 2.931303669008587, + "grad_norm": 0.40635114972962544, + "learning_rate": 1.5966078868077107e-08, + "loss": 0.5708, + "step": 7510 + }, + { + "epoch": 2.9316939890710385, + "grad_norm": 0.4150333666466246, + "learning_rate": 1.5785256842211772e-08, + "loss": 0.5603, + "step": 7511 + }, + { + "epoch": 2.9320843091334896, + "grad_norm": 0.42504585937378414, + "learning_rate": 1.560546297688703e-08, + "loss": 0.5951, + "step": 7512 + }, + { + "epoch": 2.9324746291959407, + "grad_norm": 0.45591469563874243, + "learning_rate": 1.5426697309190997e-08, + "loss": 0.5314, + "step": 7513 + }, + { + "epoch": 2.932864949258392, + "grad_norm": 0.44295682438881917, + "learning_rate": 1.5248959875999724e-08, + "loss": 0.5633, + "step": 7514 + }, + { + "epoch": 2.933255269320843, + "grad_norm": 0.4339950044936066, + "learning_rate": 1.5072250713977777e-08, + "loss": 0.5425, + "step": 7515 + }, + { + "epoch": 2.9336455893832944, + "grad_norm": 0.4432142976996306, + "learning_rate": 1.4896569859577658e-08, + "loss": 0.5859, + "step": 7516 + }, + { + "epoch": 2.9340359094457455, + "grad_norm": 0.4185191187546528, + "learning_rate": 1.4721917349038717e-08, + "loss": 0.5898, + "step": 7517 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 0.4237745286704237, + "learning_rate": 1.4548293218389353e-08, + "loss": 0.6055, + "step": 7518 + }, + { + "epoch": 2.934816549570648, + "grad_norm": 0.44450814832847874, + "learning_rate": 1.4375697503444807e-08, + "loss": 0.5423, + "step": 7519 + }, + { + "epoch": 2.9352068696330993, + "grad_norm": 0.44603170590997143, + "learning_rate": 1.4204130239809377e-08, + "loss": 0.5571, + "step": 7520 + }, + { + "epoch": 2.9355971896955504, + "grad_norm": 0.434236025846802, + "learning_rate": 1.4033591462874196e-08, + "loss": 0.5521, + "step": 7521 + }, + { + "epoch": 2.9359875097580015, + "grad_norm": 0.42263419088289206, + "learning_rate": 1.3864081207818902e-08, + "loss": 0.5886, + "step": 7522 + }, + { + "epoch": 2.9363778298204526, + "grad_norm": 0.4371674914709537, + "learning_rate": 1.369559950960997e-08, + "loss": 0.5444, + "step": 7523 + }, + { + "epoch": 2.936768149882904, + "grad_norm": 0.4223213415492271, + "learning_rate": 1.3528146403002928e-08, + "loss": 0.6049, + "step": 7524 + }, + { + "epoch": 2.9371584699453552, + "grad_norm": 0.4299403639085776, + "learning_rate": 1.3361721922540149e-08, + "loss": 0.5648, + "step": 7525 + }, + { + "epoch": 2.9375487900078063, + "grad_norm": 0.4187166644668054, + "learning_rate": 1.3196326102553059e-08, + "loss": 0.6037, + "step": 7526 + }, + { + "epoch": 2.937939110070258, + "grad_norm": 0.4266277695774084, + "learning_rate": 1.303195897715881e-08, + "loss": 0.5823, + "step": 7527 + }, + { + "epoch": 2.938329430132709, + "grad_norm": 0.435228674152875, + "learning_rate": 1.2868620580264724e-08, + "loss": 0.5299, + "step": 7528 + }, + { + "epoch": 2.93871975019516, + "grad_norm": 0.44232575021130477, + "learning_rate": 1.2706310945563849e-08, + "loss": 0.5659, + "step": 7529 + }, + { + "epoch": 2.939110070257611, + "grad_norm": 0.4237986335565691, + "learning_rate": 1.2545030106538847e-08, + "loss": 0.6082, + "step": 7530 + }, + { + "epoch": 2.9395003903200623, + "grad_norm": 0.4162626689912367, + "learning_rate": 1.2384778096458661e-08, + "loss": 0.5518, + "step": 7531 + }, + { + "epoch": 2.939890710382514, + "grad_norm": 0.4301680728792138, + "learning_rate": 1.2225554948381291e-08, + "loss": 0.5387, + "step": 7532 + }, + { + "epoch": 2.940281030444965, + "grad_norm": 0.43729904408795806, + "learning_rate": 1.206736069515102e-08, + "loss": 0.5528, + "step": 7533 + }, + { + "epoch": 2.940671350507416, + "grad_norm": 0.4410784354302666, + "learning_rate": 1.1910195369400635e-08, + "loss": 0.5959, + "step": 7534 + }, + { + "epoch": 2.9410616705698676, + "grad_norm": 0.4434985894547484, + "learning_rate": 1.1754059003551976e-08, + "loss": 0.5511, + "step": 7535 + }, + { + "epoch": 2.9414519906323187, + "grad_norm": 0.44311598883162184, + "learning_rate": 1.159895162981206e-08, + "loss": 0.567, + "step": 7536 + }, + { + "epoch": 2.9418423106947698, + "grad_norm": 0.4100511831704101, + "learning_rate": 1.1444873280176961e-08, + "loss": 0.5708, + "step": 7537 + }, + { + "epoch": 2.942232630757221, + "grad_norm": 0.4394768249318403, + "learning_rate": 1.129182398643125e-08, + "loss": 0.5184, + "step": 7538 + }, + { + "epoch": 2.942622950819672, + "grad_norm": 0.4212385937065646, + "learning_rate": 1.1139803780146341e-08, + "loss": 0.5742, + "step": 7539 + }, + { + "epoch": 2.9430132708821235, + "grad_norm": 0.43883174031965444, + "learning_rate": 1.0988812692681594e-08, + "loss": 0.5639, + "step": 7540 + }, + { + "epoch": 2.9434035909445746, + "grad_norm": 0.43921457091605004, + "learning_rate": 1.0838850755183206e-08, + "loss": 0.5578, + "step": 7541 + }, + { + "epoch": 2.9437939110070257, + "grad_norm": 0.42358589492143367, + "learning_rate": 1.0689917998585874e-08, + "loss": 0.5403, + "step": 7542 + }, + { + "epoch": 2.9441842310694772, + "grad_norm": 0.4169235591617017, + "learning_rate": 1.0542014453612804e-08, + "loss": 0.5706, + "step": 7543 + }, + { + "epoch": 2.9445745511319283, + "grad_norm": 0.4545303462520662, + "learning_rate": 1.0395140150773476e-08, + "loss": 0.5509, + "step": 7544 + }, + { + "epoch": 2.9449648711943794, + "grad_norm": 0.3934901223837361, + "learning_rate": 1.0249295120365876e-08, + "loss": 0.6028, + "step": 7545 + }, + { + "epoch": 2.9453551912568305, + "grad_norm": 0.41591749678524387, + "learning_rate": 1.0104479392474832e-08, + "loss": 0.5937, + "step": 7546 + }, + { + "epoch": 2.9457455113192816, + "grad_norm": 0.4216795416033485, + "learning_rate": 9.96069299697422e-09, + "loss": 0.5814, + "step": 7547 + }, + { + "epoch": 2.946135831381733, + "grad_norm": 0.4329963617774423, + "learning_rate": 9.817935963524205e-09, + "loss": 0.5721, + "step": 7548 + }, + { + "epoch": 2.9465261514441843, + "grad_norm": 0.43546347041146766, + "learning_rate": 9.6762083215729e-09, + "loss": 0.5718, + "step": 7549 + }, + { + "epoch": 2.9469164715066354, + "grad_norm": 0.42341982541847173, + "learning_rate": 9.53551010035747e-09, + "loss": 0.5536, + "step": 7550 + }, + { + "epoch": 2.947306791569087, + "grad_norm": 0.45631149503365154, + "learning_rate": 9.395841328900811e-09, + "loss": 0.5634, + "step": 7551 + }, + { + "epoch": 2.947697111631538, + "grad_norm": 0.4128529849971503, + "learning_rate": 9.257202036014324e-09, + "loss": 0.5608, + "step": 7552 + }, + { + "epoch": 2.948087431693989, + "grad_norm": 0.4300387935480212, + "learning_rate": 9.119592250296794e-09, + "loss": 0.569, + "step": 7553 + }, + { + "epoch": 2.9484777517564402, + "grad_norm": 0.41542426167193325, + "learning_rate": 8.98301200013496e-09, + "loss": 0.5147, + "step": 7554 + }, + { + "epoch": 2.9488680718188913, + "grad_norm": 0.44235757761364425, + "learning_rate": 8.847461313703509e-09, + "loss": 0.5529, + "step": 7555 + }, + { + "epoch": 2.949258391881343, + "grad_norm": 0.38678467638733527, + "learning_rate": 8.712940218964516e-09, + "loss": 0.5662, + "step": 7556 + }, + { + "epoch": 2.949648711943794, + "grad_norm": 0.4434649681170418, + "learning_rate": 8.579448743666341e-09, + "loss": 0.5901, + "step": 7557 + }, + { + "epoch": 2.950039032006245, + "grad_norm": 0.44556500635646556, + "learning_rate": 8.446986915346401e-09, + "loss": 0.5549, + "step": 7558 + }, + { + "epoch": 2.950429352068696, + "grad_norm": 0.39242610234313346, + "learning_rate": 8.315554761330058e-09, + "loss": 0.5552, + "step": 7559 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.4346686598355969, + "learning_rate": 8.185152308728961e-09, + "loss": 0.5656, + "step": 7560 + }, + { + "epoch": 2.951209992193599, + "grad_norm": 0.49271552168770655, + "learning_rate": 8.055779584443258e-09, + "loss": 0.5771, + "step": 7561 + }, + { + "epoch": 2.95160031225605, + "grad_norm": 0.4133799657121429, + "learning_rate": 7.927436615159933e-09, + "loss": 0.5604, + "step": 7562 + }, + { + "epoch": 2.951990632318501, + "grad_norm": 0.45384972143045094, + "learning_rate": 7.800123427354477e-09, + "loss": 0.5793, + "step": 7563 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 0.4277840772632093, + "learning_rate": 7.673840047289216e-09, + "loss": 0.5504, + "step": 7564 + }, + { + "epoch": 2.9527712724434036, + "grad_norm": 0.4788077282651278, + "learning_rate": 7.548586501014976e-09, + "loss": 0.5626, + "step": 7565 + }, + { + "epoch": 2.9531615925058547, + "grad_norm": 0.42464046510563186, + "learning_rate": 7.424362814368313e-09, + "loss": 0.5826, + "step": 7566 + }, + { + "epoch": 2.953551912568306, + "grad_norm": 0.4200737491923108, + "learning_rate": 7.301169012975395e-09, + "loss": 0.5715, + "step": 7567 + }, + { + "epoch": 2.953942232630757, + "grad_norm": 0.45453892830034487, + "learning_rate": 7.179005122248672e-09, + "loss": 0.6226, + "step": 7568 + }, + { + "epoch": 2.9543325526932085, + "grad_norm": 0.44147096110854805, + "learning_rate": 7.057871167388541e-09, + "loss": 0.5703, + "step": 7569 + }, + { + "epoch": 2.9547228727556596, + "grad_norm": 0.4188234684615914, + "learning_rate": 6.937767173382792e-09, + "loss": 0.5632, + "step": 7570 + }, + { + "epoch": 2.9551131928181107, + "grad_norm": 0.43906671827837224, + "learning_rate": 6.818693165007717e-09, + "loss": 0.5653, + "step": 7571 + }, + { + "epoch": 2.9555035128805622, + "grad_norm": 0.42016412421987503, + "learning_rate": 6.70064916682478e-09, + "loss": 0.5523, + "step": 7572 + }, + { + "epoch": 2.9558938329430133, + "grad_norm": 0.4178466756618667, + "learning_rate": 6.583635203186167e-09, + "loss": 0.5671, + "step": 7573 + }, + { + "epoch": 2.9562841530054644, + "grad_norm": 0.4235282405747407, + "learning_rate": 6.467651298228683e-09, + "loss": 0.5789, + "step": 7574 + }, + { + "epoch": 2.9566744730679155, + "grad_norm": 0.3988501697761114, + "learning_rate": 6.352697475878744e-09, + "loss": 0.6287, + "step": 7575 + }, + { + "epoch": 2.9570647931303666, + "grad_norm": 0.41382192074052593, + "learning_rate": 6.238773759848493e-09, + "loss": 0.5635, + "step": 7576 + }, + { + "epoch": 2.957455113192818, + "grad_norm": 0.45831934704998994, + "learning_rate": 6.1258801736391316e-09, + "loss": 0.5688, + "step": 7577 + }, + { + "epoch": 2.9578454332552693, + "grad_norm": 0.46126299317001135, + "learning_rate": 6.014016740538697e-09, + "loss": 0.5516, + "step": 7578 + }, + { + "epoch": 2.9582357533177204, + "grad_norm": 0.4199381901932492, + "learning_rate": 5.903183483622621e-09, + "loss": 0.5903, + "step": 7579 + }, + { + "epoch": 2.958626073380172, + "grad_norm": 0.4429554732122842, + "learning_rate": 5.793380425754281e-09, + "loss": 0.6216, + "step": 7580 + }, + { + "epoch": 2.959016393442623, + "grad_norm": 0.4186352576488851, + "learning_rate": 5.684607589583335e-09, + "loss": 0.5957, + "step": 7581 + }, + { + "epoch": 2.959406713505074, + "grad_norm": 0.42019217444434454, + "learning_rate": 5.576864997548503e-09, + "loss": 0.5865, + "step": 7582 + }, + { + "epoch": 2.959797033567525, + "grad_norm": 0.4066923610254208, + "learning_rate": 5.470152671875339e-09, + "loss": 0.5833, + "step": 7583 + }, + { + "epoch": 2.9601873536299763, + "grad_norm": 0.39499544651070706, + "learning_rate": 5.36447063457679e-09, + "loss": 0.5891, + "step": 7584 + }, + { + "epoch": 2.960577673692428, + "grad_norm": 0.39386650512221577, + "learning_rate": 5.259818907453196e-09, + "loss": 0.5712, + "step": 7585 + }, + { + "epoch": 2.960967993754879, + "grad_norm": 0.40629456523714186, + "learning_rate": 5.156197512092287e-09, + "loss": 0.5282, + "step": 7586 + }, + { + "epoch": 2.96135831381733, + "grad_norm": 0.40123453516062474, + "learning_rate": 5.053606469869743e-09, + "loss": 0.5762, + "step": 7587 + }, + { + "epoch": 2.9617486338797816, + "grad_norm": 0.41904713958355316, + "learning_rate": 4.95204580194808e-09, + "loss": 0.5519, + "step": 7588 + }, + { + "epoch": 2.9621389539422327, + "grad_norm": 0.42415996486481405, + "learning_rate": 4.851515529277206e-09, + "loss": 0.5779, + "step": 7589 + }, + { + "epoch": 2.962529274004684, + "grad_norm": 0.4219810412153848, + "learning_rate": 4.752015672596089e-09, + "loss": 0.5605, + "step": 7590 + }, + { + "epoch": 2.962919594067135, + "grad_norm": 0.43170420759547656, + "learning_rate": 4.6535462524288645e-09, + "loss": 0.5664, + "step": 7591 + }, + { + "epoch": 2.963309914129586, + "grad_norm": 0.4479596464749973, + "learning_rate": 4.556107289088729e-09, + "loss": 0.5538, + "step": 7592 + }, + { + "epoch": 2.9637002341920375, + "grad_norm": 0.4153610288054674, + "learning_rate": 4.459698802675161e-09, + "loss": 0.5546, + "step": 7593 + }, + { + "epoch": 2.9640905542544886, + "grad_norm": 0.42870422853781787, + "learning_rate": 4.36432081307614e-09, + "loss": 0.5963, + "step": 7594 + }, + { + "epoch": 2.9644808743169397, + "grad_norm": 0.42221642091103406, + "learning_rate": 4.269973339965927e-09, + "loss": 0.5424, + "step": 7595 + }, + { + "epoch": 2.9648711943793913, + "grad_norm": 0.4354728636361043, + "learning_rate": 4.1766564028072885e-09, + "loss": 0.5532, + "step": 7596 + }, + { + "epoch": 2.9652615144418424, + "grad_norm": 0.4217234922046335, + "learning_rate": 4.084370020850381e-09, + "loss": 0.561, + "step": 7597 + }, + { + "epoch": 2.9656518345042935, + "grad_norm": 0.4323765762741967, + "learning_rate": 3.993114213131088e-09, + "loss": 0.5206, + "step": 7598 + }, + { + "epoch": 2.9660421545667446, + "grad_norm": 0.4111503289569333, + "learning_rate": 3.902888998475463e-09, + "loss": 0.5486, + "step": 7599 + }, + { + "epoch": 2.9664324746291957, + "grad_norm": 0.45513597204719597, + "learning_rate": 3.8136943954941715e-09, + "loss": 0.576, + "step": 7600 + }, + { + "epoch": 2.9668227946916472, + "grad_norm": 0.4199433653866163, + "learning_rate": 3.725530422587498e-09, + "loss": 0.5665, + "step": 7601 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.4698032916201517, + "learning_rate": 3.6383970979414486e-09, + "loss": 0.6299, + "step": 7602 + }, + { + "epoch": 2.9676034348165494, + "grad_norm": 0.4211269317463055, + "learning_rate": 3.5522944395310898e-09, + "loss": 0.5244, + "step": 7603 + }, + { + "epoch": 2.967993754879001, + "grad_norm": 0.4624403483249224, + "learning_rate": 3.467222465117215e-09, + "loss": 0.6123, + "step": 7604 + }, + { + "epoch": 2.968384074941452, + "grad_norm": 0.39407948440510876, + "learning_rate": 3.3831811922485635e-09, + "loss": 0.5835, + "step": 7605 + }, + { + "epoch": 2.968774395003903, + "grad_norm": 0.4583824775045866, + "learning_rate": 3.3001706382623786e-09, + "loss": 0.5785, + "step": 7606 + }, + { + "epoch": 2.9691647150663543, + "grad_norm": 0.4207605805732423, + "learning_rate": 3.21819082028163e-09, + "loss": 0.5555, + "step": 7607 + }, + { + "epoch": 2.9695550351288054, + "grad_norm": 0.40654772019137314, + "learning_rate": 3.1372417552177903e-09, + "loss": 0.5766, + "step": 7608 + }, + { + "epoch": 2.969945355191257, + "grad_norm": 0.4051762079846988, + "learning_rate": 3.0573234597691683e-09, + "loss": 0.5917, + "step": 7609 + }, + { + "epoch": 2.970335675253708, + "grad_norm": 0.45126062143547163, + "learning_rate": 2.9784359504214653e-09, + "loss": 0.5872, + "step": 7610 + }, + { + "epoch": 2.970725995316159, + "grad_norm": 0.4121511339990584, + "learning_rate": 2.9005792434477763e-09, + "loss": 0.5646, + "step": 7611 + }, + { + "epoch": 2.9711163153786107, + "grad_norm": 0.44468172148445184, + "learning_rate": 2.823753354909142e-09, + "loss": 0.5675, + "step": 7612 + }, + { + "epoch": 2.9715066354410617, + "grad_norm": 0.40029124239358244, + "learning_rate": 2.7479583006523316e-09, + "loss": 0.5791, + "step": 7613 + }, + { + "epoch": 2.971896955503513, + "grad_norm": 0.37536812975112205, + "learning_rate": 2.67319409631428e-09, + "loss": 0.5925, + "step": 7614 + }, + { + "epoch": 2.972287275565964, + "grad_norm": 0.4453968145132808, + "learning_rate": 2.599460757316541e-09, + "loss": 0.5451, + "step": 7615 + }, + { + "epoch": 2.972677595628415, + "grad_norm": 0.39101904941183735, + "learning_rate": 2.5267582988686146e-09, + "loss": 0.604, + "step": 7616 + }, + { + "epoch": 2.9730679156908666, + "grad_norm": 0.40908641533460627, + "learning_rate": 2.4550867359690587e-09, + "loss": 0.5416, + "step": 7617 + }, + { + "epoch": 2.9734582357533177, + "grad_norm": 0.3958405320028504, + "learning_rate": 2.3844460834016035e-09, + "loss": 0.5997, + "step": 7618 + }, + { + "epoch": 2.973848555815769, + "grad_norm": 0.43191428006611404, + "learning_rate": 2.3148363557390365e-09, + "loss": 0.5907, + "step": 7619 + }, + { + "epoch": 2.9742388758782203, + "grad_norm": 0.45202908380455303, + "learning_rate": 2.2462575673398713e-09, + "loss": 0.5462, + "step": 7620 + }, + { + "epoch": 2.9746291959406714, + "grad_norm": 0.39929289310888727, + "learning_rate": 2.1787097323511253e-09, + "loss": 0.5635, + "step": 7621 + }, + { + "epoch": 2.9750195160031225, + "grad_norm": 0.4222290374994417, + "learning_rate": 2.1121928647066526e-09, + "loss": 0.5508, + "step": 7622 + }, + { + "epoch": 2.9754098360655736, + "grad_norm": 0.43918390579547173, + "learning_rate": 2.0467069781276995e-09, + "loss": 0.5437, + "step": 7623 + }, + { + "epoch": 2.9758001561280247, + "grad_norm": 0.4363731516476412, + "learning_rate": 1.9822520861234597e-09, + "loss": 0.5955, + "step": 7624 + }, + { + "epoch": 2.9761904761904763, + "grad_norm": 0.41317860202083756, + "learning_rate": 1.918828201989409e-09, + "loss": 0.5816, + "step": 7625 + }, + { + "epoch": 2.9765807962529274, + "grad_norm": 0.4207142218963185, + "learning_rate": 1.8564353388089707e-09, + "loss": 0.5687, + "step": 7626 + }, + { + "epoch": 2.9769711163153785, + "grad_norm": 0.4045806116741204, + "learning_rate": 1.7950735094524052e-09, + "loss": 0.6067, + "step": 7627 + }, + { + "epoch": 2.97736143637783, + "grad_norm": 0.43855331376302764, + "learning_rate": 1.7347427265784755e-09, + "loss": 0.5611, + "step": 7628 + }, + { + "epoch": 2.977751756440281, + "grad_norm": 0.3939581001590382, + "learning_rate": 1.6754430026316715e-09, + "loss": 0.5574, + "step": 7629 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.41996792527822624, + "learning_rate": 1.6171743498449854e-09, + "loss": 0.5403, + "step": 7630 + }, + { + "epoch": 2.9785323965651833, + "grad_norm": 0.4717335690161948, + "learning_rate": 1.559936780238247e-09, + "loss": 0.557, + "step": 7631 + }, + { + "epoch": 2.9789227166276344, + "grad_norm": 0.40972801038873546, + "learning_rate": 1.5037303056181229e-09, + "loss": 0.5856, + "step": 7632 + }, + { + "epoch": 2.979313036690086, + "grad_norm": 0.4376999895682799, + "learning_rate": 1.4485549375792274e-09, + "loss": 0.5404, + "step": 7633 + }, + { + "epoch": 2.979703356752537, + "grad_norm": 0.4319780886082567, + "learning_rate": 1.3944106875035668e-09, + "loss": 0.565, + "step": 7634 + }, + { + "epoch": 2.980093676814988, + "grad_norm": 0.4781779831813518, + "learning_rate": 1.3412975665605398e-09, + "loss": 0.648, + "step": 7635 + }, + { + "epoch": 2.9804839968774397, + "grad_norm": 0.396144254276104, + "learning_rate": 1.2892155857058275e-09, + "loss": 0.5754, + "step": 7636 + }, + { + "epoch": 2.980874316939891, + "grad_norm": 0.4496971772185042, + "learning_rate": 1.2381647556830578e-09, + "loss": 0.5798, + "step": 7637 + }, + { + "epoch": 2.981264637002342, + "grad_norm": 0.4449848312440753, + "learning_rate": 1.1881450870238066e-09, + "loss": 0.5577, + "step": 7638 + }, + { + "epoch": 2.981654957064793, + "grad_norm": 0.4366804435052152, + "learning_rate": 1.1391565900459312e-09, + "loss": 0.545, + "step": 7639 + }, + { + "epoch": 2.982045277127244, + "grad_norm": 0.4501731290921541, + "learning_rate": 1.0911992748546818e-09, + "loss": 0.5381, + "step": 7640 + }, + { + "epoch": 2.9824355971896956, + "grad_norm": 0.40997098407035815, + "learning_rate": 1.0442731513432558e-09, + "loss": 0.5672, + "step": 7641 + }, + { + "epoch": 2.9828259172521467, + "grad_norm": 0.43611351423029315, + "learning_rate": 9.983782291916878e-10, + "loss": 0.59, + "step": 7642 + }, + { + "epoch": 2.983216237314598, + "grad_norm": 0.4092106320458385, + "learning_rate": 9.535145178668493e-10, + "loss": 0.5765, + "step": 7643 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 0.4477050858216567, + "learning_rate": 9.096820266241147e-10, + "loss": 0.5544, + "step": 7644 + }, + { + "epoch": 2.9839968774395005, + "grad_norm": 0.45673757366097323, + "learning_rate": 8.668807645051402e-10, + "loss": 0.5642, + "step": 7645 + }, + { + "epoch": 2.9843871975019516, + "grad_norm": 0.39397348657540443, + "learning_rate": 8.251107403389746e-10, + "loss": 0.6189, + "step": 7646 + }, + { + "epoch": 2.9847775175644027, + "grad_norm": 0.41228964379766814, + "learning_rate": 7.843719627420587e-10, + "loss": 0.5624, + "step": 7647 + }, + { + "epoch": 2.985167837626854, + "grad_norm": 0.42595461371667054, + "learning_rate": 7.446644401182257e-10, + "loss": 0.5538, + "step": 7648 + }, + { + "epoch": 2.9855581576893053, + "grad_norm": 0.44287229661126215, + "learning_rate": 7.059881806581459e-10, + "loss": 0.5936, + "step": 7649 + }, + { + "epoch": 2.9859484777517564, + "grad_norm": 0.41320274494725573, + "learning_rate": 6.683431923409922e-10, + "loss": 0.5821, + "step": 7650 + }, + { + "epoch": 2.9863387978142075, + "grad_norm": 0.40093999144633663, + "learning_rate": 6.317294829311093e-10, + "loss": 0.5704, + "step": 7651 + }, + { + "epoch": 2.986729117876659, + "grad_norm": 0.42632183066794305, + "learning_rate": 5.961470599818997e-10, + "loss": 0.5788, + "step": 7652 + }, + { + "epoch": 2.98711943793911, + "grad_norm": 0.4347774667904066, + "learning_rate": 5.615959308341578e-10, + "loss": 0.5373, + "step": 7653 + }, + { + "epoch": 2.9875097580015613, + "grad_norm": 0.4470865747919678, + "learning_rate": 5.280761026138503e-10, + "loss": 0.5568, + "step": 7654 + }, + { + "epoch": 2.9879000780640124, + "grad_norm": 0.4409783108642844, + "learning_rate": 4.955875822360013e-10, + "loss": 0.5617, + "step": 7655 + }, + { + "epoch": 2.9882903981264635, + "grad_norm": 0.41017329818516296, + "learning_rate": 4.6413037640302694e-10, + "loss": 0.587, + "step": 7656 + }, + { + "epoch": 2.988680718188915, + "grad_norm": 0.4333665523400695, + "learning_rate": 4.337044916030708e-10, + "loss": 0.569, + "step": 7657 + }, + { + "epoch": 2.989071038251366, + "grad_norm": 0.4099786102522253, + "learning_rate": 4.043099341133339e-10, + "loss": 0.5721, + "step": 7658 + }, + { + "epoch": 2.989461358313817, + "grad_norm": 0.40055930024882364, + "learning_rate": 3.759467099972991e-10, + "loss": 0.5792, + "step": 7659 + }, + { + "epoch": 2.9898516783762688, + "grad_norm": 0.4367914033823992, + "learning_rate": 3.486148251052868e-10, + "loss": 0.5414, + "step": 7660 + }, + { + "epoch": 2.99024199843872, + "grad_norm": 0.4373486358976348, + "learning_rate": 3.223142850761196e-10, + "loss": 0.5381, + "step": 7661 + }, + { + "epoch": 2.990632318501171, + "grad_norm": 0.4082491375675802, + "learning_rate": 2.9704509533490247e-10, + "loss": 0.5809, + "step": 7662 + }, + { + "epoch": 2.991022638563622, + "grad_norm": 0.39781036286842814, + "learning_rate": 2.7280726109357724e-10, + "loss": 0.543, + "step": 7663 + }, + { + "epoch": 2.991412958626073, + "grad_norm": 0.4276588792836837, + "learning_rate": 2.4960078735314364e-10, + "loss": 0.5913, + "step": 7664 + }, + { + "epoch": 2.9918032786885247, + "grad_norm": 0.46399229670804376, + "learning_rate": 2.2742567889977306e-10, + "loss": 0.561, + "step": 7665 + }, + { + "epoch": 2.992193598750976, + "grad_norm": 0.40528387720267733, + "learning_rate": 2.0628194030869465e-10, + "loss": 0.5936, + "step": 7666 + }, + { + "epoch": 2.992583918813427, + "grad_norm": 0.4194416852474867, + "learning_rate": 1.8616957594086437e-10, + "loss": 0.535, + "step": 7667 + }, + { + "epoch": 2.9929742388758784, + "grad_norm": 0.42595782133662785, + "learning_rate": 1.670885899446306e-10, + "loss": 0.5214, + "step": 7668 + }, + { + "epoch": 2.9933645589383295, + "grad_norm": 0.4672931740538015, + "learning_rate": 1.4903898625739933e-10, + "loss": 0.5616, + "step": 7669 + }, + { + "epoch": 2.9937548790007806, + "grad_norm": 0.4383972819445655, + "learning_rate": 1.3202076860174828e-10, + "loss": 0.5913, + "step": 7670 + }, + { + "epoch": 2.9941451990632317, + "grad_norm": 0.41080948545088797, + "learning_rate": 1.1603394048820271e-10, + "loss": 0.5511, + "step": 7671 + }, + { + "epoch": 2.994535519125683, + "grad_norm": 0.4323110960126508, + "learning_rate": 1.010785052146801e-10, + "loss": 0.5528, + "step": 7672 + }, + { + "epoch": 2.9949258391881344, + "grad_norm": 0.4245999523490313, + "learning_rate": 8.715446586649023e-11, + "loss": 0.6109, + "step": 7673 + }, + { + "epoch": 2.9953161592505855, + "grad_norm": 0.43216603219568756, + "learning_rate": 7.426182531578007e-11, + "loss": 0.5324, + "step": 7674 + }, + { + "epoch": 2.9957064793130366, + "grad_norm": 0.4400613783155491, + "learning_rate": 6.240058622153378e-11, + "loss": 0.5997, + "step": 7675 + }, + { + "epoch": 2.996096799375488, + "grad_norm": 0.4119218607830544, + "learning_rate": 5.157075103123799e-11, + "loss": 0.5467, + "step": 7676 + }, + { + "epoch": 2.996487119437939, + "grad_norm": 0.3979070783472036, + "learning_rate": 4.177232197866143e-11, + "loss": 0.5673, + "step": 7677 + }, + { + "epoch": 2.9968774395003903, + "grad_norm": 0.4130985377659059, + "learning_rate": 3.3005301085520245e-11, + "loss": 0.6058, + "step": 7678 + }, + { + "epoch": 2.9972677595628414, + "grad_norm": 0.4351114486579431, + "learning_rate": 2.526969015925751e-11, + "loss": 0.582, + "step": 7679 + }, + { + "epoch": 2.9976580796252925, + "grad_norm": 0.46837999566405086, + "learning_rate": 1.856549079637393e-11, + "loss": 0.5489, + "step": 7680 + }, + { + "epoch": 2.998048399687744, + "grad_norm": 0.41017324395023397, + "learning_rate": 1.2892704380207399e-11, + "loss": 0.5545, + "step": 7681 + }, + { + "epoch": 2.998438719750195, + "grad_norm": 0.4170537922248193, + "learning_rate": 8.25133207982276e-12, + "loss": 0.5694, + "step": 7682 + }, + { + "epoch": 2.9988290398126463, + "grad_norm": 0.40484694511318614, + "learning_rate": 4.641374853342484e-12, + "loss": 0.5739, + "step": 7683 + }, + { + "epoch": 2.999219359875098, + "grad_norm": 0.4369347603972857, + "learning_rate": 2.06283344572622e-12, + "loss": 0.5924, + "step": 7684 + }, + { + "epoch": 2.999609679937549, + "grad_norm": 0.4494100355751339, + "learning_rate": 5.157083882156855e-13, + "loss": 0.5902, + "step": 7685 + }, + { + "epoch": 3.0, + "grad_norm": 0.43636468315602983, + "learning_rate": 0.0, + "loss": 0.5571, + "step": 7686 + }, + { + "epoch": 3.0, + "step": 7686, + "total_flos": 7.336904392936784e+18, + "train_loss": 0.0, + "train_runtime": 5.8987, + "train_samples_per_second": 125080.161, + "train_steps_per_second": 1302.998 + } + ], + "logging_steps": 1, + "max_steps": 7686, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.336904392936784e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}