diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,12009 +1,1195 @@ { + "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, - "global_step": 8564, + "global_step": 733, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0005838393274170948, - "grad_norm": 2.840531349182129, - "learning_rate": 5.827505827505828e-07, - "loss": 1.1342, + "epoch": 0.0068212824010914054, + "grad_norm": 1.8668294015888436, + "learning_rate": 5.405405405405406e-06, + "loss": 0.8522, + "num_tokens": 3759146.0, "step": 5 }, { - "epoch": 0.0011676786548341896, - "grad_norm": 2.684952974319458, - "learning_rate": 1.1655011655011657e-06, - "loss": 1.1191, + "epoch": 0.013642564802182811, + "grad_norm": 1.7247561733323398, + "learning_rate": 1.2162162162162164e-05, + "loss": 0.8047, + "num_tokens": 7668808.0, "step": 10 }, { - "epoch": 0.0017515179822512844, - "grad_norm": 2.490508794784546, - "learning_rate": 1.7482517482517485e-06, - "loss": 1.1026, + "epoch": 0.020463847203274217, + "grad_norm": 0.6872174146885504, + "learning_rate": 1.891891891891892e-05, + "loss": 0.7189, + "num_tokens": 11368873.0, "step": 15 }, { - "epoch": 0.002335357309668379, - "grad_norm": 1.945799708366394, - "learning_rate": 2.3310023310023313e-06, - "loss": 1.0839, + "epoch": 0.027285129604365622, + "grad_norm": 0.5588637904556512, + "learning_rate": 2.5675675675675675e-05, + "loss": 0.6772, + "num_tokens": 15118063.0, "step": 20 }, { - "epoch": 0.002919196637085474, - "grad_norm": 1.3352986574172974, - "learning_rate": 2.9137529137529138e-06, - "loss": 1.0332, + "epoch": 0.034106412005457026, + "grad_norm": 0.48706663235585207, + "learning_rate": 3.2432432432432436e-05, + "loss": 0.6547, + "num_tokens": 18906839.0, "step": 25 }, { - "epoch": 0.003503035964502569, - "grad_norm": 1.3198652267456055, - "learning_rate": 3.496503496503497e-06, - "loss": 1.0141, + "epoch": 0.040927694406548434, + "grad_norm": 0.41592123514427665, + "learning_rate": 3.918918918918919e-05, + "loss": 0.6323, + "num_tokens": 22641755.0, "step": 30 }, { - "epoch": 0.004086875291919664, - "grad_norm": 1.0377503633499146, - "learning_rate": 4.079254079254079e-06, - "loss": 0.9595, + "epoch": 0.047748976807639835, + "grad_norm": 0.3867404332753348, + "learning_rate": 4.594594594594595e-05, + "loss": 0.6278, + "num_tokens": 26636629.0, "step": 35 }, { - "epoch": 0.004670714619336758, - "grad_norm": 0.9846550822257996, - "learning_rate": 4.662004662004663e-06, - "loss": 0.9242, + "epoch": 0.054570259208731244, + "grad_norm": 0.3950136717548598, + "learning_rate": 4.999908316574644e-05, + "loss": 0.5973, + "num_tokens": 30417967.0, "step": 40 }, { - "epoch": 0.005254553946753854, - "grad_norm": 0.8690220713615417, - "learning_rate": 5.244755244755245e-06, - "loss": 0.9067, + "epoch": 0.061391541609822645, + "grad_norm": 0.3834643196131725, + "learning_rate": 4.998876963847189e-05, + "loss": 0.6004, + "num_tokens": 34231333.0, "step": 45 }, { - "epoch": 0.005838393274170948, - "grad_norm": 0.8757138252258301, - "learning_rate": 5.8275058275058275e-06, - "loss": 0.8791, + "epoch": 0.06821282401091405, + "grad_norm": 0.39680969782340475, + "learning_rate": 4.996700181165029e-05, + "loss": 0.6002, + "num_tokens": 37961424.0, "step": 50 }, { - "epoch": 0.006422232601588043, - "grad_norm": 0.8489364385604858, - "learning_rate": 6.41025641025641e-06, - "loss": 0.8935, + "epoch": 0.07503410641200546, + "grad_norm": 0.3896173679223876, + "learning_rate": 4.993379077238036e-05, + "loss": 0.6008, + "num_tokens": 41826860.0, "step": 55 }, { - "epoch": 0.007006071929005138, - "grad_norm": 0.9538592100143433, - "learning_rate": 6.993006993006994e-06, - "loss": 0.8935, + "epoch": 0.08185538881309687, + "grad_norm": 0.4410876468603876, + "learning_rate": 4.9889153436180295e-05, + "loss": 0.5886, + "num_tokens": 45543403.0, "step": 60 }, { - "epoch": 0.007589911256422233, - "grad_norm": 0.8154944777488708, - "learning_rate": 7.5757575757575764e-06, - "loss": 0.8633, + "epoch": 0.08867667121418826, + "grad_norm": 0.4175155873268927, + "learning_rate": 4.983311253837213e-05, + "loss": 0.5925, + "num_tokens": 49369486.0, "step": 65 }, { - "epoch": 0.008173750583839328, - "grad_norm": 0.8280661106109619, - "learning_rate": 8.158508158508159e-06, - "loss": 0.8678, + "epoch": 0.09549795361527967, + "grad_norm": 0.3764686151294453, + "learning_rate": 4.9765696622501846e-05, + "loss": 0.5819, + "num_tokens": 53010874.0, "step": 70 }, { - "epoch": 0.008757589911256422, - "grad_norm": 0.8275220990180969, - "learning_rate": 8.741258741258741e-06, - "loss": 0.8737, + "epoch": 0.10231923601637108, + "grad_norm": 0.4049946590100178, + "learning_rate": 4.968694002580118e-05, + "loss": 0.5841, + "num_tokens": 56909889.0, "step": 75 }, { - "epoch": 0.009341429238673517, - "grad_norm": 0.8111759424209595, - "learning_rate": 9.324009324009325e-06, - "loss": 0.8485, + "epoch": 0.10914051841746249, + "grad_norm": 0.446913495697233, + "learning_rate": 4.959688286169851e-05, + "loss": 0.5677, + "num_tokens": 60650570.0, "step": 80 }, { - "epoch": 0.009925268566090611, - "grad_norm": 0.8404977321624756, - "learning_rate": 9.906759906759908e-06, - "loss": 0.8464, + "epoch": 0.11596180081855388, + "grad_norm": 0.4622857509436459, + "learning_rate": 4.9495570999387685e-05, + "loss": 0.5616, + "num_tokens": 64564660.0, "step": 85 }, { - "epoch": 0.010509107893507707, - "grad_norm": 0.8864277601242065, - "learning_rate": 1.048951048951049e-05, - "loss": 0.8447, + "epoch": 0.12278308321964529, + "grad_norm": 0.4804240442830647, + "learning_rate": 4.9383056040465276e-05, + "loss": 0.5796, + "num_tokens": 68426882.0, "step": 90 }, { - "epoch": 0.011092947220924802, - "grad_norm": 0.8464337587356567, - "learning_rate": 1.1072261072261073e-05, - "loss": 0.8508, + "epoch": 0.1296043656207367, + "grad_norm": 0.35820565417638384, + "learning_rate": 4.925939529264815e-05, + "loss": 0.575, + "num_tokens": 72252819.0, "step": 95 }, { - "epoch": 0.011676786548341896, - "grad_norm": 0.8890232443809509, - "learning_rate": 1.1655011655011655e-05, - "loss": 0.823, + "epoch": 0.1364256480218281, + "grad_norm": 0.38001556309086676, + "learning_rate": 4.9124651740584684e-05, + "loss": 0.5613, + "num_tokens": 76160914.0, "step": 100 }, { - "epoch": 0.01226062587575899, - "grad_norm": 0.8960619568824768, - "learning_rate": 1.2237762237762239e-05, - "loss": 0.835, + "epoch": 0.1432469304229195, + "grad_norm": 0.3927183537086537, + "learning_rate": 4.897889401377447e-05, + "loss": 0.563, + "num_tokens": 80152955.0, "step": 105 }, { - "epoch": 0.012844465203176086, - "grad_norm": 0.8379752039909363, - "learning_rate": 1.282051282051282e-05, - "loss": 0.826, + "epoch": 0.15006821282401092, + "grad_norm": 0.3272034222934781, + "learning_rate": 4.882219635161306e-05, + "loss": 0.5667, + "num_tokens": 83901702.0, "step": 110 }, { - "epoch": 0.01342830453059318, - "grad_norm": 0.8305661082267761, - "learning_rate": 1.3403263403263406e-05, - "loss": 0.8172, + "epoch": 0.15688949522510232, + "grad_norm": 0.41975494955869247, + "learning_rate": 4.865463856557922e-05, + "loss": 0.5655, + "num_tokens": 87691018.0, "step": 115 }, { - "epoch": 0.014012143858010275, - "grad_norm": 0.8608613610267639, - "learning_rate": 1.3986013986013988e-05, - "loss": 0.8254, + "epoch": 0.16371077762619374, + "grad_norm": 0.36316853341182537, + "learning_rate": 4.847630599858426e-05, + "loss": 0.5546, + "num_tokens": 91542428.0, "step": 120 }, { - "epoch": 0.014595983185427371, - "grad_norm": 0.8544851541519165, - "learning_rate": 1.456876456876457e-05, - "loss": 0.8148, + "epoch": 0.17053206002728513, + "grad_norm": 0.34879469561207194, + "learning_rate": 4.8287289481503954e-05, + "loss": 0.5616, + "num_tokens": 95438170.0, "step": 125 }, { - "epoch": 0.015179822512844466, - "grad_norm": 0.9038669466972351, - "learning_rate": 1.5151515151515153e-05, - "loss": 0.8149, + "epoch": 0.17735334242837653, + "grad_norm": 0.4026771690396351, + "learning_rate": 4.8087685286915276e-05, + "loss": 0.5572, + "num_tokens": 99383692.0, "step": 130 }, { - "epoch": 0.01576366184026156, - "grad_norm": 0.9775160551071167, - "learning_rate": 1.5734265734265734e-05, - "loss": 0.8228, + "epoch": 0.18417462482946795, + "grad_norm": 0.3893249591588555, + "learning_rate": 4.787759508006147e-05, + "loss": 0.5568, + "num_tokens": 103223537.0, "step": 135 }, { - "epoch": 0.016347501167678656, - "grad_norm": 0.9204371571540833, - "learning_rate": 1.6317016317016318e-05, - "loss": 0.8002, + "epoch": 0.19099590723055934, + "grad_norm": 0.4315296747644332, + "learning_rate": 4.765712586707048e-05, + "loss": 0.5695, + "num_tokens": 106901687.0, "step": 140 }, { - "epoch": 0.01693134049509575, - "grad_norm": 0.8203734159469604, - "learning_rate": 1.68997668997669e-05, - "loss": 0.8068, + "epoch": 0.19781718963165076, + "grad_norm": 0.39972341830792835, + "learning_rate": 4.7426389940453065e-05, + "loss": 0.5419, + "num_tokens": 110840758.0, "step": 145 }, { - "epoch": 0.017515179822512845, - "grad_norm": 0.9384042024612427, - "learning_rate": 1.7482517482517483e-05, - "loss": 0.795, + "epoch": 0.20463847203274216, + "grad_norm": 0.4054705248088257, + "learning_rate": 4.718550482190837e-05, + "loss": 0.5578, + "num_tokens": 114521504.0, "step": 150 }, { - "epoch": 0.01809901914992994, - "grad_norm": 0.9079834222793579, - "learning_rate": 1.8065268065268067e-05, - "loss": 0.8047, + "epoch": 0.21145975443383355, + "grad_norm": 0.3452594231320418, + "learning_rate": 4.6934593202466127e-05, + "loss": 0.5425, + "num_tokens": 118445759.0, "step": 155 }, { - "epoch": 0.018682858477347034, - "grad_norm": 0.9305838942527771, - "learning_rate": 1.864801864801865e-05, - "loss": 0.8116, + "epoch": 0.21828103683492497, + "grad_norm": 0.3579220823187385, + "learning_rate": 4.6673782879995896e-05, + "loss": 0.5511, + "num_tokens": 122311693.0, "step": 160 }, { - "epoch": 0.01926669780476413, - "grad_norm": 0.9067521691322327, - "learning_rate": 1.923076923076923e-05, - "loss": 0.8122, + "epoch": 0.22510231923601637, + "grad_norm": 0.339431264329542, + "learning_rate": 4.640320669411526e-05, + "loss": 0.554, + "num_tokens": 126094524.0, "step": 165 }, { - "epoch": 0.019850537132181222, - "grad_norm": 0.8529767394065857, - "learning_rate": 1.9813519813519816e-05, - "loss": 0.7835, + "epoch": 0.23192360163710776, + "grad_norm": 0.34913345943773244, + "learning_rate": 4.612300245853004e-05, + "loss": 0.5473, + "num_tokens": 129971056.0, "step": 170 }, { - "epoch": 0.02043437645959832, - "grad_norm": 0.9673607349395752, - "learning_rate": 2.0396270396270396e-05, - "loss": 0.8343, + "epoch": 0.23874488403819918, + "grad_norm": 0.43872271001871566, + "learning_rate": 4.5833312890841085e-05, + "loss": 0.562, + "num_tokens": 133765982.0, "step": 175 }, { - "epoch": 0.021018215787015414, - "grad_norm": 0.9665430188179016, - "learning_rate": 2.097902097902098e-05, - "loss": 0.8132, + "epoch": 0.24556616643929058, + "grad_norm": 0.38820616445153877, + "learning_rate": 4.553428553985329e-05, + "loss": 0.5418, + "num_tokens": 137522281.0, "step": 180 }, { - "epoch": 0.021602055114432507, - "grad_norm": 0.9452393651008606, - "learning_rate": 2.156177156177156e-05, - "loss": 0.8021, + "epoch": 0.252387448840382, + "grad_norm": 0.3850943617783079, + "learning_rate": 4.522607271042399e-05, + "loss": 0.5367, + "num_tokens": 141196084.0, "step": 185 }, { - "epoch": 0.022185894441849603, - "grad_norm": 0.8940615653991699, - "learning_rate": 2.2144522144522145e-05, - "loss": 0.8128, + "epoch": 0.2592087312414734, + "grad_norm": 0.43093540895662497, + "learning_rate": 4.490883138588882e-05, + "loss": 0.548, + "num_tokens": 145136704.0, "step": 190 }, { - "epoch": 0.0227697337692667, - "grad_norm": 0.9634569883346558, - "learning_rate": 2.272727272727273e-05, - "loss": 0.7804, + "epoch": 0.2660300136425648, + "grad_norm": 0.31524529627859765, + "learning_rate": 4.458272314810479e-05, + "loss": 0.5358, + "num_tokens": 148940122.0, "step": 195 }, { - "epoch": 0.023353573096683792, - "grad_norm": 0.9269850850105286, - "learning_rate": 2.331002331002331e-05, - "loss": 0.7917, + "epoch": 0.2728512960436562, + "grad_norm": 0.2904592177300161, + "learning_rate": 4.4247914095151086e-05, + "loss": 0.5457, + "num_tokens": 152809678.0, "step": 200 }, { - "epoch": 0.023937412424100888, - "grad_norm": 0.8946079015731812, - "learning_rate": 2.3892773892773894e-05, - "loss": 0.8004, + "epoch": 0.27967257844474763, + "grad_norm": 0.3751845422333264, + "learning_rate": 4.390457475672966e-05, + "loss": 0.5394, + "num_tokens": 156683573.0, "step": 205 }, { - "epoch": 0.02452125175151798, - "grad_norm": 0.9345911145210266, - "learning_rate": 2.4475524475524478e-05, - "loss": 0.823, + "epoch": 0.286493860845839, + "grad_norm": 0.342531691115598, + "learning_rate": 4.35528800073086e-05, + "loss": 0.5408, + "num_tokens": 160433326.0, "step": 210 }, { - "epoch": 0.025105091078935077, - "grad_norm": 1.0124356746673584, - "learning_rate": 2.505827505827506e-05, - "loss": 0.7731, + "epoch": 0.2933151432469304, + "grad_norm": 0.34186298656605696, + "learning_rate": 4.31930089770526e-05, + "loss": 0.5443, + "num_tokens": 164374316.0, "step": 215 }, { - "epoch": 0.025688930406352173, - "grad_norm": 0.8967856168746948, - "learning_rate": 2.564102564102564e-05, - "loss": 0.7973, + "epoch": 0.30013642564802184, + "grad_norm": 0.34482043458760203, + "learning_rate": 4.282514496058582e-05, + "loss": 0.5238, + "num_tokens": 168223299.0, "step": 220 }, { - "epoch": 0.026272769733769265, - "grad_norm": 0.8347222208976746, - "learning_rate": 2.6223776223776224e-05, - "loss": 0.792, + "epoch": 0.3069577080491132, + "grad_norm": 0.2983303840923811, + "learning_rate": 4.24494753236337e-05, + "loss": 0.5365, + "num_tokens": 172132000.0, "step": 225 }, { - "epoch": 0.02685660906118636, - "grad_norm": 0.8380569815635681, - "learning_rate": 2.680652680652681e-05, - "loss": 0.8035, + "epoch": 0.31377899045020463, + "grad_norm": 0.3212490155633752, + "learning_rate": 4.2066191407591125e-05, + "loss": 0.5321, + "num_tokens": 176086331.0, "step": 230 }, { - "epoch": 0.027440448388603458, - "grad_norm": 0.9121975302696228, - "learning_rate": 2.738927738927739e-05, - "loss": 0.7906, + "epoch": 0.32060027285129605, + "grad_norm": 0.2893257104319784, + "learning_rate": 4.1675488432065785e-05, + "loss": 0.5244, + "num_tokens": 179917640.0, "step": 235 }, { - "epoch": 0.02802428771602055, - "grad_norm": 0.9143264889717102, - "learning_rate": 2.7972027972027976e-05, - "loss": 0.7655, + "epoch": 0.3274215552523875, + "grad_norm": 0.2812328798873313, + "learning_rate": 4.127756539544609e-05, + "loss": 0.5369, + "num_tokens": 183746129.0, "step": 240 }, { - "epoch": 0.028608127043437646, - "grad_norm": 0.9353535771369934, - "learning_rate": 2.8554778554778557e-05, - "loss": 0.7936, + "epoch": 0.33424283765347884, + "grad_norm": 0.36334234919110503, + "learning_rate": 4.087262497354452e-05, + "loss": 0.5454, + "num_tokens": 187699370.0, "step": 245 }, { - "epoch": 0.029191966370854742, - "grad_norm": 0.9576996564865112, - "learning_rate": 2.913752913752914e-05, - "loss": 0.8199, + "epoch": 0.34106412005457026, + "grad_norm": 0.3002376113285202, + "learning_rate": 4.046087341636789e-05, + "loss": 0.5279, + "num_tokens": 191512142.0, "step": 250 }, { - "epoch": 0.029775805698271835, - "grad_norm": 0.914097011089325, - "learning_rate": 2.972027972027972e-05, - "loss": 0.8145, + "epoch": 0.3478854024556617, + "grad_norm": 0.304509691483118, + "learning_rate": 4.0042520443067176e-05, + "loss": 0.5292, + "num_tokens": 195367749.0, "step": 255 }, { - "epoch": 0.03035964502568893, - "grad_norm": 0.9023700952529907, - "learning_rate": 3.0303030303030306e-05, - "loss": 0.8031, + "epoch": 0.35470668485675305, + "grad_norm": 0.31090228382258317, + "learning_rate": 3.961777913512035e-05, + "loss": 0.5182, + "num_tokens": 199215371.0, "step": 260 }, { - "epoch": 0.030943484353106024, - "grad_norm": 0.9031194448471069, - "learning_rate": 3.088578088578088e-05, - "loss": 0.7987, + "epoch": 0.3615279672578445, + "grad_norm": 0.294458994836171, + "learning_rate": 3.9186865827802724e-05, + "loss": 0.5377, + "num_tokens": 202903048.0, "step": 265 }, { - "epoch": 0.03152732368052312, - "grad_norm": 0.9260802865028381, - "learning_rate": 3.146853146853147e-05, - "loss": 0.7811, + "epoch": 0.3683492496589359, + "grad_norm": 0.307801938345619, + "learning_rate": 3.875e-05, + "loss": 0.5265, + "num_tokens": 206761213.0, "step": 270 }, { - "epoch": 0.032111163007940216, - "grad_norm": 0.9897984266281128, - "learning_rate": 3.205128205128206e-05, - "loss": 0.7877, + "epoch": 0.37517053206002726, + "grad_norm": 0.28539346006440874, + "learning_rate": 3.830740416242014e-05, + "loss": 0.5224, + "num_tokens": 210585632.0, "step": 275 }, { - "epoch": 0.03269500233535731, - "grad_norm": 0.9557703733444214, - "learning_rate": 3.2634032634032635e-05, - "loss": 0.789, + "epoch": 0.3819918144611187, + "grad_norm": 0.3027425175055548, + "learning_rate": 3.7859303744261064e-05, + "loss": 0.5282, + "num_tokens": 214261738.0, "step": 280 }, { - "epoch": 0.0332788416627744, - "grad_norm": 1.0350620746612549, - "learning_rate": 3.321678321678322e-05, - "loss": 0.8112, + "epoch": 0.3888130968622101, + "grad_norm": 0.277979061330346, + "learning_rate": 3.740592697839185e-05, + "loss": 0.533, + "num_tokens": 218144024.0, "step": 285 }, { - "epoch": 0.0338626809901915, - "grad_norm": 1.1042369604110718, - "learning_rate": 3.37995337995338e-05, - "loss": 0.7988, + "epoch": 0.3956343792633015, + "grad_norm": 0.2666174933047264, + "learning_rate": 3.694750478510596e-05, + "loss": 0.5285, + "num_tokens": 222057295.0, "step": 290 }, { - "epoch": 0.034446520317608594, - "grad_norm": 0.9341334104537964, - "learning_rate": 3.438228438228439e-05, - "loss": 0.7869, + "epoch": 0.4024556616643929, + "grad_norm": 0.26713129985891826, + "learning_rate": 3.648427065450555e-05, + "loss": 0.5197, + "num_tokens": 225828573.0, "step": 295 }, { - "epoch": 0.03503035964502569, - "grad_norm": 0.9895315766334534, - "learning_rate": 3.4965034965034965e-05, - "loss": 0.781, + "epoch": 0.4092769440654843, + "grad_norm": 0.2837840653374091, + "learning_rate": 3.601646052757707e-05, + "loss": 0.519, + "num_tokens": 229710487.0, "step": 300 }, { - "epoch": 0.035614198972442786, - "grad_norm": 0.9618644714355469, - "learning_rate": 3.554778554778555e-05, - "loss": 0.7899, + "epoch": 0.41609822646657574, + "grad_norm": 0.3141969043403706, + "learning_rate": 3.55443126760184e-05, + "loss": 0.5344, + "num_tokens": 233617525.0, "step": 305 }, { - "epoch": 0.03619803829985988, - "grad_norm": 0.9972008466720581, - "learning_rate": 3.613053613053613e-05, - "loss": 0.8028, + "epoch": 0.4229195088676671, + "grad_norm": 0.3386995775152564, + "learning_rate": 3.506806758087894e-05, + "loss": 0.532, + "num_tokens": 237394471.0, "step": 310 }, { - "epoch": 0.03678187762727697, - "grad_norm": 0.9699879884719849, - "learning_rate": 3.671328671328672e-05, - "loss": 0.7827, + "epoch": 0.4297407912687585, + "grad_norm": 0.3127602717198287, + "learning_rate": 3.458796781007437e-05, + "loss": 0.5267, + "num_tokens": 241114261.0, "step": 315 }, { - "epoch": 0.03736571695469407, - "grad_norm": 1.0316132307052612, - "learning_rate": 3.72960372960373e-05, - "loss": 0.7877, + "epoch": 0.43656207366984995, + "grad_norm": 0.2844786515151641, + "learning_rate": 3.410425789483854e-05, + "loss": 0.527, + "num_tokens": 244967987.0, "step": 320 }, { - "epoch": 0.03794955628211116, - "grad_norm": 1.0885066986083984, - "learning_rate": 3.787878787878788e-05, - "loss": 0.7815, + "epoch": 0.4433833560709413, + "grad_norm": 0.28761389844710167, + "learning_rate": 3.3617184205175304e-05, + "loss": 0.5335, + "num_tokens": 248751095.0, "step": 325 }, { - "epoch": 0.03853339560952826, - "grad_norm": 1.1321635246276855, - "learning_rate": 3.846153846153846e-05, - "loss": 0.7928, + "epoch": 0.45020463847203274, + "grad_norm": 0.3411941605718103, + "learning_rate": 3.312699482437392e-05, + "loss": 0.5207, + "num_tokens": 252607265.0, "step": 330 }, { - "epoch": 0.039117234936945355, - "grad_norm": 1.0205104351043701, - "learning_rate": 3.904428904428905e-05, - "loss": 0.8016, + "epoch": 0.45702592087312416, + "grad_norm": 0.29578624804206416, + "learning_rate": 3.263393942265168e-05, + "loss": 0.5274, + "num_tokens": 256417909.0, "step": 335 }, { - "epoch": 0.039701074264362445, - "grad_norm": 0.9675008058547974, - "learning_rate": 3.962703962703963e-05, - "loss": 0.787, + "epoch": 0.4638472032742155, + "grad_norm": 0.25727525654305566, + "learning_rate": 3.213826912998838e-05, + "loss": 0.5199, + "num_tokens": 260456429.0, "step": 340 }, { - "epoch": 0.04028491359177954, - "grad_norm": 0.943912923336029, - "learning_rate": 4.020979020979021e-05, - "loss": 0.774, + "epoch": 0.47066848567530695, + "grad_norm": 0.26795472413546734, + "learning_rate": 3.164023640821719e-05, + "loss": 0.5132, + "num_tokens": 264287905.0, "step": 345 }, { - "epoch": 0.04086875291919664, - "grad_norm": 1.0533447265625, - "learning_rate": 4.079254079254079e-05, - "loss": 0.7865, + "epoch": 0.47748976807639837, + "grad_norm": 0.2755134604375614, + "learning_rate": 3.114009492243721e-05, + "loss": 0.5216, + "num_tokens": 268098790.0, "step": 350 }, { - "epoch": 0.04145259224661373, - "grad_norm": 0.9215196371078491, - "learning_rate": 4.1375291375291377e-05, - "loss": 0.7944, + "epoch": 0.4843110504774898, + "grad_norm": 0.284559174162564, + "learning_rate": 3.063809941181321e-05, + "loss": 0.5312, + "num_tokens": 271974065.0, "step": 355 }, { - "epoch": 0.04203643157403083, - "grad_norm": 1.101007342338562, - "learning_rate": 4.195804195804196e-05, - "loss": 0.7954, + "epoch": 0.49113233287858116, + "grad_norm": 0.2826715777966556, + "learning_rate": 3.0134505559828203e-05, + "loss": 0.5349, + "num_tokens": 275852045.0, "step": 360 }, { - "epoch": 0.04262027090144792, - "grad_norm": 1.0323628187179565, - "learning_rate": 4.254079254079254e-05, - "loss": 0.7944, + "epoch": 0.4979536152796726, + "grad_norm": 0.28035063155455164, + "learning_rate": 2.9629569864055125e-05, + "loss": 0.5129, + "num_tokens": 279504484.0, "step": 365 }, { - "epoch": 0.043204110228865014, - "grad_norm": 1.0289274454116821, - "learning_rate": 4.312354312354312e-05, - "loss": 0.787, + "epoch": 0.504774897680764, + "grad_norm": 0.2703017488821357, + "learning_rate": 2.9123549505513868e-05, + "loss": 0.515, + "num_tokens": 283461546.0, "step": 370 }, { - "epoch": 0.04378794955628211, - "grad_norm": 0.8957756757736206, - "learning_rate": 4.370629370629371e-05, - "loss": 0.7577, + "epoch": 0.5115961800818554, + "grad_norm": 0.2628579990890393, + "learning_rate": 2.8616702217680134e-05, + "loss": 0.523, + "num_tokens": 287371918.0, "step": 375 }, { - "epoch": 0.044371788883699206, - "grad_norm": 0.9535892009735107, - "learning_rate": 4.428904428904429e-05, - "loss": 0.7754, + "epoch": 0.5184174624829468, + "grad_norm": 0.27329779308736485, + "learning_rate": 2.810928615521303e-05, + "loss": 0.5096, + "num_tokens": 291057738.0, "step": 380 }, { - "epoch": 0.0449556282111163, - "grad_norm": 0.9479396939277649, - "learning_rate": 4.4871794871794874e-05, - "loss": 0.7972, + "epoch": 0.5252387448840382, + "grad_norm": 0.2799338468541636, + "learning_rate": 2.7601559762468022e-05, + "loss": 0.5188, + "num_tokens": 294881963.0, "step": 385 }, { - "epoch": 0.0455394675385334, - "grad_norm": 0.9833574891090393, - "learning_rate": 4.545454545454546e-05, - "loss": 0.7825, + "epoch": 0.5320600272851296, + "grad_norm": 0.27380554160394205, + "learning_rate": 2.7093781641862387e-05, + "loss": 0.5213, + "num_tokens": 298677895.0, "step": 390 }, { - "epoch": 0.04612330686595049, - "grad_norm": 1.1595077514648438, - "learning_rate": 4.603729603729604e-05, - "loss": 0.7793, + "epoch": 0.538881309686221, + "grad_norm": 0.2686509612309849, + "learning_rate": 2.658621042216021e-05, + "loss": 0.5056, + "num_tokens": 302387985.0, "step": 395 }, { - "epoch": 0.046707146193367584, - "grad_norm": 1.058830738067627, - "learning_rate": 4.662004662004662e-05, - "loss": 0.8038, + "epoch": 0.5457025920873124, + "grad_norm": 0.2842485971285363, + "learning_rate": 2.6079104626743845e-05, + "loss": 0.5272, + "num_tokens": 306130593.0, "step": 400 }, { - "epoch": 0.04729098552078468, - "grad_norm": 0.8839348554611206, - "learning_rate": 4.7202797202797204e-05, - "loss": 0.7883, + "epoch": 0.5525238744884038, + "grad_norm": 0.28769823956802704, + "learning_rate": 2.5572722541939113e-05, + "loss": 0.5251, + "num_tokens": 309952008.0, "step": 405 }, { - "epoch": 0.047874824848201776, - "grad_norm": 0.9398172497749329, - "learning_rate": 4.778554778554779e-05, - "loss": 0.8048, + "epoch": 0.5593451568894953, + "grad_norm": 0.3105490408472939, + "learning_rate": 2.5067322085461315e-05, + "loss": 0.5105, + "num_tokens": 313725489.0, "step": 410 }, { - "epoch": 0.04845866417561887, - "grad_norm": 1.0154868364334106, - "learning_rate": 4.836829836829837e-05, - "loss": 0.7791, + "epoch": 0.5661664392905866, + "grad_norm": 0.3152229637314721, + "learning_rate": 2.4563160675048846e-05, + "loss": 0.5156, + "num_tokens": 317525148.0, "step": 415 }, { - "epoch": 0.04904250350303596, - "grad_norm": 1.0271708965301514, - "learning_rate": 4.8951048951048956e-05, - "loss": 0.7716, + "epoch": 0.572987721691678, + "grad_norm": 0.27095198422631994, + "learning_rate": 2.406049509735156e-05, + "loss": 0.5154, + "num_tokens": 321410101.0, "step": 420 }, { - "epoch": 0.04962634283045306, - "grad_norm": 1.0137770175933838, - "learning_rate": 4.9533799533799534e-05, - "loss": 0.8045, + "epoch": 0.5798090040927695, + "grad_norm": 0.28829256667797254, + "learning_rate": 2.355958137714056e-05, + "loss": 0.5108, + "num_tokens": 325102278.0, "step": 425 }, { - "epoch": 0.050210182157870153, - "grad_norm": 0.9736084938049316, - "learning_rate": 4.999999832221176e-05, - "loss": 0.7783, + "epoch": 0.5866302864938608, + "grad_norm": 0.270087706114446, + "learning_rate": 2.3060674646906004e-05, + "loss": 0.5155, + "num_tokens": 328831071.0, "step": 430 }, { - "epoch": 0.05079402148528725, - "grad_norm": 0.9527460932731628, - "learning_rate": 4.999993959964937e-05, - "loss": 0.7776, + "epoch": 0.5934515688949522, + "grad_norm": 0.3056808295329074, + "learning_rate": 2.2564029016909416e-05, + "loss": 0.5049, + "num_tokens": 332767044.0, "step": 435 }, { - "epoch": 0.051377860812704346, - "grad_norm": 0.8859015703201294, - "learning_rate": 4.999979698792484e-05, - "loss": 0.7945, + "epoch": 0.6002728512960437, + "grad_norm": 0.2692565018339337, + "learning_rate": 2.2069897445756627e-05, + "loss": 0.5027, + "num_tokens": 336595638.0, "step": 440 }, { - "epoch": 0.05196170014012144, - "grad_norm": 1.1495444774627686, - "learning_rate": 4.999957048756989e-05, - "loss": 0.7934, + "epoch": 0.607094133697135, + "grad_norm": 0.26184698283909275, + "learning_rate": 2.1578531611557322e-05, + "loss": 0.5157, + "num_tokens": 340358925.0, "step": 445 }, { - "epoch": 0.05254553946753853, - "grad_norm": 0.9109179973602295, - "learning_rate": 4.999926009942899e-05, - "loss": 0.7866, + "epoch": 0.6139154160982264, + "grad_norm": 0.23620479720611628, + "learning_rate": 2.109018178373675e-05, + "loss": 0.5146, + "num_tokens": 344239058.0, "step": 450 }, { - "epoch": 0.05312937879495563, - "grad_norm": 0.9132571220397949, - "learning_rate": 4.999886582465941e-05, - "loss": 0.8036, + "epoch": 0.6207366984993179, + "grad_norm": 0.25416857859214803, + "learning_rate": 2.0605096695564973e-05, + "loss": 0.5181, + "num_tokens": 348080585.0, "step": 455 }, { - "epoch": 0.05371321812237272, - "grad_norm": 0.8982451558113098, - "learning_rate": 4.999838766473116e-05, - "loss": 0.7842, + "epoch": 0.6275579809004093, + "grad_norm": 0.24514832467763423, + "learning_rate": 2.0123523417468466e-05, + "loss": 0.5114, + "num_tokens": 351817695.0, "step": 460 }, { - "epoch": 0.05429705744978982, - "grad_norm": 0.837429404258728, - "learning_rate": 4.999782562142702e-05, - "loss": 0.7621, + "epoch": 0.6343792633015006, + "grad_norm": 0.24280253507668187, + "learning_rate": 1.9645707231188742e-05, + "loss": 0.5058, + "num_tokens": 355639183.0, "step": 465 }, { - "epoch": 0.054880896777206915, - "grad_norm": 0.9841635823249817, - "learning_rate": 4.999717969684254e-05, - "loss": 0.7934, + "epoch": 0.6412005457025921, + "grad_norm": 0.23500875387840528, + "learning_rate": 1.9171891504851925e-05, + "loss": 0.5244, + "num_tokens": 359437581.0, "step": 470 }, { - "epoch": 0.055464736104624004, - "grad_norm": 0.9334811568260193, - "learning_rate": 4.999644989338598e-05, - "loss": 0.8056, + "epoch": 0.6480218281036835, + "grad_norm": 0.2527982888228135, + "learning_rate": 1.8702317569013094e-05, + "loss": 0.5005, + "num_tokens": 363189983.0, "step": 475 }, { - "epoch": 0.0560485754320411, - "grad_norm": 0.9606655240058899, - "learning_rate": 4.9995636213778354e-05, - "loss": 0.77, + "epoch": 0.654843110504775, + "grad_norm": 0.22972404549026726, + "learning_rate": 1.8237224593738327e-05, + "loss": 0.5026, + "num_tokens": 366863209.0, "step": 480 }, { - "epoch": 0.0566324147594582, - "grad_norm": 0.8437446355819702, - "learning_rate": 4.99947386610534e-05, - "loss": 0.7745, + "epoch": 0.6616643929058663, + "grad_norm": 0.2257095506043247, + "learning_rate": 1.7776849466787223e-05, + "loss": 0.517, + "num_tokens": 370725860.0, "step": 485 }, { - "epoch": 0.05721625408687529, - "grad_norm": 0.9755055904388428, - "learning_rate": 4.9993757238557564e-05, - "loss": 0.7895, + "epoch": 0.6684856753069577, + "grad_norm": 0.23299272252976405, + "learning_rate": 1.7321426672957896e-05, + "loss": 0.5025, + "num_tokens": 374566515.0, "step": 490 }, { - "epoch": 0.05780009341429239, - "grad_norm": 0.9168041944503784, - "learning_rate": 4.999269194995001e-05, - "loss": 0.7553, + "epoch": 0.6753069577080492, + "grad_norm": 0.2553261303731692, + "learning_rate": 1.6871188174655787e-05, + "loss": 0.4957, + "num_tokens": 378330489.0, "step": 495 }, { - "epoch": 0.058383932741709485, - "grad_norm": 0.9619467854499817, - "learning_rate": 4.9991542799202574e-05, - "loss": 0.7664, + "epoch": 0.6821282401091405, + "grad_norm": 0.2350930551306658, + "learning_rate": 1.6426363293747334e-05, + "loss": 0.5, + "num_tokens": 382103468.0, "step": 500 }, { - "epoch": 0.058967772069126574, - "grad_norm": 0.8969050645828247, - "learning_rate": 4.999030979059977e-05, - "loss": 0.7882, + "epoch": 0.6889495225102319, + "grad_norm": 0.24017067646649592, + "learning_rate": 1.598717859475846e-05, + "loss": 0.5086, + "num_tokens": 385837297.0, "step": 505 }, { - "epoch": 0.05955161139654367, - "grad_norm": 0.9399372339248657, - "learning_rate": 4.998899292873876e-05, - "loss": 0.7919, + "epoch": 0.6957708049113234, + "grad_norm": 0.24082986222057906, + "learning_rate": 1.5553857769477553e-05, + "loss": 0.5055, + "num_tokens": 389586680.0, "step": 510 }, { - "epoch": 0.060135450723960766, - "grad_norm": 0.86275315284729, - "learning_rate": 4.9987592218529364e-05, - "loss": 0.7782, + "epoch": 0.7025920873124147, + "grad_norm": 0.22863149957641887, + "learning_rate": 1.5126621523021518e-05, + "loss": 0.51, + "num_tokens": 393428760.0, "step": 515 }, { - "epoch": 0.06071929005137786, - "grad_norm": 0.8463552594184875, - "learning_rate": 4.998610766519401e-05, - "loss": 0.7742, + "epoch": 0.7094133697135061, + "grad_norm": 0.2622703448542487, + "learning_rate": 1.4705687461423209e-05, + "loss": 0.5221, + "num_tokens": 397158700.0, "step": 520 }, { - "epoch": 0.06130312937879496, - "grad_norm": 0.8989056348800659, - "learning_rate": 4.9984539274267726e-05, - "loss": 0.7666, + "epoch": 0.7162346521145976, + "grad_norm": 0.24484237745692367, + "learning_rate": 1.4291269980797139e-05, + "loss": 0.5064, + "num_tokens": 400923938.0, "step": 525 }, { - "epoch": 0.06188696870621205, - "grad_norm": 0.8802539110183716, - "learning_rate": 4.998288705159815e-05, - "loss": 0.7966, + "epoch": 0.723055934515689, + "grad_norm": 0.22718569377185907, + "learning_rate": 1.3883580158140291e-05, + "loss": 0.5004, + "num_tokens": 404685655.0, "step": 530 }, { - "epoch": 0.062470808033629144, - "grad_norm": 0.9165039658546448, - "learning_rate": 4.9981151003345436e-05, - "loss": 0.7704, + "epoch": 0.7298772169167803, + "grad_norm": 0.21525244113290895, + "learning_rate": 1.3482825643823293e-05, + "loss": 0.5061, + "num_tokens": 408582799.0, "step": 535 }, { - "epoch": 0.06305464736104624, - "grad_norm": 1.0163360834121704, - "learning_rate": 4.99793311359823e-05, - "loss": 0.7948, + "epoch": 0.7366984993178718, + "grad_norm": 0.22015694564703406, + "learning_rate": 1.3089210555827086e-05, + "loss": 0.5118, + "num_tokens": 412386009.0, "step": 540 }, { - "epoch": 0.06363848668846334, - "grad_norm": 0.8688467144966125, - "learning_rate": 4.997742745629397e-05, - "loss": 0.7895, + "epoch": 0.7435197817189632, + "grad_norm": 0.21495348315883733, + "learning_rate": 1.270293537577855e-05, + "loss": 0.5148, + "num_tokens": 416372039.0, "step": 545 }, { - "epoch": 0.06422232601588043, - "grad_norm": 0.8428617119789124, - "learning_rate": 4.997543997137816e-05, - "loss": 0.7722, + "epoch": 0.7503410641200545, + "grad_norm": 0.21838158349042594, + "learning_rate": 1.232419684683844e-05, + "loss": 0.4994, + "num_tokens": 420051975.0, "step": 550 }, { - "epoch": 0.06480616534329753, - "grad_norm": 0.9370608329772949, - "learning_rate": 4.9973368688645034e-05, - "loss": 0.7885, + "epoch": 0.757162346521146, + "grad_norm": 0.2258119025998525, + "learning_rate": 1.1953187873493303e-05, + "loss": 0.5, + "num_tokens": 423685709.0, "step": 555 }, { - "epoch": 0.06539000467071462, - "grad_norm": 0.833179771900177, - "learning_rate": 4.997121361581721e-05, - "loss": 0.7685, + "epoch": 0.7639836289222374, + "grad_norm": 0.22849400778538187, + "learning_rate": 1.1590097423302684e-05, + "loss": 0.4959, + "num_tokens": 427405904.0, "step": 560 }, { - "epoch": 0.06597384399813172, - "grad_norm": 0.8754486441612244, - "learning_rate": 4.9968974760929694e-05, - "loss": 0.7599, + "epoch": 0.7708049113233287, + "grad_norm": 0.21958667856965727, + "learning_rate": 1.1235110430651421e-05, + "loss": 0.496, + "num_tokens": 431288378.0, "step": 565 }, { - "epoch": 0.0665576833255488, - "grad_norm": 0.8196648955345154, - "learning_rate": 4.996665213232987e-05, - "loss": 0.7388, + "epoch": 0.7776261937244202, + "grad_norm": 0.23913373520191775, + "learning_rate": 1.0888407702556284e-05, + "loss": 0.4998, + "num_tokens": 435077995.0, "step": 570 }, { - "epoch": 0.0671415226529659, - "grad_norm": 0.8970746397972107, - "learning_rate": 4.9964245738677465e-05, - "loss": 0.7709, + "epoch": 0.7844474761255116, + "grad_norm": 0.2417353603062665, + "learning_rate": 1.0550165826574766e-05, + "loss": 0.4996, + "num_tokens": 439006864.0, "step": 575 }, { - "epoch": 0.067725361980383, - "grad_norm": 0.83652263879776, - "learning_rate": 4.996175558894452e-05, - "loss": 0.7636, + "epoch": 0.791268758526603, + "grad_norm": 0.23732321207266466, + "learning_rate": 1.0220557080862985e-05, + "loss": 0.5149, + "num_tokens": 443045688.0, "step": 580 }, { - "epoch": 0.06830920130780009, - "grad_norm": 0.8856996297836304, - "learning_rate": 4.9959181692415345e-05, - "loss": 0.7873, + "epoch": 0.7980900409276944, + "grad_norm": 0.21571934880634502, + "learning_rate": 9.899749346428556e-06, + "loss": 0.502, + "num_tokens": 446852018.0, "step": 585 }, { - "epoch": 0.06889304063521719, - "grad_norm": 0.9262441992759705, - "learning_rate": 4.995652405868652e-05, - "loss": 0.7707, + "epoch": 0.8049113233287858, + "grad_norm": 0.21078457513071117, + "learning_rate": 9.587906021623016e-06, + "loss": 0.5161, + "num_tokens": 450687287.0, "step": 590 }, { - "epoch": 0.06947687996263428, - "grad_norm": 0.8420197367668152, - "learning_rate": 4.99537826976668e-05, - "loss": 0.769, + "epoch": 0.8117326057298773, + "grad_norm": 0.23441605543875263, + "learning_rate": 9.28518593891749e-06, + "loss": 0.5013, + "num_tokens": 454483896.0, "step": 595 }, { - "epoch": 0.07006071929005138, - "grad_norm": 0.9980587959289551, - "learning_rate": 4.9950957619577115e-05, - "loss": 0.7912, + "epoch": 0.8185538881309686, + "grad_norm": 0.22384719028240002, + "learning_rate": 8.99174328400385e-06, + "loss": 0.4997, + "num_tokens": 458325861.0, "step": 600 }, { - "epoch": 0.07064455861746848, - "grad_norm": 0.8674629330635071, - "learning_rate": 4.9948048834950546e-05, - "loss": 0.7861, + "epoch": 0.82537517053206, + "grad_norm": 0.22941845619820603, + "learning_rate": 8.707727517262697e-06, + "loss": 0.5049, + "num_tokens": 462055868.0, "step": 605 }, { - "epoch": 0.07122839794488557, - "grad_norm": 0.8844972848892212, - "learning_rate": 4.9945056354632255e-05, - "loss": 0.7791, + "epoch": 0.8321964529331515, + "grad_norm": 0.2138256278956969, + "learning_rate": 8.433283297638053e-06, + "loss": 0.4992, + "num_tokens": 465973876.0, "step": 610 }, { - "epoch": 0.07181223727230267, - "grad_norm": 0.9377912282943726, - "learning_rate": 4.994198018977945e-05, - "loss": 0.7774, + "epoch": 0.8390177353342428, + "grad_norm": 0.2181701942086996, + "learning_rate": 8.168550408957632e-06, + "loss": 0.4969, + "num_tokens": 469791498.0, "step": 615 }, { - "epoch": 0.07239607659971976, - "grad_norm": 0.819534957408905, - "learning_rate": 4.993882035186136e-05, - "loss": 0.773, + "epoch": 0.8458390177353342, + "grad_norm": 0.23101261117117852, + "learning_rate": 7.91366368873613e-06, + "loss": 0.4944, + "num_tokens": 473570581.0, "step": 620 }, { - "epoch": 0.07297991592713685, - "grad_norm": 0.8794915080070496, - "learning_rate": 4.9935576852659175e-05, - "loss": 0.7641, + "epoch": 0.8526603001364257, + "grad_norm": 0.2269150903338963, + "learning_rate": 7.66875295949791e-06, + "loss": 0.5104, + "num_tokens": 477401353.0, "step": 625 }, { - "epoch": 0.07356375525455394, - "grad_norm": 0.7787351012229919, - "learning_rate": 4.993224970426603e-05, - "loss": 0.7522, + "epoch": 0.859481582537517, + "grad_norm": 0.20575953421081028, + "learning_rate": 7.4339429626539e-06, + "loss": 0.5102, + "num_tokens": 481348892.0, "step": 630 }, { - "epoch": 0.07414759458197104, - "grad_norm": 0.8413527607917786, - "learning_rate": 4.99288389190869e-05, - "loss": 0.7685, + "epoch": 0.8663028649386084, + "grad_norm": 0.23001645269035356, + "learning_rate": 7.2093532949665715e-06, + "loss": 0.508, + "num_tokens": 485171910.0, "step": 635 }, { - "epoch": 0.07473143390938813, - "grad_norm": 0.8053303956985474, - "learning_rate": 4.992534450983864e-05, - "loss": 0.782, + "epoch": 0.8731241473396999, + "grad_norm": 0.20665497327246388, + "learning_rate": 6.995098347635173e-06, + "loss": 0.4933, + "num_tokens": 489059548.0, "step": 640 }, { - "epoch": 0.07531527323680523, - "grad_norm": 0.9114001989364624, - "learning_rate": 4.9921766489549835e-05, - "loss": 0.7806, + "epoch": 0.8799454297407913, + "grad_norm": 0.2107190957471263, + "learning_rate": 6.791287248032431e-06, + "loss": 0.4966, + "num_tokens": 492893029.0, "step": 645 }, { - "epoch": 0.07589911256422233, - "grad_norm": 0.8040638566017151, - "learning_rate": 4.991810487156087e-05, - "loss": 0.7623, + "epoch": 0.8867667121418826, + "grad_norm": 0.21555765697326154, + "learning_rate": 6.598023804122194e-06, + "loss": 0.5038, + "num_tokens": 496905674.0, "step": 650 }, { - "epoch": 0.07648295189163942, - "grad_norm": 0.87030029296875, - "learning_rate": 4.991435966952376e-05, - "loss": 0.7848, + "epoch": 0.8935879945429741, + "grad_norm": 0.20368978219391987, + "learning_rate": 6.415406451586528e-06, + "loss": 0.4993, + "num_tokens": 500864542.0, "step": 655 }, { - "epoch": 0.07706679121905652, - "grad_norm": 0.9315908551216125, - "learning_rate": 4.991053089740219e-05, - "loss": 0.7707, + "epoch": 0.9004092769440655, + "grad_norm": 0.20685793759893298, + "learning_rate": 6.243528203689025e-06, + "loss": 0.5032, + "num_tokens": 504810366.0, "step": 660 }, { - "epoch": 0.07765063054647361, - "grad_norm": 1.1054643392562866, - "learning_rate": 4.990661856947142e-05, - "loss": 0.7724, + "epoch": 0.9072305593451568, + "grad_norm": 0.2171382169840262, + "learning_rate": 6.0824766039e-06, + "loss": 0.4993, + "num_tokens": 508607232.0, "step": 665 }, { - "epoch": 0.07823446987389071, - "grad_norm": 1.0932615995407104, - "learning_rate": 4.990262270031824e-05, - "loss": 0.7469, + "epoch": 0.9140518417462483, + "grad_norm": 0.20991074132493198, + "learning_rate": 5.932333681307571e-06, + "loss": 0.5061, + "num_tokens": 512474084.0, "step": 670 }, { - "epoch": 0.0788183092013078, - "grad_norm": 0.9709866046905518, - "learning_rate": 4.989854330484092e-05, - "loss": 0.7901, + "epoch": 0.9208731241473397, + "grad_norm": 0.20428584534232572, + "learning_rate": 5.793175908837471e-06, + "loss": 0.4967, + "num_tokens": 516216104.0, "step": 675 }, { - "epoch": 0.07940214852872489, - "grad_norm": 1.0294532775878906, - "learning_rate": 4.9894380398249135e-05, - "loss": 0.7836, + "epoch": 0.927694406548431, + "grad_norm": 0.2064710377018979, + "learning_rate": 5.665074164302742e-06, + "loss": 0.5064, + "num_tokens": 519966345.0, "step": 680 }, { - "epoch": 0.07998598785614199, - "grad_norm": 0.9290727376937866, - "learning_rate": 4.989013399606396e-05, - "loss": 0.7564, + "epoch": 0.9345156889495225, + "grad_norm": 0.19447312445035328, + "learning_rate": 5.548093694303275e-06, + "loss": 0.4918, + "num_tokens": 523793837.0, "step": 685 }, { - "epoch": 0.08056982718355908, - "grad_norm": 0.8817949891090393, - "learning_rate": 4.988580411411774e-05, - "loss": 0.7553, + "epoch": 0.9413369713506139, + "grad_norm": 0.2192489932913614, + "learning_rate": 5.442294080993446e-06, + "loss": 0.5059, + "num_tokens": 527666864.0, "step": 690 }, { - "epoch": 0.08115366651097618, - "grad_norm": 0.826866626739502, - "learning_rate": 4.988139076855408e-05, - "loss": 0.7787, + "epoch": 0.9481582537517054, + "grad_norm": 0.20348081993445674, + "learning_rate": 5.347729211734919e-06, + "loss": 0.5033, + "num_tokens": 531466359.0, "step": 695 }, { - "epoch": 0.08173750583839327, - "grad_norm": 1.0217844247817993, - "learning_rate": 4.9876893975827774e-05, - "loss": 0.7668, + "epoch": 0.9549795361527967, + "grad_norm": 0.20065081837919563, + "learning_rate": 5.264447251649954e-06, + "loss": 0.5057, + "num_tokens": 535253217.0, "step": 700 }, { - "epoch": 0.08232134516581037, - "grad_norm": 1.0376311540603638, - "learning_rate": 4.987231375270475e-05, - "loss": 0.7849, + "epoch": 0.9618008185538881, + "grad_norm": 0.20438831920445003, + "learning_rate": 5.192490619089267e-06, + "loss": 0.4899, + "num_tokens": 539137436.0, "step": 705 }, { - "epoch": 0.08290518449322747, - "grad_norm": 0.9647888541221619, - "learning_rate": 4.9867650116261994e-05, - "loss": 0.7495, + "epoch": 0.9686221009549796, + "grad_norm": 0.20651528762536564, + "learning_rate": 5.1318959640269095e-06, + "loss": 0.5004, + "num_tokens": 542824098.0, "step": 710 }, { - "epoch": 0.08348902382064456, - "grad_norm": 0.9187455773353577, - "learning_rate": 4.986290308388747e-05, - "loss": 0.7662, + "epoch": 0.975443383356071, + "grad_norm": 0.21211108490688485, + "learning_rate": 5.082694149393189e-06, + "loss": 0.5113, + "num_tokens": 546578047.0, "step": 715 }, { - "epoch": 0.08407286314806166, - "grad_norm": 0.8127819895744324, - "learning_rate": 4.98580726732801e-05, - "loss": 0.7911, + "epoch": 0.9822646657571623, + "grad_norm": 0.20432801642340406, + "learning_rate": 5.044910235355121e-06, + "loss": 0.4974, + "num_tokens": 550377811.0, "step": 720 }, { - "epoch": 0.08465670247547875, - "grad_norm": 0.9004315733909607, - "learning_rate": 4.985315890244969e-05, - "loss": 0.7485, + "epoch": 0.9890859481582538, + "grad_norm": 0.19342130410850406, + "learning_rate": 5.0185634665524255e-06, + "loss": 0.4933, + "num_tokens": 554224024.0, "step": 725 }, { - "epoch": 0.08524054180289584, - "grad_norm": 0.8265429735183716, - "learning_rate": 4.9848161789716804e-05, - "loss": 0.7881, + "epoch": 0.9959072305593452, + "grad_norm": 0.19321818042549108, + "learning_rate": 5.003667262295572e-06, + "loss": 0.5014, + "num_tokens": 558001366.0, "step": 730 }, - { - "epoch": 0.08582438113031293, - "grad_norm": 0.9231536984443665, - "learning_rate": 4.9843081353712765e-05, - "loss": 0.7812, - "step": 735 - }, - { - "epoch": 0.08640822045773003, - "grad_norm": 0.8408137559890747, - "learning_rate": 4.983791761337958e-05, - "loss": 0.751, - "step": 740 - }, - { - "epoch": 0.08699205978514712, - "grad_norm": 1.0428320169448853, - "learning_rate": 4.9832670587969804e-05, - "loss": 0.7695, - "step": 745 - }, - { - "epoch": 0.08757589911256422, - "grad_norm": 0.9854226112365723, - "learning_rate": 4.9827340297046546e-05, - "loss": 0.745, - "step": 750 - }, - { - "epoch": 0.08815973843998132, - "grad_norm": 0.9338594675064087, - "learning_rate": 4.9821926760483354e-05, - "loss": 0.771, - "step": 755 - }, - { - "epoch": 0.08874357776739841, - "grad_norm": 0.9649643898010254, - "learning_rate": 4.9816429998464155e-05, - "loss": 0.7657, - "step": 760 - }, - { - "epoch": 0.08932741709481551, - "grad_norm": 0.8294287919998169, - "learning_rate": 4.9810850031483155e-05, - "loss": 0.7486, - "step": 765 - }, - { - "epoch": 0.0899112564222326, - "grad_norm": 1.0772855281829834, - "learning_rate": 4.9805186880344826e-05, - "loss": 0.7644, - "step": 770 - }, - { - "epoch": 0.0904950957496497, - "grad_norm": 0.8379737734794617, - "learning_rate": 4.9799440566163726e-05, - "loss": 0.762, - "step": 775 - }, - { - "epoch": 0.0910789350770668, - "grad_norm": 0.903430163860321, - "learning_rate": 4.979361111036454e-05, - "loss": 0.7778, - "step": 780 - }, - { - "epoch": 0.09166277440448388, - "grad_norm": 0.8099774122238159, - "learning_rate": 4.9787698534681896e-05, - "loss": 0.783, - "step": 785 - }, - { - "epoch": 0.09224661373190098, - "grad_norm": 0.8312159776687622, - "learning_rate": 4.978170286116035e-05, - "loss": 0.7647, - "step": 790 - }, - { - "epoch": 0.09283045305931807, - "grad_norm": 0.7381492257118225, - "learning_rate": 4.9775624112154275e-05, - "loss": 0.7418, - "step": 795 - }, - { - "epoch": 0.09341429238673517, - "grad_norm": 0.7794020175933838, - "learning_rate": 4.976946231032777e-05, - "loss": 0.736, - "step": 800 - }, - { - "epoch": 0.09399813171415226, - "grad_norm": 0.8266839385032654, - "learning_rate": 4.976321747865462e-05, - "loss": 0.7671, - "step": 805 - }, - { - "epoch": 0.09458197104156936, - "grad_norm": 0.8356665372848511, - "learning_rate": 4.975688964041816e-05, - "loss": 0.7685, - "step": 810 - }, - { - "epoch": 0.09516581036898646, - "grad_norm": 0.8353412747383118, - "learning_rate": 4.975047881921119e-05, - "loss": 0.7366, - "step": 815 - }, - { - "epoch": 0.09574964969640355, - "grad_norm": 0.8657653331756592, - "learning_rate": 4.974398503893596e-05, - "loss": 0.7686, - "step": 820 - }, - { - "epoch": 0.09633348902382065, - "grad_norm": 0.7886195778846741, - "learning_rate": 4.973740832380397e-05, - "loss": 0.7767, - "step": 825 - }, - { - "epoch": 0.09691732835123774, - "grad_norm": 0.9126043319702148, - "learning_rate": 4.9730748698335954e-05, - "loss": 0.7882, - "step": 830 - }, - { - "epoch": 0.09750116767865484, - "grad_norm": 0.8418185710906982, - "learning_rate": 4.9724006187361794e-05, - "loss": 0.7805, - "step": 835 - }, - { - "epoch": 0.09808500700607192, - "grad_norm": 0.8840919733047485, - "learning_rate": 4.971718081602037e-05, - "loss": 0.7949, - "step": 840 - }, - { - "epoch": 0.09866884633348902, - "grad_norm": 0.9618021249771118, - "learning_rate": 4.971027260975952e-05, - "loss": 0.7615, - "step": 845 - }, - { - "epoch": 0.09925268566090611, - "grad_norm": 0.9216254949569702, - "learning_rate": 4.9703281594335904e-05, - "loss": 0.7577, - "step": 850 - }, - { - "epoch": 0.09983652498832321, - "grad_norm": 0.8288834691047668, - "learning_rate": 4.969620779581497e-05, - "loss": 0.7622, - "step": 855 - }, - { - "epoch": 0.10042036431574031, - "grad_norm": 0.9365575909614563, - "learning_rate": 4.968905124057077e-05, - "loss": 0.7474, - "step": 860 - }, - { - "epoch": 0.1010042036431574, - "grad_norm": 0.9112325310707092, - "learning_rate": 4.968181195528594e-05, - "loss": 0.7466, - "step": 865 - }, - { - "epoch": 0.1015880429705745, - "grad_norm": 0.756351113319397, - "learning_rate": 4.9674489966951545e-05, - "loss": 0.7636, - "step": 870 - }, - { - "epoch": 0.1021718822979916, - "grad_norm": 0.7719910144805908, - "learning_rate": 4.9667085302867015e-05, - "loss": 0.7473, - "step": 875 - }, - { - "epoch": 0.10275572162540869, - "grad_norm": 0.8179823756217957, - "learning_rate": 4.9659597990640045e-05, - "loss": 0.7503, - "step": 880 - }, - { - "epoch": 0.10333956095282579, - "grad_norm": 0.7557665705680847, - "learning_rate": 4.9652028058186435e-05, - "loss": 0.7618, - "step": 885 - }, - { - "epoch": 0.10392340028024288, - "grad_norm": 0.8512751460075378, - "learning_rate": 4.9644375533730056e-05, - "loss": 0.7829, - "step": 890 - }, - { - "epoch": 0.10450723960765997, - "grad_norm": 0.9742012619972229, - "learning_rate": 4.963664044580272e-05, - "loss": 0.7618, - "step": 895 - }, - { - "epoch": 0.10509107893507706, - "grad_norm": 0.8326590061187744, - "learning_rate": 4.9628822823244056e-05, - "loss": 0.7434, - "step": 900 - }, - { - "epoch": 0.10567491826249416, - "grad_norm": 0.9307273030281067, - "learning_rate": 4.962092269520143e-05, - "loss": 0.7416, - "step": 905 - }, - { - "epoch": 0.10625875758991125, - "grad_norm": 0.9651601314544678, - "learning_rate": 4.96129400911298e-05, - "loss": 0.7731, - "step": 910 - }, - { - "epoch": 0.10684259691732835, - "grad_norm": 0.8530255556106567, - "learning_rate": 4.960487504079166e-05, - "loss": 0.7452, - "step": 915 - }, - { - "epoch": 0.10742643624474545, - "grad_norm": 0.9880965948104858, - "learning_rate": 4.959672757425688e-05, - "loss": 0.7666, - "step": 920 - }, - { - "epoch": 0.10801027557216254, - "grad_norm": 0.8939868807792664, - "learning_rate": 4.958849772190261e-05, - "loss": 0.7621, - "step": 925 - }, - { - "epoch": 0.10859411489957964, - "grad_norm": 0.8080766797065735, - "learning_rate": 4.958018551441317e-05, - "loss": 0.7413, - "step": 930 - }, - { - "epoch": 0.10917795422699673, - "grad_norm": 1.0521798133850098, - "learning_rate": 4.957179098277994e-05, - "loss": 0.7663, - "step": 935 - }, - { - "epoch": 0.10976179355441383, - "grad_norm": 0.8313295245170593, - "learning_rate": 4.956331415830125e-05, - "loss": 0.7348, - "step": 940 - }, - { - "epoch": 0.11034563288183093, - "grad_norm": 0.9212465286254883, - "learning_rate": 4.955475507258222e-05, - "loss": 0.7698, - "step": 945 - }, - { - "epoch": 0.11092947220924801, - "grad_norm": 0.8293212652206421, - "learning_rate": 4.95461137575347e-05, - "loss": 0.7392, - "step": 950 - }, - { - "epoch": 0.1115133115366651, - "grad_norm": 0.957472562789917, - "learning_rate": 4.953739024537712e-05, - "loss": 0.7224, - "step": 955 - }, - { - "epoch": 0.1120971508640822, - "grad_norm": 0.7683919668197632, - "learning_rate": 4.952858456863437e-05, - "loss": 0.7756, - "step": 960 - }, - { - "epoch": 0.1126809901914993, - "grad_norm": 0.875116229057312, - "learning_rate": 4.951969676013768e-05, - "loss": 0.7508, - "step": 965 - }, - { - "epoch": 0.1132648295189164, - "grad_norm": 0.860049307346344, - "learning_rate": 4.951072685302452e-05, - "loss": 0.7484, - "step": 970 - }, - { - "epoch": 0.11384866884633349, - "grad_norm": 0.7837722301483154, - "learning_rate": 4.950167488073844e-05, - "loss": 0.7404, - "step": 975 - }, - { - "epoch": 0.11443250817375059, - "grad_norm": 0.9099584221839905, - "learning_rate": 4.949254087702896e-05, - "loss": 0.749, - "step": 980 - }, - { - "epoch": 0.11501634750116768, - "grad_norm": 0.7468155026435852, - "learning_rate": 4.948332487595148e-05, - "loss": 0.7553, - "step": 985 - }, - { - "epoch": 0.11560018682858478, - "grad_norm": 0.758169949054718, - "learning_rate": 4.9474026911867084e-05, - "loss": 0.7581, - "step": 990 - }, - { - "epoch": 0.11618402615600187, - "grad_norm": 0.8177466988563538, - "learning_rate": 4.9464647019442465e-05, - "loss": 0.7249, - "step": 995 - }, - { - "epoch": 0.11676786548341897, - "grad_norm": 0.8930375576019287, - "learning_rate": 4.945518523364976e-05, - "loss": 0.7559, - "step": 1000 - }, - { - "epoch": 0.11735170481083605, - "grad_norm": 0.7805625796318054, - "learning_rate": 4.944564158976647e-05, - "loss": 0.7594, - "step": 1005 - }, - { - "epoch": 0.11793554413825315, - "grad_norm": 0.8927081227302551, - "learning_rate": 4.943601612337528e-05, - "loss": 0.7519, - "step": 1010 - }, - { - "epoch": 0.11851938346567024, - "grad_norm": 1.036241054534912, - "learning_rate": 4.9426308870363934e-05, - "loss": 0.7521, - "step": 1015 - }, - { - "epoch": 0.11910322279308734, - "grad_norm": 0.8101186156272888, - "learning_rate": 4.941651986692514e-05, - "loss": 0.7633, - "step": 1020 - }, - { - "epoch": 0.11968706212050444, - "grad_norm": 0.8684472441673279, - "learning_rate": 4.940664914955637e-05, - "loss": 0.7547, - "step": 1025 - }, - { - "epoch": 0.12027090144792153, - "grad_norm": 0.7512865662574768, - "learning_rate": 4.939669675505978e-05, - "loss": 0.7656, - "step": 1030 - }, - { - "epoch": 0.12085474077533863, - "grad_norm": 0.7747283577919006, - "learning_rate": 4.938666272054205e-05, - "loss": 0.7355, - "step": 1035 - }, - { - "epoch": 0.12143858010275572, - "grad_norm": 0.74598628282547, - "learning_rate": 4.937654708341425e-05, - "loss": 0.7664, - "step": 1040 - }, - { - "epoch": 0.12202241943017282, - "grad_norm": 0.7878855466842651, - "learning_rate": 4.93663498813917e-05, - "loss": 0.7541, - "step": 1045 - }, - { - "epoch": 0.12260625875758992, - "grad_norm": 0.7905980348587036, - "learning_rate": 4.9356071152493815e-05, - "loss": 0.7451, - "step": 1050 - }, - { - "epoch": 0.123190098085007, - "grad_norm": 0.8013576865196228, - "learning_rate": 4.934571093504398e-05, - "loss": 0.7421, - "step": 1055 - }, - { - "epoch": 0.1237739374124241, - "grad_norm": 0.8601541519165039, - "learning_rate": 4.933526926766943e-05, - "loss": 0.7737, - "step": 1060 - }, - { - "epoch": 0.12435777673984119, - "grad_norm": 0.8373185396194458, - "learning_rate": 4.9324746189301027e-05, - "loss": 0.7609, - "step": 1065 - }, - { - "epoch": 0.12494161606725829, - "grad_norm": 0.859311580657959, - "learning_rate": 4.9314141739173223e-05, - "loss": 0.7568, - "step": 1070 - }, - { - "epoch": 0.12552545539467538, - "grad_norm": 0.811808705329895, - "learning_rate": 4.9303455956823816e-05, - "loss": 0.7687, - "step": 1075 - }, - { - "epoch": 0.12610929472209248, - "grad_norm": 0.7639419436454773, - "learning_rate": 4.929268888209388e-05, - "loss": 0.736, - "step": 1080 - }, - { - "epoch": 0.12669313404950958, - "grad_norm": 0.8552339673042297, - "learning_rate": 4.928184055512754e-05, - "loss": 0.7503, - "step": 1085 - }, - { - "epoch": 0.12727697337692667, - "grad_norm": 0.748649537563324, - "learning_rate": 4.927091101637189e-05, - "loss": 0.7412, - "step": 1090 - }, - { - "epoch": 0.12786081270434377, - "grad_norm": 0.8011511564254761, - "learning_rate": 4.9259900306576825e-05, - "loss": 0.728, - "step": 1095 - }, - { - "epoch": 0.12844465203176086, - "grad_norm": 0.7698752880096436, - "learning_rate": 4.924880846679485e-05, - "loss": 0.746, - "step": 1100 - }, - { - "epoch": 0.12902849135917796, - "grad_norm": 0.7327059507369995, - "learning_rate": 4.923763553838098e-05, - "loss": 0.7518, - "step": 1105 - }, - { - "epoch": 0.12961233068659506, - "grad_norm": 0.7117829918861389, - "learning_rate": 4.9226381562992546e-05, - "loss": 0.758, - "step": 1110 - }, - { - "epoch": 0.13019617001401215, - "grad_norm": 0.7271001935005188, - "learning_rate": 4.9215046582589066e-05, - "loss": 0.7494, - "step": 1115 - }, - { - "epoch": 0.13078000934142925, - "grad_norm": 0.8495776653289795, - "learning_rate": 4.9203630639432083e-05, - "loss": 0.7417, - "step": 1120 - }, - { - "epoch": 0.13136384866884634, - "grad_norm": 1.0051515102386475, - "learning_rate": 4.919213377608499e-05, - "loss": 0.7589, - "step": 1125 - }, - { - "epoch": 0.13194768799626344, - "grad_norm": 0.799396276473999, - "learning_rate": 4.9180556035412876e-05, - "loss": 0.7427, - "step": 1130 - }, - { - "epoch": 0.13253152732368054, - "grad_norm": 1.0050220489501953, - "learning_rate": 4.916889746058242e-05, - "loss": 0.766, - "step": 1135 - }, - { - "epoch": 0.1331153666510976, - "grad_norm": 0.9699201583862305, - "learning_rate": 4.9157158095061636e-05, - "loss": 0.7422, - "step": 1140 - }, - { - "epoch": 0.1336992059785147, - "grad_norm": 0.8608420491218567, - "learning_rate": 4.914533798261977e-05, - "loss": 0.7368, - "step": 1145 - }, - { - "epoch": 0.1342830453059318, - "grad_norm": 0.9254468679428101, - "learning_rate": 4.913343716732713e-05, - "loss": 0.7569, - "step": 1150 - }, - { - "epoch": 0.1348668846333489, - "grad_norm": 0.8214322924613953, - "learning_rate": 4.912145569355495e-05, - "loss": 0.7441, - "step": 1155 - }, - { - "epoch": 0.135450723960766, - "grad_norm": 0.7923891544342041, - "learning_rate": 4.910939360597514e-05, - "loss": 0.7309, - "step": 1160 - }, - { - "epoch": 0.13603456328818309, - "grad_norm": 0.7858424782752991, - "learning_rate": 4.909725094956019e-05, - "loss": 0.7547, - "step": 1165 - }, - { - "epoch": 0.13661840261560018, - "grad_norm": 0.7866080403327942, - "learning_rate": 4.908502776958301e-05, - "loss": 0.7392, - "step": 1170 - }, - { - "epoch": 0.13720224194301728, - "grad_norm": 0.9697911143302917, - "learning_rate": 4.907272411161668e-05, - "loss": 0.7389, - "step": 1175 - }, - { - "epoch": 0.13778608127043437, - "grad_norm": 0.7810333967208862, - "learning_rate": 4.9060340021534415e-05, - "loss": 0.7409, - "step": 1180 - }, - { - "epoch": 0.13836992059785147, - "grad_norm": 0.7970030307769775, - "learning_rate": 4.9047875545509235e-05, - "loss": 0.7348, - "step": 1185 - }, - { - "epoch": 0.13895375992526857, - "grad_norm": 0.7652000188827515, - "learning_rate": 4.9035330730013926e-05, - "loss": 0.7359, - "step": 1190 - }, - { - "epoch": 0.13953759925268566, - "grad_norm": 0.7753109335899353, - "learning_rate": 4.9022705621820786e-05, - "loss": 0.7217, - "step": 1195 - }, - { - "epoch": 0.14012143858010276, - "grad_norm": 0.7864140868186951, - "learning_rate": 4.901000026800148e-05, - "loss": 0.7505, - "step": 1200 - }, - { - "epoch": 0.14070527790751985, - "grad_norm": 0.8015544414520264, - "learning_rate": 4.899721471592688e-05, - "loss": 0.7302, - "step": 1205 - }, - { - "epoch": 0.14128911723493695, - "grad_norm": 0.7648766040802002, - "learning_rate": 4.898434901326685e-05, - "loss": 0.724, - "step": 1210 - }, - { - "epoch": 0.14187295656235405, - "grad_norm": 0.8306850790977478, - "learning_rate": 4.897140320799011e-05, - "loss": 0.7312, - "step": 1215 - }, - { - "epoch": 0.14245679588977114, - "grad_norm": 0.8447693586349487, - "learning_rate": 4.8958377348364e-05, - "loss": 0.7514, - "step": 1220 - }, - { - "epoch": 0.14304063521718824, - "grad_norm": 0.7389149069786072, - "learning_rate": 4.894527148295438e-05, - "loss": 0.7499, - "step": 1225 - }, - { - "epoch": 0.14362447454460534, - "grad_norm": 1.007028579711914, - "learning_rate": 4.8932085660625374e-05, - "loss": 0.753, - "step": 1230 - }, - { - "epoch": 0.14420831387202243, - "grad_norm": 0.8949772715568542, - "learning_rate": 4.8918819930539244e-05, - "loss": 0.7382, - "step": 1235 - }, - { - "epoch": 0.14479215319943953, - "grad_norm": 0.8570683002471924, - "learning_rate": 4.8905474342156144e-05, - "loss": 0.7468, - "step": 1240 - }, - { - "epoch": 0.1453759925268566, - "grad_norm": 0.8168264031410217, - "learning_rate": 4.889204894523401e-05, - "loss": 0.7468, - "step": 1245 - }, - { - "epoch": 0.1459598318542737, - "grad_norm": 0.7849008440971375, - "learning_rate": 4.8878543789828314e-05, - "loss": 0.7387, - "step": 1250 - }, - { - "epoch": 0.1465436711816908, - "grad_norm": 0.7741910219192505, - "learning_rate": 4.886495892629191e-05, - "loss": 0.7539, - "step": 1255 - }, - { - "epoch": 0.14712751050910788, - "grad_norm": 0.7739510536193848, - "learning_rate": 4.8851294405274855e-05, - "loss": 0.7235, - "step": 1260 - }, - { - "epoch": 0.14771134983652498, - "grad_norm": 0.7775066494941711, - "learning_rate": 4.8837550277724165e-05, - "loss": 0.7319, - "step": 1265 - }, - { - "epoch": 0.14829518916394208, - "grad_norm": 0.8568857312202454, - "learning_rate": 4.8823726594883696e-05, - "loss": 0.7449, - "step": 1270 - }, - { - "epoch": 0.14887902849135917, - "grad_norm": 0.8095642328262329, - "learning_rate": 4.8809823408293887e-05, - "loss": 0.7459, - "step": 1275 - }, - { - "epoch": 0.14946286781877627, - "grad_norm": 0.7756009697914124, - "learning_rate": 4.8795840769791634e-05, - "loss": 0.7492, - "step": 1280 - }, - { - "epoch": 0.15004670714619336, - "grad_norm": 0.7961511611938477, - "learning_rate": 4.878177873151004e-05, - "loss": 0.7441, - "step": 1285 - }, - { - "epoch": 0.15063054647361046, - "grad_norm": 0.8107098937034607, - "learning_rate": 4.876763734587825e-05, - "loss": 0.7356, - "step": 1290 - }, - { - "epoch": 0.15121438580102756, - "grad_norm": 0.7119124531745911, - "learning_rate": 4.8753416665621255e-05, - "loss": 0.7196, - "step": 1295 - }, - { - "epoch": 0.15179822512844465, - "grad_norm": 0.7364365458488464, - "learning_rate": 4.873911674375968e-05, - "loss": 0.7513, - "step": 1300 - }, - { - "epoch": 0.15238206445586175, - "grad_norm": 0.865628182888031, - "learning_rate": 4.87247376336096e-05, - "loss": 0.7458, - "step": 1305 - }, - { - "epoch": 0.15296590378327884, - "grad_norm": 0.8725950121879578, - "learning_rate": 4.8710279388782345e-05, - "loss": 0.7379, - "step": 1310 - }, - { - "epoch": 0.15354974311069594, - "grad_norm": 0.8291376233100891, - "learning_rate": 4.869574206318427e-05, - "loss": 0.729, - "step": 1315 - }, - { - "epoch": 0.15413358243811304, - "grad_norm": 0.7126283645629883, - "learning_rate": 4.868112571101659e-05, - "loss": 0.7121, - "step": 1320 - }, - { - "epoch": 0.15471742176553013, - "grad_norm": 0.8371273279190063, - "learning_rate": 4.866643038677519e-05, - "loss": 0.7421, - "step": 1325 - }, - { - "epoch": 0.15530126109294723, - "grad_norm": 0.8473025560379028, - "learning_rate": 4.865165614525033e-05, - "loss": 0.7433, - "step": 1330 - }, - { - "epoch": 0.15588510042036433, - "grad_norm": 0.7817255854606628, - "learning_rate": 4.863680304152657e-05, - "loss": 0.7467, - "step": 1335 - }, - { - "epoch": 0.15646893974778142, - "grad_norm": 0.8005650043487549, - "learning_rate": 4.862187113098249e-05, - "loss": 0.7467, - "step": 1340 - }, - { - "epoch": 0.15705277907519852, - "grad_norm": 0.8888115286827087, - "learning_rate": 4.8606860469290454e-05, - "loss": 0.7342, - "step": 1345 - }, - { - "epoch": 0.1576366184026156, - "grad_norm": 0.8558804392814636, - "learning_rate": 4.859177111241649e-05, - "loss": 0.7181, - "step": 1350 - }, - { - "epoch": 0.15822045773003268, - "grad_norm": 0.8104288578033447, - "learning_rate": 4.8576603116620004e-05, - "loss": 0.7328, - "step": 1355 - }, - { - "epoch": 0.15880429705744978, - "grad_norm": 0.8033810257911682, - "learning_rate": 4.8561356538453625e-05, - "loss": 0.7216, - "step": 1360 - }, - { - "epoch": 0.15938813638486687, - "grad_norm": 0.7808504104614258, - "learning_rate": 4.8546031434762954e-05, - "loss": 0.7559, - "step": 1365 - }, - { - "epoch": 0.15997197571228397, - "grad_norm": 0.8111550807952881, - "learning_rate": 4.853062786268636e-05, - "loss": 0.7506, - "step": 1370 - }, - { - "epoch": 0.16055581503970107, - "grad_norm": 0.7793529629707336, - "learning_rate": 4.85151458796548e-05, - "loss": 0.7173, - "step": 1375 - }, - { - "epoch": 0.16113965436711816, - "grad_norm": 0.8663891553878784, - "learning_rate": 4.849958554339156e-05, - "loss": 0.7539, - "step": 1380 - }, - { - "epoch": 0.16172349369453526, - "grad_norm": 0.9149182438850403, - "learning_rate": 4.8483946911912064e-05, - "loss": 0.7456, - "step": 1385 - }, - { - "epoch": 0.16230733302195235, - "grad_norm": 0.8218986988067627, - "learning_rate": 4.846823004352366e-05, - "loss": 0.7269, - "step": 1390 - }, - { - "epoch": 0.16289117234936945, - "grad_norm": 0.931961178779602, - "learning_rate": 4.845243499682539e-05, - "loss": 0.7521, - "step": 1395 - }, - { - "epoch": 0.16347501167678655, - "grad_norm": 0.7669240236282349, - "learning_rate": 4.8436561830707786e-05, - "loss": 0.7337, - "step": 1400 - }, - { - "epoch": 0.16405885100420364, - "grad_norm": 0.8319959044456482, - "learning_rate": 4.842061060435261e-05, - "loss": 0.7299, - "step": 1405 - }, - { - "epoch": 0.16464269033162074, - "grad_norm": 0.7546858191490173, - "learning_rate": 4.840458137723271e-05, - "loss": 0.7024, - "step": 1410 - }, - { - "epoch": 0.16522652965903784, - "grad_norm": 0.9058648347854614, - "learning_rate": 4.838847420911172e-05, - "loss": 0.7234, - "step": 1415 - }, - { - "epoch": 0.16581036898645493, - "grad_norm": 0.8408836126327515, - "learning_rate": 4.8372289160043895e-05, - "loss": 0.7398, - "step": 1420 - }, - { - "epoch": 0.16639420831387203, - "grad_norm": 0.8477882742881775, - "learning_rate": 4.835602629037384e-05, - "loss": 0.7309, - "step": 1425 - }, - { - "epoch": 0.16697804764128912, - "grad_norm": 0.75185626745224, - "learning_rate": 4.8339685660736324e-05, - "loss": 0.724, - "step": 1430 - }, - { - "epoch": 0.16756188696870622, - "grad_norm": 0.8539434671401978, - "learning_rate": 4.8323267332056026e-05, - "loss": 0.7342, - "step": 1435 - }, - { - "epoch": 0.16814572629612332, - "grad_norm": 0.7879071831703186, - "learning_rate": 4.830677136554733e-05, - "loss": 0.7321, - "step": 1440 - }, - { - "epoch": 0.1687295656235404, - "grad_norm": 0.8851377367973328, - "learning_rate": 4.829019782271408e-05, - "loss": 0.7534, - "step": 1445 - }, - { - "epoch": 0.1693134049509575, - "grad_norm": 0.7765153646469116, - "learning_rate": 4.827354676534937e-05, - "loss": 0.747, - "step": 1450 - }, - { - "epoch": 0.1698972442783746, - "grad_norm": 0.7362776398658752, - "learning_rate": 4.825681825553527e-05, - "loss": 0.74, - "step": 1455 - }, - { - "epoch": 0.17048108360579167, - "grad_norm": 0.7507404685020447, - "learning_rate": 4.824001235564265e-05, - "loss": 0.7462, - "step": 1460 - }, - { - "epoch": 0.17106492293320877, - "grad_norm": 0.8206664323806763, - "learning_rate": 4.822312912833092e-05, - "loss": 0.7308, - "step": 1465 - }, - { - "epoch": 0.17164876226062586, - "grad_norm": 0.8568519949913025, - "learning_rate": 4.82061686365478e-05, - "loss": 0.7341, - "step": 1470 - }, - { - "epoch": 0.17223260158804296, - "grad_norm": 0.8819417953491211, - "learning_rate": 4.818913094352907e-05, - "loss": 0.732, - "step": 1475 - }, - { - "epoch": 0.17281644091546006, - "grad_norm": 0.7527371644973755, - "learning_rate": 4.8172016112798364e-05, - "loss": 0.7458, - "step": 1480 - }, - { - "epoch": 0.17340028024287715, - "grad_norm": 0.7779048085212708, - "learning_rate": 4.8154824208166906e-05, - "loss": 0.7606, - "step": 1485 - }, - { - "epoch": 0.17398411957029425, - "grad_norm": 0.7730028629302979, - "learning_rate": 4.8137555293733294e-05, - "loss": 0.7276, - "step": 1490 - }, - { - "epoch": 0.17456795889771135, - "grad_norm": 0.827379584312439, - "learning_rate": 4.812020943388324e-05, - "loss": 0.7136, - "step": 1495 - }, - { - "epoch": 0.17515179822512844, - "grad_norm": 0.7734323143959045, - "learning_rate": 4.810278669328935e-05, - "loss": 0.744, - "step": 1500 - }, - { - "epoch": 0.17573563755254554, - "grad_norm": 0.8954327702522278, - "learning_rate": 4.808528713691087e-05, - "loss": 0.718, - "step": 1505 - }, - { - "epoch": 0.17631947687996263, - "grad_norm": 0.906179666519165, - "learning_rate": 4.806771082999346e-05, - "loss": 0.75, - "step": 1510 - }, - { - "epoch": 0.17690331620737973, - "grad_norm": 0.7314246296882629, - "learning_rate": 4.8050057838068904e-05, - "loss": 0.7432, - "step": 1515 - }, - { - "epoch": 0.17748715553479683, - "grad_norm": 0.8068938851356506, - "learning_rate": 4.803232822695493e-05, - "loss": 0.7344, - "step": 1520 - }, - { - "epoch": 0.17807099486221392, - "grad_norm": 0.8768115639686584, - "learning_rate": 4.801452206275493e-05, - "loss": 0.7292, - "step": 1525 - }, - { - "epoch": 0.17865483418963102, - "grad_norm": 0.7669022679328918, - "learning_rate": 4.79966394118577e-05, - "loss": 0.7226, - "step": 1530 - }, - { - "epoch": 0.1792386735170481, - "grad_norm": 0.8907231688499451, - "learning_rate": 4.797868034093724e-05, - "loss": 0.7485, - "step": 1535 - }, - { - "epoch": 0.1798225128444652, - "grad_norm": 0.9018024802207947, - "learning_rate": 4.7960644916952444e-05, - "loss": 0.7425, - "step": 1540 - }, - { - "epoch": 0.1804063521718823, - "grad_norm": 0.815697193145752, - "learning_rate": 4.7942533207146916e-05, - "loss": 0.7509, - "step": 1545 - }, - { - "epoch": 0.1809901914992994, - "grad_norm": 0.8392460942268372, - "learning_rate": 4.792434527904864e-05, - "loss": 0.7428, - "step": 1550 - }, - { - "epoch": 0.1815740308267165, - "grad_norm": 0.7268983125686646, - "learning_rate": 4.7906081200469835e-05, - "loss": 0.7309, - "step": 1555 - }, - { - "epoch": 0.1821578701541336, - "grad_norm": 0.7090856432914734, - "learning_rate": 4.788774103950657e-05, - "loss": 0.7269, - "step": 1560 - }, - { - "epoch": 0.1827417094815507, - "grad_norm": 0.9157699942588806, - "learning_rate": 4.7869324864538636e-05, - "loss": 0.7333, - "step": 1565 - }, - { - "epoch": 0.18332554880896776, - "grad_norm": 0.8036680817604065, - "learning_rate": 4.7850832744229216e-05, - "loss": 0.7409, - "step": 1570 - }, - { - "epoch": 0.18390938813638485, - "grad_norm": 0.8338437676429749, - "learning_rate": 4.783226474752465e-05, - "loss": 0.7391, - "step": 1575 - }, - { - "epoch": 0.18449322746380195, - "grad_norm": 0.8090097904205322, - "learning_rate": 4.781362094365417e-05, - "loss": 0.72, - "step": 1580 - }, - { - "epoch": 0.18507706679121905, - "grad_norm": 0.8660805225372314, - "learning_rate": 4.779490140212966e-05, - "loss": 0.7186, - "step": 1585 - }, - { - "epoch": 0.18566090611863614, - "grad_norm": 0.8995793461799622, - "learning_rate": 4.777610619274539e-05, - "loss": 0.7359, - "step": 1590 - }, - { - "epoch": 0.18624474544605324, - "grad_norm": 0.7604996562004089, - "learning_rate": 4.775723538557772e-05, - "loss": 0.7149, - "step": 1595 - }, - { - "epoch": 0.18682858477347034, - "grad_norm": 0.8463579416275024, - "learning_rate": 4.7738289050984905e-05, - "loss": 0.7405, - "step": 1600 - }, - { - "epoch": 0.18741242410088743, - "grad_norm": 0.9610666632652283, - "learning_rate": 4.7719267259606795e-05, - "loss": 0.7286, - "step": 1605 - }, - { - "epoch": 0.18799626342830453, - "grad_norm": 0.9258811473846436, - "learning_rate": 4.770017008236455e-05, - "loss": 0.7415, - "step": 1610 - }, - { - "epoch": 0.18858010275572162, - "grad_norm": 0.8553178310394287, - "learning_rate": 4.768099759046042e-05, - "loss": 0.712, - "step": 1615 - }, - { - "epoch": 0.18916394208313872, - "grad_norm": 0.7163110375404358, - "learning_rate": 4.766174985537745e-05, - "loss": 0.7048, - "step": 1620 - }, - { - "epoch": 0.18974778141055582, - "grad_norm": 0.761073887348175, - "learning_rate": 4.7642426948879234e-05, - "loss": 0.7123, - "step": 1625 - }, - { - "epoch": 0.1903316207379729, - "grad_norm": 0.8209511637687683, - "learning_rate": 4.762302894300962e-05, - "loss": 0.7453, - "step": 1630 - }, - { - "epoch": 0.19091546006539, - "grad_norm": 0.7964670062065125, - "learning_rate": 4.760355591009247e-05, - "loss": 0.7409, - "step": 1635 - }, - { - "epoch": 0.1914992993928071, - "grad_norm": 0.7329254746437073, - "learning_rate": 4.7584007922731383e-05, - "loss": 0.7366, - "step": 1640 - }, - { - "epoch": 0.1920831387202242, - "grad_norm": 0.7424750328063965, - "learning_rate": 4.75643850538094e-05, - "loss": 0.7171, - "step": 1645 - }, - { - "epoch": 0.1926669780476413, - "grad_norm": 0.8553922772407532, - "learning_rate": 4.754468737648878e-05, - "loss": 0.7256, - "step": 1650 - }, - { - "epoch": 0.1932508173750584, - "grad_norm": 0.8064399361610413, - "learning_rate": 4.752491496421066e-05, - "loss": 0.7318, - "step": 1655 - }, - { - "epoch": 0.1938346567024755, - "grad_norm": 0.743678629398346, - "learning_rate": 4.750506789069486e-05, - "loss": 0.7267, - "step": 1660 - }, - { - "epoch": 0.19441849602989258, - "grad_norm": 0.834180474281311, - "learning_rate": 4.7485146229939545e-05, - "loss": 0.7244, - "step": 1665 - }, - { - "epoch": 0.19500233535730968, - "grad_norm": 0.8548952341079712, - "learning_rate": 4.746515005622097e-05, - "loss": 0.7197, - "step": 1670 - }, - { - "epoch": 0.19558617468472678, - "grad_norm": 0.9763193726539612, - "learning_rate": 4.744507944409322e-05, - "loss": 0.7448, - "step": 1675 - }, - { - "epoch": 0.19617001401214385, - "grad_norm": 0.8389405608177185, - "learning_rate": 4.742493446838791e-05, - "loss": 0.711, - "step": 1680 - }, - { - "epoch": 0.19675385333956094, - "grad_norm": 0.7858707308769226, - "learning_rate": 4.740471520421392e-05, - "loss": 0.7267, - "step": 1685 - }, - { - "epoch": 0.19733769266697804, - "grad_norm": 0.7936954498291016, - "learning_rate": 4.73844217269571e-05, - "loss": 0.7247, - "step": 1690 - }, - { - "epoch": 0.19792153199439513, - "grad_norm": 0.7359188795089722, - "learning_rate": 4.736405411228e-05, - "loss": 0.715, - "step": 1695 - }, - { - "epoch": 0.19850537132181223, - "grad_norm": 0.8808438777923584, - "learning_rate": 4.7343612436121575e-05, - "loss": 0.763, - "step": 1700 - }, - { - "epoch": 0.19908921064922933, - "grad_norm": 0.741851806640625, - "learning_rate": 4.732309677469693e-05, - "loss": 0.742, - "step": 1705 - }, - { - "epoch": 0.19967304997664642, - "grad_norm": 0.7666711211204529, - "learning_rate": 4.7302507204497026e-05, - "loss": 0.7197, - "step": 1710 - }, - { - "epoch": 0.20025688930406352, - "grad_norm": 0.8063815832138062, - "learning_rate": 4.728184380228834e-05, - "loss": 0.7382, - "step": 1715 - }, - { - "epoch": 0.20084072863148061, - "grad_norm": 0.73138427734375, - "learning_rate": 4.7261106645112677e-05, - "loss": 0.7286, - "step": 1720 - }, - { - "epoch": 0.2014245679588977, - "grad_norm": 0.7864614725112915, - "learning_rate": 4.72402958102868e-05, - "loss": 0.7472, - "step": 1725 - }, - { - "epoch": 0.2020084072863148, - "grad_norm": 0.7947973012924194, - "learning_rate": 4.72194113754022e-05, - "loss": 0.7304, - "step": 1730 - }, - { - "epoch": 0.2025922466137319, - "grad_norm": 0.7898170948028564, - "learning_rate": 4.719845341832475e-05, - "loss": 0.6894, - "step": 1735 - }, - { - "epoch": 0.203176085941149, - "grad_norm": 1.143188238143921, - "learning_rate": 4.7177422017194464e-05, - "loss": 0.7212, - "step": 1740 - }, - { - "epoch": 0.2037599252685661, - "grad_norm": 0.9604560136795044, - "learning_rate": 4.715631725042517e-05, - "loss": 0.7435, - "step": 1745 - }, - { - "epoch": 0.2043437645959832, - "grad_norm": 0.8488959670066833, - "learning_rate": 4.7135139196704254e-05, - "loss": 0.7242, - "step": 1750 - }, - { - "epoch": 0.2049276039234003, - "grad_norm": 0.8216392397880554, - "learning_rate": 4.711388793499233e-05, - "loss": 0.7207, - "step": 1755 - }, - { - "epoch": 0.20551144325081738, - "grad_norm": 0.8619979023933411, - "learning_rate": 4.709256354452298e-05, - "loss": 0.7302, - "step": 1760 - }, - { - "epoch": 0.20609528257823448, - "grad_norm": 0.8287932872772217, - "learning_rate": 4.7071166104802415e-05, - "loss": 0.7246, - "step": 1765 - }, - { - "epoch": 0.20667912190565157, - "grad_norm": 0.7801551818847656, - "learning_rate": 4.7049695695609224e-05, - "loss": 0.7115, - "step": 1770 - }, - { - "epoch": 0.20726296123306867, - "grad_norm": 0.7320568561553955, - "learning_rate": 4.702815239699405e-05, - "loss": 0.7155, - "step": 1775 - }, - { - "epoch": 0.20784680056048577, - "grad_norm": 0.7555932998657227, - "learning_rate": 4.7006536289279285e-05, - "loss": 0.7184, - "step": 1780 - }, - { - "epoch": 0.20843063988790284, - "grad_norm": 0.7925651669502258, - "learning_rate": 4.698484745305882e-05, - "loss": 0.7015, - "step": 1785 - }, - { - "epoch": 0.20901447921531993, - "grad_norm": 0.8180797696113586, - "learning_rate": 4.696308596919767e-05, - "loss": 0.7464, - "step": 1790 - }, - { - "epoch": 0.20959831854273703, - "grad_norm": 0.7624222636222839, - "learning_rate": 4.694125191883174e-05, - "loss": 0.7201, - "step": 1795 - }, - { - "epoch": 0.21018215787015412, - "grad_norm": 0.7176441550254822, - "learning_rate": 4.691934538336746e-05, - "loss": 0.7259, - "step": 1800 - }, - { - "epoch": 0.21076599719757122, - "grad_norm": 0.7426697611808777, - "learning_rate": 4.6897366444481545e-05, - "loss": 0.719, - "step": 1805 - }, - { - "epoch": 0.21134983652498832, - "grad_norm": 0.8434226512908936, - "learning_rate": 4.687531518412065e-05, - "loss": 0.7312, - "step": 1810 - }, - { - "epoch": 0.2119336758524054, - "grad_norm": 0.7939966917037964, - "learning_rate": 4.685319168450107e-05, - "loss": 0.6946, - "step": 1815 - }, - { - "epoch": 0.2125175151798225, - "grad_norm": 0.8017293810844421, - "learning_rate": 4.683099602810845e-05, - "loss": 0.7453, - "step": 1820 - }, - { - "epoch": 0.2131013545072396, - "grad_norm": 0.7662323713302612, - "learning_rate": 4.680872829769745e-05, - "loss": 0.7145, - "step": 1825 - }, - { - "epoch": 0.2136851938346567, - "grad_norm": 0.8610487580299377, - "learning_rate": 4.6786388576291446e-05, - "loss": 0.739, - "step": 1830 - }, - { - "epoch": 0.2142690331620738, - "grad_norm": 0.7974783778190613, - "learning_rate": 4.6763976947182256e-05, - "loss": 0.7154, - "step": 1835 - }, - { - "epoch": 0.2148528724894909, - "grad_norm": 0.8084165453910828, - "learning_rate": 4.6741493493929794e-05, - "loss": 0.7202, - "step": 1840 - }, - { - "epoch": 0.215436711816908, - "grad_norm": 0.8559662103652954, - "learning_rate": 4.671893830036174e-05, - "loss": 0.7284, - "step": 1845 - }, - { - "epoch": 0.21602055114432508, - "grad_norm": 0.9049946665763855, - "learning_rate": 4.6696311450573266e-05, - "loss": 0.7453, - "step": 1850 - }, - { - "epoch": 0.21660439047174218, - "grad_norm": 0.9491698145866394, - "learning_rate": 4.667361302892671e-05, - "loss": 0.7048, - "step": 1855 - }, - { - "epoch": 0.21718822979915928, - "grad_norm": 0.9097338318824768, - "learning_rate": 4.665084312005126e-05, - "loss": 0.6934, - "step": 1860 - }, - { - "epoch": 0.21777206912657637, - "grad_norm": 0.7439709305763245, - "learning_rate": 4.662800180884263e-05, - "loss": 0.7397, - "step": 1865 - }, - { - "epoch": 0.21835590845399347, - "grad_norm": 0.8445188403129578, - "learning_rate": 4.660508918046277e-05, - "loss": 0.7237, - "step": 1870 - }, - { - "epoch": 0.21893974778141057, - "grad_norm": 0.725067675113678, - "learning_rate": 4.658210532033951e-05, - "loss": 0.7302, - "step": 1875 - }, - { - "epoch": 0.21952358710882766, - "grad_norm": 0.8000083565711975, - "learning_rate": 4.6559050314166264e-05, - "loss": 0.718, - "step": 1880 - }, - { - "epoch": 0.22010742643624476, - "grad_norm": 0.8217325210571289, - "learning_rate": 4.653592424790172e-05, - "loss": 0.732, - "step": 1885 - }, - { - "epoch": 0.22069126576366185, - "grad_norm": 0.7673560976982117, - "learning_rate": 4.6512727207769504e-05, - "loss": 0.7343, - "step": 1890 - }, - { - "epoch": 0.22127510509107892, - "grad_norm": 0.7679688930511475, - "learning_rate": 4.6489459280257856e-05, - "loss": 0.7172, - "step": 1895 - }, - { - "epoch": 0.22185894441849602, - "grad_norm": 0.8244835138320923, - "learning_rate": 4.646612055211933e-05, - "loss": 0.7401, - "step": 1900 - }, - { - "epoch": 0.22244278374591311, - "grad_norm": 0.7303513884544373, - "learning_rate": 4.6442711110370424e-05, - "loss": 0.6978, - "step": 1905 - }, - { - "epoch": 0.2230266230733302, - "grad_norm": 0.9410842657089233, - "learning_rate": 4.64192310422913e-05, - "loss": 0.7098, - "step": 1910 - }, - { - "epoch": 0.2236104624007473, - "grad_norm": 0.727407693862915, - "learning_rate": 4.639568043542548e-05, - "loss": 0.7048, - "step": 1915 - }, - { - "epoch": 0.2241943017281644, - "grad_norm": 0.7892005443572998, - "learning_rate": 4.6372059377579414e-05, - "loss": 0.723, - "step": 1920 - }, - { - "epoch": 0.2247781410555815, - "grad_norm": 0.8230664134025574, - "learning_rate": 4.634836795682228e-05, - "loss": 0.7155, - "step": 1925 - }, - { - "epoch": 0.2253619803829986, - "grad_norm": 0.7650606632232666, - "learning_rate": 4.632460626148558e-05, - "loss": 0.727, - "step": 1930 - }, - { - "epoch": 0.2259458197104157, - "grad_norm": 0.7176103591918945, - "learning_rate": 4.6300774380162825e-05, - "loss": 0.7307, - "step": 1935 - }, - { - "epoch": 0.2265296590378328, - "grad_norm": 0.7648828625679016, - "learning_rate": 4.627687240170921e-05, - "loss": 0.7206, - "step": 1940 - }, - { - "epoch": 0.22711349836524988, - "grad_norm": 0.8299161195755005, - "learning_rate": 4.625290041524128e-05, - "loss": 0.7235, - "step": 1945 - }, - { - "epoch": 0.22769733769266698, - "grad_norm": 0.7513009905815125, - "learning_rate": 4.6228858510136616e-05, - "loss": 0.7148, - "step": 1950 - }, - { - "epoch": 0.22828117702008408, - "grad_norm": 0.74496990442276, - "learning_rate": 4.620474677603345e-05, - "loss": 0.7244, - "step": 1955 - }, - { - "epoch": 0.22886501634750117, - "grad_norm": 0.7487322092056274, - "learning_rate": 4.61805653028304e-05, - "loss": 0.7202, - "step": 1960 - }, - { - "epoch": 0.22944885567491827, - "grad_norm": 0.935059130191803, - "learning_rate": 4.615631418068609e-05, - "loss": 0.737, - "step": 1965 - }, - { - "epoch": 0.23003269500233536, - "grad_norm": 0.8210546970367432, - "learning_rate": 4.613199350001881e-05, - "loss": 0.7316, - "step": 1970 - }, - { - "epoch": 0.23061653432975246, - "grad_norm": 0.9004955291748047, - "learning_rate": 4.6107603351506205e-05, - "loss": 0.7247, - "step": 1975 - }, - { - "epoch": 0.23120037365716956, - "grad_norm": 0.792767345905304, - "learning_rate": 4.608314382608493e-05, - "loss": 0.7187, - "step": 1980 - }, - { - "epoch": 0.23178421298458665, - "grad_norm": 0.731279730796814, - "learning_rate": 4.6058615014950315e-05, - "loss": 0.7348, - "step": 1985 - }, - { - "epoch": 0.23236805231200375, - "grad_norm": 0.7055816650390625, - "learning_rate": 4.6034017009555975e-05, - "loss": 0.7283, - "step": 1990 - }, - { - "epoch": 0.23295189163942084, - "grad_norm": 0.7454161047935486, - "learning_rate": 4.600934990161355e-05, - "loss": 0.7148, - "step": 1995 - }, - { - "epoch": 0.23353573096683794, - "grad_norm": 0.7571686506271362, - "learning_rate": 4.598461378309231e-05, - "loss": 0.7275, - "step": 2000 - }, - { - "epoch": 0.234119570294255, - "grad_norm": 0.7321173548698425, - "learning_rate": 4.5959808746218823e-05, - "loss": 0.7128, - "step": 2005 - }, - { - "epoch": 0.2347034096216721, - "grad_norm": 0.7889792323112488, - "learning_rate": 4.593493488347662e-05, - "loss": 0.7242, - "step": 2010 - }, - { - "epoch": 0.2352872489490892, - "grad_norm": 0.8360452651977539, - "learning_rate": 4.590999228760583e-05, - "loss": 0.7202, - "step": 2015 - }, - { - "epoch": 0.2358710882765063, - "grad_norm": 0.820009708404541, - "learning_rate": 4.5884981051602873e-05, - "loss": 0.7027, - "step": 2020 - }, - { - "epoch": 0.2364549276039234, - "grad_norm": 0.7243896722793579, - "learning_rate": 4.585990126872006e-05, - "loss": 0.7091, - "step": 2025 - }, - { - "epoch": 0.2370387669313405, - "grad_norm": 0.7246242761611938, - "learning_rate": 4.583475303246527e-05, - "loss": 0.7068, - "step": 2030 - }, - { - "epoch": 0.23762260625875758, - "grad_norm": 0.6940891146659851, - "learning_rate": 4.580953643660165e-05, - "loss": 0.7132, - "step": 2035 - }, - { - "epoch": 0.23820644558617468, - "grad_norm": 0.8294302821159363, - "learning_rate": 4.5784251575147176e-05, - "loss": 0.7301, - "step": 2040 - }, - { - "epoch": 0.23879028491359178, - "grad_norm": 0.7796537280082703, - "learning_rate": 4.5758898542374354e-05, - "loss": 0.7266, - "step": 2045 - }, - { - "epoch": 0.23937412424100887, - "grad_norm": 0.7590277194976807, - "learning_rate": 4.5733477432809884e-05, - "loss": 0.7033, - "step": 2050 - }, - { - "epoch": 0.23995796356842597, - "grad_norm": 0.7161651849746704, - "learning_rate": 4.570798834123425e-05, - "loss": 0.7093, - "step": 2055 - }, - { - "epoch": 0.24054180289584307, - "grad_norm": 0.7662396430969238, - "learning_rate": 4.5682431362681435e-05, - "loss": 0.7093, - "step": 2060 - }, - { - "epoch": 0.24112564222326016, - "grad_norm": 0.6958584189414978, - "learning_rate": 4.565680659243851e-05, - "loss": 0.6958, - "step": 2065 - }, - { - "epoch": 0.24170948155067726, - "grad_norm": 0.7178056836128235, - "learning_rate": 4.5631114126045315e-05, - "loss": 0.7157, - "step": 2070 - }, - { - "epoch": 0.24229332087809435, - "grad_norm": 0.6672092080116272, - "learning_rate": 4.560535405929408e-05, - "loss": 0.7045, - "step": 2075 - }, - { - "epoch": 0.24287716020551145, - "grad_norm": 0.7362955808639526, - "learning_rate": 4.557952648822908e-05, - "loss": 0.7177, - "step": 2080 - }, - { - "epoch": 0.24346099953292855, - "grad_norm": 0.7810000777244568, - "learning_rate": 4.555363150914628e-05, - "loss": 0.7299, - "step": 2085 - }, - { - "epoch": 0.24404483886034564, - "grad_norm": 0.7832990884780884, - "learning_rate": 4.552766921859297e-05, - "loss": 0.7061, - "step": 2090 - }, - { - "epoch": 0.24462867818776274, - "grad_norm": 1.5710090398788452, - "learning_rate": 4.5501639713367386e-05, - "loss": 0.6968, - "step": 2095 - }, - { - "epoch": 0.24521251751517983, - "grad_norm": 0.8379431962966919, - "learning_rate": 4.547554309051839e-05, - "loss": 0.7348, - "step": 2100 - }, - { - "epoch": 0.24579635684259693, - "grad_norm": 0.7984123229980469, - "learning_rate": 4.5449379447345084e-05, - "loss": 0.7201, - "step": 2105 - }, - { - "epoch": 0.246380196170014, - "grad_norm": 0.7767300605773926, - "learning_rate": 4.5423148881396444e-05, - "loss": 0.727, - "step": 2110 - }, - { - "epoch": 0.2469640354974311, - "grad_norm": 0.7863557934761047, - "learning_rate": 4.539685149047097e-05, - "loss": 0.7167, - "step": 2115 - }, - { - "epoch": 0.2475478748248482, - "grad_norm": 0.7811588048934937, - "learning_rate": 4.5370487372616285e-05, - "loss": 0.7078, - "step": 2120 - }, - { - "epoch": 0.2481317141522653, - "grad_norm": 0.757037341594696, - "learning_rate": 4.5344056626128847e-05, - "loss": 0.7263, - "step": 2125 - }, - { - "epoch": 0.24871555347968238, - "grad_norm": 0.8016570806503296, - "learning_rate": 4.53175593495535e-05, - "loss": 0.7212, - "step": 2130 - }, - { - "epoch": 0.24929939280709948, - "grad_norm": 0.8397887349128723, - "learning_rate": 4.529099564168312e-05, - "loss": 0.7294, - "step": 2135 - }, - { - "epoch": 0.24988323213451658, - "grad_norm": 0.8018162250518799, - "learning_rate": 4.526436560155833e-05, - "loss": 0.7142, - "step": 2140 - }, - { - "epoch": 0.2504670714619337, - "grad_norm": 0.7629238963127136, - "learning_rate": 4.5237669328467e-05, - "loss": 0.7026, - "step": 2145 - }, - { - "epoch": 0.25105091078935077, - "grad_norm": 0.8951400518417358, - "learning_rate": 4.5210906921944e-05, - "loss": 0.719, - "step": 2150 - }, - { - "epoch": 0.2516347501167679, - "grad_norm": 0.7392298579216003, - "learning_rate": 4.518407848177073e-05, - "loss": 0.7267, - "step": 2155 - }, - { - "epoch": 0.25221858944418496, - "grad_norm": 0.7416003942489624, - "learning_rate": 4.515718410797481e-05, - "loss": 0.7393, - "step": 2160 - }, - { - "epoch": 0.25280242877160203, - "grad_norm": 0.9291106462478638, - "learning_rate": 4.513022390082969e-05, - "loss": 0.7034, - "step": 2165 - }, - { - "epoch": 0.25338626809901915, - "grad_norm": 0.7439368367195129, - "learning_rate": 4.510319796085428e-05, - "loss": 0.7335, - "step": 2170 - }, - { - "epoch": 0.2539701074264362, - "grad_norm": 0.7393158674240112, - "learning_rate": 4.5076106388812534e-05, - "loss": 0.726, - "step": 2175 - }, - { - "epoch": 0.25455394675385334, - "grad_norm": 0.8040406107902527, - "learning_rate": 4.504894928571315e-05, - "loss": 0.7171, - "step": 2180 - }, - { - "epoch": 0.2551377860812704, - "grad_norm": 1.2893598079681396, - "learning_rate": 4.502172675280915e-05, - "loss": 0.7098, - "step": 2185 - }, - { - "epoch": 0.25572162540868754, - "grad_norm": 0.8029099106788635, - "learning_rate": 4.4994438891597486e-05, - "loss": 0.762, - "step": 2190 - }, - { - "epoch": 0.2563054647361046, - "grad_norm": 0.7262006402015686, - "learning_rate": 4.496708580381868e-05, - "loss": 0.7324, - "step": 2195 - }, - { - "epoch": 0.25688930406352173, - "grad_norm": 0.8342519402503967, - "learning_rate": 4.4939667591456465e-05, - "loss": 0.7108, - "step": 2200 - }, - { - "epoch": 0.2574731433909388, - "grad_norm": 0.7979913353919983, - "learning_rate": 4.491218435673737e-05, - "loss": 0.7304, - "step": 2205 - }, - { - "epoch": 0.2580569827183559, - "grad_norm": 0.7388005256652832, - "learning_rate": 4.4884636202130365e-05, - "loss": 0.7105, - "step": 2210 - }, - { - "epoch": 0.258640822045773, - "grad_norm": 0.8212599754333496, - "learning_rate": 4.485702323034647e-05, - "loss": 0.7195, - "step": 2215 - }, - { - "epoch": 0.2592246613731901, - "grad_norm": 0.7600920796394348, - "learning_rate": 4.4829345544338355e-05, - "loss": 0.7257, - "step": 2220 - }, - { - "epoch": 0.2598085007006072, - "grad_norm": 0.8607686161994934, - "learning_rate": 4.480160324729998e-05, - "loss": 0.7077, - "step": 2225 - }, - { - "epoch": 0.2603923400280243, - "grad_norm": 0.8033510446548462, - "learning_rate": 4.477379644266621e-05, - "loss": 0.712, - "step": 2230 - }, - { - "epoch": 0.2609761793554414, - "grad_norm": 0.7468022108078003, - "learning_rate": 4.47459252341124e-05, - "loss": 0.745, - "step": 2235 - }, - { - "epoch": 0.2615600186828585, - "grad_norm": 0.7821460962295532, - "learning_rate": 4.471798972555407e-05, - "loss": 0.7139, - "step": 2240 - }, - { - "epoch": 0.26214385801027557, - "grad_norm": 0.7872776985168457, - "learning_rate": 4.468999002114642e-05, - "loss": 0.7305, - "step": 2245 - }, - { - "epoch": 0.2627276973376927, - "grad_norm": 1.2564536333084106, - "learning_rate": 4.4661926225284057e-05, - "loss": 0.7144, - "step": 2250 - }, - { - "epoch": 0.26331153666510976, - "grad_norm": 0.7227553129196167, - "learning_rate": 4.463379844260051e-05, - "loss": 0.7245, - "step": 2255 - }, - { - "epoch": 0.2638953759925269, - "grad_norm": 0.7956371903419495, - "learning_rate": 4.460560677796788e-05, - "loss": 0.7052, - "step": 2260 - }, - { - "epoch": 0.26447921531994395, - "grad_norm": 0.809528112411499, - "learning_rate": 4.4577351336496466e-05, - "loss": 0.7189, - "step": 2265 - }, - { - "epoch": 0.2650630546473611, - "grad_norm": 0.8591311573982239, - "learning_rate": 4.454903222353433e-05, - "loss": 0.7027, - "step": 2270 - }, - { - "epoch": 0.26564689397477814, - "grad_norm": 0.8015133142471313, - "learning_rate": 4.4520649544666955e-05, - "loss": 0.7143, - "step": 2275 - }, - { - "epoch": 0.2662307333021952, - "grad_norm": 0.7384242415428162, - "learning_rate": 4.4492203405716804e-05, - "loss": 0.7081, - "step": 2280 - }, - { - "epoch": 0.26681457262961233, - "grad_norm": 0.7772250771522522, - "learning_rate": 4.4463693912742944e-05, - "loss": 0.7083, - "step": 2285 - }, - { - "epoch": 0.2673984119570294, - "grad_norm": 0.7199101448059082, - "learning_rate": 4.4435121172040674e-05, - "loss": 0.7085, - "step": 2290 - }, - { - "epoch": 0.2679822512844465, - "grad_norm": 0.7555168867111206, - "learning_rate": 4.4406485290141075e-05, - "loss": 0.7009, - "step": 2295 - }, - { - "epoch": 0.2685660906118636, - "grad_norm": 0.720447301864624, - "learning_rate": 4.437778637381068e-05, - "loss": 0.7021, - "step": 2300 - }, - { - "epoch": 0.2691499299392807, - "grad_norm": 0.835857629776001, - "learning_rate": 4.434902453005101e-05, - "loss": 0.7382, - "step": 2305 - }, - { - "epoch": 0.2697337692666978, - "grad_norm": 0.7551912069320679, - "learning_rate": 4.4320199866098216e-05, - "loss": 0.7081, - "step": 2310 - }, - { - "epoch": 0.2703176085941149, - "grad_norm": 0.7919111251831055, - "learning_rate": 4.4291312489422684e-05, - "loss": 0.7295, - "step": 2315 - }, - { - "epoch": 0.270901447921532, - "grad_norm": 0.7225852608680725, - "learning_rate": 4.426236250772859e-05, - "loss": 0.7091, - "step": 2320 - }, - { - "epoch": 0.2714852872489491, - "grad_norm": 0.7020233273506165, - "learning_rate": 4.423335002895358e-05, - "loss": 0.7051, - "step": 2325 - }, - { - "epoch": 0.27206912657636617, - "grad_norm": 0.7703261971473694, - "learning_rate": 4.420427516126822e-05, - "loss": 0.7331, - "step": 2330 - }, - { - "epoch": 0.2726529659037833, - "grad_norm": 0.7116292119026184, - "learning_rate": 4.4175138013075804e-05, - "loss": 0.7126, - "step": 2335 - }, - { - "epoch": 0.27323680523120036, - "grad_norm": 0.7921679019927979, - "learning_rate": 4.4145938693011747e-05, - "loss": 0.7124, - "step": 2340 - }, - { - "epoch": 0.2738206445586175, - "grad_norm": 0.944868803024292, - "learning_rate": 4.4116677309943295e-05, - "loss": 0.7173, - "step": 2345 - }, - { - "epoch": 0.27440448388603456, - "grad_norm": 0.7137965559959412, - "learning_rate": 4.40873539729691e-05, - "loss": 0.726, - "step": 2350 - }, - { - "epoch": 0.2749883232134517, - "grad_norm": 0.7402662634849548, - "learning_rate": 4.405796879141881e-05, - "loss": 0.7047, - "step": 2355 - }, - { - "epoch": 0.27557216254086875, - "grad_norm": 0.7571717500686646, - "learning_rate": 4.402852187485262e-05, - "loss": 0.7214, - "step": 2360 - }, - { - "epoch": 0.27615600186828587, - "grad_norm": 0.7862398028373718, - "learning_rate": 4.3999013333060936e-05, - "loss": 0.724, - "step": 2365 - }, - { - "epoch": 0.27673984119570294, - "grad_norm": 0.7746096253395081, - "learning_rate": 4.396944327606389e-05, - "loss": 0.7046, - "step": 2370 - }, - { - "epoch": 0.27732368052312006, - "grad_norm": 0.730959951877594, - "learning_rate": 4.393981181411102e-05, - "loss": 0.7252, - "step": 2375 - }, - { - "epoch": 0.27790751985053713, - "grad_norm": 0.7707230448722839, - "learning_rate": 4.3910119057680765e-05, - "loss": 0.6895, - "step": 2380 - }, - { - "epoch": 0.2784913591779542, - "grad_norm": 0.802091121673584, - "learning_rate": 4.3880365117480114e-05, - "loss": 0.7237, - "step": 2385 - }, - { - "epoch": 0.2790751985053713, - "grad_norm": 0.7869424819946289, - "learning_rate": 4.385055010444416e-05, - "loss": 0.7042, - "step": 2390 - }, - { - "epoch": 0.2796590378327884, - "grad_norm": 0.8035837411880493, - "learning_rate": 4.382067412973573e-05, - "loss": 0.7013, - "step": 2395 - }, - { - "epoch": 0.2802428771602055, - "grad_norm": 0.6943315863609314, - "learning_rate": 4.3790737304744906e-05, - "loss": 0.72, - "step": 2400 - }, - { - "epoch": 0.2808267164876226, - "grad_norm": 0.6930028796195984, - "learning_rate": 4.376073974108866e-05, - "loss": 0.7238, - "step": 2405 - }, - { - "epoch": 0.2814105558150397, - "grad_norm": 0.6829924583435059, - "learning_rate": 4.373068155061043e-05, - "loss": 0.7081, - "step": 2410 - }, - { - "epoch": 0.2819943951424568, - "grad_norm": 0.7173051238059998, - "learning_rate": 4.37005628453797e-05, - "loss": 0.7011, - "step": 2415 - }, - { - "epoch": 0.2825782344698739, - "grad_norm": 0.7440537810325623, - "learning_rate": 4.367038373769155e-05, - "loss": 0.7308, - "step": 2420 - }, - { - "epoch": 0.28316207379729097, - "grad_norm": 0.7433156371116638, - "learning_rate": 4.36401443400663e-05, - "loss": 0.7271, - "step": 2425 - }, - { - "epoch": 0.2837459131247081, - "grad_norm": 0.7589252591133118, - "learning_rate": 4.3609844765249034e-05, - "loss": 0.6955, - "step": 2430 - }, - { - "epoch": 0.28432975245212516, - "grad_norm": 0.7940694689750671, - "learning_rate": 4.357948512620922e-05, - "loss": 0.7056, - "step": 2435 - }, - { - "epoch": 0.2849135917795423, - "grad_norm": 0.841876208782196, - "learning_rate": 4.354906553614024e-05, - "loss": 0.7137, - "step": 2440 - }, - { - "epoch": 0.28549743110695935, - "grad_norm": 0.741405725479126, - "learning_rate": 4.3518586108459034e-05, - "loss": 0.6936, - "step": 2445 - }, - { - "epoch": 0.2860812704343765, - "grad_norm": 0.8182490468025208, - "learning_rate": 4.34880469568056e-05, - "loss": 0.6927, - "step": 2450 - }, - { - "epoch": 0.28666510976179355, - "grad_norm": 0.7856588959693909, - "learning_rate": 4.345744819504266e-05, - "loss": 0.7122, - "step": 2455 - }, - { - "epoch": 0.28724894908921067, - "grad_norm": 0.7049044370651245, - "learning_rate": 4.342678993725517e-05, - "loss": 0.7049, - "step": 2460 - }, - { - "epoch": 0.28783278841662774, - "grad_norm": 0.7574556469917297, - "learning_rate": 4.339607229774989e-05, - "loss": 0.7117, - "step": 2465 - }, - { - "epoch": 0.28841662774404486, - "grad_norm": 0.7204563021659851, - "learning_rate": 4.3365295391054996e-05, - "loss": 0.7057, - "step": 2470 - }, - { - "epoch": 0.28900046707146193, - "grad_norm": 0.7526668310165405, - "learning_rate": 4.333445933191964e-05, - "loss": 0.7185, - "step": 2475 - }, - { - "epoch": 0.28958430639887905, - "grad_norm": 0.7245593070983887, - "learning_rate": 4.330356423531352e-05, - "loss": 0.7125, - "step": 2480 - }, - { - "epoch": 0.2901681457262961, - "grad_norm": 0.7846575975418091, - "learning_rate": 4.327261021642644e-05, - "loss": 0.7203, - "step": 2485 - }, - { - "epoch": 0.2907519850537132, - "grad_norm": 0.7781563997268677, - "learning_rate": 4.32415973906679e-05, - "loss": 0.7086, - "step": 2490 - }, - { - "epoch": 0.2913358243811303, - "grad_norm": 0.761791467666626, - "learning_rate": 4.3210525873666656e-05, - "loss": 0.707, - "step": 2495 - }, - { - "epoch": 0.2919196637085474, - "grad_norm": 0.8985573053359985, - "learning_rate": 4.317939578127029e-05, - "loss": 0.7113, - "step": 2500 - }, - { - "epoch": 0.2925035030359645, - "grad_norm": 0.8003341555595398, - "learning_rate": 4.314820722954476e-05, - "loss": 0.7109, - "step": 2505 - }, - { - "epoch": 0.2930873423633816, - "grad_norm": 0.8127963542938232, - "learning_rate": 4.3116960334774e-05, - "loss": 0.7098, - "step": 2510 - }, - { - "epoch": 0.2936711816907987, - "grad_norm": 0.8357040286064148, - "learning_rate": 4.308565521345949e-05, - "loss": 0.7182, - "step": 2515 - }, - { - "epoch": 0.29425502101821577, - "grad_norm": 0.7621999979019165, - "learning_rate": 4.305429198231977e-05, - "loss": 0.7108, - "step": 2520 - }, - { - "epoch": 0.2948388603456329, - "grad_norm": 0.7130179405212402, - "learning_rate": 4.302287075829005e-05, - "loss": 0.7051, - "step": 2525 - }, - { - "epoch": 0.29542269967304996, - "grad_norm": 0.6383655071258545, - "learning_rate": 4.2991391658521765e-05, - "loss": 0.7006, - "step": 2530 - }, - { - "epoch": 0.2960065390004671, - "grad_norm": 0.7761276364326477, - "learning_rate": 4.2959854800382136e-05, - "loss": 0.7061, - "step": 2535 - }, - { - "epoch": 0.29659037832788415, - "grad_norm": 0.7239909768104553, - "learning_rate": 4.292826030145372e-05, - "loss": 0.712, - "step": 2540 - }, - { - "epoch": 0.2971742176553013, - "grad_norm": 0.835978627204895, - "learning_rate": 4.289660827953399e-05, - "loss": 0.6891, - "step": 2545 - }, - { - "epoch": 0.29775805698271834, - "grad_norm": 0.6836960315704346, - "learning_rate": 4.28648988526349e-05, - "loss": 0.7263, - "step": 2550 - }, - { - "epoch": 0.29834189631013547, - "grad_norm": 19.69857406616211, - "learning_rate": 4.2833132138982415e-05, - "loss": 0.7248, - "step": 2555 - }, - { - "epoch": 0.29892573563755254, - "grad_norm": 0.7145888805389404, - "learning_rate": 4.280130825701609e-05, - "loss": 0.7117, - "step": 2560 - }, - { - "epoch": 0.29950957496496966, - "grad_norm": 0.7149947881698608, - "learning_rate": 4.276942732538866e-05, - "loss": 0.7018, - "step": 2565 - }, - { - "epoch": 0.30009341429238673, - "grad_norm": 0.7568755149841309, - "learning_rate": 4.273748946296552e-05, - "loss": 0.7348, - "step": 2570 - }, - { - "epoch": 0.30067725361980385, - "grad_norm": 0.7530679106712341, - "learning_rate": 4.2705494788824345e-05, - "loss": 0.7133, - "step": 2575 - }, - { - "epoch": 0.3012610929472209, - "grad_norm": 0.7513715028762817, - "learning_rate": 4.267344342225463e-05, - "loss": 0.7238, - "step": 2580 - }, - { - "epoch": 0.30184493227463804, - "grad_norm": 0.7250661253929138, - "learning_rate": 4.264133548275725e-05, - "loss": 0.7018, - "step": 2585 - }, - { - "epoch": 0.3024287716020551, - "grad_norm": 0.7015860080718994, - "learning_rate": 4.2609171090044e-05, - "loss": 0.6913, - "step": 2590 - }, - { - "epoch": 0.30301261092947224, - "grad_norm": 0.7085592150688171, - "learning_rate": 4.257695036403714e-05, - "loss": 0.7042, - "step": 2595 - }, - { - "epoch": 0.3035964502568893, - "grad_norm": 0.6993227005004883, - "learning_rate": 4.2544673424868994e-05, - "loss": 0.7019, - "step": 2600 - }, - { - "epoch": 0.3041802895843064, - "grad_norm": 0.6946539878845215, - "learning_rate": 4.251234039288145e-05, - "loss": 0.7072, - "step": 2605 - }, - { - "epoch": 0.3047641289117235, - "grad_norm": 0.7894067764282227, - "learning_rate": 4.2479951388625546e-05, - "loss": 0.7294, - "step": 2610 - }, - { - "epoch": 0.30534796823914057, - "grad_norm": 0.7290725708007812, - "learning_rate": 4.2447506532861e-05, - "loss": 0.7256, - "step": 2615 - }, - { - "epoch": 0.3059318075665577, - "grad_norm": 0.7361443638801575, - "learning_rate": 4.241500594655577e-05, - "loss": 0.7051, - "step": 2620 - }, - { - "epoch": 0.30651564689397476, - "grad_norm": 0.7111384868621826, - "learning_rate": 4.2382449750885604e-05, - "loss": 0.686, - "step": 2625 - }, - { - "epoch": 0.3070994862213919, - "grad_norm": 0.7224936485290527, - "learning_rate": 4.2349838067233575e-05, - "loss": 0.6889, - "step": 2630 - }, - { - "epoch": 0.30768332554880895, - "grad_norm": 0.7539772987365723, - "learning_rate": 4.231717101718967e-05, - "loss": 0.6859, - "step": 2635 - }, - { - "epoch": 0.3082671648762261, - "grad_norm": 0.6927385926246643, - "learning_rate": 4.228444872255025e-05, - "loss": 0.6986, - "step": 2640 - }, - { - "epoch": 0.30885100420364314, - "grad_norm": 0.7830529808998108, - "learning_rate": 4.2251671305317696e-05, - "loss": 0.7097, - "step": 2645 - }, - { - "epoch": 0.30943484353106027, - "grad_norm": 0.737457811832428, - "learning_rate": 4.2218838887699894e-05, - "loss": 0.7151, - "step": 2650 - }, - { - "epoch": 0.31001868285847733, - "grad_norm": 0.8457800149917603, - "learning_rate": 4.2185951592109794e-05, - "loss": 0.6978, - "step": 2655 - }, - { - "epoch": 0.31060252218589446, - "grad_norm": 0.8648945689201355, - "learning_rate": 4.2153009541164965e-05, - "loss": 0.7151, - "step": 2660 - }, - { - "epoch": 0.3111863615133115, - "grad_norm": 0.8710271716117859, - "learning_rate": 4.21200128576871e-05, - "loss": 0.7255, - "step": 2665 - }, - { - "epoch": 0.31177020084072865, - "grad_norm": 0.7179669737815857, - "learning_rate": 4.208696166470161e-05, - "loss": 0.7184, - "step": 2670 - }, - { - "epoch": 0.3123540401681457, - "grad_norm": 0.7767862677574158, - "learning_rate": 4.2053856085437124e-05, - "loss": 0.7112, - "step": 2675 - }, - { - "epoch": 0.31293787949556284, - "grad_norm": 0.7961644530296326, - "learning_rate": 4.202069624332507e-05, - "loss": 0.7079, - "step": 2680 - }, - { - "epoch": 0.3135217188229799, - "grad_norm": 0.8015754222869873, - "learning_rate": 4.1987482261999164e-05, - "loss": 0.7237, - "step": 2685 - }, - { - "epoch": 0.31410555815039704, - "grad_norm": 0.6942439675331116, - "learning_rate": 4.1954214265294985e-05, - "loss": 0.6859, - "step": 2690 - }, - { - "epoch": 0.3146893974778141, - "grad_norm": 0.6358218789100647, - "learning_rate": 4.192089237724951e-05, - "loss": 0.6914, - "step": 2695 - }, - { - "epoch": 0.3152732368052312, - "grad_norm": 0.715323805809021, - "learning_rate": 4.188751672210063e-05, - "loss": 0.7103, - "step": 2700 - }, - { - "epoch": 0.3158570761326483, - "grad_norm": 0.7808899283409119, - "learning_rate": 4.1854087424286725e-05, - "loss": 0.6867, - "step": 2705 - }, - { - "epoch": 0.31644091546006536, - "grad_norm": 0.8459696173667908, - "learning_rate": 4.182060460844615e-05, - "loss": 0.6968, - "step": 2710 - }, - { - "epoch": 0.3170247547874825, - "grad_norm": 0.7288032174110413, - "learning_rate": 4.1787068399416825e-05, - "loss": 0.6995, - "step": 2715 - }, - { - "epoch": 0.31760859411489956, - "grad_norm": 0.7239239811897278, - "learning_rate": 4.175347892223572e-05, - "loss": 0.6783, - "step": 2720 - }, - { - "epoch": 0.3181924334423167, - "grad_norm": 0.6906855702400208, - "learning_rate": 4.1719836302138426e-05, - "loss": 0.698, - "step": 2725 - }, - { - "epoch": 0.31877627276973375, - "grad_norm": 0.8103212714195251, - "learning_rate": 4.168614066455867e-05, - "loss": 0.7, - "step": 2730 - }, - { - "epoch": 0.31936011209715087, - "grad_norm": 0.7674182057380676, - "learning_rate": 4.165239213512784e-05, - "loss": 0.6935, - "step": 2735 - }, - { - "epoch": 0.31994395142456794, - "grad_norm": 0.7329823970794678, - "learning_rate": 4.161859083967454e-05, - "loss": 0.7007, - "step": 2740 - }, - { - "epoch": 0.32052779075198506, - "grad_norm": 0.742588460445404, - "learning_rate": 4.158473690422409e-05, - "loss": 0.6914, - "step": 2745 - }, - { - "epoch": 0.32111163007940213, - "grad_norm": 0.7220062017440796, - "learning_rate": 4.15508304549981e-05, - "loss": 0.6748, - "step": 2750 - }, - { - "epoch": 0.32169546940681926, - "grad_norm": 0.7789446115493774, - "learning_rate": 4.1516871618413947e-05, - "loss": 0.7062, - "step": 2755 - }, - { - "epoch": 0.3222793087342363, - "grad_norm": 0.7176442742347717, - "learning_rate": 4.148286052108436e-05, - "loss": 0.7079, - "step": 2760 - }, - { - "epoch": 0.32286314806165345, - "grad_norm": 0.6666289567947388, - "learning_rate": 4.144879728981688e-05, - "loss": 0.7035, - "step": 2765 - }, - { - "epoch": 0.3234469873890705, - "grad_norm": 0.7119992971420288, - "learning_rate": 4.141468205161345e-05, - "loss": 0.6979, - "step": 2770 - }, - { - "epoch": 0.32403082671648764, - "grad_norm": 0.693290650844574, - "learning_rate": 4.1380514933669916e-05, - "loss": 0.6764, - "step": 2775 - }, - { - "epoch": 0.3246146660439047, - "grad_norm": 0.7222050428390503, - "learning_rate": 4.134629606337555e-05, - "loss": 0.6906, - "step": 2780 - }, - { - "epoch": 0.32519850537132183, - "grad_norm": 0.8008699417114258, - "learning_rate": 4.131202556831257e-05, - "loss": 0.7056, - "step": 2785 - }, - { - "epoch": 0.3257823446987389, - "grad_norm": 0.7525890469551086, - "learning_rate": 4.1277703576255685e-05, - "loss": 0.6902, - "step": 2790 - }, - { - "epoch": 0.326366184026156, - "grad_norm": 0.779255211353302, - "learning_rate": 4.12433302151716e-05, - "loss": 0.6932, - "step": 2795 - }, - { - "epoch": 0.3269500233535731, - "grad_norm": 0.6466972827911377, - "learning_rate": 4.1208905613218547e-05, - "loss": 0.6997, - "step": 2800 - }, - { - "epoch": 0.3275338626809902, - "grad_norm": 0.674862802028656, - "learning_rate": 4.1174429898745795e-05, - "loss": 0.6838, - "step": 2805 - }, - { - "epoch": 0.3281177020084073, - "grad_norm": 0.7087062001228333, - "learning_rate": 4.113990320029321e-05, - "loss": 0.6792, - "step": 2810 - }, - { - "epoch": 0.32870154133582435, - "grad_norm": 0.7037532925605774, - "learning_rate": 4.1105325646590714e-05, - "loss": 0.7129, - "step": 2815 - }, - { - "epoch": 0.3292853806632415, - "grad_norm": 0.6811494827270508, - "learning_rate": 4.1070697366557856e-05, - "loss": 0.694, - "step": 2820 - }, - { - "epoch": 0.32986921999065855, - "grad_norm": 0.7792772054672241, - "learning_rate": 4.103601848930332e-05, - "loss": 0.6862, - "step": 2825 - }, - { - "epoch": 0.33045305931807567, - "grad_norm": 0.8155742287635803, - "learning_rate": 4.100128914412442e-05, - "loss": 0.731, - "step": 2830 - }, - { - "epoch": 0.33103689864549274, - "grad_norm": 0.7951058745384216, - "learning_rate": 4.096650946050664e-05, - "loss": 0.6874, - "step": 2835 - }, - { - "epoch": 0.33162073797290986, - "grad_norm": 0.8006696105003357, - "learning_rate": 4.0931679568123174e-05, - "loss": 0.7132, - "step": 2840 - }, - { - "epoch": 0.33220457730032693, - "grad_norm": 0.7323281764984131, - "learning_rate": 4.0896799596834365e-05, - "loss": 0.6848, - "step": 2845 - }, - { - "epoch": 0.33278841662774405, - "grad_norm": 0.8392420411109924, - "learning_rate": 4.086186967668731e-05, - "loss": 0.7149, - "step": 2850 - }, - { - "epoch": 0.3333722559551611, - "grad_norm": 0.7774245738983154, - "learning_rate": 4.082688993791533e-05, - "loss": 0.6726, - "step": 2855 - }, - { - "epoch": 0.33395609528257825, - "grad_norm": 1.015106201171875, - "learning_rate": 4.079186051093747e-05, - "loss": 0.6943, - "step": 2860 - }, - { - "epoch": 0.3345399346099953, - "grad_norm": 0.7947196960449219, - "learning_rate": 4.075678152635807e-05, - "loss": 0.7136, - "step": 2865 - }, - { - "epoch": 0.33512377393741244, - "grad_norm": 0.734081506729126, - "learning_rate": 4.07216531149662e-05, - "loss": 0.7181, - "step": 2870 - }, - { - "epoch": 0.3357076132648295, - "grad_norm": 0.7285468578338623, - "learning_rate": 4.068647540773524e-05, - "loss": 0.7063, - "step": 2875 - }, - { - "epoch": 0.33629145259224663, - "grad_norm": 0.7461393475532532, - "learning_rate": 4.065124853582237e-05, - "loss": 0.7113, - "step": 2880 - }, - { - "epoch": 0.3368752919196637, - "grad_norm": 0.6990797519683838, - "learning_rate": 4.0615972630568055e-05, - "loss": 0.703, - "step": 2885 - }, - { - "epoch": 0.3374591312470808, - "grad_norm": 0.7384578585624695, - "learning_rate": 4.0580647823495587e-05, - "loss": 0.7041, - "step": 2890 - }, - { - "epoch": 0.3380429705744979, - "grad_norm": 0.701184868812561, - "learning_rate": 4.054527424631059e-05, - "loss": 0.7312, - "step": 2895 - }, - { - "epoch": 0.338626809901915, - "grad_norm": 0.7753100991249084, - "learning_rate": 4.0509852030900506e-05, - "loss": 0.6764, - "step": 2900 - }, - { - "epoch": 0.3392106492293321, - "grad_norm": 0.7301655411720276, - "learning_rate": 4.047438130933415e-05, - "loss": 0.7025, - "step": 2905 - }, - { - "epoch": 0.3397944885567492, - "grad_norm": 0.7775252461433411, - "learning_rate": 4.043886221386117e-05, - "loss": 0.6899, - "step": 2910 - }, - { - "epoch": 0.3403783278841663, - "grad_norm": 0.6789132952690125, - "learning_rate": 4.040329487691155e-05, - "loss": 0.6979, - "step": 2915 - }, - { - "epoch": 0.34096216721158334, - "grad_norm": 0.9006447792053223, - "learning_rate": 4.036767943109519e-05, - "loss": 0.705, - "step": 2920 - }, - { - "epoch": 0.34154600653900047, - "grad_norm": 0.8180142641067505, - "learning_rate": 4.0332016009201315e-05, - "loss": 0.7042, - "step": 2925 - }, - { - "epoch": 0.34212984586641754, - "grad_norm": 0.6808193922042847, - "learning_rate": 4.0296304744198045e-05, - "loss": 0.6953, - "step": 2930 - }, - { - "epoch": 0.34271368519383466, - "grad_norm": 0.6977395415306091, - "learning_rate": 4.0260545769231875e-05, - "loss": 0.6909, - "step": 2935 - }, - { - "epoch": 0.34329752452125173, - "grad_norm": 0.7804333567619324, - "learning_rate": 4.022473921762719e-05, - "loss": 0.7072, - "step": 2940 - }, - { - "epoch": 0.34388136384866885, - "grad_norm": 0.7728607058525085, - "learning_rate": 4.018888522288574e-05, - "loss": 0.7126, - "step": 2945 - }, - { - "epoch": 0.3444652031760859, - "grad_norm": 0.7181497812271118, - "learning_rate": 4.0152983918686175e-05, - "loss": 0.6942, - "step": 2950 - }, - { - "epoch": 0.34504904250350305, - "grad_norm": 0.7440214157104492, - "learning_rate": 4.0117035438883553e-05, - "loss": 0.6815, - "step": 2955 - }, - { - "epoch": 0.3456328818309201, - "grad_norm": 0.8292972445487976, - "learning_rate": 4.008103991750879e-05, - "loss": 0.7154, - "step": 2960 - }, - { - "epoch": 0.34621672115833724, - "grad_norm": 0.6813479065895081, - "learning_rate": 4.004499748876821e-05, - "loss": 0.6943, - "step": 2965 - }, - { - "epoch": 0.3468005604857543, - "grad_norm": 0.6901454925537109, - "learning_rate": 4.000890828704304e-05, - "loss": 0.6811, - "step": 2970 - }, - { - "epoch": 0.34738439981317143, - "grad_norm": 0.7159035205841064, - "learning_rate": 3.997277244688886e-05, - "loss": 0.7083, - "step": 2975 - }, - { - "epoch": 0.3479682391405885, - "grad_norm": 0.7618858218193054, - "learning_rate": 3.993659010303517e-05, - "loss": 0.7075, - "step": 2980 - }, - { - "epoch": 0.3485520784680056, - "grad_norm": 0.7113164067268372, - "learning_rate": 3.9900361390384836e-05, - "loss": 0.6718, - "step": 2985 - }, - { - "epoch": 0.3491359177954227, - "grad_norm": 0.6889391541481018, - "learning_rate": 3.986408644401362e-05, - "loss": 0.7083, - "step": 2990 - }, - { - "epoch": 0.3497197571228398, - "grad_norm": 0.7199043035507202, - "learning_rate": 3.982776539916966e-05, - "loss": 0.6945, - "step": 2995 - }, - { - "epoch": 0.3503035964502569, - "grad_norm": 0.7150009274482727, - "learning_rate": 3.979139839127296e-05, - "loss": 0.6793, - "step": 3000 - }, - { - "epoch": 0.350887435777674, - "grad_norm": 0.7808105945587158, - "learning_rate": 3.975498555591489e-05, - "loss": 0.6947, - "step": 3005 - }, - { - "epoch": 0.3514712751050911, - "grad_norm": 0.6760778427124023, - "learning_rate": 3.971852702885772e-05, - "loss": 0.6866, - "step": 3010 - }, - { - "epoch": 0.3520551144325082, - "grad_norm": 0.6719233989715576, - "learning_rate": 3.9682022946034006e-05, - "loss": 0.6913, - "step": 3015 - }, - { - "epoch": 0.35263895375992527, - "grad_norm": 0.7376195788383484, - "learning_rate": 3.964547344354624e-05, - "loss": 0.6955, - "step": 3020 - }, - { - "epoch": 0.3532227930873424, - "grad_norm": 0.6892589926719666, - "learning_rate": 3.9608878657666195e-05, - "loss": 0.7007, - "step": 3025 - }, - { - "epoch": 0.35380663241475946, - "grad_norm": 0.7587161064147949, - "learning_rate": 3.9572238724834503e-05, - "loss": 0.701, - "step": 3030 - }, - { - "epoch": 0.3543904717421765, - "grad_norm": 0.7055370807647705, - "learning_rate": 3.953555378166012e-05, - "loss": 0.7019, - "step": 3035 - }, - { - "epoch": 0.35497431106959365, - "grad_norm": 0.8153855204582214, - "learning_rate": 3.9498823964919827e-05, - "loss": 0.7041, - "step": 3040 - }, - { - "epoch": 0.3555581503970107, - "grad_norm": 0.7207863330841064, - "learning_rate": 3.94620494115577e-05, - "loss": 0.6884, - "step": 3045 - }, - { - "epoch": 0.35614198972442784, - "grad_norm": 0.6450507640838623, - "learning_rate": 3.942523025868461e-05, - "loss": 0.6918, - "step": 3050 - }, - { - "epoch": 0.3567258290518449, - "grad_norm": 0.7938247919082642, - "learning_rate": 3.9388366643577745e-05, - "loss": 0.6897, - "step": 3055 - }, - { - "epoch": 0.35730966837926204, - "grad_norm": 0.7001179456710815, - "learning_rate": 3.9351458703680017e-05, - "loss": 0.7098, - "step": 3060 - }, - { - "epoch": 0.3578935077066791, - "grad_norm": 0.7264606356620789, - "learning_rate": 3.931450657659963e-05, - "loss": 0.6964, - "step": 3065 - }, - { - "epoch": 0.3584773470340962, - "grad_norm": 0.7517201900482178, - "learning_rate": 3.927751040010954e-05, - "loss": 0.6755, - "step": 3070 - }, - { - "epoch": 0.3590611863615133, - "grad_norm": 0.838545024394989, - "learning_rate": 3.924047031214691e-05, - "loss": 0.7084, - "step": 3075 - }, - { - "epoch": 0.3596450256889304, - "grad_norm": 0.7725359797477722, - "learning_rate": 3.920338645081266e-05, - "loss": 0.6949, - "step": 3080 - }, - { - "epoch": 0.3602288650163475, - "grad_norm": 0.807808518409729, - "learning_rate": 3.916625895437089e-05, - "loss": 0.6691, - "step": 3085 - }, - { - "epoch": 0.3608127043437646, - "grad_norm": 0.8445301651954651, - "learning_rate": 3.912908796124839e-05, - "loss": 0.6963, - "step": 3090 - }, - { - "epoch": 0.3613965436711817, - "grad_norm": 0.7631264328956604, - "learning_rate": 3.909187361003414e-05, - "loss": 0.6855, - "step": 3095 - }, - { - "epoch": 0.3619803829985988, - "grad_norm": 0.7789484262466431, - "learning_rate": 3.905461603947878e-05, - "loss": 0.6992, - "step": 3100 - }, - { - "epoch": 0.3625642223260159, - "grad_norm": 0.7738934755325317, - "learning_rate": 3.9017315388494044e-05, - "loss": 0.6858, - "step": 3105 - }, - { - "epoch": 0.363148061653433, - "grad_norm": 0.844486653804779, - "learning_rate": 3.8979971796152346e-05, - "loss": 0.6922, - "step": 3110 - }, - { - "epoch": 0.36373190098085006, - "grad_norm": 0.640557050704956, - "learning_rate": 3.894258540168618e-05, - "loss": 0.6734, - "step": 3115 - }, - { - "epoch": 0.3643157403082672, - "grad_norm": 0.689569890499115, - "learning_rate": 3.89051563444876e-05, - "loss": 0.6791, - "step": 3120 - }, - { - "epoch": 0.36489957963568426, - "grad_norm": 0.6867648363113403, - "learning_rate": 3.886768476410777e-05, - "loss": 0.6837, - "step": 3125 - }, - { - "epoch": 0.3654834189631014, - "grad_norm": 0.8786919713020325, - "learning_rate": 3.883017080025638e-05, - "loss": 0.6845, - "step": 3130 - }, - { - "epoch": 0.36606725829051845, - "grad_norm": 0.8145090937614441, - "learning_rate": 3.879261459280111e-05, - "loss": 0.6875, - "step": 3135 - }, - { - "epoch": 0.3666510976179355, - "grad_norm": 0.7394934296607971, - "learning_rate": 3.875501628176719e-05, - "loss": 0.6938, - "step": 3140 - }, - { - "epoch": 0.36723493694535264, - "grad_norm": 0.729565441608429, - "learning_rate": 3.8717376007336814e-05, - "loss": 0.6885, - "step": 3145 - }, - { - "epoch": 0.3678187762727697, - "grad_norm": 0.7538211345672607, - "learning_rate": 3.867969390984862e-05, - "loss": 0.7139, - "step": 3150 - }, - { - "epoch": 0.36840261560018683, - "grad_norm": 0.7061357498168945, - "learning_rate": 3.864197012979719e-05, - "loss": 0.7162, - "step": 3155 - }, - { - "epoch": 0.3689864549276039, - "grad_norm": 0.7541712522506714, - "learning_rate": 3.8604204807832516e-05, - "loss": 0.7272, - "step": 3160 - }, - { - "epoch": 0.369570294255021, - "grad_norm": 0.7181931138038635, - "learning_rate": 3.856639808475947e-05, - "loss": 0.6799, - "step": 3165 - }, - { - "epoch": 0.3701541335824381, - "grad_norm": 0.7101660966873169, - "learning_rate": 3.85285501015373e-05, - "loss": 0.6895, - "step": 3170 - }, - { - "epoch": 0.3707379729098552, - "grad_norm": 0.6776376962661743, - "learning_rate": 3.8490660999279074e-05, - "loss": 0.6869, - "step": 3175 - }, - { - "epoch": 0.3713218122372723, - "grad_norm": 0.6587716341018677, - "learning_rate": 3.8452730919251174e-05, - "loss": 0.6975, - "step": 3180 - }, - { - "epoch": 0.3719056515646894, - "grad_norm": 0.6547430753707886, - "learning_rate": 3.841476000287275e-05, - "loss": 0.7066, - "step": 3185 - }, - { - "epoch": 0.3724894908921065, - "grad_norm": 1.0472015142440796, - "learning_rate": 3.837674839171524e-05, - "loss": 0.6874, - "step": 3190 - }, - { - "epoch": 0.3730733302195236, - "grad_norm": 0.6956185698509216, - "learning_rate": 3.833869622750177e-05, - "loss": 0.7078, - "step": 3195 - }, - { - "epoch": 0.37365716954694067, - "grad_norm": 0.8840404748916626, - "learning_rate": 3.8300603652106704e-05, - "loss": 0.6962, - "step": 3200 - }, - { - "epoch": 0.3742410088743578, - "grad_norm": 0.8562681078910828, - "learning_rate": 3.8262470807555045e-05, - "loss": 0.6812, - "step": 3205 - }, - { - "epoch": 0.37482484820177486, - "grad_norm": 0.7304535508155823, - "learning_rate": 3.822429783602195e-05, - "loss": 0.7003, - "step": 3210 - }, - { - "epoch": 0.375408687529192, - "grad_norm": 0.7535802721977234, - "learning_rate": 3.818608487983218e-05, - "loss": 0.6776, - "step": 3215 - }, - { - "epoch": 0.37599252685660906, - "grad_norm": 0.6769854426383972, - "learning_rate": 3.8147832081459574e-05, - "loss": 0.6867, - "step": 3220 - }, - { - "epoch": 0.3765763661840262, - "grad_norm": 0.7537841796875, - "learning_rate": 3.810953958352653e-05, - "loss": 0.7111, - "step": 3225 - }, - { - "epoch": 0.37716020551144325, - "grad_norm": 0.6647617220878601, - "learning_rate": 3.807120752880346e-05, - "loss": 0.6892, - "step": 3230 - }, - { - "epoch": 0.37774404483886037, - "grad_norm": 0.6656181216239929, - "learning_rate": 3.8032836060208265e-05, - "loss": 0.6775, - "step": 3235 - }, - { - "epoch": 0.37832788416627744, - "grad_norm": 0.6990296244621277, - "learning_rate": 3.799442532080577e-05, - "loss": 0.6798, - "step": 3240 - }, - { - "epoch": 0.3789117234936945, - "grad_norm": 0.7836006879806519, - "learning_rate": 3.795597545380724e-05, - "loss": 0.6927, - "step": 3245 - }, - { - "epoch": 0.37949556282111163, - "grad_norm": 0.794144332408905, - "learning_rate": 3.791748660256983e-05, - "loss": 0.6903, - "step": 3250 - }, - { - "epoch": 0.3800794021485287, - "grad_norm": 0.7122796177864075, - "learning_rate": 3.787895891059603e-05, - "loss": 0.6735, - "step": 3255 - }, - { - "epoch": 0.3806632414759458, - "grad_norm": 0.7318227887153625, - "learning_rate": 3.784039252153315e-05, - "loss": 0.6839, - "step": 3260 - }, - { - "epoch": 0.3812470808033629, - "grad_norm": 0.8180214166641235, - "learning_rate": 3.780178757917278e-05, - "loss": 0.705, - "step": 3265 - }, - { - "epoch": 0.38183092013078, - "grad_norm": 0.6776541471481323, - "learning_rate": 3.776314422745026e-05, - "loss": 0.6756, - "step": 3270 - }, - { - "epoch": 0.3824147594581971, - "grad_norm": 0.7650374174118042, - "learning_rate": 3.772446261044411e-05, - "loss": 0.6858, - "step": 3275 - }, - { - "epoch": 0.3829985987856142, - "grad_norm": 0.7002133131027222, - "learning_rate": 3.768574287237555e-05, - "loss": 0.6833, - "step": 3280 - }, - { - "epoch": 0.3835824381130313, - "grad_norm": 0.7493712902069092, - "learning_rate": 3.7646985157607915e-05, - "loss": 0.6904, - "step": 3285 - }, - { - "epoch": 0.3841662774404484, - "grad_norm": 0.7969913482666016, - "learning_rate": 3.760818961064614e-05, - "loss": 0.6839, - "step": 3290 - }, - { - "epoch": 0.38475011676786547, - "grad_norm": 0.8761760592460632, - "learning_rate": 3.75693563761362e-05, - "loss": 0.69, - "step": 3295 - }, - { - "epoch": 0.3853339560952826, - "grad_norm": 0.7125682830810547, - "learning_rate": 3.75304855988646e-05, - "loss": 0.7017, - "step": 3300 - }, - { - "epoch": 0.38591779542269966, - "grad_norm": 0.8172748684883118, - "learning_rate": 3.749157742375782e-05, - "loss": 0.6979, - "step": 3305 - }, - { - "epoch": 0.3865016347501168, - "grad_norm": 0.7777659893035889, - "learning_rate": 3.745263199588176e-05, - "loss": 0.6945, - "step": 3310 - }, - { - "epoch": 0.38708547407753385, - "grad_norm": 0.7778120040893555, - "learning_rate": 3.741364946044123e-05, - "loss": 0.6895, - "step": 3315 - }, - { - "epoch": 0.387669313404951, - "grad_norm": 0.6724658608436584, - "learning_rate": 3.737462996277939e-05, - "loss": 0.6868, - "step": 3320 - }, - { - "epoch": 0.38825315273236805, - "grad_norm": 0.7206020355224609, - "learning_rate": 3.73355736483772e-05, - "loss": 0.7244, - "step": 3325 - }, - { - "epoch": 0.38883699205978517, - "grad_norm": 0.7758327126502991, - "learning_rate": 3.72964806628529e-05, - "loss": 0.6802, - "step": 3330 - }, - { - "epoch": 0.38942083138720224, - "grad_norm": 0.7452991604804993, - "learning_rate": 3.725735115196145e-05, - "loss": 0.7086, - "step": 3335 - }, - { - "epoch": 0.39000467071461936, - "grad_norm": 0.7305519580841064, - "learning_rate": 3.7218185261593984e-05, - "loss": 0.6947, - "step": 3340 - }, - { - "epoch": 0.39058851004203643, - "grad_norm": 0.7128348350524902, - "learning_rate": 3.717898313777729e-05, - "loss": 0.6981, - "step": 3345 - }, - { - "epoch": 0.39117234936945355, - "grad_norm": 0.7285656332969666, - "learning_rate": 3.713974492667324e-05, - "loss": 0.6965, - "step": 3350 - }, - { - "epoch": 0.3917561886968706, - "grad_norm": 0.6574671864509583, - "learning_rate": 3.710047077457826e-05, - "loss": 0.6899, - "step": 3355 - }, - { - "epoch": 0.3923400280242877, - "grad_norm": 0.6916378140449524, - "learning_rate": 3.706116082792276e-05, - "loss": 0.682, - "step": 3360 - }, - { - "epoch": 0.3929238673517048, - "grad_norm": 0.6742314100265503, - "learning_rate": 3.702181523327064e-05, - "loss": 0.7102, - "step": 3365 - }, - { - "epoch": 0.3935077066791219, - "grad_norm": 0.7167133688926697, - "learning_rate": 3.698243413731867e-05, - "loss": 0.687, - "step": 3370 - }, - { - "epoch": 0.394091546006539, - "grad_norm": 0.8009039163589478, - "learning_rate": 3.694301768689603e-05, - "loss": 0.6918, - "step": 3375 - }, - { - "epoch": 0.3946753853339561, - "grad_norm": 0.7741237878799438, - "learning_rate": 3.690356602896368e-05, - "loss": 0.6988, - "step": 3380 - }, - { - "epoch": 0.3952592246613732, - "grad_norm": 0.8442285060882568, - "learning_rate": 3.686407931061386e-05, - "loss": 0.6772, - "step": 3385 - }, - { - "epoch": 0.39584306398879027, - "grad_norm": 0.9047139883041382, - "learning_rate": 3.682455767906954e-05, - "loss": 0.6936, - "step": 3390 - }, - { - "epoch": 0.3964269033162074, - "grad_norm": 0.7213223576545715, - "learning_rate": 3.678500128168384e-05, - "loss": 0.6752, - "step": 3395 - }, - { - "epoch": 0.39701074264362446, - "grad_norm": 0.8330225348472595, - "learning_rate": 3.674541026593952e-05, - "loss": 0.6881, - "step": 3400 - }, - { - "epoch": 0.3975945819710416, - "grad_norm": 0.7519540190696716, - "learning_rate": 3.6705784779448405e-05, - "loss": 0.6791, - "step": 3405 - }, - { - "epoch": 0.39817842129845865, - "grad_norm": 0.6645708680152893, - "learning_rate": 3.6666124969950835e-05, - "loss": 0.6838, - "step": 3410 - }, - { - "epoch": 0.3987622606258758, - "grad_norm": 0.7131589651107788, - "learning_rate": 3.662643098531513e-05, - "loss": 0.6956, - "step": 3415 - }, - { - "epoch": 0.39934609995329284, - "grad_norm": 0.8310096263885498, - "learning_rate": 3.6586702973537025e-05, - "loss": 0.6927, - "step": 3420 - }, - { - "epoch": 0.39992993928070997, - "grad_norm": 0.6871352195739746, - "learning_rate": 3.654694108273912e-05, - "loss": 0.6705, - "step": 3425 - }, - { - "epoch": 0.40051377860812704, - "grad_norm": 0.6813589930534363, - "learning_rate": 3.6507145461170345e-05, - "loss": 0.6661, - "step": 3430 - }, - { - "epoch": 0.40109761793554416, - "grad_norm": 0.7847844362258911, - "learning_rate": 3.646731625720537e-05, - "loss": 0.6716, - "step": 3435 - }, - { - "epoch": 0.40168145726296123, - "grad_norm": 0.7956090569496155, - "learning_rate": 3.642745361934408e-05, - "loss": 0.6962, - "step": 3440 - }, - { - "epoch": 0.40226529659037835, - "grad_norm": 0.7995411157608032, - "learning_rate": 3.638755769621104e-05, - "loss": 0.6945, - "step": 3445 - }, - { - "epoch": 0.4028491359177954, - "grad_norm": 0.7152298092842102, - "learning_rate": 3.634762863655487e-05, - "loss": 0.683, - "step": 3450 - }, - { - "epoch": 0.40343297524521254, - "grad_norm": 0.8431740403175354, - "learning_rate": 3.630766658924779e-05, - "loss": 0.6989, - "step": 3455 - }, - { - "epoch": 0.4040168145726296, - "grad_norm": 0.724612832069397, - "learning_rate": 3.6267671703284963e-05, - "loss": 0.6919, - "step": 3460 - }, - { - "epoch": 0.4046006539000467, - "grad_norm": 0.9266369938850403, - "learning_rate": 3.6227644127784026e-05, - "loss": 0.6998, - "step": 3465 - }, - { - "epoch": 0.4051844932274638, - "grad_norm": 0.7902385592460632, - "learning_rate": 3.618758401198447e-05, - "loss": 0.6836, - "step": 3470 - }, - { - "epoch": 0.4057683325548809, - "grad_norm": 0.9329309463500977, - "learning_rate": 3.6147491505247124e-05, - "loss": 0.6915, - "step": 3475 - }, - { - "epoch": 0.406352171882298, - "grad_norm": 0.7015485763549805, - "learning_rate": 3.6107366757053586e-05, - "loss": 0.6926, - "step": 3480 - }, - { - "epoch": 0.40693601120971507, - "grad_norm": 0.77342689037323, - "learning_rate": 3.606720991700565e-05, - "loss": 0.6983, - "step": 3485 - }, - { - "epoch": 0.4075198505371322, - "grad_norm": 0.7198393940925598, - "learning_rate": 3.602702113482477e-05, - "loss": 0.6936, - "step": 3490 - }, - { - "epoch": 0.40810368986454926, - "grad_norm": 0.7422571778297424, - "learning_rate": 3.59868005603515e-05, - "loss": 0.6996, - "step": 3495 - }, - { - "epoch": 0.4086875291919664, - "grad_norm": 0.7104097604751587, - "learning_rate": 3.5946548343544925e-05, - "loss": 0.6773, - "step": 3500 - }, - { - "epoch": 0.40927136851938345, - "grad_norm": 0.7399225831031799, - "learning_rate": 3.5906264634482084e-05, - "loss": 0.6771, - "step": 3505 - }, - { - "epoch": 0.4098552078468006, - "grad_norm": 0.7281729578971863, - "learning_rate": 3.586594958335747e-05, - "loss": 0.681, - "step": 3510 - }, - { - "epoch": 0.41043904717421764, - "grad_norm": 0.7177674770355225, - "learning_rate": 3.582560334048241e-05, - "loss": 0.6761, - "step": 3515 - }, - { - "epoch": 0.41102288650163477, - "grad_norm": 0.7471392154693604, - "learning_rate": 3.578522605628453e-05, - "loss": 0.6663, - "step": 3520 - }, - { - "epoch": 0.41160672582905183, - "grad_norm": 0.6800620555877686, - "learning_rate": 3.5744817881307184e-05, - "loss": 0.6811, - "step": 3525 - }, - { - "epoch": 0.41219056515646896, - "grad_norm": 0.7615258097648621, - "learning_rate": 3.570437896620891e-05, - "loss": 0.6839, - "step": 3530 - }, - { - "epoch": 0.412774404483886, - "grad_norm": 0.7592416405677795, - "learning_rate": 3.566390946176286e-05, - "loss": 0.6974, - "step": 3535 - }, - { - "epoch": 0.41335824381130315, - "grad_norm": 0.8110158443450928, - "learning_rate": 3.5623409518856225e-05, - "loss": 0.7031, - "step": 3540 - }, - { - "epoch": 0.4139420831387202, - "grad_norm": 0.6509615778923035, - "learning_rate": 3.55828792884897e-05, - "loss": 0.6955, - "step": 3545 - }, - { - "epoch": 0.41452592246613734, - "grad_norm": 0.7640010118484497, - "learning_rate": 3.5542318921776886e-05, - "loss": 0.6832, - "step": 3550 - }, - { - "epoch": 0.4151097617935544, - "grad_norm": 0.663804829120636, - "learning_rate": 3.5501728569943746e-05, - "loss": 0.6806, - "step": 3555 - }, - { - "epoch": 0.41569360112097153, - "grad_norm": 0.7292649745941162, - "learning_rate": 3.546110838432806e-05, - "loss": 0.6774, - "step": 3560 - }, - { - "epoch": 0.4162774404483886, - "grad_norm": 0.7868595719337463, - "learning_rate": 3.542045851637883e-05, - "loss": 0.6771, - "step": 3565 - }, - { - "epoch": 0.41686127977580567, - "grad_norm": 0.8498745560646057, - "learning_rate": 3.53797791176557e-05, - "loss": 0.7024, - "step": 3570 - }, - { - "epoch": 0.4174451191032228, - "grad_norm": 0.7394131422042847, - "learning_rate": 3.5339070339828466e-05, - "loss": 0.6871, - "step": 3575 - }, - { - "epoch": 0.41802895843063986, - "grad_norm": 0.7180430293083191, - "learning_rate": 3.529833233467642e-05, - "loss": 0.6621, - "step": 3580 - }, - { - "epoch": 0.418612797758057, - "grad_norm": 0.6864657402038574, - "learning_rate": 3.525756525408785e-05, - "loss": 0.6939, - "step": 3585 - }, - { - "epoch": 0.41919663708547406, - "grad_norm": 0.7795558571815491, - "learning_rate": 3.521676925005945e-05, - "loss": 0.6901, - "step": 3590 - }, - { - "epoch": 0.4197804764128912, - "grad_norm": 0.7224160432815552, - "learning_rate": 3.517594447469572e-05, - "loss": 0.6729, - "step": 3595 - }, - { - "epoch": 0.42036431574030825, - "grad_norm": 0.6776707172393799, - "learning_rate": 3.513509108020846e-05, - "loss": 0.693, - "step": 3600 - }, - { - "epoch": 0.42094815506772537, - "grad_norm": 0.6936048269271851, - "learning_rate": 3.5094209218916185e-05, - "loss": 0.6779, - "step": 3605 - }, - { - "epoch": 0.42153199439514244, - "grad_norm": 0.6695876717567444, - "learning_rate": 3.505329904324351e-05, - "loss": 0.6727, - "step": 3610 - }, - { - "epoch": 0.42211583372255956, - "grad_norm": 0.808820366859436, - "learning_rate": 3.501236070572066e-05, - "loss": 0.6774, - "step": 3615 - }, - { - "epoch": 0.42269967304997663, - "grad_norm": 0.7598282098770142, - "learning_rate": 3.497139435898283e-05, - "loss": 0.6852, - "step": 3620 - }, - { - "epoch": 0.42328351237739376, - "grad_norm": 0.7000810503959656, - "learning_rate": 3.4930400155769644e-05, - "loss": 0.6761, - "step": 3625 - }, - { - "epoch": 0.4238673517048108, - "grad_norm": 0.6462544798851013, - "learning_rate": 3.488937824892461e-05, - "loss": 0.6973, - "step": 3630 - }, - { - "epoch": 0.42445119103222795, - "grad_norm": 0.6422539949417114, - "learning_rate": 3.48483287913945e-05, - "loss": 0.697, - "step": 3635 - }, - { - "epoch": 0.425035030359645, - "grad_norm": 0.6988733410835266, - "learning_rate": 3.480725193622881e-05, - "loss": 0.6753, - "step": 3640 - }, - { - "epoch": 0.42561886968706214, - "grad_norm": 0.6996596455574036, - "learning_rate": 3.476614783657922e-05, - "loss": 0.702, - "step": 3645 - }, - { - "epoch": 0.4262027090144792, - "grad_norm": 0.6821566224098206, - "learning_rate": 3.472501664569894e-05, - "loss": 0.6901, - "step": 3650 - }, - { - "epoch": 0.42678654834189633, - "grad_norm": 0.6748144626617432, - "learning_rate": 3.468385851694222e-05, - "loss": 0.687, - "step": 3655 - }, - { - "epoch": 0.4273703876693134, - "grad_norm": 0.7441579103469849, - "learning_rate": 3.464267360376373e-05, - "loss": 0.6801, - "step": 3660 - }, - { - "epoch": 0.4279542269967305, - "grad_norm": 0.6940188407897949, - "learning_rate": 3.460146205971802e-05, - "loss": 0.6956, - "step": 3665 - }, - { - "epoch": 0.4285380663241476, - "grad_norm": 0.6600026488304138, - "learning_rate": 3.456022403845891e-05, - "loss": 0.6879, - "step": 3670 - }, - { - "epoch": 0.4291219056515647, - "grad_norm": 0.7706608176231384, - "learning_rate": 3.4518959693738944e-05, - "loss": 0.6789, - "step": 3675 - }, - { - "epoch": 0.4297057449789818, - "grad_norm": 0.7532742619514465, - "learning_rate": 3.4477669179408834e-05, - "loss": 0.6835, - "step": 3680 - }, - { - "epoch": 0.43028958430639885, - "grad_norm": 0.7748365998268127, - "learning_rate": 3.443635264941682e-05, - "loss": 0.6838, - "step": 3685 - }, - { - "epoch": 0.430873423633816, - "grad_norm": 0.7468739151954651, - "learning_rate": 3.4395010257808185e-05, - "loss": 0.6878, - "step": 3690 - }, - { - "epoch": 0.43145726296123305, - "grad_norm": 0.6885190010070801, - "learning_rate": 3.43536421587246e-05, - "loss": 0.6973, - "step": 3695 - }, - { - "epoch": 0.43204110228865017, - "grad_norm": 0.7122448086738586, - "learning_rate": 3.431224850640361e-05, - "loss": 0.6814, - "step": 3700 - }, - { - "epoch": 0.43262494161606724, - "grad_norm": 0.7184144854545593, - "learning_rate": 3.427082945517801e-05, - "loss": 0.6759, - "step": 3705 - }, - { - "epoch": 0.43320878094348436, - "grad_norm": 0.7180167436599731, - "learning_rate": 3.422938515947531e-05, - "loss": 0.6705, - "step": 3710 - }, - { - "epoch": 0.43379262027090143, - "grad_norm": 0.7025409936904907, - "learning_rate": 3.418791577381713e-05, - "loss": 0.6771, - "step": 3715 - }, - { - "epoch": 0.43437645959831855, - "grad_norm": 0.7823861837387085, - "learning_rate": 3.4146421452818657e-05, - "loss": 0.6992, - "step": 3720 - }, - { - "epoch": 0.4349602989257356, - "grad_norm": 0.7129303812980652, - "learning_rate": 3.4104902351188e-05, - "loss": 0.6799, - "step": 3725 - }, - { - "epoch": 0.43554413825315275, - "grad_norm": 0.7030401825904846, - "learning_rate": 3.406335862372573e-05, - "loss": 0.6692, - "step": 3730 - }, - { - "epoch": 0.4361279775805698, - "grad_norm": 0.7644011974334717, - "learning_rate": 3.402179042532417e-05, - "loss": 0.6906, - "step": 3735 - }, - { - "epoch": 0.43671181690798694, - "grad_norm": 0.6951369643211365, - "learning_rate": 3.3980197910966915e-05, - "loss": 0.6862, - "step": 3740 - }, - { - "epoch": 0.437295656235404, - "grad_norm": 0.7194583415985107, - "learning_rate": 3.3938581235728214e-05, - "loss": 0.6736, - "step": 3745 - }, - { - "epoch": 0.43787949556282113, - "grad_norm": 0.6849757432937622, - "learning_rate": 3.389694055477238e-05, - "loss": 0.695, - "step": 3750 - }, - { - "epoch": 0.4384633348902382, - "grad_norm": 0.7094421982765198, - "learning_rate": 3.385527602335327e-05, - "loss": 0.6469, - "step": 3755 - }, - { - "epoch": 0.4390471742176553, - "grad_norm": 0.7638982534408569, - "learning_rate": 3.381358779681362e-05, - "loss": 0.671, - "step": 3760 - }, - { - "epoch": 0.4396310135450724, - "grad_norm": 0.7803221940994263, - "learning_rate": 3.377187603058454e-05, - "loss": 0.6866, - "step": 3765 - }, - { - "epoch": 0.4402148528724895, - "grad_norm": 0.8005938529968262, - "learning_rate": 3.373014088018489e-05, - "loss": 0.7029, - "step": 3770 - }, - { - "epoch": 0.4407986921999066, - "grad_norm": 0.68678879737854, - "learning_rate": 3.3688382501220727e-05, - "loss": 0.6693, - "step": 3775 - }, - { - "epoch": 0.4413825315273237, - "grad_norm": 0.7795002460479736, - "learning_rate": 3.364660104938472e-05, - "loss": 0.6898, - "step": 3780 - }, - { - "epoch": 0.4419663708547408, - "grad_norm": 0.77824467420578, - "learning_rate": 3.3604796680455546e-05, - "loss": 0.7035, - "step": 3785 - }, - { - "epoch": 0.44255021018215784, - "grad_norm": 0.6533271074295044, - "learning_rate": 3.356296955029733e-05, - "loss": 0.679, - "step": 3790 - }, - { - "epoch": 0.44313404950957497, - "grad_norm": 0.6827269196510315, - "learning_rate": 3.3521119814859063e-05, - "loss": 0.685, - "step": 3795 - }, - { - "epoch": 0.44371788883699204, - "grad_norm": 0.6369045972824097, - "learning_rate": 3.347924763017403e-05, - "loss": 0.6721, - "step": 3800 - }, - { - "epoch": 0.44430172816440916, - "grad_norm": 0.6672148704528809, - "learning_rate": 3.3437353152359195e-05, - "loss": 0.6771, - "step": 3805 - }, - { - "epoch": 0.44488556749182623, - "grad_norm": 0.6707364916801453, - "learning_rate": 3.339543653761466e-05, - "loss": 0.6851, - "step": 3810 - }, - { - "epoch": 0.44546940681924335, - "grad_norm": 0.7358875274658203, - "learning_rate": 3.335349794222304e-05, - "loss": 0.6737, - "step": 3815 - }, - { - "epoch": 0.4460532461466604, - "grad_norm": 0.7061719298362732, - "learning_rate": 3.331153752254893e-05, - "loss": 0.6963, - "step": 3820 - }, - { - "epoch": 0.44663708547407754, - "grad_norm": 0.7666628360748291, - "learning_rate": 3.326955543503827e-05, - "loss": 0.7041, - "step": 3825 - }, - { - "epoch": 0.4472209248014946, - "grad_norm": 0.7456943988800049, - "learning_rate": 3.322755183621779e-05, - "loss": 0.6759, - "step": 3830 - }, - { - "epoch": 0.44780476412891174, - "grad_norm": 0.7148375511169434, - "learning_rate": 3.318552688269446e-05, - "loss": 0.6703, - "step": 3835 - }, - { - "epoch": 0.4483886034563288, - "grad_norm": 0.762797474861145, - "learning_rate": 3.314348073115481e-05, - "loss": 0.6845, - "step": 3840 - }, - { - "epoch": 0.44897244278374593, - "grad_norm": 0.6983367800712585, - "learning_rate": 3.310141353836446e-05, - "loss": 0.6889, - "step": 3845 - }, - { - "epoch": 0.449556282111163, - "grad_norm": 0.7493073344230652, - "learning_rate": 3.305932546116743e-05, - "loss": 0.6795, - "step": 3850 - }, - { - "epoch": 0.4501401214385801, - "grad_norm": 0.7129876017570496, - "learning_rate": 3.301721665648566e-05, - "loss": 0.697, - "step": 3855 - }, - { - "epoch": 0.4507239607659972, - "grad_norm": 0.7050039768218994, - "learning_rate": 3.297508728131832e-05, - "loss": 0.6747, - "step": 3860 - }, - { - "epoch": 0.4513078000934143, - "grad_norm": 0.7499178051948547, - "learning_rate": 3.29329374927413e-05, - "loss": 0.6843, - "step": 3865 - }, - { - "epoch": 0.4518916394208314, - "grad_norm": 0.6930633783340454, - "learning_rate": 3.2890767447906615e-05, - "loss": 0.6706, - "step": 3870 - }, - { - "epoch": 0.4524754787482485, - "grad_norm": 0.727230429649353, - "learning_rate": 3.284857730404176e-05, - "loss": 0.6746, - "step": 3875 - }, - { - "epoch": 0.4530593180756656, - "grad_norm": 0.7512724995613098, - "learning_rate": 3.2806367218449216e-05, - "loss": 0.6807, - "step": 3880 - }, - { - "epoch": 0.4536431574030827, - "grad_norm": 0.7921996712684631, - "learning_rate": 3.2764137348505785e-05, - "loss": 0.6776, - "step": 3885 - }, - { - "epoch": 0.45422699673049977, - "grad_norm": 0.8013817667961121, - "learning_rate": 3.2721887851662044e-05, - "loss": 0.6696, - "step": 3890 - }, - { - "epoch": 0.45481083605791683, - "grad_norm": 0.8452430367469788, - "learning_rate": 3.267961888544173e-05, - "loss": 0.6875, - "step": 3895 - }, - { - "epoch": 0.45539467538533396, - "grad_norm": 0.6620650291442871, - "learning_rate": 3.263733060744121e-05, - "loss": 0.6772, - "step": 3900 - }, - { - "epoch": 0.455978514712751, - "grad_norm": 0.7774426937103271, - "learning_rate": 3.25950231753288e-05, - "loss": 0.6695, - "step": 3905 - }, - { - "epoch": 0.45656235404016815, - "grad_norm": 0.6689557433128357, - "learning_rate": 3.255269674684427e-05, - "loss": 0.6954, - "step": 3910 - }, - { - "epoch": 0.4571461933675852, - "grad_norm": 0.7409423589706421, - "learning_rate": 3.2510351479798214e-05, - "loss": 0.6522, - "step": 3915 - }, - { - "epoch": 0.45773003269500234, - "grad_norm": 0.8153340816497803, - "learning_rate": 3.2467987532071436e-05, - "loss": 0.6756, - "step": 3920 - }, - { - "epoch": 0.4583138720224194, - "grad_norm": 0.7892290353775024, - "learning_rate": 3.242560506161442e-05, - "loss": 0.675, - "step": 3925 - }, - { - "epoch": 0.45889771134983653, - "grad_norm": 0.7862321138381958, - "learning_rate": 3.2383204226446706e-05, - "loss": 0.6875, - "step": 3930 - }, - { - "epoch": 0.4594815506772536, - "grad_norm": 0.7793058753013611, - "learning_rate": 3.234078518465628e-05, - "loss": 0.6911, - "step": 3935 - }, - { - "epoch": 0.4600653900046707, - "grad_norm": 0.71147620677948, - "learning_rate": 3.229834809439904e-05, - "loss": 0.6797, - "step": 3940 - }, - { - "epoch": 0.4606492293320878, - "grad_norm": 0.8237377405166626, - "learning_rate": 3.225589311389816e-05, - "loss": 0.6763, - "step": 3945 - }, - { - "epoch": 0.4612330686595049, - "grad_norm": 0.6844600439071655, - "learning_rate": 3.221342040144352e-05, - "loss": 0.6848, - "step": 3950 - }, - { - "epoch": 0.461816907986922, - "grad_norm": 0.6796713471412659, - "learning_rate": 3.217093011539111e-05, - "loss": 0.6897, - "step": 3955 - }, - { - "epoch": 0.4624007473143391, - "grad_norm": 0.7828940153121948, - "learning_rate": 3.2128422414162454e-05, - "loss": 0.6812, - "step": 3960 - }, - { - "epoch": 0.4629845866417562, - "grad_norm": 0.6918688416481018, - "learning_rate": 3.2085897456243986e-05, - "loss": 0.6792, - "step": 3965 - }, - { - "epoch": 0.4635684259691733, - "grad_norm": 0.7672845125198364, - "learning_rate": 3.204335540018649e-05, - "loss": 0.6756, - "step": 3970 - }, - { - "epoch": 0.46415226529659037, - "grad_norm": 0.658639132976532, - "learning_rate": 3.200079640460451e-05, - "loss": 0.6808, - "step": 3975 - }, - { - "epoch": 0.4647361046240075, - "grad_norm": 0.6817683577537537, - "learning_rate": 3.195822062817573e-05, - "loss": 0.6637, - "step": 3980 - }, - { - "epoch": 0.46531994395142456, - "grad_norm": 0.6692237257957458, - "learning_rate": 3.191562822964041e-05, - "loss": 0.6999, - "step": 3985 - }, - { - "epoch": 0.4659037832788417, - "grad_norm": 0.6766791343688965, - "learning_rate": 3.187301936780079e-05, - "loss": 0.6543, - "step": 3990 - }, - { - "epoch": 0.46648762260625876, - "grad_norm": 0.6621002554893494, - "learning_rate": 3.183039420152047e-05, - "loss": 0.6837, - "step": 3995 - }, - { - "epoch": 0.4670714619336759, - "grad_norm": 0.6324938535690308, - "learning_rate": 3.178775288972386e-05, - "loss": 0.6809, - "step": 4000 - }, - { - "epoch": 0.46765530126109295, - "grad_norm": 0.6939051747322083, - "learning_rate": 3.174509559139556e-05, - "loss": 0.6857, - "step": 4005 - }, - { - "epoch": 0.46823914058851, - "grad_norm": 0.6534227132797241, - "learning_rate": 3.170242246557978e-05, - "loss": 0.679, - "step": 4010 - }, - { - "epoch": 0.46882297991592714, - "grad_norm": 0.6965385675430298, - "learning_rate": 3.1659733671379735e-05, - "loss": 0.6862, - "step": 4015 - }, - { - "epoch": 0.4694068192433442, - "grad_norm": 0.6696798205375671, - "learning_rate": 3.1617029367957053e-05, - "loss": 0.6586, - "step": 4020 - }, - { - "epoch": 0.46999065857076133, - "grad_norm": 0.7275343537330627, - "learning_rate": 3.1574309714531195e-05, - "loss": 0.6818, - "step": 4025 - }, - { - "epoch": 0.4705744978981784, - "grad_norm": 0.6740556955337524, - "learning_rate": 3.153157487037887e-05, - "loss": 0.6581, - "step": 4030 - }, - { - "epoch": 0.4711583372255955, - "grad_norm": 0.6637656688690186, - "learning_rate": 3.1488824994833395e-05, - "loss": 0.6722, - "step": 4035 - }, - { - "epoch": 0.4717421765530126, - "grad_norm": 0.7013155221939087, - "learning_rate": 3.1446060247284134e-05, - "loss": 0.6752, - "step": 4040 - }, - { - "epoch": 0.4723260158804297, - "grad_norm": 0.7300024032592773, - "learning_rate": 3.140328078717591e-05, - "loss": 0.6839, - "step": 4045 - }, - { - "epoch": 0.4729098552078468, - "grad_norm": 0.7423737645149231, - "learning_rate": 3.1360486774008415e-05, - "loss": 0.6792, - "step": 4050 - }, - { - "epoch": 0.4734936945352639, - "grad_norm": 0.6636881828308105, - "learning_rate": 3.131767836733556e-05, - "loss": 0.6842, - "step": 4055 - }, - { - "epoch": 0.474077533862681, - "grad_norm": 0.6986633539199829, - "learning_rate": 3.127485572676496e-05, - "loss": 0.6719, - "step": 4060 - }, - { - "epoch": 0.4746613731900981, - "grad_norm": 0.7615857124328613, - "learning_rate": 3.1232019011957294e-05, - "loss": 0.6685, - "step": 4065 - }, - { - "epoch": 0.47524521251751517, - "grad_norm": 0.7390249371528625, - "learning_rate": 3.118916838262568e-05, - "loss": 0.6607, - "step": 4070 - }, - { - "epoch": 0.4758290518449323, - "grad_norm": 0.6422819495201111, - "learning_rate": 3.114630399853517e-05, - "loss": 0.6649, - "step": 4075 - }, - { - "epoch": 0.47641289117234936, - "grad_norm": 0.6538007855415344, - "learning_rate": 3.1103426019502055e-05, - "loss": 0.6559, - "step": 4080 - }, - { - "epoch": 0.4769967304997665, - "grad_norm": 0.7528535723686218, - "learning_rate": 3.1060534605393345e-05, - "loss": 0.6911, - "step": 4085 - }, - { - "epoch": 0.47758056982718355, - "grad_norm": 0.6821663975715637, - "learning_rate": 3.101762991612611e-05, - "loss": 0.6747, - "step": 4090 - }, - { - "epoch": 0.4781644091546007, - "grad_norm": 0.8261800408363342, - "learning_rate": 3.0974712111666935e-05, - "loss": 0.6857, - "step": 4095 - }, - { - "epoch": 0.47874824848201775, - "grad_norm": 0.7546287178993225, - "learning_rate": 3.09317813520313e-05, - "loss": 0.6937, - "step": 4100 - }, - { - "epoch": 0.47933208780943487, - "grad_norm": 0.6892336010932922, - "learning_rate": 3.0888837797283005e-05, - "loss": 0.6824, - "step": 4105 - }, - { - "epoch": 0.47991592713685194, - "grad_norm": 0.7267537713050842, - "learning_rate": 3.0845881607533524e-05, - "loss": 0.6772, - "step": 4110 - }, - { - "epoch": 0.480499766464269, - "grad_norm": 0.7015790939331055, - "learning_rate": 3.0802912942941453e-05, - "loss": 0.6834, - "step": 4115 - }, - { - "epoch": 0.48108360579168613, - "grad_norm": 0.6594697833061218, - "learning_rate": 3.0759931963711913e-05, - "loss": 0.6782, - "step": 4120 - }, - { - "epoch": 0.4816674451191032, - "grad_norm": 0.7593767046928406, - "learning_rate": 3.071693883009591e-05, - "loss": 0.6833, - "step": 4125 - }, - { - "epoch": 0.4822512844465203, - "grad_norm": 0.6981526613235474, - "learning_rate": 3.06739337023898e-05, - "loss": 0.6739, - "step": 4130 - }, - { - "epoch": 0.4828351237739374, - "grad_norm": 0.7820166945457458, - "learning_rate": 3.0630916740934626e-05, - "loss": 0.676, - "step": 4135 - }, - { - "epoch": 0.4834189631013545, - "grad_norm": 0.6433948278427124, - "learning_rate": 3.058788810611558e-05, - "loss": 0.6707, - "step": 4140 - }, - { - "epoch": 0.4840028024287716, - "grad_norm": 0.791711688041687, - "learning_rate": 3.054484795836136e-05, - "loss": 0.6771, - "step": 4145 - }, - { - "epoch": 0.4845866417561887, - "grad_norm": 0.7172390818595886, - "learning_rate": 3.0501796458143593e-05, - "loss": 0.6744, - "step": 4150 - }, - { - "epoch": 0.4851704810836058, - "grad_norm": 0.6948303580284119, - "learning_rate": 3.045873376597624e-05, - "loss": 0.6679, - "step": 4155 - }, - { - "epoch": 0.4857543204110229, - "grad_norm": 0.7872658967971802, - "learning_rate": 3.041566004241498e-05, - "loss": 0.6657, - "step": 4160 - }, - { - "epoch": 0.48633815973843997, - "grad_norm": 0.6825777292251587, - "learning_rate": 3.037257544805661e-05, - "loss": 0.6695, - "step": 4165 - }, - { - "epoch": 0.4869219990658571, - "grad_norm": 0.663886547088623, - "learning_rate": 3.0329480143538498e-05, - "loss": 0.682, - "step": 4170 - }, - { - "epoch": 0.48750583839327416, - "grad_norm": 0.7771258354187012, - "learning_rate": 3.0286374289537912e-05, - "loss": 0.6439, - "step": 4175 - }, - { - "epoch": 0.4880896777206913, - "grad_norm": 0.6744545698165894, - "learning_rate": 3.0243258046771446e-05, - "loss": 0.6677, - "step": 4180 - }, - { - "epoch": 0.48867351704810835, - "grad_norm": 0.7483869194984436, - "learning_rate": 3.0200131575994456e-05, - "loss": 0.6855, - "step": 4185 - }, - { - "epoch": 0.4892573563755255, - "grad_norm": 0.6919116377830505, - "learning_rate": 3.0156995038000418e-05, - "loss": 0.6563, - "step": 4190 - }, - { - "epoch": 0.48984119570294254, - "grad_norm": 0.7150753736495972, - "learning_rate": 3.011384859362034e-05, - "loss": 0.6791, - "step": 4195 - }, - { - "epoch": 0.49042503503035967, - "grad_norm": 0.8189941644668579, - "learning_rate": 3.0070692403722162e-05, - "loss": 0.6879, - "step": 4200 - }, - { - "epoch": 0.49100887435777674, - "grad_norm": 0.7260397672653198, - "learning_rate": 3.002752662921018e-05, - "loss": 0.6921, - "step": 4205 - }, - { - "epoch": 0.49159271368519386, - "grad_norm": 0.7689381241798401, - "learning_rate": 2.9984351431024394e-05, - "loss": 0.6699, - "step": 4210 - }, - { - "epoch": 0.49217655301261093, - "grad_norm": 0.6612753868103027, - "learning_rate": 2.9941166970139968e-05, - "loss": 0.6472, - "step": 4215 - }, - { - "epoch": 0.492760392340028, - "grad_norm": 0.6999014019966125, - "learning_rate": 2.9897973407566583e-05, - "loss": 0.6651, - "step": 4220 - }, - { - "epoch": 0.4933442316674451, - "grad_norm": 0.7655627727508545, - "learning_rate": 2.985477090434786e-05, - "loss": 0.6882, - "step": 4225 - }, - { - "epoch": 0.4939280709948622, - "grad_norm": 0.7117385268211365, - "learning_rate": 2.9811559621560765e-05, - "loss": 0.6796, - "step": 4230 - }, - { - "epoch": 0.4945119103222793, - "grad_norm": 0.7483447790145874, - "learning_rate": 2.976833972031498e-05, - "loss": 0.6544, - "step": 4235 - }, - { - "epoch": 0.4950957496496964, - "grad_norm": 0.6597625613212585, - "learning_rate": 2.9725111361752333e-05, - "loss": 0.6953, - "step": 4240 - }, - { - "epoch": 0.4956795889771135, - "grad_norm": 0.6553619503974915, - "learning_rate": 2.968187470704618e-05, - "loss": 0.667, - "step": 4245 - }, - { - "epoch": 0.4962634283045306, - "grad_norm": 0.7013184428215027, - "learning_rate": 2.9638629917400806e-05, - "loss": 0.6913, - "step": 4250 - }, - { - "epoch": 0.4968472676319477, - "grad_norm": 0.7164246439933777, - "learning_rate": 2.9595377154050836e-05, - "loss": 0.6907, - "step": 4255 - }, - { - "epoch": 0.49743110695936477, - "grad_norm": 0.6580268144607544, - "learning_rate": 2.955211657826061e-05, - "loss": 0.683, - "step": 4260 - }, - { - "epoch": 0.4980149462867819, - "grad_norm": 0.6795541048049927, - "learning_rate": 2.9508848351323597e-05, - "loss": 0.6555, - "step": 4265 - }, - { - "epoch": 0.49859878561419896, - "grad_norm": 0.7254034876823425, - "learning_rate": 2.9465572634561815e-05, - "loss": 0.6852, - "step": 4270 - }, - { - "epoch": 0.4991826249416161, - "grad_norm": 0.6938592195510864, - "learning_rate": 2.9422289589325187e-05, - "loss": 0.6714, - "step": 4275 - }, - { - "epoch": 0.49976646426903315, - "grad_norm": 0.8255984783172607, - "learning_rate": 2.9378999376990958e-05, - "loss": 0.6455, - "step": 4280 - }, - { - "epoch": 0.5003503035964503, - "grad_norm": 0.774491548538208, - "learning_rate": 2.9335702158963107e-05, - "loss": 0.6612, - "step": 4285 - }, - { - "epoch": 0.5009341429238674, - "grad_norm": 0.7388604879379272, - "learning_rate": 2.929239809667172e-05, - "loss": 0.6424, - "step": 4290 - }, - { - "epoch": 0.5015179822512844, - "grad_norm": 0.7996976375579834, - "learning_rate": 2.9249087351572414e-05, - "loss": 0.6889, - "step": 4295 - }, - { - "epoch": 0.5021018215787015, - "grad_norm": 0.6799002885818481, - "learning_rate": 2.9205770085145716e-05, - "loss": 0.6908, - "step": 4300 - }, - { - "epoch": 0.5026856609061187, - "grad_norm": 0.7640228271484375, - "learning_rate": 2.916244645889647e-05, - "loss": 0.6752, - "step": 4305 - }, - { - "epoch": 0.5032695002335358, - "grad_norm": 0.7042264938354492, - "learning_rate": 2.911911663435322e-05, - "loss": 0.6616, - "step": 4310 - }, - { - "epoch": 0.5038533395609528, - "grad_norm": 0.6852525472640991, - "learning_rate": 2.9075780773067644e-05, - "loss": 0.6551, - "step": 4315 - }, - { - "epoch": 0.5044371788883699, - "grad_norm": 0.6982749104499817, - "learning_rate": 2.9032439036613907e-05, - "loss": 0.693, - "step": 4320 - }, - { - "epoch": 0.505021018215787, - "grad_norm": 0.6702561974525452, - "learning_rate": 2.8989091586588085e-05, - "loss": 0.6714, - "step": 4325 - }, - { - "epoch": 0.5056048575432041, - "grad_norm": 0.6147798895835876, - "learning_rate": 2.894573858460755e-05, - "loss": 0.6748, - "step": 4330 - }, - { - "epoch": 0.5061886968706212, - "grad_norm": 0.7076907753944397, - "learning_rate": 2.8902380192310392e-05, - "loss": 0.6638, - "step": 4335 - }, - { - "epoch": 0.5067725361980383, - "grad_norm": 0.7374987602233887, - "learning_rate": 2.8859016571354778e-05, - "loss": 0.6677, - "step": 4340 - }, - { - "epoch": 0.5073563755254554, - "grad_norm": 0.6996968984603882, - "learning_rate": 2.881564788341839e-05, - "loss": 0.6856, - "step": 4345 - }, - { - "epoch": 0.5079402148528724, - "grad_norm": 0.6513121724128723, - "learning_rate": 2.877227429019778e-05, - "loss": 0.668, - "step": 4350 - }, - { - "epoch": 0.5085240541802896, - "grad_norm": 0.6951887011528015, - "learning_rate": 2.872889595340781e-05, - "loss": 0.6704, - "step": 4355 - }, - { - "epoch": 0.5091078935077067, - "grad_norm": 0.6447330117225647, - "learning_rate": 2.8685513034781003e-05, - "loss": 0.662, - "step": 4360 - }, - { - "epoch": 0.5096917328351238, - "grad_norm": 0.7027966380119324, - "learning_rate": 2.864212569606699e-05, - "loss": 0.6615, - "step": 4365 - }, - { - "epoch": 0.5102755721625408, - "grad_norm": 0.6491148471832275, - "learning_rate": 2.8598734099031878e-05, - "loss": 0.6575, - "step": 4370 - }, - { - "epoch": 0.510859411489958, - "grad_norm": 0.7248828411102295, - "learning_rate": 2.8555338405457628e-05, - "loss": 0.6858, - "step": 4375 - }, - { - "epoch": 0.5114432508173751, - "grad_norm": 0.7328376770019531, - "learning_rate": 2.85119387771415e-05, - "loss": 0.6698, - "step": 4380 - }, - { - "epoch": 0.5120270901447922, - "grad_norm": 0.7065794467926025, - "learning_rate": 2.8468535375895417e-05, - "loss": 0.6425, - "step": 4385 - }, - { - "epoch": 0.5126109294722092, - "grad_norm": 0.6677983403205872, - "learning_rate": 2.8425128363545362e-05, - "loss": 0.6874, - "step": 4390 - }, - { - "epoch": 0.5131947687996263, - "grad_norm": 0.6390778422355652, - "learning_rate": 2.8381717901930792e-05, - "loss": 0.6532, - "step": 4395 - }, - { - "epoch": 0.5137786081270435, - "grad_norm": 0.6523604393005371, - "learning_rate": 2.8338304152904016e-05, - "loss": 0.6831, - "step": 4400 - }, - { - "epoch": 0.5143624474544606, - "grad_norm": 0.8468630313873291, - "learning_rate": 2.8294887278329606e-05, - "loss": 0.675, - "step": 4405 - }, - { - "epoch": 0.5149462867818776, - "grad_norm": 0.7009800672531128, - "learning_rate": 2.825146744008378e-05, - "loss": 0.6555, - "step": 4410 - }, - { - "epoch": 0.5155301261092947, - "grad_norm": 0.7169152498245239, - "learning_rate": 2.8208044800053822e-05, - "loss": 0.7011, - "step": 4415 - }, - { - "epoch": 0.5161139654367118, - "grad_norm": 0.6061458587646484, - "learning_rate": 2.8164619520137437e-05, - "loss": 0.6684, - "step": 4420 - }, - { - "epoch": 0.516697804764129, - "grad_norm": 0.667434811592102, - "learning_rate": 2.8121191762242188e-05, - "loss": 0.6666, - "step": 4425 - }, - { - "epoch": 0.517281644091546, - "grad_norm": 0.644501268863678, - "learning_rate": 2.8077761688284886e-05, - "loss": 0.6499, - "step": 4430 - }, - { - "epoch": 0.5178654834189631, - "grad_norm": 0.6586326956748962, - "learning_rate": 2.803432946019095e-05, - "loss": 0.668, - "step": 4435 - }, - { - "epoch": 0.5184493227463802, - "grad_norm": 0.7346563339233398, - "learning_rate": 2.7990895239893866e-05, - "loss": 0.6632, - "step": 4440 - }, - { - "epoch": 0.5190331620737972, - "grad_norm": 0.6729130148887634, - "learning_rate": 2.7947459189334514e-05, - "loss": 0.6457, - "step": 4445 - }, - { - "epoch": 0.5196170014012144, - "grad_norm": 0.6930280327796936, - "learning_rate": 2.790402147046062e-05, - "loss": 0.6724, - "step": 4450 - }, - { - "epoch": 0.5202008407286315, - "grad_norm": 0.6886438727378845, - "learning_rate": 2.7860582245226114e-05, - "loss": 0.657, - "step": 4455 - }, - { - "epoch": 0.5207846800560486, - "grad_norm": 0.6696081161499023, - "learning_rate": 2.781714167559056e-05, - "loss": 0.6643, - "step": 4460 - }, - { - "epoch": 0.5213685193834656, - "grad_norm": 0.7693870067596436, - "learning_rate": 2.7773699923518527e-05, - "loss": 0.6787, - "step": 4465 - }, - { - "epoch": 0.5219523587108827, - "grad_norm": 0.6501506567001343, - "learning_rate": 2.7730257150978985e-05, - "loss": 0.6794, - "step": 4470 - }, - { - "epoch": 0.5225361980382999, - "grad_norm": 0.6283298134803772, - "learning_rate": 2.7686813519944716e-05, - "loss": 0.6743, - "step": 4475 - }, - { - "epoch": 0.523120037365717, - "grad_norm": 0.6755585670471191, - "learning_rate": 2.7643369192391705e-05, - "loss": 0.6607, - "step": 4480 - }, - { - "epoch": 0.523703876693134, - "grad_norm": 0.7047080397605896, - "learning_rate": 2.759992433029852e-05, - "loss": 0.6718, - "step": 4485 - }, - { - "epoch": 0.5242877160205511, - "grad_norm": 0.6827732920646667, - "learning_rate": 2.7556479095645753e-05, - "loss": 0.6573, - "step": 4490 - }, - { - "epoch": 0.5248715553479683, - "grad_norm": 0.6593140959739685, - "learning_rate": 2.7513033650415352e-05, - "loss": 0.6821, - "step": 4495 - }, - { - "epoch": 0.5254553946753854, - "grad_norm": 0.6783498525619507, - "learning_rate": 2.7469588156590065e-05, - "loss": 0.6536, - "step": 4500 - }, - { - "epoch": 0.5260392340028024, - "grad_norm": 0.6560935974121094, - "learning_rate": 2.742614277615282e-05, - "loss": 0.6776, - "step": 4505 - }, - { - "epoch": 0.5266230733302195, - "grad_norm": 0.6130218505859375, - "learning_rate": 2.7382697671086115e-05, - "loss": 0.6762, - "step": 4510 - }, - { - "epoch": 0.5272069126576366, - "grad_norm": 0.6400717496871948, - "learning_rate": 2.7339253003371434e-05, - "loss": 0.6758, - "step": 4515 - }, - { - "epoch": 0.5277907519850538, - "grad_norm": 0.693777322769165, - "learning_rate": 2.729580893498862e-05, - "loss": 0.6824, - "step": 4520 - }, - { - "epoch": 0.5283745913124708, - "grad_norm": 0.676154613494873, - "learning_rate": 2.725236562791529e-05, - "loss": 0.6579, - "step": 4525 - }, - { - "epoch": 0.5289584306398879, - "grad_norm": 0.631051778793335, - "learning_rate": 2.7208923244126218e-05, - "loss": 0.6687, - "step": 4530 - }, - { - "epoch": 0.529542269967305, - "grad_norm": 0.7405977845191956, - "learning_rate": 2.716548194559273e-05, - "loss": 0.6888, - "step": 4535 - }, - { - "epoch": 0.5301261092947221, - "grad_norm": 0.6706578731536865, - "learning_rate": 2.7122041894282113e-05, - "loss": 0.6803, - "step": 4540 - }, - { - "epoch": 0.5307099486221392, - "grad_norm": 0.6965911388397217, - "learning_rate": 2.707860325215701e-05, - "loss": 0.6879, - "step": 4545 - }, - { - "epoch": 0.5312937879495563, - "grad_norm": 0.660099446773529, - "learning_rate": 2.7035166181174786e-05, - "loss": 0.6723, - "step": 4550 - }, - { - "epoch": 0.5318776272769734, - "grad_norm": 0.7010653614997864, - "learning_rate": 2.6991730843286985e-05, - "loss": 0.6695, - "step": 4555 - }, - { - "epoch": 0.5324614666043904, - "grad_norm": 0.8124195337295532, - "learning_rate": 2.6948297400438654e-05, - "loss": 0.6929, - "step": 4560 - }, - { - "epoch": 0.5330453059318075, - "grad_norm": 0.6997001767158508, - "learning_rate": 2.6904866014567792e-05, - "loss": 0.6815, - "step": 4565 - }, - { - "epoch": 0.5336291452592247, - "grad_norm": 0.7359521985054016, - "learning_rate": 2.686143684760473e-05, - "loss": 0.6797, - "step": 4570 - }, - { - "epoch": 0.5342129845866418, - "grad_norm": 0.6646679043769836, - "learning_rate": 2.6818010061471516e-05, - "loss": 0.654, - "step": 4575 - }, - { - "epoch": 0.5347968239140588, - "grad_norm": 0.6694083213806152, - "learning_rate": 2.6774585818081332e-05, - "loss": 0.655, - "step": 4580 - }, - { - "epoch": 0.5353806632414759, - "grad_norm": 0.674031674861908, - "learning_rate": 2.6731164279337867e-05, - "loss": 0.6863, - "step": 4585 - }, - { - "epoch": 0.535964502568893, - "grad_norm": 0.678043007850647, - "learning_rate": 2.668774560713474e-05, - "loss": 0.6833, - "step": 4590 - }, - { - "epoch": 0.5365483418963102, - "grad_norm": 0.6792556047439575, - "learning_rate": 2.6644329963354882e-05, - "loss": 0.6775, - "step": 4595 - }, - { - "epoch": 0.5371321812237272, - "grad_norm": 0.6906665563583374, - "learning_rate": 2.6600917509869912e-05, - "loss": 0.6641, - "step": 4600 - }, - { - "epoch": 0.5377160205511443, - "grad_norm": 0.6357275247573853, - "learning_rate": 2.655750840853958e-05, - "loss": 0.652, - "step": 4605 - }, - { - "epoch": 0.5382998598785614, - "grad_norm": 0.7192896604537964, - "learning_rate": 2.6514102821211117e-05, - "loss": 0.6831, - "step": 4610 - }, - { - "epoch": 0.5388836992059786, - "grad_norm": 0.6543186902999878, - "learning_rate": 2.647070090971867e-05, - "loss": 0.6711, - "step": 4615 - }, - { - "epoch": 0.5394675385333956, - "grad_norm": 0.7396801114082336, - "learning_rate": 2.6427302835882672e-05, - "loss": 0.652, - "step": 4620 - }, - { - "epoch": 0.5400513778608127, - "grad_norm": 0.7101661562919617, - "learning_rate": 2.6383908761509252e-05, - "loss": 0.6575, - "step": 4625 - }, - { - "epoch": 0.5406352171882298, - "grad_norm": 0.6786910891532898, - "learning_rate": 2.634051884838961e-05, - "loss": 0.6721, - "step": 4630 - }, - { - "epoch": 0.541219056515647, - "grad_norm": 0.6411218643188477, - "learning_rate": 2.629713325829946e-05, - "loss": 0.6859, - "step": 4635 - }, - { - "epoch": 0.541802895843064, - "grad_norm": 0.7029268145561218, - "learning_rate": 2.625375215299838e-05, - "loss": 0.6733, - "step": 4640 - }, - { - "epoch": 0.5423867351704811, - "grad_norm": 0.6495332717895508, - "learning_rate": 2.6210375694229227e-05, - "loss": 0.6867, - "step": 4645 - }, - { - "epoch": 0.5429705744978982, - "grad_norm": 0.6691966652870178, - "learning_rate": 2.6167004043717535e-05, - "loss": 0.6857, - "step": 4650 - }, - { - "epoch": 0.5435544138253152, - "grad_norm": 0.6674373745918274, - "learning_rate": 2.6123637363170912e-05, - "loss": 0.6353, - "step": 4655 - }, - { - "epoch": 0.5441382531527323, - "grad_norm": 0.6771703362464905, - "learning_rate": 2.6080275814278444e-05, - "loss": 0.6671, - "step": 4660 - }, - { - "epoch": 0.5447220924801495, - "grad_norm": 0.6526208519935608, - "learning_rate": 2.6036919558710064e-05, - "loss": 0.6789, - "step": 4665 - }, - { - "epoch": 0.5453059318075666, - "grad_norm": 0.6926788687705994, - "learning_rate": 2.599356875811599e-05, - "loss": 0.673, - "step": 4670 - }, - { - "epoch": 0.5458897711349836, - "grad_norm": 0.6777634620666504, - "learning_rate": 2.595022357412609e-05, - "loss": 0.6422, - "step": 4675 - }, - { - "epoch": 0.5464736104624007, - "grad_norm": 0.8012570142745972, - "learning_rate": 2.59068841683493e-05, - "loss": 0.6647, - "step": 4680 - }, - { - "epoch": 0.5470574497898179, - "grad_norm": 0.7324134111404419, - "learning_rate": 2.586355070237301e-05, - "loss": 0.6716, - "step": 4685 - }, - { - "epoch": 0.547641289117235, - "grad_norm": 0.6582139730453491, - "learning_rate": 2.5820223337762438e-05, - "loss": 0.6487, - "step": 4690 - }, - { - "epoch": 0.548225128444652, - "grad_norm": 0.736298680305481, - "learning_rate": 2.5776902236060096e-05, - "loss": 0.6642, - "step": 4695 - }, - { - "epoch": 0.5488089677720691, - "grad_norm": 0.6291876435279846, - "learning_rate": 2.5733587558785126e-05, - "loss": 0.671, - "step": 4700 - }, - { - "epoch": 0.5493928070994862, - "grad_norm": 0.691855251789093, - "learning_rate": 2.569027946743271e-05, - "loss": 0.6726, - "step": 4705 - }, - { - "epoch": 0.5499766464269034, - "grad_norm": 0.755782425403595, - "learning_rate": 2.5646978123473477e-05, - "loss": 0.6603, - "step": 4710 - }, - { - "epoch": 0.5505604857543204, - "grad_norm": 0.6840935349464417, - "learning_rate": 2.560368368835291e-05, - "loss": 0.6635, - "step": 4715 - }, - { - "epoch": 0.5511443250817375, - "grad_norm": 0.7214719653129578, - "learning_rate": 2.5560396323490725e-05, - "loss": 0.6697, - "step": 4720 - }, - { - "epoch": 0.5517281644091546, - "grad_norm": 0.6904417276382446, - "learning_rate": 2.5517116190280284e-05, - "loss": 0.6865, - "step": 4725 - }, - { - "epoch": 0.5523120037365717, - "grad_norm": 0.6185650825500488, - "learning_rate": 2.547384345008797e-05, - "loss": 0.6767, - "step": 4730 - }, - { - "epoch": 0.5528958430639888, - "grad_norm": 0.6460402607917786, - "learning_rate": 2.5430578264252612e-05, - "loss": 0.6709, - "step": 4735 - }, - { - "epoch": 0.5534796823914059, - "grad_norm": 0.6563448905944824, - "learning_rate": 2.538732079408489e-05, - "loss": 0.667, - "step": 4740 - }, - { - "epoch": 0.554063521718823, - "grad_norm": 0.7151439785957336, - "learning_rate": 2.534407120086668e-05, - "loss": 0.6763, - "step": 4745 - }, - { - "epoch": 0.5546473610462401, - "grad_norm": 0.6759955286979675, - "learning_rate": 2.5300829645850533e-05, - "loss": 0.6503, - "step": 4750 - }, - { - "epoch": 0.5552312003736571, - "grad_norm": 0.7031111717224121, - "learning_rate": 2.5257596290258983e-05, - "loss": 0.6427, - "step": 4755 - }, - { - "epoch": 0.5558150397010743, - "grad_norm": 0.6886486411094666, - "learning_rate": 2.5214371295284028e-05, - "loss": 0.6799, - "step": 4760 - }, - { - "epoch": 0.5563988790284914, - "grad_norm": 0.6858464479446411, - "learning_rate": 2.517115482208649e-05, - "loss": 0.6503, - "step": 4765 - }, - { - "epoch": 0.5569827183559084, - "grad_norm": 0.6623648405075073, - "learning_rate": 2.5127947031795397e-05, - "loss": 0.6604, - "step": 4770 - }, - { - "epoch": 0.5575665576833255, - "grad_norm": 0.6712989807128906, - "learning_rate": 2.5084748085507432e-05, - "loss": 0.6669, - "step": 4775 - }, - { - "epoch": 0.5581503970107426, - "grad_norm": 0.7661103010177612, - "learning_rate": 2.5041558144286282e-05, - "loss": 0.6748, - "step": 4780 - }, - { - "epoch": 0.5587342363381598, - "grad_norm": 0.738399863243103, - "learning_rate": 2.499837736916207e-05, - "loss": 0.6579, - "step": 4785 - }, - { - "epoch": 0.5593180756655768, - "grad_norm": 0.7069480419158936, - "learning_rate": 2.495520592113074e-05, - "loss": 0.6461, - "step": 4790 - }, - { - "epoch": 0.5599019149929939, - "grad_norm": 0.7446060180664062, - "learning_rate": 2.4912043961153468e-05, - "loss": 0.666, - "step": 4795 - }, - { - "epoch": 0.560485754320411, - "grad_norm": 0.7507482767105103, - "learning_rate": 2.486889165015604e-05, - "loss": 0.6644, - "step": 4800 - }, - { - "epoch": 0.5610695936478282, - "grad_norm": 0.6437153220176697, - "learning_rate": 2.4825749149028277e-05, - "loss": 0.6593, - "step": 4805 - }, - { - "epoch": 0.5616534329752452, - "grad_norm": 0.7073465585708618, - "learning_rate": 2.4782616618623428e-05, - "loss": 0.6602, - "step": 4810 - }, - { - "epoch": 0.5622372723026623, - "grad_norm": 0.7186815142631531, - "learning_rate": 2.4739494219757554e-05, - "loss": 0.6738, - "step": 4815 - }, - { - "epoch": 0.5628211116300794, - "grad_norm": 0.6206753849983215, - "learning_rate": 2.4696382113208956e-05, - "loss": 0.6634, - "step": 4820 - }, - { - "epoch": 0.5634049509574965, - "grad_norm": 0.6816120743751526, - "learning_rate": 2.465328045971755e-05, - "loss": 0.6636, - "step": 4825 - }, - { - "epoch": 0.5639887902849136, - "grad_norm": 0.6843183040618896, - "learning_rate": 2.4610189419984285e-05, - "loss": 0.672, - "step": 4830 - }, - { - "epoch": 0.5645726296123307, - "grad_norm": 0.7881632447242737, - "learning_rate": 2.4567109154670542e-05, - "loss": 0.6588, - "step": 4835 - }, - { - "epoch": 0.5651564689397478, - "grad_norm": 0.6082138419151306, - "learning_rate": 2.452403982439751e-05, - "loss": 0.6861, - "step": 4840 - }, - { - "epoch": 0.5657403082671649, - "grad_norm": 0.6717233061790466, - "learning_rate": 2.4480981589745632e-05, - "loss": 0.6448, - "step": 4845 - }, - { - "epoch": 0.5663241475945819, - "grad_norm": 0.6517541408538818, - "learning_rate": 2.4437934611253972e-05, - "loss": 0.6666, - "step": 4850 - }, - { - "epoch": 0.5669079869219991, - "grad_norm": 0.6386508941650391, - "learning_rate": 2.4394899049419612e-05, - "loss": 0.6535, - "step": 4855 - }, - { - "epoch": 0.5674918262494162, - "grad_norm": 0.6252102255821228, - "learning_rate": 2.4351875064697093e-05, - "loss": 0.659, - "step": 4860 - }, - { - "epoch": 0.5680756655768333, - "grad_norm": 0.6333979964256287, - "learning_rate": 2.4308862817497786e-05, - "loss": 0.654, - "step": 4865 - }, - { - "epoch": 0.5686595049042503, - "grad_norm": 0.6661773920059204, - "learning_rate": 2.4265862468189283e-05, - "loss": 0.6673, - "step": 4870 - }, - { - "epoch": 0.5692433442316674, - "grad_norm": 0.6298785209655762, - "learning_rate": 2.4222874177094823e-05, - "loss": 0.6661, - "step": 4875 - }, - { - "epoch": 0.5698271835590846, - "grad_norm": 0.6240935921669006, - "learning_rate": 2.4179898104492705e-05, - "loss": 0.6492, - "step": 4880 - }, - { - "epoch": 0.5704110228865016, - "grad_norm": 0.6606541872024536, - "learning_rate": 2.4136934410615646e-05, - "loss": 0.6621, - "step": 4885 - }, - { - "epoch": 0.5709948622139187, - "grad_norm": 0.6488502025604248, - "learning_rate": 2.4093983255650227e-05, - "loss": 0.6549, - "step": 4890 - }, - { - "epoch": 0.5715787015413358, - "grad_norm": 0.6129763126373291, - "learning_rate": 2.405104479973628e-05, - "loss": 0.6804, - "step": 4895 - }, - { - "epoch": 0.572162540868753, - "grad_norm": 0.626228392124176, - "learning_rate": 2.400811920296627e-05, - "loss": 0.6536, - "step": 4900 - }, - { - "epoch": 0.57274638019617, - "grad_norm": 0.6666534543037415, - "learning_rate": 2.396520662538474e-05, - "loss": 0.6354, - "step": 4905 - }, - { - "epoch": 0.5733302195235871, - "grad_norm": 0.6743848323822021, - "learning_rate": 2.3922307226987678e-05, - "loss": 0.6559, - "step": 4910 - }, - { - "epoch": 0.5739140588510042, - "grad_norm": 0.6935588121414185, - "learning_rate": 2.3879421167721944e-05, - "loss": 0.6769, - "step": 4915 - }, - { - "epoch": 0.5744978981784213, - "grad_norm": 0.6404367089271545, - "learning_rate": 2.383654860748466e-05, - "loss": 0.6768, - "step": 4920 - }, - { - "epoch": 0.5750817375058384, - "grad_norm": 0.6550564169883728, - "learning_rate": 2.379368970612261e-05, - "loss": 0.647, - "step": 4925 - }, - { - "epoch": 0.5756655768332555, - "grad_norm": 0.645498514175415, - "learning_rate": 2.375084462343167e-05, - "loss": 0.6448, - "step": 4930 - }, - { - "epoch": 0.5762494161606726, - "grad_norm": 0.6331825852394104, - "learning_rate": 2.370801351915617e-05, - "loss": 0.6479, - "step": 4935 - }, - { - "epoch": 0.5768332554880897, - "grad_norm": 0.6524755358695984, - "learning_rate": 2.3665196552988357e-05, - "loss": 0.6673, - "step": 4940 - }, - { - "epoch": 0.5774170948155067, - "grad_norm": 0.6634130477905273, - "learning_rate": 2.362239388456773e-05, - "loss": 0.6737, - "step": 4945 - }, - { - "epoch": 0.5780009341429239, - "grad_norm": 0.6262636780738831, - "learning_rate": 2.357960567348049e-05, - "loss": 0.6609, - "step": 4950 - }, - { - "epoch": 0.578584773470341, - "grad_norm": 0.642096221446991, - "learning_rate": 2.3536832079258952e-05, - "loss": 0.6608, - "step": 4955 - }, - { - "epoch": 0.5791686127977581, - "grad_norm": 0.6466443538665771, - "learning_rate": 2.3494073261380915e-05, - "loss": 0.6749, - "step": 4960 - }, - { - "epoch": 0.5797524521251751, - "grad_norm": 0.6250627636909485, - "learning_rate": 2.34513293792691e-05, - "loss": 0.6626, - "step": 4965 - }, - { - "epoch": 0.5803362914525922, - "grad_norm": 0.6803485155105591, - "learning_rate": 2.340860059229052e-05, - "loss": 0.6425, - "step": 4970 - }, - { - "epoch": 0.5809201307800094, - "grad_norm": 0.6826525330543518, - "learning_rate": 2.3365887059755925e-05, - "loss": 0.6762, - "step": 4975 - }, - { - "epoch": 0.5815039701074264, - "grad_norm": 0.6770920157432556, - "learning_rate": 2.3323188940919188e-05, - "loss": 0.6538, - "step": 4980 - }, - { - "epoch": 0.5820878094348435, - "grad_norm": 0.7086965441703796, - "learning_rate": 2.328050639497671e-05, - "loss": 0.6654, - "step": 4985 - }, - { - "epoch": 0.5826716487622606, - "grad_norm": 0.6263920068740845, - "learning_rate": 2.3237839581066828e-05, - "loss": 0.6542, - "step": 4990 - }, - { - "epoch": 0.5832554880896778, - "grad_norm": 0.7152082920074463, - "learning_rate": 2.3195188658269224e-05, - "loss": 0.649, - "step": 4995 - }, - { - "epoch": 0.5838393274170948, - "grad_norm": 0.6616200804710388, - "learning_rate": 2.3152553785604336e-05, - "loss": 0.6597, - "step": 5000 - }, - { - "epoch": 0.5844231667445119, - "grad_norm": 0.7396272420883179, - "learning_rate": 2.3109935122032754e-05, - "loss": 0.6472, - "step": 5005 - }, - { - "epoch": 0.585007006071929, - "grad_norm": 0.7102298736572266, - "learning_rate": 2.3067332826454647e-05, - "loss": 0.6641, - "step": 5010 - }, - { - "epoch": 0.5855908453993461, - "grad_norm": 0.6186151504516602, - "learning_rate": 2.3024747057709132e-05, - "loss": 0.677, - "step": 5015 - }, - { - "epoch": 0.5861746847267632, - "grad_norm": 0.7044642567634583, - "learning_rate": 2.2982177974573733e-05, - "loss": 0.6688, - "step": 5020 - }, - { - "epoch": 0.5867585240541803, - "grad_norm": 0.6526744961738586, - "learning_rate": 2.2939625735763743e-05, - "loss": 0.6542, - "step": 5025 - }, - { - "epoch": 0.5873423633815974, - "grad_norm": 0.7178552150726318, - "learning_rate": 2.2897090499931674e-05, - "loss": 0.643, - "step": 5030 - }, - { - "epoch": 0.5879262027090145, - "grad_norm": 0.8148519396781921, - "learning_rate": 2.285457242566662e-05, - "loss": 0.6381, - "step": 5035 - }, - { - "epoch": 0.5885100420364315, - "grad_norm": 0.7523735165596008, - "learning_rate": 2.2812071671493713e-05, - "loss": 0.6537, - "step": 5040 - }, - { - "epoch": 0.5890938813638487, - "grad_norm": 0.646355926990509, - "learning_rate": 2.2769588395873482e-05, - "loss": 0.6455, - "step": 5045 - }, - { - "epoch": 0.5896777206912658, - "grad_norm": 0.6858602166175842, - "learning_rate": 2.272712275720132e-05, - "loss": 0.6664, - "step": 5050 - }, - { - "epoch": 0.5902615600186829, - "grad_norm": 0.7080469131469727, - "learning_rate": 2.268467491380683e-05, - "loss": 0.6586, - "step": 5055 - }, - { - "epoch": 0.5908453993460999, - "grad_norm": 0.6169444918632507, - "learning_rate": 2.264224502395329e-05, - "loss": 0.661, - "step": 5060 - }, - { - "epoch": 0.591429238673517, - "grad_norm": 0.6571491956710815, - "learning_rate": 2.2599833245837032e-05, - "loss": 0.6796, - "step": 5065 - }, - { - "epoch": 0.5920130780009342, - "grad_norm": 0.7221782207489014, - "learning_rate": 2.2557439737586856e-05, - "loss": 0.673, - "step": 5070 - }, - { - "epoch": 0.5925969173283513, - "grad_norm": 0.6993756294250488, - "learning_rate": 2.2515064657263447e-05, - "loss": 0.6502, - "step": 5075 - }, - { - "epoch": 0.5931807566557683, - "grad_norm": 0.6881592869758606, - "learning_rate": 2.2472708162858792e-05, - "loss": 0.658, - "step": 5080 - }, - { - "epoch": 0.5937645959831854, - "grad_norm": 0.6408549547195435, - "learning_rate": 2.2430370412295566e-05, - "loss": 0.672, - "step": 5085 - }, - { - "epoch": 0.5943484353106026, - "grad_norm": 0.668697714805603, - "learning_rate": 2.2388051563426577e-05, - "loss": 0.6904, - "step": 5090 - }, - { - "epoch": 0.5949322746380196, - "grad_norm": 0.6839088797569275, - "learning_rate": 2.2345751774034135e-05, - "loss": 0.6563, - "step": 5095 - }, - { - "epoch": 0.5955161139654367, - "grad_norm": 0.6734657287597656, - "learning_rate": 2.230347120182951e-05, - "loss": 0.6721, - "step": 5100 - }, - { - "epoch": 0.5960999532928538, - "grad_norm": 0.6243215203285217, - "learning_rate": 2.226121000445232e-05, - "loss": 0.6536, - "step": 5105 - }, - { - "epoch": 0.5966837926202709, - "grad_norm": 0.6928072571754456, - "learning_rate": 2.2218968339469932e-05, - "loss": 0.6444, - "step": 5110 - }, - { - "epoch": 0.597267631947688, - "grad_norm": 0.6660923957824707, - "learning_rate": 2.2176746364376904e-05, - "loss": 0.6573, - "step": 5115 - }, - { - "epoch": 0.5978514712751051, - "grad_norm": 0.7336210012435913, - "learning_rate": 2.2134544236594374e-05, - "loss": 0.6386, - "step": 5120 - }, - { - "epoch": 0.5984353106025222, - "grad_norm": 0.7535378336906433, - "learning_rate": 2.2092362113469474e-05, - "loss": 0.663, - "step": 5125 - }, - { - "epoch": 0.5990191499299393, - "grad_norm": 0.6672455668449402, - "learning_rate": 2.2050200152274763e-05, - "loss": 0.6761, - "step": 5130 - }, - { - "epoch": 0.5996029892573563, - "grad_norm": 0.6607290506362915, - "learning_rate": 2.2008058510207635e-05, - "loss": 0.6751, - "step": 5135 - }, - { - "epoch": 0.6001868285847735, - "grad_norm": 0.6133511066436768, - "learning_rate": 2.1965937344389692e-05, - "loss": 0.652, - "step": 5140 - }, - { - "epoch": 0.6007706679121906, - "grad_norm": 0.6707193851470947, - "learning_rate": 2.1923836811866227e-05, - "loss": 0.654, - "step": 5145 - }, - { - "epoch": 0.6013545072396077, - "grad_norm": 0.6529686450958252, - "learning_rate": 2.188175706960559e-05, - "loss": 0.6605, - "step": 5150 - }, - { - "epoch": 0.6019383465670247, - "grad_norm": 0.6049532890319824, - "learning_rate": 2.1839698274498616e-05, - "loss": 0.6815, - "step": 5155 - }, - { - "epoch": 0.6025221858944418, - "grad_norm": 0.6727195978164673, - "learning_rate": 2.1797660583358032e-05, - "loss": 0.6586, - "step": 5160 - }, - { - "epoch": 0.603106025221859, - "grad_norm": 0.6590107083320618, - "learning_rate": 2.1755644152917903e-05, - "loss": 0.6487, - "step": 5165 - }, - { - "epoch": 0.6036898645492761, - "grad_norm": 0.6550741791725159, - "learning_rate": 2.1713649139833e-05, - "loss": 0.6723, - "step": 5170 - }, - { - "epoch": 0.6042737038766931, - "grad_norm": 0.6399280428886414, - "learning_rate": 2.1671675700678257e-05, - "loss": 0.6606, - "step": 5175 - }, - { - "epoch": 0.6048575432041102, - "grad_norm": 0.6458907127380371, - "learning_rate": 2.1629723991948176e-05, - "loss": 0.6625, - "step": 5180 - }, - { - "epoch": 0.6054413825315274, - "grad_norm": 0.6260433793067932, - "learning_rate": 2.1587794170056213e-05, - "loss": 0.6652, - "step": 5185 - }, - { - "epoch": 0.6060252218589445, - "grad_norm": 0.6338814496994019, - "learning_rate": 2.154588639133425e-05, - "loss": 0.673, - "step": 5190 - }, - { - "epoch": 0.6066090611863615, - "grad_norm": 0.7035873532295227, - "learning_rate": 2.1504000812031966e-05, - "loss": 0.6665, - "step": 5195 - }, - { - "epoch": 0.6071929005137786, - "grad_norm": 0.6172133088111877, - "learning_rate": 2.1462137588316268e-05, - "loss": 0.6409, - "step": 5200 - }, - { - "epoch": 0.6077767398411957, - "grad_norm": 0.702854573726654, - "learning_rate": 2.142029687627074e-05, - "loss": 0.6493, - "step": 5205 - }, - { - "epoch": 0.6083605791686127, - "grad_norm": 0.6975967884063721, - "learning_rate": 2.1378478831895e-05, - "loss": 0.6443, - "step": 5210 - }, - { - "epoch": 0.6089444184960299, - "grad_norm": 0.7312926054000854, - "learning_rate": 2.133668361110417e-05, - "loss": 0.6673, - "step": 5215 - }, - { - "epoch": 0.609528257823447, - "grad_norm": 0.6447761058807373, - "learning_rate": 2.129491136972826e-05, - "loss": 0.638, - "step": 5220 - }, - { - "epoch": 0.6101120971508641, - "grad_norm": 0.6958921551704407, - "learning_rate": 2.125316226351163e-05, - "loss": 0.6707, - "step": 5225 - }, - { - "epoch": 0.6106959364782811, - "grad_norm": 0.6645193696022034, - "learning_rate": 2.1211436448112356e-05, - "loss": 0.6581, - "step": 5230 - }, - { - "epoch": 0.6112797758056983, - "grad_norm": 0.6870747208595276, - "learning_rate": 2.1169734079101684e-05, - "loss": 0.6643, - "step": 5235 - }, - { - "epoch": 0.6118636151331154, - "grad_norm": 0.7142140865325928, - "learning_rate": 2.1128055311963453e-05, - "loss": 0.6589, - "step": 5240 - }, - { - "epoch": 0.6124474544605325, - "grad_norm": 0.7099840044975281, - "learning_rate": 2.1086400302093483e-05, - "loss": 0.664, - "step": 5245 - }, - { - "epoch": 0.6130312937879495, - "grad_norm": 0.648786723613739, - "learning_rate": 2.104476920479905e-05, - "loss": 0.6515, - "step": 5250 - }, - { - "epoch": 0.6136151331153666, - "grad_norm": 0.6810446381568909, - "learning_rate": 2.1003162175298234e-05, - "loss": 0.6426, - "step": 5255 - }, - { - "epoch": 0.6141989724427838, - "grad_norm": 0.6511048078536987, - "learning_rate": 2.0961579368719407e-05, - "loss": 0.661, - "step": 5260 - }, - { - "epoch": 0.6147828117702009, - "grad_norm": 0.6395782828330994, - "learning_rate": 2.0920020940100626e-05, - "loss": 0.6681, - "step": 5265 - }, - { - "epoch": 0.6153666510976179, - "grad_norm": 0.646837055683136, - "learning_rate": 2.087848704438905e-05, - "loss": 0.661, - "step": 5270 - }, - { - "epoch": 0.615950490425035, - "grad_norm": 0.6477077007293701, - "learning_rate": 2.0836977836440364e-05, - "loss": 0.6418, - "step": 5275 - }, - { - "epoch": 0.6165343297524521, - "grad_norm": 0.6611143946647644, - "learning_rate": 2.0795493471018222e-05, - "loss": 0.6709, - "step": 5280 - }, - { - "epoch": 0.6171181690798693, - "grad_norm": 0.6393887400627136, - "learning_rate": 2.075403410279364e-05, - "loss": 0.6423, - "step": 5285 - }, - { - "epoch": 0.6177020084072863, - "grad_norm": 0.6555142998695374, - "learning_rate": 2.0712599886344447e-05, - "loss": 0.6472, - "step": 5290 - }, - { - "epoch": 0.6182858477347034, - "grad_norm": 0.7802370190620422, - "learning_rate": 2.067119097615468e-05, - "loss": 0.6657, - "step": 5295 - }, - { - "epoch": 0.6188696870621205, - "grad_norm": 0.6765434741973877, - "learning_rate": 2.0629807526614037e-05, - "loss": 0.6704, - "step": 5300 - }, - { - "epoch": 0.6194535263895375, - "grad_norm": 0.6658326387405396, - "learning_rate": 2.0588449692017287e-05, - "loss": 0.6513, - "step": 5305 - }, - { - "epoch": 0.6200373657169547, - "grad_norm": 0.6671853065490723, - "learning_rate": 2.054711762656369e-05, - "loss": 0.6702, - "step": 5310 - }, - { - "epoch": 0.6206212050443718, - "grad_norm": 0.6564173698425293, - "learning_rate": 2.0505811484356424e-05, - "loss": 0.6801, - "step": 5315 - }, - { - "epoch": 0.6212050443717889, - "grad_norm": 0.7250168323516846, - "learning_rate": 2.0464531419402026e-05, - "loss": 0.651, - "step": 5320 - }, - { - "epoch": 0.6217888836992059, - "grad_norm": 0.6302818059921265, - "learning_rate": 2.0423277585609806e-05, - "loss": 0.6583, - "step": 5325 - }, - { - "epoch": 0.622372723026623, - "grad_norm": 0.6456112861633301, - "learning_rate": 2.038205013679127e-05, - "loss": 0.6639, - "step": 5330 - }, - { - "epoch": 0.6229565623540402, - "grad_norm": 0.6412529349327087, - "learning_rate": 2.034084922665953e-05, - "loss": 0.6451, - "step": 5335 - }, - { - "epoch": 0.6235404016814573, - "grad_norm": 0.614487886428833, - "learning_rate": 2.0299675008828783e-05, - "loss": 0.6335, - "step": 5340 - }, - { - "epoch": 0.6241242410088743, - "grad_norm": 0.633294403553009, - "learning_rate": 2.025852763681369e-05, - "loss": 0.6409, - "step": 5345 - }, - { - "epoch": 0.6247080803362914, - "grad_norm": 0.6889130473136902, - "learning_rate": 2.021740726402882e-05, - "loss": 0.6556, - "step": 5350 - }, - { - "epoch": 0.6252919196637086, - "grad_norm": 0.6162689924240112, - "learning_rate": 2.0176314043788077e-05, - "loss": 0.6615, - "step": 5355 - }, - { - "epoch": 0.6258757589911257, - "grad_norm": 0.7135904431343079, - "learning_rate": 2.0135248129304124e-05, - "loss": 0.6468, - "step": 5360 - }, - { - "epoch": 0.6264595983185427, - "grad_norm": 0.6348279714584351, - "learning_rate": 2.009420967368784e-05, - "loss": 0.6386, - "step": 5365 - }, - { - "epoch": 0.6270434376459598, - "grad_norm": 0.608452558517456, - "learning_rate": 2.00531988299477e-05, - "loss": 0.6655, - "step": 5370 - }, - { - "epoch": 0.627627276973377, - "grad_norm": 0.6467285752296448, - "learning_rate": 2.0012215750989242e-05, - "loss": 0.6525, - "step": 5375 - }, - { - "epoch": 0.6282111163007941, - "grad_norm": 0.6569520235061646, - "learning_rate": 1.997126058961448e-05, - "loss": 0.6571, - "step": 5380 - }, - { - "epoch": 0.6287949556282111, - "grad_norm": 0.6173200607299805, - "learning_rate": 1.9930333498521354e-05, - "loss": 0.6435, - "step": 5385 - }, - { - "epoch": 0.6293787949556282, - "grad_norm": 0.6818342804908752, - "learning_rate": 1.9889434630303118e-05, - "loss": 0.6587, - "step": 5390 - }, - { - "epoch": 0.6299626342830453, - "grad_norm": 0.6321549415588379, - "learning_rate": 1.9848564137447823e-05, - "loss": 0.6562, - "step": 5395 - }, - { - "epoch": 0.6305464736104625, - "grad_norm": 0.6261130571365356, - "learning_rate": 1.9807722172337724e-05, - "loss": 0.6495, - "step": 5400 - }, - { - "epoch": 0.6311303129378795, - "grad_norm": 0.618937075138092, - "learning_rate": 1.9766908887248697e-05, - "loss": 0.6524, - "step": 5405 - }, - { - "epoch": 0.6317141522652966, - "grad_norm": 0.6402348279953003, - "learning_rate": 1.9726124434349706e-05, - "loss": 0.6478, - "step": 5410 - }, - { - "epoch": 0.6322979915927137, - "grad_norm": 0.6420966982841492, - "learning_rate": 1.9685368965702204e-05, - "loss": 0.6703, - "step": 5415 - }, - { - "epoch": 0.6328818309201307, - "grad_norm": 0.62251216173172, - "learning_rate": 1.9644642633259575e-05, - "loss": 0.6306, - "step": 5420 - }, - { - "epoch": 0.6334656702475479, - "grad_norm": 0.7122980356216431, - "learning_rate": 1.960394558886659e-05, - "loss": 0.6441, - "step": 5425 - }, - { - "epoch": 0.634049509574965, - "grad_norm": 0.6866421699523926, - "learning_rate": 1.95632779842588e-05, - "loss": 0.6533, - "step": 5430 - }, - { - "epoch": 0.6346333489023821, - "grad_norm": 0.6483855843544006, - "learning_rate": 1.9522639971062008e-05, - "loss": 0.6696, - "step": 5435 - }, - { - "epoch": 0.6352171882297991, - "grad_norm": 0.5964992046356201, - "learning_rate": 1.948203170079168e-05, - "loss": 0.6613, - "step": 5440 - }, - { - "epoch": 0.6358010275572162, - "grad_norm": 0.5932938456535339, - "learning_rate": 1.9441453324852387e-05, - "loss": 0.6358, - "step": 5445 - }, - { - "epoch": 0.6363848668846334, - "grad_norm": 0.6805095672607422, - "learning_rate": 1.9400904994537257e-05, - "loss": 0.6519, - "step": 5450 - }, - { - "epoch": 0.6369687062120505, - "grad_norm": 0.6289645433425903, - "learning_rate": 1.936038686102736e-05, - "loss": 0.6445, - "step": 5455 - }, - { - "epoch": 0.6375525455394675, - "grad_norm": 0.6276021003723145, - "learning_rate": 1.931989907539123e-05, - "loss": 0.6608, - "step": 5460 - }, - { - "epoch": 0.6381363848668846, - "grad_norm": 0.659652829170227, - "learning_rate": 1.92794417885842e-05, - "loss": 0.6661, - "step": 5465 - }, - { - "epoch": 0.6387202241943017, - "grad_norm": 0.6643206477165222, - "learning_rate": 1.9239015151447927e-05, - "loss": 0.6506, - "step": 5470 - }, - { - "epoch": 0.6393040635217189, - "grad_norm": 0.690845787525177, - "learning_rate": 1.919861931470978e-05, - "loss": 0.6584, - "step": 5475 - }, - { - "epoch": 0.6398879028491359, - "grad_norm": 0.6345598101615906, - "learning_rate": 1.9158254428982293e-05, - "loss": 0.6557, - "step": 5480 - }, - { - "epoch": 0.640471742176553, - "grad_norm": 0.6204259395599365, - "learning_rate": 1.9117920644762594e-05, - "loss": 0.6719, - "step": 5485 - }, - { - "epoch": 0.6410555815039701, - "grad_norm": 0.6789532899856567, - "learning_rate": 1.907761811243186e-05, - "loss": 0.6417, - "step": 5490 - }, - { - "epoch": 0.6416394208313873, - "grad_norm": 0.6658161878585815, - "learning_rate": 1.9037346982254755e-05, - "loss": 0.6517, - "step": 5495 - }, - { - "epoch": 0.6422232601588043, - "grad_norm": 0.642288088798523, - "learning_rate": 1.8997107404378846e-05, - "loss": 0.656, - "step": 5500 - }, - { - "epoch": 0.6428070994862214, - "grad_norm": 0.6322532892227173, - "learning_rate": 1.8956899528834065e-05, - "loss": 0.6382, - "step": 5505 - }, - { - "epoch": 0.6433909388136385, - "grad_norm": 0.6494859457015991, - "learning_rate": 1.8916723505532157e-05, - "loss": 0.646, - "step": 5510 - }, - { - "epoch": 0.6439747781410555, - "grad_norm": 0.6644103527069092, - "learning_rate": 1.8876579484266094e-05, - "loss": 0.6405, - "step": 5515 - }, - { - "epoch": 0.6445586174684726, - "grad_norm": 0.6574621796607971, - "learning_rate": 1.8836467614709535e-05, - "loss": 0.6596, - "step": 5520 - }, - { - "epoch": 0.6451424567958898, - "grad_norm": 0.6571225523948669, - "learning_rate": 1.8796388046416253e-05, - "loss": 0.6624, - "step": 5525 - }, - { - "epoch": 0.6457262961233069, - "grad_norm": 0.6391655802726746, - "learning_rate": 1.875634092881963e-05, - "loss": 0.6551, - "step": 5530 - }, - { - "epoch": 0.6463101354507239, - "grad_norm": 0.6903751492500305, - "learning_rate": 1.8716326411232016e-05, - "loss": 0.6532, - "step": 5535 - }, - { - "epoch": 0.646893974778141, - "grad_norm": 0.6607362627983093, - "learning_rate": 1.8676344642844217e-05, - "loss": 0.6425, - "step": 5540 - }, - { - "epoch": 0.6474778141055582, - "grad_norm": 0.6822094917297363, - "learning_rate": 1.8636395772724952e-05, - "loss": 0.6523, - "step": 5545 - }, - { - "epoch": 0.6480616534329753, - "grad_norm": 0.6002434492111206, - "learning_rate": 1.8596479949820273e-05, - "loss": 0.652, - "step": 5550 - }, - { - "epoch": 0.6486454927603923, - "grad_norm": 0.6083955764770508, - "learning_rate": 1.8556597322953035e-05, - "loss": 0.6633, - "step": 5555 - }, - { - "epoch": 0.6492293320878094, - "grad_norm": 0.6254439353942871, - "learning_rate": 1.8516748040822295e-05, - "loss": 0.6465, - "step": 5560 - }, - { - "epoch": 0.6498131714152265, - "grad_norm": 0.776471734046936, - "learning_rate": 1.847693225200281e-05, - "loss": 0.6522, - "step": 5565 - }, - { - "epoch": 0.6503970107426437, - "grad_norm": 0.6252726912498474, - "learning_rate": 1.843715010494445e-05, - "loss": 0.6557, - "step": 5570 - }, - { - "epoch": 0.6509808500700607, - "grad_norm": 0.6479578018188477, - "learning_rate": 1.839740174797166e-05, - "loss": 0.6692, - "step": 5575 - }, - { - "epoch": 0.6515646893974778, - "grad_norm": 0.6455087065696716, - "learning_rate": 1.8357687329282896e-05, - "loss": 0.6587, - "step": 5580 - }, - { - "epoch": 0.6521485287248949, - "grad_norm": 0.6654039621353149, - "learning_rate": 1.831800699695008e-05, - "loss": 0.652, - "step": 5585 - }, - { - "epoch": 0.652732368052312, - "grad_norm": 0.6566126346588135, - "learning_rate": 1.827836089891805e-05, - "loss": 0.66, - "step": 5590 - }, - { - "epoch": 0.6533162073797291, - "grad_norm": 0.6320335865020752, - "learning_rate": 1.823874918300399e-05, - "loss": 0.6518, - "step": 5595 - }, - { - "epoch": 0.6539000467071462, - "grad_norm": 0.6289529800415039, - "learning_rate": 1.8199171996896912e-05, - "loss": 0.6521, - "step": 5600 - }, - { - "epoch": 0.6544838860345633, - "grad_norm": 0.6901244521141052, - "learning_rate": 1.8159629488157082e-05, - "loss": 0.6702, - "step": 5605 - }, - { - "epoch": 0.6550677253619804, - "grad_norm": 0.643057644367218, - "learning_rate": 1.8120121804215466e-05, - "loss": 0.6539, - "step": 5610 - }, - { - "epoch": 0.6556515646893974, - "grad_norm": 0.6587081551551819, - "learning_rate": 1.8080649092373187e-05, - "loss": 0.6587, - "step": 5615 - }, - { - "epoch": 0.6562354040168146, - "grad_norm": 0.6559469103813171, - "learning_rate": 1.8041211499800992e-05, - "loss": 0.649, - "step": 5620 - }, - { - "epoch": 0.6568192433442317, - "grad_norm": 0.6475241184234619, - "learning_rate": 1.8001809173538676e-05, - "loss": 0.6451, - "step": 5625 - }, - { - "epoch": 0.6574030826716487, - "grad_norm": 0.6222142577171326, - "learning_rate": 1.796244226049455e-05, - "loss": 0.6546, - "step": 5630 - }, - { - "epoch": 0.6579869219990658, - "grad_norm": 0.628761887550354, - "learning_rate": 1.792311090744489e-05, - "loss": 0.6239, - "step": 5635 - }, - { - "epoch": 0.658570761326483, - "grad_norm": 0.658136785030365, - "learning_rate": 1.7883815261033393e-05, - "loss": 0.6523, - "step": 5640 - }, - { - "epoch": 0.6591546006539001, - "grad_norm": 0.6283432841300964, - "learning_rate": 1.7844555467770624e-05, - "loss": 0.666, - "step": 5645 - }, - { - "epoch": 0.6597384399813171, - "grad_norm": 0.610302746295929, - "learning_rate": 1.7805331674033466e-05, - "loss": 0.6537, - "step": 5650 - }, - { - "epoch": 0.6603222793087342, - "grad_norm": 0.6804009079933167, - "learning_rate": 1.776614402606459e-05, - "loss": 0.652, - "step": 5655 - }, - { - "epoch": 0.6609061186361513, - "grad_norm": 0.6688585877418518, - "learning_rate": 1.7726992669971904e-05, - "loss": 0.6518, - "step": 5660 - }, - { - "epoch": 0.6614899579635685, - "grad_norm": 0.6098734140396118, - "learning_rate": 1.768787775172799e-05, - "loss": 0.6546, - "step": 5665 - }, - { - "epoch": 0.6620737972909855, - "grad_norm": 0.6555027365684509, - "learning_rate": 1.7648799417169588e-05, - "loss": 0.654, - "step": 5670 - }, - { - "epoch": 0.6626576366184026, - "grad_norm": 0.6684269309043884, - "learning_rate": 1.7609757811997023e-05, - "loss": 0.6502, - "step": 5675 - }, - { - "epoch": 0.6632414759458197, - "grad_norm": 0.676682710647583, - "learning_rate": 1.75707530817737e-05, - "loss": 0.6497, - "step": 5680 - }, - { - "epoch": 0.6638253152732368, - "grad_norm": 0.6433451771736145, - "learning_rate": 1.753178537192551e-05, - "loss": 0.6536, - "step": 5685 - }, - { - "epoch": 0.6644091546006539, - "grad_norm": 0.6749869585037231, - "learning_rate": 1.7492854827740353e-05, - "loss": 0.6603, - "step": 5690 - }, - { - "epoch": 0.664992993928071, - "grad_norm": 0.6494778394699097, - "learning_rate": 1.7453961594367528e-05, - "loss": 0.6524, - "step": 5695 - }, - { - "epoch": 0.6655768332554881, - "grad_norm": 0.6811242699623108, - "learning_rate": 1.741510581681724e-05, - "loss": 0.6484, - "step": 5700 - }, - { - "epoch": 0.6661606725829052, - "grad_norm": 0.7152803540229797, - "learning_rate": 1.737628763996005e-05, - "loss": 0.6666, - "step": 5705 - }, - { - "epoch": 0.6667445119103222, - "grad_norm": 0.7093722224235535, - "learning_rate": 1.7337507208526295e-05, - "loss": 0.6542, - "step": 5710 - }, - { - "epoch": 0.6673283512377394, - "grad_norm": 0.6127923727035522, - "learning_rate": 1.729876466710561e-05, - "loss": 0.6531, - "step": 5715 - }, - { - "epoch": 0.6679121905651565, - "grad_norm": 0.6610433459281921, - "learning_rate": 1.726006016014637e-05, - "loss": 0.6704, - "step": 5720 - }, - { - "epoch": 0.6684960298925736, - "grad_norm": 0.6480225324630737, - "learning_rate": 1.7221393831955102e-05, - "loss": 0.6523, - "step": 5725 - }, - { - "epoch": 0.6690798692199906, - "grad_norm": 0.7205737829208374, - "learning_rate": 1.718276582669602e-05, - "loss": 0.6413, - "step": 5730 - }, - { - "epoch": 0.6696637085474078, - "grad_norm": 0.627311110496521, - "learning_rate": 1.7144176288390448e-05, - "loss": 0.6417, - "step": 5735 - }, - { - "epoch": 0.6702475478748249, - "grad_norm": 0.6376802325248718, - "learning_rate": 1.7105625360916276e-05, - "loss": 0.6619, - "step": 5740 - }, - { - "epoch": 0.6708313872022419, - "grad_norm": 0.6563093662261963, - "learning_rate": 1.7067113188007457e-05, - "loss": 0.6395, - "step": 5745 - }, - { - "epoch": 0.671415226529659, - "grad_norm": 0.6221945881843567, - "learning_rate": 1.7028639913253426e-05, - "loss": 0.6532, - "step": 5750 - }, - { - "epoch": 0.6719990658570761, - "grad_norm": 0.6594525575637817, - "learning_rate": 1.6990205680098612e-05, - "loss": 0.6619, - "step": 5755 - }, - { - "epoch": 0.6725829051844933, - "grad_norm": 0.6265227794647217, - "learning_rate": 1.695181063184187e-05, - "loss": 0.66, - "step": 5760 - }, - { - "epoch": 0.6731667445119103, - "grad_norm": 0.6235857605934143, - "learning_rate": 1.6913454911635954e-05, - "loss": 0.646, - "step": 5765 - }, - { - "epoch": 0.6737505838393274, - "grad_norm": 0.6509799361228943, - "learning_rate": 1.6875138662486997e-05, - "loss": 0.6411, - "step": 5770 - }, - { - "epoch": 0.6743344231667445, - "grad_norm": 0.677470862865448, - "learning_rate": 1.6836862027253963e-05, - "loss": 0.6348, - "step": 5775 - }, - { - "epoch": 0.6749182624941616, - "grad_norm": 0.7517759203910828, - "learning_rate": 1.6798625148648113e-05, - "loss": 0.6562, - "step": 5780 - }, - { - "epoch": 0.6755021018215787, - "grad_norm": 0.6516550779342651, - "learning_rate": 1.6760428169232483e-05, - "loss": 0.637, - "step": 5785 - }, - { - "epoch": 0.6760859411489958, - "grad_norm": 0.6019690036773682, - "learning_rate": 1.672227123142136e-05, - "loss": 0.6578, - "step": 5790 - }, - { - "epoch": 0.6766697804764129, - "grad_norm": 0.6093150973320007, - "learning_rate": 1.668415447747971e-05, - "loss": 0.6332, - "step": 5795 - }, - { - "epoch": 0.67725361980383, - "grad_norm": 0.649224042892456, - "learning_rate": 1.6646078049522706e-05, - "loss": 0.6387, - "step": 5800 - }, - { - "epoch": 0.677837459131247, - "grad_norm": 0.6444030404090881, - "learning_rate": 1.660804208951516e-05, - "loss": 0.6525, - "step": 5805 - }, - { - "epoch": 0.6784212984586642, - "grad_norm": 0.5827856063842773, - "learning_rate": 1.6570046739270988e-05, - "loss": 0.6522, - "step": 5810 - }, - { - "epoch": 0.6790051377860813, - "grad_norm": 0.59112948179245, - "learning_rate": 1.6532092140452725e-05, - "loss": 0.6545, - "step": 5815 - }, - { - "epoch": 0.6795889771134984, - "grad_norm": 0.6026985049247742, - "learning_rate": 1.649417843457094e-05, - "loss": 0.6375, - "step": 5820 - }, - { - "epoch": 0.6801728164409154, - "grad_norm": 0.668092668056488, - "learning_rate": 1.6456305762983742e-05, - "loss": 0.6559, - "step": 5825 - }, - { - "epoch": 0.6807566557683326, - "grad_norm": 0.634980320930481, - "learning_rate": 1.6418474266896257e-05, - "loss": 0.643, - "step": 5830 - }, - { - "epoch": 0.6813404950957497, - "grad_norm": 0.6863172650337219, - "learning_rate": 1.6380684087360088e-05, - "loss": 0.6325, - "step": 5835 - }, - { - "epoch": 0.6819243344231667, - "grad_norm": 0.6540225744247437, - "learning_rate": 1.6342935365272785e-05, - "loss": 0.6598, - "step": 5840 - }, - { - "epoch": 0.6825081737505838, - "grad_norm": 0.6378657817840576, - "learning_rate": 1.6305228241377347e-05, - "loss": 0.6343, - "step": 5845 - }, - { - "epoch": 0.6830920130780009, - "grad_norm": 0.6304488778114319, - "learning_rate": 1.6267562856261638e-05, - "loss": 0.6461, - "step": 5850 - }, - { - "epoch": 0.6836758524054181, - "grad_norm": 0.6447546482086182, - "learning_rate": 1.6229939350357952e-05, - "loss": 0.6425, - "step": 5855 - }, - { - "epoch": 0.6842596917328351, - "grad_norm": 0.623534083366394, - "learning_rate": 1.6192357863942415e-05, - "loss": 0.6465, - "step": 5860 - }, - { - "epoch": 0.6848435310602522, - "grad_norm": 0.6575029492378235, - "learning_rate": 1.615481853713448e-05, - "loss": 0.6332, - "step": 5865 - }, - { - "epoch": 0.6854273703876693, - "grad_norm": 0.6916881203651428, - "learning_rate": 1.6117321509896422e-05, - "loss": 0.6438, - "step": 5870 - }, - { - "epoch": 0.6860112097150864, - "grad_norm": 0.6092137694358826, - "learning_rate": 1.60798669220328e-05, - "loss": 0.6407, - "step": 5875 - }, - { - "epoch": 0.6865950490425035, - "grad_norm": 0.6791859865188599, - "learning_rate": 1.6042454913189946e-05, - "loss": 0.674, - "step": 5880 - }, - { - "epoch": 0.6871788883699206, - "grad_norm": 0.6624159216880798, - "learning_rate": 1.600508562285544e-05, - "loss": 0.6713, - "step": 5885 - }, - { - "epoch": 0.6877627276973377, - "grad_norm": 0.6085835695266724, - "learning_rate": 1.5967759190357585e-05, - "loss": 0.6366, - "step": 5890 - }, - { - "epoch": 0.6883465670247548, - "grad_norm": 0.6415414214134216, - "learning_rate": 1.5930475754864898e-05, - "loss": 0.6474, - "step": 5895 - }, - { - "epoch": 0.6889304063521718, - "grad_norm": 0.6924893856048584, - "learning_rate": 1.5893235455385575e-05, - "loss": 0.6607, - "step": 5900 - }, - { - "epoch": 0.689514245679589, - "grad_norm": 0.6112133860588074, - "learning_rate": 1.5856038430766994e-05, - "loss": 0.6499, - "step": 5905 - }, - { - "epoch": 0.6900980850070061, - "grad_norm": 0.6420413851737976, - "learning_rate": 1.5818884819695184e-05, - "loss": 0.6419, - "step": 5910 - }, - { - "epoch": 0.6906819243344232, - "grad_norm": 0.6981012225151062, - "learning_rate": 1.5781774760694304e-05, - "loss": 0.631, - "step": 5915 - }, - { - "epoch": 0.6912657636618402, - "grad_norm": 0.6580440402030945, - "learning_rate": 1.5744708392126138e-05, - "loss": 0.6433, - "step": 5920 - }, - { - "epoch": 0.6918496029892574, - "grad_norm": 0.6421406865119934, - "learning_rate": 1.5707685852189573e-05, - "loss": 0.6538, - "step": 5925 - }, - { - "epoch": 0.6924334423166745, - "grad_norm": 0.635778546333313, - "learning_rate": 1.5670707278920084e-05, - "loss": 0.6522, - "step": 5930 - }, - { - "epoch": 0.6930172816440916, - "grad_norm": 0.6497576236724854, - "learning_rate": 1.563377281018922e-05, - "loss": 0.6437, - "step": 5935 - }, - { - "epoch": 0.6936011209715086, - "grad_norm": 0.6821728944778442, - "learning_rate": 1.5596882583704092e-05, - "loss": 0.6698, - "step": 5940 - }, - { - "epoch": 0.6941849602989257, - "grad_norm": 0.6182253956794739, - "learning_rate": 1.5560036737006856e-05, - "loss": 0.6462, - "step": 5945 - }, - { - "epoch": 0.6947687996263429, - "grad_norm": 0.6561591029167175, - "learning_rate": 1.5523235407474195e-05, - "loss": 0.6512, - "step": 5950 - }, - { - "epoch": 0.6953526389537599, - "grad_norm": 0.6747484803199768, - "learning_rate": 1.5486478732316827e-05, - "loss": 0.6684, - "step": 5955 - }, - { - "epoch": 0.695936478281177, - "grad_norm": 0.6135779023170471, - "learning_rate": 1.5449766848578968e-05, - "loss": 0.6467, - "step": 5960 - }, - { - "epoch": 0.6965203176085941, - "grad_norm": 0.6598634123802185, - "learning_rate": 1.541309989313784e-05, - "loss": 0.668, - "step": 5965 - }, - { - "epoch": 0.6971041569360112, - "grad_norm": 0.6440311670303345, - "learning_rate": 1.5376478002703154e-05, - "loss": 0.6542, - "step": 5970 - }, - { - "epoch": 0.6976879962634283, - "grad_norm": 0.6345959901809692, - "learning_rate": 1.5339901313816584e-05, - "loss": 0.6397, - "step": 5975 - }, - { - "epoch": 0.6982718355908454, - "grad_norm": 0.6342206597328186, - "learning_rate": 1.5303369962851298e-05, - "loss": 0.6594, - "step": 5980 - }, - { - "epoch": 0.6988556749182625, - "grad_norm": 0.5983539819717407, - "learning_rate": 1.5266884086011406e-05, - "loss": 0.6545, - "step": 5985 - }, - { - "epoch": 0.6994395142456796, - "grad_norm": 0.6113163828849792, - "learning_rate": 1.5230443819331492e-05, - "loss": 0.6436, - "step": 5990 - }, - { - "epoch": 0.7000233535730966, - "grad_norm": 0.6457630395889282, - "learning_rate": 1.5194049298676061e-05, - "loss": 0.6285, - "step": 5995 - }, - { - "epoch": 0.7006071929005138, - "grad_norm": 0.686281144618988, - "learning_rate": 1.515770065973907e-05, - "loss": 0.6432, - "step": 6000 - }, - { - "epoch": 0.7011910322279309, - "grad_norm": 0.6474929451942444, - "learning_rate": 1.5121398038043421e-05, - "loss": 0.6578, - "step": 6005 - }, - { - "epoch": 0.701774871555348, - "grad_norm": 0.6438776254653931, - "learning_rate": 1.5085141568940419e-05, - "loss": 0.6409, - "step": 6010 - }, - { - "epoch": 0.702358710882765, - "grad_norm": 0.6099674105644226, - "learning_rate": 1.5048931387609321e-05, - "loss": 0.6378, - "step": 6015 - }, - { - "epoch": 0.7029425502101821, - "grad_norm": 0.6138654351234436, - "learning_rate": 1.501276762905679e-05, - "loss": 0.6526, - "step": 6020 - }, - { - "epoch": 0.7035263895375993, - "grad_norm": 0.7578221559524536, - "learning_rate": 1.4976650428116401e-05, - "loss": 0.6596, - "step": 6025 - }, - { - "epoch": 0.7041102288650164, - "grad_norm": 0.609033465385437, - "learning_rate": 1.4940579919448147e-05, - "loss": 0.644, - "step": 6030 - }, - { - "epoch": 0.7046940681924334, - "grad_norm": 0.62203449010849, - "learning_rate": 1.4904556237537936e-05, - "loss": 0.6821, - "step": 6035 - }, - { - "epoch": 0.7052779075198505, - "grad_norm": 0.6592409014701843, - "learning_rate": 1.4868579516697079e-05, - "loss": 0.6509, - "step": 6040 - }, - { - "epoch": 0.7058617468472677, - "grad_norm": 0.6275002956390381, - "learning_rate": 1.4832649891061811e-05, - "loss": 0.6417, - "step": 6045 - }, - { - "epoch": 0.7064455861746848, - "grad_norm": 0.6480668783187866, - "learning_rate": 1.4796767494592757e-05, - "loss": 0.6462, - "step": 6050 - }, - { - "epoch": 0.7070294255021018, - "grad_norm": 0.6026184558868408, - "learning_rate": 1.4760932461074467e-05, - "loss": 0.6411, - "step": 6055 - }, - { - "epoch": 0.7076132648295189, - "grad_norm": 0.6246352195739746, - "learning_rate": 1.4725144924114891e-05, - "loss": 0.6362, - "step": 6060 - }, - { - "epoch": 0.708197104156936, - "grad_norm": 0.6344712376594543, - "learning_rate": 1.4689405017144908e-05, - "loss": 0.6402, - "step": 6065 - }, - { - "epoch": 0.708780943484353, - "grad_norm": 0.7218906283378601, - "learning_rate": 1.4653712873417796e-05, - "loss": 0.6576, - "step": 6070 - }, - { - "epoch": 0.7093647828117702, - "grad_norm": 0.6179711222648621, - "learning_rate": 1.4618068626008755e-05, - "loss": 0.6322, - "step": 6075 - }, - { - "epoch": 0.7099486221391873, - "grad_norm": 0.616460919380188, - "learning_rate": 1.4582472407814419e-05, - "loss": 0.6433, - "step": 6080 - }, - { - "epoch": 0.7105324614666044, - "grad_norm": 0.5981026887893677, - "learning_rate": 1.4546924351552333e-05, - "loss": 0.6549, - "step": 6085 - }, - { - "epoch": 0.7111163007940214, - "grad_norm": 0.634906530380249, - "learning_rate": 1.4511424589760486e-05, - "loss": 0.6353, - "step": 6090 - }, - { - "epoch": 0.7117001401214386, - "grad_norm": 0.6406628489494324, - "learning_rate": 1.4475973254796799e-05, - "loss": 0.665, - "step": 6095 - }, - { - "epoch": 0.7122839794488557, - "grad_norm": 0.6126695871353149, - "learning_rate": 1.4440570478838645e-05, - "loss": 0.6545, - "step": 6100 - }, - { - "epoch": 0.7128678187762728, - "grad_norm": 0.648638904094696, - "learning_rate": 1.440521639388233e-05, - "loss": 0.6449, - "step": 6105 - }, - { - "epoch": 0.7134516581036898, - "grad_norm": 0.6560222506523132, - "learning_rate": 1.436991113174265e-05, - "loss": 0.657, - "step": 6110 - }, - { - "epoch": 0.714035497431107, - "grad_norm": 0.6068403124809265, - "learning_rate": 1.4334654824052351e-05, - "loss": 0.6363, - "step": 6115 - }, - { - "epoch": 0.7146193367585241, - "grad_norm": 0.5890648365020752, - "learning_rate": 1.429944760226164e-05, - "loss": 0.656, - "step": 6120 - }, - { - "epoch": 0.7152031760859412, - "grad_norm": 0.6379078030586243, - "learning_rate": 1.4264289597637741e-05, - "loss": 0.6596, - "step": 6125 - }, - { - "epoch": 0.7157870154133582, - "grad_norm": 0.6202497482299805, - "learning_rate": 1.4229180941264364e-05, - "loss": 0.6348, - "step": 6130 - }, - { - "epoch": 0.7163708547407753, - "grad_norm": 0.658928394317627, - "learning_rate": 1.4194121764041224e-05, - "loss": 0.6399, - "step": 6135 - }, - { - "epoch": 0.7169546940681925, - "grad_norm": 0.6576935648918152, - "learning_rate": 1.4159112196683564e-05, - "loss": 0.6382, - "step": 6140 - }, - { - "epoch": 0.7175385333956096, - "grad_norm": 0.702009379863739, - "learning_rate": 1.4124152369721655e-05, - "loss": 0.6541, - "step": 6145 - }, - { - "epoch": 0.7181223727230266, - "grad_norm": 0.6326926350593567, - "learning_rate": 1.408924241350032e-05, - "loss": 0.6213, - "step": 6150 - }, - { - "epoch": 0.7187062120504437, - "grad_norm": 0.605268657207489, - "learning_rate": 1.4054382458178439e-05, - "loss": 0.6427, - "step": 6155 - }, - { - "epoch": 0.7192900513778608, - "grad_norm": 0.6966155171394348, - "learning_rate": 1.4019572633728473e-05, - "loss": 0.653, - "step": 6160 - }, - { - "epoch": 0.7198738907052779, - "grad_norm": 0.65373295545578, - "learning_rate": 1.3984813069935967e-05, - "loss": 0.6476, - "step": 6165 - }, - { - "epoch": 0.720457730032695, - "grad_norm": 0.6140549778938293, - "learning_rate": 1.395010389639908e-05, - "loss": 0.6581, - "step": 6170 - }, - { - "epoch": 0.7210415693601121, - "grad_norm": 0.5785439610481262, - "learning_rate": 1.391544524252808e-05, - "loss": 0.6405, - "step": 6175 - }, - { - "epoch": 0.7216254086875292, - "grad_norm": 0.6497227549552917, - "learning_rate": 1.388083723754491e-05, - "loss": 0.6499, - "step": 6180 - }, - { - "epoch": 0.7222092480149462, - "grad_norm": 0.6294034719467163, - "learning_rate": 1.384628001048264e-05, - "loss": 0.6424, - "step": 6185 - }, - { - "epoch": 0.7227930873423634, - "grad_norm": 0.729047954082489, - "learning_rate": 1.381177369018503e-05, - "loss": 0.6412, - "step": 6190 - }, - { - "epoch": 0.7233769266697805, - "grad_norm": 0.6507778167724609, - "learning_rate": 1.377731840530604e-05, - "loss": 0.6436, - "step": 6195 - }, - { - "epoch": 0.7239607659971976, - "grad_norm": 0.6335770487785339, - "learning_rate": 1.374291428430935e-05, - "loss": 0.6518, - "step": 6200 - }, - { - "epoch": 0.7245446053246146, - "grad_norm": 0.6715924739837646, - "learning_rate": 1.3708561455467872e-05, - "loss": 0.6464, - "step": 6205 - }, - { - "epoch": 0.7251284446520317, - "grad_norm": 0.6121864914894104, - "learning_rate": 1.3674260046863285e-05, - "loss": 0.6364, - "step": 6210 - }, - { - "epoch": 0.7257122839794489, - "grad_norm": 0.6083370447158813, - "learning_rate": 1.3640010186385552e-05, - "loss": 0.6384, - "step": 6215 - }, - { - "epoch": 0.726296123306866, - "grad_norm": 0.6474511027336121, - "learning_rate": 1.3605812001732444e-05, - "loss": 0.6537, - "step": 6220 - }, - { - "epoch": 0.726879962634283, - "grad_norm": 0.6496933698654175, - "learning_rate": 1.3571665620409064e-05, - "loss": 0.649, - "step": 6225 - }, - { - "epoch": 0.7274638019617001, - "grad_norm": 0.6352631449699402, - "learning_rate": 1.3537571169727359e-05, - "loss": 0.6366, - "step": 6230 - }, - { - "epoch": 0.7280476412891173, - "grad_norm": 0.6287112832069397, - "learning_rate": 1.3503528776805676e-05, - "loss": 0.6296, - "step": 6235 - }, - { - "epoch": 0.7286314806165344, - "grad_norm": 0.62437504529953, - "learning_rate": 1.3469538568568255e-05, - "loss": 0.6291, - "step": 6240 - }, - { - "epoch": 0.7292153199439514, - "grad_norm": 0.6231974959373474, - "learning_rate": 1.3435600671744768e-05, - "loss": 0.6277, - "step": 6245 - }, - { - "epoch": 0.7297991592713685, - "grad_norm": 0.5917030572891235, - "learning_rate": 1.3401715212869864e-05, - "loss": 0.6214, - "step": 6250 - }, - { - "epoch": 0.7303829985987856, - "grad_norm": 0.5941394567489624, - "learning_rate": 1.3367882318282666e-05, - "loss": 0.6476, - "step": 6255 - }, - { - "epoch": 0.7309668379262028, - "grad_norm": 0.6343090534210205, - "learning_rate": 1.3334102114126314e-05, - "loss": 0.6509, - "step": 6260 - }, - { - "epoch": 0.7315506772536198, - "grad_norm": 0.6695771217346191, - "learning_rate": 1.330037472634752e-05, - "loss": 0.6514, - "step": 6265 - }, - { - "epoch": 0.7321345165810369, - "grad_norm": 0.6254714131355286, - "learning_rate": 1.3266700280696042e-05, - "loss": 0.6319, - "step": 6270 - }, - { - "epoch": 0.732718355908454, - "grad_norm": 0.6151999235153198, - "learning_rate": 1.3233078902724266e-05, - "loss": 0.6572, - "step": 6275 - }, - { - "epoch": 0.733302195235871, - "grad_norm": 0.7187328934669495, - "learning_rate": 1.3199510717786714e-05, - "loss": 0.6527, - "step": 6280 - }, - { - "epoch": 0.7338860345632882, - "grad_norm": 0.6368306875228882, - "learning_rate": 1.3165995851039591e-05, - "loss": 0.6693, - "step": 6285 - }, - { - "epoch": 0.7344698738907053, - "grad_norm": 0.6745548248291016, - "learning_rate": 1.3132534427440301e-05, - "loss": 0.6419, - "step": 6290 - }, - { - "epoch": 0.7350537132181224, - "grad_norm": 0.7100093960762024, - "learning_rate": 1.309912657174699e-05, - "loss": 0.6488, - "step": 6295 - }, - { - "epoch": 0.7356375525455394, - "grad_norm": 0.6195617318153381, - "learning_rate": 1.3065772408518085e-05, - "loss": 0.637, - "step": 6300 - }, - { - "epoch": 0.7362213918729565, - "grad_norm": 0.651724636554718, - "learning_rate": 1.3032472062111823e-05, - "loss": 0.6382, - "step": 6305 - }, - { - "epoch": 0.7368052312003737, - "grad_norm": 0.6718435287475586, - "learning_rate": 1.2999225656685781e-05, - "loss": 0.6445, - "step": 6310 - }, - { - "epoch": 0.7373890705277908, - "grad_norm": 0.636619508266449, - "learning_rate": 1.2966033316196435e-05, - "loss": 0.6532, - "step": 6315 - }, - { - "epoch": 0.7379729098552078, - "grad_norm": 0.6013931632041931, - "learning_rate": 1.2932895164398684e-05, - "loss": 0.6467, - "step": 6320 - }, - { - "epoch": 0.7385567491826249, - "grad_norm": 0.606683075428009, - "learning_rate": 1.2899811324845373e-05, - "loss": 0.6452, - "step": 6325 - }, - { - "epoch": 0.739140588510042, - "grad_norm": 0.6266047358512878, - "learning_rate": 1.2866781920886873e-05, - "loss": 0.6603, - "step": 6330 - }, - { - "epoch": 0.7397244278374592, - "grad_norm": 0.5885910391807556, - "learning_rate": 1.2833807075670564e-05, - "loss": 0.6407, - "step": 6335 - }, - { - "epoch": 0.7403082671648762, - "grad_norm": 0.6663094758987427, - "learning_rate": 1.2800886912140433e-05, - "loss": 0.6431, - "step": 6340 - }, - { - "epoch": 0.7408921064922933, - "grad_norm": 0.6538107991218567, - "learning_rate": 1.2768021553036596e-05, - "loss": 0.6282, - "step": 6345 - }, - { - "epoch": 0.7414759458197104, - "grad_norm": 0.6556307673454285, - "learning_rate": 1.2735211120894813e-05, - "loss": 0.6471, - "step": 6350 - }, - { - "epoch": 0.7420597851471276, - "grad_norm": 0.5964353680610657, - "learning_rate": 1.2702455738046068e-05, - "loss": 0.6473, - "step": 6355 - }, - { - "epoch": 0.7426436244745446, - "grad_norm": 0.7021486759185791, - "learning_rate": 1.2669755526616093e-05, - "loss": 0.658, - "step": 6360 - }, - { - "epoch": 0.7432274638019617, - "grad_norm": 0.7126589417457581, - "learning_rate": 1.2637110608524916e-05, - "loss": 0.6218, - "step": 6365 - }, - { - "epoch": 0.7438113031293788, - "grad_norm": 0.6862289905548096, - "learning_rate": 1.2604521105486417e-05, - "loss": 0.64, - "step": 6370 - }, - { - "epoch": 0.7443951424567959, - "grad_norm": 0.5892174243927002, - "learning_rate": 1.2571987139007856e-05, - "loss": 0.6419, - "step": 6375 - }, - { - "epoch": 0.744978981784213, - "grad_norm": 0.6465451121330261, - "learning_rate": 1.253950883038944e-05, - "loss": 0.651, - "step": 6380 - }, - { - "epoch": 0.7455628211116301, - "grad_norm": 0.6354182958602905, - "learning_rate": 1.2507086300723846e-05, - "loss": 0.6534, - "step": 6385 - }, - { - "epoch": 0.7461466604390472, - "grad_norm": 0.6753255724906921, - "learning_rate": 1.2474719670895796e-05, - "loss": 0.6462, - "step": 6390 - }, - { - "epoch": 0.7467304997664642, - "grad_norm": 0.6518461108207703, - "learning_rate": 1.2442409061581587e-05, - "loss": 0.6265, - "step": 6395 - }, - { - "epoch": 0.7473143390938813, - "grad_norm": 0.636256217956543, - "learning_rate": 1.2410154593248657e-05, - "loss": 0.6252, - "step": 6400 - }, - { - "epoch": 0.7478981784212985, - "grad_norm": 0.6116010546684265, - "learning_rate": 1.2377956386155114e-05, - "loss": 0.6406, - "step": 6405 - }, - { - "epoch": 0.7484820177487156, - "grad_norm": 0.6598799228668213, - "learning_rate": 1.2345814560349316e-05, - "loss": 0.6561, - "step": 6410 - }, - { - "epoch": 0.7490658570761326, - "grad_norm": 0.7191500663757324, - "learning_rate": 1.231372923566939e-05, - "loss": 0.6429, - "step": 6415 - }, - { - "epoch": 0.7496496964035497, - "grad_norm": 0.6759008765220642, - "learning_rate": 1.2281700531742818e-05, - "loss": 0.6496, - "step": 6420 - }, - { - "epoch": 0.7502335357309668, - "grad_norm": 0.6230224370956421, - "learning_rate": 1.2249728567985966e-05, - "loss": 0.6509, - "step": 6425 - }, - { - "epoch": 0.750817375058384, - "grad_norm": 0.6771016120910645, - "learning_rate": 1.2217813463603664e-05, - "loss": 0.6471, - "step": 6430 - }, - { - "epoch": 0.751401214385801, - "grad_norm": 0.593232274055481, - "learning_rate": 1.2185955337588727e-05, - "loss": 0.6329, - "step": 6435 - }, - { - "epoch": 0.7519850537132181, - "grad_norm": 0.6739088892936707, - "learning_rate": 1.2154154308721546e-05, - "loss": 0.641, - "step": 6440 - }, - { - "epoch": 0.7525688930406352, - "grad_norm": 0.6023595333099365, - "learning_rate": 1.2122410495569623e-05, - "loss": 0.639, - "step": 6445 - }, - { - "epoch": 0.7531527323680524, - "grad_norm": 0.6589640378952026, - "learning_rate": 1.2090724016487137e-05, - "loss": 0.6607, - "step": 6450 - }, - { - "epoch": 0.7537365716954694, - "grad_norm": 0.6305920481681824, - "learning_rate": 1.2059094989614503e-05, - "loss": 0.6392, - "step": 6455 - }, - { - "epoch": 0.7543204110228865, - "grad_norm": 0.6897038221359253, - "learning_rate": 1.2027523532877928e-05, - "loss": 0.6332, - "step": 6460 - }, - { - "epoch": 0.7549042503503036, - "grad_norm": 0.6643224954605103, - "learning_rate": 1.1996009763988974e-05, - "loss": 0.6301, - "step": 6465 - }, - { - "epoch": 0.7554880896777207, - "grad_norm": 0.667819619178772, - "learning_rate": 1.1964553800444123e-05, - "loss": 0.6464, - "step": 6470 - }, - { - "epoch": 0.7560719290051378, - "grad_norm": 0.6197718977928162, - "learning_rate": 1.1933155759524332e-05, - "loss": 0.6378, - "step": 6475 - }, - { - "epoch": 0.7566557683325549, - "grad_norm": 0.7117953300476074, - "learning_rate": 1.1901815758294589e-05, - "loss": 0.6337, - "step": 6480 - }, - { - "epoch": 0.757239607659972, - "grad_norm": 0.7122768759727478, - "learning_rate": 1.18705339136035e-05, - "loss": 0.657, - "step": 6485 - }, - { - "epoch": 0.757823446987389, - "grad_norm": 0.6503180861473083, - "learning_rate": 1.1839310342082835e-05, - "loss": 0.668, - "step": 6490 - }, - { - "epoch": 0.7584072863148061, - "grad_norm": 0.6244552135467529, - "learning_rate": 1.1808145160147092e-05, - "loss": 0.6171, - "step": 6495 - }, - { - "epoch": 0.7589911256422233, - "grad_norm": 0.5978888869285583, - "learning_rate": 1.1777038483993066e-05, - "loss": 0.6355, - "step": 6500 - }, - { - "epoch": 0.7595749649696404, - "grad_norm": 0.6429453492164612, - "learning_rate": 1.1745990429599439e-05, - "loss": 0.6572, - "step": 6505 - }, - { - "epoch": 0.7601588042970574, - "grad_norm": 0.6589666604995728, - "learning_rate": 1.1715001112726304e-05, - "loss": 0.6088, - "step": 6510 - }, - { - "epoch": 0.7607426436244745, - "grad_norm": 0.6185446977615356, - "learning_rate": 1.1684070648914763e-05, - "loss": 0.6504, - "step": 6515 - }, - { - "epoch": 0.7613264829518916, - "grad_norm": 0.585168719291687, - "learning_rate": 1.1653199153486488e-05, - "loss": 0.634, - "step": 6520 - }, - { - "epoch": 0.7619103222793088, - "grad_norm": 0.6552976965904236, - "learning_rate": 1.1622386741543295e-05, - "loss": 0.6336, - "step": 6525 - }, - { - "epoch": 0.7624941616067258, - "grad_norm": 0.6586318612098694, - "learning_rate": 1.1591633527966713e-05, - "loss": 0.6457, - "step": 6530 - }, - { - "epoch": 0.7630780009341429, - "grad_norm": 0.6457260847091675, - "learning_rate": 1.1560939627417555e-05, - "loss": 0.6508, - "step": 6535 - }, - { - "epoch": 0.76366184026156, - "grad_norm": 0.6527279615402222, - "learning_rate": 1.1530305154335482e-05, - "loss": 0.6324, - "step": 6540 - }, - { - "epoch": 0.7642456795889772, - "grad_norm": 0.6516167521476746, - "learning_rate": 1.1499730222938595e-05, - "loss": 0.6423, - "step": 6545 - }, - { - "epoch": 0.7648295189163942, - "grad_norm": 0.6201888918876648, - "learning_rate": 1.1469214947222993e-05, - "loss": 0.6352, - "step": 6550 - }, - { - "epoch": 0.7654133582438113, - "grad_norm": 0.6367828249931335, - "learning_rate": 1.1438759440962353e-05, - "loss": 0.6235, - "step": 6555 - }, - { - "epoch": 0.7659971975712284, - "grad_norm": 0.6098381280899048, - "learning_rate": 1.1408363817707523e-05, - "loss": 0.6443, - "step": 6560 - }, - { - "epoch": 0.7665810368986455, - "grad_norm": 0.7043296694755554, - "learning_rate": 1.1378028190786045e-05, - "loss": 0.6325, - "step": 6565 - }, - { - "epoch": 0.7671648762260626, - "grad_norm": 0.6127150058746338, - "learning_rate": 1.134775267330181e-05, - "loss": 0.6211, - "step": 6570 - }, - { - "epoch": 0.7677487155534797, - "grad_norm": 0.6486060619354248, - "learning_rate": 1.1317537378134568e-05, - "loss": 0.6501, - "step": 6575 - }, - { - "epoch": 0.7683325548808968, - "grad_norm": 0.613500714302063, - "learning_rate": 1.1287382417939555e-05, - "loss": 0.6494, - "step": 6580 - }, - { - "epoch": 0.7689163942083139, - "grad_norm": 0.6373565196990967, - "learning_rate": 1.1257287905147035e-05, - "loss": 0.6316, - "step": 6585 - }, - { - "epoch": 0.7695002335357309, - "grad_norm": 0.5736018419265747, - "learning_rate": 1.1227253951961911e-05, - "loss": 0.6197, - "step": 6590 - }, - { - "epoch": 0.7700840728631481, - "grad_norm": 0.5799152851104736, - "learning_rate": 1.1197280670363297e-05, - "loss": 0.6374, - "step": 6595 - }, - { - "epoch": 0.7706679121905652, - "grad_norm": 0.5913314819335938, - "learning_rate": 1.1167368172104084e-05, - "loss": 0.6396, - "step": 6600 - }, - { - "epoch": 0.7712517515179822, - "grad_norm": 0.6167449951171875, - "learning_rate": 1.1137516568710548e-05, - "loss": 0.6454, - "step": 6605 - }, - { - "epoch": 0.7718355908453993, - "grad_norm": 0.5984235405921936, - "learning_rate": 1.1107725971481923e-05, - "loss": 0.6383, - "step": 6610 - }, - { - "epoch": 0.7724194301728164, - "grad_norm": 0.5756741762161255, - "learning_rate": 1.107799649148998e-05, - "loss": 0.6273, - "step": 6615 - }, - { - "epoch": 0.7730032695002336, - "grad_norm": 0.6331648826599121, - "learning_rate": 1.1048328239578631e-05, - "loss": 0.6403, - "step": 6620 - }, - { - "epoch": 0.7735871088276506, - "grad_norm": 0.6066078543663025, - "learning_rate": 1.1018721326363493e-05, - "loss": 0.6283, - "step": 6625 - }, - { - "epoch": 0.7741709481550677, - "grad_norm": 0.6468260288238525, - "learning_rate": 1.0989175862231488e-05, - "loss": 0.6505, - "step": 6630 - }, - { - "epoch": 0.7747547874824848, - "grad_norm": 0.5881868600845337, - "learning_rate": 1.095969195734044e-05, - "loss": 0.652, - "step": 6635 - }, - { - "epoch": 0.775338626809902, - "grad_norm": 0.5820181965827942, - "learning_rate": 1.0930269721618641e-05, - "loss": 0.6109, - "step": 6640 - }, - { - "epoch": 0.775922466137319, - "grad_norm": 0.6042081713676453, - "learning_rate": 1.0900909264764463e-05, - "loss": 0.6266, - "step": 6645 - }, - { - "epoch": 0.7765063054647361, - "grad_norm": 0.6235969066619873, - "learning_rate": 1.0871610696245941e-05, - "loss": 0.6581, - "step": 6650 - }, - { - "epoch": 0.7770901447921532, - "grad_norm": 0.6411299705505371, - "learning_rate": 1.0842374125300364e-05, - "loss": 0.6476, - "step": 6655 - }, - { - "epoch": 0.7776739841195703, - "grad_norm": 0.659362256526947, - "learning_rate": 1.081319966093386e-05, - "loss": 0.6656, - "step": 6660 - }, - { - "epoch": 0.7782578234469874, - "grad_norm": 0.619476854801178, - "learning_rate": 1.0784087411921e-05, - "loss": 0.6211, - "step": 6665 - }, - { - "epoch": 0.7788416627744045, - "grad_norm": 0.6047325134277344, - "learning_rate": 1.0755037486804411e-05, - "loss": 0.6451, - "step": 6670 - }, - { - "epoch": 0.7794255021018216, - "grad_norm": 0.5977070331573486, - "learning_rate": 1.0726049993894324e-05, - "loss": 0.6384, - "step": 6675 - }, - { - "epoch": 0.7800093414292387, - "grad_norm": 0.6228548884391785, - "learning_rate": 1.0697125041268207e-05, - "loss": 0.6283, - "step": 6680 - }, - { - "epoch": 0.7805931807566557, - "grad_norm": 0.6101776957511902, - "learning_rate": 1.0668262736770356e-05, - "loss": 0.631, - "step": 6685 - }, - { - "epoch": 0.7811770200840729, - "grad_norm": 0.613178014755249, - "learning_rate": 1.0639463188011476e-05, - "loss": 0.6545, - "step": 6690 - }, - { - "epoch": 0.78176085941149, - "grad_norm": 0.6110402345657349, - "learning_rate": 1.0610726502368303e-05, - "loss": 0.6494, - "step": 6695 - }, - { - "epoch": 0.7823446987389071, - "grad_norm": 0.6432594060897827, - "learning_rate": 1.0582052786983194e-05, - "loss": 0.6413, - "step": 6700 - }, - { - "epoch": 0.7829285380663241, - "grad_norm": 0.6110131740570068, - "learning_rate": 1.0553442148763725e-05, - "loss": 0.6627, - "step": 6705 - }, - { - "epoch": 0.7835123773937412, - "grad_norm": 0.6538170576095581, - "learning_rate": 1.0524894694382284e-05, - "loss": 0.6343, - "step": 6710 - }, - { - "epoch": 0.7840962167211584, - "grad_norm": 0.6408452391624451, - "learning_rate": 1.0496410530275694e-05, - "loss": 0.6383, - "step": 6715 - }, - { - "epoch": 0.7846800560485754, - "grad_norm": 0.6215013265609741, - "learning_rate": 1.0467989762644803e-05, - "loss": 0.6273, - "step": 6720 - }, - { - "epoch": 0.7852638953759925, - "grad_norm": 0.6320236325263977, - "learning_rate": 1.0439632497454093e-05, - "loss": 0.6424, - "step": 6725 - }, - { - "epoch": 0.7858477347034096, - "grad_norm": 0.6162779331207275, - "learning_rate": 1.0411338840431278e-05, - "loss": 0.6409, - "step": 6730 - }, - { - "epoch": 0.7864315740308268, - "grad_norm": 0.6185814142227173, - "learning_rate": 1.0383108897066915e-05, - "loss": 0.6402, - "step": 6735 - }, - { - "epoch": 0.7870154133582438, - "grad_norm": 0.627000629901886, - "learning_rate": 1.035494277261401e-05, - "loss": 0.6107, - "step": 6740 - }, - { - "epoch": 0.7875992526856609, - "grad_norm": 0.6048628091812134, - "learning_rate": 1.0326840572087633e-05, - "loss": 0.6261, - "step": 6745 - }, - { - "epoch": 0.788183092013078, - "grad_norm": 0.5890159010887146, - "learning_rate": 1.0298802400264502e-05, - "loss": 0.6296, - "step": 6750 - }, - { - "epoch": 0.7887669313404951, - "grad_norm": 0.6244004368782043, - "learning_rate": 1.0270828361682628e-05, - "loss": 0.6376, - "step": 6755 - }, - { - "epoch": 0.7893507706679121, - "grad_norm": 0.6479686498641968, - "learning_rate": 1.0242918560640893e-05, - "loss": 0.6165, - "step": 6760 - }, - { - "epoch": 0.7899346099953293, - "grad_norm": 0.6587838530540466, - "learning_rate": 1.0215073101198683e-05, - "loss": 0.6359, - "step": 6765 - }, - { - "epoch": 0.7905184493227464, - "grad_norm": 0.6083243489265442, - "learning_rate": 1.0187292087175485e-05, - "loss": 0.6239, - "step": 6770 - }, - { - "epoch": 0.7911022886501635, - "grad_norm": 0.6163403391838074, - "learning_rate": 1.0159575622150513e-05, - "loss": 0.6404, - "step": 6775 - }, - { - "epoch": 0.7916861279775805, - "grad_norm": 0.6353800892829895, - "learning_rate": 1.0131923809462313e-05, - "loss": 0.6477, - "step": 6780 - }, - { - "epoch": 0.7922699673049977, - "grad_norm": 0.6517370343208313, - "learning_rate": 1.0104336752208374e-05, - "loss": 0.6193, - "step": 6785 - }, - { - "epoch": 0.7928538066324148, - "grad_norm": 0.6401410102844238, - "learning_rate": 1.0076814553244762e-05, - "loss": 0.6451, - "step": 6790 - }, - { - "epoch": 0.7934376459598319, - "grad_norm": 0.5974830985069275, - "learning_rate": 1.0049357315185711e-05, - "loss": 0.6357, - "step": 6795 - }, - { - "epoch": 0.7940214852872489, - "grad_norm": 0.5978230834007263, - "learning_rate": 1.0021965140403267e-05, - "loss": 0.6113, - "step": 6800 - }, - { - "epoch": 0.794605324614666, - "grad_norm": 0.6527069807052612, - "learning_rate": 9.99463813102688e-06, - "loss": 0.6216, - "step": 6805 - }, - { - "epoch": 0.7951891639420832, - "grad_norm": 0.6506062746047974, - "learning_rate": 9.967376388943042e-06, - "loss": 0.6413, - "step": 6810 - }, - { - "epoch": 0.7957730032695002, - "grad_norm": 0.6087713241577148, - "learning_rate": 9.940180015794908e-06, - "loss": 0.6569, - "step": 6815 - }, - { - "epoch": 0.7963568425969173, - "grad_norm": 0.614267110824585, - "learning_rate": 9.913049112981897e-06, - "loss": 0.6489, - "step": 6820 - }, - { - "epoch": 0.7969406819243344, - "grad_norm": 0.6012156009674072, - "learning_rate": 9.885983781659332e-06, - "loss": 0.6551, - "step": 6825 - }, - { - "epoch": 0.7975245212517516, - "grad_norm": 0.6474121809005737, - "learning_rate": 9.858984122738072e-06, - "loss": 0.6262, - "step": 6830 - }, - { - "epoch": 0.7981083605791686, - "grad_norm": 0.655981183052063, - "learning_rate": 9.832050236884102e-06, - "loss": 0.6488, - "step": 6835 - }, - { - "epoch": 0.7986921999065857, - "grad_norm": 0.61264568567276, - "learning_rate": 9.805182224518186e-06, - "loss": 0.641, - "step": 6840 - }, - { - "epoch": 0.7992760392340028, - "grad_norm": 0.5794661641120911, - "learning_rate": 9.778380185815486e-06, - "loss": 0.6373, - "step": 6845 - }, - { - "epoch": 0.7998598785614199, - "grad_norm": 0.6462731957435608, - "learning_rate": 9.751644220705187e-06, - "loss": 0.5931, - "step": 6850 - }, - { - "epoch": 0.800443717888837, - "grad_norm": 0.6341938376426697, - "learning_rate": 9.72497442887012e-06, - "loss": 0.6585, - "step": 6855 - }, - { - "epoch": 0.8010275572162541, - "grad_norm": 0.6305548548698425, - "learning_rate": 9.698370909746387e-06, - "loss": 0.631, - "step": 6860 - }, - { - "epoch": 0.8016113965436712, - "grad_norm": 0.6123278737068176, - "learning_rate": 9.671833762523016e-06, - "loss": 0.6415, - "step": 6865 - }, - { - "epoch": 0.8021952358710883, - "grad_norm": 0.6392982006072998, - "learning_rate": 9.645363086141561e-06, - "loss": 0.668, - "step": 6870 - }, - { - "epoch": 0.8027790751985053, - "grad_norm": 0.6142355799674988, - "learning_rate": 9.618958979295747e-06, - "loss": 0.6489, - "step": 6875 - }, - { - "epoch": 0.8033629145259225, - "grad_norm": 0.566252589225769, - "learning_rate": 9.592621540431101e-06, - "loss": 0.632, - "step": 6880 - }, - { - "epoch": 0.8039467538533396, - "grad_norm": 0.5913701057434082, - "learning_rate": 9.566350867744584e-06, - "loss": 0.6486, - "step": 6885 - }, - { - "epoch": 0.8045305931807567, - "grad_norm": 0.5824702382087708, - "learning_rate": 9.540147059184226e-06, - "loss": 0.6298, - "step": 6890 - }, - { - "epoch": 0.8051144325081737, - "grad_norm": 0.7043502926826477, - "learning_rate": 9.514010212448751e-06, - "loss": 0.6321, - "step": 6895 - }, - { - "epoch": 0.8056982718355908, - "grad_norm": 0.5843250751495361, - "learning_rate": 9.487940424987235e-06, - "loss": 0.6403, - "step": 6900 - }, - { - "epoch": 0.806282111163008, - "grad_norm": 0.5942621231079102, - "learning_rate": 9.461937793998723e-06, - "loss": 0.6554, - "step": 6905 - }, - { - "epoch": 0.8068659504904251, - "grad_norm": 0.6155551075935364, - "learning_rate": 9.436002416431868e-06, - "loss": 0.63, - "step": 6910 - }, - { - "epoch": 0.8074497898178421, - "grad_norm": 0.602841317653656, - "learning_rate": 9.41013438898458e-06, - "loss": 0.6315, - "step": 6915 - }, - { - "epoch": 0.8080336291452592, - "grad_norm": 0.6575081944465637, - "learning_rate": 9.384333808103656e-06, - "loss": 0.6316, - "step": 6920 - }, - { - "epoch": 0.8086174684726763, - "grad_norm": 0.6333084106445312, - "learning_rate": 9.358600769984432e-06, - "loss": 0.635, - "step": 6925 - }, - { - "epoch": 0.8092013078000934, - "grad_norm": 0.6076129674911499, - "learning_rate": 9.332935370570402e-06, - "loss": 0.6492, - "step": 6930 - }, - { - "epoch": 0.8097851471275105, - "grad_norm": 0.6189504861831665, - "learning_rate": 9.30733770555289e-06, - "loss": 0.6392, - "step": 6935 - }, - { - "epoch": 0.8103689864549276, - "grad_norm": 0.6335691213607788, - "learning_rate": 9.281807870370666e-06, - "loss": 0.644, - "step": 6940 - }, - { - "epoch": 0.8109528257823447, - "grad_norm": 0.6290645599365234, - "learning_rate": 9.256345960209608e-06, - "loss": 0.6255, - "step": 6945 - }, - { - "epoch": 0.8115366651097617, - "grad_norm": 0.6880708336830139, - "learning_rate": 9.23095207000234e-06, - "loss": 0.6734, - "step": 6950 - }, - { - "epoch": 0.8121205044371789, - "grad_norm": 0.6487305760383606, - "learning_rate": 9.205626294427885e-06, - "loss": 0.6294, - "step": 6955 - }, - { - "epoch": 0.812704343764596, - "grad_norm": 0.6282386779785156, - "learning_rate": 9.18036872791129e-06, - "loss": 0.6488, - "step": 6960 - }, - { - "epoch": 0.8132881830920131, - "grad_norm": 0.5874021649360657, - "learning_rate": 9.155179464623312e-06, - "loss": 0.632, - "step": 6965 - }, - { - "epoch": 0.8138720224194301, - "grad_norm": 0.6296844482421875, - "learning_rate": 9.130058598480027e-06, - "loss": 0.6301, - "step": 6970 - }, - { - "epoch": 0.8144558617468473, - "grad_norm": 0.6266751289367676, - "learning_rate": 9.105006223142507e-06, - "loss": 0.6291, - "step": 6975 - }, - { - "epoch": 0.8150397010742644, - "grad_norm": 0.6116609573364258, - "learning_rate": 9.080022432016457e-06, - "loss": 0.6269, - "step": 6980 - }, - { - "epoch": 0.8156235404016815, - "grad_norm": 0.5927746295928955, - "learning_rate": 9.05510731825188e-06, - "loss": 0.6446, - "step": 6985 - }, - { - "epoch": 0.8162073797290985, - "grad_norm": 0.636486291885376, - "learning_rate": 9.030260974742701e-06, - "loss": 0.6461, - "step": 6990 - }, - { - "epoch": 0.8167912190565156, - "grad_norm": 0.6360753178596497, - "learning_rate": 9.005483494126474e-06, - "loss": 0.6377, - "step": 6995 - }, - { - "epoch": 0.8173750583839328, - "grad_norm": 0.6037192344665527, - "learning_rate": 8.980774968783978e-06, - "loss": 0.6484, - "step": 7000 - }, - { - "epoch": 0.8179588977113499, - "grad_norm": 0.6330691576004028, - "learning_rate": 8.9561354908389e-06, - "loss": 0.6474, - "step": 7005 - }, - { - "epoch": 0.8185427370387669, - "grad_norm": 0.6776137351989746, - "learning_rate": 8.931565152157492e-06, - "loss": 0.6503, - "step": 7010 - }, - { - "epoch": 0.819126576366184, - "grad_norm": 0.660215437412262, - "learning_rate": 8.907064044348232e-06, - "loss": 0.6448, - "step": 7015 - }, - { - "epoch": 0.8197104156936011, - "grad_norm": 0.6278055906295776, - "learning_rate": 8.88263225876147e-06, - "loss": 0.6039, - "step": 7020 - }, - { - "epoch": 0.8202942550210183, - "grad_norm": 0.5855631232261658, - "learning_rate": 8.858269886489099e-06, - "loss": 0.6119, - "step": 7025 - }, - { - "epoch": 0.8208780943484353, - "grad_norm": 0.6020859479904175, - "learning_rate": 8.8339770183642e-06, - "loss": 0.6404, - "step": 7030 - }, - { - "epoch": 0.8214619336758524, - "grad_norm": 0.6146640181541443, - "learning_rate": 8.809753744960733e-06, - "loss": 0.6465, - "step": 7035 - }, - { - "epoch": 0.8220457730032695, - "grad_norm": 0.5913814306259155, - "learning_rate": 8.785600156593157e-06, - "loss": 0.6313, - "step": 7040 - }, - { - "epoch": 0.8226296123306865, - "grad_norm": 0.598573625087738, - "learning_rate": 8.761516343316131e-06, - "loss": 0.6432, - "step": 7045 - }, - { - "epoch": 0.8232134516581037, - "grad_norm": 0.5968788862228394, - "learning_rate": 8.737502394924158e-06, - "loss": 0.6368, - "step": 7050 - }, - { - "epoch": 0.8237972909855208, - "grad_norm": 0.6117973923683167, - "learning_rate": 8.713558400951254e-06, - "loss": 0.631, - "step": 7055 - }, - { - "epoch": 0.8243811303129379, - "grad_norm": 0.6480801105499268, - "learning_rate": 8.689684450670627e-06, - "loss": 0.6443, - "step": 7060 - }, - { - "epoch": 0.8249649696403549, - "grad_norm": 0.639126718044281, - "learning_rate": 8.665880633094314e-06, - "loss": 0.6135, - "step": 7065 - }, - { - "epoch": 0.825548808967772, - "grad_norm": 0.5657269358634949, - "learning_rate": 8.642147036972887e-06, - "loss": 0.6327, - "step": 7070 - }, - { - "epoch": 0.8261326482951892, - "grad_norm": 0.609393298625946, - "learning_rate": 8.618483750795087e-06, - "loss": 0.643, - "step": 7075 - }, - { - "epoch": 0.8267164876226063, - "grad_norm": 0.6061826944351196, - "learning_rate": 8.594890862787518e-06, - "loss": 0.6224, - "step": 7080 - }, - { - "epoch": 0.8273003269500233, - "grad_norm": 0.613351047039032, - "learning_rate": 8.571368460914316e-06, - "loss": 0.631, - "step": 7085 - }, - { - "epoch": 0.8278841662774404, - "grad_norm": 0.6105780005455017, - "learning_rate": 8.547916632876806e-06, - "loss": 0.642, - "step": 7090 - }, - { - "epoch": 0.8284680056048576, - "grad_norm": 0.6157962679862976, - "learning_rate": 8.524535466113185e-06, - "loss": 0.6282, - "step": 7095 - }, - { - "epoch": 0.8290518449322747, - "grad_norm": 0.6295271515846252, - "learning_rate": 8.5012250477982e-06, - "loss": 0.6327, - "step": 7100 - }, - { - "epoch": 0.8296356842596917, - "grad_norm": 0.596213698387146, - "learning_rate": 8.477985464842816e-06, - "loss": 0.6347, - "step": 7105 - }, - { - "epoch": 0.8302195235871088, - "grad_norm": 0.6437660455703735, - "learning_rate": 8.454816803893893e-06, - "loss": 0.6364, - "step": 7110 - }, - { - "epoch": 0.830803362914526, - "grad_norm": 0.6350366473197937, - "learning_rate": 8.431719151333864e-06, - "loss": 0.6238, - "step": 7115 - }, - { - "epoch": 0.8313872022419431, - "grad_norm": 0.6170267462730408, - "learning_rate": 8.40869259328042e-06, - "loss": 0.6434, - "step": 7120 - }, - { - "epoch": 0.8319710415693601, - "grad_norm": 0.617951512336731, - "learning_rate": 8.385737215586171e-06, - "loss": 0.6449, - "step": 7125 - }, - { - "epoch": 0.8325548808967772, - "grad_norm": 0.595119297504425, - "learning_rate": 8.362853103838344e-06, - "loss": 0.6249, - "step": 7130 - }, - { - "epoch": 0.8331387202241943, - "grad_norm": 0.6413443684577942, - "learning_rate": 8.340040343358455e-06, - "loss": 0.6107, - "step": 7135 - }, - { - "epoch": 0.8337225595516113, - "grad_norm": 0.6542086601257324, - "learning_rate": 8.317299019201996e-06, - "loss": 0.6376, - "step": 7140 - }, - { - "epoch": 0.8343063988790285, - "grad_norm": 0.5936727523803711, - "learning_rate": 8.294629216158107e-06, - "loss": 0.6378, - "step": 7145 - }, - { - "epoch": 0.8348902382064456, - "grad_norm": 0.6045513153076172, - "learning_rate": 8.272031018749272e-06, - "loss": 0.6437, - "step": 7150 - }, - { - "epoch": 0.8354740775338627, - "grad_norm": 0.5874149203300476, - "learning_rate": 8.249504511231005e-06, - "loss": 0.6491, - "step": 7155 - }, - { - "epoch": 0.8360579168612797, - "grad_norm": 0.6034518480300903, - "learning_rate": 8.227049777591516e-06, - "loss": 0.627, - "step": 7160 - }, - { - "epoch": 0.8366417561886969, - "grad_norm": 0.6214303374290466, - "learning_rate": 8.204666901551428e-06, - "loss": 0.6462, - "step": 7165 - }, - { - "epoch": 0.837225595516114, - "grad_norm": 0.6115787029266357, - "learning_rate": 8.182355966563438e-06, - "loss": 0.6302, - "step": 7170 - }, - { - "epoch": 0.8378094348435311, - "grad_norm": 0.6452166438102722, - "learning_rate": 8.160117055812019e-06, - "loss": 0.6485, - "step": 7175 - }, - { - "epoch": 0.8383932741709481, - "grad_norm": 0.6037722229957581, - "learning_rate": 8.13795025221311e-06, - "loss": 0.6261, - "step": 7180 - }, - { - "epoch": 0.8389771134983652, - "grad_norm": 0.600877046585083, - "learning_rate": 8.115855638413806e-06, - "loss": 0.621, - "step": 7185 - }, - { - "epoch": 0.8395609528257824, - "grad_norm": 0.6325284242630005, - "learning_rate": 8.09383329679204e-06, - "loss": 0.6263, - "step": 7190 - }, - { - "epoch": 0.8401447921531995, - "grad_norm": 0.6154286861419678, - "learning_rate": 8.071883309456292e-06, - "loss": 0.6262, - "step": 7195 - }, - { - "epoch": 0.8407286314806165, - "grad_norm": 0.6005722880363464, - "learning_rate": 8.050005758245274e-06, - "loss": 0.6022, - "step": 7200 - }, - { - "epoch": 0.8413124708080336, - "grad_norm": 0.6308860182762146, - "learning_rate": 8.028200724727623e-06, - "loss": 0.6264, - "step": 7205 - }, - { - "epoch": 0.8418963101354507, - "grad_norm": 0.6115047335624695, - "learning_rate": 8.006468290201603e-06, - "loss": 0.6247, - "step": 7210 - }, - { - "epoch": 0.8424801494628679, - "grad_norm": 0.5641135573387146, - "learning_rate": 7.984808535694794e-06, - "loss": 0.633, - "step": 7215 - }, - { - "epoch": 0.8430639887902849, - "grad_norm": 0.6081599593162537, - "learning_rate": 7.963221541963799e-06, - "loss": 0.6422, - "step": 7220 - }, - { - "epoch": 0.843647828117702, - "grad_norm": 0.5814548134803772, - "learning_rate": 7.94170738949394e-06, - "loss": 0.6214, - "step": 7225 - }, - { - "epoch": 0.8442316674451191, - "grad_norm": 0.6003990173339844, - "learning_rate": 7.920266158498948e-06, - "loss": 0.6383, - "step": 7230 - }, - { - "epoch": 0.8448155067725363, - "grad_norm": 0.6179484128952026, - "learning_rate": 7.898897928920684e-06, - "loss": 0.6197, - "step": 7235 - }, - { - "epoch": 0.8453993460999533, - "grad_norm": 0.5955949425697327, - "learning_rate": 7.877602780428816e-06, - "loss": 0.6186, - "step": 7240 - }, - { - "epoch": 0.8459831854273704, - "grad_norm": 0.5994767546653748, - "learning_rate": 7.856380792420549e-06, - "loss": 0.6306, - "step": 7245 - }, - { - "epoch": 0.8465670247547875, - "grad_norm": 0.6301043033599854, - "learning_rate": 7.835232044020304e-06, - "loss": 0.6366, - "step": 7250 - }, - { - "epoch": 0.8471508640822045, - "grad_norm": 0.5901968479156494, - "learning_rate": 7.81415661407944e-06, - "loss": 0.6235, - "step": 7255 - }, - { - "epoch": 0.8477347034096216, - "grad_norm": 0.6159135103225708, - "learning_rate": 7.793154581175954e-06, - "loss": 0.631, - "step": 7260 - }, - { - "epoch": 0.8483185427370388, - "grad_norm": 0.5659588575363159, - "learning_rate": 7.772226023614185e-06, - "loss": 0.6342, - "step": 7265 - }, - { - "epoch": 0.8489023820644559, - "grad_norm": 0.5551443099975586, - "learning_rate": 7.751371019424528e-06, - "loss": 0.629, - "step": 7270 - }, - { - "epoch": 0.8494862213918729, - "grad_norm": 0.6662541627883911, - "learning_rate": 7.730589646363141e-06, - "loss": 0.6352, - "step": 7275 - }, - { - "epoch": 0.85007006071929, - "grad_norm": 0.5760899186134338, - "learning_rate": 7.709881981911648e-06, - "loss": 0.6258, - "step": 7280 - }, - { - "epoch": 0.8506539000467072, - "grad_norm": 0.5908288955688477, - "learning_rate": 7.689248103276873e-06, - "loss": 0.6423, - "step": 7285 - }, - { - "epoch": 0.8512377393741243, - "grad_norm": 0.6404958367347717, - "learning_rate": 7.668688087390509e-06, - "loss": 0.6212, - "step": 7290 - }, - { - "epoch": 0.8518215787015413, - "grad_norm": 0.5786986947059631, - "learning_rate": 7.648202010908884e-06, - "loss": 0.6492, - "step": 7295 - }, - { - "epoch": 0.8524054180289584, - "grad_norm": 0.5850706696510315, - "learning_rate": 7.627789950212635e-06, - "loss": 0.6435, - "step": 7300 - }, - { - "epoch": 0.8529892573563755, - "grad_norm": 0.5872102975845337, - "learning_rate": 7.607451981406441e-06, - "loss": 0.6402, - "step": 7305 - }, - { - "epoch": 0.8535730966837927, - "grad_norm": 0.6278002262115479, - "learning_rate": 7.587188180318736e-06, - "loss": 0.6329, - "step": 7310 - }, - { - "epoch": 0.8541569360112097, - "grad_norm": 0.6291380524635315, - "learning_rate": 7.5669986225014215e-06, - "loss": 0.6138, - "step": 7315 - }, - { - "epoch": 0.8547407753386268, - "grad_norm": 0.5961721539497375, - "learning_rate": 7.546883383229594e-06, - "loss": 0.6296, - "step": 7320 - }, - { - "epoch": 0.8553246146660439, - "grad_norm": 0.5672163367271423, - "learning_rate": 7.526842537501259e-06, - "loss": 0.6185, - "step": 7325 - }, - { - "epoch": 0.855908453993461, - "grad_norm": 0.6100834012031555, - "learning_rate": 7.50687616003705e-06, - "loss": 0.6508, - "step": 7330 - }, - { - "epoch": 0.8564922933208781, - "grad_norm": 0.5889347195625305, - "learning_rate": 7.486984325279956e-06, - "loss": 0.6344, - "step": 7335 - }, - { - "epoch": 0.8570761326482952, - "grad_norm": 0.6142042875289917, - "learning_rate": 7.467167107395028e-06, - "loss": 0.6507, - "step": 7340 - }, - { - "epoch": 0.8576599719757123, - "grad_norm": 0.6116150617599487, - "learning_rate": 7.44742458026913e-06, - "loss": 0.6449, - "step": 7345 - }, - { - "epoch": 0.8582438113031294, - "grad_norm": 0.6059677004814148, - "learning_rate": 7.427756817510634e-06, - "loss": 0.6315, - "step": 7350 - }, - { - "epoch": 0.8588276506305464, - "grad_norm": 0.5788863897323608, - "learning_rate": 7.408163892449172e-06, - "loss": 0.6566, - "step": 7355 - }, - { - "epoch": 0.8594114899579636, - "grad_norm": 0.6377168893814087, - "learning_rate": 7.388645878135338e-06, - "loss": 0.625, - "step": 7360 - }, - { - "epoch": 0.8599953292853807, - "grad_norm": 0.6076430082321167, - "learning_rate": 7.369202847340432e-06, - "loss": 0.6347, - "step": 7365 - }, - { - "epoch": 0.8605791686127977, - "grad_norm": 0.6042303442955017, - "learning_rate": 7.349834872556187e-06, - "loss": 0.6181, - "step": 7370 - }, - { - "epoch": 0.8611630079402148, - "grad_norm": 0.6569790244102478, - "learning_rate": 7.330542025994495e-06, - "loss": 0.6397, - "step": 7375 - }, - { - "epoch": 0.861746847267632, - "grad_norm": 0.5703638792037964, - "learning_rate": 7.311324379587136e-06, - "loss": 0.6239, - "step": 7380 - }, - { - "epoch": 0.8623306865950491, - "grad_norm": 0.6026293039321899, - "learning_rate": 7.292182004985511e-06, - "loss": 0.6241, - "step": 7385 - }, - { - "epoch": 0.8629145259224661, - "grad_norm": 0.5959193110466003, - "learning_rate": 7.2731149735603825e-06, - "loss": 0.6377, - "step": 7390 - }, - { - "epoch": 0.8634983652498832, - "grad_norm": 0.5526796579360962, - "learning_rate": 7.254123356401597e-06, - "loss": 0.6435, - "step": 7395 - }, - { - "epoch": 0.8640822045773003, - "grad_norm": 0.5878410339355469, - "learning_rate": 7.23520722431783e-06, - "loss": 0.6324, - "step": 7400 - }, - { - "epoch": 0.8646660439047175, - "grad_norm": 0.6383752226829529, - "learning_rate": 7.216366647836306e-06, - "loss": 0.611, - "step": 7405 - }, - { - "epoch": 0.8652498832321345, - "grad_norm": 0.6415829658508301, - "learning_rate": 7.197601697202565e-06, - "loss": 0.6519, - "step": 7410 - }, - { - "epoch": 0.8658337225595516, - "grad_norm": 0.6422529220581055, - "learning_rate": 7.1789124423801645e-06, - "loss": 0.6374, - "step": 7415 - }, - { - "epoch": 0.8664175618869687, - "grad_norm": 0.5952723026275635, - "learning_rate": 7.160298953050448e-06, - "loss": 0.6193, - "step": 7420 - }, - { - "epoch": 0.8670014012143858, - "grad_norm": 0.6314033269882202, - "learning_rate": 7.141761298612267e-06, - "loss": 0.6497, - "step": 7425 - }, - { - "epoch": 0.8675852405418029, - "grad_norm": 0.6324970126152039, - "learning_rate": 7.123299548181732e-06, - "loss": 0.6469, - "step": 7430 - }, - { - "epoch": 0.86816907986922, - "grad_norm": 0.6466127634048462, - "learning_rate": 7.104913770591953e-06, - "loss": 0.6319, - "step": 7435 - }, - { - "epoch": 0.8687529191966371, - "grad_norm": 0.6144922375679016, - "learning_rate": 7.086604034392777e-06, - "loss": 0.6523, - "step": 7440 - }, - { - "epoch": 0.8693367585240542, - "grad_norm": 0.6100398302078247, - "learning_rate": 7.068370407850541e-06, - "loss": 0.6506, - "step": 7445 - }, - { - "epoch": 0.8699205978514712, - "grad_norm": 0.6031321883201599, - "learning_rate": 7.050212958947813e-06, - "loss": 0.6184, - "step": 7450 - }, - { - "epoch": 0.8705044371788884, - "grad_norm": 0.6167150735855103, - "learning_rate": 7.032131755383134e-06, - "loss": 0.6123, - "step": 7455 - }, - { - "epoch": 0.8710882765063055, - "grad_norm": 0.6024656295776367, - "learning_rate": 7.014126864570782e-06, - "loss": 0.6094, - "step": 7460 - }, - { - "epoch": 0.8716721158337225, - "grad_norm": 0.5886437296867371, - "learning_rate": 6.996198353640495e-06, - "loss": 0.6319, - "step": 7465 - }, - { - "epoch": 0.8722559551611396, - "grad_norm": 0.6112352609634399, - "learning_rate": 6.978346289437245e-06, - "loss": 0.6381, - "step": 7470 - }, - { - "epoch": 0.8728397944885568, - "grad_norm": 0.6057114601135254, - "learning_rate": 6.9605707385209755e-06, - "loss": 0.6313, - "step": 7475 - }, - { - "epoch": 0.8734236338159739, - "grad_norm": 0.6247406601905823, - "learning_rate": 6.942871767166354e-06, - "loss": 0.6168, - "step": 7480 - }, - { - "epoch": 0.8740074731433909, - "grad_norm": 0.5990908741950989, - "learning_rate": 6.925249441362533e-06, - "loss": 0.6446, - "step": 7485 - }, - { - "epoch": 0.874591312470808, - "grad_norm": 0.5741786956787109, - "learning_rate": 6.907703826812895e-06, - "loss": 0.625, - "step": 7490 - }, - { - "epoch": 0.8751751517982251, - "grad_norm": 0.6300804615020752, - "learning_rate": 6.89023498893481e-06, - "loss": 0.6195, - "step": 7495 - }, - { - "epoch": 0.8757589911256423, - "grad_norm": 0.5752413868904114, - "learning_rate": 6.872842992859395e-06, - "loss": 0.6418, - "step": 7500 - }, - { - "epoch": 0.8763428304530593, - "grad_norm": 0.5964552164077759, - "learning_rate": 6.855527903431267e-06, - "loss": 0.6547, - "step": 7505 - }, - { - "epoch": 0.8769266697804764, - "grad_norm": 0.6147460341453552, - "learning_rate": 6.838289785208303e-06, - "loss": 0.6223, - "step": 7510 - }, - { - "epoch": 0.8775105091078935, - "grad_norm": 0.6291671991348267, - "learning_rate": 6.821128702461401e-06, - "loss": 0.6227, - "step": 7515 - }, - { - "epoch": 0.8780943484353106, - "grad_norm": 0.6411397457122803, - "learning_rate": 6.804044719174235e-06, - "loss": 0.6389, - "step": 7520 - }, - { - "epoch": 0.8786781877627277, - "grad_norm": 0.6462653279304504, - "learning_rate": 6.787037899043027e-06, - "loss": 0.6489, - "step": 7525 - }, - { - "epoch": 0.8792620270901448, - "grad_norm": 0.626177191734314, - "learning_rate": 6.770108305476293e-06, - "loss": 0.633, - "step": 7530 - }, - { - "epoch": 0.8798458664175619, - "grad_norm": 0.5829746127128601, - "learning_rate": 6.753256001594622e-06, - "loss": 0.6264, - "step": 7535 - }, - { - "epoch": 0.880429705744979, - "grad_norm": 0.6104975938796997, - "learning_rate": 6.736481050230438e-06, - "loss": 0.6494, - "step": 7540 - }, - { - "epoch": 0.881013545072396, - "grad_norm": 0.6572177410125732, - "learning_rate": 6.719783513927755e-06, - "loss": 0.6475, - "step": 7545 - }, - { - "epoch": 0.8815973843998132, - "grad_norm": 0.543458878993988, - "learning_rate": 6.703163454941953e-06, - "loss": 0.6229, - "step": 7550 - }, - { - "epoch": 0.8821812237272303, - "grad_norm": 0.6018583178520203, - "learning_rate": 6.686620935239552e-06, - "loss": 0.6294, - "step": 7555 - }, - { - "epoch": 0.8827650630546474, - "grad_norm": 0.5961351990699768, - "learning_rate": 6.670156016497958e-06, - "loss": 0.6159, - "step": 7560 - }, - { - "epoch": 0.8833489023820644, - "grad_norm": 0.5867674350738525, - "learning_rate": 6.653768760105268e-06, - "loss": 0.632, - "step": 7565 - }, - { - "epoch": 0.8839327417094816, - "grad_norm": 0.5952630043029785, - "learning_rate": 6.637459227160004e-06, - "loss": 0.6268, - "step": 7570 - }, - { - "epoch": 0.8845165810368987, - "grad_norm": 0.5863736271858215, - "learning_rate": 6.621227478470911e-06, - "loss": 0.6382, - "step": 7575 - }, - { - "epoch": 0.8851004203643157, - "grad_norm": 0.6072415113449097, - "learning_rate": 6.605073574556721e-06, - "loss": 0.6474, - "step": 7580 - }, - { - "epoch": 0.8856842596917328, - "grad_norm": 0.6050090789794922, - "learning_rate": 6.588997575645929e-06, - "loss": 0.614, - "step": 7585 - }, - { - "epoch": 0.8862680990191499, - "grad_norm": 0.6141996383666992, - "learning_rate": 6.572999541676563e-06, - "loss": 0.647, - "step": 7590 - }, - { - "epoch": 0.8868519383465671, - "grad_norm": 0.6059297323226929, - "learning_rate": 6.557079532295968e-06, - "loss": 0.6318, - "step": 7595 - }, - { - "epoch": 0.8874357776739841, - "grad_norm": 0.5889031291007996, - "learning_rate": 6.541237606860582e-06, - "loss": 0.6247, - "step": 7600 - }, - { - "epoch": 0.8880196170014012, - "grad_norm": 0.5737789869308472, - "learning_rate": 6.525473824435714e-06, - "loss": 0.6431, - "step": 7605 - }, - { - "epoch": 0.8886034563288183, - "grad_norm": 0.586685836315155, - "learning_rate": 6.5097882437953205e-06, - "loss": 0.6219, - "step": 7610 - }, - { - "epoch": 0.8891872956562354, - "grad_norm": 0.6107763648033142, - "learning_rate": 6.49418092342179e-06, - "loss": 0.6445, - "step": 7615 - }, - { - "epoch": 0.8897711349836525, - "grad_norm": 0.5673049092292786, - "learning_rate": 6.478651921505727e-06, - "loss": 0.6333, - "step": 7620 - }, - { - "epoch": 0.8903549743110696, - "grad_norm": 0.5940205454826355, - "learning_rate": 6.463201295945727e-06, - "loss": 0.6406, - "step": 7625 - }, - { - "epoch": 0.8909388136384867, - "grad_norm": 0.5768308639526367, - "learning_rate": 6.447829104348171e-06, - "loss": 0.6342, - "step": 7630 - }, - { - "epoch": 0.8915226529659038, - "grad_norm": 0.598035454750061, - "learning_rate": 6.432535404026997e-06, - "loss": 0.6314, - "step": 7635 - }, - { - "epoch": 0.8921064922933208, - "grad_norm": 0.5789054036140442, - "learning_rate": 6.417320252003505e-06, - "loss": 0.6234, - "step": 7640 - }, - { - "epoch": 0.892690331620738, - "grad_norm": 0.5792223811149597, - "learning_rate": 6.402183705006127e-06, - "loss": 0.6251, - "step": 7645 - }, - { - "epoch": 0.8932741709481551, - "grad_norm": 0.5863689184188843, - "learning_rate": 6.387125819470231e-06, - "loss": 0.6282, - "step": 7650 - }, - { - "epoch": 0.8938580102755722, - "grad_norm": 0.5922786593437195, - "learning_rate": 6.372146651537892e-06, - "loss": 0.6339, - "step": 7655 - }, - { - "epoch": 0.8944418496029892, - "grad_norm": 0.5875045657157898, - "learning_rate": 6.3572462570576985e-06, - "loss": 0.6371, - "step": 7660 - }, - { - "epoch": 0.8950256889304063, - "grad_norm": 0.6048374772071838, - "learning_rate": 6.3424246915845395e-06, - "loss": 0.6504, - "step": 7665 - }, - { - "epoch": 0.8956095282578235, - "grad_norm": 0.6125035285949707, - "learning_rate": 6.327682010379392e-06, - "loss": 0.6319, - "step": 7670 - }, - { - "epoch": 0.8961933675852406, - "grad_norm": 0.6244831681251526, - "learning_rate": 6.313018268409122e-06, - "loss": 0.6123, - "step": 7675 - }, - { - "epoch": 0.8967772069126576, - "grad_norm": 0.5974205136299133, - "learning_rate": 6.2984335203462825e-06, - "loss": 0.6365, - "step": 7680 - }, - { - "epoch": 0.8973610462400747, - "grad_norm": 0.567466676235199, - "learning_rate": 6.283927820568894e-06, - "loss": 0.6157, - "step": 7685 - }, - { - "epoch": 0.8979448855674919, - "grad_norm": 0.6184478998184204, - "learning_rate": 6.269501223160259e-06, - "loss": 0.6306, - "step": 7690 - }, - { - "epoch": 0.8985287248949089, - "grad_norm": 0.5673630237579346, - "learning_rate": 6.255153781908754e-06, - "loss": 0.6352, - "step": 7695 - }, - { - "epoch": 0.899112564222326, - "grad_norm": 0.5903896689414978, - "learning_rate": 6.240885550307624e-06, - "loss": 0.6123, - "step": 7700 - }, - { - "epoch": 0.8996964035497431, - "grad_norm": 0.6115195155143738, - "learning_rate": 6.2266965815547865e-06, - "loss": 0.6247, - "step": 7705 - }, - { - "epoch": 0.9002802428771602, - "grad_norm": 0.5922051668167114, - "learning_rate": 6.212586928552641e-06, - "loss": 0.616, - "step": 7710 - }, - { - "epoch": 0.9008640822045773, - "grad_norm": 0.5885793566703796, - "learning_rate": 6.19855664390786e-06, - "loss": 0.6263, - "step": 7715 - }, - { - "epoch": 0.9014479215319944, - "grad_norm": 0.6003053784370422, - "learning_rate": 6.184605779931197e-06, - "loss": 0.6427, - "step": 7720 - }, - { - "epoch": 0.9020317608594115, - "grad_norm": 0.6021516919136047, - "learning_rate": 6.170734388637294e-06, - "loss": 0.6344, - "step": 7725 - }, - { - "epoch": 0.9026156001868286, - "grad_norm": 0.5899267792701721, - "learning_rate": 6.156942521744484e-06, - "loss": 0.6189, - "step": 7730 - }, - { - "epoch": 0.9031994395142456, - "grad_norm": 0.5902416110038757, - "learning_rate": 6.143230230674602e-06, - "loss": 0.6285, - "step": 7735 - }, - { - "epoch": 0.9037832788416628, - "grad_norm": 0.5886216759681702, - "learning_rate": 6.12959756655279e-06, - "loss": 0.6269, - "step": 7740 - }, - { - "epoch": 0.9043671181690799, - "grad_norm": 0.621340811252594, - "learning_rate": 6.11604458020731e-06, - "loss": 0.6366, - "step": 7745 - }, - { - "epoch": 0.904950957496497, - "grad_norm": 0.5737428665161133, - "learning_rate": 6.102571322169347e-06, - "loss": 0.6355, - "step": 7750 - }, - { - "epoch": 0.905534796823914, - "grad_norm": 0.6115403771400452, - "learning_rate": 6.089177842672826e-06, - "loss": 0.647, - "step": 7755 - }, - { - "epoch": 0.9061186361513311, - "grad_norm": 0.6112204790115356, - "learning_rate": 6.075864191654231e-06, - "loss": 0.6272, - "step": 7760 - }, - { - "epoch": 0.9067024754787483, - "grad_norm": 0.5775085687637329, - "learning_rate": 6.062630418752404e-06, - "loss": 0.621, - "step": 7765 - }, - { - "epoch": 0.9072863148061654, - "grad_norm": 0.6175455451011658, - "learning_rate": 6.049476573308375e-06, - "loss": 0.6477, - "step": 7770 - }, - { - "epoch": 0.9078701541335824, - "grad_norm": 0.5561819672584534, - "learning_rate": 6.036402704365168e-06, - "loss": 0.6244, - "step": 7775 - }, - { - "epoch": 0.9084539934609995, - "grad_norm": 0.6170573234558105, - "learning_rate": 6.023408860667617e-06, - "loss": 0.6452, - "step": 7780 - }, - { - "epoch": 0.9090378327884167, - "grad_norm": 0.600544810295105, - "learning_rate": 6.010495090662197e-06, - "loss": 0.6213, - "step": 7785 - }, - { - "epoch": 0.9096216721158337, - "grad_norm": 0.5967236161231995, - "learning_rate": 5.9976614424968245e-06, - "loss": 0.6234, - "step": 7790 - }, - { - "epoch": 0.9102055114432508, - "grad_norm": 0.5988119840621948, - "learning_rate": 5.9849079640207e-06, - "loss": 0.6286, - "step": 7795 - }, - { - "epoch": 0.9107893507706679, - "grad_norm": 0.6565080285072327, - "learning_rate": 5.972234702784106e-06, - "loss": 0.6394, - "step": 7800 - }, - { - "epoch": 0.911373190098085, - "grad_norm": 0.6041284799575806, - "learning_rate": 5.9596417060382545e-06, - "loss": 0.631, - "step": 7805 - }, - { - "epoch": 0.911957029425502, - "grad_norm": 0.6047517657279968, - "learning_rate": 5.9471290207350925e-06, - "loss": 0.6126, - "step": 7810 - }, - { - "epoch": 0.9125408687529192, - "grad_norm": 0.6096698045730591, - "learning_rate": 5.934696693527123e-06, - "loss": 0.6324, - "step": 7815 - }, - { - "epoch": 0.9131247080803363, - "grad_norm": 0.5708822011947632, - "learning_rate": 5.9223447707672564e-06, - "loss": 0.6154, - "step": 7820 - }, - { - "epoch": 0.9137085474077534, - "grad_norm": 0.6219106316566467, - "learning_rate": 5.910073298508609e-06, - "loss": 0.6425, - "step": 7825 - }, - { - "epoch": 0.9142923867351704, - "grad_norm": 0.5736826062202454, - "learning_rate": 5.8978823225043555e-06, - "loss": 0.6245, - "step": 7830 - }, - { - "epoch": 0.9148762260625876, - "grad_norm": 0.5958027243614197, - "learning_rate": 5.8857718882075325e-06, - "loss": 0.6459, - "step": 7835 - }, - { - "epoch": 0.9154600653900047, - "grad_norm": 0.5751909017562866, - "learning_rate": 5.8737420407708985e-06, - "loss": 0.6286, - "step": 7840 - }, - { - "epoch": 0.9160439047174218, - "grad_norm": 0.5979501605033875, - "learning_rate": 5.861792825046739e-06, - "loss": 0.6249, - "step": 7845 - }, - { - "epoch": 0.9166277440448388, - "grad_norm": 0.5704393982887268, - "learning_rate": 5.849924285586719e-06, - "loss": 0.6385, - "step": 7850 - }, - { - "epoch": 0.917211583372256, - "grad_norm": 0.5978744029998779, - "learning_rate": 5.838136466641704e-06, - "loss": 0.6286, - "step": 7855 - }, - { - "epoch": 0.9177954226996731, - "grad_norm": 0.5979689955711365, - "learning_rate": 5.8264294121616e-06, - "loss": 0.6268, - "step": 7860 - }, - { - "epoch": 0.9183792620270902, - "grad_norm": 0.6182025074958801, - "learning_rate": 5.814803165795194e-06, - "loss": 0.6333, - "step": 7865 - }, - { - "epoch": 0.9189631013545072, - "grad_norm": 0.5914352536201477, - "learning_rate": 5.803257770889978e-06, - "loss": 0.6403, - "step": 7870 - }, - { - "epoch": 0.9195469406819243, - "grad_norm": 0.6047608256340027, - "learning_rate": 5.791793270492006e-06, - "loss": 0.6325, - "step": 7875 - }, - { - "epoch": 0.9201307800093415, - "grad_norm": 0.5882280468940735, - "learning_rate": 5.780409707345714e-06, - "loss": 0.6465, - "step": 7880 - }, - { - "epoch": 0.9207146193367586, - "grad_norm": 0.5669508576393127, - "learning_rate": 5.769107123893781e-06, - "loss": 0.6176, - "step": 7885 - }, - { - "epoch": 0.9212984586641756, - "grad_norm": 0.567794680595398, - "learning_rate": 5.757885562276948e-06, - "loss": 0.6216, - "step": 7890 - }, - { - "epoch": 0.9218822979915927, - "grad_norm": 0.6060667037963867, - "learning_rate": 5.7467450643338804e-06, - "loss": 0.6289, - "step": 7895 - }, - { - "epoch": 0.9224661373190098, - "grad_norm": 0.5841490626335144, - "learning_rate": 5.7356856716010014e-06, - "loss": 0.6249, - "step": 7900 - }, - { - "epoch": 0.9230499766464269, - "grad_norm": 0.6320383548736572, - "learning_rate": 5.724707425312344e-06, - "loss": 0.6248, - "step": 7905 - }, - { - "epoch": 0.923633815973844, - "grad_norm": 0.5883224010467529, - "learning_rate": 5.7138103663993895e-06, - "loss": 0.6257, - "step": 7910 - }, - { - "epoch": 0.9242176553012611, - "grad_norm": 0.5957557559013367, - "learning_rate": 5.70299453549092e-06, - "loss": 0.6322, - "step": 7915 - }, - { - "epoch": 0.9248014946286782, - "grad_norm": 0.6172184944152832, - "learning_rate": 5.692259972912865e-06, - "loss": 0.6253, - "step": 7920 - }, - { - "epoch": 0.9253853339560952, - "grad_norm": 0.5925536155700684, - "learning_rate": 5.681606718688152e-06, - "loss": 0.6277, - "step": 7925 - }, - { - "epoch": 0.9259691732835124, - "grad_norm": 0.5635936260223389, - "learning_rate": 5.671034812536561e-06, - "loss": 0.6001, - "step": 7930 - }, - { - "epoch": 0.9265530126109295, - "grad_norm": 0.6148142218589783, - "learning_rate": 5.660544293874561e-06, - "loss": 0.6135, - "step": 7935 - }, - { - "epoch": 0.9271368519383466, - "grad_norm": 0.5931411981582642, - "learning_rate": 5.65013520181519e-06, - "loss": 0.6223, - "step": 7940 - }, - { - "epoch": 0.9277206912657636, - "grad_norm": 0.5616611242294312, - "learning_rate": 5.639807575167886e-06, - "loss": 0.6219, - "step": 7945 - }, - { - "epoch": 0.9283045305931807, - "grad_norm": 0.5918108820915222, - "learning_rate": 5.6295614524383436e-06, - "loss": 0.6241, - "step": 7950 - }, - { - "epoch": 0.9288883699205979, - "grad_norm": 0.5980703234672546, - "learning_rate": 5.619396871828387e-06, - "loss": 0.6283, - "step": 7955 - }, - { - "epoch": 0.929472209248015, - "grad_norm": 0.6507945656776428, - "learning_rate": 5.6093138712358155e-06, - "loss": 0.6326, - "step": 7960 - }, - { - "epoch": 0.930056048575432, - "grad_norm": 0.6150537729263306, - "learning_rate": 5.5993124882542584e-06, - "loss": 0.6331, - "step": 7965 - }, - { - "epoch": 0.9306398879028491, - "grad_norm": 0.5972559452056885, - "learning_rate": 5.589392760173047e-06, - "loss": 0.634, - "step": 7970 - }, - { - "epoch": 0.9312237272302663, - "grad_norm": 0.5883549451828003, - "learning_rate": 5.579554723977065e-06, - "loss": 0.6211, - "step": 7975 - }, - { - "epoch": 0.9318075665576834, - "grad_norm": 0.5784002542495728, - "learning_rate": 5.569798416346615e-06, - "loss": 0.6272, - "step": 7980 - }, - { - "epoch": 0.9323914058851004, - "grad_norm": 0.6290662288665771, - "learning_rate": 5.560123873657284e-06, - "loss": 0.622, - "step": 7985 - }, - { - "epoch": 0.9329752452125175, - "grad_norm": 0.5802225470542908, - "learning_rate": 5.550531131979804e-06, - "loss": 0.6316, - "step": 7990 - }, - { - "epoch": 0.9335590845399346, - "grad_norm": 0.5773979425430298, - "learning_rate": 5.5410202270799165e-06, - "loss": 0.6328, - "step": 7995 - }, - { - "epoch": 0.9341429238673518, - "grad_norm": 0.6007452011108398, - "learning_rate": 5.531591194418244e-06, - "loss": 0.6238, - "step": 8000 - }, - { - "epoch": 0.9347267631947688, - "grad_norm": 0.5779427289962769, - "learning_rate": 5.5222440691501534e-06, - "loss": 0.6232, - "step": 8005 - }, - { - "epoch": 0.9353106025221859, - "grad_norm": 0.5845000147819519, - "learning_rate": 5.512978886125628e-06, - "loss": 0.608, - "step": 8010 - }, - { - "epoch": 0.935894441849603, - "grad_norm": 0.5820457935333252, - "learning_rate": 5.5037956798891345e-06, - "loss": 0.6264, - "step": 8015 - }, - { - "epoch": 0.93647828117702, - "grad_norm": 0.6307477355003357, - "learning_rate": 5.494694484679501e-06, - "loss": 0.61, - "step": 8020 - }, - { - "epoch": 0.9370621205044372, - "grad_norm": 0.6008895635604858, - "learning_rate": 5.485675334429776e-06, - "loss": 0.6042, - "step": 8025 - }, - { - "epoch": 0.9376459598318543, - "grad_norm": 0.6024895310401917, - "learning_rate": 5.476738262767116e-06, - "loss": 0.6406, - "step": 8030 - }, - { - "epoch": 0.9382297991592714, - "grad_norm": 0.5925070643424988, - "learning_rate": 5.467883303012653e-06, - "loss": 0.6408, - "step": 8035 - }, - { - "epoch": 0.9388136384866884, - "grad_norm": 0.5511288642883301, - "learning_rate": 5.459110488181373e-06, - "loss": 0.6172, - "step": 8040 - }, - { - "epoch": 0.9393974778141055, - "grad_norm": 0.6154025793075562, - "learning_rate": 5.450419850981987e-06, - "loss": 0.6491, - "step": 8045 - }, - { - "epoch": 0.9399813171415227, - "grad_norm": 0.5883819460868835, - "learning_rate": 5.441811423816817e-06, - "loss": 0.6399, - "step": 8050 - }, - { - "epoch": 0.9405651564689398, - "grad_norm": 0.5946598649024963, - "learning_rate": 5.433285238781674e-06, - "loss": 0.642, - "step": 8055 - }, - { - "epoch": 0.9411489957963568, - "grad_norm": 0.5819993019104004, - "learning_rate": 5.424841327665728e-06, - "loss": 0.6354, - "step": 8060 - }, - { - "epoch": 0.9417328351237739, - "grad_norm": 0.6030775308609009, - "learning_rate": 5.416479721951409e-06, - "loss": 0.6234, - "step": 8065 - }, - { - "epoch": 0.942316674451191, - "grad_norm": 0.5876944065093994, - "learning_rate": 5.408200452814265e-06, - "loss": 0.6171, - "step": 8070 - }, - { - "epoch": 0.9429005137786082, - "grad_norm": 0.5673152804374695, - "learning_rate": 5.400003551122871e-06, - "loss": 0.6362, - "step": 8075 - }, - { - "epoch": 0.9434843531060252, - "grad_norm": 0.6293331980705261, - "learning_rate": 5.391889047438692e-06, - "loss": 0.6237, - "step": 8080 - }, - { - "epoch": 0.9440681924334423, - "grad_norm": 0.6172751188278198, - "learning_rate": 5.383856972015984e-06, - "loss": 0.6222, - "step": 8085 - }, - { - "epoch": 0.9446520317608594, - "grad_norm": 0.6005342602729797, - "learning_rate": 5.3759073548016776e-06, - "loss": 0.6399, - "step": 8090 - }, - { - "epoch": 0.9452358710882766, - "grad_norm": 0.5837724208831787, - "learning_rate": 5.368040225435264e-06, - "loss": 0.6179, - "step": 8095 - }, - { - "epoch": 0.9458197104156936, - "grad_norm": 0.5588687658309937, - "learning_rate": 5.360255613248679e-06, - "loss": 0.6149, - "step": 8100 - }, - { - "epoch": 0.9464035497431107, - "grad_norm": 0.5683503150939941, - "learning_rate": 5.352553547266205e-06, - "loss": 0.6339, - "step": 8105 - }, - { - "epoch": 0.9469873890705278, - "grad_norm": 0.5929042100906372, - "learning_rate": 5.34493405620436e-06, - "loss": 0.6446, - "step": 8110 - }, - { - "epoch": 0.9475712283979448, - "grad_norm": 0.6372909545898438, - "learning_rate": 5.337397168471786e-06, - "loss": 0.6397, - "step": 8115 - }, - { - "epoch": 0.948155067725362, - "grad_norm": 0.6157827973365784, - "learning_rate": 5.329942912169144e-06, - "loss": 0.633, - "step": 8120 - }, - { - "epoch": 0.9487389070527791, - "grad_norm": 0.5792596340179443, - "learning_rate": 5.322571315089009e-06, - "loss": 0.6257, - "step": 8125 - }, - { - "epoch": 0.9493227463801962, - "grad_norm": 0.5713419318199158, - "learning_rate": 5.315282404715776e-06, - "loss": 0.6261, - "step": 8130 - }, - { - "epoch": 0.9499065857076132, - "grad_norm": 0.5683460235595703, - "learning_rate": 5.308076208225538e-06, - "loss": 0.6255, - "step": 8135 - }, - { - "epoch": 0.9504904250350303, - "grad_norm": 0.6410444378852844, - "learning_rate": 5.300952752486006e-06, - "loss": 0.641, - "step": 8140 - }, - { - "epoch": 0.9510742643624475, - "grad_norm": 0.5755616426467896, - "learning_rate": 5.293912064056394e-06, - "loss": 0.6223, - "step": 8145 - }, - { - "epoch": 0.9516581036898646, - "grad_norm": 0.5826494693756104, - "learning_rate": 5.286954169187325e-06, - "loss": 0.616, - "step": 8150 - }, - { - "epoch": 0.9522419430172816, - "grad_norm": 0.6531776189804077, - "learning_rate": 5.280079093820737e-06, - "loss": 0.6445, - "step": 8155 - }, - { - "epoch": 0.9528257823446987, - "grad_norm": 0.5947822332382202, - "learning_rate": 5.273286863589776e-06, - "loss": 0.6408, - "step": 8160 - }, - { - "epoch": 0.9534096216721158, - "grad_norm": 0.5781073570251465, - "learning_rate": 5.266577503818708e-06, - "loss": 0.6522, - "step": 8165 - }, - { - "epoch": 0.953993460999533, - "grad_norm": 0.5978535413742065, - "learning_rate": 5.259951039522832e-06, - "loss": 0.6454, - "step": 8170 - }, - { - "epoch": 0.95457730032695, - "grad_norm": 0.6118335723876953, - "learning_rate": 5.253407495408368e-06, - "loss": 0.6232, - "step": 8175 - }, - { - "epoch": 0.9551611396543671, - "grad_norm": 0.5704180598258972, - "learning_rate": 5.24694689587238e-06, - "loss": 0.6236, - "step": 8180 - }, - { - "epoch": 0.9557449789817842, - "grad_norm": 0.5849665403366089, - "learning_rate": 5.240569265002673e-06, - "loss": 0.6233, - "step": 8185 - }, - { - "epoch": 0.9563288183092014, - "grad_norm": 0.5962663888931274, - "learning_rate": 5.234274626577723e-06, - "loss": 0.6417, - "step": 8190 - }, - { - "epoch": 0.9569126576366184, - "grad_norm": 0.6095257997512817, - "learning_rate": 5.228063004066567e-06, - "loss": 0.6181, - "step": 8195 - }, - { - "epoch": 0.9574964969640355, - "grad_norm": 0.6306756138801575, - "learning_rate": 5.22193442062872e-06, - "loss": 0.6226, - "step": 8200 - }, - { - "epoch": 0.9580803362914526, - "grad_norm": 0.6289014220237732, - "learning_rate": 5.2158888991141055e-06, - "loss": 0.6336, - "step": 8205 - }, - { - "epoch": 0.9586641756188697, - "grad_norm": 0.5935418009757996, - "learning_rate": 5.2099264620629425e-06, - "loss": 0.6345, - "step": 8210 - }, - { - "epoch": 0.9592480149462868, - "grad_norm": 0.5954404473304749, - "learning_rate": 5.204047131705689e-06, - "loss": 0.6197, - "step": 8215 - }, - { - "epoch": 0.9598318542737039, - "grad_norm": 0.6127824187278748, - "learning_rate": 5.198250929962939e-06, - "loss": 0.6406, - "step": 8220 - }, - { - "epoch": 0.960415693601121, - "grad_norm": 0.5969098806381226, - "learning_rate": 5.192537878445356e-06, - "loss": 0.6379, - "step": 8225 - }, - { - "epoch": 0.960999532928538, - "grad_norm": 0.5721200108528137, - "learning_rate": 5.186907998453573e-06, - "loss": 0.6242, - "step": 8230 - }, - { - "epoch": 0.9615833722559551, - "grad_norm": 0.6153504848480225, - "learning_rate": 5.181361310978133e-06, - "loss": 0.616, - "step": 8235 - }, - { - "epoch": 0.9621672115833723, - "grad_norm": 0.577362596988678, - "learning_rate": 5.175897836699403e-06, - "loss": 0.6431, - "step": 8240 - }, - { - "epoch": 0.9627510509107894, - "grad_norm": 0.6139353513717651, - "learning_rate": 5.170517595987493e-06, - "loss": 0.6335, - "step": 8245 - }, - { - "epoch": 0.9633348902382064, - "grad_norm": 0.6440641283988953, - "learning_rate": 5.165220608902186e-06, - "loss": 0.6284, - "step": 8250 - }, - { - "epoch": 0.9639187295656235, - "grad_norm": 0.6276947855949402, - "learning_rate": 5.160006895192858e-06, - "loss": 0.6277, - "step": 8255 - }, - { - "epoch": 0.9645025688930406, - "grad_norm": 0.6053177714347839, - "learning_rate": 5.154876474298412e-06, - "loss": 0.6083, - "step": 8260 - }, - { - "epoch": 0.9650864082204578, - "grad_norm": 0.6250970363616943, - "learning_rate": 5.149829365347197e-06, - "loss": 0.6292, - "step": 8265 - }, - { - "epoch": 0.9656702475478748, - "grad_norm": 0.5911844372749329, - "learning_rate": 5.14486558715694e-06, - "loss": 0.6333, - "step": 8270 - }, - { - "epoch": 0.9662540868752919, - "grad_norm": 0.6166476607322693, - "learning_rate": 5.139985158234677e-06, - "loss": 0.6653, - "step": 8275 - }, - { - "epoch": 0.966837926202709, - "grad_norm": 0.6100538372993469, - "learning_rate": 5.135188096776682e-06, - "loss": 0.6265, - "step": 8280 - }, - { - "epoch": 0.9674217655301262, - "grad_norm": 0.5892276167869568, - "learning_rate": 5.130474420668403e-06, - "loss": 0.6396, - "step": 8285 - }, - { - "epoch": 0.9680056048575432, - "grad_norm": 0.6185021996498108, - "learning_rate": 5.125844147484391e-06, - "loss": 0.6195, - "step": 8290 - }, - { - "epoch": 0.9685894441849603, - "grad_norm": 0.6628878116607666, - "learning_rate": 5.121297294488237e-06, - "loss": 0.6511, - "step": 8295 - }, - { - "epoch": 0.9691732835123774, - "grad_norm": 0.5766160488128662, - "learning_rate": 5.1168338786325025e-06, - "loss": 0.624, - "step": 8300 - }, - { - "epoch": 0.9697571228397945, - "grad_norm": 0.6637783050537109, - "learning_rate": 5.112453916558671e-06, - "loss": 0.6489, - "step": 8305 - }, - { - "epoch": 0.9703409621672116, - "grad_norm": 0.5742714405059814, - "learning_rate": 5.108157424597062e-06, - "loss": 0.6226, - "step": 8310 - }, - { - "epoch": 0.9709248014946287, - "grad_norm": 0.6008113026618958, - "learning_rate": 5.103944418766791e-06, - "loss": 0.6345, - "step": 8315 - }, - { - "epoch": 0.9715086408220458, - "grad_norm": 0.61330246925354, - "learning_rate": 5.099814914775706e-06, - "loss": 0.6228, - "step": 8320 - }, - { - "epoch": 0.9720924801494629, - "grad_norm": 0.6004528999328613, - "learning_rate": 5.095768928020314e-06, - "loss": 0.6389, - "step": 8325 - }, - { - "epoch": 0.9726763194768799, - "grad_norm": 0.6562114953994751, - "learning_rate": 5.09180647358575e-06, - "loss": 0.6241, - "step": 8330 - }, - { - "epoch": 0.9732601588042971, - "grad_norm": 0.6316921710968018, - "learning_rate": 5.087927566245688e-06, - "loss": 0.6227, - "step": 8335 - }, - { - "epoch": 0.9738439981317142, - "grad_norm": 0.5536362528800964, - "learning_rate": 5.0841322204623205e-06, - "loss": 0.6169, - "step": 8340 - }, - { - "epoch": 0.9744278374591312, - "grad_norm": 0.6447915434837341, - "learning_rate": 5.080420450386274e-06, - "loss": 0.6292, - "step": 8345 - }, - { - "epoch": 0.9750116767865483, - "grad_norm": 0.6012244820594788, - "learning_rate": 5.076792269856582e-06, - "loss": 0.6256, - "step": 8350 - }, - { - "epoch": 0.9755955161139654, - "grad_norm": 0.6139721274375916, - "learning_rate": 5.073247692400609e-06, - "loss": 0.6191, - "step": 8355 - }, - { - "epoch": 0.9761793554413826, - "grad_norm": 0.5939144492149353, - "learning_rate": 5.069786731234025e-06, - "loss": 0.6354, - "step": 8360 - }, - { - "epoch": 0.9767631947687996, - "grad_norm": 0.580795168876648, - "learning_rate": 5.066409399260733e-06, - "loss": 0.6474, - "step": 8365 - }, - { - "epoch": 0.9773470340962167, - "grad_norm": 0.5682657957077026, - "learning_rate": 5.063115709072837e-06, - "loss": 0.6357, - "step": 8370 - }, - { - "epoch": 0.9779308734236338, - "grad_norm": 0.5991390347480774, - "learning_rate": 5.059905672950588e-06, - "loss": 0.6336, - "step": 8375 - }, - { - "epoch": 0.978514712751051, - "grad_norm": 0.6206997036933899, - "learning_rate": 5.056779302862337e-06, - "loss": 0.6343, - "step": 8380 - }, - { - "epoch": 0.979098552078468, - "grad_norm": 0.5906264185905457, - "learning_rate": 5.0537366104645e-06, - "loss": 0.6188, - "step": 8385 - }, - { - "epoch": 0.9796823914058851, - "grad_norm": 0.6073142290115356, - "learning_rate": 5.050777607101506e-06, - "loss": 0.6267, - "step": 8390 - }, - { - "epoch": 0.9802662307333022, - "grad_norm": 0.6208410263061523, - "learning_rate": 5.047902303805746e-06, - "loss": 0.6476, - "step": 8395 - }, - { - "epoch": 0.9808500700607193, - "grad_norm": 0.5946664214134216, - "learning_rate": 5.045110711297557e-06, - "loss": 0.6251, - "step": 8400 - }, - { - "epoch": 0.9814339093881363, - "grad_norm": 0.5699525475502014, - "learning_rate": 5.042402839985161e-06, - "loss": 0.6127, - "step": 8405 - }, - { - "epoch": 0.9820177487155535, - "grad_norm": 0.6208629608154297, - "learning_rate": 5.039778699964626e-06, - "loss": 0.6373, - "step": 8410 - }, - { - "epoch": 0.9826015880429706, - "grad_norm": 0.5897505283355713, - "learning_rate": 5.037238301019845e-06, - "loss": 0.614, - "step": 8415 - }, - { - "epoch": 0.9831854273703877, - "grad_norm": 0.5908355712890625, - "learning_rate": 5.034781652622484e-06, - "loss": 0.6166, - "step": 8420 - }, - { - "epoch": 0.9837692666978047, - "grad_norm": 0.5870689749717712, - "learning_rate": 5.032408763931956e-06, - "loss": 0.6014, - "step": 8425 - }, - { - "epoch": 0.9843531060252219, - "grad_norm": 0.6261207461357117, - "learning_rate": 5.0301196437953755e-06, - "loss": 0.6223, - "step": 8430 - }, - { - "epoch": 0.984936945352639, - "grad_norm": 0.5960336327552795, - "learning_rate": 5.0279143007475425e-06, - "loss": 0.634, - "step": 8435 - }, - { - "epoch": 0.985520784680056, - "grad_norm": 0.6142226457595825, - "learning_rate": 5.02579274301089e-06, - "loss": 0.6324, - "step": 8440 - }, - { - "epoch": 0.9861046240074731, - "grad_norm": 0.6348884701728821, - "learning_rate": 5.0237549784954745e-06, - "loss": 0.6079, - "step": 8445 - }, - { - "epoch": 0.9866884633348902, - "grad_norm": 0.612058699131012, - "learning_rate": 5.021801014798933e-06, - "loss": 0.6142, - "step": 8450 - }, - { - "epoch": 0.9872723026623074, - "grad_norm": 0.5645710229873657, - "learning_rate": 5.0199308592064535e-06, - "loss": 0.6177, - "step": 8455 - }, - { - "epoch": 0.9878561419897244, - "grad_norm": 0.5899025201797485, - "learning_rate": 5.018144518690761e-06, - "loss": 0.6245, - "step": 8460 - }, - { - "epoch": 0.9884399813171415, - "grad_norm": 0.5586448907852173, - "learning_rate": 5.016441999912074e-06, - "loss": 0.6256, - "step": 8465 - }, - { - "epoch": 0.9890238206445586, - "grad_norm": 0.6206201910972595, - "learning_rate": 5.014823309218096e-06, - "loss": 0.6223, - "step": 8470 - }, - { - "epoch": 0.9896076599719758, - "grad_norm": 0.6235098838806152, - "learning_rate": 5.013288452643979e-06, - "loss": 0.6358, - "step": 8475 - }, - { - "epoch": 0.9901914992993928, - "grad_norm": 0.5994392037391663, - "learning_rate": 5.011837435912308e-06, - "loss": 0.6244, - "step": 8480 - }, - { - "epoch": 0.9907753386268099, - "grad_norm": 0.5622158050537109, - "learning_rate": 5.010470264433083e-06, - "loss": 0.6322, - "step": 8485 - }, - { - "epoch": 0.991359177954227, - "grad_norm": 0.5598063468933105, - "learning_rate": 5.009186943303684e-06, - "loss": 0.6369, - "step": 8490 - }, - { - "epoch": 0.9919430172816441, - "grad_norm": 0.5687846541404724, - "learning_rate": 5.0079874773088735e-06, - "loss": 0.6308, - "step": 8495 - }, - { - "epoch": 0.9925268566090611, - "grad_norm": 0.5996575355529785, - "learning_rate": 5.006871870920757e-06, - "loss": 0.6201, - "step": 8500 - }, - { - "epoch": 0.9931106959364783, - "grad_norm": 0.5555761456489563, - "learning_rate": 5.005840128298783e-06, - "loss": 0.6222, - "step": 8505 - }, - { - "epoch": 0.9936945352638954, - "grad_norm": 0.5931857824325562, - "learning_rate": 5.004892253289714e-06, - "loss": 0.6309, - "step": 8510 - }, - { - "epoch": 0.9942783745913125, - "grad_norm": 0.6039702296257019, - "learning_rate": 5.004028249427629e-06, - "loss": 0.6222, - "step": 8515 - }, - { - "epoch": 0.9948622139187295, - "grad_norm": 0.5616840720176697, - "learning_rate": 5.003248119933894e-06, - "loss": 0.6139, - "step": 8520 - }, - { - "epoch": 0.9954460532461467, - "grad_norm": 0.5761851072311401, - "learning_rate": 5.002551867717153e-06, - "loss": 0.6154, - "step": 8525 - }, - { - "epoch": 0.9960298925735638, - "grad_norm": 0.6196095943450928, - "learning_rate": 5.00193949537333e-06, - "loss": 0.6453, - "step": 8530 - }, - { - "epoch": 0.9966137319009809, - "grad_norm": 0.6085227727890015, - "learning_rate": 5.0014110051856e-06, - "loss": 0.6533, - "step": 8535 - }, - { - "epoch": 0.9971975712283979, - "grad_norm": 0.5806092619895935, - "learning_rate": 5.000966399124398e-06, - "loss": 0.6292, - "step": 8540 - }, - { - "epoch": 0.997781410555815, - "grad_norm": 0.6217420101165771, - "learning_rate": 5.000605678847399e-06, - "loss": 0.6206, - "step": 8545 - }, - { - "epoch": 0.9983652498832322, - "grad_norm": 0.5705360174179077, - "learning_rate": 5.000328845699522e-06, - "loss": 0.6254, - "step": 8550 - }, - { - "epoch": 0.9989490892106492, - "grad_norm": 0.6102194786071777, - "learning_rate": 5.000135900712914e-06, - "loss": 0.6245, - "step": 8555 - }, - { - "epoch": 0.9995329285380663, - "grad_norm": 0.5798147320747375, - "learning_rate": 5.000026844606953e-06, - "loss": 0.6312, - "step": 8560 - }, { "epoch": 1.0, - "step": 8564, - "total_flos": 4.412574272610894e+18, - "train_loss": 0.6859193018305697, - "train_runtime": 18680.5054, - "train_samples_per_second": 29.34, - "train_steps_per_second": 0.458 + "num_tokens": 560311272.0, + "step": 733, + "total_flos": 1147472759488512.0, + "train_loss": 0.5344447073149323, + "train_runtime": 5883.7745, + "train_samples_per_second": 15.931, + "train_steps_per_second": 0.125 } ], "logging_steps": 5, - "max_steps": 8564, + "max_steps": 733, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, @@ -12019,7 +1205,7 @@ "attributes": {} } }, - "total_flos": 4.412574272610894e+18, + "total_flos": 1147472759488512.0, "train_batch_size": 16, "trial_name": null, "trial_params": null