diff --git "a/checkpoint-1110/trainer_state.json" "b/checkpoint-1110/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1110/trainer_state.json" @@ -0,0 +1,7804 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2525597269624573, + "eval_steps": 500, + "global_step": 1110, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00022753128555176336, + "grad_norm": 12.279616595766896, + "learning_rate": 1.25e-06, + "loss": 0.1112, + "step": 1 + }, + { + "epoch": 0.0004550625711035267, + "grad_norm": 7.2629005990045785, + "learning_rate": 1.2499999936130725e-06, + "loss": 0.1394, + "step": 2 + }, + { + "epoch": 0.0006825938566552901, + "grad_norm": 9.061755966937882, + "learning_rate": 1.2499999744522896e-06, + "loss": 0.266, + "step": 3 + }, + { + "epoch": 0.0009101251422070534, + "grad_norm": 4.366742138714415, + "learning_rate": 1.2499999425176518e-06, + "loss": 0.1258, + "step": 4 + }, + { + "epoch": 0.0011376564277588168, + "grad_norm": 8.875041328214682, + "learning_rate": 1.2499998978091598e-06, + "loss": 0.1749, + "step": 5 + }, + { + "epoch": 0.0013651877133105802, + "grad_norm": 8.800851302952648, + "learning_rate": 1.2499998403268147e-06, + "loss": 0.2, + "step": 6 + }, + { + "epoch": 0.0015927189988623437, + "grad_norm": 3.6087293622719416, + "learning_rate": 1.2499997700706173e-06, + "loss": 0.1398, + "step": 7 + }, + { + "epoch": 0.0018202502844141069, + "grad_norm": 3.42318784466345, + "learning_rate": 1.2499996870405692e-06, + "loss": 0.1226, + "step": 8 + }, + { + "epoch": 0.0020477815699658703, + "grad_norm": 10.818031536677639, + "learning_rate": 1.2499995912366722e-06, + "loss": 0.171, + "step": 9 + }, + { + "epoch": 0.0022753128555176336, + "grad_norm": 4.021373220825307, + "learning_rate": 1.2499994826589282e-06, + "loss": 0.1326, + "step": 10 + }, + { + "epoch": 0.002502844141069397, + "grad_norm": 23.4957115965293, + "learning_rate": 1.2499993613073393e-06, + "loss": 0.1235, + "step": 11 + }, + { + "epoch": 0.0027303754266211604, + "grad_norm": 21.545758763913824, + "learning_rate": 1.2499992271819083e-06, + "loss": 0.0925, + "step": 12 + }, + { + "epoch": 0.0029579067121729237, + "grad_norm": 3.510359233156002, + "learning_rate": 1.2499990802826377e-06, + "loss": 0.0985, + "step": 13 + }, + { + "epoch": 0.0031854379977246873, + "grad_norm": 3.9794392758691632, + "learning_rate": 1.2499989206095304e-06, + "loss": 0.0906, + "step": 14 + }, + { + "epoch": 0.0034129692832764505, + "grad_norm": 2.4759900062021503, + "learning_rate": 1.2499987481625899e-06, + "loss": 0.0746, + "step": 15 + }, + { + "epoch": 0.0036405005688282138, + "grad_norm": 7.91228269690243, + "learning_rate": 1.2499985629418195e-06, + "loss": 0.1381, + "step": 16 + }, + { + "epoch": 0.0038680318543799774, + "grad_norm": 3.050431958605509, + "learning_rate": 1.2499983649472233e-06, + "loss": 0.0509, + "step": 17 + }, + { + "epoch": 0.004095563139931741, + "grad_norm": 9.537504830414553, + "learning_rate": 1.249998154178805e-06, + "loss": 0.0658, + "step": 18 + }, + { + "epoch": 0.004323094425483504, + "grad_norm": 5.663160798723299, + "learning_rate": 1.2499979306365692e-06, + "loss": 0.097, + "step": 19 + }, + { + "epoch": 0.004550625711035267, + "grad_norm": 13.411158998212743, + "learning_rate": 1.2499976943205202e-06, + "loss": 0.1, + "step": 20 + }, + { + "epoch": 0.00477815699658703, + "grad_norm": 3.6621965264316194, + "learning_rate": 1.249997445230663e-06, + "loss": 0.0698, + "step": 21 + }, + { + "epoch": 0.005005688282138794, + "grad_norm": 31.354306922977063, + "learning_rate": 1.2499971833670026e-06, + "loss": 0.1129, + "step": 22 + }, + { + "epoch": 0.005233219567690558, + "grad_norm": 4.025175447301565, + "learning_rate": 1.2499969087295443e-06, + "loss": 0.1076, + "step": 23 + }, + { + "epoch": 0.005460750853242321, + "grad_norm": 3.7989823695443863, + "learning_rate": 1.249996621318294e-06, + "loss": 0.0902, + "step": 24 + }, + { + "epoch": 0.005688282138794084, + "grad_norm": 2.2185511399815545, + "learning_rate": 1.2499963211332573e-06, + "loss": 0.0398, + "step": 25 + }, + { + "epoch": 0.005915813424345847, + "grad_norm": 3.775038573479949, + "learning_rate": 1.2499960081744405e-06, + "loss": 0.1145, + "step": 26 + }, + { + "epoch": 0.0061433447098976105, + "grad_norm": 7.295114622894012, + "learning_rate": 1.24999568244185e-06, + "loss": 0.1379, + "step": 27 + }, + { + "epoch": 0.006370875995449375, + "grad_norm": 21.290899699279613, + "learning_rate": 1.249995343935492e-06, + "loss": 0.0995, + "step": 28 + }, + { + "epoch": 0.006598407281001138, + "grad_norm": 3.593142181899087, + "learning_rate": 1.2499949926553743e-06, + "loss": 0.0973, + "step": 29 + }, + { + "epoch": 0.006825938566552901, + "grad_norm": 7.737004888367107, + "learning_rate": 1.2499946286015032e-06, + "loss": 0.09, + "step": 30 + }, + { + "epoch": 0.007053469852104664, + "grad_norm": 2.955413576195516, + "learning_rate": 1.2499942517738867e-06, + "loss": 0.0625, + "step": 31 + }, + { + "epoch": 0.0072810011376564275, + "grad_norm": 4.138230802608221, + "learning_rate": 1.2499938621725322e-06, + "loss": 0.0749, + "step": 32 + }, + { + "epoch": 0.007508532423208191, + "grad_norm": 2.2958240484880115, + "learning_rate": 1.2499934597974478e-06, + "loss": 0.0986, + "step": 33 + }, + { + "epoch": 0.007736063708759955, + "grad_norm": 3.9060226417706256, + "learning_rate": 1.2499930446486416e-06, + "loss": 0.1296, + "step": 34 + }, + { + "epoch": 0.007963594994311717, + "grad_norm": 1.5544518249405173, + "learning_rate": 1.2499926167261224e-06, + "loss": 0.0598, + "step": 35 + }, + { + "epoch": 0.008191126279863481, + "grad_norm": 5.2971839668544884, + "learning_rate": 1.2499921760298987e-06, + "loss": 0.1216, + "step": 36 + }, + { + "epoch": 0.008418657565415245, + "grad_norm": 6.168021178604936, + "learning_rate": 1.2499917225599796e-06, + "loss": 0.1257, + "step": 37 + }, + { + "epoch": 0.008646188850967008, + "grad_norm": 5.087771081836934, + "learning_rate": 1.2499912563163742e-06, + "loss": 0.0695, + "step": 38 + }, + { + "epoch": 0.008873720136518772, + "grad_norm": 7.633032375201886, + "learning_rate": 1.249990777299092e-06, + "loss": 0.0522, + "step": 39 + }, + { + "epoch": 0.009101251422070534, + "grad_norm": 2.9414497498297734, + "learning_rate": 1.249990285508143e-06, + "loss": 0.081, + "step": 40 + }, + { + "epoch": 0.009328782707622298, + "grad_norm": 4.449234015706431, + "learning_rate": 1.2499897809435374e-06, + "loss": 0.1104, + "step": 41 + }, + { + "epoch": 0.00955631399317406, + "grad_norm": 17.29783348718493, + "learning_rate": 1.249989263605285e-06, + "loss": 0.0952, + "step": 42 + }, + { + "epoch": 0.009783845278725825, + "grad_norm": 4.357226856628825, + "learning_rate": 1.249988733493397e-06, + "loss": 0.0639, + "step": 43 + }, + { + "epoch": 0.010011376564277589, + "grad_norm": 4.248351802911978, + "learning_rate": 1.2499881906078836e-06, + "loss": 0.0845, + "step": 44 + }, + { + "epoch": 0.010238907849829351, + "grad_norm": 2.230593274135571, + "learning_rate": 1.2499876349487564e-06, + "loss": 0.1031, + "step": 45 + }, + { + "epoch": 0.010466439135381115, + "grad_norm": 5.02333552530202, + "learning_rate": 1.2499870665160262e-06, + "loss": 0.0763, + "step": 46 + }, + { + "epoch": 0.010693970420932878, + "grad_norm": 1.7336741239976143, + "learning_rate": 1.2499864853097054e-06, + "loss": 0.0495, + "step": 47 + }, + { + "epoch": 0.010921501706484642, + "grad_norm": 4.9863935956267325, + "learning_rate": 1.2499858913298053e-06, + "loss": 0.1674, + "step": 48 + }, + { + "epoch": 0.011149032992036406, + "grad_norm": 2.8583968641628967, + "learning_rate": 1.249985284576338e-06, + "loss": 0.0728, + "step": 49 + }, + { + "epoch": 0.011376564277588168, + "grad_norm": 5.82977520269559, + "learning_rate": 1.2499846650493164e-06, + "loss": 0.1076, + "step": 50 + }, + { + "epoch": 0.011604095563139932, + "grad_norm": 2.407346961463521, + "learning_rate": 1.2499840327487528e-06, + "loss": 0.0542, + "step": 51 + }, + { + "epoch": 0.011831626848691695, + "grad_norm": 1.7581308929371289, + "learning_rate": 1.24998338767466e-06, + "loss": 0.0851, + "step": 52 + }, + { + "epoch": 0.012059158134243459, + "grad_norm": 3.6069730541490315, + "learning_rate": 1.2499827298270515e-06, + "loss": 0.0771, + "step": 53 + }, + { + "epoch": 0.012286689419795221, + "grad_norm": 2.6384830282527965, + "learning_rate": 1.2499820592059405e-06, + "loss": 0.1336, + "step": 54 + }, + { + "epoch": 0.012514220705346985, + "grad_norm": 3.1162890015900793, + "learning_rate": 1.2499813758113409e-06, + "loss": 0.1058, + "step": 55 + }, + { + "epoch": 0.01274175199089875, + "grad_norm": 3.5247536707308966, + "learning_rate": 1.2499806796432665e-06, + "loss": 0.0717, + "step": 56 + }, + { + "epoch": 0.012969283276450512, + "grad_norm": 2.2371436604921584, + "learning_rate": 1.2499799707017315e-06, + "loss": 0.07, + "step": 57 + }, + { + "epoch": 0.013196814562002276, + "grad_norm": 5.912440334168246, + "learning_rate": 1.2499792489867508e-06, + "loss": 0.1588, + "step": 58 + }, + { + "epoch": 0.013424345847554038, + "grad_norm": 1.6415965725415542, + "learning_rate": 1.2499785144983386e-06, + "loss": 0.0587, + "step": 59 + }, + { + "epoch": 0.013651877133105802, + "grad_norm": 3.921979088226587, + "learning_rate": 1.24997776723651e-06, + "loss": 0.1403, + "step": 60 + }, + { + "epoch": 0.013879408418657566, + "grad_norm": 4.854246136713843, + "learning_rate": 1.2499770072012809e-06, + "loss": 0.0706, + "step": 61 + }, + { + "epoch": 0.014106939704209329, + "grad_norm": 2.249851975045142, + "learning_rate": 1.2499762343926661e-06, + "loss": 0.0988, + "step": 62 + }, + { + "epoch": 0.014334470989761093, + "grad_norm": 2.5833858090734956, + "learning_rate": 1.2499754488106817e-06, + "loss": 0.0618, + "step": 63 + }, + { + "epoch": 0.014562002275312855, + "grad_norm": 3.523489637935598, + "learning_rate": 1.2499746504553436e-06, + "loss": 0.0788, + "step": 64 + }, + { + "epoch": 0.01478953356086462, + "grad_norm": 1.4862582066571077, + "learning_rate": 1.2499738393266684e-06, + "loss": 0.0574, + "step": 65 + }, + { + "epoch": 0.015017064846416382, + "grad_norm": 3.5354676602669506, + "learning_rate": 1.2499730154246726e-06, + "loss": 0.0742, + "step": 66 + }, + { + "epoch": 0.015244596131968146, + "grad_norm": 4.702124240777191, + "learning_rate": 1.2499721787493726e-06, + "loss": 0.0994, + "step": 67 + }, + { + "epoch": 0.01547212741751991, + "grad_norm": 2.2735257693436854, + "learning_rate": 1.2499713293007862e-06, + "loss": 0.0728, + "step": 68 + }, + { + "epoch": 0.015699658703071672, + "grad_norm": 1.7727324277447247, + "learning_rate": 1.2499704670789301e-06, + "loss": 0.0492, + "step": 69 + }, + { + "epoch": 0.015927189988623434, + "grad_norm": 2.0842450205223977, + "learning_rate": 1.2499695920838225e-06, + "loss": 0.0552, + "step": 70 + }, + { + "epoch": 0.0161547212741752, + "grad_norm": 3.5228642040697116, + "learning_rate": 1.2499687043154809e-06, + "loss": 0.0556, + "step": 71 + }, + { + "epoch": 0.016382252559726963, + "grad_norm": 2.797328413443058, + "learning_rate": 1.2499678037739235e-06, + "loss": 0.0519, + "step": 72 + }, + { + "epoch": 0.016609783845278725, + "grad_norm": 3.349987636757497, + "learning_rate": 1.2499668904591688e-06, + "loss": 0.1053, + "step": 73 + }, + { + "epoch": 0.01683731513083049, + "grad_norm": 4.71488578944105, + "learning_rate": 1.2499659643712356e-06, + "loss": 0.1072, + "step": 74 + }, + { + "epoch": 0.017064846416382253, + "grad_norm": 1.9864905040446388, + "learning_rate": 1.2499650255101425e-06, + "loss": 0.1129, + "step": 75 + }, + { + "epoch": 0.017292377701934016, + "grad_norm": 3.9786000792659073, + "learning_rate": 1.2499640738759088e-06, + "loss": 0.0798, + "step": 76 + }, + { + "epoch": 0.017519908987485778, + "grad_norm": 3.293845214395531, + "learning_rate": 1.249963109468554e-06, + "loss": 0.1235, + "step": 77 + }, + { + "epoch": 0.017747440273037544, + "grad_norm": 5.489256171335804, + "learning_rate": 1.2499621322880979e-06, + "loss": 0.0439, + "step": 78 + }, + { + "epoch": 0.017974971558589306, + "grad_norm": 5.65488778928505, + "learning_rate": 1.2499611423345604e-06, + "loss": 0.0715, + "step": 79 + }, + { + "epoch": 0.01820250284414107, + "grad_norm": 1.7907281324160922, + "learning_rate": 1.2499601396079617e-06, + "loss": 0.0668, + "step": 80 + }, + { + "epoch": 0.018430034129692834, + "grad_norm": 2.5432248323972377, + "learning_rate": 1.2499591241083222e-06, + "loss": 0.0836, + "step": 81 + }, + { + "epoch": 0.018657565415244597, + "grad_norm": 2.6549098221779146, + "learning_rate": 1.2499580958356628e-06, + "loss": 0.0612, + "step": 82 + }, + { + "epoch": 0.01888509670079636, + "grad_norm": 3.6428276701171685, + "learning_rate": 1.2499570547900045e-06, + "loss": 0.0713, + "step": 83 + }, + { + "epoch": 0.01911262798634812, + "grad_norm": 3.8990952294988994, + "learning_rate": 1.2499560009713684e-06, + "loss": 0.1046, + "step": 84 + }, + { + "epoch": 0.019340159271899887, + "grad_norm": 1.7241928848576353, + "learning_rate": 1.2499549343797764e-06, + "loss": 0.0759, + "step": 85 + }, + { + "epoch": 0.01956769055745165, + "grad_norm": 2.2613238963696545, + "learning_rate": 1.24995385501525e-06, + "loss": 0.0931, + "step": 86 + }, + { + "epoch": 0.019795221843003412, + "grad_norm": 2.3167104475270492, + "learning_rate": 1.2499527628778116e-06, + "loss": 0.0775, + "step": 87 + }, + { + "epoch": 0.020022753128555178, + "grad_norm": 2.631127335335558, + "learning_rate": 1.2499516579674831e-06, + "loss": 0.0911, + "step": 88 + }, + { + "epoch": 0.02025028441410694, + "grad_norm": 3.902893778838773, + "learning_rate": 1.2499505402842872e-06, + "loss": 0.1129, + "step": 89 + }, + { + "epoch": 0.020477815699658702, + "grad_norm": 2.6988246720898905, + "learning_rate": 1.2499494098282469e-06, + "loss": 0.088, + "step": 90 + }, + { + "epoch": 0.020705346985210465, + "grad_norm": 1.4451215893923708, + "learning_rate": 1.2499482665993851e-06, + "loss": 0.0521, + "step": 91 + }, + { + "epoch": 0.02093287827076223, + "grad_norm": 3.920423356576455, + "learning_rate": 1.2499471105977252e-06, + "loss": 0.079, + "step": 92 + }, + { + "epoch": 0.021160409556313993, + "grad_norm": 3.1520828274033486, + "learning_rate": 1.249945941823291e-06, + "loss": 0.1206, + "step": 93 + }, + { + "epoch": 0.021387940841865755, + "grad_norm": 6.767303206340345, + "learning_rate": 1.2499447602761063e-06, + "loss": 0.2231, + "step": 94 + }, + { + "epoch": 0.02161547212741752, + "grad_norm": 8.857984803581953, + "learning_rate": 1.2499435659561954e-06, + "loss": 0.2288, + "step": 95 + }, + { + "epoch": 0.021843003412969283, + "grad_norm": 3.2203370791462302, + "learning_rate": 1.2499423588635823e-06, + "loss": 0.1181, + "step": 96 + }, + { + "epoch": 0.022070534698521046, + "grad_norm": 2.3890790390173864, + "learning_rate": 1.2499411389982919e-06, + "loss": 0.0546, + "step": 97 + }, + { + "epoch": 0.02229806598407281, + "grad_norm": 3.9314083232645953, + "learning_rate": 1.2499399063603492e-06, + "loss": 0.1202, + "step": 98 + }, + { + "epoch": 0.022525597269624574, + "grad_norm": 1.534409734668097, + "learning_rate": 1.2499386609497793e-06, + "loss": 0.0575, + "step": 99 + }, + { + "epoch": 0.022753128555176336, + "grad_norm": 2.6915769569708163, + "learning_rate": 1.2499374027666078e-06, + "loss": 0.0865, + "step": 100 + }, + { + "epoch": 0.0229806598407281, + "grad_norm": 1.9363626293552272, + "learning_rate": 1.2499361318108602e-06, + "loss": 0.0691, + "step": 101 + }, + { + "epoch": 0.023208191126279865, + "grad_norm": 1.706755813278197, + "learning_rate": 1.2499348480825627e-06, + "loss": 0.0694, + "step": 102 + }, + { + "epoch": 0.023435722411831627, + "grad_norm": 2.4058707201091902, + "learning_rate": 1.2499335515817413e-06, + "loss": 0.0873, + "step": 103 + }, + { + "epoch": 0.02366325369738339, + "grad_norm": 1.4799661409619025, + "learning_rate": 1.2499322423084226e-06, + "loss": 0.0489, + "step": 104 + }, + { + "epoch": 0.023890784982935155, + "grad_norm": 1.5529996293769865, + "learning_rate": 1.2499309202626336e-06, + "loss": 0.0489, + "step": 105 + }, + { + "epoch": 0.024118316268486917, + "grad_norm": 1.2094533521149893, + "learning_rate": 1.249929585444401e-06, + "loss": 0.0555, + "step": 106 + }, + { + "epoch": 0.02434584755403868, + "grad_norm": 3.3515777105454805, + "learning_rate": 1.2499282378537522e-06, + "loss": 0.0869, + "step": 107 + }, + { + "epoch": 0.024573378839590442, + "grad_norm": 3.049734475824735, + "learning_rate": 1.2499268774907144e-06, + "loss": 0.0436, + "step": 108 + }, + { + "epoch": 0.024800910125142208, + "grad_norm": 1.466216427195955, + "learning_rate": 1.249925504355316e-06, + "loss": 0.0602, + "step": 109 + }, + { + "epoch": 0.02502844141069397, + "grad_norm": 1.7889692240619717, + "learning_rate": 1.2499241184475848e-06, + "loss": 0.0485, + "step": 110 + }, + { + "epoch": 0.025255972696245733, + "grad_norm": 1.601570429523166, + "learning_rate": 1.249922719767549e-06, + "loss": 0.0657, + "step": 111 + }, + { + "epoch": 0.0254835039817975, + "grad_norm": 1.9895197648670602, + "learning_rate": 1.2499213083152374e-06, + "loss": 0.0613, + "step": 112 + }, + { + "epoch": 0.02571103526734926, + "grad_norm": 1.559725191147961, + "learning_rate": 1.2499198840906787e-06, + "loss": 0.0376, + "step": 113 + }, + { + "epoch": 0.025938566552901023, + "grad_norm": 1.8823702931097668, + "learning_rate": 1.249918447093902e-06, + "loss": 0.0441, + "step": 114 + }, + { + "epoch": 0.026166097838452786, + "grad_norm": 2.8320457267761707, + "learning_rate": 1.249916997324937e-06, + "loss": 0.1287, + "step": 115 + }, + { + "epoch": 0.02639362912400455, + "grad_norm": 4.382916227720041, + "learning_rate": 1.2499155347838129e-06, + "loss": 0.0828, + "step": 116 + }, + { + "epoch": 0.026621160409556314, + "grad_norm": 3.200832910369845, + "learning_rate": 1.2499140594705596e-06, + "loss": 0.0621, + "step": 117 + }, + { + "epoch": 0.026848691695108076, + "grad_norm": 1.6914803041014228, + "learning_rate": 1.2499125713852076e-06, + "loss": 0.0778, + "step": 118 + }, + { + "epoch": 0.027076222980659842, + "grad_norm": 1.192120885936281, + "learning_rate": 1.2499110705277869e-06, + "loss": 0.0505, + "step": 119 + }, + { + "epoch": 0.027303754266211604, + "grad_norm": 2.0573836463025716, + "learning_rate": 1.2499095568983284e-06, + "loss": 0.0802, + "step": 120 + }, + { + "epoch": 0.027531285551763367, + "grad_norm": 1.4902545114620074, + "learning_rate": 1.2499080304968634e-06, + "loss": 0.0565, + "step": 121 + }, + { + "epoch": 0.027758816837315133, + "grad_norm": 5.341407524309934, + "learning_rate": 1.2499064913234222e-06, + "loss": 0.0556, + "step": 122 + }, + { + "epoch": 0.027986348122866895, + "grad_norm": 1.9959451584520156, + "learning_rate": 1.249904939378037e-06, + "loss": 0.0425, + "step": 123 + }, + { + "epoch": 0.028213879408418657, + "grad_norm": 4.8411742944908225, + "learning_rate": 1.2499033746607395e-06, + "loss": 0.1189, + "step": 124 + }, + { + "epoch": 0.02844141069397042, + "grad_norm": 2.6968922160166624, + "learning_rate": 1.2499017971715614e-06, + "loss": 0.0952, + "step": 125 + }, + { + "epoch": 0.028668941979522185, + "grad_norm": 2.498689273522412, + "learning_rate": 1.2499002069105348e-06, + "loss": 0.0609, + "step": 126 + }, + { + "epoch": 0.028896473265073948, + "grad_norm": 1.7926923107505222, + "learning_rate": 1.2498986038776926e-06, + "loss": 0.0674, + "step": 127 + }, + { + "epoch": 0.02912400455062571, + "grad_norm": 2.951592426546495, + "learning_rate": 1.2498969880730671e-06, + "loss": 0.0581, + "step": 128 + }, + { + "epoch": 0.029351535836177476, + "grad_norm": 2.546012031191214, + "learning_rate": 1.249895359496692e-06, + "loss": 0.0604, + "step": 129 + }, + { + "epoch": 0.02957906712172924, + "grad_norm": 2.982654740949648, + "learning_rate": 1.2498937181486e-06, + "loss": 0.1317, + "step": 130 + }, + { + "epoch": 0.029806598407281, + "grad_norm": 3.35692876580473, + "learning_rate": 1.2498920640288248e-06, + "loss": 0.1357, + "step": 131 + }, + { + "epoch": 0.030034129692832763, + "grad_norm": 3.0581666005631694, + "learning_rate": 1.2498903971374005e-06, + "loss": 0.1404, + "step": 132 + }, + { + "epoch": 0.03026166097838453, + "grad_norm": 2.8194208797307936, + "learning_rate": 1.2498887174743606e-06, + "loss": 0.1139, + "step": 133 + }, + { + "epoch": 0.03048919226393629, + "grad_norm": 1.307812448511231, + "learning_rate": 1.24988702503974e-06, + "loss": 0.0439, + "step": 134 + }, + { + "epoch": 0.030716723549488054, + "grad_norm": 2.1828107869380178, + "learning_rate": 1.2498853198335728e-06, + "loss": 0.0714, + "step": 135 + }, + { + "epoch": 0.03094425483503982, + "grad_norm": 2.7687164066652588, + "learning_rate": 1.2498836018558942e-06, + "loss": 0.0588, + "step": 136 + }, + { + "epoch": 0.031171786120591582, + "grad_norm": 2.0398223571287124, + "learning_rate": 1.2498818711067392e-06, + "loss": 0.0788, + "step": 137 + }, + { + "epoch": 0.031399317406143344, + "grad_norm": 1.544887443922682, + "learning_rate": 1.2498801275861433e-06, + "loss": 0.0424, + "step": 138 + }, + { + "epoch": 0.03162684869169511, + "grad_norm": 1.9943988491119002, + "learning_rate": 1.2498783712941418e-06, + "loss": 0.0509, + "step": 139 + }, + { + "epoch": 0.03185437997724687, + "grad_norm": 2.2756991800533726, + "learning_rate": 1.2498766022307709e-06, + "loss": 0.077, + "step": 140 + }, + { + "epoch": 0.032081911262798635, + "grad_norm": 2.111047377421262, + "learning_rate": 1.2498748203960665e-06, + "loss": 0.0698, + "step": 141 + }, + { + "epoch": 0.0323094425483504, + "grad_norm": 5.455732324109059, + "learning_rate": 1.2498730257900655e-06, + "loss": 0.0464, + "step": 142 + }, + { + "epoch": 0.03253697383390216, + "grad_norm": 5.8400695805570475, + "learning_rate": 1.249871218412804e-06, + "loss": 0.0808, + "step": 143 + }, + { + "epoch": 0.032764505119453925, + "grad_norm": 1.6154990753356673, + "learning_rate": 1.2498693982643192e-06, + "loss": 0.0579, + "step": 144 + }, + { + "epoch": 0.03299203640500569, + "grad_norm": 2.6179374503343644, + "learning_rate": 1.2498675653446485e-06, + "loss": 0.0539, + "step": 145 + }, + { + "epoch": 0.03321956769055745, + "grad_norm": 1.8802237147910983, + "learning_rate": 1.249865719653829e-06, + "loss": 0.0562, + "step": 146 + }, + { + "epoch": 0.033447098976109216, + "grad_norm": 1.9601893311780516, + "learning_rate": 1.2498638611918985e-06, + "loss": 0.0842, + "step": 147 + }, + { + "epoch": 0.03367463026166098, + "grad_norm": 3.2621758627603814, + "learning_rate": 1.249861989958895e-06, + "loss": 0.0707, + "step": 148 + }, + { + "epoch": 0.03390216154721274, + "grad_norm": 1.7359031480428844, + "learning_rate": 1.2498601059548572e-06, + "loss": 0.0552, + "step": 149 + }, + { + "epoch": 0.034129692832764506, + "grad_norm": 1.9031742914383765, + "learning_rate": 1.2498582091798228e-06, + "loss": 0.0551, + "step": 150 + }, + { + "epoch": 0.034357224118316265, + "grad_norm": 1.6839469596143435, + "learning_rate": 1.2498562996338312e-06, + "loss": 0.0549, + "step": 151 + }, + { + "epoch": 0.03458475540386803, + "grad_norm": 4.770073312573169, + "learning_rate": 1.249854377316921e-06, + "loss": 0.0922, + "step": 152 + }, + { + "epoch": 0.0348122866894198, + "grad_norm": 9.77865071194089, + "learning_rate": 1.2498524422291319e-06, + "loss": 0.0656, + "step": 153 + }, + { + "epoch": 0.035039817974971556, + "grad_norm": 3.279720923464348, + "learning_rate": 1.2498504943705033e-06, + "loss": 0.0628, + "step": 154 + }, + { + "epoch": 0.03526734926052332, + "grad_norm": 2.5593319197975086, + "learning_rate": 1.249848533741075e-06, + "loss": 0.1116, + "step": 155 + }, + { + "epoch": 0.03549488054607509, + "grad_norm": 2.7020759561258796, + "learning_rate": 1.2498465603408865e-06, + "loss": 0.0818, + "step": 156 + }, + { + "epoch": 0.035722411831626846, + "grad_norm": 1.1827259380181383, + "learning_rate": 1.2498445741699792e-06, + "loss": 0.0412, + "step": 157 + }, + { + "epoch": 0.03594994311717861, + "grad_norm": 3.090835953132798, + "learning_rate": 1.249842575228393e-06, + "loss": 0.0922, + "step": 158 + }, + { + "epoch": 0.03617747440273038, + "grad_norm": 4.286688880757478, + "learning_rate": 1.249840563516169e-06, + "loss": 0.0921, + "step": 159 + }, + { + "epoch": 0.03640500568828214, + "grad_norm": 3.826111531217773, + "learning_rate": 1.249838539033348e-06, + "loss": 0.0711, + "step": 160 + }, + { + "epoch": 0.0366325369738339, + "grad_norm": 1.286547340104366, + "learning_rate": 1.2498365017799715e-06, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.03686006825938567, + "grad_norm": 0.9115912875215112, + "learning_rate": 1.2498344517560815e-06, + "loss": 0.0499, + "step": 162 + }, + { + "epoch": 0.03708759954493743, + "grad_norm": 1.967900659157127, + "learning_rate": 1.2498323889617198e-06, + "loss": 0.0642, + "step": 163 + }, + { + "epoch": 0.03731513083048919, + "grad_norm": 20.150143075771535, + "learning_rate": 1.2498303133969281e-06, + "loss": 0.0764, + "step": 164 + }, + { + "epoch": 0.03754266211604096, + "grad_norm": 1.8715348615633682, + "learning_rate": 1.2498282250617492e-06, + "loss": 0.0756, + "step": 165 + }, + { + "epoch": 0.03777019340159272, + "grad_norm": 1.4158902127506805, + "learning_rate": 1.2498261239562257e-06, + "loss": 0.0669, + "step": 166 + }, + { + "epoch": 0.037997724687144484, + "grad_norm": 2.5083436605654583, + "learning_rate": 1.2498240100804005e-06, + "loss": 0.0642, + "step": 167 + }, + { + "epoch": 0.03822525597269624, + "grad_norm": 4.201645953801033, + "learning_rate": 1.249821883434317e-06, + "loss": 0.0804, + "step": 168 + }, + { + "epoch": 0.03845278725824801, + "grad_norm": 1.8056911269518585, + "learning_rate": 1.2498197440180182e-06, + "loss": 0.0826, + "step": 169 + }, + { + "epoch": 0.038680318543799774, + "grad_norm": 2.5577509171659174, + "learning_rate": 1.2498175918315484e-06, + "loss": 0.0782, + "step": 170 + }, + { + "epoch": 0.03890784982935153, + "grad_norm": 1.6347218920613042, + "learning_rate": 1.2498154268749513e-06, + "loss": 0.0526, + "step": 171 + }, + { + "epoch": 0.0391353811149033, + "grad_norm": 2.089011530912283, + "learning_rate": 1.249813249148271e-06, + "loss": 0.0845, + "step": 172 + }, + { + "epoch": 0.039362912400455065, + "grad_norm": 2.0840328778691557, + "learning_rate": 1.2498110586515525e-06, + "loss": 0.0642, + "step": 173 + }, + { + "epoch": 0.039590443686006824, + "grad_norm": 2.63166064853352, + "learning_rate": 1.2498088553848398e-06, + "loss": 0.0896, + "step": 174 + }, + { + "epoch": 0.03981797497155859, + "grad_norm": 1.7790239100772818, + "learning_rate": 1.2498066393481787e-06, + "loss": 0.0887, + "step": 175 + }, + { + "epoch": 0.040045506257110355, + "grad_norm": 2.0484275395799374, + "learning_rate": 1.249804410541614e-06, + "loss": 0.0757, + "step": 176 + }, + { + "epoch": 0.040273037542662114, + "grad_norm": 2.7339764522222674, + "learning_rate": 1.2498021689651916e-06, + "loss": 0.0752, + "step": 177 + }, + { + "epoch": 0.04050056882821388, + "grad_norm": 1.7202036539272256, + "learning_rate": 1.249799914618957e-06, + "loss": 0.0571, + "step": 178 + }, + { + "epoch": 0.040728100113765646, + "grad_norm": 0.9595120325924671, + "learning_rate": 1.2497976475029566e-06, + "loss": 0.0431, + "step": 179 + }, + { + "epoch": 0.040955631399317405, + "grad_norm": 2.2881459955255283, + "learning_rate": 1.2497953676172364e-06, + "loss": 0.0714, + "step": 180 + }, + { + "epoch": 0.04118316268486917, + "grad_norm": 1.1818626309445535, + "learning_rate": 1.2497930749618431e-06, + "loss": 0.0352, + "step": 181 + }, + { + "epoch": 0.04141069397042093, + "grad_norm": 1.7233391740704784, + "learning_rate": 1.2497907695368238e-06, + "loss": 0.0469, + "step": 182 + }, + { + "epoch": 0.041638225255972695, + "grad_norm": 1.260961347917145, + "learning_rate": 1.2497884513422253e-06, + "loss": 0.0545, + "step": 183 + }, + { + "epoch": 0.04186575654152446, + "grad_norm": 1.350181559062471, + "learning_rate": 1.249786120378095e-06, + "loss": 0.0579, + "step": 184 + }, + { + "epoch": 0.04209328782707622, + "grad_norm": 1.4312208218600422, + "learning_rate": 1.2497837766444806e-06, + "loss": 0.0671, + "step": 185 + }, + { + "epoch": 0.042320819112627986, + "grad_norm": 3.0862470606278527, + "learning_rate": 1.2497814201414304e-06, + "loss": 0.1552, + "step": 186 + }, + { + "epoch": 0.04254835039817975, + "grad_norm": 2.010075620155332, + "learning_rate": 1.249779050868992e-06, + "loss": 0.0819, + "step": 187 + }, + { + "epoch": 0.04277588168373151, + "grad_norm": 1.5041508185690229, + "learning_rate": 1.249776668827214e-06, + "loss": 0.0555, + "step": 188 + }, + { + "epoch": 0.043003412969283276, + "grad_norm": 15.160154885670842, + "learning_rate": 1.249774274016145e-06, + "loss": 0.1377, + "step": 189 + }, + { + "epoch": 0.04323094425483504, + "grad_norm": 1.3900253472134594, + "learning_rate": 1.2497718664358341e-06, + "loss": 0.0573, + "step": 190 + }, + { + "epoch": 0.0434584755403868, + "grad_norm": 1.5383922685465743, + "learning_rate": 1.2497694460863307e-06, + "loss": 0.0986, + "step": 191 + }, + { + "epoch": 0.04368600682593857, + "grad_norm": 2.2430359603043266, + "learning_rate": 1.2497670129676838e-06, + "loss": 0.0771, + "step": 192 + }, + { + "epoch": 0.04391353811149033, + "grad_norm": 2.1439327799346333, + "learning_rate": 1.2497645670799436e-06, + "loss": 0.0436, + "step": 193 + }, + { + "epoch": 0.04414106939704209, + "grad_norm": 1.771166715500092, + "learning_rate": 1.2497621084231595e-06, + "loss": 0.0523, + "step": 194 + }, + { + "epoch": 0.04436860068259386, + "grad_norm": 1.4299897341433283, + "learning_rate": 1.2497596369973823e-06, + "loss": 0.0401, + "step": 195 + }, + { + "epoch": 0.04459613196814562, + "grad_norm": 2.3614356932574654, + "learning_rate": 1.2497571528026623e-06, + "loss": 0.0888, + "step": 196 + }, + { + "epoch": 0.04482366325369738, + "grad_norm": 1.8304520866772858, + "learning_rate": 1.2497546558390503e-06, + "loss": 0.065, + "step": 197 + }, + { + "epoch": 0.04505119453924915, + "grad_norm": 1.8912148609925916, + "learning_rate": 1.2497521461065973e-06, + "loss": 0.0601, + "step": 198 + }, + { + "epoch": 0.04527872582480091, + "grad_norm": 3.522073583380884, + "learning_rate": 1.2497496236053547e-06, + "loss": 0.0841, + "step": 199 + }, + { + "epoch": 0.04550625711035267, + "grad_norm": 1.3143196144497622, + "learning_rate": 1.2497470883353738e-06, + "loss": 0.0485, + "step": 200 + }, + { + "epoch": 0.04573378839590444, + "grad_norm": 2.251264141887164, + "learning_rate": 1.2497445402967068e-06, + "loss": 0.1068, + "step": 201 + }, + { + "epoch": 0.0459613196814562, + "grad_norm": 1.5072412944239943, + "learning_rate": 1.2497419794894053e-06, + "loss": 0.0685, + "step": 202 + }, + { + "epoch": 0.04618885096700796, + "grad_norm": 3.468383203818734, + "learning_rate": 1.249739405913522e-06, + "loss": 0.0815, + "step": 203 + }, + { + "epoch": 0.04641638225255973, + "grad_norm": 2.799795848526847, + "learning_rate": 1.2497368195691095e-06, + "loss": 0.0611, + "step": 204 + }, + { + "epoch": 0.04664391353811149, + "grad_norm": 1.8317350709529971, + "learning_rate": 1.2497342204562205e-06, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.046871444823663254, + "grad_norm": 1.8552065397519177, + "learning_rate": 1.2497316085749081e-06, + "loss": 0.0475, + "step": 206 + }, + { + "epoch": 0.04709897610921502, + "grad_norm": 33.648509264650215, + "learning_rate": 1.249728983925226e-06, + "loss": 0.4463, + "step": 207 + }, + { + "epoch": 0.04732650739476678, + "grad_norm": 3.6371540108131555, + "learning_rate": 1.2497263465072274e-06, + "loss": 0.1261, + "step": 208 + }, + { + "epoch": 0.047554038680318544, + "grad_norm": 4.579077765212189, + "learning_rate": 1.2497236963209663e-06, + "loss": 0.1537, + "step": 209 + }, + { + "epoch": 0.04778156996587031, + "grad_norm": 1.8349256873988717, + "learning_rate": 1.2497210333664972e-06, + "loss": 0.0905, + "step": 210 + }, + { + "epoch": 0.04800910125142207, + "grad_norm": 1.0349470677126553, + "learning_rate": 1.2497183576438743e-06, + "loss": 0.0383, + "step": 211 + }, + { + "epoch": 0.048236632536973835, + "grad_norm": 1.7552990958997892, + "learning_rate": 1.2497156691531523e-06, + "loss": 0.0667, + "step": 212 + }, + { + "epoch": 0.048464163822525594, + "grad_norm": 1.1378619448565346, + "learning_rate": 1.249712967894386e-06, + "loss": 0.0494, + "step": 213 + }, + { + "epoch": 0.04869169510807736, + "grad_norm": 2.098843760453871, + "learning_rate": 1.2497102538676308e-06, + "loss": 0.0683, + "step": 214 + }, + { + "epoch": 0.048919226393629126, + "grad_norm": 2.4023627244072996, + "learning_rate": 1.249707527072942e-06, + "loss": 0.0752, + "step": 215 + }, + { + "epoch": 0.049146757679180884, + "grad_norm": 3.0741883094346405, + "learning_rate": 1.2497047875103757e-06, + "loss": 0.1576, + "step": 216 + }, + { + "epoch": 0.04937428896473265, + "grad_norm": 2.109066858277886, + "learning_rate": 1.2497020351799875e-06, + "loss": 0.0834, + "step": 217 + }, + { + "epoch": 0.049601820250284416, + "grad_norm": 1.1860658004774445, + "learning_rate": 1.2496992700818335e-06, + "loss": 0.0487, + "step": 218 + }, + { + "epoch": 0.049829351535836175, + "grad_norm": 2.78284096499592, + "learning_rate": 1.249696492215971e-06, + "loss": 0.0743, + "step": 219 + }, + { + "epoch": 0.05005688282138794, + "grad_norm": 2.111446571336187, + "learning_rate": 1.249693701582456e-06, + "loss": 0.0381, + "step": 220 + }, + { + "epoch": 0.05028441410693971, + "grad_norm": 1.8439980435313363, + "learning_rate": 1.2496908981813458e-06, + "loss": 0.0821, + "step": 221 + }, + { + "epoch": 0.050511945392491465, + "grad_norm": 2.1292431200987165, + "learning_rate": 1.2496880820126977e-06, + "loss": 0.102, + "step": 222 + }, + { + "epoch": 0.05073947667804323, + "grad_norm": 1.58572513687618, + "learning_rate": 1.2496852530765695e-06, + "loss": 0.0451, + "step": 223 + }, + { + "epoch": 0.050967007963595, + "grad_norm": 3.028551047012946, + "learning_rate": 1.2496824113730186e-06, + "loss": 0.1259, + "step": 224 + }, + { + "epoch": 0.051194539249146756, + "grad_norm": 1.2656190346220688, + "learning_rate": 1.2496795569021033e-06, + "loss": 0.0593, + "step": 225 + }, + { + "epoch": 0.05142207053469852, + "grad_norm": 2.5365255203279444, + "learning_rate": 1.2496766896638819e-06, + "loss": 0.0754, + "step": 226 + }, + { + "epoch": 0.05164960182025029, + "grad_norm": 2.246271311279298, + "learning_rate": 1.249673809658413e-06, + "loss": 0.0689, + "step": 227 + }, + { + "epoch": 0.05187713310580205, + "grad_norm": 1.212864625758916, + "learning_rate": 1.2496709168857555e-06, + "loss": 0.0451, + "step": 228 + }, + { + "epoch": 0.05210466439135381, + "grad_norm": 2.806330657303488, + "learning_rate": 1.2496680113459683e-06, + "loss": 0.1473, + "step": 229 + }, + { + "epoch": 0.05233219567690557, + "grad_norm": 2.652722899111948, + "learning_rate": 1.2496650930391113e-06, + "loss": 0.1155, + "step": 230 + }, + { + "epoch": 0.05255972696245734, + "grad_norm": 2.7863683658107696, + "learning_rate": 1.2496621619652435e-06, + "loss": 0.0939, + "step": 231 + }, + { + "epoch": 0.0527872582480091, + "grad_norm": 2.0852803666211925, + "learning_rate": 1.2496592181244253e-06, + "loss": 0.0385, + "step": 232 + }, + { + "epoch": 0.05301478953356086, + "grad_norm": 2.3109089889052274, + "learning_rate": 1.249656261516717e-06, + "loss": 0.0591, + "step": 233 + }, + { + "epoch": 0.05324232081911263, + "grad_norm": 1.657658325406227, + "learning_rate": 1.2496532921421781e-06, + "loss": 0.0666, + "step": 234 + }, + { + "epoch": 0.053469852104664393, + "grad_norm": 1.8341397873796956, + "learning_rate": 1.2496503100008704e-06, + "loss": 0.0893, + "step": 235 + }, + { + "epoch": 0.05369738339021615, + "grad_norm": 1.3722825156667726, + "learning_rate": 1.249647315092854e-06, + "loss": 0.0445, + "step": 236 + }, + { + "epoch": 0.05392491467576792, + "grad_norm": 2.4225397650308187, + "learning_rate": 1.2496443074181905e-06, + "loss": 0.0783, + "step": 237 + }, + { + "epoch": 0.054152445961319684, + "grad_norm": 1.27450689566742, + "learning_rate": 1.2496412869769415e-06, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.05437997724687144, + "grad_norm": 1.5788523588867425, + "learning_rate": 1.2496382537691686e-06, + "loss": 0.0559, + "step": 239 + }, + { + "epoch": 0.05460750853242321, + "grad_norm": 2.455674592198026, + "learning_rate": 1.2496352077949336e-06, + "loss": 0.0686, + "step": 240 + }, + { + "epoch": 0.054835039817974975, + "grad_norm": 0.9822051281919313, + "learning_rate": 1.249632149054299e-06, + "loss": 0.0333, + "step": 241 + }, + { + "epoch": 0.05506257110352673, + "grad_norm": 1.4918328919143296, + "learning_rate": 1.249629077547327e-06, + "loss": 0.0564, + "step": 242 + }, + { + "epoch": 0.0552901023890785, + "grad_norm": 2.2431233119775973, + "learning_rate": 1.2496259932740813e-06, + "loss": 0.0998, + "step": 243 + }, + { + "epoch": 0.055517633674630265, + "grad_norm": 1.0547810468664673, + "learning_rate": 1.2496228962346236e-06, + "loss": 0.054, + "step": 244 + }, + { + "epoch": 0.055745164960182024, + "grad_norm": 4.2608959679243785, + "learning_rate": 1.249619786429018e-06, + "loss": 0.1324, + "step": 245 + }, + { + "epoch": 0.05597269624573379, + "grad_norm": 0.9597443879271953, + "learning_rate": 1.2496166638573278e-06, + "loss": 0.0545, + "step": 246 + }, + { + "epoch": 0.05620022753128555, + "grad_norm": 1.0564009660406621, + "learning_rate": 1.2496135285196172e-06, + "loss": 0.0409, + "step": 247 + }, + { + "epoch": 0.056427758816837315, + "grad_norm": 5.181656955922701, + "learning_rate": 1.2496103804159497e-06, + "loss": 0.0514, + "step": 248 + }, + { + "epoch": 0.05665529010238908, + "grad_norm": 31.956127145863782, + "learning_rate": 1.2496072195463904e-06, + "loss": 0.5089, + "step": 249 + }, + { + "epoch": 0.05688282138794084, + "grad_norm": 1.6118693718642072, + "learning_rate": 1.249604045911003e-06, + "loss": 0.0601, + "step": 250 + }, + { + "epoch": 0.057110352673492605, + "grad_norm": 1.8753798893546312, + "learning_rate": 1.249600859509853e-06, + "loss": 0.0781, + "step": 251 + }, + { + "epoch": 0.05733788395904437, + "grad_norm": 1.9173456172296954, + "learning_rate": 1.2495976603430054e-06, + "loss": 0.0998, + "step": 252 + }, + { + "epoch": 0.05756541524459613, + "grad_norm": 1.7446295444585516, + "learning_rate": 1.2495944484105254e-06, + "loss": 0.0348, + "step": 253 + }, + { + "epoch": 0.057792946530147896, + "grad_norm": 3.1221291176703816, + "learning_rate": 1.2495912237124787e-06, + "loss": 0.0467, + "step": 254 + }, + { + "epoch": 0.05802047781569966, + "grad_norm": 3.8018086885799454, + "learning_rate": 1.2495879862489312e-06, + "loss": 0.0893, + "step": 255 + }, + { + "epoch": 0.05824800910125142, + "grad_norm": 7.114525761671455, + "learning_rate": 1.2495847360199495e-06, + "loss": 0.0647, + "step": 256 + }, + { + "epoch": 0.058475540386803186, + "grad_norm": 1.861881748798064, + "learning_rate": 1.2495814730255993e-06, + "loss": 0.0832, + "step": 257 + }, + { + "epoch": 0.05870307167235495, + "grad_norm": 1.920791237896296, + "learning_rate": 1.2495781972659479e-06, + "loss": 0.0346, + "step": 258 + }, + { + "epoch": 0.05893060295790671, + "grad_norm": 1.9123959618493658, + "learning_rate": 1.2495749087410618e-06, + "loss": 0.0924, + "step": 259 + }, + { + "epoch": 0.05915813424345848, + "grad_norm": 4.29696428710133, + "learning_rate": 1.2495716074510087e-06, + "loss": 0.0503, + "step": 260 + }, + { + "epoch": 0.059385665529010236, + "grad_norm": 1.518204196410266, + "learning_rate": 1.2495682933958555e-06, + "loss": 0.0516, + "step": 261 + }, + { + "epoch": 0.059613196814562, + "grad_norm": 4.632688881661792, + "learning_rate": 1.2495649665756705e-06, + "loss": 0.1211, + "step": 262 + }, + { + "epoch": 0.05984072810011377, + "grad_norm": 2.486723056340893, + "learning_rate": 1.2495616269905212e-06, + "loss": 0.0811, + "step": 263 + }, + { + "epoch": 0.060068259385665526, + "grad_norm": 1.581969386319132, + "learning_rate": 1.2495582746404762e-06, + "loss": 0.0589, + "step": 264 + }, + { + "epoch": 0.06029579067121729, + "grad_norm": 1.5286635653264549, + "learning_rate": 1.249554909525604e-06, + "loss": 0.0638, + "step": 265 + }, + { + "epoch": 0.06052332195676906, + "grad_norm": 1.2992103388623448, + "learning_rate": 1.249551531645973e-06, + "loss": 0.0309, + "step": 266 + }, + { + "epoch": 0.06075085324232082, + "grad_norm": 1.4096677819832404, + "learning_rate": 1.2495481410016527e-06, + "loss": 0.0779, + "step": 267 + }, + { + "epoch": 0.06097838452787258, + "grad_norm": 2.5850002425545586, + "learning_rate": 1.2495447375927122e-06, + "loss": 0.0718, + "step": 268 + }, + { + "epoch": 0.06120591581342435, + "grad_norm": 2.2171422082649155, + "learning_rate": 1.2495413214192209e-06, + "loss": 0.0761, + "step": 269 + }, + { + "epoch": 0.06143344709897611, + "grad_norm": 1.8061938933671926, + "learning_rate": 1.2495378924812486e-06, + "loss": 0.068, + "step": 270 + }, + { + "epoch": 0.06166097838452787, + "grad_norm": 1.805390591341637, + "learning_rate": 1.2495344507788662e-06, + "loss": 0.0589, + "step": 271 + }, + { + "epoch": 0.06188850967007964, + "grad_norm": 28.567737212004268, + "learning_rate": 1.249530996312143e-06, + "loss": 0.2672, + "step": 272 + }, + { + "epoch": 0.0621160409556314, + "grad_norm": 2.023457290937467, + "learning_rate": 1.2495275290811499e-06, + "loss": 0.0762, + "step": 273 + }, + { + "epoch": 0.062343572241183164, + "grad_norm": 1.726293880974768, + "learning_rate": 1.2495240490859581e-06, + "loss": 0.1124, + "step": 274 + }, + { + "epoch": 0.06257110352673492, + "grad_norm": 3.5654068745258094, + "learning_rate": 1.2495205563266384e-06, + "loss": 0.08, + "step": 275 + }, + { + "epoch": 0.06279863481228669, + "grad_norm": 3.330534204865081, + "learning_rate": 1.2495170508032624e-06, + "loss": 0.1063, + "step": 276 + }, + { + "epoch": 0.06302616609783845, + "grad_norm": 2.7347891521994265, + "learning_rate": 1.2495135325159015e-06, + "loss": 0.0544, + "step": 277 + }, + { + "epoch": 0.06325369738339022, + "grad_norm": 1.6191657189271829, + "learning_rate": 1.2495100014646277e-06, + "loss": 0.0399, + "step": 278 + }, + { + "epoch": 0.06348122866894199, + "grad_norm": 2.2760450280718163, + "learning_rate": 1.2495064576495134e-06, + "loss": 0.0842, + "step": 279 + }, + { + "epoch": 0.06370875995449374, + "grad_norm": 1.26417795343513, + "learning_rate": 1.2495029010706306e-06, + "loss": 0.0396, + "step": 280 + }, + { + "epoch": 0.0639362912400455, + "grad_norm": 2.452124440380249, + "learning_rate": 1.2494993317280524e-06, + "loss": 0.057, + "step": 281 + }, + { + "epoch": 0.06416382252559727, + "grad_norm": 1.6768179224480646, + "learning_rate": 1.2494957496218516e-06, + "loss": 0.0778, + "step": 282 + }, + { + "epoch": 0.06439135381114904, + "grad_norm": 1.148511080426964, + "learning_rate": 1.2494921547521013e-06, + "loss": 0.0593, + "step": 283 + }, + { + "epoch": 0.0646188850967008, + "grad_norm": 2.6954847712989687, + "learning_rate": 1.249488547118875e-06, + "loss": 0.0581, + "step": 284 + }, + { + "epoch": 0.06484641638225255, + "grad_norm": 1.7038701082720327, + "learning_rate": 1.2494849267222466e-06, + "loss": 0.0467, + "step": 285 + }, + { + "epoch": 0.06507394766780432, + "grad_norm": 2.0205984871282654, + "learning_rate": 1.24948129356229e-06, + "loss": 0.08, + "step": 286 + }, + { + "epoch": 0.06530147895335608, + "grad_norm": 2.4430734451001093, + "learning_rate": 1.2494776476390793e-06, + "loss": 0.0689, + "step": 287 + }, + { + "epoch": 0.06552901023890785, + "grad_norm": 2.2074548221975223, + "learning_rate": 1.2494739889526894e-06, + "loss": 0.036, + "step": 288 + }, + { + "epoch": 0.06575654152445962, + "grad_norm": 2.3839227269087115, + "learning_rate": 1.2494703175031946e-06, + "loss": 0.1017, + "step": 289 + }, + { + "epoch": 0.06598407281001138, + "grad_norm": 1.297764082661735, + "learning_rate": 1.2494666332906702e-06, + "loss": 0.0428, + "step": 290 + }, + { + "epoch": 0.06621160409556313, + "grad_norm": 2.29700118159146, + "learning_rate": 1.2494629363151916e-06, + "loss": 0.0564, + "step": 291 + }, + { + "epoch": 0.0664391353811149, + "grad_norm": 10.343219429302799, + "learning_rate": 1.2494592265768343e-06, + "loss": 0.2299, + "step": 292 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.7119116189327344, + "learning_rate": 1.2494555040756737e-06, + "loss": 0.0491, + "step": 293 + }, + { + "epoch": 0.06689419795221843, + "grad_norm": 1.5996315416913807, + "learning_rate": 1.2494517688117867e-06, + "loss": 0.0569, + "step": 294 + }, + { + "epoch": 0.0671217292377702, + "grad_norm": 2.2996053958249068, + "learning_rate": 1.2494480207852489e-06, + "loss": 0.0561, + "step": 295 + }, + { + "epoch": 0.06734926052332196, + "grad_norm": 2.4074335741025346, + "learning_rate": 1.249444259996137e-06, + "loss": 0.0578, + "step": 296 + }, + { + "epoch": 0.06757679180887372, + "grad_norm": 3.415732508941554, + "learning_rate": 1.2494404864445284e-06, + "loss": 0.0617, + "step": 297 + }, + { + "epoch": 0.06780432309442548, + "grad_norm": 2.777678894153543, + "learning_rate": 1.2494367001304996e-06, + "loss": 0.081, + "step": 298 + }, + { + "epoch": 0.06803185437997725, + "grad_norm": 1.6387141538383663, + "learning_rate": 1.2494329010541284e-06, + "loss": 0.0387, + "step": 299 + }, + { + "epoch": 0.06825938566552901, + "grad_norm": 2.069756645203544, + "learning_rate": 1.2494290892154922e-06, + "loss": 0.0742, + "step": 300 + }, + { + "epoch": 0.06848691695108078, + "grad_norm": 3.4307396136508332, + "learning_rate": 1.2494252646146692e-06, + "loss": 0.0957, + "step": 301 + }, + { + "epoch": 0.06871444823663253, + "grad_norm": 2.0232162640823974, + "learning_rate": 1.249421427251737e-06, + "loss": 0.0522, + "step": 302 + }, + { + "epoch": 0.0689419795221843, + "grad_norm": 2.146160892723777, + "learning_rate": 1.2494175771267748e-06, + "loss": 0.0896, + "step": 303 + }, + { + "epoch": 0.06916951080773606, + "grad_norm": 2.940508414612204, + "learning_rate": 1.2494137142398607e-06, + "loss": 0.0776, + "step": 304 + }, + { + "epoch": 0.06939704209328783, + "grad_norm": 4.306654470652337, + "learning_rate": 1.249409838591074e-06, + "loss": 0.112, + "step": 305 + }, + { + "epoch": 0.0696245733788396, + "grad_norm": 2.698854439380438, + "learning_rate": 1.2494059501804937e-06, + "loss": 0.0922, + "step": 306 + }, + { + "epoch": 0.06985210466439136, + "grad_norm": 2.684055328227073, + "learning_rate": 1.249402049008199e-06, + "loss": 0.0603, + "step": 307 + }, + { + "epoch": 0.07007963594994311, + "grad_norm": 1.6527179950360487, + "learning_rate": 1.2493981350742704e-06, + "loss": 0.035, + "step": 308 + }, + { + "epoch": 0.07030716723549488, + "grad_norm": 2.632569906843673, + "learning_rate": 1.2493942083787872e-06, + "loss": 0.0483, + "step": 309 + }, + { + "epoch": 0.07053469852104664, + "grad_norm": 1.8419184740112247, + "learning_rate": 1.2493902689218299e-06, + "loss": 0.0592, + "step": 310 + }, + { + "epoch": 0.07076222980659841, + "grad_norm": 4.098447223521742, + "learning_rate": 1.249386316703479e-06, + "loss": 0.0647, + "step": 311 + }, + { + "epoch": 0.07098976109215017, + "grad_norm": 3.425692782797119, + "learning_rate": 1.2493823517238154e-06, + "loss": 0.0304, + "step": 312 + }, + { + "epoch": 0.07121729237770194, + "grad_norm": 2.09845005259606, + "learning_rate": 1.2493783739829202e-06, + "loss": 0.0841, + "step": 313 + }, + { + "epoch": 0.07144482366325369, + "grad_norm": 1.3953874008324176, + "learning_rate": 1.2493743834808741e-06, + "loss": 0.0572, + "step": 314 + }, + { + "epoch": 0.07167235494880546, + "grad_norm": 2.623892824594455, + "learning_rate": 1.2493703802177594e-06, + "loss": 0.0625, + "step": 315 + }, + { + "epoch": 0.07189988623435722, + "grad_norm": 1.2851580205849038, + "learning_rate": 1.2493663641936576e-06, + "loss": 0.0321, + "step": 316 + }, + { + "epoch": 0.07212741751990899, + "grad_norm": 1.9343201552647253, + "learning_rate": 1.2493623354086507e-06, + "loss": 0.067, + "step": 317 + }, + { + "epoch": 0.07235494880546076, + "grad_norm": 2.2726283028135286, + "learning_rate": 1.2493582938628213e-06, + "loss": 0.0797, + "step": 318 + }, + { + "epoch": 0.07258248009101251, + "grad_norm": 1.669835013133446, + "learning_rate": 1.2493542395562516e-06, + "loss": 0.0534, + "step": 319 + }, + { + "epoch": 0.07281001137656427, + "grad_norm": 1.684538229380955, + "learning_rate": 1.2493501724890247e-06, + "loss": 0.0728, + "step": 320 + }, + { + "epoch": 0.07303754266211604, + "grad_norm": 1.9071354849813447, + "learning_rate": 1.249346092661224e-06, + "loss": 0.0678, + "step": 321 + }, + { + "epoch": 0.0732650739476678, + "grad_norm": 1.7042139773464355, + "learning_rate": 1.2493420000729322e-06, + "loss": 0.0784, + "step": 322 + }, + { + "epoch": 0.07349260523321957, + "grad_norm": 1.6735213517836012, + "learning_rate": 1.2493378947242336e-06, + "loss": 0.0776, + "step": 323 + }, + { + "epoch": 0.07372013651877134, + "grad_norm": 1.5646112601537545, + "learning_rate": 1.2493337766152119e-06, + "loss": 0.0582, + "step": 324 + }, + { + "epoch": 0.07394766780432309, + "grad_norm": 1.5543937889365869, + "learning_rate": 1.249329645745951e-06, + "loss": 0.0512, + "step": 325 + }, + { + "epoch": 0.07417519908987485, + "grad_norm": 1.982236801977647, + "learning_rate": 1.2493255021165357e-06, + "loss": 0.0804, + "step": 326 + }, + { + "epoch": 0.07440273037542662, + "grad_norm": 1.2394712998486772, + "learning_rate": 1.2493213457270504e-06, + "loss": 0.0534, + "step": 327 + }, + { + "epoch": 0.07463026166097839, + "grad_norm": 1.7803933921864556, + "learning_rate": 1.2493171765775804e-06, + "loss": 0.0687, + "step": 328 + }, + { + "epoch": 0.07485779294653015, + "grad_norm": 2.379962120888543, + "learning_rate": 1.2493129946682107e-06, + "loss": 0.071, + "step": 329 + }, + { + "epoch": 0.07508532423208192, + "grad_norm": 1.589657975033299, + "learning_rate": 1.2493087999990263e-06, + "loss": 0.0739, + "step": 330 + }, + { + "epoch": 0.07531285551763367, + "grad_norm": 3.043606210421758, + "learning_rate": 1.249304592570114e-06, + "loss": 0.1518, + "step": 331 + }, + { + "epoch": 0.07554038680318544, + "grad_norm": 1.4392385212589442, + "learning_rate": 1.2493003723815588e-06, + "loss": 0.0556, + "step": 332 + }, + { + "epoch": 0.0757679180887372, + "grad_norm": 2.0188482312629397, + "learning_rate": 1.2492961394334474e-06, + "loss": 0.0662, + "step": 333 + }, + { + "epoch": 0.07599544937428897, + "grad_norm": 1.3600493699697815, + "learning_rate": 1.2492918937258663e-06, + "loss": 0.042, + "step": 334 + }, + { + "epoch": 0.07622298065984073, + "grad_norm": 2.2314121339792714, + "learning_rate": 1.2492876352589024e-06, + "loss": 0.0751, + "step": 335 + }, + { + "epoch": 0.07645051194539249, + "grad_norm": 1.4434673698644476, + "learning_rate": 1.2492833640326424e-06, + "loss": 0.0359, + "step": 336 + }, + { + "epoch": 0.07667804323094425, + "grad_norm": 1.9053597077248743, + "learning_rate": 1.2492790800471738e-06, + "loss": 0.0881, + "step": 337 + }, + { + "epoch": 0.07690557451649602, + "grad_norm": 0.9804503872776514, + "learning_rate": 1.249274783302584e-06, + "loss": 0.0332, + "step": 338 + }, + { + "epoch": 0.07713310580204778, + "grad_norm": 1.439933416412126, + "learning_rate": 1.249270473798961e-06, + "loss": 0.0363, + "step": 339 + }, + { + "epoch": 0.07736063708759955, + "grad_norm": 1.8018763642808848, + "learning_rate": 1.249266151536393e-06, + "loss": 0.0652, + "step": 340 + }, + { + "epoch": 0.07758816837315131, + "grad_norm": 1.605885586174771, + "learning_rate": 1.249261816514968e-06, + "loss": 0.0483, + "step": 341 + }, + { + "epoch": 0.07781569965870307, + "grad_norm": 1.5475812110829932, + "learning_rate": 1.2492574687347747e-06, + "loss": 0.0518, + "step": 342 + }, + { + "epoch": 0.07804323094425483, + "grad_norm": 1.3861334147739204, + "learning_rate": 1.249253108195902e-06, + "loss": 0.0438, + "step": 343 + }, + { + "epoch": 0.0782707622298066, + "grad_norm": 1.6280816638823692, + "learning_rate": 1.249248734898439e-06, + "loss": 0.0804, + "step": 344 + }, + { + "epoch": 0.07849829351535836, + "grad_norm": 1.945038730906618, + "learning_rate": 1.2492443488424753e-06, + "loss": 0.08, + "step": 345 + }, + { + "epoch": 0.07872582480091013, + "grad_norm": 1.163435954016696, + "learning_rate": 1.2492399500281002e-06, + "loss": 0.0404, + "step": 346 + }, + { + "epoch": 0.07895335608646188, + "grad_norm": 2.0650618862775225, + "learning_rate": 1.2492355384554039e-06, + "loss": 0.0965, + "step": 347 + }, + { + "epoch": 0.07918088737201365, + "grad_norm": 1.2688387295944672, + "learning_rate": 1.2492311141244764e-06, + "loss": 0.0387, + "step": 348 + }, + { + "epoch": 0.07940841865756541, + "grad_norm": 2.2968428653497472, + "learning_rate": 1.249226677035408e-06, + "loss": 0.0569, + "step": 349 + }, + { + "epoch": 0.07963594994311718, + "grad_norm": 1.3014564625890543, + "learning_rate": 1.2492222271882896e-06, + "loss": 0.0418, + "step": 350 + }, + { + "epoch": 0.07986348122866894, + "grad_norm": 2.3510870610479433, + "learning_rate": 1.2492177645832121e-06, + "loss": 0.0932, + "step": 351 + }, + { + "epoch": 0.08009101251422071, + "grad_norm": 1.4510684428455591, + "learning_rate": 1.2492132892202668e-06, + "loss": 0.0513, + "step": 352 + }, + { + "epoch": 0.08031854379977246, + "grad_norm": 4.594413722551582, + "learning_rate": 1.2492088010995449e-06, + "loss": 0.1235, + "step": 353 + }, + { + "epoch": 0.08054607508532423, + "grad_norm": 2.137036810353962, + "learning_rate": 1.2492043002211385e-06, + "loss": 0.1026, + "step": 354 + }, + { + "epoch": 0.080773606370876, + "grad_norm": 0.7577275839475693, + "learning_rate": 1.2491997865851392e-06, + "loss": 0.0368, + "step": 355 + }, + { + "epoch": 0.08100113765642776, + "grad_norm": 1.0494266975406659, + "learning_rate": 1.2491952601916395e-06, + "loss": 0.0385, + "step": 356 + }, + { + "epoch": 0.08122866894197953, + "grad_norm": 1.0423156372262818, + "learning_rate": 1.2491907210407319e-06, + "loss": 0.0424, + "step": 357 + }, + { + "epoch": 0.08145620022753129, + "grad_norm": 1.150214646729361, + "learning_rate": 1.249186169132509e-06, + "loss": 0.0694, + "step": 358 + }, + { + "epoch": 0.08168373151308304, + "grad_norm": 2.382439951131263, + "learning_rate": 1.2491816044670641e-06, + "loss": 0.0367, + "step": 359 + }, + { + "epoch": 0.08191126279863481, + "grad_norm": 1.7484749214098252, + "learning_rate": 1.24917702704449e-06, + "loss": 0.0677, + "step": 360 + }, + { + "epoch": 0.08213879408418658, + "grad_norm": 2.1328792490346227, + "learning_rate": 1.2491724368648808e-06, + "loss": 0.049, + "step": 361 + }, + { + "epoch": 0.08236632536973834, + "grad_norm": 0.6514880974265386, + "learning_rate": 1.2491678339283303e-06, + "loss": 0.0217, + "step": 362 + }, + { + "epoch": 0.08259385665529011, + "grad_norm": 1.3802700887569803, + "learning_rate": 1.249163218234932e-06, + "loss": 0.0406, + "step": 363 + }, + { + "epoch": 0.08282138794084186, + "grad_norm": 1.7918385557024892, + "learning_rate": 1.249158589784781e-06, + "loss": 0.064, + "step": 364 + }, + { + "epoch": 0.08304891922639362, + "grad_norm": 1.2060165323705336, + "learning_rate": 1.2491539485779713e-06, + "loss": 0.0367, + "step": 365 + }, + { + "epoch": 0.08327645051194539, + "grad_norm": 1.2166249313910695, + "learning_rate": 1.2491492946145981e-06, + "loss": 0.0512, + "step": 366 + }, + { + "epoch": 0.08350398179749716, + "grad_norm": 1.3048815510766842, + "learning_rate": 1.2491446278947563e-06, + "loss": 0.0554, + "step": 367 + }, + { + "epoch": 0.08373151308304892, + "grad_norm": 1.9459326713846232, + "learning_rate": 1.2491399484185413e-06, + "loss": 0.0488, + "step": 368 + }, + { + "epoch": 0.08395904436860069, + "grad_norm": 1.7915232664153333, + "learning_rate": 1.249135256186049e-06, + "loss": 0.1229, + "step": 369 + }, + { + "epoch": 0.08418657565415244, + "grad_norm": 0.9354087864668412, + "learning_rate": 1.249130551197375e-06, + "loss": 0.0383, + "step": 370 + }, + { + "epoch": 0.0844141069397042, + "grad_norm": 2.07264306868735, + "learning_rate": 1.2491258334526155e-06, + "loss": 0.0794, + "step": 371 + }, + { + "epoch": 0.08464163822525597, + "grad_norm": 1.4534854537948092, + "learning_rate": 1.2491211029518672e-06, + "loss": 0.0282, + "step": 372 + }, + { + "epoch": 0.08486916951080774, + "grad_norm": 2.0044431124055984, + "learning_rate": 1.2491163596952264e-06, + "loss": 0.0709, + "step": 373 + }, + { + "epoch": 0.0850967007963595, + "grad_norm": 1.6305074169029203, + "learning_rate": 1.2491116036827902e-06, + "loss": 0.0407, + "step": 374 + }, + { + "epoch": 0.08532423208191127, + "grad_norm": 1.223898789967783, + "learning_rate": 1.2491068349146559e-06, + "loss": 0.0595, + "step": 375 + }, + { + "epoch": 0.08555176336746302, + "grad_norm": 1.7895390232916124, + "learning_rate": 1.249102053390921e-06, + "loss": 0.1034, + "step": 376 + }, + { + "epoch": 0.08577929465301479, + "grad_norm": 1.6725786460952021, + "learning_rate": 1.249097259111683e-06, + "loss": 0.0955, + "step": 377 + }, + { + "epoch": 0.08600682593856655, + "grad_norm": 2.4309538605657735, + "learning_rate": 1.24909245207704e-06, + "loss": 0.0794, + "step": 378 + }, + { + "epoch": 0.08623435722411832, + "grad_norm": 0.8982937912678952, + "learning_rate": 1.2490876322870904e-06, + "loss": 0.0302, + "step": 379 + }, + { + "epoch": 0.08646188850967008, + "grad_norm": 3.0756164983832, + "learning_rate": 1.2490827997419325e-06, + "loss": 0.0549, + "step": 380 + }, + { + "epoch": 0.08668941979522184, + "grad_norm": 3.676872613008804, + "learning_rate": 1.249077954441665e-06, + "loss": 0.141, + "step": 381 + }, + { + "epoch": 0.0869169510807736, + "grad_norm": 1.4373261550715828, + "learning_rate": 1.249073096386387e-06, + "loss": 0.0392, + "step": 382 + }, + { + "epoch": 0.08714448236632537, + "grad_norm": 3.1988616398871006, + "learning_rate": 1.249068225576198e-06, + "loss": 0.0915, + "step": 383 + }, + { + "epoch": 0.08737201365187713, + "grad_norm": 1.5619645893707617, + "learning_rate": 1.2490633420111974e-06, + "loss": 0.0541, + "step": 384 + }, + { + "epoch": 0.0875995449374289, + "grad_norm": 0.9211933924050371, + "learning_rate": 1.249058445691485e-06, + "loss": 0.0337, + "step": 385 + }, + { + "epoch": 0.08782707622298067, + "grad_norm": 1.1767496728905957, + "learning_rate": 1.2490535366171607e-06, + "loss": 0.039, + "step": 386 + }, + { + "epoch": 0.08805460750853242, + "grad_norm": 2.9534787325055274, + "learning_rate": 1.249048614788325e-06, + "loss": 0.1048, + "step": 387 + }, + { + "epoch": 0.08828213879408418, + "grad_norm": 6.651807262283111, + "learning_rate": 1.249043680205079e-06, + "loss": 0.0494, + "step": 388 + }, + { + "epoch": 0.08850967007963595, + "grad_norm": 2.5123377668968745, + "learning_rate": 1.2490387328675226e-06, + "loss": 0.1199, + "step": 389 + }, + { + "epoch": 0.08873720136518772, + "grad_norm": 1.5502594476140923, + "learning_rate": 1.2490337727757576e-06, + "loss": 0.0503, + "step": 390 + }, + { + "epoch": 0.08896473265073948, + "grad_norm": 5.640796726611394, + "learning_rate": 1.249028799929885e-06, + "loss": 0.06, + "step": 391 + }, + { + "epoch": 0.08919226393629125, + "grad_norm": 1.2047358188375967, + "learning_rate": 1.2490238143300066e-06, + "loss": 0.0427, + "step": 392 + }, + { + "epoch": 0.089419795221843, + "grad_norm": 1.5816796650719427, + "learning_rate": 1.2490188159762243e-06, + "loss": 0.0665, + "step": 393 + }, + { + "epoch": 0.08964732650739476, + "grad_norm": 2.8164102954322505, + "learning_rate": 1.2490138048686405e-06, + "loss": 0.076, + "step": 394 + }, + { + "epoch": 0.08987485779294653, + "grad_norm": 2.0775035442513596, + "learning_rate": 1.249008781007357e-06, + "loss": 0.0857, + "step": 395 + }, + { + "epoch": 0.0901023890784983, + "grad_norm": 1.3816134442939907, + "learning_rate": 1.2490037443924768e-06, + "loss": 0.0647, + "step": 396 + }, + { + "epoch": 0.09032992036405006, + "grad_norm": 5.3602328587776205, + "learning_rate": 1.2489986950241032e-06, + "loss": 0.0857, + "step": 397 + }, + { + "epoch": 0.09055745164960181, + "grad_norm": 1.3668732893642688, + "learning_rate": 1.2489936329023387e-06, + "loss": 0.0513, + "step": 398 + }, + { + "epoch": 0.09078498293515358, + "grad_norm": 1.3818985376090422, + "learning_rate": 1.2489885580272874e-06, + "loss": 0.0643, + "step": 399 + }, + { + "epoch": 0.09101251422070535, + "grad_norm": 1.1645733063202421, + "learning_rate": 1.2489834703990527e-06, + "loss": 0.0564, + "step": 400 + }, + { + "epoch": 0.09124004550625711, + "grad_norm": 2.579861013447331, + "learning_rate": 1.2489783700177385e-06, + "loss": 0.0844, + "step": 401 + }, + { + "epoch": 0.09146757679180888, + "grad_norm": 1.7416779142182774, + "learning_rate": 1.2489732568834492e-06, + "loss": 0.0874, + "step": 402 + }, + { + "epoch": 0.09169510807736064, + "grad_norm": 1.2036980088831546, + "learning_rate": 1.2489681309962895e-06, + "loss": 0.0605, + "step": 403 + }, + { + "epoch": 0.0919226393629124, + "grad_norm": 2.823337671795372, + "learning_rate": 1.2489629923563637e-06, + "loss": 0.0563, + "step": 404 + }, + { + "epoch": 0.09215017064846416, + "grad_norm": 3.5147572243595406, + "learning_rate": 1.2489578409637774e-06, + "loss": 0.0962, + "step": 405 + }, + { + "epoch": 0.09237770193401593, + "grad_norm": 2.441595167267241, + "learning_rate": 1.2489526768186352e-06, + "loss": 0.0409, + "step": 406 + }, + { + "epoch": 0.09260523321956769, + "grad_norm": 1.606289452778594, + "learning_rate": 1.2489474999210434e-06, + "loss": 0.083, + "step": 407 + }, + { + "epoch": 0.09283276450511946, + "grad_norm": 2.0108948875054358, + "learning_rate": 1.2489423102711068e-06, + "loss": 0.0424, + "step": 408 + }, + { + "epoch": 0.09306029579067122, + "grad_norm": 1.9517497199537932, + "learning_rate": 1.2489371078689326e-06, + "loss": 0.063, + "step": 409 + }, + { + "epoch": 0.09328782707622298, + "grad_norm": 2.1984153851855903, + "learning_rate": 1.2489318927146263e-06, + "loss": 0.0607, + "step": 410 + }, + { + "epoch": 0.09351535836177474, + "grad_norm": 0.9821028313036854, + "learning_rate": 1.2489266648082951e-06, + "loss": 0.0269, + "step": 411 + }, + { + "epoch": 0.09374288964732651, + "grad_norm": 1.4642895316806703, + "learning_rate": 1.2489214241500453e-06, + "loss": 0.0773, + "step": 412 + }, + { + "epoch": 0.09397042093287827, + "grad_norm": 2.6315717094149598, + "learning_rate": 1.2489161707399843e-06, + "loss": 0.1042, + "step": 413 + }, + { + "epoch": 0.09419795221843004, + "grad_norm": 1.2730067445859696, + "learning_rate": 1.2489109045782194e-06, + "loss": 0.0397, + "step": 414 + }, + { + "epoch": 0.09442548350398179, + "grad_norm": 1.0214233244231261, + "learning_rate": 1.2489056256648582e-06, + "loss": 0.0297, + "step": 415 + }, + { + "epoch": 0.09465301478953356, + "grad_norm": 1.8895414872949885, + "learning_rate": 1.2489003340000089e-06, + "loss": 0.1027, + "step": 416 + }, + { + "epoch": 0.09488054607508532, + "grad_norm": 1.2431995135273388, + "learning_rate": 1.2488950295837792e-06, + "loss": 0.0694, + "step": 417 + }, + { + "epoch": 0.09510807736063709, + "grad_norm": 2.741996145452892, + "learning_rate": 1.2488897124162777e-06, + "loss": 0.0815, + "step": 418 + }, + { + "epoch": 0.09533560864618885, + "grad_norm": 1.711178715312897, + "learning_rate": 1.248884382497613e-06, + "loss": 0.0955, + "step": 419 + }, + { + "epoch": 0.09556313993174062, + "grad_norm": 2.609523312381175, + "learning_rate": 1.2488790398278941e-06, + "loss": 0.1047, + "step": 420 + }, + { + "epoch": 0.09579067121729237, + "grad_norm": 1.53212810794613, + "learning_rate": 1.2488736844072304e-06, + "loss": 0.0585, + "step": 421 + }, + { + "epoch": 0.09601820250284414, + "grad_norm": 1.4174025848206742, + "learning_rate": 1.248868316235731e-06, + "loss": 0.0387, + "step": 422 + }, + { + "epoch": 0.0962457337883959, + "grad_norm": 2.1766082548840644, + "learning_rate": 1.2488629353135059e-06, + "loss": 0.0883, + "step": 423 + }, + { + "epoch": 0.09647326507394767, + "grad_norm": 1.2878325712311849, + "learning_rate": 1.2488575416406649e-06, + "loss": 0.0469, + "step": 424 + }, + { + "epoch": 0.09670079635949944, + "grad_norm": 2.1138235879975222, + "learning_rate": 1.2488521352173183e-06, + "loss": 0.0531, + "step": 425 + }, + { + "epoch": 0.09692832764505119, + "grad_norm": 1.6090227620325084, + "learning_rate": 1.2488467160435765e-06, + "loss": 0.0532, + "step": 426 + }, + { + "epoch": 0.09715585893060295, + "grad_norm": 1.5021035863134842, + "learning_rate": 1.2488412841195505e-06, + "loss": 0.0805, + "step": 427 + }, + { + "epoch": 0.09738339021615472, + "grad_norm": 2.0944361790568515, + "learning_rate": 1.2488358394453512e-06, + "loss": 0.0864, + "step": 428 + }, + { + "epoch": 0.09761092150170649, + "grad_norm": 1.3336921136142907, + "learning_rate": 1.2488303820210897e-06, + "loss": 0.0405, + "step": 429 + }, + { + "epoch": 0.09783845278725825, + "grad_norm": 1.4602632552266945, + "learning_rate": 1.2488249118468776e-06, + "loss": 0.0319, + "step": 430 + }, + { + "epoch": 0.09806598407281002, + "grad_norm": 1.1300406263272467, + "learning_rate": 1.248819428922827e-06, + "loss": 0.0328, + "step": 431 + }, + { + "epoch": 0.09829351535836177, + "grad_norm": 3.2436410504809006, + "learning_rate": 1.2488139332490495e-06, + "loss": 0.1052, + "step": 432 + }, + { + "epoch": 0.09852104664391353, + "grad_norm": 1.1456421976312734, + "learning_rate": 1.248808424825658e-06, + "loss": 0.0455, + "step": 433 + }, + { + "epoch": 0.0987485779294653, + "grad_norm": 1.8967128040132528, + "learning_rate": 1.2488029036527645e-06, + "loss": 0.0521, + "step": 434 + }, + { + "epoch": 0.09897610921501707, + "grad_norm": 2.22293609570488, + "learning_rate": 1.2487973697304822e-06, + "loss": 0.0401, + "step": 435 + }, + { + "epoch": 0.09920364050056883, + "grad_norm": 1.3201279140220556, + "learning_rate": 1.248791823058924e-06, + "loss": 0.0656, + "step": 436 + }, + { + "epoch": 0.0994311717861206, + "grad_norm": 1.8801560325756854, + "learning_rate": 1.2487862636382034e-06, + "loss": 0.036, + "step": 437 + }, + { + "epoch": 0.09965870307167235, + "grad_norm": 0.9413516301280102, + "learning_rate": 1.248780691468434e-06, + "loss": 0.0369, + "step": 438 + }, + { + "epoch": 0.09988623435722412, + "grad_norm": 1.6280926081257134, + "learning_rate": 1.2487751065497296e-06, + "loss": 0.0679, + "step": 439 + }, + { + "epoch": 0.10011376564277588, + "grad_norm": 1.1483666492978277, + "learning_rate": 1.2487695088822044e-06, + "loss": 0.0476, + "step": 440 + }, + { + "epoch": 0.10034129692832765, + "grad_norm": 1.091509019893692, + "learning_rate": 1.2487638984659729e-06, + "loss": 0.0267, + "step": 441 + }, + { + "epoch": 0.10056882821387941, + "grad_norm": 1.2894939135732226, + "learning_rate": 1.2487582753011496e-06, + "loss": 0.0379, + "step": 442 + }, + { + "epoch": 0.10079635949943117, + "grad_norm": 1.5539892877637305, + "learning_rate": 1.2487526393878497e-06, + "loss": 0.0717, + "step": 443 + }, + { + "epoch": 0.10102389078498293, + "grad_norm": 1.0799436334247106, + "learning_rate": 1.248746990726188e-06, + "loss": 0.0494, + "step": 444 + }, + { + "epoch": 0.1012514220705347, + "grad_norm": 1.0482657304812906, + "learning_rate": 1.2487413293162803e-06, + "loss": 0.0328, + "step": 445 + }, + { + "epoch": 0.10147895335608646, + "grad_norm": 2.974661390553103, + "learning_rate": 1.2487356551582421e-06, + "loss": 0.1436, + "step": 446 + }, + { + "epoch": 0.10170648464163823, + "grad_norm": 2.6800805691171843, + "learning_rate": 1.2487299682521893e-06, + "loss": 0.1383, + "step": 447 + }, + { + "epoch": 0.10193401592719, + "grad_norm": 2.2359167995553233, + "learning_rate": 1.2487242685982384e-06, + "loss": 0.0485, + "step": 448 + }, + { + "epoch": 0.10216154721274175, + "grad_norm": 2.818745463327436, + "learning_rate": 1.2487185561965057e-06, + "loss": 0.0446, + "step": 449 + }, + { + "epoch": 0.10238907849829351, + "grad_norm": 0.6315900981706886, + "learning_rate": 1.248712831047108e-06, + "loss": 0.0186, + "step": 450 + }, + { + "epoch": 0.10261660978384528, + "grad_norm": 1.9674239723403535, + "learning_rate": 1.2487070931501624e-06, + "loss": 0.0673, + "step": 451 + }, + { + "epoch": 0.10284414106939704, + "grad_norm": 1.6885325551942292, + "learning_rate": 1.2487013425057858e-06, + "loss": 0.0767, + "step": 452 + }, + { + "epoch": 0.10307167235494881, + "grad_norm": 1.891051810210748, + "learning_rate": 1.2486955791140964e-06, + "loss": 0.063, + "step": 453 + }, + { + "epoch": 0.10329920364050058, + "grad_norm": 1.1912814980714257, + "learning_rate": 1.2486898029752113e-06, + "loss": 0.0493, + "step": 454 + }, + { + "epoch": 0.10352673492605233, + "grad_norm": 0.8883039975316582, + "learning_rate": 1.248684014089249e-06, + "loss": 0.0277, + "step": 455 + }, + { + "epoch": 0.1037542662116041, + "grad_norm": 1.306066876530539, + "learning_rate": 1.2486782124563277e-06, + "loss": 0.0544, + "step": 456 + }, + { + "epoch": 0.10398179749715586, + "grad_norm": 1.5420420577450527, + "learning_rate": 1.2486723980765659e-06, + "loss": 0.077, + "step": 457 + }, + { + "epoch": 0.10420932878270762, + "grad_norm": 2.0367683349755823, + "learning_rate": 1.2486665709500826e-06, + "loss": 0.0649, + "step": 458 + }, + { + "epoch": 0.10443686006825939, + "grad_norm": 1.1542715096556455, + "learning_rate": 1.2486607310769965e-06, + "loss": 0.0586, + "step": 459 + }, + { + "epoch": 0.10466439135381114, + "grad_norm": 2.061046992152943, + "learning_rate": 1.2486548784574275e-06, + "loss": 0.0491, + "step": 460 + }, + { + "epoch": 0.10489192263936291, + "grad_norm": 1.0629825041281826, + "learning_rate": 1.2486490130914948e-06, + "loss": 0.0445, + "step": 461 + }, + { + "epoch": 0.10511945392491467, + "grad_norm": 1.415699131985968, + "learning_rate": 1.2486431349793185e-06, + "loss": 0.0679, + "step": 462 + }, + { + "epoch": 0.10534698521046644, + "grad_norm": 1.8548752695365796, + "learning_rate": 1.2486372441210188e-06, + "loss": 0.0514, + "step": 463 + }, + { + "epoch": 0.1055745164960182, + "grad_norm": 1.1301759724239804, + "learning_rate": 1.248631340516716e-06, + "loss": 0.0381, + "step": 464 + }, + { + "epoch": 0.10580204778156997, + "grad_norm": 1.408994037607189, + "learning_rate": 1.2486254241665302e-06, + "loss": 0.0692, + "step": 465 + }, + { + "epoch": 0.10602957906712172, + "grad_norm": 1.6631340659469183, + "learning_rate": 1.2486194950705831e-06, + "loss": 0.0454, + "step": 466 + }, + { + "epoch": 0.10625711035267349, + "grad_norm": 4.446207788916147, + "learning_rate": 1.248613553228996e-06, + "loss": 0.051, + "step": 467 + }, + { + "epoch": 0.10648464163822526, + "grad_norm": 2.558672760104107, + "learning_rate": 1.2486075986418896e-06, + "loss": 0.0581, + "step": 468 + }, + { + "epoch": 0.10671217292377702, + "grad_norm": 1.1293994671100194, + "learning_rate": 1.248601631309386e-06, + "loss": 0.0601, + "step": 469 + }, + { + "epoch": 0.10693970420932879, + "grad_norm": 0.9858665143030374, + "learning_rate": 1.2485956512316072e-06, + "loss": 0.042, + "step": 470 + }, + { + "epoch": 0.10716723549488055, + "grad_norm": 1.0238223731711866, + "learning_rate": 1.2485896584086754e-06, + "loss": 0.0419, + "step": 471 + }, + { + "epoch": 0.1073947667804323, + "grad_norm": 1.4636163379221103, + "learning_rate": 1.248583652840713e-06, + "loss": 0.0763, + "step": 472 + }, + { + "epoch": 0.10762229806598407, + "grad_norm": 2.3052142467408534, + "learning_rate": 1.2485776345278427e-06, + "loss": 0.0577, + "step": 473 + }, + { + "epoch": 0.10784982935153584, + "grad_norm": 1.797428495175526, + "learning_rate": 1.2485716034701876e-06, + "loss": 0.0664, + "step": 474 + }, + { + "epoch": 0.1080773606370876, + "grad_norm": 0.9502391190499007, + "learning_rate": 1.2485655596678712e-06, + "loss": 0.032, + "step": 475 + }, + { + "epoch": 0.10830489192263937, + "grad_norm": 2.0597597691340033, + "learning_rate": 1.2485595031210164e-06, + "loss": 0.0693, + "step": 476 + }, + { + "epoch": 0.10853242320819112, + "grad_norm": 1.6422248848311762, + "learning_rate": 1.2485534338297475e-06, + "loss": 0.036, + "step": 477 + }, + { + "epoch": 0.10875995449374289, + "grad_norm": 1.0205877451233056, + "learning_rate": 1.2485473517941884e-06, + "loss": 0.036, + "step": 478 + }, + { + "epoch": 0.10898748577929465, + "grad_norm": 1.352496943979393, + "learning_rate": 1.2485412570144633e-06, + "loss": 0.0619, + "step": 479 + }, + { + "epoch": 0.10921501706484642, + "grad_norm": 1.1460532851585836, + "learning_rate": 1.2485351494906969e-06, + "loss": 0.0457, + "step": 480 + }, + { + "epoch": 0.10944254835039818, + "grad_norm": 2.136834742910293, + "learning_rate": 1.2485290292230142e-06, + "loss": 0.1229, + "step": 481 + }, + { + "epoch": 0.10967007963594995, + "grad_norm": 1.2780826832089733, + "learning_rate": 1.24852289621154e-06, + "loss": 0.0295, + "step": 482 + }, + { + "epoch": 0.1098976109215017, + "grad_norm": 1.5165261909270789, + "learning_rate": 1.2485167504563995e-06, + "loss": 0.0605, + "step": 483 + }, + { + "epoch": 0.11012514220705347, + "grad_norm": 2.5598525552258153, + "learning_rate": 1.2485105919577187e-06, + "loss": 0.1343, + "step": 484 + }, + { + "epoch": 0.11035267349260523, + "grad_norm": 1.2948078456387253, + "learning_rate": 1.2485044207156233e-06, + "loss": 0.0462, + "step": 485 + }, + { + "epoch": 0.110580204778157, + "grad_norm": 2.2282842690273674, + "learning_rate": 1.2484982367302395e-06, + "loss": 0.0906, + "step": 486 + }, + { + "epoch": 0.11080773606370876, + "grad_norm": 1.58668388631447, + "learning_rate": 1.2484920400016936e-06, + "loss": 0.0754, + "step": 487 + }, + { + "epoch": 0.11103526734926053, + "grad_norm": 1.8486367636195544, + "learning_rate": 1.2484858305301122e-06, + "loss": 0.0858, + "step": 488 + }, + { + "epoch": 0.11126279863481228, + "grad_norm": 1.2490242519124817, + "learning_rate": 1.2484796083156222e-06, + "loss": 0.0412, + "step": 489 + }, + { + "epoch": 0.11149032992036405, + "grad_norm": 2.4293013886013024, + "learning_rate": 1.2484733733583511e-06, + "loss": 0.0347, + "step": 490 + }, + { + "epoch": 0.11171786120591581, + "grad_norm": 1.3895430378196065, + "learning_rate": 1.248467125658426e-06, + "loss": 0.0508, + "step": 491 + }, + { + "epoch": 0.11194539249146758, + "grad_norm": 1.0989498931762587, + "learning_rate": 1.2484608652159746e-06, + "loss": 0.0376, + "step": 492 + }, + { + "epoch": 0.11217292377701935, + "grad_norm": 1.2816896914965858, + "learning_rate": 1.248454592031125e-06, + "loss": 0.0325, + "step": 493 + }, + { + "epoch": 0.1124004550625711, + "grad_norm": 1.3834104899544186, + "learning_rate": 1.2484483061040054e-06, + "loss": 0.0502, + "step": 494 + }, + { + "epoch": 0.11262798634812286, + "grad_norm": 1.8438437167351667, + "learning_rate": 1.2484420074347441e-06, + "loss": 0.0675, + "step": 495 + }, + { + "epoch": 0.11285551763367463, + "grad_norm": 1.3847331523036337, + "learning_rate": 1.24843569602347e-06, + "loss": 0.0708, + "step": 496 + }, + { + "epoch": 0.1130830489192264, + "grad_norm": 1.4695794221134664, + "learning_rate": 1.2484293718703119e-06, + "loss": 0.0677, + "step": 497 + }, + { + "epoch": 0.11331058020477816, + "grad_norm": 1.4197704649286171, + "learning_rate": 1.2484230349753994e-06, + "loss": 0.0275, + "step": 498 + }, + { + "epoch": 0.11353811149032993, + "grad_norm": 1.9758484599559376, + "learning_rate": 1.2484166853388617e-06, + "loss": 0.0846, + "step": 499 + }, + { + "epoch": 0.11376564277588168, + "grad_norm": 1.4480733215282273, + "learning_rate": 1.2484103229608288e-06, + "loss": 0.0302, + "step": 500 + }, + { + "epoch": 0.11399317406143344, + "grad_norm": 1.6803783059061204, + "learning_rate": 1.2484039478414305e-06, + "loss": 0.033, + "step": 501 + }, + { + "epoch": 0.11422070534698521, + "grad_norm": 1.5093539895040036, + "learning_rate": 1.2483975599807972e-06, + "loss": 0.0592, + "step": 502 + }, + { + "epoch": 0.11444823663253698, + "grad_norm": 1.2912092446277637, + "learning_rate": 1.2483911593790595e-06, + "loss": 0.0341, + "step": 503 + }, + { + "epoch": 0.11467576791808874, + "grad_norm": 4.053757173255741, + "learning_rate": 1.2483847460363482e-06, + "loss": 0.08, + "step": 504 + }, + { + "epoch": 0.1149032992036405, + "grad_norm": 1.8561008975587792, + "learning_rate": 1.2483783199527943e-06, + "loss": 0.0417, + "step": 505 + }, + { + "epoch": 0.11513083048919226, + "grad_norm": 1.7954101019883197, + "learning_rate": 1.2483718811285296e-06, + "loss": 0.098, + "step": 506 + }, + { + "epoch": 0.11535836177474403, + "grad_norm": 0.90376517227131, + "learning_rate": 1.2483654295636848e-06, + "loss": 0.0204, + "step": 507 + }, + { + "epoch": 0.11558589306029579, + "grad_norm": 1.0402238740609373, + "learning_rate": 1.2483589652583924e-06, + "loss": 0.0393, + "step": 508 + }, + { + "epoch": 0.11581342434584756, + "grad_norm": 1.2739463431329932, + "learning_rate": 1.2483524882127846e-06, + "loss": 0.0361, + "step": 509 + }, + { + "epoch": 0.11604095563139932, + "grad_norm": 1.8542368928683455, + "learning_rate": 1.2483459984269933e-06, + "loss": 0.062, + "step": 510 + }, + { + "epoch": 0.11626848691695107, + "grad_norm": 1.4901585211244064, + "learning_rate": 1.2483394959011514e-06, + "loss": 0.0362, + "step": 511 + }, + { + "epoch": 0.11649601820250284, + "grad_norm": 1.655677974364157, + "learning_rate": 1.248332980635392e-06, + "loss": 0.068, + "step": 512 + }, + { + "epoch": 0.1167235494880546, + "grad_norm": 1.40135662829084, + "learning_rate": 1.2483264526298478e-06, + "loss": 0.0467, + "step": 513 + }, + { + "epoch": 0.11695108077360637, + "grad_norm": 0.9801745008839565, + "learning_rate": 1.2483199118846525e-06, + "loss": 0.0547, + "step": 514 + }, + { + "epoch": 0.11717861205915814, + "grad_norm": 0.7956319000441687, + "learning_rate": 1.2483133583999399e-06, + "loss": 0.0292, + "step": 515 + }, + { + "epoch": 0.1174061433447099, + "grad_norm": 1.5822262084465146, + "learning_rate": 1.2483067921758439e-06, + "loss": 0.0442, + "step": 516 + }, + { + "epoch": 0.11763367463026166, + "grad_norm": 1.788875375154368, + "learning_rate": 1.2483002132124983e-06, + "loss": 0.1027, + "step": 517 + }, + { + "epoch": 0.11786120591581342, + "grad_norm": 1.12691319203208, + "learning_rate": 1.2482936215100382e-06, + "loss": 0.0392, + "step": 518 + }, + { + "epoch": 0.11808873720136519, + "grad_norm": 1.4278197951300633, + "learning_rate": 1.2482870170685978e-06, + "loss": 0.0309, + "step": 519 + }, + { + "epoch": 0.11831626848691695, + "grad_norm": 2.3408918893379176, + "learning_rate": 1.2482803998883122e-06, + "loss": 0.0554, + "step": 520 + }, + { + "epoch": 0.11854379977246872, + "grad_norm": 1.308272654811059, + "learning_rate": 1.2482737699693168e-06, + "loss": 0.0386, + "step": 521 + }, + { + "epoch": 0.11877133105802047, + "grad_norm": 1.4643677550086747, + "learning_rate": 1.248267127311747e-06, + "loss": 0.0517, + "step": 522 + }, + { + "epoch": 0.11899886234357224, + "grad_norm": 1.9375531511223079, + "learning_rate": 1.2482604719157386e-06, + "loss": 0.0547, + "step": 523 + }, + { + "epoch": 0.119226393629124, + "grad_norm": 1.4368931736724269, + "learning_rate": 1.2482538037814277e-06, + "loss": 0.0594, + "step": 524 + }, + { + "epoch": 0.11945392491467577, + "grad_norm": 2.3983593700848864, + "learning_rate": 1.2482471229089502e-06, + "loss": 0.0479, + "step": 525 + }, + { + "epoch": 0.11968145620022753, + "grad_norm": 1.5406589025382933, + "learning_rate": 1.2482404292984431e-06, + "loss": 0.0468, + "step": 526 + }, + { + "epoch": 0.1199089874857793, + "grad_norm": 2.7569107151011254, + "learning_rate": 1.248233722950043e-06, + "loss": 0.1653, + "step": 527 + }, + { + "epoch": 0.12013651877133105, + "grad_norm": 1.2498018017145271, + "learning_rate": 1.2482270038638872e-06, + "loss": 0.0376, + "step": 528 + }, + { + "epoch": 0.12036405005688282, + "grad_norm": 1.663550002347151, + "learning_rate": 1.2482202720401128e-06, + "loss": 0.0336, + "step": 529 + }, + { + "epoch": 0.12059158134243458, + "grad_norm": 1.302350240596974, + "learning_rate": 1.248213527478857e-06, + "loss": 0.058, + "step": 530 + }, + { + "epoch": 0.12081911262798635, + "grad_norm": 1.6402862563370133, + "learning_rate": 1.2482067701802583e-06, + "loss": 0.0826, + "step": 531 + }, + { + "epoch": 0.12104664391353812, + "grad_norm": 1.1268914835455415, + "learning_rate": 1.2482000001444547e-06, + "loss": 0.0524, + "step": 532 + }, + { + "epoch": 0.12127417519908988, + "grad_norm": 1.877525140915591, + "learning_rate": 1.2481932173715845e-06, + "loss": 0.0539, + "step": 533 + }, + { + "epoch": 0.12150170648464163, + "grad_norm": 1.928222388008525, + "learning_rate": 1.2481864218617859e-06, + "loss": 0.1134, + "step": 534 + }, + { + "epoch": 0.1217292377701934, + "grad_norm": 1.2293579919391775, + "learning_rate": 1.2481796136151984e-06, + "loss": 0.0516, + "step": 535 + }, + { + "epoch": 0.12195676905574517, + "grad_norm": 1.5890038701436435, + "learning_rate": 1.2481727926319609e-06, + "loss": 0.0922, + "step": 536 + }, + { + "epoch": 0.12218430034129693, + "grad_norm": 1.850040007143475, + "learning_rate": 1.2481659589122127e-06, + "loss": 0.075, + "step": 537 + }, + { + "epoch": 0.1224118316268487, + "grad_norm": 2.5725083852053747, + "learning_rate": 1.2481591124560934e-06, + "loss": 0.0891, + "step": 538 + }, + { + "epoch": 0.12263936291240045, + "grad_norm": 1.768076465759339, + "learning_rate": 1.2481522532637435e-06, + "loss": 0.0533, + "step": 539 + }, + { + "epoch": 0.12286689419795221, + "grad_norm": 0.9070123837857468, + "learning_rate": 1.2481453813353026e-06, + "loss": 0.0294, + "step": 540 + }, + { + "epoch": 0.12309442548350398, + "grad_norm": 1.625916805043753, + "learning_rate": 1.2481384966709116e-06, + "loss": 0.0323, + "step": 541 + }, + { + "epoch": 0.12332195676905575, + "grad_norm": 2.291411200127928, + "learning_rate": 1.2481315992707104e-06, + "loss": 0.0964, + "step": 542 + }, + { + "epoch": 0.12354948805460751, + "grad_norm": 1.7452732363406978, + "learning_rate": 1.248124689134841e-06, + "loss": 0.0555, + "step": 543 + }, + { + "epoch": 0.12377701934015928, + "grad_norm": 1.4048824029212816, + "learning_rate": 1.2481177662634438e-06, + "loss": 0.0551, + "step": 544 + }, + { + "epoch": 0.12400455062571103, + "grad_norm": 6.9359786939510695, + "learning_rate": 1.2481108306566609e-06, + "loss": 0.0746, + "step": 545 + }, + { + "epoch": 0.1242320819112628, + "grad_norm": 1.2916063155011999, + "learning_rate": 1.2481038823146338e-06, + "loss": 0.0245, + "step": 546 + }, + { + "epoch": 0.12445961319681456, + "grad_norm": 1.9432907622995745, + "learning_rate": 1.2480969212375043e-06, + "loss": 0.1119, + "step": 547 + }, + { + "epoch": 0.12468714448236633, + "grad_norm": 2.9617568197379347, + "learning_rate": 1.2480899474254151e-06, + "loss": 0.0878, + "step": 548 + }, + { + "epoch": 0.12491467576791809, + "grad_norm": 0.9954987625211155, + "learning_rate": 1.2480829608785085e-06, + "loss": 0.0397, + "step": 549 + }, + { + "epoch": 0.12514220705346984, + "grad_norm": 1.2186721476872615, + "learning_rate": 1.2480759615969273e-06, + "loss": 0.0462, + "step": 550 + }, + { + "epoch": 0.12536973833902162, + "grad_norm": 1.1723077588981092, + "learning_rate": 1.2480689495808144e-06, + "loss": 0.0646, + "step": 551 + }, + { + "epoch": 0.12559726962457338, + "grad_norm": 1.4338215761686757, + "learning_rate": 1.2480619248303133e-06, + "loss": 0.0806, + "step": 552 + }, + { + "epoch": 0.12582480091012513, + "grad_norm": 1.841000108666981, + "learning_rate": 1.2480548873455675e-06, + "loss": 0.0827, + "step": 553 + }, + { + "epoch": 0.1260523321956769, + "grad_norm": 2.5765107503856144, + "learning_rate": 1.248047837126721e-06, + "loss": 0.1326, + "step": 554 + }, + { + "epoch": 0.12627986348122866, + "grad_norm": 1.3867433794433537, + "learning_rate": 1.248040774173918e-06, + "loss": 0.0588, + "step": 555 + }, + { + "epoch": 0.12650739476678044, + "grad_norm": 1.3761837680071665, + "learning_rate": 1.248033698487302e-06, + "loss": 0.0495, + "step": 556 + }, + { + "epoch": 0.1267349260523322, + "grad_norm": 1.1704965217659915, + "learning_rate": 1.2480266100670189e-06, + "loss": 0.0467, + "step": 557 + }, + { + "epoch": 0.12696245733788397, + "grad_norm": 2.106768640375489, + "learning_rate": 1.2480195089132125e-06, + "loss": 0.0426, + "step": 558 + }, + { + "epoch": 0.12718998862343572, + "grad_norm": 1.1575398708268563, + "learning_rate": 1.2480123950260284e-06, + "loss": 0.0413, + "step": 559 + }, + { + "epoch": 0.12741751990898748, + "grad_norm": 9.831393771519632, + "learning_rate": 1.248005268405612e-06, + "loss": 0.0434, + "step": 560 + }, + { + "epoch": 0.12764505119453926, + "grad_norm": 1.0820393760670333, + "learning_rate": 1.2479981290521087e-06, + "loss": 0.034, + "step": 561 + }, + { + "epoch": 0.127872582480091, + "grad_norm": 2.206902342168789, + "learning_rate": 1.2479909769656648e-06, + "loss": 0.073, + "step": 562 + }, + { + "epoch": 0.1281001137656428, + "grad_norm": 1.5027865833496339, + "learning_rate": 1.2479838121464263e-06, + "loss": 0.0744, + "step": 563 + }, + { + "epoch": 0.12832764505119454, + "grad_norm": 2.001324966678472, + "learning_rate": 1.2479766345945395e-06, + "loss": 0.0457, + "step": 564 + }, + { + "epoch": 0.1285551763367463, + "grad_norm": 0.9498156004808115, + "learning_rate": 1.2479694443101513e-06, + "loss": 0.0442, + "step": 565 + }, + { + "epoch": 0.12878270762229807, + "grad_norm": 1.427402300347758, + "learning_rate": 1.2479622412934087e-06, + "loss": 0.0774, + "step": 566 + }, + { + "epoch": 0.12901023890784982, + "grad_norm": 1.338474573791641, + "learning_rate": 1.2479550255444586e-06, + "loss": 0.0498, + "step": 567 + }, + { + "epoch": 0.1292377701934016, + "grad_norm": 1.0332062154173083, + "learning_rate": 1.2479477970634487e-06, + "loss": 0.0575, + "step": 568 + }, + { + "epoch": 0.12946530147895335, + "grad_norm": 2.342085432201913, + "learning_rate": 1.2479405558505267e-06, + "loss": 0.0776, + "step": 569 + }, + { + "epoch": 0.1296928327645051, + "grad_norm": 1.7614392055133243, + "learning_rate": 1.247933301905841e-06, + "loss": 0.0626, + "step": 570 + }, + { + "epoch": 0.12992036405005689, + "grad_norm": 1.401551595719724, + "learning_rate": 1.2479260352295388e-06, + "loss": 0.0354, + "step": 571 + }, + { + "epoch": 0.13014789533560864, + "grad_norm": 1.4755845320165697, + "learning_rate": 1.2479187558217697e-06, + "loss": 0.0407, + "step": 572 + }, + { + "epoch": 0.13037542662116042, + "grad_norm": 0.7585780841566192, + "learning_rate": 1.247911463682682e-06, + "loss": 0.0376, + "step": 573 + }, + { + "epoch": 0.13060295790671217, + "grad_norm": 2.2124015942087496, + "learning_rate": 1.2479041588124247e-06, + "loss": 0.0537, + "step": 574 + }, + { + "epoch": 0.13083048919226395, + "grad_norm": 1.0766166316852028, + "learning_rate": 1.2478968412111471e-06, + "loss": 0.0292, + "step": 575 + }, + { + "epoch": 0.1310580204778157, + "grad_norm": 0.995096761634457, + "learning_rate": 1.247889510878999e-06, + "loss": 0.0523, + "step": 576 + }, + { + "epoch": 0.13128555176336745, + "grad_norm": 1.6016154355497043, + "learning_rate": 1.24788216781613e-06, + "loss": 0.0634, + "step": 577 + }, + { + "epoch": 0.13151308304891923, + "grad_norm": 1.1448425600800458, + "learning_rate": 1.2478748120226902e-06, + "loss": 0.0449, + "step": 578 + }, + { + "epoch": 0.13174061433447098, + "grad_norm": 1.1092744505453933, + "learning_rate": 1.2478674434988299e-06, + "loss": 0.0377, + "step": 579 + }, + { + "epoch": 0.13196814562002276, + "grad_norm": 0.8431904554396206, + "learning_rate": 1.2478600622447001e-06, + "loss": 0.0404, + "step": 580 + }, + { + "epoch": 0.13219567690557452, + "grad_norm": 1.1168682097019669, + "learning_rate": 1.2478526682604512e-06, + "loss": 0.0324, + "step": 581 + }, + { + "epoch": 0.13242320819112627, + "grad_norm": 1.3377796205096442, + "learning_rate": 1.2478452615462345e-06, + "loss": 0.0663, + "step": 582 + }, + { + "epoch": 0.13265073947667805, + "grad_norm": 3.2228218888338667, + "learning_rate": 1.247837842102201e-06, + "loss": 0.1264, + "step": 583 + }, + { + "epoch": 0.1328782707622298, + "grad_norm": 1.5298131963208237, + "learning_rate": 1.2478304099285031e-06, + "loss": 0.0525, + "step": 584 + }, + { + "epoch": 0.13310580204778158, + "grad_norm": 1.1458963041110781, + "learning_rate": 1.2478229650252921e-06, + "loss": 0.0464, + "step": 585 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.7278138919558592, + "learning_rate": 1.2478155073927204e-06, + "loss": 0.0556, + "step": 586 + }, + { + "epoch": 0.13356086461888508, + "grad_norm": 1.2475569782092948, + "learning_rate": 1.2478080370309404e-06, + "loss": 0.0428, + "step": 587 + }, + { + "epoch": 0.13378839590443686, + "grad_norm": 1.088342211454506, + "learning_rate": 1.2478005539401046e-06, + "loss": 0.0295, + "step": 588 + }, + { + "epoch": 0.13401592718998862, + "grad_norm": 1.7587707539606798, + "learning_rate": 1.2477930581203663e-06, + "loss": 0.0442, + "step": 589 + }, + { + "epoch": 0.1342434584755404, + "grad_norm": 1.1629708095404674, + "learning_rate": 1.2477855495718782e-06, + "loss": 0.0548, + "step": 590 + }, + { + "epoch": 0.13447098976109215, + "grad_norm": 1.551063066789381, + "learning_rate": 1.2477780282947942e-06, + "loss": 0.037, + "step": 591 + }, + { + "epoch": 0.13469852104664393, + "grad_norm": 1.0301289542876222, + "learning_rate": 1.2477704942892677e-06, + "loss": 0.0443, + "step": 592 + }, + { + "epoch": 0.13492605233219568, + "grad_norm": 3.67842928304707, + "learning_rate": 1.2477629475554532e-06, + "loss": 0.0909, + "step": 593 + }, + { + "epoch": 0.13515358361774743, + "grad_norm": 1.6590539267977245, + "learning_rate": 1.2477553880935043e-06, + "loss": 0.0523, + "step": 594 + }, + { + "epoch": 0.1353811149032992, + "grad_norm": 1.9972940081293025, + "learning_rate": 1.2477478159035758e-06, + "loss": 0.0745, + "step": 595 + }, + { + "epoch": 0.13560864618885096, + "grad_norm": 2.5034292233621693, + "learning_rate": 1.2477402309858226e-06, + "loss": 0.0731, + "step": 596 + }, + { + "epoch": 0.13583617747440274, + "grad_norm": 1.7292171438267574, + "learning_rate": 1.2477326333403995e-06, + "loss": 0.0947, + "step": 597 + }, + { + "epoch": 0.1360637087599545, + "grad_norm": 1.0893865478354072, + "learning_rate": 1.2477250229674618e-06, + "loss": 0.036, + "step": 598 + }, + { + "epoch": 0.13629124004550625, + "grad_norm": 2.0220916101870854, + "learning_rate": 1.2477173998671653e-06, + "loss": 0.0695, + "step": 599 + }, + { + "epoch": 0.13651877133105803, + "grad_norm": 1.6663966081636725, + "learning_rate": 1.2477097640396655e-06, + "loss": 0.0239, + "step": 600 + }, + { + "epoch": 0.13674630261660978, + "grad_norm": 2.160820824239015, + "learning_rate": 1.2477021154851185e-06, + "loss": 0.041, + "step": 601 + }, + { + "epoch": 0.13697383390216156, + "grad_norm": 1.7560963022061893, + "learning_rate": 1.2476944542036806e-06, + "loss": 0.0646, + "step": 602 + }, + { + "epoch": 0.1372013651877133, + "grad_norm": 1.0045761668266382, + "learning_rate": 1.2476867801955086e-06, + "loss": 0.0319, + "step": 603 + }, + { + "epoch": 0.13742889647326506, + "grad_norm": 1.1722883805586992, + "learning_rate": 1.247679093460759e-06, + "loss": 0.0421, + "step": 604 + }, + { + "epoch": 0.13765642775881684, + "grad_norm": 1.5520795851774112, + "learning_rate": 1.2476713939995895e-06, + "loss": 0.0545, + "step": 605 + }, + { + "epoch": 0.1378839590443686, + "grad_norm": 1.5341083321869782, + "learning_rate": 1.2476636818121568e-06, + "loss": 0.0587, + "step": 606 + }, + { + "epoch": 0.13811149032992037, + "grad_norm": 1.427789001931161, + "learning_rate": 1.247655956898619e-06, + "loss": 0.0783, + "step": 607 + }, + { + "epoch": 0.13833902161547212, + "grad_norm": 1.6612253896109521, + "learning_rate": 1.2476482192591335e-06, + "loss": 0.0454, + "step": 608 + }, + { + "epoch": 0.1385665529010239, + "grad_norm": 4.06982296073183, + "learning_rate": 1.247640468893859e-06, + "loss": 0.0518, + "step": 609 + }, + { + "epoch": 0.13879408418657566, + "grad_norm": 0.9984415328300282, + "learning_rate": 1.2476327058029534e-06, + "loss": 0.0286, + "step": 610 + }, + { + "epoch": 0.1390216154721274, + "grad_norm": 1.5505955919902168, + "learning_rate": 1.2476249299865757e-06, + "loss": 0.0679, + "step": 611 + }, + { + "epoch": 0.1392491467576792, + "grad_norm": 1.1777984076454868, + "learning_rate": 1.2476171414448847e-06, + "loss": 0.0333, + "step": 612 + }, + { + "epoch": 0.13947667804323094, + "grad_norm": 1.5134654111327426, + "learning_rate": 1.2476093401780397e-06, + "loss": 0.0464, + "step": 613 + }, + { + "epoch": 0.13970420932878272, + "grad_norm": 8.868555700718426, + "learning_rate": 1.2476015261861998e-06, + "loss": 0.0588, + "step": 614 + }, + { + "epoch": 0.13993174061433447, + "grad_norm": 1.07185240181413, + "learning_rate": 1.247593699469525e-06, + "loss": 0.0508, + "step": 615 + }, + { + "epoch": 0.14015927189988622, + "grad_norm": 1.0467231615661654, + "learning_rate": 1.2475858600281754e-06, + "loss": 0.0375, + "step": 616 + }, + { + "epoch": 0.140386803185438, + "grad_norm": 1.6914112039280367, + "learning_rate": 1.247578007862311e-06, + "loss": 0.0385, + "step": 617 + }, + { + "epoch": 0.14061433447098975, + "grad_norm": 1.6162335462442214, + "learning_rate": 1.2475701429720923e-06, + "loss": 0.0617, + "step": 618 + }, + { + "epoch": 0.14084186575654153, + "grad_norm": 0.8727030068232594, + "learning_rate": 1.24756226535768e-06, + "loss": 0.0388, + "step": 619 + }, + { + "epoch": 0.1410693970420933, + "grad_norm": 1.5566079211643449, + "learning_rate": 1.2475543750192352e-06, + "loss": 0.094, + "step": 620 + }, + { + "epoch": 0.14129692832764504, + "grad_norm": 1.1473885930159387, + "learning_rate": 1.2475464719569192e-06, + "loss": 0.0563, + "step": 621 + }, + { + "epoch": 0.14152445961319682, + "grad_norm": 1.111358042304426, + "learning_rate": 1.2475385561708934e-06, + "loss": 0.058, + "step": 622 + }, + { + "epoch": 0.14175199089874857, + "grad_norm": 0.8961189135726134, + "learning_rate": 1.2475306276613194e-06, + "loss": 0.0393, + "step": 623 + }, + { + "epoch": 0.14197952218430035, + "grad_norm": 3.442046327889576, + "learning_rate": 1.2475226864283596e-06, + "loss": 0.0713, + "step": 624 + }, + { + "epoch": 0.1422070534698521, + "grad_norm": 4.150666880510046, + "learning_rate": 1.2475147324721764e-06, + "loss": 0.0442, + "step": 625 + }, + { + "epoch": 0.14243458475540388, + "grad_norm": 0.9296406182877743, + "learning_rate": 1.2475067657929319e-06, + "loss": 0.0414, + "step": 626 + }, + { + "epoch": 0.14266211604095563, + "grad_norm": 1.0829545070929647, + "learning_rate": 1.2474987863907894e-06, + "loss": 0.0451, + "step": 627 + }, + { + "epoch": 0.14288964732650739, + "grad_norm": 1.8334147294224943, + "learning_rate": 1.2474907942659116e-06, + "loss": 0.0771, + "step": 628 + }, + { + "epoch": 0.14311717861205916, + "grad_norm": 1.7815672754482244, + "learning_rate": 1.247482789418462e-06, + "loss": 0.0897, + "step": 629 + }, + { + "epoch": 0.14334470989761092, + "grad_norm": 1.3733903816478978, + "learning_rate": 1.2474747718486044e-06, + "loss": 0.0358, + "step": 630 + }, + { + "epoch": 0.1435722411831627, + "grad_norm": 2.28709865860884, + "learning_rate": 1.2474667415565022e-06, + "loss": 0.0652, + "step": 631 + }, + { + "epoch": 0.14379977246871445, + "grad_norm": 1.231780324099585, + "learning_rate": 1.24745869854232e-06, + "loss": 0.0425, + "step": 632 + }, + { + "epoch": 0.1440273037542662, + "grad_norm": 1.335370738840473, + "learning_rate": 1.2474506428062219e-06, + "loss": 0.0422, + "step": 633 + }, + { + "epoch": 0.14425483503981798, + "grad_norm": 2.966338619611093, + "learning_rate": 1.2474425743483726e-06, + "loss": 0.1214, + "step": 634 + }, + { + "epoch": 0.14448236632536973, + "grad_norm": 0.8989763932220884, + "learning_rate": 1.2474344931689371e-06, + "loss": 0.0341, + "step": 635 + }, + { + "epoch": 0.1447098976109215, + "grad_norm": 0.937533756927628, + "learning_rate": 1.2474263992680805e-06, + "loss": 0.0394, + "step": 636 + }, + { + "epoch": 0.14493742889647326, + "grad_norm": 1.8912948524392619, + "learning_rate": 1.247418292645968e-06, + "loss": 0.058, + "step": 637 + }, + { + "epoch": 0.14516496018202502, + "grad_norm": 2.6567132856925877, + "learning_rate": 1.2474101733027659e-06, + "loss": 0.1061, + "step": 638 + }, + { + "epoch": 0.1453924914675768, + "grad_norm": 1.0544164694645533, + "learning_rate": 1.2474020412386395e-06, + "loss": 0.0327, + "step": 639 + }, + { + "epoch": 0.14562002275312855, + "grad_norm": 0.8862493084977525, + "learning_rate": 1.2473938964537551e-06, + "loss": 0.0337, + "step": 640 + }, + { + "epoch": 0.14584755403868033, + "grad_norm": 1.1660786205758522, + "learning_rate": 1.2473857389482797e-06, + "loss": 0.0379, + "step": 641 + }, + { + "epoch": 0.14607508532423208, + "grad_norm": 1.275063352619887, + "learning_rate": 1.2473775687223794e-06, + "loss": 0.0677, + "step": 642 + }, + { + "epoch": 0.14630261660978386, + "grad_norm": 1.033368353907448, + "learning_rate": 1.2473693857762215e-06, + "loss": 0.0499, + "step": 643 + }, + { + "epoch": 0.1465301478953356, + "grad_norm": 0.8176955754401027, + "learning_rate": 1.247361190109973e-06, + "loss": 0.028, + "step": 644 + }, + { + "epoch": 0.14675767918088736, + "grad_norm": 1.5868301680057004, + "learning_rate": 1.2473529817238016e-06, + "loss": 0.0758, + "step": 645 + }, + { + "epoch": 0.14698521046643914, + "grad_norm": 1.1116607935229232, + "learning_rate": 1.2473447606178754e-06, + "loss": 0.0464, + "step": 646 + }, + { + "epoch": 0.1472127417519909, + "grad_norm": 1.3737040045295303, + "learning_rate": 1.2473365267923617e-06, + "loss": 0.038, + "step": 647 + }, + { + "epoch": 0.14744027303754267, + "grad_norm": 1.6851756106039961, + "learning_rate": 1.2473282802474293e-06, + "loss": 0.0521, + "step": 648 + }, + { + "epoch": 0.14766780432309443, + "grad_norm": 1.067298154630741, + "learning_rate": 1.2473200209832465e-06, + "loss": 0.0339, + "step": 649 + }, + { + "epoch": 0.14789533560864618, + "grad_norm": 3.389712283735002, + "learning_rate": 1.2473117489999823e-06, + "loss": 0.1032, + "step": 650 + }, + { + "epoch": 0.14812286689419796, + "grad_norm": 1.6568457682006943, + "learning_rate": 1.2473034642978057e-06, + "loss": 0.0619, + "step": 651 + }, + { + "epoch": 0.1483503981797497, + "grad_norm": 1.490856549646787, + "learning_rate": 1.247295166876886e-06, + "loss": 0.0516, + "step": 652 + }, + { + "epoch": 0.1485779294653015, + "grad_norm": 1.1851614081448008, + "learning_rate": 1.2472868567373924e-06, + "loss": 0.0431, + "step": 653 + }, + { + "epoch": 0.14880546075085324, + "grad_norm": 1.644166757760147, + "learning_rate": 1.2472785338794953e-06, + "loss": 0.082, + "step": 654 + }, + { + "epoch": 0.149032992036405, + "grad_norm": 1.556466054889254, + "learning_rate": 1.247270198303365e-06, + "loss": 0.0573, + "step": 655 + }, + { + "epoch": 0.14926052332195677, + "grad_norm": 2.233896039541245, + "learning_rate": 1.247261850009171e-06, + "loss": 0.1073, + "step": 656 + }, + { + "epoch": 0.14948805460750852, + "grad_norm": 1.5970780316095814, + "learning_rate": 1.2472534889970848e-06, + "loss": 0.0741, + "step": 657 + }, + { + "epoch": 0.1497155858930603, + "grad_norm": 1.2400037573314944, + "learning_rate": 1.2472451152672766e-06, + "loss": 0.0483, + "step": 658 + }, + { + "epoch": 0.14994311717861206, + "grad_norm": 0.7241628580276863, + "learning_rate": 1.2472367288199177e-06, + "loss": 0.0294, + "step": 659 + }, + { + "epoch": 0.15017064846416384, + "grad_norm": 1.7234352219783362, + "learning_rate": 1.2472283296551798e-06, + "loss": 0.0462, + "step": 660 + }, + { + "epoch": 0.1503981797497156, + "grad_norm": 1.4734355231360154, + "learning_rate": 1.2472199177732346e-06, + "loss": 0.0582, + "step": 661 + }, + { + "epoch": 0.15062571103526734, + "grad_norm": 2.1378423903625317, + "learning_rate": 1.2472114931742537e-06, + "loss": 0.0584, + "step": 662 + }, + { + "epoch": 0.15085324232081912, + "grad_norm": 10.24253063417928, + "learning_rate": 1.2472030558584093e-06, + "loss": 0.0477, + "step": 663 + }, + { + "epoch": 0.15108077360637087, + "grad_norm": 1.8691143785999473, + "learning_rate": 1.2471946058258742e-06, + "loss": 0.0739, + "step": 664 + }, + { + "epoch": 0.15130830489192265, + "grad_norm": 2.241077277544209, + "learning_rate": 1.2471861430768205e-06, + "loss": 0.0672, + "step": 665 + }, + { + "epoch": 0.1515358361774744, + "grad_norm": 2.2720454211658385, + "learning_rate": 1.2471776676114217e-06, + "loss": 0.0744, + "step": 666 + }, + { + "epoch": 0.15176336746302616, + "grad_norm": 1.5599655593025843, + "learning_rate": 1.2471691794298508e-06, + "loss": 0.0708, + "step": 667 + }, + { + "epoch": 0.15199089874857794, + "grad_norm": 1.2105729421418516, + "learning_rate": 1.2471606785322814e-06, + "loss": 0.0408, + "step": 668 + }, + { + "epoch": 0.1522184300341297, + "grad_norm": 1.2672422777930856, + "learning_rate": 1.247152164918887e-06, + "loss": 0.0406, + "step": 669 + }, + { + "epoch": 0.15244596131968147, + "grad_norm": 1.4429003913192175, + "learning_rate": 1.247143638589842e-06, + "loss": 0.0537, + "step": 670 + }, + { + "epoch": 0.15267349260523322, + "grad_norm": 2.4501715529962667, + "learning_rate": 1.2471350995453203e-06, + "loss": 0.0658, + "step": 671 + }, + { + "epoch": 0.15290102389078497, + "grad_norm": 1.48398143402129, + "learning_rate": 1.2471265477854966e-06, + "loss": 0.0556, + "step": 672 + }, + { + "epoch": 0.15312855517633675, + "grad_norm": 2.1744278885096278, + "learning_rate": 1.2471179833105454e-06, + "loss": 0.0404, + "step": 673 + }, + { + "epoch": 0.1533560864618885, + "grad_norm": 6.901371774679421, + "learning_rate": 1.2471094061206422e-06, + "loss": 0.0869, + "step": 674 + }, + { + "epoch": 0.15358361774744028, + "grad_norm": 1.1601432497447364, + "learning_rate": 1.247100816215962e-06, + "loss": 0.0379, + "step": 675 + }, + { + "epoch": 0.15381114903299203, + "grad_norm": 1.2820824697846318, + "learning_rate": 1.2470922135966806e-06, + "loss": 0.032, + "step": 676 + }, + { + "epoch": 0.1540386803185438, + "grad_norm": 1.2705594517037477, + "learning_rate": 1.2470835982629736e-06, + "loss": 0.0494, + "step": 677 + }, + { + "epoch": 0.15426621160409557, + "grad_norm": 1.5982674187017603, + "learning_rate": 1.247074970215017e-06, + "loss": 0.0495, + "step": 678 + }, + { + "epoch": 0.15449374288964732, + "grad_norm": 1.7109186334283122, + "learning_rate": 1.2470663294529873e-06, + "loss": 0.0414, + "step": 679 + }, + { + "epoch": 0.1547212741751991, + "grad_norm": 1.0715248913545208, + "learning_rate": 1.2470576759770612e-06, + "loss": 0.0325, + "step": 680 + }, + { + "epoch": 0.15494880546075085, + "grad_norm": 1.824156765255701, + "learning_rate": 1.2470490097874155e-06, + "loss": 0.0466, + "step": 681 + }, + { + "epoch": 0.15517633674630263, + "grad_norm": 2.4373458213618897, + "learning_rate": 1.247040330884227e-06, + "loss": 0.0539, + "step": 682 + }, + { + "epoch": 0.15540386803185438, + "grad_norm": 1.0065301428430162, + "learning_rate": 1.2470316392676738e-06, + "loss": 0.0525, + "step": 683 + }, + { + "epoch": 0.15563139931740613, + "grad_norm": 1.5815427583099921, + "learning_rate": 1.2470229349379326e-06, + "loss": 0.0519, + "step": 684 + }, + { + "epoch": 0.1558589306029579, + "grad_norm": 1.9652039642446397, + "learning_rate": 1.2470142178951822e-06, + "loss": 0.0768, + "step": 685 + }, + { + "epoch": 0.15608646188850966, + "grad_norm": 1.6149386041596687, + "learning_rate": 1.2470054881396002e-06, + "loss": 0.0832, + "step": 686 + }, + { + "epoch": 0.15631399317406144, + "grad_norm": 1.1987854832655134, + "learning_rate": 1.246996745671365e-06, + "loss": 0.0659, + "step": 687 + }, + { + "epoch": 0.1565415244596132, + "grad_norm": 1.3921581311290316, + "learning_rate": 1.2469879904906556e-06, + "loss": 0.0477, + "step": 688 + }, + { + "epoch": 0.15676905574516495, + "grad_norm": 1.5291818763479217, + "learning_rate": 1.2469792225976507e-06, + "loss": 0.0597, + "step": 689 + }, + { + "epoch": 0.15699658703071673, + "grad_norm": 1.1978579028524097, + "learning_rate": 1.2469704419925296e-06, + "loss": 0.0551, + "step": 690 + }, + { + "epoch": 0.15722411831626848, + "grad_norm": 2.2562422781402827, + "learning_rate": 1.246961648675472e-06, + "loss": 0.0709, + "step": 691 + }, + { + "epoch": 0.15745164960182026, + "grad_norm": 1.947406282057691, + "learning_rate": 1.246952842646657e-06, + "loss": 0.115, + "step": 692 + }, + { + "epoch": 0.157679180887372, + "grad_norm": 1.2450058586298152, + "learning_rate": 1.2469440239062653e-06, + "loss": 0.0499, + "step": 693 + }, + { + "epoch": 0.15790671217292376, + "grad_norm": 1.8197388877884058, + "learning_rate": 1.2469351924544766e-06, + "loss": 0.0927, + "step": 694 + }, + { + "epoch": 0.15813424345847554, + "grad_norm": 1.904635209687238, + "learning_rate": 1.2469263482914716e-06, + "loss": 0.0814, + "step": 695 + }, + { + "epoch": 0.1583617747440273, + "grad_norm": 1.4011214554570595, + "learning_rate": 1.246917491417431e-06, + "loss": 0.0585, + "step": 696 + }, + { + "epoch": 0.15858930602957907, + "grad_norm": 1.9445852493644915, + "learning_rate": 1.246908621832536e-06, + "loss": 0.1005, + "step": 697 + }, + { + "epoch": 0.15881683731513083, + "grad_norm": 1.363848979307397, + "learning_rate": 1.2468997395369677e-06, + "loss": 0.0717, + "step": 698 + }, + { + "epoch": 0.1590443686006826, + "grad_norm": 1.3903385726286603, + "learning_rate": 1.2468908445309077e-06, + "loss": 0.0557, + "step": 699 + }, + { + "epoch": 0.15927189988623436, + "grad_norm": 0.8725919788890233, + "learning_rate": 1.2468819368145376e-06, + "loss": 0.0365, + "step": 700 + }, + { + "epoch": 0.1594994311717861, + "grad_norm": 1.33354116221911, + "learning_rate": 1.2468730163880398e-06, + "loss": 0.0486, + "step": 701 + }, + { + "epoch": 0.1597269624573379, + "grad_norm": 4.544549924474667, + "learning_rate": 1.2468640832515962e-06, + "loss": 0.0302, + "step": 702 + }, + { + "epoch": 0.15995449374288964, + "grad_norm": 2.9489517825532308, + "learning_rate": 1.24685513740539e-06, + "loss": 0.0904, + "step": 703 + }, + { + "epoch": 0.16018202502844142, + "grad_norm": 1.3714756073146785, + "learning_rate": 1.2468461788496036e-06, + "loss": 0.056, + "step": 704 + }, + { + "epoch": 0.16040955631399317, + "grad_norm": 1.9329458786445137, + "learning_rate": 1.24683720758442e-06, + "loss": 0.0848, + "step": 705 + }, + { + "epoch": 0.16063708759954493, + "grad_norm": 1.0521231330897036, + "learning_rate": 1.2468282236100226e-06, + "loss": 0.0438, + "step": 706 + }, + { + "epoch": 0.1608646188850967, + "grad_norm": 1.7554793724577804, + "learning_rate": 1.2468192269265955e-06, + "loss": 0.1102, + "step": 707 + }, + { + "epoch": 0.16109215017064846, + "grad_norm": 1.0071708604696206, + "learning_rate": 1.246810217534322e-06, + "loss": 0.0266, + "step": 708 + }, + { + "epoch": 0.16131968145620024, + "grad_norm": 1.3334260212411804, + "learning_rate": 1.2468011954333864e-06, + "loss": 0.0609, + "step": 709 + }, + { + "epoch": 0.161547212741752, + "grad_norm": 1.1880594144178185, + "learning_rate": 1.2467921606239734e-06, + "loss": 0.0465, + "step": 710 + }, + { + "epoch": 0.16177474402730374, + "grad_norm": 2.7036981936596542, + "learning_rate": 1.2467831131062672e-06, + "loss": 0.0516, + "step": 711 + }, + { + "epoch": 0.16200227531285552, + "grad_norm": 1.416736687413605, + "learning_rate": 1.2467740528804528e-06, + "loss": 0.0552, + "step": 712 + }, + { + "epoch": 0.16222980659840727, + "grad_norm": 1.3251728729578673, + "learning_rate": 1.2467649799467156e-06, + "loss": 0.0615, + "step": 713 + }, + { + "epoch": 0.16245733788395905, + "grad_norm": 0.9167577669577527, + "learning_rate": 1.246755894305241e-06, + "loss": 0.0312, + "step": 714 + }, + { + "epoch": 0.1626848691695108, + "grad_norm": 1.4735705097484915, + "learning_rate": 1.2467467959562143e-06, + "loss": 0.0676, + "step": 715 + }, + { + "epoch": 0.16291240045506258, + "grad_norm": 1.6890201461846701, + "learning_rate": 1.2467376848998221e-06, + "loss": 0.0597, + "step": 716 + }, + { + "epoch": 0.16313993174061434, + "grad_norm": 1.4659558773380117, + "learning_rate": 1.2467285611362501e-06, + "loss": 0.052, + "step": 717 + }, + { + "epoch": 0.1633674630261661, + "grad_norm": 1.7717923581360278, + "learning_rate": 1.2467194246656851e-06, + "loss": 0.0563, + "step": 718 + }, + { + "epoch": 0.16359499431171787, + "grad_norm": 1.0756915461750918, + "learning_rate": 1.2467102754883136e-06, + "loss": 0.0411, + "step": 719 + }, + { + "epoch": 0.16382252559726962, + "grad_norm": 1.2683417009383617, + "learning_rate": 1.2467011136043228e-06, + "loss": 0.0454, + "step": 720 + }, + { + "epoch": 0.1640500568828214, + "grad_norm": 1.5509079166930662, + "learning_rate": 1.2466919390138995e-06, + "loss": 0.0745, + "step": 721 + }, + { + "epoch": 0.16427758816837315, + "grad_norm": 1.799801190377342, + "learning_rate": 1.246682751717232e-06, + "loss": 0.0531, + "step": 722 + }, + { + "epoch": 0.1645051194539249, + "grad_norm": 1.734480174185366, + "learning_rate": 1.2466735517145074e-06, + "loss": 0.0792, + "step": 723 + }, + { + "epoch": 0.16473265073947668, + "grad_norm": 1.961760175196997, + "learning_rate": 1.2466643390059138e-06, + "loss": 0.0685, + "step": 724 + }, + { + "epoch": 0.16496018202502843, + "grad_norm": 0.8466307246278059, + "learning_rate": 1.2466551135916398e-06, + "loss": 0.0229, + "step": 725 + }, + { + "epoch": 0.16518771331058021, + "grad_norm": 1.2173112499342529, + "learning_rate": 1.2466458754718737e-06, + "loss": 0.0632, + "step": 726 + }, + { + "epoch": 0.16541524459613197, + "grad_norm": 2.194872247572006, + "learning_rate": 1.2466366246468045e-06, + "loss": 0.0432, + "step": 727 + }, + { + "epoch": 0.16564277588168372, + "grad_norm": 1.4416201742124337, + "learning_rate": 1.246627361116621e-06, + "loss": 0.0619, + "step": 728 + }, + { + "epoch": 0.1658703071672355, + "grad_norm": 1.8390552268184857, + "learning_rate": 1.246618084881513e-06, + "loss": 0.1148, + "step": 729 + }, + { + "epoch": 0.16609783845278725, + "grad_norm": 1.7117932742809556, + "learning_rate": 1.2466087959416695e-06, + "loss": 0.0354, + "step": 730 + }, + { + "epoch": 0.16632536973833903, + "grad_norm": 2.1473525159969644, + "learning_rate": 1.2465994942972805e-06, + "loss": 0.0479, + "step": 731 + }, + { + "epoch": 0.16655290102389078, + "grad_norm": 2.0228728542376846, + "learning_rate": 1.2465901799485366e-06, + "loss": 0.0819, + "step": 732 + }, + { + "epoch": 0.16678043230944256, + "grad_norm": 0.7672679660904389, + "learning_rate": 1.2465808528956277e-06, + "loss": 0.0251, + "step": 733 + }, + { + "epoch": 0.1670079635949943, + "grad_norm": 6.58955540161242, + "learning_rate": 1.2465715131387446e-06, + "loss": 0.0608, + "step": 734 + }, + { + "epoch": 0.16723549488054607, + "grad_norm": 2.0572059988722873, + "learning_rate": 1.2465621606780778e-06, + "loss": 0.0841, + "step": 735 + }, + { + "epoch": 0.16746302616609784, + "grad_norm": 1.628470993180389, + "learning_rate": 1.2465527955138191e-06, + "loss": 0.059, + "step": 736 + }, + { + "epoch": 0.1676905574516496, + "grad_norm": 1.0886679327758462, + "learning_rate": 1.2465434176461596e-06, + "loss": 0.0419, + "step": 737 + }, + { + "epoch": 0.16791808873720138, + "grad_norm": 1.4286021962604534, + "learning_rate": 1.2465340270752908e-06, + "loss": 0.0481, + "step": 738 + }, + { + "epoch": 0.16814562002275313, + "grad_norm": 0.8663035560041373, + "learning_rate": 1.2465246238014047e-06, + "loss": 0.0295, + "step": 739 + }, + { + "epoch": 0.16837315130830488, + "grad_norm": 1.8501418707953075, + "learning_rate": 1.2465152078246936e-06, + "loss": 0.0829, + "step": 740 + }, + { + "epoch": 0.16860068259385666, + "grad_norm": 1.311824892930283, + "learning_rate": 1.24650577914535e-06, + "loss": 0.0418, + "step": 741 + }, + { + "epoch": 0.1688282138794084, + "grad_norm": 1.249901682454073, + "learning_rate": 1.2464963377635667e-06, + "loss": 0.0479, + "step": 742 + }, + { + "epoch": 0.1690557451649602, + "grad_norm": 1.9982982994055205, + "learning_rate": 1.246486883679536e-06, + "loss": 0.112, + "step": 743 + }, + { + "epoch": 0.16928327645051194, + "grad_norm": 1.8891194873050854, + "learning_rate": 1.246477416893452e-06, + "loss": 0.0489, + "step": 744 + }, + { + "epoch": 0.1695108077360637, + "grad_norm": 1.3265078136556134, + "learning_rate": 1.2464679374055074e-06, + "loss": 0.0492, + "step": 745 + }, + { + "epoch": 0.16973833902161548, + "grad_norm": 1.1620433482725554, + "learning_rate": 1.2464584452158968e-06, + "loss": 0.0217, + "step": 746 + }, + { + "epoch": 0.16996587030716723, + "grad_norm": 1.8320124858819533, + "learning_rate": 1.2464489403248133e-06, + "loss": 0.057, + "step": 747 + }, + { + "epoch": 0.170193401592719, + "grad_norm": 0.8996339413245623, + "learning_rate": 1.246439422732452e-06, + "loss": 0.0308, + "step": 748 + }, + { + "epoch": 0.17042093287827076, + "grad_norm": 1.1199200581464024, + "learning_rate": 1.2464298924390066e-06, + "loss": 0.0449, + "step": 749 + }, + { + "epoch": 0.17064846416382254, + "grad_norm": 1.6610024470498899, + "learning_rate": 1.2464203494446725e-06, + "loss": 0.0667, + "step": 750 + }, + { + "epoch": 0.1708759954493743, + "grad_norm": 1.1007524784666796, + "learning_rate": 1.2464107937496444e-06, + "loss": 0.0341, + "step": 751 + }, + { + "epoch": 0.17110352673492604, + "grad_norm": 1.0836506765705751, + "learning_rate": 1.246401225354118e-06, + "loss": 0.0397, + "step": 752 + }, + { + "epoch": 0.17133105802047782, + "grad_norm": 1.320442517854178, + "learning_rate": 1.2463916442582883e-06, + "loss": 0.0694, + "step": 753 + }, + { + "epoch": 0.17155858930602957, + "grad_norm": 1.0431861088552385, + "learning_rate": 1.2463820504623516e-06, + "loss": 0.0285, + "step": 754 + }, + { + "epoch": 0.17178612059158135, + "grad_norm": 0.9128127625945931, + "learning_rate": 1.246372443966504e-06, + "loss": 0.0394, + "step": 755 + }, + { + "epoch": 0.1720136518771331, + "grad_norm": 1.3898389984950463, + "learning_rate": 1.246362824770941e-06, + "loss": 0.0451, + "step": 756 + }, + { + "epoch": 0.17224118316268486, + "grad_norm": 0.9752697300280603, + "learning_rate": 1.2463531928758605e-06, + "loss": 0.0398, + "step": 757 + }, + { + "epoch": 0.17246871444823664, + "grad_norm": 1.8211753063899252, + "learning_rate": 1.2463435482814585e-06, + "loss": 0.0947, + "step": 758 + }, + { + "epoch": 0.1726962457337884, + "grad_norm": 1.5787941280691695, + "learning_rate": 1.246333890987932e-06, + "loss": 0.1047, + "step": 759 + }, + { + "epoch": 0.17292377701934017, + "grad_norm": 1.8356072487697, + "learning_rate": 1.246324220995479e-06, + "loss": 0.0729, + "step": 760 + }, + { + "epoch": 0.17315130830489192, + "grad_norm": 4.476837441740864, + "learning_rate": 1.2463145383042966e-06, + "loss": 0.0686, + "step": 761 + }, + { + "epoch": 0.17337883959044367, + "grad_norm": 252.52198381660855, + "learning_rate": 1.2463048429145832e-06, + "loss": 1.0972, + "step": 762 + }, + { + "epoch": 0.17360637087599545, + "grad_norm": 1.6008327751394016, + "learning_rate": 1.2462951348265364e-06, + "loss": 0.067, + "step": 763 + }, + { + "epoch": 0.1738339021615472, + "grad_norm": 1.087447761528399, + "learning_rate": 1.2462854140403553e-06, + "loss": 0.0433, + "step": 764 + }, + { + "epoch": 0.17406143344709898, + "grad_norm": 2.5989190208674384, + "learning_rate": 1.2462756805562378e-06, + "loss": 0.1183, + "step": 765 + }, + { + "epoch": 0.17428896473265074, + "grad_norm": 1.0290974538586322, + "learning_rate": 1.2462659343743832e-06, + "loss": 0.048, + "step": 766 + }, + { + "epoch": 0.17451649601820252, + "grad_norm": 1.2617664816926566, + "learning_rate": 1.2462561754949908e-06, + "loss": 0.0498, + "step": 767 + }, + { + "epoch": 0.17474402730375427, + "grad_norm": 1.6415043644592435, + "learning_rate": 1.2462464039182598e-06, + "loss": 0.062, + "step": 768 + }, + { + "epoch": 0.17497155858930602, + "grad_norm": 1.6673873794852758, + "learning_rate": 1.2462366196443903e-06, + "loss": 0.0656, + "step": 769 + }, + { + "epoch": 0.1751990898748578, + "grad_norm": 3.3205898186422718, + "learning_rate": 1.246226822673582e-06, + "loss": 0.1071, + "step": 770 + }, + { + "epoch": 0.17542662116040955, + "grad_norm": 1.280610915532866, + "learning_rate": 1.2462170130060351e-06, + "loss": 0.0318, + "step": 771 + }, + { + "epoch": 0.17565415244596133, + "grad_norm": 1.9843858659809077, + "learning_rate": 1.24620719064195e-06, + "loss": 0.0829, + "step": 772 + }, + { + "epoch": 0.17588168373151308, + "grad_norm": 2.184973202731946, + "learning_rate": 1.246197355581528e-06, + "loss": 0.0998, + "step": 773 + }, + { + "epoch": 0.17610921501706484, + "grad_norm": 1.361748313569626, + "learning_rate": 1.2461875078249694e-06, + "loss": 0.0459, + "step": 774 + }, + { + "epoch": 0.17633674630261661, + "grad_norm": 1.5219919911848705, + "learning_rate": 1.246177647372476e-06, + "loss": 0.0539, + "step": 775 + }, + { + "epoch": 0.17656427758816837, + "grad_norm": 49.0672993271897, + "learning_rate": 1.246167774224249e-06, + "loss": 0.5261, + "step": 776 + }, + { + "epoch": 0.17679180887372015, + "grad_norm": 1.5576377429332795, + "learning_rate": 1.2461578883804903e-06, + "loss": 0.036, + "step": 777 + }, + { + "epoch": 0.1770193401592719, + "grad_norm": 0.869488380021361, + "learning_rate": 1.246147989841402e-06, + "loss": 0.0252, + "step": 778 + }, + { + "epoch": 0.17724687144482365, + "grad_norm": 1.2894143452262044, + "learning_rate": 1.2461380786071863e-06, + "loss": 0.0401, + "step": 779 + }, + { + "epoch": 0.17747440273037543, + "grad_norm": 1.4297168158807256, + "learning_rate": 1.246128154678046e-06, + "loss": 0.0586, + "step": 780 + }, + { + "epoch": 0.17770193401592718, + "grad_norm": 4.251777831616221, + "learning_rate": 1.2461182180541835e-06, + "loss": 0.0484, + "step": 781 + }, + { + "epoch": 0.17792946530147896, + "grad_norm": 1.1899715722609847, + "learning_rate": 1.2461082687358022e-06, + "loss": 0.0391, + "step": 782 + }, + { + "epoch": 0.1781569965870307, + "grad_norm": 1.0511863654570817, + "learning_rate": 1.2460983067231055e-06, + "loss": 0.0424, + "step": 783 + }, + { + "epoch": 0.1783845278725825, + "grad_norm": 1.6108410955721733, + "learning_rate": 1.246088332016297e-06, + "loss": 0.0453, + "step": 784 + }, + { + "epoch": 0.17861205915813425, + "grad_norm": 1.014732446092101, + "learning_rate": 1.2460783446155802e-06, + "loss": 0.0398, + "step": 785 + }, + { + "epoch": 0.178839590443686, + "grad_norm": 3.365599666409623, + "learning_rate": 1.2460683445211596e-06, + "loss": 0.1579, + "step": 786 + }, + { + "epoch": 0.17906712172923778, + "grad_norm": 2.0790980638653984, + "learning_rate": 1.2460583317332395e-06, + "loss": 0.1013, + "step": 787 + }, + { + "epoch": 0.17929465301478953, + "grad_norm": 1.0292372040908986, + "learning_rate": 1.2460483062520246e-06, + "loss": 0.0399, + "step": 788 + }, + { + "epoch": 0.1795221843003413, + "grad_norm": 1.056684060389556, + "learning_rate": 1.2460382680777196e-06, + "loss": 0.0342, + "step": 789 + }, + { + "epoch": 0.17974971558589306, + "grad_norm": 1.3734567420277208, + "learning_rate": 1.2460282172105298e-06, + "loss": 0.0673, + "step": 790 + }, + { + "epoch": 0.1799772468714448, + "grad_norm": 1.519969265957327, + "learning_rate": 1.2460181536506608e-06, + "loss": 0.0775, + "step": 791 + }, + { + "epoch": 0.1802047781569966, + "grad_norm": 1.514321767455213, + "learning_rate": 1.2460080773983177e-06, + "loss": 0.0659, + "step": 792 + }, + { + "epoch": 0.18043230944254834, + "grad_norm": 1.0361490916169322, + "learning_rate": 1.2459979884537072e-06, + "loss": 0.0424, + "step": 793 + }, + { + "epoch": 0.18065984072810012, + "grad_norm": 1.3102760225972407, + "learning_rate": 1.2459878868170348e-06, + "loss": 0.0476, + "step": 794 + }, + { + "epoch": 0.18088737201365188, + "grad_norm": 1.4645693998145335, + "learning_rate": 1.2459777724885075e-06, + "loss": 0.0522, + "step": 795 + }, + { + "epoch": 0.18111490329920363, + "grad_norm": 1.6043429433204082, + "learning_rate": 1.2459676454683318e-06, + "loss": 0.0415, + "step": 796 + }, + { + "epoch": 0.1813424345847554, + "grad_norm": 2.3566952879735963, + "learning_rate": 1.2459575057567144e-06, + "loss": 0.0916, + "step": 797 + }, + { + "epoch": 0.18156996587030716, + "grad_norm": 1.5769029444288707, + "learning_rate": 1.245947353353863e-06, + "loss": 0.0596, + "step": 798 + }, + { + "epoch": 0.18179749715585894, + "grad_norm": 2.293268133767521, + "learning_rate": 1.245937188259985e-06, + "loss": 0.1087, + "step": 799 + }, + { + "epoch": 0.1820250284414107, + "grad_norm": 2.0227046414956096, + "learning_rate": 1.245927010475288e-06, + "loss": 0.0986, + "step": 800 + }, + { + "epoch": 0.18225255972696247, + "grad_norm": 1.4895840436039498, + "learning_rate": 1.24591681999998e-06, + "loss": 0.0633, + "step": 801 + }, + { + "epoch": 0.18248009101251422, + "grad_norm": 1.9671715135309942, + "learning_rate": 1.2459066168342693e-06, + "loss": 0.0551, + "step": 802 + }, + { + "epoch": 0.18270762229806597, + "grad_norm": 2.1525267147108322, + "learning_rate": 1.2458964009783646e-06, + "loss": 0.0809, + "step": 803 + }, + { + "epoch": 0.18293515358361775, + "grad_norm": 1.2864874343026942, + "learning_rate": 1.2458861724324745e-06, + "loss": 0.064, + "step": 804 + }, + { + "epoch": 0.1831626848691695, + "grad_norm": 3.1883103987924444, + "learning_rate": 1.2458759311968084e-06, + "loss": 0.1303, + "step": 805 + }, + { + "epoch": 0.1833902161547213, + "grad_norm": 1.0904025457159896, + "learning_rate": 1.245865677271575e-06, + "loss": 0.0419, + "step": 806 + }, + { + "epoch": 0.18361774744027304, + "grad_norm": 1.4716173202867617, + "learning_rate": 1.2458554106569844e-06, + "loss": 0.072, + "step": 807 + }, + { + "epoch": 0.1838452787258248, + "grad_norm": 0.9197168745315846, + "learning_rate": 1.2458451313532463e-06, + "loss": 0.04, + "step": 808 + }, + { + "epoch": 0.18407281001137657, + "grad_norm": 1.2386933488526581, + "learning_rate": 1.2458348393605708e-06, + "loss": 0.0383, + "step": 809 + }, + { + "epoch": 0.18430034129692832, + "grad_norm": 1.2129381734644349, + "learning_rate": 1.2458245346791678e-06, + "loss": 0.0486, + "step": 810 + }, + { + "epoch": 0.1845278725824801, + "grad_norm": 1.0772770126401692, + "learning_rate": 1.2458142173092486e-06, + "loss": 0.0654, + "step": 811 + }, + { + "epoch": 0.18475540386803185, + "grad_norm": 5.609205856069292, + "learning_rate": 1.2458038872510237e-06, + "loss": 0.0565, + "step": 812 + }, + { + "epoch": 0.1849829351535836, + "grad_norm": 1.1271779547323408, + "learning_rate": 1.2457935445047042e-06, + "loss": 0.0414, + "step": 813 + }, + { + "epoch": 0.18521046643913538, + "grad_norm": 1.1585514417402933, + "learning_rate": 1.2457831890705018e-06, + "loss": 0.0646, + "step": 814 + }, + { + "epoch": 0.18543799772468714, + "grad_norm": 1.6262463401225817, + "learning_rate": 1.2457728209486279e-06, + "loss": 0.0555, + "step": 815 + }, + { + "epoch": 0.18566552901023892, + "grad_norm": 1.6971282783367077, + "learning_rate": 1.2457624401392943e-06, + "loss": 0.0803, + "step": 816 + }, + { + "epoch": 0.18589306029579067, + "grad_norm": 1.5890865721783058, + "learning_rate": 1.2457520466427135e-06, + "loss": 0.0637, + "step": 817 + }, + { + "epoch": 0.18612059158134245, + "grad_norm": 1.663587359707044, + "learning_rate": 1.2457416404590974e-06, + "loss": 0.063, + "step": 818 + }, + { + "epoch": 0.1863481228668942, + "grad_norm": 1.4589989522369327, + "learning_rate": 1.2457312215886592e-06, + "loss": 0.0665, + "step": 819 + }, + { + "epoch": 0.18657565415244595, + "grad_norm": 1.2689131726245948, + "learning_rate": 1.2457207900316115e-06, + "loss": 0.0653, + "step": 820 + }, + { + "epoch": 0.18680318543799773, + "grad_norm": 1.2763445646366045, + "learning_rate": 1.245710345788168e-06, + "loss": 0.0691, + "step": 821 + }, + { + "epoch": 0.18703071672354948, + "grad_norm": 2.1044109818141523, + "learning_rate": 1.2456998888585414e-06, + "loss": 0.0334, + "step": 822 + }, + { + "epoch": 0.18725824800910126, + "grad_norm": 1.1582960528680695, + "learning_rate": 1.245689419242946e-06, + "loss": 0.0513, + "step": 823 + }, + { + "epoch": 0.18748577929465302, + "grad_norm": 1.4301422945996285, + "learning_rate": 1.2456789369415955e-06, + "loss": 0.0632, + "step": 824 + }, + { + "epoch": 0.18771331058020477, + "grad_norm": 1.5961327622673143, + "learning_rate": 1.2456684419547044e-06, + "loss": 0.0915, + "step": 825 + }, + { + "epoch": 0.18794084186575655, + "grad_norm": 1.156039499623056, + "learning_rate": 1.245657934282487e-06, + "loss": 0.0406, + "step": 826 + }, + { + "epoch": 0.1881683731513083, + "grad_norm": 0.7964273734238357, + "learning_rate": 1.245647413925158e-06, + "loss": 0.0339, + "step": 827 + }, + { + "epoch": 0.18839590443686008, + "grad_norm": 1.3110827687978466, + "learning_rate": 1.2456368808829327e-06, + "loss": 0.0315, + "step": 828 + }, + { + "epoch": 0.18862343572241183, + "grad_norm": 2.167581797519473, + "learning_rate": 1.2456263351560261e-06, + "loss": 0.0944, + "step": 829 + }, + { + "epoch": 0.18885096700796358, + "grad_norm": 1.7008701552637757, + "learning_rate": 1.2456157767446538e-06, + "loss": 0.0675, + "step": 830 + }, + { + "epoch": 0.18907849829351536, + "grad_norm": 0.9862167570499358, + "learning_rate": 1.245605205649032e-06, + "loss": 0.0283, + "step": 831 + }, + { + "epoch": 0.18930602957906711, + "grad_norm": 1.4504322451994889, + "learning_rate": 1.245594621869376e-06, + "loss": 0.0855, + "step": 832 + }, + { + "epoch": 0.1895335608646189, + "grad_norm": 0.8637440563578112, + "learning_rate": 1.2455840254059026e-06, + "loss": 0.0457, + "step": 833 + }, + { + "epoch": 0.18976109215017065, + "grad_norm": 1.365450351316701, + "learning_rate": 1.2455734162588282e-06, + "loss": 0.0523, + "step": 834 + }, + { + "epoch": 0.1899886234357224, + "grad_norm": 1.9203688742265796, + "learning_rate": 1.2455627944283697e-06, + "loss": 0.0421, + "step": 835 + }, + { + "epoch": 0.19021615472127418, + "grad_norm": 1.947700834519252, + "learning_rate": 1.245552159914744e-06, + "loss": 0.0631, + "step": 836 + }, + { + "epoch": 0.19044368600682593, + "grad_norm": 1.274745482531776, + "learning_rate": 1.245541512718169e-06, + "loss": 0.0408, + "step": 837 + }, + { + "epoch": 0.1906712172923777, + "grad_norm": 1.2840926844609297, + "learning_rate": 1.245530852838862e-06, + "loss": 0.0435, + "step": 838 + }, + { + "epoch": 0.19089874857792946, + "grad_norm": 1.5640003481699476, + "learning_rate": 1.2455201802770405e-06, + "loss": 0.0616, + "step": 839 + }, + { + "epoch": 0.19112627986348124, + "grad_norm": 1.2705630922274525, + "learning_rate": 1.245509495032923e-06, + "loss": 0.0604, + "step": 840 + }, + { + "epoch": 0.191353811149033, + "grad_norm": 1.2721765422307298, + "learning_rate": 1.2454987971067278e-06, + "loss": 0.0431, + "step": 841 + }, + { + "epoch": 0.19158134243458474, + "grad_norm": 1.7662664353990207, + "learning_rate": 1.2454880864986737e-06, + "loss": 0.0881, + "step": 842 + }, + { + "epoch": 0.19180887372013652, + "grad_norm": 0.8548688062918027, + "learning_rate": 1.2454773632089795e-06, + "loss": 0.0219, + "step": 843 + }, + { + "epoch": 0.19203640500568828, + "grad_norm": 1.1833145714689994, + "learning_rate": 1.2454666272378644e-06, + "loss": 0.0502, + "step": 844 + }, + { + "epoch": 0.19226393629124006, + "grad_norm": 2.15061289434072, + "learning_rate": 1.2454558785855475e-06, + "loss": 0.0687, + "step": 845 + }, + { + "epoch": 0.1924914675767918, + "grad_norm": 1.0201954402445583, + "learning_rate": 1.245445117252249e-06, + "loss": 0.0377, + "step": 846 + }, + { + "epoch": 0.19271899886234356, + "grad_norm": 2.256502679796923, + "learning_rate": 1.2454343432381886e-06, + "loss": 0.1344, + "step": 847 + }, + { + "epoch": 0.19294653014789534, + "grad_norm": 0.9607197396814332, + "learning_rate": 1.2454235565435862e-06, + "loss": 0.0293, + "step": 848 + }, + { + "epoch": 0.1931740614334471, + "grad_norm": 1.1041321579517591, + "learning_rate": 1.2454127571686629e-06, + "loss": 0.0588, + "step": 849 + }, + { + "epoch": 0.19340159271899887, + "grad_norm": 1.551888854493328, + "learning_rate": 1.245401945113639e-06, + "loss": 0.0884, + "step": 850 + }, + { + "epoch": 0.19362912400455062, + "grad_norm": 1.2207595212151812, + "learning_rate": 1.2453911203787355e-06, + "loss": 0.0495, + "step": 851 + }, + { + "epoch": 0.19385665529010238, + "grad_norm": 3.151128795398426, + "learning_rate": 1.2453802829641736e-06, + "loss": 0.0959, + "step": 852 + }, + { + "epoch": 0.19408418657565416, + "grad_norm": 1.0306148148350838, + "learning_rate": 1.2453694328701752e-06, + "loss": 0.0362, + "step": 853 + }, + { + "epoch": 0.1943117178612059, + "grad_norm": 1.4686426621438413, + "learning_rate": 1.2453585700969614e-06, + "loss": 0.0519, + "step": 854 + }, + { + "epoch": 0.1945392491467577, + "grad_norm": 4.418162916580233, + "learning_rate": 1.2453476946447547e-06, + "loss": 0.0718, + "step": 855 + }, + { + "epoch": 0.19476678043230944, + "grad_norm": 1.2510611267805927, + "learning_rate": 1.2453368065137772e-06, + "loss": 0.0471, + "step": 856 + }, + { + "epoch": 0.19499431171786122, + "grad_norm": 3.8279701209646566, + "learning_rate": 1.2453259057042514e-06, + "loss": 0.0962, + "step": 857 + }, + { + "epoch": 0.19522184300341297, + "grad_norm": 0.9857769216686043, + "learning_rate": 1.2453149922164003e-06, + "loss": 0.0361, + "step": 858 + }, + { + "epoch": 0.19544937428896472, + "grad_norm": 1.2075815978254054, + "learning_rate": 1.2453040660504468e-06, + "loss": 0.0361, + "step": 859 + }, + { + "epoch": 0.1956769055745165, + "grad_norm": 1.5382334773010704, + "learning_rate": 1.2452931272066141e-06, + "loss": 0.0489, + "step": 860 + }, + { + "epoch": 0.19590443686006825, + "grad_norm": 1.2911029078532041, + "learning_rate": 1.245282175685126e-06, + "loss": 0.0743, + "step": 861 + }, + { + "epoch": 0.19613196814562003, + "grad_norm": 2.432722962095318, + "learning_rate": 1.2452712114862063e-06, + "loss": 0.106, + "step": 862 + }, + { + "epoch": 0.19635949943117179, + "grad_norm": 1.3235947439225328, + "learning_rate": 1.245260234610079e-06, + "loss": 0.045, + "step": 863 + }, + { + "epoch": 0.19658703071672354, + "grad_norm": 1.1324676509010458, + "learning_rate": 1.2452492450569682e-06, + "loss": 0.0554, + "step": 864 + }, + { + "epoch": 0.19681456200227532, + "grad_norm": 2.2307637995876863, + "learning_rate": 1.245238242827099e-06, + "loss": 0.0832, + "step": 865 + }, + { + "epoch": 0.19704209328782707, + "grad_norm": 1.4079128192407386, + "learning_rate": 1.245227227920696e-06, + "loss": 0.079, + "step": 866 + }, + { + "epoch": 0.19726962457337885, + "grad_norm": 1.9175023199810017, + "learning_rate": 1.2452162003379842e-06, + "loss": 0.0917, + "step": 867 + }, + { + "epoch": 0.1974971558589306, + "grad_norm": 1.731489599286668, + "learning_rate": 1.2452051600791891e-06, + "loss": 0.0571, + "step": 868 + }, + { + "epoch": 0.19772468714448235, + "grad_norm": 1.3722463786220256, + "learning_rate": 1.2451941071445367e-06, + "loss": 0.039, + "step": 869 + }, + { + "epoch": 0.19795221843003413, + "grad_norm": 1.287569176842529, + "learning_rate": 1.2451830415342524e-06, + "loss": 0.0427, + "step": 870 + }, + { + "epoch": 0.19817974971558588, + "grad_norm": 2.2505234649573795, + "learning_rate": 1.2451719632485627e-06, + "loss": 0.0606, + "step": 871 + }, + { + "epoch": 0.19840728100113766, + "grad_norm": 1.3085408197305148, + "learning_rate": 1.2451608722876938e-06, + "loss": 0.0659, + "step": 872 + }, + { + "epoch": 0.19863481228668942, + "grad_norm": 2.3075742049076085, + "learning_rate": 1.2451497686518722e-06, + "loss": 0.0762, + "step": 873 + }, + { + "epoch": 0.1988623435722412, + "grad_norm": 1.1650796320462304, + "learning_rate": 1.2451386523413252e-06, + "loss": 0.0559, + "step": 874 + }, + { + "epoch": 0.19908987485779295, + "grad_norm": 1.199672804247241, + "learning_rate": 1.24512752335628e-06, + "loss": 0.0417, + "step": 875 + }, + { + "epoch": 0.1993174061433447, + "grad_norm": 1.4954309542074338, + "learning_rate": 1.2451163816969639e-06, + "loss": 0.0841, + "step": 876 + }, + { + "epoch": 0.19954493742889648, + "grad_norm": 1.4893257668939828, + "learning_rate": 1.2451052273636045e-06, + "loss": 0.0639, + "step": 877 + }, + { + "epoch": 0.19977246871444823, + "grad_norm": 1.1681659159861986, + "learning_rate": 1.24509406035643e-06, + "loss": 0.0487, + "step": 878 + }, + { + "epoch": 0.2, + "grad_norm": 1.337002748039853, + "learning_rate": 1.2450828806756685e-06, + "loss": 0.0408, + "step": 879 + }, + { + "epoch": 0.20022753128555176, + "grad_norm": 1.238199679938109, + "learning_rate": 1.245071688321549e-06, + "loss": 0.0452, + "step": 880 + }, + { + "epoch": 0.20045506257110352, + "grad_norm": 1.4016981334625263, + "learning_rate": 1.2450604832942991e-06, + "loss": 0.0462, + "step": 881 + }, + { + "epoch": 0.2006825938566553, + "grad_norm": 1.4932886959066234, + "learning_rate": 1.245049265594149e-06, + "loss": 0.0735, + "step": 882 + }, + { + "epoch": 0.20091012514220705, + "grad_norm": 1.2357598702477623, + "learning_rate": 1.2450380352213271e-06, + "loss": 0.0504, + "step": 883 + }, + { + "epoch": 0.20113765642775883, + "grad_norm": 2.015377601632808, + "learning_rate": 1.2450267921760636e-06, + "loss": 0.0523, + "step": 884 + }, + { + "epoch": 0.20136518771331058, + "grad_norm": 1.8296148009850803, + "learning_rate": 1.2450155364585878e-06, + "loss": 0.0554, + "step": 885 + }, + { + "epoch": 0.20159271899886233, + "grad_norm": 1.9872439975590221, + "learning_rate": 1.2450042680691301e-06, + "loss": 0.0737, + "step": 886 + }, + { + "epoch": 0.2018202502844141, + "grad_norm": 0.832951332510299, + "learning_rate": 1.2449929870079206e-06, + "loss": 0.0457, + "step": 887 + }, + { + "epoch": 0.20204778156996586, + "grad_norm": 0.7896517265085776, + "learning_rate": 1.24498169327519e-06, + "loss": 0.0261, + "step": 888 + }, + { + "epoch": 0.20227531285551764, + "grad_norm": 1.3980548207185421, + "learning_rate": 1.2449703868711688e-06, + "loss": 0.061, + "step": 889 + }, + { + "epoch": 0.2025028441410694, + "grad_norm": 1.3109982570334282, + "learning_rate": 1.2449590677960886e-06, + "loss": 0.0525, + "step": 890 + }, + { + "epoch": 0.20273037542662117, + "grad_norm": 0.8611577720266457, + "learning_rate": 1.2449477360501802e-06, + "loss": 0.0297, + "step": 891 + }, + { + "epoch": 0.20295790671217293, + "grad_norm": 2.2537154917621267, + "learning_rate": 1.2449363916336756e-06, + "loss": 0.0658, + "step": 892 + }, + { + "epoch": 0.20318543799772468, + "grad_norm": 2.7860866459618823, + "learning_rate": 1.2449250345468065e-06, + "loss": 0.0853, + "step": 893 + }, + { + "epoch": 0.20341296928327646, + "grad_norm": 2.063772825536578, + "learning_rate": 1.244913664789805e-06, + "loss": 0.0413, + "step": 894 + }, + { + "epoch": 0.2036405005688282, + "grad_norm": 1.3514226592309118, + "learning_rate": 1.2449022823629036e-06, + "loss": 0.0445, + "step": 895 + }, + { + "epoch": 0.20386803185438, + "grad_norm": 1.2525160508368907, + "learning_rate": 1.2448908872663347e-06, + "loss": 0.0337, + "step": 896 + }, + { + "epoch": 0.20409556313993174, + "grad_norm": 0.9460587754259561, + "learning_rate": 1.2448794795003313e-06, + "loss": 0.0391, + "step": 897 + }, + { + "epoch": 0.2043230944254835, + "grad_norm": 1.146769061895453, + "learning_rate": 1.2448680590651269e-06, + "loss": 0.0618, + "step": 898 + }, + { + "epoch": 0.20455062571103527, + "grad_norm": 1.277169361575558, + "learning_rate": 1.2448566259609543e-06, + "loss": 0.0479, + "step": 899 + }, + { + "epoch": 0.20477815699658702, + "grad_norm": 1.2913475907417995, + "learning_rate": 1.2448451801880476e-06, + "loss": 0.0352, + "step": 900 + }, + { + "epoch": 0.2050056882821388, + "grad_norm": 1.614343484279519, + "learning_rate": 1.2448337217466404e-06, + "loss": 0.0672, + "step": 901 + }, + { + "epoch": 0.20523321956769056, + "grad_norm": 1.1122189034817742, + "learning_rate": 1.2448222506369675e-06, + "loss": 0.0385, + "step": 902 + }, + { + "epoch": 0.2054607508532423, + "grad_norm": 1.5480813920575183, + "learning_rate": 1.2448107668592626e-06, + "loss": 0.0696, + "step": 903 + }, + { + "epoch": 0.2056882821387941, + "grad_norm": 2.133323122403055, + "learning_rate": 1.244799270413761e-06, + "loss": 0.0726, + "step": 904 + }, + { + "epoch": 0.20591581342434584, + "grad_norm": 1.9375085657983182, + "learning_rate": 1.2447877613006972e-06, + "loss": 0.0762, + "step": 905 + }, + { + "epoch": 0.20614334470989762, + "grad_norm": 2.138121754084785, + "learning_rate": 1.244776239520307e-06, + "loss": 0.1251, + "step": 906 + }, + { + "epoch": 0.20637087599544937, + "grad_norm": 69.30343641276347, + "learning_rate": 1.244764705072825e-06, + "loss": 1.5633, + "step": 907 + }, + { + "epoch": 0.20659840728100115, + "grad_norm": 0.9136079997375776, + "learning_rate": 1.2447531579584878e-06, + "loss": 0.0359, + "step": 908 + }, + { + "epoch": 0.2068259385665529, + "grad_norm": 2.258737842455169, + "learning_rate": 1.2447415981775312e-06, + "loss": 0.0815, + "step": 909 + }, + { + "epoch": 0.20705346985210465, + "grad_norm": 1.1302894088592583, + "learning_rate": 1.2447300257301912e-06, + "loss": 0.0377, + "step": 910 + }, + { + "epoch": 0.20728100113765643, + "grad_norm": 1.4921976481695052, + "learning_rate": 1.2447184406167045e-06, + "loss": 0.0867, + "step": 911 + }, + { + "epoch": 0.2075085324232082, + "grad_norm": 2.2983270765657546, + "learning_rate": 1.2447068428373077e-06, + "loss": 0.0726, + "step": 912 + }, + { + "epoch": 0.20773606370875997, + "grad_norm": 1.9446016021679782, + "learning_rate": 1.244695232392238e-06, + "loss": 0.064, + "step": 913 + }, + { + "epoch": 0.20796359499431172, + "grad_norm": 0.9872094991366439, + "learning_rate": 1.2446836092817328e-06, + "loss": 0.0432, + "step": 914 + }, + { + "epoch": 0.20819112627986347, + "grad_norm": 6.934093481791269, + "learning_rate": 1.2446719735060293e-06, + "loss": 0.0945, + "step": 915 + }, + { + "epoch": 0.20841865756541525, + "grad_norm": 1.360488111746127, + "learning_rate": 1.2446603250653658e-06, + "loss": 0.0603, + "step": 916 + }, + { + "epoch": 0.208646188850967, + "grad_norm": 1.3310461950551042, + "learning_rate": 1.24464866395998e-06, + "loss": 0.0365, + "step": 917 + }, + { + "epoch": 0.20887372013651878, + "grad_norm": 0.9182393241316875, + "learning_rate": 1.2446369901901102e-06, + "loss": 0.0559, + "step": 918 + }, + { + "epoch": 0.20910125142207053, + "grad_norm": 1.4023619755477905, + "learning_rate": 1.2446253037559952e-06, + "loss": 0.0457, + "step": 919 + }, + { + "epoch": 0.20932878270762229, + "grad_norm": 1.1256168684655514, + "learning_rate": 1.2446136046578739e-06, + "loss": 0.0455, + "step": 920 + }, + { + "epoch": 0.20955631399317406, + "grad_norm": 1.9433578174394295, + "learning_rate": 1.2446018928959853e-06, + "loss": 0.0962, + "step": 921 + }, + { + "epoch": 0.20978384527872582, + "grad_norm": 1.5957841959716605, + "learning_rate": 1.2445901684705685e-06, + "loss": 0.0408, + "step": 922 + }, + { + "epoch": 0.2100113765642776, + "grad_norm": 1.075018126077203, + "learning_rate": 1.2445784313818638e-06, + "loss": 0.0378, + "step": 923 + }, + { + "epoch": 0.21023890784982935, + "grad_norm": 1.5370330862679797, + "learning_rate": 1.2445666816301102e-06, + "loss": 0.0578, + "step": 924 + }, + { + "epoch": 0.21046643913538113, + "grad_norm": 1.4943129389397118, + "learning_rate": 1.2445549192155487e-06, + "loss": 0.0875, + "step": 925 + }, + { + "epoch": 0.21069397042093288, + "grad_norm": 1.1898394401405206, + "learning_rate": 1.244543144138419e-06, + "loss": 0.0486, + "step": 926 + }, + { + "epoch": 0.21092150170648463, + "grad_norm": 1.6954763665720234, + "learning_rate": 1.2445313563989624e-06, + "loss": 0.0641, + "step": 927 + }, + { + "epoch": 0.2111490329920364, + "grad_norm": 1.0426224473753456, + "learning_rate": 1.2445195559974194e-06, + "loss": 0.0404, + "step": 928 + }, + { + "epoch": 0.21137656427758816, + "grad_norm": 1.6481528761515403, + "learning_rate": 1.244507742934031e-06, + "loss": 0.0839, + "step": 929 + }, + { + "epoch": 0.21160409556313994, + "grad_norm": 0.9619424275618367, + "learning_rate": 1.2444959172090393e-06, + "loss": 0.0273, + "step": 930 + }, + { + "epoch": 0.2118316268486917, + "grad_norm": 3.69544520271876, + "learning_rate": 1.2444840788226854e-06, + "loss": 0.1652, + "step": 931 + }, + { + "epoch": 0.21205915813424345, + "grad_norm": 1.804306156177365, + "learning_rate": 1.2444722277752114e-06, + "loss": 0.1027, + "step": 932 + }, + { + "epoch": 0.21228668941979523, + "grad_norm": 1.768050037917839, + "learning_rate": 1.2444603640668596e-06, + "loss": 0.095, + "step": 933 + }, + { + "epoch": 0.21251422070534698, + "grad_norm": 1.1285321294388682, + "learning_rate": 1.2444484876978725e-06, + "loss": 0.0465, + "step": 934 + }, + { + "epoch": 0.21274175199089876, + "grad_norm": 1.7249462106069577, + "learning_rate": 1.2444365986684929e-06, + "loss": 0.0842, + "step": 935 + }, + { + "epoch": 0.2129692832764505, + "grad_norm": 1.2306924671997328, + "learning_rate": 1.2444246969789633e-06, + "loss": 0.0447, + "step": 936 + }, + { + "epoch": 0.21319681456200226, + "grad_norm": 1.1194256262692541, + "learning_rate": 1.2444127826295277e-06, + "loss": 0.0387, + "step": 937 + }, + { + "epoch": 0.21342434584755404, + "grad_norm": 0.9357846968568098, + "learning_rate": 1.244400855620429e-06, + "loss": 0.0311, + "step": 938 + }, + { + "epoch": 0.2136518771331058, + "grad_norm": 0.9399277842933605, + "learning_rate": 1.2443889159519113e-06, + "loss": 0.0408, + "step": 939 + }, + { + "epoch": 0.21387940841865757, + "grad_norm": 1.038736805523899, + "learning_rate": 1.2443769636242185e-06, + "loss": 0.0573, + "step": 940 + }, + { + "epoch": 0.21410693970420933, + "grad_norm": 1.179286632772412, + "learning_rate": 1.244364998637595e-06, + "loss": 0.0578, + "step": 941 + }, + { + "epoch": 0.2143344709897611, + "grad_norm": 3.1404649465639594, + "learning_rate": 1.2443530209922848e-06, + "loss": 0.0549, + "step": 942 + }, + { + "epoch": 0.21456200227531286, + "grad_norm": 1.4304654332903906, + "learning_rate": 1.2443410306885337e-06, + "loss": 0.0408, + "step": 943 + }, + { + "epoch": 0.2147895335608646, + "grad_norm": 1.3926671923537388, + "learning_rate": 1.244329027726586e-06, + "loss": 0.0675, + "step": 944 + }, + { + "epoch": 0.2150170648464164, + "grad_norm": 0.8473568816417418, + "learning_rate": 1.2443170121066872e-06, + "loss": 0.0388, + "step": 945 + }, + { + "epoch": 0.21524459613196814, + "grad_norm": 1.7456532933762987, + "learning_rate": 1.2443049838290827e-06, + "loss": 0.0655, + "step": 946 + }, + { + "epoch": 0.21547212741751992, + "grad_norm": 5.065824955296096, + "learning_rate": 1.2442929428940186e-06, + "loss": 0.196, + "step": 947 + }, + { + "epoch": 0.21569965870307167, + "grad_norm": 0.9216519176270381, + "learning_rate": 1.2442808893017414e-06, + "loss": 0.0376, + "step": 948 + }, + { + "epoch": 0.21592718998862342, + "grad_norm": 1.111383779669666, + "learning_rate": 1.2442688230524965e-06, + "loss": 0.0403, + "step": 949 + }, + { + "epoch": 0.2161547212741752, + "grad_norm": 1.557530476173142, + "learning_rate": 1.244256744146531e-06, + "loss": 0.0365, + "step": 950 + }, + { + "epoch": 0.21638225255972696, + "grad_norm": 1.1787378400008193, + "learning_rate": 1.244244652584092e-06, + "loss": 0.0507, + "step": 951 + }, + { + "epoch": 0.21660978384527874, + "grad_norm": 0.9890913425273592, + "learning_rate": 1.2442325483654263e-06, + "loss": 0.0533, + "step": 952 + }, + { + "epoch": 0.2168373151308305, + "grad_norm": 2.1206498633906294, + "learning_rate": 1.2442204314907812e-06, + "loss": 0.0794, + "step": 953 + }, + { + "epoch": 0.21706484641638224, + "grad_norm": 1.3153818338446894, + "learning_rate": 1.2442083019604047e-06, + "loss": 0.0706, + "step": 954 + }, + { + "epoch": 0.21729237770193402, + "grad_norm": 1.2497584639698338, + "learning_rate": 1.2441961597745447e-06, + "loss": 0.0474, + "step": 955 + }, + { + "epoch": 0.21751990898748577, + "grad_norm": 1.0089839713441826, + "learning_rate": 1.244184004933449e-06, + "loss": 0.0354, + "step": 956 + }, + { + "epoch": 0.21774744027303755, + "grad_norm": 1.967078988299315, + "learning_rate": 1.2441718374373662e-06, + "loss": 0.0371, + "step": 957 + }, + { + "epoch": 0.2179749715585893, + "grad_norm": 1.1714442804547354, + "learning_rate": 1.244159657286545e-06, + "loss": 0.0452, + "step": 958 + }, + { + "epoch": 0.21820250284414108, + "grad_norm": 0.940255872741034, + "learning_rate": 1.2441474644812345e-06, + "loss": 0.0363, + "step": 959 + }, + { + "epoch": 0.21843003412969283, + "grad_norm": 1.100910872233546, + "learning_rate": 1.2441352590216836e-06, + "loss": 0.0357, + "step": 960 + }, + { + "epoch": 0.2186575654152446, + "grad_norm": 1.9969980893987056, + "learning_rate": 1.244123040908142e-06, + "loss": 0.1572, + "step": 961 + }, + { + "epoch": 0.21888509670079637, + "grad_norm": 1.5961804013592509, + "learning_rate": 1.2441108101408592e-06, + "loss": 0.0425, + "step": 962 + }, + { + "epoch": 0.21911262798634812, + "grad_norm": 2.1162445389078024, + "learning_rate": 1.2440985667200853e-06, + "loss": 0.0517, + "step": 963 + }, + { + "epoch": 0.2193401592718999, + "grad_norm": 1.064967765846801, + "learning_rate": 1.2440863106460705e-06, + "loss": 0.023, + "step": 964 + }, + { + "epoch": 0.21956769055745165, + "grad_norm": 2.08098035050502, + "learning_rate": 1.2440740419190655e-06, + "loss": 0.0796, + "step": 965 + }, + { + "epoch": 0.2197952218430034, + "grad_norm": 2.4340967444170136, + "learning_rate": 1.2440617605393208e-06, + "loss": 0.0673, + "step": 966 + }, + { + "epoch": 0.22002275312855518, + "grad_norm": 2.139025801414634, + "learning_rate": 1.2440494665070874e-06, + "loss": 0.102, + "step": 967 + }, + { + "epoch": 0.22025028441410693, + "grad_norm": 1.3877680351054698, + "learning_rate": 1.2440371598226165e-06, + "loss": 0.0588, + "step": 968 + }, + { + "epoch": 0.2204778156996587, + "grad_norm": 1.3210352246509975, + "learning_rate": 1.2440248404861598e-06, + "loss": 0.0528, + "step": 969 + }, + { + "epoch": 0.22070534698521047, + "grad_norm": 1.4713104262343806, + "learning_rate": 1.2440125084979693e-06, + "loss": 0.0468, + "step": 970 + }, + { + "epoch": 0.22093287827076222, + "grad_norm": 1.4854620779239043, + "learning_rate": 1.2440001638582965e-06, + "loss": 0.0494, + "step": 971 + }, + { + "epoch": 0.221160409556314, + "grad_norm": 1.1733972765082468, + "learning_rate": 1.2439878065673944e-06, + "loss": 0.0517, + "step": 972 + }, + { + "epoch": 0.22138794084186575, + "grad_norm": 2.818817007281193, + "learning_rate": 1.2439754366255149e-06, + "loss": 0.0332, + "step": 973 + }, + { + "epoch": 0.22161547212741753, + "grad_norm": 1.3519809642179033, + "learning_rate": 1.2439630540329111e-06, + "loss": 0.0714, + "step": 974 + }, + { + "epoch": 0.22184300341296928, + "grad_norm": 1.2366129488699829, + "learning_rate": 1.2439506587898358e-06, + "loss": 0.0307, + "step": 975 + }, + { + "epoch": 0.22207053469852106, + "grad_norm": 1.2264022197392555, + "learning_rate": 1.243938250896543e-06, + "loss": 0.0505, + "step": 976 + }, + { + "epoch": 0.2222980659840728, + "grad_norm": 1.7725437520836973, + "learning_rate": 1.2439258303532858e-06, + "loss": 0.0551, + "step": 977 + }, + { + "epoch": 0.22252559726962456, + "grad_norm": 0.9663835721436027, + "learning_rate": 1.243913397160318e-06, + "loss": 0.0375, + "step": 978 + }, + { + "epoch": 0.22275312855517634, + "grad_norm": 0.8743576163827176, + "learning_rate": 1.2439009513178938e-06, + "loss": 0.0245, + "step": 979 + }, + { + "epoch": 0.2229806598407281, + "grad_norm": 1.054127962193629, + "learning_rate": 1.2438884928262678e-06, + "loss": 0.0323, + "step": 980 + }, + { + "epoch": 0.22320819112627988, + "grad_norm": 0.9874001278398739, + "learning_rate": 1.2438760216856944e-06, + "loss": 0.0435, + "step": 981 + }, + { + "epoch": 0.22343572241183163, + "grad_norm": 1.3544331214991197, + "learning_rate": 1.2438635378964284e-06, + "loss": 0.0707, + "step": 982 + }, + { + "epoch": 0.22366325369738338, + "grad_norm": 1.3272223315121154, + "learning_rate": 1.2438510414587251e-06, + "loss": 0.0553, + "step": 983 + }, + { + "epoch": 0.22389078498293516, + "grad_norm": 1.4246609306991453, + "learning_rate": 1.24383853237284e-06, + "loss": 0.0616, + "step": 984 + }, + { + "epoch": 0.2241183162684869, + "grad_norm": 1.3509367739810239, + "learning_rate": 1.2438260106390285e-06, + "loss": 0.054, + "step": 985 + }, + { + "epoch": 0.2243458475540387, + "grad_norm": 2.743016610014617, + "learning_rate": 1.2438134762575467e-06, + "loss": 0.1654, + "step": 986 + }, + { + "epoch": 0.22457337883959044, + "grad_norm": 1.1104234557718136, + "learning_rate": 1.243800929228651e-06, + "loss": 0.0438, + "step": 987 + }, + { + "epoch": 0.2248009101251422, + "grad_norm": 1.5813628573176925, + "learning_rate": 1.2437883695525974e-06, + "loss": 0.0577, + "step": 988 + }, + { + "epoch": 0.22502844141069397, + "grad_norm": 0.9637284966057095, + "learning_rate": 1.2437757972296427e-06, + "loss": 0.0441, + "step": 989 + }, + { + "epoch": 0.22525597269624573, + "grad_norm": 1.2736418998037702, + "learning_rate": 1.2437632122600442e-06, + "loss": 0.0627, + "step": 990 + }, + { + "epoch": 0.2254835039817975, + "grad_norm": 1.704338839028225, + "learning_rate": 1.2437506146440587e-06, + "loss": 0.0794, + "step": 991 + }, + { + "epoch": 0.22571103526734926, + "grad_norm": 1.3083723593285776, + "learning_rate": 1.243738004381944e-06, + "loss": 0.0367, + "step": 992 + }, + { + "epoch": 0.225938566552901, + "grad_norm": 1.2372256320814667, + "learning_rate": 1.2437253814739572e-06, + "loss": 0.0478, + "step": 993 + }, + { + "epoch": 0.2261660978384528, + "grad_norm": 2.064272199741316, + "learning_rate": 1.2437127459203572e-06, + "loss": 0.0673, + "step": 994 + }, + { + "epoch": 0.22639362912400454, + "grad_norm": 0.9919600393162777, + "learning_rate": 1.2437000977214015e-06, + "loss": 0.031, + "step": 995 + }, + { + "epoch": 0.22662116040955632, + "grad_norm": 1.6080832002027023, + "learning_rate": 1.243687436877349e-06, + "loss": 0.0841, + "step": 996 + }, + { + "epoch": 0.22684869169510807, + "grad_norm": 1.3930180180919087, + "learning_rate": 1.2436747633884583e-06, + "loss": 0.0616, + "step": 997 + }, + { + "epoch": 0.22707622298065985, + "grad_norm": 4.70664655158806, + "learning_rate": 1.2436620772549885e-06, + "loss": 0.1541, + "step": 998 + }, + { + "epoch": 0.2273037542662116, + "grad_norm": 2.3548939922944556, + "learning_rate": 1.243649378477199e-06, + "loss": 0.1161, + "step": 999 + }, + { + "epoch": 0.22753128555176336, + "grad_norm": 1.9186203633212395, + "learning_rate": 1.2436366670553491e-06, + "loss": 0.0725, + "step": 1000 + }, + { + "epoch": 0.22775881683731514, + "grad_norm": 1.726012700048525, + "learning_rate": 1.2436239429896988e-06, + "loss": 0.1038, + "step": 1001 + }, + { + "epoch": 0.2279863481228669, + "grad_norm": 1.7981785549763478, + "learning_rate": 1.2436112062805081e-06, + "loss": 0.0485, + "step": 1002 + }, + { + "epoch": 0.22821387940841867, + "grad_norm": 1.935791028636196, + "learning_rate": 1.2435984569280372e-06, + "loss": 0.0773, + "step": 1003 + }, + { + "epoch": 0.22844141069397042, + "grad_norm": 1.4700273599627696, + "learning_rate": 1.2435856949325467e-06, + "loss": 0.0584, + "step": 1004 + }, + { + "epoch": 0.22866894197952217, + "grad_norm": 4.712742088498197, + "learning_rate": 1.2435729202942972e-06, + "loss": 0.073, + "step": 1005 + }, + { + "epoch": 0.22889647326507395, + "grad_norm": 1.3146906101281273, + "learning_rate": 1.2435601330135506e-06, + "loss": 0.0357, + "step": 1006 + }, + { + "epoch": 0.2291240045506257, + "grad_norm": 2.3674281228532066, + "learning_rate": 1.2435473330905674e-06, + "loss": 0.0701, + "step": 1007 + }, + { + "epoch": 0.22935153583617748, + "grad_norm": 1.2459812264205032, + "learning_rate": 1.2435345205256097e-06, + "loss": 0.0375, + "step": 1008 + }, + { + "epoch": 0.22957906712172924, + "grad_norm": 1.4505448834484096, + "learning_rate": 1.243521695318939e-06, + "loss": 0.057, + "step": 1009 + }, + { + "epoch": 0.229806598407281, + "grad_norm": 1.014039827241364, + "learning_rate": 1.2435088574708178e-06, + "loss": 0.027, + "step": 1010 + }, + { + "epoch": 0.23003412969283277, + "grad_norm": 2.0117138797089766, + "learning_rate": 1.2434960069815083e-06, + "loss": 0.0583, + "step": 1011 + }, + { + "epoch": 0.23026166097838452, + "grad_norm": 2.319983736807929, + "learning_rate": 1.243483143851273e-06, + "loss": 0.0687, + "step": 1012 + }, + { + "epoch": 0.2304891922639363, + "grad_norm": 1.5313544208925962, + "learning_rate": 1.2434702680803751e-06, + "loss": 0.0531, + "step": 1013 + }, + { + "epoch": 0.23071672354948805, + "grad_norm": 2.4005097874561616, + "learning_rate": 1.2434573796690774e-06, + "loss": 0.0929, + "step": 1014 + }, + { + "epoch": 0.23094425483503983, + "grad_norm": 1.4786754339201256, + "learning_rate": 1.2434444786176435e-06, + "loss": 0.072, + "step": 1015 + }, + { + "epoch": 0.23117178612059158, + "grad_norm": 1.0190695534603456, + "learning_rate": 1.2434315649263372e-06, + "loss": 0.0336, + "step": 1016 + }, + { + "epoch": 0.23139931740614333, + "grad_norm": 0.85632474720689, + "learning_rate": 1.2434186385954225e-06, + "loss": 0.035, + "step": 1017 + }, + { + "epoch": 0.23162684869169511, + "grad_norm": 1.2607686804487748, + "learning_rate": 1.243405699625163e-06, + "loss": 0.0442, + "step": 1018 + }, + { + "epoch": 0.23185437997724687, + "grad_norm": 1.2717127046145194, + "learning_rate": 1.243392748015824e-06, + "loss": 0.0377, + "step": 1019 + }, + { + "epoch": 0.23208191126279865, + "grad_norm": 0.9597540032341592, + "learning_rate": 1.2433797837676694e-06, + "loss": 0.0434, + "step": 1020 + }, + { + "epoch": 0.2323094425483504, + "grad_norm": 0.9627749018367474, + "learning_rate": 1.2433668068809648e-06, + "loss": 0.0303, + "step": 1021 + }, + { + "epoch": 0.23253697383390215, + "grad_norm": 1.2513646734831103, + "learning_rate": 1.243353817355975e-06, + "loss": 0.039, + "step": 1022 + }, + { + "epoch": 0.23276450511945393, + "grad_norm": 1.2104231875454026, + "learning_rate": 1.2433408151929655e-06, + "loss": 0.0361, + "step": 1023 + }, + { + "epoch": 0.23299203640500568, + "grad_norm": 1.0202244699658645, + "learning_rate": 1.2433278003922026e-06, + "loss": 0.0351, + "step": 1024 + }, + { + "epoch": 0.23321956769055746, + "grad_norm": 1.4263990244044498, + "learning_rate": 1.2433147729539514e-06, + "loss": 0.0815, + "step": 1025 + }, + { + "epoch": 0.2334470989761092, + "grad_norm": 1.5336205920808743, + "learning_rate": 1.2433017328784788e-06, + "loss": 0.0637, + "step": 1026 + }, + { + "epoch": 0.23367463026166096, + "grad_norm": 1.4842765531420945, + "learning_rate": 1.2432886801660513e-06, + "loss": 0.0619, + "step": 1027 + }, + { + "epoch": 0.23390216154721274, + "grad_norm": 0.9615217436557268, + "learning_rate": 1.2432756148169354e-06, + "loss": 0.0424, + "step": 1028 + }, + { + "epoch": 0.2341296928327645, + "grad_norm": 2.5880116161383273, + "learning_rate": 1.2432625368313983e-06, + "loss": 0.0925, + "step": 1029 + }, + { + "epoch": 0.23435722411831628, + "grad_norm": 1.1806940004387316, + "learning_rate": 1.2432494462097072e-06, + "loss": 0.0547, + "step": 1030 + }, + { + "epoch": 0.23458475540386803, + "grad_norm": 0.9929303526421415, + "learning_rate": 1.2432363429521295e-06, + "loss": 0.0529, + "step": 1031 + }, + { + "epoch": 0.2348122866894198, + "grad_norm": 0.9571908888077277, + "learning_rate": 1.2432232270589335e-06, + "loss": 0.0334, + "step": 1032 + }, + { + "epoch": 0.23503981797497156, + "grad_norm": 2.067393897730002, + "learning_rate": 1.2432100985303868e-06, + "loss": 0.106, + "step": 1033 + }, + { + "epoch": 0.2352673492605233, + "grad_norm": 0.8691321322200917, + "learning_rate": 1.243196957366758e-06, + "loss": 0.0346, + "step": 1034 + }, + { + "epoch": 0.2354948805460751, + "grad_norm": 1.5305165337075897, + "learning_rate": 1.2431838035683155e-06, + "loss": 0.0848, + "step": 1035 + }, + { + "epoch": 0.23572241183162684, + "grad_norm": 1.3223616364406545, + "learning_rate": 1.2431706371353282e-06, + "loss": 0.0687, + "step": 1036 + }, + { + "epoch": 0.23594994311717862, + "grad_norm": 1.928604930493819, + "learning_rate": 1.2431574580680653e-06, + "loss": 0.0649, + "step": 1037 + }, + { + "epoch": 0.23617747440273038, + "grad_norm": 1.2898613078943457, + "learning_rate": 1.2431442663667958e-06, + "loss": 0.0542, + "step": 1038 + }, + { + "epoch": 0.23640500568828213, + "grad_norm": 1.0765527225532854, + "learning_rate": 1.2431310620317898e-06, + "loss": 0.0601, + "step": 1039 + }, + { + "epoch": 0.2366325369738339, + "grad_norm": 0.7732520151584088, + "learning_rate": 1.2431178450633168e-06, + "loss": 0.0285, + "step": 1040 + }, + { + "epoch": 0.23686006825938566, + "grad_norm": 1.1685154356326117, + "learning_rate": 1.2431046154616473e-06, + "loss": 0.0356, + "step": 1041 + }, + { + "epoch": 0.23708759954493744, + "grad_norm": 1.3543617648083628, + "learning_rate": 1.2430913732270512e-06, + "loss": 0.0446, + "step": 1042 + }, + { + "epoch": 0.2373151308304892, + "grad_norm": 1.707962300050113, + "learning_rate": 1.2430781183597995e-06, + "loss": 0.0555, + "step": 1043 + }, + { + "epoch": 0.23754266211604094, + "grad_norm": 1.2841428640766417, + "learning_rate": 1.243064850860163e-06, + "loss": 0.0447, + "step": 1044 + }, + { + "epoch": 0.23777019340159272, + "grad_norm": 6.261159556049287, + "learning_rate": 1.243051570728413e-06, + "loss": 0.0749, + "step": 1045 + }, + { + "epoch": 0.23799772468714447, + "grad_norm": 1.1364071139867686, + "learning_rate": 1.2430382779648208e-06, + "loss": 0.0326, + "step": 1046 + }, + { + "epoch": 0.23822525597269625, + "grad_norm": 1.484581864381779, + "learning_rate": 1.243024972569658e-06, + "loss": 0.0645, + "step": 1047 + }, + { + "epoch": 0.238452787258248, + "grad_norm": 1.1545473703015605, + "learning_rate": 1.2430116545431966e-06, + "loss": 0.0481, + "step": 1048 + }, + { + "epoch": 0.23868031854379979, + "grad_norm": 0.8983805133665866, + "learning_rate": 1.2429983238857088e-06, + "loss": 0.0369, + "step": 1049 + }, + { + "epoch": 0.23890784982935154, + "grad_norm": 1.068742935236811, + "learning_rate": 1.2429849805974673e-06, + "loss": 0.039, + "step": 1050 + }, + { + "epoch": 0.2391353811149033, + "grad_norm": 1.4873654976644912, + "learning_rate": 1.2429716246787444e-06, + "loss": 0.0312, + "step": 1051 + }, + { + "epoch": 0.23936291240045507, + "grad_norm": 0.9449501930869969, + "learning_rate": 1.242958256129813e-06, + "loss": 0.0567, + "step": 1052 + }, + { + "epoch": 0.23959044368600682, + "grad_norm": 1.6243209526441913, + "learning_rate": 1.242944874950947e-06, + "loss": 0.0647, + "step": 1053 + }, + { + "epoch": 0.2398179749715586, + "grad_norm": 1.4569612537340462, + "learning_rate": 1.2429314811424192e-06, + "loss": 0.0758, + "step": 1054 + }, + { + "epoch": 0.24004550625711035, + "grad_norm": 1.5825027859681509, + "learning_rate": 1.242918074704504e-06, + "loss": 0.0621, + "step": 1055 + }, + { + "epoch": 0.2402730375426621, + "grad_norm": 1.0511003317201362, + "learning_rate": 1.2429046556374747e-06, + "loss": 0.037, + "step": 1056 + }, + { + "epoch": 0.24050056882821388, + "grad_norm": 1.5928024296302492, + "learning_rate": 1.2428912239416057e-06, + "loss": 0.0453, + "step": 1057 + }, + { + "epoch": 0.24072810011376564, + "grad_norm": 1.1044477930026537, + "learning_rate": 1.242877779617172e-06, + "loss": 0.0419, + "step": 1058 + }, + { + "epoch": 0.24095563139931742, + "grad_norm": 1.3035104956572268, + "learning_rate": 1.242864322664448e-06, + "loss": 0.0531, + "step": 1059 + }, + { + "epoch": 0.24118316268486917, + "grad_norm": 1.4034057924806689, + "learning_rate": 1.2428508530837088e-06, + "loss": 0.0753, + "step": 1060 + }, + { + "epoch": 0.24141069397042092, + "grad_norm": 45.147488919338755, + "learning_rate": 1.2428373708752298e-06, + "loss": 0.4166, + "step": 1061 + }, + { + "epoch": 0.2416382252559727, + "grad_norm": 1.804049071201822, + "learning_rate": 1.2428238760392862e-06, + "loss": 0.0881, + "step": 1062 + }, + { + "epoch": 0.24186575654152445, + "grad_norm": 1.2662139418382417, + "learning_rate": 1.2428103685761543e-06, + "loss": 0.0592, + "step": 1063 + }, + { + "epoch": 0.24209328782707623, + "grad_norm": 2.633674196903193, + "learning_rate": 1.2427968484861097e-06, + "loss": 0.1104, + "step": 1064 + }, + { + "epoch": 0.24232081911262798, + "grad_norm": 1.437345835314508, + "learning_rate": 1.2427833157694292e-06, + "loss": 0.059, + "step": 1065 + }, + { + "epoch": 0.24254835039817976, + "grad_norm": 1.4759939750585893, + "learning_rate": 1.2427697704263892e-06, + "loss": 0.0564, + "step": 1066 + }, + { + "epoch": 0.24277588168373151, + "grad_norm": 1.4003983063384298, + "learning_rate": 1.2427562124572663e-06, + "loss": 0.0756, + "step": 1067 + }, + { + "epoch": 0.24300341296928327, + "grad_norm": 1.2893015359567934, + "learning_rate": 1.2427426418623377e-06, + "loss": 0.0581, + "step": 1068 + }, + { + "epoch": 0.24323094425483505, + "grad_norm": 0.9083897855186556, + "learning_rate": 1.242729058641881e-06, + "loss": 0.0353, + "step": 1069 + }, + { + "epoch": 0.2434584755403868, + "grad_norm": 2.339042375493057, + "learning_rate": 1.2427154627961737e-06, + "loss": 0.1366, + "step": 1070 + }, + { + "epoch": 0.24368600682593858, + "grad_norm": 2.1438385712601593, + "learning_rate": 1.2427018543254935e-06, + "loss": 0.082, + "step": 1071 + }, + { + "epoch": 0.24391353811149033, + "grad_norm": 1.627313372349946, + "learning_rate": 1.2426882332301187e-06, + "loss": 0.076, + "step": 1072 + }, + { + "epoch": 0.24414106939704208, + "grad_norm": 0.7791355914048339, + "learning_rate": 1.2426745995103277e-06, + "loss": 0.0309, + "step": 1073 + }, + { + "epoch": 0.24436860068259386, + "grad_norm": 1.1486991259203154, + "learning_rate": 1.242660953166399e-06, + "loss": 0.0509, + "step": 1074 + }, + { + "epoch": 0.2445961319681456, + "grad_norm": 1.7757375833942548, + "learning_rate": 1.2426472941986117e-06, + "loss": 0.0731, + "step": 1075 + }, + { + "epoch": 0.2448236632536974, + "grad_norm": 1.4454374080664514, + "learning_rate": 1.2426336226072449e-06, + "loss": 0.0868, + "step": 1076 + }, + { + "epoch": 0.24505119453924915, + "grad_norm": 1.0698078663784565, + "learning_rate": 1.242619938392578e-06, + "loss": 0.037, + "step": 1077 + }, + { + "epoch": 0.2452787258248009, + "grad_norm": 1.4863388002827098, + "learning_rate": 1.2426062415548907e-06, + "loss": 0.0677, + "step": 1078 + }, + { + "epoch": 0.24550625711035268, + "grad_norm": 0.8732937705027402, + "learning_rate": 1.2425925320944628e-06, + "loss": 0.0293, + "step": 1079 + }, + { + "epoch": 0.24573378839590443, + "grad_norm": 1.2595288446902135, + "learning_rate": 1.2425788100115747e-06, + "loss": 0.056, + "step": 1080 + }, + { + "epoch": 0.2459613196814562, + "grad_norm": 1.5998408970282165, + "learning_rate": 1.2425650753065065e-06, + "loss": 0.0764, + "step": 1081 + }, + { + "epoch": 0.24618885096700796, + "grad_norm": 1.8027463727170587, + "learning_rate": 1.2425513279795395e-06, + "loss": 0.0615, + "step": 1082 + }, + { + "epoch": 0.24641638225255974, + "grad_norm": 1.5654018901455387, + "learning_rate": 1.2425375680309543e-06, + "loss": 0.0661, + "step": 1083 + }, + { + "epoch": 0.2466439135381115, + "grad_norm": 1.0676862677438599, + "learning_rate": 1.2425237954610322e-06, + "loss": 0.0469, + "step": 1084 + }, + { + "epoch": 0.24687144482366324, + "grad_norm": 1.097337057406586, + "learning_rate": 1.2425100102700547e-06, + "loss": 0.0457, + "step": 1085 + }, + { + "epoch": 0.24709897610921502, + "grad_norm": 1.7432427011380645, + "learning_rate": 1.2424962124583033e-06, + "loss": 0.0566, + "step": 1086 + }, + { + "epoch": 0.24732650739476678, + "grad_norm": 1.2401111182610864, + "learning_rate": 1.2424824020260603e-06, + "loss": 0.0471, + "step": 1087 + }, + { + "epoch": 0.24755403868031856, + "grad_norm": 1.6086762249686064, + "learning_rate": 1.2424685789736077e-06, + "loss": 0.0712, + "step": 1088 + }, + { + "epoch": 0.2477815699658703, + "grad_norm": 2.223604715087674, + "learning_rate": 1.2424547433012284e-06, + "loss": 0.059, + "step": 1089 + }, + { + "epoch": 0.24800910125142206, + "grad_norm": 1.763749275351197, + "learning_rate": 1.2424408950092049e-06, + "loss": 0.0768, + "step": 1090 + }, + { + "epoch": 0.24823663253697384, + "grad_norm": 1.7828992657348108, + "learning_rate": 1.2424270340978204e-06, + "loss": 0.0794, + "step": 1091 + }, + { + "epoch": 0.2484641638225256, + "grad_norm": 1.5001450767436246, + "learning_rate": 1.2424131605673582e-06, + "loss": 0.0693, + "step": 1092 + }, + { + "epoch": 0.24869169510807737, + "grad_norm": 1.2301421279887537, + "learning_rate": 1.2423992744181015e-06, + "loss": 0.0512, + "step": 1093 + }, + { + "epoch": 0.24891922639362912, + "grad_norm": 1.2844864578504145, + "learning_rate": 1.2423853756503343e-06, + "loss": 0.0447, + "step": 1094 + }, + { + "epoch": 0.24914675767918087, + "grad_norm": 1.7196177827780137, + "learning_rate": 1.2423714642643408e-06, + "loss": 0.1182, + "step": 1095 + }, + { + "epoch": 0.24937428896473265, + "grad_norm": 1.46657616590301, + "learning_rate": 1.2423575402604051e-06, + "loss": 0.0656, + "step": 1096 + }, + { + "epoch": 0.2496018202502844, + "grad_norm": 1.326342830248352, + "learning_rate": 1.2423436036388122e-06, + "loss": 0.0544, + "step": 1097 + }, + { + "epoch": 0.24982935153583619, + "grad_norm": 1.952008099521945, + "learning_rate": 1.2423296543998465e-06, + "loss": 0.078, + "step": 1098 + }, + { + "epoch": 0.25005688282138794, + "grad_norm": 2.219926087555926, + "learning_rate": 1.2423156925437932e-06, + "loss": 0.0976, + "step": 1099 + }, + { + "epoch": 0.2502844141069397, + "grad_norm": 1.4678969382750013, + "learning_rate": 1.2423017180709376e-06, + "loss": 0.0727, + "step": 1100 + }, + { + "epoch": 0.25051194539249144, + "grad_norm": 1.9083840830954104, + "learning_rate": 1.2422877309815656e-06, + "loss": 0.0647, + "step": 1101 + }, + { + "epoch": 0.25073947667804325, + "grad_norm": 1.3418972776139215, + "learning_rate": 1.242273731275963e-06, + "loss": 0.0616, + "step": 1102 + }, + { + "epoch": 0.250967007963595, + "grad_norm": 1.0799686257073313, + "learning_rate": 1.2422597189544155e-06, + "loss": 0.0306, + "step": 1103 + }, + { + "epoch": 0.25119453924914675, + "grad_norm": 1.4786340245230432, + "learning_rate": 1.2422456940172101e-06, + "loss": 0.0752, + "step": 1104 + }, + { + "epoch": 0.2514220705346985, + "grad_norm": 1.3574468048142418, + "learning_rate": 1.2422316564646331e-06, + "loss": 0.0469, + "step": 1105 + }, + { + "epoch": 0.25164960182025026, + "grad_norm": 1.0996737224918256, + "learning_rate": 1.2422176062969713e-06, + "loss": 0.06, + "step": 1106 + }, + { + "epoch": 0.25187713310580206, + "grad_norm": 1.2079586241887055, + "learning_rate": 1.2422035435145121e-06, + "loss": 0.0417, + "step": 1107 + }, + { + "epoch": 0.2521046643913538, + "grad_norm": 1.8484139155779635, + "learning_rate": 1.2421894681175428e-06, + "loss": 0.1201, + "step": 1108 + }, + { + "epoch": 0.25233219567690557, + "grad_norm": 2.2964405416217977, + "learning_rate": 1.2421753801063511e-06, + "loss": 0.0661, + "step": 1109 + }, + { + "epoch": 0.2525597269624573, + "grad_norm": 1.233798632970271, + "learning_rate": 1.2421612794812248e-06, + "loss": 0.0533, + "step": 1110 + } + ], + "logging_steps": 1, + "max_steps": 21975, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1110, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3669861113856.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}