diff --git "a/checkpoint-1110/trainer_state.json" "b/checkpoint-1110/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1110/trainer_state.json" @@ -0,0 +1,7804 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2525597269624573, + "eval_steps": 500, + "global_step": 1110, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00022753128555176336, + "grad_norm": 12.3071932545212, + "learning_rate": 1.25e-06, + "loss": 0.3186, + "step": 1 + }, + { + "epoch": 0.0004550625711035267, + "grad_norm": 11.959825961880057, + "learning_rate": 1.2499999936130725e-06, + "loss": 0.3776, + "step": 2 + }, + { + "epoch": 0.0006825938566552901, + "grad_norm": 5.4315221034586365, + "learning_rate": 1.2499999744522896e-06, + "loss": 0.4755, + "step": 3 + }, + { + "epoch": 0.0009101251422070534, + "grad_norm": 21.003860231065644, + "learning_rate": 1.2499999425176518e-06, + "loss": 0.3334, + "step": 4 + }, + { + "epoch": 0.0011376564277588168, + "grad_norm": 9.549170994835775, + "learning_rate": 1.2499998978091598e-06, + "loss": 0.375, + "step": 5 + }, + { + "epoch": 0.0013651877133105802, + "grad_norm": 3.400827392368318, + "learning_rate": 1.2499998403268147e-06, + "loss": 0.2286, + "step": 6 + }, + { + "epoch": 0.0015927189988623437, + "grad_norm": 8.451175634489234, + "learning_rate": 1.2499997700706173e-06, + "loss": 0.3216, + "step": 7 + }, + { + "epoch": 0.0018202502844141069, + "grad_norm": 7.494987211346803, + "learning_rate": 1.2499996870405692e-06, + "loss": 0.2339, + "step": 8 + }, + { + "epoch": 0.0020477815699658703, + "grad_norm": 9.138399201835718, + "learning_rate": 1.2499995912366722e-06, + "loss": 0.326, + "step": 9 + }, + { + "epoch": 0.0022753128555176336, + "grad_norm": 3.2188295955534123, + "learning_rate": 1.2499994826589282e-06, + "loss": 0.2514, + "step": 10 + }, + { + "epoch": 0.002502844141069397, + "grad_norm": 22.66663249526738, + "learning_rate": 1.2499993613073393e-06, + "loss": 0.4005, + "step": 11 + }, + { + "epoch": 0.0027303754266211604, + "grad_norm": 21.954799290073527, + "learning_rate": 1.2499992271819083e-06, + "loss": 0.1492, + "step": 12 + }, + { + "epoch": 0.0029579067121729237, + "grad_norm": 6.624298118045555, + "learning_rate": 1.2499990802826377e-06, + "loss": 0.3024, + "step": 13 + }, + { + "epoch": 0.0031854379977246873, + "grad_norm": 3.923975454400113, + "learning_rate": 1.2499989206095304e-06, + "loss": 0.2411, + "step": 14 + }, + { + "epoch": 0.0034129692832764505, + "grad_norm": 1.9398605915746092, + "learning_rate": 1.2499987481625899e-06, + "loss": 0.1849, + "step": 15 + }, + { + "epoch": 0.0036405005688282138, + "grad_norm": 5.482695785208493, + "learning_rate": 1.2499985629418195e-06, + "loss": 0.3122, + "step": 16 + }, + { + "epoch": 0.0038680318543799774, + "grad_norm": 5.552109300872593, + "learning_rate": 1.2499983649472233e-06, + "loss": 0.3393, + "step": 17 + }, + { + "epoch": 0.004095563139931741, + "grad_norm": 4.610318891370888, + "learning_rate": 1.249998154178805e-06, + "loss": 0.3, + "step": 18 + }, + { + "epoch": 0.004323094425483504, + "grad_norm": 8.793267315718285, + "learning_rate": 1.2499979306365692e-06, + "loss": 0.2266, + "step": 19 + }, + { + "epoch": 0.004550625711035267, + "grad_norm": 11.785540460314868, + "learning_rate": 1.2499976943205202e-06, + "loss": 0.258, + "step": 20 + }, + { + "epoch": 0.00477815699658703, + "grad_norm": 7.848333910468807, + "learning_rate": 1.249997445230663e-06, + "loss": 0.3733, + "step": 21 + }, + { + "epoch": 0.005005688282138794, + "grad_norm": 11.509651474854413, + "learning_rate": 1.2499971833670026e-06, + "loss": 0.3606, + "step": 22 + }, + { + "epoch": 0.005233219567690558, + "grad_norm": 8.662973783002895, + "learning_rate": 1.2499969087295443e-06, + "loss": 0.3884, + "step": 23 + }, + { + "epoch": 0.005460750853242321, + "grad_norm": 5.341258812295752, + "learning_rate": 1.249996621318294e-06, + "loss": 0.2677, + "step": 24 + }, + { + "epoch": 0.005688282138794084, + "grad_norm": 4.742018594757072, + "learning_rate": 1.2499963211332573e-06, + "loss": 0.3253, + "step": 25 + }, + { + "epoch": 0.005915813424345847, + "grad_norm": 2.4536573603250624, + "learning_rate": 1.2499960081744405e-06, + "loss": 0.2393, + "step": 26 + }, + { + "epoch": 0.0061433447098976105, + "grad_norm": 6.34705088291597, + "learning_rate": 1.24999568244185e-06, + "loss": 0.4326, + "step": 27 + }, + { + "epoch": 0.006370875995449375, + "grad_norm": 9.775833264439491, + "learning_rate": 1.249995343935492e-06, + "loss": 0.4252, + "step": 28 + }, + { + "epoch": 0.006598407281001138, + "grad_norm": 6.064212735225404, + "learning_rate": 1.2499949926553743e-06, + "loss": 0.2988, + "step": 29 + }, + { + "epoch": 0.006825938566552901, + "grad_norm": 4.4254830237015845, + "learning_rate": 1.2499946286015032e-06, + "loss": 0.2988, + "step": 30 + }, + { + "epoch": 0.007053469852104664, + "grad_norm": 4.883047495609927, + "learning_rate": 1.2499942517738867e-06, + "loss": 0.2285, + "step": 31 + }, + { + "epoch": 0.0072810011376564275, + "grad_norm": 8.135398866699179, + "learning_rate": 1.2499938621725322e-06, + "loss": 0.1529, + "step": 32 + }, + { + "epoch": 0.007508532423208191, + "grad_norm": 2.973365765084456, + "learning_rate": 1.2499934597974478e-06, + "loss": 0.2436, + "step": 33 + }, + { + "epoch": 0.007736063708759955, + "grad_norm": 5.612693729952574, + "learning_rate": 1.2499930446486416e-06, + "loss": 0.3466, + "step": 34 + }, + { + "epoch": 0.007963594994311717, + "grad_norm": 3.022290156639827, + "learning_rate": 1.2499926167261224e-06, + "loss": 0.2728, + "step": 35 + }, + { + "epoch": 0.008191126279863481, + "grad_norm": 3.1279992715224467, + "learning_rate": 1.2499921760298987e-06, + "loss": 0.2469, + "step": 36 + }, + { + "epoch": 0.008418657565415245, + "grad_norm": 14.845448376418034, + "learning_rate": 1.2499917225599796e-06, + "loss": 0.5145, + "step": 37 + }, + { + "epoch": 0.008646188850967008, + "grad_norm": 14.138433401115075, + "learning_rate": 1.2499912563163742e-06, + "loss": 0.2705, + "step": 38 + }, + { + "epoch": 0.008873720136518772, + "grad_norm": 4.324563647824762, + "learning_rate": 1.249990777299092e-06, + "loss": 0.1563, + "step": 39 + }, + { + "epoch": 0.009101251422070534, + "grad_norm": 11.315529959215173, + "learning_rate": 1.249990285508143e-06, + "loss": 0.4123, + "step": 40 + }, + { + "epoch": 0.009328782707622298, + "grad_norm": 6.3112839729366765, + "learning_rate": 1.2499897809435374e-06, + "loss": 0.1742, + "step": 41 + }, + { + "epoch": 0.00955631399317406, + "grad_norm": 8.25726966946455, + "learning_rate": 1.249989263605285e-06, + "loss": 0.3229, + "step": 42 + }, + { + "epoch": 0.009783845278725825, + "grad_norm": 6.3545712967505334, + "learning_rate": 1.249988733493397e-06, + "loss": 0.3055, + "step": 43 + }, + { + "epoch": 0.010011376564277589, + "grad_norm": 5.356373706603287, + "learning_rate": 1.2499881906078836e-06, + "loss": 0.2601, + "step": 44 + }, + { + "epoch": 0.010238907849829351, + "grad_norm": 1.9215795165819936, + "learning_rate": 1.2499876349487564e-06, + "loss": 0.1517, + "step": 45 + }, + { + "epoch": 0.010466439135381115, + "grad_norm": 8.506503892761648, + "learning_rate": 1.2499870665160262e-06, + "loss": 0.2831, + "step": 46 + }, + { + "epoch": 0.010693970420932878, + "grad_norm": 5.909503420571465, + "learning_rate": 1.2499864853097054e-06, + "loss": 0.2252, + "step": 47 + }, + { + "epoch": 0.010921501706484642, + "grad_norm": 5.488265194188453, + "learning_rate": 1.2499858913298053e-06, + "loss": 0.3466, + "step": 48 + }, + { + "epoch": 0.011149032992036406, + "grad_norm": 12.162427245650075, + "learning_rate": 1.249985284576338e-06, + "loss": 0.2426, + "step": 49 + }, + { + "epoch": 0.011376564277588168, + "grad_norm": 9.969211407495816, + "learning_rate": 1.2499846650493164e-06, + "loss": 0.2801, + "step": 50 + }, + { + "epoch": 0.011604095563139932, + "grad_norm": 5.741578552447352, + "learning_rate": 1.2499840327487528e-06, + "loss": 0.2664, + "step": 51 + }, + { + "epoch": 0.011831626848691695, + "grad_norm": 2.937767840084915, + "learning_rate": 1.24998338767466e-06, + "loss": 0.1834, + "step": 52 + }, + { + "epoch": 0.012059158134243459, + "grad_norm": 4.130655112830682, + "learning_rate": 1.2499827298270515e-06, + "loss": 0.2675, + "step": 53 + }, + { + "epoch": 0.012286689419795221, + "grad_norm": 4.5227789119131625, + "learning_rate": 1.2499820592059405e-06, + "loss": 0.3205, + "step": 54 + }, + { + "epoch": 0.012514220705346985, + "grad_norm": 4.653850683576537, + "learning_rate": 1.2499813758113409e-06, + "loss": 0.1921, + "step": 55 + }, + { + "epoch": 0.01274175199089875, + "grad_norm": 6.204991552012506, + "learning_rate": 1.2499806796432665e-06, + "loss": 0.1989, + "step": 56 + }, + { + "epoch": 0.012969283276450512, + "grad_norm": 7.81696538748595, + "learning_rate": 1.2499799707017315e-06, + "loss": 0.1301, + "step": 57 + }, + { + "epoch": 0.013196814562002276, + "grad_norm": 6.427887275035889, + "learning_rate": 1.2499792489867508e-06, + "loss": 0.3376, + "step": 58 + }, + { + "epoch": 0.013424345847554038, + "grad_norm": 4.713573539887475, + "learning_rate": 1.2499785144983386e-06, + "loss": 0.1673, + "step": 59 + }, + { + "epoch": 0.013651877133105802, + "grad_norm": 6.7169275734426055, + "learning_rate": 1.24997776723651e-06, + "loss": 0.2501, + "step": 60 + }, + { + "epoch": 0.013879408418657566, + "grad_norm": 11.702392641770421, + "learning_rate": 1.2499770072012809e-06, + "loss": 0.293, + "step": 61 + }, + { + "epoch": 0.014106939704209329, + "grad_norm": 5.86563350345107, + "learning_rate": 1.2499762343926661e-06, + "loss": 0.2346, + "step": 62 + }, + { + "epoch": 0.014334470989761093, + "grad_norm": 4.562933746130791, + "learning_rate": 1.2499754488106817e-06, + "loss": 0.1349, + "step": 63 + }, + { + "epoch": 0.014562002275312855, + "grad_norm": 16.935870758573948, + "learning_rate": 1.2499746504553436e-06, + "loss": 0.2869, + "step": 64 + }, + { + "epoch": 0.01478953356086462, + "grad_norm": 3.252674290241083, + "learning_rate": 1.2499738393266684e-06, + "loss": 0.2125, + "step": 65 + }, + { + "epoch": 0.015017064846416382, + "grad_norm": 3.767321260449828, + "learning_rate": 1.2499730154246726e-06, + "loss": 0.2049, + "step": 66 + }, + { + "epoch": 0.015244596131968146, + "grad_norm": 7.264091175555215, + "learning_rate": 1.2499721787493726e-06, + "loss": 0.2521, + "step": 67 + }, + { + "epoch": 0.01547212741751991, + "grad_norm": 2.846384337735166, + "learning_rate": 1.2499713293007862e-06, + "loss": 0.1745, + "step": 68 + }, + { + "epoch": 0.015699658703071672, + "grad_norm": 30.829215228751778, + "learning_rate": 1.2499704670789301e-06, + "loss": 0.1514, + "step": 69 + }, + { + "epoch": 0.015927189988623434, + "grad_norm": 7.168923083631056, + "learning_rate": 1.2499695920838225e-06, + "loss": 0.2393, + "step": 70 + }, + { + "epoch": 0.0161547212741752, + "grad_norm": 3.418723817035884, + "learning_rate": 1.2499687043154809e-06, + "loss": 0.1342, + "step": 71 + }, + { + "epoch": 0.016382252559726963, + "grad_norm": 6.316537441364383, + "learning_rate": 1.2499678037739235e-06, + "loss": 0.1698, + "step": 72 + }, + { + "epoch": 0.016609783845278725, + "grad_norm": 3.8561981086650596, + "learning_rate": 1.2499668904591688e-06, + "loss": 0.3104, + "step": 73 + }, + { + "epoch": 0.01683731513083049, + "grad_norm": 4.679806938064617, + "learning_rate": 1.2499659643712356e-06, + "loss": 0.2139, + "step": 74 + }, + { + "epoch": 0.017064846416382253, + "grad_norm": 4.26137230837329, + "learning_rate": 1.2499650255101425e-06, + "loss": 0.2433, + "step": 75 + }, + { + "epoch": 0.017292377701934016, + "grad_norm": 3.7227188471827914, + "learning_rate": 1.2499640738759088e-06, + "loss": 0.2334, + "step": 76 + }, + { + "epoch": 0.017519908987485778, + "grad_norm": 6.044525591826923, + "learning_rate": 1.249963109468554e-06, + "loss": 0.3106, + "step": 77 + }, + { + "epoch": 0.017747440273037544, + "grad_norm": 6.248705646938244, + "learning_rate": 1.2499621322880979e-06, + "loss": 0.2025, + "step": 78 + }, + { + "epoch": 0.017974971558589306, + "grad_norm": 2.8368621495357313, + "learning_rate": 1.2499611423345604e-06, + "loss": 0.1492, + "step": 79 + }, + { + "epoch": 0.01820250284414107, + "grad_norm": 5.049736361542706, + "learning_rate": 1.2499601396079617e-06, + "loss": 0.1341, + "step": 80 + }, + { + "epoch": 0.018430034129692834, + "grad_norm": 6.760221850362585, + "learning_rate": 1.2499591241083222e-06, + "loss": 0.2092, + "step": 81 + }, + { + "epoch": 0.018657565415244597, + "grad_norm": 6.630540720646431, + "learning_rate": 1.2499580958356628e-06, + "loss": 0.2181, + "step": 82 + }, + { + "epoch": 0.01888509670079636, + "grad_norm": 3.8482585047631863, + "learning_rate": 1.2499570547900045e-06, + "loss": 0.1613, + "step": 83 + }, + { + "epoch": 0.01911262798634812, + "grad_norm": 6.605304588968454, + "learning_rate": 1.2499560009713684e-06, + "loss": 0.2959, + "step": 84 + }, + { + "epoch": 0.019340159271899887, + "grad_norm": 6.012809221970948, + "learning_rate": 1.2499549343797764e-06, + "loss": 0.2393, + "step": 85 + }, + { + "epoch": 0.01956769055745165, + "grad_norm": 6.254621323206641, + "learning_rate": 1.24995385501525e-06, + "loss": 0.2285, + "step": 86 + }, + { + "epoch": 0.019795221843003412, + "grad_norm": 3.4046999226542733, + "learning_rate": 1.2499527628778116e-06, + "loss": 0.1187, + "step": 87 + }, + { + "epoch": 0.020022753128555178, + "grad_norm": 7.419781715158706, + "learning_rate": 1.2499516579674831e-06, + "loss": 0.2817, + "step": 88 + }, + { + "epoch": 0.02025028441410694, + "grad_norm": 21.819719933471735, + "learning_rate": 1.2499505402842872e-06, + "loss": 0.2469, + "step": 89 + }, + { + "epoch": 0.020477815699658702, + "grad_norm": 2.8418419055080766, + "learning_rate": 1.2499494098282469e-06, + "loss": 0.2955, + "step": 90 + }, + { + "epoch": 0.020705346985210465, + "grad_norm": 7.066317637431583, + "learning_rate": 1.2499482665993851e-06, + "loss": 0.2044, + "step": 91 + }, + { + "epoch": 0.02093287827076223, + "grad_norm": 5.925737098985834, + "learning_rate": 1.2499471105977252e-06, + "loss": 0.2335, + "step": 92 + }, + { + "epoch": 0.021160409556313993, + "grad_norm": 3.0480275776898473, + "learning_rate": 1.249945941823291e-06, + "loss": 0.3633, + "step": 93 + }, + { + "epoch": 0.021387940841865755, + "grad_norm": 2.946352549362824, + "learning_rate": 1.2499447602761063e-06, + "loss": 0.2011, + "step": 94 + }, + { + "epoch": 0.02161547212741752, + "grad_norm": 6.07129225638081, + "learning_rate": 1.2499435659561954e-06, + "loss": 0.2585, + "step": 95 + }, + { + "epoch": 0.021843003412969283, + "grad_norm": 4.592794032374342, + "learning_rate": 1.2499423588635823e-06, + "loss": 0.2336, + "step": 96 + }, + { + "epoch": 0.022070534698521046, + "grad_norm": 19.61835193566366, + "learning_rate": 1.2499411389982919e-06, + "loss": 0.2438, + "step": 97 + }, + { + "epoch": 0.02229806598407281, + "grad_norm": 4.697964666160796, + "learning_rate": 1.2499399063603492e-06, + "loss": 0.26, + "step": 98 + }, + { + "epoch": 0.022525597269624574, + "grad_norm": 6.831528796415563, + "learning_rate": 1.2499386609497793e-06, + "loss": 0.1291, + "step": 99 + }, + { + "epoch": 0.022753128555176336, + "grad_norm": 3.3770537551655653, + "learning_rate": 1.2499374027666078e-06, + "loss": 0.1919, + "step": 100 + }, + { + "epoch": 0.0229806598407281, + "grad_norm": 10.54402988548413, + "learning_rate": 1.2499361318108602e-06, + "loss": 0.2695, + "step": 101 + }, + { + "epoch": 0.023208191126279865, + "grad_norm": 6.4464740357818116, + "learning_rate": 1.2499348480825627e-06, + "loss": 0.1883, + "step": 102 + }, + { + "epoch": 0.023435722411831627, + "grad_norm": 5.7228283849137895, + "learning_rate": 1.2499335515817413e-06, + "loss": 0.225, + "step": 103 + }, + { + "epoch": 0.02366325369738339, + "grad_norm": 8.575195167369158, + "learning_rate": 1.2499322423084226e-06, + "loss": 0.1988, + "step": 104 + }, + { + "epoch": 0.023890784982935155, + "grad_norm": 5.524822469569831, + "learning_rate": 1.2499309202626336e-06, + "loss": 0.1362, + "step": 105 + }, + { + "epoch": 0.024118316268486917, + "grad_norm": 1.4259194554286314, + "learning_rate": 1.249929585444401e-06, + "loss": 0.1341, + "step": 106 + }, + { + "epoch": 0.02434584755403868, + "grad_norm": 5.569399731315438, + "learning_rate": 1.2499282378537522e-06, + "loss": 0.1823, + "step": 107 + }, + { + "epoch": 0.024573378839590442, + "grad_norm": 5.131038290322419, + "learning_rate": 1.2499268774907144e-06, + "loss": 0.1674, + "step": 108 + }, + { + "epoch": 0.024800910125142208, + "grad_norm": 2.9740215362829368, + "learning_rate": 1.249925504355316e-06, + "loss": 0.1443, + "step": 109 + }, + { + "epoch": 0.02502844141069397, + "grad_norm": 7.125610878241638, + "learning_rate": 1.2499241184475848e-06, + "loss": 0.1993, + "step": 110 + }, + { + "epoch": 0.025255972696245733, + "grad_norm": 3.5104920582246284, + "learning_rate": 1.249922719767549e-06, + "loss": 0.1387, + "step": 111 + }, + { + "epoch": 0.0254835039817975, + "grad_norm": 15.180689323576399, + "learning_rate": 1.2499213083152374e-06, + "loss": 0.1609, + "step": 112 + }, + { + "epoch": 0.02571103526734926, + "grad_norm": 2.6467486780240077, + "learning_rate": 1.2499198840906787e-06, + "loss": 0.0766, + "step": 113 + }, + { + "epoch": 0.025938566552901023, + "grad_norm": 6.947833673299234, + "learning_rate": 1.249918447093902e-06, + "loss": 0.1988, + "step": 114 + }, + { + "epoch": 0.026166097838452786, + "grad_norm": 3.236155694827761, + "learning_rate": 1.249916997324937e-06, + "loss": 0.2822, + "step": 115 + }, + { + "epoch": 0.02639362912400455, + "grad_norm": 4.424229361394889, + "learning_rate": 1.2499155347838129e-06, + "loss": 0.2639, + "step": 116 + }, + { + "epoch": 0.026621160409556314, + "grad_norm": 6.7125880752306, + "learning_rate": 1.2499140594705596e-06, + "loss": 0.1758, + "step": 117 + }, + { + "epoch": 0.026848691695108076, + "grad_norm": 12.978485247890044, + "learning_rate": 1.2499125713852076e-06, + "loss": 0.2966, + "step": 118 + }, + { + "epoch": 0.027076222980659842, + "grad_norm": 2.4562187666064297, + "learning_rate": 1.2499110705277869e-06, + "loss": 0.1317, + "step": 119 + }, + { + "epoch": 0.027303754266211604, + "grad_norm": 2.450514697648912, + "learning_rate": 1.2499095568983284e-06, + "loss": 0.2491, + "step": 120 + }, + { + "epoch": 0.027531285551763367, + "grad_norm": 2.962900989508568, + "learning_rate": 1.2499080304968634e-06, + "loss": 0.1782, + "step": 121 + }, + { + "epoch": 0.027758816837315133, + "grad_norm": 4.706451675787787, + "learning_rate": 1.2499064913234222e-06, + "loss": 0.2063, + "step": 122 + }, + { + "epoch": 0.027986348122866895, + "grad_norm": 4.848247166198472, + "learning_rate": 1.249904939378037e-06, + "loss": 0.1873, + "step": 123 + }, + { + "epoch": 0.028213879408418657, + "grad_norm": 5.57275566955423, + "learning_rate": 1.2499033746607395e-06, + "loss": 0.2362, + "step": 124 + }, + { + "epoch": 0.02844141069397042, + "grad_norm": 4.528761927217566, + "learning_rate": 1.2499017971715614e-06, + "loss": 0.2686, + "step": 125 + }, + { + "epoch": 0.028668941979522185, + "grad_norm": 7.35859467900191, + "learning_rate": 1.2499002069105348e-06, + "loss": 0.275, + "step": 126 + }, + { + "epoch": 0.028896473265073948, + "grad_norm": 4.494727686955716, + "learning_rate": 1.2498986038776926e-06, + "loss": 0.1759, + "step": 127 + }, + { + "epoch": 0.02912400455062571, + "grad_norm": 7.273216392666622, + "learning_rate": 1.2498969880730671e-06, + "loss": 0.2159, + "step": 128 + }, + { + "epoch": 0.029351535836177476, + "grad_norm": 4.955227920384567, + "learning_rate": 1.249895359496692e-06, + "loss": 0.1888, + "step": 129 + }, + { + "epoch": 0.02957906712172924, + "grad_norm": 6.321445200949685, + "learning_rate": 1.2498937181486e-06, + "loss": 0.3007, + "step": 130 + }, + { + "epoch": 0.029806598407281, + "grad_norm": 2.76312902269676, + "learning_rate": 1.2498920640288248e-06, + "loss": 0.2442, + "step": 131 + }, + { + "epoch": 0.030034129692832763, + "grad_norm": 56.774720129580295, + "learning_rate": 1.2498903971374005e-06, + "loss": 0.223, + "step": 132 + }, + { + "epoch": 0.03026166097838453, + "grad_norm": 3.9468490187056324, + "learning_rate": 1.2498887174743606e-06, + "loss": 0.2504, + "step": 133 + }, + { + "epoch": 0.03048919226393629, + "grad_norm": 3.9118814976883542, + "learning_rate": 1.24988702503974e-06, + "loss": 0.1939, + "step": 134 + }, + { + "epoch": 0.030716723549488054, + "grad_norm": 3.7837188268010506, + "learning_rate": 1.2498853198335728e-06, + "loss": 0.2199, + "step": 135 + }, + { + "epoch": 0.03094425483503982, + "grad_norm": 4.0297942240817175, + "learning_rate": 1.2498836018558942e-06, + "loss": 0.1566, + "step": 136 + }, + { + "epoch": 0.031171786120591582, + "grad_norm": 3.4754550482446698, + "learning_rate": 1.2498818711067392e-06, + "loss": 0.2666, + "step": 137 + }, + { + "epoch": 0.031399317406143344, + "grad_norm": 3.864651244769, + "learning_rate": 1.2498801275861433e-06, + "loss": 0.1173, + "step": 138 + }, + { + "epoch": 0.03162684869169511, + "grad_norm": 8.216814820623972, + "learning_rate": 1.2498783712941418e-06, + "loss": 0.1879, + "step": 139 + }, + { + "epoch": 0.03185437997724687, + "grad_norm": 3.637457358045326, + "learning_rate": 1.2498766022307709e-06, + "loss": 0.2047, + "step": 140 + }, + { + "epoch": 0.032081911262798635, + "grad_norm": 2.58051980801193, + "learning_rate": 1.2498748203960665e-06, + "loss": 0.1008, + "step": 141 + }, + { + "epoch": 0.0323094425483504, + "grad_norm": 3.8775724824241764, + "learning_rate": 1.2498730257900655e-06, + "loss": 0.2042, + "step": 142 + }, + { + "epoch": 0.03253697383390216, + "grad_norm": 5.772591680829651, + "learning_rate": 1.249871218412804e-06, + "loss": 0.2352, + "step": 143 + }, + { + "epoch": 0.032764505119453925, + "grad_norm": 2.210254874393301, + "learning_rate": 1.2498693982643192e-06, + "loss": 0.1803, + "step": 144 + }, + { + "epoch": 0.03299203640500569, + "grad_norm": 6.540771980552272, + "learning_rate": 1.2498675653446485e-06, + "loss": 0.2304, + "step": 145 + }, + { + "epoch": 0.03321956769055745, + "grad_norm": 2.904522388367919, + "learning_rate": 1.249865719653829e-06, + "loss": 0.1707, + "step": 146 + }, + { + "epoch": 0.033447098976109216, + "grad_norm": 9.318986716894935, + "learning_rate": 1.2498638611918985e-06, + "loss": 0.2038, + "step": 147 + }, + { + "epoch": 0.03367463026166098, + "grad_norm": 9.58516027118141, + "learning_rate": 1.249861989958895e-06, + "loss": 0.2357, + "step": 148 + }, + { + "epoch": 0.03390216154721274, + "grad_norm": 3.559770501878285, + "learning_rate": 1.2498601059548572e-06, + "loss": 0.1613, + "step": 149 + }, + { + "epoch": 0.034129692832764506, + "grad_norm": 3.348814329958542, + "learning_rate": 1.2498582091798228e-06, + "loss": 0.2016, + "step": 150 + }, + { + "epoch": 0.034357224118316265, + "grad_norm": 6.375342543891093, + "learning_rate": 1.2498562996338312e-06, + "loss": 0.2231, + "step": 151 + }, + { + "epoch": 0.03458475540386803, + "grad_norm": 7.488809251815451, + "learning_rate": 1.249854377316921e-06, + "loss": 0.1819, + "step": 152 + }, + { + "epoch": 0.0348122866894198, + "grad_norm": 2.508487580474721, + "learning_rate": 1.2498524422291319e-06, + "loss": 0.182, + "step": 153 + }, + { + "epoch": 0.035039817974971556, + "grad_norm": 3.656563964135558, + "learning_rate": 1.2498504943705033e-06, + "loss": 0.165, + "step": 154 + }, + { + "epoch": 0.03526734926052332, + "grad_norm": 2.771070563762278, + "learning_rate": 1.249848533741075e-06, + "loss": 0.2569, + "step": 155 + }, + { + "epoch": 0.03549488054607509, + "grad_norm": 5.610529774003187, + "learning_rate": 1.2498465603408865e-06, + "loss": 0.2873, + "step": 156 + }, + { + "epoch": 0.035722411831626846, + "grad_norm": 3.6657793262286638, + "learning_rate": 1.2498445741699792e-06, + "loss": 0.1086, + "step": 157 + }, + { + "epoch": 0.03594994311717861, + "grad_norm": 11.136381961854878, + "learning_rate": 1.249842575228393e-06, + "loss": 0.1653, + "step": 158 + }, + { + "epoch": 0.03617747440273038, + "grad_norm": 4.607920317694178, + "learning_rate": 1.249840563516169e-06, + "loss": 0.1816, + "step": 159 + }, + { + "epoch": 0.03640500568828214, + "grad_norm": 4.765507333684582, + "learning_rate": 1.249838539033348e-06, + "loss": 0.1735, + "step": 160 + }, + { + "epoch": 0.0366325369738339, + "grad_norm": 3.024559515436515, + "learning_rate": 1.2498365017799715e-06, + "loss": 0.0997, + "step": 161 + }, + { + "epoch": 0.03686006825938567, + "grad_norm": 3.0006086205585594, + "learning_rate": 1.2498344517560815e-06, + "loss": 0.2742, + "step": 162 + }, + { + "epoch": 0.03708759954493743, + "grad_norm": 4.390575337778858, + "learning_rate": 1.2498323889617198e-06, + "loss": 0.2112, + "step": 163 + }, + { + "epoch": 0.03731513083048919, + "grad_norm": 4.987032274568943, + "learning_rate": 1.2498303133969281e-06, + "loss": 0.2282, + "step": 164 + }, + { + "epoch": 0.03754266211604096, + "grad_norm": 3.813775711394782, + "learning_rate": 1.2498282250617492e-06, + "loss": 0.1944, + "step": 165 + }, + { + "epoch": 0.03777019340159272, + "grad_norm": 3.361678763128891, + "learning_rate": 1.2498261239562257e-06, + "loss": 0.2018, + "step": 166 + }, + { + "epoch": 0.037997724687144484, + "grad_norm": 4.992072192203259, + "learning_rate": 1.2498240100804005e-06, + "loss": 0.2089, + "step": 167 + }, + { + "epoch": 0.03822525597269624, + "grad_norm": 8.050790934059092, + "learning_rate": 1.249821883434317e-06, + "loss": 0.2696, + "step": 168 + }, + { + "epoch": 0.03845278725824801, + "grad_norm": 2.642297340192281, + "learning_rate": 1.2498197440180182e-06, + "loss": 0.2691, + "step": 169 + }, + { + "epoch": 0.038680318543799774, + "grad_norm": 3.35790306734272, + "learning_rate": 1.2498175918315484e-06, + "loss": 0.1851, + "step": 170 + }, + { + "epoch": 0.03890784982935153, + "grad_norm": 3.524642269348137, + "learning_rate": 1.2498154268749513e-06, + "loss": 0.2276, + "step": 171 + }, + { + "epoch": 0.0391353811149033, + "grad_norm": 2.188667506818875, + "learning_rate": 1.249813249148271e-06, + "loss": 0.1616, + "step": 172 + }, + { + "epoch": 0.039362912400455065, + "grad_norm": 5.1958946099491845, + "learning_rate": 1.2498110586515525e-06, + "loss": 0.1987, + "step": 173 + }, + { + "epoch": 0.039590443686006824, + "grad_norm": 5.09328084896296, + "learning_rate": 1.2498088553848398e-06, + "loss": 0.195, + "step": 174 + }, + { + "epoch": 0.03981797497155859, + "grad_norm": 2.8290595777512952, + "learning_rate": 1.2498066393481787e-06, + "loss": 0.1568, + "step": 175 + }, + { + "epoch": 0.040045506257110355, + "grad_norm": 2.360697357040943, + "learning_rate": 1.249804410541614e-06, + "loss": 0.2065, + "step": 176 + }, + { + "epoch": 0.040273037542662114, + "grad_norm": 4.718810327826489, + "learning_rate": 1.2498021689651916e-06, + "loss": 0.2003, + "step": 177 + }, + { + "epoch": 0.04050056882821388, + "grad_norm": 2.6458436624930237, + "learning_rate": 1.249799914618957e-06, + "loss": 0.1589, + "step": 178 + }, + { + "epoch": 0.040728100113765646, + "grad_norm": 3.289621635927127, + "learning_rate": 1.2497976475029566e-06, + "loss": 0.1905, + "step": 179 + }, + { + "epoch": 0.040955631399317405, + "grad_norm": 2.7547654896260028, + "learning_rate": 1.2497953676172364e-06, + "loss": 0.1538, + "step": 180 + }, + { + "epoch": 0.04118316268486917, + "grad_norm": 4.715970073162376, + "learning_rate": 1.2497930749618431e-06, + "loss": 0.1297, + "step": 181 + }, + { + "epoch": 0.04141069397042093, + "grad_norm": 13.147614048372157, + "learning_rate": 1.2497907695368238e-06, + "loss": 0.164, + "step": 182 + }, + { + "epoch": 0.041638225255972695, + "grad_norm": 2.692225418023433, + "learning_rate": 1.2497884513422253e-06, + "loss": 0.2537, + "step": 183 + }, + { + "epoch": 0.04186575654152446, + "grad_norm": 5.166049507007355, + "learning_rate": 1.249786120378095e-06, + "loss": 0.074, + "step": 184 + }, + { + "epoch": 0.04209328782707622, + "grad_norm": 3.0648916024092596, + "learning_rate": 1.2497837766444806e-06, + "loss": 0.1639, + "step": 185 + }, + { + "epoch": 0.042320819112627986, + "grad_norm": 4.567688921451397, + "learning_rate": 1.2497814201414304e-06, + "loss": 0.2905, + "step": 186 + }, + { + "epoch": 0.04254835039817975, + "grad_norm": 3.970377559361967, + "learning_rate": 1.249779050868992e-06, + "loss": 0.2001, + "step": 187 + }, + { + "epoch": 0.04277588168373151, + "grad_norm": 2.2768846909587763, + "learning_rate": 1.249776668827214e-06, + "loss": 0.0951, + "step": 188 + }, + { + "epoch": 0.043003412969283276, + "grad_norm": 6.438142708090974, + "learning_rate": 1.249774274016145e-06, + "loss": 0.203, + "step": 189 + }, + { + "epoch": 0.04323094425483504, + "grad_norm": 2.4175466744317977, + "learning_rate": 1.2497718664358341e-06, + "loss": 0.1713, + "step": 190 + }, + { + "epoch": 0.0434584755403868, + "grad_norm": 4.37204480901975, + "learning_rate": 1.2497694460863307e-06, + "loss": 0.2986, + "step": 191 + }, + { + "epoch": 0.04368600682593857, + "grad_norm": 3.2046762676937255, + "learning_rate": 1.2497670129676838e-06, + "loss": 0.1288, + "step": 192 + }, + { + "epoch": 0.04391353811149033, + "grad_norm": 3.901472238917995, + "learning_rate": 1.2497645670799436e-06, + "loss": 0.1291, + "step": 193 + }, + { + "epoch": 0.04414106939704209, + "grad_norm": 3.891177273974114, + "learning_rate": 1.2497621084231595e-06, + "loss": 0.1165, + "step": 194 + }, + { + "epoch": 0.04436860068259386, + "grad_norm": 3.831124951630966, + "learning_rate": 1.2497596369973823e-06, + "loss": 0.175, + "step": 195 + }, + { + "epoch": 0.04459613196814562, + "grad_norm": 7.137497588920377, + "learning_rate": 1.2497571528026623e-06, + "loss": 0.2319, + "step": 196 + }, + { + "epoch": 0.04482366325369738, + "grad_norm": 2.9787063992991256, + "learning_rate": 1.2497546558390503e-06, + "loss": 0.2044, + "step": 197 + }, + { + "epoch": 0.04505119453924915, + "grad_norm": 2.5728244375494413, + "learning_rate": 1.2497521461065973e-06, + "loss": 0.1395, + "step": 198 + }, + { + "epoch": 0.04527872582480091, + "grad_norm": 7.102221321561537, + "learning_rate": 1.2497496236053547e-06, + "loss": 0.1969, + "step": 199 + }, + { + "epoch": 0.04550625711035267, + "grad_norm": 2.579422809989494, + "learning_rate": 1.2497470883353738e-06, + "loss": 0.1019, + "step": 200 + }, + { + "epoch": 0.04573378839590444, + "grad_norm": 4.340132040430137, + "learning_rate": 1.2497445402967068e-06, + "loss": 0.241, + "step": 201 + }, + { + "epoch": 0.0459613196814562, + "grad_norm": 2.2195665044126276, + "learning_rate": 1.2497419794894053e-06, + "loss": 0.2059, + "step": 202 + }, + { + "epoch": 0.04618885096700796, + "grad_norm": 3.274345001247324, + "learning_rate": 1.249739405913522e-06, + "loss": 0.1328, + "step": 203 + }, + { + "epoch": 0.04641638225255973, + "grad_norm": 2.527264534705696, + "learning_rate": 1.2497368195691095e-06, + "loss": 0.1408, + "step": 204 + }, + { + "epoch": 0.04664391353811149, + "grad_norm": 3.306757570747259, + "learning_rate": 1.2497342204562205e-06, + "loss": 0.2233, + "step": 205 + }, + { + "epoch": 0.046871444823663254, + "grad_norm": 3.6647451852915336, + "learning_rate": 1.2497316085749081e-06, + "loss": 0.1239, + "step": 206 + }, + { + "epoch": 0.04709897610921502, + "grad_norm": 4.68508784917087, + "learning_rate": 1.249728983925226e-06, + "loss": 0.1707, + "step": 207 + }, + { + "epoch": 0.04732650739476678, + "grad_norm": 3.18438034976801, + "learning_rate": 1.2497263465072274e-06, + "loss": 0.1325, + "step": 208 + }, + { + "epoch": 0.047554038680318544, + "grad_norm": 2.665536371480516, + "learning_rate": 1.2497236963209663e-06, + "loss": 0.247, + "step": 209 + }, + { + "epoch": 0.04778156996587031, + "grad_norm": 3.6305897675111822, + "learning_rate": 1.2497210333664972e-06, + "loss": 0.1399, + "step": 210 + }, + { + "epoch": 0.04800910125142207, + "grad_norm": 3.427786312260657, + "learning_rate": 1.2497183576438743e-06, + "loss": 0.1595, + "step": 211 + }, + { + "epoch": 0.048236632536973835, + "grad_norm": 3.501593030667954, + "learning_rate": 1.2497156691531523e-06, + "loss": 0.1895, + "step": 212 + }, + { + "epoch": 0.048464163822525594, + "grad_norm": 2.29399983953313, + "learning_rate": 1.249712967894386e-06, + "loss": 0.1273, + "step": 213 + }, + { + "epoch": 0.04869169510807736, + "grad_norm": 4.248497703608046, + "learning_rate": 1.2497102538676308e-06, + "loss": 0.2118, + "step": 214 + }, + { + "epoch": 0.048919226393629126, + "grad_norm": 5.009911727752511, + "learning_rate": 1.249707527072942e-06, + "loss": 0.1533, + "step": 215 + }, + { + "epoch": 0.049146757679180884, + "grad_norm": 3.254064879259487, + "learning_rate": 1.2497047875103757e-06, + "loss": 0.3042, + "step": 216 + }, + { + "epoch": 0.04937428896473265, + "grad_norm": 2.700363753095535, + "learning_rate": 1.2497020351799875e-06, + "loss": 0.1933, + "step": 217 + }, + { + "epoch": 0.049601820250284416, + "grad_norm": 2.2159854350533763, + "learning_rate": 1.2496992700818335e-06, + "loss": 0.1733, + "step": 218 + }, + { + "epoch": 0.049829351535836175, + "grad_norm": 6.438623712108173, + "learning_rate": 1.249696492215971e-06, + "loss": 0.2233, + "step": 219 + }, + { + "epoch": 0.05005688282138794, + "grad_norm": 3.6403163135182552, + "learning_rate": 1.249693701582456e-06, + "loss": 0.1542, + "step": 220 + }, + { + "epoch": 0.05028441410693971, + "grad_norm": 3.280631643810882, + "learning_rate": 1.2496908981813458e-06, + "loss": 0.1799, + "step": 221 + }, + { + "epoch": 0.050511945392491465, + "grad_norm": 2.5684306853319687, + "learning_rate": 1.2496880820126977e-06, + "loss": 0.2051, + "step": 222 + }, + { + "epoch": 0.05073947667804323, + "grad_norm": 2.7401430199461108, + "learning_rate": 1.2496852530765695e-06, + "loss": 0.1828, + "step": 223 + }, + { + "epoch": 0.050967007963595, + "grad_norm": 2.95485123311806, + "learning_rate": 1.2496824113730186e-06, + "loss": 0.2602, + "step": 224 + }, + { + "epoch": 0.051194539249146756, + "grad_norm": 2.5679914292312738, + "learning_rate": 1.2496795569021033e-06, + "loss": 0.1838, + "step": 225 + }, + { + "epoch": 0.05142207053469852, + "grad_norm": 4.2106953289503055, + "learning_rate": 1.2496766896638819e-06, + "loss": 0.1831, + "step": 226 + }, + { + "epoch": 0.05164960182025029, + "grad_norm": 2.4133590857510603, + "learning_rate": 1.249673809658413e-06, + "loss": 0.1869, + "step": 227 + }, + { + "epoch": 0.05187713310580205, + "grad_norm": 2.009672236932174, + "learning_rate": 1.2496709168857555e-06, + "loss": 0.1297, + "step": 228 + }, + { + "epoch": 0.05210466439135381, + "grad_norm": 2.57569428799923, + "learning_rate": 1.2496680113459683e-06, + "loss": 0.1887, + "step": 229 + }, + { + "epoch": 0.05233219567690557, + "grad_norm": 3.3094428680937464, + "learning_rate": 1.2496650930391113e-06, + "loss": 0.2654, + "step": 230 + }, + { + "epoch": 0.05255972696245734, + "grad_norm": 2.847650693015463, + "learning_rate": 1.2496621619652435e-06, + "loss": 0.1704, + "step": 231 + }, + { + "epoch": 0.0527872582480091, + "grad_norm": 2.9888611972362167, + "learning_rate": 1.2496592181244253e-06, + "loss": 0.1601, + "step": 232 + }, + { + "epoch": 0.05301478953356086, + "grad_norm": 2.08648737949565, + "learning_rate": 1.249656261516717e-06, + "loss": 0.1953, + "step": 233 + }, + { + "epoch": 0.05324232081911263, + "grad_norm": 2.531082669247976, + "learning_rate": 1.2496532921421781e-06, + "loss": 0.1717, + "step": 234 + }, + { + "epoch": 0.053469852104664393, + "grad_norm": 2.7509933573597896, + "learning_rate": 1.2496503100008704e-06, + "loss": 0.2469, + "step": 235 + }, + { + "epoch": 0.05369738339021615, + "grad_norm": 3.5155091690123923, + "learning_rate": 1.249647315092854e-06, + "loss": 0.1314, + "step": 236 + }, + { + "epoch": 0.05392491467576792, + "grad_norm": 3.2336581137529135, + "learning_rate": 1.2496443074181905e-06, + "loss": 0.1479, + "step": 237 + }, + { + "epoch": 0.054152445961319684, + "grad_norm": 1.9727228995954271, + "learning_rate": 1.2496412869769415e-06, + "loss": 0.1072, + "step": 238 + }, + { + "epoch": 0.05437997724687144, + "grad_norm": 9.030280638699303, + "learning_rate": 1.2496382537691686e-06, + "loss": 0.1993, + "step": 239 + }, + { + "epoch": 0.05460750853242321, + "grad_norm": 2.012237999972146, + "learning_rate": 1.2496352077949336e-06, + "loss": 0.2021, + "step": 240 + }, + { + "epoch": 0.054835039817974975, + "grad_norm": 2.875480352440569, + "learning_rate": 1.249632149054299e-06, + "loss": 0.1071, + "step": 241 + }, + { + "epoch": 0.05506257110352673, + "grad_norm": 3.027078266755971, + "learning_rate": 1.249629077547327e-06, + "loss": 0.2081, + "step": 242 + }, + { + "epoch": 0.0552901023890785, + "grad_norm": 3.212706521917931, + "learning_rate": 1.2496259932740813e-06, + "loss": 0.235, + "step": 243 + }, + { + "epoch": 0.055517633674630265, + "grad_norm": 1.5899391805286471, + "learning_rate": 1.2496228962346236e-06, + "loss": 0.1498, + "step": 244 + }, + { + "epoch": 0.055745164960182024, + "grad_norm": 2.252897408154709, + "learning_rate": 1.249619786429018e-06, + "loss": 0.0875, + "step": 245 + }, + { + "epoch": 0.05597269624573379, + "grad_norm": 1.7851217439709355, + "learning_rate": 1.2496166638573278e-06, + "loss": 0.163, + "step": 246 + }, + { + "epoch": 0.05620022753128555, + "grad_norm": 4.076208180076855, + "learning_rate": 1.2496135285196172e-06, + "loss": 0.1298, + "step": 247 + }, + { + "epoch": 0.056427758816837315, + "grad_norm": 8.235783447081577, + "learning_rate": 1.2496103804159497e-06, + "loss": 0.1994, + "step": 248 + }, + { + "epoch": 0.05665529010238908, + "grad_norm": 4.224863516307238, + "learning_rate": 1.2496072195463904e-06, + "loss": 0.1917, + "step": 249 + }, + { + "epoch": 0.05688282138794084, + "grad_norm": 2.600108393969465, + "learning_rate": 1.249604045911003e-06, + "loss": 0.1728, + "step": 250 + }, + { + "epoch": 0.057110352673492605, + "grad_norm": 4.193154020881599, + "learning_rate": 1.249600859509853e-06, + "loss": 0.1469, + "step": 251 + }, + { + "epoch": 0.05733788395904437, + "grad_norm": 3.3023049454358957, + "learning_rate": 1.2495976603430054e-06, + "loss": 0.3015, + "step": 252 + }, + { + "epoch": 0.05756541524459613, + "grad_norm": 2.1335803404002815, + "learning_rate": 1.2495944484105254e-06, + "loss": 0.1237, + "step": 253 + }, + { + "epoch": 0.057792946530147896, + "grad_norm": 5.342229724882705, + "learning_rate": 1.2495912237124787e-06, + "loss": 0.1134, + "step": 254 + }, + { + "epoch": 0.05802047781569966, + "grad_norm": 4.8799722775641765, + "learning_rate": 1.2495879862489312e-06, + "loss": 0.1865, + "step": 255 + }, + { + "epoch": 0.05824800910125142, + "grad_norm": 5.731543371657422, + "learning_rate": 1.2495847360199495e-06, + "loss": 0.2008, + "step": 256 + }, + { + "epoch": 0.058475540386803186, + "grad_norm": 2.313924736001694, + "learning_rate": 1.2495814730255993e-06, + "loss": 0.1361, + "step": 257 + }, + { + "epoch": 0.05870307167235495, + "grad_norm": 1.3942403935107488, + "learning_rate": 1.2495781972659479e-06, + "loss": 0.1103, + "step": 258 + }, + { + "epoch": 0.05893060295790671, + "grad_norm": 1.8635600367271647, + "learning_rate": 1.2495749087410618e-06, + "loss": 0.1736, + "step": 259 + }, + { + "epoch": 0.05915813424345848, + "grad_norm": 3.934800507138662, + "learning_rate": 1.2495716074510087e-06, + "loss": 0.1706, + "step": 260 + }, + { + "epoch": 0.059385665529010236, + "grad_norm": 7.067913001607123, + "learning_rate": 1.2495682933958555e-06, + "loss": 0.1963, + "step": 261 + }, + { + "epoch": 0.059613196814562, + "grad_norm": 2.692944909371077, + "learning_rate": 1.2495649665756705e-06, + "loss": 0.2486, + "step": 262 + }, + { + "epoch": 0.05984072810011377, + "grad_norm": 2.4930462253175305, + "learning_rate": 1.2495616269905212e-06, + "loss": 0.1447, + "step": 263 + }, + { + "epoch": 0.060068259385665526, + "grad_norm": 1.7948148568482771, + "learning_rate": 1.2495582746404762e-06, + "loss": 0.0994, + "step": 264 + }, + { + "epoch": 0.06029579067121729, + "grad_norm": 2.021876252112372, + "learning_rate": 1.249554909525604e-06, + "loss": 0.1386, + "step": 265 + }, + { + "epoch": 0.06052332195676906, + "grad_norm": 2.069960058640526, + "learning_rate": 1.249551531645973e-06, + "loss": 0.1866, + "step": 266 + }, + { + "epoch": 0.06075085324232082, + "grad_norm": 8.549797598789278, + "learning_rate": 1.2495481410016527e-06, + "loss": 0.3426, + "step": 267 + }, + { + "epoch": 0.06097838452787258, + "grad_norm": 6.033524800668443, + "learning_rate": 1.2495447375927122e-06, + "loss": 0.2039, + "step": 268 + }, + { + "epoch": 0.06120591581342435, + "grad_norm": 3.3984019223631656, + "learning_rate": 1.2495413214192209e-06, + "loss": 0.1562, + "step": 269 + }, + { + "epoch": 0.06143344709897611, + "grad_norm": 2.78909231360363, + "learning_rate": 1.2495378924812486e-06, + "loss": 0.2056, + "step": 270 + }, + { + "epoch": 0.06166097838452787, + "grad_norm": 5.781877877875473, + "learning_rate": 1.2495344507788662e-06, + "loss": 0.2293, + "step": 271 + }, + { + "epoch": 0.06188850967007964, + "grad_norm": 2.3180826263300607, + "learning_rate": 1.249530996312143e-06, + "loss": 0.1489, + "step": 272 + }, + { + "epoch": 0.0621160409556314, + "grad_norm": 7.2617460886104475, + "learning_rate": 1.2495275290811499e-06, + "loss": 0.2172, + "step": 273 + }, + { + "epoch": 0.062343572241183164, + "grad_norm": 2.1316035699431173, + "learning_rate": 1.2495240490859581e-06, + "loss": 0.2176, + "step": 274 + }, + { + "epoch": 0.06257110352673492, + "grad_norm": 2.5542857532037235, + "learning_rate": 1.2495205563266384e-06, + "loss": 0.1521, + "step": 275 + }, + { + "epoch": 0.06279863481228669, + "grad_norm": 3.5696131149812644, + "learning_rate": 1.2495170508032624e-06, + "loss": 0.2817, + "step": 276 + }, + { + "epoch": 0.06302616609783845, + "grad_norm": 4.055804927691344, + "learning_rate": 1.2495135325159015e-06, + "loss": 0.1484, + "step": 277 + }, + { + "epoch": 0.06325369738339022, + "grad_norm": 2.830287596995614, + "learning_rate": 1.2495100014646277e-06, + "loss": 0.1714, + "step": 278 + }, + { + "epoch": 0.06348122866894199, + "grad_norm": 5.2323794095215685, + "learning_rate": 1.2495064576495134e-06, + "loss": 0.3121, + "step": 279 + }, + { + "epoch": 0.06370875995449374, + "grad_norm": 2.500465425444752, + "learning_rate": 1.2495029010706306e-06, + "loss": 0.1005, + "step": 280 + }, + { + "epoch": 0.0639362912400455, + "grad_norm": 2.7474098845449433, + "learning_rate": 1.2494993317280524e-06, + "loss": 0.1755, + "step": 281 + }, + { + "epoch": 0.06416382252559727, + "grad_norm": 3.1110646620479967, + "learning_rate": 1.2494957496218516e-06, + "loss": 0.194, + "step": 282 + }, + { + "epoch": 0.06439135381114904, + "grad_norm": 1.162926170243262, + "learning_rate": 1.2494921547521013e-06, + "loss": 0.1667, + "step": 283 + }, + { + "epoch": 0.0646188850967008, + "grad_norm": 2.034958588386092, + "learning_rate": 1.249488547118875e-06, + "loss": 0.1031, + "step": 284 + }, + { + "epoch": 0.06484641638225255, + "grad_norm": 2.8585727096596214, + "learning_rate": 1.2494849267222466e-06, + "loss": 0.1199, + "step": 285 + }, + { + "epoch": 0.06507394766780432, + "grad_norm": 2.3756686418598916, + "learning_rate": 1.24948129356229e-06, + "loss": 0.203, + "step": 286 + }, + { + "epoch": 0.06530147895335608, + "grad_norm": 6.080154909085321, + "learning_rate": 1.2494776476390793e-06, + "loss": 0.2723, + "step": 287 + }, + { + "epoch": 0.06552901023890785, + "grad_norm": 3.1578927707769684, + "learning_rate": 1.2494739889526894e-06, + "loss": 0.1218, + "step": 288 + }, + { + "epoch": 0.06575654152445962, + "grad_norm": 2.7745317736308373, + "learning_rate": 1.2494703175031946e-06, + "loss": 0.194, + "step": 289 + }, + { + "epoch": 0.06598407281001138, + "grad_norm": 2.872306438815133, + "learning_rate": 1.2494666332906702e-06, + "loss": 0.143, + "step": 290 + }, + { + "epoch": 0.06621160409556313, + "grad_norm": 2.2661659384858277, + "learning_rate": 1.2494629363151916e-06, + "loss": 0.1497, + "step": 291 + }, + { + "epoch": 0.0664391353811149, + "grad_norm": 2.7978250826969586, + "learning_rate": 1.2494592265768343e-06, + "loss": 0.1817, + "step": 292 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 2.9435086338480496, + "learning_rate": 1.2494555040756737e-06, + "loss": 0.1195, + "step": 293 + }, + { + "epoch": 0.06689419795221843, + "grad_norm": 2.525871560805257, + "learning_rate": 1.2494517688117867e-06, + "loss": 0.2054, + "step": 294 + }, + { + "epoch": 0.0671217292377702, + "grad_norm": 3.3530486331117126, + "learning_rate": 1.2494480207852489e-06, + "loss": 0.1186, + "step": 295 + }, + { + "epoch": 0.06734926052332196, + "grad_norm": 3.791549905681902, + "learning_rate": 1.249444259996137e-06, + "loss": 0.1616, + "step": 296 + }, + { + "epoch": 0.06757679180887372, + "grad_norm": 2.3603348366809236, + "learning_rate": 1.2494404864445284e-06, + "loss": 0.1392, + "step": 297 + }, + { + "epoch": 0.06780432309442548, + "grad_norm": 2.161901751847752, + "learning_rate": 1.2494367001304996e-06, + "loss": 0.1548, + "step": 298 + }, + { + "epoch": 0.06803185437997725, + "grad_norm": 2.3978175716297634, + "learning_rate": 1.2494329010541284e-06, + "loss": 0.1634, + "step": 299 + }, + { + "epoch": 0.06825938566552901, + "grad_norm": 5.413503442113624, + "learning_rate": 1.2494290892154922e-06, + "loss": 0.2876, + "step": 300 + }, + { + "epoch": 0.06848691695108078, + "grad_norm": 1.904095426332445, + "learning_rate": 1.2494252646146692e-06, + "loss": 0.1942, + "step": 301 + }, + { + "epoch": 0.06871444823663253, + "grad_norm": 2.0091735504190504, + "learning_rate": 1.249421427251737e-06, + "loss": 0.1403, + "step": 302 + }, + { + "epoch": 0.0689419795221843, + "grad_norm": 2.6001586830103123, + "learning_rate": 1.2494175771267748e-06, + "loss": 0.2376, + "step": 303 + }, + { + "epoch": 0.06916951080773606, + "grad_norm": 2.8009063420794265, + "learning_rate": 1.2494137142398607e-06, + "loss": 0.1877, + "step": 304 + }, + { + "epoch": 0.06939704209328783, + "grad_norm": 2.0648464255318517, + "learning_rate": 1.249409838591074e-06, + "loss": 0.1462, + "step": 305 + }, + { + "epoch": 0.0696245733788396, + "grad_norm": 2.6396516124770657, + "learning_rate": 1.2494059501804937e-06, + "loss": 0.256, + "step": 306 + }, + { + "epoch": 0.06985210466439136, + "grad_norm": 2.9901343092043837, + "learning_rate": 1.249402049008199e-06, + "loss": 0.1483, + "step": 307 + }, + { + "epoch": 0.07007963594994311, + "grad_norm": 3.0343546498099356, + "learning_rate": 1.2493981350742704e-06, + "loss": 0.1561, + "step": 308 + }, + { + "epoch": 0.07030716723549488, + "grad_norm": 3.2148889672864636, + "learning_rate": 1.2493942083787872e-06, + "loss": 0.1856, + "step": 309 + }, + { + "epoch": 0.07053469852104664, + "grad_norm": 2.795539793994042, + "learning_rate": 1.2493902689218299e-06, + "loss": 0.1294, + "step": 310 + }, + { + "epoch": 0.07076222980659841, + "grad_norm": 2.1866434219410307, + "learning_rate": 1.249386316703479e-06, + "loss": 0.1789, + "step": 311 + }, + { + "epoch": 0.07098976109215017, + "grad_norm": 4.93386744278198, + "learning_rate": 1.2493823517238154e-06, + "loss": 0.1529, + "step": 312 + }, + { + "epoch": 0.07121729237770194, + "grad_norm": 2.127480030167813, + "learning_rate": 1.2493783739829202e-06, + "loss": 0.1593, + "step": 313 + }, + { + "epoch": 0.07144482366325369, + "grad_norm": 2.565861378561538, + "learning_rate": 1.2493743834808741e-06, + "loss": 0.1442, + "step": 314 + }, + { + "epoch": 0.07167235494880546, + "grad_norm": 3.129314599970171, + "learning_rate": 1.2493703802177594e-06, + "loss": 0.1936, + "step": 315 + }, + { + "epoch": 0.07189988623435722, + "grad_norm": 4.26603531282599, + "learning_rate": 1.2493663641936576e-06, + "loss": 0.1343, + "step": 316 + }, + { + "epoch": 0.07212741751990899, + "grad_norm": 1.778626655821605, + "learning_rate": 1.2493623354086507e-06, + "loss": 0.1751, + "step": 317 + }, + { + "epoch": 0.07235494880546076, + "grad_norm": 2.576979617695665, + "learning_rate": 1.2493582938628213e-06, + "loss": 0.1405, + "step": 318 + }, + { + "epoch": 0.07258248009101251, + "grad_norm": 2.528946823784448, + "learning_rate": 1.2493542395562516e-06, + "loss": 0.1207, + "step": 319 + }, + { + "epoch": 0.07281001137656427, + "grad_norm": 1.7105561186222351, + "learning_rate": 1.2493501724890247e-06, + "loss": 0.1067, + "step": 320 + }, + { + "epoch": 0.07303754266211604, + "grad_norm": 3.0021555230652144, + "learning_rate": 1.249346092661224e-06, + "loss": 0.1769, + "step": 321 + }, + { + "epoch": 0.0732650739476678, + "grad_norm": 3.2473648686733787, + "learning_rate": 1.2493420000729322e-06, + "loss": 0.1797, + "step": 322 + }, + { + "epoch": 0.07349260523321957, + "grad_norm": 2.9141882965376644, + "learning_rate": 1.2493378947242336e-06, + "loss": 0.1936, + "step": 323 + }, + { + "epoch": 0.07372013651877134, + "grad_norm": 2.139000059452357, + "learning_rate": 1.2493337766152119e-06, + "loss": 0.1323, + "step": 324 + }, + { + "epoch": 0.07394766780432309, + "grad_norm": 3.7562365963393773, + "learning_rate": 1.249329645745951e-06, + "loss": 0.1521, + "step": 325 + }, + { + "epoch": 0.07417519908987485, + "grad_norm": 3.1427328506374343, + "learning_rate": 1.2493255021165357e-06, + "loss": 0.1426, + "step": 326 + }, + { + "epoch": 0.07440273037542662, + "grad_norm": 2.5928821859504225, + "learning_rate": 1.2493213457270504e-06, + "loss": 0.1492, + "step": 327 + }, + { + "epoch": 0.07463026166097839, + "grad_norm": 2.6116349350740773, + "learning_rate": 1.2493171765775804e-06, + "loss": 0.1079, + "step": 328 + }, + { + "epoch": 0.07485779294653015, + "grad_norm": 2.5063754100070796, + "learning_rate": 1.2493129946682107e-06, + "loss": 0.1449, + "step": 329 + }, + { + "epoch": 0.07508532423208192, + "grad_norm": 2.7029390289735247, + "learning_rate": 1.2493087999990263e-06, + "loss": 0.2012, + "step": 330 + }, + { + "epoch": 0.07531285551763367, + "grad_norm": 3.168250561710959, + "learning_rate": 1.249304592570114e-06, + "loss": 0.135, + "step": 331 + }, + { + "epoch": 0.07554038680318544, + "grad_norm": 3.358825282989208, + "learning_rate": 1.2493003723815588e-06, + "loss": 0.202, + "step": 332 + }, + { + "epoch": 0.0757679180887372, + "grad_norm": 3.4712230061099367, + "learning_rate": 1.2492961394334474e-06, + "loss": 0.1796, + "step": 333 + }, + { + "epoch": 0.07599544937428897, + "grad_norm": 2.7447934095202586, + "learning_rate": 1.2492918937258663e-06, + "loss": 0.1529, + "step": 334 + }, + { + "epoch": 0.07622298065984073, + "grad_norm": 4.884489478774658, + "learning_rate": 1.2492876352589024e-06, + "loss": 0.1983, + "step": 335 + }, + { + "epoch": 0.07645051194539249, + "grad_norm": 11.840111431867928, + "learning_rate": 1.2492833640326424e-06, + "loss": 0.1701, + "step": 336 + }, + { + "epoch": 0.07667804323094425, + "grad_norm": 3.6493332372043032, + "learning_rate": 1.2492790800471738e-06, + "loss": 0.1894, + "step": 337 + }, + { + "epoch": 0.07690557451649602, + "grad_norm": 2.2273861687776657, + "learning_rate": 1.249274783302584e-06, + "loss": 0.1168, + "step": 338 + }, + { + "epoch": 0.07713310580204778, + "grad_norm": 3.0155968100929016, + "learning_rate": 1.249270473798961e-06, + "loss": 0.1877, + "step": 339 + }, + { + "epoch": 0.07736063708759955, + "grad_norm": 3.6811309004263197, + "learning_rate": 1.249266151536393e-06, + "loss": 0.1841, + "step": 340 + }, + { + "epoch": 0.07758816837315131, + "grad_norm": 3.3318670131929355, + "learning_rate": 1.249261816514968e-06, + "loss": 0.1425, + "step": 341 + }, + { + "epoch": 0.07781569965870307, + "grad_norm": 1.542707864707429, + "learning_rate": 1.2492574687347747e-06, + "loss": 0.0954, + "step": 342 + }, + { + "epoch": 0.07804323094425483, + "grad_norm": 5.219514434003638, + "learning_rate": 1.249253108195902e-06, + "loss": 0.1523, + "step": 343 + }, + { + "epoch": 0.0782707622298066, + "grad_norm": 2.685054702258556, + "learning_rate": 1.249248734898439e-06, + "loss": 0.1932, + "step": 344 + }, + { + "epoch": 0.07849829351535836, + "grad_norm": 3.782143044532345, + "learning_rate": 1.2492443488424753e-06, + "loss": 0.1782, + "step": 345 + }, + { + "epoch": 0.07872582480091013, + "grad_norm": 2.987081909452687, + "learning_rate": 1.2492399500281002e-06, + "loss": 0.1174, + "step": 346 + }, + { + "epoch": 0.07895335608646188, + "grad_norm": 2.4163752446451667, + "learning_rate": 1.2492355384554039e-06, + "loss": 0.1864, + "step": 347 + }, + { + "epoch": 0.07918088737201365, + "grad_norm": 2.881696468020635, + "learning_rate": 1.2492311141244764e-06, + "loss": 0.1509, + "step": 348 + }, + { + "epoch": 0.07940841865756541, + "grad_norm": 4.2425549257036925, + "learning_rate": 1.249226677035408e-06, + "loss": 0.1384, + "step": 349 + }, + { + "epoch": 0.07963594994311718, + "grad_norm": 2.999886291999185, + "learning_rate": 1.2492222271882896e-06, + "loss": 0.1631, + "step": 350 + }, + { + "epoch": 0.07986348122866894, + "grad_norm": 4.681484131322112, + "learning_rate": 1.2492177645832121e-06, + "loss": 0.1752, + "step": 351 + }, + { + "epoch": 0.08009101251422071, + "grad_norm": 2.921704965075288, + "learning_rate": 1.2492132892202668e-06, + "loss": 0.1486, + "step": 352 + }, + { + "epoch": 0.08031854379977246, + "grad_norm": 5.592595582830648, + "learning_rate": 1.2492088010995449e-06, + "loss": 0.2707, + "step": 353 + }, + { + "epoch": 0.08054607508532423, + "grad_norm": 2.9440013961704823, + "learning_rate": 1.2492043002211385e-06, + "loss": 0.2054, + "step": 354 + }, + { + "epoch": 0.080773606370876, + "grad_norm": 2.2221784159000006, + "learning_rate": 1.2491997865851392e-06, + "loss": 0.1373, + "step": 355 + }, + { + "epoch": 0.08100113765642776, + "grad_norm": 1.7381570114572884, + "learning_rate": 1.2491952601916395e-06, + "loss": 0.0858, + "step": 356 + }, + { + "epoch": 0.08122866894197953, + "grad_norm": 2.930524510809462, + "learning_rate": 1.2491907210407319e-06, + "loss": 0.2179, + "step": 357 + }, + { + "epoch": 0.08145620022753129, + "grad_norm": 1.329914120982883, + "learning_rate": 1.249186169132509e-06, + "loss": 0.1839, + "step": 358 + }, + { + "epoch": 0.08168373151308304, + "grad_norm": 4.774637200381304, + "learning_rate": 1.2491816044670641e-06, + "loss": 0.1266, + "step": 359 + }, + { + "epoch": 0.08191126279863481, + "grad_norm": 3.0085506218930442, + "learning_rate": 1.24917702704449e-06, + "loss": 0.1813, + "step": 360 + }, + { + "epoch": 0.08213879408418658, + "grad_norm": 2.683588571853357, + "learning_rate": 1.2491724368648808e-06, + "loss": 0.1182, + "step": 361 + }, + { + "epoch": 0.08236632536973834, + "grad_norm": 4.142859587264675, + "learning_rate": 1.2491678339283303e-06, + "loss": 0.1213, + "step": 362 + }, + { + "epoch": 0.08259385665529011, + "grad_norm": 2.266538556877378, + "learning_rate": 1.249163218234932e-06, + "loss": 0.1669, + "step": 363 + }, + { + "epoch": 0.08282138794084186, + "grad_norm": 3.340308786527698, + "learning_rate": 1.249158589784781e-06, + "loss": 0.1449, + "step": 364 + }, + { + "epoch": 0.08304891922639362, + "grad_norm": 3.600922134824311, + "learning_rate": 1.2491539485779713e-06, + "loss": 0.1934, + "step": 365 + }, + { + "epoch": 0.08327645051194539, + "grad_norm": 2.5603148777390796, + "learning_rate": 1.2491492946145981e-06, + "loss": 0.1215, + "step": 366 + }, + { + "epoch": 0.08350398179749716, + "grad_norm": 1.4306937563740754, + "learning_rate": 1.2491446278947563e-06, + "loss": 0.1218, + "step": 367 + }, + { + "epoch": 0.08373151308304892, + "grad_norm": 6.514691076015768, + "learning_rate": 1.2491399484185413e-06, + "loss": 0.1723, + "step": 368 + }, + { + "epoch": 0.08395904436860069, + "grad_norm": 2.1513333963844214, + "learning_rate": 1.249135256186049e-06, + "loss": 0.242, + "step": 369 + }, + { + "epoch": 0.08418657565415244, + "grad_norm": 1.697947937157404, + "learning_rate": 1.249130551197375e-06, + "loss": 0.1045, + "step": 370 + }, + { + "epoch": 0.0844141069397042, + "grad_norm": 1.4338559958770856, + "learning_rate": 1.2491258334526155e-06, + "loss": 0.1671, + "step": 371 + }, + { + "epoch": 0.08464163822525597, + "grad_norm": 2.7532236684188773, + "learning_rate": 1.2491211029518672e-06, + "loss": 0.1034, + "step": 372 + }, + { + "epoch": 0.08486916951080774, + "grad_norm": 2.665642318134447, + "learning_rate": 1.2491163596952264e-06, + "loss": 0.1737, + "step": 373 + }, + { + "epoch": 0.0850967007963595, + "grad_norm": 1.5130437493435105, + "learning_rate": 1.2491116036827902e-06, + "loss": 0.0804, + "step": 374 + }, + { + "epoch": 0.08532423208191127, + "grad_norm": 1.3642320073282543, + "learning_rate": 1.2491068349146559e-06, + "loss": 0.1428, + "step": 375 + }, + { + "epoch": 0.08555176336746302, + "grad_norm": 2.1006895230964444, + "learning_rate": 1.249102053390921e-06, + "loss": 0.2759, + "step": 376 + }, + { + "epoch": 0.08577929465301479, + "grad_norm": 1.5335225229109515, + "learning_rate": 1.249097259111683e-06, + "loss": 0.1836, + "step": 377 + }, + { + "epoch": 0.08600682593856655, + "grad_norm": 4.09523641946509, + "learning_rate": 1.24909245207704e-06, + "loss": 0.2771, + "step": 378 + }, + { + "epoch": 0.08623435722411832, + "grad_norm": 2.2658393838403477, + "learning_rate": 1.2490876322870904e-06, + "loss": 0.1815, + "step": 379 + }, + { + "epoch": 0.08646188850967008, + "grad_norm": 3.053596441038967, + "learning_rate": 1.2490827997419325e-06, + "loss": 0.1183, + "step": 380 + }, + { + "epoch": 0.08668941979522184, + "grad_norm": 2.9366601199125153, + "learning_rate": 1.249077954441665e-06, + "loss": 0.1738, + "step": 381 + }, + { + "epoch": 0.0869169510807736, + "grad_norm": 1.9726593738442935, + "learning_rate": 1.249073096386387e-06, + "loss": 0.1427, + "step": 382 + }, + { + "epoch": 0.08714448236632537, + "grad_norm": 2.8452874204285985, + "learning_rate": 1.249068225576198e-06, + "loss": 0.2767, + "step": 383 + }, + { + "epoch": 0.08737201365187713, + "grad_norm": 4.292343700500067, + "learning_rate": 1.2490633420111974e-06, + "loss": 0.127, + "step": 384 + }, + { + "epoch": 0.0875995449374289, + "grad_norm": 4.105827667785258, + "learning_rate": 1.249058445691485e-06, + "loss": 0.1639, + "step": 385 + }, + { + "epoch": 0.08782707622298067, + "grad_norm": 4.310698395146462, + "learning_rate": 1.2490535366171607e-06, + "loss": 0.1289, + "step": 386 + }, + { + "epoch": 0.08805460750853242, + "grad_norm": 3.5788743602832795, + "learning_rate": 1.249048614788325e-06, + "loss": 0.1804, + "step": 387 + }, + { + "epoch": 0.08828213879408418, + "grad_norm": 2.6616942664445413, + "learning_rate": 1.249043680205079e-06, + "loss": 0.144, + "step": 388 + }, + { + "epoch": 0.08850967007963595, + "grad_norm": 2.989163897960478, + "learning_rate": 1.2490387328675226e-06, + "loss": 0.2016, + "step": 389 + }, + { + "epoch": 0.08873720136518772, + "grad_norm": 4.587176162210019, + "learning_rate": 1.2490337727757576e-06, + "loss": 0.2284, + "step": 390 + }, + { + "epoch": 0.08896473265073948, + "grad_norm": 2.794747809075531, + "learning_rate": 1.249028799929885e-06, + "loss": 0.2002, + "step": 391 + }, + { + "epoch": 0.08919226393629125, + "grad_norm": 2.0197262567230276, + "learning_rate": 1.2490238143300066e-06, + "loss": 0.1143, + "step": 392 + }, + { + "epoch": 0.089419795221843, + "grad_norm": 3.184614553894442, + "learning_rate": 1.2490188159762243e-06, + "loss": 0.1913, + "step": 393 + }, + { + "epoch": 0.08964732650739476, + "grad_norm": 2.518010477046937, + "learning_rate": 1.2490138048686405e-06, + "loss": 0.1981, + "step": 394 + }, + { + "epoch": 0.08987485779294653, + "grad_norm": 5.010077865699377, + "learning_rate": 1.249008781007357e-06, + "loss": 0.1423, + "step": 395 + }, + { + "epoch": 0.0901023890784983, + "grad_norm": 1.420461399090385, + "learning_rate": 1.2490037443924768e-06, + "loss": 0.1363, + "step": 396 + }, + { + "epoch": 0.09032992036405006, + "grad_norm": 2.5810652557759863, + "learning_rate": 1.2489986950241032e-06, + "loss": 0.1002, + "step": 397 + }, + { + "epoch": 0.09055745164960181, + "grad_norm": 1.8725706501255737, + "learning_rate": 1.2489936329023387e-06, + "loss": 0.1974, + "step": 398 + }, + { + "epoch": 0.09078498293515358, + "grad_norm": 3.2869147678539554, + "learning_rate": 1.2489885580272874e-06, + "loss": 0.1629, + "step": 399 + }, + { + "epoch": 0.09101251422070535, + "grad_norm": 1.7546095764098488, + "learning_rate": 1.2489834703990527e-06, + "loss": 0.1326, + "step": 400 + }, + { + "epoch": 0.09124004550625711, + "grad_norm": 3.0930989898336407, + "learning_rate": 1.2489783700177385e-06, + "loss": 0.2565, + "step": 401 + }, + { + "epoch": 0.09146757679180888, + "grad_norm": 4.363886237065706, + "learning_rate": 1.2489732568834492e-06, + "loss": 0.1425, + "step": 402 + }, + { + "epoch": 0.09169510807736064, + "grad_norm": 2.141413419957395, + "learning_rate": 1.2489681309962895e-06, + "loss": 0.1458, + "step": 403 + }, + { + "epoch": 0.0919226393629124, + "grad_norm": 4.5478526718009205, + "learning_rate": 1.2489629923563637e-06, + "loss": 0.1655, + "step": 404 + }, + { + "epoch": 0.09215017064846416, + "grad_norm": 5.253865415098631, + "learning_rate": 1.2489578409637774e-06, + "loss": 0.2702, + "step": 405 + }, + { + "epoch": 0.09237770193401593, + "grad_norm": 6.114423825591168, + "learning_rate": 1.2489526768186352e-06, + "loss": 0.1364, + "step": 406 + }, + { + "epoch": 0.09260523321956769, + "grad_norm": 2.4260049242900505, + "learning_rate": 1.2489474999210434e-06, + "loss": 0.1573, + "step": 407 + }, + { + "epoch": 0.09283276450511946, + "grad_norm": 6.696614155480106, + "learning_rate": 1.2489423102711068e-06, + "loss": 0.2365, + "step": 408 + }, + { + "epoch": 0.09306029579067122, + "grad_norm": 3.4093511525509848, + "learning_rate": 1.2489371078689326e-06, + "loss": 0.1552, + "step": 409 + }, + { + "epoch": 0.09328782707622298, + "grad_norm": 3.512014449058475, + "learning_rate": 1.2489318927146263e-06, + "loss": 0.1392, + "step": 410 + }, + { + "epoch": 0.09351535836177474, + "grad_norm": 4.385040034701264, + "learning_rate": 1.2489266648082951e-06, + "loss": 0.1184, + "step": 411 + }, + { + "epoch": 0.09374288964732651, + "grad_norm": 11.030038016242493, + "learning_rate": 1.2489214241500453e-06, + "loss": 0.2445, + "step": 412 + }, + { + "epoch": 0.09397042093287827, + "grad_norm": 3.8160488235069487, + "learning_rate": 1.2489161707399843e-06, + "loss": 0.2422, + "step": 413 + }, + { + "epoch": 0.09419795221843004, + "grad_norm": 2.5154081754915554, + "learning_rate": 1.2489109045782194e-06, + "loss": 0.1284, + "step": 414 + }, + { + "epoch": 0.09442548350398179, + "grad_norm": 2.186602019326803, + "learning_rate": 1.2489056256648582e-06, + "loss": 0.1387, + "step": 415 + }, + { + "epoch": 0.09465301478953356, + "grad_norm": 3.1244704898712223, + "learning_rate": 1.2489003340000089e-06, + "loss": 0.2695, + "step": 416 + }, + { + "epoch": 0.09488054607508532, + "grad_norm": 1.9015703147093774, + "learning_rate": 1.2488950295837792e-06, + "loss": 0.2029, + "step": 417 + }, + { + "epoch": 0.09510807736063709, + "grad_norm": 3.2255120343889523, + "learning_rate": 1.2488897124162777e-06, + "loss": 0.1708, + "step": 418 + }, + { + "epoch": 0.09533560864618885, + "grad_norm": 2.4361554392110354, + "learning_rate": 1.248884382497613e-06, + "loss": 0.237, + "step": 419 + }, + { + "epoch": 0.09556313993174062, + "grad_norm": 5.44904137240634, + "learning_rate": 1.2488790398278941e-06, + "loss": 0.2259, + "step": 420 + }, + { + "epoch": 0.09579067121729237, + "grad_norm": 2.5542725247665725, + "learning_rate": 1.2488736844072304e-06, + "loss": 0.1706, + "step": 421 + }, + { + "epoch": 0.09601820250284414, + "grad_norm": 3.3440828684749837, + "learning_rate": 1.248868316235731e-06, + "loss": 0.166, + "step": 422 + }, + { + "epoch": 0.0962457337883959, + "grad_norm": 2.837980086891423, + "learning_rate": 1.2488629353135059e-06, + "loss": 0.1974, + "step": 423 + }, + { + "epoch": 0.09647326507394767, + "grad_norm": 3.0821716156484413, + "learning_rate": 1.2488575416406649e-06, + "loss": 0.2029, + "step": 424 + }, + { + "epoch": 0.09670079635949944, + "grad_norm": 4.11082660525738, + "learning_rate": 1.2488521352173183e-06, + "loss": 0.1288, + "step": 425 + }, + { + "epoch": 0.09692832764505119, + "grad_norm": 2.792375492899653, + "learning_rate": 1.2488467160435765e-06, + "loss": 0.1318, + "step": 426 + }, + { + "epoch": 0.09715585893060295, + "grad_norm": 2.54978143800456, + "learning_rate": 1.2488412841195505e-06, + "loss": 0.2235, + "step": 427 + }, + { + "epoch": 0.09738339021615472, + "grad_norm": 1.8685713785223814, + "learning_rate": 1.2488358394453512e-06, + "loss": 0.1018, + "step": 428 + }, + { + "epoch": 0.09761092150170649, + "grad_norm": 2.19856597261874, + "learning_rate": 1.2488303820210897e-06, + "loss": 0.0955, + "step": 429 + }, + { + "epoch": 0.09783845278725825, + "grad_norm": 2.756460140283964, + "learning_rate": 1.2488249118468776e-06, + "loss": 0.161, + "step": 430 + }, + { + "epoch": 0.09806598407281002, + "grad_norm": 3.1658885878432446, + "learning_rate": 1.248819428922827e-06, + "loss": 0.1707, + "step": 431 + }, + { + "epoch": 0.09829351535836177, + "grad_norm": 3.574624372801338, + "learning_rate": 1.2488139332490495e-06, + "loss": 0.2412, + "step": 432 + }, + { + "epoch": 0.09852104664391353, + "grad_norm": 2.63473599121384, + "learning_rate": 1.248808424825658e-06, + "loss": 0.1195, + "step": 433 + }, + { + "epoch": 0.0987485779294653, + "grad_norm": 3.928170371490413, + "learning_rate": 1.2488029036527645e-06, + "loss": 0.1478, + "step": 434 + }, + { + "epoch": 0.09897610921501707, + "grad_norm": 2.0459697190569583, + "learning_rate": 1.2487973697304822e-06, + "loss": 0.0868, + "step": 435 + }, + { + "epoch": 0.09920364050056883, + "grad_norm": 2.2037192709560283, + "learning_rate": 1.248791823058924e-06, + "loss": 0.1911, + "step": 436 + }, + { + "epoch": 0.0994311717861206, + "grad_norm": 3.549121049187713, + "learning_rate": 1.2487862636382034e-06, + "loss": 0.1218, + "step": 437 + }, + { + "epoch": 0.09965870307167235, + "grad_norm": 1.4303061363329783, + "learning_rate": 1.248780691468434e-06, + "loss": 0.1116, + "step": 438 + }, + { + "epoch": 0.09988623435722412, + "grad_norm": 3.8141735085769746, + "learning_rate": 1.2487751065497296e-06, + "loss": 0.2179, + "step": 439 + }, + { + "epoch": 0.10011376564277588, + "grad_norm": 2.6329169063924986, + "learning_rate": 1.2487695088822044e-06, + "loss": 0.1492, + "step": 440 + }, + { + "epoch": 0.10034129692832765, + "grad_norm": 2.8773216855185635, + "learning_rate": 1.2487638984659729e-06, + "loss": 0.0988, + "step": 441 + }, + { + "epoch": 0.10056882821387941, + "grad_norm": 2.5448731857786284, + "learning_rate": 1.2487582753011496e-06, + "loss": 0.1023, + "step": 442 + }, + { + "epoch": 0.10079635949943117, + "grad_norm": 2.4399816480891445, + "learning_rate": 1.2487526393878497e-06, + "loss": 0.2015, + "step": 443 + }, + { + "epoch": 0.10102389078498293, + "grad_norm": 2.056202357783669, + "learning_rate": 1.248746990726188e-06, + "loss": 0.1376, + "step": 444 + }, + { + "epoch": 0.1012514220705347, + "grad_norm": 2.489946255383071, + "learning_rate": 1.2487413293162803e-06, + "loss": 0.1389, + "step": 445 + }, + { + "epoch": 0.10147895335608646, + "grad_norm": 2.3660691937468807, + "learning_rate": 1.2487356551582421e-06, + "loss": 0.2235, + "step": 446 + }, + { + "epoch": 0.10170648464163823, + "grad_norm": 2.5030375037996575, + "learning_rate": 1.2487299682521893e-06, + "loss": 0.2156, + "step": 447 + }, + { + "epoch": 0.10193401592719, + "grad_norm": 2.210721856008811, + "learning_rate": 1.2487242685982384e-06, + "loss": 0.1101, + "step": 448 + }, + { + "epoch": 0.10216154721274175, + "grad_norm": 2.250420318734035, + "learning_rate": 1.2487185561965057e-06, + "loss": 0.1241, + "step": 449 + }, + { + "epoch": 0.10238907849829351, + "grad_norm": 2.019413043508561, + "learning_rate": 1.248712831047108e-06, + "loss": 0.1217, + "step": 450 + }, + { + "epoch": 0.10261660978384528, + "grad_norm": 3.2295330442493713, + "learning_rate": 1.2487070931501624e-06, + "loss": 0.2304, + "step": 451 + }, + { + "epoch": 0.10284414106939704, + "grad_norm": 2.444299385213433, + "learning_rate": 1.2487013425057858e-06, + "loss": 0.2084, + "step": 452 + }, + { + "epoch": 0.10307167235494881, + "grad_norm": 2.8966369631126367, + "learning_rate": 1.2486955791140964e-06, + "loss": 0.1838, + "step": 453 + }, + { + "epoch": 0.10329920364050058, + "grad_norm": 2.0941566856763387, + "learning_rate": 1.2486898029752113e-06, + "loss": 0.1043, + "step": 454 + }, + { + "epoch": 0.10352673492605233, + "grad_norm": 2.3019250022426925, + "learning_rate": 1.248684014089249e-06, + "loss": 0.1189, + "step": 455 + }, + { + "epoch": 0.1037542662116041, + "grad_norm": 2.1349092143720387, + "learning_rate": 1.2486782124563277e-06, + "loss": 0.1708, + "step": 456 + }, + { + "epoch": 0.10398179749715586, + "grad_norm": 3.101054381668985, + "learning_rate": 1.2486723980765659e-06, + "loss": 0.1796, + "step": 457 + }, + { + "epoch": 0.10420932878270762, + "grad_norm": 1.9574694651381292, + "learning_rate": 1.2486665709500826e-06, + "loss": 0.1762, + "step": 458 + }, + { + "epoch": 0.10443686006825939, + "grad_norm": 1.9997685220641748, + "learning_rate": 1.2486607310769965e-06, + "loss": 0.1626, + "step": 459 + }, + { + "epoch": 0.10466439135381114, + "grad_norm": 1.4987645243428842, + "learning_rate": 1.2486548784574275e-06, + "loss": 0.1104, + "step": 460 + }, + { + "epoch": 0.10489192263936291, + "grad_norm": 3.0056305765303857, + "learning_rate": 1.2486490130914948e-06, + "loss": 0.1526, + "step": 461 + }, + { + "epoch": 0.10511945392491467, + "grad_norm": 1.6498658926200307, + "learning_rate": 1.2486431349793185e-06, + "loss": 0.1158, + "step": 462 + }, + { + "epoch": 0.10534698521046644, + "grad_norm": 2.8097802744351035, + "learning_rate": 1.2486372441210188e-06, + "loss": 0.174, + "step": 463 + }, + { + "epoch": 0.1055745164960182, + "grad_norm": 2.2295425114906955, + "learning_rate": 1.248631340516716e-06, + "loss": 0.0993, + "step": 464 + }, + { + "epoch": 0.10580204778156997, + "grad_norm": 1.7352971105344217, + "learning_rate": 1.2486254241665302e-06, + "loss": 0.1799, + "step": 465 + }, + { + "epoch": 0.10602957906712172, + "grad_norm": 3.37890451450669, + "learning_rate": 1.2486194950705831e-06, + "loss": 0.1456, + "step": 466 + }, + { + "epoch": 0.10625711035267349, + "grad_norm": 4.485196875503332, + "learning_rate": 1.248613553228996e-06, + "loss": 0.1509, + "step": 467 + }, + { + "epoch": 0.10648464163822526, + "grad_norm": 3.8128664414272833, + "learning_rate": 1.2486075986418896e-06, + "loss": 0.1217, + "step": 468 + }, + { + "epoch": 0.10671217292377702, + "grad_norm": 1.9049325746647565, + "learning_rate": 1.248601631309386e-06, + "loss": 0.1973, + "step": 469 + }, + { + "epoch": 0.10693970420932879, + "grad_norm": 1.9433225744575688, + "learning_rate": 1.2485956512316072e-06, + "loss": 0.1422, + "step": 470 + }, + { + "epoch": 0.10716723549488055, + "grad_norm": 1.7542185976103952, + "learning_rate": 1.2485896584086754e-06, + "loss": 0.1187, + "step": 471 + }, + { + "epoch": 0.1073947667804323, + "grad_norm": 0.985585738392577, + "learning_rate": 1.248583652840713e-06, + "loss": 0.1116, + "step": 472 + }, + { + "epoch": 0.10762229806598407, + "grad_norm": 6.520293791736507, + "learning_rate": 1.2485776345278427e-06, + "loss": 0.1634, + "step": 473 + }, + { + "epoch": 0.10784982935153584, + "grad_norm": 2.9958165676640935, + "learning_rate": 1.2485716034701876e-06, + "loss": 0.1468, + "step": 474 + }, + { + "epoch": 0.1080773606370876, + "grad_norm": 3.496540224028896, + "learning_rate": 1.2485655596678712e-06, + "loss": 0.1444, + "step": 475 + }, + { + "epoch": 0.10830489192263937, + "grad_norm": 2.6887910577996603, + "learning_rate": 1.2485595031210164e-06, + "loss": 0.2257, + "step": 476 + }, + { + "epoch": 0.10853242320819112, + "grad_norm": 2.210859712757279, + "learning_rate": 1.2485534338297475e-06, + "loss": 0.0858, + "step": 477 + }, + { + "epoch": 0.10875995449374289, + "grad_norm": 1.5912288577365465, + "learning_rate": 1.2485473517941884e-06, + "loss": 0.1021, + "step": 478 + }, + { + "epoch": 0.10898748577929465, + "grad_norm": 2.162920899638659, + "learning_rate": 1.2485412570144633e-06, + "loss": 0.2051, + "step": 479 + }, + { + "epoch": 0.10921501706484642, + "grad_norm": 2.3337569161162186, + "learning_rate": 1.2485351494906969e-06, + "loss": 0.1726, + "step": 480 + }, + { + "epoch": 0.10944254835039818, + "grad_norm": 1.6587972530161754, + "learning_rate": 1.2485290292230142e-06, + "loss": 0.1589, + "step": 481 + }, + { + "epoch": 0.10967007963594995, + "grad_norm": 2.549443212629399, + "learning_rate": 1.24852289621154e-06, + "loss": 0.1107, + "step": 482 + }, + { + "epoch": 0.1098976109215017, + "grad_norm": 1.9600173744992218, + "learning_rate": 1.2485167504563995e-06, + "loss": 0.1497, + "step": 483 + }, + { + "epoch": 0.11012514220705347, + "grad_norm": 2.914488733886043, + "learning_rate": 1.2485105919577187e-06, + "loss": 0.2242, + "step": 484 + }, + { + "epoch": 0.11035267349260523, + "grad_norm": 2.4334592724633475, + "learning_rate": 1.2485044207156233e-06, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.110580204778157, + "grad_norm": 2.1918094312708374, + "learning_rate": 1.2484982367302395e-06, + "loss": 0.1611, + "step": 486 + }, + { + "epoch": 0.11080773606370876, + "grad_norm": 2.2072766100880843, + "learning_rate": 1.2484920400016936e-06, + "loss": 0.1402, + "step": 487 + }, + { + "epoch": 0.11103526734926053, + "grad_norm": 1.6859469474720183, + "learning_rate": 1.2484858305301122e-06, + "loss": 0.1472, + "step": 488 + }, + { + "epoch": 0.11126279863481228, + "grad_norm": 1.590244696061809, + "learning_rate": 1.2484796083156222e-06, + "loss": 0.0824, + "step": 489 + }, + { + "epoch": 0.11149032992036405, + "grad_norm": 4.525638347888733, + "learning_rate": 1.2484733733583511e-06, + "loss": 0.1257, + "step": 490 + }, + { + "epoch": 0.11171786120591581, + "grad_norm": 2.6721724669454723, + "learning_rate": 1.248467125658426e-06, + "loss": 0.2084, + "step": 491 + }, + { + "epoch": 0.11194539249146758, + "grad_norm": 2.300055245713483, + "learning_rate": 1.2484608652159746e-06, + "loss": 0.1053, + "step": 492 + }, + { + "epoch": 0.11217292377701935, + "grad_norm": 3.273977920110333, + "learning_rate": 1.248454592031125e-06, + "loss": 0.1176, + "step": 493 + }, + { + "epoch": 0.1124004550625711, + "grad_norm": 2.101057790899636, + "learning_rate": 1.2484483061040054e-06, + "loss": 0.1277, + "step": 494 + }, + { + "epoch": 0.11262798634812286, + "grad_norm": 3.6133620556599984, + "learning_rate": 1.2484420074347441e-06, + "loss": 0.1845, + "step": 495 + }, + { + "epoch": 0.11285551763367463, + "grad_norm": 1.9619725915027257, + "learning_rate": 1.24843569602347e-06, + "loss": 0.1894, + "step": 496 + }, + { + "epoch": 0.1130830489192264, + "grad_norm": 2.636905846270966, + "learning_rate": 1.2484293718703119e-06, + "loss": 0.1874, + "step": 497 + }, + { + "epoch": 0.11331058020477816, + "grad_norm": 2.5593822043936125, + "learning_rate": 1.2484230349753994e-06, + "loss": 0.0927, + "step": 498 + }, + { + "epoch": 0.11353811149032993, + "grad_norm": 2.2440609982402715, + "learning_rate": 1.2484166853388617e-06, + "loss": 0.1381, + "step": 499 + }, + { + "epoch": 0.11376564277588168, + "grad_norm": 2.7232866925160506, + "learning_rate": 1.2484103229608288e-06, + "loss": 0.1758, + "step": 500 + }, + { + "epoch": 0.11399317406143344, + "grad_norm": 2.6484317978572816, + "learning_rate": 1.2484039478414305e-06, + "loss": 0.1259, + "step": 501 + }, + { + "epoch": 0.11422070534698521, + "grad_norm": 2.1058374053464464, + "learning_rate": 1.2483975599807972e-06, + "loss": 0.1369, + "step": 502 + }, + { + "epoch": 0.11444823663253698, + "grad_norm": 2.1458925241645903, + "learning_rate": 1.2483911593790595e-06, + "loss": 0.1004, + "step": 503 + }, + { + "epoch": 0.11467576791808874, + "grad_norm": 3.031837353586065, + "learning_rate": 1.2483847460363482e-06, + "loss": 0.154, + "step": 504 + }, + { + "epoch": 0.1149032992036405, + "grad_norm": 3.1297621875057544, + "learning_rate": 1.2483783199527943e-06, + "loss": 0.1071, + "step": 505 + }, + { + "epoch": 0.11513083048919226, + "grad_norm": 2.5407911203085787, + "learning_rate": 1.2483718811285296e-06, + "loss": 0.1744, + "step": 506 + }, + { + "epoch": 0.11535836177474403, + "grad_norm": 3.1175064627764377, + "learning_rate": 1.2483654295636848e-06, + "loss": 0.1072, + "step": 507 + }, + { + "epoch": 0.11558589306029579, + "grad_norm": 3.0988741009535667, + "learning_rate": 1.2483589652583924e-06, + "loss": 0.1753, + "step": 508 + }, + { + "epoch": 0.11581342434584756, + "grad_norm": 1.8808814641931946, + "learning_rate": 1.2483524882127846e-06, + "loss": 0.0859, + "step": 509 + }, + { + "epoch": 0.11604095563139932, + "grad_norm": 2.8937543802568158, + "learning_rate": 1.2483459984269933e-06, + "loss": 0.1816, + "step": 510 + }, + { + "epoch": 0.11626848691695107, + "grad_norm": 2.186370885841539, + "learning_rate": 1.2483394959011514e-06, + "loss": 0.0819, + "step": 511 + }, + { + "epoch": 0.11649601820250284, + "grad_norm": 1.8650801779387822, + "learning_rate": 1.248332980635392e-06, + "loss": 0.1436, + "step": 512 + }, + { + "epoch": 0.1167235494880546, + "grad_norm": 2.9270321544640994, + "learning_rate": 1.2483264526298478e-06, + "loss": 0.1308, + "step": 513 + }, + { + "epoch": 0.11695108077360637, + "grad_norm": 1.9942689645578024, + "learning_rate": 1.2483199118846525e-06, + "loss": 0.1656, + "step": 514 + }, + { + "epoch": 0.11717861205915814, + "grad_norm": 2.8104633311436116, + "learning_rate": 1.2483133583999399e-06, + "loss": 0.1681, + "step": 515 + }, + { + "epoch": 0.1174061433447099, + "grad_norm": 2.546169206593085, + "learning_rate": 1.2483067921758439e-06, + "loss": 0.0925, + "step": 516 + }, + { + "epoch": 0.11763367463026166, + "grad_norm": 2.0758430805982178, + "learning_rate": 1.2483002132124983e-06, + "loss": 0.203, + "step": 517 + }, + { + "epoch": 0.11786120591581342, + "grad_norm": 2.1497459150584386, + "learning_rate": 1.2482936215100382e-06, + "loss": 0.1056, + "step": 518 + }, + { + "epoch": 0.11808873720136519, + "grad_norm": 2.197584956184683, + "learning_rate": 1.2482870170685978e-06, + "loss": 0.0933, + "step": 519 + }, + { + "epoch": 0.11831626848691695, + "grad_norm": 4.944962250057973, + "learning_rate": 1.2482803998883122e-06, + "loss": 0.2129, + "step": 520 + }, + { + "epoch": 0.11854379977246872, + "grad_norm": 1.5333537239736301, + "learning_rate": 1.2482737699693168e-06, + "loss": 0.1729, + "step": 521 + }, + { + "epoch": 0.11877133105802047, + "grad_norm": 2.5556570479037948, + "learning_rate": 1.248267127311747e-06, + "loss": 0.1607, + "step": 522 + }, + { + "epoch": 0.11899886234357224, + "grad_norm": 2.0949542782407398, + "learning_rate": 1.2482604719157386e-06, + "loss": 0.1857, + "step": 523 + }, + { + "epoch": 0.119226393629124, + "grad_norm": 2.2586097350216385, + "learning_rate": 1.2482538037814277e-06, + "loss": 0.1258, + "step": 524 + }, + { + "epoch": 0.11945392491467577, + "grad_norm": 3.036602602741407, + "learning_rate": 1.2482471229089502e-06, + "loss": 0.161, + "step": 525 + }, + { + "epoch": 0.11968145620022753, + "grad_norm": 3.382002996482515, + "learning_rate": 1.2482404292984431e-06, + "loss": 0.1784, + "step": 526 + }, + { + "epoch": 0.1199089874857793, + "grad_norm": 1.571226708630226, + "learning_rate": 1.248233722950043e-06, + "loss": 0.1605, + "step": 527 + }, + { + "epoch": 0.12013651877133105, + "grad_norm": 3.0053996402943737, + "learning_rate": 1.2482270038638872e-06, + "loss": 0.1201, + "step": 528 + }, + { + "epoch": 0.12036405005688282, + "grad_norm": 4.663906907753179, + "learning_rate": 1.2482202720401128e-06, + "loss": 0.203, + "step": 529 + }, + { + "epoch": 0.12059158134243458, + "grad_norm": 2.107107186527039, + "learning_rate": 1.248213527478857e-06, + "loss": 0.1933, + "step": 530 + }, + { + "epoch": 0.12081911262798635, + "grad_norm": 2.191569921182264, + "learning_rate": 1.2482067701802583e-06, + "loss": 0.1735, + "step": 531 + }, + { + "epoch": 0.12104664391353812, + "grad_norm": 1.611611034864374, + "learning_rate": 1.2482000001444547e-06, + "loss": 0.1299, + "step": 532 + }, + { + "epoch": 0.12127417519908988, + "grad_norm": 1.9644367618752439, + "learning_rate": 1.2481932173715845e-06, + "loss": 0.0868, + "step": 533 + }, + { + "epoch": 0.12150170648464163, + "grad_norm": 1.7597689357542332, + "learning_rate": 1.2481864218617859e-06, + "loss": 0.1977, + "step": 534 + }, + { + "epoch": 0.1217292377701934, + "grad_norm": 1.0455766882042379, + "learning_rate": 1.2481796136151984e-06, + "loss": 0.0856, + "step": 535 + }, + { + "epoch": 0.12195676905574517, + "grad_norm": 3.2419347761543684, + "learning_rate": 1.2481727926319609e-06, + "loss": 0.2399, + "step": 536 + }, + { + "epoch": 0.12218430034129693, + "grad_norm": 3.339873316715719, + "learning_rate": 1.2481659589122127e-06, + "loss": 0.186, + "step": 537 + }, + { + "epoch": 0.1224118316268487, + "grad_norm": 3.4453888669974146, + "learning_rate": 1.2481591124560934e-06, + "loss": 0.2007, + "step": 538 + }, + { + "epoch": 0.12263936291240045, + "grad_norm": 3.4700673703521736, + "learning_rate": 1.2481522532637435e-06, + "loss": 0.1632, + "step": 539 + }, + { + "epoch": 0.12286689419795221, + "grad_norm": 2.355397510374851, + "learning_rate": 1.2481453813353026e-06, + "loss": 0.1212, + "step": 540 + }, + { + "epoch": 0.12309442548350398, + "grad_norm": 5.338957920220655, + "learning_rate": 1.2481384966709116e-06, + "loss": 0.1592, + "step": 541 + }, + { + "epoch": 0.12332195676905575, + "grad_norm": 2.990026650956376, + "learning_rate": 1.2481315992707104e-06, + "loss": 0.2656, + "step": 542 + }, + { + "epoch": 0.12354948805460751, + "grad_norm": 1.8798810865858828, + "learning_rate": 1.248124689134841e-06, + "loss": 0.1125, + "step": 543 + }, + { + "epoch": 0.12377701934015928, + "grad_norm": 1.6104299610891197, + "learning_rate": 1.2481177662634438e-06, + "loss": 0.1557, + "step": 544 + }, + { + "epoch": 0.12400455062571103, + "grad_norm": 3.302283676048537, + "learning_rate": 1.2481108306566609e-06, + "loss": 0.1799, + "step": 545 + }, + { + "epoch": 0.1242320819112628, + "grad_norm": 2.0532951352869513, + "learning_rate": 1.2481038823146338e-06, + "loss": 0.0815, + "step": 546 + }, + { + "epoch": 0.12445961319681456, + "grad_norm": 1.4326913794879275, + "learning_rate": 1.2480969212375043e-06, + "loss": 0.177, + "step": 547 + }, + { + "epoch": 0.12468714448236633, + "grad_norm": 3.5494676426295286, + "learning_rate": 1.2480899474254151e-06, + "loss": 0.136, + "step": 548 + }, + { + "epoch": 0.12491467576791809, + "grad_norm": 1.3410455744599155, + "learning_rate": 1.2480829608785085e-06, + "loss": 0.1078, + "step": 549 + }, + { + "epoch": 0.12514220705346984, + "grad_norm": 1.7709434217848017, + "learning_rate": 1.2480759615969273e-06, + "loss": 0.1114, + "step": 550 + }, + { + "epoch": 0.12536973833902162, + "grad_norm": 1.4865770903343614, + "learning_rate": 1.2480689495808144e-06, + "loss": 0.1377, + "step": 551 + }, + { + "epoch": 0.12559726962457338, + "grad_norm": 1.6211826207402742, + "learning_rate": 1.2480619248303133e-06, + "loss": 0.1873, + "step": 552 + }, + { + "epoch": 0.12582480091012513, + "grad_norm": 3.1755876159758794, + "learning_rate": 1.2480548873455675e-06, + "loss": 0.2135, + "step": 553 + }, + { + "epoch": 0.1260523321956769, + "grad_norm": 3.6986046315140952, + "learning_rate": 1.248047837126721e-06, + "loss": 0.3549, + "step": 554 + }, + { + "epoch": 0.12627986348122866, + "grad_norm": 2.782290781984551, + "learning_rate": 1.248040774173918e-06, + "loss": 0.1936, + "step": 555 + }, + { + "epoch": 0.12650739476678044, + "grad_norm": 2.329760734261347, + "learning_rate": 1.248033698487302e-06, + "loss": 0.1395, + "step": 556 + }, + { + "epoch": 0.1267349260523322, + "grad_norm": 2.258554836923121, + "learning_rate": 1.2480266100670189e-06, + "loss": 0.1605, + "step": 557 + }, + { + "epoch": 0.12696245733788397, + "grad_norm": 3.058041285297341, + "learning_rate": 1.2480195089132125e-06, + "loss": 0.1975, + "step": 558 + }, + { + "epoch": 0.12718998862343572, + "grad_norm": 2.406042057945949, + "learning_rate": 1.2480123950260284e-06, + "loss": 0.1405, + "step": 559 + }, + { + "epoch": 0.12741751990898748, + "grad_norm": 1.4634033865621767, + "learning_rate": 1.248005268405612e-06, + "loss": 0.0686, + "step": 560 + }, + { + "epoch": 0.12764505119453926, + "grad_norm": 1.1470288222889338, + "learning_rate": 1.2479981290521087e-06, + "loss": 0.0649, + "step": 561 + }, + { + "epoch": 0.127872582480091, + "grad_norm": 3.357158703331078, + "learning_rate": 1.2479909769656648e-06, + "loss": 0.1684, + "step": 562 + }, + { + "epoch": 0.1281001137656428, + "grad_norm": 2.4363436867877595, + "learning_rate": 1.2479838121464263e-06, + "loss": 0.2155, + "step": 563 + }, + { + "epoch": 0.12832764505119454, + "grad_norm": 4.051636355021599, + "learning_rate": 1.2479766345945395e-06, + "loss": 0.1853, + "step": 564 + }, + { + "epoch": 0.1285551763367463, + "grad_norm": 1.6707836764627593, + "learning_rate": 1.2479694443101513e-06, + "loss": 0.2261, + "step": 565 + }, + { + "epoch": 0.12878270762229807, + "grad_norm": 1.3008647546251737, + "learning_rate": 1.2479622412934087e-06, + "loss": 0.1606, + "step": 566 + }, + { + "epoch": 0.12901023890784982, + "grad_norm": 3.421202381350775, + "learning_rate": 1.2479550255444586e-06, + "loss": 0.147, + "step": 567 + }, + { + "epoch": 0.1292377701934016, + "grad_norm": 1.5157864652280186, + "learning_rate": 1.2479477970634487e-06, + "loss": 0.1536, + "step": 568 + }, + { + "epoch": 0.12946530147895335, + "grad_norm": 3.27856184412377, + "learning_rate": 1.2479405558505267e-06, + "loss": 0.1931, + "step": 569 + }, + { + "epoch": 0.1296928327645051, + "grad_norm": 2.5943823025048474, + "learning_rate": 1.247933301905841e-06, + "loss": 0.1384, + "step": 570 + }, + { + "epoch": 0.12992036405005689, + "grad_norm": 4.278003846990416, + "learning_rate": 1.2479260352295388e-06, + "loss": 0.1771, + "step": 571 + }, + { + "epoch": 0.13014789533560864, + "grad_norm": 3.446486195671729, + "learning_rate": 1.2479187558217697e-06, + "loss": 0.1323, + "step": 572 + }, + { + "epoch": 0.13037542662116042, + "grad_norm": 1.5099352019896337, + "learning_rate": 1.247911463682682e-06, + "loss": 0.1444, + "step": 573 + }, + { + "epoch": 0.13060295790671217, + "grad_norm": 3.798908546439363, + "learning_rate": 1.2479041588124247e-06, + "loss": 0.1504, + "step": 574 + }, + { + "epoch": 0.13083048919226395, + "grad_norm": 3.7532424433768754, + "learning_rate": 1.2478968412111471e-06, + "loss": 0.1518, + "step": 575 + }, + { + "epoch": 0.1310580204778157, + "grad_norm": 2.056630545760187, + "learning_rate": 1.247889510878999e-06, + "loss": 0.2708, + "step": 576 + }, + { + "epoch": 0.13128555176336745, + "grad_norm": 2.303355999452058, + "learning_rate": 1.24788216781613e-06, + "loss": 0.1662, + "step": 577 + }, + { + "epoch": 0.13151308304891923, + "grad_norm": 2.269104241548175, + "learning_rate": 1.2478748120226902e-06, + "loss": 0.1337, + "step": 578 + }, + { + "epoch": 0.13174061433447098, + "grad_norm": 3.0692597907642862, + "learning_rate": 1.2478674434988299e-06, + "loss": 0.1326, + "step": 579 + }, + { + "epoch": 0.13196814562002276, + "grad_norm": 1.6865202158454742, + "learning_rate": 1.2478600622447001e-06, + "loss": 0.1647, + "step": 580 + }, + { + "epoch": 0.13219567690557452, + "grad_norm": 2.939283703136826, + "learning_rate": 1.2478526682604512e-06, + "loss": 0.1303, + "step": 581 + }, + { + "epoch": 0.13242320819112627, + "grad_norm": 3.1064926411391713, + "learning_rate": 1.2478452615462345e-06, + "loss": 0.1409, + "step": 582 + }, + { + "epoch": 0.13265073947667805, + "grad_norm": 2.5571749562826485, + "learning_rate": 1.247837842102201e-06, + "loss": 0.1791, + "step": 583 + }, + { + "epoch": 0.1328782707622298, + "grad_norm": 2.795629539563545, + "learning_rate": 1.2478304099285031e-06, + "loss": 0.1567, + "step": 584 + }, + { + "epoch": 0.13310580204778158, + "grad_norm": 2.0832780528771466, + "learning_rate": 1.2478229650252921e-06, + "loss": 0.1639, + "step": 585 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 2.9969798024524117, + "learning_rate": 1.2478155073927204e-06, + "loss": 0.2444, + "step": 586 + }, + { + "epoch": 0.13356086461888508, + "grad_norm": 1.9274087851448982, + "learning_rate": 1.2478080370309404e-06, + "loss": 0.105, + "step": 587 + }, + { + "epoch": 0.13378839590443686, + "grad_norm": 4.021015627831867, + "learning_rate": 1.2478005539401046e-06, + "loss": 0.1734, + "step": 588 + }, + { + "epoch": 0.13401592718998862, + "grad_norm": 2.9342976021528027, + "learning_rate": 1.2477930581203663e-06, + "loss": 0.1465, + "step": 589 + }, + { + "epoch": 0.1342434584755404, + "grad_norm": 2.3242426333780632, + "learning_rate": 1.2477855495718782e-06, + "loss": 0.2241, + "step": 590 + }, + { + "epoch": 0.13447098976109215, + "grad_norm": 2.957504561813871, + "learning_rate": 1.2477780282947942e-06, + "loss": 0.1734, + "step": 591 + }, + { + "epoch": 0.13469852104664393, + "grad_norm": 1.8788696793522301, + "learning_rate": 1.2477704942892677e-06, + "loss": 0.1469, + "step": 592 + }, + { + "epoch": 0.13492605233219568, + "grad_norm": 2.339527187323086, + "learning_rate": 1.2477629475554532e-06, + "loss": 0.1312, + "step": 593 + }, + { + "epoch": 0.13515358361774743, + "grad_norm": 3.707567497860105, + "learning_rate": 1.2477553880935043e-06, + "loss": 0.1916, + "step": 594 + }, + { + "epoch": 0.1353811149032992, + "grad_norm": 3.2750827489523022, + "learning_rate": 1.2477478159035758e-06, + "loss": 0.1774, + "step": 595 + }, + { + "epoch": 0.13560864618885096, + "grad_norm": 2.777476705753077, + "learning_rate": 1.2477402309858226e-06, + "loss": 0.1789, + "step": 596 + }, + { + "epoch": 0.13583617747440274, + "grad_norm": 2.144596195630353, + "learning_rate": 1.2477326333403995e-06, + "loss": 0.147, + "step": 597 + }, + { + "epoch": 0.1360637087599545, + "grad_norm": 2.3685083837175935, + "learning_rate": 1.2477250229674618e-06, + "loss": 0.1831, + "step": 598 + }, + { + "epoch": 0.13629124004550625, + "grad_norm": 1.9843295041761948, + "learning_rate": 1.2477173998671653e-06, + "loss": 0.178, + "step": 599 + }, + { + "epoch": 0.13651877133105803, + "grad_norm": 3.434039497211011, + "learning_rate": 1.2477097640396655e-06, + "loss": 0.1235, + "step": 600 + }, + { + "epoch": 0.13674630261660978, + "grad_norm": 1.4586285890850859, + "learning_rate": 1.2477021154851185e-06, + "loss": 0.0977, + "step": 601 + }, + { + "epoch": 0.13697383390216156, + "grad_norm": 3.3913304667052198, + "learning_rate": 1.2476944542036806e-06, + "loss": 0.1786, + "step": 602 + }, + { + "epoch": 0.1372013651877133, + "grad_norm": 2.667804003182341, + "learning_rate": 1.2476867801955086e-06, + "loss": 0.1204, + "step": 603 + }, + { + "epoch": 0.13742889647326506, + "grad_norm": 2.4655446209984033, + "learning_rate": 1.247679093460759e-06, + "loss": 0.2298, + "step": 604 + }, + { + "epoch": 0.13765642775881684, + "grad_norm": 3.1521634114958816, + "learning_rate": 1.2476713939995895e-06, + "loss": 0.1264, + "step": 605 + }, + { + "epoch": 0.1378839590443686, + "grad_norm": 1.8219187381761075, + "learning_rate": 1.2476636818121568e-06, + "loss": 0.1028, + "step": 606 + }, + { + "epoch": 0.13811149032992037, + "grad_norm": 2.337156447435568, + "learning_rate": 1.247655956898619e-06, + "loss": 0.1946, + "step": 607 + }, + { + "epoch": 0.13833902161547212, + "grad_norm": 3.2562899945752966, + "learning_rate": 1.2476482192591335e-06, + "loss": 0.1465, + "step": 608 + }, + { + "epoch": 0.1385665529010239, + "grad_norm": 1.8250022998173558, + "learning_rate": 1.247640468893859e-06, + "loss": 0.1467, + "step": 609 + }, + { + "epoch": 0.13879408418657566, + "grad_norm": 3.5242803865119603, + "learning_rate": 1.2476327058029534e-06, + "loss": 0.1225, + "step": 610 + }, + { + "epoch": 0.1390216154721274, + "grad_norm": 3.027013883019154, + "learning_rate": 1.2476249299865757e-06, + "loss": 0.1595, + "step": 611 + }, + { + "epoch": 0.1392491467576792, + "grad_norm": 2.3807833370240843, + "learning_rate": 1.2476171414448847e-06, + "loss": 0.0984, + "step": 612 + }, + { + "epoch": 0.13947667804323094, + "grad_norm": 3.1119739781274416, + "learning_rate": 1.2476093401780397e-06, + "loss": 0.154, + "step": 613 + }, + { + "epoch": 0.13970420932878272, + "grad_norm": 3.4567643287811958, + "learning_rate": 1.2476015261861998e-06, + "loss": 0.1405, + "step": 614 + }, + { + "epoch": 0.13993174061433447, + "grad_norm": 2.6730132596017504, + "learning_rate": 1.247593699469525e-06, + "loss": 0.117, + "step": 615 + }, + { + "epoch": 0.14015927189988622, + "grad_norm": 2.78286071664722, + "learning_rate": 1.2475858600281754e-06, + "loss": 0.1504, + "step": 616 + }, + { + "epoch": 0.140386803185438, + "grad_norm": 2.0905809356248803, + "learning_rate": 1.247578007862311e-06, + "loss": 0.1221, + "step": 617 + }, + { + "epoch": 0.14061433447098975, + "grad_norm": 2.307570493464016, + "learning_rate": 1.2475701429720923e-06, + "loss": 0.1166, + "step": 618 + }, + { + "epoch": 0.14084186575654153, + "grad_norm": 1.2783682538203782, + "learning_rate": 1.24756226535768e-06, + "loss": 0.1346, + "step": 619 + }, + { + "epoch": 0.1410693970420933, + "grad_norm": 1.497656716954093, + "learning_rate": 1.2475543750192352e-06, + "loss": 0.2064, + "step": 620 + }, + { + "epoch": 0.14129692832764504, + "grad_norm": 3.79056695480817, + "learning_rate": 1.2475464719569192e-06, + "loss": 0.2673, + "step": 621 + }, + { + "epoch": 0.14152445961319682, + "grad_norm": 1.4805750856049538, + "learning_rate": 1.2475385561708934e-06, + "loss": 0.1992, + "step": 622 + }, + { + "epoch": 0.14175199089874857, + "grad_norm": 1.6748002073239907, + "learning_rate": 1.2475306276613194e-06, + "loss": 0.0979, + "step": 623 + }, + { + "epoch": 0.14197952218430035, + "grad_norm": 2.5674392190565736, + "learning_rate": 1.2475226864283596e-06, + "loss": 0.1337, + "step": 624 + }, + { + "epoch": 0.1422070534698521, + "grad_norm": 2.656075374063454, + "learning_rate": 1.2475147324721764e-06, + "loss": 0.2501, + "step": 625 + }, + { + "epoch": 0.14243458475540388, + "grad_norm": 2.03707084801983, + "learning_rate": 1.2475067657929319e-06, + "loss": 0.1673, + "step": 626 + }, + { + "epoch": 0.14266211604095563, + "grad_norm": 2.975904435297751, + "learning_rate": 1.2474987863907894e-06, + "loss": 0.135, + "step": 627 + }, + { + "epoch": 0.14288964732650739, + "grad_norm": 2.2205623276633295, + "learning_rate": 1.2474907942659116e-06, + "loss": 0.2149, + "step": 628 + }, + { + "epoch": 0.14311717861205916, + "grad_norm": 2.271865927518249, + "learning_rate": 1.247482789418462e-06, + "loss": 0.1519, + "step": 629 + }, + { + "epoch": 0.14334470989761092, + "grad_norm": 6.542697842484103, + "learning_rate": 1.2474747718486044e-06, + "loss": 0.1757, + "step": 630 + }, + { + "epoch": 0.1435722411831627, + "grad_norm": 1.8493295758356152, + "learning_rate": 1.2474667415565022e-06, + "loss": 0.096, + "step": 631 + }, + { + "epoch": 0.14379977246871445, + "grad_norm": 4.567549869753572, + "learning_rate": 1.24745869854232e-06, + "loss": 0.1745, + "step": 632 + }, + { + "epoch": 0.1440273037542662, + "grad_norm": 3.104479250541457, + "learning_rate": 1.2474506428062219e-06, + "loss": 0.14, + "step": 633 + }, + { + "epoch": 0.14425483503981798, + "grad_norm": 2.9519743566943464, + "learning_rate": 1.2474425743483726e-06, + "loss": 0.237, + "step": 634 + }, + { + "epoch": 0.14448236632536973, + "grad_norm": 1.4814831832284159, + "learning_rate": 1.2474344931689371e-06, + "loss": 0.0873, + "step": 635 + }, + { + "epoch": 0.1447098976109215, + "grad_norm": 2.0222816327136712, + "learning_rate": 1.2474263992680805e-06, + "loss": 0.155, + "step": 636 + }, + { + "epoch": 0.14493742889647326, + "grad_norm": 2.0590304829666914, + "learning_rate": 1.247418292645968e-06, + "loss": 0.107, + "step": 637 + }, + { + "epoch": 0.14516496018202502, + "grad_norm": 2.5562023131920633, + "learning_rate": 1.2474101733027659e-06, + "loss": 0.2256, + "step": 638 + }, + { + "epoch": 0.1453924914675768, + "grad_norm": 2.3833084873555195, + "learning_rate": 1.2474020412386395e-06, + "loss": 0.1087, + "step": 639 + }, + { + "epoch": 0.14562002275312855, + "grad_norm": 1.5076273114920544, + "learning_rate": 1.2473938964537551e-06, + "loss": 0.0893, + "step": 640 + }, + { + "epoch": 0.14584755403868033, + "grad_norm": 2.3708066851044887, + "learning_rate": 1.2473857389482797e-06, + "loss": 0.1247, + "step": 641 + }, + { + "epoch": 0.14607508532423208, + "grad_norm": 1.5590215080673084, + "learning_rate": 1.2473775687223794e-06, + "loss": 0.1504, + "step": 642 + }, + { + "epoch": 0.14630261660978386, + "grad_norm": 1.6107910166409294, + "learning_rate": 1.2473693857762215e-06, + "loss": 0.149, + "step": 643 + }, + { + "epoch": 0.1465301478953356, + "grad_norm": 1.7918533159116738, + "learning_rate": 1.247361190109973e-06, + "loss": 0.1104, + "step": 644 + }, + { + "epoch": 0.14675767918088736, + "grad_norm": 2.8984966135096566, + "learning_rate": 1.2473529817238016e-06, + "loss": 0.1755, + "step": 645 + }, + { + "epoch": 0.14698521046643914, + "grad_norm": 1.9091822418599347, + "learning_rate": 1.2473447606178754e-06, + "loss": 0.1077, + "step": 646 + }, + { + "epoch": 0.1472127417519909, + "grad_norm": 4.199288030915391, + "learning_rate": 1.2473365267923617e-06, + "loss": 0.2124, + "step": 647 + }, + { + "epoch": 0.14744027303754267, + "grad_norm": 2.331859473332942, + "learning_rate": 1.2473282802474293e-06, + "loss": 0.1576, + "step": 648 + }, + { + "epoch": 0.14766780432309443, + "grad_norm": 3.5722786659910577, + "learning_rate": 1.2473200209832465e-06, + "loss": 0.2027, + "step": 649 + }, + { + "epoch": 0.14789533560864618, + "grad_norm": 1.5390826591189062, + "learning_rate": 1.2473117489999823e-06, + "loss": 0.161, + "step": 650 + }, + { + "epoch": 0.14812286689419796, + "grad_norm": 2.741044883004237, + "learning_rate": 1.2473034642978057e-06, + "loss": 0.1656, + "step": 651 + }, + { + "epoch": 0.1483503981797497, + "grad_norm": 2.2681711762464034, + "learning_rate": 1.247295166876886e-06, + "loss": 0.1254, + "step": 652 + }, + { + "epoch": 0.1485779294653015, + "grad_norm": 2.2254637289761194, + "learning_rate": 1.2472868567373924e-06, + "loss": 0.1291, + "step": 653 + }, + { + "epoch": 0.14880546075085324, + "grad_norm": 2.213517163461755, + "learning_rate": 1.2472785338794953e-06, + "loss": 0.1541, + "step": 654 + }, + { + "epoch": 0.149032992036405, + "grad_norm": 1.6789308605390307, + "learning_rate": 1.247270198303365e-06, + "loss": 0.1316, + "step": 655 + }, + { + "epoch": 0.14926052332195677, + "grad_norm": 2.179149997459725, + "learning_rate": 1.247261850009171e-06, + "loss": 0.2437, + "step": 656 + }, + { + "epoch": 0.14948805460750852, + "grad_norm": 2.910894270371587, + "learning_rate": 1.2472534889970848e-06, + "loss": 0.2038, + "step": 657 + }, + { + "epoch": 0.1497155858930603, + "grad_norm": 1.751607816792672, + "learning_rate": 1.2472451152672766e-06, + "loss": 0.1164, + "step": 658 + }, + { + "epoch": 0.14994311717861206, + "grad_norm": 1.6602009490349432, + "learning_rate": 1.2472367288199177e-06, + "loss": 0.1193, + "step": 659 + }, + { + "epoch": 0.15017064846416384, + "grad_norm": 2.038150970938399, + "learning_rate": 1.2472283296551798e-06, + "loss": 0.102, + "step": 660 + }, + { + "epoch": 0.1503981797497156, + "grad_norm": 2.1439804373776936, + "learning_rate": 1.2472199177732346e-06, + "loss": 0.1502, + "step": 661 + }, + { + "epoch": 0.15062571103526734, + "grad_norm": 2.5777822840030358, + "learning_rate": 1.2472114931742537e-06, + "loss": 0.1168, + "step": 662 + }, + { + "epoch": 0.15085324232081912, + "grad_norm": 2.4175964563163177, + "learning_rate": 1.2472030558584093e-06, + "loss": 0.1035, + "step": 663 + }, + { + "epoch": 0.15108077360637087, + "grad_norm": 2.635267423704016, + "learning_rate": 1.2471946058258742e-06, + "loss": 0.1701, + "step": 664 + }, + { + "epoch": 0.15130830489192265, + "grad_norm": 1.9337561786859772, + "learning_rate": 1.2471861430768205e-06, + "loss": 0.1075, + "step": 665 + }, + { + "epoch": 0.1515358361774744, + "grad_norm": 1.7937795679496227, + "learning_rate": 1.2471776676114217e-06, + "loss": 0.1785, + "step": 666 + }, + { + "epoch": 0.15176336746302616, + "grad_norm": 3.0588710289274816, + "learning_rate": 1.2471691794298508e-06, + "loss": 0.1798, + "step": 667 + }, + { + "epoch": 0.15199089874857794, + "grad_norm": 2.638986072752188, + "learning_rate": 1.2471606785322814e-06, + "loss": 0.0878, + "step": 668 + }, + { + "epoch": 0.1522184300341297, + "grad_norm": 2.732712357601826, + "learning_rate": 1.247152164918887e-06, + "loss": 0.1267, + "step": 669 + }, + { + "epoch": 0.15244596131968147, + "grad_norm": 1.7481991977105777, + "learning_rate": 1.247143638589842e-06, + "loss": 0.1584, + "step": 670 + }, + { + "epoch": 0.15267349260523322, + "grad_norm": 2.794672743532085, + "learning_rate": 1.2471350995453203e-06, + "loss": 0.1584, + "step": 671 + }, + { + "epoch": 0.15290102389078497, + "grad_norm": 3.1279366528301633, + "learning_rate": 1.2471265477854966e-06, + "loss": 0.148, + "step": 672 + }, + { + "epoch": 0.15312855517633675, + "grad_norm": 3.920575109905724, + "learning_rate": 1.2471179833105454e-06, + "loss": 0.1732, + "step": 673 + }, + { + "epoch": 0.1533560864618885, + "grad_norm": 1.7916571238390178, + "learning_rate": 1.2471094061206422e-06, + "loss": 0.2336, + "step": 674 + }, + { + "epoch": 0.15358361774744028, + "grad_norm": 1.7363850632393116, + "learning_rate": 1.247100816215962e-06, + "loss": 0.1244, + "step": 675 + }, + { + "epoch": 0.15381114903299203, + "grad_norm": 2.504377712379844, + "learning_rate": 1.2470922135966806e-06, + "loss": 0.1674, + "step": 676 + }, + { + "epoch": 0.1540386803185438, + "grad_norm": 2.43043947984636, + "learning_rate": 1.2470835982629736e-06, + "loss": 0.1249, + "step": 677 + }, + { + "epoch": 0.15426621160409557, + "grad_norm": 3.950497364660697, + "learning_rate": 1.247074970215017e-06, + "loss": 0.2401, + "step": 678 + }, + { + "epoch": 0.15449374288964732, + "grad_norm": 3.1492013494233846, + "learning_rate": 1.2470663294529873e-06, + "loss": 0.1605, + "step": 679 + }, + { + "epoch": 0.1547212741751991, + "grad_norm": 1.80598204305421, + "learning_rate": 1.2470576759770612e-06, + "loss": 0.113, + "step": 680 + }, + { + "epoch": 0.15494880546075085, + "grad_norm": 2.0454054940402506, + "learning_rate": 1.2470490097874155e-06, + "loss": 0.1453, + "step": 681 + }, + { + "epoch": 0.15517633674630263, + "grad_norm": 3.6952564849548053, + "learning_rate": 1.247040330884227e-06, + "loss": 0.1581, + "step": 682 + }, + { + "epoch": 0.15540386803185438, + "grad_norm": 2.3655397835651075, + "learning_rate": 1.2470316392676738e-06, + "loss": 0.169, + "step": 683 + }, + { + "epoch": 0.15563139931740613, + "grad_norm": 3.416348712472315, + "learning_rate": 1.2470229349379326e-06, + "loss": 0.1347, + "step": 684 + }, + { + "epoch": 0.1558589306029579, + "grad_norm": 2.618995350775909, + "learning_rate": 1.2470142178951822e-06, + "loss": 0.1924, + "step": 685 + }, + { + "epoch": 0.15608646188850966, + "grad_norm": 1.344663220923034, + "learning_rate": 1.2470054881396002e-06, + "loss": 0.2013, + "step": 686 + }, + { + "epoch": 0.15631399317406144, + "grad_norm": 1.1568986493989724, + "learning_rate": 1.246996745671365e-06, + "loss": 0.131, + "step": 687 + }, + { + "epoch": 0.1565415244596132, + "grad_norm": 3.0558312091963473, + "learning_rate": 1.2469879904906556e-06, + "loss": 0.14, + "step": 688 + }, + { + "epoch": 0.15676905574516495, + "grad_norm": 4.767157427966137, + "learning_rate": 1.2469792225976507e-06, + "loss": 0.156, + "step": 689 + }, + { + "epoch": 0.15699658703071673, + "grad_norm": 1.9971770266956603, + "learning_rate": 1.2469704419925296e-06, + "loss": 0.1413, + "step": 690 + }, + { + "epoch": 0.15722411831626848, + "grad_norm": 3.560138993273607, + "learning_rate": 1.246961648675472e-06, + "loss": 0.2274, + "step": 691 + }, + { + "epoch": 0.15745164960182026, + "grad_norm": 1.8091873297743188, + "learning_rate": 1.246952842646657e-06, + "loss": 0.2606, + "step": 692 + }, + { + "epoch": 0.157679180887372, + "grad_norm": 1.9524492716137443, + "learning_rate": 1.2469440239062653e-06, + "loss": 0.1888, + "step": 693 + }, + { + "epoch": 0.15790671217292376, + "grad_norm": 1.978419283294589, + "learning_rate": 1.2469351924544766e-06, + "loss": 0.168, + "step": 694 + }, + { + "epoch": 0.15813424345847554, + "grad_norm": 1.909977232991382, + "learning_rate": 1.2469263482914716e-06, + "loss": 0.1302, + "step": 695 + }, + { + "epoch": 0.1583617747440273, + "grad_norm": 2.786836009335205, + "learning_rate": 1.246917491417431e-06, + "loss": 0.1603, + "step": 696 + }, + { + "epoch": 0.15858930602957907, + "grad_norm": 2.700038379786115, + "learning_rate": 1.246908621832536e-06, + "loss": 0.2268, + "step": 697 + }, + { + "epoch": 0.15881683731513083, + "grad_norm": 1.4116863857464026, + "learning_rate": 1.2468997395369677e-06, + "loss": 0.1761, + "step": 698 + }, + { + "epoch": 0.1590443686006826, + "grad_norm": 2.8928190492615133, + "learning_rate": 1.2468908445309077e-06, + "loss": 0.1789, + "step": 699 + }, + { + "epoch": 0.15927189988623436, + "grad_norm": 1.650749552825084, + "learning_rate": 1.2468819368145376e-06, + "loss": 0.1324, + "step": 700 + }, + { + "epoch": 0.1594994311717861, + "grad_norm": 2.3722473947353677, + "learning_rate": 1.2468730163880398e-06, + "loss": 0.1116, + "step": 701 + }, + { + "epoch": 0.1597269624573379, + "grad_norm": 2.879822957568519, + "learning_rate": 1.2468640832515962e-06, + "loss": 0.0564, + "step": 702 + }, + { + "epoch": 0.15995449374288964, + "grad_norm": 2.162764734574199, + "learning_rate": 1.24685513740539e-06, + "loss": 0.1739, + "step": 703 + }, + { + "epoch": 0.16018202502844142, + "grad_norm": 2.8968364936480206, + "learning_rate": 1.2468461788496036e-06, + "loss": 0.2091, + "step": 704 + }, + { + "epoch": 0.16040955631399317, + "grad_norm": 1.8559610510087743, + "learning_rate": 1.24683720758442e-06, + "loss": 0.1533, + "step": 705 + }, + { + "epoch": 0.16063708759954493, + "grad_norm": 2.184281056476426, + "learning_rate": 1.2468282236100226e-06, + "loss": 0.1582, + "step": 706 + }, + { + "epoch": 0.1608646188850967, + "grad_norm": 1.3209438595657337, + "learning_rate": 1.2468192269265955e-06, + "loss": 0.1914, + "step": 707 + }, + { + "epoch": 0.16109215017064846, + "grad_norm": 2.1470386790088174, + "learning_rate": 1.246810217534322e-06, + "loss": 0.0831, + "step": 708 + }, + { + "epoch": 0.16131968145620024, + "grad_norm": 1.594792083731403, + "learning_rate": 1.2468011954333864e-06, + "loss": 0.1349, + "step": 709 + }, + { + "epoch": 0.161547212741752, + "grad_norm": 1.9899900139983586, + "learning_rate": 1.2467921606239734e-06, + "loss": 0.1406, + "step": 710 + }, + { + "epoch": 0.16177474402730374, + "grad_norm": 2.161056989124219, + "learning_rate": 1.2467831131062672e-06, + "loss": 0.1186, + "step": 711 + }, + { + "epoch": 0.16200227531285552, + "grad_norm": 3.2786168252573438, + "learning_rate": 1.2467740528804528e-06, + "loss": 0.1525, + "step": 712 + }, + { + "epoch": 0.16222980659840727, + "grad_norm": 2.152367629184536, + "learning_rate": 1.2467649799467156e-06, + "loss": 0.1403, + "step": 713 + }, + { + "epoch": 0.16245733788395905, + "grad_norm": 2.658644939282435, + "learning_rate": 1.246755894305241e-06, + "loss": 0.1287, + "step": 714 + }, + { + "epoch": 0.1626848691695108, + "grad_norm": 1.8320157906526173, + "learning_rate": 1.2467467959562143e-06, + "loss": 0.1489, + "step": 715 + }, + { + "epoch": 0.16291240045506258, + "grad_norm": 3.0792158572997526, + "learning_rate": 1.2467376848998221e-06, + "loss": 0.1929, + "step": 716 + }, + { + "epoch": 0.16313993174061434, + "grad_norm": 2.592666663523021, + "learning_rate": 1.2467285611362501e-06, + "loss": 0.1198, + "step": 717 + }, + { + "epoch": 0.1633674630261661, + "grad_norm": 2.3270639642215123, + "learning_rate": 1.2467194246656851e-06, + "loss": 0.119, + "step": 718 + }, + { + "epoch": 0.16359499431171787, + "grad_norm": 1.5662096056295784, + "learning_rate": 1.2467102754883136e-06, + "loss": 0.1488, + "step": 719 + }, + { + "epoch": 0.16382252559726962, + "grad_norm": 2.0754259992407174, + "learning_rate": 1.2467011136043228e-06, + "loss": 0.1206, + "step": 720 + }, + { + "epoch": 0.1640500568828214, + "grad_norm": 2.377809704915352, + "learning_rate": 1.2466919390138995e-06, + "loss": 0.2349, + "step": 721 + }, + { + "epoch": 0.16427758816837315, + "grad_norm": 2.1373727350700205, + "learning_rate": 1.246682751717232e-06, + "loss": 0.1333, + "step": 722 + }, + { + "epoch": 0.1645051194539249, + "grad_norm": 3.8601459911234697, + "learning_rate": 1.2466735517145074e-06, + "loss": 0.3259, + "step": 723 + }, + { + "epoch": 0.16473265073947668, + "grad_norm": 2.1273982856593614, + "learning_rate": 1.2466643390059138e-06, + "loss": 0.199, + "step": 724 + }, + { + "epoch": 0.16496018202502843, + "grad_norm": 2.274158988300012, + "learning_rate": 1.2466551135916398e-06, + "loss": 0.1351, + "step": 725 + }, + { + "epoch": 0.16518771331058021, + "grad_norm": 2.1566789936379287, + "learning_rate": 1.2466458754718737e-06, + "loss": 0.219, + "step": 726 + }, + { + "epoch": 0.16541524459613197, + "grad_norm": 3.388462178150055, + "learning_rate": 1.2466366246468045e-06, + "loss": 0.1456, + "step": 727 + }, + { + "epoch": 0.16564277588168372, + "grad_norm": 2.792548754369155, + "learning_rate": 1.246627361116621e-06, + "loss": 0.2178, + "step": 728 + }, + { + "epoch": 0.1658703071672355, + "grad_norm": 1.7787275123381943, + "learning_rate": 1.246618084881513e-06, + "loss": 0.2584, + "step": 729 + }, + { + "epoch": 0.16609783845278725, + "grad_norm": 2.150845029279013, + "learning_rate": 1.2466087959416695e-06, + "loss": 0.1474, + "step": 730 + }, + { + "epoch": 0.16632536973833903, + "grad_norm": 3.4162019984229213, + "learning_rate": 1.2465994942972805e-06, + "loss": 0.1415, + "step": 731 + }, + { + "epoch": 0.16655290102389078, + "grad_norm": 3.5172418167047743, + "learning_rate": 1.2465901799485366e-06, + "loss": 0.2267, + "step": 732 + }, + { + "epoch": 0.16678043230944256, + "grad_norm": 1.9664520821504867, + "learning_rate": 1.2465808528956277e-06, + "loss": 0.1027, + "step": 733 + }, + { + "epoch": 0.1670079635949943, + "grad_norm": 2.053925645911197, + "learning_rate": 1.2465715131387446e-06, + "loss": 0.1405, + "step": 734 + }, + { + "epoch": 0.16723549488054607, + "grad_norm": 1.6417683696863474, + "learning_rate": 1.2465621606780778e-06, + "loss": 0.1804, + "step": 735 + }, + { + "epoch": 0.16746302616609784, + "grad_norm": 1.9532511665276102, + "learning_rate": 1.2465527955138191e-06, + "loss": 0.1438, + "step": 736 + }, + { + "epoch": 0.1676905574516496, + "grad_norm": 2.7978077296538295, + "learning_rate": 1.2465434176461596e-06, + "loss": 0.1806, + "step": 737 + }, + { + "epoch": 0.16791808873720138, + "grad_norm": 1.7861222447513503, + "learning_rate": 1.2465340270752908e-06, + "loss": 0.0953, + "step": 738 + }, + { + "epoch": 0.16814562002275313, + "grad_norm": 1.2545980680473232, + "learning_rate": 1.2465246238014047e-06, + "loss": 0.0881, + "step": 739 + }, + { + "epoch": 0.16837315130830488, + "grad_norm": 2.49195685975364, + "learning_rate": 1.2465152078246936e-06, + "loss": 0.1643, + "step": 740 + }, + { + "epoch": 0.16860068259385666, + "grad_norm": 2.0211233157427637, + "learning_rate": 1.24650577914535e-06, + "loss": 0.1263, + "step": 741 + }, + { + "epoch": 0.1688282138794084, + "grad_norm": 2.7858317155477317, + "learning_rate": 1.2464963377635667e-06, + "loss": 0.1547, + "step": 742 + }, + { + "epoch": 0.1690557451649602, + "grad_norm": 1.7097291360774547, + "learning_rate": 1.246486883679536e-06, + "loss": 0.2516, + "step": 743 + }, + { + "epoch": 0.16928327645051194, + "grad_norm": 3.9137648292026737, + "learning_rate": 1.246477416893452e-06, + "loss": 0.2036, + "step": 744 + }, + { + "epoch": 0.1695108077360637, + "grad_norm": 3.005605654107358, + "learning_rate": 1.2464679374055074e-06, + "loss": 0.1481, + "step": 745 + }, + { + "epoch": 0.16973833902161548, + "grad_norm": 3.401532765227879, + "learning_rate": 1.2464584452158968e-06, + "loss": 0.1841, + "step": 746 + }, + { + "epoch": 0.16996587030716723, + "grad_norm": 2.843140048954733, + "learning_rate": 1.2464489403248133e-06, + "loss": 0.184, + "step": 747 + }, + { + "epoch": 0.170193401592719, + "grad_norm": 1.515779223289782, + "learning_rate": 1.246439422732452e-06, + "loss": 0.1262, + "step": 748 + }, + { + "epoch": 0.17042093287827076, + "grad_norm": 2.618293101772126, + "learning_rate": 1.2464298924390066e-06, + "loss": 0.1415, + "step": 749 + }, + { + "epoch": 0.17064846416382254, + "grad_norm": 2.248269138511338, + "learning_rate": 1.2464203494446725e-06, + "loss": 0.185, + "step": 750 + }, + { + "epoch": 0.1708759954493743, + "grad_norm": 1.3558978429200024, + "learning_rate": 1.2464107937496444e-06, + "loss": 0.096, + "step": 751 + }, + { + "epoch": 0.17110352673492604, + "grad_norm": 1.8355286869437153, + "learning_rate": 1.246401225354118e-06, + "loss": 0.0936, + "step": 752 + }, + { + "epoch": 0.17133105802047782, + "grad_norm": 2.611386377303649, + "learning_rate": 1.2463916442582883e-06, + "loss": 0.2058, + "step": 753 + }, + { + "epoch": 0.17155858930602957, + "grad_norm": 1.81511526173022, + "learning_rate": 1.2463820504623516e-06, + "loss": 0.0722, + "step": 754 + }, + { + "epoch": 0.17178612059158135, + "grad_norm": 1.6836561465138316, + "learning_rate": 1.246372443966504e-06, + "loss": 0.1419, + "step": 755 + }, + { + "epoch": 0.1720136518771331, + "grad_norm": 3.189715404864015, + "learning_rate": 1.246362824770941e-06, + "loss": 0.1604, + "step": 756 + }, + { + "epoch": 0.17224118316268486, + "grad_norm": 2.8556456489625193, + "learning_rate": 1.2463531928758605e-06, + "loss": 0.1793, + "step": 757 + }, + { + "epoch": 0.17246871444823664, + "grad_norm": 2.1490228034084344, + "learning_rate": 1.2463435482814585e-06, + "loss": 0.1928, + "step": 758 + }, + { + "epoch": 0.1726962457337884, + "grad_norm": 1.866877451814791, + "learning_rate": 1.246333890987932e-06, + "loss": 0.2064, + "step": 759 + }, + { + "epoch": 0.17292377701934017, + "grad_norm": 2.7361601673612284, + "learning_rate": 1.246324220995479e-06, + "loss": 0.1024, + "step": 760 + }, + { + "epoch": 0.17315130830489192, + "grad_norm": 3.6715173407277004, + "learning_rate": 1.2463145383042966e-06, + "loss": 0.1741, + "step": 761 + }, + { + "epoch": 0.17337883959044367, + "grad_norm": 4.388914943676026, + "learning_rate": 1.2463048429145832e-06, + "loss": 0.2951, + "step": 762 + }, + { + "epoch": 0.17360637087599545, + "grad_norm": 3.0864567661578075, + "learning_rate": 1.2462951348265364e-06, + "loss": 0.1681, + "step": 763 + }, + { + "epoch": 0.1738339021615472, + "grad_norm": 2.2429137189515487, + "learning_rate": 1.2462854140403553e-06, + "loss": 0.1698, + "step": 764 + }, + { + "epoch": 0.17406143344709898, + "grad_norm": 3.7655750343422487, + "learning_rate": 1.2462756805562378e-06, + "loss": 0.1972, + "step": 765 + }, + { + "epoch": 0.17428896473265074, + "grad_norm": 1.4821109763148475, + "learning_rate": 1.2462659343743832e-06, + "loss": 0.1144, + "step": 766 + }, + { + "epoch": 0.17451649601820252, + "grad_norm": 2.9261323093043234, + "learning_rate": 1.2462561754949908e-06, + "loss": 0.1354, + "step": 767 + }, + { + "epoch": 0.17474402730375427, + "grad_norm": 2.021278631174851, + "learning_rate": 1.2462464039182598e-06, + "loss": 0.1158, + "step": 768 + }, + { + "epoch": 0.17497155858930602, + "grad_norm": 2.189903163956334, + "learning_rate": 1.2462366196443903e-06, + "loss": 0.1587, + "step": 769 + }, + { + "epoch": 0.1751990898748578, + "grad_norm": 3.7285174958892364, + "learning_rate": 1.246226822673582e-06, + "loss": 0.2024, + "step": 770 + }, + { + "epoch": 0.17542662116040955, + "grad_norm": 1.9007743093993184, + "learning_rate": 1.2462170130060351e-06, + "loss": 0.1025, + "step": 771 + }, + { + "epoch": 0.17565415244596133, + "grad_norm": 3.3341124392840134, + "learning_rate": 1.24620719064195e-06, + "loss": 0.1718, + "step": 772 + }, + { + "epoch": 0.17588168373151308, + "grad_norm": 2.271177623744295, + "learning_rate": 1.246197355581528e-06, + "loss": 0.1713, + "step": 773 + }, + { + "epoch": 0.17610921501706484, + "grad_norm": 2.631276315974309, + "learning_rate": 1.2461875078249694e-06, + "loss": 0.1769, + "step": 774 + }, + { + "epoch": 0.17633674630261661, + "grad_norm": 2.2924143983188765, + "learning_rate": 1.246177647372476e-06, + "loss": 0.1155, + "step": 775 + }, + { + "epoch": 0.17656427758816837, + "grad_norm": 4.145219852575127, + "learning_rate": 1.246167774224249e-06, + "loss": 0.1997, + "step": 776 + }, + { + "epoch": 0.17679180887372015, + "grad_norm": 3.5955716696986237, + "learning_rate": 1.2461578883804903e-06, + "loss": 0.1434, + "step": 777 + }, + { + "epoch": 0.1770193401592719, + "grad_norm": 3.5823237759342477, + "learning_rate": 1.246147989841402e-06, + "loss": 0.131, + "step": 778 + }, + { + "epoch": 0.17724687144482365, + "grad_norm": 1.7885388560764315, + "learning_rate": 1.2461380786071863e-06, + "loss": 0.0755, + "step": 779 + }, + { + "epoch": 0.17747440273037543, + "grad_norm": 2.362853335883513, + "learning_rate": 1.246128154678046e-06, + "loss": 0.1285, + "step": 780 + }, + { + "epoch": 0.17770193401592718, + "grad_norm": 2.826403481752188, + "learning_rate": 1.2461182180541835e-06, + "loss": 0.0898, + "step": 781 + }, + { + "epoch": 0.17792946530147896, + "grad_norm": 5.793503549962082, + "learning_rate": 1.2461082687358022e-06, + "loss": 0.0971, + "step": 782 + }, + { + "epoch": 0.1781569965870307, + "grad_norm": 1.8035940463938722, + "learning_rate": 1.2460983067231055e-06, + "loss": 0.1105, + "step": 783 + }, + { + "epoch": 0.1783845278725825, + "grad_norm": 2.3286047675537613, + "learning_rate": 1.246088332016297e-06, + "loss": 0.0997, + "step": 784 + }, + { + "epoch": 0.17861205915813425, + "grad_norm": 2.4331158536688067, + "learning_rate": 1.2460783446155802e-06, + "loss": 0.2145, + "step": 785 + }, + { + "epoch": 0.178839590443686, + "grad_norm": 2.4301917574272234, + "learning_rate": 1.2460683445211596e-06, + "loss": 0.1826, + "step": 786 + }, + { + "epoch": 0.17906712172923778, + "grad_norm": 3.191042960124482, + "learning_rate": 1.2460583317332395e-06, + "loss": 0.2224, + "step": 787 + }, + { + "epoch": 0.17929465301478953, + "grad_norm": 1.9281932990563415, + "learning_rate": 1.2460483062520246e-06, + "loss": 0.1012, + "step": 788 + }, + { + "epoch": 0.1795221843003413, + "grad_norm": 1.9401318974845003, + "learning_rate": 1.2460382680777196e-06, + "loss": 0.0761, + "step": 789 + }, + { + "epoch": 0.17974971558589306, + "grad_norm": 13.086161362963225, + "learning_rate": 1.2460282172105298e-06, + "loss": 0.2088, + "step": 790 + }, + { + "epoch": 0.1799772468714448, + "grad_norm": 1.4783130702588718, + "learning_rate": 1.2460181536506608e-06, + "loss": 0.2126, + "step": 791 + }, + { + "epoch": 0.1802047781569966, + "grad_norm": 2.4964786740518763, + "learning_rate": 1.2460080773983177e-06, + "loss": 0.1385, + "step": 792 + }, + { + "epoch": 0.18043230944254834, + "grad_norm": 2.7778972521749545, + "learning_rate": 1.2459979884537072e-06, + "loss": 0.1448, + "step": 793 + }, + { + "epoch": 0.18065984072810012, + "grad_norm": 2.167813491126184, + "learning_rate": 1.2459878868170348e-06, + "loss": 0.1379, + "step": 794 + }, + { + "epoch": 0.18088737201365188, + "grad_norm": 1.9654699615947284, + "learning_rate": 1.2459777724885075e-06, + "loss": 0.1314, + "step": 795 + }, + { + "epoch": 0.18111490329920363, + "grad_norm": 2.293952257528565, + "learning_rate": 1.2459676454683318e-06, + "loss": 0.1695, + "step": 796 + }, + { + "epoch": 0.1813424345847554, + "grad_norm": 3.9215044200778144, + "learning_rate": 1.2459575057567144e-06, + "loss": 0.2204, + "step": 797 + }, + { + "epoch": 0.18156996587030716, + "grad_norm": 2.8214133097210117, + "learning_rate": 1.245947353353863e-06, + "loss": 0.1558, + "step": 798 + }, + { + "epoch": 0.18179749715585894, + "grad_norm": 5.317020653859289, + "learning_rate": 1.245937188259985e-06, + "loss": 0.2603, + "step": 799 + }, + { + "epoch": 0.1820250284414107, + "grad_norm": 4.004955818619992, + "learning_rate": 1.245927010475288e-06, + "loss": 0.1196, + "step": 800 + }, + { + "epoch": 0.18225255972696247, + "grad_norm": 3.792524464667178, + "learning_rate": 1.24591681999998e-06, + "loss": 0.1821, + "step": 801 + }, + { + "epoch": 0.18248009101251422, + "grad_norm": 2.813011742342484, + "learning_rate": 1.2459066168342693e-06, + "loss": 0.1513, + "step": 802 + }, + { + "epoch": 0.18270762229806597, + "grad_norm": 3.511510747002315, + "learning_rate": 1.2458964009783646e-06, + "loss": 0.2163, + "step": 803 + }, + { + "epoch": 0.18293515358361775, + "grad_norm": 2.802158661308834, + "learning_rate": 1.2458861724324745e-06, + "loss": 0.1963, + "step": 804 + }, + { + "epoch": 0.1831626848691695, + "grad_norm": 3.64850186041969, + "learning_rate": 1.2458759311968084e-06, + "loss": 0.303, + "step": 805 + }, + { + "epoch": 0.1833902161547213, + "grad_norm": 2.6182595326596725, + "learning_rate": 1.245865677271575e-06, + "loss": 0.1456, + "step": 806 + }, + { + "epoch": 0.18361774744027304, + "grad_norm": 2.399741320725503, + "learning_rate": 1.2458554106569844e-06, + "loss": 0.2288, + "step": 807 + }, + { + "epoch": 0.1838452787258248, + "grad_norm": 1.252106549654472, + "learning_rate": 1.2458451313532463e-06, + "loss": 0.0801, + "step": 808 + }, + { + "epoch": 0.18407281001137657, + "grad_norm": 3.696224132577839, + "learning_rate": 1.2458348393605708e-06, + "loss": 0.2059, + "step": 809 + }, + { + "epoch": 0.18430034129692832, + "grad_norm": 1.3783330613855644, + "learning_rate": 1.2458245346791678e-06, + "loss": 0.1164, + "step": 810 + }, + { + "epoch": 0.1845278725824801, + "grad_norm": 1.5623432135982267, + "learning_rate": 1.2458142173092486e-06, + "loss": 0.176, + "step": 811 + }, + { + "epoch": 0.18475540386803185, + "grad_norm": 6.552053967433837, + "learning_rate": 1.2458038872510237e-06, + "loss": 0.118, + "step": 812 + }, + { + "epoch": 0.1849829351535836, + "grad_norm": 3.2237210845046964, + "learning_rate": 1.2457935445047042e-06, + "loss": 0.1875, + "step": 813 + }, + { + "epoch": 0.18521046643913538, + "grad_norm": 1.7463109516387256, + "learning_rate": 1.2457831890705018e-06, + "loss": 0.1945, + "step": 814 + }, + { + "epoch": 0.18543799772468714, + "grad_norm": 2.8292409598595953, + "learning_rate": 1.2457728209486279e-06, + "loss": 0.1711, + "step": 815 + }, + { + "epoch": 0.18566552901023892, + "grad_norm": 3.198074487753419, + "learning_rate": 1.2457624401392943e-06, + "loss": 0.2552, + "step": 816 + }, + { + "epoch": 0.18589306029579067, + "grad_norm": 3.2293783551138278, + "learning_rate": 1.2457520466427135e-06, + "loss": 0.1955, + "step": 817 + }, + { + "epoch": 0.18612059158134245, + "grad_norm": 2.5604778410965383, + "learning_rate": 1.2457416404590974e-06, + "loss": 0.1689, + "step": 818 + }, + { + "epoch": 0.1863481228668942, + "grad_norm": 2.4475267016374427, + "learning_rate": 1.2457312215886592e-06, + "loss": 0.1165, + "step": 819 + }, + { + "epoch": 0.18657565415244595, + "grad_norm": 1.9856047790588058, + "learning_rate": 1.2457207900316115e-06, + "loss": 0.195, + "step": 820 + }, + { + "epoch": 0.18680318543799773, + "grad_norm": 3.030251865029441, + "learning_rate": 1.245710345788168e-06, + "loss": 0.2233, + "step": 821 + }, + { + "epoch": 0.18703071672354948, + "grad_norm": 6.914472069589314, + "learning_rate": 1.2456998888585414e-06, + "loss": 0.1294, + "step": 822 + }, + { + "epoch": 0.18725824800910126, + "grad_norm": 1.5392801223632877, + "learning_rate": 1.245689419242946e-06, + "loss": 0.1031, + "step": 823 + }, + { + "epoch": 0.18748577929465302, + "grad_norm": 1.5563008585328006, + "learning_rate": 1.2456789369415955e-06, + "loss": 0.1233, + "step": 824 + }, + { + "epoch": 0.18771331058020477, + "grad_norm": 1.5005319006316646, + "learning_rate": 1.2456684419547044e-06, + "loss": 0.1698, + "step": 825 + }, + { + "epoch": 0.18794084186575655, + "grad_norm": 2.5311436309198245, + "learning_rate": 1.245657934282487e-06, + "loss": 0.1242, + "step": 826 + }, + { + "epoch": 0.1881683731513083, + "grad_norm": 1.3382771790085715, + "learning_rate": 1.245647413925158e-06, + "loss": 0.1173, + "step": 827 + }, + { + "epoch": 0.18839590443686008, + "grad_norm": 2.455502403566395, + "learning_rate": 1.2456368808829327e-06, + "loss": 0.0912, + "step": 828 + }, + { + "epoch": 0.18862343572241183, + "grad_norm": 2.9752303589937212, + "learning_rate": 1.2456263351560261e-06, + "loss": 0.2599, + "step": 829 + }, + { + "epoch": 0.18885096700796358, + "grad_norm": 5.043835077918359, + "learning_rate": 1.2456157767446538e-06, + "loss": 0.1609, + "step": 830 + }, + { + "epoch": 0.18907849829351536, + "grad_norm": 2.756359704558054, + "learning_rate": 1.245605205649032e-06, + "loss": 0.1323, + "step": 831 + }, + { + "epoch": 0.18930602957906711, + "grad_norm": 1.835440265718024, + "learning_rate": 1.245594621869376e-06, + "loss": 0.2094, + "step": 832 + }, + { + "epoch": 0.1895335608646189, + "grad_norm": 1.2880237601014817, + "learning_rate": 1.2455840254059026e-06, + "loss": 0.1085, + "step": 833 + }, + { + "epoch": 0.18976109215017065, + "grad_norm": 1.4808086873300856, + "learning_rate": 1.2455734162588282e-06, + "loss": 0.1067, + "step": 834 + }, + { + "epoch": 0.1899886234357224, + "grad_norm": 2.3351983872627597, + "learning_rate": 1.2455627944283697e-06, + "loss": 0.1493, + "step": 835 + }, + { + "epoch": 0.19021615472127418, + "grad_norm": 2.422722379821762, + "learning_rate": 1.245552159914744e-06, + "loss": 0.1387, + "step": 836 + }, + { + "epoch": 0.19044368600682593, + "grad_norm": 2.2005548282870477, + "learning_rate": 1.245541512718169e-06, + "loss": 0.1047, + "step": 837 + }, + { + "epoch": 0.1906712172923777, + "grad_norm": 2.379475571028047, + "learning_rate": 1.245530852838862e-06, + "loss": 0.1524, + "step": 838 + }, + { + "epoch": 0.19089874857792946, + "grad_norm": 1.669935289366072, + "learning_rate": 1.2455201802770405e-06, + "loss": 0.157, + "step": 839 + }, + { + "epoch": 0.19112627986348124, + "grad_norm": 2.357020791051429, + "learning_rate": 1.245509495032923e-06, + "loss": 0.2156, + "step": 840 + }, + { + "epoch": 0.191353811149033, + "grad_norm": 3.871602599108809, + "learning_rate": 1.2454987971067278e-06, + "loss": 0.1557, + "step": 841 + }, + { + "epoch": 0.19158134243458474, + "grad_norm": 2.5332197020943887, + "learning_rate": 1.2454880864986737e-06, + "loss": 0.1644, + "step": 842 + }, + { + "epoch": 0.19180887372013652, + "grad_norm": 3.1286962973408596, + "learning_rate": 1.2454773632089795e-06, + "loss": 0.0794, + "step": 843 + }, + { + "epoch": 0.19203640500568828, + "grad_norm": 2.3210649274985666, + "learning_rate": 1.2454666272378644e-06, + "loss": 0.129, + "step": 844 + }, + { + "epoch": 0.19226393629124006, + "grad_norm": 3.000200402253768, + "learning_rate": 1.2454558785855475e-06, + "loss": 0.1628, + "step": 845 + }, + { + "epoch": 0.1924914675767918, + "grad_norm": 2.3643323080869902, + "learning_rate": 1.245445117252249e-06, + "loss": 0.1345, + "step": 846 + }, + { + "epoch": 0.19271899886234356, + "grad_norm": 2.532625203594351, + "learning_rate": 1.2454343432381886e-06, + "loss": 0.2082, + "step": 847 + }, + { + "epoch": 0.19294653014789534, + "grad_norm": 1.9628657145639428, + "learning_rate": 1.2454235565435862e-06, + "loss": 0.0782, + "step": 848 + }, + { + "epoch": 0.1931740614334471, + "grad_norm": 1.609178421923729, + "learning_rate": 1.2454127571686629e-06, + "loss": 0.1405, + "step": 849 + }, + { + "epoch": 0.19340159271899887, + "grad_norm": 1.7728115247069527, + "learning_rate": 1.245401945113639e-06, + "loss": 0.203, + "step": 850 + }, + { + "epoch": 0.19362912400455062, + "grad_norm": 3.2450475274049118, + "learning_rate": 1.2453911203787355e-06, + "loss": 0.1524, + "step": 851 + }, + { + "epoch": 0.19385665529010238, + "grad_norm": 22.097060091469434, + "learning_rate": 1.2453802829641736e-06, + "loss": 0.2636, + "step": 852 + }, + { + "epoch": 0.19408418657565416, + "grad_norm": 2.5365065820289496, + "learning_rate": 1.2453694328701752e-06, + "loss": 0.1019, + "step": 853 + }, + { + "epoch": 0.1943117178612059, + "grad_norm": 2.090322149834491, + "learning_rate": 1.2453585700969614e-06, + "loss": 0.1498, + "step": 854 + }, + { + "epoch": 0.1945392491467577, + "grad_norm": 2.6606765925685787, + "learning_rate": 1.2453476946447547e-06, + "loss": 0.1398, + "step": 855 + }, + { + "epoch": 0.19476678043230944, + "grad_norm": 3.56083888144899, + "learning_rate": 1.2453368065137772e-06, + "loss": 0.1463, + "step": 856 + }, + { + "epoch": 0.19499431171786122, + "grad_norm": 2.1276836242796793, + "learning_rate": 1.2453259057042514e-06, + "loss": 0.1753, + "step": 857 + }, + { + "epoch": 0.19522184300341297, + "grad_norm": 2.5690977004159805, + "learning_rate": 1.2453149922164003e-06, + "loss": 0.1292, + "step": 858 + }, + { + "epoch": 0.19544937428896472, + "grad_norm": 4.345742784369693, + "learning_rate": 1.2453040660504468e-06, + "loss": 0.15, + "step": 859 + }, + { + "epoch": 0.1956769055745165, + "grad_norm": 3.118246879884093, + "learning_rate": 1.2452931272066141e-06, + "loss": 0.169, + "step": 860 + }, + { + "epoch": 0.19590443686006825, + "grad_norm": 2.68254786515319, + "learning_rate": 1.245282175685126e-06, + "loss": 0.157, + "step": 861 + }, + { + "epoch": 0.19613196814562003, + "grad_norm": 2.088476673647213, + "learning_rate": 1.2452712114862063e-06, + "loss": 0.1782, + "step": 862 + }, + { + "epoch": 0.19635949943117179, + "grad_norm": 1.568141769132608, + "learning_rate": 1.245260234610079e-06, + "loss": 0.1295, + "step": 863 + }, + { + "epoch": 0.19658703071672354, + "grad_norm": 2.186319656948205, + "learning_rate": 1.2452492450569682e-06, + "loss": 0.1734, + "step": 864 + }, + { + "epoch": 0.19681456200227532, + "grad_norm": 2.7655739546712135, + "learning_rate": 1.245238242827099e-06, + "loss": 0.1694, + "step": 865 + }, + { + "epoch": 0.19704209328782707, + "grad_norm": 3.0373302408208196, + "learning_rate": 1.245227227920696e-06, + "loss": 0.1356, + "step": 866 + }, + { + "epoch": 0.19726962457337885, + "grad_norm": 2.1820099415146914, + "learning_rate": 1.2452162003379842e-06, + "loss": 0.2082, + "step": 867 + }, + { + "epoch": 0.1974971558589306, + "grad_norm": 3.6721625065681827, + "learning_rate": 1.2452051600791891e-06, + "loss": 0.1915, + "step": 868 + }, + { + "epoch": 0.19772468714448235, + "grad_norm": 6.490462296454016, + "learning_rate": 1.2451941071445367e-06, + "loss": 0.1815, + "step": 869 + }, + { + "epoch": 0.19795221843003413, + "grad_norm": 3.246518762107006, + "learning_rate": 1.2451830415342524e-06, + "loss": 0.137, + "step": 870 + }, + { + "epoch": 0.19817974971558588, + "grad_norm": 2.7033364330836873, + "learning_rate": 1.2451719632485627e-06, + "loss": 0.1317, + "step": 871 + }, + { + "epoch": 0.19840728100113766, + "grad_norm": 3.30778551761739, + "learning_rate": 1.2451608722876938e-06, + "loss": 0.1099, + "step": 872 + }, + { + "epoch": 0.19863481228668942, + "grad_norm": 2.2687509460631294, + "learning_rate": 1.2451497686518722e-06, + "loss": 0.1361, + "step": 873 + }, + { + "epoch": 0.1988623435722412, + "grad_norm": 1.641721237453431, + "learning_rate": 1.2451386523413252e-06, + "loss": 0.1052, + "step": 874 + }, + { + "epoch": 0.19908987485779295, + "grad_norm": 2.206444085506852, + "learning_rate": 1.24512752335628e-06, + "loss": 0.1018, + "step": 875 + }, + { + "epoch": 0.1993174061433447, + "grad_norm": 2.210652731669232, + "learning_rate": 1.2451163816969639e-06, + "loss": 0.1879, + "step": 876 + }, + { + "epoch": 0.19954493742889648, + "grad_norm": 2.085600222270482, + "learning_rate": 1.2451052273636045e-06, + "loss": 0.127, + "step": 877 + }, + { + "epoch": 0.19977246871444823, + "grad_norm": 2.6309536592299705, + "learning_rate": 1.24509406035643e-06, + "loss": 0.1678, + "step": 878 + }, + { + "epoch": 0.2, + "grad_norm": 4.158698099165945, + "learning_rate": 1.2450828806756685e-06, + "loss": 0.2095, + "step": 879 + }, + { + "epoch": 0.20022753128555176, + "grad_norm": 2.602198490586786, + "learning_rate": 1.245071688321549e-06, + "loss": 0.1436, + "step": 880 + }, + { + "epoch": 0.20045506257110352, + "grad_norm": 2.252594865848713, + "learning_rate": 1.2450604832942991e-06, + "loss": 0.1231, + "step": 881 + }, + { + "epoch": 0.2006825938566553, + "grad_norm": 1.912453352899942, + "learning_rate": 1.245049265594149e-06, + "loss": 0.1408, + "step": 882 + }, + { + "epoch": 0.20091012514220705, + "grad_norm": 3.264942350461524, + "learning_rate": 1.2450380352213271e-06, + "loss": 0.1697, + "step": 883 + }, + { + "epoch": 0.20113765642775883, + "grad_norm": 2.415399674888119, + "learning_rate": 1.2450267921760636e-06, + "loss": 0.1331, + "step": 884 + }, + { + "epoch": 0.20136518771331058, + "grad_norm": 2.62867521080006, + "learning_rate": 1.2450155364585878e-06, + "loss": 0.1217, + "step": 885 + }, + { + "epoch": 0.20159271899886233, + "grad_norm": 2.3552959017058477, + "learning_rate": 1.2450042680691301e-06, + "loss": 0.1216, + "step": 886 + }, + { + "epoch": 0.2018202502844141, + "grad_norm": 1.4369969713280852, + "learning_rate": 1.2449929870079206e-06, + "loss": 0.1282, + "step": 887 + }, + { + "epoch": 0.20204778156996586, + "grad_norm": 2.305787931213179, + "learning_rate": 1.24498169327519e-06, + "loss": 0.1076, + "step": 888 + }, + { + "epoch": 0.20227531285551764, + "grad_norm": 1.7868835912702514, + "learning_rate": 1.2449703868711688e-06, + "loss": 0.1225, + "step": 889 + }, + { + "epoch": 0.2025028441410694, + "grad_norm": 2.1124657583403494, + "learning_rate": 1.2449590677960886e-06, + "loss": 0.1765, + "step": 890 + }, + { + "epoch": 0.20273037542662117, + "grad_norm": 1.6102832172606196, + "learning_rate": 1.2449477360501802e-06, + "loss": 0.0719, + "step": 891 + }, + { + "epoch": 0.20295790671217293, + "grad_norm": 3.8988824882283843, + "learning_rate": 1.2449363916336756e-06, + "loss": 0.1854, + "step": 892 + }, + { + "epoch": 0.20318543799772468, + "grad_norm": 3.2116126604298882, + "learning_rate": 1.2449250345468065e-06, + "loss": 0.2028, + "step": 893 + }, + { + "epoch": 0.20341296928327646, + "grad_norm": 2.083882159988442, + "learning_rate": 1.244913664789805e-06, + "loss": 0.1337, + "step": 894 + }, + { + "epoch": 0.2036405005688282, + "grad_norm": 1.8394649372022975, + "learning_rate": 1.2449022823629036e-06, + "loss": 0.1205, + "step": 895 + }, + { + "epoch": 0.20386803185438, + "grad_norm": 2.6323013014057004, + "learning_rate": 1.2448908872663347e-06, + "loss": 0.1133, + "step": 896 + }, + { + "epoch": 0.20409556313993174, + "grad_norm": 1.8291857038844686, + "learning_rate": 1.2448794795003313e-06, + "loss": 0.1142, + "step": 897 + }, + { + "epoch": 0.2043230944254835, + "grad_norm": 1.7184606914815217, + "learning_rate": 1.2448680590651269e-06, + "loss": 0.1222, + "step": 898 + }, + { + "epoch": 0.20455062571103527, + "grad_norm": 2.7034652156706716, + "learning_rate": 1.2448566259609543e-06, + "loss": 0.1991, + "step": 899 + }, + { + "epoch": 0.20477815699658702, + "grad_norm": 2.5930455129642653, + "learning_rate": 1.2448451801880476e-06, + "loss": 0.1085, + "step": 900 + }, + { + "epoch": 0.2050056882821388, + "grad_norm": 2.44560677998223, + "learning_rate": 1.2448337217466404e-06, + "loss": 0.1735, + "step": 901 + }, + { + "epoch": 0.20523321956769056, + "grad_norm": 2.257000828394708, + "learning_rate": 1.2448222506369675e-06, + "loss": 0.1118, + "step": 902 + }, + { + "epoch": 0.2054607508532423, + "grad_norm": 2.5459054260546323, + "learning_rate": 1.2448107668592626e-06, + "loss": 0.1975, + "step": 903 + }, + { + "epoch": 0.2056882821387941, + "grad_norm": 5.093888329917388, + "learning_rate": 1.244799270413761e-06, + "loss": 0.2277, + "step": 904 + }, + { + "epoch": 0.20591581342434584, + "grad_norm": 4.116266489839909, + "learning_rate": 1.2447877613006972e-06, + "loss": 0.2004, + "step": 905 + }, + { + "epoch": 0.20614334470989762, + "grad_norm": 1.8199951318249294, + "learning_rate": 1.244776239520307e-06, + "loss": 0.2131, + "step": 906 + }, + { + "epoch": 0.20637087599544937, + "grad_norm": 2.7663340604707267, + "learning_rate": 1.244764705072825e-06, + "loss": 0.2145, + "step": 907 + }, + { + "epoch": 0.20659840728100115, + "grad_norm": 1.8748872621346087, + "learning_rate": 1.2447531579584878e-06, + "loss": 0.1327, + "step": 908 + }, + { + "epoch": 0.2068259385665529, + "grad_norm": 3.4272822632320237, + "learning_rate": 1.2447415981775312e-06, + "loss": 0.2198, + "step": 909 + }, + { + "epoch": 0.20705346985210465, + "grad_norm": 3.1215491420073396, + "learning_rate": 1.2447300257301912e-06, + "loss": 0.1342, + "step": 910 + }, + { + "epoch": 0.20728100113765643, + "grad_norm": 2.5239722345332396, + "learning_rate": 1.2447184406167045e-06, + "loss": 0.1868, + "step": 911 + }, + { + "epoch": 0.2075085324232082, + "grad_norm": 1.9655955083845185, + "learning_rate": 1.2447068428373077e-06, + "loss": 0.1769, + "step": 912 + }, + { + "epoch": 0.20773606370875997, + "grad_norm": 3.157478086474276, + "learning_rate": 1.244695232392238e-06, + "loss": 0.1824, + "step": 913 + }, + { + "epoch": 0.20796359499431172, + "grad_norm": 1.9386984879122342, + "learning_rate": 1.2446836092817328e-06, + "loss": 0.1036, + "step": 914 + }, + { + "epoch": 0.20819112627986347, + "grad_norm": 2.2587342441489997, + "learning_rate": 1.2446719735060293e-06, + "loss": 0.2175, + "step": 915 + }, + { + "epoch": 0.20841865756541525, + "grad_norm": 2.3841098586953846, + "learning_rate": 1.2446603250653658e-06, + "loss": 0.1917, + "step": 916 + }, + { + "epoch": 0.208646188850967, + "grad_norm": 2.0643080194861496, + "learning_rate": 1.24464866395998e-06, + "loss": 0.1276, + "step": 917 + }, + { + "epoch": 0.20887372013651878, + "grad_norm": 1.1445975014034748, + "learning_rate": 1.2446369901901102e-06, + "loss": 0.0884, + "step": 918 + }, + { + "epoch": 0.20910125142207053, + "grad_norm": 3.359267538919808, + "learning_rate": 1.2446253037559952e-06, + "loss": 0.1214, + "step": 919 + }, + { + "epoch": 0.20932878270762229, + "grad_norm": 2.1583486474112927, + "learning_rate": 1.2446136046578739e-06, + "loss": 0.1093, + "step": 920 + }, + { + "epoch": 0.20955631399317406, + "grad_norm": 2.692763960200507, + "learning_rate": 1.2446018928959853e-06, + "loss": 0.2289, + "step": 921 + }, + { + "epoch": 0.20978384527872582, + "grad_norm": 2.356276890733175, + "learning_rate": 1.2445901684705685e-06, + "loss": 0.2222, + "step": 922 + }, + { + "epoch": 0.2100113765642776, + "grad_norm": 2.596476104334523, + "learning_rate": 1.2445784313818638e-06, + "loss": 0.1574, + "step": 923 + }, + { + "epoch": 0.21023890784982935, + "grad_norm": 2.788233818738729, + "learning_rate": 1.2445666816301102e-06, + "loss": 0.1303, + "step": 924 + }, + { + "epoch": 0.21046643913538113, + "grad_norm": 2.3013258694625245, + "learning_rate": 1.2445549192155487e-06, + "loss": 0.2232, + "step": 925 + }, + { + "epoch": 0.21069397042093288, + "grad_norm": 2.364410552617768, + "learning_rate": 1.244543144138419e-06, + "loss": 0.1967, + "step": 926 + }, + { + "epoch": 0.21092150170648463, + "grad_norm": 1.4320620142185012, + "learning_rate": 1.2445313563989624e-06, + "loss": 0.1533, + "step": 927 + }, + { + "epoch": 0.2111490329920364, + "grad_norm": 1.8979786639459473, + "learning_rate": 1.2445195559974194e-06, + "loss": 0.1494, + "step": 928 + }, + { + "epoch": 0.21137656427758816, + "grad_norm": 2.1174466003626446, + "learning_rate": 1.244507742934031e-06, + "loss": 0.1973, + "step": 929 + }, + { + "epoch": 0.21160409556313994, + "grad_norm": 2.164188059326067, + "learning_rate": 1.2444959172090393e-06, + "loss": 0.1336, + "step": 930 + }, + { + "epoch": 0.2118316268486917, + "grad_norm": 1.5503789009056947, + "learning_rate": 1.2444840788226854e-06, + "loss": 0.1948, + "step": 931 + }, + { + "epoch": 0.21205915813424345, + "grad_norm": 1.8654319466920093, + "learning_rate": 1.2444722277752114e-06, + "loss": 0.2043, + "step": 932 + }, + { + "epoch": 0.21228668941979523, + "grad_norm": 2.020474941013341, + "learning_rate": 1.2444603640668596e-06, + "loss": 0.2211, + "step": 933 + }, + { + "epoch": 0.21251422070534698, + "grad_norm": 2.0138343922511206, + "learning_rate": 1.2444484876978725e-06, + "loss": 0.1402, + "step": 934 + }, + { + "epoch": 0.21274175199089876, + "grad_norm": 1.5804379894073013, + "learning_rate": 1.2444365986684929e-06, + "loss": 0.1311, + "step": 935 + }, + { + "epoch": 0.2129692832764505, + "grad_norm": 2.2151819679335367, + "learning_rate": 1.2444246969789633e-06, + "loss": 0.0884, + "step": 936 + }, + { + "epoch": 0.21319681456200226, + "grad_norm": 2.4707341962723834, + "learning_rate": 1.2444127826295277e-06, + "loss": 0.1138, + "step": 937 + }, + { + "epoch": 0.21342434584755404, + "grad_norm": 2.142646726979162, + "learning_rate": 1.244400855620429e-06, + "loss": 0.1234, + "step": 938 + }, + { + "epoch": 0.2136518771331058, + "grad_norm": 1.3461044168942922, + "learning_rate": 1.2443889159519113e-06, + "loss": 0.0966, + "step": 939 + }, + { + "epoch": 0.21387940841865757, + "grad_norm": 2.824705608850421, + "learning_rate": 1.2443769636242185e-06, + "loss": 0.1736, + "step": 940 + }, + { + "epoch": 0.21410693970420933, + "grad_norm": 3.3926592270656526, + "learning_rate": 1.244364998637595e-06, + "loss": 0.102, + "step": 941 + }, + { + "epoch": 0.2143344709897611, + "grad_norm": 2.1478829302272278, + "learning_rate": 1.2443530209922848e-06, + "loss": 0.0958, + "step": 942 + }, + { + "epoch": 0.21456200227531286, + "grad_norm": 2.084791701381943, + "learning_rate": 1.2443410306885337e-06, + "loss": 0.128, + "step": 943 + }, + { + "epoch": 0.2147895335608646, + "grad_norm": 2.667044034523646, + "learning_rate": 1.244329027726586e-06, + "loss": 0.2088, + "step": 944 + }, + { + "epoch": 0.2150170648464164, + "grad_norm": 1.4354076627961647, + "learning_rate": 1.2443170121066872e-06, + "loss": 0.1295, + "step": 945 + }, + { + "epoch": 0.21524459613196814, + "grad_norm": 3.608014557262876, + "learning_rate": 1.2443049838290827e-06, + "loss": 0.1479, + "step": 946 + }, + { + "epoch": 0.21547212741751992, + "grad_norm": 2.4907426669888424, + "learning_rate": 1.2442929428940186e-06, + "loss": 0.2094, + "step": 947 + }, + { + "epoch": 0.21569965870307167, + "grad_norm": 1.889292577370491, + "learning_rate": 1.2442808893017414e-06, + "loss": 0.1182, + "step": 948 + }, + { + "epoch": 0.21592718998862342, + "grad_norm": 1.295703999044032, + "learning_rate": 1.2442688230524965e-06, + "loss": 0.1493, + "step": 949 + }, + { + "epoch": 0.2161547212741752, + "grad_norm": 3.010053578949512, + "learning_rate": 1.244256744146531e-06, + "loss": 0.1837, + "step": 950 + }, + { + "epoch": 0.21638225255972696, + "grad_norm": 2.2542440250817357, + "learning_rate": 1.244244652584092e-06, + "loss": 0.2011, + "step": 951 + }, + { + "epoch": 0.21660978384527874, + "grad_norm": 1.8471360091007536, + "learning_rate": 1.2442325483654263e-06, + "loss": 0.1529, + "step": 952 + }, + { + "epoch": 0.2168373151308305, + "grad_norm": 3.360264898638295, + "learning_rate": 1.2442204314907812e-06, + "loss": 0.1952, + "step": 953 + }, + { + "epoch": 0.21706484641638224, + "grad_norm": 2.2836983418694308, + "learning_rate": 1.2442083019604047e-06, + "loss": 0.2068, + "step": 954 + }, + { + "epoch": 0.21729237770193402, + "grad_norm": 2.534259478561885, + "learning_rate": 1.2441961597745447e-06, + "loss": 0.131, + "step": 955 + }, + { + "epoch": 0.21751990898748577, + "grad_norm": 2.116332324988344, + "learning_rate": 1.244184004933449e-06, + "loss": 0.1433, + "step": 956 + }, + { + "epoch": 0.21774744027303755, + "grad_norm": 1.9239447267712195, + "learning_rate": 1.2441718374373662e-06, + "loss": 0.1296, + "step": 957 + }, + { + "epoch": 0.2179749715585893, + "grad_norm": 3.11283517907892, + "learning_rate": 1.244159657286545e-06, + "loss": 0.1556, + "step": 958 + }, + { + "epoch": 0.21820250284414108, + "grad_norm": 2.1030310163998, + "learning_rate": 1.2441474644812345e-06, + "loss": 0.1398, + "step": 959 + }, + { + "epoch": 0.21843003412969283, + "grad_norm": 2.6301386027385734, + "learning_rate": 1.2441352590216836e-06, + "loss": 0.1328, + "step": 960 + }, + { + "epoch": 0.2186575654152446, + "grad_norm": 1.6843043929069075, + "learning_rate": 1.244123040908142e-06, + "loss": 0.2169, + "step": 961 + }, + { + "epoch": 0.21888509670079637, + "grad_norm": 2.021371056385805, + "learning_rate": 1.2441108101408592e-06, + "loss": 0.105, + "step": 962 + }, + { + "epoch": 0.21911262798634812, + "grad_norm": 2.932640255317413, + "learning_rate": 1.2440985667200853e-06, + "loss": 0.1186, + "step": 963 + }, + { + "epoch": 0.2193401592718999, + "grad_norm": 2.287879466073487, + "learning_rate": 1.2440863106460705e-06, + "loss": 0.1418, + "step": 964 + }, + { + "epoch": 0.21956769055745165, + "grad_norm": 2.4323172112890807, + "learning_rate": 1.2440740419190655e-06, + "loss": 0.2116, + "step": 965 + }, + { + "epoch": 0.2197952218430034, + "grad_norm": 2.906286752213052, + "learning_rate": 1.2440617605393208e-06, + "loss": 0.2029, + "step": 966 + }, + { + "epoch": 0.22002275312855518, + "grad_norm": 2.420234503572233, + "learning_rate": 1.2440494665070874e-06, + "loss": 0.2227, + "step": 967 + }, + { + "epoch": 0.22025028441410693, + "grad_norm": 2.1531642600457874, + "learning_rate": 1.2440371598226165e-06, + "loss": 0.1565, + "step": 968 + }, + { + "epoch": 0.2204778156996587, + "grad_norm": 1.7851844835265829, + "learning_rate": 1.2440248404861598e-06, + "loss": 0.1132, + "step": 969 + }, + { + "epoch": 0.22070534698521047, + "grad_norm": 2.2253443799094605, + "learning_rate": 1.2440125084979693e-06, + "loss": 0.1141, + "step": 970 + }, + { + "epoch": 0.22093287827076222, + "grad_norm": 3.491367387042196, + "learning_rate": 1.2440001638582965e-06, + "loss": 0.1678, + "step": 971 + }, + { + "epoch": 0.221160409556314, + "grad_norm": 2.6799332639547297, + "learning_rate": 1.2439878065673944e-06, + "loss": 0.1791, + "step": 972 + }, + { + "epoch": 0.22138794084186575, + "grad_norm": 0.9028117739016462, + "learning_rate": 1.2439754366255149e-06, + "loss": 0.0794, + "step": 973 + }, + { + "epoch": 0.22161547212741753, + "grad_norm": 1.6629358802939667, + "learning_rate": 1.2439630540329111e-06, + "loss": 0.1328, + "step": 974 + }, + { + "epoch": 0.22184300341296928, + "grad_norm": 2.734953415687441, + "learning_rate": 1.2439506587898358e-06, + "loss": 0.1168, + "step": 975 + }, + { + "epoch": 0.22207053469852106, + "grad_norm": 2.0986779517624745, + "learning_rate": 1.243938250896543e-06, + "loss": 0.1288, + "step": 976 + }, + { + "epoch": 0.2222980659840728, + "grad_norm": 2.4554262769941766, + "learning_rate": 1.2439258303532858e-06, + "loss": 0.1545, + "step": 977 + }, + { + "epoch": 0.22252559726962456, + "grad_norm": 1.7628888954012072, + "learning_rate": 1.243913397160318e-06, + "loss": 0.0967, + "step": 978 + }, + { + "epoch": 0.22275312855517634, + "grad_norm": 1.8371409568342896, + "learning_rate": 1.2439009513178938e-06, + "loss": 0.1184, + "step": 979 + }, + { + "epoch": 0.2229806598407281, + "grad_norm": 3.4838138279645103, + "learning_rate": 1.2438884928262678e-06, + "loss": 0.1686, + "step": 980 + }, + { + "epoch": 0.22320819112627988, + "grad_norm": 1.743212643613601, + "learning_rate": 1.2438760216856944e-06, + "loss": 0.1005, + "step": 981 + }, + { + "epoch": 0.22343572241183163, + "grad_norm": 2.2940811110233135, + "learning_rate": 1.2438635378964284e-06, + "loss": 0.1261, + "step": 982 + }, + { + "epoch": 0.22366325369738338, + "grad_norm": 3.306786589733754, + "learning_rate": 1.2438510414587251e-06, + "loss": 0.1057, + "step": 983 + }, + { + "epoch": 0.22389078498293516, + "grad_norm": 1.8312197926008273, + "learning_rate": 1.24383853237284e-06, + "loss": 0.1121, + "step": 984 + }, + { + "epoch": 0.2241183162684869, + "grad_norm": 1.375951456745173, + "learning_rate": 1.2438260106390285e-06, + "loss": 0.1137, + "step": 985 + }, + { + "epoch": 0.2243458475540387, + "grad_norm": 2.2850475547846507, + "learning_rate": 1.2438134762575467e-06, + "loss": 0.1528, + "step": 986 + }, + { + "epoch": 0.22457337883959044, + "grad_norm": 1.7811601291763544, + "learning_rate": 1.243800929228651e-06, + "loss": 0.114, + "step": 987 + }, + { + "epoch": 0.2248009101251422, + "grad_norm": 2.175503500486742, + "learning_rate": 1.2437883695525974e-06, + "loss": 0.2246, + "step": 988 + }, + { + "epoch": 0.22502844141069397, + "grad_norm": 2.5853887611675375, + "learning_rate": 1.2437757972296427e-06, + "loss": 0.2126, + "step": 989 + }, + { + "epoch": 0.22525597269624573, + "grad_norm": 2.4622729490723065, + "learning_rate": 1.2437632122600442e-06, + "loss": 0.1806, + "step": 990 + }, + { + "epoch": 0.2254835039817975, + "grad_norm": 2.2336859931017794, + "learning_rate": 1.2437506146440587e-06, + "loss": 0.1948, + "step": 991 + }, + { + "epoch": 0.22571103526734926, + "grad_norm": 2.388802906376772, + "learning_rate": 1.243738004381944e-06, + "loss": 0.1028, + "step": 992 + }, + { + "epoch": 0.225938566552901, + "grad_norm": 2.526457136508687, + "learning_rate": 1.2437253814739572e-06, + "loss": 0.1394, + "step": 993 + }, + { + "epoch": 0.2261660978384528, + "grad_norm": 2.282347439516019, + "learning_rate": 1.2437127459203572e-06, + "loss": 0.1678, + "step": 994 + }, + { + "epoch": 0.22639362912400454, + "grad_norm": 1.3050466119815518, + "learning_rate": 1.2437000977214015e-06, + "loss": 0.0753, + "step": 995 + }, + { + "epoch": 0.22662116040955632, + "grad_norm": 2.159334429482828, + "learning_rate": 1.243687436877349e-06, + "loss": 0.2767, + "step": 996 + }, + { + "epoch": 0.22684869169510807, + "grad_norm": 2.4741243617261617, + "learning_rate": 1.2436747633884583e-06, + "loss": 0.167, + "step": 997 + }, + { + "epoch": 0.22707622298065985, + "grad_norm": 2.522130011756034, + "learning_rate": 1.2436620772549885e-06, + "loss": 0.2229, + "step": 998 + }, + { + "epoch": 0.2273037542662116, + "grad_norm": 2.2654639871535873, + "learning_rate": 1.243649378477199e-06, + "loss": 0.1376, + "step": 999 + }, + { + "epoch": 0.22753128555176336, + "grad_norm": 2.737389406083516, + "learning_rate": 1.2436366670553491e-06, + "loss": 0.1672, + "step": 1000 + }, + { + "epoch": 0.22775881683731514, + "grad_norm": 2.497999857751637, + "learning_rate": 1.2436239429896988e-06, + "loss": 0.2831, + "step": 1001 + }, + { + "epoch": 0.2279863481228669, + "grad_norm": 2.3986139069373125, + "learning_rate": 1.2436112062805081e-06, + "loss": 0.1413, + "step": 1002 + }, + { + "epoch": 0.22821387940841867, + "grad_norm": 1.63194618315687, + "learning_rate": 1.2435984569280372e-06, + "loss": 0.1509, + "step": 1003 + }, + { + "epoch": 0.22844141069397042, + "grad_norm": 1.9884735218546312, + "learning_rate": 1.2435856949325467e-06, + "loss": 0.0909, + "step": 1004 + }, + { + "epoch": 0.22866894197952217, + "grad_norm": 3.7364717574130877, + "learning_rate": 1.2435729202942972e-06, + "loss": 0.1362, + "step": 1005 + }, + { + "epoch": 0.22889647326507395, + "grad_norm": 4.3498400339740595, + "learning_rate": 1.2435601330135506e-06, + "loss": 0.1364, + "step": 1006 + }, + { + "epoch": 0.2291240045506257, + "grad_norm": 1.468486521047109, + "learning_rate": 1.2435473330905674e-06, + "loss": 0.1902, + "step": 1007 + }, + { + "epoch": 0.22935153583617748, + "grad_norm": 2.602985360302298, + "learning_rate": 1.2435345205256097e-06, + "loss": 0.0947, + "step": 1008 + }, + { + "epoch": 0.22957906712172924, + "grad_norm": 2.117002790495142, + "learning_rate": 1.243521695318939e-06, + "loss": 0.1228, + "step": 1009 + }, + { + "epoch": 0.229806598407281, + "grad_norm": 2.0012843231226034, + "learning_rate": 1.2435088574708178e-06, + "loss": 0.1156, + "step": 1010 + }, + { + "epoch": 0.23003412969283277, + "grad_norm": 2.490148339748286, + "learning_rate": 1.2434960069815083e-06, + "loss": 0.164, + "step": 1011 + }, + { + "epoch": 0.23026166097838452, + "grad_norm": 2.450730689081713, + "learning_rate": 1.243483143851273e-06, + "loss": 0.138, + "step": 1012 + }, + { + "epoch": 0.2304891922639363, + "grad_norm": 2.892744061430906, + "learning_rate": 1.2434702680803751e-06, + "loss": 0.1061, + "step": 1013 + }, + { + "epoch": 0.23071672354948805, + "grad_norm": 2.790226387512928, + "learning_rate": 1.2434573796690774e-06, + "loss": 0.1957, + "step": 1014 + }, + { + "epoch": 0.23094425483503983, + "grad_norm": 2.4036726186705972, + "learning_rate": 1.2434444786176435e-06, + "loss": 0.1544, + "step": 1015 + }, + { + "epoch": 0.23117178612059158, + "grad_norm": 1.3271746602955339, + "learning_rate": 1.2434315649263372e-06, + "loss": 0.061, + "step": 1016 + }, + { + "epoch": 0.23139931740614333, + "grad_norm": 1.4063593684445947, + "learning_rate": 1.2434186385954225e-06, + "loss": 0.1068, + "step": 1017 + }, + { + "epoch": 0.23162684869169511, + "grad_norm": 2.9525793198909724, + "learning_rate": 1.243405699625163e-06, + "loss": 0.1067, + "step": 1018 + }, + { + "epoch": 0.23185437997724687, + "grad_norm": 2.7846219600282747, + "learning_rate": 1.243392748015824e-06, + "loss": 0.1435, + "step": 1019 + }, + { + "epoch": 0.23208191126279865, + "grad_norm": 1.5658061687677385, + "learning_rate": 1.2433797837676694e-06, + "loss": 0.1492, + "step": 1020 + }, + { + "epoch": 0.2323094425483504, + "grad_norm": 4.123388323133236, + "learning_rate": 1.2433668068809648e-06, + "loss": 0.1699, + "step": 1021 + }, + { + "epoch": 0.23253697383390215, + "grad_norm": 2.0976126762166403, + "learning_rate": 1.243353817355975e-06, + "loss": 0.1257, + "step": 1022 + }, + { + "epoch": 0.23276450511945393, + "grad_norm": 2.4116621601065296, + "learning_rate": 1.2433408151929655e-06, + "loss": 0.133, + "step": 1023 + }, + { + "epoch": 0.23299203640500568, + "grad_norm": 1.395623834578789, + "learning_rate": 1.2433278003922026e-06, + "loss": 0.0936, + "step": 1024 + }, + { + "epoch": 0.23321956769055746, + "grad_norm": 1.7768669244027402, + "learning_rate": 1.2433147729539514e-06, + "loss": 0.1264, + "step": 1025 + }, + { + "epoch": 0.2334470989761092, + "grad_norm": 2.489847520949891, + "learning_rate": 1.2433017328784788e-06, + "loss": 0.1714, + "step": 1026 + }, + { + "epoch": 0.23367463026166096, + "grad_norm": 1.722648702759186, + "learning_rate": 1.2432886801660513e-06, + "loss": 0.122, + "step": 1027 + }, + { + "epoch": 0.23390216154721274, + "grad_norm": 1.3061284883014919, + "learning_rate": 1.2432756148169354e-06, + "loss": 0.0726, + "step": 1028 + }, + { + "epoch": 0.2341296928327645, + "grad_norm": 2.807955909764041, + "learning_rate": 1.2432625368313983e-06, + "loss": 0.1667, + "step": 1029 + }, + { + "epoch": 0.23435722411831628, + "grad_norm": 1.9724601313774524, + "learning_rate": 1.2432494462097072e-06, + "loss": 0.1995, + "step": 1030 + }, + { + "epoch": 0.23458475540386803, + "grad_norm": 2.3943947067430895, + "learning_rate": 1.2432363429521295e-06, + "loss": 0.1625, + "step": 1031 + }, + { + "epoch": 0.2348122866894198, + "grad_norm": 1.5436408096888365, + "learning_rate": 1.2432232270589335e-06, + "loss": 0.076, + "step": 1032 + }, + { + "epoch": 0.23503981797497156, + "grad_norm": 1.1938881747627557, + "learning_rate": 1.2432100985303868e-06, + "loss": 0.1002, + "step": 1033 + }, + { + "epoch": 0.2352673492605233, + "grad_norm": 2.0446974564823304, + "learning_rate": 1.243196957366758e-06, + "loss": 0.1721, + "step": 1034 + }, + { + "epoch": 0.2354948805460751, + "grad_norm": 1.079879180238331, + "learning_rate": 1.2431838035683155e-06, + "loss": 0.1257, + "step": 1035 + }, + { + "epoch": 0.23572241183162684, + "grad_norm": 1.8378535292320874, + "learning_rate": 1.2431706371353282e-06, + "loss": 0.1821, + "step": 1036 + }, + { + "epoch": 0.23594994311717862, + "grad_norm": 1.969855842746801, + "learning_rate": 1.2431574580680653e-06, + "loss": 0.1436, + "step": 1037 + }, + { + "epoch": 0.23617747440273038, + "grad_norm": 3.058757707801488, + "learning_rate": 1.2431442663667958e-06, + "loss": 0.1605, + "step": 1038 + }, + { + "epoch": 0.23640500568828213, + "grad_norm": 1.2648716547694445, + "learning_rate": 1.2431310620317898e-06, + "loss": 0.1614, + "step": 1039 + }, + { + "epoch": 0.2366325369738339, + "grad_norm": 1.9610877034271015, + "learning_rate": 1.2431178450633168e-06, + "loss": 0.139, + "step": 1040 + }, + { + "epoch": 0.23686006825938566, + "grad_norm": 1.5919631273318544, + "learning_rate": 1.2431046154616473e-06, + "loss": 0.0888, + "step": 1041 + }, + { + "epoch": 0.23708759954493744, + "grad_norm": 1.791707313865184, + "learning_rate": 1.2430913732270512e-06, + "loss": 0.1087, + "step": 1042 + }, + { + "epoch": 0.2373151308304892, + "grad_norm": 3.1377911678690666, + "learning_rate": 1.2430781183597995e-06, + "loss": 0.1565, + "step": 1043 + }, + { + "epoch": 0.23754266211604094, + "grad_norm": 2.2837991793589607, + "learning_rate": 1.243064850860163e-06, + "loss": 0.1126, + "step": 1044 + }, + { + "epoch": 0.23777019340159272, + "grad_norm": 2.6823412767535246, + "learning_rate": 1.243051570728413e-06, + "loss": 0.2083, + "step": 1045 + }, + { + "epoch": 0.23799772468714447, + "grad_norm": 4.365244516577561, + "learning_rate": 1.2430382779648208e-06, + "loss": 0.1904, + "step": 1046 + }, + { + "epoch": 0.23822525597269625, + "grad_norm": 2.434739692035364, + "learning_rate": 1.243024972569658e-06, + "loss": 0.1347, + "step": 1047 + }, + { + "epoch": 0.238452787258248, + "grad_norm": 2.1595986496307384, + "learning_rate": 1.2430116545431966e-06, + "loss": 0.1926, + "step": 1048 + }, + { + "epoch": 0.23868031854379979, + "grad_norm": 2.2542031412662573, + "learning_rate": 1.2429983238857088e-06, + "loss": 0.1667, + "step": 1049 + }, + { + "epoch": 0.23890784982935154, + "grad_norm": 2.0405926385207787, + "learning_rate": 1.2429849805974673e-06, + "loss": 0.1872, + "step": 1050 + }, + { + "epoch": 0.2391353811149033, + "grad_norm": 2.2037085916589043, + "learning_rate": 1.2429716246787444e-06, + "loss": 0.0775, + "step": 1051 + }, + { + "epoch": 0.23936291240045507, + "grad_norm": 0.9628371959013814, + "learning_rate": 1.242958256129813e-06, + "loss": 0.1378, + "step": 1052 + }, + { + "epoch": 0.23959044368600682, + "grad_norm": 2.1187588487355424, + "learning_rate": 1.242944874950947e-06, + "loss": 0.159, + "step": 1053 + }, + { + "epoch": 0.2398179749715586, + "grad_norm": 1.9961766997876433, + "learning_rate": 1.2429314811424192e-06, + "loss": 0.1568, + "step": 1054 + }, + { + "epoch": 0.24004550625711035, + "grad_norm": 1.935471261024473, + "learning_rate": 1.242918074704504e-06, + "loss": 0.1596, + "step": 1055 + }, + { + "epoch": 0.2402730375426621, + "grad_norm": 1.4988665110908368, + "learning_rate": 1.2429046556374747e-06, + "loss": 0.0987, + "step": 1056 + }, + { + "epoch": 0.24050056882821388, + "grad_norm": 2.4283216098462015, + "learning_rate": 1.2428912239416057e-06, + "loss": 0.1127, + "step": 1057 + }, + { + "epoch": 0.24072810011376564, + "grad_norm": 2.3264824459084448, + "learning_rate": 1.242877779617172e-06, + "loss": 0.1274, + "step": 1058 + }, + { + "epoch": 0.24095563139931742, + "grad_norm": 2.159687331291489, + "learning_rate": 1.242864322664448e-06, + "loss": 0.1399, + "step": 1059 + }, + { + "epoch": 0.24118316268486917, + "grad_norm": 2.3632421336063087, + "learning_rate": 1.2428508530837088e-06, + "loss": 0.1751, + "step": 1060 + }, + { + "epoch": 0.24141069397042092, + "grad_norm": 4.564054038887482, + "learning_rate": 1.2428373708752298e-06, + "loss": 0.1623, + "step": 1061 + }, + { + "epoch": 0.2416382252559727, + "grad_norm": 2.913968751293169, + "learning_rate": 1.2428238760392862e-06, + "loss": 0.2404, + "step": 1062 + }, + { + "epoch": 0.24186575654152445, + "grad_norm": 2.375864551832549, + "learning_rate": 1.2428103685761543e-06, + "loss": 0.1551, + "step": 1063 + }, + { + "epoch": 0.24209328782707623, + "grad_norm": 2.773326434228427, + "learning_rate": 1.2427968484861097e-06, + "loss": 0.1129, + "step": 1064 + }, + { + "epoch": 0.24232081911262798, + "grad_norm": 3.440322207371564, + "learning_rate": 1.2427833157694292e-06, + "loss": 0.2312, + "step": 1065 + }, + { + "epoch": 0.24254835039817976, + "grad_norm": 2.09362609958651, + "learning_rate": 1.2427697704263892e-06, + "loss": 0.1047, + "step": 1066 + }, + { + "epoch": 0.24277588168373151, + "grad_norm": 2.0696892695320432, + "learning_rate": 1.2427562124572663e-06, + "loss": 0.1156, + "step": 1067 + }, + { + "epoch": 0.24300341296928327, + "grad_norm": 1.923568801452821, + "learning_rate": 1.2427426418623377e-06, + "loss": 0.1609, + "step": 1068 + }, + { + "epoch": 0.24323094425483505, + "grad_norm": 1.5158781630471698, + "learning_rate": 1.242729058641881e-06, + "loss": 0.094, + "step": 1069 + }, + { + "epoch": 0.2434584755403868, + "grad_norm": 2.2258107327352037, + "learning_rate": 1.2427154627961737e-06, + "loss": 0.2017, + "step": 1070 + }, + { + "epoch": 0.24368600682593858, + "grad_norm": 2.3481688305100645, + "learning_rate": 1.2427018543254935e-06, + "loss": 0.1535, + "step": 1071 + }, + { + "epoch": 0.24391353811149033, + "grad_norm": 2.148375299510445, + "learning_rate": 1.2426882332301187e-06, + "loss": 0.1812, + "step": 1072 + }, + { + "epoch": 0.24414106939704208, + "grad_norm": 1.6816805152718777, + "learning_rate": 1.2426745995103277e-06, + "loss": 0.1341, + "step": 1073 + }, + { + "epoch": 0.24436860068259386, + "grad_norm": 2.651811251817173, + "learning_rate": 1.242660953166399e-06, + "loss": 0.1318, + "step": 1074 + }, + { + "epoch": 0.2445961319681456, + "grad_norm": 2.473544844662378, + "learning_rate": 1.2426472941986117e-06, + "loss": 0.1972, + "step": 1075 + }, + { + "epoch": 0.2448236632536974, + "grad_norm": 1.3274925024741444, + "learning_rate": 1.2426336226072449e-06, + "loss": 0.1497, + "step": 1076 + }, + { + "epoch": 0.24505119453924915, + "grad_norm": 2.1014804926130277, + "learning_rate": 1.242619938392578e-06, + "loss": 0.1186, + "step": 1077 + }, + { + "epoch": 0.2452787258248009, + "grad_norm": 3.0260303106049973, + "learning_rate": 1.2426062415548907e-06, + "loss": 0.2506, + "step": 1078 + }, + { + "epoch": 0.24550625711035268, + "grad_norm": 1.2327761741993546, + "learning_rate": 1.2425925320944628e-06, + "loss": 0.117, + "step": 1079 + }, + { + "epoch": 0.24573378839590443, + "grad_norm": 3.2155457599215036, + "learning_rate": 1.2425788100115747e-06, + "loss": 0.1412, + "step": 1080 + }, + { + "epoch": 0.2459613196814562, + "grad_norm": 1.6672046307721682, + "learning_rate": 1.2425650753065065e-06, + "loss": 0.148, + "step": 1081 + }, + { + "epoch": 0.24618885096700796, + "grad_norm": 4.323033908726176, + "learning_rate": 1.2425513279795395e-06, + "loss": 0.1685, + "step": 1082 + }, + { + "epoch": 0.24641638225255974, + "grad_norm": 2.4128743686143146, + "learning_rate": 1.2425375680309543e-06, + "loss": 0.0992, + "step": 1083 + }, + { + "epoch": 0.2466439135381115, + "grad_norm": 2.0582783253443497, + "learning_rate": 1.2425237954610322e-06, + "loss": 0.1263, + "step": 1084 + }, + { + "epoch": 0.24687144482366324, + "grad_norm": 2.5810033905990637, + "learning_rate": 1.2425100102700547e-06, + "loss": 0.2102, + "step": 1085 + }, + { + "epoch": 0.24709897610921502, + "grad_norm": 2.269665820869707, + "learning_rate": 1.2424962124583033e-06, + "loss": 0.105, + "step": 1086 + }, + { + "epoch": 0.24732650739476678, + "grad_norm": 2.706182109515585, + "learning_rate": 1.2424824020260603e-06, + "loss": 0.1596, + "step": 1087 + }, + { + "epoch": 0.24755403868031856, + "grad_norm": 3.0056026517839016, + "learning_rate": 1.2424685789736077e-06, + "loss": 0.1809, + "step": 1088 + }, + { + "epoch": 0.2477815699658703, + "grad_norm": 2.2230272708907513, + "learning_rate": 1.2424547433012284e-06, + "loss": 0.1187, + "step": 1089 + }, + { + "epoch": 0.24800910125142206, + "grad_norm": 2.271631978747539, + "learning_rate": 1.2424408950092049e-06, + "loss": 0.1478, + "step": 1090 + }, + { + "epoch": 0.24823663253697384, + "grad_norm": 2.485671272218175, + "learning_rate": 1.2424270340978204e-06, + "loss": 0.1595, + "step": 1091 + }, + { + "epoch": 0.2484641638225256, + "grad_norm": 2.5242524420773087, + "learning_rate": 1.2424131605673582e-06, + "loss": 0.2519, + "step": 1092 + }, + { + "epoch": 0.24869169510807737, + "grad_norm": 2.6439941529662025, + "learning_rate": 1.2423992744181015e-06, + "loss": 0.1389, + "step": 1093 + }, + { + "epoch": 0.24891922639362912, + "grad_norm": 2.1610086973465417, + "learning_rate": 1.2423853756503343e-06, + "loss": 0.1017, + "step": 1094 + }, + { + "epoch": 0.24914675767918087, + "grad_norm": 1.8954846688503157, + "learning_rate": 1.2423714642643408e-06, + "loss": 0.2796, + "step": 1095 + }, + { + "epoch": 0.24937428896473265, + "grad_norm": 1.3124277359799683, + "learning_rate": 1.2423575402604051e-06, + "loss": 0.12, + "step": 1096 + }, + { + "epoch": 0.2496018202502844, + "grad_norm": 2.5234695537617444, + "learning_rate": 1.2423436036388122e-06, + "loss": 0.1242, + "step": 1097 + }, + { + "epoch": 0.24982935153583619, + "grad_norm": 2.044792039361886, + "learning_rate": 1.2423296543998465e-06, + "loss": 0.1743, + "step": 1098 + }, + { + "epoch": 0.25005688282138794, + "grad_norm": 3.6767614291561492, + "learning_rate": 1.2423156925437932e-06, + "loss": 0.2584, + "step": 1099 + }, + { + "epoch": 0.2502844141069397, + "grad_norm": 2.1397151355216506, + "learning_rate": 1.2423017180709376e-06, + "loss": 0.1586, + "step": 1100 + }, + { + "epoch": 0.25051194539249144, + "grad_norm": 1.670738860931536, + "learning_rate": 1.2422877309815656e-06, + "loss": 0.0821, + "step": 1101 + }, + { + "epoch": 0.25073947667804325, + "grad_norm": 2.3733300367714185, + "learning_rate": 1.242273731275963e-06, + "loss": 0.1335, + "step": 1102 + }, + { + "epoch": 0.250967007963595, + "grad_norm": 2.6954093027320534, + "learning_rate": 1.2422597189544155e-06, + "loss": 0.1244, + "step": 1103 + }, + { + "epoch": 0.25119453924914675, + "grad_norm": 2.17330712431736, + "learning_rate": 1.2422456940172101e-06, + "loss": 0.1799, + "step": 1104 + }, + { + "epoch": 0.2514220705346985, + "grad_norm": 2.4883101223722397, + "learning_rate": 1.2422316564646331e-06, + "loss": 0.0881, + "step": 1105 + }, + { + "epoch": 0.25164960182025026, + "grad_norm": 2.4975644528149528, + "learning_rate": 1.2422176062969713e-06, + "loss": 0.2376, + "step": 1106 + }, + { + "epoch": 0.25187713310580206, + "grad_norm": 2.242874102497345, + "learning_rate": 1.2422035435145121e-06, + "loss": 0.1117, + "step": 1107 + }, + { + "epoch": 0.2521046643913538, + "grad_norm": 2.1430334401000994, + "learning_rate": 1.2421894681175428e-06, + "loss": 0.1937, + "step": 1108 + }, + { + "epoch": 0.25233219567690557, + "grad_norm": 2.8329522904929796, + "learning_rate": 1.2421753801063511e-06, + "loss": 0.2192, + "step": 1109 + }, + { + "epoch": 0.2525597269624573, + "grad_norm": 2.7185072984242016, + "learning_rate": 1.2421612794812248e-06, + "loss": 0.1612, + "step": 1110 + } + ], + "logging_steps": 1, + "max_steps": 21975, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1110, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4896118628352.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}