diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,129645 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 18516, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016202203499675956, + "grad_norm": 9.86984634399414, + "learning_rate": 8.992805755395684e-09, + "loss": 0.5557, + "step": 1 + }, + { + "epoch": 0.0003240440699935191, + "grad_norm": 8.46564769744873, + "learning_rate": 1.798561151079137e-08, + "loss": 0.5138, + "step": 2 + }, + { + "epoch": 0.0004860661049902787, + "grad_norm": 8.7678861618042, + "learning_rate": 2.6978417266187054e-08, + "loss": 0.5389, + "step": 3 + }, + { + "epoch": 0.0006480881399870382, + "grad_norm": 8.85426139831543, + "learning_rate": 3.597122302158274e-08, + "loss": 0.5323, + "step": 4 + }, + { + "epoch": 0.0008101101749837978, + "grad_norm": 8.901697158813477, + "learning_rate": 4.496402877697842e-08, + "loss": 0.5217, + "step": 5 + }, + { + "epoch": 0.0009721322099805574, + "grad_norm": 8.482805252075195, + "learning_rate": 5.395683453237411e-08, + "loss": 0.5235, + "step": 6 + }, + { + "epoch": 0.001134154244977317, + "grad_norm": 8.641008377075195, + "learning_rate": 6.294964028776979e-08, + "loss": 0.5277, + "step": 7 + }, + { + "epoch": 0.0012961762799740765, + "grad_norm": 9.918697357177734, + "learning_rate": 7.194244604316547e-08, + "loss": 0.5525, + "step": 8 + }, + { + "epoch": 0.001458198314970836, + "grad_norm": 8.959720611572266, + "learning_rate": 8.093525179856116e-08, + "loss": 0.5073, + "step": 9 + }, + { + "epoch": 0.0016202203499675956, + "grad_norm": 8.834793090820312, + "learning_rate": 8.992805755395684e-08, + "loss": 0.529, + "step": 10 + }, + { + "epoch": 0.0017822423849643552, + "grad_norm": 8.80196762084961, + "learning_rate": 9.892086330935252e-08, + "loss": 0.5193, + "step": 11 + }, + { + "epoch": 0.0019442644199611147, + "grad_norm": 8.514031410217285, + "learning_rate": 1.0791366906474822e-07, + "loss": 0.5291, + "step": 12 + }, + { + "epoch": 0.002106286454957874, + "grad_norm": 8.85981559753418, + "learning_rate": 1.169064748201439e-07, + "loss": 0.5294, + "step": 13 + }, + { + "epoch": 0.002268308489954634, + "grad_norm": 8.460282325744629, + "learning_rate": 1.2589928057553958e-07, + "loss": 0.5154, + "step": 14 + }, + { + "epoch": 0.002430330524951393, + "grad_norm": 9.40372371673584, + "learning_rate": 1.3489208633093525e-07, + "loss": 0.5348, + "step": 15 + }, + { + "epoch": 0.002592352559948153, + "grad_norm": 8.342942237854004, + "learning_rate": 1.4388489208633095e-07, + "loss": 0.5128, + "step": 16 + }, + { + "epoch": 0.0027543745949449123, + "grad_norm": 8.372694969177246, + "learning_rate": 1.5287769784172664e-07, + "loss": 0.5387, + "step": 17 + }, + { + "epoch": 0.002916396629941672, + "grad_norm": 7.830298900604248, + "learning_rate": 1.618705035971223e-07, + "loss": 0.4864, + "step": 18 + }, + { + "epoch": 0.0030784186649384314, + "grad_norm": 7.605618000030518, + "learning_rate": 1.70863309352518e-07, + "loss": 0.5016, + "step": 19 + }, + { + "epoch": 0.0032404406999351912, + "grad_norm": 7.3603997230529785, + "learning_rate": 1.7985611510791368e-07, + "loss": 0.4962, + "step": 20 + }, + { + "epoch": 0.0034024627349319506, + "grad_norm": 8.310635566711426, + "learning_rate": 1.8884892086330937e-07, + "loss": 0.4981, + "step": 21 + }, + { + "epoch": 0.0035644847699287103, + "grad_norm": 7.44785737991333, + "learning_rate": 1.9784172661870504e-07, + "loss": 0.4864, + "step": 22 + }, + { + "epoch": 0.0037265068049254697, + "grad_norm": 6.7629008293151855, + "learning_rate": 2.0683453237410074e-07, + "loss": 0.4597, + "step": 23 + }, + { + "epoch": 0.0038885288399222295, + "grad_norm": 6.164111137390137, + "learning_rate": 2.1582733812949643e-07, + "loss": 0.4475, + "step": 24 + }, + { + "epoch": 0.004050550874918989, + "grad_norm": 6.299708843231201, + "learning_rate": 2.248201438848921e-07, + "loss": 0.4362, + "step": 25 + }, + { + "epoch": 0.004212572909915748, + "grad_norm": 6.691930294036865, + "learning_rate": 2.338129496402878e-07, + "loss": 0.4647, + "step": 26 + }, + { + "epoch": 0.004374594944912508, + "grad_norm": 6.3759846687316895, + "learning_rate": 2.428057553956835e-07, + "loss": 0.4502, + "step": 27 + }, + { + "epoch": 0.004536616979909268, + "grad_norm": 6.641443729400635, + "learning_rate": 2.5179856115107916e-07, + "loss": 0.4535, + "step": 28 + }, + { + "epoch": 0.004698639014906027, + "grad_norm": 6.435218811035156, + "learning_rate": 2.6079136690647483e-07, + "loss": 0.4315, + "step": 29 + }, + { + "epoch": 0.004860661049902786, + "grad_norm": 5.86301851272583, + "learning_rate": 2.697841726618705e-07, + "loss": 0.4106, + "step": 30 + }, + { + "epoch": 0.005022683084899547, + "grad_norm": 5.353550910949707, + "learning_rate": 2.787769784172662e-07, + "loss": 0.3607, + "step": 31 + }, + { + "epoch": 0.005184705119896306, + "grad_norm": 4.887706756591797, + "learning_rate": 2.877697841726619e-07, + "loss": 0.36, + "step": 32 + }, + { + "epoch": 0.005346727154893065, + "grad_norm": 4.950960159301758, + "learning_rate": 2.9676258992805756e-07, + "loss": 0.3418, + "step": 33 + }, + { + "epoch": 0.005508749189889825, + "grad_norm": 5.3128581047058105, + "learning_rate": 3.057553956834533e-07, + "loss": 0.3229, + "step": 34 + }, + { + "epoch": 0.005670771224886585, + "grad_norm": 4.85807466506958, + "learning_rate": 3.1474820143884896e-07, + "loss": 0.2946, + "step": 35 + }, + { + "epoch": 0.005832793259883344, + "grad_norm": 4.823163986206055, + "learning_rate": 3.237410071942446e-07, + "loss": 0.304, + "step": 36 + }, + { + "epoch": 0.0059948152948801035, + "grad_norm": 4.9618239402771, + "learning_rate": 3.3273381294964035e-07, + "loss": 0.3032, + "step": 37 + }, + { + "epoch": 0.006156837329876863, + "grad_norm": 4.626596927642822, + "learning_rate": 3.41726618705036e-07, + "loss": 0.272, + "step": 38 + }, + { + "epoch": 0.006318859364873623, + "grad_norm": 3.9511983394622803, + "learning_rate": 3.5071942446043163e-07, + "loss": 0.2683, + "step": 39 + }, + { + "epoch": 0.0064808813998703824, + "grad_norm": 3.7039501667022705, + "learning_rate": 3.5971223021582736e-07, + "loss": 0.2829, + "step": 40 + }, + { + "epoch": 0.006642903434867142, + "grad_norm": 3.3785510063171387, + "learning_rate": 3.68705035971223e-07, + "loss": 0.2598, + "step": 41 + }, + { + "epoch": 0.006804925469863901, + "grad_norm": 3.1743481159210205, + "learning_rate": 3.7769784172661875e-07, + "loss": 0.2307, + "step": 42 + }, + { + "epoch": 0.006966947504860661, + "grad_norm": 3.37454891204834, + "learning_rate": 3.8669064748201447e-07, + "loss": 0.2391, + "step": 43 + }, + { + "epoch": 0.007128969539857421, + "grad_norm": 3.0130465030670166, + "learning_rate": 3.956834532374101e-07, + "loss": 0.2258, + "step": 44 + }, + { + "epoch": 0.00729099157485418, + "grad_norm": 2.937732458114624, + "learning_rate": 4.0467625899280576e-07, + "loss": 0.22, + "step": 45 + }, + { + "epoch": 0.007453013609850939, + "grad_norm": 3.2253363132476807, + "learning_rate": 4.136690647482015e-07, + "loss": 0.2128, + "step": 46 + }, + { + "epoch": 0.0076150356448477, + "grad_norm": 2.944216012954712, + "learning_rate": 4.2266187050359715e-07, + "loss": 0.2199, + "step": 47 + }, + { + "epoch": 0.007777057679844459, + "grad_norm": 2.8392646312713623, + "learning_rate": 4.3165467625899287e-07, + "loss": 0.2185, + "step": 48 + }, + { + "epoch": 0.007939079714841219, + "grad_norm": 2.9585957527160645, + "learning_rate": 4.406474820143885e-07, + "loss": 0.2416, + "step": 49 + }, + { + "epoch": 0.008101101749837978, + "grad_norm": 2.977691173553467, + "learning_rate": 4.496402877697842e-07, + "loss": 0.2219, + "step": 50 + }, + { + "epoch": 0.008263123784834738, + "grad_norm": 2.6376028060913086, + "learning_rate": 4.586330935251799e-07, + "loss": 0.2271, + "step": 51 + }, + { + "epoch": 0.008425145819831496, + "grad_norm": 2.8397135734558105, + "learning_rate": 4.676258992805756e-07, + "loss": 0.2227, + "step": 52 + }, + { + "epoch": 0.008587167854828257, + "grad_norm": 2.7388408184051514, + "learning_rate": 4.7661870503597127e-07, + "loss": 0.1965, + "step": 53 + }, + { + "epoch": 0.008749189889825017, + "grad_norm": 2.7062389850616455, + "learning_rate": 4.85611510791367e-07, + "loss": 0.1991, + "step": 54 + }, + { + "epoch": 0.008911211924821775, + "grad_norm": 2.688417911529541, + "learning_rate": 4.946043165467626e-07, + "loss": 0.2121, + "step": 55 + }, + { + "epoch": 0.009073233959818535, + "grad_norm": 2.418215751647949, + "learning_rate": 5.035971223021583e-07, + "loss": 0.2104, + "step": 56 + }, + { + "epoch": 0.009235255994815296, + "grad_norm": 2.9777352809906006, + "learning_rate": 5.12589928057554e-07, + "loss": 0.2145, + "step": 57 + }, + { + "epoch": 0.009397278029812054, + "grad_norm": 2.8849077224731445, + "learning_rate": 5.215827338129497e-07, + "loss": 0.1984, + "step": 58 + }, + { + "epoch": 0.009559300064808814, + "grad_norm": 2.7535386085510254, + "learning_rate": 5.305755395683454e-07, + "loss": 0.2159, + "step": 59 + }, + { + "epoch": 0.009721322099805573, + "grad_norm": 3.017183780670166, + "learning_rate": 5.39568345323741e-07, + "loss": 0.2485, + "step": 60 + }, + { + "epoch": 0.009883344134802333, + "grad_norm": 3.307050943374634, + "learning_rate": 5.485611510791367e-07, + "loss": 0.21, + "step": 61 + }, + { + "epoch": 0.010045366169799093, + "grad_norm": 2.6310977935791016, + "learning_rate": 5.575539568345325e-07, + "loss": 0.1888, + "step": 62 + }, + { + "epoch": 0.010207388204795852, + "grad_norm": 2.3625879287719727, + "learning_rate": 5.665467625899281e-07, + "loss": 0.1938, + "step": 63 + }, + { + "epoch": 0.010369410239792612, + "grad_norm": 3.2340619564056396, + "learning_rate": 5.755395683453238e-07, + "loss": 0.1812, + "step": 64 + }, + { + "epoch": 0.010531432274789372, + "grad_norm": 2.1760573387145996, + "learning_rate": 5.845323741007194e-07, + "loss": 0.18, + "step": 65 + }, + { + "epoch": 0.01069345430978613, + "grad_norm": 2.495835542678833, + "learning_rate": 5.935251798561151e-07, + "loss": 0.2233, + "step": 66 + }, + { + "epoch": 0.01085547634478289, + "grad_norm": 2.425354242324829, + "learning_rate": 6.025179856115109e-07, + "loss": 0.2017, + "step": 67 + }, + { + "epoch": 0.01101749837977965, + "grad_norm": 2.386017322540283, + "learning_rate": 6.115107913669066e-07, + "loss": 0.1897, + "step": 68 + }, + { + "epoch": 0.01117952041477641, + "grad_norm": 2.476449728012085, + "learning_rate": 6.205035971223022e-07, + "loss": 0.2163, + "step": 69 + }, + { + "epoch": 0.01134154244977317, + "grad_norm": 2.3602700233459473, + "learning_rate": 6.294964028776979e-07, + "loss": 0.1795, + "step": 70 + }, + { + "epoch": 0.011503564484769928, + "grad_norm": 2.5833189487457275, + "learning_rate": 6.384892086330936e-07, + "loss": 0.215, + "step": 71 + }, + { + "epoch": 0.011665586519766688, + "grad_norm": 2.3865418434143066, + "learning_rate": 6.474820143884893e-07, + "loss": 0.171, + "step": 72 + }, + { + "epoch": 0.011827608554763449, + "grad_norm": 2.407026529312134, + "learning_rate": 6.564748201438849e-07, + "loss": 0.1766, + "step": 73 + }, + { + "epoch": 0.011989630589760207, + "grad_norm": 2.3541834354400635, + "learning_rate": 6.654676258992807e-07, + "loss": 0.1703, + "step": 74 + }, + { + "epoch": 0.012151652624756967, + "grad_norm": 2.7565481662750244, + "learning_rate": 6.744604316546763e-07, + "loss": 0.1833, + "step": 75 + }, + { + "epoch": 0.012313674659753726, + "grad_norm": 2.53861665725708, + "learning_rate": 6.83453237410072e-07, + "loss": 0.1967, + "step": 76 + }, + { + "epoch": 0.012475696694750486, + "grad_norm": 2.920691728591919, + "learning_rate": 6.924460431654677e-07, + "loss": 0.1945, + "step": 77 + }, + { + "epoch": 0.012637718729747246, + "grad_norm": 2.714219093322754, + "learning_rate": 7.014388489208633e-07, + "loss": 0.1961, + "step": 78 + }, + { + "epoch": 0.012799740764744005, + "grad_norm": 2.2343974113464355, + "learning_rate": 7.104316546762591e-07, + "loss": 0.1868, + "step": 79 + }, + { + "epoch": 0.012961762799740765, + "grad_norm": 2.333791971206665, + "learning_rate": 7.194244604316547e-07, + "loss": 0.1705, + "step": 80 + }, + { + "epoch": 0.013123784834737525, + "grad_norm": 2.554150104522705, + "learning_rate": 7.284172661870504e-07, + "loss": 0.1822, + "step": 81 + }, + { + "epoch": 0.013285806869734284, + "grad_norm": 2.51990008354187, + "learning_rate": 7.37410071942446e-07, + "loss": 0.1889, + "step": 82 + }, + { + "epoch": 0.013447828904731044, + "grad_norm": 2.2917540073394775, + "learning_rate": 7.464028776978418e-07, + "loss": 0.1788, + "step": 83 + }, + { + "epoch": 0.013609850939727802, + "grad_norm": 2.376124620437622, + "learning_rate": 7.553956834532375e-07, + "loss": 0.1995, + "step": 84 + }, + { + "epoch": 0.013771872974724562, + "grad_norm": 2.521143674850464, + "learning_rate": 7.643884892086331e-07, + "loss": 0.1691, + "step": 85 + }, + { + "epoch": 0.013933895009721323, + "grad_norm": 2.2666220664978027, + "learning_rate": 7.733812949640289e-07, + "loss": 0.1831, + "step": 86 + }, + { + "epoch": 0.014095917044718081, + "grad_norm": 2.2533257007598877, + "learning_rate": 7.823741007194246e-07, + "loss": 0.1726, + "step": 87 + }, + { + "epoch": 0.014257939079714841, + "grad_norm": 2.371137857437134, + "learning_rate": 7.913669064748202e-07, + "loss": 0.182, + "step": 88 + }, + { + "epoch": 0.014419961114711602, + "grad_norm": 2.547879934310913, + "learning_rate": 8.003597122302159e-07, + "loss": 0.1908, + "step": 89 + }, + { + "epoch": 0.01458198314970836, + "grad_norm": 2.433431625366211, + "learning_rate": 8.093525179856115e-07, + "loss": 0.1951, + "step": 90 + }, + { + "epoch": 0.01474400518470512, + "grad_norm": 2.1704154014587402, + "learning_rate": 8.183453237410073e-07, + "loss": 0.1545, + "step": 91 + }, + { + "epoch": 0.014906027219701879, + "grad_norm": 2.624864339828491, + "learning_rate": 8.27338129496403e-07, + "loss": 0.1969, + "step": 92 + }, + { + "epoch": 0.015068049254698639, + "grad_norm": 2.3828327655792236, + "learning_rate": 8.363309352517986e-07, + "loss": 0.165, + "step": 93 + }, + { + "epoch": 0.0152300712896954, + "grad_norm": 2.242358922958374, + "learning_rate": 8.453237410071943e-07, + "loss": 0.1761, + "step": 94 + }, + { + "epoch": 0.015392093324692158, + "grad_norm": 2.126227378845215, + "learning_rate": 8.543165467625899e-07, + "loss": 0.1626, + "step": 95 + }, + { + "epoch": 0.015554115359688918, + "grad_norm": 2.2046821117401123, + "learning_rate": 8.633093525179857e-07, + "loss": 0.1867, + "step": 96 + }, + { + "epoch": 0.015716137394685678, + "grad_norm": 2.2523837089538574, + "learning_rate": 8.723021582733814e-07, + "loss": 0.1755, + "step": 97 + }, + { + "epoch": 0.015878159429682438, + "grad_norm": 2.326197862625122, + "learning_rate": 8.81294964028777e-07, + "loss": 0.1811, + "step": 98 + }, + { + "epoch": 0.016040181464679195, + "grad_norm": 2.4861881732940674, + "learning_rate": 8.902877697841728e-07, + "loss": 0.2073, + "step": 99 + }, + { + "epoch": 0.016202203499675955, + "grad_norm": 2.3390986919403076, + "learning_rate": 8.992805755395684e-07, + "loss": 0.1749, + "step": 100 + }, + { + "epoch": 0.016364225534672715, + "grad_norm": 2.372002124786377, + "learning_rate": 9.082733812949641e-07, + "loss": 0.1778, + "step": 101 + }, + { + "epoch": 0.016526247569669476, + "grad_norm": 2.4759466648101807, + "learning_rate": 9.172661870503598e-07, + "loss": 0.1649, + "step": 102 + }, + { + "epoch": 0.016688269604666236, + "grad_norm": 2.2869863510131836, + "learning_rate": 9.262589928057554e-07, + "loss": 0.159, + "step": 103 + }, + { + "epoch": 0.016850291639662993, + "grad_norm": 2.3882856369018555, + "learning_rate": 9.352517985611512e-07, + "loss": 0.1893, + "step": 104 + }, + { + "epoch": 0.017012313674659753, + "grad_norm": 2.316922187805176, + "learning_rate": 9.442446043165468e-07, + "loss": 0.1737, + "step": 105 + }, + { + "epoch": 0.017174335709656513, + "grad_norm": 2.4775216579437256, + "learning_rate": 9.532374100719425e-07, + "loss": 0.204, + "step": 106 + }, + { + "epoch": 0.017336357744653273, + "grad_norm": 2.44256329536438, + "learning_rate": 9.622302158273383e-07, + "loss": 0.203, + "step": 107 + }, + { + "epoch": 0.017498379779650033, + "grad_norm": 2.3133935928344727, + "learning_rate": 9.71223021582734e-07, + "loss": 0.1842, + "step": 108 + }, + { + "epoch": 0.017660401814646794, + "grad_norm": 2.498054265975952, + "learning_rate": 9.802158273381295e-07, + "loss": 0.169, + "step": 109 + }, + { + "epoch": 0.01782242384964355, + "grad_norm": 2.51688814163208, + "learning_rate": 9.892086330935252e-07, + "loss": 0.1845, + "step": 110 + }, + { + "epoch": 0.01798444588464031, + "grad_norm": 2.356818437576294, + "learning_rate": 9.98201438848921e-07, + "loss": 0.1835, + "step": 111 + }, + { + "epoch": 0.01814646791963707, + "grad_norm": 2.1603775024414062, + "learning_rate": 1.0071942446043167e-06, + "loss": 0.155, + "step": 112 + }, + { + "epoch": 0.01830848995463383, + "grad_norm": 2.509432554244995, + "learning_rate": 1.0161870503597124e-06, + "loss": 0.2281, + "step": 113 + }, + { + "epoch": 0.01847051198963059, + "grad_norm": 2.5537493228912354, + "learning_rate": 1.025179856115108e-06, + "loss": 0.1793, + "step": 114 + }, + { + "epoch": 0.018632534024627348, + "grad_norm": 2.246959924697876, + "learning_rate": 1.0341726618705036e-06, + "loss": 0.1801, + "step": 115 + }, + { + "epoch": 0.018794556059624108, + "grad_norm": 2.19378662109375, + "learning_rate": 1.0431654676258993e-06, + "loss": 0.1637, + "step": 116 + }, + { + "epoch": 0.01895657809462087, + "grad_norm": 2.3158321380615234, + "learning_rate": 1.052158273381295e-06, + "loss": 0.1617, + "step": 117 + }, + { + "epoch": 0.01911860012961763, + "grad_norm": 2.4354169368743896, + "learning_rate": 1.0611510791366908e-06, + "loss": 0.1779, + "step": 118 + }, + { + "epoch": 0.01928062216461439, + "grad_norm": 2.5059328079223633, + "learning_rate": 1.0701438848920865e-06, + "loss": 0.1832, + "step": 119 + }, + { + "epoch": 0.019442644199611146, + "grad_norm": 2.449944019317627, + "learning_rate": 1.079136690647482e-06, + "loss": 0.2071, + "step": 120 + }, + { + "epoch": 0.019604666234607906, + "grad_norm": 2.3459267616271973, + "learning_rate": 1.0881294964028777e-06, + "loss": 0.1757, + "step": 121 + }, + { + "epoch": 0.019766688269604666, + "grad_norm": 2.456517457962036, + "learning_rate": 1.0971223021582735e-06, + "loss": 0.1804, + "step": 122 + }, + { + "epoch": 0.019928710304601426, + "grad_norm": 2.350886344909668, + "learning_rate": 1.1061151079136692e-06, + "loss": 0.1675, + "step": 123 + }, + { + "epoch": 0.020090732339598186, + "grad_norm": 2.2044262886047363, + "learning_rate": 1.115107913669065e-06, + "loss": 0.1666, + "step": 124 + }, + { + "epoch": 0.020252754374594947, + "grad_norm": 2.2976362705230713, + "learning_rate": 1.1241007194244604e-06, + "loss": 0.1797, + "step": 125 + }, + { + "epoch": 0.020414776409591703, + "grad_norm": 2.1294548511505127, + "learning_rate": 1.1330935251798561e-06, + "loss": 0.1529, + "step": 126 + }, + { + "epoch": 0.020576798444588464, + "grad_norm": 2.205601453781128, + "learning_rate": 1.1420863309352519e-06, + "loss": 0.1641, + "step": 127 + }, + { + "epoch": 0.020738820479585224, + "grad_norm": 2.2161824703216553, + "learning_rate": 1.1510791366906476e-06, + "loss": 0.1617, + "step": 128 + }, + { + "epoch": 0.020900842514581984, + "grad_norm": 2.266568422317505, + "learning_rate": 1.1600719424460433e-06, + "loss": 0.1746, + "step": 129 + }, + { + "epoch": 0.021062864549578744, + "grad_norm": 2.3664584159851074, + "learning_rate": 1.1690647482014388e-06, + "loss": 0.1777, + "step": 130 + }, + { + "epoch": 0.0212248865845755, + "grad_norm": 2.2521674633026123, + "learning_rate": 1.1780575539568347e-06, + "loss": 0.1707, + "step": 131 + }, + { + "epoch": 0.02138690861957226, + "grad_norm": 2.394850254058838, + "learning_rate": 1.1870503597122303e-06, + "loss": 0.1856, + "step": 132 + }, + { + "epoch": 0.02154893065456902, + "grad_norm": 2.332915782928467, + "learning_rate": 1.196043165467626e-06, + "loss": 0.1818, + "step": 133 + }, + { + "epoch": 0.02171095268956578, + "grad_norm": 2.307866334915161, + "learning_rate": 1.2050359712230217e-06, + "loss": 0.1799, + "step": 134 + }, + { + "epoch": 0.021872974724562542, + "grad_norm": 2.2809898853302, + "learning_rate": 1.2140287769784172e-06, + "loss": 0.1762, + "step": 135 + }, + { + "epoch": 0.0220349967595593, + "grad_norm": 2.1759355068206787, + "learning_rate": 1.2230215827338131e-06, + "loss": 0.1522, + "step": 136 + }, + { + "epoch": 0.02219701879455606, + "grad_norm": 2.172473907470703, + "learning_rate": 1.2320143884892087e-06, + "loss": 0.1693, + "step": 137 + }, + { + "epoch": 0.02235904082955282, + "grad_norm": 2.1808273792266846, + "learning_rate": 1.2410071942446044e-06, + "loss": 0.1549, + "step": 138 + }, + { + "epoch": 0.02252106286454958, + "grad_norm": 2.164473533630371, + "learning_rate": 1.25e-06, + "loss": 0.1798, + "step": 139 + }, + { + "epoch": 0.02268308489954634, + "grad_norm": 2.2718453407287598, + "learning_rate": 1.2589928057553958e-06, + "loss": 0.1606, + "step": 140 + }, + { + "epoch": 0.022845106934543096, + "grad_norm": 2.1400561332702637, + "learning_rate": 1.2679856115107913e-06, + "loss": 0.1428, + "step": 141 + }, + { + "epoch": 0.023007128969539856, + "grad_norm": 2.193063497543335, + "learning_rate": 1.2769784172661873e-06, + "loss": 0.1729, + "step": 142 + }, + { + "epoch": 0.023169151004536617, + "grad_norm": 2.2718231678009033, + "learning_rate": 1.285971223021583e-06, + "loss": 0.1803, + "step": 143 + }, + { + "epoch": 0.023331173039533377, + "grad_norm": 2.3830406665802, + "learning_rate": 1.2949640287769785e-06, + "loss": 0.1688, + "step": 144 + }, + { + "epoch": 0.023493195074530137, + "grad_norm": 2.2217133045196533, + "learning_rate": 1.3039568345323742e-06, + "loss": 0.172, + "step": 145 + }, + { + "epoch": 0.023655217109526897, + "grad_norm": 2.429953098297119, + "learning_rate": 1.3129496402877697e-06, + "loss": 0.1761, + "step": 146 + }, + { + "epoch": 0.023817239144523654, + "grad_norm": 2.481562614440918, + "learning_rate": 1.3219424460431657e-06, + "loss": 0.1777, + "step": 147 + }, + { + "epoch": 0.023979261179520414, + "grad_norm": 2.3909571170806885, + "learning_rate": 1.3309352517985614e-06, + "loss": 0.1814, + "step": 148 + }, + { + "epoch": 0.024141283214517174, + "grad_norm": 2.3645970821380615, + "learning_rate": 1.339928057553957e-06, + "loss": 0.179, + "step": 149 + }, + { + "epoch": 0.024303305249513935, + "grad_norm": 2.5163588523864746, + "learning_rate": 1.3489208633093526e-06, + "loss": 0.2024, + "step": 150 + }, + { + "epoch": 0.024465327284510695, + "grad_norm": 2.2522170543670654, + "learning_rate": 1.3579136690647481e-06, + "loss": 0.1844, + "step": 151 + }, + { + "epoch": 0.02462734931950745, + "grad_norm": 2.276763677597046, + "learning_rate": 1.366906474820144e-06, + "loss": 0.1934, + "step": 152 + }, + { + "epoch": 0.02478937135450421, + "grad_norm": 2.1778862476348877, + "learning_rate": 1.3758992805755398e-06, + "loss": 0.166, + "step": 153 + }, + { + "epoch": 0.024951393389500972, + "grad_norm": 2.162675619125366, + "learning_rate": 1.3848920863309353e-06, + "loss": 0.1823, + "step": 154 + }, + { + "epoch": 0.025113415424497732, + "grad_norm": 2.212240695953369, + "learning_rate": 1.393884892086331e-06, + "loss": 0.1814, + "step": 155 + }, + { + "epoch": 0.025275437459494492, + "grad_norm": 2.283370018005371, + "learning_rate": 1.4028776978417265e-06, + "loss": 0.1821, + "step": 156 + }, + { + "epoch": 0.02543745949449125, + "grad_norm": 2.1461737155914307, + "learning_rate": 1.4118705035971225e-06, + "loss": 0.1689, + "step": 157 + }, + { + "epoch": 0.02559948152948801, + "grad_norm": 2.1343040466308594, + "learning_rate": 1.4208633093525182e-06, + "loss": 0.1754, + "step": 158 + }, + { + "epoch": 0.02576150356448477, + "grad_norm": 2.200503349304199, + "learning_rate": 1.4298561151079137e-06, + "loss": 0.1824, + "step": 159 + }, + { + "epoch": 0.02592352559948153, + "grad_norm": 2.3421995639801025, + "learning_rate": 1.4388489208633094e-06, + "loss": 0.1768, + "step": 160 + }, + { + "epoch": 0.02608554763447829, + "grad_norm": 2.1087827682495117, + "learning_rate": 1.447841726618705e-06, + "loss": 0.1929, + "step": 161 + }, + { + "epoch": 0.02624756966947505, + "grad_norm": 2.4311277866363525, + "learning_rate": 1.4568345323741009e-06, + "loss": 0.1845, + "step": 162 + }, + { + "epoch": 0.026409591704471807, + "grad_norm": 2.0372049808502197, + "learning_rate": 1.4658273381294966e-06, + "loss": 0.163, + "step": 163 + }, + { + "epoch": 0.026571613739468567, + "grad_norm": 2.1894679069519043, + "learning_rate": 1.474820143884892e-06, + "loss": 0.1511, + "step": 164 + }, + { + "epoch": 0.026733635774465327, + "grad_norm": 2.3050854206085205, + "learning_rate": 1.4838129496402878e-06, + "loss": 0.1867, + "step": 165 + }, + { + "epoch": 0.026895657809462088, + "grad_norm": 2.4033379554748535, + "learning_rate": 1.4928057553956835e-06, + "loss": 0.1739, + "step": 166 + }, + { + "epoch": 0.027057679844458848, + "grad_norm": 2.3603515625, + "learning_rate": 1.5017985611510793e-06, + "loss": 0.162, + "step": 167 + }, + { + "epoch": 0.027219701879455604, + "grad_norm": 1.9225950241088867, + "learning_rate": 1.510791366906475e-06, + "loss": 0.1444, + "step": 168 + }, + { + "epoch": 0.027381723914452365, + "grad_norm": 2.3583033084869385, + "learning_rate": 1.5197841726618707e-06, + "loss": 0.1942, + "step": 169 + }, + { + "epoch": 0.027543745949449125, + "grad_norm": 2.416661262512207, + "learning_rate": 1.5287769784172662e-06, + "loss": 0.1778, + "step": 170 + }, + { + "epoch": 0.027705767984445885, + "grad_norm": 2.0838210582733154, + "learning_rate": 1.537769784172662e-06, + "loss": 0.1581, + "step": 171 + }, + { + "epoch": 0.027867790019442645, + "grad_norm": 2.3810360431671143, + "learning_rate": 1.5467625899280579e-06, + "loss": 0.174, + "step": 172 + }, + { + "epoch": 0.028029812054439402, + "grad_norm": 2.243232011795044, + "learning_rate": 1.5557553956834534e-06, + "loss": 0.1563, + "step": 173 + }, + { + "epoch": 0.028191834089436162, + "grad_norm": 2.2563958168029785, + "learning_rate": 1.5647482014388491e-06, + "loss": 0.1719, + "step": 174 + }, + { + "epoch": 0.028353856124432922, + "grad_norm": 2.283281087875366, + "learning_rate": 1.5737410071942446e-06, + "loss": 0.1607, + "step": 175 + }, + { + "epoch": 0.028515878159429683, + "grad_norm": 2.166625499725342, + "learning_rate": 1.5827338129496403e-06, + "loss": 0.1615, + "step": 176 + }, + { + "epoch": 0.028677900194426443, + "grad_norm": 2.2912237644195557, + "learning_rate": 1.5917266187050363e-06, + "loss": 0.1731, + "step": 177 + }, + { + "epoch": 0.028839922229423203, + "grad_norm": 2.196793794631958, + "learning_rate": 1.6007194244604318e-06, + "loss": 0.1644, + "step": 178 + }, + { + "epoch": 0.02900194426441996, + "grad_norm": 2.193145513534546, + "learning_rate": 1.6097122302158275e-06, + "loss": 0.1632, + "step": 179 + }, + { + "epoch": 0.02916396629941672, + "grad_norm": 2.420989990234375, + "learning_rate": 1.618705035971223e-06, + "loss": 0.1953, + "step": 180 + }, + { + "epoch": 0.02932598833441348, + "grad_norm": 2.0741348266601562, + "learning_rate": 1.6276978417266187e-06, + "loss": 0.1573, + "step": 181 + }, + { + "epoch": 0.02948801036941024, + "grad_norm": 2.2281997203826904, + "learning_rate": 1.6366906474820147e-06, + "loss": 0.1897, + "step": 182 + }, + { + "epoch": 0.029650032404407, + "grad_norm": 2.163466453552246, + "learning_rate": 1.6456834532374102e-06, + "loss": 0.1742, + "step": 183 + }, + { + "epoch": 0.029812054439403757, + "grad_norm": 2.054077625274658, + "learning_rate": 1.654676258992806e-06, + "loss": 0.1741, + "step": 184 + }, + { + "epoch": 0.029974076474400518, + "grad_norm": 2.1084864139556885, + "learning_rate": 1.6636690647482014e-06, + "loss": 0.1887, + "step": 185 + }, + { + "epoch": 0.030136098509397278, + "grad_norm": 2.160194158554077, + "learning_rate": 1.6726618705035971e-06, + "loss": 0.162, + "step": 186 + }, + { + "epoch": 0.030298120544394038, + "grad_norm": 2.227296829223633, + "learning_rate": 1.681654676258993e-06, + "loss": 0.1717, + "step": 187 + }, + { + "epoch": 0.0304601425793908, + "grad_norm": 2.0688178539276123, + "learning_rate": 1.6906474820143886e-06, + "loss": 0.1705, + "step": 188 + }, + { + "epoch": 0.030622164614387555, + "grad_norm": 2.081521511077881, + "learning_rate": 1.6996402877697843e-06, + "loss": 0.1759, + "step": 189 + }, + { + "epoch": 0.030784186649384315, + "grad_norm": 2.2949635982513428, + "learning_rate": 1.7086330935251798e-06, + "loss": 0.1693, + "step": 190 + }, + { + "epoch": 0.030946208684381075, + "grad_norm": 2.3229682445526123, + "learning_rate": 1.7176258992805755e-06, + "loss": 0.186, + "step": 191 + }, + { + "epoch": 0.031108230719377836, + "grad_norm": 2.3723998069763184, + "learning_rate": 1.7266187050359715e-06, + "loss": 0.1902, + "step": 192 + }, + { + "epoch": 0.031270252754374596, + "grad_norm": 2.150329828262329, + "learning_rate": 1.735611510791367e-06, + "loss": 0.1723, + "step": 193 + }, + { + "epoch": 0.031432274789371356, + "grad_norm": 2.097501277923584, + "learning_rate": 1.7446043165467627e-06, + "loss": 0.165, + "step": 194 + }, + { + "epoch": 0.031594296824368116, + "grad_norm": 2.10372257232666, + "learning_rate": 1.7535971223021584e-06, + "loss": 0.1578, + "step": 195 + }, + { + "epoch": 0.031756318859364877, + "grad_norm": 2.1476476192474365, + "learning_rate": 1.762589928057554e-06, + "loss": 0.1662, + "step": 196 + }, + { + "epoch": 0.03191834089436163, + "grad_norm": 2.1310970783233643, + "learning_rate": 1.7715827338129499e-06, + "loss": 0.1655, + "step": 197 + }, + { + "epoch": 0.03208036292935839, + "grad_norm": 2.2113521099090576, + "learning_rate": 1.7805755395683456e-06, + "loss": 0.1758, + "step": 198 + }, + { + "epoch": 0.03224238496435515, + "grad_norm": 2.0751709938049316, + "learning_rate": 1.7895683453237411e-06, + "loss": 0.1585, + "step": 199 + }, + { + "epoch": 0.03240440699935191, + "grad_norm": 1.9745714664459229, + "learning_rate": 1.7985611510791368e-06, + "loss": 0.1666, + "step": 200 + }, + { + "epoch": 0.03256642903434867, + "grad_norm": 2.1305060386657715, + "learning_rate": 1.8075539568345323e-06, + "loss": 0.1843, + "step": 201 + }, + { + "epoch": 0.03272845106934543, + "grad_norm": 2.0196950435638428, + "learning_rate": 1.8165467625899283e-06, + "loss": 0.1768, + "step": 202 + }, + { + "epoch": 0.03289047310434219, + "grad_norm": 2.2363412380218506, + "learning_rate": 1.825539568345324e-06, + "loss": 0.169, + "step": 203 + }, + { + "epoch": 0.03305249513933895, + "grad_norm": 2.118345260620117, + "learning_rate": 1.8345323741007195e-06, + "loss": 0.1486, + "step": 204 + }, + { + "epoch": 0.03321451717433571, + "grad_norm": 2.0574629306793213, + "learning_rate": 1.8435251798561152e-06, + "loss": 0.1794, + "step": 205 + }, + { + "epoch": 0.03337653920933247, + "grad_norm": 1.975506067276001, + "learning_rate": 1.8525179856115107e-06, + "loss": 0.1608, + "step": 206 + }, + { + "epoch": 0.03353856124432923, + "grad_norm": 2.1187586784362793, + "learning_rate": 1.8615107913669067e-06, + "loss": 0.1514, + "step": 207 + }, + { + "epoch": 0.033700583279325985, + "grad_norm": 2.031742811203003, + "learning_rate": 1.8705035971223024e-06, + "loss": 0.1567, + "step": 208 + }, + { + "epoch": 0.033862605314322745, + "grad_norm": 2.198672294616699, + "learning_rate": 1.879496402877698e-06, + "loss": 0.1709, + "step": 209 + }, + { + "epoch": 0.034024627349319506, + "grad_norm": 1.9215576648712158, + "learning_rate": 1.8884892086330936e-06, + "loss": 0.1466, + "step": 210 + }, + { + "epoch": 0.034186649384316266, + "grad_norm": 2.0358407497406006, + "learning_rate": 1.8974820143884896e-06, + "loss": 0.1641, + "step": 211 + }, + { + "epoch": 0.034348671419313026, + "grad_norm": 2.2609496116638184, + "learning_rate": 1.906474820143885e-06, + "loss": 0.1607, + "step": 212 + }, + { + "epoch": 0.034510693454309786, + "grad_norm": 2.0340802669525146, + "learning_rate": 1.915467625899281e-06, + "loss": 0.1605, + "step": 213 + }, + { + "epoch": 0.034672715489306546, + "grad_norm": 2.190073013305664, + "learning_rate": 1.9244604316546765e-06, + "loss": 0.1893, + "step": 214 + }, + { + "epoch": 0.03483473752430331, + "grad_norm": 2.129892587661743, + "learning_rate": 1.933453237410072e-06, + "loss": 0.1657, + "step": 215 + }, + { + "epoch": 0.03499675955930007, + "grad_norm": 1.9545798301696777, + "learning_rate": 1.942446043165468e-06, + "loss": 0.1698, + "step": 216 + }, + { + "epoch": 0.03515878159429683, + "grad_norm": 1.9465980529785156, + "learning_rate": 1.9514388489208637e-06, + "loss": 0.1571, + "step": 217 + }, + { + "epoch": 0.03532080362929359, + "grad_norm": 2.116178512573242, + "learning_rate": 1.960431654676259e-06, + "loss": 0.1581, + "step": 218 + }, + { + "epoch": 0.03548282566429034, + "grad_norm": 2.0396151542663574, + "learning_rate": 1.9694244604316547e-06, + "loss": 0.1637, + "step": 219 + }, + { + "epoch": 0.0356448476992871, + "grad_norm": 2.1016218662261963, + "learning_rate": 1.9784172661870504e-06, + "loss": 0.1707, + "step": 220 + }, + { + "epoch": 0.03580686973428386, + "grad_norm": 2.122370719909668, + "learning_rate": 1.987410071942446e-06, + "loss": 0.1701, + "step": 221 + }, + { + "epoch": 0.03596889176928062, + "grad_norm": 2.2020812034606934, + "learning_rate": 1.996402877697842e-06, + "loss": 0.1755, + "step": 222 + }, + { + "epoch": 0.03613091380427738, + "grad_norm": 2.160726547241211, + "learning_rate": 2.0053956834532376e-06, + "loss": 0.1704, + "step": 223 + }, + { + "epoch": 0.03629293583927414, + "grad_norm": 2.121961832046509, + "learning_rate": 2.0143884892086333e-06, + "loss": 0.1631, + "step": 224 + }, + { + "epoch": 0.0364549578742709, + "grad_norm": 2.045151472091675, + "learning_rate": 2.023381294964029e-06, + "loss": 0.1794, + "step": 225 + }, + { + "epoch": 0.03661697990926766, + "grad_norm": 1.982482671737671, + "learning_rate": 2.0323741007194248e-06, + "loss": 0.1649, + "step": 226 + }, + { + "epoch": 0.03677900194426442, + "grad_norm": 2.0936903953552246, + "learning_rate": 2.0413669064748205e-06, + "loss": 0.1894, + "step": 227 + }, + { + "epoch": 0.03694102397926118, + "grad_norm": 1.9307271242141724, + "learning_rate": 2.050359712230216e-06, + "loss": 0.1658, + "step": 228 + }, + { + "epoch": 0.037103046014257936, + "grad_norm": 2.0920655727386475, + "learning_rate": 2.0593525179856115e-06, + "loss": 0.1823, + "step": 229 + }, + { + "epoch": 0.037265068049254696, + "grad_norm": 2.037468194961548, + "learning_rate": 2.0683453237410072e-06, + "loss": 0.1854, + "step": 230 + }, + { + "epoch": 0.037427090084251456, + "grad_norm": 2.0791573524475098, + "learning_rate": 2.0773381294964034e-06, + "loss": 0.1668, + "step": 231 + }, + { + "epoch": 0.037589112119248216, + "grad_norm": 2.020608425140381, + "learning_rate": 2.0863309352517987e-06, + "loss": 0.1664, + "step": 232 + }, + { + "epoch": 0.03775113415424498, + "grad_norm": 2.0509209632873535, + "learning_rate": 2.0953237410071944e-06, + "loss": 0.1655, + "step": 233 + }, + { + "epoch": 0.03791315618924174, + "grad_norm": 1.895331621170044, + "learning_rate": 2.10431654676259e-06, + "loss": 0.1671, + "step": 234 + }, + { + "epoch": 0.0380751782242385, + "grad_norm": 1.8676968812942505, + "learning_rate": 2.113309352517986e-06, + "loss": 0.1499, + "step": 235 + }, + { + "epoch": 0.03823720025923526, + "grad_norm": 1.9968868494033813, + "learning_rate": 2.1223021582733816e-06, + "loss": 0.1704, + "step": 236 + }, + { + "epoch": 0.03839922229423202, + "grad_norm": 2.107897996902466, + "learning_rate": 2.1312949640287773e-06, + "loss": 0.1738, + "step": 237 + }, + { + "epoch": 0.03856124432922878, + "grad_norm": 2.0445058345794678, + "learning_rate": 2.140287769784173e-06, + "loss": 0.1809, + "step": 238 + }, + { + "epoch": 0.03872326636422554, + "grad_norm": 2.110241413116455, + "learning_rate": 2.1492805755395683e-06, + "loss": 0.1731, + "step": 239 + }, + { + "epoch": 0.03888528839922229, + "grad_norm": 2.0265743732452393, + "learning_rate": 2.158273381294964e-06, + "loss": 0.1581, + "step": 240 + }, + { + "epoch": 0.03904731043421905, + "grad_norm": 1.8976444005966187, + "learning_rate": 2.16726618705036e-06, + "loss": 0.1622, + "step": 241 + }, + { + "epoch": 0.03920933246921581, + "grad_norm": 1.967790126800537, + "learning_rate": 2.1762589928057555e-06, + "loss": 0.1639, + "step": 242 + }, + { + "epoch": 0.03937135450421257, + "grad_norm": 2.0550568103790283, + "learning_rate": 2.185251798561151e-06, + "loss": 0.1786, + "step": 243 + }, + { + "epoch": 0.03953337653920933, + "grad_norm": 1.9519095420837402, + "learning_rate": 2.194244604316547e-06, + "loss": 0.1772, + "step": 244 + }, + { + "epoch": 0.03969539857420609, + "grad_norm": 2.0229897499084473, + "learning_rate": 2.2032374100719426e-06, + "loss": 0.1746, + "step": 245 + }, + { + "epoch": 0.03985742060920285, + "grad_norm": 2.133192777633667, + "learning_rate": 2.2122302158273384e-06, + "loss": 0.172, + "step": 246 + }, + { + "epoch": 0.04001944264419961, + "grad_norm": 2.323986053466797, + "learning_rate": 2.221223021582734e-06, + "loss": 0.1845, + "step": 247 + }, + { + "epoch": 0.04018146467919637, + "grad_norm": 2.0216121673583984, + "learning_rate": 2.23021582733813e-06, + "loss": 0.1844, + "step": 248 + }, + { + "epoch": 0.04034348671419313, + "grad_norm": 1.8960579633712769, + "learning_rate": 2.2392086330935255e-06, + "loss": 0.1468, + "step": 249 + }, + { + "epoch": 0.04050550874918989, + "grad_norm": 1.971975564956665, + "learning_rate": 2.248201438848921e-06, + "loss": 0.1822, + "step": 250 + }, + { + "epoch": 0.040667530784186647, + "grad_norm": 1.8761088848114014, + "learning_rate": 2.257194244604317e-06, + "loss": 0.1613, + "step": 251 + }, + { + "epoch": 0.04082955281918341, + "grad_norm": 1.8940060138702393, + "learning_rate": 2.2661870503597123e-06, + "loss": 0.1827, + "step": 252 + }, + { + "epoch": 0.04099157485418017, + "grad_norm": 1.880953311920166, + "learning_rate": 2.275179856115108e-06, + "loss": 0.1738, + "step": 253 + }, + { + "epoch": 0.04115359688917693, + "grad_norm": 2.1408498287200928, + "learning_rate": 2.2841726618705037e-06, + "loss": 0.2066, + "step": 254 + }, + { + "epoch": 0.04131561892417369, + "grad_norm": 2.4467737674713135, + "learning_rate": 2.2931654676258994e-06, + "loss": 0.1646, + "step": 255 + }, + { + "epoch": 0.04147764095917045, + "grad_norm": 1.8072450160980225, + "learning_rate": 2.302158273381295e-06, + "loss": 0.1475, + "step": 256 + }, + { + "epoch": 0.04163966299416721, + "grad_norm": 2.035788059234619, + "learning_rate": 2.311151079136691e-06, + "loss": 0.1742, + "step": 257 + }, + { + "epoch": 0.04180168502916397, + "grad_norm": 2.0450069904327393, + "learning_rate": 2.3201438848920866e-06, + "loss": 0.1733, + "step": 258 + }, + { + "epoch": 0.04196370706416073, + "grad_norm": 2.0818724632263184, + "learning_rate": 2.3291366906474823e-06, + "loss": 0.1743, + "step": 259 + }, + { + "epoch": 0.04212572909915749, + "grad_norm": 2.11169695854187, + "learning_rate": 2.3381294964028776e-06, + "loss": 0.1632, + "step": 260 + }, + { + "epoch": 0.04228775113415424, + "grad_norm": 2.079584836959839, + "learning_rate": 2.3471223021582738e-06, + "loss": 0.1868, + "step": 261 + }, + { + "epoch": 0.042449773169151, + "grad_norm": 2.0339434146881104, + "learning_rate": 2.3561151079136695e-06, + "loss": 0.1536, + "step": 262 + }, + { + "epoch": 0.04261179520414776, + "grad_norm": 2.0371828079223633, + "learning_rate": 2.365107913669065e-06, + "loss": 0.174, + "step": 263 + }, + { + "epoch": 0.04277381723914452, + "grad_norm": 2.0146055221557617, + "learning_rate": 2.3741007194244605e-06, + "loss": 0.1599, + "step": 264 + }, + { + "epoch": 0.04293583927414128, + "grad_norm": 2.0458879470825195, + "learning_rate": 2.3830935251798562e-06, + "loss": 0.1722, + "step": 265 + }, + { + "epoch": 0.04309786130913804, + "grad_norm": 1.9001797437667847, + "learning_rate": 2.392086330935252e-06, + "loss": 0.1519, + "step": 266 + }, + { + "epoch": 0.0432598833441348, + "grad_norm": 1.9941308498382568, + "learning_rate": 2.4010791366906477e-06, + "loss": 0.1796, + "step": 267 + }, + { + "epoch": 0.04342190537913156, + "grad_norm": 2.0200061798095703, + "learning_rate": 2.4100719424460434e-06, + "loss": 0.1746, + "step": 268 + }, + { + "epoch": 0.04358392741412832, + "grad_norm": 2.166887044906616, + "learning_rate": 2.419064748201439e-06, + "loss": 0.1712, + "step": 269 + }, + { + "epoch": 0.043745949449125084, + "grad_norm": 2.011035680770874, + "learning_rate": 2.4280575539568344e-06, + "loss": 0.1726, + "step": 270 + }, + { + "epoch": 0.043907971484121844, + "grad_norm": 2.070662021636963, + "learning_rate": 2.4370503597122306e-06, + "loss": 0.1877, + "step": 271 + }, + { + "epoch": 0.0440699935191186, + "grad_norm": 1.8755451440811157, + "learning_rate": 2.4460431654676263e-06, + "loss": 0.153, + "step": 272 + }, + { + "epoch": 0.04423201555411536, + "grad_norm": 2.0150034427642822, + "learning_rate": 2.4550359712230216e-06, + "loss": 0.1857, + "step": 273 + }, + { + "epoch": 0.04439403758911212, + "grad_norm": 1.969179391860962, + "learning_rate": 2.4640287769784173e-06, + "loss": 0.1758, + "step": 274 + }, + { + "epoch": 0.04455605962410888, + "grad_norm": 1.8629865646362305, + "learning_rate": 2.473021582733813e-06, + "loss": 0.1643, + "step": 275 + }, + { + "epoch": 0.04471808165910564, + "grad_norm": 1.8298242092132568, + "learning_rate": 2.4820143884892088e-06, + "loss": 0.1666, + "step": 276 + }, + { + "epoch": 0.0448801036941024, + "grad_norm": 1.870679259300232, + "learning_rate": 2.4910071942446045e-06, + "loss": 0.1613, + "step": 277 + }, + { + "epoch": 0.04504212572909916, + "grad_norm": 2.0010008811950684, + "learning_rate": 2.5e-06, + "loss": 0.171, + "step": 278 + }, + { + "epoch": 0.04520414776409592, + "grad_norm": 2.0288240909576416, + "learning_rate": 2.508992805755396e-06, + "loss": 0.1759, + "step": 279 + }, + { + "epoch": 0.04536616979909268, + "grad_norm": 1.911431908607483, + "learning_rate": 2.5179856115107916e-06, + "loss": 0.1709, + "step": 280 + }, + { + "epoch": 0.04552819183408944, + "grad_norm": 2.0871028900146484, + "learning_rate": 2.526978417266187e-06, + "loss": 0.1817, + "step": 281 + }, + { + "epoch": 0.04569021386908619, + "grad_norm": 1.9303252696990967, + "learning_rate": 2.5359712230215827e-06, + "loss": 0.189, + "step": 282 + }, + { + "epoch": 0.04585223590408295, + "grad_norm": 2.0121030807495117, + "learning_rate": 2.544964028776979e-06, + "loss": 0.1956, + "step": 283 + }, + { + "epoch": 0.04601425793907971, + "grad_norm": 1.8766732215881348, + "learning_rate": 2.5539568345323745e-06, + "loss": 0.1731, + "step": 284 + }, + { + "epoch": 0.04617627997407647, + "grad_norm": 1.7872287034988403, + "learning_rate": 2.5629496402877703e-06, + "loss": 0.1672, + "step": 285 + }, + { + "epoch": 0.04633830200907323, + "grad_norm": 2.2016873359680176, + "learning_rate": 2.571942446043166e-06, + "loss": 0.1942, + "step": 286 + }, + { + "epoch": 0.04650032404406999, + "grad_norm": 2.001664400100708, + "learning_rate": 2.5809352517985613e-06, + "loss": 0.1779, + "step": 287 + }, + { + "epoch": 0.046662346079066754, + "grad_norm": 1.881262183189392, + "learning_rate": 2.589928057553957e-06, + "loss": 0.1621, + "step": 288 + }, + { + "epoch": 0.046824368114063514, + "grad_norm": 2.1064798831939697, + "learning_rate": 2.5989208633093527e-06, + "loss": 0.1788, + "step": 289 + }, + { + "epoch": 0.046986390149060274, + "grad_norm": 1.8403074741363525, + "learning_rate": 2.6079136690647484e-06, + "loss": 0.1627, + "step": 290 + }, + { + "epoch": 0.047148412184057034, + "grad_norm": 2.0622360706329346, + "learning_rate": 2.6169064748201437e-06, + "loss": 0.1742, + "step": 291 + }, + { + "epoch": 0.047310434219053794, + "grad_norm": 1.878718376159668, + "learning_rate": 2.6258992805755395e-06, + "loss": 0.1509, + "step": 292 + }, + { + "epoch": 0.04747245625405055, + "grad_norm": 2.0211925506591797, + "learning_rate": 2.6348920863309356e-06, + "loss": 0.1923, + "step": 293 + }, + { + "epoch": 0.04763447828904731, + "grad_norm": 2.1039490699768066, + "learning_rate": 2.6438848920863313e-06, + "loss": 0.1733, + "step": 294 + }, + { + "epoch": 0.04779650032404407, + "grad_norm": 2.026010751724243, + "learning_rate": 2.652877697841727e-06, + "loss": 0.1759, + "step": 295 + }, + { + "epoch": 0.04795852235904083, + "grad_norm": 2.0125303268432617, + "learning_rate": 2.6618705035971228e-06, + "loss": 0.1844, + "step": 296 + }, + { + "epoch": 0.04812054439403759, + "grad_norm": 1.8597307205200195, + "learning_rate": 2.670863309352518e-06, + "loss": 0.1637, + "step": 297 + }, + { + "epoch": 0.04828256642903435, + "grad_norm": 1.8322033882141113, + "learning_rate": 2.679856115107914e-06, + "loss": 0.1638, + "step": 298 + }, + { + "epoch": 0.04844458846403111, + "grad_norm": 1.9159802198410034, + "learning_rate": 2.6888489208633095e-06, + "loss": 0.1668, + "step": 299 + }, + { + "epoch": 0.04860661049902787, + "grad_norm": 1.653609037399292, + "learning_rate": 2.6978417266187052e-06, + "loss": 0.1523, + "step": 300 + }, + { + "epoch": 0.04876863253402463, + "grad_norm": 1.8605982065200806, + "learning_rate": 2.706834532374101e-06, + "loss": 0.1641, + "step": 301 + }, + { + "epoch": 0.04893065456902139, + "grad_norm": 1.9447520971298218, + "learning_rate": 2.7158273381294963e-06, + "loss": 0.1795, + "step": 302 + }, + { + "epoch": 0.04909267660401815, + "grad_norm": 2.1139204502105713, + "learning_rate": 2.7248201438848924e-06, + "loss": 0.1839, + "step": 303 + }, + { + "epoch": 0.0492546986390149, + "grad_norm": 2.0103182792663574, + "learning_rate": 2.733812949640288e-06, + "loss": 0.1707, + "step": 304 + }, + { + "epoch": 0.04941672067401166, + "grad_norm": 1.9638346433639526, + "learning_rate": 2.742805755395684e-06, + "loss": 0.1548, + "step": 305 + }, + { + "epoch": 0.04957874270900842, + "grad_norm": 2.0801048278808594, + "learning_rate": 2.7517985611510796e-06, + "loss": 0.1813, + "step": 306 + }, + { + "epoch": 0.049740764744005184, + "grad_norm": 1.848359227180481, + "learning_rate": 2.760791366906475e-06, + "loss": 0.1655, + "step": 307 + }, + { + "epoch": 0.049902786779001944, + "grad_norm": 1.85602605342865, + "learning_rate": 2.7697841726618706e-06, + "loss": 0.1789, + "step": 308 + }, + { + "epoch": 0.050064808813998704, + "grad_norm": 1.8783005475997925, + "learning_rate": 2.7787769784172663e-06, + "loss": 0.1753, + "step": 309 + }, + { + "epoch": 0.050226830848995464, + "grad_norm": 1.981858253479004, + "learning_rate": 2.787769784172662e-06, + "loss": 0.1798, + "step": 310 + }, + { + "epoch": 0.050388852883992225, + "grad_norm": 2.0939548015594482, + "learning_rate": 2.7967625899280578e-06, + "loss": 0.1923, + "step": 311 + }, + { + "epoch": 0.050550874918988985, + "grad_norm": 2.0538747310638428, + "learning_rate": 2.805755395683453e-06, + "loss": 0.1869, + "step": 312 + }, + { + "epoch": 0.050712896953985745, + "grad_norm": 1.807354211807251, + "learning_rate": 2.8147482014388492e-06, + "loss": 0.1719, + "step": 313 + }, + { + "epoch": 0.0508749189889825, + "grad_norm": 2.057577610015869, + "learning_rate": 2.823741007194245e-06, + "loss": 0.1939, + "step": 314 + }, + { + "epoch": 0.05103694102397926, + "grad_norm": 1.821679949760437, + "learning_rate": 2.8327338129496407e-06, + "loss": 0.1564, + "step": 315 + }, + { + "epoch": 0.05119896305897602, + "grad_norm": 1.8081183433532715, + "learning_rate": 2.8417266187050364e-06, + "loss": 0.1717, + "step": 316 + }, + { + "epoch": 0.05136098509397278, + "grad_norm": 1.8699404001235962, + "learning_rate": 2.850719424460432e-06, + "loss": 0.173, + "step": 317 + }, + { + "epoch": 0.05152300712896954, + "grad_norm": 2.0023691654205322, + "learning_rate": 2.8597122302158274e-06, + "loss": 0.1764, + "step": 318 + }, + { + "epoch": 0.0516850291639663, + "grad_norm": 2.0363411903381348, + "learning_rate": 2.868705035971223e-06, + "loss": 0.1744, + "step": 319 + }, + { + "epoch": 0.05184705119896306, + "grad_norm": 1.8756717443466187, + "learning_rate": 2.877697841726619e-06, + "loss": 0.176, + "step": 320 + }, + { + "epoch": 0.05200907323395982, + "grad_norm": 2.1065685749053955, + "learning_rate": 2.8866906474820146e-06, + "loss": 0.1713, + "step": 321 + }, + { + "epoch": 0.05217109526895658, + "grad_norm": 1.929309606552124, + "learning_rate": 2.89568345323741e-06, + "loss": 0.1705, + "step": 322 + }, + { + "epoch": 0.05233311730395334, + "grad_norm": 1.8109393119812012, + "learning_rate": 2.9046762589928064e-06, + "loss": 0.1771, + "step": 323 + }, + { + "epoch": 0.0524951393389501, + "grad_norm": 1.9056837558746338, + "learning_rate": 2.9136690647482017e-06, + "loss": 0.1714, + "step": 324 + }, + { + "epoch": 0.052657161373946854, + "grad_norm": 1.8842450380325317, + "learning_rate": 2.9226618705035975e-06, + "loss": 0.1938, + "step": 325 + }, + { + "epoch": 0.052819183408943614, + "grad_norm": 1.9185986518859863, + "learning_rate": 2.931654676258993e-06, + "loss": 0.1771, + "step": 326 + }, + { + "epoch": 0.052981205443940374, + "grad_norm": 1.8425815105438232, + "learning_rate": 2.940647482014389e-06, + "loss": 0.19, + "step": 327 + }, + { + "epoch": 0.053143227478937134, + "grad_norm": 1.9350950717926025, + "learning_rate": 2.949640287769784e-06, + "loss": 0.187, + "step": 328 + }, + { + "epoch": 0.053305249513933894, + "grad_norm": 1.8640246391296387, + "learning_rate": 2.95863309352518e-06, + "loss": 0.1677, + "step": 329 + }, + { + "epoch": 0.053467271548930655, + "grad_norm": 1.8789522647857666, + "learning_rate": 2.9676258992805756e-06, + "loss": 0.1823, + "step": 330 + }, + { + "epoch": 0.053629293583927415, + "grad_norm": 1.8093454837799072, + "learning_rate": 2.9766187050359714e-06, + "loss": 0.1587, + "step": 331 + }, + { + "epoch": 0.053791315618924175, + "grad_norm": 1.8029776811599731, + "learning_rate": 2.985611510791367e-06, + "loss": 0.1723, + "step": 332 + }, + { + "epoch": 0.053953337653920935, + "grad_norm": 1.8550293445587158, + "learning_rate": 2.9946043165467632e-06, + "loss": 0.1948, + "step": 333 + }, + { + "epoch": 0.054115359688917695, + "grad_norm": 1.8107119798660278, + "learning_rate": 3.0035971223021585e-06, + "loss": 0.172, + "step": 334 + }, + { + "epoch": 0.054277381723914456, + "grad_norm": 1.7598873376846313, + "learning_rate": 3.0125899280575543e-06, + "loss": 0.1709, + "step": 335 + }, + { + "epoch": 0.05443940375891121, + "grad_norm": 1.8725013732910156, + "learning_rate": 3.02158273381295e-06, + "loss": 0.2019, + "step": 336 + }, + { + "epoch": 0.05460142579390797, + "grad_norm": 1.7645277976989746, + "learning_rate": 3.0305755395683457e-06, + "loss": 0.1677, + "step": 337 + }, + { + "epoch": 0.05476344782890473, + "grad_norm": 1.922553300857544, + "learning_rate": 3.0395683453237414e-06, + "loss": 0.1788, + "step": 338 + }, + { + "epoch": 0.05492546986390149, + "grad_norm": 1.7409249544143677, + "learning_rate": 3.0485611510791367e-06, + "loss": 0.1629, + "step": 339 + }, + { + "epoch": 0.05508749189889825, + "grad_norm": 1.914994478225708, + "learning_rate": 3.0575539568345324e-06, + "loss": 0.1757, + "step": 340 + }, + { + "epoch": 0.05524951393389501, + "grad_norm": 1.8456170558929443, + "learning_rate": 3.066546762589928e-06, + "loss": 0.1906, + "step": 341 + }, + { + "epoch": 0.05541153596889177, + "grad_norm": 1.949918270111084, + "learning_rate": 3.075539568345324e-06, + "loss": 0.2056, + "step": 342 + }, + { + "epoch": 0.05557355800388853, + "grad_norm": 1.8012248277664185, + "learning_rate": 3.08453237410072e-06, + "loss": 0.1719, + "step": 343 + }, + { + "epoch": 0.05573558003888529, + "grad_norm": 1.942472219467163, + "learning_rate": 3.0935251798561158e-06, + "loss": 0.1834, + "step": 344 + }, + { + "epoch": 0.05589760207388205, + "grad_norm": 1.8514517545700073, + "learning_rate": 3.102517985611511e-06, + "loss": 0.1911, + "step": 345 + }, + { + "epoch": 0.056059624108878804, + "grad_norm": 1.7124977111816406, + "learning_rate": 3.1115107913669068e-06, + "loss": 0.1656, + "step": 346 + }, + { + "epoch": 0.056221646143875564, + "grad_norm": 1.6776280403137207, + "learning_rate": 3.1205035971223025e-06, + "loss": 0.1741, + "step": 347 + }, + { + "epoch": 0.056383668178872325, + "grad_norm": 2.2318480014801025, + "learning_rate": 3.1294964028776982e-06, + "loss": 0.219, + "step": 348 + }, + { + "epoch": 0.056545690213869085, + "grad_norm": 1.7956312894821167, + "learning_rate": 3.1384892086330935e-06, + "loss": 0.1731, + "step": 349 + }, + { + "epoch": 0.056707712248865845, + "grad_norm": 1.813264012336731, + "learning_rate": 3.1474820143884892e-06, + "loss": 0.1798, + "step": 350 + }, + { + "epoch": 0.056869734283862605, + "grad_norm": 1.6817779541015625, + "learning_rate": 3.156474820143885e-06, + "loss": 0.1615, + "step": 351 + }, + { + "epoch": 0.057031756318859365, + "grad_norm": 1.8656824827194214, + "learning_rate": 3.1654676258992807e-06, + "loss": 0.1682, + "step": 352 + }, + { + "epoch": 0.057193778353856126, + "grad_norm": 1.817460536956787, + "learning_rate": 3.174460431654677e-06, + "loss": 0.1608, + "step": 353 + }, + { + "epoch": 0.057355800388852886, + "grad_norm": 1.9450856447219849, + "learning_rate": 3.1834532374100726e-06, + "loss": 0.1892, + "step": 354 + }, + { + "epoch": 0.057517822423849646, + "grad_norm": 1.7994742393493652, + "learning_rate": 3.192446043165468e-06, + "loss": 0.1634, + "step": 355 + }, + { + "epoch": 0.057679844458846406, + "grad_norm": 1.8007848262786865, + "learning_rate": 3.2014388489208636e-06, + "loss": 0.1731, + "step": 356 + }, + { + "epoch": 0.05784186649384316, + "grad_norm": 1.920350432395935, + "learning_rate": 3.2104316546762593e-06, + "loss": 0.174, + "step": 357 + }, + { + "epoch": 0.05800388852883992, + "grad_norm": 1.810903549194336, + "learning_rate": 3.219424460431655e-06, + "loss": 0.1828, + "step": 358 + }, + { + "epoch": 0.05816591056383668, + "grad_norm": 1.9278273582458496, + "learning_rate": 3.2284172661870507e-06, + "loss": 0.1881, + "step": 359 + }, + { + "epoch": 0.05832793259883344, + "grad_norm": 1.7868634462356567, + "learning_rate": 3.237410071942446e-06, + "loss": 0.1719, + "step": 360 + }, + { + "epoch": 0.0584899546338302, + "grad_norm": 1.6021710634231567, + "learning_rate": 3.2464028776978418e-06, + "loss": 0.1478, + "step": 361 + }, + { + "epoch": 0.05865197666882696, + "grad_norm": 1.6602813005447388, + "learning_rate": 3.2553956834532375e-06, + "loss": 0.1527, + "step": 362 + }, + { + "epoch": 0.05881399870382372, + "grad_norm": 1.8697694540023804, + "learning_rate": 3.2643884892086336e-06, + "loss": 0.1809, + "step": 363 + }, + { + "epoch": 0.05897602073882048, + "grad_norm": 1.9650862216949463, + "learning_rate": 3.2733812949640294e-06, + "loss": 0.1869, + "step": 364 + }, + { + "epoch": 0.05913804277381724, + "grad_norm": 1.7278015613555908, + "learning_rate": 3.2823741007194247e-06, + "loss": 0.1734, + "step": 365 + }, + { + "epoch": 0.059300064808814, + "grad_norm": 1.995110273361206, + "learning_rate": 3.2913669064748204e-06, + "loss": 0.1962, + "step": 366 + }, + { + "epoch": 0.05946208684381076, + "grad_norm": 1.8341810703277588, + "learning_rate": 3.300359712230216e-06, + "loss": 0.1917, + "step": 367 + }, + { + "epoch": 0.059624108878807515, + "grad_norm": 1.8125361204147339, + "learning_rate": 3.309352517985612e-06, + "loss": 0.1794, + "step": 368 + }, + { + "epoch": 0.059786130913804275, + "grad_norm": 1.6951332092285156, + "learning_rate": 3.3183453237410075e-06, + "loss": 0.1623, + "step": 369 + }, + { + "epoch": 0.059948152948801035, + "grad_norm": 1.753805160522461, + "learning_rate": 3.327338129496403e-06, + "loss": 0.1757, + "step": 370 + }, + { + "epoch": 0.060110174983797796, + "grad_norm": 1.876604676246643, + "learning_rate": 3.3363309352517986e-06, + "loss": 0.2007, + "step": 371 + }, + { + "epoch": 0.060272197018794556, + "grad_norm": 1.8833791017532349, + "learning_rate": 3.3453237410071943e-06, + "loss": 0.19, + "step": 372 + }, + { + "epoch": 0.060434219053791316, + "grad_norm": 1.769794225692749, + "learning_rate": 3.3543165467625904e-06, + "loss": 0.1679, + "step": 373 + }, + { + "epoch": 0.060596241088788076, + "grad_norm": 1.955523133277893, + "learning_rate": 3.363309352517986e-06, + "loss": 0.1937, + "step": 374 + }, + { + "epoch": 0.060758263123784836, + "grad_norm": 1.8921654224395752, + "learning_rate": 3.372302158273382e-06, + "loss": 0.1607, + "step": 375 + }, + { + "epoch": 0.0609202851587816, + "grad_norm": 1.7015023231506348, + "learning_rate": 3.381294964028777e-06, + "loss": 0.1699, + "step": 376 + }, + { + "epoch": 0.06108230719377836, + "grad_norm": 1.8324452638626099, + "learning_rate": 3.390287769784173e-06, + "loss": 0.1949, + "step": 377 + }, + { + "epoch": 0.06124432922877511, + "grad_norm": 1.611916422843933, + "learning_rate": 3.3992805755395686e-06, + "loss": 0.1381, + "step": 378 + }, + { + "epoch": 0.06140635126377187, + "grad_norm": 1.8987873792648315, + "learning_rate": 3.4082733812949643e-06, + "loss": 0.1813, + "step": 379 + }, + { + "epoch": 0.06156837329876863, + "grad_norm": 1.7634623050689697, + "learning_rate": 3.4172661870503596e-06, + "loss": 0.1739, + "step": 380 + }, + { + "epoch": 0.06173039533376539, + "grad_norm": 1.7649253606796265, + "learning_rate": 3.4262589928057554e-06, + "loss": 0.1654, + "step": 381 + }, + { + "epoch": 0.06189241736876215, + "grad_norm": 1.7400226593017578, + "learning_rate": 3.435251798561151e-06, + "loss": 0.1646, + "step": 382 + }, + { + "epoch": 0.06205443940375891, + "grad_norm": 1.8828028440475464, + "learning_rate": 3.4442446043165472e-06, + "loss": 0.1785, + "step": 383 + }, + { + "epoch": 0.06221646143875567, + "grad_norm": 1.9339855909347534, + "learning_rate": 3.453237410071943e-06, + "loss": 0.1811, + "step": 384 + }, + { + "epoch": 0.06237848347375243, + "grad_norm": 1.6471163034439087, + "learning_rate": 3.4622302158273387e-06, + "loss": 0.1637, + "step": 385 + }, + { + "epoch": 0.06254050550874919, + "grad_norm": 1.9495569467544556, + "learning_rate": 3.471223021582734e-06, + "loss": 0.1963, + "step": 386 + }, + { + "epoch": 0.06270252754374595, + "grad_norm": 1.7704451084136963, + "learning_rate": 3.4802158273381297e-06, + "loss": 0.1774, + "step": 387 + }, + { + "epoch": 0.06286454957874271, + "grad_norm": 1.7431086301803589, + "learning_rate": 3.4892086330935254e-06, + "loss": 0.1647, + "step": 388 + }, + { + "epoch": 0.06302657161373947, + "grad_norm": 2.1561217308044434, + "learning_rate": 3.498201438848921e-06, + "loss": 0.1727, + "step": 389 + }, + { + "epoch": 0.06318859364873623, + "grad_norm": 1.6407549381256104, + "learning_rate": 3.507194244604317e-06, + "loss": 0.1625, + "step": 390 + }, + { + "epoch": 0.06335061568373299, + "grad_norm": 1.7267616987228394, + "learning_rate": 3.516187050359712e-06, + "loss": 0.1668, + "step": 391 + }, + { + "epoch": 0.06351263771872975, + "grad_norm": 1.9636503458023071, + "learning_rate": 3.525179856115108e-06, + "loss": 0.2013, + "step": 392 + }, + { + "epoch": 0.0636746597537265, + "grad_norm": 1.6342748403549194, + "learning_rate": 3.534172661870504e-06, + "loss": 0.1827, + "step": 393 + }, + { + "epoch": 0.06383668178872326, + "grad_norm": 1.8311208486557007, + "learning_rate": 3.5431654676258998e-06, + "loss": 0.1811, + "step": 394 + }, + { + "epoch": 0.06399870382372003, + "grad_norm": 1.904611587524414, + "learning_rate": 3.5521582733812955e-06, + "loss": 0.2114, + "step": 395 + }, + { + "epoch": 0.06416072585871678, + "grad_norm": 1.7322005033493042, + "learning_rate": 3.561151079136691e-06, + "loss": 0.1734, + "step": 396 + }, + { + "epoch": 0.06432274789371355, + "grad_norm": 1.7229621410369873, + "learning_rate": 3.5701438848920865e-06, + "loss": 0.1606, + "step": 397 + }, + { + "epoch": 0.0644847699287103, + "grad_norm": 1.8113038539886475, + "learning_rate": 3.5791366906474822e-06, + "loss": 0.191, + "step": 398 + }, + { + "epoch": 0.06464679196370707, + "grad_norm": 1.7715518474578857, + "learning_rate": 3.588129496402878e-06, + "loss": 0.1914, + "step": 399 + }, + { + "epoch": 0.06480881399870382, + "grad_norm": 1.8682818412780762, + "learning_rate": 3.5971223021582737e-06, + "loss": 0.1771, + "step": 400 + }, + { + "epoch": 0.06497083603370059, + "grad_norm": 1.6986286640167236, + "learning_rate": 3.606115107913669e-06, + "loss": 0.1693, + "step": 401 + }, + { + "epoch": 0.06513285806869734, + "grad_norm": 1.7627536058425903, + "learning_rate": 3.6151079136690647e-06, + "loss": 0.1719, + "step": 402 + }, + { + "epoch": 0.06529488010369411, + "grad_norm": 1.707191824913025, + "learning_rate": 3.624100719424461e-06, + "loss": 0.1713, + "step": 403 + }, + { + "epoch": 0.06545690213869086, + "grad_norm": 14.93448543548584, + "learning_rate": 3.6330935251798566e-06, + "loss": 0.1574, + "step": 404 + }, + { + "epoch": 0.06561892417368761, + "grad_norm": 2.0957789421081543, + "learning_rate": 3.6420863309352523e-06, + "loss": 0.2031, + "step": 405 + }, + { + "epoch": 0.06578094620868438, + "grad_norm": 1.8755544424057007, + "learning_rate": 3.651079136690648e-06, + "loss": 0.1766, + "step": 406 + }, + { + "epoch": 0.06594296824368114, + "grad_norm": 1.6951358318328857, + "learning_rate": 3.6600719424460433e-06, + "loss": 0.1615, + "step": 407 + }, + { + "epoch": 0.0661049902786779, + "grad_norm": 1.621053695678711, + "learning_rate": 3.669064748201439e-06, + "loss": 0.1667, + "step": 408 + }, + { + "epoch": 0.06626701231367466, + "grad_norm": 1.7325719594955444, + "learning_rate": 3.6780575539568347e-06, + "loss": 0.1811, + "step": 409 + }, + { + "epoch": 0.06642903434867142, + "grad_norm": 1.7083348035812378, + "learning_rate": 3.6870503597122305e-06, + "loss": 0.1689, + "step": 410 + }, + { + "epoch": 0.06659105638366818, + "grad_norm": 1.5969374179840088, + "learning_rate": 3.696043165467626e-06, + "loss": 0.1649, + "step": 411 + }, + { + "epoch": 0.06675307841866494, + "grad_norm": 1.688307285308838, + "learning_rate": 3.7050359712230215e-06, + "loss": 0.1681, + "step": 412 + }, + { + "epoch": 0.0669151004536617, + "grad_norm": 1.976590871810913, + "learning_rate": 3.7140287769784176e-06, + "loss": 0.2102, + "step": 413 + }, + { + "epoch": 0.06707712248865846, + "grad_norm": 1.8404573202133179, + "learning_rate": 3.7230215827338134e-06, + "loss": 0.1617, + "step": 414 + }, + { + "epoch": 0.06723914452365522, + "grad_norm": 1.8087718486785889, + "learning_rate": 3.732014388489209e-06, + "loss": 0.1652, + "step": 415 + }, + { + "epoch": 0.06740116655865197, + "grad_norm": 1.8954182863235474, + "learning_rate": 3.741007194244605e-06, + "loss": 0.1803, + "step": 416 + }, + { + "epoch": 0.06756318859364874, + "grad_norm": 1.7707210779190063, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.1833, + "step": 417 + }, + { + "epoch": 0.06772521062864549, + "grad_norm": 1.5861374139785767, + "learning_rate": 3.758992805755396e-06, + "loss": 0.1485, + "step": 418 + }, + { + "epoch": 0.06788723266364226, + "grad_norm": 1.7552915811538696, + "learning_rate": 3.7679856115107915e-06, + "loss": 0.1783, + "step": 419 + }, + { + "epoch": 0.06804925469863901, + "grad_norm": 1.6234557628631592, + "learning_rate": 3.7769784172661873e-06, + "loss": 0.1765, + "step": 420 + }, + { + "epoch": 0.06821127673363578, + "grad_norm": 1.9008007049560547, + "learning_rate": 3.785971223021583e-06, + "loss": 0.1839, + "step": 421 + }, + { + "epoch": 0.06837329876863253, + "grad_norm": 1.8210786581039429, + "learning_rate": 3.794964028776979e-06, + "loss": 0.1912, + "step": 422 + }, + { + "epoch": 0.0685353208036293, + "grad_norm": 1.6907942295074463, + "learning_rate": 3.8039568345323744e-06, + "loss": 0.1651, + "step": 423 + }, + { + "epoch": 0.06869734283862605, + "grad_norm": 1.691689372062683, + "learning_rate": 3.81294964028777e-06, + "loss": 0.176, + "step": 424 + }, + { + "epoch": 0.06885936487362282, + "grad_norm": 1.6513557434082031, + "learning_rate": 3.821942446043166e-06, + "loss": 0.173, + "step": 425 + }, + { + "epoch": 0.06902138690861957, + "grad_norm": 1.5421315431594849, + "learning_rate": 3.830935251798562e-06, + "loss": 0.1508, + "step": 426 + }, + { + "epoch": 0.06918340894361633, + "grad_norm": 1.7500141859054565, + "learning_rate": 3.839928057553957e-06, + "loss": 0.195, + "step": 427 + }, + { + "epoch": 0.06934543097861309, + "grad_norm": 1.5468772649765015, + "learning_rate": 3.848920863309353e-06, + "loss": 0.142, + "step": 428 + }, + { + "epoch": 0.06950745301360985, + "grad_norm": 1.614610195159912, + "learning_rate": 3.857913669064748e-06, + "loss": 0.1648, + "step": 429 + }, + { + "epoch": 0.06966947504860661, + "grad_norm": 1.802154541015625, + "learning_rate": 3.866906474820144e-06, + "loss": 0.185, + "step": 430 + }, + { + "epoch": 0.06983149708360337, + "grad_norm": 1.7224853038787842, + "learning_rate": 3.87589928057554e-06, + "loss": 0.1658, + "step": 431 + }, + { + "epoch": 0.06999351911860013, + "grad_norm": 1.7054455280303955, + "learning_rate": 3.884892086330936e-06, + "loss": 0.166, + "step": 432 + }, + { + "epoch": 0.07015554115359689, + "grad_norm": 1.712424397468567, + "learning_rate": 3.893884892086331e-06, + "loss": 0.1694, + "step": 433 + }, + { + "epoch": 0.07031756318859365, + "grad_norm": 1.892596960067749, + "learning_rate": 3.902877697841727e-06, + "loss": 0.1748, + "step": 434 + }, + { + "epoch": 0.07047958522359041, + "grad_norm": 1.7858827114105225, + "learning_rate": 3.911870503597123e-06, + "loss": 0.1781, + "step": 435 + }, + { + "epoch": 0.07064160725858717, + "grad_norm": 1.9324816465377808, + "learning_rate": 3.920863309352518e-06, + "loss": 0.1967, + "step": 436 + }, + { + "epoch": 0.07080362929358393, + "grad_norm": 1.74705970287323, + "learning_rate": 3.929856115107914e-06, + "loss": 0.1813, + "step": 437 + }, + { + "epoch": 0.07096565132858068, + "grad_norm": 1.7450978755950928, + "learning_rate": 3.938848920863309e-06, + "loss": 0.181, + "step": 438 + }, + { + "epoch": 0.07112767336357745, + "grad_norm": 1.758083701133728, + "learning_rate": 3.9478417266187056e-06, + "loss": 0.1903, + "step": 439 + }, + { + "epoch": 0.0712896953985742, + "grad_norm": 1.6639132499694824, + "learning_rate": 3.956834532374101e-06, + "loss": 0.1826, + "step": 440 + }, + { + "epoch": 0.07145171743357097, + "grad_norm": 1.6857917308807373, + "learning_rate": 3.965827338129496e-06, + "loss": 0.1705, + "step": 441 + }, + { + "epoch": 0.07161373946856772, + "grad_norm": 1.6692376136779785, + "learning_rate": 3.974820143884892e-06, + "loss": 0.1877, + "step": 442 + }, + { + "epoch": 0.07177576150356449, + "grad_norm": 1.5959900617599487, + "learning_rate": 3.9838129496402885e-06, + "loss": 0.1659, + "step": 443 + }, + { + "epoch": 0.07193778353856124, + "grad_norm": 1.768189549446106, + "learning_rate": 3.992805755395684e-06, + "loss": 0.1933, + "step": 444 + }, + { + "epoch": 0.07209980557355801, + "grad_norm": 1.590326189994812, + "learning_rate": 4.00179856115108e-06, + "loss": 0.1662, + "step": 445 + }, + { + "epoch": 0.07226182760855476, + "grad_norm": 1.768247365951538, + "learning_rate": 4.010791366906475e-06, + "loss": 0.1964, + "step": 446 + }, + { + "epoch": 0.07242384964355152, + "grad_norm": 1.639091968536377, + "learning_rate": 4.0197841726618705e-06, + "loss": 0.1858, + "step": 447 + }, + { + "epoch": 0.07258587167854828, + "grad_norm": 1.7374097108840942, + "learning_rate": 4.028776978417267e-06, + "loss": 0.1829, + "step": 448 + }, + { + "epoch": 0.07274789371354504, + "grad_norm": 1.716292381286621, + "learning_rate": 4.037769784172662e-06, + "loss": 0.1936, + "step": 449 + }, + { + "epoch": 0.0729099157485418, + "grad_norm": 1.5779844522476196, + "learning_rate": 4.046762589928058e-06, + "loss": 0.1685, + "step": 450 + }, + { + "epoch": 0.07307193778353856, + "grad_norm": 1.6292905807495117, + "learning_rate": 4.055755395683453e-06, + "loss": 0.1623, + "step": 451 + }, + { + "epoch": 0.07323395981853532, + "grad_norm": 1.4410508871078491, + "learning_rate": 4.0647482014388495e-06, + "loss": 0.1431, + "step": 452 + }, + { + "epoch": 0.07339598185353208, + "grad_norm": 1.6054296493530273, + "learning_rate": 4.073741007194245e-06, + "loss": 0.1682, + "step": 453 + }, + { + "epoch": 0.07355800388852884, + "grad_norm": 1.6179120540618896, + "learning_rate": 4.082733812949641e-06, + "loss": 0.1837, + "step": 454 + }, + { + "epoch": 0.0737200259235256, + "grad_norm": 1.683802604675293, + "learning_rate": 4.091726618705036e-06, + "loss": 0.1766, + "step": 455 + }, + { + "epoch": 0.07388204795852236, + "grad_norm": 1.5937223434448242, + "learning_rate": 4.100719424460432e-06, + "loss": 0.1603, + "step": 456 + }, + { + "epoch": 0.07404406999351912, + "grad_norm": 1.7861487865447998, + "learning_rate": 4.109712230215828e-06, + "loss": 0.1868, + "step": 457 + }, + { + "epoch": 0.07420609202851587, + "grad_norm": 1.786029577255249, + "learning_rate": 4.118705035971223e-06, + "loss": 0.1933, + "step": 458 + }, + { + "epoch": 0.07436811406351264, + "grad_norm": 1.6774612665176392, + "learning_rate": 4.127697841726619e-06, + "loss": 0.1834, + "step": 459 + }, + { + "epoch": 0.07453013609850939, + "grad_norm": 1.8457497358322144, + "learning_rate": 4.1366906474820145e-06, + "loss": 0.1785, + "step": 460 + }, + { + "epoch": 0.07469215813350616, + "grad_norm": 1.6226444244384766, + "learning_rate": 4.14568345323741e-06, + "loss": 0.1689, + "step": 461 + }, + { + "epoch": 0.07485418016850291, + "grad_norm": 1.7107115983963013, + "learning_rate": 4.154676258992807e-06, + "loss": 0.1993, + "step": 462 + }, + { + "epoch": 0.07501620220349968, + "grad_norm": 1.7487287521362305, + "learning_rate": 4.163669064748202e-06, + "loss": 0.2032, + "step": 463 + }, + { + "epoch": 0.07517822423849643, + "grad_norm": 1.8493058681488037, + "learning_rate": 4.172661870503597e-06, + "loss": 0.1937, + "step": 464 + }, + { + "epoch": 0.0753402462734932, + "grad_norm": 1.7110384702682495, + "learning_rate": 4.1816546762589935e-06, + "loss": 0.1755, + "step": 465 + }, + { + "epoch": 0.07550226830848995, + "grad_norm": 1.6556897163391113, + "learning_rate": 4.190647482014389e-06, + "loss": 0.1961, + "step": 466 + }, + { + "epoch": 0.07566429034348672, + "grad_norm": 1.5604274272918701, + "learning_rate": 4.199640287769784e-06, + "loss": 0.1725, + "step": 467 + }, + { + "epoch": 0.07582631237848347, + "grad_norm": 1.639374017715454, + "learning_rate": 4.20863309352518e-06, + "loss": 0.1795, + "step": 468 + }, + { + "epoch": 0.07598833441348023, + "grad_norm": 1.582411289215088, + "learning_rate": 4.2176258992805755e-06, + "loss": 0.1786, + "step": 469 + }, + { + "epoch": 0.076150356448477, + "grad_norm": 1.62992262840271, + "learning_rate": 4.226618705035972e-06, + "loss": 0.1732, + "step": 470 + }, + { + "epoch": 0.07631237848347375, + "grad_norm": 1.8388803005218506, + "learning_rate": 4.235611510791367e-06, + "loss": 0.208, + "step": 471 + }, + { + "epoch": 0.07647440051847051, + "grad_norm": 1.8594791889190674, + "learning_rate": 4.244604316546763e-06, + "loss": 0.2148, + "step": 472 + }, + { + "epoch": 0.07663642255346727, + "grad_norm": 1.814803957939148, + "learning_rate": 4.2535971223021584e-06, + "loss": 0.2031, + "step": 473 + }, + { + "epoch": 0.07679844458846403, + "grad_norm": 1.6284650564193726, + "learning_rate": 4.2625899280575546e-06, + "loss": 0.1865, + "step": 474 + }, + { + "epoch": 0.07696046662346079, + "grad_norm": 1.8454334735870361, + "learning_rate": 4.27158273381295e-06, + "loss": 0.1818, + "step": 475 + }, + { + "epoch": 0.07712248865845756, + "grad_norm": 1.8718208074569702, + "learning_rate": 4.280575539568346e-06, + "loss": 0.1912, + "step": 476 + }, + { + "epoch": 0.07728451069345431, + "grad_norm": 1.4540497064590454, + "learning_rate": 4.289568345323741e-06, + "loss": 0.1595, + "step": 477 + }, + { + "epoch": 0.07744653272845108, + "grad_norm": 1.749086618423462, + "learning_rate": 4.298561151079137e-06, + "loss": 0.2013, + "step": 478 + }, + { + "epoch": 0.07760855476344783, + "grad_norm": 1.8641357421875, + "learning_rate": 4.307553956834533e-06, + "loss": 0.202, + "step": 479 + }, + { + "epoch": 0.07777057679844458, + "grad_norm": 1.670167326927185, + "learning_rate": 4.316546762589928e-06, + "loss": 0.1815, + "step": 480 + }, + { + "epoch": 0.07793259883344135, + "grad_norm": 1.800274133682251, + "learning_rate": 4.325539568345324e-06, + "loss": 0.2006, + "step": 481 + }, + { + "epoch": 0.0780946208684381, + "grad_norm": 2.035557985305786, + "learning_rate": 4.33453237410072e-06, + "loss": 0.1903, + "step": 482 + }, + { + "epoch": 0.07825664290343487, + "grad_norm": 1.7821377515792847, + "learning_rate": 4.343525179856116e-06, + "loss": 0.2089, + "step": 483 + }, + { + "epoch": 0.07841866493843162, + "grad_norm": 1.605576515197754, + "learning_rate": 4.352517985611511e-06, + "loss": 0.1625, + "step": 484 + }, + { + "epoch": 0.07858068697342839, + "grad_norm": 1.7494524717330933, + "learning_rate": 4.361510791366907e-06, + "loss": 0.1915, + "step": 485 + }, + { + "epoch": 0.07874270900842514, + "grad_norm": 1.4881764650344849, + "learning_rate": 4.370503597122302e-06, + "loss": 0.1512, + "step": 486 + }, + { + "epoch": 0.07890473104342191, + "grad_norm": 1.9464272260665894, + "learning_rate": 4.3794964028776985e-06, + "loss": 0.1752, + "step": 487 + }, + { + "epoch": 0.07906675307841866, + "grad_norm": 1.8107142448425293, + "learning_rate": 4.388489208633094e-06, + "loss": 0.1953, + "step": 488 + }, + { + "epoch": 0.07922877511341543, + "grad_norm": 1.7034633159637451, + "learning_rate": 4.397482014388489e-06, + "loss": 0.1875, + "step": 489 + }, + { + "epoch": 0.07939079714841218, + "grad_norm": 1.7011704444885254, + "learning_rate": 4.406474820143885e-06, + "loss": 0.1942, + "step": 490 + }, + { + "epoch": 0.07955281918340894, + "grad_norm": 1.567685604095459, + "learning_rate": 4.415467625899281e-06, + "loss": 0.1869, + "step": 491 + }, + { + "epoch": 0.0797148412184057, + "grad_norm": 1.5715806484222412, + "learning_rate": 4.424460431654677e-06, + "loss": 0.1618, + "step": 492 + }, + { + "epoch": 0.07987686325340246, + "grad_norm": 1.6158721446990967, + "learning_rate": 4.433453237410073e-06, + "loss": 0.1903, + "step": 493 + }, + { + "epoch": 0.08003888528839923, + "grad_norm": 1.7132471799850464, + "learning_rate": 4.442446043165468e-06, + "loss": 0.1761, + "step": 494 + }, + { + "epoch": 0.08020090732339598, + "grad_norm": 1.561064600944519, + "learning_rate": 4.4514388489208635e-06, + "loss": 0.1644, + "step": 495 + }, + { + "epoch": 0.08036292935839275, + "grad_norm": 1.6423125267028809, + "learning_rate": 4.46043165467626e-06, + "loss": 0.1724, + "step": 496 + }, + { + "epoch": 0.0805249513933895, + "grad_norm": 1.5064512491226196, + "learning_rate": 4.469424460431655e-06, + "loss": 0.1421, + "step": 497 + }, + { + "epoch": 0.08068697342838627, + "grad_norm": 1.592776894569397, + "learning_rate": 4.478417266187051e-06, + "loss": 0.1831, + "step": 498 + }, + { + "epoch": 0.08084899546338302, + "grad_norm": 1.378753423690796, + "learning_rate": 4.487410071942446e-06, + "loss": 0.1469, + "step": 499 + }, + { + "epoch": 0.08101101749837979, + "grad_norm": 1.6191986799240112, + "learning_rate": 4.496402877697842e-06, + "loss": 0.1797, + "step": 500 + }, + { + "epoch": 0.08117303953337654, + "grad_norm": 1.6818901300430298, + "learning_rate": 4.505395683453238e-06, + "loss": 0.1666, + "step": 501 + }, + { + "epoch": 0.08133506156837329, + "grad_norm": 1.6711454391479492, + "learning_rate": 4.514388489208634e-06, + "loss": 0.1883, + "step": 502 + }, + { + "epoch": 0.08149708360337006, + "grad_norm": 1.5778876543045044, + "learning_rate": 4.523381294964029e-06, + "loss": 0.1556, + "step": 503 + }, + { + "epoch": 0.08165910563836681, + "grad_norm": 1.751618504524231, + "learning_rate": 4.5323741007194245e-06, + "loss": 0.1773, + "step": 504 + }, + { + "epoch": 0.08182112767336358, + "grad_norm": 1.6356712579727173, + "learning_rate": 4.541366906474821e-06, + "loss": 0.1674, + "step": 505 + }, + { + "epoch": 0.08198314970836033, + "grad_norm": 1.717413306236267, + "learning_rate": 4.550359712230216e-06, + "loss": 0.2015, + "step": 506 + }, + { + "epoch": 0.0821451717433571, + "grad_norm": 1.6795759201049805, + "learning_rate": 4.559352517985612e-06, + "loss": 0.1858, + "step": 507 + }, + { + "epoch": 0.08230719377835385, + "grad_norm": 1.4658695459365845, + "learning_rate": 4.5683453237410074e-06, + "loss": 0.1719, + "step": 508 + }, + { + "epoch": 0.08246921581335062, + "grad_norm": 1.865307331085205, + "learning_rate": 4.577338129496403e-06, + "loss": 0.1978, + "step": 509 + }, + { + "epoch": 0.08263123784834737, + "grad_norm": 1.8304200172424316, + "learning_rate": 4.586330935251799e-06, + "loss": 0.1906, + "step": 510 + }, + { + "epoch": 0.08279325988334413, + "grad_norm": 1.795590877532959, + "learning_rate": 4.595323741007194e-06, + "loss": 0.1827, + "step": 511 + }, + { + "epoch": 0.0829552819183409, + "grad_norm": 1.6314994096755981, + "learning_rate": 4.60431654676259e-06, + "loss": 0.1725, + "step": 512 + }, + { + "epoch": 0.08311730395333765, + "grad_norm": 1.6502032279968262, + "learning_rate": 4.6133093525179865e-06, + "loss": 0.1895, + "step": 513 + }, + { + "epoch": 0.08327932598833442, + "grad_norm": 1.5785597562789917, + "learning_rate": 4.622302158273382e-06, + "loss": 0.1602, + "step": 514 + }, + { + "epoch": 0.08344134802333117, + "grad_norm": 1.8144105672836304, + "learning_rate": 4.631294964028777e-06, + "loss": 0.174, + "step": 515 + }, + { + "epoch": 0.08360337005832794, + "grad_norm": 1.689868688583374, + "learning_rate": 4.640287769784173e-06, + "loss": 0.176, + "step": 516 + }, + { + "epoch": 0.08376539209332469, + "grad_norm": 1.7384047508239746, + "learning_rate": 4.6492805755395685e-06, + "loss": 0.1739, + "step": 517 + }, + { + "epoch": 0.08392741412832146, + "grad_norm": 1.7412197589874268, + "learning_rate": 4.658273381294965e-06, + "loss": 0.1957, + "step": 518 + }, + { + "epoch": 0.08408943616331821, + "grad_norm": 1.6650651693344116, + "learning_rate": 4.66726618705036e-06, + "loss": 0.184, + "step": 519 + }, + { + "epoch": 0.08425145819831498, + "grad_norm": 1.7459502220153809, + "learning_rate": 4.676258992805755e-06, + "loss": 0.1772, + "step": 520 + }, + { + "epoch": 0.08441348023331173, + "grad_norm": 1.6877442598342896, + "learning_rate": 4.685251798561151e-06, + "loss": 0.1872, + "step": 521 + }, + { + "epoch": 0.08457550226830848, + "grad_norm": 1.6881842613220215, + "learning_rate": 4.6942446043165475e-06, + "loss": 0.1841, + "step": 522 + }, + { + "epoch": 0.08473752430330525, + "grad_norm": 1.4388700723648071, + "learning_rate": 4.703237410071943e-06, + "loss": 0.1652, + "step": 523 + }, + { + "epoch": 0.084899546338302, + "grad_norm": 1.5876359939575195, + "learning_rate": 4.712230215827339e-06, + "loss": 0.2057, + "step": 524 + }, + { + "epoch": 0.08506156837329877, + "grad_norm": 1.647616982460022, + "learning_rate": 4.721223021582734e-06, + "loss": 0.186, + "step": 525 + }, + { + "epoch": 0.08522359040829552, + "grad_norm": 1.5112206935882568, + "learning_rate": 4.73021582733813e-06, + "loss": 0.1697, + "step": 526 + }, + { + "epoch": 0.08538561244329229, + "grad_norm": 1.553218126296997, + "learning_rate": 4.739208633093526e-06, + "loss": 0.1871, + "step": 527 + }, + { + "epoch": 0.08554763447828904, + "grad_norm": 1.5007338523864746, + "learning_rate": 4.748201438848921e-06, + "loss": 0.1555, + "step": 528 + }, + { + "epoch": 0.08570965651328581, + "grad_norm": 1.6641744375228882, + "learning_rate": 4.757194244604317e-06, + "loss": 0.1909, + "step": 529 + }, + { + "epoch": 0.08587167854828257, + "grad_norm": 1.8256597518920898, + "learning_rate": 4.7661870503597125e-06, + "loss": 0.1872, + "step": 530 + }, + { + "epoch": 0.08603370058327933, + "grad_norm": 1.818845272064209, + "learning_rate": 4.775179856115108e-06, + "loss": 0.1928, + "step": 531 + }, + { + "epoch": 0.08619572261827609, + "grad_norm": 1.664866328239441, + "learning_rate": 4.784172661870504e-06, + "loss": 0.167, + "step": 532 + }, + { + "epoch": 0.08635774465327284, + "grad_norm": 1.511879563331604, + "learning_rate": 4.7931654676259e-06, + "loss": 0.1692, + "step": 533 + }, + { + "epoch": 0.0865197666882696, + "grad_norm": 1.680817723274231, + "learning_rate": 4.802158273381295e-06, + "loss": 0.1897, + "step": 534 + }, + { + "epoch": 0.08668178872326636, + "grad_norm": 1.5544551610946655, + "learning_rate": 4.8111510791366915e-06, + "loss": 0.1597, + "step": 535 + }, + { + "epoch": 0.08684381075826313, + "grad_norm": 1.759607195854187, + "learning_rate": 4.820143884892087e-06, + "loss": 0.1925, + "step": 536 + }, + { + "epoch": 0.08700583279325988, + "grad_norm": 1.6217458248138428, + "learning_rate": 4.829136690647482e-06, + "loss": 0.1897, + "step": 537 + }, + { + "epoch": 0.08716785482825665, + "grad_norm": 1.574110507965088, + "learning_rate": 4.838129496402878e-06, + "loss": 0.1898, + "step": 538 + }, + { + "epoch": 0.0873298768632534, + "grad_norm": 1.4586173295974731, + "learning_rate": 4.8471223021582736e-06, + "loss": 0.1767, + "step": 539 + }, + { + "epoch": 0.08749189889825017, + "grad_norm": 1.533091425895691, + "learning_rate": 4.856115107913669e-06, + "loss": 0.1819, + "step": 540 + }, + { + "epoch": 0.08765392093324692, + "grad_norm": 1.4116888046264648, + "learning_rate": 4.865107913669065e-06, + "loss": 0.1611, + "step": 541 + }, + { + "epoch": 0.08781594296824369, + "grad_norm": 1.6544139385223389, + "learning_rate": 4.874100719424461e-06, + "loss": 0.1787, + "step": 542 + }, + { + "epoch": 0.08797796500324044, + "grad_norm": 1.6531710624694824, + "learning_rate": 4.8830935251798564e-06, + "loss": 0.1961, + "step": 543 + }, + { + "epoch": 0.0881399870382372, + "grad_norm": 1.4757072925567627, + "learning_rate": 4.892086330935253e-06, + "loss": 0.1713, + "step": 544 + }, + { + "epoch": 0.08830200907323396, + "grad_norm": 1.6193426847457886, + "learning_rate": 4.901079136690648e-06, + "loss": 0.1845, + "step": 545 + }, + { + "epoch": 0.08846403110823071, + "grad_norm": 1.7032020092010498, + "learning_rate": 4.910071942446043e-06, + "loss": 0.189, + "step": 546 + }, + { + "epoch": 0.08862605314322748, + "grad_norm": 1.4911900758743286, + "learning_rate": 4.919064748201439e-06, + "loss": 0.1592, + "step": 547 + }, + { + "epoch": 0.08878807517822424, + "grad_norm": 1.6090925931930542, + "learning_rate": 4.928057553956835e-06, + "loss": 0.1983, + "step": 548 + }, + { + "epoch": 0.088950097213221, + "grad_norm": 1.876451015472412, + "learning_rate": 4.937050359712231e-06, + "loss": 0.2078, + "step": 549 + }, + { + "epoch": 0.08911211924821776, + "grad_norm": 1.5491797924041748, + "learning_rate": 4.946043165467626e-06, + "loss": 0.178, + "step": 550 + }, + { + "epoch": 0.08927414128321452, + "grad_norm": 1.6761187314987183, + "learning_rate": 4.955035971223021e-06, + "loss": 0.2046, + "step": 551 + }, + { + "epoch": 0.08943616331821128, + "grad_norm": 1.6212235689163208, + "learning_rate": 4.9640287769784175e-06, + "loss": 0.2055, + "step": 552 + }, + { + "epoch": 0.08959818535320804, + "grad_norm": 1.6363742351531982, + "learning_rate": 4.973021582733814e-06, + "loss": 0.1966, + "step": 553 + }, + { + "epoch": 0.0897602073882048, + "grad_norm": 1.580815076828003, + "learning_rate": 4.982014388489209e-06, + "loss": 0.2054, + "step": 554 + }, + { + "epoch": 0.08992222942320155, + "grad_norm": 1.9822163581848145, + "learning_rate": 4.991007194244605e-06, + "loss": 0.155, + "step": 555 + }, + { + "epoch": 0.09008425145819832, + "grad_norm": 1.7965415716171265, + "learning_rate": 5e-06, + "loss": 0.1808, + "step": 556 + }, + { + "epoch": 0.09024627349319507, + "grad_norm": 1.4814484119415283, + "learning_rate": 4.999999961753026e-06, + "loss": 0.1716, + "step": 557 + }, + { + "epoch": 0.09040829552819184, + "grad_norm": 1.5439770221710205, + "learning_rate": 4.999999847012101e-06, + "loss": 0.1672, + "step": 558 + }, + { + "epoch": 0.09057031756318859, + "grad_norm": 1.5221511125564575, + "learning_rate": 4.999999655777232e-06, + "loss": 0.1829, + "step": 559 + }, + { + "epoch": 0.09073233959818536, + "grad_norm": 1.8477576971054077, + "learning_rate": 4.9999993880484235e-06, + "loss": 0.1827, + "step": 560 + }, + { + "epoch": 0.09089436163318211, + "grad_norm": 1.5962644815444946, + "learning_rate": 4.999999043825682e-06, + "loss": 0.1675, + "step": 561 + }, + { + "epoch": 0.09105638366817888, + "grad_norm": 1.5648473501205444, + "learning_rate": 4.999998623109022e-06, + "loss": 0.1645, + "step": 562 + }, + { + "epoch": 0.09121840570317563, + "grad_norm": 1.4602282047271729, + "learning_rate": 4.999998125898452e-06, + "loss": 0.1595, + "step": 563 + }, + { + "epoch": 0.09138042773817238, + "grad_norm": 2.079158306121826, + "learning_rate": 4.99999755219399e-06, + "loss": 0.2142, + "step": 564 + }, + { + "epoch": 0.09154244977316915, + "grad_norm": 1.9085109233856201, + "learning_rate": 4.9999969019956526e-06, + "loss": 0.1923, + "step": 565 + }, + { + "epoch": 0.0917044718081659, + "grad_norm": 1.8776715993881226, + "learning_rate": 4.9999961753034595e-06, + "loss": 0.1935, + "step": 566 + }, + { + "epoch": 0.09186649384316267, + "grad_norm": 1.6320695877075195, + "learning_rate": 4.9999953721174345e-06, + "loss": 0.1881, + "step": 567 + }, + { + "epoch": 0.09202851587815943, + "grad_norm": 1.6475578546524048, + "learning_rate": 4.9999944924376e-06, + "loss": 0.191, + "step": 568 + }, + { + "epoch": 0.09219053791315619, + "grad_norm": 1.4152305126190186, + "learning_rate": 4.9999935362639844e-06, + "loss": 0.1728, + "step": 569 + }, + { + "epoch": 0.09235255994815295, + "grad_norm": 1.6144766807556152, + "learning_rate": 4.999992503596616e-06, + "loss": 0.1727, + "step": 570 + }, + { + "epoch": 0.09251458198314971, + "grad_norm": 1.455117106437683, + "learning_rate": 4.999991394435527e-06, + "loss": 0.1658, + "step": 571 + }, + { + "epoch": 0.09267660401814647, + "grad_norm": 1.536605715751648, + "learning_rate": 4.999990208780751e-06, + "loss": 0.1875, + "step": 572 + }, + { + "epoch": 0.09283862605314323, + "grad_norm": 1.4941188097000122, + "learning_rate": 4.999988946632326e-06, + "loss": 0.1732, + "step": 573 + }, + { + "epoch": 0.09300064808813999, + "grad_norm": 1.8162550926208496, + "learning_rate": 4.999987607990287e-06, + "loss": 0.2139, + "step": 574 + }, + { + "epoch": 0.09316267012313674, + "grad_norm": 1.8714874982833862, + "learning_rate": 4.9999861928546786e-06, + "loss": 0.2113, + "step": 575 + }, + { + "epoch": 0.09332469215813351, + "grad_norm": 1.6756885051727295, + "learning_rate": 4.999984701225542e-06, + "loss": 0.1855, + "step": 576 + }, + { + "epoch": 0.09348671419313026, + "grad_norm": 1.7077648639678955, + "learning_rate": 4.999983133102923e-06, + "loss": 0.184, + "step": 577 + }, + { + "epoch": 0.09364873622812703, + "grad_norm": 1.6993731260299683, + "learning_rate": 4.9999814884868705e-06, + "loss": 0.2088, + "step": 578 + }, + { + "epoch": 0.09381075826312378, + "grad_norm": 1.6294496059417725, + "learning_rate": 4.999979767377434e-06, + "loss": 0.1909, + "step": 579 + }, + { + "epoch": 0.09397278029812055, + "grad_norm": 1.4724843502044678, + "learning_rate": 4.999977969774666e-06, + "loss": 0.1786, + "step": 580 + }, + { + "epoch": 0.0941348023331173, + "grad_norm": 1.5849987268447876, + "learning_rate": 4.999976095678622e-06, + "loss": 0.1942, + "step": 581 + }, + { + "epoch": 0.09429682436811407, + "grad_norm": 1.393203854560852, + "learning_rate": 4.99997414508936e-06, + "loss": 0.157, + "step": 582 + }, + { + "epoch": 0.09445884640311082, + "grad_norm": 1.5272077322006226, + "learning_rate": 4.999972118006939e-06, + "loss": 0.1809, + "step": 583 + }, + { + "epoch": 0.09462086843810759, + "grad_norm": 1.4402399063110352, + "learning_rate": 4.999970014431421e-06, + "loss": 0.187, + "step": 584 + }, + { + "epoch": 0.09478289047310434, + "grad_norm": 1.433086633682251, + "learning_rate": 4.99996783436287e-06, + "loss": 0.179, + "step": 585 + }, + { + "epoch": 0.0949449125081011, + "grad_norm": 1.463783860206604, + "learning_rate": 4.999965577801354e-06, + "loss": 0.1703, + "step": 586 + }, + { + "epoch": 0.09510693454309786, + "grad_norm": 1.4071027040481567, + "learning_rate": 4.9999632447469395e-06, + "loss": 0.1708, + "step": 587 + }, + { + "epoch": 0.09526895657809462, + "grad_norm": 1.5153921842575073, + "learning_rate": 4.999960835199701e-06, + "loss": 0.1786, + "step": 588 + }, + { + "epoch": 0.09543097861309138, + "grad_norm": 1.577370047569275, + "learning_rate": 4.999958349159709e-06, + "loss": 0.178, + "step": 589 + }, + { + "epoch": 0.09559300064808814, + "grad_norm": 1.562628984451294, + "learning_rate": 4.999955786627042e-06, + "loss": 0.1793, + "step": 590 + }, + { + "epoch": 0.0957550226830849, + "grad_norm": 1.4154744148254395, + "learning_rate": 4.999953147601779e-06, + "loss": 0.1573, + "step": 591 + }, + { + "epoch": 0.09591704471808166, + "grad_norm": 1.473583459854126, + "learning_rate": 4.999950432083998e-06, + "loss": 0.1735, + "step": 592 + }, + { + "epoch": 0.09607906675307842, + "grad_norm": 1.5967344045639038, + "learning_rate": 4.999947640073784e-06, + "loss": 0.1798, + "step": 593 + }, + { + "epoch": 0.09624108878807518, + "grad_norm": 1.5491890907287598, + "learning_rate": 4.999944771571222e-06, + "loss": 0.1899, + "step": 594 + }, + { + "epoch": 0.09640311082307194, + "grad_norm": 1.5898832082748413, + "learning_rate": 4.9999418265764e-06, + "loss": 0.1825, + "step": 595 + }, + { + "epoch": 0.0965651328580687, + "grad_norm": 1.4132360219955444, + "learning_rate": 4.999938805089407e-06, + "loss": 0.1663, + "step": 596 + }, + { + "epoch": 0.09672715489306545, + "grad_norm": 1.5513783693313599, + "learning_rate": 4.999935707110337e-06, + "loss": 0.2075, + "step": 597 + }, + { + "epoch": 0.09688917692806222, + "grad_norm": 1.444082498550415, + "learning_rate": 4.999932532639285e-06, + "loss": 0.1501, + "step": 598 + }, + { + "epoch": 0.09705119896305897, + "grad_norm": 1.5398008823394775, + "learning_rate": 4.999929281676346e-06, + "loss": 0.1837, + "step": 599 + }, + { + "epoch": 0.09721322099805574, + "grad_norm": 1.4911479949951172, + "learning_rate": 4.99992595422162e-06, + "loss": 0.1966, + "step": 600 + }, + { + "epoch": 0.09737524303305249, + "grad_norm": 1.8352231979370117, + "learning_rate": 4.99992255027521e-06, + "loss": 0.201, + "step": 601 + }, + { + "epoch": 0.09753726506804926, + "grad_norm": 1.491248369216919, + "learning_rate": 4.9999190698372216e-06, + "loss": 0.187, + "step": 602 + }, + { + "epoch": 0.09769928710304601, + "grad_norm": 1.4229164123535156, + "learning_rate": 4.999915512907757e-06, + "loss": 0.1733, + "step": 603 + }, + { + "epoch": 0.09786130913804278, + "grad_norm": 1.624070167541504, + "learning_rate": 4.9999118794869285e-06, + "loss": 0.1749, + "step": 604 + }, + { + "epoch": 0.09802333117303953, + "grad_norm": 1.3550504446029663, + "learning_rate": 4.999908169574846e-06, + "loss": 0.1683, + "step": 605 + }, + { + "epoch": 0.0981853532080363, + "grad_norm": 1.5067567825317383, + "learning_rate": 4.999904383171623e-06, + "loss": 0.1891, + "step": 606 + }, + { + "epoch": 0.09834737524303305, + "grad_norm": 1.5590304136276245, + "learning_rate": 4.999900520277376e-06, + "loss": 0.1831, + "step": 607 + }, + { + "epoch": 0.0985093972780298, + "grad_norm": 1.4621655941009521, + "learning_rate": 4.999896580892221e-06, + "loss": 0.1648, + "step": 608 + }, + { + "epoch": 0.09867141931302657, + "grad_norm": 1.5438154935836792, + "learning_rate": 4.999892565016282e-06, + "loss": 0.1728, + "step": 609 + }, + { + "epoch": 0.09883344134802333, + "grad_norm": 1.5466684103012085, + "learning_rate": 4.99988847264968e-06, + "loss": 0.1915, + "step": 610 + }, + { + "epoch": 0.0989954633830201, + "grad_norm": 1.430253267288208, + "learning_rate": 4.99988430379254e-06, + "loss": 0.1577, + "step": 611 + }, + { + "epoch": 0.09915748541801685, + "grad_norm": 1.4443409442901611, + "learning_rate": 4.99988005844499e-06, + "loss": 0.1668, + "step": 612 + }, + { + "epoch": 0.09931950745301361, + "grad_norm": 1.5595300197601318, + "learning_rate": 4.999875736607159e-06, + "loss": 0.1686, + "step": 613 + }, + { + "epoch": 0.09948152948801037, + "grad_norm": 1.4867337942123413, + "learning_rate": 4.999871338279181e-06, + "loss": 0.1906, + "step": 614 + }, + { + "epoch": 0.09964355152300713, + "grad_norm": 1.4922622442245483, + "learning_rate": 4.99986686346119e-06, + "loss": 0.1727, + "step": 615 + }, + { + "epoch": 0.09980557355800389, + "grad_norm": 1.4832661151885986, + "learning_rate": 4.999862312153322e-06, + "loss": 0.1703, + "step": 616 + }, + { + "epoch": 0.09996759559300065, + "grad_norm": 1.4716413021087646, + "learning_rate": 4.999857684355716e-06, + "loss": 0.1704, + "step": 617 + }, + { + "epoch": 0.10012961762799741, + "grad_norm": 1.4800324440002441, + "learning_rate": 4.999852980068516e-06, + "loss": 0.1604, + "step": 618 + }, + { + "epoch": 0.10029163966299416, + "grad_norm": 1.4337764978408813, + "learning_rate": 4.999848199291863e-06, + "loss": 0.1785, + "step": 619 + }, + { + "epoch": 0.10045366169799093, + "grad_norm": 1.373763918876648, + "learning_rate": 4.9998433420259055e-06, + "loss": 0.1532, + "step": 620 + }, + { + "epoch": 0.10061568373298768, + "grad_norm": 1.4618251323699951, + "learning_rate": 4.999838408270791e-06, + "loss": 0.1849, + "step": 621 + }, + { + "epoch": 0.10077770576798445, + "grad_norm": 1.356284499168396, + "learning_rate": 4.99983339802667e-06, + "loss": 0.1555, + "step": 622 + }, + { + "epoch": 0.1009397278029812, + "grad_norm": 1.548274040222168, + "learning_rate": 4.999828311293697e-06, + "loss": 0.1913, + "step": 623 + }, + { + "epoch": 0.10110174983797797, + "grad_norm": 1.549202799797058, + "learning_rate": 4.999823148072027e-06, + "loss": 0.1909, + "step": 624 + }, + { + "epoch": 0.10126377187297472, + "grad_norm": 1.5712025165557861, + "learning_rate": 4.999817908361818e-06, + "loss": 0.1814, + "step": 625 + }, + { + "epoch": 0.10142579390797149, + "grad_norm": 1.3882803916931152, + "learning_rate": 4.999812592163232e-06, + "loss": 0.1725, + "step": 626 + }, + { + "epoch": 0.10158781594296824, + "grad_norm": 1.4438263177871704, + "learning_rate": 4.999807199476428e-06, + "loss": 0.1663, + "step": 627 + }, + { + "epoch": 0.101749837977965, + "grad_norm": 1.430675745010376, + "learning_rate": 4.9998017303015735e-06, + "loss": 0.1685, + "step": 628 + }, + { + "epoch": 0.10191186001296176, + "grad_norm": 1.514258623123169, + "learning_rate": 4.999796184638836e-06, + "loss": 0.2044, + "step": 629 + }, + { + "epoch": 0.10207388204795852, + "grad_norm": 1.3589094877243042, + "learning_rate": 4.999790562488385e-06, + "loss": 0.1636, + "step": 630 + }, + { + "epoch": 0.10223590408295528, + "grad_norm": 1.56080961227417, + "learning_rate": 4.999784863850391e-06, + "loss": 0.1925, + "step": 631 + }, + { + "epoch": 0.10239792611795204, + "grad_norm": 1.3032417297363281, + "learning_rate": 4.999779088725031e-06, + "loss": 0.1394, + "step": 632 + }, + { + "epoch": 0.1025599481529488, + "grad_norm": 1.3875499963760376, + "learning_rate": 4.999773237112479e-06, + "loss": 0.1593, + "step": 633 + }, + { + "epoch": 0.10272197018794556, + "grad_norm": 1.6035758256912231, + "learning_rate": 4.999767309012916e-06, + "loss": 0.1822, + "step": 634 + }, + { + "epoch": 0.10288399222294232, + "grad_norm": 1.519217610359192, + "learning_rate": 4.999761304426523e-06, + "loss": 0.187, + "step": 635 + }, + { + "epoch": 0.10304601425793908, + "grad_norm": 1.6218081712722778, + "learning_rate": 4.999755223353483e-06, + "loss": 0.1963, + "step": 636 + }, + { + "epoch": 0.10320803629293585, + "grad_norm": 1.4830374717712402, + "learning_rate": 4.999749065793982e-06, + "loss": 0.1656, + "step": 637 + }, + { + "epoch": 0.1033700583279326, + "grad_norm": 1.5597121715545654, + "learning_rate": 4.9997428317482086e-06, + "loss": 0.1817, + "step": 638 + }, + { + "epoch": 0.10353208036292935, + "grad_norm": 1.3938210010528564, + "learning_rate": 4.999736521216355e-06, + "loss": 0.1594, + "step": 639 + }, + { + "epoch": 0.10369410239792612, + "grad_norm": 1.3650377988815308, + "learning_rate": 4.999730134198612e-06, + "loss": 0.1661, + "step": 640 + }, + { + "epoch": 0.10385612443292287, + "grad_norm": 1.493725061416626, + "learning_rate": 4.999723670695177e-06, + "loss": 0.173, + "step": 641 + }, + { + "epoch": 0.10401814646791964, + "grad_norm": 1.5311143398284912, + "learning_rate": 4.999717130706247e-06, + "loss": 0.1933, + "step": 642 + }, + { + "epoch": 0.10418016850291639, + "grad_norm": 1.5196541547775269, + "learning_rate": 4.9997105142320205e-06, + "loss": 0.1797, + "step": 643 + }, + { + "epoch": 0.10434219053791316, + "grad_norm": 1.541266679763794, + "learning_rate": 4.999703821272702e-06, + "loss": 0.1813, + "step": 644 + }, + { + "epoch": 0.10450421257290991, + "grad_norm": 1.65276038646698, + "learning_rate": 4.999697051828497e-06, + "loss": 0.2055, + "step": 645 + }, + { + "epoch": 0.10466623460790668, + "grad_norm": 1.6365846395492554, + "learning_rate": 4.99969020589961e-06, + "loss": 0.2004, + "step": 646 + }, + { + "epoch": 0.10482825664290343, + "grad_norm": 1.675693154335022, + "learning_rate": 4.999683283486252e-06, + "loss": 0.1797, + "step": 647 + }, + { + "epoch": 0.1049902786779002, + "grad_norm": 1.5289831161499023, + "learning_rate": 4.999676284588635e-06, + "loss": 0.2035, + "step": 648 + }, + { + "epoch": 0.10515230071289695, + "grad_norm": 1.4459340572357178, + "learning_rate": 4.9996692092069735e-06, + "loss": 0.1682, + "step": 649 + }, + { + "epoch": 0.10531432274789371, + "grad_norm": 1.7202799320220947, + "learning_rate": 4.999662057341482e-06, + "loss": 0.2091, + "step": 650 + }, + { + "epoch": 0.10547634478289047, + "grad_norm": 1.4932163953781128, + "learning_rate": 4.999654828992382e-06, + "loss": 0.1856, + "step": 651 + }, + { + "epoch": 0.10563836681788723, + "grad_norm": 1.3993346691131592, + "learning_rate": 4.999647524159892e-06, + "loss": 0.1592, + "step": 652 + }, + { + "epoch": 0.105800388852884, + "grad_norm": 1.7462083101272583, + "learning_rate": 4.999640142844237e-06, + "loss": 0.1681, + "step": 653 + }, + { + "epoch": 0.10596241088788075, + "grad_norm": 1.5770518779754639, + "learning_rate": 4.9996326850456435e-06, + "loss": 0.1914, + "step": 654 + }, + { + "epoch": 0.10612443292287752, + "grad_norm": 1.3480045795440674, + "learning_rate": 4.9996251507643375e-06, + "loss": 0.1521, + "step": 655 + }, + { + "epoch": 0.10628645495787427, + "grad_norm": 1.5072104930877686, + "learning_rate": 4.999617540000552e-06, + "loss": 0.1811, + "step": 656 + }, + { + "epoch": 0.10644847699287104, + "grad_norm": 1.4140270948410034, + "learning_rate": 4.9996098527545184e-06, + "loss": 0.1599, + "step": 657 + }, + { + "epoch": 0.10661049902786779, + "grad_norm": 1.3946372270584106, + "learning_rate": 4.999602089026472e-06, + "loss": 0.1585, + "step": 658 + }, + { + "epoch": 0.10677252106286456, + "grad_norm": 1.7381147146224976, + "learning_rate": 4.9995942488166506e-06, + "loss": 0.2138, + "step": 659 + }, + { + "epoch": 0.10693454309786131, + "grad_norm": 1.5091066360473633, + "learning_rate": 4.999586332125294e-06, + "loss": 0.1769, + "step": 660 + }, + { + "epoch": 0.10709656513285806, + "grad_norm": 1.4993563890457153, + "learning_rate": 4.999578338952646e-06, + "loss": 0.1819, + "step": 661 + }, + { + "epoch": 0.10725858716785483, + "grad_norm": 1.6799975633621216, + "learning_rate": 4.9995702692989476e-06, + "loss": 0.2105, + "step": 662 + }, + { + "epoch": 0.10742060920285158, + "grad_norm": 1.5190070867538452, + "learning_rate": 4.999562123164448e-06, + "loss": 0.1842, + "step": 663 + }, + { + "epoch": 0.10758263123784835, + "grad_norm": 1.4410516023635864, + "learning_rate": 4.999553900549398e-06, + "loss": 0.1664, + "step": 664 + }, + { + "epoch": 0.1077446532728451, + "grad_norm": 1.4741100072860718, + "learning_rate": 4.999545601454046e-06, + "loss": 0.1726, + "step": 665 + }, + { + "epoch": 0.10790667530784187, + "grad_norm": 1.4260085821151733, + "learning_rate": 4.999537225878648e-06, + "loss": 0.1803, + "step": 666 + }, + { + "epoch": 0.10806869734283862, + "grad_norm": 1.4949939250946045, + "learning_rate": 4.999528773823459e-06, + "loss": 0.1779, + "step": 667 + }, + { + "epoch": 0.10823071937783539, + "grad_norm": 1.3150126934051514, + "learning_rate": 4.999520245288739e-06, + "loss": 0.1637, + "step": 668 + }, + { + "epoch": 0.10839274141283214, + "grad_norm": 1.3088539838790894, + "learning_rate": 4.999511640274748e-06, + "loss": 0.157, + "step": 669 + }, + { + "epoch": 0.10855476344782891, + "grad_norm": 1.4893115758895874, + "learning_rate": 4.999502958781749e-06, + "loss": 0.1793, + "step": 670 + }, + { + "epoch": 0.10871678548282566, + "grad_norm": 1.5916615724563599, + "learning_rate": 4.999494200810009e-06, + "loss": 0.1923, + "step": 671 + }, + { + "epoch": 0.10887880751782242, + "grad_norm": 1.4635965824127197, + "learning_rate": 4.999485366359794e-06, + "loss": 0.1867, + "step": 672 + }, + { + "epoch": 0.10904082955281919, + "grad_norm": 1.5369232892990112, + "learning_rate": 4.999476455431377e-06, + "loss": 0.175, + "step": 673 + }, + { + "epoch": 0.10920285158781594, + "grad_norm": 1.5748496055603027, + "learning_rate": 4.999467468025028e-06, + "loss": 0.2062, + "step": 674 + }, + { + "epoch": 0.1093648736228127, + "grad_norm": 1.5589791536331177, + "learning_rate": 4.999458404141023e-06, + "loss": 0.1822, + "step": 675 + }, + { + "epoch": 0.10952689565780946, + "grad_norm": 1.4582257270812988, + "learning_rate": 4.99944926377964e-06, + "loss": 0.1925, + "step": 676 + }, + { + "epoch": 0.10968891769280623, + "grad_norm": 1.5575858354568481, + "learning_rate": 4.9994400469411575e-06, + "loss": 0.1785, + "step": 677 + }, + { + "epoch": 0.10985093972780298, + "grad_norm": 1.3714617490768433, + "learning_rate": 4.999430753625858e-06, + "loss": 0.1631, + "step": 678 + }, + { + "epoch": 0.11001296176279975, + "grad_norm": 1.4028851985931396, + "learning_rate": 4.999421383834027e-06, + "loss": 0.1758, + "step": 679 + }, + { + "epoch": 0.1101749837977965, + "grad_norm": 1.395218014717102, + "learning_rate": 4.999411937565949e-06, + "loss": 0.1711, + "step": 680 + }, + { + "epoch": 0.11033700583279327, + "grad_norm": 1.4894384145736694, + "learning_rate": 4.999402414821915e-06, + "loss": 0.197, + "step": 681 + }, + { + "epoch": 0.11049902786779002, + "grad_norm": 1.489985466003418, + "learning_rate": 4.999392815602214e-06, + "loss": 0.1664, + "step": 682 + }, + { + "epoch": 0.11066104990278677, + "grad_norm": 1.4633452892303467, + "learning_rate": 4.9993831399071425e-06, + "loss": 0.1793, + "step": 683 + }, + { + "epoch": 0.11082307193778354, + "grad_norm": 1.4508930444717407, + "learning_rate": 4.999373387736996e-06, + "loss": 0.1914, + "step": 684 + }, + { + "epoch": 0.1109850939727803, + "grad_norm": 1.3373337984085083, + "learning_rate": 4.999363559092071e-06, + "loss": 0.1516, + "step": 685 + }, + { + "epoch": 0.11114711600777706, + "grad_norm": 1.4252420663833618, + "learning_rate": 4.999353653972669e-06, + "loss": 0.1849, + "step": 686 + }, + { + "epoch": 0.11130913804277381, + "grad_norm": 1.383408546447754, + "learning_rate": 4.999343672379095e-06, + "loss": 0.162, + "step": 687 + }, + { + "epoch": 0.11147116007777058, + "grad_norm": 1.477158546447754, + "learning_rate": 4.999333614311652e-06, + "loss": 0.1832, + "step": 688 + }, + { + "epoch": 0.11163318211276733, + "grad_norm": 1.374411940574646, + "learning_rate": 4.999323479770649e-06, + "loss": 0.1662, + "step": 689 + }, + { + "epoch": 0.1117952041477641, + "grad_norm": 1.3238674402236938, + "learning_rate": 4.999313268756396e-06, + "loss": 0.156, + "step": 690 + }, + { + "epoch": 0.11195722618276086, + "grad_norm": 1.4178396463394165, + "learning_rate": 4.999302981269204e-06, + "loss": 0.165, + "step": 691 + }, + { + "epoch": 0.11211924821775761, + "grad_norm": 1.5902644395828247, + "learning_rate": 4.99929261730939e-06, + "loss": 0.1938, + "step": 692 + }, + { + "epoch": 0.11228127025275438, + "grad_norm": 1.420102596282959, + "learning_rate": 4.999282176877271e-06, + "loss": 0.1818, + "step": 693 + }, + { + "epoch": 0.11244329228775113, + "grad_norm": 1.4864150285720825, + "learning_rate": 4.999271659973164e-06, + "loss": 0.1992, + "step": 694 + }, + { + "epoch": 0.1126053143227479, + "grad_norm": 1.5267049074172974, + "learning_rate": 4.999261066597393e-06, + "loss": 0.1995, + "step": 695 + }, + { + "epoch": 0.11276733635774465, + "grad_norm": 1.4048455953598022, + "learning_rate": 4.999250396750281e-06, + "loss": 0.1719, + "step": 696 + }, + { + "epoch": 0.11292935839274142, + "grad_norm": 1.4921525716781616, + "learning_rate": 4.999239650432155e-06, + "loss": 0.191, + "step": 697 + }, + { + "epoch": 0.11309138042773817, + "grad_norm": 1.41100013256073, + "learning_rate": 4.999228827643344e-06, + "loss": 0.1837, + "step": 698 + }, + { + "epoch": 0.11325340246273494, + "grad_norm": 1.3732647895812988, + "learning_rate": 4.999217928384179e-06, + "loss": 0.1936, + "step": 699 + }, + { + "epoch": 0.11341542449773169, + "grad_norm": 1.556138277053833, + "learning_rate": 4.999206952654993e-06, + "loss": 0.1966, + "step": 700 + }, + { + "epoch": 0.11357744653272846, + "grad_norm": 1.5490831136703491, + "learning_rate": 4.9991959004561225e-06, + "loss": 0.1914, + "step": 701 + }, + { + "epoch": 0.11373946856772521, + "grad_norm": 1.3613611459732056, + "learning_rate": 4.999184771787905e-06, + "loss": 0.1692, + "step": 702 + }, + { + "epoch": 0.11390149060272196, + "grad_norm": 1.4281145334243774, + "learning_rate": 4.999173566650682e-06, + "loss": 0.1671, + "step": 703 + }, + { + "epoch": 0.11406351263771873, + "grad_norm": 1.629227638244629, + "learning_rate": 4.999162285044795e-06, + "loss": 0.1739, + "step": 704 + }, + { + "epoch": 0.11422553467271548, + "grad_norm": 1.5268644094467163, + "learning_rate": 4.999150926970591e-06, + "loss": 0.1809, + "step": 705 + }, + { + "epoch": 0.11438755670771225, + "grad_norm": 1.445254921913147, + "learning_rate": 4.9991394924284155e-06, + "loss": 0.1684, + "step": 706 + }, + { + "epoch": 0.114549578742709, + "grad_norm": 1.533254861831665, + "learning_rate": 4.99912798141862e-06, + "loss": 0.1923, + "step": 707 + }, + { + "epoch": 0.11471160077770577, + "grad_norm": 1.3670967817306519, + "learning_rate": 4.999116393941556e-06, + "loss": 0.1548, + "step": 708 + }, + { + "epoch": 0.11487362281270252, + "grad_norm": 1.3976548910140991, + "learning_rate": 4.999104729997577e-06, + "loss": 0.1821, + "step": 709 + }, + { + "epoch": 0.11503564484769929, + "grad_norm": 1.4151368141174316, + "learning_rate": 4.999092989587042e-06, + "loss": 0.188, + "step": 710 + }, + { + "epoch": 0.11519766688269605, + "grad_norm": 1.2826131582260132, + "learning_rate": 4.999081172710309e-06, + "loss": 0.152, + "step": 711 + }, + { + "epoch": 0.11535968891769281, + "grad_norm": 1.463115930557251, + "learning_rate": 4.9990692793677395e-06, + "loss": 0.1712, + "step": 712 + }, + { + "epoch": 0.11552171095268957, + "grad_norm": 1.4324864149093628, + "learning_rate": 4.999057309559698e-06, + "loss": 0.1749, + "step": 713 + }, + { + "epoch": 0.11568373298768632, + "grad_norm": 1.2447808980941772, + "learning_rate": 4.999045263286551e-06, + "loss": 0.1539, + "step": 714 + }, + { + "epoch": 0.11584575502268309, + "grad_norm": 1.3643749952316284, + "learning_rate": 4.999033140548666e-06, + "loss": 0.165, + "step": 715 + }, + { + "epoch": 0.11600777705767984, + "grad_norm": 1.491412878036499, + "learning_rate": 4.999020941346414e-06, + "loss": 0.1752, + "step": 716 + }, + { + "epoch": 0.1161697990926766, + "grad_norm": 1.363407850265503, + "learning_rate": 4.999008665680169e-06, + "loss": 0.1652, + "step": 717 + }, + { + "epoch": 0.11633182112767336, + "grad_norm": 1.3879029750823975, + "learning_rate": 4.998996313550306e-06, + "loss": 0.1635, + "step": 718 + }, + { + "epoch": 0.11649384316267013, + "grad_norm": 1.5233573913574219, + "learning_rate": 4.9989838849572035e-06, + "loss": 0.175, + "step": 719 + }, + { + "epoch": 0.11665586519766688, + "grad_norm": 1.3929238319396973, + "learning_rate": 4.998971379901242e-06, + "loss": 0.1714, + "step": 720 + }, + { + "epoch": 0.11681788723266365, + "grad_norm": 1.4243627786636353, + "learning_rate": 4.9989587983828036e-06, + "loss": 0.1931, + "step": 721 + }, + { + "epoch": 0.1169799092676604, + "grad_norm": 1.6758102178573608, + "learning_rate": 4.998946140402273e-06, + "loss": 0.1698, + "step": 722 + }, + { + "epoch": 0.11714193130265717, + "grad_norm": 1.4985467195510864, + "learning_rate": 4.998933405960038e-06, + "loss": 0.194, + "step": 723 + }, + { + "epoch": 0.11730395333765392, + "grad_norm": 1.500422716140747, + "learning_rate": 4.998920595056488e-06, + "loss": 0.1828, + "step": 724 + }, + { + "epoch": 0.11746597537265067, + "grad_norm": 1.4340198040008545, + "learning_rate": 4.998907707692015e-06, + "loss": 0.1903, + "step": 725 + }, + { + "epoch": 0.11762799740764744, + "grad_norm": 1.3363415002822876, + "learning_rate": 4.998894743867013e-06, + "loss": 0.1711, + "step": 726 + }, + { + "epoch": 0.1177900194426442, + "grad_norm": 1.4468626976013184, + "learning_rate": 4.998881703581879e-06, + "loss": 0.17, + "step": 727 + }, + { + "epoch": 0.11795204147764096, + "grad_norm": 1.3693926334381104, + "learning_rate": 4.998868586837013e-06, + "loss": 0.183, + "step": 728 + }, + { + "epoch": 0.11811406351263772, + "grad_norm": 1.2546919584274292, + "learning_rate": 4.998855393632815e-06, + "loss": 0.1606, + "step": 729 + }, + { + "epoch": 0.11827608554763448, + "grad_norm": 1.3889665603637695, + "learning_rate": 4.998842123969689e-06, + "loss": 0.1593, + "step": 730 + }, + { + "epoch": 0.11843810758263124, + "grad_norm": 1.3932158946990967, + "learning_rate": 4.998828777848041e-06, + "loss": 0.1779, + "step": 731 + }, + { + "epoch": 0.118600129617628, + "grad_norm": 1.3757567405700684, + "learning_rate": 4.998815355268279e-06, + "loss": 0.1707, + "step": 732 + }, + { + "epoch": 0.11876215165262476, + "grad_norm": 1.5059839487075806, + "learning_rate": 4.998801856230815e-06, + "loss": 0.1737, + "step": 733 + }, + { + "epoch": 0.11892417368762152, + "grad_norm": 1.3769277334213257, + "learning_rate": 4.998788280736061e-06, + "loss": 0.1745, + "step": 734 + }, + { + "epoch": 0.11908619572261828, + "grad_norm": 1.4347563982009888, + "learning_rate": 4.998774628784432e-06, + "loss": 0.1948, + "step": 735 + }, + { + "epoch": 0.11924821775761503, + "grad_norm": 1.40513277053833, + "learning_rate": 4.998760900376347e-06, + "loss": 0.1795, + "step": 736 + }, + { + "epoch": 0.1194102397926118, + "grad_norm": 1.2432270050048828, + "learning_rate": 4.998747095512225e-06, + "loss": 0.1416, + "step": 737 + }, + { + "epoch": 0.11957226182760855, + "grad_norm": 1.4356383085250854, + "learning_rate": 4.99873321419249e-06, + "loss": 0.1925, + "step": 738 + }, + { + "epoch": 0.11973428386260532, + "grad_norm": 1.1989301443099976, + "learning_rate": 4.998719256417563e-06, + "loss": 0.1551, + "step": 739 + }, + { + "epoch": 0.11989630589760207, + "grad_norm": 1.339026689529419, + "learning_rate": 4.998705222187875e-06, + "loss": 0.1952, + "step": 740 + }, + { + "epoch": 0.12005832793259884, + "grad_norm": 1.408982276916504, + "learning_rate": 4.998691111503854e-06, + "loss": 0.1869, + "step": 741 + }, + { + "epoch": 0.12022034996759559, + "grad_norm": 1.28006112575531, + "learning_rate": 4.998676924365931e-06, + "loss": 0.1713, + "step": 742 + }, + { + "epoch": 0.12038237200259236, + "grad_norm": 1.423261046409607, + "learning_rate": 4.998662660774541e-06, + "loss": 0.1713, + "step": 743 + }, + { + "epoch": 0.12054439403758911, + "grad_norm": 1.3350955247879028, + "learning_rate": 4.998648320730121e-06, + "loss": 0.156, + "step": 744 + }, + { + "epoch": 0.12070641607258586, + "grad_norm": 1.3964903354644775, + "learning_rate": 4.998633904233108e-06, + "loss": 0.1827, + "step": 745 + }, + { + "epoch": 0.12086843810758263, + "grad_norm": 1.4213593006134033, + "learning_rate": 4.998619411283945e-06, + "loss": 0.1742, + "step": 746 + }, + { + "epoch": 0.12103046014257939, + "grad_norm": 1.2968708276748657, + "learning_rate": 4.998604841883073e-06, + "loss": 0.1623, + "step": 747 + }, + { + "epoch": 0.12119248217757615, + "grad_norm": 1.5439331531524658, + "learning_rate": 4.998590196030942e-06, + "loss": 0.2048, + "step": 748 + }, + { + "epoch": 0.1213545042125729, + "grad_norm": 1.4168784618377686, + "learning_rate": 4.998575473727995e-06, + "loss": 0.1814, + "step": 749 + }, + { + "epoch": 0.12151652624756967, + "grad_norm": 1.568098545074463, + "learning_rate": 4.998560674974686e-06, + "loss": 0.184, + "step": 750 + }, + { + "epoch": 0.12167854828256643, + "grad_norm": 1.4985617399215698, + "learning_rate": 4.998545799771466e-06, + "loss": 0.1938, + "step": 751 + }, + { + "epoch": 0.1218405703175632, + "grad_norm": 1.5311156511306763, + "learning_rate": 4.998530848118792e-06, + "loss": 0.1802, + "step": 752 + }, + { + "epoch": 0.12200259235255995, + "grad_norm": 1.499812126159668, + "learning_rate": 4.99851582001712e-06, + "loss": 0.1726, + "step": 753 + }, + { + "epoch": 0.12216461438755671, + "grad_norm": 1.4239556789398193, + "learning_rate": 4.99850071546691e-06, + "loss": 0.2014, + "step": 754 + }, + { + "epoch": 0.12232663642255347, + "grad_norm": 1.3375072479248047, + "learning_rate": 4.998485534468624e-06, + "loss": 0.176, + "step": 755 + }, + { + "epoch": 0.12248865845755022, + "grad_norm": 1.356996774673462, + "learning_rate": 4.998470277022728e-06, + "loss": 0.1734, + "step": 756 + }, + { + "epoch": 0.12265068049254699, + "grad_norm": 1.4485342502593994, + "learning_rate": 4.998454943129687e-06, + "loss": 0.1824, + "step": 757 + }, + { + "epoch": 0.12281270252754374, + "grad_norm": 1.4348751306533813, + "learning_rate": 4.99843953278997e-06, + "loss": 0.1857, + "step": 758 + }, + { + "epoch": 0.12297472456254051, + "grad_norm": 1.4200366735458374, + "learning_rate": 4.998424046004051e-06, + "loss": 0.1772, + "step": 759 + }, + { + "epoch": 0.12313674659753726, + "grad_norm": 1.4113013744354248, + "learning_rate": 4.998408482772401e-06, + "loss": 0.1823, + "step": 760 + }, + { + "epoch": 0.12329876863253403, + "grad_norm": 1.5233389139175415, + "learning_rate": 4.9983928430954986e-06, + "loss": 0.1756, + "step": 761 + }, + { + "epoch": 0.12346079066753078, + "grad_norm": 1.6090972423553467, + "learning_rate": 4.99837712697382e-06, + "loss": 0.1848, + "step": 762 + }, + { + "epoch": 0.12362281270252755, + "grad_norm": 1.318171501159668, + "learning_rate": 4.998361334407849e-06, + "loss": 0.1513, + "step": 763 + }, + { + "epoch": 0.1237848347375243, + "grad_norm": 1.4973655939102173, + "learning_rate": 4.998345465398066e-06, + "loss": 0.1755, + "step": 764 + }, + { + "epoch": 0.12394685677252107, + "grad_norm": 1.4733107089996338, + "learning_rate": 4.998329519944957e-06, + "loss": 0.1767, + "step": 765 + }, + { + "epoch": 0.12410887880751782, + "grad_norm": 1.373856544494629, + "learning_rate": 4.998313498049011e-06, + "loss": 0.1895, + "step": 766 + }, + { + "epoch": 0.12427090084251458, + "grad_norm": 1.2825859785079956, + "learning_rate": 4.998297399710718e-06, + "loss": 0.1642, + "step": 767 + }, + { + "epoch": 0.12443292287751134, + "grad_norm": 1.416802167892456, + "learning_rate": 4.9982812249305704e-06, + "loss": 0.1892, + "step": 768 + }, + { + "epoch": 0.1245949449125081, + "grad_norm": 1.3358246088027954, + "learning_rate": 4.998264973709063e-06, + "loss": 0.1655, + "step": 769 + }, + { + "epoch": 0.12475696694750486, + "grad_norm": 1.3493046760559082, + "learning_rate": 4.998248646046693e-06, + "loss": 0.1602, + "step": 770 + }, + { + "epoch": 0.12491898898250162, + "grad_norm": 1.3179484605789185, + "learning_rate": 4.99823224194396e-06, + "loss": 0.1788, + "step": 771 + }, + { + "epoch": 0.12508101101749838, + "grad_norm": 1.3762329816818237, + "learning_rate": 4.998215761401366e-06, + "loss": 0.1727, + "step": 772 + }, + { + "epoch": 0.12524303305249515, + "grad_norm": 1.5525070428848267, + "learning_rate": 4.998199204419415e-06, + "loss": 0.1876, + "step": 773 + }, + { + "epoch": 0.1254050550874919, + "grad_norm": 1.4245859384536743, + "learning_rate": 4.9981825709986145e-06, + "loss": 0.1737, + "step": 774 + }, + { + "epoch": 0.12556707712248866, + "grad_norm": 1.5142040252685547, + "learning_rate": 4.998165861139472e-06, + "loss": 0.1775, + "step": 775 + }, + { + "epoch": 0.12572909915748542, + "grad_norm": 1.4016090631484985, + "learning_rate": 4.9981490748425e-06, + "loss": 0.1676, + "step": 776 + }, + { + "epoch": 0.12589112119248216, + "grad_norm": 1.5504176616668701, + "learning_rate": 4.998132212108212e-06, + "loss": 0.1861, + "step": 777 + }, + { + "epoch": 0.12605314322747893, + "grad_norm": 1.3723235130310059, + "learning_rate": 4.998115272937123e-06, + "loss": 0.1657, + "step": 778 + }, + { + "epoch": 0.1262151652624757, + "grad_norm": 1.4091416597366333, + "learning_rate": 4.998098257329753e-06, + "loss": 0.1698, + "step": 779 + }, + { + "epoch": 0.12637718729747247, + "grad_norm": 1.3922086954116821, + "learning_rate": 4.998081165286621e-06, + "loss": 0.1828, + "step": 780 + }, + { + "epoch": 0.1265392093324692, + "grad_norm": 1.3854937553405762, + "learning_rate": 4.998063996808251e-06, + "loss": 0.1805, + "step": 781 + }, + { + "epoch": 0.12670123136746597, + "grad_norm": 1.359626293182373, + "learning_rate": 4.9980467518951666e-06, + "loss": 0.1816, + "step": 782 + }, + { + "epoch": 0.12686325340246274, + "grad_norm": 1.2333866357803345, + "learning_rate": 4.998029430547898e-06, + "loss": 0.1541, + "step": 783 + }, + { + "epoch": 0.1270252754374595, + "grad_norm": 1.4757193326950073, + "learning_rate": 4.998012032766974e-06, + "loss": 0.1925, + "step": 784 + }, + { + "epoch": 0.12718729747245625, + "grad_norm": 1.434784173965454, + "learning_rate": 4.997994558552926e-06, + "loss": 0.1815, + "step": 785 + }, + { + "epoch": 0.127349319507453, + "grad_norm": 1.4703211784362793, + "learning_rate": 4.997977007906291e-06, + "loss": 0.1768, + "step": 786 + }, + { + "epoch": 0.12751134154244978, + "grad_norm": 1.414862871170044, + "learning_rate": 4.997959380827603e-06, + "loss": 0.1674, + "step": 787 + }, + { + "epoch": 0.12767336357744652, + "grad_norm": 1.548573613166809, + "learning_rate": 4.997941677317403e-06, + "loss": 0.197, + "step": 788 + }, + { + "epoch": 0.1278353856124433, + "grad_norm": 1.4064699411392212, + "learning_rate": 4.997923897376233e-06, + "loss": 0.1857, + "step": 789 + }, + { + "epoch": 0.12799740764744005, + "grad_norm": 1.2528257369995117, + "learning_rate": 4.997906041004637e-06, + "loss": 0.1574, + "step": 790 + }, + { + "epoch": 0.12815942968243682, + "grad_norm": 1.3743071556091309, + "learning_rate": 4.9978881082031605e-06, + "loss": 0.1833, + "step": 791 + }, + { + "epoch": 0.12832145171743356, + "grad_norm": 1.437005639076233, + "learning_rate": 4.997870098972353e-06, + "loss": 0.1712, + "step": 792 + }, + { + "epoch": 0.12848347375243033, + "grad_norm": 1.3489969968795776, + "learning_rate": 4.997852013312765e-06, + "loss": 0.1772, + "step": 793 + }, + { + "epoch": 0.1286454957874271, + "grad_norm": 1.6645337343215942, + "learning_rate": 4.99783385122495e-06, + "loss": 0.1913, + "step": 794 + }, + { + "epoch": 0.12880751782242386, + "grad_norm": 1.3913012742996216, + "learning_rate": 4.997815612709463e-06, + "loss": 0.1605, + "step": 795 + }, + { + "epoch": 0.1289695398574206, + "grad_norm": 1.3065690994262695, + "learning_rate": 4.997797297766864e-06, + "loss": 0.1604, + "step": 796 + }, + { + "epoch": 0.12913156189241737, + "grad_norm": 1.439260721206665, + "learning_rate": 4.997778906397713e-06, + "loss": 0.1689, + "step": 797 + }, + { + "epoch": 0.12929358392741414, + "grad_norm": 1.3989150524139404, + "learning_rate": 4.9977604386025704e-06, + "loss": 0.1724, + "step": 798 + }, + { + "epoch": 0.12945560596241087, + "grad_norm": 1.5063525438308716, + "learning_rate": 4.9977418943820036e-06, + "loss": 0.1865, + "step": 799 + }, + { + "epoch": 0.12961762799740764, + "grad_norm": 1.3701797723770142, + "learning_rate": 4.997723273736579e-06, + "loss": 0.1755, + "step": 800 + }, + { + "epoch": 0.1297796500324044, + "grad_norm": 2.1302490234375, + "learning_rate": 4.997704576666867e-06, + "loss": 0.1756, + "step": 801 + }, + { + "epoch": 0.12994167206740118, + "grad_norm": 1.360653042793274, + "learning_rate": 4.9976858031734375e-06, + "loss": 0.1637, + "step": 802 + }, + { + "epoch": 0.13010369410239792, + "grad_norm": 1.3926098346710205, + "learning_rate": 4.997666953256869e-06, + "loss": 0.1664, + "step": 803 + }, + { + "epoch": 0.13026571613739468, + "grad_norm": 1.2852853536605835, + "learning_rate": 4.9976480269177345e-06, + "loss": 0.1714, + "step": 804 + }, + { + "epoch": 0.13042773817239145, + "grad_norm": 1.3100239038467407, + "learning_rate": 4.997629024156615e-06, + "loss": 0.167, + "step": 805 + }, + { + "epoch": 0.13058976020738822, + "grad_norm": 1.3834192752838135, + "learning_rate": 4.997609944974092e-06, + "loss": 0.1626, + "step": 806 + }, + { + "epoch": 0.13075178224238496, + "grad_norm": 1.1783868074417114, + "learning_rate": 4.997590789370749e-06, + "loss": 0.1597, + "step": 807 + }, + { + "epoch": 0.13091380427738172, + "grad_norm": 1.45823073387146, + "learning_rate": 4.99757155734717e-06, + "loss": 0.1965, + "step": 808 + }, + { + "epoch": 0.1310758263123785, + "grad_norm": 1.4000074863433838, + "learning_rate": 4.9975522489039474e-06, + "loss": 0.1707, + "step": 809 + }, + { + "epoch": 0.13123784834737523, + "grad_norm": 1.4006067514419556, + "learning_rate": 4.997532864041669e-06, + "loss": 0.1683, + "step": 810 + }, + { + "epoch": 0.131399870382372, + "grad_norm": 1.3444918394088745, + "learning_rate": 4.99751340276093e-06, + "loss": 0.1902, + "step": 811 + }, + { + "epoch": 0.13156189241736876, + "grad_norm": 1.2597421407699585, + "learning_rate": 4.997493865062323e-06, + "loss": 0.1666, + "step": 812 + }, + { + "epoch": 0.13172391445236553, + "grad_norm": 1.7309461832046509, + "learning_rate": 4.997474250946448e-06, + "loss": 0.1891, + "step": 813 + }, + { + "epoch": 0.13188593648736227, + "grad_norm": 1.5406413078308105, + "learning_rate": 4.9974545604139055e-06, + "loss": 0.1793, + "step": 814 + }, + { + "epoch": 0.13204795852235904, + "grad_norm": 1.402835726737976, + "learning_rate": 4.9974347934652965e-06, + "loss": 0.1846, + "step": 815 + }, + { + "epoch": 0.1322099805573558, + "grad_norm": 1.3662683963775635, + "learning_rate": 4.997414950101227e-06, + "loss": 0.1669, + "step": 816 + }, + { + "epoch": 0.13237200259235257, + "grad_norm": 1.3234673738479614, + "learning_rate": 4.9973950303223026e-06, + "loss": 0.1713, + "step": 817 + }, + { + "epoch": 0.1325340246273493, + "grad_norm": 1.6133899688720703, + "learning_rate": 4.997375034129135e-06, + "loss": 0.1928, + "step": 818 + }, + { + "epoch": 0.13269604666234608, + "grad_norm": 1.3357648849487305, + "learning_rate": 4.997354961522335e-06, + "loss": 0.1872, + "step": 819 + }, + { + "epoch": 0.13285806869734285, + "grad_norm": 1.4020048379898071, + "learning_rate": 4.997334812502516e-06, + "loss": 0.1831, + "step": 820 + }, + { + "epoch": 0.13302009073233959, + "grad_norm": 1.350357174873352, + "learning_rate": 4.997314587070295e-06, + "loss": 0.166, + "step": 821 + }, + { + "epoch": 0.13318211276733635, + "grad_norm": 1.5430233478546143, + "learning_rate": 4.9972942852262915e-06, + "loss": 0.1769, + "step": 822 + }, + { + "epoch": 0.13334413480233312, + "grad_norm": 1.4370311498641968, + "learning_rate": 4.997273906971126e-06, + "loss": 0.1876, + "step": 823 + }, + { + "epoch": 0.1335061568373299, + "grad_norm": 1.2412606477737427, + "learning_rate": 4.997253452305423e-06, + "loss": 0.1494, + "step": 824 + }, + { + "epoch": 0.13366817887232663, + "grad_norm": 1.3403682708740234, + "learning_rate": 4.9972329212298065e-06, + "loss": 0.18, + "step": 825 + }, + { + "epoch": 0.1338302009073234, + "grad_norm": 1.2844048738479614, + "learning_rate": 4.9972123137449065e-06, + "loss": 0.1618, + "step": 826 + }, + { + "epoch": 0.13399222294232016, + "grad_norm": 1.5786724090576172, + "learning_rate": 4.997191629851352e-06, + "loss": 0.1946, + "step": 827 + }, + { + "epoch": 0.13415424497731693, + "grad_norm": 1.384547472000122, + "learning_rate": 4.997170869549778e-06, + "loss": 0.1725, + "step": 828 + }, + { + "epoch": 0.13431626701231367, + "grad_norm": 1.4201945066452026, + "learning_rate": 4.997150032840818e-06, + "loss": 0.1766, + "step": 829 + }, + { + "epoch": 0.13447828904731043, + "grad_norm": 1.369523048400879, + "learning_rate": 4.99712911972511e-06, + "loss": 0.1735, + "step": 830 + }, + { + "epoch": 0.1346403110823072, + "grad_norm": 1.2436349391937256, + "learning_rate": 4.997108130203293e-06, + "loss": 0.1599, + "step": 831 + }, + { + "epoch": 0.13480233311730394, + "grad_norm": 1.3385329246520996, + "learning_rate": 4.99708706427601e-06, + "loss": 0.1676, + "step": 832 + }, + { + "epoch": 0.1349643551523007, + "grad_norm": 1.412265419960022, + "learning_rate": 4.997065921943907e-06, + "loss": 0.1745, + "step": 833 + }, + { + "epoch": 0.13512637718729748, + "grad_norm": 1.3985753059387207, + "learning_rate": 4.997044703207629e-06, + "loss": 0.154, + "step": 834 + }, + { + "epoch": 0.13528839922229424, + "grad_norm": 1.3470317125320435, + "learning_rate": 4.9970234080678244e-06, + "loss": 0.1806, + "step": 835 + }, + { + "epoch": 0.13545042125729098, + "grad_norm": 1.3285126686096191, + "learning_rate": 4.9970020365251485e-06, + "loss": 0.179, + "step": 836 + }, + { + "epoch": 0.13561244329228775, + "grad_norm": 1.4892990589141846, + "learning_rate": 4.9969805885802515e-06, + "loss": 0.1902, + "step": 837 + }, + { + "epoch": 0.13577446532728452, + "grad_norm": 1.373530626296997, + "learning_rate": 4.996959064233792e-06, + "loss": 0.1539, + "step": 838 + }, + { + "epoch": 0.13593648736228128, + "grad_norm": 1.3297441005706787, + "learning_rate": 4.996937463486427e-06, + "loss": 0.1704, + "step": 839 + }, + { + "epoch": 0.13609850939727802, + "grad_norm": 1.343121886253357, + "learning_rate": 4.996915786338818e-06, + "loss": 0.1592, + "step": 840 + }, + { + "epoch": 0.1362605314322748, + "grad_norm": 1.4662681818008423, + "learning_rate": 4.9968940327916284e-06, + "loss": 0.1863, + "step": 841 + }, + { + "epoch": 0.13642255346727156, + "grad_norm": 1.4304791688919067, + "learning_rate": 4.9968722028455245e-06, + "loss": 0.1775, + "step": 842 + }, + { + "epoch": 0.1365845755022683, + "grad_norm": 1.3115490674972534, + "learning_rate": 4.996850296501172e-06, + "loss": 0.163, + "step": 843 + }, + { + "epoch": 0.13674659753726506, + "grad_norm": 1.5014792680740356, + "learning_rate": 4.996828313759245e-06, + "loss": 0.1856, + "step": 844 + }, + { + "epoch": 0.13690861957226183, + "grad_norm": 1.2926596403121948, + "learning_rate": 4.996806254620411e-06, + "loss": 0.1664, + "step": 845 + }, + { + "epoch": 0.1370706416072586, + "grad_norm": 1.501706600189209, + "learning_rate": 4.99678411908535e-06, + "loss": 0.1922, + "step": 846 + }, + { + "epoch": 0.13723266364225534, + "grad_norm": 1.2282168865203857, + "learning_rate": 4.996761907154736e-06, + "loss": 0.1679, + "step": 847 + }, + { + "epoch": 0.1373946856772521, + "grad_norm": 1.315477967262268, + "learning_rate": 4.996739618829251e-06, + "loss": 0.1789, + "step": 848 + }, + { + "epoch": 0.13755670771224887, + "grad_norm": 1.2631007432937622, + "learning_rate": 4.996717254109574e-06, + "loss": 0.1654, + "step": 849 + }, + { + "epoch": 0.13771872974724564, + "grad_norm": 1.4962035417556763, + "learning_rate": 4.996694812996391e-06, + "loss": 0.2064, + "step": 850 + }, + { + "epoch": 0.13788075178224238, + "grad_norm": 1.2344715595245361, + "learning_rate": 4.99667229549039e-06, + "loss": 0.163, + "step": 851 + }, + { + "epoch": 0.13804277381723914, + "grad_norm": 1.3445398807525635, + "learning_rate": 4.996649701592258e-06, + "loss": 0.1601, + "step": 852 + }, + { + "epoch": 0.1382047958522359, + "grad_norm": 1.3226261138916016, + "learning_rate": 4.996627031302686e-06, + "loss": 0.1853, + "step": 853 + }, + { + "epoch": 0.13836681788723265, + "grad_norm": 1.4674776792526245, + "learning_rate": 4.99660428462237e-06, + "loss": 0.2106, + "step": 854 + }, + { + "epoch": 0.13852883992222942, + "grad_norm": 1.2431912422180176, + "learning_rate": 4.996581461552003e-06, + "loss": 0.1622, + "step": 855 + }, + { + "epoch": 0.13869086195722619, + "grad_norm": 1.3208400011062622, + "learning_rate": 4.996558562092286e-06, + "loss": 0.1614, + "step": 856 + }, + { + "epoch": 0.13885288399222295, + "grad_norm": 1.3402066230773926, + "learning_rate": 4.996535586243918e-06, + "loss": 0.1581, + "step": 857 + }, + { + "epoch": 0.1390149060272197, + "grad_norm": 1.3583393096923828, + "learning_rate": 4.996512534007602e-06, + "loss": 0.1659, + "step": 858 + }, + { + "epoch": 0.13917692806221646, + "grad_norm": 1.4143537282943726, + "learning_rate": 4.9964894053840455e-06, + "loss": 0.1879, + "step": 859 + }, + { + "epoch": 0.13933895009721323, + "grad_norm": 1.3818471431732178, + "learning_rate": 4.996466200373954e-06, + "loss": 0.182, + "step": 860 + }, + { + "epoch": 0.13950097213221, + "grad_norm": 1.3754279613494873, + "learning_rate": 4.996442918978038e-06, + "loss": 0.1816, + "step": 861 + }, + { + "epoch": 0.13966299416720673, + "grad_norm": 1.2652978897094727, + "learning_rate": 4.99641956119701e-06, + "loss": 0.1495, + "step": 862 + }, + { + "epoch": 0.1398250162022035, + "grad_norm": 1.4501652717590332, + "learning_rate": 4.996396127031584e-06, + "loss": 0.1854, + "step": 863 + }, + { + "epoch": 0.13998703823720027, + "grad_norm": 1.2763866186141968, + "learning_rate": 4.996372616482478e-06, + "loss": 0.1654, + "step": 864 + }, + { + "epoch": 0.140149060272197, + "grad_norm": 1.335204839706421, + "learning_rate": 4.996349029550411e-06, + "loss": 0.1533, + "step": 865 + }, + { + "epoch": 0.14031108230719377, + "grad_norm": 1.3689930438995361, + "learning_rate": 4.996325366236105e-06, + "loss": 0.1826, + "step": 866 + }, + { + "epoch": 0.14047310434219054, + "grad_norm": 1.2703019380569458, + "learning_rate": 4.996301626540284e-06, + "loss": 0.1686, + "step": 867 + }, + { + "epoch": 0.1406351263771873, + "grad_norm": 1.4817618131637573, + "learning_rate": 4.996277810463675e-06, + "loss": 0.2053, + "step": 868 + }, + { + "epoch": 0.14079714841218405, + "grad_norm": 1.4522895812988281, + "learning_rate": 4.996253918007004e-06, + "loss": 0.1929, + "step": 869 + }, + { + "epoch": 0.14095917044718081, + "grad_norm": 1.3986607789993286, + "learning_rate": 4.996229949171004e-06, + "loss": 0.1761, + "step": 870 + }, + { + "epoch": 0.14112119248217758, + "grad_norm": 1.342969298362732, + "learning_rate": 4.996205903956409e-06, + "loss": 0.177, + "step": 871 + }, + { + "epoch": 0.14128321451717435, + "grad_norm": 1.3436371088027954, + "learning_rate": 4.996181782363955e-06, + "loss": 0.1776, + "step": 872 + }, + { + "epoch": 0.1414452365521711, + "grad_norm": 1.4952949285507202, + "learning_rate": 4.996157584394378e-06, + "loss": 0.1983, + "step": 873 + }, + { + "epoch": 0.14160725858716786, + "grad_norm": 1.2056357860565186, + "learning_rate": 4.99613331004842e-06, + "loss": 0.1559, + "step": 874 + }, + { + "epoch": 0.14176928062216462, + "grad_norm": 1.2493481636047363, + "learning_rate": 4.996108959326823e-06, + "loss": 0.1677, + "step": 875 + }, + { + "epoch": 0.14193130265716136, + "grad_norm": 1.348042607307434, + "learning_rate": 4.996084532230332e-06, + "loss": 0.1691, + "step": 876 + }, + { + "epoch": 0.14209332469215813, + "grad_norm": 1.3877140283584595, + "learning_rate": 4.996060028759695e-06, + "loss": 0.1829, + "step": 877 + }, + { + "epoch": 0.1422553467271549, + "grad_norm": 1.3358656167984009, + "learning_rate": 4.996035448915661e-06, + "loss": 0.1798, + "step": 878 + }, + { + "epoch": 0.14241736876215166, + "grad_norm": 1.5458526611328125, + "learning_rate": 4.996010792698983e-06, + "loss": 0.2146, + "step": 879 + }, + { + "epoch": 0.1425793907971484, + "grad_norm": 1.238255500793457, + "learning_rate": 4.995986060110415e-06, + "loss": 0.1554, + "step": 880 + }, + { + "epoch": 0.14274141283214517, + "grad_norm": 1.332350730895996, + "learning_rate": 4.995961251150714e-06, + "loss": 0.1563, + "step": 881 + }, + { + "epoch": 0.14290343486714194, + "grad_norm": 1.4522160291671753, + "learning_rate": 4.995936365820638e-06, + "loss": 0.194, + "step": 882 + }, + { + "epoch": 0.1430654569021387, + "grad_norm": 1.2477561235427856, + "learning_rate": 4.99591140412095e-06, + "loss": 0.1582, + "step": 883 + }, + { + "epoch": 0.14322747893713544, + "grad_norm": 1.5550086498260498, + "learning_rate": 4.9958863660524125e-06, + "loss": 0.1858, + "step": 884 + }, + { + "epoch": 0.1433895009721322, + "grad_norm": 1.1642705202102661, + "learning_rate": 4.995861251615792e-06, + "loss": 0.1353, + "step": 885 + }, + { + "epoch": 0.14355152300712898, + "grad_norm": 1.1860448122024536, + "learning_rate": 4.995836060811859e-06, + "loss": 0.1516, + "step": 886 + }, + { + "epoch": 0.14371354504212572, + "grad_norm": 1.2857884168624878, + "learning_rate": 4.99581079364138e-06, + "loss": 0.165, + "step": 887 + }, + { + "epoch": 0.14387556707712248, + "grad_norm": 1.3009867668151855, + "learning_rate": 4.995785450105131e-06, + "loss": 0.1873, + "step": 888 + }, + { + "epoch": 0.14403758911211925, + "grad_norm": 1.2586079835891724, + "learning_rate": 4.995760030203888e-06, + "loss": 0.1622, + "step": 889 + }, + { + "epoch": 0.14419961114711602, + "grad_norm": 1.3150060176849365, + "learning_rate": 4.995734533938427e-06, + "loss": 0.1756, + "step": 890 + }, + { + "epoch": 0.14436163318211276, + "grad_norm": 1.2379282712936401, + "learning_rate": 4.995708961309528e-06, + "loss": 0.1527, + "step": 891 + }, + { + "epoch": 0.14452365521710953, + "grad_norm": 1.3145521879196167, + "learning_rate": 4.995683312317975e-06, + "loss": 0.1756, + "step": 892 + }, + { + "epoch": 0.1446856772521063, + "grad_norm": 1.420900583267212, + "learning_rate": 4.9956575869645515e-06, + "loss": 0.1738, + "step": 893 + }, + { + "epoch": 0.14484769928710303, + "grad_norm": 1.2677338123321533, + "learning_rate": 4.995631785250046e-06, + "loss": 0.1759, + "step": 894 + }, + { + "epoch": 0.1450097213220998, + "grad_norm": 1.1973481178283691, + "learning_rate": 4.995605907175247e-06, + "loss": 0.1592, + "step": 895 + }, + { + "epoch": 0.14517174335709657, + "grad_norm": 1.342222809791565, + "learning_rate": 4.9955799527409465e-06, + "loss": 0.1733, + "step": 896 + }, + { + "epoch": 0.14533376539209333, + "grad_norm": 1.3735847473144531, + "learning_rate": 4.995553921947938e-06, + "loss": 0.1729, + "step": 897 + }, + { + "epoch": 0.14549578742709007, + "grad_norm": 1.2551465034484863, + "learning_rate": 4.99552781479702e-06, + "loss": 0.1819, + "step": 898 + }, + { + "epoch": 0.14565780946208684, + "grad_norm": 1.5449222326278687, + "learning_rate": 4.995501631288989e-06, + "loss": 0.1913, + "step": 899 + }, + { + "epoch": 0.1458198314970836, + "grad_norm": 1.166682243347168, + "learning_rate": 4.995475371424648e-06, + "loss": 0.1512, + "step": 900 + }, + { + "epoch": 0.14598185353208037, + "grad_norm": 1.2635172605514526, + "learning_rate": 4.995449035204798e-06, + "loss": 0.1575, + "step": 901 + }, + { + "epoch": 0.14614387556707711, + "grad_norm": 1.3577876091003418, + "learning_rate": 4.995422622630247e-06, + "loss": 0.1916, + "step": 902 + }, + { + "epoch": 0.14630589760207388, + "grad_norm": 1.3075361251831055, + "learning_rate": 4.995396133701803e-06, + "loss": 0.1454, + "step": 903 + }, + { + "epoch": 0.14646791963707065, + "grad_norm": 1.339972972869873, + "learning_rate": 4.995369568420276e-06, + "loss": 0.1741, + "step": 904 + }, + { + "epoch": 0.1466299416720674, + "grad_norm": 1.459546685218811, + "learning_rate": 4.995342926786478e-06, + "loss": 0.1882, + "step": 905 + }, + { + "epoch": 0.14679196370706415, + "grad_norm": 1.2182546854019165, + "learning_rate": 4.995316208801226e-06, + "loss": 0.1348, + "step": 906 + }, + { + "epoch": 0.14695398574206092, + "grad_norm": 1.376412272453308, + "learning_rate": 4.995289414465337e-06, + "loss": 0.1798, + "step": 907 + }, + { + "epoch": 0.1471160077770577, + "grad_norm": 1.489270806312561, + "learning_rate": 4.99526254377963e-06, + "loss": 0.1869, + "step": 908 + }, + { + "epoch": 0.14727802981205443, + "grad_norm": 1.320319652557373, + "learning_rate": 4.9952355967449265e-06, + "loss": 0.1671, + "step": 909 + }, + { + "epoch": 0.1474400518470512, + "grad_norm": 1.327093243598938, + "learning_rate": 4.995208573362053e-06, + "loss": 0.1765, + "step": 910 + }, + { + "epoch": 0.14760207388204796, + "grad_norm": 1.3886643648147583, + "learning_rate": 4.995181473631836e-06, + "loss": 0.1717, + "step": 911 + }, + { + "epoch": 0.14776409591704473, + "grad_norm": 1.3381421566009521, + "learning_rate": 4.995154297555103e-06, + "loss": 0.1716, + "step": 912 + }, + { + "epoch": 0.14792611795204147, + "grad_norm": 1.1352941989898682, + "learning_rate": 4.995127045132687e-06, + "loss": 0.1547, + "step": 913 + }, + { + "epoch": 0.14808813998703824, + "grad_norm": 1.4500410556793213, + "learning_rate": 4.995099716365421e-06, + "loss": 0.1913, + "step": 914 + }, + { + "epoch": 0.148250162022035, + "grad_norm": 1.28703773021698, + "learning_rate": 4.995072311254144e-06, + "loss": 0.18, + "step": 915 + }, + { + "epoch": 0.14841218405703174, + "grad_norm": 1.2766846418380737, + "learning_rate": 4.995044829799689e-06, + "loss": 0.1769, + "step": 916 + }, + { + "epoch": 0.1485742060920285, + "grad_norm": 1.2691501379013062, + "learning_rate": 4.995017272002902e-06, + "loss": 0.1684, + "step": 917 + }, + { + "epoch": 0.14873622812702528, + "grad_norm": 1.4125738143920898, + "learning_rate": 4.994989637864624e-06, + "loss": 0.1809, + "step": 918 + }, + { + "epoch": 0.14889825016202204, + "grad_norm": 1.3057345151901245, + "learning_rate": 4.994961927385701e-06, + "loss": 0.1608, + "step": 919 + }, + { + "epoch": 0.14906027219701878, + "grad_norm": 1.4931777715682983, + "learning_rate": 4.99493414056698e-06, + "loss": 0.1883, + "step": 920 + }, + { + "epoch": 0.14922229423201555, + "grad_norm": 1.4381946325302124, + "learning_rate": 4.994906277409313e-06, + "loss": 0.1773, + "step": 921 + }, + { + "epoch": 0.14938431626701232, + "grad_norm": 1.3123935461044312, + "learning_rate": 4.99487833791355e-06, + "loss": 0.159, + "step": 922 + }, + { + "epoch": 0.14954633830200909, + "grad_norm": 1.396132469177246, + "learning_rate": 4.994850322080549e-06, + "loss": 0.1704, + "step": 923 + }, + { + "epoch": 0.14970836033700582, + "grad_norm": 1.267357349395752, + "learning_rate": 4.9948222299111644e-06, + "loss": 0.1605, + "step": 924 + }, + { + "epoch": 0.1498703823720026, + "grad_norm": 1.2908340692520142, + "learning_rate": 4.994794061406258e-06, + "loss": 0.1625, + "step": 925 + }, + { + "epoch": 0.15003240440699936, + "grad_norm": 1.3178255558013916, + "learning_rate": 4.994765816566689e-06, + "loss": 0.176, + "step": 926 + }, + { + "epoch": 0.1501944264419961, + "grad_norm": 1.2020256519317627, + "learning_rate": 4.994737495393325e-06, + "loss": 0.1565, + "step": 927 + }, + { + "epoch": 0.15035644847699287, + "grad_norm": 1.346628189086914, + "learning_rate": 4.994709097887029e-06, + "loss": 0.1868, + "step": 928 + }, + { + "epoch": 0.15051847051198963, + "grad_norm": 1.440252661705017, + "learning_rate": 4.994680624048674e-06, + "loss": 0.184, + "step": 929 + }, + { + "epoch": 0.1506804925469864, + "grad_norm": 1.4660731554031372, + "learning_rate": 4.994652073879127e-06, + "loss": 0.192, + "step": 930 + }, + { + "epoch": 0.15084251458198314, + "grad_norm": 1.4855326414108276, + "learning_rate": 4.9946234473792645e-06, + "loss": 0.1839, + "step": 931 + }, + { + "epoch": 0.1510045366169799, + "grad_norm": 1.2965097427368164, + "learning_rate": 4.994594744549961e-06, + "loss": 0.1671, + "step": 932 + }, + { + "epoch": 0.15116655865197667, + "grad_norm": 1.3417192697525024, + "learning_rate": 4.994565965392094e-06, + "loss": 0.1841, + "step": 933 + }, + { + "epoch": 0.15132858068697344, + "grad_norm": 1.2108187675476074, + "learning_rate": 4.994537109906546e-06, + "loss": 0.1594, + "step": 934 + }, + { + "epoch": 0.15149060272197018, + "grad_norm": 1.4936498403549194, + "learning_rate": 4.994508178094199e-06, + "loss": 0.1838, + "step": 935 + }, + { + "epoch": 0.15165262475696695, + "grad_norm": 1.2515209913253784, + "learning_rate": 4.99447916995594e-06, + "loss": 0.1531, + "step": 936 + }, + { + "epoch": 0.15181464679196371, + "grad_norm": 1.2355049848556519, + "learning_rate": 4.994450085492653e-06, + "loss": 0.1466, + "step": 937 + }, + { + "epoch": 0.15197666882696045, + "grad_norm": 1.317088007926941, + "learning_rate": 4.99442092470523e-06, + "loss": 0.1851, + "step": 938 + }, + { + "epoch": 0.15213869086195722, + "grad_norm": 1.262994408607483, + "learning_rate": 4.994391687594564e-06, + "loss": 0.1656, + "step": 939 + }, + { + "epoch": 0.152300712896954, + "grad_norm": 1.1988916397094727, + "learning_rate": 4.994362374161548e-06, + "loss": 0.1457, + "step": 940 + }, + { + "epoch": 0.15246273493195076, + "grad_norm": 1.2624685764312744, + "learning_rate": 4.99433298440708e-06, + "loss": 0.1759, + "step": 941 + }, + { + "epoch": 0.1526247569669475, + "grad_norm": 1.4372855424880981, + "learning_rate": 4.994303518332059e-06, + "loss": 0.1792, + "step": 942 + }, + { + "epoch": 0.15278677900194426, + "grad_norm": 1.3524651527404785, + "learning_rate": 4.994273975937386e-06, + "loss": 0.1956, + "step": 943 + }, + { + "epoch": 0.15294880103694103, + "grad_norm": 1.2189857959747314, + "learning_rate": 4.994244357223965e-06, + "loss": 0.1646, + "step": 944 + }, + { + "epoch": 0.1531108230719378, + "grad_norm": 1.5901044607162476, + "learning_rate": 4.994214662192704e-06, + "loss": 0.1821, + "step": 945 + }, + { + "epoch": 0.15327284510693454, + "grad_norm": 1.2014747858047485, + "learning_rate": 4.994184890844509e-06, + "loss": 0.1585, + "step": 946 + }, + { + "epoch": 0.1534348671419313, + "grad_norm": 1.2449418306350708, + "learning_rate": 4.994155043180292e-06, + "loss": 0.1766, + "step": 947 + }, + { + "epoch": 0.15359688917692807, + "grad_norm": 1.1856237649917603, + "learning_rate": 4.9941251192009665e-06, + "loss": 0.1561, + "step": 948 + }, + { + "epoch": 0.1537589112119248, + "grad_norm": 1.3287744522094727, + "learning_rate": 4.994095118907449e-06, + "loss": 0.18, + "step": 949 + }, + { + "epoch": 0.15392093324692158, + "grad_norm": 1.4006623029708862, + "learning_rate": 4.994065042300655e-06, + "loss": 0.1852, + "step": 950 + }, + { + "epoch": 0.15408295528191834, + "grad_norm": 1.3355299234390259, + "learning_rate": 4.994034889381508e-06, + "loss": 0.1653, + "step": 951 + }, + { + "epoch": 0.1542449773169151, + "grad_norm": 1.5296586751937866, + "learning_rate": 4.994004660150927e-06, + "loss": 0.2171, + "step": 952 + }, + { + "epoch": 0.15440699935191185, + "grad_norm": 1.447704553604126, + "learning_rate": 4.99397435460984e-06, + "loss": 0.188, + "step": 953 + }, + { + "epoch": 0.15456902138690862, + "grad_norm": 1.272879958152771, + "learning_rate": 4.993943972759173e-06, + "loss": 0.1728, + "step": 954 + }, + { + "epoch": 0.15473104342190538, + "grad_norm": 1.1389333009719849, + "learning_rate": 4.9939135145998554e-06, + "loss": 0.1572, + "step": 955 + }, + { + "epoch": 0.15489306545690215, + "grad_norm": 1.3290883302688599, + "learning_rate": 4.993882980132819e-06, + "loss": 0.1665, + "step": 956 + }, + { + "epoch": 0.1550550874918989, + "grad_norm": 1.3863415718078613, + "learning_rate": 4.993852369358999e-06, + "loss": 0.1757, + "step": 957 + }, + { + "epoch": 0.15521710952689566, + "grad_norm": 1.211903691291809, + "learning_rate": 4.993821682279332e-06, + "loss": 0.1507, + "step": 958 + }, + { + "epoch": 0.15537913156189243, + "grad_norm": 1.2453619241714478, + "learning_rate": 4.9937909188947555e-06, + "loss": 0.1578, + "step": 959 + }, + { + "epoch": 0.15554115359688916, + "grad_norm": 1.4014263153076172, + "learning_rate": 4.993760079206212e-06, + "loss": 0.1949, + "step": 960 + }, + { + "epoch": 0.15570317563188593, + "grad_norm": 1.3202704191207886, + "learning_rate": 4.993729163214644e-06, + "loss": 0.1744, + "step": 961 + }, + { + "epoch": 0.1558651976668827, + "grad_norm": 1.3204492330551147, + "learning_rate": 4.993698170920999e-06, + "loss": 0.1833, + "step": 962 + }, + { + "epoch": 0.15602721970187947, + "grad_norm": 1.3719549179077148, + "learning_rate": 4.993667102326226e-06, + "loss": 0.1733, + "step": 963 + }, + { + "epoch": 0.1561892417368762, + "grad_norm": 1.2820831537246704, + "learning_rate": 4.993635957431273e-06, + "loss": 0.18, + "step": 964 + }, + { + "epoch": 0.15635126377187297, + "grad_norm": 1.2792859077453613, + "learning_rate": 4.993604736237094e-06, + "loss": 0.1643, + "step": 965 + }, + { + "epoch": 0.15651328580686974, + "grad_norm": 1.3071776628494263, + "learning_rate": 4.993573438744645e-06, + "loss": 0.1543, + "step": 966 + }, + { + "epoch": 0.1566753078418665, + "grad_norm": 1.3182742595672607, + "learning_rate": 4.993542064954883e-06, + "loss": 0.1719, + "step": 967 + }, + { + "epoch": 0.15683732987686325, + "grad_norm": 1.3087525367736816, + "learning_rate": 4.993510614868767e-06, + "loss": 0.1889, + "step": 968 + }, + { + "epoch": 0.15699935191186, + "grad_norm": 1.2305779457092285, + "learning_rate": 4.993479088487262e-06, + "loss": 0.1651, + "step": 969 + }, + { + "epoch": 0.15716137394685678, + "grad_norm": 1.1964472532272339, + "learning_rate": 4.99344748581133e-06, + "loss": 0.1523, + "step": 970 + }, + { + "epoch": 0.15732339598185352, + "grad_norm": 1.23350989818573, + "learning_rate": 4.993415806841939e-06, + "loss": 0.16, + "step": 971 + }, + { + "epoch": 0.1574854180168503, + "grad_norm": 1.3825608491897583, + "learning_rate": 4.993384051580059e-06, + "loss": 0.1884, + "step": 972 + }, + { + "epoch": 0.15764744005184705, + "grad_norm": 1.4116489887237549, + "learning_rate": 4.993352220026661e-06, + "loss": 0.1763, + "step": 973 + }, + { + "epoch": 0.15780946208684382, + "grad_norm": 1.418979287147522, + "learning_rate": 4.993320312182718e-06, + "loss": 0.1982, + "step": 974 + }, + { + "epoch": 0.15797148412184056, + "grad_norm": 1.3567852973937988, + "learning_rate": 4.993288328049208e-06, + "loss": 0.1845, + "step": 975 + }, + { + "epoch": 0.15813350615683733, + "grad_norm": 1.257607340812683, + "learning_rate": 4.993256267627108e-06, + "loss": 0.1626, + "step": 976 + }, + { + "epoch": 0.1582955281918341, + "grad_norm": 1.2353687286376953, + "learning_rate": 4.993224130917399e-06, + "loss": 0.1695, + "step": 977 + }, + { + "epoch": 0.15845755022683086, + "grad_norm": 1.4237691164016724, + "learning_rate": 4.993191917921066e-06, + "loss": 0.1841, + "step": 978 + }, + { + "epoch": 0.1586195722618276, + "grad_norm": 1.2207448482513428, + "learning_rate": 4.9931596286390935e-06, + "loss": 0.1518, + "step": 979 + }, + { + "epoch": 0.15878159429682437, + "grad_norm": 1.366822600364685, + "learning_rate": 4.9931272630724704e-06, + "loss": 0.1751, + "step": 980 + }, + { + "epoch": 0.15894361633182114, + "grad_norm": 1.3193942308425903, + "learning_rate": 4.993094821222186e-06, + "loss": 0.1615, + "step": 981 + }, + { + "epoch": 0.15910563836681788, + "grad_norm": 1.361041784286499, + "learning_rate": 4.993062303089233e-06, + "loss": 0.1669, + "step": 982 + }, + { + "epoch": 0.15926766040181464, + "grad_norm": 1.4017630815505981, + "learning_rate": 4.993029708674607e-06, + "loss": 0.1801, + "step": 983 + }, + { + "epoch": 0.1594296824368114, + "grad_norm": 1.3844646215438843, + "learning_rate": 4.992997037979304e-06, + "loss": 0.1771, + "step": 984 + }, + { + "epoch": 0.15959170447180818, + "grad_norm": 1.2512445449829102, + "learning_rate": 4.992964291004326e-06, + "loss": 0.1692, + "step": 985 + }, + { + "epoch": 0.15975372650680492, + "grad_norm": 1.4162150621414185, + "learning_rate": 4.992931467750673e-06, + "loss": 0.1696, + "step": 986 + }, + { + "epoch": 0.15991574854180168, + "grad_norm": 1.2711294889450073, + "learning_rate": 4.99289856821935e-06, + "loss": 0.1628, + "step": 987 + }, + { + "epoch": 0.16007777057679845, + "grad_norm": 1.318920373916626, + "learning_rate": 4.992865592411362e-06, + "loss": 0.1504, + "step": 988 + }, + { + "epoch": 0.16023979261179522, + "grad_norm": 1.3679457902908325, + "learning_rate": 4.992832540327721e-06, + "loss": 0.1738, + "step": 989 + }, + { + "epoch": 0.16040181464679196, + "grad_norm": 1.2336037158966064, + "learning_rate": 4.992799411969436e-06, + "loss": 0.1659, + "step": 990 + }, + { + "epoch": 0.16056383668178872, + "grad_norm": 1.1132322549819946, + "learning_rate": 4.992766207337523e-06, + "loss": 0.1369, + "step": 991 + }, + { + "epoch": 0.1607258587167855, + "grad_norm": 1.2798656225204468, + "learning_rate": 4.992732926432995e-06, + "loss": 0.1702, + "step": 992 + }, + { + "epoch": 0.16088788075178223, + "grad_norm": 1.3788708448410034, + "learning_rate": 4.992699569256872e-06, + "loss": 0.1978, + "step": 993 + }, + { + "epoch": 0.161049902786779, + "grad_norm": 1.2766364812850952, + "learning_rate": 4.9926661358101745e-06, + "loss": 0.1856, + "step": 994 + }, + { + "epoch": 0.16121192482177576, + "grad_norm": 1.371200680732727, + "learning_rate": 4.992632626093926e-06, + "loss": 0.1999, + "step": 995 + }, + { + "epoch": 0.16137394685677253, + "grad_norm": 1.2448960542678833, + "learning_rate": 4.9925990401091505e-06, + "loss": 0.1574, + "step": 996 + }, + { + "epoch": 0.16153596889176927, + "grad_norm": 1.2014307975769043, + "learning_rate": 4.992565377856876e-06, + "loss": 0.1572, + "step": 997 + }, + { + "epoch": 0.16169799092676604, + "grad_norm": 1.3043875694274902, + "learning_rate": 4.992531639338133e-06, + "loss": 0.1746, + "step": 998 + }, + { + "epoch": 0.1618600129617628, + "grad_norm": 1.3030551671981812, + "learning_rate": 4.992497824553954e-06, + "loss": 0.172, + "step": 999 + }, + { + "epoch": 0.16202203499675957, + "grad_norm": 1.3272135257720947, + "learning_rate": 4.992463933505374e-06, + "loss": 0.1755, + "step": 1000 + }, + { + "epoch": 0.1621840570317563, + "grad_norm": 1.2103444337844849, + "learning_rate": 4.992429966193428e-06, + "loss": 0.1522, + "step": 1001 + }, + { + "epoch": 0.16234607906675308, + "grad_norm": 1.353193759918213, + "learning_rate": 4.9923959226191574e-06, + "loss": 0.1796, + "step": 1002 + }, + { + "epoch": 0.16250810110174985, + "grad_norm": 1.339188814163208, + "learning_rate": 4.992361802783603e-06, + "loss": 0.1926, + "step": 1003 + }, + { + "epoch": 0.16267012313674659, + "grad_norm": 1.269451379776001, + "learning_rate": 4.992327606687808e-06, + "loss": 0.1597, + "step": 1004 + }, + { + "epoch": 0.16283214517174335, + "grad_norm": 1.325924277305603, + "learning_rate": 4.992293334332821e-06, + "loss": 0.1785, + "step": 1005 + }, + { + "epoch": 0.16299416720674012, + "grad_norm": 1.2406764030456543, + "learning_rate": 4.992258985719688e-06, + "loss": 0.1658, + "step": 1006 + }, + { + "epoch": 0.1631561892417369, + "grad_norm": 1.3176758289337158, + "learning_rate": 4.992224560849461e-06, + "loss": 0.1802, + "step": 1007 + }, + { + "epoch": 0.16331821127673363, + "grad_norm": 1.3824093341827393, + "learning_rate": 4.992190059723194e-06, + "loss": 0.1961, + "step": 1008 + }, + { + "epoch": 0.1634802333117304, + "grad_norm": 1.223380446434021, + "learning_rate": 4.9921554823419424e-06, + "loss": 0.1567, + "step": 1009 + }, + { + "epoch": 0.16364225534672716, + "grad_norm": 1.3019189834594727, + "learning_rate": 4.992120828706763e-06, + "loss": 0.1695, + "step": 1010 + }, + { + "epoch": 0.1638042773817239, + "grad_norm": 1.3338146209716797, + "learning_rate": 4.9920860988187185e-06, + "loss": 0.1876, + "step": 1011 + }, + { + "epoch": 0.16396629941672067, + "grad_norm": 1.167526364326477, + "learning_rate": 4.99205129267887e-06, + "loss": 0.1609, + "step": 1012 + }, + { + "epoch": 0.16412832145171743, + "grad_norm": 1.1966924667358398, + "learning_rate": 4.9920164102882816e-06, + "loss": 0.1576, + "step": 1013 + }, + { + "epoch": 0.1642903434867142, + "grad_norm": 1.1444998979568481, + "learning_rate": 4.991981451648022e-06, + "loss": 0.1646, + "step": 1014 + }, + { + "epoch": 0.16445236552171094, + "grad_norm": 1.1742830276489258, + "learning_rate": 4.99194641675916e-06, + "loss": 0.1607, + "step": 1015 + }, + { + "epoch": 0.1646143875567077, + "grad_norm": 1.199351191520691, + "learning_rate": 4.9919113056227685e-06, + "loss": 0.1506, + "step": 1016 + }, + { + "epoch": 0.16477640959170448, + "grad_norm": 1.2436610460281372, + "learning_rate": 4.991876118239922e-06, + "loss": 0.1683, + "step": 1017 + }, + { + "epoch": 0.16493843162670124, + "grad_norm": 1.322636365890503, + "learning_rate": 4.991840854611696e-06, + "loss": 0.1757, + "step": 1018 + }, + { + "epoch": 0.16510045366169798, + "grad_norm": 1.06990647315979, + "learning_rate": 4.99180551473917e-06, + "loss": 0.1369, + "step": 1019 + }, + { + "epoch": 0.16526247569669475, + "grad_norm": 1.193261981010437, + "learning_rate": 4.991770098623425e-06, + "loss": 0.1608, + "step": 1020 + }, + { + "epoch": 0.16542449773169152, + "grad_norm": 1.2813093662261963, + "learning_rate": 4.991734606265544e-06, + "loss": 0.1531, + "step": 1021 + }, + { + "epoch": 0.16558651976668826, + "grad_norm": 1.2579412460327148, + "learning_rate": 4.9916990376666156e-06, + "loss": 0.1675, + "step": 1022 + }, + { + "epoch": 0.16574854180168502, + "grad_norm": 1.3288764953613281, + "learning_rate": 4.991663392827726e-06, + "loss": 0.1501, + "step": 1023 + }, + { + "epoch": 0.1659105638366818, + "grad_norm": 1.4220918416976929, + "learning_rate": 4.991627671749966e-06, + "loss": 0.1591, + "step": 1024 + }, + { + "epoch": 0.16607258587167856, + "grad_norm": 1.4772206544876099, + "learning_rate": 4.991591874434429e-06, + "loss": 0.1787, + "step": 1025 + }, + { + "epoch": 0.1662346079066753, + "grad_norm": 1.2587242126464844, + "learning_rate": 4.9915560008822105e-06, + "loss": 0.164, + "step": 1026 + }, + { + "epoch": 0.16639662994167206, + "grad_norm": 1.2085176706314087, + "learning_rate": 4.991520051094407e-06, + "loss": 0.1701, + "step": 1027 + }, + { + "epoch": 0.16655865197666883, + "grad_norm": 1.2200427055358887, + "learning_rate": 4.99148402507212e-06, + "loss": 0.1538, + "step": 1028 + }, + { + "epoch": 0.1667206740116656, + "grad_norm": 1.2087466716766357, + "learning_rate": 4.991447922816451e-06, + "loss": 0.1672, + "step": 1029 + }, + { + "epoch": 0.16688269604666234, + "grad_norm": 1.2888163328170776, + "learning_rate": 4.991411744328505e-06, + "loss": 0.1578, + "step": 1030 + }, + { + "epoch": 0.1670447180816591, + "grad_norm": 1.478442668914795, + "learning_rate": 4.991375489609388e-06, + "loss": 0.1984, + "step": 1031 + }, + { + "epoch": 0.16720674011665587, + "grad_norm": 1.3319884538650513, + "learning_rate": 4.991339158660211e-06, + "loss": 0.1584, + "step": 1032 + }, + { + "epoch": 0.1673687621516526, + "grad_norm": 1.1861016750335693, + "learning_rate": 4.991302751482084e-06, + "loss": 0.1581, + "step": 1033 + }, + { + "epoch": 0.16753078418664938, + "grad_norm": 1.2733880281448364, + "learning_rate": 4.991266268076121e-06, + "loss": 0.1757, + "step": 1034 + }, + { + "epoch": 0.16769280622164615, + "grad_norm": 1.2338616847991943, + "learning_rate": 4.99122970844344e-06, + "loss": 0.1733, + "step": 1035 + }, + { + "epoch": 0.1678548282566429, + "grad_norm": 1.305107831954956, + "learning_rate": 4.991193072585158e-06, + "loss": 0.1722, + "step": 1036 + }, + { + "epoch": 0.16801685029163965, + "grad_norm": 1.245059847831726, + "learning_rate": 4.991156360502397e-06, + "loss": 0.1636, + "step": 1037 + }, + { + "epoch": 0.16817887232663642, + "grad_norm": 1.260838270187378, + "learning_rate": 4.99111957219628e-06, + "loss": 0.1777, + "step": 1038 + }, + { + "epoch": 0.1683408943616332, + "grad_norm": 1.13496994972229, + "learning_rate": 4.9910827076679325e-06, + "loss": 0.1579, + "step": 1039 + }, + { + "epoch": 0.16850291639662995, + "grad_norm": 1.2134323120117188, + "learning_rate": 4.991045766918482e-06, + "loss": 0.1666, + "step": 1040 + }, + { + "epoch": 0.1686649384316267, + "grad_norm": 1.0484683513641357, + "learning_rate": 4.9910087499490585e-06, + "loss": 0.1457, + "step": 1041 + }, + { + "epoch": 0.16882696046662346, + "grad_norm": 1.2500617504119873, + "learning_rate": 4.990971656760797e-06, + "loss": 0.1704, + "step": 1042 + }, + { + "epoch": 0.16898898250162023, + "grad_norm": 1.296579360961914, + "learning_rate": 4.990934487354831e-06, + "loss": 0.1739, + "step": 1043 + }, + { + "epoch": 0.16915100453661697, + "grad_norm": 1.2237029075622559, + "learning_rate": 4.990897241732296e-06, + "loss": 0.1624, + "step": 1044 + }, + { + "epoch": 0.16931302657161373, + "grad_norm": 1.361970067024231, + "learning_rate": 4.9908599198943346e-06, + "loss": 0.193, + "step": 1045 + }, + { + "epoch": 0.1694750486066105, + "grad_norm": 1.2445952892303467, + "learning_rate": 4.990822521842086e-06, + "loss": 0.1788, + "step": 1046 + }, + { + "epoch": 0.16963707064160727, + "grad_norm": 1.3412961959838867, + "learning_rate": 4.990785047576697e-06, + "loss": 0.1853, + "step": 1047 + }, + { + "epoch": 0.169799092676604, + "grad_norm": 1.2915765047073364, + "learning_rate": 4.990747497099312e-06, + "loss": 0.182, + "step": 1048 + }, + { + "epoch": 0.16996111471160077, + "grad_norm": 1.2325940132141113, + "learning_rate": 4.990709870411082e-06, + "loss": 0.1495, + "step": 1049 + }, + { + "epoch": 0.17012313674659754, + "grad_norm": 1.2738209962844849, + "learning_rate": 4.990672167513158e-06, + "loss": 0.1584, + "step": 1050 + }, + { + "epoch": 0.1702851587815943, + "grad_norm": 1.3789118528366089, + "learning_rate": 4.990634388406692e-06, + "loss": 0.1932, + "step": 1051 + }, + { + "epoch": 0.17044718081659105, + "grad_norm": 1.3370823860168457, + "learning_rate": 4.990596533092841e-06, + "loss": 0.1788, + "step": 1052 + }, + { + "epoch": 0.17060920285158782, + "grad_norm": 1.381735920906067, + "learning_rate": 4.990558601572764e-06, + "loss": 0.189, + "step": 1053 + }, + { + "epoch": 0.17077122488658458, + "grad_norm": 1.2468198537826538, + "learning_rate": 4.9905205938476195e-06, + "loss": 0.1635, + "step": 1054 + }, + { + "epoch": 0.17093324692158132, + "grad_norm": 1.2447727918624878, + "learning_rate": 4.990482509918572e-06, + "loss": 0.1759, + "step": 1055 + }, + { + "epoch": 0.1710952689565781, + "grad_norm": 1.266304612159729, + "learning_rate": 4.990444349786788e-06, + "loss": 0.186, + "step": 1056 + }, + { + "epoch": 0.17125729099157486, + "grad_norm": 1.2319304943084717, + "learning_rate": 4.990406113453433e-06, + "loss": 0.1781, + "step": 1057 + }, + { + "epoch": 0.17141931302657162, + "grad_norm": 1.234866738319397, + "learning_rate": 4.990367800919677e-06, + "loss": 0.1719, + "step": 1058 + }, + { + "epoch": 0.17158133506156836, + "grad_norm": 1.0335838794708252, + "learning_rate": 4.990329412186693e-06, + "loss": 0.1407, + "step": 1059 + }, + { + "epoch": 0.17174335709656513, + "grad_norm": 1.2708618640899658, + "learning_rate": 4.990290947255656e-06, + "loss": 0.1635, + "step": 1060 + }, + { + "epoch": 0.1719053791315619, + "grad_norm": 1.3228340148925781, + "learning_rate": 4.990252406127742e-06, + "loss": 0.186, + "step": 1061 + }, + { + "epoch": 0.17206740116655866, + "grad_norm": 1.2613164186477661, + "learning_rate": 4.9902137888041304e-06, + "loss": 0.1698, + "step": 1062 + }, + { + "epoch": 0.1722294232015554, + "grad_norm": 1.2189115285873413, + "learning_rate": 4.990175095286003e-06, + "loss": 0.1622, + "step": 1063 + }, + { + "epoch": 0.17239144523655217, + "grad_norm": 1.113931655883789, + "learning_rate": 4.990136325574545e-06, + "loss": 0.1433, + "step": 1064 + }, + { + "epoch": 0.17255346727154894, + "grad_norm": 1.1805305480957031, + "learning_rate": 4.9900974796709405e-06, + "loss": 0.1345, + "step": 1065 + }, + { + "epoch": 0.17271548930654568, + "grad_norm": 1.3531560897827148, + "learning_rate": 4.990058557576379e-06, + "loss": 0.175, + "step": 1066 + }, + { + "epoch": 0.17287751134154244, + "grad_norm": 1.403969168663025, + "learning_rate": 4.990019559292052e-06, + "loss": 0.1674, + "step": 1067 + }, + { + "epoch": 0.1730395333765392, + "grad_norm": 1.4083645343780518, + "learning_rate": 4.989980484819152e-06, + "loss": 0.1722, + "step": 1068 + }, + { + "epoch": 0.17320155541153598, + "grad_norm": 1.2746031284332275, + "learning_rate": 4.989941334158874e-06, + "loss": 0.1638, + "step": 1069 + }, + { + "epoch": 0.17336357744653272, + "grad_norm": 1.2782847881317139, + "learning_rate": 4.9899021073124175e-06, + "loss": 0.1733, + "step": 1070 + }, + { + "epoch": 0.17352559948152949, + "grad_norm": 1.4043595790863037, + "learning_rate": 4.989862804280982e-06, + "loss": 0.1756, + "step": 1071 + }, + { + "epoch": 0.17368762151652625, + "grad_norm": 1.3001036643981934, + "learning_rate": 4.989823425065769e-06, + "loss": 0.1839, + "step": 1072 + }, + { + "epoch": 0.17384964355152302, + "grad_norm": 1.2596933841705322, + "learning_rate": 4.989783969667986e-06, + "loss": 0.175, + "step": 1073 + }, + { + "epoch": 0.17401166558651976, + "grad_norm": 1.2950873374938965, + "learning_rate": 4.989744438088838e-06, + "loss": 0.1684, + "step": 1074 + }, + { + "epoch": 0.17417368762151653, + "grad_norm": 1.1493442058563232, + "learning_rate": 4.989704830329535e-06, + "loss": 0.1672, + "step": 1075 + }, + { + "epoch": 0.1743357096565133, + "grad_norm": 1.1817610263824463, + "learning_rate": 4.98966514639129e-06, + "loss": 0.1617, + "step": 1076 + }, + { + "epoch": 0.17449773169151003, + "grad_norm": 1.269439697265625, + "learning_rate": 4.989625386275315e-06, + "loss": 0.1821, + "step": 1077 + }, + { + "epoch": 0.1746597537265068, + "grad_norm": 1.2774385213851929, + "learning_rate": 4.98958554998283e-06, + "loss": 0.1772, + "step": 1078 + }, + { + "epoch": 0.17482177576150357, + "grad_norm": 1.3013243675231934, + "learning_rate": 4.98954563751505e-06, + "loss": 0.1776, + "step": 1079 + }, + { + "epoch": 0.17498379779650033, + "grad_norm": 1.1634021997451782, + "learning_rate": 4.989505648873198e-06, + "loss": 0.164, + "step": 1080 + }, + { + "epoch": 0.17514581983149707, + "grad_norm": 1.296128273010254, + "learning_rate": 4.989465584058499e-06, + "loss": 0.1689, + "step": 1081 + }, + { + "epoch": 0.17530784186649384, + "grad_norm": 1.4235268831253052, + "learning_rate": 4.989425443072177e-06, + "loss": 0.2038, + "step": 1082 + }, + { + "epoch": 0.1754698639014906, + "grad_norm": 1.384175181388855, + "learning_rate": 4.989385225915461e-06, + "loss": 0.1739, + "step": 1083 + }, + { + "epoch": 0.17563188593648738, + "grad_norm": 1.1942464113235474, + "learning_rate": 4.9893449325895804e-06, + "loss": 0.1599, + "step": 1084 + }, + { + "epoch": 0.17579390797148411, + "grad_norm": 1.3841326236724854, + "learning_rate": 4.989304563095769e-06, + "loss": 0.1975, + "step": 1085 + }, + { + "epoch": 0.17595593000648088, + "grad_norm": 1.2877038717269897, + "learning_rate": 4.989264117435263e-06, + "loss": 0.1825, + "step": 1086 + }, + { + "epoch": 0.17611795204147765, + "grad_norm": 1.3684138059616089, + "learning_rate": 4.9892235956092985e-06, + "loss": 0.1802, + "step": 1087 + }, + { + "epoch": 0.1762799740764744, + "grad_norm": 1.2257750034332275, + "learning_rate": 4.9891829976191155e-06, + "loss": 0.169, + "step": 1088 + }, + { + "epoch": 0.17644199611147116, + "grad_norm": 1.281955599784851, + "learning_rate": 4.989142323465957e-06, + "loss": 0.1884, + "step": 1089 + }, + { + "epoch": 0.17660401814646792, + "grad_norm": 1.2138808965682983, + "learning_rate": 4.9891015731510665e-06, + "loss": 0.1738, + "step": 1090 + }, + { + "epoch": 0.1767660401814647, + "grad_norm": 1.185698390007019, + "learning_rate": 4.989060746675691e-06, + "loss": 0.1527, + "step": 1091 + }, + { + "epoch": 0.17692806221646143, + "grad_norm": 1.1625752449035645, + "learning_rate": 4.989019844041081e-06, + "loss": 0.1573, + "step": 1092 + }, + { + "epoch": 0.1770900842514582, + "grad_norm": 1.1147958040237427, + "learning_rate": 4.988978865248486e-06, + "loss": 0.1471, + "step": 1093 + }, + { + "epoch": 0.17725210628645496, + "grad_norm": 1.0958008766174316, + "learning_rate": 4.988937810299161e-06, + "loss": 0.1473, + "step": 1094 + }, + { + "epoch": 0.17741412832145173, + "grad_norm": 1.2946974039077759, + "learning_rate": 4.988896679194363e-06, + "loss": 0.169, + "step": 1095 + }, + { + "epoch": 0.17757615035644847, + "grad_norm": 1.2533652782440186, + "learning_rate": 4.98885547193535e-06, + "loss": 0.1622, + "step": 1096 + }, + { + "epoch": 0.17773817239144524, + "grad_norm": 1.2509846687316895, + "learning_rate": 4.988814188523381e-06, + "loss": 0.1657, + "step": 1097 + }, + { + "epoch": 0.177900194426442, + "grad_norm": 1.2330974340438843, + "learning_rate": 4.988772828959722e-06, + "loss": 0.171, + "step": 1098 + }, + { + "epoch": 0.17806221646143874, + "grad_norm": 1.1308923959732056, + "learning_rate": 4.988731393245636e-06, + "loss": 0.1486, + "step": 1099 + }, + { + "epoch": 0.1782242384964355, + "grad_norm": 1.2318450212478638, + "learning_rate": 4.988689881382392e-06, + "loss": 0.1681, + "step": 1100 + }, + { + "epoch": 0.17838626053143228, + "grad_norm": 1.292179822921753, + "learning_rate": 4.988648293371262e-06, + "loss": 0.1751, + "step": 1101 + }, + { + "epoch": 0.17854828256642905, + "grad_norm": 1.2476286888122559, + "learning_rate": 4.988606629213515e-06, + "loss": 0.1792, + "step": 1102 + }, + { + "epoch": 0.17871030460142578, + "grad_norm": 1.4195069074630737, + "learning_rate": 4.988564888910428e-06, + "loss": 0.1311, + "step": 1103 + }, + { + "epoch": 0.17887232663642255, + "grad_norm": 1.1690012216567993, + "learning_rate": 4.9885230724632775e-06, + "loss": 0.1554, + "step": 1104 + }, + { + "epoch": 0.17903434867141932, + "grad_norm": 1.2006351947784424, + "learning_rate": 4.988481179873342e-06, + "loss": 0.1589, + "step": 1105 + }, + { + "epoch": 0.17919637070641609, + "grad_norm": 1.189113974571228, + "learning_rate": 4.9884392111419056e-06, + "loss": 0.1631, + "step": 1106 + }, + { + "epoch": 0.17935839274141283, + "grad_norm": 1.1603453159332275, + "learning_rate": 4.9883971662702514e-06, + "loss": 0.1682, + "step": 1107 + }, + { + "epoch": 0.1795204147764096, + "grad_norm": 1.2347701787948608, + "learning_rate": 4.988355045259665e-06, + "loss": 0.1641, + "step": 1108 + }, + { + "epoch": 0.17968243681140636, + "grad_norm": 1.1546489000320435, + "learning_rate": 4.988312848111436e-06, + "loss": 0.1586, + "step": 1109 + }, + { + "epoch": 0.1798444588464031, + "grad_norm": 1.1792272329330444, + "learning_rate": 4.988270574826857e-06, + "loss": 0.1579, + "step": 1110 + }, + { + "epoch": 0.18000648088139987, + "grad_norm": 1.2593525648117065, + "learning_rate": 4.988228225407218e-06, + "loss": 0.1624, + "step": 1111 + }, + { + "epoch": 0.18016850291639663, + "grad_norm": 1.1456053256988525, + "learning_rate": 4.9881857998538175e-06, + "loss": 0.1591, + "step": 1112 + }, + { + "epoch": 0.1803305249513934, + "grad_norm": 1.169472336769104, + "learning_rate": 4.988143298167952e-06, + "loss": 0.1534, + "step": 1113 + }, + { + "epoch": 0.18049254698639014, + "grad_norm": 1.1957638263702393, + "learning_rate": 4.988100720350924e-06, + "loss": 0.1684, + "step": 1114 + }, + { + "epoch": 0.1806545690213869, + "grad_norm": 1.2542282342910767, + "learning_rate": 4.988058066404035e-06, + "loss": 0.1566, + "step": 1115 + }, + { + "epoch": 0.18081659105638367, + "grad_norm": 1.421115517616272, + "learning_rate": 4.988015336328589e-06, + "loss": 0.1874, + "step": 1116 + }, + { + "epoch": 0.18097861309138044, + "grad_norm": 1.3274586200714111, + "learning_rate": 4.987972530125895e-06, + "loss": 0.19, + "step": 1117 + }, + { + "epoch": 0.18114063512637718, + "grad_norm": 1.2618639469146729, + "learning_rate": 4.987929647797263e-06, + "loss": 0.1596, + "step": 1118 + }, + { + "epoch": 0.18130265716137395, + "grad_norm": 1.172289490699768, + "learning_rate": 4.987886689344003e-06, + "loss": 0.1407, + "step": 1119 + }, + { + "epoch": 0.18146467919637072, + "grad_norm": 1.2448173761367798, + "learning_rate": 4.987843654767432e-06, + "loss": 0.1548, + "step": 1120 + }, + { + "epoch": 0.18162670123136745, + "grad_norm": 1.3023630380630493, + "learning_rate": 4.987800544068866e-06, + "loss": 0.1638, + "step": 1121 + }, + { + "epoch": 0.18178872326636422, + "grad_norm": 1.1977190971374512, + "learning_rate": 4.987757357249623e-06, + "loss": 0.1713, + "step": 1122 + }, + { + "epoch": 0.181950745301361, + "grad_norm": 1.3710788488388062, + "learning_rate": 4.987714094311026e-06, + "loss": 0.1915, + "step": 1123 + }, + { + "epoch": 0.18211276733635776, + "grad_norm": 1.228145956993103, + "learning_rate": 4.987670755254397e-06, + "loss": 0.1672, + "step": 1124 + }, + { + "epoch": 0.1822747893713545, + "grad_norm": 1.3164801597595215, + "learning_rate": 4.9876273400810636e-06, + "loss": 0.2021, + "step": 1125 + }, + { + "epoch": 0.18243681140635126, + "grad_norm": 1.2684440612792969, + "learning_rate": 4.987583848792353e-06, + "loss": 0.1668, + "step": 1126 + }, + { + "epoch": 0.18259883344134803, + "grad_norm": 1.2884535789489746, + "learning_rate": 4.987540281389596e-06, + "loss": 0.1717, + "step": 1127 + }, + { + "epoch": 0.18276085547634477, + "grad_norm": 1.170680284500122, + "learning_rate": 4.987496637874127e-06, + "loss": 0.1524, + "step": 1128 + }, + { + "epoch": 0.18292287751134154, + "grad_norm": 1.2426053285598755, + "learning_rate": 4.98745291824728e-06, + "loss": 0.167, + "step": 1129 + }, + { + "epoch": 0.1830848995463383, + "grad_norm": 1.2498632669448853, + "learning_rate": 4.987409122510394e-06, + "loss": 0.1702, + "step": 1130 + }, + { + "epoch": 0.18324692158133507, + "grad_norm": 1.2569835186004639, + "learning_rate": 4.987365250664807e-06, + "loss": 0.1658, + "step": 1131 + }, + { + "epoch": 0.1834089436163318, + "grad_norm": 1.327354073524475, + "learning_rate": 4.9873213027118635e-06, + "loss": 0.1838, + "step": 1132 + }, + { + "epoch": 0.18357096565132858, + "grad_norm": 1.0752174854278564, + "learning_rate": 4.987277278652907e-06, + "loss": 0.1393, + "step": 1133 + }, + { + "epoch": 0.18373298768632534, + "grad_norm": 1.2634954452514648, + "learning_rate": 4.987233178489285e-06, + "loss": 0.1603, + "step": 1134 + }, + { + "epoch": 0.1838950097213221, + "grad_norm": 1.1911259889602661, + "learning_rate": 4.987189002222347e-06, + "loss": 0.1575, + "step": 1135 + }, + { + "epoch": 0.18405703175631885, + "grad_norm": 1.2841031551361084, + "learning_rate": 4.987144749853444e-06, + "loss": 0.1747, + "step": 1136 + }, + { + "epoch": 0.18421905379131562, + "grad_norm": 1.2230887413024902, + "learning_rate": 4.987100421383931e-06, + "loss": 0.1785, + "step": 1137 + }, + { + "epoch": 0.18438107582631239, + "grad_norm": 1.2518471479415894, + "learning_rate": 4.987056016815163e-06, + "loss": 0.1764, + "step": 1138 + }, + { + "epoch": 0.18454309786130912, + "grad_norm": 1.0959880352020264, + "learning_rate": 4.9870115361485e-06, + "loss": 0.1398, + "step": 1139 + }, + { + "epoch": 0.1847051198963059, + "grad_norm": 1.3243485689163208, + "learning_rate": 4.986966979385302e-06, + "loss": 0.1744, + "step": 1140 + }, + { + "epoch": 0.18486714193130266, + "grad_norm": 1.2652283906936646, + "learning_rate": 4.986922346526933e-06, + "loss": 0.1814, + "step": 1141 + }, + { + "epoch": 0.18502916396629943, + "grad_norm": 1.2872614860534668, + "learning_rate": 4.986877637574758e-06, + "loss": 0.1784, + "step": 1142 + }, + { + "epoch": 0.18519118600129617, + "grad_norm": 1.1612954139709473, + "learning_rate": 4.9868328525301465e-06, + "loss": 0.1584, + "step": 1143 + }, + { + "epoch": 0.18535320803629293, + "grad_norm": 1.3280037641525269, + "learning_rate": 4.986787991394467e-06, + "loss": 0.1737, + "step": 1144 + }, + { + "epoch": 0.1855152300712897, + "grad_norm": 1.0931044816970825, + "learning_rate": 4.986743054169093e-06, + "loss": 0.1389, + "step": 1145 + }, + { + "epoch": 0.18567725210628647, + "grad_norm": 1.2529737949371338, + "learning_rate": 4.9866980408554e-06, + "loss": 0.1537, + "step": 1146 + }, + { + "epoch": 0.1858392741412832, + "grad_norm": 1.2056132555007935, + "learning_rate": 4.986652951454764e-06, + "loss": 0.1531, + "step": 1147 + }, + { + "epoch": 0.18600129617627997, + "grad_norm": 1.2318496704101562, + "learning_rate": 4.986607785968565e-06, + "loss": 0.1839, + "step": 1148 + }, + { + "epoch": 0.18616331821127674, + "grad_norm": 1.3150043487548828, + "learning_rate": 4.9865625443981854e-06, + "loss": 0.1764, + "step": 1149 + }, + { + "epoch": 0.18632534024627348, + "grad_norm": 1.3862760066986084, + "learning_rate": 4.986517226745009e-06, + "loss": 0.1665, + "step": 1150 + }, + { + "epoch": 0.18648736228127025, + "grad_norm": 1.3067452907562256, + "learning_rate": 4.986471833010423e-06, + "loss": 0.1704, + "step": 1151 + }, + { + "epoch": 0.18664938431626701, + "grad_norm": 1.1937528848648071, + "learning_rate": 4.9864263631958165e-06, + "loss": 0.1661, + "step": 1152 + }, + { + "epoch": 0.18681140635126378, + "grad_norm": 1.2431349754333496, + "learning_rate": 4.986380817302579e-06, + "loss": 0.1721, + "step": 1153 + }, + { + "epoch": 0.18697342838626052, + "grad_norm": 1.0870198011398315, + "learning_rate": 4.986335195332107e-06, + "loss": 0.1445, + "step": 1154 + }, + { + "epoch": 0.1871354504212573, + "grad_norm": 1.366522192955017, + "learning_rate": 4.986289497285794e-06, + "loss": 0.1665, + "step": 1155 + }, + { + "epoch": 0.18729747245625405, + "grad_norm": 1.1426249742507935, + "learning_rate": 4.986243723165039e-06, + "loss": 0.1526, + "step": 1156 + }, + { + "epoch": 0.18745949449125082, + "grad_norm": 1.2970033884048462, + "learning_rate": 4.986197872971244e-06, + "loss": 0.18, + "step": 1157 + }, + { + "epoch": 0.18762151652624756, + "grad_norm": 1.2247062921524048, + "learning_rate": 4.9861519467058094e-06, + "loss": 0.1613, + "step": 1158 + }, + { + "epoch": 0.18778353856124433, + "grad_norm": 1.169364333152771, + "learning_rate": 4.986105944370142e-06, + "loss": 0.1621, + "step": 1159 + }, + { + "epoch": 0.1879455605962411, + "grad_norm": 1.2808866500854492, + "learning_rate": 4.986059865965649e-06, + "loss": 0.1978, + "step": 1160 + }, + { + "epoch": 0.18810758263123784, + "grad_norm": 1.209956169128418, + "learning_rate": 4.986013711493739e-06, + "loss": 0.1637, + "step": 1161 + }, + { + "epoch": 0.1882696046662346, + "grad_norm": 1.3208413124084473, + "learning_rate": 4.985967480955827e-06, + "loss": 0.181, + "step": 1162 + }, + { + "epoch": 0.18843162670123137, + "grad_norm": 1.1594798564910889, + "learning_rate": 4.985921174353325e-06, + "loss": 0.1576, + "step": 1163 + }, + { + "epoch": 0.18859364873622814, + "grad_norm": 1.7212146520614624, + "learning_rate": 4.9858747916876515e-06, + "loss": 0.1835, + "step": 1164 + }, + { + "epoch": 0.18875567077122488, + "grad_norm": 1.0957105159759521, + "learning_rate": 4.985828332960225e-06, + "loss": 0.1462, + "step": 1165 + }, + { + "epoch": 0.18891769280622164, + "grad_norm": 1.3084644079208374, + "learning_rate": 4.985781798172467e-06, + "loss": 0.1791, + "step": 1166 + }, + { + "epoch": 0.1890797148412184, + "grad_norm": 1.1999437808990479, + "learning_rate": 4.985735187325802e-06, + "loss": 0.1615, + "step": 1167 + }, + { + "epoch": 0.18924173687621518, + "grad_norm": 1.319326400756836, + "learning_rate": 4.9856885004216545e-06, + "loss": 0.1777, + "step": 1168 + }, + { + "epoch": 0.18940375891121192, + "grad_norm": 1.5149074792861938, + "learning_rate": 4.985641737461455e-06, + "loss": 0.1957, + "step": 1169 + }, + { + "epoch": 0.18956578094620868, + "grad_norm": 1.3002344369888306, + "learning_rate": 4.985594898446633e-06, + "loss": 0.1655, + "step": 1170 + }, + { + "epoch": 0.18972780298120545, + "grad_norm": 1.3267287015914917, + "learning_rate": 4.985547983378622e-06, + "loss": 0.1698, + "step": 1171 + }, + { + "epoch": 0.1898898250162022, + "grad_norm": 1.2608460187911987, + "learning_rate": 4.9855009922588585e-06, + "loss": 0.1638, + "step": 1172 + }, + { + "epoch": 0.19005184705119896, + "grad_norm": 1.1155411005020142, + "learning_rate": 4.985453925088779e-06, + "loss": 0.1507, + "step": 1173 + }, + { + "epoch": 0.19021386908619572, + "grad_norm": 1.2303216457366943, + "learning_rate": 4.985406781869824e-06, + "loss": 0.1799, + "step": 1174 + }, + { + "epoch": 0.1903758911211925, + "grad_norm": 1.1471197605133057, + "learning_rate": 4.985359562603436e-06, + "loss": 0.1512, + "step": 1175 + }, + { + "epoch": 0.19053791315618923, + "grad_norm": 1.1257221698760986, + "learning_rate": 4.98531226729106e-06, + "loss": 0.1563, + "step": 1176 + }, + { + "epoch": 0.190699935191186, + "grad_norm": 1.333609700202942, + "learning_rate": 4.985264895934142e-06, + "loss": 0.2002, + "step": 1177 + }, + { + "epoch": 0.19086195722618277, + "grad_norm": 1.2045459747314453, + "learning_rate": 4.985217448534134e-06, + "loss": 0.1613, + "step": 1178 + }, + { + "epoch": 0.19102397926117953, + "grad_norm": 1.1349177360534668, + "learning_rate": 4.985169925092485e-06, + "loss": 0.1533, + "step": 1179 + }, + { + "epoch": 0.19118600129617627, + "grad_norm": 1.0895230770111084, + "learning_rate": 4.985122325610651e-06, + "loss": 0.1478, + "step": 1180 + }, + { + "epoch": 0.19134802333117304, + "grad_norm": 1.375070571899414, + "learning_rate": 4.985074650090087e-06, + "loss": 0.1824, + "step": 1181 + }, + { + "epoch": 0.1915100453661698, + "grad_norm": 1.3039453029632568, + "learning_rate": 4.985026898532253e-06, + "loss": 0.1727, + "step": 1182 + }, + { + "epoch": 0.19167206740116655, + "grad_norm": 1.2111849784851074, + "learning_rate": 4.984979070938609e-06, + "loss": 0.1557, + "step": 1183 + }, + { + "epoch": 0.1918340894361633, + "grad_norm": 1.2889275550842285, + "learning_rate": 4.98493116731062e-06, + "loss": 0.1659, + "step": 1184 + }, + { + "epoch": 0.19199611147116008, + "grad_norm": 1.2837518453598022, + "learning_rate": 4.98488318764975e-06, + "loss": 0.1584, + "step": 1185 + }, + { + "epoch": 0.19215813350615685, + "grad_norm": 1.1638296842575073, + "learning_rate": 4.984835131957468e-06, + "loss": 0.1552, + "step": 1186 + }, + { + "epoch": 0.1923201555411536, + "grad_norm": 1.2172795534133911, + "learning_rate": 4.9847870002352435e-06, + "loss": 0.1586, + "step": 1187 + }, + { + "epoch": 0.19248217757615035, + "grad_norm": 1.3987014293670654, + "learning_rate": 4.98473879248455e-06, + "loss": 0.1872, + "step": 1188 + }, + { + "epoch": 0.19264419961114712, + "grad_norm": 1.2352248430252075, + "learning_rate": 4.984690508706863e-06, + "loss": 0.1545, + "step": 1189 + }, + { + "epoch": 0.1928062216461439, + "grad_norm": 1.2587782144546509, + "learning_rate": 4.984642148903659e-06, + "loss": 0.1632, + "step": 1190 + }, + { + "epoch": 0.19296824368114063, + "grad_norm": 1.3047451972961426, + "learning_rate": 4.9845937130764185e-06, + "loss": 0.1715, + "step": 1191 + }, + { + "epoch": 0.1931302657161374, + "grad_norm": 1.393811583518982, + "learning_rate": 4.984545201226623e-06, + "loss": 0.1814, + "step": 1192 + }, + { + "epoch": 0.19329228775113416, + "grad_norm": 1.2711546421051025, + "learning_rate": 4.984496613355756e-06, + "loss": 0.1545, + "step": 1193 + }, + { + "epoch": 0.1934543097861309, + "grad_norm": 1.268141269683838, + "learning_rate": 4.984447949465305e-06, + "loss": 0.1825, + "step": 1194 + }, + { + "epoch": 0.19361633182112767, + "grad_norm": 1.3329243659973145, + "learning_rate": 4.984399209556759e-06, + "loss": 0.2102, + "step": 1195 + }, + { + "epoch": 0.19377835385612444, + "grad_norm": 1.380758285522461, + "learning_rate": 4.98435039363161e-06, + "loss": 0.1462, + "step": 1196 + }, + { + "epoch": 0.1939403758911212, + "grad_norm": 1.252043604850769, + "learning_rate": 4.98430150169135e-06, + "loss": 0.1578, + "step": 1197 + }, + { + "epoch": 0.19410239792611794, + "grad_norm": 1.2641617059707642, + "learning_rate": 4.984252533737477e-06, + "loss": 0.177, + "step": 1198 + }, + { + "epoch": 0.1942644199611147, + "grad_norm": 1.1985224485397339, + "learning_rate": 4.984203489771488e-06, + "loss": 0.1531, + "step": 1199 + }, + { + "epoch": 0.19442644199611148, + "grad_norm": 1.3280110359191895, + "learning_rate": 4.984154369794883e-06, + "loss": 0.181, + "step": 1200 + }, + { + "epoch": 0.19458846403110824, + "grad_norm": 1.196886658668518, + "learning_rate": 4.9841051738091675e-06, + "loss": 0.1658, + "step": 1201 + }, + { + "epoch": 0.19475048606610498, + "grad_norm": 1.1714789867401123, + "learning_rate": 4.984055901815844e-06, + "loss": 0.1547, + "step": 1202 + }, + { + "epoch": 0.19491250810110175, + "grad_norm": 1.390263557434082, + "learning_rate": 4.984006553816421e-06, + "loss": 0.1754, + "step": 1203 + }, + { + "epoch": 0.19507453013609852, + "grad_norm": 1.1799697875976562, + "learning_rate": 4.983957129812409e-06, + "loss": 0.147, + "step": 1204 + }, + { + "epoch": 0.19523655217109526, + "grad_norm": 1.1202303171157837, + "learning_rate": 4.983907629805319e-06, + "loss": 0.1609, + "step": 1205 + }, + { + "epoch": 0.19539857420609202, + "grad_norm": 1.2552509307861328, + "learning_rate": 4.9838580537966676e-06, + "loss": 0.1653, + "step": 1206 + }, + { + "epoch": 0.1955605962410888, + "grad_norm": 1.516097068786621, + "learning_rate": 4.98380840178797e-06, + "loss": 0.1858, + "step": 1207 + }, + { + "epoch": 0.19572261827608556, + "grad_norm": 1.360250473022461, + "learning_rate": 4.983758673780747e-06, + "loss": 0.1848, + "step": 1208 + }, + { + "epoch": 0.1958846403110823, + "grad_norm": 1.223142385482788, + "learning_rate": 4.983708869776518e-06, + "loss": 0.1666, + "step": 1209 + }, + { + "epoch": 0.19604666234607906, + "grad_norm": 1.1533637046813965, + "learning_rate": 4.9836589897768084e-06, + "loss": 0.1673, + "step": 1210 + }, + { + "epoch": 0.19620868438107583, + "grad_norm": 1.2063921689987183, + "learning_rate": 4.983609033783144e-06, + "loss": 0.165, + "step": 1211 + }, + { + "epoch": 0.1963707064160726, + "grad_norm": 1.1458781957626343, + "learning_rate": 4.983559001797054e-06, + "loss": 0.1422, + "step": 1212 + }, + { + "epoch": 0.19653272845106934, + "grad_norm": 1.3407319784164429, + "learning_rate": 4.9835088938200674e-06, + "loss": 0.2003, + "step": 1213 + }, + { + "epoch": 0.1966947504860661, + "grad_norm": 1.2016749382019043, + "learning_rate": 4.983458709853719e-06, + "loss": 0.1774, + "step": 1214 + }, + { + "epoch": 0.19685677252106287, + "grad_norm": 1.1205006837844849, + "learning_rate": 4.983408449899545e-06, + "loss": 0.1567, + "step": 1215 + }, + { + "epoch": 0.1970187945560596, + "grad_norm": 1.1485540866851807, + "learning_rate": 4.9833581139590814e-06, + "loss": 0.1703, + "step": 1216 + }, + { + "epoch": 0.19718081659105638, + "grad_norm": 1.086588978767395, + "learning_rate": 4.983307702033869e-06, + "loss": 0.1514, + "step": 1217 + }, + { + "epoch": 0.19734283862605315, + "grad_norm": 1.2290334701538086, + "learning_rate": 4.983257214125451e-06, + "loss": 0.1737, + "step": 1218 + }, + { + "epoch": 0.1975048606610499, + "grad_norm": 1.3126085996627808, + "learning_rate": 4.98320665023537e-06, + "loss": 0.1807, + "step": 1219 + }, + { + "epoch": 0.19766688269604665, + "grad_norm": 1.2392646074295044, + "learning_rate": 4.9831560103651765e-06, + "loss": 0.184, + "step": 1220 + }, + { + "epoch": 0.19782890473104342, + "grad_norm": 1.257860779762268, + "learning_rate": 4.983105294516418e-06, + "loss": 0.1823, + "step": 1221 + }, + { + "epoch": 0.1979909267660402, + "grad_norm": 1.3138630390167236, + "learning_rate": 4.983054502690646e-06, + "loss": 0.1887, + "step": 1222 + }, + { + "epoch": 0.19815294880103695, + "grad_norm": 1.1415594816207886, + "learning_rate": 4.983003634889415e-06, + "loss": 0.1509, + "step": 1223 + }, + { + "epoch": 0.1983149708360337, + "grad_norm": 1.1062023639678955, + "learning_rate": 4.9829526911142825e-06, + "loss": 0.1551, + "step": 1224 + }, + { + "epoch": 0.19847699287103046, + "grad_norm": 1.069726824760437, + "learning_rate": 4.982901671366805e-06, + "loss": 0.1436, + "step": 1225 + }, + { + "epoch": 0.19863901490602723, + "grad_norm": 1.1246789693832397, + "learning_rate": 4.982850575648545e-06, + "loss": 0.1428, + "step": 1226 + }, + { + "epoch": 0.19880103694102397, + "grad_norm": 1.2873611450195312, + "learning_rate": 4.982799403961067e-06, + "loss": 0.1658, + "step": 1227 + }, + { + "epoch": 0.19896305897602073, + "grad_norm": 1.4319779872894287, + "learning_rate": 4.982748156305934e-06, + "loss": 0.1705, + "step": 1228 + }, + { + "epoch": 0.1991250810110175, + "grad_norm": 1.3176896572113037, + "learning_rate": 4.982696832684716e-06, + "loss": 0.1681, + "step": 1229 + }, + { + "epoch": 0.19928710304601427, + "grad_norm": 1.3779314756393433, + "learning_rate": 4.982645433098984e-06, + "loss": 0.1747, + "step": 1230 + }, + { + "epoch": 0.199449125081011, + "grad_norm": 1.322016954421997, + "learning_rate": 4.982593957550308e-06, + "loss": 0.1696, + "step": 1231 + }, + { + "epoch": 0.19961114711600778, + "grad_norm": 1.2837717533111572, + "learning_rate": 4.982542406040266e-06, + "loss": 0.1536, + "step": 1232 + }, + { + "epoch": 0.19977316915100454, + "grad_norm": 1.2116730213165283, + "learning_rate": 4.982490778570434e-06, + "loss": 0.174, + "step": 1233 + }, + { + "epoch": 0.1999351911860013, + "grad_norm": 1.3564155101776123, + "learning_rate": 4.98243907514239e-06, + "loss": 0.1875, + "step": 1234 + }, + { + "epoch": 0.20009721322099805, + "grad_norm": 1.15635085105896, + "learning_rate": 4.982387295757719e-06, + "loss": 0.1603, + "step": 1235 + }, + { + "epoch": 0.20025923525599482, + "grad_norm": 1.2285187244415283, + "learning_rate": 4.982335440418004e-06, + "loss": 0.1765, + "step": 1236 + }, + { + "epoch": 0.20042125729099158, + "grad_norm": 1.1799185276031494, + "learning_rate": 4.982283509124831e-06, + "loss": 0.1598, + "step": 1237 + }, + { + "epoch": 0.20058327932598832, + "grad_norm": 1.2755019664764404, + "learning_rate": 4.98223150187979e-06, + "loss": 0.1781, + "step": 1238 + }, + { + "epoch": 0.2007453013609851, + "grad_norm": 1.2312284708023071, + "learning_rate": 4.982179418684473e-06, + "loss": 0.1673, + "step": 1239 + }, + { + "epoch": 0.20090732339598186, + "grad_norm": 1.0840716361999512, + "learning_rate": 4.982127259540471e-06, + "loss": 0.1442, + "step": 1240 + }, + { + "epoch": 0.20106934543097862, + "grad_norm": 1.2539925575256348, + "learning_rate": 4.9820750244493825e-06, + "loss": 0.157, + "step": 1241 + }, + { + "epoch": 0.20123136746597536, + "grad_norm": 1.1770260334014893, + "learning_rate": 4.9820227134128045e-06, + "loss": 0.1677, + "step": 1242 + }, + { + "epoch": 0.20139338950097213, + "grad_norm": 1.3164162635803223, + "learning_rate": 4.9819703264323375e-06, + "loss": 0.163, + "step": 1243 + }, + { + "epoch": 0.2015554115359689, + "grad_norm": 1.240263819694519, + "learning_rate": 4.981917863509585e-06, + "loss": 0.1804, + "step": 1244 + }, + { + "epoch": 0.20171743357096567, + "grad_norm": 1.2952622175216675, + "learning_rate": 4.981865324646152e-06, + "loss": 0.1781, + "step": 1245 + }, + { + "epoch": 0.2018794556059624, + "grad_norm": 1.1874738931655884, + "learning_rate": 4.981812709843646e-06, + "loss": 0.1665, + "step": 1246 + }, + { + "epoch": 0.20204147764095917, + "grad_norm": 1.251424789428711, + "learning_rate": 4.981760019103677e-06, + "loss": 0.1743, + "step": 1247 + }, + { + "epoch": 0.20220349967595594, + "grad_norm": 1.243334174156189, + "learning_rate": 4.981707252427857e-06, + "loss": 0.166, + "step": 1248 + }, + { + "epoch": 0.20236552171095268, + "grad_norm": 1.208656668663025, + "learning_rate": 4.981654409817801e-06, + "loss": 0.1696, + "step": 1249 + }, + { + "epoch": 0.20252754374594945, + "grad_norm": 1.227049708366394, + "learning_rate": 4.981601491275125e-06, + "loss": 0.1641, + "step": 1250 + }, + { + "epoch": 0.2026895657809462, + "grad_norm": 1.4524271488189697, + "learning_rate": 4.981548496801449e-06, + "loss": 0.1648, + "step": 1251 + }, + { + "epoch": 0.20285158781594298, + "grad_norm": 1.2255809307098389, + "learning_rate": 4.981495426398395e-06, + "loss": 0.1596, + "step": 1252 + }, + { + "epoch": 0.20301360985093972, + "grad_norm": 1.2606754302978516, + "learning_rate": 4.981442280067585e-06, + "loss": 0.1745, + "step": 1253 + }, + { + "epoch": 0.2031756318859365, + "grad_norm": 1.145128846168518, + "learning_rate": 4.981389057810647e-06, + "loss": 0.1593, + "step": 1254 + }, + { + "epoch": 0.20333765392093325, + "grad_norm": 1.2478601932525635, + "learning_rate": 4.981335759629208e-06, + "loss": 0.1837, + "step": 1255 + }, + { + "epoch": 0.20349967595593, + "grad_norm": 1.1817132234573364, + "learning_rate": 4.9812823855248996e-06, + "loss": 0.152, + "step": 1256 + }, + { + "epoch": 0.20366169799092676, + "grad_norm": 1.2523393630981445, + "learning_rate": 4.981228935499355e-06, + "loss": 0.1685, + "step": 1257 + }, + { + "epoch": 0.20382372002592353, + "grad_norm": 1.1938599348068237, + "learning_rate": 4.98117540955421e-06, + "loss": 0.1645, + "step": 1258 + }, + { + "epoch": 0.2039857420609203, + "grad_norm": 1.2206389904022217, + "learning_rate": 4.981121807691101e-06, + "loss": 0.1622, + "step": 1259 + }, + { + "epoch": 0.20414776409591703, + "grad_norm": 1.2022802829742432, + "learning_rate": 4.981068129911669e-06, + "loss": 0.1563, + "step": 1260 + }, + { + "epoch": 0.2043097861309138, + "grad_norm": 1.244292974472046, + "learning_rate": 4.981014376217556e-06, + "loss": 0.1593, + "step": 1261 + }, + { + "epoch": 0.20447180816591057, + "grad_norm": 1.2590563297271729, + "learning_rate": 4.980960546610408e-06, + "loss": 0.1762, + "step": 1262 + }, + { + "epoch": 0.20463383020090734, + "grad_norm": 1.3502979278564453, + "learning_rate": 4.98090664109187e-06, + "loss": 0.1802, + "step": 1263 + }, + { + "epoch": 0.20479585223590407, + "grad_norm": 1.1320891380310059, + "learning_rate": 4.980852659663593e-06, + "loss": 0.1658, + "step": 1264 + }, + { + "epoch": 0.20495787427090084, + "grad_norm": 1.092449426651001, + "learning_rate": 4.980798602327228e-06, + "loss": 0.1452, + "step": 1265 + }, + { + "epoch": 0.2051198963058976, + "grad_norm": 1.2534480094909668, + "learning_rate": 4.9807444690844296e-06, + "loss": 0.1628, + "step": 1266 + }, + { + "epoch": 0.20528191834089435, + "grad_norm": 1.4840904474258423, + "learning_rate": 4.980690259936853e-06, + "loss": 0.1727, + "step": 1267 + }, + { + "epoch": 0.20544394037589112, + "grad_norm": 1.1676522493362427, + "learning_rate": 4.980635974886158e-06, + "loss": 0.1553, + "step": 1268 + }, + { + "epoch": 0.20560596241088788, + "grad_norm": 1.1992971897125244, + "learning_rate": 4.980581613934005e-06, + "loss": 0.1611, + "step": 1269 + }, + { + "epoch": 0.20576798444588465, + "grad_norm": 1.3434457778930664, + "learning_rate": 4.980527177082058e-06, + "loss": 0.2013, + "step": 1270 + }, + { + "epoch": 0.2059300064808814, + "grad_norm": 1.306654691696167, + "learning_rate": 4.980472664331982e-06, + "loss": 0.196, + "step": 1271 + }, + { + "epoch": 0.20609202851587816, + "grad_norm": 1.187699317932129, + "learning_rate": 4.980418075685445e-06, + "loss": 0.1684, + "step": 1272 + }, + { + "epoch": 0.20625405055087492, + "grad_norm": 1.3280266523361206, + "learning_rate": 4.980363411144117e-06, + "loss": 0.1887, + "step": 1273 + }, + { + "epoch": 0.2064160725858717, + "grad_norm": 1.3123143911361694, + "learning_rate": 4.980308670709671e-06, + "loss": 0.1653, + "step": 1274 + }, + { + "epoch": 0.20657809462086843, + "grad_norm": 1.1524012088775635, + "learning_rate": 4.980253854383782e-06, + "loss": 0.1647, + "step": 1275 + }, + { + "epoch": 0.2067401166558652, + "grad_norm": 1.745810866355896, + "learning_rate": 4.980198962168128e-06, + "loss": 0.1887, + "step": 1276 + }, + { + "epoch": 0.20690213869086196, + "grad_norm": 1.2560205459594727, + "learning_rate": 4.980143994064387e-06, + "loss": 0.1829, + "step": 1277 + }, + { + "epoch": 0.2070641607258587, + "grad_norm": 1.1559257507324219, + "learning_rate": 4.9800889500742415e-06, + "loss": 0.1694, + "step": 1278 + }, + { + "epoch": 0.20722618276085547, + "grad_norm": 1.3864670991897583, + "learning_rate": 4.980033830199376e-06, + "loss": 0.1933, + "step": 1279 + }, + { + "epoch": 0.20738820479585224, + "grad_norm": 1.1323646306991577, + "learning_rate": 4.979978634441477e-06, + "loss": 0.1587, + "step": 1280 + }, + { + "epoch": 0.207550226830849, + "grad_norm": 1.1979477405548096, + "learning_rate": 4.979923362802233e-06, + "loss": 0.1691, + "step": 1281 + }, + { + "epoch": 0.20771224886584574, + "grad_norm": 1.0671766996383667, + "learning_rate": 4.979868015283336e-06, + "loss": 0.1499, + "step": 1282 + }, + { + "epoch": 0.2078742709008425, + "grad_norm": 1.2722365856170654, + "learning_rate": 4.979812591886478e-06, + "loss": 0.1702, + "step": 1283 + }, + { + "epoch": 0.20803629293583928, + "grad_norm": 1.3761050701141357, + "learning_rate": 4.979757092613357e-06, + "loss": 0.1916, + "step": 1284 + }, + { + "epoch": 0.20819831497083605, + "grad_norm": 1.2674570083618164, + "learning_rate": 4.9797015174656685e-06, + "loss": 0.1792, + "step": 1285 + }, + { + "epoch": 0.20836033700583279, + "grad_norm": 1.072959065437317, + "learning_rate": 4.979645866445114e-06, + "loss": 0.1454, + "step": 1286 + }, + { + "epoch": 0.20852235904082955, + "grad_norm": 1.4338332414627075, + "learning_rate": 4.979590139553398e-06, + "loss": 0.1976, + "step": 1287 + }, + { + "epoch": 0.20868438107582632, + "grad_norm": 1.3137916326522827, + "learning_rate": 4.9795343367922235e-06, + "loss": 0.1822, + "step": 1288 + }, + { + "epoch": 0.20884640311082306, + "grad_norm": 1.1669158935546875, + "learning_rate": 4.9794784581632986e-06, + "loss": 0.1585, + "step": 1289 + }, + { + "epoch": 0.20900842514581983, + "grad_norm": 1.2235841751098633, + "learning_rate": 4.979422503668334e-06, + "loss": 0.1558, + "step": 1290 + }, + { + "epoch": 0.2091704471808166, + "grad_norm": 1.178096890449524, + "learning_rate": 4.97936647330904e-06, + "loss": 0.1593, + "step": 1291 + }, + { + "epoch": 0.20933246921581336, + "grad_norm": 1.1539371013641357, + "learning_rate": 4.979310367087132e-06, + "loss": 0.1526, + "step": 1292 + }, + { + "epoch": 0.2094944912508101, + "grad_norm": 1.1756463050842285, + "learning_rate": 4.979254185004327e-06, + "loss": 0.1486, + "step": 1293 + }, + { + "epoch": 0.20965651328580687, + "grad_norm": 1.4390196800231934, + "learning_rate": 4.979197927062343e-06, + "loss": 0.1937, + "step": 1294 + }, + { + "epoch": 0.20981853532080363, + "grad_norm": 1.195878505706787, + "learning_rate": 4.979141593262902e-06, + "loss": 0.1656, + "step": 1295 + }, + { + "epoch": 0.2099805573558004, + "grad_norm": 1.1376088857650757, + "learning_rate": 4.979085183607728e-06, + "loss": 0.1668, + "step": 1296 + }, + { + "epoch": 0.21014257939079714, + "grad_norm": 1.0837515592575073, + "learning_rate": 4.979028698098546e-06, + "loss": 0.1448, + "step": 1297 + }, + { + "epoch": 0.2103046014257939, + "grad_norm": 1.2839235067367554, + "learning_rate": 4.978972136737086e-06, + "loss": 0.1886, + "step": 1298 + }, + { + "epoch": 0.21046662346079067, + "grad_norm": 1.239888072013855, + "learning_rate": 4.978915499525077e-06, + "loss": 0.175, + "step": 1299 + }, + { + "epoch": 0.21062864549578741, + "grad_norm": 1.2794440984725952, + "learning_rate": 4.978858786464252e-06, + "loss": 0.1718, + "step": 1300 + }, + { + "epoch": 0.21079066753078418, + "grad_norm": 1.2917277812957764, + "learning_rate": 4.978801997556348e-06, + "loss": 0.1761, + "step": 1301 + }, + { + "epoch": 0.21095268956578095, + "grad_norm": 1.2395626306533813, + "learning_rate": 4.978745132803101e-06, + "loss": 0.1647, + "step": 1302 + }, + { + "epoch": 0.21111471160077772, + "grad_norm": 1.2468279600143433, + "learning_rate": 4.9786881922062515e-06, + "loss": 0.1574, + "step": 1303 + }, + { + "epoch": 0.21127673363577446, + "grad_norm": 1.3926787376403809, + "learning_rate": 4.9786311757675425e-06, + "loss": 0.2085, + "step": 1304 + }, + { + "epoch": 0.21143875567077122, + "grad_norm": 1.1423304080963135, + "learning_rate": 4.978574083488716e-06, + "loss": 0.1471, + "step": 1305 + }, + { + "epoch": 0.211600777705768, + "grad_norm": 1.2583112716674805, + "learning_rate": 4.978516915371522e-06, + "loss": 0.1673, + "step": 1306 + }, + { + "epoch": 0.21176279974076476, + "grad_norm": 1.2514091730117798, + "learning_rate": 4.978459671417707e-06, + "loss": 0.1794, + "step": 1307 + }, + { + "epoch": 0.2119248217757615, + "grad_norm": 1.5359858274459839, + "learning_rate": 4.978402351629024e-06, + "loss": 0.1582, + "step": 1308 + }, + { + "epoch": 0.21208684381075826, + "grad_norm": 1.161242127418518, + "learning_rate": 4.978344956007227e-06, + "loss": 0.1542, + "step": 1309 + }, + { + "epoch": 0.21224886584575503, + "grad_norm": 1.2749699354171753, + "learning_rate": 4.9782874845540715e-06, + "loss": 0.1607, + "step": 1310 + }, + { + "epoch": 0.21241088788075177, + "grad_norm": 1.3189657926559448, + "learning_rate": 4.978229937271317e-06, + "loss": 0.1844, + "step": 1311 + }, + { + "epoch": 0.21257290991574854, + "grad_norm": 1.31024169921875, + "learning_rate": 4.978172314160724e-06, + "loss": 0.1992, + "step": 1312 + }, + { + "epoch": 0.2127349319507453, + "grad_norm": 1.2073999643325806, + "learning_rate": 4.978114615224055e-06, + "loss": 0.1685, + "step": 1313 + }, + { + "epoch": 0.21289695398574207, + "grad_norm": 1.292680263519287, + "learning_rate": 4.9780568404630746e-06, + "loss": 0.1758, + "step": 1314 + }, + { + "epoch": 0.2130589760207388, + "grad_norm": 1.2444231510162354, + "learning_rate": 4.977998989879552e-06, + "loss": 0.1771, + "step": 1315 + }, + { + "epoch": 0.21322099805573558, + "grad_norm": 1.1644980907440186, + "learning_rate": 4.977941063475258e-06, + "loss": 0.1637, + "step": 1316 + }, + { + "epoch": 0.21338302009073234, + "grad_norm": 1.2748589515686035, + "learning_rate": 4.977883061251962e-06, + "loss": 0.1813, + "step": 1317 + }, + { + "epoch": 0.2135450421257291, + "grad_norm": 1.2317109107971191, + "learning_rate": 4.977824983211443e-06, + "loss": 0.1654, + "step": 1318 + }, + { + "epoch": 0.21370706416072585, + "grad_norm": 1.2727543115615845, + "learning_rate": 4.977766829355474e-06, + "loss": 0.1848, + "step": 1319 + }, + { + "epoch": 0.21386908619572262, + "grad_norm": 1.2605267763137817, + "learning_rate": 4.977708599685837e-06, + "loss": 0.1881, + "step": 1320 + }, + { + "epoch": 0.21403110823071939, + "grad_norm": 1.2069200277328491, + "learning_rate": 4.977650294204313e-06, + "loss": 0.1601, + "step": 1321 + }, + { + "epoch": 0.21419313026571613, + "grad_norm": 1.2258086204528809, + "learning_rate": 4.977591912912685e-06, + "loss": 0.1709, + "step": 1322 + }, + { + "epoch": 0.2143551523007129, + "grad_norm": 1.255131721496582, + "learning_rate": 4.977533455812741e-06, + "loss": 0.173, + "step": 1323 + }, + { + "epoch": 0.21451717433570966, + "grad_norm": 1.2011017799377441, + "learning_rate": 4.977474922906268e-06, + "loss": 0.1784, + "step": 1324 + }, + { + "epoch": 0.21467919637070643, + "grad_norm": 1.1094778776168823, + "learning_rate": 4.977416314195058e-06, + "loss": 0.1653, + "step": 1325 + }, + { + "epoch": 0.21484121840570317, + "grad_norm": 1.3300212621688843, + "learning_rate": 4.977357629680903e-06, + "loss": 0.178, + "step": 1326 + }, + { + "epoch": 0.21500324044069993, + "grad_norm": 1.190017819404602, + "learning_rate": 4.977298869365601e-06, + "loss": 0.1671, + "step": 1327 + }, + { + "epoch": 0.2151652624756967, + "grad_norm": 1.2279374599456787, + "learning_rate": 4.977240033250948e-06, + "loss": 0.1738, + "step": 1328 + }, + { + "epoch": 0.21532728451069347, + "grad_norm": 1.1839739084243774, + "learning_rate": 4.977181121338745e-06, + "loss": 0.1663, + "step": 1329 + }, + { + "epoch": 0.2154893065456902, + "grad_norm": 1.1655634641647339, + "learning_rate": 4.977122133630795e-06, + "loss": 0.148, + "step": 1330 + }, + { + "epoch": 0.21565132858068697, + "grad_norm": 1.151414394378662, + "learning_rate": 4.977063070128902e-06, + "loss": 0.1716, + "step": 1331 + }, + { + "epoch": 0.21581335061568374, + "grad_norm": 1.125185489654541, + "learning_rate": 4.9770039308348725e-06, + "loss": 0.1489, + "step": 1332 + }, + { + "epoch": 0.21597537265068048, + "grad_norm": 1.1280821561813354, + "learning_rate": 4.976944715750517e-06, + "loss": 0.1624, + "step": 1333 + }, + { + "epoch": 0.21613739468567725, + "grad_norm": 1.2168023586273193, + "learning_rate": 4.9768854248776475e-06, + "loss": 0.168, + "step": 1334 + }, + { + "epoch": 0.21629941672067401, + "grad_norm": 1.1852880716323853, + "learning_rate": 4.976826058218079e-06, + "loss": 0.1585, + "step": 1335 + }, + { + "epoch": 0.21646143875567078, + "grad_norm": 1.210832953453064, + "learning_rate": 4.976766615773626e-06, + "loss": 0.1665, + "step": 1336 + }, + { + "epoch": 0.21662346079066752, + "grad_norm": 1.44858980178833, + "learning_rate": 4.9767070975461075e-06, + "loss": 0.1918, + "step": 1337 + }, + { + "epoch": 0.2167854828256643, + "grad_norm": 1.1260679960250854, + "learning_rate": 4.976647503537347e-06, + "loss": 0.1658, + "step": 1338 + }, + { + "epoch": 0.21694750486066106, + "grad_norm": 1.1670085191726685, + "learning_rate": 4.976587833749164e-06, + "loss": 0.1604, + "step": 1339 + }, + { + "epoch": 0.21710952689565782, + "grad_norm": 1.2441238164901733, + "learning_rate": 4.9765280881833885e-06, + "loss": 0.1674, + "step": 1340 + }, + { + "epoch": 0.21727154893065456, + "grad_norm": 1.1147247552871704, + "learning_rate": 4.976468266841846e-06, + "loss": 0.1441, + "step": 1341 + }, + { + "epoch": 0.21743357096565133, + "grad_norm": 1.1170032024383545, + "learning_rate": 4.976408369726368e-06, + "loss": 0.153, + "step": 1342 + }, + { + "epoch": 0.2175955930006481, + "grad_norm": 1.2686792612075806, + "learning_rate": 4.976348396838786e-06, + "loss": 0.1759, + "step": 1343 + }, + { + "epoch": 0.21775761503564484, + "grad_norm": 1.1871846914291382, + "learning_rate": 4.976288348180935e-06, + "loss": 0.1524, + "step": 1344 + }, + { + "epoch": 0.2179196370706416, + "grad_norm": 1.2280350923538208, + "learning_rate": 4.976228223754654e-06, + "loss": 0.1729, + "step": 1345 + }, + { + "epoch": 0.21808165910563837, + "grad_norm": 1.1394003629684448, + "learning_rate": 4.976168023561782e-06, + "loss": 0.1569, + "step": 1346 + }, + { + "epoch": 0.21824368114063514, + "grad_norm": 1.1886025667190552, + "learning_rate": 4.976107747604161e-06, + "loss": 0.1539, + "step": 1347 + }, + { + "epoch": 0.21840570317563188, + "grad_norm": 1.199474811553955, + "learning_rate": 4.976047395883634e-06, + "loss": 0.1721, + "step": 1348 + }, + { + "epoch": 0.21856772521062864, + "grad_norm": 1.3025383949279785, + "learning_rate": 4.975986968402048e-06, + "loss": 0.1944, + "step": 1349 + }, + { + "epoch": 0.2187297472456254, + "grad_norm": 1.2892481088638306, + "learning_rate": 4.975926465161254e-06, + "loss": 0.1628, + "step": 1350 + }, + { + "epoch": 0.21889176928062218, + "grad_norm": 1.1720067262649536, + "learning_rate": 4.975865886163101e-06, + "loss": 0.1611, + "step": 1351 + }, + { + "epoch": 0.21905379131561892, + "grad_norm": 1.284433364868164, + "learning_rate": 4.975805231409444e-06, + "loss": 0.1986, + "step": 1352 + }, + { + "epoch": 0.21921581335061568, + "grad_norm": 1.3091256618499756, + "learning_rate": 4.975744500902138e-06, + "loss": 0.173, + "step": 1353 + }, + { + "epoch": 0.21937783538561245, + "grad_norm": 1.1260159015655518, + "learning_rate": 4.975683694643041e-06, + "loss": 0.1485, + "step": 1354 + }, + { + "epoch": 0.2195398574206092, + "grad_norm": 1.1246238946914673, + "learning_rate": 4.975622812634014e-06, + "loss": 0.1565, + "step": 1355 + }, + { + "epoch": 0.21970187945560596, + "grad_norm": 1.2171014547348022, + "learning_rate": 4.97556185487692e-06, + "loss": 0.1701, + "step": 1356 + }, + { + "epoch": 0.21986390149060273, + "grad_norm": 1.202234148979187, + "learning_rate": 4.975500821373624e-06, + "loss": 0.146, + "step": 1357 + }, + { + "epoch": 0.2200259235255995, + "grad_norm": 1.1403602361679077, + "learning_rate": 4.9754397121259935e-06, + "loss": 0.1585, + "step": 1358 + }, + { + "epoch": 0.22018794556059623, + "grad_norm": 1.2086261510849, + "learning_rate": 4.975378527135899e-06, + "loss": 0.166, + "step": 1359 + }, + { + "epoch": 0.220349967595593, + "grad_norm": 1.2435903549194336, + "learning_rate": 4.975317266405211e-06, + "loss": 0.1783, + "step": 1360 + }, + { + "epoch": 0.22051198963058977, + "grad_norm": 1.0744258165359497, + "learning_rate": 4.975255929935805e-06, + "loss": 0.1591, + "step": 1361 + }, + { + "epoch": 0.22067401166558653, + "grad_norm": 1.2490445375442505, + "learning_rate": 4.975194517729557e-06, + "loss": 0.1816, + "step": 1362 + }, + { + "epoch": 0.22083603370058327, + "grad_norm": 1.1408127546310425, + "learning_rate": 4.975133029788347e-06, + "loss": 0.1566, + "step": 1363 + }, + { + "epoch": 0.22099805573558004, + "grad_norm": 1.2081525325775146, + "learning_rate": 4.975071466114057e-06, + "loss": 0.1567, + "step": 1364 + }, + { + "epoch": 0.2211600777705768, + "grad_norm": 1.1004225015640259, + "learning_rate": 4.975009826708568e-06, + "loss": 0.1483, + "step": 1365 + }, + { + "epoch": 0.22132209980557355, + "grad_norm": 1.3118537664413452, + "learning_rate": 4.974948111573768e-06, + "loss": 0.1818, + "step": 1366 + }, + { + "epoch": 0.2214841218405703, + "grad_norm": 1.1786134243011475, + "learning_rate": 4.974886320711546e-06, + "loss": 0.1716, + "step": 1367 + }, + { + "epoch": 0.22164614387556708, + "grad_norm": 1.2705552577972412, + "learning_rate": 4.9748244541237915e-06, + "loss": 0.1852, + "step": 1368 + }, + { + "epoch": 0.22180816591056385, + "grad_norm": 1.1799037456512451, + "learning_rate": 4.974762511812398e-06, + "loss": 0.1414, + "step": 1369 + }, + { + "epoch": 0.2219701879455606, + "grad_norm": 1.1992617845535278, + "learning_rate": 4.97470049377926e-06, + "loss": 0.1662, + "step": 1370 + }, + { + "epoch": 0.22213220998055735, + "grad_norm": 1.1669738292694092, + "learning_rate": 4.974638400026275e-06, + "loss": 0.1544, + "step": 1371 + }, + { + "epoch": 0.22229423201555412, + "grad_norm": 1.2163314819335938, + "learning_rate": 4.974576230555344e-06, + "loss": 0.1643, + "step": 1372 + }, + { + "epoch": 0.22245625405055086, + "grad_norm": 1.3139532804489136, + "learning_rate": 4.9745139853683685e-06, + "loss": 0.1617, + "step": 1373 + }, + { + "epoch": 0.22261827608554763, + "grad_norm": 1.1525776386260986, + "learning_rate": 4.974451664467253e-06, + "loss": 0.1752, + "step": 1374 + }, + { + "epoch": 0.2227802981205444, + "grad_norm": 1.110068917274475, + "learning_rate": 4.974389267853905e-06, + "loss": 0.1511, + "step": 1375 + }, + { + "epoch": 0.22294232015554116, + "grad_norm": 1.1273125410079956, + "learning_rate": 4.974326795530234e-06, + "loss": 0.1528, + "step": 1376 + }, + { + "epoch": 0.2231043421905379, + "grad_norm": 1.3362386226654053, + "learning_rate": 4.97426424749815e-06, + "loss": 0.2061, + "step": 1377 + }, + { + "epoch": 0.22326636422553467, + "grad_norm": 1.0781415700912476, + "learning_rate": 4.974201623759568e-06, + "loss": 0.1522, + "step": 1378 + }, + { + "epoch": 0.22342838626053144, + "grad_norm": 1.1233181953430176, + "learning_rate": 4.974138924316403e-06, + "loss": 0.1601, + "step": 1379 + }, + { + "epoch": 0.2235904082955282, + "grad_norm": 1.0733990669250488, + "learning_rate": 4.974076149170575e-06, + "loss": 0.1613, + "step": 1380 + }, + { + "epoch": 0.22375243033052494, + "grad_norm": 0.9839252233505249, + "learning_rate": 4.9740132983240036e-06, + "loss": 0.1275, + "step": 1381 + }, + { + "epoch": 0.2239144523655217, + "grad_norm": 1.2110671997070312, + "learning_rate": 4.973950371778612e-06, + "loss": 0.1764, + "step": 1382 + }, + { + "epoch": 0.22407647440051848, + "grad_norm": 1.2692327499389648, + "learning_rate": 4.973887369536326e-06, + "loss": 0.1741, + "step": 1383 + }, + { + "epoch": 0.22423849643551522, + "grad_norm": 1.226586937904358, + "learning_rate": 4.973824291599074e-06, + "loss": 0.1578, + "step": 1384 + }, + { + "epoch": 0.22440051847051198, + "grad_norm": 1.1944162845611572, + "learning_rate": 4.973761137968784e-06, + "loss": 0.1543, + "step": 1385 + }, + { + "epoch": 0.22456254050550875, + "grad_norm": 1.3723398447036743, + "learning_rate": 4.973697908647391e-06, + "loss": 0.1834, + "step": 1386 + }, + { + "epoch": 0.22472456254050552, + "grad_norm": 1.196521520614624, + "learning_rate": 4.973634603636828e-06, + "loss": 0.1519, + "step": 1387 + }, + { + "epoch": 0.22488658457550226, + "grad_norm": 1.104246735572815, + "learning_rate": 4.973571222939031e-06, + "loss": 0.1552, + "step": 1388 + }, + { + "epoch": 0.22504860661049902, + "grad_norm": 1.137224555015564, + "learning_rate": 4.973507766555941e-06, + "loss": 0.1628, + "step": 1389 + }, + { + "epoch": 0.2252106286454958, + "grad_norm": 1.2728477716445923, + "learning_rate": 4.973444234489499e-06, + "loss": 0.1687, + "step": 1390 + }, + { + "epoch": 0.22537265068049256, + "grad_norm": 1.2384727001190186, + "learning_rate": 4.97338062674165e-06, + "loss": 0.1916, + "step": 1391 + }, + { + "epoch": 0.2255346727154893, + "grad_norm": 1.280981183052063, + "learning_rate": 4.973316943314338e-06, + "loss": 0.1612, + "step": 1392 + }, + { + "epoch": 0.22569669475048607, + "grad_norm": 1.1161082983016968, + "learning_rate": 4.9732531842095135e-06, + "loss": 0.1507, + "step": 1393 + }, + { + "epoch": 0.22585871678548283, + "grad_norm": 1.3085819482803345, + "learning_rate": 4.9731893494291275e-06, + "loss": 0.192, + "step": 1394 + }, + { + "epoch": 0.22602073882047957, + "grad_norm": 1.1078639030456543, + "learning_rate": 4.973125438975131e-06, + "loss": 0.1516, + "step": 1395 + }, + { + "epoch": 0.22618276085547634, + "grad_norm": 1.1671632528305054, + "learning_rate": 4.973061452849481e-06, + "loss": 0.1693, + "step": 1396 + }, + { + "epoch": 0.2263447828904731, + "grad_norm": 1.103091835975647, + "learning_rate": 4.9729973910541365e-06, + "loss": 0.1415, + "step": 1397 + }, + { + "epoch": 0.22650680492546987, + "grad_norm": 1.1498920917510986, + "learning_rate": 4.972933253591056e-06, + "loss": 0.1562, + "step": 1398 + }, + { + "epoch": 0.2266688269604666, + "grad_norm": 1.1860241889953613, + "learning_rate": 4.972869040462202e-06, + "loss": 0.1641, + "step": 1399 + }, + { + "epoch": 0.22683084899546338, + "grad_norm": 1.2506409883499146, + "learning_rate": 4.972804751669539e-06, + "loss": 0.1622, + "step": 1400 + }, + { + "epoch": 0.22699287103046015, + "grad_norm": 1.1482832431793213, + "learning_rate": 4.9727403872150345e-06, + "loss": 0.1506, + "step": 1401 + }, + { + "epoch": 0.22715489306545691, + "grad_norm": 1.3503156900405884, + "learning_rate": 4.972675947100659e-06, + "loss": 0.2058, + "step": 1402 + }, + { + "epoch": 0.22731691510045365, + "grad_norm": 1.2772489786148071, + "learning_rate": 4.972611431328384e-06, + "loss": 0.1633, + "step": 1403 + }, + { + "epoch": 0.22747893713545042, + "grad_norm": 1.2101973295211792, + "learning_rate": 4.972546839900181e-06, + "loss": 0.1793, + "step": 1404 + }, + { + "epoch": 0.2276409591704472, + "grad_norm": 1.225361943244934, + "learning_rate": 4.972482172818029e-06, + "loss": 0.174, + "step": 1405 + }, + { + "epoch": 0.22780298120544393, + "grad_norm": 1.211272120475769, + "learning_rate": 4.972417430083906e-06, + "loss": 0.1821, + "step": 1406 + }, + { + "epoch": 0.2279650032404407, + "grad_norm": 1.1716586351394653, + "learning_rate": 4.9723526116997925e-06, + "loss": 0.1567, + "step": 1407 + }, + { + "epoch": 0.22812702527543746, + "grad_norm": 1.2331879138946533, + "learning_rate": 4.972287717667672e-06, + "loss": 0.1849, + "step": 1408 + }, + { + "epoch": 0.22828904731043423, + "grad_norm": 1.1068236827850342, + "learning_rate": 4.972222747989531e-06, + "loss": 0.1506, + "step": 1409 + }, + { + "epoch": 0.22845106934543097, + "grad_norm": 1.0929367542266846, + "learning_rate": 4.972157702667356e-06, + "loss": 0.1625, + "step": 1410 + }, + { + "epoch": 0.22861309138042774, + "grad_norm": 1.1465107202529907, + "learning_rate": 4.972092581703138e-06, + "loss": 0.1676, + "step": 1411 + }, + { + "epoch": 0.2287751134154245, + "grad_norm": 1.1976622343063354, + "learning_rate": 4.972027385098868e-06, + "loss": 0.1546, + "step": 1412 + }, + { + "epoch": 0.22893713545042127, + "grad_norm": 1.201499104499817, + "learning_rate": 4.971962112856543e-06, + "loss": 0.1639, + "step": 1413 + }, + { + "epoch": 0.229099157485418, + "grad_norm": 1.1784822940826416, + "learning_rate": 4.97189676497816e-06, + "loss": 0.1709, + "step": 1414 + }, + { + "epoch": 0.22926117952041478, + "grad_norm": 1.2148628234863281, + "learning_rate": 4.971831341465718e-06, + "loss": 0.1641, + "step": 1415 + }, + { + "epoch": 0.22942320155541154, + "grad_norm": 1.1798665523529053, + "learning_rate": 4.971765842321218e-06, + "loss": 0.1761, + "step": 1416 + }, + { + "epoch": 0.22958522359040828, + "grad_norm": 1.0728332996368408, + "learning_rate": 4.9717002675466645e-06, + "loss": 0.1542, + "step": 1417 + }, + { + "epoch": 0.22974724562540505, + "grad_norm": 1.094524621963501, + "learning_rate": 4.971634617144065e-06, + "loss": 0.1603, + "step": 1418 + }, + { + "epoch": 0.22990926766040182, + "grad_norm": 1.1839401721954346, + "learning_rate": 4.971568891115427e-06, + "loss": 0.1517, + "step": 1419 + }, + { + "epoch": 0.23007128969539858, + "grad_norm": 1.2270963191986084, + "learning_rate": 4.971503089462762e-06, + "loss": 0.1751, + "step": 1420 + }, + { + "epoch": 0.23023331173039532, + "grad_norm": 1.342659831047058, + "learning_rate": 4.971437212188084e-06, + "loss": 0.1662, + "step": 1421 + }, + { + "epoch": 0.2303953337653921, + "grad_norm": 1.0489001274108887, + "learning_rate": 4.9713712592934075e-06, + "loss": 0.1384, + "step": 1422 + }, + { + "epoch": 0.23055735580038886, + "grad_norm": 1.2548742294311523, + "learning_rate": 4.971305230780751e-06, + "loss": 0.1829, + "step": 1423 + }, + { + "epoch": 0.23071937783538563, + "grad_norm": 1.1241710186004639, + "learning_rate": 4.971239126652135e-06, + "loss": 0.1583, + "step": 1424 + }, + { + "epoch": 0.23088139987038236, + "grad_norm": 1.1561769247055054, + "learning_rate": 4.971172946909582e-06, + "loss": 0.1747, + "step": 1425 + }, + { + "epoch": 0.23104342190537913, + "grad_norm": 1.1572365760803223, + "learning_rate": 4.971106691555116e-06, + "loss": 0.1447, + "step": 1426 + }, + { + "epoch": 0.2312054439403759, + "grad_norm": 1.766800880432129, + "learning_rate": 4.971040360590767e-06, + "loss": 0.1769, + "step": 1427 + }, + { + "epoch": 0.23136746597537264, + "grad_norm": 1.1585261821746826, + "learning_rate": 4.9709739540185616e-06, + "loss": 0.1782, + "step": 1428 + }, + { + "epoch": 0.2315294880103694, + "grad_norm": 0.9941073656082153, + "learning_rate": 4.9709074718405335e-06, + "loss": 0.1433, + "step": 1429 + }, + { + "epoch": 0.23169151004536617, + "grad_norm": 1.3293631076812744, + "learning_rate": 4.970840914058716e-06, + "loss": 0.1575, + "step": 1430 + }, + { + "epoch": 0.23185353208036294, + "grad_norm": 1.0774224996566772, + "learning_rate": 4.970774280675146e-06, + "loss": 0.1464, + "step": 1431 + }, + { + "epoch": 0.23201555411535968, + "grad_norm": 1.1900949478149414, + "learning_rate": 4.970707571691862e-06, + "loss": 0.1418, + "step": 1432 + }, + { + "epoch": 0.23217757615035645, + "grad_norm": 1.2267847061157227, + "learning_rate": 4.9706407871109056e-06, + "loss": 0.1644, + "step": 1433 + }, + { + "epoch": 0.2323395981853532, + "grad_norm": 1.3006565570831299, + "learning_rate": 4.970573926934319e-06, + "loss": 0.1772, + "step": 1434 + }, + { + "epoch": 0.23250162022034998, + "grad_norm": 1.324775218963623, + "learning_rate": 4.97050699116415e-06, + "loss": 0.1835, + "step": 1435 + }, + { + "epoch": 0.23266364225534672, + "grad_norm": 1.1360862255096436, + "learning_rate": 4.970439979802445e-06, + "loss": 0.1578, + "step": 1436 + }, + { + "epoch": 0.2328256642903435, + "grad_norm": 1.1616891622543335, + "learning_rate": 4.970372892851255e-06, + "loss": 0.1525, + "step": 1437 + }, + { + "epoch": 0.23298768632534025, + "grad_norm": 1.2094240188598633, + "learning_rate": 4.970305730312632e-06, + "loss": 0.1678, + "step": 1438 + }, + { + "epoch": 0.233149708360337, + "grad_norm": 1.3469047546386719, + "learning_rate": 4.970238492188633e-06, + "loss": 0.1741, + "step": 1439 + }, + { + "epoch": 0.23331173039533376, + "grad_norm": 1.2809699773788452, + "learning_rate": 4.9701711784813135e-06, + "loss": 0.1724, + "step": 1440 + }, + { + "epoch": 0.23347375243033053, + "grad_norm": 1.1660102605819702, + "learning_rate": 4.970103789192734e-06, + "loss": 0.1461, + "step": 1441 + }, + { + "epoch": 0.2336357744653273, + "grad_norm": 1.1002432107925415, + "learning_rate": 4.970036324324955e-06, + "loss": 0.1546, + "step": 1442 + }, + { + "epoch": 0.23379779650032403, + "grad_norm": 1.0596157312393188, + "learning_rate": 4.9699687838800425e-06, + "loss": 0.1511, + "step": 1443 + }, + { + "epoch": 0.2339598185353208, + "grad_norm": 1.231737732887268, + "learning_rate": 4.969901167860063e-06, + "loss": 0.1683, + "step": 1444 + }, + { + "epoch": 0.23412184057031757, + "grad_norm": 1.239596962928772, + "learning_rate": 4.969833476267084e-06, + "loss": 0.1763, + "step": 1445 + }, + { + "epoch": 0.23428386260531434, + "grad_norm": 1.0822583436965942, + "learning_rate": 4.969765709103177e-06, + "loss": 0.1484, + "step": 1446 + }, + { + "epoch": 0.23444588464031108, + "grad_norm": 1.0738798379898071, + "learning_rate": 4.969697866370417e-06, + "loss": 0.1508, + "step": 1447 + }, + { + "epoch": 0.23460790667530784, + "grad_norm": 1.1625072956085205, + "learning_rate": 4.9696299480708785e-06, + "loss": 0.1582, + "step": 1448 + }, + { + "epoch": 0.2347699287103046, + "grad_norm": 1.0520604848861694, + "learning_rate": 4.969561954206641e-06, + "loss": 0.1381, + "step": 1449 + }, + { + "epoch": 0.23493195074530135, + "grad_norm": 1.2117533683776855, + "learning_rate": 4.969493884779783e-06, + "loss": 0.1823, + "step": 1450 + }, + { + "epoch": 0.23509397278029812, + "grad_norm": 1.217532753944397, + "learning_rate": 4.969425739792388e-06, + "loss": 0.1778, + "step": 1451 + }, + { + "epoch": 0.23525599481529488, + "grad_norm": 1.1805583238601685, + "learning_rate": 4.969357519246542e-06, + "loss": 0.164, + "step": 1452 + }, + { + "epoch": 0.23541801685029165, + "grad_norm": 1.243613600730896, + "learning_rate": 4.96928922314433e-06, + "loss": 0.1634, + "step": 1453 + }, + { + "epoch": 0.2355800388852884, + "grad_norm": 1.224203109741211, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.1688, + "step": 1454 + }, + { + "epoch": 0.23574206092028516, + "grad_norm": 1.2090548276901245, + "learning_rate": 4.9691524042791765e-06, + "loss": 0.178, + "step": 1455 + }, + { + "epoch": 0.23590408295528192, + "grad_norm": 1.1282353401184082, + "learning_rate": 4.96908388152042e-06, + "loss": 0.1615, + "step": 1456 + }, + { + "epoch": 0.2360661049902787, + "grad_norm": 1.0806899070739746, + "learning_rate": 4.9690152832136715e-06, + "loss": 0.1528, + "step": 1457 + }, + { + "epoch": 0.23622812702527543, + "grad_norm": 1.3501883745193481, + "learning_rate": 4.968946609361031e-06, + "loss": 0.1745, + "step": 1458 + }, + { + "epoch": 0.2363901490602722, + "grad_norm": 1.1516779661178589, + "learning_rate": 4.968877859964598e-06, + "loss": 0.1427, + "step": 1459 + }, + { + "epoch": 0.23655217109526896, + "grad_norm": 1.0786315202713013, + "learning_rate": 4.968809035026477e-06, + "loss": 0.1552, + "step": 1460 + }, + { + "epoch": 0.2367141931302657, + "grad_norm": 1.1482332944869995, + "learning_rate": 4.968740134548775e-06, + "loss": 0.168, + "step": 1461 + }, + { + "epoch": 0.23687621516526247, + "grad_norm": 1.3289909362792969, + "learning_rate": 4.968671158533599e-06, + "loss": 0.1726, + "step": 1462 + }, + { + "epoch": 0.23703823720025924, + "grad_norm": 1.1058714389801025, + "learning_rate": 4.968602106983059e-06, + "loss": 0.1569, + "step": 1463 + }, + { + "epoch": 0.237200259235256, + "grad_norm": 1.113430142402649, + "learning_rate": 4.968532979899269e-06, + "loss": 0.1542, + "step": 1464 + }, + { + "epoch": 0.23736228127025275, + "grad_norm": 1.465322732925415, + "learning_rate": 4.968463777284342e-06, + "loss": 0.1846, + "step": 1465 + }, + { + "epoch": 0.2375243033052495, + "grad_norm": 1.1434563398361206, + "learning_rate": 4.9683944991403985e-06, + "loss": 0.1574, + "step": 1466 + }, + { + "epoch": 0.23768632534024628, + "grad_norm": 1.3530895709991455, + "learning_rate": 4.9683251454695565e-06, + "loss": 0.1646, + "step": 1467 + }, + { + "epoch": 0.23784834737524305, + "grad_norm": 1.193445086479187, + "learning_rate": 4.968255716273938e-06, + "loss": 0.1696, + "step": 1468 + }, + { + "epoch": 0.23801036941023979, + "grad_norm": 1.2225266695022583, + "learning_rate": 4.968186211555668e-06, + "loss": 0.1669, + "step": 1469 + }, + { + "epoch": 0.23817239144523655, + "grad_norm": 1.0870074033737183, + "learning_rate": 4.968116631316873e-06, + "loss": 0.1507, + "step": 1470 + }, + { + "epoch": 0.23833441348023332, + "grad_norm": 1.3210666179656982, + "learning_rate": 4.968046975559681e-06, + "loss": 0.1747, + "step": 1471 + }, + { + "epoch": 0.23849643551523006, + "grad_norm": 1.0983004570007324, + "learning_rate": 4.967977244286225e-06, + "loss": 0.1512, + "step": 1472 + }, + { + "epoch": 0.23865845755022683, + "grad_norm": 1.1378045082092285, + "learning_rate": 4.9679074374986365e-06, + "loss": 0.1599, + "step": 1473 + }, + { + "epoch": 0.2388204795852236, + "grad_norm": 0.968702495098114, + "learning_rate": 4.967837555199054e-06, + "loss": 0.1294, + "step": 1474 + }, + { + "epoch": 0.23898250162022036, + "grad_norm": 1.1442790031433105, + "learning_rate": 4.967767597389613e-06, + "loss": 0.1666, + "step": 1475 + }, + { + "epoch": 0.2391445236552171, + "grad_norm": 1.2723668813705444, + "learning_rate": 4.967697564072457e-06, + "loss": 0.1767, + "step": 1476 + }, + { + "epoch": 0.23930654569021387, + "grad_norm": 1.1569876670837402, + "learning_rate": 4.967627455249726e-06, + "loss": 0.1688, + "step": 1477 + }, + { + "epoch": 0.23946856772521063, + "grad_norm": 1.2262758016586304, + "learning_rate": 4.9675572709235665e-06, + "loss": 0.1873, + "step": 1478 + }, + { + "epoch": 0.2396305897602074, + "grad_norm": 1.2375082969665527, + "learning_rate": 4.967487011096126e-06, + "loss": 0.1713, + "step": 1479 + }, + { + "epoch": 0.23979261179520414, + "grad_norm": 1.2592830657958984, + "learning_rate": 4.967416675769555e-06, + "loss": 0.1847, + "step": 1480 + }, + { + "epoch": 0.2399546338302009, + "grad_norm": 1.3204195499420166, + "learning_rate": 4.9673462649460045e-06, + "loss": 0.1656, + "step": 1481 + }, + { + "epoch": 0.24011665586519768, + "grad_norm": 1.1137760877609253, + "learning_rate": 4.967275778627628e-06, + "loss": 0.1659, + "step": 1482 + }, + { + "epoch": 0.24027867790019442, + "grad_norm": 1.2828960418701172, + "learning_rate": 4.967205216816584e-06, + "loss": 0.1922, + "step": 1483 + }, + { + "epoch": 0.24044069993519118, + "grad_norm": 1.2552210092544556, + "learning_rate": 4.967134579515032e-06, + "loss": 0.1616, + "step": 1484 + }, + { + "epoch": 0.24060272197018795, + "grad_norm": 1.251631259918213, + "learning_rate": 4.96706386672513e-06, + "loss": 0.1807, + "step": 1485 + }, + { + "epoch": 0.24076474400518472, + "grad_norm": 1.195192575454712, + "learning_rate": 4.966993078449046e-06, + "loss": 0.1639, + "step": 1486 + }, + { + "epoch": 0.24092676604018146, + "grad_norm": 1.0965896844863892, + "learning_rate": 4.966922214688943e-06, + "loss": 0.1566, + "step": 1487 + }, + { + "epoch": 0.24108878807517822, + "grad_norm": 1.3898707628250122, + "learning_rate": 4.96685127544699e-06, + "loss": 0.1726, + "step": 1488 + }, + { + "epoch": 0.241250810110175, + "grad_norm": 1.1594412326812744, + "learning_rate": 4.966780260725357e-06, + "loss": 0.1645, + "step": 1489 + }, + { + "epoch": 0.24141283214517173, + "grad_norm": 1.1280404329299927, + "learning_rate": 4.966709170526219e-06, + "loss": 0.1496, + "step": 1490 + }, + { + "epoch": 0.2415748541801685, + "grad_norm": 1.0832349061965942, + "learning_rate": 4.966638004851748e-06, + "loss": 0.1577, + "step": 1491 + }, + { + "epoch": 0.24173687621516526, + "grad_norm": 1.2211941480636597, + "learning_rate": 4.966566763704124e-06, + "loss": 0.1856, + "step": 1492 + }, + { + "epoch": 0.24189889825016203, + "grad_norm": 1.1475859880447388, + "learning_rate": 4.9664954470855265e-06, + "loss": 0.1602, + "step": 1493 + }, + { + "epoch": 0.24206092028515877, + "grad_norm": 1.326402187347412, + "learning_rate": 4.966424054998137e-06, + "loss": 0.2013, + "step": 1494 + }, + { + "epoch": 0.24222294232015554, + "grad_norm": 1.3355193138122559, + "learning_rate": 4.96635258744414e-06, + "loss": 0.1864, + "step": 1495 + }, + { + "epoch": 0.2423849643551523, + "grad_norm": 1.110781192779541, + "learning_rate": 4.966281044425722e-06, + "loss": 0.1592, + "step": 1496 + }, + { + "epoch": 0.24254698639014907, + "grad_norm": 1.1372196674346924, + "learning_rate": 4.966209425945072e-06, + "loss": 0.1666, + "step": 1497 + }, + { + "epoch": 0.2427090084251458, + "grad_norm": 1.1487059593200684, + "learning_rate": 4.9661377320043815e-06, + "loss": 0.167, + "step": 1498 + }, + { + "epoch": 0.24287103046014258, + "grad_norm": 1.1017019748687744, + "learning_rate": 4.966065962605845e-06, + "loss": 0.1661, + "step": 1499 + }, + { + "epoch": 0.24303305249513935, + "grad_norm": 1.0528111457824707, + "learning_rate": 4.965994117751658e-06, + "loss": 0.1547, + "step": 1500 + }, + { + "epoch": 0.24319507453013609, + "grad_norm": 1.1754587888717651, + "learning_rate": 4.965922197444017e-06, + "loss": 0.1734, + "step": 1501 + }, + { + "epoch": 0.24335709656513285, + "grad_norm": 1.20737886428833, + "learning_rate": 4.965850201685126e-06, + "loss": 0.1809, + "step": 1502 + }, + { + "epoch": 0.24351911860012962, + "grad_norm": 1.1585618257522583, + "learning_rate": 4.965778130477185e-06, + "loss": 0.1683, + "step": 1503 + }, + { + "epoch": 0.2436811406351264, + "grad_norm": 1.1678732633590698, + "learning_rate": 4.965705983822401e-06, + "loss": 0.1567, + "step": 1504 + }, + { + "epoch": 0.24384316267012313, + "grad_norm": 1.127969741821289, + "learning_rate": 4.965633761722981e-06, + "loss": 0.1636, + "step": 1505 + }, + { + "epoch": 0.2440051847051199, + "grad_norm": 1.1678990125656128, + "learning_rate": 4.965561464181134e-06, + "loss": 0.1616, + "step": 1506 + }, + { + "epoch": 0.24416720674011666, + "grad_norm": 1.166166067123413, + "learning_rate": 4.965489091199073e-06, + "loss": 0.1483, + "step": 1507 + }, + { + "epoch": 0.24432922877511343, + "grad_norm": 1.092760443687439, + "learning_rate": 4.965416642779012e-06, + "loss": 0.1541, + "step": 1508 + }, + { + "epoch": 0.24449125081011017, + "grad_norm": 1.230621099472046, + "learning_rate": 4.965344118923168e-06, + "loss": 0.1603, + "step": 1509 + }, + { + "epoch": 0.24465327284510693, + "grad_norm": 1.213835597038269, + "learning_rate": 4.96527151963376e-06, + "loss": 0.1719, + "step": 1510 + }, + { + "epoch": 0.2448152948801037, + "grad_norm": 1.032114863395691, + "learning_rate": 4.965198844913009e-06, + "loss": 0.1459, + "step": 1511 + }, + { + "epoch": 0.24497731691510044, + "grad_norm": 1.3471325635910034, + "learning_rate": 4.9651260947631395e-06, + "loss": 0.1784, + "step": 1512 + }, + { + "epoch": 0.2451393389500972, + "grad_norm": 1.0477313995361328, + "learning_rate": 4.965053269186378e-06, + "loss": 0.1542, + "step": 1513 + }, + { + "epoch": 0.24530136098509397, + "grad_norm": 1.0779989957809448, + "learning_rate": 4.9649803681849495e-06, + "loss": 0.1431, + "step": 1514 + }, + { + "epoch": 0.24546338302009074, + "grad_norm": 1.123201847076416, + "learning_rate": 4.964907391761088e-06, + "loss": 0.1537, + "step": 1515 + }, + { + "epoch": 0.24562540505508748, + "grad_norm": 1.2395498752593994, + "learning_rate": 4.9648343399170254e-06, + "loss": 0.1803, + "step": 1516 + }, + { + "epoch": 0.24578742709008425, + "grad_norm": 1.1589041948318481, + "learning_rate": 4.964761212654997e-06, + "loss": 0.1604, + "step": 1517 + }, + { + "epoch": 0.24594944912508102, + "grad_norm": 1.2080649137496948, + "learning_rate": 4.964688009977239e-06, + "loss": 0.1731, + "step": 1518 + }, + { + "epoch": 0.24611147116007778, + "grad_norm": 1.0178292989730835, + "learning_rate": 4.964614731885994e-06, + "loss": 0.1335, + "step": 1519 + }, + { + "epoch": 0.24627349319507452, + "grad_norm": 1.047829508781433, + "learning_rate": 4.9645413783835006e-06, + "loss": 0.1516, + "step": 1520 + }, + { + "epoch": 0.2464355152300713, + "grad_norm": 1.1087828874588013, + "learning_rate": 4.964467949472006e-06, + "loss": 0.1596, + "step": 1521 + }, + { + "epoch": 0.24659753726506806, + "grad_norm": 1.0549379587173462, + "learning_rate": 4.964394445153756e-06, + "loss": 0.1287, + "step": 1522 + }, + { + "epoch": 0.2467595593000648, + "grad_norm": 1.2687568664550781, + "learning_rate": 4.964320865431001e-06, + "loss": 0.1743, + "step": 1523 + }, + { + "epoch": 0.24692158133506156, + "grad_norm": 1.0614700317382812, + "learning_rate": 4.964247210305989e-06, + "loss": 0.1444, + "step": 1524 + }, + { + "epoch": 0.24708360337005833, + "grad_norm": 1.253021478652954, + "learning_rate": 4.964173479780976e-06, + "loss": 0.15, + "step": 1525 + }, + { + "epoch": 0.2472456254050551, + "grad_norm": 1.270143747329712, + "learning_rate": 4.964099673858219e-06, + "loss": 0.1641, + "step": 1526 + }, + { + "epoch": 0.24740764744005184, + "grad_norm": 1.1180580854415894, + "learning_rate": 4.964025792539974e-06, + "loss": 0.1566, + "step": 1527 + }, + { + "epoch": 0.2475696694750486, + "grad_norm": 1.1324844360351562, + "learning_rate": 4.963951835828503e-06, + "loss": 0.1518, + "step": 1528 + }, + { + "epoch": 0.24773169151004537, + "grad_norm": 1.225598931312561, + "learning_rate": 4.963877803726068e-06, + "loss": 0.1681, + "step": 1529 + }, + { + "epoch": 0.24789371354504214, + "grad_norm": 1.3392179012298584, + "learning_rate": 4.963803696234935e-06, + "loss": 0.1815, + "step": 1530 + }, + { + "epoch": 0.24805573558003888, + "grad_norm": 1.1699576377868652, + "learning_rate": 4.963729513357372e-06, + "loss": 0.164, + "step": 1531 + }, + { + "epoch": 0.24821775761503564, + "grad_norm": 1.1221762895584106, + "learning_rate": 4.9636552550956465e-06, + "loss": 0.1688, + "step": 1532 + }, + { + "epoch": 0.2483797796500324, + "grad_norm": 1.1934576034545898, + "learning_rate": 4.963580921452033e-06, + "loss": 0.1814, + "step": 1533 + }, + { + "epoch": 0.24854180168502915, + "grad_norm": 1.0234527587890625, + "learning_rate": 4.963506512428804e-06, + "loss": 0.1457, + "step": 1534 + }, + { + "epoch": 0.24870382372002592, + "grad_norm": 1.1598873138427734, + "learning_rate": 4.963432028028239e-06, + "loss": 0.1653, + "step": 1535 + }, + { + "epoch": 0.24886584575502269, + "grad_norm": 1.1791259050369263, + "learning_rate": 4.963357468252614e-06, + "loss": 0.1833, + "step": 1536 + }, + { + "epoch": 0.24902786779001945, + "grad_norm": 1.2526721954345703, + "learning_rate": 4.9632828331042124e-06, + "loss": 0.1743, + "step": 1537 + }, + { + "epoch": 0.2491898898250162, + "grad_norm": 1.1005678176879883, + "learning_rate": 4.9632081225853165e-06, + "loss": 0.179, + "step": 1538 + }, + { + "epoch": 0.24935191186001296, + "grad_norm": 1.1455602645874023, + "learning_rate": 4.963133336698214e-06, + "loss": 0.17, + "step": 1539 + }, + { + "epoch": 0.24951393389500973, + "grad_norm": 1.3969601392745972, + "learning_rate": 4.9630584754451906e-06, + "loss": 0.1942, + "step": 1540 + }, + { + "epoch": 0.2496759559300065, + "grad_norm": 1.1257600784301758, + "learning_rate": 4.962983538828539e-06, + "loss": 0.1532, + "step": 1541 + }, + { + "epoch": 0.24983797796500323, + "grad_norm": 1.151532769203186, + "learning_rate": 4.962908526850552e-06, + "loss": 0.1781, + "step": 1542 + }, + { + "epoch": 0.25, + "grad_norm": 1.063228964805603, + "learning_rate": 4.962833439513524e-06, + "loss": 0.157, + "step": 1543 + }, + { + "epoch": 0.25016202203499677, + "grad_norm": 1.101438283920288, + "learning_rate": 4.962758276819752e-06, + "loss": 0.155, + "step": 1544 + }, + { + "epoch": 0.25032404406999353, + "grad_norm": 1.0311859846115112, + "learning_rate": 4.9626830387715365e-06, + "loss": 0.1524, + "step": 1545 + }, + { + "epoch": 0.2504860661049903, + "grad_norm": 1.2852880954742432, + "learning_rate": 4.9626077253711805e-06, + "loss": 0.1775, + "step": 1546 + }, + { + "epoch": 0.250648088139987, + "grad_norm": 1.198421835899353, + "learning_rate": 4.962532336620987e-06, + "loss": 0.1595, + "step": 1547 + }, + { + "epoch": 0.2508101101749838, + "grad_norm": 1.1567708253860474, + "learning_rate": 4.962456872523263e-06, + "loss": 0.1514, + "step": 1548 + }, + { + "epoch": 0.25097213220998055, + "grad_norm": 1.1411212682724, + "learning_rate": 4.9623813330803174e-06, + "loss": 0.1753, + "step": 1549 + }, + { + "epoch": 0.2511341542449773, + "grad_norm": 1.034947395324707, + "learning_rate": 4.962305718294462e-06, + "loss": 0.1484, + "step": 1550 + }, + { + "epoch": 0.2512961762799741, + "grad_norm": 1.2145742177963257, + "learning_rate": 4.962230028168011e-06, + "loss": 0.1791, + "step": 1551 + }, + { + "epoch": 0.25145819831497085, + "grad_norm": 1.173612356185913, + "learning_rate": 4.96215426270328e-06, + "loss": 0.1555, + "step": 1552 + }, + { + "epoch": 0.2516202203499676, + "grad_norm": 1.1682580709457397, + "learning_rate": 4.9620784219025855e-06, + "loss": 0.1707, + "step": 1553 + }, + { + "epoch": 0.2517822423849643, + "grad_norm": 1.1669775247573853, + "learning_rate": 4.962002505768251e-06, + "loss": 0.172, + "step": 1554 + }, + { + "epoch": 0.2519442644199611, + "grad_norm": 1.1306649446487427, + "learning_rate": 4.961926514302597e-06, + "loss": 0.1706, + "step": 1555 + }, + { + "epoch": 0.25210628645495786, + "grad_norm": 1.0562387704849243, + "learning_rate": 4.961850447507948e-06, + "loss": 0.1599, + "step": 1556 + }, + { + "epoch": 0.25226830848995463, + "grad_norm": 1.000566005706787, + "learning_rate": 4.961774305386634e-06, + "loss": 0.142, + "step": 1557 + }, + { + "epoch": 0.2524303305249514, + "grad_norm": 1.1344536542892456, + "learning_rate": 4.961698087940984e-06, + "loss": 0.1568, + "step": 1558 + }, + { + "epoch": 0.25259235255994816, + "grad_norm": 1.1511046886444092, + "learning_rate": 4.961621795173329e-06, + "loss": 0.1594, + "step": 1559 + }, + { + "epoch": 0.25275437459494493, + "grad_norm": 1.1231741905212402, + "learning_rate": 4.961545427086006e-06, + "loss": 0.1734, + "step": 1560 + }, + { + "epoch": 0.2529163966299417, + "grad_norm": 1.2015700340270996, + "learning_rate": 4.961468983681347e-06, + "loss": 0.1664, + "step": 1561 + }, + { + "epoch": 0.2530784186649384, + "grad_norm": 1.1765875816345215, + "learning_rate": 4.961392464961695e-06, + "loss": 0.1562, + "step": 1562 + }, + { + "epoch": 0.2532404406999352, + "grad_norm": 1.1523048877716064, + "learning_rate": 4.96131587092939e-06, + "loss": 0.1658, + "step": 1563 + }, + { + "epoch": 0.25340246273493194, + "grad_norm": 1.113807201385498, + "learning_rate": 4.961239201586776e-06, + "loss": 0.1567, + "step": 1564 + }, + { + "epoch": 0.2535644847699287, + "grad_norm": 1.1596148014068604, + "learning_rate": 4.961162456936199e-06, + "loss": 0.1667, + "step": 1565 + }, + { + "epoch": 0.2537265068049255, + "grad_norm": 1.4176827669143677, + "learning_rate": 4.961085636980005e-06, + "loss": 0.2077, + "step": 1566 + }, + { + "epoch": 0.25388852883992225, + "grad_norm": 1.307410717010498, + "learning_rate": 4.961008741720546e-06, + "loss": 0.1839, + "step": 1567 + }, + { + "epoch": 0.254050550874919, + "grad_norm": 1.387787938117981, + "learning_rate": 4.960931771160177e-06, + "loss": 0.1534, + "step": 1568 + }, + { + "epoch": 0.2542125729099157, + "grad_norm": 1.2094498872756958, + "learning_rate": 4.96085472530125e-06, + "loss": 0.1716, + "step": 1569 + }, + { + "epoch": 0.2543745949449125, + "grad_norm": 1.3138521909713745, + "learning_rate": 4.960777604146124e-06, + "loss": 0.1826, + "step": 1570 + }, + { + "epoch": 0.25453661697990926, + "grad_norm": 1.0649223327636719, + "learning_rate": 4.960700407697158e-06, + "loss": 0.1628, + "step": 1571 + }, + { + "epoch": 0.254698639014906, + "grad_norm": 1.0295525789260864, + "learning_rate": 4.9606231359567146e-06, + "loss": 0.1441, + "step": 1572 + }, + { + "epoch": 0.2548606610499028, + "grad_norm": 1.0389995574951172, + "learning_rate": 4.960545788927158e-06, + "loss": 0.1559, + "step": 1573 + }, + { + "epoch": 0.25502268308489956, + "grad_norm": 1.1343345642089844, + "learning_rate": 4.960468366610854e-06, + "loss": 0.1656, + "step": 1574 + }, + { + "epoch": 0.2551847051198963, + "grad_norm": 1.055141568183899, + "learning_rate": 4.960390869010172e-06, + "loss": 0.1614, + "step": 1575 + }, + { + "epoch": 0.25534672715489304, + "grad_norm": 1.1201953887939453, + "learning_rate": 4.960313296127485e-06, + "loss": 0.1514, + "step": 1576 + }, + { + "epoch": 0.2555087491898898, + "grad_norm": 1.041771650314331, + "learning_rate": 4.960235647965163e-06, + "loss": 0.1388, + "step": 1577 + }, + { + "epoch": 0.2556707712248866, + "grad_norm": 1.19291353225708, + "learning_rate": 4.960157924525585e-06, + "loss": 0.1508, + "step": 1578 + }, + { + "epoch": 0.25583279325988334, + "grad_norm": 1.1116414070129395, + "learning_rate": 4.960080125811127e-06, + "loss": 0.1493, + "step": 1579 + }, + { + "epoch": 0.2559948152948801, + "grad_norm": 1.1531809568405151, + "learning_rate": 4.960002251824172e-06, + "loss": 0.1732, + "step": 1580 + }, + { + "epoch": 0.2561568373298769, + "grad_norm": 1.1807605028152466, + "learning_rate": 4.9599243025671e-06, + "loss": 0.1722, + "step": 1581 + }, + { + "epoch": 0.25631885936487364, + "grad_norm": 1.049898624420166, + "learning_rate": 4.959846278042298e-06, + "loss": 0.1554, + "step": 1582 + }, + { + "epoch": 0.2564808813998704, + "grad_norm": 1.3067052364349365, + "learning_rate": 4.959768178252152e-06, + "loss": 0.194, + "step": 1583 + }, + { + "epoch": 0.2566429034348671, + "grad_norm": 1.1992570161819458, + "learning_rate": 4.959690003199052e-06, + "loss": 0.1739, + "step": 1584 + }, + { + "epoch": 0.2568049254698639, + "grad_norm": 1.368794560432434, + "learning_rate": 4.959611752885392e-06, + "loss": 0.174, + "step": 1585 + }, + { + "epoch": 0.25696694750486065, + "grad_norm": 1.3984448909759521, + "learning_rate": 4.959533427313562e-06, + "loss": 0.1909, + "step": 1586 + }, + { + "epoch": 0.2571289695398574, + "grad_norm": 1.0769413709640503, + "learning_rate": 4.959455026485963e-06, + "loss": 0.1573, + "step": 1587 + }, + { + "epoch": 0.2572909915748542, + "grad_norm": 1.1236642599105835, + "learning_rate": 4.95937655040499e-06, + "loss": 0.1559, + "step": 1588 + }, + { + "epoch": 0.25745301360985096, + "grad_norm": 1.0431153774261475, + "learning_rate": 4.959297999073048e-06, + "loss": 0.1562, + "step": 1589 + }, + { + "epoch": 0.2576150356448477, + "grad_norm": 1.1195615530014038, + "learning_rate": 4.959219372492539e-06, + "loss": 0.1543, + "step": 1590 + }, + { + "epoch": 0.25777705767984443, + "grad_norm": 1.070062518119812, + "learning_rate": 4.959140670665867e-06, + "loss": 0.1548, + "step": 1591 + }, + { + "epoch": 0.2579390797148412, + "grad_norm": 1.1161715984344482, + "learning_rate": 4.9590618935954415e-06, + "loss": 0.1677, + "step": 1592 + }, + { + "epoch": 0.25810110174983797, + "grad_norm": 1.1165508031845093, + "learning_rate": 4.958983041283674e-06, + "loss": 0.1623, + "step": 1593 + }, + { + "epoch": 0.25826312378483474, + "grad_norm": 1.2309761047363281, + "learning_rate": 4.958904113732975e-06, + "loss": 0.1937, + "step": 1594 + }, + { + "epoch": 0.2584251458198315, + "grad_norm": 1.0280898809432983, + "learning_rate": 4.958825110945761e-06, + "loss": 0.145, + "step": 1595 + }, + { + "epoch": 0.25858716785482827, + "grad_norm": 1.1085617542266846, + "learning_rate": 4.958746032924449e-06, + "loss": 0.1624, + "step": 1596 + }, + { + "epoch": 0.25874918988982504, + "grad_norm": 1.162490963935852, + "learning_rate": 4.958666879671458e-06, + "loss": 0.1569, + "step": 1597 + }, + { + "epoch": 0.25891121192482175, + "grad_norm": 1.267038106918335, + "learning_rate": 4.95858765118921e-06, + "loss": 0.1804, + "step": 1598 + }, + { + "epoch": 0.2590732339598185, + "grad_norm": 1.0841212272644043, + "learning_rate": 4.9585083474801296e-06, + "loss": 0.1546, + "step": 1599 + }, + { + "epoch": 0.2592352559948153, + "grad_norm": 1.171931266784668, + "learning_rate": 4.9584289685466444e-06, + "loss": 0.1688, + "step": 1600 + }, + { + "epoch": 0.25939727802981205, + "grad_norm": 1.20844304561615, + "learning_rate": 4.9583495143911804e-06, + "loss": 0.1752, + "step": 1601 + }, + { + "epoch": 0.2595593000648088, + "grad_norm": 1.2079849243164062, + "learning_rate": 4.958269985016172e-06, + "loss": 0.1573, + "step": 1602 + }, + { + "epoch": 0.2597213220998056, + "grad_norm": 1.101150631904602, + "learning_rate": 4.95819038042405e-06, + "loss": 0.1602, + "step": 1603 + }, + { + "epoch": 0.25988334413480235, + "grad_norm": 1.0884321928024292, + "learning_rate": 4.958110700617251e-06, + "loss": 0.1451, + "step": 1604 + }, + { + "epoch": 0.2600453661697991, + "grad_norm": 1.1064330339431763, + "learning_rate": 4.958030945598213e-06, + "loss": 0.1497, + "step": 1605 + }, + { + "epoch": 0.26020738820479583, + "grad_norm": 1.055196762084961, + "learning_rate": 4.957951115369378e-06, + "loss": 0.1493, + "step": 1606 + }, + { + "epoch": 0.2603694102397926, + "grad_norm": 1.1254777908325195, + "learning_rate": 4.957871209933185e-06, + "loss": 0.156, + "step": 1607 + }, + { + "epoch": 0.26053143227478937, + "grad_norm": 1.1287553310394287, + "learning_rate": 4.957791229292082e-06, + "loss": 0.1537, + "step": 1608 + }, + { + "epoch": 0.26069345430978613, + "grad_norm": 1.1099610328674316, + "learning_rate": 4.957711173448515e-06, + "loss": 0.1447, + "step": 1609 + }, + { + "epoch": 0.2608554763447829, + "grad_norm": 1.1488322019577026, + "learning_rate": 4.957631042404934e-06, + "loss": 0.1814, + "step": 1610 + }, + { + "epoch": 0.26101749837977967, + "grad_norm": 1.169036626815796, + "learning_rate": 4.957550836163789e-06, + "loss": 0.1601, + "step": 1611 + }, + { + "epoch": 0.26117952041477643, + "grad_norm": 1.2050291299819946, + "learning_rate": 4.957470554727536e-06, + "loss": 0.1633, + "step": 1612 + }, + { + "epoch": 0.26134154244977315, + "grad_norm": 1.1404060125350952, + "learning_rate": 4.9573901980986315e-06, + "loss": 0.1632, + "step": 1613 + }, + { + "epoch": 0.2615035644847699, + "grad_norm": 1.3234813213348389, + "learning_rate": 4.9573097662795344e-06, + "loss": 0.1938, + "step": 1614 + }, + { + "epoch": 0.2616655865197667, + "grad_norm": 1.1954011917114258, + "learning_rate": 4.957229259272705e-06, + "loss": 0.1611, + "step": 1615 + }, + { + "epoch": 0.26182760855476345, + "grad_norm": 1.1279500722885132, + "learning_rate": 4.957148677080605e-06, + "loss": 0.1471, + "step": 1616 + }, + { + "epoch": 0.2619896305897602, + "grad_norm": 1.2045884132385254, + "learning_rate": 4.957068019705703e-06, + "loss": 0.1625, + "step": 1617 + }, + { + "epoch": 0.262151652624757, + "grad_norm": 1.2116997241973877, + "learning_rate": 4.956987287150465e-06, + "loss": 0.1822, + "step": 1618 + }, + { + "epoch": 0.26231367465975375, + "grad_norm": 1.1517181396484375, + "learning_rate": 4.956906479417361e-06, + "loss": 0.1566, + "step": 1619 + }, + { + "epoch": 0.26247569669475046, + "grad_norm": 1.1915812492370605, + "learning_rate": 4.956825596508867e-06, + "loss": 0.1808, + "step": 1620 + }, + { + "epoch": 0.2626377187297472, + "grad_norm": 1.0590485334396362, + "learning_rate": 4.9567446384274525e-06, + "loss": 0.153, + "step": 1621 + }, + { + "epoch": 0.262799740764744, + "grad_norm": 1.147382140159607, + "learning_rate": 4.956663605175599e-06, + "loss": 0.1621, + "step": 1622 + }, + { + "epoch": 0.26296176279974076, + "grad_norm": 1.2645186185836792, + "learning_rate": 4.956582496755783e-06, + "loss": 0.1875, + "step": 1623 + }, + { + "epoch": 0.26312378483473753, + "grad_norm": 1.0937161445617676, + "learning_rate": 4.956501313170487e-06, + "loss": 0.1671, + "step": 1624 + }, + { + "epoch": 0.2632858068697343, + "grad_norm": 1.2296099662780762, + "learning_rate": 4.956420054422197e-06, + "loss": 0.1887, + "step": 1625 + }, + { + "epoch": 0.26344782890473106, + "grad_norm": 1.030358910560608, + "learning_rate": 4.956338720513397e-06, + "loss": 0.1567, + "step": 1626 + }, + { + "epoch": 0.26360985093972783, + "grad_norm": 1.3956369161605835, + "learning_rate": 4.956257311446576e-06, + "loss": 0.1818, + "step": 1627 + }, + { + "epoch": 0.26377187297472454, + "grad_norm": 1.2601412534713745, + "learning_rate": 4.956175827224226e-06, + "loss": 0.1668, + "step": 1628 + }, + { + "epoch": 0.2639338950097213, + "grad_norm": 1.13845694065094, + "learning_rate": 4.956094267848839e-06, + "loss": 0.1688, + "step": 1629 + }, + { + "epoch": 0.2640959170447181, + "grad_norm": 1.1298080682754517, + "learning_rate": 4.956012633322912e-06, + "loss": 0.1605, + "step": 1630 + }, + { + "epoch": 0.26425793907971484, + "grad_norm": 1.1088753938674927, + "learning_rate": 4.955930923648941e-06, + "loss": 0.1386, + "step": 1631 + }, + { + "epoch": 0.2644199611147116, + "grad_norm": 1.1304455995559692, + "learning_rate": 4.955849138829428e-06, + "loss": 0.1563, + "step": 1632 + }, + { + "epoch": 0.2645819831497084, + "grad_norm": 1.1293634176254272, + "learning_rate": 4.955767278866872e-06, + "loss": 0.1387, + "step": 1633 + }, + { + "epoch": 0.26474400518470514, + "grad_norm": 1.0126081705093384, + "learning_rate": 4.955685343763782e-06, + "loss": 0.1385, + "step": 1634 + }, + { + "epoch": 0.26490602721970186, + "grad_norm": 1.0980356931686401, + "learning_rate": 4.955603333522663e-06, + "loss": 0.1575, + "step": 1635 + }, + { + "epoch": 0.2650680492546986, + "grad_norm": 1.0100072622299194, + "learning_rate": 4.9555212481460245e-06, + "loss": 0.1492, + "step": 1636 + }, + { + "epoch": 0.2652300712896954, + "grad_norm": 1.275562047958374, + "learning_rate": 4.955439087636378e-06, + "loss": 0.1708, + "step": 1637 + }, + { + "epoch": 0.26539209332469216, + "grad_norm": 1.1762036085128784, + "learning_rate": 4.955356851996236e-06, + "loss": 0.1598, + "step": 1638 + }, + { + "epoch": 0.2655541153596889, + "grad_norm": 1.199687123298645, + "learning_rate": 4.9552745412281175e-06, + "loss": 0.1696, + "step": 1639 + }, + { + "epoch": 0.2657161373946857, + "grad_norm": 1.060361385345459, + "learning_rate": 4.955192155334539e-06, + "loss": 0.1561, + "step": 1640 + }, + { + "epoch": 0.26587815942968246, + "grad_norm": 1.3154797554016113, + "learning_rate": 4.955109694318024e-06, + "loss": 0.1891, + "step": 1641 + }, + { + "epoch": 0.26604018146467917, + "grad_norm": 1.0718739032745361, + "learning_rate": 4.955027158181092e-06, + "loss": 0.1487, + "step": 1642 + }, + { + "epoch": 0.26620220349967594, + "grad_norm": 1.0659395456314087, + "learning_rate": 4.95494454692627e-06, + "loss": 0.15, + "step": 1643 + }, + { + "epoch": 0.2663642255346727, + "grad_norm": 1.4435391426086426, + "learning_rate": 4.9548618605560855e-06, + "loss": 0.1911, + "step": 1644 + }, + { + "epoch": 0.26652624756966947, + "grad_norm": 1.1285501718521118, + "learning_rate": 4.954779099073069e-06, + "loss": 0.1573, + "step": 1645 + }, + { + "epoch": 0.26668826960466624, + "grad_norm": 1.2026793956756592, + "learning_rate": 4.954696262479753e-06, + "loss": 0.1823, + "step": 1646 + }, + { + "epoch": 0.266850291639663, + "grad_norm": 1.1052802801132202, + "learning_rate": 4.954613350778671e-06, + "loss": 0.1497, + "step": 1647 + }, + { + "epoch": 0.2670123136746598, + "grad_norm": 1.0569524765014648, + "learning_rate": 4.954530363972361e-06, + "loss": 0.1615, + "step": 1648 + }, + { + "epoch": 0.26717433570965654, + "grad_norm": 1.3057336807250977, + "learning_rate": 4.954447302063362e-06, + "loss": 0.1856, + "step": 1649 + }, + { + "epoch": 0.26733635774465325, + "grad_norm": 1.0749009847640991, + "learning_rate": 4.954364165054214e-06, + "loss": 0.169, + "step": 1650 + }, + { + "epoch": 0.26749837977965, + "grad_norm": 1.1229180097579956, + "learning_rate": 4.954280952947463e-06, + "loss": 0.1572, + "step": 1651 + }, + { + "epoch": 0.2676604018146468, + "grad_norm": 1.134561538696289, + "learning_rate": 4.9541976657456535e-06, + "loss": 0.1618, + "step": 1652 + }, + { + "epoch": 0.26782242384964355, + "grad_norm": 1.229466199874878, + "learning_rate": 4.954114303451335e-06, + "loss": 0.1516, + "step": 1653 + }, + { + "epoch": 0.2679844458846403, + "grad_norm": 1.3184914588928223, + "learning_rate": 4.954030866067057e-06, + "loss": 0.1565, + "step": 1654 + }, + { + "epoch": 0.2681464679196371, + "grad_norm": 1.1177244186401367, + "learning_rate": 4.953947353595374e-06, + "loss": 0.1492, + "step": 1655 + }, + { + "epoch": 0.26830848995463386, + "grad_norm": 1.136872410774231, + "learning_rate": 4.95386376603884e-06, + "loss": 0.1586, + "step": 1656 + }, + { + "epoch": 0.26847051198963057, + "grad_norm": 1.8951784372329712, + "learning_rate": 4.953780103400012e-06, + "loss": 0.1676, + "step": 1657 + }, + { + "epoch": 0.26863253402462733, + "grad_norm": 1.2328137159347534, + "learning_rate": 4.953696365681452e-06, + "loss": 0.1778, + "step": 1658 + }, + { + "epoch": 0.2687945560596241, + "grad_norm": 1.0323786735534668, + "learning_rate": 4.953612552885721e-06, + "loss": 0.1374, + "step": 1659 + }, + { + "epoch": 0.26895657809462087, + "grad_norm": 1.2623248100280762, + "learning_rate": 4.953528665015383e-06, + "loss": 0.1787, + "step": 1660 + }, + { + "epoch": 0.26911860012961764, + "grad_norm": 1.1066336631774902, + "learning_rate": 4.953444702073006e-06, + "loss": 0.1546, + "step": 1661 + }, + { + "epoch": 0.2692806221646144, + "grad_norm": 1.1255744695663452, + "learning_rate": 4.953360664061159e-06, + "loss": 0.1574, + "step": 1662 + }, + { + "epoch": 0.26944264419961117, + "grad_norm": 1.143039584159851, + "learning_rate": 4.953276550982411e-06, + "loss": 0.1585, + "step": 1663 + }, + { + "epoch": 0.2696046662346079, + "grad_norm": 1.1908190250396729, + "learning_rate": 4.953192362839338e-06, + "loss": 0.1744, + "step": 1664 + }, + { + "epoch": 0.26976668826960465, + "grad_norm": 1.1476013660430908, + "learning_rate": 4.953108099634516e-06, + "loss": 0.1698, + "step": 1665 + }, + { + "epoch": 0.2699287103046014, + "grad_norm": 1.0105767250061035, + "learning_rate": 4.953023761370521e-06, + "loss": 0.1454, + "step": 1666 + }, + { + "epoch": 0.2700907323395982, + "grad_norm": 0.9987475872039795, + "learning_rate": 4.9529393480499365e-06, + "loss": 0.1345, + "step": 1667 + }, + { + "epoch": 0.27025275437459495, + "grad_norm": 1.234141230583191, + "learning_rate": 4.952854859675343e-06, + "loss": 0.1642, + "step": 1668 + }, + { + "epoch": 0.2704147764095917, + "grad_norm": 1.2194494009017944, + "learning_rate": 4.952770296249328e-06, + "loss": 0.1766, + "step": 1669 + }, + { + "epoch": 0.2705767984445885, + "grad_norm": 1.22120201587677, + "learning_rate": 4.952685657774476e-06, + "loss": 0.1875, + "step": 1670 + }, + { + "epoch": 0.2707388204795852, + "grad_norm": 1.0285940170288086, + "learning_rate": 4.952600944253379e-06, + "loss": 0.1544, + "step": 1671 + }, + { + "epoch": 0.27090084251458196, + "grad_norm": 1.1357102394104004, + "learning_rate": 4.952516155688628e-06, + "loss": 0.1668, + "step": 1672 + }, + { + "epoch": 0.27106286454957873, + "grad_norm": 1.0606294870376587, + "learning_rate": 4.952431292082818e-06, + "loss": 0.1639, + "step": 1673 + }, + { + "epoch": 0.2712248865845755, + "grad_norm": 1.2481400966644287, + "learning_rate": 4.9523463534385444e-06, + "loss": 0.1709, + "step": 1674 + }, + { + "epoch": 0.27138690861957226, + "grad_norm": 1.1794558763504028, + "learning_rate": 4.9522613397584075e-06, + "loss": 0.1749, + "step": 1675 + }, + { + "epoch": 0.27154893065456903, + "grad_norm": 1.0960800647735596, + "learning_rate": 4.952176251045008e-06, + "loss": 0.1427, + "step": 1676 + }, + { + "epoch": 0.2717109526895658, + "grad_norm": 1.2068989276885986, + "learning_rate": 4.95209108730095e-06, + "loss": 0.1591, + "step": 1677 + }, + { + "epoch": 0.27187297472456257, + "grad_norm": 1.1335726976394653, + "learning_rate": 4.952005848528838e-06, + "loss": 0.1549, + "step": 1678 + }, + { + "epoch": 0.2720349967595593, + "grad_norm": 1.113123893737793, + "learning_rate": 4.95192053473128e-06, + "loss": 0.1502, + "step": 1679 + }, + { + "epoch": 0.27219701879455604, + "grad_norm": 1.2119156122207642, + "learning_rate": 4.951835145910888e-06, + "loss": 0.1636, + "step": 1680 + }, + { + "epoch": 0.2723590408295528, + "grad_norm": 1.0948169231414795, + "learning_rate": 4.951749682070274e-06, + "loss": 0.1543, + "step": 1681 + }, + { + "epoch": 0.2725210628645496, + "grad_norm": 1.4161688089370728, + "learning_rate": 4.951664143212053e-06, + "loss": 0.1859, + "step": 1682 + }, + { + "epoch": 0.27268308489954635, + "grad_norm": 1.244466781616211, + "learning_rate": 4.951578529338842e-06, + "loss": 0.1875, + "step": 1683 + }, + { + "epoch": 0.2728451069345431, + "grad_norm": 2.2999372482299805, + "learning_rate": 4.95149284045326e-06, + "loss": 0.1494, + "step": 1684 + }, + { + "epoch": 0.2730071289695399, + "grad_norm": 1.1153371334075928, + "learning_rate": 4.95140707655793e-06, + "loss": 0.1467, + "step": 1685 + }, + { + "epoch": 0.2731691510045366, + "grad_norm": 1.0886139869689941, + "learning_rate": 4.951321237655477e-06, + "loss": 0.148, + "step": 1686 + }, + { + "epoch": 0.27333117303953336, + "grad_norm": 1.16849684715271, + "learning_rate": 4.951235323748524e-06, + "loss": 0.1876, + "step": 1687 + }, + { + "epoch": 0.2734931950745301, + "grad_norm": 1.0493582487106323, + "learning_rate": 4.951149334839703e-06, + "loss": 0.1584, + "step": 1688 + }, + { + "epoch": 0.2736552171095269, + "grad_norm": 1.146984577178955, + "learning_rate": 4.951063270931644e-06, + "loss": 0.1778, + "step": 1689 + }, + { + "epoch": 0.27381723914452366, + "grad_norm": 1.1503607034683228, + "learning_rate": 4.950977132026981e-06, + "loss": 0.1555, + "step": 1690 + }, + { + "epoch": 0.27397926117952043, + "grad_norm": 1.2447178363800049, + "learning_rate": 4.950890918128348e-06, + "loss": 0.1876, + "step": 1691 + }, + { + "epoch": 0.2741412832145172, + "grad_norm": 1.197851300239563, + "learning_rate": 4.9508046292383846e-06, + "loss": 0.1763, + "step": 1692 + }, + { + "epoch": 0.2743033052495139, + "grad_norm": 1.1843172311782837, + "learning_rate": 4.950718265359729e-06, + "loss": 0.166, + "step": 1693 + }, + { + "epoch": 0.2744653272845107, + "grad_norm": 1.0466846227645874, + "learning_rate": 4.950631826495027e-06, + "loss": 0.1472, + "step": 1694 + }, + { + "epoch": 0.27462734931950744, + "grad_norm": 1.1562830209732056, + "learning_rate": 4.950545312646921e-06, + "loss": 0.1596, + "step": 1695 + }, + { + "epoch": 0.2747893713545042, + "grad_norm": 1.1788415908813477, + "learning_rate": 4.950458723818058e-06, + "loss": 0.1586, + "step": 1696 + }, + { + "epoch": 0.274951393389501, + "grad_norm": 1.238731861114502, + "learning_rate": 4.9503720600110884e-06, + "loss": 0.1626, + "step": 1697 + }, + { + "epoch": 0.27511341542449774, + "grad_norm": 1.1631656885147095, + "learning_rate": 4.950285321228664e-06, + "loss": 0.1508, + "step": 1698 + }, + { + "epoch": 0.2752754374594945, + "grad_norm": 1.2769675254821777, + "learning_rate": 4.950198507473438e-06, + "loss": 0.1555, + "step": 1699 + }, + { + "epoch": 0.2754374594944913, + "grad_norm": 1.2280595302581787, + "learning_rate": 4.950111618748067e-06, + "loss": 0.1416, + "step": 1700 + }, + { + "epoch": 0.275599481529488, + "grad_norm": 1.3224934339523315, + "learning_rate": 4.95002465505521e-06, + "loss": 0.1673, + "step": 1701 + }, + { + "epoch": 0.27576150356448476, + "grad_norm": 1.2435288429260254, + "learning_rate": 4.949937616397527e-06, + "loss": 0.1776, + "step": 1702 + }, + { + "epoch": 0.2759235255994815, + "grad_norm": 1.2028788328170776, + "learning_rate": 4.949850502777681e-06, + "loss": 0.1704, + "step": 1703 + }, + { + "epoch": 0.2760855476344783, + "grad_norm": 1.0093039274215698, + "learning_rate": 4.949763314198339e-06, + "loss": 0.1424, + "step": 1704 + }, + { + "epoch": 0.27624756966947506, + "grad_norm": 1.1535528898239136, + "learning_rate": 4.949676050662169e-06, + "loss": 0.1564, + "step": 1705 + }, + { + "epoch": 0.2764095917044718, + "grad_norm": 1.1377063989639282, + "learning_rate": 4.949588712171838e-06, + "loss": 0.1687, + "step": 1706 + }, + { + "epoch": 0.2765716137394686, + "grad_norm": 1.1593130826950073, + "learning_rate": 4.949501298730021e-06, + "loss": 0.1611, + "step": 1707 + }, + { + "epoch": 0.2767336357744653, + "grad_norm": 1.2367503643035889, + "learning_rate": 4.949413810339392e-06, + "loss": 0.1803, + "step": 1708 + }, + { + "epoch": 0.27689565780946207, + "grad_norm": 1.1531428098678589, + "learning_rate": 4.9493262470026286e-06, + "loss": 0.183, + "step": 1709 + }, + { + "epoch": 0.27705767984445884, + "grad_norm": 1.1906907558441162, + "learning_rate": 4.949238608722408e-06, + "loss": 0.1486, + "step": 1710 + }, + { + "epoch": 0.2772197018794556, + "grad_norm": 1.1683154106140137, + "learning_rate": 4.949150895501414e-06, + "loss": 0.1657, + "step": 1711 + }, + { + "epoch": 0.27738172391445237, + "grad_norm": 1.2222694158554077, + "learning_rate": 4.949063107342329e-06, + "loss": 0.1983, + "step": 1712 + }, + { + "epoch": 0.27754374594944914, + "grad_norm": 1.0516256093978882, + "learning_rate": 4.948975244247839e-06, + "loss": 0.1497, + "step": 1713 + }, + { + "epoch": 0.2777057679844459, + "grad_norm": 1.630317211151123, + "learning_rate": 4.948887306220634e-06, + "loss": 0.2039, + "step": 1714 + }, + { + "epoch": 0.2778677900194426, + "grad_norm": 1.3137788772583008, + "learning_rate": 4.948799293263403e-06, + "loss": 0.1633, + "step": 1715 + }, + { + "epoch": 0.2780298120544394, + "grad_norm": 1.0964162349700928, + "learning_rate": 4.94871120537884e-06, + "loss": 0.1566, + "step": 1716 + }, + { + "epoch": 0.27819183408943615, + "grad_norm": 1.437134027481079, + "learning_rate": 4.948623042569639e-06, + "loss": 0.1775, + "step": 1717 + }, + { + "epoch": 0.2783538561244329, + "grad_norm": 1.0482769012451172, + "learning_rate": 4.9485348048385e-06, + "loss": 0.1494, + "step": 1718 + }, + { + "epoch": 0.2785158781594297, + "grad_norm": 1.0254579782485962, + "learning_rate": 4.94844649218812e-06, + "loss": 0.1427, + "step": 1719 + }, + { + "epoch": 0.27867790019442645, + "grad_norm": 1.1733806133270264, + "learning_rate": 4.9483581046212025e-06, + "loss": 0.1733, + "step": 1720 + }, + { + "epoch": 0.2788399222294232, + "grad_norm": 1.1621792316436768, + "learning_rate": 4.948269642140453e-06, + "loss": 0.1532, + "step": 1721 + }, + { + "epoch": 0.27900194426442, + "grad_norm": 1.2027387619018555, + "learning_rate": 4.948181104748576e-06, + "loss": 0.1711, + "step": 1722 + }, + { + "epoch": 0.2791639662994167, + "grad_norm": 1.3323115110397339, + "learning_rate": 4.9480924924482824e-06, + "loss": 0.185, + "step": 1723 + }, + { + "epoch": 0.27932598833441347, + "grad_norm": 1.1466425657272339, + "learning_rate": 4.948003805242282e-06, + "loss": 0.1606, + "step": 1724 + }, + { + "epoch": 0.27948801036941023, + "grad_norm": 1.1077498197555542, + "learning_rate": 4.94791504313329e-06, + "loss": 0.1582, + "step": 1725 + }, + { + "epoch": 0.279650032404407, + "grad_norm": 1.1000559329986572, + "learning_rate": 4.9478262061240216e-06, + "loss": 0.1451, + "step": 1726 + }, + { + "epoch": 0.27981205443940377, + "grad_norm": 1.2915170192718506, + "learning_rate": 4.9477372942171945e-06, + "loss": 0.1759, + "step": 1727 + }, + { + "epoch": 0.27997407647440054, + "grad_norm": 1.1661245822906494, + "learning_rate": 4.947648307415529e-06, + "loss": 0.1615, + "step": 1728 + }, + { + "epoch": 0.2801360985093973, + "grad_norm": 1.1373041868209839, + "learning_rate": 4.947559245721749e-06, + "loss": 0.1458, + "step": 1729 + }, + { + "epoch": 0.280298120544394, + "grad_norm": 1.0064786672592163, + "learning_rate": 4.947470109138579e-06, + "loss": 0.1409, + "step": 1730 + }, + { + "epoch": 0.2804601425793908, + "grad_norm": 0.9815242290496826, + "learning_rate": 4.947380897668747e-06, + "loss": 0.1361, + "step": 1731 + }, + { + "epoch": 0.28062216461438755, + "grad_norm": 0.9623909592628479, + "learning_rate": 4.947291611314981e-06, + "loss": 0.1412, + "step": 1732 + }, + { + "epoch": 0.2807841866493843, + "grad_norm": 1.241988182067871, + "learning_rate": 4.947202250080015e-06, + "loss": 0.1754, + "step": 1733 + }, + { + "epoch": 0.2809462086843811, + "grad_norm": 1.213842749595642, + "learning_rate": 4.9471128139665826e-06, + "loss": 0.1757, + "step": 1734 + }, + { + "epoch": 0.28110823071937785, + "grad_norm": 1.181797742843628, + "learning_rate": 4.9470233029774195e-06, + "loss": 0.1591, + "step": 1735 + }, + { + "epoch": 0.2812702527543746, + "grad_norm": 1.0789086818695068, + "learning_rate": 4.9469337171152645e-06, + "loss": 0.1524, + "step": 1736 + }, + { + "epoch": 0.28143227478937133, + "grad_norm": 1.2921925783157349, + "learning_rate": 4.94684405638286e-06, + "loss": 0.1836, + "step": 1737 + }, + { + "epoch": 0.2815942968243681, + "grad_norm": 1.1767724752426147, + "learning_rate": 4.946754320782948e-06, + "loss": 0.1598, + "step": 1738 + }, + { + "epoch": 0.28175631885936486, + "grad_norm": 1.1282929182052612, + "learning_rate": 4.946664510318275e-06, + "loss": 0.1573, + "step": 1739 + }, + { + "epoch": 0.28191834089436163, + "grad_norm": 1.316778302192688, + "learning_rate": 4.946574624991589e-06, + "loss": 0.1816, + "step": 1740 + }, + { + "epoch": 0.2820803629293584, + "grad_norm": 1.234459638595581, + "learning_rate": 4.9464846648056396e-06, + "loss": 0.1854, + "step": 1741 + }, + { + "epoch": 0.28224238496435516, + "grad_norm": 1.1687872409820557, + "learning_rate": 4.946394629763181e-06, + "loss": 0.1638, + "step": 1742 + }, + { + "epoch": 0.28240440699935193, + "grad_norm": 1.0062087774276733, + "learning_rate": 4.946304519866966e-06, + "loss": 0.1454, + "step": 1743 + }, + { + "epoch": 0.2825664290343487, + "grad_norm": 1.048824429512024, + "learning_rate": 4.946214335119752e-06, + "loss": 0.1415, + "step": 1744 + }, + { + "epoch": 0.2827284510693454, + "grad_norm": 1.0653513669967651, + "learning_rate": 4.9461240755243e-06, + "loss": 0.1454, + "step": 1745 + }, + { + "epoch": 0.2828904731043422, + "grad_norm": 1.1784061193466187, + "learning_rate": 4.94603374108337e-06, + "loss": 0.1578, + "step": 1746 + }, + { + "epoch": 0.28305249513933894, + "grad_norm": 1.1248995065689087, + "learning_rate": 4.945943331799728e-06, + "loss": 0.1534, + "step": 1747 + }, + { + "epoch": 0.2832145171743357, + "grad_norm": 1.266843557357788, + "learning_rate": 4.945852847676138e-06, + "loss": 0.1917, + "step": 1748 + }, + { + "epoch": 0.2833765392093325, + "grad_norm": 1.1719672679901123, + "learning_rate": 4.945762288715371e-06, + "loss": 0.1702, + "step": 1749 + }, + { + "epoch": 0.28353856124432925, + "grad_norm": 1.1670610904693604, + "learning_rate": 4.945671654920195e-06, + "loss": 0.1615, + "step": 1750 + }, + { + "epoch": 0.283700583279326, + "grad_norm": 1.1604700088500977, + "learning_rate": 4.945580946293386e-06, + "loss": 0.1599, + "step": 1751 + }, + { + "epoch": 0.2838626053143227, + "grad_norm": 1.085565447807312, + "learning_rate": 4.945490162837718e-06, + "loss": 0.1558, + "step": 1752 + }, + { + "epoch": 0.2840246273493195, + "grad_norm": 1.1299009323120117, + "learning_rate": 4.945399304555968e-06, + "loss": 0.1584, + "step": 1753 + }, + { + "epoch": 0.28418664938431626, + "grad_norm": 1.1173944473266602, + "learning_rate": 4.945308371450919e-06, + "loss": 0.1524, + "step": 1754 + }, + { + "epoch": 0.284348671419313, + "grad_norm": 0.8910510540008545, + "learning_rate": 4.945217363525349e-06, + "loss": 0.1184, + "step": 1755 + }, + { + "epoch": 0.2845106934543098, + "grad_norm": 1.2654694318771362, + "learning_rate": 4.945126280782047e-06, + "loss": 0.2026, + "step": 1756 + }, + { + "epoch": 0.28467271548930656, + "grad_norm": 1.1252206563949585, + "learning_rate": 4.945035123223797e-06, + "loss": 0.1871, + "step": 1757 + }, + { + "epoch": 0.2848347375243033, + "grad_norm": 1.245661973953247, + "learning_rate": 4.944943890853389e-06, + "loss": 0.1596, + "step": 1758 + }, + { + "epoch": 0.28499675955930004, + "grad_norm": 1.2262191772460938, + "learning_rate": 4.944852583673615e-06, + "loss": 0.1797, + "step": 1759 + }, + { + "epoch": 0.2851587815942968, + "grad_norm": 1.0503751039505005, + "learning_rate": 4.944761201687268e-06, + "loss": 0.1437, + "step": 1760 + }, + { + "epoch": 0.2853208036292936, + "grad_norm": 1.460869550704956, + "learning_rate": 4.944669744897144e-06, + "loss": 0.1576, + "step": 1761 + }, + { + "epoch": 0.28548282566429034, + "grad_norm": 1.1839721202850342, + "learning_rate": 4.944578213306043e-06, + "loss": 0.1463, + "step": 1762 + }, + { + "epoch": 0.2856448476992871, + "grad_norm": 1.1517478227615356, + "learning_rate": 4.944486606916764e-06, + "loss": 0.1873, + "step": 1763 + }, + { + "epoch": 0.2858068697342839, + "grad_norm": 1.1901956796646118, + "learning_rate": 4.94439492573211e-06, + "loss": 0.159, + "step": 1764 + }, + { + "epoch": 0.28596889176928064, + "grad_norm": 1.0697051286697388, + "learning_rate": 4.944303169754887e-06, + "loss": 0.1668, + "step": 1765 + }, + { + "epoch": 0.2861309138042774, + "grad_norm": 1.2297983169555664, + "learning_rate": 4.944211338987901e-06, + "loss": 0.1695, + "step": 1766 + }, + { + "epoch": 0.2862929358392741, + "grad_norm": 1.160058617591858, + "learning_rate": 4.944119433433964e-06, + "loss": 0.1674, + "step": 1767 + }, + { + "epoch": 0.2864549578742709, + "grad_norm": 1.1287837028503418, + "learning_rate": 4.944027453095887e-06, + "loss": 0.1578, + "step": 1768 + }, + { + "epoch": 0.28661697990926766, + "grad_norm": 1.0510386228561401, + "learning_rate": 4.943935397976484e-06, + "loss": 0.154, + "step": 1769 + }, + { + "epoch": 0.2867790019442644, + "grad_norm": 1.118772268295288, + "learning_rate": 4.943843268078572e-06, + "loss": 0.151, + "step": 1770 + }, + { + "epoch": 0.2869410239792612, + "grad_norm": 1.0504902601242065, + "learning_rate": 4.94375106340497e-06, + "loss": 0.1569, + "step": 1771 + }, + { + "epoch": 0.28710304601425796, + "grad_norm": 1.1878551244735718, + "learning_rate": 4.9436587839585e-06, + "loss": 0.1696, + "step": 1772 + }, + { + "epoch": 0.2872650680492547, + "grad_norm": 1.1703705787658691, + "learning_rate": 4.9435664297419836e-06, + "loss": 0.1505, + "step": 1773 + }, + { + "epoch": 0.28742709008425144, + "grad_norm": 1.1289130449295044, + "learning_rate": 4.9434740007582485e-06, + "loss": 0.1568, + "step": 1774 + }, + { + "epoch": 0.2875891121192482, + "grad_norm": 1.1618683338165283, + "learning_rate": 4.943381497010122e-06, + "loss": 0.1708, + "step": 1775 + }, + { + "epoch": 0.28775113415424497, + "grad_norm": 1.0598828792572021, + "learning_rate": 4.943288918500434e-06, + "loss": 0.1608, + "step": 1776 + }, + { + "epoch": 0.28791315618924174, + "grad_norm": 1.1021727323532104, + "learning_rate": 4.943196265232018e-06, + "loss": 0.1598, + "step": 1777 + }, + { + "epoch": 0.2880751782242385, + "grad_norm": 1.0794800519943237, + "learning_rate": 4.94310353720771e-06, + "loss": 0.1445, + "step": 1778 + }, + { + "epoch": 0.28823720025923527, + "grad_norm": 1.0673820972442627, + "learning_rate": 4.9430107344303445e-06, + "loss": 0.1494, + "step": 1779 + }, + { + "epoch": 0.28839922229423204, + "grad_norm": 1.0401991605758667, + "learning_rate": 4.942917856902763e-06, + "loss": 0.1386, + "step": 1780 + }, + { + "epoch": 0.28856124432922875, + "grad_norm": 1.2189371585845947, + "learning_rate": 4.9428249046278065e-06, + "loss": 0.1655, + "step": 1781 + }, + { + "epoch": 0.2887232663642255, + "grad_norm": 1.2600358724594116, + "learning_rate": 4.942731877608319e-06, + "loss": 0.1771, + "step": 1782 + }, + { + "epoch": 0.2888852883992223, + "grad_norm": 1.2140158414840698, + "learning_rate": 4.942638775847149e-06, + "loss": 0.1741, + "step": 1783 + }, + { + "epoch": 0.28904731043421905, + "grad_norm": 1.0172653198242188, + "learning_rate": 4.942545599347142e-06, + "loss": 0.1417, + "step": 1784 + }, + { + "epoch": 0.2892093324692158, + "grad_norm": 1.2738704681396484, + "learning_rate": 4.942452348111151e-06, + "loss": 0.1731, + "step": 1785 + }, + { + "epoch": 0.2893713545042126, + "grad_norm": 1.1915194988250732, + "learning_rate": 4.942359022142028e-06, + "loss": 0.1663, + "step": 1786 + }, + { + "epoch": 0.28953337653920935, + "grad_norm": 1.04640531539917, + "learning_rate": 4.94226562144263e-06, + "loss": 0.1371, + "step": 1787 + }, + { + "epoch": 0.28969539857420606, + "grad_norm": 1.054979681968689, + "learning_rate": 4.942172146015814e-06, + "loss": 0.1486, + "step": 1788 + }, + { + "epoch": 0.28985742060920283, + "grad_norm": 1.1150860786437988, + "learning_rate": 4.942078595864441e-06, + "loss": 0.1521, + "step": 1789 + }, + { + "epoch": 0.2900194426441996, + "grad_norm": 1.2064958810806274, + "learning_rate": 4.941984970991372e-06, + "loss": 0.1682, + "step": 1790 + }, + { + "epoch": 0.29018146467919637, + "grad_norm": 1.0891704559326172, + "learning_rate": 4.941891271399473e-06, + "loss": 0.1581, + "step": 1791 + }, + { + "epoch": 0.29034348671419313, + "grad_norm": 1.2679988145828247, + "learning_rate": 4.9417974970916096e-06, + "loss": 0.1671, + "step": 1792 + }, + { + "epoch": 0.2905055087491899, + "grad_norm": 1.0891644954681396, + "learning_rate": 4.941703648070653e-06, + "loss": 0.1621, + "step": 1793 + }, + { + "epoch": 0.29066753078418667, + "grad_norm": 0.9570609331130981, + "learning_rate": 4.9416097243394725e-06, + "loss": 0.1428, + "step": 1794 + }, + { + "epoch": 0.29082955281918343, + "grad_norm": 1.0233310461044312, + "learning_rate": 4.941515725900943e-06, + "loss": 0.158, + "step": 1795 + }, + { + "epoch": 0.29099157485418015, + "grad_norm": 1.1454052925109863, + "learning_rate": 4.94142165275794e-06, + "loss": 0.1722, + "step": 1796 + }, + { + "epoch": 0.2911535968891769, + "grad_norm": 1.0904475450515747, + "learning_rate": 4.941327504913344e-06, + "loss": 0.164, + "step": 1797 + }, + { + "epoch": 0.2913156189241737, + "grad_norm": 1.0754122734069824, + "learning_rate": 4.941233282370034e-06, + "loss": 0.1507, + "step": 1798 + }, + { + "epoch": 0.29147764095917045, + "grad_norm": 1.1219291687011719, + "learning_rate": 4.941138985130893e-06, + "loss": 0.1742, + "step": 1799 + }, + { + "epoch": 0.2916396629941672, + "grad_norm": 1.2050334215164185, + "learning_rate": 4.941044613198807e-06, + "loss": 0.1694, + "step": 1800 + }, + { + "epoch": 0.291801685029164, + "grad_norm": 1.0843576192855835, + "learning_rate": 4.940950166576661e-06, + "loss": 0.1558, + "step": 1801 + }, + { + "epoch": 0.29196370706416075, + "grad_norm": 1.1150481700897217, + "learning_rate": 4.940855645267349e-06, + "loss": 0.1559, + "step": 1802 + }, + { + "epoch": 0.29212572909915746, + "grad_norm": 1.0459260940551758, + "learning_rate": 4.94076104927376e-06, + "loss": 0.1608, + "step": 1803 + }, + { + "epoch": 0.29228775113415423, + "grad_norm": 1.014258623123169, + "learning_rate": 4.94066637859879e-06, + "loss": 0.133, + "step": 1804 + }, + { + "epoch": 0.292449773169151, + "grad_norm": 1.1400861740112305, + "learning_rate": 4.940571633245335e-06, + "loss": 0.1778, + "step": 1805 + }, + { + "epoch": 0.29261179520414776, + "grad_norm": 1.1275112628936768, + "learning_rate": 4.940476813216294e-06, + "loss": 0.1555, + "step": 1806 + }, + { + "epoch": 0.29277381723914453, + "grad_norm": 1.0518170595169067, + "learning_rate": 4.940381918514568e-06, + "loss": 0.159, + "step": 1807 + }, + { + "epoch": 0.2929358392741413, + "grad_norm": 1.0463672876358032, + "learning_rate": 4.940286949143061e-06, + "loss": 0.1614, + "step": 1808 + }, + { + "epoch": 0.29309786130913806, + "grad_norm": 1.200708270072937, + "learning_rate": 4.94019190510468e-06, + "loss": 0.1809, + "step": 1809 + }, + { + "epoch": 0.2932598833441348, + "grad_norm": 1.2362070083618164, + "learning_rate": 4.940096786402331e-06, + "loss": 0.1431, + "step": 1810 + }, + { + "epoch": 0.29342190537913154, + "grad_norm": 1.1330715417861938, + "learning_rate": 4.940001593038925e-06, + "loss": 0.1431, + "step": 1811 + }, + { + "epoch": 0.2935839274141283, + "grad_norm": 1.2720601558685303, + "learning_rate": 4.939906325017374e-06, + "loss": 0.1729, + "step": 1812 + }, + { + "epoch": 0.2937459494491251, + "grad_norm": 1.2702455520629883, + "learning_rate": 4.939810982340595e-06, + "loss": 0.1722, + "step": 1813 + }, + { + "epoch": 0.29390797148412184, + "grad_norm": 1.1870192289352417, + "learning_rate": 4.939715565011504e-06, + "loss": 0.1647, + "step": 1814 + }, + { + "epoch": 0.2940699935191186, + "grad_norm": 1.2095550298690796, + "learning_rate": 4.939620073033021e-06, + "loss": 0.1647, + "step": 1815 + }, + { + "epoch": 0.2942320155541154, + "grad_norm": 1.1870856285095215, + "learning_rate": 4.939524506408068e-06, + "loss": 0.1554, + "step": 1816 + }, + { + "epoch": 0.29439403758911215, + "grad_norm": 1.080191969871521, + "learning_rate": 4.939428865139568e-06, + "loss": 0.1572, + "step": 1817 + }, + { + "epoch": 0.29455605962410886, + "grad_norm": 1.1094964742660522, + "learning_rate": 4.939333149230447e-06, + "loss": 0.1623, + "step": 1818 + }, + { + "epoch": 0.2947180816591056, + "grad_norm": 1.0105398893356323, + "learning_rate": 4.939237358683636e-06, + "loss": 0.1591, + "step": 1819 + }, + { + "epoch": 0.2948801036941024, + "grad_norm": 1.1917641162872314, + "learning_rate": 4.9391414935020656e-06, + "loss": 0.1728, + "step": 1820 + }, + { + "epoch": 0.29504212572909916, + "grad_norm": 1.0402076244354248, + "learning_rate": 4.939045553688666e-06, + "loss": 0.1525, + "step": 1821 + }, + { + "epoch": 0.2952041477640959, + "grad_norm": 1.235533356666565, + "learning_rate": 4.938949539246376e-06, + "loss": 0.1522, + "step": 1822 + }, + { + "epoch": 0.2953661697990927, + "grad_norm": 0.9965629577636719, + "learning_rate": 4.9388534501781325e-06, + "loss": 0.1515, + "step": 1823 + }, + { + "epoch": 0.29552819183408946, + "grad_norm": 1.0826505422592163, + "learning_rate": 4.938757286486874e-06, + "loss": 0.14, + "step": 1824 + }, + { + "epoch": 0.29569021386908617, + "grad_norm": 1.1033453941345215, + "learning_rate": 4.938661048175545e-06, + "loss": 0.1591, + "step": 1825 + }, + { + "epoch": 0.29585223590408294, + "grad_norm": 1.169182538986206, + "learning_rate": 4.93856473524709e-06, + "loss": 0.154, + "step": 1826 + }, + { + "epoch": 0.2960142579390797, + "grad_norm": 1.1595622301101685, + "learning_rate": 4.938468347704455e-06, + "loss": 0.1506, + "step": 1827 + }, + { + "epoch": 0.2961762799740765, + "grad_norm": 1.1768068075180054, + "learning_rate": 4.938371885550589e-06, + "loss": 0.1738, + "step": 1828 + }, + { + "epoch": 0.29633830200907324, + "grad_norm": 1.0577977895736694, + "learning_rate": 4.938275348788443e-06, + "loss": 0.1349, + "step": 1829 + }, + { + "epoch": 0.29650032404407, + "grad_norm": 1.1384469270706177, + "learning_rate": 4.938178737420974e-06, + "loss": 0.1506, + "step": 1830 + }, + { + "epoch": 0.2966623460790668, + "grad_norm": 1.1064151525497437, + "learning_rate": 4.938082051451135e-06, + "loss": 0.1583, + "step": 1831 + }, + { + "epoch": 0.2968243681140635, + "grad_norm": 1.0915504693984985, + "learning_rate": 4.937985290881886e-06, + "loss": 0.1642, + "step": 1832 + }, + { + "epoch": 0.29698639014906025, + "grad_norm": 0.9604697823524475, + "learning_rate": 4.937888455716186e-06, + "loss": 0.1342, + "step": 1833 + }, + { + "epoch": 0.297148412184057, + "grad_norm": 1.1359755992889404, + "learning_rate": 4.9377915459569995e-06, + "loss": 0.161, + "step": 1834 + }, + { + "epoch": 0.2973104342190538, + "grad_norm": 1.0202888250350952, + "learning_rate": 4.93769456160729e-06, + "loss": 0.144, + "step": 1835 + }, + { + "epoch": 0.29747245625405055, + "grad_norm": 1.0656861066818237, + "learning_rate": 4.937597502670027e-06, + "loss": 0.1468, + "step": 1836 + }, + { + "epoch": 0.2976344782890473, + "grad_norm": 1.2268567085266113, + "learning_rate": 4.937500369148179e-06, + "loss": 0.1796, + "step": 1837 + }, + { + "epoch": 0.2977965003240441, + "grad_norm": 1.1543471813201904, + "learning_rate": 4.9374031610447185e-06, + "loss": 0.1666, + "step": 1838 + }, + { + "epoch": 0.29795852235904086, + "grad_norm": 1.1135607957839966, + "learning_rate": 4.9373058783626195e-06, + "loss": 0.1525, + "step": 1839 + }, + { + "epoch": 0.29812054439403757, + "grad_norm": 0.995367169380188, + "learning_rate": 4.937208521104858e-06, + "loss": 0.1383, + "step": 1840 + }, + { + "epoch": 0.29828256642903433, + "grad_norm": 1.0049978494644165, + "learning_rate": 4.9371110892744146e-06, + "loss": 0.1365, + "step": 1841 + }, + { + "epoch": 0.2984445884640311, + "grad_norm": 1.1172624826431274, + "learning_rate": 4.937013582874269e-06, + "loss": 0.1624, + "step": 1842 + }, + { + "epoch": 0.29860661049902787, + "grad_norm": 1.1392813920974731, + "learning_rate": 4.936916001907406e-06, + "loss": 0.1725, + "step": 1843 + }, + { + "epoch": 0.29876863253402464, + "grad_norm": 1.1414449214935303, + "learning_rate": 4.93681834637681e-06, + "loss": 0.1639, + "step": 1844 + }, + { + "epoch": 0.2989306545690214, + "grad_norm": 1.2181705236434937, + "learning_rate": 4.9367206162854695e-06, + "loss": 0.1662, + "step": 1845 + }, + { + "epoch": 0.29909267660401817, + "grad_norm": 1.1003068685531616, + "learning_rate": 4.936622811636376e-06, + "loss": 0.1654, + "step": 1846 + }, + { + "epoch": 0.2992546986390149, + "grad_norm": 1.1408618688583374, + "learning_rate": 4.93652493243252e-06, + "loss": 0.1622, + "step": 1847 + }, + { + "epoch": 0.29941672067401165, + "grad_norm": 1.5164649486541748, + "learning_rate": 4.936426978676897e-06, + "loss": 0.1466, + "step": 1848 + }, + { + "epoch": 0.2995787427090084, + "grad_norm": 1.0539342164993286, + "learning_rate": 4.9363289503725055e-06, + "loss": 0.1605, + "step": 1849 + }, + { + "epoch": 0.2997407647440052, + "grad_norm": 1.0239052772521973, + "learning_rate": 4.936230847522343e-06, + "loss": 0.1398, + "step": 1850 + }, + { + "epoch": 0.29990278677900195, + "grad_norm": 0.9674493670463562, + "learning_rate": 4.9361326701294124e-06, + "loss": 0.1389, + "step": 1851 + }, + { + "epoch": 0.3000648088139987, + "grad_norm": 1.2970346212387085, + "learning_rate": 4.936034418196718e-06, + "loss": 0.1836, + "step": 1852 + }, + { + "epoch": 0.3002268308489955, + "grad_norm": 1.1987262964248657, + "learning_rate": 4.935936091727264e-06, + "loss": 0.1852, + "step": 1853 + }, + { + "epoch": 0.3003888528839922, + "grad_norm": 1.051525354385376, + "learning_rate": 4.935837690724063e-06, + "loss": 0.1552, + "step": 1854 + }, + { + "epoch": 0.30055087491898896, + "grad_norm": 1.2423964738845825, + "learning_rate": 4.9357392151901204e-06, + "loss": 0.1768, + "step": 1855 + }, + { + "epoch": 0.30071289695398573, + "grad_norm": 1.0378652811050415, + "learning_rate": 4.935640665128454e-06, + "loss": 0.1486, + "step": 1856 + }, + { + "epoch": 0.3008749189889825, + "grad_norm": 0.9993836879730225, + "learning_rate": 4.935542040542077e-06, + "loss": 0.1506, + "step": 1857 + }, + { + "epoch": 0.30103694102397927, + "grad_norm": 1.056410789489746, + "learning_rate": 4.935443341434008e-06, + "loss": 0.1442, + "step": 1858 + }, + { + "epoch": 0.30119896305897603, + "grad_norm": 1.1144357919692993, + "learning_rate": 4.935344567807265e-06, + "loss": 0.1679, + "step": 1859 + }, + { + "epoch": 0.3013609850939728, + "grad_norm": 0.9949638247489929, + "learning_rate": 4.935245719664873e-06, + "loss": 0.1405, + "step": 1860 + }, + { + "epoch": 0.30152300712896957, + "grad_norm": 1.090957760810852, + "learning_rate": 4.935146797009854e-06, + "loss": 0.1663, + "step": 1861 + }, + { + "epoch": 0.3016850291639663, + "grad_norm": 1.0422848463058472, + "learning_rate": 4.935047799845238e-06, + "loss": 0.1413, + "step": 1862 + }, + { + "epoch": 0.30184705119896305, + "grad_norm": 1.0720633268356323, + "learning_rate": 4.93494872817405e-06, + "loss": 0.1583, + "step": 1863 + }, + { + "epoch": 0.3020090732339598, + "grad_norm": 1.1506516933441162, + "learning_rate": 4.9348495819993235e-06, + "loss": 0.1554, + "step": 1864 + }, + { + "epoch": 0.3021710952689566, + "grad_norm": 1.3008663654327393, + "learning_rate": 4.934750361324092e-06, + "loss": 0.1636, + "step": 1865 + }, + { + "epoch": 0.30233311730395335, + "grad_norm": 1.1331379413604736, + "learning_rate": 4.9346510661513924e-06, + "loss": 0.149, + "step": 1866 + }, + { + "epoch": 0.3024951393389501, + "grad_norm": 1.184205174446106, + "learning_rate": 4.934551696484262e-06, + "loss": 0.1718, + "step": 1867 + }, + { + "epoch": 0.3026571613739469, + "grad_norm": 1.1299738883972168, + "learning_rate": 4.93445225232574e-06, + "loss": 0.1697, + "step": 1868 + }, + { + "epoch": 0.3028191834089436, + "grad_norm": 1.1674683094024658, + "learning_rate": 4.934352733678871e-06, + "loss": 0.1672, + "step": 1869 + }, + { + "epoch": 0.30298120544394036, + "grad_norm": 1.0782699584960938, + "learning_rate": 4.9342531405467e-06, + "loss": 0.165, + "step": 1870 + }, + { + "epoch": 0.3031432274789371, + "grad_norm": 1.0888804197311401, + "learning_rate": 4.934153472932272e-06, + "loss": 0.1549, + "step": 1871 + }, + { + "epoch": 0.3033052495139339, + "grad_norm": 1.1337841749191284, + "learning_rate": 4.934053730838639e-06, + "loss": 0.1743, + "step": 1872 + }, + { + "epoch": 0.30346727154893066, + "grad_norm": 1.0406060218811035, + "learning_rate": 4.933953914268853e-06, + "loss": 0.1504, + "step": 1873 + }, + { + "epoch": 0.30362929358392743, + "grad_norm": 1.1255491971969604, + "learning_rate": 4.9338540232259664e-06, + "loss": 0.1566, + "step": 1874 + }, + { + "epoch": 0.3037913156189242, + "grad_norm": 1.169628381729126, + "learning_rate": 4.933754057713037e-06, + "loss": 0.1566, + "step": 1875 + }, + { + "epoch": 0.3039533376539209, + "grad_norm": 1.4170974493026733, + "learning_rate": 4.9336540177331225e-06, + "loss": 0.1623, + "step": 1876 + }, + { + "epoch": 0.3041153596889177, + "grad_norm": 1.1771212816238403, + "learning_rate": 4.933553903289285e-06, + "loss": 0.1438, + "step": 1877 + }, + { + "epoch": 0.30427738172391444, + "grad_norm": 1.251401662826538, + "learning_rate": 4.9334537143845876e-06, + "loss": 0.1762, + "step": 1878 + }, + { + "epoch": 0.3044394037589112, + "grad_norm": 1.1183122396469116, + "learning_rate": 4.933353451022094e-06, + "loss": 0.1617, + "step": 1879 + }, + { + "epoch": 0.304601425793908, + "grad_norm": 1.187208652496338, + "learning_rate": 4.933253113204874e-06, + "loss": 0.1815, + "step": 1880 + }, + { + "epoch": 0.30476344782890474, + "grad_norm": 1.1539027690887451, + "learning_rate": 4.933152700935997e-06, + "loss": 0.1718, + "step": 1881 + }, + { + "epoch": 0.3049254698639015, + "grad_norm": 1.0791608095169067, + "learning_rate": 4.933052214218535e-06, + "loss": 0.1562, + "step": 1882 + }, + { + "epoch": 0.3050874918988983, + "grad_norm": 1.1433926820755005, + "learning_rate": 4.932951653055564e-06, + "loss": 0.1717, + "step": 1883 + }, + { + "epoch": 0.305249513933895, + "grad_norm": 1.2988801002502441, + "learning_rate": 4.93285101745016e-06, + "loss": 0.1815, + "step": 1884 + }, + { + "epoch": 0.30541153596889176, + "grad_norm": 1.1645463705062866, + "learning_rate": 4.932750307405402e-06, + "loss": 0.1674, + "step": 1885 + }, + { + "epoch": 0.3055735580038885, + "grad_norm": 1.1217708587646484, + "learning_rate": 4.932649522924372e-06, + "loss": 0.1636, + "step": 1886 + }, + { + "epoch": 0.3057355800388853, + "grad_norm": 1.087047815322876, + "learning_rate": 4.932548664010153e-06, + "loss": 0.1462, + "step": 1887 + }, + { + "epoch": 0.30589760207388206, + "grad_norm": 1.0934066772460938, + "learning_rate": 4.932447730665832e-06, + "loss": 0.176, + "step": 1888 + }, + { + "epoch": 0.3060596241088788, + "grad_norm": 1.0176986455917358, + "learning_rate": 4.9323467228944965e-06, + "loss": 0.1458, + "step": 1889 + }, + { + "epoch": 0.3062216461438756, + "grad_norm": 0.9754590392112732, + "learning_rate": 4.932245640699238e-06, + "loss": 0.1338, + "step": 1890 + }, + { + "epoch": 0.3063836681788723, + "grad_norm": 1.1034765243530273, + "learning_rate": 4.932144484083148e-06, + "loss": 0.16, + "step": 1891 + }, + { + "epoch": 0.30654569021386907, + "grad_norm": 1.0700440406799316, + "learning_rate": 4.932043253049323e-06, + "loss": 0.1531, + "step": 1892 + }, + { + "epoch": 0.30670771224886584, + "grad_norm": 1.2314177751541138, + "learning_rate": 4.93194194760086e-06, + "loss": 0.1607, + "step": 1893 + }, + { + "epoch": 0.3068697342838626, + "grad_norm": 1.1649889945983887, + "learning_rate": 4.931840567740858e-06, + "loss": 0.1783, + "step": 1894 + }, + { + "epoch": 0.3070317563188594, + "grad_norm": 1.099457025527954, + "learning_rate": 4.9317391134724195e-06, + "loss": 0.1763, + "step": 1895 + }, + { + "epoch": 0.30719377835385614, + "grad_norm": 1.0466156005859375, + "learning_rate": 4.93163758479865e-06, + "loss": 0.1406, + "step": 1896 + }, + { + "epoch": 0.3073558003888529, + "grad_norm": 1.3971292972564697, + "learning_rate": 4.931535981722654e-06, + "loss": 0.1728, + "step": 1897 + }, + { + "epoch": 0.3075178224238496, + "grad_norm": 1.2202054262161255, + "learning_rate": 4.931434304247541e-06, + "loss": 0.1857, + "step": 1898 + }, + { + "epoch": 0.3076798444588464, + "grad_norm": 1.1348539590835571, + "learning_rate": 4.931332552376422e-06, + "loss": 0.1688, + "step": 1899 + }, + { + "epoch": 0.30784186649384315, + "grad_norm": 1.024888038635254, + "learning_rate": 4.931230726112412e-06, + "loss": 0.1495, + "step": 1900 + }, + { + "epoch": 0.3080038885288399, + "grad_norm": 1.1341696977615356, + "learning_rate": 4.931128825458623e-06, + "loss": 0.1586, + "step": 1901 + }, + { + "epoch": 0.3081659105638367, + "grad_norm": 1.094712495803833, + "learning_rate": 4.9310268504181764e-06, + "loss": 0.1744, + "step": 1902 + }, + { + "epoch": 0.30832793259883345, + "grad_norm": 1.1410554647445679, + "learning_rate": 4.930924800994192e-06, + "loss": 0.1684, + "step": 1903 + }, + { + "epoch": 0.3084899546338302, + "grad_norm": 1.1105761528015137, + "learning_rate": 4.930822677189791e-06, + "loss": 0.1649, + "step": 1904 + }, + { + "epoch": 0.30865197666882693, + "grad_norm": 0.9734887480735779, + "learning_rate": 4.930720479008098e-06, + "loss": 0.1453, + "step": 1905 + }, + { + "epoch": 0.3088139987038237, + "grad_norm": 1.0480237007141113, + "learning_rate": 4.93061820645224e-06, + "loss": 0.1511, + "step": 1906 + }, + { + "epoch": 0.30897602073882047, + "grad_norm": 1.168448567390442, + "learning_rate": 4.930515859525348e-06, + "loss": 0.151, + "step": 1907 + }, + { + "epoch": 0.30913804277381723, + "grad_norm": 1.0188759565353394, + "learning_rate": 4.930413438230552e-06, + "loss": 0.1355, + "step": 1908 + }, + { + "epoch": 0.309300064808814, + "grad_norm": 1.064790964126587, + "learning_rate": 4.930310942570987e-06, + "loss": 0.1482, + "step": 1909 + }, + { + "epoch": 0.30946208684381077, + "grad_norm": 1.1251689195632935, + "learning_rate": 4.930208372549787e-06, + "loss": 0.1616, + "step": 1910 + }, + { + "epoch": 0.30962410887880754, + "grad_norm": 1.050378441810608, + "learning_rate": 4.930105728170093e-06, + "loss": 0.1405, + "step": 1911 + }, + { + "epoch": 0.3097861309138043, + "grad_norm": 1.0885461568832397, + "learning_rate": 4.930003009435043e-06, + "loss": 0.1513, + "step": 1912 + }, + { + "epoch": 0.309948152948801, + "grad_norm": 1.0849794149398804, + "learning_rate": 4.929900216347783e-06, + "loss": 0.1529, + "step": 1913 + }, + { + "epoch": 0.3101101749837978, + "grad_norm": 1.1687252521514893, + "learning_rate": 4.9297973489114565e-06, + "loss": 0.1697, + "step": 1914 + }, + { + "epoch": 0.31027219701879455, + "grad_norm": 1.0808390378952026, + "learning_rate": 4.929694407129211e-06, + "loss": 0.1474, + "step": 1915 + }, + { + "epoch": 0.3104342190537913, + "grad_norm": 1.0042861700057983, + "learning_rate": 4.929591391004196e-06, + "loss": 0.1445, + "step": 1916 + }, + { + "epoch": 0.3105962410887881, + "grad_norm": 1.0821775197982788, + "learning_rate": 4.929488300539564e-06, + "loss": 0.1594, + "step": 1917 + }, + { + "epoch": 0.31075826312378485, + "grad_norm": 1.214570164680481, + "learning_rate": 4.929385135738469e-06, + "loss": 0.1792, + "step": 1918 + }, + { + "epoch": 0.3109202851587816, + "grad_norm": 1.0570710897445679, + "learning_rate": 4.929281896604068e-06, + "loss": 0.1638, + "step": 1919 + }, + { + "epoch": 0.31108230719377833, + "grad_norm": 0.8534372448921204, + "learning_rate": 4.92917858313952e-06, + "loss": 0.121, + "step": 1920 + }, + { + "epoch": 0.3112443292287751, + "grad_norm": 1.1174410581588745, + "learning_rate": 4.9290751953479856e-06, + "loss": 0.1623, + "step": 1921 + }, + { + "epoch": 0.31140635126377186, + "grad_norm": 1.2499920129776, + "learning_rate": 4.928971733232628e-06, + "loss": 0.1853, + "step": 1922 + }, + { + "epoch": 0.31156837329876863, + "grad_norm": 0.961334764957428, + "learning_rate": 4.928868196796615e-06, + "loss": 0.1464, + "step": 1923 + }, + { + "epoch": 0.3117303953337654, + "grad_norm": 1.0711408853530884, + "learning_rate": 4.928764586043111e-06, + "loss": 0.1597, + "step": 1924 + }, + { + "epoch": 0.31189241736876216, + "grad_norm": 1.0463709831237793, + "learning_rate": 4.928660900975289e-06, + "loss": 0.1594, + "step": 1925 + }, + { + "epoch": 0.31205443940375893, + "grad_norm": 1.1414794921875, + "learning_rate": 4.9285571415963205e-06, + "loss": 0.1703, + "step": 1926 + }, + { + "epoch": 0.31221646143875564, + "grad_norm": 1.1475234031677246, + "learning_rate": 4.928453307909381e-06, + "loss": 0.177, + "step": 1927 + }, + { + "epoch": 0.3123784834737524, + "grad_norm": 0.9466526508331299, + "learning_rate": 4.928349399917646e-06, + "loss": 0.1405, + "step": 1928 + }, + { + "epoch": 0.3125405055087492, + "grad_norm": 1.0631221532821655, + "learning_rate": 4.928245417624297e-06, + "loss": 0.1643, + "step": 1929 + }, + { + "epoch": 0.31270252754374595, + "grad_norm": 1.1224422454833984, + "learning_rate": 4.928141361032513e-06, + "loss": 0.1689, + "step": 1930 + }, + { + "epoch": 0.3128645495787427, + "grad_norm": 1.0592671632766724, + "learning_rate": 4.928037230145481e-06, + "loss": 0.1495, + "step": 1931 + }, + { + "epoch": 0.3130265716137395, + "grad_norm": 1.071808099746704, + "learning_rate": 4.927933024966385e-06, + "loss": 0.1601, + "step": 1932 + }, + { + "epoch": 0.31318859364873625, + "grad_norm": 1.0835838317871094, + "learning_rate": 4.927828745498414e-06, + "loss": 0.1623, + "step": 1933 + }, + { + "epoch": 0.313350615683733, + "grad_norm": 1.1354283094406128, + "learning_rate": 4.927724391744758e-06, + "loss": 0.145, + "step": 1934 + }, + { + "epoch": 0.3135126377187297, + "grad_norm": 1.1181296110153198, + "learning_rate": 4.9276199637086106e-06, + "loss": 0.1633, + "step": 1935 + }, + { + "epoch": 0.3136746597537265, + "grad_norm": 1.1335471868515015, + "learning_rate": 4.927515461393167e-06, + "loss": 0.1455, + "step": 1936 + }, + { + "epoch": 0.31383668178872326, + "grad_norm": 1.091052770614624, + "learning_rate": 4.927410884801626e-06, + "loss": 0.1619, + "step": 1937 + }, + { + "epoch": 0.31399870382372, + "grad_norm": 1.1227446794509888, + "learning_rate": 4.927306233937185e-06, + "loss": 0.1757, + "step": 1938 + }, + { + "epoch": 0.3141607258587168, + "grad_norm": 0.9883224368095398, + "learning_rate": 4.927201508803048e-06, + "loss": 0.1359, + "step": 1939 + }, + { + "epoch": 0.31432274789371356, + "grad_norm": 1.1337419748306274, + "learning_rate": 4.927096709402417e-06, + "loss": 0.1501, + "step": 1940 + }, + { + "epoch": 0.31448476992871033, + "grad_norm": 1.0160683393478394, + "learning_rate": 4.9269918357385015e-06, + "loss": 0.143, + "step": 1941 + }, + { + "epoch": 0.31464679196370704, + "grad_norm": 1.1797884702682495, + "learning_rate": 4.926886887814509e-06, + "loss": 0.1696, + "step": 1942 + }, + { + "epoch": 0.3148088139987038, + "grad_norm": 1.0038928985595703, + "learning_rate": 4.92678186563365e-06, + "loss": 0.1523, + "step": 1943 + }, + { + "epoch": 0.3149708360337006, + "grad_norm": 1.1223050355911255, + "learning_rate": 4.926676769199139e-06, + "loss": 0.1695, + "step": 1944 + }, + { + "epoch": 0.31513285806869734, + "grad_norm": 1.1693401336669922, + "learning_rate": 4.9265715985141914e-06, + "loss": 0.1676, + "step": 1945 + }, + { + "epoch": 0.3152948801036941, + "grad_norm": 1.0387099981307983, + "learning_rate": 4.9264663535820256e-06, + "loss": 0.1502, + "step": 1946 + }, + { + "epoch": 0.3154569021386909, + "grad_norm": 1.1783015727996826, + "learning_rate": 4.926361034405861e-06, + "loss": 0.1697, + "step": 1947 + }, + { + "epoch": 0.31561892417368764, + "grad_norm": 1.137247920036316, + "learning_rate": 4.926255640988919e-06, + "loss": 0.1691, + "step": 1948 + }, + { + "epoch": 0.31578094620868435, + "grad_norm": 1.0531809329986572, + "learning_rate": 4.926150173334427e-06, + "loss": 0.1597, + "step": 1949 + }, + { + "epoch": 0.3159429682436811, + "grad_norm": 1.0189039707183838, + "learning_rate": 4.926044631445611e-06, + "loss": 0.1597, + "step": 1950 + }, + { + "epoch": 0.3161049902786779, + "grad_norm": 1.1217700242996216, + "learning_rate": 4.9259390153257006e-06, + "loss": 0.1609, + "step": 1951 + }, + { + "epoch": 0.31626701231367466, + "grad_norm": 1.0535531044006348, + "learning_rate": 4.925833324977926e-06, + "loss": 0.1584, + "step": 1952 + }, + { + "epoch": 0.3164290343486714, + "grad_norm": 1.066357970237732, + "learning_rate": 4.925727560405522e-06, + "loss": 0.1622, + "step": 1953 + }, + { + "epoch": 0.3165910563836682, + "grad_norm": 1.065629005432129, + "learning_rate": 4.925621721611726e-06, + "loss": 0.1709, + "step": 1954 + }, + { + "epoch": 0.31675307841866496, + "grad_norm": 1.0723075866699219, + "learning_rate": 4.925515808599774e-06, + "loss": 0.165, + "step": 1955 + }, + { + "epoch": 0.3169151004536617, + "grad_norm": 1.1291669607162476, + "learning_rate": 4.925409821372908e-06, + "loss": 0.1531, + "step": 1956 + }, + { + "epoch": 0.31707712248865844, + "grad_norm": 1.150839924812317, + "learning_rate": 4.925303759934372e-06, + "loss": 0.1676, + "step": 1957 + }, + { + "epoch": 0.3172391445236552, + "grad_norm": 1.0867832899093628, + "learning_rate": 4.925197624287409e-06, + "loss": 0.1749, + "step": 1958 + }, + { + "epoch": 0.31740116655865197, + "grad_norm": 1.1858301162719727, + "learning_rate": 4.925091414435268e-06, + "loss": 0.1802, + "step": 1959 + }, + { + "epoch": 0.31756318859364874, + "grad_norm": 1.1005200147628784, + "learning_rate": 4.924985130381198e-06, + "loss": 0.1735, + "step": 1960 + }, + { + "epoch": 0.3177252106286455, + "grad_norm": 1.015781044960022, + "learning_rate": 4.924878772128452e-06, + "loss": 0.1494, + "step": 1961 + }, + { + "epoch": 0.31788723266364227, + "grad_norm": 1.1282938718795776, + "learning_rate": 4.924772339680283e-06, + "loss": 0.1528, + "step": 1962 + }, + { + "epoch": 0.31804925469863904, + "grad_norm": 1.103499412536621, + "learning_rate": 4.9246658330399474e-06, + "loss": 0.1722, + "step": 1963 + }, + { + "epoch": 0.31821127673363575, + "grad_norm": 1.0378453731536865, + "learning_rate": 4.9245592522107065e-06, + "loss": 0.1648, + "step": 1964 + }, + { + "epoch": 0.3183732987686325, + "grad_norm": 1.1685363054275513, + "learning_rate": 4.924452597195819e-06, + "loss": 0.1796, + "step": 1965 + }, + { + "epoch": 0.3185353208036293, + "grad_norm": 1.0074273347854614, + "learning_rate": 4.92434586799855e-06, + "loss": 0.1525, + "step": 1966 + }, + { + "epoch": 0.31869734283862605, + "grad_norm": 0.9873900413513184, + "learning_rate": 4.924239064622163e-06, + "loss": 0.146, + "step": 1967 + }, + { + "epoch": 0.3188593648736228, + "grad_norm": 1.1330924034118652, + "learning_rate": 4.924132187069928e-06, + "loss": 0.1609, + "step": 1968 + }, + { + "epoch": 0.3190213869086196, + "grad_norm": 1.1219487190246582, + "learning_rate": 4.924025235345114e-06, + "loss": 0.1646, + "step": 1969 + }, + { + "epoch": 0.31918340894361635, + "grad_norm": 1.068410873413086, + "learning_rate": 4.923918209450994e-06, + "loss": 0.154, + "step": 1970 + }, + { + "epoch": 0.31934543097861307, + "grad_norm": 1.062517762184143, + "learning_rate": 4.923811109390843e-06, + "loss": 0.1454, + "step": 1971 + }, + { + "epoch": 0.31950745301360983, + "grad_norm": 1.0570393800735474, + "learning_rate": 4.9237039351679365e-06, + "loss": 0.1551, + "step": 1972 + }, + { + "epoch": 0.3196694750486066, + "grad_norm": 1.120102882385254, + "learning_rate": 4.923596686785556e-06, + "loss": 0.1628, + "step": 1973 + }, + { + "epoch": 0.31983149708360337, + "grad_norm": 1.1168919801712036, + "learning_rate": 4.923489364246981e-06, + "loss": 0.1715, + "step": 1974 + }, + { + "epoch": 0.31999351911860013, + "grad_norm": 1.2127281427383423, + "learning_rate": 4.923381967555496e-06, + "loss": 0.1598, + "step": 1975 + }, + { + "epoch": 0.3201555411535969, + "grad_norm": 1.1189041137695312, + "learning_rate": 4.923274496714387e-06, + "loss": 0.1604, + "step": 1976 + }, + { + "epoch": 0.32031756318859367, + "grad_norm": 1.0939730405807495, + "learning_rate": 4.923166951726945e-06, + "loss": 0.1557, + "step": 1977 + }, + { + "epoch": 0.32047958522359044, + "grad_norm": 1.0992319583892822, + "learning_rate": 4.923059332596456e-06, + "loss": 0.1615, + "step": 1978 + }, + { + "epoch": 0.32064160725858715, + "grad_norm": 1.0739459991455078, + "learning_rate": 4.922951639326215e-06, + "loss": 0.1537, + "step": 1979 + }, + { + "epoch": 0.3208036292935839, + "grad_norm": 1.1589041948318481, + "learning_rate": 4.922843871919518e-06, + "loss": 0.1663, + "step": 1980 + }, + { + "epoch": 0.3209656513285807, + "grad_norm": 1.0140794515609741, + "learning_rate": 4.922736030379662e-06, + "loss": 0.1435, + "step": 1981 + }, + { + "epoch": 0.32112767336357745, + "grad_norm": 1.1577235460281372, + "learning_rate": 4.922628114709945e-06, + "loss": 0.1815, + "step": 1982 + }, + { + "epoch": 0.3212896953985742, + "grad_norm": 0.9740235805511475, + "learning_rate": 4.922520124913672e-06, + "loss": 0.1442, + "step": 1983 + }, + { + "epoch": 0.321451717433571, + "grad_norm": 1.3087810277938843, + "learning_rate": 4.922412060994145e-06, + "loss": 0.1556, + "step": 1984 + }, + { + "epoch": 0.32161373946856775, + "grad_norm": 1.1714460849761963, + "learning_rate": 4.922303922954671e-06, + "loss": 0.1828, + "step": 1985 + }, + { + "epoch": 0.32177576150356446, + "grad_norm": 0.9607410430908203, + "learning_rate": 4.922195710798559e-06, + "loss": 0.1292, + "step": 1986 + }, + { + "epoch": 0.32193778353856123, + "grad_norm": 1.0208156108856201, + "learning_rate": 4.9220874245291194e-06, + "loss": 0.143, + "step": 1987 + }, + { + "epoch": 0.322099805573558, + "grad_norm": 1.004679560661316, + "learning_rate": 4.9219790641496656e-06, + "loss": 0.1454, + "step": 1988 + }, + { + "epoch": 0.32226182760855476, + "grad_norm": 1.2176995277404785, + "learning_rate": 4.921870629663514e-06, + "loss": 0.1631, + "step": 1989 + }, + { + "epoch": 0.32242384964355153, + "grad_norm": 1.067450761795044, + "learning_rate": 4.9217621210739826e-06, + "loss": 0.1605, + "step": 1990 + }, + { + "epoch": 0.3225858716785483, + "grad_norm": 1.1650382280349731, + "learning_rate": 4.92165353838439e-06, + "loss": 0.1758, + "step": 1991 + }, + { + "epoch": 0.32274789371354506, + "grad_norm": 1.0696114301681519, + "learning_rate": 4.921544881598059e-06, + "loss": 0.1483, + "step": 1992 + }, + { + "epoch": 0.3229099157485418, + "grad_norm": 1.1148940324783325, + "learning_rate": 4.921436150718316e-06, + "loss": 0.1585, + "step": 1993 + }, + { + "epoch": 0.32307193778353854, + "grad_norm": 1.0661381483078003, + "learning_rate": 4.921327345748486e-06, + "loss": 0.1714, + "step": 1994 + }, + { + "epoch": 0.3232339598185353, + "grad_norm": 1.0887075662612915, + "learning_rate": 4.921218466691898e-06, + "loss": 0.1494, + "step": 1995 + }, + { + "epoch": 0.3233959818535321, + "grad_norm": 1.2316148281097412, + "learning_rate": 4.921109513551885e-06, + "loss": 0.1681, + "step": 1996 + }, + { + "epoch": 0.32355800388852884, + "grad_norm": 0.982879102230072, + "learning_rate": 4.92100048633178e-06, + "loss": 0.1363, + "step": 1997 + }, + { + "epoch": 0.3237200259235256, + "grad_norm": 1.0092542171478271, + "learning_rate": 4.920891385034918e-06, + "loss": 0.1371, + "step": 1998 + }, + { + "epoch": 0.3238820479585224, + "grad_norm": 1.0851917266845703, + "learning_rate": 4.9207822096646385e-06, + "loss": 0.144, + "step": 1999 + }, + { + "epoch": 0.32404406999351915, + "grad_norm": 0.9787352681159973, + "learning_rate": 4.920672960224282e-06, + "loss": 0.146, + "step": 2000 + }, + { + "epoch": 0.32420609202851586, + "grad_norm": 1.1132274866104126, + "learning_rate": 4.92056363671719e-06, + "loss": 0.1693, + "step": 2001 + }, + { + "epoch": 0.3243681140635126, + "grad_norm": 1.1033443212509155, + "learning_rate": 4.920454239146709e-06, + "loss": 0.1584, + "step": 2002 + }, + { + "epoch": 0.3245301360985094, + "grad_norm": 1.1150498390197754, + "learning_rate": 4.920344767516186e-06, + "loss": 0.1472, + "step": 2003 + }, + { + "epoch": 0.32469215813350616, + "grad_norm": 1.13274347782135, + "learning_rate": 4.92023522182897e-06, + "loss": 0.1729, + "step": 2004 + }, + { + "epoch": 0.3248541801685029, + "grad_norm": 0.9556041359901428, + "learning_rate": 4.920125602088412e-06, + "loss": 0.1365, + "step": 2005 + }, + { + "epoch": 0.3250162022034997, + "grad_norm": 0.979815661907196, + "learning_rate": 4.9200159082978685e-06, + "loss": 0.1376, + "step": 2006 + }, + { + "epoch": 0.32517822423849646, + "grad_norm": 1.1807565689086914, + "learning_rate": 4.919906140460693e-06, + "loss": 0.1665, + "step": 2007 + }, + { + "epoch": 0.32534024627349317, + "grad_norm": 1.0704436302185059, + "learning_rate": 4.919796298580247e-06, + "loss": 0.158, + "step": 2008 + }, + { + "epoch": 0.32550226830848994, + "grad_norm": 1.0892672538757324, + "learning_rate": 4.919686382659889e-06, + "loss": 0.1461, + "step": 2009 + }, + { + "epoch": 0.3256642903434867, + "grad_norm": 1.1033027172088623, + "learning_rate": 4.919576392702984e-06, + "loss": 0.1592, + "step": 2010 + }, + { + "epoch": 0.3258263123784835, + "grad_norm": 1.2258738279342651, + "learning_rate": 4.919466328712897e-06, + "loss": 0.1911, + "step": 2011 + }, + { + "epoch": 0.32598833441348024, + "grad_norm": 0.9703921675682068, + "learning_rate": 4.9193561906929945e-06, + "loss": 0.1397, + "step": 2012 + }, + { + "epoch": 0.326150356448477, + "grad_norm": 1.1200995445251465, + "learning_rate": 4.919245978646648e-06, + "loss": 0.1705, + "step": 2013 + }, + { + "epoch": 0.3263123784834738, + "grad_norm": 1.0445135831832886, + "learning_rate": 4.919135692577229e-06, + "loss": 0.1488, + "step": 2014 + }, + { + "epoch": 0.3264744005184705, + "grad_norm": 0.9916481375694275, + "learning_rate": 4.919025332488111e-06, + "loss": 0.134, + "step": 2015 + }, + { + "epoch": 0.32663642255346725, + "grad_norm": 1.1209403276443481, + "learning_rate": 4.918914898382673e-06, + "loss": 0.1423, + "step": 2016 + }, + { + "epoch": 0.326798444588464, + "grad_norm": 0.954210638999939, + "learning_rate": 4.918804390264292e-06, + "loss": 0.1461, + "step": 2017 + }, + { + "epoch": 0.3269604666234608, + "grad_norm": 1.1691701412200928, + "learning_rate": 4.91869380813635e-06, + "loss": 0.1693, + "step": 2018 + }, + { + "epoch": 0.32712248865845756, + "grad_norm": 1.2081804275512695, + "learning_rate": 4.918583152002231e-06, + "loss": 0.1663, + "step": 2019 + }, + { + "epoch": 0.3272845106934543, + "grad_norm": 1.0673002004623413, + "learning_rate": 4.91847242186532e-06, + "loss": 0.1625, + "step": 2020 + }, + { + "epoch": 0.3274465327284511, + "grad_norm": 1.0329972505569458, + "learning_rate": 4.918361617729006e-06, + "loss": 0.1458, + "step": 2021 + }, + { + "epoch": 0.3276085547634478, + "grad_norm": 1.0796326398849487, + "learning_rate": 4.918250739596678e-06, + "loss": 0.1394, + "step": 2022 + }, + { + "epoch": 0.32777057679844457, + "grad_norm": 1.0734831094741821, + "learning_rate": 4.91813978747173e-06, + "loss": 0.1557, + "step": 2023 + }, + { + "epoch": 0.32793259883344134, + "grad_norm": 1.1349564790725708, + "learning_rate": 4.918028761357557e-06, + "loss": 0.1694, + "step": 2024 + }, + { + "epoch": 0.3280946208684381, + "grad_norm": 1.027542233467102, + "learning_rate": 4.917917661257554e-06, + "loss": 0.1479, + "step": 2025 + }, + { + "epoch": 0.32825664290343487, + "grad_norm": 1.1970611810684204, + "learning_rate": 4.917806487175123e-06, + "loss": 0.1549, + "step": 2026 + }, + { + "epoch": 0.32841866493843164, + "grad_norm": 1.0326762199401855, + "learning_rate": 4.917695239113665e-06, + "loss": 0.1454, + "step": 2027 + }, + { + "epoch": 0.3285806869734284, + "grad_norm": 1.1361476182937622, + "learning_rate": 4.917583917076581e-06, + "loss": 0.1529, + "step": 2028 + }, + { + "epoch": 0.32874270900842517, + "grad_norm": 1.0551778078079224, + "learning_rate": 4.917472521067281e-06, + "loss": 0.155, + "step": 2029 + }, + { + "epoch": 0.3289047310434219, + "grad_norm": 1.0310840606689453, + "learning_rate": 4.917361051089172e-06, + "loss": 0.1433, + "step": 2030 + }, + { + "epoch": 0.32906675307841865, + "grad_norm": 1.0534615516662598, + "learning_rate": 4.917249507145665e-06, + "loss": 0.1675, + "step": 2031 + }, + { + "epoch": 0.3292287751134154, + "grad_norm": 1.0969111919403076, + "learning_rate": 4.917137889240172e-06, + "loss": 0.1741, + "step": 2032 + }, + { + "epoch": 0.3293907971484122, + "grad_norm": 1.0823769569396973, + "learning_rate": 4.91702619737611e-06, + "loss": 0.1621, + "step": 2033 + }, + { + "epoch": 0.32955281918340895, + "grad_norm": 1.058327317237854, + "learning_rate": 4.916914431556895e-06, + "loss": 0.1553, + "step": 2034 + }, + { + "epoch": 0.3297148412184057, + "grad_norm": 1.0909065008163452, + "learning_rate": 4.9168025917859465e-06, + "loss": 0.1621, + "step": 2035 + }, + { + "epoch": 0.3298768632534025, + "grad_norm": 1.1126458644866943, + "learning_rate": 4.916690678066688e-06, + "loss": 0.1628, + "step": 2036 + }, + { + "epoch": 0.3300388852883992, + "grad_norm": 1.1230723857879639, + "learning_rate": 4.916578690402542e-06, + "loss": 0.1714, + "step": 2037 + }, + { + "epoch": 0.33020090732339596, + "grad_norm": 1.2173502445220947, + "learning_rate": 4.916466628796938e-06, + "loss": 0.1657, + "step": 2038 + }, + { + "epoch": 0.33036292935839273, + "grad_norm": 0.9629350900650024, + "learning_rate": 4.916354493253301e-06, + "loss": 0.1262, + "step": 2039 + }, + { + "epoch": 0.3305249513933895, + "grad_norm": 1.159567952156067, + "learning_rate": 4.9162422837750654e-06, + "loss": 0.1867, + "step": 2040 + }, + { + "epoch": 0.33068697342838627, + "grad_norm": 1.039483904838562, + "learning_rate": 4.916130000365662e-06, + "loss": 0.1369, + "step": 2041 + }, + { + "epoch": 0.33084899546338303, + "grad_norm": 1.0981976985931396, + "learning_rate": 4.916017643028529e-06, + "loss": 0.1542, + "step": 2042 + }, + { + "epoch": 0.3310110174983798, + "grad_norm": 1.1774072647094727, + "learning_rate": 4.915905211767101e-06, + "loss": 0.1406, + "step": 2043 + }, + { + "epoch": 0.3311730395333765, + "grad_norm": 1.048284649848938, + "learning_rate": 4.915792706584821e-06, + "loss": 0.1514, + "step": 2044 + }, + { + "epoch": 0.3313350615683733, + "grad_norm": 1.050802230834961, + "learning_rate": 4.9156801274851295e-06, + "loss": 0.1515, + "step": 2045 + }, + { + "epoch": 0.33149708360337005, + "grad_norm": 1.0515334606170654, + "learning_rate": 4.9155674744714725e-06, + "loss": 0.1386, + "step": 2046 + }, + { + "epoch": 0.3316591056383668, + "grad_norm": 0.9953398704528809, + "learning_rate": 4.915454747547296e-06, + "loss": 0.1465, + "step": 2047 + }, + { + "epoch": 0.3318211276733636, + "grad_norm": 0.9482908844947815, + "learning_rate": 4.91534194671605e-06, + "loss": 0.14, + "step": 2048 + }, + { + "epoch": 0.33198314970836035, + "grad_norm": 1.0365067720413208, + "learning_rate": 4.915229071981186e-06, + "loss": 0.1446, + "step": 2049 + }, + { + "epoch": 0.3321451717433571, + "grad_norm": 1.1342229843139648, + "learning_rate": 4.915116123346155e-06, + "loss": 0.1796, + "step": 2050 + }, + { + "epoch": 0.3323071937783539, + "grad_norm": 1.1378239393234253, + "learning_rate": 4.915003100814417e-06, + "loss": 0.1679, + "step": 2051 + }, + { + "epoch": 0.3324692158133506, + "grad_norm": 1.1816859245300293, + "learning_rate": 4.9148900043894275e-06, + "loss": 0.185, + "step": 2052 + }, + { + "epoch": 0.33263123784834736, + "grad_norm": 1.2354105710983276, + "learning_rate": 4.9147768340746486e-06, + "loss": 0.1913, + "step": 2053 + }, + { + "epoch": 0.33279325988334413, + "grad_norm": 1.0721561908721924, + "learning_rate": 4.914663589873541e-06, + "loss": 0.156, + "step": 2054 + }, + { + "epoch": 0.3329552819183409, + "grad_norm": 1.0587486028671265, + "learning_rate": 4.914550271789572e-06, + "loss": 0.1607, + "step": 2055 + }, + { + "epoch": 0.33311730395333766, + "grad_norm": 1.0155506134033203, + "learning_rate": 4.914436879826207e-06, + "loss": 0.1528, + "step": 2056 + }, + { + "epoch": 0.33327932598833443, + "grad_norm": 1.0975433588027954, + "learning_rate": 4.914323413986917e-06, + "loss": 0.1606, + "step": 2057 + }, + { + "epoch": 0.3334413480233312, + "grad_norm": 1.12659752368927, + "learning_rate": 4.9142098742751726e-06, + "loss": 0.1677, + "step": 2058 + }, + { + "epoch": 0.3336033700583279, + "grad_norm": 1.0046424865722656, + "learning_rate": 4.914096260694449e-06, + "loss": 0.163, + "step": 2059 + }, + { + "epoch": 0.3337653920933247, + "grad_norm": 1.103347897529602, + "learning_rate": 4.9139825732482205e-06, + "loss": 0.1583, + "step": 2060 + }, + { + "epoch": 0.33392741412832144, + "grad_norm": 1.109889268875122, + "learning_rate": 4.913868811939968e-06, + "loss": 0.162, + "step": 2061 + }, + { + "epoch": 0.3340894361633182, + "grad_norm": 1.036608338356018, + "learning_rate": 4.91375497677317e-06, + "loss": 0.1487, + "step": 2062 + }, + { + "epoch": 0.334251458198315, + "grad_norm": 1.199803113937378, + "learning_rate": 4.913641067751313e-06, + "loss": 0.1858, + "step": 2063 + }, + { + "epoch": 0.33441348023331174, + "grad_norm": 1.304969072341919, + "learning_rate": 4.913527084877879e-06, + "loss": 0.1704, + "step": 2064 + }, + { + "epoch": 0.3345755022683085, + "grad_norm": 1.1703951358795166, + "learning_rate": 4.913413028156358e-06, + "loss": 0.1688, + "step": 2065 + }, + { + "epoch": 0.3347375243033052, + "grad_norm": 1.0002027750015259, + "learning_rate": 4.913298897590237e-06, + "loss": 0.1413, + "step": 2066 + }, + { + "epoch": 0.334899546338302, + "grad_norm": 1.1760587692260742, + "learning_rate": 4.913184693183011e-06, + "loss": 0.177, + "step": 2067 + }, + { + "epoch": 0.33506156837329876, + "grad_norm": 1.080187439918518, + "learning_rate": 4.913070414938172e-06, + "loss": 0.1556, + "step": 2068 + }, + { + "epoch": 0.3352235904082955, + "grad_norm": 1.124131202697754, + "learning_rate": 4.912956062859219e-06, + "loss": 0.1701, + "step": 2069 + }, + { + "epoch": 0.3353856124432923, + "grad_norm": 1.0667952299118042, + "learning_rate": 4.912841636949649e-06, + "loss": 0.1503, + "step": 2070 + }, + { + "epoch": 0.33554763447828906, + "grad_norm": 1.2242783308029175, + "learning_rate": 4.912727137212964e-06, + "loss": 0.1686, + "step": 2071 + }, + { + "epoch": 0.3357096565132858, + "grad_norm": 1.0249295234680176, + "learning_rate": 4.912612563652667e-06, + "loss": 0.1517, + "step": 2072 + }, + { + "epoch": 0.3358716785482826, + "grad_norm": 1.5885037183761597, + "learning_rate": 4.912497916272264e-06, + "loss": 0.1591, + "step": 2073 + }, + { + "epoch": 0.3360337005832793, + "grad_norm": 1.040554404258728, + "learning_rate": 4.912383195075264e-06, + "loss": 0.1571, + "step": 2074 + }, + { + "epoch": 0.33619572261827607, + "grad_norm": 1.0694094896316528, + "learning_rate": 4.912268400065175e-06, + "loss": 0.1511, + "step": 2075 + }, + { + "epoch": 0.33635774465327284, + "grad_norm": 2.055079460144043, + "learning_rate": 4.912153531245511e-06, + "loss": 0.1697, + "step": 2076 + }, + { + "epoch": 0.3365197666882696, + "grad_norm": 1.066028356552124, + "learning_rate": 4.912038588619786e-06, + "loss": 0.1572, + "step": 2077 + }, + { + "epoch": 0.3366817887232664, + "grad_norm": 1.1714937686920166, + "learning_rate": 4.9119235721915174e-06, + "loss": 0.1764, + "step": 2078 + }, + { + "epoch": 0.33684381075826314, + "grad_norm": 1.0577641725540161, + "learning_rate": 4.911808481964224e-06, + "loss": 0.1411, + "step": 2079 + }, + { + "epoch": 0.3370058327932599, + "grad_norm": 1.0684007406234741, + "learning_rate": 4.911693317941428e-06, + "loss": 0.154, + "step": 2080 + }, + { + "epoch": 0.3371678548282566, + "grad_norm": 1.107500672340393, + "learning_rate": 4.911578080126652e-06, + "loss": 0.1549, + "step": 2081 + }, + { + "epoch": 0.3373298768632534, + "grad_norm": 1.0562461614608765, + "learning_rate": 4.911462768523423e-06, + "loss": 0.1475, + "step": 2082 + }, + { + "epoch": 0.33749189889825015, + "grad_norm": 1.016905426979065, + "learning_rate": 4.911347383135269e-06, + "loss": 0.1391, + "step": 2083 + }, + { + "epoch": 0.3376539209332469, + "grad_norm": 1.1356723308563232, + "learning_rate": 4.9112319239657204e-06, + "loss": 0.1747, + "step": 2084 + }, + { + "epoch": 0.3378159429682437, + "grad_norm": 1.1550018787384033, + "learning_rate": 4.91111639101831e-06, + "loss": 0.1509, + "step": 2085 + }, + { + "epoch": 0.33797796500324045, + "grad_norm": 1.108013391494751, + "learning_rate": 4.911000784296572e-06, + "loss": 0.1712, + "step": 2086 + }, + { + "epoch": 0.3381399870382372, + "grad_norm": 1.057617425918579, + "learning_rate": 4.910885103804046e-06, + "loss": 0.1624, + "step": 2087 + }, + { + "epoch": 0.33830200907323393, + "grad_norm": 1.2254884243011475, + "learning_rate": 4.910769349544269e-06, + "loss": 0.1485, + "step": 2088 + }, + { + "epoch": 0.3384640311082307, + "grad_norm": 1.2375078201293945, + "learning_rate": 4.910653521520784e-06, + "loss": 0.1587, + "step": 2089 + }, + { + "epoch": 0.33862605314322747, + "grad_norm": 1.054060459136963, + "learning_rate": 4.9105376197371355e-06, + "loss": 0.1391, + "step": 2090 + }, + { + "epoch": 0.33878807517822424, + "grad_norm": 1.0424585342407227, + "learning_rate": 4.910421644196868e-06, + "loss": 0.1532, + "step": 2091 + }, + { + "epoch": 0.338950097213221, + "grad_norm": 1.091741681098938, + "learning_rate": 4.9103055949035326e-06, + "loss": 0.1619, + "step": 2092 + }, + { + "epoch": 0.33911211924821777, + "grad_norm": 1.0904366970062256, + "learning_rate": 4.910189471860678e-06, + "loss": 0.1675, + "step": 2093 + }, + { + "epoch": 0.33927414128321454, + "grad_norm": 1.1803327798843384, + "learning_rate": 4.910073275071858e-06, + "loss": 0.1856, + "step": 2094 + }, + { + "epoch": 0.3394361633182113, + "grad_norm": 1.1525360345840454, + "learning_rate": 4.909957004540629e-06, + "loss": 0.1703, + "step": 2095 + }, + { + "epoch": 0.339598185353208, + "grad_norm": 1.070910096168518, + "learning_rate": 4.909840660270547e-06, + "loss": 0.1768, + "step": 2096 + }, + { + "epoch": 0.3397602073882048, + "grad_norm": 0.9406764507293701, + "learning_rate": 4.909724242265172e-06, + "loss": 0.1367, + "step": 2097 + }, + { + "epoch": 0.33992222942320155, + "grad_norm": 1.0764198303222656, + "learning_rate": 4.909607750528068e-06, + "loss": 0.1583, + "step": 2098 + }, + { + "epoch": 0.3400842514581983, + "grad_norm": 0.9914731383323669, + "learning_rate": 4.909491185062797e-06, + "loss": 0.1486, + "step": 2099 + }, + { + "epoch": 0.3402462734931951, + "grad_norm": 1.056030511856079, + "learning_rate": 4.909374545872927e-06, + "loss": 0.1546, + "step": 2100 + }, + { + "epoch": 0.34040829552819185, + "grad_norm": 1.0676145553588867, + "learning_rate": 4.909257832962026e-06, + "loss": 0.1284, + "step": 2101 + }, + { + "epoch": 0.3405703175631886, + "grad_norm": 1.0468038320541382, + "learning_rate": 4.909141046333666e-06, + "loss": 0.1365, + "step": 2102 + }, + { + "epoch": 0.34073233959818533, + "grad_norm": 1.0132776498794556, + "learning_rate": 4.90902418599142e-06, + "loss": 0.1521, + "step": 2103 + }, + { + "epoch": 0.3408943616331821, + "grad_norm": 1.1214451789855957, + "learning_rate": 4.908907251938864e-06, + "loss": 0.1561, + "step": 2104 + }, + { + "epoch": 0.34105638366817886, + "grad_norm": 1.1212339401245117, + "learning_rate": 4.9087902441795745e-06, + "loss": 0.161, + "step": 2105 + }, + { + "epoch": 0.34121840570317563, + "grad_norm": 1.0603835582733154, + "learning_rate": 4.908673162717133e-06, + "loss": 0.1596, + "step": 2106 + }, + { + "epoch": 0.3413804277381724, + "grad_norm": 1.171034812927246, + "learning_rate": 4.908556007555122e-06, + "loss": 0.1534, + "step": 2107 + }, + { + "epoch": 0.34154244977316917, + "grad_norm": 1.055219292640686, + "learning_rate": 4.908438778697125e-06, + "loss": 0.1511, + "step": 2108 + }, + { + "epoch": 0.34170447180816593, + "grad_norm": 1.1049814224243164, + "learning_rate": 4.90832147614673e-06, + "loss": 0.1679, + "step": 2109 + }, + { + "epoch": 0.34186649384316264, + "grad_norm": 1.144884705543518, + "learning_rate": 4.908204099907527e-06, + "loss": 0.1635, + "step": 2110 + }, + { + "epoch": 0.3420285158781594, + "grad_norm": 1.1654030084609985, + "learning_rate": 4.908086649983105e-06, + "loss": 0.1663, + "step": 2111 + }, + { + "epoch": 0.3421905379131562, + "grad_norm": 1.1463876962661743, + "learning_rate": 4.907969126377059e-06, + "loss": 0.1601, + "step": 2112 + }, + { + "epoch": 0.34235255994815295, + "grad_norm": 1.105735182762146, + "learning_rate": 4.9078515290929855e-06, + "loss": 0.1573, + "step": 2113 + }, + { + "epoch": 0.3425145819831497, + "grad_norm": 1.1214326620101929, + "learning_rate": 4.907733858134482e-06, + "loss": 0.1705, + "step": 2114 + }, + { + "epoch": 0.3426766040181465, + "grad_norm": 0.9656097292900085, + "learning_rate": 4.907616113505149e-06, + "loss": 0.1477, + "step": 2115 + }, + { + "epoch": 0.34283862605314325, + "grad_norm": 1.0422648191452026, + "learning_rate": 4.907498295208589e-06, + "loss": 0.1577, + "step": 2116 + }, + { + "epoch": 0.34300064808814, + "grad_norm": 1.0416967868804932, + "learning_rate": 4.907380403248408e-06, + "loss": 0.1674, + "step": 2117 + }, + { + "epoch": 0.3431626701231367, + "grad_norm": 1.0211541652679443, + "learning_rate": 4.907262437628211e-06, + "loss": 0.1615, + "step": 2118 + }, + { + "epoch": 0.3433246921581335, + "grad_norm": 1.0481899976730347, + "learning_rate": 4.90714439835161e-06, + "loss": 0.1624, + "step": 2119 + }, + { + "epoch": 0.34348671419313026, + "grad_norm": 0.9321041703224182, + "learning_rate": 4.907026285422215e-06, + "loss": 0.149, + "step": 2120 + }, + { + "epoch": 0.343648736228127, + "grad_norm": 1.062423825263977, + "learning_rate": 4.9069080988436405e-06, + "loss": 0.1582, + "step": 2121 + }, + { + "epoch": 0.3438107582631238, + "grad_norm": 1.213869571685791, + "learning_rate": 4.906789838619504e-06, + "loss": 0.1728, + "step": 2122 + }, + { + "epoch": 0.34397278029812056, + "grad_norm": 1.011985421180725, + "learning_rate": 4.9066715047534205e-06, + "loss": 0.1525, + "step": 2123 + }, + { + "epoch": 0.34413480233311733, + "grad_norm": 1.0821608304977417, + "learning_rate": 4.906553097249015e-06, + "loss": 0.1607, + "step": 2124 + }, + { + "epoch": 0.34429682436811404, + "grad_norm": 0.9949021339416504, + "learning_rate": 4.906434616109907e-06, + "loss": 0.1306, + "step": 2125 + }, + { + "epoch": 0.3444588464031108, + "grad_norm": 1.1297369003295898, + "learning_rate": 4.906316061339724e-06, + "loss": 0.1498, + "step": 2126 + }, + { + "epoch": 0.3446208684381076, + "grad_norm": 1.0807654857635498, + "learning_rate": 4.906197432942093e-06, + "loss": 0.1492, + "step": 2127 + }, + { + "epoch": 0.34478289047310434, + "grad_norm": 1.2625532150268555, + "learning_rate": 4.9060787309206436e-06, + "loss": 0.1784, + "step": 2128 + }, + { + "epoch": 0.3449449125081011, + "grad_norm": 1.1583614349365234, + "learning_rate": 4.905959955279007e-06, + "loss": 0.1652, + "step": 2129 + }, + { + "epoch": 0.3451069345430979, + "grad_norm": 1.1260557174682617, + "learning_rate": 4.905841106020818e-06, + "loss": 0.1649, + "step": 2130 + }, + { + "epoch": 0.34526895657809464, + "grad_norm": 1.1886639595031738, + "learning_rate": 4.905722183149714e-06, + "loss": 0.1724, + "step": 2131 + }, + { + "epoch": 0.34543097861309136, + "grad_norm": 1.138461709022522, + "learning_rate": 4.905603186669332e-06, + "loss": 0.177, + "step": 2132 + }, + { + "epoch": 0.3455930006480881, + "grad_norm": 1.1662522554397583, + "learning_rate": 4.905484116583314e-06, + "loss": 0.1436, + "step": 2133 + }, + { + "epoch": 0.3457550226830849, + "grad_norm": 1.0118292570114136, + "learning_rate": 4.905364972895304e-06, + "loss": 0.1573, + "step": 2134 + }, + { + "epoch": 0.34591704471808166, + "grad_norm": 0.9873135685920715, + "learning_rate": 4.905245755608946e-06, + "loss": 0.1484, + "step": 2135 + }, + { + "epoch": 0.3460790667530784, + "grad_norm": 1.1140849590301514, + "learning_rate": 4.9051264647278886e-06, + "loss": 0.1773, + "step": 2136 + }, + { + "epoch": 0.3462410887880752, + "grad_norm": 1.053364872932434, + "learning_rate": 4.9050071002557815e-06, + "loss": 0.1462, + "step": 2137 + }, + { + "epoch": 0.34640311082307196, + "grad_norm": 1.0225468873977661, + "learning_rate": 4.904887662196277e-06, + "loss": 0.165, + "step": 2138 + }, + { + "epoch": 0.34656513285806867, + "grad_norm": 1.1107808351516724, + "learning_rate": 4.90476815055303e-06, + "loss": 0.1427, + "step": 2139 + }, + { + "epoch": 0.34672715489306544, + "grad_norm": 1.0493844747543335, + "learning_rate": 4.904648565329697e-06, + "loss": 0.1416, + "step": 2140 + }, + { + "epoch": 0.3468891769280622, + "grad_norm": 1.1383299827575684, + "learning_rate": 4.9045289065299375e-06, + "loss": 0.1915, + "step": 2141 + }, + { + "epoch": 0.34705119896305897, + "grad_norm": 1.1016839742660522, + "learning_rate": 4.904409174157412e-06, + "loss": 0.1629, + "step": 2142 + }, + { + "epoch": 0.34721322099805574, + "grad_norm": 1.0556694269180298, + "learning_rate": 4.904289368215784e-06, + "loss": 0.17, + "step": 2143 + }, + { + "epoch": 0.3473752430330525, + "grad_norm": 1.0352901220321655, + "learning_rate": 4.90416948870872e-06, + "loss": 0.149, + "step": 2144 + }, + { + "epoch": 0.3475372650680493, + "grad_norm": 1.0276658535003662, + "learning_rate": 4.9040495356398874e-06, + "loss": 0.1466, + "step": 2145 + }, + { + "epoch": 0.34769928710304604, + "grad_norm": 1.1721683740615845, + "learning_rate": 4.903929509012957e-06, + "loss": 0.1652, + "step": 2146 + }, + { + "epoch": 0.34786130913804275, + "grad_norm": 1.0136381387710571, + "learning_rate": 4.903809408831601e-06, + "loss": 0.1425, + "step": 2147 + }, + { + "epoch": 0.3480233311730395, + "grad_norm": 1.0642050504684448, + "learning_rate": 4.9036892350994935e-06, + "loss": 0.1454, + "step": 2148 + }, + { + "epoch": 0.3481853532080363, + "grad_norm": 1.0709917545318604, + "learning_rate": 4.903568987820313e-06, + "loss": 0.1478, + "step": 2149 + }, + { + "epoch": 0.34834737524303305, + "grad_norm": 1.0638947486877441, + "learning_rate": 4.9034486669977375e-06, + "loss": 0.1625, + "step": 2150 + }, + { + "epoch": 0.3485093972780298, + "grad_norm": 1.0096039772033691, + "learning_rate": 4.903328272635449e-06, + "loss": 0.1462, + "step": 2151 + }, + { + "epoch": 0.3486714193130266, + "grad_norm": 1.1130759716033936, + "learning_rate": 4.903207804737132e-06, + "loss": 0.1488, + "step": 2152 + }, + { + "epoch": 0.34883344134802335, + "grad_norm": 1.105833649635315, + "learning_rate": 4.9030872633064715e-06, + "loss": 0.1669, + "step": 2153 + }, + { + "epoch": 0.34899546338302007, + "grad_norm": 1.2638037204742432, + "learning_rate": 4.902966648347156e-06, + "loss": 0.1613, + "step": 2154 + }, + { + "epoch": 0.34915748541801683, + "grad_norm": 0.990151584148407, + "learning_rate": 4.902845959862876e-06, + "loss": 0.1456, + "step": 2155 + }, + { + "epoch": 0.3493195074530136, + "grad_norm": 1.108185887336731, + "learning_rate": 4.902725197857325e-06, + "loss": 0.1522, + "step": 2156 + }, + { + "epoch": 0.34948152948801037, + "grad_norm": 0.9440566897392273, + "learning_rate": 4.902604362334197e-06, + "loss": 0.1266, + "step": 2157 + }, + { + "epoch": 0.34964355152300713, + "grad_norm": 1.2154903411865234, + "learning_rate": 4.902483453297189e-06, + "loss": 0.1911, + "step": 2158 + }, + { + "epoch": 0.3498055735580039, + "grad_norm": 1.0809717178344727, + "learning_rate": 4.902362470750002e-06, + "loss": 0.1564, + "step": 2159 + }, + { + "epoch": 0.34996759559300067, + "grad_norm": 1.0994709730148315, + "learning_rate": 4.902241414696337e-06, + "loss": 0.1707, + "step": 2160 + }, + { + "epoch": 0.3501296176279974, + "grad_norm": 1.0126726627349854, + "learning_rate": 4.902120285139898e-06, + "loss": 0.1451, + "step": 2161 + }, + { + "epoch": 0.35029163966299415, + "grad_norm": 1.068403720855713, + "learning_rate": 4.901999082084391e-06, + "loss": 0.1578, + "step": 2162 + }, + { + "epoch": 0.3504536616979909, + "grad_norm": 0.9774585962295532, + "learning_rate": 4.901877805533525e-06, + "loss": 0.1406, + "step": 2163 + }, + { + "epoch": 0.3506156837329877, + "grad_norm": 1.0109790563583374, + "learning_rate": 4.901756455491011e-06, + "loss": 0.1425, + "step": 2164 + }, + { + "epoch": 0.35077770576798445, + "grad_norm": 1.182918667793274, + "learning_rate": 4.901635031960561e-06, + "loss": 0.1952, + "step": 2165 + }, + { + "epoch": 0.3509397278029812, + "grad_norm": 0.9629144668579102, + "learning_rate": 4.901513534945891e-06, + "loss": 0.1487, + "step": 2166 + }, + { + "epoch": 0.351101749837978, + "grad_norm": 1.0952849388122559, + "learning_rate": 4.901391964450718e-06, + "loss": 0.171, + "step": 2167 + }, + { + "epoch": 0.35126377187297475, + "grad_norm": 1.0250297784805298, + "learning_rate": 4.901270320478763e-06, + "loss": 0.1532, + "step": 2168 + }, + { + "epoch": 0.35142579390797146, + "grad_norm": 1.1184073686599731, + "learning_rate": 4.901148603033747e-06, + "loss": 0.1641, + "step": 2169 + }, + { + "epoch": 0.35158781594296823, + "grad_norm": 1.0301399230957031, + "learning_rate": 4.901026812119394e-06, + "loss": 0.1515, + "step": 2170 + }, + { + "epoch": 0.351749837977965, + "grad_norm": 1.188010573387146, + "learning_rate": 4.900904947739431e-06, + "loss": 0.1565, + "step": 2171 + }, + { + "epoch": 0.35191186001296176, + "grad_norm": 1.0016369819641113, + "learning_rate": 4.9007830098975875e-06, + "loss": 0.1523, + "step": 2172 + }, + { + "epoch": 0.35207388204795853, + "grad_norm": 1.049594521522522, + "learning_rate": 4.9006609985975925e-06, + "loss": 0.1426, + "step": 2173 + }, + { + "epoch": 0.3522359040829553, + "grad_norm": 1.197577714920044, + "learning_rate": 4.900538913843181e-06, + "loss": 0.1812, + "step": 2174 + }, + { + "epoch": 0.35239792611795207, + "grad_norm": 1.06314218044281, + "learning_rate": 4.900416755638087e-06, + "loss": 0.1601, + "step": 2175 + }, + { + "epoch": 0.3525599481529488, + "grad_norm": 1.0126889944076538, + "learning_rate": 4.900294523986051e-06, + "loss": 0.1472, + "step": 2176 + }, + { + "epoch": 0.35272197018794554, + "grad_norm": 1.1102066040039062, + "learning_rate": 4.900172218890809e-06, + "loss": 0.1699, + "step": 2177 + }, + { + "epoch": 0.3528839922229423, + "grad_norm": 1.1450811624526978, + "learning_rate": 4.900049840356107e-06, + "loss": 0.1804, + "step": 2178 + }, + { + "epoch": 0.3530460142579391, + "grad_norm": 1.1297409534454346, + "learning_rate": 4.899927388385688e-06, + "loss": 0.1678, + "step": 2179 + }, + { + "epoch": 0.35320803629293585, + "grad_norm": 1.0835684537887573, + "learning_rate": 4.899804862983298e-06, + "loss": 0.173, + "step": 2180 + }, + { + "epoch": 0.3533700583279326, + "grad_norm": 1.0187311172485352, + "learning_rate": 4.899682264152686e-06, + "loss": 0.1353, + "step": 2181 + }, + { + "epoch": 0.3535320803629294, + "grad_norm": 0.9485295414924622, + "learning_rate": 4.899559591897604e-06, + "loss": 0.1516, + "step": 2182 + }, + { + "epoch": 0.3536941023979261, + "grad_norm": 0.9085932970046997, + "learning_rate": 4.899436846221807e-06, + "loss": 0.148, + "step": 2183 + }, + { + "epoch": 0.35385612443292286, + "grad_norm": 1.1273099184036255, + "learning_rate": 4.899314027129047e-06, + "loss": 0.1685, + "step": 2184 + }, + { + "epoch": 0.3540181464679196, + "grad_norm": 1.0366731882095337, + "learning_rate": 4.899191134623086e-06, + "loss": 0.1542, + "step": 2185 + }, + { + "epoch": 0.3541801685029164, + "grad_norm": 1.049047589302063, + "learning_rate": 4.89906816870768e-06, + "loss": 0.172, + "step": 2186 + }, + { + "epoch": 0.35434219053791316, + "grad_norm": 1.1898120641708374, + "learning_rate": 4.8989451293865955e-06, + "loss": 0.1729, + "step": 2187 + }, + { + "epoch": 0.3545042125729099, + "grad_norm": 1.0170189142227173, + "learning_rate": 4.898822016663595e-06, + "loss": 0.1539, + "step": 2188 + }, + { + "epoch": 0.3546662346079067, + "grad_norm": 0.9323112368583679, + "learning_rate": 4.8986988305424445e-06, + "loss": 0.1392, + "step": 2189 + }, + { + "epoch": 0.35482825664290346, + "grad_norm": 1.097751259803772, + "learning_rate": 4.898575571026916e-06, + "loss": 0.1697, + "step": 2190 + }, + { + "epoch": 0.3549902786779002, + "grad_norm": 1.2580288648605347, + "learning_rate": 4.898452238120779e-06, + "loss": 0.1925, + "step": 2191 + }, + { + "epoch": 0.35515230071289694, + "grad_norm": 1.1584991216659546, + "learning_rate": 4.898328831827808e-06, + "loss": 0.1696, + "step": 2192 + }, + { + "epoch": 0.3553143227478937, + "grad_norm": 1.0582317113876343, + "learning_rate": 4.898205352151777e-06, + "loss": 0.1498, + "step": 2193 + }, + { + "epoch": 0.3554763447828905, + "grad_norm": 1.039017915725708, + "learning_rate": 4.898081799096467e-06, + "loss": 0.1458, + "step": 2194 + }, + { + "epoch": 0.35563836681788724, + "grad_norm": 1.237728476524353, + "learning_rate": 4.897958172665658e-06, + "loss": 0.1588, + "step": 2195 + }, + { + "epoch": 0.355800388852884, + "grad_norm": 0.9922459125518799, + "learning_rate": 4.897834472863131e-06, + "loss": 0.146, + "step": 2196 + }, + { + "epoch": 0.3559624108878808, + "grad_norm": 1.1066429615020752, + "learning_rate": 4.897710699692672e-06, + "loss": 0.1492, + "step": 2197 + }, + { + "epoch": 0.3561244329228775, + "grad_norm": 1.1246432065963745, + "learning_rate": 4.897586853158067e-06, + "loss": 0.1644, + "step": 2198 + }, + { + "epoch": 0.35628645495787425, + "grad_norm": 1.1196368932724, + "learning_rate": 4.897462933263107e-06, + "loss": 0.1517, + "step": 2199 + }, + { + "epoch": 0.356448476992871, + "grad_norm": 1.1763758659362793, + "learning_rate": 4.897338940011583e-06, + "loss": 0.169, + "step": 2200 + }, + { + "epoch": 0.3566104990278678, + "grad_norm": 1.0583901405334473, + "learning_rate": 4.897214873407289e-06, + "loss": 0.1462, + "step": 2201 + }, + { + "epoch": 0.35677252106286456, + "grad_norm": 0.9856936931610107, + "learning_rate": 4.897090733454021e-06, + "loss": 0.1408, + "step": 2202 + }, + { + "epoch": 0.3569345430978613, + "grad_norm": 1.0380568504333496, + "learning_rate": 4.8969665201555775e-06, + "loss": 0.1692, + "step": 2203 + }, + { + "epoch": 0.3570965651328581, + "grad_norm": 1.0581004619598389, + "learning_rate": 4.896842233515759e-06, + "loss": 0.1714, + "step": 2204 + }, + { + "epoch": 0.3572585871678548, + "grad_norm": 1.2082271575927734, + "learning_rate": 4.896717873538368e-06, + "loss": 0.1791, + "step": 2205 + }, + { + "epoch": 0.35742060920285157, + "grad_norm": 1.0210479497909546, + "learning_rate": 4.89659344022721e-06, + "loss": 0.1652, + "step": 2206 + }, + { + "epoch": 0.35758263123784834, + "grad_norm": 0.9992541074752808, + "learning_rate": 4.896468933586094e-06, + "loss": 0.162, + "step": 2207 + }, + { + "epoch": 0.3577446532728451, + "grad_norm": 1.0144470930099487, + "learning_rate": 4.896344353618826e-06, + "loss": 0.1597, + "step": 2208 + }, + { + "epoch": 0.35790667530784187, + "grad_norm": 1.1150000095367432, + "learning_rate": 4.89621970032922e-06, + "loss": 0.167, + "step": 2209 + }, + { + "epoch": 0.35806869734283864, + "grad_norm": 1.0578536987304688, + "learning_rate": 4.896094973721091e-06, + "loss": 0.1402, + "step": 2210 + }, + { + "epoch": 0.3582307193778354, + "grad_norm": 0.9122249484062195, + "learning_rate": 4.8959701737982535e-06, + "loss": 0.1362, + "step": 2211 + }, + { + "epoch": 0.35839274141283217, + "grad_norm": 1.0043046474456787, + "learning_rate": 4.8958453005645265e-06, + "loss": 0.1537, + "step": 2212 + }, + { + "epoch": 0.3585547634478289, + "grad_norm": 1.1452409029006958, + "learning_rate": 4.895720354023732e-06, + "loss": 0.1239, + "step": 2213 + }, + { + "epoch": 0.35871678548282565, + "grad_norm": 1.0220675468444824, + "learning_rate": 4.895595334179692e-06, + "loss": 0.1461, + "step": 2214 + }, + { + "epoch": 0.3588788075178224, + "grad_norm": 1.3352243900299072, + "learning_rate": 4.895470241036232e-06, + "loss": 0.2061, + "step": 2215 + }, + { + "epoch": 0.3590408295528192, + "grad_norm": 1.1318044662475586, + "learning_rate": 4.89534507459718e-06, + "loss": 0.1479, + "step": 2216 + }, + { + "epoch": 0.35920285158781595, + "grad_norm": 1.2025889158248901, + "learning_rate": 4.895219834866364e-06, + "loss": 0.1721, + "step": 2217 + }, + { + "epoch": 0.3593648736228127, + "grad_norm": 1.0799037218093872, + "learning_rate": 4.895094521847617e-06, + "loss": 0.1504, + "step": 2218 + }, + { + "epoch": 0.3595268956578095, + "grad_norm": 0.9984576106071472, + "learning_rate": 4.894969135544776e-06, + "loss": 0.1485, + "step": 2219 + }, + { + "epoch": 0.3596889176928062, + "grad_norm": 1.1945985555648804, + "learning_rate": 4.894843675961673e-06, + "loss": 0.1784, + "step": 2220 + }, + { + "epoch": 0.35985093972780297, + "grad_norm": 0.962485671043396, + "learning_rate": 4.894718143102151e-06, + "loss": 0.1345, + "step": 2221 + }, + { + "epoch": 0.36001296176279973, + "grad_norm": 0.9608977437019348, + "learning_rate": 4.894592536970047e-06, + "loss": 0.144, + "step": 2222 + }, + { + "epoch": 0.3601749837977965, + "grad_norm": 1.0343858003616333, + "learning_rate": 4.894466857569207e-06, + "loss": 0.1507, + "step": 2223 + }, + { + "epoch": 0.36033700583279327, + "grad_norm": 1.0216684341430664, + "learning_rate": 4.894341104903476e-06, + "loss": 0.1354, + "step": 2224 + }, + { + "epoch": 0.36049902786779003, + "grad_norm": 1.18091881275177, + "learning_rate": 4.8942152789767e-06, + "loss": 0.1631, + "step": 2225 + }, + { + "epoch": 0.3606610499027868, + "grad_norm": 1.0483477115631104, + "learning_rate": 4.894089379792731e-06, + "loss": 0.1607, + "step": 2226 + }, + { + "epoch": 0.3608230719377835, + "grad_norm": 1.1260876655578613, + "learning_rate": 4.893963407355422e-06, + "loss": 0.1611, + "step": 2227 + }, + { + "epoch": 0.3609850939727803, + "grad_norm": 0.9986515045166016, + "learning_rate": 4.893837361668624e-06, + "loss": 0.1565, + "step": 2228 + }, + { + "epoch": 0.36114711600777705, + "grad_norm": 1.1207728385925293, + "learning_rate": 4.893711242736197e-06, + "loss": 0.1611, + "step": 2229 + }, + { + "epoch": 0.3613091380427738, + "grad_norm": 1.1995974779129028, + "learning_rate": 4.8935850505619985e-06, + "loss": 0.1705, + "step": 2230 + }, + { + "epoch": 0.3614711600777706, + "grad_norm": 1.1663684844970703, + "learning_rate": 4.893458785149889e-06, + "loss": 0.1841, + "step": 2231 + }, + { + "epoch": 0.36163318211276735, + "grad_norm": 0.9084141850471497, + "learning_rate": 4.8933324465037334e-06, + "loss": 0.1425, + "step": 2232 + }, + { + "epoch": 0.3617952041477641, + "grad_norm": 0.9766511917114258, + "learning_rate": 4.893206034627397e-06, + "loss": 0.1585, + "step": 2233 + }, + { + "epoch": 0.3619572261827609, + "grad_norm": 1.0167397260665894, + "learning_rate": 4.893079549524747e-06, + "loss": 0.167, + "step": 2234 + }, + { + "epoch": 0.3621192482177576, + "grad_norm": 1.0378706455230713, + "learning_rate": 4.892952991199654e-06, + "loss": 0.1502, + "step": 2235 + }, + { + "epoch": 0.36228127025275436, + "grad_norm": 0.9949929118156433, + "learning_rate": 4.89282635965599e-06, + "loss": 0.1482, + "step": 2236 + }, + { + "epoch": 0.36244329228775113, + "grad_norm": 1.006990909576416, + "learning_rate": 4.89269965489763e-06, + "loss": 0.1504, + "step": 2237 + }, + { + "epoch": 0.3626053143227479, + "grad_norm": 0.947759747505188, + "learning_rate": 4.8925728769284504e-06, + "loss": 0.1436, + "step": 2238 + }, + { + "epoch": 0.36276733635774466, + "grad_norm": 0.9912741184234619, + "learning_rate": 4.892446025752332e-06, + "loss": 0.1401, + "step": 2239 + }, + { + "epoch": 0.36292935839274143, + "grad_norm": 0.9576963782310486, + "learning_rate": 4.892319101373154e-06, + "loss": 0.1406, + "step": 2240 + }, + { + "epoch": 0.3630913804277382, + "grad_norm": 1.0038036108016968, + "learning_rate": 4.892192103794801e-06, + "loss": 0.1418, + "step": 2241 + }, + { + "epoch": 0.3632534024627349, + "grad_norm": 1.1145362854003906, + "learning_rate": 4.892065033021158e-06, + "loss": 0.1654, + "step": 2242 + }, + { + "epoch": 0.3634154244977317, + "grad_norm": 1.185004472732544, + "learning_rate": 4.8919378890561145e-06, + "loss": 0.1686, + "step": 2243 + }, + { + "epoch": 0.36357744653272844, + "grad_norm": 1.0970295667648315, + "learning_rate": 4.8918106719035594e-06, + "loss": 0.1504, + "step": 2244 + }, + { + "epoch": 0.3637394685677252, + "grad_norm": 1.0193977355957031, + "learning_rate": 4.891683381567386e-06, + "loss": 0.1405, + "step": 2245 + }, + { + "epoch": 0.363901490602722, + "grad_norm": 0.9931899309158325, + "learning_rate": 4.891556018051489e-06, + "loss": 0.1569, + "step": 2246 + }, + { + "epoch": 0.36406351263771874, + "grad_norm": 1.1715176105499268, + "learning_rate": 4.891428581359764e-06, + "loss": 0.1799, + "step": 2247 + }, + { + "epoch": 0.3642255346727155, + "grad_norm": 1.0806795358657837, + "learning_rate": 4.891301071496113e-06, + "loss": 0.1547, + "step": 2248 + }, + { + "epoch": 0.3643875567077122, + "grad_norm": 1.0172020196914673, + "learning_rate": 4.891173488464436e-06, + "loss": 0.1409, + "step": 2249 + }, + { + "epoch": 0.364549578742709, + "grad_norm": 0.9692556858062744, + "learning_rate": 4.891045832268637e-06, + "loss": 0.1519, + "step": 2250 + }, + { + "epoch": 0.36471160077770576, + "grad_norm": 1.108995795249939, + "learning_rate": 4.890918102912621e-06, + "loss": 0.1695, + "step": 2251 + }, + { + "epoch": 0.3648736228127025, + "grad_norm": 1.081821322441101, + "learning_rate": 4.890790300400297e-06, + "loss": 0.1719, + "step": 2252 + }, + { + "epoch": 0.3650356448476993, + "grad_norm": 1.0040013790130615, + "learning_rate": 4.890662424735576e-06, + "loss": 0.1603, + "step": 2253 + }, + { + "epoch": 0.36519766688269606, + "grad_norm": 1.0947988033294678, + "learning_rate": 4.8905344759223696e-06, + "loss": 0.1791, + "step": 2254 + }, + { + "epoch": 0.3653596889176928, + "grad_norm": 1.1201766729354858, + "learning_rate": 4.890406453964594e-06, + "loss": 0.1646, + "step": 2255 + }, + { + "epoch": 0.36552171095268954, + "grad_norm": 1.0785022974014282, + "learning_rate": 4.890278358866165e-06, + "loss": 0.1617, + "step": 2256 + }, + { + "epoch": 0.3656837329876863, + "grad_norm": 1.1495450735092163, + "learning_rate": 4.8901501906310024e-06, + "loss": 0.1674, + "step": 2257 + }, + { + "epoch": 0.3658457550226831, + "grad_norm": 1.0032192468643188, + "learning_rate": 4.890021949263027e-06, + "loss": 0.1427, + "step": 2258 + }, + { + "epoch": 0.36600777705767984, + "grad_norm": 1.148429274559021, + "learning_rate": 4.889893634766165e-06, + "loss": 0.1654, + "step": 2259 + }, + { + "epoch": 0.3661697990926766, + "grad_norm": 1.0179355144500732, + "learning_rate": 4.889765247144341e-06, + "loss": 0.1496, + "step": 2260 + }, + { + "epoch": 0.3663318211276734, + "grad_norm": 0.9481959939002991, + "learning_rate": 4.889636786401484e-06, + "loss": 0.1423, + "step": 2261 + }, + { + "epoch": 0.36649384316267014, + "grad_norm": 1.1533952951431274, + "learning_rate": 4.889508252541524e-06, + "loss": 0.1582, + "step": 2262 + }, + { + "epoch": 0.3666558651976669, + "grad_norm": 1.0846741199493408, + "learning_rate": 4.889379645568394e-06, + "loss": 0.1599, + "step": 2263 + }, + { + "epoch": 0.3668178872326636, + "grad_norm": 1.2179301977157593, + "learning_rate": 4.889250965486029e-06, + "loss": 0.1812, + "step": 2264 + }, + { + "epoch": 0.3669799092676604, + "grad_norm": 0.9992818832397461, + "learning_rate": 4.889122212298366e-06, + "loss": 0.138, + "step": 2265 + }, + { + "epoch": 0.36714193130265715, + "grad_norm": 1.1005460023880005, + "learning_rate": 4.888993386009345e-06, + "loss": 0.1519, + "step": 2266 + }, + { + "epoch": 0.3673039533376539, + "grad_norm": 1.0345803499221802, + "learning_rate": 4.888864486622907e-06, + "loss": 0.1539, + "step": 2267 + }, + { + "epoch": 0.3674659753726507, + "grad_norm": 1.0553728342056274, + "learning_rate": 4.888735514142998e-06, + "loss": 0.1459, + "step": 2268 + }, + { + "epoch": 0.36762799740764746, + "grad_norm": 1.0219756364822388, + "learning_rate": 4.888606468573562e-06, + "loss": 0.1459, + "step": 2269 + }, + { + "epoch": 0.3677900194426442, + "grad_norm": 1.0199123620986938, + "learning_rate": 4.8884773499185485e-06, + "loss": 0.1466, + "step": 2270 + }, + { + "epoch": 0.36795204147764093, + "grad_norm": 0.9618728160858154, + "learning_rate": 4.888348158181908e-06, + "loss": 0.1478, + "step": 2271 + }, + { + "epoch": 0.3681140635126377, + "grad_norm": 1.018974781036377, + "learning_rate": 4.8882188933675935e-06, + "loss": 0.1493, + "step": 2272 + }, + { + "epoch": 0.36827608554763447, + "grad_norm": 1.1692665815353394, + "learning_rate": 4.888089555479561e-06, + "loss": 0.1571, + "step": 2273 + }, + { + "epoch": 0.36843810758263124, + "grad_norm": 1.0094574689865112, + "learning_rate": 4.887960144521766e-06, + "loss": 0.1413, + "step": 2274 + }, + { + "epoch": 0.368600129617628, + "grad_norm": 1.0901294946670532, + "learning_rate": 4.8878306604981704e-06, + "loss": 0.1495, + "step": 2275 + }, + { + "epoch": 0.36876215165262477, + "grad_norm": 1.043537974357605, + "learning_rate": 4.887701103412734e-06, + "loss": 0.1641, + "step": 2276 + }, + { + "epoch": 0.36892417368762154, + "grad_norm": 1.0010807514190674, + "learning_rate": 4.887571473269422e-06, + "loss": 0.1562, + "step": 2277 + }, + { + "epoch": 0.36908619572261825, + "grad_norm": 0.9946657419204712, + "learning_rate": 4.8874417700722025e-06, + "loss": 0.1406, + "step": 2278 + }, + { + "epoch": 0.369248217757615, + "grad_norm": 1.007975697517395, + "learning_rate": 4.887311993825041e-06, + "loss": 0.1448, + "step": 2279 + }, + { + "epoch": 0.3694102397926118, + "grad_norm": 1.072035312652588, + "learning_rate": 4.887182144531909e-06, + "loss": 0.1686, + "step": 2280 + }, + { + "epoch": 0.36957226182760855, + "grad_norm": 2.578970432281494, + "learning_rate": 4.887052222196782e-06, + "loss": 0.1594, + "step": 2281 + }, + { + "epoch": 0.3697342838626053, + "grad_norm": 0.9519162774085999, + "learning_rate": 4.886922226823632e-06, + "loss": 0.1369, + "step": 2282 + }, + { + "epoch": 0.3698963058976021, + "grad_norm": 1.0015640258789062, + "learning_rate": 4.886792158416439e-06, + "loss": 0.1469, + "step": 2283 + }, + { + "epoch": 0.37005832793259885, + "grad_norm": 0.9746085405349731, + "learning_rate": 4.8866620169791815e-06, + "loss": 0.1342, + "step": 2284 + }, + { + "epoch": 0.3702203499675956, + "grad_norm": 1.0695668458938599, + "learning_rate": 4.886531802515842e-06, + "loss": 0.1645, + "step": 2285 + }, + { + "epoch": 0.37038237200259233, + "grad_norm": 1.1742403507232666, + "learning_rate": 4.886401515030404e-06, + "loss": 0.1668, + "step": 2286 + }, + { + "epoch": 0.3705443940375891, + "grad_norm": 1.0946766138076782, + "learning_rate": 4.886271154526856e-06, + "loss": 0.1486, + "step": 2287 + }, + { + "epoch": 0.37070641607258586, + "grad_norm": 1.0523529052734375, + "learning_rate": 4.886140721009184e-06, + "loss": 0.1458, + "step": 2288 + }, + { + "epoch": 0.37086843810758263, + "grad_norm": 1.1310120820999146, + "learning_rate": 4.88601021448138e-06, + "loss": 0.1568, + "step": 2289 + }, + { + "epoch": 0.3710304601425794, + "grad_norm": 1.176133155822754, + "learning_rate": 4.885879634947439e-06, + "loss": 0.1723, + "step": 2290 + }, + { + "epoch": 0.37119248217757617, + "grad_norm": 1.064237117767334, + "learning_rate": 4.8857489824113544e-06, + "loss": 0.1507, + "step": 2291 + }, + { + "epoch": 0.37135450421257293, + "grad_norm": 1.0737059116363525, + "learning_rate": 4.885618256877123e-06, + "loss": 0.1391, + "step": 2292 + }, + { + "epoch": 0.37151652624756965, + "grad_norm": 1.0706037282943726, + "learning_rate": 4.885487458348748e-06, + "loss": 0.1442, + "step": 2293 + }, + { + "epoch": 0.3716785482825664, + "grad_norm": 1.3316296339035034, + "learning_rate": 4.885356586830229e-06, + "loss": 0.1965, + "step": 2294 + }, + { + "epoch": 0.3718405703175632, + "grad_norm": 0.9738705158233643, + "learning_rate": 4.8852256423255706e-06, + "loss": 0.1412, + "step": 2295 + }, + { + "epoch": 0.37200259235255995, + "grad_norm": 0.9631845355033875, + "learning_rate": 4.8850946248387795e-06, + "loss": 0.1446, + "step": 2296 + }, + { + "epoch": 0.3721646143875567, + "grad_norm": 1.002110481262207, + "learning_rate": 4.884963534373864e-06, + "loss": 0.1489, + "step": 2297 + }, + { + "epoch": 0.3723266364225535, + "grad_norm": 0.8810906410217285, + "learning_rate": 4.884832370934838e-06, + "loss": 0.1249, + "step": 2298 + }, + { + "epoch": 0.37248865845755025, + "grad_norm": 1.2521817684173584, + "learning_rate": 4.88470113452571e-06, + "loss": 0.155, + "step": 2299 + }, + { + "epoch": 0.37265068049254696, + "grad_norm": 1.0234705209732056, + "learning_rate": 4.8845698251505e-06, + "loss": 0.1481, + "step": 2300 + }, + { + "epoch": 0.3728127025275437, + "grad_norm": 1.0289057493209839, + "learning_rate": 4.884438442813223e-06, + "loss": 0.1226, + "step": 2301 + }, + { + "epoch": 0.3729747245625405, + "grad_norm": 1.0109435319900513, + "learning_rate": 4.8843069875179005e-06, + "loss": 0.1527, + "step": 2302 + }, + { + "epoch": 0.37313674659753726, + "grad_norm": 1.0381628274917603, + "learning_rate": 4.884175459268554e-06, + "loss": 0.1689, + "step": 2303 + }, + { + "epoch": 0.37329876863253403, + "grad_norm": 1.0785562992095947, + "learning_rate": 4.884043858069208e-06, + "loss": 0.1507, + "step": 2304 + }, + { + "epoch": 0.3734607906675308, + "grad_norm": 1.0197018384933472, + "learning_rate": 4.883912183923889e-06, + "loss": 0.1564, + "step": 2305 + }, + { + "epoch": 0.37362281270252756, + "grad_norm": 1.1440730094909668, + "learning_rate": 4.883780436836627e-06, + "loss": 0.1594, + "step": 2306 + }, + { + "epoch": 0.37378483473752433, + "grad_norm": 1.0093541145324707, + "learning_rate": 4.883648616811451e-06, + "loss": 0.1565, + "step": 2307 + }, + { + "epoch": 0.37394685677252104, + "grad_norm": 1.0944989919662476, + "learning_rate": 4.883516723852396e-06, + "loss": 0.1549, + "step": 2308 + }, + { + "epoch": 0.3741088788075178, + "grad_norm": 1.0716201066970825, + "learning_rate": 4.883384757963498e-06, + "loss": 0.1666, + "step": 2309 + }, + { + "epoch": 0.3742709008425146, + "grad_norm": 0.946195125579834, + "learning_rate": 4.883252719148794e-06, + "loss": 0.1422, + "step": 2310 + }, + { + "epoch": 0.37443292287751134, + "grad_norm": 0.9485779404640198, + "learning_rate": 4.883120607412323e-06, + "loss": 0.1536, + "step": 2311 + }, + { + "epoch": 0.3745949449125081, + "grad_norm": 1.0378730297088623, + "learning_rate": 4.8829884227581294e-06, + "loss": 0.1448, + "step": 2312 + }, + { + "epoch": 0.3747569669475049, + "grad_norm": 1.0789496898651123, + "learning_rate": 4.882856165190256e-06, + "loss": 0.1549, + "step": 2313 + }, + { + "epoch": 0.37491898898250164, + "grad_norm": 1.0196281671524048, + "learning_rate": 4.88272383471275e-06, + "loss": 0.1511, + "step": 2314 + }, + { + "epoch": 0.37508101101749836, + "grad_norm": 0.9408444762229919, + "learning_rate": 4.882591431329662e-06, + "loss": 0.1301, + "step": 2315 + }, + { + "epoch": 0.3752430330524951, + "grad_norm": 1.0712037086486816, + "learning_rate": 4.8824589550450415e-06, + "loss": 0.1601, + "step": 2316 + }, + { + "epoch": 0.3754050550874919, + "grad_norm": 0.8943222165107727, + "learning_rate": 4.8823264058629426e-06, + "loss": 0.1318, + "step": 2317 + }, + { + "epoch": 0.37556707712248866, + "grad_norm": 0.9707005620002747, + "learning_rate": 4.882193783787421e-06, + "loss": 0.1476, + "step": 2318 + }, + { + "epoch": 0.3757290991574854, + "grad_norm": 1.0680534839630127, + "learning_rate": 4.882061088822534e-06, + "loss": 0.1753, + "step": 2319 + }, + { + "epoch": 0.3758911211924822, + "grad_norm": 1.0867925882339478, + "learning_rate": 4.881928320972342e-06, + "loss": 0.1571, + "step": 2320 + }, + { + "epoch": 0.37605314322747896, + "grad_norm": 1.1130952835083008, + "learning_rate": 4.881795480240908e-06, + "loss": 0.169, + "step": 2321 + }, + { + "epoch": 0.37621516526247567, + "grad_norm": 1.034469485282898, + "learning_rate": 4.881662566632296e-06, + "loss": 0.1709, + "step": 2322 + }, + { + "epoch": 0.37637718729747244, + "grad_norm": 1.1244481801986694, + "learning_rate": 4.881529580150573e-06, + "loss": 0.1594, + "step": 2323 + }, + { + "epoch": 0.3765392093324692, + "grad_norm": 1.132101058959961, + "learning_rate": 4.881396520799808e-06, + "loss": 0.1757, + "step": 2324 + }, + { + "epoch": 0.37670123136746597, + "grad_norm": 1.098602294921875, + "learning_rate": 4.881263388584072e-06, + "loss": 0.154, + "step": 2325 + }, + { + "epoch": 0.37686325340246274, + "grad_norm": 0.9612120389938354, + "learning_rate": 4.8811301835074384e-06, + "loss": 0.1427, + "step": 2326 + }, + { + "epoch": 0.3770252754374595, + "grad_norm": 1.0000402927398682, + "learning_rate": 4.880996905573985e-06, + "loss": 0.1531, + "step": 2327 + }, + { + "epoch": 0.3771872974724563, + "grad_norm": 1.076193928718567, + "learning_rate": 4.880863554787787e-06, + "loss": 0.1791, + "step": 2328 + }, + { + "epoch": 0.37734931950745304, + "grad_norm": 0.9946293234825134, + "learning_rate": 4.8807301311529266e-06, + "loss": 0.1507, + "step": 2329 + }, + { + "epoch": 0.37751134154244975, + "grad_norm": 0.9795472025871277, + "learning_rate": 4.880596634673484e-06, + "loss": 0.1498, + "step": 2330 + }, + { + "epoch": 0.3776733635774465, + "grad_norm": 1.1108335256576538, + "learning_rate": 4.880463065353547e-06, + "loss": 0.1735, + "step": 2331 + }, + { + "epoch": 0.3778353856124433, + "grad_norm": 1.0525665283203125, + "learning_rate": 4.8803294231972e-06, + "loss": 0.1655, + "step": 2332 + }, + { + "epoch": 0.37799740764744005, + "grad_norm": 1.1461604833602905, + "learning_rate": 4.880195708208533e-06, + "loss": 0.1622, + "step": 2333 + }, + { + "epoch": 0.3781594296824368, + "grad_norm": 1.1616642475128174, + "learning_rate": 4.8800619203916376e-06, + "loss": 0.1538, + "step": 2334 + }, + { + "epoch": 0.3783214517174336, + "grad_norm": 1.3006972074508667, + "learning_rate": 4.879928059750606e-06, + "loss": 0.1981, + "step": 2335 + }, + { + "epoch": 0.37848347375243035, + "grad_norm": 1.2075080871582031, + "learning_rate": 4.8797941262895365e-06, + "loss": 0.1655, + "step": 2336 + }, + { + "epoch": 0.37864549578742707, + "grad_norm": 1.0310864448547363, + "learning_rate": 4.8796601200125245e-06, + "loss": 0.1668, + "step": 2337 + }, + { + "epoch": 0.37880751782242383, + "grad_norm": 1.1267403364181519, + "learning_rate": 4.8795260409236725e-06, + "loss": 0.1574, + "step": 2338 + }, + { + "epoch": 0.3789695398574206, + "grad_norm": 1.0787062644958496, + "learning_rate": 4.879391889027081e-06, + "loss": 0.1417, + "step": 2339 + }, + { + "epoch": 0.37913156189241737, + "grad_norm": 0.9812328219413757, + "learning_rate": 4.879257664326856e-06, + "loss": 0.1436, + "step": 2340 + }, + { + "epoch": 0.37929358392741414, + "grad_norm": 1.0274299383163452, + "learning_rate": 4.8791233668271055e-06, + "loss": 0.1532, + "step": 2341 + }, + { + "epoch": 0.3794556059624109, + "grad_norm": 0.916479766368866, + "learning_rate": 4.8789889965319355e-06, + "loss": 0.1369, + "step": 2342 + }, + { + "epoch": 0.37961762799740767, + "grad_norm": 1.0698994398117065, + "learning_rate": 4.87885455344546e-06, + "loss": 0.1632, + "step": 2343 + }, + { + "epoch": 0.3797796500324044, + "grad_norm": 1.1658228635787964, + "learning_rate": 4.878720037571792e-06, + "loss": 0.1735, + "step": 2344 + }, + { + "epoch": 0.37994167206740115, + "grad_norm": 1.0319538116455078, + "learning_rate": 4.878585448915047e-06, + "loss": 0.1601, + "step": 2345 + }, + { + "epoch": 0.3801036941023979, + "grad_norm": 1.0444625616073608, + "learning_rate": 4.878450787479344e-06, + "loss": 0.1496, + "step": 2346 + }, + { + "epoch": 0.3802657161373947, + "grad_norm": 0.9932622909545898, + "learning_rate": 4.8783160532688026e-06, + "loss": 0.1479, + "step": 2347 + }, + { + "epoch": 0.38042773817239145, + "grad_norm": 1.0771230459213257, + "learning_rate": 4.878181246287544e-06, + "loss": 0.172, + "step": 2348 + }, + { + "epoch": 0.3805897602073882, + "grad_norm": 0.9821139574050903, + "learning_rate": 4.878046366539696e-06, + "loss": 0.1355, + "step": 2349 + }, + { + "epoch": 0.380751782242385, + "grad_norm": 1.1744256019592285, + "learning_rate": 4.877911414029382e-06, + "loss": 0.1728, + "step": 2350 + }, + { + "epoch": 0.38091380427738175, + "grad_norm": 1.0486180782318115, + "learning_rate": 4.877776388760735e-06, + "loss": 0.151, + "step": 2351 + }, + { + "epoch": 0.38107582631237846, + "grad_norm": 0.974438488483429, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.1425, + "step": 2352 + }, + { + "epoch": 0.38123784834737523, + "grad_norm": 0.9871600270271301, + "learning_rate": 4.877506119964964e-06, + "loss": 0.1509, + "step": 2353 + }, + { + "epoch": 0.381399870382372, + "grad_norm": 1.0788719654083252, + "learning_rate": 4.877370876446109e-06, + "loss": 0.1502, + "step": 2354 + }, + { + "epoch": 0.38156189241736876, + "grad_norm": 0.8554217219352722, + "learning_rate": 4.877235560185459e-06, + "loss": 0.1317, + "step": 2355 + }, + { + "epoch": 0.38172391445236553, + "grad_norm": 1.0223811864852905, + "learning_rate": 4.877100171187154e-06, + "loss": 0.1618, + "step": 2356 + }, + { + "epoch": 0.3818859364873623, + "grad_norm": 1.042266607284546, + "learning_rate": 4.876964709455335e-06, + "loss": 0.1565, + "step": 2357 + }, + { + "epoch": 0.38204795852235907, + "grad_norm": 1.0862993001937866, + "learning_rate": 4.876829174994149e-06, + "loss": 0.1689, + "step": 2358 + }, + { + "epoch": 0.3822099805573558, + "grad_norm": 1.0016382932662964, + "learning_rate": 4.876693567807741e-06, + "loss": 0.1428, + "step": 2359 + }, + { + "epoch": 0.38237200259235254, + "grad_norm": 1.009976863861084, + "learning_rate": 4.8765578879002625e-06, + "loss": 0.1525, + "step": 2360 + }, + { + "epoch": 0.3825340246273493, + "grad_norm": 0.9850345253944397, + "learning_rate": 4.876422135275863e-06, + "loss": 0.1373, + "step": 2361 + }, + { + "epoch": 0.3826960466623461, + "grad_norm": 1.0292307138442993, + "learning_rate": 4.8762863099386984e-06, + "loss": 0.1444, + "step": 2362 + }, + { + "epoch": 0.38285806869734285, + "grad_norm": 0.9896308779716492, + "learning_rate": 4.876150411892922e-06, + "loss": 0.1527, + "step": 2363 + }, + { + "epoch": 0.3830200907323396, + "grad_norm": 1.091084599494934, + "learning_rate": 4.876014441142693e-06, + "loss": 0.1541, + "step": 2364 + }, + { + "epoch": 0.3831821127673364, + "grad_norm": 1.06158447265625, + "learning_rate": 4.875878397692172e-06, + "loss": 0.1555, + "step": 2365 + }, + { + "epoch": 0.3833441348023331, + "grad_norm": 1.0953679084777832, + "learning_rate": 4.8757422815455215e-06, + "loss": 0.1643, + "step": 2366 + }, + { + "epoch": 0.38350615683732986, + "grad_norm": 1.0464844703674316, + "learning_rate": 4.8756060927069075e-06, + "loss": 0.1512, + "step": 2367 + }, + { + "epoch": 0.3836681788723266, + "grad_norm": 1.0464715957641602, + "learning_rate": 4.875469831180495e-06, + "loss": 0.1573, + "step": 2368 + }, + { + "epoch": 0.3838302009073234, + "grad_norm": 0.9663591384887695, + "learning_rate": 4.875333496970454e-06, + "loss": 0.1446, + "step": 2369 + }, + { + "epoch": 0.38399222294232016, + "grad_norm": 1.192948341369629, + "learning_rate": 4.875197090080957e-06, + "loss": 0.1528, + "step": 2370 + }, + { + "epoch": 0.3841542449773169, + "grad_norm": 1.0319154262542725, + "learning_rate": 4.875060610516176e-06, + "loss": 0.1489, + "step": 2371 + }, + { + "epoch": 0.3843162670123137, + "grad_norm": 1.1265735626220703, + "learning_rate": 4.874924058280288e-06, + "loss": 0.1558, + "step": 2372 + }, + { + "epoch": 0.38447828904731046, + "grad_norm": 1.1732511520385742, + "learning_rate": 4.874787433377472e-06, + "loss": 0.1631, + "step": 2373 + }, + { + "epoch": 0.3846403110823072, + "grad_norm": 1.2255806922912598, + "learning_rate": 4.874650735811906e-06, + "loss": 0.1717, + "step": 2374 + }, + { + "epoch": 0.38480233311730394, + "grad_norm": 1.0163358449935913, + "learning_rate": 4.874513965587775e-06, + "loss": 0.1531, + "step": 2375 + }, + { + "epoch": 0.3849643551523007, + "grad_norm": 1.047625184059143, + "learning_rate": 4.874377122709263e-06, + "loss": 0.1702, + "step": 2376 + }, + { + "epoch": 0.3851263771872975, + "grad_norm": 1.0815023183822632, + "learning_rate": 4.874240207180556e-06, + "loss": 0.1649, + "step": 2377 + }, + { + "epoch": 0.38528839922229424, + "grad_norm": 0.9695912003517151, + "learning_rate": 4.874103219005845e-06, + "loss": 0.1392, + "step": 2378 + }, + { + "epoch": 0.385450421257291, + "grad_norm": 1.0727660655975342, + "learning_rate": 4.873966158189321e-06, + "loss": 0.1428, + "step": 2379 + }, + { + "epoch": 0.3856124432922878, + "grad_norm": 1.184160590171814, + "learning_rate": 4.873829024735176e-06, + "loss": 0.1532, + "step": 2380 + }, + { + "epoch": 0.3857744653272845, + "grad_norm": 0.9503794312477112, + "learning_rate": 4.873691818647609e-06, + "loss": 0.1313, + "step": 2381 + }, + { + "epoch": 0.38593648736228126, + "grad_norm": 1.0387951135635376, + "learning_rate": 4.873554539930815e-06, + "loss": 0.1699, + "step": 2382 + }, + { + "epoch": 0.386098509397278, + "grad_norm": 1.1206920146942139, + "learning_rate": 4.873417188588997e-06, + "loss": 0.1645, + "step": 2383 + }, + { + "epoch": 0.3862605314322748, + "grad_norm": 1.0586580038070679, + "learning_rate": 4.873279764626357e-06, + "loss": 0.1505, + "step": 2384 + }, + { + "epoch": 0.38642255346727156, + "grad_norm": 1.080195665359497, + "learning_rate": 4.873142268047099e-06, + "loss": 0.1518, + "step": 2385 + }, + { + "epoch": 0.3865845755022683, + "grad_norm": 1.0671923160552979, + "learning_rate": 4.87300469885543e-06, + "loss": 0.17, + "step": 2386 + }, + { + "epoch": 0.3867465975372651, + "grad_norm": 0.9395395517349243, + "learning_rate": 4.87286705705556e-06, + "loss": 0.1497, + "step": 2387 + }, + { + "epoch": 0.3869086195722618, + "grad_norm": 1.0905933380126953, + "learning_rate": 4.872729342651701e-06, + "loss": 0.1711, + "step": 2388 + }, + { + "epoch": 0.38707064160725857, + "grad_norm": 1.1106778383255005, + "learning_rate": 4.8725915556480655e-06, + "loss": 0.1736, + "step": 2389 + }, + { + "epoch": 0.38723266364225534, + "grad_norm": 1.1426775455474854, + "learning_rate": 4.87245369604887e-06, + "loss": 0.178, + "step": 2390 + }, + { + "epoch": 0.3873946856772521, + "grad_norm": 1.0927367210388184, + "learning_rate": 4.872315763858332e-06, + "loss": 0.1624, + "step": 2391 + }, + { + "epoch": 0.38755670771224887, + "grad_norm": 1.1219305992126465, + "learning_rate": 4.872177759080673e-06, + "loss": 0.1561, + "step": 2392 + }, + { + "epoch": 0.38771872974724564, + "grad_norm": 1.0665841102600098, + "learning_rate": 4.872039681720116e-06, + "loss": 0.1651, + "step": 2393 + }, + { + "epoch": 0.3878807517822424, + "grad_norm": 1.0562465190887451, + "learning_rate": 4.8719015317808835e-06, + "loss": 0.1674, + "step": 2394 + }, + { + "epoch": 0.3880427738172391, + "grad_norm": 1.0711188316345215, + "learning_rate": 4.8717633092672045e-06, + "loss": 0.1534, + "step": 2395 + }, + { + "epoch": 0.3882047958522359, + "grad_norm": 1.0176469087600708, + "learning_rate": 4.871625014183308e-06, + "loss": 0.1558, + "step": 2396 + }, + { + "epoch": 0.38836681788723265, + "grad_norm": 1.0027252435684204, + "learning_rate": 4.871486646533425e-06, + "loss": 0.1381, + "step": 2397 + }, + { + "epoch": 0.3885288399222294, + "grad_norm": 1.0068401098251343, + "learning_rate": 4.8713482063217895e-06, + "loss": 0.1579, + "step": 2398 + }, + { + "epoch": 0.3886908619572262, + "grad_norm": 0.9579773545265198, + "learning_rate": 4.871209693552638e-06, + "loss": 0.1335, + "step": 2399 + }, + { + "epoch": 0.38885288399222295, + "grad_norm": 1.1602519750595093, + "learning_rate": 4.871071108230208e-06, + "loss": 0.1684, + "step": 2400 + }, + { + "epoch": 0.3890149060272197, + "grad_norm": 1.4036099910736084, + "learning_rate": 4.87093245035874e-06, + "loss": 0.1646, + "step": 2401 + }, + { + "epoch": 0.3891769280622165, + "grad_norm": 1.1225903034210205, + "learning_rate": 4.8707937199424756e-06, + "loss": 0.1643, + "step": 2402 + }, + { + "epoch": 0.3893389500972132, + "grad_norm": 1.1143039464950562, + "learning_rate": 4.870654916985661e-06, + "loss": 0.1577, + "step": 2403 + }, + { + "epoch": 0.38950097213220997, + "grad_norm": 1.080872893333435, + "learning_rate": 4.870516041492543e-06, + "loss": 0.1504, + "step": 2404 + }, + { + "epoch": 0.38966299416720673, + "grad_norm": 1.2829567193984985, + "learning_rate": 4.870377093467371e-06, + "loss": 0.1817, + "step": 2405 + }, + { + "epoch": 0.3898250162022035, + "grad_norm": 1.2042837142944336, + "learning_rate": 4.870238072914396e-06, + "loss": 0.1548, + "step": 2406 + }, + { + "epoch": 0.38998703823720027, + "grad_norm": 0.9398330450057983, + "learning_rate": 4.870098979837871e-06, + "loss": 0.1389, + "step": 2407 + }, + { + "epoch": 0.39014906027219703, + "grad_norm": 1.0796722173690796, + "learning_rate": 4.869959814242054e-06, + "loss": 0.157, + "step": 2408 + }, + { + "epoch": 0.3903110823071938, + "grad_norm": 1.1259878873825073, + "learning_rate": 4.869820576131202e-06, + "loss": 0.153, + "step": 2409 + }, + { + "epoch": 0.3904731043421905, + "grad_norm": 1.019061803817749, + "learning_rate": 4.8696812655095744e-06, + "loss": 0.1538, + "step": 2410 + }, + { + "epoch": 0.3906351263771873, + "grad_norm": 0.9436379075050354, + "learning_rate": 4.869541882381435e-06, + "loss": 0.1485, + "step": 2411 + }, + { + "epoch": 0.39079714841218405, + "grad_norm": 1.071523904800415, + "learning_rate": 4.869402426751048e-06, + "loss": 0.1594, + "step": 2412 + }, + { + "epoch": 0.3909591704471808, + "grad_norm": 1.1812318563461304, + "learning_rate": 4.86926289862268e-06, + "loss": 0.1701, + "step": 2413 + }, + { + "epoch": 0.3911211924821776, + "grad_norm": 1.1143845319747925, + "learning_rate": 4.8691232980006015e-06, + "loss": 0.1499, + "step": 2414 + }, + { + "epoch": 0.39128321451717435, + "grad_norm": 1.0867897272109985, + "learning_rate": 4.868983624889083e-06, + "loss": 0.1617, + "step": 2415 + }, + { + "epoch": 0.3914452365521711, + "grad_norm": 1.0865579843521118, + "learning_rate": 4.868843879292399e-06, + "loss": 0.1634, + "step": 2416 + }, + { + "epoch": 0.39160725858716783, + "grad_norm": 0.9552433490753174, + "learning_rate": 4.868704061214824e-06, + "loss": 0.1485, + "step": 2417 + }, + { + "epoch": 0.3917692806221646, + "grad_norm": 1.0676617622375488, + "learning_rate": 4.868564170660637e-06, + "loss": 0.1744, + "step": 2418 + }, + { + "epoch": 0.39193130265716136, + "grad_norm": 0.9854423403739929, + "learning_rate": 4.868424207634118e-06, + "loss": 0.1511, + "step": 2419 + }, + { + "epoch": 0.39209332469215813, + "grad_norm": 0.8792446851730347, + "learning_rate": 4.868284172139551e-06, + "loss": 0.1247, + "step": 2420 + }, + { + "epoch": 0.3922553467271549, + "grad_norm": 1.0896074771881104, + "learning_rate": 4.868144064181218e-06, + "loss": 0.1551, + "step": 2421 + }, + { + "epoch": 0.39241736876215166, + "grad_norm": 1.0454200506210327, + "learning_rate": 4.868003883763408e-06, + "loss": 0.1417, + "step": 2422 + }, + { + "epoch": 0.39257939079714843, + "grad_norm": 0.9985229969024658, + "learning_rate": 4.8678636308904095e-06, + "loss": 0.1619, + "step": 2423 + }, + { + "epoch": 0.3927414128321452, + "grad_norm": 0.9544141888618469, + "learning_rate": 4.867723305566514e-06, + "loss": 0.1435, + "step": 2424 + }, + { + "epoch": 0.3929034348671419, + "grad_norm": 1.029191017150879, + "learning_rate": 4.867582907796016e-06, + "loss": 0.1544, + "step": 2425 + }, + { + "epoch": 0.3930654569021387, + "grad_norm": 1.372963786125183, + "learning_rate": 4.86744243758321e-06, + "loss": 0.1815, + "step": 2426 + }, + { + "epoch": 0.39322747893713544, + "grad_norm": 1.0754982233047485, + "learning_rate": 4.867301894932394e-06, + "loss": 0.1632, + "step": 2427 + }, + { + "epoch": 0.3933895009721322, + "grad_norm": 1.3161104917526245, + "learning_rate": 4.8671612798478685e-06, + "loss": 0.1773, + "step": 2428 + }, + { + "epoch": 0.393551523007129, + "grad_norm": 1.0129326581954956, + "learning_rate": 4.867020592333937e-06, + "loss": 0.1519, + "step": 2429 + }, + { + "epoch": 0.39371354504212575, + "grad_norm": 0.9305555820465088, + "learning_rate": 4.866879832394903e-06, + "loss": 0.1458, + "step": 2430 + }, + { + "epoch": 0.3938755670771225, + "grad_norm": 1.048586130142212, + "learning_rate": 4.866739000035074e-06, + "loss": 0.1473, + "step": 2431 + }, + { + "epoch": 0.3940375891121192, + "grad_norm": 1.050463318824768, + "learning_rate": 4.86659809525876e-06, + "loss": 0.1673, + "step": 2432 + }, + { + "epoch": 0.394199611147116, + "grad_norm": 0.9650267958641052, + "learning_rate": 4.86645711807027e-06, + "loss": 0.1479, + "step": 2433 + }, + { + "epoch": 0.39436163318211276, + "grad_norm": 1.1384695768356323, + "learning_rate": 4.866316068473919e-06, + "loss": 0.1779, + "step": 2434 + }, + { + "epoch": 0.3945236552171095, + "grad_norm": 1.1191463470458984, + "learning_rate": 4.866174946474023e-06, + "loss": 0.1442, + "step": 2435 + }, + { + "epoch": 0.3946856772521063, + "grad_norm": 1.047663688659668, + "learning_rate": 4.8660337520749e-06, + "loss": 0.1437, + "step": 2436 + }, + { + "epoch": 0.39484769928710306, + "grad_norm": 1.0196114778518677, + "learning_rate": 4.865892485280869e-06, + "loss": 0.1477, + "step": 2437 + }, + { + "epoch": 0.3950097213220998, + "grad_norm": 0.9830042719841003, + "learning_rate": 4.865751146096255e-06, + "loss": 0.1492, + "step": 2438 + }, + { + "epoch": 0.39517174335709654, + "grad_norm": 1.0708327293395996, + "learning_rate": 4.865609734525379e-06, + "loss": 0.1685, + "step": 2439 + }, + { + "epoch": 0.3953337653920933, + "grad_norm": 1.0699785947799683, + "learning_rate": 4.865468250572571e-06, + "loss": 0.173, + "step": 2440 + }, + { + "epoch": 0.3954957874270901, + "grad_norm": 0.9510377049446106, + "learning_rate": 4.8653266942421585e-06, + "loss": 0.1457, + "step": 2441 + }, + { + "epoch": 0.39565780946208684, + "grad_norm": 0.93174147605896, + "learning_rate": 4.865185065538472e-06, + "loss": 0.1385, + "step": 2442 + }, + { + "epoch": 0.3958198314970836, + "grad_norm": 0.9724693894386292, + "learning_rate": 4.865043364465848e-06, + "loss": 0.1609, + "step": 2443 + }, + { + "epoch": 0.3959818535320804, + "grad_norm": 0.9042225480079651, + "learning_rate": 4.86490159102862e-06, + "loss": 0.1339, + "step": 2444 + }, + { + "epoch": 0.39614387556707714, + "grad_norm": 1.1967670917510986, + "learning_rate": 4.864759745231126e-06, + "loss": 0.1431, + "step": 2445 + }, + { + "epoch": 0.3963058976020739, + "grad_norm": 1.0818557739257812, + "learning_rate": 4.8646178270777055e-06, + "loss": 0.1786, + "step": 2446 + }, + { + "epoch": 0.3964679196370706, + "grad_norm": 1.0691944360733032, + "learning_rate": 4.864475836572703e-06, + "loss": 0.1758, + "step": 2447 + }, + { + "epoch": 0.3966299416720674, + "grad_norm": 0.9906453490257263, + "learning_rate": 4.864333773720461e-06, + "loss": 0.1508, + "step": 2448 + }, + { + "epoch": 0.39679196370706415, + "grad_norm": 0.993923008441925, + "learning_rate": 4.864191638525328e-06, + "loss": 0.1369, + "step": 2449 + }, + { + "epoch": 0.3969539857420609, + "grad_norm": 1.0036680698394775, + "learning_rate": 4.8640494309916506e-06, + "loss": 0.1497, + "step": 2450 + }, + { + "epoch": 0.3971160077770577, + "grad_norm": 0.9663056135177612, + "learning_rate": 4.863907151123782e-06, + "loss": 0.1443, + "step": 2451 + }, + { + "epoch": 0.39727802981205446, + "grad_norm": 1.1457011699676514, + "learning_rate": 4.863764798926076e-06, + "loss": 0.1601, + "step": 2452 + }, + { + "epoch": 0.3974400518470512, + "grad_norm": 1.0735100507736206, + "learning_rate": 4.863622374402887e-06, + "loss": 0.1587, + "step": 2453 + }, + { + "epoch": 0.39760207388204793, + "grad_norm": 1.3038159608840942, + "learning_rate": 4.863479877558573e-06, + "loss": 0.2004, + "step": 2454 + }, + { + "epoch": 0.3977640959170447, + "grad_norm": 1.0412776470184326, + "learning_rate": 4.863337308397495e-06, + "loss": 0.1383, + "step": 2455 + }, + { + "epoch": 0.39792611795204147, + "grad_norm": 1.0914403200149536, + "learning_rate": 4.863194666924013e-06, + "loss": 0.1643, + "step": 2456 + }, + { + "epoch": 0.39808813998703824, + "grad_norm": 0.943895697593689, + "learning_rate": 4.863051953142494e-06, + "loss": 0.1351, + "step": 2457 + }, + { + "epoch": 0.398250162022035, + "grad_norm": 1.0537207126617432, + "learning_rate": 4.862909167057304e-06, + "loss": 0.1631, + "step": 2458 + }, + { + "epoch": 0.39841218405703177, + "grad_norm": 1.0041303634643555, + "learning_rate": 4.862766308672811e-06, + "loss": 0.1668, + "step": 2459 + }, + { + "epoch": 0.39857420609202854, + "grad_norm": 0.9804761409759521, + "learning_rate": 4.862623377993387e-06, + "loss": 0.1657, + "step": 2460 + }, + { + "epoch": 0.39873622812702525, + "grad_norm": 0.928268551826477, + "learning_rate": 4.862480375023405e-06, + "loss": 0.1355, + "step": 2461 + }, + { + "epoch": 0.398898250162022, + "grad_norm": 1.046536922454834, + "learning_rate": 4.862337299767241e-06, + "loss": 0.1444, + "step": 2462 + }, + { + "epoch": 0.3990602721970188, + "grad_norm": 1.052215814590454, + "learning_rate": 4.862194152229271e-06, + "loss": 0.1699, + "step": 2463 + }, + { + "epoch": 0.39922229423201555, + "grad_norm": 0.983517050743103, + "learning_rate": 4.862050932413878e-06, + "loss": 0.1326, + "step": 2464 + }, + { + "epoch": 0.3993843162670123, + "grad_norm": 1.0024183988571167, + "learning_rate": 4.861907640325442e-06, + "loss": 0.1512, + "step": 2465 + }, + { + "epoch": 0.3995463383020091, + "grad_norm": 1.0899640321731567, + "learning_rate": 4.8617642759683474e-06, + "loss": 0.1622, + "step": 2466 + }, + { + "epoch": 0.39970836033700585, + "grad_norm": 1.0080574750900269, + "learning_rate": 4.861620839346982e-06, + "loss": 0.147, + "step": 2467 + }, + { + "epoch": 0.3998703823720026, + "grad_norm": 1.098242163658142, + "learning_rate": 4.861477330465734e-06, + "loss": 0.1628, + "step": 2468 + }, + { + "epoch": 0.40003240440699933, + "grad_norm": 1.0178964138031006, + "learning_rate": 4.861333749328993e-06, + "loss": 0.1432, + "step": 2469 + }, + { + "epoch": 0.4001944264419961, + "grad_norm": 1.0105563402175903, + "learning_rate": 4.861190095941155e-06, + "loss": 0.148, + "step": 2470 + }, + { + "epoch": 0.40035644847699287, + "grad_norm": 1.041428565979004, + "learning_rate": 4.861046370306613e-06, + "loss": 0.15, + "step": 2471 + }, + { + "epoch": 0.40051847051198963, + "grad_norm": 1.3379833698272705, + "learning_rate": 4.860902572429767e-06, + "loss": 0.17, + "step": 2472 + }, + { + "epoch": 0.4006804925469864, + "grad_norm": 1.0125343799591064, + "learning_rate": 4.8607587023150145e-06, + "loss": 0.1607, + "step": 2473 + }, + { + "epoch": 0.40084251458198317, + "grad_norm": 0.9886438846588135, + "learning_rate": 4.86061475996676e-06, + "loss": 0.1524, + "step": 2474 + }, + { + "epoch": 0.40100453661697993, + "grad_norm": 1.094799280166626, + "learning_rate": 4.860470745389405e-06, + "loss": 0.1548, + "step": 2475 + }, + { + "epoch": 0.40116655865197665, + "grad_norm": 0.9762154221534729, + "learning_rate": 4.860326658587358e-06, + "loss": 0.1545, + "step": 2476 + }, + { + "epoch": 0.4013285806869734, + "grad_norm": 1.0075474977493286, + "learning_rate": 4.860182499565027e-06, + "loss": 0.1675, + "step": 2477 + }, + { + "epoch": 0.4014906027219702, + "grad_norm": 0.9806322455406189, + "learning_rate": 4.860038268326823e-06, + "loss": 0.1658, + "step": 2478 + }, + { + "epoch": 0.40165262475696695, + "grad_norm": 0.9867870211601257, + "learning_rate": 4.859893964877159e-06, + "loss": 0.1424, + "step": 2479 + }, + { + "epoch": 0.4018146467919637, + "grad_norm": 1.0405807495117188, + "learning_rate": 4.85974958922045e-06, + "loss": 0.1644, + "step": 2480 + }, + { + "epoch": 0.4019766688269605, + "grad_norm": 0.9794811606407166, + "learning_rate": 4.8596051413611155e-06, + "loss": 0.1547, + "step": 2481 + }, + { + "epoch": 0.40213869086195725, + "grad_norm": 1.097029209136963, + "learning_rate": 4.859460621303572e-06, + "loss": 0.1774, + "step": 2482 + }, + { + "epoch": 0.40230071289695396, + "grad_norm": 0.9748839139938354, + "learning_rate": 4.859316029052245e-06, + "loss": 0.1384, + "step": 2483 + }, + { + "epoch": 0.4024627349319507, + "grad_norm": 1.0535310506820679, + "learning_rate": 4.859171364611556e-06, + "loss": 0.1546, + "step": 2484 + }, + { + "epoch": 0.4026247569669475, + "grad_norm": 0.8713586330413818, + "learning_rate": 4.859026627985933e-06, + "loss": 0.1416, + "step": 2485 + }, + { + "epoch": 0.40278677900194426, + "grad_norm": 1.1586978435516357, + "learning_rate": 4.8588818191798035e-06, + "loss": 0.1747, + "step": 2486 + }, + { + "epoch": 0.40294880103694103, + "grad_norm": 1.052422285079956, + "learning_rate": 4.858736938197599e-06, + "loss": 0.1516, + "step": 2487 + }, + { + "epoch": 0.4031108230719378, + "grad_norm": 0.9205193519592285, + "learning_rate": 4.858591985043751e-06, + "loss": 0.1432, + "step": 2488 + }, + { + "epoch": 0.40327284510693456, + "grad_norm": 0.9910753965377808, + "learning_rate": 4.858446959722698e-06, + "loss": 0.1499, + "step": 2489 + }, + { + "epoch": 0.40343486714193133, + "grad_norm": 1.0436080694198608, + "learning_rate": 4.858301862238874e-06, + "loss": 0.1561, + "step": 2490 + }, + { + "epoch": 0.40359688917692804, + "grad_norm": 1.0225828886032104, + "learning_rate": 4.858156692596721e-06, + "loss": 0.151, + "step": 2491 + }, + { + "epoch": 0.4037589112119248, + "grad_norm": 1.093070387840271, + "learning_rate": 4.858011450800678e-06, + "loss": 0.1608, + "step": 2492 + }, + { + "epoch": 0.4039209332469216, + "grad_norm": 1.1206576824188232, + "learning_rate": 4.857866136855192e-06, + "loss": 0.1707, + "step": 2493 + }, + { + "epoch": 0.40408295528191834, + "grad_norm": 0.9448593854904175, + "learning_rate": 4.857720750764708e-06, + "loss": 0.1395, + "step": 2494 + }, + { + "epoch": 0.4042449773169151, + "grad_norm": 0.9939359426498413, + "learning_rate": 4.857575292533675e-06, + "loss": 0.1436, + "step": 2495 + }, + { + "epoch": 0.4044069993519119, + "grad_norm": 1.074859857559204, + "learning_rate": 4.857429762166543e-06, + "loss": 0.1645, + "step": 2496 + }, + { + "epoch": 0.40456902138690864, + "grad_norm": 0.9814433455467224, + "learning_rate": 4.857284159667766e-06, + "loss": 0.1486, + "step": 2497 + }, + { + "epoch": 0.40473104342190536, + "grad_norm": 1.1380925178527832, + "learning_rate": 4.857138485041797e-06, + "loss": 0.1672, + "step": 2498 + }, + { + "epoch": 0.4048930654569021, + "grad_norm": 1.0334879159927368, + "learning_rate": 4.8569927382930945e-06, + "loss": 0.147, + "step": 2499 + }, + { + "epoch": 0.4050550874918989, + "grad_norm": 1.037170171737671, + "learning_rate": 4.856846919426118e-06, + "loss": 0.1521, + "step": 2500 + }, + { + "epoch": 0.40521710952689566, + "grad_norm": 0.9876829385757446, + "learning_rate": 4.85670102844533e-06, + "loss": 0.1543, + "step": 2501 + }, + { + "epoch": 0.4053791315618924, + "grad_norm": 1.055932879447937, + "learning_rate": 4.856555065355193e-06, + "loss": 0.1641, + "step": 2502 + }, + { + "epoch": 0.4055411535968892, + "grad_norm": 0.9136426448822021, + "learning_rate": 4.856409030160174e-06, + "loss": 0.1424, + "step": 2503 + }, + { + "epoch": 0.40570317563188596, + "grad_norm": 1.0281522274017334, + "learning_rate": 4.856262922864741e-06, + "loss": 0.1646, + "step": 2504 + }, + { + "epoch": 0.40586519766688267, + "grad_norm": 0.9944785237312317, + "learning_rate": 4.8561167434733655e-06, + "loss": 0.144, + "step": 2505 + }, + { + "epoch": 0.40602721970187944, + "grad_norm": 1.1220792531967163, + "learning_rate": 4.855970491990518e-06, + "loss": 0.1582, + "step": 2506 + }, + { + "epoch": 0.4061892417368762, + "grad_norm": 0.8896439671516418, + "learning_rate": 4.855824168420675e-06, + "loss": 0.1386, + "step": 2507 + }, + { + "epoch": 0.406351263771873, + "grad_norm": 1.0499117374420166, + "learning_rate": 4.855677772768315e-06, + "loss": 0.1566, + "step": 2508 + }, + { + "epoch": 0.40651328580686974, + "grad_norm": 0.9450101256370544, + "learning_rate": 4.855531305037914e-06, + "loss": 0.1459, + "step": 2509 + }, + { + "epoch": 0.4066753078418665, + "grad_norm": 1.0647412538528442, + "learning_rate": 4.855384765233956e-06, + "loss": 0.1569, + "step": 2510 + }, + { + "epoch": 0.4068373298768633, + "grad_norm": 0.9091509580612183, + "learning_rate": 4.855238153360924e-06, + "loss": 0.1313, + "step": 2511 + }, + { + "epoch": 0.40699935191186, + "grad_norm": 1.0979092121124268, + "learning_rate": 4.8550914694233045e-06, + "loss": 0.1662, + "step": 2512 + }, + { + "epoch": 0.40716137394685675, + "grad_norm": 1.062098503112793, + "learning_rate": 4.854944713425585e-06, + "loss": 0.1572, + "step": 2513 + }, + { + "epoch": 0.4073233959818535, + "grad_norm": 0.9755269289016724, + "learning_rate": 4.854797885372255e-06, + "loss": 0.146, + "step": 2514 + }, + { + "epoch": 0.4074854180168503, + "grad_norm": 1.0798227787017822, + "learning_rate": 4.854650985267809e-06, + "loss": 0.1669, + "step": 2515 + }, + { + "epoch": 0.40764744005184705, + "grad_norm": 1.0882648229599, + "learning_rate": 4.854504013116741e-06, + "loss": 0.163, + "step": 2516 + }, + { + "epoch": 0.4078094620868438, + "grad_norm": 0.9996983408927917, + "learning_rate": 4.854356968923549e-06, + "loss": 0.1403, + "step": 2517 + }, + { + "epoch": 0.4079714841218406, + "grad_norm": 1.076771855354309, + "learning_rate": 4.8542098526927304e-06, + "loss": 0.1401, + "step": 2518 + }, + { + "epoch": 0.40813350615683736, + "grad_norm": 1.0781205892562866, + "learning_rate": 4.854062664428787e-06, + "loss": 0.1554, + "step": 2519 + }, + { + "epoch": 0.40829552819183407, + "grad_norm": 0.9651884436607361, + "learning_rate": 4.853915404136223e-06, + "loss": 0.1401, + "step": 2520 + }, + { + "epoch": 0.40845755022683083, + "grad_norm": 1.0154225826263428, + "learning_rate": 4.853768071819544e-06, + "loss": 0.1669, + "step": 2521 + }, + { + "epoch": 0.4086195722618276, + "grad_norm": 0.9365798830986023, + "learning_rate": 4.853620667483259e-06, + "loss": 0.1321, + "step": 2522 + }, + { + "epoch": 0.40878159429682437, + "grad_norm": 0.9837552309036255, + "learning_rate": 4.8534731911318755e-06, + "loss": 0.1488, + "step": 2523 + }, + { + "epoch": 0.40894361633182114, + "grad_norm": 1.0226497650146484, + "learning_rate": 4.853325642769908e-06, + "loss": 0.1504, + "step": 2524 + }, + { + "epoch": 0.4091056383668179, + "grad_norm": 1.206057071685791, + "learning_rate": 4.853178022401872e-06, + "loss": 0.1996, + "step": 2525 + }, + { + "epoch": 0.40926766040181467, + "grad_norm": 1.0094561576843262, + "learning_rate": 4.853030330032283e-06, + "loss": 0.1633, + "step": 2526 + }, + { + "epoch": 0.4094296824368114, + "grad_norm": 1.0124329328536987, + "learning_rate": 4.8528825656656585e-06, + "loss": 0.158, + "step": 2527 + }, + { + "epoch": 0.40959170447180815, + "grad_norm": 0.9299939274787903, + "learning_rate": 4.852734729306523e-06, + "loss": 0.1398, + "step": 2528 + }, + { + "epoch": 0.4097537265068049, + "grad_norm": 1.040216088294983, + "learning_rate": 4.852586820959398e-06, + "loss": 0.1592, + "step": 2529 + }, + { + "epoch": 0.4099157485418017, + "grad_norm": 1.0537763833999634, + "learning_rate": 4.852438840628808e-06, + "loss": 0.1607, + "step": 2530 + }, + { + "epoch": 0.41007777057679845, + "grad_norm": 0.9010412693023682, + "learning_rate": 4.852290788319284e-06, + "loss": 0.1365, + "step": 2531 + }, + { + "epoch": 0.4102397926117952, + "grad_norm": 0.9371026754379272, + "learning_rate": 4.852142664035353e-06, + "loss": 0.1374, + "step": 2532 + }, + { + "epoch": 0.410401814646792, + "grad_norm": 0.8150733113288879, + "learning_rate": 4.8519944677815495e-06, + "loss": 0.1214, + "step": 2533 + }, + { + "epoch": 0.4105638366817887, + "grad_norm": 1.083972454071045, + "learning_rate": 4.8518461995624064e-06, + "loss": 0.1656, + "step": 2534 + }, + { + "epoch": 0.41072585871678546, + "grad_norm": 0.9189485907554626, + "learning_rate": 4.851697859382461e-06, + "loss": 0.1295, + "step": 2535 + }, + { + "epoch": 0.41088788075178223, + "grad_norm": 1.0893805027008057, + "learning_rate": 4.851549447246253e-06, + "loss": 0.1678, + "step": 2536 + }, + { + "epoch": 0.411049902786779, + "grad_norm": 1.0611486434936523, + "learning_rate": 4.8514009631583215e-06, + "loss": 0.1484, + "step": 2537 + }, + { + "epoch": 0.41121192482177576, + "grad_norm": 1.0211429595947266, + "learning_rate": 4.851252407123211e-06, + "loss": 0.1585, + "step": 2538 + }, + { + "epoch": 0.41137394685677253, + "grad_norm": 1.0858979225158691, + "learning_rate": 4.851103779145467e-06, + "loss": 0.1493, + "step": 2539 + }, + { + "epoch": 0.4115359688917693, + "grad_norm": 1.152523398399353, + "learning_rate": 4.850955079229637e-06, + "loss": 0.1583, + "step": 2540 + }, + { + "epoch": 0.41169799092676607, + "grad_norm": 1.043539047241211, + "learning_rate": 4.8508063073802715e-06, + "loss": 0.1549, + "step": 2541 + }, + { + "epoch": 0.4118600129617628, + "grad_norm": 0.9617857933044434, + "learning_rate": 4.850657463601921e-06, + "loss": 0.1388, + "step": 2542 + }, + { + "epoch": 0.41202203499675955, + "grad_norm": 0.9477269649505615, + "learning_rate": 4.85050854789914e-06, + "loss": 0.1371, + "step": 2543 + }, + { + "epoch": 0.4121840570317563, + "grad_norm": 1.0493178367614746, + "learning_rate": 4.850359560276486e-06, + "loss": 0.1549, + "step": 2544 + }, + { + "epoch": 0.4123460790667531, + "grad_norm": 0.9363582134246826, + "learning_rate": 4.850210500738518e-06, + "loss": 0.1428, + "step": 2545 + }, + { + "epoch": 0.41250810110174985, + "grad_norm": 0.9234378337860107, + "learning_rate": 4.850061369289795e-06, + "loss": 0.1389, + "step": 2546 + }, + { + "epoch": 0.4126701231367466, + "grad_norm": 1.1004607677459717, + "learning_rate": 4.849912165934882e-06, + "loss": 0.166, + "step": 2547 + }, + { + "epoch": 0.4128321451717434, + "grad_norm": 1.073050856590271, + "learning_rate": 4.8497628906783425e-06, + "loss": 0.1658, + "step": 2548 + }, + { + "epoch": 0.4129941672067401, + "grad_norm": 1.1251699924468994, + "learning_rate": 4.849613543524746e-06, + "loss": 0.1618, + "step": 2549 + }, + { + "epoch": 0.41315618924173686, + "grad_norm": 0.9501916766166687, + "learning_rate": 4.84946412447866e-06, + "loss": 0.1468, + "step": 2550 + }, + { + "epoch": 0.4133182112767336, + "grad_norm": 1.087240219116211, + "learning_rate": 4.849314633544659e-06, + "loss": 0.1596, + "step": 2551 + }, + { + "epoch": 0.4134802333117304, + "grad_norm": 1.0362616777420044, + "learning_rate": 4.849165070727313e-06, + "loss": 0.1707, + "step": 2552 + }, + { + "epoch": 0.41364225534672716, + "grad_norm": 0.9123900532722473, + "learning_rate": 4.849015436031202e-06, + "loss": 0.1363, + "step": 2553 + }, + { + "epoch": 0.41380427738172393, + "grad_norm": 0.9124921560287476, + "learning_rate": 4.848865729460903e-06, + "loss": 0.1288, + "step": 2554 + }, + { + "epoch": 0.4139662994167207, + "grad_norm": 0.9334890842437744, + "learning_rate": 4.848715951020997e-06, + "loss": 0.1289, + "step": 2555 + }, + { + "epoch": 0.4141283214517174, + "grad_norm": 1.121118187904358, + "learning_rate": 4.848566100716066e-06, + "loss": 0.1475, + "step": 2556 + }, + { + "epoch": 0.4142903434867142, + "grad_norm": 1.0138769149780273, + "learning_rate": 4.848416178550697e-06, + "loss": 0.1556, + "step": 2557 + }, + { + "epoch": 0.41445236552171094, + "grad_norm": 1.0010759830474854, + "learning_rate": 4.848266184529475e-06, + "loss": 0.151, + "step": 2558 + }, + { + "epoch": 0.4146143875567077, + "grad_norm": 1.0928181409835815, + "learning_rate": 4.848116118656991e-06, + "loss": 0.1721, + "step": 2559 + }, + { + "epoch": 0.4147764095917045, + "grad_norm": 1.068526029586792, + "learning_rate": 4.847965980937836e-06, + "loss": 0.1644, + "step": 2560 + }, + { + "epoch": 0.41493843162670124, + "grad_norm": 1.0553364753723145, + "learning_rate": 4.847815771376604e-06, + "loss": 0.1501, + "step": 2561 + }, + { + "epoch": 0.415100453661698, + "grad_norm": 0.9555181860923767, + "learning_rate": 4.847665489977891e-06, + "loss": 0.1336, + "step": 2562 + }, + { + "epoch": 0.4152624756966948, + "grad_norm": 0.970905601978302, + "learning_rate": 4.847515136746295e-06, + "loss": 0.1426, + "step": 2563 + }, + { + "epoch": 0.4154244977316915, + "grad_norm": 1.1288583278656006, + "learning_rate": 4.847364711686417e-06, + "loss": 0.1637, + "step": 2564 + }, + { + "epoch": 0.41558651976668826, + "grad_norm": 1.0185610055923462, + "learning_rate": 4.8472142148028585e-06, + "loss": 0.1561, + "step": 2565 + }, + { + "epoch": 0.415748541801685, + "grad_norm": 1.0335227251052856, + "learning_rate": 4.847063646100226e-06, + "loss": 0.1576, + "step": 2566 + }, + { + "epoch": 0.4159105638366818, + "grad_norm": 0.9629384875297546, + "learning_rate": 4.846913005583125e-06, + "loss": 0.1378, + "step": 2567 + }, + { + "epoch": 0.41607258587167856, + "grad_norm": 1.0787895917892456, + "learning_rate": 4.846762293256167e-06, + "loss": 0.1794, + "step": 2568 + }, + { + "epoch": 0.4162346079066753, + "grad_norm": 1.1242948770523071, + "learning_rate": 4.84661150912396e-06, + "loss": 0.1591, + "step": 2569 + }, + { + "epoch": 0.4163966299416721, + "grad_norm": 0.9214066863059998, + "learning_rate": 4.846460653191121e-06, + "loss": 0.1397, + "step": 2570 + }, + { + "epoch": 0.4165586519766688, + "grad_norm": 1.106182336807251, + "learning_rate": 4.846309725462264e-06, + "loss": 0.1657, + "step": 2571 + }, + { + "epoch": 0.41672067401166557, + "grad_norm": 1.0790849924087524, + "learning_rate": 4.846158725942006e-06, + "loss": 0.1565, + "step": 2572 + }, + { + "epoch": 0.41688269604666234, + "grad_norm": 1.0249879360198975, + "learning_rate": 4.84600765463497e-06, + "loss": 0.1579, + "step": 2573 + }, + { + "epoch": 0.4170447180816591, + "grad_norm": 1.0132466554641724, + "learning_rate": 4.845856511545777e-06, + "loss": 0.1618, + "step": 2574 + }, + { + "epoch": 0.41720674011665587, + "grad_norm": 1.0668222904205322, + "learning_rate": 4.845705296679051e-06, + "loss": 0.1762, + "step": 2575 + }, + { + "epoch": 0.41736876215165264, + "grad_norm": 1.0214135646820068, + "learning_rate": 4.84555401003942e-06, + "loss": 0.1508, + "step": 2576 + }, + { + "epoch": 0.4175307841866494, + "grad_norm": 0.97760009765625, + "learning_rate": 4.845402651631512e-06, + "loss": 0.1552, + "step": 2577 + }, + { + "epoch": 0.4176928062216461, + "grad_norm": 0.9772970080375671, + "learning_rate": 4.845251221459958e-06, + "loss": 0.1478, + "step": 2578 + }, + { + "epoch": 0.4178548282566429, + "grad_norm": 0.999789297580719, + "learning_rate": 4.845099719529393e-06, + "loss": 0.1563, + "step": 2579 + }, + { + "epoch": 0.41801685029163965, + "grad_norm": 1.1174688339233398, + "learning_rate": 4.844948145844452e-06, + "loss": 0.1767, + "step": 2580 + }, + { + "epoch": 0.4181788723266364, + "grad_norm": 0.9995019435882568, + "learning_rate": 4.844796500409771e-06, + "loss": 0.1353, + "step": 2581 + }, + { + "epoch": 0.4183408943616332, + "grad_norm": 0.9696946144104004, + "learning_rate": 4.844644783229993e-06, + "loss": 0.1431, + "step": 2582 + }, + { + "epoch": 0.41850291639662995, + "grad_norm": 1.0892887115478516, + "learning_rate": 4.844492994309757e-06, + "loss": 0.1597, + "step": 2583 + }, + { + "epoch": 0.4186649384316267, + "grad_norm": 1.0031565427780151, + "learning_rate": 4.844341133653709e-06, + "loss": 0.1602, + "step": 2584 + }, + { + "epoch": 0.4188269604666235, + "grad_norm": 1.021321177482605, + "learning_rate": 4.844189201266497e-06, + "loss": 0.162, + "step": 2585 + }, + { + "epoch": 0.4189889825016202, + "grad_norm": 1.1473654508590698, + "learning_rate": 4.844037197152767e-06, + "loss": 0.1854, + "step": 2586 + }, + { + "epoch": 0.41915100453661697, + "grad_norm": 1.1398388147354126, + "learning_rate": 4.8438851213171715e-06, + "loss": 0.1643, + "step": 2587 + }, + { + "epoch": 0.41931302657161373, + "grad_norm": 0.9572474956512451, + "learning_rate": 4.843732973764363e-06, + "loss": 0.1537, + "step": 2588 + }, + { + "epoch": 0.4194750486066105, + "grad_norm": 0.9969721436500549, + "learning_rate": 4.843580754498999e-06, + "loss": 0.1557, + "step": 2589 + }, + { + "epoch": 0.41963707064160727, + "grad_norm": 0.9691717624664307, + "learning_rate": 4.8434284635257335e-06, + "loss": 0.156, + "step": 2590 + }, + { + "epoch": 0.41979909267660404, + "grad_norm": 1.0504859685897827, + "learning_rate": 4.8432761008492284e-06, + "loss": 0.1416, + "step": 2591 + }, + { + "epoch": 0.4199611147116008, + "grad_norm": 1.0523217916488647, + "learning_rate": 4.843123666474146e-06, + "loss": 0.1704, + "step": 2592 + }, + { + "epoch": 0.4201231367465975, + "grad_norm": 1.1369895935058594, + "learning_rate": 4.842971160405149e-06, + "loss": 0.1514, + "step": 2593 + }, + { + "epoch": 0.4202851587815943, + "grad_norm": 1.0184272527694702, + "learning_rate": 4.842818582646904e-06, + "loss": 0.1512, + "step": 2594 + }, + { + "epoch": 0.42044718081659105, + "grad_norm": 1.0141220092773438, + "learning_rate": 4.84266593320408e-06, + "loss": 0.1563, + "step": 2595 + }, + { + "epoch": 0.4206092028515878, + "grad_norm": 1.072980284690857, + "learning_rate": 4.842513212081348e-06, + "loss": 0.1629, + "step": 2596 + }, + { + "epoch": 0.4207712248865846, + "grad_norm": 1.1694176197052002, + "learning_rate": 4.842360419283381e-06, + "loss": 0.169, + "step": 2597 + }, + { + "epoch": 0.42093324692158135, + "grad_norm": 1.0028104782104492, + "learning_rate": 4.8422075548148525e-06, + "loss": 0.1462, + "step": 2598 + }, + { + "epoch": 0.4210952689565781, + "grad_norm": 0.9839292764663696, + "learning_rate": 4.84205461868044e-06, + "loss": 0.1486, + "step": 2599 + }, + { + "epoch": 0.42125729099157483, + "grad_norm": 0.9316514730453491, + "learning_rate": 4.841901610884826e-06, + "loss": 0.14, + "step": 2600 + }, + { + "epoch": 0.4214193130265716, + "grad_norm": 1.1105822324752808, + "learning_rate": 4.8417485314326895e-06, + "loss": 0.1726, + "step": 2601 + }, + { + "epoch": 0.42158133506156836, + "grad_norm": 0.9990409016609192, + "learning_rate": 4.841595380328714e-06, + "loss": 0.1669, + "step": 2602 + }, + { + "epoch": 0.42174335709656513, + "grad_norm": 0.978840172290802, + "learning_rate": 4.841442157577587e-06, + "loss": 0.1542, + "step": 2603 + }, + { + "epoch": 0.4219053791315619, + "grad_norm": 1.0527600049972534, + "learning_rate": 4.841288863183996e-06, + "loss": 0.1514, + "step": 2604 + }, + { + "epoch": 0.42206740116655866, + "grad_norm": 1.0424004793167114, + "learning_rate": 4.8411354971526316e-06, + "loss": 0.1565, + "step": 2605 + }, + { + "epoch": 0.42222942320155543, + "grad_norm": 0.9171922206878662, + "learning_rate": 4.840982059488186e-06, + "loss": 0.1424, + "step": 2606 + }, + { + "epoch": 0.4223914452365522, + "grad_norm": 0.9491299986839294, + "learning_rate": 4.840828550195355e-06, + "loss": 0.14, + "step": 2607 + }, + { + "epoch": 0.4225534672715489, + "grad_norm": 1.1257144212722778, + "learning_rate": 4.840674969278836e-06, + "loss": 0.1776, + "step": 2608 + }, + { + "epoch": 0.4227154893065457, + "grad_norm": 0.9474332928657532, + "learning_rate": 4.840521316743326e-06, + "loss": 0.1463, + "step": 2609 + }, + { + "epoch": 0.42287751134154244, + "grad_norm": 1.0280464887619019, + "learning_rate": 4.8403675925935275e-06, + "loss": 0.1528, + "step": 2610 + }, + { + "epoch": 0.4230395333765392, + "grad_norm": 1.010370135307312, + "learning_rate": 4.840213796834145e-06, + "loss": 0.1519, + "step": 2611 + }, + { + "epoch": 0.423201555411536, + "grad_norm": 1.0161646604537964, + "learning_rate": 4.8400599294698825e-06, + "loss": 0.1563, + "step": 2612 + }, + { + "epoch": 0.42336357744653275, + "grad_norm": 1.0096008777618408, + "learning_rate": 4.83990599050545e-06, + "loss": 0.1559, + "step": 2613 + }, + { + "epoch": 0.4235255994815295, + "grad_norm": 0.9370619654655457, + "learning_rate": 4.839751979945556e-06, + "loss": 0.122, + "step": 2614 + }, + { + "epoch": 0.4236876215165262, + "grad_norm": 0.9001135230064392, + "learning_rate": 4.839597897794915e-06, + "loss": 0.133, + "step": 2615 + }, + { + "epoch": 0.423849643551523, + "grad_norm": 0.9671546816825867, + "learning_rate": 4.839443744058238e-06, + "loss": 0.1472, + "step": 2616 + }, + { + "epoch": 0.42401166558651976, + "grad_norm": 0.9636013507843018, + "learning_rate": 4.839289518740245e-06, + "loss": 0.1435, + "step": 2617 + }, + { + "epoch": 0.4241736876215165, + "grad_norm": 1.114350438117981, + "learning_rate": 4.839135221845654e-06, + "loss": 0.1679, + "step": 2618 + }, + { + "epoch": 0.4243357096565133, + "grad_norm": 1.0020900964736938, + "learning_rate": 4.838980853379184e-06, + "loss": 0.1374, + "step": 2619 + }, + { + "epoch": 0.42449773169151006, + "grad_norm": 1.0408121347427368, + "learning_rate": 4.838826413345561e-06, + "loss": 0.1569, + "step": 2620 + }, + { + "epoch": 0.42465975372650683, + "grad_norm": 1.0487539768218994, + "learning_rate": 4.83867190174951e-06, + "loss": 0.1568, + "step": 2621 + }, + { + "epoch": 0.42482177576150354, + "grad_norm": 1.0748343467712402, + "learning_rate": 4.838517318595758e-06, + "loss": 0.1628, + "step": 2622 + }, + { + "epoch": 0.4249837977965003, + "grad_norm": 1.2027356624603271, + "learning_rate": 4.8383626638890355e-06, + "loss": 0.1863, + "step": 2623 + }, + { + "epoch": 0.4251458198314971, + "grad_norm": 1.07537841796875, + "learning_rate": 4.838207937634074e-06, + "loss": 0.1492, + "step": 2624 + }, + { + "epoch": 0.42530784186649384, + "grad_norm": 1.2119613885879517, + "learning_rate": 4.838053139835608e-06, + "loss": 0.1757, + "step": 2625 + }, + { + "epoch": 0.4254698639014906, + "grad_norm": 0.9143966436386108, + "learning_rate": 4.837898270498374e-06, + "loss": 0.1334, + "step": 2626 + }, + { + "epoch": 0.4256318859364874, + "grad_norm": 1.0104649066925049, + "learning_rate": 4.83774332962711e-06, + "loss": 0.1594, + "step": 2627 + }, + { + "epoch": 0.42579390797148414, + "grad_norm": 0.9676437377929688, + "learning_rate": 4.837588317226558e-06, + "loss": 0.1587, + "step": 2628 + }, + { + "epoch": 0.42595593000648085, + "grad_norm": 0.922566831111908, + "learning_rate": 4.837433233301461e-06, + "loss": 0.1529, + "step": 2629 + }, + { + "epoch": 0.4261179520414776, + "grad_norm": 1.1346608400344849, + "learning_rate": 4.837278077856562e-06, + "loss": 0.1793, + "step": 2630 + }, + { + "epoch": 0.4262799740764744, + "grad_norm": 0.9662337303161621, + "learning_rate": 4.837122850896611e-06, + "loss": 0.1447, + "step": 2631 + }, + { + "epoch": 0.42644199611147116, + "grad_norm": 1.0667489767074585, + "learning_rate": 4.836967552426355e-06, + "loss": 0.1784, + "step": 2632 + }, + { + "epoch": 0.4266040181464679, + "grad_norm": 1.030396580696106, + "learning_rate": 4.836812182450549e-06, + "loss": 0.1485, + "step": 2633 + }, + { + "epoch": 0.4267660401814647, + "grad_norm": 0.9403509497642517, + "learning_rate": 4.836656740973944e-06, + "loss": 0.1476, + "step": 2634 + }, + { + "epoch": 0.42692806221646146, + "grad_norm": 1.0584341287612915, + "learning_rate": 4.836501228001298e-06, + "loss": 0.1499, + "step": 2635 + }, + { + "epoch": 0.4270900842514582, + "grad_norm": 1.0108885765075684, + "learning_rate": 4.836345643537368e-06, + "loss": 0.1485, + "step": 2636 + }, + { + "epoch": 0.42725210628645494, + "grad_norm": 0.9687297940254211, + "learning_rate": 4.8361899875869165e-06, + "loss": 0.1516, + "step": 2637 + }, + { + "epoch": 0.4274141283214517, + "grad_norm": 0.9748333096504211, + "learning_rate": 4.836034260154704e-06, + "loss": 0.1523, + "step": 2638 + }, + { + "epoch": 0.42757615035644847, + "grad_norm": 1.1141382455825806, + "learning_rate": 4.835878461245496e-06, + "loss": 0.1485, + "step": 2639 + }, + { + "epoch": 0.42773817239144524, + "grad_norm": 0.8912030458450317, + "learning_rate": 4.83572259086406e-06, + "loss": 0.133, + "step": 2640 + }, + { + "epoch": 0.427900194426442, + "grad_norm": 1.0493932962417603, + "learning_rate": 4.835566649015165e-06, + "loss": 0.1706, + "step": 2641 + }, + { + "epoch": 0.42806221646143877, + "grad_norm": 1.024950385093689, + "learning_rate": 4.835410635703582e-06, + "loss": 0.1362, + "step": 2642 + }, + { + "epoch": 0.42822423849643554, + "grad_norm": 1.094906210899353, + "learning_rate": 4.8352545509340865e-06, + "loss": 0.1505, + "step": 2643 + }, + { + "epoch": 0.42838626053143225, + "grad_norm": 1.0383245944976807, + "learning_rate": 4.835098394711451e-06, + "loss": 0.1569, + "step": 2644 + }, + { + "epoch": 0.428548282566429, + "grad_norm": 0.9308538436889648, + "learning_rate": 4.834942167040457e-06, + "loss": 0.131, + "step": 2645 + }, + { + "epoch": 0.4287103046014258, + "grad_norm": 0.9128456711769104, + "learning_rate": 4.834785867925883e-06, + "loss": 0.1431, + "step": 2646 + }, + { + "epoch": 0.42887232663642255, + "grad_norm": 1.047548770904541, + "learning_rate": 4.8346294973725115e-06, + "loss": 0.1592, + "step": 2647 + }, + { + "epoch": 0.4290343486714193, + "grad_norm": 0.9378132820129395, + "learning_rate": 4.8344730553851275e-06, + "loss": 0.146, + "step": 2648 + }, + { + "epoch": 0.4291963707064161, + "grad_norm": 1.0160073041915894, + "learning_rate": 4.8343165419685155e-06, + "loss": 0.1598, + "step": 2649 + }, + { + "epoch": 0.42935839274141285, + "grad_norm": 1.0141408443450928, + "learning_rate": 4.834159957127468e-06, + "loss": 0.1496, + "step": 2650 + }, + { + "epoch": 0.42952041477640956, + "grad_norm": 1.0546174049377441, + "learning_rate": 4.834003300866773e-06, + "loss": 0.1479, + "step": 2651 + }, + { + "epoch": 0.42968243681140633, + "grad_norm": 1.0627154111862183, + "learning_rate": 4.833846573191227e-06, + "loss": 0.1585, + "step": 2652 + }, + { + "epoch": 0.4298444588464031, + "grad_norm": 1.0541164875030518, + "learning_rate": 4.833689774105622e-06, + "loss": 0.1551, + "step": 2653 + }, + { + "epoch": 0.43000648088139987, + "grad_norm": 0.9814350008964539, + "learning_rate": 4.833532903614758e-06, + "loss": 0.1397, + "step": 2654 + }, + { + "epoch": 0.43016850291639663, + "grad_norm": 1.0309319496154785, + "learning_rate": 4.8333759617234344e-06, + "loss": 0.1603, + "step": 2655 + }, + { + "epoch": 0.4303305249513934, + "grad_norm": 1.044171929359436, + "learning_rate": 4.833218948436453e-06, + "loss": 0.1588, + "step": 2656 + }, + { + "epoch": 0.43049254698639017, + "grad_norm": 1.0367193222045898, + "learning_rate": 4.833061863758618e-06, + "loss": 0.1734, + "step": 2657 + }, + { + "epoch": 0.43065456902138693, + "grad_norm": 0.9801927804946899, + "learning_rate": 4.832904707694736e-06, + "loss": 0.1459, + "step": 2658 + }, + { + "epoch": 0.43081659105638365, + "grad_norm": 0.9928318858146667, + "learning_rate": 4.8327474802496145e-06, + "loss": 0.1478, + "step": 2659 + }, + { + "epoch": 0.4309786130913804, + "grad_norm": 0.9557510614395142, + "learning_rate": 4.832590181428066e-06, + "loss": 0.1511, + "step": 2660 + }, + { + "epoch": 0.4311406351263772, + "grad_norm": 1.0317496061325073, + "learning_rate": 4.832432811234902e-06, + "loss": 0.1646, + "step": 2661 + }, + { + "epoch": 0.43130265716137395, + "grad_norm": 1.0222221612930298, + "learning_rate": 4.832275369674939e-06, + "loss": 0.1483, + "step": 2662 + }, + { + "epoch": 0.4314646791963707, + "grad_norm": 0.9168278574943542, + "learning_rate": 4.832117856752994e-06, + "loss": 0.1329, + "step": 2663 + }, + { + "epoch": 0.4316267012313675, + "grad_norm": 0.9790104031562805, + "learning_rate": 4.831960272473886e-06, + "loss": 0.1395, + "step": 2664 + }, + { + "epoch": 0.43178872326636425, + "grad_norm": 0.9434686303138733, + "learning_rate": 4.831802616842436e-06, + "loss": 0.138, + "step": 2665 + }, + { + "epoch": 0.43195074530136096, + "grad_norm": 1.0316851139068604, + "learning_rate": 4.831644889863471e-06, + "loss": 0.1684, + "step": 2666 + }, + { + "epoch": 0.43211276733635773, + "grad_norm": 1.1002922058105469, + "learning_rate": 4.831487091541812e-06, + "loss": 0.151, + "step": 2667 + }, + { + "epoch": 0.4322747893713545, + "grad_norm": 1.2835469245910645, + "learning_rate": 4.831329221882291e-06, + "loss": 0.1845, + "step": 2668 + }, + { + "epoch": 0.43243681140635126, + "grad_norm": 1.0473268032073975, + "learning_rate": 4.831171280889739e-06, + "loss": 0.1447, + "step": 2669 + }, + { + "epoch": 0.43259883344134803, + "grad_norm": 1.0189570188522339, + "learning_rate": 4.831013268568986e-06, + "loss": 0.1545, + "step": 2670 + }, + { + "epoch": 0.4327608554763448, + "grad_norm": 1.0731713771820068, + "learning_rate": 4.830855184924868e-06, + "loss": 0.1656, + "step": 2671 + }, + { + "epoch": 0.43292287751134156, + "grad_norm": 0.955833911895752, + "learning_rate": 4.830697029962222e-06, + "loss": 0.1447, + "step": 2672 + }, + { + "epoch": 0.4330848995463383, + "grad_norm": 1.0004793405532837, + "learning_rate": 4.830538803685887e-06, + "loss": 0.1471, + "step": 2673 + }, + { + "epoch": 0.43324692158133504, + "grad_norm": 1.0488044023513794, + "learning_rate": 4.830380506100704e-06, + "loss": 0.1557, + "step": 2674 + }, + { + "epoch": 0.4334089436163318, + "grad_norm": 1.1116876602172852, + "learning_rate": 4.830222137211518e-06, + "loss": 0.1584, + "step": 2675 + }, + { + "epoch": 0.4335709656513286, + "grad_norm": 0.9907661080360413, + "learning_rate": 4.830063697023173e-06, + "loss": 0.1414, + "step": 2676 + }, + { + "epoch": 0.43373298768632534, + "grad_norm": 1.0337917804718018, + "learning_rate": 4.829905185540517e-06, + "loss": 0.1692, + "step": 2677 + }, + { + "epoch": 0.4338950097213221, + "grad_norm": 0.9875041246414185, + "learning_rate": 4.829746602768401e-06, + "loss": 0.1462, + "step": 2678 + }, + { + "epoch": 0.4340570317563189, + "grad_norm": 1.0087636709213257, + "learning_rate": 4.829587948711677e-06, + "loss": 0.1484, + "step": 2679 + }, + { + "epoch": 0.43421905379131565, + "grad_norm": 1.0348845720291138, + "learning_rate": 4.8294292233752e-06, + "loss": 0.1457, + "step": 2680 + }, + { + "epoch": 0.43438107582631236, + "grad_norm": 0.8487588167190552, + "learning_rate": 4.829270426763824e-06, + "loss": 0.1168, + "step": 2681 + }, + { + "epoch": 0.4345430978613091, + "grad_norm": 1.086271047592163, + "learning_rate": 4.829111558882411e-06, + "loss": 0.1501, + "step": 2682 + }, + { + "epoch": 0.4347051198963059, + "grad_norm": 0.9329110980033875, + "learning_rate": 4.828952619735821e-06, + "loss": 0.1482, + "step": 2683 + }, + { + "epoch": 0.43486714193130266, + "grad_norm": 1.0108580589294434, + "learning_rate": 4.828793609328916e-06, + "loss": 0.1457, + "step": 2684 + }, + { + "epoch": 0.4350291639662994, + "grad_norm": 1.005904197692871, + "learning_rate": 4.828634527666562e-06, + "loss": 0.1391, + "step": 2685 + }, + { + "epoch": 0.4351911860012962, + "grad_norm": 0.9748127460479736, + "learning_rate": 4.828475374753627e-06, + "loss": 0.1528, + "step": 2686 + }, + { + "epoch": 0.43535320803629296, + "grad_norm": 1.1028996706008911, + "learning_rate": 4.82831615059498e-06, + "loss": 0.1504, + "step": 2687 + }, + { + "epoch": 0.43551523007128967, + "grad_norm": 1.1191354990005493, + "learning_rate": 4.828156855195493e-06, + "loss": 0.1613, + "step": 2688 + }, + { + "epoch": 0.43567725210628644, + "grad_norm": 0.9752424955368042, + "learning_rate": 4.827997488560041e-06, + "loss": 0.1479, + "step": 2689 + }, + { + "epoch": 0.4358392741412832, + "grad_norm": 1.1131165027618408, + "learning_rate": 4.827838050693499e-06, + "loss": 0.1846, + "step": 2690 + }, + { + "epoch": 0.43600129617628, + "grad_norm": 0.9194788336753845, + "learning_rate": 4.827678541600747e-06, + "loss": 0.1299, + "step": 2691 + }, + { + "epoch": 0.43616331821127674, + "grad_norm": 1.0693033933639526, + "learning_rate": 4.827518961286663e-06, + "loss": 0.1686, + "step": 2692 + }, + { + "epoch": 0.4363253402462735, + "grad_norm": 1.0003294944763184, + "learning_rate": 4.827359309756132e-06, + "loss": 0.1645, + "step": 2693 + }, + { + "epoch": 0.4364873622812703, + "grad_norm": 1.1136189699172974, + "learning_rate": 4.827199587014038e-06, + "loss": 0.1632, + "step": 2694 + }, + { + "epoch": 0.436649384316267, + "grad_norm": 0.9603433012962341, + "learning_rate": 4.8270397930652685e-06, + "loss": 0.1411, + "step": 2695 + }, + { + "epoch": 0.43681140635126375, + "grad_norm": 1.076905369758606, + "learning_rate": 4.826879927914713e-06, + "loss": 0.1593, + "step": 2696 + }, + { + "epoch": 0.4369734283862605, + "grad_norm": 0.9769685864448547, + "learning_rate": 4.826719991567262e-06, + "loss": 0.1612, + "step": 2697 + }, + { + "epoch": 0.4371354504212573, + "grad_norm": 0.9933087825775146, + "learning_rate": 4.82655998402781e-06, + "loss": 0.1777, + "step": 2698 + }, + { + "epoch": 0.43729747245625405, + "grad_norm": 0.9159439206123352, + "learning_rate": 4.826399905301252e-06, + "loss": 0.1424, + "step": 2699 + }, + { + "epoch": 0.4374594944912508, + "grad_norm": 0.9449082016944885, + "learning_rate": 4.826239755392488e-06, + "loss": 0.1445, + "step": 2700 + }, + { + "epoch": 0.4376215165262476, + "grad_norm": 1.004356861114502, + "learning_rate": 4.826079534306417e-06, + "loss": 0.1589, + "step": 2701 + }, + { + "epoch": 0.43778353856124436, + "grad_norm": 1.0486918687820435, + "learning_rate": 4.8259192420479395e-06, + "loss": 0.1625, + "step": 2702 + }, + { + "epoch": 0.43794556059624107, + "grad_norm": 1.0001510381698608, + "learning_rate": 4.825758878621963e-06, + "loss": 0.1594, + "step": 2703 + }, + { + "epoch": 0.43810758263123784, + "grad_norm": 1.1367384195327759, + "learning_rate": 4.825598444033393e-06, + "loss": 0.1772, + "step": 2704 + }, + { + "epoch": 0.4382696046662346, + "grad_norm": 1.0560551881790161, + "learning_rate": 4.825437938287139e-06, + "loss": 0.164, + "step": 2705 + }, + { + "epoch": 0.43843162670123137, + "grad_norm": 0.9951310157775879, + "learning_rate": 4.82527736138811e-06, + "loss": 0.1679, + "step": 2706 + }, + { + "epoch": 0.43859364873622814, + "grad_norm": 1.0460946559906006, + "learning_rate": 4.825116713341223e-06, + "loss": 0.1679, + "step": 2707 + }, + { + "epoch": 0.4387556707712249, + "grad_norm": 1.0097354650497437, + "learning_rate": 4.824955994151389e-06, + "loss": 0.1549, + "step": 2708 + }, + { + "epoch": 0.43891769280622167, + "grad_norm": 0.9873436093330383, + "learning_rate": 4.824795203823529e-06, + "loss": 0.1522, + "step": 2709 + }, + { + "epoch": 0.4390797148412184, + "grad_norm": 0.9897306561470032, + "learning_rate": 4.824634342362561e-06, + "loss": 0.1603, + "step": 2710 + }, + { + "epoch": 0.43924173687621515, + "grad_norm": 1.0298486948013306, + "learning_rate": 4.824473409773408e-06, + "loss": 0.1711, + "step": 2711 + }, + { + "epoch": 0.4394037589112119, + "grad_norm": 1.0957099199295044, + "learning_rate": 4.824312406060995e-06, + "loss": 0.1806, + "step": 2712 + }, + { + "epoch": 0.4395657809462087, + "grad_norm": 0.9915971755981445, + "learning_rate": 4.824151331230245e-06, + "loss": 0.1612, + "step": 2713 + }, + { + "epoch": 0.43972780298120545, + "grad_norm": 0.9229410886764526, + "learning_rate": 4.82399018528609e-06, + "loss": 0.1444, + "step": 2714 + }, + { + "epoch": 0.4398898250162022, + "grad_norm": 1.0651377439498901, + "learning_rate": 4.823828968233459e-06, + "loss": 0.1627, + "step": 2715 + }, + { + "epoch": 0.440051847051199, + "grad_norm": 0.9680843353271484, + "learning_rate": 4.823667680077285e-06, + "loss": 0.1354, + "step": 2716 + }, + { + "epoch": 0.4402138690861957, + "grad_norm": 1.1376385688781738, + "learning_rate": 4.823506320822503e-06, + "loss": 0.1696, + "step": 2717 + }, + { + "epoch": 0.44037589112119246, + "grad_norm": 0.9446072578430176, + "learning_rate": 4.8233448904740505e-06, + "loss": 0.1395, + "step": 2718 + }, + { + "epoch": 0.44053791315618923, + "grad_norm": 1.1791229248046875, + "learning_rate": 4.823183389036867e-06, + "loss": 0.1819, + "step": 2719 + }, + { + "epoch": 0.440699935191186, + "grad_norm": 1.0275377035140991, + "learning_rate": 4.823021816515893e-06, + "loss": 0.1589, + "step": 2720 + }, + { + "epoch": 0.44086195722618277, + "grad_norm": 1.0309733152389526, + "learning_rate": 4.822860172916074e-06, + "loss": 0.1415, + "step": 2721 + }, + { + "epoch": 0.44102397926117953, + "grad_norm": 1.0780889987945557, + "learning_rate": 4.8226984582423545e-06, + "loss": 0.1693, + "step": 2722 + }, + { + "epoch": 0.4411860012961763, + "grad_norm": 1.0635453462600708, + "learning_rate": 4.8225366724996826e-06, + "loss": 0.1393, + "step": 2723 + }, + { + "epoch": 0.44134802333117307, + "grad_norm": 0.916522204875946, + "learning_rate": 4.82237481569301e-06, + "loss": 0.1435, + "step": 2724 + }, + { + "epoch": 0.4415100453661698, + "grad_norm": 0.949371337890625, + "learning_rate": 4.822212887827287e-06, + "loss": 0.1441, + "step": 2725 + }, + { + "epoch": 0.44167206740116655, + "grad_norm": 0.9767130613327026, + "learning_rate": 4.822050888907469e-06, + "loss": 0.1539, + "step": 2726 + }, + { + "epoch": 0.4418340894361633, + "grad_norm": 0.9982846975326538, + "learning_rate": 4.8218888189385145e-06, + "loss": 0.1326, + "step": 2727 + }, + { + "epoch": 0.4419961114711601, + "grad_norm": 0.99046790599823, + "learning_rate": 4.82172667792538e-06, + "loss": 0.1545, + "step": 2728 + }, + { + "epoch": 0.44215813350615685, + "grad_norm": 0.9805136919021606, + "learning_rate": 4.821564465873027e-06, + "loss": 0.1505, + "step": 2729 + }, + { + "epoch": 0.4423201555411536, + "grad_norm": 0.9637601375579834, + "learning_rate": 4.821402182786421e-06, + "loss": 0.1468, + "step": 2730 + }, + { + "epoch": 0.4424821775761504, + "grad_norm": 1.0324764251708984, + "learning_rate": 4.821239828670525e-06, + "loss": 0.1655, + "step": 2731 + }, + { + "epoch": 0.4426441996111471, + "grad_norm": 1.0406447649002075, + "learning_rate": 4.8210774035303085e-06, + "loss": 0.1528, + "step": 2732 + }, + { + "epoch": 0.44280622164614386, + "grad_norm": 1.000382661819458, + "learning_rate": 4.820914907370739e-06, + "loss": 0.1424, + "step": 2733 + }, + { + "epoch": 0.4429682436811406, + "grad_norm": 1.0306026935577393, + "learning_rate": 4.82075234019679e-06, + "loss": 0.1612, + "step": 2734 + }, + { + "epoch": 0.4431302657161374, + "grad_norm": 0.9841228723526001, + "learning_rate": 4.820589702013436e-06, + "loss": 0.1515, + "step": 2735 + }, + { + "epoch": 0.44329228775113416, + "grad_norm": 0.9914823770523071, + "learning_rate": 4.820426992825653e-06, + "loss": 0.1586, + "step": 2736 + }, + { + "epoch": 0.44345430978613093, + "grad_norm": 1.0662131309509277, + "learning_rate": 4.820264212638419e-06, + "loss": 0.1617, + "step": 2737 + }, + { + "epoch": 0.4436163318211277, + "grad_norm": 1.1415921449661255, + "learning_rate": 4.820101361456715e-06, + "loss": 0.1786, + "step": 2738 + }, + { + "epoch": 0.4437783538561244, + "grad_norm": 1.0659246444702148, + "learning_rate": 4.819938439285524e-06, + "loss": 0.1655, + "step": 2739 + }, + { + "epoch": 0.4439403758911212, + "grad_norm": 1.2058014869689941, + "learning_rate": 4.819775446129832e-06, + "loss": 0.1672, + "step": 2740 + }, + { + "epoch": 0.44410239792611794, + "grad_norm": 1.0413726568222046, + "learning_rate": 4.819612381994624e-06, + "loss": 0.1361, + "step": 2741 + }, + { + "epoch": 0.4442644199611147, + "grad_norm": 0.9703432321548462, + "learning_rate": 4.8194492468848895e-06, + "loss": 0.156, + "step": 2742 + }, + { + "epoch": 0.4444264419961115, + "grad_norm": 1.1094372272491455, + "learning_rate": 4.819286040805622e-06, + "loss": 0.1517, + "step": 2743 + }, + { + "epoch": 0.44458846403110824, + "grad_norm": 1.0455727577209473, + "learning_rate": 4.8191227637618145e-06, + "loss": 0.1484, + "step": 2744 + }, + { + "epoch": 0.444750486066105, + "grad_norm": 0.9904317855834961, + "learning_rate": 4.818959415758463e-06, + "loss": 0.1528, + "step": 2745 + }, + { + "epoch": 0.4449125081011017, + "grad_norm": 0.9994639158248901, + "learning_rate": 4.818795996800564e-06, + "loss": 0.1466, + "step": 2746 + }, + { + "epoch": 0.4450745301360985, + "grad_norm": 1.0804579257965088, + "learning_rate": 4.818632506893119e-06, + "loss": 0.1715, + "step": 2747 + }, + { + "epoch": 0.44523655217109526, + "grad_norm": 1.024933099746704, + "learning_rate": 4.8184689460411306e-06, + "loss": 0.1411, + "step": 2748 + }, + { + "epoch": 0.445398574206092, + "grad_norm": 1.01506507396698, + "learning_rate": 4.8183053142496025e-06, + "loss": 0.1706, + "step": 2749 + }, + { + "epoch": 0.4455605962410888, + "grad_norm": 0.9807841777801514, + "learning_rate": 4.818141611523543e-06, + "loss": 0.1517, + "step": 2750 + }, + { + "epoch": 0.44572261827608556, + "grad_norm": 0.8835573196411133, + "learning_rate": 4.81797783786796e-06, + "loss": 0.1406, + "step": 2751 + }, + { + "epoch": 0.4458846403110823, + "grad_norm": 0.9974625110626221, + "learning_rate": 4.817813993287863e-06, + "loss": 0.1729, + "step": 2752 + }, + { + "epoch": 0.4460466623460791, + "grad_norm": 1.12760329246521, + "learning_rate": 4.817650077788268e-06, + "loss": 0.1616, + "step": 2753 + }, + { + "epoch": 0.4462086843810758, + "grad_norm": 1.0579090118408203, + "learning_rate": 4.817486091374189e-06, + "loss": 0.138, + "step": 2754 + }, + { + "epoch": 0.44637070641607257, + "grad_norm": 0.9531564116477966, + "learning_rate": 4.817322034050645e-06, + "loss": 0.1276, + "step": 2755 + }, + { + "epoch": 0.44653272845106934, + "grad_norm": 0.9502880573272705, + "learning_rate": 4.817157905822652e-06, + "loss": 0.1437, + "step": 2756 + }, + { + "epoch": 0.4466947504860661, + "grad_norm": 1.0466476678848267, + "learning_rate": 4.816993706695237e-06, + "loss": 0.1648, + "step": 2757 + }, + { + "epoch": 0.4468567725210629, + "grad_norm": 1.0479638576507568, + "learning_rate": 4.816829436673421e-06, + "loss": 0.1539, + "step": 2758 + }, + { + "epoch": 0.44701879455605964, + "grad_norm": 1.0833531618118286, + "learning_rate": 4.81666509576223e-06, + "loss": 0.1675, + "step": 2759 + }, + { + "epoch": 0.4471808165910564, + "grad_norm": 0.9449465274810791, + "learning_rate": 4.816500683966694e-06, + "loss": 0.1481, + "step": 2760 + }, + { + "epoch": 0.4473428386260531, + "grad_norm": 1.0143202543258667, + "learning_rate": 4.816336201291842e-06, + "loss": 0.152, + "step": 2761 + }, + { + "epoch": 0.4475048606610499, + "grad_norm": 1.0159945487976074, + "learning_rate": 4.816171647742708e-06, + "loss": 0.1393, + "step": 2762 + }, + { + "epoch": 0.44766688269604665, + "grad_norm": 1.0358235836029053, + "learning_rate": 4.816007023324327e-06, + "loss": 0.1563, + "step": 2763 + }, + { + "epoch": 0.4478289047310434, + "grad_norm": 0.9171221852302551, + "learning_rate": 4.815842328041736e-06, + "loss": 0.1449, + "step": 2764 + }, + { + "epoch": 0.4479909267660402, + "grad_norm": 1.0671533346176147, + "learning_rate": 4.815677561899973e-06, + "loss": 0.1561, + "step": 2765 + }, + { + "epoch": 0.44815294880103695, + "grad_norm": 1.0144374370574951, + "learning_rate": 4.815512724904081e-06, + "loss": 0.1648, + "step": 2766 + }, + { + "epoch": 0.4483149708360337, + "grad_norm": 0.9934388399124146, + "learning_rate": 4.815347817059103e-06, + "loss": 0.143, + "step": 2767 + }, + { + "epoch": 0.44847699287103043, + "grad_norm": 0.9061841368675232, + "learning_rate": 4.815182838370085e-06, + "loss": 0.139, + "step": 2768 + }, + { + "epoch": 0.4486390149060272, + "grad_norm": 1.0116674900054932, + "learning_rate": 4.815017788842075e-06, + "loss": 0.1504, + "step": 2769 + }, + { + "epoch": 0.44880103694102397, + "grad_norm": 1.012621283531189, + "learning_rate": 4.814852668480122e-06, + "loss": 0.164, + "step": 2770 + }, + { + "epoch": 0.44896305897602073, + "grad_norm": 0.9846920967102051, + "learning_rate": 4.81468747728928e-06, + "loss": 0.1494, + "step": 2771 + }, + { + "epoch": 0.4491250810110175, + "grad_norm": 0.9715894460678101, + "learning_rate": 4.814522215274603e-06, + "loss": 0.1446, + "step": 2772 + }, + { + "epoch": 0.44928710304601427, + "grad_norm": 1.0353641510009766, + "learning_rate": 4.814356882441147e-06, + "loss": 0.1578, + "step": 2773 + }, + { + "epoch": 0.44944912508101104, + "grad_norm": 0.9785324931144714, + "learning_rate": 4.81419147879397e-06, + "loss": 0.1495, + "step": 2774 + }, + { + "epoch": 0.4496111471160078, + "grad_norm": 1.190711259841919, + "learning_rate": 4.814026004338135e-06, + "loss": 0.1844, + "step": 2775 + }, + { + "epoch": 0.4497731691510045, + "grad_norm": 1.0003420114517212, + "learning_rate": 4.813860459078703e-06, + "loss": 0.1485, + "step": 2776 + }, + { + "epoch": 0.4499351911860013, + "grad_norm": 0.9324052929878235, + "learning_rate": 4.8136948430207415e-06, + "loss": 0.1421, + "step": 2777 + }, + { + "epoch": 0.45009721322099805, + "grad_norm": 1.0191559791564941, + "learning_rate": 4.813529156169317e-06, + "loss": 0.1593, + "step": 2778 + }, + { + "epoch": 0.4502592352559948, + "grad_norm": 1.0456205606460571, + "learning_rate": 4.813363398529498e-06, + "loss": 0.1552, + "step": 2779 + }, + { + "epoch": 0.4504212572909916, + "grad_norm": 0.9817255735397339, + "learning_rate": 4.813197570106357e-06, + "loss": 0.1554, + "step": 2780 + }, + { + "epoch": 0.45058327932598835, + "grad_norm": 1.0291125774383545, + "learning_rate": 4.813031670904969e-06, + "loss": 0.1549, + "step": 2781 + }, + { + "epoch": 0.4507453013609851, + "grad_norm": 0.9215115904808044, + "learning_rate": 4.8128657009304096e-06, + "loss": 0.1342, + "step": 2782 + }, + { + "epoch": 0.45090732339598183, + "grad_norm": 0.9075199961662292, + "learning_rate": 4.8126996601877564e-06, + "loss": 0.1471, + "step": 2783 + }, + { + "epoch": 0.4510693454309786, + "grad_norm": 0.9484758377075195, + "learning_rate": 4.8125335486820905e-06, + "loss": 0.1392, + "step": 2784 + }, + { + "epoch": 0.45123136746597536, + "grad_norm": 1.10005521774292, + "learning_rate": 4.812367366418493e-06, + "loss": 0.162, + "step": 2785 + }, + { + "epoch": 0.45139338950097213, + "grad_norm": 0.9598970413208008, + "learning_rate": 4.8122011134020505e-06, + "loss": 0.1256, + "step": 2786 + }, + { + "epoch": 0.4515554115359689, + "grad_norm": 1.010698914527893, + "learning_rate": 4.81203478963785e-06, + "loss": 0.1521, + "step": 2787 + }, + { + "epoch": 0.45171743357096567, + "grad_norm": 0.8810874819755554, + "learning_rate": 4.8118683951309795e-06, + "loss": 0.1325, + "step": 2788 + }, + { + "epoch": 0.45187945560596243, + "grad_norm": 1.020269751548767, + "learning_rate": 4.811701929886531e-06, + "loss": 0.1457, + "step": 2789 + }, + { + "epoch": 0.45204147764095914, + "grad_norm": 0.9705826640129089, + "learning_rate": 4.811535393909598e-06, + "loss": 0.1452, + "step": 2790 + }, + { + "epoch": 0.4522034996759559, + "grad_norm": 1.204943060874939, + "learning_rate": 4.811368787205275e-06, + "loss": 0.1884, + "step": 2791 + }, + { + "epoch": 0.4523655217109527, + "grad_norm": 1.057442545890808, + "learning_rate": 4.811202109778661e-06, + "loss": 0.1642, + "step": 2792 + }, + { + "epoch": 0.45252754374594945, + "grad_norm": 1.0493812561035156, + "learning_rate": 4.811035361634855e-06, + "loss": 0.1466, + "step": 2793 + }, + { + "epoch": 0.4526895657809462, + "grad_norm": 1.0692343711853027, + "learning_rate": 4.810868542778959e-06, + "loss": 0.1522, + "step": 2794 + }, + { + "epoch": 0.452851587815943, + "grad_norm": 0.98219895362854, + "learning_rate": 4.8107016532160784e-06, + "loss": 0.1417, + "step": 2795 + }, + { + "epoch": 0.45301360985093975, + "grad_norm": 1.0239708423614502, + "learning_rate": 4.81053469295132e-06, + "loss": 0.1593, + "step": 2796 + }, + { + "epoch": 0.4531756318859365, + "grad_norm": 0.94671630859375, + "learning_rate": 4.81036766198979e-06, + "loss": 0.1411, + "step": 2797 + }, + { + "epoch": 0.4533376539209332, + "grad_norm": 1.0224705934524536, + "learning_rate": 4.810200560336601e-06, + "loss": 0.1615, + "step": 2798 + }, + { + "epoch": 0.45349967595593, + "grad_norm": 0.970841109752655, + "learning_rate": 4.810033387996865e-06, + "loss": 0.1588, + "step": 2799 + }, + { + "epoch": 0.45366169799092676, + "grad_norm": 1.008506417274475, + "learning_rate": 4.809866144975699e-06, + "loss": 0.1653, + "step": 2800 + }, + { + "epoch": 0.4538237200259235, + "grad_norm": 0.9327491521835327, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1419, + "step": 2801 + }, + { + "epoch": 0.4539857420609203, + "grad_norm": 0.9707223176956177, + "learning_rate": 4.809531446909541e-06, + "loss": 0.161, + "step": 2802 + }, + { + "epoch": 0.45414776409591706, + "grad_norm": 1.0070552825927734, + "learning_rate": 4.8093639918747915e-06, + "loss": 0.1703, + "step": 2803 + }, + { + "epoch": 0.45430978613091383, + "grad_norm": 1.0490121841430664, + "learning_rate": 4.8091964661790926e-06, + "loss": 0.1556, + "step": 2804 + }, + { + "epoch": 0.45447180816591054, + "grad_norm": 1.058758020401001, + "learning_rate": 4.80902886982757e-06, + "loss": 0.1536, + "step": 2805 + }, + { + "epoch": 0.4546338302009073, + "grad_norm": 0.9229541420936584, + "learning_rate": 4.808861202825351e-06, + "loss": 0.1402, + "step": 2806 + }, + { + "epoch": 0.4547958522359041, + "grad_norm": 1.017575979232788, + "learning_rate": 4.8086934651775675e-06, + "loss": 0.1572, + "step": 2807 + }, + { + "epoch": 0.45495787427090084, + "grad_norm": 0.8822991847991943, + "learning_rate": 4.80852565688935e-06, + "loss": 0.1379, + "step": 2808 + }, + { + "epoch": 0.4551198963058976, + "grad_norm": 0.9270743131637573, + "learning_rate": 4.8083577779658344e-06, + "loss": 0.1278, + "step": 2809 + }, + { + "epoch": 0.4552819183408944, + "grad_norm": 0.9304889440536499, + "learning_rate": 4.808189828412157e-06, + "loss": 0.1374, + "step": 2810 + }, + { + "epoch": 0.45544394037589114, + "grad_norm": 0.9572131633758545, + "learning_rate": 4.8080218082334566e-06, + "loss": 0.1386, + "step": 2811 + }, + { + "epoch": 0.45560596241088785, + "grad_norm": 0.9808574318885803, + "learning_rate": 4.807853717434874e-06, + "loss": 0.1678, + "step": 2812 + }, + { + "epoch": 0.4557679844458846, + "grad_norm": 1.0128886699676514, + "learning_rate": 4.807685556021552e-06, + "loss": 0.1619, + "step": 2813 + }, + { + "epoch": 0.4559300064808814, + "grad_norm": 1.1356350183486938, + "learning_rate": 4.807517323998637e-06, + "loss": 0.1639, + "step": 2814 + }, + { + "epoch": 0.45609202851587816, + "grad_norm": 0.9979428052902222, + "learning_rate": 4.807349021371276e-06, + "loss": 0.1587, + "step": 2815 + }, + { + "epoch": 0.4562540505508749, + "grad_norm": 0.9667853116989136, + "learning_rate": 4.8071806481446194e-06, + "loss": 0.1401, + "step": 2816 + }, + { + "epoch": 0.4564160725858717, + "grad_norm": 0.9587422013282776, + "learning_rate": 4.807012204323817e-06, + "loss": 0.1515, + "step": 2817 + }, + { + "epoch": 0.45657809462086846, + "grad_norm": 0.9749378561973572, + "learning_rate": 4.806843689914025e-06, + "loss": 0.1488, + "step": 2818 + }, + { + "epoch": 0.4567401166558652, + "grad_norm": 1.0103932619094849, + "learning_rate": 4.806675104920397e-06, + "loss": 0.1579, + "step": 2819 + }, + { + "epoch": 0.45690213869086194, + "grad_norm": 1.1067167520523071, + "learning_rate": 4.806506449348094e-06, + "loss": 0.1602, + "step": 2820 + }, + { + "epoch": 0.4570641607258587, + "grad_norm": 1.0808192491531372, + "learning_rate": 4.8063377232022755e-06, + "loss": 0.1781, + "step": 2821 + }, + { + "epoch": 0.45722618276085547, + "grad_norm": 0.9388707280158997, + "learning_rate": 4.8061689264881036e-06, + "loss": 0.1362, + "step": 2822 + }, + { + "epoch": 0.45738820479585224, + "grad_norm": 0.9397252202033997, + "learning_rate": 4.806000059210744e-06, + "loss": 0.1385, + "step": 2823 + }, + { + "epoch": 0.457550226830849, + "grad_norm": 1.0616799592971802, + "learning_rate": 4.805831121375361e-06, + "loss": 0.1753, + "step": 2824 + }, + { + "epoch": 0.45771224886584577, + "grad_norm": 0.973735511302948, + "learning_rate": 4.805662112987127e-06, + "loss": 0.152, + "step": 2825 + }, + { + "epoch": 0.45787427090084254, + "grad_norm": 0.9995198845863342, + "learning_rate": 4.805493034051212e-06, + "loss": 0.1607, + "step": 2826 + }, + { + "epoch": 0.45803629293583925, + "grad_norm": 0.9708908200263977, + "learning_rate": 4.80532388457279e-06, + "loss": 0.141, + "step": 2827 + }, + { + "epoch": 0.458198314970836, + "grad_norm": 1.112001895904541, + "learning_rate": 4.805154664557034e-06, + "loss": 0.1623, + "step": 2828 + }, + { + "epoch": 0.4583603370058328, + "grad_norm": 0.9360017776489258, + "learning_rate": 4.804985374009125e-06, + "loss": 0.1294, + "step": 2829 + }, + { + "epoch": 0.45852235904082955, + "grad_norm": 0.8773815631866455, + "learning_rate": 4.804816012934242e-06, + "loss": 0.1369, + "step": 2830 + }, + { + "epoch": 0.4586843810758263, + "grad_norm": 0.9873615503311157, + "learning_rate": 4.8046465813375655e-06, + "loss": 0.1471, + "step": 2831 + }, + { + "epoch": 0.4588464031108231, + "grad_norm": 1.0159286260604858, + "learning_rate": 4.8044770792242815e-06, + "loss": 0.1654, + "step": 2832 + }, + { + "epoch": 0.45900842514581985, + "grad_norm": 0.9192469120025635, + "learning_rate": 4.8043075065995755e-06, + "loss": 0.1376, + "step": 2833 + }, + { + "epoch": 0.45917044718081657, + "grad_norm": 0.8338161110877991, + "learning_rate": 4.8041378634686355e-06, + "loss": 0.13, + "step": 2834 + }, + { + "epoch": 0.45933246921581333, + "grad_norm": 0.9531093835830688, + "learning_rate": 4.803968149836653e-06, + "loss": 0.1326, + "step": 2835 + }, + { + "epoch": 0.4594944912508101, + "grad_norm": 1.2418529987335205, + "learning_rate": 4.803798365708821e-06, + "loss": 0.1606, + "step": 2836 + }, + { + "epoch": 0.45965651328580687, + "grad_norm": 0.956653892993927, + "learning_rate": 4.803628511090333e-06, + "loss": 0.1383, + "step": 2837 + }, + { + "epoch": 0.45981853532080363, + "grad_norm": 0.9836128950119019, + "learning_rate": 4.803458585986389e-06, + "loss": 0.1557, + "step": 2838 + }, + { + "epoch": 0.4599805573558004, + "grad_norm": 1.1878564357757568, + "learning_rate": 4.803288590402185e-06, + "loss": 0.1793, + "step": 2839 + }, + { + "epoch": 0.46014257939079717, + "grad_norm": 1.0045480728149414, + "learning_rate": 4.803118524342925e-06, + "loss": 0.1658, + "step": 2840 + }, + { + "epoch": 0.46030460142579394, + "grad_norm": 1.1326467990875244, + "learning_rate": 4.802948387813812e-06, + "loss": 0.1524, + "step": 2841 + }, + { + "epoch": 0.46046662346079065, + "grad_norm": 0.9452898502349854, + "learning_rate": 4.80277818082005e-06, + "loss": 0.149, + "step": 2842 + }, + { + "epoch": 0.4606286454957874, + "grad_norm": 1.0143297910690308, + "learning_rate": 4.802607903366849e-06, + "loss": 0.1439, + "step": 2843 + }, + { + "epoch": 0.4607906675307842, + "grad_norm": 0.9265870451927185, + "learning_rate": 4.802437555459418e-06, + "loss": 0.1461, + "step": 2844 + }, + { + "epoch": 0.46095268956578095, + "grad_norm": 0.9968574643135071, + "learning_rate": 4.80226713710297e-06, + "loss": 0.1549, + "step": 2845 + }, + { + "epoch": 0.4611147116007777, + "grad_norm": 0.9609559774398804, + "learning_rate": 4.802096648302718e-06, + "loss": 0.1511, + "step": 2846 + }, + { + "epoch": 0.4612767336357745, + "grad_norm": 1.0482757091522217, + "learning_rate": 4.8019260890638805e-06, + "loss": 0.1849, + "step": 2847 + }, + { + "epoch": 0.46143875567077125, + "grad_norm": 1.261745572090149, + "learning_rate": 4.801755459391675e-06, + "loss": 0.2021, + "step": 2848 + }, + { + "epoch": 0.46160077770576796, + "grad_norm": 1.0810171365737915, + "learning_rate": 4.801584759291323e-06, + "loss": 0.1696, + "step": 2849 + }, + { + "epoch": 0.46176279974076473, + "grad_norm": 1.0707732439041138, + "learning_rate": 4.801413988768047e-06, + "loss": 0.1727, + "step": 2850 + }, + { + "epoch": 0.4619248217757615, + "grad_norm": 1.021913766860962, + "learning_rate": 4.8012431478270716e-06, + "loss": 0.1585, + "step": 2851 + }, + { + "epoch": 0.46208684381075826, + "grad_norm": 0.9242175817489624, + "learning_rate": 4.801072236473625e-06, + "loss": 0.1406, + "step": 2852 + }, + { + "epoch": 0.46224886584575503, + "grad_norm": 0.953057587146759, + "learning_rate": 4.800901254712936e-06, + "loss": 0.1605, + "step": 2853 + }, + { + "epoch": 0.4624108878807518, + "grad_norm": 0.9746292233467102, + "learning_rate": 4.800730202550237e-06, + "loss": 0.141, + "step": 2854 + }, + { + "epoch": 0.46257290991574856, + "grad_norm": 0.9918842315673828, + "learning_rate": 4.800559079990762e-06, + "loss": 0.1639, + "step": 2855 + }, + { + "epoch": 0.4627349319507453, + "grad_norm": 0.9181191325187683, + "learning_rate": 4.800387887039747e-06, + "loss": 0.1396, + "step": 2856 + }, + { + "epoch": 0.46289695398574204, + "grad_norm": 1.0667685270309448, + "learning_rate": 4.800216623702428e-06, + "loss": 0.1612, + "step": 2857 + }, + { + "epoch": 0.4630589760207388, + "grad_norm": 0.8903344869613647, + "learning_rate": 4.800045289984047e-06, + "loss": 0.1219, + "step": 2858 + }, + { + "epoch": 0.4632209980557356, + "grad_norm": 1.1288483142852783, + "learning_rate": 4.7998738858898475e-06, + "loss": 0.1552, + "step": 2859 + }, + { + "epoch": 0.46338302009073234, + "grad_norm": 0.9569337964057922, + "learning_rate": 4.799702411425071e-06, + "loss": 0.1304, + "step": 2860 + }, + { + "epoch": 0.4635450421257291, + "grad_norm": 0.9571936130523682, + "learning_rate": 4.799530866594967e-06, + "loss": 0.1502, + "step": 2861 + }, + { + "epoch": 0.4637070641607259, + "grad_norm": 0.955163836479187, + "learning_rate": 4.7993592514047825e-06, + "loss": 0.1475, + "step": 2862 + }, + { + "epoch": 0.4638690861957226, + "grad_norm": 0.8739729523658752, + "learning_rate": 4.79918756585977e-06, + "loss": 0.1334, + "step": 2863 + }, + { + "epoch": 0.46403110823071936, + "grad_norm": 1.0791646242141724, + "learning_rate": 4.7990158099651815e-06, + "loss": 0.1593, + "step": 2864 + }, + { + "epoch": 0.4641931302657161, + "grad_norm": 1.0804657936096191, + "learning_rate": 4.798843983726272e-06, + "loss": 0.148, + "step": 2865 + }, + { + "epoch": 0.4643551523007129, + "grad_norm": 0.9415281414985657, + "learning_rate": 4.798672087148301e-06, + "loss": 0.1481, + "step": 2866 + }, + { + "epoch": 0.46451717433570966, + "grad_norm": 0.9962906837463379, + "learning_rate": 4.798500120236526e-06, + "loss": 0.154, + "step": 2867 + }, + { + "epoch": 0.4646791963707064, + "grad_norm": 1.016023874282837, + "learning_rate": 4.79832808299621e-06, + "loss": 0.1592, + "step": 2868 + }, + { + "epoch": 0.4648412184057032, + "grad_norm": 0.8804002404212952, + "learning_rate": 4.7981559754326154e-06, + "loss": 0.1345, + "step": 2869 + }, + { + "epoch": 0.46500324044069996, + "grad_norm": 0.9187561869621277, + "learning_rate": 4.797983797551011e-06, + "loss": 0.1399, + "step": 2870 + }, + { + "epoch": 0.4651652624756967, + "grad_norm": 0.9533474445343018, + "learning_rate": 4.797811549356662e-06, + "loss": 0.1513, + "step": 2871 + }, + { + "epoch": 0.46532728451069344, + "grad_norm": 1.0123627185821533, + "learning_rate": 4.7976392308548416e-06, + "loss": 0.1563, + "step": 2872 + }, + { + "epoch": 0.4654893065456902, + "grad_norm": 1.0109021663665771, + "learning_rate": 4.7974668420508195e-06, + "loss": 0.1527, + "step": 2873 + }, + { + "epoch": 0.465651328580687, + "grad_norm": 1.1355345249176025, + "learning_rate": 4.797294382949873e-06, + "loss": 0.1783, + "step": 2874 + }, + { + "epoch": 0.46581335061568374, + "grad_norm": 1.0185506343841553, + "learning_rate": 4.797121853557277e-06, + "loss": 0.1591, + "step": 2875 + }, + { + "epoch": 0.4659753726506805, + "grad_norm": 0.9530416131019592, + "learning_rate": 4.796949253878311e-06, + "loss": 0.1403, + "step": 2876 + }, + { + "epoch": 0.4661373946856773, + "grad_norm": 0.9338066577911377, + "learning_rate": 4.796776583918256e-06, + "loss": 0.1505, + "step": 2877 + }, + { + "epoch": 0.466299416720674, + "grad_norm": 0.8232749700546265, + "learning_rate": 4.796603843682397e-06, + "loss": 0.1251, + "step": 2878 + }, + { + "epoch": 0.46646143875567075, + "grad_norm": 1.013901710510254, + "learning_rate": 4.7964310331760174e-06, + "loss": 0.1506, + "step": 2879 + }, + { + "epoch": 0.4666234607906675, + "grad_norm": 1.042008399963379, + "learning_rate": 4.796258152404406e-06, + "loss": 0.1578, + "step": 2880 + }, + { + "epoch": 0.4667854828256643, + "grad_norm": 1.0777783393859863, + "learning_rate": 4.796085201372852e-06, + "loss": 0.1805, + "step": 2881 + }, + { + "epoch": 0.46694750486066106, + "grad_norm": 0.9231262803077698, + "learning_rate": 4.795912180086646e-06, + "loss": 0.1404, + "step": 2882 + }, + { + "epoch": 0.4671095268956578, + "grad_norm": 0.972801685333252, + "learning_rate": 4.795739088551084e-06, + "loss": 0.1478, + "step": 2883 + }, + { + "epoch": 0.4672715489306546, + "grad_norm": 0.9988305568695068, + "learning_rate": 4.795565926771461e-06, + "loss": 0.1507, + "step": 2884 + }, + { + "epoch": 0.4674335709656513, + "grad_norm": 0.9387997388839722, + "learning_rate": 4.795392694753077e-06, + "loss": 0.1514, + "step": 2885 + }, + { + "epoch": 0.46759559300064807, + "grad_norm": 0.9761515855789185, + "learning_rate": 4.79521939250123e-06, + "loss": 0.1604, + "step": 2886 + }, + { + "epoch": 0.46775761503564484, + "grad_norm": 0.9907017946243286, + "learning_rate": 4.7950460200212244e-06, + "loss": 0.1448, + "step": 2887 + }, + { + "epoch": 0.4679196370706416, + "grad_norm": 1.0086826086044312, + "learning_rate": 4.7948725773183645e-06, + "loss": 0.1539, + "step": 2888 + }, + { + "epoch": 0.46808165910563837, + "grad_norm": 0.9991224408149719, + "learning_rate": 4.794699064397957e-06, + "loss": 0.156, + "step": 2889 + }, + { + "epoch": 0.46824368114063514, + "grad_norm": 1.0062789916992188, + "learning_rate": 4.794525481265312e-06, + "loss": 0.1594, + "step": 2890 + }, + { + "epoch": 0.4684057031756319, + "grad_norm": 1.0728275775909424, + "learning_rate": 4.794351827925739e-06, + "loss": 0.156, + "step": 2891 + }, + { + "epoch": 0.46856772521062867, + "grad_norm": 0.9946532249450684, + "learning_rate": 4.794178104384554e-06, + "loss": 0.1754, + "step": 2892 + }, + { + "epoch": 0.4687297472456254, + "grad_norm": 1.0101428031921387, + "learning_rate": 4.794004310647069e-06, + "loss": 0.1571, + "step": 2893 + }, + { + "epoch": 0.46889176928062215, + "grad_norm": 0.8463723063468933, + "learning_rate": 4.7938304467186036e-06, + "loss": 0.1326, + "step": 2894 + }, + { + "epoch": 0.4690537913156189, + "grad_norm": 0.8931572437286377, + "learning_rate": 4.793656512604478e-06, + "loss": 0.1415, + "step": 2895 + }, + { + "epoch": 0.4692158133506157, + "grad_norm": 1.0302973985671997, + "learning_rate": 4.793482508310014e-06, + "loss": 0.1637, + "step": 2896 + }, + { + "epoch": 0.46937783538561245, + "grad_norm": 1.800075888633728, + "learning_rate": 4.793308433840534e-06, + "loss": 0.1476, + "step": 2897 + }, + { + "epoch": 0.4695398574206092, + "grad_norm": 0.8768447041511536, + "learning_rate": 4.793134289201367e-06, + "loss": 0.1269, + "step": 2898 + }, + { + "epoch": 0.469701879455606, + "grad_norm": 0.9154813289642334, + "learning_rate": 4.792960074397839e-06, + "loss": 0.1353, + "step": 2899 + }, + { + "epoch": 0.4698639014906027, + "grad_norm": 1.0366954803466797, + "learning_rate": 4.792785789435283e-06, + "loss": 0.1506, + "step": 2900 + }, + { + "epoch": 0.47002592352559946, + "grad_norm": 0.8922173380851746, + "learning_rate": 4.792611434319029e-06, + "loss": 0.1285, + "step": 2901 + }, + { + "epoch": 0.47018794556059623, + "grad_norm": 0.9748638272285461, + "learning_rate": 4.792437009054413e-06, + "loss": 0.1438, + "step": 2902 + }, + { + "epoch": 0.470349967595593, + "grad_norm": 1.032471776008606, + "learning_rate": 4.792262513646773e-06, + "loss": 0.1685, + "step": 2903 + }, + { + "epoch": 0.47051198963058977, + "grad_norm": 1.1114881038665771, + "learning_rate": 4.792087948101447e-06, + "loss": 0.1312, + "step": 2904 + }, + { + "epoch": 0.47067401166558653, + "grad_norm": 2.0958445072174072, + "learning_rate": 4.791913312423776e-06, + "loss": 0.1668, + "step": 2905 + }, + { + "epoch": 0.4708360337005833, + "grad_norm": 1.0381637811660767, + "learning_rate": 4.791738606619105e-06, + "loss": 0.1517, + "step": 2906 + }, + { + "epoch": 0.47099805573558, + "grad_norm": 0.963405191898346, + "learning_rate": 4.7915638306927775e-06, + "loss": 0.1395, + "step": 2907 + }, + { + "epoch": 0.4711600777705768, + "grad_norm": 1.0255495309829712, + "learning_rate": 4.791388984650143e-06, + "loss": 0.1537, + "step": 2908 + }, + { + "epoch": 0.47132209980557355, + "grad_norm": 0.9075080156326294, + "learning_rate": 4.79121406849655e-06, + "loss": 0.142, + "step": 2909 + }, + { + "epoch": 0.4714841218405703, + "grad_norm": 0.981643557548523, + "learning_rate": 4.791039082237352e-06, + "loss": 0.1381, + "step": 2910 + }, + { + "epoch": 0.4716461438755671, + "grad_norm": 1.0010422468185425, + "learning_rate": 4.790864025877902e-06, + "loss": 0.1668, + "step": 2911 + }, + { + "epoch": 0.47180816591056385, + "grad_norm": 1.1024540662765503, + "learning_rate": 4.790688899423556e-06, + "loss": 0.1753, + "step": 2912 + }, + { + "epoch": 0.4719701879455606, + "grad_norm": 0.8568621277809143, + "learning_rate": 4.790513702879673e-06, + "loss": 0.1248, + "step": 2913 + }, + { + "epoch": 0.4721322099805574, + "grad_norm": 0.8953239917755127, + "learning_rate": 4.7903384362516135e-06, + "loss": 0.1436, + "step": 2914 + }, + { + "epoch": 0.4722942320155541, + "grad_norm": 1.0069905519485474, + "learning_rate": 4.790163099544741e-06, + "loss": 0.1673, + "step": 2915 + }, + { + "epoch": 0.47245625405055086, + "grad_norm": 0.9999399185180664, + "learning_rate": 4.78998769276442e-06, + "loss": 0.1499, + "step": 2916 + }, + { + "epoch": 0.47261827608554763, + "grad_norm": 1.05617356300354, + "learning_rate": 4.789812215916016e-06, + "loss": 0.1648, + "step": 2917 + }, + { + "epoch": 0.4727802981205444, + "grad_norm": 0.9993578195571899, + "learning_rate": 4.7896366690049016e-06, + "loss": 0.1546, + "step": 2918 + }, + { + "epoch": 0.47294232015554116, + "grad_norm": 0.8876897692680359, + "learning_rate": 4.789461052036444e-06, + "loss": 0.1327, + "step": 2919 + }, + { + "epoch": 0.47310434219053793, + "grad_norm": 0.9634594321250916, + "learning_rate": 4.789285365016019e-06, + "loss": 0.1576, + "step": 2920 + }, + { + "epoch": 0.4732663642255347, + "grad_norm": 0.8948929905891418, + "learning_rate": 4.7891096079490016e-06, + "loss": 0.137, + "step": 2921 + }, + { + "epoch": 0.4734283862605314, + "grad_norm": 1.0438789129257202, + "learning_rate": 4.788933780840771e-06, + "loss": 0.1549, + "step": 2922 + }, + { + "epoch": 0.4735904082955282, + "grad_norm": 1.0046937465667725, + "learning_rate": 4.7887578836967045e-06, + "loss": 0.1338, + "step": 2923 + }, + { + "epoch": 0.47375243033052494, + "grad_norm": 0.9940471649169922, + "learning_rate": 4.788581916522186e-06, + "loss": 0.1524, + "step": 2924 + }, + { + "epoch": 0.4739144523655217, + "grad_norm": 0.9437041878700256, + "learning_rate": 4.788405879322599e-06, + "loss": 0.1619, + "step": 2925 + }, + { + "epoch": 0.4740764744005185, + "grad_norm": 0.9506624937057495, + "learning_rate": 4.78822977210333e-06, + "loss": 0.1405, + "step": 2926 + }, + { + "epoch": 0.47423849643551524, + "grad_norm": 1.014735460281372, + "learning_rate": 4.788053594869767e-06, + "loss": 0.132, + "step": 2927 + }, + { + "epoch": 0.474400518470512, + "grad_norm": 1.1211403608322144, + "learning_rate": 4.787877347627302e-06, + "loss": 0.1818, + "step": 2928 + }, + { + "epoch": 0.4745625405055087, + "grad_norm": 1.0037801265716553, + "learning_rate": 4.787701030381326e-06, + "loss": 0.1575, + "step": 2929 + }, + { + "epoch": 0.4747245625405055, + "grad_norm": 0.911635160446167, + "learning_rate": 4.787524643137235e-06, + "loss": 0.1325, + "step": 2930 + }, + { + "epoch": 0.47488658457550226, + "grad_norm": 0.9881319999694824, + "learning_rate": 4.7873481859004245e-06, + "loss": 0.1486, + "step": 2931 + }, + { + "epoch": 0.475048606610499, + "grad_norm": 1.0285166501998901, + "learning_rate": 4.7871716586762965e-06, + "loss": 0.1545, + "step": 2932 + }, + { + "epoch": 0.4752106286454958, + "grad_norm": 1.2436772584915161, + "learning_rate": 4.786995061470249e-06, + "loss": 0.1594, + "step": 2933 + }, + { + "epoch": 0.47537265068049256, + "grad_norm": 1.0951595306396484, + "learning_rate": 4.786818394287688e-06, + "loss": 0.1712, + "step": 2934 + }, + { + "epoch": 0.4755346727154893, + "grad_norm": 1.0446057319641113, + "learning_rate": 4.786641657134017e-06, + "loss": 0.1583, + "step": 2935 + }, + { + "epoch": 0.4756966947504861, + "grad_norm": 0.9601659774780273, + "learning_rate": 4.786464850014646e-06, + "loss": 0.1574, + "step": 2936 + }, + { + "epoch": 0.4758587167854828, + "grad_norm": 0.9458787441253662, + "learning_rate": 4.786287972934984e-06, + "loss": 0.1306, + "step": 2937 + }, + { + "epoch": 0.47602073882047957, + "grad_norm": 1.0759549140930176, + "learning_rate": 4.786111025900442e-06, + "loss": 0.1646, + "step": 2938 + }, + { + "epoch": 0.47618276085547634, + "grad_norm": 0.9733848571777344, + "learning_rate": 4.785934008916435e-06, + "loss": 0.1616, + "step": 2939 + }, + { + "epoch": 0.4763447828904731, + "grad_norm": 0.9360319375991821, + "learning_rate": 4.785756921988379e-06, + "loss": 0.1504, + "step": 2940 + }, + { + "epoch": 0.4765068049254699, + "grad_norm": 0.9589405059814453, + "learning_rate": 4.785579765121693e-06, + "loss": 0.1566, + "step": 2941 + }, + { + "epoch": 0.47666882696046664, + "grad_norm": 1.0184801816940308, + "learning_rate": 4.785402538321798e-06, + "loss": 0.1502, + "step": 2942 + }, + { + "epoch": 0.4768308489954634, + "grad_norm": 1.011683702468872, + "learning_rate": 4.785225241594114e-06, + "loss": 0.165, + "step": 2943 + }, + { + "epoch": 0.4769928710304601, + "grad_norm": 0.9309144020080566, + "learning_rate": 4.785047874944069e-06, + "loss": 0.1353, + "step": 2944 + }, + { + "epoch": 0.4771548930654569, + "grad_norm": 1.050207495689392, + "learning_rate": 4.7848704383770875e-06, + "loss": 0.1605, + "step": 2945 + }, + { + "epoch": 0.47731691510045365, + "grad_norm": 0.9079956412315369, + "learning_rate": 4.784692931898601e-06, + "loss": 0.1404, + "step": 2946 + }, + { + "epoch": 0.4774789371354504, + "grad_norm": 0.9349779486656189, + "learning_rate": 4.784515355514039e-06, + "loss": 0.1332, + "step": 2947 + }, + { + "epoch": 0.4776409591704472, + "grad_norm": 1.089605689048767, + "learning_rate": 4.7843377092288365e-06, + "loss": 0.1575, + "step": 2948 + }, + { + "epoch": 0.47780298120544396, + "grad_norm": 1.0942819118499756, + "learning_rate": 4.784159993048427e-06, + "loss": 0.1675, + "step": 2949 + }, + { + "epoch": 0.4779650032404407, + "grad_norm": 0.9137088656425476, + "learning_rate": 4.7839822069782505e-06, + "loss": 0.1388, + "step": 2950 + }, + { + "epoch": 0.47812702527543743, + "grad_norm": 0.9597694277763367, + "learning_rate": 4.783804351023745e-06, + "loss": 0.155, + "step": 2951 + }, + { + "epoch": 0.4782890473104342, + "grad_norm": 0.9747596383094788, + "learning_rate": 4.783626425190353e-06, + "loss": 0.1504, + "step": 2952 + }, + { + "epoch": 0.47845106934543097, + "grad_norm": 0.9618208408355713, + "learning_rate": 4.783448429483518e-06, + "loss": 0.1479, + "step": 2953 + }, + { + "epoch": 0.47861309138042774, + "grad_norm": 0.9732487201690674, + "learning_rate": 4.783270363908687e-06, + "loss": 0.1515, + "step": 2954 + }, + { + "epoch": 0.4787751134154245, + "grad_norm": 1.075685977935791, + "learning_rate": 4.78309222847131e-06, + "loss": 0.1613, + "step": 2955 + }, + { + "epoch": 0.47893713545042127, + "grad_norm": 0.9794430136680603, + "learning_rate": 4.782914023176834e-06, + "loss": 0.1425, + "step": 2956 + }, + { + "epoch": 0.47909915748541804, + "grad_norm": 1.0761573314666748, + "learning_rate": 4.782735748030714e-06, + "loss": 0.1697, + "step": 2957 + }, + { + "epoch": 0.4792611795204148, + "grad_norm": 1.1438524723052979, + "learning_rate": 4.782557403038404e-06, + "loss": 0.1605, + "step": 2958 + }, + { + "epoch": 0.4794232015554115, + "grad_norm": 0.9872993230819702, + "learning_rate": 4.782378988205362e-06, + "loss": 0.1534, + "step": 2959 + }, + { + "epoch": 0.4795852235904083, + "grad_norm": 0.964551568031311, + "learning_rate": 4.7822005035370455e-06, + "loss": 0.1568, + "step": 2960 + }, + { + "epoch": 0.47974724562540505, + "grad_norm": 0.9149985313415527, + "learning_rate": 4.782021949038916e-06, + "loss": 0.138, + "step": 2961 + }, + { + "epoch": 0.4799092676604018, + "grad_norm": 0.9830363392829895, + "learning_rate": 4.781843324716437e-06, + "loss": 0.1573, + "step": 2962 + }, + { + "epoch": 0.4800712896953986, + "grad_norm": 0.9936883449554443, + "learning_rate": 4.781664630575076e-06, + "loss": 0.1575, + "step": 2963 + }, + { + "epoch": 0.48023331173039535, + "grad_norm": 1.0400924682617188, + "learning_rate": 4.7814858666202975e-06, + "loss": 0.1731, + "step": 2964 + }, + { + "epoch": 0.4803953337653921, + "grad_norm": 0.9391010999679565, + "learning_rate": 4.781307032857573e-06, + "loss": 0.1357, + "step": 2965 + }, + { + "epoch": 0.48055735580038883, + "grad_norm": 1.0018044710159302, + "learning_rate": 4.781128129292374e-06, + "loss": 0.1614, + "step": 2966 + }, + { + "epoch": 0.4807193778353856, + "grad_norm": 0.8832952976226807, + "learning_rate": 4.780949155930174e-06, + "loss": 0.142, + "step": 2967 + }, + { + "epoch": 0.48088139987038236, + "grad_norm": 1.0228934288024902, + "learning_rate": 4.7807701127764506e-06, + "loss": 0.166, + "step": 2968 + }, + { + "epoch": 0.48104342190537913, + "grad_norm": 0.8861875534057617, + "learning_rate": 4.78059099983668e-06, + "loss": 0.1308, + "step": 2969 + }, + { + "epoch": 0.4812054439403759, + "grad_norm": 1.0241353511810303, + "learning_rate": 4.780411817116344e-06, + "loss": 0.167, + "step": 2970 + }, + { + "epoch": 0.48136746597537267, + "grad_norm": 1.0048408508300781, + "learning_rate": 4.7802325646209255e-06, + "loss": 0.1637, + "step": 2971 + }, + { + "epoch": 0.48152948801036943, + "grad_norm": 0.9971985816955566, + "learning_rate": 4.780053242355908e-06, + "loss": 0.1671, + "step": 2972 + }, + { + "epoch": 0.48169151004536614, + "grad_norm": 1.110739827156067, + "learning_rate": 4.779873850326778e-06, + "loss": 0.1776, + "step": 2973 + }, + { + "epoch": 0.4818535320803629, + "grad_norm": 0.9932035207748413, + "learning_rate": 4.779694388539027e-06, + "loss": 0.1502, + "step": 2974 + }, + { + "epoch": 0.4820155541153597, + "grad_norm": 0.9594526290893555, + "learning_rate": 4.779514856998144e-06, + "loss": 0.1553, + "step": 2975 + }, + { + "epoch": 0.48217757615035645, + "grad_norm": 0.9843863844871521, + "learning_rate": 4.779335255709623e-06, + "loss": 0.1436, + "step": 2976 + }, + { + "epoch": 0.4823395981853532, + "grad_norm": 1.0879184007644653, + "learning_rate": 4.7791555846789585e-06, + "loss": 0.1591, + "step": 2977 + }, + { + "epoch": 0.48250162022035, + "grad_norm": 1.059070348739624, + "learning_rate": 4.778975843911649e-06, + "loss": 0.1468, + "step": 2978 + }, + { + "epoch": 0.48266364225534675, + "grad_norm": 0.9296880960464478, + "learning_rate": 4.778796033413193e-06, + "loss": 0.1387, + "step": 2979 + }, + { + "epoch": 0.48282566429034346, + "grad_norm": 1.030938982963562, + "learning_rate": 4.778616153189093e-06, + "loss": 0.1576, + "step": 2980 + }, + { + "epoch": 0.4829876863253402, + "grad_norm": 0.967088520526886, + "learning_rate": 4.778436203244853e-06, + "loss": 0.1516, + "step": 2981 + }, + { + "epoch": 0.483149708360337, + "grad_norm": 0.9913309216499329, + "learning_rate": 4.7782561835859795e-06, + "loss": 0.1429, + "step": 2982 + }, + { + "epoch": 0.48331173039533376, + "grad_norm": 1.0217386484146118, + "learning_rate": 4.77807609421798e-06, + "loss": 0.1632, + "step": 2983 + }, + { + "epoch": 0.48347375243033053, + "grad_norm": 0.9644971489906311, + "learning_rate": 4.777895935146364e-06, + "loss": 0.1548, + "step": 2984 + }, + { + "epoch": 0.4836357744653273, + "grad_norm": 0.9488604068756104, + "learning_rate": 4.777715706376645e-06, + "loss": 0.1591, + "step": 2985 + }, + { + "epoch": 0.48379779650032406, + "grad_norm": 1.0006232261657715, + "learning_rate": 4.777535407914338e-06, + "loss": 0.1522, + "step": 2986 + }, + { + "epoch": 0.48395981853532083, + "grad_norm": 1.0294928550720215, + "learning_rate": 4.777355039764958e-06, + "loss": 0.1633, + "step": 2987 + }, + { + "epoch": 0.48412184057031754, + "grad_norm": 0.8555730581283569, + "learning_rate": 4.777174601934026e-06, + "loss": 0.1297, + "step": 2988 + }, + { + "epoch": 0.4842838626053143, + "grad_norm": 0.8943042755126953, + "learning_rate": 4.776994094427061e-06, + "loss": 0.1294, + "step": 2989 + }, + { + "epoch": 0.4844458846403111, + "grad_norm": 0.976116418838501, + "learning_rate": 4.776813517249588e-06, + "loss": 0.1681, + "step": 2990 + }, + { + "epoch": 0.48460790667530784, + "grad_norm": 0.9380887746810913, + "learning_rate": 4.77663287040713e-06, + "loss": 0.156, + "step": 2991 + }, + { + "epoch": 0.4847699287103046, + "grad_norm": 0.9756038188934326, + "learning_rate": 4.776452153905216e-06, + "loss": 0.1531, + "step": 2992 + }, + { + "epoch": 0.4849319507453014, + "grad_norm": 0.9774467945098877, + "learning_rate": 4.776271367749375e-06, + "loss": 0.1688, + "step": 2993 + }, + { + "epoch": 0.48509397278029814, + "grad_norm": 1.0772470235824585, + "learning_rate": 4.776090511945139e-06, + "loss": 0.174, + "step": 2994 + }, + { + "epoch": 0.48525599481529486, + "grad_norm": 0.983753502368927, + "learning_rate": 4.77590958649804e-06, + "loss": 0.1422, + "step": 2995 + }, + { + "epoch": 0.4854180168502916, + "grad_norm": 1.1349477767944336, + "learning_rate": 4.775728591413616e-06, + "loss": 0.1654, + "step": 2996 + }, + { + "epoch": 0.4855800388852884, + "grad_norm": 1.0424829721450806, + "learning_rate": 4.775547526697405e-06, + "loss": 0.1515, + "step": 2997 + }, + { + "epoch": 0.48574206092028516, + "grad_norm": 1.055623173713684, + "learning_rate": 4.775366392354946e-06, + "loss": 0.1482, + "step": 2998 + }, + { + "epoch": 0.4859040829552819, + "grad_norm": 0.9909132122993469, + "learning_rate": 4.775185188391781e-06, + "loss": 0.1529, + "step": 2999 + }, + { + "epoch": 0.4860661049902787, + "grad_norm": 0.981861412525177, + "learning_rate": 4.775003914813456e-06, + "loss": 0.1505, + "step": 3000 + }, + { + "epoch": 0.48622812702527546, + "grad_norm": 0.9199234247207642, + "learning_rate": 4.774822571625516e-06, + "loss": 0.1376, + "step": 3001 + }, + { + "epoch": 0.48639014906027217, + "grad_norm": 1.0599560737609863, + "learning_rate": 4.7746411588335105e-06, + "loss": 0.1656, + "step": 3002 + }, + { + "epoch": 0.48655217109526894, + "grad_norm": 0.9341553449630737, + "learning_rate": 4.774459676442991e-06, + "loss": 0.1573, + "step": 3003 + }, + { + "epoch": 0.4867141931302657, + "grad_norm": 1.0569862127304077, + "learning_rate": 4.774278124459509e-06, + "loss": 0.1664, + "step": 3004 + }, + { + "epoch": 0.48687621516526247, + "grad_norm": 0.970799446105957, + "learning_rate": 4.774096502888619e-06, + "loss": 0.1452, + "step": 3005 + }, + { + "epoch": 0.48703823720025924, + "grad_norm": 1.03872811794281, + "learning_rate": 4.773914811735879e-06, + "loss": 0.173, + "step": 3006 + }, + { + "epoch": 0.487200259235256, + "grad_norm": 0.8187316656112671, + "learning_rate": 4.773733051006849e-06, + "loss": 0.1248, + "step": 3007 + }, + { + "epoch": 0.4873622812702528, + "grad_norm": 0.9771019816398621, + "learning_rate": 4.773551220707091e-06, + "loss": 0.1625, + "step": 3008 + }, + { + "epoch": 0.48752430330524954, + "grad_norm": 0.9272666573524475, + "learning_rate": 4.773369320842167e-06, + "loss": 0.1544, + "step": 3009 + }, + { + "epoch": 0.48768632534024625, + "grad_norm": 0.886873185634613, + "learning_rate": 4.773187351417643e-06, + "loss": 0.1422, + "step": 3010 + }, + { + "epoch": 0.487848347375243, + "grad_norm": 1.0471525192260742, + "learning_rate": 4.773005312439087e-06, + "loss": 0.1653, + "step": 3011 + }, + { + "epoch": 0.4880103694102398, + "grad_norm": 1.0167343616485596, + "learning_rate": 4.772823203912069e-06, + "loss": 0.1555, + "step": 3012 + }, + { + "epoch": 0.48817239144523655, + "grad_norm": 0.9511725902557373, + "learning_rate": 4.7726410258421616e-06, + "loss": 0.1502, + "step": 3013 + }, + { + "epoch": 0.4883344134802333, + "grad_norm": 1.0081369876861572, + "learning_rate": 4.772458778234938e-06, + "loss": 0.1521, + "step": 3014 + }, + { + "epoch": 0.4884964355152301, + "grad_norm": 1.0319949388504028, + "learning_rate": 4.772276461095975e-06, + "loss": 0.1658, + "step": 3015 + }, + { + "epoch": 0.48865845755022685, + "grad_norm": 1.0191717147827148, + "learning_rate": 4.772094074430852e-06, + "loss": 0.1671, + "step": 3016 + }, + { + "epoch": 0.48882047958522357, + "grad_norm": 0.9789111614227295, + "learning_rate": 4.771911618245148e-06, + "loss": 0.1515, + "step": 3017 + }, + { + "epoch": 0.48898250162022033, + "grad_norm": 1.0732017755508423, + "learning_rate": 4.771729092544446e-06, + "loss": 0.1736, + "step": 3018 + }, + { + "epoch": 0.4891445236552171, + "grad_norm": 0.9397895932197571, + "learning_rate": 4.771546497334331e-06, + "loss": 0.1399, + "step": 3019 + }, + { + "epoch": 0.48930654569021387, + "grad_norm": 0.9154207706451416, + "learning_rate": 4.771363832620391e-06, + "loss": 0.146, + "step": 3020 + }, + { + "epoch": 0.48946856772521063, + "grad_norm": 0.9853832125663757, + "learning_rate": 4.771181098408214e-06, + "loss": 0.1648, + "step": 3021 + }, + { + "epoch": 0.4896305897602074, + "grad_norm": 1.054186463356018, + "learning_rate": 4.770998294703392e-06, + "loss": 0.1758, + "step": 3022 + }, + { + "epoch": 0.48979261179520417, + "grad_norm": 1.047492504119873, + "learning_rate": 4.770815421511517e-06, + "loss": 0.1563, + "step": 3023 + }, + { + "epoch": 0.4899546338302009, + "grad_norm": 0.9914636015892029, + "learning_rate": 4.7706324788381865e-06, + "loss": 0.1574, + "step": 3024 + }, + { + "epoch": 0.49011665586519765, + "grad_norm": 1.0021421909332275, + "learning_rate": 4.770449466688996e-06, + "loss": 0.1657, + "step": 3025 + }, + { + "epoch": 0.4902786779001944, + "grad_norm": 0.9965320229530334, + "learning_rate": 4.770266385069547e-06, + "loss": 0.1498, + "step": 3026 + }, + { + "epoch": 0.4904406999351912, + "grad_norm": 1.0368542671203613, + "learning_rate": 4.77008323398544e-06, + "loss": 0.1701, + "step": 3027 + }, + { + "epoch": 0.49060272197018795, + "grad_norm": 0.9540989398956299, + "learning_rate": 4.769900013442279e-06, + "loss": 0.1525, + "step": 3028 + }, + { + "epoch": 0.4907647440051847, + "grad_norm": 0.9963111877441406, + "learning_rate": 4.769716723445672e-06, + "loss": 0.1578, + "step": 3029 + }, + { + "epoch": 0.4909267660401815, + "grad_norm": 0.9703772664070129, + "learning_rate": 4.769533364001225e-06, + "loss": 0.1462, + "step": 3030 + }, + { + "epoch": 0.49108878807517825, + "grad_norm": 1.1421406269073486, + "learning_rate": 4.769349935114549e-06, + "loss": 0.1572, + "step": 3031 + }, + { + "epoch": 0.49125081011017496, + "grad_norm": 0.9838953018188477, + "learning_rate": 4.769166436791257e-06, + "loss": 0.1629, + "step": 3032 + }, + { + "epoch": 0.49141283214517173, + "grad_norm": 0.8788158893585205, + "learning_rate": 4.768982869036964e-06, + "loss": 0.1342, + "step": 3033 + }, + { + "epoch": 0.4915748541801685, + "grad_norm": 0.9803071618080139, + "learning_rate": 4.768799231857285e-06, + "loss": 0.1427, + "step": 3034 + }, + { + "epoch": 0.49173687621516526, + "grad_norm": 0.9650258421897888, + "learning_rate": 4.76861552525784e-06, + "loss": 0.1321, + "step": 3035 + }, + { + "epoch": 0.49189889825016203, + "grad_norm": 0.854787290096283, + "learning_rate": 4.768431749244251e-06, + "loss": 0.1314, + "step": 3036 + }, + { + "epoch": 0.4920609202851588, + "grad_norm": 1.0625677108764648, + "learning_rate": 4.768247903822139e-06, + "loss": 0.1617, + "step": 3037 + }, + { + "epoch": 0.49222294232015557, + "grad_norm": 1.0747294425964355, + "learning_rate": 4.76806398899713e-06, + "loss": 0.1586, + "step": 3038 + }, + { + "epoch": 0.4923849643551523, + "grad_norm": 1.0550459623336792, + "learning_rate": 4.767880004774853e-06, + "loss": 0.1567, + "step": 3039 + }, + { + "epoch": 0.49254698639014904, + "grad_norm": 1.0270591974258423, + "learning_rate": 4.767695951160934e-06, + "loss": 0.1546, + "step": 3040 + }, + { + "epoch": 0.4927090084251458, + "grad_norm": 0.9791101813316345, + "learning_rate": 4.767511828161008e-06, + "loss": 0.1442, + "step": 3041 + }, + { + "epoch": 0.4928710304601426, + "grad_norm": 1.110756754875183, + "learning_rate": 4.767327635780707e-06, + "loss": 0.1766, + "step": 3042 + }, + { + "epoch": 0.49303305249513935, + "grad_norm": 1.0451438426971436, + "learning_rate": 4.7671433740256664e-06, + "loss": 0.1476, + "step": 3043 + }, + { + "epoch": 0.4931950745301361, + "grad_norm": 1.0751953125, + "learning_rate": 4.7669590429015265e-06, + "loss": 0.1748, + "step": 3044 + }, + { + "epoch": 0.4933570965651329, + "grad_norm": 0.9066848158836365, + "learning_rate": 4.766774642413925e-06, + "loss": 0.1459, + "step": 3045 + }, + { + "epoch": 0.4935191186001296, + "grad_norm": 0.9705883860588074, + "learning_rate": 4.7665901725685045e-06, + "loss": 0.1495, + "step": 3046 + }, + { + "epoch": 0.49368114063512636, + "grad_norm": 1.0257165431976318, + "learning_rate": 4.76640563337091e-06, + "loss": 0.1661, + "step": 3047 + }, + { + "epoch": 0.4938431626701231, + "grad_norm": 0.933527410030365, + "learning_rate": 4.766221024826788e-06, + "loss": 0.1444, + "step": 3048 + }, + { + "epoch": 0.4940051847051199, + "grad_norm": 0.9257835745811462, + "learning_rate": 4.766036346941787e-06, + "loss": 0.1586, + "step": 3049 + }, + { + "epoch": 0.49416720674011666, + "grad_norm": 1.0416429042816162, + "learning_rate": 4.765851599721557e-06, + "loss": 0.1379, + "step": 3050 + }, + { + "epoch": 0.4943292287751134, + "grad_norm": 1.0839523077011108, + "learning_rate": 4.7656667831717514e-06, + "loss": 0.1593, + "step": 3051 + }, + { + "epoch": 0.4944912508101102, + "grad_norm": 1.0328129529953003, + "learning_rate": 4.765481897298025e-06, + "loss": 0.1518, + "step": 3052 + }, + { + "epoch": 0.49465327284510696, + "grad_norm": 1.078291654586792, + "learning_rate": 4.765296942106035e-06, + "loss": 0.1678, + "step": 3053 + }, + { + "epoch": 0.4948152948801037, + "grad_norm": 0.9669474363327026, + "learning_rate": 4.76511191760144e-06, + "loss": 0.1472, + "step": 3054 + }, + { + "epoch": 0.49497731691510044, + "grad_norm": 0.9748349189758301, + "learning_rate": 4.764926823789903e-06, + "loss": 0.1488, + "step": 3055 + }, + { + "epoch": 0.4951393389500972, + "grad_norm": 0.9624823927879333, + "learning_rate": 4.764741660677085e-06, + "loss": 0.1376, + "step": 3056 + }, + { + "epoch": 0.495301360985094, + "grad_norm": 1.2067596912384033, + "learning_rate": 4.7645564282686534e-06, + "loss": 0.189, + "step": 3057 + }, + { + "epoch": 0.49546338302009074, + "grad_norm": 1.1277415752410889, + "learning_rate": 4.764371126570275e-06, + "loss": 0.1515, + "step": 3058 + }, + { + "epoch": 0.4956254050550875, + "grad_norm": 1.0058308839797974, + "learning_rate": 4.76418575558762e-06, + "loss": 0.1585, + "step": 3059 + }, + { + "epoch": 0.4957874270900843, + "grad_norm": 0.9348163604736328, + "learning_rate": 4.76400031532636e-06, + "loss": 0.1388, + "step": 3060 + }, + { + "epoch": 0.495949449125081, + "grad_norm": 0.9315893054008484, + "learning_rate": 4.763814805792169e-06, + "loss": 0.1327, + "step": 3061 + }, + { + "epoch": 0.49611147116007775, + "grad_norm": 0.9435446262359619, + "learning_rate": 4.763629226990724e-06, + "loss": 0.1536, + "step": 3062 + }, + { + "epoch": 0.4962734931950745, + "grad_norm": 1.0565837621688843, + "learning_rate": 4.763443578927701e-06, + "loss": 0.1814, + "step": 3063 + }, + { + "epoch": 0.4964355152300713, + "grad_norm": 0.9601361155509949, + "learning_rate": 4.763257861608783e-06, + "loss": 0.1412, + "step": 3064 + }, + { + "epoch": 0.49659753726506806, + "grad_norm": 0.9786501526832581, + "learning_rate": 4.763072075039651e-06, + "loss": 0.1485, + "step": 3065 + }, + { + "epoch": 0.4967595593000648, + "grad_norm": 0.9912653565406799, + "learning_rate": 4.762886219225991e-06, + "loss": 0.1415, + "step": 3066 + }, + { + "epoch": 0.4969215813350616, + "grad_norm": 0.8689770102500916, + "learning_rate": 4.762700294173487e-06, + "loss": 0.1392, + "step": 3067 + }, + { + "epoch": 0.4970836033700583, + "grad_norm": 1.0026378631591797, + "learning_rate": 4.762514299887831e-06, + "loss": 0.1389, + "step": 3068 + }, + { + "epoch": 0.49724562540505507, + "grad_norm": 0.907612144947052, + "learning_rate": 4.762328236374713e-06, + "loss": 0.1334, + "step": 3069 + }, + { + "epoch": 0.49740764744005184, + "grad_norm": 1.0089789628982544, + "learning_rate": 4.762142103639824e-06, + "loss": 0.1653, + "step": 3070 + }, + { + "epoch": 0.4975696694750486, + "grad_norm": 1.0177544355392456, + "learning_rate": 4.761955901688862e-06, + "loss": 0.1704, + "step": 3071 + }, + { + "epoch": 0.49773169151004537, + "grad_norm": 1.010158658027649, + "learning_rate": 4.761769630527523e-06, + "loss": 0.1641, + "step": 3072 + }, + { + "epoch": 0.49789371354504214, + "grad_norm": 0.8676300644874573, + "learning_rate": 4.761583290161507e-06, + "loss": 0.1422, + "step": 3073 + }, + { + "epoch": 0.4980557355800389, + "grad_norm": 0.9401457905769348, + "learning_rate": 4.761396880596515e-06, + "loss": 0.1441, + "step": 3074 + }, + { + "epoch": 0.4982177576150357, + "grad_norm": 1.0235344171524048, + "learning_rate": 4.761210401838251e-06, + "loss": 0.1493, + "step": 3075 + }, + { + "epoch": 0.4983797796500324, + "grad_norm": 0.9820840358734131, + "learning_rate": 4.76102385389242e-06, + "loss": 0.135, + "step": 3076 + }, + { + "epoch": 0.49854180168502915, + "grad_norm": 0.96462482213974, + "learning_rate": 4.760837236764731e-06, + "loss": 0.1605, + "step": 3077 + }, + { + "epoch": 0.4987038237200259, + "grad_norm": 0.929373025894165, + "learning_rate": 4.760650550460895e-06, + "loss": 0.1183, + "step": 3078 + }, + { + "epoch": 0.4988658457550227, + "grad_norm": 0.9171031713485718, + "learning_rate": 4.760463794986622e-06, + "loss": 0.1457, + "step": 3079 + }, + { + "epoch": 0.49902786779001945, + "grad_norm": 0.9517503976821899, + "learning_rate": 4.760276970347627e-06, + "loss": 0.136, + "step": 3080 + }, + { + "epoch": 0.4991898898250162, + "grad_norm": 0.9725329875946045, + "learning_rate": 4.760090076549626e-06, + "loss": 0.152, + "step": 3081 + }, + { + "epoch": 0.499351911860013, + "grad_norm": 0.9958840012550354, + "learning_rate": 4.759903113598338e-06, + "loss": 0.1427, + "step": 3082 + }, + { + "epoch": 0.4995139338950097, + "grad_norm": 1.0073212385177612, + "learning_rate": 4.759716081499484e-06, + "loss": 0.154, + "step": 3083 + }, + { + "epoch": 0.49967595593000647, + "grad_norm": 0.9372782707214355, + "learning_rate": 4.759528980258786e-06, + "loss": 0.1594, + "step": 3084 + }, + { + "epoch": 0.49983797796500323, + "grad_norm": 0.9391396641731262, + "learning_rate": 4.7593418098819695e-06, + "loss": 0.1285, + "step": 3085 + }, + { + "epoch": 0.5, + "grad_norm": 0.9143754839897156, + "learning_rate": 4.759154570374761e-06, + "loss": 0.1312, + "step": 3086 + }, + { + "epoch": 0.5001620220349967, + "grad_norm": 1.103759765625, + "learning_rate": 4.75896726174289e-06, + "loss": 0.1854, + "step": 3087 + }, + { + "epoch": 0.5003240440699935, + "grad_norm": 0.9957083463668823, + "learning_rate": 4.758779883992087e-06, + "loss": 0.1459, + "step": 3088 + }, + { + "epoch": 0.5004860661049902, + "grad_norm": 1.0584758520126343, + "learning_rate": 4.758592437128086e-06, + "loss": 0.1514, + "step": 3089 + }, + { + "epoch": 0.5006480881399871, + "grad_norm": 0.9559106230735779, + "learning_rate": 4.758404921156622e-06, + "loss": 0.1483, + "step": 3090 + }, + { + "epoch": 0.5008101101749838, + "grad_norm": 1.1724073886871338, + "learning_rate": 4.7582173360834326e-06, + "loss": 0.1854, + "step": 3091 + }, + { + "epoch": 0.5009721322099806, + "grad_norm": 1.016783595085144, + "learning_rate": 4.7580296819142565e-06, + "loss": 0.1646, + "step": 3092 + }, + { + "epoch": 0.5011341542449773, + "grad_norm": 0.9434977769851685, + "learning_rate": 4.757841958654838e-06, + "loss": 0.1467, + "step": 3093 + }, + { + "epoch": 0.501296176279974, + "grad_norm": 0.8541172742843628, + "learning_rate": 4.757654166310919e-06, + "loss": 0.1325, + "step": 3094 + }, + { + "epoch": 0.5014581983149708, + "grad_norm": 0.8900263905525208, + "learning_rate": 4.757466304888245e-06, + "loss": 0.1388, + "step": 3095 + }, + { + "epoch": 0.5016202203499676, + "grad_norm": 0.9139759540557861, + "learning_rate": 4.757278374392567e-06, + "loss": 0.1381, + "step": 3096 + }, + { + "epoch": 0.5017822423849644, + "grad_norm": 0.8878260254859924, + "learning_rate": 4.757090374829631e-06, + "loss": 0.1404, + "step": 3097 + }, + { + "epoch": 0.5019442644199611, + "grad_norm": 1.07386314868927, + "learning_rate": 4.7569023062051936e-06, + "loss": 0.1747, + "step": 3098 + }, + { + "epoch": 0.5021062864549579, + "grad_norm": 0.9467965960502625, + "learning_rate": 4.756714168525006e-06, + "loss": 0.1505, + "step": 3099 + }, + { + "epoch": 0.5022683084899546, + "grad_norm": 0.9320406913757324, + "learning_rate": 4.756525961794826e-06, + "loss": 0.1414, + "step": 3100 + }, + { + "epoch": 0.5024303305249513, + "grad_norm": 0.9513562321662903, + "learning_rate": 4.756337686020413e-06, + "loss": 0.1516, + "step": 3101 + }, + { + "epoch": 0.5025923525599482, + "grad_norm": 0.9535584449768066, + "learning_rate": 4.756149341207526e-06, + "loss": 0.1461, + "step": 3102 + }, + { + "epoch": 0.5027543745949449, + "grad_norm": 0.9998999834060669, + "learning_rate": 4.75596092736193e-06, + "loss": 0.1684, + "step": 3103 + }, + { + "epoch": 0.5029163966299417, + "grad_norm": 1.0532805919647217, + "learning_rate": 4.755772444489388e-06, + "loss": 0.1307, + "step": 3104 + }, + { + "epoch": 0.5030784186649384, + "grad_norm": 0.9271810054779053, + "learning_rate": 4.7555838925956686e-06, + "loss": 0.1446, + "step": 3105 + }, + { + "epoch": 0.5032404406999352, + "grad_norm": 1.0862815380096436, + "learning_rate": 4.75539527168654e-06, + "loss": 0.1484, + "step": 3106 + }, + { + "epoch": 0.5034024627349319, + "grad_norm": 1.0652508735656738, + "learning_rate": 4.755206581767775e-06, + "loss": 0.1749, + "step": 3107 + }, + { + "epoch": 0.5035644847699287, + "grad_norm": 0.9044068455696106, + "learning_rate": 4.755017822845145e-06, + "loss": 0.1525, + "step": 3108 + }, + { + "epoch": 0.5037265068049255, + "grad_norm": 0.9966428279876709, + "learning_rate": 4.754828994924428e-06, + "loss": 0.1536, + "step": 3109 + }, + { + "epoch": 0.5038885288399222, + "grad_norm": 1.0621579885482788, + "learning_rate": 4.754640098011399e-06, + "loss": 0.1773, + "step": 3110 + }, + { + "epoch": 0.504050550874919, + "grad_norm": 1.0683941841125488, + "learning_rate": 4.754451132111839e-06, + "loss": 0.1586, + "step": 3111 + }, + { + "epoch": 0.5042125729099157, + "grad_norm": 0.7734267115592957, + "learning_rate": 4.754262097231531e-06, + "loss": 0.1281, + "step": 3112 + }, + { + "epoch": 0.5043745949449125, + "grad_norm": 1.0103263854980469, + "learning_rate": 4.754072993376258e-06, + "loss": 0.1583, + "step": 3113 + }, + { + "epoch": 0.5045366169799093, + "grad_norm": 0.9807709455490112, + "learning_rate": 4.753883820551806e-06, + "loss": 0.148, + "step": 3114 + }, + { + "epoch": 0.5046986390149061, + "grad_norm": 1.0581525564193726, + "learning_rate": 4.753694578763963e-06, + "loss": 0.1637, + "step": 3115 + }, + { + "epoch": 0.5048606610499028, + "grad_norm": 1.050081491470337, + "learning_rate": 4.75350526801852e-06, + "loss": 0.1468, + "step": 3116 + }, + { + "epoch": 0.5050226830848995, + "grad_norm": 0.991671621799469, + "learning_rate": 4.753315888321269e-06, + "loss": 0.1562, + "step": 3117 + }, + { + "epoch": 0.5051847051198963, + "grad_norm": 0.9450017809867859, + "learning_rate": 4.753126439678005e-06, + "loss": 0.1554, + "step": 3118 + }, + { + "epoch": 0.505346727154893, + "grad_norm": 0.8860989809036255, + "learning_rate": 4.752936922094524e-06, + "loss": 0.1439, + "step": 3119 + }, + { + "epoch": 0.5055087491898899, + "grad_norm": 1.0846068859100342, + "learning_rate": 4.752747335576626e-06, + "loss": 0.1612, + "step": 3120 + }, + { + "epoch": 0.5056707712248866, + "grad_norm": 1.1103551387786865, + "learning_rate": 4.75255768013011e-06, + "loss": 0.1659, + "step": 3121 + }, + { + "epoch": 0.5058327932598834, + "grad_norm": 0.9545550346374512, + "learning_rate": 4.752367955760781e-06, + "loss": 0.1395, + "step": 3122 + }, + { + "epoch": 0.5059948152948801, + "grad_norm": 1.050429344177246, + "learning_rate": 4.752178162474443e-06, + "loss": 0.1587, + "step": 3123 + }, + { + "epoch": 0.5061568373298768, + "grad_norm": 0.9017760157585144, + "learning_rate": 4.751988300276903e-06, + "loss": 0.1398, + "step": 3124 + }, + { + "epoch": 0.5063188593648736, + "grad_norm": 1.020270586013794, + "learning_rate": 4.751798369173971e-06, + "loss": 0.1452, + "step": 3125 + }, + { + "epoch": 0.5064808813998704, + "grad_norm": 1.0533658266067505, + "learning_rate": 4.751608369171458e-06, + "loss": 0.1577, + "step": 3126 + }, + { + "epoch": 0.5066429034348672, + "grad_norm": 0.9330847263336182, + "learning_rate": 4.751418300275178e-06, + "loss": 0.142, + "step": 3127 + }, + { + "epoch": 0.5068049254698639, + "grad_norm": 0.9158099293708801, + "learning_rate": 4.751228162490946e-06, + "loss": 0.1455, + "step": 3128 + }, + { + "epoch": 0.5069669475048607, + "grad_norm": 0.9073064923286438, + "learning_rate": 4.75103795582458e-06, + "loss": 0.136, + "step": 3129 + }, + { + "epoch": 0.5071289695398574, + "grad_norm": 1.0237650871276855, + "learning_rate": 4.750847680281901e-06, + "loss": 0.1599, + "step": 3130 + }, + { + "epoch": 0.5072909915748541, + "grad_norm": 0.9799794554710388, + "learning_rate": 4.750657335868728e-06, + "loss": 0.1526, + "step": 3131 + }, + { + "epoch": 0.507453013609851, + "grad_norm": 0.9370033740997314, + "learning_rate": 4.750466922590888e-06, + "loss": 0.1508, + "step": 3132 + }, + { + "epoch": 0.5076150356448477, + "grad_norm": 0.8960480093955994, + "learning_rate": 4.750276440454207e-06, + "loss": 0.1249, + "step": 3133 + }, + { + "epoch": 0.5077770576798445, + "grad_norm": 0.9389232993125916, + "learning_rate": 4.750085889464512e-06, + "loss": 0.1578, + "step": 3134 + }, + { + "epoch": 0.5079390797148412, + "grad_norm": 0.9818000197410583, + "learning_rate": 4.749895269627633e-06, + "loss": 0.1501, + "step": 3135 + }, + { + "epoch": 0.508101101749838, + "grad_norm": 0.9019086956977844, + "learning_rate": 4.749704580949404e-06, + "loss": 0.1541, + "step": 3136 + }, + { + "epoch": 0.5082631237848347, + "grad_norm": 1.0174736976623535, + "learning_rate": 4.749513823435659e-06, + "loss": 0.1599, + "step": 3137 + }, + { + "epoch": 0.5084251458198314, + "grad_norm": 1.0276374816894531, + "learning_rate": 4.749322997092235e-06, + "loss": 0.1574, + "step": 3138 + }, + { + "epoch": 0.5085871678548283, + "grad_norm": 0.8995428681373596, + "learning_rate": 4.74913210192497e-06, + "loss": 0.1356, + "step": 3139 + }, + { + "epoch": 0.508749189889825, + "grad_norm": 1.0277477502822876, + "learning_rate": 4.748941137939706e-06, + "loss": 0.1604, + "step": 3140 + }, + { + "epoch": 0.5089112119248218, + "grad_norm": 1.0544497966766357, + "learning_rate": 4.748750105142285e-06, + "loss": 0.1749, + "step": 3141 + }, + { + "epoch": 0.5090732339598185, + "grad_norm": 0.8580374717712402, + "learning_rate": 4.748559003538553e-06, + "loss": 0.1397, + "step": 3142 + }, + { + "epoch": 0.5092352559948153, + "grad_norm": 1.0036252737045288, + "learning_rate": 4.748367833134357e-06, + "loss": 0.1463, + "step": 3143 + }, + { + "epoch": 0.509397278029812, + "grad_norm": 0.8418014049530029, + "learning_rate": 4.748176593935546e-06, + "loss": 0.1334, + "step": 3144 + }, + { + "epoch": 0.5095593000648088, + "grad_norm": 1.012681245803833, + "learning_rate": 4.747985285947972e-06, + "loss": 0.1758, + "step": 3145 + }, + { + "epoch": 0.5097213220998056, + "grad_norm": 0.9466359615325928, + "learning_rate": 4.7477939091774885e-06, + "loss": 0.1552, + "step": 3146 + }, + { + "epoch": 0.5098833441348023, + "grad_norm": 1.010433316230774, + "learning_rate": 4.74760246362995e-06, + "loss": 0.158, + "step": 3147 + }, + { + "epoch": 0.5100453661697991, + "grad_norm": 0.9007865786552429, + "learning_rate": 4.7474109493112154e-06, + "loss": 0.1287, + "step": 3148 + }, + { + "epoch": 0.5102073882047958, + "grad_norm": 0.9816288352012634, + "learning_rate": 4.747219366227145e-06, + "loss": 0.147, + "step": 3149 + }, + { + "epoch": 0.5103694102397927, + "grad_norm": 1.0334168672561646, + "learning_rate": 4.7470277143836e-06, + "loss": 0.1545, + "step": 3150 + }, + { + "epoch": 0.5105314322747894, + "grad_norm": 0.978326678276062, + "learning_rate": 4.746835993786445e-06, + "loss": 0.1715, + "step": 3151 + }, + { + "epoch": 0.5106934543097861, + "grad_norm": 0.9503743052482605, + "learning_rate": 4.746644204441545e-06, + "loss": 0.147, + "step": 3152 + }, + { + "epoch": 0.5108554763447829, + "grad_norm": 0.8829200863838196, + "learning_rate": 4.7464523463547695e-06, + "loss": 0.1584, + "step": 3153 + }, + { + "epoch": 0.5110174983797796, + "grad_norm": 1.0434906482696533, + "learning_rate": 4.746260419531989e-06, + "loss": 0.1657, + "step": 3154 + }, + { + "epoch": 0.5111795204147764, + "grad_norm": 0.9752653241157532, + "learning_rate": 4.746068423979074e-06, + "loss": 0.1518, + "step": 3155 + }, + { + "epoch": 0.5113415424497731, + "grad_norm": 1.0544825792312622, + "learning_rate": 4.745876359701902e-06, + "loss": 0.1599, + "step": 3156 + }, + { + "epoch": 0.51150356448477, + "grad_norm": 1.0769002437591553, + "learning_rate": 4.745684226706348e-06, + "loss": 0.154, + "step": 3157 + }, + { + "epoch": 0.5116655865197667, + "grad_norm": 0.8938611745834351, + "learning_rate": 4.745492024998291e-06, + "loss": 0.1473, + "step": 3158 + }, + { + "epoch": 0.5118276085547635, + "grad_norm": 0.9698961973190308, + "learning_rate": 4.745299754583612e-06, + "loss": 0.149, + "step": 3159 + }, + { + "epoch": 0.5119896305897602, + "grad_norm": 0.9677153825759888, + "learning_rate": 4.745107415468194e-06, + "loss": 0.1546, + "step": 3160 + }, + { + "epoch": 0.5121516526247569, + "grad_norm": 1.0147168636322021, + "learning_rate": 4.744915007657922e-06, + "loss": 0.1603, + "step": 3161 + }, + { + "epoch": 0.5123136746597537, + "grad_norm": 1.016696572303772, + "learning_rate": 4.744722531158683e-06, + "loss": 0.1499, + "step": 3162 + }, + { + "epoch": 0.5124756966947505, + "grad_norm": 0.9147832989692688, + "learning_rate": 4.744529985976368e-06, + "loss": 0.1331, + "step": 3163 + }, + { + "epoch": 0.5126377187297473, + "grad_norm": 1.0391688346862793, + "learning_rate": 4.744337372116866e-06, + "loss": 0.1582, + "step": 3164 + }, + { + "epoch": 0.512799740764744, + "grad_norm": 0.8867128491401672, + "learning_rate": 4.744144689586072e-06, + "loss": 0.135, + "step": 3165 + }, + { + "epoch": 0.5129617627997408, + "grad_norm": 0.9849241971969604, + "learning_rate": 4.743951938389881e-06, + "loss": 0.1613, + "step": 3166 + }, + { + "epoch": 0.5131237848347375, + "grad_norm": 1.0392513275146484, + "learning_rate": 4.743759118534191e-06, + "loss": 0.1329, + "step": 3167 + }, + { + "epoch": 0.5132858068697342, + "grad_norm": 0.9350460767745972, + "learning_rate": 4.743566230024902e-06, + "loss": 0.146, + "step": 3168 + }, + { + "epoch": 0.5134478289047311, + "grad_norm": 1.038048505783081, + "learning_rate": 4.743373272867916e-06, + "loss": 0.1474, + "step": 3169 + }, + { + "epoch": 0.5136098509397278, + "grad_norm": 0.8922699689865112, + "learning_rate": 4.7431802470691355e-06, + "loss": 0.1342, + "step": 3170 + }, + { + "epoch": 0.5137718729747246, + "grad_norm": 1.0452051162719727, + "learning_rate": 4.742987152634469e-06, + "loss": 0.1422, + "step": 3171 + }, + { + "epoch": 0.5139338950097213, + "grad_norm": 0.9297380447387695, + "learning_rate": 4.7427939895698235e-06, + "loss": 0.1562, + "step": 3172 + }, + { + "epoch": 0.5140959170447181, + "grad_norm": 1.0185930728912354, + "learning_rate": 4.7426007578811085e-06, + "loss": 0.1658, + "step": 3173 + }, + { + "epoch": 0.5142579390797148, + "grad_norm": 0.8730944991111755, + "learning_rate": 4.742407457574238e-06, + "loss": 0.1304, + "step": 3174 + }, + { + "epoch": 0.5144199611147116, + "grad_norm": 1.0207183361053467, + "learning_rate": 4.742214088655126e-06, + "loss": 0.1488, + "step": 3175 + }, + { + "epoch": 0.5145819831497084, + "grad_norm": 0.978741466999054, + "learning_rate": 4.7420206511296885e-06, + "loss": 0.1657, + "step": 3176 + }, + { + "epoch": 0.5147440051847051, + "grad_norm": 0.8174691200256348, + "learning_rate": 4.7418271450038444e-06, + "loss": 0.1233, + "step": 3177 + }, + { + "epoch": 0.5149060272197019, + "grad_norm": 0.9762808084487915, + "learning_rate": 4.7416335702835155e-06, + "loss": 0.1507, + "step": 3178 + }, + { + "epoch": 0.5150680492546986, + "grad_norm": 0.8684691786766052, + "learning_rate": 4.7414399269746235e-06, + "loss": 0.1492, + "step": 3179 + }, + { + "epoch": 0.5152300712896954, + "grad_norm": 0.9834610223770142, + "learning_rate": 4.741246215083094e-06, + "loss": 0.1429, + "step": 3180 + }, + { + "epoch": 0.5153920933246922, + "grad_norm": 0.9805695414543152, + "learning_rate": 4.741052434614854e-06, + "loss": 0.1474, + "step": 3181 + }, + { + "epoch": 0.5155541153596889, + "grad_norm": 1.0447092056274414, + "learning_rate": 4.740858585575832e-06, + "loss": 0.1786, + "step": 3182 + }, + { + "epoch": 0.5157161373946857, + "grad_norm": 1.1218931674957275, + "learning_rate": 4.740664667971962e-06, + "loss": 0.1708, + "step": 3183 + }, + { + "epoch": 0.5158781594296824, + "grad_norm": 0.8798757195472717, + "learning_rate": 4.7404706818091736e-06, + "loss": 0.1393, + "step": 3184 + }, + { + "epoch": 0.5160401814646792, + "grad_norm": 0.8483942747116089, + "learning_rate": 4.740276627093405e-06, + "loss": 0.1447, + "step": 3185 + }, + { + "epoch": 0.5162022034996759, + "grad_norm": 1.1056780815124512, + "learning_rate": 4.740082503830593e-06, + "loss": 0.1656, + "step": 3186 + }, + { + "epoch": 0.5163642255346728, + "grad_norm": 0.8028669953346252, + "learning_rate": 4.739888312026677e-06, + "loss": 0.1112, + "step": 3187 + }, + { + "epoch": 0.5165262475696695, + "grad_norm": 1.00840163230896, + "learning_rate": 4.7396940516875996e-06, + "loss": 0.1385, + "step": 3188 + }, + { + "epoch": 0.5166882696046662, + "grad_norm": 1.072086215019226, + "learning_rate": 4.739499722819304e-06, + "loss": 0.1697, + "step": 3189 + }, + { + "epoch": 0.516850291639663, + "grad_norm": 0.9224776029586792, + "learning_rate": 4.739305325427736e-06, + "loss": 0.1426, + "step": 3190 + }, + { + "epoch": 0.5170123136746597, + "grad_norm": 0.8584015369415283, + "learning_rate": 4.739110859518844e-06, + "loss": 0.1288, + "step": 3191 + }, + { + "epoch": 0.5171743357096565, + "grad_norm": 1.027614712715149, + "learning_rate": 4.738916325098579e-06, + "loss": 0.1508, + "step": 3192 + }, + { + "epoch": 0.5173363577446533, + "grad_norm": 1.064180612564087, + "learning_rate": 4.738721722172891e-06, + "loss": 0.1641, + "step": 3193 + }, + { + "epoch": 0.5174983797796501, + "grad_norm": 0.8860505223274231, + "learning_rate": 4.738527050747738e-06, + "loss": 0.1344, + "step": 3194 + }, + { + "epoch": 0.5176604018146468, + "grad_norm": 0.8795318603515625, + "learning_rate": 4.738332310829073e-06, + "loss": 0.1374, + "step": 3195 + }, + { + "epoch": 0.5178224238496435, + "grad_norm": 1.0304006338119507, + "learning_rate": 4.738137502422856e-06, + "loss": 0.1534, + "step": 3196 + }, + { + "epoch": 0.5179844458846403, + "grad_norm": 1.116926670074463, + "learning_rate": 4.737942625535048e-06, + "loss": 0.1539, + "step": 3197 + }, + { + "epoch": 0.518146467919637, + "grad_norm": 0.8668916821479797, + "learning_rate": 4.737747680171611e-06, + "loss": 0.1338, + "step": 3198 + }, + { + "epoch": 0.5183084899546339, + "grad_norm": 1.1048635244369507, + "learning_rate": 4.737552666338511e-06, + "loss": 0.1753, + "step": 3199 + }, + { + "epoch": 0.5184705119896306, + "grad_norm": 0.8767505884170532, + "learning_rate": 4.737357584041713e-06, + "loss": 0.1328, + "step": 3200 + }, + { + "epoch": 0.5186325340246274, + "grad_norm": 0.9243037700653076, + "learning_rate": 4.737162433287188e-06, + "loss": 0.1562, + "step": 3201 + }, + { + "epoch": 0.5187945560596241, + "grad_norm": 1.0530647039413452, + "learning_rate": 4.7369672140809065e-06, + "loss": 0.1638, + "step": 3202 + }, + { + "epoch": 0.5189565780946209, + "grad_norm": 0.9306735396385193, + "learning_rate": 4.736771926428841e-06, + "loss": 0.1411, + "step": 3203 + }, + { + "epoch": 0.5191186001296176, + "grad_norm": 1.0215457677841187, + "learning_rate": 4.736576570336968e-06, + "loss": 0.1399, + "step": 3204 + }, + { + "epoch": 0.5192806221646143, + "grad_norm": 0.9567215442657471, + "learning_rate": 4.736381145811264e-06, + "loss": 0.1628, + "step": 3205 + }, + { + "epoch": 0.5194426441996112, + "grad_norm": 0.9945108294487, + "learning_rate": 4.736185652857709e-06, + "loss": 0.1668, + "step": 3206 + }, + { + "epoch": 0.5196046662346079, + "grad_norm": 0.9826516509056091, + "learning_rate": 4.735990091482284e-06, + "loss": 0.1528, + "step": 3207 + }, + { + "epoch": 0.5197666882696047, + "grad_norm": 0.8901511430740356, + "learning_rate": 4.7357944616909745e-06, + "loss": 0.14, + "step": 3208 + }, + { + "epoch": 0.5199287103046014, + "grad_norm": 0.9824097156524658, + "learning_rate": 4.735598763489764e-06, + "loss": 0.1584, + "step": 3209 + }, + { + "epoch": 0.5200907323395982, + "grad_norm": 0.8953663110733032, + "learning_rate": 4.735402996884642e-06, + "loss": 0.1347, + "step": 3210 + }, + { + "epoch": 0.520252754374595, + "grad_norm": 0.8961206674575806, + "learning_rate": 4.735207161881596e-06, + "loss": 0.1368, + "step": 3211 + }, + { + "epoch": 0.5204147764095917, + "grad_norm": 0.9631252884864807, + "learning_rate": 4.7350112584866225e-06, + "loss": 0.1638, + "step": 3212 + }, + { + "epoch": 0.5205767984445885, + "grad_norm": 0.7925556898117065, + "learning_rate": 4.734815286705712e-06, + "loss": 0.13, + "step": 3213 + }, + { + "epoch": 0.5207388204795852, + "grad_norm": 1.1014782190322876, + "learning_rate": 4.734619246544862e-06, + "loss": 0.1628, + "step": 3214 + }, + { + "epoch": 0.520900842514582, + "grad_norm": 0.9577755928039551, + "learning_rate": 4.73442313801007e-06, + "loss": 0.1562, + "step": 3215 + }, + { + "epoch": 0.5210628645495787, + "grad_norm": 0.9577489495277405, + "learning_rate": 4.734226961107338e-06, + "loss": 0.1627, + "step": 3216 + }, + { + "epoch": 0.5212248865845756, + "grad_norm": 0.9195597171783447, + "learning_rate": 4.734030715842667e-06, + "loss": 0.1485, + "step": 3217 + }, + { + "epoch": 0.5213869086195723, + "grad_norm": 1.0537441968917847, + "learning_rate": 4.733834402222064e-06, + "loss": 0.1548, + "step": 3218 + }, + { + "epoch": 0.521548930654569, + "grad_norm": 1.0351758003234863, + "learning_rate": 4.733638020251532e-06, + "loss": 0.1574, + "step": 3219 + }, + { + "epoch": 0.5217109526895658, + "grad_norm": 0.934990406036377, + "learning_rate": 4.7334415699370825e-06, + "loss": 0.1425, + "step": 3220 + }, + { + "epoch": 0.5218729747245625, + "grad_norm": 1.002612590789795, + "learning_rate": 4.733245051284727e-06, + "loss": 0.1538, + "step": 3221 + }, + { + "epoch": 0.5220349967595593, + "grad_norm": 1.078823208808899, + "learning_rate": 4.733048464300476e-06, + "loss": 0.1703, + "step": 3222 + }, + { + "epoch": 0.522197018794556, + "grad_norm": 0.9427096843719482, + "learning_rate": 4.732851808990346e-06, + "loss": 0.1432, + "step": 3223 + }, + { + "epoch": 0.5223590408295529, + "grad_norm": 0.9111664891242981, + "learning_rate": 4.732655085360355e-06, + "loss": 0.1408, + "step": 3224 + }, + { + "epoch": 0.5225210628645496, + "grad_norm": 1.06932532787323, + "learning_rate": 4.732458293416519e-06, + "loss": 0.1691, + "step": 3225 + }, + { + "epoch": 0.5226830848995463, + "grad_norm": 1.0022755861282349, + "learning_rate": 4.7322614331648645e-06, + "loss": 0.161, + "step": 3226 + }, + { + "epoch": 0.5228451069345431, + "grad_norm": 0.9708278775215149, + "learning_rate": 4.73206450461141e-06, + "loss": 0.1496, + "step": 3227 + }, + { + "epoch": 0.5230071289695398, + "grad_norm": 1.0193688869476318, + "learning_rate": 4.731867507762184e-06, + "loss": 0.1484, + "step": 3228 + }, + { + "epoch": 0.5231691510045366, + "grad_norm": 0.90104079246521, + "learning_rate": 4.731670442623214e-06, + "loss": 0.1321, + "step": 3229 + }, + { + "epoch": 0.5233311730395334, + "grad_norm": 1.0102612972259521, + "learning_rate": 4.731473309200528e-06, + "loss": 0.1552, + "step": 3230 + }, + { + "epoch": 0.5234931950745302, + "grad_norm": 0.8935184478759766, + "learning_rate": 4.731276107500159e-06, + "loss": 0.1449, + "step": 3231 + }, + { + "epoch": 0.5236552171095269, + "grad_norm": 0.8871548771858215, + "learning_rate": 4.731078837528141e-06, + "loss": 0.1353, + "step": 3232 + }, + { + "epoch": 0.5238172391445236, + "grad_norm": 1.071672797203064, + "learning_rate": 4.730881499290509e-06, + "loss": 0.1683, + "step": 3233 + }, + { + "epoch": 0.5239792611795204, + "grad_norm": 0.872079074382782, + "learning_rate": 4.730684092793302e-06, + "loss": 0.1411, + "step": 3234 + }, + { + "epoch": 0.5241412832145171, + "grad_norm": 0.931596040725708, + "learning_rate": 4.73048661804256e-06, + "loss": 0.1633, + "step": 3235 + }, + { + "epoch": 0.524303305249514, + "grad_norm": 0.8818705081939697, + "learning_rate": 4.730289075044326e-06, + "loss": 0.1281, + "step": 3236 + }, + { + "epoch": 0.5244653272845107, + "grad_norm": 0.9318626523017883, + "learning_rate": 4.730091463804642e-06, + "loss": 0.1438, + "step": 3237 + }, + { + "epoch": 0.5246273493195075, + "grad_norm": 1.0269670486450195, + "learning_rate": 4.729893784329557e-06, + "loss": 0.1696, + "step": 3238 + }, + { + "epoch": 0.5247893713545042, + "grad_norm": 0.8802597522735596, + "learning_rate": 4.729696036625119e-06, + "loss": 0.1319, + "step": 3239 + }, + { + "epoch": 0.5249513933895009, + "grad_norm": 0.9144219756126404, + "learning_rate": 4.729498220697377e-06, + "loss": 0.1412, + "step": 3240 + }, + { + "epoch": 0.5251134154244977, + "grad_norm": 0.9358303546905518, + "learning_rate": 4.729300336552385e-06, + "loss": 0.1604, + "step": 3241 + }, + { + "epoch": 0.5252754374594945, + "grad_norm": 0.8559423089027405, + "learning_rate": 4.729102384196197e-06, + "loss": 0.1352, + "step": 3242 + }, + { + "epoch": 0.5254374594944913, + "grad_norm": 0.9450469017028809, + "learning_rate": 4.728904363634871e-06, + "loss": 0.1367, + "step": 3243 + }, + { + "epoch": 0.525599481529488, + "grad_norm": 0.9907295107841492, + "learning_rate": 4.728706274874465e-06, + "loss": 0.1418, + "step": 3244 + }, + { + "epoch": 0.5257615035644848, + "grad_norm": 0.9787341356277466, + "learning_rate": 4.72850811792104e-06, + "loss": 0.1519, + "step": 3245 + }, + { + "epoch": 0.5259235255994815, + "grad_norm": 0.9200270771980286, + "learning_rate": 4.72830989278066e-06, + "loss": 0.1517, + "step": 3246 + }, + { + "epoch": 0.5260855476344782, + "grad_norm": 0.9773831367492676, + "learning_rate": 4.72811159945939e-06, + "loss": 0.1637, + "step": 3247 + }, + { + "epoch": 0.5262475696694751, + "grad_norm": 1.007265329360962, + "learning_rate": 4.727913237963296e-06, + "loss": 0.1624, + "step": 3248 + }, + { + "epoch": 0.5264095917044718, + "grad_norm": 0.9802579283714294, + "learning_rate": 4.7277148082984495e-06, + "loss": 0.1675, + "step": 3249 + }, + { + "epoch": 0.5265716137394686, + "grad_norm": 0.9074774980545044, + "learning_rate": 4.72751631047092e-06, + "loss": 0.1461, + "step": 3250 + }, + { + "epoch": 0.5267336357744653, + "grad_norm": 1.1058282852172852, + "learning_rate": 4.727317744486783e-06, + "loss": 0.1912, + "step": 3251 + }, + { + "epoch": 0.5268956578094621, + "grad_norm": 0.9917949438095093, + "learning_rate": 4.727119110352112e-06, + "loss": 0.1563, + "step": 3252 + }, + { + "epoch": 0.5270576798444588, + "grad_norm": 0.9093613624572754, + "learning_rate": 4.726920408072985e-06, + "loss": 0.1402, + "step": 3253 + }, + { + "epoch": 0.5272197018794557, + "grad_norm": 0.9544733166694641, + "learning_rate": 4.726721637655484e-06, + "loss": 0.1594, + "step": 3254 + }, + { + "epoch": 0.5273817239144524, + "grad_norm": 0.920921802520752, + "learning_rate": 4.726522799105689e-06, + "loss": 0.152, + "step": 3255 + }, + { + "epoch": 0.5275437459494491, + "grad_norm": 1.037856101989746, + "learning_rate": 4.7263238924296835e-06, + "loss": 0.1575, + "step": 3256 + }, + { + "epoch": 0.5277057679844459, + "grad_norm": 0.9055295586585999, + "learning_rate": 4.726124917633556e-06, + "loss": 0.1405, + "step": 3257 + }, + { + "epoch": 0.5278677900194426, + "grad_norm": 0.952745258808136, + "learning_rate": 4.725925874723393e-06, + "loss": 0.1487, + "step": 3258 + }, + { + "epoch": 0.5280298120544394, + "grad_norm": 1.02610445022583, + "learning_rate": 4.725726763705284e-06, + "loss": 0.1586, + "step": 3259 + }, + { + "epoch": 0.5281918340894362, + "grad_norm": 0.9188981652259827, + "learning_rate": 4.725527584585322e-06, + "loss": 0.1264, + "step": 3260 + }, + { + "epoch": 0.528353856124433, + "grad_norm": 1.0480659008026123, + "learning_rate": 4.725328337369602e-06, + "loss": 0.149, + "step": 3261 + }, + { + "epoch": 0.5285158781594297, + "grad_norm": 0.9283803105354309, + "learning_rate": 4.725129022064221e-06, + "loss": 0.1442, + "step": 3262 + }, + { + "epoch": 0.5286779001944264, + "grad_norm": 0.8994842171669006, + "learning_rate": 4.7249296386752754e-06, + "loss": 0.1296, + "step": 3263 + }, + { + "epoch": 0.5288399222294232, + "grad_norm": 0.9448267817497253, + "learning_rate": 4.724730187208868e-06, + "loss": 0.1365, + "step": 3264 + }, + { + "epoch": 0.5290019442644199, + "grad_norm": 1.0473707914352417, + "learning_rate": 4.7245306676711e-06, + "loss": 0.1619, + "step": 3265 + }, + { + "epoch": 0.5291639662994168, + "grad_norm": 0.9334203004837036, + "learning_rate": 4.724331080068077e-06, + "loss": 0.1456, + "step": 3266 + }, + { + "epoch": 0.5293259883344135, + "grad_norm": 1.053633451461792, + "learning_rate": 4.724131424405906e-06, + "loss": 0.1759, + "step": 3267 + }, + { + "epoch": 0.5294880103694103, + "grad_norm": 0.9441131949424744, + "learning_rate": 4.723931700690695e-06, + "loss": 0.1525, + "step": 3268 + }, + { + "epoch": 0.529650032404407, + "grad_norm": 0.875206470489502, + "learning_rate": 4.723731908928556e-06, + "loss": 0.1461, + "step": 3269 + }, + { + "epoch": 0.5298120544394037, + "grad_norm": 0.9882826805114746, + "learning_rate": 4.7235320491256026e-06, + "loss": 0.1414, + "step": 3270 + }, + { + "epoch": 0.5299740764744005, + "grad_norm": 0.8946253061294556, + "learning_rate": 4.723332121287949e-06, + "loss": 0.1441, + "step": 3271 + }, + { + "epoch": 0.5301360985093972, + "grad_norm": 0.8672228455543518, + "learning_rate": 4.723132125421712e-06, + "loss": 0.1381, + "step": 3272 + }, + { + "epoch": 0.5302981205443941, + "grad_norm": 0.9539285898208618, + "learning_rate": 4.7229320615330136e-06, + "loss": 0.1544, + "step": 3273 + }, + { + "epoch": 0.5304601425793908, + "grad_norm": 1.0140817165374756, + "learning_rate": 4.722731929627971e-06, + "loss": 0.1621, + "step": 3274 + }, + { + "epoch": 0.5306221646143876, + "grad_norm": 1.0421953201293945, + "learning_rate": 4.7225317297127125e-06, + "loss": 0.1633, + "step": 3275 + }, + { + "epoch": 0.5307841866493843, + "grad_norm": 1.0012775659561157, + "learning_rate": 4.722331461793361e-06, + "loss": 0.1464, + "step": 3276 + }, + { + "epoch": 0.530946208684381, + "grad_norm": 0.9863828420639038, + "learning_rate": 4.722131125876044e-06, + "loss": 0.1476, + "step": 3277 + }, + { + "epoch": 0.5311082307193778, + "grad_norm": 0.9448386430740356, + "learning_rate": 4.721930721966893e-06, + "loss": 0.1432, + "step": 3278 + }, + { + "epoch": 0.5312702527543746, + "grad_norm": 0.898563802242279, + "learning_rate": 4.721730250072038e-06, + "loss": 0.1365, + "step": 3279 + }, + { + "epoch": 0.5314322747893714, + "grad_norm": 0.8489564061164856, + "learning_rate": 4.7215297101976145e-06, + "loss": 0.1388, + "step": 3280 + }, + { + "epoch": 0.5315942968243681, + "grad_norm": 0.9576735496520996, + "learning_rate": 4.721329102349757e-06, + "loss": 0.1527, + "step": 3281 + }, + { + "epoch": 0.5317563188593649, + "grad_norm": 0.9543820023536682, + "learning_rate": 4.721128426534605e-06, + "loss": 0.1396, + "step": 3282 + }, + { + "epoch": 0.5319183408943616, + "grad_norm": 1.031876564025879, + "learning_rate": 4.720927682758298e-06, + "loss": 0.1608, + "step": 3283 + }, + { + "epoch": 0.5320803629293583, + "grad_norm": 0.9371171593666077, + "learning_rate": 4.720726871026978e-06, + "loss": 0.1366, + "step": 3284 + }, + { + "epoch": 0.5322423849643552, + "grad_norm": 0.9507832527160645, + "learning_rate": 4.720525991346791e-06, + "loss": 0.1423, + "step": 3285 + }, + { + "epoch": 0.5324044069993519, + "grad_norm": 1.2056124210357666, + "learning_rate": 4.720325043723881e-06, + "loss": 0.1797, + "step": 3286 + }, + { + "epoch": 0.5325664290343487, + "grad_norm": 1.0787984132766724, + "learning_rate": 4.720124028164399e-06, + "loss": 0.1548, + "step": 3287 + }, + { + "epoch": 0.5327284510693454, + "grad_norm": 1.0400218963623047, + "learning_rate": 4.719922944674494e-06, + "loss": 0.1581, + "step": 3288 + }, + { + "epoch": 0.5328904731043422, + "grad_norm": 0.9209017753601074, + "learning_rate": 4.719721793260318e-06, + "loss": 0.1439, + "step": 3289 + }, + { + "epoch": 0.5330524951393389, + "grad_norm": 0.8602655529975891, + "learning_rate": 4.719520573928028e-06, + "loss": 0.1304, + "step": 3290 + }, + { + "epoch": 0.5332145171743357, + "grad_norm": 0.9528794884681702, + "learning_rate": 4.719319286683779e-06, + "loss": 0.1422, + "step": 3291 + }, + { + "epoch": 0.5333765392093325, + "grad_norm": 1.055066466331482, + "learning_rate": 4.71911793153373e-06, + "loss": 0.1698, + "step": 3292 + }, + { + "epoch": 0.5335385612443292, + "grad_norm": 0.9883852601051331, + "learning_rate": 4.718916508484043e-06, + "loss": 0.1581, + "step": 3293 + }, + { + "epoch": 0.533700583279326, + "grad_norm": 0.9446104764938354, + "learning_rate": 4.7187150175408805e-06, + "loss": 0.1551, + "step": 3294 + }, + { + "epoch": 0.5338626053143227, + "grad_norm": 1.1086978912353516, + "learning_rate": 4.7185134587104075e-06, + "loss": 0.1783, + "step": 3295 + }, + { + "epoch": 0.5340246273493195, + "grad_norm": 0.9049498438835144, + "learning_rate": 4.718311831998792e-06, + "loss": 0.146, + "step": 3296 + }, + { + "epoch": 0.5341866493843163, + "grad_norm": 1.0368142127990723, + "learning_rate": 4.718110137412201e-06, + "loss": 0.1789, + "step": 3297 + }, + { + "epoch": 0.5343486714193131, + "grad_norm": 0.9038468599319458, + "learning_rate": 4.71790837495681e-06, + "loss": 0.1441, + "step": 3298 + }, + { + "epoch": 0.5345106934543098, + "grad_norm": 0.9993532299995422, + "learning_rate": 4.717706544638788e-06, + "loss": 0.1552, + "step": 3299 + }, + { + "epoch": 0.5346727154893065, + "grad_norm": 0.8008403778076172, + "learning_rate": 4.717504646464314e-06, + "loss": 0.1378, + "step": 3300 + }, + { + "epoch": 0.5348347375243033, + "grad_norm": 0.9350391030311584, + "learning_rate": 4.717302680439563e-06, + "loss": 0.1507, + "step": 3301 + }, + { + "epoch": 0.5349967595593, + "grad_norm": 0.8926771879196167, + "learning_rate": 4.717100646570716e-06, + "loss": 0.1515, + "step": 3302 + }, + { + "epoch": 0.5351587815942969, + "grad_norm": 0.8540582060813904, + "learning_rate": 4.716898544863954e-06, + "loss": 0.1348, + "step": 3303 + }, + { + "epoch": 0.5353208036292936, + "grad_norm": 0.9826602935791016, + "learning_rate": 4.7166963753254616e-06, + "loss": 0.1562, + "step": 3304 + }, + { + "epoch": 0.5354828256642904, + "grad_norm": 0.9145976901054382, + "learning_rate": 4.716494137961425e-06, + "loss": 0.1367, + "step": 3305 + }, + { + "epoch": 0.5356448476992871, + "grad_norm": 0.8795981407165527, + "learning_rate": 4.716291832778031e-06, + "loss": 0.1399, + "step": 3306 + }, + { + "epoch": 0.5358068697342838, + "grad_norm": 0.932141125202179, + "learning_rate": 4.71608945978147e-06, + "loss": 0.1474, + "step": 3307 + }, + { + "epoch": 0.5359688917692806, + "grad_norm": 0.9054761528968811, + "learning_rate": 4.715887018977935e-06, + "loss": 0.1486, + "step": 3308 + }, + { + "epoch": 0.5361309138042774, + "grad_norm": 0.9789231419563293, + "learning_rate": 4.715684510373619e-06, + "loss": 0.1493, + "step": 3309 + }, + { + "epoch": 0.5362929358392742, + "grad_norm": 0.9172061681747437, + "learning_rate": 4.715481933974719e-06, + "loss": 0.1352, + "step": 3310 + }, + { + "epoch": 0.5364549578742709, + "grad_norm": 0.9078491926193237, + "learning_rate": 4.715279289787434e-06, + "loss": 0.141, + "step": 3311 + }, + { + "epoch": 0.5366169799092677, + "grad_norm": 1.0831925868988037, + "learning_rate": 4.715076577817963e-06, + "loss": 0.153, + "step": 3312 + }, + { + "epoch": 0.5367790019442644, + "grad_norm": 1.0742647647857666, + "learning_rate": 4.714873798072509e-06, + "loss": 0.165, + "step": 3313 + }, + { + "epoch": 0.5369410239792611, + "grad_norm": 1.015164852142334, + "learning_rate": 4.714670950557276e-06, + "loss": 0.1408, + "step": 3314 + }, + { + "epoch": 0.537103046014258, + "grad_norm": 1.0785647630691528, + "learning_rate": 4.714468035278473e-06, + "loss": 0.1709, + "step": 3315 + }, + { + "epoch": 0.5372650680492547, + "grad_norm": 1.0501564741134644, + "learning_rate": 4.714265052242306e-06, + "loss": 0.1692, + "step": 3316 + }, + { + "epoch": 0.5374270900842515, + "grad_norm": 0.8828537464141846, + "learning_rate": 4.714062001454986e-06, + "loss": 0.1389, + "step": 3317 + }, + { + "epoch": 0.5375891121192482, + "grad_norm": 1.0610477924346924, + "learning_rate": 4.7138588829227285e-06, + "loss": 0.1675, + "step": 3318 + }, + { + "epoch": 0.537751134154245, + "grad_norm": 0.8431143760681152, + "learning_rate": 4.713655696651746e-06, + "loss": 0.1328, + "step": 3319 + }, + { + "epoch": 0.5379131561892417, + "grad_norm": 1.091422438621521, + "learning_rate": 4.713452442648255e-06, + "loss": 0.1691, + "step": 3320 + }, + { + "epoch": 0.5380751782242384, + "grad_norm": 0.9422255158424377, + "learning_rate": 4.713249120918476e-06, + "loss": 0.1448, + "step": 3321 + }, + { + "epoch": 0.5382372002592353, + "grad_norm": 0.9391464591026306, + "learning_rate": 4.7130457314686316e-06, + "loss": 0.1406, + "step": 3322 + }, + { + "epoch": 0.538399222294232, + "grad_norm": 0.9336527585983276, + "learning_rate": 4.712842274304942e-06, + "loss": 0.1559, + "step": 3323 + }, + { + "epoch": 0.5385612443292288, + "grad_norm": 1.0600395202636719, + "learning_rate": 4.712638749433634e-06, + "loss": 0.1621, + "step": 3324 + }, + { + "epoch": 0.5387232663642255, + "grad_norm": 0.8601197004318237, + "learning_rate": 4.712435156860934e-06, + "loss": 0.1338, + "step": 3325 + }, + { + "epoch": 0.5388852883992223, + "grad_norm": 0.9096711277961731, + "learning_rate": 4.7122314965930724e-06, + "loss": 0.1335, + "step": 3326 + }, + { + "epoch": 0.539047310434219, + "grad_norm": 0.9492406845092773, + "learning_rate": 4.712027768636282e-06, + "loss": 0.1475, + "step": 3327 + }, + { + "epoch": 0.5392093324692158, + "grad_norm": 1.070809245109558, + "learning_rate": 4.711823972996793e-06, + "loss": 0.1636, + "step": 3328 + }, + { + "epoch": 0.5393713545042126, + "grad_norm": 1.0436028242111206, + "learning_rate": 4.711620109680843e-06, + "loss": 0.1788, + "step": 3329 + }, + { + "epoch": 0.5395333765392093, + "grad_norm": 0.9128246307373047, + "learning_rate": 4.711416178694671e-06, + "loss": 0.1435, + "step": 3330 + }, + { + "epoch": 0.5396953985742061, + "grad_norm": 0.9323081970214844, + "learning_rate": 4.7112121800445146e-06, + "loss": 0.1534, + "step": 3331 + }, + { + "epoch": 0.5398574206092028, + "grad_norm": 1.0300610065460205, + "learning_rate": 4.711008113736617e-06, + "loss": 0.1807, + "step": 3332 + }, + { + "epoch": 0.5400194426441997, + "grad_norm": 0.9709299206733704, + "learning_rate": 4.710803979777221e-06, + "loss": 0.1371, + "step": 3333 + }, + { + "epoch": 0.5401814646791964, + "grad_norm": 0.9480953812599182, + "learning_rate": 4.710599778172575e-06, + "loss": 0.1549, + "step": 3334 + }, + { + "epoch": 0.5403434867141931, + "grad_norm": 0.871699869632721, + "learning_rate": 4.710395508928923e-06, + "loss": 0.1341, + "step": 3335 + }, + { + "epoch": 0.5405055087491899, + "grad_norm": 1.0588428974151611, + "learning_rate": 4.7101911720525186e-06, + "loss": 0.1603, + "step": 3336 + }, + { + "epoch": 0.5406675307841866, + "grad_norm": 0.9104177951812744, + "learning_rate": 4.709986767549612e-06, + "loss": 0.1527, + "step": 3337 + }, + { + "epoch": 0.5408295528191834, + "grad_norm": 0.8339903950691223, + "learning_rate": 4.70978229542646e-06, + "loss": 0.1166, + "step": 3338 + }, + { + "epoch": 0.5409915748541801, + "grad_norm": 1.0834025144577026, + "learning_rate": 4.709577755689316e-06, + "loss": 0.1623, + "step": 3339 + }, + { + "epoch": 0.541153596889177, + "grad_norm": 0.9392449259757996, + "learning_rate": 4.709373148344441e-06, + "loss": 0.1518, + "step": 3340 + }, + { + "epoch": 0.5413156189241737, + "grad_norm": 1.014581561088562, + "learning_rate": 4.709168473398094e-06, + "loss": 0.1585, + "step": 3341 + }, + { + "epoch": 0.5414776409591704, + "grad_norm": 0.9345294833183289, + "learning_rate": 4.708963730856536e-06, + "loss": 0.1385, + "step": 3342 + }, + { + "epoch": 0.5416396629941672, + "grad_norm": 0.975787341594696, + "learning_rate": 4.708758920726036e-06, + "loss": 0.1515, + "step": 3343 + }, + { + "epoch": 0.5418016850291639, + "grad_norm": 0.9591774940490723, + "learning_rate": 4.708554043012857e-06, + "loss": 0.1466, + "step": 3344 + }, + { + "epoch": 0.5419637070641607, + "grad_norm": 0.9647694230079651, + "learning_rate": 4.708349097723268e-06, + "loss": 0.15, + "step": 3345 + }, + { + "epoch": 0.5421257290991575, + "grad_norm": 1.0015144348144531, + "learning_rate": 4.708144084863541e-06, + "loss": 0.1497, + "step": 3346 + }, + { + "epoch": 0.5422877511341543, + "grad_norm": 0.9959015846252441, + "learning_rate": 4.70793900443995e-06, + "loss": 0.1338, + "step": 3347 + }, + { + "epoch": 0.542449773169151, + "grad_norm": 1.017600655555725, + "learning_rate": 4.707733856458767e-06, + "loss": 0.1643, + "step": 3348 + }, + { + "epoch": 0.5426117952041478, + "grad_norm": 1.165099859237671, + "learning_rate": 4.707528640926271e-06, + "loss": 0.1774, + "step": 3349 + }, + { + "epoch": 0.5427738172391445, + "grad_norm": 1.0708073377609253, + "learning_rate": 4.707323357848741e-06, + "loss": 0.174, + "step": 3350 + }, + { + "epoch": 0.5429358392741412, + "grad_norm": 0.931759238243103, + "learning_rate": 4.707118007232457e-06, + "loss": 0.142, + "step": 3351 + }, + { + "epoch": 0.5430978613091381, + "grad_norm": 1.1331273317337036, + "learning_rate": 4.706912589083704e-06, + "loss": 0.171, + "step": 3352 + }, + { + "epoch": 0.5432598833441348, + "grad_norm": 0.9179681539535522, + "learning_rate": 4.706707103408767e-06, + "loss": 0.1463, + "step": 3353 + }, + { + "epoch": 0.5434219053791316, + "grad_norm": 0.9513891935348511, + "learning_rate": 4.706501550213932e-06, + "loss": 0.1706, + "step": 3354 + }, + { + "epoch": 0.5435839274141283, + "grad_norm": 0.9615473747253418, + "learning_rate": 4.706295929505489e-06, + "loss": 0.1522, + "step": 3355 + }, + { + "epoch": 0.5437459494491251, + "grad_norm": 0.9769582152366638, + "learning_rate": 4.70609024128973e-06, + "loss": 0.1464, + "step": 3356 + }, + { + "epoch": 0.5439079714841218, + "grad_norm": 0.9356233477592468, + "learning_rate": 4.705884485572948e-06, + "loss": 0.1442, + "step": 3357 + }, + { + "epoch": 0.5440699935191186, + "grad_norm": 1.0967140197753906, + "learning_rate": 4.7056786623614395e-06, + "loss": 0.1843, + "step": 3358 + }, + { + "epoch": 0.5442320155541154, + "grad_norm": 1.0115594863891602, + "learning_rate": 4.705472771661501e-06, + "loss": 0.1405, + "step": 3359 + }, + { + "epoch": 0.5443940375891121, + "grad_norm": 1.0418258905410767, + "learning_rate": 4.705266813479434e-06, + "loss": 0.1626, + "step": 3360 + }, + { + "epoch": 0.5445560596241089, + "grad_norm": 0.8964266180992126, + "learning_rate": 4.7050607878215375e-06, + "loss": 0.1398, + "step": 3361 + }, + { + "epoch": 0.5447180816591056, + "grad_norm": 0.9070065021514893, + "learning_rate": 4.704854694694117e-06, + "loss": 0.137, + "step": 3362 + }, + { + "epoch": 0.5448801036941024, + "grad_norm": 0.9841972589492798, + "learning_rate": 4.704648534103479e-06, + "loss": 0.1477, + "step": 3363 + }, + { + "epoch": 0.5450421257290992, + "grad_norm": 0.9104343056678772, + "learning_rate": 4.704442306055932e-06, + "loss": 0.1468, + "step": 3364 + }, + { + "epoch": 0.5452041477640959, + "grad_norm": 0.8985049724578857, + "learning_rate": 4.704236010557784e-06, + "loss": 0.1294, + "step": 3365 + }, + { + "epoch": 0.5453661697990927, + "grad_norm": 0.9941574335098267, + "learning_rate": 4.704029647615348e-06, + "loss": 0.1539, + "step": 3366 + }, + { + "epoch": 0.5455281918340894, + "grad_norm": 1.0449243783950806, + "learning_rate": 4.7038232172349394e-06, + "loss": 0.1488, + "step": 3367 + }, + { + "epoch": 0.5456902138690862, + "grad_norm": 1.0465726852416992, + "learning_rate": 4.703616719422873e-06, + "loss": 0.1555, + "step": 3368 + }, + { + "epoch": 0.5458522359040829, + "grad_norm": 1.0622889995574951, + "learning_rate": 4.703410154185467e-06, + "loss": 0.1572, + "step": 3369 + }, + { + "epoch": 0.5460142579390798, + "grad_norm": 0.9786834716796875, + "learning_rate": 4.703203521529044e-06, + "loss": 0.1558, + "step": 3370 + }, + { + "epoch": 0.5461762799740765, + "grad_norm": 1.072187066078186, + "learning_rate": 4.702996821459923e-06, + "loss": 0.1553, + "step": 3371 + }, + { + "epoch": 0.5463383020090732, + "grad_norm": 0.9539135098457336, + "learning_rate": 4.702790053984432e-06, + "loss": 0.1469, + "step": 3372 + }, + { + "epoch": 0.54650032404407, + "grad_norm": 0.9775049686431885, + "learning_rate": 4.702583219108895e-06, + "loss": 0.154, + "step": 3373 + }, + { + "epoch": 0.5466623460790667, + "grad_norm": 0.8917540907859802, + "learning_rate": 4.702376316839642e-06, + "loss": 0.1335, + "step": 3374 + }, + { + "epoch": 0.5468243681140635, + "grad_norm": 0.9353879690170288, + "learning_rate": 4.7021693471830035e-06, + "loss": 0.1562, + "step": 3375 + }, + { + "epoch": 0.5469863901490603, + "grad_norm": 0.9763373732566833, + "learning_rate": 4.701962310145312e-06, + "loss": 0.1546, + "step": 3376 + }, + { + "epoch": 0.5471484121840571, + "grad_norm": 0.9837629795074463, + "learning_rate": 4.701755205732902e-06, + "loss": 0.1645, + "step": 3377 + }, + { + "epoch": 0.5473104342190538, + "grad_norm": 0.8612603545188904, + "learning_rate": 4.7015480339521115e-06, + "loss": 0.1393, + "step": 3378 + }, + { + "epoch": 0.5474724562540505, + "grad_norm": 0.8714431524276733, + "learning_rate": 4.701340794809278e-06, + "loss": 0.1376, + "step": 3379 + }, + { + "epoch": 0.5476344782890473, + "grad_norm": 0.9637095332145691, + "learning_rate": 4.701133488310744e-06, + "loss": 0.1527, + "step": 3380 + }, + { + "epoch": 0.547796500324044, + "grad_norm": 1.0376760959625244, + "learning_rate": 4.700926114462852e-06, + "loss": 0.1709, + "step": 3381 + }, + { + "epoch": 0.5479585223590409, + "grad_norm": 0.9160656332969666, + "learning_rate": 4.700718673271947e-06, + "loss": 0.1439, + "step": 3382 + }, + { + "epoch": 0.5481205443940376, + "grad_norm": 0.8436545729637146, + "learning_rate": 4.700511164744376e-06, + "loss": 0.1203, + "step": 3383 + }, + { + "epoch": 0.5482825664290344, + "grad_norm": 0.9517103433609009, + "learning_rate": 4.700303588886489e-06, + "loss": 0.1516, + "step": 3384 + }, + { + "epoch": 0.5484445884640311, + "grad_norm": 0.9557342529296875, + "learning_rate": 4.700095945704636e-06, + "loss": 0.1502, + "step": 3385 + }, + { + "epoch": 0.5486066104990278, + "grad_norm": 0.9375651478767395, + "learning_rate": 4.699888235205172e-06, + "loss": 0.1455, + "step": 3386 + }, + { + "epoch": 0.5487686325340246, + "grad_norm": 0.9998489022254944, + "learning_rate": 4.699680457394451e-06, + "loss": 0.1429, + "step": 3387 + }, + { + "epoch": 0.5489306545690213, + "grad_norm": 0.9671725630760193, + "learning_rate": 4.699472612278831e-06, + "loss": 0.1548, + "step": 3388 + }, + { + "epoch": 0.5490926766040182, + "grad_norm": 1.063137412071228, + "learning_rate": 4.699264699864672e-06, + "loss": 0.1602, + "step": 3389 + }, + { + "epoch": 0.5492546986390149, + "grad_norm": 1.0313401222229004, + "learning_rate": 4.699056720158336e-06, + "loss": 0.1663, + "step": 3390 + }, + { + "epoch": 0.5494167206740117, + "grad_norm": 1.0677765607833862, + "learning_rate": 4.698848673166185e-06, + "loss": 0.1436, + "step": 3391 + }, + { + "epoch": 0.5495787427090084, + "grad_norm": 1.0899100303649902, + "learning_rate": 4.698640558894586e-06, + "loss": 0.1608, + "step": 3392 + }, + { + "epoch": 0.5497407647440052, + "grad_norm": 0.9777064323425293, + "learning_rate": 4.6984323773499066e-06, + "loss": 0.1537, + "step": 3393 + }, + { + "epoch": 0.549902786779002, + "grad_norm": 1.0204713344573975, + "learning_rate": 4.698224128538517e-06, + "loss": 0.1535, + "step": 3394 + }, + { + "epoch": 0.5500648088139987, + "grad_norm": 0.9061859846115112, + "learning_rate": 4.698015812466787e-06, + "loss": 0.1429, + "step": 3395 + }, + { + "epoch": 0.5502268308489955, + "grad_norm": 0.9810669422149658, + "learning_rate": 4.6978074291410936e-06, + "loss": 0.1544, + "step": 3396 + }, + { + "epoch": 0.5503888528839922, + "grad_norm": 0.9356314539909363, + "learning_rate": 4.697598978567811e-06, + "loss": 0.1427, + "step": 3397 + }, + { + "epoch": 0.550550874918989, + "grad_norm": 1.0116984844207764, + "learning_rate": 4.697390460753318e-06, + "loss": 0.148, + "step": 3398 + }, + { + "epoch": 0.5507128969539857, + "grad_norm": 0.9845331907272339, + "learning_rate": 4.697181875703995e-06, + "loss": 0.1722, + "step": 3399 + }, + { + "epoch": 0.5508749189889826, + "grad_norm": 1.221174716949463, + "learning_rate": 4.696973223426224e-06, + "loss": 0.1573, + "step": 3400 + }, + { + "epoch": 0.5510369410239793, + "grad_norm": 0.8805899620056152, + "learning_rate": 4.696764503926387e-06, + "loss": 0.137, + "step": 3401 + }, + { + "epoch": 0.551198963058976, + "grad_norm": 0.8739351630210876, + "learning_rate": 4.696555717210873e-06, + "loss": 0.125, + "step": 3402 + }, + { + "epoch": 0.5513609850939728, + "grad_norm": 0.8733137249946594, + "learning_rate": 4.696346863286071e-06, + "loss": 0.1306, + "step": 3403 + }, + { + "epoch": 0.5515230071289695, + "grad_norm": 0.967108428478241, + "learning_rate": 4.6961379421583685e-06, + "loss": 0.1513, + "step": 3404 + }, + { + "epoch": 0.5516850291639663, + "grad_norm": 0.9212470054626465, + "learning_rate": 4.69592895383416e-06, + "loss": 0.1431, + "step": 3405 + }, + { + "epoch": 0.551847051198963, + "grad_norm": 0.9542239904403687, + "learning_rate": 4.695719898319839e-06, + "loss": 0.149, + "step": 3406 + }, + { + "epoch": 0.5520090732339599, + "grad_norm": 0.8492709994316101, + "learning_rate": 4.6955107756218035e-06, + "loss": 0.1331, + "step": 3407 + }, + { + "epoch": 0.5521710952689566, + "grad_norm": 0.9666154384613037, + "learning_rate": 4.695301585746451e-06, + "loss": 0.1457, + "step": 3408 + }, + { + "epoch": 0.5523331173039533, + "grad_norm": 0.9427182674407959, + "learning_rate": 4.695092328700182e-06, + "loss": 0.1505, + "step": 3409 + }, + { + "epoch": 0.5524951393389501, + "grad_norm": 0.9889812469482422, + "learning_rate": 4.6948830044894016e-06, + "loss": 0.1627, + "step": 3410 + }, + { + "epoch": 0.5526571613739468, + "grad_norm": 1.1829040050506592, + "learning_rate": 4.694673613120511e-06, + "loss": 0.1849, + "step": 3411 + }, + { + "epoch": 0.5528191834089436, + "grad_norm": 0.8475874662399292, + "learning_rate": 4.6944641545999194e-06, + "loss": 0.133, + "step": 3412 + }, + { + "epoch": 0.5529812054439404, + "grad_norm": 0.9569193720817566, + "learning_rate": 4.694254628934035e-06, + "loss": 0.1498, + "step": 3413 + }, + { + "epoch": 0.5531432274789372, + "grad_norm": 1.0036123991012573, + "learning_rate": 4.694045036129269e-06, + "loss": 0.1409, + "step": 3414 + }, + { + "epoch": 0.5533052495139339, + "grad_norm": 0.9770514369010925, + "learning_rate": 4.6938353761920345e-06, + "loss": 0.1484, + "step": 3415 + }, + { + "epoch": 0.5534672715489306, + "grad_norm": 0.9479352235794067, + "learning_rate": 4.693625649128746e-06, + "loss": 0.1481, + "step": 3416 + }, + { + "epoch": 0.5536292935839274, + "grad_norm": 1.000531792640686, + "learning_rate": 4.693415854945822e-06, + "loss": 0.145, + "step": 3417 + }, + { + "epoch": 0.5537913156189241, + "grad_norm": 1.1228997707366943, + "learning_rate": 4.69320599364968e-06, + "loss": 0.1561, + "step": 3418 + }, + { + "epoch": 0.553953337653921, + "grad_norm": 0.9270069003105164, + "learning_rate": 4.692996065246742e-06, + "loss": 0.1384, + "step": 3419 + }, + { + "epoch": 0.5541153596889177, + "grad_norm": 1.013025164604187, + "learning_rate": 4.692786069743432e-06, + "loss": 0.1832, + "step": 3420 + }, + { + "epoch": 0.5542773817239145, + "grad_norm": 0.9863829612731934, + "learning_rate": 4.692576007146175e-06, + "loss": 0.149, + "step": 3421 + }, + { + "epoch": 0.5544394037589112, + "grad_norm": 0.9370028376579285, + "learning_rate": 4.692365877461397e-06, + "loss": 0.1392, + "step": 3422 + }, + { + "epoch": 0.5546014257939079, + "grad_norm": 1.1558516025543213, + "learning_rate": 4.692155680695529e-06, + "loss": 0.1732, + "step": 3423 + }, + { + "epoch": 0.5547634478289047, + "grad_norm": 0.8533956408500671, + "learning_rate": 4.691945416855002e-06, + "loss": 0.1409, + "step": 3424 + }, + { + "epoch": 0.5549254698639015, + "grad_norm": 1.1222232580184937, + "learning_rate": 4.69173508594625e-06, + "loss": 0.1489, + "step": 3425 + }, + { + "epoch": 0.5550874918988983, + "grad_norm": 1.032179594039917, + "learning_rate": 4.6915246879757084e-06, + "loss": 0.162, + "step": 3426 + }, + { + "epoch": 0.555249513933895, + "grad_norm": 1.0500115156173706, + "learning_rate": 4.691314222949814e-06, + "loss": 0.1491, + "step": 3427 + }, + { + "epoch": 0.5554115359688918, + "grad_norm": 0.944101095199585, + "learning_rate": 4.691103690875007e-06, + "loss": 0.1565, + "step": 3428 + }, + { + "epoch": 0.5555735580038885, + "grad_norm": 0.9684193134307861, + "learning_rate": 4.690893091757731e-06, + "loss": 0.1523, + "step": 3429 + }, + { + "epoch": 0.5557355800388852, + "grad_norm": 0.9088751077651978, + "learning_rate": 4.690682425604427e-06, + "loss": 0.1326, + "step": 3430 + }, + { + "epoch": 0.5558976020738821, + "grad_norm": 0.8529621958732605, + "learning_rate": 4.6904716924215425e-06, + "loss": 0.1359, + "step": 3431 + }, + { + "epoch": 0.5560596241088788, + "grad_norm": 1.1265101432800293, + "learning_rate": 4.690260892215525e-06, + "loss": 0.1779, + "step": 3432 + }, + { + "epoch": 0.5562216461438756, + "grad_norm": 0.9452961087226868, + "learning_rate": 4.690050024992825e-06, + "loss": 0.1409, + "step": 3433 + }, + { + "epoch": 0.5563836681788723, + "grad_norm": 0.9862555265426636, + "learning_rate": 4.689839090759893e-06, + "loss": 0.1692, + "step": 3434 + }, + { + "epoch": 0.5565456902138691, + "grad_norm": 0.9753914475440979, + "learning_rate": 4.689628089523185e-06, + "loss": 0.1589, + "step": 3435 + }, + { + "epoch": 0.5567077122488658, + "grad_norm": 0.8871638178825378, + "learning_rate": 4.689417021289157e-06, + "loss": 0.1438, + "step": 3436 + }, + { + "epoch": 0.5568697342838627, + "grad_norm": 0.890839159488678, + "learning_rate": 4.689205886064265e-06, + "loss": 0.1378, + "step": 3437 + }, + { + "epoch": 0.5570317563188594, + "grad_norm": 1.1297328472137451, + "learning_rate": 4.68899468385497e-06, + "loss": 0.1829, + "step": 3438 + }, + { + "epoch": 0.5571937783538561, + "grad_norm": 1.0578821897506714, + "learning_rate": 4.6887834146677365e-06, + "loss": 0.1538, + "step": 3439 + }, + { + "epoch": 0.5573558003888529, + "grad_norm": 0.9971956014633179, + "learning_rate": 4.688572078509027e-06, + "loss": 0.1608, + "step": 3440 + }, + { + "epoch": 0.5575178224238496, + "grad_norm": 1.0291696786880493, + "learning_rate": 4.688360675385308e-06, + "loss": 0.1569, + "step": 3441 + }, + { + "epoch": 0.5576798444588464, + "grad_norm": 0.9045324921607971, + "learning_rate": 4.688149205303048e-06, + "loss": 0.1374, + "step": 3442 + }, + { + "epoch": 0.5578418664938432, + "grad_norm": 1.0575282573699951, + "learning_rate": 4.687937668268718e-06, + "loss": 0.1511, + "step": 3443 + }, + { + "epoch": 0.55800388852884, + "grad_norm": 0.9568415880203247, + "learning_rate": 4.687726064288789e-06, + "loss": 0.1441, + "step": 3444 + }, + { + "epoch": 0.5581659105638367, + "grad_norm": 0.9325804710388184, + "learning_rate": 4.687514393369738e-06, + "loss": 0.1417, + "step": 3445 + }, + { + "epoch": 0.5583279325988334, + "grad_norm": 1.066925048828125, + "learning_rate": 4.6873026555180386e-06, + "loss": 0.1646, + "step": 3446 + }, + { + "epoch": 0.5584899546338302, + "grad_norm": 0.9176453948020935, + "learning_rate": 4.687090850740172e-06, + "loss": 0.1265, + "step": 3447 + }, + { + "epoch": 0.5586519766688269, + "grad_norm": 0.9668508768081665, + "learning_rate": 4.6868789790426185e-06, + "loss": 0.1433, + "step": 3448 + }, + { + "epoch": 0.5588139987038238, + "grad_norm": 0.8948216438293457, + "learning_rate": 4.68666704043186e-06, + "loss": 0.1344, + "step": 3449 + }, + { + "epoch": 0.5589760207388205, + "grad_norm": 0.9884295463562012, + "learning_rate": 4.6864550349143815e-06, + "loss": 0.1584, + "step": 3450 + }, + { + "epoch": 0.5591380427738173, + "grad_norm": 0.8515805006027222, + "learning_rate": 4.6862429624966695e-06, + "loss": 0.144, + "step": 3451 + }, + { + "epoch": 0.559300064808814, + "grad_norm": 0.9054228663444519, + "learning_rate": 4.686030823185215e-06, + "loss": 0.1445, + "step": 3452 + }, + { + "epoch": 0.5594620868438107, + "grad_norm": 1.0499194860458374, + "learning_rate": 4.685818616986506e-06, + "loss": 0.1536, + "step": 3453 + }, + { + "epoch": 0.5596241088788075, + "grad_norm": 0.8390993475914001, + "learning_rate": 4.685606343907038e-06, + "loss": 0.1301, + "step": 3454 + }, + { + "epoch": 0.5597861309138042, + "grad_norm": 0.996111273765564, + "learning_rate": 4.685394003953304e-06, + "loss": 0.1515, + "step": 3455 + }, + { + "epoch": 0.5599481529488011, + "grad_norm": 1.1226462125778198, + "learning_rate": 4.685181597131802e-06, + "loss": 0.1795, + "step": 3456 + }, + { + "epoch": 0.5601101749837978, + "grad_norm": 1.1062334775924683, + "learning_rate": 4.684969123449032e-06, + "loss": 0.1587, + "step": 3457 + }, + { + "epoch": 0.5602721970187946, + "grad_norm": 0.9502385258674622, + "learning_rate": 4.684756582911494e-06, + "loss": 0.1506, + "step": 3458 + }, + { + "epoch": 0.5604342190537913, + "grad_norm": 0.9273852109909058, + "learning_rate": 4.684543975525691e-06, + "loss": 0.1421, + "step": 3459 + }, + { + "epoch": 0.560596241088788, + "grad_norm": 1.0246306657791138, + "learning_rate": 4.6843313012981295e-06, + "loss": 0.1696, + "step": 3460 + }, + { + "epoch": 0.5607582631237849, + "grad_norm": 0.9162903428077698, + "learning_rate": 4.684118560235315e-06, + "loss": 0.1468, + "step": 3461 + }, + { + "epoch": 0.5609202851587816, + "grad_norm": 1.003145456314087, + "learning_rate": 4.6839057523437606e-06, + "loss": 0.1484, + "step": 3462 + }, + { + "epoch": 0.5610823071937784, + "grad_norm": 0.9194111824035645, + "learning_rate": 4.683692877629973e-06, + "loss": 0.1415, + "step": 3463 + }, + { + "epoch": 0.5612443292287751, + "grad_norm": 1.1066536903381348, + "learning_rate": 4.683479936100468e-06, + "loss": 0.1734, + "step": 3464 + }, + { + "epoch": 0.5614063512637719, + "grad_norm": 0.9368934631347656, + "learning_rate": 4.683266927761762e-06, + "loss": 0.1652, + "step": 3465 + }, + { + "epoch": 0.5615683732987686, + "grad_norm": 0.9979063868522644, + "learning_rate": 4.68305385262037e-06, + "loss": 0.1536, + "step": 3466 + }, + { + "epoch": 0.5617303953337653, + "grad_norm": 0.9077597856521606, + "learning_rate": 4.6828407106828135e-06, + "loss": 0.1353, + "step": 3467 + }, + { + "epoch": 0.5618924173687622, + "grad_norm": 1.0348340272903442, + "learning_rate": 4.682627501955614e-06, + "loss": 0.1543, + "step": 3468 + }, + { + "epoch": 0.5620544394037589, + "grad_norm": 0.8664389252662659, + "learning_rate": 4.6824142264452945e-06, + "loss": 0.1277, + "step": 3469 + }, + { + "epoch": 0.5622164614387557, + "grad_norm": 0.9749402403831482, + "learning_rate": 4.682200884158381e-06, + "loss": 0.1528, + "step": 3470 + }, + { + "epoch": 0.5623784834737524, + "grad_norm": 0.9896659255027771, + "learning_rate": 4.6819874751014015e-06, + "loss": 0.1463, + "step": 3471 + }, + { + "epoch": 0.5625405055087492, + "grad_norm": 0.9059292674064636, + "learning_rate": 4.6817739992808855e-06, + "loss": 0.1469, + "step": 3472 + }, + { + "epoch": 0.562702527543746, + "grad_norm": 0.8704636693000793, + "learning_rate": 4.681560456703364e-06, + "loss": 0.1475, + "step": 3473 + }, + { + "epoch": 0.5628645495787427, + "grad_norm": 0.9798029065132141, + "learning_rate": 4.681346847375373e-06, + "loss": 0.1693, + "step": 3474 + }, + { + "epoch": 0.5630265716137395, + "grad_norm": 0.971039891242981, + "learning_rate": 4.681133171303447e-06, + "loss": 0.1549, + "step": 3475 + }, + { + "epoch": 0.5631885936487362, + "grad_norm": 0.9427266120910645, + "learning_rate": 4.6809194284941236e-06, + "loss": 0.157, + "step": 3476 + }, + { + "epoch": 0.563350615683733, + "grad_norm": 1.0199015140533447, + "learning_rate": 4.680705618953944e-06, + "loss": 0.1631, + "step": 3477 + }, + { + "epoch": 0.5635126377187297, + "grad_norm": 0.9573736786842346, + "learning_rate": 4.6804917426894495e-06, + "loss": 0.1644, + "step": 3478 + }, + { + "epoch": 0.5636746597537265, + "grad_norm": 0.992607831954956, + "learning_rate": 4.680277799707185e-06, + "loss": 0.1475, + "step": 3479 + }, + { + "epoch": 0.5638366817887233, + "grad_norm": 0.8607563376426697, + "learning_rate": 4.6800637900136944e-06, + "loss": 0.1461, + "step": 3480 + }, + { + "epoch": 0.56399870382372, + "grad_norm": 0.8701785206794739, + "learning_rate": 4.6798497136155286e-06, + "loss": 0.1307, + "step": 3481 + }, + { + "epoch": 0.5641607258587168, + "grad_norm": 0.8611844778060913, + "learning_rate": 4.679635570519236e-06, + "loss": 0.1326, + "step": 3482 + }, + { + "epoch": 0.5643227478937135, + "grad_norm": 1.2296451330184937, + "learning_rate": 4.679421360731371e-06, + "loss": 0.1533, + "step": 3483 + }, + { + "epoch": 0.5644847699287103, + "grad_norm": 0.9519492983818054, + "learning_rate": 4.6792070842584855e-06, + "loss": 0.1453, + "step": 3484 + }, + { + "epoch": 0.564646791963707, + "grad_norm": 0.9066160321235657, + "learning_rate": 4.678992741107136e-06, + "loss": 0.1469, + "step": 3485 + }, + { + "epoch": 0.5648088139987039, + "grad_norm": 1.1821013689041138, + "learning_rate": 4.678778331283883e-06, + "loss": 0.1911, + "step": 3486 + }, + { + "epoch": 0.5649708360337006, + "grad_norm": 0.9712387919425964, + "learning_rate": 4.678563854795285e-06, + "loss": 0.166, + "step": 3487 + }, + { + "epoch": 0.5651328580686974, + "grad_norm": 0.964177668094635, + "learning_rate": 4.678349311647905e-06, + "loss": 0.1541, + "step": 3488 + }, + { + "epoch": 0.5652948801036941, + "grad_norm": 0.864924967288971, + "learning_rate": 4.678134701848308e-06, + "loss": 0.1395, + "step": 3489 + }, + { + "epoch": 0.5654569021386908, + "grad_norm": 0.9088456630706787, + "learning_rate": 4.67792002540306e-06, + "loss": 0.1444, + "step": 3490 + }, + { + "epoch": 0.5656189241736876, + "grad_norm": 0.9394357204437256, + "learning_rate": 4.67770528231873e-06, + "loss": 0.1482, + "step": 3491 + }, + { + "epoch": 0.5657809462086844, + "grad_norm": 0.8865934014320374, + "learning_rate": 4.677490472601888e-06, + "loss": 0.1456, + "step": 3492 + }, + { + "epoch": 0.5659429682436812, + "grad_norm": 1.0510127544403076, + "learning_rate": 4.677275596259107e-06, + "loss": 0.1593, + "step": 3493 + }, + { + "epoch": 0.5661049902786779, + "grad_norm": 0.8334269523620605, + "learning_rate": 4.677060653296961e-06, + "loss": 0.123, + "step": 3494 + }, + { + "epoch": 0.5662670123136747, + "grad_norm": 1.0426408052444458, + "learning_rate": 4.676845643722028e-06, + "loss": 0.1603, + "step": 3495 + }, + { + "epoch": 0.5664290343486714, + "grad_norm": 0.9913350343704224, + "learning_rate": 4.676630567540886e-06, + "loss": 0.1548, + "step": 3496 + }, + { + "epoch": 0.5665910563836681, + "grad_norm": 0.9299079775810242, + "learning_rate": 4.676415424760115e-06, + "loss": 0.1387, + "step": 3497 + }, + { + "epoch": 0.566753078418665, + "grad_norm": 0.8800061345100403, + "learning_rate": 4.6762002153863e-06, + "loss": 0.1399, + "step": 3498 + }, + { + "epoch": 0.5669151004536617, + "grad_norm": 0.827441930770874, + "learning_rate": 4.675984939426026e-06, + "loss": 0.1308, + "step": 3499 + }, + { + "epoch": 0.5670771224886585, + "grad_norm": 0.914867639541626, + "learning_rate": 4.675769596885877e-06, + "loss": 0.1541, + "step": 3500 + }, + { + "epoch": 0.5672391445236552, + "grad_norm": 0.7989079356193542, + "learning_rate": 4.675554187772444e-06, + "loss": 0.1272, + "step": 3501 + }, + { + "epoch": 0.567401166558652, + "grad_norm": 1.044179916381836, + "learning_rate": 4.675338712092316e-06, + "loss": 0.1792, + "step": 3502 + }, + { + "epoch": 0.5675631885936487, + "grad_norm": 1.00692617893219, + "learning_rate": 4.67512316985209e-06, + "loss": 0.1667, + "step": 3503 + }, + { + "epoch": 0.5677252106286454, + "grad_norm": 0.9069098830223083, + "learning_rate": 4.674907561058358e-06, + "loss": 0.1426, + "step": 3504 + }, + { + "epoch": 0.5678872326636423, + "grad_norm": 0.9729463458061218, + "learning_rate": 4.674691885717717e-06, + "loss": 0.1551, + "step": 3505 + }, + { + "epoch": 0.568049254698639, + "grad_norm": 0.8880230188369751, + "learning_rate": 4.674476143836768e-06, + "loss": 0.1346, + "step": 3506 + }, + { + "epoch": 0.5682112767336358, + "grad_norm": 0.9867011904716492, + "learning_rate": 4.6742603354221105e-06, + "loss": 0.1409, + "step": 3507 + }, + { + "epoch": 0.5683732987686325, + "grad_norm": 0.9959049820899963, + "learning_rate": 4.674044460480348e-06, + "loss": 0.1525, + "step": 3508 + }, + { + "epoch": 0.5685353208036293, + "grad_norm": 0.9901296496391296, + "learning_rate": 4.6738285190180865e-06, + "loss": 0.145, + "step": 3509 + }, + { + "epoch": 0.568697342838626, + "grad_norm": 0.9741671085357666, + "learning_rate": 4.673612511041933e-06, + "loss": 0.157, + "step": 3510 + }, + { + "epoch": 0.5688593648736228, + "grad_norm": 0.9612037539482117, + "learning_rate": 4.673396436558497e-06, + "loss": 0.1535, + "step": 3511 + }, + { + "epoch": 0.5690213869086196, + "grad_norm": 1.0780646800994873, + "learning_rate": 4.673180295574389e-06, + "loss": 0.1713, + "step": 3512 + }, + { + "epoch": 0.5691834089436163, + "grad_norm": 1.152421236038208, + "learning_rate": 4.672964088096223e-06, + "loss": 0.1358, + "step": 3513 + }, + { + "epoch": 0.5693454309786131, + "grad_norm": 0.9687821269035339, + "learning_rate": 4.672747814130615e-06, + "loss": 0.157, + "step": 3514 + }, + { + "epoch": 0.5695074530136098, + "grad_norm": 1.0094846487045288, + "learning_rate": 4.6725314736841806e-06, + "loss": 0.1582, + "step": 3515 + }, + { + "epoch": 0.5696694750486067, + "grad_norm": 1.0530569553375244, + "learning_rate": 4.672315066763542e-06, + "loss": 0.1764, + "step": 3516 + }, + { + "epoch": 0.5698314970836034, + "grad_norm": 0.8865777254104614, + "learning_rate": 4.672098593375319e-06, + "loss": 0.1384, + "step": 3517 + }, + { + "epoch": 0.5699935191186001, + "grad_norm": 0.9916114211082458, + "learning_rate": 4.671882053526135e-06, + "loss": 0.1531, + "step": 3518 + }, + { + "epoch": 0.5701555411535969, + "grad_norm": 0.9261521100997925, + "learning_rate": 4.671665447222615e-06, + "loss": 0.1444, + "step": 3519 + }, + { + "epoch": 0.5703175631885936, + "grad_norm": 0.8022376894950867, + "learning_rate": 4.671448774471389e-06, + "loss": 0.1337, + "step": 3520 + }, + { + "epoch": 0.5704795852235904, + "grad_norm": 0.8965422511100769, + "learning_rate": 4.671232035279085e-06, + "loss": 0.136, + "step": 3521 + }, + { + "epoch": 0.5706416072585871, + "grad_norm": 0.9427691102027893, + "learning_rate": 4.671015229652335e-06, + "loss": 0.1415, + "step": 3522 + }, + { + "epoch": 0.570803629293584, + "grad_norm": 0.9803239107131958, + "learning_rate": 4.6707983575977724e-06, + "loss": 0.1568, + "step": 3523 + }, + { + "epoch": 0.5709656513285807, + "grad_norm": 1.0181081295013428, + "learning_rate": 4.670581419122034e-06, + "loss": 0.1559, + "step": 3524 + }, + { + "epoch": 0.5711276733635774, + "grad_norm": 0.9415149688720703, + "learning_rate": 4.670364414231756e-06, + "loss": 0.1375, + "step": 3525 + }, + { + "epoch": 0.5712896953985742, + "grad_norm": 0.9571231007575989, + "learning_rate": 4.67014734293358e-06, + "loss": 0.1489, + "step": 3526 + }, + { + "epoch": 0.5714517174335709, + "grad_norm": 0.9342589378356934, + "learning_rate": 4.669930205234146e-06, + "loss": 0.1547, + "step": 3527 + }, + { + "epoch": 0.5716137394685677, + "grad_norm": 1.1925525665283203, + "learning_rate": 4.6697130011401e-06, + "loss": 0.1662, + "step": 3528 + }, + { + "epoch": 0.5717757615035645, + "grad_norm": 0.8905578851699829, + "learning_rate": 4.669495730658086e-06, + "loss": 0.1552, + "step": 3529 + }, + { + "epoch": 0.5719377835385613, + "grad_norm": 0.839648425579071, + "learning_rate": 4.669278393794753e-06, + "loss": 0.1288, + "step": 3530 + }, + { + "epoch": 0.572099805573558, + "grad_norm": 0.937215268611908, + "learning_rate": 4.669060990556751e-06, + "loss": 0.1486, + "step": 3531 + }, + { + "epoch": 0.5722618276085548, + "grad_norm": 1.0918093919754028, + "learning_rate": 4.6688435209507305e-06, + "loss": 0.1545, + "step": 3532 + }, + { + "epoch": 0.5724238496435515, + "grad_norm": 1.0010573863983154, + "learning_rate": 4.668625984983347e-06, + "loss": 0.1635, + "step": 3533 + }, + { + "epoch": 0.5725858716785482, + "grad_norm": 0.9082236289978027, + "learning_rate": 4.668408382661257e-06, + "loss": 0.136, + "step": 3534 + }, + { + "epoch": 0.5727478937135451, + "grad_norm": 0.9376429319381714, + "learning_rate": 4.6681907139911185e-06, + "loss": 0.1462, + "step": 3535 + }, + { + "epoch": 0.5729099157485418, + "grad_norm": 0.9339603185653687, + "learning_rate": 4.66797297897959e-06, + "loss": 0.1397, + "step": 3536 + }, + { + "epoch": 0.5730719377835386, + "grad_norm": 0.9145956635475159, + "learning_rate": 4.667755177633335e-06, + "loss": 0.1423, + "step": 3537 + }, + { + "epoch": 0.5732339598185353, + "grad_norm": 0.9105459451675415, + "learning_rate": 4.667537309959018e-06, + "loss": 0.1487, + "step": 3538 + }, + { + "epoch": 0.5733959818535321, + "grad_norm": 0.9851331114768982, + "learning_rate": 4.667319375963304e-06, + "loss": 0.15, + "step": 3539 + }, + { + "epoch": 0.5735580038885288, + "grad_norm": 0.9845294952392578, + "learning_rate": 4.667101375652862e-06, + "loss": 0.1442, + "step": 3540 + }, + { + "epoch": 0.5737200259235256, + "grad_norm": 0.8981902599334717, + "learning_rate": 4.666883309034362e-06, + "loss": 0.143, + "step": 3541 + }, + { + "epoch": 0.5738820479585224, + "grad_norm": 1.1543735265731812, + "learning_rate": 4.666665176114477e-06, + "loss": 0.1609, + "step": 3542 + }, + { + "epoch": 0.5740440699935191, + "grad_norm": 0.9345577359199524, + "learning_rate": 4.666446976899881e-06, + "loss": 0.1438, + "step": 3543 + }, + { + "epoch": 0.5742060920285159, + "grad_norm": 0.9611945152282715, + "learning_rate": 4.666228711397249e-06, + "loss": 0.1577, + "step": 3544 + }, + { + "epoch": 0.5743681140635126, + "grad_norm": 0.9625419974327087, + "learning_rate": 4.6660103796132615e-06, + "loss": 0.1464, + "step": 3545 + }, + { + "epoch": 0.5745301360985094, + "grad_norm": 0.9665799140930176, + "learning_rate": 4.665791981554598e-06, + "loss": 0.1501, + "step": 3546 + }, + { + "epoch": 0.5746921581335062, + "grad_norm": 1.2185211181640625, + "learning_rate": 4.665573517227942e-06, + "loss": 0.1475, + "step": 3547 + }, + { + "epoch": 0.5748541801685029, + "grad_norm": 0.938981294631958, + "learning_rate": 4.665354986639975e-06, + "loss": 0.1545, + "step": 3548 + }, + { + "epoch": 0.5750162022034997, + "grad_norm": 0.9329091310501099, + "learning_rate": 4.665136389797387e-06, + "loss": 0.1437, + "step": 3549 + }, + { + "epoch": 0.5751782242384964, + "grad_norm": 1.1116046905517578, + "learning_rate": 4.664917726706864e-06, + "loss": 0.1516, + "step": 3550 + }, + { + "epoch": 0.5753402462734932, + "grad_norm": 0.8297967910766602, + "learning_rate": 4.664698997375098e-06, + "loss": 0.1187, + "step": 3551 + }, + { + "epoch": 0.5755022683084899, + "grad_norm": 0.9134968519210815, + "learning_rate": 4.6644802018087806e-06, + "loss": 0.1374, + "step": 3552 + }, + { + "epoch": 0.5756642903434868, + "grad_norm": 1.091581106185913, + "learning_rate": 4.664261340014608e-06, + "loss": 0.1626, + "step": 3553 + }, + { + "epoch": 0.5758263123784835, + "grad_norm": 0.9347933530807495, + "learning_rate": 4.664042411999276e-06, + "loss": 0.139, + "step": 3554 + }, + { + "epoch": 0.5759883344134802, + "grad_norm": 1.0377494096755981, + "learning_rate": 4.663823417769482e-06, + "loss": 0.1703, + "step": 3555 + }, + { + "epoch": 0.576150356448477, + "grad_norm": 0.9047291874885559, + "learning_rate": 4.663604357331928e-06, + "loss": 0.14, + "step": 3556 + }, + { + "epoch": 0.5763123784834737, + "grad_norm": 0.9950608611106873, + "learning_rate": 4.663385230693316e-06, + "loss": 0.1631, + "step": 3557 + }, + { + "epoch": 0.5764744005184705, + "grad_norm": 0.8619599938392639, + "learning_rate": 4.6631660378603526e-06, + "loss": 0.1388, + "step": 3558 + }, + { + "epoch": 0.5766364225534673, + "grad_norm": 0.9685336947441101, + "learning_rate": 4.662946778839742e-06, + "loss": 0.1649, + "step": 3559 + }, + { + "epoch": 0.5767984445884641, + "grad_norm": 0.9717000722885132, + "learning_rate": 4.662727453638195e-06, + "loss": 0.1489, + "step": 3560 + }, + { + "epoch": 0.5769604666234608, + "grad_norm": 0.8928928375244141, + "learning_rate": 4.662508062262421e-06, + "loss": 0.1475, + "step": 3561 + }, + { + "epoch": 0.5771224886584575, + "grad_norm": 0.8069142699241638, + "learning_rate": 4.662288604719134e-06, + "loss": 0.125, + "step": 3562 + }, + { + "epoch": 0.5772845106934543, + "grad_norm": 0.8762704133987427, + "learning_rate": 4.662069081015047e-06, + "loss": 0.1369, + "step": 3563 + }, + { + "epoch": 0.577446532728451, + "grad_norm": 0.9283084869384766, + "learning_rate": 4.66184949115688e-06, + "loss": 0.1376, + "step": 3564 + }, + { + "epoch": 0.5776085547634479, + "grad_norm": 1.0266906023025513, + "learning_rate": 4.66162983515135e-06, + "loss": 0.1532, + "step": 3565 + }, + { + "epoch": 0.5777705767984446, + "grad_norm": 1.1265392303466797, + "learning_rate": 4.661410113005177e-06, + "loss": 0.1586, + "step": 3566 + }, + { + "epoch": 0.5779325988334414, + "grad_norm": 0.9027408361434937, + "learning_rate": 4.661190324725085e-06, + "loss": 0.1427, + "step": 3567 + }, + { + "epoch": 0.5780946208684381, + "grad_norm": 0.9591796398162842, + "learning_rate": 4.6609704703178e-06, + "loss": 0.1442, + "step": 3568 + }, + { + "epoch": 0.5782566429034348, + "grad_norm": 1.0705912113189697, + "learning_rate": 4.6607505497900475e-06, + "loss": 0.156, + "step": 3569 + }, + { + "epoch": 0.5784186649384316, + "grad_norm": 1.016605257987976, + "learning_rate": 4.660530563148557e-06, + "loss": 0.1494, + "step": 3570 + }, + { + "epoch": 0.5785806869734283, + "grad_norm": 0.9418259263038635, + "learning_rate": 4.66031051040006e-06, + "loss": 0.144, + "step": 3571 + }, + { + "epoch": 0.5787427090084252, + "grad_norm": 1.0305404663085938, + "learning_rate": 4.66009039155129e-06, + "loss": 0.1371, + "step": 3572 + }, + { + "epoch": 0.5789047310434219, + "grad_norm": 0.9588339328765869, + "learning_rate": 4.65987020660898e-06, + "loss": 0.1428, + "step": 3573 + }, + { + "epoch": 0.5790667530784187, + "grad_norm": 0.9257709980010986, + "learning_rate": 4.659649955579869e-06, + "loss": 0.1238, + "step": 3574 + }, + { + "epoch": 0.5792287751134154, + "grad_norm": 0.9726823568344116, + "learning_rate": 4.659429638470695e-06, + "loss": 0.1537, + "step": 3575 + }, + { + "epoch": 0.5793907971484121, + "grad_norm": 0.880847156047821, + "learning_rate": 4.659209255288201e-06, + "loss": 0.1404, + "step": 3576 + }, + { + "epoch": 0.579552819183409, + "grad_norm": 0.9704412817955017, + "learning_rate": 4.658988806039129e-06, + "loss": 0.1457, + "step": 3577 + }, + { + "epoch": 0.5797148412184057, + "grad_norm": 0.9277657866477966, + "learning_rate": 4.658768290730222e-06, + "loss": 0.1439, + "step": 3578 + }, + { + "epoch": 0.5798768632534025, + "grad_norm": 1.1359479427337646, + "learning_rate": 4.658547709368232e-06, + "loss": 0.181, + "step": 3579 + }, + { + "epoch": 0.5800388852883992, + "grad_norm": 1.0015472173690796, + "learning_rate": 4.658327061959904e-06, + "loss": 0.148, + "step": 3580 + }, + { + "epoch": 0.580200907323396, + "grad_norm": 0.9326437711715698, + "learning_rate": 4.658106348511992e-06, + "loss": 0.1476, + "step": 3581 + }, + { + "epoch": 0.5803629293583927, + "grad_norm": 1.0903030633926392, + "learning_rate": 4.6578855690312474e-06, + "loss": 0.1691, + "step": 3582 + }, + { + "epoch": 0.5805249513933896, + "grad_norm": 0.8469444513320923, + "learning_rate": 4.657664723524426e-06, + "loss": 0.1308, + "step": 3583 + }, + { + "epoch": 0.5806869734283863, + "grad_norm": 0.9137740135192871, + "learning_rate": 4.657443811998287e-06, + "loss": 0.1478, + "step": 3584 + }, + { + "epoch": 0.580848995463383, + "grad_norm": 0.9702494740486145, + "learning_rate": 4.657222834459588e-06, + "loss": 0.1455, + "step": 3585 + }, + { + "epoch": 0.5810110174983798, + "grad_norm": 0.9724593162536621, + "learning_rate": 4.65700179091509e-06, + "loss": 0.1515, + "step": 3586 + }, + { + "epoch": 0.5811730395333765, + "grad_norm": 0.8013084530830383, + "learning_rate": 4.6567806813715575e-06, + "loss": 0.1202, + "step": 3587 + }, + { + "epoch": 0.5813350615683733, + "grad_norm": 0.9419142007827759, + "learning_rate": 4.656559505835755e-06, + "loss": 0.1389, + "step": 3588 + }, + { + "epoch": 0.58149708360337, + "grad_norm": 0.9750432372093201, + "learning_rate": 4.6563382643144505e-06, + "loss": 0.1489, + "step": 3589 + }, + { + "epoch": 0.5816591056383669, + "grad_norm": 0.8804764151573181, + "learning_rate": 4.656116956814414e-06, + "loss": 0.1381, + "step": 3590 + }, + { + "epoch": 0.5818211276733636, + "grad_norm": 0.9245219230651855, + "learning_rate": 4.655895583342415e-06, + "loss": 0.1461, + "step": 3591 + }, + { + "epoch": 0.5819831497083603, + "grad_norm": 0.8934546113014221, + "learning_rate": 4.655674143905229e-06, + "loss": 0.141, + "step": 3592 + }, + { + "epoch": 0.5821451717433571, + "grad_norm": 0.992603600025177, + "learning_rate": 4.655452638509631e-06, + "loss": 0.1639, + "step": 3593 + }, + { + "epoch": 0.5823071937783538, + "grad_norm": 1.0013014078140259, + "learning_rate": 4.655231067162398e-06, + "loss": 0.1502, + "step": 3594 + }, + { + "epoch": 0.5824692158133506, + "grad_norm": 0.9464520812034607, + "learning_rate": 4.655009429870311e-06, + "loss": 0.1467, + "step": 3595 + }, + { + "epoch": 0.5826312378483474, + "grad_norm": 0.8180822730064392, + "learning_rate": 4.65478772664015e-06, + "loss": 0.1202, + "step": 3596 + }, + { + "epoch": 0.5827932598833442, + "grad_norm": 0.7912185788154602, + "learning_rate": 4.6545659574786985e-06, + "loss": 0.1202, + "step": 3597 + }, + { + "epoch": 0.5829552819183409, + "grad_norm": 0.8654948472976685, + "learning_rate": 4.654344122392742e-06, + "loss": 0.1296, + "step": 3598 + }, + { + "epoch": 0.5831173039533376, + "grad_norm": 1.000624418258667, + "learning_rate": 4.65412222138907e-06, + "loss": 0.1421, + "step": 3599 + }, + { + "epoch": 0.5832793259883344, + "grad_norm": 0.9236934185028076, + "learning_rate": 4.6539002544744705e-06, + "loss": 0.139, + "step": 3600 + }, + { + "epoch": 0.5834413480233311, + "grad_norm": 0.8793966174125671, + "learning_rate": 4.653678221655735e-06, + "loss": 0.1356, + "step": 3601 + }, + { + "epoch": 0.583603370058328, + "grad_norm": 0.9714186787605286, + "learning_rate": 4.653456122939659e-06, + "loss": 0.132, + "step": 3602 + }, + { + "epoch": 0.5837653920933247, + "grad_norm": 1.067306399345398, + "learning_rate": 4.653233958333036e-06, + "loss": 0.159, + "step": 3603 + }, + { + "epoch": 0.5839274141283215, + "grad_norm": 0.8334851264953613, + "learning_rate": 4.653011727842665e-06, + "loss": 0.1267, + "step": 3604 + }, + { + "epoch": 0.5840894361633182, + "grad_norm": 0.8716873526573181, + "learning_rate": 4.652789431475346e-06, + "loss": 0.1423, + "step": 3605 + }, + { + "epoch": 0.5842514581983149, + "grad_norm": 0.8517516851425171, + "learning_rate": 4.652567069237877e-06, + "loss": 0.1191, + "step": 3606 + }, + { + "epoch": 0.5844134802333117, + "grad_norm": 0.9621455669403076, + "learning_rate": 4.652344641137068e-06, + "loss": 0.1457, + "step": 3607 + }, + { + "epoch": 0.5845755022683085, + "grad_norm": 1.0080509185791016, + "learning_rate": 4.652122147179721e-06, + "loss": 0.1642, + "step": 3608 + }, + { + "epoch": 0.5847375243033053, + "grad_norm": 0.8625085353851318, + "learning_rate": 4.6518995873726434e-06, + "loss": 0.1353, + "step": 3609 + }, + { + "epoch": 0.584899546338302, + "grad_norm": 0.8965188264846802, + "learning_rate": 4.651676961722647e-06, + "loss": 0.1345, + "step": 3610 + }, + { + "epoch": 0.5850615683732988, + "grad_norm": 1.0228919982910156, + "learning_rate": 4.651454270236541e-06, + "loss": 0.1283, + "step": 3611 + }, + { + "epoch": 0.5852235904082955, + "grad_norm": 1.0728081464767456, + "learning_rate": 4.651231512921142e-06, + "loss": 0.1795, + "step": 3612 + }, + { + "epoch": 0.5853856124432922, + "grad_norm": 1.0229250192642212, + "learning_rate": 4.651008689783264e-06, + "loss": 0.1516, + "step": 3613 + }, + { + "epoch": 0.5855476344782891, + "grad_norm": 0.8845334649085999, + "learning_rate": 4.650785800829726e-06, + "loss": 0.1361, + "step": 3614 + }, + { + "epoch": 0.5857096565132858, + "grad_norm": 0.9456861615180969, + "learning_rate": 4.650562846067347e-06, + "loss": 0.159, + "step": 3615 + }, + { + "epoch": 0.5858716785482826, + "grad_norm": 1.1445139646530151, + "learning_rate": 4.650339825502949e-06, + "loss": 0.163, + "step": 3616 + }, + { + "epoch": 0.5860337005832793, + "grad_norm": 0.9295792579650879, + "learning_rate": 4.650116739143356e-06, + "loss": 0.1346, + "step": 3617 + }, + { + "epoch": 0.5861957226182761, + "grad_norm": 0.9515857696533203, + "learning_rate": 4.6498935869953945e-06, + "loss": 0.1431, + "step": 3618 + }, + { + "epoch": 0.5863577446532728, + "grad_norm": 0.8937877416610718, + "learning_rate": 4.649670369065891e-06, + "loss": 0.1438, + "step": 3619 + }, + { + "epoch": 0.5865197666882696, + "grad_norm": 0.9099476337432861, + "learning_rate": 4.649447085361677e-06, + "loss": 0.1447, + "step": 3620 + }, + { + "epoch": 0.5866817887232664, + "grad_norm": 0.973203182220459, + "learning_rate": 4.649223735889583e-06, + "loss": 0.1532, + "step": 3621 + }, + { + "epoch": 0.5868438107582631, + "grad_norm": 0.8838689923286438, + "learning_rate": 4.649000320656445e-06, + "loss": 0.1507, + "step": 3622 + }, + { + "epoch": 0.5870058327932599, + "grad_norm": 0.8679401874542236, + "learning_rate": 4.6487768396690965e-06, + "loss": 0.1281, + "step": 3623 + }, + { + "epoch": 0.5871678548282566, + "grad_norm": 0.9477259516716003, + "learning_rate": 4.648553292934377e-06, + "loss": 0.1491, + "step": 3624 + }, + { + "epoch": 0.5873298768632534, + "grad_norm": 0.923620879650116, + "learning_rate": 4.648329680459127e-06, + "loss": 0.1535, + "step": 3625 + }, + { + "epoch": 0.5874918988982502, + "grad_norm": 0.9671549797058105, + "learning_rate": 4.648106002250186e-06, + "loss": 0.1598, + "step": 3626 + }, + { + "epoch": 0.587653920933247, + "grad_norm": 0.845306396484375, + "learning_rate": 4.6478822583144015e-06, + "loss": 0.1349, + "step": 3627 + }, + { + "epoch": 0.5878159429682437, + "grad_norm": 1.0081162452697754, + "learning_rate": 4.647658448658616e-06, + "loss": 0.1665, + "step": 3628 + }, + { + "epoch": 0.5879779650032404, + "grad_norm": 0.9138731360435486, + "learning_rate": 4.64743457328968e-06, + "loss": 0.1496, + "step": 3629 + }, + { + "epoch": 0.5881399870382372, + "grad_norm": 0.9138461351394653, + "learning_rate": 4.647210632214443e-06, + "loss": 0.1446, + "step": 3630 + }, + { + "epoch": 0.5883020090732339, + "grad_norm": 0.8777061104774475, + "learning_rate": 4.6469866254397564e-06, + "loss": 0.1461, + "step": 3631 + }, + { + "epoch": 0.5884640311082308, + "grad_norm": 0.984879732131958, + "learning_rate": 4.646762552972475e-06, + "loss": 0.1488, + "step": 3632 + }, + { + "epoch": 0.5886260531432275, + "grad_norm": 0.9393163919448853, + "learning_rate": 4.646538414819454e-06, + "loss": 0.1398, + "step": 3633 + }, + { + "epoch": 0.5887880751782243, + "grad_norm": 0.8989644646644592, + "learning_rate": 4.646314210987552e-06, + "loss": 0.1317, + "step": 3634 + }, + { + "epoch": 0.588950097213221, + "grad_norm": 1.0422638654708862, + "learning_rate": 4.646089941483629e-06, + "loss": 0.1567, + "step": 3635 + }, + { + "epoch": 0.5891121192482177, + "grad_norm": 0.9822480082511902, + "learning_rate": 4.645865606314548e-06, + "loss": 0.1557, + "step": 3636 + }, + { + "epoch": 0.5892741412832145, + "grad_norm": 1.0314334630966187, + "learning_rate": 4.645641205487172e-06, + "loss": 0.1502, + "step": 3637 + }, + { + "epoch": 0.5894361633182112, + "grad_norm": 1.0244547128677368, + "learning_rate": 4.645416739008367e-06, + "loss": 0.159, + "step": 3638 + }, + { + "epoch": 0.5895981853532081, + "grad_norm": 1.0546983480453491, + "learning_rate": 4.645192206885003e-06, + "loss": 0.1466, + "step": 3639 + }, + { + "epoch": 0.5897602073882048, + "grad_norm": 0.9143233895301819, + "learning_rate": 4.644967609123947e-06, + "loss": 0.1341, + "step": 3640 + }, + { + "epoch": 0.5899222294232016, + "grad_norm": 0.9980403184890747, + "learning_rate": 4.644742945732074e-06, + "loss": 0.1516, + "step": 3641 + }, + { + "epoch": 0.5900842514581983, + "grad_norm": 0.9705789685249329, + "learning_rate": 4.644518216716256e-06, + "loss": 0.1653, + "step": 3642 + }, + { + "epoch": 0.590246273493195, + "grad_norm": 0.951261579990387, + "learning_rate": 4.6442934220833716e-06, + "loss": 0.1499, + "step": 3643 + }, + { + "epoch": 0.5904082955281919, + "grad_norm": 0.9710079431533813, + "learning_rate": 4.644068561840297e-06, + "loss": 0.1623, + "step": 3644 + }, + { + "epoch": 0.5905703175631886, + "grad_norm": 0.9166867733001709, + "learning_rate": 4.643843635993913e-06, + "loss": 0.1437, + "step": 3645 + }, + { + "epoch": 0.5907323395981854, + "grad_norm": 0.9674573540687561, + "learning_rate": 4.643618644551101e-06, + "loss": 0.1575, + "step": 3646 + }, + { + "epoch": 0.5908943616331821, + "grad_norm": 0.9722270369529724, + "learning_rate": 4.643393587518747e-06, + "loss": 0.1585, + "step": 3647 + }, + { + "epoch": 0.5910563836681789, + "grad_norm": 0.8995879888534546, + "learning_rate": 4.643168464903736e-06, + "loss": 0.1396, + "step": 3648 + }, + { + "epoch": 0.5912184057031756, + "grad_norm": 0.9309381246566772, + "learning_rate": 4.642943276712956e-06, + "loss": 0.1465, + "step": 3649 + }, + { + "epoch": 0.5913804277381723, + "grad_norm": 0.9563421010971069, + "learning_rate": 4.642718022953297e-06, + "loss": 0.1389, + "step": 3650 + }, + { + "epoch": 0.5915424497731692, + "grad_norm": 0.9059107303619385, + "learning_rate": 4.642492703631652e-06, + "loss": 0.1399, + "step": 3651 + }, + { + "epoch": 0.5917044718081659, + "grad_norm": 1.094211220741272, + "learning_rate": 4.642267318754915e-06, + "loss": 0.1802, + "step": 3652 + }, + { + "epoch": 0.5918664938431627, + "grad_norm": 0.952056348323822, + "learning_rate": 4.6420418683299825e-06, + "loss": 0.1658, + "step": 3653 + }, + { + "epoch": 0.5920285158781594, + "grad_norm": 0.9111037254333496, + "learning_rate": 4.641816352363753e-06, + "loss": 0.1377, + "step": 3654 + }, + { + "epoch": 0.5921905379131562, + "grad_norm": 0.8917416334152222, + "learning_rate": 4.641590770863126e-06, + "loss": 0.1374, + "step": 3655 + }, + { + "epoch": 0.592352559948153, + "grad_norm": 0.9995194673538208, + "learning_rate": 4.641365123835004e-06, + "loss": 0.1524, + "step": 3656 + }, + { + "epoch": 0.5925145819831497, + "grad_norm": 0.972366213798523, + "learning_rate": 4.641139411286291e-06, + "loss": 0.1565, + "step": 3657 + }, + { + "epoch": 0.5926766040181465, + "grad_norm": 1.033241868019104, + "learning_rate": 4.640913633223893e-06, + "loss": 0.1777, + "step": 3658 + }, + { + "epoch": 0.5928386260531432, + "grad_norm": 0.9050754904747009, + "learning_rate": 4.640687789654719e-06, + "loss": 0.1473, + "step": 3659 + }, + { + "epoch": 0.59300064808814, + "grad_norm": 0.962814211845398, + "learning_rate": 4.64046188058568e-06, + "loss": 0.1508, + "step": 3660 + }, + { + "epoch": 0.5931626701231367, + "grad_norm": 1.0121870040893555, + "learning_rate": 4.640235906023686e-06, + "loss": 0.1686, + "step": 3661 + }, + { + "epoch": 0.5933246921581335, + "grad_norm": 0.9426042437553406, + "learning_rate": 4.6400098659756525e-06, + "loss": 0.1478, + "step": 3662 + }, + { + "epoch": 0.5934867141931303, + "grad_norm": 1.017554759979248, + "learning_rate": 4.639783760448497e-06, + "loss": 0.1571, + "step": 3663 + }, + { + "epoch": 0.593648736228127, + "grad_norm": 0.9146802425384521, + "learning_rate": 4.639557589449135e-06, + "loss": 0.1372, + "step": 3664 + }, + { + "epoch": 0.5938107582631238, + "grad_norm": 0.938179612159729, + "learning_rate": 4.6393313529844895e-06, + "loss": 0.1441, + "step": 3665 + }, + { + "epoch": 0.5939727802981205, + "grad_norm": 0.8877557516098022, + "learning_rate": 4.639105051061481e-06, + "loss": 0.1409, + "step": 3666 + }, + { + "epoch": 0.5941348023331173, + "grad_norm": 0.9249974489212036, + "learning_rate": 4.638878683687036e-06, + "loss": 0.1561, + "step": 3667 + }, + { + "epoch": 0.594296824368114, + "grad_norm": 0.906252384185791, + "learning_rate": 4.638652250868078e-06, + "loss": 0.1418, + "step": 3668 + }, + { + "epoch": 0.5944588464031109, + "grad_norm": 0.9417980909347534, + "learning_rate": 4.638425752611536e-06, + "loss": 0.1536, + "step": 3669 + }, + { + "epoch": 0.5946208684381076, + "grad_norm": 0.8455900549888611, + "learning_rate": 4.6381991889243416e-06, + "loss": 0.1347, + "step": 3670 + }, + { + "epoch": 0.5947828904731044, + "grad_norm": 1.0007245540618896, + "learning_rate": 4.6379725598134265e-06, + "loss": 0.1662, + "step": 3671 + }, + { + "epoch": 0.5949449125081011, + "grad_norm": 0.8600866198539734, + "learning_rate": 4.637745865285725e-06, + "loss": 0.1389, + "step": 3672 + }, + { + "epoch": 0.5951069345430978, + "grad_norm": 0.9751458764076233, + "learning_rate": 4.637519105348173e-06, + "loss": 0.154, + "step": 3673 + }, + { + "epoch": 0.5952689565780946, + "grad_norm": 0.9357625842094421, + "learning_rate": 4.637292280007709e-06, + "loss": 0.1592, + "step": 3674 + }, + { + "epoch": 0.5954309786130914, + "grad_norm": 0.8890631794929504, + "learning_rate": 4.637065389271274e-06, + "loss": 0.137, + "step": 3675 + }, + { + "epoch": 0.5955930006480882, + "grad_norm": 0.8981730341911316, + "learning_rate": 4.6368384331458085e-06, + "loss": 0.1411, + "step": 3676 + }, + { + "epoch": 0.5957550226830849, + "grad_norm": 0.987389862537384, + "learning_rate": 4.636611411638259e-06, + "loss": 0.1455, + "step": 3677 + }, + { + "epoch": 0.5959170447180817, + "grad_norm": 1.0186084508895874, + "learning_rate": 4.63638432475557e-06, + "loss": 0.1597, + "step": 3678 + }, + { + "epoch": 0.5960790667530784, + "grad_norm": 1.0408918857574463, + "learning_rate": 4.636157172504692e-06, + "loss": 0.1647, + "step": 3679 + }, + { + "epoch": 0.5962410887880751, + "grad_norm": 0.9388018250465393, + "learning_rate": 4.635929954892572e-06, + "loss": 0.1402, + "step": 3680 + }, + { + "epoch": 0.596403110823072, + "grad_norm": 0.8321965932846069, + "learning_rate": 4.635702671926166e-06, + "loss": 0.1253, + "step": 3681 + }, + { + "epoch": 0.5965651328580687, + "grad_norm": 0.8523208498954773, + "learning_rate": 4.6354753236124254e-06, + "loss": 0.1384, + "step": 3682 + }, + { + "epoch": 0.5967271548930655, + "grad_norm": 1.0064256191253662, + "learning_rate": 4.635247909958308e-06, + "loss": 0.1559, + "step": 3683 + }, + { + "epoch": 0.5968891769280622, + "grad_norm": 0.8501699566841125, + "learning_rate": 4.635020430970771e-06, + "loss": 0.1278, + "step": 3684 + }, + { + "epoch": 0.597051198963059, + "grad_norm": 0.8985577821731567, + "learning_rate": 4.634792886656777e-06, + "loss": 0.1393, + "step": 3685 + }, + { + "epoch": 0.5972132209980557, + "grad_norm": 1.0347974300384521, + "learning_rate": 4.6345652770232856e-06, + "loss": 0.1612, + "step": 3686 + }, + { + "epoch": 0.5973752430330524, + "grad_norm": 0.8934628367424011, + "learning_rate": 4.634337602077263e-06, + "loss": 0.1367, + "step": 3687 + }, + { + "epoch": 0.5975372650680493, + "grad_norm": 1.1091103553771973, + "learning_rate": 4.6341098618256745e-06, + "loss": 0.16, + "step": 3688 + }, + { + "epoch": 0.597699287103046, + "grad_norm": 1.0096241235733032, + "learning_rate": 4.633882056275488e-06, + "loss": 0.1496, + "step": 3689 + }, + { + "epoch": 0.5978613091380428, + "grad_norm": 0.793545126914978, + "learning_rate": 4.633654185433676e-06, + "loss": 0.1135, + "step": 3690 + }, + { + "epoch": 0.5980233311730395, + "grad_norm": 0.9871333241462708, + "learning_rate": 4.633426249307208e-06, + "loss": 0.1621, + "step": 3691 + }, + { + "epoch": 0.5981853532080363, + "grad_norm": 0.9502182602882385, + "learning_rate": 4.63319824790306e-06, + "loss": 0.1588, + "step": 3692 + }, + { + "epoch": 0.598347375243033, + "grad_norm": 0.9651210904121399, + "learning_rate": 4.632970181228208e-06, + "loss": 0.1581, + "step": 3693 + }, + { + "epoch": 0.5985093972780298, + "grad_norm": 0.91889488697052, + "learning_rate": 4.6327420492896295e-06, + "loss": 0.1405, + "step": 3694 + }, + { + "epoch": 0.5986714193130266, + "grad_norm": 0.9224652051925659, + "learning_rate": 4.632513852094306e-06, + "loss": 0.1439, + "step": 3695 + }, + { + "epoch": 0.5988334413480233, + "grad_norm": 1.0088536739349365, + "learning_rate": 4.632285589649219e-06, + "loss": 0.1434, + "step": 3696 + }, + { + "epoch": 0.5989954633830201, + "grad_norm": 0.9861193299293518, + "learning_rate": 4.632057261961353e-06, + "loss": 0.1488, + "step": 3697 + }, + { + "epoch": 0.5991574854180168, + "grad_norm": 0.9628931879997253, + "learning_rate": 4.631828869037694e-06, + "loss": 0.1572, + "step": 3698 + }, + { + "epoch": 0.5993195074530137, + "grad_norm": 0.9910879135131836, + "learning_rate": 4.631600410885231e-06, + "loss": 0.1529, + "step": 3699 + }, + { + "epoch": 0.5994815294880104, + "grad_norm": 0.876059889793396, + "learning_rate": 4.631371887510954e-06, + "loss": 0.1401, + "step": 3700 + }, + { + "epoch": 0.5996435515230071, + "grad_norm": 0.9525460004806519, + "learning_rate": 4.6311432989218545e-06, + "loss": 0.1607, + "step": 3701 + }, + { + "epoch": 0.5998055735580039, + "grad_norm": 0.8962662816047668, + "learning_rate": 4.630914645124928e-06, + "loss": 0.155, + "step": 3702 + }, + { + "epoch": 0.5999675955930006, + "grad_norm": 0.852394163608551, + "learning_rate": 4.630685926127169e-06, + "loss": 0.1339, + "step": 3703 + }, + { + "epoch": 0.6001296176279974, + "grad_norm": 1.0351753234863281, + "learning_rate": 4.630457141935577e-06, + "loss": 0.1648, + "step": 3704 + }, + { + "epoch": 0.6002916396629941, + "grad_norm": 0.8926696181297302, + "learning_rate": 4.630228292557153e-06, + "loss": 0.1472, + "step": 3705 + }, + { + "epoch": 0.600453661697991, + "grad_norm": 0.8851702213287354, + "learning_rate": 4.629999377998898e-06, + "loss": 0.1355, + "step": 3706 + }, + { + "epoch": 0.6006156837329877, + "grad_norm": 0.9555678367614746, + "learning_rate": 4.629770398267815e-06, + "loss": 0.1617, + "step": 3707 + }, + { + "epoch": 0.6007777057679844, + "grad_norm": 0.979743242263794, + "learning_rate": 4.629541353370914e-06, + "loss": 0.1756, + "step": 3708 + }, + { + "epoch": 0.6009397278029812, + "grad_norm": 0.8310829997062683, + "learning_rate": 4.6293122433152e-06, + "loss": 0.127, + "step": 3709 + }, + { + "epoch": 0.6011017498379779, + "grad_norm": 0.9136250615119934, + "learning_rate": 4.629083068107684e-06, + "loss": 0.1452, + "step": 3710 + }, + { + "epoch": 0.6012637718729748, + "grad_norm": 0.915210485458374, + "learning_rate": 4.628853827755378e-06, + "loss": 0.1557, + "step": 3711 + }, + { + "epoch": 0.6014257939079715, + "grad_norm": 0.902146577835083, + "learning_rate": 4.628624522265298e-06, + "loss": 0.1351, + "step": 3712 + }, + { + "epoch": 0.6015878159429683, + "grad_norm": 0.8265898823738098, + "learning_rate": 4.628395151644458e-06, + "loss": 0.1417, + "step": 3713 + }, + { + "epoch": 0.601749837977965, + "grad_norm": 1.038116693496704, + "learning_rate": 4.628165715899877e-06, + "loss": 0.1567, + "step": 3714 + }, + { + "epoch": 0.6019118600129617, + "grad_norm": 1.0465978384017944, + "learning_rate": 4.6279362150385755e-06, + "loss": 0.1566, + "step": 3715 + }, + { + "epoch": 0.6020738820479585, + "grad_norm": 0.9295293688774109, + "learning_rate": 4.627706649067575e-06, + "loss": 0.1515, + "step": 3716 + }, + { + "epoch": 0.6022359040829552, + "grad_norm": 0.8935041427612305, + "learning_rate": 4.6274770179939e-06, + "loss": 0.1467, + "step": 3717 + }, + { + "epoch": 0.6023979261179521, + "grad_norm": 1.1107021570205688, + "learning_rate": 4.627247321824576e-06, + "loss": 0.1565, + "step": 3718 + }, + { + "epoch": 0.6025599481529488, + "grad_norm": 0.9548209309577942, + "learning_rate": 4.627017560566633e-06, + "loss": 0.1464, + "step": 3719 + }, + { + "epoch": 0.6027219701879456, + "grad_norm": 0.8951996564865112, + "learning_rate": 4.6267877342271e-06, + "loss": 0.1415, + "step": 3720 + }, + { + "epoch": 0.6028839922229423, + "grad_norm": 0.8305820226669312, + "learning_rate": 4.626557842813008e-06, + "loss": 0.1319, + "step": 3721 + }, + { + "epoch": 0.6030460142579391, + "grad_norm": 0.8860799074172974, + "learning_rate": 4.626327886331392e-06, + "loss": 0.1335, + "step": 3722 + }, + { + "epoch": 0.6032080362929358, + "grad_norm": 0.9841209053993225, + "learning_rate": 4.626097864789289e-06, + "loss": 0.1727, + "step": 3723 + }, + { + "epoch": 0.6033700583279326, + "grad_norm": 0.8832147717475891, + "learning_rate": 4.625867778193737e-06, + "loss": 0.1418, + "step": 3724 + }, + { + "epoch": 0.6035320803629294, + "grad_norm": 0.9186353087425232, + "learning_rate": 4.625637626551774e-06, + "loss": 0.1462, + "step": 3725 + }, + { + "epoch": 0.6036941023979261, + "grad_norm": 0.8324099779129028, + "learning_rate": 4.625407409870444e-06, + "loss": 0.1441, + "step": 3726 + }, + { + "epoch": 0.6038561244329229, + "grad_norm": 0.7835507988929749, + "learning_rate": 4.625177128156791e-06, + "loss": 0.1322, + "step": 3727 + }, + { + "epoch": 0.6040181464679196, + "grad_norm": 0.9128600358963013, + "learning_rate": 4.624946781417861e-06, + "loss": 0.1451, + "step": 3728 + }, + { + "epoch": 0.6041801685029164, + "grad_norm": 0.7894352078437805, + "learning_rate": 4.624716369660701e-06, + "loss": 0.1217, + "step": 3729 + }, + { + "epoch": 0.6043421905379132, + "grad_norm": 0.8810085654258728, + "learning_rate": 4.624485892892363e-06, + "loss": 0.1463, + "step": 3730 + }, + { + "epoch": 0.6045042125729099, + "grad_norm": 1.0495206117630005, + "learning_rate": 4.624255351119897e-06, + "loss": 0.1634, + "step": 3731 + }, + { + "epoch": 0.6046662346079067, + "grad_norm": 1.0064868927001953, + "learning_rate": 4.624024744350358e-06, + "loss": 0.1705, + "step": 3732 + }, + { + "epoch": 0.6048282566429034, + "grad_norm": 0.985504150390625, + "learning_rate": 4.6237940725908014e-06, + "loss": 0.1366, + "step": 3733 + }, + { + "epoch": 0.6049902786779002, + "grad_norm": 0.9360939264297485, + "learning_rate": 4.623563335848286e-06, + "loss": 0.1567, + "step": 3734 + }, + { + "epoch": 0.6051523007128969, + "grad_norm": 0.9163661003112793, + "learning_rate": 4.623332534129872e-06, + "loss": 0.1453, + "step": 3735 + }, + { + "epoch": 0.6053143227478938, + "grad_norm": 1.0676296949386597, + "learning_rate": 4.62310166744262e-06, + "loss": 0.1633, + "step": 3736 + }, + { + "epoch": 0.6054763447828905, + "grad_norm": 0.9336465001106262, + "learning_rate": 4.622870735793595e-06, + "loss": 0.1483, + "step": 3737 + }, + { + "epoch": 0.6056383668178872, + "grad_norm": 0.8538773655891418, + "learning_rate": 4.622639739189863e-06, + "loss": 0.128, + "step": 3738 + }, + { + "epoch": 0.605800388852884, + "grad_norm": 0.9462392926216125, + "learning_rate": 4.622408677638491e-06, + "loss": 0.1465, + "step": 3739 + }, + { + "epoch": 0.6059624108878807, + "grad_norm": 0.965332567691803, + "learning_rate": 4.62217755114655e-06, + "loss": 0.1309, + "step": 3740 + }, + { + "epoch": 0.6061244329228775, + "grad_norm": 0.9652532339096069, + "learning_rate": 4.621946359721112e-06, + "loss": 0.1488, + "step": 3741 + }, + { + "epoch": 0.6062864549578743, + "grad_norm": 0.9327961802482605, + "learning_rate": 4.62171510336925e-06, + "loss": 0.1618, + "step": 3742 + }, + { + "epoch": 0.6064484769928711, + "grad_norm": 0.9430671334266663, + "learning_rate": 4.621483782098041e-06, + "loss": 0.1497, + "step": 3743 + }, + { + "epoch": 0.6066104990278678, + "grad_norm": 1.0052106380462646, + "learning_rate": 4.621252395914561e-06, + "loss": 0.1474, + "step": 3744 + }, + { + "epoch": 0.6067725210628645, + "grad_norm": 0.8685867786407471, + "learning_rate": 4.621020944825891e-06, + "loss": 0.1379, + "step": 3745 + }, + { + "epoch": 0.6069345430978613, + "grad_norm": 0.9735004305839539, + "learning_rate": 4.620789428839114e-06, + "loss": 0.1687, + "step": 3746 + }, + { + "epoch": 0.607096565132858, + "grad_norm": 0.8425636291503906, + "learning_rate": 4.6205578479613125e-06, + "loss": 0.1331, + "step": 3747 + }, + { + "epoch": 0.6072585871678549, + "grad_norm": 0.9645400047302246, + "learning_rate": 4.620326202199572e-06, + "loss": 0.1448, + "step": 3748 + }, + { + "epoch": 0.6074206092028516, + "grad_norm": 0.8548397421836853, + "learning_rate": 4.62009449156098e-06, + "loss": 0.139, + "step": 3749 + }, + { + "epoch": 0.6075826312378484, + "grad_norm": 0.92023104429245, + "learning_rate": 4.619862716052629e-06, + "loss": 0.1418, + "step": 3750 + }, + { + "epoch": 0.6077446532728451, + "grad_norm": 1.001118540763855, + "learning_rate": 4.6196308756816075e-06, + "loss": 0.1444, + "step": 3751 + }, + { + "epoch": 0.6079066753078418, + "grad_norm": 1.030665397644043, + "learning_rate": 4.6193989704550105e-06, + "loss": 0.1619, + "step": 3752 + }, + { + "epoch": 0.6080686973428386, + "grad_norm": 0.9593086242675781, + "learning_rate": 4.619167000379934e-06, + "loss": 0.1527, + "step": 3753 + }, + { + "epoch": 0.6082307193778353, + "grad_norm": 0.9325686097145081, + "learning_rate": 4.6189349654634766e-06, + "loss": 0.1493, + "step": 3754 + }, + { + "epoch": 0.6083927414128322, + "grad_norm": 1.058605670928955, + "learning_rate": 4.618702865712736e-06, + "loss": 0.1687, + "step": 3755 + }, + { + "epoch": 0.6085547634478289, + "grad_norm": 0.9116065502166748, + "learning_rate": 4.618470701134815e-06, + "loss": 0.1231, + "step": 3756 + }, + { + "epoch": 0.6087167854828257, + "grad_norm": 1.0380024909973145, + "learning_rate": 4.6182384717368174e-06, + "loss": 0.1602, + "step": 3757 + }, + { + "epoch": 0.6088788075178224, + "grad_norm": 1.04635488986969, + "learning_rate": 4.618006177525849e-06, + "loss": 0.1619, + "step": 3758 + }, + { + "epoch": 0.6090408295528191, + "grad_norm": 0.9052918553352356, + "learning_rate": 4.617773818509016e-06, + "loss": 0.1408, + "step": 3759 + }, + { + "epoch": 0.609202851587816, + "grad_norm": 0.8943411111831665, + "learning_rate": 4.61754139469343e-06, + "loss": 0.1294, + "step": 3760 + }, + { + "epoch": 0.6093648736228127, + "grad_norm": 0.9428607225418091, + "learning_rate": 4.617308906086201e-06, + "loss": 0.1561, + "step": 3761 + }, + { + "epoch": 0.6095268956578095, + "grad_norm": 0.9149422645568848, + "learning_rate": 4.6170763526944425e-06, + "loss": 0.1324, + "step": 3762 + }, + { + "epoch": 0.6096889176928062, + "grad_norm": 0.9159535765647888, + "learning_rate": 4.616843734525272e-06, + "loss": 0.1459, + "step": 3763 + }, + { + "epoch": 0.609850939727803, + "grad_norm": 0.915641188621521, + "learning_rate": 4.616611051585806e-06, + "loss": 0.1408, + "step": 3764 + }, + { + "epoch": 0.6100129617627997, + "grad_norm": 0.9071922898292542, + "learning_rate": 4.616378303883163e-06, + "loss": 0.145, + "step": 3765 + }, + { + "epoch": 0.6101749837977966, + "grad_norm": 0.8846275806427002, + "learning_rate": 4.6161454914244665e-06, + "loss": 0.1196, + "step": 3766 + }, + { + "epoch": 0.6103370058327933, + "grad_norm": 0.8655915856361389, + "learning_rate": 4.615912614216838e-06, + "loss": 0.1349, + "step": 3767 + }, + { + "epoch": 0.61049902786779, + "grad_norm": 0.8834684491157532, + "learning_rate": 4.615679672267405e-06, + "loss": 0.144, + "step": 3768 + }, + { + "epoch": 0.6106610499027868, + "grad_norm": 0.9629760384559631, + "learning_rate": 4.615446665583293e-06, + "loss": 0.149, + "step": 3769 + }, + { + "epoch": 0.6108230719377835, + "grad_norm": 0.8233225345611572, + "learning_rate": 4.615213594171633e-06, + "loss": 0.1256, + "step": 3770 + }, + { + "epoch": 0.6109850939727803, + "grad_norm": 0.8711490631103516, + "learning_rate": 4.6149804580395555e-06, + "loss": 0.1413, + "step": 3771 + }, + { + "epoch": 0.611147116007777, + "grad_norm": 0.8631579875946045, + "learning_rate": 4.614747257194194e-06, + "loss": 0.1279, + "step": 3772 + }, + { + "epoch": 0.6113091380427739, + "grad_norm": 1.0351872444152832, + "learning_rate": 4.614513991642684e-06, + "loss": 0.1701, + "step": 3773 + }, + { + "epoch": 0.6114711600777706, + "grad_norm": 0.8989577293395996, + "learning_rate": 4.614280661392163e-06, + "loss": 0.1516, + "step": 3774 + }, + { + "epoch": 0.6116331821127673, + "grad_norm": 0.9219076037406921, + "learning_rate": 4.61404726644977e-06, + "loss": 0.1482, + "step": 3775 + }, + { + "epoch": 0.6117952041477641, + "grad_norm": 1.016165852546692, + "learning_rate": 4.613813806822647e-06, + "loss": 0.1632, + "step": 3776 + }, + { + "epoch": 0.6119572261827608, + "grad_norm": 0.8687697649002075, + "learning_rate": 4.613580282517936e-06, + "loss": 0.1438, + "step": 3777 + }, + { + "epoch": 0.6121192482177576, + "grad_norm": 0.941464900970459, + "learning_rate": 4.613346693542784e-06, + "loss": 0.1477, + "step": 3778 + }, + { + "epoch": 0.6122812702527544, + "grad_norm": 0.8832119703292847, + "learning_rate": 4.613113039904337e-06, + "loss": 0.1303, + "step": 3779 + }, + { + "epoch": 0.6124432922877512, + "grad_norm": 0.837990403175354, + "learning_rate": 4.6128793216097445e-06, + "loss": 0.1366, + "step": 3780 + }, + { + "epoch": 0.6126053143227479, + "grad_norm": 0.8604587912559509, + "learning_rate": 4.612645538666157e-06, + "loss": 0.1297, + "step": 3781 + }, + { + "epoch": 0.6127673363577446, + "grad_norm": 0.9468358755111694, + "learning_rate": 4.61241169108073e-06, + "loss": 0.1561, + "step": 3782 + }, + { + "epoch": 0.6129293583927414, + "grad_norm": 0.8678791522979736, + "learning_rate": 4.612177778860617e-06, + "loss": 0.1352, + "step": 3783 + }, + { + "epoch": 0.6130913804277381, + "grad_norm": 0.8907611966133118, + "learning_rate": 4.611943802012975e-06, + "loss": 0.1373, + "step": 3784 + }, + { + "epoch": 0.613253402462735, + "grad_norm": 0.8536321520805359, + "learning_rate": 4.611709760544963e-06, + "loss": 0.1302, + "step": 3785 + }, + { + "epoch": 0.6134154244977317, + "grad_norm": 0.8262412548065186, + "learning_rate": 4.611475654463743e-06, + "loss": 0.1426, + "step": 3786 + }, + { + "epoch": 0.6135774465327285, + "grad_norm": 0.9103348255157471, + "learning_rate": 4.611241483776478e-06, + "loss": 0.1445, + "step": 3787 + }, + { + "epoch": 0.6137394685677252, + "grad_norm": 0.9351794123649597, + "learning_rate": 4.6110072484903326e-06, + "loss": 0.1353, + "step": 3788 + }, + { + "epoch": 0.6139014906027219, + "grad_norm": 1.0249840021133423, + "learning_rate": 4.610772948612473e-06, + "loss": 0.1455, + "step": 3789 + }, + { + "epoch": 0.6140635126377187, + "grad_norm": 0.9844290018081665, + "learning_rate": 4.610538584150071e-06, + "loss": 0.1643, + "step": 3790 + }, + { + "epoch": 0.6142255346727155, + "grad_norm": 0.9268943667411804, + "learning_rate": 4.6103041551102935e-06, + "loss": 0.1388, + "step": 3791 + }, + { + "epoch": 0.6143875567077123, + "grad_norm": 0.9610852599143982, + "learning_rate": 4.610069661500317e-06, + "loss": 0.1476, + "step": 3792 + }, + { + "epoch": 0.614549578742709, + "grad_norm": 0.9968616366386414, + "learning_rate": 4.609835103327315e-06, + "loss": 0.1548, + "step": 3793 + }, + { + "epoch": 0.6147116007777058, + "grad_norm": 0.9769883155822754, + "learning_rate": 4.609600480598464e-06, + "loss": 0.1619, + "step": 3794 + }, + { + "epoch": 0.6148736228127025, + "grad_norm": 0.9648511409759521, + "learning_rate": 4.609365793320944e-06, + "loss": 0.1464, + "step": 3795 + }, + { + "epoch": 0.6150356448476992, + "grad_norm": 0.9245080351829529, + "learning_rate": 4.6091310415019355e-06, + "loss": 0.1612, + "step": 3796 + }, + { + "epoch": 0.6151976668826961, + "grad_norm": 0.8954278230667114, + "learning_rate": 4.608896225148621e-06, + "loss": 0.1421, + "step": 3797 + }, + { + "epoch": 0.6153596889176928, + "grad_norm": 1.078444004058838, + "learning_rate": 4.608661344268185e-06, + "loss": 0.1863, + "step": 3798 + }, + { + "epoch": 0.6155217109526896, + "grad_norm": 0.7994515299797058, + "learning_rate": 4.608426398867815e-06, + "loss": 0.1162, + "step": 3799 + }, + { + "epoch": 0.6156837329876863, + "grad_norm": 0.8549524545669556, + "learning_rate": 4.608191388954699e-06, + "loss": 0.1416, + "step": 3800 + }, + { + "epoch": 0.6158457550226831, + "grad_norm": 0.8880654573440552, + "learning_rate": 4.607956314536029e-06, + "loss": 0.1308, + "step": 3801 + }, + { + "epoch": 0.6160077770576798, + "grad_norm": 0.9206951856613159, + "learning_rate": 4.607721175618997e-06, + "loss": 0.1543, + "step": 3802 + }, + { + "epoch": 0.6161697990926766, + "grad_norm": 1.0166141986846924, + "learning_rate": 4.607485972210797e-06, + "loss": 0.1533, + "step": 3803 + }, + { + "epoch": 0.6163318211276734, + "grad_norm": 0.9442442655563354, + "learning_rate": 4.6072507043186265e-06, + "loss": 0.1442, + "step": 3804 + }, + { + "epoch": 0.6164938431626701, + "grad_norm": 1.1184486150741577, + "learning_rate": 4.607015371949683e-06, + "loss": 0.158, + "step": 3805 + }, + { + "epoch": 0.6166558651976669, + "grad_norm": 0.9163842797279358, + "learning_rate": 4.60677997511117e-06, + "loss": 0.142, + "step": 3806 + }, + { + "epoch": 0.6168178872326636, + "grad_norm": 0.8781442642211914, + "learning_rate": 4.606544513810287e-06, + "loss": 0.1387, + "step": 3807 + }, + { + "epoch": 0.6169799092676604, + "grad_norm": 0.9204709529876709, + "learning_rate": 4.606308988054239e-06, + "loss": 0.1549, + "step": 3808 + }, + { + "epoch": 0.6171419313026572, + "grad_norm": 0.9622353315353394, + "learning_rate": 4.606073397850234e-06, + "loss": 0.1534, + "step": 3809 + }, + { + "epoch": 0.6173039533376539, + "grad_norm": 1.0953677892684937, + "learning_rate": 4.605837743205479e-06, + "loss": 0.144, + "step": 3810 + }, + { + "epoch": 0.6174659753726507, + "grad_norm": 0.8764045834541321, + "learning_rate": 4.6056020241271855e-06, + "loss": 0.1359, + "step": 3811 + }, + { + "epoch": 0.6176279974076474, + "grad_norm": 0.9213500022888184, + "learning_rate": 4.605366240622565e-06, + "loss": 0.1521, + "step": 3812 + }, + { + "epoch": 0.6177900194426442, + "grad_norm": 0.9212419986724854, + "learning_rate": 4.605130392698833e-06, + "loss": 0.1425, + "step": 3813 + }, + { + "epoch": 0.6179520414776409, + "grad_norm": 0.9497610330581665, + "learning_rate": 4.604894480363205e-06, + "loss": 0.1536, + "step": 3814 + }, + { + "epoch": 0.6181140635126378, + "grad_norm": 0.9440318942070007, + "learning_rate": 4.6046585036229005e-06, + "loss": 0.1481, + "step": 3815 + }, + { + "epoch": 0.6182760855476345, + "grad_norm": 0.8853862881660461, + "learning_rate": 4.604422462485138e-06, + "loss": 0.141, + "step": 3816 + }, + { + "epoch": 0.6184381075826313, + "grad_norm": 0.9499521851539612, + "learning_rate": 4.604186356957141e-06, + "loss": 0.147, + "step": 3817 + }, + { + "epoch": 0.618600129617628, + "grad_norm": 0.8915985822677612, + "learning_rate": 4.603950187046134e-06, + "loss": 0.1515, + "step": 3818 + }, + { + "epoch": 0.6187621516526247, + "grad_norm": 0.9126549959182739, + "learning_rate": 4.6037139527593424e-06, + "loss": 0.1505, + "step": 3819 + }, + { + "epoch": 0.6189241736876215, + "grad_norm": 1.0714317560195923, + "learning_rate": 4.603477654103994e-06, + "loss": 0.171, + "step": 3820 + }, + { + "epoch": 0.6190861957226182, + "grad_norm": 1.0061242580413818, + "learning_rate": 4.60324129108732e-06, + "loss": 0.1606, + "step": 3821 + }, + { + "epoch": 0.6192482177576151, + "grad_norm": 1.0254570245742798, + "learning_rate": 4.603004863716553e-06, + "loss": 0.177, + "step": 3822 + }, + { + "epoch": 0.6194102397926118, + "grad_norm": 0.7951987981796265, + "learning_rate": 4.602768371998925e-06, + "loss": 0.1198, + "step": 3823 + }, + { + "epoch": 0.6195722618276086, + "grad_norm": 0.9415546655654907, + "learning_rate": 4.602531815941676e-06, + "loss": 0.1581, + "step": 3824 + }, + { + "epoch": 0.6197342838626053, + "grad_norm": 0.9311978220939636, + "learning_rate": 4.602295195552039e-06, + "loss": 0.1474, + "step": 3825 + }, + { + "epoch": 0.619896305897602, + "grad_norm": 0.8883768320083618, + "learning_rate": 4.602058510837257e-06, + "loss": 0.1383, + "step": 3826 + }, + { + "epoch": 0.6200583279325989, + "grad_norm": 1.01768958568573, + "learning_rate": 4.601821761804572e-06, + "loss": 0.1721, + "step": 3827 + }, + { + "epoch": 0.6202203499675956, + "grad_norm": 0.9163593053817749, + "learning_rate": 4.6015849484612265e-06, + "loss": 0.1258, + "step": 3828 + }, + { + "epoch": 0.6203823720025924, + "grad_norm": 0.9427400231361389, + "learning_rate": 4.601348070814468e-06, + "loss": 0.1308, + "step": 3829 + }, + { + "epoch": 0.6205443940375891, + "grad_norm": 1.179629921913147, + "learning_rate": 4.601111128871544e-06, + "loss": 0.1838, + "step": 3830 + }, + { + "epoch": 0.6207064160725859, + "grad_norm": 1.0116398334503174, + "learning_rate": 4.600874122639703e-06, + "loss": 0.147, + "step": 3831 + }, + { + "epoch": 0.6208684381075826, + "grad_norm": 0.8127241134643555, + "learning_rate": 4.600637052126199e-06, + "loss": 0.1333, + "step": 3832 + }, + { + "epoch": 0.6210304601425793, + "grad_norm": 0.9073969125747681, + "learning_rate": 4.600399917338284e-06, + "loss": 0.1369, + "step": 3833 + }, + { + "epoch": 0.6211924821775762, + "grad_norm": 0.8607815504074097, + "learning_rate": 4.600162718283215e-06, + "loss": 0.1357, + "step": 3834 + }, + { + "epoch": 0.6213545042125729, + "grad_norm": 0.9723119735717773, + "learning_rate": 4.5999254549682484e-06, + "loss": 0.1456, + "step": 3835 + }, + { + "epoch": 0.6215165262475697, + "grad_norm": 0.9572594165802002, + "learning_rate": 4.599688127400645e-06, + "loss": 0.1548, + "step": 3836 + }, + { + "epoch": 0.6216785482825664, + "grad_norm": 0.9637323021888733, + "learning_rate": 4.599450735587666e-06, + "loss": 0.1533, + "step": 3837 + }, + { + "epoch": 0.6218405703175632, + "grad_norm": 0.9875199794769287, + "learning_rate": 4.599213279536575e-06, + "loss": 0.159, + "step": 3838 + }, + { + "epoch": 0.62200259235256, + "grad_norm": 0.9891331195831299, + "learning_rate": 4.598975759254638e-06, + "loss": 0.1594, + "step": 3839 + }, + { + "epoch": 0.6221646143875567, + "grad_norm": 0.941134512424469, + "learning_rate": 4.598738174749121e-06, + "loss": 0.1466, + "step": 3840 + }, + { + "epoch": 0.6223266364225535, + "grad_norm": 0.8841784596443176, + "learning_rate": 4.598500526027296e-06, + "loss": 0.1571, + "step": 3841 + }, + { + "epoch": 0.6224886584575502, + "grad_norm": 0.9083060622215271, + "learning_rate": 4.598262813096432e-06, + "loss": 0.1543, + "step": 3842 + }, + { + "epoch": 0.622650680492547, + "grad_norm": 0.8198730945587158, + "learning_rate": 4.598025035963805e-06, + "loss": 0.1286, + "step": 3843 + }, + { + "epoch": 0.6228127025275437, + "grad_norm": 0.9744411706924438, + "learning_rate": 4.597787194636688e-06, + "loss": 0.1523, + "step": 3844 + }, + { + "epoch": 0.6229747245625405, + "grad_norm": 0.8268557190895081, + "learning_rate": 4.597549289122361e-06, + "loss": 0.1294, + "step": 3845 + }, + { + "epoch": 0.6231367465975373, + "grad_norm": 1.0291190147399902, + "learning_rate": 4.597311319428099e-06, + "loss": 0.1556, + "step": 3846 + }, + { + "epoch": 0.623298768632534, + "grad_norm": 0.8949023485183716, + "learning_rate": 4.597073285561188e-06, + "loss": 0.1354, + "step": 3847 + }, + { + "epoch": 0.6234607906675308, + "grad_norm": 0.9648371338844299, + "learning_rate": 4.596835187528908e-06, + "loss": 0.1517, + "step": 3848 + }, + { + "epoch": 0.6236228127025275, + "grad_norm": 1.00344717502594, + "learning_rate": 4.596597025338547e-06, + "loss": 0.1659, + "step": 3849 + }, + { + "epoch": 0.6237848347375243, + "grad_norm": 0.8660650849342346, + "learning_rate": 4.59635879899739e-06, + "loss": 0.1356, + "step": 3850 + }, + { + "epoch": 0.623946856772521, + "grad_norm": 1.0011287927627563, + "learning_rate": 4.596120508512727e-06, + "loss": 0.1452, + "step": 3851 + }, + { + "epoch": 0.6241088788075179, + "grad_norm": 0.8602931499481201, + "learning_rate": 4.595882153891849e-06, + "loss": 0.1315, + "step": 3852 + }, + { + "epoch": 0.6242709008425146, + "grad_norm": 0.8699659705162048, + "learning_rate": 4.595643735142049e-06, + "loss": 0.1395, + "step": 3853 + }, + { + "epoch": 0.6244329228775113, + "grad_norm": 0.782909631729126, + "learning_rate": 4.595405252270622e-06, + "loss": 0.1228, + "step": 3854 + }, + { + "epoch": 0.6245949449125081, + "grad_norm": 0.9927259683609009, + "learning_rate": 4.595166705284864e-06, + "loss": 0.1533, + "step": 3855 + }, + { + "epoch": 0.6247569669475048, + "grad_norm": 0.9355255365371704, + "learning_rate": 4.594928094192076e-06, + "loss": 0.1478, + "step": 3856 + }, + { + "epoch": 0.6249189889825016, + "grad_norm": 0.9448959231376648, + "learning_rate": 4.594689418999558e-06, + "loss": 0.1456, + "step": 3857 + }, + { + "epoch": 0.6250810110174984, + "grad_norm": 0.8999738097190857, + "learning_rate": 4.594450679714613e-06, + "loss": 0.144, + "step": 3858 + }, + { + "epoch": 0.6252430330524952, + "grad_norm": 0.852476954460144, + "learning_rate": 4.594211876344545e-06, + "loss": 0.1304, + "step": 3859 + }, + { + "epoch": 0.6254050550874919, + "grad_norm": 0.8585977554321289, + "learning_rate": 4.593973008896662e-06, + "loss": 0.14, + "step": 3860 + }, + { + "epoch": 0.6255670771224887, + "grad_norm": 0.8411082625389099, + "learning_rate": 4.593734077378273e-06, + "loss": 0.1301, + "step": 3861 + }, + { + "epoch": 0.6257290991574854, + "grad_norm": 0.9660947322845459, + "learning_rate": 4.593495081796686e-06, + "loss": 0.1478, + "step": 3862 + }, + { + "epoch": 0.6258911211924821, + "grad_norm": 0.9528719186782837, + "learning_rate": 4.593256022159217e-06, + "loss": 0.141, + "step": 3863 + }, + { + "epoch": 0.626053143227479, + "grad_norm": 0.8987707495689392, + "learning_rate": 4.59301689847318e-06, + "loss": 0.1377, + "step": 3864 + }, + { + "epoch": 0.6262151652624757, + "grad_norm": 0.8041302561759949, + "learning_rate": 4.592777710745889e-06, + "loss": 0.1266, + "step": 3865 + }, + { + "epoch": 0.6263771872974725, + "grad_norm": 0.9248653650283813, + "learning_rate": 4.592538458984666e-06, + "loss": 0.1373, + "step": 3866 + }, + { + "epoch": 0.6265392093324692, + "grad_norm": 0.9307569265365601, + "learning_rate": 4.592299143196829e-06, + "loss": 0.1531, + "step": 3867 + }, + { + "epoch": 0.626701231367466, + "grad_norm": 1.1055402755737305, + "learning_rate": 4.5920597633897015e-06, + "loss": 0.1396, + "step": 3868 + }, + { + "epoch": 0.6268632534024627, + "grad_norm": 0.9206196069717407, + "learning_rate": 4.591820319570609e-06, + "loss": 0.1401, + "step": 3869 + }, + { + "epoch": 0.6270252754374595, + "grad_norm": 0.8944006562232971, + "learning_rate": 4.5915808117468766e-06, + "loss": 0.1455, + "step": 3870 + }, + { + "epoch": 0.6271872974724563, + "grad_norm": 0.87538743019104, + "learning_rate": 4.591341239925831e-06, + "loss": 0.1393, + "step": 3871 + }, + { + "epoch": 0.627349319507453, + "grad_norm": 1.0033069849014282, + "learning_rate": 4.591101604114807e-06, + "loss": 0.1531, + "step": 3872 + }, + { + "epoch": 0.6275113415424498, + "grad_norm": 1.0902032852172852, + "learning_rate": 4.590861904321133e-06, + "loss": 0.1743, + "step": 3873 + }, + { + "epoch": 0.6276733635774465, + "grad_norm": 1.0226129293441772, + "learning_rate": 4.590622140552144e-06, + "loss": 0.1443, + "step": 3874 + }, + { + "epoch": 0.6278353856124433, + "grad_norm": 0.9415045380592346, + "learning_rate": 4.590382312815178e-06, + "loss": 0.1412, + "step": 3875 + }, + { + "epoch": 0.62799740764744, + "grad_norm": 1.0171313285827637, + "learning_rate": 4.5901424211175715e-06, + "loss": 0.1658, + "step": 3876 + }, + { + "epoch": 0.6281594296824368, + "grad_norm": 0.9125556945800781, + "learning_rate": 4.589902465466665e-06, + "loss": 0.1417, + "step": 3877 + }, + { + "epoch": 0.6283214517174336, + "grad_norm": 0.9008194804191589, + "learning_rate": 4.5896624458698e-06, + "loss": 0.1405, + "step": 3878 + }, + { + "epoch": 0.6284834737524303, + "grad_norm": 0.9593300819396973, + "learning_rate": 4.589422362334321e-06, + "loss": 0.1403, + "step": 3879 + }, + { + "epoch": 0.6286454957874271, + "grad_norm": 0.9999030232429504, + "learning_rate": 4.5891822148675745e-06, + "loss": 0.1457, + "step": 3880 + }, + { + "epoch": 0.6288075178224238, + "grad_norm": 0.9375879764556885, + "learning_rate": 4.588942003476907e-06, + "loss": 0.1425, + "step": 3881 + }, + { + "epoch": 0.6289695398574207, + "grad_norm": 0.8549643754959106, + "learning_rate": 4.588701728169671e-06, + "loss": 0.1354, + "step": 3882 + }, + { + "epoch": 0.6291315618924174, + "grad_norm": 0.9586682915687561, + "learning_rate": 4.588461388953216e-06, + "loss": 0.1707, + "step": 3883 + }, + { + "epoch": 0.6292935839274141, + "grad_norm": 0.8284324407577515, + "learning_rate": 4.5882209858348956e-06, + "loss": 0.1326, + "step": 3884 + }, + { + "epoch": 0.6294556059624109, + "grad_norm": 0.9033730030059814, + "learning_rate": 4.587980518822067e-06, + "loss": 0.1452, + "step": 3885 + }, + { + "epoch": 0.6296176279974076, + "grad_norm": 0.9468452334403992, + "learning_rate": 4.587739987922087e-06, + "loss": 0.1466, + "step": 3886 + }, + { + "epoch": 0.6297796500324044, + "grad_norm": 0.97198885679245, + "learning_rate": 4.587499393142316e-06, + "loss": 0.1459, + "step": 3887 + }, + { + "epoch": 0.6299416720674011, + "grad_norm": 1.0194365978240967, + "learning_rate": 4.587258734490115e-06, + "loss": 0.1597, + "step": 3888 + }, + { + "epoch": 0.630103694102398, + "grad_norm": 0.9163743257522583, + "learning_rate": 4.587018011972848e-06, + "loss": 0.1414, + "step": 3889 + }, + { + "epoch": 0.6302657161373947, + "grad_norm": 0.7911040186882019, + "learning_rate": 4.586777225597881e-06, + "loss": 0.1259, + "step": 3890 + }, + { + "epoch": 0.6304277381723914, + "grad_norm": 0.9265354871749878, + "learning_rate": 4.58653637537258e-06, + "loss": 0.1403, + "step": 3891 + }, + { + "epoch": 0.6305897602073882, + "grad_norm": 0.9925926923751831, + "learning_rate": 4.586295461304315e-06, + "loss": 0.1521, + "step": 3892 + }, + { + "epoch": 0.6307517822423849, + "grad_norm": 1.0709103345870972, + "learning_rate": 4.586054483400459e-06, + "loss": 0.1772, + "step": 3893 + }, + { + "epoch": 0.6309138042773818, + "grad_norm": 0.9420156478881836, + "learning_rate": 4.585813441668383e-06, + "loss": 0.1495, + "step": 3894 + }, + { + "epoch": 0.6310758263123785, + "grad_norm": 0.840775728225708, + "learning_rate": 4.585572336115463e-06, + "loss": 0.1301, + "step": 3895 + }, + { + "epoch": 0.6312378483473753, + "grad_norm": 0.8908707499504089, + "learning_rate": 4.585331166749077e-06, + "loss": 0.1455, + "step": 3896 + }, + { + "epoch": 0.631399870382372, + "grad_norm": 0.9626536965370178, + "learning_rate": 4.5850899335766034e-06, + "loss": 0.1543, + "step": 3897 + }, + { + "epoch": 0.6315618924173687, + "grad_norm": 0.8213000297546387, + "learning_rate": 4.584848636605423e-06, + "loss": 0.133, + "step": 3898 + }, + { + "epoch": 0.6317239144523655, + "grad_norm": 0.9156107306480408, + "learning_rate": 4.584607275842921e-06, + "loss": 0.1383, + "step": 3899 + }, + { + "epoch": 0.6318859364873622, + "grad_norm": 0.9265947341918945, + "learning_rate": 4.58436585129648e-06, + "loss": 0.1336, + "step": 3900 + }, + { + "epoch": 0.6320479585223591, + "grad_norm": 0.9135920405387878, + "learning_rate": 4.584124362973488e-06, + "loss": 0.1347, + "step": 3901 + }, + { + "epoch": 0.6322099805573558, + "grad_norm": 1.0750758647918701, + "learning_rate": 4.583882810881334e-06, + "loss": 0.1476, + "step": 3902 + }, + { + "epoch": 0.6323720025923526, + "grad_norm": 1.0379818677902222, + "learning_rate": 4.583641195027409e-06, + "loss": 0.1563, + "step": 3903 + }, + { + "epoch": 0.6325340246273493, + "grad_norm": 0.9261550903320312, + "learning_rate": 4.583399515419106e-06, + "loss": 0.139, + "step": 3904 + }, + { + "epoch": 0.6326960466623461, + "grad_norm": 0.9920796155929565, + "learning_rate": 4.58315777206382e-06, + "loss": 0.1358, + "step": 3905 + }, + { + "epoch": 0.6328580686973428, + "grad_norm": 0.9471383690834045, + "learning_rate": 4.582915964968946e-06, + "loss": 0.1418, + "step": 3906 + }, + { + "epoch": 0.6330200907323396, + "grad_norm": 1.0869243144989014, + "learning_rate": 4.582674094141885e-06, + "loss": 0.166, + "step": 3907 + }, + { + "epoch": 0.6331821127673364, + "grad_norm": 0.9842541813850403, + "learning_rate": 4.582432159590037e-06, + "loss": 0.1565, + "step": 3908 + }, + { + "epoch": 0.6333441348023331, + "grad_norm": 1.0878725051879883, + "learning_rate": 4.582190161320803e-06, + "loss": 0.173, + "step": 3909 + }, + { + "epoch": 0.6335061568373299, + "grad_norm": 0.9759945869445801, + "learning_rate": 4.58194809934159e-06, + "loss": 0.1576, + "step": 3910 + }, + { + "epoch": 0.6336681788723266, + "grad_norm": 0.9469559192657471, + "learning_rate": 4.581705973659803e-06, + "loss": 0.1445, + "step": 3911 + }, + { + "epoch": 0.6338302009073234, + "grad_norm": 0.9183310866355896, + "learning_rate": 4.5814637842828506e-06, + "loss": 0.1411, + "step": 3912 + }, + { + "epoch": 0.6339922229423202, + "grad_norm": 1.0365331172943115, + "learning_rate": 4.581221531218144e-06, + "loss": 0.154, + "step": 3913 + }, + { + "epoch": 0.6341542449773169, + "grad_norm": 0.9276525974273682, + "learning_rate": 4.580979214473095e-06, + "loss": 0.1452, + "step": 3914 + }, + { + "epoch": 0.6343162670123137, + "grad_norm": 0.9412997961044312, + "learning_rate": 4.580736834055117e-06, + "loss": 0.1418, + "step": 3915 + }, + { + "epoch": 0.6344782890473104, + "grad_norm": 0.906779408454895, + "learning_rate": 4.580494389971628e-06, + "loss": 0.152, + "step": 3916 + }, + { + "epoch": 0.6346403110823072, + "grad_norm": 0.8911001682281494, + "learning_rate": 4.580251882230045e-06, + "loss": 0.1523, + "step": 3917 + }, + { + "epoch": 0.6348023331173039, + "grad_norm": 0.9510197639465332, + "learning_rate": 4.580009310837789e-06, + "loss": 0.1451, + "step": 3918 + }, + { + "epoch": 0.6349643551523008, + "grad_norm": 0.9477640986442566, + "learning_rate": 4.579766675802281e-06, + "loss": 0.1485, + "step": 3919 + }, + { + "epoch": 0.6351263771872975, + "grad_norm": 0.9456864595413208, + "learning_rate": 4.579523977130946e-06, + "loss": 0.1489, + "step": 3920 + }, + { + "epoch": 0.6352883992222942, + "grad_norm": 0.8579930663108826, + "learning_rate": 4.57928121483121e-06, + "loss": 0.1462, + "step": 3921 + }, + { + "epoch": 0.635450421257291, + "grad_norm": 0.8796806931495667, + "learning_rate": 4.579038388910499e-06, + "loss": 0.1349, + "step": 3922 + }, + { + "epoch": 0.6356124432922877, + "grad_norm": 0.9751520156860352, + "learning_rate": 4.578795499376246e-06, + "loss": 0.1583, + "step": 3923 + }, + { + "epoch": 0.6357744653272845, + "grad_norm": 0.9027172327041626, + "learning_rate": 4.578552546235882e-06, + "loss": 0.1483, + "step": 3924 + }, + { + "epoch": 0.6359364873622813, + "grad_norm": 0.9351487159729004, + "learning_rate": 4.578309529496839e-06, + "loss": 0.1398, + "step": 3925 + }, + { + "epoch": 0.6360985093972781, + "grad_norm": 0.99167400598526, + "learning_rate": 4.578066449166554e-06, + "loss": 0.172, + "step": 3926 + }, + { + "epoch": 0.6362605314322748, + "grad_norm": 1.0527704954147339, + "learning_rate": 4.577823305252464e-06, + "loss": 0.1422, + "step": 3927 + }, + { + "epoch": 0.6364225534672715, + "grad_norm": 0.8460220694541931, + "learning_rate": 4.57758009776201e-06, + "loss": 0.1374, + "step": 3928 + }, + { + "epoch": 0.6365845755022683, + "grad_norm": 0.8465033769607544, + "learning_rate": 4.577336826702631e-06, + "loss": 0.1408, + "step": 3929 + }, + { + "epoch": 0.636746597537265, + "grad_norm": 0.9382408261299133, + "learning_rate": 4.577093492081774e-06, + "loss": 0.1412, + "step": 3930 + }, + { + "epoch": 0.6369086195722619, + "grad_norm": 0.9168792366981506, + "learning_rate": 4.576850093906881e-06, + "loss": 0.143, + "step": 3931 + }, + { + "epoch": 0.6370706416072586, + "grad_norm": 0.8806384205818176, + "learning_rate": 4.576606632185403e-06, + "loss": 0.1321, + "step": 3932 + }, + { + "epoch": 0.6372326636422554, + "grad_norm": 0.8602378964424133, + "learning_rate": 4.576363106924785e-06, + "loss": 0.122, + "step": 3933 + }, + { + "epoch": 0.6373946856772521, + "grad_norm": 0.9462723135948181, + "learning_rate": 4.576119518132483e-06, + "loss": 0.1421, + "step": 3934 + }, + { + "epoch": 0.6375567077122488, + "grad_norm": 0.9258326292037964, + "learning_rate": 4.5758758658159465e-06, + "loss": 0.1483, + "step": 3935 + }, + { + "epoch": 0.6377187297472456, + "grad_norm": 0.9369986653327942, + "learning_rate": 4.575632149982631e-06, + "loss": 0.1457, + "step": 3936 + }, + { + "epoch": 0.6378807517822424, + "grad_norm": 0.8889022469520569, + "learning_rate": 4.575388370639997e-06, + "loss": 0.1267, + "step": 3937 + }, + { + "epoch": 0.6380427738172392, + "grad_norm": 0.9667481780052185, + "learning_rate": 4.5751445277955e-06, + "loss": 0.1417, + "step": 3938 + }, + { + "epoch": 0.6382047958522359, + "grad_norm": 0.9078060388565063, + "learning_rate": 4.574900621456602e-06, + "loss": 0.1371, + "step": 3939 + }, + { + "epoch": 0.6383668178872327, + "grad_norm": 0.9612594246864319, + "learning_rate": 4.574656651630767e-06, + "loss": 0.1446, + "step": 3940 + }, + { + "epoch": 0.6385288399222294, + "grad_norm": 0.9944157004356384, + "learning_rate": 4.574412618325458e-06, + "loss": 0.146, + "step": 3941 + }, + { + "epoch": 0.6386908619572261, + "grad_norm": 0.9619837999343872, + "learning_rate": 4.574168521548144e-06, + "loss": 0.1485, + "step": 3942 + }, + { + "epoch": 0.638852883992223, + "grad_norm": 1.0251742601394653, + "learning_rate": 4.5739243613062915e-06, + "loss": 0.1626, + "step": 3943 + }, + { + "epoch": 0.6390149060272197, + "grad_norm": 1.0261211395263672, + "learning_rate": 4.573680137607373e-06, + "loss": 0.1482, + "step": 3944 + }, + { + "epoch": 0.6391769280622165, + "grad_norm": 0.9001795053482056, + "learning_rate": 4.57343585045886e-06, + "loss": 0.1326, + "step": 3945 + }, + { + "epoch": 0.6393389500972132, + "grad_norm": 1.0013598203659058, + "learning_rate": 4.573191499868228e-06, + "loss": 0.1509, + "step": 3946 + }, + { + "epoch": 0.63950097213221, + "grad_norm": 1.0281734466552734, + "learning_rate": 4.572947085842952e-06, + "loss": 0.1414, + "step": 3947 + }, + { + "epoch": 0.6396629941672067, + "grad_norm": 0.9392046928405762, + "learning_rate": 4.572702608390513e-06, + "loss": 0.1421, + "step": 3948 + }, + { + "epoch": 0.6398250162022034, + "grad_norm": 0.9891103506088257, + "learning_rate": 4.57245806751839e-06, + "loss": 0.1592, + "step": 3949 + }, + { + "epoch": 0.6399870382372003, + "grad_norm": 0.8271520137786865, + "learning_rate": 4.572213463234065e-06, + "loss": 0.1301, + "step": 3950 + }, + { + "epoch": 0.640149060272197, + "grad_norm": 0.8745349645614624, + "learning_rate": 4.571968795545023e-06, + "loss": 0.1386, + "step": 3951 + }, + { + "epoch": 0.6403110823071938, + "grad_norm": 0.9697176218032837, + "learning_rate": 4.5717240644587495e-06, + "loss": 0.1628, + "step": 3952 + }, + { + "epoch": 0.6404731043421905, + "grad_norm": 1.1860628128051758, + "learning_rate": 4.571479269982734e-06, + "loss": 0.1444, + "step": 3953 + }, + { + "epoch": 0.6406351263771873, + "grad_norm": 0.9648879766464233, + "learning_rate": 4.571234412124464e-06, + "loss": 0.1604, + "step": 3954 + }, + { + "epoch": 0.640797148412184, + "grad_norm": 0.9528911709785461, + "learning_rate": 4.570989490891434e-06, + "loss": 0.1569, + "step": 3955 + }, + { + "epoch": 0.6409591704471809, + "grad_norm": 0.8974754214286804, + "learning_rate": 4.570744506291138e-06, + "loss": 0.1485, + "step": 3956 + }, + { + "epoch": 0.6411211924821776, + "grad_norm": 0.8654069304466248, + "learning_rate": 4.570499458331071e-06, + "loss": 0.14, + "step": 3957 + }, + { + "epoch": 0.6412832145171743, + "grad_norm": 0.906300961971283, + "learning_rate": 4.570254347018731e-06, + "loss": 0.1508, + "step": 3958 + }, + { + "epoch": 0.6414452365521711, + "grad_norm": 1.011763572692871, + "learning_rate": 4.570009172361617e-06, + "loss": 0.1649, + "step": 3959 + }, + { + "epoch": 0.6416072585871678, + "grad_norm": 0.956451952457428, + "learning_rate": 4.5697639343672325e-06, + "loss": 0.1585, + "step": 3960 + }, + { + "epoch": 0.6417692806221647, + "grad_norm": 0.8895651698112488, + "learning_rate": 4.569518633043081e-06, + "loss": 0.1425, + "step": 3961 + }, + { + "epoch": 0.6419313026571614, + "grad_norm": 0.922642171382904, + "learning_rate": 4.569273268396667e-06, + "loss": 0.1554, + "step": 3962 + }, + { + "epoch": 0.6420933246921582, + "grad_norm": 0.8361677527427673, + "learning_rate": 4.569027840435498e-06, + "loss": 0.1271, + "step": 3963 + }, + { + "epoch": 0.6422553467271549, + "grad_norm": 0.762519121170044, + "learning_rate": 4.568782349167084e-06, + "loss": 0.1148, + "step": 3964 + }, + { + "epoch": 0.6424173687621516, + "grad_norm": 1.0175034999847412, + "learning_rate": 4.568536794598937e-06, + "loss": 0.1408, + "step": 3965 + }, + { + "epoch": 0.6425793907971484, + "grad_norm": 1.0413769483566284, + "learning_rate": 4.56829117673857e-06, + "loss": 0.17, + "step": 3966 + }, + { + "epoch": 0.6427414128321451, + "grad_norm": 0.8488859534263611, + "learning_rate": 4.568045495593497e-06, + "loss": 0.1435, + "step": 3967 + }, + { + "epoch": 0.642903434867142, + "grad_norm": 0.8258789777755737, + "learning_rate": 4.567799751171237e-06, + "loss": 0.1261, + "step": 3968 + }, + { + "epoch": 0.6430654569021387, + "grad_norm": 0.8980733752250671, + "learning_rate": 4.567553943479309e-06, + "loss": 0.1372, + "step": 3969 + }, + { + "epoch": 0.6432274789371355, + "grad_norm": 0.8406203389167786, + "learning_rate": 4.567308072525233e-06, + "loss": 0.122, + "step": 3970 + }, + { + "epoch": 0.6433895009721322, + "grad_norm": 0.9138239622116089, + "learning_rate": 4.567062138316534e-06, + "loss": 0.1483, + "step": 3971 + }, + { + "epoch": 0.6435515230071289, + "grad_norm": 0.9522116780281067, + "learning_rate": 4.566816140860735e-06, + "loss": 0.1437, + "step": 3972 + }, + { + "epoch": 0.6437135450421257, + "grad_norm": 0.8937114477157593, + "learning_rate": 4.566570080165363e-06, + "loss": 0.1242, + "step": 3973 + }, + { + "epoch": 0.6438755670771225, + "grad_norm": 0.9830919504165649, + "learning_rate": 4.566323956237948e-06, + "loss": 0.1407, + "step": 3974 + }, + { + "epoch": 0.6440375891121193, + "grad_norm": 1.0365149974822998, + "learning_rate": 4.566077769086022e-06, + "loss": 0.1556, + "step": 3975 + }, + { + "epoch": 0.644199611147116, + "grad_norm": 0.9378160238265991, + "learning_rate": 4.565831518717114e-06, + "loss": 0.1469, + "step": 3976 + }, + { + "epoch": 0.6443616331821128, + "grad_norm": 0.9110143184661865, + "learning_rate": 4.565585205138761e-06, + "loss": 0.1368, + "step": 3977 + }, + { + "epoch": 0.6445236552171095, + "grad_norm": 0.936230480670929, + "learning_rate": 4.5653388283585e-06, + "loss": 0.1328, + "step": 3978 + }, + { + "epoch": 0.6446856772521062, + "grad_norm": 1.0089045763015747, + "learning_rate": 4.565092388383869e-06, + "loss": 0.1427, + "step": 3979 + }, + { + "epoch": 0.6448476992871031, + "grad_norm": 0.9380981922149658, + "learning_rate": 4.564845885222407e-06, + "loss": 0.1466, + "step": 3980 + }, + { + "epoch": 0.6450097213220998, + "grad_norm": 0.9660612344741821, + "learning_rate": 4.564599318881659e-06, + "loss": 0.1387, + "step": 3981 + }, + { + "epoch": 0.6451717433570966, + "grad_norm": 0.8815035223960876, + "learning_rate": 4.564352689369168e-06, + "loss": 0.1407, + "step": 3982 + }, + { + "epoch": 0.6453337653920933, + "grad_norm": 0.8399519324302673, + "learning_rate": 4.56410599669248e-06, + "loss": 0.1371, + "step": 3983 + }, + { + "epoch": 0.6454957874270901, + "grad_norm": 0.965580403804779, + "learning_rate": 4.563859240859144e-06, + "loss": 0.1474, + "step": 3984 + }, + { + "epoch": 0.6456578094620868, + "grad_norm": 0.9897104501724243, + "learning_rate": 4.5636124218767095e-06, + "loss": 0.1633, + "step": 3985 + }, + { + "epoch": 0.6458198314970836, + "grad_norm": 1.1038873195648193, + "learning_rate": 4.563365539752728e-06, + "loss": 0.1622, + "step": 3986 + }, + { + "epoch": 0.6459818535320804, + "grad_norm": 0.9341737031936646, + "learning_rate": 4.563118594494755e-06, + "loss": 0.1524, + "step": 3987 + }, + { + "epoch": 0.6461438755670771, + "grad_norm": 1.0048877000808716, + "learning_rate": 4.5628715861103455e-06, + "loss": 0.1549, + "step": 3988 + }, + { + "epoch": 0.6463058976020739, + "grad_norm": 0.8541507124900818, + "learning_rate": 4.562624514607058e-06, + "loss": 0.1366, + "step": 3989 + }, + { + "epoch": 0.6464679196370706, + "grad_norm": 0.8617687821388245, + "learning_rate": 4.562377379992451e-06, + "loss": 0.1352, + "step": 3990 + }, + { + "epoch": 0.6466299416720674, + "grad_norm": 0.8900352120399475, + "learning_rate": 4.5621301822740875e-06, + "loss": 0.1451, + "step": 3991 + }, + { + "epoch": 0.6467919637070642, + "grad_norm": 0.917232871055603, + "learning_rate": 4.56188292145953e-06, + "loss": 0.1397, + "step": 3992 + }, + { + "epoch": 0.6469539857420609, + "grad_norm": 0.8828876614570618, + "learning_rate": 4.5616355975563456e-06, + "loss": 0.1346, + "step": 3993 + }, + { + "epoch": 0.6471160077770577, + "grad_norm": 1.0017690658569336, + "learning_rate": 4.561388210572101e-06, + "loss": 0.1667, + "step": 3994 + }, + { + "epoch": 0.6472780298120544, + "grad_norm": 0.9873506426811218, + "learning_rate": 4.561140760514365e-06, + "loss": 0.1538, + "step": 3995 + }, + { + "epoch": 0.6474400518470512, + "grad_norm": 0.9482616782188416, + "learning_rate": 4.56089324739071e-06, + "loss": 0.1576, + "step": 3996 + }, + { + "epoch": 0.6476020738820479, + "grad_norm": 0.8878632187843323, + "learning_rate": 4.560645671208709e-06, + "loss": 0.1395, + "step": 3997 + }, + { + "epoch": 0.6477640959170448, + "grad_norm": 1.057747483253479, + "learning_rate": 4.560398031975937e-06, + "loss": 0.1718, + "step": 3998 + }, + { + "epoch": 0.6479261179520415, + "grad_norm": 0.8953585028648376, + "learning_rate": 4.560150329699971e-06, + "loss": 0.1451, + "step": 3999 + }, + { + "epoch": 0.6480881399870383, + "grad_norm": 0.900346577167511, + "learning_rate": 4.55990256438839e-06, + "loss": 0.1447, + "step": 4000 + }, + { + "epoch": 0.648250162022035, + "grad_norm": 0.8216709494590759, + "learning_rate": 4.559654736048776e-06, + "loss": 0.1327, + "step": 4001 + }, + { + "epoch": 0.6484121840570317, + "grad_norm": 0.850792646408081, + "learning_rate": 4.559406844688711e-06, + "loss": 0.1341, + "step": 4002 + }, + { + "epoch": 0.6485742060920285, + "grad_norm": 0.9442715644836426, + "learning_rate": 4.5591588903157816e-06, + "loss": 0.1442, + "step": 4003 + }, + { + "epoch": 0.6487362281270252, + "grad_norm": 0.8908452391624451, + "learning_rate": 4.558910872937572e-06, + "loss": 0.1427, + "step": 4004 + }, + { + "epoch": 0.6488982501620221, + "grad_norm": 0.9406809210777283, + "learning_rate": 4.558662792561672e-06, + "loss": 0.1616, + "step": 4005 + }, + { + "epoch": 0.6490602721970188, + "grad_norm": 0.8570629954338074, + "learning_rate": 4.558414649195673e-06, + "loss": 0.1443, + "step": 4006 + }, + { + "epoch": 0.6492222942320156, + "grad_norm": 0.8232815861701965, + "learning_rate": 4.558166442847166e-06, + "loss": 0.1351, + "step": 4007 + }, + { + "epoch": 0.6493843162670123, + "grad_norm": 0.9287505745887756, + "learning_rate": 4.557918173523747e-06, + "loss": 0.1492, + "step": 4008 + }, + { + "epoch": 0.649546338302009, + "grad_norm": 0.9767974615097046, + "learning_rate": 4.557669841233013e-06, + "loss": 0.1583, + "step": 4009 + }, + { + "epoch": 0.6497083603370059, + "grad_norm": 0.8061318397521973, + "learning_rate": 4.55742144598256e-06, + "loss": 0.1257, + "step": 4010 + }, + { + "epoch": 0.6498703823720026, + "grad_norm": 0.8351037502288818, + "learning_rate": 4.557172987779991e-06, + "loss": 0.143, + "step": 4011 + }, + { + "epoch": 0.6500324044069994, + "grad_norm": 0.8559525609016418, + "learning_rate": 4.5569244666329055e-06, + "loss": 0.1358, + "step": 4012 + }, + { + "epoch": 0.6501944264419961, + "grad_norm": 0.9949284195899963, + "learning_rate": 4.556675882548909e-06, + "loss": 0.1463, + "step": 4013 + }, + { + "epoch": 0.6503564484769929, + "grad_norm": 0.8767020106315613, + "learning_rate": 4.5564272355356085e-06, + "loss": 0.1366, + "step": 4014 + }, + { + "epoch": 0.6505184705119896, + "grad_norm": 0.9822929501533508, + "learning_rate": 4.556178525600611e-06, + "loss": 0.1454, + "step": 4015 + }, + { + "epoch": 0.6506804925469863, + "grad_norm": 0.8953209519386292, + "learning_rate": 4.555929752751526e-06, + "loss": 0.1393, + "step": 4016 + }, + { + "epoch": 0.6508425145819832, + "grad_norm": 0.9238772988319397, + "learning_rate": 4.555680916995965e-06, + "loss": 0.15, + "step": 4017 + }, + { + "epoch": 0.6510045366169799, + "grad_norm": 0.8017407655715942, + "learning_rate": 4.5554320183415435e-06, + "loss": 0.1256, + "step": 4018 + }, + { + "epoch": 0.6511665586519767, + "grad_norm": 0.9731351137161255, + "learning_rate": 4.555183056795877e-06, + "loss": 0.1371, + "step": 4019 + }, + { + "epoch": 0.6513285806869734, + "grad_norm": 0.8913125991821289, + "learning_rate": 4.5549340323665815e-06, + "loss": 0.1392, + "step": 4020 + }, + { + "epoch": 0.6514906027219702, + "grad_norm": 0.9241263270378113, + "learning_rate": 4.5546849450612774e-06, + "loss": 0.1483, + "step": 4021 + }, + { + "epoch": 0.651652624756967, + "grad_norm": 0.925363302230835, + "learning_rate": 4.554435794887586e-06, + "loss": 0.1376, + "step": 4022 + }, + { + "epoch": 0.6518146467919637, + "grad_norm": 1.0443159341812134, + "learning_rate": 4.5541865818531315e-06, + "loss": 0.1505, + "step": 4023 + }, + { + "epoch": 0.6519766688269605, + "grad_norm": 0.9155001044273376, + "learning_rate": 4.553937305965539e-06, + "loss": 0.1499, + "step": 4024 + }, + { + "epoch": 0.6521386908619572, + "grad_norm": 0.920136034488678, + "learning_rate": 4.5536879672324345e-06, + "loss": 0.1312, + "step": 4025 + }, + { + "epoch": 0.652300712896954, + "grad_norm": 0.8369788527488708, + "learning_rate": 4.553438565661448e-06, + "loss": 0.1225, + "step": 4026 + }, + { + "epoch": 0.6524627349319507, + "grad_norm": 0.9391274452209473, + "learning_rate": 4.553189101260211e-06, + "loss": 0.1424, + "step": 4027 + }, + { + "epoch": 0.6526247569669476, + "grad_norm": 0.9724919199943542, + "learning_rate": 4.552939574036356e-06, + "loss": 0.1591, + "step": 4028 + }, + { + "epoch": 0.6527867790019443, + "grad_norm": 0.8575350642204285, + "learning_rate": 4.552689983997519e-06, + "loss": 0.1452, + "step": 4029 + }, + { + "epoch": 0.652948801036941, + "grad_norm": 0.9005478620529175, + "learning_rate": 4.552440331151334e-06, + "loss": 0.152, + "step": 4030 + }, + { + "epoch": 0.6531108230719378, + "grad_norm": 0.9679160118103027, + "learning_rate": 4.552190615505444e-06, + "loss": 0.1539, + "step": 4031 + }, + { + "epoch": 0.6532728451069345, + "grad_norm": 0.8400557041168213, + "learning_rate": 4.551940837067486e-06, + "loss": 0.1406, + "step": 4032 + }, + { + "epoch": 0.6534348671419313, + "grad_norm": 0.9820280075073242, + "learning_rate": 4.551690995845104e-06, + "loss": 0.1618, + "step": 4033 + }, + { + "epoch": 0.653596889176928, + "grad_norm": 0.862453818321228, + "learning_rate": 4.551441091845942e-06, + "loss": 0.1438, + "step": 4034 + }, + { + "epoch": 0.6537589112119249, + "grad_norm": 0.8491960167884827, + "learning_rate": 4.551191125077647e-06, + "loss": 0.1303, + "step": 4035 + }, + { + "epoch": 0.6539209332469216, + "grad_norm": 0.8754890561103821, + "learning_rate": 4.550941095547869e-06, + "loss": 0.1446, + "step": 4036 + }, + { + "epoch": 0.6540829552819183, + "grad_norm": 0.9268251657485962, + "learning_rate": 4.550691003264256e-06, + "loss": 0.1556, + "step": 4037 + }, + { + "epoch": 0.6542449773169151, + "grad_norm": 0.8505957126617432, + "learning_rate": 4.55044084823446e-06, + "loss": 0.1343, + "step": 4038 + }, + { + "epoch": 0.6544069993519118, + "grad_norm": 0.8797240853309631, + "learning_rate": 4.550190630466137e-06, + "loss": 0.1445, + "step": 4039 + }, + { + "epoch": 0.6545690213869086, + "grad_norm": 0.8217586278915405, + "learning_rate": 4.5499403499669415e-06, + "loss": 0.1309, + "step": 4040 + }, + { + "epoch": 0.6547310434219054, + "grad_norm": 1.0774375200271606, + "learning_rate": 4.549690006744531e-06, + "loss": 0.1524, + "step": 4041 + }, + { + "epoch": 0.6548930654569022, + "grad_norm": 0.9880624413490295, + "learning_rate": 4.549439600806568e-06, + "loss": 0.1523, + "step": 4042 + }, + { + "epoch": 0.6550550874918989, + "grad_norm": 0.9190874695777893, + "learning_rate": 4.549189132160713e-06, + "loss": 0.133, + "step": 4043 + }, + { + "epoch": 0.6552171095268956, + "grad_norm": 0.9123296141624451, + "learning_rate": 4.548938600814629e-06, + "loss": 0.1377, + "step": 4044 + }, + { + "epoch": 0.6553791315618924, + "grad_norm": 1.0104409456253052, + "learning_rate": 4.548688006775981e-06, + "loss": 0.1488, + "step": 4045 + }, + { + "epoch": 0.6555411535968891, + "grad_norm": 0.955862283706665, + "learning_rate": 4.5484373500524395e-06, + "loss": 0.1588, + "step": 4046 + }, + { + "epoch": 0.655703175631886, + "grad_norm": 0.7813193202018738, + "learning_rate": 4.548186630651671e-06, + "loss": 0.1188, + "step": 4047 + }, + { + "epoch": 0.6558651976668827, + "grad_norm": 1.0729082822799683, + "learning_rate": 4.547935848581349e-06, + "loss": 0.178, + "step": 4048 + }, + { + "epoch": 0.6560272197018795, + "grad_norm": 0.8658989071846008, + "learning_rate": 4.547685003849145e-06, + "loss": 0.1391, + "step": 4049 + }, + { + "epoch": 0.6561892417368762, + "grad_norm": 1.0283029079437256, + "learning_rate": 4.5474340964627365e-06, + "loss": 0.1565, + "step": 4050 + }, + { + "epoch": 0.656351263771873, + "grad_norm": 0.9804076552391052, + "learning_rate": 4.547183126429798e-06, + "loss": 0.1667, + "step": 4051 + }, + { + "epoch": 0.6565132858068697, + "grad_norm": 0.7968271374702454, + "learning_rate": 4.5469320937580105e-06, + "loss": 0.1288, + "step": 4052 + }, + { + "epoch": 0.6566753078418665, + "grad_norm": 0.9019933938980103, + "learning_rate": 4.546680998455054e-06, + "loss": 0.1439, + "step": 4053 + }, + { + "epoch": 0.6568373298768633, + "grad_norm": 0.8464137315750122, + "learning_rate": 4.546429840528612e-06, + "loss": 0.1405, + "step": 4054 + }, + { + "epoch": 0.65699935191186, + "grad_norm": 0.8122446537017822, + "learning_rate": 4.54617861998637e-06, + "loss": 0.1338, + "step": 4055 + }, + { + "epoch": 0.6571613739468568, + "grad_norm": 0.9290385246276855, + "learning_rate": 4.545927336836013e-06, + "loss": 0.1323, + "step": 4056 + }, + { + "epoch": 0.6573233959818535, + "grad_norm": 0.9269915223121643, + "learning_rate": 4.545675991085231e-06, + "loss": 0.1515, + "step": 4057 + }, + { + "epoch": 0.6574854180168503, + "grad_norm": 1.0693535804748535, + "learning_rate": 4.545424582741714e-06, + "loss": 0.1695, + "step": 4058 + }, + { + "epoch": 0.657647440051847, + "grad_norm": 0.9048995971679688, + "learning_rate": 4.545173111813154e-06, + "loss": 0.1444, + "step": 4059 + }, + { + "epoch": 0.6578094620868438, + "grad_norm": 0.8983332514762878, + "learning_rate": 4.544921578307246e-06, + "loss": 0.1323, + "step": 4060 + }, + { + "epoch": 0.6579714841218406, + "grad_norm": 1.1153713464736938, + "learning_rate": 4.544669982231688e-06, + "loss": 0.1742, + "step": 4061 + }, + { + "epoch": 0.6581335061568373, + "grad_norm": 0.9735933542251587, + "learning_rate": 4.544418323594175e-06, + "loss": 0.1485, + "step": 4062 + }, + { + "epoch": 0.6582955281918341, + "grad_norm": 0.9374943375587463, + "learning_rate": 4.544166602402409e-06, + "loss": 0.144, + "step": 4063 + }, + { + "epoch": 0.6584575502268308, + "grad_norm": 0.9201976656913757, + "learning_rate": 4.543914818664092e-06, + "loss": 0.1497, + "step": 4064 + }, + { + "epoch": 0.6586195722618277, + "grad_norm": 1.0155818462371826, + "learning_rate": 4.543662972386927e-06, + "loss": 0.1638, + "step": 4065 + }, + { + "epoch": 0.6587815942968244, + "grad_norm": 0.913718044757843, + "learning_rate": 4.543411063578621e-06, + "loss": 0.1466, + "step": 4066 + }, + { + "epoch": 0.6589436163318211, + "grad_norm": 0.813508927822113, + "learning_rate": 4.5431590922468815e-06, + "loss": 0.1324, + "step": 4067 + }, + { + "epoch": 0.6591056383668179, + "grad_norm": 0.7492986917495728, + "learning_rate": 4.5429070583994185e-06, + "loss": 0.1263, + "step": 4068 + }, + { + "epoch": 0.6592676604018146, + "grad_norm": 0.8122268915176392, + "learning_rate": 4.542654962043943e-06, + "loss": 0.1274, + "step": 4069 + }, + { + "epoch": 0.6594296824368114, + "grad_norm": 0.8873049020767212, + "learning_rate": 4.542402803188168e-06, + "loss": 0.1413, + "step": 4070 + }, + { + "epoch": 0.6595917044718081, + "grad_norm": 0.9349288940429688, + "learning_rate": 4.542150581839811e-06, + "loss": 0.1534, + "step": 4071 + }, + { + "epoch": 0.659753726506805, + "grad_norm": 0.8754650354385376, + "learning_rate": 4.5418982980065874e-06, + "loss": 0.1503, + "step": 4072 + }, + { + "epoch": 0.6599157485418017, + "grad_norm": 0.8489086627960205, + "learning_rate": 4.541645951696217e-06, + "loss": 0.1353, + "step": 4073 + }, + { + "epoch": 0.6600777705767984, + "grad_norm": 0.8746541142463684, + "learning_rate": 4.541393542916423e-06, + "loss": 0.1541, + "step": 4074 + }, + { + "epoch": 0.6602397926117952, + "grad_norm": 0.9938592910766602, + "learning_rate": 4.541141071674924e-06, + "loss": 0.1663, + "step": 4075 + }, + { + "epoch": 0.6604018146467919, + "grad_norm": 0.9091808199882507, + "learning_rate": 4.540888537979449e-06, + "loss": 0.1333, + "step": 4076 + }, + { + "epoch": 0.6605638366817888, + "grad_norm": 0.8329112529754639, + "learning_rate": 4.540635941837723e-06, + "loss": 0.1314, + "step": 4077 + }, + { + "epoch": 0.6607258587167855, + "grad_norm": 0.8982328176498413, + "learning_rate": 4.540383283257477e-06, + "loss": 0.1536, + "step": 4078 + }, + { + "epoch": 0.6608878807517823, + "grad_norm": 0.9931498169898987, + "learning_rate": 4.540130562246439e-06, + "loss": 0.1613, + "step": 4079 + }, + { + "epoch": 0.661049902786779, + "grad_norm": 0.9610978364944458, + "learning_rate": 4.539877778812342e-06, + "loss": 0.1464, + "step": 4080 + }, + { + "epoch": 0.6612119248217757, + "grad_norm": 0.8849239349365234, + "learning_rate": 4.539624932962923e-06, + "loss": 0.1327, + "step": 4081 + }, + { + "epoch": 0.6613739468567725, + "grad_norm": 0.9079961776733398, + "learning_rate": 4.539372024705916e-06, + "loss": 0.1298, + "step": 4082 + }, + { + "epoch": 0.6615359688917692, + "grad_norm": 0.8636612296104431, + "learning_rate": 4.5391190540490595e-06, + "loss": 0.1354, + "step": 4083 + }, + { + "epoch": 0.6616979909267661, + "grad_norm": 1.0267540216445923, + "learning_rate": 4.538866021000096e-06, + "loss": 0.1476, + "step": 4084 + }, + { + "epoch": 0.6618600129617628, + "grad_norm": 0.9484116435050964, + "learning_rate": 4.538612925566765e-06, + "loss": 0.1392, + "step": 4085 + }, + { + "epoch": 0.6620220349967596, + "grad_norm": 0.9751802086830139, + "learning_rate": 4.538359767756813e-06, + "loss": 0.1462, + "step": 4086 + }, + { + "epoch": 0.6621840570317563, + "grad_norm": 0.9586076140403748, + "learning_rate": 4.538106547577984e-06, + "loss": 0.1576, + "step": 4087 + }, + { + "epoch": 0.662346079066753, + "grad_norm": 0.9770367741584778, + "learning_rate": 4.537853265038027e-06, + "loss": 0.1509, + "step": 4088 + }, + { + "epoch": 0.6625081011017498, + "grad_norm": 0.9630382657051086, + "learning_rate": 4.537599920144692e-06, + "loss": 0.1464, + "step": 4089 + }, + { + "epoch": 0.6626701231367466, + "grad_norm": 0.9468212127685547, + "learning_rate": 4.537346512905729e-06, + "loss": 0.1563, + "step": 4090 + }, + { + "epoch": 0.6628321451717434, + "grad_norm": 1.0076836347579956, + "learning_rate": 4.537093043328894e-06, + "loss": 0.1756, + "step": 4091 + }, + { + "epoch": 0.6629941672067401, + "grad_norm": 0.879906415939331, + "learning_rate": 4.536839511421941e-06, + "loss": 0.1328, + "step": 4092 + }, + { + "epoch": 0.6631561892417369, + "grad_norm": 0.8465386629104614, + "learning_rate": 4.536585917192629e-06, + "loss": 0.1299, + "step": 4093 + }, + { + "epoch": 0.6633182112767336, + "grad_norm": 0.960082471370697, + "learning_rate": 4.536332260648716e-06, + "loss": 0.1401, + "step": 4094 + }, + { + "epoch": 0.6634802333117304, + "grad_norm": 0.8401903510093689, + "learning_rate": 4.536078541797964e-06, + "loss": 0.1229, + "step": 4095 + }, + { + "epoch": 0.6636422553467272, + "grad_norm": 1.0268646478652954, + "learning_rate": 4.535824760648135e-06, + "loss": 0.1713, + "step": 4096 + }, + { + "epoch": 0.6638042773817239, + "grad_norm": 0.8968318104743958, + "learning_rate": 4.535570917206995e-06, + "loss": 0.1454, + "step": 4097 + }, + { + "epoch": 0.6639662994167207, + "grad_norm": 0.9677649736404419, + "learning_rate": 4.535317011482311e-06, + "loss": 0.1556, + "step": 4098 + }, + { + "epoch": 0.6641283214517174, + "grad_norm": 0.9355975389480591, + "learning_rate": 4.535063043481852e-06, + "loss": 0.1549, + "step": 4099 + }, + { + "epoch": 0.6642903434867142, + "grad_norm": 0.8558212518692017, + "learning_rate": 4.534809013213389e-06, + "loss": 0.1448, + "step": 4100 + }, + { + "epoch": 0.6644523655217109, + "grad_norm": 0.8515587449073792, + "learning_rate": 4.534554920684694e-06, + "loss": 0.1419, + "step": 4101 + }, + { + "epoch": 0.6646143875567078, + "grad_norm": 0.807841956615448, + "learning_rate": 4.534300765903542e-06, + "loss": 0.1292, + "step": 4102 + }, + { + "epoch": 0.6647764095917045, + "grad_norm": 0.8981359601020813, + "learning_rate": 4.534046548877709e-06, + "loss": 0.1352, + "step": 4103 + }, + { + "epoch": 0.6649384316267012, + "grad_norm": 0.893144965171814, + "learning_rate": 4.533792269614974e-06, + "loss": 0.1303, + "step": 4104 + }, + { + "epoch": 0.665100453661698, + "grad_norm": 0.9923272728919983, + "learning_rate": 4.533537928123118e-06, + "loss": 0.1647, + "step": 4105 + }, + { + "epoch": 0.6652624756966947, + "grad_norm": 0.8646336793899536, + "learning_rate": 4.533283524409922e-06, + "loss": 0.1513, + "step": 4106 + }, + { + "epoch": 0.6654244977316915, + "grad_norm": 0.952130138874054, + "learning_rate": 4.53302905848317e-06, + "loss": 0.1633, + "step": 4107 + }, + { + "epoch": 0.6655865197666883, + "grad_norm": 0.8783090710639954, + "learning_rate": 4.53277453035065e-06, + "loss": 0.1316, + "step": 4108 + }, + { + "epoch": 0.6657485418016851, + "grad_norm": 0.8041051626205444, + "learning_rate": 4.532519940020148e-06, + "loss": 0.1197, + "step": 4109 + }, + { + "epoch": 0.6659105638366818, + "grad_norm": 0.9836557507514954, + "learning_rate": 4.532265287499454e-06, + "loss": 0.1687, + "step": 4110 + }, + { + "epoch": 0.6660725858716785, + "grad_norm": 0.8173419833183289, + "learning_rate": 4.532010572796361e-06, + "loss": 0.1313, + "step": 4111 + }, + { + "epoch": 0.6662346079066753, + "grad_norm": 0.8361591696739197, + "learning_rate": 4.531755795918661e-06, + "loss": 0.128, + "step": 4112 + }, + { + "epoch": 0.666396629941672, + "grad_norm": 0.9987451434135437, + "learning_rate": 4.531500956874151e-06, + "loss": 0.1556, + "step": 4113 + }, + { + "epoch": 0.6665586519766689, + "grad_norm": 0.9050056338310242, + "learning_rate": 4.531246055670627e-06, + "loss": 0.1359, + "step": 4114 + }, + { + "epoch": 0.6667206740116656, + "grad_norm": 0.8897657990455627, + "learning_rate": 4.53099109231589e-06, + "loss": 0.1415, + "step": 4115 + }, + { + "epoch": 0.6668826960466624, + "grad_norm": 1.0086582899093628, + "learning_rate": 4.53073606681774e-06, + "loss": 0.1757, + "step": 4116 + }, + { + "epoch": 0.6670447180816591, + "grad_norm": 0.9577541351318359, + "learning_rate": 4.530480979183981e-06, + "loss": 0.1415, + "step": 4117 + }, + { + "epoch": 0.6672067401166558, + "grad_norm": 0.8575126528739929, + "learning_rate": 4.530225829422418e-06, + "loss": 0.1258, + "step": 4118 + }, + { + "epoch": 0.6673687621516526, + "grad_norm": 0.9692230224609375, + "learning_rate": 4.529970617540857e-06, + "loss": 0.1456, + "step": 4119 + }, + { + "epoch": 0.6675307841866494, + "grad_norm": 0.8811309337615967, + "learning_rate": 4.529715343547107e-06, + "loss": 0.1295, + "step": 4120 + }, + { + "epoch": 0.6676928062216462, + "grad_norm": 0.9429895281791687, + "learning_rate": 4.529460007448981e-06, + "loss": 0.1667, + "step": 4121 + }, + { + "epoch": 0.6678548282566429, + "grad_norm": 0.9843415021896362, + "learning_rate": 4.5292046092542885e-06, + "loss": 0.1663, + "step": 4122 + }, + { + "epoch": 0.6680168502916397, + "grad_norm": 0.8283978700637817, + "learning_rate": 4.528949148970846e-06, + "loss": 0.1327, + "step": 4123 + }, + { + "epoch": 0.6681788723266364, + "grad_norm": 0.9020947217941284, + "learning_rate": 4.52869362660647e-06, + "loss": 0.1461, + "step": 4124 + }, + { + "epoch": 0.6683408943616331, + "grad_norm": 0.8452586531639099, + "learning_rate": 4.528438042168978e-06, + "loss": 0.145, + "step": 4125 + }, + { + "epoch": 0.66850291639663, + "grad_norm": 0.914142370223999, + "learning_rate": 4.5281823956661905e-06, + "loss": 0.1416, + "step": 4126 + }, + { + "epoch": 0.6686649384316267, + "grad_norm": 0.9229021072387695, + "learning_rate": 4.52792668710593e-06, + "loss": 0.1534, + "step": 4127 + }, + { + "epoch": 0.6688269604666235, + "grad_norm": 0.8286415338516235, + "learning_rate": 4.527670916496021e-06, + "loss": 0.1303, + "step": 4128 + }, + { + "epoch": 0.6689889825016202, + "grad_norm": 1.0366954803466797, + "learning_rate": 4.5274150838442875e-06, + "loss": 0.1616, + "step": 4129 + }, + { + "epoch": 0.669151004536617, + "grad_norm": 0.886921226978302, + "learning_rate": 4.52715918915856e-06, + "loss": 0.1416, + "step": 4130 + }, + { + "epoch": 0.6693130265716137, + "grad_norm": 0.9015594124794006, + "learning_rate": 4.5269032324466656e-06, + "loss": 0.1492, + "step": 4131 + }, + { + "epoch": 0.6694750486066104, + "grad_norm": 0.8952576518058777, + "learning_rate": 4.526647213716438e-06, + "loss": 0.1392, + "step": 4132 + }, + { + "epoch": 0.6696370706416073, + "grad_norm": 0.8199195861816406, + "learning_rate": 4.526391132975711e-06, + "loss": 0.1342, + "step": 4133 + }, + { + "epoch": 0.669799092676604, + "grad_norm": 0.822807252407074, + "learning_rate": 4.526134990232317e-06, + "loss": 0.1377, + "step": 4134 + }, + { + "epoch": 0.6699611147116008, + "grad_norm": 0.8194425702095032, + "learning_rate": 4.525878785494097e-06, + "loss": 0.1341, + "step": 4135 + }, + { + "epoch": 0.6701231367465975, + "grad_norm": 1.0225645303726196, + "learning_rate": 4.525622518768888e-06, + "loss": 0.1631, + "step": 4136 + }, + { + "epoch": 0.6702851587815943, + "grad_norm": 0.8572471737861633, + "learning_rate": 4.5253661900645315e-06, + "loss": 0.1328, + "step": 4137 + }, + { + "epoch": 0.670447180816591, + "grad_norm": 0.9486353397369385, + "learning_rate": 4.5251097993888726e-06, + "loss": 0.1543, + "step": 4138 + }, + { + "epoch": 0.6706092028515879, + "grad_norm": 0.9721882343292236, + "learning_rate": 4.524853346749753e-06, + "loss": 0.1558, + "step": 4139 + }, + { + "epoch": 0.6707712248865846, + "grad_norm": 0.8730854988098145, + "learning_rate": 4.524596832155022e-06, + "loss": 0.1354, + "step": 4140 + }, + { + "epoch": 0.6709332469215813, + "grad_norm": 0.8489865660667419, + "learning_rate": 4.524340255612526e-06, + "loss": 0.1314, + "step": 4141 + }, + { + "epoch": 0.6710952689565781, + "grad_norm": 0.8910239934921265, + "learning_rate": 4.524083617130118e-06, + "loss": 0.1545, + "step": 4142 + }, + { + "epoch": 0.6712572909915748, + "grad_norm": 0.7663037776947021, + "learning_rate": 4.523826916715649e-06, + "loss": 0.1255, + "step": 4143 + }, + { + "epoch": 0.6714193130265717, + "grad_norm": 0.9194484353065491, + "learning_rate": 4.523570154376975e-06, + "loss": 0.1457, + "step": 4144 + }, + { + "epoch": 0.6715813350615684, + "grad_norm": 0.9784338474273682, + "learning_rate": 4.52331333012195e-06, + "loss": 0.1645, + "step": 4145 + }, + { + "epoch": 0.6717433570965652, + "grad_norm": 0.8883211612701416, + "learning_rate": 4.5230564439584335e-06, + "loss": 0.1359, + "step": 4146 + }, + { + "epoch": 0.6719053791315619, + "grad_norm": 0.9348551034927368, + "learning_rate": 4.522799495894286e-06, + "loss": 0.1549, + "step": 4147 + }, + { + "epoch": 0.6720674011665586, + "grad_norm": 0.76348876953125, + "learning_rate": 4.522542485937369e-06, + "loss": 0.116, + "step": 4148 + }, + { + "epoch": 0.6722294232015554, + "grad_norm": 0.8616440296173096, + "learning_rate": 4.522285414095547e-06, + "loss": 0.1296, + "step": 4149 + }, + { + "epoch": 0.6723914452365521, + "grad_norm": 0.901292622089386, + "learning_rate": 4.522028280376683e-06, + "loss": 0.1495, + "step": 4150 + }, + { + "epoch": 0.672553467271549, + "grad_norm": 0.8072662353515625, + "learning_rate": 4.521771084788649e-06, + "loss": 0.1316, + "step": 4151 + }, + { + "epoch": 0.6727154893065457, + "grad_norm": 0.9773443937301636, + "learning_rate": 4.521513827339311e-06, + "loss": 0.164, + "step": 4152 + }, + { + "epoch": 0.6728775113415425, + "grad_norm": 0.8923728466033936, + "learning_rate": 4.521256508036543e-06, + "loss": 0.1437, + "step": 4153 + }, + { + "epoch": 0.6730395333765392, + "grad_norm": 1.0206551551818848, + "learning_rate": 4.5209991268882165e-06, + "loss": 0.1608, + "step": 4154 + }, + { + "epoch": 0.6732015554115359, + "grad_norm": 0.9942020177841187, + "learning_rate": 4.520741683902208e-06, + "loss": 0.1553, + "step": 4155 + }, + { + "epoch": 0.6733635774465327, + "grad_norm": 0.9606051445007324, + "learning_rate": 4.520484179086394e-06, + "loss": 0.1577, + "step": 4156 + }, + { + "epoch": 0.6735255994815295, + "grad_norm": 0.9637233018875122, + "learning_rate": 4.520226612448653e-06, + "loss": 0.1646, + "step": 4157 + }, + { + "epoch": 0.6736876215165263, + "grad_norm": 0.8612246513366699, + "learning_rate": 4.519968983996867e-06, + "loss": 0.1358, + "step": 4158 + }, + { + "epoch": 0.673849643551523, + "grad_norm": 0.9165799617767334, + "learning_rate": 4.519711293738918e-06, + "loss": 0.154, + "step": 4159 + }, + { + "epoch": 0.6740116655865198, + "grad_norm": 0.853792130947113, + "learning_rate": 4.519453541682691e-06, + "loss": 0.1378, + "step": 4160 + }, + { + "epoch": 0.6741736876215165, + "grad_norm": 0.9091629385948181, + "learning_rate": 4.519195727836073e-06, + "loss": 0.1311, + "step": 4161 + }, + { + "epoch": 0.6743357096565132, + "grad_norm": 1.033930778503418, + "learning_rate": 4.518937852206952e-06, + "loss": 0.1692, + "step": 4162 + }, + { + "epoch": 0.6744977316915101, + "grad_norm": 0.9184512495994568, + "learning_rate": 4.518679914803218e-06, + "loss": 0.1417, + "step": 4163 + }, + { + "epoch": 0.6746597537265068, + "grad_norm": 0.8692227602005005, + "learning_rate": 4.518421915632764e-06, + "loss": 0.1491, + "step": 4164 + }, + { + "epoch": 0.6748217757615036, + "grad_norm": 0.817550778388977, + "learning_rate": 4.518163854703484e-06, + "loss": 0.1407, + "step": 4165 + }, + { + "epoch": 0.6749837977965003, + "grad_norm": 0.8003614544868469, + "learning_rate": 4.5179057320232735e-06, + "loss": 0.126, + "step": 4166 + }, + { + "epoch": 0.6751458198314971, + "grad_norm": 0.847512423992157, + "learning_rate": 4.517647547600032e-06, + "loss": 0.1203, + "step": 4167 + }, + { + "epoch": 0.6753078418664938, + "grad_norm": 0.9135223031044006, + "learning_rate": 4.517389301441657e-06, + "loss": 0.1311, + "step": 4168 + }, + { + "epoch": 0.6754698639014906, + "grad_norm": 0.9252517819404602, + "learning_rate": 4.517130993556051e-06, + "loss": 0.1576, + "step": 4169 + }, + { + "epoch": 0.6756318859364874, + "grad_norm": 0.8751834034919739, + "learning_rate": 4.51687262395112e-06, + "loss": 0.1332, + "step": 4170 + }, + { + "epoch": 0.6757939079714841, + "grad_norm": 0.8638715147972107, + "learning_rate": 4.516614192634765e-06, + "loss": 0.1313, + "step": 4171 + }, + { + "epoch": 0.6759559300064809, + "grad_norm": 0.910304844379425, + "learning_rate": 4.516355699614897e-06, + "loss": 0.1441, + "step": 4172 + }, + { + "epoch": 0.6761179520414776, + "grad_norm": 0.8590049743652344, + "learning_rate": 4.516097144899424e-06, + "loss": 0.1324, + "step": 4173 + }, + { + "epoch": 0.6762799740764744, + "grad_norm": 0.9081874489784241, + "learning_rate": 4.515838528496257e-06, + "loss": 0.1493, + "step": 4174 + }, + { + "epoch": 0.6764419961114712, + "grad_norm": 0.9282753467559814, + "learning_rate": 4.51557985041331e-06, + "loss": 0.1555, + "step": 4175 + }, + { + "epoch": 0.6766040181464679, + "grad_norm": 0.9835655093193054, + "learning_rate": 4.5153211106584965e-06, + "loss": 0.1559, + "step": 4176 + }, + { + "epoch": 0.6767660401814647, + "grad_norm": 1.0135793685913086, + "learning_rate": 4.515062309239734e-06, + "loss": 0.1568, + "step": 4177 + }, + { + "epoch": 0.6769280622164614, + "grad_norm": 0.8695417046546936, + "learning_rate": 4.514803446164941e-06, + "loss": 0.1355, + "step": 4178 + }, + { + "epoch": 0.6770900842514582, + "grad_norm": 0.7575681805610657, + "learning_rate": 4.514544521442039e-06, + "loss": 0.1289, + "step": 4179 + }, + { + "epoch": 0.6772521062864549, + "grad_norm": 0.9172993302345276, + "learning_rate": 4.514285535078949e-06, + "loss": 0.1565, + "step": 4180 + }, + { + "epoch": 0.6774141283214518, + "grad_norm": 0.8855761885643005, + "learning_rate": 4.5140264870835974e-06, + "loss": 0.1476, + "step": 4181 + }, + { + "epoch": 0.6775761503564485, + "grad_norm": 0.9483667016029358, + "learning_rate": 4.513767377463908e-06, + "loss": 0.1516, + "step": 4182 + }, + { + "epoch": 0.6777381723914452, + "grad_norm": 0.8638037443161011, + "learning_rate": 4.51350820622781e-06, + "loss": 0.1359, + "step": 4183 + }, + { + "epoch": 0.677900194426442, + "grad_norm": 0.8898397088050842, + "learning_rate": 4.513248973383234e-06, + "loss": 0.1323, + "step": 4184 + }, + { + "epoch": 0.6780622164614387, + "grad_norm": 0.9830394983291626, + "learning_rate": 4.512989678938111e-06, + "loss": 0.1544, + "step": 4185 + }, + { + "epoch": 0.6782242384964355, + "grad_norm": 0.8503865599632263, + "learning_rate": 4.512730322900375e-06, + "loss": 0.1273, + "step": 4186 + }, + { + "epoch": 0.6783862605314323, + "grad_norm": 1.0824391841888428, + "learning_rate": 4.5124709052779626e-06, + "loss": 0.1775, + "step": 4187 + }, + { + "epoch": 0.6785482825664291, + "grad_norm": 0.9633022546768188, + "learning_rate": 4.51221142607881e-06, + "loss": 0.1557, + "step": 4188 + }, + { + "epoch": 0.6787103046014258, + "grad_norm": 0.9733066558837891, + "learning_rate": 4.511951885310858e-06, + "loss": 0.1404, + "step": 4189 + }, + { + "epoch": 0.6788723266364226, + "grad_norm": 1.0073535442352295, + "learning_rate": 4.511692282982047e-06, + "loss": 0.1678, + "step": 4190 + }, + { + "epoch": 0.6790343486714193, + "grad_norm": 0.8856287002563477, + "learning_rate": 4.511432619100319e-06, + "loss": 0.131, + "step": 4191 + }, + { + "epoch": 0.679196370706416, + "grad_norm": 0.896113395690918, + "learning_rate": 4.511172893673621e-06, + "loss": 0.1373, + "step": 4192 + }, + { + "epoch": 0.6793583927414129, + "grad_norm": 0.9079231023788452, + "learning_rate": 4.5109131067099e-06, + "loss": 0.1411, + "step": 4193 + }, + { + "epoch": 0.6795204147764096, + "grad_norm": 0.7973667979240417, + "learning_rate": 4.510653258217103e-06, + "loss": 0.1338, + "step": 4194 + }, + { + "epoch": 0.6796824368114064, + "grad_norm": 0.9222817420959473, + "learning_rate": 4.510393348203184e-06, + "loss": 0.1556, + "step": 4195 + }, + { + "epoch": 0.6798444588464031, + "grad_norm": 1.0345180034637451, + "learning_rate": 4.5101333766760926e-06, + "loss": 0.1483, + "step": 4196 + }, + { + "epoch": 0.6800064808813999, + "grad_norm": 0.8539285659790039, + "learning_rate": 4.509873343643783e-06, + "loss": 0.1385, + "step": 4197 + }, + { + "epoch": 0.6801685029163966, + "grad_norm": 0.8832521438598633, + "learning_rate": 4.509613249114215e-06, + "loss": 0.1512, + "step": 4198 + }, + { + "epoch": 0.6803305249513933, + "grad_norm": 1.0024913549423218, + "learning_rate": 4.509353093095344e-06, + "loss": 0.1768, + "step": 4199 + }, + { + "epoch": 0.6804925469863902, + "grad_norm": 0.8852626085281372, + "learning_rate": 4.509092875595131e-06, + "loss": 0.1321, + "step": 4200 + }, + { + "epoch": 0.6806545690213869, + "grad_norm": 0.8801068663597107, + "learning_rate": 4.5088325966215375e-06, + "loss": 0.147, + "step": 4201 + }, + { + "epoch": 0.6808165910563837, + "grad_norm": 0.850282609462738, + "learning_rate": 4.508572256182528e-06, + "loss": 0.1439, + "step": 4202 + }, + { + "epoch": 0.6809786130913804, + "grad_norm": 0.880113959312439, + "learning_rate": 4.508311854286068e-06, + "loss": 0.1409, + "step": 4203 + }, + { + "epoch": 0.6811406351263772, + "grad_norm": 0.895704984664917, + "learning_rate": 4.508051390940125e-06, + "loss": 0.1374, + "step": 4204 + }, + { + "epoch": 0.681302657161374, + "grad_norm": 0.9211687445640564, + "learning_rate": 4.507790866152669e-06, + "loss": 0.1412, + "step": 4205 + }, + { + "epoch": 0.6814646791963707, + "grad_norm": 0.7838913798332214, + "learning_rate": 4.507530279931673e-06, + "loss": 0.1248, + "step": 4206 + }, + { + "epoch": 0.6816267012313675, + "grad_norm": 0.923410952091217, + "learning_rate": 4.507269632285106e-06, + "loss": 0.1537, + "step": 4207 + }, + { + "epoch": 0.6817887232663642, + "grad_norm": 0.8629544377326965, + "learning_rate": 4.5070089232209465e-06, + "loss": 0.1259, + "step": 4208 + }, + { + "epoch": 0.681950745301361, + "grad_norm": 1.0048764944076538, + "learning_rate": 4.506748152747171e-06, + "loss": 0.1619, + "step": 4209 + }, + { + "epoch": 0.6821127673363577, + "grad_norm": 1.133623719215393, + "learning_rate": 4.506487320871758e-06, + "loss": 0.1596, + "step": 4210 + }, + { + "epoch": 0.6822747893713546, + "grad_norm": 0.979425311088562, + "learning_rate": 4.50622642760269e-06, + "loss": 0.1672, + "step": 4211 + }, + { + "epoch": 0.6824368114063513, + "grad_norm": 0.9712530374526978, + "learning_rate": 4.5059654729479474e-06, + "loss": 0.1245, + "step": 4212 + }, + { + "epoch": 0.682598833441348, + "grad_norm": 0.7488314509391785, + "learning_rate": 4.505704456915515e-06, + "loss": 0.1177, + "step": 4213 + }, + { + "epoch": 0.6827608554763448, + "grad_norm": 0.8927283883094788, + "learning_rate": 4.505443379513381e-06, + "loss": 0.1482, + "step": 4214 + }, + { + "epoch": 0.6829228775113415, + "grad_norm": 0.9022080898284912, + "learning_rate": 4.5051822407495314e-06, + "loss": 0.1382, + "step": 4215 + }, + { + "epoch": 0.6830848995463383, + "grad_norm": 1.0110646486282349, + "learning_rate": 4.5049210406319585e-06, + "loss": 0.1619, + "step": 4216 + }, + { + "epoch": 0.683246921581335, + "grad_norm": 0.8575459122657776, + "learning_rate": 4.504659779168654e-06, + "loss": 0.147, + "step": 4217 + }, + { + "epoch": 0.6834089436163319, + "grad_norm": 0.9155558943748474, + "learning_rate": 4.5043984563676105e-06, + "loss": 0.1438, + "step": 4218 + }, + { + "epoch": 0.6835709656513286, + "grad_norm": 0.9820864796638489, + "learning_rate": 4.504137072236825e-06, + "loss": 0.1536, + "step": 4219 + }, + { + "epoch": 0.6837329876863253, + "grad_norm": 0.8118696808815002, + "learning_rate": 4.503875626784295e-06, + "loss": 0.1275, + "step": 4220 + }, + { + "epoch": 0.6838950097213221, + "grad_norm": 0.9118869304656982, + "learning_rate": 4.5036141200180206e-06, + "loss": 0.1514, + "step": 4221 + }, + { + "epoch": 0.6840570317563188, + "grad_norm": 0.895452082157135, + "learning_rate": 4.503352551946003e-06, + "loss": 0.1496, + "step": 4222 + }, + { + "epoch": 0.6842190537913156, + "grad_norm": 0.8009963035583496, + "learning_rate": 4.503090922576244e-06, + "loss": 0.1173, + "step": 4223 + }, + { + "epoch": 0.6843810758263124, + "grad_norm": 0.8450411558151245, + "learning_rate": 4.5028292319167515e-06, + "loss": 0.1401, + "step": 4224 + }, + { + "epoch": 0.6845430978613092, + "grad_norm": 0.884223997592926, + "learning_rate": 4.5025674799755306e-06, + "loss": 0.1356, + "step": 4225 + }, + { + "epoch": 0.6847051198963059, + "grad_norm": 1.1534299850463867, + "learning_rate": 4.502305666760592e-06, + "loss": 0.1775, + "step": 4226 + }, + { + "epoch": 0.6848671419313026, + "grad_norm": 0.9515455365180969, + "learning_rate": 4.502043792279943e-06, + "loss": 0.1428, + "step": 4227 + }, + { + "epoch": 0.6850291639662994, + "grad_norm": 0.9897893071174622, + "learning_rate": 4.501781856541601e-06, + "loss": 0.1498, + "step": 4228 + }, + { + "epoch": 0.6851911860012961, + "grad_norm": 0.8131996393203735, + "learning_rate": 4.501519859553578e-06, + "loss": 0.1341, + "step": 4229 + }, + { + "epoch": 0.685353208036293, + "grad_norm": 0.8663079142570496, + "learning_rate": 4.50125780132389e-06, + "loss": 0.1286, + "step": 4230 + }, + { + "epoch": 0.6855152300712897, + "grad_norm": 0.8376840949058533, + "learning_rate": 4.500995681860557e-06, + "loss": 0.1301, + "step": 4231 + }, + { + "epoch": 0.6856772521062865, + "grad_norm": 0.8951122164726257, + "learning_rate": 4.500733501171599e-06, + "loss": 0.1411, + "step": 4232 + }, + { + "epoch": 0.6858392741412832, + "grad_norm": 1.0260791778564453, + "learning_rate": 4.500471259265037e-06, + "loss": 0.1705, + "step": 4233 + }, + { + "epoch": 0.68600129617628, + "grad_norm": 1.0505539178848267, + "learning_rate": 4.500208956148895e-06, + "loss": 0.1712, + "step": 4234 + }, + { + "epoch": 0.6861633182112767, + "grad_norm": 1.0583857297897339, + "learning_rate": 4.4999465918312e-06, + "loss": 0.1724, + "step": 4235 + }, + { + "epoch": 0.6863253402462735, + "grad_norm": 0.8932731747627258, + "learning_rate": 4.499684166319978e-06, + "loss": 0.1523, + "step": 4236 + }, + { + "epoch": 0.6864873622812703, + "grad_norm": 0.9509609937667847, + "learning_rate": 4.499421679623261e-06, + "loss": 0.1455, + "step": 4237 + }, + { + "epoch": 0.686649384316267, + "grad_norm": 0.9110451340675354, + "learning_rate": 4.499159131749079e-06, + "loss": 0.1367, + "step": 4238 + }, + { + "epoch": 0.6868114063512638, + "grad_norm": 0.9273454546928406, + "learning_rate": 4.498896522705465e-06, + "loss": 0.1397, + "step": 4239 + }, + { + "epoch": 0.6869734283862605, + "grad_norm": 0.8918704390525818, + "learning_rate": 4.498633852500455e-06, + "loss": 0.1376, + "step": 4240 + }, + { + "epoch": 0.6871354504212573, + "grad_norm": 0.9055836200714111, + "learning_rate": 4.4983711211420844e-06, + "loss": 0.1428, + "step": 4241 + }, + { + "epoch": 0.687297472456254, + "grad_norm": 0.9966475963592529, + "learning_rate": 4.498108328638395e-06, + "loss": 0.1578, + "step": 4242 + }, + { + "epoch": 0.6874594944912508, + "grad_norm": 0.954372227191925, + "learning_rate": 4.497845474997425e-06, + "loss": 0.1375, + "step": 4243 + }, + { + "epoch": 0.6876215165262476, + "grad_norm": 1.3921228647232056, + "learning_rate": 4.4975825602272185e-06, + "loss": 0.1365, + "step": 4244 + }, + { + "epoch": 0.6877835385612443, + "grad_norm": 0.9259489178657532, + "learning_rate": 4.497319584335821e-06, + "loss": 0.1462, + "step": 4245 + }, + { + "epoch": 0.6879455605962411, + "grad_norm": 0.9492351412773132, + "learning_rate": 4.497056547331276e-06, + "loss": 0.149, + "step": 4246 + }, + { + "epoch": 0.6881075826312378, + "grad_norm": 0.9279522895812988, + "learning_rate": 4.496793449221634e-06, + "loss": 0.1434, + "step": 4247 + }, + { + "epoch": 0.6882696046662347, + "grad_norm": 1.0066229104995728, + "learning_rate": 4.496530290014945e-06, + "loss": 0.1634, + "step": 4248 + }, + { + "epoch": 0.6884316267012314, + "grad_norm": 0.853121817111969, + "learning_rate": 4.496267069719259e-06, + "loss": 0.1469, + "step": 4249 + }, + { + "epoch": 0.6885936487362281, + "grad_norm": 1.041290521621704, + "learning_rate": 4.496003788342633e-06, + "loss": 0.1604, + "step": 4250 + }, + { + "epoch": 0.6887556707712249, + "grad_norm": 1.0122541189193726, + "learning_rate": 4.495740445893121e-06, + "loss": 0.1334, + "step": 4251 + }, + { + "epoch": 0.6889176928062216, + "grad_norm": 1.0344403982162476, + "learning_rate": 4.495477042378781e-06, + "loss": 0.1643, + "step": 4252 + }, + { + "epoch": 0.6890797148412184, + "grad_norm": 0.811650276184082, + "learning_rate": 4.495213577807672e-06, + "loss": 0.1343, + "step": 4253 + }, + { + "epoch": 0.6892417368762151, + "grad_norm": 0.842697024345398, + "learning_rate": 4.494950052187857e-06, + "loss": 0.1383, + "step": 4254 + }, + { + "epoch": 0.689403758911212, + "grad_norm": 0.8029190897941589, + "learning_rate": 4.494686465527397e-06, + "loss": 0.1306, + "step": 4255 + }, + { + "epoch": 0.6895657809462087, + "grad_norm": 0.8258494734764099, + "learning_rate": 4.494422817834359e-06, + "loss": 0.1334, + "step": 4256 + }, + { + "epoch": 0.6897278029812054, + "grad_norm": 0.8976909518241882, + "learning_rate": 4.494159109116809e-06, + "loss": 0.1548, + "step": 4257 + }, + { + "epoch": 0.6898898250162022, + "grad_norm": 0.8805419206619263, + "learning_rate": 4.493895339382815e-06, + "loss": 0.1286, + "step": 4258 + }, + { + "epoch": 0.6900518470511989, + "grad_norm": 0.7982555627822876, + "learning_rate": 4.49363150864045e-06, + "loss": 0.1213, + "step": 4259 + }, + { + "epoch": 0.6902138690861958, + "grad_norm": 1.0019187927246094, + "learning_rate": 4.493367616897785e-06, + "loss": 0.1542, + "step": 4260 + }, + { + "epoch": 0.6903758911211925, + "grad_norm": 0.832612931728363, + "learning_rate": 4.4931036641628946e-06, + "loss": 0.1404, + "step": 4261 + }, + { + "epoch": 0.6905379131561893, + "grad_norm": 0.9622425436973572, + "learning_rate": 4.4928396504438555e-06, + "loss": 0.1418, + "step": 4262 + }, + { + "epoch": 0.690699935191186, + "grad_norm": 0.8642094731330872, + "learning_rate": 4.492575575748746e-06, + "loss": 0.1437, + "step": 4263 + }, + { + "epoch": 0.6908619572261827, + "grad_norm": 1.0327246189117432, + "learning_rate": 4.4923114400856445e-06, + "loss": 0.1633, + "step": 4264 + }, + { + "epoch": 0.6910239792611795, + "grad_norm": 1.0118496417999268, + "learning_rate": 4.492047243462636e-06, + "loss": 0.1497, + "step": 4265 + }, + { + "epoch": 0.6911860012961762, + "grad_norm": 0.860984742641449, + "learning_rate": 4.491782985887802e-06, + "loss": 0.1416, + "step": 4266 + }, + { + "epoch": 0.6913480233311731, + "grad_norm": 0.9283857941627502, + "learning_rate": 4.491518667369228e-06, + "loss": 0.1471, + "step": 4267 + }, + { + "epoch": 0.6915100453661698, + "grad_norm": 0.9157022833824158, + "learning_rate": 4.491254287915003e-06, + "loss": 0.1482, + "step": 4268 + }, + { + "epoch": 0.6916720674011666, + "grad_norm": 0.9066836833953857, + "learning_rate": 4.490989847533214e-06, + "loss": 0.1549, + "step": 4269 + }, + { + "epoch": 0.6918340894361633, + "grad_norm": 0.7749550938606262, + "learning_rate": 4.490725346231954e-06, + "loss": 0.1114, + "step": 4270 + }, + { + "epoch": 0.69199611147116, + "grad_norm": 0.7799788117408752, + "learning_rate": 4.490460784019317e-06, + "loss": 0.1278, + "step": 4271 + }, + { + "epoch": 0.6921581335061568, + "grad_norm": 1.0473695993423462, + "learning_rate": 4.4901961609033965e-06, + "loss": 0.19, + "step": 4272 + }, + { + "epoch": 0.6923201555411536, + "grad_norm": 0.9344850778579712, + "learning_rate": 4.489931476892289e-06, + "loss": 0.1531, + "step": 4273 + }, + { + "epoch": 0.6924821775761504, + "grad_norm": 0.990973711013794, + "learning_rate": 4.489666731994095e-06, + "loss": 0.1538, + "step": 4274 + }, + { + "epoch": 0.6926441996111471, + "grad_norm": 0.9593062400817871, + "learning_rate": 4.4894019262169134e-06, + "loss": 0.1559, + "step": 4275 + }, + { + "epoch": 0.6928062216461439, + "grad_norm": 0.8276656866073608, + "learning_rate": 4.489137059568847e-06, + "loss": 0.1293, + "step": 4276 + }, + { + "epoch": 0.6929682436811406, + "grad_norm": 0.8525620698928833, + "learning_rate": 4.488872132058001e-06, + "loss": 0.1359, + "step": 4277 + }, + { + "epoch": 0.6931302657161373, + "grad_norm": 0.933274507522583, + "learning_rate": 4.48860714369248e-06, + "loss": 0.1463, + "step": 4278 + }, + { + "epoch": 0.6932922877511342, + "grad_norm": 1.014924168586731, + "learning_rate": 4.4883420944803925e-06, + "loss": 0.1627, + "step": 4279 + }, + { + "epoch": 0.6934543097861309, + "grad_norm": 0.9470938444137573, + "learning_rate": 4.488076984429849e-06, + "loss": 0.1612, + "step": 4280 + }, + { + "epoch": 0.6936163318211277, + "grad_norm": 0.861634373664856, + "learning_rate": 4.48781181354896e-06, + "loss": 0.1445, + "step": 4281 + }, + { + "epoch": 0.6937783538561244, + "grad_norm": 0.8474523425102234, + "learning_rate": 4.4875465818458404e-06, + "loss": 0.1379, + "step": 4282 + }, + { + "epoch": 0.6939403758911212, + "grad_norm": 0.8774752616882324, + "learning_rate": 4.487281289328605e-06, + "loss": 0.1374, + "step": 4283 + }, + { + "epoch": 0.6941023979261179, + "grad_norm": 0.8172398805618286, + "learning_rate": 4.4870159360053725e-06, + "loss": 0.1332, + "step": 4284 + }, + { + "epoch": 0.6942644199611148, + "grad_norm": 0.8533669114112854, + "learning_rate": 4.4867505218842596e-06, + "loss": 0.1421, + "step": 4285 + }, + { + "epoch": 0.6944264419961115, + "grad_norm": 0.909589946269989, + "learning_rate": 4.4864850469733886e-06, + "loss": 0.1532, + "step": 4286 + }, + { + "epoch": 0.6945884640311082, + "grad_norm": 0.7759212255477905, + "learning_rate": 4.486219511280883e-06, + "loss": 0.1142, + "step": 4287 + }, + { + "epoch": 0.694750486066105, + "grad_norm": 0.9085809588432312, + "learning_rate": 4.485953914814867e-06, + "loss": 0.1354, + "step": 4288 + }, + { + "epoch": 0.6949125081011017, + "grad_norm": 0.857865035533905, + "learning_rate": 4.485688257583467e-06, + "loss": 0.1398, + "step": 4289 + }, + { + "epoch": 0.6950745301360985, + "grad_norm": 0.8624712824821472, + "learning_rate": 4.485422539594811e-06, + "loss": 0.1342, + "step": 4290 + }, + { + "epoch": 0.6952365521710953, + "grad_norm": 0.8267756104469299, + "learning_rate": 4.48515676085703e-06, + "loss": 0.1381, + "step": 4291 + }, + { + "epoch": 0.6953985742060921, + "grad_norm": 0.9393563270568848, + "learning_rate": 4.4848909213782566e-06, + "loss": 0.1485, + "step": 4292 + }, + { + "epoch": 0.6955605962410888, + "grad_norm": 0.9284929037094116, + "learning_rate": 4.484625021166624e-06, + "loss": 0.1549, + "step": 4293 + }, + { + "epoch": 0.6957226182760855, + "grad_norm": 0.9308271408081055, + "learning_rate": 4.484359060230269e-06, + "loss": 0.1443, + "step": 4294 + }, + { + "epoch": 0.6958846403110823, + "grad_norm": 0.9445673227310181, + "learning_rate": 4.484093038577329e-06, + "loss": 0.127, + "step": 4295 + }, + { + "epoch": 0.696046662346079, + "grad_norm": 0.8204240798950195, + "learning_rate": 4.483826956215942e-06, + "loss": 0.1279, + "step": 4296 + }, + { + "epoch": 0.6962086843810759, + "grad_norm": 0.9389863610267639, + "learning_rate": 4.483560813154252e-06, + "loss": 0.1493, + "step": 4297 + }, + { + "epoch": 0.6963707064160726, + "grad_norm": 0.8122605085372925, + "learning_rate": 4.4832946094004e-06, + "loss": 0.1314, + "step": 4298 + }, + { + "epoch": 0.6965327284510694, + "grad_norm": 0.9366967082023621, + "learning_rate": 4.483028344962534e-06, + "loss": 0.1396, + "step": 4299 + }, + { + "epoch": 0.6966947504860661, + "grad_norm": 0.8493170738220215, + "learning_rate": 4.482762019848799e-06, + "loss": 0.1372, + "step": 4300 + }, + { + "epoch": 0.6968567725210628, + "grad_norm": 0.9560868740081787, + "learning_rate": 4.482495634067344e-06, + "loss": 0.1642, + "step": 4301 + }, + { + "epoch": 0.6970187945560596, + "grad_norm": 0.8868750333786011, + "learning_rate": 4.48222918762632e-06, + "loss": 0.1407, + "step": 4302 + }, + { + "epoch": 0.6971808165910564, + "grad_norm": 0.8307478427886963, + "learning_rate": 4.48196268053388e-06, + "loss": 0.136, + "step": 4303 + }, + { + "epoch": 0.6973428386260532, + "grad_norm": 0.923088014125824, + "learning_rate": 4.481696112798179e-06, + "loss": 0.1425, + "step": 4304 + }, + { + "epoch": 0.6975048606610499, + "grad_norm": 0.8834422826766968, + "learning_rate": 4.481429484427372e-06, + "loss": 0.1395, + "step": 4305 + }, + { + "epoch": 0.6976668826960467, + "grad_norm": 0.9474985599517822, + "learning_rate": 4.481162795429618e-06, + "loss": 0.153, + "step": 4306 + }, + { + "epoch": 0.6978289047310434, + "grad_norm": 0.9343580603599548, + "learning_rate": 4.480896045813076e-06, + "loss": 0.145, + "step": 4307 + }, + { + "epoch": 0.6979909267660401, + "grad_norm": 0.746100127696991, + "learning_rate": 4.480629235585909e-06, + "loss": 0.1194, + "step": 4308 + }, + { + "epoch": 0.698152948801037, + "grad_norm": 0.9636530876159668, + "learning_rate": 4.480362364756281e-06, + "loss": 0.1526, + "step": 4309 + }, + { + "epoch": 0.6983149708360337, + "grad_norm": 0.7834259271621704, + "learning_rate": 4.480095433332357e-06, + "loss": 0.1296, + "step": 4310 + }, + { + "epoch": 0.6984769928710305, + "grad_norm": 0.9413818717002869, + "learning_rate": 4.479828441322304e-06, + "loss": 0.1542, + "step": 4311 + }, + { + "epoch": 0.6986390149060272, + "grad_norm": 0.9346475005149841, + "learning_rate": 4.4795613887342916e-06, + "loss": 0.1429, + "step": 4312 + }, + { + "epoch": 0.698801036941024, + "grad_norm": 0.917719841003418, + "learning_rate": 4.479294275576492e-06, + "loss": 0.1366, + "step": 4313 + }, + { + "epoch": 0.6989630589760207, + "grad_norm": 0.8321374654769897, + "learning_rate": 4.479027101857076e-06, + "loss": 0.1395, + "step": 4314 + }, + { + "epoch": 0.6991250810110174, + "grad_norm": 0.9253593683242798, + "learning_rate": 4.478759867584221e-06, + "loss": 0.1371, + "step": 4315 + }, + { + "epoch": 0.6992871030460143, + "grad_norm": 0.8277562856674194, + "learning_rate": 4.4784925727661025e-06, + "loss": 0.1368, + "step": 4316 + }, + { + "epoch": 0.699449125081011, + "grad_norm": 1.0059376955032349, + "learning_rate": 4.478225217410898e-06, + "loss": 0.1577, + "step": 4317 + }, + { + "epoch": 0.6996111471160078, + "grad_norm": 0.9551058411598206, + "learning_rate": 4.47795780152679e-06, + "loss": 0.1414, + "step": 4318 + }, + { + "epoch": 0.6997731691510045, + "grad_norm": 1.059830665588379, + "learning_rate": 4.47769032512196e-06, + "loss": 0.1566, + "step": 4319 + }, + { + "epoch": 0.6999351911860013, + "grad_norm": 0.8938872814178467, + "learning_rate": 4.477422788204592e-06, + "loss": 0.1362, + "step": 4320 + }, + { + "epoch": 0.700097213220998, + "grad_norm": 0.9738820195198059, + "learning_rate": 4.4771551907828714e-06, + "loss": 0.1627, + "step": 4321 + }, + { + "epoch": 0.7002592352559948, + "grad_norm": 0.812980592250824, + "learning_rate": 4.476887532864986e-06, + "loss": 0.1344, + "step": 4322 + }, + { + "epoch": 0.7004212572909916, + "grad_norm": 0.9212116599082947, + "learning_rate": 4.476619814459128e-06, + "loss": 0.1346, + "step": 4323 + }, + { + "epoch": 0.7005832793259883, + "grad_norm": 0.7404506802558899, + "learning_rate": 4.476352035573486e-06, + "loss": 0.1101, + "step": 4324 + }, + { + "epoch": 0.7007453013609851, + "grad_norm": 0.8756342530250549, + "learning_rate": 4.4760841962162535e-06, + "loss": 0.1492, + "step": 4325 + }, + { + "epoch": 0.7009073233959818, + "grad_norm": 0.9470415711402893, + "learning_rate": 4.475816296395627e-06, + "loss": 0.1547, + "step": 4326 + }, + { + "epoch": 0.7010693454309787, + "grad_norm": 0.7365562915802002, + "learning_rate": 4.475548336119804e-06, + "loss": 0.1169, + "step": 4327 + }, + { + "epoch": 0.7012313674659754, + "grad_norm": 0.9501516819000244, + "learning_rate": 4.475280315396982e-06, + "loss": 0.1503, + "step": 4328 + }, + { + "epoch": 0.7013933895009722, + "grad_norm": 0.8905579447746277, + "learning_rate": 4.475012234235363e-06, + "loss": 0.1577, + "step": 4329 + }, + { + "epoch": 0.7015554115359689, + "grad_norm": 0.9142157435417175, + "learning_rate": 4.474744092643149e-06, + "loss": 0.1435, + "step": 4330 + }, + { + "epoch": 0.7017174335709656, + "grad_norm": 0.8763073086738586, + "learning_rate": 4.474475890628545e-06, + "loss": 0.1384, + "step": 4331 + }, + { + "epoch": 0.7018794556059624, + "grad_norm": 0.8825966715812683, + "learning_rate": 4.474207628199756e-06, + "loss": 0.1515, + "step": 4332 + }, + { + "epoch": 0.7020414776409591, + "grad_norm": 0.876340389251709, + "learning_rate": 4.473939305364991e-06, + "loss": 0.1513, + "step": 4333 + }, + { + "epoch": 0.702203499675956, + "grad_norm": 0.8109468221664429, + "learning_rate": 4.47367092213246e-06, + "loss": 0.1298, + "step": 4334 + }, + { + "epoch": 0.7023655217109527, + "grad_norm": 0.9166135787963867, + "learning_rate": 4.473402478510376e-06, + "loss": 0.1428, + "step": 4335 + }, + { + "epoch": 0.7025275437459495, + "grad_norm": 0.838951051235199, + "learning_rate": 4.473133974506951e-06, + "loss": 0.1315, + "step": 4336 + }, + { + "epoch": 0.7026895657809462, + "grad_norm": 0.866460382938385, + "learning_rate": 4.472865410130401e-06, + "loss": 0.1348, + "step": 4337 + }, + { + "epoch": 0.7028515878159429, + "grad_norm": 0.8910289406776428, + "learning_rate": 4.472596785388944e-06, + "loss": 0.1366, + "step": 4338 + }, + { + "epoch": 0.7030136098509397, + "grad_norm": 1.0762733221054077, + "learning_rate": 4.472328100290799e-06, + "loss": 0.173, + "step": 4339 + }, + { + "epoch": 0.7031756318859365, + "grad_norm": 0.9918913841247559, + "learning_rate": 4.472059354844187e-06, + "loss": 0.1718, + "step": 4340 + }, + { + "epoch": 0.7033376539209333, + "grad_norm": 0.9252434968948364, + "learning_rate": 4.471790549057332e-06, + "loss": 0.147, + "step": 4341 + }, + { + "epoch": 0.70349967595593, + "grad_norm": 0.9707101583480835, + "learning_rate": 4.4715216829384566e-06, + "loss": 0.1484, + "step": 4342 + }, + { + "epoch": 0.7036616979909268, + "grad_norm": 0.9649327397346497, + "learning_rate": 4.471252756495789e-06, + "loss": 0.1663, + "step": 4343 + }, + { + "epoch": 0.7038237200259235, + "grad_norm": 0.8951583504676819, + "learning_rate": 4.470983769737557e-06, + "loss": 0.1424, + "step": 4344 + }, + { + "epoch": 0.7039857420609202, + "grad_norm": 0.885223388671875, + "learning_rate": 4.470714722671992e-06, + "loss": 0.1515, + "step": 4345 + }, + { + "epoch": 0.7041477640959171, + "grad_norm": 1.0771913528442383, + "learning_rate": 4.470445615307325e-06, + "loss": 0.179, + "step": 4346 + }, + { + "epoch": 0.7043097861309138, + "grad_norm": 1.0075687170028687, + "learning_rate": 4.470176447651791e-06, + "loss": 0.1614, + "step": 4347 + }, + { + "epoch": 0.7044718081659106, + "grad_norm": 0.9713208675384521, + "learning_rate": 4.4699072197136255e-06, + "loss": 0.1478, + "step": 4348 + }, + { + "epoch": 0.7046338302009073, + "grad_norm": 0.9104232788085938, + "learning_rate": 4.469637931501066e-06, + "loss": 0.1519, + "step": 4349 + }, + { + "epoch": 0.7047958522359041, + "grad_norm": 0.9640597105026245, + "learning_rate": 4.469368583022352e-06, + "loss": 0.172, + "step": 4350 + }, + { + "epoch": 0.7049578742709008, + "grad_norm": 0.966541588306427, + "learning_rate": 4.469099174285725e-06, + "loss": 0.1659, + "step": 4351 + }, + { + "epoch": 0.7051198963058976, + "grad_norm": 0.9105446338653564, + "learning_rate": 4.468829705299429e-06, + "loss": 0.1513, + "step": 4352 + }, + { + "epoch": 0.7052819183408944, + "grad_norm": 0.916776180267334, + "learning_rate": 4.4685601760717075e-06, + "loss": 0.1533, + "step": 4353 + }, + { + "epoch": 0.7054439403758911, + "grad_norm": 0.8211317658424377, + "learning_rate": 4.4682905866108094e-06, + "loss": 0.1411, + "step": 4354 + }, + { + "epoch": 0.7056059624108879, + "grad_norm": 0.908372163772583, + "learning_rate": 4.468020936924983e-06, + "loss": 0.1392, + "step": 4355 + }, + { + "epoch": 0.7057679844458846, + "grad_norm": 0.9611108303070068, + "learning_rate": 4.467751227022478e-06, + "loss": 0.1592, + "step": 4356 + }, + { + "epoch": 0.7059300064808814, + "grad_norm": 0.8858509063720703, + "learning_rate": 4.467481456911547e-06, + "loss": 0.1414, + "step": 4357 + }, + { + "epoch": 0.7060920285158782, + "grad_norm": 0.8831855654716492, + "learning_rate": 4.467211626600444e-06, + "loss": 0.1331, + "step": 4358 + }, + { + "epoch": 0.7062540505508749, + "grad_norm": 0.8268503546714783, + "learning_rate": 4.466941736097427e-06, + "loss": 0.1466, + "step": 4359 + }, + { + "epoch": 0.7064160725858717, + "grad_norm": 0.952812135219574, + "learning_rate": 4.466671785410752e-06, + "loss": 0.1511, + "step": 4360 + }, + { + "epoch": 0.7065780946208684, + "grad_norm": 0.9500433802604675, + "learning_rate": 4.46640177454868e-06, + "loss": 0.1439, + "step": 4361 + }, + { + "epoch": 0.7067401166558652, + "grad_norm": 0.8875350952148438, + "learning_rate": 4.4661317035194716e-06, + "loss": 0.145, + "step": 4362 + }, + { + "epoch": 0.7069021386908619, + "grad_norm": 0.8836730718612671, + "learning_rate": 4.465861572331392e-06, + "loss": 0.1386, + "step": 4363 + }, + { + "epoch": 0.7070641607258588, + "grad_norm": 0.9382752180099487, + "learning_rate": 4.4655913809927045e-06, + "loss": 0.1529, + "step": 4364 + }, + { + "epoch": 0.7072261827608555, + "grad_norm": 0.968842625617981, + "learning_rate": 4.465321129511678e-06, + "loss": 0.1463, + "step": 4365 + }, + { + "epoch": 0.7073882047958522, + "grad_norm": 0.9342696666717529, + "learning_rate": 4.4650508178965814e-06, + "loss": 0.1376, + "step": 4366 + }, + { + "epoch": 0.707550226830849, + "grad_norm": 1.0476247072219849, + "learning_rate": 4.464780446155684e-06, + "loss": 0.1389, + "step": 4367 + }, + { + "epoch": 0.7077122488658457, + "grad_norm": 0.9861329197883606, + "learning_rate": 4.464510014297261e-06, + "loss": 0.1561, + "step": 4368 + }, + { + "epoch": 0.7078742709008425, + "grad_norm": 0.8797351717948914, + "learning_rate": 4.464239522329585e-06, + "loss": 0.1307, + "step": 4369 + }, + { + "epoch": 0.7080362929358393, + "grad_norm": 0.9944042563438416, + "learning_rate": 4.4639689702609326e-06, + "loss": 0.1431, + "step": 4370 + }, + { + "epoch": 0.7081983149708361, + "grad_norm": 0.8967122435569763, + "learning_rate": 4.463698358099583e-06, + "loss": 0.1483, + "step": 4371 + }, + { + "epoch": 0.7083603370058328, + "grad_norm": 0.9998703598976135, + "learning_rate": 4.463427685853815e-06, + "loss": 0.1347, + "step": 4372 + }, + { + "epoch": 0.7085223590408296, + "grad_norm": 0.9268200397491455, + "learning_rate": 4.463156953531912e-06, + "loss": 0.1591, + "step": 4373 + }, + { + "epoch": 0.7086843810758263, + "grad_norm": 0.8729379177093506, + "learning_rate": 4.462886161142157e-06, + "loss": 0.1271, + "step": 4374 + }, + { + "epoch": 0.708846403110823, + "grad_norm": 0.8896871209144592, + "learning_rate": 4.462615308692835e-06, + "loss": 0.1595, + "step": 4375 + }, + { + "epoch": 0.7090084251458199, + "grad_norm": 0.8534110188484192, + "learning_rate": 4.4623443961922334e-06, + "loss": 0.1321, + "step": 4376 + }, + { + "epoch": 0.7091704471808166, + "grad_norm": 0.9220858216285706, + "learning_rate": 4.462073423648643e-06, + "loss": 0.1584, + "step": 4377 + }, + { + "epoch": 0.7093324692158134, + "grad_norm": 0.8980048298835754, + "learning_rate": 4.461802391070354e-06, + "loss": 0.1432, + "step": 4378 + }, + { + "epoch": 0.7094944912508101, + "grad_norm": 0.8806954622268677, + "learning_rate": 4.461531298465659e-06, + "loss": 0.1342, + "step": 4379 + }, + { + "epoch": 0.7096565132858069, + "grad_norm": 0.9965754747390747, + "learning_rate": 4.4612601458428525e-06, + "loss": 0.1537, + "step": 4380 + }, + { + "epoch": 0.7098185353208036, + "grad_norm": 0.9695927500724792, + "learning_rate": 4.460988933210233e-06, + "loss": 0.1357, + "step": 4381 + }, + { + "epoch": 0.7099805573558003, + "grad_norm": 0.9649356007575989, + "learning_rate": 4.460717660576097e-06, + "loss": 0.1534, + "step": 4382 + }, + { + "epoch": 0.7101425793907972, + "grad_norm": 0.9325234293937683, + "learning_rate": 4.460446327948745e-06, + "loss": 0.1443, + "step": 4383 + }, + { + "epoch": 0.7103046014257939, + "grad_norm": 1.038757562637329, + "learning_rate": 4.46017493533648e-06, + "loss": 0.1623, + "step": 4384 + }, + { + "epoch": 0.7104666234607907, + "grad_norm": 0.959611177444458, + "learning_rate": 4.459903482747605e-06, + "loss": 0.1428, + "step": 4385 + }, + { + "epoch": 0.7106286454957874, + "grad_norm": 0.9437382817268372, + "learning_rate": 4.459631970190428e-06, + "loss": 0.1429, + "step": 4386 + }, + { + "epoch": 0.7107906675307842, + "grad_norm": 0.9380490779876709, + "learning_rate": 4.459360397673253e-06, + "loss": 0.1459, + "step": 4387 + }, + { + "epoch": 0.710952689565781, + "grad_norm": 0.8897026777267456, + "learning_rate": 4.4590887652043925e-06, + "loss": 0.1397, + "step": 4388 + }, + { + "epoch": 0.7111147116007777, + "grad_norm": 0.9225705862045288, + "learning_rate": 4.458817072792155e-06, + "loss": 0.1516, + "step": 4389 + }, + { + "epoch": 0.7112767336357745, + "grad_norm": 0.8232748508453369, + "learning_rate": 4.458545320444857e-06, + "loss": 0.1326, + "step": 4390 + }, + { + "epoch": 0.7114387556707712, + "grad_norm": 0.9818866848945618, + "learning_rate": 4.458273508170812e-06, + "loss": 0.1557, + "step": 4391 + }, + { + "epoch": 0.711600777705768, + "grad_norm": 1.086909294128418, + "learning_rate": 4.458001635978335e-06, + "loss": 0.1492, + "step": 4392 + }, + { + "epoch": 0.7117627997407647, + "grad_norm": 0.8664153814315796, + "learning_rate": 4.457729703875749e-06, + "loss": 0.135, + "step": 4393 + }, + { + "epoch": 0.7119248217757616, + "grad_norm": 0.8988549113273621, + "learning_rate": 4.457457711871369e-06, + "loss": 0.1473, + "step": 4394 + }, + { + "epoch": 0.7120868438107583, + "grad_norm": 0.9167562127113342, + "learning_rate": 4.45718565997352e-06, + "loss": 0.1553, + "step": 4395 + }, + { + "epoch": 0.712248865845755, + "grad_norm": 0.8664883971214294, + "learning_rate": 4.4569135481905274e-06, + "loss": 0.1404, + "step": 4396 + }, + { + "epoch": 0.7124108878807518, + "grad_norm": 0.7986405491828918, + "learning_rate": 4.456641376530715e-06, + "loss": 0.1239, + "step": 4397 + }, + { + "epoch": 0.7125729099157485, + "grad_norm": 0.9306362867355347, + "learning_rate": 4.456369145002412e-06, + "loss": 0.1489, + "step": 4398 + }, + { + "epoch": 0.7127349319507453, + "grad_norm": 0.9109500050544739, + "learning_rate": 4.456096853613948e-06, + "loss": 0.1439, + "step": 4399 + }, + { + "epoch": 0.712896953985742, + "grad_norm": 0.8157075643539429, + "learning_rate": 4.455824502373653e-06, + "loss": 0.1308, + "step": 4400 + }, + { + "epoch": 0.7130589760207389, + "grad_norm": 0.8615176677703857, + "learning_rate": 4.4555520912898616e-06, + "loss": 0.1299, + "step": 4401 + }, + { + "epoch": 0.7132209980557356, + "grad_norm": 0.9204617142677307, + "learning_rate": 4.455279620370908e-06, + "loss": 0.1405, + "step": 4402 + }, + { + "epoch": 0.7133830200907323, + "grad_norm": 0.8997926115989685, + "learning_rate": 4.455007089625131e-06, + "loss": 0.1319, + "step": 4403 + }, + { + "epoch": 0.7135450421257291, + "grad_norm": 0.861057698726654, + "learning_rate": 4.454734499060867e-06, + "loss": 0.1556, + "step": 4404 + }, + { + "epoch": 0.7137070641607258, + "grad_norm": 0.9755337238311768, + "learning_rate": 4.4544618486864575e-06, + "loss": 0.1606, + "step": 4405 + }, + { + "epoch": 0.7138690861957226, + "grad_norm": 0.9111999869346619, + "learning_rate": 4.454189138510246e-06, + "loss": 0.1339, + "step": 4406 + }, + { + "epoch": 0.7140311082307194, + "grad_norm": 0.8962233662605286, + "learning_rate": 4.4539163685405755e-06, + "loss": 0.1345, + "step": 4407 + }, + { + "epoch": 0.7141931302657162, + "grad_norm": 0.9078991413116455, + "learning_rate": 4.453643538785793e-06, + "loss": 0.1504, + "step": 4408 + }, + { + "epoch": 0.7143551523007129, + "grad_norm": 0.9566717743873596, + "learning_rate": 4.453370649254245e-06, + "loss": 0.1546, + "step": 4409 + }, + { + "epoch": 0.7145171743357096, + "grad_norm": 0.8746940493583679, + "learning_rate": 4.453097699954282e-06, + "loss": 0.147, + "step": 4410 + }, + { + "epoch": 0.7146791963707064, + "grad_norm": 0.9663369059562683, + "learning_rate": 4.452824690894257e-06, + "loss": 0.1594, + "step": 4411 + }, + { + "epoch": 0.7148412184057031, + "grad_norm": 0.9418163895606995, + "learning_rate": 4.452551622082522e-06, + "loss": 0.1533, + "step": 4412 + }, + { + "epoch": 0.7150032404407, + "grad_norm": 0.8881242871284485, + "learning_rate": 4.452278493527431e-06, + "loss": 0.1353, + "step": 4413 + }, + { + "epoch": 0.7151652624756967, + "grad_norm": 0.9275038838386536, + "learning_rate": 4.452005305237344e-06, + "loss": 0.1411, + "step": 4414 + }, + { + "epoch": 0.7153272845106935, + "grad_norm": 1.107954740524292, + "learning_rate": 4.451732057220618e-06, + "loss": 0.1572, + "step": 4415 + }, + { + "epoch": 0.7154893065456902, + "grad_norm": 0.9651256799697876, + "learning_rate": 4.451458749485614e-06, + "loss": 0.153, + "step": 4416 + }, + { + "epoch": 0.7156513285806869, + "grad_norm": 0.8925125598907471, + "learning_rate": 4.451185382040695e-06, + "loss": 0.1423, + "step": 4417 + }, + { + "epoch": 0.7158133506156837, + "grad_norm": 0.8692678213119507, + "learning_rate": 4.4509119548942245e-06, + "loss": 0.1448, + "step": 4418 + }, + { + "epoch": 0.7159753726506805, + "grad_norm": 0.9533895254135132, + "learning_rate": 4.45063846805457e-06, + "loss": 0.1495, + "step": 4419 + }, + { + "epoch": 0.7161373946856773, + "grad_norm": 0.8917126059532166, + "learning_rate": 4.450364921530099e-06, + "loss": 0.1437, + "step": 4420 + }, + { + "epoch": 0.716299416720674, + "grad_norm": 0.7850384712219238, + "learning_rate": 4.450091315329181e-06, + "loss": 0.1339, + "step": 4421 + }, + { + "epoch": 0.7164614387556708, + "grad_norm": 1.0266042947769165, + "learning_rate": 4.449817649460187e-06, + "loss": 0.1571, + "step": 4422 + }, + { + "epoch": 0.7166234607906675, + "grad_norm": 1.084693193435669, + "learning_rate": 4.449543923931493e-06, + "loss": 0.1458, + "step": 4423 + }, + { + "epoch": 0.7167854828256643, + "grad_norm": 0.9740076065063477, + "learning_rate": 4.449270138751471e-06, + "loss": 0.1395, + "step": 4424 + }, + { + "epoch": 0.7169475048606611, + "grad_norm": 0.8249508738517761, + "learning_rate": 4.4489962939285015e-06, + "loss": 0.133, + "step": 4425 + }, + { + "epoch": 0.7171095268956578, + "grad_norm": 0.9032569527626038, + "learning_rate": 4.4487223894709606e-06, + "loss": 0.1395, + "step": 4426 + }, + { + "epoch": 0.7172715489306546, + "grad_norm": 0.7856298089027405, + "learning_rate": 4.448448425387231e-06, + "loss": 0.1226, + "step": 4427 + }, + { + "epoch": 0.7174335709656513, + "grad_norm": 0.8503175377845764, + "learning_rate": 4.448174401685694e-06, + "loss": 0.1409, + "step": 4428 + }, + { + "epoch": 0.7175955930006481, + "grad_norm": 0.9140015244483948, + "learning_rate": 4.447900318374736e-06, + "loss": 0.1488, + "step": 4429 + }, + { + "epoch": 0.7177576150356448, + "grad_norm": 0.8657589554786682, + "learning_rate": 4.447626175462741e-06, + "loss": 0.1395, + "step": 4430 + }, + { + "epoch": 0.7179196370706417, + "grad_norm": 0.8597119450569153, + "learning_rate": 4.447351972958099e-06, + "loss": 0.1285, + "step": 4431 + }, + { + "epoch": 0.7180816591056384, + "grad_norm": 1.0180467367172241, + "learning_rate": 4.447077710869199e-06, + "loss": 0.1668, + "step": 4432 + }, + { + "epoch": 0.7182436811406351, + "grad_norm": 0.924322783946991, + "learning_rate": 4.446803389204433e-06, + "loss": 0.1472, + "step": 4433 + }, + { + "epoch": 0.7184057031756319, + "grad_norm": 0.9639684557914734, + "learning_rate": 4.4465290079721935e-06, + "loss": 0.1419, + "step": 4434 + }, + { + "epoch": 0.7185677252106286, + "grad_norm": 0.9265577793121338, + "learning_rate": 4.446254567180877e-06, + "loss": 0.1461, + "step": 4435 + }, + { + "epoch": 0.7187297472456254, + "grad_norm": 0.9008076190948486, + "learning_rate": 4.445980066838882e-06, + "loss": 0.1408, + "step": 4436 + }, + { + "epoch": 0.7188917692806222, + "grad_norm": 0.8096259236335754, + "learning_rate": 4.445705506954605e-06, + "loss": 0.1268, + "step": 4437 + }, + { + "epoch": 0.719053791315619, + "grad_norm": 0.9201697111129761, + "learning_rate": 4.4454308875364486e-06, + "loss": 0.1484, + "step": 4438 + }, + { + "epoch": 0.7192158133506157, + "grad_norm": 1.0293805599212646, + "learning_rate": 4.445156208592814e-06, + "loss": 0.1722, + "step": 4439 + }, + { + "epoch": 0.7193778353856124, + "grad_norm": 0.9077914357185364, + "learning_rate": 4.444881470132108e-06, + "loss": 0.1476, + "step": 4440 + }, + { + "epoch": 0.7195398574206092, + "grad_norm": 0.9666748046875, + "learning_rate": 4.444606672162735e-06, + "loss": 0.1537, + "step": 4441 + }, + { + "epoch": 0.7197018794556059, + "grad_norm": 0.942487895488739, + "learning_rate": 4.444331814693103e-06, + "loss": 0.1257, + "step": 4442 + }, + { + "epoch": 0.7198639014906028, + "grad_norm": 0.9764305353164673, + "learning_rate": 4.444056897731622e-06, + "loss": 0.1494, + "step": 4443 + }, + { + "epoch": 0.7200259235255995, + "grad_norm": 0.8661373257637024, + "learning_rate": 4.443781921286706e-06, + "loss": 0.1331, + "step": 4444 + }, + { + "epoch": 0.7201879455605963, + "grad_norm": 0.8567464351654053, + "learning_rate": 4.443506885366767e-06, + "loss": 0.143, + "step": 4445 + }, + { + "epoch": 0.720349967595593, + "grad_norm": 0.8924309015274048, + "learning_rate": 4.4432317899802205e-06, + "loss": 0.1297, + "step": 4446 + }, + { + "epoch": 0.7205119896305897, + "grad_norm": 0.874181866645813, + "learning_rate": 4.442956635135482e-06, + "loss": 0.1448, + "step": 4447 + }, + { + "epoch": 0.7206740116655865, + "grad_norm": 0.9796168804168701, + "learning_rate": 4.442681420840974e-06, + "loss": 0.1361, + "step": 4448 + }, + { + "epoch": 0.7208360337005832, + "grad_norm": 0.9041408896446228, + "learning_rate": 4.442406147105116e-06, + "loss": 0.1405, + "step": 4449 + }, + { + "epoch": 0.7209980557355801, + "grad_norm": 0.9344844222068787, + "learning_rate": 4.44213081393633e-06, + "loss": 0.1571, + "step": 4450 + }, + { + "epoch": 0.7211600777705768, + "grad_norm": 0.927335798740387, + "learning_rate": 4.4418554213430405e-06, + "loss": 0.1417, + "step": 4451 + }, + { + "epoch": 0.7213220998055736, + "grad_norm": 0.7550799250602722, + "learning_rate": 4.441579969333675e-06, + "loss": 0.1111, + "step": 4452 + }, + { + "epoch": 0.7214841218405703, + "grad_norm": 0.8267915844917297, + "learning_rate": 4.44130445791666e-06, + "loss": 0.1308, + "step": 4453 + }, + { + "epoch": 0.721646143875567, + "grad_norm": 1.0004557371139526, + "learning_rate": 4.441028887100427e-06, + "loss": 0.1523, + "step": 4454 + }, + { + "epoch": 0.7218081659105638, + "grad_norm": 0.9250260591506958, + "learning_rate": 4.440753256893408e-06, + "loss": 0.1553, + "step": 4455 + }, + { + "epoch": 0.7219701879455606, + "grad_norm": 0.8540483713150024, + "learning_rate": 4.4404775673040346e-06, + "loss": 0.156, + "step": 4456 + }, + { + "epoch": 0.7221322099805574, + "grad_norm": 0.8714447617530823, + "learning_rate": 4.4402018183407435e-06, + "loss": 0.1415, + "step": 4457 + }, + { + "epoch": 0.7222942320155541, + "grad_norm": 0.9081652164459229, + "learning_rate": 4.4399260100119726e-06, + "loss": 0.148, + "step": 4458 + }, + { + "epoch": 0.7224562540505509, + "grad_norm": 0.9180371165275574, + "learning_rate": 4.439650142326161e-06, + "loss": 0.1537, + "step": 4459 + }, + { + "epoch": 0.7226182760855476, + "grad_norm": 0.9707509279251099, + "learning_rate": 4.439374215291748e-06, + "loss": 0.1333, + "step": 4460 + }, + { + "epoch": 0.7227802981205443, + "grad_norm": 0.9013661742210388, + "learning_rate": 4.439098228917177e-06, + "loss": 0.1352, + "step": 4461 + }, + { + "epoch": 0.7229423201555412, + "grad_norm": 0.8960440158843994, + "learning_rate": 4.438822183210894e-06, + "loss": 0.1514, + "step": 4462 + }, + { + "epoch": 0.7231043421905379, + "grad_norm": 0.8287968635559082, + "learning_rate": 4.4385460781813426e-06, + "loss": 0.1281, + "step": 4463 + }, + { + "epoch": 0.7232663642255347, + "grad_norm": 0.8026749491691589, + "learning_rate": 4.438269913836972e-06, + "loss": 0.1293, + "step": 4464 + }, + { + "epoch": 0.7234283862605314, + "grad_norm": 0.9157952070236206, + "learning_rate": 4.437993690186234e-06, + "loss": 0.1577, + "step": 4465 + }, + { + "epoch": 0.7235904082955282, + "grad_norm": 0.8798143863677979, + "learning_rate": 4.437717407237578e-06, + "loss": 0.1296, + "step": 4466 + }, + { + "epoch": 0.7237524303305249, + "grad_norm": 0.8337207436561584, + "learning_rate": 4.437441064999459e-06, + "loss": 0.1351, + "step": 4467 + }, + { + "epoch": 0.7239144523655218, + "grad_norm": 0.9726783037185669, + "learning_rate": 4.437164663480332e-06, + "loss": 0.1518, + "step": 4468 + }, + { + "epoch": 0.7240764744005185, + "grad_norm": 0.9185786843299866, + "learning_rate": 4.436888202688654e-06, + "loss": 0.1466, + "step": 4469 + }, + { + "epoch": 0.7242384964355152, + "grad_norm": 1.0107709169387817, + "learning_rate": 4.436611682632884e-06, + "loss": 0.1403, + "step": 4470 + }, + { + "epoch": 0.724400518470512, + "grad_norm": 0.886813759803772, + "learning_rate": 4.436335103321484e-06, + "loss": 0.1544, + "step": 4471 + }, + { + "epoch": 0.7245625405055087, + "grad_norm": 0.8717353343963623, + "learning_rate": 4.436058464762915e-06, + "loss": 0.1332, + "step": 4472 + }, + { + "epoch": 0.7247245625405055, + "grad_norm": 0.9517882466316223, + "learning_rate": 4.435781766965641e-06, + "loss": 0.1582, + "step": 4473 + }, + { + "epoch": 0.7248865845755023, + "grad_norm": 0.946277379989624, + "learning_rate": 4.435505009938131e-06, + "loss": 0.1437, + "step": 4474 + }, + { + "epoch": 0.7250486066104991, + "grad_norm": 0.970646858215332, + "learning_rate": 4.435228193688851e-06, + "loss": 0.1502, + "step": 4475 + }, + { + "epoch": 0.7252106286454958, + "grad_norm": 0.8949056267738342, + "learning_rate": 4.434951318226272e-06, + "loss": 0.1389, + "step": 4476 + }, + { + "epoch": 0.7253726506804925, + "grad_norm": 0.8123460412025452, + "learning_rate": 4.434674383558865e-06, + "loss": 0.1244, + "step": 4477 + }, + { + "epoch": 0.7255346727154893, + "grad_norm": 0.8786303400993347, + "learning_rate": 4.434397389695102e-06, + "loss": 0.1267, + "step": 4478 + }, + { + "epoch": 0.725696694750486, + "grad_norm": 0.9474332332611084, + "learning_rate": 4.434120336643462e-06, + "loss": 0.1479, + "step": 4479 + }, + { + "epoch": 0.7258587167854829, + "grad_norm": 0.917175829410553, + "learning_rate": 4.433843224412419e-06, + "loss": 0.1538, + "step": 4480 + }, + { + "epoch": 0.7260207388204796, + "grad_norm": 0.8738764524459839, + "learning_rate": 4.433566053010454e-06, + "loss": 0.1279, + "step": 4481 + }, + { + "epoch": 0.7261827608554764, + "grad_norm": 0.9041066765785217, + "learning_rate": 4.4332888224460466e-06, + "loss": 0.1384, + "step": 4482 + }, + { + "epoch": 0.7263447828904731, + "grad_norm": 0.8361867070198059, + "learning_rate": 4.433011532727679e-06, + "loss": 0.1305, + "step": 4483 + }, + { + "epoch": 0.7265068049254698, + "grad_norm": 0.9306924343109131, + "learning_rate": 4.432734183863837e-06, + "loss": 0.1426, + "step": 4484 + }, + { + "epoch": 0.7266688269604666, + "grad_norm": 0.9259305596351624, + "learning_rate": 4.432456775863006e-06, + "loss": 0.1332, + "step": 4485 + }, + { + "epoch": 0.7268308489954634, + "grad_norm": 0.8822481036186218, + "learning_rate": 4.432179308733674e-06, + "loss": 0.1458, + "step": 4486 + }, + { + "epoch": 0.7269928710304602, + "grad_norm": 0.922392725944519, + "learning_rate": 4.4319017824843315e-06, + "loss": 0.1534, + "step": 4487 + }, + { + "epoch": 0.7271548930654569, + "grad_norm": 0.8622284531593323, + "learning_rate": 4.43162419712347e-06, + "loss": 0.1243, + "step": 4488 + }, + { + "epoch": 0.7273169151004537, + "grad_norm": 0.9705631136894226, + "learning_rate": 4.431346552659581e-06, + "loss": 0.1426, + "step": 4489 + }, + { + "epoch": 0.7274789371354504, + "grad_norm": 0.9161760807037354, + "learning_rate": 4.431068849101162e-06, + "loss": 0.1513, + "step": 4490 + }, + { + "epoch": 0.7276409591704471, + "grad_norm": 0.9033561944961548, + "learning_rate": 4.430791086456709e-06, + "loss": 0.133, + "step": 4491 + }, + { + "epoch": 0.727802981205444, + "grad_norm": 0.9011300206184387, + "learning_rate": 4.4305132647347215e-06, + "loss": 0.1466, + "step": 4492 + }, + { + "epoch": 0.7279650032404407, + "grad_norm": 1.0322880744934082, + "learning_rate": 4.4302353839437e-06, + "loss": 0.1472, + "step": 4493 + }, + { + "epoch": 0.7281270252754375, + "grad_norm": 0.9922388195991516, + "learning_rate": 4.429957444092146e-06, + "loss": 0.1596, + "step": 4494 + }, + { + "epoch": 0.7282890473104342, + "grad_norm": 1.0610861778259277, + "learning_rate": 4.4296794451885665e-06, + "loss": 0.178, + "step": 4495 + }, + { + "epoch": 0.728451069345431, + "grad_norm": 0.9445720314979553, + "learning_rate": 4.429401387241464e-06, + "loss": 0.1644, + "step": 4496 + }, + { + "epoch": 0.7286130913804277, + "grad_norm": 1.009913444519043, + "learning_rate": 4.429123270259348e-06, + "loss": 0.1726, + "step": 4497 + }, + { + "epoch": 0.7287751134154244, + "grad_norm": 0.9682244658470154, + "learning_rate": 4.428845094250729e-06, + "loss": 0.1474, + "step": 4498 + }, + { + "epoch": 0.7289371354504213, + "grad_norm": 0.9171390533447266, + "learning_rate": 4.4285668592241186e-06, + "loss": 0.1586, + "step": 4499 + }, + { + "epoch": 0.729099157485418, + "grad_norm": 0.9223143458366394, + "learning_rate": 4.428288565188028e-06, + "loss": 0.1357, + "step": 4500 + }, + { + "epoch": 0.7292611795204148, + "grad_norm": 0.9437743425369263, + "learning_rate": 4.4280102121509734e-06, + "loss": 0.1514, + "step": 4501 + }, + { + "epoch": 0.7294232015554115, + "grad_norm": 0.8216745853424072, + "learning_rate": 4.427731800121473e-06, + "loss": 0.1336, + "step": 4502 + }, + { + "epoch": 0.7295852235904083, + "grad_norm": 0.8366663455963135, + "learning_rate": 4.427453329108045e-06, + "loss": 0.1307, + "step": 4503 + }, + { + "epoch": 0.729747245625405, + "grad_norm": 0.8991605043411255, + "learning_rate": 4.427174799119208e-06, + "loss": 0.151, + "step": 4504 + }, + { + "epoch": 0.7299092676604018, + "grad_norm": 0.7747397422790527, + "learning_rate": 4.426896210163487e-06, + "loss": 0.1273, + "step": 4505 + }, + { + "epoch": 0.7300712896953986, + "grad_norm": 1.0024263858795166, + "learning_rate": 4.426617562249405e-06, + "loss": 0.1545, + "step": 4506 + }, + { + "epoch": 0.7302333117303953, + "grad_norm": 0.8834472298622131, + "learning_rate": 4.426338855385487e-06, + "loss": 0.1525, + "step": 4507 + }, + { + "epoch": 0.7303953337653921, + "grad_norm": 0.8129113912582397, + "learning_rate": 4.426060089580262e-06, + "loss": 0.1393, + "step": 4508 + }, + { + "epoch": 0.7305573558003888, + "grad_norm": 1.0072026252746582, + "learning_rate": 4.42578126484226e-06, + "loss": 0.1591, + "step": 4509 + }, + { + "epoch": 0.7307193778353857, + "grad_norm": 0.9830167889595032, + "learning_rate": 4.42550238118001e-06, + "loss": 0.1498, + "step": 4510 + }, + { + "epoch": 0.7308813998703824, + "grad_norm": 0.8229345083236694, + "learning_rate": 4.425223438602047e-06, + "loss": 0.133, + "step": 4511 + }, + { + "epoch": 0.7310434219053791, + "grad_norm": 0.8896669745445251, + "learning_rate": 4.424944437116907e-06, + "loss": 0.1397, + "step": 4512 + }, + { + "epoch": 0.7312054439403759, + "grad_norm": 0.9277743101119995, + "learning_rate": 4.424665376733125e-06, + "loss": 0.1421, + "step": 4513 + }, + { + "epoch": 0.7313674659753726, + "grad_norm": 0.9299547076225281, + "learning_rate": 4.424386257459241e-06, + "loss": 0.1507, + "step": 4514 + }, + { + "epoch": 0.7315294880103694, + "grad_norm": 0.9168359637260437, + "learning_rate": 4.424107079303793e-06, + "loss": 0.149, + "step": 4515 + }, + { + "epoch": 0.7316915100453661, + "grad_norm": 0.994986891746521, + "learning_rate": 4.423827842275325e-06, + "loss": 0.1692, + "step": 4516 + }, + { + "epoch": 0.731853532080363, + "grad_norm": 1.0216580629348755, + "learning_rate": 4.42354854638238e-06, + "loss": 0.1601, + "step": 4517 + }, + { + "epoch": 0.7320155541153597, + "grad_norm": 0.8891658782958984, + "learning_rate": 4.4232691916335055e-06, + "loss": 0.1446, + "step": 4518 + }, + { + "epoch": 0.7321775761503565, + "grad_norm": 0.8627836108207703, + "learning_rate": 4.422989778037248e-06, + "loss": 0.1371, + "step": 4519 + }, + { + "epoch": 0.7323395981853532, + "grad_norm": 0.8696764707565308, + "learning_rate": 4.422710305602156e-06, + "loss": 0.1213, + "step": 4520 + }, + { + "epoch": 0.7325016202203499, + "grad_norm": 0.8861494064331055, + "learning_rate": 4.422430774336782e-06, + "loss": 0.1454, + "step": 4521 + }, + { + "epoch": 0.7326636422553467, + "grad_norm": 0.9271621108055115, + "learning_rate": 4.422151184249679e-06, + "loss": 0.1513, + "step": 4522 + }, + { + "epoch": 0.7328256642903435, + "grad_norm": 0.8663860559463501, + "learning_rate": 4.4218715353494e-06, + "loss": 0.1484, + "step": 4523 + }, + { + "epoch": 0.7329876863253403, + "grad_norm": 0.8715428113937378, + "learning_rate": 4.421591827644503e-06, + "loss": 0.1486, + "step": 4524 + }, + { + "epoch": 0.733149708360337, + "grad_norm": 0.8773646950721741, + "learning_rate": 4.4213120611435475e-06, + "loss": 0.1462, + "step": 4525 + }, + { + "epoch": 0.7333117303953338, + "grad_norm": 0.775646984577179, + "learning_rate": 4.4210322358550915e-06, + "loss": 0.1371, + "step": 4526 + }, + { + "epoch": 0.7334737524303305, + "grad_norm": 0.9878357648849487, + "learning_rate": 4.420752351787698e-06, + "loss": 0.1479, + "step": 4527 + }, + { + "epoch": 0.7336357744653272, + "grad_norm": 1.0782580375671387, + "learning_rate": 4.420472408949931e-06, + "loss": 0.1573, + "step": 4528 + }, + { + "epoch": 0.7337977965003241, + "grad_norm": 0.8356068134307861, + "learning_rate": 4.420192407350355e-06, + "loss": 0.1349, + "step": 4529 + }, + { + "epoch": 0.7339598185353208, + "grad_norm": 0.8317657113075256, + "learning_rate": 4.419912346997539e-06, + "loss": 0.122, + "step": 4530 + }, + { + "epoch": 0.7341218405703176, + "grad_norm": 0.8698350787162781, + "learning_rate": 4.4196322279000506e-06, + "loss": 0.1496, + "step": 4531 + }, + { + "epoch": 0.7342838626053143, + "grad_norm": 0.7942752242088318, + "learning_rate": 4.419352050066462e-06, + "loss": 0.1128, + "step": 4532 + }, + { + "epoch": 0.7344458846403111, + "grad_norm": 1.0515409708023071, + "learning_rate": 4.419071813505345e-06, + "loss": 0.165, + "step": 4533 + }, + { + "epoch": 0.7346079066753078, + "grad_norm": 0.9110780358314514, + "learning_rate": 4.418791518225275e-06, + "loss": 0.1305, + "step": 4534 + }, + { + "epoch": 0.7347699287103046, + "grad_norm": 0.7942949533462524, + "learning_rate": 4.4185111642348276e-06, + "loss": 0.1108, + "step": 4535 + }, + { + "epoch": 0.7349319507453014, + "grad_norm": 0.920792818069458, + "learning_rate": 4.418230751542581e-06, + "loss": 0.1577, + "step": 4536 + }, + { + "epoch": 0.7350939727802981, + "grad_norm": 1.1174674034118652, + "learning_rate": 4.417950280157115e-06, + "loss": 0.1558, + "step": 4537 + }, + { + "epoch": 0.7352559948152949, + "grad_norm": 0.9814813137054443, + "learning_rate": 4.417669750087014e-06, + "loss": 0.1552, + "step": 4538 + }, + { + "epoch": 0.7354180168502916, + "grad_norm": 0.7974295020103455, + "learning_rate": 4.417389161340857e-06, + "loss": 0.1274, + "step": 4539 + }, + { + "epoch": 0.7355800388852884, + "grad_norm": 0.9223327040672302, + "learning_rate": 4.417108513927233e-06, + "loss": 0.1425, + "step": 4540 + }, + { + "epoch": 0.7357420609202852, + "grad_norm": 0.8545518517494202, + "learning_rate": 4.416827807854727e-06, + "loss": 0.1264, + "step": 4541 + }, + { + "epoch": 0.7359040829552819, + "grad_norm": 0.8425375819206238, + "learning_rate": 4.416547043131929e-06, + "loss": 0.1276, + "step": 4542 + }, + { + "epoch": 0.7360661049902787, + "grad_norm": 0.8444299697875977, + "learning_rate": 4.416266219767429e-06, + "loss": 0.1357, + "step": 4543 + }, + { + "epoch": 0.7362281270252754, + "grad_norm": 0.8412787914276123, + "learning_rate": 4.41598533776982e-06, + "loss": 0.135, + "step": 4544 + }, + { + "epoch": 0.7363901490602722, + "grad_norm": 0.9908415079116821, + "learning_rate": 4.415704397147698e-06, + "loss": 0.1655, + "step": 4545 + }, + { + "epoch": 0.7365521710952689, + "grad_norm": 0.8918123245239258, + "learning_rate": 4.415423397909655e-06, + "loss": 0.1396, + "step": 4546 + }, + { + "epoch": 0.7367141931302658, + "grad_norm": 0.8907253742218018, + "learning_rate": 4.4151423400642925e-06, + "loss": 0.1421, + "step": 4547 + }, + { + "epoch": 0.7368762151652625, + "grad_norm": 0.9191277027130127, + "learning_rate": 4.414861223620209e-06, + "loss": 0.1483, + "step": 4548 + }, + { + "epoch": 0.7370382372002592, + "grad_norm": 0.7574411034584045, + "learning_rate": 4.414580048586005e-06, + "loss": 0.1211, + "step": 4549 + }, + { + "epoch": 0.737200259235256, + "grad_norm": 0.7837011218070984, + "learning_rate": 4.414298814970286e-06, + "loss": 0.1293, + "step": 4550 + }, + { + "epoch": 0.7373622812702527, + "grad_norm": 0.8395361304283142, + "learning_rate": 4.414017522781655e-06, + "loss": 0.1331, + "step": 4551 + }, + { + "epoch": 0.7375243033052495, + "grad_norm": 0.9138143658638, + "learning_rate": 4.41373617202872e-06, + "loss": 0.1533, + "step": 4552 + }, + { + "epoch": 0.7376863253402463, + "grad_norm": 0.8615904450416565, + "learning_rate": 4.413454762720088e-06, + "loss": 0.1347, + "step": 4553 + }, + { + "epoch": 0.7378483473752431, + "grad_norm": 0.8940994739532471, + "learning_rate": 4.413173294864373e-06, + "loss": 0.134, + "step": 4554 + }, + { + "epoch": 0.7380103694102398, + "grad_norm": 0.9082682132720947, + "learning_rate": 4.412891768470183e-06, + "loss": 0.1427, + "step": 4555 + }, + { + "epoch": 0.7381723914452365, + "grad_norm": 0.978569507598877, + "learning_rate": 4.412610183546135e-06, + "loss": 0.149, + "step": 4556 + }, + { + "epoch": 0.7383344134802333, + "grad_norm": 0.9564186930656433, + "learning_rate": 4.412328540100843e-06, + "loss": 0.1526, + "step": 4557 + }, + { + "epoch": 0.73849643551523, + "grad_norm": 0.756712794303894, + "learning_rate": 4.412046838142927e-06, + "loss": 0.1162, + "step": 4558 + }, + { + "epoch": 0.7386584575502269, + "grad_norm": 0.9216713309288025, + "learning_rate": 4.411765077681003e-06, + "loss": 0.1544, + "step": 4559 + }, + { + "epoch": 0.7388204795852236, + "grad_norm": 0.7456929087638855, + "learning_rate": 4.411483258723695e-06, + "loss": 0.1176, + "step": 4560 + }, + { + "epoch": 0.7389825016202204, + "grad_norm": 0.8160392045974731, + "learning_rate": 4.411201381279625e-06, + "loss": 0.1369, + "step": 4561 + }, + { + "epoch": 0.7391445236552171, + "grad_norm": 0.8832926750183105, + "learning_rate": 4.410919445357418e-06, + "loss": 0.1453, + "step": 4562 + }, + { + "epoch": 0.7393065456902139, + "grad_norm": 0.9234940409660339, + "learning_rate": 4.410637450965699e-06, + "loss": 0.1416, + "step": 4563 + }, + { + "epoch": 0.7394685677252106, + "grad_norm": 0.9423860311508179, + "learning_rate": 4.410355398113099e-06, + "loss": 0.1502, + "step": 4564 + }, + { + "epoch": 0.7396305897602073, + "grad_norm": 0.9716334939002991, + "learning_rate": 4.410073286808247e-06, + "loss": 0.1462, + "step": 4565 + }, + { + "epoch": 0.7397926117952042, + "grad_norm": 0.8593721985816956, + "learning_rate": 4.409791117059773e-06, + "loss": 0.1287, + "step": 4566 + }, + { + "epoch": 0.7399546338302009, + "grad_norm": 0.9580432176589966, + "learning_rate": 4.409508888876313e-06, + "loss": 0.1527, + "step": 4567 + }, + { + "epoch": 0.7401166558651977, + "grad_norm": 0.8143693804740906, + "learning_rate": 4.409226602266503e-06, + "loss": 0.1272, + "step": 4568 + }, + { + "epoch": 0.7402786779001944, + "grad_norm": 0.8784132599830627, + "learning_rate": 4.408944257238979e-06, + "loss": 0.1321, + "step": 4569 + }, + { + "epoch": 0.7404406999351912, + "grad_norm": 0.9013129472732544, + "learning_rate": 4.408661853802379e-06, + "loss": 0.1524, + "step": 4570 + }, + { + "epoch": 0.740602721970188, + "grad_norm": 0.8545945286750793, + "learning_rate": 4.408379391965346e-06, + "loss": 0.1461, + "step": 4571 + }, + { + "epoch": 0.7407647440051847, + "grad_norm": 0.9265003800392151, + "learning_rate": 4.408096871736522e-06, + "loss": 0.1592, + "step": 4572 + }, + { + "epoch": 0.7409267660401815, + "grad_norm": 0.901270866394043, + "learning_rate": 4.407814293124551e-06, + "loss": 0.1615, + "step": 4573 + }, + { + "epoch": 0.7410887880751782, + "grad_norm": 0.9868318438529968, + "learning_rate": 4.407531656138079e-06, + "loss": 0.1498, + "step": 4574 + }, + { + "epoch": 0.741250810110175, + "grad_norm": 0.847172737121582, + "learning_rate": 4.407248960785756e-06, + "loss": 0.1408, + "step": 4575 + }, + { + "epoch": 0.7414128321451717, + "grad_norm": 0.941987156867981, + "learning_rate": 4.406966207076229e-06, + "loss": 0.1502, + "step": 4576 + }, + { + "epoch": 0.7415748541801686, + "grad_norm": 0.8444817662239075, + "learning_rate": 4.406683395018151e-06, + "loss": 0.1493, + "step": 4577 + }, + { + "epoch": 0.7417368762151653, + "grad_norm": 0.7697441577911377, + "learning_rate": 4.406400524620174e-06, + "loss": 0.1207, + "step": 4578 + }, + { + "epoch": 0.741898898250162, + "grad_norm": 0.8291786909103394, + "learning_rate": 4.406117595890956e-06, + "loss": 0.1369, + "step": 4579 + }, + { + "epoch": 0.7420609202851588, + "grad_norm": 0.8335320353507996, + "learning_rate": 4.405834608839152e-06, + "loss": 0.1313, + "step": 4580 + }, + { + "epoch": 0.7422229423201555, + "grad_norm": 0.8066971302032471, + "learning_rate": 4.405551563473421e-06, + "loss": 0.1419, + "step": 4581 + }, + { + "epoch": 0.7423849643551523, + "grad_norm": 0.9154806137084961, + "learning_rate": 4.405268459802423e-06, + "loss": 0.1421, + "step": 4582 + }, + { + "epoch": 0.742546986390149, + "grad_norm": 0.8322879672050476, + "learning_rate": 4.404985297834821e-06, + "loss": 0.1335, + "step": 4583 + }, + { + "epoch": 0.7427090084251459, + "grad_norm": 0.9182136058807373, + "learning_rate": 4.404702077579279e-06, + "loss": 0.1531, + "step": 4584 + }, + { + "epoch": 0.7428710304601426, + "grad_norm": 0.9670420289039612, + "learning_rate": 4.404418799044463e-06, + "loss": 0.1728, + "step": 4585 + }, + { + "epoch": 0.7430330524951393, + "grad_norm": 1.0733063220977783, + "learning_rate": 4.4041354622390395e-06, + "loss": 0.1469, + "step": 4586 + }, + { + "epoch": 0.7431950745301361, + "grad_norm": 0.7664371132850647, + "learning_rate": 4.40385206717168e-06, + "loss": 0.1228, + "step": 4587 + }, + { + "epoch": 0.7433570965651328, + "grad_norm": 0.8765845894813538, + "learning_rate": 4.403568613851054e-06, + "loss": 0.1436, + "step": 4588 + }, + { + "epoch": 0.7435191186001296, + "grad_norm": 0.9102659225463867, + "learning_rate": 4.403285102285835e-06, + "loss": 0.148, + "step": 4589 + }, + { + "epoch": 0.7436811406351264, + "grad_norm": 1.0098685026168823, + "learning_rate": 4.403001532484697e-06, + "loss": 0.1589, + "step": 4590 + }, + { + "epoch": 0.7438431626701232, + "grad_norm": 0.912421703338623, + "learning_rate": 4.402717904456318e-06, + "loss": 0.1474, + "step": 4591 + }, + { + "epoch": 0.7440051847051199, + "grad_norm": 0.960193932056427, + "learning_rate": 4.4024342182093745e-06, + "loss": 0.1535, + "step": 4592 + }, + { + "epoch": 0.7441672067401166, + "grad_norm": 0.8692451119422913, + "learning_rate": 4.402150473752549e-06, + "loss": 0.1506, + "step": 4593 + }, + { + "epoch": 0.7443292287751134, + "grad_norm": 0.9402160048484802, + "learning_rate": 4.401866671094522e-06, + "loss": 0.156, + "step": 4594 + }, + { + "epoch": 0.7444912508101101, + "grad_norm": 0.8891216516494751, + "learning_rate": 4.401582810243977e-06, + "loss": 0.1464, + "step": 4595 + }, + { + "epoch": 0.744653272845107, + "grad_norm": 0.9088370203971863, + "learning_rate": 4.4012988912096e-06, + "loss": 0.137, + "step": 4596 + }, + { + "epoch": 0.7448152948801037, + "grad_norm": 0.8495668172836304, + "learning_rate": 4.401014914000078e-06, + "loss": 0.1337, + "step": 4597 + }, + { + "epoch": 0.7449773169151005, + "grad_norm": 0.8261443376541138, + "learning_rate": 4.4007308786241e-06, + "loss": 0.1217, + "step": 4598 + }, + { + "epoch": 0.7451393389500972, + "grad_norm": 0.8053951263427734, + "learning_rate": 4.400446785090356e-06, + "loss": 0.1249, + "step": 4599 + }, + { + "epoch": 0.7453013609850939, + "grad_norm": 0.8850007653236389, + "learning_rate": 4.40016263340754e-06, + "loss": 0.1391, + "step": 4600 + }, + { + "epoch": 0.7454633830200907, + "grad_norm": 0.8763007521629333, + "learning_rate": 4.399878423584345e-06, + "loss": 0.1301, + "step": 4601 + }, + { + "epoch": 0.7456254050550875, + "grad_norm": 0.9268038272857666, + "learning_rate": 4.399594155629469e-06, + "loss": 0.135, + "step": 4602 + }, + { + "epoch": 0.7457874270900843, + "grad_norm": 0.8517520427703857, + "learning_rate": 4.3993098295516085e-06, + "loss": 0.1419, + "step": 4603 + }, + { + "epoch": 0.745949449125081, + "grad_norm": 0.8787991404533386, + "learning_rate": 4.3990254453594634e-06, + "loss": 0.1233, + "step": 4604 + }, + { + "epoch": 0.7461114711600778, + "grad_norm": 0.9639225602149963, + "learning_rate": 4.398741003061735e-06, + "loss": 0.1407, + "step": 4605 + }, + { + "epoch": 0.7462734931950745, + "grad_norm": 0.8817579746246338, + "learning_rate": 4.398456502667127e-06, + "loss": 0.134, + "step": 4606 + }, + { + "epoch": 0.7464355152300713, + "grad_norm": 0.9382551908493042, + "learning_rate": 4.398171944184344e-06, + "loss": 0.1614, + "step": 4607 + }, + { + "epoch": 0.7465975372650681, + "grad_norm": 0.917212963104248, + "learning_rate": 4.397887327622093e-06, + "loss": 0.1304, + "step": 4608 + }, + { + "epoch": 0.7467595593000648, + "grad_norm": 1.028524398803711, + "learning_rate": 4.397602652989083e-06, + "loss": 0.1568, + "step": 4609 + }, + { + "epoch": 0.7469215813350616, + "grad_norm": 0.9571022391319275, + "learning_rate": 4.397317920294023e-06, + "loss": 0.1503, + "step": 4610 + }, + { + "epoch": 0.7470836033700583, + "grad_norm": 0.9256038069725037, + "learning_rate": 4.397033129545627e-06, + "loss": 0.1341, + "step": 4611 + }, + { + "epoch": 0.7472456254050551, + "grad_norm": 0.8719132542610168, + "learning_rate": 4.396748280752608e-06, + "loss": 0.1495, + "step": 4612 + }, + { + "epoch": 0.7474076474400518, + "grad_norm": 0.8998635411262512, + "learning_rate": 4.39646337392368e-06, + "loss": 0.1421, + "step": 4613 + }, + { + "epoch": 0.7475696694750487, + "grad_norm": 0.9811137914657593, + "learning_rate": 4.396178409067564e-06, + "loss": 0.1531, + "step": 4614 + }, + { + "epoch": 0.7477316915100454, + "grad_norm": 0.962475061416626, + "learning_rate": 4.395893386192976e-06, + "loss": 0.1606, + "step": 4615 + }, + { + "epoch": 0.7478937135450421, + "grad_norm": 0.8668140172958374, + "learning_rate": 4.395608305308639e-06, + "loss": 0.1432, + "step": 4616 + }, + { + "epoch": 0.7480557355800389, + "grad_norm": 1.009864330291748, + "learning_rate": 4.3953231664232755e-06, + "loss": 0.1518, + "step": 4617 + }, + { + "epoch": 0.7482177576150356, + "grad_norm": 1.0338926315307617, + "learning_rate": 4.395037969545609e-06, + "loss": 0.1602, + "step": 4618 + }, + { + "epoch": 0.7483797796500324, + "grad_norm": 0.937615692615509, + "learning_rate": 4.394752714684367e-06, + "loss": 0.1508, + "step": 4619 + }, + { + "epoch": 0.7485418016850292, + "grad_norm": 1.010367512702942, + "learning_rate": 4.394467401848277e-06, + "loss": 0.1512, + "step": 4620 + }, + { + "epoch": 0.748703823720026, + "grad_norm": 0.8209222555160522, + "learning_rate": 4.394182031046069e-06, + "loss": 0.1412, + "step": 4621 + }, + { + "epoch": 0.7488658457550227, + "grad_norm": 0.8631600141525269, + "learning_rate": 4.393896602286475e-06, + "loss": 0.1371, + "step": 4622 + }, + { + "epoch": 0.7490278677900194, + "grad_norm": 0.7968021631240845, + "learning_rate": 4.393611115578228e-06, + "loss": 0.1263, + "step": 4623 + }, + { + "epoch": 0.7491898898250162, + "grad_norm": 0.9319000244140625, + "learning_rate": 4.3933255709300635e-06, + "loss": 0.158, + "step": 4624 + }, + { + "epoch": 0.7493519118600129, + "grad_norm": 0.8134043216705322, + "learning_rate": 4.393039968350718e-06, + "loss": 0.1389, + "step": 4625 + }, + { + "epoch": 0.7495139338950098, + "grad_norm": 0.968559205532074, + "learning_rate": 4.3927543078489295e-06, + "loss": 0.1761, + "step": 4626 + }, + { + "epoch": 0.7496759559300065, + "grad_norm": 0.8765543103218079, + "learning_rate": 4.392468589433441e-06, + "loss": 0.1496, + "step": 4627 + }, + { + "epoch": 0.7498379779650033, + "grad_norm": 0.9812090992927551, + "learning_rate": 4.392182813112993e-06, + "loss": 0.1566, + "step": 4628 + }, + { + "epoch": 0.75, + "grad_norm": 0.9258951544761658, + "learning_rate": 4.3918969788963295e-06, + "loss": 0.1479, + "step": 4629 + }, + { + "epoch": 0.7501620220349967, + "grad_norm": 0.8851484656333923, + "learning_rate": 4.391611086792198e-06, + "loss": 0.1463, + "step": 4630 + }, + { + "epoch": 0.7503240440699935, + "grad_norm": 1.0019028186798096, + "learning_rate": 4.391325136809344e-06, + "loss": 0.1457, + "step": 4631 + }, + { + "epoch": 0.7504860661049902, + "grad_norm": 0.8665192723274231, + "learning_rate": 4.391039128956517e-06, + "loss": 0.1279, + "step": 4632 + }, + { + "epoch": 0.7506480881399871, + "grad_norm": 0.8466840982437134, + "learning_rate": 4.39075306324247e-06, + "loss": 0.1273, + "step": 4633 + }, + { + "epoch": 0.7508101101749838, + "grad_norm": 0.9541529417037964, + "learning_rate": 4.390466939675954e-06, + "loss": 0.1667, + "step": 4634 + }, + { + "epoch": 0.7509721322099806, + "grad_norm": 1.0194523334503174, + "learning_rate": 4.390180758265725e-06, + "loss": 0.1509, + "step": 4635 + }, + { + "epoch": 0.7511341542449773, + "grad_norm": 0.9222609996795654, + "learning_rate": 4.389894519020539e-06, + "loss": 0.1455, + "step": 4636 + }, + { + "epoch": 0.751296176279974, + "grad_norm": 0.9126465320587158, + "learning_rate": 4.389608221949153e-06, + "loss": 0.1303, + "step": 4637 + }, + { + "epoch": 0.7514581983149708, + "grad_norm": 0.8073568344116211, + "learning_rate": 4.38932186706033e-06, + "loss": 0.1254, + "step": 4638 + }, + { + "epoch": 0.7516202203499676, + "grad_norm": 0.9920159578323364, + "learning_rate": 4.389035454362829e-06, + "loss": 0.1476, + "step": 4639 + }, + { + "epoch": 0.7517822423849644, + "grad_norm": 0.8352819085121155, + "learning_rate": 4.388748983865414e-06, + "loss": 0.1289, + "step": 4640 + }, + { + "epoch": 0.7519442644199611, + "grad_norm": 0.9489824175834656, + "learning_rate": 4.388462455576852e-06, + "loss": 0.1533, + "step": 4641 + }, + { + "epoch": 0.7521062864549579, + "grad_norm": 0.9182109832763672, + "learning_rate": 4.388175869505908e-06, + "loss": 0.1494, + "step": 4642 + }, + { + "epoch": 0.7522683084899546, + "grad_norm": 0.9489102959632874, + "learning_rate": 4.387889225661352e-06, + "loss": 0.1317, + "step": 4643 + }, + { + "epoch": 0.7524303305249513, + "grad_norm": 0.887995183467865, + "learning_rate": 4.387602524051954e-06, + "loss": 0.1433, + "step": 4644 + }, + { + "epoch": 0.7525923525599482, + "grad_norm": 0.934482753276825, + "learning_rate": 4.387315764686487e-06, + "loss": 0.1458, + "step": 4645 + }, + { + "epoch": 0.7527543745949449, + "grad_norm": 0.757559061050415, + "learning_rate": 4.387028947573724e-06, + "loss": 0.1199, + "step": 4646 + }, + { + "epoch": 0.7529163966299417, + "grad_norm": 0.8067179322242737, + "learning_rate": 4.386742072722443e-06, + "loss": 0.1321, + "step": 4647 + }, + { + "epoch": 0.7530784186649384, + "grad_norm": 0.9727403521537781, + "learning_rate": 4.3864551401414195e-06, + "loss": 0.1172, + "step": 4648 + }, + { + "epoch": 0.7532404406999352, + "grad_norm": 0.9205012917518616, + "learning_rate": 4.386168149839434e-06, + "loss": 0.1555, + "step": 4649 + }, + { + "epoch": 0.7534024627349319, + "grad_norm": 0.8873549103736877, + "learning_rate": 4.385881101825268e-06, + "loss": 0.1476, + "step": 4650 + }, + { + "epoch": 0.7535644847699287, + "grad_norm": 0.9281837940216064, + "learning_rate": 4.3855939961077034e-06, + "loss": 0.1385, + "step": 4651 + }, + { + "epoch": 0.7537265068049255, + "grad_norm": 1.0171685218811035, + "learning_rate": 4.385306832695526e-06, + "loss": 0.1635, + "step": 4652 + }, + { + "epoch": 0.7538885288399222, + "grad_norm": 0.8333146572113037, + "learning_rate": 4.385019611597522e-06, + "loss": 0.1398, + "step": 4653 + }, + { + "epoch": 0.754050550874919, + "grad_norm": 0.9079596400260925, + "learning_rate": 4.384732332822479e-06, + "loss": 0.1396, + "step": 4654 + }, + { + "epoch": 0.7542125729099157, + "grad_norm": 0.8788375854492188, + "learning_rate": 4.384444996379188e-06, + "loss": 0.1384, + "step": 4655 + }, + { + "epoch": 0.7543745949449125, + "grad_norm": 0.8239230513572693, + "learning_rate": 4.38415760227644e-06, + "loss": 0.1278, + "step": 4656 + }, + { + "epoch": 0.7545366169799093, + "grad_norm": 0.8959946036338806, + "learning_rate": 4.383870150523029e-06, + "loss": 0.1499, + "step": 4657 + }, + { + "epoch": 0.7546986390149061, + "grad_norm": 0.8564679026603699, + "learning_rate": 4.38358264112775e-06, + "loss": 0.1381, + "step": 4658 + }, + { + "epoch": 0.7548606610499028, + "grad_norm": 0.8459985256195068, + "learning_rate": 4.383295074099402e-06, + "loss": 0.1358, + "step": 4659 + }, + { + "epoch": 0.7550226830848995, + "grad_norm": 0.9368695020675659, + "learning_rate": 4.3830074494467815e-06, + "loss": 0.1419, + "step": 4660 + }, + { + "epoch": 0.7551847051198963, + "grad_norm": 0.8994658589363098, + "learning_rate": 4.382719767178689e-06, + "loss": 0.1558, + "step": 4661 + }, + { + "epoch": 0.755346727154893, + "grad_norm": 0.9667396545410156, + "learning_rate": 4.382432027303928e-06, + "loss": 0.1627, + "step": 4662 + }, + { + "epoch": 0.7555087491898899, + "grad_norm": 0.8609565496444702, + "learning_rate": 4.382144229831302e-06, + "loss": 0.1351, + "step": 4663 + }, + { + "epoch": 0.7556707712248866, + "grad_norm": 0.845554769039154, + "learning_rate": 4.381856374769617e-06, + "loss": 0.1273, + "step": 4664 + }, + { + "epoch": 0.7558327932598834, + "grad_norm": 0.9373748302459717, + "learning_rate": 4.3815684621276824e-06, + "loss": 0.1663, + "step": 4665 + }, + { + "epoch": 0.7559948152948801, + "grad_norm": 0.8447197079658508, + "learning_rate": 4.3812804919143055e-06, + "loss": 0.1337, + "step": 4666 + }, + { + "epoch": 0.7561568373298768, + "grad_norm": 0.8827223777770996, + "learning_rate": 4.380992464138298e-06, + "loss": 0.1419, + "step": 4667 + }, + { + "epoch": 0.7563188593648736, + "grad_norm": 0.8952583074569702, + "learning_rate": 4.380704378808473e-06, + "loss": 0.1249, + "step": 4668 + }, + { + "epoch": 0.7564808813998704, + "grad_norm": 0.8964098691940308, + "learning_rate": 4.380416235933646e-06, + "loss": 0.1544, + "step": 4669 + }, + { + "epoch": 0.7566429034348672, + "grad_norm": 1.0078829526901245, + "learning_rate": 4.380128035522632e-06, + "loss": 0.1629, + "step": 4670 + }, + { + "epoch": 0.7568049254698639, + "grad_norm": 0.9272462725639343, + "learning_rate": 4.379839777584249e-06, + "loss": 0.158, + "step": 4671 + }, + { + "epoch": 0.7569669475048607, + "grad_norm": 0.9298140406608582, + "learning_rate": 4.379551462127319e-06, + "loss": 0.1442, + "step": 4672 + }, + { + "epoch": 0.7571289695398574, + "grad_norm": 1.0363284349441528, + "learning_rate": 4.3792630891606635e-06, + "loss": 0.1739, + "step": 4673 + }, + { + "epoch": 0.7572909915748541, + "grad_norm": 0.9505801796913147, + "learning_rate": 4.3789746586931034e-06, + "loss": 0.1657, + "step": 4674 + }, + { + "epoch": 0.757453013609851, + "grad_norm": 1.0468703508377075, + "learning_rate": 4.3786861707334676e-06, + "loss": 0.1587, + "step": 4675 + }, + { + "epoch": 0.7576150356448477, + "grad_norm": 0.8523300886154175, + "learning_rate": 4.37839762529058e-06, + "loss": 0.1465, + "step": 4676 + }, + { + "epoch": 0.7577770576798445, + "grad_norm": 0.9114488363265991, + "learning_rate": 4.378109022373272e-06, + "loss": 0.127, + "step": 4677 + }, + { + "epoch": 0.7579390797148412, + "grad_norm": 1.036751627922058, + "learning_rate": 4.3778203619903716e-06, + "loss": 0.1341, + "step": 4678 + }, + { + "epoch": 0.758101101749838, + "grad_norm": 1.0055261850357056, + "learning_rate": 4.377531644150712e-06, + "loss": 0.1549, + "step": 4679 + }, + { + "epoch": 0.7582631237848347, + "grad_norm": 1.0141489505767822, + "learning_rate": 4.3772428688631285e-06, + "loss": 0.1552, + "step": 4680 + }, + { + "epoch": 0.7584251458198314, + "grad_norm": 0.9310604929924011, + "learning_rate": 4.376954036136456e-06, + "loss": 0.1502, + "step": 4681 + }, + { + "epoch": 0.7585871678548283, + "grad_norm": 0.7873426675796509, + "learning_rate": 4.376665145979532e-06, + "loss": 0.1249, + "step": 4682 + }, + { + "epoch": 0.758749189889825, + "grad_norm": 0.8380732536315918, + "learning_rate": 4.376376198401195e-06, + "loss": 0.154, + "step": 4683 + }, + { + "epoch": 0.7589112119248218, + "grad_norm": 0.762469470500946, + "learning_rate": 4.376087193410289e-06, + "loss": 0.1214, + "step": 4684 + }, + { + "epoch": 0.7590732339598185, + "grad_norm": 0.868675947189331, + "learning_rate": 4.375798131015654e-06, + "loss": 0.1446, + "step": 4685 + }, + { + "epoch": 0.7592352559948153, + "grad_norm": 0.998455286026001, + "learning_rate": 4.375509011226135e-06, + "loss": 0.1611, + "step": 4686 + }, + { + "epoch": 0.759397278029812, + "grad_norm": 1.0500266551971436, + "learning_rate": 4.3752198340505795e-06, + "loss": 0.1398, + "step": 4687 + }, + { + "epoch": 0.7595593000648088, + "grad_norm": 0.8594135642051697, + "learning_rate": 4.374930599497835e-06, + "loss": 0.1354, + "step": 4688 + }, + { + "epoch": 0.7597213220998056, + "grad_norm": 0.8801107406616211, + "learning_rate": 4.374641307576751e-06, + "loss": 0.1432, + "step": 4689 + }, + { + "epoch": 0.7598833441348023, + "grad_norm": 0.8573765158653259, + "learning_rate": 4.37435195829618e-06, + "loss": 0.1404, + "step": 4690 + }, + { + "epoch": 0.7600453661697991, + "grad_norm": 0.8854019045829773, + "learning_rate": 4.3740625516649755e-06, + "loss": 0.1406, + "step": 4691 + }, + { + "epoch": 0.7602073882047958, + "grad_norm": 0.899628758430481, + "learning_rate": 4.373773087691992e-06, + "loss": 0.1331, + "step": 4692 + }, + { + "epoch": 0.7603694102397927, + "grad_norm": 0.9409993886947632, + "learning_rate": 4.373483566386086e-06, + "loss": 0.1582, + "step": 4693 + }, + { + "epoch": 0.7605314322747894, + "grad_norm": 0.9930229783058167, + "learning_rate": 4.373193987756116e-06, + "loss": 0.1317, + "step": 4694 + }, + { + "epoch": 0.7606934543097861, + "grad_norm": 0.8028172254562378, + "learning_rate": 4.372904351810943e-06, + "loss": 0.1267, + "step": 4695 + }, + { + "epoch": 0.7608554763447829, + "grad_norm": 0.8876671195030212, + "learning_rate": 4.3726146585594296e-06, + "loss": 0.1482, + "step": 4696 + }, + { + "epoch": 0.7610174983797796, + "grad_norm": 0.8735641837120056, + "learning_rate": 4.3723249080104395e-06, + "loss": 0.1362, + "step": 4697 + }, + { + "epoch": 0.7611795204147764, + "grad_norm": 1.0168266296386719, + "learning_rate": 4.372035100172838e-06, + "loss": 0.1535, + "step": 4698 + }, + { + "epoch": 0.7613415424497731, + "grad_norm": 0.9453420042991638, + "learning_rate": 4.371745235055492e-06, + "loss": 0.1625, + "step": 4699 + }, + { + "epoch": 0.76150356448477, + "grad_norm": 0.8696652054786682, + "learning_rate": 4.371455312667272e-06, + "loss": 0.1469, + "step": 4700 + }, + { + "epoch": 0.7616655865197667, + "grad_norm": 0.8512628078460693, + "learning_rate": 4.371165333017049e-06, + "loss": 0.1464, + "step": 4701 + }, + { + "epoch": 0.7618276085547635, + "grad_norm": 0.9412540793418884, + "learning_rate": 4.370875296113694e-06, + "loss": 0.1495, + "step": 4702 + }, + { + "epoch": 0.7619896305897602, + "grad_norm": 0.9683601260185242, + "learning_rate": 4.370585201966082e-06, + "loss": 0.1696, + "step": 4703 + }, + { + "epoch": 0.7621516526247569, + "grad_norm": 0.9484912753105164, + "learning_rate": 4.370295050583091e-06, + "loss": 0.162, + "step": 4704 + }, + { + "epoch": 0.7623136746597537, + "grad_norm": 0.8469820022583008, + "learning_rate": 4.370004841973596e-06, + "loss": 0.1332, + "step": 4705 + }, + { + "epoch": 0.7624756966947505, + "grad_norm": 0.9001607894897461, + "learning_rate": 4.3697145761464785e-06, + "loss": 0.1468, + "step": 4706 + }, + { + "epoch": 0.7626377187297473, + "grad_norm": 0.8998278975486755, + "learning_rate": 4.36942425311062e-06, + "loss": 0.1493, + "step": 4707 + }, + { + "epoch": 0.762799740764744, + "grad_norm": 0.9656628966331482, + "learning_rate": 4.369133872874903e-06, + "loss": 0.1528, + "step": 4708 + }, + { + "epoch": 0.7629617627997408, + "grad_norm": 1.009851098060608, + "learning_rate": 4.368843435448213e-06, + "loss": 0.1392, + "step": 4709 + }, + { + "epoch": 0.7631237848347375, + "grad_norm": 0.9109621644020081, + "learning_rate": 4.368552940839436e-06, + "loss": 0.1571, + "step": 4710 + }, + { + "epoch": 0.7632858068697342, + "grad_norm": 0.9232974052429199, + "learning_rate": 4.368262389057462e-06, + "loss": 0.1551, + "step": 4711 + }, + { + "epoch": 0.7634478289047311, + "grad_norm": 0.8760313987731934, + "learning_rate": 4.367971780111179e-06, + "loss": 0.1587, + "step": 4712 + }, + { + "epoch": 0.7636098509397278, + "grad_norm": 0.945943295955658, + "learning_rate": 4.36768111400948e-06, + "loss": 0.1431, + "step": 4713 + }, + { + "epoch": 0.7637718729747246, + "grad_norm": 0.8429069519042969, + "learning_rate": 4.367390390761258e-06, + "loss": 0.1377, + "step": 4714 + }, + { + "epoch": 0.7639338950097213, + "grad_norm": 0.8647720813751221, + "learning_rate": 4.367099610375409e-06, + "loss": 0.1262, + "step": 4715 + }, + { + "epoch": 0.7640959170447181, + "grad_norm": 0.8221338987350464, + "learning_rate": 4.3668087728608314e-06, + "loss": 0.1269, + "step": 4716 + }, + { + "epoch": 0.7642579390797148, + "grad_norm": 0.8592271208763123, + "learning_rate": 4.366517878226423e-06, + "loss": 0.1344, + "step": 4717 + }, + { + "epoch": 0.7644199611147116, + "grad_norm": 0.8569756150245667, + "learning_rate": 4.366226926481083e-06, + "loss": 0.1417, + "step": 4718 + }, + { + "epoch": 0.7645819831497084, + "grad_norm": 0.8143072128295898, + "learning_rate": 4.365935917633716e-06, + "loss": 0.1295, + "step": 4719 + }, + { + "epoch": 0.7647440051847051, + "grad_norm": 0.9709054827690125, + "learning_rate": 4.365644851693226e-06, + "loss": 0.1597, + "step": 4720 + }, + { + "epoch": 0.7649060272197019, + "grad_norm": 0.8787278532981873, + "learning_rate": 4.365353728668518e-06, + "loss": 0.14, + "step": 4721 + }, + { + "epoch": 0.7650680492546986, + "grad_norm": 0.945436418056488, + "learning_rate": 4.3650625485685e-06, + "loss": 0.153, + "step": 4722 + }, + { + "epoch": 0.7652300712896954, + "grad_norm": 0.9423735737800598, + "learning_rate": 4.3647713114020805e-06, + "loss": 0.1324, + "step": 4723 + }, + { + "epoch": 0.7653920933246922, + "grad_norm": 0.9254855513572693, + "learning_rate": 4.364480017178172e-06, + "loss": 0.149, + "step": 4724 + }, + { + "epoch": 0.7655541153596889, + "grad_norm": 0.8730553984642029, + "learning_rate": 4.364188665905687e-06, + "loss": 0.1386, + "step": 4725 + }, + { + "epoch": 0.7657161373946857, + "grad_norm": 0.8605020046234131, + "learning_rate": 4.36389725759354e-06, + "loss": 0.1381, + "step": 4726 + }, + { + "epoch": 0.7658781594296824, + "grad_norm": 1.0751097202301025, + "learning_rate": 4.363605792250648e-06, + "loss": 0.1849, + "step": 4727 + }, + { + "epoch": 0.7660401814646792, + "grad_norm": 0.9074150919914246, + "learning_rate": 4.363314269885928e-06, + "loss": 0.1377, + "step": 4728 + }, + { + "epoch": 0.7662022034996759, + "grad_norm": 0.8164258599281311, + "learning_rate": 4.363022690508301e-06, + "loss": 0.1357, + "step": 4729 + }, + { + "epoch": 0.7663642255346728, + "grad_norm": 0.9147708415985107, + "learning_rate": 4.362731054126687e-06, + "loss": 0.1494, + "step": 4730 + }, + { + "epoch": 0.7665262475696695, + "grad_norm": 0.9929783344268799, + "learning_rate": 4.362439360750012e-06, + "loss": 0.162, + "step": 4731 + }, + { + "epoch": 0.7666882696046662, + "grad_norm": 0.977254331111908, + "learning_rate": 4.362147610387198e-06, + "loss": 0.1586, + "step": 4732 + }, + { + "epoch": 0.766850291639663, + "grad_norm": 0.9575954079627991, + "learning_rate": 4.361855803047175e-06, + "loss": 0.1561, + "step": 4733 + }, + { + "epoch": 0.7670123136746597, + "grad_norm": 0.8501836657524109, + "learning_rate": 4.361563938738869e-06, + "loss": 0.138, + "step": 4734 + }, + { + "epoch": 0.7671743357096565, + "grad_norm": 0.8874675035476685, + "learning_rate": 4.361272017471212e-06, + "loss": 0.1503, + "step": 4735 + }, + { + "epoch": 0.7673363577446533, + "grad_norm": 0.830808699131012, + "learning_rate": 4.3609800392531345e-06, + "loss": 0.1304, + "step": 4736 + }, + { + "epoch": 0.7674983797796501, + "grad_norm": 0.8549823760986328, + "learning_rate": 4.3606880040935714e-06, + "loss": 0.1475, + "step": 4737 + }, + { + "epoch": 0.7676604018146468, + "grad_norm": 0.9000952243804932, + "learning_rate": 4.36039591200146e-06, + "loss": 0.1549, + "step": 4738 + }, + { + "epoch": 0.7678224238496435, + "grad_norm": 0.9087912440299988, + "learning_rate": 4.360103762985734e-06, + "loss": 0.1516, + "step": 4739 + }, + { + "epoch": 0.7679844458846403, + "grad_norm": 0.9787373542785645, + "learning_rate": 4.359811557055335e-06, + "loss": 0.172, + "step": 4740 + }, + { + "epoch": 0.768146467919637, + "grad_norm": 0.8458462953567505, + "learning_rate": 4.359519294219201e-06, + "loss": 0.1462, + "step": 4741 + }, + { + "epoch": 0.7683084899546339, + "grad_norm": 0.8422055840492249, + "learning_rate": 4.3592269744862794e-06, + "loss": 0.1269, + "step": 4742 + }, + { + "epoch": 0.7684705119896306, + "grad_norm": 0.8210686445236206, + "learning_rate": 4.35893459786551e-06, + "loss": 0.1425, + "step": 4743 + }, + { + "epoch": 0.7686325340246274, + "grad_norm": 0.9287550449371338, + "learning_rate": 4.3586421643658404e-06, + "loss": 0.1574, + "step": 4744 + }, + { + "epoch": 0.7687945560596241, + "grad_norm": 0.9414339065551758, + "learning_rate": 4.3583496739962195e-06, + "loss": 0.1515, + "step": 4745 + }, + { + "epoch": 0.7689565780946209, + "grad_norm": 0.8130404353141785, + "learning_rate": 4.3580571267655945e-06, + "loss": 0.1309, + "step": 4746 + }, + { + "epoch": 0.7691186001296176, + "grad_norm": 0.7749109864234924, + "learning_rate": 4.357764522682919e-06, + "loss": 0.1363, + "step": 4747 + }, + { + "epoch": 0.7692806221646143, + "grad_norm": 0.874220609664917, + "learning_rate": 4.357471861757144e-06, + "loss": 0.1415, + "step": 4748 + }, + { + "epoch": 0.7694426441996112, + "grad_norm": 0.9805276393890381, + "learning_rate": 4.357179143997225e-06, + "loss": 0.1563, + "step": 4749 + }, + { + "epoch": 0.7696046662346079, + "grad_norm": 0.9434357285499573, + "learning_rate": 4.3568863694121185e-06, + "loss": 0.1492, + "step": 4750 + }, + { + "epoch": 0.7697666882696047, + "grad_norm": 0.968651294708252, + "learning_rate": 4.356593538010783e-06, + "loss": 0.1611, + "step": 4751 + }, + { + "epoch": 0.7699287103046014, + "grad_norm": 0.7928199172019958, + "learning_rate": 4.356300649802178e-06, + "loss": 0.1264, + "step": 4752 + }, + { + "epoch": 0.7700907323395982, + "grad_norm": 0.8838950991630554, + "learning_rate": 4.356007704795265e-06, + "loss": 0.142, + "step": 4753 + }, + { + "epoch": 0.770252754374595, + "grad_norm": 0.8881464004516602, + "learning_rate": 4.355714702999008e-06, + "loss": 0.1438, + "step": 4754 + }, + { + "epoch": 0.7704147764095917, + "grad_norm": 0.938313364982605, + "learning_rate": 4.355421644422372e-06, + "loss": 0.1423, + "step": 4755 + }, + { + "epoch": 0.7705767984445885, + "grad_norm": 0.892318069934845, + "learning_rate": 4.355128529074323e-06, + "loss": 0.1451, + "step": 4756 + }, + { + "epoch": 0.7707388204795852, + "grad_norm": 0.8982798457145691, + "learning_rate": 4.354835356963831e-06, + "loss": 0.1375, + "step": 4757 + }, + { + "epoch": 0.770900842514582, + "grad_norm": 0.8019870519638062, + "learning_rate": 4.354542128099866e-06, + "loss": 0.1271, + "step": 4758 + }, + { + "epoch": 0.7710628645495787, + "grad_norm": 0.8931732773780823, + "learning_rate": 4.354248842491399e-06, + "loss": 0.1481, + "step": 4759 + }, + { + "epoch": 0.7712248865845756, + "grad_norm": 0.9127657413482666, + "learning_rate": 4.353955500147405e-06, + "loss": 0.1466, + "step": 4760 + }, + { + "epoch": 0.7713869086195723, + "grad_norm": 0.9457529783248901, + "learning_rate": 4.353662101076859e-06, + "loss": 0.1488, + "step": 4761 + }, + { + "epoch": 0.771548930654569, + "grad_norm": 0.9101020097732544, + "learning_rate": 4.353368645288738e-06, + "loss": 0.1464, + "step": 4762 + }, + { + "epoch": 0.7717109526895658, + "grad_norm": 0.9181843400001526, + "learning_rate": 4.353075132792023e-06, + "loss": 0.1379, + "step": 4763 + }, + { + "epoch": 0.7718729747245625, + "grad_norm": 0.8580861687660217, + "learning_rate": 4.352781563595691e-06, + "loss": 0.145, + "step": 4764 + }, + { + "epoch": 0.7720349967595593, + "grad_norm": 0.8708130717277527, + "learning_rate": 4.352487937708729e-06, + "loss": 0.1536, + "step": 4765 + }, + { + "epoch": 0.772197018794556, + "grad_norm": 0.8263076543807983, + "learning_rate": 4.352194255140118e-06, + "loss": 0.1327, + "step": 4766 + }, + { + "epoch": 0.7723590408295529, + "grad_norm": 0.9026300311088562, + "learning_rate": 4.351900515898846e-06, + "loss": 0.1477, + "step": 4767 + }, + { + "epoch": 0.7725210628645496, + "grad_norm": 0.9587908387184143, + "learning_rate": 4.351606719993899e-06, + "loss": 0.1537, + "step": 4768 + }, + { + "epoch": 0.7726830848995463, + "grad_norm": 0.78399258852005, + "learning_rate": 4.3513128674342665e-06, + "loss": 0.1308, + "step": 4769 + }, + { + "epoch": 0.7728451069345431, + "grad_norm": 0.8863502144813538, + "learning_rate": 4.351018958228941e-06, + "loss": 0.1467, + "step": 4770 + }, + { + "epoch": 0.7730071289695398, + "grad_norm": 0.9153334498405457, + "learning_rate": 4.350724992386915e-06, + "loss": 0.1646, + "step": 4771 + }, + { + "epoch": 0.7731691510045366, + "grad_norm": 0.8299160003662109, + "learning_rate": 4.350430969917182e-06, + "loss": 0.1437, + "step": 4772 + }, + { + "epoch": 0.7733311730395334, + "grad_norm": 0.9863147139549255, + "learning_rate": 4.35013689082874e-06, + "loss": 0.1435, + "step": 4773 + }, + { + "epoch": 0.7734931950745302, + "grad_norm": 0.9008839130401611, + "learning_rate": 4.349842755130587e-06, + "loss": 0.1472, + "step": 4774 + }, + { + "epoch": 0.7736552171095269, + "grad_norm": 0.9455453157424927, + "learning_rate": 4.349548562831721e-06, + "loss": 0.1493, + "step": 4775 + }, + { + "epoch": 0.7738172391445236, + "grad_norm": 0.9574447870254517, + "learning_rate": 4.349254313941146e-06, + "loss": 0.1572, + "step": 4776 + }, + { + "epoch": 0.7739792611795204, + "grad_norm": 0.9903965592384338, + "learning_rate": 4.348960008467863e-06, + "loss": 0.1688, + "step": 4777 + }, + { + "epoch": 0.7741412832145171, + "grad_norm": 0.8893287181854248, + "learning_rate": 4.3486656464208785e-06, + "loss": 0.142, + "step": 4778 + }, + { + "epoch": 0.774303305249514, + "grad_norm": 0.8900960087776184, + "learning_rate": 4.348371227809199e-06, + "loss": 0.1408, + "step": 4779 + }, + { + "epoch": 0.7744653272845107, + "grad_norm": 0.9652172923088074, + "learning_rate": 4.348076752641834e-06, + "loss": 0.1675, + "step": 4780 + }, + { + "epoch": 0.7746273493195075, + "grad_norm": 0.8140687346458435, + "learning_rate": 4.34778222092779e-06, + "loss": 0.1229, + "step": 4781 + }, + { + "epoch": 0.7747893713545042, + "grad_norm": 0.7977285385131836, + "learning_rate": 4.347487632676084e-06, + "loss": 0.122, + "step": 4782 + }, + { + "epoch": 0.7749513933895009, + "grad_norm": 0.8936629891395569, + "learning_rate": 4.347192987895726e-06, + "loss": 0.147, + "step": 4783 + }, + { + "epoch": 0.7751134154244977, + "grad_norm": 0.811756432056427, + "learning_rate": 4.346898286595733e-06, + "loss": 0.126, + "step": 4784 + }, + { + "epoch": 0.7752754374594945, + "grad_norm": 0.8740372657775879, + "learning_rate": 4.346603528785122e-06, + "loss": 0.1515, + "step": 4785 + }, + { + "epoch": 0.7754374594944913, + "grad_norm": 0.8331345319747925, + "learning_rate": 4.3463087144729115e-06, + "loss": 0.1341, + "step": 4786 + }, + { + "epoch": 0.775599481529488, + "grad_norm": 0.8488640785217285, + "learning_rate": 4.346013843668122e-06, + "loss": 0.1361, + "step": 4787 + }, + { + "epoch": 0.7757615035644848, + "grad_norm": 0.9174832701683044, + "learning_rate": 4.3457189163797776e-06, + "loss": 0.1343, + "step": 4788 + }, + { + "epoch": 0.7759235255994815, + "grad_norm": 0.8470262289047241, + "learning_rate": 4.345423932616899e-06, + "loss": 0.1319, + "step": 4789 + }, + { + "epoch": 0.7760855476344782, + "grad_norm": 0.8204308152198792, + "learning_rate": 4.345128892388515e-06, + "loss": 0.1331, + "step": 4790 + }, + { + "epoch": 0.7762475696694751, + "grad_norm": 0.8398123383522034, + "learning_rate": 4.344833795703652e-06, + "loss": 0.1382, + "step": 4791 + }, + { + "epoch": 0.7764095917044718, + "grad_norm": 0.9840171933174133, + "learning_rate": 4.344538642571339e-06, + "loss": 0.1512, + "step": 4792 + }, + { + "epoch": 0.7765716137394686, + "grad_norm": 0.8746889233589172, + "learning_rate": 4.3442434330006075e-06, + "loss": 0.1425, + "step": 4793 + }, + { + "epoch": 0.7767336357744653, + "grad_norm": 0.8782622218132019, + "learning_rate": 4.3439481670004895e-06, + "loss": 0.1317, + "step": 4794 + }, + { + "epoch": 0.7768956578094621, + "grad_norm": 0.8894055485725403, + "learning_rate": 4.34365284458002e-06, + "loss": 0.1424, + "step": 4795 + }, + { + "epoch": 0.7770576798444588, + "grad_norm": 0.8703826069831848, + "learning_rate": 4.343357465748235e-06, + "loss": 0.1309, + "step": 4796 + }, + { + "epoch": 0.7772197018794557, + "grad_norm": 0.8910863399505615, + "learning_rate": 4.343062030514172e-06, + "loss": 0.1431, + "step": 4797 + }, + { + "epoch": 0.7773817239144524, + "grad_norm": 0.9824634194374084, + "learning_rate": 4.342766538886872e-06, + "loss": 0.1456, + "step": 4798 + }, + { + "epoch": 0.7775437459494491, + "grad_norm": 0.862075686454773, + "learning_rate": 4.342470990875375e-06, + "loss": 0.1338, + "step": 4799 + }, + { + "epoch": 0.7777057679844459, + "grad_norm": 0.7366591095924377, + "learning_rate": 4.342175386488724e-06, + "loss": 0.1238, + "step": 4800 + }, + { + "epoch": 0.7778677900194426, + "grad_norm": 0.8682642579078674, + "learning_rate": 4.341879725735965e-06, + "loss": 0.1458, + "step": 4801 + }, + { + "epoch": 0.7780298120544394, + "grad_norm": 1.0655760765075684, + "learning_rate": 4.341584008626143e-06, + "loss": 0.1728, + "step": 4802 + }, + { + "epoch": 0.7781918340894362, + "grad_norm": 0.9638577103614807, + "learning_rate": 4.341288235168306e-06, + "loss": 0.1636, + "step": 4803 + }, + { + "epoch": 0.778353856124433, + "grad_norm": 0.9434710741043091, + "learning_rate": 4.340992405371506e-06, + "loss": 0.1469, + "step": 4804 + }, + { + "epoch": 0.7785158781594297, + "grad_norm": 0.9497054219245911, + "learning_rate": 4.340696519244794e-06, + "loss": 0.1343, + "step": 4805 + }, + { + "epoch": 0.7786779001944264, + "grad_norm": 0.8136816620826721, + "learning_rate": 4.340400576797221e-06, + "loss": 0.1239, + "step": 4806 + }, + { + "epoch": 0.7788399222294232, + "grad_norm": 0.81424480676651, + "learning_rate": 4.340104578037846e-06, + "loss": 0.1231, + "step": 4807 + }, + { + "epoch": 0.7790019442644199, + "grad_norm": 0.8290922045707703, + "learning_rate": 4.339808522975722e-06, + "loss": 0.1283, + "step": 4808 + }, + { + "epoch": 0.7791639662994168, + "grad_norm": 0.9101153016090393, + "learning_rate": 4.339512411619912e-06, + "loss": 0.1419, + "step": 4809 + }, + { + "epoch": 0.7793259883344135, + "grad_norm": 0.8872000575065613, + "learning_rate": 4.339216243979471e-06, + "loss": 0.1378, + "step": 4810 + }, + { + "epoch": 0.7794880103694103, + "grad_norm": 0.9886912703514099, + "learning_rate": 4.338920020063465e-06, + "loss": 0.1599, + "step": 4811 + }, + { + "epoch": 0.779650032404407, + "grad_norm": 0.7226459383964539, + "learning_rate": 4.3386237398809576e-06, + "loss": 0.1087, + "step": 4812 + }, + { + "epoch": 0.7798120544394037, + "grad_norm": 0.9056746959686279, + "learning_rate": 4.338327403441012e-06, + "loss": 0.1485, + "step": 4813 + }, + { + "epoch": 0.7799740764744005, + "grad_norm": 0.831541121006012, + "learning_rate": 4.338031010752696e-06, + "loss": 0.1161, + "step": 4814 + }, + { + "epoch": 0.7801360985093972, + "grad_norm": 0.9287052750587463, + "learning_rate": 4.337734561825079e-06, + "loss": 0.1564, + "step": 4815 + }, + { + "epoch": 0.7802981205443941, + "grad_norm": 0.9070348739624023, + "learning_rate": 4.337438056667233e-06, + "loss": 0.1533, + "step": 4816 + }, + { + "epoch": 0.7804601425793908, + "grad_norm": 0.8945810794830322, + "learning_rate": 4.337141495288228e-06, + "loss": 0.141, + "step": 4817 + }, + { + "epoch": 0.7806221646143876, + "grad_norm": 0.9323962926864624, + "learning_rate": 4.336844877697139e-06, + "loss": 0.1525, + "step": 4818 + }, + { + "epoch": 0.7807841866493843, + "grad_norm": 0.8296406269073486, + "learning_rate": 4.336548203903042e-06, + "loss": 0.1358, + "step": 4819 + }, + { + "epoch": 0.780946208684381, + "grad_norm": 0.8179200887680054, + "learning_rate": 4.336251473915015e-06, + "loss": 0.138, + "step": 4820 + }, + { + "epoch": 0.7811082307193778, + "grad_norm": 0.8966086506843567, + "learning_rate": 4.335954687742136e-06, + "loss": 0.1401, + "step": 4821 + }, + { + "epoch": 0.7812702527543746, + "grad_norm": 1.0740383863449097, + "learning_rate": 4.335657845393486e-06, + "loss": 0.1471, + "step": 4822 + }, + { + "epoch": 0.7814322747893714, + "grad_norm": 0.9319535493850708, + "learning_rate": 4.335360946878148e-06, + "loss": 0.1565, + "step": 4823 + }, + { + "epoch": 0.7815942968243681, + "grad_norm": 0.9597566723823547, + "learning_rate": 4.335063992205207e-06, + "loss": 0.1614, + "step": 4824 + }, + { + "epoch": 0.7817563188593649, + "grad_norm": 0.8953394293785095, + "learning_rate": 4.334766981383749e-06, + "loss": 0.1495, + "step": 4825 + }, + { + "epoch": 0.7819183408943616, + "grad_norm": 0.8695473074913025, + "learning_rate": 4.3344699144228605e-06, + "loss": 0.1353, + "step": 4826 + }, + { + "epoch": 0.7820803629293583, + "grad_norm": 0.8189257383346558, + "learning_rate": 4.334172791331633e-06, + "loss": 0.1269, + "step": 4827 + }, + { + "epoch": 0.7822423849643552, + "grad_norm": 0.8564877510070801, + "learning_rate": 4.333875612119156e-06, + "loss": 0.1302, + "step": 4828 + }, + { + "epoch": 0.7824044069993519, + "grad_norm": 0.8532876968383789, + "learning_rate": 4.3335783767945235e-06, + "loss": 0.1405, + "step": 4829 + }, + { + "epoch": 0.7825664290343487, + "grad_norm": 0.9211379289627075, + "learning_rate": 4.333281085366829e-06, + "loss": 0.1515, + "step": 4830 + }, + { + "epoch": 0.7827284510693454, + "grad_norm": 0.8540376424789429, + "learning_rate": 4.332983737845171e-06, + "loss": 0.1295, + "step": 4831 + }, + { + "epoch": 0.7828904731043422, + "grad_norm": 0.886938750743866, + "learning_rate": 4.332686334238646e-06, + "loss": 0.1318, + "step": 4832 + }, + { + "epoch": 0.7830524951393389, + "grad_norm": 0.8990640640258789, + "learning_rate": 4.3323888745563544e-06, + "loss": 0.1463, + "step": 4833 + }, + { + "epoch": 0.7832145171743357, + "grad_norm": 0.8602086305618286, + "learning_rate": 4.332091358807397e-06, + "loss": 0.1309, + "step": 4834 + }, + { + "epoch": 0.7833765392093325, + "grad_norm": 1.16105318069458, + "learning_rate": 4.331793787000878e-06, + "loss": 0.1761, + "step": 4835 + }, + { + "epoch": 0.7835385612443292, + "grad_norm": 0.9003329277038574, + "learning_rate": 4.3314961591459015e-06, + "loss": 0.1451, + "step": 4836 + }, + { + "epoch": 0.783700583279326, + "grad_norm": 0.7860771417617798, + "learning_rate": 4.3311984752515745e-06, + "loss": 0.1227, + "step": 4837 + }, + { + "epoch": 0.7838626053143227, + "grad_norm": 0.7542071342468262, + "learning_rate": 4.330900735327006e-06, + "loss": 0.1211, + "step": 4838 + }, + { + "epoch": 0.7840246273493195, + "grad_norm": 0.9031782150268555, + "learning_rate": 4.330602939381306e-06, + "loss": 0.1391, + "step": 4839 + }, + { + "epoch": 0.7841866493843163, + "grad_norm": 0.8484506607055664, + "learning_rate": 4.330305087423585e-06, + "loss": 0.1333, + "step": 4840 + }, + { + "epoch": 0.7843486714193131, + "grad_norm": 1.0116180181503296, + "learning_rate": 4.3300071794629585e-06, + "loss": 0.1477, + "step": 4841 + }, + { + "epoch": 0.7845106934543098, + "grad_norm": 0.8738240003585815, + "learning_rate": 4.329709215508541e-06, + "loss": 0.1412, + "step": 4842 + }, + { + "epoch": 0.7846727154893065, + "grad_norm": 0.8802550435066223, + "learning_rate": 4.329411195569448e-06, + "loss": 0.1366, + "step": 4843 + }, + { + "epoch": 0.7848347375243033, + "grad_norm": 1.0167311429977417, + "learning_rate": 4.329113119654801e-06, + "loss": 0.1548, + "step": 4844 + }, + { + "epoch": 0.7849967595593, + "grad_norm": 0.9607383608818054, + "learning_rate": 4.328814987773718e-06, + "loss": 0.1532, + "step": 4845 + }, + { + "epoch": 0.7851587815942969, + "grad_norm": 0.8469568490982056, + "learning_rate": 4.328516799935323e-06, + "loss": 0.1294, + "step": 4846 + }, + { + "epoch": 0.7853208036292936, + "grad_norm": 0.8837965130805969, + "learning_rate": 4.328218556148738e-06, + "loss": 0.1422, + "step": 4847 + }, + { + "epoch": 0.7854828256642904, + "grad_norm": 1.0237815380096436, + "learning_rate": 4.327920256423089e-06, + "loss": 0.165, + "step": 4848 + }, + { + "epoch": 0.7856448476992871, + "grad_norm": 0.8523423671722412, + "learning_rate": 4.327621900767504e-06, + "loss": 0.1523, + "step": 4849 + }, + { + "epoch": 0.7858068697342838, + "grad_norm": 0.831685483455658, + "learning_rate": 4.3273234891911135e-06, + "loss": 0.137, + "step": 4850 + }, + { + "epoch": 0.7859688917692806, + "grad_norm": 1.195661187171936, + "learning_rate": 4.327025021703044e-06, + "loss": 0.146, + "step": 4851 + }, + { + "epoch": 0.7861309138042774, + "grad_norm": 0.8539892435073853, + "learning_rate": 4.3267264983124304e-06, + "loss": 0.1462, + "step": 4852 + }, + { + "epoch": 0.7862929358392742, + "grad_norm": 0.8452509045600891, + "learning_rate": 4.326427919028407e-06, + "loss": 0.1356, + "step": 4853 + }, + { + "epoch": 0.7864549578742709, + "grad_norm": 0.9559705853462219, + "learning_rate": 4.326129283860109e-06, + "loss": 0.1544, + "step": 4854 + }, + { + "epoch": 0.7866169799092677, + "grad_norm": 0.9361833333969116, + "learning_rate": 4.325830592816675e-06, + "loss": 0.1396, + "step": 4855 + }, + { + "epoch": 0.7867790019442644, + "grad_norm": 0.9473642110824585, + "learning_rate": 4.3255318459072415e-06, + "loss": 0.1549, + "step": 4856 + }, + { + "epoch": 0.7869410239792611, + "grad_norm": 0.8166648149490356, + "learning_rate": 4.325233043140952e-06, + "loss": 0.1309, + "step": 4857 + }, + { + "epoch": 0.787103046014258, + "grad_norm": 0.8858399987220764, + "learning_rate": 4.324934184526949e-06, + "loss": 0.1317, + "step": 4858 + }, + { + "epoch": 0.7872650680492547, + "grad_norm": 0.9922860264778137, + "learning_rate": 4.324635270074375e-06, + "loss": 0.1545, + "step": 4859 + }, + { + "epoch": 0.7874270900842515, + "grad_norm": 0.9596473574638367, + "learning_rate": 4.324336299792378e-06, + "loss": 0.163, + "step": 4860 + }, + { + "epoch": 0.7875891121192482, + "grad_norm": 0.8286867737770081, + "learning_rate": 4.3240372736901044e-06, + "loss": 0.1262, + "step": 4861 + }, + { + "epoch": 0.787751134154245, + "grad_norm": 0.8577548861503601, + "learning_rate": 4.3237381917767054e-06, + "loss": 0.1487, + "step": 4862 + }, + { + "epoch": 0.7879131561892417, + "grad_norm": 0.9122594594955444, + "learning_rate": 4.323439054061331e-06, + "loss": 0.1592, + "step": 4863 + }, + { + "epoch": 0.7880751782242384, + "grad_norm": 0.7763839364051819, + "learning_rate": 4.323139860553133e-06, + "loss": 0.1135, + "step": 4864 + }, + { + "epoch": 0.7882372002592353, + "grad_norm": 0.8384743332862854, + "learning_rate": 4.3228406112612686e-06, + "loss": 0.1249, + "step": 4865 + }, + { + "epoch": 0.788399222294232, + "grad_norm": 0.9874293804168701, + "learning_rate": 4.3225413061948915e-06, + "loss": 0.1608, + "step": 4866 + }, + { + "epoch": 0.7885612443292288, + "grad_norm": 0.8717676997184753, + "learning_rate": 4.322241945363161e-06, + "loss": 0.1304, + "step": 4867 + }, + { + "epoch": 0.7887232663642255, + "grad_norm": 0.8286629319190979, + "learning_rate": 4.321942528775238e-06, + "loss": 0.1303, + "step": 4868 + }, + { + "epoch": 0.7888852883992223, + "grad_norm": 0.7938746809959412, + "learning_rate": 4.3216430564402815e-06, + "loss": 0.1231, + "step": 4869 + }, + { + "epoch": 0.789047310434219, + "grad_norm": 1.0110461711883545, + "learning_rate": 4.3213435283674556e-06, + "loss": 0.149, + "step": 4870 + }, + { + "epoch": 0.7892093324692158, + "grad_norm": 0.8735683560371399, + "learning_rate": 4.3210439445659255e-06, + "loss": 0.1276, + "step": 4871 + }, + { + "epoch": 0.7893713545042126, + "grad_norm": 1.0407592058181763, + "learning_rate": 4.320744305044858e-06, + "loss": 0.1411, + "step": 4872 + }, + { + "epoch": 0.7895333765392093, + "grad_norm": 0.9165899753570557, + "learning_rate": 4.3204446098134215e-06, + "loss": 0.1503, + "step": 4873 + }, + { + "epoch": 0.7896953985742061, + "grad_norm": 0.9459313154220581, + "learning_rate": 4.320144858880784e-06, + "loss": 0.1372, + "step": 4874 + }, + { + "epoch": 0.7898574206092028, + "grad_norm": 0.8336299061775208, + "learning_rate": 4.319845052256119e-06, + "loss": 0.1305, + "step": 4875 + }, + { + "epoch": 0.7900194426441997, + "grad_norm": 0.928596019744873, + "learning_rate": 4.319545189948599e-06, + "loss": 0.1473, + "step": 4876 + }, + { + "epoch": 0.7901814646791964, + "grad_norm": 0.8122962713241577, + "learning_rate": 4.3192452719674e-06, + "loss": 0.1276, + "step": 4877 + }, + { + "epoch": 0.7903434867141931, + "grad_norm": 0.7669326066970825, + "learning_rate": 4.318945298321698e-06, + "loss": 0.1314, + "step": 4878 + }, + { + "epoch": 0.7905055087491899, + "grad_norm": 0.8381757140159607, + "learning_rate": 4.318645269020671e-06, + "loss": 0.1312, + "step": 4879 + }, + { + "epoch": 0.7906675307841866, + "grad_norm": 0.8036909103393555, + "learning_rate": 4.3183451840735e-06, + "loss": 0.1092, + "step": 4880 + }, + { + "epoch": 0.7908295528191834, + "grad_norm": 0.8607558608055115, + "learning_rate": 4.318045043489367e-06, + "loss": 0.137, + "step": 4881 + }, + { + "epoch": 0.7909915748541801, + "grad_norm": 0.8910214900970459, + "learning_rate": 4.3177448472774566e-06, + "loss": 0.153, + "step": 4882 + }, + { + "epoch": 0.791153596889177, + "grad_norm": 0.8832724094390869, + "learning_rate": 4.317444595446951e-06, + "loss": 0.148, + "step": 4883 + }, + { + "epoch": 0.7913156189241737, + "grad_norm": 0.9177297353744507, + "learning_rate": 4.317144288007039e-06, + "loss": 0.1537, + "step": 4884 + }, + { + "epoch": 0.7914776409591704, + "grad_norm": 0.9235965609550476, + "learning_rate": 4.316843924966909e-06, + "loss": 0.1404, + "step": 4885 + }, + { + "epoch": 0.7916396629941672, + "grad_norm": 0.8111236691474915, + "learning_rate": 4.316543506335752e-06, + "loss": 0.1276, + "step": 4886 + }, + { + "epoch": 0.7918016850291639, + "grad_norm": 0.8697571158409119, + "learning_rate": 4.31624303212276e-06, + "loss": 0.1325, + "step": 4887 + }, + { + "epoch": 0.7919637070641607, + "grad_norm": 0.8985753059387207, + "learning_rate": 4.315942502337126e-06, + "loss": 0.1418, + "step": 4888 + }, + { + "epoch": 0.7921257290991575, + "grad_norm": 0.8289467096328735, + "learning_rate": 4.315641916988046e-06, + "loss": 0.1169, + "step": 4889 + }, + { + "epoch": 0.7922877511341543, + "grad_norm": 1.004921555519104, + "learning_rate": 4.315341276084717e-06, + "loss": 0.1646, + "step": 4890 + }, + { + "epoch": 0.792449773169151, + "grad_norm": 0.9182339906692505, + "learning_rate": 4.315040579636339e-06, + "loss": 0.1565, + "step": 4891 + }, + { + "epoch": 0.7926117952041478, + "grad_norm": 0.8737165927886963, + "learning_rate": 4.3147398276521105e-06, + "loss": 0.1278, + "step": 4892 + }, + { + "epoch": 0.7927738172391445, + "grad_norm": 0.8931154608726501, + "learning_rate": 4.314439020141235e-06, + "loss": 0.145, + "step": 4893 + }, + { + "epoch": 0.7929358392741412, + "grad_norm": 0.9454561471939087, + "learning_rate": 4.314138157112916e-06, + "loss": 0.1496, + "step": 4894 + }, + { + "epoch": 0.7930978613091381, + "grad_norm": 0.8674540519714355, + "learning_rate": 4.313837238576361e-06, + "loss": 0.1277, + "step": 4895 + }, + { + "epoch": 0.7932598833441348, + "grad_norm": 0.8071511387825012, + "learning_rate": 4.313536264540774e-06, + "loss": 0.1258, + "step": 4896 + }, + { + "epoch": 0.7934219053791316, + "grad_norm": 0.9354859590530396, + "learning_rate": 4.313235235015367e-06, + "loss": 0.1537, + "step": 4897 + }, + { + "epoch": 0.7935839274141283, + "grad_norm": 0.9247655272483826, + "learning_rate": 4.312934150009351e-06, + "loss": 0.1535, + "step": 4898 + }, + { + "epoch": 0.7937459494491251, + "grad_norm": 0.8361133933067322, + "learning_rate": 4.312633009531935e-06, + "loss": 0.1397, + "step": 4899 + }, + { + "epoch": 0.7939079714841218, + "grad_norm": 1.0772885084152222, + "learning_rate": 4.3123318135923355e-06, + "loss": 0.1683, + "step": 4900 + }, + { + "epoch": 0.7940699935191186, + "grad_norm": 0.7447131872177124, + "learning_rate": 4.312030562199769e-06, + "loss": 0.1155, + "step": 4901 + }, + { + "epoch": 0.7942320155541154, + "grad_norm": 0.7897053360939026, + "learning_rate": 4.311729255363453e-06, + "loss": 0.1229, + "step": 4902 + }, + { + "epoch": 0.7943940375891121, + "grad_norm": 0.8198691606521606, + "learning_rate": 4.3114278930926055e-06, + "loss": 0.1323, + "step": 4903 + }, + { + "epoch": 0.7945560596241089, + "grad_norm": 0.8728782534599304, + "learning_rate": 4.3111264753964475e-06, + "loss": 0.1368, + "step": 4904 + }, + { + "epoch": 0.7947180816591056, + "grad_norm": 0.8745116591453552, + "learning_rate": 4.3108250022842026e-06, + "loss": 0.1475, + "step": 4905 + }, + { + "epoch": 0.7948801036941024, + "grad_norm": 0.8484328389167786, + "learning_rate": 4.310523473765095e-06, + "loss": 0.1349, + "step": 4906 + }, + { + "epoch": 0.7950421257290992, + "grad_norm": 0.8352507948875427, + "learning_rate": 4.31022188984835e-06, + "loss": 0.1349, + "step": 4907 + }, + { + "epoch": 0.7952041477640959, + "grad_norm": 0.9030001759529114, + "learning_rate": 4.309920250543196e-06, + "loss": 0.1343, + "step": 4908 + }, + { + "epoch": 0.7953661697990927, + "grad_norm": 0.9628517627716064, + "learning_rate": 4.3096185558588625e-06, + "loss": 0.158, + "step": 4909 + }, + { + "epoch": 0.7955281918340894, + "grad_norm": 0.84720778465271, + "learning_rate": 4.30931680580458e-06, + "loss": 0.1336, + "step": 4910 + }, + { + "epoch": 0.7956902138690862, + "grad_norm": 0.9344910383224487, + "learning_rate": 4.309015000389583e-06, + "loss": 0.1469, + "step": 4911 + }, + { + "epoch": 0.7958522359040829, + "grad_norm": 0.8999947309494019, + "learning_rate": 4.308713139623103e-06, + "loss": 0.1324, + "step": 4912 + }, + { + "epoch": 0.7960142579390798, + "grad_norm": 0.9617306590080261, + "learning_rate": 4.308411223514378e-06, + "loss": 0.1633, + "step": 4913 + }, + { + "epoch": 0.7961762799740765, + "grad_norm": 0.8142529726028442, + "learning_rate": 4.308109252072647e-06, + "loss": 0.1394, + "step": 4914 + }, + { + "epoch": 0.7963383020090732, + "grad_norm": 0.837073028087616, + "learning_rate": 4.307807225307148e-06, + "loss": 0.1277, + "step": 4915 + }, + { + "epoch": 0.79650032404407, + "grad_norm": 0.9646647572517395, + "learning_rate": 4.307505143227122e-06, + "loss": 0.1675, + "step": 4916 + }, + { + "epoch": 0.7966623460790667, + "grad_norm": 0.893146812915802, + "learning_rate": 4.307203005841813e-06, + "loss": 0.146, + "step": 4917 + }, + { + "epoch": 0.7968243681140635, + "grad_norm": 0.9233022928237915, + "learning_rate": 4.306900813160466e-06, + "loss": 0.1535, + "step": 4918 + }, + { + "epoch": 0.7969863901490603, + "grad_norm": 0.8315757513046265, + "learning_rate": 4.306598565192327e-06, + "loss": 0.133, + "step": 4919 + }, + { + "epoch": 0.7971484121840571, + "grad_norm": 0.8391256332397461, + "learning_rate": 4.306296261946643e-06, + "loss": 0.1378, + "step": 4920 + }, + { + "epoch": 0.7973104342190538, + "grad_norm": 0.9489231109619141, + "learning_rate": 4.305993903432664e-06, + "loss": 0.1461, + "step": 4921 + }, + { + "epoch": 0.7974724562540505, + "grad_norm": 0.8281417489051819, + "learning_rate": 4.305691489659643e-06, + "loss": 0.1266, + "step": 4922 + }, + { + "epoch": 0.7976344782890473, + "grad_norm": 0.8485118746757507, + "learning_rate": 4.305389020636832e-06, + "loss": 0.1373, + "step": 4923 + }, + { + "epoch": 0.797796500324044, + "grad_norm": 0.8453590869903564, + "learning_rate": 4.3050864963734854e-06, + "loss": 0.1371, + "step": 4924 + }, + { + "epoch": 0.7979585223590409, + "grad_norm": 0.8031056523323059, + "learning_rate": 4.304783916878861e-06, + "loss": 0.1262, + "step": 4925 + }, + { + "epoch": 0.7981205443940376, + "grad_norm": 0.9986342191696167, + "learning_rate": 4.304481282162215e-06, + "loss": 0.1689, + "step": 4926 + }, + { + "epoch": 0.7982825664290344, + "grad_norm": 0.8515965938568115, + "learning_rate": 4.304178592232809e-06, + "loss": 0.1373, + "step": 4927 + }, + { + "epoch": 0.7984445884640311, + "grad_norm": 0.9090455174446106, + "learning_rate": 4.3038758470999056e-06, + "loss": 0.1508, + "step": 4928 + }, + { + "epoch": 0.7986066104990278, + "grad_norm": 0.8485004305839539, + "learning_rate": 4.303573046772765e-06, + "loss": 0.1513, + "step": 4929 + }, + { + "epoch": 0.7987686325340246, + "grad_norm": 1.051450490951538, + "learning_rate": 4.303270191260654e-06, + "loss": 0.1781, + "step": 4930 + }, + { + "epoch": 0.7989306545690213, + "grad_norm": 0.8531157374382019, + "learning_rate": 4.302967280572839e-06, + "loss": 0.1348, + "step": 4931 + }, + { + "epoch": 0.7990926766040182, + "grad_norm": 0.7763565182685852, + "learning_rate": 4.302664314718588e-06, + "loss": 0.1182, + "step": 4932 + }, + { + "epoch": 0.7992546986390149, + "grad_norm": 0.9212713837623596, + "learning_rate": 4.302361293707172e-06, + "loss": 0.1209, + "step": 4933 + }, + { + "epoch": 0.7994167206740117, + "grad_norm": 0.8668341040611267, + "learning_rate": 4.302058217547862e-06, + "loss": 0.1316, + "step": 4934 + }, + { + "epoch": 0.7995787427090084, + "grad_norm": 1.0723251104354858, + "learning_rate": 4.3017550862499314e-06, + "loss": 0.1604, + "step": 4935 + }, + { + "epoch": 0.7997407647440052, + "grad_norm": 0.9705539345741272, + "learning_rate": 4.301451899822655e-06, + "loss": 0.1541, + "step": 4936 + }, + { + "epoch": 0.799902786779002, + "grad_norm": 0.8101187348365784, + "learning_rate": 4.30114865827531e-06, + "loss": 0.1226, + "step": 4937 + }, + { + "epoch": 0.8000648088139987, + "grad_norm": 0.9303157925605774, + "learning_rate": 4.3008453616171746e-06, + "loss": 0.1661, + "step": 4938 + }, + { + "epoch": 0.8002268308489955, + "grad_norm": 0.8247880339622498, + "learning_rate": 4.300542009857529e-06, + "loss": 0.1287, + "step": 4939 + }, + { + "epoch": 0.8003888528839922, + "grad_norm": 0.8863083124160767, + "learning_rate": 4.300238603005656e-06, + "loss": 0.1557, + "step": 4940 + }, + { + "epoch": 0.800550874918989, + "grad_norm": 0.9141151905059814, + "learning_rate": 4.299935141070837e-06, + "loss": 0.1515, + "step": 4941 + }, + { + "epoch": 0.8007128969539857, + "grad_norm": 0.925873875617981, + "learning_rate": 4.299631624062359e-06, + "loss": 0.1467, + "step": 4942 + }, + { + "epoch": 0.8008749189889826, + "grad_norm": 0.9137710332870483, + "learning_rate": 4.299328051989509e-06, + "loss": 0.1325, + "step": 4943 + }, + { + "epoch": 0.8010369410239793, + "grad_norm": 1.1716556549072266, + "learning_rate": 4.299024424861574e-06, + "loss": 0.1653, + "step": 4944 + }, + { + "epoch": 0.801198963058976, + "grad_norm": 1.0240533351898193, + "learning_rate": 4.298720742687846e-06, + "loss": 0.1516, + "step": 4945 + }, + { + "epoch": 0.8013609850939728, + "grad_norm": 0.8595703840255737, + "learning_rate": 4.298417005477616e-06, + "loss": 0.1388, + "step": 4946 + }, + { + "epoch": 0.8015230071289695, + "grad_norm": 0.817602813243866, + "learning_rate": 4.298113213240176e-06, + "loss": 0.1203, + "step": 4947 + }, + { + "epoch": 0.8016850291639663, + "grad_norm": 0.9021492004394531, + "learning_rate": 4.2978093659848255e-06, + "loss": 0.1397, + "step": 4948 + }, + { + "epoch": 0.801847051198963, + "grad_norm": 0.756421685218811, + "learning_rate": 4.297505463720857e-06, + "loss": 0.1294, + "step": 4949 + }, + { + "epoch": 0.8020090732339599, + "grad_norm": 0.8492708206176758, + "learning_rate": 4.2972015064575726e-06, + "loss": 0.146, + "step": 4950 + }, + { + "epoch": 0.8021710952689566, + "grad_norm": 0.8680381774902344, + "learning_rate": 4.29689749420427e-06, + "loss": 0.1556, + "step": 4951 + }, + { + "epoch": 0.8023331173039533, + "grad_norm": 0.7963783144950867, + "learning_rate": 4.2965934269702535e-06, + "loss": 0.1291, + "step": 4952 + }, + { + "epoch": 0.8024951393389501, + "grad_norm": 0.8368898630142212, + "learning_rate": 4.296289304764825e-06, + "loss": 0.1396, + "step": 4953 + }, + { + "epoch": 0.8026571613739468, + "grad_norm": 0.951833188533783, + "learning_rate": 4.295985127597291e-06, + "loss": 0.1592, + "step": 4954 + }, + { + "epoch": 0.8028191834089436, + "grad_norm": 0.8976386785507202, + "learning_rate": 4.295680895476959e-06, + "loss": 0.1454, + "step": 4955 + }, + { + "epoch": 0.8029812054439404, + "grad_norm": 0.8689819574356079, + "learning_rate": 4.295376608413136e-06, + "loss": 0.1393, + "step": 4956 + }, + { + "epoch": 0.8031432274789372, + "grad_norm": 0.9349297285079956, + "learning_rate": 4.295072266415135e-06, + "loss": 0.1617, + "step": 4957 + }, + { + "epoch": 0.8033052495139339, + "grad_norm": 0.9541299939155579, + "learning_rate": 4.294767869492265e-06, + "loss": 0.1545, + "step": 4958 + }, + { + "epoch": 0.8034672715489306, + "grad_norm": 0.9361594915390015, + "learning_rate": 4.294463417653842e-06, + "loss": 0.156, + "step": 4959 + }, + { + "epoch": 0.8036292935839274, + "grad_norm": 0.8772572875022888, + "learning_rate": 4.294158910909181e-06, + "loss": 0.14, + "step": 4960 + }, + { + "epoch": 0.8037913156189241, + "grad_norm": 0.84719318151474, + "learning_rate": 4.2938543492676e-06, + "loss": 0.1324, + "step": 4961 + }, + { + "epoch": 0.803953337653921, + "grad_norm": 0.7897992730140686, + "learning_rate": 4.293549732738415e-06, + "loss": 0.1266, + "step": 4962 + }, + { + "epoch": 0.8041153596889177, + "grad_norm": 0.9460259079933167, + "learning_rate": 4.29324506133095e-06, + "loss": 0.1521, + "step": 4963 + }, + { + "epoch": 0.8042773817239145, + "grad_norm": 0.8699532747268677, + "learning_rate": 4.2929403350545255e-06, + "loss": 0.1435, + "step": 4964 + }, + { + "epoch": 0.8044394037589112, + "grad_norm": 0.9408538341522217, + "learning_rate": 4.292635553918466e-06, + "loss": 0.1431, + "step": 4965 + }, + { + "epoch": 0.8046014257939079, + "grad_norm": 0.885382354259491, + "learning_rate": 4.292330717932095e-06, + "loss": 0.1411, + "step": 4966 + }, + { + "epoch": 0.8047634478289047, + "grad_norm": 0.9126858115196228, + "learning_rate": 4.292025827104744e-06, + "loss": 0.1511, + "step": 4967 + }, + { + "epoch": 0.8049254698639015, + "grad_norm": 0.8485498428344727, + "learning_rate": 4.2917208814457364e-06, + "loss": 0.1325, + "step": 4968 + }, + { + "epoch": 0.8050874918988983, + "grad_norm": 0.9165229797363281, + "learning_rate": 4.291415880964407e-06, + "loss": 0.1521, + "step": 4969 + }, + { + "epoch": 0.805249513933895, + "grad_norm": 0.811354398727417, + "learning_rate": 4.291110825670087e-06, + "loss": 0.1297, + "step": 4970 + }, + { + "epoch": 0.8054115359688918, + "grad_norm": 0.8046636581420898, + "learning_rate": 4.29080571557211e-06, + "loss": 0.1192, + "step": 4971 + }, + { + "epoch": 0.8055735580038885, + "grad_norm": 0.8413184881210327, + "learning_rate": 4.290500550679811e-06, + "loss": 0.1316, + "step": 4972 + }, + { + "epoch": 0.8057355800388852, + "grad_norm": 0.9264662861824036, + "learning_rate": 4.290195331002529e-06, + "loss": 0.15, + "step": 4973 + }, + { + "epoch": 0.8058976020738821, + "grad_norm": 0.917666494846344, + "learning_rate": 4.289890056549603e-06, + "loss": 0.1395, + "step": 4974 + }, + { + "epoch": 0.8060596241088788, + "grad_norm": 0.9180327653884888, + "learning_rate": 4.2895847273303705e-06, + "loss": 0.1427, + "step": 4975 + }, + { + "epoch": 0.8062216461438756, + "grad_norm": 0.902094841003418, + "learning_rate": 4.289279343354178e-06, + "loss": 0.1421, + "step": 4976 + }, + { + "epoch": 0.8063836681788723, + "grad_norm": 0.8890548944473267, + "learning_rate": 4.288973904630366e-06, + "loss": 0.134, + "step": 4977 + }, + { + "epoch": 0.8065456902138691, + "grad_norm": 0.8452367186546326, + "learning_rate": 4.288668411168283e-06, + "loss": 0.1293, + "step": 4978 + }, + { + "epoch": 0.8067077122488658, + "grad_norm": 0.9811566472053528, + "learning_rate": 4.288362862977274e-06, + "loss": 0.1597, + "step": 4979 + }, + { + "epoch": 0.8068697342838627, + "grad_norm": 0.9972265958786011, + "learning_rate": 4.28805726006669e-06, + "loss": 0.1607, + "step": 4980 + }, + { + "epoch": 0.8070317563188594, + "grad_norm": 0.9556589722633362, + "learning_rate": 4.287751602445881e-06, + "loss": 0.1415, + "step": 4981 + }, + { + "epoch": 0.8071937783538561, + "grad_norm": 0.9611114859580994, + "learning_rate": 4.287445890124198e-06, + "loss": 0.1601, + "step": 4982 + }, + { + "epoch": 0.8073558003888529, + "grad_norm": 0.8882191181182861, + "learning_rate": 4.287140123110998e-06, + "loss": 0.1359, + "step": 4983 + }, + { + "epoch": 0.8075178224238496, + "grad_norm": 0.9331012964248657, + "learning_rate": 4.286834301415634e-06, + "loss": 0.1498, + "step": 4984 + }, + { + "epoch": 0.8076798444588464, + "grad_norm": 0.8011325001716614, + "learning_rate": 4.286528425047464e-06, + "loss": 0.1285, + "step": 4985 + }, + { + "epoch": 0.8078418664938432, + "grad_norm": 0.9134606719017029, + "learning_rate": 4.286222494015848e-06, + "loss": 0.155, + "step": 4986 + }, + { + "epoch": 0.80800388852884, + "grad_norm": 0.8747284412384033, + "learning_rate": 4.285916508330146e-06, + "loss": 0.143, + "step": 4987 + }, + { + "epoch": 0.8081659105638367, + "grad_norm": 0.8503855466842651, + "learning_rate": 4.285610467999722e-06, + "loss": 0.1233, + "step": 4988 + }, + { + "epoch": 0.8083279325988334, + "grad_norm": 0.8365163207054138, + "learning_rate": 4.285304373033938e-06, + "loss": 0.1299, + "step": 4989 + }, + { + "epoch": 0.8084899546338302, + "grad_norm": 0.7736684083938599, + "learning_rate": 4.28499822344216e-06, + "loss": 0.1336, + "step": 4990 + }, + { + "epoch": 0.8086519766688269, + "grad_norm": 0.7120151519775391, + "learning_rate": 4.284692019233756e-06, + "loss": 0.1262, + "step": 4991 + }, + { + "epoch": 0.8088139987038238, + "grad_norm": 0.987061083316803, + "learning_rate": 4.2843857604180955e-06, + "loss": 0.1582, + "step": 4992 + }, + { + "epoch": 0.8089760207388205, + "grad_norm": 0.804357647895813, + "learning_rate": 4.2840794470045484e-06, + "loss": 0.1307, + "step": 4993 + }, + { + "epoch": 0.8091380427738173, + "grad_norm": 0.9282926917076111, + "learning_rate": 4.283773079002488e-06, + "loss": 0.1554, + "step": 4994 + }, + { + "epoch": 0.809300064808814, + "grad_norm": 0.8444812297821045, + "learning_rate": 4.283466656421289e-06, + "loss": 0.1413, + "step": 4995 + }, + { + "epoch": 0.8094620868438107, + "grad_norm": 0.9229618310928345, + "learning_rate": 4.283160179270325e-06, + "loss": 0.1442, + "step": 4996 + }, + { + "epoch": 0.8096241088788075, + "grad_norm": 0.8210486173629761, + "learning_rate": 4.282853647558974e-06, + "loss": 0.1277, + "step": 4997 + }, + { + "epoch": 0.8097861309138042, + "grad_norm": 0.8737907409667969, + "learning_rate": 4.282547061296618e-06, + "loss": 0.1377, + "step": 4998 + }, + { + "epoch": 0.8099481529488011, + "grad_norm": 0.8875065445899963, + "learning_rate": 4.2822404204926334e-06, + "loss": 0.1343, + "step": 4999 + }, + { + "epoch": 0.8101101749837978, + "grad_norm": 0.8774368762969971, + "learning_rate": 4.281933725156406e-06, + "loss": 0.1423, + "step": 5000 + }, + { + "epoch": 0.8102721970187946, + "grad_norm": 0.8778204917907715, + "learning_rate": 4.281626975297319e-06, + "loss": 0.1279, + "step": 5001 + }, + { + "epoch": 0.8104342190537913, + "grad_norm": 0.8505150079727173, + "learning_rate": 4.281320170924758e-06, + "loss": 0.1398, + "step": 5002 + }, + { + "epoch": 0.810596241088788, + "grad_norm": 0.7801287770271301, + "learning_rate": 4.281013312048109e-06, + "loss": 0.1218, + "step": 5003 + }, + { + "epoch": 0.8107582631237849, + "grad_norm": 0.9131519794464111, + "learning_rate": 4.280706398676764e-06, + "loss": 0.1413, + "step": 5004 + }, + { + "epoch": 0.8109202851587816, + "grad_norm": 0.8857017159461975, + "learning_rate": 4.280399430820112e-06, + "loss": 0.1381, + "step": 5005 + }, + { + "epoch": 0.8110823071937784, + "grad_norm": 0.8741483092308044, + "learning_rate": 4.2800924084875465e-06, + "loss": 0.1353, + "step": 5006 + }, + { + "epoch": 0.8112443292287751, + "grad_norm": 0.816307783126831, + "learning_rate": 4.27978533168846e-06, + "loss": 0.1354, + "step": 5007 + }, + { + "epoch": 0.8114063512637719, + "grad_norm": 0.8606013059616089, + "learning_rate": 4.27947820043225e-06, + "loss": 0.1452, + "step": 5008 + }, + { + "epoch": 0.8115683732987686, + "grad_norm": 0.999919593334198, + "learning_rate": 4.279171014728314e-06, + "loss": 0.1466, + "step": 5009 + }, + { + "epoch": 0.8117303953337653, + "grad_norm": 0.8366535305976868, + "learning_rate": 4.278863774586049e-06, + "loss": 0.1267, + "step": 5010 + }, + { + "epoch": 0.8118924173687622, + "grad_norm": 0.9078406691551208, + "learning_rate": 4.278556480014858e-06, + "loss": 0.1422, + "step": 5011 + }, + { + "epoch": 0.8120544394037589, + "grad_norm": 0.9016123414039612, + "learning_rate": 4.2782491310241426e-06, + "loss": 0.1574, + "step": 5012 + }, + { + "epoch": 0.8122164614387557, + "grad_norm": 0.7560967206954956, + "learning_rate": 4.277941727623307e-06, + "loss": 0.1211, + "step": 5013 + }, + { + "epoch": 0.8123784834737524, + "grad_norm": 0.8945764303207397, + "learning_rate": 4.2776342698217575e-06, + "loss": 0.148, + "step": 5014 + }, + { + "epoch": 0.8125405055087492, + "grad_norm": 0.8035917282104492, + "learning_rate": 4.277326757628901e-06, + "loss": 0.1312, + "step": 5015 + }, + { + "epoch": 0.812702527543746, + "grad_norm": 0.925362229347229, + "learning_rate": 4.277019191054146e-06, + "loss": 0.1421, + "step": 5016 + }, + { + "epoch": 0.8128645495787427, + "grad_norm": 0.8655949831008911, + "learning_rate": 4.276711570106905e-06, + "loss": 0.1454, + "step": 5017 + }, + { + "epoch": 0.8130265716137395, + "grad_norm": 0.9567270874977112, + "learning_rate": 4.276403894796589e-06, + "loss": 0.1528, + "step": 5018 + }, + { + "epoch": 0.8131885936487362, + "grad_norm": 0.8218672275543213, + "learning_rate": 4.276096165132613e-06, + "loss": 0.1435, + "step": 5019 + }, + { + "epoch": 0.813350615683733, + "grad_norm": 0.9071885943412781, + "learning_rate": 4.275788381124393e-06, + "loss": 0.1343, + "step": 5020 + }, + { + "epoch": 0.8135126377187297, + "grad_norm": 0.801608681678772, + "learning_rate": 4.2754805427813455e-06, + "loss": 0.138, + "step": 5021 + }, + { + "epoch": 0.8136746597537265, + "grad_norm": 0.960382878780365, + "learning_rate": 4.275172650112889e-06, + "loss": 0.159, + "step": 5022 + }, + { + "epoch": 0.8138366817887233, + "grad_norm": 0.8618503212928772, + "learning_rate": 4.274864703128446e-06, + "loss": 0.1346, + "step": 5023 + }, + { + "epoch": 0.81399870382372, + "grad_norm": 0.8866019248962402, + "learning_rate": 4.274556701837438e-06, + "loss": 0.1402, + "step": 5024 + }, + { + "epoch": 0.8141607258587168, + "grad_norm": 0.8554643988609314, + "learning_rate": 4.27424864624929e-06, + "loss": 0.1376, + "step": 5025 + }, + { + "epoch": 0.8143227478937135, + "grad_norm": 0.8216258883476257, + "learning_rate": 4.273940536373426e-06, + "loss": 0.1221, + "step": 5026 + }, + { + "epoch": 0.8144847699287103, + "grad_norm": 1.0153284072875977, + "learning_rate": 4.273632372219274e-06, + "loss": 0.1503, + "step": 5027 + }, + { + "epoch": 0.814646791963707, + "grad_norm": 0.8892536759376526, + "learning_rate": 4.273324153796264e-06, + "loss": 0.1476, + "step": 5028 + }, + { + "epoch": 0.8148088139987039, + "grad_norm": 0.9032856225967407, + "learning_rate": 4.273015881113827e-06, + "loss": 0.1476, + "step": 5029 + }, + { + "epoch": 0.8149708360337006, + "grad_norm": 0.8178720474243164, + "learning_rate": 4.2727075541813945e-06, + "loss": 0.136, + "step": 5030 + }, + { + "epoch": 0.8151328580686974, + "grad_norm": 0.8843294382095337, + "learning_rate": 4.2723991730084e-06, + "loss": 0.152, + "step": 5031 + }, + { + "epoch": 0.8152948801036941, + "grad_norm": 0.8823347091674805, + "learning_rate": 4.27209073760428e-06, + "loss": 0.1439, + "step": 5032 + }, + { + "epoch": 0.8154569021386908, + "grad_norm": 0.886072039604187, + "learning_rate": 4.271782247978473e-06, + "loss": 0.1416, + "step": 5033 + }, + { + "epoch": 0.8156189241736876, + "grad_norm": 0.8647177815437317, + "learning_rate": 4.271473704140415e-06, + "loss": 0.1335, + "step": 5034 + }, + { + "epoch": 0.8157809462086844, + "grad_norm": 0.8254427313804626, + "learning_rate": 4.271165106099549e-06, + "loss": 0.1284, + "step": 5035 + }, + { + "epoch": 0.8159429682436812, + "grad_norm": 0.8761928081512451, + "learning_rate": 4.270856453865318e-06, + "loss": 0.1494, + "step": 5036 + }, + { + "epoch": 0.8161049902786779, + "grad_norm": 0.8046637773513794, + "learning_rate": 4.2705477474471645e-06, + "loss": 0.1394, + "step": 5037 + }, + { + "epoch": 0.8162670123136747, + "grad_norm": 0.8774124383926392, + "learning_rate": 4.270238986854534e-06, + "loss": 0.1426, + "step": 5038 + }, + { + "epoch": 0.8164290343486714, + "grad_norm": 0.7928304672241211, + "learning_rate": 4.269930172096875e-06, + "loss": 0.1345, + "step": 5039 + }, + { + "epoch": 0.8165910563836681, + "grad_norm": 0.7432117462158203, + "learning_rate": 4.2696213031836355e-06, + "loss": 0.1129, + "step": 5040 + }, + { + "epoch": 0.816753078418665, + "grad_norm": 0.860908031463623, + "learning_rate": 4.269312380124268e-06, + "loss": 0.1405, + "step": 5041 + }, + { + "epoch": 0.8169151004536617, + "grad_norm": 0.9895366430282593, + "learning_rate": 4.2690034029282214e-06, + "loss": 0.1602, + "step": 5042 + }, + { + "epoch": 0.8170771224886585, + "grad_norm": 0.9045162200927734, + "learning_rate": 4.268694371604952e-06, + "loss": 0.1551, + "step": 5043 + }, + { + "epoch": 0.8172391445236552, + "grad_norm": 0.8091841340065002, + "learning_rate": 4.268385286163915e-06, + "loss": 0.1301, + "step": 5044 + }, + { + "epoch": 0.817401166558652, + "grad_norm": 0.8731530904769897, + "learning_rate": 4.2680761466145685e-06, + "loss": 0.1509, + "step": 5045 + }, + { + "epoch": 0.8175631885936487, + "grad_norm": 1.0045936107635498, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1587, + "step": 5046 + }, + { + "epoch": 0.8177252106286454, + "grad_norm": 0.7668003439903259, + "learning_rate": 4.267457705228781e-06, + "loss": 0.1308, + "step": 5047 + }, + { + "epoch": 0.8178872326636423, + "grad_norm": 0.9017691612243652, + "learning_rate": 4.267148403411261e-06, + "loss": 0.1451, + "step": 5048 + }, + { + "epoch": 0.818049254698639, + "grad_norm": 0.8749794363975525, + "learning_rate": 4.266839047523279e-06, + "loss": 0.1391, + "step": 5049 + }, + { + "epoch": 0.8182112767336358, + "grad_norm": 0.7766733765602112, + "learning_rate": 4.266529637574297e-06, + "loss": 0.1278, + "step": 5050 + }, + { + "epoch": 0.8183732987686325, + "grad_norm": 0.8146058320999146, + "learning_rate": 4.266220173573783e-06, + "loss": 0.1348, + "step": 5051 + }, + { + "epoch": 0.8185353208036293, + "grad_norm": 0.8790404796600342, + "learning_rate": 4.265910655531206e-06, + "loss": 0.1364, + "step": 5052 + }, + { + "epoch": 0.818697342838626, + "grad_norm": 0.8823887705802917, + "learning_rate": 4.265601083456036e-06, + "loss": 0.1339, + "step": 5053 + }, + { + "epoch": 0.8188593648736228, + "grad_norm": 0.8041099309921265, + "learning_rate": 4.265291457357746e-06, + "loss": 0.1253, + "step": 5054 + }, + { + "epoch": 0.8190213869086196, + "grad_norm": 0.9392186403274536, + "learning_rate": 4.264981777245809e-06, + "loss": 0.1374, + "step": 5055 + }, + { + "epoch": 0.8191834089436163, + "grad_norm": 0.8520307540893555, + "learning_rate": 4.2646720431297006e-06, + "loss": 0.1322, + "step": 5056 + }, + { + "epoch": 0.8193454309786131, + "grad_norm": 0.8722667694091797, + "learning_rate": 4.264362255018898e-06, + "loss": 0.1467, + "step": 5057 + }, + { + "epoch": 0.8195074530136098, + "grad_norm": 0.8683182001113892, + "learning_rate": 4.2640524129228815e-06, + "loss": 0.1249, + "step": 5058 + }, + { + "epoch": 0.8196694750486067, + "grad_norm": 0.8108190298080444, + "learning_rate": 4.263742516851128e-06, + "loss": 0.125, + "step": 5059 + }, + { + "epoch": 0.8198314970836034, + "grad_norm": 1.2350842952728271, + "learning_rate": 4.263432566813123e-06, + "loss": 0.159, + "step": 5060 + }, + { + "epoch": 0.8199935191186001, + "grad_norm": 1.0421861410140991, + "learning_rate": 4.263122562818349e-06, + "loss": 0.1681, + "step": 5061 + }, + { + "epoch": 0.8201555411535969, + "grad_norm": 0.8750341534614563, + "learning_rate": 4.262812504876291e-06, + "loss": 0.1288, + "step": 5062 + }, + { + "epoch": 0.8203175631885936, + "grad_norm": 0.8388615846633911, + "learning_rate": 4.262502392996436e-06, + "loss": 0.1289, + "step": 5063 + }, + { + "epoch": 0.8204795852235904, + "grad_norm": 0.865910530090332, + "learning_rate": 4.262192227188273e-06, + "loss": 0.1329, + "step": 5064 + }, + { + "epoch": 0.8206416072585871, + "grad_norm": 0.9594810009002686, + "learning_rate": 4.261882007461292e-06, + "loss": 0.153, + "step": 5065 + }, + { + "epoch": 0.820803629293584, + "grad_norm": 0.8188930153846741, + "learning_rate": 4.261571733824986e-06, + "loss": 0.1377, + "step": 5066 + }, + { + "epoch": 0.8209656513285807, + "grad_norm": 0.9599230289459229, + "learning_rate": 4.261261406288847e-06, + "loss": 0.1379, + "step": 5067 + }, + { + "epoch": 0.8211276733635774, + "grad_norm": 0.9220377802848816, + "learning_rate": 4.260951024862372e-06, + "loss": 0.1523, + "step": 5068 + }, + { + "epoch": 0.8212896953985742, + "grad_norm": 0.8499447703361511, + "learning_rate": 4.2606405895550565e-06, + "loss": 0.1336, + "step": 5069 + }, + { + "epoch": 0.8214517174335709, + "grad_norm": 0.9115306735038757, + "learning_rate": 4.2603301003763994e-06, + "loss": 0.1438, + "step": 5070 + }, + { + "epoch": 0.8216137394685677, + "grad_norm": 0.9516029357910156, + "learning_rate": 4.260019557335902e-06, + "loss": 0.153, + "step": 5071 + }, + { + "epoch": 0.8217757615035645, + "grad_norm": 0.7999269962310791, + "learning_rate": 4.259708960443065e-06, + "loss": 0.1234, + "step": 5072 + }, + { + "epoch": 0.8219377835385613, + "grad_norm": 0.8605833053588867, + "learning_rate": 4.259398309707392e-06, + "loss": 0.1404, + "step": 5073 + }, + { + "epoch": 0.822099805573558, + "grad_norm": 0.8901498317718506, + "learning_rate": 4.259087605138388e-06, + "loss": 0.1481, + "step": 5074 + }, + { + "epoch": 0.8222618276085548, + "grad_norm": 0.8830893039703369, + "learning_rate": 4.258776846745561e-06, + "loss": 0.1479, + "step": 5075 + }, + { + "epoch": 0.8224238496435515, + "grad_norm": 0.9161304831504822, + "learning_rate": 4.2584660345384176e-06, + "loss": 0.1487, + "step": 5076 + }, + { + "epoch": 0.8225858716785482, + "grad_norm": 0.8231655955314636, + "learning_rate": 4.25815516852647e-06, + "loss": 0.1412, + "step": 5077 + }, + { + "epoch": 0.8227478937135451, + "grad_norm": 0.7701215744018555, + "learning_rate": 4.257844248719229e-06, + "loss": 0.1308, + "step": 5078 + }, + { + "epoch": 0.8229099157485418, + "grad_norm": 0.9185945987701416, + "learning_rate": 4.257533275126206e-06, + "loss": 0.1517, + "step": 5079 + }, + { + "epoch": 0.8230719377835386, + "grad_norm": 0.9465458989143372, + "learning_rate": 4.25722224775692e-06, + "loss": 0.1686, + "step": 5080 + }, + { + "epoch": 0.8232339598185353, + "grad_norm": 0.8534404635429382, + "learning_rate": 4.256911166620885e-06, + "loss": 0.1425, + "step": 5081 + }, + { + "epoch": 0.8233959818535321, + "grad_norm": 0.8880043625831604, + "learning_rate": 4.25660003172762e-06, + "loss": 0.1425, + "step": 5082 + }, + { + "epoch": 0.8235580038885288, + "grad_norm": 1.02664053440094, + "learning_rate": 4.256288843086645e-06, + "loss": 0.1425, + "step": 5083 + }, + { + "epoch": 0.8237200259235256, + "grad_norm": 0.8933080434799194, + "learning_rate": 4.255977600707481e-06, + "loss": 0.1529, + "step": 5084 + }, + { + "epoch": 0.8238820479585224, + "grad_norm": 0.9547656774520874, + "learning_rate": 4.255666304599653e-06, + "loss": 0.157, + "step": 5085 + }, + { + "epoch": 0.8240440699935191, + "grad_norm": 0.8259825706481934, + "learning_rate": 4.255354954772684e-06, + "loss": 0.1333, + "step": 5086 + }, + { + "epoch": 0.8242060920285159, + "grad_norm": 0.7905333638191223, + "learning_rate": 4.255043551236101e-06, + "loss": 0.1245, + "step": 5087 + }, + { + "epoch": 0.8243681140635126, + "grad_norm": 0.8866136074066162, + "learning_rate": 4.2547320939994315e-06, + "loss": 0.147, + "step": 5088 + }, + { + "epoch": 0.8245301360985094, + "grad_norm": 1.0445470809936523, + "learning_rate": 4.254420583072209e-06, + "loss": 0.1631, + "step": 5089 + }, + { + "epoch": 0.8246921581335062, + "grad_norm": 0.865193247795105, + "learning_rate": 4.25410901846396e-06, + "loss": 0.1331, + "step": 5090 + }, + { + "epoch": 0.8248541801685029, + "grad_norm": 0.8681652545928955, + "learning_rate": 4.25379740018422e-06, + "loss": 0.1356, + "step": 5091 + }, + { + "epoch": 0.8250162022034997, + "grad_norm": 0.803639829158783, + "learning_rate": 4.253485728242525e-06, + "loss": 0.1306, + "step": 5092 + }, + { + "epoch": 0.8251782242384964, + "grad_norm": 0.8816442489624023, + "learning_rate": 4.253174002648409e-06, + "loss": 0.1355, + "step": 5093 + }, + { + "epoch": 0.8253402462734932, + "grad_norm": 0.8702470660209656, + "learning_rate": 4.252862223411412e-06, + "loss": 0.1454, + "step": 5094 + }, + { + "epoch": 0.8255022683084899, + "grad_norm": 0.7982497811317444, + "learning_rate": 4.2525503905410715e-06, + "loss": 0.1281, + "step": 5095 + }, + { + "epoch": 0.8256642903434868, + "grad_norm": 0.8184255957603455, + "learning_rate": 4.252238504046931e-06, + "loss": 0.1397, + "step": 5096 + }, + { + "epoch": 0.8258263123784835, + "grad_norm": 0.8369611501693726, + "learning_rate": 4.251926563938533e-06, + "loss": 0.1266, + "step": 5097 + }, + { + "epoch": 0.8259883344134802, + "grad_norm": 0.7753140926361084, + "learning_rate": 4.251614570225421e-06, + "loss": 0.1157, + "step": 5098 + }, + { + "epoch": 0.826150356448477, + "grad_norm": 1.0960832834243774, + "learning_rate": 4.251302522917142e-06, + "loss": 0.1757, + "step": 5099 + }, + { + "epoch": 0.8263123784834737, + "grad_norm": 0.8341733813285828, + "learning_rate": 4.250990422023243e-06, + "loss": 0.1455, + "step": 5100 + }, + { + "epoch": 0.8264744005184705, + "grad_norm": 0.8899952173233032, + "learning_rate": 4.250678267553277e-06, + "loss": 0.1514, + "step": 5101 + }, + { + "epoch": 0.8266364225534673, + "grad_norm": 0.850158154964447, + "learning_rate": 4.250366059516791e-06, + "loss": 0.1365, + "step": 5102 + }, + { + "epoch": 0.8267984445884641, + "grad_norm": 0.8187429904937744, + "learning_rate": 4.250053797923339e-06, + "loss": 0.1324, + "step": 5103 + }, + { + "epoch": 0.8269604666234608, + "grad_norm": 0.9219319224357605, + "learning_rate": 4.249741482782476e-06, + "loss": 0.1514, + "step": 5104 + }, + { + "epoch": 0.8271224886584575, + "grad_norm": 0.9627091884613037, + "learning_rate": 4.249429114103758e-06, + "loss": 0.1689, + "step": 5105 + }, + { + "epoch": 0.8272845106934543, + "grad_norm": 0.7787367105484009, + "learning_rate": 4.249116691896743e-06, + "loss": 0.1235, + "step": 5106 + }, + { + "epoch": 0.827446532728451, + "grad_norm": 0.8211469054222107, + "learning_rate": 4.24880421617099e-06, + "loss": 0.1301, + "step": 5107 + }, + { + "epoch": 0.8276085547634479, + "grad_norm": 0.8499755859375, + "learning_rate": 4.248491686936059e-06, + "loss": 0.138, + "step": 5108 + }, + { + "epoch": 0.8277705767984446, + "grad_norm": 0.8932889699935913, + "learning_rate": 4.248179104201515e-06, + "loss": 0.1618, + "step": 5109 + }, + { + "epoch": 0.8279325988334414, + "grad_norm": 0.8149641156196594, + "learning_rate": 4.2478664679769196e-06, + "loss": 0.1358, + "step": 5110 + }, + { + "epoch": 0.8280946208684381, + "grad_norm": 0.8986614942550659, + "learning_rate": 4.247553778271841e-06, + "loss": 0.1365, + "step": 5111 + }, + { + "epoch": 0.8282566429034348, + "grad_norm": 0.9837633371353149, + "learning_rate": 4.247241035095846e-06, + "loss": 0.1583, + "step": 5112 + }, + { + "epoch": 0.8284186649384316, + "grad_norm": 0.7755539417266846, + "learning_rate": 4.246928238458503e-06, + "loss": 0.1245, + "step": 5113 + }, + { + "epoch": 0.8285806869734283, + "grad_norm": 0.8996148109436035, + "learning_rate": 4.246615388369384e-06, + "loss": 0.1397, + "step": 5114 + }, + { + "epoch": 0.8287427090084252, + "grad_norm": 1.0767894983291626, + "learning_rate": 4.24630248483806e-06, + "loss": 0.1456, + "step": 5115 + }, + { + "epoch": 0.8289047310434219, + "grad_norm": 0.9750000238418579, + "learning_rate": 4.245989527874107e-06, + "loss": 0.1515, + "step": 5116 + }, + { + "epoch": 0.8290667530784187, + "grad_norm": 0.9112747311592102, + "learning_rate": 4.245676517487098e-06, + "loss": 0.1469, + "step": 5117 + }, + { + "epoch": 0.8292287751134154, + "grad_norm": 0.7812881469726562, + "learning_rate": 4.245363453686614e-06, + "loss": 0.1129, + "step": 5118 + }, + { + "epoch": 0.8293907971484121, + "grad_norm": 0.838598370552063, + "learning_rate": 4.245050336482231e-06, + "loss": 0.1267, + "step": 5119 + }, + { + "epoch": 0.829552819183409, + "grad_norm": 0.9569753408432007, + "learning_rate": 4.24473716588353e-06, + "loss": 0.1627, + "step": 5120 + }, + { + "epoch": 0.8297148412184057, + "grad_norm": 0.9220561385154724, + "learning_rate": 4.244423941900095e-06, + "loss": 0.1537, + "step": 5121 + }, + { + "epoch": 0.8298768632534025, + "grad_norm": 0.9660263061523438, + "learning_rate": 4.2441106645415085e-06, + "loss": 0.1571, + "step": 5122 + }, + { + "epoch": 0.8300388852883992, + "grad_norm": 0.9250351190567017, + "learning_rate": 4.243797333817356e-06, + "loss": 0.1553, + "step": 5123 + }, + { + "epoch": 0.830200907323396, + "grad_norm": 0.8232597708702087, + "learning_rate": 4.243483949737225e-06, + "loss": 0.1277, + "step": 5124 + }, + { + "epoch": 0.8303629293583927, + "grad_norm": 0.9091588854789734, + "learning_rate": 4.2431705123107045e-06, + "loss": 0.1436, + "step": 5125 + }, + { + "epoch": 0.8305249513933896, + "grad_norm": 0.9420716762542725, + "learning_rate": 4.242857021547385e-06, + "loss": 0.1722, + "step": 5126 + }, + { + "epoch": 0.8306869734283863, + "grad_norm": 0.8245367407798767, + "learning_rate": 4.242543477456858e-06, + "loss": 0.1339, + "step": 5127 + }, + { + "epoch": 0.830848995463383, + "grad_norm": 0.8320567011833191, + "learning_rate": 4.242229880048718e-06, + "loss": 0.1509, + "step": 5128 + }, + { + "epoch": 0.8310110174983798, + "grad_norm": 0.8776043057441711, + "learning_rate": 4.241916229332559e-06, + "loss": 0.148, + "step": 5129 + }, + { + "epoch": 0.8311730395333765, + "grad_norm": 0.8687680959701538, + "learning_rate": 4.241602525317979e-06, + "loss": 0.1468, + "step": 5130 + }, + { + "epoch": 0.8313350615683733, + "grad_norm": 1.0185246467590332, + "learning_rate": 4.241288768014576e-06, + "loss": 0.1771, + "step": 5131 + }, + { + "epoch": 0.83149708360337, + "grad_norm": 0.9902436137199402, + "learning_rate": 4.240974957431951e-06, + "loss": 0.1443, + "step": 5132 + }, + { + "epoch": 0.8316591056383669, + "grad_norm": 0.865923285484314, + "learning_rate": 4.240661093579705e-06, + "loss": 0.1428, + "step": 5133 + }, + { + "epoch": 0.8318211276733636, + "grad_norm": 0.8009111881256104, + "learning_rate": 4.240347176467442e-06, + "loss": 0.1209, + "step": 5134 + }, + { + "epoch": 0.8319831497083603, + "grad_norm": 0.8269287943840027, + "learning_rate": 4.240033206104767e-06, + "loss": 0.1354, + "step": 5135 + }, + { + "epoch": 0.8321451717433571, + "grad_norm": 0.8295719623565674, + "learning_rate": 4.2397191825012865e-06, + "loss": 0.1359, + "step": 5136 + }, + { + "epoch": 0.8323071937783538, + "grad_norm": 0.9293026328086853, + "learning_rate": 4.23940510566661e-06, + "loss": 0.1586, + "step": 5137 + }, + { + "epoch": 0.8324692158133506, + "grad_norm": 0.8136693239212036, + "learning_rate": 4.239090975610346e-06, + "loss": 0.1386, + "step": 5138 + }, + { + "epoch": 0.8326312378483474, + "grad_norm": 0.8754627108573914, + "learning_rate": 4.238776792342106e-06, + "loss": 0.1393, + "step": 5139 + }, + { + "epoch": 0.8327932598833442, + "grad_norm": 0.8677736520767212, + "learning_rate": 4.2384625558715045e-06, + "loss": 0.1352, + "step": 5140 + }, + { + "epoch": 0.8329552819183409, + "grad_norm": 0.8522474765777588, + "learning_rate": 4.2381482662081555e-06, + "loss": 0.1534, + "step": 5141 + }, + { + "epoch": 0.8331173039533376, + "grad_norm": 0.7467418313026428, + "learning_rate": 4.237833923361676e-06, + "loss": 0.1175, + "step": 5142 + }, + { + "epoch": 0.8332793259883344, + "grad_norm": 0.905116856098175, + "learning_rate": 4.237519527341684e-06, + "loss": 0.1425, + "step": 5143 + }, + { + "epoch": 0.8334413480233311, + "grad_norm": 1.009063720703125, + "learning_rate": 4.237205078157799e-06, + "loss": 0.1333, + "step": 5144 + }, + { + "epoch": 0.833603370058328, + "grad_norm": 0.7382147908210754, + "learning_rate": 4.2368905758196436e-06, + "loss": 0.1214, + "step": 5145 + }, + { + "epoch": 0.8337653920933247, + "grad_norm": 1.0095723867416382, + "learning_rate": 4.236576020336838e-06, + "loss": 0.1717, + "step": 5146 + }, + { + "epoch": 0.8339274141283215, + "grad_norm": 0.9047743082046509, + "learning_rate": 4.236261411719009e-06, + "loss": 0.1274, + "step": 5147 + }, + { + "epoch": 0.8340894361633182, + "grad_norm": 0.8646296262741089, + "learning_rate": 4.235946749975783e-06, + "loss": 0.152, + "step": 5148 + }, + { + "epoch": 0.8342514581983149, + "grad_norm": 0.8817318677902222, + "learning_rate": 4.235632035116788e-06, + "loss": 0.1393, + "step": 5149 + }, + { + "epoch": 0.8344134802333117, + "grad_norm": 0.9330871105194092, + "learning_rate": 4.235317267151652e-06, + "loss": 0.1393, + "step": 5150 + }, + { + "epoch": 0.8345755022683085, + "grad_norm": 0.9412763714790344, + "learning_rate": 4.235002446090007e-06, + "loss": 0.167, + "step": 5151 + }, + { + "epoch": 0.8347375243033053, + "grad_norm": 0.8925046920776367, + "learning_rate": 4.234687571941486e-06, + "loss": 0.1515, + "step": 5152 + }, + { + "epoch": 0.834899546338302, + "grad_norm": 0.7668562531471252, + "learning_rate": 4.234372644715723e-06, + "loss": 0.1195, + "step": 5153 + }, + { + "epoch": 0.8350615683732988, + "grad_norm": 0.8491714596748352, + "learning_rate": 4.234057664422354e-06, + "loss": 0.1408, + "step": 5154 + }, + { + "epoch": 0.8352235904082955, + "grad_norm": 0.8507294654846191, + "learning_rate": 4.233742631071017e-06, + "loss": 0.1333, + "step": 5155 + }, + { + "epoch": 0.8353856124432922, + "grad_norm": 0.9354631900787354, + "learning_rate": 4.2334275446713515e-06, + "loss": 0.1507, + "step": 5156 + }, + { + "epoch": 0.8355476344782891, + "grad_norm": 0.8924568295478821, + "learning_rate": 4.233112405232998e-06, + "loss": 0.1431, + "step": 5157 + }, + { + "epoch": 0.8357096565132858, + "grad_norm": 0.8328578472137451, + "learning_rate": 4.232797212765598e-06, + "loss": 0.1397, + "step": 5158 + }, + { + "epoch": 0.8358716785482826, + "grad_norm": 0.8072636127471924, + "learning_rate": 4.2324819672787976e-06, + "loss": 0.1458, + "step": 5159 + }, + { + "epoch": 0.8360337005832793, + "grad_norm": 0.813198447227478, + "learning_rate": 4.2321666687822405e-06, + "loss": 0.1361, + "step": 5160 + }, + { + "epoch": 0.8361957226182761, + "grad_norm": 0.8552424311637878, + "learning_rate": 4.231851317285576e-06, + "loss": 0.1433, + "step": 5161 + }, + { + "epoch": 0.8363577446532728, + "grad_norm": 0.8645519018173218, + "learning_rate": 4.231535912798452e-06, + "loss": 0.1511, + "step": 5162 + }, + { + "epoch": 0.8365197666882696, + "grad_norm": 0.8857008814811707, + "learning_rate": 4.2312204553305195e-06, + "loss": 0.1396, + "step": 5163 + }, + { + "epoch": 0.8366817887232664, + "grad_norm": 0.9378755688667297, + "learning_rate": 4.23090494489143e-06, + "loss": 0.1589, + "step": 5164 + }, + { + "epoch": 0.8368438107582631, + "grad_norm": 0.8300303220748901, + "learning_rate": 4.230589381490837e-06, + "loss": 0.1385, + "step": 5165 + }, + { + "epoch": 0.8370058327932599, + "grad_norm": 0.8071770668029785, + "learning_rate": 4.230273765138399e-06, + "loss": 0.1184, + "step": 5166 + }, + { + "epoch": 0.8371678548282566, + "grad_norm": 0.7939083576202393, + "learning_rate": 4.229958095843769e-06, + "loss": 0.1276, + "step": 5167 + }, + { + "epoch": 0.8373298768632534, + "grad_norm": 0.8759216070175171, + "learning_rate": 4.229642373616609e-06, + "loss": 0.1323, + "step": 5168 + }, + { + "epoch": 0.8374918988982502, + "grad_norm": 0.8885746598243713, + "learning_rate": 4.2293265984665775e-06, + "loss": 0.159, + "step": 5169 + }, + { + "epoch": 0.837653920933247, + "grad_norm": 0.9272924065589905, + "learning_rate": 4.229010770403337e-06, + "loss": 0.1432, + "step": 5170 + }, + { + "epoch": 0.8378159429682437, + "grad_norm": 0.7635154724121094, + "learning_rate": 4.228694889436552e-06, + "loss": 0.1129, + "step": 5171 + }, + { + "epoch": 0.8379779650032404, + "grad_norm": 0.9018572568893433, + "learning_rate": 4.228378955575885e-06, + "loss": 0.1413, + "step": 5172 + }, + { + "epoch": 0.8381399870382372, + "grad_norm": 0.8228177428245544, + "learning_rate": 4.228062968831006e-06, + "loss": 0.1292, + "step": 5173 + }, + { + "epoch": 0.8383020090732339, + "grad_norm": 0.7802498936653137, + "learning_rate": 4.227746929211582e-06, + "loss": 0.1169, + "step": 5174 + }, + { + "epoch": 0.8384640311082308, + "grad_norm": 0.8804471492767334, + "learning_rate": 4.227430836727282e-06, + "loss": 0.1482, + "step": 5175 + }, + { + "epoch": 0.8386260531432275, + "grad_norm": 0.8976423740386963, + "learning_rate": 4.227114691387779e-06, + "loss": 0.1446, + "step": 5176 + }, + { + "epoch": 0.8387880751782243, + "grad_norm": 0.8988388180732727, + "learning_rate": 4.226798493202746e-06, + "loss": 0.1433, + "step": 5177 + }, + { + "epoch": 0.838950097213221, + "grad_norm": 0.8048079013824463, + "learning_rate": 4.226482242181859e-06, + "loss": 0.1248, + "step": 5178 + }, + { + "epoch": 0.8391121192482177, + "grad_norm": 0.9093589782714844, + "learning_rate": 4.226165938334792e-06, + "loss": 0.1449, + "step": 5179 + }, + { + "epoch": 0.8392741412832145, + "grad_norm": 0.9186120629310608, + "learning_rate": 4.225849581671225e-06, + "loss": 0.1496, + "step": 5180 + }, + { + "epoch": 0.8394361633182112, + "grad_norm": 0.9053645730018616, + "learning_rate": 4.225533172200837e-06, + "loss": 0.1418, + "step": 5181 + }, + { + "epoch": 0.8395981853532081, + "grad_norm": 0.8142003417015076, + "learning_rate": 4.225216709933309e-06, + "loss": 0.132, + "step": 5182 + }, + { + "epoch": 0.8397602073882048, + "grad_norm": 0.8198449611663818, + "learning_rate": 4.224900194878326e-06, + "loss": 0.1387, + "step": 5183 + }, + { + "epoch": 0.8399222294232016, + "grad_norm": 0.8121005892753601, + "learning_rate": 4.2245836270455706e-06, + "loss": 0.1237, + "step": 5184 + }, + { + "epoch": 0.8400842514581983, + "grad_norm": 0.8428120017051697, + "learning_rate": 4.22426700644473e-06, + "loss": 0.1441, + "step": 5185 + }, + { + "epoch": 0.840246273493195, + "grad_norm": 0.9493716955184937, + "learning_rate": 4.223950333085492e-06, + "loss": 0.1443, + "step": 5186 + }, + { + "epoch": 0.8404082955281919, + "grad_norm": 0.794461190700531, + "learning_rate": 4.2236336069775445e-06, + "loss": 0.1181, + "step": 5187 + }, + { + "epoch": 0.8405703175631886, + "grad_norm": 0.9470990300178528, + "learning_rate": 4.223316828130581e-06, + "loss": 0.1715, + "step": 5188 + }, + { + "epoch": 0.8407323395981854, + "grad_norm": 0.7863299250602722, + "learning_rate": 4.222999996554291e-06, + "loss": 0.1198, + "step": 5189 + }, + { + "epoch": 0.8408943616331821, + "grad_norm": 0.916057288646698, + "learning_rate": 4.222683112258372e-06, + "loss": 0.148, + "step": 5190 + }, + { + "epoch": 0.8410563836681789, + "grad_norm": 0.8652752041816711, + "learning_rate": 4.222366175252519e-06, + "loss": 0.1299, + "step": 5191 + }, + { + "epoch": 0.8412184057031756, + "grad_norm": 0.8845123648643494, + "learning_rate": 4.222049185546428e-06, + "loss": 0.1356, + "step": 5192 + }, + { + "epoch": 0.8413804277381723, + "grad_norm": 0.9572479724884033, + "learning_rate": 4.2217321431498e-06, + "loss": 0.1432, + "step": 5193 + }, + { + "epoch": 0.8415424497731692, + "grad_norm": 0.885009765625, + "learning_rate": 4.221415048072335e-06, + "loss": 0.1171, + "step": 5194 + }, + { + "epoch": 0.8417044718081659, + "grad_norm": 1.277252435684204, + "learning_rate": 4.221097900323735e-06, + "loss": 0.1719, + "step": 5195 + }, + { + "epoch": 0.8418664938431627, + "grad_norm": 1.0181325674057007, + "learning_rate": 4.220780699913704e-06, + "loss": 0.18, + "step": 5196 + }, + { + "epoch": 0.8420285158781594, + "grad_norm": 0.9635038375854492, + "learning_rate": 4.220463446851948e-06, + "loss": 0.1485, + "step": 5197 + }, + { + "epoch": 0.8421905379131562, + "grad_norm": 0.8910326361656189, + "learning_rate": 4.220146141148174e-06, + "loss": 0.1497, + "step": 5198 + }, + { + "epoch": 0.842352559948153, + "grad_norm": 0.8969202041625977, + "learning_rate": 4.2198287828120905e-06, + "loss": 0.1315, + "step": 5199 + }, + { + "epoch": 0.8425145819831497, + "grad_norm": 0.8396177291870117, + "learning_rate": 4.219511371853408e-06, + "loss": 0.1371, + "step": 5200 + }, + { + "epoch": 0.8426766040181465, + "grad_norm": 0.8654994964599609, + "learning_rate": 4.21919390828184e-06, + "loss": 0.1358, + "step": 5201 + }, + { + "epoch": 0.8428386260531432, + "grad_norm": 0.877016544342041, + "learning_rate": 4.2188763921070974e-06, + "loss": 0.1539, + "step": 5202 + }, + { + "epoch": 0.84300064808814, + "grad_norm": 0.7908318042755127, + "learning_rate": 4.218558823338898e-06, + "loss": 0.1257, + "step": 5203 + }, + { + "epoch": 0.8431626701231367, + "grad_norm": 0.9762617349624634, + "learning_rate": 4.2182412019869556e-06, + "loss": 0.1717, + "step": 5204 + }, + { + "epoch": 0.8433246921581335, + "grad_norm": 0.8188162446022034, + "learning_rate": 4.217923528060992e-06, + "loss": 0.1281, + "step": 5205 + }, + { + "epoch": 0.8434867141931303, + "grad_norm": 0.8744212985038757, + "learning_rate": 4.217605801570725e-06, + "loss": 0.1317, + "step": 5206 + }, + { + "epoch": 0.843648736228127, + "grad_norm": 1.0196107625961304, + "learning_rate": 4.217288022525877e-06, + "loss": 0.1875, + "step": 5207 + }, + { + "epoch": 0.8438107582631238, + "grad_norm": 0.7780096530914307, + "learning_rate": 4.216970190936171e-06, + "loss": 0.1387, + "step": 5208 + }, + { + "epoch": 0.8439727802981205, + "grad_norm": 0.9348713755607605, + "learning_rate": 4.216652306811333e-06, + "loss": 0.1568, + "step": 5209 + }, + { + "epoch": 0.8441348023331173, + "grad_norm": 0.8681674599647522, + "learning_rate": 4.2163343701610884e-06, + "loss": 0.1436, + "step": 5210 + }, + { + "epoch": 0.844296824368114, + "grad_norm": 0.8520421981811523, + "learning_rate": 4.216016380995166e-06, + "loss": 0.1438, + "step": 5211 + }, + { + "epoch": 0.8444588464031109, + "grad_norm": 0.8671303391456604, + "learning_rate": 4.215698339323294e-06, + "loss": 0.1472, + "step": 5212 + }, + { + "epoch": 0.8446208684381076, + "grad_norm": 0.9922376275062561, + "learning_rate": 4.215380245155205e-06, + "loss": 0.1798, + "step": 5213 + }, + { + "epoch": 0.8447828904731044, + "grad_norm": 0.8721550703048706, + "learning_rate": 4.215062098500632e-06, + "loss": 0.1332, + "step": 5214 + }, + { + "epoch": 0.8449449125081011, + "grad_norm": 0.8614236116409302, + "learning_rate": 4.214743899369309e-06, + "loss": 0.1504, + "step": 5215 + }, + { + "epoch": 0.8451069345430978, + "grad_norm": 0.8543869853019714, + "learning_rate": 4.214425647770972e-06, + "loss": 0.1403, + "step": 5216 + }, + { + "epoch": 0.8452689565780946, + "grad_norm": 0.7991150617599487, + "learning_rate": 4.21410734371536e-06, + "loss": 0.1359, + "step": 5217 + }, + { + "epoch": 0.8454309786130914, + "grad_norm": 0.9248189926147461, + "learning_rate": 4.213788987212211e-06, + "loss": 0.1542, + "step": 5218 + }, + { + "epoch": 0.8455930006480882, + "grad_norm": 0.950682520866394, + "learning_rate": 4.213470578271265e-06, + "loss": 0.169, + "step": 5219 + }, + { + "epoch": 0.8457550226830849, + "grad_norm": 0.9078273177146912, + "learning_rate": 4.213152116902267e-06, + "loss": 0.1633, + "step": 5220 + }, + { + "epoch": 0.8459170447180817, + "grad_norm": 0.8999932408332825, + "learning_rate": 4.21283360311496e-06, + "loss": 0.1556, + "step": 5221 + }, + { + "epoch": 0.8460790667530784, + "grad_norm": 0.7840633988380432, + "learning_rate": 4.212515036919089e-06, + "loss": 0.1242, + "step": 5222 + }, + { + "epoch": 0.8462410887880751, + "grad_norm": 0.8409323692321777, + "learning_rate": 4.212196418324402e-06, + "loss": 0.1342, + "step": 5223 + }, + { + "epoch": 0.846403110823072, + "grad_norm": 0.7986196279525757, + "learning_rate": 4.211877747340649e-06, + "loss": 0.1334, + "step": 5224 + }, + { + "epoch": 0.8465651328580687, + "grad_norm": 0.8227624297142029, + "learning_rate": 4.211559023977579e-06, + "loss": 0.1352, + "step": 5225 + }, + { + "epoch": 0.8467271548930655, + "grad_norm": 0.7546223402023315, + "learning_rate": 4.211240248244945e-06, + "loss": 0.1311, + "step": 5226 + }, + { + "epoch": 0.8468891769280622, + "grad_norm": 0.7613492012023926, + "learning_rate": 4.2109214201525e-06, + "loss": 0.1243, + "step": 5227 + }, + { + "epoch": 0.847051198963059, + "grad_norm": 0.9470206499099731, + "learning_rate": 4.21060253971e-06, + "loss": 0.1578, + "step": 5228 + }, + { + "epoch": 0.8472132209980557, + "grad_norm": 0.716661274433136, + "learning_rate": 4.210283606927203e-06, + "loss": 0.1045, + "step": 5229 + }, + { + "epoch": 0.8473752430330524, + "grad_norm": 0.8112010359764099, + "learning_rate": 4.2099646218138655e-06, + "loss": 0.1199, + "step": 5230 + }, + { + "epoch": 0.8475372650680493, + "grad_norm": 0.9128946661949158, + "learning_rate": 4.209645584379748e-06, + "loss": 0.1431, + "step": 5231 + }, + { + "epoch": 0.847699287103046, + "grad_norm": 1.0120484828948975, + "learning_rate": 4.209326494634614e-06, + "loss": 0.1603, + "step": 5232 + }, + { + "epoch": 0.8478613091380428, + "grad_norm": 0.7939407229423523, + "learning_rate": 4.209007352588226e-06, + "loss": 0.1269, + "step": 5233 + }, + { + "epoch": 0.8480233311730395, + "grad_norm": 0.8421609401702881, + "learning_rate": 4.208688158250348e-06, + "loss": 0.1258, + "step": 5234 + }, + { + "epoch": 0.8481853532080363, + "grad_norm": 0.8701285719871521, + "learning_rate": 4.208368911630747e-06, + "loss": 0.1282, + "step": 5235 + }, + { + "epoch": 0.848347375243033, + "grad_norm": 0.9212756752967834, + "learning_rate": 4.2080496127391914e-06, + "loss": 0.13, + "step": 5236 + }, + { + "epoch": 0.8485093972780298, + "grad_norm": 0.8270485997200012, + "learning_rate": 4.207730261585452e-06, + "loss": 0.1358, + "step": 5237 + }, + { + "epoch": 0.8486714193130266, + "grad_norm": 0.8947330713272095, + "learning_rate": 4.207410858179298e-06, + "loss": 0.1462, + "step": 5238 + }, + { + "epoch": 0.8488334413480233, + "grad_norm": 0.7907639741897583, + "learning_rate": 4.207091402530504e-06, + "loss": 0.1296, + "step": 5239 + }, + { + "epoch": 0.8489954633830201, + "grad_norm": 0.956722617149353, + "learning_rate": 4.206771894648846e-06, + "loss": 0.157, + "step": 5240 + }, + { + "epoch": 0.8491574854180168, + "grad_norm": 0.8899962306022644, + "learning_rate": 4.206452334544096e-06, + "loss": 0.1576, + "step": 5241 + }, + { + "epoch": 0.8493195074530137, + "grad_norm": 0.8392747640609741, + "learning_rate": 4.206132722226035e-06, + "loss": 0.132, + "step": 5242 + }, + { + "epoch": 0.8494815294880104, + "grad_norm": 0.9162569642066956, + "learning_rate": 4.205813057704441e-06, + "loss": 0.1577, + "step": 5243 + }, + { + "epoch": 0.8496435515230071, + "grad_norm": 0.9247191548347473, + "learning_rate": 4.205493340989096e-06, + "loss": 0.1536, + "step": 5244 + }, + { + "epoch": 0.8498055735580039, + "grad_norm": 0.8115431666374207, + "learning_rate": 4.2051735720897815e-06, + "loss": 0.1302, + "step": 5245 + }, + { + "epoch": 0.8499675955930006, + "grad_norm": 0.8264175653457642, + "learning_rate": 4.204853751016282e-06, + "loss": 0.1469, + "step": 5246 + }, + { + "epoch": 0.8501296176279974, + "grad_norm": 0.8077431321144104, + "learning_rate": 4.2045338777783844e-06, + "loss": 0.141, + "step": 5247 + }, + { + "epoch": 0.8502916396629941, + "grad_norm": 0.9603082537651062, + "learning_rate": 4.204213952385875e-06, + "loss": 0.1644, + "step": 5248 + }, + { + "epoch": 0.850453661697991, + "grad_norm": 0.8778827786445618, + "learning_rate": 4.2038939748485416e-06, + "loss": 0.1475, + "step": 5249 + }, + { + "epoch": 0.8506156837329877, + "grad_norm": 0.7682041525840759, + "learning_rate": 4.203573945176177e-06, + "loss": 0.1167, + "step": 5250 + }, + { + "epoch": 0.8507777057679844, + "grad_norm": 0.8808218836784363, + "learning_rate": 4.203253863378571e-06, + "loss": 0.137, + "step": 5251 + }, + { + "epoch": 0.8509397278029812, + "grad_norm": 0.7965406775474548, + "learning_rate": 4.202933729465519e-06, + "loss": 0.131, + "step": 5252 + }, + { + "epoch": 0.8511017498379779, + "grad_norm": 0.8025760650634766, + "learning_rate": 4.202613543446817e-06, + "loss": 0.1314, + "step": 5253 + }, + { + "epoch": 0.8512637718729748, + "grad_norm": 0.9073064923286438, + "learning_rate": 4.20229330533226e-06, + "loss": 0.1461, + "step": 5254 + }, + { + "epoch": 0.8514257939079715, + "grad_norm": 0.90974360704422, + "learning_rate": 4.201973015131647e-06, + "loss": 0.1437, + "step": 5255 + }, + { + "epoch": 0.8515878159429683, + "grad_norm": 0.9711911082267761, + "learning_rate": 4.201652672854779e-06, + "loss": 0.1507, + "step": 5256 + }, + { + "epoch": 0.851749837977965, + "grad_norm": 0.9854370355606079, + "learning_rate": 4.2013322785114574e-06, + "loss": 0.1656, + "step": 5257 + }, + { + "epoch": 0.8519118600129617, + "grad_norm": 0.7474818229675293, + "learning_rate": 4.201011832111485e-06, + "loss": 0.1203, + "step": 5258 + }, + { + "epoch": 0.8520738820479585, + "grad_norm": 0.90165776014328, + "learning_rate": 4.200691333664666e-06, + "loss": 0.1446, + "step": 5259 + }, + { + "epoch": 0.8522359040829552, + "grad_norm": 0.8739867210388184, + "learning_rate": 4.2003707831808086e-06, + "loss": 0.1355, + "step": 5260 + }, + { + "epoch": 0.8523979261179521, + "grad_norm": 0.9299155473709106, + "learning_rate": 4.20005018066972e-06, + "loss": 0.1332, + "step": 5261 + }, + { + "epoch": 0.8525599481529488, + "grad_norm": 0.8256843686103821, + "learning_rate": 4.199729526141209e-06, + "loss": 0.1293, + "step": 5262 + }, + { + "epoch": 0.8527219701879456, + "grad_norm": 0.8616979122161865, + "learning_rate": 4.199408819605089e-06, + "loss": 0.1399, + "step": 5263 + }, + { + "epoch": 0.8528839922229423, + "grad_norm": 0.9190691709518433, + "learning_rate": 4.199088061071172e-06, + "loss": 0.1357, + "step": 5264 + }, + { + "epoch": 0.8530460142579391, + "grad_norm": 0.9460611939430237, + "learning_rate": 4.19876725054927e-06, + "loss": 0.1527, + "step": 5265 + }, + { + "epoch": 0.8532080362929358, + "grad_norm": 0.9079461693763733, + "learning_rate": 4.198446388049203e-06, + "loss": 0.1596, + "step": 5266 + }, + { + "epoch": 0.8533700583279326, + "grad_norm": 0.7783133387565613, + "learning_rate": 4.198125473580786e-06, + "loss": 0.1377, + "step": 5267 + }, + { + "epoch": 0.8535320803629294, + "grad_norm": 0.9109153151512146, + "learning_rate": 4.197804507153838e-06, + "loss": 0.1472, + "step": 5268 + }, + { + "epoch": 0.8536941023979261, + "grad_norm": 0.8502582907676697, + "learning_rate": 4.197483488778182e-06, + "loss": 0.137, + "step": 5269 + }, + { + "epoch": 0.8538561244329229, + "grad_norm": 0.7953091859817505, + "learning_rate": 4.197162418463639e-06, + "loss": 0.1279, + "step": 5270 + }, + { + "epoch": 0.8540181464679196, + "grad_norm": 0.8548662662506104, + "learning_rate": 4.196841296220033e-06, + "loss": 0.1379, + "step": 5271 + }, + { + "epoch": 0.8541801685029164, + "grad_norm": 0.8967195749282837, + "learning_rate": 4.1965201220571895e-06, + "loss": 0.1413, + "step": 5272 + }, + { + "epoch": 0.8543421905379132, + "grad_norm": 0.771815836429596, + "learning_rate": 4.1961988959849355e-06, + "loss": 0.1281, + "step": 5273 + }, + { + "epoch": 0.8545042125729099, + "grad_norm": 1.0291990041732788, + "learning_rate": 4.1958776180131e-06, + "loss": 0.1764, + "step": 5274 + }, + { + "epoch": 0.8546662346079067, + "grad_norm": 0.8116664290428162, + "learning_rate": 4.195556288151513e-06, + "loss": 0.1252, + "step": 5275 + }, + { + "epoch": 0.8548282566429034, + "grad_norm": 0.8838712573051453, + "learning_rate": 4.1952349064100074e-06, + "loss": 0.1545, + "step": 5276 + }, + { + "epoch": 0.8549902786779002, + "grad_norm": 0.8008884787559509, + "learning_rate": 4.194913472798415e-06, + "loss": 0.1244, + "step": 5277 + }, + { + "epoch": 0.8551523007128969, + "grad_norm": 0.890133261680603, + "learning_rate": 4.194591987326574e-06, + "loss": 0.1484, + "step": 5278 + }, + { + "epoch": 0.8553143227478938, + "grad_norm": 0.8935836553573608, + "learning_rate": 4.194270450004317e-06, + "loss": 0.1325, + "step": 5279 + }, + { + "epoch": 0.8554763447828905, + "grad_norm": 0.888762354850769, + "learning_rate": 4.193948860841485e-06, + "loss": 0.1455, + "step": 5280 + }, + { + "epoch": 0.8556383668178872, + "grad_norm": 0.7954049706459045, + "learning_rate": 4.193627219847918e-06, + "loss": 0.1257, + "step": 5281 + }, + { + "epoch": 0.855800388852884, + "grad_norm": 0.8666353821754456, + "learning_rate": 4.193305527033456e-06, + "loss": 0.1473, + "step": 5282 + }, + { + "epoch": 0.8559624108878807, + "grad_norm": 0.8803139328956604, + "learning_rate": 4.192983782407941e-06, + "loss": 0.1557, + "step": 5283 + }, + { + "epoch": 0.8561244329228775, + "grad_norm": 0.81354159116745, + "learning_rate": 4.192661985981221e-06, + "loss": 0.1376, + "step": 5284 + }, + { + "epoch": 0.8562864549578743, + "grad_norm": 0.936042070388794, + "learning_rate": 4.19234013776314e-06, + "loss": 0.1426, + "step": 5285 + }, + { + "epoch": 0.8564484769928711, + "grad_norm": 0.8939327597618103, + "learning_rate": 4.192018237763547e-06, + "loss": 0.1422, + "step": 5286 + }, + { + "epoch": 0.8566104990278678, + "grad_norm": 0.806371808052063, + "learning_rate": 4.19169628599229e-06, + "loss": 0.1313, + "step": 5287 + }, + { + "epoch": 0.8567725210628645, + "grad_norm": 0.9480794072151184, + "learning_rate": 4.19137428245922e-06, + "loss": 0.1468, + "step": 5288 + }, + { + "epoch": 0.8569345430978613, + "grad_norm": 0.8664402365684509, + "learning_rate": 4.191052227174189e-06, + "loss": 0.1419, + "step": 5289 + }, + { + "epoch": 0.857096565132858, + "grad_norm": 0.8385558724403381, + "learning_rate": 4.190730120147054e-06, + "loss": 0.1322, + "step": 5290 + }, + { + "epoch": 0.8572585871678549, + "grad_norm": 0.8843077421188354, + "learning_rate": 4.190407961387668e-06, + "loss": 0.1497, + "step": 5291 + }, + { + "epoch": 0.8574206092028516, + "grad_norm": 0.8863962292671204, + "learning_rate": 4.190085750905889e-06, + "loss": 0.1397, + "step": 5292 + }, + { + "epoch": 0.8575826312378484, + "grad_norm": 0.8218333125114441, + "learning_rate": 4.189763488711576e-06, + "loss": 0.123, + "step": 5293 + }, + { + "epoch": 0.8577446532728451, + "grad_norm": 0.8600748777389526, + "learning_rate": 4.189441174814589e-06, + "loss": 0.1472, + "step": 5294 + }, + { + "epoch": 0.8579066753078418, + "grad_norm": 0.7937850952148438, + "learning_rate": 4.189118809224792e-06, + "loss": 0.1289, + "step": 5295 + }, + { + "epoch": 0.8580686973428386, + "grad_norm": 0.8559263348579407, + "learning_rate": 4.188796391952046e-06, + "loss": 0.1388, + "step": 5296 + }, + { + "epoch": 0.8582307193778353, + "grad_norm": 0.8307374119758606, + "learning_rate": 4.1884739230062165e-06, + "loss": 0.1346, + "step": 5297 + }, + { + "epoch": 0.8583927414128322, + "grad_norm": 0.8185672760009766, + "learning_rate": 4.188151402397172e-06, + "loss": 0.1309, + "step": 5298 + }, + { + "epoch": 0.8585547634478289, + "grad_norm": 0.8880597949028015, + "learning_rate": 4.187828830134779e-06, + "loss": 0.1561, + "step": 5299 + }, + { + "epoch": 0.8587167854828257, + "grad_norm": 0.7549867630004883, + "learning_rate": 4.187506206228909e-06, + "loss": 0.1112, + "step": 5300 + }, + { + "epoch": 0.8588788075178224, + "grad_norm": 0.9183105230331421, + "learning_rate": 4.187183530689433e-06, + "loss": 0.1465, + "step": 5301 + }, + { + "epoch": 0.8590408295528191, + "grad_norm": 0.8254534602165222, + "learning_rate": 4.1868608035262225e-06, + "loss": 0.1268, + "step": 5302 + }, + { + "epoch": 0.859202851587816, + "grad_norm": 0.8823205232620239, + "learning_rate": 4.186538024749155e-06, + "loss": 0.1542, + "step": 5303 + }, + { + "epoch": 0.8593648736228127, + "grad_norm": 0.793454110622406, + "learning_rate": 4.186215194368105e-06, + "loss": 0.1343, + "step": 5304 + }, + { + "epoch": 0.8595268956578095, + "grad_norm": 0.820057213306427, + "learning_rate": 4.18589231239295e-06, + "loss": 0.1341, + "step": 5305 + }, + { + "epoch": 0.8596889176928062, + "grad_norm": 0.7398642301559448, + "learning_rate": 4.18556937883357e-06, + "loss": 0.1248, + "step": 5306 + }, + { + "epoch": 0.859850939727803, + "grad_norm": 0.8161394596099854, + "learning_rate": 4.185246393699847e-06, + "loss": 0.131, + "step": 5307 + }, + { + "epoch": 0.8600129617627997, + "grad_norm": 0.9015643000602722, + "learning_rate": 4.184923357001661e-06, + "loss": 0.1234, + "step": 5308 + }, + { + "epoch": 0.8601749837977966, + "grad_norm": 0.8390319347381592, + "learning_rate": 4.184600268748899e-06, + "loss": 0.1388, + "step": 5309 + }, + { + "epoch": 0.8603370058327933, + "grad_norm": 0.9225760102272034, + "learning_rate": 4.184277128951445e-06, + "loss": 0.1541, + "step": 5310 + }, + { + "epoch": 0.86049902786779, + "grad_norm": 0.8592953085899353, + "learning_rate": 4.183953937619187e-06, + "loss": 0.1292, + "step": 5311 + }, + { + "epoch": 0.8606610499027868, + "grad_norm": 0.9123047590255737, + "learning_rate": 4.1836306947620135e-06, + "loss": 0.152, + "step": 5312 + }, + { + "epoch": 0.8608230719377835, + "grad_norm": 0.903361439704895, + "learning_rate": 4.183307400389815e-06, + "loss": 0.1498, + "step": 5313 + }, + { + "epoch": 0.8609850939727803, + "grad_norm": 0.8942160606384277, + "learning_rate": 4.182984054512483e-06, + "loss": 0.1397, + "step": 5314 + }, + { + "epoch": 0.861147116007777, + "grad_norm": 0.9034626483917236, + "learning_rate": 4.1826606571399134e-06, + "loss": 0.1574, + "step": 5315 + }, + { + "epoch": 0.8613091380427739, + "grad_norm": 0.8866369128227234, + "learning_rate": 4.182337208281998e-06, + "loss": 0.1503, + "step": 5316 + }, + { + "epoch": 0.8614711600777706, + "grad_norm": 0.8173493146896362, + "learning_rate": 4.182013707948635e-06, + "loss": 0.1335, + "step": 5317 + }, + { + "epoch": 0.8616331821127673, + "grad_norm": 0.9902571439743042, + "learning_rate": 4.181690156149724e-06, + "loss": 0.158, + "step": 5318 + }, + { + "epoch": 0.8617952041477641, + "grad_norm": 0.8877143859863281, + "learning_rate": 4.181366552895163e-06, + "loss": 0.1396, + "step": 5319 + }, + { + "epoch": 0.8619572261827608, + "grad_norm": 0.9022709131240845, + "learning_rate": 4.1810428981948555e-06, + "loss": 0.145, + "step": 5320 + }, + { + "epoch": 0.8621192482177576, + "grad_norm": 0.8367879986763, + "learning_rate": 4.180719192058702e-06, + "loss": 0.135, + "step": 5321 + }, + { + "epoch": 0.8622812702527544, + "grad_norm": 0.8607010245323181, + "learning_rate": 4.1803954344966095e-06, + "loss": 0.1522, + "step": 5322 + }, + { + "epoch": 0.8624432922877512, + "grad_norm": 0.9045525193214417, + "learning_rate": 4.180071625518482e-06, + "loss": 0.1469, + "step": 5323 + }, + { + "epoch": 0.8626053143227479, + "grad_norm": 0.7549746632575989, + "learning_rate": 4.17974776513423e-06, + "loss": 0.1298, + "step": 5324 + }, + { + "epoch": 0.8627673363577446, + "grad_norm": 0.8301894068717957, + "learning_rate": 4.17942385335376e-06, + "loss": 0.13, + "step": 5325 + }, + { + "epoch": 0.8629293583927414, + "grad_norm": 0.9236185550689697, + "learning_rate": 4.179099890186985e-06, + "loss": 0.1521, + "step": 5326 + }, + { + "epoch": 0.8630913804277381, + "grad_norm": 0.8897588849067688, + "learning_rate": 4.1787758756438166e-06, + "loss": 0.1416, + "step": 5327 + }, + { + "epoch": 0.863253402462735, + "grad_norm": 0.8857617974281311, + "learning_rate": 4.178451809734168e-06, + "loss": 0.1507, + "step": 5328 + }, + { + "epoch": 0.8634154244977317, + "grad_norm": 0.7978904247283936, + "learning_rate": 4.178127692467957e-06, + "loss": 0.1303, + "step": 5329 + }, + { + "epoch": 0.8635774465327285, + "grad_norm": 0.880963921546936, + "learning_rate": 4.1778035238550995e-06, + "loss": 0.1519, + "step": 5330 + }, + { + "epoch": 0.8637394685677252, + "grad_norm": 0.9408493638038635, + "learning_rate": 4.177479303905514e-06, + "loss": 0.1644, + "step": 5331 + }, + { + "epoch": 0.8639014906027219, + "grad_norm": 0.8565788269042969, + "learning_rate": 4.177155032629122e-06, + "loss": 0.1322, + "step": 5332 + }, + { + "epoch": 0.8640635126377187, + "grad_norm": 0.8621842861175537, + "learning_rate": 4.176830710035843e-06, + "loss": 0.1456, + "step": 5333 + }, + { + "epoch": 0.8642255346727155, + "grad_norm": 1.0310484170913696, + "learning_rate": 4.176506336135603e-06, + "loss": 0.1557, + "step": 5334 + }, + { + "epoch": 0.8643875567077123, + "grad_norm": 0.9071840643882751, + "learning_rate": 4.176181910938326e-06, + "loss": 0.154, + "step": 5335 + }, + { + "epoch": 0.864549578742709, + "grad_norm": 0.8645082116127014, + "learning_rate": 4.175857434453939e-06, + "loss": 0.1404, + "step": 5336 + }, + { + "epoch": 0.8647116007777058, + "grad_norm": 0.9229341149330139, + "learning_rate": 4.1755329066923705e-06, + "loss": 0.1429, + "step": 5337 + }, + { + "epoch": 0.8648736228127025, + "grad_norm": 0.9101964831352234, + "learning_rate": 4.175208327663549e-06, + "loss": 0.1418, + "step": 5338 + }, + { + "epoch": 0.8650356448476992, + "grad_norm": 0.8676378726959229, + "learning_rate": 4.1748836973774075e-06, + "loss": 0.1481, + "step": 5339 + }, + { + "epoch": 0.8651976668826961, + "grad_norm": 0.8613499402999878, + "learning_rate": 4.174559015843878e-06, + "loss": 0.147, + "step": 5340 + }, + { + "epoch": 0.8653596889176928, + "grad_norm": 0.8355089426040649, + "learning_rate": 4.174234283072894e-06, + "loss": 0.1394, + "step": 5341 + }, + { + "epoch": 0.8655217109526896, + "grad_norm": 0.8457769751548767, + "learning_rate": 4.173909499074392e-06, + "loss": 0.1457, + "step": 5342 + }, + { + "epoch": 0.8656837329876863, + "grad_norm": 0.7920171618461609, + "learning_rate": 4.173584663858311e-06, + "loss": 0.115, + "step": 5343 + }, + { + "epoch": 0.8658457550226831, + "grad_norm": 0.7907648682594299, + "learning_rate": 4.173259777434589e-06, + "loss": 0.1376, + "step": 5344 + }, + { + "epoch": 0.8660077770576798, + "grad_norm": 0.8158167004585266, + "learning_rate": 4.172934839813168e-06, + "loss": 0.1325, + "step": 5345 + }, + { + "epoch": 0.8661697990926766, + "grad_norm": 0.8299226760864258, + "learning_rate": 4.1726098510039894e-06, + "loss": 0.1327, + "step": 5346 + }, + { + "epoch": 0.8663318211276734, + "grad_norm": 0.7698670029640198, + "learning_rate": 4.172284811016996e-06, + "loss": 0.1341, + "step": 5347 + }, + { + "epoch": 0.8664938431626701, + "grad_norm": 0.9853094816207886, + "learning_rate": 4.171959719862134e-06, + "loss": 0.1616, + "step": 5348 + }, + { + "epoch": 0.8666558651976669, + "grad_norm": 0.7773309350013733, + "learning_rate": 4.171634577549351e-06, + "loss": 0.1279, + "step": 5349 + }, + { + "epoch": 0.8668178872326636, + "grad_norm": 0.8574365377426147, + "learning_rate": 4.171309384088596e-06, + "loss": 0.1493, + "step": 5350 + }, + { + "epoch": 0.8669799092676604, + "grad_norm": 0.8746230006217957, + "learning_rate": 4.170984139489817e-06, + "loss": 0.1465, + "step": 5351 + }, + { + "epoch": 0.8671419313026572, + "grad_norm": 0.9685440063476562, + "learning_rate": 4.170658843762968e-06, + "loss": 0.1518, + "step": 5352 + }, + { + "epoch": 0.8673039533376539, + "grad_norm": 0.930526077747345, + "learning_rate": 4.170333496918001e-06, + "loss": 0.1746, + "step": 5353 + }, + { + "epoch": 0.8674659753726507, + "grad_norm": 0.8363896608352661, + "learning_rate": 4.170008098964871e-06, + "loss": 0.1354, + "step": 5354 + }, + { + "epoch": 0.8676279974076474, + "grad_norm": 0.8706420063972473, + "learning_rate": 4.1696826499135345e-06, + "loss": 0.1363, + "step": 5355 + }, + { + "epoch": 0.8677900194426442, + "grad_norm": 0.8633265495300293, + "learning_rate": 4.169357149773949e-06, + "loss": 0.1452, + "step": 5356 + }, + { + "epoch": 0.8679520414776409, + "grad_norm": 0.8911048173904419, + "learning_rate": 4.169031598556076e-06, + "loss": 0.1588, + "step": 5357 + }, + { + "epoch": 0.8681140635126378, + "grad_norm": 0.8662182092666626, + "learning_rate": 4.168705996269874e-06, + "loss": 0.148, + "step": 5358 + }, + { + "epoch": 0.8682760855476345, + "grad_norm": 0.9110774397850037, + "learning_rate": 4.168380342925307e-06, + "loss": 0.1447, + "step": 5359 + }, + { + "epoch": 0.8684381075826313, + "grad_norm": 0.7783262729644775, + "learning_rate": 4.168054638532338e-06, + "loss": 0.1278, + "step": 5360 + }, + { + "epoch": 0.868600129617628, + "grad_norm": 0.8996500372886658, + "learning_rate": 4.167728883100935e-06, + "loss": 0.1546, + "step": 5361 + }, + { + "epoch": 0.8687621516526247, + "grad_norm": 0.8864397406578064, + "learning_rate": 4.167403076641063e-06, + "loss": 0.1348, + "step": 5362 + }, + { + "epoch": 0.8689241736876215, + "grad_norm": 0.7758836150169373, + "learning_rate": 4.167077219162693e-06, + "loss": 0.1296, + "step": 5363 + }, + { + "epoch": 0.8690861957226182, + "grad_norm": 0.8843368291854858, + "learning_rate": 4.166751310675793e-06, + "loss": 0.1553, + "step": 5364 + }, + { + "epoch": 0.8692482177576151, + "grad_norm": 0.7979202270507812, + "learning_rate": 4.166425351190337e-06, + "loss": 0.1344, + "step": 5365 + }, + { + "epoch": 0.8694102397926118, + "grad_norm": 0.7076486945152283, + "learning_rate": 4.166099340716298e-06, + "loss": 0.1014, + "step": 5366 + }, + { + "epoch": 0.8695722618276086, + "grad_norm": 0.8106122612953186, + "learning_rate": 4.165773279263651e-06, + "loss": 0.1328, + "step": 5367 + }, + { + "epoch": 0.8697342838626053, + "grad_norm": 0.7798807621002197, + "learning_rate": 4.165447166842373e-06, + "loss": 0.1404, + "step": 5368 + }, + { + "epoch": 0.869896305897602, + "grad_norm": 0.776866614818573, + "learning_rate": 4.165121003462441e-06, + "loss": 0.1164, + "step": 5369 + }, + { + "epoch": 0.8700583279325989, + "grad_norm": 0.8250222206115723, + "learning_rate": 4.164794789133837e-06, + "loss": 0.1363, + "step": 5370 + }, + { + "epoch": 0.8702203499675956, + "grad_norm": 0.8319069147109985, + "learning_rate": 4.164468523866541e-06, + "loss": 0.1417, + "step": 5371 + }, + { + "epoch": 0.8703823720025924, + "grad_norm": 0.919909656047821, + "learning_rate": 4.164142207670536e-06, + "loss": 0.1611, + "step": 5372 + }, + { + "epoch": 0.8705443940375891, + "grad_norm": 0.8057491183280945, + "learning_rate": 4.163815840555806e-06, + "loss": 0.1266, + "step": 5373 + }, + { + "epoch": 0.8707064160725859, + "grad_norm": 0.8016455769538879, + "learning_rate": 4.163489422532338e-06, + "loss": 0.1256, + "step": 5374 + }, + { + "epoch": 0.8708684381075826, + "grad_norm": 0.976750910282135, + "learning_rate": 4.1631629536101195e-06, + "loss": 0.1645, + "step": 5375 + }, + { + "epoch": 0.8710304601425793, + "grad_norm": 0.9348950982093811, + "learning_rate": 4.162836433799139e-06, + "loss": 0.1425, + "step": 5376 + }, + { + "epoch": 0.8711924821775762, + "grad_norm": 0.7779992818832397, + "learning_rate": 4.162509863109389e-06, + "loss": 0.1247, + "step": 5377 + }, + { + "epoch": 0.8713545042125729, + "grad_norm": 0.9029181003570557, + "learning_rate": 4.162183241550858e-06, + "loss": 0.141, + "step": 5378 + }, + { + "epoch": 0.8715165262475697, + "grad_norm": 0.8280603885650635, + "learning_rate": 4.1618565691335434e-06, + "loss": 0.1285, + "step": 5379 + }, + { + "epoch": 0.8716785482825664, + "grad_norm": 0.792360782623291, + "learning_rate": 4.161529845867439e-06, + "loss": 0.128, + "step": 5380 + }, + { + "epoch": 0.8718405703175632, + "grad_norm": 0.9347276091575623, + "learning_rate": 4.161203071762543e-06, + "loss": 0.1447, + "step": 5381 + }, + { + "epoch": 0.87200259235256, + "grad_norm": 0.9125970602035522, + "learning_rate": 4.160876246828853e-06, + "loss": 0.1381, + "step": 5382 + }, + { + "epoch": 0.8721646143875567, + "grad_norm": 0.8670451045036316, + "learning_rate": 4.160549371076369e-06, + "loss": 0.1382, + "step": 5383 + }, + { + "epoch": 0.8723266364225535, + "grad_norm": 0.8932413458824158, + "learning_rate": 4.160222444515092e-06, + "loss": 0.1445, + "step": 5384 + }, + { + "epoch": 0.8724886584575502, + "grad_norm": 0.8334319591522217, + "learning_rate": 4.159895467155026e-06, + "loss": 0.1404, + "step": 5385 + }, + { + "epoch": 0.872650680492547, + "grad_norm": 0.8121306896209717, + "learning_rate": 4.159568439006176e-06, + "loss": 0.1379, + "step": 5386 + }, + { + "epoch": 0.8728127025275437, + "grad_norm": 0.8688573837280273, + "learning_rate": 4.159241360078548e-06, + "loss": 0.1456, + "step": 5387 + }, + { + "epoch": 0.8729747245625405, + "grad_norm": 0.8056637644767761, + "learning_rate": 4.1589142303821485e-06, + "loss": 0.1377, + "step": 5388 + }, + { + "epoch": 0.8731367465975373, + "grad_norm": 0.8120809197425842, + "learning_rate": 4.15858704992699e-06, + "loss": 0.1306, + "step": 5389 + }, + { + "epoch": 0.873298768632534, + "grad_norm": 0.8274014592170715, + "learning_rate": 4.158259818723079e-06, + "loss": 0.1463, + "step": 5390 + }, + { + "epoch": 0.8734607906675308, + "grad_norm": 0.9297308921813965, + "learning_rate": 4.157932536780432e-06, + "loss": 0.145, + "step": 5391 + }, + { + "epoch": 0.8736228127025275, + "grad_norm": 0.8492165803909302, + "learning_rate": 4.157605204109062e-06, + "loss": 0.1381, + "step": 5392 + }, + { + "epoch": 0.8737848347375243, + "grad_norm": 0.8986421823501587, + "learning_rate": 4.157277820718983e-06, + "loss": 0.145, + "step": 5393 + }, + { + "epoch": 0.873946856772521, + "grad_norm": 0.8207853436470032, + "learning_rate": 4.156950386620214e-06, + "loss": 0.1245, + "step": 5394 + }, + { + "epoch": 0.8741088788075179, + "grad_norm": 0.8820725679397583, + "learning_rate": 4.156622901822772e-06, + "loss": 0.1459, + "step": 5395 + }, + { + "epoch": 0.8742709008425146, + "grad_norm": 0.8960903286933899, + "learning_rate": 4.156295366336679e-06, + "loss": 0.1464, + "step": 5396 + }, + { + "epoch": 0.8744329228775113, + "grad_norm": 0.8151302337646484, + "learning_rate": 4.1559677801719554e-06, + "loss": 0.1344, + "step": 5397 + }, + { + "epoch": 0.8745949449125081, + "grad_norm": 0.8475366830825806, + "learning_rate": 4.155640143338625e-06, + "loss": 0.1303, + "step": 5398 + }, + { + "epoch": 0.8747569669475048, + "grad_norm": 0.8408517241477966, + "learning_rate": 4.155312455846714e-06, + "loss": 0.1476, + "step": 5399 + }, + { + "epoch": 0.8749189889825016, + "grad_norm": 0.8452920317649841, + "learning_rate": 4.154984717706246e-06, + "loss": 0.143, + "step": 5400 + }, + { + "epoch": 0.8750810110174984, + "grad_norm": 0.8104416728019714, + "learning_rate": 4.154656928927252e-06, + "loss": 0.1317, + "step": 5401 + }, + { + "epoch": 0.8752430330524952, + "grad_norm": 0.8546898365020752, + "learning_rate": 4.15432908951976e-06, + "loss": 0.1447, + "step": 5402 + }, + { + "epoch": 0.8754050550874919, + "grad_norm": 0.9186674952507019, + "learning_rate": 4.1540011994938e-06, + "loss": 0.1344, + "step": 5403 + }, + { + "epoch": 0.8755670771224887, + "grad_norm": 0.8319907188415527, + "learning_rate": 4.153673258859406e-06, + "loss": 0.1459, + "step": 5404 + }, + { + "epoch": 0.8757290991574854, + "grad_norm": 0.8684608340263367, + "learning_rate": 4.153345267626614e-06, + "loss": 0.1466, + "step": 5405 + }, + { + "epoch": 0.8758911211924821, + "grad_norm": 0.8214244246482849, + "learning_rate": 4.153017225805456e-06, + "loss": 0.1257, + "step": 5406 + }, + { + "epoch": 0.876053143227479, + "grad_norm": 0.9074698090553284, + "learning_rate": 4.152689133405971e-06, + "loss": 0.1296, + "step": 5407 + }, + { + "epoch": 0.8762151652624757, + "grad_norm": 0.8300620317459106, + "learning_rate": 4.1523609904382e-06, + "loss": 0.1331, + "step": 5408 + }, + { + "epoch": 0.8763771872974725, + "grad_norm": 0.8042019009590149, + "learning_rate": 4.152032796912179e-06, + "loss": 0.1265, + "step": 5409 + }, + { + "epoch": 0.8765392093324692, + "grad_norm": 0.8547518253326416, + "learning_rate": 4.1517045528379544e-06, + "loss": 0.138, + "step": 5410 + }, + { + "epoch": 0.876701231367466, + "grad_norm": 0.9264114499092102, + "learning_rate": 4.1513762582255655e-06, + "loss": 0.1608, + "step": 5411 + }, + { + "epoch": 0.8768632534024627, + "grad_norm": 0.7929533123970032, + "learning_rate": 4.151047913085061e-06, + "loss": 0.129, + "step": 5412 + }, + { + "epoch": 0.8770252754374595, + "grad_norm": 0.9087817072868347, + "learning_rate": 4.150719517426485e-06, + "loss": 0.1529, + "step": 5413 + }, + { + "epoch": 0.8771872974724563, + "grad_norm": 0.8429016470909119, + "learning_rate": 4.150391071259886e-06, + "loss": 0.1428, + "step": 5414 + }, + { + "epoch": 0.877349319507453, + "grad_norm": 0.9927331209182739, + "learning_rate": 4.1500625745953145e-06, + "loss": 0.1493, + "step": 5415 + }, + { + "epoch": 0.8775113415424498, + "grad_norm": 0.7533746361732483, + "learning_rate": 4.149734027442821e-06, + "loss": 0.1202, + "step": 5416 + }, + { + "epoch": 0.8776733635774465, + "grad_norm": 0.8377231359481812, + "learning_rate": 4.14940542981246e-06, + "loss": 0.1399, + "step": 5417 + }, + { + "epoch": 0.8778353856124433, + "grad_norm": 1.0180152654647827, + "learning_rate": 4.149076781714283e-06, + "loss": 0.1536, + "step": 5418 + }, + { + "epoch": 0.87799740764744, + "grad_norm": 0.9475023746490479, + "learning_rate": 4.148748083158347e-06, + "loss": 0.1683, + "step": 5419 + }, + { + "epoch": 0.8781594296824368, + "grad_norm": 0.863429844379425, + "learning_rate": 4.1484193341547106e-06, + "loss": 0.1407, + "step": 5420 + }, + { + "epoch": 0.8783214517174336, + "grad_norm": 0.7526499032974243, + "learning_rate": 4.14809053471343e-06, + "loss": 0.1262, + "step": 5421 + }, + { + "epoch": 0.8784834737524303, + "grad_norm": 0.851190984249115, + "learning_rate": 4.147761684844569e-06, + "loss": 0.1361, + "step": 5422 + }, + { + "epoch": 0.8786454957874271, + "grad_norm": 0.823888897895813, + "learning_rate": 4.147432784558188e-06, + "loss": 0.1298, + "step": 5423 + }, + { + "epoch": 0.8788075178224238, + "grad_norm": 0.8875905871391296, + "learning_rate": 4.147103833864349e-06, + "loss": 0.1347, + "step": 5424 + }, + { + "epoch": 0.8789695398574207, + "grad_norm": 0.9177705645561218, + "learning_rate": 4.146774832773119e-06, + "loss": 0.1559, + "step": 5425 + }, + { + "epoch": 0.8791315618924174, + "grad_norm": 0.8365907669067383, + "learning_rate": 4.146445781294566e-06, + "loss": 0.1295, + "step": 5426 + }, + { + "epoch": 0.8792935839274141, + "grad_norm": 0.8464487791061401, + "learning_rate": 4.146116679438754e-06, + "loss": 0.135, + "step": 5427 + }, + { + "epoch": 0.8794556059624109, + "grad_norm": 0.8880855441093445, + "learning_rate": 4.145787527215757e-06, + "loss": 0.1451, + "step": 5428 + }, + { + "epoch": 0.8796176279974076, + "grad_norm": 0.991811215877533, + "learning_rate": 4.145458324635643e-06, + "loss": 0.1647, + "step": 5429 + }, + { + "epoch": 0.8797796500324044, + "grad_norm": 0.7490970492362976, + "learning_rate": 4.145129071708487e-06, + "loss": 0.1117, + "step": 5430 + }, + { + "epoch": 0.8799416720674011, + "grad_norm": 1.0946390628814697, + "learning_rate": 4.144799768444362e-06, + "loss": 0.1776, + "step": 5431 + }, + { + "epoch": 0.880103694102398, + "grad_norm": 0.8403367400169373, + "learning_rate": 4.144470414853345e-06, + "loss": 0.148, + "step": 5432 + }, + { + "epoch": 0.8802657161373947, + "grad_norm": 0.9710992574691772, + "learning_rate": 4.1441410109455126e-06, + "loss": 0.1562, + "step": 5433 + }, + { + "epoch": 0.8804277381723914, + "grad_norm": 0.8550183176994324, + "learning_rate": 4.143811556730944e-06, + "loss": 0.1501, + "step": 5434 + }, + { + "epoch": 0.8805897602073882, + "grad_norm": 0.92138671875, + "learning_rate": 4.143482052219719e-06, + "loss": 0.1599, + "step": 5435 + }, + { + "epoch": 0.8807517822423849, + "grad_norm": 0.8856903910636902, + "learning_rate": 4.143152497421922e-06, + "loss": 0.1497, + "step": 5436 + }, + { + "epoch": 0.8809138042773818, + "grad_norm": 0.7469887733459473, + "learning_rate": 4.142822892347634e-06, + "loss": 0.1194, + "step": 5437 + }, + { + "epoch": 0.8810758263123785, + "grad_norm": 0.8389981389045715, + "learning_rate": 4.142493237006941e-06, + "loss": 0.1428, + "step": 5438 + }, + { + "epoch": 0.8812378483473753, + "grad_norm": 0.7978473901748657, + "learning_rate": 4.14216353140993e-06, + "loss": 0.1363, + "step": 5439 + }, + { + "epoch": 0.881399870382372, + "grad_norm": 0.9018765687942505, + "learning_rate": 4.141833775566688e-06, + "loss": 0.141, + "step": 5440 + }, + { + "epoch": 0.8815618924173687, + "grad_norm": 0.9037135243415833, + "learning_rate": 4.141503969487307e-06, + "loss": 0.1424, + "step": 5441 + }, + { + "epoch": 0.8817239144523655, + "grad_norm": 0.9119111895561218, + "learning_rate": 4.1411741131818765e-06, + "loss": 0.1625, + "step": 5442 + }, + { + "epoch": 0.8818859364873622, + "grad_norm": 0.796134352684021, + "learning_rate": 4.140844206660489e-06, + "loss": 0.1313, + "step": 5443 + }, + { + "epoch": 0.8820479585223591, + "grad_norm": 0.7966582179069519, + "learning_rate": 4.14051424993324e-06, + "loss": 0.1299, + "step": 5444 + }, + { + "epoch": 0.8822099805573558, + "grad_norm": 0.9936891794204712, + "learning_rate": 4.140184243010225e-06, + "loss": 0.156, + "step": 5445 + }, + { + "epoch": 0.8823720025923526, + "grad_norm": 0.7525246739387512, + "learning_rate": 4.1398541859015405e-06, + "loss": 0.1276, + "step": 5446 + }, + { + "epoch": 0.8825340246273493, + "grad_norm": 0.8001253604888916, + "learning_rate": 4.139524078617287e-06, + "loss": 0.1233, + "step": 5447 + }, + { + "epoch": 0.8826960466623461, + "grad_norm": 0.8744903206825256, + "learning_rate": 4.139193921167565e-06, + "loss": 0.1431, + "step": 5448 + }, + { + "epoch": 0.8828580686973428, + "grad_norm": 0.8717635273933411, + "learning_rate": 4.138863713562475e-06, + "loss": 0.1468, + "step": 5449 + }, + { + "epoch": 0.8830200907323396, + "grad_norm": 0.8670765161514282, + "learning_rate": 4.138533455812121e-06, + "loss": 0.1377, + "step": 5450 + }, + { + "epoch": 0.8831821127673364, + "grad_norm": 0.9572997689247131, + "learning_rate": 4.1382031479266084e-06, + "loss": 0.1574, + "step": 5451 + }, + { + "epoch": 0.8833441348023331, + "grad_norm": 0.9534028768539429, + "learning_rate": 4.137872789916044e-06, + "loss": 0.176, + "step": 5452 + }, + { + "epoch": 0.8835061568373299, + "grad_norm": 1.0097438097000122, + "learning_rate": 4.137542381790537e-06, + "loss": 0.1481, + "step": 5453 + }, + { + "epoch": 0.8836681788723266, + "grad_norm": 0.7446777820587158, + "learning_rate": 4.137211923560195e-06, + "loss": 0.1169, + "step": 5454 + }, + { + "epoch": 0.8838302009073234, + "grad_norm": 0.7878828644752502, + "learning_rate": 4.13688141523513e-06, + "loss": 0.1216, + "step": 5455 + }, + { + "epoch": 0.8839922229423202, + "grad_norm": 0.9208990931510925, + "learning_rate": 4.136550856825455e-06, + "loss": 0.1597, + "step": 5456 + }, + { + "epoch": 0.8841542449773169, + "grad_norm": 0.872546911239624, + "learning_rate": 4.136220248341284e-06, + "loss": 0.1296, + "step": 5457 + }, + { + "epoch": 0.8843162670123137, + "grad_norm": 0.9872225522994995, + "learning_rate": 4.135889589792733e-06, + "loss": 0.1712, + "step": 5458 + }, + { + "epoch": 0.8844782890473104, + "grad_norm": 0.8968865871429443, + "learning_rate": 4.135558881189919e-06, + "loss": 0.1464, + "step": 5459 + }, + { + "epoch": 0.8846403110823072, + "grad_norm": 0.883246898651123, + "learning_rate": 4.135228122542962e-06, + "loss": 0.1519, + "step": 5460 + }, + { + "epoch": 0.8848023331173039, + "grad_norm": 0.9164681434631348, + "learning_rate": 4.134897313861981e-06, + "loss": 0.1489, + "step": 5461 + }, + { + "epoch": 0.8849643551523008, + "grad_norm": 0.8244009017944336, + "learning_rate": 4.1345664551570985e-06, + "loss": 0.1366, + "step": 5462 + }, + { + "epoch": 0.8851263771872975, + "grad_norm": 0.8056451082229614, + "learning_rate": 4.134235546438439e-06, + "loss": 0.1461, + "step": 5463 + }, + { + "epoch": 0.8852883992222942, + "grad_norm": 0.8311920762062073, + "learning_rate": 4.133904587716126e-06, + "loss": 0.1495, + "step": 5464 + }, + { + "epoch": 0.885450421257291, + "grad_norm": 0.9182047843933105, + "learning_rate": 4.133573579000286e-06, + "loss": 0.1392, + "step": 5465 + }, + { + "epoch": 0.8856124432922877, + "grad_norm": 0.8886732459068298, + "learning_rate": 4.133242520301049e-06, + "loss": 0.133, + "step": 5466 + }, + { + "epoch": 0.8857744653272845, + "grad_norm": 0.6573583483695984, + "learning_rate": 4.1329114116285415e-06, + "loss": 0.1026, + "step": 5467 + }, + { + "epoch": 0.8859364873622813, + "grad_norm": 0.8312880992889404, + "learning_rate": 4.132580252992898e-06, + "loss": 0.1333, + "step": 5468 + }, + { + "epoch": 0.8860985093972781, + "grad_norm": 0.8526415228843689, + "learning_rate": 4.132249044404249e-06, + "loss": 0.1405, + "step": 5469 + }, + { + "epoch": 0.8862605314322748, + "grad_norm": 0.9522249102592468, + "learning_rate": 4.131917785872728e-06, + "loss": 0.1444, + "step": 5470 + }, + { + "epoch": 0.8864225534672715, + "grad_norm": 0.8345977067947388, + "learning_rate": 4.131586477408473e-06, + "loss": 0.1337, + "step": 5471 + }, + { + "epoch": 0.8865845755022683, + "grad_norm": 0.8408584594726562, + "learning_rate": 4.13125511902162e-06, + "loss": 0.1444, + "step": 5472 + }, + { + "epoch": 0.886746597537265, + "grad_norm": 0.8456746935844421, + "learning_rate": 4.1309237107223086e-06, + "loss": 0.1427, + "step": 5473 + }, + { + "epoch": 0.8869086195722619, + "grad_norm": 0.8888773918151855, + "learning_rate": 4.130592252520677e-06, + "loss": 0.1442, + "step": 5474 + }, + { + "epoch": 0.8870706416072586, + "grad_norm": 0.9684158563613892, + "learning_rate": 4.13026074442687e-06, + "loss": 0.1564, + "step": 5475 + }, + { + "epoch": 0.8872326636422554, + "grad_norm": 0.9485166072845459, + "learning_rate": 4.129929186451028e-06, + "loss": 0.1569, + "step": 5476 + }, + { + "epoch": 0.8873946856772521, + "grad_norm": 0.9848198294639587, + "learning_rate": 4.129597578603298e-06, + "loss": 0.1365, + "step": 5477 + }, + { + "epoch": 0.8875567077122488, + "grad_norm": 0.9694206118583679, + "learning_rate": 4.129265920893826e-06, + "loss": 0.1594, + "step": 5478 + }, + { + "epoch": 0.8877187297472456, + "grad_norm": 0.7698529958724976, + "learning_rate": 4.128934213332759e-06, + "loss": 0.1278, + "step": 5479 + }, + { + "epoch": 0.8878807517822424, + "grad_norm": 0.814104437828064, + "learning_rate": 4.128602455930247e-06, + "loss": 0.1389, + "step": 5480 + }, + { + "epoch": 0.8880427738172392, + "grad_norm": 0.8766177892684937, + "learning_rate": 4.128270648696441e-06, + "loss": 0.1513, + "step": 5481 + }, + { + "epoch": 0.8882047958522359, + "grad_norm": 0.796972393989563, + "learning_rate": 4.127938791641493e-06, + "loss": 0.1341, + "step": 5482 + }, + { + "epoch": 0.8883668178872327, + "grad_norm": 0.8404687643051147, + "learning_rate": 4.127606884775559e-06, + "loss": 0.1433, + "step": 5483 + }, + { + "epoch": 0.8885288399222294, + "grad_norm": 0.8364979028701782, + "learning_rate": 4.127274928108792e-06, + "loss": 0.1228, + "step": 5484 + }, + { + "epoch": 0.8886908619572261, + "grad_norm": 0.872944176197052, + "learning_rate": 4.12694292165135e-06, + "loss": 0.1582, + "step": 5485 + }, + { + "epoch": 0.888852883992223, + "grad_norm": 0.8381401300430298, + "learning_rate": 4.126610865413392e-06, + "loss": 0.1407, + "step": 5486 + }, + { + "epoch": 0.8890149060272197, + "grad_norm": 0.8389954566955566, + "learning_rate": 4.126278759405078e-06, + "loss": 0.1522, + "step": 5487 + }, + { + "epoch": 0.8891769280622165, + "grad_norm": 0.8609594702720642, + "learning_rate": 4.125946603636569e-06, + "loss": 0.149, + "step": 5488 + }, + { + "epoch": 0.8893389500972132, + "grad_norm": 0.9145193099975586, + "learning_rate": 4.12561439811803e-06, + "loss": 0.1501, + "step": 5489 + }, + { + "epoch": 0.88950097213221, + "grad_norm": 0.8599084615707397, + "learning_rate": 4.125282142859622e-06, + "loss": 0.1438, + "step": 5490 + }, + { + "epoch": 0.8896629941672067, + "grad_norm": 0.8774310946464539, + "learning_rate": 4.124949837871516e-06, + "loss": 0.1433, + "step": 5491 + }, + { + "epoch": 0.8898250162022034, + "grad_norm": 0.8296245336532593, + "learning_rate": 4.124617483163876e-06, + "loss": 0.1127, + "step": 5492 + }, + { + "epoch": 0.8899870382372003, + "grad_norm": 0.8327217102050781, + "learning_rate": 4.124285078746872e-06, + "loss": 0.1337, + "step": 5493 + }, + { + "epoch": 0.890149060272197, + "grad_norm": 0.8418365120887756, + "learning_rate": 4.123952624630676e-06, + "loss": 0.1461, + "step": 5494 + }, + { + "epoch": 0.8903110823071938, + "grad_norm": 0.8435823917388916, + "learning_rate": 4.123620120825459e-06, + "loss": 0.1376, + "step": 5495 + }, + { + "epoch": 0.8904731043421905, + "grad_norm": 0.8990997076034546, + "learning_rate": 4.123287567341396e-06, + "loss": 0.1331, + "step": 5496 + }, + { + "epoch": 0.8906351263771873, + "grad_norm": 0.9464835524559021, + "learning_rate": 4.122954964188662e-06, + "loss": 0.1385, + "step": 5497 + }, + { + "epoch": 0.890797148412184, + "grad_norm": 0.8224294781684875, + "learning_rate": 4.122622311377433e-06, + "loss": 0.1298, + "step": 5498 + }, + { + "epoch": 0.8909591704471809, + "grad_norm": 0.8208863735198975, + "learning_rate": 4.122289608917888e-06, + "loss": 0.1374, + "step": 5499 + }, + { + "epoch": 0.8911211924821776, + "grad_norm": 0.7936856746673584, + "learning_rate": 4.121956856820207e-06, + "loss": 0.1262, + "step": 5500 + }, + { + "epoch": 0.8912832145171743, + "grad_norm": 0.8944551348686218, + "learning_rate": 4.121624055094571e-06, + "loss": 0.1464, + "step": 5501 + }, + { + "epoch": 0.8914452365521711, + "grad_norm": 0.8844398260116577, + "learning_rate": 4.1212912037511634e-06, + "loss": 0.1422, + "step": 5502 + }, + { + "epoch": 0.8916072585871678, + "grad_norm": 0.790374755859375, + "learning_rate": 4.120958302800169e-06, + "loss": 0.1264, + "step": 5503 + }, + { + "epoch": 0.8917692806221647, + "grad_norm": 0.838116466999054, + "learning_rate": 4.1206253522517725e-06, + "loss": 0.1279, + "step": 5504 + }, + { + "epoch": 0.8919313026571614, + "grad_norm": 0.9478814601898193, + "learning_rate": 4.120292352116162e-06, + "loss": 0.1509, + "step": 5505 + }, + { + "epoch": 0.8920933246921582, + "grad_norm": 0.8016614317893982, + "learning_rate": 4.119959302403527e-06, + "loss": 0.1353, + "step": 5506 + }, + { + "epoch": 0.8922553467271549, + "grad_norm": 0.7916797399520874, + "learning_rate": 4.119626203124056e-06, + "loss": 0.1264, + "step": 5507 + }, + { + "epoch": 0.8924173687621516, + "grad_norm": 0.8813158273696899, + "learning_rate": 4.119293054287945e-06, + "loss": 0.1237, + "step": 5508 + }, + { + "epoch": 0.8925793907971484, + "grad_norm": 0.8696245551109314, + "learning_rate": 4.118959855905383e-06, + "loss": 0.1459, + "step": 5509 + }, + { + "epoch": 0.8927414128321451, + "grad_norm": 0.7300010919570923, + "learning_rate": 4.118626607986569e-06, + "loss": 0.1244, + "step": 5510 + }, + { + "epoch": 0.892903434867142, + "grad_norm": 0.8156650066375732, + "learning_rate": 4.118293310541697e-06, + "loss": 0.1353, + "step": 5511 + }, + { + "epoch": 0.8930654569021387, + "grad_norm": 0.7121819853782654, + "learning_rate": 4.1179599635809654e-06, + "loss": 0.1122, + "step": 5512 + }, + { + "epoch": 0.8932274789371355, + "grad_norm": 0.8799218535423279, + "learning_rate": 4.117626567114575e-06, + "loss": 0.1382, + "step": 5513 + }, + { + "epoch": 0.8933895009721322, + "grad_norm": 0.863865315914154, + "learning_rate": 4.1172931211527254e-06, + "loss": 0.1367, + "step": 5514 + }, + { + "epoch": 0.8935515230071289, + "grad_norm": 0.9362986087799072, + "learning_rate": 4.116959625705621e-06, + "loss": 0.151, + "step": 5515 + }, + { + "epoch": 0.8937135450421257, + "grad_norm": 1.1234683990478516, + "learning_rate": 4.116626080783464e-06, + "loss": 0.1598, + "step": 5516 + }, + { + "epoch": 0.8938755670771225, + "grad_norm": 0.8734990954399109, + "learning_rate": 4.116292486396463e-06, + "loss": 0.148, + "step": 5517 + }, + { + "epoch": 0.8940375891121193, + "grad_norm": 0.8898338079452515, + "learning_rate": 4.1159588425548215e-06, + "loss": 0.1485, + "step": 5518 + }, + { + "epoch": 0.894199611147116, + "grad_norm": 0.9480844140052795, + "learning_rate": 4.1156251492687505e-06, + "loss": 0.1463, + "step": 5519 + }, + { + "epoch": 0.8943616331821128, + "grad_norm": 0.8957088589668274, + "learning_rate": 4.11529140654846e-06, + "loss": 0.1609, + "step": 5520 + }, + { + "epoch": 0.8945236552171095, + "grad_norm": 0.8064636588096619, + "learning_rate": 4.114957614404161e-06, + "loss": 0.1282, + "step": 5521 + }, + { + "epoch": 0.8946856772521062, + "grad_norm": 0.9250004291534424, + "learning_rate": 4.114623772846067e-06, + "loss": 0.1473, + "step": 5522 + }, + { + "epoch": 0.8948476992871031, + "grad_norm": 0.8078949451446533, + "learning_rate": 4.114289881884394e-06, + "loss": 0.1319, + "step": 5523 + }, + { + "epoch": 0.8950097213220998, + "grad_norm": 0.7498173713684082, + "learning_rate": 4.113955941529355e-06, + "loss": 0.1088, + "step": 5524 + }, + { + "epoch": 0.8951717433570966, + "grad_norm": 0.7994401454925537, + "learning_rate": 4.1136219517911715e-06, + "loss": 0.1244, + "step": 5525 + }, + { + "epoch": 0.8953337653920933, + "grad_norm": 1.1339514255523682, + "learning_rate": 4.113287912680061e-06, + "loss": 0.1646, + "step": 5526 + }, + { + "epoch": 0.8954957874270901, + "grad_norm": 0.9239411950111389, + "learning_rate": 4.112953824206244e-06, + "loss": 0.1468, + "step": 5527 + }, + { + "epoch": 0.8956578094620868, + "grad_norm": 0.9831513166427612, + "learning_rate": 4.112619686379944e-06, + "loss": 0.1582, + "step": 5528 + }, + { + "epoch": 0.8958198314970836, + "grad_norm": 0.8446866869926453, + "learning_rate": 4.112285499211383e-06, + "loss": 0.1353, + "step": 5529 + }, + { + "epoch": 0.8959818535320804, + "grad_norm": 0.8573483228683472, + "learning_rate": 4.111951262710788e-06, + "loss": 0.1379, + "step": 5530 + }, + { + "epoch": 0.8961438755670771, + "grad_norm": 0.9121212959289551, + "learning_rate": 4.111616976888385e-06, + "loss": 0.1577, + "step": 5531 + }, + { + "epoch": 0.8963058976020739, + "grad_norm": 0.8051736950874329, + "learning_rate": 4.111282641754403e-06, + "loss": 0.1327, + "step": 5532 + }, + { + "epoch": 0.8964679196370706, + "grad_norm": 0.7908130288124084, + "learning_rate": 4.1109482573190705e-06, + "loss": 0.1223, + "step": 5533 + }, + { + "epoch": 0.8966299416720674, + "grad_norm": 0.8380170464515686, + "learning_rate": 4.110613823592621e-06, + "loss": 0.1249, + "step": 5534 + }, + { + "epoch": 0.8967919637070642, + "grad_norm": 0.9545575380325317, + "learning_rate": 4.110279340585285e-06, + "loss": 0.1605, + "step": 5535 + }, + { + "epoch": 0.8969539857420609, + "grad_norm": 0.8907868266105652, + "learning_rate": 4.109944808307298e-06, + "loss": 0.1562, + "step": 5536 + }, + { + "epoch": 0.8971160077770577, + "grad_norm": 0.8784735202789307, + "learning_rate": 4.109610226768897e-06, + "loss": 0.1361, + "step": 5537 + }, + { + "epoch": 0.8972780298120544, + "grad_norm": 0.8206544518470764, + "learning_rate": 4.109275595980316e-06, + "loss": 0.1343, + "step": 5538 + }, + { + "epoch": 0.8974400518470512, + "grad_norm": 0.7294670939445496, + "learning_rate": 4.108940915951798e-06, + "loss": 0.1152, + "step": 5539 + }, + { + "epoch": 0.8976020738820479, + "grad_norm": 0.9412605166435242, + "learning_rate": 4.108606186693582e-06, + "loss": 0.1527, + "step": 5540 + }, + { + "epoch": 0.8977640959170448, + "grad_norm": 0.8774265050888062, + "learning_rate": 4.1082714082159084e-06, + "loss": 0.147, + "step": 5541 + }, + { + "epoch": 0.8979261179520415, + "grad_norm": 0.8635554313659668, + "learning_rate": 4.1079365805290214e-06, + "loss": 0.1484, + "step": 5542 + }, + { + "epoch": 0.8980881399870383, + "grad_norm": 0.9252685904502869, + "learning_rate": 4.107601703643167e-06, + "loss": 0.1404, + "step": 5543 + }, + { + "epoch": 0.898250162022035, + "grad_norm": 0.9969372749328613, + "learning_rate": 4.10726677756859e-06, + "loss": 0.1578, + "step": 5544 + }, + { + "epoch": 0.8984121840570317, + "grad_norm": 0.9096860289573669, + "learning_rate": 4.1069318023155405e-06, + "loss": 0.1433, + "step": 5545 + }, + { + "epoch": 0.8985742060920285, + "grad_norm": 0.8402926325798035, + "learning_rate": 4.106596777894265e-06, + "loss": 0.1256, + "step": 5546 + }, + { + "epoch": 0.8987362281270252, + "grad_norm": 0.8750645518302917, + "learning_rate": 4.106261704315017e-06, + "loss": 0.1416, + "step": 5547 + }, + { + "epoch": 0.8988982501620221, + "grad_norm": 0.7757678627967834, + "learning_rate": 4.105926581588046e-06, + "loss": 0.1162, + "step": 5548 + }, + { + "epoch": 0.8990602721970188, + "grad_norm": 1.0043946504592896, + "learning_rate": 4.10559140972361e-06, + "loss": 0.1457, + "step": 5549 + }, + { + "epoch": 0.8992222942320156, + "grad_norm": 0.8587821125984192, + "learning_rate": 4.105256188731962e-06, + "loss": 0.1499, + "step": 5550 + }, + { + "epoch": 0.8993843162670123, + "grad_norm": 0.8745139241218567, + "learning_rate": 4.104920918623359e-06, + "loss": 0.1486, + "step": 5551 + }, + { + "epoch": 0.899546338302009, + "grad_norm": 0.8056247234344482, + "learning_rate": 4.104585599408059e-06, + "loss": 0.1264, + "step": 5552 + }, + { + "epoch": 0.8997083603370059, + "grad_norm": 0.9480019211769104, + "learning_rate": 4.104250231096324e-06, + "loss": 0.1563, + "step": 5553 + }, + { + "epoch": 0.8998703823720026, + "grad_norm": 0.8790834546089172, + "learning_rate": 4.1039148136984134e-06, + "loss": 0.1446, + "step": 5554 + }, + { + "epoch": 0.9000324044069994, + "grad_norm": 0.8392221927642822, + "learning_rate": 4.1035793472245905e-06, + "loss": 0.1363, + "step": 5555 + }, + { + "epoch": 0.9001944264419961, + "grad_norm": 0.9883176684379578, + "learning_rate": 4.103243831685121e-06, + "loss": 0.1532, + "step": 5556 + }, + { + "epoch": 0.9003564484769929, + "grad_norm": 0.9498316049575806, + "learning_rate": 4.102908267090269e-06, + "loss": 0.1664, + "step": 5557 + }, + { + "epoch": 0.9005184705119896, + "grad_norm": 1.109565258026123, + "learning_rate": 4.102572653450304e-06, + "loss": 0.1679, + "step": 5558 + }, + { + "epoch": 0.9006804925469863, + "grad_norm": 1.8112865686416626, + "learning_rate": 4.102236990775493e-06, + "loss": 0.1594, + "step": 5559 + }, + { + "epoch": 0.9008425145819832, + "grad_norm": 0.8903408050537109, + "learning_rate": 4.101901279076108e-06, + "loss": 0.1408, + "step": 5560 + }, + { + "epoch": 0.9010045366169799, + "grad_norm": 0.8736206293106079, + "learning_rate": 4.101565518362421e-06, + "loss": 0.1407, + "step": 5561 + }, + { + "epoch": 0.9011665586519767, + "grad_norm": 0.8691003322601318, + "learning_rate": 4.101229708644704e-06, + "loss": 0.1545, + "step": 5562 + }, + { + "epoch": 0.9013285806869734, + "grad_norm": 0.951155960559845, + "learning_rate": 4.100893849933234e-06, + "loss": 0.1373, + "step": 5563 + }, + { + "epoch": 0.9014906027219702, + "grad_norm": 0.774764358997345, + "learning_rate": 4.100557942238284e-06, + "loss": 0.1234, + "step": 5564 + }, + { + "epoch": 0.901652624756967, + "grad_norm": 0.8135146498680115, + "learning_rate": 4.100221985570137e-06, + "loss": 0.1403, + "step": 5565 + }, + { + "epoch": 0.9018146467919637, + "grad_norm": 0.8030052781105042, + "learning_rate": 4.099885979939068e-06, + "loss": 0.1311, + "step": 5566 + }, + { + "epoch": 0.9019766688269605, + "grad_norm": 0.7571940422058105, + "learning_rate": 4.099549925355359e-06, + "loss": 0.1109, + "step": 5567 + }, + { + "epoch": 0.9021386908619572, + "grad_norm": 0.7888020277023315, + "learning_rate": 4.099213821829295e-06, + "loss": 0.1194, + "step": 5568 + }, + { + "epoch": 0.902300712896954, + "grad_norm": 0.9889339208602905, + "learning_rate": 4.098877669371156e-06, + "loss": 0.1492, + "step": 5569 + }, + { + "epoch": 0.9024627349319507, + "grad_norm": 0.8497052192687988, + "learning_rate": 4.098541467991231e-06, + "loss": 0.1194, + "step": 5570 + }, + { + "epoch": 0.9026247569669476, + "grad_norm": 0.8576518297195435, + "learning_rate": 4.098205217699806e-06, + "loss": 0.1441, + "step": 5571 + }, + { + "epoch": 0.9027867790019443, + "grad_norm": 0.8945521712303162, + "learning_rate": 4.097868918507168e-06, + "loss": 0.1584, + "step": 5572 + }, + { + "epoch": 0.902948801036941, + "grad_norm": 0.9076288342475891, + "learning_rate": 4.097532570423608e-06, + "loss": 0.1563, + "step": 5573 + }, + { + "epoch": 0.9031108230719378, + "grad_norm": 0.8376015424728394, + "learning_rate": 4.097196173459417e-06, + "loss": 0.1199, + "step": 5574 + }, + { + "epoch": 0.9032728451069345, + "grad_norm": 0.9018917679786682, + "learning_rate": 4.096859727624889e-06, + "loss": 0.1483, + "step": 5575 + }, + { + "epoch": 0.9034348671419313, + "grad_norm": 0.8175559639930725, + "learning_rate": 4.0965232329303175e-06, + "loss": 0.1333, + "step": 5576 + }, + { + "epoch": 0.903596889176928, + "grad_norm": 0.8534868955612183, + "learning_rate": 4.096186689385997e-06, + "loss": 0.136, + "step": 5577 + }, + { + "epoch": 0.9037589112119249, + "grad_norm": 0.8894876837730408, + "learning_rate": 4.095850097002228e-06, + "loss": 0.146, + "step": 5578 + }, + { + "epoch": 0.9039209332469216, + "grad_norm": 0.852396547794342, + "learning_rate": 4.095513455789307e-06, + "loss": 0.1268, + "step": 5579 + }, + { + "epoch": 0.9040829552819183, + "grad_norm": 0.9027949571609497, + "learning_rate": 4.095176765757537e-06, + "loss": 0.1505, + "step": 5580 + }, + { + "epoch": 0.9042449773169151, + "grad_norm": 0.9837702512741089, + "learning_rate": 4.094840026917217e-06, + "loss": 0.154, + "step": 5581 + }, + { + "epoch": 0.9044069993519118, + "grad_norm": 0.7658412456512451, + "learning_rate": 4.094503239278652e-06, + "loss": 0.1292, + "step": 5582 + }, + { + "epoch": 0.9045690213869086, + "grad_norm": 0.8777854442596436, + "learning_rate": 4.094166402852146e-06, + "loss": 0.1327, + "step": 5583 + }, + { + "epoch": 0.9047310434219054, + "grad_norm": 0.9733120799064636, + "learning_rate": 4.0938295176480055e-06, + "loss": 0.1326, + "step": 5584 + }, + { + "epoch": 0.9048930654569022, + "grad_norm": 0.9805978536605835, + "learning_rate": 4.09349258367654e-06, + "loss": 0.1428, + "step": 5585 + }, + { + "epoch": 0.9050550874918989, + "grad_norm": 0.8723875284194946, + "learning_rate": 4.093155600948057e-06, + "loss": 0.1496, + "step": 5586 + }, + { + "epoch": 0.9052171095268956, + "grad_norm": 0.8509448766708374, + "learning_rate": 4.092818569472869e-06, + "loss": 0.1371, + "step": 5587 + }, + { + "epoch": 0.9053791315618924, + "grad_norm": 1.0776581764221191, + "learning_rate": 4.092481489261285e-06, + "loss": 0.1783, + "step": 5588 + }, + { + "epoch": 0.9055411535968891, + "grad_norm": 0.8364329934120178, + "learning_rate": 4.0921443603236235e-06, + "loss": 0.1295, + "step": 5589 + }, + { + "epoch": 0.905703175631886, + "grad_norm": 0.8056396245956421, + "learning_rate": 4.0918071826701966e-06, + "loss": 0.1196, + "step": 5590 + }, + { + "epoch": 0.9058651976668827, + "grad_norm": 0.8377984166145325, + "learning_rate": 4.0914699563113214e-06, + "loss": 0.1338, + "step": 5591 + }, + { + "epoch": 0.9060272197018795, + "grad_norm": 0.9143077731132507, + "learning_rate": 4.091132681257317e-06, + "loss": 0.1505, + "step": 5592 + }, + { + "epoch": 0.9061892417368762, + "grad_norm": 0.874538242816925, + "learning_rate": 4.0907953575185035e-06, + "loss": 0.1465, + "step": 5593 + }, + { + "epoch": 0.906351263771873, + "grad_norm": 0.9015555381774902, + "learning_rate": 4.090457985105202e-06, + "loss": 0.1463, + "step": 5594 + }, + { + "epoch": 0.9065132858068697, + "grad_norm": 0.8938778638839722, + "learning_rate": 4.090120564027734e-06, + "loss": 0.167, + "step": 5595 + }, + { + "epoch": 0.9066753078418665, + "grad_norm": 0.7953081130981445, + "learning_rate": 4.089783094296425e-06, + "loss": 0.1215, + "step": 5596 + }, + { + "epoch": 0.9068373298768633, + "grad_norm": 0.9266294836997986, + "learning_rate": 4.0894455759216015e-06, + "loss": 0.1584, + "step": 5597 + }, + { + "epoch": 0.90699935191186, + "grad_norm": 0.9055389165878296, + "learning_rate": 4.089108008913589e-06, + "loss": 0.1614, + "step": 5598 + }, + { + "epoch": 0.9071613739468568, + "grad_norm": 0.9142413139343262, + "learning_rate": 4.088770393282717e-06, + "loss": 0.1326, + "step": 5599 + }, + { + "epoch": 0.9073233959818535, + "grad_norm": 0.922514796257019, + "learning_rate": 4.088432729039316e-06, + "loss": 0.1363, + "step": 5600 + }, + { + "epoch": 0.9074854180168503, + "grad_norm": 0.8099052906036377, + "learning_rate": 4.088095016193717e-06, + "loss": 0.1321, + "step": 5601 + }, + { + "epoch": 0.907647440051847, + "grad_norm": 0.7604457139968872, + "learning_rate": 4.087757254756254e-06, + "loss": 0.1283, + "step": 5602 + }, + { + "epoch": 0.9078094620868438, + "grad_norm": 0.8297195434570312, + "learning_rate": 4.087419444737261e-06, + "loss": 0.1483, + "step": 5603 + }, + { + "epoch": 0.9079714841218406, + "grad_norm": 0.7264284491539001, + "learning_rate": 4.087081586147075e-06, + "loss": 0.1193, + "step": 5604 + }, + { + "epoch": 0.9081335061568373, + "grad_norm": 0.7358347177505493, + "learning_rate": 4.086743678996032e-06, + "loss": 0.1254, + "step": 5605 + }, + { + "epoch": 0.9082955281918341, + "grad_norm": 0.7477496862411499, + "learning_rate": 4.086405723294474e-06, + "loss": 0.1247, + "step": 5606 + }, + { + "epoch": 0.9084575502268308, + "grad_norm": 0.8757005929946899, + "learning_rate": 4.086067719052739e-06, + "loss": 0.1475, + "step": 5607 + }, + { + "epoch": 0.9086195722618277, + "grad_norm": 0.7548813223838806, + "learning_rate": 4.0857296662811696e-06, + "loss": 0.1146, + "step": 5608 + }, + { + "epoch": 0.9087815942968244, + "grad_norm": 0.9473977088928223, + "learning_rate": 4.08539156499011e-06, + "loss": 0.134, + "step": 5609 + }, + { + "epoch": 0.9089436163318211, + "grad_norm": 0.8978647589683533, + "learning_rate": 4.085053415189905e-06, + "loss": 0.151, + "step": 5610 + }, + { + "epoch": 0.9091056383668179, + "grad_norm": 0.8081299662590027, + "learning_rate": 4.084715216890902e-06, + "loss": 0.1178, + "step": 5611 + }, + { + "epoch": 0.9092676604018146, + "grad_norm": 0.8348207473754883, + "learning_rate": 4.084376970103448e-06, + "loss": 0.1308, + "step": 5612 + }, + { + "epoch": 0.9094296824368114, + "grad_norm": 1.0297025442123413, + "learning_rate": 4.0840386748378914e-06, + "loss": 0.1464, + "step": 5613 + }, + { + "epoch": 0.9095917044718081, + "grad_norm": 0.7381079196929932, + "learning_rate": 4.0837003311045865e-06, + "loss": 0.1112, + "step": 5614 + }, + { + "epoch": 0.909753726506805, + "grad_norm": 0.8947216868400574, + "learning_rate": 4.083361938913884e-06, + "loss": 0.1424, + "step": 5615 + }, + { + "epoch": 0.9099157485418017, + "grad_norm": 0.9418050050735474, + "learning_rate": 4.083023498276136e-06, + "loss": 0.1445, + "step": 5616 + }, + { + "epoch": 0.9100777705767984, + "grad_norm": 0.8582934141159058, + "learning_rate": 4.0826850092017015e-06, + "loss": 0.1446, + "step": 5617 + }, + { + "epoch": 0.9102397926117952, + "grad_norm": 0.8298516273498535, + "learning_rate": 4.082346471700935e-06, + "loss": 0.1382, + "step": 5618 + }, + { + "epoch": 0.9104018146467919, + "grad_norm": 0.8987380862236023, + "learning_rate": 4.082007885784196e-06, + "loss": 0.1405, + "step": 5619 + }, + { + "epoch": 0.9105638366817888, + "grad_norm": 0.7453068494796753, + "learning_rate": 4.081669251461844e-06, + "loss": 0.1175, + "step": 5620 + }, + { + "epoch": 0.9107258587167855, + "grad_norm": 0.9048896431922913, + "learning_rate": 4.08133056874424e-06, + "loss": 0.1491, + "step": 5621 + }, + { + "epoch": 0.9108878807517823, + "grad_norm": 0.9545086026191711, + "learning_rate": 4.080991837641748e-06, + "loss": 0.1519, + "step": 5622 + }, + { + "epoch": 0.911049902786779, + "grad_norm": 0.8169971704483032, + "learning_rate": 4.0806530581647305e-06, + "loss": 0.1244, + "step": 5623 + }, + { + "epoch": 0.9112119248217757, + "grad_norm": 0.7835274338722229, + "learning_rate": 4.080314230323556e-06, + "loss": 0.1404, + "step": 5624 + }, + { + "epoch": 0.9113739468567725, + "grad_norm": 0.9390804171562195, + "learning_rate": 4.07997535412859e-06, + "loss": 0.1438, + "step": 5625 + }, + { + "epoch": 0.9115359688917692, + "grad_norm": 0.7569699883460999, + "learning_rate": 4.079636429590201e-06, + "loss": 0.1186, + "step": 5626 + }, + { + "epoch": 0.9116979909267661, + "grad_norm": 0.9515724778175354, + "learning_rate": 4.07929745671876e-06, + "loss": 0.1582, + "step": 5627 + }, + { + "epoch": 0.9118600129617628, + "grad_norm": 0.892797589302063, + "learning_rate": 4.07895843552464e-06, + "loss": 0.1337, + "step": 5628 + }, + { + "epoch": 0.9120220349967596, + "grad_norm": 0.9236721396446228, + "learning_rate": 4.078619366018212e-06, + "loss": 0.1684, + "step": 5629 + }, + { + "epoch": 0.9121840570317563, + "grad_norm": 0.8024287223815918, + "learning_rate": 4.078280248209851e-06, + "loss": 0.1236, + "step": 5630 + }, + { + "epoch": 0.912346079066753, + "grad_norm": 0.8392508029937744, + "learning_rate": 4.077941082109934e-06, + "loss": 0.1306, + "step": 5631 + }, + { + "epoch": 0.9125081011017498, + "grad_norm": 0.9787663221359253, + "learning_rate": 4.077601867728839e-06, + "loss": 0.1641, + "step": 5632 + }, + { + "epoch": 0.9126701231367466, + "grad_norm": 0.7922909259796143, + "learning_rate": 4.077262605076943e-06, + "loss": 0.1286, + "step": 5633 + }, + { + "epoch": 0.9128321451717434, + "grad_norm": 0.7346786260604858, + "learning_rate": 4.07692329416463e-06, + "loss": 0.1231, + "step": 5634 + }, + { + "epoch": 0.9129941672067401, + "grad_norm": 0.8033357262611389, + "learning_rate": 4.07658393500228e-06, + "loss": 0.1274, + "step": 5635 + }, + { + "epoch": 0.9131561892417369, + "grad_norm": 0.8313119411468506, + "learning_rate": 4.0762445276002765e-06, + "loss": 0.1404, + "step": 5636 + }, + { + "epoch": 0.9133182112767336, + "grad_norm": 0.7644381523132324, + "learning_rate": 4.075905071969005e-06, + "loss": 0.1241, + "step": 5637 + }, + { + "epoch": 0.9134802333117304, + "grad_norm": 0.8910484313964844, + "learning_rate": 4.075565568118852e-06, + "loss": 0.1439, + "step": 5638 + }, + { + "epoch": 0.9136422553467272, + "grad_norm": 0.8554601669311523, + "learning_rate": 4.075226016060205e-06, + "loss": 0.1388, + "step": 5639 + }, + { + "epoch": 0.9138042773817239, + "grad_norm": 0.8526726365089417, + "learning_rate": 4.074886415803454e-06, + "loss": 0.1508, + "step": 5640 + }, + { + "epoch": 0.9139662994167207, + "grad_norm": 0.8573095798492432, + "learning_rate": 4.07454676735899e-06, + "loss": 0.1487, + "step": 5641 + }, + { + "epoch": 0.9141283214517174, + "grad_norm": 0.7731853723526001, + "learning_rate": 4.074207070737205e-06, + "loss": 0.1233, + "step": 5642 + }, + { + "epoch": 0.9142903434867142, + "grad_norm": 0.9754795432090759, + "learning_rate": 4.073867325948494e-06, + "loss": 0.1559, + "step": 5643 + }, + { + "epoch": 0.9144523655217109, + "grad_norm": 0.8023756146430969, + "learning_rate": 4.07352753300325e-06, + "loss": 0.1269, + "step": 5644 + }, + { + "epoch": 0.9146143875567078, + "grad_norm": 0.9316949248313904, + "learning_rate": 4.073187691911873e-06, + "loss": 0.146, + "step": 5645 + }, + { + "epoch": 0.9147764095917045, + "grad_norm": 0.8113452196121216, + "learning_rate": 4.072847802684758e-06, + "loss": 0.1353, + "step": 5646 + }, + { + "epoch": 0.9149384316267012, + "grad_norm": 0.8404650092124939, + "learning_rate": 4.072507865332308e-06, + "loss": 0.1431, + "step": 5647 + }, + { + "epoch": 0.915100453661698, + "grad_norm": 0.9183782935142517, + "learning_rate": 4.072167879864922e-06, + "loss": 0.1453, + "step": 5648 + }, + { + "epoch": 0.9152624756966947, + "grad_norm": 0.8436623811721802, + "learning_rate": 4.071827846293004e-06, + "loss": 0.1267, + "step": 5649 + }, + { + "epoch": 0.9154244977316915, + "grad_norm": 0.9088074564933777, + "learning_rate": 4.071487764626957e-06, + "loss": 0.1451, + "step": 5650 + }, + { + "epoch": 0.9155865197666883, + "grad_norm": 0.8326597213745117, + "learning_rate": 4.071147634877187e-06, + "loss": 0.1361, + "step": 5651 + }, + { + "epoch": 0.9157485418016851, + "grad_norm": 0.850050687789917, + "learning_rate": 4.070807457054102e-06, + "loss": 0.1234, + "step": 5652 + }, + { + "epoch": 0.9159105638366818, + "grad_norm": 0.8121395707130432, + "learning_rate": 4.07046723116811e-06, + "loss": 0.1254, + "step": 5653 + }, + { + "epoch": 0.9160725858716785, + "grad_norm": 0.8910734057426453, + "learning_rate": 4.070126957229622e-06, + "loss": 0.1436, + "step": 5654 + }, + { + "epoch": 0.9162346079066753, + "grad_norm": 0.8616095781326294, + "learning_rate": 4.0697866352490475e-06, + "loss": 0.1444, + "step": 5655 + }, + { + "epoch": 0.916396629941672, + "grad_norm": 0.9634042978286743, + "learning_rate": 4.069446265236801e-06, + "loss": 0.1737, + "step": 5656 + }, + { + "epoch": 0.9165586519766689, + "grad_norm": 0.8356743454933167, + "learning_rate": 4.0691058472032975e-06, + "loss": 0.1345, + "step": 5657 + }, + { + "epoch": 0.9167206740116656, + "grad_norm": 0.8691932559013367, + "learning_rate": 4.068765381158951e-06, + "loss": 0.1398, + "step": 5658 + }, + { + "epoch": 0.9168826960466624, + "grad_norm": 0.9177868366241455, + "learning_rate": 4.068424867114181e-06, + "loss": 0.1573, + "step": 5659 + }, + { + "epoch": 0.9170447180816591, + "grad_norm": 0.802828311920166, + "learning_rate": 4.068084305079406e-06, + "loss": 0.1188, + "step": 5660 + }, + { + "epoch": 0.9172067401166558, + "grad_norm": 0.920111358165741, + "learning_rate": 4.067743695065045e-06, + "loss": 0.1502, + "step": 5661 + }, + { + "epoch": 0.9173687621516526, + "grad_norm": 0.872612714767456, + "learning_rate": 4.067403037081522e-06, + "loss": 0.1551, + "step": 5662 + }, + { + "epoch": 0.9175307841866494, + "grad_norm": 0.7936202883720398, + "learning_rate": 4.0670623311392575e-06, + "loss": 0.1376, + "step": 5663 + }, + { + "epoch": 0.9176928062216462, + "grad_norm": 0.8577109575271606, + "learning_rate": 4.06672157724868e-06, + "loss": 0.1404, + "step": 5664 + }, + { + "epoch": 0.9178548282566429, + "grad_norm": 0.7876363396644592, + "learning_rate": 4.066380775420211e-06, + "loss": 0.1236, + "step": 5665 + }, + { + "epoch": 0.9180168502916397, + "grad_norm": 0.9405772089958191, + "learning_rate": 4.066039925664283e-06, + "loss": 0.1358, + "step": 5666 + }, + { + "epoch": 0.9181788723266364, + "grad_norm": 0.8037310242652893, + "learning_rate": 4.065699027991322e-06, + "loss": 0.1344, + "step": 5667 + }, + { + "epoch": 0.9183408943616331, + "grad_norm": 0.9089890718460083, + "learning_rate": 4.06535808241176e-06, + "loss": 0.1502, + "step": 5668 + }, + { + "epoch": 0.91850291639663, + "grad_norm": 0.7645153999328613, + "learning_rate": 4.065017088936028e-06, + "loss": 0.1121, + "step": 5669 + }, + { + "epoch": 0.9186649384316267, + "grad_norm": 0.8740200400352478, + "learning_rate": 4.064676047574561e-06, + "loss": 0.1576, + "step": 5670 + }, + { + "epoch": 0.9188269604666235, + "grad_norm": 0.8704571723937988, + "learning_rate": 4.064334958337794e-06, + "loss": 0.1325, + "step": 5671 + }, + { + "epoch": 0.9189889825016202, + "grad_norm": 0.9229740500450134, + "learning_rate": 4.063993821236162e-06, + "loss": 0.1428, + "step": 5672 + }, + { + "epoch": 0.919151004536617, + "grad_norm": 1.0061372518539429, + "learning_rate": 4.063652636280105e-06, + "loss": 0.1629, + "step": 5673 + }, + { + "epoch": 0.9193130265716137, + "grad_norm": 0.8159903883934021, + "learning_rate": 4.063311403480061e-06, + "loss": 0.1316, + "step": 5674 + }, + { + "epoch": 0.9194750486066104, + "grad_norm": 0.91038578748703, + "learning_rate": 4.06297012284647e-06, + "loss": 0.1555, + "step": 5675 + }, + { + "epoch": 0.9196370706416073, + "grad_norm": 0.9407716989517212, + "learning_rate": 4.0626287943897765e-06, + "loss": 0.1476, + "step": 5676 + }, + { + "epoch": 0.919799092676604, + "grad_norm": 0.8589774370193481, + "learning_rate": 4.062287418120423e-06, + "loss": 0.1333, + "step": 5677 + }, + { + "epoch": 0.9199611147116008, + "grad_norm": 0.8598899841308594, + "learning_rate": 4.061945994048855e-06, + "loss": 0.1264, + "step": 5678 + }, + { + "epoch": 0.9201231367465975, + "grad_norm": 0.8190400004386902, + "learning_rate": 4.06160452218552e-06, + "loss": 0.1291, + "step": 5679 + }, + { + "epoch": 0.9202851587815943, + "grad_norm": 0.9325578808784485, + "learning_rate": 4.061263002540865e-06, + "loss": 0.15, + "step": 5680 + }, + { + "epoch": 0.920447180816591, + "grad_norm": 0.8312631845474243, + "learning_rate": 4.060921435125341e-06, + "loss": 0.1357, + "step": 5681 + }, + { + "epoch": 0.9206092028515879, + "grad_norm": 0.8420397043228149, + "learning_rate": 4.060579819949398e-06, + "loss": 0.1447, + "step": 5682 + }, + { + "epoch": 0.9207712248865846, + "grad_norm": 0.7866024971008301, + "learning_rate": 4.06023815702349e-06, + "loss": 0.1397, + "step": 5683 + }, + { + "epoch": 0.9209332469215813, + "grad_norm": 0.8895429372787476, + "learning_rate": 4.059896446358068e-06, + "loss": 0.1463, + "step": 5684 + }, + { + "epoch": 0.9210952689565781, + "grad_norm": 0.7228549122810364, + "learning_rate": 4.059554687963591e-06, + "loss": 0.1146, + "step": 5685 + }, + { + "epoch": 0.9212572909915748, + "grad_norm": 0.7941901087760925, + "learning_rate": 4.059212881850515e-06, + "loss": 0.1317, + "step": 5686 + }, + { + "epoch": 0.9214193130265717, + "grad_norm": 0.7205981016159058, + "learning_rate": 4.058871028029296e-06, + "loss": 0.1143, + "step": 5687 + }, + { + "epoch": 0.9215813350615684, + "grad_norm": 0.934319257736206, + "learning_rate": 4.0585291265103985e-06, + "loss": 0.1565, + "step": 5688 + }, + { + "epoch": 0.9217433570965652, + "grad_norm": 0.8551682233810425, + "learning_rate": 4.05818717730428e-06, + "loss": 0.141, + "step": 5689 + }, + { + "epoch": 0.9219053791315619, + "grad_norm": 0.9247624278068542, + "learning_rate": 4.057845180421405e-06, + "loss": 0.1454, + "step": 5690 + }, + { + "epoch": 0.9220674011665586, + "grad_norm": 0.8106127381324768, + "learning_rate": 4.057503135872237e-06, + "loss": 0.1259, + "step": 5691 + }, + { + "epoch": 0.9222294232015554, + "grad_norm": 0.8142223954200745, + "learning_rate": 4.057161043667243e-06, + "loss": 0.1383, + "step": 5692 + }, + { + "epoch": 0.9223914452365521, + "grad_norm": 0.7778276801109314, + "learning_rate": 4.056818903816888e-06, + "loss": 0.1258, + "step": 5693 + }, + { + "epoch": 0.922553467271549, + "grad_norm": 0.9407106041908264, + "learning_rate": 4.056476716331643e-06, + "loss": 0.1488, + "step": 5694 + }, + { + "epoch": 0.9227154893065457, + "grad_norm": 0.8241350054740906, + "learning_rate": 4.056134481221977e-06, + "loss": 0.1358, + "step": 5695 + }, + { + "epoch": 0.9228775113415425, + "grad_norm": 0.9014641046524048, + "learning_rate": 4.05579219849836e-06, + "loss": 0.1459, + "step": 5696 + }, + { + "epoch": 0.9230395333765392, + "grad_norm": 0.7756420969963074, + "learning_rate": 4.055449868171269e-06, + "loss": 0.1265, + "step": 5697 + }, + { + "epoch": 0.9232015554115359, + "grad_norm": 0.9437333345413208, + "learning_rate": 4.055107490251175e-06, + "loss": 0.1625, + "step": 5698 + }, + { + "epoch": 0.9233635774465327, + "grad_norm": 0.9008864164352417, + "learning_rate": 4.054765064748554e-06, + "loss": 0.1386, + "step": 5699 + }, + { + "epoch": 0.9235255994815295, + "grad_norm": 0.8397160768508911, + "learning_rate": 4.054422591673887e-06, + "loss": 0.1367, + "step": 5700 + }, + { + "epoch": 0.9236876215165263, + "grad_norm": 0.8157175183296204, + "learning_rate": 4.054080071037649e-06, + "loss": 0.1287, + "step": 5701 + }, + { + "epoch": 0.923849643551523, + "grad_norm": 0.8612614870071411, + "learning_rate": 4.0537375028503225e-06, + "loss": 0.1411, + "step": 5702 + }, + { + "epoch": 0.9240116655865198, + "grad_norm": 0.933879554271698, + "learning_rate": 4.053394887122387e-06, + "loss": 0.1231, + "step": 5703 + }, + { + "epoch": 0.9241736876215165, + "grad_norm": 0.8254159688949585, + "learning_rate": 4.053052223864328e-06, + "loss": 0.1382, + "step": 5704 + }, + { + "epoch": 0.9243357096565132, + "grad_norm": 0.9842225313186646, + "learning_rate": 4.052709513086629e-06, + "loss": 0.161, + "step": 5705 + }, + { + "epoch": 0.9244977316915101, + "grad_norm": 0.8593588471412659, + "learning_rate": 4.052366754799776e-06, + "loss": 0.1222, + "step": 5706 + }, + { + "epoch": 0.9246597537265068, + "grad_norm": 0.7498965859413147, + "learning_rate": 4.052023949014257e-06, + "loss": 0.1276, + "step": 5707 + }, + { + "epoch": 0.9248217757615036, + "grad_norm": 0.9924911856651306, + "learning_rate": 4.051681095740561e-06, + "loss": 0.1548, + "step": 5708 + }, + { + "epoch": 0.9249837977965003, + "grad_norm": 0.8889843821525574, + "learning_rate": 4.051338194989179e-06, + "loss": 0.1553, + "step": 5709 + }, + { + "epoch": 0.9251458198314971, + "grad_norm": 0.8717356324195862, + "learning_rate": 4.050995246770602e-06, + "loss": 0.146, + "step": 5710 + }, + { + "epoch": 0.9253078418664938, + "grad_norm": 0.8971138596534729, + "learning_rate": 4.050652251095324e-06, + "loss": 0.1565, + "step": 5711 + }, + { + "epoch": 0.9254698639014906, + "grad_norm": 0.8394131660461426, + "learning_rate": 4.05030920797384e-06, + "loss": 0.1485, + "step": 5712 + }, + { + "epoch": 0.9256318859364874, + "grad_norm": 0.8194043636322021, + "learning_rate": 4.049966117416645e-06, + "loss": 0.1376, + "step": 5713 + }, + { + "epoch": 0.9257939079714841, + "grad_norm": 0.9140671491622925, + "learning_rate": 4.049622979434239e-06, + "loss": 0.1417, + "step": 5714 + }, + { + "epoch": 0.9259559300064809, + "grad_norm": 0.7816025018692017, + "learning_rate": 4.049279794037118e-06, + "loss": 0.1382, + "step": 5715 + }, + { + "epoch": 0.9261179520414776, + "grad_norm": 0.9249107241630554, + "learning_rate": 4.0489365612357854e-06, + "loss": 0.1547, + "step": 5716 + }, + { + "epoch": 0.9262799740764744, + "grad_norm": 0.8584994673728943, + "learning_rate": 4.048593281040743e-06, + "loss": 0.129, + "step": 5717 + }, + { + "epoch": 0.9264419961114712, + "grad_norm": 0.8270529508590698, + "learning_rate": 4.0482499534624934e-06, + "loss": 0.1382, + "step": 5718 + }, + { + "epoch": 0.9266040181464679, + "grad_norm": 0.8987339735031128, + "learning_rate": 4.047906578511542e-06, + "loss": 0.1495, + "step": 5719 + }, + { + "epoch": 0.9267660401814647, + "grad_norm": 0.7987256050109863, + "learning_rate": 4.047563156198394e-06, + "loss": 0.1449, + "step": 5720 + }, + { + "epoch": 0.9269280622164614, + "grad_norm": 0.746697723865509, + "learning_rate": 4.047219686533559e-06, + "loss": 0.1149, + "step": 5721 + }, + { + "epoch": 0.9270900842514582, + "grad_norm": 0.860031008720398, + "learning_rate": 4.046876169527547e-06, + "loss": 0.1473, + "step": 5722 + }, + { + "epoch": 0.9272521062864549, + "grad_norm": 0.837323784828186, + "learning_rate": 4.046532605190866e-06, + "loss": 0.1438, + "step": 5723 + }, + { + "epoch": 0.9274141283214518, + "grad_norm": 0.8923266530036926, + "learning_rate": 4.04618899353403e-06, + "loss": 0.1301, + "step": 5724 + }, + { + "epoch": 0.9275761503564485, + "grad_norm": 0.9036858677864075, + "learning_rate": 4.045845334567553e-06, + "loss": 0.14, + "step": 5725 + }, + { + "epoch": 0.9277381723914452, + "grad_norm": 0.8130469918251038, + "learning_rate": 4.04550162830195e-06, + "loss": 0.1394, + "step": 5726 + }, + { + "epoch": 0.927900194426442, + "grad_norm": 0.9334059953689575, + "learning_rate": 4.045157874747737e-06, + "loss": 0.1509, + "step": 5727 + }, + { + "epoch": 0.9280622164614387, + "grad_norm": 0.8468704223632812, + "learning_rate": 4.044814073915432e-06, + "loss": 0.1319, + "step": 5728 + }, + { + "epoch": 0.9282242384964355, + "grad_norm": 0.875339150428772, + "learning_rate": 4.0444702258155545e-06, + "loss": 0.1525, + "step": 5729 + }, + { + "epoch": 0.9283862605314323, + "grad_norm": 1.0649487972259521, + "learning_rate": 4.044126330458626e-06, + "loss": 0.1664, + "step": 5730 + }, + { + "epoch": 0.9285482825664291, + "grad_norm": 0.8550140261650085, + "learning_rate": 4.043782387855169e-06, + "loss": 0.1398, + "step": 5731 + }, + { + "epoch": 0.9287103046014258, + "grad_norm": 0.7701947689056396, + "learning_rate": 4.0434383980157055e-06, + "loss": 0.1186, + "step": 5732 + }, + { + "epoch": 0.9288723266364226, + "grad_norm": 0.7681037783622742, + "learning_rate": 4.043094360950763e-06, + "loss": 0.1364, + "step": 5733 + }, + { + "epoch": 0.9290343486714193, + "grad_norm": 0.8084831833839417, + "learning_rate": 4.042750276670867e-06, + "loss": 0.1258, + "step": 5734 + }, + { + "epoch": 0.929196370706416, + "grad_norm": 0.824355959892273, + "learning_rate": 4.042406145186546e-06, + "loss": 0.1295, + "step": 5735 + }, + { + "epoch": 0.9293583927414129, + "grad_norm": 0.9090822339057922, + "learning_rate": 4.04206196650833e-06, + "loss": 0.1374, + "step": 5736 + }, + { + "epoch": 0.9295204147764096, + "grad_norm": 0.7409492135047913, + "learning_rate": 4.041717740646749e-06, + "loss": 0.1126, + "step": 5737 + }, + { + "epoch": 0.9296824368114064, + "grad_norm": 0.8566955327987671, + "learning_rate": 4.041373467612337e-06, + "loss": 0.1297, + "step": 5738 + }, + { + "epoch": 0.9298444588464031, + "grad_norm": 0.8296953439712524, + "learning_rate": 4.0410291474156246e-06, + "loss": 0.1296, + "step": 5739 + }, + { + "epoch": 0.9300064808813999, + "grad_norm": 0.8668819665908813, + "learning_rate": 4.0406847800671515e-06, + "loss": 0.1438, + "step": 5740 + }, + { + "epoch": 0.9301685029163966, + "grad_norm": 0.8064781427383423, + "learning_rate": 4.040340365577452e-06, + "loss": 0.1289, + "step": 5741 + }, + { + "epoch": 0.9303305249513933, + "grad_norm": 0.9188829064369202, + "learning_rate": 4.0399959039570646e-06, + "loss": 0.1417, + "step": 5742 + }, + { + "epoch": 0.9304925469863902, + "grad_norm": 0.9218767881393433, + "learning_rate": 4.039651395216529e-06, + "loss": 0.1355, + "step": 5743 + }, + { + "epoch": 0.9306545690213869, + "grad_norm": 1.0256109237670898, + "learning_rate": 4.039306839366387e-06, + "loss": 0.1755, + "step": 5744 + }, + { + "epoch": 0.9308165910563837, + "grad_norm": 0.7666441202163696, + "learning_rate": 4.038962236417181e-06, + "loss": 0.1259, + "step": 5745 + }, + { + "epoch": 0.9309786130913804, + "grad_norm": 0.7907612323760986, + "learning_rate": 4.038617586379455e-06, + "loss": 0.1331, + "step": 5746 + }, + { + "epoch": 0.9311406351263772, + "grad_norm": 0.8179818391799927, + "learning_rate": 4.0382728892637535e-06, + "loss": 0.1369, + "step": 5747 + }, + { + "epoch": 0.931302657161374, + "grad_norm": 0.8541617393493652, + "learning_rate": 4.0379281450806255e-06, + "loss": 0.1495, + "step": 5748 + }, + { + "epoch": 0.9314646791963707, + "grad_norm": 0.7930795550346375, + "learning_rate": 4.037583353840616e-06, + "loss": 0.1259, + "step": 5749 + }, + { + "epoch": 0.9316267012313675, + "grad_norm": 0.8774799108505249, + "learning_rate": 4.037238515554278e-06, + "loss": 0.1422, + "step": 5750 + }, + { + "epoch": 0.9317887232663642, + "grad_norm": 0.7685874700546265, + "learning_rate": 4.03689363023216e-06, + "loss": 0.137, + "step": 5751 + }, + { + "epoch": 0.931950745301361, + "grad_norm": 0.8230496644973755, + "learning_rate": 4.0365486978848176e-06, + "loss": 0.1345, + "step": 5752 + }, + { + "epoch": 0.9321127673363577, + "grad_norm": 0.7255528569221497, + "learning_rate": 4.0362037185228045e-06, + "loss": 0.1223, + "step": 5753 + }, + { + "epoch": 0.9322747893713546, + "grad_norm": 0.8288103938102722, + "learning_rate": 4.035858692156673e-06, + "loss": 0.1412, + "step": 5754 + }, + { + "epoch": 0.9324368114063513, + "grad_norm": 0.7872039675712585, + "learning_rate": 4.035513618796983e-06, + "loss": 0.1254, + "step": 5755 + }, + { + "epoch": 0.932598833441348, + "grad_norm": 0.7703442573547363, + "learning_rate": 4.035168498454292e-06, + "loss": 0.1177, + "step": 5756 + }, + { + "epoch": 0.9327608554763448, + "grad_norm": 0.7858262658119202, + "learning_rate": 4.034823331139161e-06, + "loss": 0.123, + "step": 5757 + }, + { + "epoch": 0.9329228775113415, + "grad_norm": 0.9467945694923401, + "learning_rate": 4.034478116862149e-06, + "loss": 0.1372, + "step": 5758 + }, + { + "epoch": 0.9330848995463383, + "grad_norm": 0.7315438985824585, + "learning_rate": 4.03413285563382e-06, + "loss": 0.1219, + "step": 5759 + }, + { + "epoch": 0.933246921581335, + "grad_norm": 0.8702850937843323, + "learning_rate": 4.033787547464738e-06, + "loss": 0.1397, + "step": 5760 + }, + { + "epoch": 0.9334089436163319, + "grad_norm": 0.9242250919342041, + "learning_rate": 4.03344219236547e-06, + "loss": 0.1486, + "step": 5761 + }, + { + "epoch": 0.9335709656513286, + "grad_norm": 0.8591980934143066, + "learning_rate": 4.033096790346581e-06, + "loss": 0.1383, + "step": 5762 + }, + { + "epoch": 0.9337329876863253, + "grad_norm": 0.8047747611999512, + "learning_rate": 4.03275134141864e-06, + "loss": 0.127, + "step": 5763 + }, + { + "epoch": 0.9338950097213221, + "grad_norm": 0.8145176768302917, + "learning_rate": 4.032405845592218e-06, + "loss": 0.1187, + "step": 5764 + }, + { + "epoch": 0.9340570317563188, + "grad_norm": 0.8953231573104858, + "learning_rate": 4.0320603028778845e-06, + "loss": 0.1398, + "step": 5765 + }, + { + "epoch": 0.9342190537913156, + "grad_norm": 0.8610212206840515, + "learning_rate": 4.0317147132862135e-06, + "loss": 0.1355, + "step": 5766 + }, + { + "epoch": 0.9343810758263124, + "grad_norm": 0.7924541234970093, + "learning_rate": 4.031369076827779e-06, + "loss": 0.1243, + "step": 5767 + }, + { + "epoch": 0.9345430978613092, + "grad_norm": 0.8468953371047974, + "learning_rate": 4.031023393513157e-06, + "loss": 0.1312, + "step": 5768 + }, + { + "epoch": 0.9347051198963059, + "grad_norm": 0.8163899779319763, + "learning_rate": 4.030677663352924e-06, + "loss": 0.1254, + "step": 5769 + }, + { + "epoch": 0.9348671419313026, + "grad_norm": 0.8453715443611145, + "learning_rate": 4.030331886357659e-06, + "loss": 0.1306, + "step": 5770 + }, + { + "epoch": 0.9350291639662994, + "grad_norm": 0.8978089094161987, + "learning_rate": 4.0299860625379405e-06, + "loss": 0.1512, + "step": 5771 + }, + { + "epoch": 0.9351911860012961, + "grad_norm": 0.8564348816871643, + "learning_rate": 4.029640191904352e-06, + "loss": 0.147, + "step": 5772 + }, + { + "epoch": 0.935353208036293, + "grad_norm": 0.771431028842926, + "learning_rate": 4.029294274467475e-06, + "loss": 0.1263, + "step": 5773 + }, + { + "epoch": 0.9355152300712897, + "grad_norm": 0.7943999767303467, + "learning_rate": 4.028948310237893e-06, + "loss": 0.1132, + "step": 5774 + }, + { + "epoch": 0.9356772521062865, + "grad_norm": 0.8058924078941345, + "learning_rate": 4.028602299226194e-06, + "loss": 0.1315, + "step": 5775 + }, + { + "epoch": 0.9358392741412832, + "grad_norm": 0.9111430048942566, + "learning_rate": 4.0282562414429635e-06, + "loss": 0.1582, + "step": 5776 + }, + { + "epoch": 0.93600129617628, + "grad_norm": 0.8768683075904846, + "learning_rate": 4.02791013689879e-06, + "loss": 0.147, + "step": 5777 + }, + { + "epoch": 0.9361633182112767, + "grad_norm": 0.7665877342224121, + "learning_rate": 4.027563985604264e-06, + "loss": 0.1359, + "step": 5778 + }, + { + "epoch": 0.9363253402462735, + "grad_norm": 0.9200119972229004, + "learning_rate": 4.027217787569977e-06, + "loss": 0.1459, + "step": 5779 + }, + { + "epoch": 0.9364873622812703, + "grad_norm": 0.9305282831192017, + "learning_rate": 4.026871542806521e-06, + "loss": 0.1408, + "step": 5780 + }, + { + "epoch": 0.936649384316267, + "grad_norm": 0.7743881344795227, + "learning_rate": 4.026525251324491e-06, + "loss": 0.1419, + "step": 5781 + }, + { + "epoch": 0.9368114063512638, + "grad_norm": 0.8831008672714233, + "learning_rate": 4.026178913134482e-06, + "loss": 0.1248, + "step": 5782 + }, + { + "epoch": 0.9369734283862605, + "grad_norm": 0.9593657851219177, + "learning_rate": 4.025832528247092e-06, + "loss": 0.1668, + "step": 5783 + }, + { + "epoch": 0.9371354504212573, + "grad_norm": 0.8962870836257935, + "learning_rate": 4.02548609667292e-06, + "loss": 0.1516, + "step": 5784 + }, + { + "epoch": 0.937297472456254, + "grad_norm": 0.8342962265014648, + "learning_rate": 4.025139618422563e-06, + "loss": 0.1261, + "step": 5785 + }, + { + "epoch": 0.9374594944912508, + "grad_norm": 0.8116139769554138, + "learning_rate": 4.024793093506626e-06, + "loss": 0.1366, + "step": 5786 + }, + { + "epoch": 0.9376215165262476, + "grad_norm": 0.8429117798805237, + "learning_rate": 4.024446521935709e-06, + "loss": 0.1453, + "step": 5787 + }, + { + "epoch": 0.9377835385612443, + "grad_norm": 0.8313448429107666, + "learning_rate": 4.024099903720419e-06, + "loss": 0.1356, + "step": 5788 + }, + { + "epoch": 0.9379455605962411, + "grad_norm": 0.8871910572052002, + "learning_rate": 4.023753238871359e-06, + "loss": 0.1517, + "step": 5789 + }, + { + "epoch": 0.9381075826312378, + "grad_norm": 0.7346284985542297, + "learning_rate": 4.023406527399137e-06, + "loss": 0.1137, + "step": 5790 + }, + { + "epoch": 0.9382696046662347, + "grad_norm": 0.7601230144500732, + "learning_rate": 4.023059769314363e-06, + "loss": 0.127, + "step": 5791 + }, + { + "epoch": 0.9384316267012314, + "grad_norm": 0.7658340930938721, + "learning_rate": 4.022712964627645e-06, + "loss": 0.1183, + "step": 5792 + }, + { + "epoch": 0.9385936487362281, + "grad_norm": 0.7574445605278015, + "learning_rate": 4.022366113349596e-06, + "loss": 0.1352, + "step": 5793 + }, + { + "epoch": 0.9387556707712249, + "grad_norm": 0.9390842914581299, + "learning_rate": 4.022019215490827e-06, + "loss": 0.1511, + "step": 5794 + }, + { + "epoch": 0.9389176928062216, + "grad_norm": 0.8839474320411682, + "learning_rate": 4.021672271061955e-06, + "loss": 0.1477, + "step": 5795 + }, + { + "epoch": 0.9390797148412184, + "grad_norm": 0.7684565186500549, + "learning_rate": 4.021325280073592e-06, + "loss": 0.1171, + "step": 5796 + }, + { + "epoch": 0.9392417368762151, + "grad_norm": 0.9148947596549988, + "learning_rate": 4.020978242536357e-06, + "loss": 0.1434, + "step": 5797 + }, + { + "epoch": 0.939403758911212, + "grad_norm": 0.8702596426010132, + "learning_rate": 4.0206311584608705e-06, + "loss": 0.1391, + "step": 5798 + }, + { + "epoch": 0.9395657809462087, + "grad_norm": 0.8113718628883362, + "learning_rate": 4.020284027857748e-06, + "loss": 0.1407, + "step": 5799 + }, + { + "epoch": 0.9397278029812054, + "grad_norm": 1.005882978439331, + "learning_rate": 4.019936850737615e-06, + "loss": 0.1302, + "step": 5800 + }, + { + "epoch": 0.9398898250162022, + "grad_norm": 0.8756170272827148, + "learning_rate": 4.019589627111092e-06, + "loss": 0.1336, + "step": 5801 + }, + { + "epoch": 0.9400518470511989, + "grad_norm": 0.8532119393348694, + "learning_rate": 4.019242356988803e-06, + "loss": 0.1408, + "step": 5802 + }, + { + "epoch": 0.9402138690861958, + "grad_norm": 1.0048043727874756, + "learning_rate": 4.018895040381375e-06, + "loss": 0.1587, + "step": 5803 + }, + { + "epoch": 0.9403758911211925, + "grad_norm": 0.9122523069381714, + "learning_rate": 4.018547677299434e-06, + "loss": 0.1292, + "step": 5804 + }, + { + "epoch": 0.9405379131561893, + "grad_norm": 0.8948111534118652, + "learning_rate": 4.018200267753609e-06, + "loss": 0.1612, + "step": 5805 + }, + { + "epoch": 0.940699935191186, + "grad_norm": 0.7902446985244751, + "learning_rate": 4.01785281175453e-06, + "loss": 0.1225, + "step": 5806 + }, + { + "epoch": 0.9408619572261827, + "grad_norm": 0.9093374013900757, + "learning_rate": 4.017505309312829e-06, + "loss": 0.1477, + "step": 5807 + }, + { + "epoch": 0.9410239792611795, + "grad_norm": 0.9980859756469727, + "learning_rate": 4.017157760439136e-06, + "loss": 0.1574, + "step": 5808 + }, + { + "epoch": 0.9411860012961762, + "grad_norm": 0.8467405438423157, + "learning_rate": 4.0168101651440885e-06, + "loss": 0.139, + "step": 5809 + }, + { + "epoch": 0.9413480233311731, + "grad_norm": 0.7334152460098267, + "learning_rate": 4.01646252343832e-06, + "loss": 0.1238, + "step": 5810 + }, + { + "epoch": 0.9415100453661698, + "grad_norm": 0.8250093460083008, + "learning_rate": 4.016114835332467e-06, + "loss": 0.1381, + "step": 5811 + }, + { + "epoch": 0.9416720674011666, + "grad_norm": 0.8436107039451599, + "learning_rate": 4.015767100837171e-06, + "loss": 0.1459, + "step": 5812 + }, + { + "epoch": 0.9418340894361633, + "grad_norm": 0.8007429242134094, + "learning_rate": 4.015419319963069e-06, + "loss": 0.1275, + "step": 5813 + }, + { + "epoch": 0.94199611147116, + "grad_norm": 0.7847580909729004, + "learning_rate": 4.015071492720802e-06, + "loss": 0.1295, + "step": 5814 + }, + { + "epoch": 0.9421581335061568, + "grad_norm": 0.8911027312278748, + "learning_rate": 4.014723619121015e-06, + "loss": 0.1504, + "step": 5815 + }, + { + "epoch": 0.9423201555411536, + "grad_norm": 0.811799168586731, + "learning_rate": 4.014375699174351e-06, + "loss": 0.1243, + "step": 5816 + }, + { + "epoch": 0.9424821775761504, + "grad_norm": 0.8510453701019287, + "learning_rate": 4.014027732891454e-06, + "loss": 0.1331, + "step": 5817 + }, + { + "epoch": 0.9426441996111471, + "grad_norm": 0.9293267726898193, + "learning_rate": 4.013679720282973e-06, + "loss": 0.1531, + "step": 5818 + }, + { + "epoch": 0.9428062216461439, + "grad_norm": 0.8839837312698364, + "learning_rate": 4.013331661359556e-06, + "loss": 0.158, + "step": 5819 + }, + { + "epoch": 0.9429682436811406, + "grad_norm": 0.7744541764259338, + "learning_rate": 4.012983556131852e-06, + "loss": 0.1346, + "step": 5820 + }, + { + "epoch": 0.9431302657161373, + "grad_norm": 0.8198082447052002, + "learning_rate": 4.012635404610512e-06, + "loss": 0.1272, + "step": 5821 + }, + { + "epoch": 0.9432922877511342, + "grad_norm": 0.9199150204658508, + "learning_rate": 4.01228720680619e-06, + "loss": 0.145, + "step": 5822 + }, + { + "epoch": 0.9434543097861309, + "grad_norm": 0.9087578058242798, + "learning_rate": 4.011938962729538e-06, + "loss": 0.1471, + "step": 5823 + }, + { + "epoch": 0.9436163318211277, + "grad_norm": 0.7970622181892395, + "learning_rate": 4.011590672391213e-06, + "loss": 0.1335, + "step": 5824 + }, + { + "epoch": 0.9437783538561244, + "grad_norm": 0.9035589694976807, + "learning_rate": 4.011242335801872e-06, + "loss": 0.1493, + "step": 5825 + }, + { + "epoch": 0.9439403758911212, + "grad_norm": 0.825301468372345, + "learning_rate": 4.010893952972173e-06, + "loss": 0.1256, + "step": 5826 + }, + { + "epoch": 0.9441023979261179, + "grad_norm": 0.8092053532600403, + "learning_rate": 4.010545523912775e-06, + "loss": 0.1258, + "step": 5827 + }, + { + "epoch": 0.9442644199611148, + "grad_norm": 0.9639272689819336, + "learning_rate": 4.010197048634338e-06, + "loss": 0.1487, + "step": 5828 + }, + { + "epoch": 0.9444264419961115, + "grad_norm": 1.0004661083221436, + "learning_rate": 4.009848527147527e-06, + "loss": 0.1527, + "step": 5829 + }, + { + "epoch": 0.9445884640311082, + "grad_norm": 0.8995679616928101, + "learning_rate": 4.009499959463005e-06, + "loss": 0.1308, + "step": 5830 + }, + { + "epoch": 0.944750486066105, + "grad_norm": 0.9287495613098145, + "learning_rate": 4.009151345591437e-06, + "loss": 0.1367, + "step": 5831 + }, + { + "epoch": 0.9449125081011017, + "grad_norm": 0.9027858376502991, + "learning_rate": 4.00880268554349e-06, + "loss": 0.1273, + "step": 5832 + }, + { + "epoch": 0.9450745301360985, + "grad_norm": 0.8884682655334473, + "learning_rate": 4.008453979329832e-06, + "loss": 0.144, + "step": 5833 + }, + { + "epoch": 0.9452365521710953, + "grad_norm": 0.932540774345398, + "learning_rate": 4.008105226961132e-06, + "loss": 0.1585, + "step": 5834 + }, + { + "epoch": 0.9453985742060921, + "grad_norm": 0.7685552835464478, + "learning_rate": 4.0077564284480625e-06, + "loss": 0.1161, + "step": 5835 + }, + { + "epoch": 0.9455605962410888, + "grad_norm": 0.830932080745697, + "learning_rate": 4.007407583801295e-06, + "loss": 0.1369, + "step": 5836 + }, + { + "epoch": 0.9457226182760855, + "grad_norm": 0.6946200132369995, + "learning_rate": 4.007058693031502e-06, + "loss": 0.1087, + "step": 5837 + }, + { + "epoch": 0.9458846403110823, + "grad_norm": 0.8081919550895691, + "learning_rate": 4.006709756149362e-06, + "loss": 0.1276, + "step": 5838 + }, + { + "epoch": 0.946046662346079, + "grad_norm": 0.9541810154914856, + "learning_rate": 4.0063607731655486e-06, + "loss": 0.1513, + "step": 5839 + }, + { + "epoch": 0.9462086843810759, + "grad_norm": 0.8741765022277832, + "learning_rate": 4.006011744090741e-06, + "loss": 0.151, + "step": 5840 + }, + { + "epoch": 0.9463707064160726, + "grad_norm": 0.8932287096977234, + "learning_rate": 4.005662668935618e-06, + "loss": 0.1519, + "step": 5841 + }, + { + "epoch": 0.9465327284510694, + "grad_norm": 0.7929561138153076, + "learning_rate": 4.005313547710861e-06, + "loss": 0.1335, + "step": 5842 + }, + { + "epoch": 0.9466947504860661, + "grad_norm": 0.8250575661659241, + "learning_rate": 4.004964380427153e-06, + "loss": 0.1288, + "step": 5843 + }, + { + "epoch": 0.9468567725210628, + "grad_norm": 0.8816642761230469, + "learning_rate": 4.004615167095176e-06, + "loss": 0.144, + "step": 5844 + }, + { + "epoch": 0.9470187945560596, + "grad_norm": 0.7737101316452026, + "learning_rate": 4.004265907725616e-06, + "loss": 0.1375, + "step": 5845 + }, + { + "epoch": 0.9471808165910564, + "grad_norm": 0.851394534111023, + "learning_rate": 4.003916602329161e-06, + "loss": 0.1353, + "step": 5846 + }, + { + "epoch": 0.9473428386260532, + "grad_norm": 0.8754749894142151, + "learning_rate": 4.003567250916496e-06, + "loss": 0.1465, + "step": 5847 + }, + { + "epoch": 0.9475048606610499, + "grad_norm": 0.8369944095611572, + "learning_rate": 4.0032178534983115e-06, + "loss": 0.1441, + "step": 5848 + }, + { + "epoch": 0.9476668826960467, + "grad_norm": 0.9141656160354614, + "learning_rate": 4.0028684100852986e-06, + "loss": 0.1613, + "step": 5849 + }, + { + "epoch": 0.9478289047310434, + "grad_norm": 0.8459381461143494, + "learning_rate": 4.00251892068815e-06, + "loss": 0.1346, + "step": 5850 + }, + { + "epoch": 0.9479909267660401, + "grad_norm": 0.8450573086738586, + "learning_rate": 4.002169385317558e-06, + "loss": 0.1378, + "step": 5851 + }, + { + "epoch": 0.948152948801037, + "grad_norm": 0.8298230767250061, + "learning_rate": 4.001819803984218e-06, + "loss": 0.1431, + "step": 5852 + }, + { + "epoch": 0.9483149708360337, + "grad_norm": 0.7699709534645081, + "learning_rate": 4.001470176698826e-06, + "loss": 0.132, + "step": 5853 + }, + { + "epoch": 0.9484769928710305, + "grad_norm": 0.7468587160110474, + "learning_rate": 4.00112050347208e-06, + "loss": 0.1125, + "step": 5854 + }, + { + "epoch": 0.9486390149060272, + "grad_norm": 0.8313590288162231, + "learning_rate": 4.00077078431468e-06, + "loss": 0.1382, + "step": 5855 + }, + { + "epoch": 0.948801036941024, + "grad_norm": 0.9513286352157593, + "learning_rate": 4.000421019237326e-06, + "loss": 0.1512, + "step": 5856 + }, + { + "epoch": 0.9489630589760207, + "grad_norm": 0.9002991914749146, + "learning_rate": 4.000071208250719e-06, + "loss": 0.1662, + "step": 5857 + }, + { + "epoch": 0.9491250810110174, + "grad_norm": 0.9496574401855469, + "learning_rate": 3.999721351365563e-06, + "loss": 0.1571, + "step": 5858 + }, + { + "epoch": 0.9492871030460143, + "grad_norm": 0.9157005548477173, + "learning_rate": 3.999371448592563e-06, + "loss": 0.1489, + "step": 5859 + }, + { + "epoch": 0.949449125081011, + "grad_norm": 0.8769446015357971, + "learning_rate": 3.999021499942425e-06, + "loss": 0.1484, + "step": 5860 + }, + { + "epoch": 0.9496111471160078, + "grad_norm": 0.8674890398979187, + "learning_rate": 3.9986715054258575e-06, + "loss": 0.1579, + "step": 5861 + }, + { + "epoch": 0.9497731691510045, + "grad_norm": 0.766059935092926, + "learning_rate": 3.998321465053568e-06, + "loss": 0.13, + "step": 5862 + }, + { + "epoch": 0.9499351911860013, + "grad_norm": 0.8092472553253174, + "learning_rate": 3.997971378836268e-06, + "loss": 0.1296, + "step": 5863 + }, + { + "epoch": 0.950097213220998, + "grad_norm": 0.8249281048774719, + "learning_rate": 3.9976212467846674e-06, + "loss": 0.1396, + "step": 5864 + }, + { + "epoch": 0.9502592352559948, + "grad_norm": 0.782484769821167, + "learning_rate": 3.997271068909483e-06, + "loss": 0.1208, + "step": 5865 + }, + { + "epoch": 0.9504212572909916, + "grad_norm": 0.7566808462142944, + "learning_rate": 3.996920845221425e-06, + "loss": 0.1269, + "step": 5866 + }, + { + "epoch": 0.9505832793259883, + "grad_norm": 0.7882890105247498, + "learning_rate": 3.9965705757312136e-06, + "loss": 0.1227, + "step": 5867 + }, + { + "epoch": 0.9507453013609851, + "grad_norm": 0.8613394498825073, + "learning_rate": 3.996220260449563e-06, + "loss": 0.1334, + "step": 5868 + }, + { + "epoch": 0.9509073233959818, + "grad_norm": 0.9516828060150146, + "learning_rate": 3.9958698993871935e-06, + "loss": 0.1641, + "step": 5869 + }, + { + "epoch": 0.9510693454309787, + "grad_norm": 0.8244841694831848, + "learning_rate": 3.9955194925548245e-06, + "loss": 0.1327, + "step": 5870 + }, + { + "epoch": 0.9512313674659754, + "grad_norm": 0.7766361832618713, + "learning_rate": 3.995169039963179e-06, + "loss": 0.1201, + "step": 5871 + }, + { + "epoch": 0.9513933895009722, + "grad_norm": 0.8498024344444275, + "learning_rate": 3.994818541622979e-06, + "loss": 0.1465, + "step": 5872 + }, + { + "epoch": 0.9515554115359689, + "grad_norm": 0.9167964458465576, + "learning_rate": 3.994467997544948e-06, + "loss": 0.1403, + "step": 5873 + }, + { + "epoch": 0.9517174335709656, + "grad_norm": 1.072054386138916, + "learning_rate": 3.994117407739814e-06, + "loss": 0.1786, + "step": 5874 + }, + { + "epoch": 0.9518794556059624, + "grad_norm": 0.8586252331733704, + "learning_rate": 3.993766772218303e-06, + "loss": 0.1465, + "step": 5875 + }, + { + "epoch": 0.9520414776409591, + "grad_norm": 0.8002846837043762, + "learning_rate": 3.993416090991143e-06, + "loss": 0.1172, + "step": 5876 + }, + { + "epoch": 0.952203499675956, + "grad_norm": 0.7997404932975769, + "learning_rate": 3.9930653640690655e-06, + "loss": 0.125, + "step": 5877 + }, + { + "epoch": 0.9523655217109527, + "grad_norm": 0.8767850399017334, + "learning_rate": 3.992714591462799e-06, + "loss": 0.1404, + "step": 5878 + }, + { + "epoch": 0.9525275437459495, + "grad_norm": 0.8756327629089355, + "learning_rate": 3.992363773183081e-06, + "loss": 0.1434, + "step": 5879 + }, + { + "epoch": 0.9526895657809462, + "grad_norm": 0.8966729044914246, + "learning_rate": 3.992012909240641e-06, + "loss": 0.1502, + "step": 5880 + }, + { + "epoch": 0.9528515878159429, + "grad_norm": 0.7201352715492249, + "learning_rate": 3.991661999646218e-06, + "loss": 0.1114, + "step": 5881 + }, + { + "epoch": 0.9530136098509397, + "grad_norm": 0.8088727593421936, + "learning_rate": 3.991311044410546e-06, + "loss": 0.1477, + "step": 5882 + }, + { + "epoch": 0.9531756318859365, + "grad_norm": 0.8761907815933228, + "learning_rate": 3.9909600435443665e-06, + "loss": 0.1541, + "step": 5883 + }, + { + "epoch": 0.9533376539209333, + "grad_norm": 0.7636457681655884, + "learning_rate": 3.990608997058416e-06, + "loss": 0.1187, + "step": 5884 + }, + { + "epoch": 0.95349967595593, + "grad_norm": 0.7406367063522339, + "learning_rate": 3.9902579049634385e-06, + "loss": 0.1194, + "step": 5885 + }, + { + "epoch": 0.9536616979909268, + "grad_norm": 0.8607000112533569, + "learning_rate": 3.989906767270175e-06, + "loss": 0.1342, + "step": 5886 + }, + { + "epoch": 0.9538237200259235, + "grad_norm": 0.8230603933334351, + "learning_rate": 3.98955558398937e-06, + "loss": 0.1441, + "step": 5887 + }, + { + "epoch": 0.9539857420609202, + "grad_norm": 0.8925058841705322, + "learning_rate": 3.989204355131769e-06, + "loss": 0.1424, + "step": 5888 + }, + { + "epoch": 0.9541477640959171, + "grad_norm": 0.9012094140052795, + "learning_rate": 3.98885308070812e-06, + "loss": 0.1487, + "step": 5889 + }, + { + "epoch": 0.9543097861309138, + "grad_norm": 0.818138599395752, + "learning_rate": 3.988501760729168e-06, + "loss": 0.1225, + "step": 5890 + }, + { + "epoch": 0.9544718081659106, + "grad_norm": 0.8324544429779053, + "learning_rate": 3.988150395205665e-06, + "loss": 0.1278, + "step": 5891 + }, + { + "epoch": 0.9546338302009073, + "grad_norm": 0.8996634483337402, + "learning_rate": 3.98779898414836e-06, + "loss": 0.1454, + "step": 5892 + }, + { + "epoch": 0.9547958522359041, + "grad_norm": 0.9417580962181091, + "learning_rate": 3.987447527568007e-06, + "loss": 0.1461, + "step": 5893 + }, + { + "epoch": 0.9549578742709008, + "grad_norm": 0.9308149814605713, + "learning_rate": 3.98709602547536e-06, + "loss": 0.154, + "step": 5894 + }, + { + "epoch": 0.9551198963058976, + "grad_norm": 0.8258563876152039, + "learning_rate": 3.986744477881172e-06, + "loss": 0.1396, + "step": 5895 + }, + { + "epoch": 0.9552819183408944, + "grad_norm": 0.9481770396232605, + "learning_rate": 3.986392884796202e-06, + "loss": 0.1515, + "step": 5896 + }, + { + "epoch": 0.9554439403758911, + "grad_norm": 0.8203932046890259, + "learning_rate": 3.986041246231206e-06, + "loss": 0.1401, + "step": 5897 + }, + { + "epoch": 0.9556059624108879, + "grad_norm": 0.8350275158882141, + "learning_rate": 3.9856895621969435e-06, + "loss": 0.1318, + "step": 5898 + }, + { + "epoch": 0.9557679844458846, + "grad_norm": 0.8285270929336548, + "learning_rate": 3.985337832704177e-06, + "loss": 0.139, + "step": 5899 + }, + { + "epoch": 0.9559300064808814, + "grad_norm": 0.7445301413536072, + "learning_rate": 3.984986057763667e-06, + "loss": 0.1271, + "step": 5900 + }, + { + "epoch": 0.9560920285158782, + "grad_norm": 0.8104132413864136, + "learning_rate": 3.984634237386177e-06, + "loss": 0.1355, + "step": 5901 + }, + { + "epoch": 0.9562540505508749, + "grad_norm": 0.9375022649765015, + "learning_rate": 3.984282371582472e-06, + "loss": 0.1473, + "step": 5902 + }, + { + "epoch": 0.9564160725858717, + "grad_norm": 0.7736932635307312, + "learning_rate": 3.983930460363318e-06, + "loss": 0.1289, + "step": 5903 + }, + { + "epoch": 0.9565780946208684, + "grad_norm": 0.8169133067131042, + "learning_rate": 3.983578503739483e-06, + "loss": 0.1125, + "step": 5904 + }, + { + "epoch": 0.9567401166558652, + "grad_norm": 0.9903810620307922, + "learning_rate": 3.983226501721736e-06, + "loss": 0.1406, + "step": 5905 + }, + { + "epoch": 0.9569021386908619, + "grad_norm": 0.8533335328102112, + "learning_rate": 3.982874454320849e-06, + "loss": 0.135, + "step": 5906 + }, + { + "epoch": 0.9570641607258588, + "grad_norm": 0.8362215161323547, + "learning_rate": 3.98252236154759e-06, + "loss": 0.1378, + "step": 5907 + }, + { + "epoch": 0.9572261827608555, + "grad_norm": 0.8589804768562317, + "learning_rate": 3.982170223412735e-06, + "loss": 0.1484, + "step": 5908 + }, + { + "epoch": 0.9573882047958522, + "grad_norm": 0.8295727372169495, + "learning_rate": 3.981818039927058e-06, + "loss": 0.147, + "step": 5909 + }, + { + "epoch": 0.957550226830849, + "grad_norm": 0.9022241234779358, + "learning_rate": 3.981465811101335e-06, + "loss": 0.1553, + "step": 5910 + }, + { + "epoch": 0.9577122488658457, + "grad_norm": 0.8496821522712708, + "learning_rate": 3.981113536946344e-06, + "loss": 0.1388, + "step": 5911 + }, + { + "epoch": 0.9578742709008425, + "grad_norm": 0.7762805819511414, + "learning_rate": 3.9807612174728615e-06, + "loss": 0.1212, + "step": 5912 + }, + { + "epoch": 0.9580362929358393, + "grad_norm": 0.7569793462753296, + "learning_rate": 3.9804088526916706e-06, + "loss": 0.1175, + "step": 5913 + }, + { + "epoch": 0.9581983149708361, + "grad_norm": 1.0253901481628418, + "learning_rate": 3.98005644261355e-06, + "loss": 0.1636, + "step": 5914 + }, + { + "epoch": 0.9583603370058328, + "grad_norm": 0.7959587574005127, + "learning_rate": 3.979703987249285e-06, + "loss": 0.1359, + "step": 5915 + }, + { + "epoch": 0.9585223590408296, + "grad_norm": 0.8208194971084595, + "learning_rate": 3.979351486609659e-06, + "loss": 0.1304, + "step": 5916 + }, + { + "epoch": 0.9586843810758263, + "grad_norm": 0.8786490559577942, + "learning_rate": 3.978998940705456e-06, + "loss": 0.144, + "step": 5917 + }, + { + "epoch": 0.958846403110823, + "grad_norm": 0.8570854663848877, + "learning_rate": 3.978646349547466e-06, + "loss": 0.1458, + "step": 5918 + }, + { + "epoch": 0.9590084251458199, + "grad_norm": 0.8755070567131042, + "learning_rate": 3.978293713146475e-06, + "loss": 0.1521, + "step": 5919 + }, + { + "epoch": 0.9591704471808166, + "grad_norm": 0.8440230488777161, + "learning_rate": 3.977941031513275e-06, + "loss": 0.1334, + "step": 5920 + }, + { + "epoch": 0.9593324692158134, + "grad_norm": 0.7312674522399902, + "learning_rate": 3.977588304658654e-06, + "loss": 0.1117, + "step": 5921 + }, + { + "epoch": 0.9594944912508101, + "grad_norm": 0.9204146265983582, + "learning_rate": 3.977235532593408e-06, + "loss": 0.1492, + "step": 5922 + }, + { + "epoch": 0.9596565132858069, + "grad_norm": 0.7905224561691284, + "learning_rate": 3.9768827153283295e-06, + "loss": 0.128, + "step": 5923 + }, + { + "epoch": 0.9598185353208036, + "grad_norm": 0.8433038592338562, + "learning_rate": 3.976529852874214e-06, + "loss": 0.1244, + "step": 5924 + }, + { + "epoch": 0.9599805573558003, + "grad_norm": 0.8728172183036804, + "learning_rate": 3.976176945241857e-06, + "loss": 0.1406, + "step": 5925 + }, + { + "epoch": 0.9601425793907972, + "grad_norm": 0.9237974286079407, + "learning_rate": 3.975823992442058e-06, + "loss": 0.1445, + "step": 5926 + }, + { + "epoch": 0.9603046014257939, + "grad_norm": 0.8911517262458801, + "learning_rate": 3.9754709944856175e-06, + "loss": 0.1443, + "step": 5927 + }, + { + "epoch": 0.9604666234607907, + "grad_norm": 0.9263610243797302, + "learning_rate": 3.975117951383334e-06, + "loss": 0.1428, + "step": 5928 + }, + { + "epoch": 0.9606286454957874, + "grad_norm": 0.7457212209701538, + "learning_rate": 3.974764863146012e-06, + "loss": 0.1228, + "step": 5929 + }, + { + "epoch": 0.9607906675307842, + "grad_norm": 0.9272271990776062, + "learning_rate": 3.974411729784453e-06, + "loss": 0.1505, + "step": 5930 + }, + { + "epoch": 0.960952689565781, + "grad_norm": 0.9309628009796143, + "learning_rate": 3.974058551309463e-06, + "loss": 0.1544, + "step": 5931 + }, + { + "epoch": 0.9611147116007777, + "grad_norm": 0.8147203326225281, + "learning_rate": 3.973705327731849e-06, + "loss": 0.1389, + "step": 5932 + }, + { + "epoch": 0.9612767336357745, + "grad_norm": 0.7795599699020386, + "learning_rate": 3.9733520590624185e-06, + "loss": 0.1237, + "step": 5933 + }, + { + "epoch": 0.9614387556707712, + "grad_norm": 0.804670512676239, + "learning_rate": 3.97299874531198e-06, + "loss": 0.139, + "step": 5934 + }, + { + "epoch": 0.961600777705768, + "grad_norm": 0.8195658326148987, + "learning_rate": 3.972645386491345e-06, + "loss": 0.1495, + "step": 5935 + }, + { + "epoch": 0.9617627997407647, + "grad_norm": 0.7763250470161438, + "learning_rate": 3.972291982611325e-06, + "loss": 0.1301, + "step": 5936 + }, + { + "epoch": 0.9619248217757616, + "grad_norm": 1.2038568258285522, + "learning_rate": 3.971938533682732e-06, + "loss": 0.1845, + "step": 5937 + }, + { + "epoch": 0.9620868438107583, + "grad_norm": 0.8428332805633545, + "learning_rate": 3.971585039716382e-06, + "loss": 0.1433, + "step": 5938 + }, + { + "epoch": 0.962248865845755, + "grad_norm": 0.8756902813911438, + "learning_rate": 3.971231500723093e-06, + "loss": 0.148, + "step": 5939 + }, + { + "epoch": 0.9624108878807518, + "grad_norm": 0.8229630589485168, + "learning_rate": 3.970877916713678e-06, + "loss": 0.1428, + "step": 5940 + }, + { + "epoch": 0.9625729099157485, + "grad_norm": 0.8338178992271423, + "learning_rate": 3.97052428769896e-06, + "loss": 0.1336, + "step": 5941 + }, + { + "epoch": 0.9627349319507453, + "grad_norm": 0.8867552280426025, + "learning_rate": 3.9701706136897564e-06, + "loss": 0.1449, + "step": 5942 + }, + { + "epoch": 0.962896953985742, + "grad_norm": 0.8758612275123596, + "learning_rate": 3.96981689469689e-06, + "loss": 0.1592, + "step": 5943 + }, + { + "epoch": 0.9630589760207389, + "grad_norm": 0.8010472059249878, + "learning_rate": 3.969463130731183e-06, + "loss": 0.1295, + "step": 5944 + }, + { + "epoch": 0.9632209980557356, + "grad_norm": 0.9243650436401367, + "learning_rate": 3.969109321803461e-06, + "loss": 0.1523, + "step": 5945 + }, + { + "epoch": 0.9633830200907323, + "grad_norm": 0.8313032984733582, + "learning_rate": 3.968755467924549e-06, + "loss": 0.1382, + "step": 5946 + }, + { + "epoch": 0.9635450421257291, + "grad_norm": 0.8543305397033691, + "learning_rate": 3.9684015691052736e-06, + "loss": 0.143, + "step": 5947 + }, + { + "epoch": 0.9637070641607258, + "grad_norm": 0.8567736148834229, + "learning_rate": 3.968047625356463e-06, + "loss": 0.1359, + "step": 5948 + }, + { + "epoch": 0.9638690861957226, + "grad_norm": 0.9071134328842163, + "learning_rate": 3.967693636688948e-06, + "loss": 0.1325, + "step": 5949 + }, + { + "epoch": 0.9640311082307194, + "grad_norm": 0.7577062249183655, + "learning_rate": 3.96733960311356e-06, + "loss": 0.1227, + "step": 5950 + }, + { + "epoch": 0.9641931302657162, + "grad_norm": 0.9051762223243713, + "learning_rate": 3.966985524641132e-06, + "loss": 0.1367, + "step": 5951 + }, + { + "epoch": 0.9643551523007129, + "grad_norm": 0.8882315754890442, + "learning_rate": 3.966631401282495e-06, + "loss": 0.1543, + "step": 5952 + }, + { + "epoch": 0.9645171743357096, + "grad_norm": 0.8317487835884094, + "learning_rate": 3.966277233048487e-06, + "loss": 0.1269, + "step": 5953 + }, + { + "epoch": 0.9646791963707064, + "grad_norm": 0.8450263738632202, + "learning_rate": 3.965923019949944e-06, + "loss": 0.1332, + "step": 5954 + }, + { + "epoch": 0.9648412184057031, + "grad_norm": 0.9014632105827332, + "learning_rate": 3.965568761997704e-06, + "loss": 0.1534, + "step": 5955 + }, + { + "epoch": 0.9650032404407, + "grad_norm": 0.8284904956817627, + "learning_rate": 3.965214459202607e-06, + "loss": 0.1298, + "step": 5956 + }, + { + "epoch": 0.9651652624756967, + "grad_norm": 0.7886707782745361, + "learning_rate": 3.964860111575493e-06, + "loss": 0.1353, + "step": 5957 + }, + { + "epoch": 0.9653272845106935, + "grad_norm": 0.8882688879966736, + "learning_rate": 3.964505719127205e-06, + "loss": 0.1421, + "step": 5958 + }, + { + "epoch": 0.9654893065456902, + "grad_norm": 0.8411626219749451, + "learning_rate": 3.964151281868585e-06, + "loss": 0.1326, + "step": 5959 + }, + { + "epoch": 0.9656513285806869, + "grad_norm": 0.9165163636207581, + "learning_rate": 3.963796799810479e-06, + "loss": 0.1466, + "step": 5960 + }, + { + "epoch": 0.9658133506156837, + "grad_norm": 0.7748314142227173, + "learning_rate": 3.963442272963735e-06, + "loss": 0.133, + "step": 5961 + }, + { + "epoch": 0.9659753726506805, + "grad_norm": 0.8058779835700989, + "learning_rate": 3.9630877013391964e-06, + "loss": 0.1459, + "step": 5962 + }, + { + "epoch": 0.9661373946856773, + "grad_norm": 0.9120404720306396, + "learning_rate": 3.962733084947717e-06, + "loss": 0.1459, + "step": 5963 + }, + { + "epoch": 0.966299416720674, + "grad_norm": 0.8716152906417847, + "learning_rate": 3.962378423800143e-06, + "loss": 0.1448, + "step": 5964 + }, + { + "epoch": 0.9664614387556708, + "grad_norm": 0.8008790612220764, + "learning_rate": 3.962023717907329e-06, + "loss": 0.131, + "step": 5965 + }, + { + "epoch": 0.9666234607906675, + "grad_norm": 0.7553631067276001, + "learning_rate": 3.961668967280128e-06, + "loss": 0.1304, + "step": 5966 + }, + { + "epoch": 0.9667854828256643, + "grad_norm": 0.9899448752403259, + "learning_rate": 3.961314171929392e-06, + "loss": 0.1312, + "step": 5967 + }, + { + "epoch": 0.9669475048606611, + "grad_norm": 0.7708137631416321, + "learning_rate": 3.96095933186598e-06, + "loss": 0.1328, + "step": 5968 + }, + { + "epoch": 0.9671095268956578, + "grad_norm": 0.7999005317687988, + "learning_rate": 3.960604447100747e-06, + "loss": 0.1344, + "step": 5969 + }, + { + "epoch": 0.9672715489306546, + "grad_norm": 0.7382838129997253, + "learning_rate": 3.960249517644553e-06, + "loss": 0.1226, + "step": 5970 + }, + { + "epoch": 0.9674335709656513, + "grad_norm": 1.0125248432159424, + "learning_rate": 3.959894543508258e-06, + "loss": 0.1487, + "step": 5971 + }, + { + "epoch": 0.9675955930006481, + "grad_norm": 0.871058464050293, + "learning_rate": 3.959539524702722e-06, + "loss": 0.133, + "step": 5972 + }, + { + "epoch": 0.9677576150356448, + "grad_norm": 0.7588186264038086, + "learning_rate": 3.9591844612388095e-06, + "loss": 0.1224, + "step": 5973 + }, + { + "epoch": 0.9679196370706417, + "grad_norm": 0.7937440276145935, + "learning_rate": 3.958829353127383e-06, + "loss": 0.1285, + "step": 5974 + }, + { + "epoch": 0.9680816591056384, + "grad_norm": 0.8277894258499146, + "learning_rate": 3.958474200379309e-06, + "loss": 0.136, + "step": 5975 + }, + { + "epoch": 0.9682436811406351, + "grad_norm": 0.8382477760314941, + "learning_rate": 3.958119003005453e-06, + "loss": 0.1454, + "step": 5976 + }, + { + "epoch": 0.9684057031756319, + "grad_norm": 0.8663250207901001, + "learning_rate": 3.9577637610166855e-06, + "loss": 0.1466, + "step": 5977 + }, + { + "epoch": 0.9685677252106286, + "grad_norm": 0.9586002230644226, + "learning_rate": 3.9574084744238735e-06, + "loss": 0.1648, + "step": 5978 + }, + { + "epoch": 0.9687297472456254, + "grad_norm": 0.7380560636520386, + "learning_rate": 3.95705314323789e-06, + "loss": 0.1182, + "step": 5979 + }, + { + "epoch": 0.9688917692806222, + "grad_norm": 0.8138405084609985, + "learning_rate": 3.956697767469606e-06, + "loss": 0.1381, + "step": 5980 + }, + { + "epoch": 0.969053791315619, + "grad_norm": 0.8596041202545166, + "learning_rate": 3.956342347129894e-06, + "loss": 0.1617, + "step": 5981 + }, + { + "epoch": 0.9692158133506157, + "grad_norm": 0.83723384141922, + "learning_rate": 3.955986882229632e-06, + "loss": 0.1415, + "step": 5982 + }, + { + "epoch": 0.9693778353856124, + "grad_norm": 0.8260191679000854, + "learning_rate": 3.955631372779694e-06, + "loss": 0.1442, + "step": 5983 + }, + { + "epoch": 0.9695398574206092, + "grad_norm": 0.776252806186676, + "learning_rate": 3.95527581879096e-06, + "loss": 0.134, + "step": 5984 + }, + { + "epoch": 0.9697018794556059, + "grad_norm": 0.8889530301094055, + "learning_rate": 3.954920220274307e-06, + "loss": 0.1459, + "step": 5985 + }, + { + "epoch": 0.9698639014906028, + "grad_norm": 0.9564645886421204, + "learning_rate": 3.954564577240615e-06, + "loss": 0.164, + "step": 5986 + }, + { + "epoch": 0.9700259235255995, + "grad_norm": 0.7842409610748291, + "learning_rate": 3.954208889700768e-06, + "loss": 0.1222, + "step": 5987 + }, + { + "epoch": 0.9701879455605963, + "grad_norm": 0.8455632328987122, + "learning_rate": 3.9538531576656465e-06, + "loss": 0.1347, + "step": 5988 + }, + { + "epoch": 0.970349967595593, + "grad_norm": 0.688930869102478, + "learning_rate": 3.953497381146139e-06, + "loss": 0.1216, + "step": 5989 + }, + { + "epoch": 0.9705119896305897, + "grad_norm": 1.0175490379333496, + "learning_rate": 3.953141560153128e-06, + "loss": 0.1671, + "step": 5990 + }, + { + "epoch": 0.9706740116655865, + "grad_norm": 0.9077379703521729, + "learning_rate": 3.952785694697502e-06, + "loss": 0.1554, + "step": 5991 + }, + { + "epoch": 0.9708360337005832, + "grad_norm": 0.8699192404747009, + "learning_rate": 3.952429784790148e-06, + "loss": 0.1375, + "step": 5992 + }, + { + "epoch": 0.9709980557355801, + "grad_norm": 0.8834347724914551, + "learning_rate": 3.952073830441959e-06, + "loss": 0.1395, + "step": 5993 + }, + { + "epoch": 0.9711600777705768, + "grad_norm": 0.8686573505401611, + "learning_rate": 3.951717831663825e-06, + "loss": 0.1358, + "step": 5994 + }, + { + "epoch": 0.9713220998055736, + "grad_norm": 1.110762357711792, + "learning_rate": 3.951361788466636e-06, + "loss": 0.1464, + "step": 5995 + }, + { + "epoch": 0.9714841218405703, + "grad_norm": 0.8450319170951843, + "learning_rate": 3.951005700861291e-06, + "loss": 0.1428, + "step": 5996 + }, + { + "epoch": 0.971646143875567, + "grad_norm": 0.9132416844367981, + "learning_rate": 3.950649568858682e-06, + "loss": 0.1431, + "step": 5997 + }, + { + "epoch": 0.9718081659105638, + "grad_norm": 0.695837140083313, + "learning_rate": 3.9502933924697076e-06, + "loss": 0.1142, + "step": 5998 + }, + { + "epoch": 0.9719701879455606, + "grad_norm": 0.8655622005462646, + "learning_rate": 3.949937171705264e-06, + "loss": 0.1355, + "step": 5999 + }, + { + "epoch": 0.9721322099805574, + "grad_norm": 0.7984117865562439, + "learning_rate": 3.949580906576252e-06, + "loss": 0.1272, + "step": 6000 + }, + { + "epoch": 0.9722942320155541, + "grad_norm": 0.8514125347137451, + "learning_rate": 3.949224597093572e-06, + "loss": 0.1446, + "step": 6001 + }, + { + "epoch": 0.9724562540505509, + "grad_norm": 0.9616315960884094, + "learning_rate": 3.948868243268127e-06, + "loss": 0.1523, + "step": 6002 + }, + { + "epoch": 0.9726182760855476, + "grad_norm": 0.9208241105079651, + "learning_rate": 3.948511845110819e-06, + "loss": 0.1457, + "step": 6003 + }, + { + "epoch": 0.9727802981205443, + "grad_norm": 0.7493133544921875, + "learning_rate": 3.948155402632554e-06, + "loss": 0.1158, + "step": 6004 + }, + { + "epoch": 0.9729423201555412, + "grad_norm": 1.0176682472229004, + "learning_rate": 3.947798915844239e-06, + "loss": 0.1731, + "step": 6005 + }, + { + "epoch": 0.9731043421905379, + "grad_norm": 0.8895021080970764, + "learning_rate": 3.94744238475678e-06, + "loss": 0.1423, + "step": 6006 + }, + { + "epoch": 0.9732663642255347, + "grad_norm": 0.7985418438911438, + "learning_rate": 3.947085809381087e-06, + "loss": 0.1244, + "step": 6007 + }, + { + "epoch": 0.9734283862605314, + "grad_norm": 0.7558802366256714, + "learning_rate": 3.94672918972807e-06, + "loss": 0.1223, + "step": 6008 + }, + { + "epoch": 0.9735904082955282, + "grad_norm": 0.8864288330078125, + "learning_rate": 3.946372525808641e-06, + "loss": 0.1423, + "step": 6009 + }, + { + "epoch": 0.9737524303305249, + "grad_norm": 0.9449102282524109, + "learning_rate": 3.946015817633714e-06, + "loss": 0.1252, + "step": 6010 + }, + { + "epoch": 0.9739144523655218, + "grad_norm": 0.7771594524383545, + "learning_rate": 3.9456590652142005e-06, + "loss": 0.1204, + "step": 6011 + }, + { + "epoch": 0.9740764744005185, + "grad_norm": 0.7628992199897766, + "learning_rate": 3.945302268561019e-06, + "loss": 0.1282, + "step": 6012 + }, + { + "epoch": 0.9742384964355152, + "grad_norm": 0.8803264498710632, + "learning_rate": 3.944945427685085e-06, + "loss": 0.1308, + "step": 6013 + }, + { + "epoch": 0.974400518470512, + "grad_norm": 0.7819192409515381, + "learning_rate": 3.944588542597319e-06, + "loss": 0.1302, + "step": 6014 + }, + { + "epoch": 0.9745625405055087, + "grad_norm": 0.9194301962852478, + "learning_rate": 3.944231613308637e-06, + "loss": 0.1479, + "step": 6015 + }, + { + "epoch": 0.9747245625405055, + "grad_norm": 0.9355913996696472, + "learning_rate": 3.943874639829964e-06, + "loss": 0.1463, + "step": 6016 + }, + { + "epoch": 0.9748865845755023, + "grad_norm": 0.8575900793075562, + "learning_rate": 3.9435176221722215e-06, + "loss": 0.125, + "step": 6017 + }, + { + "epoch": 0.9750486066104991, + "grad_norm": 0.9341332912445068, + "learning_rate": 3.943160560346332e-06, + "loss": 0.1502, + "step": 6018 + }, + { + "epoch": 0.9752106286454958, + "grad_norm": 0.9694084525108337, + "learning_rate": 3.942803454363224e-06, + "loss": 0.1459, + "step": 6019 + }, + { + "epoch": 0.9753726506804925, + "grad_norm": 0.7550426721572876, + "learning_rate": 3.942446304233819e-06, + "loss": 0.1141, + "step": 6020 + }, + { + "epoch": 0.9755346727154893, + "grad_norm": 0.890663743019104, + "learning_rate": 3.942089109969049e-06, + "loss": 0.144, + "step": 6021 + }, + { + "epoch": 0.975696694750486, + "grad_norm": 0.7910590171813965, + "learning_rate": 3.941731871579842e-06, + "loss": 0.1293, + "step": 6022 + }, + { + "epoch": 0.9758587167854829, + "grad_norm": 0.9037308096885681, + "learning_rate": 3.941374589077128e-06, + "loss": 0.1524, + "step": 6023 + }, + { + "epoch": 0.9760207388204796, + "grad_norm": 0.9725434184074402, + "learning_rate": 3.94101726247184e-06, + "loss": 0.1665, + "step": 6024 + }, + { + "epoch": 0.9761827608554764, + "grad_norm": 0.8812149167060852, + "learning_rate": 3.940659891774912e-06, + "loss": 0.139, + "step": 6025 + }, + { + "epoch": 0.9763447828904731, + "grad_norm": 0.9266581535339355, + "learning_rate": 3.9403024769972766e-06, + "loss": 0.1425, + "step": 6026 + }, + { + "epoch": 0.9765068049254698, + "grad_norm": 0.7862972617149353, + "learning_rate": 3.939945018149871e-06, + "loss": 0.1288, + "step": 6027 + }, + { + "epoch": 0.9766688269604666, + "grad_norm": 0.7440429925918579, + "learning_rate": 3.939587515243632e-06, + "loss": 0.1236, + "step": 6028 + }, + { + "epoch": 0.9768308489954634, + "grad_norm": 0.9152294397354126, + "learning_rate": 3.9392299682894995e-06, + "loss": 0.1558, + "step": 6029 + }, + { + "epoch": 0.9769928710304602, + "grad_norm": 0.907541811466217, + "learning_rate": 3.938872377298413e-06, + "loss": 0.1379, + "step": 6030 + }, + { + "epoch": 0.9771548930654569, + "grad_norm": 0.8130697011947632, + "learning_rate": 3.938514742281313e-06, + "loss": 0.1342, + "step": 6031 + }, + { + "epoch": 0.9773169151004537, + "grad_norm": 0.7944709658622742, + "learning_rate": 3.938157063249144e-06, + "loss": 0.1385, + "step": 6032 + }, + { + "epoch": 0.9774789371354504, + "grad_norm": 0.8079535365104675, + "learning_rate": 3.937799340212849e-06, + "loss": 0.1355, + "step": 6033 + }, + { + "epoch": 0.9776409591704471, + "grad_norm": 0.8324079513549805, + "learning_rate": 3.937441573183373e-06, + "loss": 0.1358, + "step": 6034 + }, + { + "epoch": 0.977802981205444, + "grad_norm": 0.868264377117157, + "learning_rate": 3.937083762171663e-06, + "loss": 0.1393, + "step": 6035 + }, + { + "epoch": 0.9779650032404407, + "grad_norm": 0.7842541933059692, + "learning_rate": 3.936725907188668e-06, + "loss": 0.1252, + "step": 6036 + }, + { + "epoch": 0.9781270252754375, + "grad_norm": 0.9624860286712646, + "learning_rate": 3.936368008245337e-06, + "loss": 0.1542, + "step": 6037 + }, + { + "epoch": 0.9782890473104342, + "grad_norm": 0.866369366645813, + "learning_rate": 3.936010065352622e-06, + "loss": 0.1383, + "step": 6038 + }, + { + "epoch": 0.978451069345431, + "grad_norm": 0.8309353590011597, + "learning_rate": 3.935652078521473e-06, + "loss": 0.1299, + "step": 6039 + }, + { + "epoch": 0.9786130913804277, + "grad_norm": 0.8431254029273987, + "learning_rate": 3.935294047762844e-06, + "loss": 0.1346, + "step": 6040 + }, + { + "epoch": 0.9787751134154244, + "grad_norm": 0.8997429609298706, + "learning_rate": 3.934935973087691e-06, + "loss": 0.1351, + "step": 6041 + }, + { + "epoch": 0.9789371354504213, + "grad_norm": 0.9118685722351074, + "learning_rate": 3.93457785450697e-06, + "loss": 0.1442, + "step": 6042 + }, + { + "epoch": 0.979099157485418, + "grad_norm": 0.8979122638702393, + "learning_rate": 3.934219692031639e-06, + "loss": 0.1526, + "step": 6043 + }, + { + "epoch": 0.9792611795204148, + "grad_norm": 0.8222776055335999, + "learning_rate": 3.933861485672656e-06, + "loss": 0.1327, + "step": 6044 + }, + { + "epoch": 0.9794232015554115, + "grad_norm": 0.854524552822113, + "learning_rate": 3.9335032354409794e-06, + "loss": 0.1402, + "step": 6045 + }, + { + "epoch": 0.9795852235904083, + "grad_norm": 0.8611454367637634, + "learning_rate": 3.933144941347574e-06, + "loss": 0.1437, + "step": 6046 + }, + { + "epoch": 0.979747245625405, + "grad_norm": 0.8141959309577942, + "learning_rate": 3.9327866034034025e-06, + "loss": 0.1492, + "step": 6047 + }, + { + "epoch": 0.9799092676604018, + "grad_norm": 0.7863458395004272, + "learning_rate": 3.932428221619427e-06, + "loss": 0.1275, + "step": 6048 + }, + { + "epoch": 0.9800712896953986, + "grad_norm": 0.8340588808059692, + "learning_rate": 3.9320697960066155e-06, + "loss": 0.1455, + "step": 6049 + }, + { + "epoch": 0.9802333117303953, + "grad_norm": 0.9195173978805542, + "learning_rate": 3.931711326575933e-06, + "loss": 0.1282, + "step": 6050 + }, + { + "epoch": 0.9803953337653921, + "grad_norm": 0.8525646328926086, + "learning_rate": 3.931352813338348e-06, + "loss": 0.1431, + "step": 6051 + }, + { + "epoch": 0.9805573558003888, + "grad_norm": 0.8366729617118835, + "learning_rate": 3.9309942563048315e-06, + "loss": 0.1291, + "step": 6052 + }, + { + "epoch": 0.9807193778353857, + "grad_norm": 0.8262105584144592, + "learning_rate": 3.930635655486353e-06, + "loss": 0.1377, + "step": 6053 + }, + { + "epoch": 0.9808813998703824, + "grad_norm": 0.9952908158302307, + "learning_rate": 3.930277010893887e-06, + "loss": 0.1637, + "step": 6054 + }, + { + "epoch": 0.9810434219053791, + "grad_norm": 0.8296589851379395, + "learning_rate": 3.929918322538404e-06, + "loss": 0.1395, + "step": 6055 + }, + { + "epoch": 0.9812054439403759, + "grad_norm": 0.8813459873199463, + "learning_rate": 3.929559590430881e-06, + "loss": 0.1525, + "step": 6056 + }, + { + "epoch": 0.9813674659753726, + "grad_norm": 2.0823469161987305, + "learning_rate": 3.9292008145822955e-06, + "loss": 0.1402, + "step": 6057 + }, + { + "epoch": 0.9815294880103694, + "grad_norm": 0.8811355233192444, + "learning_rate": 3.928841995003622e-06, + "loss": 0.1431, + "step": 6058 + }, + { + "epoch": 0.9816915100453661, + "grad_norm": 0.7101410627365112, + "learning_rate": 3.928483131705842e-06, + "loss": 0.1126, + "step": 6059 + }, + { + "epoch": 0.981853532080363, + "grad_norm": 0.7249727249145508, + "learning_rate": 3.928124224699935e-06, + "loss": 0.1111, + "step": 6060 + }, + { + "epoch": 0.9820155541153597, + "grad_norm": 1.166760802268982, + "learning_rate": 3.927765273996882e-06, + "loss": 0.1405, + "step": 6061 + }, + { + "epoch": 0.9821775761503565, + "grad_norm": 0.7682209610939026, + "learning_rate": 3.927406279607668e-06, + "loss": 0.1365, + "step": 6062 + }, + { + "epoch": 0.9823395981853532, + "grad_norm": 0.916122317314148, + "learning_rate": 3.927047241543275e-06, + "loss": 0.1388, + "step": 6063 + }, + { + "epoch": 0.9825016202203499, + "grad_norm": 0.9284025430679321, + "learning_rate": 3.92668815981469e-06, + "loss": 0.1475, + "step": 6064 + }, + { + "epoch": 0.9826636422553467, + "grad_norm": 0.9036963582038879, + "learning_rate": 3.9263290344329e-06, + "loss": 0.1482, + "step": 6065 + }, + { + "epoch": 0.9828256642903435, + "grad_norm": 0.9499265551567078, + "learning_rate": 3.925969865408893e-06, + "loss": 0.1494, + "step": 6066 + }, + { + "epoch": 0.9829876863253403, + "grad_norm": 0.8025641441345215, + "learning_rate": 3.925610652753659e-06, + "loss": 0.1319, + "step": 6067 + }, + { + "epoch": 0.983149708360337, + "grad_norm": 0.840201199054718, + "learning_rate": 3.925251396478189e-06, + "loss": 0.1483, + "step": 6068 + }, + { + "epoch": 0.9833117303953338, + "grad_norm": 0.9733452796936035, + "learning_rate": 3.924892096593476e-06, + "loss": 0.1581, + "step": 6069 + }, + { + "epoch": 0.9834737524303305, + "grad_norm": 0.7932161688804626, + "learning_rate": 3.9245327531105115e-06, + "loss": 0.1362, + "step": 6070 + }, + { + "epoch": 0.9836357744653272, + "grad_norm": 0.9006250500679016, + "learning_rate": 3.924173366040294e-06, + "loss": 0.1351, + "step": 6071 + }, + { + "epoch": 0.9837977965003241, + "grad_norm": 0.913811206817627, + "learning_rate": 3.923813935393816e-06, + "loss": 0.1392, + "step": 6072 + }, + { + "epoch": 0.9839598185353208, + "grad_norm": 0.7927339673042297, + "learning_rate": 3.923454461182078e-06, + "loss": 0.1271, + "step": 6073 + }, + { + "epoch": 0.9841218405703176, + "grad_norm": 0.8521562218666077, + "learning_rate": 3.923094943416078e-06, + "loss": 0.1226, + "step": 6074 + }, + { + "epoch": 0.9842838626053143, + "grad_norm": 1.042307734489441, + "learning_rate": 3.922735382106817e-06, + "loss": 0.1802, + "step": 6075 + }, + { + "epoch": 0.9844458846403111, + "grad_norm": 0.9505312442779541, + "learning_rate": 3.922375777265296e-06, + "loss": 0.1345, + "step": 6076 + }, + { + "epoch": 0.9846079066753078, + "grad_norm": 0.8530185222625732, + "learning_rate": 3.922016128902519e-06, + "loss": 0.1367, + "step": 6077 + }, + { + "epoch": 0.9847699287103046, + "grad_norm": 0.8222798705101013, + "learning_rate": 3.921656437029488e-06, + "loss": 0.1481, + "step": 6078 + }, + { + "epoch": 0.9849319507453014, + "grad_norm": 1.2158775329589844, + "learning_rate": 3.921296701657211e-06, + "loss": 0.1225, + "step": 6079 + }, + { + "epoch": 0.9850939727802981, + "grad_norm": 2.2171168327331543, + "learning_rate": 3.9209369227966945e-06, + "loss": 0.1395, + "step": 6080 + }, + { + "epoch": 0.9852559948152949, + "grad_norm": 0.7408256530761719, + "learning_rate": 3.920577100458948e-06, + "loss": 0.1185, + "step": 6081 + }, + { + "epoch": 0.9854180168502916, + "grad_norm": 0.8751834034919739, + "learning_rate": 3.920217234654978e-06, + "loss": 0.1487, + "step": 6082 + }, + { + "epoch": 0.9855800388852884, + "grad_norm": 0.9367472529411316, + "learning_rate": 3.919857325395799e-06, + "loss": 0.1458, + "step": 6083 + }, + { + "epoch": 0.9857420609202852, + "grad_norm": 0.9146280288696289, + "learning_rate": 3.919497372692421e-06, + "loss": 0.1509, + "step": 6084 + }, + { + "epoch": 0.9859040829552819, + "grad_norm": 0.8548406958580017, + "learning_rate": 3.919137376555859e-06, + "loss": 0.1375, + "step": 6085 + }, + { + "epoch": 0.9860661049902787, + "grad_norm": 0.9017359614372253, + "learning_rate": 3.918777336997127e-06, + "loss": 0.1334, + "step": 6086 + }, + { + "epoch": 0.9862281270252754, + "grad_norm": 0.6784223318099976, + "learning_rate": 3.918417254027243e-06, + "loss": 0.1084, + "step": 6087 + }, + { + "epoch": 0.9863901490602722, + "grad_norm": 0.7751449346542358, + "learning_rate": 3.918057127657222e-06, + "loss": 0.1193, + "step": 6088 + }, + { + "epoch": 0.9865521710952689, + "grad_norm": 0.9636908769607544, + "learning_rate": 3.917696957898085e-06, + "loss": 0.1814, + "step": 6089 + }, + { + "epoch": 0.9867141931302658, + "grad_norm": 0.8328016996383667, + "learning_rate": 3.9173367447608525e-06, + "loss": 0.1353, + "step": 6090 + }, + { + "epoch": 0.9868762151652625, + "grad_norm": 0.9201585054397583, + "learning_rate": 3.9169764882565445e-06, + "loss": 0.13, + "step": 6091 + }, + { + "epoch": 0.9870382372002592, + "grad_norm": 0.8333777189254761, + "learning_rate": 3.916616188396185e-06, + "loss": 0.1367, + "step": 6092 + }, + { + "epoch": 0.987200259235256, + "grad_norm": 0.8652458190917969, + "learning_rate": 3.916255845190799e-06, + "loss": 0.1338, + "step": 6093 + }, + { + "epoch": 0.9873622812702527, + "grad_norm": 0.9521787762641907, + "learning_rate": 3.915895458651411e-06, + "loss": 0.1585, + "step": 6094 + }, + { + "epoch": 0.9875243033052495, + "grad_norm": 0.70457923412323, + "learning_rate": 3.915535028789049e-06, + "loss": 0.1024, + "step": 6095 + }, + { + "epoch": 0.9876863253402463, + "grad_norm": 0.9282441735267639, + "learning_rate": 3.9151745556147404e-06, + "loss": 0.1636, + "step": 6096 + }, + { + "epoch": 0.9878483473752431, + "grad_norm": 0.8370496034622192, + "learning_rate": 3.914814039139515e-06, + "loss": 0.13, + "step": 6097 + }, + { + "epoch": 0.9880103694102398, + "grad_norm": 0.8045435547828674, + "learning_rate": 3.914453479374403e-06, + "loss": 0.1413, + "step": 6098 + }, + { + "epoch": 0.9881723914452365, + "grad_norm": 0.8645136952400208, + "learning_rate": 3.914092876330439e-06, + "loss": 0.1422, + "step": 6099 + }, + { + "epoch": 0.9883344134802333, + "grad_norm": 0.8007838726043701, + "learning_rate": 3.913732230018654e-06, + "loss": 0.1364, + "step": 6100 + }, + { + "epoch": 0.98849643551523, + "grad_norm": 0.9162132143974304, + "learning_rate": 3.913371540450084e-06, + "loss": 0.1577, + "step": 6101 + }, + { + "epoch": 0.9886584575502269, + "grad_norm": 0.8496082425117493, + "learning_rate": 3.913010807635765e-06, + "loss": 0.1321, + "step": 6102 + }, + { + "epoch": 0.9888204795852236, + "grad_norm": 0.7729654312133789, + "learning_rate": 3.912650031586734e-06, + "loss": 0.1298, + "step": 6103 + }, + { + "epoch": 0.9889825016202204, + "grad_norm": 0.7477056980133057, + "learning_rate": 3.9122892123140324e-06, + "loss": 0.111, + "step": 6104 + }, + { + "epoch": 0.9891445236552171, + "grad_norm": 0.937633752822876, + "learning_rate": 3.911928349828697e-06, + "loss": 0.1427, + "step": 6105 + }, + { + "epoch": 0.9893065456902139, + "grad_norm": 0.8032740354537964, + "learning_rate": 3.911567444141771e-06, + "loss": 0.1345, + "step": 6106 + }, + { + "epoch": 0.9894685677252106, + "grad_norm": 0.8660422563552856, + "learning_rate": 3.911206495264299e-06, + "loss": 0.1322, + "step": 6107 + }, + { + "epoch": 0.9896305897602073, + "grad_norm": 0.987167477607727, + "learning_rate": 3.910845503207322e-06, + "loss": 0.1606, + "step": 6108 + }, + { + "epoch": 0.9897926117952042, + "grad_norm": 0.7517658472061157, + "learning_rate": 3.910484467981886e-06, + "loss": 0.1172, + "step": 6109 + }, + { + "epoch": 0.9899546338302009, + "grad_norm": 0.8762201070785522, + "learning_rate": 3.9101233895990396e-06, + "loss": 0.145, + "step": 6110 + }, + { + "epoch": 0.9901166558651977, + "grad_norm": 0.7897788882255554, + "learning_rate": 3.9097622680698296e-06, + "loss": 0.1191, + "step": 6111 + }, + { + "epoch": 0.9902786779001944, + "grad_norm": 0.8457385301589966, + "learning_rate": 3.909401103405307e-06, + "loss": 0.1304, + "step": 6112 + }, + { + "epoch": 0.9904406999351912, + "grad_norm": 0.8108502626419067, + "learning_rate": 3.9090398956165194e-06, + "loss": 0.1161, + "step": 6113 + }, + { + "epoch": 0.990602721970188, + "grad_norm": 0.9371310472488403, + "learning_rate": 3.908678644714522e-06, + "loss": 0.1596, + "step": 6114 + }, + { + "epoch": 0.9907647440051847, + "grad_norm": 0.8349683284759521, + "learning_rate": 3.908317350710366e-06, + "loss": 0.1356, + "step": 6115 + }, + { + "epoch": 0.9909267660401815, + "grad_norm": 0.8057882785797119, + "learning_rate": 3.907956013615108e-06, + "loss": 0.1202, + "step": 6116 + }, + { + "epoch": 0.9910887880751782, + "grad_norm": 0.8097211718559265, + "learning_rate": 3.907594633439803e-06, + "loss": 0.1269, + "step": 6117 + }, + { + "epoch": 0.991250810110175, + "grad_norm": 0.8821548819541931, + "learning_rate": 3.907233210195508e-06, + "loss": 0.1455, + "step": 6118 + }, + { + "epoch": 0.9914128321451717, + "grad_norm": 0.88327556848526, + "learning_rate": 3.906871743893283e-06, + "loss": 0.143, + "step": 6119 + }, + { + "epoch": 0.9915748541801686, + "grad_norm": 0.7924086451530457, + "learning_rate": 3.906510234544186e-06, + "loss": 0.1278, + "step": 6120 + }, + { + "epoch": 0.9917368762151653, + "grad_norm": 0.8668779134750366, + "learning_rate": 3.906148682159281e-06, + "loss": 0.1289, + "step": 6121 + }, + { + "epoch": 0.991898898250162, + "grad_norm": 0.8895964026451111, + "learning_rate": 3.905787086749628e-06, + "loss": 0.1447, + "step": 6122 + }, + { + "epoch": 0.9920609202851588, + "grad_norm": 0.8018289804458618, + "learning_rate": 3.905425448326293e-06, + "loss": 0.1378, + "step": 6123 + }, + { + "epoch": 0.9922229423201555, + "grad_norm": 0.8257730603218079, + "learning_rate": 3.90506376690034e-06, + "loss": 0.1348, + "step": 6124 + }, + { + "epoch": 0.9923849643551523, + "grad_norm": 0.8249266743659973, + "learning_rate": 3.9047020424828355e-06, + "loss": 0.1194, + "step": 6125 + }, + { + "epoch": 0.992546986390149, + "grad_norm": 0.8011137247085571, + "learning_rate": 3.904340275084848e-06, + "loss": 0.1286, + "step": 6126 + }, + { + "epoch": 0.9927090084251459, + "grad_norm": 0.8544657230377197, + "learning_rate": 3.903978464717446e-06, + "loss": 0.1367, + "step": 6127 + }, + { + "epoch": 0.9928710304601426, + "grad_norm": 0.8813843131065369, + "learning_rate": 3.9036166113917015e-06, + "loss": 0.1451, + "step": 6128 + }, + { + "epoch": 0.9930330524951393, + "grad_norm": 0.7664751410484314, + "learning_rate": 3.903254715118686e-06, + "loss": 0.1218, + "step": 6129 + }, + { + "epoch": 0.9931950745301361, + "grad_norm": 0.7715081572532654, + "learning_rate": 3.90289277590947e-06, + "loss": 0.1336, + "step": 6130 + }, + { + "epoch": 0.9933570965651328, + "grad_norm": 0.8148114085197449, + "learning_rate": 3.902530793775132e-06, + "loss": 0.1296, + "step": 6131 + }, + { + "epoch": 0.9935191186001296, + "grad_norm": 0.8578192591667175, + "learning_rate": 3.902168768726745e-06, + "loss": 0.1366, + "step": 6132 + }, + { + "epoch": 0.9936811406351264, + "grad_norm": 0.7358187437057495, + "learning_rate": 3.9018067007753865e-06, + "loss": 0.1169, + "step": 6133 + }, + { + "epoch": 0.9938431626701232, + "grad_norm": 0.7255628108978271, + "learning_rate": 3.9014445899321355e-06, + "loss": 0.1157, + "step": 6134 + }, + { + "epoch": 0.9940051847051199, + "grad_norm": 0.7971196174621582, + "learning_rate": 3.901082436208071e-06, + "loss": 0.1348, + "step": 6135 + }, + { + "epoch": 0.9941672067401166, + "grad_norm": 0.800977349281311, + "learning_rate": 3.900720239614275e-06, + "loss": 0.1239, + "step": 6136 + }, + { + "epoch": 0.9943292287751134, + "grad_norm": 0.8563368320465088, + "learning_rate": 3.90035800016183e-06, + "loss": 0.1358, + "step": 6137 + }, + { + "epoch": 0.9944912508101101, + "grad_norm": 0.7976403832435608, + "learning_rate": 3.899995717861818e-06, + "loss": 0.1327, + "step": 6138 + }, + { + "epoch": 0.994653272845107, + "grad_norm": 0.7980660200119019, + "learning_rate": 3.899633392725325e-06, + "loss": 0.1272, + "step": 6139 + }, + { + "epoch": 0.9948152948801037, + "grad_norm": 0.7953473329544067, + "learning_rate": 3.899271024763438e-06, + "loss": 0.135, + "step": 6140 + }, + { + "epoch": 0.9949773169151005, + "grad_norm": 0.7407287955284119, + "learning_rate": 3.898908613987243e-06, + "loss": 0.119, + "step": 6141 + }, + { + "epoch": 0.9951393389500972, + "grad_norm": 0.8958144187927246, + "learning_rate": 3.89854616040783e-06, + "loss": 0.1443, + "step": 6142 + }, + { + "epoch": 0.9953013609850939, + "grad_norm": 0.7830373644828796, + "learning_rate": 3.898183664036289e-06, + "loss": 0.1207, + "step": 6143 + }, + { + "epoch": 0.9954633830200907, + "grad_norm": 0.7853714227676392, + "learning_rate": 3.897821124883711e-06, + "loss": 0.1148, + "step": 6144 + }, + { + "epoch": 0.9956254050550875, + "grad_norm": 0.9703736305236816, + "learning_rate": 3.89745854296119e-06, + "loss": 0.1581, + "step": 6145 + }, + { + "epoch": 0.9957874270900843, + "grad_norm": 0.6939820051193237, + "learning_rate": 3.897095918279818e-06, + "loss": 0.11, + "step": 6146 + }, + { + "epoch": 0.995949449125081, + "grad_norm": 1.0396939516067505, + "learning_rate": 3.896733250850694e-06, + "loss": 0.1744, + "step": 6147 + }, + { + "epoch": 0.9961114711600778, + "grad_norm": 0.8816990256309509, + "learning_rate": 3.896370540684911e-06, + "loss": 0.1486, + "step": 6148 + }, + { + "epoch": 0.9962734931950745, + "grad_norm": 0.8170563578605652, + "learning_rate": 3.896007787793569e-06, + "loss": 0.1283, + "step": 6149 + }, + { + "epoch": 0.9964355152300713, + "grad_norm": 0.7984526753425598, + "learning_rate": 3.895644992187767e-06, + "loss": 0.1279, + "step": 6150 + }, + { + "epoch": 0.9965975372650681, + "grad_norm": 0.8505795001983643, + "learning_rate": 3.895282153878606e-06, + "loss": 0.1496, + "step": 6151 + }, + { + "epoch": 0.9967595593000648, + "grad_norm": 0.8687652349472046, + "learning_rate": 3.894919272877187e-06, + "loss": 0.1546, + "step": 6152 + }, + { + "epoch": 0.9969215813350616, + "grad_norm": 0.7245486974716187, + "learning_rate": 3.894556349194613e-06, + "loss": 0.1203, + "step": 6153 + }, + { + "epoch": 0.9970836033700583, + "grad_norm": 0.8453713059425354, + "learning_rate": 3.894193382841991e-06, + "loss": 0.1318, + "step": 6154 + }, + { + "epoch": 0.9972456254050551, + "grad_norm": 0.7163641452789307, + "learning_rate": 3.893830373830425e-06, + "loss": 0.1158, + "step": 6155 + }, + { + "epoch": 0.9974076474400518, + "grad_norm": 0.8621683120727539, + "learning_rate": 3.893467322171022e-06, + "loss": 0.1385, + "step": 6156 + }, + { + "epoch": 0.9975696694750487, + "grad_norm": 0.7639792561531067, + "learning_rate": 3.893104227874892e-06, + "loss": 0.1332, + "step": 6157 + }, + { + "epoch": 0.9977316915100454, + "grad_norm": 0.8099455237388611, + "learning_rate": 3.892741090953143e-06, + "loss": 0.1299, + "step": 6158 + }, + { + "epoch": 0.9978937135450421, + "grad_norm": 1.0411269664764404, + "learning_rate": 3.892377911416888e-06, + "loss": 0.1516, + "step": 6159 + }, + { + "epoch": 0.9980557355800389, + "grad_norm": 0.8103548288345337, + "learning_rate": 3.892014689277238e-06, + "loss": 0.1205, + "step": 6160 + }, + { + "epoch": 0.9982177576150356, + "grad_norm": 0.7883999943733215, + "learning_rate": 3.891651424545307e-06, + "loss": 0.1273, + "step": 6161 + }, + { + "epoch": 0.9983797796500324, + "grad_norm": 0.8199430704116821, + "learning_rate": 3.891288117232209e-06, + "loss": 0.1318, + "step": 6162 + }, + { + "epoch": 0.9985418016850292, + "grad_norm": 0.8376681208610535, + "learning_rate": 3.890924767349062e-06, + "loss": 0.1385, + "step": 6163 + }, + { + "epoch": 0.998703823720026, + "grad_norm": 0.8806131482124329, + "learning_rate": 3.890561374906985e-06, + "loss": 0.1461, + "step": 6164 + }, + { + "epoch": 0.9988658457550227, + "grad_norm": 0.7740499377250671, + "learning_rate": 3.8901979399170935e-06, + "loss": 0.1256, + "step": 6165 + }, + { + "epoch": 0.9990278677900194, + "grad_norm": 0.8058324456214905, + "learning_rate": 3.889834462390509e-06, + "loss": 0.123, + "step": 6166 + }, + { + "epoch": 0.9991898898250162, + "grad_norm": 0.8365480303764343, + "learning_rate": 3.889470942338354e-06, + "loss": 0.1291, + "step": 6167 + }, + { + "epoch": 0.9993519118600129, + "grad_norm": 0.8258197903633118, + "learning_rate": 3.889107379771749e-06, + "loss": 0.1342, + "step": 6168 + }, + { + "epoch": 0.9995139338950098, + "grad_norm": 0.8261362314224243, + "learning_rate": 3.888743774701822e-06, + "loss": 0.1371, + "step": 6169 + }, + { + "epoch": 0.9996759559300065, + "grad_norm": 0.8388069868087769, + "learning_rate": 3.888380127139695e-06, + "loss": 0.1249, + "step": 6170 + }, + { + "epoch": 0.9998379779650033, + "grad_norm": 0.9523016214370728, + "learning_rate": 3.888016437096497e-06, + "loss": 0.1439, + "step": 6171 + }, + { + "epoch": 1.0, + "grad_norm": 0.9021637439727783, + "learning_rate": 3.887652704583354e-06, + "loss": 0.1394, + "step": 6172 + }, + { + "epoch": 1.0001620220349967, + "grad_norm": 0.7438717484474182, + "learning_rate": 3.887288929611396e-06, + "loss": 0.1082, + "step": 6173 + }, + { + "epoch": 1.0003240440699934, + "grad_norm": 0.7213918566703796, + "learning_rate": 3.886925112191754e-06, + "loss": 0.1127, + "step": 6174 + }, + { + "epoch": 1.0004860661049904, + "grad_norm": 0.6617137789726257, + "learning_rate": 3.88656125233556e-06, + "loss": 0.0903, + "step": 6175 + }, + { + "epoch": 1.000648088139987, + "grad_norm": 0.8376255631446838, + "learning_rate": 3.886197350053948e-06, + "loss": 0.1015, + "step": 6176 + }, + { + "epoch": 1.0008101101749838, + "grad_norm": 0.7548001408576965, + "learning_rate": 3.88583340535805e-06, + "loss": 0.1017, + "step": 6177 + }, + { + "epoch": 1.0009721322099805, + "grad_norm": 0.7314820885658264, + "learning_rate": 3.885469418259005e-06, + "loss": 0.1011, + "step": 6178 + }, + { + "epoch": 1.0011341542449774, + "grad_norm": 0.7919092178344727, + "learning_rate": 3.885105388767948e-06, + "loss": 0.1008, + "step": 6179 + }, + { + "epoch": 1.0012961762799741, + "grad_norm": 0.7691347002983093, + "learning_rate": 3.8847413168960175e-06, + "loss": 0.1067, + "step": 6180 + }, + { + "epoch": 1.0014581983149708, + "grad_norm": 0.7515387535095215, + "learning_rate": 3.884377202654354e-06, + "loss": 0.0979, + "step": 6181 + }, + { + "epoch": 1.0016202203499676, + "grad_norm": 0.819832980632782, + "learning_rate": 3.884013046054098e-06, + "loss": 0.1017, + "step": 6182 + }, + { + "epoch": 1.0017822423849643, + "grad_norm": 0.7462260723114014, + "learning_rate": 3.883648847106393e-06, + "loss": 0.1009, + "step": 6183 + }, + { + "epoch": 1.0019442644199612, + "grad_norm": 0.8221039772033691, + "learning_rate": 3.8832846058223814e-06, + "loss": 0.1054, + "step": 6184 + }, + { + "epoch": 1.002106286454958, + "grad_norm": 0.8322626948356628, + "learning_rate": 3.882920322213207e-06, + "loss": 0.0982, + "step": 6185 + }, + { + "epoch": 1.0022683084899546, + "grad_norm": 0.8119961023330688, + "learning_rate": 3.882555996290019e-06, + "loss": 0.0944, + "step": 6186 + }, + { + "epoch": 1.0024303305249513, + "grad_norm": 1.05998957157135, + "learning_rate": 3.882191628063962e-06, + "loss": 0.1197, + "step": 6187 + }, + { + "epoch": 1.002592352559948, + "grad_norm": 0.9906516075134277, + "learning_rate": 3.881827217546187e-06, + "loss": 0.1164, + "step": 6188 + }, + { + "epoch": 1.002754374594945, + "grad_norm": 0.8134022355079651, + "learning_rate": 3.881462764747842e-06, + "loss": 0.0937, + "step": 6189 + }, + { + "epoch": 1.0029163966299417, + "grad_norm": 1.1540935039520264, + "learning_rate": 3.881098269680081e-06, + "loss": 0.1009, + "step": 6190 + }, + { + "epoch": 1.0030784186649384, + "grad_norm": 0.8985638618469238, + "learning_rate": 3.880733732354054e-06, + "loss": 0.1034, + "step": 6191 + }, + { + "epoch": 1.0032404406999351, + "grad_norm": 0.9259430766105652, + "learning_rate": 3.880369152780916e-06, + "loss": 0.1073, + "step": 6192 + }, + { + "epoch": 1.003402462734932, + "grad_norm": 0.8292042016983032, + "learning_rate": 3.880004530971823e-06, + "loss": 0.1135, + "step": 6193 + }, + { + "epoch": 1.0035644847699288, + "grad_norm": 0.7862805128097534, + "learning_rate": 3.879639866937931e-06, + "loss": 0.0931, + "step": 6194 + }, + { + "epoch": 1.0037265068049255, + "grad_norm": 0.7968874573707581, + "learning_rate": 3.879275160690397e-06, + "loss": 0.1072, + "step": 6195 + }, + { + "epoch": 1.0038885288399222, + "grad_norm": 0.7896031141281128, + "learning_rate": 3.8789104122403815e-06, + "loss": 0.0998, + "step": 6196 + }, + { + "epoch": 1.004050550874919, + "grad_norm": 0.8073359727859497, + "learning_rate": 3.878545621599043e-06, + "loss": 0.1079, + "step": 6197 + }, + { + "epoch": 1.0042125729099158, + "grad_norm": 0.871297299861908, + "learning_rate": 3.878180788777546e-06, + "loss": 0.116, + "step": 6198 + }, + { + "epoch": 1.0043745949449125, + "grad_norm": 0.7919323444366455, + "learning_rate": 3.877815913787052e-06, + "loss": 0.1082, + "step": 6199 + }, + { + "epoch": 1.0045366169799093, + "grad_norm": 0.8027322292327881, + "learning_rate": 3.877450996638725e-06, + "loss": 0.1072, + "step": 6200 + }, + { + "epoch": 1.004698639014906, + "grad_norm": 0.7986040115356445, + "learning_rate": 3.87708603734373e-06, + "loss": 0.1078, + "step": 6201 + }, + { + "epoch": 1.0048606610499027, + "grad_norm": 0.7671363949775696, + "learning_rate": 3.876721035913236e-06, + "loss": 0.1029, + "step": 6202 + }, + { + "epoch": 1.0050226830848996, + "grad_norm": 0.7083774209022522, + "learning_rate": 3.87635599235841e-06, + "loss": 0.0964, + "step": 6203 + }, + { + "epoch": 1.0051847051198963, + "grad_norm": 0.7173096537590027, + "learning_rate": 3.87599090669042e-06, + "loss": 0.0972, + "step": 6204 + }, + { + "epoch": 1.005346727154893, + "grad_norm": 0.8262728452682495, + "learning_rate": 3.8756257789204384e-06, + "loss": 0.1103, + "step": 6205 + }, + { + "epoch": 1.0055087491898898, + "grad_norm": 0.783673107624054, + "learning_rate": 3.875260609059638e-06, + "loss": 0.0982, + "step": 6206 + }, + { + "epoch": 1.0056707712248867, + "grad_norm": 0.7423332929611206, + "learning_rate": 3.8748953971191895e-06, + "loss": 0.089, + "step": 6207 + }, + { + "epoch": 1.0058327932598834, + "grad_norm": 0.7428287267684937, + "learning_rate": 3.87453014311027e-06, + "loss": 0.0988, + "step": 6208 + }, + { + "epoch": 1.00599481529488, + "grad_norm": 0.886810302734375, + "learning_rate": 3.874164847044054e-06, + "loss": 0.1142, + "step": 6209 + }, + { + "epoch": 1.0061568373298768, + "grad_norm": 0.8856390714645386, + "learning_rate": 3.87379950893172e-06, + "loss": 0.1035, + "step": 6210 + }, + { + "epoch": 1.0063188593648735, + "grad_norm": 0.7835367321968079, + "learning_rate": 3.873434128784444e-06, + "loss": 0.1062, + "step": 6211 + }, + { + "epoch": 1.0064808813998705, + "grad_norm": 0.9305019974708557, + "learning_rate": 3.8730687066134086e-06, + "loss": 0.1045, + "step": 6212 + }, + { + "epoch": 1.0066429034348672, + "grad_norm": 1.0636266469955444, + "learning_rate": 3.872703242429794e-06, + "loss": 0.1275, + "step": 6213 + }, + { + "epoch": 1.0068049254698639, + "grad_norm": 0.8035116195678711, + "learning_rate": 3.8723377362447805e-06, + "loss": 0.1081, + "step": 6214 + }, + { + "epoch": 1.0069669475048606, + "grad_norm": 0.7572904825210571, + "learning_rate": 3.871972188069554e-06, + "loss": 0.0901, + "step": 6215 + }, + { + "epoch": 1.0071289695398573, + "grad_norm": 0.8394489288330078, + "learning_rate": 3.871606597915298e-06, + "loss": 0.0988, + "step": 6216 + }, + { + "epoch": 1.0072909915748542, + "grad_norm": 0.9537179470062256, + "learning_rate": 3.871240965793201e-06, + "loss": 0.1117, + "step": 6217 + }, + { + "epoch": 1.007453013609851, + "grad_norm": 0.8602259755134583, + "learning_rate": 3.870875291714448e-06, + "loss": 0.1031, + "step": 6218 + }, + { + "epoch": 1.0076150356448477, + "grad_norm": 0.894371747970581, + "learning_rate": 3.870509575690228e-06, + "loss": 0.117, + "step": 6219 + }, + { + "epoch": 1.0077770576798444, + "grad_norm": 0.8317716121673584, + "learning_rate": 3.870143817731732e-06, + "loss": 0.104, + "step": 6220 + }, + { + "epoch": 1.0079390797148413, + "grad_norm": 0.8154535293579102, + "learning_rate": 3.86977801785015e-06, + "loss": 0.1053, + "step": 6221 + }, + { + "epoch": 1.008101101749838, + "grad_norm": 0.9335169196128845, + "learning_rate": 3.8694121760566765e-06, + "loss": 0.1125, + "step": 6222 + }, + { + "epoch": 1.0082631237848347, + "grad_norm": 0.8863696455955505, + "learning_rate": 3.869046292362504e-06, + "loss": 0.1095, + "step": 6223 + }, + { + "epoch": 1.0084251458198314, + "grad_norm": 0.7653059363365173, + "learning_rate": 3.868680366778828e-06, + "loss": 0.0922, + "step": 6224 + }, + { + "epoch": 1.0085871678548282, + "grad_norm": 0.7611356973648071, + "learning_rate": 3.868314399316845e-06, + "loss": 0.1027, + "step": 6225 + }, + { + "epoch": 1.008749189889825, + "grad_norm": 0.9434992671012878, + "learning_rate": 3.867948389987752e-06, + "loss": 0.1091, + "step": 6226 + }, + { + "epoch": 1.0089112119248218, + "grad_norm": 0.8308627605438232, + "learning_rate": 3.86758233880275e-06, + "loss": 0.1032, + "step": 6227 + }, + { + "epoch": 1.0090732339598185, + "grad_norm": 1.1444343328475952, + "learning_rate": 3.8672162457730365e-06, + "loss": 0.1183, + "step": 6228 + }, + { + "epoch": 1.0092352559948152, + "grad_norm": 0.9171835780143738, + "learning_rate": 3.866850110909816e-06, + "loss": 0.1025, + "step": 6229 + }, + { + "epoch": 1.0093972780298122, + "grad_norm": 0.8272443413734436, + "learning_rate": 3.866483934224288e-06, + "loss": 0.1077, + "step": 6230 + }, + { + "epoch": 1.0095593000648089, + "grad_norm": 0.7675562500953674, + "learning_rate": 3.866117715727659e-06, + "loss": 0.0994, + "step": 6231 + }, + { + "epoch": 1.0097213220998056, + "grad_norm": 0.8303428888320923, + "learning_rate": 3.865751455431134e-06, + "loss": 0.1019, + "step": 6232 + }, + { + "epoch": 1.0098833441348023, + "grad_norm": 0.9183821082115173, + "learning_rate": 3.86538515334592e-06, + "loss": 0.1054, + "step": 6233 + }, + { + "epoch": 1.010045366169799, + "grad_norm": 0.8656824827194214, + "learning_rate": 3.865018809483224e-06, + "loss": 0.1171, + "step": 6234 + }, + { + "epoch": 1.010207388204796, + "grad_norm": 0.8201124668121338, + "learning_rate": 3.864652423854256e-06, + "loss": 0.1118, + "step": 6235 + }, + { + "epoch": 1.0103694102397927, + "grad_norm": 0.8729208111763, + "learning_rate": 3.864285996470226e-06, + "loss": 0.1078, + "step": 6236 + }, + { + "epoch": 1.0105314322747894, + "grad_norm": 0.8456924557685852, + "learning_rate": 3.863919527342346e-06, + "loss": 0.1091, + "step": 6237 + }, + { + "epoch": 1.010693454309786, + "grad_norm": 0.9836832284927368, + "learning_rate": 3.863553016481829e-06, + "loss": 0.1226, + "step": 6238 + }, + { + "epoch": 1.0108554763447828, + "grad_norm": 0.7698848843574524, + "learning_rate": 3.863186463899891e-06, + "loss": 0.1032, + "step": 6239 + }, + { + "epoch": 1.0110174983797797, + "grad_norm": 0.8878111839294434, + "learning_rate": 3.862819869607743e-06, + "loss": 0.1039, + "step": 6240 + }, + { + "epoch": 1.0111795204147764, + "grad_norm": 0.7739588618278503, + "learning_rate": 3.862453233616608e-06, + "loss": 0.1052, + "step": 6241 + }, + { + "epoch": 1.0113415424497731, + "grad_norm": 0.7633801698684692, + "learning_rate": 3.862086555937699e-06, + "loss": 0.0993, + "step": 6242 + }, + { + "epoch": 1.0115035644847699, + "grad_norm": 0.8758237957954407, + "learning_rate": 3.861719836582239e-06, + "loss": 0.1156, + "step": 6243 + }, + { + "epoch": 1.0116655865197668, + "grad_norm": 0.8216294050216675, + "learning_rate": 3.861353075561446e-06, + "loss": 0.1035, + "step": 6244 + }, + { + "epoch": 1.0118276085547635, + "grad_norm": 0.9009713530540466, + "learning_rate": 3.860986272886545e-06, + "loss": 0.1155, + "step": 6245 + }, + { + "epoch": 1.0119896305897602, + "grad_norm": 0.8081684112548828, + "learning_rate": 3.860619428568756e-06, + "loss": 0.1059, + "step": 6246 + }, + { + "epoch": 1.012151652624757, + "grad_norm": 0.6876736283302307, + "learning_rate": 3.860252542619305e-06, + "loss": 0.0884, + "step": 6247 + }, + { + "epoch": 1.0123136746597536, + "grad_norm": 0.8331660032272339, + "learning_rate": 3.859885615049419e-06, + "loss": 0.1155, + "step": 6248 + }, + { + "epoch": 1.0124756966947506, + "grad_norm": 0.8493173122406006, + "learning_rate": 3.859518645870323e-06, + "loss": 0.1121, + "step": 6249 + }, + { + "epoch": 1.0126377187297473, + "grad_norm": 0.7302095293998718, + "learning_rate": 3.8591516350932476e-06, + "loss": 0.0871, + "step": 6250 + }, + { + "epoch": 1.012799740764744, + "grad_norm": 0.8454660773277283, + "learning_rate": 3.85878458272942e-06, + "loss": 0.1086, + "step": 6251 + }, + { + "epoch": 1.0129617627997407, + "grad_norm": 0.786479651927948, + "learning_rate": 3.8584174887900735e-06, + "loss": 0.0989, + "step": 6252 + }, + { + "epoch": 1.0131237848347374, + "grad_norm": 0.8221831917762756, + "learning_rate": 3.858050353286439e-06, + "loss": 0.104, + "step": 6253 + }, + { + "epoch": 1.0132858068697344, + "grad_norm": 0.7293557524681091, + "learning_rate": 3.8576831762297495e-06, + "loss": 0.0906, + "step": 6254 + }, + { + "epoch": 1.013447828904731, + "grad_norm": 0.7902601361274719, + "learning_rate": 3.85731595763124e-06, + "loss": 0.1, + "step": 6255 + }, + { + "epoch": 1.0136098509397278, + "grad_norm": 0.8392426371574402, + "learning_rate": 3.856948697502148e-06, + "loss": 0.1109, + "step": 6256 + }, + { + "epoch": 1.0137718729747245, + "grad_norm": 0.7996932864189148, + "learning_rate": 3.856581395853709e-06, + "loss": 0.1005, + "step": 6257 + }, + { + "epoch": 1.0139338950097214, + "grad_norm": 0.7863364815711975, + "learning_rate": 3.8562140526971625e-06, + "loss": 0.0929, + "step": 6258 + }, + { + "epoch": 1.0140959170447181, + "grad_norm": 0.7616758942604065, + "learning_rate": 3.855846668043747e-06, + "loss": 0.0958, + "step": 6259 + }, + { + "epoch": 1.0142579390797148, + "grad_norm": 0.8799887299537659, + "learning_rate": 3.855479241904705e-06, + "loss": 0.0998, + "step": 6260 + }, + { + "epoch": 1.0144199611147116, + "grad_norm": 0.8284506797790527, + "learning_rate": 3.855111774291279e-06, + "loss": 0.1061, + "step": 6261 + }, + { + "epoch": 1.0145819831497083, + "grad_norm": 0.7687853574752808, + "learning_rate": 3.8547442652147115e-06, + "loss": 0.0868, + "step": 6262 + }, + { + "epoch": 1.0147440051847052, + "grad_norm": 0.8260930776596069, + "learning_rate": 3.854376714686249e-06, + "loss": 0.1064, + "step": 6263 + }, + { + "epoch": 1.014906027219702, + "grad_norm": 0.7864606380462646, + "learning_rate": 3.854009122717135e-06, + "loss": 0.1048, + "step": 6264 + }, + { + "epoch": 1.0150680492546986, + "grad_norm": 0.8198474645614624, + "learning_rate": 3.853641489318619e-06, + "loss": 0.1052, + "step": 6265 + }, + { + "epoch": 1.0152300712896953, + "grad_norm": 0.8373472690582275, + "learning_rate": 3.8532738145019484e-06, + "loss": 0.1046, + "step": 6266 + }, + { + "epoch": 1.0153920933246923, + "grad_norm": 0.8073359131813049, + "learning_rate": 3.8529060982783756e-06, + "loss": 0.0921, + "step": 6267 + }, + { + "epoch": 1.015554115359689, + "grad_norm": 1.0620416402816772, + "learning_rate": 3.852538340659149e-06, + "loss": 0.1189, + "step": 6268 + }, + { + "epoch": 1.0157161373946857, + "grad_norm": 0.9277188181877136, + "learning_rate": 3.852170541655523e-06, + "loss": 0.111, + "step": 6269 + }, + { + "epoch": 1.0158781594296824, + "grad_norm": 0.8073578476905823, + "learning_rate": 3.85180270127875e-06, + "loss": 0.0924, + "step": 6270 + }, + { + "epoch": 1.0160401814646791, + "grad_norm": 0.8376033902168274, + "learning_rate": 3.8514348195400854e-06, + "loss": 0.1046, + "step": 6271 + }, + { + "epoch": 1.016202203499676, + "grad_norm": 0.7943440675735474, + "learning_rate": 3.851066896450787e-06, + "loss": 0.0997, + "step": 6272 + }, + { + "epoch": 1.0163642255346728, + "grad_norm": 0.8442563414573669, + "learning_rate": 3.85069893202211e-06, + "loss": 0.1051, + "step": 6273 + }, + { + "epoch": 1.0165262475696695, + "grad_norm": 0.8523194789886475, + "learning_rate": 3.850330926265314e-06, + "loss": 0.1097, + "step": 6274 + }, + { + "epoch": 1.0166882696046662, + "grad_norm": 0.6826653480529785, + "learning_rate": 3.849962879191661e-06, + "loss": 0.0823, + "step": 6275 + }, + { + "epoch": 1.016850291639663, + "grad_norm": 0.782434344291687, + "learning_rate": 3.849594790812409e-06, + "loss": 0.1043, + "step": 6276 + }, + { + "epoch": 1.0170123136746598, + "grad_norm": 0.8440970182418823, + "learning_rate": 3.849226661138823e-06, + "loss": 0.1049, + "step": 6277 + }, + { + "epoch": 1.0171743357096565, + "grad_norm": 0.809670090675354, + "learning_rate": 3.848858490182167e-06, + "loss": 0.1051, + "step": 6278 + }, + { + "epoch": 1.0173363577446533, + "grad_norm": 0.7815456390380859, + "learning_rate": 3.848490277953704e-06, + "loss": 0.0917, + "step": 6279 + }, + { + "epoch": 1.01749837977965, + "grad_norm": 0.8251093626022339, + "learning_rate": 3.8481220244647025e-06, + "loss": 0.1014, + "step": 6280 + }, + { + "epoch": 1.017660401814647, + "grad_norm": 0.7042380571365356, + "learning_rate": 3.84775372972643e-06, + "loss": 0.078, + "step": 6281 + }, + { + "epoch": 1.0178224238496436, + "grad_norm": 0.795159101486206, + "learning_rate": 3.847385393750154e-06, + "loss": 0.1048, + "step": 6282 + }, + { + "epoch": 1.0179844458846403, + "grad_norm": 0.8827878832817078, + "learning_rate": 3.847017016547146e-06, + "loss": 0.1123, + "step": 6283 + }, + { + "epoch": 1.018146467919637, + "grad_norm": 0.7256340980529785, + "learning_rate": 3.846648598128677e-06, + "loss": 0.0877, + "step": 6284 + }, + { + "epoch": 1.0183084899546337, + "grad_norm": 1.0416500568389893, + "learning_rate": 3.846280138506019e-06, + "loss": 0.1112, + "step": 6285 + }, + { + "epoch": 1.0184705119896307, + "grad_norm": 0.8687002062797546, + "learning_rate": 3.8459116376904475e-06, + "loss": 0.1075, + "step": 6286 + }, + { + "epoch": 1.0186325340246274, + "grad_norm": 0.8167076706886292, + "learning_rate": 3.845543095693236e-06, + "loss": 0.0987, + "step": 6287 + }, + { + "epoch": 1.018794556059624, + "grad_norm": 0.7660663723945618, + "learning_rate": 3.8451745125256635e-06, + "loss": 0.1017, + "step": 6288 + }, + { + "epoch": 1.0189565780946208, + "grad_norm": 0.80263751745224, + "learning_rate": 3.8448058881990055e-06, + "loss": 0.0947, + "step": 6289 + }, + { + "epoch": 1.0191186001296175, + "grad_norm": 0.9206402897834778, + "learning_rate": 3.8444372227245415e-06, + "loss": 0.1107, + "step": 6290 + }, + { + "epoch": 1.0192806221646145, + "grad_norm": 0.9525105953216553, + "learning_rate": 3.8440685161135514e-06, + "loss": 0.1077, + "step": 6291 + }, + { + "epoch": 1.0194426441996112, + "grad_norm": 0.7992419004440308, + "learning_rate": 3.843699768377318e-06, + "loss": 0.1066, + "step": 6292 + }, + { + "epoch": 1.0196046662346079, + "grad_norm": 0.7668746709823608, + "learning_rate": 3.843330979527124e-06, + "loss": 0.0952, + "step": 6293 + }, + { + "epoch": 1.0197666882696046, + "grad_norm": 0.8306681513786316, + "learning_rate": 3.842962149574252e-06, + "loss": 0.1061, + "step": 6294 + }, + { + "epoch": 1.0199287103046015, + "grad_norm": 0.9119642376899719, + "learning_rate": 3.8425932785299875e-06, + "loss": 0.1174, + "step": 6295 + }, + { + "epoch": 1.0200907323395982, + "grad_norm": 0.8632789254188538, + "learning_rate": 3.842224366405619e-06, + "loss": 0.1145, + "step": 6296 + }, + { + "epoch": 1.020252754374595, + "grad_norm": 0.8445510864257812, + "learning_rate": 3.841855413212432e-06, + "loss": 0.1029, + "step": 6297 + }, + { + "epoch": 1.0204147764095917, + "grad_norm": 0.8951784372329712, + "learning_rate": 3.841486418961717e-06, + "loss": 0.1144, + "step": 6298 + }, + { + "epoch": 1.0205767984445884, + "grad_norm": 1.5688501596450806, + "learning_rate": 3.841117383664763e-06, + "loss": 0.0966, + "step": 6299 + }, + { + "epoch": 1.0207388204795853, + "grad_norm": 0.9988122582435608, + "learning_rate": 3.840748307332865e-06, + "loss": 0.1283, + "step": 6300 + }, + { + "epoch": 1.020900842514582, + "grad_norm": 0.7491373419761658, + "learning_rate": 3.84037918997731e-06, + "loss": 0.0965, + "step": 6301 + }, + { + "epoch": 1.0210628645495787, + "grad_norm": 0.8068737387657166, + "learning_rate": 3.840010031609398e-06, + "loss": 0.0956, + "step": 6302 + }, + { + "epoch": 1.0212248865845754, + "grad_norm": 0.8668238520622253, + "learning_rate": 3.839640832240421e-06, + "loss": 0.1107, + "step": 6303 + }, + { + "epoch": 1.0213869086195722, + "grad_norm": 0.9297085404396057, + "learning_rate": 3.8392715918816755e-06, + "loss": 0.1196, + "step": 6304 + }, + { + "epoch": 1.021548930654569, + "grad_norm": 0.7360817193984985, + "learning_rate": 3.8389023105444625e-06, + "loss": 0.0893, + "step": 6305 + }, + { + "epoch": 1.0217109526895658, + "grad_norm": 0.8899022340774536, + "learning_rate": 3.838532988240077e-06, + "loss": 0.1111, + "step": 6306 + }, + { + "epoch": 1.0218729747245625, + "grad_norm": 0.6964313387870789, + "learning_rate": 3.838163624979822e-06, + "loss": 0.0904, + "step": 6307 + }, + { + "epoch": 1.0220349967595592, + "grad_norm": 0.8330636024475098, + "learning_rate": 3.837794220774998e-06, + "loss": 0.0966, + "step": 6308 + }, + { + "epoch": 1.0221970187945562, + "grad_norm": 0.9185472726821899, + "learning_rate": 3.837424775636908e-06, + "loss": 0.1158, + "step": 6309 + }, + { + "epoch": 1.0223590408295529, + "grad_norm": 0.8500825762748718, + "learning_rate": 3.8370552895768565e-06, + "loss": 0.1024, + "step": 6310 + }, + { + "epoch": 1.0225210628645496, + "grad_norm": 0.9668059349060059, + "learning_rate": 3.836685762606149e-06, + "loss": 0.1048, + "step": 6311 + }, + { + "epoch": 1.0226830848995463, + "grad_norm": 0.9085898399353027, + "learning_rate": 3.836316194736093e-06, + "loss": 0.1038, + "step": 6312 + }, + { + "epoch": 1.022845106934543, + "grad_norm": 0.829475998878479, + "learning_rate": 3.8359465859779934e-06, + "loss": 0.1031, + "step": 6313 + }, + { + "epoch": 1.02300712896954, + "grad_norm": 0.946996808052063, + "learning_rate": 3.835576936343162e-06, + "loss": 0.107, + "step": 6314 + }, + { + "epoch": 1.0231691510045366, + "grad_norm": 0.9723039865493774, + "learning_rate": 3.835207245842908e-06, + "loss": 0.1155, + "step": 6315 + }, + { + "epoch": 1.0233311730395334, + "grad_norm": 0.9182832837104797, + "learning_rate": 3.8348375144885445e-06, + "loss": 0.1155, + "step": 6316 + }, + { + "epoch": 1.02349319507453, + "grad_norm": 1.0076240301132202, + "learning_rate": 3.834467742291382e-06, + "loss": 0.12, + "step": 6317 + }, + { + "epoch": 1.023655217109527, + "grad_norm": 0.7207759618759155, + "learning_rate": 3.834097929262737e-06, + "loss": 0.0801, + "step": 6318 + }, + { + "epoch": 1.0238172391445237, + "grad_norm": 0.9244574904441833, + "learning_rate": 3.833728075413923e-06, + "loss": 0.1174, + "step": 6319 + }, + { + "epoch": 1.0239792611795204, + "grad_norm": 0.7919167280197144, + "learning_rate": 3.833358180756258e-06, + "loss": 0.0981, + "step": 6320 + }, + { + "epoch": 1.0241412832145171, + "grad_norm": 0.7644730806350708, + "learning_rate": 3.832988245301058e-06, + "loss": 0.0937, + "step": 6321 + }, + { + "epoch": 1.0243033052495139, + "grad_norm": 0.8419947624206543, + "learning_rate": 3.832618269059645e-06, + "loss": 0.0974, + "step": 6322 + }, + { + "epoch": 1.0244653272845108, + "grad_norm": 0.7853063941001892, + "learning_rate": 3.832248252043338e-06, + "loss": 0.0995, + "step": 6323 + }, + { + "epoch": 1.0246273493195075, + "grad_norm": 0.9024096727371216, + "learning_rate": 3.831878194263458e-06, + "loss": 0.1134, + "step": 6324 + }, + { + "epoch": 1.0247893713545042, + "grad_norm": 0.8228088021278381, + "learning_rate": 3.831508095731328e-06, + "loss": 0.1003, + "step": 6325 + }, + { + "epoch": 1.024951393389501, + "grad_norm": 0.798202395439148, + "learning_rate": 3.831137956458272e-06, + "loss": 0.0996, + "step": 6326 + }, + { + "epoch": 1.0251134154244976, + "grad_norm": 1.0323954820632935, + "learning_rate": 3.830767776455617e-06, + "loss": 0.1177, + "step": 6327 + }, + { + "epoch": 1.0252754374594946, + "grad_norm": 0.8482728004455566, + "learning_rate": 3.830397555734687e-06, + "loss": 0.1058, + "step": 6328 + }, + { + "epoch": 1.0254374594944913, + "grad_norm": 0.8211125731468201, + "learning_rate": 3.830027294306813e-06, + "loss": 0.1168, + "step": 6329 + }, + { + "epoch": 1.025599481529488, + "grad_norm": 0.8774312138557434, + "learning_rate": 3.8296569921833214e-06, + "loss": 0.1134, + "step": 6330 + }, + { + "epoch": 1.0257615035644847, + "grad_norm": 0.8460497856140137, + "learning_rate": 3.829286649375544e-06, + "loss": 0.1064, + "step": 6331 + }, + { + "epoch": 1.0259235255994816, + "grad_norm": 0.8636878728866577, + "learning_rate": 3.8289162658948114e-06, + "loss": 0.0956, + "step": 6332 + }, + { + "epoch": 1.0260855476344783, + "grad_norm": 0.8535585403442383, + "learning_rate": 3.828545841752457e-06, + "loss": 0.1093, + "step": 6333 + }, + { + "epoch": 1.026247569669475, + "grad_norm": 0.9328159093856812, + "learning_rate": 3.828175376959815e-06, + "loss": 0.1123, + "step": 6334 + }, + { + "epoch": 1.0264095917044718, + "grad_norm": 0.8551136255264282, + "learning_rate": 3.827804871528221e-06, + "loss": 0.0988, + "step": 6335 + }, + { + "epoch": 1.0265716137394685, + "grad_norm": 0.7710674405097961, + "learning_rate": 3.827434325469011e-06, + "loss": 0.0962, + "step": 6336 + }, + { + "epoch": 1.0267336357744654, + "grad_norm": 0.7422839999198914, + "learning_rate": 3.827063738793523e-06, + "loss": 0.0941, + "step": 6337 + }, + { + "epoch": 1.0268956578094621, + "grad_norm": 0.8090795874595642, + "learning_rate": 3.8266931115130955e-06, + "loss": 0.0914, + "step": 6338 + }, + { + "epoch": 1.0270576798444588, + "grad_norm": 0.8428863286972046, + "learning_rate": 3.82632244363907e-06, + "loss": 0.1066, + "step": 6339 + }, + { + "epoch": 1.0272197018794555, + "grad_norm": 0.8122255802154541, + "learning_rate": 3.8259517351827866e-06, + "loss": 0.1008, + "step": 6340 + }, + { + "epoch": 1.0273817239144523, + "grad_norm": 0.9258431792259216, + "learning_rate": 3.8255809861555895e-06, + "loss": 0.1109, + "step": 6341 + }, + { + "epoch": 1.0275437459494492, + "grad_norm": 0.886329174041748, + "learning_rate": 3.825210196568823e-06, + "loss": 0.1144, + "step": 6342 + }, + { + "epoch": 1.027705767984446, + "grad_norm": 0.9017022848129272, + "learning_rate": 3.824839366433829e-06, + "loss": 0.1152, + "step": 6343 + }, + { + "epoch": 1.0278677900194426, + "grad_norm": 0.8935456871986389, + "learning_rate": 3.824468495761958e-06, + "loss": 0.0945, + "step": 6344 + }, + { + "epoch": 1.0280298120544393, + "grad_norm": 0.8333491683006287, + "learning_rate": 3.824097584564556e-06, + "loss": 0.1041, + "step": 6345 + }, + { + "epoch": 1.0281918340894363, + "grad_norm": 0.7439111471176147, + "learning_rate": 3.823726632852972e-06, + "loss": 0.0931, + "step": 6346 + }, + { + "epoch": 1.028353856124433, + "grad_norm": 0.8706629276275635, + "learning_rate": 3.823355640638557e-06, + "loss": 0.1069, + "step": 6347 + }, + { + "epoch": 1.0285158781594297, + "grad_norm": 0.8351131081581116, + "learning_rate": 3.822984607932661e-06, + "loss": 0.1065, + "step": 6348 + }, + { + "epoch": 1.0286779001944264, + "grad_norm": 0.7511221766471863, + "learning_rate": 3.822613534746638e-06, + "loss": 0.0905, + "step": 6349 + }, + { + "epoch": 1.028839922229423, + "grad_norm": 0.9029641151428223, + "learning_rate": 3.8222424210918404e-06, + "loss": 0.1051, + "step": 6350 + }, + { + "epoch": 1.02900194426442, + "grad_norm": 0.770819902420044, + "learning_rate": 3.821871266979626e-06, + "loss": 0.0997, + "step": 6351 + }, + { + "epoch": 1.0291639662994168, + "grad_norm": 0.9414584040641785, + "learning_rate": 3.821500072421349e-06, + "loss": 0.1117, + "step": 6352 + }, + { + "epoch": 1.0293259883344135, + "grad_norm": 0.836219072341919, + "learning_rate": 3.821128837428368e-06, + "loss": 0.1091, + "step": 6353 + }, + { + "epoch": 1.0294880103694102, + "grad_norm": 0.9013179540634155, + "learning_rate": 3.820757562012042e-06, + "loss": 0.1025, + "step": 6354 + }, + { + "epoch": 1.029650032404407, + "grad_norm": 0.7845829725265503, + "learning_rate": 3.82038624618373e-06, + "loss": 0.1003, + "step": 6355 + }, + { + "epoch": 1.0298120544394038, + "grad_norm": 0.7390191555023193, + "learning_rate": 3.820014889954794e-06, + "loss": 0.0994, + "step": 6356 + }, + { + "epoch": 1.0299740764744005, + "grad_norm": 0.8984519243240356, + "learning_rate": 3.819643493336598e-06, + "loss": 0.1132, + "step": 6357 + }, + { + "epoch": 1.0301360985093972, + "grad_norm": 0.8943284749984741, + "learning_rate": 3.819272056340504e-06, + "loss": 0.1086, + "step": 6358 + }, + { + "epoch": 1.030298120544394, + "grad_norm": 0.9379931688308716, + "learning_rate": 3.818900578977877e-06, + "loss": 0.1062, + "step": 6359 + }, + { + "epoch": 1.030460142579391, + "grad_norm": 0.8385069370269775, + "learning_rate": 3.818529061260084e-06, + "loss": 0.1045, + "step": 6360 + }, + { + "epoch": 1.0306221646143876, + "grad_norm": 0.8240720629692078, + "learning_rate": 3.8181575031984935e-06, + "loss": 0.0985, + "step": 6361 + }, + { + "epoch": 1.0307841866493843, + "grad_norm": 0.8299695253372192, + "learning_rate": 3.817785904804473e-06, + "loss": 0.1021, + "step": 6362 + }, + { + "epoch": 1.030946208684381, + "grad_norm": 0.7998603582382202, + "learning_rate": 3.817414266089392e-06, + "loss": 0.1041, + "step": 6363 + }, + { + "epoch": 1.0311082307193777, + "grad_norm": 0.9111914038658142, + "learning_rate": 3.817042587064623e-06, + "loss": 0.1054, + "step": 6364 + }, + { + "epoch": 1.0312702527543747, + "grad_norm": 0.7303438186645508, + "learning_rate": 3.816670867741538e-06, + "loss": 0.0943, + "step": 6365 + }, + { + "epoch": 1.0314322747893714, + "grad_norm": 0.8028538227081299, + "learning_rate": 3.81629910813151e-06, + "loss": 0.0959, + "step": 6366 + }, + { + "epoch": 1.031594296824368, + "grad_norm": 0.7954698204994202, + "learning_rate": 3.815927308245917e-06, + "loss": 0.1083, + "step": 6367 + }, + { + "epoch": 1.0317563188593648, + "grad_norm": 0.7787458896636963, + "learning_rate": 3.815555468096131e-06, + "loss": 0.1001, + "step": 6368 + }, + { + "epoch": 1.0319183408943617, + "grad_norm": 0.8813876509666443, + "learning_rate": 3.815183587693531e-06, + "loss": 0.1022, + "step": 6369 + }, + { + "epoch": 1.0320803629293585, + "grad_norm": 0.9205014705657959, + "learning_rate": 3.814811667049497e-06, + "loss": 0.1054, + "step": 6370 + }, + { + "epoch": 1.0322423849643552, + "grad_norm": 0.7909181714057922, + "learning_rate": 3.8144397061754066e-06, + "loss": 0.1005, + "step": 6371 + }, + { + "epoch": 1.0324044069993519, + "grad_norm": 0.7991738319396973, + "learning_rate": 3.814067705082643e-06, + "loss": 0.0988, + "step": 6372 + }, + { + "epoch": 1.0325664290343486, + "grad_norm": 0.8876237869262695, + "learning_rate": 3.8136956637825878e-06, + "loss": 0.1064, + "step": 6373 + }, + { + "epoch": 1.0327284510693455, + "grad_norm": 1.0412876605987549, + "learning_rate": 3.8133235822866234e-06, + "loss": 0.1069, + "step": 6374 + }, + { + "epoch": 1.0328904731043422, + "grad_norm": 0.9145109057426453, + "learning_rate": 3.812951460606136e-06, + "loss": 0.0998, + "step": 6375 + }, + { + "epoch": 1.033052495139339, + "grad_norm": 0.7650631070137024, + "learning_rate": 3.812579298752511e-06, + "loss": 0.0925, + "step": 6376 + }, + { + "epoch": 1.0332145171743357, + "grad_norm": 0.7742469310760498, + "learning_rate": 3.812207096737137e-06, + "loss": 0.0951, + "step": 6377 + }, + { + "epoch": 1.0333765392093324, + "grad_norm": 0.900032639503479, + "learning_rate": 3.8118348545714e-06, + "loss": 0.1075, + "step": 6378 + }, + { + "epoch": 1.0335385612443293, + "grad_norm": 0.8306341767311096, + "learning_rate": 3.811462572266691e-06, + "loss": 0.1033, + "step": 6379 + }, + { + "epoch": 1.033700583279326, + "grad_norm": 0.9517765641212463, + "learning_rate": 3.8110902498344023e-06, + "loss": 0.1076, + "step": 6380 + }, + { + "epoch": 1.0338626053143227, + "grad_norm": 0.8399965763092041, + "learning_rate": 3.810717887285923e-06, + "loss": 0.0931, + "step": 6381 + }, + { + "epoch": 1.0340246273493194, + "grad_norm": 1.0168367624282837, + "learning_rate": 3.8103454846326493e-06, + "loss": 0.103, + "step": 6382 + }, + { + "epoch": 1.0341866493843164, + "grad_norm": 0.8459004163742065, + "learning_rate": 3.8099730418859743e-06, + "loss": 0.1079, + "step": 6383 + }, + { + "epoch": 1.034348671419313, + "grad_norm": 0.8548713326454163, + "learning_rate": 3.809600559057295e-06, + "loss": 0.0932, + "step": 6384 + }, + { + "epoch": 1.0345106934543098, + "grad_norm": 0.986228346824646, + "learning_rate": 3.809228036158007e-06, + "loss": 0.1074, + "step": 6385 + }, + { + "epoch": 1.0346727154893065, + "grad_norm": 0.919900119304657, + "learning_rate": 3.80885547319951e-06, + "loss": 0.1051, + "step": 6386 + }, + { + "epoch": 1.0348347375243032, + "grad_norm": 0.8741225004196167, + "learning_rate": 3.808482870193202e-06, + "loss": 0.1084, + "step": 6387 + }, + { + "epoch": 1.0349967595593002, + "grad_norm": 0.898740828037262, + "learning_rate": 3.808110227150485e-06, + "loss": 0.0994, + "step": 6388 + }, + { + "epoch": 1.0351587815942969, + "grad_norm": 0.8765122890472412, + "learning_rate": 3.8077375440827613e-06, + "loss": 0.1025, + "step": 6389 + }, + { + "epoch": 1.0353208036292936, + "grad_norm": 0.8248468637466431, + "learning_rate": 3.8073648210014323e-06, + "loss": 0.0955, + "step": 6390 + }, + { + "epoch": 1.0354828256642903, + "grad_norm": 0.7942231297492981, + "learning_rate": 3.8069920579179042e-06, + "loss": 0.0982, + "step": 6391 + }, + { + "epoch": 1.035644847699287, + "grad_norm": 0.8907127380371094, + "learning_rate": 3.806619254843582e-06, + "loss": 0.1016, + "step": 6392 + }, + { + "epoch": 1.035806869734284, + "grad_norm": 0.9660011529922485, + "learning_rate": 3.806246411789872e-06, + "loss": 0.1185, + "step": 6393 + }, + { + "epoch": 1.0359688917692806, + "grad_norm": 0.7657979130744934, + "learning_rate": 3.8058735287681835e-06, + "loss": 0.0972, + "step": 6394 + }, + { + "epoch": 1.0361309138042774, + "grad_norm": 0.8937506675720215, + "learning_rate": 3.8055006057899254e-06, + "loss": 0.1065, + "step": 6395 + }, + { + "epoch": 1.036292935839274, + "grad_norm": 0.789402425289154, + "learning_rate": 3.8051276428665074e-06, + "loss": 0.091, + "step": 6396 + }, + { + "epoch": 1.036454957874271, + "grad_norm": 0.9826111197471619, + "learning_rate": 3.8047546400093425e-06, + "loss": 0.1106, + "step": 6397 + }, + { + "epoch": 1.0366169799092677, + "grad_norm": 0.8755872249603271, + "learning_rate": 3.8043815972298424e-06, + "loss": 0.114, + "step": 6398 + }, + { + "epoch": 1.0367790019442644, + "grad_norm": 0.8057990074157715, + "learning_rate": 3.8040085145394224e-06, + "loss": 0.1052, + "step": 6399 + }, + { + "epoch": 1.0369410239792611, + "grad_norm": 0.8804965615272522, + "learning_rate": 3.8036353919494973e-06, + "loss": 0.1134, + "step": 6400 + }, + { + "epoch": 1.0371030460142578, + "grad_norm": 0.8010181784629822, + "learning_rate": 3.8032622294714837e-06, + "loss": 0.1062, + "step": 6401 + }, + { + "epoch": 1.0372650680492548, + "grad_norm": 0.7778938412666321, + "learning_rate": 3.8028890271168e-06, + "loss": 0.0924, + "step": 6402 + }, + { + "epoch": 1.0374270900842515, + "grad_norm": 0.7856367826461792, + "learning_rate": 3.8025157848968653e-06, + "loss": 0.1058, + "step": 6403 + }, + { + "epoch": 1.0375891121192482, + "grad_norm": 0.6976110339164734, + "learning_rate": 3.8021425028230994e-06, + "loss": 0.0898, + "step": 6404 + }, + { + "epoch": 1.037751134154245, + "grad_norm": 0.7459751963615417, + "learning_rate": 3.8017691809069234e-06, + "loss": 0.0889, + "step": 6405 + }, + { + "epoch": 1.0379131561892416, + "grad_norm": 0.7587976455688477, + "learning_rate": 3.801395819159761e-06, + "loss": 0.0969, + "step": 6406 + }, + { + "epoch": 1.0380751782242386, + "grad_norm": 0.7939549684524536, + "learning_rate": 3.8010224175930366e-06, + "loss": 0.0979, + "step": 6407 + }, + { + "epoch": 1.0382372002592353, + "grad_norm": 0.7911587953567505, + "learning_rate": 3.8006489762181744e-06, + "loss": 0.0932, + "step": 6408 + }, + { + "epoch": 1.038399222294232, + "grad_norm": 0.9099779725074768, + "learning_rate": 3.8002754950466004e-06, + "loss": 0.1058, + "step": 6409 + }, + { + "epoch": 1.0385612443292287, + "grad_norm": 0.7849684953689575, + "learning_rate": 3.7999019740897423e-06, + "loss": 0.0961, + "step": 6410 + }, + { + "epoch": 1.0387232663642256, + "grad_norm": 0.8940638303756714, + "learning_rate": 3.7995284133590317e-06, + "loss": 0.0876, + "step": 6411 + }, + { + "epoch": 1.0388852883992223, + "grad_norm": 0.9506855607032776, + "learning_rate": 3.799154812865894e-06, + "loss": 0.1111, + "step": 6412 + }, + { + "epoch": 1.039047310434219, + "grad_norm": 0.8368770480155945, + "learning_rate": 3.798781172621765e-06, + "loss": 0.1079, + "step": 6413 + }, + { + "epoch": 1.0392093324692158, + "grad_norm": 0.8906238079071045, + "learning_rate": 3.7984074926380733e-06, + "loss": 0.1126, + "step": 6414 + }, + { + "epoch": 1.0393713545042125, + "grad_norm": 0.950920581817627, + "learning_rate": 3.7980337729262555e-06, + "loss": 0.1118, + "step": 6415 + }, + { + "epoch": 1.0395333765392094, + "grad_norm": 0.9369775652885437, + "learning_rate": 3.7976600134977455e-06, + "loss": 0.1117, + "step": 6416 + }, + { + "epoch": 1.0396953985742061, + "grad_norm": 0.8446056246757507, + "learning_rate": 3.7972862143639788e-06, + "loss": 0.0977, + "step": 6417 + }, + { + "epoch": 1.0398574206092028, + "grad_norm": 0.930582582950592, + "learning_rate": 3.7969123755363935e-06, + "loss": 0.1135, + "step": 6418 + }, + { + "epoch": 1.0400194426441995, + "grad_norm": 0.7642794251441956, + "learning_rate": 3.796538497026428e-06, + "loss": 0.0935, + "step": 6419 + }, + { + "epoch": 1.0401814646791965, + "grad_norm": 0.9868995547294617, + "learning_rate": 3.7961645788455225e-06, + "loss": 0.1143, + "step": 6420 + }, + { + "epoch": 1.0403434867141932, + "grad_norm": 1.0182535648345947, + "learning_rate": 3.7957906210051173e-06, + "loss": 0.0991, + "step": 6421 + }, + { + "epoch": 1.04050550874919, + "grad_norm": 0.9137951731681824, + "learning_rate": 3.7954166235166545e-06, + "loss": 0.1159, + "step": 6422 + }, + { + "epoch": 1.0406675307841866, + "grad_norm": 0.9183018207550049, + "learning_rate": 3.795042586391578e-06, + "loss": 0.09, + "step": 6423 + }, + { + "epoch": 1.0408295528191833, + "grad_norm": 0.856632649898529, + "learning_rate": 3.794668509641332e-06, + "loss": 0.1037, + "step": 6424 + }, + { + "epoch": 1.0409915748541803, + "grad_norm": 0.8316843509674072, + "learning_rate": 3.7942943932773636e-06, + "loss": 0.1061, + "step": 6425 + }, + { + "epoch": 1.041153596889177, + "grad_norm": 0.8080172538757324, + "learning_rate": 3.793920237311118e-06, + "loss": 0.1112, + "step": 6426 + }, + { + "epoch": 1.0413156189241737, + "grad_norm": 0.8021390438079834, + "learning_rate": 3.793546041754044e-06, + "loss": 0.1035, + "step": 6427 + }, + { + "epoch": 1.0414776409591704, + "grad_norm": 0.8733294010162354, + "learning_rate": 3.793171806617593e-06, + "loss": 0.1044, + "step": 6428 + }, + { + "epoch": 1.041639662994167, + "grad_norm": 0.8201737403869629, + "learning_rate": 3.7927975319132133e-06, + "loss": 0.0958, + "step": 6429 + }, + { + "epoch": 1.041801685029164, + "grad_norm": 0.7947893142700195, + "learning_rate": 3.7924232176523574e-06, + "loss": 0.1048, + "step": 6430 + }, + { + "epoch": 1.0419637070641607, + "grad_norm": 0.839570939540863, + "learning_rate": 3.7920488638464788e-06, + "loss": 0.1012, + "step": 6431 + }, + { + "epoch": 1.0421257290991575, + "grad_norm": 0.8176724910736084, + "learning_rate": 3.7916744705070318e-06, + "loss": 0.0941, + "step": 6432 + }, + { + "epoch": 1.0422877511341542, + "grad_norm": 0.7297854423522949, + "learning_rate": 3.7913000376454713e-06, + "loss": 0.0962, + "step": 6433 + }, + { + "epoch": 1.042449773169151, + "grad_norm": 0.7584792375564575, + "learning_rate": 3.790925565273255e-06, + "loss": 0.0948, + "step": 6434 + }, + { + "epoch": 1.0426117952041478, + "grad_norm": 0.7733738422393799, + "learning_rate": 3.790551053401841e-06, + "loss": 0.0947, + "step": 6435 + }, + { + "epoch": 1.0427738172391445, + "grad_norm": 0.8243172764778137, + "learning_rate": 3.790176502042686e-06, + "loss": 0.1108, + "step": 6436 + }, + { + "epoch": 1.0429358392741412, + "grad_norm": 0.8829347491264343, + "learning_rate": 3.7898019112072537e-06, + "loss": 0.114, + "step": 6437 + }, + { + "epoch": 1.043097861309138, + "grad_norm": 0.8396970629692078, + "learning_rate": 3.789427280907004e-06, + "loss": 0.1037, + "step": 6438 + }, + { + "epoch": 1.0432598833441349, + "grad_norm": 0.8729767203330994, + "learning_rate": 3.7890526111534e-06, + "loss": 0.107, + "step": 6439 + }, + { + "epoch": 1.0434219053791316, + "grad_norm": 0.9895196557044983, + "learning_rate": 3.7886779019579045e-06, + "loss": 0.1178, + "step": 6440 + }, + { + "epoch": 1.0435839274141283, + "grad_norm": 0.8271682262420654, + "learning_rate": 3.788303153331985e-06, + "loss": 0.1127, + "step": 6441 + }, + { + "epoch": 1.043745949449125, + "grad_norm": 0.8002727031707764, + "learning_rate": 3.787928365287106e-06, + "loss": 0.1027, + "step": 6442 + }, + { + "epoch": 1.043907971484122, + "grad_norm": 0.788192629814148, + "learning_rate": 3.7875535378347356e-06, + "loss": 0.1054, + "step": 6443 + }, + { + "epoch": 1.0440699935191187, + "grad_norm": 0.8381497263908386, + "learning_rate": 3.7871786709863435e-06, + "loss": 0.1006, + "step": 6444 + }, + { + "epoch": 1.0442320155541154, + "grad_norm": 0.9270142912864685, + "learning_rate": 3.7868037647533977e-06, + "loss": 0.1152, + "step": 6445 + }, + { + "epoch": 1.044394037589112, + "grad_norm": 0.8742038011550903, + "learning_rate": 3.7864288191473718e-06, + "loss": 0.11, + "step": 6446 + }, + { + "epoch": 1.0445560596241088, + "grad_norm": 0.7907035946846008, + "learning_rate": 3.786053834179737e-06, + "loss": 0.105, + "step": 6447 + }, + { + "epoch": 1.0447180816591057, + "grad_norm": 0.7882545590400696, + "learning_rate": 3.7856788098619667e-06, + "loss": 0.1039, + "step": 6448 + }, + { + "epoch": 1.0448801036941024, + "grad_norm": 0.880588710308075, + "learning_rate": 3.7853037462055366e-06, + "loss": 0.1136, + "step": 6449 + }, + { + "epoch": 1.0450421257290992, + "grad_norm": 0.7238618731498718, + "learning_rate": 3.7849286432219216e-06, + "loss": 0.0888, + "step": 6450 + }, + { + "epoch": 1.0452041477640959, + "grad_norm": 0.887286365032196, + "learning_rate": 3.7845535009226e-06, + "loss": 0.107, + "step": 6451 + }, + { + "epoch": 1.0453661697990926, + "grad_norm": 0.8213676810264587, + "learning_rate": 3.78417831931905e-06, + "loss": 0.1036, + "step": 6452 + }, + { + "epoch": 1.0455281918340895, + "grad_norm": 0.8090675473213196, + "learning_rate": 3.783803098422751e-06, + "loss": 0.0995, + "step": 6453 + }, + { + "epoch": 1.0456902138690862, + "grad_norm": 0.8054291605949402, + "learning_rate": 3.783427838245184e-06, + "loss": 0.1053, + "step": 6454 + }, + { + "epoch": 1.045852235904083, + "grad_norm": 0.9021458625793457, + "learning_rate": 3.78305253879783e-06, + "loss": 0.1142, + "step": 6455 + }, + { + "epoch": 1.0460142579390797, + "grad_norm": 0.710974395275116, + "learning_rate": 3.7826772000921742e-06, + "loss": 0.0924, + "step": 6456 + }, + { + "epoch": 1.0461762799740764, + "grad_norm": 0.8471440672874451, + "learning_rate": 3.7823018221397e-06, + "loss": 0.1111, + "step": 6457 + }, + { + "epoch": 1.0463383020090733, + "grad_norm": 0.8694987297058105, + "learning_rate": 3.781926404951893e-06, + "loss": 0.1016, + "step": 6458 + }, + { + "epoch": 1.04650032404407, + "grad_norm": 0.8285094499588013, + "learning_rate": 3.78155094854024e-06, + "loss": 0.1016, + "step": 6459 + }, + { + "epoch": 1.0466623460790667, + "grad_norm": 0.9631251692771912, + "learning_rate": 3.7811754529162294e-06, + "loss": 0.1083, + "step": 6460 + }, + { + "epoch": 1.0468243681140634, + "grad_norm": 0.7672160267829895, + "learning_rate": 3.7807999180913514e-06, + "loss": 0.0994, + "step": 6461 + }, + { + "epoch": 1.0469863901490604, + "grad_norm": 0.8148001432418823, + "learning_rate": 3.7804243440770936e-06, + "loss": 0.1043, + "step": 6462 + }, + { + "epoch": 1.047148412184057, + "grad_norm": 0.9620845317840576, + "learning_rate": 3.780048730884951e-06, + "loss": 0.1056, + "step": 6463 + }, + { + "epoch": 1.0473104342190538, + "grad_norm": 0.840048611164093, + "learning_rate": 3.779673078526414e-06, + "loss": 0.1031, + "step": 6464 + }, + { + "epoch": 1.0474724562540505, + "grad_norm": 1.0370488166809082, + "learning_rate": 3.7792973870129773e-06, + "loss": 0.1211, + "step": 6465 + }, + { + "epoch": 1.0476344782890472, + "grad_norm": 0.7531951665878296, + "learning_rate": 3.7789216563561373e-06, + "loss": 0.0909, + "step": 6466 + }, + { + "epoch": 1.0477965003240441, + "grad_norm": 0.9405876994132996, + "learning_rate": 3.7785458865673885e-06, + "loss": 0.1147, + "step": 6467 + }, + { + "epoch": 1.0479585223590409, + "grad_norm": 1.0181210041046143, + "learning_rate": 3.778170077658231e-06, + "loss": 0.1209, + "step": 6468 + }, + { + "epoch": 1.0481205443940376, + "grad_norm": 0.8447110056877136, + "learning_rate": 3.7777942296401606e-06, + "loss": 0.1024, + "step": 6469 + }, + { + "epoch": 1.0482825664290343, + "grad_norm": 0.8490682244300842, + "learning_rate": 3.77741834252468e-06, + "loss": 0.1012, + "step": 6470 + }, + { + "epoch": 1.0484445884640312, + "grad_norm": 0.8966073393821716, + "learning_rate": 3.777042416323289e-06, + "loss": 0.104, + "step": 6471 + }, + { + "epoch": 1.048606610499028, + "grad_norm": 0.7608264088630676, + "learning_rate": 3.7766664510474903e-06, + "loss": 0.0988, + "step": 6472 + }, + { + "epoch": 1.0487686325340246, + "grad_norm": 0.9018314480781555, + "learning_rate": 3.776290446708789e-06, + "loss": 0.1082, + "step": 6473 + }, + { + "epoch": 1.0489306545690213, + "grad_norm": 0.8391294479370117, + "learning_rate": 3.775914403318687e-06, + "loss": 0.1015, + "step": 6474 + }, + { + "epoch": 1.049092676604018, + "grad_norm": 0.8135133981704712, + "learning_rate": 3.7755383208886923e-06, + "loss": 0.095, + "step": 6475 + }, + { + "epoch": 1.049254698639015, + "grad_norm": 0.8267797231674194, + "learning_rate": 3.7751621994303123e-06, + "loss": 0.0941, + "step": 6476 + }, + { + "epoch": 1.0494167206740117, + "grad_norm": 0.7961505055427551, + "learning_rate": 3.774786038955054e-06, + "loss": 0.1048, + "step": 6477 + }, + { + "epoch": 1.0495787427090084, + "grad_norm": 0.7784950137138367, + "learning_rate": 3.7744098394744287e-06, + "loss": 0.1042, + "step": 6478 + }, + { + "epoch": 1.0497407647440051, + "grad_norm": 0.8187713623046875, + "learning_rate": 3.774033600999946e-06, + "loss": 0.1076, + "step": 6479 + }, + { + "epoch": 1.0499027867790018, + "grad_norm": 0.8849846720695496, + "learning_rate": 3.7736573235431174e-06, + "loss": 0.1115, + "step": 6480 + }, + { + "epoch": 1.0500648088139988, + "grad_norm": 0.830732524394989, + "learning_rate": 3.773281007115458e-06, + "loss": 0.1076, + "step": 6481 + }, + { + "epoch": 1.0502268308489955, + "grad_norm": 0.8834472298622131, + "learning_rate": 3.7729046517284805e-06, + "loss": 0.1195, + "step": 6482 + }, + { + "epoch": 1.0503888528839922, + "grad_norm": 0.8346211910247803, + "learning_rate": 3.7725282573937015e-06, + "loss": 0.112, + "step": 6483 + }, + { + "epoch": 1.050550874918989, + "grad_norm": 0.7828723192214966, + "learning_rate": 3.7721518241226375e-06, + "loss": 0.0984, + "step": 6484 + }, + { + "epoch": 1.0507128969539858, + "grad_norm": 0.8663693070411682, + "learning_rate": 3.7717753519268053e-06, + "loss": 0.1056, + "step": 6485 + }, + { + "epoch": 1.0508749189889826, + "grad_norm": 0.819918155670166, + "learning_rate": 3.771398840817725e-06, + "loss": 0.0987, + "step": 6486 + }, + { + "epoch": 1.0510369410239793, + "grad_norm": 0.7646182179450989, + "learning_rate": 3.771022290806917e-06, + "loss": 0.0878, + "step": 6487 + }, + { + "epoch": 1.051198963058976, + "grad_norm": 0.8001990914344788, + "learning_rate": 3.770645701905904e-06, + "loss": 0.095, + "step": 6488 + }, + { + "epoch": 1.0513609850939727, + "grad_norm": 0.9029562473297119, + "learning_rate": 3.770269074126206e-06, + "loss": 0.1188, + "step": 6489 + }, + { + "epoch": 1.0515230071289696, + "grad_norm": 0.9364879727363586, + "learning_rate": 3.7698924074793484e-06, + "loss": 0.1128, + "step": 6490 + }, + { + "epoch": 1.0516850291639663, + "grad_norm": 0.8705113530158997, + "learning_rate": 3.769515701976856e-06, + "loss": 0.0952, + "step": 6491 + }, + { + "epoch": 1.051847051198963, + "grad_norm": 0.8115032911300659, + "learning_rate": 3.7691389576302567e-06, + "loss": 0.099, + "step": 6492 + }, + { + "epoch": 1.0520090732339598, + "grad_norm": 0.8196799159049988, + "learning_rate": 3.7687621744510756e-06, + "loss": 0.0995, + "step": 6493 + }, + { + "epoch": 1.0521710952689567, + "grad_norm": 0.8506253957748413, + "learning_rate": 3.768385352450842e-06, + "loss": 0.0996, + "step": 6494 + }, + { + "epoch": 1.0523331173039534, + "grad_norm": 0.7713782787322998, + "learning_rate": 3.7680084916410876e-06, + "loss": 0.0889, + "step": 6495 + }, + { + "epoch": 1.0524951393389501, + "grad_norm": 0.8281717896461487, + "learning_rate": 3.7676315920333396e-06, + "loss": 0.1026, + "step": 6496 + }, + { + "epoch": 1.0526571613739468, + "grad_norm": 0.870488703250885, + "learning_rate": 3.7672546536391343e-06, + "loss": 0.1065, + "step": 6497 + }, + { + "epoch": 1.0528191834089435, + "grad_norm": 0.9622962474822998, + "learning_rate": 3.7668776764700023e-06, + "loss": 0.1097, + "step": 6498 + }, + { + "epoch": 1.0529812054439405, + "grad_norm": 0.8624140620231628, + "learning_rate": 3.76650066053748e-06, + "loss": 0.1093, + "step": 6499 + }, + { + "epoch": 1.0531432274789372, + "grad_norm": 0.9349178671836853, + "learning_rate": 3.766123605853101e-06, + "loss": 0.115, + "step": 6500 + }, + { + "epoch": 1.053305249513934, + "grad_norm": 0.8151592016220093, + "learning_rate": 3.7657465124284047e-06, + "loss": 0.0977, + "step": 6501 + }, + { + "epoch": 1.0534672715489306, + "grad_norm": 0.9163764715194702, + "learning_rate": 3.765369380274928e-06, + "loss": 0.1155, + "step": 6502 + }, + { + "epoch": 1.0536292935839273, + "grad_norm": 0.8974437117576599, + "learning_rate": 3.76499220940421e-06, + "loss": 0.1149, + "step": 6503 + }, + { + "epoch": 1.0537913156189243, + "grad_norm": 0.8582158088684082, + "learning_rate": 3.7646149998277924e-06, + "loss": 0.1047, + "step": 6504 + }, + { + "epoch": 1.053953337653921, + "grad_norm": 0.8871607184410095, + "learning_rate": 3.7642377515572153e-06, + "loss": 0.0993, + "step": 6505 + }, + { + "epoch": 1.0541153596889177, + "grad_norm": 0.8530762791633606, + "learning_rate": 3.7638604646040232e-06, + "loss": 0.1047, + "step": 6506 + }, + { + "epoch": 1.0542773817239144, + "grad_norm": 0.8357167840003967, + "learning_rate": 3.763483138979759e-06, + "loss": 0.111, + "step": 6507 + }, + { + "epoch": 1.054439403758911, + "grad_norm": 0.8622068166732788, + "learning_rate": 3.763105774695968e-06, + "loss": 0.111, + "step": 6508 + }, + { + "epoch": 1.054601425793908, + "grad_norm": 0.8668839335441589, + "learning_rate": 3.762728371764197e-06, + "loss": 0.102, + "step": 6509 + }, + { + "epoch": 1.0547634478289047, + "grad_norm": 0.7583625316619873, + "learning_rate": 3.7623509301959935e-06, + "loss": 0.102, + "step": 6510 + }, + { + "epoch": 1.0549254698639015, + "grad_norm": 0.7609004974365234, + "learning_rate": 3.761973450002907e-06, + "loss": 0.0888, + "step": 6511 + }, + { + "epoch": 1.0550874918988982, + "grad_norm": 0.8587732911109924, + "learning_rate": 3.7615959311964865e-06, + "loss": 0.1126, + "step": 6512 + }, + { + "epoch": 1.055249513933895, + "grad_norm": 0.8386414647102356, + "learning_rate": 3.7612183737882833e-06, + "loss": 0.1107, + "step": 6513 + }, + { + "epoch": 1.0554115359688918, + "grad_norm": 0.7540522813796997, + "learning_rate": 3.760840777789851e-06, + "loss": 0.0933, + "step": 6514 + }, + { + "epoch": 1.0555735580038885, + "grad_norm": 0.7534650564193726, + "learning_rate": 3.7604631432127413e-06, + "loss": 0.0938, + "step": 6515 + }, + { + "epoch": 1.0557355800388852, + "grad_norm": 0.831891655921936, + "learning_rate": 3.7600854700685095e-06, + "loss": 0.108, + "step": 6516 + }, + { + "epoch": 1.055897602073882, + "grad_norm": 0.8216171264648438, + "learning_rate": 3.7597077583687115e-06, + "loss": 0.1109, + "step": 6517 + }, + { + "epoch": 1.0560596241088789, + "grad_norm": 0.8234738707542419, + "learning_rate": 3.759330008124905e-06, + "loss": 0.1066, + "step": 6518 + }, + { + "epoch": 1.0562216461438756, + "grad_norm": 0.799088180065155, + "learning_rate": 3.7589522193486476e-06, + "loss": 0.0963, + "step": 6519 + }, + { + "epoch": 1.0563836681788723, + "grad_norm": 0.850976288318634, + "learning_rate": 3.7585743920514985e-06, + "loss": 0.1049, + "step": 6520 + }, + { + "epoch": 1.056545690213869, + "grad_norm": 0.8160610198974609, + "learning_rate": 3.7581965262450193e-06, + "loss": 0.1048, + "step": 6521 + }, + { + "epoch": 1.056707712248866, + "grad_norm": 0.850570023059845, + "learning_rate": 3.757818621940771e-06, + "loss": 0.1042, + "step": 6522 + }, + { + "epoch": 1.0568697342838627, + "grad_norm": 0.7830667495727539, + "learning_rate": 3.7574406791503167e-06, + "loss": 0.1023, + "step": 6523 + }, + { + "epoch": 1.0570317563188594, + "grad_norm": 0.831290066242218, + "learning_rate": 3.7570626978852203e-06, + "loss": 0.1041, + "step": 6524 + }, + { + "epoch": 1.057193778353856, + "grad_norm": 0.9002595543861389, + "learning_rate": 3.7566846781570476e-06, + "loss": 0.1133, + "step": 6525 + }, + { + "epoch": 1.0573558003888528, + "grad_norm": 1.0122647285461426, + "learning_rate": 3.7563066199773645e-06, + "loss": 0.1241, + "step": 6526 + }, + { + "epoch": 1.0575178224238497, + "grad_norm": 0.858625054359436, + "learning_rate": 3.75592852335774e-06, + "loss": 0.1033, + "step": 6527 + }, + { + "epoch": 1.0576798444588464, + "grad_norm": 0.7898378372192383, + "learning_rate": 3.7555503883097414e-06, + "loss": 0.1033, + "step": 6528 + }, + { + "epoch": 1.0578418664938432, + "grad_norm": 0.8522166609764099, + "learning_rate": 3.755172214844939e-06, + "loss": 0.1046, + "step": 6529 + }, + { + "epoch": 1.0580038885288399, + "grad_norm": 0.8669360280036926, + "learning_rate": 3.7547940029749054e-06, + "loss": 0.1087, + "step": 6530 + }, + { + "epoch": 1.0581659105638366, + "grad_norm": 0.8492510914802551, + "learning_rate": 3.7544157527112103e-06, + "loss": 0.1089, + "step": 6531 + }, + { + "epoch": 1.0583279325988335, + "grad_norm": 0.7833178043365479, + "learning_rate": 3.75403746406543e-06, + "loss": 0.0964, + "step": 6532 + }, + { + "epoch": 1.0584899546338302, + "grad_norm": 0.8604576587677002, + "learning_rate": 3.7536591370491373e-06, + "loss": 0.1123, + "step": 6533 + }, + { + "epoch": 1.058651976668827, + "grad_norm": 0.7791811227798462, + "learning_rate": 3.7532807716739082e-06, + "loss": 0.1048, + "step": 6534 + }, + { + "epoch": 1.0588139987038236, + "grad_norm": 0.8196133375167847, + "learning_rate": 3.7529023679513217e-06, + "loss": 0.109, + "step": 6535 + }, + { + "epoch": 1.0589760207388206, + "grad_norm": 0.7942836284637451, + "learning_rate": 3.752523925892954e-06, + "loss": 0.0982, + "step": 6536 + }, + { + "epoch": 1.0591380427738173, + "grad_norm": 0.7868502140045166, + "learning_rate": 3.7521454455103857e-06, + "loss": 0.1046, + "step": 6537 + }, + { + "epoch": 1.059300064808814, + "grad_norm": 0.8380435109138489, + "learning_rate": 3.7517669268151967e-06, + "loss": 0.1071, + "step": 6538 + }, + { + "epoch": 1.0594620868438107, + "grad_norm": 0.6852514743804932, + "learning_rate": 3.751388369818969e-06, + "loss": 0.0811, + "step": 6539 + }, + { + "epoch": 1.0596241088788074, + "grad_norm": 0.7679120302200317, + "learning_rate": 3.751009774533285e-06, + "loss": 0.0952, + "step": 6540 + }, + { + "epoch": 1.0597861309138044, + "grad_norm": 0.8072113394737244, + "learning_rate": 3.7506311409697295e-06, + "loss": 0.1035, + "step": 6541 + }, + { + "epoch": 1.059948152948801, + "grad_norm": 1.01187002658844, + "learning_rate": 3.7502524691398877e-06, + "loss": 0.131, + "step": 6542 + }, + { + "epoch": 1.0601101749837978, + "grad_norm": 0.8866243958473206, + "learning_rate": 3.7498737590553465e-06, + "loss": 0.1028, + "step": 6543 + }, + { + "epoch": 1.0602721970187945, + "grad_norm": 0.9462065696716309, + "learning_rate": 3.7494950107276917e-06, + "loss": 0.1029, + "step": 6544 + }, + { + "epoch": 1.0604342190537914, + "grad_norm": 0.7820582985877991, + "learning_rate": 3.749116224168514e-06, + "loss": 0.0942, + "step": 6545 + }, + { + "epoch": 1.0605962410887881, + "grad_norm": 0.9271798133850098, + "learning_rate": 3.7487373993894027e-06, + "loss": 0.1145, + "step": 6546 + }, + { + "epoch": 1.0607582631237849, + "grad_norm": 0.8751158714294434, + "learning_rate": 3.748358536401949e-06, + "loss": 0.105, + "step": 6547 + }, + { + "epoch": 1.0609202851587816, + "grad_norm": 0.8312302231788635, + "learning_rate": 3.7479796352177445e-06, + "loss": 0.0998, + "step": 6548 + }, + { + "epoch": 1.0610823071937783, + "grad_norm": 0.7504762411117554, + "learning_rate": 3.7476006958483835e-06, + "loss": 0.0974, + "step": 6549 + }, + { + "epoch": 1.0612443292287752, + "grad_norm": 0.8843858242034912, + "learning_rate": 3.7472217183054605e-06, + "loss": 0.1055, + "step": 6550 + }, + { + "epoch": 1.061406351263772, + "grad_norm": 0.9589284658432007, + "learning_rate": 3.7468427026005705e-06, + "loss": 0.1222, + "step": 6551 + }, + { + "epoch": 1.0615683732987686, + "grad_norm": 0.8354429602622986, + "learning_rate": 3.7464636487453122e-06, + "loss": 0.1088, + "step": 6552 + }, + { + "epoch": 1.0617303953337653, + "grad_norm": 0.7289818525314331, + "learning_rate": 3.7460845567512817e-06, + "loss": 0.0927, + "step": 6553 + }, + { + "epoch": 1.061892417368762, + "grad_norm": 0.8219195604324341, + "learning_rate": 3.74570542663008e-06, + "loss": 0.1049, + "step": 6554 + }, + { + "epoch": 1.062054439403759, + "grad_norm": 0.8895604610443115, + "learning_rate": 3.745326258393306e-06, + "loss": 0.1069, + "step": 6555 + }, + { + "epoch": 1.0622164614387557, + "grad_norm": 0.8279933929443359, + "learning_rate": 3.744947052052562e-06, + "loss": 0.1081, + "step": 6556 + }, + { + "epoch": 1.0623784834737524, + "grad_norm": 0.7519248127937317, + "learning_rate": 3.744567807619451e-06, + "loss": 0.0904, + "step": 6557 + }, + { + "epoch": 1.0625405055087491, + "grad_norm": 0.7404701113700867, + "learning_rate": 3.7441885251055774e-06, + "loss": 0.0901, + "step": 6558 + }, + { + "epoch": 1.0627025275437458, + "grad_norm": 0.8219286203384399, + "learning_rate": 3.743809204522546e-06, + "loss": 0.1059, + "step": 6559 + }, + { + "epoch": 1.0628645495787428, + "grad_norm": 0.8403288125991821, + "learning_rate": 3.7434298458819622e-06, + "loss": 0.1126, + "step": 6560 + }, + { + "epoch": 1.0630265716137395, + "grad_norm": 0.742680549621582, + "learning_rate": 3.743050449195435e-06, + "loss": 0.0932, + "step": 6561 + }, + { + "epoch": 1.0631885936487362, + "grad_norm": 0.8612416386604309, + "learning_rate": 3.7426710144745717e-06, + "loss": 0.1066, + "step": 6562 + }, + { + "epoch": 1.063350615683733, + "grad_norm": 0.8878268599510193, + "learning_rate": 3.7422915417309825e-06, + "loss": 0.1157, + "step": 6563 + }, + { + "epoch": 1.0635126377187298, + "grad_norm": 0.8379852771759033, + "learning_rate": 3.7419120309762787e-06, + "loss": 0.1026, + "step": 6564 + }, + { + "epoch": 1.0636746597537265, + "grad_norm": 0.8249161839485168, + "learning_rate": 3.7415324822220717e-06, + "loss": 0.1052, + "step": 6565 + }, + { + "epoch": 1.0638366817887233, + "grad_norm": 0.8621309399604797, + "learning_rate": 3.7411528954799752e-06, + "loss": 0.1006, + "step": 6566 + }, + { + "epoch": 1.06399870382372, + "grad_norm": 0.860681414604187, + "learning_rate": 3.740773270761604e-06, + "loss": 0.103, + "step": 6567 + }, + { + "epoch": 1.0641607258587167, + "grad_norm": 0.995371401309967, + "learning_rate": 3.740393608078573e-06, + "loss": 0.1238, + "step": 6568 + }, + { + "epoch": 1.0643227478937136, + "grad_norm": 0.8341780304908752, + "learning_rate": 3.7400139074424997e-06, + "loss": 0.1076, + "step": 6569 + }, + { + "epoch": 1.0644847699287103, + "grad_norm": 0.8263168931007385, + "learning_rate": 3.739634168865001e-06, + "loss": 0.1026, + "step": 6570 + }, + { + "epoch": 1.064646791963707, + "grad_norm": 0.959525465965271, + "learning_rate": 3.7392543923576974e-06, + "loss": 0.1143, + "step": 6571 + }, + { + "epoch": 1.0648088139987038, + "grad_norm": 0.8506279587745667, + "learning_rate": 3.738874577932208e-06, + "loss": 0.1033, + "step": 6572 + }, + { + "epoch": 1.0649708360337007, + "grad_norm": 0.8389852046966553, + "learning_rate": 3.7384947256001534e-06, + "loss": 0.1043, + "step": 6573 + }, + { + "epoch": 1.0651328580686974, + "grad_norm": 0.7704646587371826, + "learning_rate": 3.738114835373159e-06, + "loss": 0.0953, + "step": 6574 + }, + { + "epoch": 1.065294880103694, + "grad_norm": 0.8785123229026794, + "learning_rate": 3.7377349072628457e-06, + "loss": 0.1146, + "step": 6575 + }, + { + "epoch": 1.0654569021386908, + "grad_norm": 0.7751638889312744, + "learning_rate": 3.73735494128084e-06, + "loss": 0.0961, + "step": 6576 + }, + { + "epoch": 1.0656189241736875, + "grad_norm": 0.8245144486427307, + "learning_rate": 3.7369749374387677e-06, + "loss": 0.108, + "step": 6577 + }, + { + "epoch": 1.0657809462086845, + "grad_norm": 0.8200885653495789, + "learning_rate": 3.736594895748255e-06, + "loss": 0.0974, + "step": 6578 + }, + { + "epoch": 1.0659429682436812, + "grad_norm": 0.8444353342056274, + "learning_rate": 3.7362148162209315e-06, + "loss": 0.0993, + "step": 6579 + }, + { + "epoch": 1.0661049902786779, + "grad_norm": 0.822864830493927, + "learning_rate": 3.7358346988684258e-06, + "loss": 0.1042, + "step": 6580 + }, + { + "epoch": 1.0662670123136746, + "grad_norm": 0.8917192220687866, + "learning_rate": 3.73545454370237e-06, + "loss": 0.1073, + "step": 6581 + }, + { + "epoch": 1.0664290343486713, + "grad_norm": 1.0070949792861938, + "learning_rate": 3.735074350734393e-06, + "loss": 0.1275, + "step": 6582 + }, + { + "epoch": 1.0665910563836682, + "grad_norm": 0.8910343647003174, + "learning_rate": 3.7346941199761317e-06, + "loss": 0.1139, + "step": 6583 + }, + { + "epoch": 1.066753078418665, + "grad_norm": 0.8535321950912476, + "learning_rate": 3.734313851439217e-06, + "loss": 0.103, + "step": 6584 + }, + { + "epoch": 1.0669151004536617, + "grad_norm": 0.8377382159233093, + "learning_rate": 3.7339335451352864e-06, + "loss": 0.1053, + "step": 6585 + }, + { + "epoch": 1.0670771224886584, + "grad_norm": 0.8273501992225647, + "learning_rate": 3.7335532010759747e-06, + "loss": 0.1034, + "step": 6586 + }, + { + "epoch": 1.0672391445236553, + "grad_norm": 0.9124737977981567, + "learning_rate": 3.73317281927292e-06, + "loss": 0.1059, + "step": 6587 + }, + { + "epoch": 1.067401166558652, + "grad_norm": 0.8772713541984558, + "learning_rate": 3.732792399737761e-06, + "loss": 0.1118, + "step": 6588 + }, + { + "epoch": 1.0675631885936487, + "grad_norm": 0.7443407773971558, + "learning_rate": 3.7324119424821387e-06, + "loss": 0.0956, + "step": 6589 + }, + { + "epoch": 1.0677252106286454, + "grad_norm": 0.7781869173049927, + "learning_rate": 3.7320314475176933e-06, + "loss": 0.0921, + "step": 6590 + }, + { + "epoch": 1.0678872326636422, + "grad_norm": 1.039441466331482, + "learning_rate": 3.7316509148560664e-06, + "loss": 0.1076, + "step": 6591 + }, + { + "epoch": 1.068049254698639, + "grad_norm": 0.7879117131233215, + "learning_rate": 3.731270344508903e-06, + "loss": 0.0951, + "step": 6592 + }, + { + "epoch": 1.0682112767336358, + "grad_norm": 0.8069095015525818, + "learning_rate": 3.730889736487846e-06, + "loss": 0.103, + "step": 6593 + }, + { + "epoch": 1.0683732987686325, + "grad_norm": 0.7860719561576843, + "learning_rate": 3.7305090908045422e-06, + "loss": 0.0989, + "step": 6594 + }, + { + "epoch": 1.0685353208036292, + "grad_norm": 0.7485154271125793, + "learning_rate": 3.7301284074706372e-06, + "loss": 0.0916, + "step": 6595 + }, + { + "epoch": 1.0686973428386262, + "grad_norm": 0.8204521536827087, + "learning_rate": 3.7297476864977805e-06, + "loss": 0.1062, + "step": 6596 + }, + { + "epoch": 1.0688593648736229, + "grad_norm": 0.7526237964630127, + "learning_rate": 3.72936692789762e-06, + "loss": 0.0921, + "step": 6597 + }, + { + "epoch": 1.0690213869086196, + "grad_norm": 0.7098335027694702, + "learning_rate": 3.7289861316818077e-06, + "loss": 0.0926, + "step": 6598 + }, + { + "epoch": 1.0691834089436163, + "grad_norm": 0.8343385457992554, + "learning_rate": 3.7286052978619926e-06, + "loss": 0.1039, + "step": 6599 + }, + { + "epoch": 1.069345430978613, + "grad_norm": 0.8440389633178711, + "learning_rate": 3.728224426449829e-06, + "loss": 0.1092, + "step": 6600 + }, + { + "epoch": 1.06950745301361, + "grad_norm": 0.9151831865310669, + "learning_rate": 3.72784351745697e-06, + "loss": 0.1205, + "step": 6601 + }, + { + "epoch": 1.0696694750486067, + "grad_norm": 0.8347547650337219, + "learning_rate": 3.7274625708950706e-06, + "loss": 0.0992, + "step": 6602 + }, + { + "epoch": 1.0698314970836034, + "grad_norm": 0.9815458655357361, + "learning_rate": 3.727081586775787e-06, + "loss": 0.1116, + "step": 6603 + }, + { + "epoch": 1.0699935191186, + "grad_norm": 0.7796982526779175, + "learning_rate": 3.7267005651107763e-06, + "loss": 0.1061, + "step": 6604 + }, + { + "epoch": 1.0701555411535968, + "grad_norm": 0.7754164338111877, + "learning_rate": 3.7263195059116973e-06, + "loss": 0.1041, + "step": 6605 + }, + { + "epoch": 1.0703175631885937, + "grad_norm": 0.8905292749404907, + "learning_rate": 3.7259384091902085e-06, + "loss": 0.1112, + "step": 6606 + }, + { + "epoch": 1.0704795852235904, + "grad_norm": 0.7484425902366638, + "learning_rate": 3.7255572749579716e-06, + "loss": 0.0919, + "step": 6607 + }, + { + "epoch": 1.0706416072585871, + "grad_norm": 0.863379955291748, + "learning_rate": 3.7251761032266475e-06, + "loss": 0.1023, + "step": 6608 + }, + { + "epoch": 1.0708036292935839, + "grad_norm": 0.9683666229248047, + "learning_rate": 3.7247948940078996e-06, + "loss": 0.1008, + "step": 6609 + }, + { + "epoch": 1.0709656513285806, + "grad_norm": 0.9362031817436218, + "learning_rate": 3.7244136473133924e-06, + "loss": 0.1129, + "step": 6610 + }, + { + "epoch": 1.0711276733635775, + "grad_norm": 0.8648001551628113, + "learning_rate": 3.72403236315479e-06, + "loss": 0.1067, + "step": 6611 + }, + { + "epoch": 1.0712896953985742, + "grad_norm": 0.8078235387802124, + "learning_rate": 3.7236510415437598e-06, + "loss": 0.1048, + "step": 6612 + }, + { + "epoch": 1.071451717433571, + "grad_norm": 0.7274784445762634, + "learning_rate": 3.7232696824919685e-06, + "loss": 0.0971, + "step": 6613 + }, + { + "epoch": 1.0716137394685676, + "grad_norm": 0.820236325263977, + "learning_rate": 3.7228882860110856e-06, + "loss": 0.0984, + "step": 6614 + }, + { + "epoch": 1.0717757615035646, + "grad_norm": 0.8109707832336426, + "learning_rate": 3.7225068521127793e-06, + "loss": 0.1016, + "step": 6615 + }, + { + "epoch": 1.0719377835385613, + "grad_norm": 0.8222618103027344, + "learning_rate": 3.7221253808087234e-06, + "loss": 0.1114, + "step": 6616 + }, + { + "epoch": 1.072099805573558, + "grad_norm": 0.7346887588500977, + "learning_rate": 3.7217438721105876e-06, + "loss": 0.0923, + "step": 6617 + }, + { + "epoch": 1.0722618276085547, + "grad_norm": 0.7791684865951538, + "learning_rate": 3.721362326030046e-06, + "loss": 0.1004, + "step": 6618 + }, + { + "epoch": 1.0724238496435514, + "grad_norm": 0.8646988868713379, + "learning_rate": 3.7209807425787724e-06, + "loss": 0.1047, + "step": 6619 + }, + { + "epoch": 1.0725858716785484, + "grad_norm": 0.7894246578216553, + "learning_rate": 3.720599121768443e-06, + "loss": 0.0985, + "step": 6620 + }, + { + "epoch": 1.072747893713545, + "grad_norm": 0.9039403796195984, + "learning_rate": 3.720217463610735e-06, + "loss": 0.1114, + "step": 6621 + }, + { + "epoch": 1.0729099157485418, + "grad_norm": 0.8473367094993591, + "learning_rate": 3.7198357681173247e-06, + "loss": 0.1048, + "step": 6622 + }, + { + "epoch": 1.0730719377835385, + "grad_norm": 0.8533846735954285, + "learning_rate": 3.719454035299892e-06, + "loss": 0.1001, + "step": 6623 + }, + { + "epoch": 1.0732339598185354, + "grad_norm": 0.8822036981582642, + "learning_rate": 3.7190722651701166e-06, + "loss": 0.1085, + "step": 6624 + }, + { + "epoch": 1.0733959818535321, + "grad_norm": 0.74126797914505, + "learning_rate": 3.7186904577396805e-06, + "loss": 0.0936, + "step": 6625 + }, + { + "epoch": 1.0735580038885288, + "grad_norm": 1.021998643875122, + "learning_rate": 3.718308613020265e-06, + "loss": 0.1299, + "step": 6626 + }, + { + "epoch": 1.0737200259235256, + "grad_norm": 0.7277312278747559, + "learning_rate": 3.7179267310235544e-06, + "loss": 0.0948, + "step": 6627 + }, + { + "epoch": 1.0738820479585223, + "grad_norm": 0.8704293966293335, + "learning_rate": 3.717544811761233e-06, + "loss": 0.1089, + "step": 6628 + }, + { + "epoch": 1.0740440699935192, + "grad_norm": 0.7847870588302612, + "learning_rate": 3.717162855244988e-06, + "loss": 0.0992, + "step": 6629 + }, + { + "epoch": 1.074206092028516, + "grad_norm": 0.8332440853118896, + "learning_rate": 3.716780861486503e-06, + "loss": 0.1022, + "step": 6630 + }, + { + "epoch": 1.0743681140635126, + "grad_norm": 0.8781189322471619, + "learning_rate": 3.7163988304974704e-06, + "loss": 0.1105, + "step": 6631 + }, + { + "epoch": 1.0745301360985093, + "grad_norm": 0.9302778244018555, + "learning_rate": 3.716016762289576e-06, + "loss": 0.1082, + "step": 6632 + }, + { + "epoch": 1.074692158133506, + "grad_norm": 0.7837849855422974, + "learning_rate": 3.715634656874511e-06, + "loss": 0.096, + "step": 6633 + }, + { + "epoch": 1.074854180168503, + "grad_norm": 0.7850882411003113, + "learning_rate": 3.7152525142639682e-06, + "loss": 0.1039, + "step": 6634 + }, + { + "epoch": 1.0750162022034997, + "grad_norm": 0.8564865589141846, + "learning_rate": 3.7148703344696386e-06, + "loss": 0.1117, + "step": 6635 + }, + { + "epoch": 1.0751782242384964, + "grad_norm": 0.9498354196548462, + "learning_rate": 3.7144881175032178e-06, + "loss": 0.1151, + "step": 6636 + }, + { + "epoch": 1.0753402462734931, + "grad_norm": 0.949449896812439, + "learning_rate": 3.714105863376398e-06, + "loss": 0.1191, + "step": 6637 + }, + { + "epoch": 1.07550226830849, + "grad_norm": 0.8418267369270325, + "learning_rate": 3.713723572100878e-06, + "loss": 0.1067, + "step": 6638 + }, + { + "epoch": 1.0756642903434868, + "grad_norm": 0.7026755809783936, + "learning_rate": 3.713341243688353e-06, + "loss": 0.091, + "step": 6639 + }, + { + "epoch": 1.0758263123784835, + "grad_norm": 0.7640417218208313, + "learning_rate": 3.7129588781505232e-06, + "loss": 0.0962, + "step": 6640 + }, + { + "epoch": 1.0759883344134802, + "grad_norm": 0.7686159610748291, + "learning_rate": 3.7125764754990864e-06, + "loss": 0.0942, + "step": 6641 + }, + { + "epoch": 1.076150356448477, + "grad_norm": 0.9323800206184387, + "learning_rate": 3.7121940357457438e-06, + "loss": 0.1188, + "step": 6642 + }, + { + "epoch": 1.0763123784834738, + "grad_norm": 0.801908016204834, + "learning_rate": 3.7118115589021976e-06, + "loss": 0.0964, + "step": 6643 + }, + { + "epoch": 1.0764744005184705, + "grad_norm": 0.7992369532585144, + "learning_rate": 3.7114290449801493e-06, + "loss": 0.0955, + "step": 6644 + }, + { + "epoch": 1.0766364225534673, + "grad_norm": 0.8944490551948547, + "learning_rate": 3.711046493991305e-06, + "loss": 0.1083, + "step": 6645 + }, + { + "epoch": 1.076798444588464, + "grad_norm": 0.8392427563667297, + "learning_rate": 3.7106639059473675e-06, + "loss": 0.1101, + "step": 6646 + }, + { + "epoch": 1.076960466623461, + "grad_norm": 0.8066583871841431, + "learning_rate": 3.7102812808600452e-06, + "loss": 0.0912, + "step": 6647 + }, + { + "epoch": 1.0771224886584576, + "grad_norm": 0.8310062885284424, + "learning_rate": 3.7098986187410447e-06, + "loss": 0.1061, + "step": 6648 + }, + { + "epoch": 1.0772845106934543, + "grad_norm": 0.9091915488243103, + "learning_rate": 3.7095159196020736e-06, + "loss": 0.1213, + "step": 6649 + }, + { + "epoch": 1.077446532728451, + "grad_norm": 0.8808692693710327, + "learning_rate": 3.7091331834548427e-06, + "loss": 0.1142, + "step": 6650 + }, + { + "epoch": 1.0776085547634477, + "grad_norm": 0.8686168789863586, + "learning_rate": 3.708750410311062e-06, + "loss": 0.1112, + "step": 6651 + }, + { + "epoch": 1.0777705767984447, + "grad_norm": 0.8946408629417419, + "learning_rate": 3.7083676001824443e-06, + "loss": 0.1145, + "step": 6652 + }, + { + "epoch": 1.0779325988334414, + "grad_norm": 0.8658775091171265, + "learning_rate": 3.7079847530807023e-06, + "loss": 0.1079, + "step": 6653 + }, + { + "epoch": 1.078094620868438, + "grad_norm": 0.8047525882720947, + "learning_rate": 3.70760186901755e-06, + "loss": 0.0999, + "step": 6654 + }, + { + "epoch": 1.0782566429034348, + "grad_norm": 0.7709795236587524, + "learning_rate": 3.7072189480047027e-06, + "loss": 0.0851, + "step": 6655 + }, + { + "epoch": 1.0784186649384315, + "grad_norm": 0.9803235530853271, + "learning_rate": 3.706835990053877e-06, + "loss": 0.1154, + "step": 6656 + }, + { + "epoch": 1.0785806869734285, + "grad_norm": 0.8865209817886353, + "learning_rate": 3.7064529951767905e-06, + "loss": 0.1043, + "step": 6657 + }, + { + "epoch": 1.0787427090084252, + "grad_norm": 0.8521233797073364, + "learning_rate": 3.7060699633851615e-06, + "loss": 0.1057, + "step": 6658 + }, + { + "epoch": 1.0789047310434219, + "grad_norm": 0.8935887217521667, + "learning_rate": 3.705686894690711e-06, + "loss": 0.1043, + "step": 6659 + }, + { + "epoch": 1.0790667530784186, + "grad_norm": 0.7139661908149719, + "learning_rate": 3.7053037891051596e-06, + "loss": 0.0868, + "step": 6660 + }, + { + "epoch": 1.0792287751134155, + "grad_norm": 0.8877597451210022, + "learning_rate": 3.7049206466402278e-06, + "loss": 0.1123, + "step": 6661 + }, + { + "epoch": 1.0793907971484122, + "grad_norm": 0.8162228465080261, + "learning_rate": 3.704537467307641e-06, + "loss": 0.1032, + "step": 6662 + }, + { + "epoch": 1.079552819183409, + "grad_norm": 0.8396819829940796, + "learning_rate": 3.704154251119122e-06, + "loss": 0.1027, + "step": 6663 + }, + { + "epoch": 1.0797148412184057, + "grad_norm": 0.8406090140342712, + "learning_rate": 3.7037709980863974e-06, + "loss": 0.0978, + "step": 6664 + }, + { + "epoch": 1.0798768632534024, + "grad_norm": 0.9074726700782776, + "learning_rate": 3.703387708221193e-06, + "loss": 0.1035, + "step": 6665 + }, + { + "epoch": 1.0800388852883993, + "grad_norm": 0.9996922612190247, + "learning_rate": 3.703004381535237e-06, + "loss": 0.1203, + "step": 6666 + }, + { + "epoch": 1.080200907323396, + "grad_norm": 0.8596282005310059, + "learning_rate": 3.7026210180402588e-06, + "loss": 0.1132, + "step": 6667 + }, + { + "epoch": 1.0803629293583927, + "grad_norm": 0.7845479249954224, + "learning_rate": 3.7022376177479863e-06, + "loss": 0.0991, + "step": 6668 + }, + { + "epoch": 1.0805249513933894, + "grad_norm": 0.8344932794570923, + "learning_rate": 3.701854180670153e-06, + "loss": 0.1019, + "step": 6669 + }, + { + "epoch": 1.0806869734283864, + "grad_norm": 0.8211415410041809, + "learning_rate": 3.7014707068184895e-06, + "loss": 0.1064, + "step": 6670 + }, + { + "epoch": 1.080848995463383, + "grad_norm": 0.7003439664840698, + "learning_rate": 3.7010871962047314e-06, + "loss": 0.0901, + "step": 6671 + }, + { + "epoch": 1.0810110174983798, + "grad_norm": 0.8313621282577515, + "learning_rate": 3.70070364884061e-06, + "loss": 0.0949, + "step": 6672 + }, + { + "epoch": 1.0811730395333765, + "grad_norm": 0.9178413152694702, + "learning_rate": 3.7003200647378634e-06, + "loss": 0.1111, + "step": 6673 + }, + { + "epoch": 1.0813350615683732, + "grad_norm": 0.8344830870628357, + "learning_rate": 3.6999364439082274e-06, + "loss": 0.1049, + "step": 6674 + }, + { + "epoch": 1.0814970836033702, + "grad_norm": 0.8041746616363525, + "learning_rate": 3.69955278636344e-06, + "loss": 0.1088, + "step": 6675 + }, + { + "epoch": 1.0816591056383669, + "grad_norm": 0.8075587153434753, + "learning_rate": 3.6991690921152407e-06, + "loss": 0.1055, + "step": 6676 + }, + { + "epoch": 1.0818211276733636, + "grad_norm": 0.729253351688385, + "learning_rate": 3.6987853611753686e-06, + "loss": 0.0843, + "step": 6677 + }, + { + "epoch": 1.0819831497083603, + "grad_norm": 0.9236080646514893, + "learning_rate": 3.698401593555565e-06, + "loss": 0.117, + "step": 6678 + }, + { + "epoch": 1.082145171743357, + "grad_norm": 0.9156818985939026, + "learning_rate": 3.6980177892675735e-06, + "loss": 0.1183, + "step": 6679 + }, + { + "epoch": 1.082307193778354, + "grad_norm": 0.8869015574455261, + "learning_rate": 3.697633948323136e-06, + "loss": 0.103, + "step": 6680 + }, + { + "epoch": 1.0824692158133506, + "grad_norm": 0.7443458437919617, + "learning_rate": 3.6972500707339986e-06, + "loss": 0.0958, + "step": 6681 + }, + { + "epoch": 1.0826312378483474, + "grad_norm": 0.8404899835586548, + "learning_rate": 3.6968661565119062e-06, + "loss": 0.1051, + "step": 6682 + }, + { + "epoch": 1.082793259883344, + "grad_norm": 0.8908175826072693, + "learning_rate": 3.6964822056686057e-06, + "loss": 0.108, + "step": 6683 + }, + { + "epoch": 1.0829552819183408, + "grad_norm": 0.8125652074813843, + "learning_rate": 3.6960982182158458e-06, + "loss": 0.1058, + "step": 6684 + }, + { + "epoch": 1.0831173039533377, + "grad_norm": 1.0027449131011963, + "learning_rate": 3.695714194165374e-06, + "loss": 0.1219, + "step": 6685 + }, + { + "epoch": 1.0832793259883344, + "grad_norm": 0.8607674837112427, + "learning_rate": 3.6953301335289415e-06, + "loss": 0.1124, + "step": 6686 + }, + { + "epoch": 1.0834413480233311, + "grad_norm": 0.8296065926551819, + "learning_rate": 3.6949460363183e-06, + "loss": 0.114, + "step": 6687 + }, + { + "epoch": 1.0836033700583279, + "grad_norm": 0.923507571220398, + "learning_rate": 3.6945619025452006e-06, + "loss": 0.1124, + "step": 6688 + }, + { + "epoch": 1.0837653920933248, + "grad_norm": 0.8684310913085938, + "learning_rate": 3.694177732221399e-06, + "loss": 0.1112, + "step": 6689 + }, + { + "epoch": 1.0839274141283215, + "grad_norm": 0.7922837138175964, + "learning_rate": 3.6937935253586475e-06, + "loss": 0.1036, + "step": 6690 + }, + { + "epoch": 1.0840894361633182, + "grad_norm": 0.7880176305770874, + "learning_rate": 3.693409281968704e-06, + "loss": 0.0899, + "step": 6691 + }, + { + "epoch": 1.084251458198315, + "grad_norm": 0.8277053833007812, + "learning_rate": 3.6930250020633237e-06, + "loss": 0.1015, + "step": 6692 + }, + { + "epoch": 1.0844134802333116, + "grad_norm": 0.8327934145927429, + "learning_rate": 3.692640685654266e-06, + "loss": 0.1002, + "step": 6693 + }, + { + "epoch": 1.0845755022683086, + "grad_norm": 0.8471828699111938, + "learning_rate": 3.692256332753289e-06, + "loss": 0.1098, + "step": 6694 + }, + { + "epoch": 1.0847375243033053, + "grad_norm": 0.8789288401603699, + "learning_rate": 3.691871943372154e-06, + "loss": 0.1028, + "step": 6695 + }, + { + "epoch": 1.084899546338302, + "grad_norm": 0.8185938000679016, + "learning_rate": 3.691487517522621e-06, + "loss": 0.1081, + "step": 6696 + }, + { + "epoch": 1.0850615683732987, + "grad_norm": 0.8919969797134399, + "learning_rate": 3.691103055216454e-06, + "loss": 0.1175, + "step": 6697 + }, + { + "epoch": 1.0852235904082956, + "grad_norm": 0.8887705206871033, + "learning_rate": 3.690718556465416e-06, + "loss": 0.1059, + "step": 6698 + }, + { + "epoch": 1.0853856124432923, + "grad_norm": 0.8780819773674011, + "learning_rate": 3.690334021281271e-06, + "loss": 0.0972, + "step": 6699 + }, + { + "epoch": 1.085547634478289, + "grad_norm": 0.9211699962615967, + "learning_rate": 3.689949449675786e-06, + "loss": 0.1204, + "step": 6700 + }, + { + "epoch": 1.0857096565132858, + "grad_norm": 0.8538796901702881, + "learning_rate": 3.6895648416607273e-06, + "loss": 0.1136, + "step": 6701 + }, + { + "epoch": 1.0858716785482825, + "grad_norm": 0.9533454775810242, + "learning_rate": 3.689180197247863e-06, + "loss": 0.1214, + "step": 6702 + }, + { + "epoch": 1.0860337005832794, + "grad_norm": 0.7428584098815918, + "learning_rate": 3.6887955164489626e-06, + "loss": 0.0825, + "step": 6703 + }, + { + "epoch": 1.0861957226182761, + "grad_norm": 0.8387787342071533, + "learning_rate": 3.688410799275796e-06, + "loss": 0.105, + "step": 6704 + }, + { + "epoch": 1.0863577446532728, + "grad_norm": 0.9467548131942749, + "learning_rate": 3.6880260457401353e-06, + "loss": 0.1085, + "step": 6705 + }, + { + "epoch": 1.0865197666882696, + "grad_norm": 0.88310307264328, + "learning_rate": 3.6876412558537524e-06, + "loss": 0.1143, + "step": 6706 + }, + { + "epoch": 1.0866817887232663, + "grad_norm": 0.8482463955879211, + "learning_rate": 3.6872564296284214e-06, + "loss": 0.1039, + "step": 6707 + }, + { + "epoch": 1.0868438107582632, + "grad_norm": 0.8076966404914856, + "learning_rate": 3.686871567075916e-06, + "loss": 0.1022, + "step": 6708 + }, + { + "epoch": 1.08700583279326, + "grad_norm": 0.8349097967147827, + "learning_rate": 3.686486668208013e-06, + "loss": 0.1116, + "step": 6709 + }, + { + "epoch": 1.0871678548282566, + "grad_norm": 0.8799702525138855, + "learning_rate": 3.6861017330364897e-06, + "loss": 0.1052, + "step": 6710 + }, + { + "epoch": 1.0873298768632533, + "grad_norm": 1.1798293590545654, + "learning_rate": 3.685716761573123e-06, + "loss": 0.1036, + "step": 6711 + }, + { + "epoch": 1.0874918988982503, + "grad_norm": 0.907839298248291, + "learning_rate": 3.685331753829693e-06, + "loss": 0.1081, + "step": 6712 + }, + { + "epoch": 1.087653920933247, + "grad_norm": 0.8660617470741272, + "learning_rate": 3.68494670981798e-06, + "loss": 0.1047, + "step": 6713 + }, + { + "epoch": 1.0878159429682437, + "grad_norm": 0.8517418503761292, + "learning_rate": 3.684561629549765e-06, + "loss": 0.0974, + "step": 6714 + }, + { + "epoch": 1.0879779650032404, + "grad_norm": 0.862423300743103, + "learning_rate": 3.684176513036831e-06, + "loss": 0.1084, + "step": 6715 + }, + { + "epoch": 1.088139987038237, + "grad_norm": 0.6887312531471252, + "learning_rate": 3.6837913602909615e-06, + "loss": 0.0862, + "step": 6716 + }, + { + "epoch": 1.088302009073234, + "grad_norm": 0.7741922736167908, + "learning_rate": 3.683406171323941e-06, + "loss": 0.0962, + "step": 6717 + }, + { + "epoch": 1.0884640311082308, + "grad_norm": 0.8818541169166565, + "learning_rate": 3.6830209461475554e-06, + "loss": 0.1033, + "step": 6718 + }, + { + "epoch": 1.0886260531432275, + "grad_norm": 0.8155554533004761, + "learning_rate": 3.682635684773591e-06, + "loss": 0.1041, + "step": 6719 + }, + { + "epoch": 1.0887880751782242, + "grad_norm": 0.86274254322052, + "learning_rate": 3.6822503872138377e-06, + "loss": 0.1046, + "step": 6720 + }, + { + "epoch": 1.0889500972132211, + "grad_norm": 0.8824654817581177, + "learning_rate": 3.681865053480082e-06, + "loss": 0.1147, + "step": 6721 + }, + { + "epoch": 1.0891121192482178, + "grad_norm": 0.7932639122009277, + "learning_rate": 3.6814796835841172e-06, + "loss": 0.1008, + "step": 6722 + }, + { + "epoch": 1.0892741412832145, + "grad_norm": 0.7773560285568237, + "learning_rate": 3.681094277537732e-06, + "loss": 0.1036, + "step": 6723 + }, + { + "epoch": 1.0894361633182112, + "grad_norm": 0.7581130862236023, + "learning_rate": 3.6807088353527216e-06, + "loss": 0.0872, + "step": 6724 + }, + { + "epoch": 1.089598185353208, + "grad_norm": 0.8676707148551941, + "learning_rate": 3.680323357040877e-06, + "loss": 0.1011, + "step": 6725 + }, + { + "epoch": 1.089760207388205, + "grad_norm": 0.8844422101974487, + "learning_rate": 3.6799378426139942e-06, + "loss": 0.1222, + "step": 6726 + }, + { + "epoch": 1.0899222294232016, + "grad_norm": 0.7242690920829773, + "learning_rate": 3.679552292083869e-06, + "loss": 0.0894, + "step": 6727 + }, + { + "epoch": 1.0900842514581983, + "grad_norm": 0.7637506127357483, + "learning_rate": 3.679166705462298e-06, + "loss": 0.0938, + "step": 6728 + }, + { + "epoch": 1.090246273493195, + "grad_norm": 0.9128175973892212, + "learning_rate": 3.67878108276108e-06, + "loss": 0.1092, + "step": 6729 + }, + { + "epoch": 1.0904082955281917, + "grad_norm": 0.7904967069625854, + "learning_rate": 3.6783954239920118e-06, + "loss": 0.0979, + "step": 6730 + }, + { + "epoch": 1.0905703175631887, + "grad_norm": 0.827260434627533, + "learning_rate": 3.678009729166897e-06, + "loss": 0.1045, + "step": 6731 + }, + { + "epoch": 1.0907323395981854, + "grad_norm": 0.7866484522819519, + "learning_rate": 3.677623998297534e-06, + "loss": 0.0973, + "step": 6732 + }, + { + "epoch": 1.090894361633182, + "grad_norm": 0.8426867723464966, + "learning_rate": 3.677238231395727e-06, + "loss": 0.1104, + "step": 6733 + }, + { + "epoch": 1.0910563836681788, + "grad_norm": 0.7677156329154968, + "learning_rate": 3.6768524284732794e-06, + "loss": 0.0942, + "step": 6734 + }, + { + "epoch": 1.0912184057031755, + "grad_norm": 1.0549900531768799, + "learning_rate": 3.676466589541995e-06, + "loss": 0.1023, + "step": 6735 + }, + { + "epoch": 1.0913804277381725, + "grad_norm": 0.7646496295928955, + "learning_rate": 3.6760807146136796e-06, + "loss": 0.0946, + "step": 6736 + }, + { + "epoch": 1.0915424497731692, + "grad_norm": 0.8362929224967957, + "learning_rate": 3.6756948037001406e-06, + "loss": 0.1068, + "step": 6737 + }, + { + "epoch": 1.0917044718081659, + "grad_norm": 0.940573513507843, + "learning_rate": 3.675308856813186e-06, + "loss": 0.1196, + "step": 6738 + }, + { + "epoch": 1.0918664938431626, + "grad_norm": 0.9485230445861816, + "learning_rate": 3.674922873964625e-06, + "loss": 0.1154, + "step": 6739 + }, + { + "epoch": 1.0920285158781595, + "grad_norm": 0.8467913866043091, + "learning_rate": 3.6745368551662663e-06, + "loss": 0.0991, + "step": 6740 + }, + { + "epoch": 1.0921905379131562, + "grad_norm": 0.8293401002883911, + "learning_rate": 3.6741508004299227e-06, + "loss": 0.1055, + "step": 6741 + }, + { + "epoch": 1.092352559948153, + "grad_norm": 0.9706871509552002, + "learning_rate": 3.6737647097674056e-06, + "loss": 0.1058, + "step": 6742 + }, + { + "epoch": 1.0925145819831497, + "grad_norm": 0.9424450993537903, + "learning_rate": 3.673378583190529e-06, + "loss": 0.1142, + "step": 6743 + }, + { + "epoch": 1.0926766040181464, + "grad_norm": 0.8551537990570068, + "learning_rate": 3.6729924207111077e-06, + "loss": 0.1007, + "step": 6744 + }, + { + "epoch": 1.0928386260531433, + "grad_norm": 0.8865105509757996, + "learning_rate": 3.6726062223409563e-06, + "loss": 0.1095, + "step": 6745 + }, + { + "epoch": 1.09300064808814, + "grad_norm": 0.7189201712608337, + "learning_rate": 3.6722199880918928e-06, + "loss": 0.091, + "step": 6746 + }, + { + "epoch": 1.0931626701231367, + "grad_norm": 0.8280571699142456, + "learning_rate": 3.6718337179757336e-06, + "loss": 0.1077, + "step": 6747 + }, + { + "epoch": 1.0933246921581334, + "grad_norm": 0.9577053189277649, + "learning_rate": 3.6714474120042993e-06, + "loss": 0.1168, + "step": 6748 + }, + { + "epoch": 1.0934867141931304, + "grad_norm": 0.8207298517227173, + "learning_rate": 3.6710610701894088e-06, + "loss": 0.1025, + "step": 6749 + }, + { + "epoch": 1.093648736228127, + "grad_norm": 0.78386390209198, + "learning_rate": 3.6706746925428833e-06, + "loss": 0.1061, + "step": 6750 + }, + { + "epoch": 1.0938107582631238, + "grad_norm": 0.799771249294281, + "learning_rate": 3.6702882790765453e-06, + "loss": 0.0973, + "step": 6751 + }, + { + "epoch": 1.0939727802981205, + "grad_norm": 0.824006199836731, + "learning_rate": 3.6699018298022173e-06, + "loss": 0.1105, + "step": 6752 + }, + { + "epoch": 1.0941348023331172, + "grad_norm": 0.8404650092124939, + "learning_rate": 3.6695153447317254e-06, + "loss": 0.1, + "step": 6753 + }, + { + "epoch": 1.0942968243681142, + "grad_norm": 0.7487378120422363, + "learning_rate": 3.6691288238768928e-06, + "loss": 0.0998, + "step": 6754 + }, + { + "epoch": 1.0944588464031109, + "grad_norm": 0.9007918834686279, + "learning_rate": 3.6687422672495493e-06, + "loss": 0.1164, + "step": 6755 + }, + { + "epoch": 1.0946208684381076, + "grad_norm": 0.7542750835418701, + "learning_rate": 3.6683556748615196e-06, + "loss": 0.0965, + "step": 6756 + }, + { + "epoch": 1.0947828904731043, + "grad_norm": 0.8000967502593994, + "learning_rate": 3.6679690467246337e-06, + "loss": 0.1096, + "step": 6757 + }, + { + "epoch": 1.094944912508101, + "grad_norm": 0.7331892251968384, + "learning_rate": 3.667582382850721e-06, + "loss": 0.0891, + "step": 6758 + }, + { + "epoch": 1.095106934543098, + "grad_norm": 0.790254533290863, + "learning_rate": 3.6671956832516136e-06, + "loss": 0.097, + "step": 6759 + }, + { + "epoch": 1.0952689565780946, + "grad_norm": 0.7792706489562988, + "learning_rate": 3.6668089479391433e-06, + "loss": 0.0989, + "step": 6760 + }, + { + "epoch": 1.0954309786130914, + "grad_norm": 0.8442437648773193, + "learning_rate": 3.6664221769251414e-06, + "loss": 0.1006, + "step": 6761 + }, + { + "epoch": 1.095593000648088, + "grad_norm": 0.928298830986023, + "learning_rate": 3.666035370221445e-06, + "loss": 0.103, + "step": 6762 + }, + { + "epoch": 1.095755022683085, + "grad_norm": 0.8222261071205139, + "learning_rate": 3.6656485278398866e-06, + "loss": 0.1058, + "step": 6763 + }, + { + "epoch": 1.0959170447180817, + "grad_norm": 0.7384744882583618, + "learning_rate": 3.665261649792305e-06, + "loss": 0.0896, + "step": 6764 + }, + { + "epoch": 1.0960790667530784, + "grad_norm": 0.836329460144043, + "learning_rate": 3.664874736090537e-06, + "loss": 0.1059, + "step": 6765 + }, + { + "epoch": 1.0962410887880751, + "grad_norm": 1.0209158658981323, + "learning_rate": 3.66448778674642e-06, + "loss": 0.1111, + "step": 6766 + }, + { + "epoch": 1.0964031108230718, + "grad_norm": 0.7318083047866821, + "learning_rate": 3.6641008017717957e-06, + "loss": 0.0875, + "step": 6767 + }, + { + "epoch": 1.0965651328580688, + "grad_norm": 0.7760915160179138, + "learning_rate": 3.663713781178504e-06, + "loss": 0.0986, + "step": 6768 + }, + { + "epoch": 1.0967271548930655, + "grad_norm": 0.7997096180915833, + "learning_rate": 3.6633267249783854e-06, + "loss": 0.0943, + "step": 6769 + }, + { + "epoch": 1.0968891769280622, + "grad_norm": 0.8263211846351624, + "learning_rate": 3.6629396331832854e-06, + "loss": 0.0919, + "step": 6770 + }, + { + "epoch": 1.097051198963059, + "grad_norm": 0.9232663512229919, + "learning_rate": 3.6625525058050467e-06, + "loss": 0.1063, + "step": 6771 + }, + { + "epoch": 1.0972132209980558, + "grad_norm": 0.9478903412818909, + "learning_rate": 3.6621653428555144e-06, + "loss": 0.1076, + "step": 6772 + }, + { + "epoch": 1.0973752430330526, + "grad_norm": 0.8772745132446289, + "learning_rate": 3.661778144346535e-06, + "loss": 0.1131, + "step": 6773 + }, + { + "epoch": 1.0975372650680493, + "grad_norm": 0.8307527899742126, + "learning_rate": 3.661390910289956e-06, + "loss": 0.1062, + "step": 6774 + }, + { + "epoch": 1.097699287103046, + "grad_norm": 0.8453426361083984, + "learning_rate": 3.661003640697626e-06, + "loss": 0.0974, + "step": 6775 + }, + { + "epoch": 1.0978613091380427, + "grad_norm": 1.1416118144989014, + "learning_rate": 3.6606163355813935e-06, + "loss": 0.1092, + "step": 6776 + }, + { + "epoch": 1.0980233311730396, + "grad_norm": 0.8578381538391113, + "learning_rate": 3.66022899495311e-06, + "loss": 0.1045, + "step": 6777 + }, + { + "epoch": 1.0981853532080363, + "grad_norm": 0.7609858512878418, + "learning_rate": 3.6598416188246265e-06, + "loss": 0.0962, + "step": 6778 + }, + { + "epoch": 1.098347375243033, + "grad_norm": 0.8034330606460571, + "learning_rate": 3.659454207207798e-06, + "loss": 0.1032, + "step": 6779 + }, + { + "epoch": 1.0985093972780298, + "grad_norm": 0.9241167306900024, + "learning_rate": 3.6590667601144748e-06, + "loss": 0.1066, + "step": 6780 + }, + { + "epoch": 1.0986714193130265, + "grad_norm": 0.7677343487739563, + "learning_rate": 3.6586792775565137e-06, + "loss": 0.0913, + "step": 6781 + }, + { + "epoch": 1.0988334413480234, + "grad_norm": 1.0235810279846191, + "learning_rate": 3.6582917595457718e-06, + "loss": 0.1226, + "step": 6782 + }, + { + "epoch": 1.0989954633830201, + "grad_norm": 0.7913617491722107, + "learning_rate": 3.657904206094104e-06, + "loss": 0.1079, + "step": 6783 + }, + { + "epoch": 1.0991574854180168, + "grad_norm": 0.7789272665977478, + "learning_rate": 3.6575166172133703e-06, + "loss": 0.0998, + "step": 6784 + }, + { + "epoch": 1.0993195074530135, + "grad_norm": 0.7309193015098572, + "learning_rate": 3.657128992915428e-06, + "loss": 0.0935, + "step": 6785 + }, + { + "epoch": 1.0994815294880103, + "grad_norm": 0.9271590113639832, + "learning_rate": 3.6567413332121402e-06, + "loss": 0.1065, + "step": 6786 + }, + { + "epoch": 1.0996435515230072, + "grad_norm": 0.8196094632148743, + "learning_rate": 3.6563536381153663e-06, + "loss": 0.1061, + "step": 6787 + }, + { + "epoch": 1.099805573558004, + "grad_norm": 0.8200319409370422, + "learning_rate": 3.655965907636969e-06, + "loss": 0.1075, + "step": 6788 + }, + { + "epoch": 1.0999675955930006, + "grad_norm": 0.8080103993415833, + "learning_rate": 3.6555781417888126e-06, + "loss": 0.106, + "step": 6789 + }, + { + "epoch": 1.1001296176279973, + "grad_norm": 0.8300761580467224, + "learning_rate": 3.6551903405827615e-06, + "loss": 0.111, + "step": 6790 + }, + { + "epoch": 1.1002916396629943, + "grad_norm": 0.887389600276947, + "learning_rate": 3.6548025040306816e-06, + "loss": 0.1125, + "step": 6791 + }, + { + "epoch": 1.100453661697991, + "grad_norm": 0.6711512207984924, + "learning_rate": 3.6544146321444397e-06, + "loss": 0.0879, + "step": 6792 + }, + { + "epoch": 1.1006156837329877, + "grad_norm": 0.7568984627723694, + "learning_rate": 3.654026724935904e-06, + "loss": 0.0932, + "step": 6793 + }, + { + "epoch": 1.1007777057679844, + "grad_norm": 0.8204541206359863, + "learning_rate": 3.653638782416943e-06, + "loss": 0.1042, + "step": 6794 + }, + { + "epoch": 1.100939727802981, + "grad_norm": 0.8643372058868408, + "learning_rate": 3.6532508045994262e-06, + "loss": 0.1071, + "step": 6795 + }, + { + "epoch": 1.101101749837978, + "grad_norm": 0.8105908036231995, + "learning_rate": 3.6528627914952263e-06, + "loss": 0.094, + "step": 6796 + }, + { + "epoch": 1.1012637718729748, + "grad_norm": 0.8288483023643494, + "learning_rate": 3.6524747431162148e-06, + "loss": 0.1059, + "step": 6797 + }, + { + "epoch": 1.1014257939079715, + "grad_norm": 0.8761540651321411, + "learning_rate": 3.652086659474265e-06, + "loss": 0.1127, + "step": 6798 + }, + { + "epoch": 1.1015878159429682, + "grad_norm": 0.8513060808181763, + "learning_rate": 3.651698540581252e-06, + "loss": 0.1, + "step": 6799 + }, + { + "epoch": 1.101749837977965, + "grad_norm": 0.791142463684082, + "learning_rate": 3.6513103864490497e-06, + "loss": 0.1036, + "step": 6800 + }, + { + "epoch": 1.1019118600129618, + "grad_norm": 0.7884705662727356, + "learning_rate": 3.6509221970895365e-06, + "loss": 0.0956, + "step": 6801 + }, + { + "epoch": 1.1020738820479585, + "grad_norm": 0.8608754277229309, + "learning_rate": 3.650533972514589e-06, + "loss": 0.1153, + "step": 6802 + }, + { + "epoch": 1.1022359040829552, + "grad_norm": 0.7507930397987366, + "learning_rate": 3.6501457127360863e-06, + "loss": 0.0989, + "step": 6803 + }, + { + "epoch": 1.102397926117952, + "grad_norm": 0.9590283036231995, + "learning_rate": 3.6497574177659073e-06, + "loss": 0.1085, + "step": 6804 + }, + { + "epoch": 1.1025599481529489, + "grad_norm": 0.805808424949646, + "learning_rate": 3.6493690876159343e-06, + "loss": 0.0971, + "step": 6805 + }, + { + "epoch": 1.1027219701879456, + "grad_norm": 0.7817717790603638, + "learning_rate": 3.6489807222980487e-06, + "loss": 0.0987, + "step": 6806 + }, + { + "epoch": 1.1028839922229423, + "grad_norm": 0.8280859589576721, + "learning_rate": 3.648592321824133e-06, + "loss": 0.1153, + "step": 6807 + }, + { + "epoch": 1.103046014257939, + "grad_norm": 0.9206019639968872, + "learning_rate": 3.648203886206073e-06, + "loss": 0.1135, + "step": 6808 + }, + { + "epoch": 1.1032080362929357, + "grad_norm": 0.8165472745895386, + "learning_rate": 3.647815415455751e-06, + "loss": 0.1011, + "step": 6809 + }, + { + "epoch": 1.1033700583279327, + "grad_norm": 0.8732919096946716, + "learning_rate": 3.6474269095850568e-06, + "loss": 0.1107, + "step": 6810 + }, + { + "epoch": 1.1035320803629294, + "grad_norm": 0.9100021719932556, + "learning_rate": 3.647038368605875e-06, + "loss": 0.1159, + "step": 6811 + }, + { + "epoch": 1.103694102397926, + "grad_norm": 0.8154706358909607, + "learning_rate": 3.646649792530094e-06, + "loss": 0.1027, + "step": 6812 + }, + { + "epoch": 1.1038561244329228, + "grad_norm": 0.8055473566055298, + "learning_rate": 3.646261181369605e-06, + "loss": 0.1039, + "step": 6813 + }, + { + "epoch": 1.1040181464679197, + "grad_norm": 0.7719459533691406, + "learning_rate": 3.645872535136298e-06, + "loss": 0.0941, + "step": 6814 + }, + { + "epoch": 1.1041801685029164, + "grad_norm": 0.8571450710296631, + "learning_rate": 3.6454838538420645e-06, + "loss": 0.1105, + "step": 6815 + }, + { + "epoch": 1.1043421905379132, + "grad_norm": 0.8617749214172363, + "learning_rate": 3.6450951374987958e-06, + "loss": 0.1069, + "step": 6816 + }, + { + "epoch": 1.1045042125729099, + "grad_norm": 0.7611509561538696, + "learning_rate": 3.6447063861183886e-06, + "loss": 0.1015, + "step": 6817 + }, + { + "epoch": 1.1046662346079066, + "grad_norm": 0.7744086980819702, + "learning_rate": 3.6443175997127354e-06, + "loss": 0.094, + "step": 6818 + }, + { + "epoch": 1.1048282566429035, + "grad_norm": 0.7558098435401917, + "learning_rate": 3.6439287782937328e-06, + "loss": 0.0985, + "step": 6819 + }, + { + "epoch": 1.1049902786779002, + "grad_norm": 0.8518016934394836, + "learning_rate": 3.6435399218732776e-06, + "loss": 0.1097, + "step": 6820 + }, + { + "epoch": 1.105152300712897, + "grad_norm": 0.789505660533905, + "learning_rate": 3.6431510304632683e-06, + "loss": 0.1019, + "step": 6821 + }, + { + "epoch": 1.1053143227478937, + "grad_norm": 0.9358975291252136, + "learning_rate": 3.642762104075604e-06, + "loss": 0.117, + "step": 6822 + }, + { + "epoch": 1.1054763447828906, + "grad_norm": 0.9172407388687134, + "learning_rate": 3.642373142722185e-06, + "loss": 0.1055, + "step": 6823 + }, + { + "epoch": 1.1056383668178873, + "grad_norm": 0.8525999784469604, + "learning_rate": 3.641984146414912e-06, + "loss": 0.1054, + "step": 6824 + }, + { + "epoch": 1.105800388852884, + "grad_norm": 0.8727070093154907, + "learning_rate": 3.6415951151656874e-06, + "loss": 0.108, + "step": 6825 + }, + { + "epoch": 1.1059624108878807, + "grad_norm": 0.8490038514137268, + "learning_rate": 3.6412060489864155e-06, + "loss": 0.1079, + "step": 6826 + }, + { + "epoch": 1.1061244329228774, + "grad_norm": 0.8071814179420471, + "learning_rate": 3.6408169478889997e-06, + "loss": 0.1039, + "step": 6827 + }, + { + "epoch": 1.1062864549578744, + "grad_norm": 0.8197512030601501, + "learning_rate": 3.640427811885346e-06, + "loss": 0.0976, + "step": 6828 + }, + { + "epoch": 1.106448476992871, + "grad_norm": 0.8252851963043213, + "learning_rate": 3.640038640987361e-06, + "loss": 0.0959, + "step": 6829 + }, + { + "epoch": 1.1066104990278678, + "grad_norm": 0.9495880603790283, + "learning_rate": 3.639649435206953e-06, + "loss": 0.11, + "step": 6830 + }, + { + "epoch": 1.1067725210628645, + "grad_norm": 0.9515889286994934, + "learning_rate": 3.639260194556029e-06, + "loss": 0.1006, + "step": 6831 + }, + { + "epoch": 1.1069345430978612, + "grad_norm": 0.9411259889602661, + "learning_rate": 3.6388709190465018e-06, + "loss": 0.1134, + "step": 6832 + }, + { + "epoch": 1.1070965651328581, + "grad_norm": 1.127980351448059, + "learning_rate": 3.638481608690279e-06, + "loss": 0.1175, + "step": 6833 + }, + { + "epoch": 1.1072585871678549, + "grad_norm": 0.776599109172821, + "learning_rate": 3.638092263499274e-06, + "loss": 0.0971, + "step": 6834 + }, + { + "epoch": 1.1074206092028516, + "grad_norm": 1.0031170845031738, + "learning_rate": 3.637702883485401e-06, + "loss": 0.1137, + "step": 6835 + }, + { + "epoch": 1.1075826312378483, + "grad_norm": 0.817516028881073, + "learning_rate": 3.6373134686605722e-06, + "loss": 0.1019, + "step": 6836 + }, + { + "epoch": 1.107744653272845, + "grad_norm": 0.7830043435096741, + "learning_rate": 3.636924019036704e-06, + "loss": 0.0928, + "step": 6837 + }, + { + "epoch": 1.107906675307842, + "grad_norm": 0.7438381314277649, + "learning_rate": 3.6365345346257112e-06, + "loss": 0.0897, + "step": 6838 + }, + { + "epoch": 1.1080686973428386, + "grad_norm": 0.8136735558509827, + "learning_rate": 3.636145015439513e-06, + "loss": 0.1014, + "step": 6839 + }, + { + "epoch": 1.1082307193778353, + "grad_norm": 0.8344704508781433, + "learning_rate": 3.635755461490026e-06, + "loss": 0.0962, + "step": 6840 + }, + { + "epoch": 1.108392741412832, + "grad_norm": 1.0317944288253784, + "learning_rate": 3.635365872789171e-06, + "loss": 0.1298, + "step": 6841 + }, + { + "epoch": 1.108554763447829, + "grad_norm": 0.8629235029220581, + "learning_rate": 3.634976249348867e-06, + "loss": 0.1053, + "step": 6842 + }, + { + "epoch": 1.1087167854828257, + "grad_norm": 0.8366121649742126, + "learning_rate": 3.6345865911810373e-06, + "loss": 0.1031, + "step": 6843 + }, + { + "epoch": 1.1088788075178224, + "grad_norm": 0.7734202146530151, + "learning_rate": 3.6341968982976027e-06, + "loss": 0.0979, + "step": 6844 + }, + { + "epoch": 1.1090408295528191, + "grad_norm": 0.9085724353790283, + "learning_rate": 3.633807170710488e-06, + "loss": 0.1118, + "step": 6845 + }, + { + "epoch": 1.1092028515878158, + "grad_norm": 0.8711262941360474, + "learning_rate": 3.6334174084316186e-06, + "loss": 0.1035, + "step": 6846 + }, + { + "epoch": 1.1093648736228128, + "grad_norm": 0.7932325005531311, + "learning_rate": 3.6330276114729185e-06, + "loss": 0.1024, + "step": 6847 + }, + { + "epoch": 1.1095268956578095, + "grad_norm": 0.8877636194229126, + "learning_rate": 3.632637779846315e-06, + "loss": 0.0966, + "step": 6848 + }, + { + "epoch": 1.1096889176928062, + "grad_norm": 0.7809078693389893, + "learning_rate": 3.6322479135637366e-06, + "loss": 0.1062, + "step": 6849 + }, + { + "epoch": 1.109850939727803, + "grad_norm": 0.7734099626541138, + "learning_rate": 3.6318580126371124e-06, + "loss": 0.0957, + "step": 6850 + }, + { + "epoch": 1.1100129617627998, + "grad_norm": 0.811259925365448, + "learning_rate": 3.6314680770783717e-06, + "loss": 0.108, + "step": 6851 + }, + { + "epoch": 1.1101749837977966, + "grad_norm": 0.9896193742752075, + "learning_rate": 3.631078106899446e-06, + "loss": 0.1288, + "step": 6852 + }, + { + "epoch": 1.1103370058327933, + "grad_norm": 0.7642198801040649, + "learning_rate": 3.6306881021122675e-06, + "loss": 0.0916, + "step": 6853 + }, + { + "epoch": 1.11049902786779, + "grad_norm": 0.774316668510437, + "learning_rate": 3.630298062728769e-06, + "loss": 0.1009, + "step": 6854 + }, + { + "epoch": 1.1106610499027867, + "grad_norm": 0.9205217361450195, + "learning_rate": 3.629907988760886e-06, + "loss": 0.1038, + "step": 6855 + }, + { + "epoch": 1.1108230719377836, + "grad_norm": 0.8705753087997437, + "learning_rate": 3.6295178802205515e-06, + "loss": 0.1142, + "step": 6856 + }, + { + "epoch": 1.1109850939727803, + "grad_norm": 0.8856770992279053, + "learning_rate": 3.6291277371197042e-06, + "loss": 0.1111, + "step": 6857 + }, + { + "epoch": 1.111147116007777, + "grad_norm": 0.7869232296943665, + "learning_rate": 3.62873755947028e-06, + "loss": 0.1016, + "step": 6858 + }, + { + "epoch": 1.1113091380427738, + "grad_norm": 0.8357443809509277, + "learning_rate": 3.628347347284218e-06, + "loss": 0.1095, + "step": 6859 + }, + { + "epoch": 1.1114711600777705, + "grad_norm": 0.8363984227180481, + "learning_rate": 3.6279571005734583e-06, + "loss": 0.0984, + "step": 6860 + }, + { + "epoch": 1.1116331821127674, + "grad_norm": 0.9447827339172363, + "learning_rate": 3.627566819349941e-06, + "loss": 0.1097, + "step": 6861 + }, + { + "epoch": 1.1117952041477641, + "grad_norm": 0.9603884220123291, + "learning_rate": 3.6271765036256064e-06, + "loss": 0.1168, + "step": 6862 + }, + { + "epoch": 1.1119572261827608, + "grad_norm": 0.7766293883323669, + "learning_rate": 3.6267861534124e-06, + "loss": 0.0919, + "step": 6863 + }, + { + "epoch": 1.1121192482177575, + "grad_norm": 0.6767370700836182, + "learning_rate": 3.6263957687222633e-06, + "loss": 0.0832, + "step": 6864 + }, + { + "epoch": 1.1122812702527545, + "grad_norm": 0.8346736431121826, + "learning_rate": 3.6260053495671423e-06, + "loss": 0.1018, + "step": 6865 + }, + { + "epoch": 1.1124432922877512, + "grad_norm": 0.871659517288208, + "learning_rate": 3.625614895958982e-06, + "loss": 0.1006, + "step": 6866 + }, + { + "epoch": 1.112605314322748, + "grad_norm": 0.6866405010223389, + "learning_rate": 3.6252244079097296e-06, + "loss": 0.0874, + "step": 6867 + }, + { + "epoch": 1.1127673363577446, + "grad_norm": 0.8131086230278015, + "learning_rate": 3.624833885431334e-06, + "loss": 0.1062, + "step": 6868 + }, + { + "epoch": 1.1129293583927413, + "grad_norm": 0.9282816052436829, + "learning_rate": 3.6244433285357433e-06, + "loss": 0.1105, + "step": 6869 + }, + { + "epoch": 1.1130913804277383, + "grad_norm": 0.7851607799530029, + "learning_rate": 3.624052737234908e-06, + "loss": 0.1028, + "step": 6870 + }, + { + "epoch": 1.113253402462735, + "grad_norm": 0.7424038648605347, + "learning_rate": 3.623662111540779e-06, + "loss": 0.0843, + "step": 6871 + }, + { + "epoch": 1.1134154244977317, + "grad_norm": 0.8808372616767883, + "learning_rate": 3.6232714514653082e-06, + "loss": 0.1105, + "step": 6872 + }, + { + "epoch": 1.1135774465327284, + "grad_norm": 0.9049409627914429, + "learning_rate": 3.62288075702045e-06, + "loss": 0.1022, + "step": 6873 + }, + { + "epoch": 1.1137394685677253, + "grad_norm": 0.8752559423446655, + "learning_rate": 3.6224900282181574e-06, + "loss": 0.1034, + "step": 6874 + }, + { + "epoch": 1.113901490602722, + "grad_norm": 0.7354977130889893, + "learning_rate": 3.622099265070386e-06, + "loss": 0.0939, + "step": 6875 + }, + { + "epoch": 1.1140635126377187, + "grad_norm": 0.9659753441810608, + "learning_rate": 3.6217084675890935e-06, + "loss": 0.1085, + "step": 6876 + }, + { + "epoch": 1.1142255346727155, + "grad_norm": 0.8423113226890564, + "learning_rate": 3.6213176357862364e-06, + "loss": 0.1078, + "step": 6877 + }, + { + "epoch": 1.1143875567077122, + "grad_norm": 0.8920717239379883, + "learning_rate": 3.6209267696737723e-06, + "loss": 0.111, + "step": 6878 + }, + { + "epoch": 1.114549578742709, + "grad_norm": 0.9556100964546204, + "learning_rate": 3.6205358692636618e-06, + "loss": 0.1241, + "step": 6879 + }, + { + "epoch": 1.1147116007777058, + "grad_norm": 0.8358135223388672, + "learning_rate": 3.6201449345678657e-06, + "loss": 0.1113, + "step": 6880 + }, + { + "epoch": 1.1148736228127025, + "grad_norm": 0.9949773550033569, + "learning_rate": 3.6197539655983447e-06, + "loss": 0.1158, + "step": 6881 + }, + { + "epoch": 1.1150356448476992, + "grad_norm": 0.7863255739212036, + "learning_rate": 3.6193629623670627e-06, + "loss": 0.101, + "step": 6882 + }, + { + "epoch": 1.115197666882696, + "grad_norm": 0.8213348388671875, + "learning_rate": 3.6189719248859827e-06, + "loss": 0.0921, + "step": 6883 + }, + { + "epoch": 1.1153596889176929, + "grad_norm": 0.841783344745636, + "learning_rate": 3.6185808531670695e-06, + "loss": 0.1075, + "step": 6884 + }, + { + "epoch": 1.1155217109526896, + "grad_norm": 0.8018286228179932, + "learning_rate": 3.61818974722229e-06, + "loss": 0.1009, + "step": 6885 + }, + { + "epoch": 1.1156837329876863, + "grad_norm": 0.826497495174408, + "learning_rate": 3.617798607063609e-06, + "loss": 0.0976, + "step": 6886 + }, + { + "epoch": 1.115845755022683, + "grad_norm": 0.796943187713623, + "learning_rate": 3.6174074327029964e-06, + "loss": 0.1033, + "step": 6887 + }, + { + "epoch": 1.1160077770576797, + "grad_norm": 0.8021527528762817, + "learning_rate": 3.61701622415242e-06, + "loss": 0.1055, + "step": 6888 + }, + { + "epoch": 1.1161697990926767, + "grad_norm": 0.7664604187011719, + "learning_rate": 3.6166249814238503e-06, + "loss": 0.0922, + "step": 6889 + }, + { + "epoch": 1.1163318211276734, + "grad_norm": 0.7481946349143982, + "learning_rate": 3.616233704529259e-06, + "loss": 0.0954, + "step": 6890 + }, + { + "epoch": 1.11649384316267, + "grad_norm": 0.7271841168403625, + "learning_rate": 3.6158423934806164e-06, + "loss": 0.09, + "step": 6891 + }, + { + "epoch": 1.1166558651976668, + "grad_norm": 1.0113109350204468, + "learning_rate": 3.6154510482898973e-06, + "loss": 0.1189, + "step": 6892 + }, + { + "epoch": 1.1168178872326637, + "grad_norm": 0.7146120667457581, + "learning_rate": 3.6150596689690753e-06, + "loss": 0.0896, + "step": 6893 + }, + { + "epoch": 1.1169799092676604, + "grad_norm": 0.8736560344696045, + "learning_rate": 3.6146682555301266e-06, + "loss": 0.111, + "step": 6894 + }, + { + "epoch": 1.1171419313026572, + "grad_norm": 0.9498534202575684, + "learning_rate": 3.614276807985026e-06, + "loss": 0.1128, + "step": 6895 + }, + { + "epoch": 1.1173039533376539, + "grad_norm": 0.9466055631637573, + "learning_rate": 3.613885326345752e-06, + "loss": 0.1196, + "step": 6896 + }, + { + "epoch": 1.1174659753726506, + "grad_norm": 0.8748520016670227, + "learning_rate": 3.6134938106242823e-06, + "loss": 0.1088, + "step": 6897 + }, + { + "epoch": 1.1176279974076475, + "grad_norm": 1.0820103883743286, + "learning_rate": 3.6131022608325973e-06, + "loss": 0.1119, + "step": 6898 + }, + { + "epoch": 1.1177900194426442, + "grad_norm": 0.7906978726387024, + "learning_rate": 3.6127106769826763e-06, + "loss": 0.1029, + "step": 6899 + }, + { + "epoch": 1.117952041477641, + "grad_norm": 0.7698807120323181, + "learning_rate": 3.6123190590865e-06, + "loss": 0.0988, + "step": 6900 + }, + { + "epoch": 1.1181140635126376, + "grad_norm": 0.8202066421508789, + "learning_rate": 3.6119274071560545e-06, + "loss": 0.1104, + "step": 6901 + }, + { + "epoch": 1.1182760855476346, + "grad_norm": 0.969008207321167, + "learning_rate": 3.6115357212033196e-06, + "loss": 0.1047, + "step": 6902 + }, + { + "epoch": 1.1184381075826313, + "grad_norm": 0.7530080676078796, + "learning_rate": 3.611144001240282e-06, + "loss": 0.0888, + "step": 6903 + }, + { + "epoch": 1.118600129617628, + "grad_norm": 0.8667441010475159, + "learning_rate": 3.610752247278927e-06, + "loss": 0.1088, + "step": 6904 + }, + { + "epoch": 1.1187621516526247, + "grad_norm": 0.7977768778800964, + "learning_rate": 3.610360459331241e-06, + "loss": 0.1009, + "step": 6905 + }, + { + "epoch": 1.1189241736876214, + "grad_norm": 0.8574357032775879, + "learning_rate": 3.609968637409212e-06, + "loss": 0.1125, + "step": 6906 + }, + { + "epoch": 1.1190861957226184, + "grad_norm": 0.8181487321853638, + "learning_rate": 3.609576781524829e-06, + "loss": 0.1047, + "step": 6907 + }, + { + "epoch": 1.119248217757615, + "grad_norm": 0.8313112258911133, + "learning_rate": 3.6091848916900816e-06, + "loss": 0.1016, + "step": 6908 + }, + { + "epoch": 1.1194102397926118, + "grad_norm": 0.7930647730827332, + "learning_rate": 3.6087929679169603e-06, + "loss": 0.1034, + "step": 6909 + }, + { + "epoch": 1.1195722618276085, + "grad_norm": 0.9460997581481934, + "learning_rate": 3.6084010102174576e-06, + "loss": 0.1059, + "step": 6910 + }, + { + "epoch": 1.1197342838626052, + "grad_norm": 0.9013634920120239, + "learning_rate": 3.6080090186035664e-06, + "loss": 0.1091, + "step": 6911 + }, + { + "epoch": 1.1198963058976021, + "grad_norm": 0.905096173286438, + "learning_rate": 3.6076169930872805e-06, + "loss": 0.1161, + "step": 6912 + }, + { + "epoch": 1.1200583279325989, + "grad_norm": 0.7841644883155823, + "learning_rate": 3.607224933680595e-06, + "loss": 0.106, + "step": 6913 + }, + { + "epoch": 1.1202203499675956, + "grad_norm": 0.8874266743659973, + "learning_rate": 3.606832840395506e-06, + "loss": 0.1087, + "step": 6914 + }, + { + "epoch": 1.1203823720025923, + "grad_norm": 0.7959884405136108, + "learning_rate": 3.60644071324401e-06, + "loss": 0.1002, + "step": 6915 + }, + { + "epoch": 1.1205443940375892, + "grad_norm": 0.8270828127861023, + "learning_rate": 3.6060485522381067e-06, + "loss": 0.1056, + "step": 6916 + }, + { + "epoch": 1.120706416072586, + "grad_norm": 0.7726624011993408, + "learning_rate": 3.6056563573897927e-06, + "loss": 0.0999, + "step": 6917 + }, + { + "epoch": 1.1208684381075826, + "grad_norm": 0.9138970375061035, + "learning_rate": 3.605264128711072e-06, + "loss": 0.1154, + "step": 6918 + }, + { + "epoch": 1.1210304601425793, + "grad_norm": 0.8613343238830566, + "learning_rate": 3.604871866213942e-06, + "loss": 0.111, + "step": 6919 + }, + { + "epoch": 1.121192482177576, + "grad_norm": 0.855797290802002, + "learning_rate": 3.6044795699104074e-06, + "loss": 0.1185, + "step": 6920 + }, + { + "epoch": 1.121354504212573, + "grad_norm": 0.7498341202735901, + "learning_rate": 3.6040872398124705e-06, + "loss": 0.0892, + "step": 6921 + }, + { + "epoch": 1.1215165262475697, + "grad_norm": 0.7906498312950134, + "learning_rate": 3.6036948759321357e-06, + "loss": 0.0972, + "step": 6922 + }, + { + "epoch": 1.1216785482825664, + "grad_norm": 0.8063822388648987, + "learning_rate": 3.603302478281409e-06, + "loss": 0.1016, + "step": 6923 + }, + { + "epoch": 1.1218405703175631, + "grad_norm": 0.7980828881263733, + "learning_rate": 3.6029100468722954e-06, + "loss": 0.0919, + "step": 6924 + }, + { + "epoch": 1.12200259235256, + "grad_norm": 0.7965691685676575, + "learning_rate": 3.6025175817168046e-06, + "loss": 0.0889, + "step": 6925 + }, + { + "epoch": 1.1221646143875568, + "grad_norm": 0.8087196350097656, + "learning_rate": 3.602125082826944e-06, + "loss": 0.0985, + "step": 6926 + }, + { + "epoch": 1.1223266364225535, + "grad_norm": 0.9410887956619263, + "learning_rate": 3.6017325502147215e-06, + "loss": 0.101, + "step": 6927 + }, + { + "epoch": 1.1224886584575502, + "grad_norm": 0.8947740197181702, + "learning_rate": 3.60133998389215e-06, + "loss": 0.111, + "step": 6928 + }, + { + "epoch": 1.122650680492547, + "grad_norm": 0.7792215347290039, + "learning_rate": 3.6009473838712405e-06, + "loss": 0.0952, + "step": 6929 + }, + { + "epoch": 1.1228127025275438, + "grad_norm": 0.9168524146080017, + "learning_rate": 3.600554750164005e-06, + "loss": 0.1141, + "step": 6930 + }, + { + "epoch": 1.1229747245625405, + "grad_norm": 0.8868537545204163, + "learning_rate": 3.600162082782457e-06, + "loss": 0.1154, + "step": 6931 + }, + { + "epoch": 1.1231367465975373, + "grad_norm": 0.8856688737869263, + "learning_rate": 3.5997693817386128e-06, + "loss": 0.0962, + "step": 6932 + }, + { + "epoch": 1.123298768632534, + "grad_norm": 0.8674602508544922, + "learning_rate": 3.5993766470444856e-06, + "loss": 0.096, + "step": 6933 + }, + { + "epoch": 1.1234607906675307, + "grad_norm": 0.8470571041107178, + "learning_rate": 3.598983878712094e-06, + "loss": 0.1131, + "step": 6934 + }, + { + "epoch": 1.1236228127025276, + "grad_norm": 0.9755722284317017, + "learning_rate": 3.598591076753455e-06, + "loss": 0.1109, + "step": 6935 + }, + { + "epoch": 1.1237848347375243, + "grad_norm": 0.8115435838699341, + "learning_rate": 3.598198241180588e-06, + "loss": 0.1044, + "step": 6936 + }, + { + "epoch": 1.123946856772521, + "grad_norm": 0.7542571425437927, + "learning_rate": 3.5978053720055117e-06, + "loss": 0.097, + "step": 6937 + }, + { + "epoch": 1.1241088788075178, + "grad_norm": 0.7188454270362854, + "learning_rate": 3.597412469240248e-06, + "loss": 0.086, + "step": 6938 + }, + { + "epoch": 1.1242709008425145, + "grad_norm": 0.807829737663269, + "learning_rate": 3.5970195328968183e-06, + "loss": 0.1037, + "step": 6939 + }, + { + "epoch": 1.1244329228775114, + "grad_norm": 0.7683031558990479, + "learning_rate": 3.5966265629872466e-06, + "loss": 0.1012, + "step": 6940 + }, + { + "epoch": 1.124594944912508, + "grad_norm": 0.805733323097229, + "learning_rate": 3.5962335595235547e-06, + "loss": 0.1005, + "step": 6941 + }, + { + "epoch": 1.1247569669475048, + "grad_norm": 0.8244421482086182, + "learning_rate": 3.595840522517769e-06, + "loss": 0.1096, + "step": 6942 + }, + { + "epoch": 1.1249189889825015, + "grad_norm": 0.7452391386032104, + "learning_rate": 3.5954474519819155e-06, + "loss": 0.1078, + "step": 6943 + }, + { + "epoch": 1.1250810110174985, + "grad_norm": 0.8935032486915588, + "learning_rate": 3.5950543479280205e-06, + "loss": 0.1056, + "step": 6944 + }, + { + "epoch": 1.1252430330524952, + "grad_norm": 0.7510908246040344, + "learning_rate": 3.5946612103681135e-06, + "loss": 0.0906, + "step": 6945 + }, + { + "epoch": 1.125405055087492, + "grad_norm": 0.7809535264968872, + "learning_rate": 3.5942680393142203e-06, + "loss": 0.1014, + "step": 6946 + }, + { + "epoch": 1.1255670771224886, + "grad_norm": 0.9290381073951721, + "learning_rate": 3.5938748347783754e-06, + "loss": 0.12, + "step": 6947 + }, + { + "epoch": 1.1257290991574855, + "grad_norm": 0.7380486726760864, + "learning_rate": 3.593481596772606e-06, + "loss": 0.0862, + "step": 6948 + }, + { + "epoch": 1.1258911211924822, + "grad_norm": 0.7775296568870544, + "learning_rate": 3.593088325308947e-06, + "loss": 0.1076, + "step": 6949 + }, + { + "epoch": 1.126053143227479, + "grad_norm": 0.8212987184524536, + "learning_rate": 3.5926950203994303e-06, + "loss": 0.0965, + "step": 6950 + }, + { + "epoch": 1.1262151652624757, + "grad_norm": 0.758618175983429, + "learning_rate": 3.5923016820560904e-06, + "loss": 0.0958, + "step": 6951 + }, + { + "epoch": 1.1263771872974724, + "grad_norm": 0.7440875172615051, + "learning_rate": 3.5919083102909615e-06, + "loss": 0.092, + "step": 6952 + }, + { + "epoch": 1.1265392093324693, + "grad_norm": 0.755118727684021, + "learning_rate": 3.5915149051160812e-06, + "loss": 0.0912, + "step": 6953 + }, + { + "epoch": 1.126701231367466, + "grad_norm": 0.9674204587936401, + "learning_rate": 3.591121466543487e-06, + "loss": 0.119, + "step": 6954 + }, + { + "epoch": 1.1268632534024627, + "grad_norm": 0.7956207990646362, + "learning_rate": 3.5907279945852145e-06, + "loss": 0.0993, + "step": 6955 + }, + { + "epoch": 1.1270252754374595, + "grad_norm": 0.8555571436882019, + "learning_rate": 3.5903344892533067e-06, + "loss": 0.1116, + "step": 6956 + }, + { + "epoch": 1.1271872974724562, + "grad_norm": 0.741603434085846, + "learning_rate": 3.5899409505598014e-06, + "loss": 0.0924, + "step": 6957 + }, + { + "epoch": 1.127349319507453, + "grad_norm": 0.8872222900390625, + "learning_rate": 3.5895473785167407e-06, + "loss": 0.0995, + "step": 6958 + }, + { + "epoch": 1.1275113415424498, + "grad_norm": 0.8348520398139954, + "learning_rate": 3.589153773136167e-06, + "loss": 0.1014, + "step": 6959 + }, + { + "epoch": 1.1276733635774465, + "grad_norm": 0.8107140064239502, + "learning_rate": 3.5887601344301228e-06, + "loss": 0.1088, + "step": 6960 + }, + { + "epoch": 1.1278353856124432, + "grad_norm": 0.8019354939460754, + "learning_rate": 3.588366462410654e-06, + "loss": 0.1078, + "step": 6961 + }, + { + "epoch": 1.12799740764744, + "grad_norm": 0.8682474493980408, + "learning_rate": 3.587972757089805e-06, + "loss": 0.1116, + "step": 6962 + }, + { + "epoch": 1.1281594296824369, + "grad_norm": 0.9506396055221558, + "learning_rate": 3.5875790184796228e-06, + "loss": 0.1215, + "step": 6963 + }, + { + "epoch": 1.1283214517174336, + "grad_norm": 0.8694068789482117, + "learning_rate": 3.587185246592154e-06, + "loss": 0.0923, + "step": 6964 + }, + { + "epoch": 1.1284834737524303, + "grad_norm": 0.8598493933677673, + "learning_rate": 3.5867914414394478e-06, + "loss": 0.1113, + "step": 6965 + }, + { + "epoch": 1.128645495787427, + "grad_norm": 0.7127923965454102, + "learning_rate": 3.5863976030335535e-06, + "loss": 0.0917, + "step": 6966 + }, + { + "epoch": 1.128807517822424, + "grad_norm": 0.9294934272766113, + "learning_rate": 3.5860037313865216e-06, + "loss": 0.1139, + "step": 6967 + }, + { + "epoch": 1.1289695398574207, + "grad_norm": 0.7821537256240845, + "learning_rate": 3.5856098265104033e-06, + "loss": 0.1006, + "step": 6968 + }, + { + "epoch": 1.1291315618924174, + "grad_norm": 0.7920688986778259, + "learning_rate": 3.5852158884172523e-06, + "loss": 0.0979, + "step": 6969 + }, + { + "epoch": 1.129293583927414, + "grad_norm": 0.772294819355011, + "learning_rate": 3.58482191711912e-06, + "loss": 0.0933, + "step": 6970 + }, + { + "epoch": 1.1294556059624108, + "grad_norm": 0.7953448295593262, + "learning_rate": 3.5844279126280635e-06, + "loss": 0.1057, + "step": 6971 + }, + { + "epoch": 1.1296176279974077, + "grad_norm": 0.7928208112716675, + "learning_rate": 3.5840338749561365e-06, + "loss": 0.1052, + "step": 6972 + }, + { + "epoch": 1.1297796500324044, + "grad_norm": 0.885570228099823, + "learning_rate": 3.5836398041153962e-06, + "loss": 0.106, + "step": 6973 + }, + { + "epoch": 1.1299416720674011, + "grad_norm": 0.8886401057243347, + "learning_rate": 3.5832457001179e-06, + "loss": 0.107, + "step": 6974 + }, + { + "epoch": 1.1301036941023979, + "grad_norm": 0.6969801783561707, + "learning_rate": 3.5828515629757073e-06, + "loss": 0.0911, + "step": 6975 + }, + { + "epoch": 1.1302657161373948, + "grad_norm": 0.8328415155410767, + "learning_rate": 3.582457392700878e-06, + "loss": 0.1073, + "step": 6976 + }, + { + "epoch": 1.1304277381723915, + "grad_norm": 0.8379804491996765, + "learning_rate": 3.5820631893054703e-06, + "loss": 0.114, + "step": 6977 + }, + { + "epoch": 1.1305897602073882, + "grad_norm": 0.8626766204833984, + "learning_rate": 3.5816689528015485e-06, + "loss": 0.1088, + "step": 6978 + }, + { + "epoch": 1.130751782242385, + "grad_norm": 0.7580254077911377, + "learning_rate": 3.5812746832011734e-06, + "loss": 0.0952, + "step": 6979 + }, + { + "epoch": 1.1309138042773816, + "grad_norm": 0.9701016545295715, + "learning_rate": 3.580880380516411e-06, + "loss": 0.1154, + "step": 6980 + }, + { + "epoch": 1.1310758263123786, + "grad_norm": 0.8925403356552124, + "learning_rate": 3.5804860447593236e-06, + "loss": 0.1066, + "step": 6981 + }, + { + "epoch": 1.1312378483473753, + "grad_norm": 0.861957848072052, + "learning_rate": 3.5800916759419784e-06, + "loss": 0.0999, + "step": 6982 + }, + { + "epoch": 1.131399870382372, + "grad_norm": 0.9451190233230591, + "learning_rate": 3.579697274076442e-06, + "loss": 0.1148, + "step": 6983 + }, + { + "epoch": 1.1315618924173687, + "grad_norm": 0.8173058032989502, + "learning_rate": 3.579302839174781e-06, + "loss": 0.1155, + "step": 6984 + }, + { + "epoch": 1.1317239144523654, + "grad_norm": 0.9785115718841553, + "learning_rate": 3.578908371249066e-06, + "loss": 0.12, + "step": 6985 + }, + { + "epoch": 1.1318859364873624, + "grad_norm": 0.8944734334945679, + "learning_rate": 3.578513870311365e-06, + "loss": 0.1105, + "step": 6986 + }, + { + "epoch": 1.132047958522359, + "grad_norm": 0.8797488808631897, + "learning_rate": 3.57811933637375e-06, + "loss": 0.1095, + "step": 6987 + }, + { + "epoch": 1.1322099805573558, + "grad_norm": 0.8068286180496216, + "learning_rate": 3.577724769448292e-06, + "loss": 0.1002, + "step": 6988 + }, + { + "epoch": 1.1323720025923525, + "grad_norm": 0.993378758430481, + "learning_rate": 3.577330169547064e-06, + "loss": 0.1254, + "step": 6989 + }, + { + "epoch": 1.1325340246273492, + "grad_norm": 0.7981913089752197, + "learning_rate": 3.57693553668214e-06, + "loss": 0.1031, + "step": 6990 + }, + { + "epoch": 1.1326960466623461, + "grad_norm": 0.8486551642417908, + "learning_rate": 3.5765408708655946e-06, + "loss": 0.1028, + "step": 6991 + }, + { + "epoch": 1.1328580686973428, + "grad_norm": 0.8483676910400391, + "learning_rate": 3.5761461721095037e-06, + "loss": 0.1061, + "step": 6992 + }, + { + "epoch": 1.1330200907323396, + "grad_norm": 0.8058338165283203, + "learning_rate": 3.5757514404259447e-06, + "loss": 0.1053, + "step": 6993 + }, + { + "epoch": 1.1331821127673363, + "grad_norm": 0.9978876709938049, + "learning_rate": 3.575356675826995e-06, + "loss": 0.1242, + "step": 6994 + }, + { + "epoch": 1.1333441348023332, + "grad_norm": 0.8573361039161682, + "learning_rate": 3.5749618783247326e-06, + "loss": 0.1093, + "step": 6995 + }, + { + "epoch": 1.13350615683733, + "grad_norm": 0.8889877200126648, + "learning_rate": 3.574567047931238e-06, + "loss": 0.1067, + "step": 6996 + }, + { + "epoch": 1.1336681788723266, + "grad_norm": 0.7973397970199585, + "learning_rate": 3.5741721846585926e-06, + "loss": 0.0991, + "step": 6997 + }, + { + "epoch": 1.1338302009073233, + "grad_norm": 0.8841112852096558, + "learning_rate": 3.5737772885188777e-06, + "loss": 0.1111, + "step": 6998 + }, + { + "epoch": 1.1339922229423203, + "grad_norm": 0.8456192016601562, + "learning_rate": 3.5733823595241758e-06, + "loss": 0.111, + "step": 6999 + }, + { + "epoch": 1.134154244977317, + "grad_norm": 1.4436651468276978, + "learning_rate": 3.5729873976865726e-06, + "loss": 0.0931, + "step": 7000 + }, + { + "epoch": 1.1343162670123137, + "grad_norm": 0.8534703850746155, + "learning_rate": 3.5725924030181498e-06, + "loss": 0.1073, + "step": 7001 + }, + { + "epoch": 1.1344782890473104, + "grad_norm": 0.8066534996032715, + "learning_rate": 3.5721973755309963e-06, + "loss": 0.0973, + "step": 7002 + }, + { + "epoch": 1.1346403110823071, + "grad_norm": 0.8632422685623169, + "learning_rate": 3.571802315237197e-06, + "loss": 0.1128, + "step": 7003 + }, + { + "epoch": 1.134802333117304, + "grad_norm": 0.9532308578491211, + "learning_rate": 3.5714072221488414e-06, + "loss": 0.1121, + "step": 7004 + }, + { + "epoch": 1.1349643551523008, + "grad_norm": 0.8398197889328003, + "learning_rate": 3.571012096278017e-06, + "loss": 0.1081, + "step": 7005 + }, + { + "epoch": 1.1351263771872975, + "grad_norm": 0.7505828142166138, + "learning_rate": 3.5706169376368143e-06, + "loss": 0.0898, + "step": 7006 + }, + { + "epoch": 1.1352883992222942, + "grad_norm": 0.8282100558280945, + "learning_rate": 3.570221746237325e-06, + "loss": 0.1008, + "step": 7007 + }, + { + "epoch": 1.135450421257291, + "grad_norm": 0.8855400681495667, + "learning_rate": 3.5698265220916388e-06, + "loss": 0.118, + "step": 7008 + }, + { + "epoch": 1.1356124432922878, + "grad_norm": 0.8593195080757141, + "learning_rate": 3.5694312652118513e-06, + "loss": 0.1072, + "step": 7009 + }, + { + "epoch": 1.1357744653272845, + "grad_norm": 0.9126600027084351, + "learning_rate": 3.5690359756100532e-06, + "loss": 0.1101, + "step": 7010 + }, + { + "epoch": 1.1359364873622813, + "grad_norm": 0.7796016335487366, + "learning_rate": 3.5686406532983432e-06, + "loss": 0.1009, + "step": 7011 + }, + { + "epoch": 1.136098509397278, + "grad_norm": 0.822391152381897, + "learning_rate": 3.5682452982888143e-06, + "loss": 0.1099, + "step": 7012 + }, + { + "epoch": 1.1362605314322747, + "grad_norm": 0.8462624549865723, + "learning_rate": 3.5678499105935642e-06, + "loss": 0.1055, + "step": 7013 + }, + { + "epoch": 1.1364225534672716, + "grad_norm": 0.8373414278030396, + "learning_rate": 3.5674544902246916e-06, + "loss": 0.1144, + "step": 7014 + }, + { + "epoch": 1.1365845755022683, + "grad_norm": 0.8660956621170044, + "learning_rate": 3.567059037194294e-06, + "loss": 0.1096, + "step": 7015 + }, + { + "epoch": 1.136746597537265, + "grad_norm": 0.841236412525177, + "learning_rate": 3.566663551514473e-06, + "loss": 0.1058, + "step": 7016 + }, + { + "epoch": 1.1369086195722617, + "grad_norm": 0.8246383666992188, + "learning_rate": 3.5662680331973282e-06, + "loss": 0.1015, + "step": 7017 + }, + { + "epoch": 1.1370706416072587, + "grad_norm": 0.8704448342323303, + "learning_rate": 3.5658724822549624e-06, + "loss": 0.1055, + "step": 7018 + }, + { + "epoch": 1.1372326636422554, + "grad_norm": 0.7135106325149536, + "learning_rate": 3.565476898699477e-06, + "loss": 0.0863, + "step": 7019 + }, + { + "epoch": 1.137394685677252, + "grad_norm": 0.7876743674278259, + "learning_rate": 3.5650812825429774e-06, + "loss": 0.1019, + "step": 7020 + }, + { + "epoch": 1.1375567077122488, + "grad_norm": 0.773827850818634, + "learning_rate": 3.564685633797568e-06, + "loss": 0.0966, + "step": 7021 + }, + { + "epoch": 1.1377187297472457, + "grad_norm": 0.8527294397354126, + "learning_rate": 3.5642899524753548e-06, + "loss": 0.1016, + "step": 7022 + }, + { + "epoch": 1.1378807517822425, + "grad_norm": 0.8665226697921753, + "learning_rate": 3.5638942385884443e-06, + "loss": 0.1061, + "step": 7023 + }, + { + "epoch": 1.1380427738172392, + "grad_norm": 0.7744306325912476, + "learning_rate": 3.5634984921489455e-06, + "loss": 0.0958, + "step": 7024 + }, + { + "epoch": 1.1382047958522359, + "grad_norm": 0.7468297481536865, + "learning_rate": 3.563102713168966e-06, + "loss": 0.0925, + "step": 7025 + }, + { + "epoch": 1.1383668178872326, + "grad_norm": 0.7687481641769409, + "learning_rate": 3.562706901660616e-06, + "loss": 0.0983, + "step": 7026 + }, + { + "epoch": 1.1385288399222295, + "grad_norm": 0.899817943572998, + "learning_rate": 3.5623110576360065e-06, + "loss": 0.1092, + "step": 7027 + }, + { + "epoch": 1.1386908619572262, + "grad_norm": 0.8666191101074219, + "learning_rate": 3.561915181107249e-06, + "loss": 0.1056, + "step": 7028 + }, + { + "epoch": 1.138852883992223, + "grad_norm": 0.8024566173553467, + "learning_rate": 3.5615192720864572e-06, + "loss": 0.1007, + "step": 7029 + }, + { + "epoch": 1.1390149060272197, + "grad_norm": 0.7231770753860474, + "learning_rate": 3.561123330585744e-06, + "loss": 0.0895, + "step": 7030 + }, + { + "epoch": 1.1391769280622164, + "grad_norm": 0.9641432762145996, + "learning_rate": 3.5607273566172255e-06, + "loss": 0.1041, + "step": 7031 + }, + { + "epoch": 1.1393389500972133, + "grad_norm": 0.7931097745895386, + "learning_rate": 3.560331350193016e-06, + "loss": 0.1088, + "step": 7032 + }, + { + "epoch": 1.13950097213221, + "grad_norm": 0.8860135674476624, + "learning_rate": 3.5599353113252343e-06, + "loss": 0.1082, + "step": 7033 + }, + { + "epoch": 1.1396629941672067, + "grad_norm": 0.8670450448989868, + "learning_rate": 3.5595392400259963e-06, + "loss": 0.1046, + "step": 7034 + }, + { + "epoch": 1.1398250162022034, + "grad_norm": 0.7805025577545166, + "learning_rate": 3.5591431363074214e-06, + "loss": 0.0926, + "step": 7035 + }, + { + "epoch": 1.1399870382372002, + "grad_norm": 0.8997758626937866, + "learning_rate": 3.55874700018163e-06, + "loss": 0.1134, + "step": 7036 + }, + { + "epoch": 1.140149060272197, + "grad_norm": 0.799332857131958, + "learning_rate": 3.558350831660742e-06, + "loss": 0.0953, + "step": 7037 + }, + { + "epoch": 1.1403110823071938, + "grad_norm": 0.8448926210403442, + "learning_rate": 3.5579546307568807e-06, + "loss": 0.1027, + "step": 7038 + }, + { + "epoch": 1.1404731043421905, + "grad_norm": 0.8926445245742798, + "learning_rate": 3.557558397482167e-06, + "loss": 0.1064, + "step": 7039 + }, + { + "epoch": 1.1406351263771872, + "grad_norm": 0.9030429124832153, + "learning_rate": 3.557162131848726e-06, + "loss": 0.1143, + "step": 7040 + }, + { + "epoch": 1.140797148412184, + "grad_norm": 0.8533268570899963, + "learning_rate": 3.556765833868682e-06, + "loss": 0.1089, + "step": 7041 + }, + { + "epoch": 1.1409591704471809, + "grad_norm": 0.9436481595039368, + "learning_rate": 3.5563695035541607e-06, + "loss": 0.1096, + "step": 7042 + }, + { + "epoch": 1.1411211924821776, + "grad_norm": 0.7506798505783081, + "learning_rate": 3.5559731409172887e-06, + "loss": 0.0839, + "step": 7043 + }, + { + "epoch": 1.1412832145171743, + "grad_norm": 0.858508288860321, + "learning_rate": 3.5555767459701946e-06, + "loss": 0.1047, + "step": 7044 + }, + { + "epoch": 1.141445236552171, + "grad_norm": 0.7885152697563171, + "learning_rate": 3.5551803187250062e-06, + "loss": 0.1009, + "step": 7045 + }, + { + "epoch": 1.141607258587168, + "grad_norm": 0.7987487316131592, + "learning_rate": 3.554783859193853e-06, + "loss": 0.0999, + "step": 7046 + }, + { + "epoch": 1.1417692806221647, + "grad_norm": 0.8181909322738647, + "learning_rate": 3.5543873673888676e-06, + "loss": 0.0995, + "step": 7047 + }, + { + "epoch": 1.1419313026571614, + "grad_norm": 0.8372443914413452, + "learning_rate": 3.5539908433221793e-06, + "loss": 0.1056, + "step": 7048 + }, + { + "epoch": 1.142093324692158, + "grad_norm": 0.851033627986908, + "learning_rate": 3.553594287005922e-06, + "loss": 0.1099, + "step": 7049 + }, + { + "epoch": 1.142255346727155, + "grad_norm": 0.7364274859428406, + "learning_rate": 3.553197698452229e-06, + "loss": 0.098, + "step": 7050 + }, + { + "epoch": 1.1424173687621517, + "grad_norm": 0.8522658348083496, + "learning_rate": 3.5528010776732354e-06, + "loss": 0.1083, + "step": 7051 + }, + { + "epoch": 1.1425793907971484, + "grad_norm": 0.8110232949256897, + "learning_rate": 3.5524044246810764e-06, + "loss": 0.0991, + "step": 7052 + }, + { + "epoch": 1.1427414128321451, + "grad_norm": 0.9086326956748962, + "learning_rate": 3.5520077394878883e-06, + "loss": 0.1123, + "step": 7053 + }, + { + "epoch": 1.1429034348671419, + "grad_norm": 0.9316560626029968, + "learning_rate": 3.5516110221058096e-06, + "loss": 0.1043, + "step": 7054 + }, + { + "epoch": 1.1430654569021388, + "grad_norm": 0.8712682723999023, + "learning_rate": 3.551214272546979e-06, + "loss": 0.1018, + "step": 7055 + }, + { + "epoch": 1.1432274789371355, + "grad_norm": 0.8193374872207642, + "learning_rate": 3.550817490823535e-06, + "loss": 0.1063, + "step": 7056 + }, + { + "epoch": 1.1433895009721322, + "grad_norm": 0.7940148115158081, + "learning_rate": 3.550420676947619e-06, + "loss": 0.0989, + "step": 7057 + }, + { + "epoch": 1.143551523007129, + "grad_norm": 1.0240070819854736, + "learning_rate": 3.5500238309313717e-06, + "loss": 0.0994, + "step": 7058 + }, + { + "epoch": 1.1437135450421256, + "grad_norm": 0.8923311233520508, + "learning_rate": 3.549626952786937e-06, + "loss": 0.1026, + "step": 7059 + }, + { + "epoch": 1.1438755670771226, + "grad_norm": 0.8232337236404419, + "learning_rate": 3.5492300425264574e-06, + "loss": 0.099, + "step": 7060 + }, + { + "epoch": 1.1440375891121193, + "grad_norm": 0.8222522139549255, + "learning_rate": 3.548833100162077e-06, + "loss": 0.0961, + "step": 7061 + }, + { + "epoch": 1.144199611147116, + "grad_norm": 0.8657082319259644, + "learning_rate": 3.5484361257059425e-06, + "loss": 0.1066, + "step": 7062 + }, + { + "epoch": 1.1443616331821127, + "grad_norm": 0.9332712888717651, + "learning_rate": 3.548039119170199e-06, + "loss": 0.1138, + "step": 7063 + }, + { + "epoch": 1.1445236552171094, + "grad_norm": 1.0337094068527222, + "learning_rate": 3.5476420805669953e-06, + "loss": 0.1053, + "step": 7064 + }, + { + "epoch": 1.1446856772521063, + "grad_norm": 0.874484658241272, + "learning_rate": 3.5472450099084786e-06, + "loss": 0.1125, + "step": 7065 + }, + { + "epoch": 1.144847699287103, + "grad_norm": 0.7913826704025269, + "learning_rate": 3.5468479072067996e-06, + "loss": 0.1014, + "step": 7066 + }, + { + "epoch": 1.1450097213220998, + "grad_norm": 0.8043907880783081, + "learning_rate": 3.5464507724741076e-06, + "loss": 0.1029, + "step": 7067 + }, + { + "epoch": 1.1451717433570965, + "grad_norm": 0.833845317363739, + "learning_rate": 3.5460536057225542e-06, + "loss": 0.1024, + "step": 7068 + }, + { + "epoch": 1.1453337653920934, + "grad_norm": 0.8379101753234863, + "learning_rate": 3.545656406964292e-06, + "loss": 0.1051, + "step": 7069 + }, + { + "epoch": 1.1454957874270901, + "grad_norm": 0.7530349493026733, + "learning_rate": 3.545259176211474e-06, + "loss": 0.0899, + "step": 7070 + }, + { + "epoch": 1.1456578094620868, + "grad_norm": 0.7313252687454224, + "learning_rate": 3.5448619134762552e-06, + "loss": 0.09, + "step": 7071 + }, + { + "epoch": 1.1458198314970836, + "grad_norm": 0.8761491179466248, + "learning_rate": 3.5444646187707897e-06, + "loss": 0.106, + "step": 7072 + }, + { + "epoch": 1.1459818535320805, + "grad_norm": 0.8301008343696594, + "learning_rate": 3.544067292107235e-06, + "loss": 0.1041, + "step": 7073 + }, + { + "epoch": 1.1461438755670772, + "grad_norm": 0.8845946788787842, + "learning_rate": 3.5436699334977476e-06, + "loss": 0.1079, + "step": 7074 + }, + { + "epoch": 1.146305897602074, + "grad_norm": 0.9687334299087524, + "learning_rate": 3.5432725429544856e-06, + "loss": 0.1142, + "step": 7075 + }, + { + "epoch": 1.1464679196370706, + "grad_norm": 0.9008753299713135, + "learning_rate": 3.5428751204896083e-06, + "loss": 0.1243, + "step": 7076 + }, + { + "epoch": 1.1466299416720673, + "grad_norm": 0.8190997838973999, + "learning_rate": 3.5424776661152766e-06, + "loss": 0.1043, + "step": 7077 + }, + { + "epoch": 1.1467919637070643, + "grad_norm": 0.7773595452308655, + "learning_rate": 3.542080179843651e-06, + "loss": 0.0952, + "step": 7078 + }, + { + "epoch": 1.146953985742061, + "grad_norm": 0.8167815208435059, + "learning_rate": 3.5416826616868938e-06, + "loss": 0.1005, + "step": 7079 + }, + { + "epoch": 1.1471160077770577, + "grad_norm": 0.881354570388794, + "learning_rate": 3.5412851116571673e-06, + "loss": 0.1099, + "step": 7080 + }, + { + "epoch": 1.1472780298120544, + "grad_norm": 0.9158351421356201, + "learning_rate": 3.5408875297666366e-06, + "loss": 0.1125, + "step": 7081 + }, + { + "epoch": 1.1474400518470511, + "grad_norm": 0.8541279435157776, + "learning_rate": 3.5404899160274664e-06, + "loss": 0.0947, + "step": 7082 + }, + { + "epoch": 1.147602073882048, + "grad_norm": 0.9512801170349121, + "learning_rate": 3.5400922704518225e-06, + "loss": 0.1134, + "step": 7083 + }, + { + "epoch": 1.1477640959170448, + "grad_norm": 0.8026208281517029, + "learning_rate": 3.5396945930518722e-06, + "loss": 0.1004, + "step": 7084 + }, + { + "epoch": 1.1479261179520415, + "grad_norm": 0.8425459861755371, + "learning_rate": 3.5392968838397836e-06, + "loss": 0.1065, + "step": 7085 + }, + { + "epoch": 1.1480881399870382, + "grad_norm": 0.9077960252761841, + "learning_rate": 3.538899142827726e-06, + "loss": 0.1203, + "step": 7086 + }, + { + "epoch": 1.148250162022035, + "grad_norm": 0.8521792888641357, + "learning_rate": 3.538501370027867e-06, + "loss": 0.1073, + "step": 7087 + }, + { + "epoch": 1.1484121840570318, + "grad_norm": 0.7184135913848877, + "learning_rate": 3.538103565452381e-06, + "loss": 0.0894, + "step": 7088 + }, + { + "epoch": 1.1485742060920285, + "grad_norm": 0.8005532622337341, + "learning_rate": 3.537705729113437e-06, + "loss": 0.1096, + "step": 7089 + }, + { + "epoch": 1.1487362281270252, + "grad_norm": 0.8464052081108093, + "learning_rate": 3.537307861023209e-06, + "loss": 0.1035, + "step": 7090 + }, + { + "epoch": 1.148898250162022, + "grad_norm": 0.7864227890968323, + "learning_rate": 3.536909961193872e-06, + "loss": 0.0955, + "step": 7091 + }, + { + "epoch": 1.1490602721970187, + "grad_norm": 0.923495352268219, + "learning_rate": 3.536512029637597e-06, + "loss": 0.1104, + "step": 7092 + }, + { + "epoch": 1.1492222942320156, + "grad_norm": 0.732354998588562, + "learning_rate": 3.5361140663665644e-06, + "loss": 0.0921, + "step": 7093 + }, + { + "epoch": 1.1493843162670123, + "grad_norm": 0.8492215871810913, + "learning_rate": 3.5357160713929473e-06, + "loss": 0.1059, + "step": 7094 + }, + { + "epoch": 1.149546338302009, + "grad_norm": 0.899074137210846, + "learning_rate": 3.5353180447289253e-06, + "loss": 0.1078, + "step": 7095 + }, + { + "epoch": 1.1497083603370057, + "grad_norm": 0.8724895119667053, + "learning_rate": 3.534919986386676e-06, + "loss": 0.1126, + "step": 7096 + }, + { + "epoch": 1.1498703823720027, + "grad_norm": 0.9067556262016296, + "learning_rate": 3.534521896378381e-06, + "loss": 0.1107, + "step": 7097 + }, + { + "epoch": 1.1500324044069994, + "grad_norm": 0.8136131763458252, + "learning_rate": 3.5341237747162183e-06, + "loss": 0.1047, + "step": 7098 + }, + { + "epoch": 1.150194426441996, + "grad_norm": 0.8796071410179138, + "learning_rate": 3.533725621412371e-06, + "loss": 0.1117, + "step": 7099 + }, + { + "epoch": 1.1503564484769928, + "grad_norm": 0.8252900242805481, + "learning_rate": 3.533327436479021e-06, + "loss": 0.1106, + "step": 7100 + }, + { + "epoch": 1.1505184705119897, + "grad_norm": 0.8371667861938477, + "learning_rate": 3.5329292199283522e-06, + "loss": 0.1012, + "step": 7101 + }, + { + "epoch": 1.1506804925469865, + "grad_norm": 0.8886693716049194, + "learning_rate": 3.53253097177255e-06, + "loss": 0.1118, + "step": 7102 + }, + { + "epoch": 1.1508425145819832, + "grad_norm": 0.8799384832382202, + "learning_rate": 3.532132692023797e-06, + "loss": 0.114, + "step": 7103 + }, + { + "epoch": 1.1510045366169799, + "grad_norm": 0.815346896648407, + "learning_rate": 3.531734380694282e-06, + "loss": 0.1071, + "step": 7104 + }, + { + "epoch": 1.1511665586519766, + "grad_norm": 0.8964957594871521, + "learning_rate": 3.531336037796192e-06, + "loss": 0.1094, + "step": 7105 + }, + { + "epoch": 1.1513285806869735, + "grad_norm": 0.8892197012901306, + "learning_rate": 3.5309376633417146e-06, + "loss": 0.1091, + "step": 7106 + }, + { + "epoch": 1.1514906027219702, + "grad_norm": 0.9133774042129517, + "learning_rate": 3.5305392573430398e-06, + "loss": 0.1154, + "step": 7107 + }, + { + "epoch": 1.151652624756967, + "grad_norm": 1.0447365045547485, + "learning_rate": 3.530140819812357e-06, + "loss": 0.1256, + "step": 7108 + }, + { + "epoch": 1.1518146467919637, + "grad_norm": 0.8976051807403564, + "learning_rate": 3.5297423507618582e-06, + "loss": 0.107, + "step": 7109 + }, + { + "epoch": 1.1519766688269604, + "grad_norm": 0.7730125784873962, + "learning_rate": 3.5293438502037363e-06, + "loss": 0.0979, + "step": 7110 + }, + { + "epoch": 1.1521386908619573, + "grad_norm": 0.9154132008552551, + "learning_rate": 3.5289453181501832e-06, + "loss": 0.1154, + "step": 7111 + }, + { + "epoch": 1.152300712896954, + "grad_norm": 0.731338620185852, + "learning_rate": 3.5285467546133926e-06, + "loss": 0.0924, + "step": 7112 + }, + { + "epoch": 1.1524627349319507, + "grad_norm": 0.8675764203071594, + "learning_rate": 3.5281481596055613e-06, + "loss": 0.1181, + "step": 7113 + }, + { + "epoch": 1.1526247569669474, + "grad_norm": 0.7357743382453918, + "learning_rate": 3.5277495331388835e-06, + "loss": 0.0895, + "step": 7114 + }, + { + "epoch": 1.1527867790019442, + "grad_norm": 0.8133664131164551, + "learning_rate": 3.527350875225558e-06, + "loss": 0.1026, + "step": 7115 + }, + { + "epoch": 1.152948801036941, + "grad_norm": 0.8403448462486267, + "learning_rate": 3.526952185877781e-06, + "loss": 0.1002, + "step": 7116 + }, + { + "epoch": 1.1531108230719378, + "grad_norm": 0.7883726358413696, + "learning_rate": 3.5265534651077527e-06, + "loss": 0.1038, + "step": 7117 + }, + { + "epoch": 1.1532728451069345, + "grad_norm": 0.9349051117897034, + "learning_rate": 3.526154712927672e-06, + "loss": 0.1239, + "step": 7118 + }, + { + "epoch": 1.1534348671419312, + "grad_norm": 0.8447469472885132, + "learning_rate": 3.525755929349741e-06, + "loss": 0.1049, + "step": 7119 + }, + { + "epoch": 1.1535968891769282, + "grad_norm": 0.8494859933853149, + "learning_rate": 3.525357114386161e-06, + "loss": 0.1031, + "step": 7120 + }, + { + "epoch": 1.1537589112119249, + "grad_norm": 0.7881488800048828, + "learning_rate": 3.5249582680491346e-06, + "loss": 0.102, + "step": 7121 + }, + { + "epoch": 1.1539209332469216, + "grad_norm": 0.7983292937278748, + "learning_rate": 3.524559390350865e-06, + "loss": 0.0992, + "step": 7122 + }, + { + "epoch": 1.1540829552819183, + "grad_norm": 0.8363736867904663, + "learning_rate": 3.5241604813035577e-06, + "loss": 0.1017, + "step": 7123 + }, + { + "epoch": 1.1542449773169152, + "grad_norm": 0.9091150760650635, + "learning_rate": 3.523761540919418e-06, + "loss": 0.116, + "step": 7124 + }, + { + "epoch": 1.154406999351912, + "grad_norm": 0.8165498971939087, + "learning_rate": 3.5233625692106525e-06, + "loss": 0.1047, + "step": 7125 + }, + { + "epoch": 1.1545690213869086, + "grad_norm": 0.7147705554962158, + "learning_rate": 3.5229635661894696e-06, + "loss": 0.0823, + "step": 7126 + }, + { + "epoch": 1.1547310434219054, + "grad_norm": 0.7142750024795532, + "learning_rate": 3.5225645318680768e-06, + "loss": 0.0944, + "step": 7127 + }, + { + "epoch": 1.154893065456902, + "grad_norm": 0.795220673084259, + "learning_rate": 3.5221654662586837e-06, + "loss": 0.0923, + "step": 7128 + }, + { + "epoch": 1.155055087491899, + "grad_norm": 0.7979345321655273, + "learning_rate": 3.5217663693735006e-06, + "loss": 0.0994, + "step": 7129 + }, + { + "epoch": 1.1552171095268957, + "grad_norm": 0.856704592704773, + "learning_rate": 3.521367241224739e-06, + "loss": 0.1009, + "step": 7130 + }, + { + "epoch": 1.1553791315618924, + "grad_norm": 0.7614887356758118, + "learning_rate": 3.5209680818246124e-06, + "loss": 0.0994, + "step": 7131 + }, + { + "epoch": 1.1555411535968891, + "grad_norm": 0.8549346923828125, + "learning_rate": 3.5205688911853326e-06, + "loss": 0.098, + "step": 7132 + }, + { + "epoch": 1.1557031756318858, + "grad_norm": 0.7560814619064331, + "learning_rate": 3.520169669319115e-06, + "loss": 0.0975, + "step": 7133 + }, + { + "epoch": 1.1558651976668828, + "grad_norm": 0.754621148109436, + "learning_rate": 3.5197704162381742e-06, + "loss": 0.0918, + "step": 7134 + }, + { + "epoch": 1.1560272197018795, + "grad_norm": 0.878743588924408, + "learning_rate": 3.5193711319547257e-06, + "loss": 0.1169, + "step": 7135 + }, + { + "epoch": 1.1561892417368762, + "grad_norm": 0.7919618487358093, + "learning_rate": 3.5189718164809884e-06, + "loss": 0.1018, + "step": 7136 + }, + { + "epoch": 1.156351263771873, + "grad_norm": 0.8777773976325989, + "learning_rate": 3.5185724698291784e-06, + "loss": 0.1095, + "step": 7137 + }, + { + "epoch": 1.1565132858068696, + "grad_norm": 0.9273558855056763, + "learning_rate": 3.5181730920115165e-06, + "loss": 0.1212, + "step": 7138 + }, + { + "epoch": 1.1566753078418666, + "grad_norm": 0.7581186890602112, + "learning_rate": 3.5177736830402213e-06, + "loss": 0.1012, + "step": 7139 + }, + { + "epoch": 1.1568373298768633, + "grad_norm": 0.8174877762794495, + "learning_rate": 3.517374242927514e-06, + "loss": 0.107, + "step": 7140 + }, + { + "epoch": 1.15699935191186, + "grad_norm": 0.8098114132881165, + "learning_rate": 3.5169747716856186e-06, + "loss": 0.1038, + "step": 7141 + }, + { + "epoch": 1.1571613739468567, + "grad_norm": 0.8155732154846191, + "learning_rate": 3.516575269326755e-06, + "loss": 0.0981, + "step": 7142 + }, + { + "epoch": 1.1573233959818534, + "grad_norm": 0.8303235769271851, + "learning_rate": 3.5161757358631482e-06, + "loss": 0.1058, + "step": 7143 + }, + { + "epoch": 1.1574854180168503, + "grad_norm": 0.7622935771942139, + "learning_rate": 3.515776171307023e-06, + "loss": 0.0971, + "step": 7144 + }, + { + "epoch": 1.157647440051847, + "grad_norm": 0.7291949391365051, + "learning_rate": 3.5153765756706054e-06, + "loss": 0.0866, + "step": 7145 + }, + { + "epoch": 1.1578094620868438, + "grad_norm": 0.7386295199394226, + "learning_rate": 3.5149769489661216e-06, + "loss": 0.095, + "step": 7146 + }, + { + "epoch": 1.1579714841218405, + "grad_norm": 0.7064818143844604, + "learning_rate": 3.514577291205799e-06, + "loss": 0.0891, + "step": 7147 + }, + { + "epoch": 1.1581335061568374, + "grad_norm": 0.6918752193450928, + "learning_rate": 3.5141776024018676e-06, + "loss": 0.0936, + "step": 7148 + }, + { + "epoch": 1.1582955281918341, + "grad_norm": 0.750155508518219, + "learning_rate": 3.5137778825665542e-06, + "loss": 0.0925, + "step": 7149 + }, + { + "epoch": 1.1584575502268308, + "grad_norm": 0.7644116878509521, + "learning_rate": 3.513378131712092e-06, + "loss": 0.0917, + "step": 7150 + }, + { + "epoch": 1.1586195722618275, + "grad_norm": 0.8276258707046509, + "learning_rate": 3.5129783498507114e-06, + "loss": 0.0993, + "step": 7151 + }, + { + "epoch": 1.1587815942968245, + "grad_norm": 0.9577162265777588, + "learning_rate": 3.5125785369946442e-06, + "loss": 0.1086, + "step": 7152 + }, + { + "epoch": 1.1589436163318212, + "grad_norm": 0.9136195182800293, + "learning_rate": 3.512178693156124e-06, + "loss": 0.1052, + "step": 7153 + }, + { + "epoch": 1.159105638366818, + "grad_norm": 0.9026337265968323, + "learning_rate": 3.5117788183473856e-06, + "loss": 0.1135, + "step": 7154 + }, + { + "epoch": 1.1592676604018146, + "grad_norm": 0.836702823638916, + "learning_rate": 3.511378912580664e-06, + "loss": 0.106, + "step": 7155 + }, + { + "epoch": 1.1594296824368113, + "grad_norm": 0.7680188417434692, + "learning_rate": 3.5109789758681944e-06, + "loss": 0.0917, + "step": 7156 + }, + { + "epoch": 1.1595917044718083, + "grad_norm": 0.9207019209861755, + "learning_rate": 3.5105790082222157e-06, + "loss": 0.1087, + "step": 7157 + }, + { + "epoch": 1.159753726506805, + "grad_norm": 0.7964971661567688, + "learning_rate": 3.5101790096549643e-06, + "loss": 0.0916, + "step": 7158 + }, + { + "epoch": 1.1599157485418017, + "grad_norm": 0.8500689268112183, + "learning_rate": 3.5097789801786797e-06, + "loss": 0.1081, + "step": 7159 + }, + { + "epoch": 1.1600777705767984, + "grad_norm": 0.8392435908317566, + "learning_rate": 3.509378919805602e-06, + "loss": 0.095, + "step": 7160 + }, + { + "epoch": 1.160239792611795, + "grad_norm": 0.7941946983337402, + "learning_rate": 3.508978828547972e-06, + "loss": 0.0961, + "step": 7161 + }, + { + "epoch": 1.160401814646792, + "grad_norm": 0.800559937953949, + "learning_rate": 3.5085787064180317e-06, + "loss": 0.1083, + "step": 7162 + }, + { + "epoch": 1.1605638366817888, + "grad_norm": 0.8886862993240356, + "learning_rate": 3.5081785534280233e-06, + "loss": 0.1146, + "step": 7163 + }, + { + "epoch": 1.1607258587167855, + "grad_norm": 0.8028414249420166, + "learning_rate": 3.5077783695901917e-06, + "loss": 0.1006, + "step": 7164 + }, + { + "epoch": 1.1608878807517822, + "grad_norm": 0.7642043828964233, + "learning_rate": 3.507378154916781e-06, + "loss": 0.0978, + "step": 7165 + }, + { + "epoch": 1.1610499027867789, + "grad_norm": 0.7976152896881104, + "learning_rate": 3.506977909420035e-06, + "loss": 0.0988, + "step": 7166 + }, + { + "epoch": 1.1612119248217758, + "grad_norm": 0.8192991018295288, + "learning_rate": 3.5065776331122027e-06, + "loss": 0.1036, + "step": 7167 + }, + { + "epoch": 1.1613739468567725, + "grad_norm": 0.7200789451599121, + "learning_rate": 3.506177326005531e-06, + "loss": 0.0982, + "step": 7168 + }, + { + "epoch": 1.1615359688917692, + "grad_norm": 0.7286558151245117, + "learning_rate": 3.5057769881122674e-06, + "loss": 0.1023, + "step": 7169 + }, + { + "epoch": 1.161697990926766, + "grad_norm": 0.9604185223579407, + "learning_rate": 3.5053766194446626e-06, + "loss": 0.1162, + "step": 7170 + }, + { + "epoch": 1.161860012961763, + "grad_norm": 0.8336222171783447, + "learning_rate": 3.504976220014965e-06, + "loss": 0.098, + "step": 7171 + }, + { + "epoch": 1.1620220349967596, + "grad_norm": 0.7740049362182617, + "learning_rate": 3.504575789835428e-06, + "loss": 0.1082, + "step": 7172 + }, + { + "epoch": 1.1621840570317563, + "grad_norm": 0.8649178147315979, + "learning_rate": 3.5041753289183033e-06, + "loss": 0.107, + "step": 7173 + }, + { + "epoch": 1.162346079066753, + "grad_norm": 0.8420133590698242, + "learning_rate": 3.503774837275843e-06, + "loss": 0.1032, + "step": 7174 + }, + { + "epoch": 1.16250810110175, + "grad_norm": 0.8649719953536987, + "learning_rate": 3.5033743149203013e-06, + "loss": 0.1087, + "step": 7175 + }, + { + "epoch": 1.1626701231367467, + "grad_norm": 0.8566991686820984, + "learning_rate": 3.5029737618639344e-06, + "loss": 0.0972, + "step": 7176 + }, + { + "epoch": 1.1628321451717434, + "grad_norm": 0.7619419097900391, + "learning_rate": 3.5025731781189976e-06, + "loss": 0.0886, + "step": 7177 + }, + { + "epoch": 1.16299416720674, + "grad_norm": 0.7711379528045654, + "learning_rate": 3.5021725636977466e-06, + "loss": 0.0956, + "step": 7178 + }, + { + "epoch": 1.1631561892417368, + "grad_norm": 0.9927428364753723, + "learning_rate": 3.501771918612442e-06, + "loss": 0.1124, + "step": 7179 + }, + { + "epoch": 1.1633182112767337, + "grad_norm": 0.776023268699646, + "learning_rate": 3.5013712428753392e-06, + "loss": 0.0964, + "step": 7180 + }, + { + "epoch": 1.1634802333117304, + "grad_norm": 0.9973410367965698, + "learning_rate": 3.500970536498701e-06, + "loss": 0.1263, + "step": 7181 + }, + { + "epoch": 1.1636422553467272, + "grad_norm": 0.8526068329811096, + "learning_rate": 3.500569799494786e-06, + "loss": 0.1016, + "step": 7182 + }, + { + "epoch": 1.1638042773817239, + "grad_norm": 0.8753652572631836, + "learning_rate": 3.500169031875857e-06, + "loss": 0.0948, + "step": 7183 + }, + { + "epoch": 1.1639662994167206, + "grad_norm": 0.8186274170875549, + "learning_rate": 3.4997682336541756e-06, + "loss": 0.1025, + "step": 7184 + }, + { + "epoch": 1.1641283214517175, + "grad_norm": 0.7919983267784119, + "learning_rate": 3.4993674048420056e-06, + "loss": 0.1053, + "step": 7185 + }, + { + "epoch": 1.1642903434867142, + "grad_norm": 0.8245944976806641, + "learning_rate": 3.498966545451612e-06, + "loss": 0.0985, + "step": 7186 + }, + { + "epoch": 1.164452365521711, + "grad_norm": 0.8587673902511597, + "learning_rate": 3.4985656554952583e-06, + "loss": 0.1062, + "step": 7187 + }, + { + "epoch": 1.1646143875567077, + "grad_norm": 0.9467253088951111, + "learning_rate": 3.4981647349852137e-06, + "loss": 0.1154, + "step": 7188 + }, + { + "epoch": 1.1647764095917044, + "grad_norm": 0.8093448877334595, + "learning_rate": 3.497763783933743e-06, + "loss": 0.1013, + "step": 7189 + }, + { + "epoch": 1.1649384316267013, + "grad_norm": 0.8070668578147888, + "learning_rate": 3.4973628023531146e-06, + "loss": 0.0967, + "step": 7190 + }, + { + "epoch": 1.165100453661698, + "grad_norm": 0.8548862934112549, + "learning_rate": 3.4969617902555984e-06, + "loss": 0.0957, + "step": 7191 + }, + { + "epoch": 1.1652624756966947, + "grad_norm": 0.8574987649917603, + "learning_rate": 3.496560747653464e-06, + "loss": 0.0881, + "step": 7192 + }, + { + "epoch": 1.1654244977316914, + "grad_norm": 0.7991284132003784, + "learning_rate": 3.496159674558982e-06, + "loss": 0.0937, + "step": 7193 + }, + { + "epoch": 1.1655865197666881, + "grad_norm": 0.9537588357925415, + "learning_rate": 3.4957585709844254e-06, + "loss": 0.1166, + "step": 7194 + }, + { + "epoch": 1.165748541801685, + "grad_norm": 0.7716676592826843, + "learning_rate": 3.495357436942066e-06, + "loss": 0.0916, + "step": 7195 + }, + { + "epoch": 1.1659105638366818, + "grad_norm": 0.8516098856925964, + "learning_rate": 3.494956272444177e-06, + "loss": 0.1054, + "step": 7196 + }, + { + "epoch": 1.1660725858716785, + "grad_norm": 0.7702400088310242, + "learning_rate": 3.4945550775030346e-06, + "loss": 0.0919, + "step": 7197 + }, + { + "epoch": 1.1662346079066752, + "grad_norm": 0.8230445981025696, + "learning_rate": 3.494153852130913e-06, + "loss": 0.102, + "step": 7198 + }, + { + "epoch": 1.1663966299416721, + "grad_norm": 0.8861101269721985, + "learning_rate": 3.49375259634009e-06, + "loss": 0.1092, + "step": 7199 + }, + { + "epoch": 1.1665586519766689, + "grad_norm": 0.8347850441932678, + "learning_rate": 3.4933513101428416e-06, + "loss": 0.1064, + "step": 7200 + }, + { + "epoch": 1.1667206740116656, + "grad_norm": 0.7879016399383545, + "learning_rate": 3.492949993551448e-06, + "loss": 0.0927, + "step": 7201 + }, + { + "epoch": 1.1668826960466623, + "grad_norm": 0.8113725781440735, + "learning_rate": 3.4925486465781865e-06, + "loss": 0.0954, + "step": 7202 + }, + { + "epoch": 1.1670447180816592, + "grad_norm": 0.709119439125061, + "learning_rate": 3.492147269235339e-06, + "loss": 0.092, + "step": 7203 + }, + { + "epoch": 1.167206740116656, + "grad_norm": 1.0254310369491577, + "learning_rate": 3.4917458615351853e-06, + "loss": 0.1181, + "step": 7204 + }, + { + "epoch": 1.1673687621516526, + "grad_norm": 0.7908617258071899, + "learning_rate": 3.4913444234900092e-06, + "loss": 0.0991, + "step": 7205 + }, + { + "epoch": 1.1675307841866494, + "grad_norm": 0.9039345979690552, + "learning_rate": 3.490942955112092e-06, + "loss": 0.1172, + "step": 7206 + }, + { + "epoch": 1.167692806221646, + "grad_norm": 0.7758670449256897, + "learning_rate": 3.4905414564137187e-06, + "loss": 0.0894, + "step": 7207 + }, + { + "epoch": 1.167854828256643, + "grad_norm": 0.9767671823501587, + "learning_rate": 3.490139927407174e-06, + "loss": 0.1139, + "step": 7208 + }, + { + "epoch": 1.1680168502916397, + "grad_norm": 1.0379667282104492, + "learning_rate": 3.489738368104743e-06, + "loss": 0.1256, + "step": 7209 + }, + { + "epoch": 1.1681788723266364, + "grad_norm": 0.9443620443344116, + "learning_rate": 3.4893367785187137e-06, + "loss": 0.1217, + "step": 7210 + }, + { + "epoch": 1.1683408943616331, + "grad_norm": 0.8020828366279602, + "learning_rate": 3.488935158661373e-06, + "loss": 0.1089, + "step": 7211 + }, + { + "epoch": 1.1685029163966298, + "grad_norm": 0.8911034464836121, + "learning_rate": 3.4885335085450095e-06, + "loss": 0.1104, + "step": 7212 + }, + { + "epoch": 1.1686649384316268, + "grad_norm": 0.8075478672981262, + "learning_rate": 3.4881318281819134e-06, + "loss": 0.0951, + "step": 7213 + }, + { + "epoch": 1.1688269604666235, + "grad_norm": 0.9641019701957703, + "learning_rate": 3.4877301175843735e-06, + "loss": 0.1115, + "step": 7214 + }, + { + "epoch": 1.1689889825016202, + "grad_norm": 0.8437277674674988, + "learning_rate": 3.4873283767646828e-06, + "loss": 0.1074, + "step": 7215 + }, + { + "epoch": 1.169151004536617, + "grad_norm": 0.7787491679191589, + "learning_rate": 3.486926605735133e-06, + "loss": 0.0975, + "step": 7216 + }, + { + "epoch": 1.1693130265716136, + "grad_norm": 0.9039609432220459, + "learning_rate": 3.486524804508018e-06, + "loss": 0.1196, + "step": 7217 + }, + { + "epoch": 1.1694750486066106, + "grad_norm": 0.8110989928245544, + "learning_rate": 3.486122973095631e-06, + "loss": 0.0996, + "step": 7218 + }, + { + "epoch": 1.1696370706416073, + "grad_norm": 0.7526705265045166, + "learning_rate": 3.485721111510267e-06, + "loss": 0.1038, + "step": 7219 + }, + { + "epoch": 1.169799092676604, + "grad_norm": 0.9018827676773071, + "learning_rate": 3.4853192197642226e-06, + "loss": 0.124, + "step": 7220 + }, + { + "epoch": 1.1699611147116007, + "grad_norm": 0.8486402034759521, + "learning_rate": 3.4849172978697942e-06, + "loss": 0.1047, + "step": 7221 + }, + { + "epoch": 1.1701231367465976, + "grad_norm": 0.9394757151603699, + "learning_rate": 3.48451534583928e-06, + "loss": 0.1075, + "step": 7222 + }, + { + "epoch": 1.1702851587815943, + "grad_norm": 0.8895528316497803, + "learning_rate": 3.4841133636849787e-06, + "loss": 0.1223, + "step": 7223 + }, + { + "epoch": 1.170447180816591, + "grad_norm": 0.8455293774604797, + "learning_rate": 3.4837113514191907e-06, + "loss": 0.1074, + "step": 7224 + }, + { + "epoch": 1.1706092028515878, + "grad_norm": 0.9192505478858948, + "learning_rate": 3.483309309054216e-06, + "loss": 0.1201, + "step": 7225 + }, + { + "epoch": 1.1707712248865847, + "grad_norm": 0.9902666211128235, + "learning_rate": 3.482907236602354e-06, + "loss": 0.1139, + "step": 7226 + }, + { + "epoch": 1.1709332469215814, + "grad_norm": 0.8750571608543396, + "learning_rate": 3.4825051340759114e-06, + "loss": 0.1164, + "step": 7227 + }, + { + "epoch": 1.1710952689565781, + "grad_norm": 0.793886125087738, + "learning_rate": 3.4821030014871886e-06, + "loss": 0.1018, + "step": 7228 + }, + { + "epoch": 1.1712572909915748, + "grad_norm": 0.824790358543396, + "learning_rate": 3.48170083884849e-06, + "loss": 0.1069, + "step": 7229 + }, + { + "epoch": 1.1714193130265715, + "grad_norm": 0.8293957114219666, + "learning_rate": 3.481298646172122e-06, + "loss": 0.1003, + "step": 7230 + }, + { + "epoch": 1.1715813350615685, + "grad_norm": 0.8428707718849182, + "learning_rate": 3.4808964234703903e-06, + "loss": 0.0993, + "step": 7231 + }, + { + "epoch": 1.1717433570965652, + "grad_norm": 0.8865581154823303, + "learning_rate": 3.480494170755602e-06, + "loss": 0.1059, + "step": 7232 + }, + { + "epoch": 1.171905379131562, + "grad_norm": 0.8242313265800476, + "learning_rate": 3.4800918880400635e-06, + "loss": 0.105, + "step": 7233 + }, + { + "epoch": 1.1720674011665586, + "grad_norm": 0.843008279800415, + "learning_rate": 3.479689575336086e-06, + "loss": 0.099, + "step": 7234 + }, + { + "epoch": 1.1722294232015553, + "grad_norm": 0.7857580780982971, + "learning_rate": 3.479287232655978e-06, + "loss": 0.0967, + "step": 7235 + }, + { + "epoch": 1.1723914452365523, + "grad_norm": 0.8480691313743591, + "learning_rate": 3.4788848600120507e-06, + "loss": 0.0949, + "step": 7236 + }, + { + "epoch": 1.172553467271549, + "grad_norm": 0.783110499382019, + "learning_rate": 3.4784824574166153e-06, + "loss": 0.0997, + "step": 7237 + }, + { + "epoch": 1.1727154893065457, + "grad_norm": 0.8678748607635498, + "learning_rate": 3.4780800248819847e-06, + "loss": 0.0947, + "step": 7238 + }, + { + "epoch": 1.1728775113415424, + "grad_norm": 0.8238358497619629, + "learning_rate": 3.477677562420472e-06, + "loss": 0.1107, + "step": 7239 + }, + { + "epoch": 1.173039533376539, + "grad_norm": 0.8292809724807739, + "learning_rate": 3.4772750700443923e-06, + "loss": 0.1047, + "step": 7240 + }, + { + "epoch": 1.173201555411536, + "grad_norm": 0.8215393424034119, + "learning_rate": 3.47687254776606e-06, + "loss": 0.0902, + "step": 7241 + }, + { + "epoch": 1.1733635774465327, + "grad_norm": 0.9231199622154236, + "learning_rate": 3.476469995597792e-06, + "loss": 0.1207, + "step": 7242 + }, + { + "epoch": 1.1735255994815295, + "grad_norm": 0.7816325426101685, + "learning_rate": 3.476067413551906e-06, + "loss": 0.0951, + "step": 7243 + }, + { + "epoch": 1.1736876215165262, + "grad_norm": 0.8484027981758118, + "learning_rate": 3.4756648016407175e-06, + "loss": 0.103, + "step": 7244 + }, + { + "epoch": 1.173849643551523, + "grad_norm": 0.7972317337989807, + "learning_rate": 3.475262159876548e-06, + "loss": 0.1049, + "step": 7245 + }, + { + "epoch": 1.1740116655865198, + "grad_norm": 0.8663620948791504, + "learning_rate": 3.4748594882717163e-06, + "loss": 0.0965, + "step": 7246 + }, + { + "epoch": 1.1741736876215165, + "grad_norm": 0.9606160521507263, + "learning_rate": 3.4744567868385432e-06, + "loss": 0.1068, + "step": 7247 + }, + { + "epoch": 1.1743357096565132, + "grad_norm": 0.9100252389907837, + "learning_rate": 3.474054055589351e-06, + "loss": 0.1114, + "step": 7248 + }, + { + "epoch": 1.17449773169151, + "grad_norm": 0.7617812156677246, + "learning_rate": 3.473651294536462e-06, + "loss": 0.1033, + "step": 7249 + }, + { + "epoch": 1.1746597537265069, + "grad_norm": 0.7390443086624146, + "learning_rate": 3.473248503692199e-06, + "loss": 0.0894, + "step": 7250 + }, + { + "epoch": 1.1748217757615036, + "grad_norm": 0.9591291546821594, + "learning_rate": 3.4728456830688873e-06, + "loss": 0.1099, + "step": 7251 + }, + { + "epoch": 1.1749837977965003, + "grad_norm": 0.8032423853874207, + "learning_rate": 3.472442832678852e-06, + "loss": 0.1055, + "step": 7252 + }, + { + "epoch": 1.175145819831497, + "grad_norm": 0.8758203387260437, + "learning_rate": 3.472039952534419e-06, + "loss": 0.107, + "step": 7253 + }, + { + "epoch": 1.175307841866494, + "grad_norm": 0.8825302124023438, + "learning_rate": 3.471637042647916e-06, + "loss": 0.1115, + "step": 7254 + }, + { + "epoch": 1.1754698639014907, + "grad_norm": 0.9467746019363403, + "learning_rate": 3.471234103031671e-06, + "loss": 0.122, + "step": 7255 + }, + { + "epoch": 1.1756318859364874, + "grad_norm": 0.8400065302848816, + "learning_rate": 3.470831133698013e-06, + "loss": 0.1034, + "step": 7256 + }, + { + "epoch": 1.175793907971484, + "grad_norm": 0.8600103259086609, + "learning_rate": 3.4704281346592703e-06, + "loss": 0.1021, + "step": 7257 + }, + { + "epoch": 1.1759559300064808, + "grad_norm": 1.0073620080947876, + "learning_rate": 3.470025105927777e-06, + "loss": 0.1043, + "step": 7258 + }, + { + "epoch": 1.1761179520414777, + "grad_norm": 0.8043482303619385, + "learning_rate": 3.4696220475158615e-06, + "loss": 0.0953, + "step": 7259 + }, + { + "epoch": 1.1762799740764744, + "grad_norm": 0.9232493042945862, + "learning_rate": 3.4692189594358578e-06, + "loss": 0.1104, + "step": 7260 + }, + { + "epoch": 1.1764419961114712, + "grad_norm": 0.8835765719413757, + "learning_rate": 3.4688158417000993e-06, + "loss": 0.1025, + "step": 7261 + }, + { + "epoch": 1.1766040181464679, + "grad_norm": 0.9452476501464844, + "learning_rate": 3.468412694320921e-06, + "loss": 0.1119, + "step": 7262 + }, + { + "epoch": 1.1767660401814646, + "grad_norm": 0.7869393825531006, + "learning_rate": 3.468009517310659e-06, + "loss": 0.0965, + "step": 7263 + }, + { + "epoch": 1.1769280622164615, + "grad_norm": 1.0424165725708008, + "learning_rate": 3.467606310681646e-06, + "loss": 0.1077, + "step": 7264 + }, + { + "epoch": 1.1770900842514582, + "grad_norm": 0.8021036982536316, + "learning_rate": 3.4672030744462224e-06, + "loss": 0.1049, + "step": 7265 + }, + { + "epoch": 1.177252106286455, + "grad_norm": 0.830315113067627, + "learning_rate": 3.4667998086167253e-06, + "loss": 0.099, + "step": 7266 + }, + { + "epoch": 1.1774141283214516, + "grad_norm": 0.93221116065979, + "learning_rate": 3.4663965132054943e-06, + "loss": 0.1029, + "step": 7267 + }, + { + "epoch": 1.1775761503564484, + "grad_norm": 0.9032275080680847, + "learning_rate": 3.465993188224868e-06, + "loss": 0.1074, + "step": 7268 + }, + { + "epoch": 1.1777381723914453, + "grad_norm": 0.8030032515525818, + "learning_rate": 3.465589833687188e-06, + "loss": 0.0987, + "step": 7269 + }, + { + "epoch": 1.177900194426442, + "grad_norm": 0.7802344560623169, + "learning_rate": 3.4651864496047952e-06, + "loss": 0.0968, + "step": 7270 + }, + { + "epoch": 1.1780622164614387, + "grad_norm": 0.816124439239502, + "learning_rate": 3.4647830359900335e-06, + "loss": 0.1089, + "step": 7271 + }, + { + "epoch": 1.1782242384964354, + "grad_norm": 0.6987655758857727, + "learning_rate": 3.464379592855246e-06, + "loss": 0.0939, + "step": 7272 + }, + { + "epoch": 1.1783862605314324, + "grad_norm": 0.9841788411140442, + "learning_rate": 3.463976120212776e-06, + "loss": 0.1049, + "step": 7273 + }, + { + "epoch": 1.178548282566429, + "grad_norm": 0.9158380627632141, + "learning_rate": 3.4635726180749698e-06, + "loss": 0.1152, + "step": 7274 + }, + { + "epoch": 1.1787103046014258, + "grad_norm": 0.7727981805801392, + "learning_rate": 3.4631690864541723e-06, + "loss": 0.1027, + "step": 7275 + }, + { + "epoch": 1.1788723266364225, + "grad_norm": 0.7879157662391663, + "learning_rate": 3.4627655253627324e-06, + "loss": 0.1041, + "step": 7276 + }, + { + "epoch": 1.1790343486714194, + "grad_norm": 0.8352596759796143, + "learning_rate": 3.4623619348129973e-06, + "loss": 0.1104, + "step": 7277 + }, + { + "epoch": 1.1791963707064161, + "grad_norm": 0.8539366126060486, + "learning_rate": 3.461958314817316e-06, + "loss": 0.1097, + "step": 7278 + }, + { + "epoch": 1.1793583927414129, + "grad_norm": 0.8420810699462891, + "learning_rate": 3.461554665388038e-06, + "loss": 0.111, + "step": 7279 + }, + { + "epoch": 1.1795204147764096, + "grad_norm": 0.864422082901001, + "learning_rate": 3.4611509865375143e-06, + "loss": 0.0987, + "step": 7280 + }, + { + "epoch": 1.1796824368114063, + "grad_norm": 0.9048283100128174, + "learning_rate": 3.460747278278096e-06, + "loss": 0.1139, + "step": 7281 + }, + { + "epoch": 1.1798444588464032, + "grad_norm": 0.9229883551597595, + "learning_rate": 3.4603435406221356e-06, + "loss": 0.1075, + "step": 7282 + }, + { + "epoch": 1.1800064808814, + "grad_norm": 0.8494004011154175, + "learning_rate": 3.4599397735819877e-06, + "loss": 0.1002, + "step": 7283 + }, + { + "epoch": 1.1801685029163966, + "grad_norm": 0.926969051361084, + "learning_rate": 3.4595359771700055e-06, + "loss": 0.1131, + "step": 7284 + }, + { + "epoch": 1.1803305249513933, + "grad_norm": 0.9146028757095337, + "learning_rate": 3.459132151398544e-06, + "loss": 0.1114, + "step": 7285 + }, + { + "epoch": 1.18049254698639, + "grad_norm": 0.937514066696167, + "learning_rate": 3.4587282962799602e-06, + "loss": 0.1124, + "step": 7286 + }, + { + "epoch": 1.180654569021387, + "grad_norm": 0.7594295144081116, + "learning_rate": 3.4583244118266107e-06, + "loss": 0.0934, + "step": 7287 + }, + { + "epoch": 1.1808165910563837, + "grad_norm": 0.8070241212844849, + "learning_rate": 3.4579204980508525e-06, + "loss": 0.0971, + "step": 7288 + }, + { + "epoch": 1.1809786130913804, + "grad_norm": 0.8179810047149658, + "learning_rate": 3.4575165549650463e-06, + "loss": 0.0992, + "step": 7289 + }, + { + "epoch": 1.1811406351263771, + "grad_norm": 0.8048136830329895, + "learning_rate": 3.45711258258155e-06, + "loss": 0.1008, + "step": 7290 + }, + { + "epoch": 1.1813026571613738, + "grad_norm": 0.755553662776947, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.0869, + "step": 7291 + }, + { + "epoch": 1.1814646791963708, + "grad_norm": 0.9157342314720154, + "learning_rate": 3.4563045499709324e-06, + "loss": 0.118, + "step": 7292 + }, + { + "epoch": 1.1816267012313675, + "grad_norm": 1.028659462928772, + "learning_rate": 3.455900489768535e-06, + "loss": 0.1275, + "step": 7293 + }, + { + "epoch": 1.1817887232663642, + "grad_norm": 0.9139890074729919, + "learning_rate": 3.455496400317896e-06, + "loss": 0.1034, + "step": 7294 + }, + { + "epoch": 1.181950745301361, + "grad_norm": 0.8444374799728394, + "learning_rate": 3.455092281631379e-06, + "loss": 0.1173, + "step": 7295 + }, + { + "epoch": 1.1821127673363578, + "grad_norm": 0.8052892088890076, + "learning_rate": 3.45468813372135e-06, + "loss": 0.097, + "step": 7296 + }, + { + "epoch": 1.1822747893713546, + "grad_norm": 0.7657241225242615, + "learning_rate": 3.4542839566001728e-06, + "loss": 0.0919, + "step": 7297 + }, + { + "epoch": 1.1824368114063513, + "grad_norm": 0.8102367520332336, + "learning_rate": 3.453879750280218e-06, + "loss": 0.1024, + "step": 7298 + }, + { + "epoch": 1.182598833441348, + "grad_norm": 0.7548478245735168, + "learning_rate": 3.45347551477385e-06, + "loss": 0.0999, + "step": 7299 + }, + { + "epoch": 1.1827608554763447, + "grad_norm": 0.8339917659759521, + "learning_rate": 3.4530712500934393e-06, + "loss": 0.0953, + "step": 7300 + }, + { + "epoch": 1.1829228775113416, + "grad_norm": 0.7205444574356079, + "learning_rate": 3.4526669562513536e-06, + "loss": 0.0898, + "step": 7301 + }, + { + "epoch": 1.1830848995463383, + "grad_norm": 0.8617227077484131, + "learning_rate": 3.4522626332599657e-06, + "loss": 0.1154, + "step": 7302 + }, + { + "epoch": 1.183246921581335, + "grad_norm": 0.7841586470603943, + "learning_rate": 3.4518582811316455e-06, + "loss": 0.1032, + "step": 7303 + }, + { + "epoch": 1.1834089436163318, + "grad_norm": 0.9288532733917236, + "learning_rate": 3.451453899878765e-06, + "loss": 0.1166, + "step": 7304 + }, + { + "epoch": 1.1835709656513287, + "grad_norm": 0.7976135611534119, + "learning_rate": 3.4510494895136975e-06, + "loss": 0.1023, + "step": 7305 + }, + { + "epoch": 1.1837329876863254, + "grad_norm": 0.8904582858085632, + "learning_rate": 3.450645050048817e-06, + "loss": 0.1189, + "step": 7306 + }, + { + "epoch": 1.1838950097213221, + "grad_norm": 0.8351088166236877, + "learning_rate": 3.450240581496499e-06, + "loss": 0.0989, + "step": 7307 + }, + { + "epoch": 1.1840570317563188, + "grad_norm": 0.8244988322257996, + "learning_rate": 3.449836083869118e-06, + "loss": 0.1055, + "step": 7308 + }, + { + "epoch": 1.1842190537913155, + "grad_norm": 0.7345836758613586, + "learning_rate": 3.449431557179052e-06, + "loss": 0.0927, + "step": 7309 + }, + { + "epoch": 1.1843810758263125, + "grad_norm": 0.8357600569725037, + "learning_rate": 3.449027001438678e-06, + "loss": 0.1007, + "step": 7310 + }, + { + "epoch": 1.1845430978613092, + "grad_norm": 0.7918616533279419, + "learning_rate": 3.4486224166603743e-06, + "loss": 0.0971, + "step": 7311 + }, + { + "epoch": 1.184705119896306, + "grad_norm": 0.8694443702697754, + "learning_rate": 3.44821780285652e-06, + "loss": 0.109, + "step": 7312 + }, + { + "epoch": 1.1848671419313026, + "grad_norm": 0.8621920943260193, + "learning_rate": 3.447813160039496e-06, + "loss": 0.106, + "step": 7313 + }, + { + "epoch": 1.1850291639662993, + "grad_norm": 0.8037292957305908, + "learning_rate": 3.4474084882216826e-06, + "loss": 0.1025, + "step": 7314 + }, + { + "epoch": 1.1851911860012962, + "grad_norm": 0.8269169330596924, + "learning_rate": 3.447003787415462e-06, + "loss": 0.114, + "step": 7315 + }, + { + "epoch": 1.185353208036293, + "grad_norm": 0.9500218629837036, + "learning_rate": 3.4465990576332177e-06, + "loss": 0.114, + "step": 7316 + }, + { + "epoch": 1.1855152300712897, + "grad_norm": 0.9417467713356018, + "learning_rate": 3.4461942988873322e-06, + "loss": 0.1184, + "step": 7317 + }, + { + "epoch": 1.1856772521062864, + "grad_norm": 0.8411504030227661, + "learning_rate": 3.445789511190192e-06, + "loss": 0.1066, + "step": 7318 + }, + { + "epoch": 1.185839274141283, + "grad_norm": 0.8924190402030945, + "learning_rate": 3.44538469455418e-06, + "loss": 0.102, + "step": 7319 + }, + { + "epoch": 1.18600129617628, + "grad_norm": 0.977077305316925, + "learning_rate": 3.4449798489916856e-06, + "loss": 0.1224, + "step": 7320 + }, + { + "epoch": 1.1861633182112767, + "grad_norm": 0.949083149433136, + "learning_rate": 3.444574974515094e-06, + "loss": 0.1256, + "step": 7321 + }, + { + "epoch": 1.1863253402462735, + "grad_norm": 0.8046411871910095, + "learning_rate": 3.444170071136794e-06, + "loss": 0.098, + "step": 7322 + }, + { + "epoch": 1.1864873622812702, + "grad_norm": 0.918962299823761, + "learning_rate": 3.443765138869175e-06, + "loss": 0.1103, + "step": 7323 + }, + { + "epoch": 1.186649384316267, + "grad_norm": 0.9088669419288635, + "learning_rate": 3.4433601777246263e-06, + "loss": 0.1128, + "step": 7324 + }, + { + "epoch": 1.1868114063512638, + "grad_norm": 0.8256091475486755, + "learning_rate": 3.4429551877155396e-06, + "loss": 0.1039, + "step": 7325 + }, + { + "epoch": 1.1869734283862605, + "grad_norm": 0.8927512168884277, + "learning_rate": 3.442550168854305e-06, + "loss": 0.1106, + "step": 7326 + }, + { + "epoch": 1.1871354504212572, + "grad_norm": 0.7968646883964539, + "learning_rate": 3.442145121153317e-06, + "loss": 0.102, + "step": 7327 + }, + { + "epoch": 1.1872974724562542, + "grad_norm": 0.8282347321510315, + "learning_rate": 3.4417400446249684e-06, + "loss": 0.1074, + "step": 7328 + }, + { + "epoch": 1.1874594944912509, + "grad_norm": 0.7620131969451904, + "learning_rate": 3.4413349392816526e-06, + "loss": 0.0978, + "step": 7329 + }, + { + "epoch": 1.1876215165262476, + "grad_norm": 0.7769449353218079, + "learning_rate": 3.440929805135766e-06, + "loss": 0.0909, + "step": 7330 + }, + { + "epoch": 1.1877835385612443, + "grad_norm": 0.8219037055969238, + "learning_rate": 3.440524642199704e-06, + "loss": 0.1035, + "step": 7331 + }, + { + "epoch": 1.187945560596241, + "grad_norm": 0.9052240252494812, + "learning_rate": 3.440119450485865e-06, + "loss": 0.1266, + "step": 7332 + }, + { + "epoch": 1.188107582631238, + "grad_norm": 0.8714154958724976, + "learning_rate": 3.439714230006645e-06, + "loss": 0.1143, + "step": 7333 + }, + { + "epoch": 1.1882696046662347, + "grad_norm": 0.7879825234413147, + "learning_rate": 3.439308980774444e-06, + "loss": 0.0973, + "step": 7334 + }, + { + "epoch": 1.1884316267012314, + "grad_norm": 0.8592246770858765, + "learning_rate": 3.4389037028016615e-06, + "loss": 0.1139, + "step": 7335 + }, + { + "epoch": 1.188593648736228, + "grad_norm": 0.7850585579872131, + "learning_rate": 3.438498396100697e-06, + "loss": 0.1065, + "step": 7336 + }, + { + "epoch": 1.1887556707712248, + "grad_norm": 0.8654761910438538, + "learning_rate": 3.438093060683953e-06, + "loss": 0.1017, + "step": 7337 + }, + { + "epoch": 1.1889176928062217, + "grad_norm": 0.6961141228675842, + "learning_rate": 3.4376876965638317e-06, + "loss": 0.0819, + "step": 7338 + }, + { + "epoch": 1.1890797148412184, + "grad_norm": 0.9045152068138123, + "learning_rate": 3.4372823037527357e-06, + "loss": 0.1117, + "step": 7339 + }, + { + "epoch": 1.1892417368762151, + "grad_norm": 0.9146777391433716, + "learning_rate": 3.4368768822630705e-06, + "loss": 0.1137, + "step": 7340 + }, + { + "epoch": 1.1894037589112119, + "grad_norm": 0.7968778014183044, + "learning_rate": 3.4364714321072384e-06, + "loss": 0.0937, + "step": 7341 + }, + { + "epoch": 1.1895657809462086, + "grad_norm": 0.8366844654083252, + "learning_rate": 3.4360659532976475e-06, + "loss": 0.1011, + "step": 7342 + }, + { + "epoch": 1.1897278029812055, + "grad_norm": 0.9418715238571167, + "learning_rate": 3.4356604458467034e-06, + "loss": 0.1189, + "step": 7343 + }, + { + "epoch": 1.1898898250162022, + "grad_norm": 0.8848615288734436, + "learning_rate": 3.435254909766814e-06, + "loss": 0.1043, + "step": 7344 + }, + { + "epoch": 1.190051847051199, + "grad_norm": 0.7343488931655884, + "learning_rate": 3.434849345070388e-06, + "loss": 0.1001, + "step": 7345 + }, + { + "epoch": 1.1902138690861956, + "grad_norm": 0.9168867468833923, + "learning_rate": 3.4344437517698336e-06, + "loss": 0.1032, + "step": 7346 + }, + { + "epoch": 1.1903758911211926, + "grad_norm": 0.7533358931541443, + "learning_rate": 3.4340381298775628e-06, + "loss": 0.0936, + "step": 7347 + }, + { + "epoch": 1.1905379131561893, + "grad_norm": 0.8484786152839661, + "learning_rate": 3.433632479405984e-06, + "loss": 0.1138, + "step": 7348 + }, + { + "epoch": 1.190699935191186, + "grad_norm": 0.7753875255584717, + "learning_rate": 3.4332268003675117e-06, + "loss": 0.0983, + "step": 7349 + }, + { + "epoch": 1.1908619572261827, + "grad_norm": 0.7216082215309143, + "learning_rate": 3.4328210927745577e-06, + "loss": 0.0931, + "step": 7350 + }, + { + "epoch": 1.1910239792611796, + "grad_norm": 0.8276437520980835, + "learning_rate": 3.432415356639536e-06, + "loss": 0.1084, + "step": 7351 + }, + { + "epoch": 1.1911860012961764, + "grad_norm": 0.9757489562034607, + "learning_rate": 3.4320095919748596e-06, + "loss": 0.1231, + "step": 7352 + }, + { + "epoch": 1.191348023331173, + "grad_norm": 0.760451078414917, + "learning_rate": 3.4316037987929456e-06, + "loss": 0.0991, + "step": 7353 + }, + { + "epoch": 1.1915100453661698, + "grad_norm": 0.7524585127830505, + "learning_rate": 3.43119797710621e-06, + "loss": 0.0946, + "step": 7354 + }, + { + "epoch": 1.1916720674011665, + "grad_norm": 0.9145233631134033, + "learning_rate": 3.4307921269270694e-06, + "loss": 0.1226, + "step": 7355 + }, + { + "epoch": 1.1918340894361634, + "grad_norm": 0.8342117667198181, + "learning_rate": 3.4303862482679435e-06, + "loss": 0.1062, + "step": 7356 + }, + { + "epoch": 1.1919961114711601, + "grad_norm": 0.9443584680557251, + "learning_rate": 3.429980341141248e-06, + "loss": 0.1181, + "step": 7357 + }, + { + "epoch": 1.1921581335061568, + "grad_norm": 0.7842267155647278, + "learning_rate": 3.429574405559406e-06, + "loss": 0.1026, + "step": 7358 + }, + { + "epoch": 1.1923201555411536, + "grad_norm": 0.8545981645584106, + "learning_rate": 3.429168441534836e-06, + "loss": 0.1103, + "step": 7359 + }, + { + "epoch": 1.1924821775761503, + "grad_norm": 0.8200758695602417, + "learning_rate": 3.4287624490799605e-06, + "loss": 0.1054, + "step": 7360 + }, + { + "epoch": 1.1926441996111472, + "grad_norm": 0.8782671093940735, + "learning_rate": 3.428356428207201e-06, + "loss": 0.1025, + "step": 7361 + }, + { + "epoch": 1.192806221646144, + "grad_norm": 0.841168999671936, + "learning_rate": 3.4279503789289824e-06, + "loss": 0.1002, + "step": 7362 + }, + { + "epoch": 1.1929682436811406, + "grad_norm": 0.7707106471061707, + "learning_rate": 3.427544301257727e-06, + "loss": 0.0937, + "step": 7363 + }, + { + "epoch": 1.1931302657161373, + "grad_norm": 1.0922257900238037, + "learning_rate": 3.4271381952058607e-06, + "loss": 0.1023, + "step": 7364 + }, + { + "epoch": 1.193292287751134, + "grad_norm": 0.8869131803512573, + "learning_rate": 3.4267320607858094e-06, + "loss": 0.1113, + "step": 7365 + }, + { + "epoch": 1.193454309786131, + "grad_norm": 0.8469724655151367, + "learning_rate": 3.42632589801e-06, + "loss": 0.1005, + "step": 7366 + }, + { + "epoch": 1.1936163318211277, + "grad_norm": 0.9287112951278687, + "learning_rate": 3.42591970689086e-06, + "loss": 0.1235, + "step": 7367 + }, + { + "epoch": 1.1937783538561244, + "grad_norm": 0.7303425669670105, + "learning_rate": 3.425513487440817e-06, + "loss": 0.0855, + "step": 7368 + }, + { + "epoch": 1.1939403758911211, + "grad_norm": 0.7840588688850403, + "learning_rate": 3.425107239672301e-06, + "loss": 0.1109, + "step": 7369 + }, + { + "epoch": 1.1941023979261178, + "grad_norm": 0.9029366970062256, + "learning_rate": 3.4247009635977425e-06, + "loss": 0.1064, + "step": 7370 + }, + { + "epoch": 1.1942644199611148, + "grad_norm": 0.9053102731704712, + "learning_rate": 3.4242946592295724e-06, + "loss": 0.1151, + "step": 7371 + }, + { + "epoch": 1.1944264419961115, + "grad_norm": 0.7992795705795288, + "learning_rate": 3.4238883265802215e-06, + "loss": 0.1023, + "step": 7372 + }, + { + "epoch": 1.1945884640311082, + "grad_norm": 0.7910366058349609, + "learning_rate": 3.423481965662125e-06, + "loss": 0.1009, + "step": 7373 + }, + { + "epoch": 1.194750486066105, + "grad_norm": 0.756036102771759, + "learning_rate": 3.4230755764877133e-06, + "loss": 0.0944, + "step": 7374 + }, + { + "epoch": 1.1949125081011018, + "grad_norm": 0.8976595401763916, + "learning_rate": 3.4226691590694244e-06, + "loss": 0.1048, + "step": 7375 + }, + { + "epoch": 1.1950745301360985, + "grad_norm": 0.8526069521903992, + "learning_rate": 3.4222627134196917e-06, + "loss": 0.1115, + "step": 7376 + }, + { + "epoch": 1.1952365521710953, + "grad_norm": 0.8503440022468567, + "learning_rate": 3.4218562395509513e-06, + "loss": 0.1178, + "step": 7377 + }, + { + "epoch": 1.195398574206092, + "grad_norm": 0.8521484732627869, + "learning_rate": 3.4214497374756415e-06, + "loss": 0.1027, + "step": 7378 + }, + { + "epoch": 1.195560596241089, + "grad_norm": 0.8041279315948486, + "learning_rate": 3.421043207206199e-06, + "loss": 0.0938, + "step": 7379 + }, + { + "epoch": 1.1957226182760856, + "grad_norm": 0.7933258414268494, + "learning_rate": 3.4206366487550637e-06, + "loss": 0.0986, + "step": 7380 + }, + { + "epoch": 1.1958846403110823, + "grad_norm": 0.7659348845481873, + "learning_rate": 3.420230062134674e-06, + "loss": 0.0948, + "step": 7381 + }, + { + "epoch": 1.196046662346079, + "grad_norm": 0.766547679901123, + "learning_rate": 3.419823447357472e-06, + "loss": 0.0984, + "step": 7382 + }, + { + "epoch": 1.1962086843810757, + "grad_norm": 0.9550457000732422, + "learning_rate": 3.419416804435899e-06, + "loss": 0.1161, + "step": 7383 + }, + { + "epoch": 1.1963707064160727, + "grad_norm": 0.8955060243606567, + "learning_rate": 3.4190101333823956e-06, + "loss": 0.1151, + "step": 7384 + }, + { + "epoch": 1.1965327284510694, + "grad_norm": 0.8570713996887207, + "learning_rate": 3.4186034342094066e-06, + "loss": 0.1072, + "step": 7385 + }, + { + "epoch": 1.196694750486066, + "grad_norm": 0.9379814863204956, + "learning_rate": 3.4181967069293754e-06, + "loss": 0.1144, + "step": 7386 + }, + { + "epoch": 1.1968567725210628, + "grad_norm": 0.8272217512130737, + "learning_rate": 3.417789951554747e-06, + "loss": 0.1065, + "step": 7387 + }, + { + "epoch": 1.1970187945560595, + "grad_norm": 0.8084843754768372, + "learning_rate": 3.417383168097967e-06, + "loss": 0.0922, + "step": 7388 + }, + { + "epoch": 1.1971808165910565, + "grad_norm": 0.894687294960022, + "learning_rate": 3.4169763565714824e-06, + "loss": 0.1148, + "step": 7389 + }, + { + "epoch": 1.1973428386260532, + "grad_norm": 0.7344521284103394, + "learning_rate": 3.41656951698774e-06, + "loss": 0.0919, + "step": 7390 + }, + { + "epoch": 1.1975048606610499, + "grad_norm": 0.7177636623382568, + "learning_rate": 3.416162649359189e-06, + "loss": 0.0873, + "step": 7391 + }, + { + "epoch": 1.1976668826960466, + "grad_norm": 0.8915464282035828, + "learning_rate": 3.4157557536982773e-06, + "loss": 0.1083, + "step": 7392 + }, + { + "epoch": 1.1978289047310433, + "grad_norm": 0.8106177449226379, + "learning_rate": 3.4153488300174557e-06, + "loss": 0.0969, + "step": 7393 + }, + { + "epoch": 1.1979909267660402, + "grad_norm": 0.8630214929580688, + "learning_rate": 3.414941878329175e-06, + "loss": 0.1064, + "step": 7394 + }, + { + "epoch": 1.198152948801037, + "grad_norm": 0.9472280144691467, + "learning_rate": 3.4145348986458874e-06, + "loss": 0.1136, + "step": 7395 + }, + { + "epoch": 1.1983149708360337, + "grad_norm": 0.795714795589447, + "learning_rate": 3.4141278909800444e-06, + "loss": 0.0959, + "step": 7396 + }, + { + "epoch": 1.1984769928710304, + "grad_norm": 0.8350076675415039, + "learning_rate": 3.4137208553441008e-06, + "loss": 0.0958, + "step": 7397 + }, + { + "epoch": 1.1986390149060273, + "grad_norm": 0.9593220353126526, + "learning_rate": 3.41331379175051e-06, + "loss": 0.1109, + "step": 7398 + }, + { + "epoch": 1.198801036941024, + "grad_norm": 0.7994752526283264, + "learning_rate": 3.4129067002117266e-06, + "loss": 0.1018, + "step": 7399 + }, + { + "epoch": 1.1989630589760207, + "grad_norm": 0.7729980945587158, + "learning_rate": 3.4124995807402082e-06, + "loss": 0.0983, + "step": 7400 + }, + { + "epoch": 1.1991250810110174, + "grad_norm": 0.8964594602584839, + "learning_rate": 3.412092433348411e-06, + "loss": 0.1145, + "step": 7401 + }, + { + "epoch": 1.1992871030460144, + "grad_norm": 0.8727504014968872, + "learning_rate": 3.4116852580487925e-06, + "loss": 0.1036, + "step": 7402 + }, + { + "epoch": 1.199449125081011, + "grad_norm": 0.8404155373573303, + "learning_rate": 3.4112780548538097e-06, + "loss": 0.101, + "step": 7403 + }, + { + "epoch": 1.1996111471160078, + "grad_norm": 0.7900668382644653, + "learning_rate": 3.4108708237759258e-06, + "loss": 0.096, + "step": 7404 + }, + { + "epoch": 1.1997731691510045, + "grad_norm": 1.0467677116394043, + "learning_rate": 3.4104635648275975e-06, + "loss": 0.1102, + "step": 7405 + }, + { + "epoch": 1.1999351911860012, + "grad_norm": 0.7542744874954224, + "learning_rate": 3.4100562780212887e-06, + "loss": 0.0906, + "step": 7406 + }, + { + "epoch": 1.2000972132209982, + "grad_norm": 0.898759663105011, + "learning_rate": 3.409648963369459e-06, + "loss": 0.1133, + "step": 7407 + }, + { + "epoch": 1.2002592352559949, + "grad_norm": 0.758513867855072, + "learning_rate": 3.4092416208845723e-06, + "loss": 0.0935, + "step": 7408 + }, + { + "epoch": 1.2004212572909916, + "grad_norm": 0.8105527758598328, + "learning_rate": 3.408834250579093e-06, + "loss": 0.1066, + "step": 7409 + }, + { + "epoch": 1.2005832793259883, + "grad_norm": 0.8187546730041504, + "learning_rate": 3.4084268524654847e-06, + "loss": 0.1023, + "step": 7410 + }, + { + "epoch": 1.200745301360985, + "grad_norm": 0.8059993386268616, + "learning_rate": 3.4080194265562133e-06, + "loss": 0.1112, + "step": 7411 + }, + { + "epoch": 1.200907323395982, + "grad_norm": 0.8363770246505737, + "learning_rate": 3.407611972863744e-06, + "loss": 0.1069, + "step": 7412 + }, + { + "epoch": 1.2010693454309787, + "grad_norm": 0.7790720462799072, + "learning_rate": 3.407204491400546e-06, + "loss": 0.1021, + "step": 7413 + }, + { + "epoch": 1.2012313674659754, + "grad_norm": 0.8233287930488586, + "learning_rate": 3.406796982179085e-06, + "loss": 0.1054, + "step": 7414 + }, + { + "epoch": 1.201393389500972, + "grad_norm": 0.8368380665779114, + "learning_rate": 3.4063894452118313e-06, + "loss": 0.1095, + "step": 7415 + }, + { + "epoch": 1.2015554115359688, + "grad_norm": 0.8810364007949829, + "learning_rate": 3.4059818805112534e-06, + "loss": 0.1126, + "step": 7416 + }, + { + "epoch": 1.2017174335709657, + "grad_norm": 0.9108153581619263, + "learning_rate": 3.4055742880898223e-06, + "loss": 0.1095, + "step": 7417 + }, + { + "epoch": 1.2018794556059624, + "grad_norm": 0.7670288681983948, + "learning_rate": 3.4051666679600105e-06, + "loss": 0.0969, + "step": 7418 + }, + { + "epoch": 1.2020414776409591, + "grad_norm": 0.6589247584342957, + "learning_rate": 3.404759020134288e-06, + "loss": 0.0842, + "step": 7419 + }, + { + "epoch": 1.2022034996759559, + "grad_norm": 0.7312752604484558, + "learning_rate": 3.40435134462513e-06, + "loss": 0.0972, + "step": 7420 + }, + { + "epoch": 1.2023655217109526, + "grad_norm": 0.8262923359870911, + "learning_rate": 3.403943641445008e-06, + "loss": 0.1024, + "step": 7421 + }, + { + "epoch": 1.2025275437459495, + "grad_norm": 0.8193771243095398, + "learning_rate": 3.403535910606399e-06, + "loss": 0.1016, + "step": 7422 + }, + { + "epoch": 1.2026895657809462, + "grad_norm": 0.7638645768165588, + "learning_rate": 3.4031281521217772e-06, + "loss": 0.0947, + "step": 7423 + }, + { + "epoch": 1.202851587815943, + "grad_norm": 0.9505733847618103, + "learning_rate": 3.4027203660036202e-06, + "loss": 0.1089, + "step": 7424 + }, + { + "epoch": 1.2030136098509396, + "grad_norm": 0.7956478595733643, + "learning_rate": 3.402312552264404e-06, + "loss": 0.0925, + "step": 7425 + }, + { + "epoch": 1.2031756318859366, + "grad_norm": 0.7707629203796387, + "learning_rate": 3.4019047109166077e-06, + "loss": 0.0984, + "step": 7426 + }, + { + "epoch": 1.2033376539209333, + "grad_norm": 0.8056702613830566, + "learning_rate": 3.401496841972709e-06, + "loss": 0.1076, + "step": 7427 + }, + { + "epoch": 1.20349967595593, + "grad_norm": 0.7400745749473572, + "learning_rate": 3.401088945445189e-06, + "loss": 0.0873, + "step": 7428 + }, + { + "epoch": 1.2036616979909267, + "grad_norm": 0.891022801399231, + "learning_rate": 3.400681021346528e-06, + "loss": 0.1001, + "step": 7429 + }, + { + "epoch": 1.2038237200259236, + "grad_norm": 0.8220235705375671, + "learning_rate": 3.4002730696892073e-06, + "loss": 0.1081, + "step": 7430 + }, + { + "epoch": 1.2039857420609203, + "grad_norm": 0.8286675810813904, + "learning_rate": 3.399865090485709e-06, + "loss": 0.1008, + "step": 7431 + }, + { + "epoch": 1.204147764095917, + "grad_norm": 0.8980154991149902, + "learning_rate": 3.3994570837485163e-06, + "loss": 0.1087, + "step": 7432 + }, + { + "epoch": 1.2043097861309138, + "grad_norm": 1.0518361330032349, + "learning_rate": 3.3990490494901148e-06, + "loss": 0.1222, + "step": 7433 + }, + { + "epoch": 1.2044718081659105, + "grad_norm": 0.9503573179244995, + "learning_rate": 3.3986409877229863e-06, + "loss": 0.1152, + "step": 7434 + }, + { + "epoch": 1.2046338302009074, + "grad_norm": 0.7729112505912781, + "learning_rate": 3.3982328984596196e-06, + "loss": 0.0995, + "step": 7435 + }, + { + "epoch": 1.2047958522359041, + "grad_norm": 0.8068438172340393, + "learning_rate": 3.3978247817124986e-06, + "loss": 0.0985, + "step": 7436 + }, + { + "epoch": 1.2049578742709008, + "grad_norm": 0.8518000245094299, + "learning_rate": 3.3974166374941137e-06, + "loss": 0.11, + "step": 7437 + }, + { + "epoch": 1.2051198963058976, + "grad_norm": 0.9522218704223633, + "learning_rate": 3.39700846581695e-06, + "loss": 0.1175, + "step": 7438 + }, + { + "epoch": 1.2052819183408943, + "grad_norm": 0.724781334400177, + "learning_rate": 3.396600266693498e-06, + "loss": 0.0842, + "step": 7439 + }, + { + "epoch": 1.2054439403758912, + "grad_norm": 0.7208175659179688, + "learning_rate": 3.3961920401362488e-06, + "loss": 0.0929, + "step": 7440 + }, + { + "epoch": 1.205605962410888, + "grad_norm": 0.8319954872131348, + "learning_rate": 3.395783786157691e-06, + "loss": 0.1001, + "step": 7441 + }, + { + "epoch": 1.2057679844458846, + "grad_norm": 0.8171935081481934, + "learning_rate": 3.3953755047703174e-06, + "loss": 0.104, + "step": 7442 + }, + { + "epoch": 1.2059300064808813, + "grad_norm": 0.7243193984031677, + "learning_rate": 3.394967195986619e-06, + "loss": 0.0957, + "step": 7443 + }, + { + "epoch": 1.206092028515878, + "grad_norm": 0.8252148628234863, + "learning_rate": 3.394558859819092e-06, + "loss": 0.1097, + "step": 7444 + }, + { + "epoch": 1.206254050550875, + "grad_norm": 0.8779266476631165, + "learning_rate": 3.3941504962802273e-06, + "loss": 0.1172, + "step": 7445 + }, + { + "epoch": 1.2064160725858717, + "grad_norm": 0.8700554370880127, + "learning_rate": 3.393742105382522e-06, + "loss": 0.1084, + "step": 7446 + }, + { + "epoch": 1.2065780946208684, + "grad_norm": 0.9085803627967834, + "learning_rate": 3.393333687138471e-06, + "loss": 0.1156, + "step": 7447 + }, + { + "epoch": 1.2067401166558651, + "grad_norm": 0.938339352607727, + "learning_rate": 3.3929252415605708e-06, + "loss": 0.1032, + "step": 7448 + }, + { + "epoch": 1.206902138690862, + "grad_norm": 0.8974847197532654, + "learning_rate": 3.392516768661319e-06, + "loss": 0.11, + "step": 7449 + }, + { + "epoch": 1.2070641607258588, + "grad_norm": 0.9363797903060913, + "learning_rate": 3.3921082684532143e-06, + "loss": 0.112, + "step": 7450 + }, + { + "epoch": 1.2072261827608555, + "grad_norm": 0.7975560426712036, + "learning_rate": 3.3916997409487552e-06, + "loss": 0.1048, + "step": 7451 + }, + { + "epoch": 1.2073882047958522, + "grad_norm": 0.7326340675354004, + "learning_rate": 3.391291186160441e-06, + "loss": 0.0983, + "step": 7452 + }, + { + "epoch": 1.2075502268308491, + "grad_norm": 0.8057294487953186, + "learning_rate": 3.390882604100775e-06, + "loss": 0.0962, + "step": 7453 + }, + { + "epoch": 1.2077122488658458, + "grad_norm": 0.7411153316497803, + "learning_rate": 3.3904739947822556e-06, + "loss": 0.0974, + "step": 7454 + }, + { + "epoch": 1.2078742709008425, + "grad_norm": 0.822479248046875, + "learning_rate": 3.3900653582173883e-06, + "loss": 0.1059, + "step": 7455 + }, + { + "epoch": 1.2080362929358393, + "grad_norm": 0.9575856924057007, + "learning_rate": 3.3896566944186737e-06, + "loss": 0.1204, + "step": 7456 + }, + { + "epoch": 1.208198314970836, + "grad_norm": 0.7944145798683167, + "learning_rate": 3.3892480033986186e-06, + "loss": 0.1, + "step": 7457 + }, + { + "epoch": 1.208360337005833, + "grad_norm": 0.6548885107040405, + "learning_rate": 3.388839285169725e-06, + "loss": 0.0808, + "step": 7458 + }, + { + "epoch": 1.2085223590408296, + "grad_norm": 0.9237521886825562, + "learning_rate": 3.3884305397445017e-06, + "loss": 0.1132, + "step": 7459 + }, + { + "epoch": 1.2086843810758263, + "grad_norm": 0.8143603205680847, + "learning_rate": 3.3880217671354527e-06, + "loss": 0.0988, + "step": 7460 + }, + { + "epoch": 1.208846403110823, + "grad_norm": 0.8373053073883057, + "learning_rate": 3.3876129673550873e-06, + "loss": 0.1062, + "step": 7461 + }, + { + "epoch": 1.2090084251458197, + "grad_norm": 0.8747492432594299, + "learning_rate": 3.3872041404159124e-06, + "loss": 0.0978, + "step": 7462 + }, + { + "epoch": 1.2091704471808167, + "grad_norm": 0.9876968860626221, + "learning_rate": 3.386795286330438e-06, + "loss": 0.1106, + "step": 7463 + }, + { + "epoch": 1.2093324692158134, + "grad_norm": 0.8512849807739258, + "learning_rate": 3.3863864051111744e-06, + "loss": 0.1119, + "step": 7464 + }, + { + "epoch": 1.20949449125081, + "grad_norm": 0.816228449344635, + "learning_rate": 3.385977496770631e-06, + "loss": 0.113, + "step": 7465 + }, + { + "epoch": 1.2096565132858068, + "grad_norm": 0.8686400055885315, + "learning_rate": 3.385568561321321e-06, + "loss": 0.1065, + "step": 7466 + }, + { + "epoch": 1.2098185353208035, + "grad_norm": 0.9017913937568665, + "learning_rate": 3.385159598775755e-06, + "loss": 0.105, + "step": 7467 + }, + { + "epoch": 1.2099805573558005, + "grad_norm": 0.8263371586799622, + "learning_rate": 3.3847506091464487e-06, + "loss": 0.1017, + "step": 7468 + }, + { + "epoch": 1.2101425793907972, + "grad_norm": 0.8818573355674744, + "learning_rate": 3.3843415924459146e-06, + "loss": 0.1109, + "step": 7469 + }, + { + "epoch": 1.2103046014257939, + "grad_norm": 0.8400024771690369, + "learning_rate": 3.383932548686667e-06, + "loss": 0.1062, + "step": 7470 + }, + { + "epoch": 1.2104666234607906, + "grad_norm": 0.8324894309043884, + "learning_rate": 3.3835234778812232e-06, + "loss": 0.1058, + "step": 7471 + }, + { + "epoch": 1.2106286454957873, + "grad_norm": 0.8616311550140381, + "learning_rate": 3.3831143800420983e-06, + "loss": 0.1095, + "step": 7472 + }, + { + "epoch": 1.2107906675307842, + "grad_norm": 0.821205735206604, + "learning_rate": 3.3827052551818113e-06, + "loss": 0.1111, + "step": 7473 + }, + { + "epoch": 1.210952689565781, + "grad_norm": 0.794984757900238, + "learning_rate": 3.3822961033128793e-06, + "loss": 0.1011, + "step": 7474 + }, + { + "epoch": 1.2111147116007777, + "grad_norm": 0.7533223032951355, + "learning_rate": 3.3818869244478214e-06, + "loss": 0.0949, + "step": 7475 + }, + { + "epoch": 1.2112767336357744, + "grad_norm": 0.8010703921318054, + "learning_rate": 3.3814777185991577e-06, + "loss": 0.1074, + "step": 7476 + }, + { + "epoch": 1.2114387556707713, + "grad_norm": 0.7717397212982178, + "learning_rate": 3.3810684857794097e-06, + "loss": 0.0989, + "step": 7477 + }, + { + "epoch": 1.211600777705768, + "grad_norm": 0.8146265745162964, + "learning_rate": 3.380659226001097e-06, + "loss": 0.1, + "step": 7478 + }, + { + "epoch": 1.2117627997407647, + "grad_norm": 0.7482508420944214, + "learning_rate": 3.380249939276744e-06, + "loss": 0.0988, + "step": 7479 + }, + { + "epoch": 1.2119248217757614, + "grad_norm": 0.8667011260986328, + "learning_rate": 3.3798406256188725e-06, + "loss": 0.1127, + "step": 7480 + }, + { + "epoch": 1.2120868438107584, + "grad_norm": 0.8829639554023743, + "learning_rate": 3.379431285040008e-06, + "loss": 0.1111, + "step": 7481 + }, + { + "epoch": 1.212248865845755, + "grad_norm": 0.7957995533943176, + "learning_rate": 3.3790219175526733e-06, + "loss": 0.1036, + "step": 7482 + }, + { + "epoch": 1.2124108878807518, + "grad_norm": 0.844648003578186, + "learning_rate": 3.3786125231693955e-06, + "loss": 0.0985, + "step": 7483 + }, + { + "epoch": 1.2125729099157485, + "grad_norm": 0.8586723804473877, + "learning_rate": 3.3782031019027006e-06, + "loss": 0.1066, + "step": 7484 + }, + { + "epoch": 1.2127349319507452, + "grad_norm": 0.8165646195411682, + "learning_rate": 3.3777936537651162e-06, + "loss": 0.1031, + "step": 7485 + }, + { + "epoch": 1.2128969539857422, + "grad_norm": 0.825261652469635, + "learning_rate": 3.3773841787691708e-06, + "loss": 0.0991, + "step": 7486 + }, + { + "epoch": 1.2130589760207389, + "grad_norm": 0.8294909000396729, + "learning_rate": 3.3769746769273915e-06, + "loss": 0.1069, + "step": 7487 + }, + { + "epoch": 1.2132209980557356, + "grad_norm": 0.8192000389099121, + "learning_rate": 3.3765651482523097e-06, + "loss": 0.0998, + "step": 7488 + }, + { + "epoch": 1.2133830200907323, + "grad_norm": 0.7805353999137878, + "learning_rate": 3.3761555927564553e-06, + "loss": 0.0919, + "step": 7489 + }, + { + "epoch": 1.213545042125729, + "grad_norm": 0.8784085512161255, + "learning_rate": 3.375746010452361e-06, + "loss": 0.117, + "step": 7490 + }, + { + "epoch": 1.213707064160726, + "grad_norm": 0.9832420945167542, + "learning_rate": 3.375336401352557e-06, + "loss": 0.115, + "step": 7491 + }, + { + "epoch": 1.2138690861957226, + "grad_norm": 0.8437249064445496, + "learning_rate": 3.374926765469578e-06, + "loss": 0.1156, + "step": 7492 + }, + { + "epoch": 1.2140311082307194, + "grad_norm": 0.8112325668334961, + "learning_rate": 3.374517102815958e-06, + "loss": 0.0974, + "step": 7493 + }, + { + "epoch": 1.214193130265716, + "grad_norm": 0.8753647208213806, + "learning_rate": 3.3741074134042297e-06, + "loss": 0.1087, + "step": 7494 + }, + { + "epoch": 1.2143551523007128, + "grad_norm": 0.9637497067451477, + "learning_rate": 3.373697697246931e-06, + "loss": 0.1211, + "step": 7495 + }, + { + "epoch": 1.2145171743357097, + "grad_norm": 0.8705383539199829, + "learning_rate": 3.3732879543565955e-06, + "loss": 0.1168, + "step": 7496 + }, + { + "epoch": 1.2146791963707064, + "grad_norm": 0.7167220115661621, + "learning_rate": 3.372878184745764e-06, + "loss": 0.0967, + "step": 7497 + }, + { + "epoch": 1.2148412184057031, + "grad_norm": 0.8306147456169128, + "learning_rate": 3.3724683884269702e-06, + "loss": 0.1044, + "step": 7498 + }, + { + "epoch": 1.2150032404406998, + "grad_norm": 0.9196276068687439, + "learning_rate": 3.3720585654127564e-06, + "loss": 0.1258, + "step": 7499 + }, + { + "epoch": 1.2151652624756968, + "grad_norm": 0.8953840732574463, + "learning_rate": 3.37164871571566e-06, + "loss": 0.1224, + "step": 7500 + }, + { + "epoch": 1.2153272845106935, + "grad_norm": 0.8981062173843384, + "learning_rate": 3.3712388393482224e-06, + "loss": 0.1102, + "step": 7501 + }, + { + "epoch": 1.2154893065456902, + "grad_norm": 0.7924917340278625, + "learning_rate": 3.370828936322985e-06, + "loss": 0.1097, + "step": 7502 + }, + { + "epoch": 1.215651328580687, + "grad_norm": 0.806941568851471, + "learning_rate": 3.37041900665249e-06, + "loss": 0.104, + "step": 7503 + }, + { + "epoch": 1.2158133506156839, + "grad_norm": 0.7318122982978821, + "learning_rate": 3.3700090503492795e-06, + "loss": 0.0876, + "step": 7504 + }, + { + "epoch": 1.2159753726506806, + "grad_norm": 0.7857891917228699, + "learning_rate": 3.369599067425897e-06, + "loss": 0.1066, + "step": 7505 + }, + { + "epoch": 1.2161373946856773, + "grad_norm": 0.7979137897491455, + "learning_rate": 3.3691890578948876e-06, + "loss": 0.1032, + "step": 7506 + }, + { + "epoch": 1.216299416720674, + "grad_norm": 0.8069559931755066, + "learning_rate": 3.3687790217687966e-06, + "loss": 0.1047, + "step": 7507 + }, + { + "epoch": 1.2164614387556707, + "grad_norm": 0.9257494211196899, + "learning_rate": 3.36836895906017e-06, + "loss": 0.1059, + "step": 7508 + }, + { + "epoch": 1.2166234607906676, + "grad_norm": 0.8408833146095276, + "learning_rate": 3.367958869781554e-06, + "loss": 0.0991, + "step": 7509 + }, + { + "epoch": 1.2167854828256643, + "grad_norm": 0.7625467777252197, + "learning_rate": 3.3675487539454972e-06, + "loss": 0.102, + "step": 7510 + }, + { + "epoch": 1.216947504860661, + "grad_norm": 0.7763059735298157, + "learning_rate": 3.367138611564548e-06, + "loss": 0.1047, + "step": 7511 + }, + { + "epoch": 1.2171095268956578, + "grad_norm": 0.872353196144104, + "learning_rate": 3.3667284426512565e-06, + "loss": 0.1122, + "step": 7512 + }, + { + "epoch": 1.2172715489306545, + "grad_norm": 0.8491617441177368, + "learning_rate": 3.366318247218171e-06, + "loss": 0.0998, + "step": 7513 + }, + { + "epoch": 1.2174335709656514, + "grad_norm": 0.8177269101142883, + "learning_rate": 3.3659080252778446e-06, + "loss": 0.1037, + "step": 7514 + }, + { + "epoch": 1.2175955930006481, + "grad_norm": 0.7532639503479004, + "learning_rate": 3.3654977768428276e-06, + "loss": 0.0947, + "step": 7515 + }, + { + "epoch": 1.2177576150356448, + "grad_norm": 0.7339494824409485, + "learning_rate": 3.365087501925673e-06, + "loss": 0.1038, + "step": 7516 + }, + { + "epoch": 1.2179196370706415, + "grad_norm": 0.9405337572097778, + "learning_rate": 3.364677200538935e-06, + "loss": 0.1128, + "step": 7517 + }, + { + "epoch": 1.2180816591056383, + "grad_norm": 0.8983637094497681, + "learning_rate": 3.3642668726951657e-06, + "loss": 0.1095, + "step": 7518 + }, + { + "epoch": 1.2182436811406352, + "grad_norm": 0.9077023863792419, + "learning_rate": 3.363856518406923e-06, + "loss": 0.1199, + "step": 7519 + }, + { + "epoch": 1.218405703175632, + "grad_norm": 0.8318829536437988, + "learning_rate": 3.36344613768676e-06, + "loss": 0.0994, + "step": 7520 + }, + { + "epoch": 1.2185677252106286, + "grad_norm": 0.9089265465736389, + "learning_rate": 3.3630357305472363e-06, + "loss": 0.1143, + "step": 7521 + }, + { + "epoch": 1.2187297472456253, + "grad_norm": 0.8133178353309631, + "learning_rate": 3.362625297000906e-06, + "loss": 0.0978, + "step": 7522 + }, + { + "epoch": 1.2188917692806223, + "grad_norm": 0.8779920339584351, + "learning_rate": 3.3622148370603305e-06, + "loss": 0.0998, + "step": 7523 + }, + { + "epoch": 1.219053791315619, + "grad_norm": 0.8766910433769226, + "learning_rate": 3.3618043507380673e-06, + "loss": 0.1154, + "step": 7524 + }, + { + "epoch": 1.2192158133506157, + "grad_norm": 0.8510516285896301, + "learning_rate": 3.3613938380466758e-06, + "loss": 0.1042, + "step": 7525 + }, + { + "epoch": 1.2193778353856124, + "grad_norm": 0.8500891327857971, + "learning_rate": 3.3609832989987178e-06, + "loss": 0.1009, + "step": 7526 + }, + { + "epoch": 1.219539857420609, + "grad_norm": 0.8215943574905396, + "learning_rate": 3.360572733606754e-06, + "loss": 0.1003, + "step": 7527 + }, + { + "epoch": 1.219701879455606, + "grad_norm": 0.8531811833381653, + "learning_rate": 3.360162141883348e-06, + "loss": 0.1103, + "step": 7528 + }, + { + "epoch": 1.2198639014906028, + "grad_norm": 0.7764996290206909, + "learning_rate": 3.359751523841062e-06, + "loss": 0.0973, + "step": 7529 + }, + { + "epoch": 1.2200259235255995, + "grad_norm": 0.7268168926239014, + "learning_rate": 3.3593408794924585e-06, + "loss": 0.0937, + "step": 7530 + }, + { + "epoch": 1.2201879455605962, + "grad_norm": 0.7981647253036499, + "learning_rate": 3.358930208850105e-06, + "loss": 0.1023, + "step": 7531 + }, + { + "epoch": 1.220349967595593, + "grad_norm": 0.7587500214576721, + "learning_rate": 3.358519511926565e-06, + "loss": 0.1031, + "step": 7532 + }, + { + "epoch": 1.2205119896305898, + "grad_norm": 0.8148356676101685, + "learning_rate": 3.358108788734406e-06, + "loss": 0.0965, + "step": 7533 + }, + { + "epoch": 1.2206740116655865, + "grad_norm": 0.7737621665000916, + "learning_rate": 3.357698039286194e-06, + "loss": 0.0945, + "step": 7534 + }, + { + "epoch": 1.2208360337005832, + "grad_norm": 0.8707150816917419, + "learning_rate": 3.3572872635944982e-06, + "loss": 0.1078, + "step": 7535 + }, + { + "epoch": 1.22099805573558, + "grad_norm": 0.8325858116149902, + "learning_rate": 3.356876461671887e-06, + "loss": 0.1106, + "step": 7536 + }, + { + "epoch": 1.221160077770577, + "grad_norm": 0.8393840789794922, + "learning_rate": 3.3564656335309293e-06, + "loss": 0.1064, + "step": 7537 + }, + { + "epoch": 1.2213220998055736, + "grad_norm": 0.8071523308753967, + "learning_rate": 3.3560547791841957e-06, + "loss": 0.1005, + "step": 7538 + }, + { + "epoch": 1.2214841218405703, + "grad_norm": 0.845231831073761, + "learning_rate": 3.3556438986442574e-06, + "loss": 0.1026, + "step": 7539 + }, + { + "epoch": 1.221646143875567, + "grad_norm": 0.8520015478134155, + "learning_rate": 3.3552329919236865e-06, + "loss": 0.0976, + "step": 7540 + }, + { + "epoch": 1.2218081659105637, + "grad_norm": 0.9010183215141296, + "learning_rate": 3.3548220590350563e-06, + "loss": 0.1105, + "step": 7541 + }, + { + "epoch": 1.2219701879455607, + "grad_norm": 0.7583953142166138, + "learning_rate": 3.3544110999909385e-06, + "loss": 0.0901, + "step": 7542 + }, + { + "epoch": 1.2221322099805574, + "grad_norm": 0.7752031683921814, + "learning_rate": 3.354000114803909e-06, + "loss": 0.099, + "step": 7543 + }, + { + "epoch": 1.222294232015554, + "grad_norm": 0.8464565873146057, + "learning_rate": 3.3535891034865433e-06, + "loss": 0.0952, + "step": 7544 + }, + { + "epoch": 1.2224562540505508, + "grad_norm": 0.8120436668395996, + "learning_rate": 3.3531780660514164e-06, + "loss": 0.098, + "step": 7545 + }, + { + "epoch": 1.2226182760855475, + "grad_norm": 0.7880799770355225, + "learning_rate": 3.3527670025111046e-06, + "loss": 0.1071, + "step": 7546 + }, + { + "epoch": 1.2227802981205445, + "grad_norm": 0.8038657903671265, + "learning_rate": 3.352355912878187e-06, + "loss": 0.0996, + "step": 7547 + }, + { + "epoch": 1.2229423201555412, + "grad_norm": 0.7888135313987732, + "learning_rate": 3.3519447971652407e-06, + "loss": 0.089, + "step": 7548 + }, + { + "epoch": 1.2231043421905379, + "grad_norm": 0.7552701830863953, + "learning_rate": 3.3515336553848454e-06, + "loss": 0.0972, + "step": 7549 + }, + { + "epoch": 1.2232663642255346, + "grad_norm": 0.720099151134491, + "learning_rate": 3.351122487549582e-06, + "loss": 0.0838, + "step": 7550 + }, + { + "epoch": 1.2234283862605315, + "grad_norm": 0.8370726108551025, + "learning_rate": 3.350711293672029e-06, + "loss": 0.1043, + "step": 7551 + }, + { + "epoch": 1.2235904082955282, + "grad_norm": 0.8462739586830139, + "learning_rate": 3.3503000737647696e-06, + "loss": 0.1001, + "step": 7552 + }, + { + "epoch": 1.223752430330525, + "grad_norm": 0.8034543991088867, + "learning_rate": 3.349888827840385e-06, + "loss": 0.0989, + "step": 7553 + }, + { + "epoch": 1.2239144523655217, + "grad_norm": 0.9257510304450989, + "learning_rate": 3.349477555911459e-06, + "loss": 0.1208, + "step": 7554 + }, + { + "epoch": 1.2240764744005186, + "grad_norm": 0.79314786195755, + "learning_rate": 3.349066257990576e-06, + "loss": 0.0993, + "step": 7555 + }, + { + "epoch": 1.2242384964355153, + "grad_norm": 0.8348277807235718, + "learning_rate": 3.3486549340903196e-06, + "loss": 0.096, + "step": 7556 + }, + { + "epoch": 1.224400518470512, + "grad_norm": 0.7744476199150085, + "learning_rate": 3.3482435842232763e-06, + "loss": 0.1027, + "step": 7557 + }, + { + "epoch": 1.2245625405055087, + "grad_norm": 0.9177954792976379, + "learning_rate": 3.3478322084020322e-06, + "loss": 0.1159, + "step": 7558 + }, + { + "epoch": 1.2247245625405054, + "grad_norm": 0.8459001779556274, + "learning_rate": 3.3474208066391747e-06, + "loss": 0.0988, + "step": 7559 + }, + { + "epoch": 1.2248865845755024, + "grad_norm": 0.8714538812637329, + "learning_rate": 3.34700937894729e-06, + "loss": 0.1032, + "step": 7560 + }, + { + "epoch": 1.225048606610499, + "grad_norm": 0.887595534324646, + "learning_rate": 3.3465979253389685e-06, + "loss": 0.1079, + "step": 7561 + }, + { + "epoch": 1.2252106286454958, + "grad_norm": 0.8663240075111389, + "learning_rate": 3.3461864458267996e-06, + "loss": 0.1049, + "step": 7562 + }, + { + "epoch": 1.2253726506804925, + "grad_norm": 0.7935128211975098, + "learning_rate": 3.3457749404233724e-06, + "loss": 0.1011, + "step": 7563 + }, + { + "epoch": 1.2255346727154892, + "grad_norm": 0.8313045501708984, + "learning_rate": 3.3453634091412795e-06, + "loss": 0.093, + "step": 7564 + }, + { + "epoch": 1.2256966947504861, + "grad_norm": 0.8548048138618469, + "learning_rate": 3.3449518519931123e-06, + "loss": 0.1095, + "step": 7565 + }, + { + "epoch": 1.2258587167854829, + "grad_norm": 0.8249443173408508, + "learning_rate": 3.344540268991462e-06, + "loss": 0.1114, + "step": 7566 + }, + { + "epoch": 1.2260207388204796, + "grad_norm": 0.850307285785675, + "learning_rate": 3.344128660148924e-06, + "loss": 0.1191, + "step": 7567 + }, + { + "epoch": 1.2261827608554763, + "grad_norm": 0.9916208386421204, + "learning_rate": 3.343717025478092e-06, + "loss": 0.1222, + "step": 7568 + }, + { + "epoch": 1.226344782890473, + "grad_norm": 0.825520932674408, + "learning_rate": 3.3433053649915603e-06, + "loss": 0.1055, + "step": 7569 + }, + { + "epoch": 1.22650680492547, + "grad_norm": 0.7133052349090576, + "learning_rate": 3.342893678701925e-06, + "loss": 0.0932, + "step": 7570 + }, + { + "epoch": 1.2266688269604666, + "grad_norm": 0.7931620478630066, + "learning_rate": 3.3424819666217834e-06, + "loss": 0.0977, + "step": 7571 + }, + { + "epoch": 1.2268308489954634, + "grad_norm": 0.80473393201828, + "learning_rate": 3.3420702287637325e-06, + "loss": 0.106, + "step": 7572 + }, + { + "epoch": 1.22699287103046, + "grad_norm": 1.0347249507904053, + "learning_rate": 3.3416584651403696e-06, + "loss": 0.132, + "step": 7573 + }, + { + "epoch": 1.227154893065457, + "grad_norm": 0.8027249574661255, + "learning_rate": 3.341246675764295e-06, + "loss": 0.1066, + "step": 7574 + }, + { + "epoch": 1.2273169151004537, + "grad_norm": 0.7861801385879517, + "learning_rate": 3.3408348606481078e-06, + "loss": 0.0994, + "step": 7575 + }, + { + "epoch": 1.2274789371354504, + "grad_norm": 0.943520188331604, + "learning_rate": 3.3404230198044085e-06, + "loss": 0.1147, + "step": 7576 + }, + { + "epoch": 1.2276409591704471, + "grad_norm": 0.9658901691436768, + "learning_rate": 3.340011153245799e-06, + "loss": 0.1152, + "step": 7577 + }, + { + "epoch": 1.2278029812054438, + "grad_norm": 0.7559435367584229, + "learning_rate": 3.3395992609848804e-06, + "loss": 0.0998, + "step": 7578 + }, + { + "epoch": 1.2279650032404408, + "grad_norm": 0.8726533651351929, + "learning_rate": 3.339187343034257e-06, + "loss": 0.1108, + "step": 7579 + }, + { + "epoch": 1.2281270252754375, + "grad_norm": 0.7859377264976501, + "learning_rate": 3.338775399406531e-06, + "loss": 0.102, + "step": 7580 + }, + { + "epoch": 1.2282890473104342, + "grad_norm": 0.8078779578208923, + "learning_rate": 3.338363430114308e-06, + "loss": 0.0931, + "step": 7581 + }, + { + "epoch": 1.228451069345431, + "grad_norm": 0.9209697842597961, + "learning_rate": 3.3379514351701924e-06, + "loss": 0.112, + "step": 7582 + }, + { + "epoch": 1.2286130913804278, + "grad_norm": 0.8043226599693298, + "learning_rate": 3.3375394145867912e-06, + "loss": 0.0952, + "step": 7583 + }, + { + "epoch": 1.2287751134154246, + "grad_norm": 0.8135362267494202, + "learning_rate": 3.3371273683767102e-06, + "loss": 0.1067, + "step": 7584 + }, + { + "epoch": 1.2289371354504213, + "grad_norm": 0.7498790621757507, + "learning_rate": 3.336715296552558e-06, + "loss": 0.0919, + "step": 7585 + }, + { + "epoch": 1.229099157485418, + "grad_norm": 0.8408077359199524, + "learning_rate": 3.3363031991269423e-06, + "loss": 0.101, + "step": 7586 + }, + { + "epoch": 1.2292611795204147, + "grad_norm": 0.8685369491577148, + "learning_rate": 3.3358910761124724e-06, + "loss": 0.1081, + "step": 7587 + }, + { + "epoch": 1.2294232015554116, + "grad_norm": 0.9170325994491577, + "learning_rate": 3.3354789275217587e-06, + "loss": 0.1099, + "step": 7588 + }, + { + "epoch": 1.2295852235904083, + "grad_norm": 0.8715429306030273, + "learning_rate": 3.3350667533674108e-06, + "loss": 0.108, + "step": 7589 + }, + { + "epoch": 1.229747245625405, + "grad_norm": 0.870263397693634, + "learning_rate": 3.3346545536620425e-06, + "loss": 0.1057, + "step": 7590 + }, + { + "epoch": 1.2299092676604018, + "grad_norm": 0.7764174938201904, + "learning_rate": 3.334242328418264e-06, + "loss": 0.1045, + "step": 7591 + }, + { + "epoch": 1.2300712896953985, + "grad_norm": 0.8364241719245911, + "learning_rate": 3.3338300776486886e-06, + "loss": 0.1014, + "step": 7592 + }, + { + "epoch": 1.2302333117303954, + "grad_norm": 0.8532553315162659, + "learning_rate": 3.333417801365931e-06, + "loss": 0.1043, + "step": 7593 + }, + { + "epoch": 1.2303953337653921, + "grad_norm": 0.795436441898346, + "learning_rate": 3.3330054995826056e-06, + "loss": 0.1042, + "step": 7594 + }, + { + "epoch": 1.2305573558003888, + "grad_norm": 0.7866604328155518, + "learning_rate": 3.332593172311328e-06, + "loss": 0.0932, + "step": 7595 + }, + { + "epoch": 1.2307193778353855, + "grad_norm": 0.8465556502342224, + "learning_rate": 3.3321808195647144e-06, + "loss": 0.1037, + "step": 7596 + }, + { + "epoch": 1.2308813998703823, + "grad_norm": 0.8134123086929321, + "learning_rate": 3.33176844135538e-06, + "loss": 0.1038, + "step": 7597 + }, + { + "epoch": 1.2310434219053792, + "grad_norm": 0.7845098972320557, + "learning_rate": 3.3313560376959456e-06, + "loss": 0.103, + "step": 7598 + }, + { + "epoch": 1.231205443940376, + "grad_norm": 0.8044149279594421, + "learning_rate": 3.330943608599028e-06, + "loss": 0.0938, + "step": 7599 + }, + { + "epoch": 1.2313674659753726, + "grad_norm": 0.7864116430282593, + "learning_rate": 3.3305311540772467e-06, + "loss": 0.0998, + "step": 7600 + }, + { + "epoch": 1.2315294880103693, + "grad_norm": 0.8183262944221497, + "learning_rate": 3.3301186741432217e-06, + "loss": 0.0995, + "step": 7601 + }, + { + "epoch": 1.2316915100453663, + "grad_norm": 0.9076040983200073, + "learning_rate": 3.3297061688095746e-06, + "loss": 0.1081, + "step": 7602 + }, + { + "epoch": 1.231853532080363, + "grad_norm": 0.6733155250549316, + "learning_rate": 3.3292936380889262e-06, + "loss": 0.0878, + "step": 7603 + }, + { + "epoch": 1.2320155541153597, + "grad_norm": 0.8160568475723267, + "learning_rate": 3.3288810819938995e-06, + "loss": 0.1036, + "step": 7604 + }, + { + "epoch": 1.2321775761503564, + "grad_norm": 0.757789134979248, + "learning_rate": 3.3284685005371176e-06, + "loss": 0.0993, + "step": 7605 + }, + { + "epoch": 1.2323395981853533, + "grad_norm": 0.8649821877479553, + "learning_rate": 3.3280558937312037e-06, + "loss": 0.1062, + "step": 7606 + }, + { + "epoch": 1.23250162022035, + "grad_norm": 0.8108959794044495, + "learning_rate": 3.3276432615887843e-06, + "loss": 0.0959, + "step": 7607 + }, + { + "epoch": 1.2326636422553467, + "grad_norm": 0.7675016522407532, + "learning_rate": 3.327230604122484e-06, + "loss": 0.0941, + "step": 7608 + }, + { + "epoch": 1.2328256642903435, + "grad_norm": 0.8105819225311279, + "learning_rate": 3.326817921344928e-06, + "loss": 0.1078, + "step": 7609 + }, + { + "epoch": 1.2329876863253402, + "grad_norm": 0.7653875946998596, + "learning_rate": 3.326405213268745e-06, + "loss": 0.0968, + "step": 7610 + }, + { + "epoch": 1.233149708360337, + "grad_norm": 0.7907019257545471, + "learning_rate": 3.3259924799065628e-06, + "loss": 0.1014, + "step": 7611 + }, + { + "epoch": 1.2333117303953338, + "grad_norm": 0.9260424971580505, + "learning_rate": 3.3255797212710095e-06, + "loss": 0.1051, + "step": 7612 + }, + { + "epoch": 1.2334737524303305, + "grad_norm": 0.8345485925674438, + "learning_rate": 3.325166937374714e-06, + "loss": 0.1032, + "step": 7613 + }, + { + "epoch": 1.2336357744653272, + "grad_norm": 0.9643160104751587, + "learning_rate": 3.3247541282303082e-06, + "loss": 0.1185, + "step": 7614 + }, + { + "epoch": 1.233797796500324, + "grad_norm": 0.8535207509994507, + "learning_rate": 3.3243412938504205e-06, + "loss": 0.1094, + "step": 7615 + }, + { + "epoch": 1.2339598185353209, + "grad_norm": 0.8478245735168457, + "learning_rate": 3.3239284342476852e-06, + "loss": 0.1035, + "step": 7616 + }, + { + "epoch": 1.2341218405703176, + "grad_norm": 0.8517696261405945, + "learning_rate": 3.3235155494347325e-06, + "loss": 0.1113, + "step": 7617 + }, + { + "epoch": 1.2342838626053143, + "grad_norm": 1.065172791481018, + "learning_rate": 3.3231026394241983e-06, + "loss": 0.1272, + "step": 7618 + }, + { + "epoch": 1.234445884640311, + "grad_norm": 0.9062070250511169, + "learning_rate": 3.3226897042287145e-06, + "loss": 0.1209, + "step": 7619 + }, + { + "epoch": 1.2346079066753077, + "grad_norm": 0.8547554612159729, + "learning_rate": 3.3222767438609166e-06, + "loss": 0.1091, + "step": 7620 + }, + { + "epoch": 1.2347699287103047, + "grad_norm": 0.7908268570899963, + "learning_rate": 3.32186375833344e-06, + "loss": 0.0999, + "step": 7621 + }, + { + "epoch": 1.2349319507453014, + "grad_norm": 0.7787481546401978, + "learning_rate": 3.321450747658922e-06, + "loss": 0.0974, + "step": 7622 + }, + { + "epoch": 1.235093972780298, + "grad_norm": 0.8876661062240601, + "learning_rate": 3.321037711849998e-06, + "loss": 0.1232, + "step": 7623 + }, + { + "epoch": 1.2352559948152948, + "grad_norm": 0.8357555270195007, + "learning_rate": 3.3206246509193076e-06, + "loss": 0.1095, + "step": 7624 + }, + { + "epoch": 1.2354180168502917, + "grad_norm": 0.8862370848655701, + "learning_rate": 3.320211564879488e-06, + "loss": 0.1157, + "step": 7625 + }, + { + "epoch": 1.2355800388852884, + "grad_norm": 0.835319459438324, + "learning_rate": 3.3197984537431797e-06, + "loss": 0.1024, + "step": 7626 + }, + { + "epoch": 1.2357420609202852, + "grad_norm": 0.8422128558158875, + "learning_rate": 3.319385317523024e-06, + "loss": 0.1071, + "step": 7627 + }, + { + "epoch": 1.2359040829552819, + "grad_norm": 0.7388719916343689, + "learning_rate": 3.3189721562316585e-06, + "loss": 0.0847, + "step": 7628 + }, + { + "epoch": 1.2360661049902788, + "grad_norm": 0.7805131673812866, + "learning_rate": 3.318558969881728e-06, + "loss": 0.0995, + "step": 7629 + }, + { + "epoch": 1.2362281270252755, + "grad_norm": 1.0341185331344604, + "learning_rate": 3.3181457584858736e-06, + "loss": 0.1243, + "step": 7630 + }, + { + "epoch": 1.2363901490602722, + "grad_norm": 0.7121812701225281, + "learning_rate": 3.3177325220567385e-06, + "loss": 0.0965, + "step": 7631 + }, + { + "epoch": 1.236552171095269, + "grad_norm": 0.7928653955459595, + "learning_rate": 3.3173192606069673e-06, + "loss": 0.1043, + "step": 7632 + }, + { + "epoch": 1.2367141931302656, + "grad_norm": 0.7301590442657471, + "learning_rate": 3.316905974149205e-06, + "loss": 0.0967, + "step": 7633 + }, + { + "epoch": 1.2368762151652626, + "grad_norm": 0.9105049967765808, + "learning_rate": 3.316492662696097e-06, + "loss": 0.1122, + "step": 7634 + }, + { + "epoch": 1.2370382372002593, + "grad_norm": 0.8258975148200989, + "learning_rate": 3.3160793262602882e-06, + "loss": 0.1109, + "step": 7635 + }, + { + "epoch": 1.237200259235256, + "grad_norm": 0.9241932034492493, + "learning_rate": 3.3156659648544276e-06, + "loss": 0.1102, + "step": 7636 + }, + { + "epoch": 1.2373622812702527, + "grad_norm": 0.8554189205169678, + "learning_rate": 3.315252578491162e-06, + "loss": 0.1104, + "step": 7637 + }, + { + "epoch": 1.2375243033052494, + "grad_norm": 0.9000004529953003, + "learning_rate": 3.314839167183141e-06, + "loss": 0.1058, + "step": 7638 + }, + { + "epoch": 1.2376863253402464, + "grad_norm": 0.7197251319885254, + "learning_rate": 3.3144257309430127e-06, + "loss": 0.0863, + "step": 7639 + }, + { + "epoch": 1.237848347375243, + "grad_norm": 0.948573887348175, + "learning_rate": 3.3140122697834287e-06, + "loss": 0.1086, + "step": 7640 + }, + { + "epoch": 1.2380103694102398, + "grad_norm": 0.8102221488952637, + "learning_rate": 3.3135987837170386e-06, + "loss": 0.1001, + "step": 7641 + }, + { + "epoch": 1.2381723914452365, + "grad_norm": 0.7119641304016113, + "learning_rate": 3.3131852727564947e-06, + "loss": 0.0848, + "step": 7642 + }, + { + "epoch": 1.2383344134802332, + "grad_norm": 0.8429257869720459, + "learning_rate": 3.31277173691445e-06, + "loss": 0.1068, + "step": 7643 + }, + { + "epoch": 1.2384964355152301, + "grad_norm": 0.7876508831977844, + "learning_rate": 3.3123581762035557e-06, + "loss": 0.0945, + "step": 7644 + }, + { + "epoch": 1.2386584575502269, + "grad_norm": 0.9862732291221619, + "learning_rate": 3.311944590636468e-06, + "loss": 0.1212, + "step": 7645 + }, + { + "epoch": 1.2388204795852236, + "grad_norm": 0.8856245279312134, + "learning_rate": 3.31153098022584e-06, + "loss": 0.1084, + "step": 7646 + }, + { + "epoch": 1.2389825016202203, + "grad_norm": 0.9834122657775879, + "learning_rate": 3.3111173449843283e-06, + "loss": 0.1125, + "step": 7647 + }, + { + "epoch": 1.239144523655217, + "grad_norm": 0.7890782356262207, + "learning_rate": 3.3107036849245883e-06, + "loss": 0.0953, + "step": 7648 + }, + { + "epoch": 1.239306545690214, + "grad_norm": 0.8613621592521667, + "learning_rate": 3.310290000059278e-06, + "loss": 0.1074, + "step": 7649 + }, + { + "epoch": 1.2394685677252106, + "grad_norm": 0.8193936944007874, + "learning_rate": 3.309876290401054e-06, + "loss": 0.1, + "step": 7650 + }, + { + "epoch": 1.2396305897602073, + "grad_norm": 0.7670435309410095, + "learning_rate": 3.309462555962576e-06, + "loss": 0.0993, + "step": 7651 + }, + { + "epoch": 1.239792611795204, + "grad_norm": 0.9082610011100769, + "learning_rate": 3.309048796756503e-06, + "loss": 0.1142, + "step": 7652 + }, + { + "epoch": 1.239954633830201, + "grad_norm": 0.9768080115318298, + "learning_rate": 3.3086350127954935e-06, + "loss": 0.1186, + "step": 7653 + }, + { + "epoch": 1.2401166558651977, + "grad_norm": 0.7574945092201233, + "learning_rate": 3.3082212040922103e-06, + "loss": 0.0913, + "step": 7654 + }, + { + "epoch": 1.2402786779001944, + "grad_norm": 0.8311034440994263, + "learning_rate": 3.3078073706593133e-06, + "loss": 0.1038, + "step": 7655 + }, + { + "epoch": 1.2404406999351911, + "grad_norm": 0.7912598252296448, + "learning_rate": 3.307393512509466e-06, + "loss": 0.1065, + "step": 7656 + }, + { + "epoch": 1.240602721970188, + "grad_norm": 0.8380460143089294, + "learning_rate": 3.3069796296553316e-06, + "loss": 0.1073, + "step": 7657 + }, + { + "epoch": 1.2407647440051848, + "grad_norm": 0.8099128007888794, + "learning_rate": 3.3065657221095732e-06, + "loss": 0.0964, + "step": 7658 + }, + { + "epoch": 1.2409267660401815, + "grad_norm": 0.6533287167549133, + "learning_rate": 3.306151789884855e-06, + "loss": 0.0861, + "step": 7659 + }, + { + "epoch": 1.2410887880751782, + "grad_norm": 0.6852719187736511, + "learning_rate": 3.3057378329938432e-06, + "loss": 0.0837, + "step": 7660 + }, + { + "epoch": 1.241250810110175, + "grad_norm": 0.8555914163589478, + "learning_rate": 3.305323851449203e-06, + "loss": 0.1047, + "step": 7661 + }, + { + "epoch": 1.2414128321451718, + "grad_norm": 0.8162693381309509, + "learning_rate": 3.304909845263603e-06, + "loss": 0.1062, + "step": 7662 + }, + { + "epoch": 1.2415748541801686, + "grad_norm": 0.8287134766578674, + "learning_rate": 3.3044958144497086e-06, + "loss": 0.1038, + "step": 7663 + }, + { + "epoch": 1.2417368762151653, + "grad_norm": 0.7691091299057007, + "learning_rate": 3.3040817590201897e-06, + "loss": 0.0984, + "step": 7664 + }, + { + "epoch": 1.241898898250162, + "grad_norm": 0.8376115560531616, + "learning_rate": 3.3036676789877146e-06, + "loss": 0.1, + "step": 7665 + }, + { + "epoch": 1.2420609202851587, + "grad_norm": 0.7699323296546936, + "learning_rate": 3.303253574364953e-06, + "loss": 0.0892, + "step": 7666 + }, + { + "epoch": 1.2422229423201556, + "grad_norm": 0.8108561038970947, + "learning_rate": 3.3028394451645766e-06, + "loss": 0.1031, + "step": 7667 + }, + { + "epoch": 1.2423849643551523, + "grad_norm": 0.8874189257621765, + "learning_rate": 3.3024252913992548e-06, + "loss": 0.11, + "step": 7668 + }, + { + "epoch": 1.242546986390149, + "grad_norm": 0.7574527859687805, + "learning_rate": 3.3020111130816623e-06, + "loss": 0.0989, + "step": 7669 + }, + { + "epoch": 1.2427090084251458, + "grad_norm": 0.9647727608680725, + "learning_rate": 3.3015969102244704e-06, + "loss": 0.1051, + "step": 7670 + }, + { + "epoch": 1.2428710304601425, + "grad_norm": 0.9136444926261902, + "learning_rate": 3.3011826828403527e-06, + "loss": 0.1138, + "step": 7671 + }, + { + "epoch": 1.2430330524951394, + "grad_norm": 0.9177752733230591, + "learning_rate": 3.300768430941983e-06, + "loss": 0.1238, + "step": 7672 + }, + { + "epoch": 1.2431950745301361, + "grad_norm": 0.7569882869720459, + "learning_rate": 3.3003541545420377e-06, + "loss": 0.1009, + "step": 7673 + }, + { + "epoch": 1.2433570965651328, + "grad_norm": 0.7180430889129639, + "learning_rate": 3.299939853653192e-06, + "loss": 0.0873, + "step": 7674 + }, + { + "epoch": 1.2435191186001295, + "grad_norm": 0.7186658978462219, + "learning_rate": 3.2995255282881227e-06, + "loss": 0.0835, + "step": 7675 + }, + { + "epoch": 1.2436811406351265, + "grad_norm": 0.8784341812133789, + "learning_rate": 3.299111178459507e-06, + "loss": 0.1175, + "step": 7676 + }, + { + "epoch": 1.2438431626701232, + "grad_norm": 0.8925139307975769, + "learning_rate": 3.2986968041800234e-06, + "loss": 0.1186, + "step": 7677 + }, + { + "epoch": 1.24400518470512, + "grad_norm": 0.8931295275688171, + "learning_rate": 3.29828240546235e-06, + "loss": 0.1053, + "step": 7678 + }, + { + "epoch": 1.2441672067401166, + "grad_norm": 0.8283528685569763, + "learning_rate": 3.297867982319166e-06, + "loss": 0.1039, + "step": 7679 + }, + { + "epoch": 1.2443292287751135, + "grad_norm": 0.9566637873649597, + "learning_rate": 3.297453534763154e-06, + "loss": 0.1112, + "step": 7680 + }, + { + "epoch": 1.2444912508101102, + "grad_norm": 0.8479618430137634, + "learning_rate": 3.2970390628069924e-06, + "loss": 0.107, + "step": 7681 + }, + { + "epoch": 1.244653272845107, + "grad_norm": 0.7277421951293945, + "learning_rate": 3.2966245664633654e-06, + "loss": 0.0957, + "step": 7682 + }, + { + "epoch": 1.2448152948801037, + "grad_norm": 0.7973564863204956, + "learning_rate": 3.296210045744954e-06, + "loss": 0.0933, + "step": 7683 + }, + { + "epoch": 1.2449773169151004, + "grad_norm": 0.8752889037132263, + "learning_rate": 3.295795500664442e-06, + "loss": 0.1146, + "step": 7684 + }, + { + "epoch": 1.2451393389500973, + "grad_norm": 0.772816002368927, + "learning_rate": 3.295380931234513e-06, + "loss": 0.0988, + "step": 7685 + }, + { + "epoch": 1.245301360985094, + "grad_norm": 1.0557066202163696, + "learning_rate": 3.294966337467853e-06, + "loss": 0.1175, + "step": 7686 + }, + { + "epoch": 1.2454633830200907, + "grad_norm": 0.7834747433662415, + "learning_rate": 3.294551719377147e-06, + "loss": 0.1024, + "step": 7687 + }, + { + "epoch": 1.2456254050550875, + "grad_norm": 0.8686286807060242, + "learning_rate": 3.2941370769750804e-06, + "loss": 0.1102, + "step": 7688 + }, + { + "epoch": 1.2457874270900842, + "grad_norm": 0.8577167391777039, + "learning_rate": 3.2937224102743414e-06, + "loss": 0.1075, + "step": 7689 + }, + { + "epoch": 1.245949449125081, + "grad_norm": 0.712014377117157, + "learning_rate": 3.293307719287617e-06, + "loss": 0.0949, + "step": 7690 + }, + { + "epoch": 1.2461114711600778, + "grad_norm": 0.7292968034744263, + "learning_rate": 3.292893004027597e-06, + "loss": 0.0953, + "step": 7691 + }, + { + "epoch": 1.2462734931950745, + "grad_norm": 0.8187505006790161, + "learning_rate": 3.2924782645069684e-06, + "loss": 0.0978, + "step": 7692 + }, + { + "epoch": 1.2464355152300712, + "grad_norm": 0.8897111415863037, + "learning_rate": 3.292063500738424e-06, + "loss": 0.1111, + "step": 7693 + }, + { + "epoch": 1.246597537265068, + "grad_norm": 0.806907057762146, + "learning_rate": 3.291648712734653e-06, + "loss": 0.096, + "step": 7694 + }, + { + "epoch": 1.2467595593000649, + "grad_norm": 0.8400275111198425, + "learning_rate": 3.2912339005083473e-06, + "loss": 0.0945, + "step": 7695 + }, + { + "epoch": 1.2469215813350616, + "grad_norm": 0.8764445185661316, + "learning_rate": 3.290819064072198e-06, + "loss": 0.1005, + "step": 7696 + }, + { + "epoch": 1.2470836033700583, + "grad_norm": 0.994835615158081, + "learning_rate": 3.2904042034389e-06, + "loss": 0.1262, + "step": 7697 + }, + { + "epoch": 1.247245625405055, + "grad_norm": 0.8427457213401794, + "learning_rate": 3.289989318621146e-06, + "loss": 0.1038, + "step": 7698 + }, + { + "epoch": 1.2474076474400517, + "grad_norm": 0.8053460717201233, + "learning_rate": 3.289574409631631e-06, + "loss": 0.0978, + "step": 7699 + }, + { + "epoch": 1.2475696694750487, + "grad_norm": 0.9428670406341553, + "learning_rate": 3.289159476483049e-06, + "loss": 0.1128, + "step": 7700 + }, + { + "epoch": 1.2477316915100454, + "grad_norm": 0.9878700375556946, + "learning_rate": 3.288744519188097e-06, + "loss": 0.1245, + "step": 7701 + }, + { + "epoch": 1.247893713545042, + "grad_norm": 0.84808748960495, + "learning_rate": 3.2883295377594716e-06, + "loss": 0.0883, + "step": 7702 + }, + { + "epoch": 1.2480557355800388, + "grad_norm": 0.8799005746841431, + "learning_rate": 3.2879145322098694e-06, + "loss": 0.1154, + "step": 7703 + }, + { + "epoch": 1.2482177576150357, + "grad_norm": 0.9856189489364624, + "learning_rate": 3.2874995025519897e-06, + "loss": 0.115, + "step": 7704 + }, + { + "epoch": 1.2483797796500324, + "grad_norm": 0.798912525177002, + "learning_rate": 3.2870844487985307e-06, + "loss": 0.0959, + "step": 7705 + }, + { + "epoch": 1.2485418016850292, + "grad_norm": 0.8618741035461426, + "learning_rate": 3.2866693709621933e-06, + "loss": 0.1131, + "step": 7706 + }, + { + "epoch": 1.2487038237200259, + "grad_norm": 0.8063377737998962, + "learning_rate": 3.2862542690556765e-06, + "loss": 0.1011, + "step": 7707 + }, + { + "epoch": 1.2488658457550228, + "grad_norm": 0.7270522713661194, + "learning_rate": 3.285839143091681e-06, + "loss": 0.098, + "step": 7708 + }, + { + "epoch": 1.2490278677900195, + "grad_norm": 0.7814246416091919, + "learning_rate": 3.2854239930829097e-06, + "loss": 0.098, + "step": 7709 + }, + { + "epoch": 1.2491898898250162, + "grad_norm": 0.7192318439483643, + "learning_rate": 3.2850088190420647e-06, + "loss": 0.0922, + "step": 7710 + }, + { + "epoch": 1.249351911860013, + "grad_norm": 0.7969322204589844, + "learning_rate": 3.284593620981851e-06, + "loss": 0.1072, + "step": 7711 + }, + { + "epoch": 1.2495139338950096, + "grad_norm": 0.8182503581047058, + "learning_rate": 3.284178398914969e-06, + "loss": 0.1014, + "step": 7712 + }, + { + "epoch": 1.2496759559300066, + "grad_norm": 0.9507556557655334, + "learning_rate": 3.283763152854127e-06, + "loss": 0.1117, + "step": 7713 + }, + { + "epoch": 1.2498379779650033, + "grad_norm": 0.7872381210327148, + "learning_rate": 3.283347882812028e-06, + "loss": 0.0951, + "step": 7714 + }, + { + "epoch": 1.25, + "grad_norm": 0.7765859961509705, + "learning_rate": 3.282932588801381e-06, + "loss": 0.0978, + "step": 7715 + }, + { + "epoch": 1.2501620220349967, + "grad_norm": 0.9354822635650635, + "learning_rate": 3.282517270834891e-06, + "loss": 0.1167, + "step": 7716 + }, + { + "epoch": 1.2503240440699934, + "grad_norm": 0.8700661659240723, + "learning_rate": 3.2821019289252654e-06, + "loss": 0.1112, + "step": 7717 + }, + { + "epoch": 1.2504860661049904, + "grad_norm": 0.8417790532112122, + "learning_rate": 3.281686563085214e-06, + "loss": 0.1065, + "step": 7718 + }, + { + "epoch": 1.250648088139987, + "grad_norm": 0.8531923294067383, + "learning_rate": 3.2812711733274453e-06, + "loss": 0.1171, + "step": 7719 + }, + { + "epoch": 1.2508101101749838, + "grad_norm": 0.8502379059791565, + "learning_rate": 3.28085575966467e-06, + "loss": 0.1064, + "step": 7720 + }, + { + "epoch": 1.2509721322099805, + "grad_norm": 0.9494119882583618, + "learning_rate": 3.280440322109597e-06, + "loss": 0.1099, + "step": 7721 + }, + { + "epoch": 1.2511341542449772, + "grad_norm": 0.7425954341888428, + "learning_rate": 3.2800248606749395e-06, + "loss": 0.0846, + "step": 7722 + }, + { + "epoch": 1.2512961762799741, + "grad_norm": 0.7303003668785095, + "learning_rate": 3.2796093753734087e-06, + "loss": 0.098, + "step": 7723 + }, + { + "epoch": 1.2514581983149708, + "grad_norm": 0.9115877151489258, + "learning_rate": 3.2791938662177174e-06, + "loss": 0.1093, + "step": 7724 + }, + { + "epoch": 1.2516202203499676, + "grad_norm": 0.7848078608512878, + "learning_rate": 3.2787783332205796e-06, + "loss": 0.0934, + "step": 7725 + }, + { + "epoch": 1.2517822423849643, + "grad_norm": 0.8890565037727356, + "learning_rate": 3.278362776394709e-06, + "loss": 0.1088, + "step": 7726 + }, + { + "epoch": 1.251944264419961, + "grad_norm": 1.0119807720184326, + "learning_rate": 3.277947195752822e-06, + "loss": 0.1263, + "step": 7727 + }, + { + "epoch": 1.252106286454958, + "grad_norm": 0.8432857394218445, + "learning_rate": 3.277531591307632e-06, + "loss": 0.1109, + "step": 7728 + }, + { + "epoch": 1.2522683084899546, + "grad_norm": 0.8839412331581116, + "learning_rate": 3.2771159630718584e-06, + "loss": 0.1058, + "step": 7729 + }, + { + "epoch": 1.2524303305249513, + "grad_norm": 0.8032359480857849, + "learning_rate": 3.2767003110582164e-06, + "loss": 0.0991, + "step": 7730 + }, + { + "epoch": 1.2525923525599483, + "grad_norm": 0.7420846819877625, + "learning_rate": 3.276284635279424e-06, + "loss": 0.0962, + "step": 7731 + }, + { + "epoch": 1.252754374594945, + "grad_norm": 0.7983898520469666, + "learning_rate": 3.275868935748201e-06, + "loss": 0.1003, + "step": 7732 + }, + { + "epoch": 1.2529163966299417, + "grad_norm": 0.8384861946105957, + "learning_rate": 3.2754532124772653e-06, + "loss": 0.1081, + "step": 7733 + }, + { + "epoch": 1.2530784186649384, + "grad_norm": 0.8961964845657349, + "learning_rate": 3.2750374654793387e-06, + "loss": 0.1239, + "step": 7734 + }, + { + "epoch": 1.2532404406999351, + "grad_norm": 0.818870484828949, + "learning_rate": 3.2746216947671405e-06, + "loss": 0.1061, + "step": 7735 + }, + { + "epoch": 1.253402462734932, + "grad_norm": 0.8016375303268433, + "learning_rate": 3.2742059003533933e-06, + "loss": 0.0954, + "step": 7736 + }, + { + "epoch": 1.2535644847699288, + "grad_norm": 0.9985673427581787, + "learning_rate": 3.2737900822508197e-06, + "loss": 0.1111, + "step": 7737 + }, + { + "epoch": 1.2537265068049255, + "grad_norm": 0.8439716100692749, + "learning_rate": 3.2733742404721413e-06, + "loss": 0.1127, + "step": 7738 + }, + { + "epoch": 1.2538885288399222, + "grad_norm": 0.7654055953025818, + "learning_rate": 3.272958375030083e-06, + "loss": 0.1004, + "step": 7739 + }, + { + "epoch": 1.254050550874919, + "grad_norm": 0.7196124792098999, + "learning_rate": 3.272542485937369e-06, + "loss": 0.0928, + "step": 7740 + }, + { + "epoch": 1.2542125729099158, + "grad_norm": 0.9231361150741577, + "learning_rate": 3.272126573206724e-06, + "loss": 0.1016, + "step": 7741 + }, + { + "epoch": 1.2543745949449125, + "grad_norm": 0.7543466687202454, + "learning_rate": 3.2717106368508755e-06, + "loss": 0.0929, + "step": 7742 + }, + { + "epoch": 1.2545366169799093, + "grad_norm": 0.8236292600631714, + "learning_rate": 3.271294676882548e-06, + "loss": 0.1005, + "step": 7743 + }, + { + "epoch": 1.254698639014906, + "grad_norm": 0.8038961887359619, + "learning_rate": 3.27087869331447e-06, + "loss": 0.101, + "step": 7744 + }, + { + "epoch": 1.2548606610499027, + "grad_norm": 0.8604749441146851, + "learning_rate": 3.270462686159369e-06, + "loss": 0.1037, + "step": 7745 + }, + { + "epoch": 1.2550226830848996, + "grad_norm": 0.744813859462738, + "learning_rate": 3.2700466554299755e-06, + "loss": 0.0934, + "step": 7746 + }, + { + "epoch": 1.2551847051198963, + "grad_norm": 0.8217744827270508, + "learning_rate": 3.2696306011390167e-06, + "loss": 0.106, + "step": 7747 + }, + { + "epoch": 1.255346727154893, + "grad_norm": 0.755609929561615, + "learning_rate": 3.2692145232992244e-06, + "loss": 0.0933, + "step": 7748 + }, + { + "epoch": 1.2555087491898898, + "grad_norm": 0.7776917219161987, + "learning_rate": 3.2687984219233295e-06, + "loss": 0.1035, + "step": 7749 + }, + { + "epoch": 1.2556707712248865, + "grad_norm": 0.8656284809112549, + "learning_rate": 3.268382297024063e-06, + "loss": 0.111, + "step": 7750 + }, + { + "epoch": 1.2558327932598834, + "grad_norm": 0.8041141629219055, + "learning_rate": 3.2679661486141577e-06, + "loss": 0.1009, + "step": 7751 + }, + { + "epoch": 1.25599481529488, + "grad_norm": 0.7365778088569641, + "learning_rate": 3.2675499767063464e-06, + "loss": 0.0935, + "step": 7752 + }, + { + "epoch": 1.2561568373298768, + "grad_norm": 0.8223831057548523, + "learning_rate": 3.267133781313364e-06, + "loss": 0.1034, + "step": 7753 + }, + { + "epoch": 1.2563188593648738, + "grad_norm": 0.8573729395866394, + "learning_rate": 3.266717562447944e-06, + "loss": 0.1009, + "step": 7754 + }, + { + "epoch": 1.2564808813998705, + "grad_norm": 0.9449803829193115, + "learning_rate": 3.2663013201228216e-06, + "loss": 0.1201, + "step": 7755 + }, + { + "epoch": 1.2566429034348672, + "grad_norm": 0.7694631814956665, + "learning_rate": 3.2658850543507336e-06, + "loss": 0.0973, + "step": 7756 + }, + { + "epoch": 1.2568049254698639, + "grad_norm": 1.0734034776687622, + "learning_rate": 3.265468765144416e-06, + "loss": 0.1069, + "step": 7757 + }, + { + "epoch": 1.2569669475048606, + "grad_norm": 0.9590340852737427, + "learning_rate": 3.2650524525166064e-06, + "loss": 0.1212, + "step": 7758 + }, + { + "epoch": 1.2571289695398575, + "grad_norm": 0.7816064357757568, + "learning_rate": 3.264636116480044e-06, + "loss": 0.1041, + "step": 7759 + }, + { + "epoch": 1.2572909915748542, + "grad_norm": 0.745184600353241, + "learning_rate": 3.2642197570474665e-06, + "loss": 0.0875, + "step": 7760 + }, + { + "epoch": 1.257453013609851, + "grad_norm": 0.8741956949234009, + "learning_rate": 3.2638033742316137e-06, + "loss": 0.1039, + "step": 7761 + }, + { + "epoch": 1.2576150356448477, + "grad_norm": 0.9357261061668396, + "learning_rate": 3.263386968045226e-06, + "loss": 0.1157, + "step": 7762 + }, + { + "epoch": 1.2577770576798444, + "grad_norm": 0.773931086063385, + "learning_rate": 3.2629705385010445e-06, + "loss": 0.0962, + "step": 7763 + }, + { + "epoch": 1.2579390797148413, + "grad_norm": 0.8367682695388794, + "learning_rate": 3.262554085611811e-06, + "loss": 0.1009, + "step": 7764 + }, + { + "epoch": 1.258101101749838, + "grad_norm": 0.8734476566314697, + "learning_rate": 3.2621376093902675e-06, + "loss": 0.1155, + "step": 7765 + }, + { + "epoch": 1.2582631237848347, + "grad_norm": 0.9115141034126282, + "learning_rate": 3.261721109849158e-06, + "loss": 0.1112, + "step": 7766 + }, + { + "epoch": 1.2584251458198314, + "grad_norm": 0.7026832103729248, + "learning_rate": 3.261304587001225e-06, + "loss": 0.0882, + "step": 7767 + }, + { + "epoch": 1.2585871678548282, + "grad_norm": 0.7548426389694214, + "learning_rate": 3.2608880408592148e-06, + "loss": 0.097, + "step": 7768 + }, + { + "epoch": 1.258749189889825, + "grad_norm": 0.736499011516571, + "learning_rate": 3.2604714714358716e-06, + "loss": 0.0918, + "step": 7769 + }, + { + "epoch": 1.2589112119248218, + "grad_norm": 0.7644399404525757, + "learning_rate": 3.2600548787439413e-06, + "loss": 0.094, + "step": 7770 + }, + { + "epoch": 1.2590732339598185, + "grad_norm": 0.8792965412139893, + "learning_rate": 3.2596382627961714e-06, + "loss": 0.1082, + "step": 7771 + }, + { + "epoch": 1.2592352559948152, + "grad_norm": 0.8653140664100647, + "learning_rate": 3.2592216236053086e-06, + "loss": 0.1051, + "step": 7772 + }, + { + "epoch": 1.259397278029812, + "grad_norm": 0.898324728012085, + "learning_rate": 3.2588049611841023e-06, + "loss": 0.107, + "step": 7773 + }, + { + "epoch": 1.2595593000648089, + "grad_norm": 0.8365474343299866, + "learning_rate": 3.2583882755452994e-06, + "loss": 0.102, + "step": 7774 + }, + { + "epoch": 1.2597213220998056, + "grad_norm": 0.8603896498680115, + "learning_rate": 3.2579715667016516e-06, + "loss": 0.1069, + "step": 7775 + }, + { + "epoch": 1.2598833441348023, + "grad_norm": 0.8619314432144165, + "learning_rate": 3.257554834665907e-06, + "loss": 0.1045, + "step": 7776 + }, + { + "epoch": 1.2600453661697992, + "grad_norm": 0.7768551111221313, + "learning_rate": 3.2571380794508183e-06, + "loss": 0.0992, + "step": 7777 + }, + { + "epoch": 1.2602073882047957, + "grad_norm": 0.8619160056114197, + "learning_rate": 3.2567213010691367e-06, + "loss": 0.1118, + "step": 7778 + }, + { + "epoch": 1.2603694102397927, + "grad_norm": 0.9219297170639038, + "learning_rate": 3.256304499533614e-06, + "loss": 0.1117, + "step": 7779 + }, + { + "epoch": 1.2605314322747894, + "grad_norm": 0.8448036313056946, + "learning_rate": 3.255887674857004e-06, + "loss": 0.104, + "step": 7780 + }, + { + "epoch": 1.260693454309786, + "grad_norm": 0.80354905128479, + "learning_rate": 3.255470827052061e-06, + "loss": 0.1035, + "step": 7781 + }, + { + "epoch": 1.260855476344783, + "grad_norm": 0.7997376918792725, + "learning_rate": 3.2550539561315385e-06, + "loss": 0.1034, + "step": 7782 + }, + { + "epoch": 1.2610174983797797, + "grad_norm": 0.7824645042419434, + "learning_rate": 3.2546370621081912e-06, + "loss": 0.0989, + "step": 7783 + }, + { + "epoch": 1.2611795204147764, + "grad_norm": 0.757111668586731, + "learning_rate": 3.2542201449947774e-06, + "loss": 0.1016, + "step": 7784 + }, + { + "epoch": 1.2613415424497731, + "grad_norm": 0.8208655714988708, + "learning_rate": 3.253803204804052e-06, + "loss": 0.1116, + "step": 7785 + }, + { + "epoch": 1.2615035644847699, + "grad_norm": 0.817868709564209, + "learning_rate": 3.2533862415487723e-06, + "loss": 0.1056, + "step": 7786 + }, + { + "epoch": 1.2616655865197668, + "grad_norm": 0.7902513146400452, + "learning_rate": 3.252969255241697e-06, + "loss": 0.1007, + "step": 7787 + }, + { + "epoch": 1.2618276085547635, + "grad_norm": 0.8338719010353088, + "learning_rate": 3.2525522458955843e-06, + "loss": 0.1024, + "step": 7788 + }, + { + "epoch": 1.2619896305897602, + "grad_norm": 0.8464298844337463, + "learning_rate": 3.2521352135231944e-06, + "loss": 0.1108, + "step": 7789 + }, + { + "epoch": 1.262151652624757, + "grad_norm": 0.809252142906189, + "learning_rate": 3.251718158137287e-06, + "loss": 0.1044, + "step": 7790 + }, + { + "epoch": 1.2623136746597536, + "grad_norm": 0.6791431903839111, + "learning_rate": 3.2513010797506236e-06, + "loss": 0.0965, + "step": 7791 + }, + { + "epoch": 1.2624756966947506, + "grad_norm": 0.7555492520332336, + "learning_rate": 3.2508839783759642e-06, + "loss": 0.1014, + "step": 7792 + }, + { + "epoch": 1.2626377187297473, + "grad_norm": 0.6891449093818665, + "learning_rate": 3.2504668540260732e-06, + "loss": 0.088, + "step": 7793 + }, + { + "epoch": 1.262799740764744, + "grad_norm": 0.770533561706543, + "learning_rate": 3.2500497067137116e-06, + "loss": 0.1035, + "step": 7794 + }, + { + "epoch": 1.2629617627997407, + "grad_norm": 0.7673627138137817, + "learning_rate": 3.2496325364516444e-06, + "loss": 0.0951, + "step": 7795 + }, + { + "epoch": 1.2631237848347374, + "grad_norm": 0.8320962190628052, + "learning_rate": 3.2492153432526356e-06, + "loss": 0.1048, + "step": 7796 + }, + { + "epoch": 1.2632858068697344, + "grad_norm": 0.8069162368774414, + "learning_rate": 3.248798127129451e-06, + "loss": 0.1025, + "step": 7797 + }, + { + "epoch": 1.263447828904731, + "grad_norm": 0.9537956118583679, + "learning_rate": 3.2483808880948552e-06, + "loss": 0.1023, + "step": 7798 + }, + { + "epoch": 1.2636098509397278, + "grad_norm": 0.9524184465408325, + "learning_rate": 3.2479636261616156e-06, + "loss": 0.1296, + "step": 7799 + }, + { + "epoch": 1.2637718729747245, + "grad_norm": 0.8504661321640015, + "learning_rate": 3.2475463413424983e-06, + "loss": 0.1071, + "step": 7800 + }, + { + "epoch": 1.2639338950097212, + "grad_norm": 0.8363580703735352, + "learning_rate": 3.247129033650273e-06, + "loss": 0.0971, + "step": 7801 + }, + { + "epoch": 1.2640959170447181, + "grad_norm": 0.7588921785354614, + "learning_rate": 3.246711703097707e-06, + "loss": 0.0923, + "step": 7802 + }, + { + "epoch": 1.2642579390797148, + "grad_norm": 0.7907635569572449, + "learning_rate": 3.2462943496975696e-06, + "loss": 0.1052, + "step": 7803 + }, + { + "epoch": 1.2644199611147116, + "grad_norm": 0.8892641067504883, + "learning_rate": 3.2458769734626315e-06, + "loss": 0.1188, + "step": 7804 + }, + { + "epoch": 1.2645819831497085, + "grad_norm": 0.7947612404823303, + "learning_rate": 3.245459574405662e-06, + "loss": 0.1109, + "step": 7805 + }, + { + "epoch": 1.2647440051847052, + "grad_norm": 0.8161318898200989, + "learning_rate": 3.245042152539435e-06, + "loss": 0.1071, + "step": 7806 + }, + { + "epoch": 1.264906027219702, + "grad_norm": 0.8670127987861633, + "learning_rate": 3.2446247078767195e-06, + "loss": 0.1086, + "step": 7807 + }, + { + "epoch": 1.2650680492546986, + "grad_norm": 0.8083024621009827, + "learning_rate": 3.2442072404302917e-06, + "loss": 0.1044, + "step": 7808 + }, + { + "epoch": 1.2652300712896953, + "grad_norm": 0.7659672498703003, + "learning_rate": 3.243789750212922e-06, + "loss": 0.097, + "step": 7809 + }, + { + "epoch": 1.2653920933246923, + "grad_norm": 0.7633987069129944, + "learning_rate": 3.243372237237386e-06, + "loss": 0.0838, + "step": 7810 + }, + { + "epoch": 1.265554115359689, + "grad_norm": 0.7828187942504883, + "learning_rate": 3.2429547015164585e-06, + "loss": 0.0969, + "step": 7811 + }, + { + "epoch": 1.2657161373946857, + "grad_norm": 0.9652653932571411, + "learning_rate": 3.2425371430629155e-06, + "loss": 0.1092, + "step": 7812 + }, + { + "epoch": 1.2658781594296824, + "grad_norm": 0.807366669178009, + "learning_rate": 3.242119561889533e-06, + "loss": 0.0976, + "step": 7813 + }, + { + "epoch": 1.2660401814646791, + "grad_norm": 0.8526018857955933, + "learning_rate": 3.241701958009087e-06, + "loss": 0.1037, + "step": 7814 + }, + { + "epoch": 1.266202203499676, + "grad_norm": 0.9918192625045776, + "learning_rate": 3.2412843314343566e-06, + "loss": 0.1089, + "step": 7815 + }, + { + "epoch": 1.2663642255346728, + "grad_norm": 0.7938478589057922, + "learning_rate": 3.2408666821781186e-06, + "loss": 0.1041, + "step": 7816 + }, + { + "epoch": 1.2665262475696695, + "grad_norm": 0.8537562489509583, + "learning_rate": 3.2404490102531536e-06, + "loss": 0.1142, + "step": 7817 + }, + { + "epoch": 1.2666882696046662, + "grad_norm": 0.858700692653656, + "learning_rate": 3.2400313156722414e-06, + "loss": 0.1092, + "step": 7818 + }, + { + "epoch": 1.266850291639663, + "grad_norm": 0.8112673163414001, + "learning_rate": 3.2396135984481607e-06, + "loss": 0.097, + "step": 7819 + }, + { + "epoch": 1.2670123136746598, + "grad_norm": 0.7495766282081604, + "learning_rate": 3.2391958585936946e-06, + "loss": 0.1006, + "step": 7820 + }, + { + "epoch": 1.2671743357096565, + "grad_norm": 0.8106545805931091, + "learning_rate": 3.2387780961216237e-06, + "loss": 0.1012, + "step": 7821 + }, + { + "epoch": 1.2673363577446533, + "grad_norm": 0.7898968458175659, + "learning_rate": 3.2383603110447304e-06, + "loss": 0.0889, + "step": 7822 + }, + { + "epoch": 1.26749837977965, + "grad_norm": 0.8760552406311035, + "learning_rate": 3.237942503375799e-06, + "loss": 0.1115, + "step": 7823 + }, + { + "epoch": 1.2676604018146467, + "grad_norm": 0.7963511943817139, + "learning_rate": 3.2375246731276122e-06, + "loss": 0.1045, + "step": 7824 + }, + { + "epoch": 1.2678224238496436, + "grad_norm": 0.8547530770301819, + "learning_rate": 3.237106820312956e-06, + "loss": 0.1028, + "step": 7825 + }, + { + "epoch": 1.2679844458846403, + "grad_norm": 0.749346137046814, + "learning_rate": 3.236688944944614e-06, + "loss": 0.0918, + "step": 7826 + }, + { + "epoch": 1.268146467919637, + "grad_norm": 0.6958228945732117, + "learning_rate": 3.2362710470353737e-06, + "loss": 0.087, + "step": 7827 + }, + { + "epoch": 1.268308489954634, + "grad_norm": 0.9344890713691711, + "learning_rate": 3.2358531265980207e-06, + "loss": 0.1125, + "step": 7828 + }, + { + "epoch": 1.2684705119896305, + "grad_norm": 0.8283147215843201, + "learning_rate": 3.2354351836453423e-06, + "loss": 0.097, + "step": 7829 + }, + { + "epoch": 1.2686325340246274, + "grad_norm": 0.9189488291740417, + "learning_rate": 3.2350172181901283e-06, + "loss": 0.1061, + "step": 7830 + }, + { + "epoch": 1.268794556059624, + "grad_norm": 0.8217201232910156, + "learning_rate": 3.234599230245165e-06, + "loss": 0.1026, + "step": 7831 + }, + { + "epoch": 1.2689565780946208, + "grad_norm": 0.9388702511787415, + "learning_rate": 3.2341812198232437e-06, + "loss": 0.1086, + "step": 7832 + }, + { + "epoch": 1.2691186001296177, + "grad_norm": 0.7720819115638733, + "learning_rate": 3.2337631869371534e-06, + "loss": 0.1009, + "step": 7833 + }, + { + "epoch": 1.2692806221646145, + "grad_norm": 0.9018075466156006, + "learning_rate": 3.2333451315996857e-06, + "loss": 0.1147, + "step": 7834 + }, + { + "epoch": 1.2694426441996112, + "grad_norm": 0.7737125754356384, + "learning_rate": 3.2329270538236313e-06, + "loss": 0.098, + "step": 7835 + }, + { + "epoch": 1.2696046662346079, + "grad_norm": 0.9752976298332214, + "learning_rate": 3.232508953621782e-06, + "loss": 0.1188, + "step": 7836 + }, + { + "epoch": 1.2697666882696046, + "grad_norm": 0.8050476908683777, + "learning_rate": 3.232090831006932e-06, + "loss": 0.1102, + "step": 7837 + }, + { + "epoch": 1.2699287103046015, + "grad_norm": 0.8317782878875732, + "learning_rate": 3.231672685991874e-06, + "loss": 0.0991, + "step": 7838 + }, + { + "epoch": 1.2700907323395982, + "grad_norm": 0.8194529414176941, + "learning_rate": 3.231254518589403e-06, + "loss": 0.1068, + "step": 7839 + }, + { + "epoch": 1.270252754374595, + "grad_norm": 0.8640487194061279, + "learning_rate": 3.2308363288123128e-06, + "loss": 0.1159, + "step": 7840 + }, + { + "epoch": 1.2704147764095917, + "grad_norm": 0.7963087558746338, + "learning_rate": 3.2304181166733993e-06, + "loss": 0.1065, + "step": 7841 + }, + { + "epoch": 1.2705767984445884, + "grad_norm": 1.0018774271011353, + "learning_rate": 3.2299998821854593e-06, + "loss": 0.1209, + "step": 7842 + }, + { + "epoch": 1.2707388204795853, + "grad_norm": 0.7300251722335815, + "learning_rate": 3.2295816253612897e-06, + "loss": 0.0895, + "step": 7843 + }, + { + "epoch": 1.270900842514582, + "grad_norm": 0.7629234790802002, + "learning_rate": 3.229163346213688e-06, + "loss": 0.1015, + "step": 7844 + }, + { + "epoch": 1.2710628645495787, + "grad_norm": 0.7023546099662781, + "learning_rate": 3.2287450447554526e-06, + "loss": 0.0956, + "step": 7845 + }, + { + "epoch": 1.2712248865845754, + "grad_norm": 0.8268736600875854, + "learning_rate": 3.228326720999382e-06, + "loss": 0.1019, + "step": 7846 + }, + { + "epoch": 1.2713869086195722, + "grad_norm": 0.7980479001998901, + "learning_rate": 3.227908374958276e-06, + "loss": 0.1008, + "step": 7847 + }, + { + "epoch": 1.271548930654569, + "grad_norm": 0.7699779868125916, + "learning_rate": 3.2274900066449355e-06, + "loss": 0.096, + "step": 7848 + }, + { + "epoch": 1.2717109526895658, + "grad_norm": 0.9146498441696167, + "learning_rate": 3.2270716160721612e-06, + "loss": 0.1143, + "step": 7849 + }, + { + "epoch": 1.2718729747245625, + "grad_norm": 0.8463569283485413, + "learning_rate": 3.2266532032527548e-06, + "loss": 0.105, + "step": 7850 + }, + { + "epoch": 1.2720349967595592, + "grad_norm": 0.8177133202552795, + "learning_rate": 3.2262347681995187e-06, + "loss": 0.1015, + "step": 7851 + }, + { + "epoch": 1.272197018794556, + "grad_norm": 0.8679166436195374, + "learning_rate": 3.225816310925257e-06, + "loss": 0.0998, + "step": 7852 + }, + { + "epoch": 1.2723590408295529, + "grad_norm": 0.7825645804405212, + "learning_rate": 3.2253978314427716e-06, + "loss": 0.1001, + "step": 7853 + }, + { + "epoch": 1.2725210628645496, + "grad_norm": 0.8941231966018677, + "learning_rate": 3.224979329764869e-06, + "loss": 0.0982, + "step": 7854 + }, + { + "epoch": 1.2726830848995463, + "grad_norm": 0.8882502913475037, + "learning_rate": 3.2245608059043525e-06, + "loss": 0.1052, + "step": 7855 + }, + { + "epoch": 1.2728451069345432, + "grad_norm": 0.7035767436027527, + "learning_rate": 3.224142259874029e-06, + "loss": 0.0808, + "step": 7856 + }, + { + "epoch": 1.27300712896954, + "grad_norm": 0.837063193321228, + "learning_rate": 3.2237236916867047e-06, + "loss": 0.1036, + "step": 7857 + }, + { + "epoch": 1.2731691510045366, + "grad_norm": 0.8827104568481445, + "learning_rate": 3.223305101355187e-06, + "loss": 0.1098, + "step": 7858 + }, + { + "epoch": 1.2733311730395334, + "grad_norm": 0.8305638432502747, + "learning_rate": 3.2228864888922838e-06, + "loss": 0.1051, + "step": 7859 + }, + { + "epoch": 1.27349319507453, + "grad_norm": 0.9665209054946899, + "learning_rate": 3.2224678543108024e-06, + "loss": 0.1178, + "step": 7860 + }, + { + "epoch": 1.273655217109527, + "grad_norm": 0.8135932683944702, + "learning_rate": 3.222049197623554e-06, + "loss": 0.0927, + "step": 7861 + }, + { + "epoch": 1.2738172391445237, + "grad_norm": 0.8016318082809448, + "learning_rate": 3.221630518843347e-06, + "loss": 0.1005, + "step": 7862 + }, + { + "epoch": 1.2739792611795204, + "grad_norm": 0.8300334811210632, + "learning_rate": 3.2212118179829925e-06, + "loss": 0.1057, + "step": 7863 + }, + { + "epoch": 1.2741412832145171, + "grad_norm": 0.7906784415245056, + "learning_rate": 3.2207930950553017e-06, + "loss": 0.1001, + "step": 7864 + }, + { + "epoch": 1.2743033052495139, + "grad_norm": 0.8609299063682556, + "learning_rate": 3.2203743500730867e-06, + "loss": 0.1019, + "step": 7865 + }, + { + "epoch": 1.2744653272845108, + "grad_norm": 0.783918023109436, + "learning_rate": 3.2199555830491597e-06, + "loss": 0.0979, + "step": 7866 + }, + { + "epoch": 1.2746273493195075, + "grad_norm": 0.9125205278396606, + "learning_rate": 3.219536793996334e-06, + "loss": 0.1175, + "step": 7867 + }, + { + "epoch": 1.2747893713545042, + "grad_norm": 0.8157711029052734, + "learning_rate": 3.2191179829274244e-06, + "loss": 0.0942, + "step": 7868 + }, + { + "epoch": 1.274951393389501, + "grad_norm": 0.9252278208732605, + "learning_rate": 3.218699149855244e-06, + "loss": 0.1111, + "step": 7869 + }, + { + "epoch": 1.2751134154244976, + "grad_norm": 0.8688713908195496, + "learning_rate": 3.2182802947926086e-06, + "loss": 0.1111, + "step": 7870 + }, + { + "epoch": 1.2752754374594946, + "grad_norm": 0.841914176940918, + "learning_rate": 3.217861417752335e-06, + "loss": 0.1005, + "step": 7871 + }, + { + "epoch": 1.2754374594944913, + "grad_norm": 0.8829323053359985, + "learning_rate": 3.2174425187472387e-06, + "loss": 0.1121, + "step": 7872 + }, + { + "epoch": 1.275599481529488, + "grad_norm": 0.8816058039665222, + "learning_rate": 3.2170235977901375e-06, + "loss": 0.1215, + "step": 7873 + }, + { + "epoch": 1.2757615035644847, + "grad_norm": 0.7793697118759155, + "learning_rate": 3.2166046548938497e-06, + "loss": 0.1007, + "step": 7874 + }, + { + "epoch": 1.2759235255994814, + "grad_norm": 0.7956479787826538, + "learning_rate": 3.216185690071193e-06, + "loss": 0.1034, + "step": 7875 + }, + { + "epoch": 1.2760855476344783, + "grad_norm": 0.80839604139328, + "learning_rate": 3.215766703334988e-06, + "loss": 0.1001, + "step": 7876 + }, + { + "epoch": 1.276247569669475, + "grad_norm": 0.8526255488395691, + "learning_rate": 3.215347694698054e-06, + "loss": 0.1031, + "step": 7877 + }, + { + "epoch": 1.2764095917044718, + "grad_norm": 0.8941551446914673, + "learning_rate": 3.214928664173211e-06, + "loss": 0.1132, + "step": 7878 + }, + { + "epoch": 1.2765716137394687, + "grad_norm": 0.7045158743858337, + "learning_rate": 3.2145096117732823e-06, + "loss": 0.0901, + "step": 7879 + }, + { + "epoch": 1.2767336357744652, + "grad_norm": 0.9706689715385437, + "learning_rate": 3.2140905375110875e-06, + "loss": 0.1244, + "step": 7880 + }, + { + "epoch": 1.2768956578094621, + "grad_norm": 0.9292829036712646, + "learning_rate": 3.21367144139945e-06, + "loss": 0.1131, + "step": 7881 + }, + { + "epoch": 1.2770576798444588, + "grad_norm": 0.8300909996032715, + "learning_rate": 3.2132523234511943e-06, + "loss": 0.1164, + "step": 7882 + }, + { + "epoch": 1.2772197018794555, + "grad_norm": 0.7891339659690857, + "learning_rate": 3.2128331836791436e-06, + "loss": 0.1007, + "step": 7883 + }, + { + "epoch": 1.2773817239144525, + "grad_norm": 0.6846839189529419, + "learning_rate": 3.2124140220961215e-06, + "loss": 0.0874, + "step": 7884 + }, + { + "epoch": 1.2775437459494492, + "grad_norm": 0.8767536282539368, + "learning_rate": 3.211994838714955e-06, + "loss": 0.1121, + "step": 7885 + }, + { + "epoch": 1.277705767984446, + "grad_norm": 0.891893744468689, + "learning_rate": 3.2115756335484694e-06, + "loss": 0.113, + "step": 7886 + }, + { + "epoch": 1.2778677900194426, + "grad_norm": 0.8657979965209961, + "learning_rate": 3.2111564066094913e-06, + "loss": 0.1176, + "step": 7887 + }, + { + "epoch": 1.2780298120544393, + "grad_norm": 0.8780067563056946, + "learning_rate": 3.210737157910848e-06, + "loss": 0.111, + "step": 7888 + }, + { + "epoch": 1.2781918340894363, + "grad_norm": 0.9224578738212585, + "learning_rate": 3.2103178874653677e-06, + "loss": 0.1151, + "step": 7889 + }, + { + "epoch": 1.278353856124433, + "grad_norm": 0.7573640942573547, + "learning_rate": 3.2098985952858796e-06, + "loss": 0.1087, + "step": 7890 + }, + { + "epoch": 1.2785158781594297, + "grad_norm": 0.8327637910842896, + "learning_rate": 3.2094792813852116e-06, + "loss": 0.1107, + "step": 7891 + }, + { + "epoch": 1.2786779001944264, + "grad_norm": 0.9042618870735168, + "learning_rate": 3.209059945776195e-06, + "loss": 0.1194, + "step": 7892 + }, + { + "epoch": 1.278839922229423, + "grad_norm": 0.9340540766716003, + "learning_rate": 3.2086405884716592e-06, + "loss": 0.1316, + "step": 7893 + }, + { + "epoch": 1.27900194426442, + "grad_norm": 0.7579165697097778, + "learning_rate": 3.2082212094844374e-06, + "loss": 0.099, + "step": 7894 + }, + { + "epoch": 1.2791639662994168, + "grad_norm": 0.7476134896278381, + "learning_rate": 3.20780180882736e-06, + "loss": 0.1061, + "step": 7895 + }, + { + "epoch": 1.2793259883344135, + "grad_norm": 0.8922802209854126, + "learning_rate": 3.20738238651326e-06, + "loss": 0.1033, + "step": 7896 + }, + { + "epoch": 1.2794880103694102, + "grad_norm": 0.8457995057106018, + "learning_rate": 3.2069629425549705e-06, + "loss": 0.1123, + "step": 7897 + }, + { + "epoch": 1.279650032404407, + "grad_norm": 0.7586995959281921, + "learning_rate": 3.206543476965326e-06, + "loss": 0.0922, + "step": 7898 + }, + { + "epoch": 1.2798120544394038, + "grad_norm": 0.9203418493270874, + "learning_rate": 3.2061239897571613e-06, + "loss": 0.1247, + "step": 7899 + }, + { + "epoch": 1.2799740764744005, + "grad_norm": 0.8178397417068481, + "learning_rate": 3.2057044809433108e-06, + "loss": 0.1039, + "step": 7900 + }, + { + "epoch": 1.2801360985093972, + "grad_norm": 0.794354259967804, + "learning_rate": 3.2052849505366113e-06, + "loss": 0.1085, + "step": 7901 + }, + { + "epoch": 1.280298120544394, + "grad_norm": 0.7944015264511108, + "learning_rate": 3.2048653985498985e-06, + "loss": 0.1099, + "step": 7902 + }, + { + "epoch": 1.2804601425793907, + "grad_norm": 0.8562796115875244, + "learning_rate": 3.2044458249960108e-06, + "loss": 0.1018, + "step": 7903 + }, + { + "epoch": 1.2806221646143876, + "grad_norm": 0.9842815399169922, + "learning_rate": 3.204026229887785e-06, + "loss": 0.12, + "step": 7904 + }, + { + "epoch": 1.2807841866493843, + "grad_norm": 0.7473559379577637, + "learning_rate": 3.2036066132380606e-06, + "loss": 0.0911, + "step": 7905 + }, + { + "epoch": 1.280946208684381, + "grad_norm": 0.7652568221092224, + "learning_rate": 3.203186975059677e-06, + "loss": 0.1059, + "step": 7906 + }, + { + "epoch": 1.281108230719378, + "grad_norm": 0.8993542790412903, + "learning_rate": 3.2027673153654733e-06, + "loss": 0.112, + "step": 7907 + }, + { + "epoch": 1.2812702527543747, + "grad_norm": 0.8552115559577942, + "learning_rate": 3.2023476341682902e-06, + "loss": 0.1073, + "step": 7908 + }, + { + "epoch": 1.2814322747893714, + "grad_norm": 0.7751871347427368, + "learning_rate": 3.2019279314809694e-06, + "loss": 0.0883, + "step": 7909 + }, + { + "epoch": 1.281594296824368, + "grad_norm": 0.891760528087616, + "learning_rate": 3.2015082073163524e-06, + "loss": 0.114, + "step": 7910 + }, + { + "epoch": 1.2817563188593648, + "grad_norm": 0.8688755035400391, + "learning_rate": 3.201088461687282e-06, + "loss": 0.1083, + "step": 7911 + }, + { + "epoch": 1.2819183408943617, + "grad_norm": 0.8324447274208069, + "learning_rate": 3.2006686946066012e-06, + "loss": 0.107, + "step": 7912 + }, + { + "epoch": 1.2820803629293585, + "grad_norm": 0.779424250125885, + "learning_rate": 3.2002489060871534e-06, + "loss": 0.0916, + "step": 7913 + }, + { + "epoch": 1.2822423849643552, + "grad_norm": 0.7420961260795593, + "learning_rate": 3.1998290961417844e-06, + "loss": 0.0904, + "step": 7914 + }, + { + "epoch": 1.2824044069993519, + "grad_norm": 0.8292026519775391, + "learning_rate": 3.199409264783338e-06, + "loss": 0.0986, + "step": 7915 + }, + { + "epoch": 1.2825664290343486, + "grad_norm": 0.8516577482223511, + "learning_rate": 3.1989894120246613e-06, + "loss": 0.103, + "step": 7916 + }, + { + "epoch": 1.2827284510693455, + "grad_norm": 0.7372321486473083, + "learning_rate": 3.1985695378786e-06, + "loss": 0.0895, + "step": 7917 + }, + { + "epoch": 1.2828904731043422, + "grad_norm": 0.8905530571937561, + "learning_rate": 3.1981496423580012e-06, + "loss": 0.1178, + "step": 7918 + }, + { + "epoch": 1.283052495139339, + "grad_norm": 0.819083571434021, + "learning_rate": 3.1977297254757124e-06, + "loss": 0.1058, + "step": 7919 + }, + { + "epoch": 1.2832145171743357, + "grad_norm": 0.8953108787536621, + "learning_rate": 3.1973097872445828e-06, + "loss": 0.1166, + "step": 7920 + }, + { + "epoch": 1.2833765392093324, + "grad_norm": 0.7964316606521606, + "learning_rate": 3.196889827677462e-06, + "loss": 0.0898, + "step": 7921 + }, + { + "epoch": 1.2835385612443293, + "grad_norm": 0.8181843161582947, + "learning_rate": 3.1964698467871976e-06, + "loss": 0.0913, + "step": 7922 + }, + { + "epoch": 1.283700583279326, + "grad_norm": 0.8924493789672852, + "learning_rate": 3.1960498445866423e-06, + "loss": 0.1187, + "step": 7923 + }, + { + "epoch": 1.2838626053143227, + "grad_norm": 0.9556626677513123, + "learning_rate": 3.1956298210886454e-06, + "loss": 0.1147, + "step": 7924 + }, + { + "epoch": 1.2840246273493194, + "grad_norm": 0.8342770934104919, + "learning_rate": 3.1952097763060595e-06, + "loss": 0.1027, + "step": 7925 + }, + { + "epoch": 1.2841866493843161, + "grad_norm": 0.7476953864097595, + "learning_rate": 3.1947897102517374e-06, + "loss": 0.0885, + "step": 7926 + }, + { + "epoch": 1.284348671419313, + "grad_norm": 0.8038281798362732, + "learning_rate": 3.1943696229385307e-06, + "loss": 0.1014, + "step": 7927 + }, + { + "epoch": 1.2845106934543098, + "grad_norm": 0.9089291095733643, + "learning_rate": 3.1939495143792944e-06, + "loss": 0.1119, + "step": 7928 + }, + { + "epoch": 1.2846727154893065, + "grad_norm": 0.9920743107795715, + "learning_rate": 3.193529384586882e-06, + "loss": 0.1191, + "step": 7929 + }, + { + "epoch": 1.2848347375243034, + "grad_norm": 0.7742721438407898, + "learning_rate": 3.1931092335741497e-06, + "loss": 0.0964, + "step": 7930 + }, + { + "epoch": 1.2849967595593, + "grad_norm": 0.8795700073242188, + "learning_rate": 3.1926890613539513e-06, + "loss": 0.1061, + "step": 7931 + }, + { + "epoch": 1.2851587815942969, + "grad_norm": 0.8437364101409912, + "learning_rate": 3.192268867939144e-06, + "loss": 0.1071, + "step": 7932 + }, + { + "epoch": 1.2853208036292936, + "grad_norm": 0.8689562678337097, + "learning_rate": 3.191848653342584e-06, + "loss": 0.1101, + "step": 7933 + }, + { + "epoch": 1.2854828256642903, + "grad_norm": 0.8216108679771423, + "learning_rate": 3.1914284175771303e-06, + "loss": 0.1049, + "step": 7934 + }, + { + "epoch": 1.2856448476992872, + "grad_norm": 0.8345628380775452, + "learning_rate": 3.19100816065564e-06, + "loss": 0.0987, + "step": 7935 + }, + { + "epoch": 1.285806869734284, + "grad_norm": 0.830443799495697, + "learning_rate": 3.1905878825909726e-06, + "loss": 0.107, + "step": 7936 + }, + { + "epoch": 1.2859688917692806, + "grad_norm": 0.7912009358406067, + "learning_rate": 3.190167583395986e-06, + "loss": 0.1001, + "step": 7937 + }, + { + "epoch": 1.2861309138042774, + "grad_norm": 0.8424001932144165, + "learning_rate": 3.189747263083543e-06, + "loss": 0.1061, + "step": 7938 + }, + { + "epoch": 1.286292935839274, + "grad_norm": 0.8694502115249634, + "learning_rate": 3.1893269216665017e-06, + "loss": 0.1152, + "step": 7939 + }, + { + "epoch": 1.286454957874271, + "grad_norm": 0.8436936736106873, + "learning_rate": 3.188906559157725e-06, + "loss": 0.1028, + "step": 7940 + }, + { + "epoch": 1.2866169799092677, + "grad_norm": 0.7603984475135803, + "learning_rate": 3.188486175570075e-06, + "loss": 0.0907, + "step": 7941 + }, + { + "epoch": 1.2867790019442644, + "grad_norm": 0.8339808583259583, + "learning_rate": 3.1880657709164144e-06, + "loss": 0.107, + "step": 7942 + }, + { + "epoch": 1.2869410239792611, + "grad_norm": 0.8520210385322571, + "learning_rate": 3.187645345209606e-06, + "loss": 0.1049, + "step": 7943 + }, + { + "epoch": 1.2871030460142578, + "grad_norm": 0.8916787505149841, + "learning_rate": 3.1872248984625135e-06, + "loss": 0.108, + "step": 7944 + }, + { + "epoch": 1.2872650680492548, + "grad_norm": 0.9074912667274475, + "learning_rate": 3.1868044306880037e-06, + "loss": 0.1019, + "step": 7945 + }, + { + "epoch": 1.2874270900842515, + "grad_norm": 0.9418996572494507, + "learning_rate": 3.1863839418989385e-06, + "loss": 0.1114, + "step": 7946 + }, + { + "epoch": 1.2875891121192482, + "grad_norm": 0.7757999897003174, + "learning_rate": 3.185963432108187e-06, + "loss": 0.0916, + "step": 7947 + }, + { + "epoch": 1.287751134154245, + "grad_norm": 0.8359844088554382, + "learning_rate": 3.185542901328613e-06, + "loss": 0.1024, + "step": 7948 + }, + { + "epoch": 1.2879131561892416, + "grad_norm": 1.3665879964828491, + "learning_rate": 3.185122349573087e-06, + "loss": 0.1411, + "step": 7949 + }, + { + "epoch": 1.2880751782242386, + "grad_norm": 0.7807172536849976, + "learning_rate": 3.184701776854474e-06, + "loss": 0.104, + "step": 7950 + }, + { + "epoch": 1.2882372002592353, + "grad_norm": 0.7108926177024841, + "learning_rate": 3.1842811831856444e-06, + "loss": 0.0831, + "step": 7951 + }, + { + "epoch": 1.288399222294232, + "grad_norm": 0.7794750332832336, + "learning_rate": 3.1838605685794665e-06, + "loss": 0.1034, + "step": 7952 + }, + { + "epoch": 1.2885612443292287, + "grad_norm": 0.8731608986854553, + "learning_rate": 3.183439933048809e-06, + "loss": 0.0996, + "step": 7953 + }, + { + "epoch": 1.2887232663642254, + "grad_norm": 0.8640766143798828, + "learning_rate": 3.1830192766065445e-06, + "loss": 0.1089, + "step": 7954 + }, + { + "epoch": 1.2888852883992223, + "grad_norm": 0.8059561848640442, + "learning_rate": 3.1825985992655422e-06, + "loss": 0.0989, + "step": 7955 + }, + { + "epoch": 1.289047310434219, + "grad_norm": 0.889413595199585, + "learning_rate": 3.1821779010386755e-06, + "loss": 0.1129, + "step": 7956 + }, + { + "epoch": 1.2892093324692158, + "grad_norm": 0.7468616366386414, + "learning_rate": 3.181757181938815e-06, + "loss": 0.0968, + "step": 7957 + }, + { + "epoch": 1.2893713545042127, + "grad_norm": 0.7996143102645874, + "learning_rate": 3.181336441978835e-06, + "loss": 0.0968, + "step": 7958 + }, + { + "epoch": 1.2895333765392094, + "grad_norm": 0.8068464398384094, + "learning_rate": 3.1809156811716084e-06, + "loss": 0.0977, + "step": 7959 + }, + { + "epoch": 1.2896953985742061, + "grad_norm": 0.7814459800720215, + "learning_rate": 3.18049489953001e-06, + "loss": 0.1007, + "step": 7960 + }, + { + "epoch": 1.2898574206092028, + "grad_norm": 0.7772566080093384, + "learning_rate": 3.180074097066914e-06, + "loss": 0.0983, + "step": 7961 + }, + { + "epoch": 1.2900194426441995, + "grad_norm": 0.7291167974472046, + "learning_rate": 3.1796532737951975e-06, + "loss": 0.0959, + "step": 7962 + }, + { + "epoch": 1.2901814646791965, + "grad_norm": 0.7671595215797424, + "learning_rate": 3.1792324297277345e-06, + "loss": 0.0991, + "step": 7963 + }, + { + "epoch": 1.2903434867141932, + "grad_norm": 0.7343456745147705, + "learning_rate": 3.1788115648774033e-06, + "loss": 0.0958, + "step": 7964 + }, + { + "epoch": 1.29050550874919, + "grad_norm": 0.8339682817459106, + "learning_rate": 3.1783906792570805e-06, + "loss": 0.1065, + "step": 7965 + }, + { + "epoch": 1.2906675307841866, + "grad_norm": 0.9149320125579834, + "learning_rate": 3.177969772879645e-06, + "loss": 0.1138, + "step": 7966 + }, + { + "epoch": 1.2908295528191833, + "grad_norm": 0.8161713480949402, + "learning_rate": 3.1775488457579756e-06, + "loss": 0.1041, + "step": 7967 + }, + { + "epoch": 1.2909915748541803, + "grad_norm": 0.8028397560119629, + "learning_rate": 3.1771278979049496e-06, + "loss": 0.0971, + "step": 7968 + }, + { + "epoch": 1.291153596889177, + "grad_norm": 0.7893111109733582, + "learning_rate": 3.1767069293334502e-06, + "loss": 0.0984, + "step": 7969 + }, + { + "epoch": 1.2913156189241737, + "grad_norm": 0.9668081402778625, + "learning_rate": 3.176285940056355e-06, + "loss": 0.1123, + "step": 7970 + }, + { + "epoch": 1.2914776409591704, + "grad_norm": 0.896835207939148, + "learning_rate": 3.1758649300865473e-06, + "loss": 0.1062, + "step": 7971 + }, + { + "epoch": 1.291639662994167, + "grad_norm": 0.8189420700073242, + "learning_rate": 3.1754438994369087e-06, + "loss": 0.1035, + "step": 7972 + }, + { + "epoch": 1.291801685029164, + "grad_norm": 0.8541057109832764, + "learning_rate": 3.1750228481203206e-06, + "loss": 0.107, + "step": 7973 + }, + { + "epoch": 1.2919637070641607, + "grad_norm": 0.7424408793449402, + "learning_rate": 3.174601776149668e-06, + "loss": 0.0974, + "step": 7974 + }, + { + "epoch": 1.2921257290991575, + "grad_norm": 0.9748353958129883, + "learning_rate": 3.174180683537832e-06, + "loss": 0.1157, + "step": 7975 + }, + { + "epoch": 1.2922877511341542, + "grad_norm": 0.9521284699440002, + "learning_rate": 3.1737595702976996e-06, + "loss": 0.1038, + "step": 7976 + }, + { + "epoch": 1.2924497731691509, + "grad_norm": 0.9242678284645081, + "learning_rate": 3.1733384364421536e-06, + "loss": 0.1133, + "step": 7977 + }, + { + "epoch": 1.2926117952041478, + "grad_norm": 0.8057260513305664, + "learning_rate": 3.1729172819840825e-06, + "loss": 0.0995, + "step": 7978 + }, + { + "epoch": 1.2927738172391445, + "grad_norm": 0.7817633152008057, + "learning_rate": 3.17249610693637e-06, + "loss": 0.1035, + "step": 7979 + }, + { + "epoch": 1.2929358392741412, + "grad_norm": 0.8198245763778687, + "learning_rate": 3.1720749113119045e-06, + "loss": 0.1063, + "step": 7980 + }, + { + "epoch": 1.2930978613091382, + "grad_norm": 0.7432113885879517, + "learning_rate": 3.1716536951235727e-06, + "loss": 0.0965, + "step": 7981 + }, + { + "epoch": 1.2932598833441347, + "grad_norm": 0.7126169800758362, + "learning_rate": 3.1712324583842637e-06, + "loss": 0.0963, + "step": 7982 + }, + { + "epoch": 1.2934219053791316, + "grad_norm": 0.7479450702667236, + "learning_rate": 3.1708112011068647e-06, + "loss": 0.1006, + "step": 7983 + }, + { + "epoch": 1.2935839274141283, + "grad_norm": 0.8944701552391052, + "learning_rate": 3.1703899233042675e-06, + "loss": 0.1105, + "step": 7984 + }, + { + "epoch": 1.293745949449125, + "grad_norm": 0.8011032938957214, + "learning_rate": 3.1699686249893614e-06, + "loss": 0.1094, + "step": 7985 + }, + { + "epoch": 1.293907971484122, + "grad_norm": 0.7931639552116394, + "learning_rate": 3.1695473061750353e-06, + "loss": 0.1034, + "step": 7986 + }, + { + "epoch": 1.2940699935191187, + "grad_norm": 0.8425852060317993, + "learning_rate": 3.1691259668741823e-06, + "loss": 0.0978, + "step": 7987 + }, + { + "epoch": 1.2942320155541154, + "grad_norm": 0.7550665736198425, + "learning_rate": 3.1687046070996942e-06, + "loss": 0.0942, + "step": 7988 + }, + { + "epoch": 1.294394037589112, + "grad_norm": 0.8623692393302917, + "learning_rate": 3.168283226864463e-06, + "loss": 0.111, + "step": 7989 + }, + { + "epoch": 1.2945560596241088, + "grad_norm": 0.8119961619377136, + "learning_rate": 3.1678618261813828e-06, + "loss": 0.1057, + "step": 7990 + }, + { + "epoch": 1.2947180816591057, + "grad_norm": 0.9069503545761108, + "learning_rate": 3.1674404050633465e-06, + "loss": 0.1106, + "step": 7991 + }, + { + "epoch": 1.2948801036941024, + "grad_norm": 0.716511070728302, + "learning_rate": 3.167018963523249e-06, + "loss": 0.0845, + "step": 7992 + }, + { + "epoch": 1.2950421257290992, + "grad_norm": 0.8659084439277649, + "learning_rate": 3.166597501573986e-06, + "loss": 0.1073, + "step": 7993 + }, + { + "epoch": 1.2952041477640959, + "grad_norm": 0.9604227542877197, + "learning_rate": 3.1661760192284518e-06, + "loss": 0.1231, + "step": 7994 + }, + { + "epoch": 1.2953661697990926, + "grad_norm": 0.869018018245697, + "learning_rate": 3.165754516499544e-06, + "loss": 0.0979, + "step": 7995 + }, + { + "epoch": 1.2955281918340895, + "grad_norm": 0.8611443638801575, + "learning_rate": 3.165332993400159e-06, + "loss": 0.1135, + "step": 7996 + }, + { + "epoch": 1.2956902138690862, + "grad_norm": 0.8204544186592102, + "learning_rate": 3.1649114499431944e-06, + "loss": 0.108, + "step": 7997 + }, + { + "epoch": 1.295852235904083, + "grad_norm": 0.9157761931419373, + "learning_rate": 3.1644898861415484e-06, + "loss": 0.1186, + "step": 7998 + }, + { + "epoch": 1.2960142579390797, + "grad_norm": 0.8155969381332397, + "learning_rate": 3.1640683020081196e-06, + "loss": 0.1094, + "step": 7999 + }, + { + "epoch": 1.2961762799740764, + "grad_norm": 0.9700890779495239, + "learning_rate": 3.163646697555809e-06, + "loss": 0.1176, + "step": 8000 + }, + { + "epoch": 1.2963383020090733, + "grad_norm": 0.9052625894546509, + "learning_rate": 3.163225072797514e-06, + "loss": 0.1113, + "step": 8001 + }, + { + "epoch": 1.29650032404407, + "grad_norm": 0.8415457010269165, + "learning_rate": 3.1628034277461376e-06, + "loss": 0.103, + "step": 8002 + }, + { + "epoch": 1.2966623460790667, + "grad_norm": 0.9100468158721924, + "learning_rate": 3.1623817624145804e-06, + "loss": 0.1048, + "step": 8003 + }, + { + "epoch": 1.2968243681140634, + "grad_norm": 0.8173115849494934, + "learning_rate": 3.161960076815743e-06, + "loss": 0.0973, + "step": 8004 + }, + { + "epoch": 1.2969863901490601, + "grad_norm": 0.8509911298751831, + "learning_rate": 3.1615383709625303e-06, + "loss": 0.1052, + "step": 8005 + }, + { + "epoch": 1.297148412184057, + "grad_norm": 0.8229318261146545, + "learning_rate": 3.1611166448678445e-06, + "loss": 0.1083, + "step": 8006 + }, + { + "epoch": 1.2973104342190538, + "grad_norm": 0.7618302702903748, + "learning_rate": 3.1606948985445884e-06, + "loss": 0.1004, + "step": 8007 + }, + { + "epoch": 1.2974724562540505, + "grad_norm": 0.7803520560264587, + "learning_rate": 3.1602731320056675e-06, + "loss": 0.0968, + "step": 8008 + }, + { + "epoch": 1.2976344782890474, + "grad_norm": 1.071578860282898, + "learning_rate": 3.1598513452639867e-06, + "loss": 0.1186, + "step": 8009 + }, + { + "epoch": 1.2977965003240441, + "grad_norm": 0.9254783987998962, + "learning_rate": 3.159429538332452e-06, + "loss": 0.1081, + "step": 8010 + }, + { + "epoch": 1.2979585223590409, + "grad_norm": 0.8896194100379944, + "learning_rate": 3.1590077112239685e-06, + "loss": 0.1153, + "step": 8011 + }, + { + "epoch": 1.2981205443940376, + "grad_norm": 0.8195780515670776, + "learning_rate": 3.1585858639514444e-06, + "loss": 0.1114, + "step": 8012 + }, + { + "epoch": 1.2982825664290343, + "grad_norm": 0.869594931602478, + "learning_rate": 3.158163996527786e-06, + "loss": 0.104, + "step": 8013 + }, + { + "epoch": 1.2984445884640312, + "grad_norm": 0.9179608225822449, + "learning_rate": 3.1577421089659023e-06, + "loss": 0.1009, + "step": 8014 + }, + { + "epoch": 1.298606610499028, + "grad_norm": 0.8011884689331055, + "learning_rate": 3.157320201278702e-06, + "loss": 0.0954, + "step": 8015 + }, + { + "epoch": 1.2987686325340246, + "grad_norm": 0.8461195826530457, + "learning_rate": 3.1568982734790943e-06, + "loss": 0.108, + "step": 8016 + }, + { + "epoch": 1.2989306545690213, + "grad_norm": 0.7561697959899902, + "learning_rate": 3.1564763255799886e-06, + "loss": 0.0965, + "step": 8017 + }, + { + "epoch": 1.299092676604018, + "grad_norm": 0.8390095233917236, + "learning_rate": 3.1560543575942958e-06, + "loss": 0.1127, + "step": 8018 + }, + { + "epoch": 1.299254698639015, + "grad_norm": 0.9161712527275085, + "learning_rate": 3.155632369534928e-06, + "loss": 0.1163, + "step": 8019 + }, + { + "epoch": 1.2994167206740117, + "grad_norm": 0.8636978268623352, + "learning_rate": 3.1552103614147955e-06, + "loss": 0.1006, + "step": 8020 + }, + { + "epoch": 1.2995787427090084, + "grad_norm": 0.7662655115127563, + "learning_rate": 3.154788333246812e-06, + "loss": 0.093, + "step": 8021 + }, + { + "epoch": 1.2997407647440051, + "grad_norm": 0.7853266000747681, + "learning_rate": 3.1543662850438905e-06, + "loss": 0.0991, + "step": 8022 + }, + { + "epoch": 1.2999027867790018, + "grad_norm": 0.8680589199066162, + "learning_rate": 3.153944216818943e-06, + "loss": 0.1214, + "step": 8023 + }, + { + "epoch": 1.3000648088139988, + "grad_norm": 0.7828645706176758, + "learning_rate": 3.1535221285848866e-06, + "loss": 0.1039, + "step": 8024 + }, + { + "epoch": 1.3002268308489955, + "grad_norm": 0.8467226624488831, + "learning_rate": 3.1531000203546336e-06, + "loss": 0.1025, + "step": 8025 + }, + { + "epoch": 1.3003888528839922, + "grad_norm": 0.8064707517623901, + "learning_rate": 3.1526778921411006e-06, + "loss": 0.1102, + "step": 8026 + }, + { + "epoch": 1.300550874918989, + "grad_norm": 0.8857673406600952, + "learning_rate": 3.152255743957203e-06, + "loss": 0.1082, + "step": 8027 + }, + { + "epoch": 1.3007128969539856, + "grad_norm": 0.7488471865653992, + "learning_rate": 3.151833575815859e-06, + "loss": 0.1013, + "step": 8028 + }, + { + "epoch": 1.3008749189889826, + "grad_norm": 0.7051052451133728, + "learning_rate": 3.1514113877299844e-06, + "loss": 0.087, + "step": 8029 + }, + { + "epoch": 1.3010369410239793, + "grad_norm": 0.8191677927970886, + "learning_rate": 3.1509891797124977e-06, + "loss": 0.1059, + "step": 8030 + }, + { + "epoch": 1.301198963058976, + "grad_norm": 0.8188061118125916, + "learning_rate": 3.150566951776318e-06, + "loss": 0.1143, + "step": 8031 + }, + { + "epoch": 1.301360985093973, + "grad_norm": 0.9264690279960632, + "learning_rate": 3.150144703934363e-06, + "loss": 0.1195, + "step": 8032 + }, + { + "epoch": 1.3015230071289696, + "grad_norm": 0.786266565322876, + "learning_rate": 3.1497224361995544e-06, + "loss": 0.1015, + "step": 8033 + }, + { + "epoch": 1.3016850291639663, + "grad_norm": 0.9195615649223328, + "learning_rate": 3.149300148584811e-06, + "loss": 0.1219, + "step": 8034 + }, + { + "epoch": 1.301847051198963, + "grad_norm": 0.8377419114112854, + "learning_rate": 3.1488778411030547e-06, + "loss": 0.1083, + "step": 8035 + }, + { + "epoch": 1.3020090732339598, + "grad_norm": 0.7512086629867554, + "learning_rate": 3.1484555137672063e-06, + "loss": 0.0884, + "step": 8036 + }, + { + "epoch": 1.3021710952689567, + "grad_norm": 0.7718841433525085, + "learning_rate": 3.148033166590188e-06, + "loss": 0.0941, + "step": 8037 + }, + { + "epoch": 1.3023331173039534, + "grad_norm": 0.7764375805854797, + "learning_rate": 3.147610799584924e-06, + "loss": 0.1025, + "step": 8038 + }, + { + "epoch": 1.3024951393389501, + "grad_norm": 0.9073190689086914, + "learning_rate": 3.147188412764336e-06, + "loss": 0.1038, + "step": 8039 + }, + { + "epoch": 1.3026571613739468, + "grad_norm": 0.794582724571228, + "learning_rate": 3.1467660061413497e-06, + "loss": 0.0986, + "step": 8040 + }, + { + "epoch": 1.3028191834089435, + "grad_norm": 0.8445225358009338, + "learning_rate": 3.1463435797288876e-06, + "loss": 0.11, + "step": 8041 + }, + { + "epoch": 1.3029812054439405, + "grad_norm": 0.8097845315933228, + "learning_rate": 3.1459211335398765e-06, + "loss": 0.1001, + "step": 8042 + }, + { + "epoch": 1.3031432274789372, + "grad_norm": 0.8664547801017761, + "learning_rate": 3.1454986675872417e-06, + "loss": 0.1093, + "step": 8043 + }, + { + "epoch": 1.303305249513934, + "grad_norm": 0.9166635274887085, + "learning_rate": 3.14507618188391e-06, + "loss": 0.1105, + "step": 8044 + }, + { + "epoch": 1.3034672715489306, + "grad_norm": 0.8189054727554321, + "learning_rate": 3.1446536764428083e-06, + "loss": 0.1063, + "step": 8045 + }, + { + "epoch": 1.3036292935839273, + "grad_norm": 0.8049901723861694, + "learning_rate": 3.144231151276864e-06, + "loss": 0.1105, + "step": 8046 + }, + { + "epoch": 1.3037913156189243, + "grad_norm": 0.8540433049201965, + "learning_rate": 3.1438086063990054e-06, + "loss": 0.1084, + "step": 8047 + }, + { + "epoch": 1.303953337653921, + "grad_norm": 0.8205851912498474, + "learning_rate": 3.143386041822162e-06, + "loss": 0.0978, + "step": 8048 + }, + { + "epoch": 1.3041153596889177, + "grad_norm": 0.9036335945129395, + "learning_rate": 3.1429634575592617e-06, + "loss": 0.1028, + "step": 8049 + }, + { + "epoch": 1.3042773817239144, + "grad_norm": 0.8568320870399475, + "learning_rate": 3.142540853623236e-06, + "loss": 0.1099, + "step": 8050 + }, + { + "epoch": 1.304439403758911, + "grad_norm": 0.9346566796302795, + "learning_rate": 3.1421182300270146e-06, + "loss": 0.1128, + "step": 8051 + }, + { + "epoch": 1.304601425793908, + "grad_norm": 0.8423227667808533, + "learning_rate": 3.14169558678353e-06, + "loss": 0.1033, + "step": 8052 + }, + { + "epoch": 1.3047634478289047, + "grad_norm": 0.8017847537994385, + "learning_rate": 3.1412729239057133e-06, + "loss": 0.0991, + "step": 8053 + }, + { + "epoch": 1.3049254698639015, + "grad_norm": 0.9350404739379883, + "learning_rate": 3.1408502414064963e-06, + "loss": 0.1108, + "step": 8054 + }, + { + "epoch": 1.3050874918988984, + "grad_norm": 0.7774401307106018, + "learning_rate": 3.140427539298814e-06, + "loss": 0.0955, + "step": 8055 + }, + { + "epoch": 1.3052495139338949, + "grad_norm": 0.7763550281524658, + "learning_rate": 3.140004817595597e-06, + "loss": 0.0892, + "step": 8056 + }, + { + "epoch": 1.3054115359688918, + "grad_norm": 0.8174300193786621, + "learning_rate": 3.139582076309783e-06, + "loss": 0.1017, + "step": 8057 + }, + { + "epoch": 1.3055735580038885, + "grad_norm": 0.7830010056495667, + "learning_rate": 3.1391593154543043e-06, + "loss": 0.0931, + "step": 8058 + }, + { + "epoch": 1.3057355800388852, + "grad_norm": 0.8983414769172668, + "learning_rate": 3.1387365350420973e-06, + "loss": 0.1045, + "step": 8059 + }, + { + "epoch": 1.3058976020738822, + "grad_norm": 0.8986459374427795, + "learning_rate": 3.138313735086099e-06, + "loss": 0.1145, + "step": 8060 + }, + { + "epoch": 1.3060596241088789, + "grad_norm": 0.924191415309906, + "learning_rate": 3.137890915599243e-06, + "loss": 0.1091, + "step": 8061 + }, + { + "epoch": 1.3062216461438756, + "grad_norm": 0.9674059152603149, + "learning_rate": 3.137468076594471e-06, + "loss": 0.1148, + "step": 8062 + }, + { + "epoch": 1.3063836681788723, + "grad_norm": 0.8017002940177917, + "learning_rate": 3.1370452180847165e-06, + "loss": 0.108, + "step": 8063 + }, + { + "epoch": 1.306545690213869, + "grad_norm": 0.8276110291481018, + "learning_rate": 3.1366223400829215e-06, + "loss": 0.1081, + "step": 8064 + }, + { + "epoch": 1.306707712248866, + "grad_norm": 0.8368842005729675, + "learning_rate": 3.136199442602023e-06, + "loss": 0.1065, + "step": 8065 + }, + { + "epoch": 1.3068697342838627, + "grad_norm": 0.8028512597084045, + "learning_rate": 3.135776525654961e-06, + "loss": 0.0926, + "step": 8066 + }, + { + "epoch": 1.3070317563188594, + "grad_norm": 0.7708948850631714, + "learning_rate": 3.135353589254676e-06, + "loss": 0.0956, + "step": 8067 + }, + { + "epoch": 1.307193778353856, + "grad_norm": 0.7509301900863647, + "learning_rate": 3.1349306334141084e-06, + "loss": 0.1024, + "step": 8068 + }, + { + "epoch": 1.3073558003888528, + "grad_norm": 0.8279680609703064, + "learning_rate": 3.1345076581462007e-06, + "loss": 0.1069, + "step": 8069 + }, + { + "epoch": 1.3075178224238497, + "grad_norm": 0.763145387172699, + "learning_rate": 3.134084663463894e-06, + "loss": 0.097, + "step": 8070 + }, + { + "epoch": 1.3076798444588464, + "grad_norm": 0.8883261680603027, + "learning_rate": 3.1336616493801305e-06, + "loss": 0.1074, + "step": 8071 + }, + { + "epoch": 1.3078418664938432, + "grad_norm": 0.9554191827774048, + "learning_rate": 3.1332386159078536e-06, + "loss": 0.1057, + "step": 8072 + }, + { + "epoch": 1.3080038885288399, + "grad_norm": 0.9759199023246765, + "learning_rate": 3.132815563060008e-06, + "loss": 0.1251, + "step": 8073 + }, + { + "epoch": 1.3081659105638366, + "grad_norm": 0.7938733696937561, + "learning_rate": 3.132392490849537e-06, + "loss": 0.0819, + "step": 8074 + }, + { + "epoch": 1.3083279325988335, + "grad_norm": 0.7426988482475281, + "learning_rate": 3.1319693992893874e-06, + "loss": 0.089, + "step": 8075 + }, + { + "epoch": 1.3084899546338302, + "grad_norm": 0.8131446838378906, + "learning_rate": 3.1315462883925026e-06, + "loss": 0.0958, + "step": 8076 + }, + { + "epoch": 1.308651976668827, + "grad_norm": 0.9161220192909241, + "learning_rate": 3.1311231581718303e-06, + "loss": 0.1206, + "step": 8077 + }, + { + "epoch": 1.3088139987038236, + "grad_norm": 0.834787905216217, + "learning_rate": 3.1307000086403162e-06, + "loss": 0.1093, + "step": 8078 + }, + { + "epoch": 1.3089760207388204, + "grad_norm": 1.0066626071929932, + "learning_rate": 3.1302768398109077e-06, + "loss": 0.1017, + "step": 8079 + }, + { + "epoch": 1.3091380427738173, + "grad_norm": 0.7865386605262756, + "learning_rate": 3.1298536516965537e-06, + "loss": 0.1025, + "step": 8080 + }, + { + "epoch": 1.309300064808814, + "grad_norm": 0.8171322345733643, + "learning_rate": 3.129430444310202e-06, + "loss": 0.1062, + "step": 8081 + }, + { + "epoch": 1.3094620868438107, + "grad_norm": 0.850459635257721, + "learning_rate": 3.129007217664802e-06, + "loss": 0.107, + "step": 8082 + }, + { + "epoch": 1.3096241088788076, + "grad_norm": 0.8040875792503357, + "learning_rate": 3.128583971773303e-06, + "loss": 0.1006, + "step": 8083 + }, + { + "epoch": 1.3097861309138044, + "grad_norm": 0.7739570736885071, + "learning_rate": 3.1281607066486565e-06, + "loss": 0.0991, + "step": 8084 + }, + { + "epoch": 1.309948152948801, + "grad_norm": 0.7959015369415283, + "learning_rate": 3.127737422303811e-06, + "loss": 0.097, + "step": 8085 + }, + { + "epoch": 1.3101101749837978, + "grad_norm": 0.8150349259376526, + "learning_rate": 3.127314118751721e-06, + "loss": 0.0953, + "step": 8086 + }, + { + "epoch": 1.3102721970187945, + "grad_norm": 0.8278794884681702, + "learning_rate": 3.1268907960053356e-06, + "loss": 0.0999, + "step": 8087 + }, + { + "epoch": 1.3104342190537914, + "grad_norm": 0.7229242324829102, + "learning_rate": 3.12646745407761e-06, + "loss": 0.0914, + "step": 8088 + }, + { + "epoch": 1.3105962410887881, + "grad_norm": 0.9541682004928589, + "learning_rate": 3.126044092981496e-06, + "loss": 0.1125, + "step": 8089 + }, + { + "epoch": 1.3107582631237849, + "grad_norm": 0.8512731790542603, + "learning_rate": 3.1256207127299475e-06, + "loss": 0.1065, + "step": 8090 + }, + { + "epoch": 1.3109202851587816, + "grad_norm": 0.9198841452598572, + "learning_rate": 3.12519731333592e-06, + "loss": 0.1117, + "step": 8091 + }, + { + "epoch": 1.3110823071937783, + "grad_norm": 0.7064549326896667, + "learning_rate": 3.124773894812367e-06, + "loss": 0.0941, + "step": 8092 + }, + { + "epoch": 1.3112443292287752, + "grad_norm": 0.806461751461029, + "learning_rate": 3.124350457172245e-06, + "loss": 0.0987, + "step": 8093 + }, + { + "epoch": 1.311406351263772, + "grad_norm": 0.7722499370574951, + "learning_rate": 3.123927000428509e-06, + "loss": 0.1029, + "step": 8094 + }, + { + "epoch": 1.3115683732987686, + "grad_norm": 0.9062454104423523, + "learning_rate": 3.123503524594118e-06, + "loss": 0.1205, + "step": 8095 + }, + { + "epoch": 1.3117303953337653, + "grad_norm": 0.9097535610198975, + "learning_rate": 3.123080029682027e-06, + "loss": 0.1028, + "step": 8096 + }, + { + "epoch": 1.311892417368762, + "grad_norm": 0.9619364738464355, + "learning_rate": 3.1226565157051953e-06, + "loss": 0.116, + "step": 8097 + }, + { + "epoch": 1.312054439403759, + "grad_norm": 0.845770001411438, + "learning_rate": 3.1222329826765806e-06, + "loss": 0.1112, + "step": 8098 + }, + { + "epoch": 1.3122164614387557, + "grad_norm": 0.7698118686676025, + "learning_rate": 3.121809430609143e-06, + "loss": 0.101, + "step": 8099 + }, + { + "epoch": 1.3123784834737524, + "grad_norm": 0.9154854416847229, + "learning_rate": 3.121385859515842e-06, + "loss": 0.101, + "step": 8100 + }, + { + "epoch": 1.3125405055087491, + "grad_norm": 0.9142758250236511, + "learning_rate": 3.1209622694096362e-06, + "loss": 0.1169, + "step": 8101 + }, + { + "epoch": 1.3127025275437458, + "grad_norm": 0.8378267288208008, + "learning_rate": 3.1205386603034886e-06, + "loss": 0.1112, + "step": 8102 + }, + { + "epoch": 1.3128645495787428, + "grad_norm": 0.8595819473266602, + "learning_rate": 3.1201150322103593e-06, + "loss": 0.1037, + "step": 8103 + }, + { + "epoch": 1.3130265716137395, + "grad_norm": 0.9312635660171509, + "learning_rate": 3.1196913851432108e-06, + "loss": 0.1021, + "step": 8104 + }, + { + "epoch": 1.3131885936487362, + "grad_norm": 0.7517058253288269, + "learning_rate": 3.119267719115005e-06, + "loss": 0.0987, + "step": 8105 + }, + { + "epoch": 1.3133506156837331, + "grad_norm": 0.7930306196212769, + "learning_rate": 3.1188440341387063e-06, + "loss": 0.0988, + "step": 8106 + }, + { + "epoch": 1.3135126377187296, + "grad_norm": 0.692436933517456, + "learning_rate": 3.1184203302272775e-06, + "loss": 0.0828, + "step": 8107 + }, + { + "epoch": 1.3136746597537265, + "grad_norm": 0.7574872970581055, + "learning_rate": 3.1179966073936837e-06, + "loss": 0.0962, + "step": 8108 + }, + { + "epoch": 1.3138366817887233, + "grad_norm": 0.758668065071106, + "learning_rate": 3.1175728656508874e-06, + "loss": 0.1004, + "step": 8109 + }, + { + "epoch": 1.31399870382372, + "grad_norm": 0.7493375539779663, + "learning_rate": 3.117149105011858e-06, + "loss": 0.0977, + "step": 8110 + }, + { + "epoch": 1.314160725858717, + "grad_norm": 0.7759285569190979, + "learning_rate": 3.1167253254895584e-06, + "loss": 0.1007, + "step": 8111 + }, + { + "epoch": 1.3143227478937136, + "grad_norm": 0.8017150163650513, + "learning_rate": 3.1163015270969567e-06, + "loss": 0.101, + "step": 8112 + }, + { + "epoch": 1.3144847699287103, + "grad_norm": 0.9174550771713257, + "learning_rate": 3.1158777098470194e-06, + "loss": 0.1181, + "step": 8113 + }, + { + "epoch": 1.314646791963707, + "grad_norm": 1.0454412698745728, + "learning_rate": 3.115453873752714e-06, + "loss": 0.1151, + "step": 8114 + }, + { + "epoch": 1.3148088139987038, + "grad_norm": 0.9942188262939453, + "learning_rate": 3.115030018827011e-06, + "loss": 0.1252, + "step": 8115 + }, + { + "epoch": 1.3149708360337007, + "grad_norm": 0.8930821418762207, + "learning_rate": 3.114606145082876e-06, + "loss": 0.1013, + "step": 8116 + }, + { + "epoch": 1.3151328580686974, + "grad_norm": 0.8020307421684265, + "learning_rate": 3.1141822525332815e-06, + "loss": 0.0981, + "step": 8117 + }, + { + "epoch": 1.315294880103694, + "grad_norm": 0.9933139681816101, + "learning_rate": 3.1137583411911954e-06, + "loss": 0.1177, + "step": 8118 + }, + { + "epoch": 1.3154569021386908, + "grad_norm": 0.9036347270011902, + "learning_rate": 3.113334411069591e-06, + "loss": 0.1096, + "step": 8119 + }, + { + "epoch": 1.3156189241736875, + "grad_norm": 0.874993622303009, + "learning_rate": 3.1129104621814365e-06, + "loss": 0.1011, + "step": 8120 + }, + { + "epoch": 1.3157809462086845, + "grad_norm": 0.870242178440094, + "learning_rate": 3.112486494539705e-06, + "loss": 0.1122, + "step": 8121 + }, + { + "epoch": 1.3159429682436812, + "grad_norm": 0.7542531490325928, + "learning_rate": 3.1120625081573696e-06, + "loss": 0.0923, + "step": 8122 + }, + { + "epoch": 1.3161049902786779, + "grad_norm": 0.8048637509346008, + "learning_rate": 3.111638503047402e-06, + "loss": 0.0955, + "step": 8123 + }, + { + "epoch": 1.3162670123136746, + "grad_norm": 0.7695650458335876, + "learning_rate": 3.1112144792227774e-06, + "loss": 0.0995, + "step": 8124 + }, + { + "epoch": 1.3164290343486713, + "grad_norm": 0.8860523700714111, + "learning_rate": 3.110790436696468e-06, + "loss": 0.113, + "step": 8125 + }, + { + "epoch": 1.3165910563836682, + "grad_norm": 0.7619615197181702, + "learning_rate": 3.1103663754814493e-06, + "loss": 0.1018, + "step": 8126 + }, + { + "epoch": 1.316753078418665, + "grad_norm": 1.0961564779281616, + "learning_rate": 3.1099422955906965e-06, + "loss": 0.1428, + "step": 8127 + }, + { + "epoch": 1.3169151004536617, + "grad_norm": 0.8678660988807678, + "learning_rate": 3.109518197037186e-06, + "loss": 0.1052, + "step": 8128 + }, + { + "epoch": 1.3170771224886584, + "grad_norm": 0.8276517987251282, + "learning_rate": 3.109094079833893e-06, + "loss": 0.0979, + "step": 8129 + }, + { + "epoch": 1.317239144523655, + "grad_norm": 0.8221682906150818, + "learning_rate": 3.1086699439937957e-06, + "loss": 0.101, + "step": 8130 + }, + { + "epoch": 1.317401166558652, + "grad_norm": 0.7636736631393433, + "learning_rate": 3.1082457895298705e-06, + "loss": 0.0981, + "step": 8131 + }, + { + "epoch": 1.3175631885936487, + "grad_norm": 0.7403510212898254, + "learning_rate": 3.1078216164550966e-06, + "loss": 0.1002, + "step": 8132 + }, + { + "epoch": 1.3177252106286454, + "grad_norm": 0.9044564962387085, + "learning_rate": 3.1073974247824523e-06, + "loss": 0.1244, + "step": 8133 + }, + { + "epoch": 1.3178872326636424, + "grad_norm": 0.7922056913375854, + "learning_rate": 3.1069732145249166e-06, + "loss": 0.1003, + "step": 8134 + }, + { + "epoch": 1.318049254698639, + "grad_norm": 0.8038402795791626, + "learning_rate": 3.106548985695469e-06, + "loss": 0.1019, + "step": 8135 + }, + { + "epoch": 1.3182112767336358, + "grad_norm": 0.7442052364349365, + "learning_rate": 3.1061247383070905e-06, + "loss": 0.0865, + "step": 8136 + }, + { + "epoch": 1.3183732987686325, + "grad_norm": 0.6717336177825928, + "learning_rate": 3.105700472372762e-06, + "loss": 0.0862, + "step": 8137 + }, + { + "epoch": 1.3185353208036292, + "grad_norm": 0.7845094799995422, + "learning_rate": 3.1052761879054637e-06, + "loss": 0.1059, + "step": 8138 + }, + { + "epoch": 1.3186973428386262, + "grad_norm": 0.9265906810760498, + "learning_rate": 3.1048518849181795e-06, + "loss": 0.1059, + "step": 8139 + }, + { + "epoch": 1.3188593648736229, + "grad_norm": 0.7679669260978699, + "learning_rate": 3.1044275634238913e-06, + "loss": 0.097, + "step": 8140 + }, + { + "epoch": 1.3190213869086196, + "grad_norm": 0.7996553182601929, + "learning_rate": 3.1040032234355827e-06, + "loss": 0.0952, + "step": 8141 + }, + { + "epoch": 1.3191834089436163, + "grad_norm": 0.8960670232772827, + "learning_rate": 3.103578864966237e-06, + "loss": 0.1098, + "step": 8142 + }, + { + "epoch": 1.319345430978613, + "grad_norm": 0.8924859166145325, + "learning_rate": 3.1031544880288384e-06, + "loss": 0.1093, + "step": 8143 + }, + { + "epoch": 1.31950745301361, + "grad_norm": 0.874876856803894, + "learning_rate": 3.1027300926363723e-06, + "loss": 0.1064, + "step": 8144 + }, + { + "epoch": 1.3196694750486067, + "grad_norm": 1.0655916929244995, + "learning_rate": 3.1023056788018234e-06, + "loss": 0.1288, + "step": 8145 + }, + { + "epoch": 1.3198314970836034, + "grad_norm": 0.8831046223640442, + "learning_rate": 3.1018812465381796e-06, + "loss": 0.1017, + "step": 8146 + }, + { + "epoch": 1.3199935191186, + "grad_norm": 0.7947402596473694, + "learning_rate": 3.1014567958584246e-06, + "loss": 0.0977, + "step": 8147 + }, + { + "epoch": 1.3201555411535968, + "grad_norm": 0.945785641670227, + "learning_rate": 3.1010323267755486e-06, + "loss": 0.1148, + "step": 8148 + }, + { + "epoch": 1.3203175631885937, + "grad_norm": 0.8066475987434387, + "learning_rate": 3.1006078393025366e-06, + "loss": 0.1053, + "step": 8149 + }, + { + "epoch": 1.3204795852235904, + "grad_norm": 0.8218502998352051, + "learning_rate": 3.100183333452379e-06, + "loss": 0.0961, + "step": 8150 + }, + { + "epoch": 1.3206416072585871, + "grad_norm": 0.7845996618270874, + "learning_rate": 3.0997588092380636e-06, + "loss": 0.1035, + "step": 8151 + }, + { + "epoch": 1.3208036292935839, + "grad_norm": 0.7798967957496643, + "learning_rate": 3.0993342666725803e-06, + "loss": 0.095, + "step": 8152 + }, + { + "epoch": 1.3209656513285806, + "grad_norm": 0.8794483542442322, + "learning_rate": 3.0989097057689175e-06, + "loss": 0.1131, + "step": 8153 + }, + { + "epoch": 1.3211276733635775, + "grad_norm": 0.8552145957946777, + "learning_rate": 3.0984851265400683e-06, + "loss": 0.0966, + "step": 8154 + }, + { + "epoch": 1.3212896953985742, + "grad_norm": 0.8511651158332825, + "learning_rate": 3.098060528999023e-06, + "loss": 0.102, + "step": 8155 + }, + { + "epoch": 1.321451717433571, + "grad_norm": 0.8631334900856018, + "learning_rate": 3.097635913158772e-06, + "loss": 0.1088, + "step": 8156 + }, + { + "epoch": 1.3216137394685679, + "grad_norm": 0.6932099461555481, + "learning_rate": 3.0972112790323076e-06, + "loss": 0.0885, + "step": 8157 + }, + { + "epoch": 1.3217757615035644, + "grad_norm": 0.8453887104988098, + "learning_rate": 3.096786626632624e-06, + "loss": 0.111, + "step": 8158 + }, + { + "epoch": 1.3219377835385613, + "grad_norm": 0.8994348049163818, + "learning_rate": 3.0963619559727143e-06, + "loss": 0.1123, + "step": 8159 + }, + { + "epoch": 1.322099805573558, + "grad_norm": 0.869174599647522, + "learning_rate": 3.0959372670655714e-06, + "loss": 0.1109, + "step": 8160 + }, + { + "epoch": 1.3222618276085547, + "grad_norm": 0.7786162495613098, + "learning_rate": 3.09551255992419e-06, + "loss": 0.0986, + "step": 8161 + }, + { + "epoch": 1.3224238496435516, + "grad_norm": 0.8360916972160339, + "learning_rate": 3.0950878345615654e-06, + "loss": 0.1046, + "step": 8162 + }, + { + "epoch": 1.3225858716785484, + "grad_norm": 0.8206416368484497, + "learning_rate": 3.0946630909906943e-06, + "loss": 0.1086, + "step": 8163 + }, + { + "epoch": 1.322747893713545, + "grad_norm": 0.904058575630188, + "learning_rate": 3.0942383292245704e-06, + "loss": 0.1145, + "step": 8164 + }, + { + "epoch": 1.3229099157485418, + "grad_norm": 0.8536134362220764, + "learning_rate": 3.0938135492761923e-06, + "loss": 0.1122, + "step": 8165 + }, + { + "epoch": 1.3230719377835385, + "grad_norm": 0.7881593108177185, + "learning_rate": 3.0933887511585564e-06, + "loss": 0.0954, + "step": 8166 + }, + { + "epoch": 1.3232339598185354, + "grad_norm": 0.8875266909599304, + "learning_rate": 3.0929639348846604e-06, + "loss": 0.1132, + "step": 8167 + }, + { + "epoch": 1.3233959818535321, + "grad_norm": 0.8193500638008118, + "learning_rate": 3.0925391004675037e-06, + "loss": 0.1013, + "step": 8168 + }, + { + "epoch": 1.3235580038885288, + "grad_norm": 0.8682994842529297, + "learning_rate": 3.0921142479200833e-06, + "loss": 0.1098, + "step": 8169 + }, + { + "epoch": 1.3237200259235256, + "grad_norm": 0.7923477292060852, + "learning_rate": 3.0916893772554006e-06, + "loss": 0.0961, + "step": 8170 + }, + { + "epoch": 1.3238820479585223, + "grad_norm": 0.8617781400680542, + "learning_rate": 3.0912644884864547e-06, + "loss": 0.1053, + "step": 8171 + }, + { + "epoch": 1.3240440699935192, + "grad_norm": 0.7619832158088684, + "learning_rate": 3.0908395816262466e-06, + "loss": 0.1016, + "step": 8172 + }, + { + "epoch": 1.324206092028516, + "grad_norm": 0.8054799437522888, + "learning_rate": 3.0904146566877762e-06, + "loss": 0.1082, + "step": 8173 + }, + { + "epoch": 1.3243681140635126, + "grad_norm": 0.8365015387535095, + "learning_rate": 3.0899897136840468e-06, + "loss": 0.1075, + "step": 8174 + }, + { + "epoch": 1.3245301360985093, + "grad_norm": 0.753804087638855, + "learning_rate": 3.0895647526280598e-06, + "loss": 0.0947, + "step": 8175 + }, + { + "epoch": 1.324692158133506, + "grad_norm": 0.9248984456062317, + "learning_rate": 3.0891397735328176e-06, + "loss": 0.1242, + "step": 8176 + }, + { + "epoch": 1.324854180168503, + "grad_norm": 0.7784146070480347, + "learning_rate": 3.088714776411325e-06, + "loss": 0.1127, + "step": 8177 + }, + { + "epoch": 1.3250162022034997, + "grad_norm": 0.8793583512306213, + "learning_rate": 3.088289761276584e-06, + "loss": 0.09, + "step": 8178 + }, + { + "epoch": 1.3251782242384964, + "grad_norm": 0.7790801525115967, + "learning_rate": 3.0878647281416007e-06, + "loss": 0.0928, + "step": 8179 + }, + { + "epoch": 1.3253402462734931, + "grad_norm": 0.8331507444381714, + "learning_rate": 3.0874396770193785e-06, + "loss": 0.1003, + "step": 8180 + }, + { + "epoch": 1.3255022683084898, + "grad_norm": 0.8366468548774719, + "learning_rate": 3.0870146079229245e-06, + "loss": 0.1221, + "step": 8181 + }, + { + "epoch": 1.3256642903434868, + "grad_norm": 0.8266817331314087, + "learning_rate": 3.0865895208652436e-06, + "loss": 0.1096, + "step": 8182 + }, + { + "epoch": 1.3258263123784835, + "grad_norm": 0.8993229866027832, + "learning_rate": 3.086164415859343e-06, + "loss": 0.1211, + "step": 8183 + }, + { + "epoch": 1.3259883344134802, + "grad_norm": 0.8876615762710571, + "learning_rate": 3.0857392929182296e-06, + "loss": 0.1092, + "step": 8184 + }, + { + "epoch": 1.3261503564484771, + "grad_norm": 0.7923975586891174, + "learning_rate": 3.085314152054911e-06, + "loss": 0.1, + "step": 8185 + }, + { + "epoch": 1.3263123784834738, + "grad_norm": 0.7979410886764526, + "learning_rate": 3.084888993282397e-06, + "loss": 0.1007, + "step": 8186 + }, + { + "epoch": 1.3264744005184705, + "grad_norm": 0.9142311215400696, + "learning_rate": 3.0844638166136943e-06, + "loss": 0.1154, + "step": 8187 + }, + { + "epoch": 1.3266364225534673, + "grad_norm": 0.8483611941337585, + "learning_rate": 3.0840386220618137e-06, + "loss": 0.1061, + "step": 8188 + }, + { + "epoch": 1.326798444588464, + "grad_norm": 0.9049779772758484, + "learning_rate": 3.0836134096397642e-06, + "loss": 0.1039, + "step": 8189 + }, + { + "epoch": 1.326960466623461, + "grad_norm": 0.8768754601478577, + "learning_rate": 3.083188179360556e-06, + "loss": 0.1141, + "step": 8190 + }, + { + "epoch": 1.3271224886584576, + "grad_norm": 0.7958570718765259, + "learning_rate": 3.082762931237202e-06, + "loss": 0.1006, + "step": 8191 + }, + { + "epoch": 1.3272845106934543, + "grad_norm": 0.8159347176551819, + "learning_rate": 3.0823376652827123e-06, + "loss": 0.1026, + "step": 8192 + }, + { + "epoch": 1.327446532728451, + "grad_norm": 0.8997524976730347, + "learning_rate": 3.081912381510099e-06, + "loss": 0.1096, + "step": 8193 + }, + { + "epoch": 1.3276085547634477, + "grad_norm": 0.8138853311538696, + "learning_rate": 3.0814870799323748e-06, + "loss": 0.1031, + "step": 8194 + }, + { + "epoch": 1.3277705767984447, + "grad_norm": 0.8049913048744202, + "learning_rate": 3.0810617605625538e-06, + "loss": 0.0991, + "step": 8195 + }, + { + "epoch": 1.3279325988334414, + "grad_norm": 0.7610349059104919, + "learning_rate": 3.080636423413649e-06, + "loss": 0.1038, + "step": 8196 + }, + { + "epoch": 1.328094620868438, + "grad_norm": 0.8136640191078186, + "learning_rate": 3.0802110684986742e-06, + "loss": 0.101, + "step": 8197 + }, + { + "epoch": 1.3282566429034348, + "grad_norm": 0.8479365706443787, + "learning_rate": 3.079785695830645e-06, + "loss": 0.1213, + "step": 8198 + }, + { + "epoch": 1.3284186649384315, + "grad_norm": 0.8573151230812073, + "learning_rate": 3.0793603054225767e-06, + "loss": 0.1135, + "step": 8199 + }, + { + "epoch": 1.3285806869734285, + "grad_norm": 0.8498213887214661, + "learning_rate": 3.0789348972874844e-06, + "loss": 0.1053, + "step": 8200 + }, + { + "epoch": 1.3287427090084252, + "grad_norm": 0.764045774936676, + "learning_rate": 3.078509471438386e-06, + "loss": 0.1018, + "step": 8201 + }, + { + "epoch": 1.3289047310434219, + "grad_norm": 0.7893896102905273, + "learning_rate": 3.0780840278882974e-06, + "loss": 0.0983, + "step": 8202 + }, + { + "epoch": 1.3290667530784186, + "grad_norm": 0.7424217462539673, + "learning_rate": 3.0776585666502367e-06, + "loss": 0.093, + "step": 8203 + }, + { + "epoch": 1.3292287751134153, + "grad_norm": 0.8020110726356506, + "learning_rate": 3.077233087737222e-06, + "loss": 0.1035, + "step": 8204 + }, + { + "epoch": 1.3293907971484122, + "grad_norm": 0.7891095280647278, + "learning_rate": 3.0768075911622712e-06, + "loss": 0.1097, + "step": 8205 + }, + { + "epoch": 1.329552819183409, + "grad_norm": 0.769567608833313, + "learning_rate": 3.0763820769384038e-06, + "loss": 0.0996, + "step": 8206 + }, + { + "epoch": 1.3297148412184057, + "grad_norm": 0.8365778923034668, + "learning_rate": 3.07595654507864e-06, + "loss": 0.0994, + "step": 8207 + }, + { + "epoch": 1.3298768632534026, + "grad_norm": 0.8200099468231201, + "learning_rate": 3.0755309955960007e-06, + "loss": 0.104, + "step": 8208 + }, + { + "epoch": 1.330038885288399, + "grad_norm": 0.7496293783187866, + "learning_rate": 3.0751054285035037e-06, + "loss": 0.1011, + "step": 8209 + }, + { + "epoch": 1.330200907323396, + "grad_norm": 0.9090161919593811, + "learning_rate": 3.074679843814174e-06, + "loss": 0.1109, + "step": 8210 + }, + { + "epoch": 1.3303629293583927, + "grad_norm": 0.8106396198272705, + "learning_rate": 3.0742542415410307e-06, + "loss": 0.0995, + "step": 8211 + }, + { + "epoch": 1.3305249513933894, + "grad_norm": 0.9422435164451599, + "learning_rate": 3.073828621697098e-06, + "loss": 0.1133, + "step": 8212 + }, + { + "epoch": 1.3306869734283864, + "grad_norm": 0.8159134387969971, + "learning_rate": 3.0734029842953976e-06, + "loss": 0.1045, + "step": 8213 + }, + { + "epoch": 1.330848995463383, + "grad_norm": 0.8325510621070862, + "learning_rate": 3.072977329348954e-06, + "loss": 0.1119, + "step": 8214 + }, + { + "epoch": 1.3310110174983798, + "grad_norm": 0.8749605417251587, + "learning_rate": 3.07255165687079e-06, + "loss": 0.1091, + "step": 8215 + }, + { + "epoch": 1.3311730395333765, + "grad_norm": 0.8472917079925537, + "learning_rate": 3.072125966873932e-06, + "loss": 0.11, + "step": 8216 + }, + { + "epoch": 1.3313350615683732, + "grad_norm": 0.9041043519973755, + "learning_rate": 3.0717002593714027e-06, + "loss": 0.1122, + "step": 8217 + }, + { + "epoch": 1.3314970836033702, + "grad_norm": 0.9191260933876038, + "learning_rate": 3.0712745343762295e-06, + "loss": 0.1157, + "step": 8218 + }, + { + "epoch": 1.3316591056383669, + "grad_norm": 0.8039606809616089, + "learning_rate": 3.070848791901438e-06, + "loss": 0.1042, + "step": 8219 + }, + { + "epoch": 1.3318211276733636, + "grad_norm": 0.8702144026756287, + "learning_rate": 3.0704230319600547e-06, + "loss": 0.1087, + "step": 8220 + }, + { + "epoch": 1.3319831497083603, + "grad_norm": 0.7728626132011414, + "learning_rate": 3.0699972545651067e-06, + "loss": 0.0971, + "step": 8221 + }, + { + "epoch": 1.332145171743357, + "grad_norm": 0.7619557976722717, + "learning_rate": 3.069571459729623e-06, + "loss": 0.092, + "step": 8222 + }, + { + "epoch": 1.332307193778354, + "grad_norm": 0.7611822485923767, + "learning_rate": 3.069145647466631e-06, + "loss": 0.1038, + "step": 8223 + }, + { + "epoch": 1.3324692158133506, + "grad_norm": 0.7991523146629333, + "learning_rate": 3.068719817789158e-06, + "loss": 0.1018, + "step": 8224 + }, + { + "epoch": 1.3326312378483474, + "grad_norm": 0.7888767719268799, + "learning_rate": 3.0682939707102366e-06, + "loss": 0.0988, + "step": 8225 + }, + { + "epoch": 1.332793259883344, + "grad_norm": 0.7734952569007874, + "learning_rate": 3.067868106242894e-06, + "loss": 0.0969, + "step": 8226 + }, + { + "epoch": 1.3329552819183408, + "grad_norm": 0.848461925983429, + "learning_rate": 3.0674422244001616e-06, + "loss": 0.1006, + "step": 8227 + }, + { + "epoch": 1.3331173039533377, + "grad_norm": 0.7484978437423706, + "learning_rate": 3.0670163251950703e-06, + "loss": 0.0981, + "step": 8228 + }, + { + "epoch": 1.3332793259883344, + "grad_norm": 0.7978498935699463, + "learning_rate": 3.0665904086406516e-06, + "loss": 0.1004, + "step": 8229 + }, + { + "epoch": 1.3334413480233311, + "grad_norm": 0.8219764828681946, + "learning_rate": 3.0661644747499385e-06, + "loss": 0.0978, + "step": 8230 + }, + { + "epoch": 1.3336033700583279, + "grad_norm": 0.910052478313446, + "learning_rate": 3.065738523535961e-06, + "loss": 0.1144, + "step": 8231 + }, + { + "epoch": 1.3337653920933246, + "grad_norm": 0.7298288941383362, + "learning_rate": 3.0653125550117547e-06, + "loss": 0.0957, + "step": 8232 + }, + { + "epoch": 1.3339274141283215, + "grad_norm": 0.7882039546966553, + "learning_rate": 3.064886569190352e-06, + "loss": 0.101, + "step": 8233 + }, + { + "epoch": 1.3340894361633182, + "grad_norm": 0.8351643085479736, + "learning_rate": 3.0644605660847875e-06, + "loss": 0.1027, + "step": 8234 + }, + { + "epoch": 1.334251458198315, + "grad_norm": 0.8841098546981812, + "learning_rate": 3.0640345457080955e-06, + "loss": 0.1066, + "step": 8235 + }, + { + "epoch": 1.3344134802333119, + "grad_norm": 0.9468151330947876, + "learning_rate": 3.0636085080733113e-06, + "loss": 0.1148, + "step": 8236 + }, + { + "epoch": 1.3345755022683086, + "grad_norm": 0.7528395056724548, + "learning_rate": 3.0631824531934707e-06, + "loss": 0.0928, + "step": 8237 + }, + { + "epoch": 1.3347375243033053, + "grad_norm": 0.81801837682724, + "learning_rate": 3.0627563810816097e-06, + "loss": 0.0954, + "step": 8238 + }, + { + "epoch": 1.334899546338302, + "grad_norm": 0.9190750122070312, + "learning_rate": 3.0623302917507657e-06, + "loss": 0.1095, + "step": 8239 + }, + { + "epoch": 1.3350615683732987, + "grad_norm": 0.8107596039772034, + "learning_rate": 3.0619041852139746e-06, + "loss": 0.1023, + "step": 8240 + }, + { + "epoch": 1.3352235904082956, + "grad_norm": 0.8642371296882629, + "learning_rate": 3.0614780614842764e-06, + "loss": 0.1095, + "step": 8241 + }, + { + "epoch": 1.3353856124432923, + "grad_norm": 0.92511385679245, + "learning_rate": 3.061051920574708e-06, + "loss": 0.1094, + "step": 8242 + }, + { + "epoch": 1.335547634478289, + "grad_norm": 0.87492835521698, + "learning_rate": 3.0606257624983082e-06, + "loss": 0.1073, + "step": 8243 + }, + { + "epoch": 1.3357096565132858, + "grad_norm": 0.7280792593955994, + "learning_rate": 3.0601995872681167e-06, + "loss": 0.0947, + "step": 8244 + }, + { + "epoch": 1.3358716785482825, + "grad_norm": 0.7862234711647034, + "learning_rate": 3.0597733948971737e-06, + "loss": 0.092, + "step": 8245 + }, + { + "epoch": 1.3360337005832794, + "grad_norm": 0.9407415390014648, + "learning_rate": 3.0593471853985197e-06, + "loss": 0.1128, + "step": 8246 + }, + { + "epoch": 1.3361957226182761, + "grad_norm": 0.7237008213996887, + "learning_rate": 3.0589209587851954e-06, + "loss": 0.0908, + "step": 8247 + }, + { + "epoch": 1.3363577446532728, + "grad_norm": 0.8024490475654602, + "learning_rate": 3.058494715070242e-06, + "loss": 0.096, + "step": 8248 + }, + { + "epoch": 1.3365197666882696, + "grad_norm": 0.7283364534378052, + "learning_rate": 3.0580684542667016e-06, + "loss": 0.0916, + "step": 8249 + }, + { + "epoch": 1.3366817887232663, + "grad_norm": 0.8358945250511169, + "learning_rate": 3.0576421763876174e-06, + "loss": 0.0954, + "step": 8250 + }, + { + "epoch": 1.3368438107582632, + "grad_norm": 0.9684699773788452, + "learning_rate": 3.0572158814460323e-06, + "loss": 0.126, + "step": 8251 + }, + { + "epoch": 1.33700583279326, + "grad_norm": 0.7843111157417297, + "learning_rate": 3.056789569454989e-06, + "loss": 0.099, + "step": 8252 + }, + { + "epoch": 1.3371678548282566, + "grad_norm": 0.7540812492370605, + "learning_rate": 3.056363240427533e-06, + "loss": 0.0931, + "step": 8253 + }, + { + "epoch": 1.3373298768632533, + "grad_norm": 0.9374097585678101, + "learning_rate": 3.055936894376708e-06, + "loss": 0.1093, + "step": 8254 + }, + { + "epoch": 1.33749189889825, + "grad_norm": 0.8523105978965759, + "learning_rate": 3.0555105313155587e-06, + "loss": 0.102, + "step": 8255 + }, + { + "epoch": 1.337653920933247, + "grad_norm": 0.914612352848053, + "learning_rate": 3.055084151257133e-06, + "loss": 0.1071, + "step": 8256 + }, + { + "epoch": 1.3378159429682437, + "grad_norm": 0.8243370056152344, + "learning_rate": 3.0546577542144734e-06, + "loss": 0.1144, + "step": 8257 + }, + { + "epoch": 1.3379779650032404, + "grad_norm": 0.7740969061851501, + "learning_rate": 3.054231340200631e-06, + "loss": 0.0894, + "step": 8258 + }, + { + "epoch": 1.3381399870382373, + "grad_norm": 0.6962242722511292, + "learning_rate": 3.053804909228649e-06, + "loss": 0.0889, + "step": 8259 + }, + { + "epoch": 1.3383020090732338, + "grad_norm": 0.7744547128677368, + "learning_rate": 3.053378461311578e-06, + "loss": 0.0924, + "step": 8260 + }, + { + "epoch": 1.3384640311082308, + "grad_norm": 0.8196523189544678, + "learning_rate": 3.052951996462465e-06, + "loss": 0.0982, + "step": 8261 + }, + { + "epoch": 1.3386260531432275, + "grad_norm": 0.7271378040313721, + "learning_rate": 3.0525255146943582e-06, + "loss": 0.0969, + "step": 8262 + }, + { + "epoch": 1.3387880751782242, + "grad_norm": 0.7864364981651306, + "learning_rate": 3.052099016020309e-06, + "loss": 0.0988, + "step": 8263 + }, + { + "epoch": 1.3389500972132211, + "grad_norm": 0.7747987508773804, + "learning_rate": 3.0516725004533648e-06, + "loss": 0.0925, + "step": 8264 + }, + { + "epoch": 1.3391121192482178, + "grad_norm": 0.8016791343688965, + "learning_rate": 3.0512459680065785e-06, + "loss": 0.1004, + "step": 8265 + }, + { + "epoch": 1.3392741412832145, + "grad_norm": 0.8657497763633728, + "learning_rate": 3.0508194186929983e-06, + "loss": 0.1072, + "step": 8266 + }, + { + "epoch": 1.3394361633182112, + "grad_norm": 0.8073965311050415, + "learning_rate": 3.0503928525256775e-06, + "loss": 0.098, + "step": 8267 + }, + { + "epoch": 1.339598185353208, + "grad_norm": 0.7711490988731384, + "learning_rate": 3.0499662695176675e-06, + "loss": 0.1004, + "step": 8268 + }, + { + "epoch": 1.339760207388205, + "grad_norm": 0.7585821151733398, + "learning_rate": 3.04953966968202e-06, + "loss": 0.0958, + "step": 8269 + }, + { + "epoch": 1.3399222294232016, + "grad_norm": 0.7577686309814453, + "learning_rate": 3.0491130530317887e-06, + "loss": 0.0968, + "step": 8270 + }, + { + "epoch": 1.3400842514581983, + "grad_norm": 0.7601026296615601, + "learning_rate": 3.048686419580027e-06, + "loss": 0.0991, + "step": 8271 + }, + { + "epoch": 1.340246273493195, + "grad_norm": 0.9418452382087708, + "learning_rate": 3.0482597693397887e-06, + "loss": 0.1168, + "step": 8272 + }, + { + "epoch": 1.3404082955281917, + "grad_norm": 0.8291217684745789, + "learning_rate": 3.047833102324128e-06, + "loss": 0.1024, + "step": 8273 + }, + { + "epoch": 1.3405703175631887, + "grad_norm": 0.8256798982620239, + "learning_rate": 3.0474064185461e-06, + "loss": 0.1015, + "step": 8274 + }, + { + "epoch": 1.3407323395981854, + "grad_norm": 0.823760986328125, + "learning_rate": 3.0469797180187606e-06, + "loss": 0.1101, + "step": 8275 + }, + { + "epoch": 1.340894361633182, + "grad_norm": 0.7810642123222351, + "learning_rate": 3.0465530007551646e-06, + "loss": 0.1018, + "step": 8276 + }, + { + "epoch": 1.3410563836681788, + "grad_norm": 0.8536295890808105, + "learning_rate": 3.04612626676837e-06, + "loss": 0.1047, + "step": 8277 + }, + { + "epoch": 1.3412184057031755, + "grad_norm": 0.837985098361969, + "learning_rate": 3.0456995160714344e-06, + "loss": 0.1103, + "step": 8278 + }, + { + "epoch": 1.3413804277381725, + "grad_norm": 0.765397846698761, + "learning_rate": 3.0452727486774118e-06, + "loss": 0.0972, + "step": 8279 + }, + { + "epoch": 1.3415424497731692, + "grad_norm": 0.8630374073982239, + "learning_rate": 3.044845964599365e-06, + "loss": 0.0982, + "step": 8280 + }, + { + "epoch": 1.3417044718081659, + "grad_norm": 0.7347887754440308, + "learning_rate": 3.044419163850349e-06, + "loss": 0.0935, + "step": 8281 + }, + { + "epoch": 1.3418664938431626, + "grad_norm": 0.773932158946991, + "learning_rate": 3.043992346443424e-06, + "loss": 0.0996, + "step": 8282 + }, + { + "epoch": 1.3420285158781593, + "grad_norm": 0.77265864610672, + "learning_rate": 3.04356551239165e-06, + "loss": 0.0913, + "step": 8283 + }, + { + "epoch": 1.3421905379131562, + "grad_norm": 0.7659427523612976, + "learning_rate": 3.043138661708086e-06, + "loss": 0.1001, + "step": 8284 + }, + { + "epoch": 1.342352559948153, + "grad_norm": 0.8944420218467712, + "learning_rate": 3.0427117944057943e-06, + "loss": 0.1002, + "step": 8285 + }, + { + "epoch": 1.3425145819831497, + "grad_norm": 0.8467785716056824, + "learning_rate": 3.042284910497834e-06, + "loss": 0.0995, + "step": 8286 + }, + { + "epoch": 1.3426766040181466, + "grad_norm": 0.9520701766014099, + "learning_rate": 3.0418580099972687e-06, + "loss": 0.1151, + "step": 8287 + }, + { + "epoch": 1.3428386260531433, + "grad_norm": 0.7847069501876831, + "learning_rate": 3.0414310929171587e-06, + "loss": 0.1078, + "step": 8288 + }, + { + "epoch": 1.34300064808814, + "grad_norm": 0.9613496661186218, + "learning_rate": 3.0410041592705687e-06, + "loss": 0.1154, + "step": 8289 + }, + { + "epoch": 1.3431626701231367, + "grad_norm": 0.8269860148429871, + "learning_rate": 3.04057720907056e-06, + "loss": 0.096, + "step": 8290 + }, + { + "epoch": 1.3433246921581334, + "grad_norm": 0.8524847626686096, + "learning_rate": 3.0401502423301966e-06, + "loss": 0.111, + "step": 8291 + }, + { + "epoch": 1.3434867141931304, + "grad_norm": 0.7680644392967224, + "learning_rate": 3.039723259062543e-06, + "loss": 0.0969, + "step": 8292 + }, + { + "epoch": 1.343648736228127, + "grad_norm": 0.8680287003517151, + "learning_rate": 3.0392962592806635e-06, + "loss": 0.1138, + "step": 8293 + }, + { + "epoch": 1.3438107582631238, + "grad_norm": 0.8054776191711426, + "learning_rate": 3.0388692429976247e-06, + "loss": 0.0998, + "step": 8294 + }, + { + "epoch": 1.3439727802981205, + "grad_norm": 0.8584299087524414, + "learning_rate": 3.03844221022649e-06, + "loss": 0.1017, + "step": 8295 + }, + { + "epoch": 1.3441348023331172, + "grad_norm": 0.9022834300994873, + "learning_rate": 3.038015160980327e-06, + "loss": 0.1122, + "step": 8296 + }, + { + "epoch": 1.3442968243681142, + "grad_norm": 0.9513327479362488, + "learning_rate": 3.037588095272202e-06, + "loss": 0.1153, + "step": 8297 + }, + { + "epoch": 1.3444588464031109, + "grad_norm": 0.7637044787406921, + "learning_rate": 3.0371610131151823e-06, + "loss": 0.1051, + "step": 8298 + }, + { + "epoch": 1.3446208684381076, + "grad_norm": 0.8572631478309631, + "learning_rate": 3.0367339145223352e-06, + "loss": 0.1051, + "step": 8299 + }, + { + "epoch": 1.3447828904731043, + "grad_norm": 0.7625088095664978, + "learning_rate": 3.0363067995067297e-06, + "loss": 0.1026, + "step": 8300 + }, + { + "epoch": 1.344944912508101, + "grad_norm": 0.9297232627868652, + "learning_rate": 3.0358796680814333e-06, + "loss": 0.1002, + "step": 8301 + }, + { + "epoch": 1.345106934543098, + "grad_norm": 0.8700084090232849, + "learning_rate": 3.035452520259517e-06, + "loss": 0.1198, + "step": 8302 + }, + { + "epoch": 1.3452689565780946, + "grad_norm": 0.8522663116455078, + "learning_rate": 3.035025356054049e-06, + "loss": 0.106, + "step": 8303 + }, + { + "epoch": 1.3454309786130914, + "grad_norm": 0.7052852511405945, + "learning_rate": 3.034598175478099e-06, + "loss": 0.0937, + "step": 8304 + }, + { + "epoch": 1.345593000648088, + "grad_norm": 0.8086321353912354, + "learning_rate": 3.034170978544739e-06, + "loss": 0.1022, + "step": 8305 + }, + { + "epoch": 1.3457550226830848, + "grad_norm": 0.8114489912986755, + "learning_rate": 3.03374376526704e-06, + "loss": 0.1035, + "step": 8306 + }, + { + "epoch": 1.3459170447180817, + "grad_norm": 0.8548650741577148, + "learning_rate": 3.033316535658073e-06, + "loss": 0.1087, + "step": 8307 + }, + { + "epoch": 1.3460790667530784, + "grad_norm": 1.0227341651916504, + "learning_rate": 3.0328892897309105e-06, + "loss": 0.1139, + "step": 8308 + }, + { + "epoch": 1.3462410887880751, + "grad_norm": 0.9773898720741272, + "learning_rate": 3.032462027498626e-06, + "loss": 0.1205, + "step": 8309 + }, + { + "epoch": 1.346403110823072, + "grad_norm": 0.853813111782074, + "learning_rate": 3.0320347489742905e-06, + "loss": 0.1066, + "step": 8310 + }, + { + "epoch": 1.3465651328580686, + "grad_norm": 0.7821661829948425, + "learning_rate": 3.0316074541709813e-06, + "loss": 0.105, + "step": 8311 + }, + { + "epoch": 1.3467271548930655, + "grad_norm": 0.7272818684577942, + "learning_rate": 3.031180143101769e-06, + "loss": 0.0946, + "step": 8312 + }, + { + "epoch": 1.3468891769280622, + "grad_norm": 0.8688091039657593, + "learning_rate": 3.0307528157797306e-06, + "loss": 0.1046, + "step": 8313 + }, + { + "epoch": 1.347051198963059, + "grad_norm": 0.8635636568069458, + "learning_rate": 3.03032547221794e-06, + "loss": 0.1076, + "step": 8314 + }, + { + "epoch": 1.3472132209980558, + "grad_norm": 0.8457701802253723, + "learning_rate": 3.029898112429473e-06, + "loss": 0.1164, + "step": 8315 + }, + { + "epoch": 1.3473752430330526, + "grad_norm": 0.8496267199516296, + "learning_rate": 3.0294707364274066e-06, + "loss": 0.1043, + "step": 8316 + }, + { + "epoch": 1.3475372650680493, + "grad_norm": 0.7697789669036865, + "learning_rate": 3.0290433442248163e-06, + "loss": 0.0969, + "step": 8317 + }, + { + "epoch": 1.347699287103046, + "grad_norm": 0.8936092257499695, + "learning_rate": 3.028615935834781e-06, + "loss": 0.1069, + "step": 8318 + }, + { + "epoch": 1.3478613091380427, + "grad_norm": 0.7584058046340942, + "learning_rate": 3.028188511270376e-06, + "loss": 0.0984, + "step": 8319 + }, + { + "epoch": 1.3480233311730396, + "grad_norm": 0.7890931963920593, + "learning_rate": 3.027761070544682e-06, + "loss": 0.1024, + "step": 8320 + }, + { + "epoch": 1.3481853532080363, + "grad_norm": 0.7327829599380493, + "learning_rate": 3.027333613670775e-06, + "loss": 0.09, + "step": 8321 + }, + { + "epoch": 1.348347375243033, + "grad_norm": 0.7482281923294067, + "learning_rate": 3.026906140661737e-06, + "loss": 0.102, + "step": 8322 + }, + { + "epoch": 1.3485093972780298, + "grad_norm": 0.7569482922554016, + "learning_rate": 3.0264786515306453e-06, + "loss": 0.096, + "step": 8323 + }, + { + "epoch": 1.3486714193130265, + "grad_norm": 0.7618415355682373, + "learning_rate": 3.026051146290581e-06, + "loss": 0.0961, + "step": 8324 + }, + { + "epoch": 1.3488334413480234, + "grad_norm": 0.8373454213142395, + "learning_rate": 3.0256236249546256e-06, + "loss": 0.103, + "step": 8325 + }, + { + "epoch": 1.3489954633830201, + "grad_norm": 0.9048294425010681, + "learning_rate": 3.025196087535858e-06, + "loss": 0.111, + "step": 8326 + }, + { + "epoch": 1.3491574854180168, + "grad_norm": 0.8331298232078552, + "learning_rate": 3.024768534047362e-06, + "loss": 0.1084, + "step": 8327 + }, + { + "epoch": 1.3493195074530135, + "grad_norm": 0.8242894411087036, + "learning_rate": 3.024340964502218e-06, + "loss": 0.1079, + "step": 8328 + }, + { + "epoch": 1.3494815294880103, + "grad_norm": 0.7231031060218811, + "learning_rate": 3.0239133789135094e-06, + "loss": 0.0966, + "step": 8329 + }, + { + "epoch": 1.3496435515230072, + "grad_norm": 0.7646691799163818, + "learning_rate": 3.0234857772943197e-06, + "loss": 0.0967, + "step": 8330 + }, + { + "epoch": 1.349805573558004, + "grad_norm": 0.7422172427177429, + "learning_rate": 3.023058159657732e-06, + "loss": 0.0914, + "step": 8331 + }, + { + "epoch": 1.3499675955930006, + "grad_norm": 0.7762441635131836, + "learning_rate": 3.0226305260168298e-06, + "loss": 0.0945, + "step": 8332 + }, + { + "epoch": 1.3501296176279973, + "grad_norm": 0.9666473865509033, + "learning_rate": 3.0222028763846994e-06, + "loss": 0.1287, + "step": 8333 + }, + { + "epoch": 1.350291639662994, + "grad_norm": 0.8605021834373474, + "learning_rate": 3.0217752107744237e-06, + "loss": 0.1062, + "step": 8334 + }, + { + "epoch": 1.350453661697991, + "grad_norm": 0.877007246017456, + "learning_rate": 3.0213475291990897e-06, + "loss": 0.1046, + "step": 8335 + }, + { + "epoch": 1.3506156837329877, + "grad_norm": 0.7591307163238525, + "learning_rate": 3.0209198316717825e-06, + "loss": 0.0909, + "step": 8336 + }, + { + "epoch": 1.3507777057679844, + "grad_norm": 0.9145157337188721, + "learning_rate": 3.020492118205589e-06, + "loss": 0.1147, + "step": 8337 + }, + { + "epoch": 1.3509397278029813, + "grad_norm": 0.8037732243537903, + "learning_rate": 3.0200643888135973e-06, + "loss": 0.1012, + "step": 8338 + }, + { + "epoch": 1.351101749837978, + "grad_norm": 0.9152517318725586, + "learning_rate": 3.0196366435088926e-06, + "loss": 0.115, + "step": 8339 + }, + { + "epoch": 1.3512637718729748, + "grad_norm": 0.868812620639801, + "learning_rate": 3.019208882304565e-06, + "loss": 0.1139, + "step": 8340 + }, + { + "epoch": 1.3514257939079715, + "grad_norm": 0.7427473068237305, + "learning_rate": 3.018781105213701e-06, + "loss": 0.0896, + "step": 8341 + }, + { + "epoch": 1.3515878159429682, + "grad_norm": 0.7403958439826965, + "learning_rate": 3.0183533122493917e-06, + "loss": 0.0943, + "step": 8342 + }, + { + "epoch": 1.351749837977965, + "grad_norm": 0.8998346328735352, + "learning_rate": 3.017925503424725e-06, + "loss": 0.1174, + "step": 8343 + }, + { + "epoch": 1.3519118600129618, + "grad_norm": 0.8514041304588318, + "learning_rate": 3.017497678752791e-06, + "loss": 0.1092, + "step": 8344 + }, + { + "epoch": 1.3520738820479585, + "grad_norm": 0.8314045667648315, + "learning_rate": 3.0170698382466805e-06, + "loss": 0.1074, + "step": 8345 + }, + { + "epoch": 1.3522359040829552, + "grad_norm": 0.9838075041770935, + "learning_rate": 3.016641981919485e-06, + "loss": 0.111, + "step": 8346 + }, + { + "epoch": 1.352397926117952, + "grad_norm": 0.9298703670501709, + "learning_rate": 3.0162141097842943e-06, + "loss": 0.1102, + "step": 8347 + }, + { + "epoch": 1.3525599481529489, + "grad_norm": 0.7722398638725281, + "learning_rate": 3.0157862218542004e-06, + "loss": 0.0921, + "step": 8348 + }, + { + "epoch": 1.3527219701879456, + "grad_norm": 0.9279158711433411, + "learning_rate": 3.015358318142298e-06, + "loss": 0.1113, + "step": 8349 + }, + { + "epoch": 1.3528839922229423, + "grad_norm": 0.8341646194458008, + "learning_rate": 3.0149303986616772e-06, + "loss": 0.1125, + "step": 8350 + }, + { + "epoch": 1.353046014257939, + "grad_norm": 0.8737770915031433, + "learning_rate": 3.0145024634254323e-06, + "loss": 0.1119, + "step": 8351 + }, + { + "epoch": 1.3532080362929357, + "grad_norm": 0.8188356757164001, + "learning_rate": 3.014074512446657e-06, + "loss": 0.0972, + "step": 8352 + }, + { + "epoch": 1.3533700583279327, + "grad_norm": 0.8271098136901855, + "learning_rate": 3.0136465457384454e-06, + "loss": 0.1017, + "step": 8353 + }, + { + "epoch": 1.3535320803629294, + "grad_norm": 0.852016031742096, + "learning_rate": 3.0132185633138934e-06, + "loss": 0.115, + "step": 8354 + }, + { + "epoch": 1.353694102397926, + "grad_norm": 0.8249271512031555, + "learning_rate": 3.0127905651860946e-06, + "loss": 0.0977, + "step": 8355 + }, + { + "epoch": 1.3538561244329228, + "grad_norm": 0.7700305581092834, + "learning_rate": 3.0123625513681463e-06, + "loss": 0.1055, + "step": 8356 + }, + { + "epoch": 1.3540181464679195, + "grad_norm": 0.9442518353462219, + "learning_rate": 3.0119345218731433e-06, + "loss": 0.1065, + "step": 8357 + }, + { + "epoch": 1.3541801685029164, + "grad_norm": 0.8089423179626465, + "learning_rate": 3.0115064767141827e-06, + "loss": 0.0996, + "step": 8358 + }, + { + "epoch": 1.3543421905379132, + "grad_norm": 0.9584595561027527, + "learning_rate": 3.0110784159043614e-06, + "loss": 0.1212, + "step": 8359 + }, + { + "epoch": 1.3545042125729099, + "grad_norm": 0.7914947867393494, + "learning_rate": 3.0106503394567775e-06, + "loss": 0.1007, + "step": 8360 + }, + { + "epoch": 1.3546662346079068, + "grad_norm": 0.7712557911872864, + "learning_rate": 3.0102222473845296e-06, + "loss": 0.1009, + "step": 8361 + }, + { + "epoch": 1.3548282566429035, + "grad_norm": 0.9304143190383911, + "learning_rate": 3.0097941397007156e-06, + "loss": 0.1097, + "step": 8362 + }, + { + "epoch": 1.3549902786779002, + "grad_norm": 0.7734033465385437, + "learning_rate": 3.0093660164184333e-06, + "loss": 0.0984, + "step": 8363 + }, + { + "epoch": 1.355152300712897, + "grad_norm": 0.7931289672851562, + "learning_rate": 3.008937877550785e-06, + "loss": 0.1012, + "step": 8364 + }, + { + "epoch": 1.3553143227478937, + "grad_norm": 0.8496924638748169, + "learning_rate": 3.008509723110869e-06, + "loss": 0.1057, + "step": 8365 + }, + { + "epoch": 1.3554763447828906, + "grad_norm": 0.8549742102622986, + "learning_rate": 3.008081553111786e-06, + "loss": 0.1145, + "step": 8366 + }, + { + "epoch": 1.3556383668178873, + "grad_norm": 0.8030797243118286, + "learning_rate": 3.007653367566636e-06, + "loss": 0.1055, + "step": 8367 + }, + { + "epoch": 1.355800388852884, + "grad_norm": 0.9936630725860596, + "learning_rate": 3.0072251664885222e-06, + "loss": 0.1309, + "step": 8368 + }, + { + "epoch": 1.3559624108878807, + "grad_norm": 0.8453781604766846, + "learning_rate": 3.0067969498905463e-06, + "loss": 0.0999, + "step": 8369 + }, + { + "epoch": 1.3561244329228774, + "grad_norm": 0.8510198593139648, + "learning_rate": 3.006368717785809e-06, + "loss": 0.103, + "step": 8370 + }, + { + "epoch": 1.3562864549578744, + "grad_norm": 0.770231306552887, + "learning_rate": 3.0059404701874157e-06, + "loss": 0.0957, + "step": 8371 + }, + { + "epoch": 1.356448476992871, + "grad_norm": 0.7896842956542969, + "learning_rate": 3.005512207108467e-06, + "loss": 0.0984, + "step": 8372 + }, + { + "epoch": 1.3566104990278678, + "grad_norm": 0.7394090294837952, + "learning_rate": 3.005083928562069e-06, + "loss": 0.094, + "step": 8373 + }, + { + "epoch": 1.3567725210628645, + "grad_norm": 0.8859434127807617, + "learning_rate": 3.004655634561325e-06, + "loss": 0.0986, + "step": 8374 + }, + { + "epoch": 1.3569345430978612, + "grad_norm": 1.0016928911209106, + "learning_rate": 3.004227325119339e-06, + "loss": 0.1174, + "step": 8375 + }, + { + "epoch": 1.3570965651328581, + "grad_norm": 0.7929229736328125, + "learning_rate": 3.003799000249218e-06, + "loss": 0.1043, + "step": 8376 + }, + { + "epoch": 1.3572585871678549, + "grad_norm": 0.8156124949455261, + "learning_rate": 3.0033706599640665e-06, + "loss": 0.106, + "step": 8377 + }, + { + "epoch": 1.3574206092028516, + "grad_norm": 0.8548935055732727, + "learning_rate": 3.002942304276991e-06, + "loss": 0.111, + "step": 8378 + }, + { + "epoch": 1.3575826312378483, + "grad_norm": 0.8992919325828552, + "learning_rate": 3.0025139332010976e-06, + "loss": 0.0987, + "step": 8379 + }, + { + "epoch": 1.357744653272845, + "grad_norm": 0.8693149089813232, + "learning_rate": 3.002085546749495e-06, + "loss": 0.1025, + "step": 8380 + }, + { + "epoch": 1.357906675307842, + "grad_norm": 0.8654988408088684, + "learning_rate": 3.0016571449352882e-06, + "loss": 0.0976, + "step": 8381 + }, + { + "epoch": 1.3580686973428386, + "grad_norm": 0.8404430747032166, + "learning_rate": 3.001228727771588e-06, + "loss": 0.1063, + "step": 8382 + }, + { + "epoch": 1.3582307193778353, + "grad_norm": 0.9719839692115784, + "learning_rate": 3.0008002952715008e-06, + "loss": 0.121, + "step": 8383 + }, + { + "epoch": 1.3583927414128323, + "grad_norm": 0.9208077788352966, + "learning_rate": 3.000371847448137e-06, + "loss": 0.11, + "step": 8384 + }, + { + "epoch": 1.3585547634478288, + "grad_norm": 0.7984682321548462, + "learning_rate": 2.9999433843146055e-06, + "loss": 0.0949, + "step": 8385 + }, + { + "epoch": 1.3587167854828257, + "grad_norm": 0.8321592807769775, + "learning_rate": 2.9995149058840157e-06, + "loss": 0.1012, + "step": 8386 + }, + { + "epoch": 1.3588788075178224, + "grad_norm": 0.8269400596618652, + "learning_rate": 2.9990864121694795e-06, + "loss": 0.1062, + "step": 8387 + }, + { + "epoch": 1.3590408295528191, + "grad_norm": 0.862639844417572, + "learning_rate": 2.998657903184107e-06, + "loss": 0.0967, + "step": 8388 + }, + { + "epoch": 1.359202851587816, + "grad_norm": 0.8576428890228271, + "learning_rate": 2.9982293789410083e-06, + "loss": 0.0987, + "step": 8389 + }, + { + "epoch": 1.3593648736228128, + "grad_norm": 0.9003967642784119, + "learning_rate": 2.9978008394532966e-06, + "loss": 0.1054, + "step": 8390 + }, + { + "epoch": 1.3595268956578095, + "grad_norm": 0.8770749568939209, + "learning_rate": 2.997372284734084e-06, + "loss": 0.1018, + "step": 8391 + }, + { + "epoch": 1.3596889176928062, + "grad_norm": 0.8922051191329956, + "learning_rate": 2.996943714796483e-06, + "loss": 0.1078, + "step": 8392 + }, + { + "epoch": 1.359850939727803, + "grad_norm": 0.8432496190071106, + "learning_rate": 2.9965151296536076e-06, + "loss": 0.1048, + "step": 8393 + }, + { + "epoch": 1.3600129617627998, + "grad_norm": 0.8538945913314819, + "learning_rate": 2.9960865293185697e-06, + "loss": 0.1141, + "step": 8394 + }, + { + "epoch": 1.3601749837977966, + "grad_norm": 0.9138311147689819, + "learning_rate": 2.9956579138044857e-06, + "loss": 0.1115, + "step": 8395 + }, + { + "epoch": 1.3603370058327933, + "grad_norm": 0.8055441379547119, + "learning_rate": 2.995229283124468e-06, + "loss": 0.0974, + "step": 8396 + }, + { + "epoch": 1.36049902786779, + "grad_norm": 0.7506464719772339, + "learning_rate": 2.9948006372916332e-06, + "loss": 0.0969, + "step": 8397 + }, + { + "epoch": 1.3606610499027867, + "grad_norm": 0.7606310248374939, + "learning_rate": 2.994371976319096e-06, + "loss": 0.1064, + "step": 8398 + }, + { + "epoch": 1.3608230719377836, + "grad_norm": 0.8419187068939209, + "learning_rate": 2.993943300219973e-06, + "loss": 0.1104, + "step": 8399 + }, + { + "epoch": 1.3609850939727803, + "grad_norm": 0.7976471781730652, + "learning_rate": 2.993514609007381e-06, + "loss": 0.0947, + "step": 8400 + }, + { + "epoch": 1.361147116007777, + "grad_norm": 0.935541033744812, + "learning_rate": 2.993085902694434e-06, + "loss": 0.1214, + "step": 8401 + }, + { + "epoch": 1.3613091380427738, + "grad_norm": 1.0250697135925293, + "learning_rate": 2.992657181294254e-06, + "loss": 0.1132, + "step": 8402 + }, + { + "epoch": 1.3614711600777705, + "grad_norm": 0.7330549955368042, + "learning_rate": 2.9922284448199548e-06, + "loss": 0.0954, + "step": 8403 + }, + { + "epoch": 1.3616331821127674, + "grad_norm": 0.7346470355987549, + "learning_rate": 2.9917996932846572e-06, + "loss": 0.0933, + "step": 8404 + }, + { + "epoch": 1.3617952041477641, + "grad_norm": 0.8301489353179932, + "learning_rate": 2.991370926701479e-06, + "loss": 0.0975, + "step": 8405 + }, + { + "epoch": 1.3619572261827608, + "grad_norm": 0.8129703402519226, + "learning_rate": 2.99094214508354e-06, + "loss": 0.0998, + "step": 8406 + }, + { + "epoch": 1.3621192482177575, + "grad_norm": 0.9655615091323853, + "learning_rate": 2.9905133484439585e-06, + "loss": 0.1182, + "step": 8407 + }, + { + "epoch": 1.3622812702527543, + "grad_norm": 0.8164478540420532, + "learning_rate": 2.990084536795856e-06, + "loss": 0.1012, + "step": 8408 + }, + { + "epoch": 1.3624432922877512, + "grad_norm": 0.7988752722740173, + "learning_rate": 2.989655710152353e-06, + "loss": 0.095, + "step": 8409 + }, + { + "epoch": 1.362605314322748, + "grad_norm": 0.7468502521514893, + "learning_rate": 2.989226868526569e-06, + "loss": 0.09, + "step": 8410 + }, + { + "epoch": 1.3627673363577446, + "grad_norm": 0.816019594669342, + "learning_rate": 2.9887980119316284e-06, + "loss": 0.0967, + "step": 8411 + }, + { + "epoch": 1.3629293583927415, + "grad_norm": 0.7134393453598022, + "learning_rate": 2.98836914038065e-06, + "loss": 0.093, + "step": 8412 + }, + { + "epoch": 1.3630913804277383, + "grad_norm": 0.8280972242355347, + "learning_rate": 2.9879402538867584e-06, + "loss": 0.0959, + "step": 8413 + }, + { + "epoch": 1.363253402462735, + "grad_norm": 0.9556366801261902, + "learning_rate": 2.987511352463076e-06, + "loss": 0.1118, + "step": 8414 + }, + { + "epoch": 1.3634154244977317, + "grad_norm": 0.9054288268089294, + "learning_rate": 2.9870824361227257e-06, + "loss": 0.1048, + "step": 8415 + }, + { + "epoch": 1.3635774465327284, + "grad_norm": 0.9046202898025513, + "learning_rate": 2.9866535048788314e-06, + "loss": 0.1089, + "step": 8416 + }, + { + "epoch": 1.3637394685677253, + "grad_norm": 0.8369488716125488, + "learning_rate": 2.986224558744519e-06, + "loss": 0.1042, + "step": 8417 + }, + { + "epoch": 1.363901490602722, + "grad_norm": 0.710746705532074, + "learning_rate": 2.9857955977329095e-06, + "loss": 0.0917, + "step": 8418 + }, + { + "epoch": 1.3640635126377187, + "grad_norm": 0.8769986629486084, + "learning_rate": 2.985366621857132e-06, + "loss": 0.0997, + "step": 8419 + }, + { + "epoch": 1.3642255346727155, + "grad_norm": 0.8238828182220459, + "learning_rate": 2.9849376311303095e-06, + "loss": 0.1029, + "step": 8420 + }, + { + "epoch": 1.3643875567077122, + "grad_norm": 0.7568957209587097, + "learning_rate": 2.9845086255655692e-06, + "loss": 0.0914, + "step": 8421 + }, + { + "epoch": 1.364549578742709, + "grad_norm": 0.7604775428771973, + "learning_rate": 2.984079605176038e-06, + "loss": 0.1007, + "step": 8422 + }, + { + "epoch": 1.3647116007777058, + "grad_norm": 0.9038381576538086, + "learning_rate": 2.9836505699748414e-06, + "loss": 0.1094, + "step": 8423 + }, + { + "epoch": 1.3648736228127025, + "grad_norm": 0.9362145066261292, + "learning_rate": 2.9832215199751085e-06, + "loss": 0.1154, + "step": 8424 + }, + { + "epoch": 1.3650356448476992, + "grad_norm": 0.8526574969291687, + "learning_rate": 2.9827924551899657e-06, + "loss": 0.1054, + "step": 8425 + }, + { + "epoch": 1.365197666882696, + "grad_norm": 0.8114335536956787, + "learning_rate": 2.9823633756325433e-06, + "loss": 0.1028, + "step": 8426 + }, + { + "epoch": 1.3653596889176929, + "grad_norm": 0.814687967300415, + "learning_rate": 2.9819342813159674e-06, + "loss": 0.0995, + "step": 8427 + }, + { + "epoch": 1.3655217109526896, + "grad_norm": 0.7649007439613342, + "learning_rate": 2.9815051722533707e-06, + "loss": 0.0909, + "step": 8428 + }, + { + "epoch": 1.3656837329876863, + "grad_norm": 0.9378827214241028, + "learning_rate": 2.9810760484578794e-06, + "loss": 0.1072, + "step": 8429 + }, + { + "epoch": 1.365845755022683, + "grad_norm": 0.841758668422699, + "learning_rate": 2.9806469099426254e-06, + "loss": 0.1043, + "step": 8430 + }, + { + "epoch": 1.3660077770576797, + "grad_norm": 0.7740828990936279, + "learning_rate": 2.98021775672074e-06, + "loss": 0.0929, + "step": 8431 + }, + { + "epoch": 1.3661697990926767, + "grad_norm": 0.7625635266304016, + "learning_rate": 2.9797885888053517e-06, + "loss": 0.0944, + "step": 8432 + }, + { + "epoch": 1.3663318211276734, + "grad_norm": 0.8351330757141113, + "learning_rate": 2.9793594062095955e-06, + "loss": 0.1061, + "step": 8433 + }, + { + "epoch": 1.36649384316267, + "grad_norm": 0.791860044002533, + "learning_rate": 2.9789302089466e-06, + "loss": 0.0981, + "step": 8434 + }, + { + "epoch": 1.366655865197667, + "grad_norm": 0.8011177778244019, + "learning_rate": 2.9785009970294997e-06, + "loss": 0.1084, + "step": 8435 + }, + { + "epoch": 1.3668178872326635, + "grad_norm": 0.9865897297859192, + "learning_rate": 2.978071770471427e-06, + "loss": 0.1206, + "step": 8436 + }, + { + "epoch": 1.3669799092676604, + "grad_norm": 0.9098086357116699, + "learning_rate": 2.977642529285515e-06, + "loss": 0.1129, + "step": 8437 + }, + { + "epoch": 1.3671419313026572, + "grad_norm": 0.9035070538520813, + "learning_rate": 2.9772132734848974e-06, + "loss": 0.111, + "step": 8438 + }, + { + "epoch": 1.3673039533376539, + "grad_norm": 0.8441570401191711, + "learning_rate": 2.9767840030827082e-06, + "loss": 0.1142, + "step": 8439 + }, + { + "epoch": 1.3674659753726508, + "grad_norm": 0.857961118221283, + "learning_rate": 2.9763547180920825e-06, + "loss": 0.108, + "step": 8440 + }, + { + "epoch": 1.3676279974076475, + "grad_norm": 0.7608411312103271, + "learning_rate": 2.9759254185261555e-06, + "loss": 0.0939, + "step": 8441 + }, + { + "epoch": 1.3677900194426442, + "grad_norm": 0.8588743209838867, + "learning_rate": 2.9754961043980623e-06, + "loss": 0.1139, + "step": 8442 + }, + { + "epoch": 1.367952041477641, + "grad_norm": 0.771369993686676, + "learning_rate": 2.9750667757209385e-06, + "loss": 0.1037, + "step": 8443 + }, + { + "epoch": 1.3681140635126376, + "grad_norm": 0.8773581385612488, + "learning_rate": 2.9746374325079213e-06, + "loss": 0.1133, + "step": 8444 + }, + { + "epoch": 1.3682760855476346, + "grad_norm": 0.798323392868042, + "learning_rate": 2.9742080747721473e-06, + "loss": 0.1001, + "step": 8445 + }, + { + "epoch": 1.3684381075826313, + "grad_norm": 0.826897144317627, + "learning_rate": 2.973778702526754e-06, + "loss": 0.1104, + "step": 8446 + }, + { + "epoch": 1.368600129617628, + "grad_norm": 0.8716105222702026, + "learning_rate": 2.973349315784878e-06, + "loss": 0.1109, + "step": 8447 + }, + { + "epoch": 1.3687621516526247, + "grad_norm": 0.9297152757644653, + "learning_rate": 2.97291991455966e-06, + "loss": 0.1176, + "step": 8448 + }, + { + "epoch": 1.3689241736876214, + "grad_norm": 0.8426374197006226, + "learning_rate": 2.9724904988642357e-06, + "loss": 0.0943, + "step": 8449 + }, + { + "epoch": 1.3690861957226184, + "grad_norm": 0.7848527431488037, + "learning_rate": 2.9720610687117462e-06, + "loss": 0.1075, + "step": 8450 + }, + { + "epoch": 1.369248217757615, + "grad_norm": 0.822970986366272, + "learning_rate": 2.9716316241153303e-06, + "loss": 0.1023, + "step": 8451 + }, + { + "epoch": 1.3694102397926118, + "grad_norm": 0.8297960162162781, + "learning_rate": 2.971202165088128e-06, + "loss": 0.1, + "step": 8452 + }, + { + "epoch": 1.3695722618276085, + "grad_norm": 0.799751341342926, + "learning_rate": 2.9707726916432793e-06, + "loss": 0.1051, + "step": 8453 + }, + { + "epoch": 1.3697342838626052, + "grad_norm": 0.8356010317802429, + "learning_rate": 2.9703432037939255e-06, + "loss": 0.1089, + "step": 8454 + }, + { + "epoch": 1.3698963058976021, + "grad_norm": 0.8152604699134827, + "learning_rate": 2.969913701553209e-06, + "loss": 0.1082, + "step": 8455 + }, + { + "epoch": 1.3700583279325989, + "grad_norm": 0.8205832839012146, + "learning_rate": 2.9694841849342688e-06, + "loss": 0.1016, + "step": 8456 + }, + { + "epoch": 1.3702203499675956, + "grad_norm": 0.9030212163925171, + "learning_rate": 2.9690546539502496e-06, + "loss": 0.1063, + "step": 8457 + }, + { + "epoch": 1.3703823720025923, + "grad_norm": 0.8552389144897461, + "learning_rate": 2.9686251086142927e-06, + "loss": 0.1158, + "step": 8458 + }, + { + "epoch": 1.370544394037589, + "grad_norm": 0.8218772411346436, + "learning_rate": 2.968195548939542e-06, + "loss": 0.1007, + "step": 8459 + }, + { + "epoch": 1.370706416072586, + "grad_norm": 0.8865529894828796, + "learning_rate": 2.9677659749391404e-06, + "loss": 0.1133, + "step": 8460 + }, + { + "epoch": 1.3708684381075826, + "grad_norm": 0.8135964274406433, + "learning_rate": 2.967336386626232e-06, + "loss": 0.0994, + "step": 8461 + }, + { + "epoch": 1.3710304601425793, + "grad_norm": 0.7750198841094971, + "learning_rate": 2.9669067840139603e-06, + "loss": 0.1008, + "step": 8462 + }, + { + "epoch": 1.3711924821775763, + "grad_norm": 0.7373397946357727, + "learning_rate": 2.966477167115472e-06, + "loss": 0.0951, + "step": 8463 + }, + { + "epoch": 1.371354504212573, + "grad_norm": 0.8936923146247864, + "learning_rate": 2.9660475359439113e-06, + "loss": 0.1062, + "step": 8464 + }, + { + "epoch": 1.3715165262475697, + "grad_norm": 0.8021456003189087, + "learning_rate": 2.9656178905124222e-06, + "loss": 0.102, + "step": 8465 + }, + { + "epoch": 1.3716785482825664, + "grad_norm": 0.921617865562439, + "learning_rate": 2.965188230834154e-06, + "loss": 0.1193, + "step": 8466 + }, + { + "epoch": 1.3718405703175631, + "grad_norm": 0.8052314519882202, + "learning_rate": 2.9647585569222516e-06, + "loss": 0.0948, + "step": 8467 + }, + { + "epoch": 1.37200259235256, + "grad_norm": 1.5328847169876099, + "learning_rate": 2.9643288687898614e-06, + "loss": 0.1096, + "step": 8468 + }, + { + "epoch": 1.3721646143875568, + "grad_norm": 0.905802309513092, + "learning_rate": 2.9638991664501314e-06, + "loss": 0.1174, + "step": 8469 + }, + { + "epoch": 1.3723266364225535, + "grad_norm": 0.80458664894104, + "learning_rate": 2.96346944991621e-06, + "loss": 0.0972, + "step": 8470 + }, + { + "epoch": 1.3724886584575502, + "grad_norm": 0.8991355299949646, + "learning_rate": 2.9630397192012445e-06, + "loss": 0.1096, + "step": 8471 + }, + { + "epoch": 1.372650680492547, + "grad_norm": 1.1109977960586548, + "learning_rate": 2.962609974318385e-06, + "loss": 0.1276, + "step": 8472 + }, + { + "epoch": 1.3728127025275438, + "grad_norm": 0.8459230065345764, + "learning_rate": 2.962180215280779e-06, + "loss": 0.107, + "step": 8473 + }, + { + "epoch": 1.3729747245625405, + "grad_norm": 0.8833880424499512, + "learning_rate": 2.961750442101577e-06, + "loss": 0.106, + "step": 8474 + }, + { + "epoch": 1.3731367465975373, + "grad_norm": 0.8358442783355713, + "learning_rate": 2.9613206547939287e-06, + "loss": 0.1001, + "step": 8475 + }, + { + "epoch": 1.373298768632534, + "grad_norm": 0.8312876224517822, + "learning_rate": 2.9608908533709852e-06, + "loss": 0.104, + "step": 8476 + }, + { + "epoch": 1.3734607906675307, + "grad_norm": 0.7915255427360535, + "learning_rate": 2.9604610378458965e-06, + "loss": 0.0953, + "step": 8477 + }, + { + "epoch": 1.3736228127025276, + "grad_norm": 0.7739593386650085, + "learning_rate": 2.9600312082318144e-06, + "loss": 0.1022, + "step": 8478 + }, + { + "epoch": 1.3737848347375243, + "grad_norm": 0.9240787029266357, + "learning_rate": 2.9596013645418913e-06, + "loss": 0.1201, + "step": 8479 + }, + { + "epoch": 1.373946856772521, + "grad_norm": 0.7890738248825073, + "learning_rate": 2.9591715067892777e-06, + "loss": 0.0993, + "step": 8480 + }, + { + "epoch": 1.3741088788075178, + "grad_norm": 0.7773252725601196, + "learning_rate": 2.9587416349871277e-06, + "loss": 0.095, + "step": 8481 + }, + { + "epoch": 1.3742709008425145, + "grad_norm": 0.888715386390686, + "learning_rate": 2.958311749148594e-06, + "loss": 0.1149, + "step": 8482 + }, + { + "epoch": 1.3744329228775114, + "grad_norm": 0.8670759797096252, + "learning_rate": 2.9578818492868293e-06, + "loss": 0.1132, + "step": 8483 + }, + { + "epoch": 1.374594944912508, + "grad_norm": 0.8122788667678833, + "learning_rate": 2.9574519354149884e-06, + "loss": 0.0999, + "step": 8484 + }, + { + "epoch": 1.3747569669475048, + "grad_norm": 0.9208338260650635, + "learning_rate": 2.9570220075462254e-06, + "loss": 0.1139, + "step": 8485 + }, + { + "epoch": 1.3749189889825018, + "grad_norm": 0.7937371134757996, + "learning_rate": 2.9565920656936947e-06, + "loss": 0.1054, + "step": 8486 + }, + { + "epoch": 1.3750810110174982, + "grad_norm": 0.9310660362243652, + "learning_rate": 2.956162109870551e-06, + "loss": 0.1085, + "step": 8487 + }, + { + "epoch": 1.3752430330524952, + "grad_norm": 0.8265377283096313, + "learning_rate": 2.9557321400899524e-06, + "loss": 0.1031, + "step": 8488 + }, + { + "epoch": 1.375405055087492, + "grad_norm": 0.7675789594650269, + "learning_rate": 2.9553021563650514e-06, + "loss": 0.0969, + "step": 8489 + }, + { + "epoch": 1.3755670771224886, + "grad_norm": 0.8487404584884644, + "learning_rate": 2.9548721587090075e-06, + "loss": 0.1223, + "step": 8490 + }, + { + "epoch": 1.3757290991574855, + "grad_norm": 0.8002118468284607, + "learning_rate": 2.9544421471349753e-06, + "loss": 0.0937, + "step": 8491 + }, + { + "epoch": 1.3758911211924822, + "grad_norm": 0.7949121594429016, + "learning_rate": 2.954012121656114e-06, + "loss": 0.0965, + "step": 8492 + }, + { + "epoch": 1.376053143227479, + "grad_norm": 0.8120603561401367, + "learning_rate": 2.9535820822855797e-06, + "loss": 0.0894, + "step": 8493 + }, + { + "epoch": 1.3762151652624757, + "grad_norm": 0.7125270366668701, + "learning_rate": 2.9531520290365316e-06, + "loss": 0.0855, + "step": 8494 + }, + { + "epoch": 1.3763771872974724, + "grad_norm": 0.819965124130249, + "learning_rate": 2.9527219619221293e-06, + "loss": 0.0934, + "step": 8495 + }, + { + "epoch": 1.3765392093324693, + "grad_norm": 0.8064921498298645, + "learning_rate": 2.952291880955529e-06, + "loss": 0.0961, + "step": 8496 + }, + { + "epoch": 1.376701231367466, + "grad_norm": 0.7966877818107605, + "learning_rate": 2.9518617861498924e-06, + "loss": 0.0986, + "step": 8497 + }, + { + "epoch": 1.3768632534024627, + "grad_norm": 0.8604177236557007, + "learning_rate": 2.9514316775183777e-06, + "loss": 0.1135, + "step": 8498 + }, + { + "epoch": 1.3770252754374595, + "grad_norm": 0.8813721537590027, + "learning_rate": 2.9510015550741467e-06, + "loss": 0.108, + "step": 8499 + }, + { + "epoch": 1.3771872974724562, + "grad_norm": 0.8800192475318909, + "learning_rate": 2.950571418830359e-06, + "loss": 0.1143, + "step": 8500 + }, + { + "epoch": 1.377349319507453, + "grad_norm": 0.8808828592300415, + "learning_rate": 2.950141268800177e-06, + "loss": 0.1095, + "step": 8501 + }, + { + "epoch": 1.3775113415424498, + "grad_norm": 0.7128430008888245, + "learning_rate": 2.949711104996761e-06, + "loss": 0.0887, + "step": 8502 + }, + { + "epoch": 1.3776733635774465, + "grad_norm": 0.7968690395355225, + "learning_rate": 2.9492809274332745e-06, + "loss": 0.0957, + "step": 8503 + }, + { + "epoch": 1.3778353856124432, + "grad_norm": 0.8926577568054199, + "learning_rate": 2.948850736122878e-06, + "loss": 0.1194, + "step": 8504 + }, + { + "epoch": 1.37799740764744, + "grad_norm": 0.9188405871391296, + "learning_rate": 2.948420531078735e-06, + "loss": 0.1107, + "step": 8505 + }, + { + "epoch": 1.3781594296824369, + "grad_norm": 0.7922679781913757, + "learning_rate": 2.947990312314009e-06, + "loss": 0.0989, + "step": 8506 + }, + { + "epoch": 1.3783214517174336, + "grad_norm": 0.9483537077903748, + "learning_rate": 2.9475600798418636e-06, + "loss": 0.1224, + "step": 8507 + }, + { + "epoch": 1.3784834737524303, + "grad_norm": 0.9121973514556885, + "learning_rate": 2.9471298336754633e-06, + "loss": 0.1141, + "step": 8508 + }, + { + "epoch": 1.378645495787427, + "grad_norm": 0.8246192932128906, + "learning_rate": 2.9466995738279715e-06, + "loss": 0.1031, + "step": 8509 + }, + { + "epoch": 1.3788075178224237, + "grad_norm": 0.8705335259437561, + "learning_rate": 2.9462693003125544e-06, + "loss": 0.1033, + "step": 8510 + }, + { + "epoch": 1.3789695398574207, + "grad_norm": 0.7904585003852844, + "learning_rate": 2.9458390131423754e-06, + "loss": 0.0992, + "step": 8511 + }, + { + "epoch": 1.3791315618924174, + "grad_norm": 0.8808677196502686, + "learning_rate": 2.945408712330603e-06, + "loss": 0.1059, + "step": 8512 + }, + { + "epoch": 1.379293583927414, + "grad_norm": 1.1480685472488403, + "learning_rate": 2.944978397890401e-06, + "loss": 0.1, + "step": 8513 + }, + { + "epoch": 1.379455605962411, + "grad_norm": 0.8541058301925659, + "learning_rate": 2.944548069834937e-06, + "loss": 0.1072, + "step": 8514 + }, + { + "epoch": 1.3796176279974077, + "grad_norm": 0.7569074630737305, + "learning_rate": 2.9441177281773783e-06, + "loss": 0.1054, + "step": 8515 + }, + { + "epoch": 1.3797796500324044, + "grad_norm": 0.8691943287849426, + "learning_rate": 2.943687372930891e-06, + "loss": 0.1116, + "step": 8516 + }, + { + "epoch": 1.3799416720674011, + "grad_norm": 0.8004394173622131, + "learning_rate": 2.943257004108645e-06, + "loss": 0.1001, + "step": 8517 + }, + { + "epoch": 1.3801036941023979, + "grad_norm": 0.8240222930908203, + "learning_rate": 2.942826621723806e-06, + "loss": 0.1008, + "step": 8518 + }, + { + "epoch": 1.3802657161373948, + "grad_norm": 0.8319725394248962, + "learning_rate": 2.942396225789545e-06, + "loss": 0.1021, + "step": 8519 + }, + { + "epoch": 1.3804277381723915, + "grad_norm": 0.8528062701225281, + "learning_rate": 2.9419658163190295e-06, + "loss": 0.1136, + "step": 8520 + }, + { + "epoch": 1.3805897602073882, + "grad_norm": 0.7287147641181946, + "learning_rate": 2.9415353933254297e-06, + "loss": 0.0952, + "step": 8521 + }, + { + "epoch": 1.380751782242385, + "grad_norm": 0.8255961537361145, + "learning_rate": 2.9411049568219153e-06, + "loss": 0.1072, + "step": 8522 + }, + { + "epoch": 1.3809138042773816, + "grad_norm": 0.7438040971755981, + "learning_rate": 2.940674506821657e-06, + "loss": 0.0972, + "step": 8523 + }, + { + "epoch": 1.3810758263123786, + "grad_norm": 0.7560163736343384, + "learning_rate": 2.9402440433378247e-06, + "loss": 0.0906, + "step": 8524 + }, + { + "epoch": 1.3812378483473753, + "grad_norm": 0.952171802520752, + "learning_rate": 2.9398135663835904e-06, + "loss": 0.1007, + "step": 8525 + }, + { + "epoch": 1.381399870382372, + "grad_norm": 0.7956331968307495, + "learning_rate": 2.939383075972125e-06, + "loss": 0.1024, + "step": 8526 + }, + { + "epoch": 1.3815618924173687, + "grad_norm": 0.8440989255905151, + "learning_rate": 2.9389525721166013e-06, + "loss": 0.1028, + "step": 8527 + }, + { + "epoch": 1.3817239144523654, + "grad_norm": 1.5592060089111328, + "learning_rate": 2.9385220548301906e-06, + "loss": 0.1023, + "step": 8528 + }, + { + "epoch": 1.3818859364873624, + "grad_norm": 0.8761277794837952, + "learning_rate": 2.9380915241260665e-06, + "loss": 0.0997, + "step": 8529 + }, + { + "epoch": 1.382047958522359, + "grad_norm": 0.8244863748550415, + "learning_rate": 2.937660980017402e-06, + "loss": 0.1096, + "step": 8530 + }, + { + "epoch": 1.3822099805573558, + "grad_norm": 0.7663264870643616, + "learning_rate": 2.9372304225173703e-06, + "loss": 0.0926, + "step": 8531 + }, + { + "epoch": 1.3823720025923525, + "grad_norm": 0.8507534861564636, + "learning_rate": 2.936799851639146e-06, + "loss": 0.1017, + "step": 8532 + }, + { + "epoch": 1.3825340246273492, + "grad_norm": 0.9161567091941833, + "learning_rate": 2.936369267395903e-06, + "loss": 0.1143, + "step": 8533 + }, + { + "epoch": 1.3826960466623461, + "grad_norm": 0.8694104552268982, + "learning_rate": 2.9359386698008172e-06, + "loss": 0.1074, + "step": 8534 + }, + { + "epoch": 1.3828580686973428, + "grad_norm": 0.853909969329834, + "learning_rate": 2.9355080588670626e-06, + "loss": 0.1031, + "step": 8535 + }, + { + "epoch": 1.3830200907323396, + "grad_norm": 0.7998418211936951, + "learning_rate": 2.935077434607815e-06, + "loss": 0.1022, + "step": 8536 + }, + { + "epoch": 1.3831821127673365, + "grad_norm": 0.9292730689048767, + "learning_rate": 2.934646797036251e-06, + "loss": 0.1037, + "step": 8537 + }, + { + "epoch": 1.383344134802333, + "grad_norm": 0.7879573106765747, + "learning_rate": 2.9342161461655468e-06, + "loss": 0.095, + "step": 8538 + }, + { + "epoch": 1.38350615683733, + "grad_norm": 0.9311516284942627, + "learning_rate": 2.9337854820088797e-06, + "loss": 0.1086, + "step": 8539 + }, + { + "epoch": 1.3836681788723266, + "grad_norm": 0.8644829392433167, + "learning_rate": 2.9333548045794253e-06, + "loss": 0.1038, + "step": 8540 + }, + { + "epoch": 1.3838302009073233, + "grad_norm": 0.8739936947822571, + "learning_rate": 2.9329241138903642e-06, + "loss": 0.1093, + "step": 8541 + }, + { + "epoch": 1.3839922229423203, + "grad_norm": 0.7543818354606628, + "learning_rate": 2.9324934099548713e-06, + "loss": 0.0904, + "step": 8542 + }, + { + "epoch": 1.384154244977317, + "grad_norm": 0.7458750009536743, + "learning_rate": 2.9320626927861283e-06, + "loss": 0.0913, + "step": 8543 + }, + { + "epoch": 1.3843162670123137, + "grad_norm": 0.8349683284759521, + "learning_rate": 2.931631962397311e-06, + "loss": 0.1058, + "step": 8544 + }, + { + "epoch": 1.3844782890473104, + "grad_norm": 1.0070006847381592, + "learning_rate": 2.9312012188016014e-06, + "loss": 0.1281, + "step": 8545 + }, + { + "epoch": 1.3846403110823071, + "grad_norm": 0.8791117072105408, + "learning_rate": 2.9307704620121775e-06, + "loss": 0.1125, + "step": 8546 + }, + { + "epoch": 1.384802333117304, + "grad_norm": 0.8105389475822449, + "learning_rate": 2.9303396920422196e-06, + "loss": 0.1058, + "step": 8547 + }, + { + "epoch": 1.3849643551523008, + "grad_norm": 0.8246057629585266, + "learning_rate": 2.9299089089049092e-06, + "loss": 0.1031, + "step": 8548 + }, + { + "epoch": 1.3851263771872975, + "grad_norm": 0.9281556606292725, + "learning_rate": 2.9294781126134254e-06, + "loss": 0.1114, + "step": 8549 + }, + { + "epoch": 1.3852883992222942, + "grad_norm": 0.885179340839386, + "learning_rate": 2.929047303180952e-06, + "loss": 0.1128, + "step": 8550 + }, + { + "epoch": 1.385450421257291, + "grad_norm": 0.7432131767272949, + "learning_rate": 2.9286164806206683e-06, + "loss": 0.0927, + "step": 8551 + }, + { + "epoch": 1.3856124432922878, + "grad_norm": 0.7628340721130371, + "learning_rate": 2.9281856449457587e-06, + "loss": 0.1045, + "step": 8552 + }, + { + "epoch": 1.3857744653272845, + "grad_norm": 0.8845008015632629, + "learning_rate": 2.9277547961694037e-06, + "loss": 0.1161, + "step": 8553 + }, + { + "epoch": 1.3859364873622813, + "grad_norm": 0.7263888120651245, + "learning_rate": 2.927323934304787e-06, + "loss": 0.095, + "step": 8554 + }, + { + "epoch": 1.386098509397278, + "grad_norm": 0.9359971284866333, + "learning_rate": 2.9268930593650926e-06, + "loss": 0.1342, + "step": 8555 + }, + { + "epoch": 1.3862605314322747, + "grad_norm": 0.8538017272949219, + "learning_rate": 2.926462171363503e-06, + "loss": 0.1154, + "step": 8556 + }, + { + "epoch": 1.3864225534672716, + "grad_norm": 0.8223465085029602, + "learning_rate": 2.9260312703132037e-06, + "loss": 0.1037, + "step": 8557 + }, + { + "epoch": 1.3865845755022683, + "grad_norm": 0.7046705484390259, + "learning_rate": 2.9256003562273784e-06, + "loss": 0.0974, + "step": 8558 + }, + { + "epoch": 1.386746597537265, + "grad_norm": 0.8450808525085449, + "learning_rate": 2.9251694291192113e-06, + "loss": 0.1041, + "step": 8559 + }, + { + "epoch": 1.3869086195722617, + "grad_norm": 0.7554501295089722, + "learning_rate": 2.924738489001889e-06, + "loss": 0.0989, + "step": 8560 + }, + { + "epoch": 1.3870706416072585, + "grad_norm": 0.7961752414703369, + "learning_rate": 2.924307535888597e-06, + "loss": 0.0983, + "step": 8561 + }, + { + "epoch": 1.3872326636422554, + "grad_norm": 0.7975520491600037, + "learning_rate": 2.923876569792521e-06, + "loss": 0.1012, + "step": 8562 + }, + { + "epoch": 1.387394685677252, + "grad_norm": 0.980924129486084, + "learning_rate": 2.923445590726848e-06, + "loss": 0.1208, + "step": 8563 + }, + { + "epoch": 1.3875567077122488, + "grad_norm": 0.92117840051651, + "learning_rate": 2.923014598704764e-06, + "loss": 0.1097, + "step": 8564 + }, + { + "epoch": 1.3877187297472457, + "grad_norm": 0.801445484161377, + "learning_rate": 2.922583593739458e-06, + "loss": 0.103, + "step": 8565 + }, + { + "epoch": 1.3878807517822425, + "grad_norm": 0.9321786761283875, + "learning_rate": 2.9221525758441155e-06, + "loss": 0.1197, + "step": 8566 + }, + { + "epoch": 1.3880427738172392, + "grad_norm": 0.7319613099098206, + "learning_rate": 2.921721545031927e-06, + "loss": 0.0961, + "step": 8567 + }, + { + "epoch": 1.3882047958522359, + "grad_norm": 0.7948114275932312, + "learning_rate": 2.9212905013160784e-06, + "loss": 0.1095, + "step": 8568 + }, + { + "epoch": 1.3883668178872326, + "grad_norm": 0.7754814624786377, + "learning_rate": 2.920859444709761e-06, + "loss": 0.0997, + "step": 8569 + }, + { + "epoch": 1.3885288399222295, + "grad_norm": 0.9290817975997925, + "learning_rate": 2.920428375226163e-06, + "loss": 0.101, + "step": 8570 + }, + { + "epoch": 1.3886908619572262, + "grad_norm": 0.761486291885376, + "learning_rate": 2.919997292878474e-06, + "loss": 0.0954, + "step": 8571 + }, + { + "epoch": 1.388852883992223, + "grad_norm": 0.672540009021759, + "learning_rate": 2.9195661976798838e-06, + "loss": 0.0868, + "step": 8572 + }, + { + "epoch": 1.3890149060272197, + "grad_norm": 0.8108746409416199, + "learning_rate": 2.919135089643583e-06, + "loss": 0.0953, + "step": 8573 + }, + { + "epoch": 1.3891769280622164, + "grad_norm": 1.304598093032837, + "learning_rate": 2.918703968782764e-06, + "loss": 0.0915, + "step": 8574 + }, + { + "epoch": 1.3893389500972133, + "grad_norm": 0.9010568261146545, + "learning_rate": 2.918272835110616e-06, + "loss": 0.1142, + "step": 8575 + }, + { + "epoch": 1.38950097213221, + "grad_norm": 0.7341850399971008, + "learning_rate": 2.9178416886403318e-06, + "loss": 0.0943, + "step": 8576 + }, + { + "epoch": 1.3896629941672067, + "grad_norm": 0.8488278388977051, + "learning_rate": 2.9174105293851025e-06, + "loss": 0.1099, + "step": 8577 + }, + { + "epoch": 1.3898250162022034, + "grad_norm": 0.9017966389656067, + "learning_rate": 2.916979357358121e-06, + "loss": 0.1107, + "step": 8578 + }, + { + "epoch": 1.3899870382372002, + "grad_norm": 0.7202101945877075, + "learning_rate": 2.916548172572581e-06, + "loss": 0.098, + "step": 8579 + }, + { + "epoch": 1.390149060272197, + "grad_norm": 0.857900857925415, + "learning_rate": 2.9161169750416746e-06, + "loss": 0.1077, + "step": 8580 + }, + { + "epoch": 1.3903110823071938, + "grad_norm": 0.8433865904808044, + "learning_rate": 2.9156857647785964e-06, + "loss": 0.1083, + "step": 8581 + }, + { + "epoch": 1.3904731043421905, + "grad_norm": 0.8308601975440979, + "learning_rate": 2.915254541796539e-06, + "loss": 0.1088, + "step": 8582 + }, + { + "epoch": 1.3906351263771872, + "grad_norm": 0.77713942527771, + "learning_rate": 2.9148233061086973e-06, + "loss": 0.1047, + "step": 8583 + }, + { + "epoch": 1.390797148412184, + "grad_norm": 0.6983367204666138, + "learning_rate": 2.914392057728267e-06, + "loss": 0.0904, + "step": 8584 + }, + { + "epoch": 1.3909591704471809, + "grad_norm": 0.844786524772644, + "learning_rate": 2.913960796668442e-06, + "loss": 0.1129, + "step": 8585 + }, + { + "epoch": 1.3911211924821776, + "grad_norm": 0.756144106388092, + "learning_rate": 2.913529522942418e-06, + "loss": 0.1032, + "step": 8586 + }, + { + "epoch": 1.3912832145171743, + "grad_norm": 0.7644606828689575, + "learning_rate": 2.9130982365633926e-06, + "loss": 0.1047, + "step": 8587 + }, + { + "epoch": 1.3914452365521712, + "grad_norm": 0.8337986469268799, + "learning_rate": 2.9126669375445595e-06, + "loss": 0.1056, + "step": 8588 + }, + { + "epoch": 1.3916072585871677, + "grad_norm": 0.7386319041252136, + "learning_rate": 2.912235625899118e-06, + "loss": 0.1003, + "step": 8589 + }, + { + "epoch": 1.3917692806221647, + "grad_norm": 0.8051424026489258, + "learning_rate": 2.911804301640263e-06, + "loss": 0.1058, + "step": 8590 + }, + { + "epoch": 1.3919313026571614, + "grad_norm": 0.8822247982025146, + "learning_rate": 2.9113729647811935e-06, + "loss": 0.1107, + "step": 8591 + }, + { + "epoch": 1.392093324692158, + "grad_norm": 0.8278679251670837, + "learning_rate": 2.910941615335106e-06, + "loss": 0.115, + "step": 8592 + }, + { + "epoch": 1.392255346727155, + "grad_norm": 0.8865328431129456, + "learning_rate": 2.9105102533152e-06, + "loss": 0.1151, + "step": 8593 + }, + { + "epoch": 1.3924173687621517, + "grad_norm": 0.8504626750946045, + "learning_rate": 2.9100788787346746e-06, + "loss": 0.1131, + "step": 8594 + }, + { + "epoch": 1.3925793907971484, + "grad_norm": 0.8320386409759521, + "learning_rate": 2.9096474916067264e-06, + "loss": 0.1116, + "step": 8595 + }, + { + "epoch": 1.3927414128321451, + "grad_norm": 0.8168127536773682, + "learning_rate": 2.9092160919445566e-06, + "loss": 0.1058, + "step": 8596 + }, + { + "epoch": 1.3929034348671419, + "grad_norm": 0.8525434136390686, + "learning_rate": 2.9087846797613645e-06, + "loss": 0.0983, + "step": 8597 + }, + { + "epoch": 1.3930654569021388, + "grad_norm": 0.8819026947021484, + "learning_rate": 2.9083532550703515e-06, + "loss": 0.1126, + "step": 8598 + }, + { + "epoch": 1.3932274789371355, + "grad_norm": 0.7245476841926575, + "learning_rate": 2.907921817884716e-06, + "loss": 0.0919, + "step": 8599 + }, + { + "epoch": 1.3933895009721322, + "grad_norm": 0.7397482395172119, + "learning_rate": 2.9074903682176607e-06, + "loss": 0.1035, + "step": 8600 + }, + { + "epoch": 1.393551523007129, + "grad_norm": 0.7988397479057312, + "learning_rate": 2.907058906082386e-06, + "loss": 0.1043, + "step": 8601 + }, + { + "epoch": 1.3937135450421256, + "grad_norm": 0.9405267238616943, + "learning_rate": 2.906627431492094e-06, + "loss": 0.1193, + "step": 8602 + }, + { + "epoch": 1.3938755670771226, + "grad_norm": 0.766861617565155, + "learning_rate": 2.9061959444599867e-06, + "loss": 0.0918, + "step": 8603 + }, + { + "epoch": 1.3940375891121193, + "grad_norm": 0.8565654754638672, + "learning_rate": 2.9057644449992655e-06, + "loss": 0.1066, + "step": 8604 + }, + { + "epoch": 1.394199611147116, + "grad_norm": 0.8554398417472839, + "learning_rate": 2.9053329331231356e-06, + "loss": 0.1046, + "step": 8605 + }, + { + "epoch": 1.3943616331821127, + "grad_norm": 0.8772569298744202, + "learning_rate": 2.904901408844798e-06, + "loss": 0.1049, + "step": 8606 + }, + { + "epoch": 1.3945236552171094, + "grad_norm": 0.8543737530708313, + "learning_rate": 2.904469872177458e-06, + "loss": 0.1047, + "step": 8607 + }, + { + "epoch": 1.3946856772521063, + "grad_norm": 0.8698646426200867, + "learning_rate": 2.9040383231343173e-06, + "loss": 0.1063, + "step": 8608 + }, + { + "epoch": 1.394847699287103, + "grad_norm": 0.8685057163238525, + "learning_rate": 2.9036067617285825e-06, + "loss": 0.1025, + "step": 8609 + }, + { + "epoch": 1.3950097213220998, + "grad_norm": 0.7601488828659058, + "learning_rate": 2.903175187973457e-06, + "loss": 0.0914, + "step": 8610 + }, + { + "epoch": 1.3951717433570965, + "grad_norm": 0.7056039571762085, + "learning_rate": 2.902743601882147e-06, + "loss": 0.0907, + "step": 8611 + }, + { + "epoch": 1.3953337653920932, + "grad_norm": 0.9211429357528687, + "learning_rate": 2.9023120034678575e-06, + "loss": 0.1088, + "step": 8612 + }, + { + "epoch": 1.3954957874270901, + "grad_norm": 0.8788020610809326, + "learning_rate": 2.9018803927437946e-06, + "loss": 0.117, + "step": 8613 + }, + { + "epoch": 1.3956578094620868, + "grad_norm": 0.7978447675704956, + "learning_rate": 2.901448769723163e-06, + "loss": 0.1123, + "step": 8614 + }, + { + "epoch": 1.3958198314970836, + "grad_norm": 0.8556854724884033, + "learning_rate": 2.901017134419171e-06, + "loss": 0.1007, + "step": 8615 + }, + { + "epoch": 1.3959818535320805, + "grad_norm": 0.8050543069839478, + "learning_rate": 2.900585486845026e-06, + "loss": 0.0957, + "step": 8616 + }, + { + "epoch": 1.3961438755670772, + "grad_norm": 0.8602048754692078, + "learning_rate": 2.900153827013933e-06, + "loss": 0.1133, + "step": 8617 + }, + { + "epoch": 1.396305897602074, + "grad_norm": 0.8299257159233093, + "learning_rate": 2.8997221549391025e-06, + "loss": 0.112, + "step": 8618 + }, + { + "epoch": 1.3964679196370706, + "grad_norm": 0.9464213848114014, + "learning_rate": 2.8992904706337406e-06, + "loss": 0.1197, + "step": 8619 + }, + { + "epoch": 1.3966299416720673, + "grad_norm": 0.7960741519927979, + "learning_rate": 2.8988587741110575e-06, + "loss": 0.097, + "step": 8620 + }, + { + "epoch": 1.3967919637070643, + "grad_norm": 0.7059590220451355, + "learning_rate": 2.89842706538426e-06, + "loss": 0.0874, + "step": 8621 + }, + { + "epoch": 1.396953985742061, + "grad_norm": 0.7564810514450073, + "learning_rate": 2.8979953444665585e-06, + "loss": 0.0873, + "step": 8622 + }, + { + "epoch": 1.3971160077770577, + "grad_norm": 0.7459419965744019, + "learning_rate": 2.8975636113711637e-06, + "loss": 0.0916, + "step": 8623 + }, + { + "epoch": 1.3972780298120544, + "grad_norm": 0.8232574462890625, + "learning_rate": 2.8971318661112836e-06, + "loss": 0.0907, + "step": 8624 + }, + { + "epoch": 1.3974400518470511, + "grad_norm": 0.8205267190933228, + "learning_rate": 2.89670010870013e-06, + "loss": 0.1081, + "step": 8625 + }, + { + "epoch": 1.397602073882048, + "grad_norm": 0.8244900703430176, + "learning_rate": 2.896268339150912e-06, + "loss": 0.0983, + "step": 8626 + }, + { + "epoch": 1.3977640959170448, + "grad_norm": 0.7982784509658813, + "learning_rate": 2.8958365574768434e-06, + "loss": 0.0898, + "step": 8627 + }, + { + "epoch": 1.3979261179520415, + "grad_norm": 0.8250177502632141, + "learning_rate": 2.895404763691132e-06, + "loss": 0.1079, + "step": 8628 + }, + { + "epoch": 1.3980881399870382, + "grad_norm": 0.828636646270752, + "learning_rate": 2.8949729578069936e-06, + "loss": 0.105, + "step": 8629 + }, + { + "epoch": 1.398250162022035, + "grad_norm": 0.9336274266242981, + "learning_rate": 2.894541139837638e-06, + "loss": 0.1116, + "step": 8630 + }, + { + "epoch": 1.3984121840570318, + "grad_norm": 0.760558545589447, + "learning_rate": 2.8941093097962776e-06, + "loss": 0.0934, + "step": 8631 + }, + { + "epoch": 1.3985742060920285, + "grad_norm": 0.7663840055465698, + "learning_rate": 2.8936774676961264e-06, + "loss": 0.1026, + "step": 8632 + }, + { + "epoch": 1.3987362281270252, + "grad_norm": 0.8325403928756714, + "learning_rate": 2.893245613550397e-06, + "loss": 0.1097, + "step": 8633 + }, + { + "epoch": 1.398898250162022, + "grad_norm": 0.8603909611701965, + "learning_rate": 2.892813747372305e-06, + "loss": 0.1057, + "step": 8634 + }, + { + "epoch": 1.3990602721970187, + "grad_norm": 0.859589159488678, + "learning_rate": 2.892381869175061e-06, + "loss": 0.1124, + "step": 8635 + }, + { + "epoch": 1.3992222942320156, + "grad_norm": 0.9289984703063965, + "learning_rate": 2.891949978971883e-06, + "loss": 0.1186, + "step": 8636 + }, + { + "epoch": 1.3993843162670123, + "grad_norm": 0.7763188481330872, + "learning_rate": 2.891518076775983e-06, + "loss": 0.0958, + "step": 8637 + }, + { + "epoch": 1.399546338302009, + "grad_norm": 0.8290835618972778, + "learning_rate": 2.8910861626005774e-06, + "loss": 0.112, + "step": 8638 + }, + { + "epoch": 1.399708360337006, + "grad_norm": 0.7143067121505737, + "learning_rate": 2.890654236458882e-06, + "loss": 0.0856, + "step": 8639 + }, + { + "epoch": 1.3998703823720027, + "grad_norm": 0.8053386211395264, + "learning_rate": 2.890222298364112e-06, + "loss": 0.0909, + "step": 8640 + }, + { + "epoch": 1.4000324044069994, + "grad_norm": 0.9519616961479187, + "learning_rate": 2.8897903483294844e-06, + "loss": 0.1239, + "step": 8641 + }, + { + "epoch": 1.400194426441996, + "grad_norm": 0.7467250227928162, + "learning_rate": 2.8893583863682157e-06, + "loss": 0.0984, + "step": 8642 + }, + { + "epoch": 1.4003564484769928, + "grad_norm": 0.7794899940490723, + "learning_rate": 2.8889264124935217e-06, + "loss": 0.1038, + "step": 8643 + }, + { + "epoch": 1.4005184705119897, + "grad_norm": 0.8558558821678162, + "learning_rate": 2.888494426718621e-06, + "loss": 0.1056, + "step": 8644 + }, + { + "epoch": 1.4006804925469865, + "grad_norm": 0.9650840759277344, + "learning_rate": 2.888062429056731e-06, + "loss": 0.1189, + "step": 8645 + }, + { + "epoch": 1.4008425145819832, + "grad_norm": 1.1352195739746094, + "learning_rate": 2.8876304195210697e-06, + "loss": 0.1153, + "step": 8646 + }, + { + "epoch": 1.4010045366169799, + "grad_norm": 0.755615770816803, + "learning_rate": 2.8871983981248556e-06, + "loss": 0.098, + "step": 8647 + }, + { + "epoch": 1.4011665586519766, + "grad_norm": 0.7980273365974426, + "learning_rate": 2.8867663648813077e-06, + "loss": 0.1014, + "step": 8648 + }, + { + "epoch": 1.4013285806869735, + "grad_norm": 0.9927170872688293, + "learning_rate": 2.8863343198036453e-06, + "loss": 0.116, + "step": 8649 + }, + { + "epoch": 1.4014906027219702, + "grad_norm": 0.8513253331184387, + "learning_rate": 2.885902262905087e-06, + "loss": 0.1084, + "step": 8650 + }, + { + "epoch": 1.401652624756967, + "grad_norm": 0.849332332611084, + "learning_rate": 2.885470194198854e-06, + "loss": 0.1006, + "step": 8651 + }, + { + "epoch": 1.4018146467919637, + "grad_norm": 0.7961606979370117, + "learning_rate": 2.885038113698165e-06, + "loss": 0.0955, + "step": 8652 + }, + { + "epoch": 1.4019766688269604, + "grad_norm": 0.7978127002716064, + "learning_rate": 2.8846060214162426e-06, + "loss": 0.1055, + "step": 8653 + }, + { + "epoch": 1.4021386908619573, + "grad_norm": 0.9275472164154053, + "learning_rate": 2.8841739173663057e-06, + "loss": 0.1165, + "step": 8654 + }, + { + "epoch": 1.402300712896954, + "grad_norm": 0.9332693815231323, + "learning_rate": 2.883741801561577e-06, + "loss": 0.1121, + "step": 8655 + }, + { + "epoch": 1.4024627349319507, + "grad_norm": 0.7103630304336548, + "learning_rate": 2.883309674015278e-06, + "loss": 0.0933, + "step": 8656 + }, + { + "epoch": 1.4026247569669474, + "grad_norm": 0.8611851930618286, + "learning_rate": 2.8828775347406295e-06, + "loss": 0.1044, + "step": 8657 + }, + { + "epoch": 1.4027867790019442, + "grad_norm": 0.7644463181495667, + "learning_rate": 2.8824453837508563e-06, + "loss": 0.0959, + "step": 8658 + }, + { + "epoch": 1.402948801036941, + "grad_norm": 0.7625099420547485, + "learning_rate": 2.882013221059179e-06, + "loss": 0.0867, + "step": 8659 + }, + { + "epoch": 1.4031108230719378, + "grad_norm": 0.9080460667610168, + "learning_rate": 2.8815810466788225e-06, + "loss": 0.109, + "step": 8660 + }, + { + "epoch": 1.4032728451069345, + "grad_norm": 0.8725658059120178, + "learning_rate": 2.881148860623009e-06, + "loss": 0.1114, + "step": 8661 + }, + { + "epoch": 1.4034348671419314, + "grad_norm": 0.9554914832115173, + "learning_rate": 2.8807166629049623e-06, + "loss": 0.1124, + "step": 8662 + }, + { + "epoch": 1.403596889176928, + "grad_norm": 0.9919114708900452, + "learning_rate": 2.880284453537907e-06, + "loss": 0.1289, + "step": 8663 + }, + { + "epoch": 1.4037589112119249, + "grad_norm": 0.8158717751502991, + "learning_rate": 2.8798522325350683e-06, + "loss": 0.1049, + "step": 8664 + }, + { + "epoch": 1.4039209332469216, + "grad_norm": 0.8903204202651978, + "learning_rate": 2.8794199999096708e-06, + "loss": 0.105, + "step": 8665 + }, + { + "epoch": 1.4040829552819183, + "grad_norm": 0.7267638444900513, + "learning_rate": 2.8789877556749383e-06, + "loss": 0.0889, + "step": 8666 + }, + { + "epoch": 1.4042449773169152, + "grad_norm": 0.7867768406867981, + "learning_rate": 2.8785554998440983e-06, + "loss": 0.0943, + "step": 8667 + }, + { + "epoch": 1.404406999351912, + "grad_norm": 0.7844392657279968, + "learning_rate": 2.8781232324303758e-06, + "loss": 0.1015, + "step": 8668 + }, + { + "epoch": 1.4045690213869086, + "grad_norm": 0.8047603964805603, + "learning_rate": 2.8776909534469976e-06, + "loss": 0.1025, + "step": 8669 + }, + { + "epoch": 1.4047310434219054, + "grad_norm": 0.85113525390625, + "learning_rate": 2.8772586629071902e-06, + "loss": 0.1092, + "step": 8670 + }, + { + "epoch": 1.404893065456902, + "grad_norm": 0.7649945020675659, + "learning_rate": 2.8768263608241805e-06, + "loss": 0.1005, + "step": 8671 + }, + { + "epoch": 1.405055087491899, + "grad_norm": 0.8279699683189392, + "learning_rate": 2.876394047211196e-06, + "loss": 0.1035, + "step": 8672 + }, + { + "epoch": 1.4052171095268957, + "grad_norm": 0.8674699664115906, + "learning_rate": 2.8759617220814654e-06, + "loss": 0.1056, + "step": 8673 + }, + { + "epoch": 1.4053791315618924, + "grad_norm": 0.7891427278518677, + "learning_rate": 2.875529385448215e-06, + "loss": 0.0974, + "step": 8674 + }, + { + "epoch": 1.4055411535968891, + "grad_norm": 0.7829881906509399, + "learning_rate": 2.8750970373246745e-06, + "loss": 0.096, + "step": 8675 + }, + { + "epoch": 1.4057031756318858, + "grad_norm": 0.8128500580787659, + "learning_rate": 2.8746646777240724e-06, + "loss": 0.1085, + "step": 8676 + }, + { + "epoch": 1.4058651976668828, + "grad_norm": 0.9597805142402649, + "learning_rate": 2.874232306659638e-06, + "loss": 0.12, + "step": 8677 + }, + { + "epoch": 1.4060272197018795, + "grad_norm": 0.7913134694099426, + "learning_rate": 2.8737999241446e-06, + "loss": 0.102, + "step": 8678 + }, + { + "epoch": 1.4061892417368762, + "grad_norm": 0.8597573637962341, + "learning_rate": 2.8733675301921893e-06, + "loss": 0.1114, + "step": 8679 + }, + { + "epoch": 1.406351263771873, + "grad_norm": 0.8230311274528503, + "learning_rate": 2.8729351248156364e-06, + "loss": 0.1063, + "step": 8680 + }, + { + "epoch": 1.4065132858068696, + "grad_norm": 0.9289247989654541, + "learning_rate": 2.87250270802817e-06, + "loss": 0.1094, + "step": 8681 + }, + { + "epoch": 1.4066753078418666, + "grad_norm": 0.647177517414093, + "learning_rate": 2.872070279843023e-06, + "loss": 0.0788, + "step": 8682 + }, + { + "epoch": 1.4068373298768633, + "grad_norm": 0.850144624710083, + "learning_rate": 2.871637840273425e-06, + "loss": 0.1101, + "step": 8683 + }, + { + "epoch": 1.40699935191186, + "grad_norm": 0.8800421357154846, + "learning_rate": 2.8712053893326088e-06, + "loss": 0.1125, + "step": 8684 + }, + { + "epoch": 1.4071613739468567, + "grad_norm": 0.9055140614509583, + "learning_rate": 2.8707729270338058e-06, + "loss": 0.1144, + "step": 8685 + }, + { + "epoch": 1.4073233959818534, + "grad_norm": 0.9264971017837524, + "learning_rate": 2.8703404533902492e-06, + "loss": 0.1058, + "step": 8686 + }, + { + "epoch": 1.4074854180168503, + "grad_norm": 0.8147197365760803, + "learning_rate": 2.869907968415171e-06, + "loss": 0.1054, + "step": 8687 + }, + { + "epoch": 1.407647440051847, + "grad_norm": 0.9164337515830994, + "learning_rate": 2.8694754721218027e-06, + "loss": 0.1209, + "step": 8688 + }, + { + "epoch": 1.4078094620868438, + "grad_norm": 0.8125103116035461, + "learning_rate": 2.8690429645233808e-06, + "loss": 0.0983, + "step": 8689 + }, + { + "epoch": 1.4079714841218407, + "grad_norm": 0.7318618297576904, + "learning_rate": 2.8686104456331356e-06, + "loss": 0.0946, + "step": 8690 + }, + { + "epoch": 1.4081335061568374, + "grad_norm": 0.7763205170631409, + "learning_rate": 2.868177915464304e-06, + "loss": 0.0966, + "step": 8691 + }, + { + "epoch": 1.4082955281918341, + "grad_norm": 0.7962520122528076, + "learning_rate": 2.8677453740301185e-06, + "loss": 0.0979, + "step": 8692 + }, + { + "epoch": 1.4084575502268308, + "grad_norm": 0.8198848366737366, + "learning_rate": 2.867312821343815e-06, + "loss": 0.0941, + "step": 8693 + }, + { + "epoch": 1.4086195722618275, + "grad_norm": 0.8297473192214966, + "learning_rate": 2.8668802574186277e-06, + "loss": 0.1059, + "step": 8694 + }, + { + "epoch": 1.4087815942968245, + "grad_norm": 0.9335203766822815, + "learning_rate": 2.866447682267792e-06, + "loss": 0.1144, + "step": 8695 + }, + { + "epoch": 1.4089436163318212, + "grad_norm": 0.8151853084564209, + "learning_rate": 2.8660150959045456e-06, + "loss": 0.1041, + "step": 8696 + }, + { + "epoch": 1.409105638366818, + "grad_norm": 0.8563622832298279, + "learning_rate": 2.8655824983421217e-06, + "loss": 0.1103, + "step": 8697 + }, + { + "epoch": 1.4092676604018146, + "grad_norm": 0.8096403479576111, + "learning_rate": 2.865149889593758e-06, + "loss": 0.1094, + "step": 8698 + }, + { + "epoch": 1.4094296824368113, + "grad_norm": 0.859600841999054, + "learning_rate": 2.8647172696726917e-06, + "loss": 0.1041, + "step": 8699 + }, + { + "epoch": 1.4095917044718083, + "grad_norm": 0.8782287836074829, + "learning_rate": 2.8642846385921593e-06, + "loss": 0.1014, + "step": 8700 + }, + { + "epoch": 1.409753726506805, + "grad_norm": 0.8271836638450623, + "learning_rate": 2.8638519963653987e-06, + "loss": 0.1045, + "step": 8701 + }, + { + "epoch": 1.4099157485418017, + "grad_norm": 0.7663862705230713, + "learning_rate": 2.863419343005647e-06, + "loss": 0.1, + "step": 8702 + }, + { + "epoch": 1.4100777705767984, + "grad_norm": 0.9826388955116272, + "learning_rate": 2.8629866785261435e-06, + "loss": 0.1191, + "step": 8703 + }, + { + "epoch": 1.410239792611795, + "grad_norm": 0.7487360239028931, + "learning_rate": 2.8625540029401262e-06, + "loss": 0.0963, + "step": 8704 + }, + { + "epoch": 1.410401814646792, + "grad_norm": 0.8938003778457642, + "learning_rate": 2.862121316260833e-06, + "loss": 0.107, + "step": 8705 + }, + { + "epoch": 1.4105638366817888, + "grad_norm": 0.8720892071723938, + "learning_rate": 2.8616886185015046e-06, + "loss": 0.1147, + "step": 8706 + }, + { + "epoch": 1.4107258587167855, + "grad_norm": 0.8877646327018738, + "learning_rate": 2.8612559096753797e-06, + "loss": 0.1134, + "step": 8707 + }, + { + "epoch": 1.4108878807517822, + "grad_norm": 0.8477475643157959, + "learning_rate": 2.860823189795697e-06, + "loss": 0.1173, + "step": 8708 + }, + { + "epoch": 1.4110499027867789, + "grad_norm": 0.8832873106002808, + "learning_rate": 2.8603904588756996e-06, + "loss": 0.1127, + "step": 8709 + }, + { + "epoch": 1.4112119248217758, + "grad_norm": 0.8365153074264526, + "learning_rate": 2.859957716928625e-06, + "loss": 0.1029, + "step": 8710 + }, + { + "epoch": 1.4113739468567725, + "grad_norm": 0.861546516418457, + "learning_rate": 2.8595249639677164e-06, + "loss": 0.1085, + "step": 8711 + }, + { + "epoch": 1.4115359688917692, + "grad_norm": 0.8814380168914795, + "learning_rate": 2.8590922000062125e-06, + "loss": 0.1142, + "step": 8712 + }, + { + "epoch": 1.4116979909267662, + "grad_norm": 0.8180842995643616, + "learning_rate": 2.858659425057357e-06, + "loss": 0.1038, + "step": 8713 + }, + { + "epoch": 1.4118600129617627, + "grad_norm": 0.7602577209472656, + "learning_rate": 2.858226639134391e-06, + "loss": 0.0989, + "step": 8714 + }, + { + "epoch": 1.4120220349967596, + "grad_norm": 0.8542532324790955, + "learning_rate": 2.8577938422505573e-06, + "loss": 0.1079, + "step": 8715 + }, + { + "epoch": 1.4121840570317563, + "grad_norm": 0.7752845883369446, + "learning_rate": 2.8573610344190978e-06, + "loss": 0.1, + "step": 8716 + }, + { + "epoch": 1.412346079066753, + "grad_norm": 0.8004235029220581, + "learning_rate": 2.8569282156532548e-06, + "loss": 0.1044, + "step": 8717 + }, + { + "epoch": 1.41250810110175, + "grad_norm": 0.904018759727478, + "learning_rate": 2.8564953859662725e-06, + "loss": 0.1214, + "step": 8718 + }, + { + "epoch": 1.4126701231367467, + "grad_norm": 0.8129547238349915, + "learning_rate": 2.8560625453713935e-06, + "loss": 0.1058, + "step": 8719 + }, + { + "epoch": 1.4128321451717434, + "grad_norm": 0.8800033926963806, + "learning_rate": 2.8556296938818632e-06, + "loss": 0.0912, + "step": 8720 + }, + { + "epoch": 1.41299416720674, + "grad_norm": 0.7877295017242432, + "learning_rate": 2.8551968315109246e-06, + "loss": 0.0965, + "step": 8721 + }, + { + "epoch": 1.4131561892417368, + "grad_norm": 0.9171632528305054, + "learning_rate": 2.8547639582718223e-06, + "loss": 0.1216, + "step": 8722 + }, + { + "epoch": 1.4133182112767337, + "grad_norm": 0.766700804233551, + "learning_rate": 2.8543310741778013e-06, + "loss": 0.0901, + "step": 8723 + }, + { + "epoch": 1.4134802333117304, + "grad_norm": 0.8369571566581726, + "learning_rate": 2.853898179242107e-06, + "loss": 0.1022, + "step": 8724 + }, + { + "epoch": 1.4136422553467272, + "grad_norm": 0.7450268268585205, + "learning_rate": 2.853465273477985e-06, + "loss": 0.0961, + "step": 8725 + }, + { + "epoch": 1.4138042773817239, + "grad_norm": 0.8542775511741638, + "learning_rate": 2.8530323568986805e-06, + "loss": 0.1118, + "step": 8726 + }, + { + "epoch": 1.4139662994167206, + "grad_norm": 0.7954638004302979, + "learning_rate": 2.852599429517441e-06, + "loss": 0.104, + "step": 8727 + }, + { + "epoch": 1.4141283214517175, + "grad_norm": 0.7494431138038635, + "learning_rate": 2.8521664913475123e-06, + "loss": 0.0874, + "step": 8728 + }, + { + "epoch": 1.4142903434867142, + "grad_norm": 0.8307495713233948, + "learning_rate": 2.8517335424021404e-06, + "loss": 0.0994, + "step": 8729 + }, + { + "epoch": 1.414452365521711, + "grad_norm": 0.8603584170341492, + "learning_rate": 2.8513005826945733e-06, + "loss": 0.11, + "step": 8730 + }, + { + "epoch": 1.4146143875567077, + "grad_norm": 0.9381308555603027, + "learning_rate": 2.850867612238059e-06, + "loss": 0.1049, + "step": 8731 + }, + { + "epoch": 1.4147764095917044, + "grad_norm": 0.7319527864456177, + "learning_rate": 2.8504346310458446e-06, + "loss": 0.0898, + "step": 8732 + }, + { + "epoch": 1.4149384316267013, + "grad_norm": 0.8726157546043396, + "learning_rate": 2.850001639131179e-06, + "loss": 0.1157, + "step": 8733 + }, + { + "epoch": 1.415100453661698, + "grad_norm": 0.6778660416603088, + "learning_rate": 2.8495686365073096e-06, + "loss": 0.088, + "step": 8734 + }, + { + "epoch": 1.4152624756966947, + "grad_norm": 0.9084062576293945, + "learning_rate": 2.849135623187486e-06, + "loss": 0.1115, + "step": 8735 + }, + { + "epoch": 1.4154244977316914, + "grad_norm": 0.8225180506706238, + "learning_rate": 2.848702599184957e-06, + "loss": 0.1022, + "step": 8736 + }, + { + "epoch": 1.4155865197666881, + "grad_norm": 0.7956673502922058, + "learning_rate": 2.8482695645129725e-06, + "loss": 0.0988, + "step": 8737 + }, + { + "epoch": 1.415748541801685, + "grad_norm": 0.710880696773529, + "learning_rate": 2.8478365191847824e-06, + "loss": 0.0868, + "step": 8738 + }, + { + "epoch": 1.4159105638366818, + "grad_norm": 0.7995860576629639, + "learning_rate": 2.8474034632136365e-06, + "loss": 0.1029, + "step": 8739 + }, + { + "epoch": 1.4160725858716785, + "grad_norm": 0.8467408418655396, + "learning_rate": 2.8469703966127853e-06, + "loss": 0.1085, + "step": 8740 + }, + { + "epoch": 1.4162346079066754, + "grad_norm": 0.8162935972213745, + "learning_rate": 2.8465373193954794e-06, + "loss": 0.1065, + "step": 8741 + }, + { + "epoch": 1.4163966299416721, + "grad_norm": 0.8570012450218201, + "learning_rate": 2.8461042315749706e-06, + "loss": 0.1045, + "step": 8742 + }, + { + "epoch": 1.4165586519766689, + "grad_norm": 0.7713600397109985, + "learning_rate": 2.8456711331645085e-06, + "loss": 0.0956, + "step": 8743 + }, + { + "epoch": 1.4167206740116656, + "grad_norm": 0.7573341131210327, + "learning_rate": 2.845238024177348e-06, + "loss": 0.098, + "step": 8744 + }, + { + "epoch": 1.4168826960466623, + "grad_norm": 0.8098436594009399, + "learning_rate": 2.8448049046267377e-06, + "loss": 0.0979, + "step": 8745 + }, + { + "epoch": 1.4170447180816592, + "grad_norm": 0.8652111887931824, + "learning_rate": 2.8443717745259335e-06, + "loss": 0.1117, + "step": 8746 + }, + { + "epoch": 1.417206740116656, + "grad_norm": 0.8840187788009644, + "learning_rate": 2.843938633888186e-06, + "loss": 0.1069, + "step": 8747 + }, + { + "epoch": 1.4173687621516526, + "grad_norm": 0.7981458902359009, + "learning_rate": 2.8435054827267476e-06, + "loss": 0.1045, + "step": 8748 + }, + { + "epoch": 1.4175307841866494, + "grad_norm": 0.8698453903198242, + "learning_rate": 2.843072321054873e-06, + "loss": 0.1061, + "step": 8749 + }, + { + "epoch": 1.417692806221646, + "grad_norm": 0.9237022995948792, + "learning_rate": 2.8426391488858163e-06, + "loss": 0.1168, + "step": 8750 + }, + { + "epoch": 1.417854828256643, + "grad_norm": 0.8793653249740601, + "learning_rate": 2.8422059662328306e-06, + "loss": 0.117, + "step": 8751 + }, + { + "epoch": 1.4180168502916397, + "grad_norm": 0.7878406047821045, + "learning_rate": 2.8417727731091705e-06, + "loss": 0.1027, + "step": 8752 + }, + { + "epoch": 1.4181788723266364, + "grad_norm": 0.9198635816574097, + "learning_rate": 2.84133956952809e-06, + "loss": 0.1156, + "step": 8753 + }, + { + "epoch": 1.4183408943616331, + "grad_norm": 0.8705915212631226, + "learning_rate": 2.840906355502845e-06, + "loss": 0.1141, + "step": 8754 + }, + { + "epoch": 1.4185029163966298, + "grad_norm": 0.8760009407997131, + "learning_rate": 2.8404731310466904e-06, + "loss": 0.1222, + "step": 8755 + }, + { + "epoch": 1.4186649384316268, + "grad_norm": 0.8203396201133728, + "learning_rate": 2.840039896172882e-06, + "loss": 0.1032, + "step": 8756 + }, + { + "epoch": 1.4188269604666235, + "grad_norm": 0.7818074226379395, + "learning_rate": 2.8396066508946757e-06, + "loss": 0.1038, + "step": 8757 + }, + { + "epoch": 1.4189889825016202, + "grad_norm": 0.7406920194625854, + "learning_rate": 2.8391733952253277e-06, + "loss": 0.0913, + "step": 8758 + }, + { + "epoch": 1.419151004536617, + "grad_norm": 0.8644748330116272, + "learning_rate": 2.8387401291780953e-06, + "loss": 0.1071, + "step": 8759 + }, + { + "epoch": 1.4193130265716136, + "grad_norm": 0.8011223077774048, + "learning_rate": 2.838306852766234e-06, + "loss": 0.1088, + "step": 8760 + }, + { + "epoch": 1.4194750486066106, + "grad_norm": 0.9043474197387695, + "learning_rate": 2.8378735660030015e-06, + "loss": 0.1187, + "step": 8761 + }, + { + "epoch": 1.4196370706416073, + "grad_norm": 0.7650865316390991, + "learning_rate": 2.8374402689016557e-06, + "loss": 0.0959, + "step": 8762 + }, + { + "epoch": 1.419799092676604, + "grad_norm": 0.7926430702209473, + "learning_rate": 2.8370069614754543e-06, + "loss": 0.094, + "step": 8763 + }, + { + "epoch": 1.419961114711601, + "grad_norm": 0.8434385061264038, + "learning_rate": 2.8365736437376555e-06, + "loss": 0.1034, + "step": 8764 + }, + { + "epoch": 1.4201231367465974, + "grad_norm": 0.7807652950286865, + "learning_rate": 2.836140315701517e-06, + "loss": 0.1004, + "step": 8765 + }, + { + "epoch": 1.4202851587815943, + "grad_norm": 0.9462375640869141, + "learning_rate": 2.8357069773802996e-06, + "loss": 0.1111, + "step": 8766 + }, + { + "epoch": 1.420447180816591, + "grad_norm": 0.8678317666053772, + "learning_rate": 2.8352736287872593e-06, + "loss": 0.1101, + "step": 8767 + }, + { + "epoch": 1.4206092028515878, + "grad_norm": 0.8266333937644958, + "learning_rate": 2.834840269935659e-06, + "loss": 0.1046, + "step": 8768 + }, + { + "epoch": 1.4207712248865847, + "grad_norm": 0.8634325265884399, + "learning_rate": 2.8344069008387565e-06, + "loss": 0.1141, + "step": 8769 + }, + { + "epoch": 1.4209332469215814, + "grad_norm": 0.7722366452217102, + "learning_rate": 2.833973521509812e-06, + "loss": 0.0944, + "step": 8770 + }, + { + "epoch": 1.4210952689565781, + "grad_norm": 0.7650349736213684, + "learning_rate": 2.8335401319620855e-06, + "loss": 0.1033, + "step": 8771 + }, + { + "epoch": 1.4212572909915748, + "grad_norm": 0.94795161485672, + "learning_rate": 2.833106732208838e-06, + "loss": 0.1154, + "step": 8772 + }, + { + "epoch": 1.4214193130265715, + "grad_norm": 0.8152647614479065, + "learning_rate": 2.832673322263331e-06, + "loss": 0.1029, + "step": 8773 + }, + { + "epoch": 1.4215813350615685, + "grad_norm": 0.843647301197052, + "learning_rate": 2.8322399021388248e-06, + "loss": 0.1097, + "step": 8774 + }, + { + "epoch": 1.4217433570965652, + "grad_norm": 0.8536216616630554, + "learning_rate": 2.8318064718485826e-06, + "loss": 0.1011, + "step": 8775 + }, + { + "epoch": 1.421905379131562, + "grad_norm": 0.8077113628387451, + "learning_rate": 2.8313730314058645e-06, + "loss": 0.103, + "step": 8776 + }, + { + "epoch": 1.4220674011665586, + "grad_norm": 0.964287281036377, + "learning_rate": 2.830939580823934e-06, + "loss": 0.1125, + "step": 8777 + }, + { + "epoch": 1.4222294232015553, + "grad_norm": 0.8470258712768555, + "learning_rate": 2.830506120116053e-06, + "loss": 0.1101, + "step": 8778 + }, + { + "epoch": 1.4223914452365523, + "grad_norm": 1.022905707359314, + "learning_rate": 2.8300726492954845e-06, + "loss": 0.1195, + "step": 8779 + }, + { + "epoch": 1.422553467271549, + "grad_norm": 0.8365886211395264, + "learning_rate": 2.8296391683754916e-06, + "loss": 0.107, + "step": 8780 + }, + { + "epoch": 1.4227154893065457, + "grad_norm": 0.8074929118156433, + "learning_rate": 2.829205677369338e-06, + "loss": 0.1062, + "step": 8781 + }, + { + "epoch": 1.4228775113415424, + "grad_norm": 0.8210521340370178, + "learning_rate": 2.8287721762902877e-06, + "loss": 0.1056, + "step": 8782 + }, + { + "epoch": 1.423039533376539, + "grad_norm": 0.845281183719635, + "learning_rate": 2.8283386651516037e-06, + "loss": 0.1036, + "step": 8783 + }, + { + "epoch": 1.423201555411536, + "grad_norm": 0.7844056487083435, + "learning_rate": 2.8279051439665516e-06, + "loss": 0.0968, + "step": 8784 + }, + { + "epoch": 1.4233635774465327, + "grad_norm": 0.8035227656364441, + "learning_rate": 2.8274716127483955e-06, + "loss": 0.1052, + "step": 8785 + }, + { + "epoch": 1.4235255994815295, + "grad_norm": 0.8737231492996216, + "learning_rate": 2.8270380715104e-06, + "loss": 0.114, + "step": 8786 + }, + { + "epoch": 1.4236876215165262, + "grad_norm": 0.8196536302566528, + "learning_rate": 2.8266045202658316e-06, + "loss": 0.1088, + "step": 8787 + }, + { + "epoch": 1.4238496435515229, + "grad_norm": 0.7950606942176819, + "learning_rate": 2.826170959027956e-06, + "loss": 0.0977, + "step": 8788 + }, + { + "epoch": 1.4240116655865198, + "grad_norm": 0.8498073816299438, + "learning_rate": 2.8257373878100363e-06, + "loss": 0.1049, + "step": 8789 + }, + { + "epoch": 1.4241736876215165, + "grad_norm": 0.8204236030578613, + "learning_rate": 2.8253038066253423e-06, + "loss": 0.1023, + "step": 8790 + }, + { + "epoch": 1.4243357096565132, + "grad_norm": 0.9035735130310059, + "learning_rate": 2.8248702154871387e-06, + "loss": 0.1169, + "step": 8791 + }, + { + "epoch": 1.4244977316915102, + "grad_norm": 0.9391461610794067, + "learning_rate": 2.8244366144086926e-06, + "loss": 0.1122, + "step": 8792 + }, + { + "epoch": 1.4246597537265069, + "grad_norm": 0.8815183639526367, + "learning_rate": 2.824003003403271e-06, + "loss": 0.1041, + "step": 8793 + }, + { + "epoch": 1.4248217757615036, + "grad_norm": 0.7553703784942627, + "learning_rate": 2.823569382484142e-06, + "loss": 0.0949, + "step": 8794 + }, + { + "epoch": 1.4249837977965003, + "grad_norm": 0.8771671652793884, + "learning_rate": 2.823135751664573e-06, + "loss": 0.1142, + "step": 8795 + }, + { + "epoch": 1.425145819831497, + "grad_norm": 0.845828652381897, + "learning_rate": 2.822702110957831e-06, + "loss": 0.1074, + "step": 8796 + }, + { + "epoch": 1.425307841866494, + "grad_norm": 0.823070764541626, + "learning_rate": 2.8222684603771867e-06, + "loss": 0.1042, + "step": 8797 + }, + { + "epoch": 1.4254698639014907, + "grad_norm": 0.8956811428070068, + "learning_rate": 2.8218347999359066e-06, + "loss": 0.1038, + "step": 8798 + }, + { + "epoch": 1.4256318859364874, + "grad_norm": 0.9194256067276001, + "learning_rate": 2.821401129647261e-06, + "loss": 0.1194, + "step": 8799 + }, + { + "epoch": 1.425793907971484, + "grad_norm": 0.787661612033844, + "learning_rate": 2.8209674495245177e-06, + "loss": 0.0979, + "step": 8800 + }, + { + "epoch": 1.4259559300064808, + "grad_norm": 0.8394095301628113, + "learning_rate": 2.820533759580948e-06, + "loss": 0.1078, + "step": 8801 + }, + { + "epoch": 1.4261179520414777, + "grad_norm": 0.7722134590148926, + "learning_rate": 2.82010005982982e-06, + "loss": 0.1064, + "step": 8802 + }, + { + "epoch": 1.4262799740764744, + "grad_norm": 0.841996967792511, + "learning_rate": 2.8196663502844057e-06, + "loss": 0.0996, + "step": 8803 + }, + { + "epoch": 1.4264419961114712, + "grad_norm": 0.9296277165412903, + "learning_rate": 2.819232630957975e-06, + "loss": 0.1196, + "step": 8804 + }, + { + "epoch": 1.4266040181464679, + "grad_norm": 0.8732765316963196, + "learning_rate": 2.8187989018637967e-06, + "loss": 0.1097, + "step": 8805 + }, + { + "epoch": 1.4267660401814646, + "grad_norm": 0.9206928014755249, + "learning_rate": 2.818365163015145e-06, + "loss": 0.1229, + "step": 8806 + }, + { + "epoch": 1.4269280622164615, + "grad_norm": 0.853338897228241, + "learning_rate": 2.817931414425289e-06, + "loss": 0.0942, + "step": 8807 + }, + { + "epoch": 1.4270900842514582, + "grad_norm": 0.8418228030204773, + "learning_rate": 2.8174976561075013e-06, + "loss": 0.1125, + "step": 8808 + }, + { + "epoch": 1.427252106286455, + "grad_norm": 0.9058745503425598, + "learning_rate": 2.8170638880750534e-06, + "loss": 0.1103, + "step": 8809 + }, + { + "epoch": 1.4274141283214516, + "grad_norm": 0.8129554986953735, + "learning_rate": 2.816630110341218e-06, + "loss": 0.1046, + "step": 8810 + }, + { + "epoch": 1.4275761503564484, + "grad_norm": 0.7814062237739563, + "learning_rate": 2.8161963229192677e-06, + "loss": 0.103, + "step": 8811 + }, + { + "epoch": 1.4277381723914453, + "grad_norm": 0.7614774703979492, + "learning_rate": 2.8157625258224746e-06, + "loss": 0.0985, + "step": 8812 + }, + { + "epoch": 1.427900194426442, + "grad_norm": 0.8251438140869141, + "learning_rate": 2.8153287190641133e-06, + "loss": 0.105, + "step": 8813 + }, + { + "epoch": 1.4280622164614387, + "grad_norm": 0.8020395040512085, + "learning_rate": 2.814894902657456e-06, + "loss": 0.1027, + "step": 8814 + }, + { + "epoch": 1.4282242384964356, + "grad_norm": 0.7605652213096619, + "learning_rate": 2.8144610766157758e-06, + "loss": 0.0941, + "step": 8815 + }, + { + "epoch": 1.4283862605314321, + "grad_norm": 0.7266712784767151, + "learning_rate": 2.814027240952348e-06, + "loss": 0.085, + "step": 8816 + }, + { + "epoch": 1.428548282566429, + "grad_norm": 0.8249565958976746, + "learning_rate": 2.813593395680447e-06, + "loss": 0.1041, + "step": 8817 + }, + { + "epoch": 1.4287103046014258, + "grad_norm": 1.0275297164916992, + "learning_rate": 2.8131595408133467e-06, + "loss": 0.1263, + "step": 8818 + }, + { + "epoch": 1.4288723266364225, + "grad_norm": 1.5727760791778564, + "learning_rate": 2.812725676364322e-06, + "loss": 0.1006, + "step": 8819 + }, + { + "epoch": 1.4290343486714194, + "grad_norm": 0.8301246762275696, + "learning_rate": 2.8122918023466485e-06, + "loss": 0.1036, + "step": 8820 + }, + { + "epoch": 1.4291963707064161, + "grad_norm": 0.8229403495788574, + "learning_rate": 2.811857918773602e-06, + "loss": 0.0966, + "step": 8821 + }, + { + "epoch": 1.4293583927414129, + "grad_norm": 0.794026255607605, + "learning_rate": 2.811424025658458e-06, + "loss": 0.1077, + "step": 8822 + }, + { + "epoch": 1.4295204147764096, + "grad_norm": 0.6794479489326477, + "learning_rate": 2.810990123014492e-06, + "loss": 0.0826, + "step": 8823 + }, + { + "epoch": 1.4296824368114063, + "grad_norm": 0.759669303894043, + "learning_rate": 2.8105562108549807e-06, + "loss": 0.0996, + "step": 8824 + }, + { + "epoch": 1.4298444588464032, + "grad_norm": 0.8474135398864746, + "learning_rate": 2.8101222891932013e-06, + "loss": 0.1085, + "step": 8825 + }, + { + "epoch": 1.4300064808814, + "grad_norm": 0.9885803461074829, + "learning_rate": 2.80968835804243e-06, + "loss": 0.121, + "step": 8826 + }, + { + "epoch": 1.4301685029163966, + "grad_norm": 0.8411709070205688, + "learning_rate": 2.809254417415944e-06, + "loss": 0.1085, + "step": 8827 + }, + { + "epoch": 1.4303305249513933, + "grad_norm": 1.034460425376892, + "learning_rate": 2.808820467327022e-06, + "loss": 0.1044, + "step": 8828 + }, + { + "epoch": 1.43049254698639, + "grad_norm": 0.9514307975769043, + "learning_rate": 2.8083865077889404e-06, + "loss": 0.1225, + "step": 8829 + }, + { + "epoch": 1.430654569021387, + "grad_norm": 0.7484989762306213, + "learning_rate": 2.8079525388149787e-06, + "loss": 0.0957, + "step": 8830 + }, + { + "epoch": 1.4308165910563837, + "grad_norm": 0.8544020652770996, + "learning_rate": 2.807518560418414e-06, + "loss": 0.1155, + "step": 8831 + }, + { + "epoch": 1.4309786130913804, + "grad_norm": 0.8093070387840271, + "learning_rate": 2.8070845726125257e-06, + "loss": 0.1042, + "step": 8832 + }, + { + "epoch": 1.4311406351263771, + "grad_norm": 0.7702214121818542, + "learning_rate": 2.806650575410592e-06, + "loss": 0.0942, + "step": 8833 + }, + { + "epoch": 1.4313026571613738, + "grad_norm": 0.793833315372467, + "learning_rate": 2.8062165688258934e-06, + "loss": 0.0903, + "step": 8834 + }, + { + "epoch": 1.4314646791963708, + "grad_norm": 0.9053971171379089, + "learning_rate": 2.8057825528717093e-06, + "loss": 0.1259, + "step": 8835 + }, + { + "epoch": 1.4316267012313675, + "grad_norm": 0.9572237730026245, + "learning_rate": 2.8053485275613177e-06, + "loss": 0.111, + "step": 8836 + }, + { + "epoch": 1.4317887232663642, + "grad_norm": 0.7796591520309448, + "learning_rate": 2.804914492908001e-06, + "loss": 0.0959, + "step": 8837 + }, + { + "epoch": 1.431950745301361, + "grad_norm": 0.733897864818573, + "learning_rate": 2.804480448925039e-06, + "loss": 0.0852, + "step": 8838 + }, + { + "epoch": 1.4321127673363576, + "grad_norm": 0.7287041544914246, + "learning_rate": 2.8040463956257113e-06, + "loss": 0.0934, + "step": 8839 + }, + { + "epoch": 1.4322747893713546, + "grad_norm": 0.8480597138404846, + "learning_rate": 2.8036123330233e-06, + "loss": 0.1066, + "step": 8840 + }, + { + "epoch": 1.4324368114063513, + "grad_norm": 0.8591471314430237, + "learning_rate": 2.8031782611310863e-06, + "loss": 0.1044, + "step": 8841 + }, + { + "epoch": 1.432598833441348, + "grad_norm": 0.7007931470870972, + "learning_rate": 2.802744179962351e-06, + "loss": 0.0905, + "step": 8842 + }, + { + "epoch": 1.432760855476345, + "grad_norm": 0.7390332818031311, + "learning_rate": 2.802310089530377e-06, + "loss": 0.0923, + "step": 8843 + }, + { + "epoch": 1.4329228775113416, + "grad_norm": 0.8909687995910645, + "learning_rate": 2.801875989848446e-06, + "loss": 0.1139, + "step": 8844 + }, + { + "epoch": 1.4330848995463383, + "grad_norm": 0.795418918132782, + "learning_rate": 2.801441880929839e-06, + "loss": 0.103, + "step": 8845 + }, + { + "epoch": 1.433246921581335, + "grad_norm": 0.7816963195800781, + "learning_rate": 2.8010077627878414e-06, + "loss": 0.1, + "step": 8846 + }, + { + "epoch": 1.4334089436163318, + "grad_norm": 0.8099395632743835, + "learning_rate": 2.8005736354357338e-06, + "loss": 0.094, + "step": 8847 + }, + { + "epoch": 1.4335709656513287, + "grad_norm": 0.7562690377235413, + "learning_rate": 2.8001394988868003e-06, + "loss": 0.0969, + "step": 8848 + }, + { + "epoch": 1.4337329876863254, + "grad_norm": 0.9833892583847046, + "learning_rate": 2.7997053531543246e-06, + "loss": 0.1045, + "step": 8849 + }, + { + "epoch": 1.4338950097213221, + "grad_norm": 0.8875625133514404, + "learning_rate": 2.7992711982515908e-06, + "loss": 0.1067, + "step": 8850 + }, + { + "epoch": 1.4340570317563188, + "grad_norm": 0.7625311613082886, + "learning_rate": 2.7988370341918814e-06, + "loss": 0.0928, + "step": 8851 + }, + { + "epoch": 1.4342190537913155, + "grad_norm": 0.8266885876655579, + "learning_rate": 2.798402860988483e-06, + "loss": 0.1055, + "step": 8852 + }, + { + "epoch": 1.4343810758263125, + "grad_norm": 0.8304306864738464, + "learning_rate": 2.7979686786546784e-06, + "loss": 0.1017, + "step": 8853 + }, + { + "epoch": 1.4345430978613092, + "grad_norm": 0.8403946161270142, + "learning_rate": 2.797534487203755e-06, + "loss": 0.0894, + "step": 8854 + }, + { + "epoch": 1.434705119896306, + "grad_norm": 0.7953592538833618, + "learning_rate": 2.7971002866489944e-06, + "loss": 0.098, + "step": 8855 + }, + { + "epoch": 1.4348671419313026, + "grad_norm": 0.7994452714920044, + "learning_rate": 2.7966660770036845e-06, + "loss": 0.1017, + "step": 8856 + }, + { + "epoch": 1.4350291639662993, + "grad_norm": 0.8061219453811646, + "learning_rate": 2.7962318582811113e-06, + "loss": 0.1053, + "step": 8857 + }, + { + "epoch": 1.4351911860012962, + "grad_norm": 0.9328574538230896, + "learning_rate": 2.795797630494559e-06, + "loss": 0.1206, + "step": 8858 + }, + { + "epoch": 1.435353208036293, + "grad_norm": 0.8536462187767029, + "learning_rate": 2.795363393657316e-06, + "loss": 0.1042, + "step": 8859 + }, + { + "epoch": 1.4355152300712897, + "grad_norm": 0.7858737111091614, + "learning_rate": 2.7949291477826666e-06, + "loss": 0.0977, + "step": 8860 + }, + { + "epoch": 1.4356772521062864, + "grad_norm": 0.887958824634552, + "learning_rate": 2.7944948928839007e-06, + "loss": 0.1106, + "step": 8861 + }, + { + "epoch": 1.435839274141283, + "grad_norm": 0.817033588886261, + "learning_rate": 2.7940606289743026e-06, + "loss": 0.0977, + "step": 8862 + }, + { + "epoch": 1.43600129617628, + "grad_norm": 0.8055084943771362, + "learning_rate": 2.793626356067161e-06, + "loss": 0.0928, + "step": 8863 + }, + { + "epoch": 1.4361633182112767, + "grad_norm": 0.8124540448188782, + "learning_rate": 2.793192074175764e-06, + "loss": 0.0977, + "step": 8864 + }, + { + "epoch": 1.4363253402462735, + "grad_norm": 0.9017152786254883, + "learning_rate": 2.7927577833133984e-06, + "loss": 0.1147, + "step": 8865 + }, + { + "epoch": 1.4364873622812704, + "grad_norm": 0.8836957812309265, + "learning_rate": 2.792323483493354e-06, + "loss": 0.1094, + "step": 8866 + }, + { + "epoch": 1.4366493843162669, + "grad_norm": 0.8717904686927795, + "learning_rate": 2.791889174728918e-06, + "loss": 0.1102, + "step": 8867 + }, + { + "epoch": 1.4368114063512638, + "grad_norm": 0.7438794374465942, + "learning_rate": 2.791454857033379e-06, + "loss": 0.0957, + "step": 8868 + }, + { + "epoch": 1.4369734283862605, + "grad_norm": 0.7357698678970337, + "learning_rate": 2.7910205304200273e-06, + "loss": 0.0928, + "step": 8869 + }, + { + "epoch": 1.4371354504212572, + "grad_norm": 0.8071755170822144, + "learning_rate": 2.790586194902151e-06, + "loss": 0.0974, + "step": 8870 + }, + { + "epoch": 1.4372974724562542, + "grad_norm": 0.8086465001106262, + "learning_rate": 2.790151850493041e-06, + "loss": 0.1087, + "step": 8871 + }, + { + "epoch": 1.4374594944912509, + "grad_norm": 0.7857526540756226, + "learning_rate": 2.789717497205986e-06, + "loss": 0.0962, + "step": 8872 + }, + { + "epoch": 1.4376215165262476, + "grad_norm": 0.8768362998962402, + "learning_rate": 2.789283135054277e-06, + "loss": 0.0975, + "step": 8873 + }, + { + "epoch": 1.4377835385612443, + "grad_norm": 0.8221284747123718, + "learning_rate": 2.7888487640512046e-06, + "loss": 0.0953, + "step": 8874 + }, + { + "epoch": 1.437945560596241, + "grad_norm": 0.9030830264091492, + "learning_rate": 2.7884143842100573e-06, + "loss": 0.1122, + "step": 8875 + }, + { + "epoch": 1.438107582631238, + "grad_norm": 0.9299379587173462, + "learning_rate": 2.78797999554413e-06, + "loss": 0.1183, + "step": 8876 + }, + { + "epoch": 1.4382696046662347, + "grad_norm": 0.7504619359970093, + "learning_rate": 2.7875455980667106e-06, + "loss": 0.0942, + "step": 8877 + }, + { + "epoch": 1.4384316267012314, + "grad_norm": 0.7830450534820557, + "learning_rate": 2.787111191791092e-06, + "loss": 0.0983, + "step": 8878 + }, + { + "epoch": 1.438593648736228, + "grad_norm": 0.7624955177307129, + "learning_rate": 2.786676776730566e-06, + "loss": 0.0961, + "step": 8879 + }, + { + "epoch": 1.4387556707712248, + "grad_norm": 0.7818362712860107, + "learning_rate": 2.7862423528984233e-06, + "loss": 0.1087, + "step": 8880 + }, + { + "epoch": 1.4389176928062217, + "grad_norm": 0.8774171471595764, + "learning_rate": 2.7858079203079587e-06, + "loss": 0.1192, + "step": 8881 + }, + { + "epoch": 1.4390797148412184, + "grad_norm": 0.7342621088027954, + "learning_rate": 2.7853734789724618e-06, + "loss": 0.0933, + "step": 8882 + }, + { + "epoch": 1.4392417368762151, + "grad_norm": 0.7560212016105652, + "learning_rate": 2.7849390289052287e-06, + "loss": 0.0867, + "step": 8883 + }, + { + "epoch": 1.4394037589112119, + "grad_norm": 1.23517644405365, + "learning_rate": 2.7845045701195494e-06, + "loss": 0.1019, + "step": 8884 + }, + { + "epoch": 1.4395657809462086, + "grad_norm": 0.9435912370681763, + "learning_rate": 2.78407010262872e-06, + "loss": 0.1217, + "step": 8885 + }, + { + "epoch": 1.4397278029812055, + "grad_norm": 0.9272345304489136, + "learning_rate": 2.7836356264460316e-06, + "loss": 0.1167, + "step": 8886 + }, + { + "epoch": 1.4398898250162022, + "grad_norm": 0.8792025446891785, + "learning_rate": 2.7832011415847802e-06, + "loss": 0.1165, + "step": 8887 + }, + { + "epoch": 1.440051847051199, + "grad_norm": 0.7143024206161499, + "learning_rate": 2.7827666480582593e-06, + "loss": 0.0933, + "step": 8888 + }, + { + "epoch": 1.4402138690861956, + "grad_norm": 0.8933820724487305, + "learning_rate": 2.782332145879763e-06, + "loss": 0.1105, + "step": 8889 + }, + { + "epoch": 1.4403758911211924, + "grad_norm": 1.0869182348251343, + "learning_rate": 2.7818976350625864e-06, + "loss": 0.0966, + "step": 8890 + }, + { + "epoch": 1.4405379131561893, + "grad_norm": 0.7515729665756226, + "learning_rate": 2.781463115620024e-06, + "loss": 0.0954, + "step": 8891 + }, + { + "epoch": 1.440699935191186, + "grad_norm": 0.6556647419929504, + "learning_rate": 2.781028587565372e-06, + "loss": 0.0812, + "step": 8892 + }, + { + "epoch": 1.4408619572261827, + "grad_norm": 0.8055543899536133, + "learning_rate": 2.780594050911925e-06, + "loss": 0.1076, + "step": 8893 + }, + { + "epoch": 1.4410239792611796, + "grad_norm": 0.8170286417007446, + "learning_rate": 2.780159505672979e-06, + "loss": 0.1098, + "step": 8894 + }, + { + "epoch": 1.4411860012961764, + "grad_norm": 0.8746091723442078, + "learning_rate": 2.7797249518618304e-06, + "loss": 0.1001, + "step": 8895 + }, + { + "epoch": 1.441348023331173, + "grad_norm": 0.8125444650650024, + "learning_rate": 2.7792903894917746e-06, + "loss": 0.1104, + "step": 8896 + }, + { + "epoch": 1.4415100453661698, + "grad_norm": 0.8494876623153687, + "learning_rate": 2.778855818576109e-06, + "loss": 0.1037, + "step": 8897 + }, + { + "epoch": 1.4416720674011665, + "grad_norm": 0.8670206069946289, + "learning_rate": 2.7784212391281307e-06, + "loss": 0.1096, + "step": 8898 + }, + { + "epoch": 1.4418340894361634, + "grad_norm": 0.7742645144462585, + "learning_rate": 2.777986651161136e-06, + "loss": 0.0943, + "step": 8899 + }, + { + "epoch": 1.4419961114711601, + "grad_norm": 0.8516318798065186, + "learning_rate": 2.7775520546884216e-06, + "loss": 0.1088, + "step": 8900 + }, + { + "epoch": 1.4421581335061568, + "grad_norm": 0.8558171987533569, + "learning_rate": 2.7771174497232867e-06, + "loss": 0.0966, + "step": 8901 + }, + { + "epoch": 1.4423201555411536, + "grad_norm": 0.8049555420875549, + "learning_rate": 2.7766828362790283e-06, + "loss": 0.1069, + "step": 8902 + }, + { + "epoch": 1.4424821775761503, + "grad_norm": 0.8404254913330078, + "learning_rate": 2.776248214368945e-06, + "loss": 0.1062, + "step": 8903 + }, + { + "epoch": 1.4426441996111472, + "grad_norm": 0.8144481182098389, + "learning_rate": 2.7758135840063344e-06, + "loss": 0.1033, + "step": 8904 + }, + { + "epoch": 1.442806221646144, + "grad_norm": 0.8399234414100647, + "learning_rate": 2.7753789452044965e-06, + "loss": 0.1119, + "step": 8905 + }, + { + "epoch": 1.4429682436811406, + "grad_norm": 0.8735939264297485, + "learning_rate": 2.7749442979767276e-06, + "loss": 0.1109, + "step": 8906 + }, + { + "epoch": 1.4431302657161373, + "grad_norm": 0.7629880309104919, + "learning_rate": 2.7745096423363304e-06, + "loss": 0.096, + "step": 8907 + }, + { + "epoch": 1.443292287751134, + "grad_norm": 0.9925363659858704, + "learning_rate": 2.7740749782966016e-06, + "loss": 0.1199, + "step": 8908 + }, + { + "epoch": 1.443454309786131, + "grad_norm": 0.8804534077644348, + "learning_rate": 2.7736403058708418e-06, + "loss": 0.1041, + "step": 8909 + }, + { + "epoch": 1.4436163318211277, + "grad_norm": 0.8817947506904602, + "learning_rate": 2.7732056250723505e-06, + "loss": 0.1021, + "step": 8910 + }, + { + "epoch": 1.4437783538561244, + "grad_norm": 0.8750995397567749, + "learning_rate": 2.7727709359144285e-06, + "loss": 0.1109, + "step": 8911 + }, + { + "epoch": 1.4439403758911211, + "grad_norm": 0.8417905569076538, + "learning_rate": 2.7723362384103757e-06, + "loss": 0.0968, + "step": 8912 + }, + { + "epoch": 1.4441023979261178, + "grad_norm": 0.8828087449073792, + "learning_rate": 2.771901532573493e-06, + "loss": 0.1105, + "step": 8913 + }, + { + "epoch": 1.4442644199611148, + "grad_norm": 0.8812400102615356, + "learning_rate": 2.771466818417082e-06, + "loss": 0.1137, + "step": 8914 + }, + { + "epoch": 1.4444264419961115, + "grad_norm": 1.0078636407852173, + "learning_rate": 2.7710320959544425e-06, + "loss": 0.1276, + "step": 8915 + }, + { + "epoch": 1.4445884640311082, + "grad_norm": 0.8423323035240173, + "learning_rate": 2.7705973651988777e-06, + "loss": 0.0995, + "step": 8916 + }, + { + "epoch": 1.4447504860661051, + "grad_norm": 0.6651060581207275, + "learning_rate": 2.7701626261636878e-06, + "loss": 0.0942, + "step": 8917 + }, + { + "epoch": 1.4449125081011016, + "grad_norm": 0.8124906420707703, + "learning_rate": 2.769727878862175e-06, + "loss": 0.0977, + "step": 8918 + }, + { + "epoch": 1.4450745301360985, + "grad_norm": 0.784057080745697, + "learning_rate": 2.7692931233076424e-06, + "loss": 0.0979, + "step": 8919 + }, + { + "epoch": 1.4452365521710953, + "grad_norm": 0.825549840927124, + "learning_rate": 2.768858359513392e-06, + "loss": 0.1119, + "step": 8920 + }, + { + "epoch": 1.445398574206092, + "grad_norm": 0.7953062653541565, + "learning_rate": 2.7684235874927264e-06, + "loss": 0.1072, + "step": 8921 + }, + { + "epoch": 1.445560596241089, + "grad_norm": 0.8686825633049011, + "learning_rate": 2.767988807258948e-06, + "loss": 0.103, + "step": 8922 + }, + { + "epoch": 1.4457226182760856, + "grad_norm": 0.8071175813674927, + "learning_rate": 2.7675540188253606e-06, + "loss": 0.1091, + "step": 8923 + }, + { + "epoch": 1.4458846403110823, + "grad_norm": 0.8495711088180542, + "learning_rate": 2.7671192222052685e-06, + "loss": 0.1146, + "step": 8924 + }, + { + "epoch": 1.446046662346079, + "grad_norm": 0.7676787376403809, + "learning_rate": 2.7666844174119738e-06, + "loss": 0.0902, + "step": 8925 + }, + { + "epoch": 1.4462086843810757, + "grad_norm": 0.950420618057251, + "learning_rate": 2.7662496044587817e-06, + "loss": 0.1138, + "step": 8926 + }, + { + "epoch": 1.4463707064160727, + "grad_norm": 0.7517897486686707, + "learning_rate": 2.765814783358996e-06, + "loss": 0.094, + "step": 8927 + }, + { + "epoch": 1.4465327284510694, + "grad_norm": 0.8687278032302856, + "learning_rate": 2.765379954125921e-06, + "loss": 0.1039, + "step": 8928 + }, + { + "epoch": 1.446694750486066, + "grad_norm": 0.9656805992126465, + "learning_rate": 2.764945116772862e-06, + "loss": 0.1127, + "step": 8929 + }, + { + "epoch": 1.4468567725210628, + "grad_norm": 0.9119703769683838, + "learning_rate": 2.764510271313123e-06, + "loss": 0.117, + "step": 8930 + }, + { + "epoch": 1.4470187945560595, + "grad_norm": 0.7118834853172302, + "learning_rate": 2.7640754177600105e-06, + "loss": 0.0926, + "step": 8931 + }, + { + "epoch": 1.4471808165910565, + "grad_norm": 0.7804062962532043, + "learning_rate": 2.7636405561268286e-06, + "loss": 0.1061, + "step": 8932 + }, + { + "epoch": 1.4473428386260532, + "grad_norm": 0.8037486672401428, + "learning_rate": 2.763205686426884e-06, + "loss": 0.1024, + "step": 8933 + }, + { + "epoch": 1.4475048606610499, + "grad_norm": 0.9153349995613098, + "learning_rate": 2.7627708086734827e-06, + "loss": 0.1165, + "step": 8934 + }, + { + "epoch": 1.4476668826960466, + "grad_norm": 0.8715000748634338, + "learning_rate": 2.7623359228799295e-06, + "loss": 0.1049, + "step": 8935 + }, + { + "epoch": 1.4478289047310433, + "grad_norm": 1.018729329109192, + "learning_rate": 2.7619010290595333e-06, + "loss": 0.1107, + "step": 8936 + }, + { + "epoch": 1.4479909267660402, + "grad_norm": 0.7773792743682861, + "learning_rate": 2.761466127225598e-06, + "loss": 0.0909, + "step": 8937 + }, + { + "epoch": 1.448152948801037, + "grad_norm": 0.722775399684906, + "learning_rate": 2.7610312173914334e-06, + "loss": 0.0939, + "step": 8938 + }, + { + "epoch": 1.4483149708360337, + "grad_norm": 0.8534601926803589, + "learning_rate": 2.760596299570344e-06, + "loss": 0.0987, + "step": 8939 + }, + { + "epoch": 1.4484769928710304, + "grad_norm": 0.81373131275177, + "learning_rate": 2.760161373775639e-06, + "loss": 0.101, + "step": 8940 + }, + { + "epoch": 1.448639014906027, + "grad_norm": 0.8647781014442444, + "learning_rate": 2.7597264400206255e-06, + "loss": 0.1064, + "step": 8941 + }, + { + "epoch": 1.448801036941024, + "grad_norm": 0.9041435122489929, + "learning_rate": 2.7592914983186113e-06, + "loss": 0.104, + "step": 8942 + }, + { + "epoch": 1.4489630589760207, + "grad_norm": 0.7579626441001892, + "learning_rate": 2.7588565486829054e-06, + "loss": 0.1023, + "step": 8943 + }, + { + "epoch": 1.4491250810110174, + "grad_norm": 0.7907978892326355, + "learning_rate": 2.758421591126814e-06, + "loss": 0.0998, + "step": 8944 + }, + { + "epoch": 1.4492871030460144, + "grad_norm": 0.7959480285644531, + "learning_rate": 2.757986625663649e-06, + "loss": 0.1035, + "step": 8945 + }, + { + "epoch": 1.449449125081011, + "grad_norm": 0.8328735828399658, + "learning_rate": 2.757551652306717e-06, + "loss": 0.1081, + "step": 8946 + }, + { + "epoch": 1.4496111471160078, + "grad_norm": 0.8220754265785217, + "learning_rate": 2.757116671069327e-06, + "loss": 0.0985, + "step": 8947 + }, + { + "epoch": 1.4497731691510045, + "grad_norm": 0.7394413352012634, + "learning_rate": 2.7566816819647897e-06, + "loss": 0.0931, + "step": 8948 + }, + { + "epoch": 1.4499351911860012, + "grad_norm": 0.8674811124801636, + "learning_rate": 2.756246685006414e-06, + "loss": 0.1102, + "step": 8949 + }, + { + "epoch": 1.4500972132209982, + "grad_norm": 0.944241464138031, + "learning_rate": 2.7558116802075095e-06, + "loss": 0.1129, + "step": 8950 + }, + { + "epoch": 1.4502592352559949, + "grad_norm": 0.8318021297454834, + "learning_rate": 2.755376667581387e-06, + "loss": 0.1066, + "step": 8951 + }, + { + "epoch": 1.4504212572909916, + "grad_norm": 0.7419362664222717, + "learning_rate": 2.754941647141357e-06, + "loss": 0.0957, + "step": 8952 + }, + { + "epoch": 1.4505832793259883, + "grad_norm": 0.7914148569107056, + "learning_rate": 2.754506618900729e-06, + "loss": 0.1047, + "step": 8953 + }, + { + "epoch": 1.450745301360985, + "grad_norm": 0.8463177680969238, + "learning_rate": 2.754071582872814e-06, + "loss": 0.1088, + "step": 8954 + }, + { + "epoch": 1.450907323395982, + "grad_norm": 0.9250964522361755, + "learning_rate": 2.753636539070924e-06, + "loss": 0.118, + "step": 8955 + }, + { + "epoch": 1.4510693454309787, + "grad_norm": 0.8688222169876099, + "learning_rate": 2.753201487508369e-06, + "loss": 0.0961, + "step": 8956 + }, + { + "epoch": 1.4512313674659754, + "grad_norm": 0.8172348737716675, + "learning_rate": 2.752766428198462e-06, + "loss": 0.1063, + "step": 8957 + }, + { + "epoch": 1.451393389500972, + "grad_norm": 0.8788450360298157, + "learning_rate": 2.7523313611545133e-06, + "loss": 0.1075, + "step": 8958 + }, + { + "epoch": 1.4515554115359688, + "grad_norm": 0.8048200011253357, + "learning_rate": 2.7518962863898356e-06, + "loss": 0.1012, + "step": 8959 + }, + { + "epoch": 1.4517174335709657, + "grad_norm": 0.8715628981590271, + "learning_rate": 2.7514612039177422e-06, + "loss": 0.1027, + "step": 8960 + }, + { + "epoch": 1.4518794556059624, + "grad_norm": 0.9554488658905029, + "learning_rate": 2.7510261137515437e-06, + "loss": 0.1079, + "step": 8961 + }, + { + "epoch": 1.4520414776409591, + "grad_norm": 0.908086895942688, + "learning_rate": 2.7505910159045534e-06, + "loss": 0.1199, + "step": 8962 + }, + { + "epoch": 1.4522034996759559, + "grad_norm": 0.7679954171180725, + "learning_rate": 2.750155910390085e-06, + "loss": 0.0917, + "step": 8963 + }, + { + "epoch": 1.4523655217109526, + "grad_norm": 0.8922917246818542, + "learning_rate": 2.74972079722145e-06, + "loss": 0.1107, + "step": 8964 + }, + { + "epoch": 1.4525275437459495, + "grad_norm": 0.8348422050476074, + "learning_rate": 2.7492856764119644e-06, + "loss": 0.1075, + "step": 8965 + }, + { + "epoch": 1.4526895657809462, + "grad_norm": 0.852973997592926, + "learning_rate": 2.7488505479749395e-06, + "loss": 0.1131, + "step": 8966 + }, + { + "epoch": 1.452851587815943, + "grad_norm": 0.8772859573364258, + "learning_rate": 2.7484154119236906e-06, + "loss": 0.1085, + "step": 8967 + }, + { + "epoch": 1.4530136098509399, + "grad_norm": 0.8057224154472351, + "learning_rate": 2.74798026827153e-06, + "loss": 0.1, + "step": 8968 + }, + { + "epoch": 1.4531756318859366, + "grad_norm": 0.8312748670578003, + "learning_rate": 2.7475451170317748e-06, + "loss": 0.1064, + "step": 8969 + }, + { + "epoch": 1.4533376539209333, + "grad_norm": 0.8106637001037598, + "learning_rate": 2.747109958217737e-06, + "loss": 0.106, + "step": 8970 + }, + { + "epoch": 1.45349967595593, + "grad_norm": 0.8003639578819275, + "learning_rate": 2.7466747918427326e-06, + "loss": 0.0978, + "step": 8971 + }, + { + "epoch": 1.4536616979909267, + "grad_norm": 0.8317378759384155, + "learning_rate": 2.746239617920077e-06, + "loss": 0.0995, + "step": 8972 + }, + { + "epoch": 1.4538237200259236, + "grad_norm": 0.8461683988571167, + "learning_rate": 2.7458044364630844e-06, + "loss": 0.0959, + "step": 8973 + }, + { + "epoch": 1.4539857420609203, + "grad_norm": 0.9303465485572815, + "learning_rate": 2.745369247485072e-06, + "loss": 0.1011, + "step": 8974 + }, + { + "epoch": 1.454147764095917, + "grad_norm": 0.7978488802909851, + "learning_rate": 2.7449340509993526e-06, + "loss": 0.0928, + "step": 8975 + }, + { + "epoch": 1.4543097861309138, + "grad_norm": 0.7847487330436707, + "learning_rate": 2.7444988470192457e-06, + "loss": 0.0917, + "step": 8976 + }, + { + "epoch": 1.4544718081659105, + "grad_norm": 0.7997845411300659, + "learning_rate": 2.744063635558065e-06, + "loss": 0.0966, + "step": 8977 + }, + { + "epoch": 1.4546338302009074, + "grad_norm": 0.7787443995475769, + "learning_rate": 2.743628416629128e-06, + "loss": 0.0986, + "step": 8978 + }, + { + "epoch": 1.4547958522359041, + "grad_norm": 0.9163244366645813, + "learning_rate": 2.7431931902457504e-06, + "loss": 0.1012, + "step": 8979 + }, + { + "epoch": 1.4549578742709008, + "grad_norm": 0.7863480448722839, + "learning_rate": 2.7427579564212496e-06, + "loss": 0.095, + "step": 8980 + }, + { + "epoch": 1.4551198963058976, + "grad_norm": 0.8067967295646667, + "learning_rate": 2.7423227151689436e-06, + "loss": 0.0963, + "step": 8981 + }, + { + "epoch": 1.4552819183408943, + "grad_norm": 0.8217342495918274, + "learning_rate": 2.7418874665021483e-06, + "loss": 0.1071, + "step": 8982 + }, + { + "epoch": 1.4554439403758912, + "grad_norm": 0.754622757434845, + "learning_rate": 2.7414522104341827e-06, + "loss": 0.0997, + "step": 8983 + }, + { + "epoch": 1.455605962410888, + "grad_norm": 0.8360654711723328, + "learning_rate": 2.7410169469783632e-06, + "loss": 0.1013, + "step": 8984 + }, + { + "epoch": 1.4557679844458846, + "grad_norm": 0.8221597075462341, + "learning_rate": 2.740581676148008e-06, + "loss": 0.101, + "step": 8985 + }, + { + "epoch": 1.4559300064808813, + "grad_norm": 0.9259410500526428, + "learning_rate": 2.7401463979564365e-06, + "loss": 0.1103, + "step": 8986 + }, + { + "epoch": 1.456092028515878, + "grad_norm": 0.8288673162460327, + "learning_rate": 2.739711112416966e-06, + "loss": 0.0889, + "step": 8987 + }, + { + "epoch": 1.456254050550875, + "grad_norm": 0.8852518796920776, + "learning_rate": 2.7392758195429153e-06, + "loss": 0.1054, + "step": 8988 + }, + { + "epoch": 1.4564160725858717, + "grad_norm": 0.7813482284545898, + "learning_rate": 2.738840519347604e-06, + "loss": 0.104, + "step": 8989 + }, + { + "epoch": 1.4565780946208684, + "grad_norm": 0.753773033618927, + "learning_rate": 2.73840521184435e-06, + "loss": 0.0911, + "step": 8990 + }, + { + "epoch": 1.4567401166558653, + "grad_norm": 0.8681461215019226, + "learning_rate": 2.737969897046475e-06, + "loss": 0.107, + "step": 8991 + }, + { + "epoch": 1.4569021386908618, + "grad_norm": 0.8012184500694275, + "learning_rate": 2.737534574967295e-06, + "loss": 0.1021, + "step": 8992 + }, + { + "epoch": 1.4570641607258588, + "grad_norm": 0.9006675481796265, + "learning_rate": 2.7370992456201333e-06, + "loss": 0.1089, + "step": 8993 + }, + { + "epoch": 1.4572261827608555, + "grad_norm": 0.8169563412666321, + "learning_rate": 2.7366639090183076e-06, + "loss": 0.1066, + "step": 8994 + }, + { + "epoch": 1.4573882047958522, + "grad_norm": 0.9039627313613892, + "learning_rate": 2.7362285651751396e-06, + "loss": 0.1084, + "step": 8995 + }, + { + "epoch": 1.4575502268308491, + "grad_norm": 1.059561014175415, + "learning_rate": 2.7357932141039494e-06, + "loss": 0.1334, + "step": 8996 + }, + { + "epoch": 1.4577122488658458, + "grad_norm": 0.8857733011245728, + "learning_rate": 2.7353578558180566e-06, + "loss": 0.1026, + "step": 8997 + }, + { + "epoch": 1.4578742709008425, + "grad_norm": 0.9009446501731873, + "learning_rate": 2.7349224903307836e-06, + "loss": 0.1097, + "step": 8998 + }, + { + "epoch": 1.4580362929358393, + "grad_norm": 0.794425368309021, + "learning_rate": 2.7344871176554498e-06, + "loss": 0.0953, + "step": 8999 + }, + { + "epoch": 1.458198314970836, + "grad_norm": 0.752216637134552, + "learning_rate": 2.734051737805379e-06, + "loss": 0.1114, + "step": 9000 + }, + { + "epoch": 1.458360337005833, + "grad_norm": 0.8077507019042969, + "learning_rate": 2.733616350793891e-06, + "loss": 0.0994, + "step": 9001 + }, + { + "epoch": 1.4585223590408296, + "grad_norm": 0.8075626492500305, + "learning_rate": 2.733180956634308e-06, + "loss": 0.1092, + "step": 9002 + }, + { + "epoch": 1.4586843810758263, + "grad_norm": 0.8624060153961182, + "learning_rate": 2.7327455553399523e-06, + "loss": 0.1134, + "step": 9003 + }, + { + "epoch": 1.458846403110823, + "grad_norm": 0.8211466073989868, + "learning_rate": 2.7323101469241454e-06, + "loss": 0.102, + "step": 9004 + }, + { + "epoch": 1.4590084251458197, + "grad_norm": 0.838489830493927, + "learning_rate": 2.7318747314002108e-06, + "loss": 0.1028, + "step": 9005 + }, + { + "epoch": 1.4591704471808167, + "grad_norm": 0.7504817247390747, + "learning_rate": 2.7314393087814693e-06, + "loss": 0.103, + "step": 9006 + }, + { + "epoch": 1.4593324692158134, + "grad_norm": 0.8312764167785645, + "learning_rate": 2.731003879081246e-06, + "loss": 0.1052, + "step": 9007 + }, + { + "epoch": 1.45949449125081, + "grad_norm": 0.7648634910583496, + "learning_rate": 2.7305684423128633e-06, + "loss": 0.0971, + "step": 9008 + }, + { + "epoch": 1.4596565132858068, + "grad_norm": 0.7204511165618896, + "learning_rate": 2.7301329984896435e-06, + "loss": 0.088, + "step": 9009 + }, + { + "epoch": 1.4598185353208035, + "grad_norm": 0.8412272334098816, + "learning_rate": 2.729697547624911e-06, + "loss": 0.1028, + "step": 9010 + }, + { + "epoch": 1.4599805573558005, + "grad_norm": 0.7355968952178955, + "learning_rate": 2.7292620897319892e-06, + "loss": 0.0942, + "step": 9011 + }, + { + "epoch": 1.4601425793907972, + "grad_norm": 0.8538913726806641, + "learning_rate": 2.7288266248242025e-06, + "loss": 0.0954, + "step": 9012 + }, + { + "epoch": 1.4603046014257939, + "grad_norm": 0.7657922506332397, + "learning_rate": 2.7283911529148753e-06, + "loss": 0.1011, + "step": 9013 + }, + { + "epoch": 1.4604666234607906, + "grad_norm": 0.951962947845459, + "learning_rate": 2.7279556740173306e-06, + "loss": 0.1025, + "step": 9014 + }, + { + "epoch": 1.4606286454957873, + "grad_norm": 0.7168994545936584, + "learning_rate": 2.727520188144895e-06, + "loss": 0.0845, + "step": 9015 + }, + { + "epoch": 1.4607906675307842, + "grad_norm": 0.7955917716026306, + "learning_rate": 2.7270846953108913e-06, + "loss": 0.0934, + "step": 9016 + }, + { + "epoch": 1.460952689565781, + "grad_norm": 0.8339834213256836, + "learning_rate": 2.7266491955286457e-06, + "loss": 0.1052, + "step": 9017 + }, + { + "epoch": 1.4611147116007777, + "grad_norm": 0.9418988823890686, + "learning_rate": 2.7262136888114833e-06, + "loss": 0.1146, + "step": 9018 + }, + { + "epoch": 1.4612767336357746, + "grad_norm": 0.7261226177215576, + "learning_rate": 2.725778175172729e-06, + "loss": 0.0869, + "step": 9019 + }, + { + "epoch": 1.4614387556707713, + "grad_norm": 1.1076301336288452, + "learning_rate": 2.72534265462571e-06, + "loss": 0.1158, + "step": 9020 + }, + { + "epoch": 1.461600777705768, + "grad_norm": 0.8176051378250122, + "learning_rate": 2.7249071271837503e-06, + "loss": 0.1012, + "step": 9021 + }, + { + "epoch": 1.4617627997407647, + "grad_norm": 0.7100493311882019, + "learning_rate": 2.7244715928601774e-06, + "loss": 0.088, + "step": 9022 + }, + { + "epoch": 1.4619248217757614, + "grad_norm": 0.8428002595901489, + "learning_rate": 2.7240360516683155e-06, + "loss": 0.0962, + "step": 9023 + }, + { + "epoch": 1.4620868438107584, + "grad_norm": 0.8279069066047668, + "learning_rate": 2.723600503621494e-06, + "loss": 0.1123, + "step": 9024 + }, + { + "epoch": 1.462248865845755, + "grad_norm": 0.8103442788124084, + "learning_rate": 2.723164948733038e-06, + "loss": 0.1006, + "step": 9025 + }, + { + "epoch": 1.4624108878807518, + "grad_norm": 0.8227192163467407, + "learning_rate": 2.7227293870162742e-06, + "loss": 0.1054, + "step": 9026 + }, + { + "epoch": 1.4625729099157485, + "grad_norm": 0.7976576089859009, + "learning_rate": 2.7222938184845304e-06, + "loss": 0.0935, + "step": 9027 + }, + { + "epoch": 1.4627349319507452, + "grad_norm": 0.8542677164077759, + "learning_rate": 2.721858243151133e-06, + "loss": 0.1126, + "step": 9028 + }, + { + "epoch": 1.4628969539857422, + "grad_norm": 0.913292407989502, + "learning_rate": 2.7214226610294114e-06, + "loss": 0.1131, + "step": 9029 + }, + { + "epoch": 1.4630589760207389, + "grad_norm": 0.9596574902534485, + "learning_rate": 2.7209870721326915e-06, + "loss": 0.1056, + "step": 9030 + }, + { + "epoch": 1.4632209980557356, + "grad_norm": 0.8938264846801758, + "learning_rate": 2.7205514764743025e-06, + "loss": 0.1035, + "step": 9031 + }, + { + "epoch": 1.4633830200907323, + "grad_norm": 0.7663517594337463, + "learning_rate": 2.7201158740675714e-06, + "loss": 0.0973, + "step": 9032 + }, + { + "epoch": 1.463545042125729, + "grad_norm": 0.8649742007255554, + "learning_rate": 2.7196802649258273e-06, + "loss": 0.0919, + "step": 9033 + }, + { + "epoch": 1.463707064160726, + "grad_norm": 0.9753785729408264, + "learning_rate": 2.719244649062399e-06, + "loss": 0.1166, + "step": 9034 + }, + { + "epoch": 1.4638690861957226, + "grad_norm": 0.7970572710037231, + "learning_rate": 2.7188090264906147e-06, + "loss": 0.0942, + "step": 9035 + }, + { + "epoch": 1.4640311082307194, + "grad_norm": 0.8879532217979431, + "learning_rate": 2.718373397223804e-06, + "loss": 0.1078, + "step": 9036 + }, + { + "epoch": 1.464193130265716, + "grad_norm": 0.7902569770812988, + "learning_rate": 2.7179377612752954e-06, + "loss": 0.1028, + "step": 9037 + }, + { + "epoch": 1.4643551523007128, + "grad_norm": 0.7548561096191406, + "learning_rate": 2.71750211865842e-06, + "loss": 0.0948, + "step": 9038 + }, + { + "epoch": 1.4645171743357097, + "grad_norm": 0.7446286082267761, + "learning_rate": 2.7170664693865045e-06, + "loss": 0.0925, + "step": 9039 + }, + { + "epoch": 1.4646791963707064, + "grad_norm": 0.7989212274551392, + "learning_rate": 2.7166308134728814e-06, + "loss": 0.0981, + "step": 9040 + }, + { + "epoch": 1.4648412184057031, + "grad_norm": 0.8018403053283691, + "learning_rate": 2.7161951509308785e-06, + "loss": 0.0979, + "step": 9041 + }, + { + "epoch": 1.4650032404407, + "grad_norm": 0.7907342314720154, + "learning_rate": 2.715759481773828e-06, + "loss": 0.1051, + "step": 9042 + }, + { + "epoch": 1.4651652624756966, + "grad_norm": 0.8176236152648926, + "learning_rate": 2.7153238060150592e-06, + "loss": 0.1035, + "step": 9043 + }, + { + "epoch": 1.4653272845106935, + "grad_norm": 0.8785366415977478, + "learning_rate": 2.7148881236679035e-06, + "loss": 0.1067, + "step": 9044 + }, + { + "epoch": 1.4654893065456902, + "grad_norm": 0.8045976161956787, + "learning_rate": 2.7144524347456906e-06, + "loss": 0.1027, + "step": 9045 + }, + { + "epoch": 1.465651328580687, + "grad_norm": 0.7513163685798645, + "learning_rate": 2.7140167392617527e-06, + "loss": 0.0968, + "step": 9046 + }, + { + "epoch": 1.4658133506156839, + "grad_norm": 0.9477952122688293, + "learning_rate": 2.7135810372294204e-06, + "loss": 0.1219, + "step": 9047 + }, + { + "epoch": 1.4659753726506806, + "grad_norm": 0.7233965992927551, + "learning_rate": 2.7131453286620253e-06, + "loss": 0.0966, + "step": 9048 + }, + { + "epoch": 1.4661373946856773, + "grad_norm": 0.8609042763710022, + "learning_rate": 2.7127096135728987e-06, + "loss": 0.1081, + "step": 9049 + }, + { + "epoch": 1.466299416720674, + "grad_norm": 0.7274024486541748, + "learning_rate": 2.712273891975372e-06, + "loss": 0.097, + "step": 9050 + }, + { + "epoch": 1.4664614387556707, + "grad_norm": 0.8457092046737671, + "learning_rate": 2.7118381638827795e-06, + "loss": 0.1131, + "step": 9051 + }, + { + "epoch": 1.4666234607906676, + "grad_norm": 0.8323071599006653, + "learning_rate": 2.7114024293084502e-06, + "loss": 0.1064, + "step": 9052 + }, + { + "epoch": 1.4667854828256643, + "grad_norm": 0.6933931112289429, + "learning_rate": 2.710966688265719e-06, + "loss": 0.0793, + "step": 9053 + }, + { + "epoch": 1.466947504860661, + "grad_norm": 0.8105365633964539, + "learning_rate": 2.710530940767917e-06, + "loss": 0.0991, + "step": 9054 + }, + { + "epoch": 1.4671095268956578, + "grad_norm": 0.7702953815460205, + "learning_rate": 2.7100951868283785e-06, + "loss": 0.1003, + "step": 9055 + }, + { + "epoch": 1.4672715489306545, + "grad_norm": 0.8761657476425171, + "learning_rate": 2.7096594264604357e-06, + "loss": 0.1091, + "step": 9056 + }, + { + "epoch": 1.4674335709656514, + "grad_norm": 0.8410671353340149, + "learning_rate": 2.709223659677421e-06, + "loss": 0.1059, + "step": 9057 + }, + { + "epoch": 1.4675955930006481, + "grad_norm": 0.7424046993255615, + "learning_rate": 2.7087878864926696e-06, + "loss": 0.0873, + "step": 9058 + }, + { + "epoch": 1.4677576150356448, + "grad_norm": 0.8431706428527832, + "learning_rate": 2.7083521069195134e-06, + "loss": 0.1037, + "step": 9059 + }, + { + "epoch": 1.4679196370706415, + "grad_norm": 0.7804480195045471, + "learning_rate": 2.707916320971288e-06, + "loss": 0.0958, + "step": 9060 + }, + { + "epoch": 1.4680816591056383, + "grad_norm": 0.7562078237533569, + "learning_rate": 2.707480528661325e-06, + "loss": 0.0974, + "step": 9061 + }, + { + "epoch": 1.4682436811406352, + "grad_norm": 0.7641584277153015, + "learning_rate": 2.7070447300029607e-06, + "loss": 0.095, + "step": 9062 + }, + { + "epoch": 1.468405703175632, + "grad_norm": 0.9363652467727661, + "learning_rate": 2.7066089250095284e-06, + "loss": 0.1097, + "step": 9063 + }, + { + "epoch": 1.4685677252106286, + "grad_norm": 1.033353567123413, + "learning_rate": 2.706173113694363e-06, + "loss": 0.1097, + "step": 9064 + }, + { + "epoch": 1.4687297472456253, + "grad_norm": 0.8495287299156189, + "learning_rate": 2.705737296070799e-06, + "loss": 0.1042, + "step": 9065 + }, + { + "epoch": 1.468891769280622, + "grad_norm": 0.7492153644561768, + "learning_rate": 2.705301472152172e-06, + "loss": 0.0886, + "step": 9066 + }, + { + "epoch": 1.469053791315619, + "grad_norm": 0.9077339768409729, + "learning_rate": 2.7048656419518168e-06, + "loss": 0.1071, + "step": 9067 + }, + { + "epoch": 1.4692158133506157, + "grad_norm": 0.9256390333175659, + "learning_rate": 2.7044298054830687e-06, + "loss": 0.1117, + "step": 9068 + }, + { + "epoch": 1.4693778353856124, + "grad_norm": 0.8640137910842896, + "learning_rate": 2.703993962759263e-06, + "loss": 0.1049, + "step": 9069 + }, + { + "epoch": 1.4695398574206093, + "grad_norm": 0.8645317554473877, + "learning_rate": 2.703558113793736e-06, + "loss": 0.1036, + "step": 9070 + }, + { + "epoch": 1.469701879455606, + "grad_norm": 0.8858664631843567, + "learning_rate": 2.703122258599823e-06, + "loss": 0.1113, + "step": 9071 + }, + { + "epoch": 1.4698639014906028, + "grad_norm": 0.856381893157959, + "learning_rate": 2.7026863971908607e-06, + "loss": 0.0984, + "step": 9072 + }, + { + "epoch": 1.4700259235255995, + "grad_norm": 0.8103935718536377, + "learning_rate": 2.702250529580185e-06, + "loss": 0.1053, + "step": 9073 + }, + { + "epoch": 1.4701879455605962, + "grad_norm": 0.8574365377426147, + "learning_rate": 2.7018146557811325e-06, + "loss": 0.0997, + "step": 9074 + }, + { + "epoch": 1.470349967595593, + "grad_norm": 0.8630319833755493, + "learning_rate": 2.70137877580704e-06, + "loss": 0.1014, + "step": 9075 + }, + { + "epoch": 1.4705119896305898, + "grad_norm": 0.8821179866790771, + "learning_rate": 2.7009428896712443e-06, + "loss": 0.1108, + "step": 9076 + }, + { + "epoch": 1.4706740116655865, + "grad_norm": 0.8524888157844543, + "learning_rate": 2.7005069973870823e-06, + "loss": 0.1072, + "step": 9077 + }, + { + "epoch": 1.4708360337005832, + "grad_norm": 0.790708601474762, + "learning_rate": 2.700071098967892e-06, + "loss": 0.0959, + "step": 9078 + }, + { + "epoch": 1.47099805573558, + "grad_norm": 0.7711350917816162, + "learning_rate": 2.6996351944270096e-06, + "loss": 0.0944, + "step": 9079 + }, + { + "epoch": 1.471160077770577, + "grad_norm": 0.862248957157135, + "learning_rate": 2.699199283777773e-06, + "loss": 0.1142, + "step": 9080 + }, + { + "epoch": 1.4713220998055736, + "grad_norm": 0.7187860608100891, + "learning_rate": 2.698763367033521e-06, + "loss": 0.0918, + "step": 9081 + }, + { + "epoch": 1.4714841218405703, + "grad_norm": 0.7436991333961487, + "learning_rate": 2.6983274442075914e-06, + "loss": 0.0882, + "step": 9082 + }, + { + "epoch": 1.471646143875567, + "grad_norm": 0.7763605117797852, + "learning_rate": 2.6978915153133207e-06, + "loss": 0.1046, + "step": 9083 + }, + { + "epoch": 1.4718081659105637, + "grad_norm": 0.8496381044387817, + "learning_rate": 2.69745558036405e-06, + "loss": 0.1087, + "step": 9084 + }, + { + "epoch": 1.4719701879455607, + "grad_norm": 0.8233307003974915, + "learning_rate": 2.6970196393731146e-06, + "loss": 0.1056, + "step": 9085 + }, + { + "epoch": 1.4721322099805574, + "grad_norm": 0.806120753288269, + "learning_rate": 2.6965836923538568e-06, + "loss": 0.1013, + "step": 9086 + }, + { + "epoch": 1.472294232015554, + "grad_norm": 0.7680891156196594, + "learning_rate": 2.696147739319613e-06, + "loss": 0.0902, + "step": 9087 + }, + { + "epoch": 1.4724562540505508, + "grad_norm": 0.8429840803146362, + "learning_rate": 2.695711780283723e-06, + "loss": 0.1087, + "step": 9088 + }, + { + "epoch": 1.4726182760855475, + "grad_norm": 0.8204013109207153, + "learning_rate": 2.695275815259526e-06, + "loss": 0.1044, + "step": 9089 + }, + { + "epoch": 1.4727802981205445, + "grad_norm": 0.7914881110191345, + "learning_rate": 2.694839844260361e-06, + "loss": 0.1021, + "step": 9090 + }, + { + "epoch": 1.4729423201555412, + "grad_norm": 0.8489986658096313, + "learning_rate": 2.69440386729957e-06, + "loss": 0.1064, + "step": 9091 + }, + { + "epoch": 1.4731043421905379, + "grad_norm": 0.8216438293457031, + "learning_rate": 2.6939678843904897e-06, + "loss": 0.1061, + "step": 9092 + }, + { + "epoch": 1.4732663642255348, + "grad_norm": 0.8662644028663635, + "learning_rate": 2.6935318955464624e-06, + "loss": 0.1127, + "step": 9093 + }, + { + "epoch": 1.4734283862605313, + "grad_norm": 0.9734071493148804, + "learning_rate": 2.6930959007808268e-06, + "loss": 0.127, + "step": 9094 + }, + { + "epoch": 1.4735904082955282, + "grad_norm": 0.9388130307197571, + "learning_rate": 2.692659900106924e-06, + "loss": 0.111, + "step": 9095 + }, + { + "epoch": 1.473752430330525, + "grad_norm": 0.8008855581283569, + "learning_rate": 2.6922238935380946e-06, + "loss": 0.1069, + "step": 9096 + }, + { + "epoch": 1.4739144523655217, + "grad_norm": 0.8032642602920532, + "learning_rate": 2.691787881087679e-06, + "loss": 0.0985, + "step": 9097 + }, + { + "epoch": 1.4740764744005186, + "grad_norm": 0.7424758672714233, + "learning_rate": 2.691351862769018e-06, + "loss": 0.1027, + "step": 9098 + }, + { + "epoch": 1.4742384964355153, + "grad_norm": 0.8054472208023071, + "learning_rate": 2.6909158385954544e-06, + "loss": 0.105, + "step": 9099 + }, + { + "epoch": 1.474400518470512, + "grad_norm": 0.7597915530204773, + "learning_rate": 2.6904798085803276e-06, + "loss": 0.0986, + "step": 9100 + }, + { + "epoch": 1.4745625405055087, + "grad_norm": 0.7698268294334412, + "learning_rate": 2.6900437727369793e-06, + "loss": 0.0944, + "step": 9101 + }, + { + "epoch": 1.4747245625405054, + "grad_norm": 0.7732874155044556, + "learning_rate": 2.689607731078751e-06, + "loss": 0.1, + "step": 9102 + }, + { + "epoch": 1.4748865845755024, + "grad_norm": 0.7452662587165833, + "learning_rate": 2.6891716836189857e-06, + "loss": 0.0938, + "step": 9103 + }, + { + "epoch": 1.475048606610499, + "grad_norm": 0.7833341956138611, + "learning_rate": 2.688735630371024e-06, + "loss": 0.1036, + "step": 9104 + }, + { + "epoch": 1.4752106286454958, + "grad_norm": 0.7782242894172668, + "learning_rate": 2.6882995713482097e-06, + "loss": 0.0966, + "step": 9105 + }, + { + "epoch": 1.4753726506804925, + "grad_norm": 0.7549180388450623, + "learning_rate": 2.6878635065638843e-06, + "loss": 0.1022, + "step": 9106 + }, + { + "epoch": 1.4755346727154892, + "grad_norm": 0.8578351736068726, + "learning_rate": 2.687427436031389e-06, + "loss": 0.1039, + "step": 9107 + }, + { + "epoch": 1.4756966947504861, + "grad_norm": 0.7809894680976868, + "learning_rate": 2.6869913597640686e-06, + "loss": 0.101, + "step": 9108 + }, + { + "epoch": 1.4758587167854829, + "grad_norm": 0.7138332724571228, + "learning_rate": 2.6865552777752644e-06, + "loss": 0.0917, + "step": 9109 + }, + { + "epoch": 1.4760207388204796, + "grad_norm": 0.8516620397567749, + "learning_rate": 2.6861191900783213e-06, + "loss": 0.1023, + "step": 9110 + }, + { + "epoch": 1.4761827608554763, + "grad_norm": 0.8884662389755249, + "learning_rate": 2.6856830966865804e-06, + "loss": 0.1067, + "step": 9111 + }, + { + "epoch": 1.476344782890473, + "grad_norm": 0.9057988524436951, + "learning_rate": 2.685246997613386e-06, + "loss": 0.1059, + "step": 9112 + }, + { + "epoch": 1.47650680492547, + "grad_norm": 0.7630183696746826, + "learning_rate": 2.684810892872083e-06, + "loss": 0.1064, + "step": 9113 + }, + { + "epoch": 1.4766688269604666, + "grad_norm": 0.751277506351471, + "learning_rate": 2.6843747824760125e-06, + "loss": 0.0947, + "step": 9114 + }, + { + "epoch": 1.4768308489954634, + "grad_norm": 0.8783870935440063, + "learning_rate": 2.683938666438521e-06, + "loss": 0.1032, + "step": 9115 + }, + { + "epoch": 1.47699287103046, + "grad_norm": 0.8420021533966064, + "learning_rate": 2.6835025447729495e-06, + "loss": 0.0947, + "step": 9116 + }, + { + "epoch": 1.4771548930654568, + "grad_norm": 0.7928836941719055, + "learning_rate": 2.6830664174926465e-06, + "loss": 0.1003, + "step": 9117 + }, + { + "epoch": 1.4773169151004537, + "grad_norm": 0.7881826162338257, + "learning_rate": 2.682630284610953e-06, + "loss": 0.1031, + "step": 9118 + }, + { + "epoch": 1.4774789371354504, + "grad_norm": 0.7746166586875916, + "learning_rate": 2.682194146141215e-06, + "loss": 0.0952, + "step": 9119 + }, + { + "epoch": 1.4776409591704471, + "grad_norm": 0.8245477676391602, + "learning_rate": 2.6817580020967767e-06, + "loss": 0.1023, + "step": 9120 + }, + { + "epoch": 1.477802981205444, + "grad_norm": 0.8022861480712891, + "learning_rate": 2.6813218524909836e-06, + "loss": 0.0994, + "step": 9121 + }, + { + "epoch": 1.4779650032404408, + "grad_norm": 0.8441992402076721, + "learning_rate": 2.680885697337181e-06, + "loss": 0.0999, + "step": 9122 + }, + { + "epoch": 1.4781270252754375, + "grad_norm": 0.7806877493858337, + "learning_rate": 2.6804495366487132e-06, + "loss": 0.1001, + "step": 9123 + }, + { + "epoch": 1.4782890473104342, + "grad_norm": 0.6983300447463989, + "learning_rate": 2.6800133704389263e-06, + "loss": 0.0843, + "step": 9124 + }, + { + "epoch": 1.478451069345431, + "grad_norm": 0.8451159596443176, + "learning_rate": 2.679577198721166e-06, + "loss": 0.1103, + "step": 9125 + }, + { + "epoch": 1.4786130913804278, + "grad_norm": 0.7998707890510559, + "learning_rate": 2.6791410215087783e-06, + "loss": 0.0962, + "step": 9126 + }, + { + "epoch": 1.4787751134154246, + "grad_norm": 0.8116523027420044, + "learning_rate": 2.678704838815108e-06, + "loss": 0.1025, + "step": 9127 + }, + { + "epoch": 1.4789371354504213, + "grad_norm": 0.7786074876785278, + "learning_rate": 2.678268650653503e-06, + "loss": 0.0974, + "step": 9128 + }, + { + "epoch": 1.479099157485418, + "grad_norm": 0.8149200081825256, + "learning_rate": 2.6778324570373083e-06, + "loss": 0.1001, + "step": 9129 + }, + { + "epoch": 1.4792611795204147, + "grad_norm": 0.8207477331161499, + "learning_rate": 2.6773962579798713e-06, + "loss": 0.0947, + "step": 9130 + }, + { + "epoch": 1.4794232015554116, + "grad_norm": 0.8058983683586121, + "learning_rate": 2.676960053494538e-06, + "loss": 0.107, + "step": 9131 + }, + { + "epoch": 1.4795852235904083, + "grad_norm": 0.9266466498374939, + "learning_rate": 2.6765238435946543e-06, + "loss": 0.1122, + "step": 9132 + }, + { + "epoch": 1.479747245625405, + "grad_norm": 0.9578770995140076, + "learning_rate": 2.676087628293569e-06, + "loss": 0.1173, + "step": 9133 + }, + { + "epoch": 1.4799092676604018, + "grad_norm": 0.8516537547111511, + "learning_rate": 2.675651407604628e-06, + "loss": 0.1006, + "step": 9134 + }, + { + "epoch": 1.4800712896953985, + "grad_norm": 0.8952048420906067, + "learning_rate": 2.6752151815411797e-06, + "loss": 0.1053, + "step": 9135 + }, + { + "epoch": 1.4802333117303954, + "grad_norm": 0.694008469581604, + "learning_rate": 2.67477895011657e-06, + "loss": 0.0859, + "step": 9136 + }, + { + "epoch": 1.4803953337653921, + "grad_norm": 0.8128898739814758, + "learning_rate": 2.674342713344148e-06, + "loss": 0.1016, + "step": 9137 + }, + { + "epoch": 1.4805573558003888, + "grad_norm": 0.8021711111068726, + "learning_rate": 2.6739064712372596e-06, + "loss": 0.1034, + "step": 9138 + }, + { + "epoch": 1.4807193778353855, + "grad_norm": 0.9167004823684692, + "learning_rate": 2.6734702238092557e-06, + "loss": 0.1221, + "step": 9139 + }, + { + "epoch": 1.4808813998703823, + "grad_norm": 0.8511152863502502, + "learning_rate": 2.6730339710734815e-06, + "loss": 0.1096, + "step": 9140 + }, + { + "epoch": 1.4810434219053792, + "grad_norm": 0.9012813568115234, + "learning_rate": 2.6725977130432877e-06, + "loss": 0.114, + "step": 9141 + }, + { + "epoch": 1.481205443940376, + "grad_norm": 0.7799967527389526, + "learning_rate": 2.672161449732021e-06, + "loss": 0.0945, + "step": 9142 + }, + { + "epoch": 1.4813674659753726, + "grad_norm": 0.8400177359580994, + "learning_rate": 2.6717251811530304e-06, + "loss": 0.1052, + "step": 9143 + }, + { + "epoch": 1.4815294880103695, + "grad_norm": 0.7064500451087952, + "learning_rate": 2.671288907319666e-06, + "loss": 0.0897, + "step": 9144 + }, + { + "epoch": 1.481691510045366, + "grad_norm": 0.8531193733215332, + "learning_rate": 2.6708526282452734e-06, + "loss": 0.1076, + "step": 9145 + }, + { + "epoch": 1.481853532080363, + "grad_norm": 0.8008085489273071, + "learning_rate": 2.670416343943205e-06, + "loss": 0.1068, + "step": 9146 + }, + { + "epoch": 1.4820155541153597, + "grad_norm": 0.7878074645996094, + "learning_rate": 2.669980054426809e-06, + "loss": 0.0997, + "step": 9147 + }, + { + "epoch": 1.4821775761503564, + "grad_norm": 0.8357486724853516, + "learning_rate": 2.669543759709434e-06, + "loss": 0.1048, + "step": 9148 + }, + { + "epoch": 1.4823395981853533, + "grad_norm": 0.7349844574928284, + "learning_rate": 2.669107459804431e-06, + "loss": 0.0923, + "step": 9149 + }, + { + "epoch": 1.48250162022035, + "grad_norm": 0.7902621030807495, + "learning_rate": 2.668671154725149e-06, + "loss": 0.1033, + "step": 9150 + }, + { + "epoch": 1.4826636422553467, + "grad_norm": 0.7515067458152771, + "learning_rate": 2.6682348444849373e-06, + "loss": 0.0931, + "step": 9151 + }, + { + "epoch": 1.4828256642903435, + "grad_norm": 0.798224151134491, + "learning_rate": 2.6677985290971464e-06, + "loss": 0.102, + "step": 9152 + }, + { + "epoch": 1.4829876863253402, + "grad_norm": 0.8462631702423096, + "learning_rate": 2.6673622085751275e-06, + "loss": 0.0963, + "step": 9153 + }, + { + "epoch": 1.483149708360337, + "grad_norm": 0.7932848334312439, + "learning_rate": 2.666925882932229e-06, + "loss": 0.1096, + "step": 9154 + }, + { + "epoch": 1.4833117303953338, + "grad_norm": 0.9925400614738464, + "learning_rate": 2.666489552181803e-06, + "loss": 0.0995, + "step": 9155 + }, + { + "epoch": 1.4834737524303305, + "grad_norm": 0.8233938813209534, + "learning_rate": 2.6660532163371995e-06, + "loss": 0.097, + "step": 9156 + }, + { + "epoch": 1.4836357744653272, + "grad_norm": 0.819572925567627, + "learning_rate": 2.6656168754117697e-06, + "loss": 0.1022, + "step": 9157 + }, + { + "epoch": 1.483797796500324, + "grad_norm": 0.7211984992027283, + "learning_rate": 2.665180529418863e-06, + "loss": 0.09, + "step": 9158 + }, + { + "epoch": 1.4839598185353209, + "grad_norm": 0.7187896966934204, + "learning_rate": 2.6647441783718335e-06, + "loss": 0.0809, + "step": 9159 + }, + { + "epoch": 1.4841218405703176, + "grad_norm": 0.7895532250404358, + "learning_rate": 2.6643078222840295e-06, + "loss": 0.1055, + "step": 9160 + }, + { + "epoch": 1.4842838626053143, + "grad_norm": 0.8991692066192627, + "learning_rate": 2.663871461168805e-06, + "loss": 0.1035, + "step": 9161 + }, + { + "epoch": 1.484445884640311, + "grad_norm": 0.9207788109779358, + "learning_rate": 2.6634350950395096e-06, + "loss": 0.1109, + "step": 9162 + }, + { + "epoch": 1.4846079066753077, + "grad_norm": 0.8399395942687988, + "learning_rate": 2.6629987239094956e-06, + "loss": 0.1089, + "step": 9163 + }, + { + "epoch": 1.4847699287103047, + "grad_norm": 0.8203895688056946, + "learning_rate": 2.662562347792116e-06, + "loss": 0.0992, + "step": 9164 + }, + { + "epoch": 1.4849319507453014, + "grad_norm": 0.7525299787521362, + "learning_rate": 2.662125966700721e-06, + "loss": 0.0946, + "step": 9165 + }, + { + "epoch": 1.485093972780298, + "grad_norm": 0.9387903213500977, + "learning_rate": 2.6616895806486644e-06, + "loss": 0.118, + "step": 9166 + }, + { + "epoch": 1.4852559948152948, + "grad_norm": 0.8505684733390808, + "learning_rate": 2.661253189649297e-06, + "loss": 0.1121, + "step": 9167 + }, + { + "epoch": 1.4854180168502915, + "grad_norm": 0.8152345418930054, + "learning_rate": 2.6608167937159735e-06, + "loss": 0.1032, + "step": 9168 + }, + { + "epoch": 1.4855800388852884, + "grad_norm": 0.7628380060195923, + "learning_rate": 2.6603803928620436e-06, + "loss": 0.1009, + "step": 9169 + }, + { + "epoch": 1.4857420609202852, + "grad_norm": 0.8258995413780212, + "learning_rate": 2.6599439871008636e-06, + "loss": 0.1027, + "step": 9170 + }, + { + "epoch": 1.4859040829552819, + "grad_norm": 0.7622812986373901, + "learning_rate": 2.6595075764457838e-06, + "loss": 0.0986, + "step": 9171 + }, + { + "epoch": 1.4860661049902788, + "grad_norm": 0.7521347999572754, + "learning_rate": 2.659071160910158e-06, + "loss": 0.0953, + "step": 9172 + }, + { + "epoch": 1.4862281270252755, + "grad_norm": 0.7840893864631653, + "learning_rate": 2.6586347405073398e-06, + "loss": 0.0931, + "step": 9173 + }, + { + "epoch": 1.4863901490602722, + "grad_norm": 0.7666772603988647, + "learning_rate": 2.6581983152506825e-06, + "loss": 0.0981, + "step": 9174 + }, + { + "epoch": 1.486552171095269, + "grad_norm": 0.8099150061607361, + "learning_rate": 2.65776188515354e-06, + "loss": 0.0986, + "step": 9175 + }, + { + "epoch": 1.4867141931302656, + "grad_norm": 0.8243248462677002, + "learning_rate": 2.6573254502292644e-06, + "loss": 0.1039, + "step": 9176 + }, + { + "epoch": 1.4868762151652626, + "grad_norm": 0.799664318561554, + "learning_rate": 2.6568890104912123e-06, + "loss": 0.1003, + "step": 9177 + }, + { + "epoch": 1.4870382372002593, + "grad_norm": 0.7624625563621521, + "learning_rate": 2.656452565952735e-06, + "loss": 0.1001, + "step": 9178 + }, + { + "epoch": 1.487200259235256, + "grad_norm": 0.835109531879425, + "learning_rate": 2.656016116627188e-06, + "loss": 0.1002, + "step": 9179 + }, + { + "epoch": 1.4873622812702527, + "grad_norm": 0.7921520471572876, + "learning_rate": 2.6555796625279257e-06, + "loss": 0.0996, + "step": 9180 + }, + { + "epoch": 1.4875243033052494, + "grad_norm": 0.8584132790565491, + "learning_rate": 2.6551432036683017e-06, + "loss": 0.1059, + "step": 9181 + }, + { + "epoch": 1.4876863253402464, + "grad_norm": 0.8519357442855835, + "learning_rate": 2.6547067400616717e-06, + "loss": 0.0943, + "step": 9182 + }, + { + "epoch": 1.487848347375243, + "grad_norm": 0.8248621225357056, + "learning_rate": 2.654270271721389e-06, + "loss": 0.0998, + "step": 9183 + }, + { + "epoch": 1.4880103694102398, + "grad_norm": 0.8023591041564941, + "learning_rate": 2.6538337986608105e-06, + "loss": 0.0943, + "step": 9184 + }, + { + "epoch": 1.4881723914452365, + "grad_norm": 0.8533332347869873, + "learning_rate": 2.6533973208932893e-06, + "loss": 0.1109, + "step": 9185 + }, + { + "epoch": 1.4883344134802332, + "grad_norm": 0.7940263748168945, + "learning_rate": 2.6529608384321815e-06, + "loss": 0.1073, + "step": 9186 + }, + { + "epoch": 1.4884964355152301, + "grad_norm": 0.8785202503204346, + "learning_rate": 2.6525243512908423e-06, + "loss": 0.1016, + "step": 9187 + }, + { + "epoch": 1.4886584575502269, + "grad_norm": 0.7625306844711304, + "learning_rate": 2.6520878594826268e-06, + "loss": 0.0974, + "step": 9188 + }, + { + "epoch": 1.4888204795852236, + "grad_norm": 0.8531996607780457, + "learning_rate": 2.6516513630208906e-06, + "loss": 0.1111, + "step": 9189 + }, + { + "epoch": 1.4889825016202203, + "grad_norm": 0.8253282904624939, + "learning_rate": 2.651214861918991e-06, + "loss": 0.1047, + "step": 9190 + }, + { + "epoch": 1.489144523655217, + "grad_norm": 0.8038217425346375, + "learning_rate": 2.6507783561902804e-06, + "loss": 0.1079, + "step": 9191 + }, + { + "epoch": 1.489306545690214, + "grad_norm": 0.9079601168632507, + "learning_rate": 2.6503418458481188e-06, + "loss": 0.1085, + "step": 9192 + }, + { + "epoch": 1.4894685677252106, + "grad_norm": 0.7232383489608765, + "learning_rate": 2.6499053309058593e-06, + "loss": 0.0884, + "step": 9193 + }, + { + "epoch": 1.4896305897602073, + "grad_norm": 0.7744395136833191, + "learning_rate": 2.649468811376861e-06, + "loss": 0.099, + "step": 9194 + }, + { + "epoch": 1.4897926117952043, + "grad_norm": 0.8593305349349976, + "learning_rate": 2.649032287274477e-06, + "loss": 0.1131, + "step": 9195 + }, + { + "epoch": 1.4899546338302008, + "grad_norm": 0.7993839383125305, + "learning_rate": 2.6485957586120664e-06, + "loss": 0.1102, + "step": 9196 + }, + { + "epoch": 1.4901166558651977, + "grad_norm": 0.7705836296081543, + "learning_rate": 2.6481592254029854e-06, + "loss": 0.0971, + "step": 9197 + }, + { + "epoch": 1.4902786779001944, + "grad_norm": 0.816331148147583, + "learning_rate": 2.6477226876605903e-06, + "loss": 0.1041, + "step": 9198 + }, + { + "epoch": 1.4904406999351911, + "grad_norm": 0.853085458278656, + "learning_rate": 2.647286145398239e-06, + "loss": 0.1112, + "step": 9199 + }, + { + "epoch": 1.490602721970188, + "grad_norm": 0.7312716841697693, + "learning_rate": 2.646849598629287e-06, + "loss": 0.0957, + "step": 9200 + }, + { + "epoch": 1.4907647440051848, + "grad_norm": 0.8262644410133362, + "learning_rate": 2.646413047367094e-06, + "loss": 0.107, + "step": 9201 + }, + { + "epoch": 1.4909267660401815, + "grad_norm": 0.8195114731788635, + "learning_rate": 2.645976491625015e-06, + "loss": 0.1021, + "step": 9202 + }, + { + "epoch": 1.4910887880751782, + "grad_norm": 0.813556432723999, + "learning_rate": 2.6455399314164095e-06, + "loss": 0.1105, + "step": 9203 + }, + { + "epoch": 1.491250810110175, + "grad_norm": 0.8205865025520325, + "learning_rate": 2.645103366754633e-06, + "loss": 0.1, + "step": 9204 + }, + { + "epoch": 1.4914128321451718, + "grad_norm": 0.8455175161361694, + "learning_rate": 2.644666797653046e-06, + "loss": 0.1036, + "step": 9205 + }, + { + "epoch": 1.4915748541801686, + "grad_norm": 0.7989739775657654, + "learning_rate": 2.6442302241250047e-06, + "loss": 0.1039, + "step": 9206 + }, + { + "epoch": 1.4917368762151653, + "grad_norm": 0.8118696808815002, + "learning_rate": 2.643793646183867e-06, + "loss": 0.0953, + "step": 9207 + }, + { + "epoch": 1.491898898250162, + "grad_norm": 0.7257483005523682, + "learning_rate": 2.6433570638429923e-06, + "loss": 0.0943, + "step": 9208 + }, + { + "epoch": 1.4920609202851587, + "grad_norm": 0.6711858510971069, + "learning_rate": 2.6429204771157384e-06, + "loss": 0.0901, + "step": 9209 + }, + { + "epoch": 1.4922229423201556, + "grad_norm": 0.7894310355186462, + "learning_rate": 2.6424838860154633e-06, + "loss": 0.104, + "step": 9210 + }, + { + "epoch": 1.4923849643551523, + "grad_norm": 0.7803335785865784, + "learning_rate": 2.6420472905555262e-06, + "loss": 0.0985, + "step": 9211 + }, + { + "epoch": 1.492546986390149, + "grad_norm": 0.7969242334365845, + "learning_rate": 2.641610690749286e-06, + "loss": 0.1104, + "step": 9212 + }, + { + "epoch": 1.4927090084251458, + "grad_norm": 0.9152027368545532, + "learning_rate": 2.641174086610101e-06, + "loss": 0.1166, + "step": 9213 + }, + { + "epoch": 1.4928710304601425, + "grad_norm": 0.883071780204773, + "learning_rate": 2.640737478151331e-06, + "loss": 0.114, + "step": 9214 + }, + { + "epoch": 1.4930330524951394, + "grad_norm": 0.7122350335121155, + "learning_rate": 2.6403008653863343e-06, + "loss": 0.0903, + "step": 9215 + }, + { + "epoch": 1.4931950745301361, + "grad_norm": 0.8668462634086609, + "learning_rate": 2.6398642483284716e-06, + "loss": 0.1001, + "step": 9216 + }, + { + "epoch": 1.4933570965651328, + "grad_norm": 0.8878929615020752, + "learning_rate": 2.6394276269911003e-06, + "loss": 0.1123, + "step": 9217 + }, + { + "epoch": 1.4935191186001295, + "grad_norm": 0.7911863327026367, + "learning_rate": 2.6389910013875814e-06, + "loss": 0.1051, + "step": 9218 + }, + { + "epoch": 1.4936811406351262, + "grad_norm": 0.9753386378288269, + "learning_rate": 2.638554371531274e-06, + "loss": 0.0956, + "step": 9219 + }, + { + "epoch": 1.4938431626701232, + "grad_norm": 0.7793456315994263, + "learning_rate": 2.638117737435538e-06, + "loss": 0.0986, + "step": 9220 + }, + { + "epoch": 1.49400518470512, + "grad_norm": 0.8406729698181152, + "learning_rate": 2.637681099113735e-06, + "loss": 0.1014, + "step": 9221 + }, + { + "epoch": 1.4941672067401166, + "grad_norm": 0.7772307395935059, + "learning_rate": 2.637244456579221e-06, + "loss": 0.0949, + "step": 9222 + }, + { + "epoch": 1.4943292287751135, + "grad_norm": 0.8179559111595154, + "learning_rate": 2.636807809845361e-06, + "loss": 0.1041, + "step": 9223 + }, + { + "epoch": 1.4944912508101102, + "grad_norm": 0.8753211498260498, + "learning_rate": 2.6363711589255115e-06, + "loss": 0.1051, + "step": 9224 + }, + { + "epoch": 1.494653272845107, + "grad_norm": 0.8662494421005249, + "learning_rate": 2.6359345038330352e-06, + "loss": 0.1087, + "step": 9225 + }, + { + "epoch": 1.4948152948801037, + "grad_norm": 0.7673994898796082, + "learning_rate": 2.6354978445812923e-06, + "loss": 0.0913, + "step": 9226 + }, + { + "epoch": 1.4949773169151004, + "grad_norm": 0.8224777579307556, + "learning_rate": 2.6350611811836428e-06, + "loss": 0.1055, + "step": 9227 + }, + { + "epoch": 1.4951393389500973, + "grad_norm": 0.8199198246002197, + "learning_rate": 2.6346245136534483e-06, + "loss": 0.0981, + "step": 9228 + }, + { + "epoch": 1.495301360985094, + "grad_norm": 0.7925064563751221, + "learning_rate": 2.6341878420040694e-06, + "loss": 0.0915, + "step": 9229 + }, + { + "epoch": 1.4954633830200907, + "grad_norm": 0.7900672554969788, + "learning_rate": 2.6337511662488678e-06, + "loss": 0.1045, + "step": 9230 + }, + { + "epoch": 1.4956254050550875, + "grad_norm": 0.8776764869689941, + "learning_rate": 2.6333144864012027e-06, + "loss": 0.1074, + "step": 9231 + }, + { + "epoch": 1.4957874270900842, + "grad_norm": 0.9464653730392456, + "learning_rate": 2.6328778024744384e-06, + "loss": 0.1074, + "step": 9232 + }, + { + "epoch": 1.495949449125081, + "grad_norm": 0.7300053834915161, + "learning_rate": 2.6324411144819345e-06, + "loss": 0.0959, + "step": 9233 + }, + { + "epoch": 1.4961114711600778, + "grad_norm": 0.793613851070404, + "learning_rate": 2.6320044224370526e-06, + "loss": 0.1007, + "step": 9234 + }, + { + "epoch": 1.4962734931950745, + "grad_norm": 0.7878672480583191, + "learning_rate": 2.631567726353155e-06, + "loss": 0.1016, + "step": 9235 + }, + { + "epoch": 1.4964355152300712, + "grad_norm": 0.7945536971092224, + "learning_rate": 2.6311310262436035e-06, + "loss": 0.0929, + "step": 9236 + }, + { + "epoch": 1.496597537265068, + "grad_norm": 0.9132923483848572, + "learning_rate": 2.6306943221217597e-06, + "loss": 0.114, + "step": 9237 + }, + { + "epoch": 1.4967595593000649, + "grad_norm": 0.9040144681930542, + "learning_rate": 2.6302576140009866e-06, + "loss": 0.117, + "step": 9238 + }, + { + "epoch": 1.4969215813350616, + "grad_norm": 0.8708134889602661, + "learning_rate": 2.629820901894645e-06, + "loss": 0.1075, + "step": 9239 + }, + { + "epoch": 1.4970836033700583, + "grad_norm": 0.7671794295310974, + "learning_rate": 2.6293841858160983e-06, + "loss": 0.0955, + "step": 9240 + }, + { + "epoch": 1.497245625405055, + "grad_norm": 0.7187214493751526, + "learning_rate": 2.6289474657787084e-06, + "loss": 0.0901, + "step": 9241 + }, + { + "epoch": 1.4974076474400517, + "grad_norm": 0.8010789752006531, + "learning_rate": 2.6285107417958385e-06, + "loss": 0.1064, + "step": 9242 + }, + { + "epoch": 1.4975696694750487, + "grad_norm": 0.7064031958580017, + "learning_rate": 2.6280740138808503e-06, + "loss": 0.0918, + "step": 9243 + }, + { + "epoch": 1.4977316915100454, + "grad_norm": 0.7605817317962646, + "learning_rate": 2.6276372820471073e-06, + "loss": 0.0905, + "step": 9244 + }, + { + "epoch": 1.497893713545042, + "grad_norm": 0.8160539269447327, + "learning_rate": 2.6272005463079732e-06, + "loss": 0.1117, + "step": 9245 + }, + { + "epoch": 1.498055735580039, + "grad_norm": 0.7572857141494751, + "learning_rate": 2.6267638066768087e-06, + "loss": 0.0911, + "step": 9246 + }, + { + "epoch": 1.4982177576150357, + "grad_norm": 0.7898995876312256, + "learning_rate": 2.6263270631669796e-06, + "loss": 0.0998, + "step": 9247 + }, + { + "epoch": 1.4983797796500324, + "grad_norm": 0.8465975522994995, + "learning_rate": 2.625890315791848e-06, + "loss": 0.1061, + "step": 9248 + }, + { + "epoch": 1.4985418016850292, + "grad_norm": 0.7732163667678833, + "learning_rate": 2.6254535645647772e-06, + "loss": 0.0983, + "step": 9249 + }, + { + "epoch": 1.4987038237200259, + "grad_norm": 0.7840855717658997, + "learning_rate": 2.625016809499131e-06, + "loss": 0.1021, + "step": 9250 + }, + { + "epoch": 1.4988658457550228, + "grad_norm": 0.7952522039413452, + "learning_rate": 2.6245800506082727e-06, + "loss": 0.0972, + "step": 9251 + }, + { + "epoch": 1.4990278677900195, + "grad_norm": 0.7457720637321472, + "learning_rate": 2.6241432879055667e-06, + "loss": 0.0979, + "step": 9252 + }, + { + "epoch": 1.4991898898250162, + "grad_norm": 0.8276432752609253, + "learning_rate": 2.6237065214043754e-06, + "loss": 0.108, + "step": 9253 + }, + { + "epoch": 1.499351911860013, + "grad_norm": 0.7755012512207031, + "learning_rate": 2.6232697511180654e-06, + "loss": 0.0939, + "step": 9254 + }, + { + "epoch": 1.4995139338950096, + "grad_norm": 0.7345067858695984, + "learning_rate": 2.622832977059998e-06, + "loss": 0.0905, + "step": 9255 + }, + { + "epoch": 1.4996759559300066, + "grad_norm": 0.8585684299468994, + "learning_rate": 2.6223961992435406e-06, + "loss": 0.1049, + "step": 9256 + }, + { + "epoch": 1.4998379779650033, + "grad_norm": 0.729276180267334, + "learning_rate": 2.621959417682054e-06, + "loss": 0.0873, + "step": 9257 + }, + { + "epoch": 1.5, + "grad_norm": 0.8658743500709534, + "learning_rate": 2.6215226323889048e-06, + "loss": 0.1089, + "step": 9258 + }, + { + "epoch": 1.5001620220349967, + "grad_norm": 0.7544455528259277, + "learning_rate": 2.621085843377457e-06, + "loss": 0.0943, + "step": 9259 + }, + { + "epoch": 1.5003240440699934, + "grad_norm": 0.7996721863746643, + "learning_rate": 2.620649050661076e-06, + "loss": 0.0983, + "step": 9260 + }, + { + "epoch": 1.5004860661049904, + "grad_norm": 0.8732225894927979, + "learning_rate": 2.620212254253126e-06, + "loss": 0.1048, + "step": 9261 + }, + { + "epoch": 1.500648088139987, + "grad_norm": 0.8265805244445801, + "learning_rate": 2.6197754541669714e-06, + "loss": 0.1041, + "step": 9262 + }, + { + "epoch": 1.5008101101749838, + "grad_norm": 0.8330001831054688, + "learning_rate": 2.6193386504159777e-06, + "loss": 0.1099, + "step": 9263 + }, + { + "epoch": 1.5009721322099807, + "grad_norm": 0.780525267124176, + "learning_rate": 2.6189018430135106e-06, + "loss": 0.0999, + "step": 9264 + }, + { + "epoch": 1.5011341542449772, + "grad_norm": 0.9457441568374634, + "learning_rate": 2.618465031972935e-06, + "loss": 0.1189, + "step": 9265 + }, + { + "epoch": 1.5012961762799741, + "grad_norm": 0.8373966813087463, + "learning_rate": 2.6180282173076156e-06, + "loss": 0.1104, + "step": 9266 + }, + { + "epoch": 1.5014581983149708, + "grad_norm": 0.7619836330413818, + "learning_rate": 2.6175913990309184e-06, + "loss": 0.0991, + "step": 9267 + }, + { + "epoch": 1.5016202203499676, + "grad_norm": 0.7752346992492676, + "learning_rate": 2.6171545771562085e-06, + "loss": 0.0955, + "step": 9268 + }, + { + "epoch": 1.5017822423849645, + "grad_norm": 0.8263670206069946, + "learning_rate": 2.6167177516968536e-06, + "loss": 0.0997, + "step": 9269 + }, + { + "epoch": 1.501944264419961, + "grad_norm": 0.796293318271637, + "learning_rate": 2.6162809226662167e-06, + "loss": 0.1056, + "step": 9270 + }, + { + "epoch": 1.502106286454958, + "grad_norm": 0.7782325148582458, + "learning_rate": 2.615844090077665e-06, + "loss": 0.0962, + "step": 9271 + }, + { + "epoch": 1.5022683084899546, + "grad_norm": 0.9106411337852478, + "learning_rate": 2.6154072539445645e-06, + "loss": 0.1008, + "step": 9272 + }, + { + "epoch": 1.5024303305249513, + "grad_norm": 0.8080524802207947, + "learning_rate": 2.614970414280281e-06, + "loss": 0.1053, + "step": 9273 + }, + { + "epoch": 1.5025923525599483, + "grad_norm": 0.9011566638946533, + "learning_rate": 2.6145335710981817e-06, + "loss": 0.1223, + "step": 9274 + }, + { + "epoch": 1.5027543745949448, + "grad_norm": 0.7807307243347168, + "learning_rate": 2.6140967244116322e-06, + "loss": 0.0926, + "step": 9275 + }, + { + "epoch": 1.5029163966299417, + "grad_norm": 0.8064615726470947, + "learning_rate": 2.613659874233999e-06, + "loss": 0.0976, + "step": 9276 + }, + { + "epoch": 1.5030784186649384, + "grad_norm": 0.8541350960731506, + "learning_rate": 2.6132230205786483e-06, + "loss": 0.1034, + "step": 9277 + }, + { + "epoch": 1.5032404406999351, + "grad_norm": 0.8118453621864319, + "learning_rate": 2.612786163458948e-06, + "loss": 0.1009, + "step": 9278 + }, + { + "epoch": 1.503402462734932, + "grad_norm": 0.852788507938385, + "learning_rate": 2.6123493028882634e-06, + "loss": 0.1146, + "step": 9279 + }, + { + "epoch": 1.5035644847699285, + "grad_norm": 0.7566162347793579, + "learning_rate": 2.611912438879962e-06, + "loss": 0.0975, + "step": 9280 + }, + { + "epoch": 1.5037265068049255, + "grad_norm": 0.9245677590370178, + "learning_rate": 2.611475571447411e-06, + "loss": 0.1078, + "step": 9281 + }, + { + "epoch": 1.5038885288399222, + "grad_norm": 0.9976536631584167, + "learning_rate": 2.611038700603977e-06, + "loss": 0.116, + "step": 9282 + }, + { + "epoch": 1.504050550874919, + "grad_norm": 0.8083858489990234, + "learning_rate": 2.6106018263630283e-06, + "loss": 0.1036, + "step": 9283 + }, + { + "epoch": 1.5042125729099158, + "grad_norm": 0.7877292037010193, + "learning_rate": 2.6101649487379304e-06, + "loss": 0.0926, + "step": 9284 + }, + { + "epoch": 1.5043745949449125, + "grad_norm": 0.9315115809440613, + "learning_rate": 2.609728067742053e-06, + "loss": 0.1173, + "step": 9285 + }, + { + "epoch": 1.5045366169799093, + "grad_norm": 0.8206225037574768, + "learning_rate": 2.6092911833887602e-06, + "loss": 0.1077, + "step": 9286 + }, + { + "epoch": 1.5046986390149062, + "grad_norm": 0.805716335773468, + "learning_rate": 2.6088542956914233e-06, + "loss": 0.1041, + "step": 9287 + }, + { + "epoch": 1.5048606610499027, + "grad_norm": 0.9114910364151001, + "learning_rate": 2.6084174046634075e-06, + "loss": 0.1181, + "step": 9288 + }, + { + "epoch": 1.5050226830848996, + "grad_norm": 0.8441805243492126, + "learning_rate": 2.607980510318082e-06, + "loss": 0.1069, + "step": 9289 + }, + { + "epoch": 1.5051847051198963, + "grad_norm": 0.8596044182777405, + "learning_rate": 2.607543612668814e-06, + "loss": 0.101, + "step": 9290 + }, + { + "epoch": 1.505346727154893, + "grad_norm": 0.8306862711906433, + "learning_rate": 2.6071067117289717e-06, + "loss": 0.1047, + "step": 9291 + }, + { + "epoch": 1.50550874918989, + "grad_norm": 0.7705062031745911, + "learning_rate": 2.6066698075119237e-06, + "loss": 0.0974, + "step": 9292 + }, + { + "epoch": 1.5056707712248865, + "grad_norm": 0.8047751784324646, + "learning_rate": 2.606232900031037e-06, + "loss": 0.1029, + "step": 9293 + }, + { + "epoch": 1.5058327932598834, + "grad_norm": 0.7562994360923767, + "learning_rate": 2.605795989299681e-06, + "loss": 0.0957, + "step": 9294 + }, + { + "epoch": 1.50599481529488, + "grad_norm": 0.9327423572540283, + "learning_rate": 2.6053590753312237e-06, + "loss": 0.0989, + "step": 9295 + }, + { + "epoch": 1.5061568373298768, + "grad_norm": 0.7977178692817688, + "learning_rate": 2.604922158139033e-06, + "loss": 0.1054, + "step": 9296 + }, + { + "epoch": 1.5063188593648738, + "grad_norm": 0.7948964238166809, + "learning_rate": 2.6044852377364794e-06, + "loss": 0.0938, + "step": 9297 + }, + { + "epoch": 1.5064808813998702, + "grad_norm": 0.8069312572479248, + "learning_rate": 2.6040483141369293e-06, + "loss": 0.0934, + "step": 9298 + }, + { + "epoch": 1.5066429034348672, + "grad_norm": 0.850750744342804, + "learning_rate": 2.6036113873537526e-06, + "loss": 0.0958, + "step": 9299 + }, + { + "epoch": 1.5068049254698639, + "grad_norm": 0.8110750317573547, + "learning_rate": 2.603174457400319e-06, + "loss": 0.1068, + "step": 9300 + }, + { + "epoch": 1.5069669475048606, + "grad_norm": 0.8430596590042114, + "learning_rate": 2.602737524289996e-06, + "loss": 0.0992, + "step": 9301 + }, + { + "epoch": 1.5071289695398575, + "grad_norm": 0.8320333361625671, + "learning_rate": 2.602300588036154e-06, + "loss": 0.1031, + "step": 9302 + }, + { + "epoch": 1.507290991574854, + "grad_norm": 0.8486113548278809, + "learning_rate": 2.6018636486521615e-06, + "loss": 0.1004, + "step": 9303 + }, + { + "epoch": 1.507453013609851, + "grad_norm": 0.7198436260223389, + "learning_rate": 2.6014267061513875e-06, + "loss": 0.0936, + "step": 9304 + }, + { + "epoch": 1.5076150356448477, + "grad_norm": 0.7572947144508362, + "learning_rate": 2.6009897605472022e-06, + "loss": 0.094, + "step": 9305 + }, + { + "epoch": 1.5077770576798444, + "grad_norm": 0.8611274361610413, + "learning_rate": 2.6005528118529738e-06, + "loss": 0.1026, + "step": 9306 + }, + { + "epoch": 1.5079390797148413, + "grad_norm": 0.904730498790741, + "learning_rate": 2.6001158600820735e-06, + "loss": 0.1111, + "step": 9307 + }, + { + "epoch": 1.508101101749838, + "grad_norm": 0.8339022994041443, + "learning_rate": 2.5996789052478693e-06, + "loss": 0.1016, + "step": 9308 + }, + { + "epoch": 1.5082631237848347, + "grad_norm": 0.8966616988182068, + "learning_rate": 2.5992419473637327e-06, + "loss": 0.1131, + "step": 9309 + }, + { + "epoch": 1.5084251458198314, + "grad_norm": 0.7851499319076538, + "learning_rate": 2.5988049864430314e-06, + "loss": 0.1015, + "step": 9310 + }, + { + "epoch": 1.5085871678548282, + "grad_norm": 0.8004862666130066, + "learning_rate": 2.598368022499138e-06, + "loss": 0.0977, + "step": 9311 + }, + { + "epoch": 1.508749189889825, + "grad_norm": 0.7532079219818115, + "learning_rate": 2.597931055545421e-06, + "loss": 0.0939, + "step": 9312 + }, + { + "epoch": 1.5089112119248218, + "grad_norm": 0.8671373128890991, + "learning_rate": 2.59749408559525e-06, + "loss": 0.1082, + "step": 9313 + }, + { + "epoch": 1.5090732339598185, + "grad_norm": 0.7479811906814575, + "learning_rate": 2.597057112661997e-06, + "loss": 0.0937, + "step": 9314 + }, + { + "epoch": 1.5092352559948155, + "grad_norm": 0.7849185466766357, + "learning_rate": 2.59662013675903e-06, + "loss": 0.1082, + "step": 9315 + }, + { + "epoch": 1.509397278029812, + "grad_norm": 0.7421824336051941, + "learning_rate": 2.5961831578997214e-06, + "loss": 0.1007, + "step": 9316 + }, + { + "epoch": 1.5095593000648089, + "grad_norm": 0.752606213092804, + "learning_rate": 2.5957461760974407e-06, + "loss": 0.0929, + "step": 9317 + }, + { + "epoch": 1.5097213220998056, + "grad_norm": 0.8576077222824097, + "learning_rate": 2.5953091913655586e-06, + "loss": 0.1081, + "step": 9318 + }, + { + "epoch": 1.5098833441348023, + "grad_norm": 0.7942414879798889, + "learning_rate": 2.594872203717446e-06, + "loss": 0.1102, + "step": 9319 + }, + { + "epoch": 1.5100453661697992, + "grad_norm": 0.7361174821853638, + "learning_rate": 2.594435213166473e-06, + "loss": 0.0973, + "step": 9320 + }, + { + "epoch": 1.5102073882047957, + "grad_norm": 0.7839969992637634, + "learning_rate": 2.5939982197260115e-06, + "loss": 0.0995, + "step": 9321 + }, + { + "epoch": 1.5103694102397927, + "grad_norm": 0.9291492104530334, + "learning_rate": 2.593561223409432e-06, + "loss": 0.1089, + "step": 9322 + }, + { + "epoch": 1.5105314322747894, + "grad_norm": 0.8989511132240295, + "learning_rate": 2.5931242242301054e-06, + "loss": 0.1078, + "step": 9323 + }, + { + "epoch": 1.510693454309786, + "grad_norm": 0.8241267800331116, + "learning_rate": 2.592687222201403e-06, + "loss": 0.1092, + "step": 9324 + }, + { + "epoch": 1.510855476344783, + "grad_norm": 0.7994397878646851, + "learning_rate": 2.592250217336696e-06, + "loss": 0.1023, + "step": 9325 + }, + { + "epoch": 1.5110174983797795, + "grad_norm": 0.777235746383667, + "learning_rate": 2.5918132096493552e-06, + "loss": 0.0913, + "step": 9326 + }, + { + "epoch": 1.5111795204147764, + "grad_norm": 0.7653077840805054, + "learning_rate": 2.5913761991527527e-06, + "loss": 0.0998, + "step": 9327 + }, + { + "epoch": 1.5113415424497731, + "grad_norm": 0.9247298836708069, + "learning_rate": 2.5909391858602596e-06, + "loss": 0.114, + "step": 9328 + }, + { + "epoch": 1.5115035644847699, + "grad_norm": 0.8177992701530457, + "learning_rate": 2.590502169785247e-06, + "loss": 0.0999, + "step": 9329 + }, + { + "epoch": 1.5116655865197668, + "grad_norm": 0.7962645292282104, + "learning_rate": 2.5900651509410875e-06, + "loss": 0.1064, + "step": 9330 + }, + { + "epoch": 1.5118276085547635, + "grad_norm": 0.8084893822669983, + "learning_rate": 2.589628129341153e-06, + "loss": 0.1103, + "step": 9331 + }, + { + "epoch": 1.5119896305897602, + "grad_norm": 0.7229411005973816, + "learning_rate": 2.5891911049988133e-06, + "loss": 0.089, + "step": 9332 + }, + { + "epoch": 1.512151652624757, + "grad_norm": 1.190307855606079, + "learning_rate": 2.588754077927442e-06, + "loss": 0.1183, + "step": 9333 + }, + { + "epoch": 1.5123136746597536, + "grad_norm": 0.746012806892395, + "learning_rate": 2.5883170481404112e-06, + "loss": 0.0924, + "step": 9334 + }, + { + "epoch": 1.5124756966947506, + "grad_norm": 0.8450835347175598, + "learning_rate": 2.5878800156510925e-06, + "loss": 0.1091, + "step": 9335 + }, + { + "epoch": 1.5126377187297473, + "grad_norm": 0.7555860280990601, + "learning_rate": 2.587442980472858e-06, + "loss": 0.0875, + "step": 9336 + }, + { + "epoch": 1.512799740764744, + "grad_norm": 0.8183492422103882, + "learning_rate": 2.5870059426190787e-06, + "loss": 0.0995, + "step": 9337 + }, + { + "epoch": 1.512961762799741, + "grad_norm": 0.8046535849571228, + "learning_rate": 2.5865689021031292e-06, + "loss": 0.0967, + "step": 9338 + }, + { + "epoch": 1.5131237848347374, + "grad_norm": 0.8389936685562134, + "learning_rate": 2.5861318589383806e-06, + "loss": 0.0875, + "step": 9339 + }, + { + "epoch": 1.5132858068697344, + "grad_norm": 0.9016577005386353, + "learning_rate": 2.5856948131382055e-06, + "loss": 0.1099, + "step": 9340 + }, + { + "epoch": 1.513447828904731, + "grad_norm": 0.8551927804946899, + "learning_rate": 2.585257764715976e-06, + "loss": 0.1044, + "step": 9341 + }, + { + "epoch": 1.5136098509397278, + "grad_norm": 0.8470696210861206, + "learning_rate": 2.584820713685066e-06, + "loss": 0.1048, + "step": 9342 + }, + { + "epoch": 1.5137718729747247, + "grad_norm": 0.9146429896354675, + "learning_rate": 2.5843836600588474e-06, + "loss": 0.0846, + "step": 9343 + }, + { + "epoch": 1.5139338950097212, + "grad_norm": 0.8819512128829956, + "learning_rate": 2.5839466038506927e-06, + "loss": 0.1131, + "step": 9344 + }, + { + "epoch": 1.5140959170447181, + "grad_norm": 0.7611178755760193, + "learning_rate": 2.583509545073975e-06, + "loss": 0.0911, + "step": 9345 + }, + { + "epoch": 1.5142579390797148, + "grad_norm": 0.8343874216079712, + "learning_rate": 2.5830724837420675e-06, + "loss": 0.1025, + "step": 9346 + }, + { + "epoch": 1.5144199611147116, + "grad_norm": 0.8855875134468079, + "learning_rate": 2.5826354198683433e-06, + "loss": 0.1137, + "step": 9347 + }, + { + "epoch": 1.5145819831497085, + "grad_norm": 0.8919705152511597, + "learning_rate": 2.582198353466175e-06, + "loss": 0.1163, + "step": 9348 + }, + { + "epoch": 1.514744005184705, + "grad_norm": 0.7063683271408081, + "learning_rate": 2.5817612845489355e-06, + "loss": 0.0853, + "step": 9349 + }, + { + "epoch": 1.514906027219702, + "grad_norm": 0.9223026633262634, + "learning_rate": 2.5813242131299986e-06, + "loss": 0.1116, + "step": 9350 + }, + { + "epoch": 1.5150680492546986, + "grad_norm": 0.7677808403968811, + "learning_rate": 2.580887139222738e-06, + "loss": 0.0966, + "step": 9351 + }, + { + "epoch": 1.5152300712896953, + "grad_norm": 0.8146395087242126, + "learning_rate": 2.5804500628405265e-06, + "loss": 0.0944, + "step": 9352 + }, + { + "epoch": 1.5153920933246923, + "grad_norm": 0.804042637348175, + "learning_rate": 2.580012983996738e-06, + "loss": 0.1033, + "step": 9353 + }, + { + "epoch": 1.5155541153596888, + "grad_norm": 0.9203242659568787, + "learning_rate": 2.5795759027047457e-06, + "loss": 0.1108, + "step": 9354 + }, + { + "epoch": 1.5157161373946857, + "grad_norm": 0.8421667218208313, + "learning_rate": 2.5791388189779237e-06, + "loss": 0.0992, + "step": 9355 + }, + { + "epoch": 1.5158781594296824, + "grad_norm": 0.8935291767120361, + "learning_rate": 2.578701732829645e-06, + "loss": 0.1045, + "step": 9356 + }, + { + "epoch": 1.5160401814646791, + "grad_norm": 0.8606316447257996, + "learning_rate": 2.5782646442732833e-06, + "loss": 0.1096, + "step": 9357 + }, + { + "epoch": 1.516202203499676, + "grad_norm": 0.8459725975990295, + "learning_rate": 2.5778275533222135e-06, + "loss": 0.1099, + "step": 9358 + }, + { + "epoch": 1.5163642255346728, + "grad_norm": 0.7800662517547607, + "learning_rate": 2.5773904599898087e-06, + "loss": 0.0971, + "step": 9359 + }, + { + "epoch": 1.5165262475696695, + "grad_norm": 0.8285083770751953, + "learning_rate": 2.5769533642894433e-06, + "loss": 0.1019, + "step": 9360 + }, + { + "epoch": 1.5166882696046662, + "grad_norm": 0.8594668507575989, + "learning_rate": 2.5765162662344906e-06, + "loss": 0.1067, + "step": 9361 + }, + { + "epoch": 1.516850291639663, + "grad_norm": 0.7574124932289124, + "learning_rate": 2.576079165838326e-06, + "loss": 0.0894, + "step": 9362 + }, + { + "epoch": 1.5170123136746598, + "grad_norm": 0.8705188632011414, + "learning_rate": 2.575642063114322e-06, + "loss": 0.1104, + "step": 9363 + }, + { + "epoch": 1.5171743357096565, + "grad_norm": 0.8245108723640442, + "learning_rate": 2.5752049580758555e-06, + "loss": 0.104, + "step": 9364 + }, + { + "epoch": 1.5173363577446533, + "grad_norm": 0.9124253988265991, + "learning_rate": 2.5747678507362977e-06, + "loss": 0.1215, + "step": 9365 + }, + { + "epoch": 1.5174983797796502, + "grad_norm": 0.8030577898025513, + "learning_rate": 2.5743307411090255e-06, + "loss": 0.1064, + "step": 9366 + }, + { + "epoch": 1.5176604018146467, + "grad_norm": 0.7944320440292358, + "learning_rate": 2.5738936292074122e-06, + "loss": 0.0991, + "step": 9367 + }, + { + "epoch": 1.5178224238496436, + "grad_norm": 0.7724756598472595, + "learning_rate": 2.5734565150448325e-06, + "loss": 0.1032, + "step": 9368 + }, + { + "epoch": 1.5179844458846403, + "grad_norm": 0.7421293258666992, + "learning_rate": 2.5730193986346623e-06, + "loss": 0.0941, + "step": 9369 + }, + { + "epoch": 1.518146467919637, + "grad_norm": 0.7390490770339966, + "learning_rate": 2.5725822799902738e-06, + "loss": 0.0941, + "step": 9370 + }, + { + "epoch": 1.518308489954634, + "grad_norm": 0.8915257453918457, + "learning_rate": 2.572145159125044e-06, + "loss": 0.1067, + "step": 9371 + }, + { + "epoch": 1.5184705119896305, + "grad_norm": 0.8176239132881165, + "learning_rate": 2.5717080360523464e-06, + "loss": 0.1001, + "step": 9372 + }, + { + "epoch": 1.5186325340246274, + "grad_norm": 0.7526306509971619, + "learning_rate": 2.571270910785557e-06, + "loss": 0.1013, + "step": 9373 + }, + { + "epoch": 1.518794556059624, + "grad_norm": 0.7733591794967651, + "learning_rate": 2.57083378333805e-06, + "loss": 0.1024, + "step": 9374 + }, + { + "epoch": 1.5189565780946208, + "grad_norm": 0.7548816204071045, + "learning_rate": 2.5703966537232006e-06, + "loss": 0.104, + "step": 9375 + }, + { + "epoch": 1.5191186001296177, + "grad_norm": 0.8029460906982422, + "learning_rate": 2.5699595219543838e-06, + "loss": 0.1031, + "step": 9376 + }, + { + "epoch": 1.5192806221646142, + "grad_norm": 0.912986695766449, + "learning_rate": 2.569522388044975e-06, + "loss": 0.1102, + "step": 9377 + }, + { + "epoch": 1.5194426441996112, + "grad_norm": 0.8626551032066345, + "learning_rate": 2.5690852520083496e-06, + "loss": 0.1074, + "step": 9378 + }, + { + "epoch": 1.5196046662346079, + "grad_norm": 0.7937400937080383, + "learning_rate": 2.5686481138578824e-06, + "loss": 0.0928, + "step": 9379 + }, + { + "epoch": 1.5197666882696046, + "grad_norm": 0.7942370772361755, + "learning_rate": 2.5682109736069492e-06, + "loss": 0.101, + "step": 9380 + }, + { + "epoch": 1.5199287103046015, + "grad_norm": 0.8187708854675293, + "learning_rate": 2.5677738312689248e-06, + "loss": 0.1009, + "step": 9381 + }, + { + "epoch": 1.5200907323395982, + "grad_norm": 1.0334949493408203, + "learning_rate": 2.5673366868571858e-06, + "loss": 0.1277, + "step": 9382 + }, + { + "epoch": 1.520252754374595, + "grad_norm": 0.7634156942367554, + "learning_rate": 2.5668995403851065e-06, + "loss": 0.0979, + "step": 9383 + }, + { + "epoch": 1.5204147764095917, + "grad_norm": 0.7355403304100037, + "learning_rate": 2.566462391866064e-06, + "loss": 0.0939, + "step": 9384 + }, + { + "epoch": 1.5205767984445884, + "grad_norm": 0.8203917145729065, + "learning_rate": 2.5660252413134323e-06, + "loss": 0.103, + "step": 9385 + }, + { + "epoch": 1.5207388204795853, + "grad_norm": 0.8019030690193176, + "learning_rate": 2.5655880887405893e-06, + "loss": 0.1089, + "step": 9386 + }, + { + "epoch": 1.520900842514582, + "grad_norm": 0.8758410215377808, + "learning_rate": 2.565150934160908e-06, + "loss": 0.1171, + "step": 9387 + }, + { + "epoch": 1.5210628645495787, + "grad_norm": 0.8551106452941895, + "learning_rate": 2.564713777587767e-06, + "loss": 0.1006, + "step": 9388 + }, + { + "epoch": 1.5212248865845757, + "grad_norm": 0.7470712065696716, + "learning_rate": 2.5642766190345396e-06, + "loss": 0.0948, + "step": 9389 + }, + { + "epoch": 1.5213869086195722, + "grad_norm": 0.845240592956543, + "learning_rate": 2.5638394585146044e-06, + "loss": 0.0975, + "step": 9390 + }, + { + "epoch": 1.521548930654569, + "grad_norm": 0.9042078256607056, + "learning_rate": 2.5634022960413362e-06, + "loss": 0.1057, + "step": 9391 + }, + { + "epoch": 1.5217109526895658, + "grad_norm": 0.844704270362854, + "learning_rate": 2.56296513162811e-06, + "loss": 0.1008, + "step": 9392 + }, + { + "epoch": 1.5218729747245625, + "grad_norm": 0.8592380881309509, + "learning_rate": 2.5625279652883043e-06, + "loss": 0.1054, + "step": 9393 + }, + { + "epoch": 1.5220349967595594, + "grad_norm": 0.7636672258377075, + "learning_rate": 2.5620907970352937e-06, + "loss": 0.0947, + "step": 9394 + }, + { + "epoch": 1.522197018794556, + "grad_norm": 0.887532114982605, + "learning_rate": 2.5616536268824555e-06, + "loss": 0.1053, + "step": 9395 + }, + { + "epoch": 1.5223590408295529, + "grad_norm": 0.9102559089660645, + "learning_rate": 2.561216454843165e-06, + "loss": 0.1074, + "step": 9396 + }, + { + "epoch": 1.5225210628645496, + "grad_norm": 0.9450837969779968, + "learning_rate": 2.560779280930799e-06, + "loss": 0.1085, + "step": 9397 + }, + { + "epoch": 1.5226830848995463, + "grad_norm": 0.7672545313835144, + "learning_rate": 2.5603421051587344e-06, + "loss": 0.0921, + "step": 9398 + }, + { + "epoch": 1.5228451069345432, + "grad_norm": 0.8284415006637573, + "learning_rate": 2.559904927540347e-06, + "loss": 0.1048, + "step": 9399 + }, + { + "epoch": 1.5230071289695397, + "grad_norm": 0.856656014919281, + "learning_rate": 2.5594677480890152e-06, + "loss": 0.1047, + "step": 9400 + }, + { + "epoch": 1.5231691510045366, + "grad_norm": 0.791228711605072, + "learning_rate": 2.559030566818112e-06, + "loss": 0.1041, + "step": 9401 + }, + { + "epoch": 1.5233311730395334, + "grad_norm": 0.8752877116203308, + "learning_rate": 2.558593383741018e-06, + "loss": 0.0996, + "step": 9402 + }, + { + "epoch": 1.52349319507453, + "grad_norm": 0.8415939211845398, + "learning_rate": 2.558156198871108e-06, + "loss": 0.0964, + "step": 9403 + }, + { + "epoch": 1.523655217109527, + "grad_norm": 0.8422085046768188, + "learning_rate": 2.5577190122217583e-06, + "loss": 0.1056, + "step": 9404 + }, + { + "epoch": 1.5238172391445235, + "grad_norm": 0.7601490616798401, + "learning_rate": 2.557281823806347e-06, + "loss": 0.0986, + "step": 9405 + }, + { + "epoch": 1.5239792611795204, + "grad_norm": 0.8581531047821045, + "learning_rate": 2.55684463363825e-06, + "loss": 0.1018, + "step": 9406 + }, + { + "epoch": 1.5241412832145171, + "grad_norm": 0.7569452524185181, + "learning_rate": 2.5564074417308454e-06, + "loss": 0.0993, + "step": 9407 + }, + { + "epoch": 1.5243033052495139, + "grad_norm": 0.7859471440315247, + "learning_rate": 2.5559702480975094e-06, + "loss": 0.0949, + "step": 9408 + }, + { + "epoch": 1.5244653272845108, + "grad_norm": 0.7991012930870056, + "learning_rate": 2.5555330527516197e-06, + "loss": 0.1051, + "step": 9409 + }, + { + "epoch": 1.5246273493195075, + "grad_norm": 0.8928097486495972, + "learning_rate": 2.5550958557065523e-06, + "loss": 0.0995, + "step": 9410 + }, + { + "epoch": 1.5247893713545042, + "grad_norm": 1.0804157257080078, + "learning_rate": 2.554658656975686e-06, + "loss": 0.1198, + "step": 9411 + }, + { + "epoch": 1.524951393389501, + "grad_norm": 0.8879733681678772, + "learning_rate": 2.554221456572396e-06, + "loss": 0.1064, + "step": 9412 + }, + { + "epoch": 1.5251134154244976, + "grad_norm": 0.8105085492134094, + "learning_rate": 2.553784254510061e-06, + "loss": 0.104, + "step": 9413 + }, + { + "epoch": 1.5252754374594946, + "grad_norm": 0.7311967015266418, + "learning_rate": 2.553347050802058e-06, + "loss": 0.0945, + "step": 9414 + }, + { + "epoch": 1.5254374594944913, + "grad_norm": 0.8524883985519409, + "learning_rate": 2.5529098454617644e-06, + "loss": 0.1129, + "step": 9415 + }, + { + "epoch": 1.525599481529488, + "grad_norm": 0.7317702174186707, + "learning_rate": 2.552472638502557e-06, + "loss": 0.0935, + "step": 9416 + }, + { + "epoch": 1.525761503564485, + "grad_norm": 0.8908723592758179, + "learning_rate": 2.5520354299378145e-06, + "loss": 0.1069, + "step": 9417 + }, + { + "epoch": 1.5259235255994814, + "grad_norm": 0.8989855051040649, + "learning_rate": 2.5515982197809142e-06, + "loss": 0.1121, + "step": 9418 + }, + { + "epoch": 1.5260855476344783, + "grad_norm": 0.9153575897216797, + "learning_rate": 2.5511610080452322e-06, + "loss": 0.1078, + "step": 9419 + }, + { + "epoch": 1.526247569669475, + "grad_norm": 1.041412353515625, + "learning_rate": 2.5507237947441478e-06, + "loss": 0.1138, + "step": 9420 + }, + { + "epoch": 1.5264095917044718, + "grad_norm": 0.935231626033783, + "learning_rate": 2.5502865798910377e-06, + "loss": 0.1152, + "step": 9421 + }, + { + "epoch": 1.5265716137394687, + "grad_norm": 0.7345618009567261, + "learning_rate": 2.5498493634992803e-06, + "loss": 0.0955, + "step": 9422 + }, + { + "epoch": 1.5267336357744652, + "grad_norm": 0.8384798765182495, + "learning_rate": 2.5494121455822526e-06, + "loss": 0.1117, + "step": 9423 + }, + { + "epoch": 1.5268956578094621, + "grad_norm": 0.8286236524581909, + "learning_rate": 2.5489749261533333e-06, + "loss": 0.104, + "step": 9424 + }, + { + "epoch": 1.5270576798444588, + "grad_norm": 0.7071642875671387, + "learning_rate": 2.5485377052258987e-06, + "loss": 0.0943, + "step": 9425 + }, + { + "epoch": 1.5272197018794555, + "grad_norm": 0.7644874453544617, + "learning_rate": 2.548100482813329e-06, + "loss": 0.0962, + "step": 9426 + }, + { + "epoch": 1.5273817239144525, + "grad_norm": 0.8506636619567871, + "learning_rate": 2.547663258929001e-06, + "loss": 0.1085, + "step": 9427 + }, + { + "epoch": 1.527543745949449, + "grad_norm": 0.7493010759353638, + "learning_rate": 2.5472260335862915e-06, + "loss": 0.0899, + "step": 9428 + }, + { + "epoch": 1.527705767984446, + "grad_norm": 0.7209338545799255, + "learning_rate": 2.5467888067985803e-06, + "loss": 0.0993, + "step": 9429 + }, + { + "epoch": 1.5278677900194426, + "grad_norm": 0.9093368053436279, + "learning_rate": 2.546351578579245e-06, + "loss": 0.1087, + "step": 9430 + }, + { + "epoch": 1.5280298120544393, + "grad_norm": 0.7698819637298584, + "learning_rate": 2.545914348941664e-06, + "loss": 0.0959, + "step": 9431 + }, + { + "epoch": 1.5281918340894363, + "grad_norm": 0.8163896799087524, + "learning_rate": 2.545477117899213e-06, + "loss": 0.0969, + "step": 9432 + }, + { + "epoch": 1.528353856124433, + "grad_norm": 0.7964690327644348, + "learning_rate": 2.5450398854652747e-06, + "loss": 0.1028, + "step": 9433 + }, + { + "epoch": 1.5285158781594297, + "grad_norm": 0.894778311252594, + "learning_rate": 2.5446026516532235e-06, + "loss": 0.111, + "step": 9434 + }, + { + "epoch": 1.5286779001944264, + "grad_norm": 0.903476893901825, + "learning_rate": 2.5441654164764396e-06, + "loss": 0.1154, + "step": 9435 + }, + { + "epoch": 1.528839922229423, + "grad_norm": 0.8074314594268799, + "learning_rate": 2.5437281799483005e-06, + "loss": 0.1042, + "step": 9436 + }, + { + "epoch": 1.52900194426442, + "grad_norm": 0.844743013381958, + "learning_rate": 2.543290942082185e-06, + "loss": 0.1131, + "step": 9437 + }, + { + "epoch": 1.5291639662994168, + "grad_norm": 0.8503636121749878, + "learning_rate": 2.542853702891471e-06, + "loss": 0.1046, + "step": 9438 + }, + { + "epoch": 1.5293259883344135, + "grad_norm": 0.8960352540016174, + "learning_rate": 2.542416462389539e-06, + "loss": 0.1142, + "step": 9439 + }, + { + "epoch": 1.5294880103694104, + "grad_norm": 0.7933899164199829, + "learning_rate": 2.541979220589765e-06, + "loss": 0.0976, + "step": 9440 + }, + { + "epoch": 1.529650032404407, + "grad_norm": 0.8900126814842224, + "learning_rate": 2.5415419775055277e-06, + "loss": 0.1091, + "step": 9441 + }, + { + "epoch": 1.5298120544394038, + "grad_norm": 0.7652503848075867, + "learning_rate": 2.541104733150207e-06, + "loss": 0.0999, + "step": 9442 + }, + { + "epoch": 1.5299740764744005, + "grad_norm": 0.818130373954773, + "learning_rate": 2.5406674875371807e-06, + "loss": 0.1101, + "step": 9443 + }, + { + "epoch": 1.5301360985093972, + "grad_norm": 0.7544717788696289, + "learning_rate": 2.540230240679828e-06, + "loss": 0.0881, + "step": 9444 + }, + { + "epoch": 1.5302981205443942, + "grad_norm": 0.856606662273407, + "learning_rate": 2.539792992591527e-06, + "loss": 0.1124, + "step": 9445 + }, + { + "epoch": 1.5304601425793907, + "grad_norm": 0.7668637633323669, + "learning_rate": 2.5393557432856575e-06, + "loss": 0.0951, + "step": 9446 + }, + { + "epoch": 1.5306221646143876, + "grad_norm": 0.8169387578964233, + "learning_rate": 2.538918492775596e-06, + "loss": 0.1015, + "step": 9447 + }, + { + "epoch": 1.5307841866493843, + "grad_norm": 0.7206966876983643, + "learning_rate": 2.5384812410747244e-06, + "loss": 0.0873, + "step": 9448 + }, + { + "epoch": 1.530946208684381, + "grad_norm": 0.803081750869751, + "learning_rate": 2.5380439881964185e-06, + "loss": 0.093, + "step": 9449 + }, + { + "epoch": 1.531108230719378, + "grad_norm": 0.9361199736595154, + "learning_rate": 2.53760673415406e-06, + "loss": 0.1114, + "step": 9450 + }, + { + "epoch": 1.5312702527543745, + "grad_norm": 0.7436367273330688, + "learning_rate": 2.537169478961026e-06, + "loss": 0.0945, + "step": 9451 + }, + { + "epoch": 1.5314322747893714, + "grad_norm": 0.8524910807609558, + "learning_rate": 2.5367322226306956e-06, + "loss": 0.1094, + "step": 9452 + }, + { + "epoch": 1.531594296824368, + "grad_norm": 0.8166900873184204, + "learning_rate": 2.5362949651764484e-06, + "loss": 0.1032, + "step": 9453 + }, + { + "epoch": 1.5317563188593648, + "grad_norm": 0.8203127384185791, + "learning_rate": 2.5358577066116622e-06, + "loss": 0.1051, + "step": 9454 + }, + { + "epoch": 1.5319183408943617, + "grad_norm": 0.7983186841011047, + "learning_rate": 2.5354204469497185e-06, + "loss": 0.1018, + "step": 9455 + }, + { + "epoch": 1.5320803629293582, + "grad_norm": 0.7971822023391724, + "learning_rate": 2.534983186203993e-06, + "loss": 0.1028, + "step": 9456 + }, + { + "epoch": 1.5322423849643552, + "grad_norm": 0.7759663462638855, + "learning_rate": 2.5345459243878684e-06, + "loss": 0.0967, + "step": 9457 + }, + { + "epoch": 1.5324044069993519, + "grad_norm": 0.8662815093994141, + "learning_rate": 2.5341086615147207e-06, + "loss": 0.1036, + "step": 9458 + }, + { + "epoch": 1.5325664290343486, + "grad_norm": 0.8577237725257874, + "learning_rate": 2.5336713975979315e-06, + "loss": 0.1091, + "step": 9459 + }, + { + "epoch": 1.5327284510693455, + "grad_norm": 0.713832676410675, + "learning_rate": 2.5332341326508786e-06, + "loss": 0.0945, + "step": 9460 + }, + { + "epoch": 1.5328904731043422, + "grad_norm": 0.7215037941932678, + "learning_rate": 2.532796866686942e-06, + "loss": 0.0919, + "step": 9461 + }, + { + "epoch": 1.533052495139339, + "grad_norm": 0.7585532665252686, + "learning_rate": 2.5323595997195005e-06, + "loss": 0.098, + "step": 9462 + }, + { + "epoch": 1.5332145171743357, + "grad_norm": 0.7682742476463318, + "learning_rate": 2.5319223317619333e-06, + "loss": 0.0952, + "step": 9463 + }, + { + "epoch": 1.5333765392093324, + "grad_norm": 0.7284139394760132, + "learning_rate": 2.53148506282762e-06, + "loss": 0.0969, + "step": 9464 + }, + { + "epoch": 1.5335385612443293, + "grad_norm": 0.8855341076850891, + "learning_rate": 2.5310477929299402e-06, + "loss": 0.1052, + "step": 9465 + }, + { + "epoch": 1.533700583279326, + "grad_norm": 0.7966269254684448, + "learning_rate": 2.530610522082273e-06, + "loss": 0.1066, + "step": 9466 + }, + { + "epoch": 1.5338626053143227, + "grad_norm": 0.8642690181732178, + "learning_rate": 2.5301732502979977e-06, + "loss": 0.1016, + "step": 9467 + }, + { + "epoch": 1.5340246273493197, + "grad_norm": 1.0195221900939941, + "learning_rate": 2.529735977590494e-06, + "loss": 0.1063, + "step": 9468 + }, + { + "epoch": 1.5341866493843161, + "grad_norm": 0.9482093453407288, + "learning_rate": 2.5292987039731415e-06, + "loss": 0.116, + "step": 9469 + }, + { + "epoch": 1.534348671419313, + "grad_norm": 0.8184714317321777, + "learning_rate": 2.52886142945932e-06, + "loss": 0.0935, + "step": 9470 + }, + { + "epoch": 1.5345106934543098, + "grad_norm": 0.9412416219711304, + "learning_rate": 2.5284241540624077e-06, + "loss": 0.1175, + "step": 9471 + }, + { + "epoch": 1.5346727154893065, + "grad_norm": 0.9039766192436218, + "learning_rate": 2.527986877795786e-06, + "loss": 0.1045, + "step": 9472 + }, + { + "epoch": 1.5348347375243034, + "grad_norm": 0.7086098790168762, + "learning_rate": 2.527549600672833e-06, + "loss": 0.0866, + "step": 9473 + }, + { + "epoch": 1.5349967595593, + "grad_norm": 0.7739736437797546, + "learning_rate": 2.527112322706929e-06, + "loss": 0.1019, + "step": 9474 + }, + { + "epoch": 1.5351587815942969, + "grad_norm": 0.8489776253700256, + "learning_rate": 2.5266750439114533e-06, + "loss": 0.1016, + "step": 9475 + }, + { + "epoch": 1.5353208036292936, + "grad_norm": 0.8796210885047913, + "learning_rate": 2.526237764299786e-06, + "loss": 0.1076, + "step": 9476 + }, + { + "epoch": 1.5354828256642903, + "grad_norm": 0.7349861264228821, + "learning_rate": 2.525800483885307e-06, + "loss": 0.0936, + "step": 9477 + }, + { + "epoch": 1.5356448476992872, + "grad_norm": 0.833838701248169, + "learning_rate": 2.5253632026813945e-06, + "loss": 0.1022, + "step": 9478 + }, + { + "epoch": 1.5358068697342837, + "grad_norm": 0.7381966710090637, + "learning_rate": 2.524925920701431e-06, + "loss": 0.0966, + "step": 9479 + }, + { + "epoch": 1.5359688917692806, + "grad_norm": 0.8320499658584595, + "learning_rate": 2.524488637958793e-06, + "loss": 0.1105, + "step": 9480 + }, + { + "epoch": 1.5361309138042774, + "grad_norm": 0.9006655216217041, + "learning_rate": 2.5240513544668634e-06, + "loss": 0.11, + "step": 9481 + }, + { + "epoch": 1.536292935839274, + "grad_norm": 0.910051703453064, + "learning_rate": 2.5236140702390194e-06, + "loss": 0.1015, + "step": 9482 + }, + { + "epoch": 1.536454957874271, + "grad_norm": 0.788629949092865, + "learning_rate": 2.5231767852886424e-06, + "loss": 0.0979, + "step": 9483 + }, + { + "epoch": 1.5366169799092677, + "grad_norm": 0.8495368957519531, + "learning_rate": 2.522739499629112e-06, + "loss": 0.0949, + "step": 9484 + }, + { + "epoch": 1.5367790019442644, + "grad_norm": 0.7806404232978821, + "learning_rate": 2.522302213273808e-06, + "loss": 0.1007, + "step": 9485 + }, + { + "epoch": 1.5369410239792611, + "grad_norm": 0.8518389463424683, + "learning_rate": 2.5218649262361104e-06, + "loss": 0.1034, + "step": 9486 + }, + { + "epoch": 1.5371030460142578, + "grad_norm": 0.9559544324874878, + "learning_rate": 2.521427638529398e-06, + "loss": 0.1126, + "step": 9487 + }, + { + "epoch": 1.5372650680492548, + "grad_norm": 0.7953001260757446, + "learning_rate": 2.520990350167053e-06, + "loss": 0.0949, + "step": 9488 + }, + { + "epoch": 1.5374270900842515, + "grad_norm": 0.8259668946266174, + "learning_rate": 2.5205530611624537e-06, + "loss": 0.0983, + "step": 9489 + }, + { + "epoch": 1.5375891121192482, + "grad_norm": 0.9413937330245972, + "learning_rate": 2.5201157715289796e-06, + "loss": 0.1171, + "step": 9490 + }, + { + "epoch": 1.5377511341542451, + "grad_norm": 0.7779377698898315, + "learning_rate": 2.5196784812800125e-06, + "loss": 0.1009, + "step": 9491 + }, + { + "epoch": 1.5379131561892416, + "grad_norm": 0.7650463581085205, + "learning_rate": 2.519241190428931e-06, + "loss": 0.0964, + "step": 9492 + }, + { + "epoch": 1.5380751782242386, + "grad_norm": 0.8391408920288086, + "learning_rate": 2.5188038989891154e-06, + "loss": 0.1014, + "step": 9493 + }, + { + "epoch": 1.5382372002592353, + "grad_norm": 0.9325923323631287, + "learning_rate": 2.518366606973947e-06, + "loss": 0.1169, + "step": 9494 + }, + { + "epoch": 1.538399222294232, + "grad_norm": 0.8795773983001709, + "learning_rate": 2.5179293143968048e-06, + "loss": 0.1103, + "step": 9495 + }, + { + "epoch": 1.538561244329229, + "grad_norm": 0.9822404384613037, + "learning_rate": 2.517492021271068e-06, + "loss": 0.1189, + "step": 9496 + }, + { + "epoch": 1.5387232663642254, + "grad_norm": 0.8007500171661377, + "learning_rate": 2.517054727610118e-06, + "loss": 0.1077, + "step": 9497 + }, + { + "epoch": 1.5388852883992223, + "grad_norm": 0.7420747876167297, + "learning_rate": 2.5166174334273347e-06, + "loss": 0.0917, + "step": 9498 + }, + { + "epoch": 1.539047310434219, + "grad_norm": 0.7478721141815186, + "learning_rate": 2.5161801387360986e-06, + "loss": 0.1011, + "step": 9499 + }, + { + "epoch": 1.5392093324692158, + "grad_norm": 0.7703060507774353, + "learning_rate": 2.5157428435497887e-06, + "loss": 0.0919, + "step": 9500 + }, + { + "epoch": 1.5393713545042127, + "grad_norm": 0.8613007068634033, + "learning_rate": 2.515305547881787e-06, + "loss": 0.1087, + "step": 9501 + }, + { + "epoch": 1.5395333765392092, + "grad_norm": 0.7977644205093384, + "learning_rate": 2.5148682517454707e-06, + "loss": 0.1016, + "step": 9502 + }, + { + "epoch": 1.5396953985742061, + "grad_norm": 0.8988528251647949, + "learning_rate": 2.5144309551542233e-06, + "loss": 0.1072, + "step": 9503 + }, + { + "epoch": 1.5398574206092028, + "grad_norm": 0.7992510795593262, + "learning_rate": 2.5139936581214235e-06, + "loss": 0.0993, + "step": 9504 + }, + { + "epoch": 1.5400194426441995, + "grad_norm": 0.7492799758911133, + "learning_rate": 2.513556360660451e-06, + "loss": 0.0916, + "step": 9505 + }, + { + "epoch": 1.5401814646791965, + "grad_norm": 0.6518522500991821, + "learning_rate": 2.5131190627846875e-06, + "loss": 0.0852, + "step": 9506 + }, + { + "epoch": 1.540343486714193, + "grad_norm": 0.8497022390365601, + "learning_rate": 2.512681764507512e-06, + "loss": 0.109, + "step": 9507 + }, + { + "epoch": 1.54050550874919, + "grad_norm": 0.756066083908081, + "learning_rate": 2.512244465842305e-06, + "loss": 0.1012, + "step": 9508 + }, + { + "epoch": 1.5406675307841866, + "grad_norm": 0.8988474607467651, + "learning_rate": 2.511807166802447e-06, + "loss": 0.1147, + "step": 9509 + }, + { + "epoch": 1.5408295528191833, + "grad_norm": 0.7836807370185852, + "learning_rate": 2.5113698674013186e-06, + "loss": 0.0989, + "step": 9510 + }, + { + "epoch": 1.5409915748541803, + "grad_norm": 0.8765679597854614, + "learning_rate": 2.510932567652299e-06, + "loss": 0.1124, + "step": 9511 + }, + { + "epoch": 1.541153596889177, + "grad_norm": 0.8468899130821228, + "learning_rate": 2.5104952675687706e-06, + "loss": 0.1, + "step": 9512 + }, + { + "epoch": 1.5413156189241737, + "grad_norm": 0.8771848082542419, + "learning_rate": 2.5100579671641114e-06, + "loss": 0.1102, + "step": 9513 + }, + { + "epoch": 1.5414776409591704, + "grad_norm": 0.8875617980957031, + "learning_rate": 2.509620666451703e-06, + "loss": 0.1088, + "step": 9514 + }, + { + "epoch": 1.541639662994167, + "grad_norm": 0.7843181490898132, + "learning_rate": 2.5091833654449254e-06, + "loss": 0.0942, + "step": 9515 + }, + { + "epoch": 1.541801685029164, + "grad_norm": 0.9153252243995667, + "learning_rate": 2.5087460641571594e-06, + "loss": 0.1122, + "step": 9516 + }, + { + "epoch": 1.5419637070641607, + "grad_norm": 0.8474258184432983, + "learning_rate": 2.5083087626017847e-06, + "loss": 0.1033, + "step": 9517 + }, + { + "epoch": 1.5421257290991575, + "grad_norm": 0.8334153294563293, + "learning_rate": 2.5078714607921825e-06, + "loss": 0.106, + "step": 9518 + }, + { + "epoch": 1.5422877511341544, + "grad_norm": 0.8846608400344849, + "learning_rate": 2.507434158741732e-06, + "loss": 0.1001, + "step": 9519 + }, + { + "epoch": 1.5424497731691509, + "grad_norm": 0.8367900848388672, + "learning_rate": 2.506996856463814e-06, + "loss": 0.1098, + "step": 9520 + }, + { + "epoch": 1.5426117952041478, + "grad_norm": 0.7622178792953491, + "learning_rate": 2.5065595539718098e-06, + "loss": 0.0984, + "step": 9521 + }, + { + "epoch": 1.5427738172391445, + "grad_norm": 0.7733889818191528, + "learning_rate": 2.506122251279099e-06, + "loss": 0.1089, + "step": 9522 + }, + { + "epoch": 1.5429358392741412, + "grad_norm": 0.9063085913658142, + "learning_rate": 2.5056849483990614e-06, + "loss": 0.1027, + "step": 9523 + }, + { + "epoch": 1.5430978613091382, + "grad_norm": 0.9877629280090332, + "learning_rate": 2.5052476453450788e-06, + "loss": 0.1207, + "step": 9524 + }, + { + "epoch": 1.5432598833441347, + "grad_norm": 0.8588782548904419, + "learning_rate": 2.5048103421305313e-06, + "loss": 0.1079, + "step": 9525 + }, + { + "epoch": 1.5434219053791316, + "grad_norm": 0.7452812194824219, + "learning_rate": 2.504373038768799e-06, + "loss": 0.0914, + "step": 9526 + }, + { + "epoch": 1.5435839274141283, + "grad_norm": 0.8148205876350403, + "learning_rate": 2.5039357352732613e-06, + "loss": 0.109, + "step": 9527 + }, + { + "epoch": 1.543745949449125, + "grad_norm": 0.9041098356246948, + "learning_rate": 2.5034984316573003e-06, + "loss": 0.1128, + "step": 9528 + }, + { + "epoch": 1.543907971484122, + "grad_norm": 0.8906643986701965, + "learning_rate": 2.5030611279342955e-06, + "loss": 0.123, + "step": 9529 + }, + { + "epoch": 1.5440699935191184, + "grad_norm": 0.8250147700309753, + "learning_rate": 2.5026238241176283e-06, + "loss": 0.1044, + "step": 9530 + }, + { + "epoch": 1.5442320155541154, + "grad_norm": 0.8400077819824219, + "learning_rate": 2.502186520220677e-06, + "loss": 0.1025, + "step": 9531 + }, + { + "epoch": 1.544394037589112, + "grad_norm": 0.8428972959518433, + "learning_rate": 2.5017492162568246e-06, + "loss": 0.0909, + "step": 9532 + }, + { + "epoch": 1.5445560596241088, + "grad_norm": 0.7809198498725891, + "learning_rate": 2.5013119122394495e-06, + "loss": 0.0961, + "step": 9533 + }, + { + "epoch": 1.5447180816591057, + "grad_norm": 0.9663342237472534, + "learning_rate": 2.5008746081819345e-06, + "loss": 0.1164, + "step": 9534 + }, + { + "epoch": 1.5448801036941024, + "grad_norm": 0.9518924355506897, + "learning_rate": 2.5004373040976574e-06, + "loss": 0.1091, + "step": 9535 + }, + { + "epoch": 1.5450421257290992, + "grad_norm": 0.8073295950889587, + "learning_rate": 2.5e-06, + "loss": 0.1027, + "step": 9536 + }, + { + "epoch": 1.5452041477640959, + "grad_norm": 0.8112584948539734, + "learning_rate": 2.499562695902343e-06, + "loss": 0.0952, + "step": 9537 + }, + { + "epoch": 1.5453661697990926, + "grad_norm": 0.8796746730804443, + "learning_rate": 2.4991253918180668e-06, + "loss": 0.1113, + "step": 9538 + }, + { + "epoch": 1.5455281918340895, + "grad_norm": 0.7987788915634155, + "learning_rate": 2.4986880877605504e-06, + "loss": 0.0995, + "step": 9539 + }, + { + "epoch": 1.5456902138690862, + "grad_norm": 0.924967348575592, + "learning_rate": 2.498250783743176e-06, + "loss": 0.1205, + "step": 9540 + }, + { + "epoch": 1.545852235904083, + "grad_norm": 0.9094009399414062, + "learning_rate": 2.497813479779324e-06, + "loss": 0.1179, + "step": 9541 + }, + { + "epoch": 1.5460142579390799, + "grad_norm": 0.8155476450920105, + "learning_rate": 2.4973761758823734e-06, + "loss": 0.1004, + "step": 9542 + }, + { + "epoch": 1.5461762799740764, + "grad_norm": 0.7931246161460876, + "learning_rate": 2.4969388720657058e-06, + "loss": 0.103, + "step": 9543 + }, + { + "epoch": 1.5463383020090733, + "grad_norm": 0.8180692791938782, + "learning_rate": 2.4965015683427005e-06, + "loss": 0.1055, + "step": 9544 + }, + { + "epoch": 1.54650032404407, + "grad_norm": 0.7871511578559875, + "learning_rate": 2.4960642647267395e-06, + "loss": 0.0943, + "step": 9545 + }, + { + "epoch": 1.5466623460790667, + "grad_norm": 0.8144401907920837, + "learning_rate": 2.4956269612312025e-06, + "loss": 0.1029, + "step": 9546 + }, + { + "epoch": 1.5468243681140637, + "grad_norm": 0.7799917459487915, + "learning_rate": 2.49518965786947e-06, + "loss": 0.1033, + "step": 9547 + }, + { + "epoch": 1.5469863901490601, + "grad_norm": 0.7506991028785706, + "learning_rate": 2.494752354654921e-06, + "loss": 0.1017, + "step": 9548 + }, + { + "epoch": 1.547148412184057, + "grad_norm": 0.80491042137146, + "learning_rate": 2.4943150516009386e-06, + "loss": 0.0994, + "step": 9549 + }, + { + "epoch": 1.5473104342190538, + "grad_norm": 0.7802474498748779, + "learning_rate": 2.4938777487209022e-06, + "loss": 0.1002, + "step": 9550 + }, + { + "epoch": 1.5474724562540505, + "grad_norm": 0.9787641167640686, + "learning_rate": 2.493440446028191e-06, + "loss": 0.1154, + "step": 9551 + }, + { + "epoch": 1.5476344782890474, + "grad_norm": 0.9254834055900574, + "learning_rate": 2.493003143536187e-06, + "loss": 0.118, + "step": 9552 + }, + { + "epoch": 1.547796500324044, + "grad_norm": 0.8243023157119751, + "learning_rate": 2.492565841258268e-06, + "loss": 0.1069, + "step": 9553 + }, + { + "epoch": 1.5479585223590409, + "grad_norm": 0.8549127578735352, + "learning_rate": 2.4921285392078184e-06, + "loss": 0.1025, + "step": 9554 + }, + { + "epoch": 1.5481205443940376, + "grad_norm": 0.8300154805183411, + "learning_rate": 2.4916912373982157e-06, + "loss": 0.1019, + "step": 9555 + }, + { + "epoch": 1.5482825664290343, + "grad_norm": 0.7590198516845703, + "learning_rate": 2.491253935842842e-06, + "loss": 0.0964, + "step": 9556 + }, + { + "epoch": 1.5484445884640312, + "grad_norm": 0.899855375289917, + "learning_rate": 2.490816634555075e-06, + "loss": 0.1141, + "step": 9557 + }, + { + "epoch": 1.5486066104990277, + "grad_norm": 0.7786848545074463, + "learning_rate": 2.490379333548297e-06, + "loss": 0.1019, + "step": 9558 + }, + { + "epoch": 1.5487686325340246, + "grad_norm": 0.7648515105247498, + "learning_rate": 2.489942032835889e-06, + "loss": 0.0955, + "step": 9559 + }, + { + "epoch": 1.5489306545690213, + "grad_norm": 1.0073561668395996, + "learning_rate": 2.4895047324312303e-06, + "loss": 0.1208, + "step": 9560 + }, + { + "epoch": 1.549092676604018, + "grad_norm": 0.7584699392318726, + "learning_rate": 2.4890674323477016e-06, + "loss": 0.0963, + "step": 9561 + }, + { + "epoch": 1.549254698639015, + "grad_norm": 0.757125735282898, + "learning_rate": 2.4886301325986827e-06, + "loss": 0.104, + "step": 9562 + }, + { + "epoch": 1.5494167206740117, + "grad_norm": 0.7259702682495117, + "learning_rate": 2.4881928331975534e-06, + "loss": 0.0944, + "step": 9563 + }, + { + "epoch": 1.5495787427090084, + "grad_norm": 0.786858320236206, + "learning_rate": 2.4877555341576955e-06, + "loss": 0.0979, + "step": 9564 + }, + { + "epoch": 1.5497407647440054, + "grad_norm": 0.7230560183525085, + "learning_rate": 2.487318235492489e-06, + "loss": 0.0913, + "step": 9565 + }, + { + "epoch": 1.5499027867790018, + "grad_norm": 0.7546066641807556, + "learning_rate": 2.4868809372153137e-06, + "loss": 0.0931, + "step": 9566 + }, + { + "epoch": 1.5500648088139988, + "grad_norm": 0.8449350595474243, + "learning_rate": 2.48644363933955e-06, + "loss": 0.1074, + "step": 9567 + }, + { + "epoch": 1.5502268308489955, + "grad_norm": 0.8704739212989807, + "learning_rate": 2.4860063418785773e-06, + "loss": 0.1049, + "step": 9568 + }, + { + "epoch": 1.5503888528839922, + "grad_norm": 0.7751917839050293, + "learning_rate": 2.485569044845777e-06, + "loss": 0.0923, + "step": 9569 + }, + { + "epoch": 1.5505508749189891, + "grad_norm": 0.7371068596839905, + "learning_rate": 2.4851317482545297e-06, + "loss": 0.0853, + "step": 9570 + }, + { + "epoch": 1.5507128969539856, + "grad_norm": 0.8631237745285034, + "learning_rate": 2.4846944521182144e-06, + "loss": 0.1069, + "step": 9571 + }, + { + "epoch": 1.5508749189889826, + "grad_norm": 0.8396697044372559, + "learning_rate": 2.4842571564502117e-06, + "loss": 0.1098, + "step": 9572 + }, + { + "epoch": 1.5510369410239793, + "grad_norm": 0.7827080488204956, + "learning_rate": 2.4838198612639018e-06, + "loss": 0.1022, + "step": 9573 + }, + { + "epoch": 1.551198963058976, + "grad_norm": 0.7799431681632996, + "learning_rate": 2.4833825665726657e-06, + "loss": 0.0985, + "step": 9574 + }, + { + "epoch": 1.551360985093973, + "grad_norm": 0.8115366101264954, + "learning_rate": 2.4829452723898824e-06, + "loss": 0.1082, + "step": 9575 + }, + { + "epoch": 1.5515230071289694, + "grad_norm": 0.7954683899879456, + "learning_rate": 2.482507978728933e-06, + "loss": 0.1012, + "step": 9576 + }, + { + "epoch": 1.5516850291639663, + "grad_norm": 0.8462672233581543, + "learning_rate": 2.482070685603196e-06, + "loss": 0.1127, + "step": 9577 + }, + { + "epoch": 1.551847051198963, + "grad_norm": 0.7555440664291382, + "learning_rate": 2.4816333930260535e-06, + "loss": 0.1004, + "step": 9578 + }, + { + "epoch": 1.5520090732339598, + "grad_norm": 0.7217851281166077, + "learning_rate": 2.481196101010885e-06, + "loss": 0.0943, + "step": 9579 + }, + { + "epoch": 1.5521710952689567, + "grad_norm": 0.8560697436332703, + "learning_rate": 2.4807588095710696e-06, + "loss": 0.1011, + "step": 9580 + }, + { + "epoch": 1.5523331173039532, + "grad_norm": 0.8755968809127808, + "learning_rate": 2.4803215187199883e-06, + "loss": 0.0974, + "step": 9581 + }, + { + "epoch": 1.5524951393389501, + "grad_norm": 0.7938658595085144, + "learning_rate": 2.4798842284710203e-06, + "loss": 0.1006, + "step": 9582 + }, + { + "epoch": 1.5526571613739468, + "grad_norm": 0.8489831686019897, + "learning_rate": 2.4794469388375476e-06, + "loss": 0.0943, + "step": 9583 + }, + { + "epoch": 1.5528191834089435, + "grad_norm": 0.963909924030304, + "learning_rate": 2.4790096498329477e-06, + "loss": 0.1159, + "step": 9584 + }, + { + "epoch": 1.5529812054439405, + "grad_norm": 0.8376120924949646, + "learning_rate": 2.4785723614706025e-06, + "loss": 0.1055, + "step": 9585 + }, + { + "epoch": 1.5531432274789372, + "grad_norm": 0.9356231689453125, + "learning_rate": 2.478135073763891e-06, + "loss": 0.1079, + "step": 9586 + }, + { + "epoch": 1.553305249513934, + "grad_norm": 0.904419481754303, + "learning_rate": 2.477697786726192e-06, + "loss": 0.112, + "step": 9587 + }, + { + "epoch": 1.5534672715489306, + "grad_norm": 0.7490609288215637, + "learning_rate": 2.4772605003708885e-06, + "loss": 0.0986, + "step": 9588 + }, + { + "epoch": 1.5536292935839273, + "grad_norm": 0.7533844113349915, + "learning_rate": 2.476823214711358e-06, + "loss": 0.0991, + "step": 9589 + }, + { + "epoch": 1.5537913156189243, + "grad_norm": 0.8684905767440796, + "learning_rate": 2.476385929760981e-06, + "loss": 0.1085, + "step": 9590 + }, + { + "epoch": 1.553953337653921, + "grad_norm": 0.8226639032363892, + "learning_rate": 2.475948645533138e-06, + "loss": 0.1083, + "step": 9591 + }, + { + "epoch": 1.5541153596889177, + "grad_norm": 0.8018856644630432, + "learning_rate": 2.475511362041207e-06, + "loss": 0.1005, + "step": 9592 + }, + { + "epoch": 1.5542773817239146, + "grad_norm": 0.6188633441925049, + "learning_rate": 2.47507407929857e-06, + "loss": 0.0776, + "step": 9593 + }, + { + "epoch": 1.554439403758911, + "grad_norm": 0.7864198088645935, + "learning_rate": 2.4746367973186063e-06, + "loss": 0.0957, + "step": 9594 + }, + { + "epoch": 1.554601425793908, + "grad_norm": 0.7992034554481506, + "learning_rate": 2.474199516114694e-06, + "loss": 0.099, + "step": 9595 + }, + { + "epoch": 1.5547634478289047, + "grad_norm": 0.849092423915863, + "learning_rate": 2.473762235700214e-06, + "loss": 0.1055, + "step": 9596 + }, + { + "epoch": 1.5549254698639015, + "grad_norm": 0.709578275680542, + "learning_rate": 2.473324956088547e-06, + "loss": 0.0904, + "step": 9597 + }, + { + "epoch": 1.5550874918988984, + "grad_norm": 0.7516464591026306, + "learning_rate": 2.472887677293072e-06, + "loss": 0.0981, + "step": 9598 + }, + { + "epoch": 1.5552495139338949, + "grad_norm": 0.7759402394294739, + "learning_rate": 2.472450399327168e-06, + "loss": 0.0952, + "step": 9599 + }, + { + "epoch": 1.5554115359688918, + "grad_norm": 0.7998428344726562, + "learning_rate": 2.4720131222042156e-06, + "loss": 0.1004, + "step": 9600 + }, + { + "epoch": 1.5555735580038885, + "grad_norm": 0.8491755127906799, + "learning_rate": 2.4715758459375923e-06, + "loss": 0.1057, + "step": 9601 + }, + { + "epoch": 1.5557355800388852, + "grad_norm": 0.8094433546066284, + "learning_rate": 2.4711385705406805e-06, + "loss": 0.0951, + "step": 9602 + }, + { + "epoch": 1.5558976020738822, + "grad_norm": 0.7939477562904358, + "learning_rate": 2.4707012960268594e-06, + "loss": 0.1063, + "step": 9603 + }, + { + "epoch": 1.5560596241088787, + "grad_norm": 1.0064921379089355, + "learning_rate": 2.4702640224095066e-06, + "loss": 0.1098, + "step": 9604 + }, + { + "epoch": 1.5562216461438756, + "grad_norm": 0.7896994948387146, + "learning_rate": 2.4698267497020035e-06, + "loss": 0.1027, + "step": 9605 + }, + { + "epoch": 1.5563836681788723, + "grad_norm": 0.9108518362045288, + "learning_rate": 2.469389477917727e-06, + "loss": 0.115, + "step": 9606 + }, + { + "epoch": 1.556545690213869, + "grad_norm": 0.7819544076919556, + "learning_rate": 2.4689522070700606e-06, + "loss": 0.0948, + "step": 9607 + }, + { + "epoch": 1.556707712248866, + "grad_norm": 0.7864106297492981, + "learning_rate": 2.4685149371723806e-06, + "loss": 0.1019, + "step": 9608 + }, + { + "epoch": 1.5568697342838627, + "grad_norm": 0.7853366136550903, + "learning_rate": 2.468077668238068e-06, + "loss": 0.1051, + "step": 9609 + }, + { + "epoch": 1.5570317563188594, + "grad_norm": 0.7699379324913025, + "learning_rate": 2.467640400280501e-06, + "loss": 0.0979, + "step": 9610 + }, + { + "epoch": 1.557193778353856, + "grad_norm": 0.7062657475471497, + "learning_rate": 2.4672031333130584e-06, + "loss": 0.0885, + "step": 9611 + }, + { + "epoch": 1.5573558003888528, + "grad_norm": 0.8259475231170654, + "learning_rate": 2.466765867349122e-06, + "loss": 0.0942, + "step": 9612 + }, + { + "epoch": 1.5575178224238497, + "grad_norm": 0.875845193862915, + "learning_rate": 2.466328602402069e-06, + "loss": 0.1193, + "step": 9613 + }, + { + "epoch": 1.5576798444588464, + "grad_norm": 0.7817234396934509, + "learning_rate": 2.46589133848528e-06, + "loss": 0.0976, + "step": 9614 + }, + { + "epoch": 1.5578418664938432, + "grad_norm": 0.8780922889709473, + "learning_rate": 2.465454075612132e-06, + "loss": 0.1128, + "step": 9615 + }, + { + "epoch": 1.55800388852884, + "grad_norm": 0.7593608498573303, + "learning_rate": 2.465016813796007e-06, + "loss": 0.1004, + "step": 9616 + }, + { + "epoch": 1.5581659105638366, + "grad_norm": 0.9103496074676514, + "learning_rate": 2.4645795530502823e-06, + "loss": 0.1147, + "step": 9617 + }, + { + "epoch": 1.5583279325988335, + "grad_norm": 0.7615711688995361, + "learning_rate": 2.464142293388338e-06, + "loss": 0.0939, + "step": 9618 + }, + { + "epoch": 1.5584899546338302, + "grad_norm": 0.7604543566703796, + "learning_rate": 2.463705034823553e-06, + "loss": 0.0899, + "step": 9619 + }, + { + "epoch": 1.558651976668827, + "grad_norm": 0.7992773056030273, + "learning_rate": 2.4632677773693048e-06, + "loss": 0.0994, + "step": 9620 + }, + { + "epoch": 1.5588139987038239, + "grad_norm": 0.7582528591156006, + "learning_rate": 2.462830521038975e-06, + "loss": 0.0977, + "step": 9621 + }, + { + "epoch": 1.5589760207388204, + "grad_norm": 0.7414041757583618, + "learning_rate": 2.4623932658459406e-06, + "loss": 0.0937, + "step": 9622 + }, + { + "epoch": 1.5591380427738173, + "grad_norm": 0.8438722491264343, + "learning_rate": 2.461956011803582e-06, + "loss": 0.1067, + "step": 9623 + }, + { + "epoch": 1.559300064808814, + "grad_norm": 0.8454424142837524, + "learning_rate": 2.461518758925277e-06, + "loss": 0.107, + "step": 9624 + }, + { + "epoch": 1.5594620868438107, + "grad_norm": 0.7657983303070068, + "learning_rate": 2.461081507224404e-06, + "loss": 0.0993, + "step": 9625 + }, + { + "epoch": 1.5596241088788076, + "grad_norm": 0.8123411536216736, + "learning_rate": 2.4606442567143434e-06, + "loss": 0.0954, + "step": 9626 + }, + { + "epoch": 1.5597861309138041, + "grad_norm": 0.8028444051742554, + "learning_rate": 2.460207007408474e-06, + "loss": 0.0998, + "step": 9627 + }, + { + "epoch": 1.559948152948801, + "grad_norm": 0.8465455174446106, + "learning_rate": 2.4597697593201728e-06, + "loss": 0.1055, + "step": 9628 + }, + { + "epoch": 1.5601101749837978, + "grad_norm": 0.7659565210342407, + "learning_rate": 2.4593325124628206e-06, + "loss": 0.0974, + "step": 9629 + }, + { + "epoch": 1.5602721970187945, + "grad_norm": 0.7613688707351685, + "learning_rate": 2.4588952668497937e-06, + "loss": 0.0921, + "step": 9630 + }, + { + "epoch": 1.5604342190537914, + "grad_norm": 0.8725959062576294, + "learning_rate": 2.458458022494473e-06, + "loss": 0.0952, + "step": 9631 + }, + { + "epoch": 1.560596241088788, + "grad_norm": 0.7647036910057068, + "learning_rate": 2.4580207794102364e-06, + "loss": 0.1001, + "step": 9632 + }, + { + "epoch": 1.5607582631237849, + "grad_norm": 0.7943955659866333, + "learning_rate": 2.4575835376104624e-06, + "loss": 0.098, + "step": 9633 + }, + { + "epoch": 1.5609202851587816, + "grad_norm": 0.7511798739433289, + "learning_rate": 2.4571462971085293e-06, + "loss": 0.0927, + "step": 9634 + }, + { + "epoch": 1.5610823071937783, + "grad_norm": 0.8319833874702454, + "learning_rate": 2.456709057917815e-06, + "loss": 0.0968, + "step": 9635 + }, + { + "epoch": 1.5612443292287752, + "grad_norm": 0.8254125118255615, + "learning_rate": 2.4562718200517003e-06, + "loss": 0.0988, + "step": 9636 + }, + { + "epoch": 1.561406351263772, + "grad_norm": 0.7782612442970276, + "learning_rate": 2.4558345835235613e-06, + "loss": 0.1042, + "step": 9637 + }, + { + "epoch": 1.5615683732987686, + "grad_norm": 0.8092712163925171, + "learning_rate": 2.4553973483467778e-06, + "loss": 0.1003, + "step": 9638 + }, + { + "epoch": 1.5617303953337653, + "grad_norm": 0.8870753049850464, + "learning_rate": 2.454960114534726e-06, + "loss": 0.1047, + "step": 9639 + }, + { + "epoch": 1.561892417368762, + "grad_norm": 0.7420865297317505, + "learning_rate": 2.454522882100787e-06, + "loss": 0.0899, + "step": 9640 + }, + { + "epoch": 1.562054439403759, + "grad_norm": 0.8217720985412598, + "learning_rate": 2.4540856510583374e-06, + "loss": 0.1038, + "step": 9641 + }, + { + "epoch": 1.5622164614387557, + "grad_norm": 0.7875351309776306, + "learning_rate": 2.453648421420756e-06, + "loss": 0.0947, + "step": 9642 + }, + { + "epoch": 1.5623784834737524, + "grad_norm": 0.8513997793197632, + "learning_rate": 2.4532111932014205e-06, + "loss": 0.1039, + "step": 9643 + }, + { + "epoch": 1.5625405055087493, + "grad_norm": 0.814285159111023, + "learning_rate": 2.4527739664137085e-06, + "loss": 0.0976, + "step": 9644 + }, + { + "epoch": 1.5627025275437458, + "grad_norm": 0.8172099590301514, + "learning_rate": 2.4523367410710004e-06, + "loss": 0.096, + "step": 9645 + }, + { + "epoch": 1.5628645495787428, + "grad_norm": 0.7031353116035461, + "learning_rate": 2.4518995171866717e-06, + "loss": 0.0891, + "step": 9646 + }, + { + "epoch": 1.5630265716137395, + "grad_norm": 0.9233510494232178, + "learning_rate": 2.451462294774102e-06, + "loss": 0.1044, + "step": 9647 + }, + { + "epoch": 1.5631885936487362, + "grad_norm": 0.9054529070854187, + "learning_rate": 2.451025073846668e-06, + "loss": 0.1108, + "step": 9648 + }, + { + "epoch": 1.5633506156837331, + "grad_norm": 0.8287831544876099, + "learning_rate": 2.450587854417748e-06, + "loss": 0.1046, + "step": 9649 + }, + { + "epoch": 1.5635126377187296, + "grad_norm": 0.9027771949768066, + "learning_rate": 2.45015063650072e-06, + "loss": 0.1089, + "step": 9650 + }, + { + "epoch": 1.5636746597537265, + "grad_norm": 0.7133320569992065, + "learning_rate": 2.449713420108963e-06, + "loss": 0.0931, + "step": 9651 + }, + { + "epoch": 1.5638366817887233, + "grad_norm": 0.8660880327224731, + "learning_rate": 2.449276205255853e-06, + "loss": 0.1106, + "step": 9652 + }, + { + "epoch": 1.56399870382372, + "grad_norm": 0.779906690120697, + "learning_rate": 2.448838991954769e-06, + "loss": 0.0977, + "step": 9653 + }, + { + "epoch": 1.564160725858717, + "grad_norm": 0.9731188416481018, + "learning_rate": 2.448401780219087e-06, + "loss": 0.1092, + "step": 9654 + }, + { + "epoch": 1.5643227478937134, + "grad_norm": 0.8192594647407532, + "learning_rate": 2.447964570062186e-06, + "loss": 0.0919, + "step": 9655 + }, + { + "epoch": 1.5644847699287103, + "grad_norm": 0.8036348223686218, + "learning_rate": 2.4475273614974437e-06, + "loss": 0.0945, + "step": 9656 + }, + { + "epoch": 1.564646791963707, + "grad_norm": 0.8575658202171326, + "learning_rate": 2.447090154538237e-06, + "loss": 0.1013, + "step": 9657 + }, + { + "epoch": 1.5648088139987038, + "grad_norm": 0.7713079452514648, + "learning_rate": 2.4466529491979437e-06, + "loss": 0.1026, + "step": 9658 + }, + { + "epoch": 1.5649708360337007, + "grad_norm": 0.7967318892478943, + "learning_rate": 2.4462157454899393e-06, + "loss": 0.0993, + "step": 9659 + }, + { + "epoch": 1.5651328580686974, + "grad_norm": 0.8018705248832703, + "learning_rate": 2.445778543427605e-06, + "loss": 0.1044, + "step": 9660 + }, + { + "epoch": 1.565294880103694, + "grad_norm": 0.7897207140922546, + "learning_rate": 2.445341343024315e-06, + "loss": 0.1001, + "step": 9661 + }, + { + "epoch": 1.5654569021386908, + "grad_norm": 0.818270206451416, + "learning_rate": 2.4449041442934485e-06, + "loss": 0.092, + "step": 9662 + }, + { + "epoch": 1.5656189241736875, + "grad_norm": 0.9617789387702942, + "learning_rate": 2.4444669472483807e-06, + "loss": 0.1061, + "step": 9663 + }, + { + "epoch": 1.5657809462086845, + "grad_norm": 0.7740570902824402, + "learning_rate": 2.4440297519024906e-06, + "loss": 0.1002, + "step": 9664 + }, + { + "epoch": 1.5659429682436812, + "grad_norm": 0.8566815853118896, + "learning_rate": 2.443592558269155e-06, + "loss": 0.1084, + "step": 9665 + }, + { + "epoch": 1.5661049902786779, + "grad_norm": 0.8553325533866882, + "learning_rate": 2.4431553663617502e-06, + "loss": 0.1038, + "step": 9666 + }, + { + "epoch": 1.5662670123136748, + "grad_norm": 0.7744235396385193, + "learning_rate": 2.4427181761936535e-06, + "loss": 0.0937, + "step": 9667 + }, + { + "epoch": 1.5664290343486713, + "grad_norm": 0.8114905953407288, + "learning_rate": 2.4422809877782417e-06, + "loss": 0.1026, + "step": 9668 + }, + { + "epoch": 1.5665910563836682, + "grad_norm": 0.8174008727073669, + "learning_rate": 2.4418438011288926e-06, + "loss": 0.0945, + "step": 9669 + }, + { + "epoch": 1.566753078418665, + "grad_norm": 0.9269281625747681, + "learning_rate": 2.4414066162589823e-06, + "loss": 0.1222, + "step": 9670 + }, + { + "epoch": 1.5669151004536617, + "grad_norm": 0.7899512052536011, + "learning_rate": 2.4409694331818884e-06, + "loss": 0.1009, + "step": 9671 + }, + { + "epoch": 1.5670771224886586, + "grad_norm": 0.7267419099807739, + "learning_rate": 2.4405322519109864e-06, + "loss": 0.0993, + "step": 9672 + }, + { + "epoch": 1.567239144523655, + "grad_norm": 0.8098709583282471, + "learning_rate": 2.4400950724596527e-06, + "loss": 0.0992, + "step": 9673 + }, + { + "epoch": 1.567401166558652, + "grad_norm": 0.7339199185371399, + "learning_rate": 2.4396578948412664e-06, + "loss": 0.1005, + "step": 9674 + }, + { + "epoch": 1.5675631885936487, + "grad_norm": 0.7503772377967834, + "learning_rate": 2.4392207190692015e-06, + "loss": 0.1032, + "step": 9675 + }, + { + "epoch": 1.5677252106286454, + "grad_norm": 0.8887887001037598, + "learning_rate": 2.4387835451568355e-06, + "loss": 0.12, + "step": 9676 + }, + { + "epoch": 1.5678872326636424, + "grad_norm": 0.8450667262077332, + "learning_rate": 2.4383463731175457e-06, + "loss": 0.108, + "step": 9677 + }, + { + "epoch": 1.5680492546986389, + "grad_norm": 0.9332994818687439, + "learning_rate": 2.4379092029647067e-06, + "loss": 0.1086, + "step": 9678 + }, + { + "epoch": 1.5682112767336358, + "grad_norm": 0.7733569741249084, + "learning_rate": 2.437472034711696e-06, + "loss": 0.0988, + "step": 9679 + }, + { + "epoch": 1.5683732987686325, + "grad_norm": 0.806662917137146, + "learning_rate": 2.4370348683718906e-06, + "loss": 0.0925, + "step": 9680 + }, + { + "epoch": 1.5685353208036292, + "grad_norm": 0.8055416941642761, + "learning_rate": 2.436597703958665e-06, + "loss": 0.108, + "step": 9681 + }, + { + "epoch": 1.5686973428386262, + "grad_norm": 0.7006134390830994, + "learning_rate": 2.436160541485396e-06, + "loss": 0.0885, + "step": 9682 + }, + { + "epoch": 1.5688593648736227, + "grad_norm": 0.8832897543907166, + "learning_rate": 2.4357233809654608e-06, + "loss": 0.1098, + "step": 9683 + }, + { + "epoch": 1.5690213869086196, + "grad_norm": 0.7593520879745483, + "learning_rate": 2.4352862224122344e-06, + "loss": 0.0944, + "step": 9684 + }, + { + "epoch": 1.5691834089436163, + "grad_norm": 0.8768290877342224, + "learning_rate": 2.4348490658390924e-06, + "loss": 0.114, + "step": 9685 + }, + { + "epoch": 1.569345430978613, + "grad_norm": 0.7737889289855957, + "learning_rate": 2.4344119112594124e-06, + "loss": 0.0968, + "step": 9686 + }, + { + "epoch": 1.56950745301361, + "grad_norm": 0.6443475484848022, + "learning_rate": 2.4339747586865677e-06, + "loss": 0.0738, + "step": 9687 + }, + { + "epoch": 1.5696694750486067, + "grad_norm": 0.7420233488082886, + "learning_rate": 2.4335376081339364e-06, + "loss": 0.0994, + "step": 9688 + }, + { + "epoch": 1.5698314970836034, + "grad_norm": 0.8285709023475647, + "learning_rate": 2.433100459614894e-06, + "loss": 0.1032, + "step": 9689 + }, + { + "epoch": 1.5699935191186, + "grad_norm": 0.8334299921989441, + "learning_rate": 2.4326633131428147e-06, + "loss": 0.1027, + "step": 9690 + }, + { + "epoch": 1.5701555411535968, + "grad_norm": 0.7420817613601685, + "learning_rate": 2.432226168731076e-06, + "loss": 0.0909, + "step": 9691 + }, + { + "epoch": 1.5703175631885937, + "grad_norm": 0.7946280241012573, + "learning_rate": 2.4317890263930516e-06, + "loss": 0.0953, + "step": 9692 + }, + { + "epoch": 1.5704795852235904, + "grad_norm": 0.8967946767807007, + "learning_rate": 2.431351886142118e-06, + "loss": 0.1133, + "step": 9693 + }, + { + "epoch": 1.5706416072585871, + "grad_norm": 0.7146013379096985, + "learning_rate": 2.430914747991651e-06, + "loss": 0.0835, + "step": 9694 + }, + { + "epoch": 1.570803629293584, + "grad_norm": 0.9583059549331665, + "learning_rate": 2.430477611955026e-06, + "loss": 0.1078, + "step": 9695 + }, + { + "epoch": 1.5709656513285806, + "grad_norm": 0.8162189722061157, + "learning_rate": 2.430040478045617e-06, + "loss": 0.1036, + "step": 9696 + }, + { + "epoch": 1.5711276733635775, + "grad_norm": 0.8183581829071045, + "learning_rate": 2.4296033462768e-06, + "loss": 0.0958, + "step": 9697 + }, + { + "epoch": 1.5712896953985742, + "grad_norm": 0.7942171096801758, + "learning_rate": 2.429166216661951e-06, + "loss": 0.0945, + "step": 9698 + }, + { + "epoch": 1.571451717433571, + "grad_norm": 0.9565669298171997, + "learning_rate": 2.4287290892144434e-06, + "loss": 0.1132, + "step": 9699 + }, + { + "epoch": 1.5716137394685679, + "grad_norm": 0.7852692604064941, + "learning_rate": 2.4282919639476544e-06, + "loss": 0.0944, + "step": 9700 + }, + { + "epoch": 1.5717757615035644, + "grad_norm": 0.8304145336151123, + "learning_rate": 2.427854840874957e-06, + "loss": 0.092, + "step": 9701 + }, + { + "epoch": 1.5719377835385613, + "grad_norm": 0.869964599609375, + "learning_rate": 2.4274177200097266e-06, + "loss": 0.1011, + "step": 9702 + }, + { + "epoch": 1.572099805573558, + "grad_norm": 0.9880481958389282, + "learning_rate": 2.4269806013653385e-06, + "loss": 0.1098, + "step": 9703 + }, + { + "epoch": 1.5722618276085547, + "grad_norm": 0.8023558259010315, + "learning_rate": 2.426543484955168e-06, + "loss": 0.1075, + "step": 9704 + }, + { + "epoch": 1.5724238496435516, + "grad_norm": 0.8949816823005676, + "learning_rate": 2.426106370792588e-06, + "loss": 0.1046, + "step": 9705 + }, + { + "epoch": 1.5725858716785481, + "grad_norm": 1.0093536376953125, + "learning_rate": 2.425669258890975e-06, + "loss": 0.1205, + "step": 9706 + }, + { + "epoch": 1.572747893713545, + "grad_norm": 0.8207192420959473, + "learning_rate": 2.4252321492637027e-06, + "loss": 0.1094, + "step": 9707 + }, + { + "epoch": 1.5729099157485418, + "grad_norm": 0.9045940637588501, + "learning_rate": 2.4247950419241457e-06, + "loss": 0.1129, + "step": 9708 + }, + { + "epoch": 1.5730719377835385, + "grad_norm": 0.9074610471725464, + "learning_rate": 2.4243579368856787e-06, + "loss": 0.1103, + "step": 9709 + }, + { + "epoch": 1.5732339598185354, + "grad_norm": 0.7920365333557129, + "learning_rate": 2.4239208341616755e-06, + "loss": 0.099, + "step": 9710 + }, + { + "epoch": 1.5733959818535321, + "grad_norm": 0.9034141898155212, + "learning_rate": 2.4234837337655098e-06, + "loss": 0.1067, + "step": 9711 + }, + { + "epoch": 1.5735580038885288, + "grad_norm": 0.7888884544372559, + "learning_rate": 2.4230466357105575e-06, + "loss": 0.1013, + "step": 9712 + }, + { + "epoch": 1.5737200259235256, + "grad_norm": 0.8596574068069458, + "learning_rate": 2.422609540010192e-06, + "loss": 0.1108, + "step": 9713 + }, + { + "epoch": 1.5738820479585223, + "grad_norm": 0.9732898473739624, + "learning_rate": 2.4221724466777874e-06, + "loss": 0.0983, + "step": 9714 + }, + { + "epoch": 1.5740440699935192, + "grad_norm": 0.8367780447006226, + "learning_rate": 2.421735355726718e-06, + "loss": 0.1024, + "step": 9715 + }, + { + "epoch": 1.574206092028516, + "grad_norm": 0.7838643193244934, + "learning_rate": 2.421298267170356e-06, + "loss": 0.0972, + "step": 9716 + }, + { + "epoch": 1.5743681140635126, + "grad_norm": 0.9189515709877014, + "learning_rate": 2.420861181022077e-06, + "loss": 0.1078, + "step": 9717 + }, + { + "epoch": 1.5745301360985096, + "grad_norm": 0.812330424785614, + "learning_rate": 2.420424097295255e-06, + "loss": 0.1065, + "step": 9718 + }, + { + "epoch": 1.574692158133506, + "grad_norm": 0.7901645302772522, + "learning_rate": 2.419987016003263e-06, + "loss": 0.0998, + "step": 9719 + }, + { + "epoch": 1.574854180168503, + "grad_norm": 0.9493116736412048, + "learning_rate": 2.419549937159474e-06, + "loss": 0.1202, + "step": 9720 + }, + { + "epoch": 1.5750162022034997, + "grad_norm": 0.873347282409668, + "learning_rate": 2.419112860777262e-06, + "loss": 0.0935, + "step": 9721 + }, + { + "epoch": 1.5751782242384964, + "grad_norm": 0.7348765730857849, + "learning_rate": 2.418675786870002e-06, + "loss": 0.0989, + "step": 9722 + }, + { + "epoch": 1.5753402462734933, + "grad_norm": 0.9190308451652527, + "learning_rate": 2.4182387154510653e-06, + "loss": 0.1148, + "step": 9723 + }, + { + "epoch": 1.5755022683084898, + "grad_norm": 0.8104141354560852, + "learning_rate": 2.4178016465338266e-06, + "loss": 0.102, + "step": 9724 + }, + { + "epoch": 1.5756642903434868, + "grad_norm": 0.826193630695343, + "learning_rate": 2.417364580131658e-06, + "loss": 0.1042, + "step": 9725 + }, + { + "epoch": 1.5758263123784835, + "grad_norm": 0.7655068635940552, + "learning_rate": 2.416927516257933e-06, + "loss": 0.0976, + "step": 9726 + }, + { + "epoch": 1.5759883344134802, + "grad_norm": 0.8392152190208435, + "learning_rate": 2.416490454926026e-06, + "loss": 0.1102, + "step": 9727 + }, + { + "epoch": 1.5761503564484771, + "grad_norm": 0.7975762486457825, + "learning_rate": 2.416053396149308e-06, + "loss": 0.1035, + "step": 9728 + }, + { + "epoch": 1.5763123784834736, + "grad_norm": 0.8951084613800049, + "learning_rate": 2.4156163399411534e-06, + "loss": 0.0971, + "step": 9729 + }, + { + "epoch": 1.5764744005184705, + "grad_norm": 0.8403456211090088, + "learning_rate": 2.415179286314934e-06, + "loss": 0.1045, + "step": 9730 + }, + { + "epoch": 1.5766364225534673, + "grad_norm": 0.8686217665672302, + "learning_rate": 2.414742235284024e-06, + "loss": 0.1051, + "step": 9731 + }, + { + "epoch": 1.576798444588464, + "grad_norm": 0.810461699962616, + "learning_rate": 2.414305186861795e-06, + "loss": 0.107, + "step": 9732 + }, + { + "epoch": 1.576960466623461, + "grad_norm": 0.8022754788398743, + "learning_rate": 2.4138681410616206e-06, + "loss": 0.1036, + "step": 9733 + }, + { + "epoch": 1.5771224886584574, + "grad_norm": 0.8617199063301086, + "learning_rate": 2.4134310978968716e-06, + "loss": 0.112, + "step": 9734 + }, + { + "epoch": 1.5772845106934543, + "grad_norm": 0.8537782430648804, + "learning_rate": 2.4129940573809213e-06, + "loss": 0.1043, + "step": 9735 + }, + { + "epoch": 1.577446532728451, + "grad_norm": 0.7416249513626099, + "learning_rate": 2.412557019527143e-06, + "loss": 0.0956, + "step": 9736 + }, + { + "epoch": 1.5776085547634477, + "grad_norm": 0.8986727595329285, + "learning_rate": 2.4121199843489084e-06, + "loss": 0.1091, + "step": 9737 + }, + { + "epoch": 1.5777705767984447, + "grad_norm": 0.7465632557868958, + "learning_rate": 2.4116829518595896e-06, + "loss": 0.0912, + "step": 9738 + }, + { + "epoch": 1.5779325988334414, + "grad_norm": 0.7991532683372498, + "learning_rate": 2.4112459220725588e-06, + "loss": 0.0968, + "step": 9739 + }, + { + "epoch": 1.578094620868438, + "grad_norm": 0.7432222366333008, + "learning_rate": 2.410808895001187e-06, + "loss": 0.0916, + "step": 9740 + }, + { + "epoch": 1.5782566429034348, + "grad_norm": 0.8202318549156189, + "learning_rate": 2.4103718706588477e-06, + "loss": 0.106, + "step": 9741 + }, + { + "epoch": 1.5784186649384315, + "grad_norm": 0.8392171263694763, + "learning_rate": 2.409934849058913e-06, + "loss": 0.1007, + "step": 9742 + }, + { + "epoch": 1.5785806869734285, + "grad_norm": 0.8795257806777954, + "learning_rate": 2.4094978302147533e-06, + "loss": 0.106, + "step": 9743 + }, + { + "epoch": 1.5787427090084252, + "grad_norm": 0.8620293140411377, + "learning_rate": 2.4090608141397417e-06, + "loss": 0.1091, + "step": 9744 + }, + { + "epoch": 1.5789047310434219, + "grad_norm": 0.7913221120834351, + "learning_rate": 2.4086238008472473e-06, + "loss": 0.1002, + "step": 9745 + }, + { + "epoch": 1.5790667530784188, + "grad_norm": 0.8350681662559509, + "learning_rate": 2.408186790350645e-06, + "loss": 0.0996, + "step": 9746 + }, + { + "epoch": 1.5792287751134153, + "grad_norm": 0.6608853936195374, + "learning_rate": 2.4077497826633045e-06, + "loss": 0.0797, + "step": 9747 + }, + { + "epoch": 1.5793907971484122, + "grad_norm": 0.8556117415428162, + "learning_rate": 2.4073127777985982e-06, + "loss": 0.1055, + "step": 9748 + }, + { + "epoch": 1.579552819183409, + "grad_norm": 0.8157343864440918, + "learning_rate": 2.4068757757698954e-06, + "loss": 0.0863, + "step": 9749 + }, + { + "epoch": 1.5797148412184057, + "grad_norm": 0.8553090691566467, + "learning_rate": 2.406438776590568e-06, + "loss": 0.0932, + "step": 9750 + }, + { + "epoch": 1.5798768632534026, + "grad_norm": 0.8190301656723022, + "learning_rate": 2.406001780273989e-06, + "loss": 0.1047, + "step": 9751 + }, + { + "epoch": 1.580038885288399, + "grad_norm": 0.7428902983665466, + "learning_rate": 2.4055647868335273e-06, + "loss": 0.0918, + "step": 9752 + }, + { + "epoch": 1.580200907323396, + "grad_norm": 0.7674239277839661, + "learning_rate": 2.4051277962825555e-06, + "loss": 0.0956, + "step": 9753 + }, + { + "epoch": 1.5803629293583927, + "grad_norm": 0.7433567047119141, + "learning_rate": 2.404690808634442e-06, + "loss": 0.0922, + "step": 9754 + }, + { + "epoch": 1.5805249513933894, + "grad_norm": 0.8673844337463379, + "learning_rate": 2.40425382390256e-06, + "loss": 0.1007, + "step": 9755 + }, + { + "epoch": 1.5806869734283864, + "grad_norm": 0.8878071904182434, + "learning_rate": 2.4038168421002795e-06, + "loss": 0.1117, + "step": 9756 + }, + { + "epoch": 1.5808489954633829, + "grad_norm": 0.81288743019104, + "learning_rate": 2.403379863240971e-06, + "loss": 0.104, + "step": 9757 + }, + { + "epoch": 1.5810110174983798, + "grad_norm": 0.8590624332427979, + "learning_rate": 2.4029428873380044e-06, + "loss": 0.1045, + "step": 9758 + }, + { + "epoch": 1.5811730395333765, + "grad_norm": 0.8249778151512146, + "learning_rate": 2.40250591440475e-06, + "loss": 0.0995, + "step": 9759 + }, + { + "epoch": 1.5813350615683732, + "grad_norm": 0.747291624546051, + "learning_rate": 2.4020689444545796e-06, + "loss": 0.0925, + "step": 9760 + }, + { + "epoch": 1.5814970836033702, + "grad_norm": 0.8458421230316162, + "learning_rate": 2.4016319775008623e-06, + "loss": 0.1052, + "step": 9761 + }, + { + "epoch": 1.5816591056383669, + "grad_norm": 0.7993534803390503, + "learning_rate": 2.401195013556969e-06, + "loss": 0.1025, + "step": 9762 + }, + { + "epoch": 1.5818211276733636, + "grad_norm": 0.8762085437774658, + "learning_rate": 2.4007580526362685e-06, + "loss": 0.1053, + "step": 9763 + }, + { + "epoch": 1.5819831497083603, + "grad_norm": 0.9038422107696533, + "learning_rate": 2.400321094752131e-06, + "loss": 0.1019, + "step": 9764 + }, + { + "epoch": 1.582145171743357, + "grad_norm": 0.793109655380249, + "learning_rate": 2.399884139917927e-06, + "loss": 0.0976, + "step": 9765 + }, + { + "epoch": 1.582307193778354, + "grad_norm": 0.8598572611808777, + "learning_rate": 2.399447188147027e-06, + "loss": 0.103, + "step": 9766 + }, + { + "epoch": 1.5824692158133506, + "grad_norm": 0.7793141007423401, + "learning_rate": 2.399010239452799e-06, + "loss": 0.0916, + "step": 9767 + }, + { + "epoch": 1.5826312378483474, + "grad_norm": 0.8122705817222595, + "learning_rate": 2.3985732938486137e-06, + "loss": 0.0946, + "step": 9768 + }, + { + "epoch": 1.5827932598833443, + "grad_norm": 0.8980499505996704, + "learning_rate": 2.3981363513478394e-06, + "loss": 0.1131, + "step": 9769 + }, + { + "epoch": 1.5829552819183408, + "grad_norm": 0.8115260004997253, + "learning_rate": 2.3976994119638464e-06, + "loss": 0.1007, + "step": 9770 + }, + { + "epoch": 1.5831173039533377, + "grad_norm": 0.766045868396759, + "learning_rate": 2.3972624757100044e-06, + "loss": 0.1038, + "step": 9771 + }, + { + "epoch": 1.5832793259883344, + "grad_norm": 0.8186202049255371, + "learning_rate": 2.3968255425996817e-06, + "loss": 0.1021, + "step": 9772 + }, + { + "epoch": 1.5834413480233311, + "grad_norm": 0.7019743919372559, + "learning_rate": 2.396388612646247e-06, + "loss": 0.0887, + "step": 9773 + }, + { + "epoch": 1.583603370058328, + "grad_norm": 0.9179200530052185, + "learning_rate": 2.3959516858630707e-06, + "loss": 0.1108, + "step": 9774 + }, + { + "epoch": 1.5837653920933246, + "grad_norm": 0.8982585668563843, + "learning_rate": 2.395514762263522e-06, + "loss": 0.1048, + "step": 9775 + }, + { + "epoch": 1.5839274141283215, + "grad_norm": 0.8416269421577454, + "learning_rate": 2.3950778418609676e-06, + "loss": 0.1036, + "step": 9776 + }, + { + "epoch": 1.5840894361633182, + "grad_norm": 0.6837180256843567, + "learning_rate": 2.3946409246687775e-06, + "loss": 0.083, + "step": 9777 + }, + { + "epoch": 1.584251458198315, + "grad_norm": 0.7735916376113892, + "learning_rate": 2.39420401070032e-06, + "loss": 0.0908, + "step": 9778 + }, + { + "epoch": 1.5844134802333119, + "grad_norm": 0.894906759262085, + "learning_rate": 2.3937670999689634e-06, + "loss": 0.1112, + "step": 9779 + }, + { + "epoch": 1.5845755022683083, + "grad_norm": 0.8407071232795715, + "learning_rate": 2.3933301924880768e-06, + "loss": 0.1083, + "step": 9780 + }, + { + "epoch": 1.5847375243033053, + "grad_norm": 0.7255405187606812, + "learning_rate": 2.392893288271029e-06, + "loss": 0.088, + "step": 9781 + }, + { + "epoch": 1.584899546338302, + "grad_norm": 0.8209105134010315, + "learning_rate": 2.3924563873311868e-06, + "loss": 0.104, + "step": 9782 + }, + { + "epoch": 1.5850615683732987, + "grad_norm": 0.8289636373519897, + "learning_rate": 2.3920194896819183e-06, + "loss": 0.1019, + "step": 9783 + }, + { + "epoch": 1.5852235904082956, + "grad_norm": 0.8864776492118835, + "learning_rate": 2.391582595336593e-06, + "loss": 0.1073, + "step": 9784 + }, + { + "epoch": 1.5853856124432921, + "grad_norm": 0.8953683376312256, + "learning_rate": 2.391145704308577e-06, + "loss": 0.1121, + "step": 9785 + }, + { + "epoch": 1.585547634478289, + "grad_norm": 0.8129318356513977, + "learning_rate": 2.3907088166112406e-06, + "loss": 0.1012, + "step": 9786 + }, + { + "epoch": 1.5857096565132858, + "grad_norm": 0.7446659207344055, + "learning_rate": 2.3902719322579487e-06, + "loss": 0.0929, + "step": 9787 + }, + { + "epoch": 1.5858716785482825, + "grad_norm": 0.8847704529762268, + "learning_rate": 2.3898350512620696e-06, + "loss": 0.1074, + "step": 9788 + }, + { + "epoch": 1.5860337005832794, + "grad_norm": 0.8319103121757507, + "learning_rate": 2.389398173636972e-06, + "loss": 0.1071, + "step": 9789 + }, + { + "epoch": 1.5861957226182761, + "grad_norm": 0.884433388710022, + "learning_rate": 2.3889612993960233e-06, + "loss": 0.1163, + "step": 9790 + }, + { + "epoch": 1.5863577446532728, + "grad_norm": 0.813884973526001, + "learning_rate": 2.3885244285525892e-06, + "loss": 0.1017, + "step": 9791 + }, + { + "epoch": 1.5865197666882696, + "grad_norm": 0.8731319308280945, + "learning_rate": 2.3880875611200387e-06, + "loss": 0.1089, + "step": 9792 + }, + { + "epoch": 1.5866817887232663, + "grad_norm": 0.7939242720603943, + "learning_rate": 2.387650697111737e-06, + "loss": 0.1039, + "step": 9793 + }, + { + "epoch": 1.5868438107582632, + "grad_norm": 0.7616491317749023, + "learning_rate": 2.3872138365410525e-06, + "loss": 0.1019, + "step": 9794 + }, + { + "epoch": 1.58700583279326, + "grad_norm": 0.8765544295310974, + "learning_rate": 2.386776979421352e-06, + "loss": 0.1133, + "step": 9795 + }, + { + "epoch": 1.5871678548282566, + "grad_norm": 0.8053047060966492, + "learning_rate": 2.3863401257660016e-06, + "loss": 0.0989, + "step": 9796 + }, + { + "epoch": 1.5873298768632536, + "grad_norm": 0.7080039381980896, + "learning_rate": 2.3859032755883677e-06, + "loss": 0.0945, + "step": 9797 + }, + { + "epoch": 1.58749189889825, + "grad_norm": 0.7942770719528198, + "learning_rate": 2.3854664289018182e-06, + "loss": 0.1005, + "step": 9798 + }, + { + "epoch": 1.587653920933247, + "grad_norm": 0.8453302383422852, + "learning_rate": 2.3850295857197193e-06, + "loss": 0.1095, + "step": 9799 + }, + { + "epoch": 1.5878159429682437, + "grad_norm": 0.9450112581253052, + "learning_rate": 2.3845927460554363e-06, + "loss": 0.1154, + "step": 9800 + }, + { + "epoch": 1.5879779650032404, + "grad_norm": 0.8308724164962769, + "learning_rate": 2.3841559099223363e-06, + "loss": 0.1083, + "step": 9801 + }, + { + "epoch": 1.5881399870382373, + "grad_norm": 0.8302947878837585, + "learning_rate": 2.383719077333784e-06, + "loss": 0.0977, + "step": 9802 + }, + { + "epoch": 1.5883020090732338, + "grad_norm": 0.8785433173179626, + "learning_rate": 2.3832822483031477e-06, + "loss": 0.1076, + "step": 9803 + }, + { + "epoch": 1.5884640311082308, + "grad_norm": 0.7806341648101807, + "learning_rate": 2.382845422843792e-06, + "loss": 0.1086, + "step": 9804 + }, + { + "epoch": 1.5886260531432275, + "grad_norm": 0.737848699092865, + "learning_rate": 2.382408600969083e-06, + "loss": 0.0928, + "step": 9805 + }, + { + "epoch": 1.5887880751782242, + "grad_norm": 0.8146355748176575, + "learning_rate": 2.381971782692386e-06, + "loss": 0.1076, + "step": 9806 + }, + { + "epoch": 1.5889500972132211, + "grad_norm": 0.7324455380439758, + "learning_rate": 2.3815349680270654e-06, + "loss": 0.0893, + "step": 9807 + }, + { + "epoch": 1.5891121192482176, + "grad_norm": 0.781540036201477, + "learning_rate": 2.3810981569864898e-06, + "loss": 0.097, + "step": 9808 + }, + { + "epoch": 1.5892741412832145, + "grad_norm": 0.7014777660369873, + "learning_rate": 2.3806613495840227e-06, + "loss": 0.0886, + "step": 9809 + }, + { + "epoch": 1.5894361633182112, + "grad_norm": 0.8267843723297119, + "learning_rate": 2.38022454583303e-06, + "loss": 0.1021, + "step": 9810 + }, + { + "epoch": 1.589598185353208, + "grad_norm": 0.7938618063926697, + "learning_rate": 2.379787745746875e-06, + "loss": 0.0953, + "step": 9811 + }, + { + "epoch": 1.589760207388205, + "grad_norm": 0.9035655856132507, + "learning_rate": 2.379350949338924e-06, + "loss": 0.1138, + "step": 9812 + }, + { + "epoch": 1.5899222294232016, + "grad_norm": 0.7727793455123901, + "learning_rate": 2.3789141566225437e-06, + "loss": 0.0966, + "step": 9813 + }, + { + "epoch": 1.5900842514581983, + "grad_norm": 1.0545909404754639, + "learning_rate": 2.378477367611096e-06, + "loss": 0.12, + "step": 9814 + }, + { + "epoch": 1.590246273493195, + "grad_norm": 0.8968018889427185, + "learning_rate": 2.378040582317947e-06, + "loss": 0.109, + "step": 9815 + }, + { + "epoch": 1.5904082955281917, + "grad_norm": 0.7647299766540527, + "learning_rate": 2.377603800756461e-06, + "loss": 0.0939, + "step": 9816 + }, + { + "epoch": 1.5905703175631887, + "grad_norm": 0.8128365278244019, + "learning_rate": 2.377167022940002e-06, + "loss": 0.1, + "step": 9817 + }, + { + "epoch": 1.5907323395981854, + "grad_norm": 0.7933886051177979, + "learning_rate": 2.376730248881935e-06, + "loss": 0.1007, + "step": 9818 + }, + { + "epoch": 1.590894361633182, + "grad_norm": 0.8362367153167725, + "learning_rate": 2.376293478595625e-06, + "loss": 0.1051, + "step": 9819 + }, + { + "epoch": 1.591056383668179, + "grad_norm": 0.7798475027084351, + "learning_rate": 2.3758567120944345e-06, + "loss": 0.0938, + "step": 9820 + }, + { + "epoch": 1.5912184057031755, + "grad_norm": 0.8067512512207031, + "learning_rate": 2.3754199493917277e-06, + "loss": 0.0933, + "step": 9821 + }, + { + "epoch": 1.5913804277381725, + "grad_norm": 0.7828888893127441, + "learning_rate": 2.3749831905008704e-06, + "loss": 0.1024, + "step": 9822 + }, + { + "epoch": 1.5915424497731692, + "grad_norm": 0.7774432897567749, + "learning_rate": 2.3745464354352236e-06, + "loss": 0.0959, + "step": 9823 + }, + { + "epoch": 1.5917044718081659, + "grad_norm": 0.7905421257019043, + "learning_rate": 2.374109684208153e-06, + "loss": 0.0848, + "step": 9824 + }, + { + "epoch": 1.5918664938431628, + "grad_norm": 0.9159008264541626, + "learning_rate": 2.3736729368330212e-06, + "loss": 0.1114, + "step": 9825 + }, + { + "epoch": 1.5920285158781593, + "grad_norm": 0.9248816967010498, + "learning_rate": 2.3732361933231917e-06, + "loss": 0.0909, + "step": 9826 + }, + { + "epoch": 1.5921905379131562, + "grad_norm": 0.7748779654502869, + "learning_rate": 2.3727994536920276e-06, + "loss": 0.0993, + "step": 9827 + }, + { + "epoch": 1.592352559948153, + "grad_norm": 0.8885847926139832, + "learning_rate": 2.3723627179528935e-06, + "loss": 0.1007, + "step": 9828 + }, + { + "epoch": 1.5925145819831497, + "grad_norm": 0.7419458031654358, + "learning_rate": 2.3719259861191506e-06, + "loss": 0.0938, + "step": 9829 + }, + { + "epoch": 1.5926766040181466, + "grad_norm": 0.7908516526222229, + "learning_rate": 2.371489258204163e-06, + "loss": 0.0994, + "step": 9830 + }, + { + "epoch": 1.592838626053143, + "grad_norm": 0.8385130763053894, + "learning_rate": 2.3710525342212925e-06, + "loss": 0.1031, + "step": 9831 + }, + { + "epoch": 1.59300064808814, + "grad_norm": 0.7185788750648499, + "learning_rate": 2.3706158141839025e-06, + "loss": 0.0933, + "step": 9832 + }, + { + "epoch": 1.5931626701231367, + "grad_norm": 0.7869822382926941, + "learning_rate": 2.3701790981053556e-06, + "loss": 0.0997, + "step": 9833 + }, + { + "epoch": 1.5933246921581334, + "grad_norm": 0.881678581237793, + "learning_rate": 2.3697423859990147e-06, + "loss": 0.1069, + "step": 9834 + }, + { + "epoch": 1.5934867141931304, + "grad_norm": 0.9156317710876465, + "learning_rate": 2.3693056778782407e-06, + "loss": 0.1083, + "step": 9835 + }, + { + "epoch": 1.5936487362281269, + "grad_norm": 0.9538189768791199, + "learning_rate": 2.3688689737563965e-06, + "loss": 0.0896, + "step": 9836 + }, + { + "epoch": 1.5938107582631238, + "grad_norm": 0.7372860312461853, + "learning_rate": 2.3684322736468457e-06, + "loss": 0.0993, + "step": 9837 + }, + { + "epoch": 1.5939727802981205, + "grad_norm": 0.8044807314872742, + "learning_rate": 2.367995577562948e-06, + "loss": 0.1033, + "step": 9838 + }, + { + "epoch": 1.5941348023331172, + "grad_norm": 0.8724144697189331, + "learning_rate": 2.3675588855180668e-06, + "loss": 0.1125, + "step": 9839 + }, + { + "epoch": 1.5942968243681142, + "grad_norm": 0.7166265249252319, + "learning_rate": 2.3671221975255616e-06, + "loss": 0.0858, + "step": 9840 + }, + { + "epoch": 1.5944588464031109, + "grad_norm": 0.9747242331504822, + "learning_rate": 2.3666855135987972e-06, + "loss": 0.1067, + "step": 9841 + }, + { + "epoch": 1.5946208684381076, + "grad_norm": 0.8712856769561768, + "learning_rate": 2.366248833751133e-06, + "loss": 0.1019, + "step": 9842 + }, + { + "epoch": 1.5947828904731045, + "grad_norm": 0.8115395903587341, + "learning_rate": 2.3658121579959314e-06, + "loss": 0.0982, + "step": 9843 + }, + { + "epoch": 1.594944912508101, + "grad_norm": 0.9539070129394531, + "learning_rate": 2.365375486346552e-06, + "loss": 0.1215, + "step": 9844 + }, + { + "epoch": 1.595106934543098, + "grad_norm": 0.870464026927948, + "learning_rate": 2.3649388188163572e-06, + "loss": 0.1112, + "step": 9845 + }, + { + "epoch": 1.5952689565780946, + "grad_norm": 0.7583725452423096, + "learning_rate": 2.3645021554187086e-06, + "loss": 0.0935, + "step": 9846 + }, + { + "epoch": 1.5954309786130914, + "grad_norm": 0.8613632321357727, + "learning_rate": 2.364065496166965e-06, + "loss": 0.1064, + "step": 9847 + }, + { + "epoch": 1.5955930006480883, + "grad_norm": 0.8895267844200134, + "learning_rate": 2.3636288410744894e-06, + "loss": 0.1075, + "step": 9848 + }, + { + "epoch": 1.5957550226830848, + "grad_norm": 0.7960240840911865, + "learning_rate": 2.36319219015464e-06, + "loss": 0.0979, + "step": 9849 + }, + { + "epoch": 1.5959170447180817, + "grad_norm": 0.7139129042625427, + "learning_rate": 2.3627555434207787e-06, + "loss": 0.0916, + "step": 9850 + }, + { + "epoch": 1.5960790667530784, + "grad_norm": 0.9061529636383057, + "learning_rate": 2.3623189008862664e-06, + "loss": 0.1055, + "step": 9851 + }, + { + "epoch": 1.5962410887880751, + "grad_norm": 0.8803797364234924, + "learning_rate": 2.3618822625644624e-06, + "loss": 0.1099, + "step": 9852 + }, + { + "epoch": 1.596403110823072, + "grad_norm": 1.062333106994629, + "learning_rate": 2.3614456284687267e-06, + "loss": 0.1194, + "step": 9853 + }, + { + "epoch": 1.5965651328580686, + "grad_norm": 0.7200803756713867, + "learning_rate": 2.36100899861242e-06, + "loss": 0.0902, + "step": 9854 + }, + { + "epoch": 1.5967271548930655, + "grad_norm": 0.7464329600334167, + "learning_rate": 2.3605723730089e-06, + "loss": 0.0964, + "step": 9855 + }, + { + "epoch": 1.5968891769280622, + "grad_norm": 0.743965744972229, + "learning_rate": 2.3601357516715297e-06, + "loss": 0.0935, + "step": 9856 + }, + { + "epoch": 1.597051198963059, + "grad_norm": 0.8007190227508545, + "learning_rate": 2.3596991346136666e-06, + "loss": 0.1018, + "step": 9857 + }, + { + "epoch": 1.5972132209980558, + "grad_norm": 0.7653955817222595, + "learning_rate": 2.35926252184867e-06, + "loss": 0.0963, + "step": 9858 + }, + { + "epoch": 1.5973752430330523, + "grad_norm": 0.9055894017219543, + "learning_rate": 2.3588259133898995e-06, + "loss": 0.1082, + "step": 9859 + }, + { + "epoch": 1.5975372650680493, + "grad_norm": 0.8475822806358337, + "learning_rate": 2.3583893092507144e-06, + "loss": 0.1045, + "step": 9860 + }, + { + "epoch": 1.597699287103046, + "grad_norm": 0.8215590119361877, + "learning_rate": 2.357952709444474e-06, + "loss": 0.1003, + "step": 9861 + }, + { + "epoch": 1.5978613091380427, + "grad_norm": 0.7612400650978088, + "learning_rate": 2.3575161139845375e-06, + "loss": 0.0869, + "step": 9862 + }, + { + "epoch": 1.5980233311730396, + "grad_norm": 0.7190806865692139, + "learning_rate": 2.357079522884263e-06, + "loss": 0.0876, + "step": 9863 + }, + { + "epoch": 1.5981853532080363, + "grad_norm": 0.8134793043136597, + "learning_rate": 2.356642936157008e-06, + "loss": 0.0957, + "step": 9864 + }, + { + "epoch": 1.598347375243033, + "grad_norm": 0.8630086779594421, + "learning_rate": 2.3562063538161332e-06, + "loss": 0.1002, + "step": 9865 + }, + { + "epoch": 1.5985093972780298, + "grad_norm": 0.7363705635070801, + "learning_rate": 2.3557697758749966e-06, + "loss": 0.0914, + "step": 9866 + }, + { + "epoch": 1.5986714193130265, + "grad_norm": 0.9773649573326111, + "learning_rate": 2.355333202346955e-06, + "loss": 0.1158, + "step": 9867 + }, + { + "epoch": 1.5988334413480234, + "grad_norm": 0.8616942167282104, + "learning_rate": 2.3548966332453673e-06, + "loss": 0.1058, + "step": 9868 + }, + { + "epoch": 1.5989954633830201, + "grad_norm": 0.78786700963974, + "learning_rate": 2.354460068583591e-06, + "loss": 0.1064, + "step": 9869 + }, + { + "epoch": 1.5991574854180168, + "grad_norm": 0.8264607787132263, + "learning_rate": 2.3540235083749853e-06, + "loss": 0.0996, + "step": 9870 + }, + { + "epoch": 1.5993195074530138, + "grad_norm": 0.8713717460632324, + "learning_rate": 2.3535869526329067e-06, + "loss": 0.0998, + "step": 9871 + }, + { + "epoch": 1.5994815294880103, + "grad_norm": 0.8369178175926208, + "learning_rate": 2.3531504013707134e-06, + "loss": 0.1026, + "step": 9872 + }, + { + "epoch": 1.5996435515230072, + "grad_norm": 0.8778187036514282, + "learning_rate": 2.3527138546017623e-06, + "loss": 0.1061, + "step": 9873 + }, + { + "epoch": 1.599805573558004, + "grad_norm": 0.7916249632835388, + "learning_rate": 2.35227731233941e-06, + "loss": 0.0948, + "step": 9874 + }, + { + "epoch": 1.5999675955930006, + "grad_norm": 0.8178213238716125, + "learning_rate": 2.3518407745970155e-06, + "loss": 0.1131, + "step": 9875 + }, + { + "epoch": 1.6001296176279975, + "grad_norm": 0.7969405055046082, + "learning_rate": 2.3514042413879344e-06, + "loss": 0.0874, + "step": 9876 + }, + { + "epoch": 1.600291639662994, + "grad_norm": 0.9230097532272339, + "learning_rate": 2.3509677127255233e-06, + "loss": 0.1043, + "step": 9877 + }, + { + "epoch": 1.600453661697991, + "grad_norm": 0.8090982437133789, + "learning_rate": 2.350531188623141e-06, + "loss": 0.0954, + "step": 9878 + }, + { + "epoch": 1.6006156837329877, + "grad_norm": 0.8020190000534058, + "learning_rate": 2.3500946690941407e-06, + "loss": 0.1007, + "step": 9879 + }, + { + "epoch": 1.6007777057679844, + "grad_norm": 0.8556835651397705, + "learning_rate": 2.349658154151882e-06, + "loss": 0.105, + "step": 9880 + }, + { + "epoch": 1.6009397278029813, + "grad_norm": 0.7550347447395325, + "learning_rate": 2.34922164380972e-06, + "loss": 0.0951, + "step": 9881 + }, + { + "epoch": 1.6011017498379778, + "grad_norm": 0.8687736988067627, + "learning_rate": 2.3487851380810106e-06, + "loss": 0.1147, + "step": 9882 + }, + { + "epoch": 1.6012637718729748, + "grad_norm": 0.8163692951202393, + "learning_rate": 2.3483486369791106e-06, + "loss": 0.0996, + "step": 9883 + }, + { + "epoch": 1.6014257939079715, + "grad_norm": 0.9547244310379028, + "learning_rate": 2.3479121405173736e-06, + "loss": 0.1209, + "step": 9884 + }, + { + "epoch": 1.6015878159429682, + "grad_norm": 0.7805865406990051, + "learning_rate": 2.3474756487091586e-06, + "loss": 0.1022, + "step": 9885 + }, + { + "epoch": 1.601749837977965, + "grad_norm": 0.7808001041412354, + "learning_rate": 2.347039161567819e-06, + "loss": 0.1002, + "step": 9886 + }, + { + "epoch": 1.6019118600129616, + "grad_norm": 0.8721002340316772, + "learning_rate": 2.346602679106712e-06, + "loss": 0.1171, + "step": 9887 + }, + { + "epoch": 1.6020738820479585, + "grad_norm": 0.8056678175926208, + "learning_rate": 2.34616620133919e-06, + "loss": 0.0971, + "step": 9888 + }, + { + "epoch": 1.6022359040829552, + "grad_norm": 0.7318929433822632, + "learning_rate": 2.345729728278611e-06, + "loss": 0.1024, + "step": 9889 + }, + { + "epoch": 1.602397926117952, + "grad_norm": 0.7396207451820374, + "learning_rate": 2.345293259938329e-06, + "loss": 0.0944, + "step": 9890 + }, + { + "epoch": 1.6025599481529489, + "grad_norm": 0.8219144344329834, + "learning_rate": 2.3448567963316987e-06, + "loss": 0.1008, + "step": 9891 + }, + { + "epoch": 1.6027219701879456, + "grad_norm": 0.7780699133872986, + "learning_rate": 2.3444203374720755e-06, + "loss": 0.0938, + "step": 9892 + }, + { + "epoch": 1.6028839922229423, + "grad_norm": 0.7455930709838867, + "learning_rate": 2.3439838833728122e-06, + "loss": 0.0948, + "step": 9893 + }, + { + "epoch": 1.6030460142579392, + "grad_norm": 0.8056654334068298, + "learning_rate": 2.3435474340472657e-06, + "loss": 0.098, + "step": 9894 + }, + { + "epoch": 1.6032080362929357, + "grad_norm": 0.7863870859146118, + "learning_rate": 2.3431109895087886e-06, + "loss": 0.1008, + "step": 9895 + }, + { + "epoch": 1.6033700583279327, + "grad_norm": 0.7881834506988525, + "learning_rate": 2.3426745497707364e-06, + "loss": 0.1083, + "step": 9896 + }, + { + "epoch": 1.6035320803629294, + "grad_norm": 0.8019961714744568, + "learning_rate": 2.3422381148464614e-06, + "loss": 0.0984, + "step": 9897 + }, + { + "epoch": 1.603694102397926, + "grad_norm": 0.8999032974243164, + "learning_rate": 2.341801684749318e-06, + "loss": 0.1112, + "step": 9898 + }, + { + "epoch": 1.603856124432923, + "grad_norm": 0.7355244755744934, + "learning_rate": 2.341365259492661e-06, + "loss": 0.0951, + "step": 9899 + }, + { + "epoch": 1.6040181464679195, + "grad_norm": 0.8507843017578125, + "learning_rate": 2.3409288390898427e-06, + "loss": 0.1107, + "step": 9900 + }, + { + "epoch": 1.6041801685029164, + "grad_norm": 0.7582950592041016, + "learning_rate": 2.3404924235542175e-06, + "loss": 0.0901, + "step": 9901 + }, + { + "epoch": 1.6043421905379132, + "grad_norm": 0.9285005927085876, + "learning_rate": 2.3400560128991377e-06, + "loss": 0.1125, + "step": 9902 + }, + { + "epoch": 1.6045042125729099, + "grad_norm": 0.7797400951385498, + "learning_rate": 2.3396196071379563e-06, + "loss": 0.1004, + "step": 9903 + }, + { + "epoch": 1.6046662346079068, + "grad_norm": 0.8635115623474121, + "learning_rate": 2.3391832062840273e-06, + "loss": 0.1106, + "step": 9904 + }, + { + "epoch": 1.6048282566429033, + "grad_norm": 0.7101535797119141, + "learning_rate": 2.3387468103507037e-06, + "loss": 0.0905, + "step": 9905 + }, + { + "epoch": 1.6049902786779002, + "grad_norm": 0.8763110041618347, + "learning_rate": 2.338310419351337e-06, + "loss": 0.1038, + "step": 9906 + }, + { + "epoch": 1.605152300712897, + "grad_norm": 0.6906706094741821, + "learning_rate": 2.3378740332992794e-06, + "loss": 0.0851, + "step": 9907 + }, + { + "epoch": 1.6053143227478937, + "grad_norm": 0.7445785403251648, + "learning_rate": 2.3374376522078852e-06, + "loss": 0.0896, + "step": 9908 + }, + { + "epoch": 1.6054763447828906, + "grad_norm": 0.9417932629585266, + "learning_rate": 2.337001276090505e-06, + "loss": 0.1059, + "step": 9909 + }, + { + "epoch": 1.605638366817887, + "grad_norm": 0.8038756847381592, + "learning_rate": 2.3365649049604917e-06, + "loss": 0.0981, + "step": 9910 + }, + { + "epoch": 1.605800388852884, + "grad_norm": 0.903634786605835, + "learning_rate": 2.3361285388311963e-06, + "loss": 0.1137, + "step": 9911 + }, + { + "epoch": 1.6059624108878807, + "grad_norm": 1.037934422492981, + "learning_rate": 2.3356921777159705e-06, + "loss": 0.1227, + "step": 9912 + }, + { + "epoch": 1.6061244329228774, + "grad_norm": 0.8287703990936279, + "learning_rate": 2.335255821628167e-06, + "loss": 0.0974, + "step": 9913 + }, + { + "epoch": 1.6062864549578744, + "grad_norm": 0.8133599758148193, + "learning_rate": 2.334819470581137e-06, + "loss": 0.0945, + "step": 9914 + }, + { + "epoch": 1.606448476992871, + "grad_norm": 0.8264515995979309, + "learning_rate": 2.3343831245882316e-06, + "loss": 0.108, + "step": 9915 + }, + { + "epoch": 1.6066104990278678, + "grad_norm": 0.7875205874443054, + "learning_rate": 2.3339467836628018e-06, + "loss": 0.0986, + "step": 9916 + }, + { + "epoch": 1.6067725210628645, + "grad_norm": 0.784372091293335, + "learning_rate": 2.333510447818198e-06, + "loss": 0.1, + "step": 9917 + }, + { + "epoch": 1.6069345430978612, + "grad_norm": 0.9532830715179443, + "learning_rate": 2.3330741170677713e-06, + "loss": 0.1321, + "step": 9918 + }, + { + "epoch": 1.6070965651328581, + "grad_norm": 0.8498433828353882, + "learning_rate": 2.3326377914248733e-06, + "loss": 0.1047, + "step": 9919 + }, + { + "epoch": 1.6072585871678549, + "grad_norm": 0.9842066168785095, + "learning_rate": 2.3322014709028545e-06, + "loss": 0.1192, + "step": 9920 + }, + { + "epoch": 1.6074206092028516, + "grad_norm": 0.6417335867881775, + "learning_rate": 2.3317651555150636e-06, + "loss": 0.0823, + "step": 9921 + }, + { + "epoch": 1.6075826312378485, + "grad_norm": 0.8965574502944946, + "learning_rate": 2.3313288452748515e-06, + "loss": 0.1156, + "step": 9922 + }, + { + "epoch": 1.607744653272845, + "grad_norm": 0.810722291469574, + "learning_rate": 2.3308925401955694e-06, + "loss": 0.0982, + "step": 9923 + }, + { + "epoch": 1.607906675307842, + "grad_norm": 0.8201885223388672, + "learning_rate": 2.3304562402905662e-06, + "loss": 0.1068, + "step": 9924 + }, + { + "epoch": 1.6080686973428386, + "grad_norm": 0.8500423431396484, + "learning_rate": 2.3300199455731922e-06, + "loss": 0.1053, + "step": 9925 + }, + { + "epoch": 1.6082307193778353, + "grad_norm": 0.8203928470611572, + "learning_rate": 2.329583656056796e-06, + "loss": 0.0957, + "step": 9926 + }, + { + "epoch": 1.6083927414128323, + "grad_norm": 0.7739825248718262, + "learning_rate": 2.329147371754727e-06, + "loss": 0.1001, + "step": 9927 + }, + { + "epoch": 1.6085547634478288, + "grad_norm": 0.8251372575759888, + "learning_rate": 2.3287110926803354e-06, + "loss": 0.1153, + "step": 9928 + }, + { + "epoch": 1.6087167854828257, + "grad_norm": 0.7434987425804138, + "learning_rate": 2.3282748188469704e-06, + "loss": 0.0985, + "step": 9929 + }, + { + "epoch": 1.6088788075178224, + "grad_norm": 1.1180413961410522, + "learning_rate": 2.32783855026798e-06, + "loss": 0.1098, + "step": 9930 + }, + { + "epoch": 1.6090408295528191, + "grad_norm": 0.8994811177253723, + "learning_rate": 2.3274022869567123e-06, + "loss": 0.1136, + "step": 9931 + }, + { + "epoch": 1.609202851587816, + "grad_norm": 0.8187955021858215, + "learning_rate": 2.3269660289265184e-06, + "loss": 0.1092, + "step": 9932 + }, + { + "epoch": 1.6093648736228126, + "grad_norm": 0.894491970539093, + "learning_rate": 2.3265297761907447e-06, + "loss": 0.1171, + "step": 9933 + }, + { + "epoch": 1.6095268956578095, + "grad_norm": 0.9280018210411072, + "learning_rate": 2.3260935287627408e-06, + "loss": 0.1186, + "step": 9934 + }, + { + "epoch": 1.6096889176928062, + "grad_norm": 0.9082358479499817, + "learning_rate": 2.3256572866558533e-06, + "loss": 0.1149, + "step": 9935 + }, + { + "epoch": 1.609850939727803, + "grad_norm": 0.8570298552513123, + "learning_rate": 2.3252210498834306e-06, + "loss": 0.1134, + "step": 9936 + }, + { + "epoch": 1.6100129617627998, + "grad_norm": 0.8263987898826599, + "learning_rate": 2.3247848184588208e-06, + "loss": 0.0962, + "step": 9937 + }, + { + "epoch": 1.6101749837977966, + "grad_norm": 0.8012087345123291, + "learning_rate": 2.3243485923953725e-06, + "loss": 0.1005, + "step": 9938 + }, + { + "epoch": 1.6103370058327933, + "grad_norm": 0.8303671479225159, + "learning_rate": 2.323912371706432e-06, + "loss": 0.1127, + "step": 9939 + }, + { + "epoch": 1.61049902786779, + "grad_norm": 0.9363792538642883, + "learning_rate": 2.323476156405347e-06, + "loss": 0.1046, + "step": 9940 + }, + { + "epoch": 1.6106610499027867, + "grad_norm": 0.8461704254150391, + "learning_rate": 2.323039946505463e-06, + "loss": 0.1076, + "step": 9941 + }, + { + "epoch": 1.6108230719377836, + "grad_norm": 0.7788037061691284, + "learning_rate": 2.3226037420201296e-06, + "loss": 0.0963, + "step": 9942 + }, + { + "epoch": 1.6109850939727803, + "grad_norm": 0.9676366448402405, + "learning_rate": 2.3221675429626925e-06, + "loss": 0.1045, + "step": 9943 + }, + { + "epoch": 1.611147116007777, + "grad_norm": 0.9113138318061829, + "learning_rate": 2.3217313493464977e-06, + "loss": 0.108, + "step": 9944 + }, + { + "epoch": 1.611309138042774, + "grad_norm": 0.6841544508934021, + "learning_rate": 2.3212951611848927e-06, + "loss": 0.0867, + "step": 9945 + }, + { + "epoch": 1.6114711600777705, + "grad_norm": 0.7147634625434875, + "learning_rate": 2.320858978491222e-06, + "loss": 0.0938, + "step": 9946 + }, + { + "epoch": 1.6116331821127674, + "grad_norm": 0.8967075347900391, + "learning_rate": 2.3204228012788346e-06, + "loss": 0.1185, + "step": 9947 + }, + { + "epoch": 1.6117952041477641, + "grad_norm": 0.7635133266448975, + "learning_rate": 2.319986629561074e-06, + "loss": 0.0941, + "step": 9948 + }, + { + "epoch": 1.6119572261827608, + "grad_norm": 0.9004403352737427, + "learning_rate": 2.319550463351288e-06, + "loss": 0.1137, + "step": 9949 + }, + { + "epoch": 1.6121192482177578, + "grad_norm": 0.7615978121757507, + "learning_rate": 2.3191143026628206e-06, + "loss": 0.0906, + "step": 9950 + }, + { + "epoch": 1.6122812702527543, + "grad_norm": 0.8137632608413696, + "learning_rate": 2.3186781475090168e-06, + "loss": 0.0992, + "step": 9951 + }, + { + "epoch": 1.6124432922877512, + "grad_norm": 0.8467097282409668, + "learning_rate": 2.318241997903224e-06, + "loss": 0.1082, + "step": 9952 + }, + { + "epoch": 1.612605314322748, + "grad_norm": 0.8358951210975647, + "learning_rate": 2.317805853858786e-06, + "loss": 0.1054, + "step": 9953 + }, + { + "epoch": 1.6127673363577446, + "grad_norm": 0.8378834128379822, + "learning_rate": 2.3173697153890486e-06, + "loss": 0.1095, + "step": 9954 + }, + { + "epoch": 1.6129293583927415, + "grad_norm": 0.7304298281669617, + "learning_rate": 2.316933582507354e-06, + "loss": 0.0923, + "step": 9955 + }, + { + "epoch": 1.613091380427738, + "grad_norm": 0.7708770632743835, + "learning_rate": 2.31649745522705e-06, + "loss": 0.0938, + "step": 9956 + }, + { + "epoch": 1.613253402462735, + "grad_norm": 0.8786473870277405, + "learning_rate": 2.31606133356148e-06, + "loss": 0.111, + "step": 9957 + }, + { + "epoch": 1.6134154244977317, + "grad_norm": 0.724189281463623, + "learning_rate": 2.3156252175239883e-06, + "loss": 0.0913, + "step": 9958 + }, + { + "epoch": 1.6135774465327284, + "grad_norm": 0.6984865665435791, + "learning_rate": 2.3151891071279183e-06, + "loss": 0.0833, + "step": 9959 + }, + { + "epoch": 1.6137394685677253, + "grad_norm": 0.833328902721405, + "learning_rate": 2.3147530023866136e-06, + "loss": 0.1095, + "step": 9960 + }, + { + "epoch": 1.6139014906027218, + "grad_norm": 0.8859132528305054, + "learning_rate": 2.3143169033134204e-06, + "loss": 0.1241, + "step": 9961 + }, + { + "epoch": 1.6140635126377187, + "grad_norm": 0.8652672171592712, + "learning_rate": 2.3138808099216796e-06, + "loss": 0.1024, + "step": 9962 + }, + { + "epoch": 1.6142255346727155, + "grad_norm": 0.7666911482810974, + "learning_rate": 2.313444722224736e-06, + "loss": 0.0994, + "step": 9963 + }, + { + "epoch": 1.6143875567077122, + "grad_norm": 0.6945036053657532, + "learning_rate": 2.3130086402359327e-06, + "loss": 0.0851, + "step": 9964 + }, + { + "epoch": 1.614549578742709, + "grad_norm": 0.8811757564544678, + "learning_rate": 2.3125725639686116e-06, + "loss": 0.11, + "step": 9965 + }, + { + "epoch": 1.6147116007777058, + "grad_norm": 0.7738437056541443, + "learning_rate": 2.312136493436117e-06, + "loss": 0.0957, + "step": 9966 + }, + { + "epoch": 1.6148736228127025, + "grad_norm": 0.8249509334564209, + "learning_rate": 2.311700428651791e-06, + "loss": 0.1109, + "step": 9967 + }, + { + "epoch": 1.6150356448476992, + "grad_norm": 0.8863063454627991, + "learning_rate": 2.311264369628976e-06, + "loss": 0.1033, + "step": 9968 + }, + { + "epoch": 1.615197666882696, + "grad_norm": 0.7870522141456604, + "learning_rate": 2.3108283163810155e-06, + "loss": 0.0972, + "step": 9969 + }, + { + "epoch": 1.6153596889176929, + "grad_norm": 0.8956674337387085, + "learning_rate": 2.3103922689212494e-06, + "loss": 0.1076, + "step": 9970 + }, + { + "epoch": 1.6155217109526896, + "grad_norm": 0.7711880207061768, + "learning_rate": 2.3099562272630216e-06, + "loss": 0.0966, + "step": 9971 + }, + { + "epoch": 1.6156837329876863, + "grad_norm": 0.8216359615325928, + "learning_rate": 2.3095201914196732e-06, + "loss": 0.1039, + "step": 9972 + }, + { + "epoch": 1.6158457550226832, + "grad_norm": 0.7300341725349426, + "learning_rate": 2.309084161404547e-06, + "loss": 0.0933, + "step": 9973 + }, + { + "epoch": 1.6160077770576797, + "grad_norm": 0.783735454082489, + "learning_rate": 2.308648137230982e-06, + "loss": 0.0987, + "step": 9974 + }, + { + "epoch": 1.6161697990926767, + "grad_norm": 0.8075628876686096, + "learning_rate": 2.3082121189123213e-06, + "loss": 0.1045, + "step": 9975 + }, + { + "epoch": 1.6163318211276734, + "grad_norm": 0.8673627972602844, + "learning_rate": 2.3077761064619062e-06, + "loss": 0.1028, + "step": 9976 + }, + { + "epoch": 1.61649384316267, + "grad_norm": 0.8210455179214478, + "learning_rate": 2.307340099893077e-06, + "loss": 0.1024, + "step": 9977 + }, + { + "epoch": 1.616655865197667, + "grad_norm": 0.803597092628479, + "learning_rate": 2.3069040992191745e-06, + "loss": 0.0922, + "step": 9978 + }, + { + "epoch": 1.6168178872326635, + "grad_norm": 0.8294203877449036, + "learning_rate": 2.3064681044535385e-06, + "loss": 0.1002, + "step": 9979 + }, + { + "epoch": 1.6169799092676604, + "grad_norm": 0.7242622971534729, + "learning_rate": 2.3060321156095107e-06, + "loss": 0.0904, + "step": 9980 + }, + { + "epoch": 1.6171419313026572, + "grad_norm": 0.6747761964797974, + "learning_rate": 2.305596132700431e-06, + "loss": 0.0841, + "step": 9981 + }, + { + "epoch": 1.6173039533376539, + "grad_norm": 0.7614217400550842, + "learning_rate": 2.3051601557396393e-06, + "loss": 0.0929, + "step": 9982 + }, + { + "epoch": 1.6174659753726508, + "grad_norm": 0.8482229113578796, + "learning_rate": 2.304724184740475e-06, + "loss": 0.1065, + "step": 9983 + }, + { + "epoch": 1.6176279974076473, + "grad_norm": 0.8954651951789856, + "learning_rate": 2.3042882197162776e-06, + "loss": 0.1151, + "step": 9984 + }, + { + "epoch": 1.6177900194426442, + "grad_norm": 0.9330970048904419, + "learning_rate": 2.3038522606803882e-06, + "loss": 0.1117, + "step": 9985 + }, + { + "epoch": 1.617952041477641, + "grad_norm": 0.8899834156036377, + "learning_rate": 2.303416307646144e-06, + "loss": 0.106, + "step": 9986 + }, + { + "epoch": 1.6181140635126376, + "grad_norm": 0.8274690508842468, + "learning_rate": 2.302980360626886e-06, + "loss": 0.1017, + "step": 9987 + }, + { + "epoch": 1.6182760855476346, + "grad_norm": 0.8075361251831055, + "learning_rate": 2.3025444196359513e-06, + "loss": 0.1055, + "step": 9988 + }, + { + "epoch": 1.6184381075826313, + "grad_norm": 0.8113299012184143, + "learning_rate": 2.3021084846866793e-06, + "loss": 0.1039, + "step": 9989 + }, + { + "epoch": 1.618600129617628, + "grad_norm": 0.7555704712867737, + "learning_rate": 2.3016725557924095e-06, + "loss": 0.0869, + "step": 9990 + }, + { + "epoch": 1.6187621516526247, + "grad_norm": 0.8604114055633545, + "learning_rate": 2.3012366329664794e-06, + "loss": 0.109, + "step": 9991 + }, + { + "epoch": 1.6189241736876214, + "grad_norm": 0.8491653800010681, + "learning_rate": 2.3008007162222273e-06, + "loss": 0.1071, + "step": 9992 + }, + { + "epoch": 1.6190861957226184, + "grad_norm": 0.7831771969795227, + "learning_rate": 2.3003648055729917e-06, + "loss": 0.0946, + "step": 9993 + }, + { + "epoch": 1.619248217757615, + "grad_norm": 0.8524945378303528, + "learning_rate": 2.2999289010321092e-06, + "loss": 0.1057, + "step": 9994 + }, + { + "epoch": 1.6194102397926118, + "grad_norm": 0.7014240622520447, + "learning_rate": 2.299493002612918e-06, + "loss": 0.092, + "step": 9995 + }, + { + "epoch": 1.6195722618276087, + "grad_norm": 0.8205536007881165, + "learning_rate": 2.299057110328757e-06, + "loss": 0.1018, + "step": 9996 + }, + { + "epoch": 1.6197342838626052, + "grad_norm": 0.810691773891449, + "learning_rate": 2.298621224192961e-06, + "loss": 0.0999, + "step": 9997 + }, + { + "epoch": 1.6198963058976021, + "grad_norm": 0.7380390763282776, + "learning_rate": 2.298185344218868e-06, + "loss": 0.0948, + "step": 9998 + }, + { + "epoch": 1.6200583279325989, + "grad_norm": 0.8573938012123108, + "learning_rate": 2.297749470419815e-06, + "loss": 0.0986, + "step": 9999 + }, + { + "epoch": 1.6202203499675956, + "grad_norm": 0.749604344367981, + "learning_rate": 2.29731360280914e-06, + "loss": 0.1046, + "step": 10000 + }, + { + "epoch": 1.6203823720025925, + "grad_norm": 0.8711916208267212, + "learning_rate": 2.2968777414001773e-06, + "loss": 0.1106, + "step": 10001 + }, + { + "epoch": 1.620544394037589, + "grad_norm": 0.9774971008300781, + "learning_rate": 2.2964418862062655e-06, + "loss": 0.1258, + "step": 10002 + }, + { + "epoch": 1.620706416072586, + "grad_norm": 0.9113691449165344, + "learning_rate": 2.2960060372407377e-06, + "loss": 0.1038, + "step": 10003 + }, + { + "epoch": 1.6208684381075826, + "grad_norm": 0.7974822521209717, + "learning_rate": 2.2955701945169317e-06, + "loss": 0.1024, + "step": 10004 + }, + { + "epoch": 1.6210304601425793, + "grad_norm": 0.7355794310569763, + "learning_rate": 2.295134358048184e-06, + "loss": 0.093, + "step": 10005 + }, + { + "epoch": 1.6211924821775763, + "grad_norm": 0.8737595677375793, + "learning_rate": 2.294698527847829e-06, + "loss": 0.1173, + "step": 10006 + }, + { + "epoch": 1.6213545042125728, + "grad_norm": 0.8428897261619568, + "learning_rate": 2.2942627039292016e-06, + "loss": 0.1065, + "step": 10007 + }, + { + "epoch": 1.6215165262475697, + "grad_norm": 0.7106808423995972, + "learning_rate": 2.2938268863056373e-06, + "loss": 0.0899, + "step": 10008 + }, + { + "epoch": 1.6216785482825664, + "grad_norm": 0.989214301109314, + "learning_rate": 2.2933910749904724e-06, + "loss": 0.1183, + "step": 10009 + }, + { + "epoch": 1.6218405703175631, + "grad_norm": 0.7430517077445984, + "learning_rate": 2.29295526999704e-06, + "loss": 0.0899, + "step": 10010 + }, + { + "epoch": 1.62200259235256, + "grad_norm": 0.9804396033287048, + "learning_rate": 2.292519471338676e-06, + "loss": 0.1184, + "step": 10011 + }, + { + "epoch": 1.6221646143875565, + "grad_norm": 0.7713508009910583, + "learning_rate": 2.2920836790287134e-06, + "loss": 0.0946, + "step": 10012 + }, + { + "epoch": 1.6223266364225535, + "grad_norm": 0.7687755227088928, + "learning_rate": 2.2916478930804865e-06, + "loss": 0.1005, + "step": 10013 + }, + { + "epoch": 1.6224886584575502, + "grad_norm": 0.7584219574928284, + "learning_rate": 2.291212113507331e-06, + "loss": 0.0925, + "step": 10014 + }, + { + "epoch": 1.622650680492547, + "grad_norm": 0.6680698394775391, + "learning_rate": 2.2907763403225793e-06, + "loss": 0.0855, + "step": 10015 + }, + { + "epoch": 1.6228127025275438, + "grad_norm": 0.7672422528266907, + "learning_rate": 2.290340573539565e-06, + "loss": 0.0994, + "step": 10016 + }, + { + "epoch": 1.6229747245625405, + "grad_norm": 0.8922812938690186, + "learning_rate": 2.2899048131716223e-06, + "loss": 0.1152, + "step": 10017 + }, + { + "epoch": 1.6231367465975373, + "grad_norm": 0.8483216166496277, + "learning_rate": 2.2894690592320827e-06, + "loss": 0.1128, + "step": 10018 + }, + { + "epoch": 1.623298768632534, + "grad_norm": 0.6468700766563416, + "learning_rate": 2.2890333117342813e-06, + "loss": 0.086, + "step": 10019 + }, + { + "epoch": 1.6234607906675307, + "grad_norm": 0.7255584001541138, + "learning_rate": 2.2885975706915506e-06, + "loss": 0.0884, + "step": 10020 + }, + { + "epoch": 1.6236228127025276, + "grad_norm": 0.8776664137840271, + "learning_rate": 2.288161836117222e-06, + "loss": 0.1049, + "step": 10021 + }, + { + "epoch": 1.6237848347375243, + "grad_norm": 0.750275731086731, + "learning_rate": 2.287726108024628e-06, + "loss": 0.0964, + "step": 10022 + }, + { + "epoch": 1.623946856772521, + "grad_norm": 0.9269810318946838, + "learning_rate": 2.2872903864271017e-06, + "loss": 0.112, + "step": 10023 + }, + { + "epoch": 1.624108878807518, + "grad_norm": 0.8907431364059448, + "learning_rate": 2.2868546713379755e-06, + "loss": 0.1046, + "step": 10024 + }, + { + "epoch": 1.6242709008425145, + "grad_norm": 0.8910091519355774, + "learning_rate": 2.28641896277058e-06, + "loss": 0.0983, + "step": 10025 + }, + { + "epoch": 1.6244329228775114, + "grad_norm": 0.8178685307502747, + "learning_rate": 2.285983260738248e-06, + "loss": 0.1033, + "step": 10026 + }, + { + "epoch": 1.624594944912508, + "grad_norm": 0.8685728907585144, + "learning_rate": 2.2855475652543094e-06, + "loss": 0.1143, + "step": 10027 + }, + { + "epoch": 1.6247569669475048, + "grad_norm": 0.996419370174408, + "learning_rate": 2.285111876332097e-06, + "loss": 0.111, + "step": 10028 + }, + { + "epoch": 1.6249189889825018, + "grad_norm": 0.9297820925712585, + "learning_rate": 2.284676193984941e-06, + "loss": 0.1146, + "step": 10029 + }, + { + "epoch": 1.6250810110174982, + "grad_norm": 0.7859435677528381, + "learning_rate": 2.2842405182261725e-06, + "loss": 0.0975, + "step": 10030 + }, + { + "epoch": 1.6252430330524952, + "grad_norm": 0.8805785775184631, + "learning_rate": 2.2838048490691223e-06, + "loss": 0.104, + "step": 10031 + }, + { + "epoch": 1.625405055087492, + "grad_norm": 0.8301118612289429, + "learning_rate": 2.283369186527119e-06, + "loss": 0.1087, + "step": 10032 + }, + { + "epoch": 1.6255670771224886, + "grad_norm": 0.7605770230293274, + "learning_rate": 2.282933530613496e-06, + "loss": 0.0949, + "step": 10033 + }, + { + "epoch": 1.6257290991574855, + "grad_norm": 0.8895880579948425, + "learning_rate": 2.282497881341581e-06, + "loss": 0.1106, + "step": 10034 + }, + { + "epoch": 1.625891121192482, + "grad_norm": 0.8832073211669922, + "learning_rate": 2.282062238724705e-06, + "loss": 0.1151, + "step": 10035 + }, + { + "epoch": 1.626053143227479, + "grad_norm": 0.6511034965515137, + "learning_rate": 2.2816266027761965e-06, + "loss": 0.0846, + "step": 10036 + }, + { + "epoch": 1.6262151652624757, + "grad_norm": 0.7454072833061218, + "learning_rate": 2.2811909735093853e-06, + "loss": 0.0911, + "step": 10037 + }, + { + "epoch": 1.6263771872974724, + "grad_norm": 0.9449716806411743, + "learning_rate": 2.280755350937602e-06, + "loss": 0.1066, + "step": 10038 + }, + { + "epoch": 1.6265392093324693, + "grad_norm": 0.7572311162948608, + "learning_rate": 2.280319735074173e-06, + "loss": 0.0937, + "step": 10039 + }, + { + "epoch": 1.626701231367466, + "grad_norm": 0.8121317028999329, + "learning_rate": 2.27988412593243e-06, + "loss": 0.1019, + "step": 10040 + }, + { + "epoch": 1.6268632534024627, + "grad_norm": 0.8382424712181091, + "learning_rate": 2.279448523525699e-06, + "loss": 0.1101, + "step": 10041 + }, + { + "epoch": 1.6270252754374595, + "grad_norm": 0.7908496856689453, + "learning_rate": 2.279012927867309e-06, + "loss": 0.0963, + "step": 10042 + }, + { + "epoch": 1.6271872974724562, + "grad_norm": 0.8468271493911743, + "learning_rate": 2.278577338970589e-06, + "loss": 0.1045, + "step": 10043 + }, + { + "epoch": 1.627349319507453, + "grad_norm": 0.8646363019943237, + "learning_rate": 2.2781417568488677e-06, + "loss": 0.1113, + "step": 10044 + }, + { + "epoch": 1.6275113415424498, + "grad_norm": 0.8169398307800293, + "learning_rate": 2.2777061815154705e-06, + "loss": 0.1056, + "step": 10045 + }, + { + "epoch": 1.6276733635774465, + "grad_norm": 0.8818169832229614, + "learning_rate": 2.277270612983726e-06, + "loss": 0.1123, + "step": 10046 + }, + { + "epoch": 1.6278353856124435, + "grad_norm": 0.8572877049446106, + "learning_rate": 2.276835051266963e-06, + "loss": 0.0976, + "step": 10047 + }, + { + "epoch": 1.62799740764744, + "grad_norm": 0.8052610754966736, + "learning_rate": 2.2763994963785066e-06, + "loss": 0.1041, + "step": 10048 + }, + { + "epoch": 1.6281594296824369, + "grad_norm": 0.7382464408874512, + "learning_rate": 2.275963948331685e-06, + "loss": 0.0905, + "step": 10049 + }, + { + "epoch": 1.6283214517174336, + "grad_norm": 0.7890332341194153, + "learning_rate": 2.2755284071398243e-06, + "loss": 0.1029, + "step": 10050 + }, + { + "epoch": 1.6284834737524303, + "grad_norm": 0.794537365436554, + "learning_rate": 2.27509287281625e-06, + "loss": 0.0951, + "step": 10051 + }, + { + "epoch": 1.6286454957874272, + "grad_norm": 0.7802528738975525, + "learning_rate": 2.2746573453742905e-06, + "loss": 0.0916, + "step": 10052 + }, + { + "epoch": 1.6288075178224237, + "grad_norm": 0.8495976328849792, + "learning_rate": 2.2742218248272714e-06, + "loss": 0.1044, + "step": 10053 + }, + { + "epoch": 1.6289695398574207, + "grad_norm": 0.8333568572998047, + "learning_rate": 2.2737863111885175e-06, + "loss": 0.1017, + "step": 10054 + }, + { + "epoch": 1.6291315618924174, + "grad_norm": 0.7988643646240234, + "learning_rate": 2.273350804471355e-06, + "loss": 0.1004, + "step": 10055 + }, + { + "epoch": 1.629293583927414, + "grad_norm": 0.8200765252113342, + "learning_rate": 2.2729153046891095e-06, + "loss": 0.1005, + "step": 10056 + }, + { + "epoch": 1.629455605962411, + "grad_norm": 0.7785143852233887, + "learning_rate": 2.272479811855106e-06, + "loss": 0.0918, + "step": 10057 + }, + { + "epoch": 1.6296176279974075, + "grad_norm": 0.7829576730728149, + "learning_rate": 2.2720443259826702e-06, + "loss": 0.0891, + "step": 10058 + }, + { + "epoch": 1.6297796500324044, + "grad_norm": 0.8104037046432495, + "learning_rate": 2.271608847085126e-06, + "loss": 0.0981, + "step": 10059 + }, + { + "epoch": 1.6299416720674011, + "grad_norm": 0.9911179542541504, + "learning_rate": 2.2711733751757983e-06, + "loss": 0.1145, + "step": 10060 + }, + { + "epoch": 1.6301036941023979, + "grad_norm": 0.7678511142730713, + "learning_rate": 2.270737910268011e-06, + "loss": 0.0952, + "step": 10061 + }, + { + "epoch": 1.6302657161373948, + "grad_norm": 0.9023809432983398, + "learning_rate": 2.27030245237509e-06, + "loss": 0.0994, + "step": 10062 + }, + { + "epoch": 1.6304277381723913, + "grad_norm": 0.8956974148750305, + "learning_rate": 2.2698670015103574e-06, + "loss": 0.1128, + "step": 10063 + }, + { + "epoch": 1.6305897602073882, + "grad_norm": 0.8878682255744934, + "learning_rate": 2.2694315576871384e-06, + "loss": 0.0983, + "step": 10064 + }, + { + "epoch": 1.630751782242385, + "grad_norm": 0.7095435857772827, + "learning_rate": 2.2689961209187543e-06, + "loss": 0.0903, + "step": 10065 + }, + { + "epoch": 1.6309138042773816, + "grad_norm": 0.8130254149436951, + "learning_rate": 2.268560691218531e-06, + "loss": 0.0988, + "step": 10066 + }, + { + "epoch": 1.6310758263123786, + "grad_norm": 0.8411709070205688, + "learning_rate": 2.26812526859979e-06, + "loss": 0.1038, + "step": 10067 + }, + { + "epoch": 1.6312378483473753, + "grad_norm": 0.9909535050392151, + "learning_rate": 2.2676898530758554e-06, + "loss": 0.12, + "step": 10068 + }, + { + "epoch": 1.631399870382372, + "grad_norm": 0.8369352221488953, + "learning_rate": 2.2672544446600485e-06, + "loss": 0.0975, + "step": 10069 + }, + { + "epoch": 1.6315618924173687, + "grad_norm": 0.8436316847801208, + "learning_rate": 2.266819043365692e-06, + "loss": 0.102, + "step": 10070 + }, + { + "epoch": 1.6317239144523654, + "grad_norm": 0.7915019392967224, + "learning_rate": 2.2663836492061097e-06, + "loss": 0.1006, + "step": 10071 + }, + { + "epoch": 1.6318859364873624, + "grad_norm": 0.6947129964828491, + "learning_rate": 2.265948262194621e-06, + "loss": 0.0943, + "step": 10072 + }, + { + "epoch": 1.632047958522359, + "grad_norm": 0.738064169883728, + "learning_rate": 2.2655128823445507e-06, + "loss": 0.0989, + "step": 10073 + }, + { + "epoch": 1.6322099805573558, + "grad_norm": 0.9031091928482056, + "learning_rate": 2.2650775096692176e-06, + "loss": 0.1185, + "step": 10074 + }, + { + "epoch": 1.6323720025923527, + "grad_norm": 0.736219048500061, + "learning_rate": 2.264642144181944e-06, + "loss": 0.0998, + "step": 10075 + }, + { + "epoch": 1.6325340246273492, + "grad_norm": 0.7197458744049072, + "learning_rate": 2.2642067858960514e-06, + "loss": 0.0991, + "step": 10076 + }, + { + "epoch": 1.6326960466623461, + "grad_norm": 0.8605630993843079, + "learning_rate": 2.263771434824861e-06, + "loss": 0.1069, + "step": 10077 + }, + { + "epoch": 1.6328580686973428, + "grad_norm": 0.7979494333267212, + "learning_rate": 2.263336090981693e-06, + "loss": 0.0967, + "step": 10078 + }, + { + "epoch": 1.6330200907323396, + "grad_norm": 0.8447449207305908, + "learning_rate": 2.262900754379868e-06, + "loss": 0.099, + "step": 10079 + }, + { + "epoch": 1.6331821127673365, + "grad_norm": 0.9383016228675842, + "learning_rate": 2.2624654250327054e-06, + "loss": 0.1159, + "step": 10080 + }, + { + "epoch": 1.633344134802333, + "grad_norm": 0.8210467100143433, + "learning_rate": 2.2620301029535264e-06, + "loss": 0.1078, + "step": 10081 + }, + { + "epoch": 1.63350615683733, + "grad_norm": 0.8023772239685059, + "learning_rate": 2.2615947881556506e-06, + "loss": 0.101, + "step": 10082 + }, + { + "epoch": 1.6336681788723266, + "grad_norm": 0.8335037231445312, + "learning_rate": 2.2611594806523975e-06, + "loss": 0.103, + "step": 10083 + }, + { + "epoch": 1.6338302009073233, + "grad_norm": 0.7512911558151245, + "learning_rate": 2.2607241804570864e-06, + "loss": 0.0998, + "step": 10084 + }, + { + "epoch": 1.6339922229423203, + "grad_norm": 0.8822583556175232, + "learning_rate": 2.2602888875830346e-06, + "loss": 0.1034, + "step": 10085 + }, + { + "epoch": 1.6341542449773168, + "grad_norm": 0.830302357673645, + "learning_rate": 2.2598536020435644e-06, + "loss": 0.1027, + "step": 10086 + }, + { + "epoch": 1.6343162670123137, + "grad_norm": 0.7789014577865601, + "learning_rate": 2.2594183238519923e-06, + "loss": 0.0965, + "step": 10087 + }, + { + "epoch": 1.6344782890473104, + "grad_norm": 0.7837060689926147, + "learning_rate": 2.258983053021638e-06, + "loss": 0.1004, + "step": 10088 + }, + { + "epoch": 1.6346403110823071, + "grad_norm": 0.8886387348175049, + "learning_rate": 2.258547789565818e-06, + "loss": 0.1169, + "step": 10089 + }, + { + "epoch": 1.634802333117304, + "grad_norm": 0.9173817038536072, + "learning_rate": 2.2581125334978517e-06, + "loss": 0.1102, + "step": 10090 + }, + { + "epoch": 1.6349643551523008, + "grad_norm": 0.8326734304428101, + "learning_rate": 2.2576772848310572e-06, + "loss": 0.1016, + "step": 10091 + }, + { + "epoch": 1.6351263771872975, + "grad_norm": 0.7596706748008728, + "learning_rate": 2.257242043578751e-06, + "loss": 0.0957, + "step": 10092 + }, + { + "epoch": 1.6352883992222942, + "grad_norm": 0.7185407876968384, + "learning_rate": 2.256806809754251e-06, + "loss": 0.084, + "step": 10093 + }, + { + "epoch": 1.635450421257291, + "grad_norm": 0.8314585089683533, + "learning_rate": 2.2563715833708726e-06, + "loss": 0.1079, + "step": 10094 + }, + { + "epoch": 1.6356124432922878, + "grad_norm": 0.804526686668396, + "learning_rate": 2.2559363644419357e-06, + "loss": 0.1031, + "step": 10095 + }, + { + "epoch": 1.6357744653272845, + "grad_norm": 0.7893649935722351, + "learning_rate": 2.255501152980755e-06, + "loss": 0.0935, + "step": 10096 + }, + { + "epoch": 1.6359364873622813, + "grad_norm": 0.9371634721755981, + "learning_rate": 2.255065949000648e-06, + "loss": 0.1163, + "step": 10097 + }, + { + "epoch": 1.6360985093972782, + "grad_norm": 0.8292739987373352, + "learning_rate": 2.2546307525149293e-06, + "loss": 0.1, + "step": 10098 + }, + { + "epoch": 1.6362605314322747, + "grad_norm": 0.7563901543617249, + "learning_rate": 2.2541955635369156e-06, + "loss": 0.0997, + "step": 10099 + }, + { + "epoch": 1.6364225534672716, + "grad_norm": 0.8337042927742004, + "learning_rate": 2.253760382079924e-06, + "loss": 0.1052, + "step": 10100 + }, + { + "epoch": 1.6365845755022683, + "grad_norm": 0.7527819275856018, + "learning_rate": 2.253325208157268e-06, + "loss": 0.0978, + "step": 10101 + }, + { + "epoch": 1.636746597537265, + "grad_norm": 0.8317774534225464, + "learning_rate": 2.2528900417822636e-06, + "loss": 0.1005, + "step": 10102 + }, + { + "epoch": 1.636908619572262, + "grad_norm": 0.8075408935546875, + "learning_rate": 2.252454882968227e-06, + "loss": 0.1078, + "step": 10103 + }, + { + "epoch": 1.6370706416072585, + "grad_norm": 0.9008114337921143, + "learning_rate": 2.2520197317284702e-06, + "loss": 0.1172, + "step": 10104 + }, + { + "epoch": 1.6372326636422554, + "grad_norm": 0.8030856251716614, + "learning_rate": 2.2515845880763102e-06, + "loss": 0.099, + "step": 10105 + }, + { + "epoch": 1.637394685677252, + "grad_norm": 0.8751403093338013, + "learning_rate": 2.2511494520250613e-06, + "loss": 0.1087, + "step": 10106 + }, + { + "epoch": 1.6375567077122488, + "grad_norm": 0.8014907836914062, + "learning_rate": 2.2507143235880364e-06, + "loss": 0.0864, + "step": 10107 + }, + { + "epoch": 1.6377187297472457, + "grad_norm": 0.7244542241096497, + "learning_rate": 2.2502792027785508e-06, + "loss": 0.0942, + "step": 10108 + }, + { + "epoch": 1.6378807517822422, + "grad_norm": 0.9491717219352722, + "learning_rate": 2.249844089609916e-06, + "loss": 0.119, + "step": 10109 + }, + { + "epoch": 1.6380427738172392, + "grad_norm": 0.730972170829773, + "learning_rate": 2.249408984095447e-06, + "loss": 0.0864, + "step": 10110 + }, + { + "epoch": 1.6382047958522359, + "grad_norm": 0.7497561573982239, + "learning_rate": 2.248973886248457e-06, + "loss": 0.1, + "step": 10111 + }, + { + "epoch": 1.6383668178872326, + "grad_norm": 0.8914108276367188, + "learning_rate": 2.248538796082259e-06, + "loss": 0.1044, + "step": 10112 + }, + { + "epoch": 1.6385288399222295, + "grad_norm": 0.8104687929153442, + "learning_rate": 2.248103713610164e-06, + "loss": 0.104, + "step": 10113 + }, + { + "epoch": 1.638690861957226, + "grad_norm": 0.851868748664856, + "learning_rate": 2.2476686388454867e-06, + "loss": 0.1021, + "step": 10114 + }, + { + "epoch": 1.638852883992223, + "grad_norm": 0.8389064073562622, + "learning_rate": 2.247233571801539e-06, + "loss": 0.1069, + "step": 10115 + }, + { + "epoch": 1.6390149060272197, + "grad_norm": 0.7837998867034912, + "learning_rate": 2.2467985124916314e-06, + "loss": 0.0959, + "step": 10116 + }, + { + "epoch": 1.6391769280622164, + "grad_norm": 0.7479491233825684, + "learning_rate": 2.2463634609290776e-06, + "loss": 0.1026, + "step": 10117 + }, + { + "epoch": 1.6393389500972133, + "grad_norm": 0.7724059820175171, + "learning_rate": 2.2459284171271863e-06, + "loss": 0.1027, + "step": 10118 + }, + { + "epoch": 1.63950097213221, + "grad_norm": 0.7824938893318176, + "learning_rate": 2.245493381099272e-06, + "loss": 0.1015, + "step": 10119 + }, + { + "epoch": 1.6396629941672067, + "grad_norm": 0.8370165824890137, + "learning_rate": 2.2450583528586437e-06, + "loss": 0.112, + "step": 10120 + }, + { + "epoch": 1.6398250162022034, + "grad_norm": 0.8152546286582947, + "learning_rate": 2.244623332418614e-06, + "loss": 0.1056, + "step": 10121 + }, + { + "epoch": 1.6399870382372002, + "grad_norm": 0.7992122173309326, + "learning_rate": 2.244188319792491e-06, + "loss": 0.0924, + "step": 10122 + }, + { + "epoch": 1.640149060272197, + "grad_norm": 0.7578029036521912, + "learning_rate": 2.243753314993586e-06, + "loss": 0.0947, + "step": 10123 + }, + { + "epoch": 1.6403110823071938, + "grad_norm": 0.8097305297851562, + "learning_rate": 2.243318318035211e-06, + "loss": 0.1012, + "step": 10124 + }, + { + "epoch": 1.6404731043421905, + "grad_norm": 0.7099080681800842, + "learning_rate": 2.2428833289306735e-06, + "loss": 0.0909, + "step": 10125 + }, + { + "epoch": 1.6406351263771874, + "grad_norm": 0.8755943179130554, + "learning_rate": 2.2424483476932847e-06, + "loss": 0.1099, + "step": 10126 + }, + { + "epoch": 1.640797148412184, + "grad_norm": 0.7338682413101196, + "learning_rate": 2.2420133743363524e-06, + "loss": 0.0978, + "step": 10127 + }, + { + "epoch": 1.6409591704471809, + "grad_norm": 0.7446590065956116, + "learning_rate": 2.241578408873186e-06, + "loss": 0.0917, + "step": 10128 + }, + { + "epoch": 1.6411211924821776, + "grad_norm": 0.8787501454353333, + "learning_rate": 2.2411434513170955e-06, + "loss": 0.1037, + "step": 10129 + }, + { + "epoch": 1.6412832145171743, + "grad_norm": 0.73140549659729, + "learning_rate": 2.2407085016813895e-06, + "loss": 0.0915, + "step": 10130 + }, + { + "epoch": 1.6414452365521712, + "grad_norm": 0.6761218905448914, + "learning_rate": 2.2402735599793754e-06, + "loss": 0.0846, + "step": 10131 + }, + { + "epoch": 1.6416072585871677, + "grad_norm": 0.9457839131355286, + "learning_rate": 2.239838626224361e-06, + "loss": 0.1162, + "step": 10132 + }, + { + "epoch": 1.6417692806221647, + "grad_norm": 0.7481329441070557, + "learning_rate": 2.2394037004296566e-06, + "loss": 0.0894, + "step": 10133 + }, + { + "epoch": 1.6419313026571614, + "grad_norm": 0.8582826852798462, + "learning_rate": 2.2389687826085675e-06, + "loss": 0.1031, + "step": 10134 + }, + { + "epoch": 1.642093324692158, + "grad_norm": 0.9034931063652039, + "learning_rate": 2.2385338727744027e-06, + "loss": 0.1094, + "step": 10135 + }, + { + "epoch": 1.642255346727155, + "grad_norm": 0.8110890984535217, + "learning_rate": 2.238098970940468e-06, + "loss": 0.0943, + "step": 10136 + }, + { + "epoch": 1.6424173687621515, + "grad_norm": 0.7057623863220215, + "learning_rate": 2.23766407712007e-06, + "loss": 0.0871, + "step": 10137 + }, + { + "epoch": 1.6425793907971484, + "grad_norm": 0.8530978560447693, + "learning_rate": 2.2372291913265177e-06, + "loss": 0.0934, + "step": 10138 + }, + { + "epoch": 1.6427414128321451, + "grad_norm": 0.8763788342475891, + "learning_rate": 2.2367943135731164e-06, + "loss": 0.1104, + "step": 10139 + }, + { + "epoch": 1.6429034348671419, + "grad_norm": 0.8446993827819824, + "learning_rate": 2.236359443873172e-06, + "loss": 0.1086, + "step": 10140 + }, + { + "epoch": 1.6430654569021388, + "grad_norm": 0.9074305891990662, + "learning_rate": 2.2359245822399908e-06, + "loss": 0.1015, + "step": 10141 + }, + { + "epoch": 1.6432274789371355, + "grad_norm": 0.8572295904159546, + "learning_rate": 2.2354897286868773e-06, + "loss": 0.1, + "step": 10142 + }, + { + "epoch": 1.6433895009721322, + "grad_norm": 0.8138932585716248, + "learning_rate": 2.2350548832271386e-06, + "loss": 0.0973, + "step": 10143 + }, + { + "epoch": 1.643551523007129, + "grad_norm": 0.9844521880149841, + "learning_rate": 2.23462004587408e-06, + "loss": 0.1023, + "step": 10144 + }, + { + "epoch": 1.6437135450421256, + "grad_norm": 0.794780433177948, + "learning_rate": 2.2341852166410048e-06, + "loss": 0.0918, + "step": 10145 + }, + { + "epoch": 1.6438755670771226, + "grad_norm": 0.8207656145095825, + "learning_rate": 2.233750395541219e-06, + "loss": 0.0913, + "step": 10146 + }, + { + "epoch": 1.6440375891121193, + "grad_norm": 0.7891299724578857, + "learning_rate": 2.233315582588026e-06, + "loss": 0.0977, + "step": 10147 + }, + { + "epoch": 1.644199611147116, + "grad_norm": 0.7756161689758301, + "learning_rate": 2.2328807777947323e-06, + "loss": 0.094, + "step": 10148 + }, + { + "epoch": 1.644361633182113, + "grad_norm": 0.9181487560272217, + "learning_rate": 2.23244598117464e-06, + "loss": 0.1011, + "step": 10149 + }, + { + "epoch": 1.6445236552171094, + "grad_norm": 0.7459124326705933, + "learning_rate": 2.232011192741053e-06, + "loss": 0.0858, + "step": 10150 + }, + { + "epoch": 1.6446856772521063, + "grad_norm": 0.8103591203689575, + "learning_rate": 2.231576412507275e-06, + "loss": 0.1069, + "step": 10151 + }, + { + "epoch": 1.644847699287103, + "grad_norm": 0.9799508452415466, + "learning_rate": 2.2311416404866085e-06, + "loss": 0.1186, + "step": 10152 + }, + { + "epoch": 1.6450097213220998, + "grad_norm": 0.7545525431632996, + "learning_rate": 2.2307068766923584e-06, + "loss": 0.0958, + "step": 10153 + }, + { + "epoch": 1.6451717433570967, + "grad_norm": 0.8151463866233826, + "learning_rate": 2.2302721211378254e-06, + "loss": 0.0976, + "step": 10154 + }, + { + "epoch": 1.6453337653920932, + "grad_norm": 0.8132693767547607, + "learning_rate": 2.229837373836313e-06, + "loss": 0.1047, + "step": 10155 + }, + { + "epoch": 1.6454957874270901, + "grad_norm": 0.6447371244430542, + "learning_rate": 2.2294026348011223e-06, + "loss": 0.0838, + "step": 10156 + }, + { + "epoch": 1.6456578094620868, + "grad_norm": 0.8918012380599976, + "learning_rate": 2.228967904045558e-06, + "loss": 0.1196, + "step": 10157 + }, + { + "epoch": 1.6458198314970836, + "grad_norm": 0.7891850471496582, + "learning_rate": 2.2285331815829187e-06, + "loss": 0.1002, + "step": 10158 + }, + { + "epoch": 1.6459818535320805, + "grad_norm": 0.7595945000648499, + "learning_rate": 2.2280984674265077e-06, + "loss": 0.0927, + "step": 10159 + }, + { + "epoch": 1.646143875567077, + "grad_norm": 0.8276386857032776, + "learning_rate": 2.227663761589625e-06, + "loss": 0.1031, + "step": 10160 + }, + { + "epoch": 1.646305897602074, + "grad_norm": 0.8383728861808777, + "learning_rate": 2.227229064085572e-06, + "loss": 0.0979, + "step": 10161 + }, + { + "epoch": 1.6464679196370706, + "grad_norm": 0.6650486588478088, + "learning_rate": 2.2267943749276503e-06, + "loss": 0.0886, + "step": 10162 + }, + { + "epoch": 1.6466299416720673, + "grad_norm": 0.7667943835258484, + "learning_rate": 2.2263596941291595e-06, + "loss": 0.0959, + "step": 10163 + }, + { + "epoch": 1.6467919637070643, + "grad_norm": 0.9279943108558655, + "learning_rate": 2.225925021703399e-06, + "loss": 0.1164, + "step": 10164 + }, + { + "epoch": 1.6469539857420608, + "grad_norm": 0.608851969242096, + "learning_rate": 2.2254903576636713e-06, + "loss": 0.076, + "step": 10165 + }, + { + "epoch": 1.6471160077770577, + "grad_norm": 0.7714102268218994, + "learning_rate": 2.2250557020232724e-06, + "loss": 0.0894, + "step": 10166 + }, + { + "epoch": 1.6472780298120544, + "grad_norm": 0.9038023352622986, + "learning_rate": 2.2246210547955043e-06, + "loss": 0.1038, + "step": 10167 + }, + { + "epoch": 1.6474400518470511, + "grad_norm": 0.8809518814086914, + "learning_rate": 2.2241864159936664e-06, + "loss": 0.1081, + "step": 10168 + }, + { + "epoch": 1.647602073882048, + "grad_norm": 0.7901051640510559, + "learning_rate": 2.2237517856310558e-06, + "loss": 0.0955, + "step": 10169 + }, + { + "epoch": 1.6477640959170448, + "grad_norm": 0.8911927938461304, + "learning_rate": 2.223317163720973e-06, + "loss": 0.1086, + "step": 10170 + }, + { + "epoch": 1.6479261179520415, + "grad_norm": 0.7963345050811768, + "learning_rate": 2.2228825502767133e-06, + "loss": 0.0976, + "step": 10171 + }, + { + "epoch": 1.6480881399870384, + "grad_norm": 0.902242124080658, + "learning_rate": 2.222447945311579e-06, + "loss": 0.1149, + "step": 10172 + }, + { + "epoch": 1.648250162022035, + "grad_norm": 0.8996853232383728, + "learning_rate": 2.2220133488388652e-06, + "loss": 0.118, + "step": 10173 + }, + { + "epoch": 1.6484121840570318, + "grad_norm": 0.9371250867843628, + "learning_rate": 2.2215787608718706e-06, + "loss": 0.1137, + "step": 10174 + }, + { + "epoch": 1.6485742060920285, + "grad_norm": 0.9435984492301941, + "learning_rate": 2.221144181423892e-06, + "loss": 0.1131, + "step": 10175 + }, + { + "epoch": 1.6487362281270252, + "grad_norm": 0.7899986505508423, + "learning_rate": 2.220709610508226e-06, + "loss": 0.095, + "step": 10176 + }, + { + "epoch": 1.6488982501620222, + "grad_norm": 0.7958756685256958, + "learning_rate": 2.220275048138171e-06, + "loss": 0.096, + "step": 10177 + }, + { + "epoch": 1.6490602721970187, + "grad_norm": 0.7749899625778198, + "learning_rate": 2.2198404943270217e-06, + "loss": 0.0984, + "step": 10178 + }, + { + "epoch": 1.6492222942320156, + "grad_norm": 0.7689141631126404, + "learning_rate": 2.2194059490880764e-06, + "loss": 0.0908, + "step": 10179 + }, + { + "epoch": 1.6493843162670123, + "grad_norm": 0.869452714920044, + "learning_rate": 2.218971412434628e-06, + "loss": 0.1048, + "step": 10180 + }, + { + "epoch": 1.649546338302009, + "grad_norm": 0.840836226940155, + "learning_rate": 2.2185368843799764e-06, + "loss": 0.1033, + "step": 10181 + }, + { + "epoch": 1.649708360337006, + "grad_norm": 0.8797898888587952, + "learning_rate": 2.218102364937414e-06, + "loss": 0.1093, + "step": 10182 + }, + { + "epoch": 1.6498703823720025, + "grad_norm": 0.7758781909942627, + "learning_rate": 2.217667854120238e-06, + "loss": 0.094, + "step": 10183 + }, + { + "epoch": 1.6500324044069994, + "grad_norm": 0.830178439617157, + "learning_rate": 2.2172333519417415e-06, + "loss": 0.1035, + "step": 10184 + }, + { + "epoch": 1.650194426441996, + "grad_norm": 0.8420971632003784, + "learning_rate": 2.2167988584152198e-06, + "loss": 0.1108, + "step": 10185 + }, + { + "epoch": 1.6503564484769928, + "grad_norm": 0.8024832606315613, + "learning_rate": 2.2163643735539688e-06, + "loss": 0.0986, + "step": 10186 + }, + { + "epoch": 1.6505184705119897, + "grad_norm": 0.8306187391281128, + "learning_rate": 2.215929897371281e-06, + "loss": 0.1036, + "step": 10187 + }, + { + "epoch": 1.6506804925469862, + "grad_norm": 0.7863216400146484, + "learning_rate": 2.2154954298804514e-06, + "loss": 0.1019, + "step": 10188 + }, + { + "epoch": 1.6508425145819832, + "grad_norm": 0.7025604248046875, + "learning_rate": 2.215060971094773e-06, + "loss": 0.0838, + "step": 10189 + }, + { + "epoch": 1.6510045366169799, + "grad_norm": 0.9769557118415833, + "learning_rate": 2.214626521027538e-06, + "loss": 0.0856, + "step": 10190 + }, + { + "epoch": 1.6511665586519766, + "grad_norm": 0.7868211269378662, + "learning_rate": 2.214192079692042e-06, + "loss": 0.0988, + "step": 10191 + }, + { + "epoch": 1.6513285806869735, + "grad_norm": 0.8536260724067688, + "learning_rate": 2.213757647101577e-06, + "loss": 0.1069, + "step": 10192 + }, + { + "epoch": 1.6514906027219702, + "grad_norm": 0.8652046322822571, + "learning_rate": 2.2133232232694354e-06, + "loss": 0.1003, + "step": 10193 + }, + { + "epoch": 1.651652624756967, + "grad_norm": 0.7852055430412292, + "learning_rate": 2.2128888082089093e-06, + "loss": 0.0992, + "step": 10194 + }, + { + "epoch": 1.6518146467919637, + "grad_norm": 0.7507728338241577, + "learning_rate": 2.2124544019332898e-06, + "loss": 0.0894, + "step": 10195 + }, + { + "epoch": 1.6519766688269604, + "grad_norm": 0.8173365592956543, + "learning_rate": 2.2120200044558705e-06, + "loss": 0.1081, + "step": 10196 + }, + { + "epoch": 1.6521386908619573, + "grad_norm": 0.9333679676055908, + "learning_rate": 2.211585615789943e-06, + "loss": 0.1155, + "step": 10197 + }, + { + "epoch": 1.652300712896954, + "grad_norm": 0.861423671245575, + "learning_rate": 2.2111512359487967e-06, + "loss": 0.0958, + "step": 10198 + }, + { + "epoch": 1.6524627349319507, + "grad_norm": 0.7481124997138977, + "learning_rate": 2.2107168649457233e-06, + "loss": 0.097, + "step": 10199 + }, + { + "epoch": 1.6526247569669477, + "grad_norm": 0.8404136896133423, + "learning_rate": 2.2102825027940143e-06, + "loss": 0.0984, + "step": 10200 + }, + { + "epoch": 1.6527867790019442, + "grad_norm": 0.7924706339836121, + "learning_rate": 2.20984814950696e-06, + "loss": 0.103, + "step": 10201 + }, + { + "epoch": 1.652948801036941, + "grad_norm": 0.8639355301856995, + "learning_rate": 2.2094138050978496e-06, + "loss": 0.1058, + "step": 10202 + }, + { + "epoch": 1.6531108230719378, + "grad_norm": 0.7134700417518616, + "learning_rate": 2.2089794695799744e-06, + "loss": 0.093, + "step": 10203 + }, + { + "epoch": 1.6532728451069345, + "grad_norm": 0.7675890922546387, + "learning_rate": 2.2085451429666215e-06, + "loss": 0.0912, + "step": 10204 + }, + { + "epoch": 1.6534348671419314, + "grad_norm": 0.9659287929534912, + "learning_rate": 2.208110825271083e-06, + "loss": 0.1052, + "step": 10205 + }, + { + "epoch": 1.653596889176928, + "grad_norm": 0.821732759475708, + "learning_rate": 2.207676516506647e-06, + "loss": 0.0996, + "step": 10206 + }, + { + "epoch": 1.6537589112119249, + "grad_norm": 0.8499715924263, + "learning_rate": 2.2072422166866024e-06, + "loss": 0.1036, + "step": 10207 + }, + { + "epoch": 1.6539209332469216, + "grad_norm": 0.8570707440376282, + "learning_rate": 2.206807925824237e-06, + "loss": 0.0909, + "step": 10208 + }, + { + "epoch": 1.6540829552819183, + "grad_norm": 0.8913640975952148, + "learning_rate": 2.206373643932839e-06, + "loss": 0.1169, + "step": 10209 + }, + { + "epoch": 1.6542449773169152, + "grad_norm": 0.7439228296279907, + "learning_rate": 2.205939371025698e-06, + "loss": 0.0964, + "step": 10210 + }, + { + "epoch": 1.6544069993519117, + "grad_norm": 0.8434868454933167, + "learning_rate": 2.2055051071161e-06, + "loss": 0.1139, + "step": 10211 + }, + { + "epoch": 1.6545690213869086, + "grad_norm": 0.8420630097389221, + "learning_rate": 2.205070852217334e-06, + "loss": 0.1057, + "step": 10212 + }, + { + "epoch": 1.6547310434219054, + "grad_norm": 1.035088062286377, + "learning_rate": 2.204636606342685e-06, + "loss": 0.112, + "step": 10213 + }, + { + "epoch": 1.654893065456902, + "grad_norm": 0.7095579504966736, + "learning_rate": 2.204202369505441e-06, + "loss": 0.0954, + "step": 10214 + }, + { + "epoch": 1.655055087491899, + "grad_norm": 0.8334465622901917, + "learning_rate": 2.2037681417188895e-06, + "loss": 0.1064, + "step": 10215 + }, + { + "epoch": 1.6552171095268955, + "grad_norm": 0.7914650440216064, + "learning_rate": 2.203333922996316e-06, + "loss": 0.1005, + "step": 10216 + }, + { + "epoch": 1.6553791315618924, + "grad_norm": 0.6868200898170471, + "learning_rate": 2.2028997133510065e-06, + "loss": 0.0868, + "step": 10217 + }, + { + "epoch": 1.6555411535968891, + "grad_norm": 0.7120256423950195, + "learning_rate": 2.202465512796247e-06, + "loss": 0.0839, + "step": 10218 + }, + { + "epoch": 1.6557031756318858, + "grad_norm": 0.8592171669006348, + "learning_rate": 2.2020313213453216e-06, + "loss": 0.1126, + "step": 10219 + }, + { + "epoch": 1.6558651976668828, + "grad_norm": 0.8605828285217285, + "learning_rate": 2.2015971390115172e-06, + "loss": 0.0982, + "step": 10220 + }, + { + "epoch": 1.6560272197018795, + "grad_norm": 0.75331050157547, + "learning_rate": 2.2011629658081194e-06, + "loss": 0.0965, + "step": 10221 + }, + { + "epoch": 1.6561892417368762, + "grad_norm": 0.8181818127632141, + "learning_rate": 2.2007288017484105e-06, + "loss": 0.0948, + "step": 10222 + }, + { + "epoch": 1.6563512637718731, + "grad_norm": 0.8317630290985107, + "learning_rate": 2.2002946468456758e-06, + "loss": 0.0963, + "step": 10223 + }, + { + "epoch": 1.6565132858068696, + "grad_norm": 0.8039714694023132, + "learning_rate": 2.1998605011131997e-06, + "loss": 0.0992, + "step": 10224 + }, + { + "epoch": 1.6566753078418666, + "grad_norm": 0.8288991451263428, + "learning_rate": 2.199426364564267e-06, + "loss": 0.1062, + "step": 10225 + }, + { + "epoch": 1.6568373298768633, + "grad_norm": 0.781379222869873, + "learning_rate": 2.19899223721216e-06, + "loss": 0.0961, + "step": 10226 + }, + { + "epoch": 1.65699935191186, + "grad_norm": 0.8361613750457764, + "learning_rate": 2.1985581190701617e-06, + "loss": 0.1011, + "step": 10227 + }, + { + "epoch": 1.657161373946857, + "grad_norm": 0.8540022373199463, + "learning_rate": 2.1981240101515548e-06, + "loss": 0.1012, + "step": 10228 + }, + { + "epoch": 1.6573233959818534, + "grad_norm": 0.7892126441001892, + "learning_rate": 2.197689910469623e-06, + "loss": 0.087, + "step": 10229 + }, + { + "epoch": 1.6574854180168503, + "grad_norm": 0.9751836061477661, + "learning_rate": 2.1972558200376497e-06, + "loss": 0.1122, + "step": 10230 + }, + { + "epoch": 1.657647440051847, + "grad_norm": 0.9051089882850647, + "learning_rate": 2.1968217388689145e-06, + "loss": 0.1043, + "step": 10231 + }, + { + "epoch": 1.6578094620868438, + "grad_norm": 0.783577024936676, + "learning_rate": 2.1963876669767008e-06, + "loss": 0.094, + "step": 10232 + }, + { + "epoch": 1.6579714841218407, + "grad_norm": 0.835684597492218, + "learning_rate": 2.1959536043742887e-06, + "loss": 0.0975, + "step": 10233 + }, + { + "epoch": 1.6581335061568372, + "grad_norm": 0.8699052929878235, + "learning_rate": 2.1955195510749614e-06, + "loss": 0.1112, + "step": 10234 + }, + { + "epoch": 1.6582955281918341, + "grad_norm": 0.8392834067344666, + "learning_rate": 2.1950855070919992e-06, + "loss": 0.1118, + "step": 10235 + }, + { + "epoch": 1.6584575502268308, + "grad_norm": 0.7662588357925415, + "learning_rate": 2.1946514724386827e-06, + "loss": 0.1046, + "step": 10236 + }, + { + "epoch": 1.6586195722618275, + "grad_norm": 0.8604794144630432, + "learning_rate": 2.194217447128292e-06, + "loss": 0.1106, + "step": 10237 + }, + { + "epoch": 1.6587815942968245, + "grad_norm": 0.7735698223114014, + "learning_rate": 2.1937834311741066e-06, + "loss": 0.091, + "step": 10238 + }, + { + "epoch": 1.658943616331821, + "grad_norm": 0.8237504959106445, + "learning_rate": 2.1933494245894087e-06, + "loss": 0.0916, + "step": 10239 + }, + { + "epoch": 1.659105638366818, + "grad_norm": 0.7927703857421875, + "learning_rate": 2.192915427387475e-06, + "loss": 0.0967, + "step": 10240 + }, + { + "epoch": 1.6592676604018146, + "grad_norm": 0.7646491527557373, + "learning_rate": 2.1924814395815875e-06, + "loss": 0.0996, + "step": 10241 + }, + { + "epoch": 1.6594296824368113, + "grad_norm": 0.7301465272903442, + "learning_rate": 2.1920474611850225e-06, + "loss": 0.0896, + "step": 10242 + }, + { + "epoch": 1.6595917044718083, + "grad_norm": 0.7497079372406006, + "learning_rate": 2.19161349221106e-06, + "loss": 0.0904, + "step": 10243 + }, + { + "epoch": 1.659753726506805, + "grad_norm": 0.8112010955810547, + "learning_rate": 2.1911795326729784e-06, + "loss": 0.0977, + "step": 10244 + }, + { + "epoch": 1.6599157485418017, + "grad_norm": 0.7948684692382812, + "learning_rate": 2.1907455825840568e-06, + "loss": 0.0968, + "step": 10245 + }, + { + "epoch": 1.6600777705767984, + "grad_norm": 0.7585175633430481, + "learning_rate": 2.190311641957571e-06, + "loss": 0.0896, + "step": 10246 + }, + { + "epoch": 1.660239792611795, + "grad_norm": 0.7399813532829285, + "learning_rate": 2.189877710806799e-06, + "loss": 0.0879, + "step": 10247 + }, + { + "epoch": 1.660401814646792, + "grad_norm": 0.7585265636444092, + "learning_rate": 2.18944378914502e-06, + "loss": 0.0918, + "step": 10248 + }, + { + "epoch": 1.6605638366817888, + "grad_norm": 0.9892318844795227, + "learning_rate": 2.189009876985509e-06, + "loss": 0.1184, + "step": 10249 + }, + { + "epoch": 1.6607258587167855, + "grad_norm": 0.8371285200119019, + "learning_rate": 2.188575974341543e-06, + "loss": 0.103, + "step": 10250 + }, + { + "epoch": 1.6608878807517824, + "grad_norm": 0.7434375882148743, + "learning_rate": 2.188142081226399e-06, + "loss": 0.0905, + "step": 10251 + }, + { + "epoch": 1.6610499027867789, + "grad_norm": 0.7424872517585754, + "learning_rate": 2.1877081976533515e-06, + "loss": 0.0832, + "step": 10252 + }, + { + "epoch": 1.6612119248217758, + "grad_norm": 0.9748948812484741, + "learning_rate": 2.1872743236356783e-06, + "loss": 0.1224, + "step": 10253 + }, + { + "epoch": 1.6613739468567725, + "grad_norm": 0.7499638199806213, + "learning_rate": 2.186840459186654e-06, + "loss": 0.0857, + "step": 10254 + }, + { + "epoch": 1.6615359688917692, + "grad_norm": 0.8377159237861633, + "learning_rate": 2.186406604319554e-06, + "loss": 0.0981, + "step": 10255 + }, + { + "epoch": 1.6616979909267662, + "grad_norm": 0.8437566161155701, + "learning_rate": 2.185972759047653e-06, + "loss": 0.0917, + "step": 10256 + }, + { + "epoch": 1.6618600129617627, + "grad_norm": 0.9447170495986938, + "learning_rate": 2.185538923384225e-06, + "loss": 0.1206, + "step": 10257 + }, + { + "epoch": 1.6620220349967596, + "grad_norm": 0.9525909423828125, + "learning_rate": 2.1851050973425454e-06, + "loss": 0.1178, + "step": 10258 + }, + { + "epoch": 1.6621840570317563, + "grad_norm": 0.8417654633522034, + "learning_rate": 2.1846712809358876e-06, + "loss": 0.0978, + "step": 10259 + }, + { + "epoch": 1.662346079066753, + "grad_norm": 0.8003092408180237, + "learning_rate": 2.1842374741775262e-06, + "loss": 0.1017, + "step": 10260 + }, + { + "epoch": 1.66250810110175, + "grad_norm": 0.7186848521232605, + "learning_rate": 2.183803677080733e-06, + "loss": 0.0841, + "step": 10261 + }, + { + "epoch": 1.6626701231367464, + "grad_norm": 0.8107516765594482, + "learning_rate": 2.1833698896587816e-06, + "loss": 0.0997, + "step": 10262 + }, + { + "epoch": 1.6628321451717434, + "grad_norm": 0.852218508720398, + "learning_rate": 2.182936111924947e-06, + "loss": 0.1055, + "step": 10263 + }, + { + "epoch": 1.66299416720674, + "grad_norm": 0.7575379610061646, + "learning_rate": 2.1825023438924995e-06, + "loss": 0.095, + "step": 10264 + }, + { + "epoch": 1.6631561892417368, + "grad_norm": 0.7673975825309753, + "learning_rate": 2.182068585574712e-06, + "loss": 0.1017, + "step": 10265 + }, + { + "epoch": 1.6633182112767337, + "grad_norm": 0.8737732172012329, + "learning_rate": 2.1816348369848555e-06, + "loss": 0.1138, + "step": 10266 + }, + { + "epoch": 1.6634802333117304, + "grad_norm": 0.7601016163825989, + "learning_rate": 2.1812010981362033e-06, + "loss": 0.0916, + "step": 10267 + }, + { + "epoch": 1.6636422553467272, + "grad_norm": 0.7540645003318787, + "learning_rate": 2.180767369042026e-06, + "loss": 0.1005, + "step": 10268 + }, + { + "epoch": 1.6638042773817239, + "grad_norm": 0.7578513622283936, + "learning_rate": 2.180333649715595e-06, + "loss": 0.0875, + "step": 10269 + }, + { + "epoch": 1.6639662994167206, + "grad_norm": 0.682778000831604, + "learning_rate": 2.1798999401701802e-06, + "loss": 0.0829, + "step": 10270 + }, + { + "epoch": 1.6641283214517175, + "grad_norm": 0.7566827535629272, + "learning_rate": 2.1794662404190526e-06, + "loss": 0.0932, + "step": 10271 + }, + { + "epoch": 1.6642903434867142, + "grad_norm": 0.7969744205474854, + "learning_rate": 2.1790325504754827e-06, + "loss": 0.099, + "step": 10272 + }, + { + "epoch": 1.664452365521711, + "grad_norm": 0.7715543508529663, + "learning_rate": 2.17859887035274e-06, + "loss": 0.0954, + "step": 10273 + }, + { + "epoch": 1.6646143875567079, + "grad_norm": 0.9869740009307861, + "learning_rate": 2.1781652000640947e-06, + "loss": 0.1277, + "step": 10274 + }, + { + "epoch": 1.6647764095917044, + "grad_norm": 0.7891120910644531, + "learning_rate": 2.1777315396228145e-06, + "loss": 0.1008, + "step": 10275 + }, + { + "epoch": 1.6649384316267013, + "grad_norm": 0.7361397743225098, + "learning_rate": 2.177297889042169e-06, + "loss": 0.0878, + "step": 10276 + }, + { + "epoch": 1.665100453661698, + "grad_norm": 0.8581258654594421, + "learning_rate": 2.1768642483354274e-06, + "loss": 0.103, + "step": 10277 + }, + { + "epoch": 1.6652624756966947, + "grad_norm": 0.8369101285934448, + "learning_rate": 2.1764306175158588e-06, + "loss": 0.0994, + "step": 10278 + }, + { + "epoch": 1.6654244977316917, + "grad_norm": 1.0208182334899902, + "learning_rate": 2.1759969965967293e-06, + "loss": 0.1056, + "step": 10279 + }, + { + "epoch": 1.6655865197666881, + "grad_norm": 0.717728316783905, + "learning_rate": 2.1755633855913086e-06, + "loss": 0.0869, + "step": 10280 + }, + { + "epoch": 1.665748541801685, + "grad_norm": 0.7904380559921265, + "learning_rate": 2.175129784512862e-06, + "loss": 0.0956, + "step": 10281 + }, + { + "epoch": 1.6659105638366818, + "grad_norm": 0.8746015429496765, + "learning_rate": 2.174696193374658e-06, + "loss": 0.1046, + "step": 10282 + }, + { + "epoch": 1.6660725858716785, + "grad_norm": 0.7690415978431702, + "learning_rate": 2.1742626121899645e-06, + "loss": 0.0932, + "step": 10283 + }, + { + "epoch": 1.6662346079066754, + "grad_norm": 0.8206772208213806, + "learning_rate": 2.173829040972046e-06, + "loss": 0.107, + "step": 10284 + }, + { + "epoch": 1.666396629941672, + "grad_norm": 0.830139696598053, + "learning_rate": 2.1733954797341692e-06, + "loss": 0.1108, + "step": 10285 + }, + { + "epoch": 1.6665586519766689, + "grad_norm": 0.7665162086486816, + "learning_rate": 2.1729619284896e-06, + "loss": 0.096, + "step": 10286 + }, + { + "epoch": 1.6667206740116656, + "grad_norm": 0.9206477403640747, + "learning_rate": 2.1725283872516053e-06, + "loss": 0.1022, + "step": 10287 + }, + { + "epoch": 1.6668826960466623, + "grad_norm": 0.7268409729003906, + "learning_rate": 2.1720948560334492e-06, + "loss": 0.0966, + "step": 10288 + }, + { + "epoch": 1.6670447180816592, + "grad_norm": 0.7334338426589966, + "learning_rate": 2.171661334848397e-06, + "loss": 0.0948, + "step": 10289 + }, + { + "epoch": 1.6672067401166557, + "grad_norm": 0.661564826965332, + "learning_rate": 2.171227823709713e-06, + "loss": 0.0766, + "step": 10290 + }, + { + "epoch": 1.6673687621516526, + "grad_norm": 1.0224709510803223, + "learning_rate": 2.1707943226306626e-06, + "loss": 0.1268, + "step": 10291 + }, + { + "epoch": 1.6675307841866494, + "grad_norm": 0.9280935525894165, + "learning_rate": 2.1703608316245092e-06, + "loss": 0.1148, + "step": 10292 + }, + { + "epoch": 1.667692806221646, + "grad_norm": 0.8443484306335449, + "learning_rate": 2.1699273507045163e-06, + "loss": 0.1064, + "step": 10293 + }, + { + "epoch": 1.667854828256643, + "grad_norm": 0.8637576103210449, + "learning_rate": 2.169493879883948e-06, + "loss": 0.1034, + "step": 10294 + }, + { + "epoch": 1.6680168502916397, + "grad_norm": 0.8016158938407898, + "learning_rate": 2.169060419176066e-06, + "loss": 0.0982, + "step": 10295 + }, + { + "epoch": 1.6681788723266364, + "grad_norm": 0.7897706627845764, + "learning_rate": 2.168626968594136e-06, + "loss": 0.0963, + "step": 10296 + }, + { + "epoch": 1.6683408943616331, + "grad_norm": 0.7999807596206665, + "learning_rate": 2.1681935281514182e-06, + "loss": 0.0962, + "step": 10297 + }, + { + "epoch": 1.6685029163966298, + "grad_norm": 0.8183321356773376, + "learning_rate": 2.167760097861176e-06, + "loss": 0.1015, + "step": 10298 + }, + { + "epoch": 1.6686649384316268, + "grad_norm": 0.7776053547859192, + "learning_rate": 2.16732667773667e-06, + "loss": 0.0998, + "step": 10299 + }, + { + "epoch": 1.6688269604666235, + "grad_norm": 0.983116626739502, + "learning_rate": 2.1668932677911624e-06, + "loss": 0.1021, + "step": 10300 + }, + { + "epoch": 1.6689889825016202, + "grad_norm": 0.8453471064567566, + "learning_rate": 2.1664598680379158e-06, + "loss": 0.0946, + "step": 10301 + }, + { + "epoch": 1.6691510045366171, + "grad_norm": 0.7629480957984924, + "learning_rate": 2.166026478490189e-06, + "loss": 0.097, + "step": 10302 + }, + { + "epoch": 1.6693130265716136, + "grad_norm": 0.7451595664024353, + "learning_rate": 2.1655930991612443e-06, + "loss": 0.0916, + "step": 10303 + }, + { + "epoch": 1.6694750486066106, + "grad_norm": 0.7532944083213806, + "learning_rate": 2.1651597300643418e-06, + "loss": 0.0916, + "step": 10304 + }, + { + "epoch": 1.6696370706416073, + "grad_norm": 0.7477149963378906, + "learning_rate": 2.1647263712127402e-06, + "loss": 0.0871, + "step": 10305 + }, + { + "epoch": 1.669799092676604, + "grad_norm": 0.7762525081634521, + "learning_rate": 2.1642930226197012e-06, + "loss": 0.0979, + "step": 10306 + }, + { + "epoch": 1.669961114711601, + "grad_norm": 0.8461878299713135, + "learning_rate": 2.1638596842984834e-06, + "loss": 0.0979, + "step": 10307 + }, + { + "epoch": 1.6701231367465974, + "grad_norm": 0.9171335697174072, + "learning_rate": 2.1634263562623454e-06, + "loss": 0.105, + "step": 10308 + }, + { + "epoch": 1.6702851587815943, + "grad_norm": 0.7819655537605286, + "learning_rate": 2.162993038524547e-06, + "loss": 0.0919, + "step": 10309 + }, + { + "epoch": 1.670447180816591, + "grad_norm": 0.8039749264717102, + "learning_rate": 2.162559731098345e-06, + "loss": 0.1012, + "step": 10310 + }, + { + "epoch": 1.6706092028515878, + "grad_norm": 0.7070164084434509, + "learning_rate": 2.162126433996999e-06, + "loss": 0.085, + "step": 10311 + }, + { + "epoch": 1.6707712248865847, + "grad_norm": 0.6756847500801086, + "learning_rate": 2.161693147233767e-06, + "loss": 0.0854, + "step": 10312 + }, + { + "epoch": 1.6709332469215812, + "grad_norm": 0.8924721479415894, + "learning_rate": 2.161259870821906e-06, + "loss": 0.1165, + "step": 10313 + }, + { + "epoch": 1.6710952689565781, + "grad_norm": 0.8302946090698242, + "learning_rate": 2.1608266047746723e-06, + "loss": 0.1093, + "step": 10314 + }, + { + "epoch": 1.6712572909915748, + "grad_norm": 0.9837422370910645, + "learning_rate": 2.1603933491053243e-06, + "loss": 0.1093, + "step": 10315 + }, + { + "epoch": 1.6714193130265715, + "grad_norm": 0.8880236744880676, + "learning_rate": 2.1599601038271186e-06, + "loss": 0.1147, + "step": 10316 + }, + { + "epoch": 1.6715813350615685, + "grad_norm": 0.7705572247505188, + "learning_rate": 2.1595268689533105e-06, + "loss": 0.0923, + "step": 10317 + }, + { + "epoch": 1.6717433570965652, + "grad_norm": 0.834516167640686, + "learning_rate": 2.1590936444971563e-06, + "loss": 0.0956, + "step": 10318 + }, + { + "epoch": 1.671905379131562, + "grad_norm": 0.8553721904754639, + "learning_rate": 2.15866043047191e-06, + "loss": 0.1118, + "step": 10319 + }, + { + "epoch": 1.6720674011665586, + "grad_norm": 0.899256706237793, + "learning_rate": 2.1582272268908307e-06, + "loss": 0.1104, + "step": 10320 + }, + { + "epoch": 1.6722294232015553, + "grad_norm": 0.8428982496261597, + "learning_rate": 2.1577940337671698e-06, + "loss": 0.108, + "step": 10321 + }, + { + "epoch": 1.6723914452365523, + "grad_norm": 0.8363228440284729, + "learning_rate": 2.1573608511141845e-06, + "loss": 0.105, + "step": 10322 + }, + { + "epoch": 1.672553467271549, + "grad_norm": 0.7880693674087524, + "learning_rate": 2.1569276789451273e-06, + "loss": 0.1044, + "step": 10323 + }, + { + "epoch": 1.6727154893065457, + "grad_norm": 0.7894632816314697, + "learning_rate": 2.1564945172732523e-06, + "loss": 0.0986, + "step": 10324 + }, + { + "epoch": 1.6728775113415426, + "grad_norm": 0.8343285918235779, + "learning_rate": 2.1560613661118154e-06, + "loss": 0.1084, + "step": 10325 + }, + { + "epoch": 1.673039533376539, + "grad_norm": 0.8286653757095337, + "learning_rate": 2.155628225474067e-06, + "loss": 0.1083, + "step": 10326 + }, + { + "epoch": 1.673201555411536, + "grad_norm": 0.7715649008750916, + "learning_rate": 2.1551950953732627e-06, + "loss": 0.1026, + "step": 10327 + }, + { + "epoch": 1.6733635774465327, + "grad_norm": 0.7973687648773193, + "learning_rate": 2.154761975822653e-06, + "loss": 0.1054, + "step": 10328 + }, + { + "epoch": 1.6735255994815295, + "grad_norm": 0.7777444124221802, + "learning_rate": 2.1543288668354914e-06, + "loss": 0.0949, + "step": 10329 + }, + { + "epoch": 1.6736876215165264, + "grad_norm": 0.8096343278884888, + "learning_rate": 2.1538957684250303e-06, + "loss": 0.1013, + "step": 10330 + }, + { + "epoch": 1.6738496435515229, + "grad_norm": 0.751850962638855, + "learning_rate": 2.153462680604522e-06, + "loss": 0.0921, + "step": 10331 + }, + { + "epoch": 1.6740116655865198, + "grad_norm": 0.8717265725135803, + "learning_rate": 2.1530296033872155e-06, + "loss": 0.1096, + "step": 10332 + }, + { + "epoch": 1.6741736876215165, + "grad_norm": 0.8423851728439331, + "learning_rate": 2.152596536786364e-06, + "loss": 0.1089, + "step": 10333 + }, + { + "epoch": 1.6743357096565132, + "grad_norm": 0.7620172500610352, + "learning_rate": 2.152163480815218e-06, + "loss": 0.095, + "step": 10334 + }, + { + "epoch": 1.6744977316915102, + "grad_norm": 0.7917490005493164, + "learning_rate": 2.151730435487028e-06, + "loss": 0.0961, + "step": 10335 + }, + { + "epoch": 1.6746597537265067, + "grad_norm": 0.8136554956436157, + "learning_rate": 2.151297400815044e-06, + "loss": 0.1113, + "step": 10336 + }, + { + "epoch": 1.6748217757615036, + "grad_norm": 0.8516972064971924, + "learning_rate": 2.150864376812515e-06, + "loss": 0.1168, + "step": 10337 + }, + { + "epoch": 1.6749837977965003, + "grad_norm": 0.7402398586273193, + "learning_rate": 2.150431363492691e-06, + "loss": 0.0879, + "step": 10338 + }, + { + "epoch": 1.675145819831497, + "grad_norm": 0.7780933976173401, + "learning_rate": 2.1499983608688217e-06, + "loss": 0.0975, + "step": 10339 + }, + { + "epoch": 1.675307841866494, + "grad_norm": 0.8871686458587646, + "learning_rate": 2.1495653689541562e-06, + "loss": 0.1125, + "step": 10340 + }, + { + "epoch": 1.6754698639014904, + "grad_norm": 0.854103147983551, + "learning_rate": 2.149132387761942e-06, + "loss": 0.113, + "step": 10341 + }, + { + "epoch": 1.6756318859364874, + "grad_norm": 0.8291221857070923, + "learning_rate": 2.1486994173054276e-06, + "loss": 0.1044, + "step": 10342 + }, + { + "epoch": 1.675793907971484, + "grad_norm": 0.838173508644104, + "learning_rate": 2.14826645759786e-06, + "loss": 0.1034, + "step": 10343 + }, + { + "epoch": 1.6759559300064808, + "grad_norm": 0.7588802576065063, + "learning_rate": 2.1478335086524885e-06, + "loss": 0.1, + "step": 10344 + }, + { + "epoch": 1.6761179520414777, + "grad_norm": 0.7420892119407654, + "learning_rate": 2.14740057048256e-06, + "loss": 0.0877, + "step": 10345 + }, + { + "epoch": 1.6762799740764744, + "grad_norm": 0.823940634727478, + "learning_rate": 2.14696764310132e-06, + "loss": 0.1046, + "step": 10346 + }, + { + "epoch": 1.6764419961114712, + "grad_norm": 0.8017604351043701, + "learning_rate": 2.146534726522016e-06, + "loss": 0.1034, + "step": 10347 + }, + { + "epoch": 1.6766040181464679, + "grad_norm": 0.8648914694786072, + "learning_rate": 2.1461018207578932e-06, + "loss": 0.1073, + "step": 10348 + }, + { + "epoch": 1.6767660401814646, + "grad_norm": 0.7010998129844666, + "learning_rate": 2.145668925822199e-06, + "loss": 0.0923, + "step": 10349 + }, + { + "epoch": 1.6769280622164615, + "grad_norm": 0.7532062530517578, + "learning_rate": 2.1452360417281786e-06, + "loss": 0.0892, + "step": 10350 + }, + { + "epoch": 1.6770900842514582, + "grad_norm": 0.7723690867424011, + "learning_rate": 2.1448031684890767e-06, + "loss": 0.0971, + "step": 10351 + }, + { + "epoch": 1.677252106286455, + "grad_norm": 0.7597529292106628, + "learning_rate": 2.144370306118138e-06, + "loss": 0.095, + "step": 10352 + }, + { + "epoch": 1.6774141283214519, + "grad_norm": 0.9111921787261963, + "learning_rate": 2.1439374546286065e-06, + "loss": 0.0962, + "step": 10353 + }, + { + "epoch": 1.6775761503564484, + "grad_norm": 0.7498681545257568, + "learning_rate": 2.143504614033728e-06, + "loss": 0.0917, + "step": 10354 + }, + { + "epoch": 1.6777381723914453, + "grad_norm": 0.7228400707244873, + "learning_rate": 2.143071784346746e-06, + "loss": 0.0927, + "step": 10355 + }, + { + "epoch": 1.677900194426442, + "grad_norm": 0.6969931721687317, + "learning_rate": 2.142638965580903e-06, + "loss": 0.0866, + "step": 10356 + }, + { + "epoch": 1.6780622164614387, + "grad_norm": 0.8031118512153625, + "learning_rate": 2.1422061577494427e-06, + "loss": 0.1015, + "step": 10357 + }, + { + "epoch": 1.6782242384964356, + "grad_norm": 0.8556954264640808, + "learning_rate": 2.141773360865609e-06, + "loss": 0.1002, + "step": 10358 + }, + { + "epoch": 1.6783862605314321, + "grad_norm": 0.8014951348304749, + "learning_rate": 2.1413405749426432e-06, + "loss": 0.0944, + "step": 10359 + }, + { + "epoch": 1.678548282566429, + "grad_norm": 1.1695481538772583, + "learning_rate": 2.1409077999937883e-06, + "loss": 0.1157, + "step": 10360 + }, + { + "epoch": 1.6787103046014258, + "grad_norm": 0.8790541887283325, + "learning_rate": 2.1404750360322852e-06, + "loss": 0.1036, + "step": 10361 + }, + { + "epoch": 1.6788723266364225, + "grad_norm": 0.8709704279899597, + "learning_rate": 2.1400422830713752e-06, + "loss": 0.1126, + "step": 10362 + }, + { + "epoch": 1.6790343486714194, + "grad_norm": 0.8556258678436279, + "learning_rate": 2.139609541124301e-06, + "loss": 0.1063, + "step": 10363 + }, + { + "epoch": 1.679196370706416, + "grad_norm": 0.779319703578949, + "learning_rate": 2.1391768102043032e-06, + "loss": 0.0961, + "step": 10364 + }, + { + "epoch": 1.6793583927414129, + "grad_norm": 0.8869314789772034, + "learning_rate": 2.138744090324621e-06, + "loss": 0.1104, + "step": 10365 + }, + { + "epoch": 1.6795204147764096, + "grad_norm": 0.9177558422088623, + "learning_rate": 2.1383113814984967e-06, + "loss": 0.1191, + "step": 10366 + }, + { + "epoch": 1.6796824368114063, + "grad_norm": 0.7949991822242737, + "learning_rate": 2.1378786837391673e-06, + "loss": 0.1051, + "step": 10367 + }, + { + "epoch": 1.6798444588464032, + "grad_norm": 0.8216454982757568, + "learning_rate": 2.137445997059874e-06, + "loss": 0.0968, + "step": 10368 + }, + { + "epoch": 1.6800064808814, + "grad_norm": 0.7865923643112183, + "learning_rate": 2.1370133214738573e-06, + "loss": 0.1002, + "step": 10369 + }, + { + "epoch": 1.6801685029163966, + "grad_norm": 0.7961848378181458, + "learning_rate": 2.1365806569943533e-06, + "loss": 0.0945, + "step": 10370 + }, + { + "epoch": 1.6803305249513933, + "grad_norm": 0.843292236328125, + "learning_rate": 2.1361480036346025e-06, + "loss": 0.1049, + "step": 10371 + }, + { + "epoch": 1.68049254698639, + "grad_norm": 0.9252605438232422, + "learning_rate": 2.1357153614078407e-06, + "loss": 0.114, + "step": 10372 + }, + { + "epoch": 1.680654569021387, + "grad_norm": 0.7975597977638245, + "learning_rate": 2.135282730327309e-06, + "loss": 0.0955, + "step": 10373 + }, + { + "epoch": 1.6808165910563837, + "grad_norm": 0.9786580204963684, + "learning_rate": 2.1348501104062423e-06, + "loss": 0.1155, + "step": 10374 + }, + { + "epoch": 1.6809786130913804, + "grad_norm": 0.9285010695457458, + "learning_rate": 2.1344175016578796e-06, + "loss": 0.1207, + "step": 10375 + }, + { + "epoch": 1.6811406351263773, + "grad_norm": 0.7795856595039368, + "learning_rate": 2.1339849040954556e-06, + "loss": 0.0978, + "step": 10376 + }, + { + "epoch": 1.6813026571613738, + "grad_norm": 1.0095703601837158, + "learning_rate": 2.133552317732208e-06, + "loss": 0.1229, + "step": 10377 + }, + { + "epoch": 1.6814646791963708, + "grad_norm": 0.7144725918769836, + "learning_rate": 2.133119742581373e-06, + "loss": 0.0896, + "step": 10378 + }, + { + "epoch": 1.6816267012313675, + "grad_norm": 0.7718963623046875, + "learning_rate": 2.1326871786561856e-06, + "loss": 0.0949, + "step": 10379 + }, + { + "epoch": 1.6817887232663642, + "grad_norm": 0.8956825733184814, + "learning_rate": 2.1322546259698823e-06, + "loss": 0.1074, + "step": 10380 + }, + { + "epoch": 1.6819507453013611, + "grad_norm": 0.8126304745674133, + "learning_rate": 2.131822084535696e-06, + "loss": 0.1, + "step": 10381 + }, + { + "epoch": 1.6821127673363576, + "grad_norm": 0.8905256986618042, + "learning_rate": 2.1313895543668644e-06, + "loss": 0.1035, + "step": 10382 + }, + { + "epoch": 1.6822747893713546, + "grad_norm": 0.7790430188179016, + "learning_rate": 2.13095703547662e-06, + "loss": 0.0894, + "step": 10383 + }, + { + "epoch": 1.6824368114063513, + "grad_norm": 0.7982741594314575, + "learning_rate": 2.1305245278781977e-06, + "loss": 0.1081, + "step": 10384 + }, + { + "epoch": 1.682598833441348, + "grad_norm": 0.790199875831604, + "learning_rate": 2.1300920315848307e-06, + "loss": 0.1041, + "step": 10385 + }, + { + "epoch": 1.682760855476345, + "grad_norm": 0.8125954866409302, + "learning_rate": 2.129659546609751e-06, + "loss": 0.0963, + "step": 10386 + }, + { + "epoch": 1.6829228775113414, + "grad_norm": 0.9038329124450684, + "learning_rate": 2.1292270729661946e-06, + "loss": 0.1123, + "step": 10387 + }, + { + "epoch": 1.6830848995463383, + "grad_norm": 0.8060694336891174, + "learning_rate": 2.1287946106673916e-06, + "loss": 0.0908, + "step": 10388 + }, + { + "epoch": 1.683246921581335, + "grad_norm": 0.7719234824180603, + "learning_rate": 2.128362159726576e-06, + "loss": 0.097, + "step": 10389 + }, + { + "epoch": 1.6834089436163318, + "grad_norm": 1.3537698984146118, + "learning_rate": 2.1279297201569787e-06, + "loss": 0.0971, + "step": 10390 + }, + { + "epoch": 1.6835709656513287, + "grad_norm": 0.8270101547241211, + "learning_rate": 2.1274972919718305e-06, + "loss": 0.1124, + "step": 10391 + }, + { + "epoch": 1.6837329876863252, + "grad_norm": 0.8727843761444092, + "learning_rate": 2.127064875184365e-06, + "loss": 0.1085, + "step": 10392 + }, + { + "epoch": 1.6838950097213221, + "grad_norm": 0.8102154731750488, + "learning_rate": 2.1266324698078116e-06, + "loss": 0.0979, + "step": 10393 + }, + { + "epoch": 1.6840570317563188, + "grad_norm": 0.7432061433792114, + "learning_rate": 2.126200075855401e-06, + "loss": 0.0951, + "step": 10394 + }, + { + "epoch": 1.6842190537913155, + "grad_norm": 0.7901977300643921, + "learning_rate": 2.1257676933403637e-06, + "loss": 0.1051, + "step": 10395 + }, + { + "epoch": 1.6843810758263125, + "grad_norm": 0.84058678150177, + "learning_rate": 2.125335322275928e-06, + "loss": 0.11, + "step": 10396 + }, + { + "epoch": 1.6845430978613092, + "grad_norm": 0.6626092195510864, + "learning_rate": 2.124902962675326e-06, + "loss": 0.0884, + "step": 10397 + }, + { + "epoch": 1.684705119896306, + "grad_norm": 0.7434554100036621, + "learning_rate": 2.1244706145517853e-06, + "loss": 0.0998, + "step": 10398 + }, + { + "epoch": 1.6848671419313026, + "grad_norm": 0.7729745507240295, + "learning_rate": 2.124038277918536e-06, + "loss": 0.0976, + "step": 10399 + }, + { + "epoch": 1.6850291639662993, + "grad_norm": 0.6964062452316284, + "learning_rate": 2.1236059527888044e-06, + "loss": 0.0883, + "step": 10400 + }, + { + "epoch": 1.6851911860012962, + "grad_norm": 0.8118695616722107, + "learning_rate": 2.1231736391758195e-06, + "loss": 0.1021, + "step": 10401 + }, + { + "epoch": 1.685353208036293, + "grad_norm": 0.7241361737251282, + "learning_rate": 2.1227413370928106e-06, + "loss": 0.0955, + "step": 10402 + }, + { + "epoch": 1.6855152300712897, + "grad_norm": 0.8646603226661682, + "learning_rate": 2.1223090465530032e-06, + "loss": 0.0978, + "step": 10403 + }, + { + "epoch": 1.6856772521062866, + "grad_norm": 0.8380978107452393, + "learning_rate": 2.1218767675696255e-06, + "loss": 0.1027, + "step": 10404 + }, + { + "epoch": 1.685839274141283, + "grad_norm": 0.8121195435523987, + "learning_rate": 2.1214445001559025e-06, + "loss": 0.0994, + "step": 10405 + }, + { + "epoch": 1.68600129617628, + "grad_norm": 0.7910208106040955, + "learning_rate": 2.1210122443250625e-06, + "loss": 0.0954, + "step": 10406 + }, + { + "epoch": 1.6861633182112767, + "grad_norm": 0.8031769394874573, + "learning_rate": 2.1205800000903305e-06, + "loss": 0.1046, + "step": 10407 + }, + { + "epoch": 1.6863253402462735, + "grad_norm": 0.7870137095451355, + "learning_rate": 2.1201477674649326e-06, + "loss": 0.1009, + "step": 10408 + }, + { + "epoch": 1.6864873622812704, + "grad_norm": 0.8784118294715881, + "learning_rate": 2.1197155464620934e-06, + "loss": 0.1071, + "step": 10409 + }, + { + "epoch": 1.6866493843162669, + "grad_norm": 0.7897698283195496, + "learning_rate": 2.119283337095038e-06, + "loss": 0.0943, + "step": 10410 + }, + { + "epoch": 1.6868114063512638, + "grad_norm": 0.770966112613678, + "learning_rate": 2.118851139376992e-06, + "loss": 0.0914, + "step": 10411 + }, + { + "epoch": 1.6869734283862605, + "grad_norm": 0.8083310127258301, + "learning_rate": 2.1184189533211783e-06, + "loss": 0.099, + "step": 10412 + }, + { + "epoch": 1.6871354504212572, + "grad_norm": 0.8235817551612854, + "learning_rate": 2.117986778940822e-06, + "loss": 0.1128, + "step": 10413 + }, + { + "epoch": 1.6872974724562542, + "grad_norm": 0.7883199453353882, + "learning_rate": 2.117554616249145e-06, + "loss": 0.0981, + "step": 10414 + }, + { + "epoch": 1.6874594944912507, + "grad_norm": 0.8687751293182373, + "learning_rate": 2.11712246525937e-06, + "loss": 0.1017, + "step": 10415 + }, + { + "epoch": 1.6876215165262476, + "grad_norm": 0.863337516784668, + "learning_rate": 2.1166903259847228e-06, + "loss": 0.111, + "step": 10416 + }, + { + "epoch": 1.6877835385612443, + "grad_norm": 0.7061415314674377, + "learning_rate": 2.116258198438424e-06, + "loss": 0.0925, + "step": 10417 + }, + { + "epoch": 1.687945560596241, + "grad_norm": 0.7574256658554077, + "learning_rate": 2.115826082633695e-06, + "loss": 0.0906, + "step": 10418 + }, + { + "epoch": 1.688107582631238, + "grad_norm": 0.7754940390586853, + "learning_rate": 2.115393978583759e-06, + "loss": 0.1083, + "step": 10419 + }, + { + "epoch": 1.6882696046662347, + "grad_norm": 0.799916684627533, + "learning_rate": 2.114961886301835e-06, + "loss": 0.0945, + "step": 10420 + }, + { + "epoch": 1.6884316267012314, + "grad_norm": 0.9458333849906921, + "learning_rate": 2.114529805801147e-06, + "loss": 0.1102, + "step": 10421 + }, + { + "epoch": 1.688593648736228, + "grad_norm": 0.7674143314361572, + "learning_rate": 2.114097737094914e-06, + "loss": 0.0925, + "step": 10422 + }, + { + "epoch": 1.6887556707712248, + "grad_norm": 1.0405738353729248, + "learning_rate": 2.1136656801963556e-06, + "loss": 0.1039, + "step": 10423 + }, + { + "epoch": 1.6889176928062217, + "grad_norm": 0.9508488178253174, + "learning_rate": 2.1132336351186923e-06, + "loss": 0.1144, + "step": 10424 + }, + { + "epoch": 1.6890797148412184, + "grad_norm": 0.8733676075935364, + "learning_rate": 2.1128016018751444e-06, + "loss": 0.1035, + "step": 10425 + }, + { + "epoch": 1.6892417368762151, + "grad_norm": 0.8814464807510376, + "learning_rate": 2.1123695804789307e-06, + "loss": 0.1017, + "step": 10426 + }, + { + "epoch": 1.689403758911212, + "grad_norm": 0.7157515287399292, + "learning_rate": 2.1119375709432696e-06, + "loss": 0.0853, + "step": 10427 + }, + { + "epoch": 1.6895657809462086, + "grad_norm": 0.8367986083030701, + "learning_rate": 2.11150557328138e-06, + "loss": 0.1023, + "step": 10428 + }, + { + "epoch": 1.6897278029812055, + "grad_norm": 0.8281162977218628, + "learning_rate": 2.1110735875064787e-06, + "loss": 0.1074, + "step": 10429 + }, + { + "epoch": 1.6898898250162022, + "grad_norm": 0.8819671273231506, + "learning_rate": 2.110641613631785e-06, + "loss": 0.1114, + "step": 10430 + }, + { + "epoch": 1.690051847051199, + "grad_norm": 0.796600341796875, + "learning_rate": 2.1102096516705165e-06, + "loss": 0.099, + "step": 10431 + }, + { + "epoch": 1.6902138690861959, + "grad_norm": 0.8326752185821533, + "learning_rate": 2.109777701635889e-06, + "loss": 0.1044, + "step": 10432 + }, + { + "epoch": 1.6903758911211924, + "grad_norm": 0.9205048084259033, + "learning_rate": 2.109345763541119e-06, + "loss": 0.1107, + "step": 10433 + }, + { + "epoch": 1.6905379131561893, + "grad_norm": 0.799943745136261, + "learning_rate": 2.1089138373994226e-06, + "loss": 0.1003, + "step": 10434 + }, + { + "epoch": 1.690699935191186, + "grad_norm": 0.8054278492927551, + "learning_rate": 2.1084819232240177e-06, + "loss": 0.0972, + "step": 10435 + }, + { + "epoch": 1.6908619572261827, + "grad_norm": 0.7532472014427185, + "learning_rate": 2.108050021028118e-06, + "loss": 0.0929, + "step": 10436 + }, + { + "epoch": 1.6910239792611796, + "grad_norm": 0.7412585020065308, + "learning_rate": 2.1076181308249396e-06, + "loss": 0.0968, + "step": 10437 + }, + { + "epoch": 1.6911860012961761, + "grad_norm": 0.8518087267875671, + "learning_rate": 2.1071862526276963e-06, + "loss": 0.0933, + "step": 10438 + }, + { + "epoch": 1.691348023331173, + "grad_norm": 0.7559210658073425, + "learning_rate": 2.1067543864496028e-06, + "loss": 0.0815, + "step": 10439 + }, + { + "epoch": 1.6915100453661698, + "grad_norm": 0.7951977849006653, + "learning_rate": 2.1063225323038744e-06, + "loss": 0.0965, + "step": 10440 + }, + { + "epoch": 1.6916720674011665, + "grad_norm": 0.7670897841453552, + "learning_rate": 2.1058906902037228e-06, + "loss": 0.0946, + "step": 10441 + }, + { + "epoch": 1.6918340894361634, + "grad_norm": 0.7800466418266296, + "learning_rate": 2.1054588601623634e-06, + "loss": 0.0971, + "step": 10442 + }, + { + "epoch": 1.69199611147116, + "grad_norm": 0.872276246547699, + "learning_rate": 2.1050270421930077e-06, + "loss": 0.0954, + "step": 10443 + }, + { + "epoch": 1.6921581335061568, + "grad_norm": 0.8790923357009888, + "learning_rate": 2.104595236308868e-06, + "loss": 0.1148, + "step": 10444 + }, + { + "epoch": 1.6923201555411536, + "grad_norm": 0.8432591557502747, + "learning_rate": 2.104163442523158e-06, + "loss": 0.1034, + "step": 10445 + }, + { + "epoch": 1.6924821775761503, + "grad_norm": 0.7778016328811646, + "learning_rate": 2.1037316608490886e-06, + "loss": 0.0907, + "step": 10446 + }, + { + "epoch": 1.6926441996111472, + "grad_norm": 0.8325784206390381, + "learning_rate": 2.1032998912998712e-06, + "loss": 0.1069, + "step": 10447 + }, + { + "epoch": 1.692806221646144, + "grad_norm": 0.789277195930481, + "learning_rate": 2.1028681338887164e-06, + "loss": 0.1051, + "step": 10448 + }, + { + "epoch": 1.6929682436811406, + "grad_norm": 0.8284058570861816, + "learning_rate": 2.1024363886288375e-06, + "loss": 0.11, + "step": 10449 + }, + { + "epoch": 1.6931302657161373, + "grad_norm": 0.8092173337936401, + "learning_rate": 2.102004655533442e-06, + "loss": 0.1038, + "step": 10450 + }, + { + "epoch": 1.693292287751134, + "grad_norm": 0.7652307152748108, + "learning_rate": 2.1015729346157406e-06, + "loss": 0.097, + "step": 10451 + }, + { + "epoch": 1.693454309786131, + "grad_norm": 0.6917365789413452, + "learning_rate": 2.101141225888944e-06, + "loss": 0.082, + "step": 10452 + }, + { + "epoch": 1.6936163318211277, + "grad_norm": 0.770603597164154, + "learning_rate": 2.10070952936626e-06, + "loss": 0.0931, + "step": 10453 + }, + { + "epoch": 1.6937783538561244, + "grad_norm": 0.8083109855651855, + "learning_rate": 2.100277845060898e-06, + "loss": 0.1012, + "step": 10454 + }, + { + "epoch": 1.6939403758911213, + "grad_norm": 0.7694876194000244, + "learning_rate": 2.0998461729860675e-06, + "loss": 0.0965, + "step": 10455 + }, + { + "epoch": 1.6941023979261178, + "grad_norm": 0.7325409054756165, + "learning_rate": 2.0994145131549755e-06, + "loss": 0.0911, + "step": 10456 + }, + { + "epoch": 1.6942644199611148, + "grad_norm": 0.8317882418632507, + "learning_rate": 2.09898286558083e-06, + "loss": 0.099, + "step": 10457 + }, + { + "epoch": 1.6944264419961115, + "grad_norm": 0.8942394852638245, + "learning_rate": 2.0985512302768366e-06, + "loss": 0.1108, + "step": 10458 + }, + { + "epoch": 1.6945884640311082, + "grad_norm": 0.850330114364624, + "learning_rate": 2.0981196072562067e-06, + "loss": 0.1008, + "step": 10459 + }, + { + "epoch": 1.6947504860661051, + "grad_norm": 0.7139987945556641, + "learning_rate": 2.097687996532143e-06, + "loss": 0.077, + "step": 10460 + }, + { + "epoch": 1.6949125081011016, + "grad_norm": 0.7986920475959778, + "learning_rate": 2.097256398117854e-06, + "loss": 0.0954, + "step": 10461 + }, + { + "epoch": 1.6950745301360985, + "grad_norm": 0.7126197218894958, + "learning_rate": 2.0968248120265433e-06, + "loss": 0.083, + "step": 10462 + }, + { + "epoch": 1.6952365521710953, + "grad_norm": 0.8080899715423584, + "learning_rate": 2.0963932382714175e-06, + "loss": 0.1012, + "step": 10463 + }, + { + "epoch": 1.695398574206092, + "grad_norm": 0.9293273687362671, + "learning_rate": 2.095961676865683e-06, + "loss": 0.1071, + "step": 10464 + }, + { + "epoch": 1.695560596241089, + "grad_norm": 0.7605911493301392, + "learning_rate": 2.0955301278225433e-06, + "loss": 0.1003, + "step": 10465 + }, + { + "epoch": 1.6957226182760854, + "grad_norm": 0.7792356014251709, + "learning_rate": 2.095098591155203e-06, + "loss": 0.0969, + "step": 10466 + }, + { + "epoch": 1.6958846403110823, + "grad_norm": 0.7824127674102783, + "learning_rate": 2.0946670668768652e-06, + "loss": 0.0951, + "step": 10467 + }, + { + "epoch": 1.696046662346079, + "grad_norm": 0.8019390106201172, + "learning_rate": 2.094235555000734e-06, + "loss": 0.0993, + "step": 10468 + }, + { + "epoch": 1.6962086843810757, + "grad_norm": 0.8202789425849915, + "learning_rate": 2.0938040555400137e-06, + "loss": 0.0955, + "step": 10469 + }, + { + "epoch": 1.6963707064160727, + "grad_norm": 0.8047801852226257, + "learning_rate": 2.093372568507907e-06, + "loss": 0.0991, + "step": 10470 + }, + { + "epoch": 1.6965327284510694, + "grad_norm": 0.7065772414207458, + "learning_rate": 2.0929410939176147e-06, + "loss": 0.0858, + "step": 10471 + }, + { + "epoch": 1.696694750486066, + "grad_norm": 0.8477860689163208, + "learning_rate": 2.0925096317823393e-06, + "loss": 0.097, + "step": 10472 + }, + { + "epoch": 1.6968567725210628, + "grad_norm": 0.7459856867790222, + "learning_rate": 2.0920781821152843e-06, + "loss": 0.0914, + "step": 10473 + }, + { + "epoch": 1.6970187945560595, + "grad_norm": 0.8197339773178101, + "learning_rate": 2.091646744929649e-06, + "loss": 0.099, + "step": 10474 + }, + { + "epoch": 1.6971808165910565, + "grad_norm": 0.7804821133613586, + "learning_rate": 2.091215320238636e-06, + "loss": 0.101, + "step": 10475 + }, + { + "epoch": 1.6973428386260532, + "grad_norm": 0.7543721795082092, + "learning_rate": 2.0907839080554443e-06, + "loss": 0.0866, + "step": 10476 + }, + { + "epoch": 1.6975048606610499, + "grad_norm": 0.8172874450683594, + "learning_rate": 2.090352508393274e-06, + "loss": 0.1058, + "step": 10477 + }, + { + "epoch": 1.6976668826960468, + "grad_norm": 0.8281826972961426, + "learning_rate": 2.0899211212653262e-06, + "loss": 0.0992, + "step": 10478 + }, + { + "epoch": 1.6978289047310433, + "grad_norm": 0.784435510635376, + "learning_rate": 2.0894897466848007e-06, + "loss": 0.098, + "step": 10479 + }, + { + "epoch": 1.6979909267660402, + "grad_norm": 0.871392548084259, + "learning_rate": 2.0890583846648945e-06, + "loss": 0.1029, + "step": 10480 + }, + { + "epoch": 1.698152948801037, + "grad_norm": 0.7556376457214355, + "learning_rate": 2.0886270352188082e-06, + "loss": 0.0959, + "step": 10481 + }, + { + "epoch": 1.6983149708360337, + "grad_norm": 0.7945913672447205, + "learning_rate": 2.0881956983597375e-06, + "loss": 0.0944, + "step": 10482 + }, + { + "epoch": 1.6984769928710306, + "grad_norm": 0.8585154414176941, + "learning_rate": 2.0877643741008828e-06, + "loss": 0.106, + "step": 10483 + }, + { + "epoch": 1.698639014906027, + "grad_norm": 0.8008867502212524, + "learning_rate": 2.087333062455441e-06, + "loss": 0.0965, + "step": 10484 + }, + { + "epoch": 1.698801036941024, + "grad_norm": 0.8277882933616638, + "learning_rate": 2.0869017634366087e-06, + "loss": 0.1087, + "step": 10485 + }, + { + "epoch": 1.6989630589760207, + "grad_norm": 0.781322717666626, + "learning_rate": 2.0864704770575824e-06, + "loss": 0.1053, + "step": 10486 + }, + { + "epoch": 1.6991250810110174, + "grad_norm": 0.8764674663543701, + "learning_rate": 2.0860392033315584e-06, + "loss": 0.1127, + "step": 10487 + }, + { + "epoch": 1.6992871030460144, + "grad_norm": 0.8520863652229309, + "learning_rate": 2.085607942271734e-06, + "loss": 0.0996, + "step": 10488 + }, + { + "epoch": 1.6994491250810109, + "grad_norm": 0.8383838534355164, + "learning_rate": 2.085176693891303e-06, + "loss": 0.1004, + "step": 10489 + }, + { + "epoch": 1.6996111471160078, + "grad_norm": 0.8455978631973267, + "learning_rate": 2.0847454582034625e-06, + "loss": 0.0961, + "step": 10490 + }, + { + "epoch": 1.6997731691510045, + "grad_norm": 0.8698278665542603, + "learning_rate": 2.084314235221405e-06, + "loss": 0.109, + "step": 10491 + }, + { + "epoch": 1.6999351911860012, + "grad_norm": 0.7847959995269775, + "learning_rate": 2.0838830249583254e-06, + "loss": 0.1002, + "step": 10492 + }, + { + "epoch": 1.7000972132209982, + "grad_norm": 0.8136616349220276, + "learning_rate": 2.0834518274274195e-06, + "loss": 0.0992, + "step": 10493 + }, + { + "epoch": 1.7002592352559946, + "grad_norm": 0.8103066086769104, + "learning_rate": 2.0830206426418794e-06, + "loss": 0.0894, + "step": 10494 + }, + { + "epoch": 1.7004212572909916, + "grad_norm": 0.7221809029579163, + "learning_rate": 2.0825894706148984e-06, + "loss": 0.0907, + "step": 10495 + }, + { + "epoch": 1.7005832793259883, + "grad_norm": 0.9179300665855408, + "learning_rate": 2.0821583113596686e-06, + "loss": 0.1152, + "step": 10496 + }, + { + "epoch": 1.700745301360985, + "grad_norm": 0.8785554766654968, + "learning_rate": 2.0817271648893848e-06, + "loss": 0.1152, + "step": 10497 + }, + { + "epoch": 1.700907323395982, + "grad_norm": 0.7163464426994324, + "learning_rate": 2.081296031217237e-06, + "loss": 0.0873, + "step": 10498 + }, + { + "epoch": 1.7010693454309787, + "grad_norm": 0.8808443546295166, + "learning_rate": 2.0808649103564173e-06, + "loss": 0.1108, + "step": 10499 + }, + { + "epoch": 1.7012313674659754, + "grad_norm": 0.7949004769325256, + "learning_rate": 2.080433802320117e-06, + "loss": 0.0997, + "step": 10500 + }, + { + "epoch": 1.7013933895009723, + "grad_norm": 0.7522899508476257, + "learning_rate": 2.0800027071215265e-06, + "loss": 0.0996, + "step": 10501 + }, + { + "epoch": 1.7015554115359688, + "grad_norm": 0.7748807072639465, + "learning_rate": 2.0795716247738374e-06, + "loss": 0.1031, + "step": 10502 + }, + { + "epoch": 1.7017174335709657, + "grad_norm": 0.7756296396255493, + "learning_rate": 2.0791405552902396e-06, + "loss": 0.0983, + "step": 10503 + }, + { + "epoch": 1.7018794556059624, + "grad_norm": 0.8680400848388672, + "learning_rate": 2.078709498683922e-06, + "loss": 0.1047, + "step": 10504 + }, + { + "epoch": 1.7020414776409591, + "grad_norm": 0.7261191606521606, + "learning_rate": 2.0782784549680744e-06, + "loss": 0.091, + "step": 10505 + }, + { + "epoch": 1.702203499675956, + "grad_norm": 0.7516106963157654, + "learning_rate": 2.0778474241558845e-06, + "loss": 0.0944, + "step": 10506 + }, + { + "epoch": 1.7023655217109526, + "grad_norm": 0.8887795805931091, + "learning_rate": 2.0774164062605425e-06, + "loss": 0.1098, + "step": 10507 + }, + { + "epoch": 1.7025275437459495, + "grad_norm": 0.7267820835113525, + "learning_rate": 2.0769854012952368e-06, + "loss": 0.0917, + "step": 10508 + }, + { + "epoch": 1.7026895657809462, + "grad_norm": 0.6994649767875671, + "learning_rate": 2.076554409273153e-06, + "loss": 0.0883, + "step": 10509 + }, + { + "epoch": 1.702851587815943, + "grad_norm": 0.8000863194465637, + "learning_rate": 2.0761234302074803e-06, + "loss": 0.0984, + "step": 10510 + }, + { + "epoch": 1.7030136098509399, + "grad_norm": 0.8726096749305725, + "learning_rate": 2.075692464111403e-06, + "loss": 0.1051, + "step": 10511 + }, + { + "epoch": 1.7031756318859363, + "grad_norm": 0.8187968134880066, + "learning_rate": 2.0752615109981116e-06, + "loss": 0.1046, + "step": 10512 + }, + { + "epoch": 1.7033376539209333, + "grad_norm": 0.753304660320282, + "learning_rate": 2.074830570880789e-06, + "loss": 0.0933, + "step": 10513 + }, + { + "epoch": 1.70349967595593, + "grad_norm": 0.8444907665252686, + "learning_rate": 2.0743996437726233e-06, + "loss": 0.093, + "step": 10514 + }, + { + "epoch": 1.7036616979909267, + "grad_norm": 0.8190134167671204, + "learning_rate": 2.073968729686797e-06, + "loss": 0.0956, + "step": 10515 + }, + { + "epoch": 1.7038237200259236, + "grad_norm": 0.6358878016471863, + "learning_rate": 2.073537828636497e-06, + "loss": 0.0766, + "step": 10516 + }, + { + "epoch": 1.7039857420609201, + "grad_norm": 0.7422366738319397, + "learning_rate": 2.0731069406349087e-06, + "loss": 0.086, + "step": 10517 + }, + { + "epoch": 1.704147764095917, + "grad_norm": 0.775372326374054, + "learning_rate": 2.0726760656952137e-06, + "loss": 0.0876, + "step": 10518 + }, + { + "epoch": 1.7043097861309138, + "grad_norm": 0.8271961808204651, + "learning_rate": 2.0722452038305976e-06, + "loss": 0.1032, + "step": 10519 + }, + { + "epoch": 1.7044718081659105, + "grad_norm": 0.6599870920181274, + "learning_rate": 2.0718143550542418e-06, + "loss": 0.0808, + "step": 10520 + }, + { + "epoch": 1.7046338302009074, + "grad_norm": 0.7971540689468384, + "learning_rate": 2.071383519379332e-06, + "loss": 0.1085, + "step": 10521 + }, + { + "epoch": 1.7047958522359041, + "grad_norm": 0.7135748267173767, + "learning_rate": 2.0709526968190483e-06, + "loss": 0.0867, + "step": 10522 + }, + { + "epoch": 1.7049578742709008, + "grad_norm": 0.9244253039360046, + "learning_rate": 2.070521887386575e-06, + "loss": 0.1061, + "step": 10523 + }, + { + "epoch": 1.7051198963058976, + "grad_norm": 0.9357718825340271, + "learning_rate": 2.070091091095092e-06, + "loss": 0.1023, + "step": 10524 + }, + { + "epoch": 1.7052819183408943, + "grad_norm": 0.9502805471420288, + "learning_rate": 2.0696603079577808e-06, + "loss": 0.1071, + "step": 10525 + }, + { + "epoch": 1.7054439403758912, + "grad_norm": 0.8359770178794861, + "learning_rate": 2.0692295379878237e-06, + "loss": 0.0975, + "step": 10526 + }, + { + "epoch": 1.705605962410888, + "grad_norm": 0.9007936120033264, + "learning_rate": 2.0687987811983994e-06, + "loss": 0.1114, + "step": 10527 + }, + { + "epoch": 1.7057679844458846, + "grad_norm": 0.8316621780395508, + "learning_rate": 2.0683680376026897e-06, + "loss": 0.1015, + "step": 10528 + }, + { + "epoch": 1.7059300064808816, + "grad_norm": 0.8183777332305908, + "learning_rate": 2.067937307213873e-06, + "loss": 0.108, + "step": 10529 + }, + { + "epoch": 1.706092028515878, + "grad_norm": 0.8549315333366394, + "learning_rate": 2.0675065900451287e-06, + "loss": 0.1099, + "step": 10530 + }, + { + "epoch": 1.706254050550875, + "grad_norm": 0.8095566630363464, + "learning_rate": 2.0670758861096366e-06, + "loss": 0.1013, + "step": 10531 + }, + { + "epoch": 1.7064160725858717, + "grad_norm": 0.8298711776733398, + "learning_rate": 2.066645195420575e-06, + "loss": 0.1061, + "step": 10532 + }, + { + "epoch": 1.7065780946208684, + "grad_norm": 0.8974525928497314, + "learning_rate": 2.0662145179911216e-06, + "loss": 0.0994, + "step": 10533 + }, + { + "epoch": 1.7067401166558653, + "grad_norm": 0.8173624873161316, + "learning_rate": 2.0657838538344545e-06, + "loss": 0.1013, + "step": 10534 + }, + { + "epoch": 1.7069021386908618, + "grad_norm": 0.8199102878570557, + "learning_rate": 2.06535320296375e-06, + "loss": 0.1004, + "step": 10535 + }, + { + "epoch": 1.7070641607258588, + "grad_norm": 0.8255974054336548, + "learning_rate": 2.0649225653921855e-06, + "loss": 0.1085, + "step": 10536 + }, + { + "epoch": 1.7072261827608555, + "grad_norm": 0.703346848487854, + "learning_rate": 2.064491941132938e-06, + "loss": 0.0922, + "step": 10537 + }, + { + "epoch": 1.7073882047958522, + "grad_norm": 0.7433433532714844, + "learning_rate": 2.064061330199184e-06, + "loss": 0.0975, + "step": 10538 + }, + { + "epoch": 1.7075502268308491, + "grad_norm": 0.7282018065452576, + "learning_rate": 2.0636307326040972e-06, + "loss": 0.0926, + "step": 10539 + }, + { + "epoch": 1.7077122488658456, + "grad_norm": 0.7705861330032349, + "learning_rate": 2.0632001483608544e-06, + "loss": 0.1065, + "step": 10540 + }, + { + "epoch": 1.7078742709008425, + "grad_norm": 0.771189272403717, + "learning_rate": 2.0627695774826305e-06, + "loss": 0.0907, + "step": 10541 + }, + { + "epoch": 1.7080362929358393, + "grad_norm": 0.7684487700462341, + "learning_rate": 2.062339019982599e-06, + "loss": 0.1023, + "step": 10542 + }, + { + "epoch": 1.708198314970836, + "grad_norm": 0.8463597297668457, + "learning_rate": 2.0619084758739348e-06, + "loss": 0.1077, + "step": 10543 + }, + { + "epoch": 1.708360337005833, + "grad_norm": 0.7792380452156067, + "learning_rate": 2.06147794516981e-06, + "loss": 0.102, + "step": 10544 + }, + { + "epoch": 1.7085223590408296, + "grad_norm": 0.8330045938491821, + "learning_rate": 2.061047427883399e-06, + "loss": 0.1016, + "step": 10545 + }, + { + "epoch": 1.7086843810758263, + "grad_norm": 0.7635950446128845, + "learning_rate": 2.0606169240278752e-06, + "loss": 0.1005, + "step": 10546 + }, + { + "epoch": 1.708846403110823, + "grad_norm": 0.8526593446731567, + "learning_rate": 2.0601864336164104e-06, + "loss": 0.1061, + "step": 10547 + }, + { + "epoch": 1.7090084251458197, + "grad_norm": 0.8109990954399109, + "learning_rate": 2.059755956662176e-06, + "loss": 0.1015, + "step": 10548 + }, + { + "epoch": 1.7091704471808167, + "grad_norm": 0.7520654201507568, + "learning_rate": 2.0593254931783436e-06, + "loss": 0.0998, + "step": 10549 + }, + { + "epoch": 1.7093324692158134, + "grad_norm": 0.9032507538795471, + "learning_rate": 2.058895043178085e-06, + "loss": 0.1102, + "step": 10550 + }, + { + "epoch": 1.70949449125081, + "grad_norm": 0.7866460084915161, + "learning_rate": 2.0584646066745707e-06, + "loss": 0.0917, + "step": 10551 + }, + { + "epoch": 1.709656513285807, + "grad_norm": 0.800690770149231, + "learning_rate": 2.0580341836809718e-06, + "loss": 0.0962, + "step": 10552 + }, + { + "epoch": 1.7098185353208035, + "grad_norm": 0.7801680564880371, + "learning_rate": 2.0576037742104563e-06, + "loss": 0.0926, + "step": 10553 + }, + { + "epoch": 1.7099805573558005, + "grad_norm": 0.7548884153366089, + "learning_rate": 2.0571733782761943e-06, + "loss": 0.0968, + "step": 10554 + }, + { + "epoch": 1.7101425793907972, + "grad_norm": 0.7508038282394409, + "learning_rate": 2.056742995891356e-06, + "loss": 0.0899, + "step": 10555 + }, + { + "epoch": 1.7103046014257939, + "grad_norm": 0.8736903667449951, + "learning_rate": 2.0563126270691097e-06, + "loss": 0.1057, + "step": 10556 + }, + { + "epoch": 1.7104666234607908, + "grad_norm": 0.8422747254371643, + "learning_rate": 2.0558822718226226e-06, + "loss": 0.1087, + "step": 10557 + }, + { + "epoch": 1.7106286454957873, + "grad_norm": 0.8379523754119873, + "learning_rate": 2.055451930165063e-06, + "loss": 0.1065, + "step": 10558 + }, + { + "epoch": 1.7107906675307842, + "grad_norm": 0.7081695795059204, + "learning_rate": 2.0550216021095993e-06, + "loss": 0.0838, + "step": 10559 + }, + { + "epoch": 1.710952689565781, + "grad_norm": 0.9157091975212097, + "learning_rate": 2.054591287669398e-06, + "loss": 0.0991, + "step": 10560 + }, + { + "epoch": 1.7111147116007777, + "grad_norm": 0.8604560494422913, + "learning_rate": 2.054160986857625e-06, + "loss": 0.0997, + "step": 10561 + }, + { + "epoch": 1.7112767336357746, + "grad_norm": 0.861049473285675, + "learning_rate": 2.053730699687447e-06, + "loss": 0.1093, + "step": 10562 + }, + { + "epoch": 1.711438755670771, + "grad_norm": 0.8312821388244629, + "learning_rate": 2.053300426172029e-06, + "loss": 0.1004, + "step": 10563 + }, + { + "epoch": 1.711600777705768, + "grad_norm": 0.8654624223709106, + "learning_rate": 2.052870166324537e-06, + "loss": 0.1035, + "step": 10564 + }, + { + "epoch": 1.7117627997407647, + "grad_norm": 0.8277021646499634, + "learning_rate": 2.052439920158137e-06, + "loss": 0.0949, + "step": 10565 + }, + { + "epoch": 1.7119248217757614, + "grad_norm": 0.8619728088378906, + "learning_rate": 2.0520096876859918e-06, + "loss": 0.1026, + "step": 10566 + }, + { + "epoch": 1.7120868438107584, + "grad_norm": 0.8873988389968872, + "learning_rate": 2.051579468921266e-06, + "loss": 0.1064, + "step": 10567 + }, + { + "epoch": 1.7122488658457549, + "grad_norm": 0.8158526420593262, + "learning_rate": 2.051149263877123e-06, + "loss": 0.1018, + "step": 10568 + }, + { + "epoch": 1.7124108878807518, + "grad_norm": 0.8675525784492493, + "learning_rate": 2.0507190725667263e-06, + "loss": 0.1038, + "step": 10569 + }, + { + "epoch": 1.7125729099157485, + "grad_norm": 0.7022666931152344, + "learning_rate": 2.0502888950032396e-06, + "loss": 0.0982, + "step": 10570 + }, + { + "epoch": 1.7127349319507452, + "grad_norm": 1.0205082893371582, + "learning_rate": 2.0498587311998235e-06, + "loss": 0.1244, + "step": 10571 + }, + { + "epoch": 1.7128969539857422, + "grad_norm": 0.7745731472969055, + "learning_rate": 2.0494285811696417e-06, + "loss": 0.1027, + "step": 10572 + }, + { + "epoch": 1.7130589760207389, + "grad_norm": 0.8017035722732544, + "learning_rate": 2.0489984449258537e-06, + "loss": 0.0978, + "step": 10573 + }, + { + "epoch": 1.7132209980557356, + "grad_norm": 0.7118944525718689, + "learning_rate": 2.048568322481623e-06, + "loss": 0.0959, + "step": 10574 + }, + { + "epoch": 1.7133830200907323, + "grad_norm": 0.8438950777053833, + "learning_rate": 2.048138213850109e-06, + "loss": 0.1062, + "step": 10575 + }, + { + "epoch": 1.713545042125729, + "grad_norm": 0.7368760108947754, + "learning_rate": 2.0477081190444724e-06, + "loss": 0.0892, + "step": 10576 + }, + { + "epoch": 1.713707064160726, + "grad_norm": 0.7474925518035889, + "learning_rate": 2.0472780380778724e-06, + "loss": 0.0932, + "step": 10577 + }, + { + "epoch": 1.7138690861957226, + "grad_norm": 0.794523298740387, + "learning_rate": 2.046847970963468e-06, + "loss": 0.0957, + "step": 10578 + }, + { + "epoch": 1.7140311082307194, + "grad_norm": 0.8395006060600281, + "learning_rate": 2.0464179177144207e-06, + "loss": 0.1031, + "step": 10579 + }, + { + "epoch": 1.7141931302657163, + "grad_norm": 0.8234453797340393, + "learning_rate": 2.0459878783438867e-06, + "loss": 0.105, + "step": 10580 + }, + { + "epoch": 1.7143551523007128, + "grad_norm": 0.7391911149024963, + "learning_rate": 2.045557852865025e-06, + "loss": 0.0862, + "step": 10581 + }, + { + "epoch": 1.7145171743357097, + "grad_norm": 0.7759642601013184, + "learning_rate": 2.045127841290993e-06, + "loss": 0.1004, + "step": 10582 + }, + { + "epoch": 1.7146791963707064, + "grad_norm": 0.7772481441497803, + "learning_rate": 2.0446978436349486e-06, + "loss": 0.0965, + "step": 10583 + }, + { + "epoch": 1.7148412184057031, + "grad_norm": 0.8068507313728333, + "learning_rate": 2.0442678599100484e-06, + "loss": 0.0977, + "step": 10584 + }, + { + "epoch": 1.7150032404407, + "grad_norm": 0.8451778292655945, + "learning_rate": 2.0438378901294493e-06, + "loss": 0.1098, + "step": 10585 + }, + { + "epoch": 1.7151652624756966, + "grad_norm": 0.7878326773643494, + "learning_rate": 2.043407934306306e-06, + "loss": 0.0967, + "step": 10586 + }, + { + "epoch": 1.7153272845106935, + "grad_norm": 0.8413339853286743, + "learning_rate": 2.042977992453775e-06, + "loss": 0.0935, + "step": 10587 + }, + { + "epoch": 1.7154893065456902, + "grad_norm": 0.9668422937393188, + "learning_rate": 2.0425480645850124e-06, + "loss": 0.1142, + "step": 10588 + }, + { + "epoch": 1.715651328580687, + "grad_norm": 0.829973578453064, + "learning_rate": 2.042118150713171e-06, + "loss": 0.1115, + "step": 10589 + }, + { + "epoch": 1.7158133506156839, + "grad_norm": 0.8030633330345154, + "learning_rate": 2.041688250851407e-06, + "loss": 0.0899, + "step": 10590 + }, + { + "epoch": 1.7159753726506803, + "grad_norm": 0.9173603057861328, + "learning_rate": 2.041258365012873e-06, + "loss": 0.1067, + "step": 10591 + }, + { + "epoch": 1.7161373946856773, + "grad_norm": 0.9267182946205139, + "learning_rate": 2.0408284932107227e-06, + "loss": 0.1038, + "step": 10592 + }, + { + "epoch": 1.716299416720674, + "grad_norm": 0.7872363328933716, + "learning_rate": 2.040398635458109e-06, + "loss": 0.1063, + "step": 10593 + }, + { + "epoch": 1.7164614387556707, + "grad_norm": 0.7939862012863159, + "learning_rate": 2.039968791768186e-06, + "loss": 0.1016, + "step": 10594 + }, + { + "epoch": 1.7166234607906676, + "grad_norm": 0.7923547625541687, + "learning_rate": 2.039538962154104e-06, + "loss": 0.0921, + "step": 10595 + }, + { + "epoch": 1.7167854828256643, + "grad_norm": 0.7851249575614929, + "learning_rate": 2.039109146629016e-06, + "loss": 0.0993, + "step": 10596 + }, + { + "epoch": 1.716947504860661, + "grad_norm": 0.7416827082633972, + "learning_rate": 2.0386793452060717e-06, + "loss": 0.0926, + "step": 10597 + }, + { + "epoch": 1.7171095268956578, + "grad_norm": 0.8344098329544067, + "learning_rate": 2.0382495578984236e-06, + "loss": 0.1, + "step": 10598 + }, + { + "epoch": 1.7172715489306545, + "grad_norm": 0.81064373254776, + "learning_rate": 2.0378197847192216e-06, + "loss": 0.0975, + "step": 10599 + }, + { + "epoch": 1.7174335709656514, + "grad_norm": 0.8358397483825684, + "learning_rate": 2.0373900256816166e-06, + "loss": 0.1013, + "step": 10600 + }, + { + "epoch": 1.7175955930006481, + "grad_norm": 0.7919216752052307, + "learning_rate": 2.0369602807987564e-06, + "loss": 0.0989, + "step": 10601 + }, + { + "epoch": 1.7177576150356448, + "grad_norm": 0.7866799831390381, + "learning_rate": 2.0365305500837906e-06, + "loss": 0.0988, + "step": 10602 + }, + { + "epoch": 1.7179196370706418, + "grad_norm": 0.7129443287849426, + "learning_rate": 2.0361008335498695e-06, + "loss": 0.0871, + "step": 10603 + }, + { + "epoch": 1.7180816591056383, + "grad_norm": 0.8652666807174683, + "learning_rate": 2.0356711312101394e-06, + "loss": 0.1064, + "step": 10604 + }, + { + "epoch": 1.7182436811406352, + "grad_norm": 0.7662056088447571, + "learning_rate": 2.03524144307775e-06, + "loss": 0.097, + "step": 10605 + }, + { + "epoch": 1.718405703175632, + "grad_norm": 0.8492453098297119, + "learning_rate": 2.0348117691658463e-06, + "loss": 0.1026, + "step": 10606 + }, + { + "epoch": 1.7185677252106286, + "grad_norm": 0.7717750668525696, + "learning_rate": 2.0343821094875777e-06, + "loss": 0.0952, + "step": 10607 + }, + { + "epoch": 1.7187297472456255, + "grad_norm": 0.8496338725090027, + "learning_rate": 2.03395246405609e-06, + "loss": 0.0991, + "step": 10608 + }, + { + "epoch": 1.718891769280622, + "grad_norm": 0.7787384390830994, + "learning_rate": 2.0335228328845293e-06, + "loss": 0.0885, + "step": 10609 + }, + { + "epoch": 1.719053791315619, + "grad_norm": 0.6998451948165894, + "learning_rate": 2.03309321598604e-06, + "loss": 0.089, + "step": 10610 + }, + { + "epoch": 1.7192158133506157, + "grad_norm": 0.8012830018997192, + "learning_rate": 2.0326636133737686e-06, + "loss": 0.0946, + "step": 10611 + }, + { + "epoch": 1.7193778353856124, + "grad_norm": 0.7706624269485474, + "learning_rate": 2.03223402506086e-06, + "loss": 0.0964, + "step": 10612 + }, + { + "epoch": 1.7195398574206093, + "grad_norm": 0.8254458904266357, + "learning_rate": 2.0318044510604586e-06, + "loss": 0.1, + "step": 10613 + }, + { + "epoch": 1.7197018794556058, + "grad_norm": 0.7956833243370056, + "learning_rate": 2.031374891385708e-06, + "loss": 0.0903, + "step": 10614 + }, + { + "epoch": 1.7198639014906028, + "grad_norm": 0.8892257213592529, + "learning_rate": 2.030945346049751e-06, + "loss": 0.1094, + "step": 10615 + }, + { + "epoch": 1.7200259235255995, + "grad_norm": 0.7731269598007202, + "learning_rate": 2.0305158150657316e-06, + "loss": 0.0916, + "step": 10616 + }, + { + "epoch": 1.7201879455605962, + "grad_norm": 0.8199908137321472, + "learning_rate": 2.030086298446792e-06, + "loss": 0.0944, + "step": 10617 + }, + { + "epoch": 1.720349967595593, + "grad_norm": 0.8723816275596619, + "learning_rate": 2.0296567962060753e-06, + "loss": 0.1072, + "step": 10618 + }, + { + "epoch": 1.7205119896305896, + "grad_norm": 0.8999892473220825, + "learning_rate": 2.0292273083567215e-06, + "loss": 0.1013, + "step": 10619 + }, + { + "epoch": 1.7206740116655865, + "grad_norm": 0.7920383810997009, + "learning_rate": 2.0287978349118737e-06, + "loss": 0.0988, + "step": 10620 + }, + { + "epoch": 1.7208360337005832, + "grad_norm": 0.9639683365821838, + "learning_rate": 2.0283683758846705e-06, + "loss": 0.1159, + "step": 10621 + }, + { + "epoch": 1.72099805573558, + "grad_norm": 0.9006828665733337, + "learning_rate": 2.0279389312882546e-06, + "loss": 0.0991, + "step": 10622 + }, + { + "epoch": 1.721160077770577, + "grad_norm": 0.8471527099609375, + "learning_rate": 2.0275095011357655e-06, + "loss": 0.0966, + "step": 10623 + }, + { + "epoch": 1.7213220998055736, + "grad_norm": 0.9707726836204529, + "learning_rate": 2.027080085440341e-06, + "loss": 0.1061, + "step": 10624 + }, + { + "epoch": 1.7214841218405703, + "grad_norm": 0.7146053910255432, + "learning_rate": 2.0266506842151216e-06, + "loss": 0.0878, + "step": 10625 + }, + { + "epoch": 1.721646143875567, + "grad_norm": 0.7884452939033508, + "learning_rate": 2.0262212974732465e-06, + "loss": 0.0955, + "step": 10626 + }, + { + "epoch": 1.7218081659105637, + "grad_norm": 0.8306124210357666, + "learning_rate": 2.0257919252278535e-06, + "loss": 0.1006, + "step": 10627 + }, + { + "epoch": 1.7219701879455607, + "grad_norm": 0.8302614688873291, + "learning_rate": 2.0253625674920795e-06, + "loss": 0.0927, + "step": 10628 + }, + { + "epoch": 1.7221322099805574, + "grad_norm": 0.7767832279205322, + "learning_rate": 2.0249332242790627e-06, + "loss": 0.0956, + "step": 10629 + }, + { + "epoch": 1.722294232015554, + "grad_norm": 0.8298105001449585, + "learning_rate": 2.0245038956019386e-06, + "loss": 0.1043, + "step": 10630 + }, + { + "epoch": 1.722456254050551, + "grad_norm": 0.8026810884475708, + "learning_rate": 2.024074581473845e-06, + "loss": 0.1006, + "step": 10631 + }, + { + "epoch": 1.7226182760855475, + "grad_norm": 0.7589970231056213, + "learning_rate": 2.0236452819079183e-06, + "loss": 0.0942, + "step": 10632 + }, + { + "epoch": 1.7227802981205445, + "grad_norm": 0.7752396464347839, + "learning_rate": 2.0232159969172926e-06, + "loss": 0.095, + "step": 10633 + }, + { + "epoch": 1.7229423201555412, + "grad_norm": 0.7742292284965515, + "learning_rate": 2.0227867265151035e-06, + "loss": 0.0986, + "step": 10634 + }, + { + "epoch": 1.7231043421905379, + "grad_norm": 0.9077650904655457, + "learning_rate": 2.0223574707144854e-06, + "loss": 0.1147, + "step": 10635 + }, + { + "epoch": 1.7232663642255348, + "grad_norm": 0.7141374349594116, + "learning_rate": 2.0219282295285734e-06, + "loss": 0.0923, + "step": 10636 + }, + { + "epoch": 1.7234283862605313, + "grad_norm": 0.7725054621696472, + "learning_rate": 2.0214990029705007e-06, + "loss": 0.0992, + "step": 10637 + }, + { + "epoch": 1.7235904082955282, + "grad_norm": 0.8039776086807251, + "learning_rate": 2.021069791053401e-06, + "loss": 0.0928, + "step": 10638 + }, + { + "epoch": 1.723752430330525, + "grad_norm": 0.8312842845916748, + "learning_rate": 2.0206405937904058e-06, + "loss": 0.0994, + "step": 10639 + }, + { + "epoch": 1.7239144523655217, + "grad_norm": 0.7526655793190002, + "learning_rate": 2.0202114111946483e-06, + "loss": 0.0902, + "step": 10640 + }, + { + "epoch": 1.7240764744005186, + "grad_norm": 0.9838579297065735, + "learning_rate": 2.0197822432792606e-06, + "loss": 0.109, + "step": 10641 + }, + { + "epoch": 1.724238496435515, + "grad_norm": 0.9125879406929016, + "learning_rate": 2.019353090057375e-06, + "loss": 0.1185, + "step": 10642 + }, + { + "epoch": 1.724400518470512, + "grad_norm": 0.7206192016601562, + "learning_rate": 2.0189239515421214e-06, + "loss": 0.0869, + "step": 10643 + }, + { + "epoch": 1.7245625405055087, + "grad_norm": 0.8060803413391113, + "learning_rate": 2.018494827746631e-06, + "loss": 0.0971, + "step": 10644 + }, + { + "epoch": 1.7247245625405054, + "grad_norm": 0.804918110370636, + "learning_rate": 2.0180657186840326e-06, + "loss": 0.1081, + "step": 10645 + }, + { + "epoch": 1.7248865845755024, + "grad_norm": 0.8952481150627136, + "learning_rate": 2.0176366243674575e-06, + "loss": 0.1074, + "step": 10646 + }, + { + "epoch": 1.725048606610499, + "grad_norm": 0.7076264023780823, + "learning_rate": 2.0172075448100347e-06, + "loss": 0.0892, + "step": 10647 + }, + { + "epoch": 1.7252106286454958, + "grad_norm": 0.7837821245193481, + "learning_rate": 2.0167784800248924e-06, + "loss": 0.1005, + "step": 10648 + }, + { + "epoch": 1.7253726506804925, + "grad_norm": 0.7912270426750183, + "learning_rate": 2.0163494300251586e-06, + "loss": 0.1001, + "step": 10649 + }, + { + "epoch": 1.7255346727154892, + "grad_norm": 0.8220000863075256, + "learning_rate": 2.0159203948239624e-06, + "loss": 0.1018, + "step": 10650 + }, + { + "epoch": 1.7256966947504861, + "grad_norm": 0.8370126485824585, + "learning_rate": 2.0154913744344316e-06, + "loss": 0.1055, + "step": 10651 + }, + { + "epoch": 1.7258587167854829, + "grad_norm": 0.8922955989837646, + "learning_rate": 2.015062368869691e-06, + "loss": 0.1027, + "step": 10652 + }, + { + "epoch": 1.7260207388204796, + "grad_norm": 0.7914833426475525, + "learning_rate": 2.0146333781428694e-06, + "loss": 0.0964, + "step": 10653 + }, + { + "epoch": 1.7261827608554765, + "grad_norm": 0.8470481038093567, + "learning_rate": 2.0142044022670905e-06, + "loss": 0.1105, + "step": 10654 + }, + { + "epoch": 1.726344782890473, + "grad_norm": 0.7832110524177551, + "learning_rate": 2.0137754412554823e-06, + "loss": 0.095, + "step": 10655 + }, + { + "epoch": 1.72650680492547, + "grad_norm": 0.8746216297149658, + "learning_rate": 2.013346495121169e-06, + "loss": 0.1111, + "step": 10656 + }, + { + "epoch": 1.7266688269604666, + "grad_norm": 0.8226485848426819, + "learning_rate": 2.012917563877275e-06, + "loss": 0.0996, + "step": 10657 + }, + { + "epoch": 1.7268308489954634, + "grad_norm": 0.7802627682685852, + "learning_rate": 2.012488647536925e-06, + "loss": 0.1043, + "step": 10658 + }, + { + "epoch": 1.7269928710304603, + "grad_norm": 0.9438074231147766, + "learning_rate": 2.0120597461132416e-06, + "loss": 0.1009, + "step": 10659 + }, + { + "epoch": 1.7271548930654568, + "grad_norm": 0.810778796672821, + "learning_rate": 2.0116308596193502e-06, + "loss": 0.1009, + "step": 10660 + }, + { + "epoch": 1.7273169151004537, + "grad_norm": 0.7767440676689148, + "learning_rate": 2.0112019880683724e-06, + "loss": 0.0921, + "step": 10661 + }, + { + "epoch": 1.7274789371354504, + "grad_norm": 0.8341066241264343, + "learning_rate": 2.0107731314734316e-06, + "loss": 0.1, + "step": 10662 + }, + { + "epoch": 1.7276409591704471, + "grad_norm": 0.8395964503288269, + "learning_rate": 2.0103442898476484e-06, + "loss": 0.1091, + "step": 10663 + }, + { + "epoch": 1.727802981205444, + "grad_norm": 0.9881349802017212, + "learning_rate": 2.0099154632041446e-06, + "loss": 0.1047, + "step": 10664 + }, + { + "epoch": 1.7279650032404406, + "grad_norm": 0.7424138784408569, + "learning_rate": 2.0094866515560424e-06, + "loss": 0.0817, + "step": 10665 + }, + { + "epoch": 1.7281270252754375, + "grad_norm": 1.004927635192871, + "learning_rate": 2.0090578549164614e-06, + "loss": 0.117, + "step": 10666 + }, + { + "epoch": 1.7282890473104342, + "grad_norm": 0.8730446100234985, + "learning_rate": 2.0086290732985224e-06, + "loss": 0.0988, + "step": 10667 + }, + { + "epoch": 1.728451069345431, + "grad_norm": 0.8061848282814026, + "learning_rate": 2.0082003067153436e-06, + "loss": 0.1047, + "step": 10668 + }, + { + "epoch": 1.7286130913804278, + "grad_norm": 0.7493446469306946, + "learning_rate": 2.0077715551800457e-06, + "loss": 0.0995, + "step": 10669 + }, + { + "epoch": 1.7287751134154243, + "grad_norm": 0.8697327971458435, + "learning_rate": 2.007342818705747e-06, + "loss": 0.0994, + "step": 10670 + }, + { + "epoch": 1.7289371354504213, + "grad_norm": 0.842828631401062, + "learning_rate": 2.0069140973055663e-06, + "loss": 0.1031, + "step": 10671 + }, + { + "epoch": 1.729099157485418, + "grad_norm": 0.7752946019172668, + "learning_rate": 2.006485390992621e-06, + "loss": 0.0983, + "step": 10672 + }, + { + "epoch": 1.7292611795204147, + "grad_norm": 0.71074378490448, + "learning_rate": 2.0060566997800274e-06, + "loss": 0.0777, + "step": 10673 + }, + { + "epoch": 1.7294232015554116, + "grad_norm": 0.7402112483978271, + "learning_rate": 2.0056280236809044e-06, + "loss": 0.0999, + "step": 10674 + }, + { + "epoch": 1.7295852235904083, + "grad_norm": 0.7987510561943054, + "learning_rate": 2.005199362708367e-06, + "loss": 0.0993, + "step": 10675 + }, + { + "epoch": 1.729747245625405, + "grad_norm": 0.9736399054527283, + "learning_rate": 2.004770716875533e-06, + "loss": 0.1144, + "step": 10676 + }, + { + "epoch": 1.7299092676604018, + "grad_norm": 0.7902863621711731, + "learning_rate": 2.0043420861955155e-06, + "loss": 0.0946, + "step": 10677 + }, + { + "epoch": 1.7300712896953985, + "grad_norm": 0.8167175650596619, + "learning_rate": 2.0039134706814303e-06, + "loss": 0.1059, + "step": 10678 + }, + { + "epoch": 1.7302333117303954, + "grad_norm": 0.8277512788772583, + "learning_rate": 2.003484870346393e-06, + "loss": 0.1024, + "step": 10679 + }, + { + "epoch": 1.7303953337653921, + "grad_norm": 0.7947298884391785, + "learning_rate": 2.0030562852035175e-06, + "loss": 0.1047, + "step": 10680 + }, + { + "epoch": 1.7305573558003888, + "grad_norm": 0.7207613587379456, + "learning_rate": 2.0026277152659166e-06, + "loss": 0.0903, + "step": 10681 + }, + { + "epoch": 1.7307193778353858, + "grad_norm": 0.8288989663124084, + "learning_rate": 2.0021991605467043e-06, + "loss": 0.0966, + "step": 10682 + }, + { + "epoch": 1.7308813998703823, + "grad_norm": 0.7648212909698486, + "learning_rate": 2.0017706210589925e-06, + "loss": 0.0918, + "step": 10683 + }, + { + "epoch": 1.7310434219053792, + "grad_norm": 0.7417365312576294, + "learning_rate": 2.0013420968158944e-06, + "loss": 0.0972, + "step": 10684 + }, + { + "epoch": 1.731205443940376, + "grad_norm": 0.8182329535484314, + "learning_rate": 2.000913587830521e-06, + "loss": 0.1003, + "step": 10685 + }, + { + "epoch": 1.7313674659753726, + "grad_norm": 0.8272933959960938, + "learning_rate": 2.0004850941159847e-06, + "loss": 0.1045, + "step": 10686 + }, + { + "epoch": 1.7315294880103695, + "grad_norm": 0.8044247031211853, + "learning_rate": 2.0000566156853957e-06, + "loss": 0.0977, + "step": 10687 + }, + { + "epoch": 1.731691510045366, + "grad_norm": 0.7936481237411499, + "learning_rate": 1.999628152551863e-06, + "loss": 0.0906, + "step": 10688 + }, + { + "epoch": 1.731853532080363, + "grad_norm": 0.8275524973869324, + "learning_rate": 1.9991997047285e-06, + "loss": 0.0972, + "step": 10689 + }, + { + "epoch": 1.7320155541153597, + "grad_norm": 0.7702337503433228, + "learning_rate": 1.9987712722284132e-06, + "loss": 0.0968, + "step": 10690 + }, + { + "epoch": 1.7321775761503564, + "grad_norm": 0.6895307302474976, + "learning_rate": 1.9983428550647126e-06, + "loss": 0.0807, + "step": 10691 + }, + { + "epoch": 1.7323395981853533, + "grad_norm": 0.8178178668022156, + "learning_rate": 1.9979144532505064e-06, + "loss": 0.0974, + "step": 10692 + }, + { + "epoch": 1.7325016202203498, + "grad_norm": 0.7914819717407227, + "learning_rate": 1.997486066798903e-06, + "loss": 0.0964, + "step": 10693 + }, + { + "epoch": 1.7326636422553467, + "grad_norm": 0.8385084867477417, + "learning_rate": 1.9970576957230094e-06, + "loss": 0.0965, + "step": 10694 + }, + { + "epoch": 1.7328256642903435, + "grad_norm": 0.9208899140357971, + "learning_rate": 1.9966293400359343e-06, + "loss": 0.1082, + "step": 10695 + }, + { + "epoch": 1.7329876863253402, + "grad_norm": 0.9039163589477539, + "learning_rate": 1.996200999750783e-06, + "loss": 0.1038, + "step": 10696 + }, + { + "epoch": 1.733149708360337, + "grad_norm": 0.7650443911552429, + "learning_rate": 1.9957726748806608e-06, + "loss": 0.0898, + "step": 10697 + }, + { + "epoch": 1.7333117303953338, + "grad_norm": 0.8120889663696289, + "learning_rate": 1.995344365438676e-06, + "loss": 0.1004, + "step": 10698 + }, + { + "epoch": 1.7334737524303305, + "grad_norm": 0.9312616586685181, + "learning_rate": 1.9949160714379314e-06, + "loss": 0.1158, + "step": 10699 + }, + { + "epoch": 1.7336357744653272, + "grad_norm": 0.8903511166572571, + "learning_rate": 1.994487792891534e-06, + "loss": 0.1053, + "step": 10700 + }, + { + "epoch": 1.733797796500324, + "grad_norm": 0.8750895261764526, + "learning_rate": 1.9940595298125855e-06, + "loss": 0.1043, + "step": 10701 + }, + { + "epoch": 1.7339598185353209, + "grad_norm": 0.8411079049110413, + "learning_rate": 1.993631282214191e-06, + "loss": 0.1051, + "step": 10702 + }, + { + "epoch": 1.7341218405703176, + "grad_norm": 0.7505423426628113, + "learning_rate": 1.993203050109454e-06, + "loss": 0.1055, + "step": 10703 + }, + { + "epoch": 1.7342838626053143, + "grad_norm": 0.7114359736442566, + "learning_rate": 1.992774833511478e-06, + "loss": 0.0923, + "step": 10704 + }, + { + "epoch": 1.7344458846403112, + "grad_norm": 1.49692964553833, + "learning_rate": 1.9923466324333646e-06, + "loss": 0.1233, + "step": 10705 + }, + { + "epoch": 1.7346079066753077, + "grad_norm": 0.8229801654815674, + "learning_rate": 1.991918446888216e-06, + "loss": 0.1105, + "step": 10706 + }, + { + "epoch": 1.7347699287103047, + "grad_norm": 0.7137386202812195, + "learning_rate": 1.991490276889132e-06, + "loss": 0.0934, + "step": 10707 + }, + { + "epoch": 1.7349319507453014, + "grad_norm": 0.8458231687545776, + "learning_rate": 1.9910621224492154e-06, + "loss": 0.1179, + "step": 10708 + }, + { + "epoch": 1.735093972780298, + "grad_norm": 0.8449655771255493, + "learning_rate": 1.990633983581567e-06, + "loss": 0.1009, + "step": 10709 + }, + { + "epoch": 1.735255994815295, + "grad_norm": 0.8100825548171997, + "learning_rate": 1.9902058602992856e-06, + "loss": 0.0982, + "step": 10710 + }, + { + "epoch": 1.7354180168502915, + "grad_norm": 0.6856768131256104, + "learning_rate": 1.9897777526154717e-06, + "loss": 0.0872, + "step": 10711 + }, + { + "epoch": 1.7355800388852884, + "grad_norm": 0.8536728620529175, + "learning_rate": 1.989349660543222e-06, + "loss": 0.1053, + "step": 10712 + }, + { + "epoch": 1.7357420609202852, + "grad_norm": 0.6712336540222168, + "learning_rate": 1.988921584095639e-06, + "loss": 0.08, + "step": 10713 + }, + { + "epoch": 1.7359040829552819, + "grad_norm": 0.8158391118049622, + "learning_rate": 1.988493523285818e-06, + "loss": 0.0878, + "step": 10714 + }, + { + "epoch": 1.7360661049902788, + "grad_norm": 0.8350120782852173, + "learning_rate": 1.988065478126858e-06, + "loss": 0.0983, + "step": 10715 + }, + { + "epoch": 1.7362281270252753, + "grad_norm": 0.8612555265426636, + "learning_rate": 1.9876374486318545e-06, + "loss": 0.1114, + "step": 10716 + }, + { + "epoch": 1.7363901490602722, + "grad_norm": 0.8192741274833679, + "learning_rate": 1.9872094348139054e-06, + "loss": 0.1062, + "step": 10717 + }, + { + "epoch": 1.736552171095269, + "grad_norm": 0.8786032795906067, + "learning_rate": 1.9867814366861075e-06, + "loss": 0.1065, + "step": 10718 + }, + { + "epoch": 1.7367141931302656, + "grad_norm": 0.7366222739219666, + "learning_rate": 1.986353454261555e-06, + "loss": 0.0918, + "step": 10719 + }, + { + "epoch": 1.7368762151652626, + "grad_norm": 0.7951706051826477, + "learning_rate": 1.9859254875533435e-06, + "loss": 0.0903, + "step": 10720 + }, + { + "epoch": 1.737038237200259, + "grad_norm": 0.8006754517555237, + "learning_rate": 1.985497536574568e-06, + "loss": 0.0967, + "step": 10721 + }, + { + "epoch": 1.737200259235256, + "grad_norm": 0.8424306511878967, + "learning_rate": 1.9850696013383236e-06, + "loss": 0.103, + "step": 10722 + }, + { + "epoch": 1.7373622812702527, + "grad_norm": 0.7816236615180969, + "learning_rate": 1.9846416818577025e-06, + "loss": 0.0935, + "step": 10723 + }, + { + "epoch": 1.7375243033052494, + "grad_norm": 0.9311386942863464, + "learning_rate": 1.9842137781458e-06, + "loss": 0.1199, + "step": 10724 + }, + { + "epoch": 1.7376863253402464, + "grad_norm": 0.7541922926902771, + "learning_rate": 1.983785890215707e-06, + "loss": 0.0901, + "step": 10725 + }, + { + "epoch": 1.737848347375243, + "grad_norm": 0.842083215713501, + "learning_rate": 1.9833580180805155e-06, + "loss": 0.1022, + "step": 10726 + }, + { + "epoch": 1.7380103694102398, + "grad_norm": 0.7754312753677368, + "learning_rate": 1.98293016175332e-06, + "loss": 0.104, + "step": 10727 + }, + { + "epoch": 1.7381723914452365, + "grad_norm": 0.7790486216545105, + "learning_rate": 1.9825023212472095e-06, + "loss": 0.0973, + "step": 10728 + }, + { + "epoch": 1.7383344134802332, + "grad_norm": 0.9156441688537598, + "learning_rate": 1.9820744965752754e-06, + "loss": 0.1056, + "step": 10729 + }, + { + "epoch": 1.7384964355152301, + "grad_norm": 0.7913225293159485, + "learning_rate": 1.9816466877506095e-06, + "loss": 0.0946, + "step": 10730 + }, + { + "epoch": 1.7386584575502269, + "grad_norm": 0.7693167328834534, + "learning_rate": 1.981218894786299e-06, + "loss": 0.0908, + "step": 10731 + }, + { + "epoch": 1.7388204795852236, + "grad_norm": 0.7898248434066772, + "learning_rate": 1.9807911176954357e-06, + "loss": 0.0949, + "step": 10732 + }, + { + "epoch": 1.7389825016202205, + "grad_norm": 0.7249963283538818, + "learning_rate": 1.980363356491108e-06, + "loss": 0.0939, + "step": 10733 + }, + { + "epoch": 1.739144523655217, + "grad_norm": 0.8590476512908936, + "learning_rate": 1.9799356111864036e-06, + "loss": 0.1106, + "step": 10734 + }, + { + "epoch": 1.739306545690214, + "grad_norm": 0.8085042834281921, + "learning_rate": 1.979507881794412e-06, + "loss": 0.1028, + "step": 10735 + }, + { + "epoch": 1.7394685677252106, + "grad_norm": 0.801541805267334, + "learning_rate": 1.979080168328218e-06, + "loss": 0.0979, + "step": 10736 + }, + { + "epoch": 1.7396305897602073, + "grad_norm": 0.8037110567092896, + "learning_rate": 1.978652470800911e-06, + "loss": 0.1054, + "step": 10737 + }, + { + "epoch": 1.7397926117952043, + "grad_norm": 0.830059826374054, + "learning_rate": 1.9782247892255767e-06, + "loss": 0.0985, + "step": 10738 + }, + { + "epoch": 1.7399546338302008, + "grad_norm": 0.8223294019699097, + "learning_rate": 1.977797123615302e-06, + "loss": 0.0968, + "step": 10739 + }, + { + "epoch": 1.7401166558651977, + "grad_norm": 0.8012174367904663, + "learning_rate": 1.9773694739831702e-06, + "loss": 0.0944, + "step": 10740 + }, + { + "epoch": 1.7402786779001944, + "grad_norm": 0.9736762046813965, + "learning_rate": 1.9769418403422685e-06, + "loss": 0.1149, + "step": 10741 + }, + { + "epoch": 1.7404406999351911, + "grad_norm": 0.8288683891296387, + "learning_rate": 1.976514222705681e-06, + "loss": 0.0896, + "step": 10742 + }, + { + "epoch": 1.740602721970188, + "grad_norm": 0.908768892288208, + "learning_rate": 1.976086621086491e-06, + "loss": 0.1, + "step": 10743 + }, + { + "epoch": 1.7407647440051845, + "grad_norm": 0.8298162221908569, + "learning_rate": 1.975659035497783e-06, + "loss": 0.0956, + "step": 10744 + }, + { + "epoch": 1.7409267660401815, + "grad_norm": 0.8755427598953247, + "learning_rate": 1.9752314659526383e-06, + "loss": 0.1017, + "step": 10745 + }, + { + "epoch": 1.7410887880751782, + "grad_norm": 0.7467525005340576, + "learning_rate": 1.9748039124641426e-06, + "loss": 0.0907, + "step": 10746 + }, + { + "epoch": 1.741250810110175, + "grad_norm": 0.7956831455230713, + "learning_rate": 1.974376375045375e-06, + "loss": 0.0944, + "step": 10747 + }, + { + "epoch": 1.7414128321451718, + "grad_norm": 0.7522971034049988, + "learning_rate": 1.9739488537094197e-06, + "loss": 0.0952, + "step": 10748 + }, + { + "epoch": 1.7415748541801686, + "grad_norm": 0.8951601982116699, + "learning_rate": 1.973521348469355e-06, + "loss": 0.1079, + "step": 10749 + }, + { + "epoch": 1.7417368762151653, + "grad_norm": 0.8221662640571594, + "learning_rate": 1.973093859338263e-06, + "loss": 0.098, + "step": 10750 + }, + { + "epoch": 1.741898898250162, + "grad_norm": 0.8099002242088318, + "learning_rate": 1.972666386329225e-06, + "loss": 0.0924, + "step": 10751 + }, + { + "epoch": 1.7420609202851587, + "grad_norm": 0.8443790674209595, + "learning_rate": 1.9722389294553188e-06, + "loss": 0.1002, + "step": 10752 + }, + { + "epoch": 1.7422229423201556, + "grad_norm": 0.777952253818512, + "learning_rate": 1.9718114887296245e-06, + "loss": 0.0979, + "step": 10753 + }, + { + "epoch": 1.7423849643551523, + "grad_norm": 0.7357156276702881, + "learning_rate": 1.9713840641652206e-06, + "loss": 0.093, + "step": 10754 + }, + { + "epoch": 1.742546986390149, + "grad_norm": 0.8331161141395569, + "learning_rate": 1.970956655775184e-06, + "loss": 0.1058, + "step": 10755 + }, + { + "epoch": 1.742709008425146, + "grad_norm": 1.1018832921981812, + "learning_rate": 1.970529263572594e-06, + "loss": 0.1138, + "step": 10756 + }, + { + "epoch": 1.7428710304601425, + "grad_norm": 0.8057063221931458, + "learning_rate": 1.9701018875705277e-06, + "loss": 0.1083, + "step": 10757 + }, + { + "epoch": 1.7430330524951394, + "grad_norm": 0.8819854259490967, + "learning_rate": 1.9696745277820613e-06, + "loss": 0.1038, + "step": 10758 + }, + { + "epoch": 1.7431950745301361, + "grad_norm": 0.7874314188957214, + "learning_rate": 1.969247184220271e-06, + "loss": 0.0942, + "step": 10759 + }, + { + "epoch": 1.7433570965651328, + "grad_norm": 0.8394420742988586, + "learning_rate": 1.9688198568982316e-06, + "loss": 0.1002, + "step": 10760 + }, + { + "epoch": 1.7435191186001298, + "grad_norm": 0.8666523694992065, + "learning_rate": 1.9683925458290196e-06, + "loss": 0.1118, + "step": 10761 + }, + { + "epoch": 1.7436811406351262, + "grad_norm": 0.8129344582557678, + "learning_rate": 1.96796525102571e-06, + "loss": 0.1116, + "step": 10762 + }, + { + "epoch": 1.7438431626701232, + "grad_norm": 0.7237949967384338, + "learning_rate": 1.9675379725013752e-06, + "loss": 0.0892, + "step": 10763 + }, + { + "epoch": 1.74400518470512, + "grad_norm": 0.757645845413208, + "learning_rate": 1.96711071026909e-06, + "loss": 0.0978, + "step": 10764 + }, + { + "epoch": 1.7441672067401166, + "grad_norm": 0.7295411825180054, + "learning_rate": 1.9666834643419276e-06, + "loss": 0.087, + "step": 10765 + }, + { + "epoch": 1.7443292287751135, + "grad_norm": 0.8933332562446594, + "learning_rate": 1.9662562347329613e-06, + "loss": 0.1023, + "step": 10766 + }, + { + "epoch": 1.74449125081011, + "grad_norm": 0.8272913098335266, + "learning_rate": 1.9658290214552616e-06, + "loss": 0.1089, + "step": 10767 + }, + { + "epoch": 1.744653272845107, + "grad_norm": 0.8127201795578003, + "learning_rate": 1.9654018245219024e-06, + "loss": 0.0923, + "step": 10768 + }, + { + "epoch": 1.7448152948801037, + "grad_norm": 0.7642537951469421, + "learning_rate": 1.9649746439459523e-06, + "loss": 0.0996, + "step": 10769 + }, + { + "epoch": 1.7449773169151004, + "grad_norm": 0.7745651602745056, + "learning_rate": 1.9645474797404838e-06, + "loss": 0.0977, + "step": 10770 + }, + { + "epoch": 1.7451393389500973, + "grad_norm": 0.7775667905807495, + "learning_rate": 1.964120331918567e-06, + "loss": 0.0981, + "step": 10771 + }, + { + "epoch": 1.7453013609850938, + "grad_norm": 0.776843786239624, + "learning_rate": 1.963693200493271e-06, + "loss": 0.1047, + "step": 10772 + }, + { + "epoch": 1.7454633830200907, + "grad_norm": 0.6424158215522766, + "learning_rate": 1.963266085477665e-06, + "loss": 0.0803, + "step": 10773 + }, + { + "epoch": 1.7456254050550875, + "grad_norm": 0.8612257242202759, + "learning_rate": 1.962838986884818e-06, + "loss": 0.1058, + "step": 10774 + }, + { + "epoch": 1.7457874270900842, + "grad_norm": 0.8193663358688354, + "learning_rate": 1.9624119047277987e-06, + "loss": 0.0973, + "step": 10775 + }, + { + "epoch": 1.745949449125081, + "grad_norm": 0.7953779101371765, + "learning_rate": 1.9619848390196734e-06, + "loss": 0.1113, + "step": 10776 + }, + { + "epoch": 1.7461114711600778, + "grad_norm": 0.8136135339736938, + "learning_rate": 1.961557789773511e-06, + "loss": 0.1051, + "step": 10777 + }, + { + "epoch": 1.7462734931950745, + "grad_norm": 0.7347664833068848, + "learning_rate": 1.9611307570023766e-06, + "loss": 0.0897, + "step": 10778 + }, + { + "epoch": 1.7464355152300715, + "grad_norm": 0.6952412128448486, + "learning_rate": 1.9607037407193365e-06, + "loss": 0.0851, + "step": 10779 + }, + { + "epoch": 1.746597537265068, + "grad_norm": 0.8789510726928711, + "learning_rate": 1.960276740937458e-06, + "loss": 0.1074, + "step": 10780 + }, + { + "epoch": 1.7467595593000649, + "grad_norm": 0.8486936092376709, + "learning_rate": 1.9598497576698043e-06, + "loss": 0.0999, + "step": 10781 + }, + { + "epoch": 1.7469215813350616, + "grad_norm": 0.8231163620948792, + "learning_rate": 1.959422790929441e-06, + "loss": 0.1075, + "step": 10782 + }, + { + "epoch": 1.7470836033700583, + "grad_norm": 0.7864092588424683, + "learning_rate": 1.9589958407294317e-06, + "loss": 0.0967, + "step": 10783 + }, + { + "epoch": 1.7472456254050552, + "grad_norm": 0.7766144871711731, + "learning_rate": 1.9585689070828413e-06, + "loss": 0.0872, + "step": 10784 + }, + { + "epoch": 1.7474076474400517, + "grad_norm": 0.9048768877983093, + "learning_rate": 1.9581419900027317e-06, + "loss": 0.101, + "step": 10785 + }, + { + "epoch": 1.7475696694750487, + "grad_norm": 0.7717433571815491, + "learning_rate": 1.9577150895021664e-06, + "loss": 0.0954, + "step": 10786 + }, + { + "epoch": 1.7477316915100454, + "grad_norm": 0.8796309232711792, + "learning_rate": 1.9572882055942065e-06, + "loss": 0.101, + "step": 10787 + }, + { + "epoch": 1.747893713545042, + "grad_norm": 0.9133003354072571, + "learning_rate": 1.9568613382919142e-06, + "loss": 0.1146, + "step": 10788 + }, + { + "epoch": 1.748055735580039, + "grad_norm": 0.7639305591583252, + "learning_rate": 1.9564344876083504e-06, + "loss": 0.0979, + "step": 10789 + }, + { + "epoch": 1.7482177576150355, + "grad_norm": 0.7829166650772095, + "learning_rate": 1.9560076535565766e-06, + "loss": 0.1073, + "step": 10790 + }, + { + "epoch": 1.7483797796500324, + "grad_norm": 0.7283692359924316, + "learning_rate": 1.955580836149652e-06, + "loss": 0.0943, + "step": 10791 + }, + { + "epoch": 1.7485418016850292, + "grad_norm": 0.886214017868042, + "learning_rate": 1.9551540354006366e-06, + "loss": 0.1067, + "step": 10792 + }, + { + "epoch": 1.7487038237200259, + "grad_norm": 0.7852003574371338, + "learning_rate": 1.954727251322588e-06, + "loss": 0.0953, + "step": 10793 + }, + { + "epoch": 1.7488658457550228, + "grad_norm": 0.8498847484588623, + "learning_rate": 1.954300483928567e-06, + "loss": 0.1003, + "step": 10794 + }, + { + "epoch": 1.7490278677900193, + "grad_norm": 0.8580987453460693, + "learning_rate": 1.9538737332316304e-06, + "loss": 0.1139, + "step": 10795 + }, + { + "epoch": 1.7491898898250162, + "grad_norm": 0.800059974193573, + "learning_rate": 1.953446999244836e-06, + "loss": 0.0961, + "step": 10796 + }, + { + "epoch": 1.749351911860013, + "grad_norm": 0.7808043956756592, + "learning_rate": 1.953020281981241e-06, + "loss": 0.0931, + "step": 10797 + }, + { + "epoch": 1.7495139338950096, + "grad_norm": 0.7637292146682739, + "learning_rate": 1.9525935814539e-06, + "loss": 0.0983, + "step": 10798 + }, + { + "epoch": 1.7496759559300066, + "grad_norm": 0.7604991793632507, + "learning_rate": 1.952166897675873e-06, + "loss": 0.0898, + "step": 10799 + }, + { + "epoch": 1.7498379779650033, + "grad_norm": 0.7365829944610596, + "learning_rate": 1.951740230660212e-06, + "loss": 0.0953, + "step": 10800 + }, + { + "epoch": 1.75, + "grad_norm": 0.8294869065284729, + "learning_rate": 1.951313580419974e-06, + "loss": 0.1092, + "step": 10801 + }, + { + "epoch": 1.7501620220349967, + "grad_norm": 0.8134753704071045, + "learning_rate": 1.950886946968212e-06, + "loss": 0.1008, + "step": 10802 + }, + { + "epoch": 1.7503240440699934, + "grad_norm": 0.967177152633667, + "learning_rate": 1.9504603303179805e-06, + "loss": 0.1083, + "step": 10803 + }, + { + "epoch": 1.7504860661049904, + "grad_norm": 0.8222710490226746, + "learning_rate": 1.9500337304823333e-06, + "loss": 0.0935, + "step": 10804 + }, + { + "epoch": 1.750648088139987, + "grad_norm": 0.7220136523246765, + "learning_rate": 1.949607147474323e-06, + "loss": 0.0931, + "step": 10805 + }, + { + "epoch": 1.7508101101749838, + "grad_norm": 0.6608802676200867, + "learning_rate": 1.9491805813070025e-06, + "loss": 0.0843, + "step": 10806 + }, + { + "epoch": 1.7509721322099807, + "grad_norm": 0.816785454750061, + "learning_rate": 1.948754031993422e-06, + "loss": 0.0995, + "step": 10807 + }, + { + "epoch": 1.7511341542449772, + "grad_norm": 0.8732054829597473, + "learning_rate": 1.948327499546635e-06, + "loss": 0.1105, + "step": 10808 + }, + { + "epoch": 1.7512961762799741, + "grad_norm": 0.9379077553749084, + "learning_rate": 1.9479009839796913e-06, + "loss": 0.1121, + "step": 10809 + }, + { + "epoch": 1.7514581983149708, + "grad_norm": 0.8605905771255493, + "learning_rate": 1.947474485305642e-06, + "loss": 0.0981, + "step": 10810 + }, + { + "epoch": 1.7516202203499676, + "grad_norm": 0.8368394374847412, + "learning_rate": 1.947048003537536e-06, + "loss": 0.1033, + "step": 10811 + }, + { + "epoch": 1.7517822423849645, + "grad_norm": 0.7215535640716553, + "learning_rate": 1.9466215386884223e-06, + "loss": 0.0826, + "step": 10812 + }, + { + "epoch": 1.751944264419961, + "grad_norm": 0.8332518339157104, + "learning_rate": 1.9461950907713517e-06, + "loss": 0.1041, + "step": 10813 + }, + { + "epoch": 1.752106286454958, + "grad_norm": 0.9338610768318176, + "learning_rate": 1.9457686597993704e-06, + "loss": 0.1167, + "step": 10814 + }, + { + "epoch": 1.7522683084899546, + "grad_norm": 0.8345838785171509, + "learning_rate": 1.9453422457855274e-06, + "loss": 0.1062, + "step": 10815 + }, + { + "epoch": 1.7524303305249513, + "grad_norm": 0.7421085834503174, + "learning_rate": 1.9449158487428688e-06, + "loss": 0.0859, + "step": 10816 + }, + { + "epoch": 1.7525923525599483, + "grad_norm": 0.9578739404678345, + "learning_rate": 1.9444894686844417e-06, + "loss": 0.1091, + "step": 10817 + }, + { + "epoch": 1.7527543745949448, + "grad_norm": 0.819701075553894, + "learning_rate": 1.9440631056232926e-06, + "loss": 0.1051, + "step": 10818 + }, + { + "epoch": 1.7529163966299417, + "grad_norm": 0.8500429391860962, + "learning_rate": 1.943636759572468e-06, + "loss": 0.1019, + "step": 10819 + }, + { + "epoch": 1.7530784186649384, + "grad_norm": 0.8731544017791748, + "learning_rate": 1.9432104305450117e-06, + "loss": 0.1023, + "step": 10820 + }, + { + "epoch": 1.7532404406999351, + "grad_norm": 0.8563147783279419, + "learning_rate": 1.9427841185539693e-06, + "loss": 0.1049, + "step": 10821 + }, + { + "epoch": 1.753402462734932, + "grad_norm": 0.8676608800888062, + "learning_rate": 1.942357823612383e-06, + "loss": 0.1079, + "step": 10822 + }, + { + "epoch": 1.7535644847699285, + "grad_norm": 0.870190441608429, + "learning_rate": 1.941931545733299e-06, + "loss": 0.1082, + "step": 10823 + }, + { + "epoch": 1.7537265068049255, + "grad_norm": 0.8860259652137756, + "learning_rate": 1.9415052849297585e-06, + "loss": 0.1054, + "step": 10824 + }, + { + "epoch": 1.7538885288399222, + "grad_norm": 1.0227184295654297, + "learning_rate": 1.941079041214806e-06, + "loss": 0.1208, + "step": 10825 + }, + { + "epoch": 1.754050550874919, + "grad_norm": 0.787338137626648, + "learning_rate": 1.9406528146014815e-06, + "loss": 0.1075, + "step": 10826 + }, + { + "epoch": 1.7542125729099158, + "grad_norm": 0.9310184121131897, + "learning_rate": 1.9402266051028263e-06, + "loss": 0.1064, + "step": 10827 + }, + { + "epoch": 1.7543745949449125, + "grad_norm": 0.8110270500183105, + "learning_rate": 1.939800412731884e-06, + "loss": 0.1018, + "step": 10828 + }, + { + "epoch": 1.7545366169799093, + "grad_norm": 0.8377658128738403, + "learning_rate": 1.9393742375016926e-06, + "loss": 0.1095, + "step": 10829 + }, + { + "epoch": 1.7546986390149062, + "grad_norm": 0.8631830811500549, + "learning_rate": 1.9389480794252933e-06, + "loss": 0.1097, + "step": 10830 + }, + { + "epoch": 1.7548606610499027, + "grad_norm": 0.76842200756073, + "learning_rate": 1.9385219385157244e-06, + "loss": 0.0994, + "step": 10831 + }, + { + "epoch": 1.7550226830848996, + "grad_norm": 0.9799662828445435, + "learning_rate": 1.9380958147860254e-06, + "loss": 0.1145, + "step": 10832 + }, + { + "epoch": 1.7551847051198963, + "grad_norm": 0.7072003483772278, + "learning_rate": 1.937669708249235e-06, + "loss": 0.0899, + "step": 10833 + }, + { + "epoch": 1.755346727154893, + "grad_norm": 0.8198095560073853, + "learning_rate": 1.937243618918391e-06, + "loss": 0.1064, + "step": 10834 + }, + { + "epoch": 1.75550874918989, + "grad_norm": 0.8468987941741943, + "learning_rate": 1.9368175468065305e-06, + "loss": 0.1043, + "step": 10835 + }, + { + "epoch": 1.7556707712248865, + "grad_norm": 0.9012013077735901, + "learning_rate": 1.936391491926689e-06, + "loss": 0.107, + "step": 10836 + }, + { + "epoch": 1.7558327932598834, + "grad_norm": 0.7513500452041626, + "learning_rate": 1.9359654542919054e-06, + "loss": 0.0948, + "step": 10837 + }, + { + "epoch": 1.75599481529488, + "grad_norm": 0.7412696480751038, + "learning_rate": 1.9355394339152133e-06, + "loss": 0.0926, + "step": 10838 + }, + { + "epoch": 1.7561568373298768, + "grad_norm": 0.8077243566513062, + "learning_rate": 1.9351134308096493e-06, + "loss": 0.0994, + "step": 10839 + }, + { + "epoch": 1.7563188593648738, + "grad_norm": 0.7305166721343994, + "learning_rate": 1.9346874449882465e-06, + "loss": 0.0951, + "step": 10840 + }, + { + "epoch": 1.7564808813998702, + "grad_norm": 0.8272958993911743, + "learning_rate": 1.9342614764640392e-06, + "loss": 0.1083, + "step": 10841 + }, + { + "epoch": 1.7566429034348672, + "grad_norm": 0.7705273628234863, + "learning_rate": 1.9338355252500624e-06, + "loss": 0.095, + "step": 10842 + }, + { + "epoch": 1.7568049254698639, + "grad_norm": 0.754991352558136, + "learning_rate": 1.933409591359349e-06, + "loss": 0.1042, + "step": 10843 + }, + { + "epoch": 1.7569669475048606, + "grad_norm": 0.8502324819564819, + "learning_rate": 1.93298367480493e-06, + "loss": 0.0999, + "step": 10844 + }, + { + "epoch": 1.7571289695398575, + "grad_norm": 0.7320595383644104, + "learning_rate": 1.9325577755998397e-06, + "loss": 0.0893, + "step": 10845 + }, + { + "epoch": 1.757290991574854, + "grad_norm": 0.7977878451347351, + "learning_rate": 1.932131893757107e-06, + "loss": 0.0983, + "step": 10846 + }, + { + "epoch": 1.757453013609851, + "grad_norm": 0.7108481526374817, + "learning_rate": 1.9317060292897643e-06, + "loss": 0.0906, + "step": 10847 + }, + { + "epoch": 1.7576150356448477, + "grad_norm": 0.7112817764282227, + "learning_rate": 1.9312801822108425e-06, + "loss": 0.0883, + "step": 10848 + }, + { + "epoch": 1.7577770576798444, + "grad_norm": 0.7918272614479065, + "learning_rate": 1.9308543525333707e-06, + "loss": 0.0957, + "step": 10849 + }, + { + "epoch": 1.7579390797148413, + "grad_norm": 0.8627058267593384, + "learning_rate": 1.9304285402703775e-06, + "loss": 0.1079, + "step": 10850 + }, + { + "epoch": 1.758101101749838, + "grad_norm": 0.7404596209526062, + "learning_rate": 1.9300027454348932e-06, + "loss": 0.0908, + "step": 10851 + }, + { + "epoch": 1.7582631237848347, + "grad_norm": 0.879859447479248, + "learning_rate": 1.929576968039946e-06, + "loss": 0.1072, + "step": 10852 + }, + { + "epoch": 1.7584251458198314, + "grad_norm": 0.8356106281280518, + "learning_rate": 1.9291512080985626e-06, + "loss": 0.105, + "step": 10853 + }, + { + "epoch": 1.7585871678548282, + "grad_norm": 0.8642502427101135, + "learning_rate": 1.928725465623772e-06, + "loss": 0.1099, + "step": 10854 + }, + { + "epoch": 1.758749189889825, + "grad_norm": 0.9117339849472046, + "learning_rate": 1.928299740628598e-06, + "loss": 0.113, + "step": 10855 + }, + { + "epoch": 1.7589112119248218, + "grad_norm": 0.8020870685577393, + "learning_rate": 1.927874033126069e-06, + "loss": 0.1042, + "step": 10856 + }, + { + "epoch": 1.7590732339598185, + "grad_norm": 0.8990389704704285, + "learning_rate": 1.9274483431292107e-06, + "loss": 0.109, + "step": 10857 + }, + { + "epoch": 1.7592352559948155, + "grad_norm": 0.9079889059066772, + "learning_rate": 1.927022670651047e-06, + "loss": 0.1068, + "step": 10858 + }, + { + "epoch": 1.759397278029812, + "grad_norm": 0.928896427154541, + "learning_rate": 1.9265970157046037e-06, + "loss": 0.1076, + "step": 10859 + }, + { + "epoch": 1.7595593000648089, + "grad_norm": 0.936068594455719, + "learning_rate": 1.9261713783029024e-06, + "loss": 0.1145, + "step": 10860 + }, + { + "epoch": 1.7597213220998056, + "grad_norm": 0.791902482509613, + "learning_rate": 1.9257457584589697e-06, + "loss": 0.1006, + "step": 10861 + }, + { + "epoch": 1.7598833441348023, + "grad_norm": 0.7125447988510132, + "learning_rate": 1.9253201561858266e-06, + "loss": 0.0929, + "step": 10862 + }, + { + "epoch": 1.7600453661697992, + "grad_norm": 0.7819201946258545, + "learning_rate": 1.9248945714964967e-06, + "loss": 0.0972, + "step": 10863 + }, + { + "epoch": 1.7602073882047957, + "grad_norm": 0.6780325174331665, + "learning_rate": 1.924469004404001e-06, + "loss": 0.0861, + "step": 10864 + }, + { + "epoch": 1.7603694102397927, + "grad_norm": 0.755989670753479, + "learning_rate": 1.92404345492136e-06, + "loss": 0.0954, + "step": 10865 + }, + { + "epoch": 1.7605314322747894, + "grad_norm": 0.6931602358818054, + "learning_rate": 1.9236179230615967e-06, + "loss": 0.0822, + "step": 10866 + }, + { + "epoch": 1.760693454309786, + "grad_norm": 0.7863826155662537, + "learning_rate": 1.9231924088377296e-06, + "loss": 0.0948, + "step": 10867 + }, + { + "epoch": 1.760855476344783, + "grad_norm": 0.6952497959136963, + "learning_rate": 1.922766912262779e-06, + "loss": 0.0819, + "step": 10868 + }, + { + "epoch": 1.7610174983797795, + "grad_norm": 0.6934300065040588, + "learning_rate": 1.922341433349764e-06, + "loss": 0.0829, + "step": 10869 + }, + { + "epoch": 1.7611795204147764, + "grad_norm": 0.7480776906013489, + "learning_rate": 1.921915972111703e-06, + "loss": 0.0958, + "step": 10870 + }, + { + "epoch": 1.7613415424497731, + "grad_norm": 0.8322224020957947, + "learning_rate": 1.9214905285616147e-06, + "loss": 0.1, + "step": 10871 + }, + { + "epoch": 1.7615035644847699, + "grad_norm": 0.8011393547058105, + "learning_rate": 1.9210651027125164e-06, + "loss": 0.1024, + "step": 10872 + }, + { + "epoch": 1.7616655865197668, + "grad_norm": 0.8309357166290283, + "learning_rate": 1.9206396945774246e-06, + "loss": 0.1033, + "step": 10873 + }, + { + "epoch": 1.7618276085547635, + "grad_norm": 0.7737796306610107, + "learning_rate": 1.9202143041693554e-06, + "loss": 0.0988, + "step": 10874 + }, + { + "epoch": 1.7619896305897602, + "grad_norm": 0.7730358242988586, + "learning_rate": 1.919788931501327e-06, + "loss": 0.0998, + "step": 10875 + }, + { + "epoch": 1.762151652624757, + "grad_norm": 0.8785960078239441, + "learning_rate": 1.919363576586352e-06, + "loss": 0.1073, + "step": 10876 + }, + { + "epoch": 1.7623136746597536, + "grad_norm": 0.8502126932144165, + "learning_rate": 1.918938239437447e-06, + "loss": 0.1092, + "step": 10877 + }, + { + "epoch": 1.7624756966947506, + "grad_norm": 0.7489198446273804, + "learning_rate": 1.918512920067626e-06, + "loss": 0.0944, + "step": 10878 + }, + { + "epoch": 1.7626377187297473, + "grad_norm": 0.7126352787017822, + "learning_rate": 1.9180876184899015e-06, + "loss": 0.0952, + "step": 10879 + }, + { + "epoch": 1.762799740764744, + "grad_norm": 0.8675928711891174, + "learning_rate": 1.9176623347172885e-06, + "loss": 0.1067, + "step": 10880 + }, + { + "epoch": 1.762961762799741, + "grad_norm": 0.820986807346344, + "learning_rate": 1.9172370687627987e-06, + "loss": 0.0972, + "step": 10881 + }, + { + "epoch": 1.7631237848347374, + "grad_norm": 0.8926191329956055, + "learning_rate": 1.9168118206394443e-06, + "loss": 0.1056, + "step": 10882 + }, + { + "epoch": 1.7632858068697344, + "grad_norm": 0.8152083158493042, + "learning_rate": 1.9163865903602374e-06, + "loss": 0.1041, + "step": 10883 + }, + { + "epoch": 1.763447828904731, + "grad_norm": 0.837230920791626, + "learning_rate": 1.915961377938187e-06, + "loss": 0.1074, + "step": 10884 + }, + { + "epoch": 1.7636098509397278, + "grad_norm": 0.8547905683517456, + "learning_rate": 1.915536183386306e-06, + "loss": 0.1056, + "step": 10885 + }, + { + "epoch": 1.7637718729747247, + "grad_norm": 0.8433102965354919, + "learning_rate": 1.9151110067176038e-06, + "loss": 0.1013, + "step": 10886 + }, + { + "epoch": 1.7639338950097212, + "grad_norm": 0.8947946429252625, + "learning_rate": 1.9146858479450894e-06, + "loss": 0.1166, + "step": 10887 + }, + { + "epoch": 1.7640959170447181, + "grad_norm": 0.9684112668037415, + "learning_rate": 1.914260707081771e-06, + "loss": 0.1186, + "step": 10888 + }, + { + "epoch": 1.7642579390797148, + "grad_norm": 0.7762362360954285, + "learning_rate": 1.913835584140657e-06, + "loss": 0.0967, + "step": 10889 + }, + { + "epoch": 1.7644199611147116, + "grad_norm": 0.7626681923866272, + "learning_rate": 1.913410479134757e-06, + "loss": 0.0881, + "step": 10890 + }, + { + "epoch": 1.7645819831497085, + "grad_norm": 0.7825966477394104, + "learning_rate": 1.9129853920770763e-06, + "loss": 0.0941, + "step": 10891 + }, + { + "epoch": 1.764744005184705, + "grad_norm": 0.8394178748130798, + "learning_rate": 1.9125603229806223e-06, + "loss": 0.1112, + "step": 10892 + }, + { + "epoch": 1.764906027219702, + "grad_norm": 0.7935326099395752, + "learning_rate": 1.9121352718584006e-06, + "loss": 0.1012, + "step": 10893 + }, + { + "epoch": 1.7650680492546986, + "grad_norm": 0.8410793542861938, + "learning_rate": 1.9117102387234165e-06, + "loss": 0.1077, + "step": 10894 + }, + { + "epoch": 1.7652300712896953, + "grad_norm": 0.7813910245895386, + "learning_rate": 1.9112852235886757e-06, + "loss": 0.0947, + "step": 10895 + }, + { + "epoch": 1.7653920933246923, + "grad_norm": 0.81944340467453, + "learning_rate": 1.910860226467183e-06, + "loss": 0.0989, + "step": 10896 + }, + { + "epoch": 1.7655541153596888, + "grad_norm": 0.7879267930984497, + "learning_rate": 1.910435247371941e-06, + "loss": 0.1065, + "step": 10897 + }, + { + "epoch": 1.7657161373946857, + "grad_norm": 0.8651462197303772, + "learning_rate": 1.910010286315953e-06, + "loss": 0.1011, + "step": 10898 + }, + { + "epoch": 1.7658781594296824, + "grad_norm": 0.7657998204231262, + "learning_rate": 1.909585343312224e-06, + "loss": 0.0925, + "step": 10899 + }, + { + "epoch": 1.7660401814646791, + "grad_norm": 0.652816891670227, + "learning_rate": 1.9091604183737546e-06, + "loss": 0.0832, + "step": 10900 + }, + { + "epoch": 1.766202203499676, + "grad_norm": 0.8381374478340149, + "learning_rate": 1.9087355115135465e-06, + "loss": 0.1008, + "step": 10901 + }, + { + "epoch": 1.7663642255346728, + "grad_norm": 0.7962815761566162, + "learning_rate": 1.9083106227446e-06, + "loss": 0.0991, + "step": 10902 + }, + { + "epoch": 1.7665262475696695, + "grad_norm": 0.7816174030303955, + "learning_rate": 1.9078857520799167e-06, + "loss": 0.098, + "step": 10903 + }, + { + "epoch": 1.7666882696046662, + "grad_norm": 0.8635630011558533, + "learning_rate": 1.907460899532497e-06, + "loss": 0.0986, + "step": 10904 + }, + { + "epoch": 1.766850291639663, + "grad_norm": 0.8344462513923645, + "learning_rate": 1.9070360651153402e-06, + "loss": 0.094, + "step": 10905 + }, + { + "epoch": 1.7670123136746598, + "grad_norm": 0.6713218092918396, + "learning_rate": 1.9066112488414445e-06, + "loss": 0.0832, + "step": 10906 + }, + { + "epoch": 1.7671743357096565, + "grad_norm": 0.8062579035758972, + "learning_rate": 1.906186450723809e-06, + "loss": 0.104, + "step": 10907 + }, + { + "epoch": 1.7673363577446533, + "grad_norm": 0.8023903965950012, + "learning_rate": 1.90576167077543e-06, + "loss": 0.091, + "step": 10908 + }, + { + "epoch": 1.7674983797796502, + "grad_norm": 0.739756166934967, + "learning_rate": 1.9053369090093065e-06, + "loss": 0.0819, + "step": 10909 + }, + { + "epoch": 1.7676604018146467, + "grad_norm": 0.9430824518203735, + "learning_rate": 1.904912165438435e-06, + "loss": 0.1085, + "step": 10910 + }, + { + "epoch": 1.7678224238496436, + "grad_norm": 1.0915939807891846, + "learning_rate": 1.9044874400758106e-06, + "loss": 0.12, + "step": 10911 + }, + { + "epoch": 1.7679844458846403, + "grad_norm": 0.6942973136901855, + "learning_rate": 1.9040627329344296e-06, + "loss": 0.0896, + "step": 10912 + }, + { + "epoch": 1.768146467919637, + "grad_norm": 0.8167296648025513, + "learning_rate": 1.9036380440272861e-06, + "loss": 0.0949, + "step": 10913 + }, + { + "epoch": 1.768308489954634, + "grad_norm": 0.8740625381469727, + "learning_rate": 1.9032133733673764e-06, + "loss": 0.0991, + "step": 10914 + }, + { + "epoch": 1.7684705119896305, + "grad_norm": 0.9350792169570923, + "learning_rate": 1.9027887209676925e-06, + "loss": 0.1148, + "step": 10915 + }, + { + "epoch": 1.7686325340246274, + "grad_norm": 0.7898386716842651, + "learning_rate": 1.9023640868412297e-06, + "loss": 0.0929, + "step": 10916 + }, + { + "epoch": 1.768794556059624, + "grad_norm": 0.7293332815170288, + "learning_rate": 1.901939471000978e-06, + "loss": 0.0902, + "step": 10917 + }, + { + "epoch": 1.7689565780946208, + "grad_norm": 0.8032740950584412, + "learning_rate": 1.9015148734599317e-06, + "loss": 0.097, + "step": 10918 + }, + { + "epoch": 1.7691186001296177, + "grad_norm": 0.7515488266944885, + "learning_rate": 1.9010902942310827e-06, + "loss": 0.1018, + "step": 10919 + }, + { + "epoch": 1.7692806221646142, + "grad_norm": 0.7839230895042419, + "learning_rate": 1.900665733327421e-06, + "loss": 0.0935, + "step": 10920 + }, + { + "epoch": 1.7694426441996112, + "grad_norm": 0.8209660649299622, + "learning_rate": 1.9002411907619372e-06, + "loss": 0.1026, + "step": 10921 + }, + { + "epoch": 1.7696046662346079, + "grad_norm": 0.7910726070404053, + "learning_rate": 1.899816666547621e-06, + "loss": 0.0966, + "step": 10922 + }, + { + "epoch": 1.7697666882696046, + "grad_norm": 0.8440234065055847, + "learning_rate": 1.8993921606974636e-06, + "loss": 0.1055, + "step": 10923 + }, + { + "epoch": 1.7699287103046015, + "grad_norm": 0.9682288765907288, + "learning_rate": 1.8989676732244522e-06, + "loss": 0.1216, + "step": 10924 + }, + { + "epoch": 1.7700907323395982, + "grad_norm": 0.8367732763290405, + "learning_rate": 1.8985432041415758e-06, + "loss": 0.1072, + "step": 10925 + }, + { + "epoch": 1.770252754374595, + "grad_norm": 1.015958547592163, + "learning_rate": 1.8981187534618217e-06, + "loss": 0.1244, + "step": 10926 + }, + { + "epoch": 1.7704147764095917, + "grad_norm": 1.0870450735092163, + "learning_rate": 1.8976943211981764e-06, + "loss": 0.1068, + "step": 10927 + }, + { + "epoch": 1.7705767984445884, + "grad_norm": 0.8047299981117249, + "learning_rate": 1.8972699073636283e-06, + "loss": 0.1014, + "step": 10928 + }, + { + "epoch": 1.7707388204795853, + "grad_norm": 0.7418354153633118, + "learning_rate": 1.896845511971162e-06, + "loss": 0.0959, + "step": 10929 + }, + { + "epoch": 1.770900842514582, + "grad_norm": 0.6954073905944824, + "learning_rate": 1.8964211350337637e-06, + "loss": 0.0887, + "step": 10930 + }, + { + "epoch": 1.7710628645495787, + "grad_norm": 0.8665586113929749, + "learning_rate": 1.8959967765644182e-06, + "loss": 0.1077, + "step": 10931 + }, + { + "epoch": 1.7712248865845757, + "grad_norm": 0.6979767084121704, + "learning_rate": 1.895572436576109e-06, + "loss": 0.0904, + "step": 10932 + }, + { + "epoch": 1.7713869086195722, + "grad_norm": 0.8404514193534851, + "learning_rate": 1.8951481150818206e-06, + "loss": 0.1054, + "step": 10933 + }, + { + "epoch": 1.771548930654569, + "grad_norm": 0.7205252647399902, + "learning_rate": 1.8947238120945372e-06, + "loss": 0.0875, + "step": 10934 + }, + { + "epoch": 1.7717109526895658, + "grad_norm": 0.8082873821258545, + "learning_rate": 1.8942995276272396e-06, + "loss": 0.0989, + "step": 10935 + }, + { + "epoch": 1.7718729747245625, + "grad_norm": 0.9559950828552246, + "learning_rate": 1.8938752616929112e-06, + "loss": 0.1251, + "step": 10936 + }, + { + "epoch": 1.7720349967595594, + "grad_norm": 0.7924472093582153, + "learning_rate": 1.8934510143045316e-06, + "loss": 0.097, + "step": 10937 + }, + { + "epoch": 1.772197018794556, + "grad_norm": 0.8172940015792847, + "learning_rate": 1.8930267854750845e-06, + "loss": 0.1057, + "step": 10938 + }, + { + "epoch": 1.7723590408295529, + "grad_norm": 0.8141445517539978, + "learning_rate": 1.8926025752175486e-06, + "loss": 0.1122, + "step": 10939 + }, + { + "epoch": 1.7725210628645496, + "grad_norm": 0.8678747415542603, + "learning_rate": 1.8921783835449042e-06, + "loss": 0.1103, + "step": 10940 + }, + { + "epoch": 1.7726830848995463, + "grad_norm": 0.6963508725166321, + "learning_rate": 1.8917542104701297e-06, + "loss": 0.0906, + "step": 10941 + }, + { + "epoch": 1.7728451069345432, + "grad_norm": 0.7853334546089172, + "learning_rate": 1.8913300560062047e-06, + "loss": 0.0972, + "step": 10942 + }, + { + "epoch": 1.7730071289695397, + "grad_norm": 0.8051917552947998, + "learning_rate": 1.8909059201661079e-06, + "loss": 0.0975, + "step": 10943 + }, + { + "epoch": 1.7731691510045366, + "grad_norm": 0.7873483896255493, + "learning_rate": 1.890481802962815e-06, + "loss": 0.1039, + "step": 10944 + }, + { + "epoch": 1.7733311730395334, + "grad_norm": 0.7235662937164307, + "learning_rate": 1.8900577044093045e-06, + "loss": 0.0817, + "step": 10945 + }, + { + "epoch": 1.77349319507453, + "grad_norm": 0.793947696685791, + "learning_rate": 1.889633624518551e-06, + "loss": 0.0988, + "step": 10946 + }, + { + "epoch": 1.773655217109527, + "grad_norm": 0.8600531816482544, + "learning_rate": 1.889209563303533e-06, + "loss": 0.1138, + "step": 10947 + }, + { + "epoch": 1.7738172391445235, + "grad_norm": 0.7487325072288513, + "learning_rate": 1.8887855207772235e-06, + "loss": 0.0858, + "step": 10948 + }, + { + "epoch": 1.7739792611795204, + "grad_norm": 0.8943690061569214, + "learning_rate": 1.8883614969525987e-06, + "loss": 0.1172, + "step": 10949 + }, + { + "epoch": 1.7741412832145171, + "grad_norm": 0.7959859371185303, + "learning_rate": 1.8879374918426312e-06, + "loss": 0.1, + "step": 10950 + }, + { + "epoch": 1.7743033052495139, + "grad_norm": 0.8319025635719299, + "learning_rate": 1.887513505460295e-06, + "loss": 0.1021, + "step": 10951 + }, + { + "epoch": 1.7744653272845108, + "grad_norm": 0.7502698302268982, + "learning_rate": 1.8870895378185643e-06, + "loss": 0.092, + "step": 10952 + }, + { + "epoch": 1.7746273493195075, + "grad_norm": 0.8591358661651611, + "learning_rate": 1.88666558893041e-06, + "loss": 0.1105, + "step": 10953 + }, + { + "epoch": 1.7747893713545042, + "grad_norm": 0.7255021333694458, + "learning_rate": 1.886241658808805e-06, + "loss": 0.0905, + "step": 10954 + }, + { + "epoch": 1.774951393389501, + "grad_norm": 0.7662566304206848, + "learning_rate": 1.8858177474667195e-06, + "loss": 0.0917, + "step": 10955 + }, + { + "epoch": 1.7751134154244976, + "grad_norm": 0.7199138402938843, + "learning_rate": 1.8853938549171242e-06, + "loss": 0.0868, + "step": 10956 + }, + { + "epoch": 1.7752754374594946, + "grad_norm": 0.8491278886795044, + "learning_rate": 1.88496998117299e-06, + "loss": 0.1025, + "step": 10957 + }, + { + "epoch": 1.7754374594944913, + "grad_norm": 0.9104598164558411, + "learning_rate": 1.8845461262472863e-06, + "loss": 0.108, + "step": 10958 + }, + { + "epoch": 1.775599481529488, + "grad_norm": 0.7580164670944214, + "learning_rate": 1.8841222901529816e-06, + "loss": 0.0952, + "step": 10959 + }, + { + "epoch": 1.775761503564485, + "grad_norm": 0.8028944730758667, + "learning_rate": 1.883698472903045e-06, + "loss": 0.1113, + "step": 10960 + }, + { + "epoch": 1.7759235255994814, + "grad_norm": 0.7645183801651001, + "learning_rate": 1.8832746745104425e-06, + "loss": 0.1035, + "step": 10961 + }, + { + "epoch": 1.7760855476344783, + "grad_norm": 0.7198721170425415, + "learning_rate": 1.882850894988143e-06, + "loss": 0.0931, + "step": 10962 + }, + { + "epoch": 1.776247569669475, + "grad_norm": 0.7722495794296265, + "learning_rate": 1.882427134349113e-06, + "loss": 0.0999, + "step": 10963 + }, + { + "epoch": 1.7764095917044718, + "grad_norm": 0.8241525292396545, + "learning_rate": 1.882003392606318e-06, + "loss": 0.1011, + "step": 10964 + }, + { + "epoch": 1.7765716137394687, + "grad_norm": 0.7520769834518433, + "learning_rate": 1.881579669772723e-06, + "loss": 0.0956, + "step": 10965 + }, + { + "epoch": 1.7767336357744652, + "grad_norm": 0.7766212224960327, + "learning_rate": 1.8811559658612941e-06, + "loss": 0.1007, + "step": 10966 + }, + { + "epoch": 1.7768956578094621, + "grad_norm": 0.7454885244369507, + "learning_rate": 1.8807322808849953e-06, + "loss": 0.0895, + "step": 10967 + }, + { + "epoch": 1.7770576798444588, + "grad_norm": 0.7523648142814636, + "learning_rate": 1.88030861485679e-06, + "loss": 0.0972, + "step": 10968 + }, + { + "epoch": 1.7772197018794555, + "grad_norm": 0.902167022228241, + "learning_rate": 1.879884967789642e-06, + "loss": 0.1076, + "step": 10969 + }, + { + "epoch": 1.7773817239144525, + "grad_norm": 0.9376851320266724, + "learning_rate": 1.879461339696512e-06, + "loss": 0.1125, + "step": 10970 + }, + { + "epoch": 1.777543745949449, + "grad_norm": 0.7125734090805054, + "learning_rate": 1.879037730590364e-06, + "loss": 0.0869, + "step": 10971 + }, + { + "epoch": 1.777705767984446, + "grad_norm": 0.9217870831489563, + "learning_rate": 1.8786141404841587e-06, + "loss": 0.1053, + "step": 10972 + }, + { + "epoch": 1.7778677900194426, + "grad_norm": 0.7570266723632812, + "learning_rate": 1.8781905693908575e-06, + "loss": 0.0966, + "step": 10973 + }, + { + "epoch": 1.7780298120544393, + "grad_norm": 0.8740879893302917, + "learning_rate": 1.8777670173234198e-06, + "loss": 0.1015, + "step": 10974 + }, + { + "epoch": 1.7781918340894363, + "grad_norm": 0.7980040907859802, + "learning_rate": 1.877343484294805e-06, + "loss": 0.0967, + "step": 10975 + }, + { + "epoch": 1.778353856124433, + "grad_norm": 0.8354352116584778, + "learning_rate": 1.8769199703179736e-06, + "loss": 0.1052, + "step": 10976 + }, + { + "epoch": 1.7785158781594297, + "grad_norm": 0.7894411683082581, + "learning_rate": 1.876496475405883e-06, + "loss": 0.0979, + "step": 10977 + }, + { + "epoch": 1.7786779001944264, + "grad_norm": 0.7108971476554871, + "learning_rate": 1.8760729995714916e-06, + "loss": 0.0876, + "step": 10978 + }, + { + "epoch": 1.778839922229423, + "grad_norm": 0.8221232891082764, + "learning_rate": 1.8756495428277562e-06, + "loss": 0.1034, + "step": 10979 + }, + { + "epoch": 1.77900194426442, + "grad_norm": 0.7995948195457458, + "learning_rate": 1.8752261051876337e-06, + "loss": 0.0979, + "step": 10980 + }, + { + "epoch": 1.7791639662994168, + "grad_norm": 0.8195783495903015, + "learning_rate": 1.8748026866640806e-06, + "loss": 0.104, + "step": 10981 + }, + { + "epoch": 1.7793259883344135, + "grad_norm": 0.8994091153144836, + "learning_rate": 1.8743792872700529e-06, + "loss": 0.1015, + "step": 10982 + }, + { + "epoch": 1.7794880103694104, + "grad_norm": 0.9404149055480957, + "learning_rate": 1.8739559070185045e-06, + "loss": 0.1078, + "step": 10983 + }, + { + "epoch": 1.779650032404407, + "grad_norm": 0.7274839878082275, + "learning_rate": 1.873532545922391e-06, + "loss": 0.0896, + "step": 10984 + }, + { + "epoch": 1.7798120544394038, + "grad_norm": 0.7796977758407593, + "learning_rate": 1.8731092039946646e-06, + "loss": 0.098, + "step": 10985 + }, + { + "epoch": 1.7799740764744005, + "grad_norm": 0.7981799840927124, + "learning_rate": 1.8726858812482798e-06, + "loss": 0.0993, + "step": 10986 + }, + { + "epoch": 1.7801360985093972, + "grad_norm": 0.8132720589637756, + "learning_rate": 1.8722625776961894e-06, + "loss": 0.1009, + "step": 10987 + }, + { + "epoch": 1.7802981205443942, + "grad_norm": 0.769990086555481, + "learning_rate": 1.871839293351345e-06, + "loss": 0.0948, + "step": 10988 + }, + { + "epoch": 1.7804601425793907, + "grad_norm": 0.7847530245780945, + "learning_rate": 1.8714160282266973e-06, + "loss": 0.0981, + "step": 10989 + }, + { + "epoch": 1.7806221646143876, + "grad_norm": 0.8644593954086304, + "learning_rate": 1.870992782335198e-06, + "loss": 0.0972, + "step": 10990 + }, + { + "epoch": 1.7807841866493843, + "grad_norm": 0.777860701084137, + "learning_rate": 1.8705695556897986e-06, + "loss": 0.0979, + "step": 10991 + }, + { + "epoch": 1.780946208684381, + "grad_norm": 0.9635716676712036, + "learning_rate": 1.8701463483034471e-06, + "loss": 0.1061, + "step": 10992 + }, + { + "epoch": 1.781108230719378, + "grad_norm": 0.7884843945503235, + "learning_rate": 1.8697231601890933e-06, + "loss": 0.0937, + "step": 10993 + }, + { + "epoch": 1.7812702527543745, + "grad_norm": 0.8972017168998718, + "learning_rate": 1.8692999913596846e-06, + "loss": 0.1078, + "step": 10994 + }, + { + "epoch": 1.7814322747893714, + "grad_norm": 0.7947750091552734, + "learning_rate": 1.8688768418281705e-06, + "loss": 0.0987, + "step": 10995 + }, + { + "epoch": 1.781594296824368, + "grad_norm": 1.048323154449463, + "learning_rate": 1.8684537116074983e-06, + "loss": 0.126, + "step": 10996 + }, + { + "epoch": 1.7817563188593648, + "grad_norm": 0.8322978615760803, + "learning_rate": 1.8680306007106136e-06, + "loss": 0.1015, + "step": 10997 + }, + { + "epoch": 1.7819183408943617, + "grad_norm": 0.8976532816886902, + "learning_rate": 1.8676075091504637e-06, + "loss": 0.1016, + "step": 10998 + }, + { + "epoch": 1.7820803629293582, + "grad_norm": 0.6236394643783569, + "learning_rate": 1.8671844369399922e-06, + "loss": 0.0794, + "step": 10999 + }, + { + "epoch": 1.7822423849643552, + "grad_norm": 0.8208524584770203, + "learning_rate": 1.866761384092147e-06, + "loss": 0.1006, + "step": 11000 + }, + { + "epoch": 1.7824044069993519, + "grad_norm": 0.8767183423042297, + "learning_rate": 1.8663383506198706e-06, + "loss": 0.0964, + "step": 11001 + }, + { + "epoch": 1.7825664290343486, + "grad_norm": 0.7991448044776917, + "learning_rate": 1.8659153365361076e-06, + "loss": 0.1013, + "step": 11002 + }, + { + "epoch": 1.7827284510693455, + "grad_norm": 0.8050661683082581, + "learning_rate": 1.8654923418538003e-06, + "loss": 0.0933, + "step": 11003 + }, + { + "epoch": 1.7828904731043422, + "grad_norm": 0.8594214916229248, + "learning_rate": 1.8650693665858916e-06, + "loss": 0.1113, + "step": 11004 + }, + { + "epoch": 1.783052495139339, + "grad_norm": 0.8001613616943359, + "learning_rate": 1.8646464107453247e-06, + "loss": 0.1044, + "step": 11005 + }, + { + "epoch": 1.7832145171743357, + "grad_norm": 0.865673840045929, + "learning_rate": 1.8642234743450394e-06, + "loss": 0.1033, + "step": 11006 + }, + { + "epoch": 1.7833765392093324, + "grad_norm": 0.720534086227417, + "learning_rate": 1.8638005573979776e-06, + "loss": 0.0874, + "step": 11007 + }, + { + "epoch": 1.7835385612443293, + "grad_norm": 0.7906041145324707, + "learning_rate": 1.8633776599170783e-06, + "loss": 0.1006, + "step": 11008 + }, + { + "epoch": 1.783700583279326, + "grad_norm": 0.8074170351028442, + "learning_rate": 1.8629547819152832e-06, + "loss": 0.1024, + "step": 11009 + }, + { + "epoch": 1.7838626053143227, + "grad_norm": 0.7554319500923157, + "learning_rate": 1.86253192340553e-06, + "loss": 0.0986, + "step": 11010 + }, + { + "epoch": 1.7840246273493197, + "grad_norm": 0.7377730011940002, + "learning_rate": 1.8621090844007572e-06, + "loss": 0.0984, + "step": 11011 + }, + { + "epoch": 1.7841866493843161, + "grad_norm": 0.9073734283447266, + "learning_rate": 1.8616862649139024e-06, + "loss": 0.1075, + "step": 11012 + }, + { + "epoch": 1.784348671419313, + "grad_norm": 0.8492515683174133, + "learning_rate": 1.861263464957903e-06, + "loss": 0.1033, + "step": 11013 + }, + { + "epoch": 1.7845106934543098, + "grad_norm": 0.7091047763824463, + "learning_rate": 1.8608406845456968e-06, + "loss": 0.0868, + "step": 11014 + }, + { + "epoch": 1.7846727154893065, + "grad_norm": 0.7672532796859741, + "learning_rate": 1.860417923690218e-06, + "loss": 0.0974, + "step": 11015 + }, + { + "epoch": 1.7848347375243034, + "grad_norm": 0.7299427390098572, + "learning_rate": 1.8599951824044033e-06, + "loss": 0.0926, + "step": 11016 + }, + { + "epoch": 1.7849967595593, + "grad_norm": 0.8017887473106384, + "learning_rate": 1.8595724607011878e-06, + "loss": 0.0925, + "step": 11017 + }, + { + "epoch": 1.7851587815942969, + "grad_norm": 0.7088046669960022, + "learning_rate": 1.8591497585935041e-06, + "loss": 0.0911, + "step": 11018 + }, + { + "epoch": 1.7853208036292936, + "grad_norm": 0.9126763939857483, + "learning_rate": 1.8587270760942875e-06, + "loss": 0.1143, + "step": 11019 + }, + { + "epoch": 1.7854828256642903, + "grad_norm": 0.9007970094680786, + "learning_rate": 1.858304413216471e-06, + "loss": 0.1132, + "step": 11020 + }, + { + "epoch": 1.7856448476992872, + "grad_norm": 0.7455145120620728, + "learning_rate": 1.8578817699729862e-06, + "loss": 0.0945, + "step": 11021 + }, + { + "epoch": 1.7858068697342837, + "grad_norm": 0.8875641226768494, + "learning_rate": 1.8574591463767656e-06, + "loss": 0.1057, + "step": 11022 + }, + { + "epoch": 1.7859688917692806, + "grad_norm": 0.8011730909347534, + "learning_rate": 1.8570365424407394e-06, + "loss": 0.1005, + "step": 11023 + }, + { + "epoch": 1.7861309138042774, + "grad_norm": 0.843669593334198, + "learning_rate": 1.8566139581778392e-06, + "loss": 0.0926, + "step": 11024 + }, + { + "epoch": 1.786292935839274, + "grad_norm": 0.7854774594306946, + "learning_rate": 1.856191393600995e-06, + "loss": 0.0967, + "step": 11025 + }, + { + "epoch": 1.786454957874271, + "grad_norm": 0.9077264666557312, + "learning_rate": 1.855768848723137e-06, + "loss": 0.1092, + "step": 11026 + }, + { + "epoch": 1.7866169799092677, + "grad_norm": 0.8441132307052612, + "learning_rate": 1.8553463235571927e-06, + "loss": 0.1036, + "step": 11027 + }, + { + "epoch": 1.7867790019442644, + "grad_norm": 1.07819664478302, + "learning_rate": 1.85492381811609e-06, + "loss": 0.0925, + "step": 11028 + }, + { + "epoch": 1.7869410239792611, + "grad_norm": 0.8448095917701721, + "learning_rate": 1.8545013324127587e-06, + "loss": 0.1002, + "step": 11029 + }, + { + "epoch": 1.7871030460142578, + "grad_norm": 0.8025045394897461, + "learning_rate": 1.854078866460124e-06, + "loss": 0.1058, + "step": 11030 + }, + { + "epoch": 1.7872650680492548, + "grad_norm": 0.7943110466003418, + "learning_rate": 1.8536564202711135e-06, + "loss": 0.1004, + "step": 11031 + }, + { + "epoch": 1.7874270900842515, + "grad_norm": 0.8177294731140137, + "learning_rate": 1.8532339938586513e-06, + "loss": 0.0892, + "step": 11032 + }, + { + "epoch": 1.7875891121192482, + "grad_norm": 0.9388481378555298, + "learning_rate": 1.8528115872356641e-06, + "loss": 0.111, + "step": 11033 + }, + { + "epoch": 1.7877511341542451, + "grad_norm": 0.8275322914123535, + "learning_rate": 1.8523892004150765e-06, + "loss": 0.1, + "step": 11034 + }, + { + "epoch": 1.7879131561892416, + "grad_norm": 0.777814507484436, + "learning_rate": 1.8519668334098124e-06, + "loss": 0.1026, + "step": 11035 + }, + { + "epoch": 1.7880751782242386, + "grad_norm": 1.0052603483200073, + "learning_rate": 1.8515444862327947e-06, + "loss": 0.1034, + "step": 11036 + }, + { + "epoch": 1.7882372002592353, + "grad_norm": 0.8155965805053711, + "learning_rate": 1.8511221588969457e-06, + "loss": 0.1, + "step": 11037 + }, + { + "epoch": 1.788399222294232, + "grad_norm": 0.7355263829231262, + "learning_rate": 1.8506998514151896e-06, + "loss": 0.091, + "step": 11038 + }, + { + "epoch": 1.788561244329229, + "grad_norm": 0.7432408332824707, + "learning_rate": 1.850277563800446e-06, + "loss": 0.092, + "step": 11039 + }, + { + "epoch": 1.7887232663642254, + "grad_norm": 0.7793557643890381, + "learning_rate": 1.8498552960656378e-06, + "loss": 0.1009, + "step": 11040 + }, + { + "epoch": 1.7888852883992223, + "grad_norm": 0.8657031655311584, + "learning_rate": 1.8494330482236832e-06, + "loss": 0.0957, + "step": 11041 + }, + { + "epoch": 1.789047310434219, + "grad_norm": 0.752953052520752, + "learning_rate": 1.8490108202875023e-06, + "loss": 0.0939, + "step": 11042 + }, + { + "epoch": 1.7892093324692158, + "grad_norm": 0.9895876049995422, + "learning_rate": 1.8485886122700158e-06, + "loss": 0.1191, + "step": 11043 + }, + { + "epoch": 1.7893713545042127, + "grad_norm": 0.7776055335998535, + "learning_rate": 1.848166424184142e-06, + "loss": 0.0981, + "step": 11044 + }, + { + "epoch": 1.7895333765392092, + "grad_norm": 0.7290987968444824, + "learning_rate": 1.8477442560427975e-06, + "loss": 0.0947, + "step": 11045 + }, + { + "epoch": 1.7896953985742061, + "grad_norm": 0.7357193231582642, + "learning_rate": 1.8473221078589006e-06, + "loss": 0.0906, + "step": 11046 + }, + { + "epoch": 1.7898574206092028, + "grad_norm": 0.8186837434768677, + "learning_rate": 1.8468999796453672e-06, + "loss": 0.1071, + "step": 11047 + }, + { + "epoch": 1.7900194426441995, + "grad_norm": 0.8652331233024597, + "learning_rate": 1.846477871415114e-06, + "loss": 0.1072, + "step": 11048 + }, + { + "epoch": 1.7901814646791965, + "grad_norm": 0.8034560084342957, + "learning_rate": 1.8460557831810571e-06, + "loss": 0.0951, + "step": 11049 + }, + { + "epoch": 1.790343486714193, + "grad_norm": 0.7994994521141052, + "learning_rate": 1.8456337149561105e-06, + "loss": 0.0932, + "step": 11050 + }, + { + "epoch": 1.79050550874919, + "grad_norm": 0.8264526724815369, + "learning_rate": 1.8452116667531886e-06, + "loss": 0.0989, + "step": 11051 + }, + { + "epoch": 1.7906675307841866, + "grad_norm": 0.8260409235954285, + "learning_rate": 1.8447896385852043e-06, + "loss": 0.104, + "step": 11052 + }, + { + "epoch": 1.7908295528191833, + "grad_norm": 0.8477978706359863, + "learning_rate": 1.844367630465073e-06, + "loss": 0.1045, + "step": 11053 + }, + { + "epoch": 1.7909915748541803, + "grad_norm": 0.790149986743927, + "learning_rate": 1.8439456424057044e-06, + "loss": 0.0985, + "step": 11054 + }, + { + "epoch": 1.791153596889177, + "grad_norm": 0.7050209045410156, + "learning_rate": 1.8435236744200126e-06, + "loss": 0.0867, + "step": 11055 + }, + { + "epoch": 1.7913156189241737, + "grad_norm": 0.9373990297317505, + "learning_rate": 1.8431017265209067e-06, + "loss": 0.1088, + "step": 11056 + }, + { + "epoch": 1.7914776409591704, + "grad_norm": 0.8682579398155212, + "learning_rate": 1.8426797987212985e-06, + "loss": 0.1047, + "step": 11057 + }, + { + "epoch": 1.791639662994167, + "grad_norm": 0.7753797769546509, + "learning_rate": 1.8422578910340985e-06, + "loss": 0.0957, + "step": 11058 + }, + { + "epoch": 1.791801685029164, + "grad_norm": 0.9039705395698547, + "learning_rate": 1.8418360034722149e-06, + "loss": 0.1037, + "step": 11059 + }, + { + "epoch": 1.7919637070641607, + "grad_norm": 0.7357102632522583, + "learning_rate": 1.8414141360485565e-06, + "loss": 0.0893, + "step": 11060 + }, + { + "epoch": 1.7921257290991575, + "grad_norm": 0.7715552449226379, + "learning_rate": 1.8409922887760317e-06, + "loss": 0.0987, + "step": 11061 + }, + { + "epoch": 1.7922877511341544, + "grad_norm": 0.8303348422050476, + "learning_rate": 1.840570461667549e-06, + "loss": 0.1043, + "step": 11062 + }, + { + "epoch": 1.7924497731691509, + "grad_norm": 0.7379674315452576, + "learning_rate": 1.8401486547360137e-06, + "loss": 0.0908, + "step": 11063 + }, + { + "epoch": 1.7926117952041478, + "grad_norm": 0.8676923513412476, + "learning_rate": 1.8397268679943333e-06, + "loss": 0.0983, + "step": 11064 + }, + { + "epoch": 1.7927738172391445, + "grad_norm": 0.8327484130859375, + "learning_rate": 1.8393051014554124e-06, + "loss": 0.1041, + "step": 11065 + }, + { + "epoch": 1.7929358392741412, + "grad_norm": 0.8785510659217834, + "learning_rate": 1.8388833551321562e-06, + "loss": 0.0972, + "step": 11066 + }, + { + "epoch": 1.7930978613091382, + "grad_norm": 0.8931607007980347, + "learning_rate": 1.8384616290374705e-06, + "loss": 0.1099, + "step": 11067 + }, + { + "epoch": 1.7932598833441347, + "grad_norm": 0.7808845639228821, + "learning_rate": 1.838039923184257e-06, + "loss": 0.0937, + "step": 11068 + }, + { + "epoch": 1.7934219053791316, + "grad_norm": 0.8837364912033081, + "learning_rate": 1.8376182375854207e-06, + "loss": 0.1046, + "step": 11069 + }, + { + "epoch": 1.7935839274141283, + "grad_norm": 0.7206534743309021, + "learning_rate": 1.8371965722538636e-06, + "loss": 0.0915, + "step": 11070 + }, + { + "epoch": 1.793745949449125, + "grad_norm": 0.7323435544967651, + "learning_rate": 1.8367749272024865e-06, + "loss": 0.0969, + "step": 11071 + }, + { + "epoch": 1.793907971484122, + "grad_norm": 0.7837962508201599, + "learning_rate": 1.836353302444192e-06, + "loss": 0.1013, + "step": 11072 + }, + { + "epoch": 1.7940699935191184, + "grad_norm": 0.7067288160324097, + "learning_rate": 1.8359316979918808e-06, + "loss": 0.0903, + "step": 11073 + }, + { + "epoch": 1.7942320155541154, + "grad_norm": 0.7567185163497925, + "learning_rate": 1.8355101138584524e-06, + "loss": 0.0876, + "step": 11074 + }, + { + "epoch": 1.794394037589112, + "grad_norm": 0.9895825982093811, + "learning_rate": 1.835088550056806e-06, + "loss": 0.1207, + "step": 11075 + }, + { + "epoch": 1.7945560596241088, + "grad_norm": 0.8607103824615479, + "learning_rate": 1.8346670065998411e-06, + "loss": 0.0999, + "step": 11076 + }, + { + "epoch": 1.7947180816591057, + "grad_norm": 0.7561851143836975, + "learning_rate": 1.8342454835004566e-06, + "loss": 0.0952, + "step": 11077 + }, + { + "epoch": 1.7948801036941024, + "grad_norm": 0.756078839302063, + "learning_rate": 1.8338239807715486e-06, + "loss": 0.0955, + "step": 11078 + }, + { + "epoch": 1.7950421257290992, + "grad_norm": 0.948203444480896, + "learning_rate": 1.833402498426015e-06, + "loss": 0.1169, + "step": 11079 + }, + { + "epoch": 1.7952041477640959, + "grad_norm": 0.809870183467865, + "learning_rate": 1.8329810364767511e-06, + "loss": 0.0992, + "step": 11080 + }, + { + "epoch": 1.7953661697990926, + "grad_norm": 0.7944110631942749, + "learning_rate": 1.8325595949366537e-06, + "loss": 0.0974, + "step": 11081 + }, + { + "epoch": 1.7955281918340895, + "grad_norm": 0.7319938540458679, + "learning_rate": 1.8321381738186178e-06, + "loss": 0.0953, + "step": 11082 + }, + { + "epoch": 1.7956902138690862, + "grad_norm": 0.7145232558250427, + "learning_rate": 1.8317167731355373e-06, + "loss": 0.0909, + "step": 11083 + }, + { + "epoch": 1.795852235904083, + "grad_norm": 0.7656394243240356, + "learning_rate": 1.8312953929003068e-06, + "loss": 0.0937, + "step": 11084 + }, + { + "epoch": 1.7960142579390799, + "grad_norm": 0.8850581645965576, + "learning_rate": 1.8308740331258177e-06, + "loss": 0.1003, + "step": 11085 + }, + { + "epoch": 1.7961762799740764, + "grad_norm": 0.7528477311134338, + "learning_rate": 1.8304526938249653e-06, + "loss": 0.0894, + "step": 11086 + }, + { + "epoch": 1.7963383020090733, + "grad_norm": 0.7964629530906677, + "learning_rate": 1.8300313750106396e-06, + "loss": 0.0934, + "step": 11087 + }, + { + "epoch": 1.79650032404407, + "grad_norm": 0.6928067207336426, + "learning_rate": 1.8296100766957331e-06, + "loss": 0.0834, + "step": 11088 + }, + { + "epoch": 1.7966623460790667, + "grad_norm": 0.9328933954238892, + "learning_rate": 1.8291887988931357e-06, + "loss": 0.1128, + "step": 11089 + }, + { + "epoch": 1.7968243681140637, + "grad_norm": 0.7624589800834656, + "learning_rate": 1.828767541615737e-06, + "loss": 0.0925, + "step": 11090 + }, + { + "epoch": 1.7969863901490601, + "grad_norm": 0.7582963109016418, + "learning_rate": 1.828346304876428e-06, + "loss": 0.0924, + "step": 11091 + }, + { + "epoch": 1.797148412184057, + "grad_norm": 0.8816431760787964, + "learning_rate": 1.8279250886880962e-06, + "loss": 0.0985, + "step": 11092 + }, + { + "epoch": 1.7973104342190538, + "grad_norm": 0.9561519026756287, + "learning_rate": 1.8275038930636314e-06, + "loss": 0.1172, + "step": 11093 + }, + { + "epoch": 1.7974724562540505, + "grad_norm": 0.845643162727356, + "learning_rate": 1.827082718015919e-06, + "loss": 0.1133, + "step": 11094 + }, + { + "epoch": 1.7976344782890474, + "grad_norm": 0.8886216282844543, + "learning_rate": 1.8266615635578464e-06, + "loss": 0.1089, + "step": 11095 + }, + { + "epoch": 1.797796500324044, + "grad_norm": 0.8109029531478882, + "learning_rate": 1.8262404297023013e-06, + "loss": 0.0978, + "step": 11096 + }, + { + "epoch": 1.7979585223590409, + "grad_norm": 0.7375802993774414, + "learning_rate": 1.825819316462169e-06, + "loss": 0.0906, + "step": 11097 + }, + { + "epoch": 1.7981205443940376, + "grad_norm": 0.8273477554321289, + "learning_rate": 1.8253982238503338e-06, + "loss": 0.1088, + "step": 11098 + }, + { + "epoch": 1.7982825664290343, + "grad_norm": 0.6932176947593689, + "learning_rate": 1.8249771518796794e-06, + "loss": 0.0845, + "step": 11099 + }, + { + "epoch": 1.7984445884640312, + "grad_norm": 0.7964252233505249, + "learning_rate": 1.8245561005630921e-06, + "loss": 0.0918, + "step": 11100 + }, + { + "epoch": 1.7986066104990277, + "grad_norm": 0.708993136882782, + "learning_rate": 1.824135069913453e-06, + "loss": 0.0911, + "step": 11101 + }, + { + "epoch": 1.7987686325340246, + "grad_norm": 0.8443676233291626, + "learning_rate": 1.823714059943646e-06, + "loss": 0.1014, + "step": 11102 + }, + { + "epoch": 1.7989306545690213, + "grad_norm": 0.8077982068061829, + "learning_rate": 1.823293070666551e-06, + "loss": 0.0996, + "step": 11103 + }, + { + "epoch": 1.799092676604018, + "grad_norm": 0.8241330981254578, + "learning_rate": 1.8228721020950504e-06, + "loss": 0.0983, + "step": 11104 + }, + { + "epoch": 1.799254698639015, + "grad_norm": 0.8449083566665649, + "learning_rate": 1.8224511542420254e-06, + "loss": 0.1019, + "step": 11105 + }, + { + "epoch": 1.7994167206740117, + "grad_norm": 0.6822419166564941, + "learning_rate": 1.8220302271203557e-06, + "loss": 0.0793, + "step": 11106 + }, + { + "epoch": 1.7995787427090084, + "grad_norm": 0.8199465870857239, + "learning_rate": 1.82160932074292e-06, + "loss": 0.0983, + "step": 11107 + }, + { + "epoch": 1.7997407647440054, + "grad_norm": 0.9466975927352905, + "learning_rate": 1.8211884351225978e-06, + "loss": 0.1151, + "step": 11108 + }, + { + "epoch": 1.7999027867790018, + "grad_norm": 0.7171126008033752, + "learning_rate": 1.8207675702722661e-06, + "loss": 0.0904, + "step": 11109 + }, + { + "epoch": 1.8000648088139988, + "grad_norm": 0.8177133202552795, + "learning_rate": 1.8203467262048033e-06, + "loss": 0.096, + "step": 11110 + }, + { + "epoch": 1.8002268308489955, + "grad_norm": 0.9946437478065491, + "learning_rate": 1.8199259029330865e-06, + "loss": 0.1111, + "step": 11111 + }, + { + "epoch": 1.8003888528839922, + "grad_norm": 0.8421857357025146, + "learning_rate": 1.819505100469991e-06, + "loss": 0.1072, + "step": 11112 + }, + { + "epoch": 1.8005508749189891, + "grad_norm": 0.7888447642326355, + "learning_rate": 1.8190843188283925e-06, + "loss": 0.0884, + "step": 11113 + }, + { + "epoch": 1.8007128969539856, + "grad_norm": 0.7778288722038269, + "learning_rate": 1.8186635580211654e-06, + "loss": 0.0878, + "step": 11114 + }, + { + "epoch": 1.8008749189889826, + "grad_norm": 0.7060684561729431, + "learning_rate": 1.8182428180611855e-06, + "loss": 0.0799, + "step": 11115 + }, + { + "epoch": 1.8010369410239793, + "grad_norm": 0.8897756934165955, + "learning_rate": 1.8178220989613255e-06, + "loss": 0.1116, + "step": 11116 + }, + { + "epoch": 1.801198963058976, + "grad_norm": 0.7480401992797852, + "learning_rate": 1.8174014007344586e-06, + "loss": 0.0954, + "step": 11117 + }, + { + "epoch": 1.801360985093973, + "grad_norm": 0.7541413307189941, + "learning_rate": 1.8169807233934567e-06, + "loss": 0.0945, + "step": 11118 + }, + { + "epoch": 1.8015230071289694, + "grad_norm": 0.7814289927482605, + "learning_rate": 1.8165600669511912e-06, + "loss": 0.0951, + "step": 11119 + }, + { + "epoch": 1.8016850291639663, + "grad_norm": 0.8429241180419922, + "learning_rate": 1.8161394314205343e-06, + "loss": 0.1045, + "step": 11120 + }, + { + "epoch": 1.801847051198963, + "grad_norm": 0.8381679654121399, + "learning_rate": 1.8157188168143564e-06, + "loss": 0.1059, + "step": 11121 + }, + { + "epoch": 1.8020090732339598, + "grad_norm": 0.8170045018196106, + "learning_rate": 1.8152982231455262e-06, + "loss": 0.0991, + "step": 11122 + }, + { + "epoch": 1.8021710952689567, + "grad_norm": 0.8006271719932556, + "learning_rate": 1.8148776504269129e-06, + "loss": 0.0989, + "step": 11123 + }, + { + "epoch": 1.8023331173039532, + "grad_norm": 0.7720314264297485, + "learning_rate": 1.8144570986713867e-06, + "loss": 0.0928, + "step": 11124 + }, + { + "epoch": 1.8024951393389501, + "grad_norm": 0.7991904020309448, + "learning_rate": 1.8140365678918138e-06, + "loss": 0.0986, + "step": 11125 + }, + { + "epoch": 1.8026571613739468, + "grad_norm": 0.9030035138130188, + "learning_rate": 1.8136160581010624e-06, + "loss": 0.1187, + "step": 11126 + }, + { + "epoch": 1.8028191834089435, + "grad_norm": 0.8388223648071289, + "learning_rate": 1.813195569311998e-06, + "loss": 0.1017, + "step": 11127 + }, + { + "epoch": 1.8029812054439405, + "grad_norm": 0.9252680540084839, + "learning_rate": 1.8127751015374865e-06, + "loss": 0.1104, + "step": 11128 + }, + { + "epoch": 1.8031432274789372, + "grad_norm": 0.8280481696128845, + "learning_rate": 1.8123546547903944e-06, + "loss": 0.1046, + "step": 11129 + }, + { + "epoch": 1.803305249513934, + "grad_norm": 0.8752947449684143, + "learning_rate": 1.8119342290835864e-06, + "loss": 0.1067, + "step": 11130 + }, + { + "epoch": 1.8034672715489306, + "grad_norm": 0.753736138343811, + "learning_rate": 1.8115138244299254e-06, + "loss": 0.0953, + "step": 11131 + }, + { + "epoch": 1.8036292935839273, + "grad_norm": 0.8405237197875977, + "learning_rate": 1.8110934408422758e-06, + "loss": 0.0967, + "step": 11132 + }, + { + "epoch": 1.8037913156189243, + "grad_norm": 0.9108558893203735, + "learning_rate": 1.8106730783334985e-06, + "loss": 0.1006, + "step": 11133 + }, + { + "epoch": 1.803953337653921, + "grad_norm": 0.7528913021087646, + "learning_rate": 1.810252736916458e-06, + "loss": 0.0983, + "step": 11134 + }, + { + "epoch": 1.8041153596889177, + "grad_norm": 0.8099750280380249, + "learning_rate": 1.8098324166040146e-06, + "loss": 0.1013, + "step": 11135 + }, + { + "epoch": 1.8042773817239146, + "grad_norm": 0.9023746848106384, + "learning_rate": 1.8094121174090288e-06, + "loss": 0.1148, + "step": 11136 + }, + { + "epoch": 1.804439403758911, + "grad_norm": 0.8526237607002258, + "learning_rate": 1.8089918393443611e-06, + "loss": 0.1046, + "step": 11137 + }, + { + "epoch": 1.804601425793908, + "grad_norm": 0.7785555124282837, + "learning_rate": 1.80857158242287e-06, + "loss": 0.0932, + "step": 11138 + }, + { + "epoch": 1.8047634478289047, + "grad_norm": 0.7802260518074036, + "learning_rate": 1.8081513466574164e-06, + "loss": 0.1013, + "step": 11139 + }, + { + "epoch": 1.8049254698639015, + "grad_norm": 0.8971158862113953, + "learning_rate": 1.8077311320608571e-06, + "loss": 0.1221, + "step": 11140 + }, + { + "epoch": 1.8050874918988984, + "grad_norm": 0.833736002445221, + "learning_rate": 1.8073109386460502e-06, + "loss": 0.1064, + "step": 11141 + }, + { + "epoch": 1.8052495139338949, + "grad_norm": 0.8659934401512146, + "learning_rate": 1.806890766425851e-06, + "loss": 0.1017, + "step": 11142 + }, + { + "epoch": 1.8054115359688918, + "grad_norm": 0.7669563889503479, + "learning_rate": 1.8064706154131179e-06, + "loss": 0.0973, + "step": 11143 + }, + { + "epoch": 1.8055735580038885, + "grad_norm": 0.9565890431404114, + "learning_rate": 1.8060504856207062e-06, + "loss": 0.1079, + "step": 11144 + }, + { + "epoch": 1.8057355800388852, + "grad_norm": 0.8553591370582581, + "learning_rate": 1.8056303770614697e-06, + "loss": 0.1065, + "step": 11145 + }, + { + "epoch": 1.8058976020738822, + "grad_norm": 0.9055613279342651, + "learning_rate": 1.8052102897482643e-06, + "loss": 0.1114, + "step": 11146 + }, + { + "epoch": 1.8060596241088787, + "grad_norm": 0.8043663501739502, + "learning_rate": 1.8047902236939405e-06, + "loss": 0.1029, + "step": 11147 + }, + { + "epoch": 1.8062216461438756, + "grad_norm": 0.8190831542015076, + "learning_rate": 1.8043701789113552e-06, + "loss": 0.098, + "step": 11148 + }, + { + "epoch": 1.8063836681788723, + "grad_norm": 0.8615659475326538, + "learning_rate": 1.8039501554133588e-06, + "loss": 0.0942, + "step": 11149 + }, + { + "epoch": 1.806545690213869, + "grad_norm": 0.9080948829650879, + "learning_rate": 1.8035301532128032e-06, + "loss": 0.1049, + "step": 11150 + }, + { + "epoch": 1.806707712248866, + "grad_norm": 0.7901988625526428, + "learning_rate": 1.8031101723225393e-06, + "loss": 0.0924, + "step": 11151 + }, + { + "epoch": 1.8068697342838627, + "grad_norm": 0.8067225813865662, + "learning_rate": 1.8026902127554172e-06, + "loss": 0.1016, + "step": 11152 + }, + { + "epoch": 1.8070317563188594, + "grad_norm": 0.7822276949882507, + "learning_rate": 1.8022702745242882e-06, + "loss": 0.0953, + "step": 11153 + }, + { + "epoch": 1.807193778353856, + "grad_norm": 0.8535957336425781, + "learning_rate": 1.8018503576419996e-06, + "loss": 0.1074, + "step": 11154 + }, + { + "epoch": 1.8073558003888528, + "grad_norm": 0.7549822926521301, + "learning_rate": 1.8014304621214008e-06, + "loss": 0.0964, + "step": 11155 + }, + { + "epoch": 1.8075178224238497, + "grad_norm": 0.816744863986969, + "learning_rate": 1.8010105879753398e-06, + "loss": 0.105, + "step": 11156 + }, + { + "epoch": 1.8076798444588464, + "grad_norm": 0.7305224537849426, + "learning_rate": 1.800590735216662e-06, + "loss": 0.0952, + "step": 11157 + }, + { + "epoch": 1.8078418664938432, + "grad_norm": 0.8567788600921631, + "learning_rate": 1.800170903858216e-06, + "loss": 0.1097, + "step": 11158 + }, + { + "epoch": 1.80800388852884, + "grad_norm": 0.8007251024246216, + "learning_rate": 1.799751093912847e-06, + "loss": 0.0898, + "step": 11159 + }, + { + "epoch": 1.8081659105638366, + "grad_norm": 0.7454932928085327, + "learning_rate": 1.7993313053933998e-06, + "loss": 0.0942, + "step": 11160 + }, + { + "epoch": 1.8083279325988335, + "grad_norm": 0.8023138046264648, + "learning_rate": 1.7989115383127195e-06, + "loss": 0.102, + "step": 11161 + }, + { + "epoch": 1.8084899546338302, + "grad_norm": 0.7055501937866211, + "learning_rate": 1.7984917926836484e-06, + "loss": 0.0865, + "step": 11162 + }, + { + "epoch": 1.808651976668827, + "grad_norm": 0.7876911759376526, + "learning_rate": 1.7980720685190314e-06, + "loss": 0.0841, + "step": 11163 + }, + { + "epoch": 1.8088139987038239, + "grad_norm": 0.8725854158401489, + "learning_rate": 1.7976523658317104e-06, + "loss": 0.1082, + "step": 11164 + }, + { + "epoch": 1.8089760207388204, + "grad_norm": 0.8327855467796326, + "learning_rate": 1.7972326846345277e-06, + "loss": 0.1103, + "step": 11165 + }, + { + "epoch": 1.8091380427738173, + "grad_norm": 0.9465359449386597, + "learning_rate": 1.7968130249403238e-06, + "loss": 0.114, + "step": 11166 + }, + { + "epoch": 1.809300064808814, + "grad_norm": 0.8367246389389038, + "learning_rate": 1.7963933867619396e-06, + "loss": 0.1114, + "step": 11167 + }, + { + "epoch": 1.8094620868438107, + "grad_norm": 0.821707546710968, + "learning_rate": 1.7959737701122157e-06, + "loss": 0.0933, + "step": 11168 + }, + { + "epoch": 1.8096241088788076, + "grad_norm": 0.951481282711029, + "learning_rate": 1.79555417500399e-06, + "loss": 0.1188, + "step": 11169 + }, + { + "epoch": 1.8097861309138041, + "grad_norm": 0.7933133244514465, + "learning_rate": 1.7951346014501027e-06, + "loss": 0.1043, + "step": 11170 + }, + { + "epoch": 1.809948152948801, + "grad_norm": 0.8114445209503174, + "learning_rate": 1.7947150494633897e-06, + "loss": 0.0908, + "step": 11171 + }, + { + "epoch": 1.8101101749837978, + "grad_norm": 0.7849157452583313, + "learning_rate": 1.7942955190566899e-06, + "loss": 0.1009, + "step": 11172 + }, + { + "epoch": 1.8102721970187945, + "grad_norm": 0.7490556836128235, + "learning_rate": 1.7938760102428396e-06, + "loss": 0.0816, + "step": 11173 + }, + { + "epoch": 1.8104342190537914, + "grad_norm": 0.8201428651809692, + "learning_rate": 1.7934565230346752e-06, + "loss": 0.1019, + "step": 11174 + }, + { + "epoch": 1.810596241088788, + "grad_norm": 0.7032244801521301, + "learning_rate": 1.7930370574450304e-06, + "loss": 0.0911, + "step": 11175 + }, + { + "epoch": 1.8107582631237849, + "grad_norm": 0.8158414959907532, + "learning_rate": 1.7926176134867408e-06, + "loss": 0.1025, + "step": 11176 + }, + { + "epoch": 1.8109202851587816, + "grad_norm": 0.8338707685470581, + "learning_rate": 1.792198191172641e-06, + "loss": 0.1059, + "step": 11177 + }, + { + "epoch": 1.8110823071937783, + "grad_norm": 0.7679630517959595, + "learning_rate": 1.7917787905155634e-06, + "loss": 0.0831, + "step": 11178 + }, + { + "epoch": 1.8112443292287752, + "grad_norm": 0.8508248329162598, + "learning_rate": 1.7913594115283414e-06, + "loss": 0.1109, + "step": 11179 + }, + { + "epoch": 1.811406351263772, + "grad_norm": 0.8345566391944885, + "learning_rate": 1.790940054223806e-06, + "loss": 0.1056, + "step": 11180 + }, + { + "epoch": 1.8115683732987686, + "grad_norm": 0.8556733131408691, + "learning_rate": 1.7905207186147888e-06, + "loss": 0.1059, + "step": 11181 + }, + { + "epoch": 1.8117303953337653, + "grad_norm": 0.7630212903022766, + "learning_rate": 1.7901014047141208e-06, + "loss": 0.0991, + "step": 11182 + }, + { + "epoch": 1.811892417368762, + "grad_norm": 0.9459897875785828, + "learning_rate": 1.7896821125346325e-06, + "loss": 0.11, + "step": 11183 + }, + { + "epoch": 1.812054439403759, + "grad_norm": 0.7780237793922424, + "learning_rate": 1.7892628420891526e-06, + "loss": 0.1011, + "step": 11184 + }, + { + "epoch": 1.8122164614387557, + "grad_norm": 0.7486929297447205, + "learning_rate": 1.7888435933905097e-06, + "loss": 0.0906, + "step": 11185 + }, + { + "epoch": 1.8123784834737524, + "grad_norm": 0.8000259399414062, + "learning_rate": 1.788424366451531e-06, + "loss": 0.0972, + "step": 11186 + }, + { + "epoch": 1.8125405055087493, + "grad_norm": 0.837604284286499, + "learning_rate": 1.7880051612850455e-06, + "loss": 0.1022, + "step": 11187 + }, + { + "epoch": 1.8127025275437458, + "grad_norm": 0.7862725853919983, + "learning_rate": 1.7875859779038796e-06, + "loss": 0.0986, + "step": 11188 + }, + { + "epoch": 1.8128645495787428, + "grad_norm": 0.7725867629051208, + "learning_rate": 1.7871668163208577e-06, + "loss": 0.0968, + "step": 11189 + }, + { + "epoch": 1.8130265716137395, + "grad_norm": 0.7349421977996826, + "learning_rate": 1.7867476765488061e-06, + "loss": 0.0899, + "step": 11190 + }, + { + "epoch": 1.8131885936487362, + "grad_norm": 0.7476187944412231, + "learning_rate": 1.78632855860055e-06, + "loss": 0.0975, + "step": 11191 + }, + { + "epoch": 1.8133506156837331, + "grad_norm": 0.8673322200775146, + "learning_rate": 1.7859094624889135e-06, + "loss": 0.0985, + "step": 11192 + }, + { + "epoch": 1.8135126377187296, + "grad_norm": 0.77607262134552, + "learning_rate": 1.785490388226719e-06, + "loss": 0.0923, + "step": 11193 + }, + { + "epoch": 1.8136746597537265, + "grad_norm": 0.8594310283660889, + "learning_rate": 1.7850713358267897e-06, + "loss": 0.1137, + "step": 11194 + }, + { + "epoch": 1.8138366817887233, + "grad_norm": 0.7893880009651184, + "learning_rate": 1.7846523053019466e-06, + "loss": 0.096, + "step": 11195 + }, + { + "epoch": 1.81399870382372, + "grad_norm": 0.8915963768959045, + "learning_rate": 1.7842332966650122e-06, + "loss": 0.1095, + "step": 11196 + }, + { + "epoch": 1.814160725858717, + "grad_norm": 0.7770095467567444, + "learning_rate": 1.7838143099288075e-06, + "loss": 0.0921, + "step": 11197 + }, + { + "epoch": 1.8143227478937134, + "grad_norm": 0.8442981243133545, + "learning_rate": 1.7833953451061513e-06, + "loss": 0.1042, + "step": 11198 + }, + { + "epoch": 1.8144847699287103, + "grad_norm": 0.8559162020683289, + "learning_rate": 1.7829764022098633e-06, + "loss": 0.1041, + "step": 11199 + }, + { + "epoch": 1.814646791963707, + "grad_norm": 0.7358631491661072, + "learning_rate": 1.7825574812527617e-06, + "loss": 0.0866, + "step": 11200 + }, + { + "epoch": 1.8148088139987038, + "grad_norm": 0.8252244591712952, + "learning_rate": 1.7821385822476661e-06, + "loss": 0.1032, + "step": 11201 + }, + { + "epoch": 1.8149708360337007, + "grad_norm": 0.725712239742279, + "learning_rate": 1.781719705207392e-06, + "loss": 0.0829, + "step": 11202 + }, + { + "epoch": 1.8151328580686974, + "grad_norm": 0.7640675902366638, + "learning_rate": 1.7813008501447576e-06, + "loss": 0.0867, + "step": 11203 + }, + { + "epoch": 1.815294880103694, + "grad_norm": 0.8494378924369812, + "learning_rate": 1.7808820170725772e-06, + "loss": 0.1012, + "step": 11204 + }, + { + "epoch": 1.8154569021386908, + "grad_norm": 0.7609313130378723, + "learning_rate": 1.7804632060036665e-06, + "loss": 0.096, + "step": 11205 + }, + { + "epoch": 1.8156189241736875, + "grad_norm": 0.7626781463623047, + "learning_rate": 1.7800444169508414e-06, + "loss": 0.0921, + "step": 11206 + }, + { + "epoch": 1.8157809462086845, + "grad_norm": 0.9517927169799805, + "learning_rate": 1.7796256499269141e-06, + "loss": 0.1178, + "step": 11207 + }, + { + "epoch": 1.8159429682436812, + "grad_norm": 0.8355745673179626, + "learning_rate": 1.7792069049446987e-06, + "loss": 0.1029, + "step": 11208 + }, + { + "epoch": 1.8161049902786779, + "grad_norm": 0.8401842713356018, + "learning_rate": 1.7787881820170073e-06, + "loss": 0.1033, + "step": 11209 + }, + { + "epoch": 1.8162670123136748, + "grad_norm": 0.8746349215507507, + "learning_rate": 1.7783694811566534e-06, + "loss": 0.104, + "step": 11210 + }, + { + "epoch": 1.8164290343486713, + "grad_norm": 0.8786240816116333, + "learning_rate": 1.7779508023764464e-06, + "loss": 0.1073, + "step": 11211 + }, + { + "epoch": 1.8165910563836682, + "grad_norm": 0.700187087059021, + "learning_rate": 1.777532145689198e-06, + "loss": 0.0841, + "step": 11212 + }, + { + "epoch": 1.816753078418665, + "grad_norm": 0.7752106785774231, + "learning_rate": 1.7771135111077173e-06, + "loss": 0.0987, + "step": 11213 + }, + { + "epoch": 1.8169151004536617, + "grad_norm": 0.8160271644592285, + "learning_rate": 1.7766948986448131e-06, + "loss": 0.096, + "step": 11214 + }, + { + "epoch": 1.8170771224886586, + "grad_norm": 0.8085820078849792, + "learning_rate": 1.7762763083132958e-06, + "loss": 0.0965, + "step": 11215 + }, + { + "epoch": 1.817239144523655, + "grad_norm": 0.8033862113952637, + "learning_rate": 1.7758577401259716e-06, + "loss": 0.0961, + "step": 11216 + }, + { + "epoch": 1.817401166558652, + "grad_norm": 0.7184333205223083, + "learning_rate": 1.775439194095648e-06, + "loss": 0.0934, + "step": 11217 + }, + { + "epoch": 1.8175631885936487, + "grad_norm": 0.8042249083518982, + "learning_rate": 1.7750206702351325e-06, + "loss": 0.1002, + "step": 11218 + }, + { + "epoch": 1.8177252106286454, + "grad_norm": 0.779135525226593, + "learning_rate": 1.7746021685572284e-06, + "loss": 0.0907, + "step": 11219 + }, + { + "epoch": 1.8178872326636424, + "grad_norm": 0.8143093585968018, + "learning_rate": 1.7741836890747438e-06, + "loss": 0.1011, + "step": 11220 + }, + { + "epoch": 1.8180492546986389, + "grad_norm": 0.8787755370140076, + "learning_rate": 1.7737652318004818e-06, + "loss": 0.1014, + "step": 11221 + }, + { + "epoch": 1.8182112767336358, + "grad_norm": 0.8515397310256958, + "learning_rate": 1.7733467967472459e-06, + "loss": 0.1032, + "step": 11222 + }, + { + "epoch": 1.8183732987686325, + "grad_norm": 0.8510406017303467, + "learning_rate": 1.7729283839278403e-06, + "loss": 0.0972, + "step": 11223 + }, + { + "epoch": 1.8185353208036292, + "grad_norm": 0.8091854453086853, + "learning_rate": 1.7725099933550649e-06, + "loss": 0.0957, + "step": 11224 + }, + { + "epoch": 1.8186973428386262, + "grad_norm": 0.7440215349197388, + "learning_rate": 1.7720916250417248e-06, + "loss": 0.0914, + "step": 11225 + }, + { + "epoch": 1.8188593648736227, + "grad_norm": 0.8673259615898132, + "learning_rate": 1.7716732790006188e-06, + "loss": 0.101, + "step": 11226 + }, + { + "epoch": 1.8190213869086196, + "grad_norm": 0.8858534097671509, + "learning_rate": 1.7712549552445484e-06, + "loss": 0.1069, + "step": 11227 + }, + { + "epoch": 1.8191834089436163, + "grad_norm": 0.7519997954368591, + "learning_rate": 1.7708366537863129e-06, + "loss": 0.0896, + "step": 11228 + }, + { + "epoch": 1.819345430978613, + "grad_norm": 0.7740821242332458, + "learning_rate": 1.7704183746387105e-06, + "loss": 0.0916, + "step": 11229 + }, + { + "epoch": 1.81950745301361, + "grad_norm": 0.898308277130127, + "learning_rate": 1.7700001178145409e-06, + "loss": 0.1155, + "step": 11230 + }, + { + "epoch": 1.8196694750486067, + "grad_norm": 0.7707512378692627, + "learning_rate": 1.7695818833266009e-06, + "loss": 0.0876, + "step": 11231 + }, + { + "epoch": 1.8198314970836034, + "grad_norm": 0.7827706336975098, + "learning_rate": 1.7691636711876883e-06, + "loss": 0.1, + "step": 11232 + }, + { + "epoch": 1.8199935191186, + "grad_norm": 0.8508397340774536, + "learning_rate": 1.768745481410597e-06, + "loss": 0.0975, + "step": 11233 + }, + { + "epoch": 1.8201555411535968, + "grad_norm": 0.8238134384155273, + "learning_rate": 1.768327314008126e-06, + "loss": 0.0968, + "step": 11234 + }, + { + "epoch": 1.8203175631885937, + "grad_norm": 0.7700952291488647, + "learning_rate": 1.7679091689930683e-06, + "loss": 0.0831, + "step": 11235 + }, + { + "epoch": 1.8204795852235904, + "grad_norm": 0.7718937993049622, + "learning_rate": 1.7674910463782186e-06, + "loss": 0.1014, + "step": 11236 + }, + { + "epoch": 1.8206416072585871, + "grad_norm": 0.9156939387321472, + "learning_rate": 1.76707294617637e-06, + "loss": 0.1105, + "step": 11237 + }, + { + "epoch": 1.820803629293584, + "grad_norm": 0.85805743932724, + "learning_rate": 1.766654868400315e-06, + "loss": 0.105, + "step": 11238 + }, + { + "epoch": 1.8209656513285806, + "grad_norm": 0.7723628878593445, + "learning_rate": 1.766236813062847e-06, + "loss": 0.0991, + "step": 11239 + }, + { + "epoch": 1.8211276733635775, + "grad_norm": 0.7656691670417786, + "learning_rate": 1.7658187801767568e-06, + "loss": 0.0986, + "step": 11240 + }, + { + "epoch": 1.8212896953985742, + "grad_norm": 0.8363527655601501, + "learning_rate": 1.765400769754836e-06, + "loss": 0.0918, + "step": 11241 + }, + { + "epoch": 1.821451717433571, + "grad_norm": 0.6773914694786072, + "learning_rate": 1.7649827818098727e-06, + "loss": 0.0836, + "step": 11242 + }, + { + "epoch": 1.8216137394685679, + "grad_norm": 0.751105546951294, + "learning_rate": 1.7645648163546574e-06, + "loss": 0.0893, + "step": 11243 + }, + { + "epoch": 1.8217757615035644, + "grad_norm": 0.8291444182395935, + "learning_rate": 1.7641468734019795e-06, + "loss": 0.0998, + "step": 11244 + }, + { + "epoch": 1.8219377835385613, + "grad_norm": 0.735002875328064, + "learning_rate": 1.7637289529646273e-06, + "loss": 0.0898, + "step": 11245 + }, + { + "epoch": 1.822099805573558, + "grad_norm": 0.7483386993408203, + "learning_rate": 1.7633110550553867e-06, + "loss": 0.0863, + "step": 11246 + }, + { + "epoch": 1.8222618276085547, + "grad_norm": 0.7296152114868164, + "learning_rate": 1.7628931796870454e-06, + "loss": 0.0961, + "step": 11247 + }, + { + "epoch": 1.8224238496435516, + "grad_norm": 0.7359948754310608, + "learning_rate": 1.7624753268723882e-06, + "loss": 0.0927, + "step": 11248 + }, + { + "epoch": 1.8225858716785481, + "grad_norm": 0.8318410515785217, + "learning_rate": 1.7620574966242015e-06, + "loss": 0.1004, + "step": 11249 + }, + { + "epoch": 1.822747893713545, + "grad_norm": 0.9866887331008911, + "learning_rate": 1.7616396889552706e-06, + "loss": 0.1179, + "step": 11250 + }, + { + "epoch": 1.8229099157485418, + "grad_norm": 0.913759708404541, + "learning_rate": 1.7612219038783775e-06, + "loss": 0.105, + "step": 11251 + }, + { + "epoch": 1.8230719377835385, + "grad_norm": 0.7952281832695007, + "learning_rate": 1.7608041414063065e-06, + "loss": 0.0996, + "step": 11252 + }, + { + "epoch": 1.8232339598185354, + "grad_norm": 0.9138891100883484, + "learning_rate": 1.7603864015518392e-06, + "loss": 0.1103, + "step": 11253 + }, + { + "epoch": 1.8233959818535321, + "grad_norm": 0.9621608853340149, + "learning_rate": 1.7599686843277596e-06, + "loss": 0.1116, + "step": 11254 + }, + { + "epoch": 1.8235580038885288, + "grad_norm": 0.9279875755310059, + "learning_rate": 1.7595509897468466e-06, + "loss": 0.1177, + "step": 11255 + }, + { + "epoch": 1.8237200259235256, + "grad_norm": 0.7723210453987122, + "learning_rate": 1.7591333178218823e-06, + "loss": 0.1005, + "step": 11256 + }, + { + "epoch": 1.8238820479585223, + "grad_norm": 0.7563832998275757, + "learning_rate": 1.7587156685656442e-06, + "loss": 0.0964, + "step": 11257 + }, + { + "epoch": 1.8240440699935192, + "grad_norm": 0.7410457134246826, + "learning_rate": 1.7582980419909135e-06, + "loss": 0.0911, + "step": 11258 + }, + { + "epoch": 1.824206092028516, + "grad_norm": 0.8052392601966858, + "learning_rate": 1.7578804381104678e-06, + "loss": 0.1009, + "step": 11259 + }, + { + "epoch": 1.8243681140635126, + "grad_norm": 0.8688940405845642, + "learning_rate": 1.7574628569370855e-06, + "loss": 0.1155, + "step": 11260 + }, + { + "epoch": 1.8245301360985096, + "grad_norm": 0.8440880179405212, + "learning_rate": 1.757045298483542e-06, + "loss": 0.1034, + "step": 11261 + }, + { + "epoch": 1.824692158133506, + "grad_norm": 0.764716386795044, + "learning_rate": 1.756627762762614e-06, + "loss": 0.0984, + "step": 11262 + }, + { + "epoch": 1.824854180168503, + "grad_norm": 0.7497915029525757, + "learning_rate": 1.7562102497870787e-06, + "loss": 0.0923, + "step": 11263 + }, + { + "epoch": 1.8250162022034997, + "grad_norm": 0.8336397409439087, + "learning_rate": 1.7557927595697094e-06, + "loss": 0.1071, + "step": 11264 + }, + { + "epoch": 1.8251782242384964, + "grad_norm": 0.8785907626152039, + "learning_rate": 1.7553752921232809e-06, + "loss": 0.1079, + "step": 11265 + }, + { + "epoch": 1.8253402462734933, + "grad_norm": 0.8184788823127747, + "learning_rate": 1.7549578474605661e-06, + "loss": 0.0985, + "step": 11266 + }, + { + "epoch": 1.8255022683084898, + "grad_norm": 0.7712414860725403, + "learning_rate": 1.754540425594338e-06, + "loss": 0.095, + "step": 11267 + }, + { + "epoch": 1.8256642903434868, + "grad_norm": 0.8796233534812927, + "learning_rate": 1.754123026537369e-06, + "loss": 0.1065, + "step": 11268 + }, + { + "epoch": 1.8258263123784835, + "grad_norm": 0.9053205251693726, + "learning_rate": 1.7537056503024314e-06, + "loss": 0.0946, + "step": 11269 + }, + { + "epoch": 1.8259883344134802, + "grad_norm": 0.7080380916595459, + "learning_rate": 1.7532882969022941e-06, + "loss": 0.0892, + "step": 11270 + }, + { + "epoch": 1.8261503564484771, + "grad_norm": 0.8562909364700317, + "learning_rate": 1.7528709663497282e-06, + "loss": 0.1094, + "step": 11271 + }, + { + "epoch": 1.8263123784834736, + "grad_norm": 0.7841035723686218, + "learning_rate": 1.752453658657502e-06, + "loss": 0.0991, + "step": 11272 + }, + { + "epoch": 1.8264744005184705, + "grad_norm": 0.7752327919006348, + "learning_rate": 1.752036373838385e-06, + "loss": 0.0916, + "step": 11273 + }, + { + "epoch": 1.8266364225534673, + "grad_norm": 0.7416565418243408, + "learning_rate": 1.7516191119051456e-06, + "loss": 0.0904, + "step": 11274 + }, + { + "epoch": 1.826798444588464, + "grad_norm": 0.8184176683425903, + "learning_rate": 1.7512018728705498e-06, + "loss": 0.1011, + "step": 11275 + }, + { + "epoch": 1.826960466623461, + "grad_norm": 0.7727778553962708, + "learning_rate": 1.7507846567473643e-06, + "loss": 0.0946, + "step": 11276 + }, + { + "epoch": 1.8271224886584574, + "grad_norm": 0.7771718502044678, + "learning_rate": 1.7503674635483558e-06, + "loss": 0.0977, + "step": 11277 + }, + { + "epoch": 1.8272845106934543, + "grad_norm": 0.958016037940979, + "learning_rate": 1.749950293286289e-06, + "loss": 0.1002, + "step": 11278 + }, + { + "epoch": 1.827446532728451, + "grad_norm": 0.7841407060623169, + "learning_rate": 1.7495331459739278e-06, + "loss": 0.1045, + "step": 11279 + }, + { + "epoch": 1.8276085547634477, + "grad_norm": 0.7898461818695068, + "learning_rate": 1.7491160216240368e-06, + "loss": 0.0953, + "step": 11280 + }, + { + "epoch": 1.8277705767984447, + "grad_norm": 0.8376603126525879, + "learning_rate": 1.7486989202493775e-06, + "loss": 0.1077, + "step": 11281 + }, + { + "epoch": 1.8279325988334414, + "grad_norm": 0.7808676958084106, + "learning_rate": 1.7482818418627134e-06, + "loss": 0.0983, + "step": 11282 + }, + { + "epoch": 1.828094620868438, + "grad_norm": 0.8467119932174683, + "learning_rate": 1.7478647864768067e-06, + "loss": 0.1026, + "step": 11283 + }, + { + "epoch": 1.8282566429034348, + "grad_norm": 0.7723277807235718, + "learning_rate": 1.7474477541044165e-06, + "loss": 0.0954, + "step": 11284 + }, + { + "epoch": 1.8284186649384315, + "grad_norm": 0.8611959218978882, + "learning_rate": 1.7470307447583047e-06, + "loss": 0.0971, + "step": 11285 + }, + { + "epoch": 1.8285806869734285, + "grad_norm": 0.7999573945999146, + "learning_rate": 1.746613758451228e-06, + "loss": 0.0988, + "step": 11286 + }, + { + "epoch": 1.8287427090084252, + "grad_norm": 0.8502542972564697, + "learning_rate": 1.746196795195949e-06, + "loss": 0.0988, + "step": 11287 + }, + { + "epoch": 1.8289047310434219, + "grad_norm": 0.6951650977134705, + "learning_rate": 1.7457798550052232e-06, + "loss": 0.0888, + "step": 11288 + }, + { + "epoch": 1.8290667530784188, + "grad_norm": 0.7761609554290771, + "learning_rate": 1.7453629378918094e-06, + "loss": 0.097, + "step": 11289 + }, + { + "epoch": 1.8292287751134153, + "grad_norm": 0.882725715637207, + "learning_rate": 1.744946043868463e-06, + "loss": 0.1073, + "step": 11290 + }, + { + "epoch": 1.8293907971484122, + "grad_norm": 0.8751239776611328, + "learning_rate": 1.7445291729479397e-06, + "loss": 0.1095, + "step": 11291 + }, + { + "epoch": 1.829552819183409, + "grad_norm": 0.7810752987861633, + "learning_rate": 1.7441123251429968e-06, + "loss": 0.0916, + "step": 11292 + }, + { + "epoch": 1.8297148412184057, + "grad_norm": 0.8557392358779907, + "learning_rate": 1.7436955004663868e-06, + "loss": 0.0986, + "step": 11293 + }, + { + "epoch": 1.8298768632534026, + "grad_norm": 0.88213050365448, + "learning_rate": 1.7432786989308648e-06, + "loss": 0.1046, + "step": 11294 + }, + { + "epoch": 1.830038885288399, + "grad_norm": 0.8699725270271301, + "learning_rate": 1.7428619205491831e-06, + "loss": 0.1018, + "step": 11295 + }, + { + "epoch": 1.830200907323396, + "grad_norm": 0.7817003726959229, + "learning_rate": 1.7424451653340934e-06, + "loss": 0.0926, + "step": 11296 + }, + { + "epoch": 1.8303629293583927, + "grad_norm": 0.8012980818748474, + "learning_rate": 1.7420284332983495e-06, + "loss": 0.1063, + "step": 11297 + }, + { + "epoch": 1.8305249513933894, + "grad_norm": 0.7765569686889648, + "learning_rate": 1.7416117244547014e-06, + "loss": 0.0899, + "step": 11298 + }, + { + "epoch": 1.8306869734283864, + "grad_norm": 0.8100259900093079, + "learning_rate": 1.7411950388158987e-06, + "loss": 0.1055, + "step": 11299 + }, + { + "epoch": 1.8308489954633829, + "grad_norm": 0.996043860912323, + "learning_rate": 1.7407783763946911e-06, + "loss": 0.1143, + "step": 11300 + }, + { + "epoch": 1.8310110174983798, + "grad_norm": 0.9620465636253357, + "learning_rate": 1.7403617372038293e-06, + "loss": 0.1167, + "step": 11301 + }, + { + "epoch": 1.8311730395333765, + "grad_norm": 0.7914530038833618, + "learning_rate": 1.7399451212560593e-06, + "loss": 0.1029, + "step": 11302 + }, + { + "epoch": 1.8313350615683732, + "grad_norm": 0.8515611886978149, + "learning_rate": 1.7395285285641292e-06, + "loss": 0.1059, + "step": 11303 + }, + { + "epoch": 1.8314970836033702, + "grad_norm": 0.8178187608718872, + "learning_rate": 1.7391119591407863e-06, + "loss": 0.0974, + "step": 11304 + }, + { + "epoch": 1.8316591056383669, + "grad_norm": 0.8800912499427795, + "learning_rate": 1.7386954129987754e-06, + "loss": 0.1066, + "step": 11305 + }, + { + "epoch": 1.8318211276733636, + "grad_norm": 0.8402878642082214, + "learning_rate": 1.7382788901508426e-06, + "loss": 0.1038, + "step": 11306 + }, + { + "epoch": 1.8319831497083603, + "grad_norm": 0.8607720732688904, + "learning_rate": 1.7378623906097333e-06, + "loss": 0.1116, + "step": 11307 + }, + { + "epoch": 1.832145171743357, + "grad_norm": 0.8464848399162292, + "learning_rate": 1.7374459143881899e-06, + "loss": 0.1, + "step": 11308 + }, + { + "epoch": 1.832307193778354, + "grad_norm": 0.7194477319717407, + "learning_rate": 1.737029461498957e-06, + "loss": 0.0803, + "step": 11309 + }, + { + "epoch": 1.8324692158133506, + "grad_norm": 0.9045358300209045, + "learning_rate": 1.7366130319547747e-06, + "loss": 0.1053, + "step": 11310 + }, + { + "epoch": 1.8326312378483474, + "grad_norm": 0.8817188143730164, + "learning_rate": 1.736196625768387e-06, + "loss": 0.1033, + "step": 11311 + }, + { + "epoch": 1.8327932598833443, + "grad_norm": 0.7992186546325684, + "learning_rate": 1.735780242952534e-06, + "loss": 0.0999, + "step": 11312 + }, + { + "epoch": 1.8329552819183408, + "grad_norm": 0.711328387260437, + "learning_rate": 1.7353638835199568e-06, + "loss": 0.0875, + "step": 11313 + }, + { + "epoch": 1.8331173039533377, + "grad_norm": 0.8536904454231262, + "learning_rate": 1.7349475474833938e-06, + "loss": 0.1047, + "step": 11314 + }, + { + "epoch": 1.8332793259883344, + "grad_norm": 0.7888259887695312, + "learning_rate": 1.7345312348555843e-06, + "loss": 0.1089, + "step": 11315 + }, + { + "epoch": 1.8334413480233311, + "grad_norm": 0.8059415221214294, + "learning_rate": 1.7341149456492672e-06, + "loss": 0.0995, + "step": 11316 + }, + { + "epoch": 1.833603370058328, + "grad_norm": 0.8481978178024292, + "learning_rate": 1.733698679877179e-06, + "loss": 0.1035, + "step": 11317 + }, + { + "epoch": 1.8337653920933246, + "grad_norm": 0.72404545545578, + "learning_rate": 1.7332824375520574e-06, + "loss": 0.0971, + "step": 11318 + }, + { + "epoch": 1.8339274141283215, + "grad_norm": 0.8225178718566895, + "learning_rate": 1.7328662186866373e-06, + "loss": 0.0951, + "step": 11319 + }, + { + "epoch": 1.8340894361633182, + "grad_norm": 0.7030364871025085, + "learning_rate": 1.7324500232936536e-06, + "loss": 0.0844, + "step": 11320 + }, + { + "epoch": 1.834251458198315, + "grad_norm": 0.9015457630157471, + "learning_rate": 1.7320338513858425e-06, + "loss": 0.1048, + "step": 11321 + }, + { + "epoch": 1.8344134802333119, + "grad_norm": 0.8932374715805054, + "learning_rate": 1.731617702975938e-06, + "loss": 0.1184, + "step": 11322 + }, + { + "epoch": 1.8345755022683083, + "grad_norm": 1.008551836013794, + "learning_rate": 1.7312015780766714e-06, + "loss": 0.1336, + "step": 11323 + }, + { + "epoch": 1.8347375243033053, + "grad_norm": 0.7870475053787231, + "learning_rate": 1.7307854767007756e-06, + "loss": 0.1007, + "step": 11324 + }, + { + "epoch": 1.834899546338302, + "grad_norm": 0.7753018736839294, + "learning_rate": 1.7303693988609837e-06, + "loss": 0.0904, + "step": 11325 + }, + { + "epoch": 1.8350615683732987, + "grad_norm": 0.9246848225593567, + "learning_rate": 1.7299533445700253e-06, + "loss": 0.1144, + "step": 11326 + }, + { + "epoch": 1.8352235904082956, + "grad_norm": 0.7962560057640076, + "learning_rate": 1.7295373138406318e-06, + "loss": 0.1006, + "step": 11327 + }, + { + "epoch": 1.8353856124432921, + "grad_norm": 0.8699910044670105, + "learning_rate": 1.7291213066855312e-06, + "loss": 0.1003, + "step": 11328 + }, + { + "epoch": 1.835547634478289, + "grad_norm": 0.8488613367080688, + "learning_rate": 1.7287053231174528e-06, + "loss": 0.1104, + "step": 11329 + }, + { + "epoch": 1.8357096565132858, + "grad_norm": 0.8561351895332336, + "learning_rate": 1.7282893631491253e-06, + "loss": 0.1013, + "step": 11330 + }, + { + "epoch": 1.8358716785482825, + "grad_norm": 0.7828042507171631, + "learning_rate": 1.7278734267932764e-06, + "loss": 0.0941, + "step": 11331 + }, + { + "epoch": 1.8360337005832794, + "grad_norm": 0.916395366191864, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.1124, + "step": 11332 + }, + { + "epoch": 1.8361957226182761, + "grad_norm": 0.8568091988563538, + "learning_rate": 1.7270416249699179e-06, + "loss": 0.0995, + "step": 11333 + }, + { + "epoch": 1.8363577446532728, + "grad_norm": 0.7647148966789246, + "learning_rate": 1.7266257595278591e-06, + "loss": 0.0951, + "step": 11334 + }, + { + "epoch": 1.8365197666882696, + "grad_norm": 0.8157481551170349, + "learning_rate": 1.726209917749181e-06, + "loss": 0.0979, + "step": 11335 + }, + { + "epoch": 1.8366817887232663, + "grad_norm": 0.8966127634048462, + "learning_rate": 1.725794099646607e-06, + "loss": 0.1024, + "step": 11336 + }, + { + "epoch": 1.8368438107582632, + "grad_norm": 0.8767910003662109, + "learning_rate": 1.72537830523286e-06, + "loss": 0.1108, + "step": 11337 + }, + { + "epoch": 1.83700583279326, + "grad_norm": 0.8303874731063843, + "learning_rate": 1.7249625345206623e-06, + "loss": 0.0929, + "step": 11338 + }, + { + "epoch": 1.8371678548282566, + "grad_norm": 0.7946867346763611, + "learning_rate": 1.7245467875227345e-06, + "loss": 0.1038, + "step": 11339 + }, + { + "epoch": 1.8373298768632536, + "grad_norm": 0.7958531379699707, + "learning_rate": 1.7241310642517998e-06, + "loss": 0.0946, + "step": 11340 + }, + { + "epoch": 1.83749189889825, + "grad_norm": 0.7723376750946045, + "learning_rate": 1.7237153647205762e-06, + "loss": 0.1002, + "step": 11341 + }, + { + "epoch": 1.837653920933247, + "grad_norm": 1.005839228630066, + "learning_rate": 1.7232996889417846e-06, + "loss": 0.109, + "step": 11342 + }, + { + "epoch": 1.8378159429682437, + "grad_norm": 0.7882110476493835, + "learning_rate": 1.7228840369281424e-06, + "loss": 0.0978, + "step": 11343 + }, + { + "epoch": 1.8379779650032404, + "grad_norm": 0.6778146028518677, + "learning_rate": 1.7224684086923677e-06, + "loss": 0.0867, + "step": 11344 + }, + { + "epoch": 1.8381399870382373, + "grad_norm": 0.8371578454971313, + "learning_rate": 1.722052804247179e-06, + "loss": 0.104, + "step": 11345 + }, + { + "epoch": 1.8383020090732338, + "grad_norm": 0.684536874294281, + "learning_rate": 1.7216372236052914e-06, + "loss": 0.0873, + "step": 11346 + }, + { + "epoch": 1.8384640311082308, + "grad_norm": 0.777190089225769, + "learning_rate": 1.7212216667794213e-06, + "loss": 0.1056, + "step": 11347 + }, + { + "epoch": 1.8386260531432275, + "grad_norm": 0.7780445218086243, + "learning_rate": 1.7208061337822828e-06, + "loss": 0.0936, + "step": 11348 + }, + { + "epoch": 1.8387880751782242, + "grad_norm": 0.8263536095619202, + "learning_rate": 1.7203906246265921e-06, + "loss": 0.095, + "step": 11349 + }, + { + "epoch": 1.8389500972132211, + "grad_norm": 0.7346975803375244, + "learning_rate": 1.7199751393250614e-06, + "loss": 0.0939, + "step": 11350 + }, + { + "epoch": 1.8391121192482176, + "grad_norm": 0.8008426427841187, + "learning_rate": 1.719559677890404e-06, + "loss": 0.0932, + "step": 11351 + }, + { + "epoch": 1.8392741412832145, + "grad_norm": 0.8390111923217773, + "learning_rate": 1.7191442403353314e-06, + "loss": 0.0961, + "step": 11352 + }, + { + "epoch": 1.8394361633182112, + "grad_norm": 0.8071438074111938, + "learning_rate": 1.7187288266725549e-06, + "loss": 0.094, + "step": 11353 + }, + { + "epoch": 1.839598185353208, + "grad_norm": 0.8307203054428101, + "learning_rate": 1.7183134369147866e-06, + "loss": 0.1013, + "step": 11354 + }, + { + "epoch": 1.839760207388205, + "grad_norm": 0.8359047174453735, + "learning_rate": 1.717898071074735e-06, + "loss": 0.1013, + "step": 11355 + }, + { + "epoch": 1.8399222294232016, + "grad_norm": 0.925969660282135, + "learning_rate": 1.71748272916511e-06, + "loss": 0.1027, + "step": 11356 + }, + { + "epoch": 1.8400842514581983, + "grad_norm": 0.7569020986557007, + "learning_rate": 1.7170674111986202e-06, + "loss": 0.0964, + "step": 11357 + }, + { + "epoch": 1.840246273493195, + "grad_norm": 0.8798254728317261, + "learning_rate": 1.716652117187972e-06, + "loss": 0.105, + "step": 11358 + }, + { + "epoch": 1.8404082955281917, + "grad_norm": 0.757815957069397, + "learning_rate": 1.7162368471458738e-06, + "loss": 0.0955, + "step": 11359 + }, + { + "epoch": 1.8405703175631887, + "grad_norm": 0.8111533522605896, + "learning_rate": 1.7158216010850318e-06, + "loss": 0.0971, + "step": 11360 + }, + { + "epoch": 1.8407323395981854, + "grad_norm": 0.793456494808197, + "learning_rate": 1.7154063790181507e-06, + "loss": 0.1047, + "step": 11361 + }, + { + "epoch": 1.840894361633182, + "grad_norm": 0.7648953199386597, + "learning_rate": 1.7149911809579361e-06, + "loss": 0.0951, + "step": 11362 + }, + { + "epoch": 1.841056383668179, + "grad_norm": 0.7821177244186401, + "learning_rate": 1.7145760069170905e-06, + "loss": 0.0973, + "step": 11363 + }, + { + "epoch": 1.8412184057031755, + "grad_norm": 0.8774672150611877, + "learning_rate": 1.7141608569083195e-06, + "loss": 0.1019, + "step": 11364 + }, + { + "epoch": 1.8413804277381725, + "grad_norm": 0.815719485282898, + "learning_rate": 1.7137457309443245e-06, + "loss": 0.0915, + "step": 11365 + }, + { + "epoch": 1.8415424497731692, + "grad_norm": 0.8151124715805054, + "learning_rate": 1.7133306290378077e-06, + "loss": 0.094, + "step": 11366 + }, + { + "epoch": 1.8417044718081659, + "grad_norm": 0.6767386794090271, + "learning_rate": 1.7129155512014692e-06, + "loss": 0.0797, + "step": 11367 + }, + { + "epoch": 1.8418664938431628, + "grad_norm": 0.8355568051338196, + "learning_rate": 1.7125004974480102e-06, + "loss": 0.0956, + "step": 11368 + }, + { + "epoch": 1.8420285158781593, + "grad_norm": 0.9013076424598694, + "learning_rate": 1.7120854677901312e-06, + "loss": 0.1056, + "step": 11369 + }, + { + "epoch": 1.8421905379131562, + "grad_norm": 0.7785685062408447, + "learning_rate": 1.7116704622405295e-06, + "loss": 0.0867, + "step": 11370 + }, + { + "epoch": 1.842352559948153, + "grad_norm": 0.7849324345588684, + "learning_rate": 1.7112554808119043e-06, + "loss": 0.0983, + "step": 11371 + }, + { + "epoch": 1.8425145819831497, + "grad_norm": 0.9096629023551941, + "learning_rate": 1.7108405235169511e-06, + "loss": 0.1079, + "step": 11372 + }, + { + "epoch": 1.8426766040181466, + "grad_norm": 0.7472807765007019, + "learning_rate": 1.71042559036837e-06, + "loss": 0.0946, + "step": 11373 + }, + { + "epoch": 1.842838626053143, + "grad_norm": 0.6788498163223267, + "learning_rate": 1.7100106813788544e-06, + "loss": 0.0785, + "step": 11374 + }, + { + "epoch": 1.84300064808814, + "grad_norm": 0.8583519458770752, + "learning_rate": 1.7095957965611008e-06, + "loss": 0.1001, + "step": 11375 + }, + { + "epoch": 1.8431626701231367, + "grad_norm": 0.9140331149101257, + "learning_rate": 1.7091809359278025e-06, + "loss": 0.1117, + "step": 11376 + }, + { + "epoch": 1.8433246921581334, + "grad_norm": 0.8352115154266357, + "learning_rate": 1.7087660994916533e-06, + "loss": 0.1087, + "step": 11377 + }, + { + "epoch": 1.8434867141931304, + "grad_norm": 0.8460627198219299, + "learning_rate": 1.7083512872653477e-06, + "loss": 0.1032, + "step": 11378 + }, + { + "epoch": 1.8436487362281269, + "grad_norm": 0.8629457950592041, + "learning_rate": 1.7079364992615766e-06, + "loss": 0.1015, + "step": 11379 + }, + { + "epoch": 1.8438107582631238, + "grad_norm": 0.8817080855369568, + "learning_rate": 1.7075217354930324e-06, + "loss": 0.1085, + "step": 11380 + }, + { + "epoch": 1.8439727802981205, + "grad_norm": 0.9188164472579956, + "learning_rate": 1.7071069959724046e-06, + "loss": 0.0987, + "step": 11381 + }, + { + "epoch": 1.8441348023331172, + "grad_norm": 0.7526326179504395, + "learning_rate": 1.7066922807123834e-06, + "loss": 0.0948, + "step": 11382 + }, + { + "epoch": 1.8442968243681142, + "grad_norm": 0.7177049517631531, + "learning_rate": 1.7062775897256593e-06, + "loss": 0.0892, + "step": 11383 + }, + { + "epoch": 1.8444588464031109, + "grad_norm": 0.8121328353881836, + "learning_rate": 1.7058629230249207e-06, + "loss": 0.0961, + "step": 11384 + }, + { + "epoch": 1.8446208684381076, + "grad_norm": 0.7545210123062134, + "learning_rate": 1.7054482806228543e-06, + "loss": 0.0927, + "step": 11385 + }, + { + "epoch": 1.8447828904731045, + "grad_norm": 0.8250703811645508, + "learning_rate": 1.7050336625321484e-06, + "loss": 0.095, + "step": 11386 + }, + { + "epoch": 1.844944912508101, + "grad_norm": 0.772592306137085, + "learning_rate": 1.7046190687654873e-06, + "loss": 0.0977, + "step": 11387 + }, + { + "epoch": 1.845106934543098, + "grad_norm": 0.7452101111412048, + "learning_rate": 1.704204499335559e-06, + "loss": 0.0897, + "step": 11388 + }, + { + "epoch": 1.8452689565780946, + "grad_norm": 0.7722578048706055, + "learning_rate": 1.703789954255047e-06, + "loss": 0.0977, + "step": 11389 + }, + { + "epoch": 1.8454309786130914, + "grad_norm": 0.8865172863006592, + "learning_rate": 1.7033754335366356e-06, + "loss": 0.1031, + "step": 11390 + }, + { + "epoch": 1.8455930006480883, + "grad_norm": 0.8430839776992798, + "learning_rate": 1.7029609371930076e-06, + "loss": 0.1073, + "step": 11391 + }, + { + "epoch": 1.8457550226830848, + "grad_norm": 0.8359355330467224, + "learning_rate": 1.7025464652368464e-06, + "loss": 0.101, + "step": 11392 + }, + { + "epoch": 1.8459170447180817, + "grad_norm": 0.8882021307945251, + "learning_rate": 1.7021320176808343e-06, + "loss": 0.1072, + "step": 11393 + }, + { + "epoch": 1.8460790667530784, + "grad_norm": 0.7827454209327698, + "learning_rate": 1.701717594537651e-06, + "loss": 0.0927, + "step": 11394 + }, + { + "epoch": 1.8462410887880751, + "grad_norm": 0.8166926503181458, + "learning_rate": 1.7013031958199783e-06, + "loss": 0.1026, + "step": 11395 + }, + { + "epoch": 1.846403110823072, + "grad_norm": 0.7511534690856934, + "learning_rate": 1.7008888215404933e-06, + "loss": 0.0942, + "step": 11396 + }, + { + "epoch": 1.8465651328580686, + "grad_norm": 0.8353126645088196, + "learning_rate": 1.7004744717118777e-06, + "loss": 0.1063, + "step": 11397 + }, + { + "epoch": 1.8467271548930655, + "grad_norm": 0.6988396644592285, + "learning_rate": 1.7000601463468088e-06, + "loss": 0.0877, + "step": 11398 + }, + { + "epoch": 1.8468891769280622, + "grad_norm": 0.8091744184494019, + "learning_rate": 1.6996458454579632e-06, + "loss": 0.0984, + "step": 11399 + }, + { + "epoch": 1.847051198963059, + "grad_norm": 0.7793344855308533, + "learning_rate": 1.6992315690580178e-06, + "loss": 0.093, + "step": 11400 + }, + { + "epoch": 1.8472132209980558, + "grad_norm": 0.7006025910377502, + "learning_rate": 1.6988173171596479e-06, + "loss": 0.0805, + "step": 11401 + }, + { + "epoch": 1.8473752430330523, + "grad_norm": 0.7687976956367493, + "learning_rate": 1.6984030897755304e-06, + "loss": 0.0942, + "step": 11402 + }, + { + "epoch": 1.8475372650680493, + "grad_norm": 0.8496354222297668, + "learning_rate": 1.697988886918338e-06, + "loss": 0.1107, + "step": 11403 + }, + { + "epoch": 1.847699287103046, + "grad_norm": 0.8020095229148865, + "learning_rate": 1.6975747086007454e-06, + "loss": 0.0906, + "step": 11404 + }, + { + "epoch": 1.8478613091380427, + "grad_norm": 0.8590508699417114, + "learning_rate": 1.6971605548354244e-06, + "loss": 0.0976, + "step": 11405 + }, + { + "epoch": 1.8480233311730396, + "grad_norm": 0.8174715042114258, + "learning_rate": 1.6967464256350468e-06, + "loss": 0.0967, + "step": 11406 + }, + { + "epoch": 1.8481853532080363, + "grad_norm": 0.8774664998054504, + "learning_rate": 1.6963323210122856e-06, + "loss": 0.0971, + "step": 11407 + }, + { + "epoch": 1.848347375243033, + "grad_norm": 0.8936482667922974, + "learning_rate": 1.6959182409798111e-06, + "loss": 0.1196, + "step": 11408 + }, + { + "epoch": 1.8485093972780298, + "grad_norm": 0.7813578248023987, + "learning_rate": 1.6955041855502918e-06, + "loss": 0.1045, + "step": 11409 + }, + { + "epoch": 1.8486714193130265, + "grad_norm": 0.8374517560005188, + "learning_rate": 1.695090154736398e-06, + "loss": 0.1012, + "step": 11410 + }, + { + "epoch": 1.8488334413480234, + "grad_norm": 0.7886912822723389, + "learning_rate": 1.694676148550797e-06, + "loss": 0.0903, + "step": 11411 + }, + { + "epoch": 1.8489954633830201, + "grad_norm": 0.8474147915840149, + "learning_rate": 1.6942621670061574e-06, + "loss": 0.1001, + "step": 11412 + }, + { + "epoch": 1.8491574854180168, + "grad_norm": 0.7622694373130798, + "learning_rate": 1.693848210115146e-06, + "loss": 0.0996, + "step": 11413 + }, + { + "epoch": 1.8493195074530138, + "grad_norm": 0.7860474586486816, + "learning_rate": 1.693434277890428e-06, + "loss": 0.0952, + "step": 11414 + }, + { + "epoch": 1.8494815294880103, + "grad_norm": 0.8053168058395386, + "learning_rate": 1.693020370344669e-06, + "loss": 0.0991, + "step": 11415 + }, + { + "epoch": 1.8496435515230072, + "grad_norm": 0.769270658493042, + "learning_rate": 1.692606487490534e-06, + "loss": 0.1014, + "step": 11416 + }, + { + "epoch": 1.849805573558004, + "grad_norm": 0.7316917181015015, + "learning_rate": 1.6921926293406874e-06, + "loss": 0.0897, + "step": 11417 + }, + { + "epoch": 1.8499675955930006, + "grad_norm": 0.7723559737205505, + "learning_rate": 1.6917787959077907e-06, + "loss": 0.0973, + "step": 11418 + }, + { + "epoch": 1.8501296176279975, + "grad_norm": 0.8190063238143921, + "learning_rate": 1.6913649872045076e-06, + "loss": 0.1082, + "step": 11419 + }, + { + "epoch": 1.850291639662994, + "grad_norm": 0.8806280493736267, + "learning_rate": 1.6909512032434984e-06, + "loss": 0.1103, + "step": 11420 + }, + { + "epoch": 1.850453661697991, + "grad_norm": 0.7780027985572815, + "learning_rate": 1.6905374440374245e-06, + "loss": 0.1017, + "step": 11421 + }, + { + "epoch": 1.8506156837329877, + "grad_norm": 0.9774267077445984, + "learning_rate": 1.6901237095989464e-06, + "loss": 0.1157, + "step": 11422 + }, + { + "epoch": 1.8507777057679844, + "grad_norm": 0.7308449745178223, + "learning_rate": 1.689709999940723e-06, + "loss": 0.0916, + "step": 11423 + }, + { + "epoch": 1.8509397278029813, + "grad_norm": 0.7397428154945374, + "learning_rate": 1.6892963150754128e-06, + "loss": 0.0873, + "step": 11424 + }, + { + "epoch": 1.8511017498379778, + "grad_norm": 0.8941960334777832, + "learning_rate": 1.688882655015672e-06, + "loss": 0.102, + "step": 11425 + }, + { + "epoch": 1.8512637718729748, + "grad_norm": 0.77532958984375, + "learning_rate": 1.6884690197741608e-06, + "loss": 0.0966, + "step": 11426 + }, + { + "epoch": 1.8514257939079715, + "grad_norm": 0.834740936756134, + "learning_rate": 1.6880554093635331e-06, + "loss": 0.1077, + "step": 11427 + }, + { + "epoch": 1.8515878159429682, + "grad_norm": 0.8465557098388672, + "learning_rate": 1.6876418237964453e-06, + "loss": 0.1014, + "step": 11428 + }, + { + "epoch": 1.851749837977965, + "grad_norm": 0.8378772139549255, + "learning_rate": 1.6872282630855519e-06, + "loss": 0.0937, + "step": 11429 + }, + { + "epoch": 1.8519118600129616, + "grad_norm": 0.9061734676361084, + "learning_rate": 1.6868147272435057e-06, + "loss": 0.1131, + "step": 11430 + }, + { + "epoch": 1.8520738820479585, + "grad_norm": 0.8114215135574341, + "learning_rate": 1.6864012162829624e-06, + "loss": 0.0961, + "step": 11431 + }, + { + "epoch": 1.8522359040829552, + "grad_norm": 0.8266593217849731, + "learning_rate": 1.6859877302165723e-06, + "loss": 0.0923, + "step": 11432 + }, + { + "epoch": 1.852397926117952, + "grad_norm": 0.7605752944946289, + "learning_rate": 1.6855742690569881e-06, + "loss": 0.098, + "step": 11433 + }, + { + "epoch": 1.8525599481529489, + "grad_norm": 0.77652508020401, + "learning_rate": 1.6851608328168589e-06, + "loss": 0.0995, + "step": 11434 + }, + { + "epoch": 1.8527219701879456, + "grad_norm": 0.7098603248596191, + "learning_rate": 1.6847474215088382e-06, + "loss": 0.0844, + "step": 11435 + }, + { + "epoch": 1.8528839922229423, + "grad_norm": 0.9365832209587097, + "learning_rate": 1.6843340351455728e-06, + "loss": 0.1145, + "step": 11436 + }, + { + "epoch": 1.8530460142579392, + "grad_norm": 0.8150047659873962, + "learning_rate": 1.6839206737397126e-06, + "loss": 0.0992, + "step": 11437 + }, + { + "epoch": 1.8532080362929357, + "grad_norm": 0.7815777659416199, + "learning_rate": 1.6835073373039045e-06, + "loss": 0.092, + "step": 11438 + }, + { + "epoch": 1.8533700583279327, + "grad_norm": 0.8175069689750671, + "learning_rate": 1.6830940258507955e-06, + "loss": 0.0988, + "step": 11439 + }, + { + "epoch": 1.8535320803629294, + "grad_norm": 0.8436422944068909, + "learning_rate": 1.6826807393930334e-06, + "loss": 0.0944, + "step": 11440 + }, + { + "epoch": 1.853694102397926, + "grad_norm": 0.6752289533615112, + "learning_rate": 1.682267477943262e-06, + "loss": 0.0803, + "step": 11441 + }, + { + "epoch": 1.853856124432923, + "grad_norm": 0.7708499431610107, + "learning_rate": 1.6818542415141273e-06, + "loss": 0.0933, + "step": 11442 + }, + { + "epoch": 1.8540181464679195, + "grad_norm": 0.8612736463546753, + "learning_rate": 1.6814410301182732e-06, + "loss": 0.0977, + "step": 11443 + }, + { + "epoch": 1.8541801685029164, + "grad_norm": 0.8443825840950012, + "learning_rate": 1.6810278437683419e-06, + "loss": 0.1044, + "step": 11444 + }, + { + "epoch": 1.8543421905379132, + "grad_norm": 0.8466036319732666, + "learning_rate": 1.680614682476977e-06, + "loss": 0.0982, + "step": 11445 + }, + { + "epoch": 1.8545042125729099, + "grad_norm": 0.9107270240783691, + "learning_rate": 1.6802015462568205e-06, + "loss": 0.1083, + "step": 11446 + }, + { + "epoch": 1.8546662346079068, + "grad_norm": 0.7344313859939575, + "learning_rate": 1.6797884351205123e-06, + "loss": 0.0926, + "step": 11447 + }, + { + "epoch": 1.8548282566429033, + "grad_norm": 0.8583900332450867, + "learning_rate": 1.6793753490806939e-06, + "loss": 0.1143, + "step": 11448 + }, + { + "epoch": 1.8549902786779002, + "grad_norm": 0.7386242151260376, + "learning_rate": 1.678962288150003e-06, + "loss": 0.0855, + "step": 11449 + }, + { + "epoch": 1.855152300712897, + "grad_norm": 0.7551776766777039, + "learning_rate": 1.678549252341079e-06, + "loss": 0.0893, + "step": 11450 + }, + { + "epoch": 1.8553143227478937, + "grad_norm": 0.8036359548568726, + "learning_rate": 1.6781362416665602e-06, + "loss": 0.0918, + "step": 11451 + }, + { + "epoch": 1.8554763447828906, + "grad_norm": 0.7629950046539307, + "learning_rate": 1.6777232561390844e-06, + "loss": 0.1067, + "step": 11452 + }, + { + "epoch": 1.855638366817887, + "grad_norm": 0.7417677044868469, + "learning_rate": 1.6773102957712866e-06, + "loss": 0.087, + "step": 11453 + }, + { + "epoch": 1.855800388852884, + "grad_norm": 0.7128724455833435, + "learning_rate": 1.6768973605758021e-06, + "loss": 0.091, + "step": 11454 + }, + { + "epoch": 1.8559624108878807, + "grad_norm": 0.721200704574585, + "learning_rate": 1.6764844505652677e-06, + "loss": 0.0895, + "step": 11455 + }, + { + "epoch": 1.8561244329228774, + "grad_norm": 0.7439916729927063, + "learning_rate": 1.6760715657523158e-06, + "loss": 0.0916, + "step": 11456 + }, + { + "epoch": 1.8562864549578744, + "grad_norm": 0.8428785800933838, + "learning_rate": 1.6756587061495805e-06, + "loss": 0.1077, + "step": 11457 + }, + { + "epoch": 1.856448476992871, + "grad_norm": 0.7453662753105164, + "learning_rate": 1.6752458717696928e-06, + "loss": 0.089, + "step": 11458 + }, + { + "epoch": 1.8566104990278678, + "grad_norm": 0.8296511173248291, + "learning_rate": 1.6748330626252862e-06, + "loss": 0.1011, + "step": 11459 + }, + { + "epoch": 1.8567725210628645, + "grad_norm": 0.8146061301231384, + "learning_rate": 1.674420278728991e-06, + "loss": 0.0924, + "step": 11460 + }, + { + "epoch": 1.8569345430978612, + "grad_norm": 0.8605503439903259, + "learning_rate": 1.674007520093438e-06, + "loss": 0.0933, + "step": 11461 + }, + { + "epoch": 1.8570965651328581, + "grad_norm": 0.820659875869751, + "learning_rate": 1.6735947867312553e-06, + "loss": 0.0988, + "step": 11462 + }, + { + "epoch": 1.8572585871678549, + "grad_norm": 0.8566899299621582, + "learning_rate": 1.6731820786550717e-06, + "loss": 0.1045, + "step": 11463 + }, + { + "epoch": 1.8574206092028516, + "grad_norm": 0.8467245101928711, + "learning_rate": 1.6727693958775172e-06, + "loss": 0.1007, + "step": 11464 + }, + { + "epoch": 1.8575826312378485, + "grad_norm": 0.8376954197883606, + "learning_rate": 1.6723567384112161e-06, + "loss": 0.1074, + "step": 11465 + }, + { + "epoch": 1.857744653272845, + "grad_norm": 0.7786714434623718, + "learning_rate": 1.671944106268797e-06, + "loss": 0.0937, + "step": 11466 + }, + { + "epoch": 1.857906675307842, + "grad_norm": 0.9477052688598633, + "learning_rate": 1.6715314994628834e-06, + "loss": 0.1135, + "step": 11467 + }, + { + "epoch": 1.8580686973428386, + "grad_norm": 0.9861614108085632, + "learning_rate": 1.671118918006101e-06, + "loss": 0.1181, + "step": 11468 + }, + { + "epoch": 1.8582307193778353, + "grad_norm": 0.8557963371276855, + "learning_rate": 1.6707063619110742e-06, + "loss": 0.1045, + "step": 11469 + }, + { + "epoch": 1.8583927414128323, + "grad_norm": 0.7471309900283813, + "learning_rate": 1.6702938311904262e-06, + "loss": 0.0944, + "step": 11470 + }, + { + "epoch": 1.8585547634478288, + "grad_norm": 0.7850518822669983, + "learning_rate": 1.669881325856779e-06, + "loss": 0.0951, + "step": 11471 + }, + { + "epoch": 1.8587167854828257, + "grad_norm": 0.8640868067741394, + "learning_rate": 1.6694688459227545e-06, + "loss": 0.1049, + "step": 11472 + }, + { + "epoch": 1.8588788075178224, + "grad_norm": 0.909514844417572, + "learning_rate": 1.6690563914009728e-06, + "loss": 0.0996, + "step": 11473 + }, + { + "epoch": 1.8590408295528191, + "grad_norm": 0.8319307565689087, + "learning_rate": 1.6686439623040548e-06, + "loss": 0.1022, + "step": 11474 + }, + { + "epoch": 1.859202851587816, + "grad_norm": 0.797558605670929, + "learning_rate": 1.6682315586446205e-06, + "loss": 0.0896, + "step": 11475 + }, + { + "epoch": 1.8593648736228126, + "grad_norm": 0.7331224083900452, + "learning_rate": 1.6678191804352873e-06, + "loss": 0.084, + "step": 11476 + }, + { + "epoch": 1.8595268956578095, + "grad_norm": 0.8644435405731201, + "learning_rate": 1.6674068276886734e-06, + "loss": 0.1117, + "step": 11477 + }, + { + "epoch": 1.8596889176928062, + "grad_norm": 0.7379981279373169, + "learning_rate": 1.6669945004173944e-06, + "loss": 0.0877, + "step": 11478 + }, + { + "epoch": 1.859850939727803, + "grad_norm": 0.7866598963737488, + "learning_rate": 1.6665821986340695e-06, + "loss": 0.0984, + "step": 11479 + }, + { + "epoch": 1.8600129617627998, + "grad_norm": 0.8214098215103149, + "learning_rate": 1.6661699223513118e-06, + "loss": 0.1077, + "step": 11480 + }, + { + "epoch": 1.8601749837977966, + "grad_norm": 0.7403610348701477, + "learning_rate": 1.6657576715817372e-06, + "loss": 0.0934, + "step": 11481 + }, + { + "epoch": 1.8603370058327933, + "grad_norm": 0.8874310255050659, + "learning_rate": 1.6653454463379582e-06, + "loss": 0.1102, + "step": 11482 + }, + { + "epoch": 1.86049902786779, + "grad_norm": 1.0349223613739014, + "learning_rate": 1.664933246632589e-06, + "loss": 0.115, + "step": 11483 + }, + { + "epoch": 1.8606610499027867, + "grad_norm": 0.7765905857086182, + "learning_rate": 1.6645210724782423e-06, + "loss": 0.1052, + "step": 11484 + }, + { + "epoch": 1.8608230719377836, + "grad_norm": 0.8592008948326111, + "learning_rate": 1.6641089238875283e-06, + "loss": 0.1013, + "step": 11485 + }, + { + "epoch": 1.8609850939727803, + "grad_norm": 0.8249764442443848, + "learning_rate": 1.6636968008730586e-06, + "loss": 0.1116, + "step": 11486 + }, + { + "epoch": 1.861147116007777, + "grad_norm": 0.7741264700889587, + "learning_rate": 1.6632847034474423e-06, + "loss": 0.0926, + "step": 11487 + }, + { + "epoch": 1.861309138042774, + "grad_norm": 0.6054975390434265, + "learning_rate": 1.6628726316232902e-06, + "loss": 0.0725, + "step": 11488 + }, + { + "epoch": 1.8614711600777705, + "grad_norm": 0.7995757460594177, + "learning_rate": 1.6624605854132094e-06, + "loss": 0.0928, + "step": 11489 + }, + { + "epoch": 1.8616331821127674, + "grad_norm": 0.6884124279022217, + "learning_rate": 1.6620485648298084e-06, + "loss": 0.0815, + "step": 11490 + }, + { + "epoch": 1.8617952041477641, + "grad_norm": 0.8780040740966797, + "learning_rate": 1.661636569885693e-06, + "loss": 0.1026, + "step": 11491 + }, + { + "epoch": 1.8619572261827608, + "grad_norm": 0.8116469383239746, + "learning_rate": 1.6612246005934694e-06, + "loss": 0.0942, + "step": 11492 + }, + { + "epoch": 1.8621192482177578, + "grad_norm": 0.8133668899536133, + "learning_rate": 1.660812656965744e-06, + "loss": 0.0935, + "step": 11493 + }, + { + "epoch": 1.8622812702527543, + "grad_norm": 0.6727808713912964, + "learning_rate": 1.66040073901512e-06, + "loss": 0.0841, + "step": 11494 + }, + { + "epoch": 1.8624432922877512, + "grad_norm": 0.8516779541969299, + "learning_rate": 1.6599888467542017e-06, + "loss": 0.114, + "step": 11495 + }, + { + "epoch": 1.862605314322748, + "grad_norm": 0.7665314078330994, + "learning_rate": 1.6595769801955925e-06, + "loss": 0.0951, + "step": 11496 + }, + { + "epoch": 1.8627673363577446, + "grad_norm": 0.7693126797676086, + "learning_rate": 1.6591651393518926e-06, + "loss": 0.0922, + "step": 11497 + }, + { + "epoch": 1.8629293583927415, + "grad_norm": 0.7686997652053833, + "learning_rate": 1.6587533242357053e-06, + "loss": 0.0905, + "step": 11498 + }, + { + "epoch": 1.863091380427738, + "grad_norm": 0.7887589335441589, + "learning_rate": 1.658341534859631e-06, + "loss": 0.0993, + "step": 11499 + }, + { + "epoch": 1.863253402462735, + "grad_norm": 0.8120632171630859, + "learning_rate": 1.6579297712362686e-06, + "loss": 0.0902, + "step": 11500 + }, + { + "epoch": 1.8634154244977317, + "grad_norm": 0.9442248940467834, + "learning_rate": 1.657518033378217e-06, + "loss": 0.1086, + "step": 11501 + }, + { + "epoch": 1.8635774465327284, + "grad_norm": 0.9302598237991333, + "learning_rate": 1.6571063212980753e-06, + "loss": 0.1051, + "step": 11502 + }, + { + "epoch": 1.8637394685677253, + "grad_norm": 0.7944306135177612, + "learning_rate": 1.6566946350084405e-06, + "loss": 0.0994, + "step": 11503 + }, + { + "epoch": 1.8639014906027218, + "grad_norm": 0.8751698732376099, + "learning_rate": 1.6562829745219089e-06, + "loss": 0.1055, + "step": 11504 + }, + { + "epoch": 1.8640635126377187, + "grad_norm": 0.8934447169303894, + "learning_rate": 1.6558713398510767e-06, + "loss": 0.1028, + "step": 11505 + }, + { + "epoch": 1.8642255346727155, + "grad_norm": 0.8261483907699585, + "learning_rate": 1.6554597310085383e-06, + "loss": 0.0983, + "step": 11506 + }, + { + "epoch": 1.8643875567077122, + "grad_norm": 0.7163456082344055, + "learning_rate": 1.6550481480068887e-06, + "loss": 0.0841, + "step": 11507 + }, + { + "epoch": 1.864549578742709, + "grad_norm": 0.7598121166229248, + "learning_rate": 1.6546365908587213e-06, + "loss": 0.0926, + "step": 11508 + }, + { + "epoch": 1.8647116007777058, + "grad_norm": 0.7900456786155701, + "learning_rate": 1.654225059576628e-06, + "loss": 0.1034, + "step": 11509 + }, + { + "epoch": 1.8648736228127025, + "grad_norm": 0.792512834072113, + "learning_rate": 1.653813554173202e-06, + "loss": 0.092, + "step": 11510 + }, + { + "epoch": 1.8650356448476992, + "grad_norm": 0.8468700051307678, + "learning_rate": 1.6534020746610315e-06, + "loss": 0.1061, + "step": 11511 + }, + { + "epoch": 1.865197666882696, + "grad_norm": 0.883598268032074, + "learning_rate": 1.6529906210527107e-06, + "loss": 0.1075, + "step": 11512 + }, + { + "epoch": 1.8653596889176929, + "grad_norm": 0.8703530430793762, + "learning_rate": 1.6525791933608266e-06, + "loss": 0.1078, + "step": 11513 + }, + { + "epoch": 1.8655217109526896, + "grad_norm": 0.7216241955757141, + "learning_rate": 1.6521677915979688e-06, + "loss": 0.0921, + "step": 11514 + }, + { + "epoch": 1.8656837329876863, + "grad_norm": 0.7918997406959534, + "learning_rate": 1.6517564157767245e-06, + "loss": 0.0948, + "step": 11515 + }, + { + "epoch": 1.8658457550226832, + "grad_norm": 0.7993650436401367, + "learning_rate": 1.6513450659096804e-06, + "loss": 0.1015, + "step": 11516 + }, + { + "epoch": 1.8660077770576797, + "grad_norm": 0.9507842659950256, + "learning_rate": 1.650933742009425e-06, + "loss": 0.096, + "step": 11517 + }, + { + "epoch": 1.8661697990926767, + "grad_norm": 0.7698103189468384, + "learning_rate": 1.6505224440885414e-06, + "loss": 0.0967, + "step": 11518 + }, + { + "epoch": 1.8663318211276734, + "grad_norm": 0.9620030522346497, + "learning_rate": 1.6501111721596163e-06, + "loss": 0.1113, + "step": 11519 + }, + { + "epoch": 1.86649384316267, + "grad_norm": 0.7965664863586426, + "learning_rate": 1.649699926235232e-06, + "loss": 0.101, + "step": 11520 + }, + { + "epoch": 1.866655865197667, + "grad_norm": 0.7318791747093201, + "learning_rate": 1.6492887063279717e-06, + "loss": 0.0812, + "step": 11521 + }, + { + "epoch": 1.8668178872326635, + "grad_norm": 0.7944748401641846, + "learning_rate": 1.6488775124504188e-06, + "loss": 0.1008, + "step": 11522 + }, + { + "epoch": 1.8669799092676604, + "grad_norm": 0.8979195356369019, + "learning_rate": 1.648466344615155e-06, + "loss": 0.1114, + "step": 11523 + }, + { + "epoch": 1.8671419313026572, + "grad_norm": 0.8624005317687988, + "learning_rate": 1.6480552028347597e-06, + "loss": 0.1084, + "step": 11524 + }, + { + "epoch": 1.8673039533376539, + "grad_norm": 0.9684587717056274, + "learning_rate": 1.647644087121813e-06, + "loss": 0.1113, + "step": 11525 + }, + { + "epoch": 1.8674659753726508, + "grad_norm": 0.7721800804138184, + "learning_rate": 1.6472329974888956e-06, + "loss": 0.0923, + "step": 11526 + }, + { + "epoch": 1.8676279974076473, + "grad_norm": 0.7591527700424194, + "learning_rate": 1.6468219339485845e-06, + "loss": 0.0978, + "step": 11527 + }, + { + "epoch": 1.8677900194426442, + "grad_norm": 0.7007966637611389, + "learning_rate": 1.6464108965134578e-06, + "loss": 0.0874, + "step": 11528 + }, + { + "epoch": 1.867952041477641, + "grad_norm": 0.8641650080680847, + "learning_rate": 1.6459998851960918e-06, + "loss": 0.1124, + "step": 11529 + }, + { + "epoch": 1.8681140635126376, + "grad_norm": 0.8576594591140747, + "learning_rate": 1.645588900009062e-06, + "loss": 0.1001, + "step": 11530 + }, + { + "epoch": 1.8682760855476346, + "grad_norm": 0.8817536234855652, + "learning_rate": 1.645177940964945e-06, + "loss": 0.1046, + "step": 11531 + }, + { + "epoch": 1.8684381075826313, + "grad_norm": 0.7928271889686584, + "learning_rate": 1.6447670080763146e-06, + "loss": 0.0985, + "step": 11532 + }, + { + "epoch": 1.868600129617628, + "grad_norm": 0.8168180584907532, + "learning_rate": 1.6443561013557434e-06, + "loss": 0.1076, + "step": 11533 + }, + { + "epoch": 1.8687621516526247, + "grad_norm": 0.845324695110321, + "learning_rate": 1.6439452208158058e-06, + "loss": 0.1029, + "step": 11534 + }, + { + "epoch": 1.8689241736876214, + "grad_norm": 0.9935814142227173, + "learning_rate": 1.6435343664690718e-06, + "loss": 0.1097, + "step": 11535 + }, + { + "epoch": 1.8690861957226184, + "grad_norm": 0.796512246131897, + "learning_rate": 1.6431235383281135e-06, + "loss": 0.0944, + "step": 11536 + }, + { + "epoch": 1.869248217757615, + "grad_norm": 0.8797528147697449, + "learning_rate": 1.6427127364055024e-06, + "loss": 0.1073, + "step": 11537 + }, + { + "epoch": 1.8694102397926118, + "grad_norm": 0.7383131384849548, + "learning_rate": 1.6423019607138064e-06, + "loss": 0.0979, + "step": 11538 + }, + { + "epoch": 1.8695722618276087, + "grad_norm": 0.794525146484375, + "learning_rate": 1.641891211265595e-06, + "loss": 0.1006, + "step": 11539 + }, + { + "epoch": 1.8697342838626052, + "grad_norm": 0.7232054471969604, + "learning_rate": 1.641480488073435e-06, + "loss": 0.0898, + "step": 11540 + }, + { + "epoch": 1.8698963058976021, + "grad_norm": 0.7711024284362793, + "learning_rate": 1.6410697911498957e-06, + "loss": 0.099, + "step": 11541 + }, + { + "epoch": 1.8700583279325989, + "grad_norm": 0.7795062065124512, + "learning_rate": 1.6406591205075417e-06, + "loss": 0.1017, + "step": 11542 + }, + { + "epoch": 1.8702203499675956, + "grad_norm": 0.7736190557479858, + "learning_rate": 1.6402484761589397e-06, + "loss": 0.0874, + "step": 11543 + }, + { + "epoch": 1.8703823720025925, + "grad_norm": 0.7875672578811646, + "learning_rate": 1.639837858116653e-06, + "loss": 0.1025, + "step": 11544 + }, + { + "epoch": 1.870544394037589, + "grad_norm": 0.7884228825569153, + "learning_rate": 1.639427266393246e-06, + "loss": 0.0903, + "step": 11545 + }, + { + "epoch": 1.870706416072586, + "grad_norm": 0.6791999340057373, + "learning_rate": 1.6390167010012824e-06, + "loss": 0.0852, + "step": 11546 + }, + { + "epoch": 1.8708684381075826, + "grad_norm": 0.9688091278076172, + "learning_rate": 1.638606161953325e-06, + "loss": 0.1071, + "step": 11547 + }, + { + "epoch": 1.8710304601425793, + "grad_norm": 0.7724692225456238, + "learning_rate": 1.638195649261934e-06, + "loss": 0.0987, + "step": 11548 + }, + { + "epoch": 1.8711924821775763, + "grad_norm": 0.685142457485199, + "learning_rate": 1.6377851629396695e-06, + "loss": 0.0857, + "step": 11549 + }, + { + "epoch": 1.8713545042125728, + "grad_norm": 0.8404377698898315, + "learning_rate": 1.6373747029990943e-06, + "loss": 0.1011, + "step": 11550 + }, + { + "epoch": 1.8715165262475697, + "grad_norm": 0.8327875137329102, + "learning_rate": 1.6369642694527648e-06, + "loss": 0.1033, + "step": 11551 + }, + { + "epoch": 1.8716785482825664, + "grad_norm": 0.777384877204895, + "learning_rate": 1.6365538623132405e-06, + "loss": 0.1067, + "step": 11552 + }, + { + "epoch": 1.8718405703175631, + "grad_norm": 0.806379497051239, + "learning_rate": 1.6361434815930782e-06, + "loss": 0.0952, + "step": 11553 + }, + { + "epoch": 1.87200259235256, + "grad_norm": 0.7385959029197693, + "learning_rate": 1.6357331273048343e-06, + "loss": 0.0857, + "step": 11554 + }, + { + "epoch": 1.8721646143875565, + "grad_norm": 0.8200345635414124, + "learning_rate": 1.635322799461066e-06, + "loss": 0.1041, + "step": 11555 + }, + { + "epoch": 1.8723266364225535, + "grad_norm": 0.8210601806640625, + "learning_rate": 1.6349124980743278e-06, + "loss": 0.101, + "step": 11556 + }, + { + "epoch": 1.8724886584575502, + "grad_norm": 0.7943647503852844, + "learning_rate": 1.6345022231571734e-06, + "loss": 0.0912, + "step": 11557 + }, + { + "epoch": 1.872650680492547, + "grad_norm": 0.8460164666175842, + "learning_rate": 1.6340919747221568e-06, + "loss": 0.0955, + "step": 11558 + }, + { + "epoch": 1.8728127025275438, + "grad_norm": 0.7457050085067749, + "learning_rate": 1.6336817527818292e-06, + "loss": 0.0884, + "step": 11559 + }, + { + "epoch": 1.8729747245625405, + "grad_norm": 1.021488070487976, + "learning_rate": 1.633271557348744e-06, + "loss": 0.1097, + "step": 11560 + }, + { + "epoch": 1.8731367465975373, + "grad_norm": 0.8276289701461792, + "learning_rate": 1.6328613884354524e-06, + "loss": 0.0963, + "step": 11561 + }, + { + "epoch": 1.873298768632534, + "grad_norm": 0.6760364770889282, + "learning_rate": 1.6324512460545034e-06, + "loss": 0.0817, + "step": 11562 + }, + { + "epoch": 1.8734607906675307, + "grad_norm": 0.7393625378608704, + "learning_rate": 1.6320411302184474e-06, + "loss": 0.0919, + "step": 11563 + }, + { + "epoch": 1.8736228127025276, + "grad_norm": 0.7952508926391602, + "learning_rate": 1.6316310409398306e-06, + "loss": 0.0932, + "step": 11564 + }, + { + "epoch": 1.8737848347375243, + "grad_norm": 0.8211168050765991, + "learning_rate": 1.6312209782312044e-06, + "loss": 0.1028, + "step": 11565 + }, + { + "epoch": 1.873946856772521, + "grad_norm": 0.8613480925559998, + "learning_rate": 1.6308109421051132e-06, + "loss": 0.106, + "step": 11566 + }, + { + "epoch": 1.874108878807518, + "grad_norm": 0.6946225166320801, + "learning_rate": 1.6304009325741044e-06, + "loss": 0.0865, + "step": 11567 + }, + { + "epoch": 1.8742709008425145, + "grad_norm": 0.8030040264129639, + "learning_rate": 1.6299909496507214e-06, + "loss": 0.1029, + "step": 11568 + }, + { + "epoch": 1.8744329228775114, + "grad_norm": 0.840975284576416, + "learning_rate": 1.6295809933475103e-06, + "loss": 0.1061, + "step": 11569 + }, + { + "epoch": 1.874594944912508, + "grad_norm": 0.893821120262146, + "learning_rate": 1.6291710636770152e-06, + "loss": 0.1042, + "step": 11570 + }, + { + "epoch": 1.8747569669475048, + "grad_norm": 0.8448208570480347, + "learning_rate": 1.6287611606517778e-06, + "loss": 0.0991, + "step": 11571 + }, + { + "epoch": 1.8749189889825018, + "grad_norm": 0.7172380089759827, + "learning_rate": 1.628351284284341e-06, + "loss": 0.0892, + "step": 11572 + }, + { + "epoch": 1.8750810110174982, + "grad_norm": 0.8485018610954285, + "learning_rate": 1.627941434587244e-06, + "loss": 0.0992, + "step": 11573 + }, + { + "epoch": 1.8752430330524952, + "grad_norm": 0.8565303087234497, + "learning_rate": 1.6275316115730302e-06, + "loss": 0.0989, + "step": 11574 + }, + { + "epoch": 1.875405055087492, + "grad_norm": 0.8015487194061279, + "learning_rate": 1.6271218152542373e-06, + "loss": 0.0978, + "step": 11575 + }, + { + "epoch": 1.8755670771224886, + "grad_norm": 0.962988555431366, + "learning_rate": 1.626712045643405e-06, + "loss": 0.1113, + "step": 11576 + }, + { + "epoch": 1.8757290991574855, + "grad_norm": 0.8654093742370605, + "learning_rate": 1.6263023027530706e-06, + "loss": 0.0986, + "step": 11577 + }, + { + "epoch": 1.875891121192482, + "grad_norm": 0.8079492449760437, + "learning_rate": 1.6258925865957703e-06, + "loss": 0.1023, + "step": 11578 + }, + { + "epoch": 1.876053143227479, + "grad_norm": 0.7464910745620728, + "learning_rate": 1.6254828971840432e-06, + "loss": 0.0936, + "step": 11579 + }, + { + "epoch": 1.8762151652624757, + "grad_norm": 0.8631336688995361, + "learning_rate": 1.625073234530422e-06, + "loss": 0.1021, + "step": 11580 + }, + { + "epoch": 1.8763771872974724, + "grad_norm": 0.7467739582061768, + "learning_rate": 1.6246635986474436e-06, + "loss": 0.0909, + "step": 11581 + }, + { + "epoch": 1.8765392093324693, + "grad_norm": 0.8743352293968201, + "learning_rate": 1.62425398954764e-06, + "loss": 0.1001, + "step": 11582 + }, + { + "epoch": 1.876701231367466, + "grad_norm": 0.70003342628479, + "learning_rate": 1.6238444072435447e-06, + "loss": 0.0814, + "step": 11583 + }, + { + "epoch": 1.8768632534024627, + "grad_norm": 0.7987949848175049, + "learning_rate": 1.6234348517476905e-06, + "loss": 0.1061, + "step": 11584 + }, + { + "epoch": 1.8770252754374595, + "grad_norm": 0.8156781196594238, + "learning_rate": 1.6230253230726096e-06, + "loss": 0.1068, + "step": 11585 + }, + { + "epoch": 1.8771872974724562, + "grad_norm": 0.8381296396255493, + "learning_rate": 1.6226158212308307e-06, + "loss": 0.1087, + "step": 11586 + }, + { + "epoch": 1.877349319507453, + "grad_norm": 0.794044017791748, + "learning_rate": 1.622206346234885e-06, + "loss": 0.0952, + "step": 11587 + }, + { + "epoch": 1.8775113415424498, + "grad_norm": 0.829454243183136, + "learning_rate": 1.6217968980972998e-06, + "loss": 0.1039, + "step": 11588 + }, + { + "epoch": 1.8776733635774465, + "grad_norm": 0.7840944528579712, + "learning_rate": 1.621387476830605e-06, + "loss": 0.0937, + "step": 11589 + }, + { + "epoch": 1.8778353856124435, + "grad_norm": 0.6949406862258911, + "learning_rate": 1.620978082447327e-06, + "loss": 0.0796, + "step": 11590 + }, + { + "epoch": 1.87799740764744, + "grad_norm": 0.8238313794136047, + "learning_rate": 1.6205687149599933e-06, + "loss": 0.1038, + "step": 11591 + }, + { + "epoch": 1.8781594296824369, + "grad_norm": 0.7915367484092712, + "learning_rate": 1.6201593743811275e-06, + "loss": 0.1021, + "step": 11592 + }, + { + "epoch": 1.8783214517174336, + "grad_norm": 0.8443301320075989, + "learning_rate": 1.6197500607232563e-06, + "loss": 0.0991, + "step": 11593 + }, + { + "epoch": 1.8784834737524303, + "grad_norm": 0.7770126461982727, + "learning_rate": 1.6193407739989037e-06, + "loss": 0.0969, + "step": 11594 + }, + { + "epoch": 1.8786454957874272, + "grad_norm": 0.8288414478302002, + "learning_rate": 1.6189315142205914e-06, + "loss": 0.1031, + "step": 11595 + }, + { + "epoch": 1.8788075178224237, + "grad_norm": 0.7647958993911743, + "learning_rate": 1.6185222814008434e-06, + "loss": 0.0886, + "step": 11596 + }, + { + "epoch": 1.8789695398574207, + "grad_norm": 0.7341493368148804, + "learning_rate": 1.6181130755521792e-06, + "loss": 0.0968, + "step": 11597 + }, + { + "epoch": 1.8791315618924174, + "grad_norm": 0.7972742319107056, + "learning_rate": 1.6177038966871213e-06, + "loss": 0.1008, + "step": 11598 + }, + { + "epoch": 1.879293583927414, + "grad_norm": 0.8043832182884216, + "learning_rate": 1.617294744818189e-06, + "loss": 0.0959, + "step": 11599 + }, + { + "epoch": 1.879455605962411, + "grad_norm": 0.8683483600616455, + "learning_rate": 1.6168856199579025e-06, + "loss": 0.1057, + "step": 11600 + }, + { + "epoch": 1.8796176279974075, + "grad_norm": 0.8487406373023987, + "learning_rate": 1.6164765221187778e-06, + "loss": 0.0997, + "step": 11601 + }, + { + "epoch": 1.8797796500324044, + "grad_norm": 0.7058298587799072, + "learning_rate": 1.6160674513133332e-06, + "loss": 0.0892, + "step": 11602 + }, + { + "epoch": 1.8799416720674011, + "grad_norm": 1.010647177696228, + "learning_rate": 1.6156584075540864e-06, + "loss": 0.1148, + "step": 11603 + }, + { + "epoch": 1.8801036941023979, + "grad_norm": 0.839988648891449, + "learning_rate": 1.615249390853552e-06, + "loss": 0.0978, + "step": 11604 + }, + { + "epoch": 1.8802657161373948, + "grad_norm": 0.8383321762084961, + "learning_rate": 1.6148404012242453e-06, + "loss": 0.0976, + "step": 11605 + }, + { + "epoch": 1.8804277381723913, + "grad_norm": 0.7959474325180054, + "learning_rate": 1.61443143867868e-06, + "loss": 0.0901, + "step": 11606 + }, + { + "epoch": 1.8805897602073882, + "grad_norm": 0.854000985622406, + "learning_rate": 1.614022503229369e-06, + "loss": 0.1036, + "step": 11607 + }, + { + "epoch": 1.880751782242385, + "grad_norm": 0.8434250950813293, + "learning_rate": 1.613613594888826e-06, + "loss": 0.0996, + "step": 11608 + }, + { + "epoch": 1.8809138042773816, + "grad_norm": 0.7653728127479553, + "learning_rate": 1.6132047136695625e-06, + "loss": 0.101, + "step": 11609 + }, + { + "epoch": 1.8810758263123786, + "grad_norm": 0.7736960649490356, + "learning_rate": 1.612795859584088e-06, + "loss": 0.0956, + "step": 11610 + }, + { + "epoch": 1.8812378483473753, + "grad_norm": 0.7493591904640198, + "learning_rate": 1.6123870326449144e-06, + "loss": 0.0897, + "step": 11611 + }, + { + "epoch": 1.881399870382372, + "grad_norm": 0.7444273829460144, + "learning_rate": 1.611978232864548e-06, + "loss": 0.0943, + "step": 11612 + }, + { + "epoch": 1.8815618924173687, + "grad_norm": 0.7813089489936829, + "learning_rate": 1.6115694602554994e-06, + "loss": 0.0952, + "step": 11613 + }, + { + "epoch": 1.8817239144523654, + "grad_norm": 0.8079869151115417, + "learning_rate": 1.6111607148302758e-06, + "loss": 0.1069, + "step": 11614 + }, + { + "epoch": 1.8818859364873624, + "grad_norm": 0.8186015486717224, + "learning_rate": 1.6107519966013828e-06, + "loss": 0.1025, + "step": 11615 + }, + { + "epoch": 1.882047958522359, + "grad_norm": 0.7281806468963623, + "learning_rate": 1.6103433055813265e-06, + "loss": 0.0893, + "step": 11616 + }, + { + "epoch": 1.8822099805573558, + "grad_norm": 0.7015278935432434, + "learning_rate": 1.6099346417826123e-06, + "loss": 0.0844, + "step": 11617 + }, + { + "epoch": 1.8823720025923527, + "grad_norm": 0.825904905796051, + "learning_rate": 1.6095260052177446e-06, + "loss": 0.1027, + "step": 11618 + }, + { + "epoch": 1.8825340246273492, + "grad_norm": 0.8017039895057678, + "learning_rate": 1.6091173958992261e-06, + "loss": 0.0955, + "step": 11619 + }, + { + "epoch": 1.8826960466623461, + "grad_norm": 0.7551977038383484, + "learning_rate": 1.6087088138395598e-06, + "loss": 0.0907, + "step": 11620 + }, + { + "epoch": 1.8828580686973428, + "grad_norm": 0.9527973532676697, + "learning_rate": 1.6083002590512458e-06, + "loss": 0.1074, + "step": 11621 + }, + { + "epoch": 1.8830200907323396, + "grad_norm": 0.8547660112380981, + "learning_rate": 1.6078917315467867e-06, + "loss": 0.106, + "step": 11622 + }, + { + "epoch": 1.8831821127673365, + "grad_norm": 0.7676581144332886, + "learning_rate": 1.607483231338682e-06, + "loss": 0.0914, + "step": 11623 + }, + { + "epoch": 1.883344134802333, + "grad_norm": 0.7763146162033081, + "learning_rate": 1.6070747584394303e-06, + "loss": 0.0927, + "step": 11624 + }, + { + "epoch": 1.88350615683733, + "grad_norm": 0.8205904364585876, + "learning_rate": 1.6066663128615301e-06, + "loss": 0.1028, + "step": 11625 + }, + { + "epoch": 1.8836681788723266, + "grad_norm": 0.7976201176643372, + "learning_rate": 1.6062578946174785e-06, + "loss": 0.0999, + "step": 11626 + }, + { + "epoch": 1.8838302009073233, + "grad_norm": 0.8827847242355347, + "learning_rate": 1.605849503719773e-06, + "loss": 0.109, + "step": 11627 + }, + { + "epoch": 1.8839922229423203, + "grad_norm": 0.8832690715789795, + "learning_rate": 1.605441140180909e-06, + "loss": 0.11, + "step": 11628 + }, + { + "epoch": 1.8841542449773168, + "grad_norm": 0.7509612441062927, + "learning_rate": 1.605032804013381e-06, + "loss": 0.091, + "step": 11629 + }, + { + "epoch": 1.8843162670123137, + "grad_norm": 0.7290849685668945, + "learning_rate": 1.6046244952296839e-06, + "loss": 0.0928, + "step": 11630 + }, + { + "epoch": 1.8844782890473104, + "grad_norm": 0.7547253966331482, + "learning_rate": 1.6042162138423095e-06, + "loss": 0.0907, + "step": 11631 + }, + { + "epoch": 1.8846403110823071, + "grad_norm": 0.7190500497817993, + "learning_rate": 1.6038079598637523e-06, + "loss": 0.0858, + "step": 11632 + }, + { + "epoch": 1.884802333117304, + "grad_norm": 0.9019315242767334, + "learning_rate": 1.6033997333065022e-06, + "loss": 0.0975, + "step": 11633 + }, + { + "epoch": 1.8849643551523008, + "grad_norm": 0.8361161947250366, + "learning_rate": 1.6029915341830503e-06, + "loss": 0.102, + "step": 11634 + }, + { + "epoch": 1.8851263771872975, + "grad_norm": 1.2841830253601074, + "learning_rate": 1.6025833625058878e-06, + "loss": 0.1219, + "step": 11635 + }, + { + "epoch": 1.8852883992222942, + "grad_norm": 0.7469258308410645, + "learning_rate": 1.6021752182875012e-06, + "loss": 0.0904, + "step": 11636 + }, + { + "epoch": 1.885450421257291, + "grad_norm": 0.8182607293128967, + "learning_rate": 1.601767101540381e-06, + "loss": 0.1008, + "step": 11637 + }, + { + "epoch": 1.8856124432922878, + "grad_norm": 0.6999529004096985, + "learning_rate": 1.6013590122770143e-06, + "loss": 0.0842, + "step": 11638 + }, + { + "epoch": 1.8857744653272845, + "grad_norm": 0.7251670956611633, + "learning_rate": 1.6009509505098863e-06, + "loss": 0.0931, + "step": 11639 + }, + { + "epoch": 1.8859364873622813, + "grad_norm": 0.825188934803009, + "learning_rate": 1.6005429162514834e-06, + "loss": 0.1004, + "step": 11640 + }, + { + "epoch": 1.8860985093972782, + "grad_norm": 0.72586590051651, + "learning_rate": 1.6001349095142918e-06, + "loss": 0.0852, + "step": 11641 + }, + { + "epoch": 1.8862605314322747, + "grad_norm": 0.8619117140769958, + "learning_rate": 1.5997269303107937e-06, + "loss": 0.0906, + "step": 11642 + }, + { + "epoch": 1.8864225534672716, + "grad_norm": 0.8306232690811157, + "learning_rate": 1.5993189786534727e-06, + "loss": 0.1013, + "step": 11643 + }, + { + "epoch": 1.8865845755022683, + "grad_norm": 0.7801927924156189, + "learning_rate": 1.598911054554812e-06, + "loss": 0.0915, + "step": 11644 + }, + { + "epoch": 1.886746597537265, + "grad_norm": 0.7791352272033691, + "learning_rate": 1.5985031580272914e-06, + "loss": 0.0875, + "step": 11645 + }, + { + "epoch": 1.886908619572262, + "grad_norm": 0.8080638647079468, + "learning_rate": 1.5980952890833929e-06, + "loss": 0.0956, + "step": 11646 + }, + { + "epoch": 1.8870706416072585, + "grad_norm": 0.7200146913528442, + "learning_rate": 1.597687447735597e-06, + "loss": 0.0807, + "step": 11647 + }, + { + "epoch": 1.8872326636422554, + "grad_norm": 0.8803225755691528, + "learning_rate": 1.5972796339963806e-06, + "loss": 0.1038, + "step": 11648 + }, + { + "epoch": 1.887394685677252, + "grad_norm": 0.8330278992652893, + "learning_rate": 1.5968718478782236e-06, + "loss": 0.1015, + "step": 11649 + }, + { + "epoch": 1.8875567077122488, + "grad_norm": 0.7998868823051453, + "learning_rate": 1.5964640893936015e-06, + "loss": 0.0993, + "step": 11650 + }, + { + "epoch": 1.8877187297472457, + "grad_norm": 0.9711896777153015, + "learning_rate": 1.596056358554992e-06, + "loss": 0.1076, + "step": 11651 + }, + { + "epoch": 1.8878807517822422, + "grad_norm": 0.8102350831031799, + "learning_rate": 1.595648655374871e-06, + "loss": 0.1031, + "step": 11652 + }, + { + "epoch": 1.8880427738172392, + "grad_norm": 0.879172146320343, + "learning_rate": 1.5952409798657127e-06, + "loss": 0.1006, + "step": 11653 + }, + { + "epoch": 1.8882047958522359, + "grad_norm": 0.8931054472923279, + "learning_rate": 1.5948333320399905e-06, + "loss": 0.1153, + "step": 11654 + }, + { + "epoch": 1.8883668178872326, + "grad_norm": 0.7753148078918457, + "learning_rate": 1.5944257119101775e-06, + "loss": 0.0959, + "step": 11655 + }, + { + "epoch": 1.8885288399222295, + "grad_norm": 0.8476399183273315, + "learning_rate": 1.5940181194887472e-06, + "loss": 0.1019, + "step": 11656 + }, + { + "epoch": 1.888690861957226, + "grad_norm": 0.7567358613014221, + "learning_rate": 1.5936105547881697e-06, + "loss": 0.0865, + "step": 11657 + }, + { + "epoch": 1.888852883992223, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.5932030178209163e-06, + "loss": 0.0863, + "step": 11658 + }, + { + "epoch": 1.8890149060272197, + "grad_norm": 0.7682359218597412, + "learning_rate": 1.5927955085994544e-06, + "loss": 0.0938, + "step": 11659 + }, + { + "epoch": 1.8891769280622164, + "grad_norm": 0.750938355922699, + "learning_rate": 1.592388027136256e-06, + "loss": 0.0947, + "step": 11660 + }, + { + "epoch": 1.8893389500972133, + "grad_norm": 0.8155219554901123, + "learning_rate": 1.5919805734437871e-06, + "loss": 0.1027, + "step": 11661 + }, + { + "epoch": 1.88950097213221, + "grad_norm": 0.8531356453895569, + "learning_rate": 1.591573147534516e-06, + "loss": 0.1002, + "step": 11662 + }, + { + "epoch": 1.8896629941672067, + "grad_norm": 0.8241977095603943, + "learning_rate": 1.5911657494209077e-06, + "loss": 0.0958, + "step": 11663 + }, + { + "epoch": 1.8898250162022034, + "grad_norm": 0.8130655884742737, + "learning_rate": 1.5907583791154275e-06, + "loss": 0.1017, + "step": 11664 + }, + { + "epoch": 1.8899870382372002, + "grad_norm": 0.7908192873001099, + "learning_rate": 1.5903510366305416e-06, + "loss": 0.1012, + "step": 11665 + }, + { + "epoch": 1.890149060272197, + "grad_norm": 0.7589678168296814, + "learning_rate": 1.5899437219787124e-06, + "loss": 0.0919, + "step": 11666 + }, + { + "epoch": 1.8903110823071938, + "grad_norm": 0.7555517554283142, + "learning_rate": 1.5895364351724033e-06, + "loss": 0.0946, + "step": 11667 + }, + { + "epoch": 1.8904731043421905, + "grad_norm": 0.7375280857086182, + "learning_rate": 1.5891291762240757e-06, + "loss": 0.09, + "step": 11668 + }, + { + "epoch": 1.8906351263771874, + "grad_norm": 0.8817306756973267, + "learning_rate": 1.5887219451461903e-06, + "loss": 0.1076, + "step": 11669 + }, + { + "epoch": 1.890797148412184, + "grad_norm": 0.7232506275177002, + "learning_rate": 1.5883147419512086e-06, + "loss": 0.0868, + "step": 11670 + }, + { + "epoch": 1.8909591704471809, + "grad_norm": 0.8023999333381653, + "learning_rate": 1.5879075666515903e-06, + "loss": 0.1019, + "step": 11671 + }, + { + "epoch": 1.8911211924821776, + "grad_norm": 0.8410249948501587, + "learning_rate": 1.5875004192597926e-06, + "loss": 0.0945, + "step": 11672 + }, + { + "epoch": 1.8912832145171743, + "grad_norm": 0.8737829327583313, + "learning_rate": 1.5870932997882742e-06, + "loss": 0.1045, + "step": 11673 + }, + { + "epoch": 1.8914452365521712, + "grad_norm": 0.8467708230018616, + "learning_rate": 1.5866862082494907e-06, + "loss": 0.094, + "step": 11674 + }, + { + "epoch": 1.8916072585871677, + "grad_norm": 0.6961696743965149, + "learning_rate": 1.5862791446558999e-06, + "loss": 0.0826, + "step": 11675 + }, + { + "epoch": 1.8917692806221647, + "grad_norm": 0.8878881931304932, + "learning_rate": 1.5858721090199564e-06, + "loss": 0.097, + "step": 11676 + }, + { + "epoch": 1.8919313026571614, + "grad_norm": 0.9326772093772888, + "learning_rate": 1.5854651013541134e-06, + "loss": 0.1038, + "step": 11677 + }, + { + "epoch": 1.892093324692158, + "grad_norm": 0.769069492816925, + "learning_rate": 1.5850581216708254e-06, + "loss": 0.0946, + "step": 11678 + }, + { + "epoch": 1.892255346727155, + "grad_norm": 0.8138481378555298, + "learning_rate": 1.5846511699825445e-06, + "loss": 0.099, + "step": 11679 + }, + { + "epoch": 1.8924173687621515, + "grad_norm": 0.9026851058006287, + "learning_rate": 1.5842442463017235e-06, + "loss": 0.113, + "step": 11680 + }, + { + "epoch": 1.8925793907971484, + "grad_norm": 0.796542763710022, + "learning_rate": 1.583837350640812e-06, + "loss": 0.0943, + "step": 11681 + }, + { + "epoch": 1.8927414128321451, + "grad_norm": 0.7942173480987549, + "learning_rate": 1.583430483012261e-06, + "loss": 0.0911, + "step": 11682 + }, + { + "epoch": 1.8929034348671419, + "grad_norm": 0.748563289642334, + "learning_rate": 1.5830236434285182e-06, + "loss": 0.0934, + "step": 11683 + }, + { + "epoch": 1.8930654569021388, + "grad_norm": 0.8524467349052429, + "learning_rate": 1.5826168319020332e-06, + "loss": 0.105, + "step": 11684 + }, + { + "epoch": 1.8932274789371355, + "grad_norm": 0.8268295526504517, + "learning_rate": 1.5822100484452538e-06, + "loss": 0.0841, + "step": 11685 + }, + { + "epoch": 1.8933895009721322, + "grad_norm": 0.7947415709495544, + "learning_rate": 1.5818032930706254e-06, + "loss": 0.1011, + "step": 11686 + }, + { + "epoch": 1.893551523007129, + "grad_norm": 0.8413145542144775, + "learning_rate": 1.5813965657905942e-06, + "loss": 0.1021, + "step": 11687 + }, + { + "epoch": 1.8937135450421256, + "grad_norm": 0.7836431860923767, + "learning_rate": 1.5809898666176044e-06, + "loss": 0.0948, + "step": 11688 + }, + { + "epoch": 1.8938755670771226, + "grad_norm": 0.8638646602630615, + "learning_rate": 1.5805831955641022e-06, + "loss": 0.1104, + "step": 11689 + }, + { + "epoch": 1.8940375891121193, + "grad_norm": 0.7815718650817871, + "learning_rate": 1.5801765526425283e-06, + "loss": 0.0857, + "step": 11690 + }, + { + "epoch": 1.894199611147116, + "grad_norm": 0.8989560008049011, + "learning_rate": 1.5797699378653267e-06, + "loss": 0.1111, + "step": 11691 + }, + { + "epoch": 1.894361633182113, + "grad_norm": 0.8104327321052551, + "learning_rate": 1.5793633512449374e-06, + "loss": 0.0974, + "step": 11692 + }, + { + "epoch": 1.8945236552171094, + "grad_norm": 0.7694452404975891, + "learning_rate": 1.5789567927938016e-06, + "loss": 0.0981, + "step": 11693 + }, + { + "epoch": 1.8946856772521063, + "grad_norm": 0.8841857314109802, + "learning_rate": 1.578550262524359e-06, + "loss": 0.101, + "step": 11694 + }, + { + "epoch": 1.894847699287103, + "grad_norm": 0.7945596575737, + "learning_rate": 1.5781437604490493e-06, + "loss": 0.0931, + "step": 11695 + }, + { + "epoch": 1.8950097213220998, + "grad_norm": 0.8388197422027588, + "learning_rate": 1.5777372865803091e-06, + "loss": 0.0982, + "step": 11696 + }, + { + "epoch": 1.8951717433570967, + "grad_norm": 0.8069587349891663, + "learning_rate": 1.5773308409305766e-06, + "loss": 0.0995, + "step": 11697 + }, + { + "epoch": 1.8953337653920932, + "grad_norm": 0.7525715231895447, + "learning_rate": 1.5769244235122867e-06, + "loss": 0.0856, + "step": 11698 + }, + { + "epoch": 1.8954957874270901, + "grad_norm": 0.9167400598526001, + "learning_rate": 1.576518034337876e-06, + "loss": 0.1076, + "step": 11699 + }, + { + "epoch": 1.8956578094620868, + "grad_norm": 0.7757900357246399, + "learning_rate": 1.576111673419779e-06, + "loss": 0.0995, + "step": 11700 + }, + { + "epoch": 1.8958198314970836, + "grad_norm": 0.8285884857177734, + "learning_rate": 1.5757053407704287e-06, + "loss": 0.106, + "step": 11701 + }, + { + "epoch": 1.8959818535320805, + "grad_norm": 0.7494697570800781, + "learning_rate": 1.5752990364022588e-06, + "loss": 0.0959, + "step": 11702 + }, + { + "epoch": 1.896143875567077, + "grad_norm": 0.7144713401794434, + "learning_rate": 1.5748927603276992e-06, + "loss": 0.0885, + "step": 11703 + }, + { + "epoch": 1.896305897602074, + "grad_norm": 0.7766308784484863, + "learning_rate": 1.5744865125591837e-06, + "loss": 0.0969, + "step": 11704 + }, + { + "epoch": 1.8964679196370706, + "grad_norm": 0.818439245223999, + "learning_rate": 1.574080293109141e-06, + "loss": 0.1071, + "step": 11705 + }, + { + "epoch": 1.8966299416720673, + "grad_norm": 0.8813479542732239, + "learning_rate": 1.573674101990001e-06, + "loss": 0.1012, + "step": 11706 + }, + { + "epoch": 1.8967919637070643, + "grad_norm": 0.8053517937660217, + "learning_rate": 1.5732679392141906e-06, + "loss": 0.1002, + "step": 11707 + }, + { + "epoch": 1.8969539857420608, + "grad_norm": 0.8213833570480347, + "learning_rate": 1.5728618047941393e-06, + "loss": 0.1046, + "step": 11708 + }, + { + "epoch": 1.8971160077770577, + "grad_norm": 0.9207565784454346, + "learning_rate": 1.5724556987422738e-06, + "loss": 0.1037, + "step": 11709 + }, + { + "epoch": 1.8972780298120544, + "grad_norm": 0.7941535115242004, + "learning_rate": 1.5720496210710185e-06, + "loss": 0.0841, + "step": 11710 + }, + { + "epoch": 1.8974400518470511, + "grad_norm": 0.9520290493965149, + "learning_rate": 1.5716435717927996e-06, + "loss": 0.1096, + "step": 11711 + }, + { + "epoch": 1.897602073882048, + "grad_norm": 0.720905065536499, + "learning_rate": 1.5712375509200397e-06, + "loss": 0.0864, + "step": 11712 + }, + { + "epoch": 1.8977640959170448, + "grad_norm": 0.8394162654876709, + "learning_rate": 1.5708315584651646e-06, + "loss": 0.1007, + "step": 11713 + }, + { + "epoch": 1.8979261179520415, + "grad_norm": 0.9244471192359924, + "learning_rate": 1.5704255944405947e-06, + "loss": 0.1049, + "step": 11714 + }, + { + "epoch": 1.8980881399870384, + "grad_norm": 0.9457299709320068, + "learning_rate": 1.5700196588587527e-06, + "loss": 0.1148, + "step": 11715 + }, + { + "epoch": 1.898250162022035, + "grad_norm": 0.8000479340553284, + "learning_rate": 1.5696137517320582e-06, + "loss": 0.0928, + "step": 11716 + }, + { + "epoch": 1.8984121840570318, + "grad_norm": 0.8423092365264893, + "learning_rate": 1.5692078730729304e-06, + "loss": 0.1003, + "step": 11717 + }, + { + "epoch": 1.8985742060920285, + "grad_norm": 0.8390052914619446, + "learning_rate": 1.5688020228937905e-06, + "loss": 0.1021, + "step": 11718 + }, + { + "epoch": 1.8987362281270252, + "grad_norm": 0.8341346979141235, + "learning_rate": 1.5683962012070546e-06, + "loss": 0.1038, + "step": 11719 + }, + { + "epoch": 1.8988982501620222, + "grad_norm": 0.7826020121574402, + "learning_rate": 1.5679904080251414e-06, + "loss": 0.1, + "step": 11720 + }, + { + "epoch": 1.8990602721970187, + "grad_norm": 0.9335909485816956, + "learning_rate": 1.5675846433604658e-06, + "loss": 0.1173, + "step": 11721 + }, + { + "epoch": 1.8992222942320156, + "grad_norm": 0.8499523401260376, + "learning_rate": 1.567178907225443e-06, + "loss": 0.1009, + "step": 11722 + }, + { + "epoch": 1.8993843162670123, + "grad_norm": 0.8055524826049805, + "learning_rate": 1.5667731996324887e-06, + "loss": 0.104, + "step": 11723 + }, + { + "epoch": 1.899546338302009, + "grad_norm": 0.8389691710472107, + "learning_rate": 1.5663675205940164e-06, + "loss": 0.1059, + "step": 11724 + }, + { + "epoch": 1.899708360337006, + "grad_norm": 0.7978635430335999, + "learning_rate": 1.5659618701224385e-06, + "loss": 0.0991, + "step": 11725 + }, + { + "epoch": 1.8998703823720025, + "grad_norm": 0.8267003297805786, + "learning_rate": 1.5655562482301664e-06, + "loss": 0.1018, + "step": 11726 + }, + { + "epoch": 1.9000324044069994, + "grad_norm": 0.8983197808265686, + "learning_rate": 1.565150654929613e-06, + "loss": 0.1094, + "step": 11727 + }, + { + "epoch": 1.900194426441996, + "grad_norm": 0.7554136514663696, + "learning_rate": 1.5647450902331866e-06, + "loss": 0.0945, + "step": 11728 + }, + { + "epoch": 1.9003564484769928, + "grad_norm": 0.8314508199691772, + "learning_rate": 1.5643395541532972e-06, + "loss": 0.1002, + "step": 11729 + }, + { + "epoch": 1.9005184705119897, + "grad_norm": 0.8851383924484253, + "learning_rate": 1.5639340467023534e-06, + "loss": 0.1073, + "step": 11730 + }, + { + "epoch": 1.9006804925469862, + "grad_norm": 0.7740817070007324, + "learning_rate": 1.563528567892762e-06, + "loss": 0.0942, + "step": 11731 + }, + { + "epoch": 1.9008425145819832, + "grad_norm": 0.7232766151428223, + "learning_rate": 1.5631231177369305e-06, + "loss": 0.0925, + "step": 11732 + }, + { + "epoch": 1.9010045366169799, + "grad_norm": 0.7759274840354919, + "learning_rate": 1.562717696247265e-06, + "loss": 0.1035, + "step": 11733 + }, + { + "epoch": 1.9011665586519766, + "grad_norm": 0.7025986313819885, + "learning_rate": 1.562312303436169e-06, + "loss": 0.0873, + "step": 11734 + }, + { + "epoch": 1.9013285806869735, + "grad_norm": 0.8542094230651855, + "learning_rate": 1.561906939316048e-06, + "loss": 0.106, + "step": 11735 + }, + { + "epoch": 1.9014906027219702, + "grad_norm": 0.7692488431930542, + "learning_rate": 1.5615016038993036e-06, + "loss": 0.0989, + "step": 11736 + }, + { + "epoch": 1.901652624756967, + "grad_norm": 0.7880541682243347, + "learning_rate": 1.5610962971983395e-06, + "loss": 0.1038, + "step": 11737 + }, + { + "epoch": 1.9018146467919637, + "grad_norm": 0.8704632520675659, + "learning_rate": 1.5606910192255565e-06, + "loss": 0.0971, + "step": 11738 + }, + { + "epoch": 1.9019766688269604, + "grad_norm": 0.8276471495628357, + "learning_rate": 1.560285769993356e-06, + "loss": 0.1017, + "step": 11739 + }, + { + "epoch": 1.9021386908619573, + "grad_norm": 0.9419677257537842, + "learning_rate": 1.5598805495141362e-06, + "loss": 0.1009, + "step": 11740 + }, + { + "epoch": 1.902300712896954, + "grad_norm": 0.8188934326171875, + "learning_rate": 1.5594753578002957e-06, + "loss": 0.1005, + "step": 11741 + }, + { + "epoch": 1.9024627349319507, + "grad_norm": 0.7875339984893799, + "learning_rate": 1.5590701948642348e-06, + "loss": 0.0983, + "step": 11742 + }, + { + "epoch": 1.9026247569669477, + "grad_norm": 0.756129264831543, + "learning_rate": 1.5586650607183482e-06, + "loss": 0.0916, + "step": 11743 + }, + { + "epoch": 1.9027867790019442, + "grad_norm": 0.8351160287857056, + "learning_rate": 1.5582599553750332e-06, + "loss": 0.106, + "step": 11744 + }, + { + "epoch": 1.902948801036941, + "grad_norm": 0.8231625556945801, + "learning_rate": 1.5578548788466841e-06, + "loss": 0.1069, + "step": 11745 + }, + { + "epoch": 1.9031108230719378, + "grad_norm": 0.7880319952964783, + "learning_rate": 1.5574498311456953e-06, + "loss": 0.0998, + "step": 11746 + }, + { + "epoch": 1.9032728451069345, + "grad_norm": 0.8101006150245667, + "learning_rate": 1.5570448122844612e-06, + "loss": 0.0968, + "step": 11747 + }, + { + "epoch": 1.9034348671419314, + "grad_norm": 0.7617364525794983, + "learning_rate": 1.5566398222753745e-06, + "loss": 0.0917, + "step": 11748 + }, + { + "epoch": 1.903596889176928, + "grad_norm": 0.8054034113883972, + "learning_rate": 1.5562348611308259e-06, + "loss": 0.0939, + "step": 11749 + }, + { + "epoch": 1.9037589112119249, + "grad_norm": 0.8272371292114258, + "learning_rate": 1.5558299288632061e-06, + "loss": 0.0987, + "step": 11750 + }, + { + "epoch": 1.9039209332469216, + "grad_norm": 0.8034398555755615, + "learning_rate": 1.5554250254849065e-06, + "loss": 0.0987, + "step": 11751 + }, + { + "epoch": 1.9040829552819183, + "grad_norm": 0.681370198726654, + "learning_rate": 1.555020151008315e-06, + "loss": 0.0833, + "step": 11752 + }, + { + "epoch": 1.9042449773169152, + "grad_norm": 0.9123519062995911, + "learning_rate": 1.5546153054458204e-06, + "loss": 0.1037, + "step": 11753 + }, + { + "epoch": 1.9044069993519117, + "grad_norm": 0.8379228115081787, + "learning_rate": 1.5542104888098093e-06, + "loss": 0.1071, + "step": 11754 + }, + { + "epoch": 1.9045690213869086, + "grad_norm": 0.6765666007995605, + "learning_rate": 1.5538057011126678e-06, + "loss": 0.0874, + "step": 11755 + }, + { + "epoch": 1.9047310434219054, + "grad_norm": 0.7915697693824768, + "learning_rate": 1.553400942366783e-06, + "loss": 0.0974, + "step": 11756 + }, + { + "epoch": 1.904893065456902, + "grad_norm": 0.7531107664108276, + "learning_rate": 1.5529962125845387e-06, + "loss": 0.0915, + "step": 11757 + }, + { + "epoch": 1.905055087491899, + "grad_norm": 0.8496636152267456, + "learning_rate": 1.5525915117783182e-06, + "loss": 0.1105, + "step": 11758 + }, + { + "epoch": 1.9052171095268955, + "grad_norm": 0.9320754408836365, + "learning_rate": 1.5521868399605057e-06, + "loss": 0.1043, + "step": 11759 + }, + { + "epoch": 1.9053791315618924, + "grad_norm": 0.80690598487854, + "learning_rate": 1.5517821971434804e-06, + "loss": 0.105, + "step": 11760 + }, + { + "epoch": 1.9055411535968891, + "grad_norm": 0.8309117555618286, + "learning_rate": 1.5513775833396263e-06, + "loss": 0.0969, + "step": 11761 + }, + { + "epoch": 1.9057031756318858, + "grad_norm": 0.7836124300956726, + "learning_rate": 1.5509729985613232e-06, + "loss": 0.0968, + "step": 11762 + }, + { + "epoch": 1.9058651976668828, + "grad_norm": 0.7297149300575256, + "learning_rate": 1.5505684428209487e-06, + "loss": 0.0931, + "step": 11763 + }, + { + "epoch": 1.9060272197018795, + "grad_norm": 0.8328400254249573, + "learning_rate": 1.5501639161308829e-06, + "loss": 0.104, + "step": 11764 + }, + { + "epoch": 1.9061892417368762, + "grad_norm": 0.8439556956291199, + "learning_rate": 1.5497594185035015e-06, + "loss": 0.1035, + "step": 11765 + }, + { + "epoch": 1.9063512637718731, + "grad_norm": 0.7502602934837341, + "learning_rate": 1.5493549499511834e-06, + "loss": 0.0911, + "step": 11766 + }, + { + "epoch": 1.9065132858068696, + "grad_norm": 0.9024834632873535, + "learning_rate": 1.5489505104863034e-06, + "loss": 0.1085, + "step": 11767 + }, + { + "epoch": 1.9066753078418666, + "grad_norm": 0.8212124705314636, + "learning_rate": 1.5485461001212365e-06, + "loss": 0.1028, + "step": 11768 + }, + { + "epoch": 1.9068373298768633, + "grad_norm": 0.8439403176307678, + "learning_rate": 1.5481417188683557e-06, + "loss": 0.1067, + "step": 11769 + }, + { + "epoch": 1.90699935191186, + "grad_norm": 0.811410665512085, + "learning_rate": 1.5477373667400347e-06, + "loss": 0.0983, + "step": 11770 + }, + { + "epoch": 1.907161373946857, + "grad_norm": 0.8574284315109253, + "learning_rate": 1.5473330437486466e-06, + "loss": 0.1048, + "step": 11771 + }, + { + "epoch": 1.9073233959818534, + "grad_norm": 0.7073046565055847, + "learning_rate": 1.5469287499065615e-06, + "loss": 0.0905, + "step": 11772 + }, + { + "epoch": 1.9074854180168503, + "grad_norm": 0.7401828169822693, + "learning_rate": 1.5465244852261505e-06, + "loss": 0.0906, + "step": 11773 + }, + { + "epoch": 1.907647440051847, + "grad_norm": 0.7841954827308655, + "learning_rate": 1.5461202497197821e-06, + "loss": 0.0979, + "step": 11774 + }, + { + "epoch": 1.9078094620868438, + "grad_norm": 0.7952117919921875, + "learning_rate": 1.545716043399827e-06, + "loss": 0.0935, + "step": 11775 + }, + { + "epoch": 1.9079714841218407, + "grad_norm": 0.7880851626396179, + "learning_rate": 1.5453118662786509e-06, + "loss": 0.1009, + "step": 11776 + }, + { + "epoch": 1.9081335061568372, + "grad_norm": 0.849980354309082, + "learning_rate": 1.544907718368622e-06, + "loss": 0.1105, + "step": 11777 + }, + { + "epoch": 1.9082955281918341, + "grad_norm": 0.858780026435852, + "learning_rate": 1.544503599682105e-06, + "loss": 0.1007, + "step": 11778 + }, + { + "epoch": 1.9084575502268308, + "grad_norm": 0.7746371030807495, + "learning_rate": 1.5440995102314654e-06, + "loss": 0.0995, + "step": 11779 + }, + { + "epoch": 1.9086195722618275, + "grad_norm": 0.8200014233589172, + "learning_rate": 1.5436954500290684e-06, + "loss": 0.0951, + "step": 11780 + }, + { + "epoch": 1.9087815942968245, + "grad_norm": 0.9434183835983276, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.1081, + "step": 11781 + }, + { + "epoch": 1.908943616331821, + "grad_norm": 0.7809907793998718, + "learning_rate": 1.5428874174184509e-06, + "loss": 0.0902, + "step": 11782 + }, + { + "epoch": 1.909105638366818, + "grad_norm": 0.8580632209777832, + "learning_rate": 1.5424834450349552e-06, + "loss": 0.103, + "step": 11783 + }, + { + "epoch": 1.9092676604018146, + "grad_norm": 0.8701772689819336, + "learning_rate": 1.5420795019491475e-06, + "loss": 0.1071, + "step": 11784 + }, + { + "epoch": 1.9094296824368113, + "grad_norm": 0.784704327583313, + "learning_rate": 1.54167558817339e-06, + "loss": 0.0969, + "step": 11785 + }, + { + "epoch": 1.9095917044718083, + "grad_norm": 0.7510059475898743, + "learning_rate": 1.5412717037200406e-06, + "loss": 0.0939, + "step": 11786 + }, + { + "epoch": 1.909753726506805, + "grad_norm": 0.7119048237800598, + "learning_rate": 1.5408678486014567e-06, + "loss": 0.0859, + "step": 11787 + }, + { + "epoch": 1.9099157485418017, + "grad_norm": 0.7860952615737915, + "learning_rate": 1.540464022829996e-06, + "loss": 0.095, + "step": 11788 + }, + { + "epoch": 1.9100777705767984, + "grad_norm": 0.9239879846572876, + "learning_rate": 1.540060226418013e-06, + "loss": 0.1001, + "step": 11789 + }, + { + "epoch": 1.910239792611795, + "grad_norm": 0.7606763243675232, + "learning_rate": 1.5396564593778646e-06, + "loss": 0.0849, + "step": 11790 + }, + { + "epoch": 1.910401814646792, + "grad_norm": 0.830071747303009, + "learning_rate": 1.5392527217219047e-06, + "loss": 0.0984, + "step": 11791 + }, + { + "epoch": 1.9105638366817888, + "grad_norm": 0.8571628332138062, + "learning_rate": 1.538849013462487e-06, + "loss": 0.103, + "step": 11792 + }, + { + "epoch": 1.9107258587167855, + "grad_norm": 0.8239210844039917, + "learning_rate": 1.5384453346119628e-06, + "loss": 0.0916, + "step": 11793 + }, + { + "epoch": 1.9108878807517824, + "grad_norm": 0.7969984412193298, + "learning_rate": 1.5380416851826845e-06, + "loss": 0.0952, + "step": 11794 + }, + { + "epoch": 1.9110499027867789, + "grad_norm": 0.8779683113098145, + "learning_rate": 1.5376380651870033e-06, + "loss": 0.1102, + "step": 11795 + }, + { + "epoch": 1.9112119248217758, + "grad_norm": 0.8378877639770508, + "learning_rate": 1.537234474637268e-06, + "loss": 0.1004, + "step": 11796 + }, + { + "epoch": 1.9113739468567725, + "grad_norm": 0.8366298675537109, + "learning_rate": 1.5368309135458287e-06, + "loss": 0.1061, + "step": 11797 + }, + { + "epoch": 1.9115359688917692, + "grad_norm": 0.8169813752174377, + "learning_rate": 1.5364273819250308e-06, + "loss": 0.0967, + "step": 11798 + }, + { + "epoch": 1.9116979909267662, + "grad_norm": 0.880442202091217, + "learning_rate": 1.536023879787225e-06, + "loss": 0.1037, + "step": 11799 + }, + { + "epoch": 1.9118600129617627, + "grad_norm": 0.8345661759376526, + "learning_rate": 1.535620407144755e-06, + "loss": 0.0997, + "step": 11800 + }, + { + "epoch": 1.9120220349967596, + "grad_norm": 0.9036890268325806, + "learning_rate": 1.5352169640099673e-06, + "loss": 0.1058, + "step": 11801 + }, + { + "epoch": 1.9121840570317563, + "grad_norm": 0.7998917102813721, + "learning_rate": 1.534813550395205e-06, + "loss": 0.1014, + "step": 11802 + }, + { + "epoch": 1.912346079066753, + "grad_norm": 0.8374770879745483, + "learning_rate": 1.5344101663128121e-06, + "loss": 0.1047, + "step": 11803 + }, + { + "epoch": 1.91250810110175, + "grad_norm": 0.8265429735183716, + "learning_rate": 1.5340068117751329e-06, + "loss": 0.098, + "step": 11804 + }, + { + "epoch": 1.9126701231367464, + "grad_norm": 0.7321347594261169, + "learning_rate": 1.5336034867945065e-06, + "loss": 0.0839, + "step": 11805 + }, + { + "epoch": 1.9128321451717434, + "grad_norm": 0.7928110361099243, + "learning_rate": 1.5332001913832754e-06, + "loss": 0.0964, + "step": 11806 + }, + { + "epoch": 1.91299416720674, + "grad_norm": 0.7539445161819458, + "learning_rate": 1.5327969255537784e-06, + "loss": 0.0951, + "step": 11807 + }, + { + "epoch": 1.9131561892417368, + "grad_norm": 0.8430777192115784, + "learning_rate": 1.5323936893183542e-06, + "loss": 0.1087, + "step": 11808 + }, + { + "epoch": 1.9133182112767337, + "grad_norm": 0.8476277589797974, + "learning_rate": 1.5319904826893422e-06, + "loss": 0.1013, + "step": 11809 + }, + { + "epoch": 1.9134802333117304, + "grad_norm": 0.8722755908966064, + "learning_rate": 1.5315873056790791e-06, + "loss": 0.0985, + "step": 11810 + }, + { + "epoch": 1.9136422553467272, + "grad_norm": 0.708776593208313, + "learning_rate": 1.5311841582999009e-06, + "loss": 0.0864, + "step": 11811 + }, + { + "epoch": 1.9138042773817239, + "grad_norm": 0.8320741057395935, + "learning_rate": 1.5307810405641433e-06, + "loss": 0.0971, + "step": 11812 + }, + { + "epoch": 1.9139662994167206, + "grad_norm": 0.79482501745224, + "learning_rate": 1.5303779524841395e-06, + "loss": 0.1019, + "step": 11813 + }, + { + "epoch": 1.9141283214517175, + "grad_norm": 0.8025671243667603, + "learning_rate": 1.5299748940722241e-06, + "loss": 0.0934, + "step": 11814 + }, + { + "epoch": 1.9142903434867142, + "grad_norm": 0.9434292316436768, + "learning_rate": 1.5295718653407305e-06, + "loss": 0.1017, + "step": 11815 + }, + { + "epoch": 1.914452365521711, + "grad_norm": 0.734461784362793, + "learning_rate": 1.5291688663019885e-06, + "loss": 0.0876, + "step": 11816 + }, + { + "epoch": 1.9146143875567079, + "grad_norm": 0.858860194683075, + "learning_rate": 1.5287658969683294e-06, + "loss": 0.105, + "step": 11817 + }, + { + "epoch": 1.9147764095917044, + "grad_norm": 0.7746593356132507, + "learning_rate": 1.5283629573520841e-06, + "loss": 0.0921, + "step": 11818 + }, + { + "epoch": 1.9149384316267013, + "grad_norm": 0.7499451637268066, + "learning_rate": 1.5279600474655814e-06, + "loss": 0.0905, + "step": 11819 + }, + { + "epoch": 1.915100453661698, + "grad_norm": 0.7706909775733948, + "learning_rate": 1.5275571673211487e-06, + "loss": 0.0979, + "step": 11820 + }, + { + "epoch": 1.9152624756966947, + "grad_norm": 0.9270902872085571, + "learning_rate": 1.5271543169311137e-06, + "loss": 0.1123, + "step": 11821 + }, + { + "epoch": 1.9154244977316917, + "grad_norm": 0.9533674120903015, + "learning_rate": 1.5267514963078014e-06, + "loss": 0.1095, + "step": 11822 + }, + { + "epoch": 1.9155865197666881, + "grad_norm": 0.6820551156997681, + "learning_rate": 1.5263487054635386e-06, + "loss": 0.0837, + "step": 11823 + }, + { + "epoch": 1.915748541801685, + "grad_norm": 0.7506440281867981, + "learning_rate": 1.5259459444106497e-06, + "loss": 0.0904, + "step": 11824 + }, + { + "epoch": 1.9159105638366818, + "grad_norm": 0.7518582344055176, + "learning_rate": 1.5255432131614572e-06, + "loss": 0.0924, + "step": 11825 + }, + { + "epoch": 1.9160725858716785, + "grad_norm": 0.7959820032119751, + "learning_rate": 1.5251405117282843e-06, + "loss": 0.0992, + "step": 11826 + }, + { + "epoch": 1.9162346079066754, + "grad_norm": 0.8564347624778748, + "learning_rate": 1.5247378401234521e-06, + "loss": 0.1062, + "step": 11827 + }, + { + "epoch": 1.916396629941672, + "grad_norm": 0.8178135752677917, + "learning_rate": 1.524335198359283e-06, + "loss": 0.0948, + "step": 11828 + }, + { + "epoch": 1.9165586519766689, + "grad_norm": 0.8334271907806396, + "learning_rate": 1.5239325864480952e-06, + "loss": 0.1052, + "step": 11829 + }, + { + "epoch": 1.9167206740116656, + "grad_norm": 0.7920629978179932, + "learning_rate": 1.5235300044022088e-06, + "loss": 0.0976, + "step": 11830 + }, + { + "epoch": 1.9168826960466623, + "grad_norm": 0.7598364353179932, + "learning_rate": 1.5231274522339407e-06, + "loss": 0.091, + "step": 11831 + }, + { + "epoch": 1.9170447180816592, + "grad_norm": 0.8865482211112976, + "learning_rate": 1.522724929955608e-06, + "loss": 0.1087, + "step": 11832 + }, + { + "epoch": 1.9172067401166557, + "grad_norm": 0.7840036153793335, + "learning_rate": 1.5223224375795287e-06, + "loss": 0.0972, + "step": 11833 + }, + { + "epoch": 1.9173687621516526, + "grad_norm": 0.7893587350845337, + "learning_rate": 1.5219199751180162e-06, + "loss": 0.0954, + "step": 11834 + }, + { + "epoch": 1.9175307841866494, + "grad_norm": 0.7772044539451599, + "learning_rate": 1.5215175425833855e-06, + "loss": 0.0893, + "step": 11835 + }, + { + "epoch": 1.917692806221646, + "grad_norm": 0.8612580299377441, + "learning_rate": 1.5211151399879505e-06, + "loss": 0.0945, + "step": 11836 + }, + { + "epoch": 1.917854828256643, + "grad_norm": 0.8095962405204773, + "learning_rate": 1.5207127673440226e-06, + "loss": 0.1019, + "step": 11837 + }, + { + "epoch": 1.9180168502916397, + "grad_norm": 0.794070839881897, + "learning_rate": 1.5203104246639144e-06, + "loss": 0.0995, + "step": 11838 + }, + { + "epoch": 1.9181788723266364, + "grad_norm": 0.8148672580718994, + "learning_rate": 1.5199081119599373e-06, + "loss": 0.1067, + "step": 11839 + }, + { + "epoch": 1.9183408943616331, + "grad_norm": 0.8468709588050842, + "learning_rate": 1.5195058292443996e-06, + "loss": 0.1072, + "step": 11840 + }, + { + "epoch": 1.9185029163966298, + "grad_norm": 0.8336006999015808, + "learning_rate": 1.5191035765296104e-06, + "loss": 0.1073, + "step": 11841 + }, + { + "epoch": 1.9186649384316268, + "grad_norm": 0.7484931945800781, + "learning_rate": 1.518701353827878e-06, + "loss": 0.0902, + "step": 11842 + }, + { + "epoch": 1.9188269604666235, + "grad_norm": 0.8069817423820496, + "learning_rate": 1.5182991611515102e-06, + "loss": 0.0964, + "step": 11843 + }, + { + "epoch": 1.9189889825016202, + "grad_norm": 0.7606700658798218, + "learning_rate": 1.5178969985128122e-06, + "loss": 0.0955, + "step": 11844 + }, + { + "epoch": 1.9191510045366171, + "grad_norm": 0.936642050743103, + "learning_rate": 1.5174948659240896e-06, + "loss": 0.1097, + "step": 11845 + }, + { + "epoch": 1.9193130265716136, + "grad_norm": 0.827983021736145, + "learning_rate": 1.5170927633976457e-06, + "loss": 0.1045, + "step": 11846 + }, + { + "epoch": 1.9194750486066106, + "grad_norm": 0.8345808386802673, + "learning_rate": 1.516690690945785e-06, + "loss": 0.0996, + "step": 11847 + }, + { + "epoch": 1.9196370706416073, + "grad_norm": 0.8033564686775208, + "learning_rate": 1.5162886485808102e-06, + "loss": 0.1027, + "step": 11848 + }, + { + "epoch": 1.919799092676604, + "grad_norm": 0.8006458878517151, + "learning_rate": 1.5158866363150215e-06, + "loss": 0.1001, + "step": 11849 + }, + { + "epoch": 1.919961114711601, + "grad_norm": 0.8043130040168762, + "learning_rate": 1.515484654160721e-06, + "loss": 0.1028, + "step": 11850 + }, + { + "epoch": 1.9201231367465974, + "grad_norm": 0.8533475995063782, + "learning_rate": 1.515082702130206e-06, + "loss": 0.1065, + "step": 11851 + }, + { + "epoch": 1.9202851587815943, + "grad_norm": 0.9136732816696167, + "learning_rate": 1.5146807802357782e-06, + "loss": 0.1156, + "step": 11852 + }, + { + "epoch": 1.920447180816591, + "grad_norm": 0.8210397958755493, + "learning_rate": 1.5142788884897336e-06, + "loss": 0.1002, + "step": 11853 + }, + { + "epoch": 1.9206092028515878, + "grad_norm": 0.906101405620575, + "learning_rate": 1.5138770269043704e-06, + "loss": 0.1062, + "step": 11854 + }, + { + "epoch": 1.9207712248865847, + "grad_norm": 0.8309211730957031, + "learning_rate": 1.5134751954919833e-06, + "loss": 0.0985, + "step": 11855 + }, + { + "epoch": 1.9209332469215812, + "grad_norm": 0.9222484230995178, + "learning_rate": 1.513073394264867e-06, + "loss": 0.1017, + "step": 11856 + }, + { + "epoch": 1.9210952689565781, + "grad_norm": 0.7615657448768616, + "learning_rate": 1.5126716232353178e-06, + "loss": 0.0931, + "step": 11857 + }, + { + "epoch": 1.9212572909915748, + "grad_norm": 0.823805570602417, + "learning_rate": 1.5122698824156271e-06, + "loss": 0.0979, + "step": 11858 + }, + { + "epoch": 1.9214193130265715, + "grad_norm": 0.801001787185669, + "learning_rate": 1.5118681718180883e-06, + "loss": 0.1006, + "step": 11859 + }, + { + "epoch": 1.9215813350615685, + "grad_norm": 0.7479082345962524, + "learning_rate": 1.5114664914549903e-06, + "loss": 0.0909, + "step": 11860 + }, + { + "epoch": 1.9217433570965652, + "grad_norm": 0.662217915058136, + "learning_rate": 1.5110648413386275e-06, + "loss": 0.0826, + "step": 11861 + }, + { + "epoch": 1.921905379131562, + "grad_norm": 0.837660014629364, + "learning_rate": 1.5106632214812865e-06, + "loss": 0.1019, + "step": 11862 + }, + { + "epoch": 1.9220674011665586, + "grad_norm": 0.8495209813117981, + "learning_rate": 1.5102616318952575e-06, + "loss": 0.105, + "step": 11863 + }, + { + "epoch": 1.9222294232015553, + "grad_norm": 0.723486065864563, + "learning_rate": 1.5098600725928269e-06, + "loss": 0.0883, + "step": 11864 + }, + { + "epoch": 1.9223914452365523, + "grad_norm": 0.7528011798858643, + "learning_rate": 1.5094585435862817e-06, + "loss": 0.0919, + "step": 11865 + }, + { + "epoch": 1.922553467271549, + "grad_norm": 0.9446701407432556, + "learning_rate": 1.5090570448879088e-06, + "loss": 0.115, + "step": 11866 + }, + { + "epoch": 1.9227154893065457, + "grad_norm": 0.8201817274093628, + "learning_rate": 1.5086555765099916e-06, + "loss": 0.1131, + "step": 11867 + }, + { + "epoch": 1.9228775113415426, + "grad_norm": 0.6670837998390198, + "learning_rate": 1.5082541384648154e-06, + "loss": 0.0816, + "step": 11868 + }, + { + "epoch": 1.923039533376539, + "grad_norm": 0.6985659003257751, + "learning_rate": 1.5078527307646622e-06, + "loss": 0.0847, + "step": 11869 + }, + { + "epoch": 1.923201555411536, + "grad_norm": 0.8145870566368103, + "learning_rate": 1.5074513534218137e-06, + "loss": 0.1065, + "step": 11870 + }, + { + "epoch": 1.9233635774465327, + "grad_norm": 0.7868598699569702, + "learning_rate": 1.5070500064485527e-06, + "loss": 0.0941, + "step": 11871 + }, + { + "epoch": 1.9235255994815295, + "grad_norm": 0.8277310729026794, + "learning_rate": 1.5066486898571588e-06, + "loss": 0.0937, + "step": 11872 + }, + { + "epoch": 1.9236876215165264, + "grad_norm": 0.752480685710907, + "learning_rate": 1.5062474036599109e-06, + "loss": 0.1008, + "step": 11873 + }, + { + "epoch": 1.9238496435515229, + "grad_norm": 0.7744075655937195, + "learning_rate": 1.5058461478690878e-06, + "loss": 0.0879, + "step": 11874 + }, + { + "epoch": 1.9240116655865198, + "grad_norm": 0.8470804691314697, + "learning_rate": 1.505444922496966e-06, + "loss": 0.0971, + "step": 11875 + }, + { + "epoch": 1.9241736876215165, + "grad_norm": 0.6907162666320801, + "learning_rate": 1.5050437275558233e-06, + "loss": 0.0813, + "step": 11876 + }, + { + "epoch": 1.9243357096565132, + "grad_norm": 0.8127276301383972, + "learning_rate": 1.5046425630579348e-06, + "loss": 0.1044, + "step": 11877 + }, + { + "epoch": 1.9244977316915102, + "grad_norm": 0.8600353598594666, + "learning_rate": 1.5042414290155754e-06, + "loss": 0.1021, + "step": 11878 + }, + { + "epoch": 1.9246597537265067, + "grad_norm": 0.7428255081176758, + "learning_rate": 1.5038403254410183e-06, + "loss": 0.0966, + "step": 11879 + }, + { + "epoch": 1.9248217757615036, + "grad_norm": 0.8952332139015198, + "learning_rate": 1.5034392523465364e-06, + "loss": 0.097, + "step": 11880 + }, + { + "epoch": 1.9249837977965003, + "grad_norm": 0.7860379815101624, + "learning_rate": 1.5030382097444025e-06, + "loss": 0.0909, + "step": 11881 + }, + { + "epoch": 1.925145819831497, + "grad_norm": 0.8410727977752686, + "learning_rate": 1.502637197646886e-06, + "loss": 0.0759, + "step": 11882 + }, + { + "epoch": 1.925307841866494, + "grad_norm": 0.8401789665222168, + "learning_rate": 1.5022362160662584e-06, + "loss": 0.1057, + "step": 11883 + }, + { + "epoch": 1.9254698639014904, + "grad_norm": 0.7923102378845215, + "learning_rate": 1.5018352650147872e-06, + "loss": 0.0931, + "step": 11884 + }, + { + "epoch": 1.9256318859364874, + "grad_norm": 0.8386868834495544, + "learning_rate": 1.5014343445047414e-06, + "loss": 0.1025, + "step": 11885 + }, + { + "epoch": 1.925793907971484, + "grad_norm": 0.723055362701416, + "learning_rate": 1.5010334545483885e-06, + "loss": 0.0889, + "step": 11886 + }, + { + "epoch": 1.9259559300064808, + "grad_norm": 0.8790740966796875, + "learning_rate": 1.5006325951579948e-06, + "loss": 0.1046, + "step": 11887 + }, + { + "epoch": 1.9261179520414777, + "grad_norm": 0.8958154320716858, + "learning_rate": 1.500231766345825e-06, + "loss": 0.1142, + "step": 11888 + }, + { + "epoch": 1.9262799740764744, + "grad_norm": 0.7713485360145569, + "learning_rate": 1.499830968124143e-06, + "loss": 0.1023, + "step": 11889 + }, + { + "epoch": 1.9264419961114712, + "grad_norm": 0.8863093852996826, + "learning_rate": 1.4994302005052141e-06, + "loss": 0.1149, + "step": 11890 + }, + { + "epoch": 1.9266040181464679, + "grad_norm": 0.7300902009010315, + "learning_rate": 1.4990294635012995e-06, + "loss": 0.0852, + "step": 11891 + }, + { + "epoch": 1.9267660401814646, + "grad_norm": 0.7891831398010254, + "learning_rate": 1.4986287571246614e-06, + "loss": 0.1016, + "step": 11892 + }, + { + "epoch": 1.9269280622164615, + "grad_norm": 0.8113868236541748, + "learning_rate": 1.4982280813875593e-06, + "loss": 0.1071, + "step": 11893 + }, + { + "epoch": 1.9270900842514582, + "grad_norm": 0.8107513189315796, + "learning_rate": 1.4978274363022532e-06, + "loss": 0.0996, + "step": 11894 + }, + { + "epoch": 1.927252106286455, + "grad_norm": 0.7865254878997803, + "learning_rate": 1.497426821881003e-06, + "loss": 0.0956, + "step": 11895 + }, + { + "epoch": 1.9274141283214519, + "grad_norm": 0.8345635533332825, + "learning_rate": 1.4970262381360664e-06, + "loss": 0.1072, + "step": 11896 + }, + { + "epoch": 1.9275761503564484, + "grad_norm": 0.7102372050285339, + "learning_rate": 1.4966256850796993e-06, + "loss": 0.0904, + "step": 11897 + }, + { + "epoch": 1.9277381723914453, + "grad_norm": 0.8325737118721008, + "learning_rate": 1.4962251627241583e-06, + "loss": 0.0958, + "step": 11898 + }, + { + "epoch": 1.927900194426442, + "grad_norm": 0.8160897493362427, + "learning_rate": 1.4958246710816976e-06, + "loss": 0.101, + "step": 11899 + }, + { + "epoch": 1.9280622164614387, + "grad_norm": 0.794330358505249, + "learning_rate": 1.4954242101645722e-06, + "loss": 0.0976, + "step": 11900 + }, + { + "epoch": 1.9282242384964356, + "grad_norm": 0.8225795030593872, + "learning_rate": 1.4950237799850354e-06, + "loss": 0.104, + "step": 11901 + }, + { + "epoch": 1.9283862605314321, + "grad_norm": 0.7986380457878113, + "learning_rate": 1.4946233805553387e-06, + "loss": 0.0807, + "step": 11902 + }, + { + "epoch": 1.928548282566429, + "grad_norm": 0.7666733860969543, + "learning_rate": 1.4942230118877337e-06, + "loss": 0.0926, + "step": 11903 + }, + { + "epoch": 1.9287103046014258, + "grad_norm": 0.8928430676460266, + "learning_rate": 1.4938226739944694e-06, + "loss": 0.1056, + "step": 11904 + }, + { + "epoch": 1.9288723266364225, + "grad_norm": 0.7845842838287354, + "learning_rate": 1.4934223668877979e-06, + "loss": 0.0987, + "step": 11905 + }, + { + "epoch": 1.9290343486714194, + "grad_norm": 0.8057361245155334, + "learning_rate": 1.4930220905799652e-06, + "loss": 0.0966, + "step": 11906 + }, + { + "epoch": 1.929196370706416, + "grad_norm": 0.7778323888778687, + "learning_rate": 1.4926218450832208e-06, + "loss": 0.0946, + "step": 11907 + }, + { + "epoch": 1.9293583927414129, + "grad_norm": 0.884856641292572, + "learning_rate": 1.4922216304098085e-06, + "loss": 0.1084, + "step": 11908 + }, + { + "epoch": 1.9295204147764096, + "grad_norm": 0.9439958333969116, + "learning_rate": 1.4918214465719765e-06, + "loss": 0.1146, + "step": 11909 + }, + { + "epoch": 1.9296824368114063, + "grad_norm": 0.7851779460906982, + "learning_rate": 1.4914212935819689e-06, + "loss": 0.0973, + "step": 11910 + }, + { + "epoch": 1.9298444588464032, + "grad_norm": 0.8911312818527222, + "learning_rate": 1.4910211714520285e-06, + "loss": 0.1109, + "step": 11911 + }, + { + "epoch": 1.9300064808814, + "grad_norm": 0.8925055265426636, + "learning_rate": 1.4906210801943985e-06, + "loss": 0.1051, + "step": 11912 + }, + { + "epoch": 1.9301685029163966, + "grad_norm": 0.8144211769104004, + "learning_rate": 1.4902210198213203e-06, + "loss": 0.1023, + "step": 11913 + }, + { + "epoch": 1.9303305249513933, + "grad_norm": 0.8194781541824341, + "learning_rate": 1.4898209903450361e-06, + "loss": 0.0997, + "step": 11914 + }, + { + "epoch": 1.93049254698639, + "grad_norm": 0.8512380123138428, + "learning_rate": 1.489420991777785e-06, + "loss": 0.1035, + "step": 11915 + }, + { + "epoch": 1.930654569021387, + "grad_norm": 0.8246690630912781, + "learning_rate": 1.489021024131806e-06, + "loss": 0.107, + "step": 11916 + }, + { + "epoch": 1.9308165910563837, + "grad_norm": 0.8465759754180908, + "learning_rate": 1.4886210874193368e-06, + "loss": 0.0924, + "step": 11917 + }, + { + "epoch": 1.9309786130913804, + "grad_norm": 0.757154643535614, + "learning_rate": 1.4882211816526144e-06, + "loss": 0.0958, + "step": 11918 + }, + { + "epoch": 1.9311406351263773, + "grad_norm": 0.8065274357795715, + "learning_rate": 1.4878213068438762e-06, + "loss": 0.0939, + "step": 11919 + }, + { + "epoch": 1.9313026571613738, + "grad_norm": 0.8495281934738159, + "learning_rate": 1.4874214630053562e-06, + "loss": 0.0914, + "step": 11920 + }, + { + "epoch": 1.9314646791963708, + "grad_norm": 0.8558754920959473, + "learning_rate": 1.4870216501492892e-06, + "loss": 0.1046, + "step": 11921 + }, + { + "epoch": 1.9316267012313675, + "grad_norm": 0.7717517018318176, + "learning_rate": 1.4866218682879088e-06, + "loss": 0.0917, + "step": 11922 + }, + { + "epoch": 1.9317887232663642, + "grad_norm": 0.7911754250526428, + "learning_rate": 1.4862221174334457e-06, + "loss": 0.0999, + "step": 11923 + }, + { + "epoch": 1.9319507453013611, + "grad_norm": 0.7910676598548889, + "learning_rate": 1.4858223975981334e-06, + "loss": 0.1002, + "step": 11924 + }, + { + "epoch": 1.9321127673363576, + "grad_norm": 0.816967785358429, + "learning_rate": 1.4854227087942016e-06, + "loss": 0.0959, + "step": 11925 + }, + { + "epoch": 1.9322747893713546, + "grad_norm": 0.828427255153656, + "learning_rate": 1.4850230510338792e-06, + "loss": 0.1038, + "step": 11926 + }, + { + "epoch": 1.9324368114063513, + "grad_norm": 0.7696671485900879, + "learning_rate": 1.484623424329395e-06, + "loss": 0.0931, + "step": 11927 + }, + { + "epoch": 1.932598833441348, + "grad_norm": 0.8365792036056519, + "learning_rate": 1.4842238286929777e-06, + "loss": 0.1076, + "step": 11928 + }, + { + "epoch": 1.932760855476345, + "grad_norm": 0.7698932886123657, + "learning_rate": 1.4838242641368526e-06, + "loss": 0.089, + "step": 11929 + }, + { + "epoch": 1.9329228775113414, + "grad_norm": 0.8664256930351257, + "learning_rate": 1.4834247306732457e-06, + "loss": 0.1108, + "step": 11930 + }, + { + "epoch": 1.9330848995463383, + "grad_norm": 0.9179291129112244, + "learning_rate": 1.4830252283143825e-06, + "loss": 0.107, + "step": 11931 + }, + { + "epoch": 1.933246921581335, + "grad_norm": 0.798081636428833, + "learning_rate": 1.4826257570724856e-06, + "loss": 0.0952, + "step": 11932 + }, + { + "epoch": 1.9334089436163318, + "grad_norm": 0.7657841444015503, + "learning_rate": 1.4822263169597789e-06, + "loss": 0.0892, + "step": 11933 + }, + { + "epoch": 1.9335709656513287, + "grad_norm": 0.7410942316055298, + "learning_rate": 1.4818269079884845e-06, + "loss": 0.091, + "step": 11934 + }, + { + "epoch": 1.9337329876863252, + "grad_norm": 0.8873445987701416, + "learning_rate": 1.4814275301708222e-06, + "loss": 0.1068, + "step": 11935 + }, + { + "epoch": 1.9338950097213221, + "grad_norm": 0.835503101348877, + "learning_rate": 1.4810281835190132e-06, + "loss": 0.0989, + "step": 11936 + }, + { + "epoch": 1.9340570317563188, + "grad_norm": 0.831396758556366, + "learning_rate": 1.4806288680452747e-06, + "loss": 0.1001, + "step": 11937 + }, + { + "epoch": 1.9342190537913155, + "grad_norm": 0.7926315665245056, + "learning_rate": 1.4802295837618268e-06, + "loss": 0.0978, + "step": 11938 + }, + { + "epoch": 1.9343810758263125, + "grad_norm": 0.9609958529472351, + "learning_rate": 1.4798303306808857e-06, + "loss": 0.1113, + "step": 11939 + }, + { + "epoch": 1.9345430978613092, + "grad_norm": 0.8151983618736267, + "learning_rate": 1.479431108814668e-06, + "loss": 0.0924, + "step": 11940 + }, + { + "epoch": 1.934705119896306, + "grad_norm": 0.763329803943634, + "learning_rate": 1.4790319181753884e-06, + "loss": 0.0871, + "step": 11941 + }, + { + "epoch": 1.9348671419313026, + "grad_norm": 0.827495813369751, + "learning_rate": 1.4786327587752608e-06, + "loss": 0.0946, + "step": 11942 + }, + { + "epoch": 1.9350291639662993, + "grad_norm": 0.7745943665504456, + "learning_rate": 1.4782336306265002e-06, + "loss": 0.0914, + "step": 11943 + }, + { + "epoch": 1.9351911860012962, + "grad_norm": 0.8348422646522522, + "learning_rate": 1.4778345337413174e-06, + "loss": 0.1025, + "step": 11944 + }, + { + "epoch": 1.935353208036293, + "grad_norm": 0.9461818933486938, + "learning_rate": 1.4774354681319247e-06, + "loss": 0.1125, + "step": 11945 + }, + { + "epoch": 1.9355152300712897, + "grad_norm": 8.034222602844238, + "learning_rate": 1.4770364338105315e-06, + "loss": 0.0958, + "step": 11946 + }, + { + "epoch": 1.9356772521062866, + "grad_norm": 0.747870147228241, + "learning_rate": 1.4766374307893477e-06, + "loss": 0.0903, + "step": 11947 + }, + { + "epoch": 1.935839274141283, + "grad_norm": 0.8536267876625061, + "learning_rate": 1.4762384590805823e-06, + "loss": 0.1048, + "step": 11948 + }, + { + "epoch": 1.93600129617628, + "grad_norm": 0.8837407827377319, + "learning_rate": 1.475839518696443e-06, + "loss": 0.1007, + "step": 11949 + }, + { + "epoch": 1.9361633182112767, + "grad_norm": 0.8272178173065186, + "learning_rate": 1.475440609649136e-06, + "loss": 0.1017, + "step": 11950 + }, + { + "epoch": 1.9363253402462735, + "grad_norm": 0.8612387180328369, + "learning_rate": 1.4750417319508658e-06, + "loss": 0.1037, + "step": 11951 + }, + { + "epoch": 1.9364873622812704, + "grad_norm": 0.8727052211761475, + "learning_rate": 1.4746428856138395e-06, + "loss": 0.1062, + "step": 11952 + }, + { + "epoch": 1.9366493843162669, + "grad_norm": 0.7352006435394287, + "learning_rate": 1.4742440706502591e-06, + "loss": 0.0934, + "step": 11953 + }, + { + "epoch": 1.9368114063512638, + "grad_norm": 0.8387872576713562, + "learning_rate": 1.4738452870723286e-06, + "loss": 0.1012, + "step": 11954 + }, + { + "epoch": 1.9369734283862605, + "grad_norm": 0.8393522500991821, + "learning_rate": 1.4734465348922484e-06, + "loss": 0.0972, + "step": 11955 + }, + { + "epoch": 1.9371354504212572, + "grad_norm": 0.8170238733291626, + "learning_rate": 1.4730478141222194e-06, + "loss": 0.0957, + "step": 11956 + }, + { + "epoch": 1.9372974724562542, + "grad_norm": 0.908352792263031, + "learning_rate": 1.4726491247744429e-06, + "loss": 0.1151, + "step": 11957 + }, + { + "epoch": 1.9374594944912507, + "grad_norm": 0.8904739022254944, + "learning_rate": 1.4722504668611172e-06, + "loss": 0.1008, + "step": 11958 + }, + { + "epoch": 1.9376215165262476, + "grad_norm": 0.7439394593238831, + "learning_rate": 1.4718518403944398e-06, + "loss": 0.0903, + "step": 11959 + }, + { + "epoch": 1.9377835385612443, + "grad_norm": 0.8450915217399597, + "learning_rate": 1.4714532453866084e-06, + "loss": 0.0941, + "step": 11960 + }, + { + "epoch": 1.937945560596241, + "grad_norm": 0.8702685236930847, + "learning_rate": 1.4710546818498178e-06, + "loss": 0.095, + "step": 11961 + }, + { + "epoch": 1.938107582631238, + "grad_norm": 0.807630717754364, + "learning_rate": 1.4706561497962644e-06, + "loss": 0.1006, + "step": 11962 + }, + { + "epoch": 1.9382696046662347, + "grad_norm": 0.8445601463317871, + "learning_rate": 1.470257649238142e-06, + "loss": 0.0985, + "step": 11963 + }, + { + "epoch": 1.9384316267012314, + "grad_norm": 0.7522417902946472, + "learning_rate": 1.4698591801876435e-06, + "loss": 0.0966, + "step": 11964 + }, + { + "epoch": 1.938593648736228, + "grad_norm": 0.9122772812843323, + "learning_rate": 1.4694607426569613e-06, + "loss": 0.106, + "step": 11965 + }, + { + "epoch": 1.9387556707712248, + "grad_norm": 0.8337773084640503, + "learning_rate": 1.4690623366582856e-06, + "loss": 0.1084, + "step": 11966 + }, + { + "epoch": 1.9389176928062217, + "grad_norm": 0.8264670968055725, + "learning_rate": 1.468663962203809e-06, + "loss": 0.108, + "step": 11967 + }, + { + "epoch": 1.9390797148412184, + "grad_norm": 0.7818570733070374, + "learning_rate": 1.4682656193057189e-06, + "loss": 0.0925, + "step": 11968 + }, + { + "epoch": 1.9392417368762151, + "grad_norm": 0.7806037664413452, + "learning_rate": 1.467867307976204e-06, + "loss": 0.0911, + "step": 11969 + }, + { + "epoch": 1.939403758911212, + "grad_norm": 0.8423177599906921, + "learning_rate": 1.4674690282274517e-06, + "loss": 0.0982, + "step": 11970 + }, + { + "epoch": 1.9395657809462086, + "grad_norm": 0.963801920413971, + "learning_rate": 1.4670707800716478e-06, + "loss": 0.1058, + "step": 11971 + }, + { + "epoch": 1.9397278029812055, + "grad_norm": 0.8713017106056213, + "learning_rate": 1.4666725635209794e-06, + "loss": 0.1015, + "step": 11972 + }, + { + "epoch": 1.9398898250162022, + "grad_norm": 0.7701468467712402, + "learning_rate": 1.4662743785876298e-06, + "loss": 0.0996, + "step": 11973 + }, + { + "epoch": 1.940051847051199, + "grad_norm": 0.7816067934036255, + "learning_rate": 1.4658762252837821e-06, + "loss": 0.0938, + "step": 11974 + }, + { + "epoch": 1.9402138690861959, + "grad_norm": 1.0052039623260498, + "learning_rate": 1.4654781036216193e-06, + "loss": 0.1145, + "step": 11975 + }, + { + "epoch": 1.9403758911211924, + "grad_norm": 0.8187944889068604, + "learning_rate": 1.4650800136133238e-06, + "loss": 0.094, + "step": 11976 + }, + { + "epoch": 1.9405379131561893, + "grad_norm": 0.7152525186538696, + "learning_rate": 1.4646819552710751e-06, + "loss": 0.0915, + "step": 11977 + }, + { + "epoch": 1.940699935191186, + "grad_norm": 0.8357136845588684, + "learning_rate": 1.4642839286070537e-06, + "loss": 0.0951, + "step": 11978 + }, + { + "epoch": 1.9408619572261827, + "grad_norm": 0.8096917271614075, + "learning_rate": 1.463885933633437e-06, + "loss": 0.1029, + "step": 11979 + }, + { + "epoch": 1.9410239792611796, + "grad_norm": 0.8772410154342651, + "learning_rate": 1.4634879703624027e-06, + "loss": 0.108, + "step": 11980 + }, + { + "epoch": 1.9411860012961761, + "grad_norm": 0.7658387422561646, + "learning_rate": 1.4630900388061292e-06, + "loss": 0.0968, + "step": 11981 + }, + { + "epoch": 1.941348023331173, + "grad_norm": 0.7496578693389893, + "learning_rate": 1.4626921389767915e-06, + "loss": 0.0883, + "step": 11982 + }, + { + "epoch": 1.9415100453661698, + "grad_norm": 0.852181613445282, + "learning_rate": 1.4622942708865635e-06, + "loss": 0.0962, + "step": 11983 + }, + { + "epoch": 1.9416720674011665, + "grad_norm": 0.8182169198989868, + "learning_rate": 1.4618964345476203e-06, + "loss": 0.1018, + "step": 11984 + }, + { + "epoch": 1.9418340894361634, + "grad_norm": 0.781848669052124, + "learning_rate": 1.4614986299721328e-06, + "loss": 0.0939, + "step": 11985 + }, + { + "epoch": 1.94199611147116, + "grad_norm": 0.7615278363227844, + "learning_rate": 1.4611008571722748e-06, + "loss": 0.0952, + "step": 11986 + }, + { + "epoch": 1.9421581335061568, + "grad_norm": 0.8403747081756592, + "learning_rate": 1.460703116160217e-06, + "loss": 0.1036, + "step": 11987 + }, + { + "epoch": 1.9423201555411536, + "grad_norm": 0.7197666168212891, + "learning_rate": 1.4603054069481282e-06, + "loss": 0.0873, + "step": 11988 + }, + { + "epoch": 1.9424821775761503, + "grad_norm": 0.7320033311843872, + "learning_rate": 1.4599077295481783e-06, + "loss": 0.0831, + "step": 11989 + }, + { + "epoch": 1.9426441996111472, + "grad_norm": 0.7511314749717712, + "learning_rate": 1.4595100839725338e-06, + "loss": 0.0869, + "step": 11990 + }, + { + "epoch": 1.942806221646144, + "grad_norm": 0.809007465839386, + "learning_rate": 1.4591124702333636e-06, + "loss": 0.0954, + "step": 11991 + }, + { + "epoch": 1.9429682436811406, + "grad_norm": 0.8283510804176331, + "learning_rate": 1.4587148883428337e-06, + "loss": 0.1011, + "step": 11992 + }, + { + "epoch": 1.9431302657161373, + "grad_norm": 0.7291625738143921, + "learning_rate": 1.4583173383131077e-06, + "loss": 0.0918, + "step": 11993 + }, + { + "epoch": 1.943292287751134, + "grad_norm": 0.7869513034820557, + "learning_rate": 1.45791982015635e-06, + "loss": 0.0993, + "step": 11994 + }, + { + "epoch": 1.943454309786131, + "grad_norm": 0.7561447620391846, + "learning_rate": 1.457522333884724e-06, + "loss": 0.0897, + "step": 11995 + }, + { + "epoch": 1.9436163318211277, + "grad_norm": 0.8446448445320129, + "learning_rate": 1.4571248795103921e-06, + "loss": 0.1032, + "step": 11996 + }, + { + "epoch": 1.9437783538561244, + "grad_norm": 0.8000361919403076, + "learning_rate": 1.456727457045515e-06, + "loss": 0.096, + "step": 11997 + }, + { + "epoch": 1.9439403758911213, + "grad_norm": 0.7143806219100952, + "learning_rate": 1.4563300665022534e-06, + "loss": 0.0851, + "step": 11998 + }, + { + "epoch": 1.9441023979261178, + "grad_norm": 0.7861617207527161, + "learning_rate": 1.4559327078927656e-06, + "loss": 0.098, + "step": 11999 + }, + { + "epoch": 1.9442644199611148, + "grad_norm": 0.8464603424072266, + "learning_rate": 1.4555353812292105e-06, + "loss": 0.0929, + "step": 12000 + }, + { + "epoch": 1.9444264419961115, + "grad_norm": 0.817179262638092, + "learning_rate": 1.4551380865237456e-06, + "loss": 0.09, + "step": 12001 + }, + { + "epoch": 1.9445884640311082, + "grad_norm": 0.9822414517402649, + "learning_rate": 1.4547408237885262e-06, + "loss": 0.1172, + "step": 12002 + }, + { + "epoch": 1.9447504860661051, + "grad_norm": 0.6776533722877502, + "learning_rate": 1.454343593035709e-06, + "loss": 0.078, + "step": 12003 + }, + { + "epoch": 1.9449125081011016, + "grad_norm": 0.7004638314247131, + "learning_rate": 1.4539463942774462e-06, + "loss": 0.0857, + "step": 12004 + }, + { + "epoch": 1.9450745301360985, + "grad_norm": 0.8583202362060547, + "learning_rate": 1.4535492275258928e-06, + "loss": 0.0998, + "step": 12005 + }, + { + "epoch": 1.9452365521710953, + "grad_norm": 0.9123525619506836, + "learning_rate": 1.4531520927932017e-06, + "loss": 0.107, + "step": 12006 + }, + { + "epoch": 1.945398574206092, + "grad_norm": 0.8352869153022766, + "learning_rate": 1.4527549900915222e-06, + "loss": 0.1025, + "step": 12007 + }, + { + "epoch": 1.945560596241089, + "grad_norm": 0.7704386115074158, + "learning_rate": 1.452357919433006e-06, + "loss": 0.0921, + "step": 12008 + }, + { + "epoch": 1.9457226182760854, + "grad_norm": 0.7887938618659973, + "learning_rate": 1.4519608808298007e-06, + "loss": 0.1008, + "step": 12009 + }, + { + "epoch": 1.9458846403110823, + "grad_norm": 0.7636969685554504, + "learning_rate": 1.4515638742940585e-06, + "loss": 0.0949, + "step": 12010 + }, + { + "epoch": 1.946046662346079, + "grad_norm": 0.7282156348228455, + "learning_rate": 1.4511668998379238e-06, + "loss": 0.082, + "step": 12011 + }, + { + "epoch": 1.9462086843810757, + "grad_norm": 0.8383048176765442, + "learning_rate": 1.4507699574735436e-06, + "loss": 0.1019, + "step": 12012 + }, + { + "epoch": 1.9463707064160727, + "grad_norm": 0.8490771055221558, + "learning_rate": 1.450373047213064e-06, + "loss": 0.0947, + "step": 12013 + }, + { + "epoch": 1.9465327284510694, + "grad_norm": 0.7578926086425781, + "learning_rate": 1.4499761690686287e-06, + "loss": 0.0871, + "step": 12014 + }, + { + "epoch": 1.946694750486066, + "grad_norm": 0.847022533416748, + "learning_rate": 1.4495793230523817e-06, + "loss": 0.1013, + "step": 12015 + }, + { + "epoch": 1.9468567725210628, + "grad_norm": 0.7949681282043457, + "learning_rate": 1.4491825091764656e-06, + "loss": 0.094, + "step": 12016 + }, + { + "epoch": 1.9470187945560595, + "grad_norm": 0.8004774451255798, + "learning_rate": 1.4487857274530214e-06, + "loss": 0.0959, + "step": 12017 + }, + { + "epoch": 1.9471808165910565, + "grad_norm": 0.8467872142791748, + "learning_rate": 1.4483889778941904e-06, + "loss": 0.1049, + "step": 12018 + }, + { + "epoch": 1.9473428386260532, + "grad_norm": 0.767632246017456, + "learning_rate": 1.4479922605121117e-06, + "loss": 0.0965, + "step": 12019 + }, + { + "epoch": 1.9475048606610499, + "grad_norm": 0.7307461500167847, + "learning_rate": 1.447595575318924e-06, + "loss": 0.0858, + "step": 12020 + }, + { + "epoch": 1.9476668826960468, + "grad_norm": 0.7664514780044556, + "learning_rate": 1.447198922326766e-06, + "loss": 0.1005, + "step": 12021 + }, + { + "epoch": 1.9478289047310433, + "grad_norm": 0.9641528129577637, + "learning_rate": 1.4468023015477722e-06, + "loss": 0.1177, + "step": 12022 + }, + { + "epoch": 1.9479909267660402, + "grad_norm": 0.8184154629707336, + "learning_rate": 1.4464057129940783e-06, + "loss": 0.0997, + "step": 12023 + }, + { + "epoch": 1.948152948801037, + "grad_norm": 0.7754473090171814, + "learning_rate": 1.446009156677822e-06, + "loss": 0.0928, + "step": 12024 + }, + { + "epoch": 1.9483149708360337, + "grad_norm": 0.8045803308486938, + "learning_rate": 1.4456126326111337e-06, + "loss": 0.0991, + "step": 12025 + }, + { + "epoch": 1.9484769928710306, + "grad_norm": 0.7364550828933716, + "learning_rate": 1.4452161408061478e-06, + "loss": 0.0943, + "step": 12026 + }, + { + "epoch": 1.948639014906027, + "grad_norm": 0.8381508588790894, + "learning_rate": 1.4448196812749948e-06, + "loss": 0.1039, + "step": 12027 + }, + { + "epoch": 1.948801036941024, + "grad_norm": 0.7855244278907776, + "learning_rate": 1.4444232540298064e-06, + "loss": 0.106, + "step": 12028 + }, + { + "epoch": 1.9489630589760207, + "grad_norm": 0.8092867136001587, + "learning_rate": 1.4440268590827117e-06, + "loss": 0.1001, + "step": 12029 + }, + { + "epoch": 1.9491250810110174, + "grad_norm": 0.8405110239982605, + "learning_rate": 1.44363049644584e-06, + "loss": 0.1049, + "step": 12030 + }, + { + "epoch": 1.9492871030460144, + "grad_norm": 0.821543276309967, + "learning_rate": 1.4432341661313188e-06, + "loss": 0.1045, + "step": 12031 + }, + { + "epoch": 1.9494491250810109, + "grad_norm": 0.7550950646400452, + "learning_rate": 1.4428378681512755e-06, + "loss": 0.0893, + "step": 12032 + }, + { + "epoch": 1.9496111471160078, + "grad_norm": 0.733284056186676, + "learning_rate": 1.4424416025178335e-06, + "loss": 0.0866, + "step": 12033 + }, + { + "epoch": 1.9497731691510045, + "grad_norm": 0.8085827231407166, + "learning_rate": 1.4420453692431197e-06, + "loss": 0.0921, + "step": 12034 + }, + { + "epoch": 1.9499351911860012, + "grad_norm": 0.7645505666732788, + "learning_rate": 1.441649168339258e-06, + "loss": 0.0936, + "step": 12035 + }, + { + "epoch": 1.9500972132209982, + "grad_norm": 0.781045138835907, + "learning_rate": 1.441252999818371e-06, + "loss": 0.0983, + "step": 12036 + }, + { + "epoch": 1.9502592352559946, + "grad_norm": 0.7690817713737488, + "learning_rate": 1.4408568636925796e-06, + "loss": 0.0885, + "step": 12037 + }, + { + "epoch": 1.9504212572909916, + "grad_norm": 0.8504323363304138, + "learning_rate": 1.440460759974004e-06, + "loss": 0.099, + "step": 12038 + }, + { + "epoch": 1.9505832793259883, + "grad_norm": 0.788281261920929, + "learning_rate": 1.4400646886747672e-06, + "loss": 0.0929, + "step": 12039 + }, + { + "epoch": 1.950745301360985, + "grad_norm": 0.7282304167747498, + "learning_rate": 1.4396686498069844e-06, + "loss": 0.0861, + "step": 12040 + }, + { + "epoch": 1.950907323395982, + "grad_norm": 0.8458184003829956, + "learning_rate": 1.4392726433827754e-06, + "loss": 0.1047, + "step": 12041 + }, + { + "epoch": 1.9510693454309787, + "grad_norm": 0.9088663458824158, + "learning_rate": 1.4388766694142553e-06, + "loss": 0.1065, + "step": 12042 + }, + { + "epoch": 1.9512313674659754, + "grad_norm": 0.827163815498352, + "learning_rate": 1.4384807279135438e-06, + "loss": 0.095, + "step": 12043 + }, + { + "epoch": 1.9513933895009723, + "grad_norm": 0.7869982123374939, + "learning_rate": 1.4380848188927516e-06, + "loss": 0.0893, + "step": 12044 + }, + { + "epoch": 1.9515554115359688, + "grad_norm": 0.940324604511261, + "learning_rate": 1.4376889423639945e-06, + "loss": 0.1045, + "step": 12045 + }, + { + "epoch": 1.9517174335709657, + "grad_norm": 0.8107567429542542, + "learning_rate": 1.4372930983393849e-06, + "loss": 0.0969, + "step": 12046 + }, + { + "epoch": 1.9518794556059624, + "grad_norm": 0.8711578845977783, + "learning_rate": 1.4368972868310349e-06, + "loss": 0.0977, + "step": 12047 + }, + { + "epoch": 1.9520414776409591, + "grad_norm": 0.7904216647148132, + "learning_rate": 1.4365015078510553e-06, + "loss": 0.102, + "step": 12048 + }, + { + "epoch": 1.952203499675956, + "grad_norm": 0.7687509655952454, + "learning_rate": 1.4361057614115557e-06, + "loss": 0.0939, + "step": 12049 + }, + { + "epoch": 1.9523655217109526, + "grad_norm": 0.7089811563491821, + "learning_rate": 1.4357100475246463e-06, + "loss": 0.0847, + "step": 12050 + }, + { + "epoch": 1.9525275437459495, + "grad_norm": 0.8005346059799194, + "learning_rate": 1.435314366202433e-06, + "loss": 0.0934, + "step": 12051 + }, + { + "epoch": 1.9526895657809462, + "grad_norm": 0.8521600365638733, + "learning_rate": 1.4349187174570226e-06, + "loss": 0.0972, + "step": 12052 + }, + { + "epoch": 1.952851587815943, + "grad_norm": 0.7018685936927795, + "learning_rate": 1.4345231013005229e-06, + "loss": 0.0859, + "step": 12053 + }, + { + "epoch": 1.9530136098509399, + "grad_norm": 0.8380179405212402, + "learning_rate": 1.4341275177450389e-06, + "loss": 0.0971, + "step": 12054 + }, + { + "epoch": 1.9531756318859363, + "grad_norm": 0.8714627623558044, + "learning_rate": 1.4337319668026726e-06, + "loss": 0.1073, + "step": 12055 + }, + { + "epoch": 1.9533376539209333, + "grad_norm": 0.7577527165412903, + "learning_rate": 1.4333364484855277e-06, + "loss": 0.0881, + "step": 12056 + }, + { + "epoch": 1.95349967595593, + "grad_norm": 0.8456433415412903, + "learning_rate": 1.4329409628057062e-06, + "loss": 0.0975, + "step": 12057 + }, + { + "epoch": 1.9536616979909267, + "grad_norm": 0.7880712151527405, + "learning_rate": 1.432545509775309e-06, + "loss": 0.0929, + "step": 12058 + }, + { + "epoch": 1.9538237200259236, + "grad_norm": 0.7699628472328186, + "learning_rate": 1.432150089406436e-06, + "loss": 0.0881, + "step": 12059 + }, + { + "epoch": 1.9539857420609201, + "grad_norm": 0.7075265049934387, + "learning_rate": 1.4317547017111865e-06, + "loss": 0.0902, + "step": 12060 + }, + { + "epoch": 1.954147764095917, + "grad_norm": 0.8275308609008789, + "learning_rate": 1.4313593467016576e-06, + "loss": 0.1006, + "step": 12061 + }, + { + "epoch": 1.9543097861309138, + "grad_norm": 0.876686155796051, + "learning_rate": 1.4309640243899467e-06, + "loss": 0.0997, + "step": 12062 + }, + { + "epoch": 1.9544718081659105, + "grad_norm": 0.9776948690414429, + "learning_rate": 1.4305687347881497e-06, + "loss": 0.102, + "step": 12063 + }, + { + "epoch": 1.9546338302009074, + "grad_norm": 0.8320091962814331, + "learning_rate": 1.4301734779083614e-06, + "loss": 0.0979, + "step": 12064 + }, + { + "epoch": 1.9547958522359041, + "grad_norm": 0.9126972556114197, + "learning_rate": 1.429778253762677e-06, + "loss": 0.1078, + "step": 12065 + }, + { + "epoch": 1.9549578742709008, + "grad_norm": 0.7437605857849121, + "learning_rate": 1.4293830623631857e-06, + "loss": 0.0844, + "step": 12066 + }, + { + "epoch": 1.9551198963058976, + "grad_norm": 0.8160969018936157, + "learning_rate": 1.4289879037219832e-06, + "loss": 0.0981, + "step": 12067 + }, + { + "epoch": 1.9552819183408943, + "grad_norm": 0.7898657917976379, + "learning_rate": 1.4285927778511598e-06, + "loss": 0.0951, + "step": 12068 + }, + { + "epoch": 1.9554439403758912, + "grad_norm": 0.87503582239151, + "learning_rate": 1.4281976847628038e-06, + "loss": 0.1059, + "step": 12069 + }, + { + "epoch": 1.955605962410888, + "grad_norm": 0.8932657241821289, + "learning_rate": 1.4278026244690046e-06, + "loss": 0.1041, + "step": 12070 + }, + { + "epoch": 1.9557679844458846, + "grad_norm": 0.878448486328125, + "learning_rate": 1.4274075969818498e-06, + "loss": 0.092, + "step": 12071 + }, + { + "epoch": 1.9559300064808816, + "grad_norm": 0.7802169322967529, + "learning_rate": 1.427012602313429e-06, + "loss": 0.0938, + "step": 12072 + }, + { + "epoch": 1.956092028515878, + "grad_norm": 0.9439387321472168, + "learning_rate": 1.4266176404758246e-06, + "loss": 0.1151, + "step": 12073 + }, + { + "epoch": 1.956254050550875, + "grad_norm": 0.7736334800720215, + "learning_rate": 1.4262227114811233e-06, + "loss": 0.0914, + "step": 12074 + }, + { + "epoch": 1.9564160725858717, + "grad_norm": 0.8040494322776794, + "learning_rate": 1.4258278153414082e-06, + "loss": 0.0988, + "step": 12075 + }, + { + "epoch": 1.9565780946208684, + "grad_norm": 0.8778871297836304, + "learning_rate": 1.4254329520687626e-06, + "loss": 0.1024, + "step": 12076 + }, + { + "epoch": 1.9567401166558653, + "grad_norm": 0.7685941457748413, + "learning_rate": 1.4250381216752685e-06, + "loss": 0.0928, + "step": 12077 + }, + { + "epoch": 1.9569021386908618, + "grad_norm": 0.8489927053451538, + "learning_rate": 1.4246433241730062e-06, + "loss": 0.1049, + "step": 12078 + }, + { + "epoch": 1.9570641607258588, + "grad_norm": 0.7054093480110168, + "learning_rate": 1.4242485595740557e-06, + "loss": 0.0874, + "step": 12079 + }, + { + "epoch": 1.9572261827608555, + "grad_norm": 0.8379080295562744, + "learning_rate": 1.4238538278904973e-06, + "loss": 0.1119, + "step": 12080 + }, + { + "epoch": 1.9573882047958522, + "grad_norm": 0.7417575716972351, + "learning_rate": 1.4234591291344058e-06, + "loss": 0.0906, + "step": 12081 + }, + { + "epoch": 1.9575502268308491, + "grad_norm": 0.7702432870864868, + "learning_rate": 1.4230644633178603e-06, + "loss": 0.0917, + "step": 12082 + }, + { + "epoch": 1.9577122488658456, + "grad_norm": 0.7377634644508362, + "learning_rate": 1.4226698304529373e-06, + "loss": 0.0904, + "step": 12083 + }, + { + "epoch": 1.9578742709008425, + "grad_norm": 0.7292881608009338, + "learning_rate": 1.4222752305517093e-06, + "loss": 0.0947, + "step": 12084 + }, + { + "epoch": 1.9580362929358393, + "grad_norm": 0.8291403651237488, + "learning_rate": 1.4218806636262504e-06, + "loss": 0.0972, + "step": 12085 + }, + { + "epoch": 1.958198314970836, + "grad_norm": 0.8070185780525208, + "learning_rate": 1.421486129688635e-06, + "loss": 0.0977, + "step": 12086 + }, + { + "epoch": 1.958360337005833, + "grad_norm": 0.8218309283256531, + "learning_rate": 1.421091628750935e-06, + "loss": 0.1092, + "step": 12087 + }, + { + "epoch": 1.9585223590408296, + "grad_norm": 0.911953330039978, + "learning_rate": 1.4206971608252196e-06, + "loss": 0.1041, + "step": 12088 + }, + { + "epoch": 1.9586843810758263, + "grad_norm": 0.8383828997612, + "learning_rate": 1.4203027259235592e-06, + "loss": 0.0998, + "step": 12089 + }, + { + "epoch": 1.958846403110823, + "grad_norm": 0.7073849439620972, + "learning_rate": 1.4199083240580218e-06, + "loss": 0.0869, + "step": 12090 + }, + { + "epoch": 1.9590084251458197, + "grad_norm": 0.8118810653686523, + "learning_rate": 1.4195139552406766e-06, + "loss": 0.095, + "step": 12091 + }, + { + "epoch": 1.9591704471808167, + "grad_norm": 0.8553704619407654, + "learning_rate": 1.41911961948359e-06, + "loss": 0.1069, + "step": 12092 + }, + { + "epoch": 1.9593324692158134, + "grad_norm": 0.8816598653793335, + "learning_rate": 1.4187253167988266e-06, + "loss": 0.1, + "step": 12093 + }, + { + "epoch": 1.95949449125081, + "grad_norm": 0.7908227443695068, + "learning_rate": 1.4183310471984532e-06, + "loss": 0.0944, + "step": 12094 + }, + { + "epoch": 1.959656513285807, + "grad_norm": 0.8013996481895447, + "learning_rate": 1.41793681069453e-06, + "loss": 0.1013, + "step": 12095 + }, + { + "epoch": 1.9598185353208035, + "grad_norm": 0.8322909474372864, + "learning_rate": 1.4175426072991234e-06, + "loss": 0.0962, + "step": 12096 + }, + { + "epoch": 1.9599805573558005, + "grad_norm": 0.8282430768013, + "learning_rate": 1.4171484370242927e-06, + "loss": 0.1035, + "step": 12097 + }, + { + "epoch": 1.9601425793907972, + "grad_norm": 0.8286914229393005, + "learning_rate": 1.416754299882101e-06, + "loss": 0.0942, + "step": 12098 + }, + { + "epoch": 1.9603046014257939, + "grad_norm": 0.7469590902328491, + "learning_rate": 1.4163601958846052e-06, + "loss": 0.0882, + "step": 12099 + }, + { + "epoch": 1.9604666234607908, + "grad_norm": 0.8161764144897461, + "learning_rate": 1.415966125043864e-06, + "loss": 0.1035, + "step": 12100 + }, + { + "epoch": 1.9606286454957873, + "grad_norm": 0.8531409502029419, + "learning_rate": 1.4155720873719378e-06, + "loss": 0.0952, + "step": 12101 + }, + { + "epoch": 1.9607906675307842, + "grad_norm": 0.8322476148605347, + "learning_rate": 1.415178082880881e-06, + "loss": 0.0941, + "step": 12102 + }, + { + "epoch": 1.960952689565781, + "grad_norm": 0.7497605681419373, + "learning_rate": 1.414784111582749e-06, + "loss": 0.087, + "step": 12103 + }, + { + "epoch": 1.9611147116007777, + "grad_norm": 0.7348429560661316, + "learning_rate": 1.4143901734895973e-06, + "loss": 0.0873, + "step": 12104 + }, + { + "epoch": 1.9612767336357746, + "grad_norm": 0.8899442553520203, + "learning_rate": 1.4139962686134792e-06, + "loss": 0.1037, + "step": 12105 + }, + { + "epoch": 1.961438755670771, + "grad_norm": 0.7474508881568909, + "learning_rate": 1.4136023969664471e-06, + "loss": 0.0878, + "step": 12106 + }, + { + "epoch": 1.961600777705768, + "grad_norm": 0.8204247951507568, + "learning_rate": 1.4132085585605528e-06, + "loss": 0.1014, + "step": 12107 + }, + { + "epoch": 1.9617627997407647, + "grad_norm": 0.906720757484436, + "learning_rate": 1.4128147534078469e-06, + "loss": 0.0965, + "step": 12108 + }, + { + "epoch": 1.9619248217757614, + "grad_norm": 0.8094850778579712, + "learning_rate": 1.4124209815203779e-06, + "loss": 0.0905, + "step": 12109 + }, + { + "epoch": 1.9620868438107584, + "grad_norm": 0.8408055901527405, + "learning_rate": 1.4120272429101955e-06, + "loss": 0.1028, + "step": 12110 + }, + { + "epoch": 1.9622488658457549, + "grad_norm": 0.852473258972168, + "learning_rate": 1.4116335375893464e-06, + "loss": 0.0992, + "step": 12111 + }, + { + "epoch": 1.9624108878807518, + "grad_norm": 0.7675461173057556, + "learning_rate": 1.4112398655698772e-06, + "loss": 0.0989, + "step": 12112 + }, + { + "epoch": 1.9625729099157485, + "grad_norm": 0.7929125428199768, + "learning_rate": 1.4108462268638346e-06, + "loss": 0.0971, + "step": 12113 + }, + { + "epoch": 1.9627349319507452, + "grad_norm": 0.7752059102058411, + "learning_rate": 1.4104526214832595e-06, + "loss": 0.0955, + "step": 12114 + }, + { + "epoch": 1.9628969539857422, + "grad_norm": 0.7863343358039856, + "learning_rate": 1.4100590494401988e-06, + "loss": 0.0925, + "step": 12115 + }, + { + "epoch": 1.9630589760207389, + "grad_norm": 0.8089168071746826, + "learning_rate": 1.4096655107466943e-06, + "loss": 0.0934, + "step": 12116 + }, + { + "epoch": 1.9632209980557356, + "grad_norm": 0.8375380635261536, + "learning_rate": 1.4092720054147857e-06, + "loss": 0.0978, + "step": 12117 + }, + { + "epoch": 1.9633830200907323, + "grad_norm": 0.7996324896812439, + "learning_rate": 1.4088785334565145e-06, + "loss": 0.0971, + "step": 12118 + }, + { + "epoch": 1.963545042125729, + "grad_norm": 0.7718796730041504, + "learning_rate": 1.4084850948839194e-06, + "loss": 0.1003, + "step": 12119 + }, + { + "epoch": 1.963707064160726, + "grad_norm": 0.8828782439231873, + "learning_rate": 1.4080916897090391e-06, + "loss": 0.1085, + "step": 12120 + }, + { + "epoch": 1.9638690861957226, + "grad_norm": 0.8628697395324707, + "learning_rate": 1.4076983179439107e-06, + "loss": 0.1042, + "step": 12121 + }, + { + "epoch": 1.9640311082307194, + "grad_norm": 0.802811861038208, + "learning_rate": 1.4073049796005705e-06, + "loss": 0.1005, + "step": 12122 + }, + { + "epoch": 1.9641931302657163, + "grad_norm": 0.7491890788078308, + "learning_rate": 1.4069116746910536e-06, + "loss": 0.0984, + "step": 12123 + }, + { + "epoch": 1.9643551523007128, + "grad_norm": 0.9189904928207397, + "learning_rate": 1.4065184032273942e-06, + "loss": 0.1173, + "step": 12124 + }, + { + "epoch": 1.9645171743357097, + "grad_norm": 0.6986983418464661, + "learning_rate": 1.4061251652216254e-06, + "loss": 0.0886, + "step": 12125 + }, + { + "epoch": 1.9646791963707064, + "grad_norm": 0.9891625642776489, + "learning_rate": 1.4057319606857795e-06, + "loss": 0.1135, + "step": 12126 + }, + { + "epoch": 1.9648412184057031, + "grad_norm": 0.8146089911460876, + "learning_rate": 1.4053387896318888e-06, + "loss": 0.1027, + "step": 12127 + }, + { + "epoch": 1.9650032404407, + "grad_norm": 0.8153820633888245, + "learning_rate": 1.4049456520719805e-06, + "loss": 0.0987, + "step": 12128 + }, + { + "epoch": 1.9651652624756966, + "grad_norm": 0.6867256164550781, + "learning_rate": 1.4045525480180849e-06, + "loss": 0.0889, + "step": 12129 + }, + { + "epoch": 1.9653272845106935, + "grad_norm": 0.8280233144760132, + "learning_rate": 1.404159477482231e-06, + "loss": 0.1033, + "step": 12130 + }, + { + "epoch": 1.9654893065456902, + "grad_norm": 0.7022069096565247, + "learning_rate": 1.4037664404764465e-06, + "loss": 0.083, + "step": 12131 + }, + { + "epoch": 1.965651328580687, + "grad_norm": 0.8097946643829346, + "learning_rate": 1.403373437012755e-06, + "loss": 0.0949, + "step": 12132 + }, + { + "epoch": 1.9658133506156839, + "grad_norm": 0.7029479742050171, + "learning_rate": 1.4029804671031812e-06, + "loss": 0.0877, + "step": 12133 + }, + { + "epoch": 1.9659753726506803, + "grad_norm": 0.7599433064460754, + "learning_rate": 1.4025875307597528e-06, + "loss": 0.0892, + "step": 12134 + }, + { + "epoch": 1.9661373946856773, + "grad_norm": 0.754822313785553, + "learning_rate": 1.4021946279944893e-06, + "loss": 0.0842, + "step": 12135 + }, + { + "epoch": 1.966299416720674, + "grad_norm": 0.8841560482978821, + "learning_rate": 1.4018017588194132e-06, + "loss": 0.1086, + "step": 12136 + }, + { + "epoch": 1.9664614387556707, + "grad_norm": 0.7273808121681213, + "learning_rate": 1.4014089232465458e-06, + "loss": 0.0901, + "step": 12137 + }, + { + "epoch": 1.9666234607906676, + "grad_norm": 0.7973136305809021, + "learning_rate": 1.401016121287907e-06, + "loss": 0.085, + "step": 12138 + }, + { + "epoch": 1.9667854828256643, + "grad_norm": 0.7728840708732605, + "learning_rate": 1.4006233529555152e-06, + "loss": 0.0985, + "step": 12139 + }, + { + "epoch": 1.966947504860661, + "grad_norm": 0.807805597782135, + "learning_rate": 1.4002306182613885e-06, + "loss": 0.1028, + "step": 12140 + }, + { + "epoch": 1.9671095268956578, + "grad_norm": 0.8040981888771057, + "learning_rate": 1.399837917217543e-06, + "loss": 0.105, + "step": 12141 + }, + { + "epoch": 1.9672715489306545, + "grad_norm": 0.8386695384979248, + "learning_rate": 1.3994452498359963e-06, + "loss": 0.1036, + "step": 12142 + }, + { + "epoch": 1.9674335709656514, + "grad_norm": 0.7889523506164551, + "learning_rate": 1.39905261612876e-06, + "loss": 0.091, + "step": 12143 + }, + { + "epoch": 1.9675955930006481, + "grad_norm": 0.792294979095459, + "learning_rate": 1.39866001610785e-06, + "loss": 0.0958, + "step": 12144 + }, + { + "epoch": 1.9677576150356448, + "grad_norm": 0.7585206627845764, + "learning_rate": 1.3982674497852794e-06, + "loss": 0.0992, + "step": 12145 + }, + { + "epoch": 1.9679196370706418, + "grad_norm": 0.8587349057197571, + "learning_rate": 1.3978749171730577e-06, + "loss": 0.1085, + "step": 12146 + }, + { + "epoch": 1.9680816591056383, + "grad_norm": 0.742311418056488, + "learning_rate": 1.3974824182831965e-06, + "loss": 0.0892, + "step": 12147 + }, + { + "epoch": 1.9682436811406352, + "grad_norm": 0.805219829082489, + "learning_rate": 1.397089953127704e-06, + "loss": 0.0963, + "step": 12148 + }, + { + "epoch": 1.968405703175632, + "grad_norm": 0.8392459154129028, + "learning_rate": 1.3966975217185922e-06, + "loss": 0.0924, + "step": 12149 + }, + { + "epoch": 1.9685677252106286, + "grad_norm": 0.7114721536636353, + "learning_rate": 1.3963051240678652e-06, + "loss": 0.0909, + "step": 12150 + }, + { + "epoch": 1.9687297472456255, + "grad_norm": 0.8719701766967773, + "learning_rate": 1.3959127601875305e-06, + "loss": 0.1132, + "step": 12151 + }, + { + "epoch": 1.968891769280622, + "grad_norm": 0.7276889085769653, + "learning_rate": 1.3955204300895937e-06, + "loss": 0.0862, + "step": 12152 + }, + { + "epoch": 1.969053791315619, + "grad_norm": 0.7744901180267334, + "learning_rate": 1.3951281337860583e-06, + "loss": 0.0942, + "step": 12153 + }, + { + "epoch": 1.9692158133506157, + "grad_norm": 0.8694067001342773, + "learning_rate": 1.3947358712889292e-06, + "loss": 0.0981, + "step": 12154 + }, + { + "epoch": 1.9693778353856124, + "grad_norm": 0.8138403296470642, + "learning_rate": 1.394343642610207e-06, + "loss": 0.1028, + "step": 12155 + }, + { + "epoch": 1.9695398574206093, + "grad_norm": 0.8411658406257629, + "learning_rate": 1.3939514477618944e-06, + "loss": 0.0972, + "step": 12156 + }, + { + "epoch": 1.9697018794556058, + "grad_norm": 0.7511509656906128, + "learning_rate": 1.3935592867559902e-06, + "loss": 0.0868, + "step": 12157 + }, + { + "epoch": 1.9698639014906028, + "grad_norm": 0.7799199819564819, + "learning_rate": 1.3931671596044946e-06, + "loss": 0.103, + "step": 12158 + }, + { + "epoch": 1.9700259235255995, + "grad_norm": 0.7285469770431519, + "learning_rate": 1.3927750663194055e-06, + "loss": 0.0917, + "step": 12159 + }, + { + "epoch": 1.9701879455605962, + "grad_norm": 0.8250842690467834, + "learning_rate": 1.392383006912721e-06, + "loss": 0.0929, + "step": 12160 + }, + { + "epoch": 1.970349967595593, + "grad_norm": 0.7213485836982727, + "learning_rate": 1.391990981396435e-06, + "loss": 0.0876, + "step": 12161 + }, + { + "epoch": 1.9705119896305896, + "grad_norm": 0.7967683672904968, + "learning_rate": 1.3915989897825424e-06, + "loss": 0.0975, + "step": 12162 + }, + { + "epoch": 1.9706740116655865, + "grad_norm": 0.7662811875343323, + "learning_rate": 1.3912070320830406e-06, + "loss": 0.0975, + "step": 12163 + }, + { + "epoch": 1.9708360337005832, + "grad_norm": 0.7548596262931824, + "learning_rate": 1.3908151083099195e-06, + "loss": 0.0958, + "step": 12164 + }, + { + "epoch": 1.97099805573558, + "grad_norm": 0.7877540588378906, + "learning_rate": 1.390423218475172e-06, + "loss": 0.0953, + "step": 12165 + }, + { + "epoch": 1.971160077770577, + "grad_norm": 0.7539090514183044, + "learning_rate": 1.3900313625907886e-06, + "loss": 0.0947, + "step": 12166 + }, + { + "epoch": 1.9713220998055736, + "grad_norm": 0.7667350769042969, + "learning_rate": 1.3896395406687597e-06, + "loss": 0.091, + "step": 12167 + }, + { + "epoch": 1.9714841218405703, + "grad_norm": 0.9332156777381897, + "learning_rate": 1.3892477527210734e-06, + "loss": 0.1106, + "step": 12168 + }, + { + "epoch": 1.971646143875567, + "grad_norm": 0.8445827960968018, + "learning_rate": 1.3888559987597182e-06, + "loss": 0.1082, + "step": 12169 + }, + { + "epoch": 1.9718081659105637, + "grad_norm": 0.8100192546844482, + "learning_rate": 1.3884642787966806e-06, + "loss": 0.0948, + "step": 12170 + }, + { + "epoch": 1.9719701879455607, + "grad_norm": 1.1084216833114624, + "learning_rate": 1.3880725928439472e-06, + "loss": 0.1227, + "step": 12171 + }, + { + "epoch": 1.9721322099805574, + "grad_norm": 0.7511377930641174, + "learning_rate": 1.3876809409134994e-06, + "loss": 0.0881, + "step": 12172 + }, + { + "epoch": 1.972294232015554, + "grad_norm": 0.8674477338790894, + "learning_rate": 1.3872893230173245e-06, + "loss": 0.1032, + "step": 12173 + }, + { + "epoch": 1.972456254050551, + "grad_norm": 0.8338192701339722, + "learning_rate": 1.3868977391674033e-06, + "loss": 0.1097, + "step": 12174 + }, + { + "epoch": 1.9726182760855475, + "grad_norm": 0.7292324304580688, + "learning_rate": 1.3865061893757187e-06, + "loss": 0.0908, + "step": 12175 + }, + { + "epoch": 1.9727802981205445, + "grad_norm": 0.8517992496490479, + "learning_rate": 1.386114673654248e-06, + "loss": 0.0971, + "step": 12176 + }, + { + "epoch": 1.9729423201555412, + "grad_norm": 0.6976124048233032, + "learning_rate": 1.3857231920149738e-06, + "loss": 0.0792, + "step": 12177 + }, + { + "epoch": 1.9731043421905379, + "grad_norm": 0.8370488286018372, + "learning_rate": 1.3853317444698744e-06, + "loss": 0.1007, + "step": 12178 + }, + { + "epoch": 1.9732663642255348, + "grad_norm": 1.0718648433685303, + "learning_rate": 1.3849403310309251e-06, + "loss": 0.1228, + "step": 12179 + }, + { + "epoch": 1.9734283862605313, + "grad_norm": 0.7935207486152649, + "learning_rate": 1.3845489517101036e-06, + "loss": 0.1041, + "step": 12180 + }, + { + "epoch": 1.9735904082955282, + "grad_norm": 0.8802226185798645, + "learning_rate": 1.3841576065193834e-06, + "loss": 0.1127, + "step": 12181 + }, + { + "epoch": 1.973752430330525, + "grad_norm": 0.8075469136238098, + "learning_rate": 1.3837662954707426e-06, + "loss": 0.0987, + "step": 12182 + }, + { + "epoch": 1.9739144523655217, + "grad_norm": 0.7968436479568481, + "learning_rate": 1.3833750185761507e-06, + "loss": 0.0968, + "step": 12183 + }, + { + "epoch": 1.9740764744005186, + "grad_norm": 0.8242005109786987, + "learning_rate": 1.3829837758475808e-06, + "loss": 0.0978, + "step": 12184 + }, + { + "epoch": 1.974238496435515, + "grad_norm": 0.7901813983917236, + "learning_rate": 1.3825925672970048e-06, + "loss": 0.1014, + "step": 12185 + }, + { + "epoch": 1.974400518470512, + "grad_norm": 0.8069804310798645, + "learning_rate": 1.3822013929363914e-06, + "loss": 0.0971, + "step": 12186 + }, + { + "epoch": 1.9745625405055087, + "grad_norm": 0.8082523345947266, + "learning_rate": 1.3818102527777111e-06, + "loss": 0.0949, + "step": 12187 + }, + { + "epoch": 1.9747245625405054, + "grad_norm": 0.7194974422454834, + "learning_rate": 1.3814191468329307e-06, + "loss": 0.0868, + "step": 12188 + }, + { + "epoch": 1.9748865845755024, + "grad_norm": 0.8970668911933899, + "learning_rate": 1.3810280751140188e-06, + "loss": 0.1057, + "step": 12189 + }, + { + "epoch": 1.975048606610499, + "grad_norm": 0.7686210870742798, + "learning_rate": 1.3806370376329388e-06, + "loss": 0.0913, + "step": 12190 + }, + { + "epoch": 1.9752106286454958, + "grad_norm": 0.8302438259124756, + "learning_rate": 1.3802460344016552e-06, + "loss": 0.1052, + "step": 12191 + }, + { + "epoch": 1.9753726506804925, + "grad_norm": 0.7827860116958618, + "learning_rate": 1.3798550654321347e-06, + "loss": 0.091, + "step": 12192 + }, + { + "epoch": 1.9755346727154892, + "grad_norm": 0.8492727875709534, + "learning_rate": 1.3794641307363393e-06, + "loss": 0.1065, + "step": 12193 + }, + { + "epoch": 1.9756966947504861, + "grad_norm": 0.8518586158752441, + "learning_rate": 1.379073230326229e-06, + "loss": 0.1037, + "step": 12194 + }, + { + "epoch": 1.9758587167854829, + "grad_norm": 0.8506153225898743, + "learning_rate": 1.378682364213765e-06, + "loss": 0.1091, + "step": 12195 + }, + { + "epoch": 1.9760207388204796, + "grad_norm": 0.895915687084198, + "learning_rate": 1.3782915324109075e-06, + "loss": 0.1101, + "step": 12196 + }, + { + "epoch": 1.9761827608554765, + "grad_norm": 0.8467504978179932, + "learning_rate": 1.377900734929614e-06, + "loss": 0.0992, + "step": 12197 + }, + { + "epoch": 1.976344782890473, + "grad_norm": 0.7702296376228333, + "learning_rate": 1.3775099717818432e-06, + "loss": 0.0924, + "step": 12198 + }, + { + "epoch": 1.97650680492547, + "grad_norm": 0.8202542066574097, + "learning_rate": 1.377119242979551e-06, + "loss": 0.0916, + "step": 12199 + }, + { + "epoch": 1.9766688269604666, + "grad_norm": 0.9093190431594849, + "learning_rate": 1.376728548534692e-06, + "loss": 0.1095, + "step": 12200 + }, + { + "epoch": 1.9768308489954634, + "grad_norm": 0.8310642242431641, + "learning_rate": 1.3763378884592215e-06, + "loss": 0.0967, + "step": 12201 + }, + { + "epoch": 1.9769928710304603, + "grad_norm": 0.774259090423584, + "learning_rate": 1.3759472627650926e-06, + "loss": 0.0962, + "step": 12202 + }, + { + "epoch": 1.9771548930654568, + "grad_norm": 0.7820743918418884, + "learning_rate": 1.3755566714642571e-06, + "loss": 0.0974, + "step": 12203 + }, + { + "epoch": 1.9773169151004537, + "grad_norm": 0.8799723386764526, + "learning_rate": 1.3751661145686673e-06, + "loss": 0.1064, + "step": 12204 + }, + { + "epoch": 1.9774789371354504, + "grad_norm": 0.7707881927490234, + "learning_rate": 1.3747755920902706e-06, + "loss": 0.094, + "step": 12205 + }, + { + "epoch": 1.9776409591704471, + "grad_norm": 0.9051207900047302, + "learning_rate": 1.3743851040410183e-06, + "loss": 0.1111, + "step": 12206 + }, + { + "epoch": 1.977802981205444, + "grad_norm": 0.8284469246864319, + "learning_rate": 1.3739946504328594e-06, + "loss": 0.1007, + "step": 12207 + }, + { + "epoch": 1.9779650032404406, + "grad_norm": 0.7556292414665222, + "learning_rate": 1.3736042312777381e-06, + "loss": 0.0925, + "step": 12208 + }, + { + "epoch": 1.9781270252754375, + "grad_norm": 0.798641562461853, + "learning_rate": 1.3732138465876012e-06, + "loss": 0.0906, + "step": 12209 + }, + { + "epoch": 1.9782890473104342, + "grad_norm": 0.7734487652778625, + "learning_rate": 1.3728234963743931e-06, + "loss": 0.0985, + "step": 12210 + }, + { + "epoch": 1.978451069345431, + "grad_norm": 0.7930302619934082, + "learning_rate": 1.3724331806500604e-06, + "loss": 0.0921, + "step": 12211 + }, + { + "epoch": 1.9786130913804278, + "grad_norm": 0.8522673845291138, + "learning_rate": 1.3720428994265427e-06, + "loss": 0.0993, + "step": 12212 + }, + { + "epoch": 1.9787751134154243, + "grad_norm": 0.7778444290161133, + "learning_rate": 1.3716526527157826e-06, + "loss": 0.0865, + "step": 12213 + }, + { + "epoch": 1.9789371354504213, + "grad_norm": 0.8435918092727661, + "learning_rate": 1.3712624405297209e-06, + "loss": 0.0991, + "step": 12214 + }, + { + "epoch": 1.979099157485418, + "grad_norm": 0.9310854077339172, + "learning_rate": 1.3708722628802968e-06, + "loss": 0.1127, + "step": 12215 + }, + { + "epoch": 1.9792611795204147, + "grad_norm": 0.7594826221466064, + "learning_rate": 1.3704821197794491e-06, + "loss": 0.093, + "step": 12216 + }, + { + "epoch": 1.9794232015554116, + "grad_norm": 0.6724945902824402, + "learning_rate": 1.3700920112391152e-06, + "loss": 0.0864, + "step": 12217 + }, + { + "epoch": 1.9795852235904083, + "grad_norm": 0.7259680032730103, + "learning_rate": 1.369701937271231e-06, + "loss": 0.0876, + "step": 12218 + }, + { + "epoch": 1.979747245625405, + "grad_norm": 1.109673261642456, + "learning_rate": 1.369311897887733e-06, + "loss": 0.107, + "step": 12219 + }, + { + "epoch": 1.9799092676604018, + "grad_norm": 0.7597100138664246, + "learning_rate": 1.3689218931005543e-06, + "loss": 0.09, + "step": 12220 + }, + { + "epoch": 1.9800712896953985, + "grad_norm": 0.8943421244621277, + "learning_rate": 1.3685319229216287e-06, + "loss": 0.1104, + "step": 12221 + }, + { + "epoch": 1.9802333117303954, + "grad_norm": 0.8570875525474548, + "learning_rate": 1.368141987362889e-06, + "loss": 0.0965, + "step": 12222 + }, + { + "epoch": 1.9803953337653921, + "grad_norm": 0.8116633296012878, + "learning_rate": 1.3677520864362644e-06, + "loss": 0.0989, + "step": 12223 + }, + { + "epoch": 1.9805573558003888, + "grad_norm": 0.8231261968612671, + "learning_rate": 1.3673622201536852e-06, + "loss": 0.0923, + "step": 12224 + }, + { + "epoch": 1.9807193778353858, + "grad_norm": 0.7679911851882935, + "learning_rate": 1.366972388527082e-06, + "loss": 0.0988, + "step": 12225 + }, + { + "epoch": 1.9808813998703823, + "grad_norm": 0.7934950590133667, + "learning_rate": 1.3665825915683829e-06, + "loss": 0.0949, + "step": 12226 + }, + { + "epoch": 1.9810434219053792, + "grad_norm": 0.8774157762527466, + "learning_rate": 1.3661928292895123e-06, + "loss": 0.1021, + "step": 12227 + }, + { + "epoch": 1.981205443940376, + "grad_norm": 0.8431322574615479, + "learning_rate": 1.3658031017023977e-06, + "loss": 0.1071, + "step": 12228 + }, + { + "epoch": 1.9813674659753726, + "grad_norm": 0.7929509878158569, + "learning_rate": 1.3654134088189636e-06, + "loss": 0.0955, + "step": 12229 + }, + { + "epoch": 1.9815294880103695, + "grad_norm": 0.8263244032859802, + "learning_rate": 1.3650237506511333e-06, + "loss": 0.0889, + "step": 12230 + }, + { + "epoch": 1.981691510045366, + "grad_norm": 0.8262062668800354, + "learning_rate": 1.36463412721083e-06, + "loss": 0.1029, + "step": 12231 + }, + { + "epoch": 1.981853532080363, + "grad_norm": 0.8104032874107361, + "learning_rate": 1.3642445385099746e-06, + "loss": 0.093, + "step": 12232 + }, + { + "epoch": 1.9820155541153597, + "grad_norm": 0.7863628268241882, + "learning_rate": 1.3638549845604886e-06, + "loss": 0.0944, + "step": 12233 + }, + { + "epoch": 1.9821775761503564, + "grad_norm": 0.7456783652305603, + "learning_rate": 1.363465465374289e-06, + "loss": 0.093, + "step": 12234 + }, + { + "epoch": 1.9823395981853533, + "grad_norm": 0.7725854516029358, + "learning_rate": 1.3630759809632965e-06, + "loss": 0.0886, + "step": 12235 + }, + { + "epoch": 1.9825016202203498, + "grad_norm": 0.7280664443969727, + "learning_rate": 1.362686531339428e-06, + "loss": 0.0885, + "step": 12236 + }, + { + "epoch": 1.9826636422553467, + "grad_norm": 0.9137564301490784, + "learning_rate": 1.3622971165146005e-06, + "loss": 0.1132, + "step": 12237 + }, + { + "epoch": 1.9828256642903435, + "grad_norm": 0.8117855787277222, + "learning_rate": 1.3619077365007266e-06, + "loss": 0.0978, + "step": 12238 + }, + { + "epoch": 1.9829876863253402, + "grad_norm": 0.7241969108581543, + "learning_rate": 1.3615183913097211e-06, + "loss": 0.0911, + "step": 12239 + }, + { + "epoch": 1.983149708360337, + "grad_norm": 0.9119266867637634, + "learning_rate": 1.3611290809534997e-06, + "loss": 0.1046, + "step": 12240 + }, + { + "epoch": 1.9833117303953338, + "grad_norm": 0.7932629585266113, + "learning_rate": 1.3607398054439713e-06, + "loss": 0.1004, + "step": 12241 + }, + { + "epoch": 1.9834737524303305, + "grad_norm": 0.79780513048172, + "learning_rate": 1.3603505647930481e-06, + "loss": 0.0932, + "step": 12242 + }, + { + "epoch": 1.9836357744653272, + "grad_norm": 0.7685292959213257, + "learning_rate": 1.3599613590126388e-06, + "loss": 0.0969, + "step": 12243 + }, + { + "epoch": 1.983797796500324, + "grad_norm": 0.7224975228309631, + "learning_rate": 1.3595721881146548e-06, + "loss": 0.0876, + "step": 12244 + }, + { + "epoch": 1.9839598185353209, + "grad_norm": 0.7350090742111206, + "learning_rate": 1.359183052111001e-06, + "loss": 0.0943, + "step": 12245 + }, + { + "epoch": 1.9841218405703176, + "grad_norm": 0.7922118306159973, + "learning_rate": 1.3587939510135856e-06, + "loss": 0.0939, + "step": 12246 + }, + { + "epoch": 1.9842838626053143, + "grad_norm": 0.7330470085144043, + "learning_rate": 1.358404884834313e-06, + "loss": 0.0913, + "step": 12247 + }, + { + "epoch": 1.9844458846403112, + "grad_norm": 0.8037322759628296, + "learning_rate": 1.3580158535850884e-06, + "loss": 0.1021, + "step": 12248 + }, + { + "epoch": 1.9846079066753077, + "grad_norm": 0.7963745594024658, + "learning_rate": 1.3576268572778156e-06, + "loss": 0.1056, + "step": 12249 + }, + { + "epoch": 1.9847699287103047, + "grad_norm": 0.8091640472412109, + "learning_rate": 1.357237895924396e-06, + "loss": 0.0959, + "step": 12250 + }, + { + "epoch": 1.9849319507453014, + "grad_norm": 0.8512594103813171, + "learning_rate": 1.3568489695367325e-06, + "loss": 0.0948, + "step": 12251 + }, + { + "epoch": 1.985093972780298, + "grad_norm": 0.8013456463813782, + "learning_rate": 1.3564600781267234e-06, + "loss": 0.0988, + "step": 12252 + }, + { + "epoch": 1.985255994815295, + "grad_norm": 0.6600555181503296, + "learning_rate": 1.3560712217062676e-06, + "loss": 0.0808, + "step": 12253 + }, + { + "epoch": 1.9854180168502915, + "grad_norm": 0.8468796014785767, + "learning_rate": 1.3556824002872648e-06, + "loss": 0.0989, + "step": 12254 + }, + { + "epoch": 1.9855800388852884, + "grad_norm": 0.8054071664810181, + "learning_rate": 1.3552936138816124e-06, + "loss": 0.1016, + "step": 12255 + }, + { + "epoch": 1.9857420609202852, + "grad_norm": 0.7995196580886841, + "learning_rate": 1.3549048625012046e-06, + "loss": 0.0946, + "step": 12256 + }, + { + "epoch": 1.9859040829552819, + "grad_norm": 0.9262045621871948, + "learning_rate": 1.3545161461579367e-06, + "loss": 0.1046, + "step": 12257 + }, + { + "epoch": 1.9860661049902788, + "grad_norm": 0.9522650837898254, + "learning_rate": 1.354127464863703e-06, + "loss": 0.1038, + "step": 12258 + }, + { + "epoch": 1.9862281270252753, + "grad_norm": 0.8085470199584961, + "learning_rate": 1.3537388186303956e-06, + "loss": 0.0969, + "step": 12259 + }, + { + "epoch": 1.9863901490602722, + "grad_norm": 0.7567479014396667, + "learning_rate": 1.3533502074699065e-06, + "loss": 0.0935, + "step": 12260 + }, + { + "epoch": 1.986552171095269, + "grad_norm": 0.7655736804008484, + "learning_rate": 1.3529616313941264e-06, + "loss": 0.0944, + "step": 12261 + }, + { + "epoch": 1.9867141931302656, + "grad_norm": 0.8020748496055603, + "learning_rate": 1.3525730904149443e-06, + "loss": 0.0902, + "step": 12262 + }, + { + "epoch": 1.9868762151652626, + "grad_norm": 0.8509409427642822, + "learning_rate": 1.3521845845442489e-06, + "loss": 0.1011, + "step": 12263 + }, + { + "epoch": 1.987038237200259, + "grad_norm": 0.8280978798866272, + "learning_rate": 1.351796113793928e-06, + "loss": 0.0972, + "step": 12264 + }, + { + "epoch": 1.987200259235256, + "grad_norm": 0.8213714957237244, + "learning_rate": 1.351407678175867e-06, + "loss": 0.0884, + "step": 12265 + }, + { + "epoch": 1.9873622812702527, + "grad_norm": 0.8149861097335815, + "learning_rate": 1.3510192777019527e-06, + "loss": 0.0976, + "step": 12266 + }, + { + "epoch": 1.9875243033052494, + "grad_norm": 0.7219667434692383, + "learning_rate": 1.3506309123840659e-06, + "loss": 0.0856, + "step": 12267 + }, + { + "epoch": 1.9876863253402464, + "grad_norm": 0.7291889786720276, + "learning_rate": 1.3502425822340925e-06, + "loss": 0.0941, + "step": 12268 + }, + { + "epoch": 1.987848347375243, + "grad_norm": 0.7790191769599915, + "learning_rate": 1.3498542872639142e-06, + "loss": 0.0993, + "step": 12269 + }, + { + "epoch": 1.9880103694102398, + "grad_norm": 0.7414619326591492, + "learning_rate": 1.3494660274854122e-06, + "loss": 0.0905, + "step": 12270 + }, + { + "epoch": 1.9881723914452365, + "grad_norm": 0.7423388957977295, + "learning_rate": 1.3490778029104646e-06, + "loss": 0.0886, + "step": 12271 + }, + { + "epoch": 1.9883344134802332, + "grad_norm": 0.8346954584121704, + "learning_rate": 1.3486896135509503e-06, + "loss": 0.1073, + "step": 12272 + }, + { + "epoch": 1.9884964355152301, + "grad_norm": 0.7810722589492798, + "learning_rate": 1.3483014594187493e-06, + "loss": 0.0918, + "step": 12273 + }, + { + "epoch": 1.9886584575502269, + "grad_norm": 0.7663103938102722, + "learning_rate": 1.3479133405257355e-06, + "loss": 0.0959, + "step": 12274 + }, + { + "epoch": 1.9888204795852236, + "grad_norm": 0.7423315048217773, + "learning_rate": 1.347525256883786e-06, + "loss": 0.0895, + "step": 12275 + }, + { + "epoch": 1.9889825016202205, + "grad_norm": 0.7934430241584778, + "learning_rate": 1.3471372085047743e-06, + "loss": 0.0964, + "step": 12276 + }, + { + "epoch": 1.989144523655217, + "grad_norm": 0.8418073654174805, + "learning_rate": 1.346749195400574e-06, + "loss": 0.1047, + "step": 12277 + }, + { + "epoch": 1.989306545690214, + "grad_norm": 0.7353390455245972, + "learning_rate": 1.3463612175830578e-06, + "loss": 0.0857, + "step": 12278 + }, + { + "epoch": 1.9894685677252106, + "grad_norm": 0.8936602473258972, + "learning_rate": 1.3459732750640967e-06, + "loss": 0.1083, + "step": 12279 + }, + { + "epoch": 1.9896305897602073, + "grad_norm": 0.7523042559623718, + "learning_rate": 1.3455853678555605e-06, + "loss": 0.086, + "step": 12280 + }, + { + "epoch": 1.9897926117952043, + "grad_norm": 0.7889019846916199, + "learning_rate": 1.3451974959693193e-06, + "loss": 0.0887, + "step": 12281 + }, + { + "epoch": 1.9899546338302008, + "grad_norm": 0.8025287389755249, + "learning_rate": 1.3448096594172383e-06, + "loss": 0.0981, + "step": 12282 + }, + { + "epoch": 1.9901166558651977, + "grad_norm": 0.7614328861236572, + "learning_rate": 1.3444218582111872e-06, + "loss": 0.0908, + "step": 12283 + }, + { + "epoch": 1.9902786779001944, + "grad_norm": 0.7384836673736572, + "learning_rate": 1.344034092363032e-06, + "loss": 0.0878, + "step": 12284 + }, + { + "epoch": 1.9904406999351911, + "grad_norm": 0.7912430167198181, + "learning_rate": 1.3436463618846351e-06, + "loss": 0.0902, + "step": 12285 + }, + { + "epoch": 1.990602721970188, + "grad_norm": 0.8135248422622681, + "learning_rate": 1.343258666787861e-06, + "loss": 0.0986, + "step": 12286 + }, + { + "epoch": 1.9907647440051845, + "grad_norm": 0.7323644757270813, + "learning_rate": 1.3428710070845716e-06, + "loss": 0.0917, + "step": 12287 + }, + { + "epoch": 1.9909267660401815, + "grad_norm": 0.7355465292930603, + "learning_rate": 1.3424833827866312e-06, + "loss": 0.0901, + "step": 12288 + }, + { + "epoch": 1.9910887880751782, + "grad_norm": 0.7884531021118164, + "learning_rate": 1.342095793905897e-06, + "loss": 0.0937, + "step": 12289 + }, + { + "epoch": 1.991250810110175, + "grad_norm": 0.7865089178085327, + "learning_rate": 1.3417082404542295e-06, + "loss": 0.0997, + "step": 12290 + }, + { + "epoch": 1.9914128321451718, + "grad_norm": 0.711365818977356, + "learning_rate": 1.3413207224434867e-06, + "loss": 0.0833, + "step": 12291 + }, + { + "epoch": 1.9915748541801686, + "grad_norm": 0.9250528812408447, + "learning_rate": 1.3409332398855263e-06, + "loss": 0.1068, + "step": 12292 + }, + { + "epoch": 1.9917368762151653, + "grad_norm": 0.9538305401802063, + "learning_rate": 1.3405457927922032e-06, + "loss": 0.0967, + "step": 12293 + }, + { + "epoch": 1.991898898250162, + "grad_norm": 0.7699099183082581, + "learning_rate": 1.3401583811753735e-06, + "loss": 0.0936, + "step": 12294 + }, + { + "epoch": 1.9920609202851587, + "grad_norm": 0.7329535484313965, + "learning_rate": 1.3397710050468903e-06, + "loss": 0.0914, + "step": 12295 + }, + { + "epoch": 1.9922229423201556, + "grad_norm": 0.7969571352005005, + "learning_rate": 1.339383664418607e-06, + "loss": 0.0971, + "step": 12296 + }, + { + "epoch": 1.9923849643551523, + "grad_norm": 0.7721100449562073, + "learning_rate": 1.3389963593023747e-06, + "loss": 0.0894, + "step": 12297 + }, + { + "epoch": 1.992546986390149, + "grad_norm": 0.8299007415771484, + "learning_rate": 1.3386090897100442e-06, + "loss": 0.0902, + "step": 12298 + }, + { + "epoch": 1.992709008425146, + "grad_norm": 0.8381181955337524, + "learning_rate": 1.338221855653466e-06, + "loss": 0.0899, + "step": 12299 + }, + { + "epoch": 1.9928710304601425, + "grad_norm": 0.7096841931343079, + "learning_rate": 1.3378346571444866e-06, + "loss": 0.0798, + "step": 12300 + }, + { + "epoch": 1.9930330524951394, + "grad_norm": 0.7077094912528992, + "learning_rate": 1.3374474941949535e-06, + "loss": 0.0878, + "step": 12301 + }, + { + "epoch": 1.9931950745301361, + "grad_norm": 0.8370009660720825, + "learning_rate": 1.3370603668167156e-06, + "loss": 0.1005, + "step": 12302 + }, + { + "epoch": 1.9933570965651328, + "grad_norm": 0.7673652768135071, + "learning_rate": 1.3366732750216154e-06, + "loss": 0.0939, + "step": 12303 + }, + { + "epoch": 1.9935191186001298, + "grad_norm": 0.7367184162139893, + "learning_rate": 1.3362862188214977e-06, + "loss": 0.0913, + "step": 12304 + }, + { + "epoch": 1.9936811406351262, + "grad_norm": 0.8323283195495605, + "learning_rate": 1.3358991982282055e-06, + "loss": 0.1035, + "step": 12305 + }, + { + "epoch": 1.9938431626701232, + "grad_norm": 0.7226582765579224, + "learning_rate": 1.3355122132535806e-06, + "loss": 0.0931, + "step": 12306 + }, + { + "epoch": 1.99400518470512, + "grad_norm": 0.8198063969612122, + "learning_rate": 1.3351252639094641e-06, + "loss": 0.1003, + "step": 12307 + }, + { + "epoch": 1.9941672067401166, + "grad_norm": 0.6655625104904175, + "learning_rate": 1.3347383502076955e-06, + "loss": 0.0806, + "step": 12308 + }, + { + "epoch": 1.9943292287751135, + "grad_norm": 0.8288505673408508, + "learning_rate": 1.3343514721601136e-06, + "loss": 0.0963, + "step": 12309 + }, + { + "epoch": 1.99449125081011, + "grad_norm": 0.7732831835746765, + "learning_rate": 1.333964629778556e-06, + "loss": 0.1015, + "step": 12310 + }, + { + "epoch": 1.994653272845107, + "grad_norm": 0.821479320526123, + "learning_rate": 1.3335778230748588e-06, + "loss": 0.1061, + "step": 12311 + }, + { + "epoch": 1.9948152948801037, + "grad_norm": 0.8334558010101318, + "learning_rate": 1.3331910520608576e-06, + "loss": 0.095, + "step": 12312 + }, + { + "epoch": 1.9949773169151004, + "grad_norm": 0.8023782968521118, + "learning_rate": 1.3328043167483868e-06, + "loss": 0.1001, + "step": 12313 + }, + { + "epoch": 1.9951393389500973, + "grad_norm": 0.8799611330032349, + "learning_rate": 1.3324176171492798e-06, + "loss": 0.1116, + "step": 12314 + }, + { + "epoch": 1.9953013609850938, + "grad_norm": 0.8682718276977539, + "learning_rate": 1.3320309532753667e-06, + "loss": 0.1067, + "step": 12315 + }, + { + "epoch": 1.9954633830200907, + "grad_norm": 0.7998306751251221, + "learning_rate": 1.3316443251384808e-06, + "loss": 0.0978, + "step": 12316 + }, + { + "epoch": 1.9956254050550875, + "grad_norm": 1.0022040605545044, + "learning_rate": 1.3312577327504522e-06, + "loss": 0.1222, + "step": 12317 + }, + { + "epoch": 1.9957874270900842, + "grad_norm": 0.7971022129058838, + "learning_rate": 1.3308711761231074e-06, + "loss": 0.0886, + "step": 12318 + }, + { + "epoch": 1.995949449125081, + "grad_norm": 0.7757930755615234, + "learning_rate": 1.3304846552682756e-06, + "loss": 0.0984, + "step": 12319 + }, + { + "epoch": 1.9961114711600778, + "grad_norm": 0.8637643456459045, + "learning_rate": 1.3300981701977834e-06, + "loss": 0.1074, + "step": 12320 + }, + { + "epoch": 1.9962734931950745, + "grad_norm": 0.7365697622299194, + "learning_rate": 1.3297117209234558e-06, + "loss": 0.0892, + "step": 12321 + }, + { + "epoch": 1.9964355152300715, + "grad_norm": 0.861108660697937, + "learning_rate": 1.3293253074571178e-06, + "loss": 0.0967, + "step": 12322 + }, + { + "epoch": 1.996597537265068, + "grad_norm": 0.7773643732070923, + "learning_rate": 1.328938929810592e-06, + "loss": 0.0964, + "step": 12323 + }, + { + "epoch": 1.9967595593000649, + "grad_norm": 0.7831589579582214, + "learning_rate": 1.3285525879957011e-06, + "loss": 0.0884, + "step": 12324 + }, + { + "epoch": 1.9969215813350616, + "grad_norm": 0.7906063795089722, + "learning_rate": 1.3281662820242664e-06, + "loss": 0.0979, + "step": 12325 + }, + { + "epoch": 1.9970836033700583, + "grad_norm": 0.9140375256538391, + "learning_rate": 1.3277800119081077e-06, + "loss": 0.1065, + "step": 12326 + }, + { + "epoch": 1.9972456254050552, + "grad_norm": 0.8144938349723816, + "learning_rate": 1.327393777659044e-06, + "loss": 0.1046, + "step": 12327 + }, + { + "epoch": 1.9974076474400517, + "grad_norm": 0.795957624912262, + "learning_rate": 1.3270075792888937e-06, + "loss": 0.0933, + "step": 12328 + }, + { + "epoch": 1.9975696694750487, + "grad_norm": 0.7858502268791199, + "learning_rate": 1.326621416809472e-06, + "loss": 0.093, + "step": 12329 + }, + { + "epoch": 1.9977316915100454, + "grad_norm": 0.7812694907188416, + "learning_rate": 1.3262352902325944e-06, + "loss": 0.1023, + "step": 12330 + }, + { + "epoch": 1.997893713545042, + "grad_norm": 0.8662413954734802, + "learning_rate": 1.3258491995700777e-06, + "loss": 0.1051, + "step": 12331 + }, + { + "epoch": 1.998055735580039, + "grad_norm": 0.8278566598892212, + "learning_rate": 1.325463144833735e-06, + "loss": 0.0963, + "step": 12332 + }, + { + "epoch": 1.9982177576150355, + "grad_norm": 0.7443529963493347, + "learning_rate": 1.3250771260353764e-06, + "loss": 0.0903, + "step": 12333 + }, + { + "epoch": 1.9983797796500324, + "grad_norm": 0.7548669576644897, + "learning_rate": 1.324691143186814e-06, + "loss": 0.0901, + "step": 12334 + }, + { + "epoch": 1.9985418016850292, + "grad_norm": 0.839152455329895, + "learning_rate": 1.3243051962998598e-06, + "loss": 0.0943, + "step": 12335 + }, + { + "epoch": 1.9987038237200259, + "grad_norm": 0.7400187849998474, + "learning_rate": 1.323919285386321e-06, + "loss": 0.0864, + "step": 12336 + }, + { + "epoch": 1.9988658457550228, + "grad_norm": 0.7706547379493713, + "learning_rate": 1.3235334104580061e-06, + "loss": 0.0983, + "step": 12337 + }, + { + "epoch": 1.9990278677900193, + "grad_norm": 0.8732326626777649, + "learning_rate": 1.3231475715267217e-06, + "loss": 0.1046, + "step": 12338 + }, + { + "epoch": 1.9991898898250162, + "grad_norm": 0.7628592848777771, + "learning_rate": 1.3227617686042734e-06, + "loss": 0.0925, + "step": 12339 + }, + { + "epoch": 1.999351911860013, + "grad_norm": 0.7879310846328735, + "learning_rate": 1.3223760017024661e-06, + "loss": 0.1, + "step": 12340 + }, + { + "epoch": 1.9995139338950096, + "grad_norm": 0.7694489359855652, + "learning_rate": 1.321990270833104e-06, + "loss": 0.0851, + "step": 12341 + }, + { + "epoch": 1.9996759559300066, + "grad_norm": 0.7563639283180237, + "learning_rate": 1.3216045760079882e-06, + "loss": 0.0893, + "step": 12342 + }, + { + "epoch": 1.9998379779650033, + "grad_norm": 0.8943942189216614, + "learning_rate": 1.321218917238922e-06, + "loss": 0.1126, + "step": 12343 + }, + { + "epoch": 2.0, + "grad_norm": 0.822969377040863, + "learning_rate": 1.3208332945377022e-06, + "loss": 0.1075, + "step": 12344 + }, + { + "epoch": 2.000162022034997, + "grad_norm": 0.6357743740081787, + "learning_rate": 1.3204477079161312e-06, + "loss": 0.0723, + "step": 12345 + }, + { + "epoch": 2.0003240440699934, + "grad_norm": 0.5724396109580994, + "learning_rate": 1.3200621573860068e-06, + "loss": 0.0654, + "step": 12346 + }, + { + "epoch": 2.0004860661049904, + "grad_norm": 0.6248465776443481, + "learning_rate": 1.319676642959124e-06, + "loss": 0.0719, + "step": 12347 + }, + { + "epoch": 2.000648088139987, + "grad_norm": 0.5666736960411072, + "learning_rate": 1.3192911646472796e-06, + "loss": 0.0628, + "step": 12348 + }, + { + "epoch": 2.000810110174984, + "grad_norm": 0.5950966477394104, + "learning_rate": 1.3189057224622676e-06, + "loss": 0.0634, + "step": 12349 + }, + { + "epoch": 2.0009721322099807, + "grad_norm": 0.624751091003418, + "learning_rate": 1.3185203164158838e-06, + "loss": 0.0675, + "step": 12350 + }, + { + "epoch": 2.001134154244977, + "grad_norm": 0.7292012572288513, + "learning_rate": 1.3181349465199184e-06, + "loss": 0.0717, + "step": 12351 + }, + { + "epoch": 2.001296176279974, + "grad_norm": 0.5715608596801758, + "learning_rate": 1.3177496127861635e-06, + "loss": 0.0591, + "step": 12352 + }, + { + "epoch": 2.0014581983149706, + "grad_norm": 0.6922410130500793, + "learning_rate": 1.31736431522641e-06, + "loss": 0.0729, + "step": 12353 + }, + { + "epoch": 2.0016202203499676, + "grad_norm": 0.5971651077270508, + "learning_rate": 1.3169790538524457e-06, + "loss": 0.0648, + "step": 12354 + }, + { + "epoch": 2.0017822423849645, + "grad_norm": 0.6563034057617188, + "learning_rate": 1.3165938286760599e-06, + "loss": 0.0687, + "step": 12355 + }, + { + "epoch": 2.001944264419961, + "grad_norm": 0.6092911958694458, + "learning_rate": 1.316208639709039e-06, + "loss": 0.0641, + "step": 12356 + }, + { + "epoch": 2.002106286454958, + "grad_norm": 0.5998396873474121, + "learning_rate": 1.3158234869631692e-06, + "loss": 0.0602, + "step": 12357 + }, + { + "epoch": 2.002268308489955, + "grad_norm": 0.6295647025108337, + "learning_rate": 1.3154383704502349e-06, + "loss": 0.0667, + "step": 12358 + }, + { + "epoch": 2.0024303305249513, + "grad_norm": 0.6843347549438477, + "learning_rate": 1.31505329018202e-06, + "loss": 0.0644, + "step": 12359 + }, + { + "epoch": 2.0025923525599483, + "grad_norm": 0.8760754466056824, + "learning_rate": 1.3146682461703069e-06, + "loss": 0.0674, + "step": 12360 + }, + { + "epoch": 2.0027543745949448, + "grad_norm": 0.617087721824646, + "learning_rate": 1.314283238426878e-06, + "loss": 0.0603, + "step": 12361 + }, + { + "epoch": 2.0029163966299417, + "grad_norm": 0.7366885542869568, + "learning_rate": 1.3138982669635117e-06, + "loss": 0.0619, + "step": 12362 + }, + { + "epoch": 2.0030784186649386, + "grad_norm": 0.6823921799659729, + "learning_rate": 1.3135133317919868e-06, + "loss": 0.0606, + "step": 12363 + }, + { + "epoch": 2.003240440699935, + "grad_norm": 0.6802733540534973, + "learning_rate": 1.313128432924084e-06, + "loss": 0.0667, + "step": 12364 + }, + { + "epoch": 2.003402462734932, + "grad_norm": 0.6357323527336121, + "learning_rate": 1.3127435703715802e-06, + "loss": 0.0608, + "step": 12365 + }, + { + "epoch": 2.0035644847699285, + "grad_norm": 0.6992286443710327, + "learning_rate": 1.3123587441462487e-06, + "loss": 0.0601, + "step": 12366 + }, + { + "epoch": 2.0037265068049255, + "grad_norm": 0.7034692168235779, + "learning_rate": 1.3119739542598655e-06, + "loss": 0.0617, + "step": 12367 + }, + { + "epoch": 2.0038885288399224, + "grad_norm": 0.7643810510635376, + "learning_rate": 1.3115892007242046e-06, + "loss": 0.0653, + "step": 12368 + }, + { + "epoch": 2.004050550874919, + "grad_norm": 0.6599617004394531, + "learning_rate": 1.3112044835510378e-06, + "loss": 0.0582, + "step": 12369 + }, + { + "epoch": 2.004212572909916, + "grad_norm": 0.7879641652107239, + "learning_rate": 1.3108198027521374e-06, + "loss": 0.0667, + "step": 12370 + }, + { + "epoch": 2.0043745949449123, + "grad_norm": 0.7727499008178711, + "learning_rate": 1.3104351583392732e-06, + "loss": 0.0622, + "step": 12371 + }, + { + "epoch": 2.0045366169799093, + "grad_norm": 0.7340289354324341, + "learning_rate": 1.3100505503242156e-06, + "loss": 0.0605, + "step": 12372 + }, + { + "epoch": 2.004698639014906, + "grad_norm": 0.7155749797821045, + "learning_rate": 1.3096659787187294e-06, + "loss": 0.062, + "step": 12373 + }, + { + "epoch": 2.0048606610499027, + "grad_norm": 0.7992011308670044, + "learning_rate": 1.3092814435345845e-06, + "loss": 0.0655, + "step": 12374 + }, + { + "epoch": 2.0050226830848996, + "grad_norm": 0.6771997809410095, + "learning_rate": 1.3088969447835464e-06, + "loss": 0.0567, + "step": 12375 + }, + { + "epoch": 2.005184705119896, + "grad_norm": 0.7328808307647705, + "learning_rate": 1.3085124824773797e-06, + "loss": 0.0597, + "step": 12376 + }, + { + "epoch": 2.005346727154893, + "grad_norm": 0.7776128649711609, + "learning_rate": 1.3081280566278464e-06, + "loss": 0.0617, + "step": 12377 + }, + { + "epoch": 2.00550874918989, + "grad_norm": 0.8074184656143188, + "learning_rate": 1.307743667246711e-06, + "loss": 0.0738, + "step": 12378 + }, + { + "epoch": 2.0056707712248865, + "grad_norm": 0.7445822358131409, + "learning_rate": 1.3073593143457353e-06, + "loss": 0.0609, + "step": 12379 + }, + { + "epoch": 2.0058327932598834, + "grad_norm": 0.811712920665741, + "learning_rate": 1.306974997936677e-06, + "loss": 0.0675, + "step": 12380 + }, + { + "epoch": 2.00599481529488, + "grad_norm": 0.8170737624168396, + "learning_rate": 1.306590718031297e-06, + "loss": 0.0713, + "step": 12381 + }, + { + "epoch": 2.006156837329877, + "grad_norm": 0.6781127452850342, + "learning_rate": 1.3062064746413522e-06, + "loss": 0.0607, + "step": 12382 + }, + { + "epoch": 2.0063188593648738, + "grad_norm": 0.8460241556167603, + "learning_rate": 1.305822267778602e-06, + "loss": 0.0651, + "step": 12383 + }, + { + "epoch": 2.0064808813998702, + "grad_norm": 0.7995967864990234, + "learning_rate": 1.3054380974547998e-06, + "loss": 0.0606, + "step": 12384 + }, + { + "epoch": 2.006642903434867, + "grad_norm": 0.8841012120246887, + "learning_rate": 1.3050539636817012e-06, + "loss": 0.0734, + "step": 12385 + }, + { + "epoch": 2.006804925469864, + "grad_norm": 0.8261283040046692, + "learning_rate": 1.3046698664710595e-06, + "loss": 0.0605, + "step": 12386 + }, + { + "epoch": 2.0069669475048606, + "grad_norm": 0.6940838098526001, + "learning_rate": 1.304285805834627e-06, + "loss": 0.0601, + "step": 12387 + }, + { + "epoch": 2.0071289695398575, + "grad_norm": 0.8337867856025696, + "learning_rate": 1.3039017817841553e-06, + "loss": 0.0617, + "step": 12388 + }, + { + "epoch": 2.007290991574854, + "grad_norm": 0.7418117523193359, + "learning_rate": 1.3035177943313947e-06, + "loss": 0.0594, + "step": 12389 + }, + { + "epoch": 2.007453013609851, + "grad_norm": 0.853374183177948, + "learning_rate": 1.3031338434880952e-06, + "loss": 0.0681, + "step": 12390 + }, + { + "epoch": 2.007615035644848, + "grad_norm": 0.8610039949417114, + "learning_rate": 1.3027499292660022e-06, + "loss": 0.0707, + "step": 12391 + }, + { + "epoch": 2.0077770576798444, + "grad_norm": 0.8662420511245728, + "learning_rate": 1.3023660516768638e-06, + "loss": 0.0693, + "step": 12392 + }, + { + "epoch": 2.0079390797148413, + "grad_norm": 0.8422592282295227, + "learning_rate": 1.3019822107324267e-06, + "loss": 0.0759, + "step": 12393 + }, + { + "epoch": 2.008101101749838, + "grad_norm": 0.8292291760444641, + "learning_rate": 1.301598406444436e-06, + "loss": 0.0707, + "step": 12394 + }, + { + "epoch": 2.0082631237848347, + "grad_norm": 0.7821356058120728, + "learning_rate": 1.3012146388246328e-06, + "loss": 0.0616, + "step": 12395 + }, + { + "epoch": 2.0084251458198317, + "grad_norm": 0.8509025573730469, + "learning_rate": 1.3008309078847605e-06, + "loss": 0.0696, + "step": 12396 + }, + { + "epoch": 2.008587167854828, + "grad_norm": 0.8189096450805664, + "learning_rate": 1.3004472136365609e-06, + "loss": 0.0658, + "step": 12397 + }, + { + "epoch": 2.008749189889825, + "grad_norm": 0.7005947828292847, + "learning_rate": 1.3000635560917735e-06, + "loss": 0.0585, + "step": 12398 + }, + { + "epoch": 2.0089112119248216, + "grad_norm": 0.749654233455658, + "learning_rate": 1.2996799352621372e-06, + "loss": 0.0564, + "step": 12399 + }, + { + "epoch": 2.0090732339598185, + "grad_norm": 0.8583326935768127, + "learning_rate": 1.2992963511593904e-06, + "loss": 0.0669, + "step": 12400 + }, + { + "epoch": 2.0092352559948155, + "grad_norm": 0.834697425365448, + "learning_rate": 1.2989128037952698e-06, + "loss": 0.0709, + "step": 12401 + }, + { + "epoch": 2.009397278029812, + "grad_norm": 0.7957351207733154, + "learning_rate": 1.2985292931815105e-06, + "loss": 0.063, + "step": 12402 + }, + { + "epoch": 2.009559300064809, + "grad_norm": 0.7907761335372925, + "learning_rate": 1.2981458193298473e-06, + "loss": 0.0607, + "step": 12403 + }, + { + "epoch": 2.0097213220998054, + "grad_norm": 0.813335120677948, + "learning_rate": 1.2977623822520141e-06, + "loss": 0.0622, + "step": 12404 + }, + { + "epoch": 2.0098833441348023, + "grad_norm": 0.9492120146751404, + "learning_rate": 1.2973789819597431e-06, + "loss": 0.0669, + "step": 12405 + }, + { + "epoch": 2.0100453661697992, + "grad_norm": 0.8153406381607056, + "learning_rate": 1.296995618464763e-06, + "loss": 0.0651, + "step": 12406 + }, + { + "epoch": 2.0102073882047957, + "grad_norm": 0.7369259595870972, + "learning_rate": 1.296612291778807e-06, + "loss": 0.058, + "step": 12407 + }, + { + "epoch": 2.0103694102397927, + "grad_norm": 0.7126782536506653, + "learning_rate": 1.2962290019136028e-06, + "loss": 0.0606, + "step": 12408 + }, + { + "epoch": 2.0105314322747896, + "grad_norm": 0.7159781455993652, + "learning_rate": 1.295845748880879e-06, + "loss": 0.0635, + "step": 12409 + }, + { + "epoch": 2.010693454309786, + "grad_norm": 0.7070374488830566, + "learning_rate": 1.2954625326923602e-06, + "loss": 0.0552, + "step": 12410 + }, + { + "epoch": 2.010855476344783, + "grad_norm": 0.7887537479400635, + "learning_rate": 1.2950793533597722e-06, + "loss": 0.0673, + "step": 12411 + }, + { + "epoch": 2.0110174983797795, + "grad_norm": 0.7388550639152527, + "learning_rate": 1.294696210894842e-06, + "loss": 0.0542, + "step": 12412 + }, + { + "epoch": 2.0111795204147764, + "grad_norm": 0.8547154068946838, + "learning_rate": 1.2943131053092895e-06, + "loss": 0.0693, + "step": 12413 + }, + { + "epoch": 2.0113415424497734, + "grad_norm": 0.7168740034103394, + "learning_rate": 1.2939300366148389e-06, + "loss": 0.0591, + "step": 12414 + }, + { + "epoch": 2.01150356448477, + "grad_norm": 0.717652440071106, + "learning_rate": 1.2935470048232102e-06, + "loss": 0.0568, + "step": 12415 + }, + { + "epoch": 2.011665586519767, + "grad_norm": 0.8575769066810608, + "learning_rate": 1.2931640099461237e-06, + "loss": 0.0617, + "step": 12416 + }, + { + "epoch": 2.0118276085547633, + "grad_norm": 0.862146258354187, + "learning_rate": 1.292781051995298e-06, + "loss": 0.0645, + "step": 12417 + }, + { + "epoch": 2.01198963058976, + "grad_norm": 0.8514499068260193, + "learning_rate": 1.2923981309824507e-06, + "loss": 0.0685, + "step": 12418 + }, + { + "epoch": 2.012151652624757, + "grad_norm": 0.8388655781745911, + "learning_rate": 1.292015246919298e-06, + "loss": 0.0724, + "step": 12419 + }, + { + "epoch": 2.0123136746597536, + "grad_norm": 0.8496232032775879, + "learning_rate": 1.291632399817557e-06, + "loss": 0.0593, + "step": 12420 + }, + { + "epoch": 2.0124756966947506, + "grad_norm": 0.7816452383995056, + "learning_rate": 1.2912495896889383e-06, + "loss": 0.0612, + "step": 12421 + }, + { + "epoch": 2.012637718729747, + "grad_norm": 0.808066189289093, + "learning_rate": 1.2908668165451577e-06, + "loss": 0.0599, + "step": 12422 + }, + { + "epoch": 2.012799740764744, + "grad_norm": 0.8434270024299622, + "learning_rate": 1.2904840803979276e-06, + "loss": 0.0663, + "step": 12423 + }, + { + "epoch": 2.012961762799741, + "grad_norm": 0.8326642513275146, + "learning_rate": 1.290101381258957e-06, + "loss": 0.0616, + "step": 12424 + }, + { + "epoch": 2.0131237848347374, + "grad_norm": 0.971601128578186, + "learning_rate": 1.2897187191399546e-06, + "loss": 0.0676, + "step": 12425 + }, + { + "epoch": 2.0132858068697344, + "grad_norm": 0.8341547250747681, + "learning_rate": 1.289336094052632e-06, + "loss": 0.0643, + "step": 12426 + }, + { + "epoch": 2.013447828904731, + "grad_norm": 0.917320728302002, + "learning_rate": 1.288953506008696e-06, + "loss": 0.0688, + "step": 12427 + }, + { + "epoch": 2.0136098509397278, + "grad_norm": 0.8390260934829712, + "learning_rate": 1.288570955019851e-06, + "loss": 0.0646, + "step": 12428 + }, + { + "epoch": 2.0137718729747247, + "grad_norm": 0.9914124608039856, + "learning_rate": 1.2881884410978034e-06, + "loss": 0.0705, + "step": 12429 + }, + { + "epoch": 2.013933895009721, + "grad_norm": 0.8766891956329346, + "learning_rate": 1.2878059642542566e-06, + "loss": 0.0725, + "step": 12430 + }, + { + "epoch": 2.014095917044718, + "grad_norm": 0.7540597915649414, + "learning_rate": 1.2874235245009143e-06, + "loss": 0.0608, + "step": 12431 + }, + { + "epoch": 2.0142579390797146, + "grad_norm": 0.8450373411178589, + "learning_rate": 1.2870411218494778e-06, + "loss": 0.0668, + "step": 12432 + }, + { + "epoch": 2.0144199611147116, + "grad_norm": 0.743740975856781, + "learning_rate": 1.2866587563116473e-06, + "loss": 0.058, + "step": 12433 + }, + { + "epoch": 2.0145819831497085, + "grad_norm": 0.7940791845321655, + "learning_rate": 1.2862764278991236e-06, + "loss": 0.0618, + "step": 12434 + }, + { + "epoch": 2.014744005184705, + "grad_norm": 0.8542442321777344, + "learning_rate": 1.2858941366236021e-06, + "loss": 0.0641, + "step": 12435 + }, + { + "epoch": 2.014906027219702, + "grad_norm": 0.8491990566253662, + "learning_rate": 1.2855118824967833e-06, + "loss": 0.0603, + "step": 12436 + }, + { + "epoch": 2.015068049254699, + "grad_norm": 1.0003999471664429, + "learning_rate": 1.2851296655303616e-06, + "loss": 0.064, + "step": 12437 + }, + { + "epoch": 2.0152300712896953, + "grad_norm": 0.8259167671203613, + "learning_rate": 1.2847474857360332e-06, + "loss": 0.0616, + "step": 12438 + }, + { + "epoch": 2.0153920933246923, + "grad_norm": 0.9689138531684875, + "learning_rate": 1.28436534312549e-06, + "loss": 0.0662, + "step": 12439 + }, + { + "epoch": 2.0155541153596888, + "grad_norm": 0.8320235013961792, + "learning_rate": 1.2839832377104245e-06, + "loss": 0.0579, + "step": 12440 + }, + { + "epoch": 2.0157161373946857, + "grad_norm": 0.8748543858528137, + "learning_rate": 1.283601169502531e-06, + "loss": 0.0762, + "step": 12441 + }, + { + "epoch": 2.0158781594296826, + "grad_norm": 0.920680820941925, + "learning_rate": 1.2832191385134972e-06, + "loss": 0.0721, + "step": 12442 + }, + { + "epoch": 2.016040181464679, + "grad_norm": 0.6789097785949707, + "learning_rate": 1.2828371447550133e-06, + "loss": 0.0546, + "step": 12443 + }, + { + "epoch": 2.016202203499676, + "grad_norm": 0.8041635155677795, + "learning_rate": 1.2824551882387664e-06, + "loss": 0.0615, + "step": 12444 + }, + { + "epoch": 2.0163642255346725, + "grad_norm": 0.8807735443115234, + "learning_rate": 1.2820732689764462e-06, + "loss": 0.0688, + "step": 12445 + }, + { + "epoch": 2.0165262475696695, + "grad_norm": 0.7847744822502136, + "learning_rate": 1.2816913869797353e-06, + "loss": 0.0639, + "step": 12446 + }, + { + "epoch": 2.0166882696046664, + "grad_norm": 0.9476509094238281, + "learning_rate": 1.2813095422603203e-06, + "loss": 0.0652, + "step": 12447 + }, + { + "epoch": 2.016850291639663, + "grad_norm": 0.7772141695022583, + "learning_rate": 1.2809277348298838e-06, + "loss": 0.0583, + "step": 12448 + }, + { + "epoch": 2.01701231367466, + "grad_norm": 0.8144988417625427, + "learning_rate": 1.2805459647001087e-06, + "loss": 0.0634, + "step": 12449 + }, + { + "epoch": 2.0171743357096563, + "grad_norm": 0.9111658334732056, + "learning_rate": 1.2801642318826759e-06, + "loss": 0.0599, + "step": 12450 + }, + { + "epoch": 2.0173363577446533, + "grad_norm": 0.9419159889221191, + "learning_rate": 1.2797825363892658e-06, + "loss": 0.0759, + "step": 12451 + }, + { + "epoch": 2.01749837977965, + "grad_norm": 0.9087123870849609, + "learning_rate": 1.279400878231557e-06, + "loss": 0.0676, + "step": 12452 + }, + { + "epoch": 2.0176604018146467, + "grad_norm": 0.7883154153823853, + "learning_rate": 1.2790192574212287e-06, + "loss": 0.0585, + "step": 12453 + }, + { + "epoch": 2.0178224238496436, + "grad_norm": 0.7743737101554871, + "learning_rate": 1.2786376739699547e-06, + "loss": 0.0643, + "step": 12454 + }, + { + "epoch": 2.01798444588464, + "grad_norm": 0.8277117013931274, + "learning_rate": 1.2782561278894126e-06, + "loss": 0.069, + "step": 12455 + }, + { + "epoch": 2.018146467919637, + "grad_norm": 0.7758954763412476, + "learning_rate": 1.2778746191912778e-06, + "loss": 0.0629, + "step": 12456 + }, + { + "epoch": 2.018308489954634, + "grad_norm": 0.7998802065849304, + "learning_rate": 1.277493147887221e-06, + "loss": 0.0572, + "step": 12457 + }, + { + "epoch": 2.0184705119896305, + "grad_norm": 0.7002156972885132, + "learning_rate": 1.2771117139889155e-06, + "loss": 0.0612, + "step": 12458 + }, + { + "epoch": 2.0186325340246274, + "grad_norm": 0.7348694801330566, + "learning_rate": 1.2767303175080325e-06, + "loss": 0.0593, + "step": 12459 + }, + { + "epoch": 2.0187945560596243, + "grad_norm": 0.7863420248031616, + "learning_rate": 1.276348958456241e-06, + "loss": 0.0638, + "step": 12460 + }, + { + "epoch": 2.018956578094621, + "grad_norm": 0.8472959995269775, + "learning_rate": 1.2759676368452106e-06, + "loss": 0.0664, + "step": 12461 + }, + { + "epoch": 2.0191186001296177, + "grad_norm": 0.7812498807907104, + "learning_rate": 1.2755863526866087e-06, + "loss": 0.0568, + "step": 12462 + }, + { + "epoch": 2.0192806221646142, + "grad_norm": 0.85575932264328, + "learning_rate": 1.2752051059921005e-06, + "loss": 0.0662, + "step": 12463 + }, + { + "epoch": 2.019442644199611, + "grad_norm": 0.9001664519309998, + "learning_rate": 1.2748238967733529e-06, + "loss": 0.0682, + "step": 12464 + }, + { + "epoch": 2.019604666234608, + "grad_norm": 0.948502779006958, + "learning_rate": 1.2744427250420288e-06, + "loss": 0.0695, + "step": 12465 + }, + { + "epoch": 2.0197666882696046, + "grad_norm": 0.8993748426437378, + "learning_rate": 1.2740615908097915e-06, + "loss": 0.0641, + "step": 12466 + }, + { + "epoch": 2.0199287103046015, + "grad_norm": 0.8371912837028503, + "learning_rate": 1.273680494088304e-06, + "loss": 0.064, + "step": 12467 + }, + { + "epoch": 2.020090732339598, + "grad_norm": 1.5723109245300293, + "learning_rate": 1.2732994348892237e-06, + "loss": 0.0644, + "step": 12468 + }, + { + "epoch": 2.020252754374595, + "grad_norm": 0.9138685464859009, + "learning_rate": 1.2729184132242131e-06, + "loss": 0.0678, + "step": 12469 + }, + { + "epoch": 2.020414776409592, + "grad_norm": 0.8680717349052429, + "learning_rate": 1.2725374291049296e-06, + "loss": 0.0615, + "step": 12470 + }, + { + "epoch": 2.0205767984445884, + "grad_norm": 1.0072784423828125, + "learning_rate": 1.2721564825430313e-06, + "loss": 0.0675, + "step": 12471 + }, + { + "epoch": 2.0207388204795853, + "grad_norm": 0.8381701707839966, + "learning_rate": 1.2717755735501725e-06, + "loss": 0.0593, + "step": 12472 + }, + { + "epoch": 2.020900842514582, + "grad_norm": 0.8638261556625366, + "learning_rate": 1.2713947021380078e-06, + "loss": 0.0674, + "step": 12473 + }, + { + "epoch": 2.0210628645495787, + "grad_norm": 0.8350870609283447, + "learning_rate": 1.2710138683181937e-06, + "loss": 0.0686, + "step": 12474 + }, + { + "epoch": 2.0212248865845757, + "grad_norm": 0.8235458731651306, + "learning_rate": 1.2706330721023807e-06, + "loss": 0.0662, + "step": 12475 + }, + { + "epoch": 2.021386908619572, + "grad_norm": 0.7439950704574585, + "learning_rate": 1.2702523135022205e-06, + "loss": 0.0597, + "step": 12476 + }, + { + "epoch": 2.021548930654569, + "grad_norm": 0.7728898525238037, + "learning_rate": 1.2698715925293634e-06, + "loss": 0.0563, + "step": 12477 + }, + { + "epoch": 2.0217109526895656, + "grad_norm": 0.7948071956634521, + "learning_rate": 1.2694909091954588e-06, + "loss": 0.0595, + "step": 12478 + }, + { + "epoch": 2.0218729747245625, + "grad_norm": 0.8696475625038147, + "learning_rate": 1.2691102635121544e-06, + "loss": 0.0629, + "step": 12479 + }, + { + "epoch": 2.0220349967595594, + "grad_norm": 0.869878888130188, + "learning_rate": 1.2687296554910978e-06, + "loss": 0.0731, + "step": 12480 + }, + { + "epoch": 2.022197018794556, + "grad_norm": 0.812859833240509, + "learning_rate": 1.2683490851439334e-06, + "loss": 0.0624, + "step": 12481 + }, + { + "epoch": 2.022359040829553, + "grad_norm": 0.8346046209335327, + "learning_rate": 1.2679685524823082e-06, + "loss": 0.0628, + "step": 12482 + }, + { + "epoch": 2.0225210628645494, + "grad_norm": 0.808914065361023, + "learning_rate": 1.2675880575178613e-06, + "loss": 0.0587, + "step": 12483 + }, + { + "epoch": 2.0226830848995463, + "grad_norm": 0.7890689969062805, + "learning_rate": 1.2672076002622386e-06, + "loss": 0.0623, + "step": 12484 + }, + { + "epoch": 2.0228451069345432, + "grad_norm": 0.7109611630439758, + "learning_rate": 1.266827180727081e-06, + "loss": 0.053, + "step": 12485 + }, + { + "epoch": 2.0230071289695397, + "grad_norm": 0.9208827018737793, + "learning_rate": 1.2664467989240265e-06, + "loss": 0.0608, + "step": 12486 + }, + { + "epoch": 2.0231691510045366, + "grad_norm": 1.064787745475769, + "learning_rate": 1.266066454864715e-06, + "loss": 0.0668, + "step": 12487 + }, + { + "epoch": 2.0233311730395336, + "grad_norm": 0.8241201639175415, + "learning_rate": 1.2656861485607828e-06, + "loss": 0.06, + "step": 12488 + }, + { + "epoch": 2.02349319507453, + "grad_norm": 0.8332204222679138, + "learning_rate": 1.2653058800238693e-06, + "loss": 0.0669, + "step": 12489 + }, + { + "epoch": 2.023655217109527, + "grad_norm": 0.8376948833465576, + "learning_rate": 1.264925649265607e-06, + "loss": 0.0604, + "step": 12490 + }, + { + "epoch": 2.0238172391445235, + "grad_norm": 0.971579372882843, + "learning_rate": 1.2645454562976311e-06, + "loss": 0.0593, + "step": 12491 + }, + { + "epoch": 2.0239792611795204, + "grad_norm": 1.004150390625, + "learning_rate": 1.2641653011315746e-06, + "loss": 0.0687, + "step": 12492 + }, + { + "epoch": 2.0241412832145174, + "grad_norm": 0.7998213768005371, + "learning_rate": 1.2637851837790694e-06, + "loss": 0.0622, + "step": 12493 + }, + { + "epoch": 2.024303305249514, + "grad_norm": 1.0076199769973755, + "learning_rate": 1.2634051042517453e-06, + "loss": 0.064, + "step": 12494 + }, + { + "epoch": 2.024465327284511, + "grad_norm": 0.8546391725540161, + "learning_rate": 1.2630250625612331e-06, + "loss": 0.0648, + "step": 12495 + }, + { + "epoch": 2.0246273493195073, + "grad_norm": 0.9467459917068481, + "learning_rate": 1.2626450587191602e-06, + "loss": 0.0631, + "step": 12496 + }, + { + "epoch": 2.024789371354504, + "grad_norm": 0.8554684519767761, + "learning_rate": 1.2622650927371543e-06, + "loss": 0.0655, + "step": 12497 + }, + { + "epoch": 2.024951393389501, + "grad_norm": 0.8624327182769775, + "learning_rate": 1.2618851646268416e-06, + "loss": 0.0617, + "step": 12498 + }, + { + "epoch": 2.0251134154244976, + "grad_norm": 0.9098758101463318, + "learning_rate": 1.2615052743998463e-06, + "loss": 0.066, + "step": 12499 + }, + { + "epoch": 2.0252754374594946, + "grad_norm": 0.8350497484207153, + "learning_rate": 1.2611254220677937e-06, + "loss": 0.064, + "step": 12500 + }, + { + "epoch": 2.025437459494491, + "grad_norm": 0.877099871635437, + "learning_rate": 1.260745607642304e-06, + "loss": 0.0547, + "step": 12501 + }, + { + "epoch": 2.025599481529488, + "grad_norm": 0.7435240149497986, + "learning_rate": 1.260365831134999e-06, + "loss": 0.052, + "step": 12502 + }, + { + "epoch": 2.025761503564485, + "grad_norm": 0.7691801190376282, + "learning_rate": 1.2599860925575014e-06, + "loss": 0.0535, + "step": 12503 + }, + { + "epoch": 2.0259235255994814, + "grad_norm": 0.7755981087684631, + "learning_rate": 1.259606391921428e-06, + "loss": 0.0574, + "step": 12504 + }, + { + "epoch": 2.0260855476344783, + "grad_norm": 1.0379624366760254, + "learning_rate": 1.259226729238397e-06, + "loss": 0.0631, + "step": 12505 + }, + { + "epoch": 2.026247569669475, + "grad_norm": 0.8337475657463074, + "learning_rate": 1.2588471045200256e-06, + "loss": 0.0557, + "step": 12506 + }, + { + "epoch": 2.0264095917044718, + "grad_norm": 0.8900526165962219, + "learning_rate": 1.2584675177779294e-06, + "loss": 0.0599, + "step": 12507 + }, + { + "epoch": 2.0265716137394687, + "grad_norm": 0.7923566102981567, + "learning_rate": 1.2580879690237224e-06, + "loss": 0.0552, + "step": 12508 + }, + { + "epoch": 2.026733635774465, + "grad_norm": 0.8455385565757751, + "learning_rate": 1.257708458269018e-06, + "loss": 0.0609, + "step": 12509 + }, + { + "epoch": 2.026895657809462, + "grad_norm": 0.8646577000617981, + "learning_rate": 1.257328985525429e-06, + "loss": 0.068, + "step": 12510 + }, + { + "epoch": 2.027057679844459, + "grad_norm": 0.8488696217536926, + "learning_rate": 1.2569495508045656e-06, + "loss": 0.0622, + "step": 12511 + }, + { + "epoch": 2.0272197018794555, + "grad_norm": 0.9858669638633728, + "learning_rate": 1.256570154118038e-06, + "loss": 0.0687, + "step": 12512 + }, + { + "epoch": 2.0273817239144525, + "grad_norm": 0.9625844359397888, + "learning_rate": 1.2561907954774544e-06, + "loss": 0.0651, + "step": 12513 + }, + { + "epoch": 2.027543745949449, + "grad_norm": 0.9064575433731079, + "learning_rate": 1.2558114748944226e-06, + "loss": 0.0618, + "step": 12514 + }, + { + "epoch": 2.027705767984446, + "grad_norm": 0.8604201674461365, + "learning_rate": 1.2554321923805496e-06, + "loss": 0.0648, + "step": 12515 + }, + { + "epoch": 2.027867790019443, + "grad_norm": 1.1641322374343872, + "learning_rate": 1.2550529479474383e-06, + "loss": 0.0645, + "step": 12516 + }, + { + "epoch": 2.0280298120544393, + "grad_norm": 0.8054379224777222, + "learning_rate": 1.2546737416066945e-06, + "loss": 0.0605, + "step": 12517 + }, + { + "epoch": 2.0281918340894363, + "grad_norm": 0.9893051385879517, + "learning_rate": 1.2542945733699216e-06, + "loss": 0.0739, + "step": 12518 + }, + { + "epoch": 2.0283538561244328, + "grad_norm": 0.8926251530647278, + "learning_rate": 1.2539154432487193e-06, + "loss": 0.0588, + "step": 12519 + }, + { + "epoch": 2.0285158781594297, + "grad_norm": 0.8101766705513, + "learning_rate": 1.2535363512546892e-06, + "loss": 0.0602, + "step": 12520 + }, + { + "epoch": 2.0286779001944266, + "grad_norm": 0.9098559021949768, + "learning_rate": 1.2531572973994293e-06, + "loss": 0.0676, + "step": 12521 + }, + { + "epoch": 2.028839922229423, + "grad_norm": 1.1210435628890991, + "learning_rate": 1.2527782816945405e-06, + "loss": 0.0665, + "step": 12522 + }, + { + "epoch": 2.02900194426442, + "grad_norm": 0.73442542552948, + "learning_rate": 1.2523993041516175e-06, + "loss": 0.045, + "step": 12523 + }, + { + "epoch": 2.0291639662994165, + "grad_norm": 0.8347865343093872, + "learning_rate": 1.2520203647822563e-06, + "loss": 0.0643, + "step": 12524 + }, + { + "epoch": 2.0293259883344135, + "grad_norm": 0.9335924983024597, + "learning_rate": 1.2516414635980518e-06, + "loss": 0.0657, + "step": 12525 + }, + { + "epoch": 2.0294880103694104, + "grad_norm": 0.845848023891449, + "learning_rate": 1.2512626006105977e-06, + "loss": 0.0618, + "step": 12526 + }, + { + "epoch": 2.029650032404407, + "grad_norm": 0.8612715601921082, + "learning_rate": 1.2508837758314862e-06, + "loss": 0.0725, + "step": 12527 + }, + { + "epoch": 2.029812054439404, + "grad_norm": 0.936465322971344, + "learning_rate": 1.2505049892723083e-06, + "loss": 0.0711, + "step": 12528 + }, + { + "epoch": 2.0299740764744003, + "grad_norm": 0.8308136463165283, + "learning_rate": 1.2501262409446552e-06, + "loss": 0.0601, + "step": 12529 + }, + { + "epoch": 2.0301360985093972, + "grad_norm": 0.8098605871200562, + "learning_rate": 1.2497475308601134e-06, + "loss": 0.0643, + "step": 12530 + }, + { + "epoch": 2.030298120544394, + "grad_norm": 0.932350754737854, + "learning_rate": 1.2493688590302705e-06, + "loss": 0.0697, + "step": 12531 + }, + { + "epoch": 2.0304601425793907, + "grad_norm": 0.8417340517044067, + "learning_rate": 1.248990225466715e-06, + "loss": 0.0661, + "step": 12532 + }, + { + "epoch": 2.0306221646143876, + "grad_norm": 0.7914611101150513, + "learning_rate": 1.2486116301810322e-06, + "loss": 0.0647, + "step": 12533 + }, + { + "epoch": 2.0307841866493845, + "grad_norm": 1.3596220016479492, + "learning_rate": 1.2482330731848044e-06, + "loss": 0.0616, + "step": 12534 + }, + { + "epoch": 2.030946208684381, + "grad_norm": 1.0075905323028564, + "learning_rate": 1.2478545544896143e-06, + "loss": 0.0648, + "step": 12535 + }, + { + "epoch": 2.031108230719378, + "grad_norm": 0.8666545748710632, + "learning_rate": 1.2474760741070465e-06, + "loss": 0.0621, + "step": 12536 + }, + { + "epoch": 2.0312702527543745, + "grad_norm": 0.8424669504165649, + "learning_rate": 1.2470976320486792e-06, + "loss": 0.0616, + "step": 12537 + }, + { + "epoch": 2.0314322747893714, + "grad_norm": 0.858910083770752, + "learning_rate": 1.246719228326092e-06, + "loss": 0.0631, + "step": 12538 + }, + { + "epoch": 2.0315942968243683, + "grad_norm": 0.9575031995773315, + "learning_rate": 1.2463408629508635e-06, + "loss": 0.0641, + "step": 12539 + }, + { + "epoch": 2.031756318859365, + "grad_norm": 1.0462077856063843, + "learning_rate": 1.2459625359345712e-06, + "loss": 0.0718, + "step": 12540 + }, + { + "epoch": 2.0319183408943617, + "grad_norm": 0.9476367831230164, + "learning_rate": 1.2455842472887903e-06, + "loss": 0.0599, + "step": 12541 + }, + { + "epoch": 2.0320803629293582, + "grad_norm": 1.0100579261779785, + "learning_rate": 1.2452059970250957e-06, + "loss": 0.0597, + "step": 12542 + }, + { + "epoch": 2.032242384964355, + "grad_norm": 0.8134729266166687, + "learning_rate": 1.2448277851550613e-06, + "loss": 0.0586, + "step": 12543 + }, + { + "epoch": 2.032404406999352, + "grad_norm": 0.9285221695899963, + "learning_rate": 1.2444496116902602e-06, + "loss": 0.0693, + "step": 12544 + }, + { + "epoch": 2.0325664290343486, + "grad_norm": 1.1232776641845703, + "learning_rate": 1.2440714766422604e-06, + "loss": 0.0791, + "step": 12545 + }, + { + "epoch": 2.0327284510693455, + "grad_norm": 0.7684057950973511, + "learning_rate": 1.2436933800226352e-06, + "loss": 0.0565, + "step": 12546 + }, + { + "epoch": 2.032890473104342, + "grad_norm": 0.7600722908973694, + "learning_rate": 1.2433153218429526e-06, + "loss": 0.0612, + "step": 12547 + }, + { + "epoch": 2.033052495139339, + "grad_norm": 0.8198429942131042, + "learning_rate": 1.2429373021147808e-06, + "loss": 0.0616, + "step": 12548 + }, + { + "epoch": 2.033214517174336, + "grad_norm": 0.9691295027732849, + "learning_rate": 1.2425593208496844e-06, + "loss": 0.0707, + "step": 12549 + }, + { + "epoch": 2.0333765392093324, + "grad_norm": 0.7660728693008423, + "learning_rate": 1.2421813780592294e-06, + "loss": 0.0643, + "step": 12550 + }, + { + "epoch": 2.0335385612443293, + "grad_norm": 0.9578198194503784, + "learning_rate": 1.2418034737549818e-06, + "loss": 0.0708, + "step": 12551 + }, + { + "epoch": 2.033700583279326, + "grad_norm": 0.7433199882507324, + "learning_rate": 1.2414256079485021e-06, + "loss": 0.0557, + "step": 12552 + }, + { + "epoch": 2.0338626053143227, + "grad_norm": 0.862591028213501, + "learning_rate": 1.2410477806513535e-06, + "loss": 0.0666, + "step": 12553 + }, + { + "epoch": 2.0340246273493197, + "grad_norm": 0.8760819435119629, + "learning_rate": 1.240669991875096e-06, + "loss": 0.0661, + "step": 12554 + }, + { + "epoch": 2.034186649384316, + "grad_norm": 0.8833930492401123, + "learning_rate": 1.2402922416312891e-06, + "loss": 0.0626, + "step": 12555 + }, + { + "epoch": 2.034348671419313, + "grad_norm": 0.8621806502342224, + "learning_rate": 1.2399145299314913e-06, + "loss": 0.0646, + "step": 12556 + }, + { + "epoch": 2.0345106934543096, + "grad_norm": 0.8710420727729797, + "learning_rate": 1.2395368567872596e-06, + "loss": 0.0686, + "step": 12557 + }, + { + "epoch": 2.0346727154893065, + "grad_norm": 0.935194194316864, + "learning_rate": 1.2391592222101497e-06, + "loss": 0.0659, + "step": 12558 + }, + { + "epoch": 2.0348347375243034, + "grad_norm": 0.9124763011932373, + "learning_rate": 1.2387816262117167e-06, + "loss": 0.0611, + "step": 12559 + }, + { + "epoch": 2.0349967595593, + "grad_norm": 0.861625075340271, + "learning_rate": 1.2384040688035135e-06, + "loss": 0.0576, + "step": 12560 + }, + { + "epoch": 2.035158781594297, + "grad_norm": 0.9553015232086182, + "learning_rate": 1.2380265499970932e-06, + "loss": 0.0698, + "step": 12561 + }, + { + "epoch": 2.035320803629294, + "grad_norm": 0.7892826795578003, + "learning_rate": 1.2376490698040069e-06, + "loss": 0.0561, + "step": 12562 + }, + { + "epoch": 2.0354828256642903, + "grad_norm": 0.7279547452926636, + "learning_rate": 1.2372716282358038e-06, + "loss": 0.0527, + "step": 12563 + }, + { + "epoch": 2.035644847699287, + "grad_norm": 0.7964885234832764, + "learning_rate": 1.236894225304032e-06, + "loss": 0.0593, + "step": 12564 + }, + { + "epoch": 2.0358068697342837, + "grad_norm": 0.7441739439964294, + "learning_rate": 1.2365168610202411e-06, + "loss": 0.0557, + "step": 12565 + }, + { + "epoch": 2.0359688917692806, + "grad_norm": 1.0166600942611694, + "learning_rate": 1.2361395353959776e-06, + "loss": 0.064, + "step": 12566 + }, + { + "epoch": 2.0361309138042776, + "grad_norm": 0.8261235356330872, + "learning_rate": 1.2357622484427854e-06, + "loss": 0.0591, + "step": 12567 + }, + { + "epoch": 2.036292935839274, + "grad_norm": 0.8217520117759705, + "learning_rate": 1.2353850001722084e-06, + "loss": 0.06, + "step": 12568 + }, + { + "epoch": 2.036454957874271, + "grad_norm": 0.887876570224762, + "learning_rate": 1.2350077905957902e-06, + "loss": 0.0678, + "step": 12569 + }, + { + "epoch": 2.0366169799092675, + "grad_norm": 0.8636623024940491, + "learning_rate": 1.2346306197250727e-06, + "loss": 0.0623, + "step": 12570 + }, + { + "epoch": 2.0367790019442644, + "grad_norm": 0.9496206045150757, + "learning_rate": 1.2342534875715958e-06, + "loss": 0.0559, + "step": 12571 + }, + { + "epoch": 2.0369410239792614, + "grad_norm": 0.827489972114563, + "learning_rate": 1.2338763941468993e-06, + "loss": 0.0609, + "step": 12572 + }, + { + "epoch": 2.037103046014258, + "grad_norm": 0.8658286929130554, + "learning_rate": 1.2334993394625219e-06, + "loss": 0.0712, + "step": 12573 + }, + { + "epoch": 2.037265068049255, + "grad_norm": 0.8474152088165283, + "learning_rate": 1.2331223235299983e-06, + "loss": 0.0627, + "step": 12574 + }, + { + "epoch": 2.0374270900842513, + "grad_norm": 0.8493207693099976, + "learning_rate": 1.2327453463608663e-06, + "loss": 0.0655, + "step": 12575 + }, + { + "epoch": 2.037589112119248, + "grad_norm": 0.9206482172012329, + "learning_rate": 1.2323684079666604e-06, + "loss": 0.0604, + "step": 12576 + }, + { + "epoch": 2.037751134154245, + "grad_norm": 1.0142006874084473, + "learning_rate": 1.2319915083589143e-06, + "loss": 0.0613, + "step": 12577 + }, + { + "epoch": 2.0379131561892416, + "grad_norm": 0.7615647315979004, + "learning_rate": 1.2316146475491578e-06, + "loss": 0.059, + "step": 12578 + }, + { + "epoch": 2.0380751782242386, + "grad_norm": 0.7730655670166016, + "learning_rate": 1.2312378255489246e-06, + "loss": 0.0621, + "step": 12579 + }, + { + "epoch": 2.038237200259235, + "grad_norm": 0.788704514503479, + "learning_rate": 1.2308610423697446e-06, + "loss": 0.0553, + "step": 12580 + }, + { + "epoch": 2.038399222294232, + "grad_norm": 0.8466129302978516, + "learning_rate": 1.2304842980231442e-06, + "loss": 0.0635, + "step": 12581 + }, + { + "epoch": 2.038561244329229, + "grad_norm": 0.8458765745162964, + "learning_rate": 1.2301075925206524e-06, + "loss": 0.0605, + "step": 12582 + }, + { + "epoch": 2.0387232663642254, + "grad_norm": 0.8162645101547241, + "learning_rate": 1.229730925873794e-06, + "loss": 0.0538, + "step": 12583 + }, + { + "epoch": 2.0388852883992223, + "grad_norm": 0.9227370023727417, + "learning_rate": 1.2293542980940974e-06, + "loss": 0.0655, + "step": 12584 + }, + { + "epoch": 2.039047310434219, + "grad_norm": 0.759735643863678, + "learning_rate": 1.2289777091930832e-06, + "loss": 0.0537, + "step": 12585 + }, + { + "epoch": 2.0392093324692158, + "grad_norm": 0.8518356680870056, + "learning_rate": 1.2286011591822756e-06, + "loss": 0.0646, + "step": 12586 + }, + { + "epoch": 2.0393713545042127, + "grad_norm": 0.9240495562553406, + "learning_rate": 1.2282246480731955e-06, + "loss": 0.0714, + "step": 12587 + }, + { + "epoch": 2.039533376539209, + "grad_norm": 0.8255005478858948, + "learning_rate": 1.2278481758773636e-06, + "loss": 0.0656, + "step": 12588 + }, + { + "epoch": 2.039695398574206, + "grad_norm": 0.8890777826309204, + "learning_rate": 1.227471742606299e-06, + "loss": 0.0655, + "step": 12589 + }, + { + "epoch": 2.039857420609203, + "grad_norm": 0.9725179076194763, + "learning_rate": 1.2270953482715197e-06, + "loss": 0.0681, + "step": 12590 + }, + { + "epoch": 2.0400194426441995, + "grad_norm": 0.9976947903633118, + "learning_rate": 1.2267189928845424e-06, + "loss": 0.0639, + "step": 12591 + }, + { + "epoch": 2.0401814646791965, + "grad_norm": 0.8435698747634888, + "learning_rate": 1.2263426764568835e-06, + "loss": 0.0615, + "step": 12592 + }, + { + "epoch": 2.040343486714193, + "grad_norm": 1.729702353477478, + "learning_rate": 1.2259663990000544e-06, + "loss": 0.0608, + "step": 12593 + }, + { + "epoch": 2.04050550874919, + "grad_norm": 0.8571469783782959, + "learning_rate": 1.2255901605255715e-06, + "loss": 0.0673, + "step": 12594 + }, + { + "epoch": 2.040667530784187, + "grad_norm": 1.0115898847579956, + "learning_rate": 1.2252139610449468e-06, + "loss": 0.0618, + "step": 12595 + }, + { + "epoch": 2.0408295528191833, + "grad_norm": 0.7657039165496826, + "learning_rate": 1.224837800569689e-06, + "loss": 0.0583, + "step": 12596 + }, + { + "epoch": 2.0409915748541803, + "grad_norm": 0.8274670243263245, + "learning_rate": 1.2244616791113085e-06, + "loss": 0.0617, + "step": 12597 + }, + { + "epoch": 2.0411535968891767, + "grad_norm": 0.8327546119689941, + "learning_rate": 1.224085596681314e-06, + "loss": 0.0532, + "step": 12598 + }, + { + "epoch": 2.0413156189241737, + "grad_norm": 0.8720483779907227, + "learning_rate": 1.2237095532912125e-06, + "loss": 0.0622, + "step": 12599 + }, + { + "epoch": 2.0414776409591706, + "grad_norm": 0.9215652942657471, + "learning_rate": 1.22333354895251e-06, + "loss": 0.0658, + "step": 12600 + }, + { + "epoch": 2.041639662994167, + "grad_norm": 0.792291522026062, + "learning_rate": 1.2229575836767115e-06, + "loss": 0.0579, + "step": 12601 + }, + { + "epoch": 2.041801685029164, + "grad_norm": 0.8133555054664612, + "learning_rate": 1.2225816574753208e-06, + "loss": 0.0622, + "step": 12602 + }, + { + "epoch": 2.0419637070641605, + "grad_norm": 0.7037177681922913, + "learning_rate": 1.2222057703598398e-06, + "loss": 0.0573, + "step": 12603 + }, + { + "epoch": 2.0421257290991575, + "grad_norm": 0.9011344909667969, + "learning_rate": 1.2218299223417702e-06, + "loss": 0.0612, + "step": 12604 + }, + { + "epoch": 2.0422877511341544, + "grad_norm": 0.8436145782470703, + "learning_rate": 1.2214541134326117e-06, + "loss": 0.0604, + "step": 12605 + }, + { + "epoch": 2.042449773169151, + "grad_norm": 0.7856259346008301, + "learning_rate": 1.2210783436438644e-06, + "loss": 0.0538, + "step": 12606 + }, + { + "epoch": 2.042611795204148, + "grad_norm": 1.0986276865005493, + "learning_rate": 1.2207026129870229e-06, + "loss": 0.0682, + "step": 12607 + }, + { + "epoch": 2.0427738172391443, + "grad_norm": 0.8682820796966553, + "learning_rate": 1.2203269214735866e-06, + "loss": 0.0579, + "step": 12608 + }, + { + "epoch": 2.0429358392741412, + "grad_norm": 0.8327391147613525, + "learning_rate": 1.2199512691150496e-06, + "loss": 0.0624, + "step": 12609 + }, + { + "epoch": 2.043097861309138, + "grad_norm": 0.8508796691894531, + "learning_rate": 1.2195756559229072e-06, + "loss": 0.0579, + "step": 12610 + }, + { + "epoch": 2.0432598833441347, + "grad_norm": 0.8715740442276001, + "learning_rate": 1.2192000819086502e-06, + "loss": 0.0582, + "step": 12611 + }, + { + "epoch": 2.0434219053791316, + "grad_norm": 0.8794846534729004, + "learning_rate": 1.2188245470837702e-06, + "loss": 0.0638, + "step": 12612 + }, + { + "epoch": 2.0435839274141285, + "grad_norm": 0.8919373750686646, + "learning_rate": 1.2184490514597606e-06, + "loss": 0.068, + "step": 12613 + }, + { + "epoch": 2.043745949449125, + "grad_norm": 1.1181578636169434, + "learning_rate": 1.218073595048108e-06, + "loss": 0.074, + "step": 12614 + }, + { + "epoch": 2.043907971484122, + "grad_norm": 0.9722860455513, + "learning_rate": 1.217698177860301e-06, + "loss": 0.0629, + "step": 12615 + }, + { + "epoch": 2.0440699935191184, + "grad_norm": 0.8910360932350159, + "learning_rate": 1.2173227999078264e-06, + "loss": 0.0586, + "step": 12616 + }, + { + "epoch": 2.0442320155541154, + "grad_norm": 1.0762535333633423, + "learning_rate": 1.2169474612021703e-06, + "loss": 0.066, + "step": 12617 + }, + { + "epoch": 2.0443940375891123, + "grad_norm": 0.9633141756057739, + "learning_rate": 1.2165721617548172e-06, + "loss": 0.0638, + "step": 12618 + }, + { + "epoch": 2.044556059624109, + "grad_norm": 0.9522542357444763, + "learning_rate": 1.2161969015772498e-06, + "loss": 0.0645, + "step": 12619 + }, + { + "epoch": 2.0447180816591057, + "grad_norm": 0.907698392868042, + "learning_rate": 1.2158216806809505e-06, + "loss": 0.0629, + "step": 12620 + }, + { + "epoch": 2.0448801036941022, + "grad_norm": 0.9610335230827332, + "learning_rate": 1.2154464990774013e-06, + "loss": 0.0621, + "step": 12621 + }, + { + "epoch": 2.045042125729099, + "grad_norm": 0.8890466690063477, + "learning_rate": 1.2150713567780786e-06, + "loss": 0.0585, + "step": 12622 + }, + { + "epoch": 2.045204147764096, + "grad_norm": 0.8295314908027649, + "learning_rate": 1.2146962537944638e-06, + "loss": 0.0649, + "step": 12623 + }, + { + "epoch": 2.0453661697990926, + "grad_norm": 0.883902907371521, + "learning_rate": 1.2143211901380341e-06, + "loss": 0.055, + "step": 12624 + }, + { + "epoch": 2.0455281918340895, + "grad_norm": 0.8097559213638306, + "learning_rate": 1.2139461658202642e-06, + "loss": 0.0591, + "step": 12625 + }, + { + "epoch": 2.045690213869086, + "grad_norm": 0.8150593638420105, + "learning_rate": 1.2135711808526282e-06, + "loss": 0.0614, + "step": 12626 + }, + { + "epoch": 2.045852235904083, + "grad_norm": 0.8336517810821533, + "learning_rate": 1.213196235246602e-06, + "loss": 0.0626, + "step": 12627 + }, + { + "epoch": 2.04601425793908, + "grad_norm": 0.7647026777267456, + "learning_rate": 1.2128213290136578e-06, + "loss": 0.0603, + "step": 12628 + }, + { + "epoch": 2.0461762799740764, + "grad_norm": 0.7869901657104492, + "learning_rate": 1.212446462165265e-06, + "loss": 0.0578, + "step": 12629 + }, + { + "epoch": 2.0463383020090733, + "grad_norm": 0.957288920879364, + "learning_rate": 1.212071634712895e-06, + "loss": 0.0735, + "step": 12630 + }, + { + "epoch": 2.04650032404407, + "grad_norm": 0.9335697889328003, + "learning_rate": 1.2116968466680159e-06, + "loss": 0.062, + "step": 12631 + }, + { + "epoch": 2.0466623460790667, + "grad_norm": 0.85823655128479, + "learning_rate": 1.211322098042096e-06, + "loss": 0.0543, + "step": 12632 + }, + { + "epoch": 2.0468243681140637, + "grad_norm": 0.9083124995231628, + "learning_rate": 1.210947388846601e-06, + "loss": 0.0638, + "step": 12633 + }, + { + "epoch": 2.04698639014906, + "grad_norm": 0.773314893245697, + "learning_rate": 1.2105727190929967e-06, + "loss": 0.0619, + "step": 12634 + }, + { + "epoch": 2.047148412184057, + "grad_norm": 0.8823757767677307, + "learning_rate": 1.2101980887927467e-06, + "loss": 0.0663, + "step": 12635 + }, + { + "epoch": 2.047310434219054, + "grad_norm": 1.0058071613311768, + "learning_rate": 1.209823497957314e-06, + "loss": 0.0649, + "step": 12636 + }, + { + "epoch": 2.0474724562540505, + "grad_norm": 0.9731652736663818, + "learning_rate": 1.2094489465981602e-06, + "loss": 0.0596, + "step": 12637 + }, + { + "epoch": 2.0476344782890474, + "grad_norm": 0.9225329160690308, + "learning_rate": 1.2090744347267452e-06, + "loss": 0.074, + "step": 12638 + }, + { + "epoch": 2.047796500324044, + "grad_norm": 0.7908154129981995, + "learning_rate": 1.2086999623545297e-06, + "loss": 0.0611, + "step": 12639 + }, + { + "epoch": 2.047958522359041, + "grad_norm": 0.7675386667251587, + "learning_rate": 1.2083255294929697e-06, + "loss": 0.0613, + "step": 12640 + }, + { + "epoch": 2.048120544394038, + "grad_norm": 0.7376556396484375, + "learning_rate": 1.2079511361535214e-06, + "loss": 0.0549, + "step": 12641 + }, + { + "epoch": 2.0482825664290343, + "grad_norm": 1.0447229146957397, + "learning_rate": 1.2075767823476439e-06, + "loss": 0.0698, + "step": 12642 + }, + { + "epoch": 2.048444588464031, + "grad_norm": 0.8112842440605164, + "learning_rate": 1.207202468086788e-06, + "loss": 0.0597, + "step": 12643 + }, + { + "epoch": 2.0486066104990277, + "grad_norm": 0.9694043397903442, + "learning_rate": 1.2068281933824084e-06, + "loss": 0.0697, + "step": 12644 + }, + { + "epoch": 2.0487686325340246, + "grad_norm": 1.0804142951965332, + "learning_rate": 1.2064539582459564e-06, + "loss": 0.0691, + "step": 12645 + }, + { + "epoch": 2.0489306545690216, + "grad_norm": 0.8556890487670898, + "learning_rate": 1.2060797626888828e-06, + "loss": 0.0708, + "step": 12646 + }, + { + "epoch": 2.049092676604018, + "grad_norm": 0.839736819267273, + "learning_rate": 1.2057056067226374e-06, + "loss": 0.0549, + "step": 12647 + }, + { + "epoch": 2.049254698639015, + "grad_norm": 0.7325344681739807, + "learning_rate": 1.2053314903586685e-06, + "loss": 0.0569, + "step": 12648 + }, + { + "epoch": 2.0494167206740115, + "grad_norm": 0.9087137579917908, + "learning_rate": 1.2049574136084228e-06, + "loss": 0.066, + "step": 12649 + }, + { + "epoch": 2.0495787427090084, + "grad_norm": 0.8081238865852356, + "learning_rate": 1.2045833764833461e-06, + "loss": 0.064, + "step": 12650 + }, + { + "epoch": 2.0497407647440054, + "grad_norm": 1.0425145626068115, + "learning_rate": 1.2042093789948836e-06, + "loss": 0.0651, + "step": 12651 + }, + { + "epoch": 2.049902786779002, + "grad_norm": 0.8078098893165588, + "learning_rate": 1.2038354211544781e-06, + "loss": 0.0593, + "step": 12652 + }, + { + "epoch": 2.0500648088139988, + "grad_norm": 0.8982751369476318, + "learning_rate": 1.2034615029735722e-06, + "loss": 0.0703, + "step": 12653 + }, + { + "epoch": 2.0502268308489953, + "grad_norm": 1.2443002462387085, + "learning_rate": 1.2030876244636078e-06, + "loss": 0.0627, + "step": 12654 + }, + { + "epoch": 2.050388852883992, + "grad_norm": 0.8545292019844055, + "learning_rate": 1.2027137856360212e-06, + "loss": 0.0665, + "step": 12655 + }, + { + "epoch": 2.050550874918989, + "grad_norm": 0.7731471061706543, + "learning_rate": 1.202339986502255e-06, + "loss": 0.058, + "step": 12656 + }, + { + "epoch": 2.0507128969539856, + "grad_norm": 0.8670762777328491, + "learning_rate": 1.2019662270737455e-06, + "loss": 0.0616, + "step": 12657 + }, + { + "epoch": 2.0508749189889826, + "grad_norm": 0.7677872776985168, + "learning_rate": 1.2015925073619275e-06, + "loss": 0.0561, + "step": 12658 + }, + { + "epoch": 2.051036941023979, + "grad_norm": 0.8861536979675293, + "learning_rate": 1.2012188273782367e-06, + "loss": 0.0666, + "step": 12659 + }, + { + "epoch": 2.051198963058976, + "grad_norm": 0.8551197052001953, + "learning_rate": 1.2008451871341056e-06, + "loss": 0.0547, + "step": 12660 + }, + { + "epoch": 2.051360985093973, + "grad_norm": 0.7268028855323792, + "learning_rate": 1.20047158664097e-06, + "loss": 0.0543, + "step": 12661 + }, + { + "epoch": 2.0515230071289694, + "grad_norm": 1.0221055746078491, + "learning_rate": 1.200098025910258e-06, + "loss": 0.0664, + "step": 12662 + }, + { + "epoch": 2.0516850291639663, + "grad_norm": 0.7850842475891113, + "learning_rate": 1.1997245049534007e-06, + "loss": 0.0598, + "step": 12663 + }, + { + "epoch": 2.0518470511989633, + "grad_norm": 0.9158909320831299, + "learning_rate": 1.1993510237818269e-06, + "loss": 0.0676, + "step": 12664 + }, + { + "epoch": 2.0520090732339598, + "grad_norm": 0.9576240181922913, + "learning_rate": 1.1989775824069645e-06, + "loss": 0.0613, + "step": 12665 + }, + { + "epoch": 2.0521710952689567, + "grad_norm": 0.7837166786193848, + "learning_rate": 1.1986041808402393e-06, + "loss": 0.0554, + "step": 12666 + }, + { + "epoch": 2.052333117303953, + "grad_norm": 0.9599198698997498, + "learning_rate": 1.198230819093077e-06, + "loss": 0.0716, + "step": 12667 + }, + { + "epoch": 2.05249513933895, + "grad_norm": 1.0300556421279907, + "learning_rate": 1.1978574971769025e-06, + "loss": 0.0639, + "step": 12668 + }, + { + "epoch": 2.052657161373947, + "grad_norm": 0.9385240077972412, + "learning_rate": 1.1974842151031354e-06, + "loss": 0.0651, + "step": 12669 + }, + { + "epoch": 2.0528191834089435, + "grad_norm": 0.8229407072067261, + "learning_rate": 1.1971109728832003e-06, + "loss": 0.0599, + "step": 12670 + }, + { + "epoch": 2.0529812054439405, + "grad_norm": 0.8111064434051514, + "learning_rate": 1.1967377705285163e-06, + "loss": 0.0614, + "step": 12671 + }, + { + "epoch": 2.053143227478937, + "grad_norm": 0.716200053691864, + "learning_rate": 1.196364608050504e-06, + "loss": 0.061, + "step": 12672 + }, + { + "epoch": 2.053305249513934, + "grad_norm": 0.8643745183944702, + "learning_rate": 1.1959914854605788e-06, + "loss": 0.0606, + "step": 12673 + }, + { + "epoch": 2.053467271548931, + "grad_norm": 0.9790539741516113, + "learning_rate": 1.1956184027701576e-06, + "loss": 0.0629, + "step": 12674 + }, + { + "epoch": 2.0536292935839273, + "grad_norm": 0.9859734177589417, + "learning_rate": 1.1952453599906585e-06, + "loss": 0.0655, + "step": 12675 + }, + { + "epoch": 2.0537913156189243, + "grad_norm": 0.784511148929596, + "learning_rate": 1.1948723571334932e-06, + "loss": 0.0544, + "step": 12676 + }, + { + "epoch": 2.0539533376539207, + "grad_norm": 0.7424229383468628, + "learning_rate": 1.1944993942100755e-06, + "loss": 0.0563, + "step": 12677 + }, + { + "epoch": 2.0541153596889177, + "grad_norm": 0.9328092932701111, + "learning_rate": 1.1941264712318167e-06, + "loss": 0.0677, + "step": 12678 + }, + { + "epoch": 2.0542773817239146, + "grad_norm": 0.8559496998786926, + "learning_rate": 1.193753588210128e-06, + "loss": 0.068, + "step": 12679 + }, + { + "epoch": 2.054439403758911, + "grad_norm": 1.0441864728927612, + "learning_rate": 1.1933807451564186e-06, + "loss": 0.0597, + "step": 12680 + }, + { + "epoch": 2.054601425793908, + "grad_norm": 0.8556796908378601, + "learning_rate": 1.1930079420820962e-06, + "loss": 0.0575, + "step": 12681 + }, + { + "epoch": 2.0547634478289045, + "grad_norm": 0.9726545214653015, + "learning_rate": 1.192635178998568e-06, + "loss": 0.0698, + "step": 12682 + }, + { + "epoch": 2.0549254698639015, + "grad_norm": 0.7726395726203918, + "learning_rate": 1.1922624559172404e-06, + "loss": 0.053, + "step": 12683 + }, + { + "epoch": 2.0550874918988984, + "grad_norm": 1.3257524967193604, + "learning_rate": 1.191889772849515e-06, + "loss": 0.057, + "step": 12684 + }, + { + "epoch": 2.055249513933895, + "grad_norm": 0.8716280460357666, + "learning_rate": 1.1915171298067982e-06, + "loss": 0.0626, + "step": 12685 + }, + { + "epoch": 2.055411535968892, + "grad_norm": 0.7775639891624451, + "learning_rate": 1.1911445268004917e-06, + "loss": 0.0586, + "step": 12686 + }, + { + "epoch": 2.0555735580038887, + "grad_norm": 0.7742239236831665, + "learning_rate": 1.1907719638419943e-06, + "loss": 0.0577, + "step": 12687 + }, + { + "epoch": 2.0557355800388852, + "grad_norm": 0.9106540679931641, + "learning_rate": 1.1903994409427063e-06, + "loss": 0.0641, + "step": 12688 + }, + { + "epoch": 2.055897602073882, + "grad_norm": 0.8494895100593567, + "learning_rate": 1.1900269581140257e-06, + "loss": 0.0622, + "step": 12689 + }, + { + "epoch": 2.0560596241088787, + "grad_norm": 0.9407444596290588, + "learning_rate": 1.1896545153673517e-06, + "loss": 0.067, + "step": 12690 + }, + { + "epoch": 2.0562216461438756, + "grad_norm": 0.734492301940918, + "learning_rate": 1.1892821127140777e-06, + "loss": 0.0553, + "step": 12691 + }, + { + "epoch": 2.0563836681788725, + "grad_norm": 0.8618312478065491, + "learning_rate": 1.1889097501655991e-06, + "loss": 0.0624, + "step": 12692 + }, + { + "epoch": 2.056545690213869, + "grad_norm": 0.8262837529182434, + "learning_rate": 1.1885374277333095e-06, + "loss": 0.0579, + "step": 12693 + }, + { + "epoch": 2.056707712248866, + "grad_norm": 0.832383930683136, + "learning_rate": 1.1881651454286008e-06, + "loss": 0.0627, + "step": 12694 + }, + { + "epoch": 2.0568697342838624, + "grad_norm": 0.8754958510398865, + "learning_rate": 1.187792903262864e-06, + "loss": 0.0592, + "step": 12695 + }, + { + "epoch": 2.0570317563188594, + "grad_norm": 0.9576855301856995, + "learning_rate": 1.1874207012474891e-06, + "loss": 0.0672, + "step": 12696 + }, + { + "epoch": 2.0571937783538563, + "grad_norm": 0.8752747774124146, + "learning_rate": 1.1870485393938644e-06, + "loss": 0.0658, + "step": 12697 + }, + { + "epoch": 2.057355800388853, + "grad_norm": 0.9489736557006836, + "learning_rate": 1.186676417713377e-06, + "loss": 0.062, + "step": 12698 + }, + { + "epoch": 2.0575178224238497, + "grad_norm": 0.791542112827301, + "learning_rate": 1.1863043362174129e-06, + "loss": 0.056, + "step": 12699 + }, + { + "epoch": 2.057679844458846, + "grad_norm": 0.8600035309791565, + "learning_rate": 1.1859322949173572e-06, + "loss": 0.0646, + "step": 12700 + }, + { + "epoch": 2.057841866493843, + "grad_norm": 0.8470669984817505, + "learning_rate": 1.1855602938245942e-06, + "loss": 0.0658, + "step": 12701 + }, + { + "epoch": 2.05800388852884, + "grad_norm": 0.8929564952850342, + "learning_rate": 1.1851883329505043e-06, + "loss": 0.0589, + "step": 12702 + }, + { + "epoch": 2.0581659105638366, + "grad_norm": 0.9186873435974121, + "learning_rate": 1.1848164123064687e-06, + "loss": 0.073, + "step": 12703 + }, + { + "epoch": 2.0583279325988335, + "grad_norm": 0.803128719329834, + "learning_rate": 1.1844445319038694e-06, + "loss": 0.059, + "step": 12704 + }, + { + "epoch": 2.05848995463383, + "grad_norm": 0.7790929675102234, + "learning_rate": 1.1840726917540846e-06, + "loss": 0.0562, + "step": 12705 + }, + { + "epoch": 2.058651976668827, + "grad_norm": 0.9201391935348511, + "learning_rate": 1.18370089186849e-06, + "loss": 0.0644, + "step": 12706 + }, + { + "epoch": 2.058813998703824, + "grad_norm": 1.1075505018234253, + "learning_rate": 1.1833291322584625e-06, + "loss": 0.0671, + "step": 12707 + }, + { + "epoch": 2.0589760207388204, + "grad_norm": 0.869609534740448, + "learning_rate": 1.1829574129353777e-06, + "loss": 0.0663, + "step": 12708 + }, + { + "epoch": 2.0591380427738173, + "grad_norm": 1.0262631177902222, + "learning_rate": 1.1825857339106086e-06, + "loss": 0.059, + "step": 12709 + }, + { + "epoch": 2.059300064808814, + "grad_norm": 0.8231914043426514, + "learning_rate": 1.182214095195528e-06, + "loss": 0.0576, + "step": 12710 + }, + { + "epoch": 2.0594620868438107, + "grad_norm": 0.8270055651664734, + "learning_rate": 1.181842496801507e-06, + "loss": 0.0574, + "step": 12711 + }, + { + "epoch": 2.0596241088788076, + "grad_norm": 1.0092082023620605, + "learning_rate": 1.181470938739917e-06, + "loss": 0.0619, + "step": 12712 + }, + { + "epoch": 2.059786130913804, + "grad_norm": 0.7911940217018127, + "learning_rate": 1.1810994210221234e-06, + "loss": 0.0587, + "step": 12713 + }, + { + "epoch": 2.059948152948801, + "grad_norm": 0.9792639017105103, + "learning_rate": 1.1807279436594967e-06, + "loss": 0.0734, + "step": 12714 + }, + { + "epoch": 2.060110174983798, + "grad_norm": 1.0325294733047485, + "learning_rate": 1.1803565066634027e-06, + "loss": 0.0708, + "step": 12715 + }, + { + "epoch": 2.0602721970187945, + "grad_norm": 0.8511001467704773, + "learning_rate": 1.1799851100452067e-06, + "loss": 0.0583, + "step": 12716 + }, + { + "epoch": 2.0604342190537914, + "grad_norm": 0.9019601345062256, + "learning_rate": 1.17961375381627e-06, + "loss": 0.0669, + "step": 12717 + }, + { + "epoch": 2.060596241088788, + "grad_norm": 0.9868301153182983, + "learning_rate": 1.1792424379879582e-06, + "loss": 0.0608, + "step": 12718 + }, + { + "epoch": 2.060758263123785, + "grad_norm": 0.954416036605835, + "learning_rate": 1.178871162571633e-06, + "loss": 0.0735, + "step": 12719 + }, + { + "epoch": 2.060920285158782, + "grad_norm": 1.1071062088012695, + "learning_rate": 1.1784999275786515e-06, + "loss": 0.0564, + "step": 12720 + }, + { + "epoch": 2.0610823071937783, + "grad_norm": 0.8846614956855774, + "learning_rate": 1.1781287330203747e-06, + "loss": 0.0631, + "step": 12721 + }, + { + "epoch": 2.061244329228775, + "grad_norm": 0.894943118095398, + "learning_rate": 1.177757578908159e-06, + "loss": 0.0604, + "step": 12722 + }, + { + "epoch": 2.0614063512637717, + "grad_norm": 1.3799413442611694, + "learning_rate": 1.177386465253363e-06, + "loss": 0.0675, + "step": 12723 + }, + { + "epoch": 2.0615683732987686, + "grad_norm": 0.8426232933998108, + "learning_rate": 1.17701539206734e-06, + "loss": 0.0613, + "step": 12724 + }, + { + "epoch": 2.0617303953337656, + "grad_norm": 0.9407797455787659, + "learning_rate": 1.176644359361444e-06, + "loss": 0.0729, + "step": 12725 + }, + { + "epoch": 2.061892417368762, + "grad_norm": 0.9105675220489502, + "learning_rate": 1.1762733671470285e-06, + "loss": 0.0679, + "step": 12726 + }, + { + "epoch": 2.062054439403759, + "grad_norm": 0.748227596282959, + "learning_rate": 1.1759024154354446e-06, + "loss": 0.0594, + "step": 12727 + }, + { + "epoch": 2.0622164614387555, + "grad_norm": 0.8079039454460144, + "learning_rate": 1.1755315042380425e-06, + "loss": 0.0629, + "step": 12728 + }, + { + "epoch": 2.0623784834737524, + "grad_norm": 0.9434335231781006, + "learning_rate": 1.175160633566171e-06, + "loss": 0.0646, + "step": 12729 + }, + { + "epoch": 2.0625405055087493, + "grad_norm": 0.8539111018180847, + "learning_rate": 1.1747898034311782e-06, + "loss": 0.0605, + "step": 12730 + }, + { + "epoch": 2.062702527543746, + "grad_norm": 0.8666152954101562, + "learning_rate": 1.1744190138444118e-06, + "loss": 0.071, + "step": 12731 + }, + { + "epoch": 2.0628645495787428, + "grad_norm": 1.0582692623138428, + "learning_rate": 1.1740482648172132e-06, + "loss": 0.0718, + "step": 12732 + }, + { + "epoch": 2.0630265716137393, + "grad_norm": 0.9556350111961365, + "learning_rate": 1.1736775563609305e-06, + "loss": 0.0716, + "step": 12733 + }, + { + "epoch": 2.063188593648736, + "grad_norm": 0.829980731010437, + "learning_rate": 1.1733068884869053e-06, + "loss": 0.0654, + "step": 12734 + }, + { + "epoch": 2.063350615683733, + "grad_norm": 0.7806819081306458, + "learning_rate": 1.1729362612064782e-06, + "loss": 0.0574, + "step": 12735 + }, + { + "epoch": 2.0635126377187296, + "grad_norm": 1.1992825269699097, + "learning_rate": 1.172565674530989e-06, + "loss": 0.0556, + "step": 12736 + }, + { + "epoch": 2.0636746597537265, + "grad_norm": 0.9343496561050415, + "learning_rate": 1.1721951284717797e-06, + "loss": 0.0697, + "step": 12737 + }, + { + "epoch": 2.0638366817887235, + "grad_norm": 0.8090648055076599, + "learning_rate": 1.1718246230401856e-06, + "loss": 0.0644, + "step": 12738 + }, + { + "epoch": 2.06399870382372, + "grad_norm": 0.9163773059844971, + "learning_rate": 1.1714541582475435e-06, + "loss": 0.0707, + "step": 12739 + }, + { + "epoch": 2.064160725858717, + "grad_norm": 0.9538021683692932, + "learning_rate": 1.1710837341051892e-06, + "loss": 0.0655, + "step": 12740 + }, + { + "epoch": 2.0643227478937134, + "grad_norm": 0.894012451171875, + "learning_rate": 1.170713350624457e-06, + "loss": 0.067, + "step": 12741 + }, + { + "epoch": 2.0644847699287103, + "grad_norm": 0.896256148815155, + "learning_rate": 1.1703430078166792e-06, + "loss": 0.0683, + "step": 12742 + }, + { + "epoch": 2.0646467919637073, + "grad_norm": 0.8130720257759094, + "learning_rate": 1.1699727056931878e-06, + "loss": 0.0666, + "step": 12743 + }, + { + "epoch": 2.0648088139987038, + "grad_norm": 0.7944749593734741, + "learning_rate": 1.169602444265313e-06, + "loss": 0.0639, + "step": 12744 + }, + { + "epoch": 2.0649708360337007, + "grad_norm": 0.9021603465080261, + "learning_rate": 1.1692322235443845e-06, + "loss": 0.069, + "step": 12745 + }, + { + "epoch": 2.065132858068697, + "grad_norm": 0.8242917060852051, + "learning_rate": 1.168862043541728e-06, + "loss": 0.0675, + "step": 12746 + }, + { + "epoch": 2.065294880103694, + "grad_norm": 0.806065559387207, + "learning_rate": 1.1684919042686727e-06, + "loss": 0.0595, + "step": 12747 + }, + { + "epoch": 2.065456902138691, + "grad_norm": 0.8553471565246582, + "learning_rate": 1.1681218057365429e-06, + "loss": 0.0588, + "step": 12748 + }, + { + "epoch": 2.0656189241736875, + "grad_norm": 0.8592274785041809, + "learning_rate": 1.1677517479566636e-06, + "loss": 0.0677, + "step": 12749 + }, + { + "epoch": 2.0657809462086845, + "grad_norm": 0.8480504155158997, + "learning_rate": 1.167381730940356e-06, + "loss": 0.0636, + "step": 12750 + }, + { + "epoch": 2.065942968243681, + "grad_norm": 0.768464207649231, + "learning_rate": 1.1670117546989416e-06, + "loss": 0.0603, + "step": 12751 + }, + { + "epoch": 2.066104990278678, + "grad_norm": 0.8079333305358887, + "learning_rate": 1.1666418192437434e-06, + "loss": 0.0615, + "step": 12752 + }, + { + "epoch": 2.066267012313675, + "grad_norm": 0.8408043384552002, + "learning_rate": 1.1662719245860782e-06, + "loss": 0.0604, + "step": 12753 + }, + { + "epoch": 2.0664290343486713, + "grad_norm": 0.8342251181602478, + "learning_rate": 1.1659020707372643e-06, + "loss": 0.061, + "step": 12754 + }, + { + "epoch": 2.0665910563836682, + "grad_norm": 0.868290364742279, + "learning_rate": 1.1655322577086186e-06, + "loss": 0.0628, + "step": 12755 + }, + { + "epoch": 2.0667530784186647, + "grad_norm": 0.7598575353622437, + "learning_rate": 1.1651624855114565e-06, + "loss": 0.0547, + "step": 12756 + }, + { + "epoch": 2.0669151004536617, + "grad_norm": 0.7536554932594299, + "learning_rate": 1.1647927541570922e-06, + "loss": 0.0583, + "step": 12757 + }, + { + "epoch": 2.0670771224886586, + "grad_norm": 0.8469033241271973, + "learning_rate": 1.1644230636568384e-06, + "loss": 0.0659, + "step": 12758 + }, + { + "epoch": 2.067239144523655, + "grad_norm": 0.796055257320404, + "learning_rate": 1.164053414022007e-06, + "loss": 0.0568, + "step": 12759 + }, + { + "epoch": 2.067401166558652, + "grad_norm": 0.922766387462616, + "learning_rate": 1.1636838052639081e-06, + "loss": 0.0627, + "step": 12760 + }, + { + "epoch": 2.067563188593649, + "grad_norm": 0.876051664352417, + "learning_rate": 1.163314237393851e-06, + "loss": 0.0594, + "step": 12761 + }, + { + "epoch": 2.0677252106286454, + "grad_norm": 0.8684565424919128, + "learning_rate": 1.1629447104231435e-06, + "loss": 0.0642, + "step": 12762 + }, + { + "epoch": 2.0678872326636424, + "grad_norm": 0.8998528122901917, + "learning_rate": 1.162575224363093e-06, + "loss": 0.0714, + "step": 12763 + }, + { + "epoch": 2.068049254698639, + "grad_norm": 0.874921977519989, + "learning_rate": 1.1622057792250033e-06, + "loss": 0.0613, + "step": 12764 + }, + { + "epoch": 2.068211276733636, + "grad_norm": 0.9340734481811523, + "learning_rate": 1.1618363750201784e-06, + "loss": 0.0604, + "step": 12765 + }, + { + "epoch": 2.0683732987686327, + "grad_norm": 1.0002615451812744, + "learning_rate": 1.1614670117599231e-06, + "loss": 0.0656, + "step": 12766 + }, + { + "epoch": 2.0685353208036292, + "grad_norm": 0.8985110521316528, + "learning_rate": 1.161097689455539e-06, + "loss": 0.0595, + "step": 12767 + }, + { + "epoch": 2.068697342838626, + "grad_norm": 0.829571008682251, + "learning_rate": 1.1607284081183245e-06, + "loss": 0.0554, + "step": 12768 + }, + { + "epoch": 2.0688593648736227, + "grad_norm": 0.9190569519996643, + "learning_rate": 1.16035916775958e-06, + "loss": 0.0607, + "step": 12769 + }, + { + "epoch": 2.0690213869086196, + "grad_norm": 0.9735108017921448, + "learning_rate": 1.1599899683906026e-06, + "loss": 0.0649, + "step": 12770 + }, + { + "epoch": 2.0691834089436165, + "grad_norm": 0.8792665600776672, + "learning_rate": 1.1596208100226899e-06, + "loss": 0.0554, + "step": 12771 + }, + { + "epoch": 2.069345430978613, + "grad_norm": 0.8092507719993591, + "learning_rate": 1.1592516926671367e-06, + "loss": 0.0611, + "step": 12772 + }, + { + "epoch": 2.06950745301361, + "grad_norm": 0.9477719068527222, + "learning_rate": 1.1588826163352369e-06, + "loss": 0.0648, + "step": 12773 + }, + { + "epoch": 2.0696694750486064, + "grad_norm": 0.806533694267273, + "learning_rate": 1.1585135810382836e-06, + "loss": 0.0603, + "step": 12774 + }, + { + "epoch": 2.0698314970836034, + "grad_norm": 0.8330726027488708, + "learning_rate": 1.1581445867875684e-06, + "loss": 0.0618, + "step": 12775 + }, + { + "epoch": 2.0699935191186003, + "grad_norm": 0.9184839129447937, + "learning_rate": 1.1577756335943818e-06, + "loss": 0.0624, + "step": 12776 + }, + { + "epoch": 2.070155541153597, + "grad_norm": 0.8566496968269348, + "learning_rate": 1.1574067214700127e-06, + "loss": 0.0613, + "step": 12777 + }, + { + "epoch": 2.0703175631885937, + "grad_norm": 0.7928066849708557, + "learning_rate": 1.1570378504257499e-06, + "loss": 0.0562, + "step": 12778 + }, + { + "epoch": 2.07047958522359, + "grad_norm": 0.8417590856552124, + "learning_rate": 1.1566690204728779e-06, + "loss": 0.0549, + "step": 12779 + }, + { + "epoch": 2.070641607258587, + "grad_norm": 0.9719398021697998, + "learning_rate": 1.156300231622682e-06, + "loss": 0.0687, + "step": 12780 + }, + { + "epoch": 2.070803629293584, + "grad_norm": 0.9284224510192871, + "learning_rate": 1.1559314838864494e-06, + "loss": 0.0637, + "step": 12781 + }, + { + "epoch": 2.0709656513285806, + "grad_norm": 1.0845189094543457, + "learning_rate": 1.1555627772754595e-06, + "loss": 0.0689, + "step": 12782 + }, + { + "epoch": 2.0711276733635775, + "grad_norm": 0.9417441487312317, + "learning_rate": 1.1551941118009957e-06, + "loss": 0.0633, + "step": 12783 + }, + { + "epoch": 2.071289695398574, + "grad_norm": 0.8231827020645142, + "learning_rate": 1.1548254874743365e-06, + "loss": 0.06, + "step": 12784 + }, + { + "epoch": 2.071451717433571, + "grad_norm": 0.8818009495735168, + "learning_rate": 1.154456904306764e-06, + "loss": 0.0611, + "step": 12785 + }, + { + "epoch": 2.071613739468568, + "grad_norm": 0.8181620240211487, + "learning_rate": 1.154088362309553e-06, + "loss": 0.0578, + "step": 12786 + }, + { + "epoch": 2.0717757615035644, + "grad_norm": 0.9999836087226868, + "learning_rate": 1.1537198614939812e-06, + "loss": 0.0684, + "step": 12787 + }, + { + "epoch": 2.0719377835385613, + "grad_norm": 0.8913933634757996, + "learning_rate": 1.1533514018713238e-06, + "loss": 0.0593, + "step": 12788 + }, + { + "epoch": 2.072099805573558, + "grad_norm": 0.8349035978317261, + "learning_rate": 1.1529829834528547e-06, + "loss": 0.0594, + "step": 12789 + }, + { + "epoch": 2.0722618276085547, + "grad_norm": 1.0070099830627441, + "learning_rate": 1.1526146062498464e-06, + "loss": 0.067, + "step": 12790 + }, + { + "epoch": 2.0724238496435516, + "grad_norm": 0.8604516386985779, + "learning_rate": 1.1522462702735708e-06, + "loss": 0.0622, + "step": 12791 + }, + { + "epoch": 2.072585871678548, + "grad_norm": 0.8990740776062012, + "learning_rate": 1.1518779755352977e-06, + "loss": 0.0691, + "step": 12792 + }, + { + "epoch": 2.072747893713545, + "grad_norm": 0.9744791388511658, + "learning_rate": 1.151509722046297e-06, + "loss": 0.0604, + "step": 12793 + }, + { + "epoch": 2.072909915748542, + "grad_norm": 0.8340404629707336, + "learning_rate": 1.1511415098178336e-06, + "loss": 0.0611, + "step": 12794 + }, + { + "epoch": 2.0730719377835385, + "grad_norm": 0.9593438506126404, + "learning_rate": 1.1507733388611768e-06, + "loss": 0.0699, + "step": 12795 + }, + { + "epoch": 2.0732339598185354, + "grad_norm": 0.8299152255058289, + "learning_rate": 1.1504052091875917e-06, + "loss": 0.0567, + "step": 12796 + }, + { + "epoch": 2.073395981853532, + "grad_norm": 0.7919155359268188, + "learning_rate": 1.1500371208083405e-06, + "loss": 0.0604, + "step": 12797 + }, + { + "epoch": 2.073558003888529, + "grad_norm": 0.9348534345626831, + "learning_rate": 1.1496690737346864e-06, + "loss": 0.0608, + "step": 12798 + }, + { + "epoch": 2.073720025923526, + "grad_norm": 0.8300426006317139, + "learning_rate": 1.14930106797789e-06, + "loss": 0.0642, + "step": 12799 + }, + { + "epoch": 2.0738820479585223, + "grad_norm": 0.8624827861785889, + "learning_rate": 1.148933103549214e-06, + "loss": 0.0623, + "step": 12800 + }, + { + "epoch": 2.074044069993519, + "grad_norm": 0.9354908466339111, + "learning_rate": 1.148565180459915e-06, + "loss": 0.0667, + "step": 12801 + }, + { + "epoch": 2.0742060920285157, + "grad_norm": 0.9018098711967468, + "learning_rate": 1.1481972987212505e-06, + "loss": 0.0565, + "step": 12802 + }, + { + "epoch": 2.0743681140635126, + "grad_norm": 0.9622835516929626, + "learning_rate": 1.1478294583444779e-06, + "loss": 0.0651, + "step": 12803 + }, + { + "epoch": 2.0745301360985096, + "grad_norm": 0.8161885142326355, + "learning_rate": 1.1474616593408513e-06, + "loss": 0.0615, + "step": 12804 + }, + { + "epoch": 2.074692158133506, + "grad_norm": 0.909511148929596, + "learning_rate": 1.147093901721625e-06, + "loss": 0.062, + "step": 12805 + }, + { + "epoch": 2.074854180168503, + "grad_norm": 0.7995123267173767, + "learning_rate": 1.1467261854980513e-06, + "loss": 0.063, + "step": 12806 + }, + { + "epoch": 2.0750162022034995, + "grad_norm": 0.8639039993286133, + "learning_rate": 1.1463585106813823e-06, + "loss": 0.0635, + "step": 12807 + }, + { + "epoch": 2.0751782242384964, + "grad_norm": 0.8831427693367004, + "learning_rate": 1.1459908772828658e-06, + "loss": 0.0637, + "step": 12808 + }, + { + "epoch": 2.0753402462734933, + "grad_norm": 0.9369395971298218, + "learning_rate": 1.1456232853137522e-06, + "loss": 0.065, + "step": 12809 + }, + { + "epoch": 2.07550226830849, + "grad_norm": 0.8888958096504211, + "learning_rate": 1.1452557347852885e-06, + "loss": 0.062, + "step": 12810 + }, + { + "epoch": 2.0756642903434868, + "grad_norm": 0.9641885757446289, + "learning_rate": 1.1448882257087222e-06, + "loss": 0.0675, + "step": 12811 + }, + { + "epoch": 2.0758263123784833, + "grad_norm": 1.0952035188674927, + "learning_rate": 1.1445207580952956e-06, + "loss": 0.0677, + "step": 12812 + }, + { + "epoch": 2.07598833441348, + "grad_norm": 1.0690641403198242, + "learning_rate": 1.1441533319562528e-06, + "loss": 0.0685, + "step": 12813 + }, + { + "epoch": 2.076150356448477, + "grad_norm": 0.9642135500907898, + "learning_rate": 1.143785947302839e-06, + "loss": 0.0711, + "step": 12814 + }, + { + "epoch": 2.0763123784834736, + "grad_norm": 1.0191136598587036, + "learning_rate": 1.143418604146292e-06, + "loss": 0.0728, + "step": 12815 + }, + { + "epoch": 2.0764744005184705, + "grad_norm": 0.8897978663444519, + "learning_rate": 1.143051302497853e-06, + "loss": 0.0671, + "step": 12816 + }, + { + "epoch": 2.0766364225534675, + "grad_norm": 1.1707680225372314, + "learning_rate": 1.1426840423687605e-06, + "loss": 0.0633, + "step": 12817 + }, + { + "epoch": 2.076798444588464, + "grad_norm": 0.7090242505073547, + "learning_rate": 1.1423168237702515e-06, + "loss": 0.0524, + "step": 12818 + }, + { + "epoch": 2.076960466623461, + "grad_norm": 0.8160051703453064, + "learning_rate": 1.141949646713562e-06, + "loss": 0.0589, + "step": 12819 + }, + { + "epoch": 2.0771224886584574, + "grad_norm": 0.8117794394493103, + "learning_rate": 1.1415825112099274e-06, + "loss": 0.0533, + "step": 12820 + }, + { + "epoch": 2.0772845106934543, + "grad_norm": 0.9710679054260254, + "learning_rate": 1.1412154172705803e-06, + "loss": 0.0625, + "step": 12821 + }, + { + "epoch": 2.0774465327284513, + "grad_norm": 0.8708137273788452, + "learning_rate": 1.1408483649067541e-06, + "loss": 0.0557, + "step": 12822 + }, + { + "epoch": 2.0776085547634477, + "grad_norm": 0.8636717200279236, + "learning_rate": 1.1404813541296772e-06, + "loss": 0.0605, + "step": 12823 + }, + { + "epoch": 2.0777705767984447, + "grad_norm": 0.763903021812439, + "learning_rate": 1.1401143849505816e-06, + "loss": 0.0597, + "step": 12824 + }, + { + "epoch": 2.077932598833441, + "grad_norm": 0.9749400019645691, + "learning_rate": 1.139747457380696e-06, + "loss": 0.0633, + "step": 12825 + }, + { + "epoch": 2.078094620868438, + "grad_norm": 0.9545486569404602, + "learning_rate": 1.1393805714312456e-06, + "loss": 0.0734, + "step": 12826 + }, + { + "epoch": 2.078256642903435, + "grad_norm": 0.8538177013397217, + "learning_rate": 1.1390137271134564e-06, + "loss": 0.064, + "step": 12827 + }, + { + "epoch": 2.0784186649384315, + "grad_norm": 1.0204607248306274, + "learning_rate": 1.138646924438554e-06, + "loss": 0.0705, + "step": 12828 + }, + { + "epoch": 2.0785806869734285, + "grad_norm": 0.9345843195915222, + "learning_rate": 1.1382801634177627e-06, + "loss": 0.0648, + "step": 12829 + }, + { + "epoch": 2.078742709008425, + "grad_norm": 0.9654140472412109, + "learning_rate": 1.1379134440623018e-06, + "loss": 0.0674, + "step": 12830 + }, + { + "epoch": 2.078904731043422, + "grad_norm": 0.8916672468185425, + "learning_rate": 1.1375467663833935e-06, + "loss": 0.0664, + "step": 12831 + }, + { + "epoch": 2.079066753078419, + "grad_norm": 0.9074366092681885, + "learning_rate": 1.137180130392257e-06, + "loss": 0.0525, + "step": 12832 + }, + { + "epoch": 2.0792287751134153, + "grad_norm": 0.9792858362197876, + "learning_rate": 1.1368135361001107e-06, + "loss": 0.0692, + "step": 12833 + }, + { + "epoch": 2.0793907971484122, + "grad_norm": 0.9119909405708313, + "learning_rate": 1.1364469835181712e-06, + "loss": 0.0681, + "step": 12834 + }, + { + "epoch": 2.0795528191834087, + "grad_norm": 0.876586377620697, + "learning_rate": 1.1360804726576543e-06, + "loss": 0.0603, + "step": 12835 + }, + { + "epoch": 2.0797148412184057, + "grad_norm": 0.8376288414001465, + "learning_rate": 1.1357140035297745e-06, + "loss": 0.0638, + "step": 12836 + }, + { + "epoch": 2.0798768632534026, + "grad_norm": 0.8386695981025696, + "learning_rate": 1.1353475761457445e-06, + "loss": 0.057, + "step": 12837 + }, + { + "epoch": 2.080038885288399, + "grad_norm": 0.8405042886734009, + "learning_rate": 1.1349811905167762e-06, + "loss": 0.066, + "step": 12838 + }, + { + "epoch": 2.080200907323396, + "grad_norm": 0.9115575551986694, + "learning_rate": 1.13461484665408e-06, + "loss": 0.0623, + "step": 12839 + }, + { + "epoch": 2.080362929358393, + "grad_norm": 0.7548916339874268, + "learning_rate": 1.134248544568867e-06, + "loss": 0.0554, + "step": 12840 + }, + { + "epoch": 2.0805249513933894, + "grad_norm": 0.873468816280365, + "learning_rate": 1.1338822842723418e-06, + "loss": 0.0674, + "step": 12841 + }, + { + "epoch": 2.0806869734283864, + "grad_norm": 0.9145466685295105, + "learning_rate": 1.1335160657757121e-06, + "loss": 0.0608, + "step": 12842 + }, + { + "epoch": 2.080848995463383, + "grad_norm": 0.8636155724525452, + "learning_rate": 1.1331498890901851e-06, + "loss": 0.064, + "step": 12843 + }, + { + "epoch": 2.08101101749838, + "grad_norm": 0.9404274821281433, + "learning_rate": 1.1327837542269645e-06, + "loss": 0.0603, + "step": 12844 + }, + { + "epoch": 2.0811730395333767, + "grad_norm": 0.867577850818634, + "learning_rate": 1.1324176611972515e-06, + "loss": 0.0659, + "step": 12845 + }, + { + "epoch": 2.0813350615683732, + "grad_norm": 0.7574745416641235, + "learning_rate": 1.1320516100122487e-06, + "loss": 0.0572, + "step": 12846 + }, + { + "epoch": 2.08149708360337, + "grad_norm": 0.8237707614898682, + "learning_rate": 1.1316856006831562e-06, + "loss": 0.0643, + "step": 12847 + }, + { + "epoch": 2.0816591056383666, + "grad_norm": 0.950802743434906, + "learning_rate": 1.1313196332211728e-06, + "loss": 0.0632, + "step": 12848 + }, + { + "epoch": 2.0818211276733636, + "grad_norm": 0.9620135426521301, + "learning_rate": 1.1309537076374968e-06, + "loss": 0.0682, + "step": 12849 + }, + { + "epoch": 2.0819831497083605, + "grad_norm": 0.9418061375617981, + "learning_rate": 1.130587823943324e-06, + "loss": 0.0629, + "step": 12850 + }, + { + "epoch": 2.082145171743357, + "grad_norm": 0.8530777096748352, + "learning_rate": 1.1302219821498502e-06, + "loss": 0.0565, + "step": 12851 + }, + { + "epoch": 2.082307193778354, + "grad_norm": 0.8891876339912415, + "learning_rate": 1.1298561822682687e-06, + "loss": 0.0709, + "step": 12852 + }, + { + "epoch": 2.0824692158133504, + "grad_norm": 0.9837715029716492, + "learning_rate": 1.1294904243097726e-06, + "loss": 0.0619, + "step": 12853 + }, + { + "epoch": 2.0826312378483474, + "grad_norm": 0.8900513648986816, + "learning_rate": 1.1291247082855528e-06, + "loss": 0.0661, + "step": 12854 + }, + { + "epoch": 2.0827932598833443, + "grad_norm": 0.8813618421554565, + "learning_rate": 1.1287590342068005e-06, + "loss": 0.0588, + "step": 12855 + }, + { + "epoch": 2.082955281918341, + "grad_norm": 0.7814889550209045, + "learning_rate": 1.1283934020847015e-06, + "loss": 0.0516, + "step": 12856 + }, + { + "epoch": 2.0831173039533377, + "grad_norm": 0.7788052558898926, + "learning_rate": 1.128027811930446e-06, + "loss": 0.0581, + "step": 12857 + }, + { + "epoch": 2.083279325988334, + "grad_norm": 0.8594955205917358, + "learning_rate": 1.1276622637552203e-06, + "loss": 0.0575, + "step": 12858 + }, + { + "epoch": 2.083441348023331, + "grad_norm": 0.8879890441894531, + "learning_rate": 1.1272967575702075e-06, + "loss": 0.0587, + "step": 12859 + }, + { + "epoch": 2.083603370058328, + "grad_norm": 1.0055869817733765, + "learning_rate": 1.126931293386592e-06, + "loss": 0.0642, + "step": 12860 + }, + { + "epoch": 2.0837653920933246, + "grad_norm": 1.0345205068588257, + "learning_rate": 1.1265658712155552e-06, + "loss": 0.0691, + "step": 12861 + }, + { + "epoch": 2.0839274141283215, + "grad_norm": 0.8441528081893921, + "learning_rate": 1.1262004910682811e-06, + "loss": 0.0597, + "step": 12862 + }, + { + "epoch": 2.0840894361633184, + "grad_norm": 0.9225409030914307, + "learning_rate": 1.1258351529559463e-06, + "loss": 0.0598, + "step": 12863 + }, + { + "epoch": 2.084251458198315, + "grad_norm": 0.9410536885261536, + "learning_rate": 1.1254698568897308e-06, + "loss": 0.0637, + "step": 12864 + }, + { + "epoch": 2.084413480233312, + "grad_norm": 0.9590262174606323, + "learning_rate": 1.1251046028808107e-06, + "loss": 0.0622, + "step": 12865 + }, + { + "epoch": 2.0845755022683083, + "grad_norm": 0.944026529788971, + "learning_rate": 1.124739390940363e-06, + "loss": 0.0729, + "step": 12866 + }, + { + "epoch": 2.0847375243033053, + "grad_norm": 0.8297381401062012, + "learning_rate": 1.124374221079562e-06, + "loss": 0.0582, + "step": 12867 + }, + { + "epoch": 2.084899546338302, + "grad_norm": 0.8802173137664795, + "learning_rate": 1.1240090933095806e-06, + "loss": 0.0617, + "step": 12868 + }, + { + "epoch": 2.0850615683732987, + "grad_norm": 0.8409082889556885, + "learning_rate": 1.1236440076415923e-06, + "loss": 0.0575, + "step": 12869 + }, + { + "epoch": 2.0852235904082956, + "grad_norm": 0.8129790425300598, + "learning_rate": 1.1232789640867644e-06, + "loss": 0.0586, + "step": 12870 + }, + { + "epoch": 2.085385612443292, + "grad_norm": 0.9125465750694275, + "learning_rate": 1.1229139626562698e-06, + "loss": 0.0634, + "step": 12871 + }, + { + "epoch": 2.085547634478289, + "grad_norm": 0.8047336935997009, + "learning_rate": 1.1225490033612755e-06, + "loss": 0.0589, + "step": 12872 + }, + { + "epoch": 2.085709656513286, + "grad_norm": 0.9558115005493164, + "learning_rate": 1.1221840862129493e-06, + "loss": 0.0637, + "step": 12873 + }, + { + "epoch": 2.0858716785482825, + "grad_norm": 0.873754620552063, + "learning_rate": 1.1218192112224547e-06, + "loss": 0.0675, + "step": 12874 + }, + { + "epoch": 2.0860337005832794, + "grad_norm": 0.857243001461029, + "learning_rate": 1.1214543784009563e-06, + "loss": 0.065, + "step": 12875 + }, + { + "epoch": 2.086195722618276, + "grad_norm": 0.8386802673339844, + "learning_rate": 1.1210895877596195e-06, + "loss": 0.0564, + "step": 12876 + }, + { + "epoch": 2.086357744653273, + "grad_norm": 0.958071231842041, + "learning_rate": 1.1207248393096038e-06, + "loss": 0.0732, + "step": 12877 + }, + { + "epoch": 2.0865197666882698, + "grad_norm": 0.8746595978736877, + "learning_rate": 1.12036013306207e-06, + "loss": 0.0591, + "step": 12878 + }, + { + "epoch": 2.0866817887232663, + "grad_norm": 0.854904294013977, + "learning_rate": 1.1199954690281779e-06, + "loss": 0.0653, + "step": 12879 + }, + { + "epoch": 2.086843810758263, + "grad_norm": 0.8066260814666748, + "learning_rate": 1.1196308472190845e-06, + "loss": 0.0576, + "step": 12880 + }, + { + "epoch": 2.0870058327932597, + "grad_norm": 0.894486129283905, + "learning_rate": 1.1192662676459468e-06, + "loss": 0.0684, + "step": 12881 + }, + { + "epoch": 2.0871678548282566, + "grad_norm": 0.8854237198829651, + "learning_rate": 1.1189017303199198e-06, + "loss": 0.0656, + "step": 12882 + }, + { + "epoch": 2.0873298768632536, + "grad_norm": 0.8725504875183105, + "learning_rate": 1.1185372352521581e-06, + "loss": 0.0578, + "step": 12883 + }, + { + "epoch": 2.08749189889825, + "grad_norm": 0.8695107102394104, + "learning_rate": 1.1181727824538147e-06, + "loss": 0.0676, + "step": 12884 + }, + { + "epoch": 2.087653920933247, + "grad_norm": 0.9441255331039429, + "learning_rate": 1.117808371936038e-06, + "loss": 0.065, + "step": 12885 + }, + { + "epoch": 2.087815942968244, + "grad_norm": 0.9579770565032959, + "learning_rate": 1.1174440037099815e-06, + "loss": 0.0623, + "step": 12886 + }, + { + "epoch": 2.0879779650032404, + "grad_norm": 0.9786186218261719, + "learning_rate": 1.117079677786793e-06, + "loss": 0.0702, + "step": 12887 + }, + { + "epoch": 2.0881399870382373, + "grad_norm": 0.8708941340446472, + "learning_rate": 1.1167153941776205e-06, + "loss": 0.056, + "step": 12888 + }, + { + "epoch": 2.088302009073234, + "grad_norm": 0.8261350989341736, + "learning_rate": 1.1163511528936084e-06, + "loss": 0.0617, + "step": 12889 + }, + { + "epoch": 2.0884640311082308, + "grad_norm": 0.9200928211212158, + "learning_rate": 1.1159869539459018e-06, + "loss": 0.0622, + "step": 12890 + }, + { + "epoch": 2.0886260531432277, + "grad_norm": 1.0103826522827148, + "learning_rate": 1.1156227973456468e-06, + "loss": 0.0651, + "step": 12891 + }, + { + "epoch": 2.088788075178224, + "grad_norm": 1.0453431606292725, + "learning_rate": 1.1152586831039835e-06, + "loss": 0.0617, + "step": 12892 + }, + { + "epoch": 2.088950097213221, + "grad_norm": 0.8877750039100647, + "learning_rate": 1.1148946112320533e-06, + "loss": 0.06, + "step": 12893 + }, + { + "epoch": 2.0891121192482176, + "grad_norm": 0.9256067872047424, + "learning_rate": 1.1145305817409962e-06, + "loss": 0.0679, + "step": 12894 + }, + { + "epoch": 2.0892741412832145, + "grad_norm": 0.7883033752441406, + "learning_rate": 1.1141665946419506e-06, + "loss": 0.0532, + "step": 12895 + }, + { + "epoch": 2.0894361633182115, + "grad_norm": 0.9102799892425537, + "learning_rate": 1.1138026499460532e-06, + "loss": 0.0666, + "step": 12896 + }, + { + "epoch": 2.089598185353208, + "grad_norm": 0.9670751690864563, + "learning_rate": 1.1134387476644407e-06, + "loss": 0.0625, + "step": 12897 + }, + { + "epoch": 2.089760207388205, + "grad_norm": 0.8531269431114197, + "learning_rate": 1.1130748878082467e-06, + "loss": 0.0646, + "step": 12898 + }, + { + "epoch": 2.0899222294232014, + "grad_norm": 0.749323844909668, + "learning_rate": 1.1127110703886048e-06, + "loss": 0.0549, + "step": 12899 + }, + { + "epoch": 2.0900842514581983, + "grad_norm": 0.8951242566108704, + "learning_rate": 1.1123472954166473e-06, + "loss": 0.0562, + "step": 12900 + }, + { + "epoch": 2.0902462734931953, + "grad_norm": 0.8564949035644531, + "learning_rate": 1.1119835629035042e-06, + "loss": 0.0605, + "step": 12901 + }, + { + "epoch": 2.0904082955281917, + "grad_norm": 0.7760194540023804, + "learning_rate": 1.1116198728603061e-06, + "loss": 0.06, + "step": 12902 + }, + { + "epoch": 2.0905703175631887, + "grad_norm": 0.9799525141716003, + "learning_rate": 1.1112562252981793e-06, + "loss": 0.0622, + "step": 12903 + }, + { + "epoch": 2.090732339598185, + "grad_norm": 0.8512018918991089, + "learning_rate": 1.1108926202282505e-06, + "loss": 0.0646, + "step": 12904 + }, + { + "epoch": 2.090894361633182, + "grad_norm": 0.9717202186584473, + "learning_rate": 1.1105290576616467e-06, + "loss": 0.0642, + "step": 12905 + }, + { + "epoch": 2.091056383668179, + "grad_norm": 0.8135266304016113, + "learning_rate": 1.110165537609492e-06, + "loss": 0.0635, + "step": 12906 + }, + { + "epoch": 2.0912184057031755, + "grad_norm": 0.8754247426986694, + "learning_rate": 1.109802060082908e-06, + "loss": 0.0609, + "step": 12907 + }, + { + "epoch": 2.0913804277381725, + "grad_norm": 0.9074878692626953, + "learning_rate": 1.1094386250930164e-06, + "loss": 0.0601, + "step": 12908 + }, + { + "epoch": 2.091542449773169, + "grad_norm": 0.8210632801055908, + "learning_rate": 1.1090752326509379e-06, + "loss": 0.059, + "step": 12909 + }, + { + "epoch": 2.091704471808166, + "grad_norm": 0.9320157766342163, + "learning_rate": 1.1087118827677915e-06, + "loss": 0.0734, + "step": 12910 + }, + { + "epoch": 2.091866493843163, + "grad_norm": 0.8074702024459839, + "learning_rate": 1.1083485754546944e-06, + "loss": 0.0599, + "step": 12911 + }, + { + "epoch": 2.0920285158781593, + "grad_norm": 0.81490558385849, + "learning_rate": 1.1079853107227634e-06, + "loss": 0.0622, + "step": 12912 + }, + { + "epoch": 2.0921905379131562, + "grad_norm": 0.9869770407676697, + "learning_rate": 1.107622088583113e-06, + "loss": 0.0634, + "step": 12913 + }, + { + "epoch": 2.0923525599481527, + "grad_norm": 1.0530625581741333, + "learning_rate": 1.1072589090468571e-06, + "loss": 0.067, + "step": 12914 + }, + { + "epoch": 2.0925145819831497, + "grad_norm": 0.9498258233070374, + "learning_rate": 1.1068957721251085e-06, + "loss": 0.0615, + "step": 12915 + }, + { + "epoch": 2.0926766040181466, + "grad_norm": 0.9322092533111572, + "learning_rate": 1.1065326778289782e-06, + "loss": 0.0678, + "step": 12916 + }, + { + "epoch": 2.092838626053143, + "grad_norm": 1.2187130451202393, + "learning_rate": 1.1061696261695765e-06, + "loss": 0.0707, + "step": 12917 + }, + { + "epoch": 2.09300064808814, + "grad_norm": 0.8521288633346558, + "learning_rate": 1.1058066171580092e-06, + "loss": 0.0667, + "step": 12918 + }, + { + "epoch": 2.093162670123137, + "grad_norm": 0.9618637561798096, + "learning_rate": 1.1054436508053866e-06, + "loss": 0.0642, + "step": 12919 + }, + { + "epoch": 2.0933246921581334, + "grad_norm": 0.7557185888290405, + "learning_rate": 1.1050807271228146e-06, + "loss": 0.0558, + "step": 12920 + }, + { + "epoch": 2.0934867141931304, + "grad_norm": 0.9032410383224487, + "learning_rate": 1.1047178461213956e-06, + "loss": 0.0654, + "step": 12921 + }, + { + "epoch": 2.093648736228127, + "grad_norm": 0.813040018081665, + "learning_rate": 1.1043550078122342e-06, + "loss": 0.0575, + "step": 12922 + }, + { + "epoch": 2.093810758263124, + "grad_norm": 0.8633285164833069, + "learning_rate": 1.103992212206431e-06, + "loss": 0.061, + "step": 12923 + }, + { + "epoch": 2.0939727802981207, + "grad_norm": 0.9251546263694763, + "learning_rate": 1.1036294593150898e-06, + "loss": 0.0627, + "step": 12924 + }, + { + "epoch": 2.094134802333117, + "grad_norm": 0.7767398953437805, + "learning_rate": 1.103266749149307e-06, + "loss": 0.0583, + "step": 12925 + }, + { + "epoch": 2.094296824368114, + "grad_norm": 0.823419988155365, + "learning_rate": 1.1029040817201819e-06, + "loss": 0.0631, + "step": 12926 + }, + { + "epoch": 2.0944588464031106, + "grad_norm": 0.9466148018836975, + "learning_rate": 1.1025414570388108e-06, + "loss": 0.0655, + "step": 12927 + }, + { + "epoch": 2.0946208684381076, + "grad_norm": 0.8676579594612122, + "learning_rate": 1.1021788751162893e-06, + "loss": 0.0575, + "step": 12928 + }, + { + "epoch": 2.0947828904731045, + "grad_norm": 0.8906098008155823, + "learning_rate": 1.1018163359637116e-06, + "loss": 0.0659, + "step": 12929 + }, + { + "epoch": 2.094944912508101, + "grad_norm": 0.9030002951622009, + "learning_rate": 1.1014538395921704e-06, + "loss": 0.0685, + "step": 12930 + }, + { + "epoch": 2.095106934543098, + "grad_norm": 0.8031485676765442, + "learning_rate": 1.1010913860127572e-06, + "loss": 0.0627, + "step": 12931 + }, + { + "epoch": 2.0952689565780944, + "grad_norm": 0.8047374486923218, + "learning_rate": 1.1007289752365635e-06, + "loss": 0.0581, + "step": 12932 + }, + { + "epoch": 2.0954309786130914, + "grad_norm": 0.9250509738922119, + "learning_rate": 1.100366607274675e-06, + "loss": 0.0613, + "step": 12933 + }, + { + "epoch": 2.0955930006480883, + "grad_norm": 0.7186943292617798, + "learning_rate": 1.1000042821381823e-06, + "loss": 0.0544, + "step": 12934 + }, + { + "epoch": 2.095755022683085, + "grad_norm": 0.7666640281677246, + "learning_rate": 1.0996419998381713e-06, + "loss": 0.0594, + "step": 12935 + }, + { + "epoch": 2.0959170447180817, + "grad_norm": 0.913008987903595, + "learning_rate": 1.0992797603857257e-06, + "loss": 0.0621, + "step": 12936 + }, + { + "epoch": 2.096079066753078, + "grad_norm": 0.9413930177688599, + "learning_rate": 1.0989175637919297e-06, + "loss": 0.067, + "step": 12937 + }, + { + "epoch": 2.096241088788075, + "grad_norm": 0.8046421408653259, + "learning_rate": 1.0985554100678647e-06, + "loss": 0.057, + "step": 12938 + }, + { + "epoch": 2.096403110823072, + "grad_norm": 0.99111407995224, + "learning_rate": 1.0981932992246144e-06, + "loss": 0.0691, + "step": 12939 + }, + { + "epoch": 2.0965651328580686, + "grad_norm": 0.8477975130081177, + "learning_rate": 1.0978312312732562e-06, + "loss": 0.0584, + "step": 12940 + }, + { + "epoch": 2.0967271548930655, + "grad_norm": 0.8106995820999146, + "learning_rate": 1.097469206224869e-06, + "loss": 0.0588, + "step": 12941 + }, + { + "epoch": 2.0968891769280624, + "grad_norm": 0.8158612847328186, + "learning_rate": 1.09710722409053e-06, + "loss": 0.0622, + "step": 12942 + }, + { + "epoch": 2.097051198963059, + "grad_norm": 0.8949916362762451, + "learning_rate": 1.096745284881315e-06, + "loss": 0.0689, + "step": 12943 + }, + { + "epoch": 2.097213220998056, + "grad_norm": 0.8782987594604492, + "learning_rate": 1.0963833886082987e-06, + "loss": 0.0649, + "step": 12944 + }, + { + "epoch": 2.0973752430330523, + "grad_norm": 0.9232782125473022, + "learning_rate": 1.0960215352825537e-06, + "loss": 0.0711, + "step": 12945 + }, + { + "epoch": 2.0975372650680493, + "grad_norm": 0.8627330660820007, + "learning_rate": 1.0956597249151532e-06, + "loss": 0.0611, + "step": 12946 + }, + { + "epoch": 2.097699287103046, + "grad_norm": 0.779122531414032, + "learning_rate": 1.0952979575171649e-06, + "loss": 0.0556, + "step": 12947 + }, + { + "epoch": 2.0978613091380427, + "grad_norm": 1.0145422220230103, + "learning_rate": 1.0949362330996605e-06, + "loss": 0.0594, + "step": 12948 + }, + { + "epoch": 2.0980233311730396, + "grad_norm": 0.8492004871368408, + "learning_rate": 1.0945745516737075e-06, + "loss": 0.0625, + "step": 12949 + }, + { + "epoch": 2.098185353208036, + "grad_norm": 0.8178960680961609, + "learning_rate": 1.094212913250373e-06, + "loss": 0.053, + "step": 12950 + }, + { + "epoch": 2.098347375243033, + "grad_norm": 0.9389135837554932, + "learning_rate": 1.0938513178407201e-06, + "loss": 0.0631, + "step": 12951 + }, + { + "epoch": 2.09850939727803, + "grad_norm": 0.9091375470161438, + "learning_rate": 1.0934897654558134e-06, + "loss": 0.0636, + "step": 12952 + }, + { + "epoch": 2.0986714193130265, + "grad_norm": 0.8559199571609497, + "learning_rate": 1.0931282561067183e-06, + "loss": 0.0569, + "step": 12953 + }, + { + "epoch": 2.0988334413480234, + "grad_norm": 0.8254791498184204, + "learning_rate": 1.0927667898044927e-06, + "loss": 0.0629, + "step": 12954 + }, + { + "epoch": 2.09899546338302, + "grad_norm": 0.7848954796791077, + "learning_rate": 1.092405366560198e-06, + "loss": 0.0606, + "step": 12955 + }, + { + "epoch": 2.099157485418017, + "grad_norm": 0.9098137021064758, + "learning_rate": 1.092043986384893e-06, + "loss": 0.0613, + "step": 12956 + }, + { + "epoch": 2.0993195074530138, + "grad_norm": 0.7770287990570068, + "learning_rate": 1.0916826492896346e-06, + "loss": 0.0554, + "step": 12957 + }, + { + "epoch": 2.0994815294880103, + "grad_norm": 0.8795650005340576, + "learning_rate": 1.091321355285479e-06, + "loss": 0.0574, + "step": 12958 + }, + { + "epoch": 2.099643551523007, + "grad_norm": 1.0702177286148071, + "learning_rate": 1.0909601043834812e-06, + "loss": 0.0708, + "step": 12959 + }, + { + "epoch": 2.0998055735580037, + "grad_norm": 0.8428245782852173, + "learning_rate": 1.0905988965946942e-06, + "loss": 0.0629, + "step": 12960 + }, + { + "epoch": 2.0999675955930006, + "grad_norm": 0.8084932565689087, + "learning_rate": 1.0902377319301704e-06, + "loss": 0.0526, + "step": 12961 + }, + { + "epoch": 2.1001296176279975, + "grad_norm": 0.9167864918708801, + "learning_rate": 1.0898766104009606e-06, + "loss": 0.0617, + "step": 12962 + }, + { + "epoch": 2.100291639662994, + "grad_norm": 0.9969832301139832, + "learning_rate": 1.089515532018114e-06, + "loss": 0.0737, + "step": 12963 + }, + { + "epoch": 2.100453661697991, + "grad_norm": 1.0059758424758911, + "learning_rate": 1.0891544967926795e-06, + "loss": 0.068, + "step": 12964 + }, + { + "epoch": 2.100615683732988, + "grad_norm": 0.7887366414070129, + "learning_rate": 1.0887935047357023e-06, + "loss": 0.0574, + "step": 12965 + }, + { + "epoch": 2.1007777057679844, + "grad_norm": 0.8655834794044495, + "learning_rate": 1.0884325558582283e-06, + "loss": 0.0611, + "step": 12966 + }, + { + "epoch": 2.1009397278029813, + "grad_norm": 0.8879144191741943, + "learning_rate": 1.0880716501713025e-06, + "loss": 0.0605, + "step": 12967 + }, + { + "epoch": 2.101101749837978, + "grad_norm": 0.7682301998138428, + "learning_rate": 1.0877107876859688e-06, + "loss": 0.0619, + "step": 12968 + }, + { + "epoch": 2.1012637718729748, + "grad_norm": 1.0318995714187622, + "learning_rate": 1.0873499684132663e-06, + "loss": 0.0671, + "step": 12969 + }, + { + "epoch": 2.1014257939079717, + "grad_norm": 0.8464838862419128, + "learning_rate": 1.086989192364236e-06, + "loss": 0.0651, + "step": 12970 + }, + { + "epoch": 2.101587815942968, + "grad_norm": 1.0440428256988525, + "learning_rate": 1.0866284595499172e-06, + "loss": 0.068, + "step": 12971 + }, + { + "epoch": 2.101749837977965, + "grad_norm": 0.8565576076507568, + "learning_rate": 1.0862677699813471e-06, + "loss": 0.0539, + "step": 12972 + }, + { + "epoch": 2.1019118600129616, + "grad_norm": 0.86493980884552, + "learning_rate": 1.0859071236695623e-06, + "loss": 0.0623, + "step": 12973 + }, + { + "epoch": 2.1020738820479585, + "grad_norm": 0.8555430173873901, + "learning_rate": 1.0855465206255972e-06, + "loss": 0.0614, + "step": 12974 + }, + { + "epoch": 2.1022359040829555, + "grad_norm": 1.089228630065918, + "learning_rate": 1.0851859608604858e-06, + "loss": 0.0658, + "step": 12975 + }, + { + "epoch": 2.102397926117952, + "grad_norm": 0.8358057141304016, + "learning_rate": 1.0848254443852602e-06, + "loss": 0.0618, + "step": 12976 + }, + { + "epoch": 2.102559948152949, + "grad_norm": 0.9118403792381287, + "learning_rate": 1.0844649712109515e-06, + "loss": 0.0661, + "step": 12977 + }, + { + "epoch": 2.1027219701879454, + "grad_norm": 0.7774009704589844, + "learning_rate": 1.084104541348589e-06, + "loss": 0.06, + "step": 12978 + }, + { + "epoch": 2.1028839922229423, + "grad_norm": 0.921910285949707, + "learning_rate": 1.083744154809202e-06, + "loss": 0.0647, + "step": 12979 + }, + { + "epoch": 2.1030460142579392, + "grad_norm": 0.8106131553649902, + "learning_rate": 1.0833838116038156e-06, + "loss": 0.0586, + "step": 12980 + }, + { + "epoch": 2.1032080362929357, + "grad_norm": 0.8339465856552124, + "learning_rate": 1.0830235117434557e-06, + "loss": 0.062, + "step": 12981 + }, + { + "epoch": 2.1033700583279327, + "grad_norm": 0.9177747368812561, + "learning_rate": 1.0826632552391484e-06, + "loss": 0.0655, + "step": 12982 + }, + { + "epoch": 2.103532080362929, + "grad_norm": 0.8722216486930847, + "learning_rate": 1.0823030421019163e-06, + "loss": 0.062, + "step": 12983 + }, + { + "epoch": 2.103694102397926, + "grad_norm": 0.8818717002868652, + "learning_rate": 1.081942872342779e-06, + "loss": 0.0585, + "step": 12984 + }, + { + "epoch": 2.103856124432923, + "grad_norm": 0.8914388418197632, + "learning_rate": 1.081582745972758e-06, + "loss": 0.066, + "step": 12985 + }, + { + "epoch": 2.1040181464679195, + "grad_norm": 0.9045519828796387, + "learning_rate": 1.0812226630028738e-06, + "loss": 0.0622, + "step": 12986 + }, + { + "epoch": 2.1041801685029164, + "grad_norm": 1.0446083545684814, + "learning_rate": 1.080862623444142e-06, + "loss": 0.0668, + "step": 12987 + }, + { + "epoch": 2.1043421905379134, + "grad_norm": 0.7879253029823303, + "learning_rate": 1.0805026273075797e-06, + "loss": 0.0604, + "step": 12988 + }, + { + "epoch": 2.10450421257291, + "grad_norm": 0.8485068082809448, + "learning_rate": 1.0801426746042018e-06, + "loss": 0.0651, + "step": 12989 + }, + { + "epoch": 2.104666234607907, + "grad_norm": 0.8506676554679871, + "learning_rate": 1.0797827653450222e-06, + "loss": 0.0647, + "step": 12990 + }, + { + "epoch": 2.1048282566429033, + "grad_norm": 0.8726806640625, + "learning_rate": 1.079422899541053e-06, + "loss": 0.0626, + "step": 12991 + }, + { + "epoch": 2.1049902786779002, + "grad_norm": 0.9030711650848389, + "learning_rate": 1.0790630772033057e-06, + "loss": 0.0698, + "step": 12992 + }, + { + "epoch": 2.105152300712897, + "grad_norm": 0.8381808400154114, + "learning_rate": 1.0787032983427892e-06, + "loss": 0.0691, + "step": 12993 + }, + { + "epoch": 2.1053143227478937, + "grad_norm": 0.9773826003074646, + "learning_rate": 1.0783435629705134e-06, + "loss": 0.0644, + "step": 12994 + }, + { + "epoch": 2.1054763447828906, + "grad_norm": 0.8287825584411621, + "learning_rate": 1.0779838710974822e-06, + "loss": 0.0559, + "step": 12995 + }, + { + "epoch": 2.105638366817887, + "grad_norm": 0.862917423248291, + "learning_rate": 1.0776242227347044e-06, + "loss": 0.0587, + "step": 12996 + }, + { + "epoch": 2.105800388852884, + "grad_norm": 0.8506877422332764, + "learning_rate": 1.0772646178931843e-06, + "loss": 0.0565, + "step": 12997 + }, + { + "epoch": 2.105962410887881, + "grad_norm": 0.8896209001541138, + "learning_rate": 1.0769050565839228e-06, + "loss": 0.0658, + "step": 12998 + }, + { + "epoch": 2.1061244329228774, + "grad_norm": 0.9213668704032898, + "learning_rate": 1.076545538817923e-06, + "loss": 0.0604, + "step": 12999 + }, + { + "epoch": 2.1062864549578744, + "grad_norm": 0.9148227572441101, + "learning_rate": 1.0761860646061838e-06, + "loss": 0.0712, + "step": 13000 + }, + { + "epoch": 2.106448476992871, + "grad_norm": 0.8725471496582031, + "learning_rate": 1.0758266339597077e-06, + "loss": 0.0651, + "step": 13001 + }, + { + "epoch": 2.106610499027868, + "grad_norm": 0.9468567967414856, + "learning_rate": 1.0754672468894889e-06, + "loss": 0.0638, + "step": 13002 + }, + { + "epoch": 2.1067725210628647, + "grad_norm": 0.9501492977142334, + "learning_rate": 1.075107903406525e-06, + "loss": 0.0691, + "step": 13003 + }, + { + "epoch": 2.106934543097861, + "grad_norm": 0.8555724024772644, + "learning_rate": 1.0747486035218116e-06, + "loss": 0.0641, + "step": 13004 + }, + { + "epoch": 2.107096565132858, + "grad_norm": 1.0084507465362549, + "learning_rate": 1.0743893472463416e-06, + "loss": 0.0585, + "step": 13005 + }, + { + "epoch": 2.1072585871678546, + "grad_norm": 0.8593863844871521, + "learning_rate": 1.0740301345911075e-06, + "loss": 0.0604, + "step": 13006 + }, + { + "epoch": 2.1074206092028516, + "grad_norm": 0.9241552948951721, + "learning_rate": 1.0736709655671004e-06, + "loss": 0.0619, + "step": 13007 + }, + { + "epoch": 2.1075826312378485, + "grad_norm": 0.9510383009910583, + "learning_rate": 1.0733118401853112e-06, + "loss": 0.0679, + "step": 13008 + }, + { + "epoch": 2.107744653272845, + "grad_norm": 0.8677833676338196, + "learning_rate": 1.0729527584567255e-06, + "loss": 0.0654, + "step": 13009 + }, + { + "epoch": 2.107906675307842, + "grad_norm": 0.8571850061416626, + "learning_rate": 1.0725937203923327e-06, + "loss": 0.0565, + "step": 13010 + }, + { + "epoch": 2.1080686973428384, + "grad_norm": 0.7107164263725281, + "learning_rate": 1.072234726003118e-06, + "loss": 0.053, + "step": 13011 + }, + { + "epoch": 2.1082307193778353, + "grad_norm": 0.9328707456588745, + "learning_rate": 1.0718757753000665e-06, + "loss": 0.0591, + "step": 13012 + }, + { + "epoch": 2.1083927414128323, + "grad_norm": 0.8621041178703308, + "learning_rate": 1.0715168682941591e-06, + "loss": 0.0615, + "step": 13013 + }, + { + "epoch": 2.1085547634478288, + "grad_norm": 0.8346017599105835, + "learning_rate": 1.071158004996378e-06, + "loss": 0.063, + "step": 13014 + }, + { + "epoch": 2.1087167854828257, + "grad_norm": 0.8426311016082764, + "learning_rate": 1.0707991854177057e-06, + "loss": 0.06, + "step": 13015 + }, + { + "epoch": 2.108878807517822, + "grad_norm": 0.9094135761260986, + "learning_rate": 1.070440409569119e-06, + "loss": 0.0647, + "step": 13016 + }, + { + "epoch": 2.109040829552819, + "grad_norm": 1.048732042312622, + "learning_rate": 1.0700816774615964e-06, + "loss": 0.0622, + "step": 13017 + }, + { + "epoch": 2.109202851587816, + "grad_norm": 0.9794455170631409, + "learning_rate": 1.0697229891061141e-06, + "loss": 0.0608, + "step": 13018 + }, + { + "epoch": 2.1093648736228126, + "grad_norm": 0.8779764771461487, + "learning_rate": 1.069364344513647e-06, + "loss": 0.0625, + "step": 13019 + }, + { + "epoch": 2.1095268956578095, + "grad_norm": 0.8880550861358643, + "learning_rate": 1.0690057436951689e-06, + "loss": 0.0581, + "step": 13020 + }, + { + "epoch": 2.1096889176928064, + "grad_norm": 0.8012394309043884, + "learning_rate": 1.0686471866616525e-06, + "loss": 0.0571, + "step": 13021 + }, + { + "epoch": 2.109850939727803, + "grad_norm": 0.9028552770614624, + "learning_rate": 1.068288673424068e-06, + "loss": 0.0609, + "step": 13022 + }, + { + "epoch": 2.1100129617628, + "grad_norm": 0.8535613417625427, + "learning_rate": 1.0679302039933864e-06, + "loss": 0.0554, + "step": 13023 + }, + { + "epoch": 2.1101749837977963, + "grad_norm": 0.8417321443557739, + "learning_rate": 1.067571778380573e-06, + "loss": 0.0549, + "step": 13024 + }, + { + "epoch": 2.1103370058327933, + "grad_norm": 0.7988325357437134, + "learning_rate": 1.0672133965965981e-06, + "loss": 0.0567, + "step": 13025 + }, + { + "epoch": 2.11049902786779, + "grad_norm": 1.0051651000976562, + "learning_rate": 1.0668550586524256e-06, + "loss": 0.0633, + "step": 13026 + }, + { + "epoch": 2.1106610499027867, + "grad_norm": 0.9682597517967224, + "learning_rate": 1.0664967645590212e-06, + "loss": 0.0544, + "step": 13027 + }, + { + "epoch": 2.1108230719377836, + "grad_norm": 0.841816782951355, + "learning_rate": 1.066138514327345e-06, + "loss": 0.0603, + "step": 13028 + }, + { + "epoch": 2.11098509397278, + "grad_norm": 0.9751487970352173, + "learning_rate": 1.0657803079683616e-06, + "loss": 0.0612, + "step": 13029 + }, + { + "epoch": 2.111147116007777, + "grad_norm": 0.8450110554695129, + "learning_rate": 1.0654221454930305e-06, + "loss": 0.0551, + "step": 13030 + }, + { + "epoch": 2.111309138042774, + "grad_norm": 0.9710997343063354, + "learning_rate": 1.0650640269123095e-06, + "loss": 0.0593, + "step": 13031 + }, + { + "epoch": 2.1114711600777705, + "grad_norm": 0.8456966280937195, + "learning_rate": 1.0647059522371565e-06, + "loss": 0.0608, + "step": 13032 + }, + { + "epoch": 2.1116331821127674, + "grad_norm": 1.07785964012146, + "learning_rate": 1.0643479214785283e-06, + "loss": 0.0738, + "step": 13033 + }, + { + "epoch": 2.111795204147764, + "grad_norm": 1.0498229265213013, + "learning_rate": 1.0639899346473792e-06, + "loss": 0.0668, + "step": 13034 + }, + { + "epoch": 2.111957226182761, + "grad_norm": 0.8354172706604004, + "learning_rate": 1.0636319917546631e-06, + "loss": 0.0569, + "step": 13035 + }, + { + "epoch": 2.1121192482177578, + "grad_norm": 0.8369887471199036, + "learning_rate": 1.0632740928113323e-06, + "loss": 0.0584, + "step": 13036 + }, + { + "epoch": 2.1122812702527543, + "grad_norm": 0.9104105830192566, + "learning_rate": 1.0629162378283372e-06, + "loss": 0.0655, + "step": 13037 + }, + { + "epoch": 2.112443292287751, + "grad_norm": 0.890618622303009, + "learning_rate": 1.0625584268166278e-06, + "loss": 0.0622, + "step": 13038 + }, + { + "epoch": 2.1126053143227477, + "grad_norm": 0.9149554967880249, + "learning_rate": 1.062200659787152e-06, + "loss": 0.067, + "step": 13039 + }, + { + "epoch": 2.1127673363577446, + "grad_norm": 0.8385174870491028, + "learning_rate": 1.0618429367508564e-06, + "loss": 0.0558, + "step": 13040 + }, + { + "epoch": 2.1129293583927415, + "grad_norm": 0.7837886214256287, + "learning_rate": 1.0614852577186877e-06, + "loss": 0.0578, + "step": 13041 + }, + { + "epoch": 2.113091380427738, + "grad_norm": 0.9414006471633911, + "learning_rate": 1.061127622701588e-06, + "loss": 0.0681, + "step": 13042 + }, + { + "epoch": 2.113253402462735, + "grad_norm": 0.8170459270477295, + "learning_rate": 1.0607700317105002e-06, + "loss": 0.0576, + "step": 13043 + }, + { + "epoch": 2.113415424497732, + "grad_norm": 0.7887612581253052, + "learning_rate": 1.0604124847563674e-06, + "loss": 0.0565, + "step": 13044 + }, + { + "epoch": 2.1135774465327284, + "grad_norm": 0.9711698293685913, + "learning_rate": 1.0600549818501298e-06, + "loss": 0.0656, + "step": 13045 + }, + { + "epoch": 2.1137394685677253, + "grad_norm": 1.077919363975525, + "learning_rate": 1.0596975230027243e-06, + "loss": 0.0674, + "step": 13046 + }, + { + "epoch": 2.113901490602722, + "grad_norm": 0.9038639068603516, + "learning_rate": 1.059340108225089e-06, + "loss": 0.0589, + "step": 13047 + }, + { + "epoch": 2.1140635126377187, + "grad_norm": 0.9091435074806213, + "learning_rate": 1.05898273752816e-06, + "loss": 0.068, + "step": 13048 + }, + { + "epoch": 2.1142255346727157, + "grad_norm": 0.8895291686058044, + "learning_rate": 1.0586254109228722e-06, + "loss": 0.0593, + "step": 13049 + }, + { + "epoch": 2.114387556707712, + "grad_norm": 0.8643745183944702, + "learning_rate": 1.0582681284201587e-06, + "loss": 0.0627, + "step": 13050 + }, + { + "epoch": 2.114549578742709, + "grad_norm": 0.9382444024085999, + "learning_rate": 1.0579108900309518e-06, + "loss": 0.0609, + "step": 13051 + }, + { + "epoch": 2.1147116007777056, + "grad_norm": 0.8059027791023254, + "learning_rate": 1.0575536957661814e-06, + "loss": 0.061, + "step": 13052 + }, + { + "epoch": 2.1148736228127025, + "grad_norm": 0.8786547780036926, + "learning_rate": 1.0571965456367774e-06, + "loss": 0.0579, + "step": 13053 + }, + { + "epoch": 2.1150356448476995, + "grad_norm": 0.7798833847045898, + "learning_rate": 1.056839439653668e-06, + "loss": 0.0528, + "step": 13054 + }, + { + "epoch": 2.115197666882696, + "grad_norm": 0.9558907151222229, + "learning_rate": 1.056482377827779e-06, + "loss": 0.0664, + "step": 13055 + }, + { + "epoch": 2.115359688917693, + "grad_norm": 0.9504494667053223, + "learning_rate": 1.056125360170037e-06, + "loss": 0.0659, + "step": 13056 + }, + { + "epoch": 2.1155217109526894, + "grad_norm": 1.01918363571167, + "learning_rate": 1.055768386691363e-06, + "loss": 0.0646, + "step": 13057 + }, + { + "epoch": 2.1156837329876863, + "grad_norm": 0.8410986065864563, + "learning_rate": 1.0554114574026823e-06, + "loss": 0.0581, + "step": 13058 + }, + { + "epoch": 2.1158457550226832, + "grad_norm": 1.0521918535232544, + "learning_rate": 1.055054572314916e-06, + "loss": 0.0752, + "step": 13059 + }, + { + "epoch": 2.1160077770576797, + "grad_norm": 0.847416877746582, + "learning_rate": 1.0546977314389822e-06, + "loss": 0.062, + "step": 13060 + }, + { + "epoch": 2.1161697990926767, + "grad_norm": 1.0600688457489014, + "learning_rate": 1.0543409347858001e-06, + "loss": 0.069, + "step": 13061 + }, + { + "epoch": 2.116331821127673, + "grad_norm": 0.8803973197937012, + "learning_rate": 1.0539841823662867e-06, + "loss": 0.0616, + "step": 13062 + }, + { + "epoch": 2.11649384316267, + "grad_norm": 0.8875924348831177, + "learning_rate": 1.053627474191359e-06, + "loss": 0.0583, + "step": 13063 + }, + { + "epoch": 2.116655865197667, + "grad_norm": 0.8933229446411133, + "learning_rate": 1.0532708102719303e-06, + "loss": 0.07, + "step": 13064 + }, + { + "epoch": 2.1168178872326635, + "grad_norm": 0.9175959825515747, + "learning_rate": 1.0529141906189133e-06, + "loss": 0.0598, + "step": 13065 + }, + { + "epoch": 2.1169799092676604, + "grad_norm": 0.8931822180747986, + "learning_rate": 1.0525576152432204e-06, + "loss": 0.0656, + "step": 13066 + }, + { + "epoch": 2.1171419313026574, + "grad_norm": 0.9046509265899658, + "learning_rate": 1.0522010841557615e-06, + "loss": 0.0662, + "step": 13067 + }, + { + "epoch": 2.117303953337654, + "grad_norm": 1.0218912363052368, + "learning_rate": 1.051844597367446e-06, + "loss": 0.0644, + "step": 13068 + }, + { + "epoch": 2.117465975372651, + "grad_norm": 0.8441329598426819, + "learning_rate": 1.0514881548891813e-06, + "loss": 0.062, + "step": 13069 + }, + { + "epoch": 2.1176279974076473, + "grad_norm": 0.9138961434364319, + "learning_rate": 1.0511317567318737e-06, + "loss": 0.0687, + "step": 13070 + }, + { + "epoch": 2.1177900194426442, + "grad_norm": 0.9639347791671753, + "learning_rate": 1.0507754029064293e-06, + "loss": 0.0589, + "step": 13071 + }, + { + "epoch": 2.117952041477641, + "grad_norm": 0.9006848931312561, + "learning_rate": 1.0504190934237484e-06, + "loss": 0.0641, + "step": 13072 + }, + { + "epoch": 2.1181140635126376, + "grad_norm": 1.6156253814697266, + "learning_rate": 1.0500628282947365e-06, + "loss": 0.0539, + "step": 13073 + }, + { + "epoch": 2.1182760855476346, + "grad_norm": 0.8980904817581177, + "learning_rate": 1.0497066075302939e-06, + "loss": 0.0691, + "step": 13074 + }, + { + "epoch": 2.118438107582631, + "grad_norm": 1.003623604774475, + "learning_rate": 1.0493504311413185e-06, + "loss": 0.0757, + "step": 13075 + }, + { + "epoch": 2.118600129617628, + "grad_norm": 0.8838923573493958, + "learning_rate": 1.0489942991387088e-06, + "loss": 0.0564, + "step": 13076 + }, + { + "epoch": 2.118762151652625, + "grad_norm": 0.8242804408073425, + "learning_rate": 1.0486382115333638e-06, + "loss": 0.0575, + "step": 13077 + }, + { + "epoch": 2.1189241736876214, + "grad_norm": 0.7853438258171082, + "learning_rate": 1.0482821683361767e-06, + "loss": 0.0624, + "step": 13078 + }, + { + "epoch": 2.1190861957226184, + "grad_norm": 0.8636956810951233, + "learning_rate": 1.0479261695580417e-06, + "loss": 0.062, + "step": 13079 + }, + { + "epoch": 2.119248217757615, + "grad_norm": 0.9305495023727417, + "learning_rate": 1.0475702152098522e-06, + "loss": 0.0606, + "step": 13080 + }, + { + "epoch": 2.119410239792612, + "grad_norm": 0.8130801320075989, + "learning_rate": 1.0472143053024994e-06, + "loss": 0.0592, + "step": 13081 + }, + { + "epoch": 2.1195722618276087, + "grad_norm": 0.9285720586776733, + "learning_rate": 1.0468584398468729e-06, + "loss": 0.0645, + "step": 13082 + }, + { + "epoch": 2.119734283862605, + "grad_norm": 0.7963539958000183, + "learning_rate": 1.0465026188538618e-06, + "loss": 0.0576, + "step": 13083 + }, + { + "epoch": 2.119896305897602, + "grad_norm": 0.8605098724365234, + "learning_rate": 1.0461468423343532e-06, + "loss": 0.0569, + "step": 13084 + }, + { + "epoch": 2.1200583279325986, + "grad_norm": 0.8885794878005981, + "learning_rate": 1.0457911102992338e-06, + "loss": 0.0636, + "step": 13085 + }, + { + "epoch": 2.1202203499675956, + "grad_norm": 0.8622367978096008, + "learning_rate": 1.0454354227593855e-06, + "loss": 0.0614, + "step": 13086 + }, + { + "epoch": 2.1203823720025925, + "grad_norm": 0.8698747754096985, + "learning_rate": 1.0450797797256942e-06, + "loss": 0.063, + "step": 13087 + }, + { + "epoch": 2.120544394037589, + "grad_norm": 0.8544244766235352, + "learning_rate": 1.0447241812090408e-06, + "loss": 0.0548, + "step": 13088 + }, + { + "epoch": 2.120706416072586, + "grad_norm": 0.7378759980201721, + "learning_rate": 1.0443686272203066e-06, + "loss": 0.056, + "step": 13089 + }, + { + "epoch": 2.120868438107583, + "grad_norm": 0.8694587349891663, + "learning_rate": 1.0440131177703692e-06, + "loss": 0.068, + "step": 13090 + }, + { + "epoch": 2.1210304601425793, + "grad_norm": 0.8376104831695557, + "learning_rate": 1.0436576528701057e-06, + "loss": 0.0592, + "step": 13091 + }, + { + "epoch": 2.1211924821775763, + "grad_norm": 0.9313507080078125, + "learning_rate": 1.0433022325303956e-06, + "loss": 0.0627, + "step": 13092 + }, + { + "epoch": 2.1213545042125728, + "grad_norm": 0.9477971792221069, + "learning_rate": 1.0429468567621115e-06, + "loss": 0.0663, + "step": 13093 + }, + { + "epoch": 2.1215165262475697, + "grad_norm": 0.8192235827445984, + "learning_rate": 1.042591525576127e-06, + "loss": 0.0603, + "step": 13094 + }, + { + "epoch": 2.1216785482825666, + "grad_norm": 0.8660534620285034, + "learning_rate": 1.0422362389833145e-06, + "loss": 0.0631, + "step": 13095 + }, + { + "epoch": 2.121840570317563, + "grad_norm": 0.8360748887062073, + "learning_rate": 1.041880996994547e-06, + "loss": 0.0584, + "step": 13096 + }, + { + "epoch": 2.12200259235256, + "grad_norm": 0.8174402713775635, + "learning_rate": 1.0415257996206918e-06, + "loss": 0.0593, + "step": 13097 + }, + { + "epoch": 2.1221646143875565, + "grad_norm": 0.9432098269462585, + "learning_rate": 1.0411706468726173e-06, + "loss": 0.0633, + "step": 13098 + }, + { + "epoch": 2.1223266364225535, + "grad_norm": 1.0108842849731445, + "learning_rate": 1.040815538761191e-06, + "loss": 0.0607, + "step": 13099 + }, + { + "epoch": 2.1224886584575504, + "grad_norm": 0.98541259765625, + "learning_rate": 1.040460475297278e-06, + "loss": 0.069, + "step": 13100 + }, + { + "epoch": 2.122650680492547, + "grad_norm": 0.8681195974349976, + "learning_rate": 1.0401054564917423e-06, + "loss": 0.0654, + "step": 13101 + }, + { + "epoch": 2.122812702527544, + "grad_norm": 0.9383473992347717, + "learning_rate": 1.039750482355447e-06, + "loss": 0.0651, + "step": 13102 + }, + { + "epoch": 2.1229747245625403, + "grad_norm": 1.2092524766921997, + "learning_rate": 1.0393955528992535e-06, + "loss": 0.0658, + "step": 13103 + }, + { + "epoch": 2.1231367465975373, + "grad_norm": 0.8957673907279968, + "learning_rate": 1.0390406681340212e-06, + "loss": 0.0572, + "step": 13104 + }, + { + "epoch": 2.123298768632534, + "grad_norm": 0.9200911521911621, + "learning_rate": 1.038685828070608e-06, + "loss": 0.0673, + "step": 13105 + }, + { + "epoch": 2.1234607906675307, + "grad_norm": 0.8420102000236511, + "learning_rate": 1.0383310327198728e-06, + "loss": 0.0562, + "step": 13106 + }, + { + "epoch": 2.1236228127025276, + "grad_norm": 0.9516891837120056, + "learning_rate": 1.037976282092672e-06, + "loss": 0.0657, + "step": 13107 + }, + { + "epoch": 2.123784834737524, + "grad_norm": 0.9453445076942444, + "learning_rate": 1.0376215761998578e-06, + "loss": 0.0617, + "step": 13108 + }, + { + "epoch": 2.123946856772521, + "grad_norm": 0.8707835674285889, + "learning_rate": 1.0372669150522845e-06, + "loss": 0.0593, + "step": 13109 + }, + { + "epoch": 2.124108878807518, + "grad_norm": 0.9308366775512695, + "learning_rate": 1.0369122986608044e-06, + "loss": 0.0588, + "step": 13110 + }, + { + "epoch": 2.1242709008425145, + "grad_norm": 0.8552700877189636, + "learning_rate": 1.0365577270362668e-06, + "loss": 0.0617, + "step": 13111 + }, + { + "epoch": 2.1244329228775114, + "grad_norm": 0.8702750205993652, + "learning_rate": 1.0362032001895214e-06, + "loss": 0.0667, + "step": 13112 + }, + { + "epoch": 2.124594944912508, + "grad_norm": 0.8597316145896912, + "learning_rate": 1.0358487181314155e-06, + "loss": 0.0642, + "step": 13113 + }, + { + "epoch": 2.124756966947505, + "grad_norm": 0.7736619114875793, + "learning_rate": 1.0354942808727962e-06, + "loss": 0.0589, + "step": 13114 + }, + { + "epoch": 2.1249189889825018, + "grad_norm": 1.05555260181427, + "learning_rate": 1.0351398884245076e-06, + "loss": 0.0707, + "step": 13115 + }, + { + "epoch": 2.1250810110174982, + "grad_norm": 0.8743281364440918, + "learning_rate": 1.0347855407973933e-06, + "loss": 0.0598, + "step": 13116 + }, + { + "epoch": 2.125243033052495, + "grad_norm": 0.9425612092018127, + "learning_rate": 1.0344312380022961e-06, + "loss": 0.0646, + "step": 13117 + }, + { + "epoch": 2.1254050550874917, + "grad_norm": 0.8367472290992737, + "learning_rate": 1.034076980050057e-06, + "loss": 0.0601, + "step": 13118 + }, + { + "epoch": 2.1255670771224886, + "grad_norm": 0.9993242621421814, + "learning_rate": 1.0337227669515131e-06, + "loss": 0.0639, + "step": 13119 + }, + { + "epoch": 2.1257290991574855, + "grad_norm": 0.7848953008651733, + "learning_rate": 1.0333685987175052e-06, + "loss": 0.0534, + "step": 13120 + }, + { + "epoch": 2.125891121192482, + "grad_norm": 0.973812460899353, + "learning_rate": 1.0330144753588688e-06, + "loss": 0.0685, + "step": 13121 + }, + { + "epoch": 2.126053143227479, + "grad_norm": 0.8284931182861328, + "learning_rate": 1.0326603968864407e-06, + "loss": 0.0557, + "step": 13122 + }, + { + "epoch": 2.126215165262476, + "grad_norm": 0.995773196220398, + "learning_rate": 1.0323063633110525e-06, + "loss": 0.0593, + "step": 13123 + }, + { + "epoch": 2.1263771872974724, + "grad_norm": 0.8277838230133057, + "learning_rate": 1.0319523746435367e-06, + "loss": 0.063, + "step": 13124 + }, + { + "epoch": 2.1265392093324693, + "grad_norm": 0.8277966380119324, + "learning_rate": 1.0315984308947277e-06, + "loss": 0.0618, + "step": 13125 + }, + { + "epoch": 2.126701231367466, + "grad_norm": 1.1131237745285034, + "learning_rate": 1.0312445320754522e-06, + "loss": 0.0729, + "step": 13126 + }, + { + "epoch": 2.1268632534024627, + "grad_norm": 0.846409797668457, + "learning_rate": 1.0308906781965396e-06, + "loss": 0.0642, + "step": 13127 + }, + { + "epoch": 2.1270252754374597, + "grad_norm": 0.8704565763473511, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.07, + "step": 13128 + }, + { + "epoch": 2.127187297472456, + "grad_norm": 0.8594701886177063, + "learning_rate": 1.0301831053031109e-06, + "loss": 0.0583, + "step": 13129 + }, + { + "epoch": 2.127349319507453, + "grad_norm": 0.9101399183273315, + "learning_rate": 1.0298293863102444e-06, + "loss": 0.064, + "step": 13130 + }, + { + "epoch": 2.1275113415424496, + "grad_norm": 0.896522045135498, + "learning_rate": 1.0294757123010406e-06, + "loss": 0.0645, + "step": 13131 + }, + { + "epoch": 2.1276733635774465, + "grad_norm": 0.8539774417877197, + "learning_rate": 1.0291220832863219e-06, + "loss": 0.0617, + "step": 13132 + }, + { + "epoch": 2.1278353856124435, + "grad_norm": 0.8655493259429932, + "learning_rate": 1.0287684992769086e-06, + "loss": 0.065, + "step": 13133 + }, + { + "epoch": 2.12799740764744, + "grad_norm": 0.8757855296134949, + "learning_rate": 1.0284149602836174e-06, + "loss": 0.0602, + "step": 13134 + }, + { + "epoch": 2.128159429682437, + "grad_norm": 0.9795240759849548, + "learning_rate": 1.028061466317268e-06, + "loss": 0.0667, + "step": 13135 + }, + { + "epoch": 2.1283214517174334, + "grad_norm": 0.8074472546577454, + "learning_rate": 1.0277080173886766e-06, + "loss": 0.0633, + "step": 13136 + }, + { + "epoch": 2.1284834737524303, + "grad_norm": 0.8354179263114929, + "learning_rate": 1.0273546135086559e-06, + "loss": 0.0551, + "step": 13137 + }, + { + "epoch": 2.1286454957874272, + "grad_norm": 0.8012445569038391, + "learning_rate": 1.0270012546880207e-06, + "loss": 0.0567, + "step": 13138 + }, + { + "epoch": 2.1288075178224237, + "grad_norm": 0.8529260158538818, + "learning_rate": 1.0266479409375813e-06, + "loss": 0.0646, + "step": 13139 + }, + { + "epoch": 2.1289695398574207, + "grad_norm": 0.8222674131393433, + "learning_rate": 1.0262946722681513e-06, + "loss": 0.0624, + "step": 13140 + }, + { + "epoch": 2.129131561892417, + "grad_norm": 0.757540762424469, + "learning_rate": 1.0259414486905373e-06, + "loss": 0.0591, + "step": 13141 + }, + { + "epoch": 2.129293583927414, + "grad_norm": 0.9325169920921326, + "learning_rate": 1.0255882702155476e-06, + "loss": 0.0637, + "step": 13142 + }, + { + "epoch": 2.129455605962411, + "grad_norm": 0.7963911294937134, + "learning_rate": 1.025235136853989e-06, + "loss": 0.0561, + "step": 13143 + }, + { + "epoch": 2.1296176279974075, + "grad_norm": 0.8913578391075134, + "learning_rate": 1.024882048616666e-06, + "loss": 0.058, + "step": 13144 + }, + { + "epoch": 2.1297796500324044, + "grad_norm": 0.9068898558616638, + "learning_rate": 1.024529005514383e-06, + "loss": 0.0654, + "step": 13145 + }, + { + "epoch": 2.1299416720674014, + "grad_norm": 0.9450251460075378, + "learning_rate": 1.0241760075579418e-06, + "loss": 0.0663, + "step": 13146 + }, + { + "epoch": 2.130103694102398, + "grad_norm": 0.8904002904891968, + "learning_rate": 1.023823054758144e-06, + "loss": 0.0605, + "step": 13147 + }, + { + "epoch": 2.130265716137395, + "grad_norm": 0.7318344712257385, + "learning_rate": 1.0234701471257868e-06, + "loss": 0.0577, + "step": 13148 + }, + { + "epoch": 2.1304277381723913, + "grad_norm": 0.8698272705078125, + "learning_rate": 1.023117284671671e-06, + "loss": 0.062, + "step": 13149 + }, + { + "epoch": 2.130589760207388, + "grad_norm": 0.868567168712616, + "learning_rate": 1.0227644674065923e-06, + "loss": 0.0588, + "step": 13150 + }, + { + "epoch": 2.130751782242385, + "grad_norm": 0.8194864988327026, + "learning_rate": 1.0224116953413468e-06, + "loss": 0.0544, + "step": 13151 + }, + { + "epoch": 2.1309138042773816, + "grad_norm": 0.8874611258506775, + "learning_rate": 1.0220589684867269e-06, + "loss": 0.0614, + "step": 13152 + }, + { + "epoch": 2.1310758263123786, + "grad_norm": 0.9767943620681763, + "learning_rate": 1.0217062868535249e-06, + "loss": 0.0582, + "step": 13153 + }, + { + "epoch": 2.131237848347375, + "grad_norm": 0.8773373365402222, + "learning_rate": 1.021353650452535e-06, + "loss": 0.0601, + "step": 13154 + }, + { + "epoch": 2.131399870382372, + "grad_norm": 0.8418240547180176, + "learning_rate": 1.0210010592945442e-06, + "loss": 0.0585, + "step": 13155 + }, + { + "epoch": 2.131561892417369, + "grad_norm": 0.8298531174659729, + "learning_rate": 1.0206485133903424e-06, + "loss": 0.0618, + "step": 13156 + }, + { + "epoch": 2.1317239144523654, + "grad_norm": 0.81301349401474, + "learning_rate": 1.0202960127507155e-06, + "loss": 0.0536, + "step": 13157 + }, + { + "epoch": 2.1318859364873624, + "grad_norm": 0.8824573755264282, + "learning_rate": 1.0199435573864502e-06, + "loss": 0.0617, + "step": 13158 + }, + { + "epoch": 2.132047958522359, + "grad_norm": 0.9553408622741699, + "learning_rate": 1.01959114730833e-06, + "loss": 0.0646, + "step": 13159 + }, + { + "epoch": 2.1322099805573558, + "grad_norm": 0.8561825156211853, + "learning_rate": 1.0192387825271384e-06, + "loss": 0.0622, + "step": 13160 + }, + { + "epoch": 2.1323720025923527, + "grad_norm": 0.9502358436584473, + "learning_rate": 1.0188864630536568e-06, + "loss": 0.0666, + "step": 13161 + }, + { + "epoch": 2.132534024627349, + "grad_norm": 0.973298966884613, + "learning_rate": 1.018534188898665e-06, + "loss": 0.0671, + "step": 13162 + }, + { + "epoch": 2.132696046662346, + "grad_norm": 0.9729242920875549, + "learning_rate": 1.018181960072942e-06, + "loss": 0.0594, + "step": 13163 + }, + { + "epoch": 2.1328580686973426, + "grad_norm": 0.9466162919998169, + "learning_rate": 1.0178297765872651e-06, + "loss": 0.0709, + "step": 13164 + }, + { + "epoch": 2.1330200907323396, + "grad_norm": 0.8360689878463745, + "learning_rate": 1.0174776384524104e-06, + "loss": 0.0593, + "step": 13165 + }, + { + "epoch": 2.1331821127673365, + "grad_norm": 0.8890894055366516, + "learning_rate": 1.0171255456791531e-06, + "loss": 0.0664, + "step": 13166 + }, + { + "epoch": 2.133344134802333, + "grad_norm": 0.9430520534515381, + "learning_rate": 1.0167734982782636e-06, + "loss": 0.0605, + "step": 13167 + }, + { + "epoch": 2.13350615683733, + "grad_norm": 1.001865029335022, + "learning_rate": 1.016421496260517e-06, + "loss": 0.0626, + "step": 13168 + }, + { + "epoch": 2.133668178872327, + "grad_norm": 1.0136559009552002, + "learning_rate": 1.0160695396366832e-06, + "loss": 0.0638, + "step": 13169 + }, + { + "epoch": 2.1338302009073233, + "grad_norm": 1.062779188156128, + "learning_rate": 1.0157176284175293e-06, + "loss": 0.0654, + "step": 13170 + }, + { + "epoch": 2.1339922229423203, + "grad_norm": 0.7638817429542542, + "learning_rate": 1.0153657626138244e-06, + "loss": 0.0505, + "step": 13171 + }, + { + "epoch": 2.1341542449773168, + "grad_norm": 0.8198952078819275, + "learning_rate": 1.0150139422363342e-06, + "loss": 0.0552, + "step": 13172 + }, + { + "epoch": 2.1343162670123137, + "grad_norm": 0.9854885935783386, + "learning_rate": 1.0146621672958238e-06, + "loss": 0.061, + "step": 13173 + }, + { + "epoch": 2.1344782890473106, + "grad_norm": 0.9766724705696106, + "learning_rate": 1.0143104378030565e-06, + "loss": 0.0619, + "step": 13174 + }, + { + "epoch": 2.134640311082307, + "grad_norm": 1.0882524251937866, + "learning_rate": 1.013958753768795e-06, + "loss": 0.0698, + "step": 13175 + }, + { + "epoch": 2.134802333117304, + "grad_norm": 0.8586061596870422, + "learning_rate": 1.013607115203799e-06, + "loss": 0.0645, + "step": 13176 + }, + { + "epoch": 2.1349643551523005, + "grad_norm": 0.8522896766662598, + "learning_rate": 1.0132555221188285e-06, + "loss": 0.0626, + "step": 13177 + }, + { + "epoch": 2.1351263771872975, + "grad_norm": 0.9053431153297424, + "learning_rate": 1.012903974524641e-06, + "loss": 0.0658, + "step": 13178 + }, + { + "epoch": 2.1352883992222944, + "grad_norm": 0.8362330794334412, + "learning_rate": 1.0125524724319936e-06, + "loss": 0.0619, + "step": 13179 + }, + { + "epoch": 2.135450421257291, + "grad_norm": 0.8866271376609802, + "learning_rate": 1.0122010158516412e-06, + "loss": 0.0626, + "step": 13180 + }, + { + "epoch": 2.135612443292288, + "grad_norm": 0.9750482439994812, + "learning_rate": 1.0118496047943368e-06, + "loss": 0.0746, + "step": 13181 + }, + { + "epoch": 2.1357744653272843, + "grad_norm": 0.9252564311027527, + "learning_rate": 1.0114982392708325e-06, + "loss": 0.0663, + "step": 13182 + }, + { + "epoch": 2.1359364873622813, + "grad_norm": 1.186126947402954, + "learning_rate": 1.0111469192918808e-06, + "loss": 0.0598, + "step": 13183 + }, + { + "epoch": 2.136098509397278, + "grad_norm": 0.952793300151825, + "learning_rate": 1.010795644868231e-06, + "loss": 0.0554, + "step": 13184 + }, + { + "epoch": 2.1362605314322747, + "grad_norm": 1.0087121725082397, + "learning_rate": 1.01044441601063e-06, + "loss": 0.0665, + "step": 13185 + }, + { + "epoch": 2.1364225534672716, + "grad_norm": 0.8914116024971008, + "learning_rate": 1.0100932327298244e-06, + "loss": 0.064, + "step": 13186 + }, + { + "epoch": 2.136584575502268, + "grad_norm": 0.8840557336807251, + "learning_rate": 1.009742095036562e-06, + "loss": 0.0569, + "step": 13187 + }, + { + "epoch": 2.136746597537265, + "grad_norm": 0.9341459274291992, + "learning_rate": 1.0093910029415843e-06, + "loss": 0.0543, + "step": 13188 + }, + { + "epoch": 2.136908619572262, + "grad_norm": 0.9242621064186096, + "learning_rate": 1.0090399564556348e-06, + "loss": 0.0626, + "step": 13189 + }, + { + "epoch": 2.1370706416072585, + "grad_norm": 0.9278979897499084, + "learning_rate": 1.0086889555894545e-06, + "loss": 0.0592, + "step": 13190 + }, + { + "epoch": 2.1372326636422554, + "grad_norm": 0.8471470475196838, + "learning_rate": 1.0083380003537831e-06, + "loss": 0.0644, + "step": 13191 + }, + { + "epoch": 2.1373946856772523, + "grad_norm": 0.8700275421142578, + "learning_rate": 1.0079870907593592e-06, + "loss": 0.063, + "step": 13192 + }, + { + "epoch": 2.137556707712249, + "grad_norm": 0.7276986241340637, + "learning_rate": 1.0076362268169199e-06, + "loss": 0.0525, + "step": 13193 + }, + { + "epoch": 2.1377187297472457, + "grad_norm": 1.0328106880187988, + "learning_rate": 1.0072854085372005e-06, + "loss": 0.0698, + "step": 13194 + }, + { + "epoch": 2.1378807517822422, + "grad_norm": 0.8020308017730713, + "learning_rate": 1.0069346359309362e-06, + "loss": 0.0555, + "step": 13195 + }, + { + "epoch": 2.138042773817239, + "grad_norm": 0.9141387343406677, + "learning_rate": 1.0065839090088572e-06, + "loss": 0.0637, + "step": 13196 + }, + { + "epoch": 2.138204795852236, + "grad_norm": 0.8611671328544617, + "learning_rate": 1.0062332277816972e-06, + "loss": 0.0624, + "step": 13197 + }, + { + "epoch": 2.1383668178872326, + "grad_norm": 0.8084477186203003, + "learning_rate": 1.0058825922601866e-06, + "loss": 0.0594, + "step": 13198 + }, + { + "epoch": 2.1385288399222295, + "grad_norm": 0.9061813950538635, + "learning_rate": 1.0055320024550521e-06, + "loss": 0.0678, + "step": 13199 + }, + { + "epoch": 2.138690861957226, + "grad_norm": 0.9596869349479675, + "learning_rate": 1.005181458377022e-06, + "loss": 0.0662, + "step": 13200 + }, + { + "epoch": 2.138852883992223, + "grad_norm": 0.8806290626525879, + "learning_rate": 1.004830960036821e-06, + "loss": 0.0613, + "step": 13201 + }, + { + "epoch": 2.13901490602722, + "grad_norm": 0.7227916121482849, + "learning_rate": 1.0044805074451757e-06, + "loss": 0.0509, + "step": 13202 + }, + { + "epoch": 2.1391769280622164, + "grad_norm": 0.7963190674781799, + "learning_rate": 1.0041301006128073e-06, + "loss": 0.0637, + "step": 13203 + }, + { + "epoch": 2.1393389500972133, + "grad_norm": 0.936798632144928, + "learning_rate": 1.003779739550438e-06, + "loss": 0.0689, + "step": 13204 + }, + { + "epoch": 2.13950097213221, + "grad_norm": 0.857064962387085, + "learning_rate": 1.0034294242687875e-06, + "loss": 0.0585, + "step": 13205 + }, + { + "epoch": 2.1396629941672067, + "grad_norm": 0.9607579112052917, + "learning_rate": 1.003079154778575e-06, + "loss": 0.0691, + "step": 13206 + }, + { + "epoch": 2.1398250162022037, + "grad_norm": 0.9430453181266785, + "learning_rate": 1.0027289310905181e-06, + "loss": 0.059, + "step": 13207 + }, + { + "epoch": 2.1399870382372, + "grad_norm": 0.8209136724472046, + "learning_rate": 1.0023787532153325e-06, + "loss": 0.0648, + "step": 13208 + }, + { + "epoch": 2.140149060272197, + "grad_norm": 0.9626993536949158, + "learning_rate": 1.0020286211637328e-06, + "loss": 0.0641, + "step": 13209 + }, + { + "epoch": 2.1403110823071936, + "grad_norm": 0.9732939004898071, + "learning_rate": 1.0016785349464326e-06, + "loss": 0.0647, + "step": 13210 + }, + { + "epoch": 2.1404731043421905, + "grad_norm": 0.8415422439575195, + "learning_rate": 1.0013284945741431e-06, + "loss": 0.0608, + "step": 13211 + }, + { + "epoch": 2.1406351263771874, + "grad_norm": 0.9403479099273682, + "learning_rate": 1.0009785000575747e-06, + "loss": 0.0601, + "step": 13212 + }, + { + "epoch": 2.140797148412184, + "grad_norm": 0.9788222312927246, + "learning_rate": 1.000628551407438e-06, + "loss": 0.0685, + "step": 13213 + }, + { + "epoch": 2.140959170447181, + "grad_norm": 0.8836336135864258, + "learning_rate": 1.0002786486344379e-06, + "loss": 0.0625, + "step": 13214 + }, + { + "epoch": 2.141121192482178, + "grad_norm": 0.8284887671470642, + "learning_rate": 9.999287917492814e-07, + "loss": 0.057, + "step": 13215 + }, + { + "epoch": 2.1412832145171743, + "grad_norm": 0.9652777314186096, + "learning_rate": 9.995789807626754e-07, + "loss": 0.0552, + "step": 13216 + }, + { + "epoch": 2.1414452365521712, + "grad_norm": 0.9502488970756531, + "learning_rate": 9.992292156853207e-07, + "loss": 0.0636, + "step": 13217 + }, + { + "epoch": 2.1416072585871677, + "grad_norm": 0.9706515073776245, + "learning_rate": 9.988794965279203e-07, + "loss": 0.0636, + "step": 13218 + }, + { + "epoch": 2.1417692806221647, + "grad_norm": 0.9208532571792603, + "learning_rate": 9.985298233011746e-07, + "loss": 0.072, + "step": 13219 + }, + { + "epoch": 2.141931302657161, + "grad_norm": 0.8984748125076294, + "learning_rate": 9.981801960157827e-07, + "loss": 0.0574, + "step": 13220 + }, + { + "epoch": 2.142093324692158, + "grad_norm": 0.8832286596298218, + "learning_rate": 9.978306146824427e-07, + "loss": 0.0625, + "step": 13221 + }, + { + "epoch": 2.142255346727155, + "grad_norm": 1.0187299251556396, + "learning_rate": 9.974810793118505e-07, + "loss": 0.0616, + "step": 13222 + }, + { + "epoch": 2.1424173687621515, + "grad_norm": 0.8346063494682312, + "learning_rate": 9.971315899147012e-07, + "loss": 0.0624, + "step": 13223 + }, + { + "epoch": 2.1425793907971484, + "grad_norm": 1.0231930017471313, + "learning_rate": 9.967821465016893e-07, + "loss": 0.0675, + "step": 13224 + }, + { + "epoch": 2.1427414128321454, + "grad_norm": 0.936922550201416, + "learning_rate": 9.964327490835045e-07, + "loss": 0.0647, + "step": 13225 + }, + { + "epoch": 2.142903434867142, + "grad_norm": 0.8798637390136719, + "learning_rate": 9.960833976708398e-07, + "loss": 0.052, + "step": 13226 + }, + { + "epoch": 2.143065456902139, + "grad_norm": 0.7966178059577942, + "learning_rate": 9.957340922743835e-07, + "loss": 0.0607, + "step": 13227 + }, + { + "epoch": 2.1432274789371353, + "grad_norm": 0.7437213659286499, + "learning_rate": 9.953848329048248e-07, + "loss": 0.0558, + "step": 13228 + }, + { + "epoch": 2.143389500972132, + "grad_norm": 1.0451050996780396, + "learning_rate": 9.950356195728483e-07, + "loss": 0.0761, + "step": 13229 + }, + { + "epoch": 2.143551523007129, + "grad_norm": 0.8481780290603638, + "learning_rate": 9.94686452289139e-07, + "loss": 0.0546, + "step": 13230 + }, + { + "epoch": 2.1437135450421256, + "grad_norm": 0.8842169046401978, + "learning_rate": 9.943373310643831e-07, + "loss": 0.0643, + "step": 13231 + }, + { + "epoch": 2.1438755670771226, + "grad_norm": 0.9944992065429688, + "learning_rate": 9.939882559092604e-07, + "loss": 0.0705, + "step": 13232 + }, + { + "epoch": 2.144037589112119, + "grad_norm": 0.9027183651924133, + "learning_rate": 9.936392268344525e-07, + "loss": 0.0682, + "step": 13233 + }, + { + "epoch": 2.144199611147116, + "grad_norm": 0.7881889939308167, + "learning_rate": 9.93290243850638e-07, + "loss": 0.0604, + "step": 13234 + }, + { + "epoch": 2.144361633182113, + "grad_norm": 0.8811602592468262, + "learning_rate": 9.929413069684979e-07, + "loss": 0.0649, + "step": 13235 + }, + { + "epoch": 2.1445236552171094, + "grad_norm": 0.8543482422828674, + "learning_rate": 9.925924161987057e-07, + "loss": 0.0558, + "step": 13236 + }, + { + "epoch": 2.1446856772521063, + "grad_norm": 1.0113856792449951, + "learning_rate": 9.922435715519379e-07, + "loss": 0.0649, + "step": 13237 + }, + { + "epoch": 2.144847699287103, + "grad_norm": 0.8757432103157043, + "learning_rate": 9.918947730388682e-07, + "loss": 0.0651, + "step": 13238 + }, + { + "epoch": 2.1450097213220998, + "grad_norm": 1.0214018821716309, + "learning_rate": 9.915460206701685e-07, + "loss": 0.0677, + "step": 13239 + }, + { + "epoch": 2.1451717433570967, + "grad_norm": 0.9439998865127563, + "learning_rate": 9.911973144565105e-07, + "loss": 0.066, + "step": 13240 + }, + { + "epoch": 2.145333765392093, + "grad_norm": 1.0487086772918701, + "learning_rate": 9.908486544085632e-07, + "loss": 0.0734, + "step": 13241 + }, + { + "epoch": 2.14549578742709, + "grad_norm": 0.9988869428634644, + "learning_rate": 9.90500040536996e-07, + "loss": 0.0692, + "step": 13242 + }, + { + "epoch": 2.1456578094620866, + "grad_norm": 0.87641841173172, + "learning_rate": 9.901514728524739e-07, + "loss": 0.056, + "step": 13243 + }, + { + "epoch": 2.1458198314970836, + "grad_norm": 0.9582691192626953, + "learning_rate": 9.898029513656618e-07, + "loss": 0.0598, + "step": 13244 + }, + { + "epoch": 2.1459818535320805, + "grad_norm": 1.0479621887207031, + "learning_rate": 9.89454476087226e-07, + "loss": 0.0682, + "step": 13245 + }, + { + "epoch": 2.146143875567077, + "grad_norm": 0.893338143825531, + "learning_rate": 9.891060470278286e-07, + "loss": 0.0585, + "step": 13246 + }, + { + "epoch": 2.146305897602074, + "grad_norm": 0.880436360836029, + "learning_rate": 9.887576641981285e-07, + "loss": 0.0668, + "step": 13247 + }, + { + "epoch": 2.146467919637071, + "grad_norm": 0.9374086856842041, + "learning_rate": 9.884093276087871e-07, + "loss": 0.0614, + "step": 13248 + }, + { + "epoch": 2.1466299416720673, + "grad_norm": 0.9596349596977234, + "learning_rate": 9.880610372704624e-07, + "loss": 0.0688, + "step": 13249 + }, + { + "epoch": 2.1467919637070643, + "grad_norm": 1.77931809425354, + "learning_rate": 9.877127931938111e-07, + "loss": 0.0633, + "step": 13250 + }, + { + "epoch": 2.1469539857420608, + "grad_norm": 0.8775843977928162, + "learning_rate": 9.873645953894887e-07, + "loss": 0.0577, + "step": 13251 + }, + { + "epoch": 2.1471160077770577, + "grad_norm": 0.9120129942893982, + "learning_rate": 9.87016443868149e-07, + "loss": 0.0629, + "step": 13252 + }, + { + "epoch": 2.1472780298120546, + "grad_norm": 0.838457465171814, + "learning_rate": 9.86668338640445e-07, + "loss": 0.0577, + "step": 13253 + }, + { + "epoch": 2.147440051847051, + "grad_norm": 0.864747941493988, + "learning_rate": 9.863202797170273e-07, + "loss": 0.0634, + "step": 13254 + }, + { + "epoch": 2.147602073882048, + "grad_norm": 0.7873530387878418, + "learning_rate": 9.85972267108546e-07, + "loss": 0.0582, + "step": 13255 + }, + { + "epoch": 2.1477640959170445, + "grad_norm": 0.9956313371658325, + "learning_rate": 9.8562430082565e-07, + "loss": 0.0662, + "step": 13256 + }, + { + "epoch": 2.1479261179520415, + "grad_norm": 0.8575721383094788, + "learning_rate": 9.852763808789862e-07, + "loss": 0.06, + "step": 13257 + }, + { + "epoch": 2.1480881399870384, + "grad_norm": 0.9134724140167236, + "learning_rate": 9.849285072791978e-07, + "loss": 0.0657, + "step": 13258 + }, + { + "epoch": 2.148250162022035, + "grad_norm": 0.8377612829208374, + "learning_rate": 9.845806800369316e-07, + "loss": 0.0613, + "step": 13259 + }, + { + "epoch": 2.148412184057032, + "grad_norm": 0.7430257201194763, + "learning_rate": 9.8423289916283e-07, + "loss": 0.0535, + "step": 13260 + }, + { + "epoch": 2.1485742060920283, + "grad_norm": 0.9967155456542969, + "learning_rate": 9.838851646675329e-07, + "loss": 0.0691, + "step": 13261 + }, + { + "epoch": 2.1487362281270252, + "grad_norm": 0.809754490852356, + "learning_rate": 9.835374765616809e-07, + "loss": 0.052, + "step": 13262 + }, + { + "epoch": 2.148898250162022, + "grad_norm": 0.816158652305603, + "learning_rate": 9.831898348559115e-07, + "loss": 0.0594, + "step": 13263 + }, + { + "epoch": 2.1490602721970187, + "grad_norm": 0.8675599098205566, + "learning_rate": 9.82842239560864e-07, + "loss": 0.0603, + "step": 13264 + }, + { + "epoch": 2.1492222942320156, + "grad_norm": 0.8979247808456421, + "learning_rate": 9.824946906871721e-07, + "loss": 0.0641, + "step": 13265 + }, + { + "epoch": 2.149384316267012, + "grad_norm": 0.7833936214447021, + "learning_rate": 9.821471882454703e-07, + "loss": 0.0568, + "step": 13266 + }, + { + "epoch": 2.149546338302009, + "grad_norm": 0.8592799305915833, + "learning_rate": 9.817997322463912e-07, + "loss": 0.0624, + "step": 13267 + }, + { + "epoch": 2.149708360337006, + "grad_norm": 0.9396870732307434, + "learning_rate": 9.814523227005662e-07, + "loss": 0.0621, + "step": 13268 + }, + { + "epoch": 2.1498703823720025, + "grad_norm": 0.876347005367279, + "learning_rate": 9.811049596186255e-07, + "loss": 0.0709, + "step": 13269 + }, + { + "epoch": 2.1500324044069994, + "grad_norm": 0.9403771162033081, + "learning_rate": 9.807576430111975e-07, + "loss": 0.0612, + "step": 13270 + }, + { + "epoch": 2.1501944264419963, + "grad_norm": 0.8575648665428162, + "learning_rate": 9.804103728889089e-07, + "loss": 0.0569, + "step": 13271 + }, + { + "epoch": 2.150356448476993, + "grad_norm": 0.7617552876472473, + "learning_rate": 9.800631492623867e-07, + "loss": 0.0533, + "step": 13272 + }, + { + "epoch": 2.1505184705119897, + "grad_norm": 0.7716460824012756, + "learning_rate": 9.79715972142252e-07, + "loss": 0.0536, + "step": 13273 + }, + { + "epoch": 2.1506804925469862, + "grad_norm": 0.911914050579071, + "learning_rate": 9.793688415391304e-07, + "loss": 0.0623, + "step": 13274 + }, + { + "epoch": 2.150842514581983, + "grad_norm": 0.832737386226654, + "learning_rate": 9.790217574636433e-07, + "loss": 0.0606, + "step": 13275 + }, + { + "epoch": 2.15100453661698, + "grad_norm": 1.0580108165740967, + "learning_rate": 9.786747199264088e-07, + "loss": 0.0632, + "step": 13276 + }, + { + "epoch": 2.1511665586519766, + "grad_norm": 0.8876791000366211, + "learning_rate": 9.783277289380456e-07, + "loss": 0.0628, + "step": 13277 + }, + { + "epoch": 2.1513285806869735, + "grad_norm": 0.9212145209312439, + "learning_rate": 9.779807845091722e-07, + "loss": 0.0622, + "step": 13278 + }, + { + "epoch": 2.15149060272197, + "grad_norm": 0.7946116924285889, + "learning_rate": 9.776338866504045e-07, + "loss": 0.0588, + "step": 13279 + }, + { + "epoch": 2.151652624756967, + "grad_norm": 0.796489417552948, + "learning_rate": 9.77287035372355e-07, + "loss": 0.0572, + "step": 13280 + }, + { + "epoch": 2.151814646791964, + "grad_norm": 0.8203312754631042, + "learning_rate": 9.769402306856373e-07, + "loss": 0.0578, + "step": 13281 + }, + { + "epoch": 2.1519766688269604, + "grad_norm": 1.2622346878051758, + "learning_rate": 9.76593472600863e-07, + "loss": 0.0665, + "step": 13282 + }, + { + "epoch": 2.1521386908619573, + "grad_norm": 0.9482854604721069, + "learning_rate": 9.762467611286416e-07, + "loss": 0.0666, + "step": 13283 + }, + { + "epoch": 2.152300712896954, + "grad_norm": 0.8730493783950806, + "learning_rate": 9.75900096279582e-07, + "loss": 0.0604, + "step": 13284 + }, + { + "epoch": 2.1524627349319507, + "grad_norm": 0.9414070248603821, + "learning_rate": 9.755534780642911e-07, + "loss": 0.0669, + "step": 13285 + }, + { + "epoch": 2.1526247569669477, + "grad_norm": 0.9391159415245056, + "learning_rate": 9.752069064933758e-07, + "loss": 0.0634, + "step": 13286 + }, + { + "epoch": 2.152786779001944, + "grad_norm": 0.8309518098831177, + "learning_rate": 9.748603815774371e-07, + "loss": 0.0635, + "step": 13287 + }, + { + "epoch": 2.152948801036941, + "grad_norm": 0.7785058617591858, + "learning_rate": 9.745139033270812e-07, + "loss": 0.06, + "step": 13288 + }, + { + "epoch": 2.1531108230719376, + "grad_norm": 0.9786415696144104, + "learning_rate": 9.74167471752908e-07, + "loss": 0.0626, + "step": 13289 + }, + { + "epoch": 2.1532728451069345, + "grad_norm": 1.009415626525879, + "learning_rate": 9.738210868655187e-07, + "loss": 0.0712, + "step": 13290 + }, + { + "epoch": 2.1534348671419314, + "grad_norm": 0.8282557725906372, + "learning_rate": 9.7347474867551e-07, + "loss": 0.0582, + "step": 13291 + }, + { + "epoch": 2.153596889176928, + "grad_norm": 1.0145543813705444, + "learning_rate": 9.73128457193479e-07, + "loss": 0.0637, + "step": 13292 + }, + { + "epoch": 2.153758911211925, + "grad_norm": 0.950143039226532, + "learning_rate": 9.72782212430024e-07, + "loss": 0.0676, + "step": 13293 + }, + { + "epoch": 2.153920933246922, + "grad_norm": 0.8464543223381042, + "learning_rate": 9.724360143957367e-07, + "loss": 0.0622, + "step": 13294 + }, + { + "epoch": 2.1540829552819183, + "grad_norm": 0.9932398200035095, + "learning_rate": 9.720898631012106e-07, + "loss": 0.0596, + "step": 13295 + }, + { + "epoch": 2.154244977316915, + "grad_norm": 0.8855057954788208, + "learning_rate": 9.717437585570375e-07, + "loss": 0.0592, + "step": 13296 + }, + { + "epoch": 2.1544069993519117, + "grad_norm": 0.9049186110496521, + "learning_rate": 9.713977007738068e-07, + "loss": 0.0685, + "step": 13297 + }, + { + "epoch": 2.1545690213869086, + "grad_norm": 1.360369324684143, + "learning_rate": 9.710516897621072e-07, + "loss": 0.0636, + "step": 13298 + }, + { + "epoch": 2.1547310434219056, + "grad_norm": 0.8323555588722229, + "learning_rate": 9.707057255325262e-07, + "loss": 0.0633, + "step": 13299 + }, + { + "epoch": 2.154893065456902, + "grad_norm": 0.8513553738594055, + "learning_rate": 9.703598080956488e-07, + "loss": 0.0532, + "step": 13300 + }, + { + "epoch": 2.155055087491899, + "grad_norm": 0.8135280609130859, + "learning_rate": 9.700139374620602e-07, + "loss": 0.0648, + "step": 13301 + }, + { + "epoch": 2.1552171095268955, + "grad_norm": 0.821293830871582, + "learning_rate": 9.696681136423422e-07, + "loss": 0.0607, + "step": 13302 + }, + { + "epoch": 2.1553791315618924, + "grad_norm": 0.984480082988739, + "learning_rate": 9.693223366470767e-07, + "loss": 0.0664, + "step": 13303 + }, + { + "epoch": 2.1555411535968894, + "grad_norm": 0.8220646381378174, + "learning_rate": 9.689766064868434e-07, + "loss": 0.0597, + "step": 13304 + }, + { + "epoch": 2.155703175631886, + "grad_norm": 0.9679084420204163, + "learning_rate": 9.686309231722219e-07, + "loss": 0.0619, + "step": 13305 + }, + { + "epoch": 2.155865197666883, + "grad_norm": 1.0595329999923706, + "learning_rate": 9.682852867137865e-07, + "loss": 0.0668, + "step": 13306 + }, + { + "epoch": 2.1560272197018793, + "grad_norm": 0.989339292049408, + "learning_rate": 9.679396971221155e-07, + "loss": 0.0646, + "step": 13307 + }, + { + "epoch": 2.156189241736876, + "grad_norm": 1.0220208168029785, + "learning_rate": 9.675941544077833e-07, + "loss": 0.0686, + "step": 13308 + }, + { + "epoch": 2.156351263771873, + "grad_norm": 0.9169559478759766, + "learning_rate": 9.672486585813606e-07, + "loss": 0.0612, + "step": 13309 + }, + { + "epoch": 2.1565132858068696, + "grad_norm": 0.8436880707740784, + "learning_rate": 9.6690320965342e-07, + "loss": 0.0566, + "step": 13310 + }, + { + "epoch": 2.1566753078418666, + "grad_norm": 0.835830569267273, + "learning_rate": 9.665578076345307e-07, + "loss": 0.0622, + "step": 13311 + }, + { + "epoch": 2.156837329876863, + "grad_norm": 0.8049341440200806, + "learning_rate": 9.66212452535262e-07, + "loss": 0.0625, + "step": 13312 + }, + { + "epoch": 2.15699935191186, + "grad_norm": 0.9086779952049255, + "learning_rate": 9.658671443661804e-07, + "loss": 0.0625, + "step": 13313 + }, + { + "epoch": 2.157161373946857, + "grad_norm": 0.9593300223350525, + "learning_rate": 9.655218831378518e-07, + "loss": 0.0657, + "step": 13314 + }, + { + "epoch": 2.1573233959818534, + "grad_norm": 0.790436863899231, + "learning_rate": 9.651766688608402e-07, + "loss": 0.0626, + "step": 13315 + }, + { + "epoch": 2.1574854180168503, + "grad_norm": 0.783841073513031, + "learning_rate": 9.648315015457083e-07, + "loss": 0.0611, + "step": 13316 + }, + { + "epoch": 2.1576474400518473, + "grad_norm": 0.861705482006073, + "learning_rate": 9.644863812030176e-07, + "loss": 0.0595, + "step": 13317 + }, + { + "epoch": 2.1578094620868438, + "grad_norm": 0.8327277302742004, + "learning_rate": 9.641413078433274e-07, + "loss": 0.0562, + "step": 13318 + }, + { + "epoch": 2.1579714841218407, + "grad_norm": 0.9547343850135803, + "learning_rate": 9.637962814771976e-07, + "loss": 0.0606, + "step": 13319 + }, + { + "epoch": 2.158133506156837, + "grad_norm": 0.9020182490348816, + "learning_rate": 9.63451302115182e-07, + "loss": 0.0591, + "step": 13320 + }, + { + "epoch": 2.158295528191834, + "grad_norm": 0.896525502204895, + "learning_rate": 9.631063697678392e-07, + "loss": 0.0574, + "step": 13321 + }, + { + "epoch": 2.158457550226831, + "grad_norm": 0.833091139793396, + "learning_rate": 9.627614844457222e-07, + "loss": 0.0551, + "step": 13322 + }, + { + "epoch": 2.1586195722618275, + "grad_norm": 0.8933079838752747, + "learning_rate": 9.624166461593848e-07, + "loss": 0.0587, + "step": 13323 + }, + { + "epoch": 2.1587815942968245, + "grad_norm": 0.9775946736335754, + "learning_rate": 9.620718549193764e-07, + "loss": 0.0632, + "step": 13324 + }, + { + "epoch": 2.158943616331821, + "grad_norm": 0.898419976234436, + "learning_rate": 9.617271107362465e-07, + "loss": 0.0688, + "step": 13325 + }, + { + "epoch": 2.159105638366818, + "grad_norm": 0.8033084869384766, + "learning_rate": 9.61382413620546e-07, + "loss": 0.0575, + "step": 13326 + }, + { + "epoch": 2.159267660401815, + "grad_norm": 0.9245520830154419, + "learning_rate": 9.610377635828197e-07, + "loss": 0.0599, + "step": 13327 + }, + { + "epoch": 2.1594296824368113, + "grad_norm": 0.9904624819755554, + "learning_rate": 9.606931606336134e-07, + "loss": 0.0731, + "step": 13328 + }, + { + "epoch": 2.1595917044718083, + "grad_norm": 1.0654817819595337, + "learning_rate": 9.603486047834712e-07, + "loss": 0.0672, + "step": 13329 + }, + { + "epoch": 2.1597537265068047, + "grad_norm": 0.7820091843605042, + "learning_rate": 9.60004096042936e-07, + "loss": 0.0544, + "step": 13330 + }, + { + "epoch": 2.1599157485418017, + "grad_norm": 0.8341110348701477, + "learning_rate": 9.59659634422549e-07, + "loss": 0.054, + "step": 13331 + }, + { + "epoch": 2.1600777705767986, + "grad_norm": 0.8635512590408325, + "learning_rate": 9.593152199328494e-07, + "loss": 0.0546, + "step": 13332 + }, + { + "epoch": 2.160239792611795, + "grad_norm": 0.8399879932403564, + "learning_rate": 9.589708525843754e-07, + "loss": 0.0633, + "step": 13333 + }, + { + "epoch": 2.160401814646792, + "grad_norm": 0.8963909149169922, + "learning_rate": 9.586265323876653e-07, + "loss": 0.0589, + "step": 13334 + }, + { + "epoch": 2.1605638366817885, + "grad_norm": 0.9390884637832642, + "learning_rate": 9.582822593532514e-07, + "loss": 0.0639, + "step": 13335 + }, + { + "epoch": 2.1607258587167855, + "grad_norm": 1.0376230478286743, + "learning_rate": 9.579380334916704e-07, + "loss": 0.0714, + "step": 13336 + }, + { + "epoch": 2.1608878807517824, + "grad_norm": 0.9008492827415466, + "learning_rate": 9.575938548134548e-07, + "loss": 0.0586, + "step": 13337 + }, + { + "epoch": 2.161049902786779, + "grad_norm": 0.8083958625793457, + "learning_rate": 9.572497233291337e-07, + "loss": 0.0612, + "step": 13338 + }, + { + "epoch": 2.161211924821776, + "grad_norm": 0.9335343837738037, + "learning_rate": 9.56905639049238e-07, + "loss": 0.0678, + "step": 13339 + }, + { + "epoch": 2.1613739468567728, + "grad_norm": 0.9797310829162598, + "learning_rate": 9.56561601984294e-07, + "loss": 0.0699, + "step": 13340 + }, + { + "epoch": 2.1615359688917692, + "grad_norm": 0.8286275267601013, + "learning_rate": 9.562176121448322e-07, + "loss": 0.059, + "step": 13341 + }, + { + "epoch": 2.161697990926766, + "grad_norm": 0.887620210647583, + "learning_rate": 9.558736695413745e-07, + "loss": 0.0608, + "step": 13342 + }, + { + "epoch": 2.1618600129617627, + "grad_norm": 0.888256847858429, + "learning_rate": 9.55529774184446e-07, + "loss": 0.0678, + "step": 13343 + }, + { + "epoch": 2.1620220349967596, + "grad_norm": 0.908109724521637, + "learning_rate": 9.551859260845686e-07, + "loss": 0.0631, + "step": 13344 + }, + { + "epoch": 2.162184057031756, + "grad_norm": 1.2315661907196045, + "learning_rate": 9.548421252522635e-07, + "loss": 0.0606, + "step": 13345 + }, + { + "epoch": 2.162346079066753, + "grad_norm": 0.8499714136123657, + "learning_rate": 9.544983716980505e-07, + "loss": 0.06, + "step": 13346 + }, + { + "epoch": 2.16250810110175, + "grad_norm": 0.9646509885787964, + "learning_rate": 9.54154665432447e-07, + "loss": 0.0673, + "step": 13347 + }, + { + "epoch": 2.1626701231367464, + "grad_norm": 0.8329025506973267, + "learning_rate": 9.5381100646597e-07, + "loss": 0.0613, + "step": 13348 + }, + { + "epoch": 2.1628321451717434, + "grad_norm": 0.9725831151008606, + "learning_rate": 9.534673948091344e-07, + "loss": 0.0676, + "step": 13349 + }, + { + "epoch": 2.1629941672067403, + "grad_norm": 0.8599954843521118, + "learning_rate": 9.531238304724538e-07, + "loss": 0.06, + "step": 13350 + }, + { + "epoch": 2.163156189241737, + "grad_norm": 0.8382558822631836, + "learning_rate": 9.52780313466441e-07, + "loss": 0.0622, + "step": 13351 + }, + { + "epoch": 2.1633182112767337, + "grad_norm": 0.8793473839759827, + "learning_rate": 9.524368438016071e-07, + "loss": 0.0563, + "step": 13352 + }, + { + "epoch": 2.1634802333117302, + "grad_norm": 0.9254469275474548, + "learning_rate": 9.520934214884598e-07, + "loss": 0.0632, + "step": 13353 + }, + { + "epoch": 2.163642255346727, + "grad_norm": 0.8985252976417542, + "learning_rate": 9.517500465375071e-07, + "loss": 0.0598, + "step": 13354 + }, + { + "epoch": 2.163804277381724, + "grad_norm": 0.7945815920829773, + "learning_rate": 9.514067189592583e-07, + "loss": 0.0595, + "step": 13355 + }, + { + "epoch": 2.1639662994167206, + "grad_norm": 0.8715730309486389, + "learning_rate": 9.510634387642151e-07, + "loss": 0.0598, + "step": 13356 + }, + { + "epoch": 2.1641283214517175, + "grad_norm": 0.8382450342178345, + "learning_rate": 9.507202059628826e-07, + "loss": 0.0691, + "step": 13357 + }, + { + "epoch": 2.164290343486714, + "grad_norm": 1.0573970079421997, + "learning_rate": 9.503770205657625e-07, + "loss": 0.0636, + "step": 13358 + }, + { + "epoch": 2.164452365521711, + "grad_norm": 0.8974131345748901, + "learning_rate": 9.500338825833555e-07, + "loss": 0.0545, + "step": 13359 + }, + { + "epoch": 2.164614387556708, + "grad_norm": 0.8579263687133789, + "learning_rate": 9.496907920261609e-07, + "loss": 0.063, + "step": 13360 + }, + { + "epoch": 2.1647764095917044, + "grad_norm": 0.8979440331459045, + "learning_rate": 9.493477489046762e-07, + "loss": 0.0669, + "step": 13361 + }, + { + "epoch": 2.1649384316267013, + "grad_norm": 0.8528677225112915, + "learning_rate": 9.490047532293984e-07, + "loss": 0.0587, + "step": 13362 + }, + { + "epoch": 2.165100453661698, + "grad_norm": 0.8880028128623962, + "learning_rate": 9.486618050108223e-07, + "loss": 0.0628, + "step": 13363 + }, + { + "epoch": 2.1652624756966947, + "grad_norm": 0.9306471347808838, + "learning_rate": 9.48318904259439e-07, + "loss": 0.0645, + "step": 13364 + }, + { + "epoch": 2.1654244977316917, + "grad_norm": 0.8764260411262512, + "learning_rate": 9.479760509857433e-07, + "loss": 0.0621, + "step": 13365 + }, + { + "epoch": 2.165586519766688, + "grad_norm": 0.8734313249588013, + "learning_rate": 9.476332452002245e-07, + "loss": 0.0585, + "step": 13366 + }, + { + "epoch": 2.165748541801685, + "grad_norm": 0.9644814729690552, + "learning_rate": 9.472904869133726e-07, + "loss": 0.07, + "step": 13367 + }, + { + "epoch": 2.1659105638366816, + "grad_norm": 0.8495195508003235, + "learning_rate": 9.469477761356727e-07, + "loss": 0.0591, + "step": 13368 + }, + { + "epoch": 2.1660725858716785, + "grad_norm": 0.8992612361907959, + "learning_rate": 9.466051128776133e-07, + "loss": 0.0637, + "step": 13369 + }, + { + "epoch": 2.1662346079066754, + "grad_norm": 0.9654873013496399, + "learning_rate": 9.462624971496793e-07, + "loss": 0.0631, + "step": 13370 + }, + { + "epoch": 2.166396629941672, + "grad_norm": 1.062732219696045, + "learning_rate": 9.459199289623519e-07, + "loss": 0.0635, + "step": 13371 + }, + { + "epoch": 2.166558651976669, + "grad_norm": 0.927365243434906, + "learning_rate": 9.455774083261138e-07, + "loss": 0.0664, + "step": 13372 + }, + { + "epoch": 2.166720674011666, + "grad_norm": 0.9588886499404907, + "learning_rate": 9.452349352514448e-07, + "loss": 0.0605, + "step": 13373 + }, + { + "epoch": 2.1668826960466623, + "grad_norm": 0.8306574821472168, + "learning_rate": 9.448925097488257e-07, + "loss": 0.0587, + "step": 13374 + }, + { + "epoch": 2.167044718081659, + "grad_norm": 0.8604657649993896, + "learning_rate": 9.445501318287317e-07, + "loss": 0.0605, + "step": 13375 + }, + { + "epoch": 2.1672067401166557, + "grad_norm": 0.8701779842376709, + "learning_rate": 9.442078015016398e-07, + "loss": 0.0555, + "step": 13376 + }, + { + "epoch": 2.1673687621516526, + "grad_norm": 1.0996429920196533, + "learning_rate": 9.43865518778024e-07, + "loss": 0.068, + "step": 13377 + }, + { + "epoch": 2.1675307841866496, + "grad_norm": 0.9881284236907959, + "learning_rate": 9.435232836683577e-07, + "loss": 0.0707, + "step": 13378 + }, + { + "epoch": 2.167692806221646, + "grad_norm": 1.0332772731781006, + "learning_rate": 9.431810961831123e-07, + "loss": 0.0676, + "step": 13379 + }, + { + "epoch": 2.167854828256643, + "grad_norm": 0.9715332984924316, + "learning_rate": 9.42838956332758e-07, + "loss": 0.0713, + "step": 13380 + }, + { + "epoch": 2.1680168502916395, + "grad_norm": 1.1476157903671265, + "learning_rate": 9.42496864127764e-07, + "loss": 0.0618, + "step": 13381 + }, + { + "epoch": 2.1681788723266364, + "grad_norm": 0.9035125970840454, + "learning_rate": 9.421548195785962e-07, + "loss": 0.0641, + "step": 13382 + }, + { + "epoch": 2.1683408943616334, + "grad_norm": 1.018491268157959, + "learning_rate": 9.418128226957202e-07, + "loss": 0.0682, + "step": 13383 + }, + { + "epoch": 2.16850291639663, + "grad_norm": 0.8716436624526978, + "learning_rate": 9.414708734896019e-07, + "loss": 0.0631, + "step": 13384 + }, + { + "epoch": 2.1686649384316268, + "grad_norm": 0.9046812057495117, + "learning_rate": 9.411289719707039e-07, + "loss": 0.0658, + "step": 13385 + }, + { + "epoch": 2.1688269604666233, + "grad_norm": 0.9490453004837036, + "learning_rate": 9.407871181494865e-07, + "loss": 0.0601, + "step": 13386 + }, + { + "epoch": 2.16898898250162, + "grad_norm": 0.8839377164840698, + "learning_rate": 9.40445312036409e-07, + "loss": 0.0684, + "step": 13387 + }, + { + "epoch": 2.169151004536617, + "grad_norm": 1.0054982900619507, + "learning_rate": 9.401035536419326e-07, + "loss": 0.0635, + "step": 13388 + }, + { + "epoch": 2.1693130265716136, + "grad_norm": 0.8407875299453735, + "learning_rate": 9.397618429765118e-07, + "loss": 0.0653, + "step": 13389 + }, + { + "epoch": 2.1694750486066106, + "grad_norm": 0.8645152449607849, + "learning_rate": 9.394201800506028e-07, + "loss": 0.0612, + "step": 13390 + }, + { + "epoch": 2.169637070641607, + "grad_norm": 1.003045678138733, + "learning_rate": 9.390785648746598e-07, + "loss": 0.0665, + "step": 13391 + }, + { + "epoch": 2.169799092676604, + "grad_norm": 0.8517448306083679, + "learning_rate": 9.387369974591353e-07, + "loss": 0.065, + "step": 13392 + }, + { + "epoch": 2.169961114711601, + "grad_norm": 0.9961963891983032, + "learning_rate": 9.383954778144807e-07, + "loss": 0.0635, + "step": 13393 + }, + { + "epoch": 2.1701231367465974, + "grad_norm": 0.8124943375587463, + "learning_rate": 9.380540059511453e-07, + "loss": 0.0552, + "step": 13394 + }, + { + "epoch": 2.1702851587815943, + "grad_norm": 0.801011323928833, + "learning_rate": 9.377125818795777e-07, + "loss": 0.0616, + "step": 13395 + }, + { + "epoch": 2.1704471808165913, + "grad_norm": 0.938008725643158, + "learning_rate": 9.373712056102249e-07, + "loss": 0.0663, + "step": 13396 + }, + { + "epoch": 2.1706092028515878, + "grad_norm": 0.9394778609275818, + "learning_rate": 9.370298771535302e-07, + "loss": 0.0658, + "step": 13397 + }, + { + "epoch": 2.1707712248865847, + "grad_norm": 0.9980944991111755, + "learning_rate": 9.366885965199398e-07, + "loss": 0.0664, + "step": 13398 + }, + { + "epoch": 2.170933246921581, + "grad_norm": 0.8614987730979919, + "learning_rate": 9.363473637198964e-07, + "loss": 0.0605, + "step": 13399 + }, + { + "epoch": 2.171095268956578, + "grad_norm": 0.7758714556694031, + "learning_rate": 9.360061787638383e-07, + "loss": 0.0641, + "step": 13400 + }, + { + "epoch": 2.171257290991575, + "grad_norm": 0.9587964415550232, + "learning_rate": 9.356650416622065e-07, + "loss": 0.0654, + "step": 13401 + }, + { + "epoch": 2.1714193130265715, + "grad_norm": 0.970716118812561, + "learning_rate": 9.353239524254382e-07, + "loss": 0.0568, + "step": 13402 + }, + { + "epoch": 2.1715813350615685, + "grad_norm": 0.8484756946563721, + "learning_rate": 9.349829110639718e-07, + "loss": 0.0628, + "step": 13403 + }, + { + "epoch": 2.171743357096565, + "grad_norm": 0.8023274540901184, + "learning_rate": 9.346419175882407e-07, + "loss": 0.0593, + "step": 13404 + }, + { + "epoch": 2.171905379131562, + "grad_norm": 0.8955522179603577, + "learning_rate": 9.343009720086785e-07, + "loss": 0.0591, + "step": 13405 + }, + { + "epoch": 2.172067401166559, + "grad_norm": 0.9078856706619263, + "learning_rate": 9.339600743357177e-07, + "loss": 0.0688, + "step": 13406 + }, + { + "epoch": 2.1722294232015553, + "grad_norm": 1.0572879314422607, + "learning_rate": 9.33619224579789e-07, + "loss": 0.0575, + "step": 13407 + }, + { + "epoch": 2.1723914452365523, + "grad_norm": 0.8648132681846619, + "learning_rate": 9.332784227513212e-07, + "loss": 0.0621, + "step": 13408 + }, + { + "epoch": 2.1725534672715487, + "grad_norm": 0.8254750967025757, + "learning_rate": 9.329376688607425e-07, + "loss": 0.06, + "step": 13409 + }, + { + "epoch": 2.1727154893065457, + "grad_norm": 0.8656960725784302, + "learning_rate": 9.325969629184789e-07, + "loss": 0.0655, + "step": 13410 + }, + { + "epoch": 2.1728775113415426, + "grad_norm": 0.8903995156288147, + "learning_rate": 9.32256304934955e-07, + "loss": 0.0603, + "step": 13411 + }, + { + "epoch": 2.173039533376539, + "grad_norm": 0.9863763451576233, + "learning_rate": 9.319156949205943e-07, + "loss": 0.0615, + "step": 13412 + }, + { + "epoch": 2.173201555411536, + "grad_norm": 0.7889726161956787, + "learning_rate": 9.315751328858189e-07, + "loss": 0.0544, + "step": 13413 + }, + { + "epoch": 2.1733635774465325, + "grad_norm": 0.8542593717575073, + "learning_rate": 9.312346188410496e-07, + "loss": 0.0636, + "step": 13414 + }, + { + "epoch": 2.1735255994815295, + "grad_norm": 0.8379946947097778, + "learning_rate": 9.308941527967039e-07, + "loss": 0.0606, + "step": 13415 + }, + { + "epoch": 2.1736876215165264, + "grad_norm": 0.9521237015724182, + "learning_rate": 9.30553734763199e-07, + "loss": 0.0582, + "step": 13416 + }, + { + "epoch": 2.173849643551523, + "grad_norm": 0.8582279682159424, + "learning_rate": 9.302133647509526e-07, + "loss": 0.0613, + "step": 13417 + }, + { + "epoch": 2.17401166558652, + "grad_norm": 0.9010220766067505, + "learning_rate": 9.298730427703795e-07, + "loss": 0.0658, + "step": 13418 + }, + { + "epoch": 2.1741736876215167, + "grad_norm": 1.0892761945724487, + "learning_rate": 9.295327688318906e-07, + "loss": 0.0687, + "step": 13419 + }, + { + "epoch": 2.1743357096565132, + "grad_norm": 0.9256922006607056, + "learning_rate": 9.291925429458987e-07, + "loss": 0.0606, + "step": 13420 + }, + { + "epoch": 2.17449773169151, + "grad_norm": 1.116697907447815, + "learning_rate": 9.288523651228134e-07, + "loss": 0.0613, + "step": 13421 + }, + { + "epoch": 2.1746597537265067, + "grad_norm": 0.9603244066238403, + "learning_rate": 9.285122353730439e-07, + "loss": 0.0636, + "step": 13422 + }, + { + "epoch": 2.1748217757615036, + "grad_norm": 0.980385959148407, + "learning_rate": 9.281721537069971e-07, + "loss": 0.0623, + "step": 13423 + }, + { + "epoch": 2.1749837977965005, + "grad_norm": 0.8736245036125183, + "learning_rate": 9.278321201350784e-07, + "loss": 0.0626, + "step": 13424 + }, + { + "epoch": 2.175145819831497, + "grad_norm": 0.8763680458068848, + "learning_rate": 9.274921346676935e-07, + "loss": 0.0618, + "step": 13425 + }, + { + "epoch": 2.175307841866494, + "grad_norm": 0.9780583381652832, + "learning_rate": 9.271521973152418e-07, + "loss": 0.0612, + "step": 13426 + }, + { + "epoch": 2.1754698639014904, + "grad_norm": 1.1210222244262695, + "learning_rate": 9.268123080881275e-07, + "loss": 0.0623, + "step": 13427 + }, + { + "epoch": 2.1756318859364874, + "grad_norm": 0.9153855443000793, + "learning_rate": 9.264724669967498e-07, + "loss": 0.0649, + "step": 13428 + }, + { + "epoch": 2.1757939079714843, + "grad_norm": 0.8484050631523132, + "learning_rate": 9.261326740515075e-07, + "loss": 0.0588, + "step": 13429 + }, + { + "epoch": 2.175955930006481, + "grad_norm": 1.0825287103652954, + "learning_rate": 9.257929292627956e-07, + "loss": 0.0637, + "step": 13430 + }, + { + "epoch": 2.1761179520414777, + "grad_norm": 0.7929718494415283, + "learning_rate": 9.254532326410101e-07, + "loss": 0.0594, + "step": 13431 + }, + { + "epoch": 2.176279974076474, + "grad_norm": 0.9638476967811584, + "learning_rate": 9.251135841965467e-07, + "loss": 0.0708, + "step": 13432 + }, + { + "epoch": 2.176441996111471, + "grad_norm": 0.856880247592926, + "learning_rate": 9.24773983939796e-07, + "loss": 0.0646, + "step": 13433 + }, + { + "epoch": 2.176604018146468, + "grad_norm": 0.894572377204895, + "learning_rate": 9.244344318811491e-07, + "loss": 0.0658, + "step": 13434 + }, + { + "epoch": 2.1767660401814646, + "grad_norm": 0.8938385248184204, + "learning_rate": 9.240949280309949e-07, + "loss": 0.0606, + "step": 13435 + }, + { + "epoch": 2.1769280622164615, + "grad_norm": 0.8438916206359863, + "learning_rate": 9.237554723997242e-07, + "loss": 0.0586, + "step": 13436 + }, + { + "epoch": 2.177090084251458, + "grad_norm": 0.9637516140937805, + "learning_rate": 9.234160649977206e-07, + "loss": 0.0687, + "step": 13437 + }, + { + "epoch": 2.177252106286455, + "grad_norm": 0.8094197511672974, + "learning_rate": 9.230767058353701e-07, + "loss": 0.0559, + "step": 13438 + }, + { + "epoch": 2.177414128321452, + "grad_norm": 0.8560066223144531, + "learning_rate": 9.227373949230567e-07, + "loss": 0.0559, + "step": 13439 + }, + { + "epoch": 2.1775761503564484, + "grad_norm": 1.0146995782852173, + "learning_rate": 9.223981322711617e-07, + "loss": 0.0702, + "step": 13440 + }, + { + "epoch": 2.1777381723914453, + "grad_norm": 0.8843867182731628, + "learning_rate": 9.220589178900663e-07, + "loss": 0.0625, + "step": 13441 + }, + { + "epoch": 2.1779001944264422, + "grad_norm": 0.7497606873512268, + "learning_rate": 9.217197517901494e-07, + "loss": 0.0572, + "step": 13442 + }, + { + "epoch": 2.1780622164614387, + "grad_norm": 0.8896629810333252, + "learning_rate": 9.213806339817897e-07, + "loss": 0.0626, + "step": 13443 + }, + { + "epoch": 2.1782242384964356, + "grad_norm": 0.8931178450584412, + "learning_rate": 9.210415644753615e-07, + "loss": 0.0585, + "step": 13444 + }, + { + "epoch": 2.178386260531432, + "grad_norm": 0.8438544869422913, + "learning_rate": 9.207025432812397e-07, + "loss": 0.0594, + "step": 13445 + }, + { + "epoch": 2.178548282566429, + "grad_norm": 0.9344618320465088, + "learning_rate": 9.203635704097988e-07, + "loss": 0.0618, + "step": 13446 + }, + { + "epoch": 2.1787103046014256, + "grad_norm": 0.863814115524292, + "learning_rate": 9.20024645871411e-07, + "loss": 0.0653, + "step": 13447 + }, + { + "epoch": 2.1788723266364225, + "grad_norm": 0.8747982382774353, + "learning_rate": 9.196857696764446e-07, + "loss": 0.0585, + "step": 13448 + }, + { + "epoch": 2.1790343486714194, + "grad_norm": 0.9165652394294739, + "learning_rate": 9.193469418352696e-07, + "loss": 0.069, + "step": 13449 + }, + { + "epoch": 2.179196370706416, + "grad_norm": 0.9408144950866699, + "learning_rate": 9.190081623582531e-07, + "loss": 0.0711, + "step": 13450 + }, + { + "epoch": 2.179358392741413, + "grad_norm": 0.9184764623641968, + "learning_rate": 9.186694312557606e-07, + "loss": 0.0666, + "step": 13451 + }, + { + "epoch": 2.17952041477641, + "grad_norm": 0.8569600582122803, + "learning_rate": 9.183307485381571e-07, + "loss": 0.0613, + "step": 13452 + }, + { + "epoch": 2.1796824368114063, + "grad_norm": 0.8777344226837158, + "learning_rate": 9.17992114215805e-07, + "loss": 0.0628, + "step": 13453 + }, + { + "epoch": 2.179844458846403, + "grad_norm": 0.9402377605438232, + "learning_rate": 9.176535282990656e-07, + "loss": 0.0669, + "step": 13454 + }, + { + "epoch": 2.1800064808813997, + "grad_norm": 0.9360073208808899, + "learning_rate": 9.173149907982993e-07, + "loss": 0.0617, + "step": 13455 + }, + { + "epoch": 2.1801685029163966, + "grad_norm": 0.9287109971046448, + "learning_rate": 9.169765017238641e-07, + "loss": 0.0681, + "step": 13456 + }, + { + "epoch": 2.1803305249513936, + "grad_norm": 0.8160260915756226, + "learning_rate": 9.166380610861172e-07, + "loss": 0.0637, + "step": 13457 + }, + { + "epoch": 2.18049254698639, + "grad_norm": 0.8967964053153992, + "learning_rate": 9.162996688954148e-07, + "loss": 0.0634, + "step": 13458 + }, + { + "epoch": 2.180654569021387, + "grad_norm": 0.968971848487854, + "learning_rate": 9.159613251621083e-07, + "loss": 0.065, + "step": 13459 + }, + { + "epoch": 2.1808165910563835, + "grad_norm": 0.8118306994438171, + "learning_rate": 9.156230298965529e-07, + "loss": 0.0613, + "step": 13460 + }, + { + "epoch": 2.1809786130913804, + "grad_norm": 0.9221633672714233, + "learning_rate": 9.152847831090986e-07, + "loss": 0.0642, + "step": 13461 + }, + { + "epoch": 2.1811406351263773, + "grad_norm": 0.8746361136436462, + "learning_rate": 9.149465848100958e-07, + "loss": 0.0629, + "step": 13462 + }, + { + "epoch": 2.181302657161374, + "grad_norm": 0.9616087079048157, + "learning_rate": 9.14608435009891e-07, + "loss": 0.0727, + "step": 13463 + }, + { + "epoch": 2.1814646791963708, + "grad_norm": 0.8658609390258789, + "learning_rate": 9.142703337188305e-07, + "loss": 0.06, + "step": 13464 + }, + { + "epoch": 2.1816267012313673, + "grad_norm": 0.833065390586853, + "learning_rate": 9.139322809472623e-07, + "loss": 0.0552, + "step": 13465 + }, + { + "epoch": 2.181788723266364, + "grad_norm": 0.8363578915596008, + "learning_rate": 9.135942767055272e-07, + "loss": 0.0673, + "step": 13466 + }, + { + "epoch": 2.181950745301361, + "grad_norm": 0.9329082369804382, + "learning_rate": 9.132563210039683e-07, + "loss": 0.0571, + "step": 13467 + }, + { + "epoch": 2.1821127673363576, + "grad_norm": 0.8257620334625244, + "learning_rate": 9.129184138529259e-07, + "loss": 0.0663, + "step": 13468 + }, + { + "epoch": 2.1822747893713546, + "grad_norm": 1.1378121376037598, + "learning_rate": 9.125805552627395e-07, + "loss": 0.0648, + "step": 13469 + }, + { + "epoch": 2.182436811406351, + "grad_norm": 0.8082774877548218, + "learning_rate": 9.122427452437465e-07, + "loss": 0.0596, + "step": 13470 + }, + { + "epoch": 2.182598833441348, + "grad_norm": 1.0287437438964844, + "learning_rate": 9.119049838062832e-07, + "loss": 0.067, + "step": 13471 + }, + { + "epoch": 2.182760855476345, + "grad_norm": 0.9428204894065857, + "learning_rate": 9.115672709606846e-07, + "loss": 0.0679, + "step": 13472 + }, + { + "epoch": 2.1829228775113414, + "grad_norm": 0.9620710611343384, + "learning_rate": 9.11229606717284e-07, + "loss": 0.0726, + "step": 13473 + }, + { + "epoch": 2.1830848995463383, + "grad_norm": 0.8988024592399597, + "learning_rate": 9.108919910864111e-07, + "loss": 0.063, + "step": 13474 + }, + { + "epoch": 2.1832469215813353, + "grad_norm": 0.978762686252594, + "learning_rate": 9.105544240783987e-07, + "loss": 0.0629, + "step": 13475 + }, + { + "epoch": 2.1834089436163318, + "grad_norm": 0.9975540041923523, + "learning_rate": 9.102169057035753e-07, + "loss": 0.0659, + "step": 13476 + }, + { + "epoch": 2.1835709656513287, + "grad_norm": 0.9281214475631714, + "learning_rate": 9.098794359722668e-07, + "loss": 0.0587, + "step": 13477 + }, + { + "epoch": 2.183732987686325, + "grad_norm": 0.8466314673423767, + "learning_rate": 9.095420148947984e-07, + "loss": 0.0631, + "step": 13478 + }, + { + "epoch": 2.183895009721322, + "grad_norm": 0.8057236671447754, + "learning_rate": 9.092046424814962e-07, + "loss": 0.0631, + "step": 13479 + }, + { + "epoch": 2.184057031756319, + "grad_norm": 0.8913443088531494, + "learning_rate": 9.088673187426836e-07, + "loss": 0.0579, + "step": 13480 + }, + { + "epoch": 2.1842190537913155, + "grad_norm": 0.8665725588798523, + "learning_rate": 9.085300436886793e-07, + "loss": 0.0602, + "step": 13481 + }, + { + "epoch": 2.1843810758263125, + "grad_norm": 0.8465587496757507, + "learning_rate": 9.081928173298046e-07, + "loss": 0.0576, + "step": 13482 + }, + { + "epoch": 2.184543097861309, + "grad_norm": 0.9389297962188721, + "learning_rate": 9.078556396763777e-07, + "loss": 0.0637, + "step": 13483 + }, + { + "epoch": 2.184705119896306, + "grad_norm": 0.9527860879898071, + "learning_rate": 9.075185107387149e-07, + "loss": 0.0648, + "step": 13484 + }, + { + "epoch": 2.184867141931303, + "grad_norm": 0.8697720170021057, + "learning_rate": 9.071814305271323e-07, + "loss": 0.0654, + "step": 13485 + }, + { + "epoch": 2.1850291639662993, + "grad_norm": 0.963236391544342, + "learning_rate": 9.068443990519432e-07, + "loss": 0.066, + "step": 13486 + }, + { + "epoch": 2.1851911860012962, + "grad_norm": 0.8899818658828735, + "learning_rate": 9.065074163234602e-07, + "loss": 0.0604, + "step": 13487 + }, + { + "epoch": 2.1853532080362927, + "grad_norm": 0.8355003595352173, + "learning_rate": 9.061704823519943e-07, + "loss": 0.067, + "step": 13488 + }, + { + "epoch": 2.1855152300712897, + "grad_norm": 0.9532532691955566, + "learning_rate": 9.058335971478543e-07, + "loss": 0.0583, + "step": 13489 + }, + { + "epoch": 2.1856772521062866, + "grad_norm": 0.9321505427360535, + "learning_rate": 9.054967607213486e-07, + "loss": 0.0689, + "step": 13490 + }, + { + "epoch": 2.185839274141283, + "grad_norm": 0.9366322755813599, + "learning_rate": 9.051599730827842e-07, + "loss": 0.0692, + "step": 13491 + }, + { + "epoch": 2.18600129617628, + "grad_norm": 0.8937353491783142, + "learning_rate": 9.048232342424642e-07, + "loss": 0.0551, + "step": 13492 + }, + { + "epoch": 2.1861633182112765, + "grad_norm": 0.9362406730651855, + "learning_rate": 9.044865442106923e-07, + "loss": 0.0698, + "step": 13493 + }, + { + "epoch": 2.1863253402462735, + "grad_norm": 0.7960699200630188, + "learning_rate": 9.04149902997773e-07, + "loss": 0.0627, + "step": 13494 + }, + { + "epoch": 2.1864873622812704, + "grad_norm": 0.9175137281417847, + "learning_rate": 9.038133106140034e-07, + "loss": 0.0695, + "step": 13495 + }, + { + "epoch": 2.186649384316267, + "grad_norm": 1.0003280639648438, + "learning_rate": 9.034767670696842e-07, + "loss": 0.0628, + "step": 13496 + }, + { + "epoch": 2.186811406351264, + "grad_norm": 1.0133966207504272, + "learning_rate": 9.031402723751123e-07, + "loss": 0.068, + "step": 13497 + }, + { + "epoch": 2.1869734283862607, + "grad_norm": 1.0157874822616577, + "learning_rate": 9.028038265405836e-07, + "loss": 0.0642, + "step": 13498 + }, + { + "epoch": 2.1871354504212572, + "grad_norm": 0.7945261001586914, + "learning_rate": 9.02467429576393e-07, + "loss": 0.0581, + "step": 13499 + }, + { + "epoch": 2.187297472456254, + "grad_norm": 1.0244712829589844, + "learning_rate": 9.021310814928328e-07, + "loss": 0.0721, + "step": 13500 + }, + { + "epoch": 2.1874594944912507, + "grad_norm": 0.8607012033462524, + "learning_rate": 9.01794782300195e-07, + "loss": 0.0603, + "step": 13501 + }, + { + "epoch": 2.1876215165262476, + "grad_norm": 1.0268734693527222, + "learning_rate": 9.01458532008769e-07, + "loss": 0.0597, + "step": 13502 + }, + { + "epoch": 2.1877835385612445, + "grad_norm": 0.8796284198760986, + "learning_rate": 9.011223306288436e-07, + "loss": 0.0594, + "step": 13503 + }, + { + "epoch": 2.187945560596241, + "grad_norm": 0.8240768313407898, + "learning_rate": 9.007861781707056e-07, + "loss": 0.0583, + "step": 13504 + }, + { + "epoch": 2.188107582631238, + "grad_norm": 0.7857369780540466, + "learning_rate": 9.004500746446407e-07, + "loss": 0.0549, + "step": 13505 + }, + { + "epoch": 2.1882696046662344, + "grad_norm": 0.9636929631233215, + "learning_rate": 9.001140200609334e-07, + "loss": 0.0667, + "step": 13506 + }, + { + "epoch": 2.1884316267012314, + "grad_norm": 0.9375138878822327, + "learning_rate": 8.997780144298641e-07, + "loss": 0.0653, + "step": 13507 + }, + { + "epoch": 2.1885936487362283, + "grad_norm": 0.9468481540679932, + "learning_rate": 8.994420577617155e-07, + "loss": 0.0652, + "step": 13508 + }, + { + "epoch": 2.188755670771225, + "grad_norm": 0.946871280670166, + "learning_rate": 8.991061500667674e-07, + "loss": 0.0666, + "step": 13509 + }, + { + "epoch": 2.1889176928062217, + "grad_norm": 0.8926617503166199, + "learning_rate": 8.987702913552964e-07, + "loss": 0.066, + "step": 13510 + }, + { + "epoch": 2.189079714841218, + "grad_norm": 0.8519583940505981, + "learning_rate": 8.984344816375798e-07, + "loss": 0.0596, + "step": 13511 + }, + { + "epoch": 2.189241736876215, + "grad_norm": 0.8857722878456116, + "learning_rate": 8.980987209238922e-07, + "loss": 0.0557, + "step": 13512 + }, + { + "epoch": 2.189403758911212, + "grad_norm": 0.9426934719085693, + "learning_rate": 8.977630092245071e-07, + "loss": 0.0582, + "step": 13513 + }, + { + "epoch": 2.1895657809462086, + "grad_norm": 0.86836177110672, + "learning_rate": 8.974273465496966e-07, + "loss": 0.062, + "step": 13514 + }, + { + "epoch": 2.1897278029812055, + "grad_norm": 0.8490039110183716, + "learning_rate": 8.970917329097312e-07, + "loss": 0.0581, + "step": 13515 + }, + { + "epoch": 2.189889825016202, + "grad_norm": 0.9485715627670288, + "learning_rate": 8.967561683148798e-07, + "loss": 0.0639, + "step": 13516 + }, + { + "epoch": 2.190051847051199, + "grad_norm": 0.7836689949035645, + "learning_rate": 8.964206527754099e-07, + "loss": 0.0571, + "step": 13517 + }, + { + "epoch": 2.190213869086196, + "grad_norm": 0.9445635080337524, + "learning_rate": 8.960851863015874e-07, + "loss": 0.0558, + "step": 13518 + }, + { + "epoch": 2.1903758911211924, + "grad_norm": 0.8634076118469238, + "learning_rate": 8.957497689036768e-07, + "loss": 0.0585, + "step": 13519 + }, + { + "epoch": 2.1905379131561893, + "grad_norm": 1.2281583547592163, + "learning_rate": 8.954144005919422e-07, + "loss": 0.0657, + "step": 13520 + }, + { + "epoch": 2.190699935191186, + "grad_norm": 0.9403387308120728, + "learning_rate": 8.950790813766416e-07, + "loss": 0.0543, + "step": 13521 + }, + { + "epoch": 2.1908619572261827, + "grad_norm": 1.0141313076019287, + "learning_rate": 8.947438112680387e-07, + "loss": 0.0712, + "step": 13522 + }, + { + "epoch": 2.1910239792611796, + "grad_norm": 0.9815289378166199, + "learning_rate": 8.944085902763902e-07, + "loss": 0.0637, + "step": 13523 + }, + { + "epoch": 2.191186001296176, + "grad_norm": 0.9215153455734253, + "learning_rate": 8.940734184119542e-07, + "loss": 0.0563, + "step": 13524 + }, + { + "epoch": 2.191348023331173, + "grad_norm": 0.9290637969970703, + "learning_rate": 8.937382956849847e-07, + "loss": 0.0601, + "step": 13525 + }, + { + "epoch": 2.19151004536617, + "grad_norm": 0.8032289743423462, + "learning_rate": 8.934032221057354e-07, + "loss": 0.0625, + "step": 13526 + }, + { + "epoch": 2.1916720674011665, + "grad_norm": 0.9063378572463989, + "learning_rate": 8.930681976844613e-07, + "loss": 0.059, + "step": 13527 + }, + { + "epoch": 2.1918340894361634, + "grad_norm": 1.034857988357544, + "learning_rate": 8.927332224314106e-07, + "loss": 0.0695, + "step": 13528 + }, + { + "epoch": 2.19199611147116, + "grad_norm": 1.0457537174224854, + "learning_rate": 8.92398296356834e-07, + "loss": 0.0632, + "step": 13529 + }, + { + "epoch": 2.192158133506157, + "grad_norm": 0.8522835969924927, + "learning_rate": 8.92063419470979e-07, + "loss": 0.059, + "step": 13530 + }, + { + "epoch": 2.192320155541154, + "grad_norm": 0.9645273685455322, + "learning_rate": 8.917285917840926e-07, + "loss": 0.0624, + "step": 13531 + }, + { + "epoch": 2.1924821775761503, + "grad_norm": 1.049136996269226, + "learning_rate": 8.91393813306419e-07, + "loss": 0.0591, + "step": 13532 + }, + { + "epoch": 2.192644199611147, + "grad_norm": 0.846191942691803, + "learning_rate": 8.910590840482023e-07, + "loss": 0.0586, + "step": 13533 + }, + { + "epoch": 2.1928062216461437, + "grad_norm": 0.8312897682189941, + "learning_rate": 8.907244040196836e-07, + "loss": 0.0588, + "step": 13534 + }, + { + "epoch": 2.1929682436811406, + "grad_norm": 0.8862128257751465, + "learning_rate": 8.903897732311048e-07, + "loss": 0.0608, + "step": 13535 + }, + { + "epoch": 2.1931302657161376, + "grad_norm": 0.8992831110954285, + "learning_rate": 8.900551916927022e-07, + "loss": 0.0632, + "step": 13536 + }, + { + "epoch": 2.193292287751134, + "grad_norm": 1.0759751796722412, + "learning_rate": 8.897206594147156e-07, + "loss": 0.0659, + "step": 13537 + }, + { + "epoch": 2.193454309786131, + "grad_norm": 1.051218867301941, + "learning_rate": 8.893861764073808e-07, + "loss": 0.0672, + "step": 13538 + }, + { + "epoch": 2.1936163318211275, + "grad_norm": 0.8192399144172668, + "learning_rate": 8.890517426809306e-07, + "loss": 0.0631, + "step": 13539 + }, + { + "epoch": 2.1937783538561244, + "grad_norm": 0.7525157332420349, + "learning_rate": 8.887173582455985e-07, + "loss": 0.0506, + "step": 13540 + }, + { + "epoch": 2.1939403758911213, + "grad_norm": 0.8876081109046936, + "learning_rate": 8.883830231116153e-07, + "loss": 0.0565, + "step": 13541 + }, + { + "epoch": 2.194102397926118, + "grad_norm": 0.7892178893089294, + "learning_rate": 8.88048737289213e-07, + "loss": 0.0581, + "step": 13542 + }, + { + "epoch": 2.1942644199611148, + "grad_norm": 0.8460841178894043, + "learning_rate": 8.877145007886179e-07, + "loss": 0.0665, + "step": 13543 + }, + { + "epoch": 2.1944264419961117, + "grad_norm": 1.0782990455627441, + "learning_rate": 8.873803136200574e-07, + "loss": 0.0677, + "step": 13544 + }, + { + "epoch": 2.194588464031108, + "grad_norm": 0.9548028707504272, + "learning_rate": 8.870461757937568e-07, + "loss": 0.0645, + "step": 13545 + }, + { + "epoch": 2.194750486066105, + "grad_norm": 0.8092406392097473, + "learning_rate": 8.8671208731994e-07, + "loss": 0.0627, + "step": 13546 + }, + { + "epoch": 2.1949125081011016, + "grad_norm": 0.8001680970191956, + "learning_rate": 8.863780482088291e-07, + "loss": 0.0547, + "step": 13547 + }, + { + "epoch": 2.1950745301360985, + "grad_norm": 0.9187164306640625, + "learning_rate": 8.860440584706451e-07, + "loss": 0.0666, + "step": 13548 + }, + { + "epoch": 2.195236552171095, + "grad_norm": 0.9525156617164612, + "learning_rate": 8.857101181156072e-07, + "loss": 0.0621, + "step": 13549 + }, + { + "epoch": 2.195398574206092, + "grad_norm": 0.9552687406539917, + "learning_rate": 8.853762271539332e-07, + "loss": 0.0632, + "step": 13550 + }, + { + "epoch": 2.195560596241089, + "grad_norm": 0.9133409857749939, + "learning_rate": 8.850423855958393e-07, + "loss": 0.0629, + "step": 13551 + }, + { + "epoch": 2.1957226182760854, + "grad_norm": 0.7785288691520691, + "learning_rate": 8.847085934515404e-07, + "loss": 0.0539, + "step": 13552 + }, + { + "epoch": 2.1958846403110823, + "grad_norm": 0.856397271156311, + "learning_rate": 8.843748507312505e-07, + "loss": 0.061, + "step": 13553 + }, + { + "epoch": 2.1960466623460793, + "grad_norm": 0.9479225873947144, + "learning_rate": 8.840411574451793e-07, + "loss": 0.0606, + "step": 13554 + }, + { + "epoch": 2.1962086843810757, + "grad_norm": 0.817360520362854, + "learning_rate": 8.837075136035375e-07, + "loss": 0.0542, + "step": 13555 + }, + { + "epoch": 2.1963707064160727, + "grad_norm": 0.8960983157157898, + "learning_rate": 8.833739192165352e-07, + "loss": 0.0653, + "step": 13556 + }, + { + "epoch": 2.196532728451069, + "grad_norm": 0.8757344484329224, + "learning_rate": 8.830403742943797e-07, + "loss": 0.0521, + "step": 13557 + }, + { + "epoch": 2.196694750486066, + "grad_norm": 0.868101954460144, + "learning_rate": 8.827068788472751e-07, + "loss": 0.0589, + "step": 13558 + }, + { + "epoch": 2.196856772521063, + "grad_norm": 0.8532265424728394, + "learning_rate": 8.823734328854259e-07, + "loss": 0.0614, + "step": 13559 + }, + { + "epoch": 2.1970187945560595, + "grad_norm": 0.8555817008018494, + "learning_rate": 8.820400364190351e-07, + "loss": 0.0593, + "step": 13560 + }, + { + "epoch": 2.1971808165910565, + "grad_norm": 0.8289614915847778, + "learning_rate": 8.81706689458304e-07, + "loss": 0.0605, + "step": 13561 + }, + { + "epoch": 2.197342838626053, + "grad_norm": 0.767209529876709, + "learning_rate": 8.813733920134321e-07, + "loss": 0.0496, + "step": 13562 + }, + { + "epoch": 2.19750486066105, + "grad_norm": 0.9439572691917419, + "learning_rate": 8.81040144094617e-07, + "loss": 0.0634, + "step": 13563 + }, + { + "epoch": 2.197666882696047, + "grad_norm": 0.8966138958930969, + "learning_rate": 8.807069457120571e-07, + "loss": 0.0612, + "step": 13564 + }, + { + "epoch": 2.1978289047310433, + "grad_norm": 1.0695065259933472, + "learning_rate": 8.803737968759438e-07, + "loss": 0.0759, + "step": 13565 + }, + { + "epoch": 2.1979909267660402, + "grad_norm": 0.8874850273132324, + "learning_rate": 8.80040697596474e-07, + "loss": 0.0636, + "step": 13566 + }, + { + "epoch": 2.198152948801037, + "grad_norm": 0.8661726713180542, + "learning_rate": 8.797076478838388e-07, + "loss": 0.0588, + "step": 13567 + }, + { + "epoch": 2.1983149708360337, + "grad_norm": 0.8928310871124268, + "learning_rate": 8.79374647748229e-07, + "loss": 0.0601, + "step": 13568 + }, + { + "epoch": 2.1984769928710306, + "grad_norm": 0.8463062644004822, + "learning_rate": 8.790416971998317e-07, + "loss": 0.059, + "step": 13569 + }, + { + "epoch": 2.198639014906027, + "grad_norm": 0.9622822999954224, + "learning_rate": 8.787087962488367e-07, + "loss": 0.0595, + "step": 13570 + }, + { + "epoch": 2.198801036941024, + "grad_norm": 0.9242643117904663, + "learning_rate": 8.783759449054296e-07, + "loss": 0.0645, + "step": 13571 + }, + { + "epoch": 2.1989630589760205, + "grad_norm": 0.957097589969635, + "learning_rate": 8.780431431797937e-07, + "loss": 0.0685, + "step": 13572 + }, + { + "epoch": 2.1991250810110174, + "grad_norm": 0.9083278775215149, + "learning_rate": 8.777103910821127e-07, + "loss": 0.0633, + "step": 13573 + }, + { + "epoch": 2.1992871030460144, + "grad_norm": 0.9371110796928406, + "learning_rate": 8.773776886225668e-07, + "loss": 0.0639, + "step": 13574 + }, + { + "epoch": 2.199449125081011, + "grad_norm": 0.9061893820762634, + "learning_rate": 8.770450358113389e-07, + "loss": 0.0606, + "step": 13575 + }, + { + "epoch": 2.199611147116008, + "grad_norm": 0.9202576875686646, + "learning_rate": 8.767124326586043e-07, + "loss": 0.0632, + "step": 13576 + }, + { + "epoch": 2.1997731691510047, + "grad_norm": 0.9155436754226685, + "learning_rate": 8.763798791745413e-07, + "loss": 0.0637, + "step": 13577 + }, + { + "epoch": 2.1999351911860012, + "grad_norm": 1.0268107652664185, + "learning_rate": 8.760473753693243e-07, + "loss": 0.0673, + "step": 13578 + }, + { + "epoch": 2.200097213220998, + "grad_norm": 1.1556326150894165, + "learning_rate": 8.757149212531282e-07, + "loss": 0.0645, + "step": 13579 + }, + { + "epoch": 2.2002592352559946, + "grad_norm": 0.7752742171287537, + "learning_rate": 8.753825168361249e-07, + "loss": 0.0561, + "step": 13580 + }, + { + "epoch": 2.2004212572909916, + "grad_norm": 0.9289119839668274, + "learning_rate": 8.750501621284849e-07, + "loss": 0.0638, + "step": 13581 + }, + { + "epoch": 2.2005832793259885, + "grad_norm": 0.8528614640235901, + "learning_rate": 8.747178571403786e-07, + "loss": 0.0601, + "step": 13582 + }, + { + "epoch": 2.200745301360985, + "grad_norm": 0.8271398544311523, + "learning_rate": 8.743856018819719e-07, + "loss": 0.0596, + "step": 13583 + }, + { + "epoch": 2.200907323395982, + "grad_norm": 0.8116289377212524, + "learning_rate": 8.74053396363431e-07, + "loss": 0.0615, + "step": 13584 + }, + { + "epoch": 2.2010693454309784, + "grad_norm": 0.7971740961074829, + "learning_rate": 8.737212405949222e-07, + "loss": 0.0615, + "step": 13585 + }, + { + "epoch": 2.2012313674659754, + "grad_norm": 1.0324573516845703, + "learning_rate": 8.733891345866088e-07, + "loss": 0.0666, + "step": 13586 + }, + { + "epoch": 2.2013933895009723, + "grad_norm": 0.8397610783576965, + "learning_rate": 8.730570783486508e-07, + "loss": 0.0664, + "step": 13587 + }, + { + "epoch": 2.201555411535969, + "grad_norm": 0.9985054135322571, + "learning_rate": 8.727250718912089e-07, + "loss": 0.0602, + "step": 13588 + }, + { + "epoch": 2.2017174335709657, + "grad_norm": 1.0729403495788574, + "learning_rate": 8.723931152244421e-07, + "loss": 0.0624, + "step": 13589 + }, + { + "epoch": 2.201879455605962, + "grad_norm": 1.0045422315597534, + "learning_rate": 8.72061208358507e-07, + "loss": 0.0627, + "step": 13590 + }, + { + "epoch": 2.202041477640959, + "grad_norm": 0.8267765045166016, + "learning_rate": 8.717293513035596e-07, + "loss": 0.0644, + "step": 13591 + }, + { + "epoch": 2.202203499675956, + "grad_norm": 0.8496410250663757, + "learning_rate": 8.713975440697536e-07, + "loss": 0.0568, + "step": 13592 + }, + { + "epoch": 2.2023655217109526, + "grad_norm": 0.9486085772514343, + "learning_rate": 8.710657866672417e-07, + "loss": 0.0668, + "step": 13593 + }, + { + "epoch": 2.2025275437459495, + "grad_norm": 0.867942750453949, + "learning_rate": 8.707340791061747e-07, + "loss": 0.057, + "step": 13594 + }, + { + "epoch": 2.202689565780946, + "grad_norm": 0.8752855062484741, + "learning_rate": 8.704024213967021e-07, + "loss": 0.0651, + "step": 13595 + }, + { + "epoch": 2.202851587815943, + "grad_norm": 1.1905330419540405, + "learning_rate": 8.700708135489722e-07, + "loss": 0.0653, + "step": 13596 + }, + { + "epoch": 2.20301360985094, + "grad_norm": 0.9045699834823608, + "learning_rate": 8.697392555731315e-07, + "loss": 0.0604, + "step": 13597 + }, + { + "epoch": 2.2031756318859363, + "grad_norm": 0.8511764407157898, + "learning_rate": 8.694077474793227e-07, + "loss": 0.0662, + "step": 13598 + }, + { + "epoch": 2.2033376539209333, + "grad_norm": 0.8251369595527649, + "learning_rate": 8.690762892776918e-07, + "loss": 0.0641, + "step": 13599 + }, + { + "epoch": 2.20349967595593, + "grad_norm": 0.9330815076828003, + "learning_rate": 8.687448809783799e-07, + "loss": 0.0646, + "step": 13600 + }, + { + "epoch": 2.2036616979909267, + "grad_norm": 0.8293221592903137, + "learning_rate": 8.684135225915277e-07, + "loss": 0.0595, + "step": 13601 + }, + { + "epoch": 2.2038237200259236, + "grad_norm": 1.0434253215789795, + "learning_rate": 8.680822141272727e-07, + "loss": 0.0698, + "step": 13602 + }, + { + "epoch": 2.20398574206092, + "grad_norm": 0.8310006260871887, + "learning_rate": 8.677509555957517e-07, + "loss": 0.0568, + "step": 13603 + }, + { + "epoch": 2.204147764095917, + "grad_norm": 0.8653905391693115, + "learning_rate": 8.674197470071033e-07, + "loss": 0.0583, + "step": 13604 + }, + { + "epoch": 2.204309786130914, + "grad_norm": 0.8970719575881958, + "learning_rate": 8.670885883714591e-07, + "loss": 0.0587, + "step": 13605 + }, + { + "epoch": 2.2044718081659105, + "grad_norm": 1.2163879871368408, + "learning_rate": 8.667574796989526e-07, + "loss": 0.0714, + "step": 13606 + }, + { + "epoch": 2.2046338302009074, + "grad_norm": 0.865898609161377, + "learning_rate": 8.664264209997144e-07, + "loss": 0.0627, + "step": 13607 + }, + { + "epoch": 2.204795852235904, + "grad_norm": 0.900995671749115, + "learning_rate": 8.66095412283875e-07, + "loss": 0.0608, + "step": 13608 + }, + { + "epoch": 2.204957874270901, + "grad_norm": 0.8361657857894897, + "learning_rate": 8.657644535615617e-07, + "loss": 0.0561, + "step": 13609 + }, + { + "epoch": 2.2051198963058978, + "grad_norm": 0.9006054401397705, + "learning_rate": 8.654335448429016e-07, + "loss": 0.0639, + "step": 13610 + }, + { + "epoch": 2.2052819183408943, + "grad_norm": 0.8334879875183105, + "learning_rate": 8.651026861380193e-07, + "loss": 0.0558, + "step": 13611 + }, + { + "epoch": 2.205443940375891, + "grad_norm": 0.9533306360244751, + "learning_rate": 8.647718774570385e-07, + "loss": 0.0637, + "step": 13612 + }, + { + "epoch": 2.2056059624108877, + "grad_norm": 0.8768250346183777, + "learning_rate": 8.644411188100812e-07, + "loss": 0.0609, + "step": 13613 + }, + { + "epoch": 2.2057679844458846, + "grad_norm": 0.8379773497581482, + "learning_rate": 8.641104102072676e-07, + "loss": 0.0561, + "step": 13614 + }, + { + "epoch": 2.2059300064808816, + "grad_norm": 1.2272884845733643, + "learning_rate": 8.637797516587173e-07, + "loss": 0.0692, + "step": 13615 + }, + { + "epoch": 2.206092028515878, + "grad_norm": 0.8420472145080566, + "learning_rate": 8.634491431745465e-07, + "loss": 0.057, + "step": 13616 + }, + { + "epoch": 2.206254050550875, + "grad_norm": 0.835096001625061, + "learning_rate": 8.631185847648704e-07, + "loss": 0.0571, + "step": 13617 + }, + { + "epoch": 2.2064160725858715, + "grad_norm": 0.9500763416290283, + "learning_rate": 8.627880764398055e-07, + "loss": 0.0538, + "step": 13618 + }, + { + "epoch": 2.2065780946208684, + "grad_norm": 0.8583593368530273, + "learning_rate": 8.62457618209464e-07, + "loss": 0.0588, + "step": 13619 + }, + { + "epoch": 2.2067401166558653, + "grad_norm": 0.816092848777771, + "learning_rate": 8.621272100839562e-07, + "loss": 0.056, + "step": 13620 + }, + { + "epoch": 2.206902138690862, + "grad_norm": 0.8541872501373291, + "learning_rate": 8.617968520733919e-07, + "loss": 0.0554, + "step": 13621 + }, + { + "epoch": 2.2070641607258588, + "grad_norm": 0.8463016152381897, + "learning_rate": 8.614665441878798e-07, + "loss": 0.0608, + "step": 13622 + }, + { + "epoch": 2.2072261827608557, + "grad_norm": 0.942939281463623, + "learning_rate": 8.611362864375261e-07, + "loss": 0.0682, + "step": 13623 + }, + { + "epoch": 2.207388204795852, + "grad_norm": 1.0515944957733154, + "learning_rate": 8.60806078832436e-07, + "loss": 0.0771, + "step": 13624 + }, + { + "epoch": 2.207550226830849, + "grad_norm": 0.9806361794471741, + "learning_rate": 8.604759213827133e-07, + "loss": 0.0698, + "step": 13625 + }, + { + "epoch": 2.2077122488658456, + "grad_norm": 0.9298086762428284, + "learning_rate": 8.601458140984606e-07, + "loss": 0.0579, + "step": 13626 + }, + { + "epoch": 2.2078742709008425, + "grad_norm": 0.8107653856277466, + "learning_rate": 8.598157569897758e-07, + "loss": 0.0553, + "step": 13627 + }, + { + "epoch": 2.2080362929358395, + "grad_norm": 0.874210774898529, + "learning_rate": 8.594857500667606e-07, + "loss": 0.0583, + "step": 13628 + }, + { + "epoch": 2.208198314970836, + "grad_norm": 0.8932999968528748, + "learning_rate": 8.591557933395115e-07, + "loss": 0.0645, + "step": 13629 + }, + { + "epoch": 2.208360337005833, + "grad_norm": 0.8853711485862732, + "learning_rate": 8.588258868181251e-07, + "loss": 0.0611, + "step": 13630 + }, + { + "epoch": 2.2085223590408294, + "grad_norm": 0.9822854995727539, + "learning_rate": 8.584960305126943e-07, + "loss": 0.0742, + "step": 13631 + }, + { + "epoch": 2.2086843810758263, + "grad_norm": 0.930212676525116, + "learning_rate": 8.581662244333116e-07, + "loss": 0.0657, + "step": 13632 + }, + { + "epoch": 2.2088464031108233, + "grad_norm": 0.8969360589981079, + "learning_rate": 8.578364685900711e-07, + "loss": 0.0596, + "step": 13633 + }, + { + "epoch": 2.2090084251458197, + "grad_norm": 0.8433337211608887, + "learning_rate": 8.575067629930601e-07, + "loss": 0.0564, + "step": 13634 + }, + { + "epoch": 2.2091704471808167, + "grad_norm": 0.8767102360725403, + "learning_rate": 8.571771076523669e-07, + "loss": 0.0608, + "step": 13635 + }, + { + "epoch": 2.209332469215813, + "grad_norm": 0.8478491306304932, + "learning_rate": 8.568475025780781e-07, + "loss": 0.061, + "step": 13636 + }, + { + "epoch": 2.20949449125081, + "grad_norm": 1.1657311916351318, + "learning_rate": 8.56517947780281e-07, + "loss": 0.0705, + "step": 13637 + }, + { + "epoch": 2.209656513285807, + "grad_norm": 0.980461835861206, + "learning_rate": 8.561884432690568e-07, + "loss": 0.0583, + "step": 13638 + }, + { + "epoch": 2.2098185353208035, + "grad_norm": 0.9112688302993774, + "learning_rate": 8.55858989054488e-07, + "loss": 0.0618, + "step": 13639 + }, + { + "epoch": 2.2099805573558005, + "grad_norm": 1.0051236152648926, + "learning_rate": 8.555295851466556e-07, + "loss": 0.066, + "step": 13640 + }, + { + "epoch": 2.210142579390797, + "grad_norm": 0.8563796281814575, + "learning_rate": 8.552002315556382e-07, + "loss": 0.0613, + "step": 13641 + }, + { + "epoch": 2.210304601425794, + "grad_norm": 0.7843127846717834, + "learning_rate": 8.548709282915135e-07, + "loss": 0.0638, + "step": 13642 + }, + { + "epoch": 2.210466623460791, + "grad_norm": 0.9892216324806213, + "learning_rate": 8.545416753643574e-07, + "loss": 0.0652, + "step": 13643 + }, + { + "epoch": 2.2106286454957873, + "grad_norm": 0.9359889030456543, + "learning_rate": 8.542124727842438e-07, + "loss": 0.0596, + "step": 13644 + }, + { + "epoch": 2.2107906675307842, + "grad_norm": 0.7579911947250366, + "learning_rate": 8.538833205612468e-07, + "loss": 0.0536, + "step": 13645 + }, + { + "epoch": 2.210952689565781, + "grad_norm": 0.7758036255836487, + "learning_rate": 8.535542187054352e-07, + "loss": 0.059, + "step": 13646 + }, + { + "epoch": 2.2111147116007777, + "grad_norm": 0.8697202801704407, + "learning_rate": 8.532251672268807e-07, + "loss": 0.0619, + "step": 13647 + }, + { + "epoch": 2.2112767336357746, + "grad_norm": 0.9321410059928894, + "learning_rate": 8.528961661356519e-07, + "loss": 0.0647, + "step": 13648 + }, + { + "epoch": 2.211438755670771, + "grad_norm": 1.245924711227417, + "learning_rate": 8.525672154418138e-07, + "loss": 0.0676, + "step": 13649 + }, + { + "epoch": 2.211600777705768, + "grad_norm": 0.8289289474487305, + "learning_rate": 8.52238315155432e-07, + "loss": 0.0551, + "step": 13650 + }, + { + "epoch": 2.211762799740765, + "grad_norm": 0.8378967642784119, + "learning_rate": 8.519094652865703e-07, + "loss": 0.0534, + "step": 13651 + }, + { + "epoch": 2.2119248217757614, + "grad_norm": 0.9754065871238708, + "learning_rate": 8.515806658452908e-07, + "loss": 0.0669, + "step": 13652 + }, + { + "epoch": 2.2120868438107584, + "grad_norm": 0.8882692456245422, + "learning_rate": 8.512519168416536e-07, + "loss": 0.0619, + "step": 13653 + }, + { + "epoch": 2.212248865845755, + "grad_norm": 0.9585002660751343, + "learning_rate": 8.50923218285718e-07, + "loss": 0.0655, + "step": 13654 + }, + { + "epoch": 2.212410887880752, + "grad_norm": 0.9813425540924072, + "learning_rate": 8.505945701875412e-07, + "loss": 0.0657, + "step": 13655 + }, + { + "epoch": 2.2125729099157487, + "grad_norm": 0.9413607716560364, + "learning_rate": 8.502659725571791e-07, + "loss": 0.0606, + "step": 13656 + }, + { + "epoch": 2.212734931950745, + "grad_norm": 0.8411443829536438, + "learning_rate": 8.499374254046858e-07, + "loss": 0.0617, + "step": 13657 + }, + { + "epoch": 2.212896953985742, + "grad_norm": 0.9197219610214233, + "learning_rate": 8.496089287401144e-07, + "loss": 0.0619, + "step": 13658 + }, + { + "epoch": 2.2130589760207386, + "grad_norm": 1.0152454376220703, + "learning_rate": 8.492804825735166e-07, + "loss": 0.0579, + "step": 13659 + }, + { + "epoch": 2.2132209980557356, + "grad_norm": 0.9079082012176514, + "learning_rate": 8.489520869149398e-07, + "loss": 0.0711, + "step": 13660 + }, + { + "epoch": 2.2133830200907325, + "grad_norm": 0.9641551375389099, + "learning_rate": 8.486237417744344e-07, + "loss": 0.0587, + "step": 13661 + }, + { + "epoch": 2.213545042125729, + "grad_norm": 0.8907647132873535, + "learning_rate": 8.482954471620464e-07, + "loss": 0.0601, + "step": 13662 + }, + { + "epoch": 2.213707064160726, + "grad_norm": 0.7322314381599426, + "learning_rate": 8.479672030878213e-07, + "loss": 0.054, + "step": 13663 + }, + { + "epoch": 2.2138690861957224, + "grad_norm": 0.8832927346229553, + "learning_rate": 8.476390095618015e-07, + "loss": 0.0571, + "step": 13664 + }, + { + "epoch": 2.2140311082307194, + "grad_norm": 0.8694517016410828, + "learning_rate": 8.47310866594028e-07, + "loss": 0.06, + "step": 13665 + }, + { + "epoch": 2.2141931302657163, + "grad_norm": 0.8959830403327942, + "learning_rate": 8.469827741945447e-07, + "loss": 0.0642, + "step": 13666 + }, + { + "epoch": 2.214355152300713, + "grad_norm": 0.9911413192749023, + "learning_rate": 8.466547323733873e-07, + "loss": 0.0602, + "step": 13667 + }, + { + "epoch": 2.2145171743357097, + "grad_norm": 0.8468191027641296, + "learning_rate": 8.46326741140594e-07, + "loss": 0.0601, + "step": 13668 + }, + { + "epoch": 2.2146791963707066, + "grad_norm": 1.0290790796279907, + "learning_rate": 8.459988005062006e-07, + "loss": 0.06, + "step": 13669 + }, + { + "epoch": 2.214841218405703, + "grad_norm": 0.9934695363044739, + "learning_rate": 8.456709104802413e-07, + "loss": 0.0616, + "step": 13670 + }, + { + "epoch": 2.2150032404407, + "grad_norm": 0.8425288200378418, + "learning_rate": 8.453430710727486e-07, + "loss": 0.0595, + "step": 13671 + }, + { + "epoch": 2.2151652624756966, + "grad_norm": 0.862170398235321, + "learning_rate": 8.450152822937541e-07, + "loss": 0.0619, + "step": 13672 + }, + { + "epoch": 2.2153272845106935, + "grad_norm": 0.8608142137527466, + "learning_rate": 8.446875441532868e-07, + "loss": 0.0645, + "step": 13673 + }, + { + "epoch": 2.21548930654569, + "grad_norm": 0.8314158916473389, + "learning_rate": 8.443598566613756e-07, + "loss": 0.058, + "step": 13674 + }, + { + "epoch": 2.215651328580687, + "grad_norm": 0.9272691607475281, + "learning_rate": 8.440322198280446e-07, + "loss": 0.0604, + "step": 13675 + }, + { + "epoch": 2.215813350615684, + "grad_norm": 1.0183619260787964, + "learning_rate": 8.437046336633212e-07, + "loss": 0.0642, + "step": 13676 + }, + { + "epoch": 2.2159753726506803, + "grad_norm": 0.8743815422058105, + "learning_rate": 8.433770981772285e-07, + "loss": 0.0625, + "step": 13677 + }, + { + "epoch": 2.2161373946856773, + "grad_norm": 0.8977541327476501, + "learning_rate": 8.430496133797872e-07, + "loss": 0.067, + "step": 13678 + }, + { + "epoch": 2.216299416720674, + "grad_norm": 0.9570757150650024, + "learning_rate": 8.427221792810169e-07, + "loss": 0.0597, + "step": 13679 + }, + { + "epoch": 2.2164614387556707, + "grad_norm": 0.8252487778663635, + "learning_rate": 8.423947958909381e-07, + "loss": 0.0618, + "step": 13680 + }, + { + "epoch": 2.2166234607906676, + "grad_norm": 0.801275372505188, + "learning_rate": 8.420674632195683e-07, + "loss": 0.0581, + "step": 13681 + }, + { + "epoch": 2.216785482825664, + "grad_norm": 0.8917983770370483, + "learning_rate": 8.41740181276921e-07, + "loss": 0.0685, + "step": 13682 + }, + { + "epoch": 2.216947504860661, + "grad_norm": 0.9908973574638367, + "learning_rate": 8.414129500730115e-07, + "loss": 0.0666, + "step": 13683 + }, + { + "epoch": 2.217109526895658, + "grad_norm": 1.1001694202423096, + "learning_rate": 8.410857696178518e-07, + "loss": 0.0664, + "step": 13684 + }, + { + "epoch": 2.2172715489306545, + "grad_norm": 0.8610407710075378, + "learning_rate": 8.407586399214529e-07, + "loss": 0.0634, + "step": 13685 + }, + { + "epoch": 2.2174335709656514, + "grad_norm": 0.8666363954544067, + "learning_rate": 8.404315609938246e-07, + "loss": 0.0623, + "step": 13686 + }, + { + "epoch": 2.217595593000648, + "grad_norm": 0.8178418278694153, + "learning_rate": 8.401045328449742e-07, + "loss": 0.0607, + "step": 13687 + }, + { + "epoch": 2.217757615035645, + "grad_norm": 0.8369463086128235, + "learning_rate": 8.397775554849086e-07, + "loss": 0.0577, + "step": 13688 + }, + { + "epoch": 2.2179196370706418, + "grad_norm": 1.0143942832946777, + "learning_rate": 8.394506289236317e-07, + "loss": 0.0673, + "step": 13689 + }, + { + "epoch": 2.2180816591056383, + "grad_norm": 1.006434679031372, + "learning_rate": 8.391237531711474e-07, + "loss": 0.0585, + "step": 13690 + }, + { + "epoch": 2.218243681140635, + "grad_norm": 0.9059908390045166, + "learning_rate": 8.38796928237457e-07, + "loss": 0.064, + "step": 13691 + }, + { + "epoch": 2.2184057031756317, + "grad_norm": 0.9959692358970642, + "learning_rate": 8.384701541325612e-07, + "loss": 0.066, + "step": 13692 + }, + { + "epoch": 2.2185677252106286, + "grad_norm": 0.9817578792572021, + "learning_rate": 8.381434308664574e-07, + "loss": 0.0707, + "step": 13693 + }, + { + "epoch": 2.2187297472456255, + "grad_norm": 0.9044135808944702, + "learning_rate": 8.378167584491417e-07, + "loss": 0.0646, + "step": 13694 + }, + { + "epoch": 2.218891769280622, + "grad_norm": 0.751171886920929, + "learning_rate": 8.374901368906127e-07, + "loss": 0.0566, + "step": 13695 + }, + { + "epoch": 2.219053791315619, + "grad_norm": 0.8422271609306335, + "learning_rate": 8.371635662008615e-07, + "loss": 0.0642, + "step": 13696 + }, + { + "epoch": 2.2192158133506155, + "grad_norm": 0.8549221754074097, + "learning_rate": 8.368370463898812e-07, + "loss": 0.0518, + "step": 13697 + }, + { + "epoch": 2.2193778353856124, + "grad_norm": 0.8980638384819031, + "learning_rate": 8.365105774676624e-07, + "loss": 0.0633, + "step": 13698 + }, + { + "epoch": 2.2195398574206093, + "grad_norm": 0.9105513691902161, + "learning_rate": 8.361841594441944e-07, + "loss": 0.06, + "step": 13699 + }, + { + "epoch": 2.219701879455606, + "grad_norm": 0.8758683204650879, + "learning_rate": 8.358577923294647e-07, + "loss": 0.0646, + "step": 13700 + }, + { + "epoch": 2.2198639014906028, + "grad_norm": 1.3330185413360596, + "learning_rate": 8.355314761334596e-07, + "loss": 0.0658, + "step": 13701 + }, + { + "epoch": 2.2200259235255997, + "grad_norm": 0.9840524792671204, + "learning_rate": 8.352052108661634e-07, + "loss": 0.0674, + "step": 13702 + }, + { + "epoch": 2.220187945560596, + "grad_norm": 0.8081815242767334, + "learning_rate": 8.34878996537559e-07, + "loss": 0.06, + "step": 13703 + }, + { + "epoch": 2.220349967595593, + "grad_norm": 0.8371722102165222, + "learning_rate": 8.345528331576275e-07, + "loss": 0.0601, + "step": 13704 + }, + { + "epoch": 2.2205119896305896, + "grad_norm": 1.0199873447418213, + "learning_rate": 8.342267207363492e-07, + "loss": 0.0612, + "step": 13705 + }, + { + "epoch": 2.2206740116655865, + "grad_norm": 1.0433920621871948, + "learning_rate": 8.339006592837021e-07, + "loss": 0.0671, + "step": 13706 + }, + { + "epoch": 2.2208360337005835, + "grad_norm": 0.8671245574951172, + "learning_rate": 8.335746488096639e-07, + "loss": 0.0677, + "step": 13707 + }, + { + "epoch": 2.22099805573558, + "grad_norm": 0.9357213377952576, + "learning_rate": 8.33248689324207e-07, + "loss": 0.0716, + "step": 13708 + }, + { + "epoch": 2.221160077770577, + "grad_norm": 0.8133964538574219, + "learning_rate": 8.329227808373078e-07, + "loss": 0.0578, + "step": 13709 + }, + { + "epoch": 2.2213220998055734, + "grad_norm": 0.8895506858825684, + "learning_rate": 8.325969233589376e-07, + "loss": 0.0653, + "step": 13710 + }, + { + "epoch": 2.2214841218405703, + "grad_norm": 0.8264877200126648, + "learning_rate": 8.322711168990661e-07, + "loss": 0.057, + "step": 13711 + }, + { + "epoch": 2.2216461438755672, + "grad_norm": 0.8504375219345093, + "learning_rate": 8.319453614676626e-07, + "loss": 0.0644, + "step": 13712 + }, + { + "epoch": 2.2218081659105637, + "grad_norm": 0.824181318283081, + "learning_rate": 8.316196570746934e-07, + "loss": 0.0557, + "step": 13713 + }, + { + "epoch": 2.2219701879455607, + "grad_norm": 0.832217812538147, + "learning_rate": 8.31294003730127e-07, + "loss": 0.0613, + "step": 13714 + }, + { + "epoch": 2.222132209980557, + "grad_norm": 0.8441714644432068, + "learning_rate": 8.309684014439251e-07, + "loss": 0.0639, + "step": 13715 + }, + { + "epoch": 2.222294232015554, + "grad_norm": 0.998146653175354, + "learning_rate": 8.306428502260511e-07, + "loss": 0.0665, + "step": 13716 + }, + { + "epoch": 2.222456254050551, + "grad_norm": 0.8211577534675598, + "learning_rate": 8.303173500864661e-07, + "loss": 0.0599, + "step": 13717 + }, + { + "epoch": 2.2226182760855475, + "grad_norm": 0.9386523962020874, + "learning_rate": 8.299919010351296e-07, + "loss": 0.0625, + "step": 13718 + }, + { + "epoch": 2.2227802981205445, + "grad_norm": 0.9545656442642212, + "learning_rate": 8.296665030819998e-07, + "loss": 0.0648, + "step": 13719 + }, + { + "epoch": 2.222942320155541, + "grad_norm": 0.8156535029411316, + "learning_rate": 8.293411562370327e-07, + "loss": 0.0536, + "step": 13720 + }, + { + "epoch": 2.223104342190538, + "grad_norm": 0.9009618163108826, + "learning_rate": 8.290158605101842e-07, + "loss": 0.0621, + "step": 13721 + }, + { + "epoch": 2.223266364225535, + "grad_norm": 1.074135184288025, + "learning_rate": 8.286906159114058e-07, + "loss": 0.061, + "step": 13722 + }, + { + "epoch": 2.2234283862605313, + "grad_norm": 0.9844740033149719, + "learning_rate": 8.283654224506491e-07, + "loss": 0.0656, + "step": 13723 + }, + { + "epoch": 2.2235904082955282, + "grad_norm": 0.8761224746704102, + "learning_rate": 8.280402801378662e-07, + "loss": 0.0586, + "step": 13724 + }, + { + "epoch": 2.223752430330525, + "grad_norm": 0.88485187292099, + "learning_rate": 8.277151889830054e-07, + "loss": 0.063, + "step": 13725 + }, + { + "epoch": 2.2239144523655217, + "grad_norm": 1.0839747190475464, + "learning_rate": 8.27390148996012e-07, + "loss": 0.069, + "step": 13726 + }, + { + "epoch": 2.2240764744005186, + "grad_norm": 0.9806898832321167, + "learning_rate": 8.270651601868321e-07, + "loss": 0.0658, + "step": 13727 + }, + { + "epoch": 2.224238496435515, + "grad_norm": 0.8707731366157532, + "learning_rate": 8.267402225654112e-07, + "loss": 0.0614, + "step": 13728 + }, + { + "epoch": 2.224400518470512, + "grad_norm": 0.7389007210731506, + "learning_rate": 8.264153361416893e-07, + "loss": 0.0543, + "step": 13729 + }, + { + "epoch": 2.224562540505509, + "grad_norm": 0.907781183719635, + "learning_rate": 8.260905009256081e-07, + "loss": 0.061, + "step": 13730 + }, + { + "epoch": 2.2247245625405054, + "grad_norm": 0.976334273815155, + "learning_rate": 8.257657169271071e-07, + "loss": 0.0654, + "step": 13731 + }, + { + "epoch": 2.2248865845755024, + "grad_norm": 0.9755533337593079, + "learning_rate": 8.254409841561234e-07, + "loss": 0.0697, + "step": 13732 + }, + { + "epoch": 2.225048606610499, + "grad_norm": 0.8844923377037048, + "learning_rate": 8.251163026225934e-07, + "loss": 0.0638, + "step": 13733 + }, + { + "epoch": 2.225210628645496, + "grad_norm": 0.8760762810707092, + "learning_rate": 8.24791672336451e-07, + "loss": 0.059, + "step": 13734 + }, + { + "epoch": 2.2253726506804927, + "grad_norm": 0.8645013570785522, + "learning_rate": 8.244670933076298e-07, + "loss": 0.0717, + "step": 13735 + }, + { + "epoch": 2.225534672715489, + "grad_norm": 0.8673563003540039, + "learning_rate": 8.241425655460616e-07, + "loss": 0.0601, + "step": 13736 + }, + { + "epoch": 2.225696694750486, + "grad_norm": 0.8699643015861511, + "learning_rate": 8.23818089061674e-07, + "loss": 0.0619, + "step": 13737 + }, + { + "epoch": 2.2258587167854826, + "grad_norm": 0.8396235108375549, + "learning_rate": 8.23493663864397e-07, + "loss": 0.0601, + "step": 13738 + }, + { + "epoch": 2.2260207388204796, + "grad_norm": 0.9061728715896606, + "learning_rate": 8.231692899641572e-07, + "loss": 0.0656, + "step": 13739 + }, + { + "epoch": 2.2261827608554765, + "grad_norm": 0.9282206892967224, + "learning_rate": 8.228449673708797e-07, + "loss": 0.0617, + "step": 13740 + }, + { + "epoch": 2.226344782890473, + "grad_norm": 0.8784255385398865, + "learning_rate": 8.22520696094487e-07, + "loss": 0.0644, + "step": 13741 + }, + { + "epoch": 2.22650680492547, + "grad_norm": 1.012131690979004, + "learning_rate": 8.221964761449008e-07, + "loss": 0.0589, + "step": 13742 + }, + { + "epoch": 2.2266688269604664, + "grad_norm": 0.80937260389328, + "learning_rate": 8.218723075320437e-07, + "loss": 0.0556, + "step": 13743 + }, + { + "epoch": 2.2268308489954634, + "grad_norm": 0.9021971821784973, + "learning_rate": 8.215481902658323e-07, + "loss": 0.0644, + "step": 13744 + }, + { + "epoch": 2.2269928710304603, + "grad_norm": 0.916597306728363, + "learning_rate": 8.212241243561845e-07, + "loss": 0.0588, + "step": 13745 + }, + { + "epoch": 2.2271548930654568, + "grad_norm": 0.8220802545547485, + "learning_rate": 8.209001098130157e-07, + "loss": 0.0563, + "step": 13746 + }, + { + "epoch": 2.2273169151004537, + "grad_norm": 0.8931221961975098, + "learning_rate": 8.205761466462403e-07, + "loss": 0.0614, + "step": 13747 + }, + { + "epoch": 2.2274789371354506, + "grad_norm": 0.755279004573822, + "learning_rate": 8.20252234865771e-07, + "loss": 0.0602, + "step": 13748 + }, + { + "epoch": 2.227640959170447, + "grad_norm": 0.8481705188751221, + "learning_rate": 8.199283744815181e-07, + "loss": 0.057, + "step": 13749 + }, + { + "epoch": 2.227802981205444, + "grad_norm": 0.9167376160621643, + "learning_rate": 8.196045655033913e-07, + "loss": 0.0679, + "step": 13750 + }, + { + "epoch": 2.2279650032404406, + "grad_norm": 0.9409997463226318, + "learning_rate": 8.192808079412984e-07, + "loss": 0.0669, + "step": 13751 + }, + { + "epoch": 2.2281270252754375, + "grad_norm": 0.7802384495735168, + "learning_rate": 8.189571018051454e-07, + "loss": 0.0577, + "step": 13752 + }, + { + "epoch": 2.2282890473104344, + "grad_norm": 1.0504868030548096, + "learning_rate": 8.186334471048371e-07, + "loss": 0.0615, + "step": 13753 + }, + { + "epoch": 2.228451069345431, + "grad_norm": 0.762758731842041, + "learning_rate": 8.183098438502771e-07, + "loss": 0.0565, + "step": 13754 + }, + { + "epoch": 2.228613091380428, + "grad_norm": 0.9312151074409485, + "learning_rate": 8.179862920513656e-07, + "loss": 0.063, + "step": 13755 + }, + { + "epoch": 2.2287751134154243, + "grad_norm": 1.0615299940109253, + "learning_rate": 8.176627917180025e-07, + "loss": 0.0693, + "step": 13756 + }, + { + "epoch": 2.2289371354504213, + "grad_norm": 1.0079830884933472, + "learning_rate": 8.173393428600876e-07, + "loss": 0.0621, + "step": 13757 + }, + { + "epoch": 2.229099157485418, + "grad_norm": 1.1644383668899536, + "learning_rate": 8.170159454875173e-07, + "loss": 0.0672, + "step": 13758 + }, + { + "epoch": 2.2292611795204147, + "grad_norm": 0.910068929195404, + "learning_rate": 8.16692599610186e-07, + "loss": 0.0656, + "step": 13759 + }, + { + "epoch": 2.2294232015554116, + "grad_norm": 0.8690840005874634, + "learning_rate": 8.163693052379873e-07, + "loss": 0.0583, + "step": 13760 + }, + { + "epoch": 2.229585223590408, + "grad_norm": 0.8714174628257751, + "learning_rate": 8.160460623808136e-07, + "loss": 0.067, + "step": 13761 + }, + { + "epoch": 2.229747245625405, + "grad_norm": 0.9453072547912598, + "learning_rate": 8.157228710485554e-07, + "loss": 0.0694, + "step": 13762 + }, + { + "epoch": 2.229909267660402, + "grad_norm": 0.8522040843963623, + "learning_rate": 8.153997312511014e-07, + "loss": 0.06, + "step": 13763 + }, + { + "epoch": 2.2300712896953985, + "grad_norm": 1.055083155632019, + "learning_rate": 8.15076642998339e-07, + "loss": 0.0612, + "step": 13764 + }, + { + "epoch": 2.2302333117303954, + "grad_norm": 0.7458101511001587, + "learning_rate": 8.147536063001549e-07, + "loss": 0.0586, + "step": 13765 + }, + { + "epoch": 2.230395333765392, + "grad_norm": 0.8568379282951355, + "learning_rate": 8.144306211664302e-07, + "loss": 0.0622, + "step": 13766 + }, + { + "epoch": 2.230557355800389, + "grad_norm": 0.8845552802085876, + "learning_rate": 8.141076876070505e-07, + "loss": 0.0635, + "step": 13767 + }, + { + "epoch": 2.2307193778353858, + "grad_norm": 0.8834683299064636, + "learning_rate": 8.137848056318959e-07, + "loss": 0.0577, + "step": 13768 + }, + { + "epoch": 2.2308813998703823, + "grad_norm": 0.8642720580101013, + "learning_rate": 8.134619752508463e-07, + "loss": 0.0618, + "step": 13769 + }, + { + "epoch": 2.231043421905379, + "grad_norm": 0.991050660610199, + "learning_rate": 8.131391964737773e-07, + "loss": 0.0673, + "step": 13770 + }, + { + "epoch": 2.231205443940376, + "grad_norm": 0.8958488702774048, + "learning_rate": 8.128164693105678e-07, + "loss": 0.064, + "step": 13771 + }, + { + "epoch": 2.2313674659753726, + "grad_norm": 1.0806387662887573, + "learning_rate": 8.12493793771092e-07, + "loss": 0.0648, + "step": 13772 + }, + { + "epoch": 2.2315294880103695, + "grad_norm": 0.922431468963623, + "learning_rate": 8.121711698652219e-07, + "loss": 0.0572, + "step": 13773 + }, + { + "epoch": 2.231691510045366, + "grad_norm": 0.9419412016868591, + "learning_rate": 8.118485976028292e-07, + "loss": 0.0612, + "step": 13774 + }, + { + "epoch": 2.231853532080363, + "grad_norm": 1.0189546346664429, + "learning_rate": 8.115260769937835e-07, + "loss": 0.065, + "step": 13775 + }, + { + "epoch": 2.2320155541153595, + "grad_norm": 0.8037611246109009, + "learning_rate": 8.112036080479554e-07, + "loss": 0.0612, + "step": 13776 + }, + { + "epoch": 2.2321775761503564, + "grad_norm": 0.9175558090209961, + "learning_rate": 8.108811907752093e-07, + "loss": 0.065, + "step": 13777 + }, + { + "epoch": 2.2323395981853533, + "grad_norm": 0.9038329124450684, + "learning_rate": 8.10558825185411e-07, + "loss": 0.0585, + "step": 13778 + }, + { + "epoch": 2.23250162022035, + "grad_norm": 0.8255992531776428, + "learning_rate": 8.102365112884248e-07, + "loss": 0.0649, + "step": 13779 + }, + { + "epoch": 2.2326636422553467, + "grad_norm": 0.8868559002876282, + "learning_rate": 8.099142490941117e-07, + "loss": 0.0623, + "step": 13780 + }, + { + "epoch": 2.2328256642903437, + "grad_norm": 0.8174579739570618, + "learning_rate": 8.095920386123327e-07, + "loss": 0.059, + "step": 13781 + }, + { + "epoch": 2.23298768632534, + "grad_norm": 0.9110251069068909, + "learning_rate": 8.09269879852947e-07, + "loss": 0.064, + "step": 13782 + }, + { + "epoch": 2.233149708360337, + "grad_norm": 1.100704550743103, + "learning_rate": 8.08947772825811e-07, + "loss": 0.0637, + "step": 13783 + }, + { + "epoch": 2.2333117303953336, + "grad_norm": 0.9022136330604553, + "learning_rate": 8.086257175407819e-07, + "loss": 0.0534, + "step": 13784 + }, + { + "epoch": 2.2334737524303305, + "grad_norm": 1.0603289604187012, + "learning_rate": 8.083037140077113e-07, + "loss": 0.0619, + "step": 13785 + }, + { + "epoch": 2.2336357744653275, + "grad_norm": 0.9479257464408875, + "learning_rate": 8.079817622364539e-07, + "loss": 0.0647, + "step": 13786 + }, + { + "epoch": 2.233797796500324, + "grad_norm": 0.8262972235679626, + "learning_rate": 8.076598622368606e-07, + "loss": 0.0535, + "step": 13787 + }, + { + "epoch": 2.233959818535321, + "grad_norm": 1.2946476936340332, + "learning_rate": 8.073380140187795e-07, + "loss": 0.0728, + "step": 13788 + }, + { + "epoch": 2.2341218405703174, + "grad_norm": 0.8733221292495728, + "learning_rate": 8.07016217592059e-07, + "loss": 0.0654, + "step": 13789 + }, + { + "epoch": 2.2342838626053143, + "grad_norm": 0.9510716795921326, + "learning_rate": 8.066944729665455e-07, + "loss": 0.0665, + "step": 13790 + }, + { + "epoch": 2.2344458846403112, + "grad_norm": 0.9815870523452759, + "learning_rate": 8.063727801520832e-07, + "loss": 0.0639, + "step": 13791 + }, + { + "epoch": 2.2346079066753077, + "grad_norm": 0.7917437553405762, + "learning_rate": 8.060511391585152e-07, + "loss": 0.0597, + "step": 13792 + }, + { + "epoch": 2.2347699287103047, + "grad_norm": 1.0030494928359985, + "learning_rate": 8.057295499956832e-07, + "loss": 0.0701, + "step": 13793 + }, + { + "epoch": 2.234931950745301, + "grad_norm": 0.933049201965332, + "learning_rate": 8.054080126734271e-07, + "loss": 0.0682, + "step": 13794 + }, + { + "epoch": 2.235093972780298, + "grad_norm": 0.9111407399177551, + "learning_rate": 8.050865272015848e-07, + "loss": 0.0632, + "step": 13795 + }, + { + "epoch": 2.235255994815295, + "grad_norm": 0.9394775032997131, + "learning_rate": 8.047650935899931e-07, + "loss": 0.0664, + "step": 13796 + }, + { + "epoch": 2.2354180168502915, + "grad_norm": 0.9155844449996948, + "learning_rate": 8.044437118484874e-07, + "loss": 0.0626, + "step": 13797 + }, + { + "epoch": 2.2355800388852884, + "grad_norm": 0.9947577118873596, + "learning_rate": 8.041223819869015e-07, + "loss": 0.0598, + "step": 13798 + }, + { + "epoch": 2.235742060920285, + "grad_norm": 0.9133854508399963, + "learning_rate": 8.03801104015065e-07, + "loss": 0.0639, + "step": 13799 + }, + { + "epoch": 2.235904082955282, + "grad_norm": 0.981562077999115, + "learning_rate": 8.034798779428113e-07, + "loss": 0.0565, + "step": 13800 + }, + { + "epoch": 2.236066104990279, + "grad_norm": 0.8836338520050049, + "learning_rate": 8.031587037799673e-07, + "loss": 0.0606, + "step": 13801 + }, + { + "epoch": 2.2362281270252753, + "grad_norm": 0.9972084760665894, + "learning_rate": 8.02837581536362e-07, + "loss": 0.0707, + "step": 13802 + }, + { + "epoch": 2.2363901490602722, + "grad_norm": 0.8902301788330078, + "learning_rate": 8.025165112218186e-07, + "loss": 0.0634, + "step": 13803 + }, + { + "epoch": 2.236552171095269, + "grad_norm": 0.9194254875183105, + "learning_rate": 8.021954928461611e-07, + "loss": 0.0632, + "step": 13804 + }, + { + "epoch": 2.2367141931302656, + "grad_norm": 1.0688180923461914, + "learning_rate": 8.018745264192148e-07, + "loss": 0.0654, + "step": 13805 + }, + { + "epoch": 2.2368762151652626, + "grad_norm": 0.9109787940979004, + "learning_rate": 8.015536119507977e-07, + "loss": 0.0604, + "step": 13806 + }, + { + "epoch": 2.237038237200259, + "grad_norm": 0.9395532011985779, + "learning_rate": 8.012327494507302e-07, + "loss": 0.0617, + "step": 13807 + }, + { + "epoch": 2.237200259235256, + "grad_norm": 0.8452723622322083, + "learning_rate": 8.009119389288292e-07, + "loss": 0.0591, + "step": 13808 + }, + { + "epoch": 2.237362281270253, + "grad_norm": 0.9636961221694946, + "learning_rate": 8.005911803949115e-07, + "loss": 0.0644, + "step": 13809 + }, + { + "epoch": 2.2375243033052494, + "grad_norm": 0.8222397565841675, + "learning_rate": 8.002704738587911e-07, + "loss": 0.0579, + "step": 13810 + }, + { + "epoch": 2.2376863253402464, + "grad_norm": 0.8121089935302734, + "learning_rate": 7.999498193302807e-07, + "loss": 0.059, + "step": 13811 + }, + { + "epoch": 2.237848347375243, + "grad_norm": 0.8787469863891602, + "learning_rate": 7.996292168191919e-07, + "loss": 0.0537, + "step": 13812 + }, + { + "epoch": 2.23801036941024, + "grad_norm": 0.7887811064720154, + "learning_rate": 7.993086663353344e-07, + "loss": 0.06, + "step": 13813 + }, + { + "epoch": 2.2381723914452367, + "grad_norm": 0.9501045942306519, + "learning_rate": 7.989881678885158e-07, + "loss": 0.061, + "step": 13814 + }, + { + "epoch": 2.238334413480233, + "grad_norm": 0.8966881036758423, + "learning_rate": 7.986677214885433e-07, + "loss": 0.0625, + "step": 13815 + }, + { + "epoch": 2.23849643551523, + "grad_norm": 0.8494555354118347, + "learning_rate": 7.983473271452219e-07, + "loss": 0.0558, + "step": 13816 + }, + { + "epoch": 2.2386584575502266, + "grad_norm": 0.8177490234375, + "learning_rate": 7.980269848683536e-07, + "loss": 0.0542, + "step": 13817 + }, + { + "epoch": 2.2388204795852236, + "grad_norm": 0.9580766558647156, + "learning_rate": 7.977066946677404e-07, + "loss": 0.0619, + "step": 13818 + }, + { + "epoch": 2.2389825016202205, + "grad_norm": 1.0076230764389038, + "learning_rate": 7.973864565531833e-07, + "loss": 0.0616, + "step": 13819 + }, + { + "epoch": 2.239144523655217, + "grad_norm": 1.0264263153076172, + "learning_rate": 7.970662705344812e-07, + "loss": 0.058, + "step": 13820 + }, + { + "epoch": 2.239306545690214, + "grad_norm": 0.7906507253646851, + "learning_rate": 7.967461366214293e-07, + "loss": 0.0569, + "step": 13821 + }, + { + "epoch": 2.2394685677252104, + "grad_norm": 0.7899366617202759, + "learning_rate": 7.964260548238242e-07, + "loss": 0.0547, + "step": 13822 + }, + { + "epoch": 2.2396305897602073, + "grad_norm": 0.9811822772026062, + "learning_rate": 7.961060251514591e-07, + "loss": 0.0609, + "step": 13823 + }, + { + "epoch": 2.2397926117952043, + "grad_norm": 0.7992488145828247, + "learning_rate": 7.957860476141261e-07, + "loss": 0.0589, + "step": 13824 + }, + { + "epoch": 2.2399546338302008, + "grad_norm": 0.9526104927062988, + "learning_rate": 7.954661222216162e-07, + "loss": 0.067, + "step": 13825 + }, + { + "epoch": 2.2401166558651977, + "grad_norm": 0.8279509544372559, + "learning_rate": 7.951462489837178e-07, + "loss": 0.0608, + "step": 13826 + }, + { + "epoch": 2.2402786779001946, + "grad_norm": 0.9480863213539124, + "learning_rate": 7.948264279102186e-07, + "loss": 0.0639, + "step": 13827 + }, + { + "epoch": 2.240440699935191, + "grad_norm": 1.0198253393173218, + "learning_rate": 7.945066590109044e-07, + "loss": 0.0667, + "step": 13828 + }, + { + "epoch": 2.240602721970188, + "grad_norm": 0.9767336845397949, + "learning_rate": 7.941869422955592e-07, + "loss": 0.0663, + "step": 13829 + }, + { + "epoch": 2.2407647440051845, + "grad_norm": 0.8995802402496338, + "learning_rate": 7.938672777739654e-07, + "loss": 0.0588, + "step": 13830 + }, + { + "epoch": 2.2409267660401815, + "grad_norm": 1.019019603729248, + "learning_rate": 7.935476654559052e-07, + "loss": 0.0574, + "step": 13831 + }, + { + "epoch": 2.2410887880751784, + "grad_norm": 0.8723592758178711, + "learning_rate": 7.932281053511559e-07, + "loss": 0.0649, + "step": 13832 + }, + { + "epoch": 2.241250810110175, + "grad_norm": 1.199785590171814, + "learning_rate": 7.929085974694956e-07, + "loss": 0.0671, + "step": 13833 + }, + { + "epoch": 2.241412832145172, + "grad_norm": 0.9804271459579468, + "learning_rate": 7.925891418207024e-07, + "loss": 0.0613, + "step": 13834 + }, + { + "epoch": 2.2415748541801683, + "grad_norm": 0.7946493625640869, + "learning_rate": 7.922697384145492e-07, + "loss": 0.0595, + "step": 13835 + }, + { + "epoch": 2.2417368762151653, + "grad_norm": 0.9132854342460632, + "learning_rate": 7.919503872608092e-07, + "loss": 0.0628, + "step": 13836 + }, + { + "epoch": 2.241898898250162, + "grad_norm": 0.9299383163452148, + "learning_rate": 7.916310883692532e-07, + "loss": 0.07, + "step": 13837 + }, + { + "epoch": 2.2420609202851587, + "grad_norm": 0.9028305411338806, + "learning_rate": 7.913118417496532e-07, + "loss": 0.069, + "step": 13838 + }, + { + "epoch": 2.2422229423201556, + "grad_norm": 0.9909502267837524, + "learning_rate": 7.909926474117752e-07, + "loss": 0.0608, + "step": 13839 + }, + { + "epoch": 2.242384964355152, + "grad_norm": 1.0152370929718018, + "learning_rate": 7.906735053653866e-07, + "loss": 0.0648, + "step": 13840 + }, + { + "epoch": 2.242546986390149, + "grad_norm": 0.8076937794685364, + "learning_rate": 7.90354415620252e-07, + "loss": 0.0589, + "step": 13841 + }, + { + "epoch": 2.242709008425146, + "grad_norm": 0.9617068767547607, + "learning_rate": 7.900353781861353e-07, + "loss": 0.0603, + "step": 13842 + }, + { + "epoch": 2.2428710304601425, + "grad_norm": 1.0339735746383667, + "learning_rate": 7.897163930727977e-07, + "loss": 0.0601, + "step": 13843 + }, + { + "epoch": 2.2430330524951394, + "grad_norm": 1.0885800123214722, + "learning_rate": 7.893974602899998e-07, + "loss": 0.0664, + "step": 13844 + }, + { + "epoch": 2.243195074530136, + "grad_norm": 0.8064873218536377, + "learning_rate": 7.890785798475001e-07, + "loss": 0.0589, + "step": 13845 + }, + { + "epoch": 2.243357096565133, + "grad_norm": 0.8364428281784058, + "learning_rate": 7.887597517550564e-07, + "loss": 0.0587, + "step": 13846 + }, + { + "epoch": 2.2435191186001298, + "grad_norm": 1.0745041370391846, + "learning_rate": 7.884409760224212e-07, + "loss": 0.074, + "step": 13847 + }, + { + "epoch": 2.2436811406351262, + "grad_norm": 0.9691507816314697, + "learning_rate": 7.881222526593513e-07, + "loss": 0.0607, + "step": 13848 + }, + { + "epoch": 2.243843162670123, + "grad_norm": 0.9089518189430237, + "learning_rate": 7.878035816755985e-07, + "loss": 0.0575, + "step": 13849 + }, + { + "epoch": 2.24400518470512, + "grad_norm": 0.8140989542007446, + "learning_rate": 7.87484963080912e-07, + "loss": 0.0586, + "step": 13850 + }, + { + "epoch": 2.2441672067401166, + "grad_norm": 0.6760884523391724, + "learning_rate": 7.871663968850413e-07, + "loss": 0.0508, + "step": 13851 + }, + { + "epoch": 2.2443292287751135, + "grad_norm": 0.9919477105140686, + "learning_rate": 7.868478830977331e-07, + "loss": 0.0655, + "step": 13852 + }, + { + "epoch": 2.24449125081011, + "grad_norm": 0.8631847500801086, + "learning_rate": 7.865294217287356e-07, + "loss": 0.0672, + "step": 13853 + }, + { + "epoch": 2.244653272845107, + "grad_norm": 0.964144229888916, + "learning_rate": 7.862110127877903e-07, + "loss": 0.0652, + "step": 13854 + }, + { + "epoch": 2.244815294880104, + "grad_norm": 0.8476447463035583, + "learning_rate": 7.858926562846409e-07, + "loss": 0.0572, + "step": 13855 + }, + { + "epoch": 2.2449773169151004, + "grad_norm": 0.7753357887268066, + "learning_rate": 7.855743522290283e-07, + "loss": 0.056, + "step": 13856 + }, + { + "epoch": 2.2451393389500973, + "grad_norm": 0.9003604650497437, + "learning_rate": 7.852561006306913e-07, + "loss": 0.0638, + "step": 13857 + }, + { + "epoch": 2.245301360985094, + "grad_norm": 0.8805102109909058, + "learning_rate": 7.849379014993683e-07, + "loss": 0.0593, + "step": 13858 + }, + { + "epoch": 2.2454633830200907, + "grad_norm": 1.0749431848526, + "learning_rate": 7.84619754844795e-07, + "loss": 0.07, + "step": 13859 + }, + { + "epoch": 2.2456254050550877, + "grad_norm": 0.8165993094444275, + "learning_rate": 7.84301660676707e-07, + "loss": 0.0574, + "step": 13860 + }, + { + "epoch": 2.245787427090084, + "grad_norm": 0.9074599742889404, + "learning_rate": 7.839836190048344e-07, + "loss": 0.0617, + "step": 13861 + }, + { + "epoch": 2.245949449125081, + "grad_norm": 0.8614346981048584, + "learning_rate": 7.836656298389114e-07, + "loss": 0.0585, + "step": 13862 + }, + { + "epoch": 2.2461114711600776, + "grad_norm": 0.908950924873352, + "learning_rate": 7.833476931886666e-07, + "loss": 0.0617, + "step": 13863 + }, + { + "epoch": 2.2462734931950745, + "grad_norm": 0.9891899228096008, + "learning_rate": 7.830298090638291e-07, + "loss": 0.0613, + "step": 13864 + }, + { + "epoch": 2.2464355152300715, + "grad_norm": 0.8221472501754761, + "learning_rate": 7.827119774741238e-07, + "loss": 0.0609, + "step": 13865 + }, + { + "epoch": 2.246597537265068, + "grad_norm": 0.8985829949378967, + "learning_rate": 7.823941984292752e-07, + "loss": 0.0644, + "step": 13866 + }, + { + "epoch": 2.246759559300065, + "grad_norm": 1.03141450881958, + "learning_rate": 7.82076471939009e-07, + "loss": 0.0713, + "step": 13867 + }, + { + "epoch": 2.2469215813350614, + "grad_norm": 0.8261798620223999, + "learning_rate": 7.817587980130451e-07, + "loss": 0.0625, + "step": 13868 + }, + { + "epoch": 2.2470836033700583, + "grad_norm": 0.9878301620483398, + "learning_rate": 7.814411766611035e-07, + "loss": 0.0581, + "step": 13869 + }, + { + "epoch": 2.2472456254050552, + "grad_norm": 0.9584047794342041, + "learning_rate": 7.811236078929033e-07, + "loss": 0.0579, + "step": 13870 + }, + { + "epoch": 2.2474076474400517, + "grad_norm": 1.0649832487106323, + "learning_rate": 7.808060917181609e-07, + "loss": 0.0656, + "step": 13871 + }, + { + "epoch": 2.2475696694750487, + "grad_norm": 0.9474183917045593, + "learning_rate": 7.80488628146592e-07, + "loss": 0.0678, + "step": 13872 + }, + { + "epoch": 2.2477316915100456, + "grad_norm": 1.1049025058746338, + "learning_rate": 7.801712171879098e-07, + "loss": 0.0667, + "step": 13873 + }, + { + "epoch": 2.247893713545042, + "grad_norm": 0.8262870907783508, + "learning_rate": 7.798538588518265e-07, + "loss": 0.0596, + "step": 13874 + }, + { + "epoch": 2.248055735580039, + "grad_norm": 0.9269388914108276, + "learning_rate": 7.795365531480531e-07, + "loss": 0.0747, + "step": 13875 + }, + { + "epoch": 2.2482177576150355, + "grad_norm": 0.8992902040481567, + "learning_rate": 7.792193000862964e-07, + "loss": 0.0583, + "step": 13876 + }, + { + "epoch": 2.2483797796500324, + "grad_norm": 0.9247242212295532, + "learning_rate": 7.789020996762656e-07, + "loss": 0.0561, + "step": 13877 + }, + { + "epoch": 2.248541801685029, + "grad_norm": 0.9147598147392273, + "learning_rate": 7.785849519276661e-07, + "loss": 0.0595, + "step": 13878 + }, + { + "epoch": 2.248703823720026, + "grad_norm": 0.9816052913665771, + "learning_rate": 7.782678568502008e-07, + "loss": 0.0665, + "step": 13879 + }, + { + "epoch": 2.248865845755023, + "grad_norm": 0.8535481691360474, + "learning_rate": 7.779508144535725e-07, + "loss": 0.0621, + "step": 13880 + }, + { + "epoch": 2.2490278677900193, + "grad_norm": 0.8713977336883545, + "learning_rate": 7.776338247474812e-07, + "loss": 0.0641, + "step": 13881 + }, + { + "epoch": 2.249189889825016, + "grad_norm": 0.8746277093887329, + "learning_rate": 7.773168877416285e-07, + "loss": 0.058, + "step": 13882 + }, + { + "epoch": 2.249351911860013, + "grad_norm": 0.837213397026062, + "learning_rate": 7.770000034457092e-07, + "loss": 0.0616, + "step": 13883 + }, + { + "epoch": 2.2495139338950096, + "grad_norm": 0.8210061192512512, + "learning_rate": 7.766831718694204e-07, + "loss": 0.0613, + "step": 13884 + }, + { + "epoch": 2.2496759559300066, + "grad_norm": 0.8450611233711243, + "learning_rate": 7.763663930224563e-07, + "loss": 0.0617, + "step": 13885 + }, + { + "epoch": 2.249837977965003, + "grad_norm": 1.0194191932678223, + "learning_rate": 7.760496669145093e-07, + "loss": 0.0662, + "step": 13886 + }, + { + "epoch": 2.25, + "grad_norm": 0.9273967146873474, + "learning_rate": 7.757329935552707e-07, + "loss": 0.0602, + "step": 13887 + }, + { + "epoch": 2.250162022034997, + "grad_norm": 0.8759946227073669, + "learning_rate": 7.754163729544297e-07, + "loss": 0.059, + "step": 13888 + }, + { + "epoch": 2.2503240440699934, + "grad_norm": 0.870991051197052, + "learning_rate": 7.750998051216743e-07, + "loss": 0.0674, + "step": 13889 + }, + { + "epoch": 2.2504860661049904, + "grad_norm": 0.8487216830253601, + "learning_rate": 7.747832900666907e-07, + "loss": 0.0642, + "step": 13890 + }, + { + "epoch": 2.250648088139987, + "grad_norm": 1.057746171951294, + "learning_rate": 7.744668277991635e-07, + "loss": 0.0703, + "step": 13891 + }, + { + "epoch": 2.250810110174984, + "grad_norm": 0.8206788301467896, + "learning_rate": 7.741504183287757e-07, + "loss": 0.0604, + "step": 13892 + }, + { + "epoch": 2.2509721322099807, + "grad_norm": 0.9289180040359497, + "learning_rate": 7.738340616652096e-07, + "loss": 0.0557, + "step": 13893 + }, + { + "epoch": 2.251134154244977, + "grad_norm": 0.8246344923973083, + "learning_rate": 7.73517757818143e-07, + "loss": 0.0589, + "step": 13894 + }, + { + "epoch": 2.251296176279974, + "grad_norm": 0.9617959856987, + "learning_rate": 7.73201506797254e-07, + "loss": 0.0698, + "step": 13895 + }, + { + "epoch": 2.251458198314971, + "grad_norm": 0.9463308453559875, + "learning_rate": 7.728853086122212e-07, + "loss": 0.062, + "step": 13896 + }, + { + "epoch": 2.2516202203499676, + "grad_norm": 0.9526486396789551, + "learning_rate": 7.725691632727192e-07, + "loss": 0.0671, + "step": 13897 + }, + { + "epoch": 2.2517822423849645, + "grad_norm": 1.0356853008270264, + "learning_rate": 7.722530707884196e-07, + "loss": 0.0744, + "step": 13898 + }, + { + "epoch": 2.251944264419961, + "grad_norm": 0.8983619213104248, + "learning_rate": 7.71937031168995e-07, + "loss": 0.0556, + "step": 13899 + }, + { + "epoch": 2.252106286454958, + "grad_norm": 0.8707557916641235, + "learning_rate": 7.716210444241154e-07, + "loss": 0.0623, + "step": 13900 + }, + { + "epoch": 2.2522683084899544, + "grad_norm": 0.9758105874061584, + "learning_rate": 7.713051105634492e-07, + "loss": 0.0633, + "step": 13901 + }, + { + "epoch": 2.2524303305249513, + "grad_norm": 0.8898715972900391, + "learning_rate": 7.709892295966634e-07, + "loss": 0.056, + "step": 13902 + }, + { + "epoch": 2.2525923525599483, + "grad_norm": 0.957287609577179, + "learning_rate": 7.706734015334228e-07, + "loss": 0.0673, + "step": 13903 + }, + { + "epoch": 2.2527543745949448, + "grad_norm": 0.8354018330574036, + "learning_rate": 7.703576263833915e-07, + "loss": 0.0635, + "step": 13904 + }, + { + "epoch": 2.2529163966299417, + "grad_norm": 0.9206665754318237, + "learning_rate": 7.70041904156231e-07, + "loss": 0.058, + "step": 13905 + }, + { + "epoch": 2.2530784186649386, + "grad_norm": 0.9412673711776733, + "learning_rate": 7.697262348616019e-07, + "loss": 0.0661, + "step": 13906 + }, + { + "epoch": 2.253240440699935, + "grad_norm": 0.8687724471092224, + "learning_rate": 7.694106185091627e-07, + "loss": 0.062, + "step": 13907 + }, + { + "epoch": 2.253402462734932, + "grad_norm": 1.0838840007781982, + "learning_rate": 7.690950551085716e-07, + "loss": 0.0682, + "step": 13908 + }, + { + "epoch": 2.2535644847699285, + "grad_norm": 0.8716097474098206, + "learning_rate": 7.687795446694815e-07, + "loss": 0.0592, + "step": 13909 + }, + { + "epoch": 2.2537265068049255, + "grad_norm": 0.7702820897102356, + "learning_rate": 7.684640872015484e-07, + "loss": 0.0537, + "step": 13910 + }, + { + "epoch": 2.2538885288399224, + "grad_norm": 0.8733231425285339, + "learning_rate": 7.68148682714425e-07, + "loss": 0.0612, + "step": 13911 + }, + { + "epoch": 2.254050550874919, + "grad_norm": 0.8055692315101624, + "learning_rate": 7.678333312177602e-07, + "loss": 0.0562, + "step": 13912 + }, + { + "epoch": 2.254212572909916, + "grad_norm": 0.948461651802063, + "learning_rate": 7.675180327212037e-07, + "loss": 0.0644, + "step": 13913 + }, + { + "epoch": 2.2543745949449123, + "grad_norm": 0.8782212138175964, + "learning_rate": 7.672027872344017e-07, + "loss": 0.063, + "step": 13914 + }, + { + "epoch": 2.2545366169799093, + "grad_norm": 0.9695743322372437, + "learning_rate": 7.668875947670032e-07, + "loss": 0.0607, + "step": 13915 + }, + { + "epoch": 2.254698639014906, + "grad_norm": 0.8532809615135193, + "learning_rate": 7.665724553286491e-07, + "loss": 0.0584, + "step": 13916 + }, + { + "epoch": 2.2548606610499027, + "grad_norm": 0.8807068467140198, + "learning_rate": 7.662573689289832e-07, + "loss": 0.0658, + "step": 13917 + }, + { + "epoch": 2.2550226830848996, + "grad_norm": 0.8487615585327148, + "learning_rate": 7.659423355776463e-07, + "loss": 0.0602, + "step": 13918 + }, + { + "epoch": 2.2551847051198965, + "grad_norm": 0.9274862408638, + "learning_rate": 7.656273552842774e-07, + "loss": 0.0589, + "step": 13919 + }, + { + "epoch": 2.255346727154893, + "grad_norm": 0.8932498693466187, + "learning_rate": 7.653124280585145e-07, + "loss": 0.0658, + "step": 13920 + }, + { + "epoch": 2.25550874918989, + "grad_norm": 0.9409620761871338, + "learning_rate": 7.649975539099935e-07, + "loss": 0.0634, + "step": 13921 + }, + { + "epoch": 2.2556707712248865, + "grad_norm": 0.9468281269073486, + "learning_rate": 7.646827328483486e-07, + "loss": 0.0615, + "step": 13922 + }, + { + "epoch": 2.2558327932598834, + "grad_norm": 1.0393003225326538, + "learning_rate": 7.643679648832133e-07, + "loss": 0.0657, + "step": 13923 + }, + { + "epoch": 2.25599481529488, + "grad_norm": 0.9196584820747375, + "learning_rate": 7.64053250024217e-07, + "loss": 0.0633, + "step": 13924 + }, + { + "epoch": 2.256156837329877, + "grad_norm": 1.1040884256362915, + "learning_rate": 7.637385882809909e-07, + "loss": 0.0673, + "step": 13925 + }, + { + "epoch": 2.2563188593648738, + "grad_norm": 1.06027352809906, + "learning_rate": 7.634239796631629e-07, + "loss": 0.0616, + "step": 13926 + }, + { + "epoch": 2.2564808813998702, + "grad_norm": 1.0992965698242188, + "learning_rate": 7.631094241803582e-07, + "loss": 0.078, + "step": 13927 + }, + { + "epoch": 2.256642903434867, + "grad_norm": 0.9714069366455078, + "learning_rate": 7.62794921842201e-07, + "loss": 0.0667, + "step": 13928 + }, + { + "epoch": 2.256804925469864, + "grad_norm": 1.0589038133621216, + "learning_rate": 7.624804726583169e-07, + "loss": 0.0668, + "step": 13929 + }, + { + "epoch": 2.2569669475048606, + "grad_norm": 0.8686789870262146, + "learning_rate": 7.621660766383246e-07, + "loss": 0.0573, + "step": 13930 + }, + { + "epoch": 2.2571289695398575, + "grad_norm": 0.8950777649879456, + "learning_rate": 7.618517337918451e-07, + "loss": 0.0524, + "step": 13931 + }, + { + "epoch": 2.257290991574854, + "grad_norm": 0.8135221600532532, + "learning_rate": 7.615374441284962e-07, + "loss": 0.0596, + "step": 13932 + }, + { + "epoch": 2.257453013609851, + "grad_norm": 0.8295992612838745, + "learning_rate": 7.612232076578946e-07, + "loss": 0.0618, + "step": 13933 + }, + { + "epoch": 2.257615035644848, + "grad_norm": 0.9673225283622742, + "learning_rate": 7.60909024389655e-07, + "loss": 0.0575, + "step": 13934 + }, + { + "epoch": 2.2577770576798444, + "grad_norm": 0.8375781178474426, + "learning_rate": 7.605948943333908e-07, + "loss": 0.0577, + "step": 13935 + }, + { + "epoch": 2.2579390797148413, + "grad_norm": 1.1369965076446533, + "learning_rate": 7.602808174987137e-07, + "loss": 0.0696, + "step": 13936 + }, + { + "epoch": 2.258101101749838, + "grad_norm": 0.774905800819397, + "learning_rate": 7.599667938952341e-07, + "loss": 0.0567, + "step": 13937 + }, + { + "epoch": 2.2582631237848347, + "grad_norm": 0.8955923914909363, + "learning_rate": 7.596528235325582e-07, + "loss": 0.0651, + "step": 13938 + }, + { + "epoch": 2.2584251458198317, + "grad_norm": 0.830094039440155, + "learning_rate": 7.593389064202952e-07, + "loss": 0.0658, + "step": 13939 + }, + { + "epoch": 2.258587167854828, + "grad_norm": 0.7732976675033569, + "learning_rate": 7.590250425680496e-07, + "loss": 0.0535, + "step": 13940 + }, + { + "epoch": 2.258749189889825, + "grad_norm": 1.0507856607437134, + "learning_rate": 7.58711231985425e-07, + "loss": 0.0673, + "step": 13941 + }, + { + "epoch": 2.2589112119248216, + "grad_norm": 0.9185804128646851, + "learning_rate": 7.583974746820222e-07, + "loss": 0.0662, + "step": 13942 + }, + { + "epoch": 2.2590732339598185, + "grad_norm": 0.9794391393661499, + "learning_rate": 7.580837706674415e-07, + "loss": 0.0692, + "step": 13943 + }, + { + "epoch": 2.2592352559948155, + "grad_norm": 0.8743561506271362, + "learning_rate": 7.577701199512835e-07, + "loss": 0.0611, + "step": 13944 + }, + { + "epoch": 2.259397278029812, + "grad_norm": 0.845887303352356, + "learning_rate": 7.574565225431427e-07, + "loss": 0.0609, + "step": 13945 + }, + { + "epoch": 2.259559300064809, + "grad_norm": 1.0307512283325195, + "learning_rate": 7.571429784526157e-07, + "loss": 0.0634, + "step": 13946 + }, + { + "epoch": 2.2597213220998054, + "grad_norm": 0.8212060332298279, + "learning_rate": 7.56829487689296e-07, + "loss": 0.0584, + "step": 13947 + }, + { + "epoch": 2.2598833441348023, + "grad_norm": 0.892123281955719, + "learning_rate": 7.565160502627752e-07, + "loss": 0.0619, + "step": 13948 + }, + { + "epoch": 2.2600453661697992, + "grad_norm": 0.8927651643753052, + "learning_rate": 7.562026661826446e-07, + "loss": 0.063, + "step": 13949 + }, + { + "epoch": 2.2602073882047957, + "grad_norm": 0.908979594707489, + "learning_rate": 7.558893354584923e-07, + "loss": 0.06, + "step": 13950 + }, + { + "epoch": 2.2603694102397927, + "grad_norm": 0.8717660903930664, + "learning_rate": 7.555760580999055e-07, + "loss": 0.0631, + "step": 13951 + }, + { + "epoch": 2.2605314322747896, + "grad_norm": 1.0289645195007324, + "learning_rate": 7.5526283411647e-07, + "loss": 0.0634, + "step": 13952 + }, + { + "epoch": 2.260693454309786, + "grad_norm": 0.8531572222709656, + "learning_rate": 7.549496635177698e-07, + "loss": 0.0628, + "step": 13953 + }, + { + "epoch": 2.260855476344783, + "grad_norm": 0.8821384310722351, + "learning_rate": 7.546365463133867e-07, + "loss": 0.0641, + "step": 13954 + }, + { + "epoch": 2.2610174983797795, + "grad_norm": 0.923827052116394, + "learning_rate": 7.543234825129026e-07, + "loss": 0.0629, + "step": 13955 + }, + { + "epoch": 2.2611795204147764, + "grad_norm": 0.8377568125724792, + "learning_rate": 7.540104721258945e-07, + "loss": 0.0635, + "step": 13956 + }, + { + "epoch": 2.261341542449773, + "grad_norm": 0.8890910148620605, + "learning_rate": 7.5369751516194e-07, + "loss": 0.0614, + "step": 13957 + }, + { + "epoch": 2.26150356448477, + "grad_norm": 1.1075289249420166, + "learning_rate": 7.533846116306162e-07, + "loss": 0.0598, + "step": 13958 + }, + { + "epoch": 2.261665586519767, + "grad_norm": 0.8395311236381531, + "learning_rate": 7.530717615414976e-07, + "loss": 0.0583, + "step": 13959 + }, + { + "epoch": 2.2618276085547633, + "grad_norm": 0.872234046459198, + "learning_rate": 7.527589649041548e-07, + "loss": 0.0643, + "step": 13960 + }, + { + "epoch": 2.26198963058976, + "grad_norm": 0.8673476576805115, + "learning_rate": 7.524462217281592e-07, + "loss": 0.0624, + "step": 13961 + }, + { + "epoch": 2.262151652624757, + "grad_norm": 0.9594100117683411, + "learning_rate": 7.521335320230804e-07, + "loss": 0.0614, + "step": 13962 + }, + { + "epoch": 2.2623136746597536, + "grad_norm": 0.8554080128669739, + "learning_rate": 7.518208957984857e-07, + "loss": 0.0596, + "step": 13963 + }, + { + "epoch": 2.2624756966947506, + "grad_norm": 0.8185822367668152, + "learning_rate": 7.515083130639411e-07, + "loss": 0.0617, + "step": 13964 + }, + { + "epoch": 2.262637718729747, + "grad_norm": 0.7946313619613647, + "learning_rate": 7.51195783829011e-07, + "loss": 0.0602, + "step": 13965 + }, + { + "epoch": 2.262799740764744, + "grad_norm": 0.8937302827835083, + "learning_rate": 7.508833081032577e-07, + "loss": 0.065, + "step": 13966 + }, + { + "epoch": 2.262961762799741, + "grad_norm": 0.9195818901062012, + "learning_rate": 7.505708858962424e-07, + "loss": 0.0651, + "step": 13967 + }, + { + "epoch": 2.2631237848347374, + "grad_norm": 0.9528251886367798, + "learning_rate": 7.502585172175244e-07, + "loss": 0.0698, + "step": 13968 + }, + { + "epoch": 2.2632858068697344, + "grad_norm": 0.9592039585113525, + "learning_rate": 7.499462020766618e-07, + "loss": 0.0562, + "step": 13969 + }, + { + "epoch": 2.263447828904731, + "grad_norm": 0.8582433462142944, + "learning_rate": 7.496339404832109e-07, + "loss": 0.0619, + "step": 13970 + }, + { + "epoch": 2.2636098509397278, + "grad_norm": 0.9389162063598633, + "learning_rate": 7.493217324467239e-07, + "loss": 0.071, + "step": 13971 + }, + { + "epoch": 2.2637718729747247, + "grad_norm": 0.9363165497779846, + "learning_rate": 7.490095779767564e-07, + "loss": 0.0612, + "step": 13972 + }, + { + "epoch": 2.263933895009721, + "grad_norm": 0.8165293335914612, + "learning_rate": 7.486974770828592e-07, + "loss": 0.0586, + "step": 13973 + }, + { + "epoch": 2.264095917044718, + "grad_norm": 0.822784423828125, + "learning_rate": 7.483854297745805e-07, + "loss": 0.0595, + "step": 13974 + }, + { + "epoch": 2.264257939079715, + "grad_norm": 1.0148231983184814, + "learning_rate": 7.480734360614686e-07, + "loss": 0.0658, + "step": 13975 + }, + { + "epoch": 2.2644199611147116, + "grad_norm": 0.8952381610870361, + "learning_rate": 7.47761495953069e-07, + "loss": 0.0551, + "step": 13976 + }, + { + "epoch": 2.2645819831497085, + "grad_norm": 1.0946059226989746, + "learning_rate": 7.474496094589292e-07, + "loss": 0.069, + "step": 13977 + }, + { + "epoch": 2.264744005184705, + "grad_norm": 0.7693538069725037, + "learning_rate": 7.471377765885893e-07, + "loss": 0.0551, + "step": 13978 + }, + { + "epoch": 2.264906027219702, + "grad_norm": 0.9450730681419373, + "learning_rate": 7.468259973515918e-07, + "loss": 0.0671, + "step": 13979 + }, + { + "epoch": 2.2650680492546984, + "grad_norm": 0.9570711255073547, + "learning_rate": 7.465142717574761e-07, + "loss": 0.0681, + "step": 13980 + }, + { + "epoch": 2.2652300712896953, + "grad_norm": 1.0384302139282227, + "learning_rate": 7.462025998157801e-07, + "loss": 0.0637, + "step": 13981 + }, + { + "epoch": 2.2653920933246923, + "grad_norm": 0.9101808071136475, + "learning_rate": 7.458909815360407e-07, + "loss": 0.0672, + "step": 13982 + }, + { + "epoch": 2.2655541153596888, + "grad_norm": 0.8050899505615234, + "learning_rate": 7.455794169277922e-07, + "loss": 0.0546, + "step": 13983 + }, + { + "epoch": 2.2657161373946857, + "grad_norm": 0.863530695438385, + "learning_rate": 7.45267906000568e-07, + "loss": 0.0597, + "step": 13984 + }, + { + "epoch": 2.2658781594296826, + "grad_norm": 0.8293231129646301, + "learning_rate": 7.449564487639005e-07, + "loss": 0.0586, + "step": 13985 + }, + { + "epoch": 2.266040181464679, + "grad_norm": 0.9019443988800049, + "learning_rate": 7.446450452273168e-07, + "loss": 0.0652, + "step": 13986 + }, + { + "epoch": 2.266202203499676, + "grad_norm": 1.059104084968567, + "learning_rate": 7.443336954003474e-07, + "loss": 0.0654, + "step": 13987 + }, + { + "epoch": 2.2663642255346725, + "grad_norm": 0.8297203779220581, + "learning_rate": 7.440223992925194e-07, + "loss": 0.054, + "step": 13988 + }, + { + "epoch": 2.2665262475696695, + "grad_norm": 1.012291669845581, + "learning_rate": 7.437111569133556e-07, + "loss": 0.0667, + "step": 13989 + }, + { + "epoch": 2.2666882696046664, + "grad_norm": 0.8839371800422668, + "learning_rate": 7.433999682723805e-07, + "loss": 0.0632, + "step": 13990 + }, + { + "epoch": 2.266850291639663, + "grad_norm": 0.8371898531913757, + "learning_rate": 7.430888333791144e-07, + "loss": 0.0566, + "step": 13991 + }, + { + "epoch": 2.26701231367466, + "grad_norm": 0.8507769703865051, + "learning_rate": 7.427777522430804e-07, + "loss": 0.0567, + "step": 13992 + }, + { + "epoch": 2.2671743357096563, + "grad_norm": 0.8428038358688354, + "learning_rate": 7.424667248737936e-07, + "loss": 0.0611, + "step": 13993 + }, + { + "epoch": 2.2673363577446533, + "grad_norm": 0.8883320093154907, + "learning_rate": 7.42155751280772e-07, + "loss": 0.0616, + "step": 13994 + }, + { + "epoch": 2.26749837977965, + "grad_norm": 0.839263916015625, + "learning_rate": 7.418448314735305e-07, + "loss": 0.0583, + "step": 13995 + }, + { + "epoch": 2.2676604018146467, + "grad_norm": 0.9638897180557251, + "learning_rate": 7.415339654615824e-07, + "loss": 0.0617, + "step": 13996 + }, + { + "epoch": 2.2678224238496436, + "grad_norm": 0.8128506541252136, + "learning_rate": 7.412231532544398e-07, + "loss": 0.0549, + "step": 13997 + }, + { + "epoch": 2.2679844458846405, + "grad_norm": 0.9353517889976501, + "learning_rate": 7.409123948616123e-07, + "loss": 0.0626, + "step": 13998 + }, + { + "epoch": 2.268146467919637, + "grad_norm": 0.7773940563201904, + "learning_rate": 7.406016902926094e-07, + "loss": 0.0569, + "step": 13999 + }, + { + "epoch": 2.268308489954634, + "grad_norm": 0.9288927912712097, + "learning_rate": 7.402910395569357e-07, + "loss": 0.0608, + "step": 14000 + }, + { + "epoch": 2.2684705119896305, + "grad_norm": 0.931483268737793, + "learning_rate": 7.399804426640983e-07, + "loss": 0.0536, + "step": 14001 + }, + { + "epoch": 2.2686325340246274, + "grad_norm": 0.8563708662986755, + "learning_rate": 7.396698996236004e-07, + "loss": 0.0656, + "step": 14002 + }, + { + "epoch": 2.268794556059624, + "grad_norm": 0.9786216020584106, + "learning_rate": 7.393594104449445e-07, + "loss": 0.0657, + "step": 14003 + }, + { + "epoch": 2.268956578094621, + "grad_norm": 0.9174492955207825, + "learning_rate": 7.39048975137629e-07, + "loss": 0.0639, + "step": 14004 + }, + { + "epoch": 2.2691186001296177, + "grad_norm": 0.8567657470703125, + "learning_rate": 7.387385937111527e-07, + "loss": 0.0573, + "step": 14005 + }, + { + "epoch": 2.2692806221646142, + "grad_norm": 0.9229654669761658, + "learning_rate": 7.38428266175015e-07, + "loss": 0.0627, + "step": 14006 + }, + { + "epoch": 2.269442644199611, + "grad_norm": 0.8565087914466858, + "learning_rate": 7.381179925387086e-07, + "loss": 0.0587, + "step": 14007 + }, + { + "epoch": 2.269604666234608, + "grad_norm": 0.9805523753166199, + "learning_rate": 7.378077728117277e-07, + "loss": 0.0606, + "step": 14008 + }, + { + "epoch": 2.2697666882696046, + "grad_norm": 0.7847617268562317, + "learning_rate": 7.374976070035647e-07, + "loss": 0.0519, + "step": 14009 + }, + { + "epoch": 2.2699287103046015, + "grad_norm": 0.8503203392028809, + "learning_rate": 7.371874951237099e-07, + "loss": 0.058, + "step": 14010 + }, + { + "epoch": 2.270090732339598, + "grad_norm": 0.955046534538269, + "learning_rate": 7.368774371816517e-07, + "loss": 0.0654, + "step": 14011 + }, + { + "epoch": 2.270252754374595, + "grad_norm": 0.9594247937202454, + "learning_rate": 7.365674331868772e-07, + "loss": 0.0609, + "step": 14012 + }, + { + "epoch": 2.270414776409592, + "grad_norm": 0.8345913887023926, + "learning_rate": 7.36257483148872e-07, + "loss": 0.0617, + "step": 14013 + }, + { + "epoch": 2.2705767984445884, + "grad_norm": 0.8707557916641235, + "learning_rate": 7.359475870771202e-07, + "loss": 0.0613, + "step": 14014 + }, + { + "epoch": 2.2707388204795853, + "grad_norm": 0.9228283166885376, + "learning_rate": 7.356377449811017e-07, + "loss": 0.0598, + "step": 14015 + }, + { + "epoch": 2.270900842514582, + "grad_norm": 1.0070610046386719, + "learning_rate": 7.353279568702995e-07, + "loss": 0.0629, + "step": 14016 + }, + { + "epoch": 2.2710628645495787, + "grad_norm": 0.901984453201294, + "learning_rate": 7.350182227541922e-07, + "loss": 0.0562, + "step": 14017 + }, + { + "epoch": 2.2712248865845757, + "grad_norm": 0.9663466215133667, + "learning_rate": 7.347085426422551e-07, + "loss": 0.0641, + "step": 14018 + }, + { + "epoch": 2.271386908619572, + "grad_norm": 0.9153159856796265, + "learning_rate": 7.343989165439641e-07, + "loss": 0.0643, + "step": 14019 + }, + { + "epoch": 2.271548930654569, + "grad_norm": 0.9758960008621216, + "learning_rate": 7.340893444687944e-07, + "loss": 0.0615, + "step": 14020 + }, + { + "epoch": 2.271710952689566, + "grad_norm": 0.8344132900238037, + "learning_rate": 7.33779826426218e-07, + "loss": 0.0622, + "step": 14021 + }, + { + "epoch": 2.2718729747245625, + "grad_norm": 0.9549560546875, + "learning_rate": 7.334703624257039e-07, + "loss": 0.0642, + "step": 14022 + }, + { + "epoch": 2.2720349967595594, + "grad_norm": 0.8046680688858032, + "learning_rate": 7.331609524767219e-07, + "loss": 0.0568, + "step": 14023 + }, + { + "epoch": 2.272197018794556, + "grad_norm": 0.9434444904327393, + "learning_rate": 7.328515965887389e-07, + "loss": 0.0654, + "step": 14024 + }, + { + "epoch": 2.272359040829553, + "grad_norm": 0.8848881721496582, + "learning_rate": 7.32542294771221e-07, + "loss": 0.0582, + "step": 14025 + }, + { + "epoch": 2.2725210628645494, + "grad_norm": 0.8448486328125, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0562, + "step": 14026 + }, + { + "epoch": 2.2726830848995463, + "grad_norm": 0.896988034248352, + "learning_rate": 7.319238533854328e-07, + "loss": 0.0596, + "step": 14027 + }, + { + "epoch": 2.2728451069345432, + "grad_norm": 0.843999981880188, + "learning_rate": 7.316147138360855e-07, + "loss": 0.0583, + "step": 14028 + }, + { + "epoch": 2.2730071289695397, + "grad_norm": 0.8346846699714661, + "learning_rate": 7.313056283950487e-07, + "loss": 0.0557, + "step": 14029 + }, + { + "epoch": 2.2731691510045366, + "grad_norm": 0.9750611186027527, + "learning_rate": 7.309965970717795e-07, + "loss": 0.0644, + "step": 14030 + }, + { + "epoch": 2.2733311730395336, + "grad_norm": 0.939724862575531, + "learning_rate": 7.306876198757332e-07, + "loss": 0.0625, + "step": 14031 + }, + { + "epoch": 2.27349319507453, + "grad_norm": 1.0688453912734985, + "learning_rate": 7.303786968163651e-07, + "loss": 0.073, + "step": 14032 + }, + { + "epoch": 2.273655217109527, + "grad_norm": 0.954890251159668, + "learning_rate": 7.300698279031257e-07, + "loss": 0.0579, + "step": 14033 + }, + { + "epoch": 2.2738172391445235, + "grad_norm": 1.0080914497375488, + "learning_rate": 7.297610131454657e-07, + "loss": 0.0592, + "step": 14034 + }, + { + "epoch": 2.2739792611795204, + "grad_norm": 0.9062652587890625, + "learning_rate": 7.294522525528355e-07, + "loss": 0.0622, + "step": 14035 + }, + { + "epoch": 2.2741412832145174, + "grad_norm": 1.0314079523086548, + "learning_rate": 7.291435461346827e-07, + "loss": 0.0665, + "step": 14036 + }, + { + "epoch": 2.274303305249514, + "grad_norm": 0.8445022702217102, + "learning_rate": 7.288348939004508e-07, + "loss": 0.063, + "step": 14037 + }, + { + "epoch": 2.274465327284511, + "grad_norm": 0.9952003955841064, + "learning_rate": 7.285262958595846e-07, + "loss": 0.0579, + "step": 14038 + }, + { + "epoch": 2.2746273493195073, + "grad_norm": 0.773461639881134, + "learning_rate": 7.282177520215283e-07, + "loss": 0.0542, + "step": 14039 + }, + { + "epoch": 2.274789371354504, + "grad_norm": 0.8747705221176147, + "learning_rate": 7.279092623957204e-07, + "loss": 0.06, + "step": 14040 + }, + { + "epoch": 2.274951393389501, + "grad_norm": 0.9074735045433044, + "learning_rate": 7.276008269916008e-07, + "loss": 0.0685, + "step": 14041 + }, + { + "epoch": 2.2751134154244976, + "grad_norm": 0.9272582530975342, + "learning_rate": 7.272924458186064e-07, + "loss": 0.0637, + "step": 14042 + }, + { + "epoch": 2.2752754374594946, + "grad_norm": 0.9657379984855652, + "learning_rate": 7.269841188861737e-07, + "loss": 0.0644, + "step": 14043 + }, + { + "epoch": 2.2754374594944915, + "grad_norm": 1.0053846836090088, + "learning_rate": 7.26675846203736e-07, + "loss": 0.0702, + "step": 14044 + }, + { + "epoch": 2.275599481529488, + "grad_norm": 0.7461309432983398, + "learning_rate": 7.263676277807263e-07, + "loss": 0.0544, + "step": 14045 + }, + { + "epoch": 2.275761503564485, + "grad_norm": 0.8807432055473328, + "learning_rate": 7.26059463626575e-07, + "loss": 0.063, + "step": 14046 + }, + { + "epoch": 2.2759235255994814, + "grad_norm": 0.9560192823410034, + "learning_rate": 7.257513537507121e-07, + "loss": 0.0615, + "step": 14047 + }, + { + "epoch": 2.2760855476344783, + "grad_norm": 0.7775177359580994, + "learning_rate": 7.254432981625626e-07, + "loss": 0.0573, + "step": 14048 + }, + { + "epoch": 2.276247569669475, + "grad_norm": 0.8764267563819885, + "learning_rate": 7.251352968715544e-07, + "loss": 0.0547, + "step": 14049 + }, + { + "epoch": 2.2764095917044718, + "grad_norm": 0.8146836161613464, + "learning_rate": 7.248273498871119e-07, + "loss": 0.0563, + "step": 14050 + }, + { + "epoch": 2.2765716137394687, + "grad_norm": 0.8144499659538269, + "learning_rate": 7.245194572186562e-07, + "loss": 0.0548, + "step": 14051 + }, + { + "epoch": 2.276733635774465, + "grad_norm": 0.8982124328613281, + "learning_rate": 7.242116188756082e-07, + "loss": 0.0632, + "step": 14052 + }, + { + "epoch": 2.276895657809462, + "grad_norm": 1.0810835361480713, + "learning_rate": 7.239038348673866e-07, + "loss": 0.0683, + "step": 14053 + }, + { + "epoch": 2.277057679844459, + "grad_norm": 0.9049067497253418, + "learning_rate": 7.235961052034113e-07, + "loss": 0.0659, + "step": 14054 + }, + { + "epoch": 2.2772197018794555, + "grad_norm": 0.8906067609786987, + "learning_rate": 7.232884298930953e-07, + "loss": 0.062, + "step": 14055 + }, + { + "epoch": 2.2773817239144525, + "grad_norm": 0.9106201529502869, + "learning_rate": 7.22980808945854e-07, + "loss": 0.063, + "step": 14056 + }, + { + "epoch": 2.277543745949449, + "grad_norm": 0.9221261739730835, + "learning_rate": 7.226732423710998e-07, + "loss": 0.0688, + "step": 14057 + }, + { + "epoch": 2.277705767984446, + "grad_norm": 0.8350453972816467, + "learning_rate": 7.22365730178243e-07, + "loss": 0.0599, + "step": 14058 + }, + { + "epoch": 2.2778677900194424, + "grad_norm": 0.9052740335464478, + "learning_rate": 7.220582723766931e-07, + "loss": 0.0645, + "step": 14059 + }, + { + "epoch": 2.2780298120544393, + "grad_norm": 0.8463963866233826, + "learning_rate": 7.217508689758576e-07, + "loss": 0.0665, + "step": 14060 + }, + { + "epoch": 2.2781918340894363, + "grad_norm": 0.822176992893219, + "learning_rate": 7.214435199851432e-07, + "loss": 0.0555, + "step": 14061 + }, + { + "epoch": 2.2783538561244328, + "grad_norm": 0.9079717993736267, + "learning_rate": 7.211362254139512e-07, + "loss": 0.0625, + "step": 14062 + }, + { + "epoch": 2.2785158781594297, + "grad_norm": 0.9785674214363098, + "learning_rate": 7.208289852716868e-07, + "loss": 0.0684, + "step": 14063 + }, + { + "epoch": 2.2786779001944266, + "grad_norm": 0.8325532078742981, + "learning_rate": 7.205217995677502e-07, + "loss": 0.0595, + "step": 14064 + }, + { + "epoch": 2.278839922229423, + "grad_norm": 0.9397990107536316, + "learning_rate": 7.202146683115408e-07, + "loss": 0.0605, + "step": 14065 + }, + { + "epoch": 2.27900194426442, + "grad_norm": 0.7694252729415894, + "learning_rate": 7.199075915124548e-07, + "loss": 0.0561, + "step": 14066 + }, + { + "epoch": 2.2791639662994165, + "grad_norm": 0.9818680286407471, + "learning_rate": 7.19600569179888e-07, + "loss": 0.069, + "step": 14067 + }, + { + "epoch": 2.2793259883344135, + "grad_norm": 0.8381089568138123, + "learning_rate": 7.192936013232368e-07, + "loss": 0.0555, + "step": 14068 + }, + { + "epoch": 2.2794880103694104, + "grad_norm": 1.1629765033721924, + "learning_rate": 7.189866879518914e-07, + "loss": 0.0749, + "step": 14069 + }, + { + "epoch": 2.279650032404407, + "grad_norm": 0.8009434342384338, + "learning_rate": 7.186798290752436e-07, + "loss": 0.0594, + "step": 14070 + }, + { + "epoch": 2.279812054439404, + "grad_norm": 0.85289067029953, + "learning_rate": 7.183730247026821e-07, + "loss": 0.0626, + "step": 14071 + }, + { + "epoch": 2.2799740764744003, + "grad_norm": 0.7887542247772217, + "learning_rate": 7.180662748435946e-07, + "loss": 0.0601, + "step": 14072 + }, + { + "epoch": 2.2801360985093972, + "grad_norm": 0.8790128827095032, + "learning_rate": 7.17759579507367e-07, + "loss": 0.0633, + "step": 14073 + }, + { + "epoch": 2.280298120544394, + "grad_norm": 0.8336759209632874, + "learning_rate": 7.174529387033832e-07, + "loss": 0.058, + "step": 14074 + }, + { + "epoch": 2.2804601425793907, + "grad_norm": 0.8878383040428162, + "learning_rate": 7.171463524410258e-07, + "loss": 0.0615, + "step": 14075 + }, + { + "epoch": 2.2806221646143876, + "grad_norm": 0.8392332196235657, + "learning_rate": 7.168398207296764e-07, + "loss": 0.0626, + "step": 14076 + }, + { + "epoch": 2.2807841866493845, + "grad_norm": 0.8218026161193848, + "learning_rate": 7.165333435787119e-07, + "loss": 0.0639, + "step": 14077 + }, + { + "epoch": 2.280946208684381, + "grad_norm": 1.1141018867492676, + "learning_rate": 7.162269209975117e-07, + "loss": 0.0645, + "step": 14078 + }, + { + "epoch": 2.281108230719378, + "grad_norm": 0.8250923156738281, + "learning_rate": 7.159205529954513e-07, + "loss": 0.0575, + "step": 14079 + }, + { + "epoch": 2.2812702527543745, + "grad_norm": 0.8710193037986755, + "learning_rate": 7.156142395819055e-07, + "loss": 0.0597, + "step": 14080 + }, + { + "epoch": 2.2814322747893714, + "grad_norm": 0.8891174793243408, + "learning_rate": 7.153079807662447e-07, + "loss": 0.0643, + "step": 14081 + }, + { + "epoch": 2.281594296824368, + "grad_norm": 1.0327494144439697, + "learning_rate": 7.150017765578401e-07, + "loss": 0.0676, + "step": 14082 + }, + { + "epoch": 2.281756318859365, + "grad_norm": 0.9421011805534363, + "learning_rate": 7.146956269660632e-07, + "loss": 0.0664, + "step": 14083 + }, + { + "epoch": 2.2819183408943617, + "grad_norm": 0.9506751894950867, + "learning_rate": 7.143895320002789e-07, + "loss": 0.0699, + "step": 14084 + }, + { + "epoch": 2.2820803629293582, + "grad_norm": 0.8076117038726807, + "learning_rate": 7.140834916698539e-07, + "loss": 0.0615, + "step": 14085 + }, + { + "epoch": 2.282242384964355, + "grad_norm": 0.993713915348053, + "learning_rate": 7.137775059841523e-07, + "loss": 0.0682, + "step": 14086 + }, + { + "epoch": 2.282404406999352, + "grad_norm": 0.8705652952194214, + "learning_rate": 7.134715749525364e-07, + "loss": 0.0666, + "step": 14087 + }, + { + "epoch": 2.2825664290343486, + "grad_norm": 0.9547131657600403, + "learning_rate": 7.131656985843669e-07, + "loss": 0.0683, + "step": 14088 + }, + { + "epoch": 2.2827284510693455, + "grad_norm": 1.0205283164978027, + "learning_rate": 7.12859876889003e-07, + "loss": 0.0634, + "step": 14089 + }, + { + "epoch": 2.282890473104342, + "grad_norm": 0.892135739326477, + "learning_rate": 7.125541098758021e-07, + "loss": 0.062, + "step": 14090 + }, + { + "epoch": 2.283052495139339, + "grad_norm": 0.9227086305618286, + "learning_rate": 7.122483975541197e-07, + "loss": 0.0658, + "step": 14091 + }, + { + "epoch": 2.283214517174336, + "grad_norm": 0.8980578184127808, + "learning_rate": 7.119427399333104e-07, + "loss": 0.0683, + "step": 14092 + }, + { + "epoch": 2.2833765392093324, + "grad_norm": 0.7832695245742798, + "learning_rate": 7.116371370227259e-07, + "loss": 0.054, + "step": 14093 + }, + { + "epoch": 2.2835385612443293, + "grad_norm": 1.094404935836792, + "learning_rate": 7.113315888317182e-07, + "loss": 0.0658, + "step": 14094 + }, + { + "epoch": 2.283700583279326, + "grad_norm": 1.046769380569458, + "learning_rate": 7.110260953696346e-07, + "loss": 0.0687, + "step": 14095 + }, + { + "epoch": 2.2838626053143227, + "grad_norm": 0.9280028343200684, + "learning_rate": 7.107206566458225e-07, + "loss": 0.0613, + "step": 14096 + }, + { + "epoch": 2.2840246273493197, + "grad_norm": 0.8256856203079224, + "learning_rate": 7.104152726696292e-07, + "loss": 0.0597, + "step": 14097 + }, + { + "epoch": 2.284186649384316, + "grad_norm": 0.9466334581375122, + "learning_rate": 7.101099434503986e-07, + "loss": 0.0654, + "step": 14098 + }, + { + "epoch": 2.284348671419313, + "grad_norm": 0.8240459561347961, + "learning_rate": 7.098046689974714e-07, + "loss": 0.0608, + "step": 14099 + }, + { + "epoch": 2.28451069345431, + "grad_norm": 0.8247748613357544, + "learning_rate": 7.09499449320189e-07, + "loss": 0.0593, + "step": 14100 + }, + { + "epoch": 2.2846727154893065, + "grad_norm": 0.9899136424064636, + "learning_rate": 7.091942844278907e-07, + "loss": 0.0675, + "step": 14101 + }, + { + "epoch": 2.2848347375243034, + "grad_norm": 1.1560351848602295, + "learning_rate": 7.088891743299136e-07, + "loss": 0.068, + "step": 14102 + }, + { + "epoch": 2.2849967595593, + "grad_norm": 0.8623514771461487, + "learning_rate": 7.085841190355932e-07, + "loss": 0.0599, + "step": 14103 + }, + { + "epoch": 2.285158781594297, + "grad_norm": 0.869429886341095, + "learning_rate": 7.08279118554264e-07, + "loss": 0.0593, + "step": 14104 + }, + { + "epoch": 2.2853208036292934, + "grad_norm": 1.031187891960144, + "learning_rate": 7.079741728952578e-07, + "loss": 0.0635, + "step": 14105 + }, + { + "epoch": 2.2854828256642903, + "grad_norm": 1.0560411214828491, + "learning_rate": 7.076692820679051e-07, + "loss": 0.0651, + "step": 14106 + }, + { + "epoch": 2.285644847699287, + "grad_norm": 0.8320545554161072, + "learning_rate": 7.073644460815348e-07, + "loss": 0.06, + "step": 14107 + }, + { + "epoch": 2.2858068697342837, + "grad_norm": 0.963650107383728, + "learning_rate": 7.070596649454748e-07, + "loss": 0.0651, + "step": 14108 + }, + { + "epoch": 2.2859688917692806, + "grad_norm": 0.8947854042053223, + "learning_rate": 7.067549386690509e-07, + "loss": 0.057, + "step": 14109 + }, + { + "epoch": 2.2861309138042776, + "grad_norm": 0.8852594494819641, + "learning_rate": 7.064502672615847e-07, + "loss": 0.0603, + "step": 14110 + }, + { + "epoch": 2.286292935839274, + "grad_norm": 0.9978240728378296, + "learning_rate": 7.061456507324008e-07, + "loss": 0.0671, + "step": 14111 + }, + { + "epoch": 2.286454957874271, + "grad_norm": 0.8848221302032471, + "learning_rate": 7.058410890908196e-07, + "loss": 0.0624, + "step": 14112 + }, + { + "epoch": 2.2866169799092675, + "grad_norm": 0.8844780921936035, + "learning_rate": 7.055365823461585e-07, + "loss": 0.0554, + "step": 14113 + }, + { + "epoch": 2.2867790019442644, + "grad_norm": 0.9852705001831055, + "learning_rate": 7.052321305077356e-07, + "loss": 0.0651, + "step": 14114 + }, + { + "epoch": 2.2869410239792614, + "grad_norm": 0.9480606913566589, + "learning_rate": 7.049277335848656e-07, + "loss": 0.0659, + "step": 14115 + }, + { + "epoch": 2.287103046014258, + "grad_norm": 0.9317473769187927, + "learning_rate": 7.046233915868642e-07, + "loss": 0.0626, + "step": 14116 + }, + { + "epoch": 2.287265068049255, + "grad_norm": 0.8471056222915649, + "learning_rate": 7.043191045230418e-07, + "loss": 0.0551, + "step": 14117 + }, + { + "epoch": 2.2874270900842513, + "grad_norm": 0.8562641143798828, + "learning_rate": 7.04014872402709e-07, + "loss": 0.0583, + "step": 14118 + }, + { + "epoch": 2.287589112119248, + "grad_norm": 0.9273170232772827, + "learning_rate": 7.037106952351752e-07, + "loss": 0.0621, + "step": 14119 + }, + { + "epoch": 2.287751134154245, + "grad_norm": 0.8602285981178284, + "learning_rate": 7.034065730297471e-07, + "loss": 0.0566, + "step": 14120 + }, + { + "epoch": 2.2879131561892416, + "grad_norm": 0.9283899664878845, + "learning_rate": 7.031025057957302e-07, + "loss": 0.0624, + "step": 14121 + }, + { + "epoch": 2.2880751782242386, + "grad_norm": 0.8553148508071899, + "learning_rate": 7.027984935424284e-07, + "loss": 0.064, + "step": 14122 + }, + { + "epoch": 2.2882372002592355, + "grad_norm": 0.964326798915863, + "learning_rate": 7.024945362791432e-07, + "loss": 0.0618, + "step": 14123 + }, + { + "epoch": 2.288399222294232, + "grad_norm": 0.9071498513221741, + "learning_rate": 7.021906340151763e-07, + "loss": 0.0718, + "step": 14124 + }, + { + "epoch": 2.288561244329229, + "grad_norm": 0.9609620571136475, + "learning_rate": 7.01886786759824e-07, + "loss": 0.0663, + "step": 14125 + }, + { + "epoch": 2.2887232663642254, + "grad_norm": 0.9779571890830994, + "learning_rate": 7.015829945223851e-07, + "loss": 0.065, + "step": 14126 + }, + { + "epoch": 2.2888852883992223, + "grad_norm": 1.0127842426300049, + "learning_rate": 7.012792573121551e-07, + "loss": 0.0688, + "step": 14127 + }, + { + "epoch": 2.289047310434219, + "grad_norm": 0.9270831942558289, + "learning_rate": 7.009755751384267e-07, + "loss": 0.0569, + "step": 14128 + }, + { + "epoch": 2.2892093324692158, + "grad_norm": 1.1184446811676025, + "learning_rate": 7.006719480104913e-07, + "loss": 0.0677, + "step": 14129 + }, + { + "epoch": 2.2893713545042127, + "grad_norm": 0.8415164947509766, + "learning_rate": 7.003683759376415e-07, + "loss": 0.0582, + "step": 14130 + }, + { + "epoch": 2.289533376539209, + "grad_norm": 1.0956602096557617, + "learning_rate": 7.000648589291634e-07, + "loss": 0.0626, + "step": 14131 + }, + { + "epoch": 2.289695398574206, + "grad_norm": 0.8971714973449707, + "learning_rate": 6.997613969943451e-07, + "loss": 0.0578, + "step": 14132 + }, + { + "epoch": 2.289857420609203, + "grad_norm": 0.8495174050331116, + "learning_rate": 6.994579901424714e-07, + "loss": 0.0575, + "step": 14133 + }, + { + "epoch": 2.2900194426441995, + "grad_norm": 1.226990818977356, + "learning_rate": 6.99154638382826e-07, + "loss": 0.0649, + "step": 14134 + }, + { + "epoch": 2.2901814646791965, + "grad_norm": 1.0273133516311646, + "learning_rate": 6.988513417246906e-07, + "loss": 0.0696, + "step": 14135 + }, + { + "epoch": 2.290343486714193, + "grad_norm": 0.996566116809845, + "learning_rate": 6.985481001773456e-07, + "loss": 0.0684, + "step": 14136 + }, + { + "epoch": 2.29050550874919, + "grad_norm": 0.8226109743118286, + "learning_rate": 6.982449137500694e-07, + "loss": 0.0552, + "step": 14137 + }, + { + "epoch": 2.290667530784187, + "grad_norm": 0.8552298545837402, + "learning_rate": 6.979417824521393e-07, + "loss": 0.0626, + "step": 14138 + }, + { + "epoch": 2.2908295528191833, + "grad_norm": 0.8617782592773438, + "learning_rate": 6.976387062928283e-07, + "loss": 0.0569, + "step": 14139 + }, + { + "epoch": 2.2909915748541803, + "grad_norm": 0.887869656085968, + "learning_rate": 6.97335685281412e-07, + "loss": 0.0597, + "step": 14140 + }, + { + "epoch": 2.2911535968891767, + "grad_norm": 0.8527069687843323, + "learning_rate": 6.970327194271612e-07, + "loss": 0.0682, + "step": 14141 + }, + { + "epoch": 2.2913156189241737, + "grad_norm": 0.9207058548927307, + "learning_rate": 6.967298087393471e-07, + "loss": 0.0674, + "step": 14142 + }, + { + "epoch": 2.2914776409591706, + "grad_norm": 0.8777424097061157, + "learning_rate": 6.964269532272361e-07, + "loss": 0.062, + "step": 14143 + }, + { + "epoch": 2.291639662994167, + "grad_norm": 0.9275683164596558, + "learning_rate": 6.96124152900095e-07, + "loss": 0.0591, + "step": 14144 + }, + { + "epoch": 2.291801685029164, + "grad_norm": 0.9526832699775696, + "learning_rate": 6.958214077671912e-07, + "loss": 0.0673, + "step": 14145 + }, + { + "epoch": 2.291963707064161, + "grad_norm": 1.0193464756011963, + "learning_rate": 6.955187178377853e-07, + "loss": 0.0628, + "step": 14146 + }, + { + "epoch": 2.2921257290991575, + "grad_norm": 0.9001911282539368, + "learning_rate": 6.952160831211401e-07, + "loss": 0.0649, + "step": 14147 + }, + { + "epoch": 2.2922877511341544, + "grad_norm": 0.7867611646652222, + "learning_rate": 6.949135036265153e-07, + "loss": 0.0577, + "step": 14148 + }, + { + "epoch": 2.292449773169151, + "grad_norm": 0.8737292885780334, + "learning_rate": 6.946109793631689e-07, + "loss": 0.0549, + "step": 14149 + }, + { + "epoch": 2.292611795204148, + "grad_norm": 0.9339378476142883, + "learning_rate": 6.943085103403577e-07, + "loss": 0.0673, + "step": 14150 + }, + { + "epoch": 2.2927738172391443, + "grad_norm": 0.8713966012001038, + "learning_rate": 6.940060965673362e-07, + "loss": 0.0591, + "step": 14151 + }, + { + "epoch": 2.2929358392741412, + "grad_norm": 0.8593830466270447, + "learning_rate": 6.937037380533579e-07, + "loss": 0.062, + "step": 14152 + }, + { + "epoch": 2.293097861309138, + "grad_norm": 0.8531447649002075, + "learning_rate": 6.93401434807674e-07, + "loss": 0.057, + "step": 14153 + }, + { + "epoch": 2.2932598833441347, + "grad_norm": 0.8757892847061157, + "learning_rate": 6.930991868395343e-07, + "loss": 0.0614, + "step": 14154 + }, + { + "epoch": 2.2934219053791316, + "grad_norm": 0.868497371673584, + "learning_rate": 6.92796994158187e-07, + "loss": 0.0633, + "step": 14155 + }, + { + "epoch": 2.2935839274141285, + "grad_norm": 0.9063879251480103, + "learning_rate": 6.924948567728787e-07, + "loss": 0.0596, + "step": 14156 + }, + { + "epoch": 2.293745949449125, + "grad_norm": 1.0460374355316162, + "learning_rate": 6.921927746928533e-07, + "loss": 0.0672, + "step": 14157 + }, + { + "epoch": 2.293907971484122, + "grad_norm": 0.8834719657897949, + "learning_rate": 6.918907479273535e-07, + "loss": 0.0637, + "step": 14158 + }, + { + "epoch": 2.2940699935191184, + "grad_norm": 0.8647652864456177, + "learning_rate": 6.915887764856216e-07, + "loss": 0.0618, + "step": 14159 + }, + { + "epoch": 2.2942320155541154, + "grad_norm": 0.8388234376907349, + "learning_rate": 6.912868603768979e-07, + "loss": 0.0574, + "step": 14160 + }, + { + "epoch": 2.2943940375891123, + "grad_norm": 0.8639540076255798, + "learning_rate": 6.909849996104187e-07, + "loss": 0.0632, + "step": 14161 + }, + { + "epoch": 2.294556059624109, + "grad_norm": 0.9691962599754333, + "learning_rate": 6.906831941954206e-07, + "loss": 0.0672, + "step": 14162 + }, + { + "epoch": 2.2947180816591057, + "grad_norm": 0.8707907795906067, + "learning_rate": 6.903814441411383e-07, + "loss": 0.0623, + "step": 14163 + }, + { + "epoch": 2.2948801036941022, + "grad_norm": 0.8201960325241089, + "learning_rate": 6.900797494568045e-07, + "loss": 0.0648, + "step": 14164 + }, + { + "epoch": 2.295042125729099, + "grad_norm": 0.9355173110961914, + "learning_rate": 6.897781101516504e-07, + "loss": 0.0678, + "step": 14165 + }, + { + "epoch": 2.295204147764096, + "grad_norm": 0.7753675580024719, + "learning_rate": 6.894765262349056e-07, + "loss": 0.0546, + "step": 14166 + }, + { + "epoch": 2.2953661697990926, + "grad_norm": 1.1092512607574463, + "learning_rate": 6.891749977157979e-07, + "loss": 0.0609, + "step": 14167 + }, + { + "epoch": 2.2955281918340895, + "grad_norm": 0.8517485857009888, + "learning_rate": 6.88873524603553e-07, + "loss": 0.0617, + "step": 14168 + }, + { + "epoch": 2.295690213869086, + "grad_norm": 0.8324517011642456, + "learning_rate": 6.885721069073953e-07, + "loss": 0.0565, + "step": 14169 + }, + { + "epoch": 2.295852235904083, + "grad_norm": 0.9597250819206238, + "learning_rate": 6.882707446365477e-07, + "loss": 0.0662, + "step": 14170 + }, + { + "epoch": 2.29601425793908, + "grad_norm": 0.9508599042892456, + "learning_rate": 6.879694378002316e-07, + "loss": 0.0623, + "step": 14171 + }, + { + "epoch": 2.2961762799740764, + "grad_norm": 1.0893688201904297, + "learning_rate": 6.876681864076646e-07, + "loss": 0.0757, + "step": 14172 + }, + { + "epoch": 2.2963383020090733, + "grad_norm": 0.9242887496948242, + "learning_rate": 6.873669904680655e-07, + "loss": 0.063, + "step": 14173 + }, + { + "epoch": 2.29650032404407, + "grad_norm": 0.8941336870193481, + "learning_rate": 6.870658499906505e-07, + "loss": 0.0595, + "step": 14174 + }, + { + "epoch": 2.2966623460790667, + "grad_norm": 0.8984914422035217, + "learning_rate": 6.867647649846338e-07, + "loss": 0.0615, + "step": 14175 + }, + { + "epoch": 2.2968243681140637, + "grad_norm": 0.9848359823226929, + "learning_rate": 6.864637354592266e-07, + "loss": 0.0634, + "step": 14176 + }, + { + "epoch": 2.29698639014906, + "grad_norm": 0.9562865495681763, + "learning_rate": 6.861627614236396e-07, + "loss": 0.0661, + "step": 14177 + }, + { + "epoch": 2.297148412184057, + "grad_norm": 0.8696370124816895, + "learning_rate": 6.858618428870842e-07, + "loss": 0.0591, + "step": 14178 + }, + { + "epoch": 2.297310434219054, + "grad_norm": 0.9466931819915771, + "learning_rate": 6.855609798587656e-07, + "loss": 0.0605, + "step": 14179 + }, + { + "epoch": 2.2974724562540505, + "grad_norm": 0.826383650302887, + "learning_rate": 6.852601723478902e-07, + "loss": 0.0515, + "step": 14180 + }, + { + "epoch": 2.2976344782890474, + "grad_norm": 0.9467977285385132, + "learning_rate": 6.849594203636619e-07, + "loss": 0.0621, + "step": 14181 + }, + { + "epoch": 2.297796500324044, + "grad_norm": 0.9184082746505737, + "learning_rate": 6.84658723915283e-07, + "loss": 0.0634, + "step": 14182 + }, + { + "epoch": 2.297958522359041, + "grad_norm": 0.8161764740943909, + "learning_rate": 6.843580830119542e-07, + "loss": 0.0573, + "step": 14183 + }, + { + "epoch": 2.2981205443940373, + "grad_norm": 0.8612385392189026, + "learning_rate": 6.840574976628741e-07, + "loss": 0.0674, + "step": 14184 + }, + { + "epoch": 2.2982825664290343, + "grad_norm": 0.8990957140922546, + "learning_rate": 6.837569678772401e-07, + "loss": 0.0606, + "step": 14185 + }, + { + "epoch": 2.298444588464031, + "grad_norm": 0.809965193271637, + "learning_rate": 6.834564936642488e-07, + "loss": 0.0644, + "step": 14186 + }, + { + "epoch": 2.2986066104990277, + "grad_norm": 0.9587274193763733, + "learning_rate": 6.831560750330909e-07, + "loss": 0.0612, + "step": 14187 + }, + { + "epoch": 2.2987686325340246, + "grad_norm": 0.9014949202537537, + "learning_rate": 6.828557119929613e-07, + "loss": 0.0591, + "step": 14188 + }, + { + "epoch": 2.2989306545690216, + "grad_norm": 1.0473854541778564, + "learning_rate": 6.825554045530502e-07, + "loss": 0.0749, + "step": 14189 + }, + { + "epoch": 2.299092676604018, + "grad_norm": 0.9291993379592896, + "learning_rate": 6.822551527225452e-07, + "loss": 0.0603, + "step": 14190 + }, + { + "epoch": 2.299254698639015, + "grad_norm": 0.8897577524185181, + "learning_rate": 6.819549565106331e-07, + "loss": 0.056, + "step": 14191 + }, + { + "epoch": 2.2994167206740115, + "grad_norm": 0.8365473747253418, + "learning_rate": 6.816548159264993e-07, + "loss": 0.0583, + "step": 14192 + }, + { + "epoch": 2.2995787427090084, + "grad_norm": 0.8146736025810242, + "learning_rate": 6.813547309793295e-07, + "loss": 0.0569, + "step": 14193 + }, + { + "epoch": 2.2997407647440054, + "grad_norm": 1.0734623670578003, + "learning_rate": 6.810547016783029e-07, + "loss": 0.0584, + "step": 14194 + }, + { + "epoch": 2.299902786779002, + "grad_norm": 0.953218936920166, + "learning_rate": 6.807547280326007e-07, + "loss": 0.0679, + "step": 14195 + }, + { + "epoch": 2.3000648088139988, + "grad_norm": 1.0048823356628418, + "learning_rate": 6.804548100514013e-07, + "loss": 0.0612, + "step": 14196 + }, + { + "epoch": 2.3002268308489953, + "grad_norm": 1.0320632457733154, + "learning_rate": 6.801549477438815e-07, + "loss": 0.0585, + "step": 14197 + }, + { + "epoch": 2.300388852883992, + "grad_norm": 0.9690525531768799, + "learning_rate": 6.798551411192165e-07, + "loss": 0.0676, + "step": 14198 + }, + { + "epoch": 2.300550874918989, + "grad_norm": 0.8694241046905518, + "learning_rate": 6.795553901865795e-07, + "loss": 0.0623, + "step": 14199 + }, + { + "epoch": 2.3007128969539856, + "grad_norm": 0.9456620216369629, + "learning_rate": 6.792556949551426e-07, + "loss": 0.0662, + "step": 14200 + }, + { + "epoch": 2.3008749189889826, + "grad_norm": 0.9079062342643738, + "learning_rate": 6.789560554340743e-07, + "loss": 0.0629, + "step": 14201 + }, + { + "epoch": 2.3010369410239795, + "grad_norm": 0.9178680181503296, + "learning_rate": 6.786564716325441e-07, + "loss": 0.0721, + "step": 14202 + }, + { + "epoch": 2.301198963058976, + "grad_norm": 0.7588401436805725, + "learning_rate": 6.783569435597188e-07, + "loss": 0.0523, + "step": 14203 + }, + { + "epoch": 2.301360985093973, + "grad_norm": 0.9008545875549316, + "learning_rate": 6.780574712247632e-07, + "loss": 0.0673, + "step": 14204 + }, + { + "epoch": 2.3015230071289694, + "grad_norm": 0.960690438747406, + "learning_rate": 6.777580546368393e-07, + "loss": 0.0612, + "step": 14205 + }, + { + "epoch": 2.3016850291639663, + "grad_norm": 0.9084741473197937, + "learning_rate": 6.774586938051084e-07, + "loss": 0.0618, + "step": 14206 + }, + { + "epoch": 2.301847051198963, + "grad_norm": 1.0799715518951416, + "learning_rate": 6.771593887387326e-07, + "loss": 0.0649, + "step": 14207 + }, + { + "epoch": 2.3020090732339598, + "grad_norm": 0.854361355304718, + "learning_rate": 6.768601394468674e-07, + "loss": 0.06, + "step": 14208 + }, + { + "epoch": 2.3021710952689567, + "grad_norm": 0.9102272987365723, + "learning_rate": 6.765609459386702e-07, + "loss": 0.0681, + "step": 14209 + }, + { + "epoch": 2.302333117303953, + "grad_norm": 0.9026054739952087, + "learning_rate": 6.762618082232952e-07, + "loss": 0.0594, + "step": 14210 + }, + { + "epoch": 2.30249513933895, + "grad_norm": 0.9153203964233398, + "learning_rate": 6.759627263098955e-07, + "loss": 0.0695, + "step": 14211 + }, + { + "epoch": 2.302657161373947, + "grad_norm": 0.8652878403663635, + "learning_rate": 6.756637002076225e-07, + "loss": 0.0624, + "step": 14212 + }, + { + "epoch": 2.3028191834089435, + "grad_norm": 0.8761528730392456, + "learning_rate": 6.753647299256255e-07, + "loss": 0.0601, + "step": 14213 + }, + { + "epoch": 2.3029812054439405, + "grad_norm": 0.8554459810256958, + "learning_rate": 6.750658154730522e-07, + "loss": 0.0597, + "step": 14214 + }, + { + "epoch": 2.303143227478937, + "grad_norm": 0.8388009071350098, + "learning_rate": 6.747669568590492e-07, + "loss": 0.062, + "step": 14215 + }, + { + "epoch": 2.303305249513934, + "grad_norm": 0.928914487361908, + "learning_rate": 6.744681540927588e-07, + "loss": 0.0618, + "step": 14216 + }, + { + "epoch": 2.303467271548931, + "grad_norm": 0.8588629961013794, + "learning_rate": 6.741694071833263e-07, + "loss": 0.0574, + "step": 14217 + }, + { + "epoch": 2.3036292935839273, + "grad_norm": 0.9985714554786682, + "learning_rate": 6.738707161398914e-07, + "loss": 0.0705, + "step": 14218 + }, + { + "epoch": 2.3037913156189243, + "grad_norm": 0.9504481554031372, + "learning_rate": 6.73572080971594e-07, + "loss": 0.0678, + "step": 14219 + }, + { + "epoch": 2.3039533376539207, + "grad_norm": 0.9666747450828552, + "learning_rate": 6.732735016875697e-07, + "loss": 0.0676, + "step": 14220 + }, + { + "epoch": 2.3041153596889177, + "grad_norm": 0.8083941340446472, + "learning_rate": 6.729749782969563e-07, + "loss": 0.0612, + "step": 14221 + }, + { + "epoch": 2.3042773817239146, + "grad_norm": 0.9107043743133545, + "learning_rate": 6.726765108088881e-07, + "loss": 0.0617, + "step": 14222 + }, + { + "epoch": 2.304439403758911, + "grad_norm": 0.8032419681549072, + "learning_rate": 6.72378099232496e-07, + "loss": 0.0553, + "step": 14223 + }, + { + "epoch": 2.304601425793908, + "grad_norm": 0.9696297645568848, + "learning_rate": 6.720797435769111e-07, + "loss": 0.0601, + "step": 14224 + }, + { + "epoch": 2.304763447828905, + "grad_norm": 0.9108163118362427, + "learning_rate": 6.717814438512626e-07, + "loss": 0.0624, + "step": 14225 + }, + { + "epoch": 2.3049254698639015, + "grad_norm": 0.8767720460891724, + "learning_rate": 6.714832000646778e-07, + "loss": 0.0662, + "step": 14226 + }, + { + "epoch": 2.3050874918988984, + "grad_norm": 0.9097752571105957, + "learning_rate": 6.711850122262823e-07, + "loss": 0.0645, + "step": 14227 + }, + { + "epoch": 2.305249513933895, + "grad_norm": 0.8309717178344727, + "learning_rate": 6.708868803451992e-07, + "loss": 0.052, + "step": 14228 + }, + { + "epoch": 2.305411535968892, + "grad_norm": 0.8942591547966003, + "learning_rate": 6.705888044305516e-07, + "loss": 0.0665, + "step": 14229 + }, + { + "epoch": 2.3055735580038883, + "grad_norm": 0.7907509207725525, + "learning_rate": 6.702907844914597e-07, + "loss": 0.0545, + "step": 14230 + }, + { + "epoch": 2.3057355800388852, + "grad_norm": 0.913838267326355, + "learning_rate": 6.699928205370418e-07, + "loss": 0.0648, + "step": 14231 + }, + { + "epoch": 2.305897602073882, + "grad_norm": 0.7393428087234497, + "learning_rate": 6.696949125764149e-07, + "loss": 0.0531, + "step": 14232 + }, + { + "epoch": 2.3060596241088787, + "grad_norm": 1.152111291885376, + "learning_rate": 6.693970606186953e-07, + "loss": 0.0706, + "step": 14233 + }, + { + "epoch": 2.3062216461438756, + "grad_norm": 0.8411812782287598, + "learning_rate": 6.690992646729949e-07, + "loss": 0.0603, + "step": 14234 + }, + { + "epoch": 2.3063836681788725, + "grad_norm": 0.9782255291938782, + "learning_rate": 6.688015247484256e-07, + "loss": 0.0644, + "step": 14235 + }, + { + "epoch": 2.306545690213869, + "grad_norm": 0.9612037539482117, + "learning_rate": 6.685038408540989e-07, + "loss": 0.0636, + "step": 14236 + }, + { + "epoch": 2.306707712248866, + "grad_norm": 1.0477268695831299, + "learning_rate": 6.682062129991232e-07, + "loss": 0.07, + "step": 14237 + }, + { + "epoch": 2.3068697342838624, + "grad_norm": 0.8569392561912537, + "learning_rate": 6.679086411926039e-07, + "loss": 0.0622, + "step": 14238 + }, + { + "epoch": 2.3070317563188594, + "grad_norm": 0.8216943144798279, + "learning_rate": 6.676111254436465e-07, + "loss": 0.0564, + "step": 14239 + }, + { + "epoch": 2.3071937783538563, + "grad_norm": 0.8884442448616028, + "learning_rate": 6.673136657613547e-07, + "loss": 0.0602, + "step": 14240 + }, + { + "epoch": 2.307355800388853, + "grad_norm": 0.9046189785003662, + "learning_rate": 6.670162621548293e-07, + "loss": 0.0612, + "step": 14241 + }, + { + "epoch": 2.3075178224238497, + "grad_norm": 0.9811939597129822, + "learning_rate": 6.667189146331707e-07, + "loss": 0.0644, + "step": 14242 + }, + { + "epoch": 2.307679844458846, + "grad_norm": 0.9112708568572998, + "learning_rate": 6.664216232054771e-07, + "loss": 0.0598, + "step": 14243 + }, + { + "epoch": 2.307841866493843, + "grad_norm": 1.0917285680770874, + "learning_rate": 6.661243878808443e-07, + "loss": 0.0707, + "step": 14244 + }, + { + "epoch": 2.30800388852884, + "grad_norm": 0.8850948810577393, + "learning_rate": 6.658272086683676e-07, + "loss": 0.0545, + "step": 14245 + }, + { + "epoch": 2.3081659105638366, + "grad_norm": 0.8096259832382202, + "learning_rate": 6.655300855771393e-07, + "loss": 0.0557, + "step": 14246 + }, + { + "epoch": 2.3083279325988335, + "grad_norm": 0.9947896599769592, + "learning_rate": 6.652330186162514e-07, + "loss": 0.0618, + "step": 14247 + }, + { + "epoch": 2.3084899546338304, + "grad_norm": 0.7889495491981506, + "learning_rate": 6.649360077947939e-07, + "loss": 0.0539, + "step": 14248 + }, + { + "epoch": 2.308651976668827, + "grad_norm": 0.9984772801399231, + "learning_rate": 6.646390531218522e-07, + "loss": 0.0655, + "step": 14249 + }, + { + "epoch": 2.308813998703824, + "grad_norm": 0.9736108183860779, + "learning_rate": 6.643421546065146e-07, + "loss": 0.0626, + "step": 14250 + }, + { + "epoch": 2.3089760207388204, + "grad_norm": 0.9100701808929443, + "learning_rate": 6.640453122578655e-07, + "loss": 0.0652, + "step": 14251 + }, + { + "epoch": 2.3091380427738173, + "grad_norm": 1.022388219833374, + "learning_rate": 6.637485260849866e-07, + "loss": 0.0645, + "step": 14252 + }, + { + "epoch": 2.309300064808814, + "grad_norm": 0.9333613514900208, + "learning_rate": 6.634517960969588e-07, + "loss": 0.0595, + "step": 14253 + }, + { + "epoch": 2.3094620868438107, + "grad_norm": 0.8503563404083252, + "learning_rate": 6.63155122302861e-07, + "loss": 0.053, + "step": 14254 + }, + { + "epoch": 2.3096241088788076, + "grad_norm": 1.0957640409469604, + "learning_rate": 6.628585047117731e-07, + "loss": 0.063, + "step": 14255 + }, + { + "epoch": 2.309786130913804, + "grad_norm": 1.1101269721984863, + "learning_rate": 6.625619433327681e-07, + "loss": 0.0574, + "step": 14256 + }, + { + "epoch": 2.309948152948801, + "grad_norm": 1.0195109844207764, + "learning_rate": 6.622654381749213e-07, + "loss": 0.0655, + "step": 14257 + }, + { + "epoch": 2.310110174983798, + "grad_norm": 0.9084102511405945, + "learning_rate": 6.619689892473046e-07, + "loss": 0.0594, + "step": 14258 + }, + { + "epoch": 2.3102721970187945, + "grad_norm": 0.9905080795288086, + "learning_rate": 6.616725965589893e-07, + "loss": 0.0613, + "step": 14259 + }, + { + "epoch": 2.3104342190537914, + "grad_norm": 0.8357704281806946, + "learning_rate": 6.613762601190435e-07, + "loss": 0.0619, + "step": 14260 + }, + { + "epoch": 2.310596241088788, + "grad_norm": 0.9966726899147034, + "learning_rate": 6.61079979936535e-07, + "loss": 0.0677, + "step": 14261 + }, + { + "epoch": 2.310758263123785, + "grad_norm": 1.088291049003601, + "learning_rate": 6.60783756020529e-07, + "loss": 0.0691, + "step": 14262 + }, + { + "epoch": 2.310920285158782, + "grad_norm": 0.8392730951309204, + "learning_rate": 6.60487588380089e-07, + "loss": 0.0599, + "step": 14263 + }, + { + "epoch": 2.3110823071937783, + "grad_norm": 0.935095489025116, + "learning_rate": 6.601914770242776e-07, + "loss": 0.0603, + "step": 14264 + }, + { + "epoch": 2.311244329228775, + "grad_norm": 0.8228254318237305, + "learning_rate": 6.598954219621545e-07, + "loss": 0.0618, + "step": 14265 + }, + { + "epoch": 2.3114063512637717, + "grad_norm": 0.9754266142845154, + "learning_rate": 6.595994232027794e-07, + "loss": 0.0648, + "step": 14266 + }, + { + "epoch": 2.3115683732987686, + "grad_norm": 0.8244330286979675, + "learning_rate": 6.593034807552076e-07, + "loss": 0.051, + "step": 14267 + }, + { + "epoch": 2.3117303953337656, + "grad_norm": 0.829617440700531, + "learning_rate": 6.590075946284941e-07, + "loss": 0.0569, + "step": 14268 + }, + { + "epoch": 2.311892417368762, + "grad_norm": 1.0251657962799072, + "learning_rate": 6.587117648316943e-07, + "loss": 0.0588, + "step": 14269 + }, + { + "epoch": 2.312054439403759, + "grad_norm": 0.8598863482475281, + "learning_rate": 6.584159913738583e-07, + "loss": 0.0531, + "step": 14270 + }, + { + "epoch": 2.3122164614387555, + "grad_norm": 1.17449951171875, + "learning_rate": 6.581202742640361e-07, + "loss": 0.0737, + "step": 14271 + }, + { + "epoch": 2.3123784834737524, + "grad_norm": 0.9494041800498962, + "learning_rate": 6.578246135112765e-07, + "loss": 0.0646, + "step": 14272 + }, + { + "epoch": 2.3125405055087493, + "grad_norm": 0.9004831910133362, + "learning_rate": 6.575290091246256e-07, + "loss": 0.0553, + "step": 14273 + }, + { + "epoch": 2.312702527543746, + "grad_norm": 0.8012595772743225, + "learning_rate": 6.572334611131284e-07, + "loss": 0.0556, + "step": 14274 + }, + { + "epoch": 2.3128645495787428, + "grad_norm": 0.8960158228874207, + "learning_rate": 6.569379694858277e-07, + "loss": 0.0645, + "step": 14275 + }, + { + "epoch": 2.3130265716137393, + "grad_norm": 1.01628577709198, + "learning_rate": 6.566425342517652e-07, + "loss": 0.0653, + "step": 14276 + }, + { + "epoch": 2.313188593648736, + "grad_norm": 0.8243879079818726, + "learning_rate": 6.56347155419981e-07, + "loss": 0.0575, + "step": 14277 + }, + { + "epoch": 2.313350615683733, + "grad_norm": 0.9484763741493225, + "learning_rate": 6.560518329995108e-07, + "loss": 0.0646, + "step": 14278 + }, + { + "epoch": 2.3135126377187296, + "grad_norm": 1.1005399227142334, + "learning_rate": 6.557565669993931e-07, + "loss": 0.068, + "step": 14279 + }, + { + "epoch": 2.3136746597537265, + "grad_norm": 0.9087778925895691, + "learning_rate": 6.554613574286614e-07, + "loss": 0.0637, + "step": 14280 + }, + { + "epoch": 2.3138366817887235, + "grad_norm": 0.8552685976028442, + "learning_rate": 6.551662042963491e-07, + "loss": 0.0602, + "step": 14281 + }, + { + "epoch": 2.31399870382372, + "grad_norm": 0.8670865297317505, + "learning_rate": 6.548711076114858e-07, + "loss": 0.0605, + "step": 14282 + }, + { + "epoch": 2.314160725858717, + "grad_norm": 1.124015212059021, + "learning_rate": 6.545760673831009e-07, + "loss": 0.0686, + "step": 14283 + }, + { + "epoch": 2.3143227478937134, + "grad_norm": 1.0868921279907227, + "learning_rate": 6.542810836202237e-07, + "loss": 0.0644, + "step": 14284 + }, + { + "epoch": 2.3144847699287103, + "grad_norm": 0.8501294255256653, + "learning_rate": 6.539861563318784e-07, + "loss": 0.0641, + "step": 14285 + }, + { + "epoch": 2.314646791963707, + "grad_norm": 0.9735018610954285, + "learning_rate": 6.536912855270894e-07, + "loss": 0.0572, + "step": 14286 + }, + { + "epoch": 2.3148088139987038, + "grad_norm": 1.222994089126587, + "learning_rate": 6.533964712148779e-07, + "loss": 0.0736, + "step": 14287 + }, + { + "epoch": 2.3149708360337007, + "grad_norm": 0.9061808586120605, + "learning_rate": 6.531017134042678e-07, + "loss": 0.0617, + "step": 14288 + }, + { + "epoch": 2.315132858068697, + "grad_norm": 0.8229312896728516, + "learning_rate": 6.528070121042746e-07, + "loss": 0.0584, + "step": 14289 + }, + { + "epoch": 2.315294880103694, + "grad_norm": 0.8917800784111023, + "learning_rate": 6.52512367323917e-07, + "loss": 0.0568, + "step": 14290 + }, + { + "epoch": 2.315456902138691, + "grad_norm": 0.9813393950462341, + "learning_rate": 6.522177790722101e-07, + "loss": 0.0597, + "step": 14291 + }, + { + "epoch": 2.3156189241736875, + "grad_norm": 0.951609194278717, + "learning_rate": 6.519232473581675e-07, + "loss": 0.054, + "step": 14292 + }, + { + "epoch": 2.3157809462086845, + "grad_norm": 0.9440679550170898, + "learning_rate": 6.516287721908013e-07, + "loss": 0.0684, + "step": 14293 + }, + { + "epoch": 2.315942968243681, + "grad_norm": 0.8250246644020081, + "learning_rate": 6.513343535791216e-07, + "loss": 0.0576, + "step": 14294 + }, + { + "epoch": 2.316104990278678, + "grad_norm": 0.9275084733963013, + "learning_rate": 6.510399915321381e-07, + "loss": 0.0622, + "step": 14295 + }, + { + "epoch": 2.316267012313675, + "grad_norm": 1.0191025733947754, + "learning_rate": 6.507456860588554e-07, + "loss": 0.0587, + "step": 14296 + }, + { + "epoch": 2.3164290343486713, + "grad_norm": 1.1428616046905518, + "learning_rate": 6.504514371682788e-07, + "loss": 0.0684, + "step": 14297 + }, + { + "epoch": 2.3165910563836682, + "grad_norm": 0.9954843521118164, + "learning_rate": 6.501572448694135e-07, + "loss": 0.0539, + "step": 14298 + }, + { + "epoch": 2.3167530784186647, + "grad_norm": 0.9105631113052368, + "learning_rate": 6.498631091712603e-07, + "loss": 0.0609, + "step": 14299 + }, + { + "epoch": 2.3169151004536617, + "grad_norm": 0.7692684531211853, + "learning_rate": 6.495690300828183e-07, + "loss": 0.056, + "step": 14300 + }, + { + "epoch": 2.3170771224886586, + "grad_norm": 0.9106401205062866, + "learning_rate": 6.492750076130858e-07, + "loss": 0.0609, + "step": 14301 + }, + { + "epoch": 2.317239144523655, + "grad_norm": 1.094725489616394, + "learning_rate": 6.489810417710596e-07, + "loss": 0.0642, + "step": 14302 + }, + { + "epoch": 2.317401166558652, + "grad_norm": 0.7931134104728699, + "learning_rate": 6.48687132565734e-07, + "loss": 0.0578, + "step": 14303 + }, + { + "epoch": 2.317563188593649, + "grad_norm": 0.9079036116600037, + "learning_rate": 6.483932800061021e-07, + "loss": 0.0586, + "step": 14304 + }, + { + "epoch": 2.3177252106286454, + "grad_norm": 0.8913233280181885, + "learning_rate": 6.480994841011551e-07, + "loss": 0.0562, + "step": 14305 + }, + { + "epoch": 2.3178872326636424, + "grad_norm": 0.9209250211715698, + "learning_rate": 6.478057448598821e-07, + "loss": 0.0622, + "step": 14306 + }, + { + "epoch": 2.318049254698639, + "grad_norm": 0.994839608669281, + "learning_rate": 6.475120622912714e-07, + "loss": 0.0689, + "step": 14307 + }, + { + "epoch": 2.318211276733636, + "grad_norm": 0.9868736863136292, + "learning_rate": 6.472184364043085e-07, + "loss": 0.0632, + "step": 14308 + }, + { + "epoch": 2.3183732987686323, + "grad_norm": 0.9076941013336182, + "learning_rate": 6.469248672079778e-07, + "loss": 0.0614, + "step": 14309 + }, + { + "epoch": 2.3185353208036292, + "grad_norm": 0.9764859080314636, + "learning_rate": 6.466313547112627e-07, + "loss": 0.0699, + "step": 14310 + }, + { + "epoch": 2.318697342838626, + "grad_norm": 1.0119270086288452, + "learning_rate": 6.463378989231414e-07, + "loss": 0.0575, + "step": 14311 + }, + { + "epoch": 2.3188593648736227, + "grad_norm": 0.8330714106559753, + "learning_rate": 6.460444998525953e-07, + "loss": 0.0593, + "step": 14312 + }, + { + "epoch": 2.3190213869086196, + "grad_norm": 0.9974405765533447, + "learning_rate": 6.45751157508602e-07, + "loss": 0.0687, + "step": 14313 + }, + { + "epoch": 2.3191834089436165, + "grad_norm": 0.9587564468383789, + "learning_rate": 6.454578719001353e-07, + "loss": 0.0632, + "step": 14314 + }, + { + "epoch": 2.319345430978613, + "grad_norm": 0.8796777129173279, + "learning_rate": 6.451646430361696e-07, + "loss": 0.0626, + "step": 14315 + }, + { + "epoch": 2.31950745301361, + "grad_norm": 0.9801618456840515, + "learning_rate": 6.448714709256768e-07, + "loss": 0.0687, + "step": 14316 + }, + { + "epoch": 2.3196694750486064, + "grad_norm": 1.0900496244430542, + "learning_rate": 6.445783555776289e-07, + "loss": 0.0667, + "step": 14317 + }, + { + "epoch": 2.3198314970836034, + "grad_norm": 0.8728010654449463, + "learning_rate": 6.442852970009925e-07, + "loss": 0.0648, + "step": 14318 + }, + { + "epoch": 2.3199935191186003, + "grad_norm": 1.05231511592865, + "learning_rate": 6.439922952047354e-07, + "loss": 0.0703, + "step": 14319 + }, + { + "epoch": 2.320155541153597, + "grad_norm": 1.0301355123519897, + "learning_rate": 6.436993501978226e-07, + "loss": 0.0609, + "step": 14320 + }, + { + "epoch": 2.3203175631885937, + "grad_norm": 1.086633563041687, + "learning_rate": 6.434064619892175e-07, + "loss": 0.0576, + "step": 14321 + }, + { + "epoch": 2.32047958522359, + "grad_norm": 0.8054091930389404, + "learning_rate": 6.431136305878819e-07, + "loss": 0.061, + "step": 14322 + }, + { + "epoch": 2.320641607258587, + "grad_norm": 0.8511539697647095, + "learning_rate": 6.428208560027755e-07, + "loss": 0.0574, + "step": 14323 + }, + { + "epoch": 2.320803629293584, + "grad_norm": 1.0268864631652832, + "learning_rate": 6.425281382428566e-07, + "loss": 0.0602, + "step": 14324 + }, + { + "epoch": 2.3209656513285806, + "grad_norm": 0.941626787185669, + "learning_rate": 6.422354773170825e-07, + "loss": 0.0707, + "step": 14325 + }, + { + "epoch": 2.3211276733635775, + "grad_norm": 0.8575886487960815, + "learning_rate": 6.419428732344055e-07, + "loss": 0.0642, + "step": 14326 + }, + { + "epoch": 2.3212896953985744, + "grad_norm": 0.7996649146080017, + "learning_rate": 6.41650326003781e-07, + "loss": 0.0594, + "step": 14327 + }, + { + "epoch": 2.321451717433571, + "grad_norm": 0.8261630535125732, + "learning_rate": 6.413578356341602e-07, + "loss": 0.061, + "step": 14328 + }, + { + "epoch": 2.321613739468568, + "grad_norm": 0.9302612543106079, + "learning_rate": 6.410654021344909e-07, + "loss": 0.0629, + "step": 14329 + }, + { + "epoch": 2.3217757615035644, + "grad_norm": 0.91783207654953, + "learning_rate": 6.407730255137212e-07, + "loss": 0.0618, + "step": 14330 + }, + { + "epoch": 2.3219377835385613, + "grad_norm": 0.9778722524642944, + "learning_rate": 6.404807057807982e-07, + "loss": 0.0656, + "step": 14331 + }, + { + "epoch": 2.3220998055735578, + "grad_norm": 0.9770204424858093, + "learning_rate": 6.401884429446667e-07, + "loss": 0.0747, + "step": 14332 + }, + { + "epoch": 2.3222618276085547, + "grad_norm": 0.974606990814209, + "learning_rate": 6.398962370142672e-07, + "loss": 0.0708, + "step": 14333 + }, + { + "epoch": 2.3224238496435516, + "grad_norm": 0.8390527963638306, + "learning_rate": 6.396040879985416e-07, + "loss": 0.0654, + "step": 14334 + }, + { + "epoch": 2.322585871678548, + "grad_norm": 0.8269057869911194, + "learning_rate": 6.393119959064287e-07, + "loss": 0.0601, + "step": 14335 + }, + { + "epoch": 2.322747893713545, + "grad_norm": 0.9386374354362488, + "learning_rate": 6.390199607468661e-07, + "loss": 0.0661, + "step": 14336 + }, + { + "epoch": 2.322909915748542, + "grad_norm": 0.795021116733551, + "learning_rate": 6.387279825287892e-07, + "loss": 0.0568, + "step": 14337 + }, + { + "epoch": 2.3230719377835385, + "grad_norm": 0.8094752430915833, + "learning_rate": 6.384360612611317e-07, + "loss": 0.0615, + "step": 14338 + }, + { + "epoch": 2.3232339598185354, + "grad_norm": 0.8009088039398193, + "learning_rate": 6.381441969528268e-07, + "loss": 0.0567, + "step": 14339 + }, + { + "epoch": 2.323395981853532, + "grad_norm": 1.1142011880874634, + "learning_rate": 6.378523896128022e-07, + "loss": 0.0741, + "step": 14340 + }, + { + "epoch": 2.323558003888529, + "grad_norm": 0.8346922397613525, + "learning_rate": 6.37560639249989e-07, + "loss": 0.0549, + "step": 14341 + }, + { + "epoch": 2.323720025923526, + "grad_norm": 0.9480348825454712, + "learning_rate": 6.37268945873313e-07, + "loss": 0.0684, + "step": 14342 + }, + { + "epoch": 2.3238820479585223, + "grad_norm": 0.8845011591911316, + "learning_rate": 6.369773094917006e-07, + "loss": 0.0602, + "step": 14343 + }, + { + "epoch": 2.324044069993519, + "grad_norm": 0.7797037363052368, + "learning_rate": 6.36685730114073e-07, + "loss": 0.0554, + "step": 14344 + }, + { + "epoch": 2.3242060920285157, + "grad_norm": 0.9391816854476929, + "learning_rate": 6.363942077493526e-07, + "loss": 0.0599, + "step": 14345 + }, + { + "epoch": 2.3243681140635126, + "grad_norm": 0.895332396030426, + "learning_rate": 6.361027424064609e-07, + "loss": 0.0646, + "step": 14346 + }, + { + "epoch": 2.3245301360985096, + "grad_norm": 1.2685229778289795, + "learning_rate": 6.35811334094314e-07, + "loss": 0.0661, + "step": 14347 + }, + { + "epoch": 2.324692158133506, + "grad_norm": 0.9253360033035278, + "learning_rate": 6.355199828218289e-07, + "loss": 0.0673, + "step": 14348 + }, + { + "epoch": 2.324854180168503, + "grad_norm": 0.8045247197151184, + "learning_rate": 6.352286885979206e-07, + "loss": 0.0547, + "step": 14349 + }, + { + "epoch": 2.3250162022035, + "grad_norm": 0.9229259490966797, + "learning_rate": 6.349374514315015e-07, + "loss": 0.0606, + "step": 14350 + }, + { + "epoch": 2.3251782242384964, + "grad_norm": 0.9514033794403076, + "learning_rate": 6.346462713314832e-07, + "loss": 0.0599, + "step": 14351 + }, + { + "epoch": 2.3253402462734933, + "grad_norm": 0.9198815822601318, + "learning_rate": 6.343551483067751e-07, + "loss": 0.0639, + "step": 14352 + }, + { + "epoch": 2.32550226830849, + "grad_norm": 0.8940281271934509, + "learning_rate": 6.340640823662842e-07, + "loss": 0.059, + "step": 14353 + }, + { + "epoch": 2.3256642903434868, + "grad_norm": 0.9115906953811646, + "learning_rate": 6.337730735189174e-07, + "loss": 0.0633, + "step": 14354 + }, + { + "epoch": 2.3258263123784833, + "grad_norm": 0.9600539803504944, + "learning_rate": 6.334821217735778e-07, + "loss": 0.0585, + "step": 14355 + }, + { + "epoch": 2.32598833441348, + "grad_norm": 0.9461213946342468, + "learning_rate": 6.331912271391688e-07, + "loss": 0.0607, + "step": 14356 + }, + { + "epoch": 2.326150356448477, + "grad_norm": 0.9921082258224487, + "learning_rate": 6.329003896245908e-07, + "loss": 0.0614, + "step": 14357 + }, + { + "epoch": 2.3263123784834736, + "grad_norm": 0.8889320492744446, + "learning_rate": 6.326096092387429e-07, + "loss": 0.0627, + "step": 14358 + }, + { + "epoch": 2.3264744005184705, + "grad_norm": 0.9381389021873474, + "learning_rate": 6.323188859905207e-07, + "loss": 0.0633, + "step": 14359 + }, + { + "epoch": 2.3266364225534675, + "grad_norm": 0.850811779499054, + "learning_rate": 6.320282198888217e-07, + "loss": 0.0619, + "step": 14360 + }, + { + "epoch": 2.326798444588464, + "grad_norm": 0.831915557384491, + "learning_rate": 6.317376109425397e-07, + "loss": 0.0546, + "step": 14361 + }, + { + "epoch": 2.326960466623461, + "grad_norm": 0.8602516651153564, + "learning_rate": 6.314470591605646e-07, + "loss": 0.0624, + "step": 14362 + }, + { + "epoch": 2.3271224886584574, + "grad_norm": 0.9104455709457397, + "learning_rate": 6.311565645517878e-07, + "loss": 0.0652, + "step": 14363 + }, + { + "epoch": 2.3272845106934543, + "grad_norm": 1.1242001056671143, + "learning_rate": 6.308661271250974e-07, + "loss": 0.0677, + "step": 14364 + }, + { + "epoch": 2.3274465327284513, + "grad_norm": 0.8856641054153442, + "learning_rate": 6.305757468893805e-07, + "loss": 0.0649, + "step": 14365 + }, + { + "epoch": 2.3276085547634477, + "grad_norm": 0.8960265517234802, + "learning_rate": 6.302854238535219e-07, + "loss": 0.0605, + "step": 14366 + }, + { + "epoch": 2.3277705767984447, + "grad_norm": 0.9673840999603271, + "learning_rate": 6.299951580264047e-07, + "loss": 0.0579, + "step": 14367 + }, + { + "epoch": 2.327932598833441, + "grad_norm": 1.0338740348815918, + "learning_rate": 6.2970494941691e-07, + "loss": 0.0713, + "step": 14368 + }, + { + "epoch": 2.328094620868438, + "grad_norm": 1.090754508972168, + "learning_rate": 6.294147980339182e-07, + "loss": 0.0671, + "step": 14369 + }, + { + "epoch": 2.328256642903435, + "grad_norm": 0.877884566783905, + "learning_rate": 6.291247038863066e-07, + "loss": 0.0598, + "step": 14370 + }, + { + "epoch": 2.3284186649384315, + "grad_norm": 0.8679099678993225, + "learning_rate": 6.288346669829518e-07, + "loss": 0.0611, + "step": 14371 + }, + { + "epoch": 2.3285806869734285, + "grad_norm": 0.972556471824646, + "learning_rate": 6.285446873327289e-07, + "loss": 0.0667, + "step": 14372 + }, + { + "epoch": 2.3287427090084254, + "grad_norm": 0.8393288254737854, + "learning_rate": 6.282547649445087e-07, + "loss": 0.0602, + "step": 14373 + }, + { + "epoch": 2.328904731043422, + "grad_norm": 0.8273356556892395, + "learning_rate": 6.279648998271626e-07, + "loss": 0.0599, + "step": 14374 + }, + { + "epoch": 2.329066753078419, + "grad_norm": 0.8478513360023499, + "learning_rate": 6.276750919895611e-07, + "loss": 0.0557, + "step": 14375 + }, + { + "epoch": 2.3292287751134153, + "grad_norm": 0.9557338953018188, + "learning_rate": 6.273853414405715e-07, + "loss": 0.0706, + "step": 14376 + }, + { + "epoch": 2.3293907971484122, + "grad_norm": 0.9526261687278748, + "learning_rate": 6.270956481890581e-07, + "loss": 0.0676, + "step": 14377 + }, + { + "epoch": 2.3295528191834087, + "grad_norm": 0.8927558660507202, + "learning_rate": 6.268060122438846e-07, + "loss": 0.0581, + "step": 14378 + }, + { + "epoch": 2.3297148412184057, + "grad_norm": 0.9341113567352295, + "learning_rate": 6.265164336139157e-07, + "loss": 0.0619, + "step": 14379 + }, + { + "epoch": 2.3298768632534026, + "grad_norm": 0.8526856303215027, + "learning_rate": 6.262269123080095e-07, + "loss": 0.06, + "step": 14380 + }, + { + "epoch": 2.330038885288399, + "grad_norm": 1.0245894193649292, + "learning_rate": 6.259374483350253e-07, + "loss": 0.0641, + "step": 14381 + }, + { + "epoch": 2.330200907323396, + "grad_norm": 0.8189849853515625, + "learning_rate": 6.256480417038202e-07, + "loss": 0.0572, + "step": 14382 + }, + { + "epoch": 2.330362929358393, + "grad_norm": 0.9356175661087036, + "learning_rate": 6.253586924232489e-07, + "loss": 0.0652, + "step": 14383 + }, + { + "epoch": 2.3305249513933894, + "grad_norm": 0.9660431742668152, + "learning_rate": 6.250694005021651e-07, + "loss": 0.0656, + "step": 14384 + }, + { + "epoch": 2.3306869734283864, + "grad_norm": 0.7970089316368103, + "learning_rate": 6.247801659494207e-07, + "loss": 0.0563, + "step": 14385 + }, + { + "epoch": 2.330848995463383, + "grad_norm": 0.9310674071311951, + "learning_rate": 6.244909887738651e-07, + "loss": 0.069, + "step": 14386 + }, + { + "epoch": 2.33101101749838, + "grad_norm": 1.0296223163604736, + "learning_rate": 6.242018689843471e-07, + "loss": 0.07, + "step": 14387 + }, + { + "epoch": 2.3311730395333763, + "grad_norm": 0.875980019569397, + "learning_rate": 6.239128065897113e-07, + "loss": 0.0623, + "step": 14388 + }, + { + "epoch": 2.3313350615683732, + "grad_norm": 0.9242268204689026, + "learning_rate": 6.236238015988044e-07, + "loss": 0.0655, + "step": 14389 + }, + { + "epoch": 2.33149708360337, + "grad_norm": 0.8916738629341125, + "learning_rate": 6.233348540204689e-07, + "loss": 0.0632, + "step": 14390 + }, + { + "epoch": 2.3316591056383666, + "grad_norm": 0.9092872142791748, + "learning_rate": 6.23045963863545e-07, + "loss": 0.0652, + "step": 14391 + }, + { + "epoch": 2.3318211276733636, + "grad_norm": 0.8567991852760315, + "learning_rate": 6.227571311368724e-07, + "loss": 0.0694, + "step": 14392 + }, + { + "epoch": 2.3319831497083605, + "grad_norm": 0.921323299407959, + "learning_rate": 6.22468355849288e-07, + "loss": 0.0666, + "step": 14393 + }, + { + "epoch": 2.332145171743357, + "grad_norm": 0.9615211486816406, + "learning_rate": 6.221796380096298e-07, + "loss": 0.0626, + "step": 14394 + }, + { + "epoch": 2.332307193778354, + "grad_norm": 0.9106801748275757, + "learning_rate": 6.218909776267295e-07, + "loss": 0.0597, + "step": 14395 + }, + { + "epoch": 2.3324692158133504, + "grad_norm": 0.896317183971405, + "learning_rate": 6.216023747094207e-07, + "loss": 0.06, + "step": 14396 + }, + { + "epoch": 2.3326312378483474, + "grad_norm": 0.9247400760650635, + "learning_rate": 6.213138292665333e-07, + "loss": 0.0679, + "step": 14397 + }, + { + "epoch": 2.3327932598833443, + "grad_norm": 0.9041847586631775, + "learning_rate": 6.210253413068964e-07, + "loss": 0.0586, + "step": 14398 + }, + { + "epoch": 2.332955281918341, + "grad_norm": 0.840080201625824, + "learning_rate": 6.207369108393374e-07, + "loss": 0.0549, + "step": 14399 + }, + { + "epoch": 2.3331173039533377, + "grad_norm": 0.8104625344276428, + "learning_rate": 6.20448537872681e-07, + "loss": 0.0578, + "step": 14400 + }, + { + "epoch": 2.333279325988334, + "grad_norm": 0.9511587023735046, + "learning_rate": 6.201602224157508e-07, + "loss": 0.0697, + "step": 14401 + }, + { + "epoch": 2.333441348023331, + "grad_norm": 1.060603380203247, + "learning_rate": 6.198719644773687e-07, + "loss": 0.0735, + "step": 14402 + }, + { + "epoch": 2.333603370058328, + "grad_norm": 0.9461984634399414, + "learning_rate": 6.195837640663546e-07, + "loss": 0.06, + "step": 14403 + }, + { + "epoch": 2.3337653920933246, + "grad_norm": 0.7724205851554871, + "learning_rate": 6.192956211915269e-07, + "loss": 0.0539, + "step": 14404 + }, + { + "epoch": 2.3339274141283215, + "grad_norm": 0.7702530026435852, + "learning_rate": 6.190075358617029e-07, + "loss": 0.0553, + "step": 14405 + }, + { + "epoch": 2.3340894361633184, + "grad_norm": 1.1382404565811157, + "learning_rate": 6.187195080856953e-07, + "loss": 0.0647, + "step": 14406 + }, + { + "epoch": 2.334251458198315, + "grad_norm": 0.8377068042755127, + "learning_rate": 6.184315378723177e-07, + "loss": 0.0581, + "step": 14407 + }, + { + "epoch": 2.334413480233312, + "grad_norm": 1.097277283668518, + "learning_rate": 6.181436252303829e-07, + "loss": 0.0576, + "step": 14408 + }, + { + "epoch": 2.3345755022683083, + "grad_norm": 1.051430106163025, + "learning_rate": 6.178557701686985e-07, + "loss": 0.0641, + "step": 14409 + }, + { + "epoch": 2.3347375243033053, + "grad_norm": 0.9613043665885925, + "learning_rate": 6.175679726960731e-07, + "loss": 0.0686, + "step": 14410 + }, + { + "epoch": 2.3348995463383018, + "grad_norm": 0.791201114654541, + "learning_rate": 6.17280232821312e-07, + "loss": 0.0547, + "step": 14411 + }, + { + "epoch": 2.3350615683732987, + "grad_norm": 1.0079299211502075, + "learning_rate": 6.169925505532201e-07, + "loss": 0.0659, + "step": 14412 + }, + { + "epoch": 2.3352235904082956, + "grad_norm": 1.0087671279907227, + "learning_rate": 6.167049259005989e-07, + "loss": 0.0701, + "step": 14413 + }, + { + "epoch": 2.335385612443292, + "grad_norm": 0.9747778177261353, + "learning_rate": 6.164173588722497e-07, + "loss": 0.0582, + "step": 14414 + }, + { + "epoch": 2.335547634478289, + "grad_norm": 0.862362265586853, + "learning_rate": 6.161298494769713e-07, + "loss": 0.0548, + "step": 14415 + }, + { + "epoch": 2.335709656513286, + "grad_norm": 1.0373525619506836, + "learning_rate": 6.158423977235611e-07, + "loss": 0.0639, + "step": 14416 + }, + { + "epoch": 2.3358716785482825, + "grad_norm": 0.9192182421684265, + "learning_rate": 6.155550036208125e-07, + "loss": 0.066, + "step": 14417 + }, + { + "epoch": 2.3360337005832794, + "grad_norm": 0.8472986221313477, + "learning_rate": 6.152676671775215e-07, + "loss": 0.056, + "step": 14418 + }, + { + "epoch": 2.336195722618276, + "grad_norm": 1.020595669746399, + "learning_rate": 6.149803884024786e-07, + "loss": 0.0643, + "step": 14419 + }, + { + "epoch": 2.336357744653273, + "grad_norm": 0.877204954624176, + "learning_rate": 6.146931673044751e-07, + "loss": 0.0563, + "step": 14420 + }, + { + "epoch": 2.3365197666882698, + "grad_norm": 0.8729059100151062, + "learning_rate": 6.144060038922967e-07, + "loss": 0.0586, + "step": 14421 + }, + { + "epoch": 2.3366817887232663, + "grad_norm": 1.002275824546814, + "learning_rate": 6.141188981747323e-07, + "loss": 0.0644, + "step": 14422 + }, + { + "epoch": 2.336843810758263, + "grad_norm": 0.8643121123313904, + "learning_rate": 6.138318501605667e-07, + "loss": 0.0591, + "step": 14423 + }, + { + "epoch": 2.3370058327932597, + "grad_norm": 0.9158600568771362, + "learning_rate": 6.135448598585814e-07, + "loss": 0.0647, + "step": 14424 + }, + { + "epoch": 2.3371678548282566, + "grad_norm": 0.9756930470466614, + "learning_rate": 6.132579272775583e-07, + "loss": 0.0646, + "step": 14425 + }, + { + "epoch": 2.3373298768632536, + "grad_norm": 0.9086965322494507, + "learning_rate": 6.129710524262758e-07, + "loss": 0.0622, + "step": 14426 + }, + { + "epoch": 2.33749189889825, + "grad_norm": 0.8095213174819946, + "learning_rate": 6.12684235313514e-07, + "loss": 0.0585, + "step": 14427 + }, + { + "epoch": 2.337653920933247, + "grad_norm": 0.8328830599784851, + "learning_rate": 6.123974759480469e-07, + "loss": 0.0553, + "step": 14428 + }, + { + "epoch": 2.337815942968244, + "grad_norm": 0.9773808121681213, + "learning_rate": 6.12110774338649e-07, + "loss": 0.0597, + "step": 14429 + }, + { + "epoch": 2.3379779650032404, + "grad_norm": 1.039530634880066, + "learning_rate": 6.118241304940928e-07, + "loss": 0.0648, + "step": 14430 + }, + { + "epoch": 2.3381399870382373, + "grad_norm": 0.8209445476531982, + "learning_rate": 6.115375444231489e-07, + "loss": 0.0472, + "step": 14431 + }, + { + "epoch": 2.338302009073234, + "grad_norm": 0.8805145025253296, + "learning_rate": 6.112510161345861e-07, + "loss": 0.0629, + "step": 14432 + }, + { + "epoch": 2.3384640311082308, + "grad_norm": 1.0358737707138062, + "learning_rate": 6.109645456371715e-07, + "loss": 0.0585, + "step": 14433 + }, + { + "epoch": 2.3386260531432272, + "grad_norm": 0.8988499641418457, + "learning_rate": 6.106781329396714e-07, + "loss": 0.0542, + "step": 14434 + }, + { + "epoch": 2.338788075178224, + "grad_norm": 0.8143104910850525, + "learning_rate": 6.103917780508475e-07, + "loss": 0.0535, + "step": 14435 + }, + { + "epoch": 2.338950097213221, + "grad_norm": 0.8741886019706726, + "learning_rate": 6.101054809794615e-07, + "loss": 0.0623, + "step": 14436 + }, + { + "epoch": 2.3391121192482176, + "grad_norm": 0.9815524816513062, + "learning_rate": 6.098192417342755e-07, + "loss": 0.0693, + "step": 14437 + }, + { + "epoch": 2.3392741412832145, + "grad_norm": 0.9720104336738586, + "learning_rate": 6.095330603240468e-07, + "loss": 0.0597, + "step": 14438 + }, + { + "epoch": 2.3394361633182115, + "grad_norm": 0.9975051879882812, + "learning_rate": 6.092469367575312e-07, + "loss": 0.0565, + "step": 14439 + }, + { + "epoch": 2.339598185353208, + "grad_norm": 1.0102384090423584, + "learning_rate": 6.089608710434836e-07, + "loss": 0.0639, + "step": 14440 + }, + { + "epoch": 2.339760207388205, + "grad_norm": 1.2839295864105225, + "learning_rate": 6.086748631906572e-07, + "loss": 0.0529, + "step": 14441 + }, + { + "epoch": 2.3399222294232014, + "grad_norm": 0.770176112651825, + "learning_rate": 6.083889132078033e-07, + "loss": 0.0553, + "step": 14442 + }, + { + "epoch": 2.3400842514581983, + "grad_norm": 0.8635346293449402, + "learning_rate": 6.081030211036707e-07, + "loss": 0.0545, + "step": 14443 + }, + { + "epoch": 2.3402462734931953, + "grad_norm": 0.951804518699646, + "learning_rate": 6.078171868870075e-07, + "loss": 0.0615, + "step": 14444 + }, + { + "epoch": 2.3404082955281917, + "grad_norm": 0.8703580498695374, + "learning_rate": 6.075314105665595e-07, + "loss": 0.0609, + "step": 14445 + }, + { + "epoch": 2.3405703175631887, + "grad_norm": 0.9203516840934753, + "learning_rate": 6.072456921510703e-07, + "loss": 0.0607, + "step": 14446 + }, + { + "epoch": 2.340732339598185, + "grad_norm": 0.8416757583618164, + "learning_rate": 6.06960031649283e-07, + "loss": 0.0551, + "step": 14447 + }, + { + "epoch": 2.340894361633182, + "grad_norm": 1.4582421779632568, + "learning_rate": 6.066744290699372e-07, + "loss": 0.0577, + "step": 14448 + }, + { + "epoch": 2.341056383668179, + "grad_norm": 1.0675171613693237, + "learning_rate": 6.06388884421773e-07, + "loss": 0.0636, + "step": 14449 + }, + { + "epoch": 2.3412184057031755, + "grad_norm": 1.3599189519882202, + "learning_rate": 6.061033977135253e-07, + "loss": 0.0655, + "step": 14450 + }, + { + "epoch": 2.3413804277381725, + "grad_norm": 0.8678320050239563, + "learning_rate": 6.058179689539309e-07, + "loss": 0.0632, + "step": 14451 + }, + { + "epoch": 2.3415424497731694, + "grad_norm": 0.7486401200294495, + "learning_rate": 6.055325981517238e-07, + "loss": 0.0604, + "step": 14452 + }, + { + "epoch": 2.341704471808166, + "grad_norm": 0.9437369108200073, + "learning_rate": 6.052472853156339e-07, + "loss": 0.066, + "step": 14453 + }, + { + "epoch": 2.341866493843163, + "grad_norm": 0.8997758030891418, + "learning_rate": 6.049620304543916e-07, + "loss": 0.0664, + "step": 14454 + }, + { + "epoch": 2.3420285158781593, + "grad_norm": 0.826973557472229, + "learning_rate": 6.046768335767248e-07, + "loss": 0.0587, + "step": 14455 + }, + { + "epoch": 2.3421905379131562, + "grad_norm": 0.9194096326828003, + "learning_rate": 6.043916946913613e-07, + "loss": 0.061, + "step": 14456 + }, + { + "epoch": 2.3423525599481527, + "grad_norm": 1.0393024682998657, + "learning_rate": 6.041066138070245e-07, + "loss": 0.0642, + "step": 14457 + }, + { + "epoch": 2.3425145819831497, + "grad_norm": 1.180245041847229, + "learning_rate": 6.038215909324372e-07, + "loss": 0.0696, + "step": 14458 + }, + { + "epoch": 2.3426766040181466, + "grad_norm": 0.9270623326301575, + "learning_rate": 6.035366260763203e-07, + "loss": 0.0644, + "step": 14459 + }, + { + "epoch": 2.342838626053143, + "grad_norm": 0.8007288575172424, + "learning_rate": 6.032517192473935e-07, + "loss": 0.0534, + "step": 14460 + }, + { + "epoch": 2.34300064808814, + "grad_norm": 0.85854172706604, + "learning_rate": 6.02966870454374e-07, + "loss": 0.0591, + "step": 14461 + }, + { + "epoch": 2.343162670123137, + "grad_norm": 0.8734108805656433, + "learning_rate": 6.026820797059777e-07, + "loss": 0.0645, + "step": 14462 + }, + { + "epoch": 2.3433246921581334, + "grad_norm": 0.9076907634735107, + "learning_rate": 6.023973470109182e-07, + "loss": 0.0626, + "step": 14463 + }, + { + "epoch": 2.3434867141931304, + "grad_norm": 0.7974627614021301, + "learning_rate": 6.021126723779075e-07, + "loss": 0.0583, + "step": 14464 + }, + { + "epoch": 2.343648736228127, + "grad_norm": 0.9349268078804016, + "learning_rate": 6.018280558156566e-07, + "loss": 0.0679, + "step": 14465 + }, + { + "epoch": 2.343810758263124, + "grad_norm": 0.9849607348442078, + "learning_rate": 6.015434973328735e-07, + "loss": 0.0693, + "step": 14466 + }, + { + "epoch": 2.3439727802981207, + "grad_norm": 0.8764670491218567, + "learning_rate": 6.012589969382659e-07, + "loss": 0.0605, + "step": 14467 + }, + { + "epoch": 2.344134802333117, + "grad_norm": 0.8385621905326843, + "learning_rate": 6.009745546405377e-07, + "loss": 0.0605, + "step": 14468 + }, + { + "epoch": 2.344296824368114, + "grad_norm": 0.8304133415222168, + "learning_rate": 6.006901704483917e-07, + "loss": 0.0562, + "step": 14469 + }, + { + "epoch": 2.3444588464031106, + "grad_norm": 0.8149062395095825, + "learning_rate": 6.00405844370531e-07, + "loss": 0.0544, + "step": 14470 + }, + { + "epoch": 2.3446208684381076, + "grad_norm": 0.8023223280906677, + "learning_rate": 6.001215764156551e-07, + "loss": 0.057, + "step": 14471 + }, + { + "epoch": 2.3447828904731045, + "grad_norm": 0.7750385403633118, + "learning_rate": 5.998373665924606e-07, + "loss": 0.0537, + "step": 14472 + }, + { + "epoch": 2.344944912508101, + "grad_norm": 0.9637815952301025, + "learning_rate": 5.995532149096447e-07, + "loss": 0.066, + "step": 14473 + }, + { + "epoch": 2.345106934543098, + "grad_norm": 0.8537014722824097, + "learning_rate": 5.992691213759011e-07, + "loss": 0.0598, + "step": 14474 + }, + { + "epoch": 2.345268956578095, + "grad_norm": 0.8019862174987793, + "learning_rate": 5.989850859999227e-07, + "loss": 0.0572, + "step": 14475 + }, + { + "epoch": 2.3454309786130914, + "grad_norm": 0.8479922413825989, + "learning_rate": 5.987011087904007e-07, + "loss": 0.0562, + "step": 14476 + }, + { + "epoch": 2.3455930006480883, + "grad_norm": 0.8056397438049316, + "learning_rate": 5.984171897560234e-07, + "loss": 0.0596, + "step": 14477 + }, + { + "epoch": 2.345755022683085, + "grad_norm": 0.8855328559875488, + "learning_rate": 5.981333289054792e-07, + "loss": 0.0617, + "step": 14478 + }, + { + "epoch": 2.3459170447180817, + "grad_norm": 0.8664817810058594, + "learning_rate": 5.978495262474509e-07, + "loss": 0.0606, + "step": 14479 + }, + { + "epoch": 2.346079066753078, + "grad_norm": 0.8697784543037415, + "learning_rate": 5.975657817906253e-07, + "loss": 0.0604, + "step": 14480 + }, + { + "epoch": 2.346241088788075, + "grad_norm": 0.860231339931488, + "learning_rate": 5.972820955436825e-07, + "loss": 0.062, + "step": 14481 + }, + { + "epoch": 2.346403110823072, + "grad_norm": 0.8649452328681946, + "learning_rate": 5.96998467515304e-07, + "loss": 0.0594, + "step": 14482 + }, + { + "epoch": 2.3465651328580686, + "grad_norm": 0.9872585535049438, + "learning_rate": 5.967148977141665e-07, + "loss": 0.0711, + "step": 14483 + }, + { + "epoch": 2.3467271548930655, + "grad_norm": 0.8951234817504883, + "learning_rate": 5.964313861489466e-07, + "loss": 0.0692, + "step": 14484 + }, + { + "epoch": 2.3468891769280624, + "grad_norm": 0.8037145137786865, + "learning_rate": 5.96147932828321e-07, + "loss": 0.0561, + "step": 14485 + }, + { + "epoch": 2.347051198963059, + "grad_norm": 0.9256864786148071, + "learning_rate": 5.958645377609606e-07, + "loss": 0.0604, + "step": 14486 + }, + { + "epoch": 2.347213220998056, + "grad_norm": 0.9966773390769958, + "learning_rate": 5.955812009555378e-07, + "loss": 0.0661, + "step": 14487 + }, + { + "epoch": 2.3473752430330523, + "grad_norm": 0.930876612663269, + "learning_rate": 5.952979224207205e-07, + "loss": 0.0658, + "step": 14488 + }, + { + "epoch": 2.3475372650680493, + "grad_norm": 0.9100360870361328, + "learning_rate": 5.950147021651792e-07, + "loss": 0.0675, + "step": 14489 + }, + { + "epoch": 2.347699287103046, + "grad_norm": 0.8395872116088867, + "learning_rate": 5.947315401975773e-07, + "loss": 0.0549, + "step": 14490 + }, + { + "epoch": 2.3478613091380427, + "grad_norm": 0.807367205619812, + "learning_rate": 5.944484365265795e-07, + "loss": 0.0558, + "step": 14491 + }, + { + "epoch": 2.3480233311730396, + "grad_norm": 0.902279257774353, + "learning_rate": 5.941653911608486e-07, + "loss": 0.0609, + "step": 14492 + }, + { + "epoch": 2.348185353208036, + "grad_norm": 0.7829273343086243, + "learning_rate": 5.938824041090443e-07, + "loss": 0.0568, + "step": 14493 + }, + { + "epoch": 2.348347375243033, + "grad_norm": 1.0976831912994385, + "learning_rate": 5.935994753798258e-07, + "loss": 0.0659, + "step": 14494 + }, + { + "epoch": 2.34850939727803, + "grad_norm": 0.8173691630363464, + "learning_rate": 5.933166049818501e-07, + "loss": 0.0597, + "step": 14495 + }, + { + "epoch": 2.3486714193130265, + "grad_norm": 0.7735475897789001, + "learning_rate": 5.930337929237726e-07, + "loss": 0.0606, + "step": 14496 + }, + { + "epoch": 2.3488334413480234, + "grad_norm": 0.9407158493995667, + "learning_rate": 5.927510392142458e-07, + "loss": 0.0622, + "step": 14497 + }, + { + "epoch": 2.34899546338302, + "grad_norm": 0.8489283323287964, + "learning_rate": 5.924683438619208e-07, + "loss": 0.0552, + "step": 14498 + }, + { + "epoch": 2.349157485418017, + "grad_norm": 0.9036144614219666, + "learning_rate": 5.921857068754494e-07, + "loss": 0.0573, + "step": 14499 + }, + { + "epoch": 2.3493195074530138, + "grad_norm": 0.8492865562438965, + "learning_rate": 5.91903128263479e-07, + "loss": 0.0608, + "step": 14500 + }, + { + "epoch": 2.3494815294880103, + "grad_norm": 0.8074772953987122, + "learning_rate": 5.916206080346549e-07, + "loss": 0.0574, + "step": 14501 + }, + { + "epoch": 2.349643551523007, + "grad_norm": 0.9476245641708374, + "learning_rate": 5.913381461976217e-07, + "loss": 0.0591, + "step": 14502 + }, + { + "epoch": 2.3498055735580037, + "grad_norm": 0.938563346862793, + "learning_rate": 5.910557427610225e-07, + "loss": 0.0571, + "step": 14503 + }, + { + "epoch": 2.3499675955930006, + "grad_norm": 0.9736203551292419, + "learning_rate": 5.907733977334978e-07, + "loss": 0.0621, + "step": 14504 + }, + { + "epoch": 2.3501296176279975, + "grad_norm": 0.8642393350601196, + "learning_rate": 5.904911111236872e-07, + "loss": 0.0633, + "step": 14505 + }, + { + "epoch": 2.350291639662994, + "grad_norm": 0.9992730617523193, + "learning_rate": 5.902088829402274e-07, + "loss": 0.0643, + "step": 14506 + }, + { + "epoch": 2.350453661697991, + "grad_norm": 0.8306195735931396, + "learning_rate": 5.899267131917547e-07, + "loss": 0.0538, + "step": 14507 + }, + { + "epoch": 2.350615683732988, + "grad_norm": 0.9145315885543823, + "learning_rate": 5.896446018869018e-07, + "loss": 0.0614, + "step": 14508 + }, + { + "epoch": 2.3507777057679844, + "grad_norm": 0.948464035987854, + "learning_rate": 5.893625490343014e-07, + "loss": 0.0604, + "step": 14509 + }, + { + "epoch": 2.3509397278029813, + "grad_norm": 0.7787171006202698, + "learning_rate": 5.890805546425832e-07, + "loss": 0.0572, + "step": 14510 + }, + { + "epoch": 2.351101749837978, + "grad_norm": 0.9339138269424438, + "learning_rate": 5.887986187203762e-07, + "loss": 0.0636, + "step": 14511 + }, + { + "epoch": 2.3512637718729748, + "grad_norm": 1.0364383459091187, + "learning_rate": 5.885167412763051e-07, + "loss": 0.067, + "step": 14512 + }, + { + "epoch": 2.3514257939079712, + "grad_norm": 0.8669085502624512, + "learning_rate": 5.88234922318997e-07, + "loss": 0.0552, + "step": 14513 + }, + { + "epoch": 2.351587815942968, + "grad_norm": 0.9558306932449341, + "learning_rate": 5.879531618570738e-07, + "loss": 0.0647, + "step": 14514 + }, + { + "epoch": 2.351749837977965, + "grad_norm": 0.9057703614234924, + "learning_rate": 5.876714598991573e-07, + "loss": 0.0636, + "step": 14515 + }, + { + "epoch": 2.3519118600129616, + "grad_norm": 0.9369593262672424, + "learning_rate": 5.873898164538658e-07, + "loss": 0.0662, + "step": 14516 + }, + { + "epoch": 2.3520738820479585, + "grad_norm": 0.9057307839393616, + "learning_rate": 5.871082315298168e-07, + "loss": 0.0591, + "step": 14517 + }, + { + "epoch": 2.3522359040829555, + "grad_norm": 0.8824877738952637, + "learning_rate": 5.868267051356283e-07, + "loss": 0.0576, + "step": 14518 + }, + { + "epoch": 2.352397926117952, + "grad_norm": 0.895741879940033, + "learning_rate": 5.865452372799121e-07, + "loss": 0.0594, + "step": 14519 + }, + { + "epoch": 2.352559948152949, + "grad_norm": 0.8106254935264587, + "learning_rate": 5.86263827971281e-07, + "loss": 0.0609, + "step": 14520 + }, + { + "epoch": 2.3527219701879454, + "grad_norm": 0.7932037115097046, + "learning_rate": 5.859824772183459e-07, + "loss": 0.0541, + "step": 14521 + }, + { + "epoch": 2.3528839922229423, + "grad_norm": 1.2299085855484009, + "learning_rate": 5.857011850297148e-07, + "loss": 0.0635, + "step": 14522 + }, + { + "epoch": 2.3530460142579392, + "grad_norm": 0.9761585593223572, + "learning_rate": 5.854199514139952e-07, + "loss": 0.0563, + "step": 14523 + }, + { + "epoch": 2.3532080362929357, + "grad_norm": 0.9831428527832031, + "learning_rate": 5.851387763797916e-07, + "loss": 0.0723, + "step": 14524 + }, + { + "epoch": 2.3533700583279327, + "grad_norm": 0.9172781705856323, + "learning_rate": 5.848576599357078e-07, + "loss": 0.0666, + "step": 14525 + }, + { + "epoch": 2.353532080362929, + "grad_norm": 0.9665817022323608, + "learning_rate": 5.845766020903459e-07, + "loss": 0.0599, + "step": 14526 + }, + { + "epoch": 2.353694102397926, + "grad_norm": 0.8048878908157349, + "learning_rate": 5.842956028523031e-07, + "loss": 0.0518, + "step": 14527 + }, + { + "epoch": 2.353856124432923, + "grad_norm": 1.1949489116668701, + "learning_rate": 5.840146622301796e-07, + "loss": 0.0697, + "step": 14528 + }, + { + "epoch": 2.3540181464679195, + "grad_norm": 1.0436763763427734, + "learning_rate": 5.837337802325718e-07, + "loss": 0.0735, + "step": 14529 + }, + { + "epoch": 2.3541801685029164, + "grad_norm": 0.9715738892555237, + "learning_rate": 5.834529568680722e-07, + "loss": 0.067, + "step": 14530 + }, + { + "epoch": 2.3543421905379134, + "grad_norm": 0.9725156426429749, + "learning_rate": 5.83172192145274e-07, + "loss": 0.065, + "step": 14531 + }, + { + "epoch": 2.35450421257291, + "grad_norm": 1.021435022354126, + "learning_rate": 5.828914860727674e-07, + "loss": 0.0635, + "step": 14532 + }, + { + "epoch": 2.354666234607907, + "grad_norm": 0.9970274567604065, + "learning_rate": 5.826108386591436e-07, + "loss": 0.0587, + "step": 14533 + }, + { + "epoch": 2.3548282566429033, + "grad_norm": 1.0088868141174316, + "learning_rate": 5.823302499129873e-07, + "loss": 0.0653, + "step": 14534 + }, + { + "epoch": 2.3549902786779002, + "grad_norm": 0.9348923563957214, + "learning_rate": 5.820497198428849e-07, + "loss": 0.0649, + "step": 14535 + }, + { + "epoch": 2.3551523007128967, + "grad_norm": 0.8347666263580322, + "learning_rate": 5.817692484574197e-07, + "loss": 0.0563, + "step": 14536 + }, + { + "epoch": 2.3553143227478937, + "grad_norm": 0.8557009696960449, + "learning_rate": 5.814888357651733e-07, + "loss": 0.0638, + "step": 14537 + }, + { + "epoch": 2.3554763447828906, + "grad_norm": 0.8046319484710693, + "learning_rate": 5.81208481774726e-07, + "loss": 0.0533, + "step": 14538 + }, + { + "epoch": 2.355638366817887, + "grad_norm": 0.9410936832427979, + "learning_rate": 5.809281864946556e-07, + "loss": 0.0613, + "step": 14539 + }, + { + "epoch": 2.355800388852884, + "grad_norm": 0.840984582901001, + "learning_rate": 5.806479499335385e-07, + "loss": 0.0563, + "step": 14540 + }, + { + "epoch": 2.355962410887881, + "grad_norm": 0.9175610542297363, + "learning_rate": 5.803677720999495e-07, + "loss": 0.059, + "step": 14541 + }, + { + "epoch": 2.3561244329228774, + "grad_norm": 0.8809838891029358, + "learning_rate": 5.800876530024615e-07, + "loss": 0.0598, + "step": 14542 + }, + { + "epoch": 2.3562864549578744, + "grad_norm": 0.9057488441467285, + "learning_rate": 5.798075926496449e-07, + "loss": 0.0637, + "step": 14543 + }, + { + "epoch": 2.356448476992871, + "grad_norm": 0.8945654630661011, + "learning_rate": 5.795275910500703e-07, + "loss": 0.067, + "step": 14544 + }, + { + "epoch": 2.356610499027868, + "grad_norm": 1.0310019254684448, + "learning_rate": 5.792476482123027e-07, + "loss": 0.0654, + "step": 14545 + }, + { + "epoch": 2.3567725210628647, + "grad_norm": 0.8311236500740051, + "learning_rate": 5.789677641449087e-07, + "loss": 0.0557, + "step": 14546 + }, + { + "epoch": 2.356934543097861, + "grad_norm": 0.9365234971046448, + "learning_rate": 5.786879388564534e-07, + "loss": 0.0632, + "step": 14547 + }, + { + "epoch": 2.357096565132858, + "grad_norm": 0.7686519026756287, + "learning_rate": 5.784081723554971e-07, + "loss": 0.0577, + "step": 14548 + }, + { + "epoch": 2.3572585871678546, + "grad_norm": 0.86738520860672, + "learning_rate": 5.781284646506008e-07, + "loss": 0.0601, + "step": 14549 + }, + { + "epoch": 2.3574206092028516, + "grad_norm": 0.8879827857017517, + "learning_rate": 5.778488157503223e-07, + "loss": 0.0595, + "step": 14550 + }, + { + "epoch": 2.3575826312378485, + "grad_norm": 0.8974021077156067, + "learning_rate": 5.775692256632187e-07, + "loss": 0.0587, + "step": 14551 + }, + { + "epoch": 2.357744653272845, + "grad_norm": 0.9620395302772522, + "learning_rate": 5.772896943978446e-07, + "loss": 0.0681, + "step": 14552 + }, + { + "epoch": 2.357906675307842, + "grad_norm": 0.8157269954681396, + "learning_rate": 5.770102219627526e-07, + "loss": 0.058, + "step": 14553 + }, + { + "epoch": 2.358068697342839, + "grad_norm": 0.857354998588562, + "learning_rate": 5.767308083664949e-07, + "loss": 0.0626, + "step": 14554 + }, + { + "epoch": 2.3582307193778353, + "grad_norm": 1.1273916959762573, + "learning_rate": 5.7645145361762e-07, + "loss": 0.0657, + "step": 14555 + }, + { + "epoch": 2.3583927414128323, + "grad_norm": 0.9196612238883972, + "learning_rate": 5.761721577246754e-07, + "loss": 0.0593, + "step": 14556 + }, + { + "epoch": 2.3585547634478288, + "grad_norm": 0.9983531832695007, + "learning_rate": 5.758929206962074e-07, + "loss": 0.0676, + "step": 14557 + }, + { + "epoch": 2.3587167854828257, + "grad_norm": 1.0570601224899292, + "learning_rate": 5.756137425407598e-07, + "loss": 0.0627, + "step": 14558 + }, + { + "epoch": 2.358878807517822, + "grad_norm": 1.1186915636062622, + "learning_rate": 5.753346232668758e-07, + "loss": 0.0683, + "step": 14559 + }, + { + "epoch": 2.359040829552819, + "grad_norm": 0.9919911623001099, + "learning_rate": 5.750555628830928e-07, + "loss": 0.0628, + "step": 14560 + }, + { + "epoch": 2.359202851587816, + "grad_norm": 0.9504989385604858, + "learning_rate": 5.747765613979523e-07, + "loss": 0.0648, + "step": 14561 + }, + { + "epoch": 2.3593648736228126, + "grad_norm": 0.9681611061096191, + "learning_rate": 5.744976188199905e-07, + "loss": 0.0653, + "step": 14562 + }, + { + "epoch": 2.3595268956578095, + "grad_norm": 0.9179617166519165, + "learning_rate": 5.742187351577416e-07, + "loss": 0.064, + "step": 14563 + }, + { + "epoch": 2.3596889176928064, + "grad_norm": 0.8961821794509888, + "learning_rate": 5.739399104197388e-07, + "loss": 0.0585, + "step": 14564 + }, + { + "epoch": 2.359850939727803, + "grad_norm": 0.9559543132781982, + "learning_rate": 5.73661144614514e-07, + "loss": 0.0606, + "step": 14565 + }, + { + "epoch": 2.3600129617628, + "grad_norm": 0.9991036653518677, + "learning_rate": 5.733824377505965e-07, + "loss": 0.0674, + "step": 14566 + }, + { + "epoch": 2.3601749837977963, + "grad_norm": 0.9018924832344055, + "learning_rate": 5.731037898365138e-07, + "loss": 0.0603, + "step": 14567 + }, + { + "epoch": 2.3603370058327933, + "grad_norm": 0.8355051279067993, + "learning_rate": 5.728252008807925e-07, + "loss": 0.0548, + "step": 14568 + }, + { + "epoch": 2.36049902786779, + "grad_norm": 0.8183480501174927, + "learning_rate": 5.725466708919561e-07, + "loss": 0.0614, + "step": 14569 + }, + { + "epoch": 2.3606610499027867, + "grad_norm": 0.9316161274909973, + "learning_rate": 5.722681998785273e-07, + "loss": 0.0618, + "step": 14570 + }, + { + "epoch": 2.3608230719377836, + "grad_norm": 0.8802493810653687, + "learning_rate": 5.719897878490265e-07, + "loss": 0.0595, + "step": 14571 + }, + { + "epoch": 2.36098509397278, + "grad_norm": 0.8617424368858337, + "learning_rate": 5.717114348119726e-07, + "loss": 0.0638, + "step": 14572 + }, + { + "epoch": 2.361147116007777, + "grad_norm": 0.9052625894546509, + "learning_rate": 5.71433140775883e-07, + "loss": 0.0619, + "step": 14573 + }, + { + "epoch": 2.361309138042774, + "grad_norm": 1.0340335369110107, + "learning_rate": 5.711549057492718e-07, + "loss": 0.0687, + "step": 14574 + }, + { + "epoch": 2.3614711600777705, + "grad_norm": 0.8974990844726562, + "learning_rate": 5.70876729740652e-07, + "loss": 0.056, + "step": 14575 + }, + { + "epoch": 2.3616331821127674, + "grad_norm": 0.8803661465644836, + "learning_rate": 5.705986127585364e-07, + "loss": 0.0533, + "step": 14576 + }, + { + "epoch": 2.3617952041477643, + "grad_norm": 0.917258620262146, + "learning_rate": 5.703205548114352e-07, + "loss": 0.0671, + "step": 14577 + }, + { + "epoch": 2.361957226182761, + "grad_norm": 0.8356657028198242, + "learning_rate": 5.700425559078543e-07, + "loss": 0.0538, + "step": 14578 + }, + { + "epoch": 2.3621192482177578, + "grad_norm": 0.7526644468307495, + "learning_rate": 5.697646160563001e-07, + "loss": 0.0544, + "step": 14579 + }, + { + "epoch": 2.3622812702527543, + "grad_norm": 0.8724152445793152, + "learning_rate": 5.694867352652791e-07, + "loss": 0.0571, + "step": 14580 + }, + { + "epoch": 2.362443292287751, + "grad_norm": 0.988396167755127, + "learning_rate": 5.692089135432913e-07, + "loss": 0.0603, + "step": 14581 + }, + { + "epoch": 2.3626053143227477, + "grad_norm": 0.8566252589225769, + "learning_rate": 5.689311508988385e-07, + "loss": 0.0614, + "step": 14582 + }, + { + "epoch": 2.3627673363577446, + "grad_norm": 0.9325825572013855, + "learning_rate": 5.686534473404195e-07, + "loss": 0.062, + "step": 14583 + }, + { + "epoch": 2.3629293583927415, + "grad_norm": 0.9369713664054871, + "learning_rate": 5.68375802876531e-07, + "loss": 0.0612, + "step": 14584 + }, + { + "epoch": 2.363091380427738, + "grad_norm": 0.9119452834129333, + "learning_rate": 5.680982175156688e-07, + "loss": 0.0552, + "step": 14585 + }, + { + "epoch": 2.363253402462735, + "grad_norm": 0.9514526128768921, + "learning_rate": 5.678206912663259e-07, + "loss": 0.0559, + "step": 14586 + }, + { + "epoch": 2.363415424497732, + "grad_norm": 0.9722010493278503, + "learning_rate": 5.675432241369938e-07, + "loss": 0.0602, + "step": 14587 + }, + { + "epoch": 2.3635774465327284, + "grad_norm": 0.9492889046669006, + "learning_rate": 5.672658161361636e-07, + "loss": 0.0601, + "step": 14588 + }, + { + "epoch": 2.3637394685677253, + "grad_norm": 0.9314302206039429, + "learning_rate": 5.669884672723208e-07, + "loss": 0.0622, + "step": 14589 + }, + { + "epoch": 2.363901490602722, + "grad_norm": 0.9510951638221741, + "learning_rate": 5.667111775539538e-07, + "loss": 0.0754, + "step": 14590 + }, + { + "epoch": 2.3640635126377187, + "grad_norm": 1.04404616355896, + "learning_rate": 5.664339469895472e-07, + "loss": 0.0693, + "step": 14591 + }, + { + "epoch": 2.3642255346727157, + "grad_norm": 0.877565860748291, + "learning_rate": 5.661567755875816e-07, + "loss": 0.061, + "step": 14592 + }, + { + "epoch": 2.364387556707712, + "grad_norm": 0.8133190274238586, + "learning_rate": 5.658796633565391e-07, + "loss": 0.0578, + "step": 14593 + }, + { + "epoch": 2.364549578742709, + "grad_norm": 0.9318547248840332, + "learning_rate": 5.656026103048975e-07, + "loss": 0.0612, + "step": 14594 + }, + { + "epoch": 2.3647116007777056, + "grad_norm": 1.0102546215057373, + "learning_rate": 5.653256164411366e-07, + "loss": 0.066, + "step": 14595 + }, + { + "epoch": 2.3648736228127025, + "grad_norm": 0.8324956297874451, + "learning_rate": 5.650486817737291e-07, + "loss": 0.0536, + "step": 14596 + }, + { + "epoch": 2.3650356448476995, + "grad_norm": 0.9316480755805969, + "learning_rate": 5.647718063111496e-07, + "loss": 0.0612, + "step": 14597 + }, + { + "epoch": 2.365197666882696, + "grad_norm": 1.2550431489944458, + "learning_rate": 5.644949900618696e-07, + "loss": 0.06, + "step": 14598 + }, + { + "epoch": 2.365359688917693, + "grad_norm": 0.8270633816719055, + "learning_rate": 5.64218233034359e-07, + "loss": 0.0493, + "step": 14599 + }, + { + "epoch": 2.3655217109526894, + "grad_norm": 0.9431858062744141, + "learning_rate": 5.639415352370858e-07, + "loss": 0.066, + "step": 14600 + }, + { + "epoch": 2.3656837329876863, + "grad_norm": 0.8048388361930847, + "learning_rate": 5.636648966785168e-07, + "loss": 0.0605, + "step": 14601 + }, + { + "epoch": 2.3658457550226832, + "grad_norm": 0.8069517016410828, + "learning_rate": 5.633883173671159e-07, + "loss": 0.0572, + "step": 14602 + }, + { + "epoch": 2.3660077770576797, + "grad_norm": 0.9145830273628235, + "learning_rate": 5.631117973113462e-07, + "loss": 0.0575, + "step": 14603 + }, + { + "epoch": 2.3661697990926767, + "grad_norm": 0.9402312636375427, + "learning_rate": 5.628353365196682e-07, + "loss": 0.064, + "step": 14604 + }, + { + "epoch": 2.366331821127673, + "grad_norm": 0.8871921896934509, + "learning_rate": 5.625589350005409e-07, + "loss": 0.0612, + "step": 14605 + }, + { + "epoch": 2.36649384316267, + "grad_norm": 0.9529529213905334, + "learning_rate": 5.622825927624226e-07, + "loss": 0.0685, + "step": 14606 + }, + { + "epoch": 2.366655865197667, + "grad_norm": 0.8908757567405701, + "learning_rate": 5.620063098137668e-07, + "loss": 0.0539, + "step": 14607 + }, + { + "epoch": 2.3668178872326635, + "grad_norm": 0.897894561290741, + "learning_rate": 5.617300861630276e-07, + "loss": 0.0623, + "step": 14608 + }, + { + "epoch": 2.3669799092676604, + "grad_norm": 1.1129932403564453, + "learning_rate": 5.61453921818658e-07, + "loss": 0.0598, + "step": 14609 + }, + { + "epoch": 2.3671419313026574, + "grad_norm": 0.9506595730781555, + "learning_rate": 5.611778167891077e-07, + "loss": 0.0658, + "step": 14610 + }, + { + "epoch": 2.367303953337654, + "grad_norm": 1.14960777759552, + "learning_rate": 5.609017710828238e-07, + "loss": 0.0709, + "step": 14611 + }, + { + "epoch": 2.367465975372651, + "grad_norm": 0.8241292834281921, + "learning_rate": 5.60625784708253e-07, + "loss": 0.0601, + "step": 14612 + }, + { + "epoch": 2.3676279974076473, + "grad_norm": 0.987915575504303, + "learning_rate": 5.6034985767384e-07, + "loss": 0.0651, + "step": 14613 + }, + { + "epoch": 2.3677900194426442, + "grad_norm": 0.8549913167953491, + "learning_rate": 5.600739899880275e-07, + "loss": 0.0588, + "step": 14614 + }, + { + "epoch": 2.3679520414776407, + "grad_norm": 0.9886636137962341, + "learning_rate": 5.597981816592565e-07, + "loss": 0.0615, + "step": 14615 + }, + { + "epoch": 2.3681140635126376, + "grad_norm": 0.8965216279029846, + "learning_rate": 5.595224326959662e-07, + "loss": 0.059, + "step": 14616 + }, + { + "epoch": 2.3682760855476346, + "grad_norm": 0.8499079346656799, + "learning_rate": 5.592467431065937e-07, + "loss": 0.0602, + "step": 14617 + }, + { + "epoch": 2.368438107582631, + "grad_norm": 0.9334196448326111, + "learning_rate": 5.589711128995734e-07, + "loss": 0.0587, + "step": 14618 + }, + { + "epoch": 2.368600129617628, + "grad_norm": 0.8634200692176819, + "learning_rate": 5.586955420833404e-07, + "loss": 0.0553, + "step": 14619 + }, + { + "epoch": 2.368762151652625, + "grad_norm": 0.9879969954490662, + "learning_rate": 5.584200306663259e-07, + "loss": 0.0591, + "step": 14620 + }, + { + "epoch": 2.3689241736876214, + "grad_norm": 0.8652029037475586, + "learning_rate": 5.581445786569606e-07, + "loss": 0.0591, + "step": 14621 + }, + { + "epoch": 2.3690861957226184, + "grad_norm": 1.0487457513809204, + "learning_rate": 5.578691860636706e-07, + "loss": 0.0652, + "step": 14622 + }, + { + "epoch": 2.369248217757615, + "grad_norm": 0.9185906648635864, + "learning_rate": 5.575938528948843e-07, + "loss": 0.0608, + "step": 14623 + }, + { + "epoch": 2.369410239792612, + "grad_norm": 0.8492238521575928, + "learning_rate": 5.573185791590266e-07, + "loss": 0.0612, + "step": 14624 + }, + { + "epoch": 2.3695722618276087, + "grad_norm": 0.8446729779243469, + "learning_rate": 5.570433648645182e-07, + "loss": 0.0578, + "step": 14625 + }, + { + "epoch": 2.369734283862605, + "grad_norm": 0.8767587542533875, + "learning_rate": 5.567682100197808e-07, + "loss": 0.0626, + "step": 14626 + }, + { + "epoch": 2.369896305897602, + "grad_norm": 0.8166986703872681, + "learning_rate": 5.564931146332334e-07, + "loss": 0.0594, + "step": 14627 + }, + { + "epoch": 2.3700583279325986, + "grad_norm": 0.84421306848526, + "learning_rate": 5.562180787132945e-07, + "loss": 0.06, + "step": 14628 + }, + { + "epoch": 2.3702203499675956, + "grad_norm": 0.8665793538093567, + "learning_rate": 5.559431022683779e-07, + "loss": 0.046, + "step": 14629 + }, + { + "epoch": 2.3703823720025925, + "grad_norm": 0.7502251267433167, + "learning_rate": 5.55668185306898e-07, + "loss": 0.0541, + "step": 14630 + }, + { + "epoch": 2.370544394037589, + "grad_norm": 0.9786513447761536, + "learning_rate": 5.553933278372664e-07, + "loss": 0.0676, + "step": 14631 + }, + { + "epoch": 2.370706416072586, + "grad_norm": 0.8539317846298218, + "learning_rate": 5.551185298678929e-07, + "loss": 0.059, + "step": 14632 + }, + { + "epoch": 2.370868438107583, + "grad_norm": 0.9358149766921997, + "learning_rate": 5.548437914071861e-07, + "loss": 0.0671, + "step": 14633 + }, + { + "epoch": 2.3710304601425793, + "grad_norm": 0.7582257986068726, + "learning_rate": 5.545691124635518e-07, + "loss": 0.0528, + "step": 14634 + }, + { + "epoch": 2.3711924821775763, + "grad_norm": 0.9727949500083923, + "learning_rate": 5.542944930453958e-07, + "loss": 0.0642, + "step": 14635 + }, + { + "epoch": 2.3713545042125728, + "grad_norm": 0.9191507697105408, + "learning_rate": 5.54019933161119e-07, + "loss": 0.0564, + "step": 14636 + }, + { + "epoch": 2.3715165262475697, + "grad_norm": 0.9941038489341736, + "learning_rate": 5.537454328191225e-07, + "loss": 0.0654, + "step": 14637 + }, + { + "epoch": 2.371678548282566, + "grad_norm": 1.0261110067367554, + "learning_rate": 5.534709920278064e-07, + "loss": 0.0663, + "step": 14638 + }, + { + "epoch": 2.371840570317563, + "grad_norm": 1.0194072723388672, + "learning_rate": 5.531966107955683e-07, + "loss": 0.058, + "step": 14639 + }, + { + "epoch": 2.37200259235256, + "grad_norm": 0.9304824471473694, + "learning_rate": 5.52922289130802e-07, + "loss": 0.0633, + "step": 14640 + }, + { + "epoch": 2.3721646143875565, + "grad_norm": 0.9960220456123352, + "learning_rate": 5.526480270419018e-07, + "loss": 0.0677, + "step": 14641 + }, + { + "epoch": 2.3723266364225535, + "grad_norm": 0.8424884080886841, + "learning_rate": 5.523738245372596e-07, + "loss": 0.06, + "step": 14642 + }, + { + "epoch": 2.3724886584575504, + "grad_norm": 0.851523220539093, + "learning_rate": 5.52099681625265e-07, + "loss": 0.0618, + "step": 14643 + }, + { + "epoch": 2.372650680492547, + "grad_norm": 0.8086889386177063, + "learning_rate": 5.518255983143061e-07, + "loss": 0.0611, + "step": 14644 + }, + { + "epoch": 2.372812702527544, + "grad_norm": 0.8051342368125916, + "learning_rate": 5.515515746127697e-07, + "loss": 0.0589, + "step": 14645 + }, + { + "epoch": 2.3729747245625403, + "grad_norm": 0.9982341527938843, + "learning_rate": 5.512776105290402e-07, + "loss": 0.0722, + "step": 14646 + }, + { + "epoch": 2.3731367465975373, + "grad_norm": 0.933042049407959, + "learning_rate": 5.510037060714995e-07, + "loss": 0.0621, + "step": 14647 + }, + { + "epoch": 2.373298768632534, + "grad_norm": 0.872920572757721, + "learning_rate": 5.507298612485293e-07, + "loss": 0.0582, + "step": 14648 + }, + { + "epoch": 2.3734607906675307, + "grad_norm": 0.8864262104034424, + "learning_rate": 5.504560760685079e-07, + "loss": 0.0587, + "step": 14649 + }, + { + "epoch": 2.3736228127025276, + "grad_norm": 0.8857452869415283, + "learning_rate": 5.501823505398137e-07, + "loss": 0.0643, + "step": 14650 + }, + { + "epoch": 2.373784834737524, + "grad_norm": 0.9040178060531616, + "learning_rate": 5.499086846708196e-07, + "loss": 0.06, + "step": 14651 + }, + { + "epoch": 2.373946856772521, + "grad_norm": 0.796747624874115, + "learning_rate": 5.496350784699015e-07, + "loss": 0.0553, + "step": 14652 + }, + { + "epoch": 2.374108878807518, + "grad_norm": 1.0335019826889038, + "learning_rate": 5.493615319454299e-07, + "loss": 0.0642, + "step": 14653 + }, + { + "epoch": 2.3742709008425145, + "grad_norm": 0.760349690914154, + "learning_rate": 5.490880451057759e-07, + "loss": 0.0557, + "step": 14654 + }, + { + "epoch": 2.3744329228775114, + "grad_norm": 0.7830966711044312, + "learning_rate": 5.488146179593057e-07, + "loss": 0.0559, + "step": 14655 + }, + { + "epoch": 2.3745949449125083, + "grad_norm": 1.081561803817749, + "learning_rate": 5.485412505143858e-07, + "loss": 0.0734, + "step": 14656 + }, + { + "epoch": 2.374756966947505, + "grad_norm": 0.9587316513061523, + "learning_rate": 5.482679427793827e-07, + "loss": 0.0659, + "step": 14657 + }, + { + "epoch": 2.3749189889825018, + "grad_norm": 0.8291387557983398, + "learning_rate": 5.479946947626566e-07, + "loss": 0.0584, + "step": 14658 + }, + { + "epoch": 2.3750810110174982, + "grad_norm": 0.8340387940406799, + "learning_rate": 5.477215064725692e-07, + "loss": 0.0572, + "step": 14659 + }, + { + "epoch": 2.375243033052495, + "grad_norm": 1.00160551071167, + "learning_rate": 5.474483779174791e-07, + "loss": 0.0646, + "step": 14660 + }, + { + "epoch": 2.3754050550874917, + "grad_norm": 0.9756659269332886, + "learning_rate": 5.471753091057438e-07, + "loss": 0.0626, + "step": 14661 + }, + { + "epoch": 2.3755670771224886, + "grad_norm": 0.8894532322883606, + "learning_rate": 5.469023000457183e-07, + "loss": 0.0557, + "step": 14662 + }, + { + "epoch": 2.3757290991574855, + "grad_norm": 1.0041909217834473, + "learning_rate": 5.466293507457557e-07, + "loss": 0.0645, + "step": 14663 + }, + { + "epoch": 2.375891121192482, + "grad_norm": 0.9752393364906311, + "learning_rate": 5.463564612142083e-07, + "loss": 0.0614, + "step": 14664 + }, + { + "epoch": 2.376053143227479, + "grad_norm": 0.8114173412322998, + "learning_rate": 5.460836314594259e-07, + "loss": 0.0578, + "step": 14665 + }, + { + "epoch": 2.376215165262476, + "grad_norm": 0.8527717590332031, + "learning_rate": 5.458108614897545e-07, + "loss": 0.0601, + "step": 14666 + }, + { + "epoch": 2.3763771872974724, + "grad_norm": 1.0959402322769165, + "learning_rate": 5.455381513135427e-07, + "loss": 0.0673, + "step": 14667 + }, + { + "epoch": 2.3765392093324693, + "grad_norm": 1.0152456760406494, + "learning_rate": 5.452655009391341e-07, + "loss": 0.0676, + "step": 14668 + }, + { + "epoch": 2.376701231367466, + "grad_norm": 0.9547176957130432, + "learning_rate": 5.449929103748705e-07, + "loss": 0.0598, + "step": 14669 + }, + { + "epoch": 2.3768632534024627, + "grad_norm": 0.8178269863128662, + "learning_rate": 5.447203796290918e-07, + "loss": 0.0586, + "step": 14670 + }, + { + "epoch": 2.3770252754374597, + "grad_norm": 1.0196658372879028, + "learning_rate": 5.444479087101387e-07, + "loss": 0.0674, + "step": 14671 + }, + { + "epoch": 2.377187297472456, + "grad_norm": 0.8403282761573792, + "learning_rate": 5.441754976263478e-07, + "loss": 0.0641, + "step": 14672 + }, + { + "epoch": 2.377349319507453, + "grad_norm": 1.0634124279022217, + "learning_rate": 5.43903146386053e-07, + "loss": 0.0671, + "step": 14673 + }, + { + "epoch": 2.3775113415424496, + "grad_norm": 0.9557974338531494, + "learning_rate": 5.436308549975883e-07, + "loss": 0.0666, + "step": 14674 + }, + { + "epoch": 2.3776733635774465, + "grad_norm": 0.9498515725135803, + "learning_rate": 5.433586234692853e-07, + "loss": 0.0594, + "step": 14675 + }, + { + "epoch": 2.3778353856124435, + "grad_norm": 0.8803273439407349, + "learning_rate": 5.430864518094731e-07, + "loss": 0.0602, + "step": 14676 + }, + { + "epoch": 2.37799740764744, + "grad_norm": 0.8781242966651917, + "learning_rate": 5.428143400264799e-07, + "loss": 0.0583, + "step": 14677 + }, + { + "epoch": 2.378159429682437, + "grad_norm": 0.9101912975311279, + "learning_rate": 5.425422881286319e-07, + "loss": 0.0566, + "step": 14678 + }, + { + "epoch": 2.378321451717434, + "grad_norm": 0.9364922046661377, + "learning_rate": 5.422702961242532e-07, + "loss": 0.0705, + "step": 14679 + }, + { + "epoch": 2.3784834737524303, + "grad_norm": 0.9377305507659912, + "learning_rate": 5.419983640216647e-07, + "loss": 0.0618, + "step": 14680 + }, + { + "epoch": 2.3786454957874272, + "grad_norm": 0.8949405550956726, + "learning_rate": 5.417264918291887e-07, + "loss": 0.059, + "step": 14681 + }, + { + "epoch": 2.3788075178224237, + "grad_norm": 1.1497116088867188, + "learning_rate": 5.414546795551429e-07, + "loss": 0.0691, + "step": 14682 + }, + { + "epoch": 2.3789695398574207, + "grad_norm": 0.9060092568397522, + "learning_rate": 5.411829272078453e-07, + "loss": 0.0627, + "step": 14683 + }, + { + "epoch": 2.379131561892417, + "grad_norm": 1.0954656600952148, + "learning_rate": 5.409112347956089e-07, + "loss": 0.0628, + "step": 14684 + }, + { + "epoch": 2.379293583927414, + "grad_norm": 0.9496614336967468, + "learning_rate": 5.406396023267473e-07, + "loss": 0.0644, + "step": 14685 + }, + { + "epoch": 2.379455605962411, + "grad_norm": 0.8147675395011902, + "learning_rate": 5.403680298095737e-07, + "loss": 0.0566, + "step": 14686 + }, + { + "epoch": 2.3796176279974075, + "grad_norm": 0.8436890244483948, + "learning_rate": 5.400965172523953e-07, + "loss": 0.0596, + "step": 14687 + }, + { + "epoch": 2.3797796500324044, + "grad_norm": 1.066830039024353, + "learning_rate": 5.398250646635209e-07, + "loss": 0.059, + "step": 14688 + }, + { + "epoch": 2.3799416720674014, + "grad_norm": 1.006394386291504, + "learning_rate": 5.395536720512551e-07, + "loss": 0.0612, + "step": 14689 + }, + { + "epoch": 2.380103694102398, + "grad_norm": 0.9849836230278015, + "learning_rate": 5.392823394239042e-07, + "loss": 0.0676, + "step": 14690 + }, + { + "epoch": 2.380265716137395, + "grad_norm": 1.1965252161026, + "learning_rate": 5.39011066789768e-07, + "loss": 0.0702, + "step": 14691 + }, + { + "epoch": 2.3804277381723913, + "grad_norm": 0.9115785360336304, + "learning_rate": 5.387398541571479e-07, + "loss": 0.0634, + "step": 14692 + }, + { + "epoch": 2.380589760207388, + "grad_norm": 1.0645346641540527, + "learning_rate": 5.384687015343418e-07, + "loss": 0.0629, + "step": 14693 + }, + { + "epoch": 2.380751782242385, + "grad_norm": 0.9696865677833557, + "learning_rate": 5.381976089296467e-07, + "loss": 0.0637, + "step": 14694 + }, + { + "epoch": 2.3809138042773816, + "grad_norm": 0.9118956327438354, + "learning_rate": 5.379265763513574e-07, + "loss": 0.0642, + "step": 14695 + }, + { + "epoch": 2.3810758263123786, + "grad_norm": 0.9417039155960083, + "learning_rate": 5.376556038077668e-07, + "loss": 0.0617, + "step": 14696 + }, + { + "epoch": 2.381237848347375, + "grad_norm": 0.9443559050559998, + "learning_rate": 5.373846913071659e-07, + "loss": 0.0653, + "step": 14697 + }, + { + "epoch": 2.381399870382372, + "grad_norm": 0.8240593671798706, + "learning_rate": 5.371138388578448e-07, + "loss": 0.0534, + "step": 14698 + }, + { + "epoch": 2.381561892417369, + "grad_norm": 1.074074625968933, + "learning_rate": 5.368430464680885e-07, + "loss": 0.0641, + "step": 14699 + }, + { + "epoch": 2.3817239144523654, + "grad_norm": 1.0060458183288574, + "learning_rate": 5.365723141461851e-07, + "loss": 0.0625, + "step": 14700 + }, + { + "epoch": 2.3818859364873624, + "grad_norm": 0.9213908314704895, + "learning_rate": 5.36301641900418e-07, + "loss": 0.0614, + "step": 14701 + }, + { + "epoch": 2.3820479585223593, + "grad_norm": 1.0687966346740723, + "learning_rate": 5.360310297390681e-07, + "loss": 0.0705, + "step": 14702 + }, + { + "epoch": 2.3822099805573558, + "grad_norm": 0.8176394104957581, + "learning_rate": 5.357604776704159e-07, + "loss": 0.0566, + "step": 14703 + }, + { + "epoch": 2.3823720025923527, + "grad_norm": 0.7344977259635925, + "learning_rate": 5.354899857027398e-07, + "loss": 0.052, + "step": 14704 + }, + { + "epoch": 2.382534024627349, + "grad_norm": 0.8186675310134888, + "learning_rate": 5.352195538443162e-07, + "loss": 0.0598, + "step": 14705 + }, + { + "epoch": 2.382696046662346, + "grad_norm": 0.8668035864830017, + "learning_rate": 5.349491821034192e-07, + "loss": 0.0565, + "step": 14706 + }, + { + "epoch": 2.3828580686973426, + "grad_norm": 0.8794363737106323, + "learning_rate": 5.346788704883222e-07, + "loss": 0.0653, + "step": 14707 + }, + { + "epoch": 2.3830200907323396, + "grad_norm": 0.8817334771156311, + "learning_rate": 5.344086190072955e-07, + "loss": 0.0602, + "step": 14708 + }, + { + "epoch": 2.3831821127673365, + "grad_norm": 1.023415446281433, + "learning_rate": 5.341384276686087e-07, + "loss": 0.0543, + "step": 14709 + }, + { + "epoch": 2.383344134802333, + "grad_norm": 0.9122980237007141, + "learning_rate": 5.338682964805286e-07, + "loss": 0.0592, + "step": 14710 + }, + { + "epoch": 2.38350615683733, + "grad_norm": 1.0508270263671875, + "learning_rate": 5.335982254513208e-07, + "loss": 0.0671, + "step": 14711 + }, + { + "epoch": 2.383668178872327, + "grad_norm": 0.8803588151931763, + "learning_rate": 5.333282145892493e-07, + "loss": 0.063, + "step": 14712 + }, + { + "epoch": 2.3838302009073233, + "grad_norm": 0.9654552340507507, + "learning_rate": 5.330582639025739e-07, + "loss": 0.0598, + "step": 14713 + }, + { + "epoch": 2.3839922229423203, + "grad_norm": 0.9189082384109497, + "learning_rate": 5.327883733995562e-07, + "loss": 0.0534, + "step": 14714 + }, + { + "epoch": 2.3841542449773168, + "grad_norm": 0.9922522306442261, + "learning_rate": 5.325185430884539e-07, + "loss": 0.0617, + "step": 14715 + }, + { + "epoch": 2.3843162670123137, + "grad_norm": 0.873923122882843, + "learning_rate": 5.322487729775233e-07, + "loss": 0.0668, + "step": 14716 + }, + { + "epoch": 2.3844782890473106, + "grad_norm": 1.0077275037765503, + "learning_rate": 5.319790630750182e-07, + "loss": 0.0597, + "step": 14717 + }, + { + "epoch": 2.384640311082307, + "grad_norm": 0.9447020888328552, + "learning_rate": 5.317094133891903e-07, + "loss": 0.0616, + "step": 14718 + }, + { + "epoch": 2.384802333117304, + "grad_norm": 1.0282572507858276, + "learning_rate": 5.314398239282926e-07, + "loss": 0.072, + "step": 14719 + }, + { + "epoch": 2.3849643551523005, + "grad_norm": 0.8626378178596497, + "learning_rate": 5.311702947005718e-07, + "loss": 0.0574, + "step": 14720 + }, + { + "epoch": 2.3851263771872975, + "grad_norm": 0.8774121403694153, + "learning_rate": 5.309008257142754e-07, + "loss": 0.0563, + "step": 14721 + }, + { + "epoch": 2.3852883992222944, + "grad_norm": 0.9489586353302002, + "learning_rate": 5.306314169776486e-07, + "loss": 0.0613, + "step": 14722 + }, + { + "epoch": 2.385450421257291, + "grad_norm": 1.0716079473495483, + "learning_rate": 5.303620684989347e-07, + "loss": 0.0602, + "step": 14723 + }, + { + "epoch": 2.385612443292288, + "grad_norm": 0.8634665012359619, + "learning_rate": 5.30092780286375e-07, + "loss": 0.0546, + "step": 14724 + }, + { + "epoch": 2.3857744653272843, + "grad_norm": 1.048065423965454, + "learning_rate": 5.298235523482093e-07, + "loss": 0.0685, + "step": 14725 + }, + { + "epoch": 2.3859364873622813, + "grad_norm": 0.8572198152542114, + "learning_rate": 5.295543846926752e-07, + "loss": 0.0559, + "step": 14726 + }, + { + "epoch": 2.386098509397278, + "grad_norm": 0.8316371440887451, + "learning_rate": 5.292852773280091e-07, + "loss": 0.0603, + "step": 14727 + }, + { + "epoch": 2.3862605314322747, + "grad_norm": 1.1684722900390625, + "learning_rate": 5.290162302624433e-07, + "loss": 0.0619, + "step": 14728 + }, + { + "epoch": 2.3864225534672716, + "grad_norm": 0.960731029510498, + "learning_rate": 5.287472435042116e-07, + "loss": 0.0612, + "step": 14729 + }, + { + "epoch": 2.386584575502268, + "grad_norm": 1.0198336839675903, + "learning_rate": 5.284783170615446e-07, + "loss": 0.0638, + "step": 14730 + }, + { + "epoch": 2.386746597537265, + "grad_norm": 0.9215663075447083, + "learning_rate": 5.282094509426694e-07, + "loss": 0.0627, + "step": 14731 + }, + { + "epoch": 2.386908619572262, + "grad_norm": 0.9803484678268433, + "learning_rate": 5.279406451558136e-07, + "loss": 0.0699, + "step": 14732 + }, + { + "epoch": 2.3870706416072585, + "grad_norm": 0.8979873061180115, + "learning_rate": 5.27671899709201e-07, + "loss": 0.053, + "step": 14733 + }, + { + "epoch": 2.3872326636422554, + "grad_norm": 1.023978352546692, + "learning_rate": 5.274032146110567e-07, + "loss": 0.0607, + "step": 14734 + }, + { + "epoch": 2.3873946856772523, + "grad_norm": 0.7679246664047241, + "learning_rate": 5.271345898695995e-07, + "loss": 0.053, + "step": 14735 + }, + { + "epoch": 2.387556707712249, + "grad_norm": 1.0310170650482178, + "learning_rate": 5.268660254930499e-07, + "loss": 0.062, + "step": 14736 + }, + { + "epoch": 2.3877187297472457, + "grad_norm": 0.8446293473243713, + "learning_rate": 5.265975214896249e-07, + "loss": 0.0596, + "step": 14737 + }, + { + "epoch": 2.3878807517822422, + "grad_norm": 1.072784662246704, + "learning_rate": 5.263290778675401e-07, + "loss": 0.0659, + "step": 14738 + }, + { + "epoch": 2.388042773817239, + "grad_norm": 0.93727707862854, + "learning_rate": 5.260606946350094e-07, + "loss": 0.0634, + "step": 14739 + }, + { + "epoch": 2.3882047958522357, + "grad_norm": 1.1457421779632568, + "learning_rate": 5.257923718002447e-07, + "loss": 0.0604, + "step": 14740 + }, + { + "epoch": 2.3883668178872326, + "grad_norm": 1.0058701038360596, + "learning_rate": 5.255241093714561e-07, + "loss": 0.0608, + "step": 14741 + }, + { + "epoch": 2.3885288399222295, + "grad_norm": 0.922507107257843, + "learning_rate": 5.252559073568514e-07, + "loss": 0.0557, + "step": 14742 + }, + { + "epoch": 2.388690861957226, + "grad_norm": 0.9533309936523438, + "learning_rate": 5.249877657646371e-07, + "loss": 0.0567, + "step": 14743 + }, + { + "epoch": 2.388852883992223, + "grad_norm": 0.8887631297111511, + "learning_rate": 5.247196846030178e-07, + "loss": 0.061, + "step": 14744 + }, + { + "epoch": 2.38901490602722, + "grad_norm": 1.0275040864944458, + "learning_rate": 5.244516638801966e-07, + "loss": 0.0604, + "step": 14745 + }, + { + "epoch": 2.3891769280622164, + "grad_norm": 0.7640145421028137, + "learning_rate": 5.241837036043731e-07, + "loss": 0.0586, + "step": 14746 + }, + { + "epoch": 2.3893389500972133, + "grad_norm": 1.0371057987213135, + "learning_rate": 5.239158037837464e-07, + "loss": 0.0581, + "step": 14747 + }, + { + "epoch": 2.38950097213221, + "grad_norm": 0.8980159163475037, + "learning_rate": 5.236479644265153e-07, + "loss": 0.0616, + "step": 14748 + }, + { + "epoch": 2.3896629941672067, + "grad_norm": 0.923527181148529, + "learning_rate": 5.233801855408733e-07, + "loss": 0.0555, + "step": 14749 + }, + { + "epoch": 2.3898250162022037, + "grad_norm": 0.9246334433555603, + "learning_rate": 5.231124671350141e-07, + "loss": 0.0641, + "step": 14750 + }, + { + "epoch": 2.3899870382372, + "grad_norm": 0.7615161538124084, + "learning_rate": 5.228448092171295e-07, + "loss": 0.0538, + "step": 14751 + }, + { + "epoch": 2.390149060272197, + "grad_norm": 0.9077004194259644, + "learning_rate": 5.225772117954089e-07, + "loss": 0.0647, + "step": 14752 + }, + { + "epoch": 2.3903110823071936, + "grad_norm": 0.9212455749511719, + "learning_rate": 5.223096748780407e-07, + "loss": 0.0559, + "step": 14753 + }, + { + "epoch": 2.3904731043421905, + "grad_norm": 0.8712791800498962, + "learning_rate": 5.220421984732104e-07, + "loss": 0.0621, + "step": 14754 + }, + { + "epoch": 2.3906351263771874, + "grad_norm": 0.9051830172538757, + "learning_rate": 5.217747825891023e-07, + "loss": 0.0596, + "step": 14755 + }, + { + "epoch": 2.390797148412184, + "grad_norm": 0.8532323241233826, + "learning_rate": 5.215074272338986e-07, + "loss": 0.0616, + "step": 14756 + }, + { + "epoch": 2.390959170447181, + "grad_norm": 0.9374991059303284, + "learning_rate": 5.212401324157795e-07, + "loss": 0.0571, + "step": 14757 + }, + { + "epoch": 2.391121192482178, + "grad_norm": 0.9487468600273132, + "learning_rate": 5.20972898142924e-07, + "loss": 0.0622, + "step": 14758 + }, + { + "epoch": 2.3912832145171743, + "grad_norm": 0.8730701208114624, + "learning_rate": 5.207057244235089e-07, + "loss": 0.0594, + "step": 14759 + }, + { + "epoch": 2.3914452365521712, + "grad_norm": 0.9014290571212769, + "learning_rate": 5.204386112657095e-07, + "loss": 0.0632, + "step": 14760 + }, + { + "epoch": 2.3916072585871677, + "grad_norm": 0.9186999797821045, + "learning_rate": 5.201715586776965e-07, + "loss": 0.0588, + "step": 14761 + }, + { + "epoch": 2.3917692806221647, + "grad_norm": 0.8001679182052612, + "learning_rate": 5.199045666676436e-07, + "loss": 0.0606, + "step": 14762 + }, + { + "epoch": 2.391931302657161, + "grad_norm": 0.8610774874687195, + "learning_rate": 5.196376352437199e-07, + "loss": 0.0522, + "step": 14763 + }, + { + "epoch": 2.392093324692158, + "grad_norm": 0.8187136650085449, + "learning_rate": 5.193707644140913e-07, + "loss": 0.0517, + "step": 14764 + }, + { + "epoch": 2.392255346727155, + "grad_norm": 1.040032148361206, + "learning_rate": 5.191039541869245e-07, + "loss": 0.0672, + "step": 14765 + }, + { + "epoch": 2.3924173687621515, + "grad_norm": 0.8961792588233948, + "learning_rate": 5.188372045703824e-07, + "loss": 0.0614, + "step": 14766 + }, + { + "epoch": 2.3925793907971484, + "grad_norm": 0.7686487436294556, + "learning_rate": 5.185705155726287e-07, + "loss": 0.0552, + "step": 14767 + }, + { + "epoch": 2.3927414128321454, + "grad_norm": 0.987948477268219, + "learning_rate": 5.183038872018215e-07, + "loss": 0.0584, + "step": 14768 + }, + { + "epoch": 2.392903434867142, + "grad_norm": 0.9744378328323364, + "learning_rate": 5.180373194661198e-07, + "loss": 0.0644, + "step": 14769 + }, + { + "epoch": 2.393065456902139, + "grad_norm": 0.9378809928894043, + "learning_rate": 5.1777081237368e-07, + "loss": 0.0642, + "step": 14770 + }, + { + "epoch": 2.3932274789371353, + "grad_norm": 1.1186909675598145, + "learning_rate": 5.175043659326564e-07, + "loss": 0.0673, + "step": 14771 + }, + { + "epoch": 2.393389500972132, + "grad_norm": 0.9698737263679504, + "learning_rate": 5.172379801512014e-07, + "loss": 0.0614, + "step": 14772 + }, + { + "epoch": 2.393551523007129, + "grad_norm": 0.985581636428833, + "learning_rate": 5.169716550374665e-07, + "loss": 0.0609, + "step": 14773 + }, + { + "epoch": 2.3937135450421256, + "grad_norm": 0.8977107405662537, + "learning_rate": 5.167053905996003e-07, + "loss": 0.0643, + "step": 14774 + }, + { + "epoch": 2.3938755670771226, + "grad_norm": 0.8801769018173218, + "learning_rate": 5.164391868457494e-07, + "loss": 0.0647, + "step": 14775 + }, + { + "epoch": 2.394037589112119, + "grad_norm": 1.0398766994476318, + "learning_rate": 5.161730437840585e-07, + "loss": 0.0658, + "step": 14776 + }, + { + "epoch": 2.394199611147116, + "grad_norm": 0.9638035297393799, + "learning_rate": 5.159069614226723e-07, + "loss": 0.0655, + "step": 14777 + }, + { + "epoch": 2.394361633182113, + "grad_norm": 1.0413271188735962, + "learning_rate": 5.15640939769732e-07, + "loss": 0.0587, + "step": 14778 + }, + { + "epoch": 2.3945236552171094, + "grad_norm": 0.9833051562309265, + "learning_rate": 5.153749788333767e-07, + "loss": 0.0633, + "step": 14779 + }, + { + "epoch": 2.3946856772521063, + "grad_norm": 0.9395384192466736, + "learning_rate": 5.151090786217433e-07, + "loss": 0.0647, + "step": 14780 + }, + { + "epoch": 2.3948476992871033, + "grad_norm": 0.9286221265792847, + "learning_rate": 5.148432391429703e-07, + "loss": 0.0617, + "step": 14781 + }, + { + "epoch": 2.3950097213220998, + "grad_norm": 0.8092193603515625, + "learning_rate": 5.145774604051895e-07, + "loss": 0.0587, + "step": 14782 + }, + { + "epoch": 2.3951717433570967, + "grad_norm": 0.9903756380081177, + "learning_rate": 5.143117424165339e-07, + "loss": 0.0611, + "step": 14783 + }, + { + "epoch": 2.395333765392093, + "grad_norm": 0.8078405261039734, + "learning_rate": 5.140460851851336e-07, + "loss": 0.0596, + "step": 14784 + }, + { + "epoch": 2.39549578742709, + "grad_norm": 1.076858639717102, + "learning_rate": 5.137804887191172e-07, + "loss": 0.0657, + "step": 14785 + }, + { + "epoch": 2.3956578094620866, + "grad_norm": 1.0425070524215698, + "learning_rate": 5.135149530266112e-07, + "loss": 0.0613, + "step": 14786 + }, + { + "epoch": 2.3958198314970836, + "grad_norm": 0.9560631513595581, + "learning_rate": 5.132494781157407e-07, + "loss": 0.064, + "step": 14787 + }, + { + "epoch": 2.3959818535320805, + "grad_norm": 0.765007495880127, + "learning_rate": 5.129840639946279e-07, + "loss": 0.0555, + "step": 14788 + }, + { + "epoch": 2.396143875567077, + "grad_norm": 0.9489015936851501, + "learning_rate": 5.127187106713951e-07, + "loss": 0.0601, + "step": 14789 + }, + { + "epoch": 2.396305897602074, + "grad_norm": 0.8428517580032349, + "learning_rate": 5.124534181541596e-07, + "loss": 0.0542, + "step": 14790 + }, + { + "epoch": 2.396467919637071, + "grad_norm": 0.8759004473686218, + "learning_rate": 5.1218818645104e-07, + "loss": 0.0604, + "step": 14791 + }, + { + "epoch": 2.3966299416720673, + "grad_norm": 0.9745606184005737, + "learning_rate": 5.119230155701515e-07, + "loss": 0.0633, + "step": 14792 + }, + { + "epoch": 2.3967919637070643, + "grad_norm": 0.8945474624633789, + "learning_rate": 5.116579055196085e-07, + "loss": 0.0614, + "step": 14793 + }, + { + "epoch": 2.3969539857420608, + "grad_norm": 0.9640982747077942, + "learning_rate": 5.113928563075213e-07, + "loss": 0.0664, + "step": 14794 + }, + { + "epoch": 2.3971160077770577, + "grad_norm": 0.8617674112319946, + "learning_rate": 5.111278679419996e-07, + "loss": 0.0607, + "step": 14795 + }, + { + "epoch": 2.3972780298120546, + "grad_norm": 1.115388035774231, + "learning_rate": 5.108629404311535e-07, + "loss": 0.0675, + "step": 14796 + }, + { + "epoch": 2.397440051847051, + "grad_norm": 0.9158341288566589, + "learning_rate": 5.105980737830871e-07, + "loss": 0.0631, + "step": 14797 + }, + { + "epoch": 2.397602073882048, + "grad_norm": 0.93878173828125, + "learning_rate": 5.103332680059053e-07, + "loss": 0.0581, + "step": 14798 + }, + { + "epoch": 2.3977640959170445, + "grad_norm": 0.8788214921951294, + "learning_rate": 5.100685231077107e-07, + "loss": 0.061, + "step": 14799 + }, + { + "epoch": 2.3979261179520415, + "grad_norm": 0.8120740056037903, + "learning_rate": 5.098038390966039e-07, + "loss": 0.0494, + "step": 14800 + }, + { + "epoch": 2.3980881399870384, + "grad_norm": 0.9346016645431519, + "learning_rate": 5.095392159806833e-07, + "loss": 0.0663, + "step": 14801 + }, + { + "epoch": 2.398250162022035, + "grad_norm": 1.0751339197158813, + "learning_rate": 5.09274653768046e-07, + "loss": 0.0646, + "step": 14802 + }, + { + "epoch": 2.398412184057032, + "grad_norm": 0.9000687003135681, + "learning_rate": 5.090101524667865e-07, + "loss": 0.0632, + "step": 14803 + }, + { + "epoch": 2.3985742060920288, + "grad_norm": 0.83973628282547, + "learning_rate": 5.087457120849984e-07, + "loss": 0.0593, + "step": 14804 + }, + { + "epoch": 2.3987362281270252, + "grad_norm": 0.8891993165016174, + "learning_rate": 5.084813326307728e-07, + "loss": 0.0607, + "step": 14805 + }, + { + "epoch": 2.398898250162022, + "grad_norm": 0.8002360463142395, + "learning_rate": 5.082170141121992e-07, + "loss": 0.0532, + "step": 14806 + }, + { + "epoch": 2.3990602721970187, + "grad_norm": 0.980747401714325, + "learning_rate": 5.079527565373654e-07, + "loss": 0.0631, + "step": 14807 + }, + { + "epoch": 2.3992222942320156, + "grad_norm": 0.9085080027580261, + "learning_rate": 5.076885599143558e-07, + "loss": 0.0628, + "step": 14808 + }, + { + "epoch": 2.399384316267012, + "grad_norm": 0.9033792614936829, + "learning_rate": 5.074244242512546e-07, + "loss": 0.0585, + "step": 14809 + }, + { + "epoch": 2.399546338302009, + "grad_norm": 0.9455671310424805, + "learning_rate": 5.071603495561444e-07, + "loss": 0.0657, + "step": 14810 + }, + { + "epoch": 2.399708360337006, + "grad_norm": 1.0358432531356812, + "learning_rate": 5.068963358371059e-07, + "loss": 0.064, + "step": 14811 + }, + { + "epoch": 2.3998703823720025, + "grad_norm": 0.968040943145752, + "learning_rate": 5.066323831022155e-07, + "loss": 0.0686, + "step": 14812 + }, + { + "epoch": 2.4000324044069994, + "grad_norm": 0.8224153518676758, + "learning_rate": 5.063684913595504e-07, + "loss": 0.057, + "step": 14813 + }, + { + "epoch": 2.4001944264419963, + "grad_norm": 0.9839720726013184, + "learning_rate": 5.061046606171849e-07, + "loss": 0.0606, + "step": 14814 + }, + { + "epoch": 2.400356448476993, + "grad_norm": 0.8918019533157349, + "learning_rate": 5.058408908831919e-07, + "loss": 0.0628, + "step": 14815 + }, + { + "epoch": 2.4005184705119897, + "grad_norm": 0.9705418348312378, + "learning_rate": 5.055771821656416e-07, + "loss": 0.0527, + "step": 14816 + }, + { + "epoch": 2.4006804925469862, + "grad_norm": 1.0187605619430542, + "learning_rate": 5.053135344726032e-07, + "loss": 0.0579, + "step": 14817 + }, + { + "epoch": 2.400842514581983, + "grad_norm": 0.8986850380897522, + "learning_rate": 5.05049947812144e-07, + "loss": 0.0617, + "step": 14818 + }, + { + "epoch": 2.40100453661698, + "grad_norm": 0.9489282965660095, + "learning_rate": 5.047864221923276e-07, + "loss": 0.0595, + "step": 14819 + }, + { + "epoch": 2.4011665586519766, + "grad_norm": 0.8312699794769287, + "learning_rate": 5.045229576212191e-07, + "loss": 0.0587, + "step": 14820 + }, + { + "epoch": 2.4013285806869735, + "grad_norm": 0.8826655149459839, + "learning_rate": 5.04259554106879e-07, + "loss": 0.062, + "step": 14821 + }, + { + "epoch": 2.40149060272197, + "grad_norm": 1.0575923919677734, + "learning_rate": 5.039962116573676e-07, + "loss": 0.0738, + "step": 14822 + }, + { + "epoch": 2.401652624756967, + "grad_norm": 0.8363692164421082, + "learning_rate": 5.037329302807409e-07, + "loss": 0.0583, + "step": 14823 + }, + { + "epoch": 2.401814646791964, + "grad_norm": 0.9132293462753296, + "learning_rate": 5.034697099850557e-07, + "loss": 0.066, + "step": 14824 + }, + { + "epoch": 2.4019766688269604, + "grad_norm": 0.8294099569320679, + "learning_rate": 5.032065507783671e-07, + "loss": 0.0549, + "step": 14825 + }, + { + "epoch": 2.4021386908619573, + "grad_norm": 0.8338202834129333, + "learning_rate": 5.029434526687249e-07, + "loss": 0.0594, + "step": 14826 + }, + { + "epoch": 2.402300712896954, + "grad_norm": 0.9756371378898621, + "learning_rate": 5.026804156641804e-07, + "loss": 0.0635, + "step": 14827 + }, + { + "epoch": 2.4024627349319507, + "grad_norm": 0.9361462593078613, + "learning_rate": 5.02417439772781e-07, + "loss": 0.0661, + "step": 14828 + }, + { + "epoch": 2.4026247569669477, + "grad_norm": 0.8787729144096375, + "learning_rate": 5.021545250025755e-07, + "loss": 0.0638, + "step": 14829 + }, + { + "epoch": 2.402786779001944, + "grad_norm": 0.8789749145507812, + "learning_rate": 5.01891671361606e-07, + "loss": 0.052, + "step": 14830 + }, + { + "epoch": 2.402948801036941, + "grad_norm": 0.8396589756011963, + "learning_rate": 5.01628878857916e-07, + "loss": 0.0594, + "step": 14831 + }, + { + "epoch": 2.4031108230719376, + "grad_norm": 0.9573028683662415, + "learning_rate": 5.013661474995463e-07, + "loss": 0.0675, + "step": 14832 + }, + { + "epoch": 2.4032728451069345, + "grad_norm": 0.9700629115104675, + "learning_rate": 5.011034772945359e-07, + "loss": 0.0628, + "step": 14833 + }, + { + "epoch": 2.4034348671419314, + "grad_norm": 1.1622406244277954, + "learning_rate": 5.008408682509219e-07, + "loss": 0.0643, + "step": 14834 + }, + { + "epoch": 2.403596889176928, + "grad_norm": 0.8823803067207336, + "learning_rate": 5.005783203767394e-07, + "loss": 0.0606, + "step": 14835 + }, + { + "epoch": 2.403758911211925, + "grad_norm": 0.8871647715568542, + "learning_rate": 5.003158336800218e-07, + "loss": 0.0567, + "step": 14836 + }, + { + "epoch": 2.403920933246922, + "grad_norm": 0.9583814740180969, + "learning_rate": 5.000534081688013e-07, + "loss": 0.0589, + "step": 14837 + }, + { + "epoch": 2.4040829552819183, + "grad_norm": 1.0450133085250854, + "learning_rate": 4.997910438511052e-07, + "loss": 0.0643, + "step": 14838 + }, + { + "epoch": 2.404244977316915, + "grad_norm": 1.106645107269287, + "learning_rate": 4.995287407349636e-07, + "loss": 0.069, + "step": 14839 + }, + { + "epoch": 2.4044069993519117, + "grad_norm": 0.9293127059936523, + "learning_rate": 4.992664988284021e-07, + "loss": 0.0619, + "step": 14840 + }, + { + "epoch": 2.4045690213869086, + "grad_norm": 0.8363195061683655, + "learning_rate": 4.990043181394433e-07, + "loss": 0.0611, + "step": 14841 + }, + { + "epoch": 2.404731043421905, + "grad_norm": 0.9512073993682861, + "learning_rate": 4.987421986761101e-07, + "loss": 0.0591, + "step": 14842 + }, + { + "epoch": 2.404893065456902, + "grad_norm": 0.8879616260528564, + "learning_rate": 4.984801404464229e-07, + "loss": 0.0593, + "step": 14843 + }, + { + "epoch": 2.405055087491899, + "grad_norm": 0.8494080901145935, + "learning_rate": 4.982181434583996e-07, + "loss": 0.0583, + "step": 14844 + }, + { + "epoch": 2.4052171095268955, + "grad_norm": 0.8918368816375732, + "learning_rate": 4.97956207720057e-07, + "loss": 0.0637, + "step": 14845 + }, + { + "epoch": 2.4053791315618924, + "grad_norm": 0.982917070388794, + "learning_rate": 4.976943332394093e-07, + "loss": 0.0662, + "step": 14846 + }, + { + "epoch": 2.4055411535968894, + "grad_norm": 0.9451669454574585, + "learning_rate": 4.974325200244698e-07, + "loss": 0.0609, + "step": 14847 + }, + { + "epoch": 2.405703175631886, + "grad_norm": 0.956188976764679, + "learning_rate": 4.971707680832491e-07, + "loss": 0.0558, + "step": 14848 + }, + { + "epoch": 2.405865197666883, + "grad_norm": 0.9561905860900879, + "learning_rate": 4.969090774237559e-07, + "loss": 0.0601, + "step": 14849 + }, + { + "epoch": 2.4060272197018793, + "grad_norm": 1.0489095449447632, + "learning_rate": 4.966474480539976e-07, + "loss": 0.064, + "step": 14850 + }, + { + "epoch": 2.406189241736876, + "grad_norm": 0.8939489126205444, + "learning_rate": 4.963858799819802e-07, + "loss": 0.0586, + "step": 14851 + }, + { + "epoch": 2.406351263771873, + "grad_norm": 0.8735487461090088, + "learning_rate": 4.961243732157048e-07, + "loss": 0.065, + "step": 14852 + }, + { + "epoch": 2.4065132858068696, + "grad_norm": 0.900790810585022, + "learning_rate": 4.958629277631749e-07, + "loss": 0.0603, + "step": 14853 + }, + { + "epoch": 2.4066753078418666, + "grad_norm": 0.991608738899231, + "learning_rate": 4.956015436323897e-07, + "loss": 0.0592, + "step": 14854 + }, + { + "epoch": 2.406837329876863, + "grad_norm": 0.8543813228607178, + "learning_rate": 4.953402208313471e-07, + "loss": 0.0651, + "step": 14855 + }, + { + "epoch": 2.40699935191186, + "grad_norm": 0.8871439695358276, + "learning_rate": 4.950789593680422e-07, + "loss": 0.0639, + "step": 14856 + }, + { + "epoch": 2.407161373946857, + "grad_norm": 0.9490445852279663, + "learning_rate": 4.948177592504682e-07, + "loss": 0.0669, + "step": 14857 + }, + { + "epoch": 2.4073233959818534, + "grad_norm": 0.8415353894233704, + "learning_rate": 4.945566204866201e-07, + "loss": 0.0585, + "step": 14858 + }, + { + "epoch": 2.4074854180168503, + "grad_norm": 1.0602666139602661, + "learning_rate": 4.942955430844856e-07, + "loss": 0.0623, + "step": 14859 + }, + { + "epoch": 2.4076474400518473, + "grad_norm": 0.8117191791534424, + "learning_rate": 4.940345270520536e-07, + "loss": 0.0591, + "step": 14860 + }, + { + "epoch": 2.4078094620868438, + "grad_norm": 0.9509507417678833, + "learning_rate": 4.937735723973109e-07, + "loss": 0.065, + "step": 14861 + }, + { + "epoch": 2.4079714841218407, + "grad_norm": 0.9605535268783569, + "learning_rate": 4.935126791282419e-07, + "loss": 0.0678, + "step": 14862 + }, + { + "epoch": 2.408133506156837, + "grad_norm": 0.9226017594337463, + "learning_rate": 4.932518472528292e-07, + "loss": 0.0626, + "step": 14863 + }, + { + "epoch": 2.408295528191834, + "grad_norm": 0.8513705730438232, + "learning_rate": 4.929910767790536e-07, + "loss": 0.0593, + "step": 14864 + }, + { + "epoch": 2.4084575502268306, + "grad_norm": 0.7933249473571777, + "learning_rate": 4.927303677148942e-07, + "loss": 0.054, + "step": 14865 + }, + { + "epoch": 2.4086195722618275, + "grad_norm": 0.8183956146240234, + "learning_rate": 4.92469720068329e-07, + "loss": 0.0534, + "step": 14866 + }, + { + "epoch": 2.4087815942968245, + "grad_norm": 1.0017709732055664, + "learning_rate": 4.922091338473309e-07, + "loss": 0.0691, + "step": 14867 + }, + { + "epoch": 2.408943616331821, + "grad_norm": 1.0862891674041748, + "learning_rate": 4.919486090598749e-07, + "loss": 0.0652, + "step": 14868 + }, + { + "epoch": 2.409105638366818, + "grad_norm": 0.8221504092216492, + "learning_rate": 4.91688145713933e-07, + "loss": 0.0558, + "step": 14869 + }, + { + "epoch": 2.409267660401815, + "grad_norm": 1.0607209205627441, + "learning_rate": 4.91427743817473e-07, + "loss": 0.0589, + "step": 14870 + }, + { + "epoch": 2.4094296824368113, + "grad_norm": 0.8373808264732361, + "learning_rate": 4.911674033784628e-07, + "loss": 0.0584, + "step": 14871 + }, + { + "epoch": 2.4095917044718083, + "grad_norm": 0.8616558909416199, + "learning_rate": 4.909071244048694e-07, + "loss": 0.0557, + "step": 14872 + }, + { + "epoch": 2.4097537265068047, + "grad_norm": 0.8790077567100525, + "learning_rate": 4.906469069046568e-07, + "loss": 0.0644, + "step": 14873 + }, + { + "epoch": 2.4099157485418017, + "grad_norm": 0.9295965433120728, + "learning_rate": 4.903867508857857e-07, + "loss": 0.0584, + "step": 14874 + }, + { + "epoch": 2.4100777705767986, + "grad_norm": 0.8102318048477173, + "learning_rate": 4.901266563562168e-07, + "loss": 0.0586, + "step": 14875 + }, + { + "epoch": 2.410239792611795, + "grad_norm": 0.8939180970191956, + "learning_rate": 4.898666233239083e-07, + "loss": 0.0602, + "step": 14876 + }, + { + "epoch": 2.410401814646792, + "grad_norm": 0.8693578839302063, + "learning_rate": 4.896066517968167e-07, + "loss": 0.0623, + "step": 14877 + }, + { + "epoch": 2.4105638366817885, + "grad_norm": 0.9918479323387146, + "learning_rate": 4.893467417828967e-07, + "loss": 0.0659, + "step": 14878 + }, + { + "epoch": 2.4107258587167855, + "grad_norm": 0.8719764351844788, + "learning_rate": 4.890868932901005e-07, + "loss": 0.0604, + "step": 14879 + }, + { + "epoch": 2.4108878807517824, + "grad_norm": 0.8317460417747498, + "learning_rate": 4.888271063263791e-07, + "loss": 0.0602, + "step": 14880 + }, + { + "epoch": 2.411049902786779, + "grad_norm": 1.0084984302520752, + "learning_rate": 4.885673808996816e-07, + "loss": 0.0651, + "step": 14881 + }, + { + "epoch": 2.411211924821776, + "grad_norm": 0.9814823269844055, + "learning_rate": 4.883077170179542e-07, + "loss": 0.0678, + "step": 14882 + }, + { + "epoch": 2.4113739468567728, + "grad_norm": 0.9129720330238342, + "learning_rate": 4.880481146891428e-07, + "loss": 0.0585, + "step": 14883 + }, + { + "epoch": 2.4115359688917692, + "grad_norm": 0.827803909778595, + "learning_rate": 4.877885739211907e-07, + "loss": 0.0594, + "step": 14884 + }, + { + "epoch": 2.411697990926766, + "grad_norm": 1.2343494892120361, + "learning_rate": 4.875290947220382e-07, + "loss": 0.0581, + "step": 14885 + }, + { + "epoch": 2.4118600129617627, + "grad_norm": 0.9479061961174011, + "learning_rate": 4.872696770996246e-07, + "loss": 0.0645, + "step": 14886 + }, + { + "epoch": 2.4120220349967596, + "grad_norm": 0.9281307458877563, + "learning_rate": 4.870103210618895e-07, + "loss": 0.0629, + "step": 14887 + }, + { + "epoch": 2.412184057031756, + "grad_norm": 0.9015679359436035, + "learning_rate": 4.867510266167669e-07, + "loss": 0.0598, + "step": 14888 + }, + { + "epoch": 2.412346079066753, + "grad_norm": 1.0281862020492554, + "learning_rate": 4.864917937721905e-07, + "loss": 0.0627, + "step": 14889 + }, + { + "epoch": 2.41250810110175, + "grad_norm": 0.7588191032409668, + "learning_rate": 4.862326225360927e-07, + "loss": 0.0536, + "step": 14890 + }, + { + "epoch": 2.4126701231367464, + "grad_norm": 0.9489138722419739, + "learning_rate": 4.859735129164036e-07, + "loss": 0.0613, + "step": 14891 + }, + { + "epoch": 2.4128321451717434, + "grad_norm": 0.9236519932746887, + "learning_rate": 4.85714464921051e-07, + "loss": 0.0605, + "step": 14892 + }, + { + "epoch": 2.4129941672067403, + "grad_norm": 0.9763721823692322, + "learning_rate": 4.854554785579613e-07, + "loss": 0.0568, + "step": 14893 + }, + { + "epoch": 2.413156189241737, + "grad_norm": 0.9076926112174988, + "learning_rate": 4.851965538350589e-07, + "loss": 0.0613, + "step": 14894 + }, + { + "epoch": 2.4133182112767337, + "grad_norm": 0.8788256645202637, + "learning_rate": 4.849376907602662e-07, + "loss": 0.0606, + "step": 14895 + }, + { + "epoch": 2.4134802333117302, + "grad_norm": 0.9264197945594788, + "learning_rate": 4.846788893415038e-07, + "loss": 0.0653, + "step": 14896 + }, + { + "epoch": 2.413642255346727, + "grad_norm": 0.8280066847801208, + "learning_rate": 4.844201495866904e-07, + "loss": 0.0616, + "step": 14897 + }, + { + "epoch": 2.413804277381724, + "grad_norm": 0.9356870055198669, + "learning_rate": 4.841614715037429e-07, + "loss": 0.0621, + "step": 14898 + }, + { + "epoch": 2.4139662994167206, + "grad_norm": 0.9848315715789795, + "learning_rate": 4.839028551005767e-07, + "loss": 0.0644, + "step": 14899 + }, + { + "epoch": 2.4141283214517175, + "grad_norm": 0.9987033605575562, + "learning_rate": 4.83644300385103e-07, + "loss": 0.0686, + "step": 14900 + }, + { + "epoch": 2.414290343486714, + "grad_norm": 0.8517066836357117, + "learning_rate": 4.833858073652351e-07, + "loss": 0.0565, + "step": 14901 + }, + { + "epoch": 2.414452365521711, + "grad_norm": 0.9564343094825745, + "learning_rate": 4.831273760488816e-07, + "loss": 0.0659, + "step": 14902 + }, + { + "epoch": 2.414614387556708, + "grad_norm": 0.9652625322341919, + "learning_rate": 4.828690064439492e-07, + "loss": 0.0632, + "step": 14903 + }, + { + "epoch": 2.4147764095917044, + "grad_norm": 0.9745001792907715, + "learning_rate": 4.82610698558344e-07, + "loss": 0.0628, + "step": 14904 + }, + { + "epoch": 2.4149384316267013, + "grad_norm": 0.9366600513458252, + "learning_rate": 4.823524523999685e-07, + "loss": 0.0614, + "step": 14905 + }, + { + "epoch": 2.4151004536616982, + "grad_norm": 0.8678472638130188, + "learning_rate": 4.820942679767268e-07, + "loss": 0.0553, + "step": 14906 + }, + { + "epoch": 2.4152624756966947, + "grad_norm": 1.0045222043991089, + "learning_rate": 4.818361452965165e-07, + "loss": 0.0626, + "step": 14907 + }, + { + "epoch": 2.4154244977316917, + "grad_norm": 0.8525367379188538, + "learning_rate": 4.815780843672366e-07, + "loss": 0.0577, + "step": 14908 + }, + { + "epoch": 2.415586519766688, + "grad_norm": 0.8319639563560486, + "learning_rate": 4.813200851967826e-07, + "loss": 0.052, + "step": 14909 + }, + { + "epoch": 2.415748541801685, + "grad_norm": 0.8100563287734985, + "learning_rate": 4.810621477930488e-07, + "loss": 0.0592, + "step": 14910 + }, + { + "epoch": 2.4159105638366816, + "grad_norm": 0.8080427050590515, + "learning_rate": 4.808042721639275e-07, + "loss": 0.0556, + "step": 14911 + }, + { + "epoch": 2.4160725858716785, + "grad_norm": 1.0545318126678467, + "learning_rate": 4.805464583173094e-07, + "loss": 0.0604, + "step": 14912 + }, + { + "epoch": 2.4162346079066754, + "grad_norm": 1.0038424730300903, + "learning_rate": 4.802887062610831e-07, + "loss": 0.064, + "step": 14913 + }, + { + "epoch": 2.416396629941672, + "grad_norm": 0.9150903224945068, + "learning_rate": 4.800310160031335e-07, + "loss": 0.0612, + "step": 14914 + }, + { + "epoch": 2.416558651976669, + "grad_norm": 1.0144057273864746, + "learning_rate": 4.797733875513475e-07, + "loss": 0.0604, + "step": 14915 + }, + { + "epoch": 2.416720674011666, + "grad_norm": 0.8084055781364441, + "learning_rate": 4.795158209136067e-07, + "loss": 0.0516, + "step": 14916 + }, + { + "epoch": 2.4168826960466623, + "grad_norm": 1.1354771852493286, + "learning_rate": 4.792583160977929e-07, + "loss": 0.0603, + "step": 14917 + }, + { + "epoch": 2.417044718081659, + "grad_norm": 0.9197603464126587, + "learning_rate": 4.79000873111784e-07, + "loss": 0.0604, + "step": 14918 + }, + { + "epoch": 2.4172067401166557, + "grad_norm": 0.8190104961395264, + "learning_rate": 4.787434919634571e-07, + "loss": 0.0541, + "step": 14919 + }, + { + "epoch": 2.4173687621516526, + "grad_norm": 0.9333441853523254, + "learning_rate": 4.784861726606893e-07, + "loss": 0.0566, + "step": 14920 + }, + { + "epoch": 2.4175307841866496, + "grad_norm": 0.7886890172958374, + "learning_rate": 4.782289152113518e-07, + "loss": 0.0567, + "step": 14921 + }, + { + "epoch": 2.417692806221646, + "grad_norm": 0.9538326859474182, + "learning_rate": 4.779717196233169e-07, + "loss": 0.0607, + "step": 14922 + }, + { + "epoch": 2.417854828256643, + "grad_norm": 0.8618488311767578, + "learning_rate": 4.777145859044543e-07, + "loss": 0.0589, + "step": 14923 + }, + { + "epoch": 2.4180168502916395, + "grad_norm": 0.9344967007637024, + "learning_rate": 4.774575140626317e-07, + "loss": 0.0666, + "step": 14924 + }, + { + "epoch": 2.4181788723266364, + "grad_norm": 1.032467007637024, + "learning_rate": 4.772005041057146e-07, + "loss": 0.0656, + "step": 14925 + }, + { + "epoch": 2.4183408943616334, + "grad_norm": 0.7819418907165527, + "learning_rate": 4.769435560415666e-07, + "loss": 0.0565, + "step": 14926 + }, + { + "epoch": 2.41850291639663, + "grad_norm": 0.8860273361206055, + "learning_rate": 4.766866698780506e-07, + "loss": 0.062, + "step": 14927 + }, + { + "epoch": 2.4186649384316268, + "grad_norm": 1.2574406862258911, + "learning_rate": 4.764298456230265e-07, + "loss": 0.059, + "step": 14928 + }, + { + "epoch": 2.4188269604666237, + "grad_norm": 0.9362139105796814, + "learning_rate": 4.7617308328435115e-07, + "loss": 0.0599, + "step": 14929 + }, + { + "epoch": 2.41898898250162, + "grad_norm": 1.0488052368164062, + "learning_rate": 4.7591638286988234e-07, + "loss": 0.0639, + "step": 14930 + }, + { + "epoch": 2.419151004536617, + "grad_norm": 0.9460102915763855, + "learning_rate": 4.756597443874747e-07, + "loss": 0.0642, + "step": 14931 + }, + { + "epoch": 2.4193130265716136, + "grad_norm": 0.952669084072113, + "learning_rate": 4.754031678449794e-07, + "loss": 0.0558, + "step": 14932 + }, + { + "epoch": 2.4194750486066106, + "grad_norm": 0.9806849360466003, + "learning_rate": 4.75146653250248e-07, + "loss": 0.066, + "step": 14933 + }, + { + "epoch": 2.419637070641607, + "grad_norm": 0.9641299247741699, + "learning_rate": 4.7489020061112805e-07, + "loss": 0.0566, + "step": 14934 + }, + { + "epoch": 2.419799092676604, + "grad_norm": 0.7902235984802246, + "learning_rate": 4.746338099354686e-07, + "loss": 0.0521, + "step": 14935 + }, + { + "epoch": 2.419961114711601, + "grad_norm": 0.8686903715133667, + "learning_rate": 4.743774812311125e-07, + "loss": 0.059, + "step": 14936 + }, + { + "epoch": 2.4201231367465974, + "grad_norm": 0.8258570432662964, + "learning_rate": 4.7412121450590374e-07, + "loss": 0.0544, + "step": 14937 + }, + { + "epoch": 2.4202851587815943, + "grad_norm": 0.9893523454666138, + "learning_rate": 4.7386500976768337e-07, + "loss": 0.0602, + "step": 14938 + }, + { + "epoch": 2.4204471808165913, + "grad_norm": 0.9651594758033752, + "learning_rate": 4.7360886702429056e-07, + "loss": 0.0649, + "step": 14939 + }, + { + "epoch": 2.4206092028515878, + "grad_norm": 0.8819870352745056, + "learning_rate": 4.733527862835624e-07, + "loss": 0.0593, + "step": 14940 + }, + { + "epoch": 2.4207712248865847, + "grad_norm": 0.9650877714157104, + "learning_rate": 4.730967675533346e-07, + "loss": 0.0641, + "step": 14941 + }, + { + "epoch": 2.420933246921581, + "grad_norm": 0.8652887344360352, + "learning_rate": 4.728408108414409e-07, + "loss": 0.0628, + "step": 14942 + }, + { + "epoch": 2.421095268956578, + "grad_norm": 1.117307186126709, + "learning_rate": 4.7258491615571277e-07, + "loss": 0.0695, + "step": 14943 + }, + { + "epoch": 2.4212572909915746, + "grad_norm": 0.9765766859054565, + "learning_rate": 4.7232908350397984e-07, + "loss": 0.0574, + "step": 14944 + }, + { + "epoch": 2.4214193130265715, + "grad_norm": 0.8039732575416565, + "learning_rate": 4.720733128940699e-07, + "loss": 0.0507, + "step": 14945 + }, + { + "epoch": 2.4215813350615685, + "grad_norm": 0.8668411374092102, + "learning_rate": 4.7181760433381017e-07, + "loss": 0.0637, + "step": 14946 + }, + { + "epoch": 2.421743357096565, + "grad_norm": 0.9708417654037476, + "learning_rate": 4.715619578310227e-07, + "loss": 0.0674, + "step": 14947 + }, + { + "epoch": 2.421905379131562, + "grad_norm": 0.8565574884414673, + "learning_rate": 4.7130637339352995e-07, + "loss": 0.0617, + "step": 14948 + }, + { + "epoch": 2.422067401166559, + "grad_norm": 0.8651789426803589, + "learning_rate": 4.7105085102915365e-07, + "loss": 0.0635, + "step": 14949 + }, + { + "epoch": 2.4222294232015553, + "grad_norm": 0.9995070695877075, + "learning_rate": 4.707953907457119e-07, + "loss": 0.0622, + "step": 14950 + }, + { + "epoch": 2.4223914452365523, + "grad_norm": 0.948974072933197, + "learning_rate": 4.7053999255101987e-07, + "loss": 0.0637, + "step": 14951 + }, + { + "epoch": 2.4225534672715487, + "grad_norm": 0.8724423050880432, + "learning_rate": 4.702846564528929e-07, + "loss": 0.0648, + "step": 14952 + }, + { + "epoch": 2.4227154893065457, + "grad_norm": 0.8870061039924622, + "learning_rate": 4.700293824591437e-07, + "loss": 0.0611, + "step": 14953 + }, + { + "epoch": 2.4228775113415426, + "grad_norm": 0.7316666841506958, + "learning_rate": 4.6977417057758297e-07, + "loss": 0.0513, + "step": 14954 + }, + { + "epoch": 2.423039533376539, + "grad_norm": 0.8627529740333557, + "learning_rate": 4.695190208160197e-07, + "loss": 0.0617, + "step": 14955 + }, + { + "epoch": 2.423201555411536, + "grad_norm": 0.8494905829429626, + "learning_rate": 4.6926393318226045e-07, + "loss": 0.057, + "step": 14956 + }, + { + "epoch": 2.4233635774465325, + "grad_norm": 1.1695611476898193, + "learning_rate": 4.6900890768411145e-07, + "loss": 0.0733, + "step": 14957 + }, + { + "epoch": 2.4235255994815295, + "grad_norm": 0.8258056640625, + "learning_rate": 4.6875394432937345e-07, + "loss": 0.0581, + "step": 14958 + }, + { + "epoch": 2.4236876215165264, + "grad_norm": 0.8477557897567749, + "learning_rate": 4.684990431258499e-07, + "loss": 0.0589, + "step": 14959 + }, + { + "epoch": 2.423849643551523, + "grad_norm": 0.9261993765830994, + "learning_rate": 4.6824420408133953e-07, + "loss": 0.0632, + "step": 14960 + }, + { + "epoch": 2.42401166558652, + "grad_norm": 1.014751672744751, + "learning_rate": 4.6798942720364063e-07, + "loss": 0.0599, + "step": 14961 + }, + { + "epoch": 2.4241736876215167, + "grad_norm": 1.0292463302612305, + "learning_rate": 4.677347125005463e-07, + "loss": 0.0717, + "step": 14962 + }, + { + "epoch": 2.4243357096565132, + "grad_norm": 0.988088846206665, + "learning_rate": 4.6748005997985264e-07, + "loss": 0.0617, + "step": 14963 + }, + { + "epoch": 2.42449773169151, + "grad_norm": 1.0865875482559204, + "learning_rate": 4.6722546964935114e-07, + "loss": 0.0694, + "step": 14964 + }, + { + "epoch": 2.4246597537265067, + "grad_norm": 0.8739269375801086, + "learning_rate": 4.6697094151683026e-07, + "loss": 0.0573, + "step": 14965 + }, + { + "epoch": 2.4248217757615036, + "grad_norm": 0.983854353427887, + "learning_rate": 4.6671647559007884e-07, + "loss": 0.0637, + "step": 14966 + }, + { + "epoch": 2.4249837977965, + "grad_norm": 0.8353926539421082, + "learning_rate": 4.6646207187688197e-07, + "loss": 0.0606, + "step": 14967 + }, + { + "epoch": 2.425145819831497, + "grad_norm": 0.9040305018424988, + "learning_rate": 4.6620773038502625e-07, + "loss": 0.0676, + "step": 14968 + }, + { + "epoch": 2.425307841866494, + "grad_norm": 0.89899080991745, + "learning_rate": 4.659534511222916e-07, + "loss": 0.0592, + "step": 14969 + }, + { + "epoch": 2.4254698639014904, + "grad_norm": 1.2018378973007202, + "learning_rate": 4.656992340964589e-07, + "loss": 0.0634, + "step": 14970 + }, + { + "epoch": 2.4256318859364874, + "grad_norm": 0.8837653398513794, + "learning_rate": 4.6544507931530676e-07, + "loss": 0.0666, + "step": 14971 + }, + { + "epoch": 2.4257939079714843, + "grad_norm": 0.8115970492362976, + "learning_rate": 4.651909867866117e-07, + "loss": 0.0603, + "step": 14972 + }, + { + "epoch": 2.425955930006481, + "grad_norm": 0.8570439219474792, + "learning_rate": 4.649369565181483e-07, + "loss": 0.0545, + "step": 14973 + }, + { + "epoch": 2.4261179520414777, + "grad_norm": 0.8621937036514282, + "learning_rate": 4.64682988517689e-07, + "loss": 0.0586, + "step": 14974 + }, + { + "epoch": 2.426279974076474, + "grad_norm": 0.8469997048377991, + "learning_rate": 4.6442908279300536e-07, + "loss": 0.0578, + "step": 14975 + }, + { + "epoch": 2.426441996111471, + "grad_norm": 0.898225724697113, + "learning_rate": 4.641752393518661e-07, + "loss": 0.0597, + "step": 14976 + }, + { + "epoch": 2.426604018146468, + "grad_norm": 0.8789613842964172, + "learning_rate": 4.639214582020368e-07, + "loss": 0.0572, + "step": 14977 + }, + { + "epoch": 2.4267660401814646, + "grad_norm": 0.9776742458343506, + "learning_rate": 4.6366773935128423e-07, + "loss": 0.062, + "step": 14978 + }, + { + "epoch": 2.4269280622164615, + "grad_norm": 0.9235649704933167, + "learning_rate": 4.634140828073716e-07, + "loss": 0.0606, + "step": 14979 + }, + { + "epoch": 2.427090084251458, + "grad_norm": 0.954810619354248, + "learning_rate": 4.631604885780591e-07, + "loss": 0.064, + "step": 14980 + }, + { + "epoch": 2.427252106286455, + "grad_norm": 0.9652833938598633, + "learning_rate": 4.629069566711059e-07, + "loss": 0.061, + "step": 14981 + }, + { + "epoch": 2.427414128321452, + "grad_norm": 0.9159372448921204, + "learning_rate": 4.6265348709427146e-07, + "loss": 0.0648, + "step": 14982 + }, + { + "epoch": 2.4275761503564484, + "grad_norm": 0.8039860725402832, + "learning_rate": 4.6240007985530913e-07, + "loss": 0.0529, + "step": 14983 + }, + { + "epoch": 2.4277381723914453, + "grad_norm": 0.8820045590400696, + "learning_rate": 4.621467349619738e-07, + "loss": 0.0562, + "step": 14984 + }, + { + "epoch": 2.4279001944264422, + "grad_norm": 0.8682768940925598, + "learning_rate": 4.6189345242201674e-07, + "loss": 0.0572, + "step": 14985 + }, + { + "epoch": 2.4280622164614387, + "grad_norm": 0.830511748790741, + "learning_rate": 4.6164023224318786e-07, + "loss": 0.0606, + "step": 14986 + }, + { + "epoch": 2.4282242384964356, + "grad_norm": 0.9582686424255371, + "learning_rate": 4.6138707443323523e-07, + "loss": 0.064, + "step": 14987 + }, + { + "epoch": 2.428386260531432, + "grad_norm": 0.7856261134147644, + "learning_rate": 4.6113397899990474e-07, + "loss": 0.0495, + "step": 14988 + }, + { + "epoch": 2.428548282566429, + "grad_norm": 1.0667169094085693, + "learning_rate": 4.6088094595094057e-07, + "loss": 0.0712, + "step": 14989 + }, + { + "epoch": 2.4287103046014256, + "grad_norm": 0.9162574410438538, + "learning_rate": 4.6062797529408537e-07, + "loss": 0.0666, + "step": 14990 + }, + { + "epoch": 2.4288723266364225, + "grad_norm": 0.8367456197738647, + "learning_rate": 4.603750670370777e-07, + "loss": 0.0615, + "step": 14991 + }, + { + "epoch": 2.4290343486714194, + "grad_norm": 1.0230121612548828, + "learning_rate": 4.6012222118765796e-07, + "loss": 0.063, + "step": 14992 + }, + { + "epoch": 2.429196370706416, + "grad_norm": 0.9114834666252136, + "learning_rate": 4.598694377535617e-07, + "loss": 0.0676, + "step": 14993 + }, + { + "epoch": 2.429358392741413, + "grad_norm": 1.0044584274291992, + "learning_rate": 4.5961671674252447e-07, + "loss": 0.0695, + "step": 14994 + }, + { + "epoch": 2.42952041477641, + "grad_norm": 1.0129789113998413, + "learning_rate": 4.593640581622771e-07, + "loss": 0.0668, + "step": 14995 + }, + { + "epoch": 2.4296824368114063, + "grad_norm": 1.0267726182937622, + "learning_rate": 4.5911146202055113e-07, + "loss": 0.0645, + "step": 14996 + }, + { + "epoch": 2.429844458846403, + "grad_norm": 0.8808178901672363, + "learning_rate": 4.588589283250763e-07, + "loss": 0.063, + "step": 14997 + }, + { + "epoch": 2.4300064808813997, + "grad_norm": 1.2315572500228882, + "learning_rate": 4.5860645708357855e-07, + "loss": 0.0587, + "step": 14998 + }, + { + "epoch": 2.4301685029163966, + "grad_norm": 0.8882023096084595, + "learning_rate": 4.5835404830378296e-07, + "loss": 0.0604, + "step": 14999 + }, + { + "epoch": 2.4303305249513936, + "grad_norm": 0.8703988790512085, + "learning_rate": 4.581017019934131e-07, + "loss": 0.0572, + "step": 15000 + }, + { + "epoch": 2.43049254698639, + "grad_norm": 0.9442451596260071, + "learning_rate": 4.578494181601895e-07, + "loss": 0.0641, + "step": 15001 + }, + { + "epoch": 2.430654569021387, + "grad_norm": 0.8746116757392883, + "learning_rate": 4.57597196811832e-07, + "loss": 0.0652, + "step": 15002 + }, + { + "epoch": 2.4308165910563835, + "grad_norm": 0.8572196364402771, + "learning_rate": 4.5734503795605763e-07, + "loss": 0.058, + "step": 15003 + }, + { + "epoch": 2.4309786130913804, + "grad_norm": 0.8255354166030884, + "learning_rate": 4.5709294160058204e-07, + "loss": 0.0576, + "step": 15004 + }, + { + "epoch": 2.4311406351263773, + "grad_norm": 0.918545126914978, + "learning_rate": 4.5684090775311855e-07, + "loss": 0.0653, + "step": 15005 + }, + { + "epoch": 2.431302657161374, + "grad_norm": 0.9600496292114258, + "learning_rate": 4.565889364213791e-07, + "loss": 0.0665, + "step": 15006 + }, + { + "epoch": 2.4314646791963708, + "grad_norm": 1.181902527809143, + "learning_rate": 4.5633702761307327e-07, + "loss": 0.0618, + "step": 15007 + }, + { + "epoch": 2.4316267012313677, + "grad_norm": 1.0709116458892822, + "learning_rate": 4.5608518133590933e-07, + "loss": 0.0652, + "step": 15008 + }, + { + "epoch": 2.431788723266364, + "grad_norm": 0.9790709018707275, + "learning_rate": 4.5583339759759203e-07, + "loss": 0.0615, + "step": 15009 + }, + { + "epoch": 2.431950745301361, + "grad_norm": 0.8114708662033081, + "learning_rate": 4.5558167640582545e-07, + "loss": 0.0577, + "step": 15010 + }, + { + "epoch": 2.4321127673363576, + "grad_norm": 0.8501421213150024, + "learning_rate": 4.553300177683129e-07, + "loss": 0.0592, + "step": 15011 + }, + { + "epoch": 2.4322747893713546, + "grad_norm": 0.9928768277168274, + "learning_rate": 4.550784216927542e-07, + "loss": 0.0652, + "step": 15012 + }, + { + "epoch": 2.432436811406351, + "grad_norm": 1.0321820974349976, + "learning_rate": 4.5482688818684646e-07, + "loss": 0.0617, + "step": 15013 + }, + { + "epoch": 2.432598833441348, + "grad_norm": 0.8856379985809326, + "learning_rate": 4.5457541725828696e-07, + "loss": 0.0677, + "step": 15014 + }, + { + "epoch": 2.432760855476345, + "grad_norm": 0.8795359134674072, + "learning_rate": 4.543240089147699e-07, + "loss": 0.0588, + "step": 15015 + }, + { + "epoch": 2.4329228775113414, + "grad_norm": 0.8717951774597168, + "learning_rate": 4.5407266316398745e-07, + "loss": 0.0664, + "step": 15016 + }, + { + "epoch": 2.4330848995463383, + "grad_norm": 0.9385889768600464, + "learning_rate": 4.5382138001363067e-07, + "loss": 0.0656, + "step": 15017 + }, + { + "epoch": 2.4332469215813353, + "grad_norm": 0.8398104906082153, + "learning_rate": 4.5357015947138786e-07, + "loss": 0.0579, + "step": 15018 + }, + { + "epoch": 2.4334089436163318, + "grad_norm": 0.9150863289833069, + "learning_rate": 4.5331900154494623e-07, + "loss": 0.0595, + "step": 15019 + }, + { + "epoch": 2.4335709656513287, + "grad_norm": 0.9102035164833069, + "learning_rate": 4.530679062419899e-07, + "loss": 0.059, + "step": 15020 + }, + { + "epoch": 2.433732987686325, + "grad_norm": 0.9387508034706116, + "learning_rate": 4.528168735702024e-07, + "loss": 0.0654, + "step": 15021 + }, + { + "epoch": 2.433895009721322, + "grad_norm": 0.9315344095230103, + "learning_rate": 4.5256590353726426e-07, + "loss": 0.0664, + "step": 15022 + }, + { + "epoch": 2.434057031756319, + "grad_norm": 1.0084915161132812, + "learning_rate": 4.523149961508558e-07, + "loss": 0.0587, + "step": 15023 + }, + { + "epoch": 2.4342190537913155, + "grad_norm": 0.8739719390869141, + "learning_rate": 4.520641514186522e-07, + "loss": 0.0623, + "step": 15024 + }, + { + "epoch": 2.4343810758263125, + "grad_norm": 1.0345163345336914, + "learning_rate": 4.5181336934832897e-07, + "loss": 0.0693, + "step": 15025 + }, + { + "epoch": 2.434543097861309, + "grad_norm": 0.8258783221244812, + "learning_rate": 4.5156264994756144e-07, + "loss": 0.065, + "step": 15026 + }, + { + "epoch": 2.434705119896306, + "grad_norm": 0.8560571074485779, + "learning_rate": 4.5131199322401926e-07, + "loss": 0.0637, + "step": 15027 + }, + { + "epoch": 2.434867141931303, + "grad_norm": 0.8309101462364197, + "learning_rate": 4.510613991853721e-07, + "loss": 0.0608, + "step": 15028 + }, + { + "epoch": 2.4350291639662993, + "grad_norm": 0.91250079870224, + "learning_rate": 4.5081086783928754e-07, + "loss": 0.0579, + "step": 15029 + }, + { + "epoch": 2.4351911860012962, + "grad_norm": 0.9563912749290466, + "learning_rate": 4.5056039919343236e-07, + "loss": 0.063, + "step": 15030 + }, + { + "epoch": 2.435353208036293, + "grad_norm": 1.026058554649353, + "learning_rate": 4.503099932554689e-07, + "loss": 0.0614, + "step": 15031 + }, + { + "epoch": 2.4355152300712897, + "grad_norm": 0.9105499386787415, + "learning_rate": 4.5005965003305953e-07, + "loss": 0.064, + "step": 15032 + }, + { + "epoch": 2.4356772521062866, + "grad_norm": 0.8013800978660583, + "learning_rate": 4.49809369533864e-07, + "loss": 0.0565, + "step": 15033 + }, + { + "epoch": 2.435839274141283, + "grad_norm": 1.1963187456130981, + "learning_rate": 4.4955915176554065e-07, + "loss": 0.0628, + "step": 15034 + }, + { + "epoch": 2.43600129617628, + "grad_norm": 0.8717741966247559, + "learning_rate": 4.49308996735745e-07, + "loss": 0.0616, + "step": 15035 + }, + { + "epoch": 2.4361633182112765, + "grad_norm": 0.8936408758163452, + "learning_rate": 4.490589044521315e-07, + "loss": 0.0577, + "step": 15036 + }, + { + "epoch": 2.4363253402462735, + "grad_norm": 0.8526644706726074, + "learning_rate": 4.4880887492235265e-07, + "loss": 0.0613, + "step": 15037 + }, + { + "epoch": 2.4364873622812704, + "grad_norm": 0.8698762059211731, + "learning_rate": 4.4855890815405867e-07, + "loss": 0.0637, + "step": 15038 + }, + { + "epoch": 2.436649384316267, + "grad_norm": 0.8664577007293701, + "learning_rate": 4.483090041548968e-07, + "loss": 0.0591, + "step": 15039 + }, + { + "epoch": 2.436811406351264, + "grad_norm": 0.8795555830001831, + "learning_rate": 4.4805916293251486e-07, + "loss": 0.0665, + "step": 15040 + }, + { + "epoch": 2.4369734283862607, + "grad_norm": 0.8652831315994263, + "learning_rate": 4.4780938449455747e-07, + "loss": 0.0586, + "step": 15041 + }, + { + "epoch": 2.4371354504212572, + "grad_norm": 0.8803925514221191, + "learning_rate": 4.4755966884866606e-07, + "loss": 0.0581, + "step": 15042 + }, + { + "epoch": 2.437297472456254, + "grad_norm": 0.9763633608818054, + "learning_rate": 4.4731001600248234e-07, + "loss": 0.0573, + "step": 15043 + }, + { + "epoch": 2.4374594944912507, + "grad_norm": 0.9532651305198669, + "learning_rate": 4.470604259636438e-07, + "loss": 0.0591, + "step": 15044 + }, + { + "epoch": 2.4376215165262476, + "grad_norm": 0.8046735525131226, + "learning_rate": 4.4681089873978957e-07, + "loss": 0.0601, + "step": 15045 + }, + { + "epoch": 2.4377835385612445, + "grad_norm": 0.9399983882904053, + "learning_rate": 4.465614343385524e-07, + "loss": 0.0584, + "step": 15046 + }, + { + "epoch": 2.437945560596241, + "grad_norm": 0.8739177584648132, + "learning_rate": 4.463120327675663e-07, + "loss": 0.0596, + "step": 15047 + }, + { + "epoch": 2.438107582631238, + "grad_norm": 0.8265982270240784, + "learning_rate": 4.46062694034462e-07, + "loss": 0.0565, + "step": 15048 + }, + { + "epoch": 2.4382696046662344, + "grad_norm": 0.8940845727920532, + "learning_rate": 4.45813418146869e-07, + "loss": 0.0627, + "step": 15049 + }, + { + "epoch": 2.4384316267012314, + "grad_norm": 0.8442987203598022, + "learning_rate": 4.455642051124143e-07, + "loss": 0.0598, + "step": 15050 + }, + { + "epoch": 2.4385936487362283, + "grad_norm": 1.1562384366989136, + "learning_rate": 4.4531505493872334e-07, + "loss": 0.0673, + "step": 15051 + }, + { + "epoch": 2.438755670771225, + "grad_norm": 0.9282644391059875, + "learning_rate": 4.4506596763341985e-07, + "loss": 0.059, + "step": 15052 + }, + { + "epoch": 2.4389176928062217, + "grad_norm": 0.8264767527580261, + "learning_rate": 4.4481694320412383e-07, + "loss": 0.0552, + "step": 15053 + }, + { + "epoch": 2.439079714841218, + "grad_norm": 0.8514079451560974, + "learning_rate": 4.445679816584567e-07, + "loss": 0.0586, + "step": 15054 + }, + { + "epoch": 2.439241736876215, + "grad_norm": 0.8398727178573608, + "learning_rate": 4.4431908300403506e-07, + "loss": 0.061, + "step": 15055 + }, + { + "epoch": 2.439403758911212, + "grad_norm": 0.8882549405097961, + "learning_rate": 4.4407024724847534e-07, + "loss": 0.0603, + "step": 15056 + }, + { + "epoch": 2.4395657809462086, + "grad_norm": 1.0399445295333862, + "learning_rate": 4.4382147439939045e-07, + "loss": 0.0656, + "step": 15057 + }, + { + "epoch": 2.4397278029812055, + "grad_norm": 0.9227023124694824, + "learning_rate": 4.4357276446439197e-07, + "loss": 0.064, + "step": 15058 + }, + { + "epoch": 2.439889825016202, + "grad_norm": 0.911526620388031, + "learning_rate": 4.4332411745109135e-07, + "loss": 0.0632, + "step": 15059 + }, + { + "epoch": 2.440051847051199, + "grad_norm": 0.8623772263526917, + "learning_rate": 4.4307553336709525e-07, + "loss": 0.0604, + "step": 15060 + }, + { + "epoch": 2.440213869086196, + "grad_norm": 0.9016939401626587, + "learning_rate": 4.428270122200104e-07, + "loss": 0.0576, + "step": 15061 + }, + { + "epoch": 2.4403758911211924, + "grad_norm": 0.9120055437088013, + "learning_rate": 4.4257855401744044e-07, + "loss": 0.0614, + "step": 15062 + }, + { + "epoch": 2.4405379131561893, + "grad_norm": 1.0015937089920044, + "learning_rate": 4.4233015876698787e-07, + "loss": 0.0607, + "step": 15063 + }, + { + "epoch": 2.440699935191186, + "grad_norm": 0.8989893794059753, + "learning_rate": 4.42081826476253e-07, + "loss": 0.0566, + "step": 15064 + }, + { + "epoch": 2.4408619572261827, + "grad_norm": 0.9549097418785095, + "learning_rate": 4.4183355715283425e-07, + "loss": 0.0642, + "step": 15065 + }, + { + "epoch": 2.4410239792611796, + "grad_norm": 0.9465216398239136, + "learning_rate": 4.4158535080432803e-07, + "loss": 0.0662, + "step": 15066 + }, + { + "epoch": 2.441186001296176, + "grad_norm": 0.9384377002716064, + "learning_rate": 4.413372074383293e-07, + "loss": 0.0565, + "step": 15067 + }, + { + "epoch": 2.441348023331173, + "grad_norm": 0.8981598019599915, + "learning_rate": 4.4108912706242876e-07, + "loss": 0.0571, + "step": 15068 + }, + { + "epoch": 2.4415100453661696, + "grad_norm": 0.9155302047729492, + "learning_rate": 4.408411096842194e-07, + "loss": 0.0575, + "step": 15069 + }, + { + "epoch": 2.4416720674011665, + "grad_norm": 0.8059252500534058, + "learning_rate": 4.405931553112894e-07, + "loss": 0.0568, + "step": 15070 + }, + { + "epoch": 2.4418340894361634, + "grad_norm": 0.778841495513916, + "learning_rate": 4.4034526395122485e-07, + "loss": 0.0521, + "step": 15071 + }, + { + "epoch": 2.44199611147116, + "grad_norm": 0.8791018724441528, + "learning_rate": 4.4009743561161e-07, + "loss": 0.0576, + "step": 15072 + }, + { + "epoch": 2.442158133506157, + "grad_norm": 1.0327953100204468, + "learning_rate": 4.3984967030002964e-07, + "loss": 0.0668, + "step": 15073 + }, + { + "epoch": 2.442320155541154, + "grad_norm": 0.899375319480896, + "learning_rate": 4.396019680240643e-07, + "loss": 0.0592, + "step": 15074 + }, + { + "epoch": 2.4424821775761503, + "grad_norm": 0.8751654028892517, + "learning_rate": 4.3935432879129215e-07, + "loss": 0.0572, + "step": 15075 + }, + { + "epoch": 2.442644199611147, + "grad_norm": 0.8992448449134827, + "learning_rate": 4.3910675260929096e-07, + "loss": 0.0603, + "step": 15076 + }, + { + "epoch": 2.4428062216461437, + "grad_norm": 0.8739854693412781, + "learning_rate": 4.3885923948563585e-07, + "loss": 0.0588, + "step": 15077 + }, + { + "epoch": 2.4429682436811406, + "grad_norm": 0.8920753598213196, + "learning_rate": 4.386117894278999e-07, + "loss": 0.0579, + "step": 15078 + }, + { + "epoch": 2.4431302657161376, + "grad_norm": 0.9651167988777161, + "learning_rate": 4.383644024436551e-07, + "loss": 0.0569, + "step": 15079 + }, + { + "epoch": 2.443292287751134, + "grad_norm": 0.9695246815681458, + "learning_rate": 4.381170785404704e-07, + "loss": 0.0606, + "step": 15080 + }, + { + "epoch": 2.443454309786131, + "grad_norm": 0.8437383770942688, + "learning_rate": 4.378698177259133e-07, + "loss": 0.0604, + "step": 15081 + }, + { + "epoch": 2.4436163318211275, + "grad_norm": 0.9582122564315796, + "learning_rate": 4.376226200075495e-07, + "loss": 0.0653, + "step": 15082 + }, + { + "epoch": 2.4437783538561244, + "grad_norm": 1.188623070716858, + "learning_rate": 4.3737548539294266e-07, + "loss": 0.0702, + "step": 15083 + }, + { + "epoch": 2.4439403758911213, + "grad_norm": 0.975605309009552, + "learning_rate": 4.3712841388965476e-07, + "loss": 0.0653, + "step": 15084 + }, + { + "epoch": 2.444102397926118, + "grad_norm": 0.9400991201400757, + "learning_rate": 4.368814055052459e-07, + "loss": 0.0586, + "step": 15085 + }, + { + "epoch": 2.4442644199611148, + "grad_norm": 0.957561194896698, + "learning_rate": 4.3663446024727247e-07, + "loss": 0.0627, + "step": 15086 + }, + { + "epoch": 2.4444264419961117, + "grad_norm": 0.8637754917144775, + "learning_rate": 4.3638757812329095e-07, + "loss": 0.0598, + "step": 15087 + }, + { + "epoch": 2.444588464031108, + "grad_norm": 0.7822928428649902, + "learning_rate": 4.3614075914085617e-07, + "loss": 0.0555, + "step": 15088 + }, + { + "epoch": 2.444750486066105, + "grad_norm": 0.9488714337348938, + "learning_rate": 4.358940033075207e-07, + "loss": 0.0554, + "step": 15089 + }, + { + "epoch": 2.4449125081011016, + "grad_norm": 1.0039563179016113, + "learning_rate": 4.356473106308326e-07, + "loss": 0.0636, + "step": 15090 + }, + { + "epoch": 2.4450745301360985, + "grad_norm": 1.1325535774230957, + "learning_rate": 4.3540068111834144e-07, + "loss": 0.0678, + "step": 15091 + }, + { + "epoch": 2.445236552171095, + "grad_norm": 0.962975263595581, + "learning_rate": 4.351541147775931e-07, + "loss": 0.0666, + "step": 15092 + }, + { + "epoch": 2.445398574206092, + "grad_norm": 1.0033221244812012, + "learning_rate": 4.3490761161613186e-07, + "loss": 0.0651, + "step": 15093 + }, + { + "epoch": 2.445560596241089, + "grad_norm": 1.0614312887191772, + "learning_rate": 4.346611716415006e-07, + "loss": 0.0623, + "step": 15094 + }, + { + "epoch": 2.4457226182760854, + "grad_norm": 0.8363829255104065, + "learning_rate": 4.344147948612393e-07, + "loss": 0.0593, + "step": 15095 + }, + { + "epoch": 2.4458846403110823, + "grad_norm": 0.7662982940673828, + "learning_rate": 4.341684812828867e-07, + "loss": 0.0506, + "step": 15096 + }, + { + "epoch": 2.4460466623460793, + "grad_norm": 0.8307511806488037, + "learning_rate": 4.3392223091397925e-07, + "loss": 0.059, + "step": 15097 + }, + { + "epoch": 2.4462086843810757, + "grad_norm": 0.9444789290428162, + "learning_rate": 4.336760437620519e-07, + "loss": 0.0681, + "step": 15098 + }, + { + "epoch": 2.4463707064160727, + "grad_norm": 0.9271620512008667, + "learning_rate": 4.33429919834637e-07, + "loss": 0.0625, + "step": 15099 + }, + { + "epoch": 2.446532728451069, + "grad_norm": 0.7332361936569214, + "learning_rate": 4.331838591392662e-07, + "loss": 0.0545, + "step": 15100 + }, + { + "epoch": 2.446694750486066, + "grad_norm": 0.8970035314559937, + "learning_rate": 4.3293786168346674e-07, + "loss": 0.0612, + "step": 15101 + }, + { + "epoch": 2.446856772521063, + "grad_norm": 0.9213421940803528, + "learning_rate": 4.326919274747668e-07, + "loss": 0.0575, + "step": 15102 + }, + { + "epoch": 2.4470187945560595, + "grad_norm": 0.8858630657196045, + "learning_rate": 4.32446056520692e-07, + "loss": 0.0578, + "step": 15103 + }, + { + "epoch": 2.4471808165910565, + "grad_norm": 1.067657709121704, + "learning_rate": 4.322002488287635e-07, + "loss": 0.0658, + "step": 15104 + }, + { + "epoch": 2.447342838626053, + "grad_norm": 0.9456945061683655, + "learning_rate": 4.319545044065038e-07, + "loss": 0.0584, + "step": 15105 + }, + { + "epoch": 2.44750486066105, + "grad_norm": 0.8441051244735718, + "learning_rate": 4.317088232614308e-07, + "loss": 0.0632, + "step": 15106 + }, + { + "epoch": 2.447666882696047, + "grad_norm": 0.9701780080795288, + "learning_rate": 4.3146320540106397e-07, + "loss": 0.0627, + "step": 15107 + }, + { + "epoch": 2.4478289047310433, + "grad_norm": 0.8207990527153015, + "learning_rate": 4.3121765083291663e-07, + "loss": 0.0608, + "step": 15108 + }, + { + "epoch": 2.4479909267660402, + "grad_norm": 0.8845500946044922, + "learning_rate": 4.3097215956450304e-07, + "loss": 0.0592, + "step": 15109 + }, + { + "epoch": 2.448152948801037, + "grad_norm": 0.8777700662612915, + "learning_rate": 4.307267316033342e-07, + "loss": 0.0663, + "step": 15110 + }, + { + "epoch": 2.4483149708360337, + "grad_norm": 0.9499847888946533, + "learning_rate": 4.3048136695691965e-07, + "loss": 0.0657, + "step": 15111 + }, + { + "epoch": 2.4484769928710306, + "grad_norm": 0.8840598464012146, + "learning_rate": 4.3023606563276753e-07, + "loss": 0.0586, + "step": 15112 + }, + { + "epoch": 2.448639014906027, + "grad_norm": 0.9703836441040039, + "learning_rate": 4.2999082763838293e-07, + "loss": 0.06, + "step": 15113 + }, + { + "epoch": 2.448801036941024, + "grad_norm": 1.0197018384933472, + "learning_rate": 4.297456529812702e-07, + "loss": 0.0734, + "step": 15114 + }, + { + "epoch": 2.4489630589760205, + "grad_norm": 0.9363842606544495, + "learning_rate": 4.2950054166892937e-07, + "loss": 0.0602, + "step": 15115 + }, + { + "epoch": 2.4491250810110174, + "grad_norm": 0.8683654069900513, + "learning_rate": 4.292554937088622e-07, + "loss": 0.0586, + "step": 15116 + }, + { + "epoch": 2.4492871030460144, + "grad_norm": 0.9922870397567749, + "learning_rate": 4.290105091085656e-07, + "loss": 0.0618, + "step": 15117 + }, + { + "epoch": 2.449449125081011, + "grad_norm": 0.9936143755912781, + "learning_rate": 4.287655878755365e-07, + "loss": 0.0596, + "step": 15118 + }, + { + "epoch": 2.449611147116008, + "grad_norm": 0.7621217370033264, + "learning_rate": 4.2852073001726754e-07, + "loss": 0.0545, + "step": 15119 + }, + { + "epoch": 2.4497731691510047, + "grad_norm": 1.023511290550232, + "learning_rate": 4.282759355412505e-07, + "loss": 0.0672, + "step": 15120 + }, + { + "epoch": 2.4499351911860012, + "grad_norm": 0.9315782189369202, + "learning_rate": 4.280312044549778e-07, + "loss": 0.0593, + "step": 15121 + }, + { + "epoch": 2.450097213220998, + "grad_norm": 0.8633824586868286, + "learning_rate": 4.2778653676593534e-07, + "loss": 0.0586, + "step": 15122 + }, + { + "epoch": 2.4502592352559946, + "grad_norm": 0.99221271276474, + "learning_rate": 4.275419324816105e-07, + "loss": 0.0601, + "step": 15123 + }, + { + "epoch": 2.4504212572909916, + "grad_norm": 0.7723276615142822, + "learning_rate": 4.272973916094872e-07, + "loss": 0.0519, + "step": 15124 + }, + { + "epoch": 2.4505832793259885, + "grad_norm": 0.8727956414222717, + "learning_rate": 4.2705291415704757e-07, + "loss": 0.0611, + "step": 15125 + }, + { + "epoch": 2.450745301360985, + "grad_norm": 1.0098901987075806, + "learning_rate": 4.268085001317726e-07, + "loss": 0.0693, + "step": 15126 + }, + { + "epoch": 2.450907323395982, + "grad_norm": 0.9148622751235962, + "learning_rate": 4.2656414954114044e-07, + "loss": 0.0559, + "step": 15127 + }, + { + "epoch": 2.4510693454309784, + "grad_norm": 1.0653668642044067, + "learning_rate": 4.263198623926279e-07, + "loss": 0.0658, + "step": 15128 + }, + { + "epoch": 2.4512313674659754, + "grad_norm": 1.062276005744934, + "learning_rate": 4.260756386937095e-07, + "loss": 0.0647, + "step": 15129 + }, + { + "epoch": 2.4513933895009723, + "grad_norm": 0.9811892509460449, + "learning_rate": 4.258314784518569e-07, + "loss": 0.0649, + "step": 15130 + }, + { + "epoch": 2.451555411535969, + "grad_norm": 0.8770061135292053, + "learning_rate": 4.2558738167454233e-07, + "loss": 0.0617, + "step": 15131 + }, + { + "epoch": 2.4517174335709657, + "grad_norm": 0.860925018787384, + "learning_rate": 4.253433483692337e-07, + "loss": 0.0617, + "step": 15132 + }, + { + "epoch": 2.4518794556059627, + "grad_norm": 0.9953809976577759, + "learning_rate": 4.250993785433988e-07, + "loss": 0.0604, + "step": 15133 + }, + { + "epoch": 2.452041477640959, + "grad_norm": 0.9298536777496338, + "learning_rate": 4.248554722045009e-07, + "loss": 0.0598, + "step": 15134 + }, + { + "epoch": 2.452203499675956, + "grad_norm": 0.8232978582382202, + "learning_rate": 4.246116293600033e-07, + "loss": 0.0578, + "step": 15135 + }, + { + "epoch": 2.4523655217109526, + "grad_norm": 0.9326602220535278, + "learning_rate": 4.2436785001736896e-07, + "loss": 0.0566, + "step": 15136 + }, + { + "epoch": 2.4525275437459495, + "grad_norm": 1.011151671409607, + "learning_rate": 4.241241341840546e-07, + "loss": 0.0606, + "step": 15137 + }, + { + "epoch": 2.452689565780946, + "grad_norm": 0.8477997183799744, + "learning_rate": 4.2388048186751823e-07, + "loss": 0.0556, + "step": 15138 + }, + { + "epoch": 2.452851587815943, + "grad_norm": 0.9269173741340637, + "learning_rate": 4.2363689307521494e-07, + "loss": 0.0573, + "step": 15139 + }, + { + "epoch": 2.45301360985094, + "grad_norm": 0.9142125844955444, + "learning_rate": 4.233933678145982e-07, + "loss": 0.0639, + "step": 15140 + }, + { + "epoch": 2.4531756318859363, + "grad_norm": 1.0160726308822632, + "learning_rate": 4.2314990609311905e-07, + "loss": 0.0579, + "step": 15141 + }, + { + "epoch": 2.4533376539209333, + "grad_norm": 0.950394332408905, + "learning_rate": 4.229065079182268e-07, + "loss": 0.0654, + "step": 15142 + }, + { + "epoch": 2.45349967595593, + "grad_norm": 0.9993029236793518, + "learning_rate": 4.2266317329736904e-07, + "loss": 0.0583, + "step": 15143 + }, + { + "epoch": 2.4536616979909267, + "grad_norm": 0.9236908555030823, + "learning_rate": 4.224199022379913e-07, + "loss": 0.0595, + "step": 15144 + }, + { + "epoch": 2.4538237200259236, + "grad_norm": 0.7894659042358398, + "learning_rate": 4.2217669474753644e-07, + "loss": 0.0562, + "step": 15145 + }, + { + "epoch": 2.45398574206092, + "grad_norm": 0.8922528028488159, + "learning_rate": 4.2193355083344684e-07, + "loss": 0.0649, + "step": 15146 + }, + { + "epoch": 2.454147764095917, + "grad_norm": 0.8606491088867188, + "learning_rate": 4.216904705031624e-07, + "loss": 0.0582, + "step": 15147 + }, + { + "epoch": 2.454309786130914, + "grad_norm": 0.8970129489898682, + "learning_rate": 4.2144745376411946e-07, + "loss": 0.0609, + "step": 15148 + }, + { + "epoch": 2.4544718081659105, + "grad_norm": 0.8637439012527466, + "learning_rate": 4.2120450062375364e-07, + "loss": 0.0556, + "step": 15149 + }, + { + "epoch": 2.4546338302009074, + "grad_norm": 0.9069430828094482, + "learning_rate": 4.2096161108950015e-07, + "loss": 0.0568, + "step": 15150 + }, + { + "epoch": 2.454795852235904, + "grad_norm": 1.070995807647705, + "learning_rate": 4.2071878516879107e-07, + "loss": 0.0597, + "step": 15151 + }, + { + "epoch": 2.454957874270901, + "grad_norm": 0.8751211762428284, + "learning_rate": 4.204760228690546e-07, + "loss": 0.0612, + "step": 15152 + }, + { + "epoch": 2.4551198963058978, + "grad_norm": 0.9568556547164917, + "learning_rate": 4.202333241977194e-07, + "loss": 0.0578, + "step": 15153 + }, + { + "epoch": 2.4552819183408943, + "grad_norm": 0.8110483288764954, + "learning_rate": 4.1999068916221184e-07, + "loss": 0.0536, + "step": 15154 + }, + { + "epoch": 2.455443940375891, + "grad_norm": 1.0597234964370728, + "learning_rate": 4.1974811776995526e-07, + "loss": 0.0536, + "step": 15155 + }, + { + "epoch": 2.4556059624108877, + "grad_norm": 0.8247944712638855, + "learning_rate": 4.1950561002837257e-07, + "loss": 0.0552, + "step": 15156 + }, + { + "epoch": 2.4557679844458846, + "grad_norm": 0.8759192228317261, + "learning_rate": 4.1926316594488315e-07, + "loss": 0.0639, + "step": 15157 + }, + { + "epoch": 2.4559300064808816, + "grad_norm": 0.8859750032424927, + "learning_rate": 4.1902078552690573e-07, + "loss": 0.055, + "step": 15158 + }, + { + "epoch": 2.456092028515878, + "grad_norm": 0.9089791774749756, + "learning_rate": 4.1877846878185635e-07, + "loss": 0.0567, + "step": 15159 + }, + { + "epoch": 2.456254050550875, + "grad_norm": 0.9586995244026184, + "learning_rate": 4.185362157171496e-07, + "loss": 0.0611, + "step": 15160 + }, + { + "epoch": 2.4564160725858715, + "grad_norm": 0.9053599238395691, + "learning_rate": 4.1829402634019746e-07, + "loss": 0.0628, + "step": 15161 + }, + { + "epoch": 2.4565780946208684, + "grad_norm": 0.8209922313690186, + "learning_rate": 4.1805190065841107e-07, + "loss": 0.0568, + "step": 15162 + }, + { + "epoch": 2.4567401166558653, + "grad_norm": 1.2450376749038696, + "learning_rate": 4.178098386791971e-07, + "loss": 0.0639, + "step": 15163 + }, + { + "epoch": 2.456902138690862, + "grad_norm": 0.9918776750564575, + "learning_rate": 4.175678404099637e-07, + "loss": 0.0621, + "step": 15164 + }, + { + "epoch": 2.4570641607258588, + "grad_norm": 0.916986346244812, + "learning_rate": 4.1732590585811586e-07, + "loss": 0.0672, + "step": 15165 + }, + { + "epoch": 2.4572261827608557, + "grad_norm": 1.3180897235870361, + "learning_rate": 4.1708403503105456e-07, + "loss": 0.0618, + "step": 15166 + }, + { + "epoch": 2.457388204795852, + "grad_norm": 0.946431577205658, + "learning_rate": 4.168422279361811e-07, + "loss": 0.059, + "step": 15167 + }, + { + "epoch": 2.457550226830849, + "grad_norm": 0.9107024669647217, + "learning_rate": 4.166004845808941e-07, + "loss": 0.0579, + "step": 15168 + }, + { + "epoch": 2.4577122488658456, + "grad_norm": 0.9876469969749451, + "learning_rate": 4.163588049725914e-07, + "loss": 0.0622, + "step": 15169 + }, + { + "epoch": 2.4578742709008425, + "grad_norm": 0.9634394645690918, + "learning_rate": 4.1611718911866663e-07, + "loss": 0.0572, + "step": 15170 + }, + { + "epoch": 2.458036292935839, + "grad_norm": 0.9489405155181885, + "learning_rate": 4.158756370265127e-07, + "loss": 0.0577, + "step": 15171 + }, + { + "epoch": 2.458198314970836, + "grad_norm": 0.8651463389396667, + "learning_rate": 4.1563414870352093e-07, + "loss": 0.0559, + "step": 15172 + }, + { + "epoch": 2.458360337005833, + "grad_norm": 0.8804042935371399, + "learning_rate": 4.1539272415708014e-07, + "loss": 0.0612, + "step": 15173 + }, + { + "epoch": 2.4585223590408294, + "grad_norm": 0.8089525103569031, + "learning_rate": 4.1515136339457725e-07, + "loss": 0.0591, + "step": 15174 + }, + { + "epoch": 2.4586843810758263, + "grad_norm": 0.8935924768447876, + "learning_rate": 4.1491006642339765e-07, + "loss": 0.0638, + "step": 15175 + }, + { + "epoch": 2.4588464031108233, + "grad_norm": 0.8347843289375305, + "learning_rate": 4.146688332509241e-07, + "loss": 0.0616, + "step": 15176 + }, + { + "epoch": 2.4590084251458197, + "grad_norm": 0.9167835712432861, + "learning_rate": 4.144276638845382e-07, + "loss": 0.0613, + "step": 15177 + }, + { + "epoch": 2.4591704471808167, + "grad_norm": 1.0505998134613037, + "learning_rate": 4.1418655833161794e-07, + "loss": 0.0596, + "step": 15178 + }, + { + "epoch": 2.459332469215813, + "grad_norm": 0.8533574938774109, + "learning_rate": 4.139455165995418e-07, + "loss": 0.0528, + "step": 15179 + }, + { + "epoch": 2.45949449125081, + "grad_norm": 0.8060251474380493, + "learning_rate": 4.137045386956853e-07, + "loss": 0.0542, + "step": 15180 + }, + { + "epoch": 2.459656513285807, + "grad_norm": 0.7060052752494812, + "learning_rate": 4.1346362462742067e-07, + "loss": 0.0521, + "step": 15181 + }, + { + "epoch": 2.4598185353208035, + "grad_norm": 1.0977946519851685, + "learning_rate": 4.1322277440211973e-07, + "loss": 0.062, + "step": 15182 + }, + { + "epoch": 2.4599805573558005, + "grad_norm": 0.7807490229606628, + "learning_rate": 4.129819880271516e-07, + "loss": 0.0566, + "step": 15183 + }, + { + "epoch": 2.460142579390797, + "grad_norm": 1.1565881967544556, + "learning_rate": 4.1274126550988505e-07, + "loss": 0.0633, + "step": 15184 + }, + { + "epoch": 2.460304601425794, + "grad_norm": 0.9105919003486633, + "learning_rate": 4.125006068576842e-07, + "loss": 0.0642, + "step": 15185 + }, + { + "epoch": 2.460466623460791, + "grad_norm": 0.8822482824325562, + "learning_rate": 4.1226001207791327e-07, + "loss": 0.0622, + "step": 15186 + }, + { + "epoch": 2.4606286454957873, + "grad_norm": 0.873638927936554, + "learning_rate": 4.120194811779335e-07, + "loss": 0.0659, + "step": 15187 + }, + { + "epoch": 2.4607906675307842, + "grad_norm": 0.9651090502738953, + "learning_rate": 4.1177901416510485e-07, + "loss": 0.0645, + "step": 15188 + }, + { + "epoch": 2.460952689565781, + "grad_norm": 0.8976024389266968, + "learning_rate": 4.1153861104678505e-07, + "loss": 0.0582, + "step": 15189 + }, + { + "epoch": 2.4611147116007777, + "grad_norm": 1.0942816734313965, + "learning_rate": 4.112982718303299e-07, + "loss": 0.0669, + "step": 15190 + }, + { + "epoch": 2.4612767336357746, + "grad_norm": 1.016480803489685, + "learning_rate": 4.1105799652309347e-07, + "loss": 0.0662, + "step": 15191 + }, + { + "epoch": 2.461438755670771, + "grad_norm": 0.9741160869598389, + "learning_rate": 4.1081778513242606e-07, + "loss": 0.0628, + "step": 15192 + }, + { + "epoch": 2.461600777705768, + "grad_norm": 0.8383246064186096, + "learning_rate": 4.105776376656795e-07, + "loss": 0.054, + "step": 15193 + }, + { + "epoch": 2.4617627997407645, + "grad_norm": 0.8395657539367676, + "learning_rate": 4.103375541302007e-07, + "loss": 0.0606, + "step": 15194 + }, + { + "epoch": 2.4619248217757614, + "grad_norm": 0.8531553745269775, + "learning_rate": 4.1009753453333636e-07, + "loss": 0.0621, + "step": 15195 + }, + { + "epoch": 2.4620868438107584, + "grad_norm": 0.9702677130699158, + "learning_rate": 4.0985757888242965e-07, + "loss": 0.0658, + "step": 15196 + }, + { + "epoch": 2.462248865845755, + "grad_norm": 0.9990732073783875, + "learning_rate": 4.09617687184822e-07, + "loss": 0.0656, + "step": 15197 + }, + { + "epoch": 2.462410887880752, + "grad_norm": 0.888516902923584, + "learning_rate": 4.0937785944785617e-07, + "loss": 0.0645, + "step": 15198 + }, + { + "epoch": 2.4625729099157487, + "grad_norm": 1.0981239080429077, + "learning_rate": 4.091380956788676e-07, + "loss": 0.0567, + "step": 15199 + }, + { + "epoch": 2.462734931950745, + "grad_norm": 0.9011585712432861, + "learning_rate": 4.0889839588519386e-07, + "loss": 0.0584, + "step": 15200 + }, + { + "epoch": 2.462896953985742, + "grad_norm": 0.8724988698959351, + "learning_rate": 4.086587600741687e-07, + "loss": 0.0592, + "step": 15201 + }, + { + "epoch": 2.4630589760207386, + "grad_norm": 1.0473746061325073, + "learning_rate": 4.0841918825312465e-07, + "loss": 0.0654, + "step": 15202 + }, + { + "epoch": 2.4632209980557356, + "grad_norm": 1.0286039113998413, + "learning_rate": 4.0817968042939165e-07, + "loss": 0.066, + "step": 15203 + }, + { + "epoch": 2.4633830200907325, + "grad_norm": 0.9677504301071167, + "learning_rate": 4.0794023661029856e-07, + "loss": 0.0646, + "step": 15204 + }, + { + "epoch": 2.463545042125729, + "grad_norm": 0.9991774559020996, + "learning_rate": 4.0770085680317153e-07, + "loss": 0.0683, + "step": 15205 + }, + { + "epoch": 2.463707064160726, + "grad_norm": 0.8651935458183289, + "learning_rate": 4.0746154101533485e-07, + "loss": 0.064, + "step": 15206 + }, + { + "epoch": 2.4638690861957224, + "grad_norm": 0.9634072184562683, + "learning_rate": 4.072222892541111e-07, + "loss": 0.068, + "step": 15207 + }, + { + "epoch": 2.4640311082307194, + "grad_norm": 0.7955970764160156, + "learning_rate": 4.0698310152682107e-07, + "loss": 0.0583, + "step": 15208 + }, + { + "epoch": 2.4641931302657163, + "grad_norm": 0.8827134966850281, + "learning_rate": 4.067439778407839e-07, + "loss": 0.058, + "step": 15209 + }, + { + "epoch": 2.464355152300713, + "grad_norm": 0.932346522808075, + "learning_rate": 4.065049182033146e-07, + "loss": 0.0603, + "step": 15210 + }, + { + "epoch": 2.4645171743357097, + "grad_norm": 0.8273137807846069, + "learning_rate": 4.0626592262172803e-07, + "loss": 0.0568, + "step": 15211 + }, + { + "epoch": 2.4646791963707066, + "grad_norm": 0.8896094560623169, + "learning_rate": 4.0602699110333795e-07, + "loss": 0.0552, + "step": 15212 + }, + { + "epoch": 2.464841218405703, + "grad_norm": 0.8498284220695496, + "learning_rate": 4.0578812365545533e-07, + "loss": 0.0577, + "step": 15213 + }, + { + "epoch": 2.4650032404407, + "grad_norm": 0.8612973690032959, + "learning_rate": 4.0554932028538774e-07, + "loss": 0.0596, + "step": 15214 + }, + { + "epoch": 2.4651652624756966, + "grad_norm": 0.9590808153152466, + "learning_rate": 4.0531058100044264e-07, + "loss": 0.0692, + "step": 15215 + }, + { + "epoch": 2.4653272845106935, + "grad_norm": 0.9362857341766357, + "learning_rate": 4.050719058079244e-07, + "loss": 0.0629, + "step": 15216 + }, + { + "epoch": 2.46548930654569, + "grad_norm": 1.062246322631836, + "learning_rate": 4.048332947151362e-07, + "loss": 0.0645, + "step": 15217 + }, + { + "epoch": 2.465651328580687, + "grad_norm": 0.8582957983016968, + "learning_rate": 4.045947477293791e-07, + "loss": 0.0562, + "step": 15218 + }, + { + "epoch": 2.465813350615684, + "grad_norm": 0.8503392934799194, + "learning_rate": 4.043562648579519e-07, + "loss": 0.0538, + "step": 15219 + }, + { + "epoch": 2.4659753726506803, + "grad_norm": 0.7822324633598328, + "learning_rate": 4.041178461081519e-07, + "loss": 0.0569, + "step": 15220 + }, + { + "epoch": 2.4661373946856773, + "grad_norm": 0.8472219705581665, + "learning_rate": 4.0387949148727343e-07, + "loss": 0.0584, + "step": 15221 + }, + { + "epoch": 2.466299416720674, + "grad_norm": 0.8778401613235474, + "learning_rate": 4.036412010026103e-07, + "loss": 0.0649, + "step": 15222 + }, + { + "epoch": 2.4664614387556707, + "grad_norm": 0.8573176860809326, + "learning_rate": 4.034029746614532e-07, + "loss": 0.0615, + "step": 15223 + }, + { + "epoch": 2.4666234607906676, + "grad_norm": 0.8038144707679749, + "learning_rate": 4.0316481247109215e-07, + "loss": 0.0562, + "step": 15224 + }, + { + "epoch": 2.466785482825664, + "grad_norm": 0.9153627753257751, + "learning_rate": 4.029267144388127e-07, + "loss": 0.0603, + "step": 15225 + }, + { + "epoch": 2.466947504860661, + "grad_norm": 0.8227776885032654, + "learning_rate": 4.0268868057190075e-07, + "loss": 0.0585, + "step": 15226 + }, + { + "epoch": 2.467109526895658, + "grad_norm": 0.880547285079956, + "learning_rate": 4.0245071087764015e-07, + "loss": 0.0633, + "step": 15227 + }, + { + "epoch": 2.4672715489306545, + "grad_norm": 0.9440403580665588, + "learning_rate": 4.022128053633123e-07, + "loss": 0.062, + "step": 15228 + }, + { + "epoch": 2.4674335709656514, + "grad_norm": 0.8492519855499268, + "learning_rate": 4.0197496403619557e-07, + "loss": 0.0534, + "step": 15229 + }, + { + "epoch": 2.467595593000648, + "grad_norm": 0.7454891800880432, + "learning_rate": 4.017371869035674e-07, + "loss": 0.051, + "step": 15230 + }, + { + "epoch": 2.467757615035645, + "grad_norm": 0.8574808239936829, + "learning_rate": 4.014994739727046e-07, + "loss": 0.0585, + "step": 15231 + }, + { + "epoch": 2.4679196370706418, + "grad_norm": 0.9869388937950134, + "learning_rate": 4.01261825250879e-07, + "loss": 0.063, + "step": 15232 + }, + { + "epoch": 2.4680816591056383, + "grad_norm": 0.9070796966552734, + "learning_rate": 4.0102424074536295e-07, + "loss": 0.0593, + "step": 15233 + }, + { + "epoch": 2.468243681140635, + "grad_norm": 1.0267338752746582, + "learning_rate": 4.0078672046342553e-07, + "loss": 0.0566, + "step": 15234 + }, + { + "epoch": 2.468405703175632, + "grad_norm": 1.009340763092041, + "learning_rate": 4.005492644123346e-07, + "loss": 0.0684, + "step": 15235 + }, + { + "epoch": 2.4685677252106286, + "grad_norm": 0.7549983263015747, + "learning_rate": 4.0031187259935546e-07, + "loss": 0.0585, + "step": 15236 + }, + { + "epoch": 2.4687297472456255, + "grad_norm": 1.0285526514053345, + "learning_rate": 4.0007454503175196e-07, + "loss": 0.0649, + "step": 15237 + }, + { + "epoch": 2.468891769280622, + "grad_norm": 0.9466545581817627, + "learning_rate": 3.998372817167856e-07, + "loss": 0.0668, + "step": 15238 + }, + { + "epoch": 2.469053791315619, + "grad_norm": 0.8367640376091003, + "learning_rate": 3.9960008266171663e-07, + "loss": 0.055, + "step": 15239 + }, + { + "epoch": 2.4692158133506155, + "grad_norm": 0.9086434841156006, + "learning_rate": 3.993629478738012e-07, + "loss": 0.0613, + "step": 15240 + }, + { + "epoch": 2.4693778353856124, + "grad_norm": 0.9851820468902588, + "learning_rate": 3.9912587736029656e-07, + "loss": 0.0644, + "step": 15241 + }, + { + "epoch": 2.4695398574206093, + "grad_norm": 0.8282864093780518, + "learning_rate": 3.988888711284569e-07, + "loss": 0.0498, + "step": 15242 + }, + { + "epoch": 2.469701879455606, + "grad_norm": 1.0456504821777344, + "learning_rate": 3.9865192918553256e-07, + "loss": 0.0621, + "step": 15243 + }, + { + "epoch": 2.4698639014906028, + "grad_norm": 1.0236272811889648, + "learning_rate": 3.9841505153877387e-07, + "loss": 0.0589, + "step": 15244 + }, + { + "epoch": 2.4700259235255997, + "grad_norm": 0.982083797454834, + "learning_rate": 3.981782381954283e-07, + "loss": 0.0691, + "step": 15245 + }, + { + "epoch": 2.470187945560596, + "grad_norm": 0.8207711577415466, + "learning_rate": 3.9794148916274365e-07, + "loss": 0.0525, + "step": 15246 + }, + { + "epoch": 2.470349967595593, + "grad_norm": 0.7450169324874878, + "learning_rate": 3.977048044479617e-07, + "loss": 0.0538, + "step": 15247 + }, + { + "epoch": 2.4705119896305896, + "grad_norm": 0.9385082125663757, + "learning_rate": 3.974681840583255e-07, + "loss": 0.0647, + "step": 15248 + }, + { + "epoch": 2.4706740116655865, + "grad_norm": 0.9451351165771484, + "learning_rate": 3.972316280010749e-07, + "loss": 0.0596, + "step": 15249 + }, + { + "epoch": 2.4708360337005835, + "grad_norm": 1.1026972532272339, + "learning_rate": 3.969951362834476e-07, + "loss": 0.0632, + "step": 15250 + }, + { + "epoch": 2.47099805573558, + "grad_norm": 0.82200026512146, + "learning_rate": 3.967587089126801e-07, + "loss": 0.0561, + "step": 15251 + }, + { + "epoch": 2.471160077770577, + "grad_norm": 0.9545662999153137, + "learning_rate": 3.965223458960063e-07, + "loss": 0.0698, + "step": 15252 + }, + { + "epoch": 2.4713220998055734, + "grad_norm": 1.0327435731887817, + "learning_rate": 3.9628604724065907e-07, + "loss": 0.0596, + "step": 15253 + }, + { + "epoch": 2.4714841218405703, + "grad_norm": 0.9295939207077026, + "learning_rate": 3.9604981295386673e-07, + "loss": 0.0572, + "step": 15254 + }, + { + "epoch": 2.4716461438755672, + "grad_norm": 0.9229394793510437, + "learning_rate": 3.958136430428594e-07, + "loss": 0.0621, + "step": 15255 + }, + { + "epoch": 2.4718081659105637, + "grad_norm": 0.9629893898963928, + "learning_rate": 3.9557753751486237e-07, + "loss": 0.0589, + "step": 15256 + }, + { + "epoch": 2.4719701879455607, + "grad_norm": 0.985286295413971, + "learning_rate": 3.9534149637710073e-07, + "loss": 0.0636, + "step": 15257 + }, + { + "epoch": 2.4721322099805576, + "grad_norm": 0.8284005522727966, + "learning_rate": 3.9510551963679534e-07, + "loss": 0.0588, + "step": 15258 + }, + { + "epoch": 2.472294232015554, + "grad_norm": 1.0502867698669434, + "learning_rate": 3.948696073011668e-07, + "loss": 0.0616, + "step": 15259 + }, + { + "epoch": 2.472456254050551, + "grad_norm": 0.9685097932815552, + "learning_rate": 3.9463375937743546e-07, + "loss": 0.0636, + "step": 15260 + }, + { + "epoch": 2.4726182760855475, + "grad_norm": 0.9071670174598694, + "learning_rate": 3.943979758728153e-07, + "loss": 0.0646, + "step": 15261 + }, + { + "epoch": 2.4727802981205445, + "grad_norm": 0.93815678358078, + "learning_rate": 3.941622567945216e-07, + "loss": 0.0576, + "step": 15262 + }, + { + "epoch": 2.472942320155541, + "grad_norm": 0.8597636222839355, + "learning_rate": 3.93926602149767e-07, + "loss": 0.0634, + "step": 15263 + }, + { + "epoch": 2.473104342190538, + "grad_norm": 0.9428747892379761, + "learning_rate": 3.9369101194576156e-07, + "loss": 0.058, + "step": 15264 + }, + { + "epoch": 2.473266364225535, + "grad_norm": 0.8766636252403259, + "learning_rate": 3.934554861897141e-07, + "loss": 0.0541, + "step": 15265 + }, + { + "epoch": 2.4734283862605313, + "grad_norm": 0.9514452219009399, + "learning_rate": 3.93220024888831e-07, + "loss": 0.0595, + "step": 15266 + }, + { + "epoch": 2.4735904082955282, + "grad_norm": 1.0236924886703491, + "learning_rate": 3.929846280503169e-07, + "loss": 0.0666, + "step": 15267 + }, + { + "epoch": 2.473752430330525, + "grad_norm": 0.8765333294868469, + "learning_rate": 3.927492956813747e-07, + "loss": 0.0601, + "step": 15268 + }, + { + "epoch": 2.4739144523655217, + "grad_norm": 0.871446967124939, + "learning_rate": 3.925140277892037e-07, + "loss": 0.0566, + "step": 15269 + }, + { + "epoch": 2.4740764744005186, + "grad_norm": 0.8625292181968689, + "learning_rate": 3.922788243810038e-07, + "loss": 0.0563, + "step": 15270 + }, + { + "epoch": 2.474238496435515, + "grad_norm": 0.8439027070999146, + "learning_rate": 3.9204368546397144e-07, + "loss": 0.0631, + "step": 15271 + }, + { + "epoch": 2.474400518470512, + "grad_norm": 0.9690973162651062, + "learning_rate": 3.918086110453015e-07, + "loss": 0.0647, + "step": 15272 + }, + { + "epoch": 2.4745625405055085, + "grad_norm": 0.9805171489715576, + "learning_rate": 3.915736011321855e-07, + "loss": 0.0608, + "step": 15273 + }, + { + "epoch": 2.4747245625405054, + "grad_norm": 0.8432701230049133, + "learning_rate": 3.9133865573181524e-07, + "loss": 0.0525, + "step": 15274 + }, + { + "epoch": 2.4748865845755024, + "grad_norm": 1.0578525066375732, + "learning_rate": 3.9110377485138017e-07, + "loss": 0.0603, + "step": 15275 + }, + { + "epoch": 2.475048606610499, + "grad_norm": 0.9432248473167419, + "learning_rate": 3.9086895849806547e-07, + "loss": 0.0623, + "step": 15276 + }, + { + "epoch": 2.475210628645496, + "grad_norm": 0.8474089503288269, + "learning_rate": 3.9063420667905637e-07, + "loss": 0.0598, + "step": 15277 + }, + { + "epoch": 2.4753726506804927, + "grad_norm": 0.940773069858551, + "learning_rate": 3.903995194015364e-07, + "loss": 0.053, + "step": 15278 + }, + { + "epoch": 2.475534672715489, + "grad_norm": 1.0299676656723022, + "learning_rate": 3.9016489667268563e-07, + "loss": 0.0628, + "step": 15279 + }, + { + "epoch": 2.475696694750486, + "grad_norm": 0.8737006187438965, + "learning_rate": 3.899303384996836e-07, + "loss": 0.0612, + "step": 15280 + }, + { + "epoch": 2.4758587167854826, + "grad_norm": 0.9336732029914856, + "learning_rate": 3.8969584488970675e-07, + "loss": 0.0607, + "step": 15281 + }, + { + "epoch": 2.4760207388204796, + "grad_norm": 0.8692206740379333, + "learning_rate": 3.894614158499302e-07, + "loss": 0.0597, + "step": 15282 + }, + { + "epoch": 2.4761827608554765, + "grad_norm": 1.1092135906219482, + "learning_rate": 3.892270513875271e-07, + "loss": 0.0599, + "step": 15283 + }, + { + "epoch": 2.476344782890473, + "grad_norm": 1.1713637113571167, + "learning_rate": 3.889927515096681e-07, + "loss": 0.0662, + "step": 15284 + }, + { + "epoch": 2.47650680492547, + "grad_norm": 1.050701379776001, + "learning_rate": 3.887585162235225e-07, + "loss": 0.0573, + "step": 15285 + }, + { + "epoch": 2.4766688269604664, + "grad_norm": 0.8351500630378723, + "learning_rate": 3.885243455362578e-07, + "loss": 0.0554, + "step": 15286 + }, + { + "epoch": 2.4768308489954634, + "grad_norm": 0.9191629886627197, + "learning_rate": 3.882902394550378e-07, + "loss": 0.0612, + "step": 15287 + }, + { + "epoch": 2.4769928710304603, + "grad_norm": 0.9809505939483643, + "learning_rate": 3.8805619798702565e-07, + "loss": 0.069, + "step": 15288 + }, + { + "epoch": 2.4771548930654568, + "grad_norm": 0.9045706391334534, + "learning_rate": 3.878222211393834e-07, + "loss": 0.0568, + "step": 15289 + }, + { + "epoch": 2.4773169151004537, + "grad_norm": 0.94025719165802, + "learning_rate": 3.8758830891927056e-07, + "loss": 0.0574, + "step": 15290 + }, + { + "epoch": 2.4774789371354506, + "grad_norm": 0.948002278804779, + "learning_rate": 3.8735446133384313e-07, + "loss": 0.0584, + "step": 15291 + }, + { + "epoch": 2.477640959170447, + "grad_norm": 1.0768437385559082, + "learning_rate": 3.8712067839025647e-07, + "loss": 0.0703, + "step": 15292 + }, + { + "epoch": 2.477802981205444, + "grad_norm": 1.0241899490356445, + "learning_rate": 3.8688696009566404e-07, + "loss": 0.07, + "step": 15293 + }, + { + "epoch": 2.4779650032404406, + "grad_norm": 0.8172342777252197, + "learning_rate": 3.86653306457217e-07, + "loss": 0.0562, + "step": 15294 + }, + { + "epoch": 2.4781270252754375, + "grad_norm": 0.9391677379608154, + "learning_rate": 3.864197174820647e-07, + "loss": 0.0619, + "step": 15295 + }, + { + "epoch": 2.478289047310434, + "grad_norm": 0.8953998684883118, + "learning_rate": 3.861861931773542e-07, + "loss": 0.0588, + "step": 15296 + }, + { + "epoch": 2.478451069345431, + "grad_norm": 0.8568360209465027, + "learning_rate": 3.8595273355023054e-07, + "loss": 0.0573, + "step": 15297 + }, + { + "epoch": 2.478613091380428, + "grad_norm": 1.2389211654663086, + "learning_rate": 3.8571933860783785e-07, + "loss": 0.0682, + "step": 15298 + }, + { + "epoch": 2.4787751134154243, + "grad_norm": 0.9053958058357239, + "learning_rate": 3.854860083573167e-07, + "loss": 0.0544, + "step": 15299 + }, + { + "epoch": 2.4789371354504213, + "grad_norm": 0.989845335483551, + "learning_rate": 3.8525274280580646e-07, + "loss": 0.0537, + "step": 15300 + }, + { + "epoch": 2.479099157485418, + "grad_norm": 0.8106503486633301, + "learning_rate": 3.850195419604455e-07, + "loss": 0.0611, + "step": 15301 + }, + { + "epoch": 2.4792611795204147, + "grad_norm": 0.9021776914596558, + "learning_rate": 3.8478640582836733e-07, + "loss": 0.0567, + "step": 15302 + }, + { + "epoch": 2.4794232015554116, + "grad_norm": 0.9458431601524353, + "learning_rate": 3.845533344167068e-07, + "loss": 0.0572, + "step": 15303 + }, + { + "epoch": 2.479585223590408, + "grad_norm": 1.0524413585662842, + "learning_rate": 3.8432032773259574e-07, + "loss": 0.0643, + "step": 15304 + }, + { + "epoch": 2.479747245625405, + "grad_norm": 0.8422672152519226, + "learning_rate": 3.84087385783162e-07, + "loss": 0.0622, + "step": 15305 + }, + { + "epoch": 2.479909267660402, + "grad_norm": 1.0135761499404907, + "learning_rate": 3.838545085755341e-07, + "loss": 0.0624, + "step": 15306 + }, + { + "epoch": 2.4800712896953985, + "grad_norm": 0.9610772728919983, + "learning_rate": 3.8362169611683655e-07, + "loss": 0.0637, + "step": 15307 + }, + { + "epoch": 2.4802333117303954, + "grad_norm": 0.903256356716156, + "learning_rate": 3.8338894841419476e-07, + "loss": 0.0601, + "step": 15308 + }, + { + "epoch": 2.480395333765392, + "grad_norm": 1.0476837158203125, + "learning_rate": 3.831562654747284e-07, + "loss": 0.0646, + "step": 15309 + }, + { + "epoch": 2.480557355800389, + "grad_norm": 0.9361180067062378, + "learning_rate": 3.8292364730555754e-07, + "loss": 0.057, + "step": 15310 + }, + { + "epoch": 2.4807193778353858, + "grad_norm": 0.9486135244369507, + "learning_rate": 3.826910939138001e-07, + "loss": 0.0601, + "step": 15311 + }, + { + "epoch": 2.4808813998703823, + "grad_norm": 0.8238722085952759, + "learning_rate": 3.8245860530657126e-07, + "loss": 0.0562, + "step": 15312 + }, + { + "epoch": 2.481043421905379, + "grad_norm": 0.9989494681358337, + "learning_rate": 3.8222618149098473e-07, + "loss": 0.0571, + "step": 15313 + }, + { + "epoch": 2.481205443940376, + "grad_norm": 0.9721057415008545, + "learning_rate": 3.8199382247415236e-07, + "loss": 0.0678, + "step": 15314 + }, + { + "epoch": 2.4813674659753726, + "grad_norm": 0.9013247489929199, + "learning_rate": 3.817615282631831e-07, + "loss": 0.0597, + "step": 15315 + }, + { + "epoch": 2.4815294880103695, + "grad_norm": 0.9429559707641602, + "learning_rate": 3.8152929886518587e-07, + "loss": 0.0692, + "step": 15316 + }, + { + "epoch": 2.481691510045366, + "grad_norm": 0.803132176399231, + "learning_rate": 3.8129713428726454e-07, + "loss": 0.0536, + "step": 15317 + }, + { + "epoch": 2.481853532080363, + "grad_norm": 0.7906658053398132, + "learning_rate": 3.810650345365241e-07, + "loss": 0.0508, + "step": 15318 + }, + { + "epoch": 2.4820155541153595, + "grad_norm": 0.9531766772270203, + "learning_rate": 3.808329996200663e-07, + "loss": 0.0648, + "step": 15319 + }, + { + "epoch": 2.4821775761503564, + "grad_norm": 0.8146340847015381, + "learning_rate": 3.8060102954499024e-07, + "loss": 0.0545, + "step": 15320 + }, + { + "epoch": 2.4823395981853533, + "grad_norm": 0.9485729932785034, + "learning_rate": 3.8036912431839297e-07, + "loss": 0.055, + "step": 15321 + }, + { + "epoch": 2.48250162022035, + "grad_norm": 0.8682623505592346, + "learning_rate": 3.8013728394737216e-07, + "loss": 0.0593, + "step": 15322 + }, + { + "epoch": 2.4826636422553467, + "grad_norm": 1.042887806892395, + "learning_rate": 3.7990550843902017e-07, + "loss": 0.0639, + "step": 15323 + }, + { + "epoch": 2.4828256642903437, + "grad_norm": 0.9474160075187683, + "learning_rate": 3.796737978004289e-07, + "loss": 0.0667, + "step": 15324 + }, + { + "epoch": 2.48298768632534, + "grad_norm": 0.9614787101745605, + "learning_rate": 3.7944215203868843e-07, + "loss": 0.0681, + "step": 15325 + }, + { + "epoch": 2.483149708360337, + "grad_norm": 0.7932602763175964, + "learning_rate": 3.792105711608865e-07, + "loss": 0.0587, + "step": 15326 + }, + { + "epoch": 2.4833117303953336, + "grad_norm": 0.8903249502182007, + "learning_rate": 3.7897905517410877e-07, + "loss": 0.0531, + "step": 15327 + }, + { + "epoch": 2.4834737524303305, + "grad_norm": 0.9789133667945862, + "learning_rate": 3.7874760408543933e-07, + "loss": 0.0701, + "step": 15328 + }, + { + "epoch": 2.4836357744653275, + "grad_norm": 0.8971397280693054, + "learning_rate": 3.785162179019597e-07, + "loss": 0.0692, + "step": 15329 + }, + { + "epoch": 2.483797796500324, + "grad_norm": 0.9187726974487305, + "learning_rate": 3.7828489663075065e-07, + "loss": 0.0591, + "step": 15330 + }, + { + "epoch": 2.483959818535321, + "grad_norm": 1.0862635374069214, + "learning_rate": 3.7805364027888787e-07, + "loss": 0.0704, + "step": 15331 + }, + { + "epoch": 2.4841218405703174, + "grad_norm": 0.8576655387878418, + "learning_rate": 3.778224488534496e-07, + "loss": 0.062, + "step": 15332 + }, + { + "epoch": 2.4842838626053143, + "grad_norm": 1.0012966394424438, + "learning_rate": 3.7759132236150854e-07, + "loss": 0.068, + "step": 15333 + }, + { + "epoch": 2.4844458846403112, + "grad_norm": 0.8457930088043213, + "learning_rate": 3.773602608101376e-07, + "loss": 0.0632, + "step": 15334 + }, + { + "epoch": 2.4846079066753077, + "grad_norm": 1.0266876220703125, + "learning_rate": 3.771292642064056e-07, + "loss": 0.0679, + "step": 15335 + }, + { + "epoch": 2.4847699287103047, + "grad_norm": 0.8954113721847534, + "learning_rate": 3.7689833255737995e-07, + "loss": 0.0574, + "step": 15336 + }, + { + "epoch": 2.4849319507453016, + "grad_norm": 0.9594757556915283, + "learning_rate": 3.7666746587012885e-07, + "loss": 0.0611, + "step": 15337 + }, + { + "epoch": 2.485093972780298, + "grad_norm": 0.8226574063301086, + "learning_rate": 3.764366641517145e-07, + "loss": 0.0606, + "step": 15338 + }, + { + "epoch": 2.485255994815295, + "grad_norm": 0.9588454365730286, + "learning_rate": 3.762059274091989e-07, + "loss": 0.0639, + "step": 15339 + }, + { + "epoch": 2.4854180168502915, + "grad_norm": 0.913917064666748, + "learning_rate": 3.759752556496421e-07, + "loss": 0.0661, + "step": 15340 + }, + { + "epoch": 2.4855800388852884, + "grad_norm": 0.7962075471878052, + "learning_rate": 3.7574464888010363e-07, + "loss": 0.0535, + "step": 15341 + }, + { + "epoch": 2.485742060920285, + "grad_norm": 0.8324793577194214, + "learning_rate": 3.7551410710763764e-07, + "loss": 0.0583, + "step": 15342 + }, + { + "epoch": 2.485904082955282, + "grad_norm": 1.2726354598999023, + "learning_rate": 3.75283630339299e-07, + "loss": 0.0629, + "step": 15343 + }, + { + "epoch": 2.486066104990279, + "grad_norm": 0.8221186995506287, + "learning_rate": 3.7505321858213926e-07, + "loss": 0.055, + "step": 15344 + }, + { + "epoch": 2.4862281270252753, + "grad_norm": 1.0081355571746826, + "learning_rate": 3.7482287184320897e-07, + "loss": 0.0632, + "step": 15345 + }, + { + "epoch": 2.4863901490602722, + "grad_norm": 0.920180082321167, + "learning_rate": 3.7459259012955606e-07, + "loss": 0.0622, + "step": 15346 + }, + { + "epoch": 2.486552171095269, + "grad_norm": 0.9343588948249817, + "learning_rate": 3.743623734482263e-07, + "loss": 0.0634, + "step": 15347 + }, + { + "epoch": 2.4867141931302656, + "grad_norm": 1.170159935951233, + "learning_rate": 3.7413222180626455e-07, + "loss": 0.0627, + "step": 15348 + }, + { + "epoch": 2.4868762151652626, + "grad_norm": 0.9837419390678406, + "learning_rate": 3.7390213521071193e-07, + "loss": 0.0612, + "step": 15349 + }, + { + "epoch": 2.487038237200259, + "grad_norm": 0.8149502873420715, + "learning_rate": 3.736721136686081e-07, + "loss": 0.0558, + "step": 15350 + }, + { + "epoch": 2.487200259235256, + "grad_norm": 0.8310353755950928, + "learning_rate": 3.7344215718699256e-07, + "loss": 0.0522, + "step": 15351 + }, + { + "epoch": 2.487362281270253, + "grad_norm": 0.8602123260498047, + "learning_rate": 3.7321226577290147e-07, + "loss": 0.0598, + "step": 15352 + }, + { + "epoch": 2.4875243033052494, + "grad_norm": 0.8709070682525635, + "learning_rate": 3.7298243943336784e-07, + "loss": 0.0583, + "step": 15353 + }, + { + "epoch": 2.4876863253402464, + "grad_norm": 0.9925591945648193, + "learning_rate": 3.7275267817542425e-07, + "loss": 0.0599, + "step": 15354 + }, + { + "epoch": 2.487848347375243, + "grad_norm": 0.8003130555152893, + "learning_rate": 3.725229820061008e-07, + "loss": 0.0569, + "step": 15355 + }, + { + "epoch": 2.48801036941024, + "grad_norm": 0.8514525890350342, + "learning_rate": 3.7229335093242587e-07, + "loss": 0.0528, + "step": 15356 + }, + { + "epoch": 2.4881723914452367, + "grad_norm": 0.8922498226165771, + "learning_rate": 3.720637849614253e-07, + "loss": 0.0614, + "step": 15357 + }, + { + "epoch": 2.488334413480233, + "grad_norm": 1.7786387205123901, + "learning_rate": 3.7183428410012326e-07, + "loss": 0.0647, + "step": 15358 + }, + { + "epoch": 2.48849643551523, + "grad_norm": 0.9049399495124817, + "learning_rate": 3.716048483555423e-07, + "loss": 0.063, + "step": 15359 + }, + { + "epoch": 2.488658457550227, + "grad_norm": 0.9355145692825317, + "learning_rate": 3.713754777347023e-07, + "loss": 0.0621, + "step": 15360 + }, + { + "epoch": 2.4888204795852236, + "grad_norm": 1.0823627710342407, + "learning_rate": 3.711461722446216e-07, + "loss": 0.0676, + "step": 15361 + }, + { + "epoch": 2.4889825016202205, + "grad_norm": 0.8411454558372498, + "learning_rate": 3.7091693189231615e-07, + "loss": 0.0586, + "step": 15362 + }, + { + "epoch": 2.489144523655217, + "grad_norm": 0.8815544843673706, + "learning_rate": 3.706877566848008e-07, + "loss": 0.0552, + "step": 15363 + }, + { + "epoch": 2.489306545690214, + "grad_norm": 0.96858811378479, + "learning_rate": 3.704586466290863e-07, + "loss": 0.0595, + "step": 15364 + }, + { + "epoch": 2.4894685677252104, + "grad_norm": 0.8106810450553894, + "learning_rate": 3.7022960173218437e-07, + "loss": 0.0564, + "step": 15365 + }, + { + "epoch": 2.4896305897602073, + "grad_norm": 0.8244809508323669, + "learning_rate": 3.7000062200110266e-07, + "loss": 0.0668, + "step": 15366 + }, + { + "epoch": 2.4897926117952043, + "grad_norm": 0.9649903178215027, + "learning_rate": 3.6977170744284805e-07, + "loss": 0.0636, + "step": 15367 + }, + { + "epoch": 2.4899546338302008, + "grad_norm": 1.087392807006836, + "learning_rate": 3.6954285806442337e-07, + "loss": 0.0681, + "step": 15368 + }, + { + "epoch": 2.4901166558651977, + "grad_norm": 1.129231333732605, + "learning_rate": 3.6931407387283126e-07, + "loss": 0.0688, + "step": 15369 + }, + { + "epoch": 2.4902786779001946, + "grad_norm": 0.8219298720359802, + "learning_rate": 3.6908535487507335e-07, + "loss": 0.0577, + "step": 15370 + }, + { + "epoch": 2.490440699935191, + "grad_norm": 0.8305365443229675, + "learning_rate": 3.688567010781463e-07, + "loss": 0.0571, + "step": 15371 + }, + { + "epoch": 2.490602721970188, + "grad_norm": 0.9187493324279785, + "learning_rate": 3.68628112489047e-07, + "loss": 0.0593, + "step": 15372 + }, + { + "epoch": 2.4907647440051845, + "grad_norm": 0.8878763914108276, + "learning_rate": 3.683995891147696e-07, + "loss": 0.0579, + "step": 15373 + }, + { + "epoch": 2.4909267660401815, + "grad_norm": 0.9653205275535583, + "learning_rate": 3.681711309623065e-07, + "loss": 0.0578, + "step": 15374 + }, + { + "epoch": 2.4910887880751784, + "grad_norm": 0.8415849208831787, + "learning_rate": 3.679427380386477e-07, + "loss": 0.0582, + "step": 15375 + }, + { + "epoch": 2.491250810110175, + "grad_norm": 0.9508287310600281, + "learning_rate": 3.677144103507818e-07, + "loss": 0.0608, + "step": 15376 + }, + { + "epoch": 2.491412832145172, + "grad_norm": 0.9753308892250061, + "learning_rate": 3.674861479056946e-07, + "loss": 0.0614, + "step": 15377 + }, + { + "epoch": 2.4915748541801683, + "grad_norm": 0.8694376349449158, + "learning_rate": 3.672579507103716e-07, + "loss": 0.0622, + "step": 15378 + }, + { + "epoch": 2.4917368762151653, + "grad_norm": 0.854770839214325, + "learning_rate": 3.67029818771793e-07, + "loss": 0.0606, + "step": 15379 + }, + { + "epoch": 2.491898898250162, + "grad_norm": 0.8874652981758118, + "learning_rate": 3.668017520969405e-07, + "loss": 0.0647, + "step": 15380 + }, + { + "epoch": 2.4920609202851587, + "grad_norm": 0.9091949462890625, + "learning_rate": 3.66573750692793e-07, + "loss": 0.0686, + "step": 15381 + }, + { + "epoch": 2.4922229423201556, + "grad_norm": 0.9150681495666504, + "learning_rate": 3.663458145663254e-07, + "loss": 0.0611, + "step": 15382 + }, + { + "epoch": 2.492384964355152, + "grad_norm": 1.12820565700531, + "learning_rate": 3.6611794372451244e-07, + "loss": 0.0716, + "step": 15383 + }, + { + "epoch": 2.492546986390149, + "grad_norm": 0.825421154499054, + "learning_rate": 3.65890138174326e-07, + "loss": 0.0642, + "step": 15384 + }, + { + "epoch": 2.492709008425146, + "grad_norm": 0.9262431263923645, + "learning_rate": 3.6566239792273775e-07, + "loss": 0.0597, + "step": 15385 + }, + { + "epoch": 2.4928710304601425, + "grad_norm": 0.9032964110374451, + "learning_rate": 3.6543472297671495e-07, + "loss": 0.0627, + "step": 15386 + }, + { + "epoch": 2.4930330524951394, + "grad_norm": 0.8417792320251465, + "learning_rate": 3.6520711334322387e-07, + "loss": 0.0637, + "step": 15387 + }, + { + "epoch": 2.493195074530136, + "grad_norm": 0.9112814664840698, + "learning_rate": 3.6497956902922904e-07, + "loss": 0.0601, + "step": 15388 + }, + { + "epoch": 2.493357096565133, + "grad_norm": 1.061842918395996, + "learning_rate": 3.6475209004169286e-07, + "loss": 0.0566, + "step": 15389 + }, + { + "epoch": 2.4935191186001298, + "grad_norm": 0.9109911918640137, + "learning_rate": 3.645246763875754e-07, + "loss": 0.0605, + "step": 15390 + }, + { + "epoch": 2.4936811406351262, + "grad_norm": 0.8097963929176331, + "learning_rate": 3.6429732807383517e-07, + "loss": 0.0591, + "step": 15391 + }, + { + "epoch": 2.493843162670123, + "grad_norm": 0.7953127026557922, + "learning_rate": 3.640700451074289e-07, + "loss": 0.0537, + "step": 15392 + }, + { + "epoch": 2.49400518470512, + "grad_norm": 1.0355387926101685, + "learning_rate": 3.63842827495309e-07, + "loss": 0.0673, + "step": 15393 + }, + { + "epoch": 2.4941672067401166, + "grad_norm": 0.8306580781936646, + "learning_rate": 3.636156752444303e-07, + "loss": 0.0578, + "step": 15394 + }, + { + "epoch": 2.4943292287751135, + "grad_norm": 1.0002905130386353, + "learning_rate": 3.633885883617416e-07, + "loss": 0.0626, + "step": 15395 + }, + { + "epoch": 2.49449125081011, + "grad_norm": 0.9213991165161133, + "learning_rate": 3.631615668541921e-07, + "loss": 0.0528, + "step": 15396 + }, + { + "epoch": 2.494653272845107, + "grad_norm": 1.1550705432891846, + "learning_rate": 3.6293461072872735e-07, + "loss": 0.066, + "step": 15397 + }, + { + "epoch": 2.4948152948801035, + "grad_norm": 0.9875110387802124, + "learning_rate": 3.6270771999229124e-07, + "loss": 0.0553, + "step": 15398 + }, + { + "epoch": 2.4949773169151004, + "grad_norm": 0.9158915281295776, + "learning_rate": 3.6248089465182797e-07, + "loss": 0.0621, + "step": 15399 + }, + { + "epoch": 2.4951393389500973, + "grad_norm": 0.9057533740997314, + "learning_rate": 3.622541347142758e-07, + "loss": 0.0537, + "step": 15400 + }, + { + "epoch": 2.495301360985094, + "grad_norm": 0.8839161396026611, + "learning_rate": 3.6202744018657393e-07, + "loss": 0.0623, + "step": 15401 + }, + { + "epoch": 2.4954633830200907, + "grad_norm": 0.9387931227684021, + "learning_rate": 3.618008110756588e-07, + "loss": 0.0662, + "step": 15402 + }, + { + "epoch": 2.4956254050550877, + "grad_norm": 1.0209976434707642, + "learning_rate": 3.6157424738846427e-07, + "loss": 0.0676, + "step": 15403 + }, + { + "epoch": 2.495787427090084, + "grad_norm": 0.9517818689346313, + "learning_rate": 3.6134774913192314e-07, + "loss": 0.0619, + "step": 15404 + }, + { + "epoch": 2.495949449125081, + "grad_norm": 0.9246603846549988, + "learning_rate": 3.6112131631296507e-07, + "loss": 0.0608, + "step": 15405 + }, + { + "epoch": 2.4961114711600776, + "grad_norm": 0.9202209711074829, + "learning_rate": 3.608949489385191e-07, + "loss": 0.0601, + "step": 15406 + }, + { + "epoch": 2.4962734931950745, + "grad_norm": 0.8076236248016357, + "learning_rate": 3.60668647015511e-07, + "loss": 0.0593, + "step": 15407 + }, + { + "epoch": 2.4964355152300715, + "grad_norm": 0.9734016060829163, + "learning_rate": 3.6044241055086525e-07, + "loss": 0.0606, + "step": 15408 + }, + { + "epoch": 2.496597537265068, + "grad_norm": 1.0602586269378662, + "learning_rate": 3.602162395515041e-07, + "loss": 0.066, + "step": 15409 + }, + { + "epoch": 2.496759559300065, + "grad_norm": 1.0129172801971436, + "learning_rate": 3.599901340243478e-07, + "loss": 0.0573, + "step": 15410 + }, + { + "epoch": 2.4969215813350614, + "grad_norm": 0.8970673084259033, + "learning_rate": 3.597640939763153e-07, + "loss": 0.0616, + "step": 15411 + }, + { + "epoch": 2.4970836033700583, + "grad_norm": 0.8650174140930176, + "learning_rate": 3.5953811941432104e-07, + "loss": 0.0544, + "step": 15412 + }, + { + "epoch": 2.4972456254050552, + "grad_norm": 0.920050323009491, + "learning_rate": 3.593122103452812e-07, + "loss": 0.0581, + "step": 15413 + }, + { + "epoch": 2.4974076474400517, + "grad_norm": 0.9447347521781921, + "learning_rate": 3.590863667761077e-07, + "loss": 0.0596, + "step": 15414 + }, + { + "epoch": 2.4975696694750487, + "grad_norm": 0.8913710713386536, + "learning_rate": 3.5886058871371005e-07, + "loss": 0.0622, + "step": 15415 + }, + { + "epoch": 2.4977316915100456, + "grad_norm": 1.1236344575881958, + "learning_rate": 3.5863487616499713e-07, + "loss": 0.064, + "step": 15416 + }, + { + "epoch": 2.497893713545042, + "grad_norm": 0.8553385138511658, + "learning_rate": 3.584092291368746e-07, + "loss": 0.0516, + "step": 15417 + }, + { + "epoch": 2.498055735580039, + "grad_norm": 0.8064867854118347, + "learning_rate": 3.581836476362474e-07, + "loss": 0.055, + "step": 15418 + }, + { + "epoch": 2.4982177576150355, + "grad_norm": 1.035109043121338, + "learning_rate": 3.579581316700173e-07, + "loss": 0.0661, + "step": 15419 + }, + { + "epoch": 2.4983797796500324, + "grad_norm": 0.9095247983932495, + "learning_rate": 3.5773268124508485e-07, + "loss": 0.0644, + "step": 15420 + }, + { + "epoch": 2.498541801685029, + "grad_norm": 0.7617493867874146, + "learning_rate": 3.575072963683482e-07, + "loss": 0.055, + "step": 15421 + }, + { + "epoch": 2.498703823720026, + "grad_norm": 0.863577663898468, + "learning_rate": 3.5728197704670344e-07, + "loss": 0.0569, + "step": 15422 + }, + { + "epoch": 2.498865845755023, + "grad_norm": 0.8674677610397339, + "learning_rate": 3.5705672328704503e-07, + "loss": 0.0629, + "step": 15423 + }, + { + "epoch": 2.4990278677900193, + "grad_norm": 0.8979498744010925, + "learning_rate": 3.5683153509626504e-07, + "loss": 0.0641, + "step": 15424 + }, + { + "epoch": 2.499189889825016, + "grad_norm": 1.075107216835022, + "learning_rate": 3.566064124812541e-07, + "loss": 0.0667, + "step": 15425 + }, + { + "epoch": 2.499351911860013, + "grad_norm": 0.913252055644989, + "learning_rate": 3.563813554488996e-07, + "loss": 0.0589, + "step": 15426 + }, + { + "epoch": 2.4995139338950096, + "grad_norm": 0.9128844141960144, + "learning_rate": 3.561563640060875e-07, + "loss": 0.0632, + "step": 15427 + }, + { + "epoch": 2.4996759559300066, + "grad_norm": 0.9388177394866943, + "learning_rate": 3.559314381597034e-07, + "loss": 0.0677, + "step": 15428 + }, + { + "epoch": 2.499837977965003, + "grad_norm": 0.8632997870445251, + "learning_rate": 3.557065779166291e-07, + "loss": 0.0533, + "step": 15429 + }, + { + "epoch": 2.5, + "grad_norm": 0.9118877649307251, + "learning_rate": 3.55481783283744e-07, + "loss": 0.0619, + "step": 15430 + }, + { + "epoch": 2.5001620220349965, + "grad_norm": 1.0225169658660889, + "learning_rate": 3.5525705426792624e-07, + "loss": 0.0709, + "step": 15431 + }, + { + "epoch": 2.5003240440699934, + "grad_norm": 0.8662908673286438, + "learning_rate": 3.5503239087605337e-07, + "loss": 0.0596, + "step": 15432 + }, + { + "epoch": 2.5004860661049904, + "grad_norm": 0.7899366617202759, + "learning_rate": 3.548077931149982e-07, + "loss": 0.0581, + "step": 15433 + }, + { + "epoch": 2.500648088139987, + "grad_norm": 0.9061411023139954, + "learning_rate": 3.54583260991633e-07, + "loss": 0.0606, + "step": 15434 + }, + { + "epoch": 2.500810110174984, + "grad_norm": 0.8774332404136658, + "learning_rate": 3.543587945128285e-07, + "loss": 0.0537, + "step": 15435 + }, + { + "epoch": 2.5009721322099807, + "grad_norm": 1.0649468898773193, + "learning_rate": 3.541343936854524e-07, + "loss": 0.0656, + "step": 15436 + }, + { + "epoch": 2.501134154244977, + "grad_norm": 0.9664431810379028, + "learning_rate": 3.53910058516371e-07, + "loss": 0.0612, + "step": 15437 + }, + { + "epoch": 2.501296176279974, + "grad_norm": 1.0078061819076538, + "learning_rate": 3.5368578901244843e-07, + "loss": 0.0584, + "step": 15438 + }, + { + "epoch": 2.501458198314971, + "grad_norm": 1.0571532249450684, + "learning_rate": 3.5346158518054674e-07, + "loss": 0.0649, + "step": 15439 + }, + { + "epoch": 2.5016202203499676, + "grad_norm": 0.8381620049476624, + "learning_rate": 3.5323744702752657e-07, + "loss": 0.0511, + "step": 15440 + }, + { + "epoch": 2.5017822423849645, + "grad_norm": 0.9569916129112244, + "learning_rate": 3.5301337456024434e-07, + "loss": 0.0622, + "step": 15441 + }, + { + "epoch": 2.501944264419961, + "grad_norm": 0.8359852433204651, + "learning_rate": 3.5278936778555763e-07, + "loss": 0.0558, + "step": 15442 + }, + { + "epoch": 2.502106286454958, + "grad_norm": 0.9020277857780457, + "learning_rate": 3.525654267103207e-07, + "loss": 0.0602, + "step": 15443 + }, + { + "epoch": 2.5022683084899544, + "grad_norm": 0.7572386860847473, + "learning_rate": 3.523415513413847e-07, + "loss": 0.0566, + "step": 15444 + }, + { + "epoch": 2.5024303305249513, + "grad_norm": 0.9130625128746033, + "learning_rate": 3.5211774168559976e-07, + "loss": 0.0552, + "step": 15445 + }, + { + "epoch": 2.5025923525599483, + "grad_norm": 1.0692723989486694, + "learning_rate": 3.518939977498137e-07, + "loss": 0.0597, + "step": 15446 + }, + { + "epoch": 2.5027543745949448, + "grad_norm": 0.9212605357170105, + "learning_rate": 3.516703195408741e-07, + "loss": 0.0638, + "step": 15447 + }, + { + "epoch": 2.5029163966299417, + "grad_norm": 0.9287461042404175, + "learning_rate": 3.514467070656233e-07, + "loss": 0.0586, + "step": 15448 + }, + { + "epoch": 2.5030784186649386, + "grad_norm": 0.8843840956687927, + "learning_rate": 3.512231603309038e-07, + "loss": 0.063, + "step": 15449 + }, + { + "epoch": 2.503240440699935, + "grad_norm": 0.8561710119247437, + "learning_rate": 3.509996793435558e-07, + "loss": 0.0593, + "step": 15450 + }, + { + "epoch": 2.503402462734932, + "grad_norm": 1.0155240297317505, + "learning_rate": 3.5077626411041707e-07, + "loss": 0.0709, + "step": 15451 + }, + { + "epoch": 2.5035644847699285, + "grad_norm": 0.9709542393684387, + "learning_rate": 3.505529146383235e-07, + "loss": 0.0626, + "step": 15452 + }, + { + "epoch": 2.5037265068049255, + "grad_norm": 0.9691731929779053, + "learning_rate": 3.503296309341095e-07, + "loss": 0.0617, + "step": 15453 + }, + { + "epoch": 2.503888528839922, + "grad_norm": 0.9437772035598755, + "learning_rate": 3.501064130046064e-07, + "loss": 0.0593, + "step": 15454 + }, + { + "epoch": 2.504050550874919, + "grad_norm": 0.8508868217468262, + "learning_rate": 3.4988326085664463e-07, + "loss": 0.0586, + "step": 15455 + }, + { + "epoch": 2.504212572909916, + "grad_norm": 0.8837978839874268, + "learning_rate": 3.496601744970518e-07, + "loss": 0.0541, + "step": 15456 + }, + { + "epoch": 2.5043745949449123, + "grad_norm": 0.975632905960083, + "learning_rate": 3.494371539326538e-07, + "loss": 0.0556, + "step": 15457 + }, + { + "epoch": 2.5045366169799093, + "grad_norm": 1.0804563760757446, + "learning_rate": 3.492141991702752e-07, + "loss": 0.0585, + "step": 15458 + }, + { + "epoch": 2.504698639014906, + "grad_norm": 0.832747757434845, + "learning_rate": 3.4899131021673693e-07, + "loss": 0.0584, + "step": 15459 + }, + { + "epoch": 2.5048606610499027, + "grad_norm": 0.9667251110076904, + "learning_rate": 3.4876848707885854e-07, + "loss": 0.0595, + "step": 15460 + }, + { + "epoch": 2.5050226830848996, + "grad_norm": 0.9770776629447937, + "learning_rate": 3.4854572976345954e-07, + "loss": 0.0636, + "step": 15461 + }, + { + "epoch": 2.5051847051198965, + "grad_norm": 0.8421487212181091, + "learning_rate": 3.483230382773545e-07, + "loss": 0.0574, + "step": 15462 + }, + { + "epoch": 2.505346727154893, + "grad_norm": 1.0473026037216187, + "learning_rate": 3.481004126273574e-07, + "loss": 0.0623, + "step": 15463 + }, + { + "epoch": 2.50550874918989, + "grad_norm": 0.7978666424751282, + "learning_rate": 3.478778528202803e-07, + "loss": 0.0544, + "step": 15464 + }, + { + "epoch": 2.5056707712248865, + "grad_norm": 1.039564609527588, + "learning_rate": 3.476553588629328e-07, + "loss": 0.0655, + "step": 15465 + }, + { + "epoch": 2.5058327932598834, + "grad_norm": 0.7758963108062744, + "learning_rate": 3.474329307621227e-07, + "loss": 0.053, + "step": 15466 + }, + { + "epoch": 2.50599481529488, + "grad_norm": 0.9446272850036621, + "learning_rate": 3.4721056852465575e-07, + "loss": 0.0561, + "step": 15467 + }, + { + "epoch": 2.506156837329877, + "grad_norm": 0.9161103963851929, + "learning_rate": 3.469882721573356e-07, + "loss": 0.0607, + "step": 15468 + }, + { + "epoch": 2.5063188593648738, + "grad_norm": 0.8289987444877625, + "learning_rate": 3.467660416669649e-07, + "loss": 0.0606, + "step": 15469 + }, + { + "epoch": 2.5064808813998702, + "grad_norm": 0.985321581363678, + "learning_rate": 3.465438770603416e-07, + "loss": 0.0652, + "step": 15470 + }, + { + "epoch": 2.506642903434867, + "grad_norm": 0.8236386179924011, + "learning_rate": 3.463217783442649e-07, + "loss": 0.0561, + "step": 15471 + }, + { + "epoch": 2.506804925469864, + "grad_norm": 1.0456392765045166, + "learning_rate": 3.4609974552552993e-07, + "loss": 0.0686, + "step": 15472 + }, + { + "epoch": 2.5069669475048606, + "grad_norm": 1.1649935245513916, + "learning_rate": 3.4587777861093105e-07, + "loss": 0.0645, + "step": 15473 + }, + { + "epoch": 2.5071289695398575, + "grad_norm": 0.954638659954071, + "learning_rate": 3.456558776072585e-07, + "loss": 0.0642, + "step": 15474 + }, + { + "epoch": 2.507290991574854, + "grad_norm": 0.8497850298881531, + "learning_rate": 3.4543404252130234e-07, + "loss": 0.0587, + "step": 15475 + }, + { + "epoch": 2.507453013609851, + "grad_norm": 0.8479697108268738, + "learning_rate": 3.4521227335985146e-07, + "loss": 0.0582, + "step": 15476 + }, + { + "epoch": 2.5076150356448474, + "grad_norm": 0.8747241497039795, + "learning_rate": 3.449905701296902e-07, + "loss": 0.0555, + "step": 15477 + }, + { + "epoch": 2.5077770576798444, + "grad_norm": 0.9284306764602661, + "learning_rate": 3.447689328376022e-07, + "loss": 0.0651, + "step": 15478 + }, + { + "epoch": 2.5079390797148413, + "grad_norm": 0.9648377299308777, + "learning_rate": 3.445473614903688e-07, + "loss": 0.069, + "step": 15479 + }, + { + "epoch": 2.508101101749838, + "grad_norm": 0.9850466251373291, + "learning_rate": 3.4432585609477125e-07, + "loss": 0.0611, + "step": 15480 + }, + { + "epoch": 2.5082631237848347, + "grad_norm": 0.8666247725486755, + "learning_rate": 3.441044166575855e-07, + "loss": 0.0637, + "step": 15481 + }, + { + "epoch": 2.5084251458198317, + "grad_norm": 0.8115009069442749, + "learning_rate": 3.438830431855872e-07, + "loss": 0.052, + "step": 15482 + }, + { + "epoch": 2.508587167854828, + "grad_norm": 1.0275858640670776, + "learning_rate": 3.4366173568555013e-07, + "loss": 0.0657, + "step": 15483 + }, + { + "epoch": 2.508749189889825, + "grad_norm": 0.9150252342224121, + "learning_rate": 3.434404941642455e-07, + "loss": 0.0587, + "step": 15484 + }, + { + "epoch": 2.508911211924822, + "grad_norm": 1.8770604133605957, + "learning_rate": 3.4321931862844327e-07, + "loss": 0.0569, + "step": 15485 + }, + { + "epoch": 2.5090732339598185, + "grad_norm": 1.0579850673675537, + "learning_rate": 3.4299820908491045e-07, + "loss": 0.0671, + "step": 15486 + }, + { + "epoch": 2.5092352559948155, + "grad_norm": 0.8055444955825806, + "learning_rate": 3.427771655404133e-07, + "loss": 0.0595, + "step": 15487 + }, + { + "epoch": 2.509397278029812, + "grad_norm": 0.8405749797821045, + "learning_rate": 3.4255618800171366e-07, + "loss": 0.0574, + "step": 15488 + }, + { + "epoch": 2.509559300064809, + "grad_norm": 0.9373750686645508, + "learning_rate": 3.423352764755733e-07, + "loss": 0.0611, + "step": 15489 + }, + { + "epoch": 2.5097213220998054, + "grad_norm": 0.957573652267456, + "learning_rate": 3.421144309687527e-07, + "loss": 0.0701, + "step": 15490 + }, + { + "epoch": 2.5098833441348023, + "grad_norm": 0.9812716245651245, + "learning_rate": 3.418936514880092e-07, + "loss": 0.0618, + "step": 15491 + }, + { + "epoch": 2.5100453661697992, + "grad_norm": 0.9846394658088684, + "learning_rate": 3.4167293804009656e-07, + "loss": 0.0691, + "step": 15492 + }, + { + "epoch": 2.5102073882047957, + "grad_norm": 1.0045005083084106, + "learning_rate": 3.414522906317691e-07, + "loss": 0.0551, + "step": 15493 + }, + { + "epoch": 2.5103694102397927, + "grad_norm": 0.8848669528961182, + "learning_rate": 3.412317092697781e-07, + "loss": 0.0637, + "step": 15494 + }, + { + "epoch": 2.5105314322747896, + "grad_norm": 0.9748433232307434, + "learning_rate": 3.4101119396087237e-07, + "loss": 0.0644, + "step": 15495 + }, + { + "epoch": 2.510693454309786, + "grad_norm": 0.8080410957336426, + "learning_rate": 3.407907447117997e-07, + "loss": 0.0569, + "step": 15496 + }, + { + "epoch": 2.510855476344783, + "grad_norm": 0.9153392314910889, + "learning_rate": 3.405703615293052e-07, + "loss": 0.0655, + "step": 15497 + }, + { + "epoch": 2.5110174983797795, + "grad_norm": 0.8493557572364807, + "learning_rate": 3.4035004442013157e-07, + "loss": 0.0581, + "step": 15498 + }, + { + "epoch": 2.5111795204147764, + "grad_norm": 0.969572126865387, + "learning_rate": 3.4012979339102054e-07, + "loss": 0.0614, + "step": 15499 + }, + { + "epoch": 2.511341542449773, + "grad_norm": 1.0106775760650635, + "learning_rate": 3.399096084487108e-07, + "loss": 0.0603, + "step": 15500 + }, + { + "epoch": 2.51150356448477, + "grad_norm": 0.7856919765472412, + "learning_rate": 3.3968948959994004e-07, + "loss": 0.0533, + "step": 15501 + }, + { + "epoch": 2.511665586519767, + "grad_norm": 0.8515869975090027, + "learning_rate": 3.394694368514434e-07, + "loss": 0.0634, + "step": 15502 + }, + { + "epoch": 2.5118276085547633, + "grad_norm": 0.7295697927474976, + "learning_rate": 3.3924945020995277e-07, + "loss": 0.0557, + "step": 15503 + }, + { + "epoch": 2.51198963058976, + "grad_norm": 0.8382651805877686, + "learning_rate": 3.390295296822002e-07, + "loss": 0.055, + "step": 15504 + }, + { + "epoch": 2.512151652624757, + "grad_norm": 0.8761834502220154, + "learning_rate": 3.388096752749154e-07, + "loss": 0.0607, + "step": 15505 + }, + { + "epoch": 2.5123136746597536, + "grad_norm": 0.9966699481010437, + "learning_rate": 3.3858988699482397e-07, + "loss": 0.0619, + "step": 15506 + }, + { + "epoch": 2.5124756966947506, + "grad_norm": 0.8567604422569275, + "learning_rate": 3.3837016484865146e-07, + "loss": 0.0573, + "step": 15507 + }, + { + "epoch": 2.5126377187297475, + "grad_norm": 1.0470939874649048, + "learning_rate": 3.381505088431203e-07, + "loss": 0.0621, + "step": 15508 + }, + { + "epoch": 2.512799740764744, + "grad_norm": 0.8962857127189636, + "learning_rate": 3.3793091898495305e-07, + "loss": 0.0627, + "step": 15509 + }, + { + "epoch": 2.512961762799741, + "grad_norm": 1.0544072389602661, + "learning_rate": 3.377113952808669e-07, + "loss": 0.0638, + "step": 15510 + }, + { + "epoch": 2.5131237848347374, + "grad_norm": 0.7999473214149475, + "learning_rate": 3.374919377375796e-07, + "loss": 0.062, + "step": 15511 + }, + { + "epoch": 2.5132858068697344, + "grad_norm": 0.9653675556182861, + "learning_rate": 3.3727254636180597e-07, + "loss": 0.0689, + "step": 15512 + }, + { + "epoch": 2.513447828904731, + "grad_norm": 0.9286097288131714, + "learning_rate": 3.370532211602587e-07, + "loss": 0.0689, + "step": 15513 + }, + { + "epoch": 2.5136098509397278, + "grad_norm": 0.9723500609397888, + "learning_rate": 3.3683396213964826e-07, + "loss": 0.0592, + "step": 15514 + }, + { + "epoch": 2.5137718729747247, + "grad_norm": 0.8343885540962219, + "learning_rate": 3.3661476930668404e-07, + "loss": 0.0516, + "step": 15515 + }, + { + "epoch": 2.513933895009721, + "grad_norm": 0.8575674295425415, + "learning_rate": 3.363956426680728e-07, + "loss": 0.0581, + "step": 15516 + }, + { + "epoch": 2.514095917044718, + "grad_norm": 0.7917022109031677, + "learning_rate": 3.3617658223051935e-07, + "loss": 0.058, + "step": 15517 + }, + { + "epoch": 2.514257939079715, + "grad_norm": 0.8156291246414185, + "learning_rate": 3.3595758800072515e-07, + "loss": 0.0549, + "step": 15518 + }, + { + "epoch": 2.5144199611147116, + "grad_norm": 0.7907753586769104, + "learning_rate": 3.3573865998539236e-07, + "loss": 0.056, + "step": 15519 + }, + { + "epoch": 2.5145819831497085, + "grad_norm": 0.9458636045455933, + "learning_rate": 3.355197981912198e-07, + "loss": 0.0628, + "step": 15520 + }, + { + "epoch": 2.514744005184705, + "grad_norm": 0.9401422739028931, + "learning_rate": 3.3530100262490287e-07, + "loss": 0.0609, + "step": 15521 + }, + { + "epoch": 2.514906027219702, + "grad_norm": 0.8505311608314514, + "learning_rate": 3.350822732931361e-07, + "loss": 0.0601, + "step": 15522 + }, + { + "epoch": 2.5150680492546984, + "grad_norm": 0.9184473752975464, + "learning_rate": 3.3486361020261345e-07, + "loss": 0.0595, + "step": 15523 + }, + { + "epoch": 2.5152300712896953, + "grad_norm": 0.870324432849884, + "learning_rate": 3.3464501336002544e-07, + "loss": 0.0557, + "step": 15524 + }, + { + "epoch": 2.5153920933246923, + "grad_norm": 0.8141091465950012, + "learning_rate": 3.344264827720592e-07, + "loss": 0.0593, + "step": 15525 + }, + { + "epoch": 2.5155541153596888, + "grad_norm": 0.8419532775878906, + "learning_rate": 3.342080184454022e-07, + "loss": 0.0568, + "step": 15526 + }, + { + "epoch": 2.5157161373946857, + "grad_norm": 0.961286723613739, + "learning_rate": 3.339896203867385e-07, + "loss": 0.0676, + "step": 15527 + }, + { + "epoch": 2.5158781594296826, + "grad_norm": 0.8865793347358704, + "learning_rate": 3.337712886027511e-07, + "loss": 0.0601, + "step": 15528 + }, + { + "epoch": 2.516040181464679, + "grad_norm": 0.8175432682037354, + "learning_rate": 3.3355302310011996e-07, + "loss": 0.0587, + "step": 15529 + }, + { + "epoch": 2.516202203499676, + "grad_norm": 0.8112668991088867, + "learning_rate": 3.3333482388552356e-07, + "loss": 0.0549, + "step": 15530 + }, + { + "epoch": 2.516364225534673, + "grad_norm": 0.8561041951179504, + "learning_rate": 3.3311669096563886e-07, + "loss": 0.0598, + "step": 15531 + }, + { + "epoch": 2.5165262475696695, + "grad_norm": 0.9206536412239075, + "learning_rate": 3.3289862434713857e-07, + "loss": 0.0605, + "step": 15532 + }, + { + "epoch": 2.516688269604666, + "grad_norm": 1.009718418121338, + "learning_rate": 3.3268062403669646e-07, + "loss": 0.0722, + "step": 15533 + }, + { + "epoch": 2.516850291639663, + "grad_norm": 0.945552408695221, + "learning_rate": 3.3246269004098275e-07, + "loss": 0.0655, + "step": 15534 + }, + { + "epoch": 2.51701231367466, + "grad_norm": 0.9071747660636902, + "learning_rate": 3.322448223666658e-07, + "loss": 0.059, + "step": 15535 + }, + { + "epoch": 2.5171743357096563, + "grad_norm": 0.9418054223060608, + "learning_rate": 3.320270210204107e-07, + "loss": 0.0657, + "step": 15536 + }, + { + "epoch": 2.5173363577446533, + "grad_norm": 0.7941263914108276, + "learning_rate": 3.31809286008882e-07, + "loss": 0.0506, + "step": 15537 + }, + { + "epoch": 2.51749837977965, + "grad_norm": 0.8504292964935303, + "learning_rate": 3.3159161733874347e-07, + "loss": 0.0592, + "step": 15538 + }, + { + "epoch": 2.5176604018146467, + "grad_norm": 0.9112208485603333, + "learning_rate": 3.3137401501665334e-07, + "loss": 0.061, + "step": 15539 + }, + { + "epoch": 2.5178224238496436, + "grad_norm": 0.8395227789878845, + "learning_rate": 3.311564790492702e-07, + "loss": 0.0598, + "step": 15540 + }, + { + "epoch": 2.5179844458846405, + "grad_norm": 1.0928508043289185, + "learning_rate": 3.3093900944325046e-07, + "loss": 0.0652, + "step": 15541 + }, + { + "epoch": 2.518146467919637, + "grad_norm": 0.8456630110740662, + "learning_rate": 3.307216062052479e-07, + "loss": 0.0569, + "step": 15542 + }, + { + "epoch": 2.518308489954634, + "grad_norm": 0.9061066508293152, + "learning_rate": 3.305042693419147e-07, + "loss": 0.0579, + "step": 15543 + }, + { + "epoch": 2.5184705119896305, + "grad_norm": 0.9693790674209595, + "learning_rate": 3.3028699885990085e-07, + "loss": 0.0609, + "step": 15544 + }, + { + "epoch": 2.5186325340246274, + "grad_norm": 0.8443101644515991, + "learning_rate": 3.300697947658543e-07, + "loss": 0.0571, + "step": 15545 + }, + { + "epoch": 2.518794556059624, + "grad_norm": 0.828754723072052, + "learning_rate": 3.298526570664207e-07, + "loss": 0.0597, + "step": 15546 + }, + { + "epoch": 2.518956578094621, + "grad_norm": 0.961614727973938, + "learning_rate": 3.296355857682443e-07, + "loss": 0.0658, + "step": 15547 + }, + { + "epoch": 2.5191186001296177, + "grad_norm": 1.3140058517456055, + "learning_rate": 3.294185808779665e-07, + "loss": 0.0645, + "step": 15548 + }, + { + "epoch": 2.5192806221646142, + "grad_norm": 0.8643244504928589, + "learning_rate": 3.292016424022276e-07, + "loss": 0.0578, + "step": 15549 + }, + { + "epoch": 2.519442644199611, + "grad_norm": 0.9640627503395081, + "learning_rate": 3.289847703476659e-07, + "loss": 0.0605, + "step": 15550 + }, + { + "epoch": 2.519604666234608, + "grad_norm": 1.0605357885360718, + "learning_rate": 3.287679647209152e-07, + "loss": 0.0665, + "step": 15551 + }, + { + "epoch": 2.5197666882696046, + "grad_norm": 0.8427574038505554, + "learning_rate": 3.285512255286111e-07, + "loss": 0.0604, + "step": 15552 + }, + { + "epoch": 2.5199287103046015, + "grad_norm": 0.8638863563537598, + "learning_rate": 3.283345527773854e-07, + "loss": 0.0562, + "step": 15553 + }, + { + "epoch": 2.5200907323395985, + "grad_norm": 1.155843734741211, + "learning_rate": 3.2811794647386625e-07, + "loss": 0.0649, + "step": 15554 + }, + { + "epoch": 2.520252754374595, + "grad_norm": 0.9180019497871399, + "learning_rate": 3.279014066246822e-07, + "loss": 0.0602, + "step": 15555 + }, + { + "epoch": 2.5204147764095914, + "grad_norm": 0.9564993977546692, + "learning_rate": 3.276849332364587e-07, + "loss": 0.0619, + "step": 15556 + }, + { + "epoch": 2.5205767984445884, + "grad_norm": 0.8472095727920532, + "learning_rate": 3.2746852631581947e-07, + "loss": 0.0545, + "step": 15557 + }, + { + "epoch": 2.5207388204795853, + "grad_norm": 0.9872234463691711, + "learning_rate": 3.2725218586938584e-07, + "loss": 0.0583, + "step": 15558 + }, + { + "epoch": 2.520900842514582, + "grad_norm": 0.9279904961585999, + "learning_rate": 3.270359119037772e-07, + "loss": 0.0557, + "step": 15559 + }, + { + "epoch": 2.5210628645495787, + "grad_norm": 0.9269344806671143, + "learning_rate": 3.2681970442561134e-07, + "loss": 0.06, + "step": 15560 + }, + { + "epoch": 2.5212248865845757, + "grad_norm": 0.9857649803161621, + "learning_rate": 3.266035634415035e-07, + "loss": 0.0634, + "step": 15561 + }, + { + "epoch": 2.521386908619572, + "grad_norm": 0.7839047312736511, + "learning_rate": 3.2638748895806705e-07, + "loss": 0.0513, + "step": 15562 + }, + { + "epoch": 2.521548930654569, + "grad_norm": 0.9197927713394165, + "learning_rate": 3.261714809819136e-07, + "loss": 0.0632, + "step": 15563 + }, + { + "epoch": 2.521710952689566, + "grad_norm": 0.8434021472930908, + "learning_rate": 3.259555395196526e-07, + "loss": 0.0637, + "step": 15564 + }, + { + "epoch": 2.5218729747245625, + "grad_norm": 0.9010359048843384, + "learning_rate": 3.2573966457789014e-07, + "loss": 0.0669, + "step": 15565 + }, + { + "epoch": 2.5220349967595594, + "grad_norm": 1.0379987955093384, + "learning_rate": 3.255238561632326e-07, + "loss": 0.0633, + "step": 15566 + }, + { + "epoch": 2.522197018794556, + "grad_norm": 0.9836294651031494, + "learning_rate": 3.253081142822831e-07, + "loss": 0.0634, + "step": 15567 + }, + { + "epoch": 2.522359040829553, + "grad_norm": 1.0170096158981323, + "learning_rate": 3.250924389416432e-07, + "loss": 0.062, + "step": 15568 + }, + { + "epoch": 2.5225210628645494, + "grad_norm": 1.039481282234192, + "learning_rate": 3.2487683014791077e-07, + "loss": 0.0666, + "step": 15569 + }, + { + "epoch": 2.5226830848995463, + "grad_norm": 0.9465806484222412, + "learning_rate": 3.2466128790768327e-07, + "loss": 0.0606, + "step": 15570 + }, + { + "epoch": 2.5228451069345432, + "grad_norm": 1.00591242313385, + "learning_rate": 3.2444581222755733e-07, + "loss": 0.065, + "step": 15571 + }, + { + "epoch": 2.5230071289695397, + "grad_norm": 0.8654181957244873, + "learning_rate": 3.2423040311412384e-07, + "loss": 0.0602, + "step": 15572 + }, + { + "epoch": 2.5231691510045366, + "grad_norm": 0.86398845911026, + "learning_rate": 3.2401506057397503e-07, + "loss": 0.0563, + "step": 15573 + }, + { + "epoch": 2.5233311730395336, + "grad_norm": 0.8598949313163757, + "learning_rate": 3.2379978461369976e-07, + "loss": 0.0625, + "step": 15574 + }, + { + "epoch": 2.52349319507453, + "grad_norm": 0.9612644910812378, + "learning_rate": 3.2358457523988446e-07, + "loss": 0.0576, + "step": 15575 + }, + { + "epoch": 2.523655217109527, + "grad_norm": 0.9757301211357117, + "learning_rate": 3.233694324591144e-07, + "loss": 0.0639, + "step": 15576 + }, + { + "epoch": 2.5238172391445235, + "grad_norm": 1.0324627161026, + "learning_rate": 3.2315435627797266e-07, + "loss": 0.0676, + "step": 15577 + }, + { + "epoch": 2.5239792611795204, + "grad_norm": 0.9274498224258423, + "learning_rate": 3.229393467030395e-07, + "loss": 0.0603, + "step": 15578 + }, + { + "epoch": 2.524141283214517, + "grad_norm": 0.9407957196235657, + "learning_rate": 3.2272440374089443e-07, + "loss": 0.0647, + "step": 15579 + }, + { + "epoch": 2.524303305249514, + "grad_norm": 0.7908738255500793, + "learning_rate": 3.225095273981127e-07, + "loss": 0.0517, + "step": 15580 + }, + { + "epoch": 2.524465327284511, + "grad_norm": 0.8206102848052979, + "learning_rate": 3.2229471768127047e-07, + "loss": 0.0551, + "step": 15581 + }, + { + "epoch": 2.5246273493195073, + "grad_norm": 0.9348705410957336, + "learning_rate": 3.2207997459694053e-07, + "loss": 0.065, + "step": 15582 + }, + { + "epoch": 2.524789371354504, + "grad_norm": 0.9329501986503601, + "learning_rate": 3.218652981516923e-07, + "loss": 0.0654, + "step": 15583 + }, + { + "epoch": 2.524951393389501, + "grad_norm": 0.928276002407074, + "learning_rate": 3.2165068835209506e-07, + "loss": 0.0653, + "step": 15584 + }, + { + "epoch": 2.5251134154244976, + "grad_norm": 0.9858642220497131, + "learning_rate": 3.214361452047149e-07, + "loss": 0.0642, + "step": 15585 + }, + { + "epoch": 2.5252754374594946, + "grad_norm": 1.175571322441101, + "learning_rate": 3.2122166871611736e-07, + "loss": 0.0692, + "step": 15586 + }, + { + "epoch": 2.5254374594944915, + "grad_norm": 0.8295290470123291, + "learning_rate": 3.2100725889286393e-07, + "loss": 0.0581, + "step": 15587 + }, + { + "epoch": 2.525599481529488, + "grad_norm": 0.8364087343215942, + "learning_rate": 3.207929157415152e-07, + "loss": 0.0551, + "step": 15588 + }, + { + "epoch": 2.525761503564485, + "grad_norm": 0.9872838854789734, + "learning_rate": 3.205786392686297e-07, + "loss": 0.0602, + "step": 15589 + }, + { + "epoch": 2.5259235255994814, + "grad_norm": 0.8154653310775757, + "learning_rate": 3.2036442948076395e-07, + "loss": 0.0598, + "step": 15590 + }, + { + "epoch": 2.5260855476344783, + "grad_norm": 0.900958776473999, + "learning_rate": 3.201502863844716e-07, + "loss": 0.0608, + "step": 15591 + }, + { + "epoch": 2.526247569669475, + "grad_norm": 0.9498975276947021, + "learning_rate": 3.199362099863057e-07, + "loss": 0.0633, + "step": 15592 + }, + { + "epoch": 2.5264095917044718, + "grad_norm": 0.947784423828125, + "learning_rate": 3.1972220029281605e-07, + "loss": 0.0554, + "step": 15593 + }, + { + "epoch": 2.5265716137394687, + "grad_norm": 0.9299330115318298, + "learning_rate": 3.19508257310551e-07, + "loss": 0.0585, + "step": 15594 + }, + { + "epoch": 2.526733635774465, + "grad_norm": 0.9528511166572571, + "learning_rate": 3.1929438104605636e-07, + "loss": 0.06, + "step": 15595 + }, + { + "epoch": 2.526895657809462, + "grad_norm": 0.8668416142463684, + "learning_rate": 3.190805715058765e-07, + "loss": 0.0589, + "step": 15596 + }, + { + "epoch": 2.527057679844459, + "grad_norm": 0.8737120032310486, + "learning_rate": 3.1886682869655403e-07, + "loss": 0.0571, + "step": 15597 + }, + { + "epoch": 2.5272197018794555, + "grad_norm": 0.9954075813293457, + "learning_rate": 3.1865315262462783e-07, + "loss": 0.0658, + "step": 15598 + }, + { + "epoch": 2.5273817239144525, + "grad_norm": 0.8705222010612488, + "learning_rate": 3.1843954329663583e-07, + "loss": 0.0588, + "step": 15599 + }, + { + "epoch": 2.527543745949449, + "grad_norm": 0.8527966737747192, + "learning_rate": 3.182260007191157e-07, + "loss": 0.059, + "step": 15600 + }, + { + "epoch": 2.527705767984446, + "grad_norm": 1.4021953344345093, + "learning_rate": 3.1801252489859933e-07, + "loss": 0.0547, + "step": 15601 + }, + { + "epoch": 2.5278677900194424, + "grad_norm": 0.9866907596588135, + "learning_rate": 3.1779911584161963e-07, + "loss": 0.0631, + "step": 15602 + }, + { + "epoch": 2.5280298120544393, + "grad_norm": 0.8450327515602112, + "learning_rate": 3.175857735547061e-07, + "loss": 0.06, + "step": 15603 + }, + { + "epoch": 2.5281918340894363, + "grad_norm": 0.7806409001350403, + "learning_rate": 3.173724980443868e-07, + "loss": 0.0496, + "step": 15604 + }, + { + "epoch": 2.5283538561244328, + "grad_norm": 0.8109617233276367, + "learning_rate": 3.171592893171868e-07, + "loss": 0.0517, + "step": 15605 + }, + { + "epoch": 2.5285158781594297, + "grad_norm": 0.9940459132194519, + "learning_rate": 3.1694614737963036e-07, + "loss": 0.0662, + "step": 15606 + }, + { + "epoch": 2.5286779001944266, + "grad_norm": 0.9137442111968994, + "learning_rate": 3.167330722382389e-07, + "loss": 0.0588, + "step": 15607 + }, + { + "epoch": 2.528839922229423, + "grad_norm": 0.7500993013381958, + "learning_rate": 3.165200638995328e-07, + "loss": 0.0513, + "step": 15608 + }, + { + "epoch": 2.52900194426442, + "grad_norm": 0.8935854434967041, + "learning_rate": 3.163071223700273e-07, + "loss": 0.0569, + "step": 15609 + }, + { + "epoch": 2.529163966299417, + "grad_norm": 0.8556219935417175, + "learning_rate": 3.160942476562404e-07, + "loss": 0.0588, + "step": 15610 + }, + { + "epoch": 2.5293259883344135, + "grad_norm": 1.0256495475769043, + "learning_rate": 3.158814397646842e-07, + "loss": 0.0636, + "step": 15611 + }, + { + "epoch": 2.5294880103694104, + "grad_norm": 0.8018587827682495, + "learning_rate": 3.1566869870187115e-07, + "loss": 0.0522, + "step": 15612 + }, + { + "epoch": 2.529650032404407, + "grad_norm": 0.794672966003418, + "learning_rate": 3.1545602447430904e-07, + "loss": 0.0521, + "step": 15613 + }, + { + "epoch": 2.529812054439404, + "grad_norm": 0.8398905992507935, + "learning_rate": 3.1524341708850633e-07, + "loss": 0.0643, + "step": 15614 + }, + { + "epoch": 2.5299740764744003, + "grad_norm": 0.9044195413589478, + "learning_rate": 3.150308765509688e-07, + "loss": 0.0608, + "step": 15615 + }, + { + "epoch": 2.5301360985093972, + "grad_norm": 0.9135866761207581, + "learning_rate": 3.148184028681983e-07, + "loss": 0.0634, + "step": 15616 + }, + { + "epoch": 2.530298120544394, + "grad_norm": 0.8611244559288025, + "learning_rate": 3.1460599604669684e-07, + "loss": 0.0589, + "step": 15617 + }, + { + "epoch": 2.5304601425793907, + "grad_norm": 0.9294557571411133, + "learning_rate": 3.1439365609296253e-07, + "loss": 0.0573, + "step": 15618 + }, + { + "epoch": 2.5306221646143876, + "grad_norm": 0.9217930436134338, + "learning_rate": 3.141813830134943e-07, + "loss": 0.0631, + "step": 15619 + }, + { + "epoch": 2.5307841866493845, + "grad_norm": 0.8782767653465271, + "learning_rate": 3.1396917681478595e-07, + "loss": 0.0633, + "step": 15620 + }, + { + "epoch": 2.530946208684381, + "grad_norm": 0.9176865220069885, + "learning_rate": 3.137570375033305e-07, + "loss": 0.0532, + "step": 15621 + }, + { + "epoch": 2.531108230719378, + "grad_norm": 0.9333579540252686, + "learning_rate": 3.13544965085619e-07, + "loss": 0.0611, + "step": 15622 + }, + { + "epoch": 2.5312702527543745, + "grad_norm": 0.8954581618309021, + "learning_rate": 3.133329595681406e-07, + "loss": 0.0563, + "step": 15623 + }, + { + "epoch": 2.5314322747893714, + "grad_norm": 0.8141576051712036, + "learning_rate": 3.1312102095738205e-07, + "loss": 0.0513, + "step": 15624 + }, + { + "epoch": 2.531594296824368, + "grad_norm": 0.9502742886543274, + "learning_rate": 3.1290914925982794e-07, + "loss": 0.0651, + "step": 15625 + }, + { + "epoch": 2.531756318859365, + "grad_norm": 0.951263964176178, + "learning_rate": 3.12697344481962e-07, + "loss": 0.0565, + "step": 15626 + }, + { + "epoch": 2.5319183408943617, + "grad_norm": 0.9662578701972961, + "learning_rate": 3.124856066302634e-07, + "loss": 0.0623, + "step": 15627 + }, + { + "epoch": 2.5320803629293582, + "grad_norm": 1.0653865337371826, + "learning_rate": 3.1227393571121117e-07, + "loss": 0.0634, + "step": 15628 + }, + { + "epoch": 2.532242384964355, + "grad_norm": 0.9926478266716003, + "learning_rate": 3.1206233173128265e-07, + "loss": 0.0583, + "step": 15629 + }, + { + "epoch": 2.532404406999352, + "grad_norm": 0.9750377535820007, + "learning_rate": 3.1185079469695263e-07, + "loss": 0.0629, + "step": 15630 + }, + { + "epoch": 2.5325664290343486, + "grad_norm": 0.9622008800506592, + "learning_rate": 3.116393246146926e-07, + "loss": 0.0643, + "step": 15631 + }, + { + "epoch": 2.5327284510693455, + "grad_norm": 1.4896754026412964, + "learning_rate": 3.1142792149097297e-07, + "loss": 0.0577, + "step": 15632 + }, + { + "epoch": 2.5328904731043425, + "grad_norm": 1.024353265762329, + "learning_rate": 3.1121658533226376e-07, + "loss": 0.0706, + "step": 15633 + }, + { + "epoch": 2.533052495139339, + "grad_norm": 0.9080899953842163, + "learning_rate": 3.110053161450299e-07, + "loss": 0.0589, + "step": 15634 + }, + { + "epoch": 2.5332145171743354, + "grad_norm": 0.8674499988555908, + "learning_rate": 3.1079411393573597e-07, + "loss": 0.0575, + "step": 15635 + }, + { + "epoch": 2.5333765392093324, + "grad_norm": 0.9596990346908569, + "learning_rate": 3.105829787108444e-07, + "loss": 0.0649, + "step": 15636 + }, + { + "epoch": 2.5335385612443293, + "grad_norm": 0.9714479446411133, + "learning_rate": 3.103719104768155e-07, + "loss": 0.0638, + "step": 15637 + }, + { + "epoch": 2.533700583279326, + "grad_norm": 0.8182390332221985, + "learning_rate": 3.10160909240107e-07, + "loss": 0.0556, + "step": 15638 + }, + { + "epoch": 2.5338626053143227, + "grad_norm": 0.8322840332984924, + "learning_rate": 3.0994997500717575e-07, + "loss": 0.0517, + "step": 15639 + }, + { + "epoch": 2.5340246273493197, + "grad_norm": 0.9775702357292175, + "learning_rate": 3.0973910778447523e-07, + "loss": 0.0633, + "step": 15640 + }, + { + "epoch": 2.534186649384316, + "grad_norm": 0.8347146511077881, + "learning_rate": 3.0952830757845833e-07, + "loss": 0.0552, + "step": 15641 + }, + { + "epoch": 2.534348671419313, + "grad_norm": 0.7945565581321716, + "learning_rate": 3.0931757439557313e-07, + "loss": 0.0533, + "step": 15642 + }, + { + "epoch": 2.53451069345431, + "grad_norm": 0.970628023147583, + "learning_rate": 3.091069082422696e-07, + "loss": 0.0589, + "step": 15643 + }, + { + "epoch": 2.5346727154893065, + "grad_norm": 0.7898392677307129, + "learning_rate": 3.08896309124993e-07, + "loss": 0.0513, + "step": 15644 + }, + { + "epoch": 2.5348347375243034, + "grad_norm": 0.951751172542572, + "learning_rate": 3.086857770501867e-07, + "loss": 0.0617, + "step": 15645 + }, + { + "epoch": 2.5349967595593, + "grad_norm": 0.9525232911109924, + "learning_rate": 3.084753120242928e-07, + "loss": 0.0635, + "step": 15646 + }, + { + "epoch": 2.535158781594297, + "grad_norm": 0.8588886857032776, + "learning_rate": 3.0826491405375024e-07, + "loss": 0.0586, + "step": 15647 + }, + { + "epoch": 2.5353208036292934, + "grad_norm": 0.9159846901893616, + "learning_rate": 3.0805458314499855e-07, + "loss": 0.063, + "step": 15648 + }, + { + "epoch": 2.5354828256642903, + "grad_norm": 0.870168149471283, + "learning_rate": 3.078443193044717e-07, + "loss": 0.0574, + "step": 15649 + }, + { + "epoch": 2.535644847699287, + "grad_norm": 0.8437957763671875, + "learning_rate": 3.076341225386037e-07, + "loss": 0.0585, + "step": 15650 + }, + { + "epoch": 2.5358068697342837, + "grad_norm": 0.9445669054985046, + "learning_rate": 3.074239928538261e-07, + "loss": 0.0694, + "step": 15651 + }, + { + "epoch": 2.5359688917692806, + "grad_norm": 0.81999272108078, + "learning_rate": 3.0721393025656853e-07, + "loss": 0.0524, + "step": 15652 + }, + { + "epoch": 2.5361309138042776, + "grad_norm": 1.2289856672286987, + "learning_rate": 3.070039347532583e-07, + "loss": 0.0617, + "step": 15653 + }, + { + "epoch": 2.536292935839274, + "grad_norm": 1.0222786664962769, + "learning_rate": 3.0679400635032053e-07, + "loss": 0.0564, + "step": 15654 + }, + { + "epoch": 2.536454957874271, + "grad_norm": 0.8596547842025757, + "learning_rate": 3.065841450541787e-07, + "loss": 0.0636, + "step": 15655 + }, + { + "epoch": 2.536616979909268, + "grad_norm": 1.1208213567733765, + "learning_rate": 3.063743508712544e-07, + "loss": 0.0638, + "step": 15656 + }, + { + "epoch": 2.5367790019442644, + "grad_norm": 0.8171042799949646, + "learning_rate": 3.06164623807966e-07, + "loss": 0.058, + "step": 15657 + }, + { + "epoch": 2.536941023979261, + "grad_norm": 0.8748427033424377, + "learning_rate": 3.059549638707315e-07, + "loss": 0.0573, + "step": 15658 + }, + { + "epoch": 2.537103046014258, + "grad_norm": 1.00801420211792, + "learning_rate": 3.0574537106596606e-07, + "loss": 0.0647, + "step": 15659 + }, + { + "epoch": 2.537265068049255, + "grad_norm": 0.9508548974990845, + "learning_rate": 3.0553584540008176e-07, + "loss": 0.0621, + "step": 15660 + }, + { + "epoch": 2.5374270900842513, + "grad_norm": 0.862837553024292, + "learning_rate": 3.053263868794895e-07, + "loss": 0.0647, + "step": 15661 + }, + { + "epoch": 2.537589112119248, + "grad_norm": 0.9013253450393677, + "learning_rate": 3.0511699551059927e-07, + "loss": 0.0671, + "step": 15662 + }, + { + "epoch": 2.537751134154245, + "grad_norm": 1.0130845308303833, + "learning_rate": 3.049076712998181e-07, + "loss": 0.0585, + "step": 15663 + }, + { + "epoch": 2.5379131561892416, + "grad_norm": 1.0517815351486206, + "learning_rate": 3.0469841425354945e-07, + "loss": 0.0628, + "step": 15664 + }, + { + "epoch": 2.5380751782242386, + "grad_norm": 0.8955190777778625, + "learning_rate": 3.044892243781969e-07, + "loss": 0.0606, + "step": 15665 + }, + { + "epoch": 2.5382372002592355, + "grad_norm": 0.9051300287246704, + "learning_rate": 3.0428010168016107e-07, + "loss": 0.0605, + "step": 15666 + }, + { + "epoch": 2.538399222294232, + "grad_norm": 0.8366383910179138, + "learning_rate": 3.040710461658408e-07, + "loss": 0.0598, + "step": 15667 + }, + { + "epoch": 2.538561244329229, + "grad_norm": 0.8393315672874451, + "learning_rate": 3.0386205784163207e-07, + "loss": 0.0633, + "step": 15668 + }, + { + "epoch": 2.5387232663642254, + "grad_norm": 1.1416642665863037, + "learning_rate": 3.0365313671393e-07, + "loss": 0.0619, + "step": 15669 + }, + { + "epoch": 2.5388852883992223, + "grad_norm": 0.8364261984825134, + "learning_rate": 3.0344428278912765e-07, + "loss": 0.0606, + "step": 15670 + }, + { + "epoch": 2.539047310434219, + "grad_norm": 0.9973102807998657, + "learning_rate": 3.0323549607361333e-07, + "loss": 0.0586, + "step": 15671 + }, + { + "epoch": 2.5392093324692158, + "grad_norm": 0.8072327971458435, + "learning_rate": 3.030267765737774e-07, + "loss": 0.0571, + "step": 15672 + }, + { + "epoch": 2.5393713545042127, + "grad_norm": 1.0454763174057007, + "learning_rate": 3.0281812429600544e-07, + "loss": 0.0569, + "step": 15673 + }, + { + "epoch": 2.539533376539209, + "grad_norm": 1.0612105131149292, + "learning_rate": 3.026095392466824e-07, + "loss": 0.0655, + "step": 15674 + }, + { + "epoch": 2.539695398574206, + "grad_norm": 1.0224926471710205, + "learning_rate": 3.024010214321893e-07, + "loss": 0.0707, + "step": 15675 + }, + { + "epoch": 2.539857420609203, + "grad_norm": 0.8485469222068787, + "learning_rate": 3.021925708589066e-07, + "loss": 0.0567, + "step": 15676 + }, + { + "epoch": 2.5400194426441995, + "grad_norm": 1.0038886070251465, + "learning_rate": 3.019841875332133e-07, + "loss": 0.0614, + "step": 15677 + }, + { + "epoch": 2.5401814646791965, + "grad_norm": 0.8578863739967346, + "learning_rate": 3.0177587146148435e-07, + "loss": 0.061, + "step": 15678 + }, + { + "epoch": 2.540343486714193, + "grad_norm": 0.7758865356445312, + "learning_rate": 3.0156762265009437e-07, + "loss": 0.0567, + "step": 15679 + }, + { + "epoch": 2.54050550874919, + "grad_norm": 1.1475505828857422, + "learning_rate": 3.013594411054144e-07, + "loss": 0.0682, + "step": 15680 + }, + { + "epoch": 2.5406675307841864, + "grad_norm": 0.9241853952407837, + "learning_rate": 3.0115132683381565e-07, + "loss": 0.0679, + "step": 15681 + }, + { + "epoch": 2.5408295528191833, + "grad_norm": 0.9604292511940002, + "learning_rate": 3.0094327984166506e-07, + "loss": 0.0564, + "step": 15682 + }, + { + "epoch": 2.5409915748541803, + "grad_norm": 0.8330355882644653, + "learning_rate": 3.007353001353283e-07, + "loss": 0.0574, + "step": 15683 + }, + { + "epoch": 2.5411535968891767, + "grad_norm": 0.9024338126182556, + "learning_rate": 3.0052738772116925e-07, + "loss": 0.069, + "step": 15684 + }, + { + "epoch": 2.5413156189241737, + "grad_norm": 0.9222663640975952, + "learning_rate": 3.003195426055497e-07, + "loss": 0.0632, + "step": 15685 + }, + { + "epoch": 2.5414776409591706, + "grad_norm": 0.9716166257858276, + "learning_rate": 3.001117647948287e-07, + "loss": 0.0651, + "step": 15686 + }, + { + "epoch": 2.541639662994167, + "grad_norm": 0.904063880443573, + "learning_rate": 2.9990405429536433e-07, + "loss": 0.0624, + "step": 15687 + }, + { + "epoch": 2.541801685029164, + "grad_norm": 0.8916031122207642, + "learning_rate": 2.996964111135123e-07, + "loss": 0.0565, + "step": 15688 + }, + { + "epoch": 2.541963707064161, + "grad_norm": 0.8588840961456299, + "learning_rate": 2.9948883525562464e-07, + "loss": 0.0574, + "step": 15689 + }, + { + "epoch": 2.5421257290991575, + "grad_norm": 0.8580512404441833, + "learning_rate": 2.992813267280531e-07, + "loss": 0.0594, + "step": 15690 + }, + { + "epoch": 2.5422877511341544, + "grad_norm": 0.8236280083656311, + "learning_rate": 2.9907388553714806e-07, + "loss": 0.0582, + "step": 15691 + }, + { + "epoch": 2.542449773169151, + "grad_norm": 0.8864600658416748, + "learning_rate": 2.988665116892564e-07, + "loss": 0.0582, + "step": 15692 + }, + { + "epoch": 2.542611795204148, + "grad_norm": 0.934640109539032, + "learning_rate": 2.986592051907222e-07, + "loss": 0.066, + "step": 15693 + }, + { + "epoch": 2.5427738172391443, + "grad_norm": 0.8519915342330933, + "learning_rate": 2.9845196604788935e-07, + "loss": 0.0627, + "step": 15694 + }, + { + "epoch": 2.5429358392741412, + "grad_norm": 0.9214040637016296, + "learning_rate": 2.9824479426709853e-07, + "loss": 0.0572, + "step": 15695 + }, + { + "epoch": 2.543097861309138, + "grad_norm": 1.0581270456314087, + "learning_rate": 2.980376898546888e-07, + "loss": 0.0634, + "step": 15696 + }, + { + "epoch": 2.5432598833441347, + "grad_norm": 1.086905598640442, + "learning_rate": 2.9783065281699747e-07, + "loss": 0.0629, + "step": 15697 + }, + { + "epoch": 2.5434219053791316, + "grad_norm": 1.031447172164917, + "learning_rate": 2.976236831603588e-07, + "loss": 0.0629, + "step": 15698 + }, + { + "epoch": 2.5435839274141285, + "grad_norm": 0.9910357594490051, + "learning_rate": 2.9741678089110576e-07, + "loss": 0.0699, + "step": 15699 + }, + { + "epoch": 2.543745949449125, + "grad_norm": 0.838286817073822, + "learning_rate": 2.972099460155689e-07, + "loss": 0.0555, + "step": 15700 + }, + { + "epoch": 2.543907971484122, + "grad_norm": 0.939771294593811, + "learning_rate": 2.970031785400773e-07, + "loss": 0.0593, + "step": 15701 + }, + { + "epoch": 2.5440699935191184, + "grad_norm": 1.2176207304000854, + "learning_rate": 2.9679647847095735e-07, + "loss": 0.0637, + "step": 15702 + }, + { + "epoch": 2.5442320155541154, + "grad_norm": 0.8864500522613525, + "learning_rate": 2.965898458145339e-07, + "loss": 0.0614, + "step": 15703 + }, + { + "epoch": 2.544394037589112, + "grad_norm": 1.0630028247833252, + "learning_rate": 2.9638328057712775e-07, + "loss": 0.0688, + "step": 15704 + }, + { + "epoch": 2.544556059624109, + "grad_norm": 0.9076703786849976, + "learning_rate": 2.9617678276506136e-07, + "loss": 0.0578, + "step": 15705 + }, + { + "epoch": 2.5447180816591057, + "grad_norm": 0.9207355976104736, + "learning_rate": 2.9597035238465214e-07, + "loss": 0.0546, + "step": 15706 + }, + { + "epoch": 2.5448801036941022, + "grad_norm": 0.9597820043563843, + "learning_rate": 2.9576398944221707e-07, + "loss": 0.0627, + "step": 15707 + }, + { + "epoch": 2.545042125729099, + "grad_norm": 0.8808280229568481, + "learning_rate": 2.9555769394406934e-07, + "loss": 0.0615, + "step": 15708 + }, + { + "epoch": 2.545204147764096, + "grad_norm": 0.9642734527587891, + "learning_rate": 2.9535146589652093e-07, + "loss": 0.0668, + "step": 15709 + }, + { + "epoch": 2.5453661697990926, + "grad_norm": 0.897223711013794, + "learning_rate": 2.9514530530588367e-07, + "loss": 0.0609, + "step": 15710 + }, + { + "epoch": 2.5455281918340895, + "grad_norm": 0.9457272887229919, + "learning_rate": 2.949392121784636e-07, + "loss": 0.0596, + "step": 15711 + }, + { + "epoch": 2.5456902138690864, + "grad_norm": 0.9758214950561523, + "learning_rate": 2.947331865205677e-07, + "loss": 0.0564, + "step": 15712 + }, + { + "epoch": 2.545852235904083, + "grad_norm": 1.0195071697235107, + "learning_rate": 2.9452722833849976e-07, + "loss": 0.0663, + "step": 15713 + }, + { + "epoch": 2.54601425793908, + "grad_norm": 0.8964647054672241, + "learning_rate": 2.943213376385612e-07, + "loss": 0.0587, + "step": 15714 + }, + { + "epoch": 2.5461762799740764, + "grad_norm": 1.0028923749923706, + "learning_rate": 2.9411551442705243e-07, + "loss": 0.0637, + "step": 15715 + }, + { + "epoch": 2.5463383020090733, + "grad_norm": 0.8939936757087708, + "learning_rate": 2.9390975871027046e-07, + "loss": 0.0589, + "step": 15716 + }, + { + "epoch": 2.54650032404407, + "grad_norm": 0.9271273016929626, + "learning_rate": 2.9370407049451126e-07, + "loss": 0.0572, + "step": 15717 + }, + { + "epoch": 2.5466623460790667, + "grad_norm": 0.9877331256866455, + "learning_rate": 2.934984497860691e-07, + "loss": 0.0683, + "step": 15718 + }, + { + "epoch": 2.5468243681140637, + "grad_norm": 0.8610256910324097, + "learning_rate": 2.932928965912335e-07, + "loss": 0.0636, + "step": 15719 + }, + { + "epoch": 2.54698639014906, + "grad_norm": 0.9936156868934631, + "learning_rate": 2.9308741091629596e-07, + "loss": 0.059, + "step": 15720 + }, + { + "epoch": 2.547148412184057, + "grad_norm": 0.9047034382820129, + "learning_rate": 2.928819927675433e-07, + "loss": 0.064, + "step": 15721 + }, + { + "epoch": 2.547310434219054, + "grad_norm": 0.9128992557525635, + "learning_rate": 2.9267664215126e-07, + "loss": 0.0647, + "step": 15722 + }, + { + "epoch": 2.5474724562540505, + "grad_norm": 0.9131163358688354, + "learning_rate": 2.924713590737294e-07, + "loss": 0.0631, + "step": 15723 + }, + { + "epoch": 2.5476344782890474, + "grad_norm": 0.8168431520462036, + "learning_rate": 2.9226614354123356e-07, + "loss": 0.0582, + "step": 15724 + }, + { + "epoch": 2.547796500324044, + "grad_norm": 0.9571110606193542, + "learning_rate": 2.9206099556005145e-07, + "loss": 0.0636, + "step": 15725 + }, + { + "epoch": 2.547958522359041, + "grad_norm": 0.9737008213996887, + "learning_rate": 2.9185591513645947e-07, + "loss": 0.0674, + "step": 15726 + }, + { + "epoch": 2.5481205443940373, + "grad_norm": 0.8390480875968933, + "learning_rate": 2.9165090227673306e-07, + "loss": 0.0563, + "step": 15727 + }, + { + "epoch": 2.5482825664290343, + "grad_norm": 1.0809296369552612, + "learning_rate": 2.914459569871447e-07, + "loss": 0.0703, + "step": 15728 + }, + { + "epoch": 2.548444588464031, + "grad_norm": 0.7961978912353516, + "learning_rate": 2.9124107927396534e-07, + "loss": 0.0548, + "step": 15729 + }, + { + "epoch": 2.5486066104990277, + "grad_norm": 0.9523206949234009, + "learning_rate": 2.91036269143464e-07, + "loss": 0.0619, + "step": 15730 + }, + { + "epoch": 2.5487686325340246, + "grad_norm": 0.756062924861908, + "learning_rate": 2.908315266019074e-07, + "loss": 0.0545, + "step": 15731 + }, + { + "epoch": 2.5489306545690216, + "grad_norm": 0.991388201713562, + "learning_rate": 2.9062685165555963e-07, + "loss": 0.0585, + "step": 15732 + }, + { + "epoch": 2.549092676604018, + "grad_norm": 0.9171768426895142, + "learning_rate": 2.904222443106838e-07, + "loss": 0.0607, + "step": 15733 + }, + { + "epoch": 2.549254698639015, + "grad_norm": 1.0466676950454712, + "learning_rate": 2.9021770457354046e-07, + "loss": 0.0611, + "step": 15734 + }, + { + "epoch": 2.549416720674012, + "grad_norm": 1.0103275775909424, + "learning_rate": 2.9001323245038765e-07, + "loss": 0.0688, + "step": 15735 + }, + { + "epoch": 2.5495787427090084, + "grad_norm": 0.8275580406188965, + "learning_rate": 2.8980882794748227e-07, + "loss": 0.0535, + "step": 15736 + }, + { + "epoch": 2.5497407647440054, + "grad_norm": 0.885045051574707, + "learning_rate": 2.896044910710777e-07, + "loss": 0.0677, + "step": 15737 + }, + { + "epoch": 2.549902786779002, + "grad_norm": 0.9406101703643799, + "learning_rate": 2.894002218274261e-07, + "loss": 0.0581, + "step": 15738 + }, + { + "epoch": 2.5500648088139988, + "grad_norm": 0.9260234236717224, + "learning_rate": 2.8919602022277923e-07, + "loss": 0.0661, + "step": 15739 + }, + { + "epoch": 2.5502268308489953, + "grad_norm": 1.0107648372650146, + "learning_rate": 2.8899188626338363e-07, + "loss": 0.0613, + "step": 15740 + }, + { + "epoch": 2.550388852883992, + "grad_norm": 1.016796350479126, + "learning_rate": 2.8878781995548584e-07, + "loss": 0.0708, + "step": 15741 + }, + { + "epoch": 2.550550874918989, + "grad_norm": 0.8699153661727905, + "learning_rate": 2.8858382130532965e-07, + "loss": 0.0604, + "step": 15742 + }, + { + "epoch": 2.5507128969539856, + "grad_norm": 0.9409704804420471, + "learning_rate": 2.8837989031915674e-07, + "loss": 0.0604, + "step": 15743 + }, + { + "epoch": 2.5508749189889826, + "grad_norm": 0.884575605392456, + "learning_rate": 2.8817602700320747e-07, + "loss": 0.0657, + "step": 15744 + }, + { + "epoch": 2.5510369410239795, + "grad_norm": 0.9905060529708862, + "learning_rate": 2.879722313637193e-07, + "loss": 0.0693, + "step": 15745 + }, + { + "epoch": 2.551198963058976, + "grad_norm": 0.9149320125579834, + "learning_rate": 2.8776850340692777e-07, + "loss": 0.0633, + "step": 15746 + }, + { + "epoch": 2.551360985093973, + "grad_norm": 1.0935709476470947, + "learning_rate": 2.875648431390665e-07, + "loss": 0.0625, + "step": 15747 + }, + { + "epoch": 2.5515230071289694, + "grad_norm": 0.9420832991600037, + "learning_rate": 2.87361250566367e-07, + "loss": 0.0557, + "step": 15748 + }, + { + "epoch": 2.5516850291639663, + "grad_norm": 1.622456431388855, + "learning_rate": 2.871577256950589e-07, + "loss": 0.0592, + "step": 15749 + }, + { + "epoch": 2.551847051198963, + "grad_norm": 0.8848714232444763, + "learning_rate": 2.869542685313692e-07, + "loss": 0.0572, + "step": 15750 + }, + { + "epoch": 2.5520090732339598, + "grad_norm": 1.0072789192199707, + "learning_rate": 2.8675087908152407e-07, + "loss": 0.0661, + "step": 15751 + }, + { + "epoch": 2.5521710952689567, + "grad_norm": 0.8592365384101868, + "learning_rate": 2.865475573517451e-07, + "loss": 0.0597, + "step": 15752 + }, + { + "epoch": 2.552333117303953, + "grad_norm": 0.9718042612075806, + "learning_rate": 2.863443033482549e-07, + "loss": 0.0634, + "step": 15753 + }, + { + "epoch": 2.55249513933895, + "grad_norm": 1.0915923118591309, + "learning_rate": 2.8614111707727267e-07, + "loss": 0.0533, + "step": 15754 + }, + { + "epoch": 2.552657161373947, + "grad_norm": 0.9023035764694214, + "learning_rate": 2.859379985450142e-07, + "loss": 0.0615, + "step": 15755 + }, + { + "epoch": 2.5528191834089435, + "grad_norm": 0.9342001676559448, + "learning_rate": 2.8573494775769485e-07, + "loss": 0.0639, + "step": 15756 + }, + { + "epoch": 2.5529812054439405, + "grad_norm": 1.2013731002807617, + "learning_rate": 2.8553196472152794e-07, + "loss": 0.0641, + "step": 15757 + }, + { + "epoch": 2.5531432274789374, + "grad_norm": 0.8660479784011841, + "learning_rate": 2.853290494427238e-07, + "loss": 0.0601, + "step": 15758 + }, + { + "epoch": 2.553305249513934, + "grad_norm": 0.8665417432785034, + "learning_rate": 2.851262019274917e-07, + "loss": 0.066, + "step": 15759 + }, + { + "epoch": 2.5534672715489304, + "grad_norm": 0.9686926007270813, + "learning_rate": 2.8492342218203766e-07, + "loss": 0.061, + "step": 15760 + }, + { + "epoch": 2.5536292935839273, + "grad_norm": 0.931259274482727, + "learning_rate": 2.847207102125665e-07, + "loss": 0.0667, + "step": 15761 + }, + { + "epoch": 2.5537913156189243, + "grad_norm": 0.8071960210800171, + "learning_rate": 2.845180660252808e-07, + "loss": 0.0659, + "step": 15762 + }, + { + "epoch": 2.5539533376539207, + "grad_norm": 0.8331485986709595, + "learning_rate": 2.843154896263811e-07, + "loss": 0.0538, + "step": 15763 + }, + { + "epoch": 2.5541153596889177, + "grad_norm": 0.8786145448684692, + "learning_rate": 2.8411298102206524e-07, + "loss": 0.0623, + "step": 15764 + }, + { + "epoch": 2.5542773817239146, + "grad_norm": 0.9104515314102173, + "learning_rate": 2.839105402185305e-07, + "loss": 0.065, + "step": 15765 + }, + { + "epoch": 2.554439403758911, + "grad_norm": 0.935090959072113, + "learning_rate": 2.837081672219694e-07, + "loss": 0.0581, + "step": 15766 + }, + { + "epoch": 2.554601425793908, + "grad_norm": 0.8648780584335327, + "learning_rate": 2.835058620385756e-07, + "loss": 0.061, + "step": 15767 + }, + { + "epoch": 2.554763447828905, + "grad_norm": 0.8187403678894043, + "learning_rate": 2.833036246745385e-07, + "loss": 0.0566, + "step": 15768 + }, + { + "epoch": 2.5549254698639015, + "grad_norm": 0.8875253796577454, + "learning_rate": 2.831014551360467e-07, + "loss": 0.0608, + "step": 15769 + }, + { + "epoch": 2.5550874918988984, + "grad_norm": 0.8889281153678894, + "learning_rate": 2.828993534292851e-07, + "loss": 0.0608, + "step": 15770 + }, + { + "epoch": 2.555249513933895, + "grad_norm": 0.9261566400527954, + "learning_rate": 2.8269731956043736e-07, + "loss": 0.0626, + "step": 15771 + }, + { + "epoch": 2.555411535968892, + "grad_norm": 0.8873212337493896, + "learning_rate": 2.824953535356872e-07, + "loss": 0.0524, + "step": 15772 + }, + { + "epoch": 2.5555735580038883, + "grad_norm": 0.919251561164856, + "learning_rate": 2.8229345536121223e-07, + "loss": 0.0607, + "step": 15773 + }, + { + "epoch": 2.5557355800388852, + "grad_norm": 0.8338304162025452, + "learning_rate": 2.820916250431907e-07, + "loss": 0.0569, + "step": 15774 + }, + { + "epoch": 2.555897602073882, + "grad_norm": 0.7551842331886292, + "learning_rate": 2.818898625877983e-07, + "loss": 0.0512, + "step": 15775 + }, + { + "epoch": 2.5560596241088787, + "grad_norm": 0.929349422454834, + "learning_rate": 2.8168816800120845e-07, + "loss": 0.0586, + "step": 15776 + }, + { + "epoch": 2.5562216461438756, + "grad_norm": 0.8814112544059753, + "learning_rate": 2.814865412895926e-07, + "loss": 0.0554, + "step": 15777 + }, + { + "epoch": 2.5563836681788725, + "grad_norm": 0.9311420917510986, + "learning_rate": 2.812849824591196e-07, + "loss": 0.0615, + "step": 15778 + }, + { + "epoch": 2.556545690213869, + "grad_norm": 0.8256794810295105, + "learning_rate": 2.8108349151595713e-07, + "loss": 0.0577, + "step": 15779 + }, + { + "epoch": 2.556707712248866, + "grad_norm": 0.8068723082542419, + "learning_rate": 2.808820684662705e-07, + "loss": 0.0626, + "step": 15780 + }, + { + "epoch": 2.556869734283863, + "grad_norm": 0.75613933801651, + "learning_rate": 2.806807133162215e-07, + "loss": 0.0509, + "step": 15781 + }, + { + "epoch": 2.5570317563188594, + "grad_norm": 1.0102713108062744, + "learning_rate": 2.804794260719726e-07, + "loss": 0.0697, + "step": 15782 + }, + { + "epoch": 2.557193778353856, + "grad_norm": 0.8469998836517334, + "learning_rate": 2.8027820673968256e-07, + "loss": 0.0618, + "step": 15783 + }, + { + "epoch": 2.557355800388853, + "grad_norm": 1.0581094026565552, + "learning_rate": 2.800770553255072e-07, + "loss": 0.0644, + "step": 15784 + }, + { + "epoch": 2.5575178224238497, + "grad_norm": 0.8770332932472229, + "learning_rate": 2.7987597183560175e-07, + "loss": 0.0615, + "step": 15785 + }, + { + "epoch": 2.557679844458846, + "grad_norm": 0.9421771764755249, + "learning_rate": 2.796749562761186e-07, + "loss": 0.0575, + "step": 15786 + }, + { + "epoch": 2.557841866493843, + "grad_norm": 1.0569273233413696, + "learning_rate": 2.7947400865320966e-07, + "loss": 0.0666, + "step": 15787 + }, + { + "epoch": 2.55800388852884, + "grad_norm": 1.0118812322616577, + "learning_rate": 2.7927312897302217e-07, + "loss": 0.0615, + "step": 15788 + }, + { + "epoch": 2.5581659105638366, + "grad_norm": 1.168845772743225, + "learning_rate": 2.790723172417026e-07, + "loss": 0.065, + "step": 15789 + }, + { + "epoch": 2.5583279325988335, + "grad_norm": 0.9244213700294495, + "learning_rate": 2.7887157346539574e-07, + "loss": 0.0632, + "step": 15790 + }, + { + "epoch": 2.5584899546338304, + "grad_norm": 0.9718979001045227, + "learning_rate": 2.7867089765024365e-07, + "loss": 0.0678, + "step": 15791 + }, + { + "epoch": 2.558651976668827, + "grad_norm": 1.047033667564392, + "learning_rate": 2.7847028980238666e-07, + "loss": 0.0602, + "step": 15792 + }, + { + "epoch": 2.558813998703824, + "grad_norm": 1.0640023946762085, + "learning_rate": 2.7826974992796266e-07, + "loss": 0.0594, + "step": 15793 + }, + { + "epoch": 2.5589760207388204, + "grad_norm": 0.8724460005760193, + "learning_rate": 2.780692780331079e-07, + "loss": 0.059, + "step": 15794 + }, + { + "epoch": 2.5591380427738173, + "grad_norm": 0.7924855947494507, + "learning_rate": 2.778688741239563e-07, + "loss": 0.0529, + "step": 15795 + }, + { + "epoch": 2.559300064808814, + "grad_norm": 0.8275933861732483, + "learning_rate": 2.7766853820663963e-07, + "loss": 0.0577, + "step": 15796 + }, + { + "epoch": 2.5594620868438107, + "grad_norm": 1.010675072669983, + "learning_rate": 2.774682702872877e-07, + "loss": 0.0616, + "step": 15797 + }, + { + "epoch": 2.5596241088788076, + "grad_norm": 0.9357376098632812, + "learning_rate": 2.7726807037202903e-07, + "loss": 0.06, + "step": 15798 + }, + { + "epoch": 2.559786130913804, + "grad_norm": 0.8635546565055847, + "learning_rate": 2.7706793846698776e-07, + "loss": 0.0589, + "step": 15799 + }, + { + "epoch": 2.559948152948801, + "grad_norm": 0.8578700423240662, + "learning_rate": 2.7686787457828796e-07, + "loss": 0.0534, + "step": 15800 + }, + { + "epoch": 2.560110174983798, + "grad_norm": 1.073947787284851, + "learning_rate": 2.7666787871205135e-07, + "loss": 0.063, + "step": 15801 + }, + { + "epoch": 2.5602721970187945, + "grad_norm": 1.1108258962631226, + "learning_rate": 2.764679508743981e-07, + "loss": 0.0571, + "step": 15802 + }, + { + "epoch": 2.5604342190537914, + "grad_norm": 0.8796466588973999, + "learning_rate": 2.7626809107144435e-07, + "loss": 0.0547, + "step": 15803 + }, + { + "epoch": 2.560596241088788, + "grad_norm": 1.0041248798370361, + "learning_rate": 2.7606829930930555e-07, + "loss": 0.0646, + "step": 15804 + }, + { + "epoch": 2.560758263123785, + "grad_norm": 0.9209561944007874, + "learning_rate": 2.7586857559409484e-07, + "loss": 0.0549, + "step": 15805 + }, + { + "epoch": 2.5609202851587813, + "grad_norm": 0.8668680787086487, + "learning_rate": 2.7566891993192347e-07, + "loss": 0.0578, + "step": 15806 + }, + { + "epoch": 2.5610823071937783, + "grad_norm": 0.8909144401550293, + "learning_rate": 2.754693323289004e-07, + "loss": 0.0564, + "step": 15807 + }, + { + "epoch": 2.561244329228775, + "grad_norm": 0.8772615790367126, + "learning_rate": 2.752698127911327e-07, + "loss": 0.0613, + "step": 15808 + }, + { + "epoch": 2.5614063512637717, + "grad_norm": 0.8881198167800903, + "learning_rate": 2.750703613247252e-07, + "loss": 0.052, + "step": 15809 + }, + { + "epoch": 2.5615683732987686, + "grad_norm": 0.8577731847763062, + "learning_rate": 2.748709779357794e-07, + "loss": 0.0573, + "step": 15810 + }, + { + "epoch": 2.5617303953337656, + "grad_norm": 0.8215259313583374, + "learning_rate": 2.7467166263039776e-07, + "loss": 0.0586, + "step": 15811 + }, + { + "epoch": 2.561892417368762, + "grad_norm": 0.8955356478691101, + "learning_rate": 2.744724154146777e-07, + "loss": 0.0568, + "step": 15812 + }, + { + "epoch": 2.562054439403759, + "grad_norm": 0.9561681747436523, + "learning_rate": 2.7427323629471677e-07, + "loss": 0.0685, + "step": 15813 + }, + { + "epoch": 2.562216461438756, + "grad_norm": 0.9938089847564697, + "learning_rate": 2.740741252766077e-07, + "loss": 0.0621, + "step": 15814 + }, + { + "epoch": 2.5623784834737524, + "grad_norm": 0.9976528882980347, + "learning_rate": 2.7387508236644404e-07, + "loss": 0.0609, + "step": 15815 + }, + { + "epoch": 2.5625405055087493, + "grad_norm": 0.8575246334075928, + "learning_rate": 2.736761075703165e-07, + "loss": 0.0586, + "step": 15816 + }, + { + "epoch": 2.562702527543746, + "grad_norm": 0.9577282071113586, + "learning_rate": 2.734772008943118e-07, + "loss": 0.0626, + "step": 15817 + }, + { + "epoch": 2.5628645495787428, + "grad_norm": 0.8331069946289062, + "learning_rate": 2.732783623445168e-07, + "loss": 0.0563, + "step": 15818 + }, + { + "epoch": 2.5630265716137393, + "grad_norm": 0.8915289640426636, + "learning_rate": 2.730795919270149e-07, + "loss": 0.0608, + "step": 15819 + }, + { + "epoch": 2.563188593648736, + "grad_norm": 0.9112197160720825, + "learning_rate": 2.728808896478891e-07, + "loss": 0.0571, + "step": 15820 + }, + { + "epoch": 2.563350615683733, + "grad_norm": 1.0440930128097534, + "learning_rate": 2.726822555132183e-07, + "loss": 0.0612, + "step": 15821 + }, + { + "epoch": 2.5635126377187296, + "grad_norm": 1.0141581296920776, + "learning_rate": 2.7248368952908055e-07, + "loss": 0.0702, + "step": 15822 + }, + { + "epoch": 2.5636746597537265, + "grad_norm": 1.0160237550735474, + "learning_rate": 2.722851917015512e-07, + "loss": 0.0653, + "step": 15823 + }, + { + "epoch": 2.5638366817887235, + "grad_norm": 0.8820620775222778, + "learning_rate": 2.7208676203670406e-07, + "loss": 0.0551, + "step": 15824 + }, + { + "epoch": 2.56399870382372, + "grad_norm": 1.0316556692123413, + "learning_rate": 2.7188840054061084e-07, + "loss": 0.0625, + "step": 15825 + }, + { + "epoch": 2.564160725858717, + "grad_norm": 1.0389971733093262, + "learning_rate": 2.716901072193404e-07, + "loss": 0.0646, + "step": 15826 + }, + { + "epoch": 2.5643227478937134, + "grad_norm": 0.8673059940338135, + "learning_rate": 2.7149188207896084e-07, + "loss": 0.0631, + "step": 15827 + }, + { + "epoch": 2.5644847699287103, + "grad_norm": 0.8147962689399719, + "learning_rate": 2.71293725125536e-07, + "loss": 0.0547, + "step": 15828 + }, + { + "epoch": 2.564646791963707, + "grad_norm": 1.1430022716522217, + "learning_rate": 2.710956363651296e-07, + "loss": 0.0643, + "step": 15829 + }, + { + "epoch": 2.5648088139987038, + "grad_norm": 1.0345301628112793, + "learning_rate": 2.7089761580380346e-07, + "loss": 0.0694, + "step": 15830 + }, + { + "epoch": 2.5649708360337007, + "grad_norm": 1.059556245803833, + "learning_rate": 2.7069966344761636e-07, + "loss": 0.0555, + "step": 15831 + }, + { + "epoch": 2.565132858068697, + "grad_norm": 0.8952658176422119, + "learning_rate": 2.7050177930262406e-07, + "loss": 0.062, + "step": 15832 + }, + { + "epoch": 2.565294880103694, + "grad_norm": 0.9212254285812378, + "learning_rate": 2.703039633748822e-07, + "loss": 0.0525, + "step": 15833 + }, + { + "epoch": 2.565456902138691, + "grad_norm": 0.8648931980133057, + "learning_rate": 2.701062156704434e-07, + "loss": 0.0598, + "step": 15834 + }, + { + "epoch": 2.5656189241736875, + "grad_norm": 0.8723458647727966, + "learning_rate": 2.6990853619535793e-07, + "loss": 0.06, + "step": 15835 + }, + { + "epoch": 2.5657809462086845, + "grad_norm": 0.8057177662849426, + "learning_rate": 2.697109249556748e-07, + "loss": 0.0555, + "step": 15836 + }, + { + "epoch": 2.5659429682436814, + "grad_norm": 1.0397346019744873, + "learning_rate": 2.6951338195744e-07, + "loss": 0.0679, + "step": 15837 + }, + { + "epoch": 2.566104990278678, + "grad_norm": 0.824124276638031, + "learning_rate": 2.6931590720669807e-07, + "loss": 0.0582, + "step": 15838 + }, + { + "epoch": 2.566267012313675, + "grad_norm": 0.807733952999115, + "learning_rate": 2.6911850070949124e-07, + "loss": 0.0591, + "step": 15839 + }, + { + "epoch": 2.5664290343486713, + "grad_norm": 1.1735118627548218, + "learning_rate": 2.6892116247185964e-07, + "loss": 0.0717, + "step": 15840 + }, + { + "epoch": 2.5665910563836682, + "grad_norm": 0.878514289855957, + "learning_rate": 2.687238924998414e-07, + "loss": 0.0493, + "step": 15841 + }, + { + "epoch": 2.5667530784186647, + "grad_norm": 0.9837947487831116, + "learning_rate": 2.6852669079947294e-07, + "loss": 0.0616, + "step": 15842 + }, + { + "epoch": 2.5669151004536617, + "grad_norm": 0.9340665340423584, + "learning_rate": 2.683295573767866e-07, + "loss": 0.0597, + "step": 15843 + }, + { + "epoch": 2.5670771224886586, + "grad_norm": 0.9732357263565063, + "learning_rate": 2.681324922378159e-07, + "loss": 0.058, + "step": 15844 + }, + { + "epoch": 2.567239144523655, + "grad_norm": 0.8672887682914734, + "learning_rate": 2.679354953885899e-07, + "loss": 0.0621, + "step": 15845 + }, + { + "epoch": 2.567401166558652, + "grad_norm": 0.9062552452087402, + "learning_rate": 2.6773856683513677e-07, + "loss": 0.0574, + "step": 15846 + }, + { + "epoch": 2.567563188593649, + "grad_norm": 0.9597328305244446, + "learning_rate": 2.6754170658348094e-07, + "loss": 0.056, + "step": 15847 + }, + { + "epoch": 2.5677252106286454, + "grad_norm": 0.8402513265609741, + "learning_rate": 2.673449146396459e-07, + "loss": 0.0555, + "step": 15848 + }, + { + "epoch": 2.5678872326636424, + "grad_norm": 0.9047306776046753, + "learning_rate": 2.671481910096546e-07, + "loss": 0.0586, + "step": 15849 + }, + { + "epoch": 2.568049254698639, + "grad_norm": 0.9194369912147522, + "learning_rate": 2.6695153569952475e-07, + "loss": 0.0625, + "step": 15850 + }, + { + "epoch": 2.568211276733636, + "grad_norm": 0.9545932412147522, + "learning_rate": 2.6675494871527404e-07, + "loss": 0.0646, + "step": 15851 + }, + { + "epoch": 2.5683732987686323, + "grad_norm": 0.9350480437278748, + "learning_rate": 2.665584300629176e-07, + "loss": 0.0616, + "step": 15852 + }, + { + "epoch": 2.5685353208036292, + "grad_norm": 0.8764953017234802, + "learning_rate": 2.663619797484684e-07, + "loss": 0.0584, + "step": 15853 + }, + { + "epoch": 2.568697342838626, + "grad_norm": 0.904297947883606, + "learning_rate": 2.661655977779373e-07, + "loss": 0.0539, + "step": 15854 + }, + { + "epoch": 2.5688593648736227, + "grad_norm": 0.9557779431343079, + "learning_rate": 2.65969284157333e-07, + "loss": 0.0549, + "step": 15855 + }, + { + "epoch": 2.5690213869086196, + "grad_norm": 0.7922280430793762, + "learning_rate": 2.6577303889266244e-07, + "loss": 0.0576, + "step": 15856 + }, + { + "epoch": 2.5691834089436165, + "grad_norm": 1.1768953800201416, + "learning_rate": 2.655768619899302e-07, + "loss": 0.0574, + "step": 15857 + }, + { + "epoch": 2.569345430978613, + "grad_norm": 0.9072592854499817, + "learning_rate": 2.6538075345513864e-07, + "loss": 0.062, + "step": 15858 + }, + { + "epoch": 2.56950745301361, + "grad_norm": 0.9729146361351013, + "learning_rate": 2.651847132942886e-07, + "loss": 0.0647, + "step": 15859 + }, + { + "epoch": 2.569669475048607, + "grad_norm": 0.8979154825210571, + "learning_rate": 2.6498874151337865e-07, + "loss": 0.0649, + "step": 15860 + }, + { + "epoch": 2.5698314970836034, + "grad_norm": 1.0150434970855713, + "learning_rate": 2.6479283811840393e-07, + "loss": 0.0602, + "step": 15861 + }, + { + "epoch": 2.5699935191186, + "grad_norm": 0.9745667576789856, + "learning_rate": 2.6459700311535885e-07, + "loss": 0.0662, + "step": 15862 + }, + { + "epoch": 2.570155541153597, + "grad_norm": 0.8473979830741882, + "learning_rate": 2.6440123651023634e-07, + "loss": 0.0554, + "step": 15863 + }, + { + "epoch": 2.5703175631885937, + "grad_norm": 1.0695520639419556, + "learning_rate": 2.642055383090264e-07, + "loss": 0.0609, + "step": 15864 + }, + { + "epoch": 2.57047958522359, + "grad_norm": 0.8144644498825073, + "learning_rate": 2.6400990851771615e-07, + "loss": 0.0502, + "step": 15865 + }, + { + "epoch": 2.570641607258587, + "grad_norm": 0.8602494597434998, + "learning_rate": 2.638143471422916e-07, + "loss": 0.0585, + "step": 15866 + }, + { + "epoch": 2.570803629293584, + "grad_norm": 0.8381325006484985, + "learning_rate": 2.636188541887366e-07, + "loss": 0.0611, + "step": 15867 + }, + { + "epoch": 2.5709656513285806, + "grad_norm": 0.8610426783561707, + "learning_rate": 2.634234296630328e-07, + "loss": 0.0578, + "step": 15868 + }, + { + "epoch": 2.5711276733635775, + "grad_norm": 0.8060207366943359, + "learning_rate": 2.632280735711595e-07, + "loss": 0.0534, + "step": 15869 + }, + { + "epoch": 2.5712896953985744, + "grad_norm": 1.0370417833328247, + "learning_rate": 2.6303278591909426e-07, + "loss": 0.061, + "step": 15870 + }, + { + "epoch": 2.571451717433571, + "grad_norm": 0.9766287803649902, + "learning_rate": 2.62837566712813e-07, + "loss": 0.0637, + "step": 15871 + }, + { + "epoch": 2.571613739468568, + "grad_norm": 1.0253368616104126, + "learning_rate": 2.626424159582872e-07, + "loss": 0.0585, + "step": 15872 + }, + { + "epoch": 2.5717757615035644, + "grad_norm": 1.0771368741989136, + "learning_rate": 2.6244733366148994e-07, + "loss": 0.0607, + "step": 15873 + }, + { + "epoch": 2.5719377835385613, + "grad_norm": 0.9325494170188904, + "learning_rate": 2.622523198283894e-07, + "loss": 0.0618, + "step": 15874 + }, + { + "epoch": 2.5720998055735578, + "grad_norm": 0.887843906879425, + "learning_rate": 2.6205737446495296e-07, + "loss": 0.068, + "step": 15875 + }, + { + "epoch": 2.5722618276085547, + "grad_norm": 0.909279465675354, + "learning_rate": 2.6186249757714474e-07, + "loss": 0.0615, + "step": 15876 + }, + { + "epoch": 2.5724238496435516, + "grad_norm": 1.0336390733718872, + "learning_rate": 2.6166768917092746e-07, + "loss": 0.0653, + "step": 15877 + }, + { + "epoch": 2.572585871678548, + "grad_norm": 0.9292657375335693, + "learning_rate": 2.614729492522633e-07, + "loss": 0.0644, + "step": 15878 + }, + { + "epoch": 2.572747893713545, + "grad_norm": 0.995266854763031, + "learning_rate": 2.6127827782710916e-07, + "loss": 0.0614, + "step": 15879 + }, + { + "epoch": 2.572909915748542, + "grad_norm": 0.8683896660804749, + "learning_rate": 2.61083674901422e-07, + "loss": 0.0525, + "step": 15880 + }, + { + "epoch": 2.5730719377835385, + "grad_norm": 0.9227241277694702, + "learning_rate": 2.6088914048115585e-07, + "loss": 0.0609, + "step": 15881 + }, + { + "epoch": 2.5732339598185354, + "grad_norm": 0.9650192856788635, + "learning_rate": 2.6069467457226467e-07, + "loss": 0.0655, + "step": 15882 + }, + { + "epoch": 2.5733959818535324, + "grad_norm": 0.9700379371643066, + "learning_rate": 2.6050027718069694e-07, + "loss": 0.0588, + "step": 15883 + }, + { + "epoch": 2.573558003888529, + "grad_norm": 0.901326596736908, + "learning_rate": 2.6030594831240094e-07, + "loss": 0.064, + "step": 15884 + }, + { + "epoch": 2.5737200259235253, + "grad_norm": 0.947603166103363, + "learning_rate": 2.601116879733231e-07, + "loss": 0.0545, + "step": 15885 + }, + { + "epoch": 2.5738820479585223, + "grad_norm": 1.0234147310256958, + "learning_rate": 2.599174961694073e-07, + "loss": 0.0712, + "step": 15886 + }, + { + "epoch": 2.574044069993519, + "grad_norm": 0.9118129014968872, + "learning_rate": 2.597233729065951e-07, + "loss": 0.066, + "step": 15887 + }, + { + "epoch": 2.5742060920285157, + "grad_norm": 0.8596914410591125, + "learning_rate": 2.595293181908265e-07, + "loss": 0.0624, + "step": 15888 + }, + { + "epoch": 2.5743681140635126, + "grad_norm": 0.8012384176254272, + "learning_rate": 2.593353320280387e-07, + "loss": 0.0588, + "step": 15889 + }, + { + "epoch": 2.5745301360985096, + "grad_norm": 0.8562291860580444, + "learning_rate": 2.59141414424168e-07, + "loss": 0.0572, + "step": 15890 + }, + { + "epoch": 2.574692158133506, + "grad_norm": 1.0421948432922363, + "learning_rate": 2.5894756538514644e-07, + "loss": 0.0693, + "step": 15891 + }, + { + "epoch": 2.574854180168503, + "grad_norm": 0.880355715751648, + "learning_rate": 2.587537849169064e-07, + "loss": 0.0567, + "step": 15892 + }, + { + "epoch": 2.5750162022035, + "grad_norm": 0.8059505224227905, + "learning_rate": 2.585600730253773e-07, + "loss": 0.051, + "step": 15893 + }, + { + "epoch": 2.5751782242384964, + "grad_norm": 0.8571881651878357, + "learning_rate": 2.5836642971648534e-07, + "loss": 0.068, + "step": 15894 + }, + { + "epoch": 2.5753402462734933, + "grad_norm": 0.9805305600166321, + "learning_rate": 2.5817285499615624e-07, + "loss": 0.0608, + "step": 15895 + }, + { + "epoch": 2.57550226830849, + "grad_norm": 0.9158777594566345, + "learning_rate": 2.579793488703122e-07, + "loss": 0.0623, + "step": 15896 + }, + { + "epoch": 2.5756642903434868, + "grad_norm": 0.8620784878730774, + "learning_rate": 2.5778591134487494e-07, + "loss": 0.0629, + "step": 15897 + }, + { + "epoch": 2.5758263123784833, + "grad_norm": 0.8118161559104919, + "learning_rate": 2.5759254242576246e-07, + "loss": 0.0536, + "step": 15898 + }, + { + "epoch": 2.57598833441348, + "grad_norm": 0.9540441036224365, + "learning_rate": 2.5739924211889173e-07, + "loss": 0.0572, + "step": 15899 + }, + { + "epoch": 2.576150356448477, + "grad_norm": 0.7988532185554504, + "learning_rate": 2.572060104301771e-07, + "loss": 0.0562, + "step": 15900 + }, + { + "epoch": 2.5763123784834736, + "grad_norm": 0.9321579933166504, + "learning_rate": 2.5701284736553146e-07, + "loss": 0.0652, + "step": 15901 + }, + { + "epoch": 2.5764744005184705, + "grad_norm": 0.7826856970787048, + "learning_rate": 2.5681975293086443e-07, + "loss": 0.055, + "step": 15902 + }, + { + "epoch": 2.5766364225534675, + "grad_norm": 0.9522474408149719, + "learning_rate": 2.5662672713208465e-07, + "loss": 0.0601, + "step": 15903 + }, + { + "epoch": 2.576798444588464, + "grad_norm": 0.9039526581764221, + "learning_rate": 2.564337699750985e-07, + "loss": 0.0628, + "step": 15904 + }, + { + "epoch": 2.576960466623461, + "grad_norm": 0.9911723136901855, + "learning_rate": 2.5624088146580903e-07, + "loss": 0.07, + "step": 15905 + }, + { + "epoch": 2.5771224886584574, + "grad_norm": 0.9902855753898621, + "learning_rate": 2.560480616101191e-07, + "loss": 0.0602, + "step": 15906 + }, + { + "epoch": 2.5772845106934543, + "grad_norm": 1.157789707183838, + "learning_rate": 2.55855310413928e-07, + "loss": 0.0609, + "step": 15907 + }, + { + "epoch": 2.577446532728451, + "grad_norm": 0.9956571459770203, + "learning_rate": 2.556626278831345e-07, + "loss": 0.0597, + "step": 15908 + }, + { + "epoch": 2.5776085547634477, + "grad_norm": 0.8518067598342896, + "learning_rate": 2.554700140236327e-07, + "loss": 0.0552, + "step": 15909 + }, + { + "epoch": 2.5777705767984447, + "grad_norm": 0.8812946081161499, + "learning_rate": 2.552774688413165e-07, + "loss": 0.0525, + "step": 15910 + }, + { + "epoch": 2.577932598833441, + "grad_norm": 1.0784142017364502, + "learning_rate": 2.550849923420787e-07, + "loss": 0.0615, + "step": 15911 + }, + { + "epoch": 2.578094620868438, + "grad_norm": 1.0830011367797852, + "learning_rate": 2.5489258453180676e-07, + "loss": 0.0558, + "step": 15912 + }, + { + "epoch": 2.578256642903435, + "grad_norm": 0.8007397055625916, + "learning_rate": 2.547002454163888e-07, + "loss": 0.0524, + "step": 15913 + }, + { + "epoch": 2.5784186649384315, + "grad_norm": 0.9391987323760986, + "learning_rate": 2.545079750017099e-07, + "loss": 0.0546, + "step": 15914 + }, + { + "epoch": 2.5785806869734285, + "grad_norm": 0.9835785627365112, + "learning_rate": 2.543157732936527e-07, + "loss": 0.061, + "step": 15915 + }, + { + "epoch": 2.5787427090084254, + "grad_norm": 0.9381945729255676, + "learning_rate": 2.541236402980987e-07, + "loss": 0.0547, + "step": 15916 + }, + { + "epoch": 2.578904731043422, + "grad_norm": 1.088310956954956, + "learning_rate": 2.5393157602092626e-07, + "loss": 0.0656, + "step": 15917 + }, + { + "epoch": 2.579066753078419, + "grad_norm": 0.9402004480361938, + "learning_rate": 2.5373958046801207e-07, + "loss": 0.0596, + "step": 15918 + }, + { + "epoch": 2.5792287751134153, + "grad_norm": 0.8306174874305725, + "learning_rate": 2.5354765364523164e-07, + "loss": 0.0564, + "step": 15919 + }, + { + "epoch": 2.5793907971484122, + "grad_norm": 0.8160369396209717, + "learning_rate": 2.5335579555845563e-07, + "loss": 0.0584, + "step": 15920 + }, + { + "epoch": 2.5795528191834087, + "grad_norm": 1.0433648824691772, + "learning_rate": 2.531640062135557e-07, + "loss": 0.0592, + "step": 15921 + }, + { + "epoch": 2.5797148412184057, + "grad_norm": 1.0717437267303467, + "learning_rate": 2.5297228561640075e-07, + "loss": 0.0689, + "step": 15922 + }, + { + "epoch": 2.5798768632534026, + "grad_norm": 0.8935438394546509, + "learning_rate": 2.5278063377285556e-07, + "loss": 0.0655, + "step": 15923 + }, + { + "epoch": 2.580038885288399, + "grad_norm": 0.8454123735427856, + "learning_rate": 2.5258905068878433e-07, + "loss": 0.0582, + "step": 15924 + }, + { + "epoch": 2.580200907323396, + "grad_norm": 0.9273526072502136, + "learning_rate": 2.523975363700501e-07, + "loss": 0.0555, + "step": 15925 + }, + { + "epoch": 2.580362929358393, + "grad_norm": 1.308284878730774, + "learning_rate": 2.522060908225127e-07, + "loss": 0.0684, + "step": 15926 + }, + { + "epoch": 2.5805249513933894, + "grad_norm": 0.8910729885101318, + "learning_rate": 2.520147140520288e-07, + "loss": 0.0531, + "step": 15927 + }, + { + "epoch": 2.5806869734283864, + "grad_norm": 0.9772580862045288, + "learning_rate": 2.518234060644545e-07, + "loss": 0.0598, + "step": 15928 + }, + { + "epoch": 2.580848995463383, + "grad_norm": 0.845855712890625, + "learning_rate": 2.5163216686564354e-07, + "loss": 0.0567, + "step": 15929 + }, + { + "epoch": 2.58101101749838, + "grad_norm": 0.9278295040130615, + "learning_rate": 2.5144099646144724e-07, + "loss": 0.0579, + "step": 15930 + }, + { + "epoch": 2.5811730395333763, + "grad_norm": 1.0706762075424194, + "learning_rate": 2.512498948577152e-07, + "loss": 0.0644, + "step": 15931 + }, + { + "epoch": 2.5813350615683732, + "grad_norm": 0.9492532014846802, + "learning_rate": 2.510588620602947e-07, + "loss": 0.0567, + "step": 15932 + }, + { + "epoch": 2.58149708360337, + "grad_norm": 0.8904463648796082, + "learning_rate": 2.5086789807503036e-07, + "loss": 0.0576, + "step": 15933 + }, + { + "epoch": 2.5816591056383666, + "grad_norm": 0.8406658172607422, + "learning_rate": 2.506770029077657e-07, + "loss": 0.0558, + "step": 15934 + }, + { + "epoch": 2.5818211276733636, + "grad_norm": 0.919401228427887, + "learning_rate": 2.5048617656434127e-07, + "loss": 0.0607, + "step": 15935 + }, + { + "epoch": 2.5819831497083605, + "grad_norm": 0.9194125533103943, + "learning_rate": 2.502954190505963e-07, + "loss": 0.0582, + "step": 15936 + }, + { + "epoch": 2.582145171743357, + "grad_norm": 0.8466059565544128, + "learning_rate": 2.5010473037236776e-07, + "loss": 0.0589, + "step": 15937 + }, + { + "epoch": 2.582307193778354, + "grad_norm": 0.8999043107032776, + "learning_rate": 2.499141105354894e-07, + "loss": 0.0631, + "step": 15938 + }, + { + "epoch": 2.582469215813351, + "grad_norm": 0.8571875095367432, + "learning_rate": 2.4972355954579366e-07, + "loss": 0.0622, + "step": 15939 + }, + { + "epoch": 2.5826312378483474, + "grad_norm": 0.9047456383705139, + "learning_rate": 2.495330774091126e-07, + "loss": 0.0596, + "step": 15940 + }, + { + "epoch": 2.5827932598833443, + "grad_norm": 0.9654316902160645, + "learning_rate": 2.493426641312724e-07, + "loss": 0.0587, + "step": 15941 + }, + { + "epoch": 2.582955281918341, + "grad_norm": 0.8947680592536926, + "learning_rate": 2.4915231971810064e-07, + "loss": 0.0586, + "step": 15942 + }, + { + "epoch": 2.5831173039533377, + "grad_norm": 0.7643946409225464, + "learning_rate": 2.4896204417542066e-07, + "loss": 0.0537, + "step": 15943 + }, + { + "epoch": 2.583279325988334, + "grad_norm": 0.9256800413131714, + "learning_rate": 2.4877183750905475e-07, + "loss": 0.0596, + "step": 15944 + }, + { + "epoch": 2.583441348023331, + "grad_norm": 1.0002453327178955, + "learning_rate": 2.4858169972482276e-07, + "loss": 0.0602, + "step": 15945 + }, + { + "epoch": 2.583603370058328, + "grad_norm": 0.8987006545066833, + "learning_rate": 2.483916308285425e-07, + "loss": 0.0584, + "step": 15946 + }, + { + "epoch": 2.5837653920933246, + "grad_norm": 0.8947418332099915, + "learning_rate": 2.482016308260296e-07, + "loss": 0.0596, + "step": 15947 + }, + { + "epoch": 2.5839274141283215, + "grad_norm": 0.8880223631858826, + "learning_rate": 2.4801169972309745e-07, + "loss": 0.0575, + "step": 15948 + }, + { + "epoch": 2.5840894361633184, + "grad_norm": 0.8866935968399048, + "learning_rate": 2.4782183752555784e-07, + "loss": 0.0536, + "step": 15949 + }, + { + "epoch": 2.584251458198315, + "grad_norm": 0.8493706583976746, + "learning_rate": 2.4763204423921937e-07, + "loss": 0.0541, + "step": 15950 + }, + { + "epoch": 2.584413480233312, + "grad_norm": 0.8858264684677124, + "learning_rate": 2.4744231986988996e-07, + "loss": 0.0645, + "step": 15951 + }, + { + "epoch": 2.5845755022683083, + "grad_norm": 1.0235286951065063, + "learning_rate": 2.47252664423375e-07, + "loss": 0.0737, + "step": 15952 + }, + { + "epoch": 2.5847375243033053, + "grad_norm": 1.0129871368408203, + "learning_rate": 2.4706307790547614e-07, + "loss": 0.0563, + "step": 15953 + }, + { + "epoch": 2.5848995463383018, + "grad_norm": 0.9243896007537842, + "learning_rate": 2.4687356032199516e-07, + "loss": 0.0617, + "step": 15954 + }, + { + "epoch": 2.5850615683732987, + "grad_norm": 0.7274792790412903, + "learning_rate": 2.4668411167873165e-07, + "loss": 0.0506, + "step": 15955 + }, + { + "epoch": 2.5852235904082956, + "grad_norm": 0.7890907526016235, + "learning_rate": 2.464947319814806e-07, + "loss": 0.0544, + "step": 15956 + }, + { + "epoch": 2.585385612443292, + "grad_norm": 0.9411144852638245, + "learning_rate": 2.4630542123603775e-07, + "loss": 0.0586, + "step": 15957 + }, + { + "epoch": 2.585547634478289, + "grad_norm": 0.8673883080482483, + "learning_rate": 2.461161794481945e-07, + "loss": 0.0541, + "step": 15958 + }, + { + "epoch": 2.585709656513286, + "grad_norm": 1.1016530990600586, + "learning_rate": 2.4592700662374265e-07, + "loss": 0.0667, + "step": 15959 + }, + { + "epoch": 2.5858716785482825, + "grad_norm": 0.9959900975227356, + "learning_rate": 2.4573790276846947e-07, + "loss": 0.0651, + "step": 15960 + }, + { + "epoch": 2.5860337005832794, + "grad_norm": 0.895260751247406, + "learning_rate": 2.4554886788816094e-07, + "loss": 0.0672, + "step": 15961 + }, + { + "epoch": 2.5861957226182763, + "grad_norm": 1.0384845733642578, + "learning_rate": 2.453599019886016e-07, + "loss": 0.059, + "step": 15962 + }, + { + "epoch": 2.586357744653273, + "grad_norm": 0.8022738099098206, + "learning_rate": 2.451710050755732e-07, + "loss": 0.0506, + "step": 15963 + }, + { + "epoch": 2.5865197666882693, + "grad_norm": 0.8844985365867615, + "learning_rate": 2.449821771548552e-07, + "loss": 0.0591, + "step": 15964 + }, + { + "epoch": 2.5866817887232663, + "grad_norm": 0.8309991955757141, + "learning_rate": 2.4479341823222564e-07, + "loss": 0.0558, + "step": 15965 + }, + { + "epoch": 2.586843810758263, + "grad_norm": 0.9121332168579102, + "learning_rate": 2.446047283134606e-07, + "loss": 0.0673, + "step": 15966 + }, + { + "epoch": 2.5870058327932597, + "grad_norm": 0.8563621640205383, + "learning_rate": 2.444161074043325e-07, + "loss": 0.0605, + "step": 15967 + }, + { + "epoch": 2.5871678548282566, + "grad_norm": 0.834702730178833, + "learning_rate": 2.4422755551061246e-07, + "loss": 0.0561, + "step": 15968 + }, + { + "epoch": 2.5873298768632536, + "grad_norm": 0.9449539184570312, + "learning_rate": 2.4403907263807064e-07, + "loss": 0.0555, + "step": 15969 + }, + { + "epoch": 2.58749189889825, + "grad_norm": 1.0673609972000122, + "learning_rate": 2.4385065879247466e-07, + "loss": 0.0612, + "step": 15970 + }, + { + "epoch": 2.587653920933247, + "grad_norm": 0.8567180633544922, + "learning_rate": 2.4366231397958823e-07, + "loss": 0.0603, + "step": 15971 + }, + { + "epoch": 2.587815942968244, + "grad_norm": 0.8513876795768738, + "learning_rate": 2.4347403820517423e-07, + "loss": 0.058, + "step": 15972 + }, + { + "epoch": 2.5879779650032404, + "grad_norm": 0.8742343187332153, + "learning_rate": 2.4328583147499503e-07, + "loss": 0.0598, + "step": 15973 + }, + { + "epoch": 2.5881399870382373, + "grad_norm": 1.0191162824630737, + "learning_rate": 2.4309769379480764e-07, + "loss": 0.0576, + "step": 15974 + }, + { + "epoch": 2.588302009073234, + "grad_norm": 0.8935777544975281, + "learning_rate": 2.4290962517036915e-07, + "loss": 0.0568, + "step": 15975 + }, + { + "epoch": 2.5884640311082308, + "grad_norm": 0.8707433342933655, + "learning_rate": 2.427216256074341e-07, + "loss": 0.0537, + "step": 15976 + }, + { + "epoch": 2.5886260531432272, + "grad_norm": 1.0060011148452759, + "learning_rate": 2.425336951117549e-07, + "loss": 0.0622, + "step": 15977 + }, + { + "epoch": 2.588788075178224, + "grad_norm": 0.9074399471282959, + "learning_rate": 2.423458336890816e-07, + "loss": 0.0621, + "step": 15978 + }, + { + "epoch": 2.588950097213221, + "grad_norm": 0.931549072265625, + "learning_rate": 2.421580413451624e-07, + "loss": 0.0688, + "step": 15979 + }, + { + "epoch": 2.5891121192482176, + "grad_norm": 0.9901089072227478, + "learning_rate": 2.4197031808574327e-07, + "loss": 0.0644, + "step": 15980 + }, + { + "epoch": 2.5892741412832145, + "grad_norm": 0.8553241491317749, + "learning_rate": 2.417826639165688e-07, + "loss": 0.0572, + "step": 15981 + }, + { + "epoch": 2.5894361633182115, + "grad_norm": 0.7999852895736694, + "learning_rate": 2.4159507884337877e-07, + "loss": 0.0553, + "step": 15982 + }, + { + "epoch": 2.589598185353208, + "grad_norm": 0.7739843726158142, + "learning_rate": 2.414075628719145e-07, + "loss": 0.0565, + "step": 15983 + }, + { + "epoch": 2.589760207388205, + "grad_norm": 0.8175353407859802, + "learning_rate": 2.4122011600791334e-07, + "loss": 0.0576, + "step": 15984 + }, + { + "epoch": 2.589922229423202, + "grad_norm": 0.9324160218238831, + "learning_rate": 2.4103273825711094e-07, + "loss": 0.0601, + "step": 15985 + }, + { + "epoch": 2.5900842514581983, + "grad_norm": 1.0186549425125122, + "learning_rate": 2.408454296252397e-07, + "loss": 0.0605, + "step": 15986 + }, + { + "epoch": 2.590246273493195, + "grad_norm": 1.0774922370910645, + "learning_rate": 2.406581901180305e-07, + "loss": 0.0632, + "step": 15987 + }, + { + "epoch": 2.5904082955281917, + "grad_norm": 1.0663115978240967, + "learning_rate": 2.404710197412144e-07, + "loss": 0.0681, + "step": 15988 + }, + { + "epoch": 2.5905703175631887, + "grad_norm": 0.9276178479194641, + "learning_rate": 2.4028391850051654e-07, + "loss": 0.0622, + "step": 15989 + }, + { + "epoch": 2.590732339598185, + "grad_norm": 0.9000306725502014, + "learning_rate": 2.4009688640166257e-07, + "loss": 0.0572, + "step": 15990 + }, + { + "epoch": 2.590894361633182, + "grad_norm": 1.0842214822769165, + "learning_rate": 2.399099234503749e-07, + "loss": 0.0616, + "step": 15991 + }, + { + "epoch": 2.591056383668179, + "grad_norm": 0.9430128335952759, + "learning_rate": 2.397230296523742e-07, + "loss": 0.0669, + "step": 15992 + }, + { + "epoch": 2.5912184057031755, + "grad_norm": 1.052301049232483, + "learning_rate": 2.39536205013379e-07, + "loss": 0.07, + "step": 15993 + }, + { + "epoch": 2.5913804277381725, + "grad_norm": 0.9034858345985413, + "learning_rate": 2.3934944953910576e-07, + "loss": 0.0519, + "step": 15994 + }, + { + "epoch": 2.5915424497731694, + "grad_norm": 0.8782376050949097, + "learning_rate": 2.391627632352686e-07, + "loss": 0.0527, + "step": 15995 + }, + { + "epoch": 2.591704471808166, + "grad_norm": 0.8798543214797974, + "learning_rate": 2.3897614610757984e-07, + "loss": 0.0616, + "step": 15996 + }, + { + "epoch": 2.591866493843163, + "grad_norm": 0.9117831587791443, + "learning_rate": 2.387895981617497e-07, + "loss": 0.061, + "step": 15997 + }, + { + "epoch": 2.5920285158781593, + "grad_norm": 0.8840068578720093, + "learning_rate": 2.386031194034855e-07, + "loss": 0.0605, + "step": 15998 + }, + { + "epoch": 2.5921905379131562, + "grad_norm": 0.9171715378761292, + "learning_rate": 2.3841670983849402e-07, + "loss": 0.0562, + "step": 15999 + }, + { + "epoch": 2.5923525599481527, + "grad_norm": 0.9331315755844116, + "learning_rate": 2.3823036947247773e-07, + "loss": 0.0573, + "step": 16000 + }, + { + "epoch": 2.5925145819831497, + "grad_norm": 0.8709453344345093, + "learning_rate": 2.3804409831113817e-07, + "loss": 0.0554, + "step": 16001 + }, + { + "epoch": 2.5926766040181466, + "grad_norm": 0.8639891147613525, + "learning_rate": 2.3785789636017604e-07, + "loss": 0.0603, + "step": 16002 + }, + { + "epoch": 2.592838626053143, + "grad_norm": 0.9981393814086914, + "learning_rate": 2.3767176362528843e-07, + "loss": 0.0577, + "step": 16003 + }, + { + "epoch": 2.59300064808814, + "grad_norm": 0.8646852374076843, + "learning_rate": 2.374857001121697e-07, + "loss": 0.0618, + "step": 16004 + }, + { + "epoch": 2.593162670123137, + "grad_norm": 0.9664946794509888, + "learning_rate": 2.3729970582651307e-07, + "loss": 0.0676, + "step": 16005 + }, + { + "epoch": 2.5933246921581334, + "grad_norm": 0.8121821284294128, + "learning_rate": 2.371137807740101e-07, + "loss": 0.0553, + "step": 16006 + }, + { + "epoch": 2.5934867141931304, + "grad_norm": 1.0388915538787842, + "learning_rate": 2.3692792496034928e-07, + "loss": 0.0603, + "step": 16007 + }, + { + "epoch": 2.593648736228127, + "grad_norm": 0.9599438905715942, + "learning_rate": 2.3674213839121745e-07, + "loss": 0.0629, + "step": 16008 + }, + { + "epoch": 2.593810758263124, + "grad_norm": 0.8781747817993164, + "learning_rate": 2.3655642107229925e-07, + "loss": 0.0596, + "step": 16009 + }, + { + "epoch": 2.5939727802981203, + "grad_norm": 0.9688482284545898, + "learning_rate": 2.3637077300927762e-07, + "loss": 0.0646, + "step": 16010 + }, + { + "epoch": 2.594134802333117, + "grad_norm": 0.9513368606567383, + "learning_rate": 2.3618519420783137e-07, + "loss": 0.0647, + "step": 16011 + }, + { + "epoch": 2.594296824368114, + "grad_norm": 1.025209903717041, + "learning_rate": 2.3599968467364037e-07, + "loss": 0.0603, + "step": 16012 + }, + { + "epoch": 2.5944588464031106, + "grad_norm": 0.7293410897254944, + "learning_rate": 2.3581424441238038e-07, + "loss": 0.0476, + "step": 16013 + }, + { + "epoch": 2.5946208684381076, + "grad_norm": 0.8842470645904541, + "learning_rate": 2.3562887342972574e-07, + "loss": 0.0616, + "step": 16014 + }, + { + "epoch": 2.5947828904731045, + "grad_norm": 0.8669375777244568, + "learning_rate": 2.3544357173134691e-07, + "loss": 0.0557, + "step": 16015 + }, + { + "epoch": 2.594944912508101, + "grad_norm": 1.0717798471450806, + "learning_rate": 2.3525833932291491e-07, + "loss": 0.0648, + "step": 16016 + }, + { + "epoch": 2.595106934543098, + "grad_norm": 0.8745622038841248, + "learning_rate": 2.35073176210098e-07, + "loss": 0.063, + "step": 16017 + }, + { + "epoch": 2.595268956578095, + "grad_norm": 1.0365678071975708, + "learning_rate": 2.3488808239855998e-07, + "loss": 0.063, + "step": 16018 + }, + { + "epoch": 2.5954309786130914, + "grad_norm": 0.9520593285560608, + "learning_rate": 2.3470305789396546e-07, + "loss": 0.06, + "step": 16019 + }, + { + "epoch": 2.5955930006480883, + "grad_norm": 0.9252042174339294, + "learning_rate": 2.3451810270197494e-07, + "loss": 0.062, + "step": 16020 + }, + { + "epoch": 2.595755022683085, + "grad_norm": 0.9404069781303406, + "learning_rate": 2.3433321682824917e-07, + "loss": 0.0542, + "step": 16021 + }, + { + "epoch": 2.5959170447180817, + "grad_norm": 0.8857480883598328, + "learning_rate": 2.341484002784436e-07, + "loss": 0.0554, + "step": 16022 + }, + { + "epoch": 2.596079066753078, + "grad_norm": 0.9531388878822327, + "learning_rate": 2.3396365305821372e-07, + "loss": 0.0647, + "step": 16023 + }, + { + "epoch": 2.596241088788075, + "grad_norm": 0.8874648809432983, + "learning_rate": 2.3377897517321224e-07, + "loss": 0.0634, + "step": 16024 + }, + { + "epoch": 2.596403110823072, + "grad_norm": 0.8631939888000488, + "learning_rate": 2.3359436662909018e-07, + "loss": 0.0571, + "step": 16025 + }, + { + "epoch": 2.5965651328580686, + "grad_norm": 0.7769051194190979, + "learning_rate": 2.3340982743149582e-07, + "loss": 0.0585, + "step": 16026 + }, + { + "epoch": 2.5967271548930655, + "grad_norm": 0.8491709232330322, + "learning_rate": 2.3322535758607573e-07, + "loss": 0.0531, + "step": 16027 + }, + { + "epoch": 2.5968891769280624, + "grad_norm": 0.8236337304115295, + "learning_rate": 2.3304095709847402e-07, + "loss": 0.0529, + "step": 16028 + }, + { + "epoch": 2.597051198963059, + "grad_norm": 0.9366066455841064, + "learning_rate": 2.3285662597433368e-07, + "loss": 0.0658, + "step": 16029 + }, + { + "epoch": 2.597213220998056, + "grad_norm": 1.007311463356018, + "learning_rate": 2.3267236421929323e-07, + "loss": 0.0583, + "step": 16030 + }, + { + "epoch": 2.5973752430330523, + "grad_norm": 0.8656664490699768, + "learning_rate": 2.3248817183899209e-07, + "loss": 0.0572, + "step": 16031 + }, + { + "epoch": 2.5975372650680493, + "grad_norm": 0.909775972366333, + "learning_rate": 2.3230404883906626e-07, + "loss": 0.0573, + "step": 16032 + }, + { + "epoch": 2.5976992871030458, + "grad_norm": 0.9270328283309937, + "learning_rate": 2.321199952251482e-07, + "loss": 0.0578, + "step": 16033 + }, + { + "epoch": 2.5978613091380427, + "grad_norm": 1.056470274925232, + "learning_rate": 2.319360110028701e-07, + "loss": 0.0651, + "step": 16034 + }, + { + "epoch": 2.5980233311730396, + "grad_norm": 0.8853468894958496, + "learning_rate": 2.3175209617786133e-07, + "loss": 0.063, + "step": 16035 + }, + { + "epoch": 2.598185353208036, + "grad_norm": 0.9667817950248718, + "learning_rate": 2.3156825075574956e-07, + "loss": 0.06, + "step": 16036 + }, + { + "epoch": 2.598347375243033, + "grad_norm": 0.7612243890762329, + "learning_rate": 2.3138447474215981e-07, + "loss": 0.0556, + "step": 16037 + }, + { + "epoch": 2.59850939727803, + "grad_norm": 1.081939935684204, + "learning_rate": 2.312007681427153e-07, + "loss": 0.0573, + "step": 16038 + }, + { + "epoch": 2.5986714193130265, + "grad_norm": 0.9254032969474792, + "learning_rate": 2.3101713096303658e-07, + "loss": 0.0651, + "step": 16039 + }, + { + "epoch": 2.5988334413480234, + "grad_norm": 0.9094540476799011, + "learning_rate": 2.30833563208743e-07, + "loss": 0.0645, + "step": 16040 + }, + { + "epoch": 2.5989954633830203, + "grad_norm": 0.8876714706420898, + "learning_rate": 2.3065006488545122e-07, + "loss": 0.0662, + "step": 16041 + }, + { + "epoch": 2.599157485418017, + "grad_norm": 0.9631155729293823, + "learning_rate": 2.304666359987756e-07, + "loss": 0.0585, + "step": 16042 + }, + { + "epoch": 2.5993195074530138, + "grad_norm": 0.8992942571640015, + "learning_rate": 2.302832765543292e-07, + "loss": 0.0564, + "step": 16043 + }, + { + "epoch": 2.5994815294880103, + "grad_norm": 0.8657216429710388, + "learning_rate": 2.300999865577211e-07, + "loss": 0.0597, + "step": 16044 + }, + { + "epoch": 2.599643551523007, + "grad_norm": 0.8959026336669922, + "learning_rate": 2.2991676601456069e-07, + "loss": 0.0652, + "step": 16045 + }, + { + "epoch": 2.5998055735580037, + "grad_norm": 0.8713762760162354, + "learning_rate": 2.2973361493045382e-07, + "loss": 0.0621, + "step": 16046 + }, + { + "epoch": 2.5999675955930006, + "grad_norm": 0.9186781048774719, + "learning_rate": 2.2955053331100486e-07, + "loss": 0.0584, + "step": 16047 + }, + { + "epoch": 2.6001296176279975, + "grad_norm": 1.0051521062850952, + "learning_rate": 2.293675211618146e-07, + "loss": 0.065, + "step": 16048 + }, + { + "epoch": 2.600291639662994, + "grad_norm": 0.8411121964454651, + "learning_rate": 2.2918457848848303e-07, + "loss": 0.0585, + "step": 16049 + }, + { + "epoch": 2.600453661697991, + "grad_norm": 0.9174489974975586, + "learning_rate": 2.2900170529660898e-07, + "loss": 0.0634, + "step": 16050 + }, + { + "epoch": 2.600615683732988, + "grad_norm": 0.8582833409309387, + "learning_rate": 2.288189015917866e-07, + "loss": 0.0589, + "step": 16051 + }, + { + "epoch": 2.6007777057679844, + "grad_norm": 0.7952532768249512, + "learning_rate": 2.2863616737960976e-07, + "loss": 0.0533, + "step": 16052 + }, + { + "epoch": 2.6009397278029813, + "grad_norm": 0.9317299723625183, + "learning_rate": 2.2845350266566952e-07, + "loss": 0.0659, + "step": 16053 + }, + { + "epoch": 2.601101749837978, + "grad_norm": 0.8274461627006531, + "learning_rate": 2.2827090745555502e-07, + "loss": 0.0573, + "step": 16054 + }, + { + "epoch": 2.6012637718729748, + "grad_norm": 0.8277769088745117, + "learning_rate": 2.2808838175485321e-07, + "loss": 0.054, + "step": 16055 + }, + { + "epoch": 2.6014257939079712, + "grad_norm": 0.8775892853736877, + "learning_rate": 2.279059255691493e-07, + "loss": 0.061, + "step": 16056 + }, + { + "epoch": 2.601587815942968, + "grad_norm": 1.035484790802002, + "learning_rate": 2.2772353890402527e-07, + "loss": 0.0613, + "step": 16057 + }, + { + "epoch": 2.601749837977965, + "grad_norm": 0.9841363430023193, + "learning_rate": 2.2754122176506244e-07, + "loss": 0.0717, + "step": 16058 + }, + { + "epoch": 2.6019118600129616, + "grad_norm": 0.8622410297393799, + "learning_rate": 2.2735897415783888e-07, + "loss": 0.0613, + "step": 16059 + }, + { + "epoch": 2.6020738820479585, + "grad_norm": 0.7690012454986572, + "learning_rate": 2.271767960879312e-07, + "loss": 0.0528, + "step": 16060 + }, + { + "epoch": 2.6022359040829555, + "grad_norm": 0.8313643336296082, + "learning_rate": 2.2699468756091385e-07, + "loss": 0.0507, + "step": 16061 + }, + { + "epoch": 2.602397926117952, + "grad_norm": 0.8808419704437256, + "learning_rate": 2.2681264858235797e-07, + "loss": 0.0611, + "step": 16062 + }, + { + "epoch": 2.602559948152949, + "grad_norm": 0.8801603317260742, + "learning_rate": 2.2663067915783349e-07, + "loss": 0.0557, + "step": 16063 + }, + { + "epoch": 2.602721970187946, + "grad_norm": 0.8919458985328674, + "learning_rate": 2.2644877929290932e-07, + "loss": 0.0587, + "step": 16064 + }, + { + "epoch": 2.6028839922229423, + "grad_norm": 0.8611831665039062, + "learning_rate": 2.26266948993151e-07, + "loss": 0.0564, + "step": 16065 + }, + { + "epoch": 2.6030460142579392, + "grad_norm": 0.9475187659263611, + "learning_rate": 2.2608518826412128e-07, + "loss": 0.0631, + "step": 16066 + }, + { + "epoch": 2.6032080362929357, + "grad_norm": 0.8380742073059082, + "learning_rate": 2.2590349711138214e-07, + "loss": 0.0553, + "step": 16067 + }, + { + "epoch": 2.6033700583279327, + "grad_norm": 0.780228316783905, + "learning_rate": 2.2572187554049274e-07, + "loss": 0.0566, + "step": 16068 + }, + { + "epoch": 2.603532080362929, + "grad_norm": 0.9787698984146118, + "learning_rate": 2.2554032355701027e-07, + "loss": 0.0641, + "step": 16069 + }, + { + "epoch": 2.603694102397926, + "grad_norm": 0.9883422255516052, + "learning_rate": 2.2535884116648976e-07, + "loss": 0.0625, + "step": 16070 + }, + { + "epoch": 2.603856124432923, + "grad_norm": 1.032846212387085, + "learning_rate": 2.2517742837448425e-07, + "loss": 0.0589, + "step": 16071 + }, + { + "epoch": 2.6040181464679195, + "grad_norm": 0.8250254988670349, + "learning_rate": 2.2499608518654432e-07, + "loss": 0.0569, + "step": 16072 + }, + { + "epoch": 2.6041801685029164, + "grad_norm": 0.9005188345909119, + "learning_rate": 2.2481481160821883e-07, + "loss": 0.0606, + "step": 16073 + }, + { + "epoch": 2.6043421905379134, + "grad_norm": 0.9215274453163147, + "learning_rate": 2.2463360764505448e-07, + "loss": 0.0597, + "step": 16074 + }, + { + "epoch": 2.60450421257291, + "grad_norm": 0.7813541889190674, + "learning_rate": 2.244524733025952e-07, + "loss": 0.0521, + "step": 16075 + }, + { + "epoch": 2.604666234607907, + "grad_norm": 1.026735782623291, + "learning_rate": 2.2427140858638424e-07, + "loss": 0.0677, + "step": 16076 + }, + { + "epoch": 2.6048282566429033, + "grad_norm": 0.8096170425415039, + "learning_rate": 2.240904135019603e-07, + "loss": 0.0549, + "step": 16077 + }, + { + "epoch": 2.6049902786779002, + "grad_norm": 1.0487642288208008, + "learning_rate": 2.2390948805486174e-07, + "loss": 0.0671, + "step": 16078 + }, + { + "epoch": 2.6051523007128967, + "grad_norm": 0.9688891768455505, + "learning_rate": 2.2372863225062574e-07, + "loss": 0.058, + "step": 16079 + }, + { + "epoch": 2.6053143227478937, + "grad_norm": 0.9506645202636719, + "learning_rate": 2.2354784609478485e-07, + "loss": 0.0602, + "step": 16080 + }, + { + "epoch": 2.6054763447828906, + "grad_norm": 0.7836284041404724, + "learning_rate": 2.2336712959287077e-07, + "loss": 0.0532, + "step": 16081 + }, + { + "epoch": 2.605638366817887, + "grad_norm": 0.8392066359519958, + "learning_rate": 2.2318648275041267e-07, + "loss": 0.0583, + "step": 16082 + }, + { + "epoch": 2.605800388852884, + "grad_norm": 0.7897651195526123, + "learning_rate": 2.2300590557293944e-07, + "loss": 0.0497, + "step": 16083 + }, + { + "epoch": 2.605962410887881, + "grad_norm": 0.9836229085922241, + "learning_rate": 2.2282539806597476e-07, + "loss": 0.0629, + "step": 16084 + }, + { + "epoch": 2.6061244329228774, + "grad_norm": 1.0149391889572144, + "learning_rate": 2.2264496023504223e-07, + "loss": 0.0598, + "step": 16085 + }, + { + "epoch": 2.6062864549578744, + "grad_norm": 0.824090301990509, + "learning_rate": 2.22464592085663e-07, + "loss": 0.055, + "step": 16086 + }, + { + "epoch": 2.6064484769928713, + "grad_norm": 0.874244213104248, + "learning_rate": 2.2228429362335546e-07, + "loss": 0.0538, + "step": 16087 + }, + { + "epoch": 2.606610499027868, + "grad_norm": 0.8717989325523376, + "learning_rate": 2.2210406485363656e-07, + "loss": 0.063, + "step": 16088 + }, + { + "epoch": 2.6067725210628643, + "grad_norm": 1.0721566677093506, + "learning_rate": 2.2192390578202105e-07, + "loss": 0.0644, + "step": 16089 + }, + { + "epoch": 2.606934543097861, + "grad_norm": 0.8851465582847595, + "learning_rate": 2.217438164140212e-07, + "loss": 0.0536, + "step": 16090 + }, + { + "epoch": 2.607096565132858, + "grad_norm": 1.2500100135803223, + "learning_rate": 2.2156379675514762e-07, + "loss": 0.0662, + "step": 16091 + }, + { + "epoch": 2.6072585871678546, + "grad_norm": 0.930827260017395, + "learning_rate": 2.213838468109075e-07, + "loss": 0.0613, + "step": 16092 + }, + { + "epoch": 2.6074206092028516, + "grad_norm": 0.7674300074577332, + "learning_rate": 2.2120396658680765e-07, + "loss": 0.0532, + "step": 16093 + }, + { + "epoch": 2.6075826312378485, + "grad_norm": 0.9664437174797058, + "learning_rate": 2.210241560883525e-07, + "loss": 0.0676, + "step": 16094 + }, + { + "epoch": 2.607744653272845, + "grad_norm": 0.9895860552787781, + "learning_rate": 2.2084441532104262e-07, + "loss": 0.0644, + "step": 16095 + }, + { + "epoch": 2.607906675307842, + "grad_norm": 0.9111254811286926, + "learning_rate": 2.206647442903781e-07, + "loss": 0.0614, + "step": 16096 + }, + { + "epoch": 2.608068697342839, + "grad_norm": 1.104712724685669, + "learning_rate": 2.204851430018562e-07, + "loss": 0.069, + "step": 16097 + }, + { + "epoch": 2.6082307193778353, + "grad_norm": 1.0142390727996826, + "learning_rate": 2.2030561146097363e-07, + "loss": 0.0548, + "step": 16098 + }, + { + "epoch": 2.6083927414128323, + "grad_norm": 0.9152945876121521, + "learning_rate": 2.2012614967322182e-07, + "loss": 0.0608, + "step": 16099 + }, + { + "epoch": 2.6085547634478288, + "grad_norm": 0.9016642570495605, + "learning_rate": 2.199467576440928e-07, + "loss": 0.0629, + "step": 16100 + }, + { + "epoch": 2.6087167854828257, + "grad_norm": 0.7842551469802856, + "learning_rate": 2.1976743537907546e-07, + "loss": 0.055, + "step": 16101 + }, + { + "epoch": 2.608878807517822, + "grad_norm": 0.8693322539329529, + "learning_rate": 2.195881828836563e-07, + "loss": 0.0594, + "step": 16102 + }, + { + "epoch": 2.609040829552819, + "grad_norm": 1.128956913948059, + "learning_rate": 2.194090001633206e-07, + "loss": 0.0671, + "step": 16103 + }, + { + "epoch": 2.609202851587816, + "grad_norm": 0.8959473371505737, + "learning_rate": 2.1922988722355044e-07, + "loss": 0.0583, + "step": 16104 + }, + { + "epoch": 2.6093648736228126, + "grad_norm": 0.9709823131561279, + "learning_rate": 2.1905084406982663e-07, + "loss": 0.0631, + "step": 16105 + }, + { + "epoch": 2.6095268956578095, + "grad_norm": 1.0241434574127197, + "learning_rate": 2.188718707076265e-07, + "loss": 0.0605, + "step": 16106 + }, + { + "epoch": 2.6096889176928064, + "grad_norm": 1.0129493474960327, + "learning_rate": 2.1869296714242732e-07, + "loss": 0.0609, + "step": 16107 + }, + { + "epoch": 2.609850939727803, + "grad_norm": 0.9752632975578308, + "learning_rate": 2.185141333797025e-07, + "loss": 0.0576, + "step": 16108 + }, + { + "epoch": 2.6100129617628, + "grad_norm": 0.8120836019515991, + "learning_rate": 2.183353694249249e-07, + "loss": 0.048, + "step": 16109 + }, + { + "epoch": 2.6101749837977968, + "grad_norm": 0.946182370185852, + "learning_rate": 2.181566752835626e-07, + "loss": 0.0621, + "step": 16110 + }, + { + "epoch": 2.6103370058327933, + "grad_norm": 0.913953959941864, + "learning_rate": 2.1797805096108405e-07, + "loss": 0.0609, + "step": 16111 + }, + { + "epoch": 2.6104990278677898, + "grad_norm": 0.9735072255134583, + "learning_rate": 2.177994964629554e-07, + "loss": 0.0589, + "step": 16112 + }, + { + "epoch": 2.6106610499027867, + "grad_norm": 0.8608875274658203, + "learning_rate": 2.1762101179463896e-07, + "loss": 0.064, + "step": 16113 + }, + { + "epoch": 2.6108230719377836, + "grad_norm": 0.9533604979515076, + "learning_rate": 2.174425969615962e-07, + "loss": 0.0635, + "step": 16114 + }, + { + "epoch": 2.61098509397278, + "grad_norm": 0.88776695728302, + "learning_rate": 2.1726425196928663e-07, + "loss": 0.0543, + "step": 16115 + }, + { + "epoch": 2.611147116007777, + "grad_norm": 1.0538303852081299, + "learning_rate": 2.1708597682316645e-07, + "loss": 0.0626, + "step": 16116 + }, + { + "epoch": 2.611309138042774, + "grad_norm": 0.8502702713012695, + "learning_rate": 2.1690777152869103e-07, + "loss": 0.0554, + "step": 16117 + }, + { + "epoch": 2.6114711600777705, + "grad_norm": 0.9557256102561951, + "learning_rate": 2.1672963609131292e-07, + "loss": 0.0582, + "step": 16118 + }, + { + "epoch": 2.6116331821127674, + "grad_norm": 1.0968202352523804, + "learning_rate": 2.1655157051648223e-07, + "loss": 0.0612, + "step": 16119 + }, + { + "epoch": 2.6117952041477643, + "grad_norm": 0.9737009406089783, + "learning_rate": 2.1637357480964821e-07, + "loss": 0.0624, + "step": 16120 + }, + { + "epoch": 2.611957226182761, + "grad_norm": 0.9180959463119507, + "learning_rate": 2.1619564897625566e-07, + "loss": 0.0625, + "step": 16121 + }, + { + "epoch": 2.6121192482177578, + "grad_norm": 0.9389775395393372, + "learning_rate": 2.1601779302175026e-07, + "loss": 0.0632, + "step": 16122 + }, + { + "epoch": 2.6122812702527543, + "grad_norm": 0.9112357497215271, + "learning_rate": 2.158400069515734e-07, + "loss": 0.0631, + "step": 16123 + }, + { + "epoch": 2.612443292287751, + "grad_norm": 0.9230049848556519, + "learning_rate": 2.1566229077116445e-07, + "loss": 0.058, + "step": 16124 + }, + { + "epoch": 2.6126053143227477, + "grad_norm": 0.9439942240715027, + "learning_rate": 2.1548464448596123e-07, + "loss": 0.0643, + "step": 16125 + }, + { + "epoch": 2.6127673363577446, + "grad_norm": 0.8843783140182495, + "learning_rate": 2.1530706810139913e-07, + "loss": 0.064, + "step": 16126 + }, + { + "epoch": 2.6129293583927415, + "grad_norm": 0.980243980884552, + "learning_rate": 2.1512956162291294e-07, + "loss": 0.0633, + "step": 16127 + }, + { + "epoch": 2.613091380427738, + "grad_norm": 0.9535460472106934, + "learning_rate": 2.1495212505593221e-07, + "loss": 0.0688, + "step": 16128 + }, + { + "epoch": 2.613253402462735, + "grad_norm": 0.9649513959884644, + "learning_rate": 2.147747584058868e-07, + "loss": 0.0587, + "step": 16129 + }, + { + "epoch": 2.613415424497732, + "grad_norm": 0.9310510158538818, + "learning_rate": 2.1459746167820372e-07, + "loss": 0.0603, + "step": 16130 + }, + { + "epoch": 2.6135774465327284, + "grad_norm": 1.098154902458191, + "learning_rate": 2.1442023487830782e-07, + "loss": 0.0632, + "step": 16131 + }, + { + "epoch": 2.6137394685677253, + "grad_norm": 0.9631187915802002, + "learning_rate": 2.142430780116214e-07, + "loss": 0.0624, + "step": 16132 + }, + { + "epoch": 2.613901490602722, + "grad_norm": 1.261637568473816, + "learning_rate": 2.1406599108356573e-07, + "loss": 0.067, + "step": 16133 + }, + { + "epoch": 2.6140635126377187, + "grad_norm": 0.8249843120574951, + "learning_rate": 2.1388897409955867e-07, + "loss": 0.0528, + "step": 16134 + }, + { + "epoch": 2.6142255346727152, + "grad_norm": 0.8405267596244812, + "learning_rate": 2.1371202706501697e-07, + "loss": 0.0559, + "step": 16135 + }, + { + "epoch": 2.614387556707712, + "grad_norm": 0.8323154449462891, + "learning_rate": 2.1353514998535414e-07, + "loss": 0.0581, + "step": 16136 + }, + { + "epoch": 2.614549578742709, + "grad_norm": 0.9908446669578552, + "learning_rate": 2.13358342865983e-07, + "loss": 0.0634, + "step": 16137 + }, + { + "epoch": 2.6147116007777056, + "grad_norm": 0.9334249496459961, + "learning_rate": 2.1318160571231316e-07, + "loss": 0.0594, + "step": 16138 + }, + { + "epoch": 2.6148736228127025, + "grad_norm": 0.9564669728279114, + "learning_rate": 2.1300493852975167e-07, + "loss": 0.0613, + "step": 16139 + }, + { + "epoch": 2.6150356448476995, + "grad_norm": 1.0061742067337036, + "learning_rate": 2.128283413237045e-07, + "loss": 0.0643, + "step": 16140 + }, + { + "epoch": 2.615197666882696, + "grad_norm": 0.8926098942756653, + "learning_rate": 2.1265181409957537e-07, + "loss": 0.0605, + "step": 16141 + }, + { + "epoch": 2.615359688917693, + "grad_norm": 0.8778125047683716, + "learning_rate": 2.1247535686276632e-07, + "loss": 0.0568, + "step": 16142 + }, + { + "epoch": 2.61552171095269, + "grad_norm": 0.8341763615608215, + "learning_rate": 2.1229896961867475e-07, + "loss": 0.0596, + "step": 16143 + }, + { + "epoch": 2.6156837329876863, + "grad_norm": 1.0330524444580078, + "learning_rate": 2.121226523726988e-07, + "loss": 0.07, + "step": 16144 + }, + { + "epoch": 2.6158457550226832, + "grad_norm": 0.8376036286354065, + "learning_rate": 2.1194640513023306e-07, + "loss": 0.0545, + "step": 16145 + }, + { + "epoch": 2.6160077770576797, + "grad_norm": 0.8829439282417297, + "learning_rate": 2.1177022789667045e-07, + "loss": 0.059, + "step": 16146 + }, + { + "epoch": 2.6161697990926767, + "grad_norm": 0.9374154806137085, + "learning_rate": 2.1159412067740136e-07, + "loss": 0.0578, + "step": 16147 + }, + { + "epoch": 2.616331821127673, + "grad_norm": 0.9516973495483398, + "learning_rate": 2.1141808347781428e-07, + "loss": 0.0641, + "step": 16148 + }, + { + "epoch": 2.61649384316267, + "grad_norm": 0.885984480381012, + "learning_rate": 2.1124211630329571e-07, + "loss": 0.057, + "step": 16149 + }, + { + "epoch": 2.616655865197667, + "grad_norm": 0.9362042546272278, + "learning_rate": 2.110662191592297e-07, + "loss": 0.0587, + "step": 16150 + }, + { + "epoch": 2.6168178872326635, + "grad_norm": 1.1420210599899292, + "learning_rate": 2.1089039205099832e-07, + "loss": 0.0599, + "step": 16151 + }, + { + "epoch": 2.6169799092676604, + "grad_norm": 0.8978714346885681, + "learning_rate": 2.1071463498398114e-07, + "loss": 0.0628, + "step": 16152 + }, + { + "epoch": 2.6171419313026574, + "grad_norm": 0.8524075746536255, + "learning_rate": 2.1053894796355694e-07, + "loss": 0.0582, + "step": 16153 + }, + { + "epoch": 2.617303953337654, + "grad_norm": 0.8688123822212219, + "learning_rate": 2.103633309950995e-07, + "loss": 0.0616, + "step": 16154 + }, + { + "epoch": 2.617465975372651, + "grad_norm": 0.8266528844833374, + "learning_rate": 2.101877840839836e-07, + "loss": 0.0588, + "step": 16155 + }, + { + "epoch": 2.6176279974076473, + "grad_norm": 0.9983370900154114, + "learning_rate": 2.1001230723558087e-07, + "loss": 0.0625, + "step": 16156 + }, + { + "epoch": 2.6177900194426442, + "grad_norm": 0.835728645324707, + "learning_rate": 2.0983690045525944e-07, + "loss": 0.0593, + "step": 16157 + }, + { + "epoch": 2.6179520414776407, + "grad_norm": 0.8325420618057251, + "learning_rate": 2.0966156374838677e-07, + "loss": 0.0582, + "step": 16158 + }, + { + "epoch": 2.6181140635126376, + "grad_norm": 0.8607774376869202, + "learning_rate": 2.0948629712032738e-07, + "loss": 0.0552, + "step": 16159 + }, + { + "epoch": 2.6182760855476346, + "grad_norm": 1.0366528034210205, + "learning_rate": 2.0931110057644505e-07, + "loss": 0.0654, + "step": 16160 + }, + { + "epoch": 2.618438107582631, + "grad_norm": 1.09884512424469, + "learning_rate": 2.0913597412209941e-07, + "loss": 0.0653, + "step": 16161 + }, + { + "epoch": 2.618600129617628, + "grad_norm": 1.109409213066101, + "learning_rate": 2.089609177626492e-07, + "loss": 0.0675, + "step": 16162 + }, + { + "epoch": 2.618762151652625, + "grad_norm": 0.9305920004844666, + "learning_rate": 2.0878593150345043e-07, + "loss": 0.057, + "step": 16163 + }, + { + "epoch": 2.6189241736876214, + "grad_norm": 0.9153008460998535, + "learning_rate": 2.0861101534985774e-07, + "loss": 0.0565, + "step": 16164 + }, + { + "epoch": 2.6190861957226184, + "grad_norm": 0.9763879179954529, + "learning_rate": 2.0843616930722288e-07, + "loss": 0.0676, + "step": 16165 + }, + { + "epoch": 2.6192482177576153, + "grad_norm": 0.9485346078872681, + "learning_rate": 2.082613933808958e-07, + "loss": 0.0615, + "step": 16166 + }, + { + "epoch": 2.619410239792612, + "grad_norm": 0.7745640277862549, + "learning_rate": 2.0808668757622413e-07, + "loss": 0.0542, + "step": 16167 + }, + { + "epoch": 2.6195722618276087, + "grad_norm": 0.9337866902351379, + "learning_rate": 2.079120518985539e-07, + "loss": 0.0631, + "step": 16168 + }, + { + "epoch": 2.619734283862605, + "grad_norm": 0.7666480541229248, + "learning_rate": 2.077374863532275e-07, + "loss": 0.0533, + "step": 16169 + }, + { + "epoch": 2.619896305897602, + "grad_norm": 0.8683062791824341, + "learning_rate": 2.07562990945587e-07, + "loss": 0.0573, + "step": 16170 + }, + { + "epoch": 2.6200583279325986, + "grad_norm": 0.9523324370384216, + "learning_rate": 2.073885656809718e-07, + "loss": 0.061, + "step": 16171 + }, + { + "epoch": 2.6202203499675956, + "grad_norm": 0.855353832244873, + "learning_rate": 2.0721421056471818e-07, + "loss": 0.0556, + "step": 16172 + }, + { + "epoch": 2.6203823720025925, + "grad_norm": 0.924647331237793, + "learning_rate": 2.0703992560216075e-07, + "loss": 0.0608, + "step": 16173 + }, + { + "epoch": 2.620544394037589, + "grad_norm": 0.9063007831573486, + "learning_rate": 2.0686571079863383e-07, + "loss": 0.0636, + "step": 16174 + }, + { + "epoch": 2.620706416072586, + "grad_norm": 0.9852257370948792, + "learning_rate": 2.0669156615946623e-07, + "loss": 0.0637, + "step": 16175 + }, + { + "epoch": 2.620868438107583, + "grad_norm": 0.8728942275047302, + "learning_rate": 2.0651749168998703e-07, + "loss": 0.0603, + "step": 16176 + }, + { + "epoch": 2.6210304601425793, + "grad_norm": 0.9853529334068298, + "learning_rate": 2.0634348739552251e-07, + "loss": 0.065, + "step": 16177 + }, + { + "epoch": 2.6211924821775763, + "grad_norm": 0.9242932200431824, + "learning_rate": 2.0616955328139675e-07, + "loss": 0.0658, + "step": 16178 + }, + { + "epoch": 2.6213545042125728, + "grad_norm": 0.8449025750160217, + "learning_rate": 2.059956893529319e-07, + "loss": 0.0517, + "step": 16179 + }, + { + "epoch": 2.6215165262475697, + "grad_norm": 0.9864807724952698, + "learning_rate": 2.058218956154473e-07, + "loss": 0.063, + "step": 16180 + }, + { + "epoch": 2.621678548282566, + "grad_norm": 1.0352917909622192, + "learning_rate": 2.0564817207426092e-07, + "loss": 0.0568, + "step": 16181 + }, + { + "epoch": 2.621840570317563, + "grad_norm": 1.0253640413284302, + "learning_rate": 2.0547451873468877e-07, + "loss": 0.0694, + "step": 16182 + }, + { + "epoch": 2.62200259235256, + "grad_norm": 0.8204119205474854, + "learning_rate": 2.0530093560204272e-07, + "loss": 0.0532, + "step": 16183 + }, + { + "epoch": 2.6221646143875565, + "grad_norm": 0.9108116030693054, + "learning_rate": 2.051274226816355e-07, + "loss": 0.0631, + "step": 16184 + }, + { + "epoch": 2.6223266364225535, + "grad_norm": 0.8796073198318481, + "learning_rate": 2.0495397997877558e-07, + "loss": 0.0594, + "step": 16185 + }, + { + "epoch": 2.6224886584575504, + "grad_norm": 0.931476891040802, + "learning_rate": 2.0478060749877044e-07, + "loss": 0.0647, + "step": 16186 + }, + { + "epoch": 2.622650680492547, + "grad_norm": 0.9860643148422241, + "learning_rate": 2.0460730524692384e-07, + "loss": 0.061, + "step": 16187 + }, + { + "epoch": 2.622812702527544, + "grad_norm": 0.9249363541603088, + "learning_rate": 2.0443407322853882e-07, + "loss": 0.058, + "step": 16188 + }, + { + "epoch": 2.6229747245625408, + "grad_norm": 0.9313977956771851, + "learning_rate": 2.0426091144891664e-07, + "loss": 0.0585, + "step": 16189 + }, + { + "epoch": 2.6231367465975373, + "grad_norm": 0.8045008182525635, + "learning_rate": 2.0408781991335446e-07, + "loss": 0.0542, + "step": 16190 + }, + { + "epoch": 2.6232987686325338, + "grad_norm": 0.8684973120689392, + "learning_rate": 2.039147986271492e-07, + "loss": 0.0604, + "step": 16191 + }, + { + "epoch": 2.6234607906675307, + "grad_norm": 0.8771528005599976, + "learning_rate": 2.0374184759559463e-07, + "loss": 0.053, + "step": 16192 + }, + { + "epoch": 2.6236228127025276, + "grad_norm": 0.9225574731826782, + "learning_rate": 2.0356896682398264e-07, + "loss": 0.0605, + "step": 16193 + }, + { + "epoch": 2.623784834737524, + "grad_norm": 0.8193483948707581, + "learning_rate": 2.033961563176029e-07, + "loss": 0.0596, + "step": 16194 + }, + { + "epoch": 2.623946856772521, + "grad_norm": 1.0594878196716309, + "learning_rate": 2.0322341608174338e-07, + "loss": 0.0619, + "step": 16195 + }, + { + "epoch": 2.624108878807518, + "grad_norm": 0.7767667770385742, + "learning_rate": 2.0305074612168906e-07, + "loss": 0.0532, + "step": 16196 + }, + { + "epoch": 2.6242709008425145, + "grad_norm": 0.8942044377326965, + "learning_rate": 2.0287814644272347e-07, + "loss": 0.0574, + "step": 16197 + }, + { + "epoch": 2.6244329228775114, + "grad_norm": 0.8722692728042603, + "learning_rate": 2.0270561705012765e-07, + "loss": 0.0565, + "step": 16198 + }, + { + "epoch": 2.6245949449125083, + "grad_norm": 0.9796530604362488, + "learning_rate": 2.0253315794918043e-07, + "loss": 0.0698, + "step": 16199 + }, + { + "epoch": 2.624756966947505, + "grad_norm": 0.9763656854629517, + "learning_rate": 2.0236076914515956e-07, + "loss": 0.0721, + "step": 16200 + }, + { + "epoch": 2.6249189889825018, + "grad_norm": 1.099333643913269, + "learning_rate": 2.021884506433383e-07, + "loss": 0.0629, + "step": 16201 + }, + { + "epoch": 2.6250810110174982, + "grad_norm": 0.8187487721443176, + "learning_rate": 2.020162024489894e-07, + "loss": 0.059, + "step": 16202 + }, + { + "epoch": 2.625243033052495, + "grad_norm": 0.874085009098053, + "learning_rate": 2.0184402456738444e-07, + "loss": 0.0601, + "step": 16203 + }, + { + "epoch": 2.6254050550874917, + "grad_norm": 0.9488238096237183, + "learning_rate": 2.0167191700379092e-07, + "loss": 0.0593, + "step": 16204 + }, + { + "epoch": 2.6255670771224886, + "grad_norm": 1.0066373348236084, + "learning_rate": 2.0149987976347485e-07, + "loss": 0.0596, + "step": 16205 + }, + { + "epoch": 2.6257290991574855, + "grad_norm": 0.8716149926185608, + "learning_rate": 2.0132791285169985e-07, + "loss": 0.0615, + "step": 16206 + }, + { + "epoch": 2.625891121192482, + "grad_norm": 0.8857298493385315, + "learning_rate": 2.0115601627372832e-07, + "loss": 0.0551, + "step": 16207 + }, + { + "epoch": 2.626053143227479, + "grad_norm": 0.8469251394271851, + "learning_rate": 2.0098419003481946e-07, + "loss": 0.0573, + "step": 16208 + }, + { + "epoch": 2.626215165262476, + "grad_norm": 0.8539047837257385, + "learning_rate": 2.0081243414023067e-07, + "loss": 0.0561, + "step": 16209 + }, + { + "epoch": 2.6263771872974724, + "grad_norm": 0.9203428626060486, + "learning_rate": 2.0064074859521777e-07, + "loss": 0.0624, + "step": 16210 + }, + { + "epoch": 2.6265392093324693, + "grad_norm": 0.9050666093826294, + "learning_rate": 2.004691334050335e-07, + "loss": 0.0579, + "step": 16211 + }, + { + "epoch": 2.6267012313674662, + "grad_norm": 0.9167254567146301, + "learning_rate": 2.0029758857492893e-07, + "loss": 0.0663, + "step": 16212 + }, + { + "epoch": 2.6268632534024627, + "grad_norm": 0.9239572882652283, + "learning_rate": 2.001261141101532e-07, + "loss": 0.0646, + "step": 16213 + }, + { + "epoch": 2.6270252754374592, + "grad_norm": 0.7515432238578796, + "learning_rate": 1.9995471001595267e-07, + "loss": 0.0568, + "step": 16214 + }, + { + "epoch": 2.627187297472456, + "grad_norm": 0.834296703338623, + "learning_rate": 1.9978337629757233e-07, + "loss": 0.0564, + "step": 16215 + }, + { + "epoch": 2.627349319507453, + "grad_norm": 0.78092360496521, + "learning_rate": 1.9961211296025352e-07, + "loss": 0.0565, + "step": 16216 + }, + { + "epoch": 2.6275113415424496, + "grad_norm": 0.9915261268615723, + "learning_rate": 1.994409200092376e-07, + "loss": 0.0632, + "step": 16217 + }, + { + "epoch": 2.6276733635774465, + "grad_norm": 1.0370954275131226, + "learning_rate": 1.992697974497629e-07, + "loss": 0.066, + "step": 16218 + }, + { + "epoch": 2.6278353856124435, + "grad_norm": 0.905182957649231, + "learning_rate": 1.9909874528706407e-07, + "loss": 0.0663, + "step": 16219 + }, + { + "epoch": 2.62799740764744, + "grad_norm": 0.8637315034866333, + "learning_rate": 1.989277635263756e-07, + "loss": 0.0556, + "step": 16220 + }, + { + "epoch": 2.628159429682437, + "grad_norm": 0.8456125855445862, + "learning_rate": 1.9875685217292856e-07, + "loss": 0.0563, + "step": 16221 + }, + { + "epoch": 2.628321451717434, + "grad_norm": 0.8584468960762024, + "learning_rate": 1.9858601123195403e-07, + "loss": 0.0578, + "step": 16222 + }, + { + "epoch": 2.6284834737524303, + "grad_norm": 0.9558644890785217, + "learning_rate": 1.9841524070867784e-07, + "loss": 0.0656, + "step": 16223 + }, + { + "epoch": 2.6286454957874272, + "grad_norm": 1.1154943704605103, + "learning_rate": 1.9824454060832526e-07, + "loss": 0.0623, + "step": 16224 + }, + { + "epoch": 2.6288075178224237, + "grad_norm": 0.9311351776123047, + "learning_rate": 1.980739109361199e-07, + "loss": 0.0581, + "step": 16225 + }, + { + "epoch": 2.6289695398574207, + "grad_norm": 1.0172123908996582, + "learning_rate": 1.9790335169728197e-07, + "loss": 0.0644, + "step": 16226 + }, + { + "epoch": 2.629131561892417, + "grad_norm": 1.0837866067886353, + "learning_rate": 1.977328628970307e-07, + "loss": 0.0607, + "step": 16227 + }, + { + "epoch": 2.629293583927414, + "grad_norm": 0.8728600144386292, + "learning_rate": 1.9756244454058244e-07, + "loss": 0.06, + "step": 16228 + }, + { + "epoch": 2.629455605962411, + "grad_norm": 0.9363974332809448, + "learning_rate": 1.9739209663315162e-07, + "loss": 0.0597, + "step": 16229 + }, + { + "epoch": 2.6296176279974075, + "grad_norm": 0.9871296882629395, + "learning_rate": 1.9722181917995103e-07, + "loss": 0.0646, + "step": 16230 + }, + { + "epoch": 2.6297796500324044, + "grad_norm": 0.8740096092224121, + "learning_rate": 1.9705161218618902e-07, + "loss": 0.0625, + "step": 16231 + }, + { + "epoch": 2.6299416720674014, + "grad_norm": 1.0138083696365356, + "learning_rate": 1.9688147565707528e-07, + "loss": 0.0668, + "step": 16232 + }, + { + "epoch": 2.630103694102398, + "grad_norm": 0.7797446846961975, + "learning_rate": 1.967114095978151e-07, + "loss": 0.0515, + "step": 16233 + }, + { + "epoch": 2.630265716137395, + "grad_norm": 0.9009860754013062, + "learning_rate": 1.9654141401361183e-07, + "loss": 0.0583, + "step": 16234 + }, + { + "epoch": 2.6304277381723913, + "grad_norm": 0.9890360236167908, + "learning_rate": 1.9637148890966685e-07, + "loss": 0.0623, + "step": 16235 + }, + { + "epoch": 2.630589760207388, + "grad_norm": 0.7815765738487244, + "learning_rate": 1.9620163429117906e-07, + "loss": 0.0524, + "step": 16236 + }, + { + "epoch": 2.6307517822423847, + "grad_norm": 0.9130102396011353, + "learning_rate": 1.9603185016334737e-07, + "loss": 0.0572, + "step": 16237 + }, + { + "epoch": 2.6309138042773816, + "grad_norm": 0.9095372557640076, + "learning_rate": 1.958621365313648e-07, + "loss": 0.0609, + "step": 16238 + }, + { + "epoch": 2.6310758263123786, + "grad_norm": 0.7958774566650391, + "learning_rate": 1.9569249340042534e-07, + "loss": 0.0591, + "step": 16239 + }, + { + "epoch": 2.631237848347375, + "grad_norm": 0.8964948654174805, + "learning_rate": 1.9552292077571894e-07, + "loss": 0.0678, + "step": 16240 + }, + { + "epoch": 2.631399870382372, + "grad_norm": 0.9855856895446777, + "learning_rate": 1.953534186624345e-07, + "loss": 0.065, + "step": 16241 + }, + { + "epoch": 2.631561892417369, + "grad_norm": 1.1027717590332031, + "learning_rate": 1.9518398706575846e-07, + "loss": 0.064, + "step": 16242 + }, + { + "epoch": 2.6317239144523654, + "grad_norm": 0.8115701079368591, + "learning_rate": 1.9501462599087472e-07, + "loss": 0.0496, + "step": 16243 + }, + { + "epoch": 2.6318859364873624, + "grad_norm": 0.9368529915809631, + "learning_rate": 1.948453354429661e-07, + "loss": 0.0613, + "step": 16244 + }, + { + "epoch": 2.6320479585223593, + "grad_norm": 0.8521546721458435, + "learning_rate": 1.946761154272106e-07, + "loss": 0.059, + "step": 16245 + }, + { + "epoch": 2.6322099805573558, + "grad_norm": 0.9154667258262634, + "learning_rate": 1.9450696594878804e-07, + "loss": 0.0632, + "step": 16246 + }, + { + "epoch": 2.6323720025923527, + "grad_norm": 0.8996468782424927, + "learning_rate": 1.9433788701287288e-07, + "loss": 0.0639, + "step": 16247 + }, + { + "epoch": 2.632534024627349, + "grad_norm": 0.8377342820167542, + "learning_rate": 1.941688786246393e-07, + "loss": 0.0593, + "step": 16248 + }, + { + "epoch": 2.632696046662346, + "grad_norm": 0.7824660539627075, + "learning_rate": 1.939999407892576e-07, + "loss": 0.052, + "step": 16249 + }, + { + "epoch": 2.6328580686973426, + "grad_norm": 0.8129716515541077, + "learning_rate": 1.9383107351189672e-07, + "loss": 0.0569, + "step": 16250 + }, + { + "epoch": 2.6330200907323396, + "grad_norm": 0.8622326254844666, + "learning_rate": 1.936622767977253e-07, + "loss": 0.0588, + "step": 16251 + }, + { + "epoch": 2.6331821127673365, + "grad_norm": 0.9913507103919983, + "learning_rate": 1.9349355065190618e-07, + "loss": 0.0662, + "step": 16252 + }, + { + "epoch": 2.633344134802333, + "grad_norm": 0.8238366842269897, + "learning_rate": 1.9332489507960324e-07, + "loss": 0.0566, + "step": 16253 + }, + { + "epoch": 2.63350615683733, + "grad_norm": 0.9328113794326782, + "learning_rate": 1.9315631008597596e-07, + "loss": 0.0648, + "step": 16254 + }, + { + "epoch": 2.633668178872327, + "grad_norm": 0.9620225429534912, + "learning_rate": 1.9298779567618357e-07, + "loss": 0.0593, + "step": 16255 + }, + { + "epoch": 2.6338302009073233, + "grad_norm": 0.7656645774841309, + "learning_rate": 1.9281935185538141e-07, + "loss": 0.0511, + "step": 16256 + }, + { + "epoch": 2.6339922229423203, + "grad_norm": 0.8839242458343506, + "learning_rate": 1.9265097862872423e-07, + "loss": 0.0611, + "step": 16257 + }, + { + "epoch": 2.6341542449773168, + "grad_norm": 0.8976559638977051, + "learning_rate": 1.9248267600136317e-07, + "loss": 0.0574, + "step": 16258 + }, + { + "epoch": 2.6343162670123137, + "grad_norm": 1.1227093935012817, + "learning_rate": 1.9231444397844855e-07, + "loss": 0.0671, + "step": 16259 + }, + { + "epoch": 2.63447828904731, + "grad_norm": 0.8992955088615417, + "learning_rate": 1.9214628256512656e-07, + "loss": 0.0623, + "step": 16260 + }, + { + "epoch": 2.634640311082307, + "grad_norm": 1.0141760110855103, + "learning_rate": 1.919781917665439e-07, + "loss": 0.0653, + "step": 16261 + }, + { + "epoch": 2.634802333117304, + "grad_norm": 1.1703956127166748, + "learning_rate": 1.918101715878437e-07, + "loss": 0.0751, + "step": 16262 + }, + { + "epoch": 2.6349643551523005, + "grad_norm": 0.9227219820022583, + "learning_rate": 1.9164222203416627e-07, + "loss": 0.0644, + "step": 16263 + }, + { + "epoch": 2.6351263771872975, + "grad_norm": 0.804478645324707, + "learning_rate": 1.9147434311065028e-07, + "loss": 0.0484, + "step": 16264 + }, + { + "epoch": 2.6352883992222944, + "grad_norm": 0.8943800926208496, + "learning_rate": 1.91306534822433e-07, + "loss": 0.0589, + "step": 16265 + }, + { + "epoch": 2.635450421257291, + "grad_norm": 0.9125702977180481, + "learning_rate": 1.911387971746495e-07, + "loss": 0.0503, + "step": 16266 + }, + { + "epoch": 2.635612443292288, + "grad_norm": 0.9437099099159241, + "learning_rate": 1.9097113017243097e-07, + "loss": 0.0515, + "step": 16267 + }, + { + "epoch": 2.6357744653272848, + "grad_norm": 0.9175340533256531, + "learning_rate": 1.9080353382090798e-07, + "loss": 0.0571, + "step": 16268 + }, + { + "epoch": 2.6359364873622813, + "grad_norm": 1.21630859375, + "learning_rate": 1.9063600812520898e-07, + "loss": 0.0573, + "step": 16269 + }, + { + "epoch": 2.636098509397278, + "grad_norm": 1.0302997827529907, + "learning_rate": 1.9046855309045957e-07, + "loss": 0.0666, + "step": 16270 + }, + { + "epoch": 2.6362605314322747, + "grad_norm": 0.8015779256820679, + "learning_rate": 1.9030116872178317e-07, + "loss": 0.0546, + "step": 16271 + }, + { + "epoch": 2.6364225534672716, + "grad_norm": 1.0317645072937012, + "learning_rate": 1.9013385502430175e-07, + "loss": 0.0664, + "step": 16272 + }, + { + "epoch": 2.636584575502268, + "grad_norm": 0.9456698894500732, + "learning_rate": 1.899666120031349e-07, + "loss": 0.0589, + "step": 16273 + }, + { + "epoch": 2.636746597537265, + "grad_norm": 0.9725162386894226, + "learning_rate": 1.8979943966339924e-07, + "loss": 0.065, + "step": 16274 + }, + { + "epoch": 2.636908619572262, + "grad_norm": 0.8339110016822815, + "learning_rate": 1.8963233801021024e-07, + "loss": 0.0542, + "step": 16275 + }, + { + "epoch": 2.6370706416072585, + "grad_norm": 0.991547167301178, + "learning_rate": 1.8946530704868072e-07, + "loss": 0.0624, + "step": 16276 + }, + { + "epoch": 2.6372326636422554, + "grad_norm": 0.8456936478614807, + "learning_rate": 1.8929834678392184e-07, + "loss": 0.0566, + "step": 16277 + }, + { + "epoch": 2.6373946856772523, + "grad_norm": 0.9428996443748474, + "learning_rate": 1.891314572210412e-07, + "loss": 0.0575, + "step": 16278 + }, + { + "epoch": 2.637556707712249, + "grad_norm": 0.9978534579277039, + "learning_rate": 1.8896463836514556e-07, + "loss": 0.0676, + "step": 16279 + }, + { + "epoch": 2.6377187297472457, + "grad_norm": 0.837712824344635, + "learning_rate": 1.887978902213397e-07, + "loss": 0.0553, + "step": 16280 + }, + { + "epoch": 2.6378807517822422, + "grad_norm": 1.049025058746338, + "learning_rate": 1.8863121279472595e-07, + "loss": 0.0626, + "step": 16281 + }, + { + "epoch": 2.638042773817239, + "grad_norm": 1.0920323133468628, + "learning_rate": 1.8846460609040302e-07, + "loss": 0.0624, + "step": 16282 + }, + { + "epoch": 2.6382047958522357, + "grad_norm": 0.900133490562439, + "learning_rate": 1.88298070113469e-07, + "loss": 0.0582, + "step": 16283 + }, + { + "epoch": 2.6383668178872326, + "grad_norm": 0.9234185814857483, + "learning_rate": 1.881316048690207e-07, + "loss": 0.0614, + "step": 16284 + }, + { + "epoch": 2.6385288399222295, + "grad_norm": 0.9673925042152405, + "learning_rate": 1.8796521036215044e-07, + "loss": 0.0552, + "step": 16285 + }, + { + "epoch": 2.638690861957226, + "grad_norm": 0.9933484792709351, + "learning_rate": 1.8779888659794937e-07, + "loss": 0.0658, + "step": 16286 + }, + { + "epoch": 2.638852883992223, + "grad_norm": 0.8367605209350586, + "learning_rate": 1.8763263358150735e-07, + "loss": 0.0614, + "step": 16287 + }, + { + "epoch": 2.63901490602722, + "grad_norm": 0.9234703779220581, + "learning_rate": 1.874664513179106e-07, + "loss": 0.0558, + "step": 16288 + }, + { + "epoch": 2.6391769280622164, + "grad_norm": 1.0613017082214355, + "learning_rate": 1.8730033981224443e-07, + "loss": 0.0688, + "step": 16289 + }, + { + "epoch": 2.6393389500972133, + "grad_norm": 1.0138506889343262, + "learning_rate": 1.8713429906959097e-07, + "loss": 0.0574, + "step": 16290 + }, + { + "epoch": 2.6395009721322102, + "grad_norm": 0.9312925338745117, + "learning_rate": 1.869683290950311e-07, + "loss": 0.0671, + "step": 16291 + }, + { + "epoch": 2.6396629941672067, + "grad_norm": 0.90425705909729, + "learning_rate": 1.8680242989364327e-07, + "loss": 0.0607, + "step": 16292 + }, + { + "epoch": 2.6398250162022032, + "grad_norm": 1.0604413747787476, + "learning_rate": 1.8663660147050262e-07, + "loss": 0.0629, + "step": 16293 + }, + { + "epoch": 2.6399870382372, + "grad_norm": 0.7951275706291199, + "learning_rate": 1.8647084383068393e-07, + "loss": 0.051, + "step": 16294 + }, + { + "epoch": 2.640149060272197, + "grad_norm": 0.8711312413215637, + "learning_rate": 1.8630515697925927e-07, + "loss": 0.0563, + "step": 16295 + }, + { + "epoch": 2.6403110823071936, + "grad_norm": 1.0285757780075073, + "learning_rate": 1.8613954092129738e-07, + "loss": 0.0657, + "step": 16296 + }, + { + "epoch": 2.6404731043421905, + "grad_norm": 0.9499953985214233, + "learning_rate": 1.8597399566186615e-07, + "loss": 0.06, + "step": 16297 + }, + { + "epoch": 2.6406351263771874, + "grad_norm": 0.8965511918067932, + "learning_rate": 1.858085212060304e-07, + "loss": 0.0599, + "step": 16298 + }, + { + "epoch": 2.640797148412184, + "grad_norm": 1.0099302530288696, + "learning_rate": 1.8564311755885438e-07, + "loss": 0.0557, + "step": 16299 + }, + { + "epoch": 2.640959170447181, + "grad_norm": 0.9318955540657043, + "learning_rate": 1.85477784725398e-07, + "loss": 0.0622, + "step": 16300 + }, + { + "epoch": 2.641121192482178, + "grad_norm": 0.9583910703659058, + "learning_rate": 1.8531252271072025e-07, + "loss": 0.0625, + "step": 16301 + }, + { + "epoch": 2.6412832145171743, + "grad_norm": 0.8921169638633728, + "learning_rate": 1.851473315198782e-07, + "loss": 0.0591, + "step": 16302 + }, + { + "epoch": 2.6414452365521712, + "grad_norm": 0.9584546089172363, + "learning_rate": 1.8498221115792554e-07, + "loss": 0.0664, + "step": 16303 + }, + { + "epoch": 2.6416072585871677, + "grad_norm": 0.9304138422012329, + "learning_rate": 1.848171616299152e-07, + "loss": 0.06, + "step": 16304 + }, + { + "epoch": 2.6417692806221647, + "grad_norm": 0.9597314596176147, + "learning_rate": 1.8465218294089704e-07, + "loss": 0.0605, + "step": 16305 + }, + { + "epoch": 2.641931302657161, + "grad_norm": 0.9333338737487793, + "learning_rate": 1.8448727509591951e-07, + "loss": 0.0571, + "step": 16306 + }, + { + "epoch": 2.642093324692158, + "grad_norm": 0.9065783619880676, + "learning_rate": 1.843224381000272e-07, + "loss": 0.0621, + "step": 16307 + }, + { + "epoch": 2.642255346727155, + "grad_norm": 0.7040373086929321, + "learning_rate": 1.8415767195826468e-07, + "loss": 0.0506, + "step": 16308 + }, + { + "epoch": 2.6424173687621515, + "grad_norm": 0.825442373752594, + "learning_rate": 1.8399297667567317e-07, + "loss": 0.0603, + "step": 16309 + }, + { + "epoch": 2.6425793907971484, + "grad_norm": 0.9128686189651489, + "learning_rate": 1.8382835225729256e-07, + "loss": 0.0604, + "step": 16310 + }, + { + "epoch": 2.6427414128321454, + "grad_norm": 0.8846583962440491, + "learning_rate": 1.836637987081588e-07, + "loss": 0.06, + "step": 16311 + }, + { + "epoch": 2.642903434867142, + "grad_norm": 0.8874152302742004, + "learning_rate": 1.834993160333068e-07, + "loss": 0.0601, + "step": 16312 + }, + { + "epoch": 2.643065456902139, + "grad_norm": 1.2292134761810303, + "learning_rate": 1.8333490423777106e-07, + "loss": 0.0675, + "step": 16313 + }, + { + "epoch": 2.6432274789371357, + "grad_norm": 0.841103732585907, + "learning_rate": 1.831705633265804e-07, + "loss": 0.0571, + "step": 16314 + }, + { + "epoch": 2.643389500972132, + "grad_norm": 0.9429198503494263, + "learning_rate": 1.8300629330476383e-07, + "loss": 0.0633, + "step": 16315 + }, + { + "epoch": 2.6435515230071287, + "grad_norm": 1.0920491218566895, + "learning_rate": 1.8284209417734762e-07, + "loss": 0.0608, + "step": 16316 + }, + { + "epoch": 2.6437135450421256, + "grad_norm": 0.9739688634872437, + "learning_rate": 1.8267796594935606e-07, + "loss": 0.0637, + "step": 16317 + }, + { + "epoch": 2.6438755670771226, + "grad_norm": 1.1021599769592285, + "learning_rate": 1.8251390862581097e-07, + "loss": 0.0693, + "step": 16318 + }, + { + "epoch": 2.644037589112119, + "grad_norm": 0.9459995627403259, + "learning_rate": 1.8234992221173198e-07, + "loss": 0.0639, + "step": 16319 + }, + { + "epoch": 2.644199611147116, + "grad_norm": 0.9659364223480225, + "learning_rate": 1.8218600671213698e-07, + "loss": 0.06, + "step": 16320 + }, + { + "epoch": 2.644361633182113, + "grad_norm": 0.954434871673584, + "learning_rate": 1.8202216213204144e-07, + "loss": 0.0613, + "step": 16321 + }, + { + "epoch": 2.6445236552171094, + "grad_norm": 0.8224755525588989, + "learning_rate": 1.8185838847645743e-07, + "loss": 0.055, + "step": 16322 + }, + { + "epoch": 2.6446856772521063, + "grad_norm": 0.9007136821746826, + "learning_rate": 1.8169468575039735e-07, + "loss": 0.0645, + "step": 16323 + }, + { + "epoch": 2.6448476992871033, + "grad_norm": 1.1272404193878174, + "learning_rate": 1.8153105395886967e-07, + "loss": 0.0585, + "step": 16324 + }, + { + "epoch": 2.6450097213220998, + "grad_norm": 0.8485245108604431, + "learning_rate": 1.813674931068818e-07, + "loss": 0.0603, + "step": 16325 + }, + { + "epoch": 2.6451717433570967, + "grad_norm": 0.8735572099685669, + "learning_rate": 1.8120400319943692e-07, + "loss": 0.0557, + "step": 16326 + }, + { + "epoch": 2.645333765392093, + "grad_norm": 0.8615409731864929, + "learning_rate": 1.8104058424153802e-07, + "loss": 0.0594, + "step": 16327 + }, + { + "epoch": 2.64549578742709, + "grad_norm": 0.9410812258720398, + "learning_rate": 1.8087723623818608e-07, + "loss": 0.0596, + "step": 16328 + }, + { + "epoch": 2.6456578094620866, + "grad_norm": 0.7836385369300842, + "learning_rate": 1.8071395919437823e-07, + "loss": 0.0522, + "step": 16329 + }, + { + "epoch": 2.6458198314970836, + "grad_norm": 0.9764214158058167, + "learning_rate": 1.805507531151107e-07, + "loss": 0.067, + "step": 16330 + }, + { + "epoch": 2.6459818535320805, + "grad_norm": 0.8474244475364685, + "learning_rate": 1.8038761800537708e-07, + "loss": 0.0555, + "step": 16331 + }, + { + "epoch": 2.646143875567077, + "grad_norm": 0.8792239427566528, + "learning_rate": 1.8022455387016913e-07, + "loss": 0.0558, + "step": 16332 + }, + { + "epoch": 2.646305897602074, + "grad_norm": 0.8611176609992981, + "learning_rate": 1.8006156071447595e-07, + "loss": 0.0568, + "step": 16333 + }, + { + "epoch": 2.646467919637071, + "grad_norm": 0.9201486706733704, + "learning_rate": 1.7989863854328492e-07, + "loss": 0.0564, + "step": 16334 + }, + { + "epoch": 2.6466299416720673, + "grad_norm": 0.8337270021438599, + "learning_rate": 1.7973578736158098e-07, + "loss": 0.0522, + "step": 16335 + }, + { + "epoch": 2.6467919637070643, + "grad_norm": 0.9367399215698242, + "learning_rate": 1.7957300717434706e-07, + "loss": 0.0577, + "step": 16336 + }, + { + "epoch": 2.6469539857420608, + "grad_norm": 0.913486123085022, + "learning_rate": 1.794102979865639e-07, + "loss": 0.0639, + "step": 16337 + }, + { + "epoch": 2.6471160077770577, + "grad_norm": 0.9401398301124573, + "learning_rate": 1.7924765980320974e-07, + "loss": 0.0588, + "step": 16338 + }, + { + "epoch": 2.647278029812054, + "grad_norm": 0.8506883382797241, + "learning_rate": 1.790850926292617e-07, + "loss": 0.0583, + "step": 16339 + }, + { + "epoch": 2.647440051847051, + "grad_norm": 0.9261347055435181, + "learning_rate": 1.7892259646969278e-07, + "loss": 0.0591, + "step": 16340 + }, + { + "epoch": 2.647602073882048, + "grad_norm": 1.0030386447906494, + "learning_rate": 1.7876017132947483e-07, + "loss": 0.061, + "step": 16341 + }, + { + "epoch": 2.6477640959170445, + "grad_norm": 0.9110730290412903, + "learning_rate": 1.785978172135791e-07, + "loss": 0.0569, + "step": 16342 + }, + { + "epoch": 2.6479261179520415, + "grad_norm": 0.8250390887260437, + "learning_rate": 1.7843553412697278e-07, + "loss": 0.0486, + "step": 16343 + }, + { + "epoch": 2.6480881399870384, + "grad_norm": 1.0518966913223267, + "learning_rate": 1.782733220746205e-07, + "loss": 0.0639, + "step": 16344 + }, + { + "epoch": 2.648250162022035, + "grad_norm": 0.9728784561157227, + "learning_rate": 1.7811118106148633e-07, + "loss": 0.0679, + "step": 16345 + }, + { + "epoch": 2.648412184057032, + "grad_norm": 0.9852555990219116, + "learning_rate": 1.7794911109253105e-07, + "loss": 0.0634, + "step": 16346 + }, + { + "epoch": 2.6485742060920288, + "grad_norm": 0.9500349164009094, + "learning_rate": 1.777871121727137e-07, + "loss": 0.0619, + "step": 16347 + }, + { + "epoch": 2.6487362281270252, + "grad_norm": 0.9328777194023132, + "learning_rate": 1.7762518430699122e-07, + "loss": 0.0637, + "step": 16348 + }, + { + "epoch": 2.648898250162022, + "grad_norm": 1.0019419193267822, + "learning_rate": 1.774633275003179e-07, + "loss": 0.0678, + "step": 16349 + }, + { + "epoch": 2.6490602721970187, + "grad_norm": 1.070502758026123, + "learning_rate": 1.7730154175764623e-07, + "loss": 0.0578, + "step": 16350 + }, + { + "epoch": 2.6492222942320156, + "grad_norm": 0.82248455286026, + "learning_rate": 1.771398270839267e-07, + "loss": 0.054, + "step": 16351 + }, + { + "epoch": 2.649384316267012, + "grad_norm": 1.011540412902832, + "learning_rate": 1.7697818348410722e-07, + "loss": 0.066, + "step": 16352 + }, + { + "epoch": 2.649546338302009, + "grad_norm": 0.9603102207183838, + "learning_rate": 1.7681661096313364e-07, + "loss": 0.0643, + "step": 16353 + }, + { + "epoch": 2.649708360337006, + "grad_norm": 0.9704177379608154, + "learning_rate": 1.7665510952595027e-07, + "loss": 0.0703, + "step": 16354 + }, + { + "epoch": 2.6498703823720025, + "grad_norm": 0.9658411741256714, + "learning_rate": 1.764936791774974e-07, + "loss": 0.0621, + "step": 16355 + }, + { + "epoch": 2.6500324044069994, + "grad_norm": 0.9105367660522461, + "learning_rate": 1.7633231992271572e-07, + "loss": 0.0545, + "step": 16356 + }, + { + "epoch": 2.6501944264419963, + "grad_norm": 0.8852334022521973, + "learning_rate": 1.7617103176654187e-07, + "loss": 0.0669, + "step": 16357 + }, + { + "epoch": 2.650356448476993, + "grad_norm": 0.9062175154685974, + "learning_rate": 1.7600981471391083e-07, + "loss": 0.0594, + "step": 16358 + }, + { + "epoch": 2.6505184705119897, + "grad_norm": 0.8429781794548035, + "learning_rate": 1.7584866876975526e-07, + "loss": 0.0531, + "step": 16359 + }, + { + "epoch": 2.6506804925469862, + "grad_norm": 0.9941640496253967, + "learning_rate": 1.7568759393900597e-07, + "loss": 0.0622, + "step": 16360 + }, + { + "epoch": 2.650842514581983, + "grad_norm": 1.0209076404571533, + "learning_rate": 1.7552659022659206e-07, + "loss": 0.0636, + "step": 16361 + }, + { + "epoch": 2.6510045366169797, + "grad_norm": 1.1594321727752686, + "learning_rate": 1.7536565763743934e-07, + "loss": 0.0646, + "step": 16362 + }, + { + "epoch": 2.6511665586519766, + "grad_norm": 0.9234576225280762, + "learning_rate": 1.7520479617647163e-07, + "loss": 0.0573, + "step": 16363 + }, + { + "epoch": 2.6513285806869735, + "grad_norm": 0.9568017721176147, + "learning_rate": 1.7504400584861137e-07, + "loss": 0.065, + "step": 16364 + }, + { + "epoch": 2.65149060272197, + "grad_norm": 0.9261468052864075, + "learning_rate": 1.7488328665877823e-07, + "loss": 0.0632, + "step": 16365 + }, + { + "epoch": 2.651652624756967, + "grad_norm": 0.8637058734893799, + "learning_rate": 1.7472263861189e-07, + "loss": 0.0605, + "step": 16366 + }, + { + "epoch": 2.651814646791964, + "grad_norm": 0.9233385324478149, + "learning_rate": 1.7456206171286182e-07, + "loss": 0.061, + "step": 16367 + }, + { + "epoch": 2.6519766688269604, + "grad_norm": 0.8062421679496765, + "learning_rate": 1.7440155596660735e-07, + "loss": 0.0519, + "step": 16368 + }, + { + "epoch": 2.6521386908619573, + "grad_norm": 0.8380430340766907, + "learning_rate": 1.7424112137803763e-07, + "loss": 0.0575, + "step": 16369 + }, + { + "epoch": 2.6523007128969542, + "grad_norm": 0.9855707287788391, + "learning_rate": 1.7408075795206037e-07, + "loss": 0.0603, + "step": 16370 + }, + { + "epoch": 2.6524627349319507, + "grad_norm": 1.0595271587371826, + "learning_rate": 1.739204656935839e-07, + "loss": 0.0718, + "step": 16371 + }, + { + "epoch": 2.6526247569669477, + "grad_norm": 1.0735374689102173, + "learning_rate": 1.7376024460751262e-07, + "loss": 0.0748, + "step": 16372 + }, + { + "epoch": 2.652786779001944, + "grad_norm": 0.929942786693573, + "learning_rate": 1.7360009469874788e-07, + "loss": 0.0564, + "step": 16373 + }, + { + "epoch": 2.652948801036941, + "grad_norm": 0.8766565918922424, + "learning_rate": 1.7344001597219024e-07, + "loss": 0.0616, + "step": 16374 + }, + { + "epoch": 2.6531108230719376, + "grad_norm": 0.9398424029350281, + "learning_rate": 1.7328000843273879e-07, + "loss": 0.0622, + "step": 16375 + }, + { + "epoch": 2.6532728451069345, + "grad_norm": 0.9084738492965698, + "learning_rate": 1.7312007208528796e-07, + "loss": 0.0656, + "step": 16376 + }, + { + "epoch": 2.6534348671419314, + "grad_norm": 0.9480365514755249, + "learning_rate": 1.729602069347322e-07, + "loss": 0.061, + "step": 16377 + }, + { + "epoch": 2.653596889176928, + "grad_norm": 0.875299870967865, + "learning_rate": 1.7280041298596257e-07, + "loss": 0.0631, + "step": 16378 + }, + { + "epoch": 2.653758911211925, + "grad_norm": 0.9430073499679565, + "learning_rate": 1.7264069024386876e-07, + "loss": 0.066, + "step": 16379 + }, + { + "epoch": 2.653920933246922, + "grad_norm": 1.0011956691741943, + "learning_rate": 1.7248103871333743e-07, + "loss": 0.0557, + "step": 16380 + }, + { + "epoch": 2.6540829552819183, + "grad_norm": 0.783429741859436, + "learning_rate": 1.7232145839925413e-07, + "loss": 0.0582, + "step": 16381 + }, + { + "epoch": 2.654244977316915, + "grad_norm": 1.0384507179260254, + "learning_rate": 1.7216194930650105e-07, + "loss": 0.0629, + "step": 16382 + }, + { + "epoch": 2.6544069993519117, + "grad_norm": 0.9398157596588135, + "learning_rate": 1.7200251143995983e-07, + "loss": 0.0562, + "step": 16383 + }, + { + "epoch": 2.6545690213869086, + "grad_norm": 1.124776005744934, + "learning_rate": 1.7184314480450713e-07, + "loss": 0.0653, + "step": 16384 + }, + { + "epoch": 2.654731043421905, + "grad_norm": 0.9212331771850586, + "learning_rate": 1.7168384940502048e-07, + "loss": 0.0662, + "step": 16385 + }, + { + "epoch": 2.654893065456902, + "grad_norm": 1.0174168348312378, + "learning_rate": 1.715246252463737e-07, + "loss": 0.065, + "step": 16386 + }, + { + "epoch": 2.655055087491899, + "grad_norm": 0.8449579477310181, + "learning_rate": 1.7136547233343904e-07, + "loss": 0.0596, + "step": 16387 + }, + { + "epoch": 2.6552171095268955, + "grad_norm": 0.9600573182106018, + "learning_rate": 1.7120639067108508e-07, + "loss": 0.0591, + "step": 16388 + }, + { + "epoch": 2.6553791315618924, + "grad_norm": 0.9382137060165405, + "learning_rate": 1.7104738026417987e-07, + "loss": 0.0599, + "step": 16389 + }, + { + "epoch": 2.6555411535968894, + "grad_norm": 1.0673377513885498, + "learning_rate": 1.7088844111758956e-07, + "loss": 0.0638, + "step": 16390 + }, + { + "epoch": 2.655703175631886, + "grad_norm": 0.9802731871604919, + "learning_rate": 1.7072957323617635e-07, + "loss": 0.0656, + "step": 16391 + }, + { + "epoch": 2.655865197666883, + "grad_norm": 0.9394249320030212, + "learning_rate": 1.7057077662480131e-07, + "loss": 0.0533, + "step": 16392 + }, + { + "epoch": 2.6560272197018797, + "grad_norm": 0.9298037886619568, + "learning_rate": 1.7041205128832338e-07, + "loss": 0.059, + "step": 16393 + }, + { + "epoch": 2.656189241736876, + "grad_norm": 1.0378389358520508, + "learning_rate": 1.7025339723159924e-07, + "loss": 0.0637, + "step": 16394 + }, + { + "epoch": 2.656351263771873, + "grad_norm": 0.7979336380958557, + "learning_rate": 1.700948144594833e-07, + "loss": 0.059, + "step": 16395 + }, + { + "epoch": 2.6565132858068696, + "grad_norm": 0.8939142823219299, + "learning_rate": 1.6993630297682778e-07, + "loss": 0.0633, + "step": 16396 + }, + { + "epoch": 2.6566753078418666, + "grad_norm": 0.8715953230857849, + "learning_rate": 1.6977786278848275e-07, + "loss": 0.0577, + "step": 16397 + }, + { + "epoch": 2.656837329876863, + "grad_norm": 0.8175707459449768, + "learning_rate": 1.6961949389929593e-07, + "loss": 0.0571, + "step": 16398 + }, + { + "epoch": 2.65699935191186, + "grad_norm": 1.1204349994659424, + "learning_rate": 1.6946119631411352e-07, + "loss": 0.0584, + "step": 16399 + }, + { + "epoch": 2.657161373946857, + "grad_norm": 1.0144906044006348, + "learning_rate": 1.693029700377785e-07, + "loss": 0.0669, + "step": 16400 + }, + { + "epoch": 2.6573233959818534, + "grad_norm": 0.8621246218681335, + "learning_rate": 1.6914481507513263e-07, + "loss": 0.0629, + "step": 16401 + }, + { + "epoch": 2.6574854180168503, + "grad_norm": 0.8595054745674133, + "learning_rate": 1.6898673143101479e-07, + "loss": 0.0557, + "step": 16402 + }, + { + "epoch": 2.6576474400518473, + "grad_norm": 0.8768499493598938, + "learning_rate": 1.688287191102614e-07, + "loss": 0.0645, + "step": 16403 + }, + { + "epoch": 2.6578094620868438, + "grad_norm": 0.9614449143409729, + "learning_rate": 1.6867077811770826e-07, + "loss": 0.0583, + "step": 16404 + }, + { + "epoch": 2.6579714841218407, + "grad_norm": 0.9931366443634033, + "learning_rate": 1.6851290845818824e-07, + "loss": 0.0712, + "step": 16405 + }, + { + "epoch": 2.658133506156837, + "grad_norm": 0.8831222653388977, + "learning_rate": 1.683551101365305e-07, + "loss": 0.055, + "step": 16406 + }, + { + "epoch": 2.658295528191834, + "grad_norm": 0.9368479251861572, + "learning_rate": 1.6819738315756395e-07, + "loss": 0.0537, + "step": 16407 + }, + { + "epoch": 2.6584575502268306, + "grad_norm": 0.9596263766288757, + "learning_rate": 1.6803972752611475e-07, + "loss": 0.0548, + "step": 16408 + }, + { + "epoch": 2.6586195722618275, + "grad_norm": 0.9823015928268433, + "learning_rate": 1.6788214324700652e-07, + "loss": 0.0593, + "step": 16409 + }, + { + "epoch": 2.6587815942968245, + "grad_norm": 0.9856812357902527, + "learning_rate": 1.6772463032506126e-07, + "loss": 0.0581, + "step": 16410 + }, + { + "epoch": 2.658943616331821, + "grad_norm": 1.0176301002502441, + "learning_rate": 1.6756718876509815e-07, + "loss": 0.06, + "step": 16411 + }, + { + "epoch": 2.659105638366818, + "grad_norm": 0.8685017228126526, + "learning_rate": 1.6740981857193471e-07, + "loss": 0.0569, + "step": 16412 + }, + { + "epoch": 2.659267660401815, + "grad_norm": 0.8308966755867004, + "learning_rate": 1.672525197503863e-07, + "loss": 0.0605, + "step": 16413 + }, + { + "epoch": 2.6594296824368113, + "grad_norm": 0.7896578311920166, + "learning_rate": 1.6709529230526544e-07, + "loss": 0.0544, + "step": 16414 + }, + { + "epoch": 2.6595917044718083, + "grad_norm": 1.1066980361938477, + "learning_rate": 1.66938136241383e-07, + "loss": 0.0637, + "step": 16415 + }, + { + "epoch": 2.659753726506805, + "grad_norm": 1.2486014366149902, + "learning_rate": 1.667810515635482e-07, + "loss": 0.0629, + "step": 16416 + }, + { + "epoch": 2.6599157485418017, + "grad_norm": 0.92633056640625, + "learning_rate": 1.6662403827656603e-07, + "loss": 0.0627, + "step": 16417 + }, + { + "epoch": 2.660077770576798, + "grad_norm": 0.9085054397583008, + "learning_rate": 1.6646709638524216e-07, + "loss": 0.0594, + "step": 16418 + }, + { + "epoch": 2.660239792611795, + "grad_norm": 1.1926897764205933, + "learning_rate": 1.6631022589437828e-07, + "loss": 0.0662, + "step": 16419 + }, + { + "epoch": 2.660401814646792, + "grad_norm": 0.877842128276825, + "learning_rate": 1.6615342680877417e-07, + "loss": 0.0617, + "step": 16420 + }, + { + "epoch": 2.6605638366817885, + "grad_norm": 1.0418201684951782, + "learning_rate": 1.6599669913322708e-07, + "loss": 0.061, + "step": 16421 + }, + { + "epoch": 2.6607258587167855, + "grad_norm": 0.9494113326072693, + "learning_rate": 1.6584004287253235e-07, + "loss": 0.0586, + "step": 16422 + }, + { + "epoch": 2.6608878807517824, + "grad_norm": 1.1011654138565063, + "learning_rate": 1.6568345803148478e-07, + "loss": 0.0642, + "step": 16423 + }, + { + "epoch": 2.661049902786779, + "grad_norm": 1.0519500970840454, + "learning_rate": 1.6552694461487385e-07, + "loss": 0.0583, + "step": 16424 + }, + { + "epoch": 2.661211924821776, + "grad_norm": 0.9799832105636597, + "learning_rate": 1.6537050262748905e-07, + "loss": 0.0611, + "step": 16425 + }, + { + "epoch": 2.6613739468567728, + "grad_norm": 0.9446945190429688, + "learning_rate": 1.652141320741174e-07, + "loss": 0.0623, + "step": 16426 + }, + { + "epoch": 2.6615359688917692, + "grad_norm": 0.9712892770767212, + "learning_rate": 1.6505783295954314e-07, + "loss": 0.0589, + "step": 16427 + }, + { + "epoch": 2.661697990926766, + "grad_norm": 0.9126527905464172, + "learning_rate": 1.6490160528854855e-07, + "loss": 0.063, + "step": 16428 + }, + { + "epoch": 2.6618600129617627, + "grad_norm": 0.843527615070343, + "learning_rate": 1.6474544906591422e-07, + "loss": 0.0636, + "step": 16429 + }, + { + "epoch": 2.6620220349967596, + "grad_norm": 1.002021312713623, + "learning_rate": 1.6458936429641803e-07, + "loss": 0.0638, + "step": 16430 + }, + { + "epoch": 2.662184057031756, + "grad_norm": 0.8258562684059143, + "learning_rate": 1.6443335098483586e-07, + "loss": 0.0567, + "step": 16431 + }, + { + "epoch": 2.662346079066753, + "grad_norm": 0.987740159034729, + "learning_rate": 1.642774091359406e-07, + "loss": 0.0617, + "step": 16432 + }, + { + "epoch": 2.66250810110175, + "grad_norm": 1.08059823513031, + "learning_rate": 1.6412153875450448e-07, + "loss": 0.0681, + "step": 16433 + }, + { + "epoch": 2.6626701231367464, + "grad_norm": 0.9311217069625854, + "learning_rate": 1.6396573984529707e-07, + "loss": 0.063, + "step": 16434 + }, + { + "epoch": 2.6628321451717434, + "grad_norm": 0.8519605994224548, + "learning_rate": 1.6381001241308452e-07, + "loss": 0.0563, + "step": 16435 + }, + { + "epoch": 2.6629941672067403, + "grad_norm": 0.9323353171348572, + "learning_rate": 1.6365435646263223e-07, + "loss": 0.0625, + "step": 16436 + }, + { + "epoch": 2.663156189241737, + "grad_norm": 0.9580549597740173, + "learning_rate": 1.6349877199870218e-07, + "loss": 0.0612, + "step": 16437 + }, + { + "epoch": 2.6633182112767337, + "grad_norm": 0.9537264108657837, + "learning_rate": 1.6334325902605642e-07, + "loss": 0.0611, + "step": 16438 + }, + { + "epoch": 2.6634802333117307, + "grad_norm": 0.8179647922515869, + "learning_rate": 1.6318781754945168e-07, + "loss": 0.0574, + "step": 16439 + }, + { + "epoch": 2.663642255346727, + "grad_norm": 0.8439454436302185, + "learning_rate": 1.63032447573645e-07, + "loss": 0.0547, + "step": 16440 + }, + { + "epoch": 2.6638042773817237, + "grad_norm": 0.8650406002998352, + "learning_rate": 1.6287714910339008e-07, + "loss": 0.0558, + "step": 16441 + }, + { + "epoch": 2.6639662994167206, + "grad_norm": 1.0165598392486572, + "learning_rate": 1.6272192214343868e-07, + "loss": 0.069, + "step": 16442 + }, + { + "epoch": 2.6641283214517175, + "grad_norm": 0.9209080934524536, + "learning_rate": 1.6256676669854032e-07, + "loss": 0.0659, + "step": 16443 + }, + { + "epoch": 2.664290343486714, + "grad_norm": 1.0296342372894287, + "learning_rate": 1.6241168277344232e-07, + "loss": 0.0654, + "step": 16444 + }, + { + "epoch": 2.664452365521711, + "grad_norm": 0.9982236623764038, + "learning_rate": 1.6225667037289034e-07, + "loss": 0.0727, + "step": 16445 + }, + { + "epoch": 2.664614387556708, + "grad_norm": 0.8094567656517029, + "learning_rate": 1.6210172950162639e-07, + "loss": 0.0574, + "step": 16446 + }, + { + "epoch": 2.6647764095917044, + "grad_norm": 1.097362995147705, + "learning_rate": 1.6194686016439227e-07, + "loss": 0.0682, + "step": 16447 + }, + { + "epoch": 2.6649384316267013, + "grad_norm": 0.9655567407608032, + "learning_rate": 1.617920623659261e-07, + "loss": 0.0602, + "step": 16448 + }, + { + "epoch": 2.6651004536616982, + "grad_norm": 2.5268149375915527, + "learning_rate": 1.6163733611096495e-07, + "loss": 0.0604, + "step": 16449 + }, + { + "epoch": 2.6652624756966947, + "grad_norm": 0.8302810788154602, + "learning_rate": 1.6148268140424224e-07, + "loss": 0.0574, + "step": 16450 + }, + { + "epoch": 2.6654244977316917, + "grad_norm": 1.3241533041000366, + "learning_rate": 1.6132809825049e-07, + "loss": 0.0606, + "step": 16451 + }, + { + "epoch": 2.665586519766688, + "grad_norm": 0.991156280040741, + "learning_rate": 1.6117358665443922e-07, + "loss": 0.0633, + "step": 16452 + }, + { + "epoch": 2.665748541801685, + "grad_norm": 0.8456504344940186, + "learning_rate": 1.6101914662081665e-07, + "loss": 0.0609, + "step": 16453 + }, + { + "epoch": 2.6659105638366816, + "grad_norm": 1.1732730865478516, + "learning_rate": 1.6086477815434763e-07, + "loss": 0.0678, + "step": 16454 + }, + { + "epoch": 2.6660725858716785, + "grad_norm": 0.9648259878158569, + "learning_rate": 1.6071048125975598e-07, + "loss": 0.0613, + "step": 16455 + }, + { + "epoch": 2.6662346079066754, + "grad_norm": 1.1859503984451294, + "learning_rate": 1.6055625594176254e-07, + "loss": 0.0761, + "step": 16456 + }, + { + "epoch": 2.666396629941672, + "grad_norm": 0.9471797347068787, + "learning_rate": 1.604021022050864e-07, + "loss": 0.0643, + "step": 16457 + }, + { + "epoch": 2.666558651976669, + "grad_norm": 0.9277358651161194, + "learning_rate": 1.602480200544443e-07, + "loss": 0.0676, + "step": 16458 + }, + { + "epoch": 2.666720674011666, + "grad_norm": 0.9459307789802551, + "learning_rate": 1.600940094945505e-07, + "loss": 0.0658, + "step": 16459 + }, + { + "epoch": 2.6668826960466623, + "grad_norm": 0.8798080086708069, + "learning_rate": 1.5994007053011796e-07, + "loss": 0.0642, + "step": 16460 + }, + { + "epoch": 2.667044718081659, + "grad_norm": 0.9812191128730774, + "learning_rate": 1.5978620316585564e-07, + "loss": 0.0614, + "step": 16461 + }, + { + "epoch": 2.6672067401166557, + "grad_norm": 0.898577094078064, + "learning_rate": 1.5963240740647285e-07, + "loss": 0.0627, + "step": 16462 + }, + { + "epoch": 2.6673687621516526, + "grad_norm": 0.9194305539131165, + "learning_rate": 1.594786832566747e-07, + "loss": 0.0637, + "step": 16463 + }, + { + "epoch": 2.667530784186649, + "grad_norm": 0.895622730255127, + "learning_rate": 1.5932503072116524e-07, + "loss": 0.0625, + "step": 16464 + }, + { + "epoch": 2.667692806221646, + "grad_norm": 0.9284375309944153, + "learning_rate": 1.5917144980464483e-07, + "loss": 0.0645, + "step": 16465 + }, + { + "epoch": 2.667854828256643, + "grad_norm": 0.8722718358039856, + "learning_rate": 1.5901794051181362e-07, + "loss": 0.057, + "step": 16466 + }, + { + "epoch": 2.6680168502916395, + "grad_norm": 0.8681865930557251, + "learning_rate": 1.58864502847369e-07, + "loss": 0.059, + "step": 16467 + }, + { + "epoch": 2.6681788723266364, + "grad_norm": 0.9118895530700684, + "learning_rate": 1.5871113681600464e-07, + "loss": 0.0578, + "step": 16468 + }, + { + "epoch": 2.6683408943616334, + "grad_norm": 0.9743660092353821, + "learning_rate": 1.5855784242241352e-07, + "loss": 0.0638, + "step": 16469 + }, + { + "epoch": 2.66850291639663, + "grad_norm": 0.9498010277748108, + "learning_rate": 1.5840461967128628e-07, + "loss": 0.0605, + "step": 16470 + }, + { + "epoch": 2.6686649384316268, + "grad_norm": 0.9760184288024902, + "learning_rate": 1.5825146856731144e-07, + "loss": 0.0638, + "step": 16471 + }, + { + "epoch": 2.6688269604666237, + "grad_norm": 0.9990751147270203, + "learning_rate": 1.5809838911517438e-07, + "loss": 0.0612, + "step": 16472 + }, + { + "epoch": 2.66898898250162, + "grad_norm": 0.8862509727478027, + "learning_rate": 1.5794538131955944e-07, + "loss": 0.0579, + "step": 16473 + }, + { + "epoch": 2.669151004536617, + "grad_norm": 0.9482281804084778, + "learning_rate": 1.5779244518514813e-07, + "loss": 0.0669, + "step": 16474 + }, + { + "epoch": 2.6693130265716136, + "grad_norm": 0.9086149334907532, + "learning_rate": 1.5763958071662006e-07, + "loss": 0.059, + "step": 16475 + }, + { + "epoch": 2.6694750486066106, + "grad_norm": 0.8625389933586121, + "learning_rate": 1.574867879186523e-07, + "loss": 0.0598, + "step": 16476 + }, + { + "epoch": 2.669637070641607, + "grad_norm": 0.9149574041366577, + "learning_rate": 1.5733406679592028e-07, + "loss": 0.0646, + "step": 16477 + }, + { + "epoch": 2.669799092676604, + "grad_norm": 0.7897137999534607, + "learning_rate": 1.5718141735309695e-07, + "loss": 0.0554, + "step": 16478 + }, + { + "epoch": 2.669961114711601, + "grad_norm": 0.7885839939117432, + "learning_rate": 1.5702883959485215e-07, + "loss": 0.0547, + "step": 16479 + }, + { + "epoch": 2.6701231367465974, + "grad_norm": 0.9897783398628235, + "learning_rate": 1.5687633352585467e-07, + "loss": 0.0536, + "step": 16480 + }, + { + "epoch": 2.6702851587815943, + "grad_norm": 1.0975406169891357, + "learning_rate": 1.5672389915077162e-07, + "loss": 0.0666, + "step": 16481 + }, + { + "epoch": 2.6704471808165913, + "grad_norm": 0.8759128451347351, + "learning_rate": 1.5657153647426703e-07, + "loss": 0.0639, + "step": 16482 + }, + { + "epoch": 2.6706092028515878, + "grad_norm": 0.8712435364723206, + "learning_rate": 1.5641924550100218e-07, + "loss": 0.059, + "step": 16483 + }, + { + "epoch": 2.6707712248865847, + "grad_norm": 0.7633887529373169, + "learning_rate": 1.5626702623563694e-07, + "loss": 0.0531, + "step": 16484 + }, + { + "epoch": 2.670933246921581, + "grad_norm": 1.1077615022659302, + "learning_rate": 1.561148786828287e-07, + "loss": 0.0654, + "step": 16485 + }, + { + "epoch": 2.671095268956578, + "grad_norm": 0.95298832654953, + "learning_rate": 1.5596280284723348e-07, + "loss": 0.0593, + "step": 16486 + }, + { + "epoch": 2.6712572909915746, + "grad_norm": 1.1085457801818848, + "learning_rate": 1.558107987335039e-07, + "loss": 0.0647, + "step": 16487 + }, + { + "epoch": 2.6714193130265715, + "grad_norm": 0.8979312181472778, + "learning_rate": 1.5565886634629102e-07, + "loss": 0.06, + "step": 16488 + }, + { + "epoch": 2.6715813350615685, + "grad_norm": 0.8960966467857361, + "learning_rate": 1.5550700569024358e-07, + "loss": 0.0561, + "step": 16489 + }, + { + "epoch": 2.671743357096565, + "grad_norm": 0.8627614378929138, + "learning_rate": 1.5535521677000813e-07, + "loss": 0.0538, + "step": 16490 + }, + { + "epoch": 2.671905379131562, + "grad_norm": 0.9660282135009766, + "learning_rate": 1.5520349959022934e-07, + "loss": 0.0582, + "step": 16491 + }, + { + "epoch": 2.672067401166559, + "grad_norm": 0.9266077876091003, + "learning_rate": 1.5505185415554903e-07, + "loss": 0.063, + "step": 16492 + }, + { + "epoch": 2.6722294232015553, + "grad_norm": 0.9693909287452698, + "learning_rate": 1.5490028047060762e-07, + "loss": 0.0612, + "step": 16493 + }, + { + "epoch": 2.6723914452365523, + "grad_norm": 0.9419223666191101, + "learning_rate": 1.54748778540042e-07, + "loss": 0.0568, + "step": 16494 + }, + { + "epoch": 2.672553467271549, + "grad_norm": 0.9191377758979797, + "learning_rate": 1.5459734836848872e-07, + "loss": 0.0631, + "step": 16495 + }, + { + "epoch": 2.6727154893065457, + "grad_norm": 1.1649914979934692, + "learning_rate": 1.544459899605813e-07, + "loss": 0.063, + "step": 16496 + }, + { + "epoch": 2.6728775113415426, + "grad_norm": 0.7725280523300171, + "learning_rate": 1.5429470332094992e-07, + "loss": 0.0554, + "step": 16497 + }, + { + "epoch": 2.673039533376539, + "grad_norm": 0.9325549006462097, + "learning_rate": 1.5414348845422394e-07, + "loss": 0.0567, + "step": 16498 + }, + { + "epoch": 2.673201555411536, + "grad_norm": 1.1128214597702026, + "learning_rate": 1.5399234536503023e-07, + "loss": 0.0684, + "step": 16499 + }, + { + "epoch": 2.6733635774465325, + "grad_norm": 0.967528223991394, + "learning_rate": 1.538412740579942e-07, + "loss": 0.0644, + "step": 16500 + }, + { + "epoch": 2.6735255994815295, + "grad_norm": 0.893129825592041, + "learning_rate": 1.536902745377372e-07, + "loss": 0.0641, + "step": 16501 + }, + { + "epoch": 2.6736876215165264, + "grad_norm": 1.0466177463531494, + "learning_rate": 1.5353934680888e-07, + "loss": 0.0636, + "step": 16502 + }, + { + "epoch": 2.673849643551523, + "grad_norm": 1.234455943107605, + "learning_rate": 1.5338849087604025e-07, + "loss": 0.0587, + "step": 16503 + }, + { + "epoch": 2.67401166558652, + "grad_norm": 0.9519467353820801, + "learning_rate": 1.5323770674383398e-07, + "loss": 0.0616, + "step": 16504 + }, + { + "epoch": 2.6741736876215167, + "grad_norm": 0.8917698860168457, + "learning_rate": 1.5308699441687502e-07, + "loss": 0.0564, + "step": 16505 + }, + { + "epoch": 2.6743357096565132, + "grad_norm": 0.8450326919555664, + "learning_rate": 1.529363538997744e-07, + "loss": 0.0553, + "step": 16506 + }, + { + "epoch": 2.67449773169151, + "grad_norm": 0.9240355491638184, + "learning_rate": 1.5278578519714177e-07, + "loss": 0.063, + "step": 16507 + }, + { + "epoch": 2.6746597537265067, + "grad_norm": 0.9043802618980408, + "learning_rate": 1.526352883135837e-07, + "loss": 0.0639, + "step": 16508 + }, + { + "epoch": 2.6748217757615036, + "grad_norm": 0.8774120807647705, + "learning_rate": 1.5248486325370544e-07, + "loss": 0.0609, + "step": 16509 + }, + { + "epoch": 2.6749837977965, + "grad_norm": 0.9532600045204163, + "learning_rate": 1.5233451002210964e-07, + "loss": 0.0594, + "step": 16510 + }, + { + "epoch": 2.675145819831497, + "grad_norm": 0.8727222681045532, + "learning_rate": 1.5218422862339683e-07, + "loss": 0.0651, + "step": 16511 + }, + { + "epoch": 2.675307841866494, + "grad_norm": 0.8903719186782837, + "learning_rate": 1.520340190621647e-07, + "loss": 0.0586, + "step": 16512 + }, + { + "epoch": 2.6754698639014904, + "grad_norm": 1.0103119611740112, + "learning_rate": 1.5188388134300901e-07, + "loss": 0.0592, + "step": 16513 + }, + { + "epoch": 2.6756318859364874, + "grad_norm": 0.8567571043968201, + "learning_rate": 1.5173381547052528e-07, + "loss": 0.0577, + "step": 16514 + }, + { + "epoch": 2.6757939079714843, + "grad_norm": 0.874523401260376, + "learning_rate": 1.5158382144930344e-07, + "loss": 0.0564, + "step": 16515 + }, + { + "epoch": 2.675955930006481, + "grad_norm": 0.8860069513320923, + "learning_rate": 1.5143389928393398e-07, + "loss": 0.0573, + "step": 16516 + }, + { + "epoch": 2.6761179520414777, + "grad_norm": 0.8461055159568787, + "learning_rate": 1.512840489790035e-07, + "loss": 0.0578, + "step": 16517 + }, + { + "epoch": 2.6762799740764747, + "grad_norm": 0.9307635426521301, + "learning_rate": 1.5113427053909725e-07, + "loss": 0.0589, + "step": 16518 + }, + { + "epoch": 2.676441996111471, + "grad_norm": 0.9444718360900879, + "learning_rate": 1.5098456396879846e-07, + "loss": 0.0627, + "step": 16519 + }, + { + "epoch": 2.6766040181464676, + "grad_norm": 0.8695940375328064, + "learning_rate": 1.508349292726874e-07, + "loss": 0.0566, + "step": 16520 + }, + { + "epoch": 2.6767660401814646, + "grad_norm": 0.9241846203804016, + "learning_rate": 1.506853664553426e-07, + "loss": 0.0509, + "step": 16521 + }, + { + "epoch": 2.6769280622164615, + "grad_norm": 0.9394335150718689, + "learning_rate": 1.505358755213407e-07, + "loss": 0.0598, + "step": 16522 + }, + { + "epoch": 2.677090084251458, + "grad_norm": 0.865931510925293, + "learning_rate": 1.503864564752547e-07, + "loss": 0.0542, + "step": 16523 + }, + { + "epoch": 2.677252106286455, + "grad_norm": 0.9105206727981567, + "learning_rate": 1.5023710932165758e-07, + "loss": 0.051, + "step": 16524 + }, + { + "epoch": 2.677414128321452, + "grad_norm": 0.8169817328453064, + "learning_rate": 1.500878340651185e-07, + "loss": 0.0584, + "step": 16525 + }, + { + "epoch": 2.6775761503564484, + "grad_norm": 0.8245558142662048, + "learning_rate": 1.4993863071020548e-07, + "loss": 0.0581, + "step": 16526 + }, + { + "epoch": 2.6777381723914453, + "grad_norm": 0.9422661662101746, + "learning_rate": 1.4978949926148288e-07, + "loss": 0.0599, + "step": 16527 + }, + { + "epoch": 2.6779001944264422, + "grad_norm": 1.0110220909118652, + "learning_rate": 1.4964043972351377e-07, + "loss": 0.0624, + "step": 16528 + }, + { + "epoch": 2.6780622164614387, + "grad_norm": 1.0897401571273804, + "learning_rate": 1.494914521008603e-07, + "loss": 0.0539, + "step": 16529 + }, + { + "epoch": 2.6782242384964356, + "grad_norm": 0.8992266654968262, + "learning_rate": 1.4934253639807994e-07, + "loss": 0.0606, + "step": 16530 + }, + { + "epoch": 2.678386260531432, + "grad_norm": 0.9721251130104065, + "learning_rate": 1.4919369261972933e-07, + "loss": 0.0646, + "step": 16531 + }, + { + "epoch": 2.678548282566429, + "grad_norm": 0.8682243824005127, + "learning_rate": 1.4904492077036286e-07, + "loss": 0.0566, + "step": 16532 + }, + { + "epoch": 2.6787103046014256, + "grad_norm": 0.7980409860610962, + "learning_rate": 1.4889622085453304e-07, + "loss": 0.0563, + "step": 16533 + }, + { + "epoch": 2.6788723266364225, + "grad_norm": 0.8528122901916504, + "learning_rate": 1.4874759287678898e-07, + "loss": 0.0609, + "step": 16534 + }, + { + "epoch": 2.6790343486714194, + "grad_norm": 0.7880252599716187, + "learning_rate": 1.485990368416787e-07, + "loss": 0.0584, + "step": 16535 + }, + { + "epoch": 2.679196370706416, + "grad_norm": 1.048828363418579, + "learning_rate": 1.484505527537475e-07, + "loss": 0.0558, + "step": 16536 + }, + { + "epoch": 2.679358392741413, + "grad_norm": 0.9639270305633545, + "learning_rate": 1.483021406175389e-07, + "loss": 0.0636, + "step": 16537 + }, + { + "epoch": 2.67952041477641, + "grad_norm": 0.9279593825340271, + "learning_rate": 1.4815380043759374e-07, + "loss": 0.0578, + "step": 16538 + }, + { + "epoch": 2.6796824368114063, + "grad_norm": 0.8325791954994202, + "learning_rate": 1.4800553221845094e-07, + "loss": 0.0565, + "step": 16539 + }, + { + "epoch": 2.679844458846403, + "grad_norm": 0.8952010273933411, + "learning_rate": 1.4785733596464736e-07, + "loss": 0.0601, + "step": 16540 + }, + { + "epoch": 2.6800064808814, + "grad_norm": 0.8639797568321228, + "learning_rate": 1.4770921168071717e-07, + "loss": 0.0601, + "step": 16541 + }, + { + "epoch": 2.6801685029163966, + "grad_norm": 1.0438389778137207, + "learning_rate": 1.4756115937119202e-07, + "loss": 0.0681, + "step": 16542 + }, + { + "epoch": 2.680330524951393, + "grad_norm": 0.9621492028236389, + "learning_rate": 1.4741317904060304e-07, + "loss": 0.06, + "step": 16543 + }, + { + "epoch": 2.68049254698639, + "grad_norm": 0.814268946647644, + "learning_rate": 1.4726527069347796e-07, + "loss": 0.0555, + "step": 16544 + }, + { + "epoch": 2.680654569021387, + "grad_norm": 0.8960631489753723, + "learning_rate": 1.471174343343418e-07, + "loss": 0.0576, + "step": 16545 + }, + { + "epoch": 2.6808165910563835, + "grad_norm": 0.9540743827819824, + "learning_rate": 1.4696966996771838e-07, + "loss": 0.0691, + "step": 16546 + }, + { + "epoch": 2.6809786130913804, + "grad_norm": 1.125815987586975, + "learning_rate": 1.468219775981286e-07, + "loss": 0.0665, + "step": 16547 + }, + { + "epoch": 2.6811406351263773, + "grad_norm": 0.8713697195053101, + "learning_rate": 1.4667435723009187e-07, + "loss": 0.0554, + "step": 16548 + }, + { + "epoch": 2.681302657161374, + "grad_norm": 0.961420476436615, + "learning_rate": 1.4652680886812488e-07, + "loss": 0.0615, + "step": 16549 + }, + { + "epoch": 2.6814646791963708, + "grad_norm": 0.8258307576179504, + "learning_rate": 1.463793325167423e-07, + "loss": 0.0602, + "step": 16550 + }, + { + "epoch": 2.6816267012313677, + "grad_norm": 0.946225106716156, + "learning_rate": 1.4623192818045638e-07, + "loss": 0.0613, + "step": 16551 + }, + { + "epoch": 2.681788723266364, + "grad_norm": 1.008615255355835, + "learning_rate": 1.4608459586377743e-07, + "loss": 0.0708, + "step": 16552 + }, + { + "epoch": 2.681950745301361, + "grad_norm": 1.0377353429794312, + "learning_rate": 1.4593733557121347e-07, + "loss": 0.0649, + "step": 16553 + }, + { + "epoch": 2.6821127673363576, + "grad_norm": 0.9528126120567322, + "learning_rate": 1.4579014730727037e-07, + "loss": 0.0641, + "step": 16554 + }, + { + "epoch": 2.6822747893713546, + "grad_norm": 0.9190387725830078, + "learning_rate": 1.45643031076452e-07, + "loss": 0.0629, + "step": 16555 + }, + { + "epoch": 2.682436811406351, + "grad_norm": 0.855954647064209, + "learning_rate": 1.4549598688325896e-07, + "loss": 0.0566, + "step": 16556 + }, + { + "epoch": 2.682598833441348, + "grad_norm": 0.9234669208526611, + "learning_rate": 1.4534901473219093e-07, + "loss": 0.0671, + "step": 16557 + }, + { + "epoch": 2.682760855476345, + "grad_norm": 1.030083179473877, + "learning_rate": 1.4520211462774548e-07, + "loss": 0.0586, + "step": 16558 + }, + { + "epoch": 2.6829228775113414, + "grad_norm": 1.0558991432189941, + "learning_rate": 1.4505528657441648e-07, + "loss": 0.0627, + "step": 16559 + }, + { + "epoch": 2.6830848995463383, + "grad_norm": 1.0315167903900146, + "learning_rate": 1.4490853057669675e-07, + "loss": 0.061, + "step": 16560 + }, + { + "epoch": 2.6832469215813353, + "grad_norm": 0.8899813890457153, + "learning_rate": 1.4476184663907628e-07, + "loss": 0.0682, + "step": 16561 + }, + { + "epoch": 2.6834089436163318, + "grad_norm": 0.787449300289154, + "learning_rate": 1.4461523476604482e-07, + "loss": 0.0506, + "step": 16562 + }, + { + "epoch": 2.6835709656513287, + "grad_norm": 0.8441698551177979, + "learning_rate": 1.444686949620866e-07, + "loss": 0.0617, + "step": 16563 + }, + { + "epoch": 2.683732987686325, + "grad_norm": 0.8496869206428528, + "learning_rate": 1.4432222723168632e-07, + "loss": 0.0619, + "step": 16564 + }, + { + "epoch": 2.683895009721322, + "grad_norm": 1.025014042854309, + "learning_rate": 1.4417583157932485e-07, + "loss": 0.0568, + "step": 16565 + }, + { + "epoch": 2.6840570317563186, + "grad_norm": 0.9680132269859314, + "learning_rate": 1.4402950800948223e-07, + "loss": 0.0626, + "step": 16566 + }, + { + "epoch": 2.6842190537913155, + "grad_norm": 0.7993006110191345, + "learning_rate": 1.4388325652663542e-07, + "loss": 0.0592, + "step": 16567 + }, + { + "epoch": 2.6843810758263125, + "grad_norm": 0.8684203028678894, + "learning_rate": 1.437370771352589e-07, + "loss": 0.0611, + "step": 16568 + }, + { + "epoch": 2.684543097861309, + "grad_norm": 0.9955109357833862, + "learning_rate": 1.4359096983982607e-07, + "loss": 0.0586, + "step": 16569 + }, + { + "epoch": 2.684705119896306, + "grad_norm": 0.884691059589386, + "learning_rate": 1.4344493464480745e-07, + "loss": 0.0633, + "step": 16570 + }, + { + "epoch": 2.684867141931303, + "grad_norm": 0.8436540961265564, + "learning_rate": 1.4329897155467039e-07, + "loss": 0.0623, + "step": 16571 + }, + { + "epoch": 2.6850291639662993, + "grad_norm": 1.0994386672973633, + "learning_rate": 1.4315308057388206e-07, + "loss": 0.0638, + "step": 16572 + }, + { + "epoch": 2.6851911860012962, + "grad_norm": 0.9540835022926331, + "learning_rate": 1.4300726170690614e-07, + "loss": 0.056, + "step": 16573 + }, + { + "epoch": 2.685353208036293, + "grad_norm": 0.9536152482032776, + "learning_rate": 1.428615149582041e-07, + "loss": 0.0571, + "step": 16574 + }, + { + "epoch": 2.6855152300712897, + "grad_norm": 0.7673203349113464, + "learning_rate": 1.4271584033223512e-07, + "loss": 0.0578, + "step": 16575 + }, + { + "epoch": 2.6856772521062866, + "grad_norm": 0.899626612663269, + "learning_rate": 1.425702378334573e-07, + "loss": 0.0572, + "step": 16576 + }, + { + "epoch": 2.685839274141283, + "grad_norm": 0.9618022441864014, + "learning_rate": 1.4242470746632542e-07, + "loss": 0.0644, + "step": 16577 + }, + { + "epoch": 2.68600129617628, + "grad_norm": 0.9150751233100891, + "learning_rate": 1.4227924923529228e-07, + "loss": 0.0629, + "step": 16578 + }, + { + "epoch": 2.6861633182112765, + "grad_norm": 1.083203911781311, + "learning_rate": 1.4213386314480825e-07, + "loss": 0.0701, + "step": 16579 + }, + { + "epoch": 2.6863253402462735, + "grad_norm": 0.9395116567611694, + "learning_rate": 1.4198854919932225e-07, + "loss": 0.0624, + "step": 16580 + }, + { + "epoch": 2.6864873622812704, + "grad_norm": 0.8851693868637085, + "learning_rate": 1.4184330740328044e-07, + "loss": 0.0657, + "step": 16581 + }, + { + "epoch": 2.686649384316267, + "grad_norm": 0.8476728200912476, + "learning_rate": 1.4169813776112652e-07, + "loss": 0.0589, + "step": 16582 + }, + { + "epoch": 2.686811406351264, + "grad_norm": 1.0356652736663818, + "learning_rate": 1.4155304027730271e-07, + "loss": 0.0627, + "step": 16583 + }, + { + "epoch": 2.6869734283862607, + "grad_norm": 0.8928619623184204, + "learning_rate": 1.4140801495624913e-07, + "loss": 0.0626, + "step": 16584 + }, + { + "epoch": 2.6871354504212572, + "grad_norm": 1.1158393621444702, + "learning_rate": 1.412630618024016e-07, + "loss": 0.0641, + "step": 16585 + }, + { + "epoch": 2.687297472456254, + "grad_norm": 1.0287305116653442, + "learning_rate": 1.4111818082019696e-07, + "loss": 0.065, + "step": 16586 + }, + { + "epoch": 2.6874594944912507, + "grad_norm": 0.8559868931770325, + "learning_rate": 1.4097337201406742e-07, + "loss": 0.0576, + "step": 16587 + }, + { + "epoch": 2.6876215165262476, + "grad_norm": 0.9814032912254333, + "learning_rate": 1.4082863538844444e-07, + "loss": 0.0598, + "step": 16588 + }, + { + "epoch": 2.687783538561244, + "grad_norm": 1.0753200054168701, + "learning_rate": 1.406839709477556e-07, + "loss": 0.0605, + "step": 16589 + }, + { + "epoch": 2.687945560596241, + "grad_norm": 0.9408139586448669, + "learning_rate": 1.4053937869642737e-07, + "loss": 0.0624, + "step": 16590 + }, + { + "epoch": 2.688107582631238, + "grad_norm": 0.8320403695106506, + "learning_rate": 1.4039485863888537e-07, + "loss": 0.0557, + "step": 16591 + }, + { + "epoch": 2.6882696046662344, + "grad_norm": 0.9615926146507263, + "learning_rate": 1.402504107795502e-07, + "loss": 0.0627, + "step": 16592 + }, + { + "epoch": 2.6884316267012314, + "grad_norm": 0.8587374091148376, + "learning_rate": 1.401060351228417e-07, + "loss": 0.0631, + "step": 16593 + }, + { + "epoch": 2.6885936487362283, + "grad_norm": 0.8640631437301636, + "learning_rate": 1.39961731673178e-07, + "loss": 0.0605, + "step": 16594 + }, + { + "epoch": 2.688755670771225, + "grad_norm": 0.8059629797935486, + "learning_rate": 1.398175004349739e-07, + "loss": 0.0556, + "step": 16595 + }, + { + "epoch": 2.6889176928062217, + "grad_norm": 0.9349533319473267, + "learning_rate": 1.3967334141264277e-07, + "loss": 0.0646, + "step": 16596 + }, + { + "epoch": 2.6890797148412187, + "grad_norm": 0.9726789593696594, + "learning_rate": 1.3952925461059558e-07, + "loss": 0.0594, + "step": 16597 + }, + { + "epoch": 2.689241736876215, + "grad_norm": 0.9107815623283386, + "learning_rate": 1.39385240033241e-07, + "loss": 0.0614, + "step": 16598 + }, + { + "epoch": 2.689403758911212, + "grad_norm": 0.8884726762771606, + "learning_rate": 1.392412976849855e-07, + "loss": 0.0643, + "step": 16599 + }, + { + "epoch": 2.6895657809462086, + "grad_norm": 0.8729420900344849, + "learning_rate": 1.3909742757023336e-07, + "loss": 0.0585, + "step": 16600 + }, + { + "epoch": 2.6897278029812055, + "grad_norm": 0.8863007426261902, + "learning_rate": 1.3895362969338662e-07, + "loss": 0.0613, + "step": 16601 + }, + { + "epoch": 2.689889825016202, + "grad_norm": 0.9088746309280396, + "learning_rate": 1.3880990405884532e-07, + "loss": 0.056, + "step": 16602 + }, + { + "epoch": 2.690051847051199, + "grad_norm": 0.9081185460090637, + "learning_rate": 1.3866625067100707e-07, + "loss": 0.0599, + "step": 16603 + }, + { + "epoch": 2.690213869086196, + "grad_norm": 0.9388935565948486, + "learning_rate": 1.3852266953426674e-07, + "loss": 0.0642, + "step": 16604 + }, + { + "epoch": 2.6903758911211924, + "grad_norm": 0.7939647436141968, + "learning_rate": 1.3837916065301827e-07, + "loss": 0.0603, + "step": 16605 + }, + { + "epoch": 2.6905379131561893, + "grad_norm": 0.8875724077224731, + "learning_rate": 1.3823572403165285e-07, + "loss": 0.0599, + "step": 16606 + }, + { + "epoch": 2.690699935191186, + "grad_norm": 0.7508094906806946, + "learning_rate": 1.380923596745587e-07, + "loss": 0.0548, + "step": 16607 + }, + { + "epoch": 2.6908619572261827, + "grad_norm": 0.8791196942329407, + "learning_rate": 1.3794906758612252e-07, + "loss": 0.0589, + "step": 16608 + }, + { + "epoch": 2.6910239792611796, + "grad_norm": 0.9110643267631531, + "learning_rate": 1.3780584777072892e-07, + "loss": 0.061, + "step": 16609 + }, + { + "epoch": 2.691186001296176, + "grad_norm": 0.9413241744041443, + "learning_rate": 1.376627002327599e-07, + "loss": 0.0639, + "step": 16610 + }, + { + "epoch": 2.691348023331173, + "grad_norm": 0.9628585577011108, + "learning_rate": 1.375196249765956e-07, + "loss": 0.0696, + "step": 16611 + }, + { + "epoch": 2.6915100453661696, + "grad_norm": 0.8341220021247864, + "learning_rate": 1.373766220066136e-07, + "loss": 0.0596, + "step": 16612 + }, + { + "epoch": 2.6916720674011665, + "grad_norm": 0.8509212732315063, + "learning_rate": 1.372336913271896e-07, + "loss": 0.0554, + "step": 16613 + }, + { + "epoch": 2.6918340894361634, + "grad_norm": 0.8546708822250366, + "learning_rate": 1.3709083294269676e-07, + "loss": 0.0609, + "step": 16614 + }, + { + "epoch": 2.69199611147116, + "grad_norm": 0.9867240190505981, + "learning_rate": 1.369480468575063e-07, + "loss": 0.0511, + "step": 16615 + }, + { + "epoch": 2.692158133506157, + "grad_norm": 1.0250896215438843, + "learning_rate": 1.368053330759872e-07, + "loss": 0.0635, + "step": 16616 + }, + { + "epoch": 2.692320155541154, + "grad_norm": 1.0135303735733032, + "learning_rate": 1.366626916025063e-07, + "loss": 0.0621, + "step": 16617 + }, + { + "epoch": 2.6924821775761503, + "grad_norm": 0.852816104888916, + "learning_rate": 1.3652012244142754e-07, + "loss": 0.0576, + "step": 16618 + }, + { + "epoch": 2.692644199611147, + "grad_norm": 0.8429510593414307, + "learning_rate": 1.363776255971133e-07, + "loss": 0.0597, + "step": 16619 + }, + { + "epoch": 2.692806221646144, + "grad_norm": 0.8124983906745911, + "learning_rate": 1.362352010739243e-07, + "loss": 0.0522, + "step": 16620 + }, + { + "epoch": 2.6929682436811406, + "grad_norm": 0.836234450340271, + "learning_rate": 1.360928488762181e-07, + "loss": 0.0579, + "step": 16621 + }, + { + "epoch": 2.693130265716137, + "grad_norm": 0.9209657907485962, + "learning_rate": 1.3595056900834986e-07, + "loss": 0.0608, + "step": 16622 + }, + { + "epoch": 2.693292287751134, + "grad_norm": 0.9225515127182007, + "learning_rate": 1.3580836147467304e-07, + "loss": 0.0635, + "step": 16623 + }, + { + "epoch": 2.693454309786131, + "grad_norm": 0.9024338722229004, + "learning_rate": 1.3566622627953968e-07, + "loss": 0.0602, + "step": 16624 + }, + { + "epoch": 2.6936163318211275, + "grad_norm": 0.959028959274292, + "learning_rate": 1.3552416342729802e-07, + "loss": 0.0628, + "step": 16625 + }, + { + "epoch": 2.6937783538561244, + "grad_norm": 0.917083203792572, + "learning_rate": 1.3538217292229482e-07, + "loss": 0.0589, + "step": 16626 + }, + { + "epoch": 2.6939403758911213, + "grad_norm": 0.9200828671455383, + "learning_rate": 1.3524025476887527e-07, + "loss": 0.0577, + "step": 16627 + }, + { + "epoch": 2.694102397926118, + "grad_norm": 1.1384090185165405, + "learning_rate": 1.3509840897138083e-07, + "loss": 0.0641, + "step": 16628 + }, + { + "epoch": 2.6942644199611148, + "grad_norm": 0.9467897415161133, + "learning_rate": 1.349566355341525e-07, + "loss": 0.0635, + "step": 16629 + }, + { + "epoch": 2.6944264419961117, + "grad_norm": 0.8724448680877686, + "learning_rate": 1.3481493446152766e-07, + "loss": 0.0598, + "step": 16630 + }, + { + "epoch": 2.694588464031108, + "grad_norm": 0.9046519994735718, + "learning_rate": 1.3467330575784226e-07, + "loss": 0.0619, + "step": 16631 + }, + { + "epoch": 2.694750486066105, + "grad_norm": 0.7974745631217957, + "learning_rate": 1.3453174942743008e-07, + "loss": 0.0567, + "step": 16632 + }, + { + "epoch": 2.6949125081011016, + "grad_norm": 0.8849309682846069, + "learning_rate": 1.3439026547462126e-07, + "loss": 0.0636, + "step": 16633 + }, + { + "epoch": 2.6950745301360985, + "grad_norm": 1.4565906524658203, + "learning_rate": 1.3424885390374593e-07, + "loss": 0.0653, + "step": 16634 + }, + { + "epoch": 2.695236552171095, + "grad_norm": 0.9429144859313965, + "learning_rate": 1.341075147191312e-07, + "loss": 0.0632, + "step": 16635 + }, + { + "epoch": 2.695398574206092, + "grad_norm": 0.8843449354171753, + "learning_rate": 1.3396624792510082e-07, + "loss": 0.0587, + "step": 16636 + }, + { + "epoch": 2.695560596241089, + "grad_norm": 0.84376460313797, + "learning_rate": 1.3382505352597747e-07, + "loss": 0.0592, + "step": 16637 + }, + { + "epoch": 2.6957226182760854, + "grad_norm": 0.9161346554756165, + "learning_rate": 1.33683931526081e-07, + "loss": 0.0644, + "step": 16638 + }, + { + "epoch": 2.6958846403110823, + "grad_norm": 0.9942044019699097, + "learning_rate": 1.3354288192973074e-07, + "loss": 0.0626, + "step": 16639 + }, + { + "epoch": 2.6960466623460793, + "grad_norm": 0.9702102541923523, + "learning_rate": 1.3340190474124104e-07, + "loss": 0.0654, + "step": 16640 + }, + { + "epoch": 2.6962086843810757, + "grad_norm": 0.9594953060150146, + "learning_rate": 1.3326099996492618e-07, + "loss": 0.0608, + "step": 16641 + }, + { + "epoch": 2.6963707064160727, + "grad_norm": 0.9994754195213318, + "learning_rate": 1.3312016760509722e-07, + "loss": 0.0606, + "step": 16642 + }, + { + "epoch": 2.6965327284510696, + "grad_norm": 0.8460058569908142, + "learning_rate": 1.3297940766606344e-07, + "loss": 0.0541, + "step": 16643 + }, + { + "epoch": 2.696694750486066, + "grad_norm": 0.9044547080993652, + "learning_rate": 1.3283872015213168e-07, + "loss": 0.0607, + "step": 16644 + }, + { + "epoch": 2.6968567725210626, + "grad_norm": 0.807353675365448, + "learning_rate": 1.3269810506760683e-07, + "loss": 0.0583, + "step": 16645 + }, + { + "epoch": 2.6970187945560595, + "grad_norm": 1.019974708557129, + "learning_rate": 1.3255756241679102e-07, + "loss": 0.0644, + "step": 16646 + }, + { + "epoch": 2.6971808165910565, + "grad_norm": 0.818080723285675, + "learning_rate": 1.3241709220398467e-07, + "loss": 0.0579, + "step": 16647 + }, + { + "epoch": 2.697342838626053, + "grad_norm": 0.959563672542572, + "learning_rate": 1.3227669443348578e-07, + "loss": 0.0625, + "step": 16648 + }, + { + "epoch": 2.69750486066105, + "grad_norm": 0.9866206645965576, + "learning_rate": 1.321363691095906e-07, + "loss": 0.0584, + "step": 16649 + }, + { + "epoch": 2.697666882696047, + "grad_norm": 1.2002931833267212, + "learning_rate": 1.3199611623659235e-07, + "loss": 0.0728, + "step": 16650 + }, + { + "epoch": 2.6978289047310433, + "grad_norm": 0.8130374550819397, + "learning_rate": 1.3185593581878238e-07, + "loss": 0.0582, + "step": 16651 + }, + { + "epoch": 2.6979909267660402, + "grad_norm": 0.883966863155365, + "learning_rate": 1.3171582786044968e-07, + "loss": 0.0638, + "step": 16652 + }, + { + "epoch": 2.698152948801037, + "grad_norm": 1.0147669315338135, + "learning_rate": 1.3157579236588197e-07, + "loss": 0.0619, + "step": 16653 + }, + { + "epoch": 2.6983149708360337, + "grad_norm": 0.9203122854232788, + "learning_rate": 1.3143582933936333e-07, + "loss": 0.0581, + "step": 16654 + }, + { + "epoch": 2.6984769928710306, + "grad_norm": 0.9031851887702942, + "learning_rate": 1.3129593878517643e-07, + "loss": 0.0599, + "step": 16655 + }, + { + "epoch": 2.698639014906027, + "grad_norm": 0.8836839199066162, + "learning_rate": 1.3115612070760174e-07, + "loss": 0.0583, + "step": 16656 + }, + { + "epoch": 2.698801036941024, + "grad_norm": 0.9112008810043335, + "learning_rate": 1.3101637511091724e-07, + "loss": 0.0549, + "step": 16657 + }, + { + "epoch": 2.6989630589760205, + "grad_norm": 1.0826163291931152, + "learning_rate": 1.3087670199939894e-07, + "loss": 0.0597, + "step": 16658 + }, + { + "epoch": 2.6991250810110174, + "grad_norm": 0.8839635252952576, + "learning_rate": 1.3073710137732037e-07, + "loss": 0.0545, + "step": 16659 + }, + { + "epoch": 2.6992871030460144, + "grad_norm": 0.9589625000953674, + "learning_rate": 1.3059757324895283e-07, + "loss": 0.0638, + "step": 16660 + }, + { + "epoch": 2.699449125081011, + "grad_norm": 0.862845242023468, + "learning_rate": 1.3045811761856597e-07, + "loss": 0.0589, + "step": 16661 + }, + { + "epoch": 2.699611147116008, + "grad_norm": 0.8167102336883545, + "learning_rate": 1.303187344904261e-07, + "loss": 0.0543, + "step": 16662 + }, + { + "epoch": 2.6997731691510047, + "grad_norm": 1.040915608406067, + "learning_rate": 1.3017942386879867e-07, + "loss": 0.067, + "step": 16663 + }, + { + "epoch": 2.6999351911860012, + "grad_norm": 0.9312401413917542, + "learning_rate": 1.3004018575794586e-07, + "loss": 0.0596, + "step": 16664 + }, + { + "epoch": 2.700097213220998, + "grad_norm": 1.038403034210205, + "learning_rate": 1.2990102016212868e-07, + "loss": 0.0634, + "step": 16665 + }, + { + "epoch": 2.7002592352559946, + "grad_norm": 0.8530080914497375, + "learning_rate": 1.2976192708560432e-07, + "loss": 0.0642, + "step": 16666 + }, + { + "epoch": 2.7004212572909916, + "grad_norm": 1.1165688037872314, + "learning_rate": 1.2962290653262903e-07, + "loss": 0.0593, + "step": 16667 + }, + { + "epoch": 2.700583279325988, + "grad_norm": 0.9035430550575256, + "learning_rate": 1.2948395850745726e-07, + "loss": 0.0613, + "step": 16668 + }, + { + "epoch": 2.700745301360985, + "grad_norm": 0.8505802154541016, + "learning_rate": 1.293450830143392e-07, + "loss": 0.0609, + "step": 16669 + }, + { + "epoch": 2.700907323395982, + "grad_norm": 0.98024582862854, + "learning_rate": 1.29206280057525e-07, + "loss": 0.0601, + "step": 16670 + }, + { + "epoch": 2.7010693454309784, + "grad_norm": 0.8178975582122803, + "learning_rate": 1.2906754964126078e-07, + "loss": 0.0573, + "step": 16671 + }, + { + "epoch": 2.7012313674659754, + "grad_norm": 0.8594771027565002, + "learning_rate": 1.2892889176979284e-07, + "loss": 0.0607, + "step": 16672 + }, + { + "epoch": 2.7013933895009723, + "grad_norm": 0.9803131818771362, + "learning_rate": 1.2879030644736252e-07, + "loss": 0.0643, + "step": 16673 + }, + { + "epoch": 2.701555411535969, + "grad_norm": 0.8199018836021423, + "learning_rate": 1.2865179367821083e-07, + "loss": 0.0536, + "step": 16674 + }, + { + "epoch": 2.7017174335709657, + "grad_norm": 0.8338936567306519, + "learning_rate": 1.2851335346657557e-07, + "loss": 0.0597, + "step": 16675 + }, + { + "epoch": 2.7018794556059627, + "grad_norm": 0.974616527557373, + "learning_rate": 1.283749858166927e-07, + "loss": 0.0615, + "step": 16676 + }, + { + "epoch": 2.702041477640959, + "grad_norm": 0.9886922240257263, + "learning_rate": 1.2823669073279615e-07, + "loss": 0.0644, + "step": 16677 + }, + { + "epoch": 2.702203499675956, + "grad_norm": 0.7480154037475586, + "learning_rate": 1.280984682191172e-07, + "loss": 0.0548, + "step": 16678 + }, + { + "epoch": 2.7023655217109526, + "grad_norm": 0.8866371512413025, + "learning_rate": 1.2796031827988582e-07, + "loss": 0.0576, + "step": 16679 + }, + { + "epoch": 2.7025275437459495, + "grad_norm": 1.1099352836608887, + "learning_rate": 1.2782224091932775e-07, + "loss": 0.0609, + "step": 16680 + }, + { + "epoch": 2.702689565780946, + "grad_norm": 0.902736246585846, + "learning_rate": 1.276842361416686e-07, + "loss": 0.0601, + "step": 16681 + }, + { + "epoch": 2.702851587815943, + "grad_norm": 1.0859075784683228, + "learning_rate": 1.2754630395113098e-07, + "loss": 0.0672, + "step": 16682 + }, + { + "epoch": 2.70301360985094, + "grad_norm": 0.9454689025878906, + "learning_rate": 1.2740844435193578e-07, + "loss": 0.0651, + "step": 16683 + }, + { + "epoch": 2.7031756318859363, + "grad_norm": 0.8466532230377197, + "learning_rate": 1.2727065734830013e-07, + "loss": 0.0515, + "step": 16684 + }, + { + "epoch": 2.7033376539209333, + "grad_norm": 0.9573439359664917, + "learning_rate": 1.271329429444404e-07, + "loss": 0.0619, + "step": 16685 + }, + { + "epoch": 2.70349967595593, + "grad_norm": 0.9744999408721924, + "learning_rate": 1.269953011445707e-07, + "loss": 0.0584, + "step": 16686 + }, + { + "epoch": 2.7036616979909267, + "grad_norm": 0.9962242841720581, + "learning_rate": 1.2685773195290186e-07, + "loss": 0.0582, + "step": 16687 + }, + { + "epoch": 2.7038237200259236, + "grad_norm": 0.9875708818435669, + "learning_rate": 1.267202353736438e-07, + "loss": 0.0621, + "step": 16688 + }, + { + "epoch": 2.70398574206092, + "grad_norm": 0.8407350778579712, + "learning_rate": 1.265828114110032e-07, + "loss": 0.0556, + "step": 16689 + }, + { + "epoch": 2.704147764095917, + "grad_norm": 0.9471036791801453, + "learning_rate": 1.26445460069185e-07, + "loss": 0.0601, + "step": 16690 + }, + { + "epoch": 2.7043097861309136, + "grad_norm": 0.739356279373169, + "learning_rate": 1.2630818135239198e-07, + "loss": 0.0503, + "step": 16691 + }, + { + "epoch": 2.7044718081659105, + "grad_norm": 1.1428637504577637, + "learning_rate": 1.2617097526482407e-07, + "loss": 0.0649, + "step": 16692 + }, + { + "epoch": 2.7046338302009074, + "grad_norm": 0.9683777689933777, + "learning_rate": 1.2603384181068018e-07, + "loss": 0.0627, + "step": 16693 + }, + { + "epoch": 2.704795852235904, + "grad_norm": 1.0176392793655396, + "learning_rate": 1.2589678099415582e-07, + "loss": 0.0653, + "step": 16694 + }, + { + "epoch": 2.704957874270901, + "grad_norm": 0.9063898921012878, + "learning_rate": 1.2575979281944429e-07, + "loss": 0.0545, + "step": 16695 + }, + { + "epoch": 2.7051198963058978, + "grad_norm": 1.0391916036605835, + "learning_rate": 1.256228772907378e-07, + "loss": 0.0706, + "step": 16696 + }, + { + "epoch": 2.7052819183408943, + "grad_norm": 0.9358698129653931, + "learning_rate": 1.254860344122255e-07, + "loss": 0.0651, + "step": 16697 + }, + { + "epoch": 2.705443940375891, + "grad_norm": 1.0807019472122192, + "learning_rate": 1.2534926418809433e-07, + "loss": 0.0649, + "step": 16698 + }, + { + "epoch": 2.705605962410888, + "grad_norm": 0.9569528102874756, + "learning_rate": 1.2521256662252902e-07, + "loss": 0.0634, + "step": 16699 + }, + { + "epoch": 2.7057679844458846, + "grad_norm": 0.9983630180358887, + "learning_rate": 1.2507594171971198e-07, + "loss": 0.0618, + "step": 16700 + }, + { + "epoch": 2.7059300064808816, + "grad_norm": 1.1226799488067627, + "learning_rate": 1.2493938948382468e-07, + "loss": 0.0627, + "step": 16701 + }, + { + "epoch": 2.706092028515878, + "grad_norm": 0.9132358431816101, + "learning_rate": 1.2480290991904398e-07, + "loss": 0.064, + "step": 16702 + }, + { + "epoch": 2.706254050550875, + "grad_norm": 0.8795627951622009, + "learning_rate": 1.246665030295463e-07, + "loss": 0.0611, + "step": 16703 + }, + { + "epoch": 2.7064160725858715, + "grad_norm": 0.8634214997291565, + "learning_rate": 1.245301688195058e-07, + "loss": 0.0563, + "step": 16704 + }, + { + "epoch": 2.7065780946208684, + "grad_norm": 0.9130164384841919, + "learning_rate": 1.243939072930933e-07, + "loss": 0.057, + "step": 16705 + }, + { + "epoch": 2.7067401166558653, + "grad_norm": 1.1923558712005615, + "learning_rate": 1.2425771845447853e-07, + "loss": 0.0607, + "step": 16706 + }, + { + "epoch": 2.706902138690862, + "grad_norm": 1.020593523979187, + "learning_rate": 1.2412160230782844e-07, + "loss": 0.0604, + "step": 16707 + }, + { + "epoch": 2.7070641607258588, + "grad_norm": 0.8900664448738098, + "learning_rate": 1.2398555885730774e-07, + "loss": 0.0539, + "step": 16708 + }, + { + "epoch": 2.7072261827608557, + "grad_norm": 0.9280899167060852, + "learning_rate": 1.2384958810707892e-07, + "loss": 0.0645, + "step": 16709 + }, + { + "epoch": 2.707388204795852, + "grad_norm": 0.9784387350082397, + "learning_rate": 1.2371369006130256e-07, + "loss": 0.063, + "step": 16710 + }, + { + "epoch": 2.707550226830849, + "grad_norm": 0.920939564704895, + "learning_rate": 1.2357786472413702e-07, + "loss": 0.0656, + "step": 16711 + }, + { + "epoch": 2.7077122488658456, + "grad_norm": 0.858536958694458, + "learning_rate": 1.2344211209973811e-07, + "loss": 0.0582, + "step": 16712 + }, + { + "epoch": 2.7078742709008425, + "grad_norm": 0.9796631336212158, + "learning_rate": 1.2330643219225918e-07, + "loss": 0.0593, + "step": 16713 + }, + { + "epoch": 2.708036292935839, + "grad_norm": 0.9091916084289551, + "learning_rate": 1.2317082500585163e-07, + "loss": 0.06, + "step": 16714 + }, + { + "epoch": 2.708198314970836, + "grad_norm": 1.0876365900039673, + "learning_rate": 1.2303529054466522e-07, + "loss": 0.0648, + "step": 16715 + }, + { + "epoch": 2.708360337005833, + "grad_norm": 0.9896219968795776, + "learning_rate": 1.2289982881284718e-07, + "loss": 0.0657, + "step": 16716 + }, + { + "epoch": 2.7085223590408294, + "grad_norm": 0.9894326329231262, + "learning_rate": 1.2276443981454167e-07, + "loss": 0.0645, + "step": 16717 + }, + { + "epoch": 2.7086843810758263, + "grad_norm": 0.9388425946235657, + "learning_rate": 1.226291235538915e-07, + "loss": 0.0582, + "step": 16718 + }, + { + "epoch": 2.7088464031108233, + "grad_norm": 0.9230713844299316, + "learning_rate": 1.22493880035037e-07, + "loss": 0.0647, + "step": 16719 + }, + { + "epoch": 2.7090084251458197, + "grad_norm": 0.8833425045013428, + "learning_rate": 1.223587092621162e-07, + "loss": 0.0634, + "step": 16720 + }, + { + "epoch": 2.7091704471808167, + "grad_norm": 0.9370494484901428, + "learning_rate": 1.2222361123926525e-07, + "loss": 0.0603, + "step": 16721 + }, + { + "epoch": 2.7093324692158136, + "grad_norm": 0.8519914150238037, + "learning_rate": 1.2208858597061752e-07, + "loss": 0.0599, + "step": 16722 + }, + { + "epoch": 2.70949449125081, + "grad_norm": 0.9887134432792664, + "learning_rate": 1.2195363346030497e-07, + "loss": 0.0671, + "step": 16723 + }, + { + "epoch": 2.709656513285807, + "grad_norm": 0.8368325233459473, + "learning_rate": 1.21818753712456e-07, + "loss": 0.0527, + "step": 16724 + }, + { + "epoch": 2.7098185353208035, + "grad_norm": 0.8991503119468689, + "learning_rate": 1.2168394673119837e-07, + "loss": 0.0631, + "step": 16725 + }, + { + "epoch": 2.7099805573558005, + "grad_norm": 0.9804664850234985, + "learning_rate": 1.2154921252065633e-07, + "loss": 0.0622, + "step": 16726 + }, + { + "epoch": 2.710142579390797, + "grad_norm": 0.8143778443336487, + "learning_rate": 1.2141455108495321e-07, + "loss": 0.0536, + "step": 16727 + }, + { + "epoch": 2.710304601425794, + "grad_norm": 0.934648334980011, + "learning_rate": 1.2127996242820822e-07, + "loss": 0.0581, + "step": 16728 + }, + { + "epoch": 2.710466623460791, + "grad_norm": 1.0055309534072876, + "learning_rate": 1.2114544655454002e-07, + "loss": 0.0675, + "step": 16729 + }, + { + "epoch": 2.7106286454957873, + "grad_norm": 0.9444065690040588, + "learning_rate": 1.2101100346806478e-07, + "loss": 0.0661, + "step": 16730 + }, + { + "epoch": 2.7107906675307842, + "grad_norm": 0.7594142556190491, + "learning_rate": 1.2087663317289554e-07, + "loss": 0.0543, + "step": 16731 + }, + { + "epoch": 2.710952689565781, + "grad_norm": 0.9627096056938171, + "learning_rate": 1.2074233567314408e-07, + "loss": 0.0609, + "step": 16732 + }, + { + "epoch": 2.7111147116007777, + "grad_norm": 0.8099537491798401, + "learning_rate": 1.2060811097291874e-07, + "loss": 0.0559, + "step": 16733 + }, + { + "epoch": 2.7112767336357746, + "grad_norm": 1.0347872972488403, + "learning_rate": 1.2047395907632818e-07, + "loss": 0.06, + "step": 16734 + }, + { + "epoch": 2.711438755670771, + "grad_norm": 0.824565052986145, + "learning_rate": 1.2033987998747582e-07, + "loss": 0.062, + "step": 16735 + }, + { + "epoch": 2.711600777705768, + "grad_norm": 1.082157015800476, + "learning_rate": 1.2020587371046445e-07, + "loss": 0.0715, + "step": 16736 + }, + { + "epoch": 2.7117627997407645, + "grad_norm": 0.8374156355857849, + "learning_rate": 1.2007194024939412e-07, + "loss": 0.0589, + "step": 16737 + }, + { + "epoch": 2.7119248217757614, + "grad_norm": 1.058831810951233, + "learning_rate": 1.1993807960836322e-07, + "loss": 0.0694, + "step": 16738 + }, + { + "epoch": 2.7120868438107584, + "grad_norm": 1.0584897994995117, + "learning_rate": 1.198042917914677e-07, + "loss": 0.0624, + "step": 16739 + }, + { + "epoch": 2.712248865845755, + "grad_norm": 0.9175471067428589, + "learning_rate": 1.1967057680280058e-07, + "loss": 0.0571, + "step": 16740 + }, + { + "epoch": 2.712410887880752, + "grad_norm": 0.9468705058097839, + "learning_rate": 1.1953693464645395e-07, + "loss": 0.0601, + "step": 16741 + }, + { + "epoch": 2.7125729099157487, + "grad_norm": 0.8563498258590698, + "learning_rate": 1.1940336532651614e-07, + "loss": 0.0611, + "step": 16742 + }, + { + "epoch": 2.712734931950745, + "grad_norm": 1.0424282550811768, + "learning_rate": 1.192698688470742e-07, + "loss": 0.058, + "step": 16743 + }, + { + "epoch": 2.712896953985742, + "grad_norm": 0.9378633499145508, + "learning_rate": 1.1913644521221345e-07, + "loss": 0.0613, + "step": 16744 + }, + { + "epoch": 2.713058976020739, + "grad_norm": 0.874260425567627, + "learning_rate": 1.1900309442601593e-07, + "loss": 0.0591, + "step": 16745 + }, + { + "epoch": 2.7132209980557356, + "grad_norm": 1.0240492820739746, + "learning_rate": 1.1886981649256169e-07, + "loss": 0.0673, + "step": 16746 + }, + { + "epoch": 2.713383020090732, + "grad_norm": 0.8745399117469788, + "learning_rate": 1.1873661141592857e-07, + "loss": 0.0605, + "step": 16747 + }, + { + "epoch": 2.713545042125729, + "grad_norm": 0.8494243025779724, + "learning_rate": 1.1860347920019304e-07, + "loss": 0.0592, + "step": 16748 + }, + { + "epoch": 2.713707064160726, + "grad_norm": 0.9611318111419678, + "learning_rate": 1.184704198494277e-07, + "loss": 0.0642, + "step": 16749 + }, + { + "epoch": 2.7138690861957224, + "grad_norm": 0.9604257941246033, + "learning_rate": 1.1833743336770482e-07, + "loss": 0.0631, + "step": 16750 + }, + { + "epoch": 2.7140311082307194, + "grad_norm": 0.8594368696212769, + "learning_rate": 1.1820451975909253e-07, + "loss": 0.0571, + "step": 16751 + }, + { + "epoch": 2.7141931302657163, + "grad_norm": 0.8387914299964905, + "learning_rate": 1.1807167902765843e-07, + "loss": 0.0564, + "step": 16752 + }, + { + "epoch": 2.714355152300713, + "grad_norm": 1.1130517721176147, + "learning_rate": 1.1793891117746648e-07, + "loss": 0.0618, + "step": 16753 + }, + { + "epoch": 2.7145171743357097, + "grad_norm": 0.8822175860404968, + "learning_rate": 1.1780621621257953e-07, + "loss": 0.057, + "step": 16754 + }, + { + "epoch": 2.7146791963707066, + "grad_norm": 0.8818928599357605, + "learning_rate": 1.176735941370577e-07, + "loss": 0.0605, + "step": 16755 + }, + { + "epoch": 2.714841218405703, + "grad_norm": 0.932831883430481, + "learning_rate": 1.1754104495495882e-07, + "loss": 0.064, + "step": 16756 + }, + { + "epoch": 2.7150032404407, + "grad_norm": 0.8247097134590149, + "learning_rate": 1.1740856867033801e-07, + "loss": 0.0576, + "step": 16757 + }, + { + "epoch": 2.7151652624756966, + "grad_norm": 0.8611258864402771, + "learning_rate": 1.1727616528724949e-07, + "loss": 0.0581, + "step": 16758 + }, + { + "epoch": 2.7153272845106935, + "grad_norm": 0.8132169842720032, + "learning_rate": 1.171438348097445e-07, + "loss": 0.0533, + "step": 16759 + }, + { + "epoch": 2.71548930654569, + "grad_norm": 0.8210513591766357, + "learning_rate": 1.1701157724187173e-07, + "loss": 0.0576, + "step": 16760 + }, + { + "epoch": 2.715651328580687, + "grad_norm": 0.9274697303771973, + "learning_rate": 1.1687939258767795e-07, + "loss": 0.0642, + "step": 16761 + }, + { + "epoch": 2.715813350615684, + "grad_norm": 0.9596958160400391, + "learning_rate": 1.1674728085120713e-07, + "loss": 0.063, + "step": 16762 + }, + { + "epoch": 2.7159753726506803, + "grad_norm": 0.9137746691703796, + "learning_rate": 1.1661524203650298e-07, + "loss": 0.0604, + "step": 16763 + }, + { + "epoch": 2.7161373946856773, + "grad_norm": 0.9403409361839294, + "learning_rate": 1.1648327614760452e-07, + "loss": 0.0622, + "step": 16764 + }, + { + "epoch": 2.716299416720674, + "grad_norm": 0.9043160676956177, + "learning_rate": 1.1635138318854961e-07, + "loss": 0.0668, + "step": 16765 + }, + { + "epoch": 2.7164614387556707, + "grad_norm": 0.8880981802940369, + "learning_rate": 1.1621956316337391e-07, + "loss": 0.061, + "step": 16766 + }, + { + "epoch": 2.7166234607906676, + "grad_norm": 0.938401460647583, + "learning_rate": 1.1608781607611113e-07, + "loss": 0.0631, + "step": 16767 + }, + { + "epoch": 2.7167854828256646, + "grad_norm": 0.9028334021568298, + "learning_rate": 1.1595614193079224e-07, + "loss": 0.0553, + "step": 16768 + }, + { + "epoch": 2.716947504860661, + "grad_norm": 0.8787040710449219, + "learning_rate": 1.1582454073144623e-07, + "loss": 0.0611, + "step": 16769 + }, + { + "epoch": 2.7171095268956575, + "grad_norm": 0.8604116439819336, + "learning_rate": 1.1569301248209958e-07, + "loss": 0.0499, + "step": 16770 + }, + { + "epoch": 2.7172715489306545, + "grad_norm": 0.9343183636665344, + "learning_rate": 1.1556155718677714e-07, + "loss": 0.0587, + "step": 16771 + }, + { + "epoch": 2.7174335709656514, + "grad_norm": 0.7739042639732361, + "learning_rate": 1.1543017484950015e-07, + "loss": 0.0545, + "step": 16772 + }, + { + "epoch": 2.717595593000648, + "grad_norm": 1.046132206916809, + "learning_rate": 1.1529886547428954e-07, + "loss": 0.0585, + "step": 16773 + }, + { + "epoch": 2.717757615035645, + "grad_norm": 0.8855156302452087, + "learning_rate": 1.1516762906516322e-07, + "loss": 0.0581, + "step": 16774 + }, + { + "epoch": 2.7179196370706418, + "grad_norm": 0.8061611652374268, + "learning_rate": 1.1503646562613602e-07, + "loss": 0.0588, + "step": 16775 + }, + { + "epoch": 2.7180816591056383, + "grad_norm": 0.9897039532661438, + "learning_rate": 1.1490537516122141e-07, + "loss": 0.0622, + "step": 16776 + }, + { + "epoch": 2.718243681140635, + "grad_norm": 0.9542493224143982, + "learning_rate": 1.1477435767443007e-07, + "loss": 0.0619, + "step": 16777 + }, + { + "epoch": 2.718405703175632, + "grad_norm": 0.8548462986946106, + "learning_rate": 1.1464341316977184e-07, + "loss": 0.0585, + "step": 16778 + }, + { + "epoch": 2.7185677252106286, + "grad_norm": 0.810540497303009, + "learning_rate": 1.145125416512527e-07, + "loss": 0.053, + "step": 16779 + }, + { + "epoch": 2.7187297472456255, + "grad_norm": 0.8522582054138184, + "learning_rate": 1.1438174312287664e-07, + "loss": 0.0563, + "step": 16780 + }, + { + "epoch": 2.718891769280622, + "grad_norm": 0.9812653064727783, + "learning_rate": 1.1425101758864632e-07, + "loss": 0.0679, + "step": 16781 + }, + { + "epoch": 2.719053791315619, + "grad_norm": 1.0658005475997925, + "learning_rate": 1.1412036505256158e-07, + "loss": 0.057, + "step": 16782 + }, + { + "epoch": 2.7192158133506155, + "grad_norm": 0.930414080619812, + "learning_rate": 1.1398978551861978e-07, + "loss": 0.0626, + "step": 16783 + }, + { + "epoch": 2.7193778353856124, + "grad_norm": 0.9118586182594299, + "learning_rate": 1.1385927899081661e-07, + "loss": 0.0641, + "step": 16784 + }, + { + "epoch": 2.7195398574206093, + "grad_norm": 0.9204965829849243, + "learning_rate": 1.13728845473145e-07, + "loss": 0.0619, + "step": 16785 + }, + { + "epoch": 2.719701879455606, + "grad_norm": 0.9930737614631653, + "learning_rate": 1.1359848496959618e-07, + "loss": 0.0617, + "step": 16786 + }, + { + "epoch": 2.7198639014906028, + "grad_norm": 0.8213454484939575, + "learning_rate": 1.1346819748415893e-07, + "loss": 0.0568, + "step": 16787 + }, + { + "epoch": 2.7200259235255997, + "grad_norm": 0.8408246636390686, + "learning_rate": 1.1333798302081922e-07, + "loss": 0.0554, + "step": 16788 + }, + { + "epoch": 2.720187945560596, + "grad_norm": 0.8678555488586426, + "learning_rate": 1.1320784158356218e-07, + "loss": 0.0564, + "step": 16789 + }, + { + "epoch": 2.720349967595593, + "grad_norm": 0.8617439866065979, + "learning_rate": 1.1307777317636882e-07, + "loss": 0.0587, + "step": 16790 + }, + { + "epoch": 2.7205119896305896, + "grad_norm": 0.8767697811126709, + "learning_rate": 1.1294777780321898e-07, + "loss": 0.0613, + "step": 16791 + }, + { + "epoch": 2.7206740116655865, + "grad_norm": 1.116636872291565, + "learning_rate": 1.1281785546809115e-07, + "loss": 0.0709, + "step": 16792 + }, + { + "epoch": 2.720836033700583, + "grad_norm": 0.9495435357093811, + "learning_rate": 1.1268800617495995e-07, + "loss": 0.0596, + "step": 16793 + }, + { + "epoch": 2.72099805573558, + "grad_norm": 0.8045752644538879, + "learning_rate": 1.1255822992779858e-07, + "loss": 0.0519, + "step": 16794 + }, + { + "epoch": 2.721160077770577, + "grad_norm": 0.7897116541862488, + "learning_rate": 1.1242852673057774e-07, + "loss": 0.0559, + "step": 16795 + }, + { + "epoch": 2.7213220998055734, + "grad_norm": 0.9391621947288513, + "learning_rate": 1.1229889658726623e-07, + "loss": 0.0606, + "step": 16796 + }, + { + "epoch": 2.7214841218405703, + "grad_norm": 0.8378191590309143, + "learning_rate": 1.1216933950183028e-07, + "loss": 0.0553, + "step": 16797 + }, + { + "epoch": 2.7216461438755672, + "grad_norm": 0.8450056314468384, + "learning_rate": 1.1203985547823427e-07, + "loss": 0.0608, + "step": 16798 + }, + { + "epoch": 2.7218081659105637, + "grad_norm": 0.9059349894523621, + "learning_rate": 1.1191044452043998e-07, + "loss": 0.0581, + "step": 16799 + }, + { + "epoch": 2.7219701879455607, + "grad_norm": 0.9473249316215515, + "learning_rate": 1.1178110663240676e-07, + "loss": 0.0646, + "step": 16800 + }, + { + "epoch": 2.7221322099805576, + "grad_norm": 1.025634527206421, + "learning_rate": 1.1165184181809258e-07, + "loss": 0.0586, + "step": 16801 + }, + { + "epoch": 2.722294232015554, + "grad_norm": 0.9218292832374573, + "learning_rate": 1.1152265008145202e-07, + "loss": 0.0651, + "step": 16802 + }, + { + "epoch": 2.722456254050551, + "grad_norm": 0.8640052080154419, + "learning_rate": 1.1139353142643861e-07, + "loss": 0.0558, + "step": 16803 + }, + { + "epoch": 2.7226182760855475, + "grad_norm": 1.0886274576187134, + "learning_rate": 1.1126448585700306e-07, + "loss": 0.06, + "step": 16804 + }, + { + "epoch": 2.7227802981205445, + "grad_norm": 0.8292529582977295, + "learning_rate": 1.1113551337709305e-07, + "loss": 0.0591, + "step": 16805 + }, + { + "epoch": 2.722942320155541, + "grad_norm": 0.85284024477005, + "learning_rate": 1.110066139906557e-07, + "loss": 0.0607, + "step": 16806 + }, + { + "epoch": 2.723104342190538, + "grad_norm": 0.8384661674499512, + "learning_rate": 1.1087778770163482e-07, + "loss": 0.0543, + "step": 16807 + }, + { + "epoch": 2.723266364225535, + "grad_norm": 0.9519869089126587, + "learning_rate": 1.1074903451397195e-07, + "loss": 0.0633, + "step": 16808 + }, + { + "epoch": 2.7234283862605313, + "grad_norm": 0.8796502947807312, + "learning_rate": 1.1062035443160673e-07, + "loss": 0.0599, + "step": 16809 + }, + { + "epoch": 2.7235904082955282, + "grad_norm": 0.9636805057525635, + "learning_rate": 1.1049174745847657e-07, + "loss": 0.065, + "step": 16810 + }, + { + "epoch": 2.723752430330525, + "grad_norm": 0.8485105037689209, + "learning_rate": 1.1036321359851638e-07, + "loss": 0.0555, + "step": 16811 + }, + { + "epoch": 2.7239144523655217, + "grad_norm": 1.940087080001831, + "learning_rate": 1.1023475285565882e-07, + "loss": 0.0602, + "step": 16812 + }, + { + "epoch": 2.7240764744005186, + "grad_norm": 0.9121723175048828, + "learning_rate": 1.1010636523383494e-07, + "loss": 0.0591, + "step": 16813 + }, + { + "epoch": 2.724238496435515, + "grad_norm": 0.9082077145576477, + "learning_rate": 1.099780507369727e-07, + "loss": 0.062, + "step": 16814 + }, + { + "epoch": 2.724400518470512, + "grad_norm": 0.9480251669883728, + "learning_rate": 1.0984980936899842e-07, + "loss": 0.0608, + "step": 16815 + }, + { + "epoch": 2.7245625405055085, + "grad_norm": 1.0009106397628784, + "learning_rate": 1.0972164113383616e-07, + "loss": 0.0651, + "step": 16816 + }, + { + "epoch": 2.7247245625405054, + "grad_norm": 1.0686898231506348, + "learning_rate": 1.0959354603540695e-07, + "loss": 0.069, + "step": 16817 + }, + { + "epoch": 2.7248865845755024, + "grad_norm": 0.8673676252365112, + "learning_rate": 1.09465524077631e-07, + "loss": 0.0545, + "step": 16818 + }, + { + "epoch": 2.725048606610499, + "grad_norm": 0.9199877977371216, + "learning_rate": 1.093375752644249e-07, + "loss": 0.053, + "step": 16819 + }, + { + "epoch": 2.725210628645496, + "grad_norm": 0.8862485885620117, + "learning_rate": 1.0920969959970301e-07, + "loss": 0.0579, + "step": 16820 + }, + { + "epoch": 2.7253726506804927, + "grad_norm": 0.8255491256713867, + "learning_rate": 1.0908189708737942e-07, + "loss": 0.0614, + "step": 16821 + }, + { + "epoch": 2.725534672715489, + "grad_norm": 0.9784971475601196, + "learning_rate": 1.0895416773136408e-07, + "loss": 0.0637, + "step": 16822 + }, + { + "epoch": 2.725696694750486, + "grad_norm": 1.1451879739761353, + "learning_rate": 1.088265115355644e-07, + "loss": 0.0695, + "step": 16823 + }, + { + "epoch": 2.725858716785483, + "grad_norm": 0.827608585357666, + "learning_rate": 1.0869892850388697e-07, + "loss": 0.0603, + "step": 16824 + }, + { + "epoch": 2.7260207388204796, + "grad_norm": 0.9534388184547424, + "learning_rate": 1.0857141864023591e-07, + "loss": 0.0635, + "step": 16825 + }, + { + "epoch": 2.7261827608554765, + "grad_norm": 0.9384109973907471, + "learning_rate": 1.0844398194851197e-07, + "loss": 0.0566, + "step": 16826 + }, + { + "epoch": 2.726344782890473, + "grad_norm": 1.1413118839263916, + "learning_rate": 1.0831661843261482e-07, + "loss": 0.0673, + "step": 16827 + }, + { + "epoch": 2.72650680492547, + "grad_norm": 0.878052830696106, + "learning_rate": 1.0818932809644161e-07, + "loss": 0.0537, + "step": 16828 + }, + { + "epoch": 2.7266688269604664, + "grad_norm": 0.9519762992858887, + "learning_rate": 1.0806211094388647e-07, + "loss": 0.0624, + "step": 16829 + }, + { + "epoch": 2.7268308489954634, + "grad_norm": 0.9057498574256897, + "learning_rate": 1.0793496697884265e-07, + "loss": 0.0527, + "step": 16830 + }, + { + "epoch": 2.7269928710304603, + "grad_norm": 0.9844750761985779, + "learning_rate": 1.078078962052001e-07, + "loss": 0.0589, + "step": 16831 + }, + { + "epoch": 2.7271548930654568, + "grad_norm": 1.1606096029281616, + "learning_rate": 1.0768089862684684e-07, + "loss": 0.0655, + "step": 16832 + }, + { + "epoch": 2.7273169151004537, + "grad_norm": 0.8622628450393677, + "learning_rate": 1.0755397424766917e-07, + "loss": 0.0558, + "step": 16833 + }, + { + "epoch": 2.7274789371354506, + "grad_norm": 0.8796065449714661, + "learning_rate": 1.0742712307154957e-07, + "loss": 0.0685, + "step": 16834 + }, + { + "epoch": 2.727640959170447, + "grad_norm": 0.8955232501029968, + "learning_rate": 1.0730034510237048e-07, + "loss": 0.0654, + "step": 16835 + }, + { + "epoch": 2.727802981205444, + "grad_norm": 0.9430897235870361, + "learning_rate": 1.0717364034401073e-07, + "loss": 0.0662, + "step": 16836 + }, + { + "epoch": 2.7279650032404406, + "grad_norm": 0.849699079990387, + "learning_rate": 1.0704700880034696e-07, + "loss": 0.0577, + "step": 16837 + }, + { + "epoch": 2.7281270252754375, + "grad_norm": 0.9427576065063477, + "learning_rate": 1.0692045047525384e-07, + "loss": 0.0603, + "step": 16838 + }, + { + "epoch": 2.728289047310434, + "grad_norm": 0.8516416549682617, + "learning_rate": 1.0679396537260356e-07, + "loss": 0.0539, + "step": 16839 + }, + { + "epoch": 2.728451069345431, + "grad_norm": 0.9089876413345337, + "learning_rate": 1.066675534962669e-07, + "loss": 0.0621, + "step": 16840 + }, + { + "epoch": 2.728613091380428, + "grad_norm": 1.2640206813812256, + "learning_rate": 1.0654121485011131e-07, + "loss": 0.0632, + "step": 16841 + }, + { + "epoch": 2.7287751134154243, + "grad_norm": 0.8047393560409546, + "learning_rate": 1.0641494943800234e-07, + "loss": 0.0581, + "step": 16842 + }, + { + "epoch": 2.7289371354504213, + "grad_norm": 0.8546563982963562, + "learning_rate": 1.0628875726380355e-07, + "loss": 0.057, + "step": 16843 + }, + { + "epoch": 2.729099157485418, + "grad_norm": 1.0105202198028564, + "learning_rate": 1.0616263833137602e-07, + "loss": 0.0678, + "step": 16844 + }, + { + "epoch": 2.7292611795204147, + "grad_norm": 0.9958057999610901, + "learning_rate": 1.0603659264457888e-07, + "loss": 0.0581, + "step": 16845 + }, + { + "epoch": 2.7294232015554116, + "grad_norm": 0.9695627093315125, + "learning_rate": 1.0591062020726878e-07, + "loss": 0.059, + "step": 16846 + }, + { + "epoch": 2.7295852235904086, + "grad_norm": 0.8972888588905334, + "learning_rate": 1.0578472102330011e-07, + "loss": 0.0588, + "step": 16847 + }, + { + "epoch": 2.729747245625405, + "grad_norm": 1.0485844612121582, + "learning_rate": 1.0565889509652483e-07, + "loss": 0.0638, + "step": 16848 + }, + { + "epoch": 2.7299092676604015, + "grad_norm": 0.9550689458847046, + "learning_rate": 1.0553314243079343e-07, + "loss": 0.0621, + "step": 16849 + }, + { + "epoch": 2.7300712896953985, + "grad_norm": 0.8764699101448059, + "learning_rate": 1.0540746302995341e-07, + "loss": 0.0568, + "step": 16850 + }, + { + "epoch": 2.7302333117303954, + "grad_norm": 0.8220252394676208, + "learning_rate": 1.0528185689785031e-07, + "loss": 0.056, + "step": 16851 + }, + { + "epoch": 2.730395333765392, + "grad_norm": 1.0028624534606934, + "learning_rate": 1.0515632403832715e-07, + "loss": 0.0509, + "step": 16852 + }, + { + "epoch": 2.730557355800389, + "grad_norm": 0.84007328748703, + "learning_rate": 1.0503086445522476e-07, + "loss": 0.0505, + "step": 16853 + }, + { + "epoch": 2.7307193778353858, + "grad_norm": 0.9844392538070679, + "learning_rate": 1.0490547815238228e-07, + "loss": 0.0602, + "step": 16854 + }, + { + "epoch": 2.7308813998703823, + "grad_norm": 1.050789475440979, + "learning_rate": 1.0478016513363665e-07, + "loss": 0.0632, + "step": 16855 + }, + { + "epoch": 2.731043421905379, + "grad_norm": 1.0053508281707764, + "learning_rate": 1.0465492540282146e-07, + "loss": 0.0655, + "step": 16856 + }, + { + "epoch": 2.731205443940376, + "grad_norm": 0.9643328785896301, + "learning_rate": 1.0452975896376865e-07, + "loss": 0.0642, + "step": 16857 + }, + { + "epoch": 2.7313674659753726, + "grad_norm": 0.9741927981376648, + "learning_rate": 1.044046658203085e-07, + "loss": 0.0602, + "step": 16858 + }, + { + "epoch": 2.7315294880103695, + "grad_norm": 0.9322402477264404, + "learning_rate": 1.0427964597626822e-07, + "loss": 0.0567, + "step": 16859 + }, + { + "epoch": 2.731691510045366, + "grad_norm": 0.8425524234771729, + "learning_rate": 1.0415469943547335e-07, + "loss": 0.0608, + "step": 16860 + }, + { + "epoch": 2.731853532080363, + "grad_norm": 1.0030529499053955, + "learning_rate": 1.0402982620174696e-07, + "loss": 0.0638, + "step": 16861 + }, + { + "epoch": 2.7320155541153595, + "grad_norm": 0.8651495575904846, + "learning_rate": 1.0390502627890986e-07, + "loss": 0.0586, + "step": 16862 + }, + { + "epoch": 2.7321775761503564, + "grad_norm": 0.9395628571510315, + "learning_rate": 1.0378029967077985e-07, + "loss": 0.0558, + "step": 16863 + }, + { + "epoch": 2.7323395981853533, + "grad_norm": 0.8924857378005981, + "learning_rate": 1.0365564638117442e-07, + "loss": 0.0659, + "step": 16864 + }, + { + "epoch": 2.73250162022035, + "grad_norm": 0.860127329826355, + "learning_rate": 1.0353106641390693e-07, + "loss": 0.0651, + "step": 16865 + }, + { + "epoch": 2.7326636422553467, + "grad_norm": 0.846647322177887, + "learning_rate": 1.0340655977279012e-07, + "loss": 0.0592, + "step": 16866 + }, + { + "epoch": 2.7328256642903437, + "grad_norm": 0.871707022190094, + "learning_rate": 1.032821264616321e-07, + "loss": 0.0618, + "step": 16867 + }, + { + "epoch": 2.73298768632534, + "grad_norm": 0.89871746301651, + "learning_rate": 1.0315776648424119e-07, + "loss": 0.0655, + "step": 16868 + }, + { + "epoch": 2.733149708360337, + "grad_norm": 0.8250013589859009, + "learning_rate": 1.0303347984442297e-07, + "loss": 0.0585, + "step": 16869 + }, + { + "epoch": 2.733311730395334, + "grad_norm": 0.8901594281196594, + "learning_rate": 1.0290926654597938e-07, + "loss": 0.0568, + "step": 16870 + }, + { + "epoch": 2.7334737524303305, + "grad_norm": 0.9799373745918274, + "learning_rate": 1.0278512659271128e-07, + "loss": 0.0617, + "step": 16871 + }, + { + "epoch": 2.733635774465327, + "grad_norm": 0.9966387748718262, + "learning_rate": 1.0266105998841702e-07, + "loss": 0.058, + "step": 16872 + }, + { + "epoch": 2.733797796500324, + "grad_norm": 0.8289188742637634, + "learning_rate": 1.0253706673689328e-07, + "loss": 0.0542, + "step": 16873 + }, + { + "epoch": 2.733959818535321, + "grad_norm": 0.9346227645874023, + "learning_rate": 1.0241314684193343e-07, + "loss": 0.0611, + "step": 16874 + }, + { + "epoch": 2.7341218405703174, + "grad_norm": 0.9059098958969116, + "learning_rate": 1.0228930030732914e-07, + "loss": 0.0619, + "step": 16875 + }, + { + "epoch": 2.7342838626053143, + "grad_norm": 0.9077804684638977, + "learning_rate": 1.0216552713686989e-07, + "loss": 0.0595, + "step": 16876 + }, + { + "epoch": 2.7344458846403112, + "grad_norm": 0.9058060646057129, + "learning_rate": 1.0204182733434293e-07, + "loss": 0.0612, + "step": 16877 + }, + { + "epoch": 2.7346079066753077, + "grad_norm": 0.8321201205253601, + "learning_rate": 1.01918200903533e-07, + "loss": 0.0592, + "step": 16878 + }, + { + "epoch": 2.7347699287103047, + "grad_norm": 0.9400634765625, + "learning_rate": 1.017946478482229e-07, + "loss": 0.068, + "step": 16879 + }, + { + "epoch": 2.7349319507453016, + "grad_norm": 0.822931170463562, + "learning_rate": 1.0167116817219325e-07, + "loss": 0.0585, + "step": 16880 + }, + { + "epoch": 2.735093972780298, + "grad_norm": 1.0861873626708984, + "learning_rate": 1.0154776187922182e-07, + "loss": 0.068, + "step": 16881 + }, + { + "epoch": 2.735255994815295, + "grad_norm": 1.1060582399368286, + "learning_rate": 1.0142442897308453e-07, + "loss": 0.0665, + "step": 16882 + }, + { + "epoch": 2.7354180168502915, + "grad_norm": 0.9769384860992432, + "learning_rate": 1.0130116945755553e-07, + "loss": 0.0658, + "step": 16883 + }, + { + "epoch": 2.7355800388852884, + "grad_norm": 0.9157807230949402, + "learning_rate": 1.0117798333640627e-07, + "loss": 0.0583, + "step": 16884 + }, + { + "epoch": 2.735742060920285, + "grad_norm": 0.98633873462677, + "learning_rate": 1.0105487061340541e-07, + "loss": 0.0652, + "step": 16885 + }, + { + "epoch": 2.735904082955282, + "grad_norm": 0.9951062202453613, + "learning_rate": 1.0093183129231993e-07, + "loss": 0.0606, + "step": 16886 + }, + { + "epoch": 2.736066104990279, + "grad_norm": 0.8598393201828003, + "learning_rate": 1.0080886537691514e-07, + "loss": 0.0607, + "step": 16887 + }, + { + "epoch": 2.7362281270252753, + "grad_norm": 0.911261260509491, + "learning_rate": 1.0068597287095305e-07, + "loss": 0.0566, + "step": 16888 + }, + { + "epoch": 2.7363901490602722, + "grad_norm": 0.9654666185379028, + "learning_rate": 1.005631537781937e-07, + "loss": 0.0631, + "step": 16889 + }, + { + "epoch": 2.736552171095269, + "grad_norm": 1.0117939710617065, + "learning_rate": 1.0044040810239547e-07, + "loss": 0.063, + "step": 16890 + }, + { + "epoch": 2.7367141931302656, + "grad_norm": 0.8387407064437866, + "learning_rate": 1.0031773584731397e-07, + "loss": 0.0552, + "step": 16891 + }, + { + "epoch": 2.7368762151652626, + "grad_norm": 0.8349279165267944, + "learning_rate": 1.0019513701670285e-07, + "loss": 0.06, + "step": 16892 + }, + { + "epoch": 2.737038237200259, + "grad_norm": 0.9240767955780029, + "learning_rate": 1.0007261161431275e-07, + "loss": 0.0591, + "step": 16893 + }, + { + "epoch": 2.737200259235256, + "grad_norm": 0.9168483018875122, + "learning_rate": 9.995015964389315e-08, + "loss": 0.0643, + "step": 16894 + }, + { + "epoch": 2.7373622812702525, + "grad_norm": 1.0924078226089478, + "learning_rate": 9.982778110919106e-08, + "loss": 0.0587, + "step": 16895 + }, + { + "epoch": 2.7375243033052494, + "grad_norm": 0.9266462326049805, + "learning_rate": 9.970547601394986e-08, + "loss": 0.0645, + "step": 16896 + }, + { + "epoch": 2.7376863253402464, + "grad_norm": 1.2055165767669678, + "learning_rate": 9.958324436191297e-08, + "loss": 0.0617, + "step": 16897 + }, + { + "epoch": 2.737848347375243, + "grad_norm": 0.8799644708633423, + "learning_rate": 9.94610861568196e-08, + "loss": 0.0596, + "step": 16898 + }, + { + "epoch": 2.73801036941024, + "grad_norm": 0.9457053542137146, + "learning_rate": 9.933900140240843e-08, + "loss": 0.057, + "step": 16899 + }, + { + "epoch": 2.7381723914452367, + "grad_norm": 0.8883636593818665, + "learning_rate": 9.92169901024137e-08, + "loss": 0.0646, + "step": 16900 + }, + { + "epoch": 2.738334413480233, + "grad_norm": 0.9511093497276306, + "learning_rate": 9.90950522605691e-08, + "loss": 0.0616, + "step": 16901 + }, + { + "epoch": 2.73849643551523, + "grad_norm": 0.9124867916107178, + "learning_rate": 9.897318788060662e-08, + "loss": 0.0563, + "step": 16902 + }, + { + "epoch": 2.738658457550227, + "grad_norm": 1.006246566772461, + "learning_rate": 9.885139696625356e-08, + "loss": 0.0661, + "step": 16903 + }, + { + "epoch": 2.7388204795852236, + "grad_norm": 0.8624682426452637, + "learning_rate": 9.872967952123752e-08, + "loss": 0.0538, + "step": 16904 + }, + { + "epoch": 2.7389825016202205, + "grad_norm": 1.1840333938598633, + "learning_rate": 9.860803554928189e-08, + "loss": 0.0687, + "step": 16905 + }, + { + "epoch": 2.739144523655217, + "grad_norm": 0.8678603172302246, + "learning_rate": 9.848646505410953e-08, + "loss": 0.0524, + "step": 16906 + }, + { + "epoch": 2.739306545690214, + "grad_norm": 0.8984764218330383, + "learning_rate": 9.83649680394394e-08, + "loss": 0.0554, + "step": 16907 + }, + { + "epoch": 2.7394685677252104, + "grad_norm": 0.9465093016624451, + "learning_rate": 9.824354450898966e-08, + "loss": 0.0618, + "step": 16908 + }, + { + "epoch": 2.7396305897602073, + "grad_norm": 0.8658840656280518, + "learning_rate": 9.812219446647509e-08, + "loss": 0.0571, + "step": 16909 + }, + { + "epoch": 2.7397926117952043, + "grad_norm": 0.9028376340866089, + "learning_rate": 9.800091791560939e-08, + "loss": 0.0542, + "step": 16910 + }, + { + "epoch": 2.7399546338302008, + "grad_norm": 1.0068341493606567, + "learning_rate": 9.78797148601024e-08, + "loss": 0.0694, + "step": 16911 + }, + { + "epoch": 2.7401166558651977, + "grad_norm": 0.8635134696960449, + "learning_rate": 9.775858530366334e-08, + "loss": 0.06, + "step": 16912 + }, + { + "epoch": 2.7402786779001946, + "grad_norm": 0.8920632600784302, + "learning_rate": 9.763752924999842e-08, + "loss": 0.0559, + "step": 16913 + }, + { + "epoch": 2.740440699935191, + "grad_norm": 0.8138787746429443, + "learning_rate": 9.751654670281135e-08, + "loss": 0.0555, + "step": 16914 + }, + { + "epoch": 2.740602721970188, + "grad_norm": 0.9727160334587097, + "learning_rate": 9.739563766580362e-08, + "loss": 0.0632, + "step": 16915 + }, + { + "epoch": 2.7407647440051845, + "grad_norm": 0.830783486366272, + "learning_rate": 9.727480214267559e-08, + "loss": 0.0599, + "step": 16916 + }, + { + "epoch": 2.7409267660401815, + "grad_norm": 0.8595759272575378, + "learning_rate": 9.715404013712432e-08, + "loss": 0.0564, + "step": 16917 + }, + { + "epoch": 2.741088788075178, + "grad_norm": 1.019550085067749, + "learning_rate": 9.70333516528446e-08, + "loss": 0.0684, + "step": 16918 + }, + { + "epoch": 2.741250810110175, + "grad_norm": 1.0129154920578003, + "learning_rate": 9.691273669352908e-08, + "loss": 0.066, + "step": 16919 + }, + { + "epoch": 2.741412832145172, + "grad_norm": 1.0200157165527344, + "learning_rate": 9.679219526286837e-08, + "loss": 0.066, + "step": 16920 + }, + { + "epoch": 2.7415748541801683, + "grad_norm": 1.0694841146469116, + "learning_rate": 9.667172736455093e-08, + "loss": 0.0631, + "step": 16921 + }, + { + "epoch": 2.7417368762151653, + "grad_norm": 0.9911954402923584, + "learning_rate": 9.655133300226271e-08, + "loss": 0.0644, + "step": 16922 + }, + { + "epoch": 2.741898898250162, + "grad_norm": 1.1050474643707275, + "learning_rate": 9.643101217968743e-08, + "loss": 0.0572, + "step": 16923 + }, + { + "epoch": 2.7420609202851587, + "grad_norm": 0.8301832675933838, + "learning_rate": 9.631076490050684e-08, + "loss": 0.0577, + "step": 16924 + }, + { + "epoch": 2.7422229423201556, + "grad_norm": 1.2750616073608398, + "learning_rate": 9.619059116839968e-08, + "loss": 0.07, + "step": 16925 + }, + { + "epoch": 2.7423849643551526, + "grad_norm": 0.9567539691925049, + "learning_rate": 9.60704909870433e-08, + "loss": 0.0601, + "step": 16926 + }, + { + "epoch": 2.742546986390149, + "grad_norm": 1.1425048112869263, + "learning_rate": 9.59504643601128e-08, + "loss": 0.0629, + "step": 16927 + }, + { + "epoch": 2.742709008425146, + "grad_norm": 0.8750780820846558, + "learning_rate": 9.583051129128051e-08, + "loss": 0.0594, + "step": 16928 + }, + { + "epoch": 2.7428710304601425, + "grad_norm": 0.8770819306373596, + "learning_rate": 9.57106317842163e-08, + "loss": 0.0646, + "step": 16929 + }, + { + "epoch": 2.7430330524951394, + "grad_norm": 1.0766657590866089, + "learning_rate": 9.559082584258833e-08, + "loss": 0.0548, + "step": 16930 + }, + { + "epoch": 2.743195074530136, + "grad_norm": 0.9907754063606262, + "learning_rate": 9.547109347006312e-08, + "loss": 0.0658, + "step": 16931 + }, + { + "epoch": 2.743357096565133, + "grad_norm": 0.9115135669708252, + "learning_rate": 9.535143467030327e-08, + "loss": 0.0584, + "step": 16932 + }, + { + "epoch": 2.7435191186001298, + "grad_norm": 0.9572370052337646, + "learning_rate": 9.523184944697034e-08, + "loss": 0.0609, + "step": 16933 + }, + { + "epoch": 2.7436811406351262, + "grad_norm": 0.8873291015625, + "learning_rate": 9.511233780372303e-08, + "loss": 0.0609, + "step": 16934 + }, + { + "epoch": 2.743843162670123, + "grad_norm": 0.8288209438323975, + "learning_rate": 9.499289974421927e-08, + "loss": 0.0567, + "step": 16935 + }, + { + "epoch": 2.74400518470512, + "grad_norm": 0.8421540856361389, + "learning_rate": 9.487353527211223e-08, + "loss": 0.0597, + "step": 16936 + }, + { + "epoch": 2.7441672067401166, + "grad_norm": 0.884370744228363, + "learning_rate": 9.475424439105485e-08, + "loss": 0.0656, + "step": 16937 + }, + { + "epoch": 2.7443292287751135, + "grad_norm": 0.9282497763633728, + "learning_rate": 9.463502710469697e-08, + "loss": 0.0596, + "step": 16938 + }, + { + "epoch": 2.74449125081011, + "grad_norm": 0.9526280760765076, + "learning_rate": 9.45158834166865e-08, + "loss": 0.0618, + "step": 16939 + }, + { + "epoch": 2.744653272845107, + "grad_norm": 1.0486241579055786, + "learning_rate": 9.439681333066858e-08, + "loss": 0.0664, + "step": 16940 + }, + { + "epoch": 2.7448152948801035, + "grad_norm": 0.9274911284446716, + "learning_rate": 9.427781685028697e-08, + "loss": 0.0546, + "step": 16941 + }, + { + "epoch": 2.7449773169151004, + "grad_norm": 0.9596142768859863, + "learning_rate": 9.415889397918238e-08, + "loss": 0.0612, + "step": 16942 + }, + { + "epoch": 2.7451393389500973, + "grad_norm": 0.8194335699081421, + "learning_rate": 9.404004472099382e-08, + "loss": 0.0555, + "step": 16943 + }, + { + "epoch": 2.745301360985094, + "grad_norm": 0.9087086319923401, + "learning_rate": 9.3921269079357e-08, + "loss": 0.0567, + "step": 16944 + }, + { + "epoch": 2.7454633830200907, + "grad_norm": 0.8666279911994934, + "learning_rate": 9.380256705790708e-08, + "loss": 0.0553, + "step": 16945 + }, + { + "epoch": 2.7456254050550877, + "grad_norm": 0.9419772624969482, + "learning_rate": 9.368393866027614e-08, + "loss": 0.0666, + "step": 16946 + }, + { + "epoch": 2.745787427090084, + "grad_norm": 0.8581135869026184, + "learning_rate": 9.356538389009296e-08, + "loss": 0.0524, + "step": 16947 + }, + { + "epoch": 2.745949449125081, + "grad_norm": 0.846437394618988, + "learning_rate": 9.344690275098573e-08, + "loss": 0.0499, + "step": 16948 + }, + { + "epoch": 2.746111471160078, + "grad_norm": 0.8755781054496765, + "learning_rate": 9.332849524657961e-08, + "loss": 0.0586, + "step": 16949 + }, + { + "epoch": 2.7462734931950745, + "grad_norm": 0.9014527201652527, + "learning_rate": 9.321016138049727e-08, + "loss": 0.0562, + "step": 16950 + }, + { + "epoch": 2.7464355152300715, + "grad_norm": 0.9872837662696838, + "learning_rate": 9.309190115635996e-08, + "loss": 0.057, + "step": 16951 + }, + { + "epoch": 2.746597537265068, + "grad_norm": 0.9479248523712158, + "learning_rate": 9.297371457778565e-08, + "loss": 0.0581, + "step": 16952 + }, + { + "epoch": 2.746759559300065, + "grad_norm": 0.9341238737106323, + "learning_rate": 9.285560164839086e-08, + "loss": 0.0633, + "step": 16953 + }, + { + "epoch": 2.7469215813350614, + "grad_norm": 0.9351992011070251, + "learning_rate": 9.273756237178938e-08, + "loss": 0.0639, + "step": 16954 + }, + { + "epoch": 2.7470836033700583, + "grad_norm": 0.8418577313423157, + "learning_rate": 9.261959675159304e-08, + "loss": 0.0612, + "step": 16955 + }, + { + "epoch": 2.7472456254050552, + "grad_norm": 1.0418609380722046, + "learning_rate": 9.250170479141146e-08, + "loss": 0.0661, + "step": 16956 + }, + { + "epoch": 2.7474076474400517, + "grad_norm": 1.1158641576766968, + "learning_rate": 9.238388649485175e-08, + "loss": 0.065, + "step": 16957 + }, + { + "epoch": 2.7475696694750487, + "grad_norm": 0.9515754580497742, + "learning_rate": 9.226614186551852e-08, + "loss": 0.0578, + "step": 16958 + }, + { + "epoch": 2.7477316915100456, + "grad_norm": 0.8326787352561951, + "learning_rate": 9.214847090701474e-08, + "loss": 0.0537, + "step": 16959 + }, + { + "epoch": 2.747893713545042, + "grad_norm": 0.9199459552764893, + "learning_rate": 9.20308736229411e-08, + "loss": 0.0591, + "step": 16960 + }, + { + "epoch": 2.748055735580039, + "grad_norm": 0.9134218692779541, + "learning_rate": 9.19133500168956e-08, + "loss": 0.0604, + "step": 16961 + }, + { + "epoch": 2.7482177576150355, + "grad_norm": 0.8909247517585754, + "learning_rate": 9.179590009247397e-08, + "loss": 0.0587, + "step": 16962 + }, + { + "epoch": 2.7483797796500324, + "grad_norm": 0.8900055885314941, + "learning_rate": 9.167852385326969e-08, + "loss": 0.0579, + "step": 16963 + }, + { + "epoch": 2.748541801685029, + "grad_norm": 1.0126845836639404, + "learning_rate": 9.15612213028752e-08, + "loss": 0.0661, + "step": 16964 + }, + { + "epoch": 2.748703823720026, + "grad_norm": 0.9734671711921692, + "learning_rate": 9.144399244487873e-08, + "loss": 0.06, + "step": 16965 + }, + { + "epoch": 2.748865845755023, + "grad_norm": 0.8340158462524414, + "learning_rate": 9.132683728286767e-08, + "loss": 0.0552, + "step": 16966 + }, + { + "epoch": 2.7490278677900193, + "grad_norm": 0.81150883436203, + "learning_rate": 9.120975582042613e-08, + "loss": 0.057, + "step": 16967 + }, + { + "epoch": 2.749189889825016, + "grad_norm": 0.9296124577522278, + "learning_rate": 9.109274806113732e-08, + "loss": 0.0647, + "step": 16968 + }, + { + "epoch": 2.749351911860013, + "grad_norm": 0.8688836097717285, + "learning_rate": 9.097581400858064e-08, + "loss": 0.0588, + "step": 16969 + }, + { + "epoch": 2.7495139338950096, + "grad_norm": 1.00139319896698, + "learning_rate": 9.085895366633457e-08, + "loss": 0.0626, + "step": 16970 + }, + { + "epoch": 2.7496759559300066, + "grad_norm": 0.9233237504959106, + "learning_rate": 9.074216703797434e-08, + "loss": 0.0646, + "step": 16971 + }, + { + "epoch": 2.7498379779650035, + "grad_norm": 0.954022228717804, + "learning_rate": 9.062545412707375e-08, + "loss": 0.0637, + "step": 16972 + }, + { + "epoch": 2.75, + "grad_norm": 0.7831946611404419, + "learning_rate": 9.050881493720326e-08, + "loss": 0.0533, + "step": 16973 + }, + { + "epoch": 2.7501620220349965, + "grad_norm": 0.8610273599624634, + "learning_rate": 9.039224947193254e-08, + "loss": 0.0567, + "step": 16974 + }, + { + "epoch": 2.7503240440699934, + "grad_norm": 1.0053431987762451, + "learning_rate": 9.027575773482788e-08, + "loss": 0.063, + "step": 16975 + }, + { + "epoch": 2.7504860661049904, + "grad_norm": 1.0265311002731323, + "learning_rate": 9.01593397294534e-08, + "loss": 0.068, + "step": 16976 + }, + { + "epoch": 2.750648088139987, + "grad_norm": 1.0385494232177734, + "learning_rate": 9.004299545937151e-08, + "loss": 0.0615, + "step": 16977 + }, + { + "epoch": 2.750810110174984, + "grad_norm": 1.0085492134094238, + "learning_rate": 8.992672492814158e-08, + "loss": 0.0618, + "step": 16978 + }, + { + "epoch": 2.7509721322099807, + "grad_norm": 0.9866387248039246, + "learning_rate": 8.981052813932245e-08, + "loss": 0.0675, + "step": 16979 + }, + { + "epoch": 2.751134154244977, + "grad_norm": 1.0213267803192139, + "learning_rate": 8.969440509646821e-08, + "loss": 0.0586, + "step": 16980 + }, + { + "epoch": 2.751296176279974, + "grad_norm": 1.007440447807312, + "learning_rate": 8.957835580313212e-08, + "loss": 0.0656, + "step": 16981 + }, + { + "epoch": 2.751458198314971, + "grad_norm": 0.9071718454360962, + "learning_rate": 8.946238026286552e-08, + "loss": 0.0619, + "step": 16982 + }, + { + "epoch": 2.7516202203499676, + "grad_norm": 1.0188510417938232, + "learning_rate": 8.93464784792164e-08, + "loss": 0.0566, + "step": 16983 + }, + { + "epoch": 2.7517822423849645, + "grad_norm": 0.8976929187774658, + "learning_rate": 8.923065045573165e-08, + "loss": 0.0603, + "step": 16984 + }, + { + "epoch": 2.751944264419961, + "grad_norm": 0.7960435748100281, + "learning_rate": 8.911489619595482e-08, + "loss": 0.0537, + "step": 16985 + }, + { + "epoch": 2.752106286454958, + "grad_norm": 1.094245195388794, + "learning_rate": 8.899921570342807e-08, + "loss": 0.0682, + "step": 16986 + }, + { + "epoch": 2.7522683084899544, + "grad_norm": 0.8297873139381409, + "learning_rate": 8.888360898169079e-08, + "loss": 0.0515, + "step": 16987 + }, + { + "epoch": 2.7524303305249513, + "grad_norm": 0.8277057409286499, + "learning_rate": 8.876807603428017e-08, + "loss": 0.0569, + "step": 16988 + }, + { + "epoch": 2.7525923525599483, + "grad_norm": 0.8910994529724121, + "learning_rate": 8.865261686473143e-08, + "loss": 0.0529, + "step": 16989 + }, + { + "epoch": 2.7527543745949448, + "grad_norm": 0.798820436000824, + "learning_rate": 8.853723147657755e-08, + "loss": 0.0544, + "step": 16990 + }, + { + "epoch": 2.7529163966299417, + "grad_norm": 0.994156539440155, + "learning_rate": 8.842191987334853e-08, + "loss": 0.0661, + "step": 16991 + }, + { + "epoch": 2.7530784186649386, + "grad_norm": 0.8740126490592957, + "learning_rate": 8.830668205857263e-08, + "loss": 0.0599, + "step": 16992 + }, + { + "epoch": 2.753240440699935, + "grad_norm": 0.9140499830245972, + "learning_rate": 8.819151803577647e-08, + "loss": 0.0605, + "step": 16993 + }, + { + "epoch": 2.753402462734932, + "grad_norm": 0.9925417304039001, + "learning_rate": 8.807642780848335e-08, + "loss": 0.0623, + "step": 16994 + }, + { + "epoch": 2.7535644847699285, + "grad_norm": 0.8395823836326599, + "learning_rate": 8.796141138021464e-08, + "loss": 0.0575, + "step": 16995 + }, + { + "epoch": 2.7537265068049255, + "grad_norm": 1.012656331062317, + "learning_rate": 8.784646875448971e-08, + "loss": 0.0669, + "step": 16996 + }, + { + "epoch": 2.753888528839922, + "grad_norm": 0.8465946316719055, + "learning_rate": 8.77315999348255e-08, + "loss": 0.0572, + "step": 16997 + }, + { + "epoch": 2.754050550874919, + "grad_norm": 1.1134271621704102, + "learning_rate": 8.761680492473668e-08, + "loss": 0.058, + "step": 16998 + }, + { + "epoch": 2.754212572909916, + "grad_norm": 0.852190375328064, + "learning_rate": 8.7502083727736e-08, + "loss": 0.0576, + "step": 16999 + }, + { + "epoch": 2.7543745949449123, + "grad_norm": 0.9086182713508606, + "learning_rate": 8.738743634733316e-08, + "loss": 0.0631, + "step": 17000 + }, + { + "epoch": 2.7545366169799093, + "grad_norm": 0.9035629034042358, + "learning_rate": 8.727286278703672e-08, + "loss": 0.0589, + "step": 17001 + }, + { + "epoch": 2.754698639014906, + "grad_norm": 0.8812063932418823, + "learning_rate": 8.715836305035169e-08, + "loss": 0.0516, + "step": 17002 + }, + { + "epoch": 2.7548606610499027, + "grad_norm": 0.9071505069732666, + "learning_rate": 8.704393714078191e-08, + "loss": 0.0585, + "step": 17003 + }, + { + "epoch": 2.7550226830848996, + "grad_norm": 0.9533473253250122, + "learning_rate": 8.692958506182847e-08, + "loss": 0.0562, + "step": 17004 + }, + { + "epoch": 2.7551847051198965, + "grad_norm": 0.8007147312164307, + "learning_rate": 8.681530681699024e-08, + "loss": 0.0611, + "step": 17005 + }, + { + "epoch": 2.755346727154893, + "grad_norm": 0.9169260263442993, + "learning_rate": 8.67011024097636e-08, + "loss": 0.065, + "step": 17006 + }, + { + "epoch": 2.75550874918989, + "grad_norm": 0.8631471395492554, + "learning_rate": 8.658697184364323e-08, + "loss": 0.0504, + "step": 17007 + }, + { + "epoch": 2.7556707712248865, + "grad_norm": 0.9157608151435852, + "learning_rate": 8.647291512212136e-08, + "loss": 0.0572, + "step": 17008 + }, + { + "epoch": 2.7558327932598834, + "grad_norm": 0.9917179942131042, + "learning_rate": 8.635893224868769e-08, + "loss": 0.064, + "step": 17009 + }, + { + "epoch": 2.75599481529488, + "grad_norm": 0.9392203092575073, + "learning_rate": 8.624502322682942e-08, + "loss": 0.0655, + "step": 17010 + }, + { + "epoch": 2.756156837329877, + "grad_norm": 0.8594088554382324, + "learning_rate": 8.61311880600324e-08, + "loss": 0.0567, + "step": 17011 + }, + { + "epoch": 2.7563188593648738, + "grad_norm": 0.7704145312309265, + "learning_rate": 8.601742675177993e-08, + "loss": 0.0494, + "step": 17012 + }, + { + "epoch": 2.7564808813998702, + "grad_norm": 0.9152031540870667, + "learning_rate": 8.590373930555201e-08, + "loss": 0.0579, + "step": 17013 + }, + { + "epoch": 2.756642903434867, + "grad_norm": 0.8692240715026855, + "learning_rate": 8.57901257248278e-08, + "loss": 0.0578, + "step": 17014 + }, + { + "epoch": 2.756804925469864, + "grad_norm": 0.8714682459831238, + "learning_rate": 8.567658601308371e-08, + "loss": 0.0523, + "step": 17015 + }, + { + "epoch": 2.7569669475048606, + "grad_norm": 0.9357670545578003, + "learning_rate": 8.556312017379332e-08, + "loss": 0.0581, + "step": 17016 + }, + { + "epoch": 2.7571289695398575, + "grad_norm": 0.9461519718170166, + "learning_rate": 8.544972821042857e-08, + "loss": 0.0652, + "step": 17017 + }, + { + "epoch": 2.757290991574854, + "grad_norm": 0.7985438108444214, + "learning_rate": 8.533641012645921e-08, + "loss": 0.0567, + "step": 17018 + }, + { + "epoch": 2.757453013609851, + "grad_norm": 0.8582321405410767, + "learning_rate": 8.522316592535246e-08, + "loss": 0.0545, + "step": 17019 + }, + { + "epoch": 2.7576150356448474, + "grad_norm": 0.9717732071876526, + "learning_rate": 8.510999561057276e-08, + "loss": 0.0655, + "step": 17020 + }, + { + "epoch": 2.7577770576798444, + "grad_norm": 0.8554613590240479, + "learning_rate": 8.499689918558318e-08, + "loss": 0.0555, + "step": 17021 + }, + { + "epoch": 2.7579390797148413, + "grad_norm": 0.9778949618339539, + "learning_rate": 8.488387665384457e-08, + "loss": 0.0646, + "step": 17022 + }, + { + "epoch": 2.758101101749838, + "grad_norm": 1.0577203035354614, + "learning_rate": 8.477092801881525e-08, + "loss": 0.0594, + "step": 17023 + }, + { + "epoch": 2.7582631237848347, + "grad_norm": 0.9223877787590027, + "learning_rate": 8.465805328395055e-08, + "loss": 0.0574, + "step": 17024 + }, + { + "epoch": 2.7584251458198317, + "grad_norm": 0.7924177050590515, + "learning_rate": 8.454525245270378e-08, + "loss": 0.0518, + "step": 17025 + }, + { + "epoch": 2.758587167854828, + "grad_norm": 0.8869810700416565, + "learning_rate": 8.443252552852776e-08, + "loss": 0.0636, + "step": 17026 + }, + { + "epoch": 2.758749189889825, + "grad_norm": 0.927229106426239, + "learning_rate": 8.431987251487083e-08, + "loss": 0.0608, + "step": 17027 + }, + { + "epoch": 2.758911211924822, + "grad_norm": 1.047018051147461, + "learning_rate": 8.42072934151797e-08, + "loss": 0.0618, + "step": 17028 + }, + { + "epoch": 2.7590732339598185, + "grad_norm": 0.8683516979217529, + "learning_rate": 8.409478823289934e-08, + "loss": 0.0614, + "step": 17029 + }, + { + "epoch": 2.7592352559948155, + "grad_norm": 0.9671280384063721, + "learning_rate": 8.398235697147205e-08, + "loss": 0.0533, + "step": 17030 + }, + { + "epoch": 2.759397278029812, + "grad_norm": 0.8958953619003296, + "learning_rate": 8.386999963433812e-08, + "loss": 0.0573, + "step": 17031 + }, + { + "epoch": 2.759559300064809, + "grad_norm": 0.8084527850151062, + "learning_rate": 8.375771622493506e-08, + "loss": 0.0498, + "step": 17032 + }, + { + "epoch": 2.7597213220998054, + "grad_norm": 0.838610827922821, + "learning_rate": 8.364550674669875e-08, + "loss": 0.0635, + "step": 17033 + }, + { + "epoch": 2.7598833441348023, + "grad_norm": 0.9222990274429321, + "learning_rate": 8.353337120306282e-08, + "loss": 0.0647, + "step": 17034 + }, + { + "epoch": 2.7600453661697992, + "grad_norm": 1.0761842727661133, + "learning_rate": 8.342130959745731e-08, + "loss": 0.0665, + "step": 17035 + }, + { + "epoch": 2.7602073882047957, + "grad_norm": 0.8044500350952148, + "learning_rate": 8.330932193331226e-08, + "loss": 0.0568, + "step": 17036 + }, + { + "epoch": 2.7603694102397927, + "grad_norm": 0.8677234649658203, + "learning_rate": 8.319740821405354e-08, + "loss": 0.0588, + "step": 17037 + }, + { + "epoch": 2.7605314322747896, + "grad_norm": 1.054807424545288, + "learning_rate": 8.308556844310589e-08, + "loss": 0.0701, + "step": 17038 + }, + { + "epoch": 2.760693454309786, + "grad_norm": 1.1043615341186523, + "learning_rate": 8.297380262389077e-08, + "loss": 0.0624, + "step": 17039 + }, + { + "epoch": 2.760855476344783, + "grad_norm": 0.7604734301567078, + "learning_rate": 8.286211075982764e-08, + "loss": 0.0498, + "step": 17040 + }, + { + "epoch": 2.7610174983797795, + "grad_norm": 0.8906594514846802, + "learning_rate": 8.275049285433545e-08, + "loss": 0.0596, + "step": 17041 + }, + { + "epoch": 2.7611795204147764, + "grad_norm": 0.9391190409660339, + "learning_rate": 8.263894891082813e-08, + "loss": 0.0644, + "step": 17042 + }, + { + "epoch": 2.761341542449773, + "grad_norm": 0.9467473030090332, + "learning_rate": 8.252747893271906e-08, + "loss": 0.0614, + "step": 17043 + }, + { + "epoch": 2.76150356448477, + "grad_norm": 0.8915271162986755, + "learning_rate": 8.241608292341913e-08, + "loss": 0.0602, + "step": 17044 + }, + { + "epoch": 2.761665586519767, + "grad_norm": 0.9345802068710327, + "learning_rate": 8.230476088633644e-08, + "loss": 0.0603, + "step": 17045 + }, + { + "epoch": 2.7618276085547633, + "grad_norm": 0.8942722082138062, + "learning_rate": 8.219351282487742e-08, + "loss": 0.0602, + "step": 17046 + }, + { + "epoch": 2.76198963058976, + "grad_norm": 0.9697921276092529, + "learning_rate": 8.208233874244575e-08, + "loss": 0.0628, + "step": 17047 + }, + { + "epoch": 2.762151652624757, + "grad_norm": 0.877307116985321, + "learning_rate": 8.197123864244344e-08, + "loss": 0.0578, + "step": 17048 + }, + { + "epoch": 2.7623136746597536, + "grad_norm": 0.9268743395805359, + "learning_rate": 8.18602125282697e-08, + "loss": 0.0567, + "step": 17049 + }, + { + "epoch": 2.7624756966947506, + "grad_norm": 0.8506268262863159, + "learning_rate": 8.174926040332182e-08, + "loss": 0.0533, + "step": 17050 + }, + { + "epoch": 2.7626377187297475, + "grad_norm": 0.9653171896934509, + "learning_rate": 8.16383822709943e-08, + "loss": 0.0601, + "step": 17051 + }, + { + "epoch": 2.762799740764744, + "grad_norm": 0.9039610028266907, + "learning_rate": 8.152757813468027e-08, + "loss": 0.0585, + "step": 17052 + }, + { + "epoch": 2.762961762799741, + "grad_norm": 0.8565463423728943, + "learning_rate": 8.14168479977695e-08, + "loss": 0.0521, + "step": 17053 + }, + { + "epoch": 2.7631237848347374, + "grad_norm": 0.8485569357872009, + "learning_rate": 8.130619186365012e-08, + "loss": 0.0554, + "step": 17054 + }, + { + "epoch": 2.7632858068697344, + "grad_norm": 0.872444212436676, + "learning_rate": 8.119560973570834e-08, + "loss": 0.0548, + "step": 17055 + }, + { + "epoch": 2.763447828904731, + "grad_norm": 0.9443418979644775, + "learning_rate": 8.10851016173278e-08, + "loss": 0.0645, + "step": 17056 + }, + { + "epoch": 2.7636098509397278, + "grad_norm": 0.9589036703109741, + "learning_rate": 8.097466751188915e-08, + "loss": 0.064, + "step": 17057 + }, + { + "epoch": 2.7637718729747247, + "grad_norm": 0.9315710663795471, + "learning_rate": 8.086430742277191e-08, + "loss": 0.0634, + "step": 17058 + }, + { + "epoch": 2.763933895009721, + "grad_norm": 1.0759693384170532, + "learning_rate": 8.075402135335253e-08, + "loss": 0.0581, + "step": 17059 + }, + { + "epoch": 2.764095917044718, + "grad_norm": 0.9197863340377808, + "learning_rate": 8.064380930700556e-08, + "loss": 0.0607, + "step": 17060 + }, + { + "epoch": 2.764257939079715, + "grad_norm": 0.8800620436668396, + "learning_rate": 8.053367128710355e-08, + "loss": 0.0556, + "step": 17061 + }, + { + "epoch": 2.7644199611147116, + "grad_norm": 1.0651123523712158, + "learning_rate": 8.042360729701604e-08, + "loss": 0.0606, + "step": 17062 + }, + { + "epoch": 2.7645819831497085, + "grad_norm": 0.8733367323875427, + "learning_rate": 8.031361734011118e-08, + "loss": 0.0663, + "step": 17063 + }, + { + "epoch": 2.764744005184705, + "grad_norm": 1.1324882507324219, + "learning_rate": 8.020370141975347e-08, + "loss": 0.0627, + "step": 17064 + }, + { + "epoch": 2.764906027219702, + "grad_norm": 0.9091924428939819, + "learning_rate": 8.009385953930721e-08, + "loss": 0.0635, + "step": 17065 + }, + { + "epoch": 2.7650680492546984, + "grad_norm": 0.9367949962615967, + "learning_rate": 7.998409170213245e-08, + "loss": 0.0591, + "step": 17066 + }, + { + "epoch": 2.7652300712896953, + "grad_norm": 0.7516220211982727, + "learning_rate": 7.987439791158874e-08, + "loss": 0.0515, + "step": 17067 + }, + { + "epoch": 2.7653920933246923, + "grad_norm": 1.0401772260665894, + "learning_rate": 7.976477817103117e-08, + "loss": 0.0612, + "step": 17068 + }, + { + "epoch": 2.7655541153596888, + "grad_norm": 0.9251458644866943, + "learning_rate": 7.965523248381485e-08, + "loss": 0.0598, + "step": 17069 + }, + { + "epoch": 2.7657161373946857, + "grad_norm": 1.0257396697998047, + "learning_rate": 7.954576085329152e-08, + "loss": 0.0613, + "step": 17070 + }, + { + "epoch": 2.7658781594296826, + "grad_norm": 1.0854121446609497, + "learning_rate": 7.943636328281018e-08, + "loss": 0.0627, + "step": 17071 + }, + { + "epoch": 2.766040181464679, + "grad_norm": 0.9566676616668701, + "learning_rate": 7.93270397757187e-08, + "loss": 0.0564, + "step": 17072 + }, + { + "epoch": 2.766202203499676, + "grad_norm": 0.9054695963859558, + "learning_rate": 7.921779033536137e-08, + "loss": 0.0537, + "step": 17073 + }, + { + "epoch": 2.766364225534673, + "grad_norm": 1.10857093334198, + "learning_rate": 7.910861496508216e-08, + "loss": 0.0631, + "step": 17074 + }, + { + "epoch": 2.7665262475696695, + "grad_norm": 1.0174477100372314, + "learning_rate": 7.899951366822061e-08, + "loss": 0.0657, + "step": 17075 + }, + { + "epoch": 2.766688269604666, + "grad_norm": 0.9663045406341553, + "learning_rate": 7.88904864481152e-08, + "loss": 0.061, + "step": 17076 + }, + { + "epoch": 2.766850291639663, + "grad_norm": 0.9540015459060669, + "learning_rate": 7.878153330810184e-08, + "loss": 0.0563, + "step": 17077 + }, + { + "epoch": 2.76701231367466, + "grad_norm": 0.9780965447425842, + "learning_rate": 7.867265425151454e-08, + "loss": 0.0641, + "step": 17078 + }, + { + "epoch": 2.7671743357096563, + "grad_norm": 0.9400720596313477, + "learning_rate": 7.856384928168426e-08, + "loss": 0.0663, + "step": 17079 + }, + { + "epoch": 2.7673363577446533, + "grad_norm": 1.059554934501648, + "learning_rate": 7.845511840194081e-08, + "loss": 0.0611, + "step": 17080 + }, + { + "epoch": 2.76749837977965, + "grad_norm": 0.911750078201294, + "learning_rate": 7.834646161561044e-08, + "loss": 0.0582, + "step": 17081 + }, + { + "epoch": 2.7676604018146467, + "grad_norm": 1.0574133396148682, + "learning_rate": 7.823787892601825e-08, + "loss": 0.0648, + "step": 17082 + }, + { + "epoch": 2.7678224238496436, + "grad_norm": 0.8240780234336853, + "learning_rate": 7.812937033648604e-08, + "loss": 0.0583, + "step": 17083 + }, + { + "epoch": 2.7679844458846405, + "grad_norm": 1.0235298871994019, + "learning_rate": 7.802093585033449e-08, + "loss": 0.0694, + "step": 17084 + }, + { + "epoch": 2.768146467919637, + "grad_norm": 1.218329668045044, + "learning_rate": 7.79125754708815e-08, + "loss": 0.0731, + "step": 17085 + }, + { + "epoch": 2.768308489954634, + "grad_norm": 0.8903065919876099, + "learning_rate": 7.780428920144217e-08, + "loss": 0.0582, + "step": 17086 + }, + { + "epoch": 2.7684705119896305, + "grad_norm": 0.8183029294013977, + "learning_rate": 7.769607704532972e-08, + "loss": 0.0587, + "step": 17087 + }, + { + "epoch": 2.7686325340246274, + "grad_norm": 0.8426903486251831, + "learning_rate": 7.758793900585565e-08, + "loss": 0.0606, + "step": 17088 + }, + { + "epoch": 2.768794556059624, + "grad_norm": 1.1522109508514404, + "learning_rate": 7.747987508632871e-08, + "loss": 0.0576, + "step": 17089 + }, + { + "epoch": 2.768956578094621, + "grad_norm": 0.7906349897384644, + "learning_rate": 7.737188529005484e-08, + "loss": 0.0545, + "step": 17090 + }, + { + "epoch": 2.7691186001296177, + "grad_norm": 0.8786861896514893, + "learning_rate": 7.726396962033894e-08, + "loss": 0.0612, + "step": 17091 + }, + { + "epoch": 2.7692806221646142, + "grad_norm": 1.079187273979187, + "learning_rate": 7.715612808048251e-08, + "loss": 0.0612, + "step": 17092 + }, + { + "epoch": 2.769442644199611, + "grad_norm": 1.0963796377182007, + "learning_rate": 7.70483606737854e-08, + "loss": 0.0613, + "step": 17093 + }, + { + "epoch": 2.769604666234608, + "grad_norm": 0.9393141269683838, + "learning_rate": 7.6940667403545e-08, + "loss": 0.0641, + "step": 17094 + }, + { + "epoch": 2.7697666882696046, + "grad_norm": 0.9409939050674438, + "learning_rate": 7.683304827305644e-08, + "loss": 0.0603, + "step": 17095 + }, + { + "epoch": 2.7699287103046015, + "grad_norm": 0.8602232933044434, + "learning_rate": 7.672550328561318e-08, + "loss": 0.0574, + "step": 17096 + }, + { + "epoch": 2.7700907323395985, + "grad_norm": 1.0245064496994019, + "learning_rate": 7.661803244450455e-08, + "loss": 0.0628, + "step": 17097 + }, + { + "epoch": 2.770252754374595, + "grad_norm": 0.8060587048530579, + "learning_rate": 7.651063575301986e-08, + "loss": 0.0561, + "step": 17098 + }, + { + "epoch": 2.7704147764095914, + "grad_norm": 1.021560549736023, + "learning_rate": 7.64033132144451e-08, + "loss": 0.0612, + "step": 17099 + }, + { + "epoch": 2.7705767984445884, + "grad_norm": 1.0078121423721313, + "learning_rate": 7.62960648320643e-08, + "loss": 0.0581, + "step": 17100 + }, + { + "epoch": 2.7707388204795853, + "grad_norm": 0.9183746576309204, + "learning_rate": 7.618889060915819e-08, + "loss": 0.0544, + "step": 17101 + }, + { + "epoch": 2.770900842514582, + "grad_norm": 0.9616214632987976, + "learning_rate": 7.608179054900634e-08, + "loss": 0.0645, + "step": 17102 + }, + { + "epoch": 2.7710628645495787, + "grad_norm": 1.02703058719635, + "learning_rate": 7.597476465488668e-08, + "loss": 0.0657, + "step": 17103 + }, + { + "epoch": 2.7712248865845757, + "grad_norm": 0.9865126609802246, + "learning_rate": 7.586781293007273e-08, + "loss": 0.0653, + "step": 17104 + }, + { + "epoch": 2.771386908619572, + "grad_norm": 0.8494060039520264, + "learning_rate": 7.57609353778374e-08, + "loss": 0.0586, + "step": 17105 + }, + { + "epoch": 2.771548930654569, + "grad_norm": 0.9772338271141052, + "learning_rate": 7.565413200145089e-08, + "loss": 0.0611, + "step": 17106 + }, + { + "epoch": 2.771710952689566, + "grad_norm": 1.0353899002075195, + "learning_rate": 7.55474028041811e-08, + "loss": 0.0631, + "step": 17107 + }, + { + "epoch": 2.7718729747245625, + "grad_norm": 0.9081644415855408, + "learning_rate": 7.544074778929378e-08, + "loss": 0.0616, + "step": 17108 + }, + { + "epoch": 2.7720349967595594, + "grad_norm": 1.0636510848999023, + "learning_rate": 7.533416696005242e-08, + "loss": 0.0683, + "step": 17109 + }, + { + "epoch": 2.772197018794556, + "grad_norm": 0.981203556060791, + "learning_rate": 7.522766031971774e-08, + "loss": 0.0553, + "step": 17110 + }, + { + "epoch": 2.772359040829553, + "grad_norm": 0.9812530279159546, + "learning_rate": 7.512122787154908e-08, + "loss": 0.0583, + "step": 17111 + }, + { + "epoch": 2.7725210628645494, + "grad_norm": 0.9434771537780762, + "learning_rate": 7.501486961880245e-08, + "loss": 0.0635, + "step": 17112 + }, + { + "epoch": 2.7726830848995463, + "grad_norm": 0.9104921817779541, + "learning_rate": 7.490858556473246e-08, + "loss": 0.056, + "step": 17113 + }, + { + "epoch": 2.7728451069345432, + "grad_norm": 0.996190071105957, + "learning_rate": 7.480237571259153e-08, + "loss": 0.0633, + "step": 17114 + }, + { + "epoch": 2.7730071289695397, + "grad_norm": 0.8751932978630066, + "learning_rate": 7.469624006562898e-08, + "loss": 0.0574, + "step": 17115 + }, + { + "epoch": 2.7731691510045366, + "grad_norm": 0.8272261023521423, + "learning_rate": 7.459017862709194e-08, + "loss": 0.0515, + "step": 17116 + }, + { + "epoch": 2.7733311730395336, + "grad_norm": 0.916673481464386, + "learning_rate": 7.448419140022616e-08, + "loss": 0.0609, + "step": 17117 + }, + { + "epoch": 2.77349319507453, + "grad_norm": 1.0777217149734497, + "learning_rate": 7.437827838827488e-08, + "loss": 0.0615, + "step": 17118 + }, + { + "epoch": 2.773655217109527, + "grad_norm": 1.0199631452560425, + "learning_rate": 7.4272439594478e-08, + "loss": 0.0547, + "step": 17119 + }, + { + "epoch": 2.7738172391445235, + "grad_norm": 0.8210328817367554, + "learning_rate": 7.416667502207458e-08, + "loss": 0.0541, + "step": 17120 + }, + { + "epoch": 2.7739792611795204, + "grad_norm": 0.8746510148048401, + "learning_rate": 7.40609846743004e-08, + "loss": 0.0565, + "step": 17121 + }, + { + "epoch": 2.774141283214517, + "grad_norm": 0.8559312224388123, + "learning_rate": 7.395536855438923e-08, + "loss": 0.0582, + "step": 17122 + }, + { + "epoch": 2.774303305249514, + "grad_norm": 1.033912181854248, + "learning_rate": 7.384982666557322e-08, + "loss": 0.0567, + "step": 17123 + }, + { + "epoch": 2.774465327284511, + "grad_norm": 0.8947577476501465, + "learning_rate": 7.37443590110809e-08, + "loss": 0.0601, + "step": 17124 + }, + { + "epoch": 2.7746273493195073, + "grad_norm": 0.901314377784729, + "learning_rate": 7.363896559414024e-08, + "loss": 0.0593, + "step": 17125 + }, + { + "epoch": 2.774789371354504, + "grad_norm": 0.8654842972755432, + "learning_rate": 7.353364641797533e-08, + "loss": 0.0636, + "step": 17126 + }, + { + "epoch": 2.774951393389501, + "grad_norm": 0.9468207955360413, + "learning_rate": 7.342840148580888e-08, + "loss": 0.0626, + "step": 17127 + }, + { + "epoch": 2.7751134154244976, + "grad_norm": 0.8542966246604919, + "learning_rate": 7.332323080086106e-08, + "loss": 0.0542, + "step": 17128 + }, + { + "epoch": 2.7752754374594946, + "grad_norm": 0.9125382900238037, + "learning_rate": 7.321813436635044e-08, + "loss": 0.0621, + "step": 17129 + }, + { + "epoch": 2.7754374594944915, + "grad_norm": 0.9692714810371399, + "learning_rate": 7.311311218549166e-08, + "loss": 0.0604, + "step": 17130 + }, + { + "epoch": 2.775599481529488, + "grad_norm": 0.8632164597511292, + "learning_rate": 7.300816426149854e-08, + "loss": 0.0619, + "step": 17131 + }, + { + "epoch": 2.775761503564485, + "grad_norm": 0.9177026748657227, + "learning_rate": 7.290329059758294e-08, + "loss": 0.0648, + "step": 17132 + }, + { + "epoch": 2.7759235255994814, + "grad_norm": 0.9254752993583679, + "learning_rate": 7.279849119695314e-08, + "loss": 0.0649, + "step": 17133 + }, + { + "epoch": 2.7760855476344783, + "grad_norm": 1.0560814142227173, + "learning_rate": 7.269376606281547e-08, + "loss": 0.0711, + "step": 17134 + }, + { + "epoch": 2.776247569669475, + "grad_norm": 0.8399720788002014, + "learning_rate": 7.258911519837486e-08, + "loss": 0.0619, + "step": 17135 + }, + { + "epoch": 2.7764095917044718, + "grad_norm": 0.8254349827766418, + "learning_rate": 7.248453860683291e-08, + "loss": 0.0566, + "step": 17136 + }, + { + "epoch": 2.7765716137394687, + "grad_norm": 0.8527083992958069, + "learning_rate": 7.238003629138957e-08, + "loss": 0.0586, + "step": 17137 + }, + { + "epoch": 2.776733635774465, + "grad_norm": 1.0240825414657593, + "learning_rate": 7.227560825524255e-08, + "loss": 0.0664, + "step": 17138 + }, + { + "epoch": 2.776895657809462, + "grad_norm": 1.000952124595642, + "learning_rate": 7.21712545015868e-08, + "loss": 0.0619, + "step": 17139 + }, + { + "epoch": 2.777057679844459, + "grad_norm": 0.9190289378166199, + "learning_rate": 7.20669750336156e-08, + "loss": 0.064, + "step": 17140 + }, + { + "epoch": 2.7772197018794555, + "grad_norm": 0.967212975025177, + "learning_rate": 7.196276985451916e-08, + "loss": 0.0608, + "step": 17141 + }, + { + "epoch": 2.7773817239144525, + "grad_norm": 0.7784137725830078, + "learning_rate": 7.185863896748662e-08, + "loss": 0.0556, + "step": 17142 + }, + { + "epoch": 2.777543745949449, + "grad_norm": 0.9892643094062805, + "learning_rate": 7.175458237570349e-08, + "loss": 0.0673, + "step": 17143 + }, + { + "epoch": 2.777705767984446, + "grad_norm": 0.872578501701355, + "learning_rate": 7.165060008235414e-08, + "loss": 0.0567, + "step": 17144 + }, + { + "epoch": 2.7778677900194424, + "grad_norm": 0.8793370723724365, + "learning_rate": 7.154669209061965e-08, + "loss": 0.0538, + "step": 17145 + }, + { + "epoch": 2.7780298120544393, + "grad_norm": 1.0323148965835571, + "learning_rate": 7.14428584036797e-08, + "loss": 0.058, + "step": 17146 + }, + { + "epoch": 2.7781918340894363, + "grad_norm": 0.8175366520881653, + "learning_rate": 7.133909902471147e-08, + "loss": 0.0573, + "step": 17147 + }, + { + "epoch": 2.7783538561244328, + "grad_norm": 0.9729697108268738, + "learning_rate": 7.123541395688966e-08, + "loss": 0.0602, + "step": 17148 + }, + { + "epoch": 2.7785158781594297, + "grad_norm": 0.9307907223701477, + "learning_rate": 7.113180320338642e-08, + "loss": 0.0591, + "step": 17149 + }, + { + "epoch": 2.7786779001944266, + "grad_norm": 0.8893921375274658, + "learning_rate": 7.102826676737202e-08, + "loss": 0.0552, + "step": 17150 + }, + { + "epoch": 2.778839922229423, + "grad_norm": 0.8809131979942322, + "learning_rate": 7.0924804652015e-08, + "loss": 0.0638, + "step": 17151 + }, + { + "epoch": 2.77900194426442, + "grad_norm": 0.9402658939361572, + "learning_rate": 7.082141686048066e-08, + "loss": 0.0621, + "step": 17152 + }, + { + "epoch": 2.779163966299417, + "grad_norm": 1.108227014541626, + "learning_rate": 7.071810339593254e-08, + "loss": 0.0662, + "step": 17153 + }, + { + "epoch": 2.7793259883344135, + "grad_norm": 0.9297798871994019, + "learning_rate": 7.061486426153146e-08, + "loss": 0.0619, + "step": 17154 + }, + { + "epoch": 2.7794880103694104, + "grad_norm": 0.8598953485488892, + "learning_rate": 7.051169946043685e-08, + "loss": 0.065, + "step": 17155 + }, + { + "epoch": 2.779650032404407, + "grad_norm": 1.2274415493011475, + "learning_rate": 7.040860899580475e-08, + "loss": 0.0584, + "step": 17156 + }, + { + "epoch": 2.779812054439404, + "grad_norm": 0.8859155774116516, + "learning_rate": 7.030559287078992e-08, + "loss": 0.0601, + "step": 17157 + }, + { + "epoch": 2.7799740764744003, + "grad_norm": 0.8546262979507446, + "learning_rate": 7.020265108854423e-08, + "loss": 0.0584, + "step": 17158 + }, + { + "epoch": 2.7801360985093972, + "grad_norm": 0.8823912739753723, + "learning_rate": 7.009978365221687e-08, + "loss": 0.0592, + "step": 17159 + }, + { + "epoch": 2.780298120544394, + "grad_norm": 0.9547003507614136, + "learning_rate": 6.99969905649564e-08, + "loss": 0.0606, + "step": 17160 + }, + { + "epoch": 2.7804601425793907, + "grad_norm": 0.7352187633514404, + "learning_rate": 6.989427182990727e-08, + "loss": 0.0517, + "step": 17161 + }, + { + "epoch": 2.7806221646143876, + "grad_norm": 0.7668613791465759, + "learning_rate": 6.979162745021306e-08, + "loss": 0.0512, + "step": 17162 + }, + { + "epoch": 2.7807841866493845, + "grad_norm": 0.9652883410453796, + "learning_rate": 6.968905742901405e-08, + "loss": 0.0578, + "step": 17163 + }, + { + "epoch": 2.780946208684381, + "grad_norm": 0.8641253709793091, + "learning_rate": 6.958656176944801e-08, + "loss": 0.0599, + "step": 17164 + }, + { + "epoch": 2.781108230719378, + "grad_norm": 0.9378582835197449, + "learning_rate": 6.94841404746524e-08, + "loss": 0.0599, + "step": 17165 + }, + { + "epoch": 2.7812702527543745, + "grad_norm": 0.902350664138794, + "learning_rate": 6.938179354776003e-08, + "loss": 0.0588, + "step": 17166 + }, + { + "epoch": 2.7814322747893714, + "grad_norm": 0.9747065305709839, + "learning_rate": 6.927952099190282e-08, + "loss": 0.0606, + "step": 17167 + }, + { + "epoch": 2.781594296824368, + "grad_norm": 0.9471100568771362, + "learning_rate": 6.917732281020995e-08, + "loss": 0.0641, + "step": 17168 + }, + { + "epoch": 2.781756318859365, + "grad_norm": 0.9175786972045898, + "learning_rate": 6.907519900580862e-08, + "loss": 0.0576, + "step": 17169 + }, + { + "epoch": 2.7819183408943617, + "grad_norm": 0.8128597736358643, + "learning_rate": 6.897314958182327e-08, + "loss": 0.0569, + "step": 17170 + }, + { + "epoch": 2.7820803629293582, + "grad_norm": 1.0002559423446655, + "learning_rate": 6.887117454137698e-08, + "loss": 0.0681, + "step": 17171 + }, + { + "epoch": 2.782242384964355, + "grad_norm": 0.9145398736000061, + "learning_rate": 6.87692738875892e-08, + "loss": 0.0587, + "step": 17172 + }, + { + "epoch": 2.782404406999352, + "grad_norm": 0.9303719997406006, + "learning_rate": 6.866744762357852e-08, + "loss": 0.06, + "step": 17173 + }, + { + "epoch": 2.7825664290343486, + "grad_norm": 0.9759852886199951, + "learning_rate": 6.856569575245969e-08, + "loss": 0.062, + "step": 17174 + }, + { + "epoch": 2.7827284510693455, + "grad_norm": 0.9545977711677551, + "learning_rate": 6.846401827734689e-08, + "loss": 0.0615, + "step": 17175 + }, + { + "epoch": 2.7828904731043425, + "grad_norm": 0.8938077092170715, + "learning_rate": 6.836241520135123e-08, + "loss": 0.0616, + "step": 17176 + }, + { + "epoch": 2.783052495139339, + "grad_norm": 1.1049113273620605, + "learning_rate": 6.826088652758106e-08, + "loss": 0.0655, + "step": 17177 + }, + { + "epoch": 2.7832145171743354, + "grad_norm": 0.866930365562439, + "learning_rate": 6.815943225914278e-08, + "loss": 0.0569, + "step": 17178 + }, + { + "epoch": 2.7833765392093324, + "grad_norm": 0.9813682436943054, + "learning_rate": 6.805805239914087e-08, + "loss": 0.0615, + "step": 17179 + }, + { + "epoch": 2.7835385612443293, + "grad_norm": 1.0694559812545776, + "learning_rate": 6.795674695067783e-08, + "loss": 0.0656, + "step": 17180 + }, + { + "epoch": 2.783700583279326, + "grad_norm": 0.8712436556816101, + "learning_rate": 6.785551591685257e-08, + "loss": 0.0597, + "step": 17181 + }, + { + "epoch": 2.7838626053143227, + "grad_norm": 0.7749736905097961, + "learning_rate": 6.77543593007629e-08, + "loss": 0.0492, + "step": 17182 + }, + { + "epoch": 2.7840246273493197, + "grad_norm": 0.8556262850761414, + "learning_rate": 6.765327710550412e-08, + "loss": 0.062, + "step": 17183 + }, + { + "epoch": 2.784186649384316, + "grad_norm": 0.953547477722168, + "learning_rate": 6.755226933416876e-08, + "loss": 0.0589, + "step": 17184 + }, + { + "epoch": 2.784348671419313, + "grad_norm": 0.8861910104751587, + "learning_rate": 6.745133598984737e-08, + "loss": 0.0543, + "step": 17185 + }, + { + "epoch": 2.78451069345431, + "grad_norm": 1.0571638345718384, + "learning_rate": 6.735047707562863e-08, + "loss": 0.0595, + "step": 17186 + }, + { + "epoch": 2.7846727154893065, + "grad_norm": 1.2609553337097168, + "learning_rate": 6.72496925945984e-08, + "loss": 0.0701, + "step": 17187 + }, + { + "epoch": 2.7848347375243034, + "grad_norm": 0.9145808815956116, + "learning_rate": 6.714898254984031e-08, + "loss": 0.062, + "step": 17188 + }, + { + "epoch": 2.7849967595593, + "grad_norm": 0.9036248326301575, + "learning_rate": 6.704834694443608e-08, + "loss": 0.0625, + "step": 17189 + }, + { + "epoch": 2.785158781594297, + "grad_norm": 0.834865391254425, + "learning_rate": 6.69477857814646e-08, + "loss": 0.0523, + "step": 17190 + }, + { + "epoch": 2.7853208036292934, + "grad_norm": 0.823307991027832, + "learning_rate": 6.684729906400344e-08, + "loss": 0.0572, + "step": 17191 + }, + { + "epoch": 2.7854828256642903, + "grad_norm": 1.1076858043670654, + "learning_rate": 6.674688679512654e-08, + "loss": 0.0676, + "step": 17192 + }, + { + "epoch": 2.785644847699287, + "grad_norm": 0.8785462975502014, + "learning_rate": 6.66465489779064e-08, + "loss": 0.0596, + "step": 17193 + }, + { + "epoch": 2.7858068697342837, + "grad_norm": 0.8788356781005859, + "learning_rate": 6.654628561541337e-08, + "loss": 0.0625, + "step": 17194 + }, + { + "epoch": 2.7859688917692806, + "grad_norm": 0.8812581896781921, + "learning_rate": 6.644609671071556e-08, + "loss": 0.0609, + "step": 17195 + }, + { + "epoch": 2.7861309138042776, + "grad_norm": 0.9713631272315979, + "learning_rate": 6.634598226687772e-08, + "loss": 0.0644, + "step": 17196 + }, + { + "epoch": 2.786292935839274, + "grad_norm": 0.881287693977356, + "learning_rate": 6.624594228696323e-08, + "loss": 0.0618, + "step": 17197 + }, + { + "epoch": 2.786454957874271, + "grad_norm": 0.8775894641876221, + "learning_rate": 6.614597677403384e-08, + "loss": 0.0579, + "step": 17198 + }, + { + "epoch": 2.786616979909268, + "grad_norm": 0.877691388130188, + "learning_rate": 6.604608573114735e-08, + "loss": 0.0617, + "step": 17199 + }, + { + "epoch": 2.7867790019442644, + "grad_norm": 0.8632456064224243, + "learning_rate": 6.594626916136077e-08, + "loss": 0.0617, + "step": 17200 + }, + { + "epoch": 2.786941023979261, + "grad_norm": 1.1622703075408936, + "learning_rate": 6.584652706772804e-08, + "loss": 0.0661, + "step": 17201 + }, + { + "epoch": 2.787103046014258, + "grad_norm": 0.9076300263404846, + "learning_rate": 6.574685945330145e-08, + "loss": 0.0588, + "step": 17202 + }, + { + "epoch": 2.787265068049255, + "grad_norm": 0.931695282459259, + "learning_rate": 6.564726632112939e-08, + "loss": 0.0568, + "step": 17203 + }, + { + "epoch": 2.7874270900842513, + "grad_norm": 1.018306016921997, + "learning_rate": 6.554774767426026e-08, + "loss": 0.0644, + "step": 17204 + }, + { + "epoch": 2.787589112119248, + "grad_norm": 0.9979287981987, + "learning_rate": 6.544830351573883e-08, + "loss": 0.0625, + "step": 17205 + }, + { + "epoch": 2.787751134154245, + "grad_norm": 0.8496156334877014, + "learning_rate": 6.534893384860824e-08, + "loss": 0.0638, + "step": 17206 + }, + { + "epoch": 2.7879131561892416, + "grad_norm": 0.8916325569152832, + "learning_rate": 6.524963867590772e-08, + "loss": 0.0595, + "step": 17207 + }, + { + "epoch": 2.7880751782242386, + "grad_norm": 0.9312731027603149, + "learning_rate": 6.515041800067678e-08, + "loss": 0.0627, + "step": 17208 + }, + { + "epoch": 2.7882372002592355, + "grad_norm": 0.8041099309921265, + "learning_rate": 6.505127182595106e-08, + "loss": 0.0594, + "step": 17209 + }, + { + "epoch": 2.788399222294232, + "grad_norm": 0.9212023019790649, + "learning_rate": 6.495220015476366e-08, + "loss": 0.0539, + "step": 17210 + }, + { + "epoch": 2.788561244329229, + "grad_norm": 0.9335891008377075, + "learning_rate": 6.485320299014608e-08, + "loss": 0.0572, + "step": 17211 + }, + { + "epoch": 2.7887232663642254, + "grad_norm": 0.9345332384109497, + "learning_rate": 6.475428033512754e-08, + "loss": 0.0667, + "step": 17212 + }, + { + "epoch": 2.7888852883992223, + "grad_norm": 0.938679575920105, + "learning_rate": 6.465543219273507e-08, + "loss": 0.0602, + "step": 17213 + }, + { + "epoch": 2.789047310434219, + "grad_norm": 0.8519788980484009, + "learning_rate": 6.455665856599291e-08, + "loss": 0.05, + "step": 17214 + }, + { + "epoch": 2.7892093324692158, + "grad_norm": 1.0343631505966187, + "learning_rate": 6.445795945792338e-08, + "loss": 0.0649, + "step": 17215 + }, + { + "epoch": 2.7893713545042127, + "grad_norm": 0.935124933719635, + "learning_rate": 6.435933487154627e-08, + "loss": 0.0668, + "step": 17216 + }, + { + "epoch": 2.789533376539209, + "grad_norm": 0.9516078233718872, + "learning_rate": 6.426078480987947e-08, + "loss": 0.0628, + "step": 17217 + }, + { + "epoch": 2.789695398574206, + "grad_norm": 1.1208375692367554, + "learning_rate": 6.416230927593803e-08, + "loss": 0.0603, + "step": 17218 + }, + { + "epoch": 2.789857420609203, + "grad_norm": 0.8511807918548584, + "learning_rate": 6.406390827273567e-08, + "loss": 0.0597, + "step": 17219 + }, + { + "epoch": 2.7900194426441995, + "grad_norm": 0.8629888296127319, + "learning_rate": 6.39655818032825e-08, + "loss": 0.0586, + "step": 17220 + }, + { + "epoch": 2.7901814646791965, + "grad_norm": 1.064953088760376, + "learning_rate": 6.386732987058802e-08, + "loss": 0.0625, + "step": 17221 + }, + { + "epoch": 2.790343486714193, + "grad_norm": 0.9076513648033142, + "learning_rate": 6.376915247765735e-08, + "loss": 0.0551, + "step": 17222 + }, + { + "epoch": 2.79050550874919, + "grad_norm": 0.8662562966346741, + "learning_rate": 6.367104962749504e-08, + "loss": 0.0561, + "step": 17223 + }, + { + "epoch": 2.7906675307841864, + "grad_norm": 0.8259896636009216, + "learning_rate": 6.357302132310338e-08, + "loss": 0.054, + "step": 17224 + }, + { + "epoch": 2.7908295528191833, + "grad_norm": 0.8116698265075684, + "learning_rate": 6.347506756748084e-08, + "loss": 0.0535, + "step": 17225 + }, + { + "epoch": 2.7909915748541803, + "grad_norm": 0.8382045030593872, + "learning_rate": 6.337718836362473e-08, + "loss": 0.056, + "step": 17226 + }, + { + "epoch": 2.7911535968891767, + "grad_norm": 0.8277113437652588, + "learning_rate": 6.327938371453069e-08, + "loss": 0.0618, + "step": 17227 + }, + { + "epoch": 2.7913156189241737, + "grad_norm": 1.0887233018875122, + "learning_rate": 6.318165362319023e-08, + "loss": 0.0643, + "step": 17228 + }, + { + "epoch": 2.7914776409591706, + "grad_norm": 0.9287629723548889, + "learning_rate": 6.308399809259457e-08, + "loss": 0.0593, + "step": 17229 + }, + { + "epoch": 2.791639662994167, + "grad_norm": 0.8305715918540955, + "learning_rate": 6.298641712573105e-08, + "loss": 0.057, + "step": 17230 + }, + { + "epoch": 2.791801685029164, + "grad_norm": 0.8574482202529907, + "learning_rate": 6.28889107255859e-08, + "loss": 0.0548, + "step": 17231 + }, + { + "epoch": 2.791963707064161, + "grad_norm": 0.9703893661499023, + "learning_rate": 6.279147889514226e-08, + "loss": 0.0562, + "step": 17232 + }, + { + "epoch": 2.7921257290991575, + "grad_norm": 0.9155253767967224, + "learning_rate": 6.269412163738137e-08, + "loss": 0.0623, + "step": 17233 + }, + { + "epoch": 2.7922877511341544, + "grad_norm": 0.8451345562934875, + "learning_rate": 6.259683895528251e-08, + "loss": 0.0535, + "step": 17234 + }, + { + "epoch": 2.792449773169151, + "grad_norm": 0.997577965259552, + "learning_rate": 6.249963085182192e-08, + "loss": 0.0666, + "step": 17235 + }, + { + "epoch": 2.792611795204148, + "grad_norm": 1.17750883102417, + "learning_rate": 6.24024973299736e-08, + "loss": 0.0615, + "step": 17236 + }, + { + "epoch": 2.7927738172391443, + "grad_norm": 0.9555923938751221, + "learning_rate": 6.230543839271019e-08, + "loss": 0.0606, + "step": 17237 + }, + { + "epoch": 2.7929358392741412, + "grad_norm": 1.071610689163208, + "learning_rate": 6.220845404300124e-08, + "loss": 0.0687, + "step": 17238 + }, + { + "epoch": 2.793097861309138, + "grad_norm": 0.7882108092308044, + "learning_rate": 6.211154428381466e-08, + "loss": 0.0511, + "step": 17239 + }, + { + "epoch": 2.7932598833441347, + "grad_norm": 0.882218599319458, + "learning_rate": 6.201470911811474e-08, + "loss": 0.0581, + "step": 17240 + }, + { + "epoch": 2.7934219053791316, + "grad_norm": 0.8951191902160645, + "learning_rate": 6.191794854886496e-08, + "loss": 0.0643, + "step": 17241 + }, + { + "epoch": 2.7935839274141285, + "grad_norm": 0.8501166105270386, + "learning_rate": 6.182126257902626e-08, + "loss": 0.0549, + "step": 17242 + }, + { + "epoch": 2.793745949449125, + "grad_norm": 1.0171254873275757, + "learning_rate": 6.17246512115563e-08, + "loss": 0.0557, + "step": 17243 + }, + { + "epoch": 2.793907971484122, + "grad_norm": 0.9592586159706116, + "learning_rate": 6.162811444941159e-08, + "loss": 0.0647, + "step": 17244 + }, + { + "epoch": 2.7940699935191184, + "grad_norm": 0.7187346816062927, + "learning_rate": 6.153165229554587e-08, + "loss": 0.043, + "step": 17245 + }, + { + "epoch": 2.7942320155541154, + "grad_norm": 0.9293650984764099, + "learning_rate": 6.143526475291067e-08, + "loss": 0.0611, + "step": 17246 + }, + { + "epoch": 2.794394037589112, + "grad_norm": 0.919105052947998, + "learning_rate": 6.133895182445504e-08, + "loss": 0.0585, + "step": 17247 + }, + { + "epoch": 2.794556059624109, + "grad_norm": 0.9237651228904724, + "learning_rate": 6.124271351312605e-08, + "loss": 0.0615, + "step": 17248 + }, + { + "epoch": 2.7947180816591057, + "grad_norm": 0.9139578938484192, + "learning_rate": 6.114654982186829e-08, + "loss": 0.0635, + "step": 17249 + }, + { + "epoch": 2.7948801036941022, + "grad_norm": 1.0389397144317627, + "learning_rate": 6.105046075362441e-08, + "loss": 0.0565, + "step": 17250 + }, + { + "epoch": 2.795042125729099, + "grad_norm": 0.9643591046333313, + "learning_rate": 6.095444631133401e-08, + "loss": 0.0637, + "step": 17251 + }, + { + "epoch": 2.795204147764096, + "grad_norm": 0.9869605898857117, + "learning_rate": 6.085850649793529e-08, + "loss": 0.0612, + "step": 17252 + }, + { + "epoch": 2.7953661697990926, + "grad_norm": 1.068179965019226, + "learning_rate": 6.076264131636394e-08, + "loss": 0.0603, + "step": 17253 + }, + { + "epoch": 2.7955281918340895, + "grad_norm": 0.8888418674468994, + "learning_rate": 6.066685076955264e-08, + "loss": 0.0567, + "step": 17254 + }, + { + "epoch": 2.7956902138690864, + "grad_norm": 0.759746253490448, + "learning_rate": 6.057113486043236e-08, + "loss": 0.0525, + "step": 17255 + }, + { + "epoch": 2.795852235904083, + "grad_norm": 1.0633058547973633, + "learning_rate": 6.047549359193245e-08, + "loss": 0.0631, + "step": 17256 + }, + { + "epoch": 2.79601425793908, + "grad_norm": 0.906599223613739, + "learning_rate": 6.037992696697914e-08, + "loss": 0.0616, + "step": 17257 + }, + { + "epoch": 2.7961762799740764, + "grad_norm": 0.8136864900588989, + "learning_rate": 6.028443498849596e-08, + "loss": 0.0528, + "step": 17258 + }, + { + "epoch": 2.7963383020090733, + "grad_norm": 0.992445170879364, + "learning_rate": 6.018901765940499e-08, + "loss": 0.0618, + "step": 17259 + }, + { + "epoch": 2.79650032404407, + "grad_norm": 0.8325508236885071, + "learning_rate": 6.009367498262587e-08, + "loss": 0.05, + "step": 17260 + }, + { + "epoch": 2.7966623460790667, + "grad_norm": 0.7613760232925415, + "learning_rate": 5.999840696107595e-08, + "loss": 0.0488, + "step": 17261 + }, + { + "epoch": 2.7968243681140637, + "grad_norm": 1.0377253293991089, + "learning_rate": 5.990321359767015e-08, + "loss": 0.068, + "step": 17262 + }, + { + "epoch": 2.79698639014906, + "grad_norm": 1.0658155679702759, + "learning_rate": 5.980809489532085e-08, + "loss": 0.0616, + "step": 17263 + }, + { + "epoch": 2.797148412184057, + "grad_norm": 0.9770277142524719, + "learning_rate": 5.97130508569388e-08, + "loss": 0.0582, + "step": 17264 + }, + { + "epoch": 2.797310434219054, + "grad_norm": 0.9497054219245911, + "learning_rate": 5.961808148543219e-08, + "loss": 0.0561, + "step": 17265 + }, + { + "epoch": 2.7974724562540505, + "grad_norm": 0.9469033479690552, + "learning_rate": 5.95231867837065e-08, + "loss": 0.0573, + "step": 17266 + }, + { + "epoch": 2.7976344782890474, + "grad_norm": 0.9465834498405457, + "learning_rate": 5.9428366754665234e-08, + "loss": 0.061, + "step": 17267 + }, + { + "epoch": 2.797796500324044, + "grad_norm": 0.92009437084198, + "learning_rate": 5.933362140121052e-08, + "loss": 0.0604, + "step": 17268 + }, + { + "epoch": 2.797958522359041, + "grad_norm": 0.8519408702850342, + "learning_rate": 5.923895072624031e-08, + "loss": 0.0618, + "step": 17269 + }, + { + "epoch": 2.7981205443940373, + "grad_norm": 1.1452556848526, + "learning_rate": 5.9144354732651455e-08, + "loss": 0.0634, + "step": 17270 + }, + { + "epoch": 2.7982825664290343, + "grad_norm": 0.9965704679489136, + "learning_rate": 5.904983342333887e-08, + "loss": 0.0651, + "step": 17271 + }, + { + "epoch": 2.798444588464031, + "grad_norm": 0.9546822905540466, + "learning_rate": 5.8955386801194394e-08, + "loss": 0.0615, + "step": 17272 + }, + { + "epoch": 2.7986066104990277, + "grad_norm": 0.8720921277999878, + "learning_rate": 5.886101486910767e-08, + "loss": 0.058, + "step": 17273 + }, + { + "epoch": 2.7987686325340246, + "grad_norm": 0.9746670722961426, + "learning_rate": 5.8766717629966387e-08, + "loss": 0.0621, + "step": 17274 + }, + { + "epoch": 2.7989306545690216, + "grad_norm": 0.9597407579421997, + "learning_rate": 5.867249508665629e-08, + "loss": 0.062, + "step": 17275 + }, + { + "epoch": 2.799092676604018, + "grad_norm": 1.0109037160873413, + "learning_rate": 5.857834724205979e-08, + "loss": 0.0593, + "step": 17276 + }, + { + "epoch": 2.799254698639015, + "grad_norm": 0.8651391863822937, + "learning_rate": 5.848427409905766e-08, + "loss": 0.0586, + "step": 17277 + }, + { + "epoch": 2.799416720674012, + "grad_norm": 0.9493432641029358, + "learning_rate": 5.839027566052841e-08, + "loss": 0.0629, + "step": 17278 + }, + { + "epoch": 2.7995787427090084, + "grad_norm": 0.9215166568756104, + "learning_rate": 5.829635192934807e-08, + "loss": 0.0638, + "step": 17279 + }, + { + "epoch": 2.7997407647440054, + "grad_norm": 0.8210483193397522, + "learning_rate": 5.820250290839047e-08, + "loss": 0.0543, + "step": 17280 + }, + { + "epoch": 2.799902786779002, + "grad_norm": 0.8877785801887512, + "learning_rate": 5.810872860052747e-08, + "loss": 0.0562, + "step": 17281 + }, + { + "epoch": 2.8000648088139988, + "grad_norm": 0.9447657465934753, + "learning_rate": 5.801502900862788e-08, + "loss": 0.0624, + "step": 17282 + }, + { + "epoch": 2.8002268308489953, + "grad_norm": 1.0124446153640747, + "learning_rate": 5.7921404135559414e-08, + "loss": 0.0577, + "step": 17283 + }, + { + "epoch": 2.800388852883992, + "grad_norm": 1.1428158283233643, + "learning_rate": 5.782785398418561e-08, + "loss": 0.0618, + "step": 17284 + }, + { + "epoch": 2.800550874918989, + "grad_norm": 1.0584673881530762, + "learning_rate": 5.773437855736974e-08, + "loss": 0.0651, + "step": 17285 + }, + { + "epoch": 2.8007128969539856, + "grad_norm": 0.9388712048530579, + "learning_rate": 5.7640977857972016e-08, + "loss": 0.0651, + "step": 17286 + }, + { + "epoch": 2.8008749189889826, + "grad_norm": 0.9077460765838623, + "learning_rate": 5.7547651888849864e-08, + "loss": 0.0635, + "step": 17287 + }, + { + "epoch": 2.8010369410239795, + "grad_norm": 0.8643128275871277, + "learning_rate": 5.745440065285879e-08, + "loss": 0.0568, + "step": 17288 + }, + { + "epoch": 2.801198963058976, + "grad_norm": 0.9081558585166931, + "learning_rate": 5.736122415285206e-08, + "loss": 0.0654, + "step": 17289 + }, + { + "epoch": 2.801360985093973, + "grad_norm": 0.9258993864059448, + "learning_rate": 5.726812239168128e-08, + "loss": 0.0616, + "step": 17290 + }, + { + "epoch": 2.8015230071289694, + "grad_norm": 0.9604719281196594, + "learning_rate": 5.717509537219418e-08, + "loss": 0.0643, + "step": 17291 + }, + { + "epoch": 2.8016850291639663, + "grad_norm": 0.8643040060997009, + "learning_rate": 5.708214309723792e-08, + "loss": 0.0642, + "step": 17292 + }, + { + "epoch": 2.801847051198963, + "grad_norm": 0.8756396770477295, + "learning_rate": 5.6989265569656335e-08, + "loss": 0.0583, + "step": 17293 + }, + { + "epoch": 2.8020090732339598, + "grad_norm": 0.7664240598678589, + "learning_rate": 5.689646279229105e-08, + "loss": 0.0532, + "step": 17294 + }, + { + "epoch": 2.8021710952689567, + "grad_norm": 0.7147080898284912, + "learning_rate": 5.680373476798201e-08, + "loss": 0.0479, + "step": 17295 + }, + { + "epoch": 2.802333117303953, + "grad_norm": 0.7867782711982727, + "learning_rate": 5.671108149956611e-08, + "loss": 0.0516, + "step": 17296 + }, + { + "epoch": 2.80249513933895, + "grad_norm": 0.9071274399757385, + "learning_rate": 5.661850298987859e-08, + "loss": 0.0572, + "step": 17297 + }, + { + "epoch": 2.802657161373947, + "grad_norm": 0.8463209867477417, + "learning_rate": 5.6525999241751894e-08, + "loss": 0.056, + "step": 17298 + }, + { + "epoch": 2.8028191834089435, + "grad_norm": 1.1977580785751343, + "learning_rate": 5.643357025801655e-08, + "loss": 0.0617, + "step": 17299 + }, + { + "epoch": 2.8029812054439405, + "grad_norm": 0.9107047319412231, + "learning_rate": 5.6341216041500555e-08, + "loss": 0.061, + "step": 17300 + }, + { + "epoch": 2.8031432274789374, + "grad_norm": 0.907247006893158, + "learning_rate": 5.624893659503028e-08, + "loss": 0.0526, + "step": 17301 + }, + { + "epoch": 2.803305249513934, + "grad_norm": 0.8431805968284607, + "learning_rate": 5.6156731921428455e-08, + "loss": 0.0576, + "step": 17302 + }, + { + "epoch": 2.8034672715489304, + "grad_norm": 1.0197104215621948, + "learning_rate": 5.6064602023516154e-08, + "loss": 0.0609, + "step": 17303 + }, + { + "epoch": 2.8036292935839273, + "grad_norm": 0.908807098865509, + "learning_rate": 5.597254690411363e-08, + "loss": 0.061, + "step": 17304 + }, + { + "epoch": 2.8037913156189243, + "grad_norm": 0.8455592393875122, + "learning_rate": 5.588056656603641e-08, + "loss": 0.0588, + "step": 17305 + }, + { + "epoch": 2.8039533376539207, + "grad_norm": 0.9535689949989319, + "learning_rate": 5.5788661012099176e-08, + "loss": 0.0598, + "step": 17306 + }, + { + "epoch": 2.8041153596889177, + "grad_norm": 0.8808077573776245, + "learning_rate": 5.5696830245114134e-08, + "loss": 0.0658, + "step": 17307 + }, + { + "epoch": 2.8042773817239146, + "grad_norm": 0.8899673819541931, + "learning_rate": 5.560507426789069e-08, + "loss": 0.0557, + "step": 17308 + }, + { + "epoch": 2.804439403758911, + "grad_norm": 0.8185904622077942, + "learning_rate": 5.551339308323689e-08, + "loss": 0.0569, + "step": 17309 + }, + { + "epoch": 2.804601425793908, + "grad_norm": 0.8157827258110046, + "learning_rate": 5.5421786693957705e-08, + "loss": 0.0508, + "step": 17310 + }, + { + "epoch": 2.804763447828905, + "grad_norm": 0.9755039811134338, + "learning_rate": 5.533025510285617e-08, + "loss": 0.0592, + "step": 17311 + }, + { + "epoch": 2.8049254698639015, + "grad_norm": 0.9864017367362976, + "learning_rate": 5.523879831273282e-08, + "loss": 0.0625, + "step": 17312 + }, + { + "epoch": 2.8050874918988984, + "grad_norm": 0.8920227289199829, + "learning_rate": 5.514741632638571e-08, + "loss": 0.0634, + "step": 17313 + }, + { + "epoch": 2.805249513933895, + "grad_norm": 0.9825411438941956, + "learning_rate": 5.505610914661147e-08, + "loss": 0.0567, + "step": 17314 + }, + { + "epoch": 2.805411535968892, + "grad_norm": 1.1181998252868652, + "learning_rate": 5.496487677620399e-08, + "loss": 0.0629, + "step": 17315 + }, + { + "epoch": 2.8055735580038883, + "grad_norm": 1.0909833908081055, + "learning_rate": 5.487371921795381e-08, + "loss": 0.0687, + "step": 17316 + }, + { + "epoch": 2.8057355800388852, + "grad_norm": 0.9423796534538269, + "learning_rate": 5.478263647465093e-08, + "loss": 0.0582, + "step": 17317 + }, + { + "epoch": 2.805897602073882, + "grad_norm": 0.8542878031730652, + "learning_rate": 5.4691628549082e-08, + "loss": 0.0655, + "step": 17318 + }, + { + "epoch": 2.8060596241088787, + "grad_norm": 0.9813777208328247, + "learning_rate": 5.4600695444032014e-08, + "loss": 0.0615, + "step": 17319 + }, + { + "epoch": 2.8062216461438756, + "grad_norm": 0.9087584018707275, + "learning_rate": 5.450983716228292e-08, + "loss": 0.0576, + "step": 17320 + }, + { + "epoch": 2.8063836681788725, + "grad_norm": 0.9132459759712219, + "learning_rate": 5.441905370661471e-08, + "loss": 0.0615, + "step": 17321 + }, + { + "epoch": 2.806545690213869, + "grad_norm": 0.7650594115257263, + "learning_rate": 5.4328345079805164e-08, + "loss": 0.0522, + "step": 17322 + }, + { + "epoch": 2.806707712248866, + "grad_norm": 0.9708273410797119, + "learning_rate": 5.423771128462985e-08, + "loss": 0.0676, + "step": 17323 + }, + { + "epoch": 2.806869734283863, + "grad_norm": 0.949398934841156, + "learning_rate": 5.4147152323862085e-08, + "loss": 0.0591, + "step": 17324 + }, + { + "epoch": 2.8070317563188594, + "grad_norm": 0.8356457352638245, + "learning_rate": 5.405666820027272e-08, + "loss": 0.0577, + "step": 17325 + }, + { + "epoch": 2.807193778353856, + "grad_norm": 0.9790476560592651, + "learning_rate": 5.3966258916629824e-08, + "loss": 0.0585, + "step": 17326 + }, + { + "epoch": 2.807355800388853, + "grad_norm": 0.979407548904419, + "learning_rate": 5.387592447570061e-08, + "loss": 0.0618, + "step": 17327 + }, + { + "epoch": 2.8075178224238497, + "grad_norm": 0.8905614614486694, + "learning_rate": 5.378566488024817e-08, + "loss": 0.0569, + "step": 17328 + }, + { + "epoch": 2.807679844458846, + "grad_norm": 0.8756917715072632, + "learning_rate": 5.3695480133034994e-08, + "loss": 0.0593, + "step": 17329 + }, + { + "epoch": 2.807841866493843, + "grad_norm": 0.8476364016532898, + "learning_rate": 5.3605370236820276e-08, + "loss": 0.0571, + "step": 17330 + }, + { + "epoch": 2.80800388852884, + "grad_norm": 1.1375116109848022, + "learning_rate": 5.3515335194360694e-08, + "loss": 0.0673, + "step": 17331 + }, + { + "epoch": 2.8081659105638366, + "grad_norm": 0.9701889157295227, + "learning_rate": 5.3425375008411276e-08, + "loss": 0.0694, + "step": 17332 + }, + { + "epoch": 2.8083279325988335, + "grad_norm": 0.8260276913642883, + "learning_rate": 5.333548968172536e-08, + "loss": 0.0599, + "step": 17333 + }, + { + "epoch": 2.8084899546338304, + "grad_norm": 0.8484598994255066, + "learning_rate": 5.3245679217052424e-08, + "loss": 0.0548, + "step": 17334 + }, + { + "epoch": 2.808651976668827, + "grad_norm": 0.9745419025421143, + "learning_rate": 5.315594361714083e-08, + "loss": 0.062, + "step": 17335 + }, + { + "epoch": 2.808813998703824, + "grad_norm": 0.868465006351471, + "learning_rate": 5.3066282884735863e-08, + "loss": 0.0614, + "step": 17336 + }, + { + "epoch": 2.8089760207388204, + "grad_norm": 0.9748360514640808, + "learning_rate": 5.297669702258118e-08, + "loss": 0.0632, + "step": 17337 + }, + { + "epoch": 2.8091380427738173, + "grad_norm": 1.0200855731964111, + "learning_rate": 5.2887186033417914e-08, + "loss": 0.0625, + "step": 17338 + }, + { + "epoch": 2.809300064808814, + "grad_norm": 1.0314770936965942, + "learning_rate": 5.279774991998499e-08, + "loss": 0.0611, + "step": 17339 + }, + { + "epoch": 2.8094620868438107, + "grad_norm": 1.0516095161437988, + "learning_rate": 5.270838868501854e-08, + "loss": 0.0644, + "step": 17340 + }, + { + "epoch": 2.8096241088788076, + "grad_norm": 0.8740168809890747, + "learning_rate": 5.261910233125333e-08, + "loss": 0.0567, + "step": 17341 + }, + { + "epoch": 2.809786130913804, + "grad_norm": 0.9460188746452332, + "learning_rate": 5.252989086142107e-08, + "loss": 0.0538, + "step": 17342 + }, + { + "epoch": 2.809948152948801, + "grad_norm": 0.8696618676185608, + "learning_rate": 5.244075427825124e-08, + "loss": 0.0557, + "step": 17343 + }, + { + "epoch": 2.810110174983798, + "grad_norm": 1.0212994813919067, + "learning_rate": 5.235169258447137e-08, + "loss": 0.0699, + "step": 17344 + }, + { + "epoch": 2.8102721970187945, + "grad_norm": 1.036773443222046, + "learning_rate": 5.2262705782806513e-08, + "loss": 0.0664, + "step": 17345 + }, + { + "epoch": 2.8104342190537914, + "grad_norm": 0.875264585018158, + "learning_rate": 5.2173793875979204e-08, + "loss": 0.0647, + "step": 17346 + }, + { + "epoch": 2.810596241088788, + "grad_norm": 1.0210704803466797, + "learning_rate": 5.208495686671061e-08, + "loss": 0.0674, + "step": 17347 + }, + { + "epoch": 2.810758263123785, + "grad_norm": 1.0680155754089355, + "learning_rate": 5.199619475771856e-08, + "loss": 0.0707, + "step": 17348 + }, + { + "epoch": 2.8109202851587813, + "grad_norm": 0.8734906315803528, + "learning_rate": 5.190750755171864e-08, + "loss": 0.0533, + "step": 17349 + }, + { + "epoch": 2.8110823071937783, + "grad_norm": 0.8121834397315979, + "learning_rate": 5.181889525142453e-08, + "loss": 0.0516, + "step": 17350 + }, + { + "epoch": 2.811244329228775, + "grad_norm": 1.0527007579803467, + "learning_rate": 5.1730357859547666e-08, + "loss": 0.0645, + "step": 17351 + }, + { + "epoch": 2.8114063512637717, + "grad_norm": 0.9521415829658508, + "learning_rate": 5.164189537879782e-08, + "loss": 0.0601, + "step": 17352 + }, + { + "epoch": 2.8115683732987686, + "grad_norm": 0.9354329705238342, + "learning_rate": 5.155350781188062e-08, + "loss": 0.0609, + "step": 17353 + }, + { + "epoch": 2.8117303953337656, + "grad_norm": 0.8666455149650574, + "learning_rate": 5.146519516150084e-08, + "loss": 0.0575, + "step": 17354 + }, + { + "epoch": 2.811892417368762, + "grad_norm": 0.874804675579071, + "learning_rate": 5.137695743036103e-08, + "loss": 0.0606, + "step": 17355 + }, + { + "epoch": 2.812054439403759, + "grad_norm": 0.96415114402771, + "learning_rate": 5.128879462116071e-08, + "loss": 0.0546, + "step": 17356 + }, + { + "epoch": 2.812216461438756, + "grad_norm": 1.104056477546692, + "learning_rate": 5.1200706736597435e-08, + "loss": 0.0641, + "step": 17357 + }, + { + "epoch": 2.8123784834737524, + "grad_norm": 0.9852107763290405, + "learning_rate": 5.111269377936656e-08, + "loss": 0.0684, + "step": 17358 + }, + { + "epoch": 2.8125405055087493, + "grad_norm": 0.9444692730903625, + "learning_rate": 5.10247557521612e-08, + "loss": 0.0621, + "step": 17359 + }, + { + "epoch": 2.812702527543746, + "grad_norm": 0.8795128464698792, + "learning_rate": 5.093689265767143e-08, + "loss": 0.0592, + "step": 17360 + }, + { + "epoch": 2.8128645495787428, + "grad_norm": 0.9272993206977844, + "learning_rate": 5.084910449858649e-08, + "loss": 0.0582, + "step": 17361 + }, + { + "epoch": 2.8130265716137393, + "grad_norm": 1.0397619009017944, + "learning_rate": 5.0761391277591996e-08, + "loss": 0.0598, + "step": 17362 + }, + { + "epoch": 2.813188593648736, + "grad_norm": 0.9312973022460938, + "learning_rate": 5.0673752997372204e-08, + "loss": 0.0581, + "step": 17363 + }, + { + "epoch": 2.813350615683733, + "grad_norm": 0.9002386331558228, + "learning_rate": 5.05861896606083e-08, + "loss": 0.0614, + "step": 17364 + }, + { + "epoch": 2.8135126377187296, + "grad_norm": 0.9482325911521912, + "learning_rate": 5.049870126997897e-08, + "loss": 0.0642, + "step": 17365 + }, + { + "epoch": 2.8136746597537265, + "grad_norm": 0.8579296469688416, + "learning_rate": 5.0411287828162346e-08, + "loss": 0.0602, + "step": 17366 + }, + { + "epoch": 2.8138366817887235, + "grad_norm": 0.9267897605895996, + "learning_rate": 5.032394933783213e-08, + "loss": 0.0552, + "step": 17367 + }, + { + "epoch": 2.81399870382372, + "grad_norm": 0.8635824918746948, + "learning_rate": 5.023668580166091e-08, + "loss": 0.057, + "step": 17368 + }, + { + "epoch": 2.814160725858717, + "grad_norm": 1.0104823112487793, + "learning_rate": 5.014949722231876e-08, + "loss": 0.0652, + "step": 17369 + }, + { + "epoch": 2.8143227478937134, + "grad_norm": 0.9497948884963989, + "learning_rate": 5.0062383602473566e-08, + "loss": 0.0546, + "step": 17370 + }, + { + "epoch": 2.8144847699287103, + "grad_norm": 0.9637094140052795, + "learning_rate": 4.9975344944790674e-08, + "loss": 0.0576, + "step": 17371 + }, + { + "epoch": 2.814646791963707, + "grad_norm": 0.904549241065979, + "learning_rate": 4.9888381251933237e-08, + "loss": 0.0664, + "step": 17372 + }, + { + "epoch": 2.8148088139987038, + "grad_norm": 0.9141635894775391, + "learning_rate": 4.980149252656219e-08, + "loss": 0.0575, + "step": 17373 + }, + { + "epoch": 2.8149708360337007, + "grad_norm": 0.8708399534225464, + "learning_rate": 4.971467877133651e-08, + "loss": 0.0565, + "step": 17374 + }, + { + "epoch": 2.815132858068697, + "grad_norm": 0.9253936409950256, + "learning_rate": 4.962793998891158e-08, + "loss": 0.0568, + "step": 17375 + }, + { + "epoch": 2.815294880103694, + "grad_norm": 1.0665290355682373, + "learning_rate": 4.954127618194193e-08, + "loss": 0.0641, + "step": 17376 + }, + { + "epoch": 2.815456902138691, + "grad_norm": 0.9630182385444641, + "learning_rate": 4.945468735307934e-08, + "loss": 0.0655, + "step": 17377 + }, + { + "epoch": 2.8156189241736875, + "grad_norm": 0.8089599609375, + "learning_rate": 4.936817350497336e-08, + "loss": 0.057, + "step": 17378 + }, + { + "epoch": 2.8157809462086845, + "grad_norm": 0.85359787940979, + "learning_rate": 4.9281734640270476e-08, + "loss": 0.0535, + "step": 17379 + }, + { + "epoch": 2.8159429682436814, + "grad_norm": 0.9590152502059937, + "learning_rate": 4.919537076161579e-08, + "loss": 0.0589, + "step": 17380 + }, + { + "epoch": 2.816104990278678, + "grad_norm": 0.8613442182540894, + "learning_rate": 4.910908187165248e-08, + "loss": 0.0625, + "step": 17381 + }, + { + "epoch": 2.816267012313675, + "grad_norm": 0.9294003248214722, + "learning_rate": 4.90228679730198e-08, + "loss": 0.0623, + "step": 17382 + }, + { + "epoch": 2.8164290343486713, + "grad_norm": 0.906424343585968, + "learning_rate": 4.893672906835623e-08, + "loss": 0.058, + "step": 17383 + }, + { + "epoch": 2.8165910563836682, + "grad_norm": 0.8201852440834045, + "learning_rate": 4.8850665160297406e-08, + "loss": 0.0538, + "step": 17384 + }, + { + "epoch": 2.8167530784186647, + "grad_norm": 0.8930208086967468, + "learning_rate": 4.8764676251476237e-08, + "loss": 0.0556, + "step": 17385 + }, + { + "epoch": 2.8169151004536617, + "grad_norm": 0.8145630955696106, + "learning_rate": 4.867876234452423e-08, + "loss": 0.0543, + "step": 17386 + }, + { + "epoch": 2.8170771224886586, + "grad_norm": 1.011269211769104, + "learning_rate": 4.859292344207012e-08, + "loss": 0.0641, + "step": 17387 + }, + { + "epoch": 2.817239144523655, + "grad_norm": 0.9331755638122559, + "learning_rate": 4.85071595467404e-08, + "loss": 0.0623, + "step": 17388 + }, + { + "epoch": 2.817401166558652, + "grad_norm": 1.006993055343628, + "learning_rate": 4.842147066115882e-08, + "loss": 0.0664, + "step": 17389 + }, + { + "epoch": 2.817563188593649, + "grad_norm": 0.8741045594215393, + "learning_rate": 4.8335856787947447e-08, + "loss": 0.0569, + "step": 17390 + }, + { + "epoch": 2.8177252106286454, + "grad_norm": 0.9738301634788513, + "learning_rate": 4.825031792972612e-08, + "loss": 0.0626, + "step": 17391 + }, + { + "epoch": 2.8178872326636424, + "grad_norm": 0.9677324295043945, + "learning_rate": 4.81648540891122e-08, + "loss": 0.0639, + "step": 17392 + }, + { + "epoch": 2.818049254698639, + "grad_norm": 0.8875753283500671, + "learning_rate": 4.807946526872026e-08, + "loss": 0.0516, + "step": 17393 + }, + { + "epoch": 2.818211276733636, + "grad_norm": 0.8754310607910156, + "learning_rate": 4.799415147116265e-08, + "loss": 0.0591, + "step": 17394 + }, + { + "epoch": 2.8183732987686323, + "grad_norm": 0.9936445355415344, + "learning_rate": 4.7908912699050906e-08, + "loss": 0.0666, + "step": 17395 + }, + { + "epoch": 2.8185353208036292, + "grad_norm": 0.9375000596046448, + "learning_rate": 4.782374895499236e-08, + "loss": 0.0646, + "step": 17396 + }, + { + "epoch": 2.818697342838626, + "grad_norm": 0.9706060886383057, + "learning_rate": 4.773866024159274e-08, + "loss": 0.0558, + "step": 17397 + }, + { + "epoch": 2.8188593648736227, + "grad_norm": 0.8691913485527039, + "learning_rate": 4.7653646561455767e-08, + "loss": 0.063, + "step": 17398 + }, + { + "epoch": 2.8190213869086196, + "grad_norm": 1.1515320539474487, + "learning_rate": 4.756870791718271e-08, + "loss": 0.0665, + "step": 17399 + }, + { + "epoch": 2.8191834089436165, + "grad_norm": 1.0879017114639282, + "learning_rate": 4.7483844311372594e-08, + "loss": 0.0659, + "step": 17400 + }, + { + "epoch": 2.819345430978613, + "grad_norm": 0.8411441445350647, + "learning_rate": 4.739905574662168e-08, + "loss": 0.0525, + "step": 17401 + }, + { + "epoch": 2.81950745301361, + "grad_norm": 0.8254294395446777, + "learning_rate": 4.731434222552456e-08, + "loss": 0.0584, + "step": 17402 + }, + { + "epoch": 2.819669475048607, + "grad_norm": 0.8175220489501953, + "learning_rate": 4.722970375067304e-08, + "loss": 0.0581, + "step": 17403 + }, + { + "epoch": 2.8198314970836034, + "grad_norm": 0.9433904886245728, + "learning_rate": 4.7145140324657e-08, + "loss": 0.0595, + "step": 17404 + }, + { + "epoch": 2.8199935191186, + "grad_norm": 0.9785334467887878, + "learning_rate": 4.7060651950064094e-08, + "loss": 0.0633, + "step": 17405 + }, + { + "epoch": 2.820155541153597, + "grad_norm": 0.7298676371574402, + "learning_rate": 4.697623862947892e-08, + "loss": 0.0528, + "step": 17406 + }, + { + "epoch": 2.8203175631885937, + "grad_norm": 0.8374927639961243, + "learning_rate": 4.689190036548524e-08, + "loss": 0.0558, + "step": 17407 + }, + { + "epoch": 2.82047958522359, + "grad_norm": 0.8497073650360107, + "learning_rate": 4.680763716066239e-08, + "loss": 0.0592, + "step": 17408 + }, + { + "epoch": 2.820641607258587, + "grad_norm": 1.0084335803985596, + "learning_rate": 4.672344901758941e-08, + "loss": 0.0642, + "step": 17409 + }, + { + "epoch": 2.820803629293584, + "grad_norm": 0.8744716644287109, + "learning_rate": 4.663933593884229e-08, + "loss": 0.0627, + "step": 17410 + }, + { + "epoch": 2.8209656513285806, + "grad_norm": 0.8989715576171875, + "learning_rate": 4.655529792699426e-08, + "loss": 0.0603, + "step": 17411 + }, + { + "epoch": 2.8211276733635775, + "grad_norm": 1.0203579664230347, + "learning_rate": 4.6471334984616866e-08, + "loss": 0.0635, + "step": 17412 + }, + { + "epoch": 2.8212896953985744, + "grad_norm": 0.8238543272018433, + "learning_rate": 4.6387447114278897e-08, + "loss": 0.0526, + "step": 17413 + }, + { + "epoch": 2.821451717433571, + "grad_norm": 0.872983455657959, + "learning_rate": 4.6303634318548006e-08, + "loss": 0.0611, + "step": 17414 + }, + { + "epoch": 2.821613739468568, + "grad_norm": 0.9443795084953308, + "learning_rate": 4.6219896599987714e-08, + "loss": 0.0602, + "step": 17415 + }, + { + "epoch": 2.8217757615035644, + "grad_norm": 0.8985307812690735, + "learning_rate": 4.613623396116068e-08, + "loss": 0.0577, + "step": 17416 + }, + { + "epoch": 2.8219377835385613, + "grad_norm": 0.9005011916160583, + "learning_rate": 4.6052646404626814e-08, + "loss": 0.0627, + "step": 17417 + }, + { + "epoch": 2.8220998055735578, + "grad_norm": 1.0429044961929321, + "learning_rate": 4.596913393294322e-08, + "loss": 0.0734, + "step": 17418 + }, + { + "epoch": 2.8222618276085547, + "grad_norm": 0.9500260949134827, + "learning_rate": 4.5885696548665645e-08, + "loss": 0.0641, + "step": 17419 + }, + { + "epoch": 2.8224238496435516, + "grad_norm": 0.828524112701416, + "learning_rate": 4.580233425434677e-08, + "loss": 0.0574, + "step": 17420 + }, + { + "epoch": 2.822585871678548, + "grad_norm": 1.0398857593536377, + "learning_rate": 4.57190470525376e-08, + "loss": 0.0606, + "step": 17421 + }, + { + "epoch": 2.822747893713545, + "grad_norm": 0.9271135926246643, + "learning_rate": 4.563583494578638e-08, + "loss": 0.0619, + "step": 17422 + }, + { + "epoch": 2.822909915748542, + "grad_norm": 0.8910772800445557, + "learning_rate": 4.555269793663886e-08, + "loss": 0.0572, + "step": 17423 + }, + { + "epoch": 2.8230719377835385, + "grad_norm": 0.9423297047615051, + "learning_rate": 4.546963602763937e-08, + "loss": 0.0671, + "step": 17424 + }, + { + "epoch": 2.8232339598185354, + "grad_norm": 0.9918851256370544, + "learning_rate": 4.5386649221329516e-08, + "loss": 0.0641, + "step": 17425 + }, + { + "epoch": 2.8233959818535324, + "grad_norm": 0.9373489022254944, + "learning_rate": 4.530373752024753e-08, + "loss": 0.0565, + "step": 17426 + }, + { + "epoch": 2.823558003888529, + "grad_norm": 0.9793664216995239, + "learning_rate": 4.5220900926931374e-08, + "loss": 0.0552, + "step": 17427 + }, + { + "epoch": 2.8237200259235253, + "grad_norm": 0.9216907024383545, + "learning_rate": 4.51381394439146e-08, + "loss": 0.0623, + "step": 17428 + }, + { + "epoch": 2.8238820479585223, + "grad_norm": 0.9568885564804077, + "learning_rate": 4.5055453073730715e-08, + "loss": 0.0557, + "step": 17429 + }, + { + "epoch": 2.824044069993519, + "grad_norm": 0.8096397519111633, + "learning_rate": 4.497284181890882e-08, + "loss": 0.0505, + "step": 17430 + }, + { + "epoch": 2.8242060920285157, + "grad_norm": 0.8063415884971619, + "learning_rate": 4.4890305681977164e-08, + "loss": 0.0506, + "step": 17431 + }, + { + "epoch": 2.8243681140635126, + "grad_norm": 0.9902795553207397, + "learning_rate": 4.480784466546068e-08, + "loss": 0.0676, + "step": 17432 + }, + { + "epoch": 2.8245301360985096, + "grad_norm": 0.9459345936775208, + "learning_rate": 4.4725458771882615e-08, + "loss": 0.0634, + "step": 17433 + }, + { + "epoch": 2.824692158133506, + "grad_norm": 0.8988990783691406, + "learning_rate": 4.4643148003764015e-08, + "loss": 0.0587, + "step": 17434 + }, + { + "epoch": 2.824854180168503, + "grad_norm": 0.9229409694671631, + "learning_rate": 4.456091236362314e-08, + "loss": 0.0612, + "step": 17435 + }, + { + "epoch": 2.8250162022035, + "grad_norm": 1.0969082117080688, + "learning_rate": 4.44787518539766e-08, + "loss": 0.0609, + "step": 17436 + }, + { + "epoch": 2.8251782242384964, + "grad_norm": 0.863645076751709, + "learning_rate": 4.4396666477337645e-08, + "loss": 0.055, + "step": 17437 + }, + { + "epoch": 2.8253402462734933, + "grad_norm": 0.9219480156898499, + "learning_rate": 4.4314656236218444e-08, + "loss": 0.061, + "step": 17438 + }, + { + "epoch": 2.82550226830849, + "grad_norm": 1.2461076974868774, + "learning_rate": 4.423272113312782e-08, + "loss": 0.0572, + "step": 17439 + }, + { + "epoch": 2.8256642903434868, + "grad_norm": 0.7902315258979797, + "learning_rate": 4.415086117057377e-08, + "loss": 0.0539, + "step": 17440 + }, + { + "epoch": 2.8258263123784833, + "grad_norm": 0.904567301273346, + "learning_rate": 4.406907635105984e-08, + "loss": 0.0638, + "step": 17441 + }, + { + "epoch": 2.82598833441348, + "grad_norm": 0.8270232081413269, + "learning_rate": 4.398736667708875e-08, + "loss": 0.0571, + "step": 17442 + }, + { + "epoch": 2.826150356448477, + "grad_norm": 1.1331998109817505, + "learning_rate": 4.390573215116101e-08, + "loss": 0.0772, + "step": 17443 + }, + { + "epoch": 2.8263123784834736, + "grad_norm": 0.8062843680381775, + "learning_rate": 4.382417277577433e-08, + "loss": 0.0547, + "step": 17444 + }, + { + "epoch": 2.8264744005184705, + "grad_norm": 1.1029092073440552, + "learning_rate": 4.374268855342395e-08, + "loss": 0.065, + "step": 17445 + }, + { + "epoch": 2.8266364225534675, + "grad_norm": 0.9821694493293762, + "learning_rate": 4.3661279486603424e-08, + "loss": 0.0602, + "step": 17446 + }, + { + "epoch": 2.826798444588464, + "grad_norm": 0.8740617036819458, + "learning_rate": 4.357994557780354e-08, + "loss": 0.0604, + "step": 17447 + }, + { + "epoch": 2.826960466623461, + "grad_norm": 1.0357604026794434, + "learning_rate": 4.349868682951286e-08, + "loss": 0.0618, + "step": 17448 + }, + { + "epoch": 2.8271224886584574, + "grad_norm": 0.8919463753700256, + "learning_rate": 4.3417503244217726e-08, + "loss": 0.0608, + "step": 17449 + }, + { + "epoch": 2.8272845106934543, + "grad_norm": 0.9080489873886108, + "learning_rate": 4.333639482440199e-08, + "loss": 0.0629, + "step": 17450 + }, + { + "epoch": 2.827446532728451, + "grad_norm": 0.9267844557762146, + "learning_rate": 4.3255361572547836e-08, + "loss": 0.0615, + "step": 17451 + }, + { + "epoch": 2.8276085547634477, + "grad_norm": 0.9971922039985657, + "learning_rate": 4.3174403491134385e-08, + "loss": 0.0687, + "step": 17452 + }, + { + "epoch": 2.8277705767984447, + "grad_norm": 0.7694254517555237, + "learning_rate": 4.309352058263855e-08, + "loss": 0.0511, + "step": 17453 + }, + { + "epoch": 2.827932598833441, + "grad_norm": 0.826542854309082, + "learning_rate": 4.301271284953584e-08, + "loss": 0.0516, + "step": 17454 + }, + { + "epoch": 2.828094620868438, + "grad_norm": 0.7882339358329773, + "learning_rate": 4.29319802942979e-08, + "loss": 0.0527, + "step": 17455 + }, + { + "epoch": 2.828256642903435, + "grad_norm": 0.8655419945716858, + "learning_rate": 4.285132291939526e-08, + "loss": 0.059, + "step": 17456 + }, + { + "epoch": 2.8284186649384315, + "grad_norm": 0.8806518316268921, + "learning_rate": 4.27707407272962e-08, + "loss": 0.0605, + "step": 17457 + }, + { + "epoch": 2.8285806869734285, + "grad_norm": 0.9758058786392212, + "learning_rate": 4.2690233720466265e-08, + "loss": 0.0649, + "step": 17458 + }, + { + "epoch": 2.8287427090084254, + "grad_norm": 0.8468053340911865, + "learning_rate": 4.2609801901368485e-08, + "loss": 0.0556, + "step": 17459 + }, + { + "epoch": 2.828904731043422, + "grad_norm": 0.8715428709983826, + "learning_rate": 4.2529445272463946e-08, + "loss": 0.0595, + "step": 17460 + }, + { + "epoch": 2.829066753078419, + "grad_norm": 1.172911524772644, + "learning_rate": 4.2449163836211507e-08, + "loss": 0.0675, + "step": 17461 + }, + { + "epoch": 2.8292287751134153, + "grad_norm": 0.836900532245636, + "learning_rate": 4.2368957595067264e-08, + "loss": 0.0587, + "step": 17462 + }, + { + "epoch": 2.8293907971484122, + "grad_norm": 0.9022508263587952, + "learning_rate": 4.228882655148564e-08, + "loss": 0.0594, + "step": 17463 + }, + { + "epoch": 2.8295528191834087, + "grad_norm": 0.8521215915679932, + "learning_rate": 4.220877070791857e-08, + "loss": 0.0583, + "step": 17464 + }, + { + "epoch": 2.8297148412184057, + "grad_norm": 0.9187961220741272, + "learning_rate": 4.2128790066815195e-08, + "loss": 0.0621, + "step": 17465 + }, + { + "epoch": 2.8298768632534026, + "grad_norm": 0.8813655972480774, + "learning_rate": 4.204888463062273e-08, + "loss": 0.0555, + "step": 17466 + }, + { + "epoch": 2.830038885288399, + "grad_norm": 0.7997599840164185, + "learning_rate": 4.1969054401786724e-08, + "loss": 0.0553, + "step": 17467 + }, + { + "epoch": 2.830200907323396, + "grad_norm": 0.8015317916870117, + "learning_rate": 4.188929938274911e-08, + "loss": 0.0537, + "step": 17468 + }, + { + "epoch": 2.830362929358393, + "grad_norm": 0.8224532604217529, + "learning_rate": 4.1809619575950425e-08, + "loss": 0.0552, + "step": 17469 + }, + { + "epoch": 2.8305249513933894, + "grad_norm": 1.1566404104232788, + "learning_rate": 4.1730014983828724e-08, + "loss": 0.0624, + "step": 17470 + }, + { + "epoch": 2.8306869734283864, + "grad_norm": 0.8228906989097595, + "learning_rate": 4.165048560881929e-08, + "loss": 0.0598, + "step": 17471 + }, + { + "epoch": 2.830848995463383, + "grad_norm": 1.1967852115631104, + "learning_rate": 4.157103145335628e-08, + "loss": 0.0753, + "step": 17472 + }, + { + "epoch": 2.83101101749838, + "grad_norm": 0.9739968776702881, + "learning_rate": 4.149165251987053e-08, + "loss": 0.0575, + "step": 17473 + }, + { + "epoch": 2.8311730395333763, + "grad_norm": 0.9316312074661255, + "learning_rate": 4.141234881079065e-08, + "loss": 0.0653, + "step": 17474 + }, + { + "epoch": 2.8313350615683732, + "grad_norm": 1.3163434267044067, + "learning_rate": 4.1333120328542754e-08, + "loss": 0.0668, + "step": 17475 + }, + { + "epoch": 2.83149708360337, + "grad_norm": 0.9953809976577759, + "learning_rate": 4.125396707555213e-08, + "loss": 0.0615, + "step": 17476 + }, + { + "epoch": 2.8316591056383666, + "grad_norm": 1.2429362535476685, + "learning_rate": 4.1174889054239885e-08, + "loss": 0.0626, + "step": 17477 + }, + { + "epoch": 2.8318211276733636, + "grad_norm": 1.1035979986190796, + "learning_rate": 4.109588626702576e-08, + "loss": 0.0654, + "step": 17478 + }, + { + "epoch": 2.8319831497083605, + "grad_norm": 0.7999586462974548, + "learning_rate": 4.101695871632699e-08, + "loss": 0.0552, + "step": 17479 + }, + { + "epoch": 2.832145171743357, + "grad_norm": 1.007940649986267, + "learning_rate": 4.0938106404558864e-08, + "loss": 0.0631, + "step": 17480 + }, + { + "epoch": 2.832307193778354, + "grad_norm": 0.8191199898719788, + "learning_rate": 4.085932933413361e-08, + "loss": 0.0567, + "step": 17481 + }, + { + "epoch": 2.832469215813351, + "grad_norm": 1.0429675579071045, + "learning_rate": 4.078062750746209e-08, + "loss": 0.0633, + "step": 17482 + }, + { + "epoch": 2.8326312378483474, + "grad_norm": 0.9091582298278809, + "learning_rate": 4.070200092695209e-08, + "loss": 0.0591, + "step": 17483 + }, + { + "epoch": 2.8327932598833443, + "grad_norm": 0.9163491725921631, + "learning_rate": 4.062344959500947e-08, + "loss": 0.0672, + "step": 17484 + }, + { + "epoch": 2.832955281918341, + "grad_norm": 1.2758320569992065, + "learning_rate": 4.054497351403758e-08, + "loss": 0.0597, + "step": 17485 + }, + { + "epoch": 2.8331173039533377, + "grad_norm": 1.0452107191085815, + "learning_rate": 4.0466572686437833e-08, + "loss": 0.0654, + "step": 17486 + }, + { + "epoch": 2.833279325988334, + "grad_norm": 1.031578540802002, + "learning_rate": 4.038824711460943e-08, + "loss": 0.0656, + "step": 17487 + }, + { + "epoch": 2.833441348023331, + "grad_norm": 0.8796032071113586, + "learning_rate": 4.0309996800947936e-08, + "loss": 0.06, + "step": 17488 + }, + { + "epoch": 2.833603370058328, + "grad_norm": 0.7887482643127441, + "learning_rate": 4.02318217478484e-08, + "loss": 0.0564, + "step": 17489 + }, + { + "epoch": 2.8337653920933246, + "grad_norm": 0.9122143387794495, + "learning_rate": 4.0153721957702504e-08, + "loss": 0.0655, + "step": 17490 + }, + { + "epoch": 2.8339274141283215, + "grad_norm": 0.9661285877227783, + "learning_rate": 4.0075697432900295e-08, + "loss": 0.065, + "step": 17491 + }, + { + "epoch": 2.8340894361633184, + "grad_norm": 0.8843967318534851, + "learning_rate": 3.9997748175828467e-08, + "loss": 0.0567, + "step": 17492 + }, + { + "epoch": 2.834251458198315, + "grad_norm": 0.9385737180709839, + "learning_rate": 3.9919874188872607e-08, + "loss": 0.0606, + "step": 17493 + }, + { + "epoch": 2.834413480233312, + "grad_norm": 0.9699385166168213, + "learning_rate": 3.9842075474415545e-08, + "loss": 0.0575, + "step": 17494 + }, + { + "epoch": 2.8345755022683083, + "grad_norm": 0.8735253214836121, + "learning_rate": 3.976435203483703e-08, + "loss": 0.0592, + "step": 17495 + }, + { + "epoch": 2.8347375243033053, + "grad_norm": 0.9157827496528625, + "learning_rate": 3.9686703872516e-08, + "loss": 0.0653, + "step": 17496 + }, + { + "epoch": 2.8348995463383018, + "grad_norm": 0.9803964495658875, + "learning_rate": 3.960913098982805e-08, + "loss": 0.0605, + "step": 17497 + }, + { + "epoch": 2.8350615683732987, + "grad_norm": 0.8232876658439636, + "learning_rate": 3.953163338914656e-08, + "loss": 0.0562, + "step": 17498 + }, + { + "epoch": 2.8352235904082956, + "grad_norm": 0.7949116826057434, + "learning_rate": 3.94542110728427e-08, + "loss": 0.0577, + "step": 17499 + }, + { + "epoch": 2.835385612443292, + "grad_norm": 0.94376540184021, + "learning_rate": 3.9376864043285943e-08, + "loss": 0.0646, + "step": 17500 + }, + { + "epoch": 2.835547634478289, + "grad_norm": 1.0657342672348022, + "learning_rate": 3.9299592302842195e-08, + "loss": 0.061, + "step": 17501 + }, + { + "epoch": 2.835709656513286, + "grad_norm": 0.8984468579292297, + "learning_rate": 3.922239585387649e-08, + "loss": 0.0647, + "step": 17502 + }, + { + "epoch": 2.8358716785482825, + "grad_norm": 1.0442514419555664, + "learning_rate": 3.914527469875029e-08, + "loss": 0.06, + "step": 17503 + }, + { + "epoch": 2.8360337005832794, + "grad_norm": 0.9332334995269775, + "learning_rate": 3.906822883982336e-08, + "loss": 0.063, + "step": 17504 + }, + { + "epoch": 2.8361957226182763, + "grad_norm": 1.0261024236679077, + "learning_rate": 3.8991258279453544e-08, + "loss": 0.0672, + "step": 17505 + }, + { + "epoch": 2.836357744653273, + "grad_norm": 1.0123735666275024, + "learning_rate": 3.891436301999563e-08, + "loss": 0.0578, + "step": 17506 + }, + { + "epoch": 2.8365197666882693, + "grad_norm": 0.8809978365898132, + "learning_rate": 3.883754306380244e-08, + "loss": 0.0535, + "step": 17507 + }, + { + "epoch": 2.8366817887232663, + "grad_norm": 0.9029288291931152, + "learning_rate": 3.876079841322461e-08, + "loss": 0.0615, + "step": 17508 + }, + { + "epoch": 2.836843810758263, + "grad_norm": 0.9749417304992676, + "learning_rate": 3.868412907061026e-08, + "loss": 0.0564, + "step": 17509 + }, + { + "epoch": 2.8370058327932597, + "grad_norm": 0.9761818051338196, + "learning_rate": 3.8607535038305276e-08, + "loss": 0.0627, + "step": 17510 + }, + { + "epoch": 2.8371678548282566, + "grad_norm": 0.8667659759521484, + "learning_rate": 3.853101631865336e-08, + "loss": 0.0568, + "step": 17511 + }, + { + "epoch": 2.8373298768632536, + "grad_norm": 1.0688188076019287, + "learning_rate": 3.84545729139954e-08, + "loss": 0.0594, + "step": 17512 + }, + { + "epoch": 2.83749189889825, + "grad_norm": 0.9591261744499207, + "learning_rate": 3.837820482667121e-08, + "loss": 0.0625, + "step": 17513 + }, + { + "epoch": 2.837653920933247, + "grad_norm": 0.90581876039505, + "learning_rate": 3.83019120590164e-08, + "loss": 0.0537, + "step": 17514 + }, + { + "epoch": 2.837815942968244, + "grad_norm": 0.8603760004043579, + "learning_rate": 3.822569461336606e-08, + "loss": 0.0517, + "step": 17515 + }, + { + "epoch": 2.8379779650032404, + "grad_norm": 0.9439842104911804, + "learning_rate": 3.814955249205221e-08, + "loss": 0.0637, + "step": 17516 + }, + { + "epoch": 2.8381399870382373, + "grad_norm": 0.9294354915618896, + "learning_rate": 3.8073485697404655e-08, + "loss": 0.0601, + "step": 17517 + }, + { + "epoch": 2.838302009073234, + "grad_norm": 0.8488750457763672, + "learning_rate": 3.7997494231750145e-08, + "loss": 0.0617, + "step": 17518 + }, + { + "epoch": 2.8384640311082308, + "grad_norm": 0.9641651511192322, + "learning_rate": 3.792157809741459e-08, + "loss": 0.0648, + "step": 17519 + }, + { + "epoch": 2.8386260531432272, + "grad_norm": 1.0396709442138672, + "learning_rate": 3.784573729672086e-08, + "loss": 0.0551, + "step": 17520 + }, + { + "epoch": 2.838788075178224, + "grad_norm": 0.9486382603645325, + "learning_rate": 3.7769971831989325e-08, + "loss": 0.0659, + "step": 17521 + }, + { + "epoch": 2.838950097213221, + "grad_norm": 0.972588300704956, + "learning_rate": 3.769428170553785e-08, + "loss": 0.0649, + "step": 17522 + }, + { + "epoch": 2.8391121192482176, + "grad_norm": 1.0495613813400269, + "learning_rate": 3.761866691968291e-08, + "loss": 0.0679, + "step": 17523 + }, + { + "epoch": 2.8392741412832145, + "grad_norm": 0.8821991682052612, + "learning_rate": 3.754312747673766e-08, + "loss": 0.056, + "step": 17524 + }, + { + "epoch": 2.8394361633182115, + "grad_norm": 1.0493261814117432, + "learning_rate": 3.746766337901386e-08, + "loss": 0.0586, + "step": 17525 + }, + { + "epoch": 2.839598185353208, + "grad_norm": 0.923512876033783, + "learning_rate": 3.739227462882022e-08, + "loss": 0.0601, + "step": 17526 + }, + { + "epoch": 2.839760207388205, + "grad_norm": 0.8735133409500122, + "learning_rate": 3.731696122846379e-08, + "loss": 0.0589, + "step": 17527 + }, + { + "epoch": 2.839922229423202, + "grad_norm": 1.056152105331421, + "learning_rate": 3.724172318024854e-08, + "loss": 0.0684, + "step": 17528 + }, + { + "epoch": 2.8400842514581983, + "grad_norm": 0.8753458857536316, + "learning_rate": 3.71665604864771e-08, + "loss": 0.0581, + "step": 17529 + }, + { + "epoch": 2.840246273493195, + "grad_norm": 1.102301836013794, + "learning_rate": 3.709147314944872e-08, + "loss": 0.07, + "step": 17530 + }, + { + "epoch": 2.8404082955281917, + "grad_norm": 0.8725458383560181, + "learning_rate": 3.7016461171461296e-08, + "loss": 0.0613, + "step": 17531 + }, + { + "epoch": 2.8405703175631887, + "grad_norm": 0.938378095626831, + "learning_rate": 3.6941524554809924e-08, + "loss": 0.0622, + "step": 17532 + }, + { + "epoch": 2.840732339598185, + "grad_norm": 0.9458240270614624, + "learning_rate": 3.686666330178695e-08, + "loss": 0.0578, + "step": 17533 + }, + { + "epoch": 2.840894361633182, + "grad_norm": 0.9221711754798889, + "learning_rate": 3.6791877414683594e-08, + "loss": 0.0605, + "step": 17534 + }, + { + "epoch": 2.841056383668179, + "grad_norm": 0.9185110926628113, + "learning_rate": 3.6717166895788306e-08, + "loss": 0.0638, + "step": 17535 + }, + { + "epoch": 2.8412184057031755, + "grad_norm": 0.8876226544380188, + "learning_rate": 3.664253174738647e-08, + "loss": 0.0588, + "step": 17536 + }, + { + "epoch": 2.8413804277381725, + "grad_norm": 0.9121412634849548, + "learning_rate": 3.656797197176182e-08, + "loss": 0.0593, + "step": 17537 + }, + { + "epoch": 2.8415424497731694, + "grad_norm": 0.8094960451126099, + "learning_rate": 3.649348757119614e-08, + "loss": 0.0577, + "step": 17538 + }, + { + "epoch": 2.841704471808166, + "grad_norm": 0.8800841569900513, + "learning_rate": 3.641907854796789e-08, + "loss": 0.0586, + "step": 17539 + }, + { + "epoch": 2.841866493843163, + "grad_norm": 0.8749896883964539, + "learning_rate": 3.634474490435413e-08, + "loss": 0.0591, + "step": 17540 + }, + { + "epoch": 2.8420285158781593, + "grad_norm": 0.9184433221817017, + "learning_rate": 3.627048664262916e-08, + "loss": 0.0525, + "step": 17541 + }, + { + "epoch": 2.8421905379131562, + "grad_norm": 0.9051634669303894, + "learning_rate": 3.6196303765065333e-08, + "loss": 0.0676, + "step": 17542 + }, + { + "epoch": 2.8423525599481527, + "grad_norm": 0.8812767863273621, + "learning_rate": 3.612219627393221e-08, + "loss": 0.0601, + "step": 17543 + }, + { + "epoch": 2.8425145819831497, + "grad_norm": 0.9540053606033325, + "learning_rate": 3.60481641714977e-08, + "loss": 0.0591, + "step": 17544 + }, + { + "epoch": 2.8426766040181466, + "grad_norm": 1.0543007850646973, + "learning_rate": 3.597420746002639e-08, + "loss": 0.0655, + "step": 17545 + }, + { + "epoch": 2.842838626053143, + "grad_norm": 0.8972881436347961, + "learning_rate": 3.590032614178174e-08, + "loss": 0.06, + "step": 17546 + }, + { + "epoch": 2.84300064808814, + "grad_norm": 0.8661593794822693, + "learning_rate": 3.5826520219023887e-08, + "loss": 0.0607, + "step": 17547 + }, + { + "epoch": 2.843162670123137, + "grad_norm": 0.896032452583313, + "learning_rate": 3.57527896940113e-08, + "loss": 0.0592, + "step": 17548 + }, + { + "epoch": 2.8433246921581334, + "grad_norm": 0.822556734085083, + "learning_rate": 3.567913456900024e-08, + "loss": 0.0561, + "step": 17549 + }, + { + "epoch": 2.8434867141931304, + "grad_norm": 0.9163435101509094, + "learning_rate": 3.560555484624417e-08, + "loss": 0.0679, + "step": 17550 + }, + { + "epoch": 2.843648736228127, + "grad_norm": 0.8780238032341003, + "learning_rate": 3.5532050527994076e-08, + "loss": 0.0533, + "step": 17551 + }, + { + "epoch": 2.843810758263124, + "grad_norm": 0.9988797903060913, + "learning_rate": 3.545862161649927e-08, + "loss": 0.057, + "step": 17552 + }, + { + "epoch": 2.8439727802981203, + "grad_norm": 0.8687535524368286, + "learning_rate": 3.538526811400711e-08, + "loss": 0.0607, + "step": 17553 + }, + { + "epoch": 2.844134802333117, + "grad_norm": 1.180427074432373, + "learning_rate": 3.531199002276109e-08, + "loss": 0.0727, + "step": 17554 + }, + { + "epoch": 2.844296824368114, + "grad_norm": 0.8522838354110718, + "learning_rate": 3.5238787345003855e-08, + "loss": 0.0549, + "step": 17555 + }, + { + "epoch": 2.8444588464031106, + "grad_norm": 0.9009879231452942, + "learning_rate": 3.5165660082975006e-08, + "loss": 0.0628, + "step": 17556 + }, + { + "epoch": 2.8446208684381076, + "grad_norm": 0.8708153963088989, + "learning_rate": 3.50926082389122e-08, + "loss": 0.0574, + "step": 17557 + }, + { + "epoch": 2.8447828904731045, + "grad_norm": 0.8726041316986084, + "learning_rate": 3.501963181505058e-08, + "loss": 0.063, + "step": 17558 + }, + { + "epoch": 2.844944912508101, + "grad_norm": 0.9148244261741638, + "learning_rate": 3.4946730813623376e-08, + "loss": 0.0561, + "step": 17559 + }, + { + "epoch": 2.845106934543098, + "grad_norm": 0.9740058779716492, + "learning_rate": 3.487390523686074e-08, + "loss": 0.0645, + "step": 17560 + }, + { + "epoch": 2.845268956578095, + "grad_norm": 0.8795120716094971, + "learning_rate": 3.4801155086991165e-08, + "loss": 0.0604, + "step": 17561 + }, + { + "epoch": 2.8454309786130914, + "grad_norm": 0.8973474502563477, + "learning_rate": 3.472848036624038e-08, + "loss": 0.0601, + "step": 17562 + }, + { + "epoch": 2.8455930006480883, + "grad_norm": 0.9123368859291077, + "learning_rate": 3.4655881076832156e-08, + "loss": 0.0622, + "step": 17563 + }, + { + "epoch": 2.845755022683085, + "grad_norm": 0.8565255403518677, + "learning_rate": 3.4583357220988326e-08, + "loss": 0.0551, + "step": 17564 + }, + { + "epoch": 2.8459170447180817, + "grad_norm": 0.815396249294281, + "learning_rate": 3.451090880092739e-08, + "loss": 0.0529, + "step": 17565 + }, + { + "epoch": 2.846079066753078, + "grad_norm": 0.7915904521942139, + "learning_rate": 3.443853581886619e-08, + "loss": 0.0512, + "step": 17566 + }, + { + "epoch": 2.846241088788075, + "grad_norm": 1.000328779220581, + "learning_rate": 3.4366238277019625e-08, + "loss": 0.0591, + "step": 17567 + }, + { + "epoch": 2.846403110823072, + "grad_norm": 0.8379427194595337, + "learning_rate": 3.4294016177598974e-08, + "loss": 0.0601, + "step": 17568 + }, + { + "epoch": 2.8465651328580686, + "grad_norm": 0.8759309649467468, + "learning_rate": 3.42218695228147e-08, + "loss": 0.0594, + "step": 17569 + }, + { + "epoch": 2.8467271548930655, + "grad_norm": 0.9378208518028259, + "learning_rate": 3.4149798314874195e-08, + "loss": 0.0637, + "step": 17570 + }, + { + "epoch": 2.8468891769280624, + "grad_norm": 0.9910644888877869, + "learning_rate": 3.4077802555982645e-08, + "loss": 0.0633, + "step": 17571 + }, + { + "epoch": 2.847051198963059, + "grad_norm": 0.8594523668289185, + "learning_rate": 3.4005882248343e-08, + "loss": 0.057, + "step": 17572 + }, + { + "epoch": 2.847213220998056, + "grad_norm": 0.9198819994926453, + "learning_rate": 3.393403739415546e-08, + "loss": 0.0638, + "step": 17573 + }, + { + "epoch": 2.8473752430330523, + "grad_norm": 0.8322975039482117, + "learning_rate": 3.3862267995618817e-08, + "loss": 0.0572, + "step": 17574 + }, + { + "epoch": 2.8475372650680493, + "grad_norm": 0.9040446281433105, + "learning_rate": 3.379057405492908e-08, + "loss": 0.0641, + "step": 17575 + }, + { + "epoch": 2.8476992871030458, + "grad_norm": 0.8539929986000061, + "learning_rate": 3.3718955574279234e-08, + "loss": 0.0602, + "step": 17576 + }, + { + "epoch": 2.8478613091380427, + "grad_norm": 0.7834108471870422, + "learning_rate": 3.3647412555861126e-08, + "loss": 0.051, + "step": 17577 + }, + { + "epoch": 2.8480233311730396, + "grad_norm": 0.8313713073730469, + "learning_rate": 3.357594500186384e-08, + "loss": 0.0548, + "step": 17578 + }, + { + "epoch": 2.848185353208036, + "grad_norm": 0.8187208771705627, + "learning_rate": 3.3504552914474244e-08, + "loss": 0.0554, + "step": 17579 + }, + { + "epoch": 2.848347375243033, + "grad_norm": 0.8749431371688843, + "learning_rate": 3.3433236295876134e-08, + "loss": 0.054, + "step": 17580 + }, + { + "epoch": 2.84850939727803, + "grad_norm": 0.9242835640907288, + "learning_rate": 3.336199514825195e-08, + "loss": 0.0567, + "step": 17581 + }, + { + "epoch": 2.8486714193130265, + "grad_norm": 0.9290145635604858, + "learning_rate": 3.329082947378215e-08, + "loss": 0.0639, + "step": 17582 + }, + { + "epoch": 2.8488334413480234, + "grad_norm": 0.908678412437439, + "learning_rate": 3.3219739274643057e-08, + "loss": 0.0579, + "step": 17583 + }, + { + "epoch": 2.8489954633830203, + "grad_norm": 0.9778639078140259, + "learning_rate": 3.314872455301071e-08, + "loss": 0.0648, + "step": 17584 + }, + { + "epoch": 2.849157485418017, + "grad_norm": 0.6851035952568054, + "learning_rate": 3.3077785311057545e-08, + "loss": 0.0508, + "step": 17585 + }, + { + "epoch": 2.8493195074530138, + "grad_norm": 0.9486784338951111, + "learning_rate": 3.300692155095458e-08, + "loss": 0.0606, + "step": 17586 + }, + { + "epoch": 2.8494815294880103, + "grad_norm": 1.097536325454712, + "learning_rate": 3.293613327486983e-08, + "loss": 0.0656, + "step": 17587 + }, + { + "epoch": 2.849643551523007, + "grad_norm": 0.9391690492630005, + "learning_rate": 3.286542048496904e-08, + "loss": 0.0613, + "step": 17588 + }, + { + "epoch": 2.8498055735580037, + "grad_norm": 0.9068102836608887, + "learning_rate": 3.2794783183416055e-08, + "loss": 0.059, + "step": 17589 + }, + { + "epoch": 2.8499675955930006, + "grad_norm": 0.851341962814331, + "learning_rate": 3.272422137237219e-08, + "loss": 0.0532, + "step": 17590 + }, + { + "epoch": 2.8501296176279975, + "grad_norm": 0.9030383229255676, + "learning_rate": 3.265373505399627e-08, + "loss": 0.0536, + "step": 17591 + }, + { + "epoch": 2.850291639662994, + "grad_norm": 0.8289960026741028, + "learning_rate": 3.258332423044547e-08, + "loss": 0.0602, + "step": 17592 + }, + { + "epoch": 2.850453661697991, + "grad_norm": 0.8833999633789062, + "learning_rate": 3.2512988903873885e-08, + "loss": 0.0623, + "step": 17593 + }, + { + "epoch": 2.850615683732988, + "grad_norm": 1.016706943511963, + "learning_rate": 3.2442729076433697e-08, + "loss": 0.0629, + "step": 17594 + }, + { + "epoch": 2.8507777057679844, + "grad_norm": 0.903662383556366, + "learning_rate": 3.237254475027429e-08, + "loss": 0.0596, + "step": 17595 + }, + { + "epoch": 2.8509397278029813, + "grad_norm": 0.8987908959388733, + "learning_rate": 3.230243592754368e-08, + "loss": 0.0649, + "step": 17596 + }, + { + "epoch": 2.851101749837978, + "grad_norm": 0.9659387469291687, + "learning_rate": 3.223240261038707e-08, + "loss": 0.0661, + "step": 17597 + }, + { + "epoch": 2.8512637718729748, + "grad_norm": 0.9991119503974915, + "learning_rate": 3.2162444800946655e-08, + "loss": 0.0595, + "step": 17598 + }, + { + "epoch": 2.8514257939079712, + "grad_norm": 0.8747795224189758, + "learning_rate": 3.209256250136378e-08, + "loss": 0.0505, + "step": 17599 + }, + { + "epoch": 2.851587815942968, + "grad_norm": 0.9156426191329956, + "learning_rate": 3.202275571377589e-08, + "loss": 0.0553, + "step": 17600 + }, + { + "epoch": 2.851749837977965, + "grad_norm": 0.9192603826522827, + "learning_rate": 3.1953024440319334e-08, + "loss": 0.064, + "step": 17601 + }, + { + "epoch": 2.8519118600129616, + "grad_norm": 0.9150480031967163, + "learning_rate": 3.188336868312769e-08, + "loss": 0.0652, + "step": 17602 + }, + { + "epoch": 2.8520738820479585, + "grad_norm": 0.8279345035552979, + "learning_rate": 3.18137884443323e-08, + "loss": 0.0577, + "step": 17603 + }, + { + "epoch": 2.8522359040829555, + "grad_norm": 0.9174199104309082, + "learning_rate": 3.1744283726062306e-08, + "loss": 0.0614, + "step": 17604 + }, + { + "epoch": 2.852397926117952, + "grad_norm": 0.9263173937797546, + "learning_rate": 3.167485453044378e-08, + "loss": 0.0642, + "step": 17605 + }, + { + "epoch": 2.852559948152949, + "grad_norm": 0.8777797222137451, + "learning_rate": 3.160550085960168e-08, + "loss": 0.0556, + "step": 17606 + }, + { + "epoch": 2.852721970187946, + "grad_norm": 0.8794631958007812, + "learning_rate": 3.153622271565793e-08, + "loss": 0.0583, + "step": 17607 + }, + { + "epoch": 2.8528839922229423, + "grad_norm": 0.8909582495689392, + "learning_rate": 3.1467020100732215e-08, + "loss": 0.0647, + "step": 17608 + }, + { + "epoch": 2.8530460142579392, + "grad_norm": 0.9028645753860474, + "learning_rate": 3.139789301694146e-08, + "loss": 0.0583, + "step": 17609 + }, + { + "epoch": 2.8532080362929357, + "grad_norm": 0.9116567969322205, + "learning_rate": 3.1328841466401746e-08, + "loss": 0.0605, + "step": 17610 + }, + { + "epoch": 2.8533700583279327, + "grad_norm": 1.222704529762268, + "learning_rate": 3.125986545122528e-08, + "loss": 0.0621, + "step": 17611 + }, + { + "epoch": 2.853532080362929, + "grad_norm": 0.8335679769515991, + "learning_rate": 3.1190964973522865e-08, + "loss": 0.0562, + "step": 17612 + }, + { + "epoch": 2.853694102397926, + "grad_norm": 0.9687299728393555, + "learning_rate": 3.112214003540254e-08, + "loss": 0.057, + "step": 17613 + }, + { + "epoch": 2.853856124432923, + "grad_norm": 0.9810713529586792, + "learning_rate": 3.105339063896956e-08, + "loss": 0.07, + "step": 17614 + }, + { + "epoch": 2.8540181464679195, + "grad_norm": 1.0105944871902466, + "learning_rate": 3.098471678632892e-08, + "loss": 0.0691, + "step": 17615 + }, + { + "epoch": 2.8541801685029164, + "grad_norm": 0.9086531400680542, + "learning_rate": 3.0916118479580593e-08, + "loss": 0.0627, + "step": 17616 + }, + { + "epoch": 2.8543421905379134, + "grad_norm": 0.8142098188400269, + "learning_rate": 3.084759572082402e-08, + "loss": 0.0532, + "step": 17617 + }, + { + "epoch": 2.85450421257291, + "grad_norm": 0.9001931548118591, + "learning_rate": 3.077914851215585e-08, + "loss": 0.0685, + "step": 17618 + }, + { + "epoch": 2.854666234607907, + "grad_norm": 0.7494874000549316, + "learning_rate": 3.071077685567025e-08, + "loss": 0.0548, + "step": 17619 + }, + { + "epoch": 2.8548282566429033, + "grad_norm": 0.956156849861145, + "learning_rate": 3.064248075345916e-08, + "loss": 0.06, + "step": 17620 + }, + { + "epoch": 2.8549902786779002, + "grad_norm": 0.939167857170105, + "learning_rate": 3.057426020761256e-08, + "loss": 0.066, + "step": 17621 + }, + { + "epoch": 2.8551523007128967, + "grad_norm": 0.9294570088386536, + "learning_rate": 3.050611522021796e-08, + "loss": 0.0586, + "step": 17622 + }, + { + "epoch": 2.8553143227478937, + "grad_norm": 0.9607594609260559, + "learning_rate": 3.043804579336007e-08, + "loss": 0.0625, + "step": 17623 + }, + { + "epoch": 2.8554763447828906, + "grad_norm": 1.0955085754394531, + "learning_rate": 3.0370051929121405e-08, + "loss": 0.0652, + "step": 17624 + }, + { + "epoch": 2.855638366817887, + "grad_norm": 0.9362739324569702, + "learning_rate": 3.030213362958306e-08, + "loss": 0.0622, + "step": 17625 + }, + { + "epoch": 2.855800388852884, + "grad_norm": 0.8863419890403748, + "learning_rate": 3.023429089682284e-08, + "loss": 0.0599, + "step": 17626 + }, + { + "epoch": 2.855962410887881, + "grad_norm": 0.8718972206115723, + "learning_rate": 3.0166523732916564e-08, + "loss": 0.0588, + "step": 17627 + }, + { + "epoch": 2.8561244329228774, + "grad_norm": 1.004003643989563, + "learning_rate": 3.009883213993786e-08, + "loss": 0.0603, + "step": 17628 + }, + { + "epoch": 2.8562864549578744, + "grad_norm": 0.9552223682403564, + "learning_rate": 3.0031216119957576e-08, + "loss": 0.053, + "step": 17629 + }, + { + "epoch": 2.8564484769928713, + "grad_norm": 0.9536174535751343, + "learning_rate": 2.996367567504544e-08, + "loss": 0.0656, + "step": 17630 + }, + { + "epoch": 2.856610499027868, + "grad_norm": 1.1152795553207397, + "learning_rate": 2.989621080726701e-08, + "loss": 0.0635, + "step": 17631 + }, + { + "epoch": 2.8567725210628643, + "grad_norm": 0.9013298749923706, + "learning_rate": 2.9828821518687045e-08, + "loss": 0.0566, + "step": 17632 + }, + { + "epoch": 2.856934543097861, + "grad_norm": 0.9204021096229553, + "learning_rate": 2.9761507811367497e-08, + "loss": 0.0631, + "step": 17633 + }, + { + "epoch": 2.857096565132858, + "grad_norm": 0.9465929865837097, + "learning_rate": 2.9694269687367826e-08, + "loss": 0.0542, + "step": 17634 + }, + { + "epoch": 2.8572585871678546, + "grad_norm": 0.8179449439048767, + "learning_rate": 2.962710714874556e-08, + "loss": 0.0581, + "step": 17635 + }, + { + "epoch": 2.8574206092028516, + "grad_norm": 1.1688090562820435, + "learning_rate": 2.9560020197555716e-08, + "loss": 0.0635, + "step": 17636 + }, + { + "epoch": 2.8575826312378485, + "grad_norm": 1.0356324911117554, + "learning_rate": 2.9493008835850823e-08, + "loss": 0.0592, + "step": 17637 + }, + { + "epoch": 2.857744653272845, + "grad_norm": 0.9192071557044983, + "learning_rate": 2.9426073065681183e-08, + "loss": 0.0646, + "step": 17638 + }, + { + "epoch": 2.857906675307842, + "grad_norm": 0.8982980251312256, + "learning_rate": 2.9359212889095157e-08, + "loss": 0.0616, + "step": 17639 + }, + { + "epoch": 2.858068697342839, + "grad_norm": 0.8842819333076477, + "learning_rate": 2.929242830813861e-08, + "loss": 0.0607, + "step": 17640 + }, + { + "epoch": 2.8582307193778353, + "grad_norm": 0.8390337824821472, + "learning_rate": 2.9225719324854628e-08, + "loss": 0.0564, + "step": 17641 + }, + { + "epoch": 2.8583927414128323, + "grad_norm": 0.9694913029670715, + "learning_rate": 2.915908594128436e-08, + "loss": 0.0596, + "step": 17642 + }, + { + "epoch": 2.8585547634478288, + "grad_norm": 0.9143932461738586, + "learning_rate": 2.9092528159466727e-08, + "loss": 0.0585, + "step": 17643 + }, + { + "epoch": 2.8587167854828257, + "grad_norm": 0.9615548849105835, + "learning_rate": 2.9026045981438434e-08, + "loss": 0.0646, + "step": 17644 + }, + { + "epoch": 2.858878807517822, + "grad_norm": 0.8890393972396851, + "learning_rate": 2.895963940923341e-08, + "loss": 0.0605, + "step": 17645 + }, + { + "epoch": 2.859040829552819, + "grad_norm": 0.9828749299049377, + "learning_rate": 2.889330844488364e-08, + "loss": 0.0568, + "step": 17646 + }, + { + "epoch": 2.859202851587816, + "grad_norm": 0.8218222856521606, + "learning_rate": 2.882705309041861e-08, + "loss": 0.0532, + "step": 17647 + }, + { + "epoch": 2.8593648736228126, + "grad_norm": 1.0544509887695312, + "learning_rate": 2.8760873347865593e-08, + "loss": 0.0649, + "step": 17648 + }, + { + "epoch": 2.8595268956578095, + "grad_norm": 1.0283457040786743, + "learning_rate": 2.869476921924963e-08, + "loss": 0.0588, + "step": 17649 + }, + { + "epoch": 2.8596889176928064, + "grad_norm": 0.9277712106704712, + "learning_rate": 2.862874070659327e-08, + "loss": 0.0557, + "step": 17650 + }, + { + "epoch": 2.859850939727803, + "grad_norm": 0.8709115982055664, + "learning_rate": 2.8562787811916848e-08, + "loss": 0.0558, + "step": 17651 + }, + { + "epoch": 2.8600129617628, + "grad_norm": 0.9610093235969543, + "learning_rate": 2.8496910537238185e-08, + "loss": 0.0622, + "step": 17652 + }, + { + "epoch": 2.8601749837977968, + "grad_norm": 1.003127098083496, + "learning_rate": 2.8431108884573454e-08, + "loss": 0.0611, + "step": 17653 + }, + { + "epoch": 2.8603370058327933, + "grad_norm": 0.879693865776062, + "learning_rate": 2.8365382855935487e-08, + "loss": 0.0526, + "step": 17654 + }, + { + "epoch": 2.8604990278677898, + "grad_norm": 0.8258382678031921, + "learning_rate": 2.8299732453335725e-08, + "loss": 0.0576, + "step": 17655 + }, + { + "epoch": 2.8606610499027867, + "grad_norm": 0.9305282831192017, + "learning_rate": 2.8234157678782846e-08, + "loss": 0.0603, + "step": 17656 + }, + { + "epoch": 2.8608230719377836, + "grad_norm": 0.825541615486145, + "learning_rate": 2.8168658534282743e-08, + "loss": 0.0553, + "step": 17657 + }, + { + "epoch": 2.86098509397278, + "grad_norm": 0.8502270579338074, + "learning_rate": 2.8103235021840204e-08, + "loss": 0.0581, + "step": 17658 + }, + { + "epoch": 2.861147116007777, + "grad_norm": 0.9126089215278625, + "learning_rate": 2.8037887143456954e-08, + "loss": 0.0582, + "step": 17659 + }, + { + "epoch": 2.861309138042774, + "grad_norm": 0.8462595343589783, + "learning_rate": 2.7972614901132235e-08, + "loss": 0.0621, + "step": 17660 + }, + { + "epoch": 2.8614711600777705, + "grad_norm": 0.8996595144271851, + "learning_rate": 2.790741829686333e-08, + "loss": 0.0602, + "step": 17661 + }, + { + "epoch": 2.8616331821127674, + "grad_norm": 1.2317053079605103, + "learning_rate": 2.784229733264504e-08, + "loss": 0.0688, + "step": 17662 + }, + { + "epoch": 2.8617952041477643, + "grad_norm": 1.1419668197631836, + "learning_rate": 2.7777252010469657e-08, + "loss": 0.0643, + "step": 17663 + }, + { + "epoch": 2.861957226182761, + "grad_norm": 1.055248737335205, + "learning_rate": 2.771228233232809e-08, + "loss": 0.0607, + "step": 17664 + }, + { + "epoch": 2.8621192482177578, + "grad_norm": 0.8915876746177673, + "learning_rate": 2.7647388300207635e-08, + "loss": 0.0603, + "step": 17665 + }, + { + "epoch": 2.8622812702527543, + "grad_norm": 1.019177794456482, + "learning_rate": 2.7582569916094205e-08, + "loss": 0.062, + "step": 17666 + }, + { + "epoch": 2.862443292287751, + "grad_norm": 0.932235598564148, + "learning_rate": 2.7517827181970937e-08, + "loss": 0.0606, + "step": 17667 + }, + { + "epoch": 2.8626053143227477, + "grad_norm": 0.764743983745575, + "learning_rate": 2.745316009981902e-08, + "loss": 0.0513, + "step": 17668 + }, + { + "epoch": 2.8627673363577446, + "grad_norm": 0.9902213215827942, + "learning_rate": 2.7388568671616877e-08, + "loss": 0.0629, + "step": 17669 + }, + { + "epoch": 2.8629293583927415, + "grad_norm": 1.0022236108779907, + "learning_rate": 2.732405289934098e-08, + "loss": 0.0586, + "step": 17670 + }, + { + "epoch": 2.863091380427738, + "grad_norm": 1.2449359893798828, + "learning_rate": 2.7259612784965307e-08, + "loss": 0.0579, + "step": 17671 + }, + { + "epoch": 2.863253402462735, + "grad_norm": 0.9988813400268555, + "learning_rate": 2.719524833046133e-08, + "loss": 0.0626, + "step": 17672 + }, + { + "epoch": 2.863415424497732, + "grad_norm": 0.9619942307472229, + "learning_rate": 2.7130959537798874e-08, + "loss": 0.0637, + "step": 17673 + }, + { + "epoch": 2.8635774465327284, + "grad_norm": 0.9914158582687378, + "learning_rate": 2.7066746408944968e-08, + "loss": 0.0621, + "step": 17674 + }, + { + "epoch": 2.8637394685677253, + "grad_norm": 0.9497270584106445, + "learning_rate": 2.700260894586415e-08, + "loss": 0.0555, + "step": 17675 + }, + { + "epoch": 2.863901490602722, + "grad_norm": 0.8510124087333679, + "learning_rate": 2.6938547150518746e-08, + "loss": 0.0556, + "step": 17676 + }, + { + "epoch": 2.8640635126377187, + "grad_norm": 0.8168439269065857, + "learning_rate": 2.6874561024869407e-08, + "loss": 0.0514, + "step": 17677 + }, + { + "epoch": 2.8642255346727152, + "grad_norm": 0.8079927563667297, + "learning_rate": 2.6810650570873454e-08, + "loss": 0.0501, + "step": 17678 + }, + { + "epoch": 2.864387556707712, + "grad_norm": 0.9214690923690796, + "learning_rate": 2.6746815790486548e-08, + "loss": 0.061, + "step": 17679 + }, + { + "epoch": 2.864549578742709, + "grad_norm": 1.2173523902893066, + "learning_rate": 2.6683056685662122e-08, + "loss": 0.0676, + "step": 17680 + }, + { + "epoch": 2.8647116007777056, + "grad_norm": 0.9455307722091675, + "learning_rate": 2.6619373258350566e-08, + "loss": 0.0689, + "step": 17681 + }, + { + "epoch": 2.8648736228127025, + "grad_norm": 0.84743332862854, + "learning_rate": 2.6555765510500875e-08, + "loss": 0.0549, + "step": 17682 + }, + { + "epoch": 2.8650356448476995, + "grad_norm": 0.8518211245536804, + "learning_rate": 2.6492233444059267e-08, + "loss": 0.0595, + "step": 17683 + }, + { + "epoch": 2.865197666882696, + "grad_norm": 0.8848883509635925, + "learning_rate": 2.6428777060969468e-08, + "loss": 0.0615, + "step": 17684 + }, + { + "epoch": 2.865359688917693, + "grad_norm": 0.9524630308151245, + "learning_rate": 2.6365396363173256e-08, + "loss": 0.0634, + "step": 17685 + }, + { + "epoch": 2.86552171095269, + "grad_norm": 0.9205766916275024, + "learning_rate": 2.6302091352609637e-08, + "loss": 0.0672, + "step": 17686 + }, + { + "epoch": 2.8656837329876863, + "grad_norm": 0.821868360042572, + "learning_rate": 2.6238862031215672e-08, + "loss": 0.0562, + "step": 17687 + }, + { + "epoch": 2.8658457550226832, + "grad_norm": 0.9511677026748657, + "learning_rate": 2.617570840092648e-08, + "loss": 0.0646, + "step": 17688 + }, + { + "epoch": 2.8660077770576797, + "grad_norm": 0.9630586504936218, + "learning_rate": 2.611263046367385e-08, + "loss": 0.0589, + "step": 17689 + }, + { + "epoch": 2.8661697990926767, + "grad_norm": 0.8870193362236023, + "learning_rate": 2.604962822138818e-08, + "loss": 0.0544, + "step": 17690 + }, + { + "epoch": 2.866331821127673, + "grad_norm": 1.0679670572280884, + "learning_rate": 2.5986701675996816e-08, + "loss": 0.0634, + "step": 17691 + }, + { + "epoch": 2.86649384316267, + "grad_norm": 0.9083020091056824, + "learning_rate": 2.5923850829425723e-08, + "loss": 0.0639, + "step": 17692 + }, + { + "epoch": 2.866655865197667, + "grad_norm": 0.8310949802398682, + "learning_rate": 2.5861075683597526e-08, + "loss": 0.0558, + "step": 17693 + }, + { + "epoch": 2.8668178872326635, + "grad_norm": 0.956186830997467, + "learning_rate": 2.579837624043291e-08, + "loss": 0.0709, + "step": 17694 + }, + { + "epoch": 2.8669799092676604, + "grad_norm": 1.0341099500656128, + "learning_rate": 2.573575250185062e-08, + "loss": 0.0563, + "step": 17695 + }, + { + "epoch": 2.8671419313026574, + "grad_norm": 0.8617078065872192, + "learning_rate": 2.5673204469766898e-08, + "loss": 0.0621, + "step": 17696 + }, + { + "epoch": 2.867303953337654, + "grad_norm": 0.9676273465156555, + "learning_rate": 2.561073214609494e-08, + "loss": 0.0636, + "step": 17697 + }, + { + "epoch": 2.867465975372651, + "grad_norm": 0.9023331999778748, + "learning_rate": 2.5548335532747105e-08, + "loss": 0.0571, + "step": 17698 + }, + { + "epoch": 2.8676279974076473, + "grad_norm": 0.9330205321311951, + "learning_rate": 2.5486014631631862e-08, + "loss": 0.0592, + "step": 17699 + }, + { + "epoch": 2.8677900194426442, + "grad_norm": 1.1070448160171509, + "learning_rate": 2.5423769444656575e-08, + "loss": 0.0669, + "step": 17700 + }, + { + "epoch": 2.8679520414776407, + "grad_norm": 0.8804136514663696, + "learning_rate": 2.536159997372528e-08, + "loss": 0.0567, + "step": 17701 + }, + { + "epoch": 2.8681140635126376, + "grad_norm": 0.9040849208831787, + "learning_rate": 2.52995062207409e-08, + "loss": 0.0598, + "step": 17702 + }, + { + "epoch": 2.8682760855476346, + "grad_norm": 0.999211311340332, + "learning_rate": 2.5237488187602743e-08, + "loss": 0.0586, + "step": 17703 + }, + { + "epoch": 2.868438107582631, + "grad_norm": 0.9211670160293579, + "learning_rate": 2.517554587620874e-08, + "loss": 0.0648, + "step": 17704 + }, + { + "epoch": 2.868600129617628, + "grad_norm": 0.820000410079956, + "learning_rate": 2.511367928845404e-08, + "loss": 0.055, + "step": 17705 + }, + { + "epoch": 2.868762151652625, + "grad_norm": 0.8488945960998535, + "learning_rate": 2.5051888426231574e-08, + "loss": 0.0563, + "step": 17706 + }, + { + "epoch": 2.8689241736876214, + "grad_norm": 0.9143860936164856, + "learning_rate": 2.499017329143205e-08, + "loss": 0.0565, + "step": 17707 + }, + { + "epoch": 2.8690861957226184, + "grad_norm": 0.9244613647460938, + "learning_rate": 2.492853388594396e-08, + "loss": 0.0627, + "step": 17708 + }, + { + "epoch": 2.8692482177576153, + "grad_norm": 0.924808144569397, + "learning_rate": 2.486697021165302e-08, + "loss": 0.0617, + "step": 17709 + }, + { + "epoch": 2.869410239792612, + "grad_norm": 1.042020320892334, + "learning_rate": 2.480548227044327e-08, + "loss": 0.0593, + "step": 17710 + }, + { + "epoch": 2.8695722618276087, + "grad_norm": 1.0010639429092407, + "learning_rate": 2.4744070064195713e-08, + "loss": 0.0599, + "step": 17711 + }, + { + "epoch": 2.869734283862605, + "grad_norm": 1.0316102504730225, + "learning_rate": 2.4682733594789677e-08, + "loss": 0.0673, + "step": 17712 + }, + { + "epoch": 2.869896305897602, + "grad_norm": 1.0180681943893433, + "learning_rate": 2.4621472864101992e-08, + "loss": 0.0638, + "step": 17713 + }, + { + "epoch": 2.8700583279325986, + "grad_norm": 1.028444766998291, + "learning_rate": 2.4560287874006716e-08, + "loss": 0.0638, + "step": 17714 + }, + { + "epoch": 2.8702203499675956, + "grad_norm": 1.0102734565734863, + "learning_rate": 2.4499178626376243e-08, + "loss": 0.0612, + "step": 17715 + }, + { + "epoch": 2.8703823720025925, + "grad_norm": 0.8836684226989746, + "learning_rate": 2.443814512308018e-08, + "loss": 0.0563, + "step": 17716 + }, + { + "epoch": 2.870544394037589, + "grad_norm": 0.9856815934181213, + "learning_rate": 2.437718736598621e-08, + "loss": 0.0579, + "step": 17717 + }, + { + "epoch": 2.870706416072586, + "grad_norm": 0.9209667444229126, + "learning_rate": 2.431630535695978e-08, + "loss": 0.056, + "step": 17718 + }, + { + "epoch": 2.870868438107583, + "grad_norm": 0.9791855812072754, + "learning_rate": 2.4255499097863012e-08, + "loss": 0.0622, + "step": 17719 + }, + { + "epoch": 2.8710304601425793, + "grad_norm": 0.9314326047897339, + "learning_rate": 2.419476859055664e-08, + "loss": 0.0617, + "step": 17720 + }, + { + "epoch": 2.8711924821775763, + "grad_norm": 0.8641183376312256, + "learning_rate": 2.4134113836899455e-08, + "loss": 0.0557, + "step": 17721 + }, + { + "epoch": 2.8713545042125728, + "grad_norm": 0.8541434407234192, + "learning_rate": 2.4073534838746637e-08, + "loss": 0.0541, + "step": 17722 + }, + { + "epoch": 2.8715165262475697, + "grad_norm": 1.0766733884811401, + "learning_rate": 2.4013031597951985e-08, + "loss": 0.0597, + "step": 17723 + }, + { + "epoch": 2.871678548282566, + "grad_norm": 0.9844111204147339, + "learning_rate": 2.3952604116366795e-08, + "loss": 0.0637, + "step": 17724 + }, + { + "epoch": 2.871840570317563, + "grad_norm": 0.8357641696929932, + "learning_rate": 2.3892252395840143e-08, + "loss": 0.0598, + "step": 17725 + }, + { + "epoch": 2.87200259235256, + "grad_norm": 1.0578702688217163, + "learning_rate": 2.383197643821833e-08, + "loss": 0.0574, + "step": 17726 + }, + { + "epoch": 2.8721646143875565, + "grad_norm": 0.8286598920822144, + "learning_rate": 2.377177624534599e-08, + "loss": 0.0589, + "step": 17727 + }, + { + "epoch": 2.8723266364225535, + "grad_norm": 0.890817403793335, + "learning_rate": 2.3711651819064984e-08, + "loss": 0.0612, + "step": 17728 + }, + { + "epoch": 2.8724886584575504, + "grad_norm": 0.8068243265151978, + "learning_rate": 2.3651603161214677e-08, + "loss": 0.0547, + "step": 17729 + }, + { + "epoch": 2.872650680492547, + "grad_norm": 0.8931058049201965, + "learning_rate": 2.359163027363276e-08, + "loss": 0.0582, + "step": 17730 + }, + { + "epoch": 2.872812702527544, + "grad_norm": 0.9487268328666687, + "learning_rate": 2.3531733158154157e-08, + "loss": 0.0684, + "step": 17731 + }, + { + "epoch": 2.8729747245625408, + "grad_norm": 0.8844464421272278, + "learning_rate": 2.3471911816611846e-08, + "loss": 0.0576, + "step": 17732 + }, + { + "epoch": 2.8731367465975373, + "grad_norm": 0.7697759866714478, + "learning_rate": 2.3412166250835756e-08, + "loss": 0.0475, + "step": 17733 + }, + { + "epoch": 2.8732987686325338, + "grad_norm": 1.038557767868042, + "learning_rate": 2.335249646265414e-08, + "loss": 0.0675, + "step": 17734 + }, + { + "epoch": 2.8734607906675307, + "grad_norm": 0.8407655954360962, + "learning_rate": 2.3292902453892485e-08, + "loss": 0.0579, + "step": 17735 + }, + { + "epoch": 2.8736228127025276, + "grad_norm": 0.940645694732666, + "learning_rate": 2.3233384226375167e-08, + "loss": 0.0677, + "step": 17736 + }, + { + "epoch": 2.873784834737524, + "grad_norm": 0.9915009140968323, + "learning_rate": 2.3173941781922114e-08, + "loss": 0.0645, + "step": 17737 + }, + { + "epoch": 2.873946856772521, + "grad_norm": 1.1203358173370361, + "learning_rate": 2.311457512235271e-08, + "loss": 0.0607, + "step": 17738 + }, + { + "epoch": 2.874108878807518, + "grad_norm": 0.9407758116722107, + "learning_rate": 2.305528424948328e-08, + "loss": 0.0555, + "step": 17739 + }, + { + "epoch": 2.8742709008425145, + "grad_norm": 1.1099528074264526, + "learning_rate": 2.2996069165128198e-08, + "loss": 0.0638, + "step": 17740 + }, + { + "epoch": 2.8744329228775114, + "grad_norm": 0.8714383244514465, + "learning_rate": 2.2936929871099356e-08, + "loss": 0.0595, + "step": 17741 + }, + { + "epoch": 2.8745949449125083, + "grad_norm": 0.834579586982727, + "learning_rate": 2.2877866369205858e-08, + "loss": 0.0633, + "step": 17742 + }, + { + "epoch": 2.874756966947505, + "grad_norm": 0.9249007701873779, + "learning_rate": 2.281887866125515e-08, + "loss": 0.054, + "step": 17743 + }, + { + "epoch": 2.8749189889825018, + "grad_norm": 0.8720956444740295, + "learning_rate": 2.2759966749051897e-08, + "loss": 0.0561, + "step": 17744 + }, + { + "epoch": 2.8750810110174982, + "grad_norm": 1.0232727527618408, + "learning_rate": 2.2701130634399104e-08, + "loss": 0.063, + "step": 17745 + }, + { + "epoch": 2.875243033052495, + "grad_norm": 0.9553148746490479, + "learning_rate": 2.2642370319096718e-08, + "loss": 0.0691, + "step": 17746 + }, + { + "epoch": 2.8754050550874917, + "grad_norm": 0.820517361164093, + "learning_rate": 2.2583685804942746e-08, + "loss": 0.0578, + "step": 17747 + }, + { + "epoch": 2.8755670771224886, + "grad_norm": 0.9615774154663086, + "learning_rate": 2.2525077093732695e-08, + "loss": 0.0674, + "step": 17748 + }, + { + "epoch": 2.8757290991574855, + "grad_norm": 0.9988585114479065, + "learning_rate": 2.2466544187259852e-08, + "loss": 0.0686, + "step": 17749 + }, + { + "epoch": 2.875891121192482, + "grad_norm": 0.8221516013145447, + "learning_rate": 2.2408087087315567e-08, + "loss": 0.0566, + "step": 17750 + }, + { + "epoch": 2.876053143227479, + "grad_norm": 0.852265477180481, + "learning_rate": 2.234970579568785e-08, + "loss": 0.06, + "step": 17751 + }, + { + "epoch": 2.876215165262476, + "grad_norm": 0.9517872929573059, + "learning_rate": 2.2291400314163325e-08, + "loss": 0.0628, + "step": 17752 + }, + { + "epoch": 2.8763771872974724, + "grad_norm": 1.0439865589141846, + "learning_rate": 2.2233170644526126e-08, + "loss": 0.0619, + "step": 17753 + }, + { + "epoch": 2.8765392093324693, + "grad_norm": 0.8319445848464966, + "learning_rate": 2.217501678855788e-08, + "loss": 0.0548, + "step": 17754 + }, + { + "epoch": 2.8767012313674662, + "grad_norm": 0.9762924909591675, + "learning_rate": 2.211693874803772e-08, + "loss": 0.0599, + "step": 17755 + }, + { + "epoch": 2.8768632534024627, + "grad_norm": 0.831531822681427, + "learning_rate": 2.2058936524742835e-08, + "loss": 0.0537, + "step": 17756 + }, + { + "epoch": 2.8770252754374592, + "grad_norm": 0.8122191429138184, + "learning_rate": 2.2001010120448197e-08, + "loss": 0.0555, + "step": 17757 + }, + { + "epoch": 2.877187297472456, + "grad_norm": 1.2487119436264038, + "learning_rate": 2.1943159536925994e-08, + "loss": 0.0614, + "step": 17758 + }, + { + "epoch": 2.877349319507453, + "grad_norm": 1.0974977016448975, + "learning_rate": 2.1885384775946207e-08, + "loss": 0.0593, + "step": 17759 + }, + { + "epoch": 2.8775113415424496, + "grad_norm": 0.9182059168815613, + "learning_rate": 2.1827685839276856e-08, + "loss": 0.0623, + "step": 17760 + }, + { + "epoch": 2.8776733635774465, + "grad_norm": 1.0060678720474243, + "learning_rate": 2.177006272868293e-08, + "loss": 0.0623, + "step": 17761 + }, + { + "epoch": 2.8778353856124435, + "grad_norm": 0.8593238592147827, + "learning_rate": 2.1712515445928285e-08, + "loss": 0.0612, + "step": 17762 + }, + { + "epoch": 2.87799740764744, + "grad_norm": 0.8475682139396667, + "learning_rate": 2.1655043992773183e-08, + "loss": 0.0538, + "step": 17763 + }, + { + "epoch": 2.878159429682437, + "grad_norm": 1.3795852661132812, + "learning_rate": 2.159764837097622e-08, + "loss": 0.0681, + "step": 17764 + }, + { + "epoch": 2.878321451717434, + "grad_norm": 1.0932248830795288, + "learning_rate": 2.1540328582293767e-08, + "loss": 0.0699, + "step": 17765 + }, + { + "epoch": 2.8784834737524303, + "grad_norm": 0.8664324879646301, + "learning_rate": 2.1483084628479145e-08, + "loss": 0.0559, + "step": 17766 + }, + { + "epoch": 2.8786454957874272, + "grad_norm": 0.92646723985672, + "learning_rate": 2.142591651128456e-08, + "loss": 0.0559, + "step": 17767 + }, + { + "epoch": 2.8788075178224237, + "grad_norm": 0.7677503228187561, + "learning_rate": 2.1368824232458618e-08, + "loss": 0.0519, + "step": 17768 + }, + { + "epoch": 2.8789695398574207, + "grad_norm": 0.8314093351364136, + "learning_rate": 2.1311807793748805e-08, + "loss": 0.0546, + "step": 17769 + }, + { + "epoch": 2.879131561892417, + "grad_norm": 0.971030056476593, + "learning_rate": 2.125486719689929e-08, + "loss": 0.0632, + "step": 17770 + }, + { + "epoch": 2.879293583927414, + "grad_norm": 0.8812588453292847, + "learning_rate": 2.1198002443652276e-08, + "loss": 0.0607, + "step": 17771 + }, + { + "epoch": 2.879455605962411, + "grad_norm": 1.1112418174743652, + "learning_rate": 2.1141213535747772e-08, + "loss": 0.0659, + "step": 17772 + }, + { + "epoch": 2.8796176279974075, + "grad_norm": 0.976514995098114, + "learning_rate": 2.1084500474923554e-08, + "loss": 0.0672, + "step": 17773 + }, + { + "epoch": 2.8797796500324044, + "grad_norm": 0.9304304122924805, + "learning_rate": 2.1027863262914617e-08, + "loss": 0.0598, + "step": 17774 + }, + { + "epoch": 2.8799416720674014, + "grad_norm": 1.0370122194290161, + "learning_rate": 2.0971301901454023e-08, + "loss": 0.0622, + "step": 17775 + }, + { + "epoch": 2.880103694102398, + "grad_norm": 0.7911035418510437, + "learning_rate": 2.0914816392272608e-08, + "loss": 0.057, + "step": 17776 + }, + { + "epoch": 2.880265716137395, + "grad_norm": 0.8377315998077393, + "learning_rate": 2.0858406737098435e-08, + "loss": 0.0523, + "step": 17777 + }, + { + "epoch": 2.8804277381723913, + "grad_norm": 0.895128071308136, + "learning_rate": 2.0802072937657624e-08, + "loss": 0.0597, + "step": 17778 + }, + { + "epoch": 2.880589760207388, + "grad_norm": 0.9647876620292664, + "learning_rate": 2.0745814995673796e-08, + "loss": 0.0653, + "step": 17779 + }, + { + "epoch": 2.8807517822423847, + "grad_norm": 0.8965823650360107, + "learning_rate": 2.068963291286863e-08, + "loss": 0.0554, + "step": 17780 + }, + { + "epoch": 2.8809138042773816, + "grad_norm": 1.0152802467346191, + "learning_rate": 2.0633526690960747e-08, + "loss": 0.0599, + "step": 17781 + }, + { + "epoch": 2.8810758263123786, + "grad_norm": 0.886622428894043, + "learning_rate": 2.0577496331666837e-08, + "loss": 0.0631, + "step": 17782 + }, + { + "epoch": 2.881237848347375, + "grad_norm": 0.943684995174408, + "learning_rate": 2.052154183670163e-08, + "loss": 0.06, + "step": 17783 + }, + { + "epoch": 2.881399870382372, + "grad_norm": 0.9235215783119202, + "learning_rate": 2.046566320777682e-08, + "loss": 0.0594, + "step": 17784 + }, + { + "epoch": 2.881561892417369, + "grad_norm": 0.9773284196853638, + "learning_rate": 2.04098604466027e-08, + "loss": 0.0646, + "step": 17785 + }, + { + "epoch": 2.8817239144523654, + "grad_norm": 0.921341061592102, + "learning_rate": 2.0354133554885967e-08, + "loss": 0.059, + "step": 17786 + }, + { + "epoch": 2.8818859364873624, + "grad_norm": 0.9354360699653625, + "learning_rate": 2.0298482534332198e-08, + "loss": 0.0553, + "step": 17787 + }, + { + "epoch": 2.8820479585223593, + "grad_norm": 0.8430424332618713, + "learning_rate": 2.0242907386644195e-08, + "loss": 0.0548, + "step": 17788 + }, + { + "epoch": 2.8822099805573558, + "grad_norm": 0.8397956490516663, + "learning_rate": 2.0187408113522266e-08, + "loss": 0.054, + "step": 17789 + }, + { + "epoch": 2.8823720025923527, + "grad_norm": 0.967339813709259, + "learning_rate": 2.0131984716664776e-08, + "loss": 0.0617, + "step": 17790 + }, + { + "epoch": 2.882534024627349, + "grad_norm": 1.0861202478408813, + "learning_rate": 2.0076637197767312e-08, + "loss": 0.0653, + "step": 17791 + }, + { + "epoch": 2.882696046662346, + "grad_norm": 0.9953954219818115, + "learning_rate": 2.002136555852352e-08, + "loss": 0.0634, + "step": 17792 + }, + { + "epoch": 2.8828580686973426, + "grad_norm": 0.7926887273788452, + "learning_rate": 1.996616980062427e-08, + "loss": 0.0497, + "step": 17793 + }, + { + "epoch": 2.8830200907323396, + "grad_norm": 1.0522444248199463, + "learning_rate": 1.9911049925758765e-08, + "loss": 0.0647, + "step": 17794 + }, + { + "epoch": 2.8831821127673365, + "grad_norm": 0.9192836284637451, + "learning_rate": 1.9856005935613708e-08, + "loss": 0.0607, + "step": 17795 + }, + { + "epoch": 2.883344134802333, + "grad_norm": 0.9567740559577942, + "learning_rate": 1.9801037831872482e-08, + "loss": 0.0569, + "step": 17796 + }, + { + "epoch": 2.88350615683733, + "grad_norm": 0.8763161897659302, + "learning_rate": 1.9746145616217905e-08, + "loss": 0.0535, + "step": 17797 + }, + { + "epoch": 2.883668178872327, + "grad_norm": 0.9974742531776428, + "learning_rate": 1.9691329290329185e-08, + "loss": 0.0576, + "step": 17798 + }, + { + "epoch": 2.8838302009073233, + "grad_norm": 0.915104866027832, + "learning_rate": 1.9636588855883598e-08, + "loss": 0.062, + "step": 17799 + }, + { + "epoch": 2.8839922229423203, + "grad_norm": 0.8486490249633789, + "learning_rate": 1.958192431455591e-08, + "loss": 0.059, + "step": 17800 + }, + { + "epoch": 2.8841542449773168, + "grad_norm": 0.944681704044342, + "learning_rate": 1.9527335668018954e-08, + "loss": 0.0648, + "step": 17801 + }, + { + "epoch": 2.8843162670123137, + "grad_norm": 0.9253272414207458, + "learning_rate": 1.9472822917942778e-08, + "loss": 0.0609, + "step": 17802 + }, + { + "epoch": 2.88447828904731, + "grad_norm": 0.9688064455986023, + "learning_rate": 1.9418386065995222e-08, + "loss": 0.0623, + "step": 17803 + }, + { + "epoch": 2.884640311082307, + "grad_norm": 1.0113117694854736, + "learning_rate": 1.9364025113842444e-08, + "loss": 0.0597, + "step": 17804 + }, + { + "epoch": 2.884802333117304, + "grad_norm": 0.8749908804893494, + "learning_rate": 1.9309740063147566e-08, + "loss": 0.0587, + "step": 17805 + }, + { + "epoch": 2.8849643551523005, + "grad_norm": 1.0198384523391724, + "learning_rate": 1.9255530915571197e-08, + "loss": 0.0611, + "step": 17806 + }, + { + "epoch": 2.8851263771872975, + "grad_norm": 0.9575091600418091, + "learning_rate": 1.920139767277257e-08, + "loss": 0.061, + "step": 17807 + }, + { + "epoch": 2.8852883992222944, + "grad_norm": 0.9491891264915466, + "learning_rate": 1.9147340336407584e-08, + "loss": 0.0618, + "step": 17808 + }, + { + "epoch": 2.885450421257291, + "grad_norm": 0.9675207734107971, + "learning_rate": 1.9093358908130743e-08, + "loss": 0.0675, + "step": 17809 + }, + { + "epoch": 2.885612443292288, + "grad_norm": 0.8479520082473755, + "learning_rate": 1.9039453389592954e-08, + "loss": 0.0612, + "step": 17810 + }, + { + "epoch": 2.8857744653272848, + "grad_norm": 1.0039427280426025, + "learning_rate": 1.8985623782444284e-08, + "loss": 0.064, + "step": 17811 + }, + { + "epoch": 2.8859364873622813, + "grad_norm": 0.9611213207244873, + "learning_rate": 1.893187008833175e-08, + "loss": 0.0603, + "step": 17812 + }, + { + "epoch": 2.886098509397278, + "grad_norm": 0.8589776158332825, + "learning_rate": 1.8878192308899867e-08, + "loss": 0.0586, + "step": 17813 + }, + { + "epoch": 2.8862605314322747, + "grad_norm": 0.8623048067092896, + "learning_rate": 1.8824590445790935e-08, + "loss": 0.0626, + "step": 17814 + }, + { + "epoch": 2.8864225534672716, + "grad_norm": 0.9948340654373169, + "learning_rate": 1.877106450064531e-08, + "loss": 0.0649, + "step": 17815 + }, + { + "epoch": 2.886584575502268, + "grad_norm": 0.9719632863998413, + "learning_rate": 1.871761447510084e-08, + "loss": 0.0546, + "step": 17816 + }, + { + "epoch": 2.886746597537265, + "grad_norm": 0.8713729977607727, + "learning_rate": 1.866424037079234e-08, + "loss": 0.0602, + "step": 17817 + }, + { + "epoch": 2.886908619572262, + "grad_norm": 0.8655098676681519, + "learning_rate": 1.8610942189353777e-08, + "loss": 0.0556, + "step": 17818 + }, + { + "epoch": 2.8870706416072585, + "grad_norm": 0.830318808555603, + "learning_rate": 1.855771993241523e-08, + "loss": 0.0535, + "step": 17819 + }, + { + "epoch": 2.8872326636422554, + "grad_norm": 0.8426806926727295, + "learning_rate": 1.850457360160568e-08, + "loss": 0.0574, + "step": 17820 + }, + { + "epoch": 2.8873946856772523, + "grad_norm": 0.8310089111328125, + "learning_rate": 1.8451503198551047e-08, + "loss": 0.0546, + "step": 17821 + }, + { + "epoch": 2.887556707712249, + "grad_norm": 0.8731127381324768, + "learning_rate": 1.839850872487503e-08, + "loss": 0.0574, + "step": 17822 + }, + { + "epoch": 2.8877187297472457, + "grad_norm": 0.9887226819992065, + "learning_rate": 1.834559018219939e-08, + "loss": 0.0567, + "step": 17823 + }, + { + "epoch": 2.8878807517822422, + "grad_norm": 0.9463251829147339, + "learning_rate": 1.829274757214339e-08, + "loss": 0.0628, + "step": 17824 + }, + { + "epoch": 2.888042773817239, + "grad_norm": 0.8988006114959717, + "learning_rate": 1.8239980896323505e-08, + "loss": 0.0581, + "step": 17825 + }, + { + "epoch": 2.8882047958522357, + "grad_norm": 0.9556702375411987, + "learning_rate": 1.8187290156354565e-08, + "loss": 0.0587, + "step": 17826 + }, + { + "epoch": 2.8883668178872326, + "grad_norm": 0.8816114068031311, + "learning_rate": 1.8134675353848608e-08, + "loss": 0.0593, + "step": 17827 + }, + { + "epoch": 2.8885288399222295, + "grad_norm": 0.856730043888092, + "learning_rate": 1.808213649041546e-08, + "loss": 0.0554, + "step": 17828 + }, + { + "epoch": 2.888690861957226, + "grad_norm": 0.8481936454772949, + "learning_rate": 1.8029673567662997e-08, + "loss": 0.0543, + "step": 17829 + }, + { + "epoch": 2.888852883992223, + "grad_norm": 1.0299345254898071, + "learning_rate": 1.7977286587196053e-08, + "loss": 0.0651, + "step": 17830 + }, + { + "epoch": 2.88901490602722, + "grad_norm": 0.9097673296928406, + "learning_rate": 1.792497555061806e-08, + "loss": 0.0626, + "step": 17831 + }, + { + "epoch": 2.8891769280622164, + "grad_norm": 0.9130776524543762, + "learning_rate": 1.7872740459529135e-08, + "loss": 0.0627, + "step": 17832 + }, + { + "epoch": 2.8893389500972133, + "grad_norm": 0.9484906792640686, + "learning_rate": 1.7820581315527717e-08, + "loss": 0.0658, + "step": 17833 + }, + { + "epoch": 2.8895009721322102, + "grad_norm": 0.8442859649658203, + "learning_rate": 1.7768498120209755e-08, + "loss": 0.0559, + "step": 17834 + }, + { + "epoch": 2.8896629941672067, + "grad_norm": 0.9843007326126099, + "learning_rate": 1.771649087516897e-08, + "loss": 0.0648, + "step": 17835 + }, + { + "epoch": 2.8898250162022032, + "grad_norm": 0.8426430225372314, + "learning_rate": 1.766455958199631e-08, + "loss": 0.0564, + "step": 17836 + }, + { + "epoch": 2.8899870382372, + "grad_norm": 0.8461860418319702, + "learning_rate": 1.7612704242281342e-08, + "loss": 0.0614, + "step": 17837 + }, + { + "epoch": 2.890149060272197, + "grad_norm": 1.05631422996521, + "learning_rate": 1.7560924857610016e-08, + "loss": 0.064, + "step": 17838 + }, + { + "epoch": 2.8903110823071936, + "grad_norm": 0.8737570643424988, + "learning_rate": 1.750922142956718e-08, + "loss": 0.0607, + "step": 17839 + }, + { + "epoch": 2.8904731043421905, + "grad_norm": 0.8831677436828613, + "learning_rate": 1.745759395973462e-08, + "loss": 0.0525, + "step": 17840 + }, + { + "epoch": 2.8906351263771874, + "grad_norm": 0.9918739199638367, + "learning_rate": 1.7406042449691907e-08, + "loss": 0.0665, + "step": 17841 + }, + { + "epoch": 2.890797148412184, + "grad_norm": 0.8606711030006409, + "learning_rate": 1.7354566901016944e-08, + "loss": 0.0596, + "step": 17842 + }, + { + "epoch": 2.890959170447181, + "grad_norm": 0.8856733441352844, + "learning_rate": 1.730316731528403e-08, + "loss": 0.0629, + "step": 17843 + }, + { + "epoch": 2.891121192482178, + "grad_norm": 1.0403659343719482, + "learning_rate": 1.7251843694066074e-08, + "loss": 0.0601, + "step": 17844 + }, + { + "epoch": 2.8912832145171743, + "grad_norm": 0.8173930048942566, + "learning_rate": 1.7200596038934038e-08, + "loss": 0.0585, + "step": 17845 + }, + { + "epoch": 2.8914452365521712, + "grad_norm": 0.8533201813697815, + "learning_rate": 1.7149424351455003e-08, + "loss": 0.0593, + "step": 17846 + }, + { + "epoch": 2.8916072585871677, + "grad_norm": 0.941921591758728, + "learning_rate": 1.7098328633195493e-08, + "loss": 0.0658, + "step": 17847 + }, + { + "epoch": 2.8917692806221647, + "grad_norm": 1.0077810287475586, + "learning_rate": 1.7047308885718427e-08, + "loss": 0.0636, + "step": 17848 + }, + { + "epoch": 2.891931302657161, + "grad_norm": 1.1122187376022339, + "learning_rate": 1.6996365110585332e-08, + "loss": 0.0593, + "step": 17849 + }, + { + "epoch": 2.892093324692158, + "grad_norm": 1.093993902206421, + "learning_rate": 1.694549730935441e-08, + "loss": 0.073, + "step": 17850 + }, + { + "epoch": 2.892255346727155, + "grad_norm": 0.9843876957893372, + "learning_rate": 1.6894705483582464e-08, + "loss": 0.0657, + "step": 17851 + }, + { + "epoch": 2.8924173687621515, + "grad_norm": 0.9816562533378601, + "learning_rate": 1.684398963482381e-08, + "loss": 0.0609, + "step": 17852 + }, + { + "epoch": 2.8925793907971484, + "grad_norm": 1.04456627368927, + "learning_rate": 1.6793349764629707e-08, + "loss": 0.0619, + "step": 17853 + }, + { + "epoch": 2.8927414128321454, + "grad_norm": 0.8714144825935364, + "learning_rate": 1.674278587454975e-08, + "loss": 0.0546, + "step": 17854 + }, + { + "epoch": 2.892903434867142, + "grad_norm": 0.9265003204345703, + "learning_rate": 1.669229796613131e-08, + "loss": 0.0642, + "step": 17855 + }, + { + "epoch": 2.893065456902139, + "grad_norm": 0.916529655456543, + "learning_rate": 1.6641886040919263e-08, + "loss": 0.0613, + "step": 17856 + }, + { + "epoch": 2.8932274789371357, + "grad_norm": 0.8931515216827393, + "learning_rate": 1.659155010045571e-08, + "loss": 0.0584, + "step": 17857 + }, + { + "epoch": 2.893389500972132, + "grad_norm": 0.9384124279022217, + "learning_rate": 1.654129014628081e-08, + "loss": 0.0612, + "step": 17858 + }, + { + "epoch": 2.8935515230071287, + "grad_norm": 1.1038861274719238, + "learning_rate": 1.649110617993277e-08, + "loss": 0.0643, + "step": 17859 + }, + { + "epoch": 2.8937135450421256, + "grad_norm": 0.8668294548988342, + "learning_rate": 1.6440998202947034e-08, + "loss": 0.0565, + "step": 17860 + }, + { + "epoch": 2.8938755670771226, + "grad_norm": 0.8724477291107178, + "learning_rate": 1.639096621685654e-08, + "loss": 0.0577, + "step": 17861 + }, + { + "epoch": 2.894037589112119, + "grad_norm": 0.786655068397522, + "learning_rate": 1.634101022319229e-08, + "loss": 0.0581, + "step": 17862 + }, + { + "epoch": 2.894199611147116, + "grad_norm": 0.9810567498207092, + "learning_rate": 1.6291130223482498e-08, + "loss": 0.0551, + "step": 17863 + }, + { + "epoch": 2.894361633182113, + "grad_norm": 1.0155911445617676, + "learning_rate": 1.6241326219254006e-08, + "loss": 0.0584, + "step": 17864 + }, + { + "epoch": 2.8945236552171094, + "grad_norm": 1.0140700340270996, + "learning_rate": 1.6191598212030314e-08, + "loss": 0.0612, + "step": 17865 + }, + { + "epoch": 2.8946856772521063, + "grad_norm": 0.9874489903450012, + "learning_rate": 1.6141946203332703e-08, + "loss": 0.0627, + "step": 17866 + }, + { + "epoch": 2.8948476992871033, + "grad_norm": 0.938338041305542, + "learning_rate": 1.609237019468107e-08, + "loss": 0.0587, + "step": 17867 + }, + { + "epoch": 2.8950097213220998, + "grad_norm": 0.9551253318786621, + "learning_rate": 1.6042870187591985e-08, + "loss": 0.066, + "step": 17868 + }, + { + "epoch": 2.8951717433570967, + "grad_norm": 1.030510663986206, + "learning_rate": 1.5993446183579786e-08, + "loss": 0.0592, + "step": 17869 + }, + { + "epoch": 2.895333765392093, + "grad_norm": 0.9057535529136658, + "learning_rate": 1.5944098184156876e-08, + "loss": 0.0584, + "step": 17870 + }, + { + "epoch": 2.89549578742709, + "grad_norm": 0.8612263798713684, + "learning_rate": 1.5894826190833712e-08, + "loss": 0.0557, + "step": 17871 + }, + { + "epoch": 2.8956578094620866, + "grad_norm": 0.784995973110199, + "learning_rate": 1.5845630205117147e-08, + "loss": 0.0542, + "step": 17872 + }, + { + "epoch": 2.8958198314970836, + "grad_norm": 0.9127455353736877, + "learning_rate": 1.579651022851264e-08, + "loss": 0.0654, + "step": 17873 + }, + { + "epoch": 2.8959818535320805, + "grad_norm": 0.9822579026222229, + "learning_rate": 1.5747466262523438e-08, + "loss": 0.0597, + "step": 17874 + }, + { + "epoch": 2.896143875567077, + "grad_norm": 0.833354651927948, + "learning_rate": 1.569849830865e-08, + "loss": 0.0518, + "step": 17875 + }, + { + "epoch": 2.896305897602074, + "grad_norm": 0.9307559132575989, + "learning_rate": 1.5649606368390578e-08, + "loss": 0.0571, + "step": 17876 + }, + { + "epoch": 2.896467919637071, + "grad_norm": 0.8814070820808411, + "learning_rate": 1.56007904432412e-08, + "loss": 0.0577, + "step": 17877 + }, + { + "epoch": 2.8966299416720673, + "grad_norm": 0.9398168325424194, + "learning_rate": 1.5552050534695383e-08, + "loss": 0.0579, + "step": 17878 + }, + { + "epoch": 2.8967919637070643, + "grad_norm": 0.8523743152618408, + "learning_rate": 1.5503386644244724e-08, + "loss": 0.0567, + "step": 17879 + }, + { + "epoch": 2.8969539857420608, + "grad_norm": 0.9002468585968018, + "learning_rate": 1.5454798773378023e-08, + "loss": 0.0628, + "step": 17880 + }, + { + "epoch": 2.8971160077770577, + "grad_norm": 1.3708131313323975, + "learning_rate": 1.5406286923582148e-08, + "loss": 0.0594, + "step": 17881 + }, + { + "epoch": 2.897278029812054, + "grad_norm": 0.9118489027023315, + "learning_rate": 1.5357851096340915e-08, + "loss": 0.0566, + "step": 17882 + }, + { + "epoch": 2.897440051847051, + "grad_norm": 0.9221370816230774, + "learning_rate": 1.5309491293137026e-08, + "loss": 0.0616, + "step": 17883 + }, + { + "epoch": 2.897602073882048, + "grad_norm": 0.9659370183944702, + "learning_rate": 1.526120751544985e-08, + "loss": 0.0584, + "step": 17884 + }, + { + "epoch": 2.8977640959170445, + "grad_norm": 1.0934197902679443, + "learning_rate": 1.5212999764756543e-08, + "loss": 0.0669, + "step": 17885 + }, + { + "epoch": 2.8979261179520415, + "grad_norm": 0.854312539100647, + "learning_rate": 1.5164868042532864e-08, + "loss": 0.0521, + "step": 17886 + }, + { + "epoch": 2.8980881399870384, + "grad_norm": 1.0536237955093384, + "learning_rate": 1.5116812350250422e-08, + "loss": 0.0606, + "step": 17887 + }, + { + "epoch": 2.898250162022035, + "grad_norm": 0.9276726245880127, + "learning_rate": 1.506883268938053e-08, + "loss": 0.0588, + "step": 17888 + }, + { + "epoch": 2.898412184057032, + "grad_norm": 0.8733401894569397, + "learning_rate": 1.5020929061391188e-08, + "loss": 0.0603, + "step": 17889 + }, + { + "epoch": 2.8985742060920288, + "grad_norm": 0.8288245797157288, + "learning_rate": 1.4973101467747608e-08, + "loss": 0.0547, + "step": 17890 + }, + { + "epoch": 2.8987362281270252, + "grad_norm": 0.9966906905174255, + "learning_rate": 1.4925349909913346e-08, + "loss": 0.0615, + "step": 17891 + }, + { + "epoch": 2.898898250162022, + "grad_norm": 0.9769363403320312, + "learning_rate": 1.4877674389349728e-08, + "loss": 0.0607, + "step": 17892 + }, + { + "epoch": 2.8990602721970187, + "grad_norm": 0.8618332743644714, + "learning_rate": 1.4830074907515313e-08, + "loss": 0.0596, + "step": 17893 + }, + { + "epoch": 2.8992222942320156, + "grad_norm": 0.819103479385376, + "learning_rate": 1.4782551465866713e-08, + "loss": 0.0527, + "step": 17894 + }, + { + "epoch": 2.899384316267012, + "grad_norm": 0.8623654246330261, + "learning_rate": 1.4735104065858042e-08, + "loss": 0.0623, + "step": 17895 + }, + { + "epoch": 2.899546338302009, + "grad_norm": 0.8684125542640686, + "learning_rate": 1.4687732708940916e-08, + "loss": 0.0539, + "step": 17896 + }, + { + "epoch": 2.899708360337006, + "grad_norm": 0.8661196827888489, + "learning_rate": 1.4640437396564733e-08, + "loss": 0.056, + "step": 17897 + }, + { + "epoch": 2.8998703823720025, + "grad_norm": 0.9082359671592712, + "learning_rate": 1.4593218130176668e-08, + "loss": 0.0535, + "step": 17898 + }, + { + "epoch": 2.9000324044069994, + "grad_norm": 0.8829771876335144, + "learning_rate": 1.4546074911221675e-08, + "loss": 0.0582, + "step": 17899 + }, + { + "epoch": 2.9001944264419963, + "grad_norm": 1.0028576850891113, + "learning_rate": 1.4499007741141934e-08, + "loss": 0.063, + "step": 17900 + }, + { + "epoch": 2.900356448476993, + "grad_norm": 0.8748372793197632, + "learning_rate": 1.4452016621377961e-08, + "loss": 0.0625, + "step": 17901 + }, + { + "epoch": 2.9005184705119897, + "grad_norm": 0.986389696598053, + "learning_rate": 1.4405101553367218e-08, + "loss": 0.0616, + "step": 17902 + }, + { + "epoch": 2.9006804925469862, + "grad_norm": 1.0490912199020386, + "learning_rate": 1.4358262538545498e-08, + "loss": 0.058, + "step": 17903 + }, + { + "epoch": 2.900842514581983, + "grad_norm": 0.8131743669509888, + "learning_rate": 1.4311499578345821e-08, + "loss": 0.0559, + "step": 17904 + }, + { + "epoch": 2.9010045366169797, + "grad_norm": 0.8198305368423462, + "learning_rate": 1.4264812674198714e-08, + "loss": 0.0563, + "step": 17905 + }, + { + "epoch": 2.9011665586519766, + "grad_norm": 0.8010324239730835, + "learning_rate": 1.421820182753303e-08, + "loss": 0.0547, + "step": 17906 + }, + { + "epoch": 2.9013285806869735, + "grad_norm": 1.2426363229751587, + "learning_rate": 1.4171667039775128e-08, + "loss": 0.0607, + "step": 17907 + }, + { + "epoch": 2.90149060272197, + "grad_norm": 0.8623930811882019, + "learning_rate": 1.4125208312348593e-08, + "loss": 0.0633, + "step": 17908 + }, + { + "epoch": 2.901652624756967, + "grad_norm": 1.1092482805252075, + "learning_rate": 1.4078825646675065e-08, + "loss": 0.069, + "step": 17909 + }, + { + "epoch": 2.901814646791964, + "grad_norm": 0.8913895487785339, + "learning_rate": 1.403251904417341e-08, + "loss": 0.0588, + "step": 17910 + }, + { + "epoch": 2.9019766688269604, + "grad_norm": 1.0606086254119873, + "learning_rate": 1.3986288506260825e-08, + "loss": 0.0616, + "step": 17911 + }, + { + "epoch": 2.9021386908619573, + "grad_norm": 0.8608266115188599, + "learning_rate": 1.3940134034351738e-08, + "loss": 0.0547, + "step": 17912 + }, + { + "epoch": 2.9023007128969542, + "grad_norm": 1.2168540954589844, + "learning_rate": 1.3894055629858627e-08, + "loss": 0.072, + "step": 17913 + }, + { + "epoch": 2.9024627349319507, + "grad_norm": 0.862879753112793, + "learning_rate": 1.3848053294190922e-08, + "loss": 0.054, + "step": 17914 + }, + { + "epoch": 2.9026247569669477, + "grad_norm": 0.9422655701637268, + "learning_rate": 1.3802127028756662e-08, + "loss": 0.0596, + "step": 17915 + }, + { + "epoch": 2.902786779001944, + "grad_norm": 0.8769212961196899, + "learning_rate": 1.3756276834960558e-08, + "loss": 0.0569, + "step": 17916 + }, + { + "epoch": 2.902948801036941, + "grad_norm": 0.8366473913192749, + "learning_rate": 1.3710502714205654e-08, + "loss": 0.0578, + "step": 17917 + }, + { + "epoch": 2.9031108230719376, + "grad_norm": 0.9338627457618713, + "learning_rate": 1.366480466789305e-08, + "loss": 0.0608, + "step": 17918 + }, + { + "epoch": 2.9032728451069345, + "grad_norm": 0.9875946640968323, + "learning_rate": 1.3619182697420518e-08, + "loss": 0.0588, + "step": 17919 + }, + { + "epoch": 2.9034348671419314, + "grad_norm": 0.8782922029495239, + "learning_rate": 1.3573636804183887e-08, + "loss": 0.0565, + "step": 17920 + }, + { + "epoch": 2.903596889176928, + "grad_norm": 0.8396292924880981, + "learning_rate": 1.352816698957704e-08, + "loss": 0.0578, + "step": 17921 + }, + { + "epoch": 2.903758911211925, + "grad_norm": 1.0632270574569702, + "learning_rate": 1.3482773254991365e-08, + "loss": 0.0586, + "step": 17922 + }, + { + "epoch": 2.903920933246922, + "grad_norm": 0.9017321467399597, + "learning_rate": 1.3437455601815198e-08, + "loss": 0.0591, + "step": 17923 + }, + { + "epoch": 2.9040829552819183, + "grad_norm": 0.8652000427246094, + "learning_rate": 1.3392214031435757e-08, + "loss": 0.0583, + "step": 17924 + }, + { + "epoch": 2.904244977316915, + "grad_norm": 0.9219072461128235, + "learning_rate": 1.334704854523694e-08, + "loss": 0.0661, + "step": 17925 + }, + { + "epoch": 2.9044069993519117, + "grad_norm": 1.0006201267242432, + "learning_rate": 1.3301959144600974e-08, + "loss": 0.0635, + "step": 17926 + }, + { + "epoch": 2.9045690213869086, + "grad_norm": 0.8481730222702026, + "learning_rate": 1.325694583090731e-08, + "loss": 0.0576, + "step": 17927 + }, + { + "epoch": 2.904731043421905, + "grad_norm": 0.8712880611419678, + "learning_rate": 1.3212008605533177e-08, + "loss": 0.0556, + "step": 17928 + }, + { + "epoch": 2.904893065456902, + "grad_norm": 0.8364845514297485, + "learning_rate": 1.316714746985387e-08, + "loss": 0.058, + "step": 17929 + }, + { + "epoch": 2.905055087491899, + "grad_norm": 0.8850131630897522, + "learning_rate": 1.312236242524162e-08, + "loss": 0.0563, + "step": 17930 + }, + { + "epoch": 2.9052171095268955, + "grad_norm": 0.8984468579292297, + "learning_rate": 1.3077653473067276e-08, + "loss": 0.0587, + "step": 17931 + }, + { + "epoch": 2.9053791315618924, + "grad_norm": 0.9292992949485779, + "learning_rate": 1.3033020614698078e-08, + "loss": 0.0633, + "step": 17932 + }, + { + "epoch": 2.9055411535968894, + "grad_norm": 1.0178935527801514, + "learning_rate": 1.298846385150071e-08, + "loss": 0.0657, + "step": 17933 + }, + { + "epoch": 2.905703175631886, + "grad_norm": 0.9548495411872864, + "learning_rate": 1.2943983184837417e-08, + "loss": 0.0589, + "step": 17934 + }, + { + "epoch": 2.905865197666883, + "grad_norm": 0.9326539039611816, + "learning_rate": 1.2899578616069607e-08, + "loss": 0.0588, + "step": 17935 + }, + { + "epoch": 2.9060272197018797, + "grad_norm": 0.9059630036354065, + "learning_rate": 1.2855250146556197e-08, + "loss": 0.0605, + "step": 17936 + }, + { + "epoch": 2.906189241736876, + "grad_norm": 0.8523368835449219, + "learning_rate": 1.28109977776536e-08, + "loss": 0.051, + "step": 17937 + }, + { + "epoch": 2.906351263771873, + "grad_norm": 0.7979968786239624, + "learning_rate": 1.2766821510715177e-08, + "loss": 0.0501, + "step": 17938 + }, + { + "epoch": 2.9065132858068696, + "grad_norm": 0.8871493339538574, + "learning_rate": 1.272272134709318e-08, + "loss": 0.0613, + "step": 17939 + }, + { + "epoch": 2.9066753078418666, + "grad_norm": 0.8767561316490173, + "learning_rate": 1.2678697288136809e-08, + "loss": 0.0619, + "step": 17940 + }, + { + "epoch": 2.906837329876863, + "grad_norm": 0.8473535180091858, + "learning_rate": 1.2634749335193319e-08, + "loss": 0.0568, + "step": 17941 + }, + { + "epoch": 2.90699935191186, + "grad_norm": 0.9051182866096497, + "learning_rate": 1.2590877489606911e-08, + "loss": 0.0559, + "step": 17942 + }, + { + "epoch": 2.907161373946857, + "grad_norm": 0.928796112537384, + "learning_rate": 1.2547081752720402e-08, + "loss": 0.0609, + "step": 17943 + }, + { + "epoch": 2.9073233959818534, + "grad_norm": 0.9684448838233948, + "learning_rate": 1.2503362125873552e-08, + "loss": 0.0574, + "step": 17944 + }, + { + "epoch": 2.9074854180168503, + "grad_norm": 1.0342750549316406, + "learning_rate": 1.2459718610404182e-08, + "loss": 0.0629, + "step": 17945 + }, + { + "epoch": 2.9076474400518473, + "grad_norm": 0.8830204606056213, + "learning_rate": 1.241615120764761e-08, + "loss": 0.0574, + "step": 17946 + }, + { + "epoch": 2.9078094620868438, + "grad_norm": 0.8388495445251465, + "learning_rate": 1.2372659918937213e-08, + "loss": 0.0511, + "step": 17947 + }, + { + "epoch": 2.9079714841218407, + "grad_norm": 1.0486900806427002, + "learning_rate": 1.2329244745603596e-08, + "loss": 0.0685, + "step": 17948 + }, + { + "epoch": 2.908133506156837, + "grad_norm": 0.8908421993255615, + "learning_rate": 1.2285905688974587e-08, + "loss": 0.0535, + "step": 17949 + }, + { + "epoch": 2.908295528191834, + "grad_norm": 0.8851662278175354, + "learning_rate": 1.2242642750376899e-08, + "loss": 0.0572, + "step": 17950 + }, + { + "epoch": 2.9084575502268306, + "grad_norm": 0.9874460697174072, + "learning_rate": 1.2199455931134197e-08, + "loss": 0.0566, + "step": 17951 + }, + { + "epoch": 2.9086195722618275, + "grad_norm": 1.0689846277236938, + "learning_rate": 1.2156345232567923e-08, + "loss": 0.0702, + "step": 17952 + }, + { + "epoch": 2.9087815942968245, + "grad_norm": 1.057041049003601, + "learning_rate": 1.2113310655996746e-08, + "loss": 0.056, + "step": 17953 + }, + { + "epoch": 2.908943616331821, + "grad_norm": 0.8022787570953369, + "learning_rate": 1.2070352202737668e-08, + "loss": 0.0512, + "step": 17954 + }, + { + "epoch": 2.909105638366818, + "grad_norm": 0.9817745685577393, + "learning_rate": 1.202746987410519e-08, + "loss": 0.0612, + "step": 17955 + }, + { + "epoch": 2.909267660401815, + "grad_norm": 1.1243666410446167, + "learning_rate": 1.19846636714116e-08, + "loss": 0.0682, + "step": 17956 + }, + { + "epoch": 2.9094296824368113, + "grad_norm": 0.9882938861846924, + "learning_rate": 1.1941933595966127e-08, + "loss": 0.0599, + "step": 17957 + }, + { + "epoch": 2.9095917044718083, + "grad_norm": 0.9126664996147156, + "learning_rate": 1.1899279649076612e-08, + "loss": 0.0604, + "step": 17958 + }, + { + "epoch": 2.909753726506805, + "grad_norm": 0.8586945533752441, + "learning_rate": 1.1856701832047845e-08, + "loss": 0.0615, + "step": 17959 + }, + { + "epoch": 2.9099157485418017, + "grad_norm": 0.8310577273368835, + "learning_rate": 1.181420014618323e-08, + "loss": 0.0592, + "step": 17960 + }, + { + "epoch": 2.910077770576798, + "grad_norm": 0.9139492511749268, + "learning_rate": 1.1771774592782558e-08, + "loss": 0.0584, + "step": 17961 + }, + { + "epoch": 2.910239792611795, + "grad_norm": 1.0066767930984497, + "learning_rate": 1.172942517314396e-08, + "loss": 0.0588, + "step": 17962 + }, + { + "epoch": 2.910401814646792, + "grad_norm": 0.7591646909713745, + "learning_rate": 1.1687151888563897e-08, + "loss": 0.0507, + "step": 17963 + }, + { + "epoch": 2.9105638366817885, + "grad_norm": 0.8789999485015869, + "learning_rate": 1.1644954740334946e-08, + "loss": 0.0631, + "step": 17964 + }, + { + "epoch": 2.9107258587167855, + "grad_norm": 0.9494521617889404, + "learning_rate": 1.160283372974913e-08, + "loss": 0.0592, + "step": 17965 + }, + { + "epoch": 2.9108878807517824, + "grad_norm": 0.9379144310951233, + "learning_rate": 1.1560788858094584e-08, + "loss": 0.0607, + "step": 17966 + }, + { + "epoch": 2.911049902786779, + "grad_norm": 1.2142386436462402, + "learning_rate": 1.1518820126658058e-08, + "loss": 0.0623, + "step": 17967 + }, + { + "epoch": 2.911211924821776, + "grad_norm": 0.8130635619163513, + "learning_rate": 1.1476927536723248e-08, + "loss": 0.0544, + "step": 17968 + }, + { + "epoch": 2.9113739468567728, + "grad_norm": 0.9625205397605896, + "learning_rate": 1.1435111089572737e-08, + "loss": 0.0612, + "step": 17969 + }, + { + "epoch": 2.9115359688917692, + "grad_norm": 1.122702956199646, + "learning_rate": 1.1393370786485502e-08, + "loss": 0.0628, + "step": 17970 + }, + { + "epoch": 2.911697990926766, + "grad_norm": 0.9404149651527405, + "learning_rate": 1.1351706628738857e-08, + "loss": 0.0646, + "step": 17971 + }, + { + "epoch": 2.9118600129617627, + "grad_norm": 0.8580222725868225, + "learning_rate": 1.1310118617607613e-08, + "loss": 0.0536, + "step": 17972 + }, + { + "epoch": 2.9120220349967596, + "grad_norm": 0.8675228953361511, + "learning_rate": 1.1268606754364087e-08, + "loss": 0.056, + "step": 17973 + }, + { + "epoch": 2.912184057031756, + "grad_norm": 0.8910603523254395, + "learning_rate": 1.122717104027865e-08, + "loss": 0.0547, + "step": 17974 + }, + { + "epoch": 2.912346079066753, + "grad_norm": 0.8188005685806274, + "learning_rate": 1.1185811476619179e-08, + "loss": 0.0561, + "step": 17975 + }, + { + "epoch": 2.91250810110175, + "grad_norm": 0.8997900485992432, + "learning_rate": 1.1144528064650772e-08, + "loss": 0.0615, + "step": 17976 + }, + { + "epoch": 2.9126701231367464, + "grad_norm": 0.9417411684989929, + "learning_rate": 1.1103320805637141e-08, + "loss": 0.0616, + "step": 17977 + }, + { + "epoch": 2.9128321451717434, + "grad_norm": 0.7963204383850098, + "learning_rate": 1.1062189700838944e-08, + "loss": 0.0538, + "step": 17978 + }, + { + "epoch": 2.9129941672067403, + "grad_norm": 0.8257819414138794, + "learning_rate": 1.102113475151434e-08, + "loss": 0.0554, + "step": 17979 + }, + { + "epoch": 2.913156189241737, + "grad_norm": 1.1627333164215088, + "learning_rate": 1.0980155958920103e-08, + "loss": 0.0594, + "step": 17980 + }, + { + "epoch": 2.9133182112767337, + "grad_norm": 0.9248695373535156, + "learning_rate": 1.0939253324309673e-08, + "loss": 0.0618, + "step": 17981 + }, + { + "epoch": 2.9134802333117307, + "grad_norm": 0.9199352264404297, + "learning_rate": 1.089842684893455e-08, + "loss": 0.0585, + "step": 17982 + }, + { + "epoch": 2.913642255346727, + "grad_norm": 0.9159268140792847, + "learning_rate": 1.0857676534044014e-08, + "loss": 0.0591, + "step": 17983 + }, + { + "epoch": 2.9138042773817237, + "grad_norm": 1.0434057712554932, + "learning_rate": 1.0817002380885123e-08, + "loss": 0.0663, + "step": 17984 + }, + { + "epoch": 2.9139662994167206, + "grad_norm": 0.8765219449996948, + "learning_rate": 1.0776404390702434e-08, + "loss": 0.0646, + "step": 17985 + }, + { + "epoch": 2.9141283214517175, + "grad_norm": 0.9191032648086548, + "learning_rate": 1.0735882564737732e-08, + "loss": 0.0617, + "step": 17986 + }, + { + "epoch": 2.914290343486714, + "grad_norm": 1.1906532049179077, + "learning_rate": 1.0695436904231137e-08, + "loss": 0.0587, + "step": 17987 + }, + { + "epoch": 2.914452365521711, + "grad_norm": 0.9076985716819763, + "learning_rate": 1.0655067410419994e-08, + "loss": 0.0602, + "step": 17988 + }, + { + "epoch": 2.914614387556708, + "grad_norm": 0.9002047777175903, + "learning_rate": 1.061477408453998e-08, + "loss": 0.0568, + "step": 17989 + }, + { + "epoch": 2.9147764095917044, + "grad_norm": 0.9402999877929688, + "learning_rate": 1.057455692782372e-08, + "loss": 0.062, + "step": 17990 + }, + { + "epoch": 2.9149384316267013, + "grad_norm": 0.9988169074058533, + "learning_rate": 1.0534415941501341e-08, + "loss": 0.0629, + "step": 17991 + }, + { + "epoch": 2.9151004536616982, + "grad_norm": 0.8821046352386475, + "learning_rate": 1.049435112680186e-08, + "loss": 0.0601, + "step": 17992 + }, + { + "epoch": 2.9152624756966947, + "grad_norm": 0.8363478183746338, + "learning_rate": 1.0454362484950409e-08, + "loss": 0.0605, + "step": 17993 + }, + { + "epoch": 2.9154244977316917, + "grad_norm": 0.9162735939025879, + "learning_rate": 1.0414450017171007e-08, + "loss": 0.0612, + "step": 17994 + }, + { + "epoch": 2.915586519766688, + "grad_norm": 0.9638509750366211, + "learning_rate": 1.03746137246849e-08, + "loss": 0.0622, + "step": 17995 + }, + { + "epoch": 2.915748541801685, + "grad_norm": 0.8784467577934265, + "learning_rate": 1.0334853608710838e-08, + "loss": 0.0626, + "step": 17996 + }, + { + "epoch": 2.9159105638366816, + "grad_norm": 0.9395826458930969, + "learning_rate": 1.0295169670465066e-08, + "loss": 0.0605, + "step": 17997 + }, + { + "epoch": 2.9160725858716785, + "grad_norm": 1.00931978225708, + "learning_rate": 1.0255561911162449e-08, + "loss": 0.0597, + "step": 17998 + }, + { + "epoch": 2.9162346079066754, + "grad_norm": 0.8709773421287537, + "learning_rate": 1.0216030332014515e-08, + "loss": 0.0579, + "step": 17999 + }, + { + "epoch": 2.916396629941672, + "grad_norm": 0.8078761100769043, + "learning_rate": 1.0176574934230854e-08, + "loss": 0.0583, + "step": 18000 + }, + { + "epoch": 2.916558651976669, + "grad_norm": 0.8328400254249573, + "learning_rate": 1.0137195719018556e-08, + "loss": 0.0519, + "step": 18001 + }, + { + "epoch": 2.916720674011666, + "grad_norm": 0.9479650259017944, + "learning_rate": 1.0097892687583044e-08, + "loss": 0.0587, + "step": 18002 + }, + { + "epoch": 2.9168826960466623, + "grad_norm": 0.9351217150688171, + "learning_rate": 1.0058665841126414e-08, + "loss": 0.0608, + "step": 18003 + }, + { + "epoch": 2.917044718081659, + "grad_norm": 0.7649050354957581, + "learning_rate": 1.0019515180849094e-08, + "loss": 0.0538, + "step": 18004 + }, + { + "epoch": 2.9172067401166557, + "grad_norm": 0.9090438485145569, + "learning_rate": 9.980440707948735e-09, + "loss": 0.0559, + "step": 18005 + }, + { + "epoch": 2.9173687621516526, + "grad_norm": 0.8982040286064148, + "learning_rate": 9.941442423621606e-09, + "loss": 0.056, + "step": 18006 + }, + { + "epoch": 2.917530784186649, + "grad_norm": 0.9106990694999695, + "learning_rate": 9.902520329060083e-09, + "loss": 0.0535, + "step": 18007 + }, + { + "epoch": 2.917692806221646, + "grad_norm": 1.0003997087478638, + "learning_rate": 9.863674425455716e-09, + "loss": 0.0647, + "step": 18008 + }, + { + "epoch": 2.917854828256643, + "grad_norm": 1.027595043182373, + "learning_rate": 9.824904713996996e-09, + "loss": 0.0557, + "step": 18009 + }, + { + "epoch": 2.9180168502916395, + "grad_norm": 0.9183197021484375, + "learning_rate": 9.78621119586992e-09, + "loss": 0.0558, + "step": 18010 + }, + { + "epoch": 2.9181788723266364, + "grad_norm": 1.1512269973754883, + "learning_rate": 9.747593872258543e-09, + "loss": 0.0643, + "step": 18011 + }, + { + "epoch": 2.9183408943616334, + "grad_norm": 0.859645664691925, + "learning_rate": 9.709052744344694e-09, + "loss": 0.0649, + "step": 18012 + }, + { + "epoch": 2.91850291639663, + "grad_norm": 0.85866379737854, + "learning_rate": 9.670587813307153e-09, + "loss": 0.0537, + "step": 18013 + }, + { + "epoch": 2.9186649384316268, + "grad_norm": 0.8549041152000427, + "learning_rate": 9.63219908032359e-09, + "loss": 0.0596, + "step": 18014 + }, + { + "epoch": 2.9188269604666237, + "grad_norm": 0.8607486486434937, + "learning_rate": 9.593886546567787e-09, + "loss": 0.06, + "step": 18015 + }, + { + "epoch": 2.91898898250162, + "grad_norm": 0.9240609407424927, + "learning_rate": 9.55565021321242e-09, + "loss": 0.0589, + "step": 18016 + }, + { + "epoch": 2.919151004536617, + "grad_norm": 0.9318268895149231, + "learning_rate": 9.51749008142766e-09, + "loss": 0.0504, + "step": 18017 + }, + { + "epoch": 2.9193130265716136, + "grad_norm": 0.9120360016822815, + "learning_rate": 9.479406152380632e-09, + "loss": 0.0582, + "step": 18018 + }, + { + "epoch": 2.9194750486066106, + "grad_norm": 0.8662173748016357, + "learning_rate": 9.441398427236792e-09, + "loss": 0.0557, + "step": 18019 + }, + { + "epoch": 2.919637070641607, + "grad_norm": 0.9860486388206482, + "learning_rate": 9.403466907159375e-09, + "loss": 0.0609, + "step": 18020 + }, + { + "epoch": 2.919799092676604, + "grad_norm": 1.0033873319625854, + "learning_rate": 9.365611593308565e-09, + "loss": 0.0586, + "step": 18021 + }, + { + "epoch": 2.919961114711601, + "grad_norm": 0.8788847327232361, + "learning_rate": 9.327832486842603e-09, + "loss": 0.0595, + "step": 18022 + }, + { + "epoch": 2.9201231367465974, + "grad_norm": 1.0497198104858398, + "learning_rate": 9.290129588918062e-09, + "loss": 0.0669, + "step": 18023 + }, + { + "epoch": 2.9202851587815943, + "grad_norm": 0.9634641408920288, + "learning_rate": 9.25250290068791e-09, + "loss": 0.0572, + "step": 18024 + }, + { + "epoch": 2.9204471808165913, + "grad_norm": 0.8264623284339905, + "learning_rate": 9.214952423303724e-09, + "loss": 0.0575, + "step": 18025 + }, + { + "epoch": 2.9206092028515878, + "grad_norm": 0.8819963335990906, + "learning_rate": 9.17747815791431e-09, + "loss": 0.0554, + "step": 18026 + }, + { + "epoch": 2.9207712248865847, + "grad_norm": 0.8962211012840271, + "learning_rate": 9.140080105666527e-09, + "loss": 0.0608, + "step": 18027 + }, + { + "epoch": 2.920933246921581, + "grad_norm": 1.065934181213379, + "learning_rate": 9.102758267704736e-09, + "loss": 0.0623, + "step": 18028 + }, + { + "epoch": 2.921095268956578, + "grad_norm": 0.9114288687705994, + "learning_rate": 9.065512645170249e-09, + "loss": 0.0578, + "step": 18029 + }, + { + "epoch": 2.9212572909915746, + "grad_norm": 1.0408483743667603, + "learning_rate": 9.02834323920354e-09, + "loss": 0.0679, + "step": 18030 + }, + { + "epoch": 2.9214193130265715, + "grad_norm": 0.9269614815711975, + "learning_rate": 8.991250050941203e-09, + "loss": 0.0599, + "step": 18031 + }, + { + "epoch": 2.9215813350615685, + "grad_norm": 0.850156843662262, + "learning_rate": 8.954233081518438e-09, + "loss": 0.0532, + "step": 18032 + }, + { + "epoch": 2.921743357096565, + "grad_norm": 0.8286687135696411, + "learning_rate": 8.917292332068228e-09, + "loss": 0.0546, + "step": 18033 + }, + { + "epoch": 2.921905379131562, + "grad_norm": 0.8290266394615173, + "learning_rate": 8.880427803720226e-09, + "loss": 0.0591, + "step": 18034 + }, + { + "epoch": 2.922067401166559, + "grad_norm": 0.8320590853691101, + "learning_rate": 8.843639497602973e-09, + "loss": 0.0516, + "step": 18035 + }, + { + "epoch": 2.9222294232015553, + "grad_norm": 0.9579795002937317, + "learning_rate": 8.806927414841959e-09, + "loss": 0.0585, + "step": 18036 + }, + { + "epoch": 2.9223914452365523, + "grad_norm": 0.8604581356048584, + "learning_rate": 8.770291556560174e-09, + "loss": 0.0567, + "step": 18037 + }, + { + "epoch": 2.922553467271549, + "grad_norm": 0.8759823441505432, + "learning_rate": 8.73373192387894e-09, + "loss": 0.0628, + "step": 18038 + }, + { + "epoch": 2.9227154893065457, + "grad_norm": 0.9367085099220276, + "learning_rate": 8.697248517916535e-09, + "loss": 0.0609, + "step": 18039 + }, + { + "epoch": 2.9228775113415426, + "grad_norm": 0.9392814040184021, + "learning_rate": 8.660841339789561e-09, + "loss": 0.068, + "step": 18040 + }, + { + "epoch": 2.923039533376539, + "grad_norm": 0.8410159349441528, + "learning_rate": 8.62451039061213e-09, + "loss": 0.0539, + "step": 18041 + }, + { + "epoch": 2.923201555411536, + "grad_norm": 0.8481424450874329, + "learning_rate": 8.58825567149557e-09, + "loss": 0.0567, + "step": 18042 + }, + { + "epoch": 2.9233635774465325, + "grad_norm": 1.1205499172210693, + "learning_rate": 8.55207718354928e-09, + "loss": 0.0688, + "step": 18043 + }, + { + "epoch": 2.9235255994815295, + "grad_norm": 0.819627046585083, + "learning_rate": 8.515974927880144e-09, + "loss": 0.055, + "step": 18044 + }, + { + "epoch": 2.9236876215165264, + "grad_norm": 0.8424476981163025, + "learning_rate": 8.479948905592839e-09, + "loss": 0.0556, + "step": 18045 + }, + { + "epoch": 2.923849643551523, + "grad_norm": 0.9404410719871521, + "learning_rate": 8.443999117790091e-09, + "loss": 0.0572, + "step": 18046 + }, + { + "epoch": 2.92401166558652, + "grad_norm": 1.1736658811569214, + "learning_rate": 8.4081255655713e-09, + "loss": 0.0574, + "step": 18047 + }, + { + "epoch": 2.9241736876215167, + "grad_norm": 1.16831374168396, + "learning_rate": 8.3723282500342e-09, + "loss": 0.0615, + "step": 18048 + }, + { + "epoch": 2.9243357096565132, + "grad_norm": 0.8560138940811157, + "learning_rate": 8.33660717227458e-09, + "loss": 0.0581, + "step": 18049 + }, + { + "epoch": 2.92449773169151, + "grad_norm": 0.9814584255218506, + "learning_rate": 8.3009623333849e-09, + "loss": 0.0648, + "step": 18050 + }, + { + "epoch": 2.9246597537265067, + "grad_norm": 0.8727818131446838, + "learning_rate": 8.265393734455674e-09, + "loss": 0.0601, + "step": 18051 + }, + { + "epoch": 2.9248217757615036, + "grad_norm": 1.0411512851715088, + "learning_rate": 8.229901376575755e-09, + "loss": 0.0601, + "step": 18052 + }, + { + "epoch": 2.9249837977965, + "grad_norm": 1.0159389972686768, + "learning_rate": 8.194485260830943e-09, + "loss": 0.0582, + "step": 18053 + }, + { + "epoch": 2.925145819831497, + "grad_norm": 0.8708806037902832, + "learning_rate": 8.15914538830509e-09, + "loss": 0.0551, + "step": 18054 + }, + { + "epoch": 2.925307841866494, + "grad_norm": 0.8334715962409973, + "learning_rate": 8.123881760078723e-09, + "loss": 0.0575, + "step": 18055 + }, + { + "epoch": 2.9254698639014904, + "grad_norm": 0.9106010794639587, + "learning_rate": 8.088694377231532e-09, + "loss": 0.0626, + "step": 18056 + }, + { + "epoch": 2.9256318859364874, + "grad_norm": 0.9865289926528931, + "learning_rate": 8.053583240840157e-09, + "loss": 0.0614, + "step": 18057 + }, + { + "epoch": 2.9257939079714843, + "grad_norm": 0.9036328196525574, + "learning_rate": 8.018548351978738e-09, + "loss": 0.0612, + "step": 18058 + }, + { + "epoch": 2.925955930006481, + "grad_norm": 0.9329188466072083, + "learning_rate": 7.98358971171892e-09, + "loss": 0.0589, + "step": 18059 + }, + { + "epoch": 2.9261179520414777, + "grad_norm": 1.0107616186141968, + "learning_rate": 7.948707321130956e-09, + "loss": 0.0607, + "step": 18060 + }, + { + "epoch": 2.9262799740764747, + "grad_norm": 1.3307433128356934, + "learning_rate": 7.913901181281768e-09, + "loss": 0.0597, + "step": 18061 + }, + { + "epoch": 2.926441996111471, + "grad_norm": 0.8141980171203613, + "learning_rate": 7.879171293236621e-09, + "loss": 0.0541, + "step": 18062 + }, + { + "epoch": 2.9266040181464676, + "grad_norm": 0.894360363483429, + "learning_rate": 7.844517658057993e-09, + "loss": 0.0631, + "step": 18063 + }, + { + "epoch": 2.9267660401814646, + "grad_norm": 0.9647737145423889, + "learning_rate": 7.80994027680615e-09, + "loss": 0.0555, + "step": 18064 + }, + { + "epoch": 2.9269280622164615, + "grad_norm": 0.874281644821167, + "learning_rate": 7.775439150539132e-09, + "loss": 0.0584, + "step": 18065 + }, + { + "epoch": 2.927090084251458, + "grad_norm": 0.9083302617073059, + "learning_rate": 7.741014280312765e-09, + "loss": 0.0563, + "step": 18066 + }, + { + "epoch": 2.927252106286455, + "grad_norm": 0.9287460446357727, + "learning_rate": 7.70666566718009e-09, + "loss": 0.06, + "step": 18067 + }, + { + "epoch": 2.927414128321452, + "grad_norm": 0.9412106275558472, + "learning_rate": 7.672393312192218e-09, + "loss": 0.0596, + "step": 18068 + }, + { + "epoch": 2.9275761503564484, + "grad_norm": 0.7685844302177429, + "learning_rate": 7.638197216397748e-09, + "loss": 0.0508, + "step": 18069 + }, + { + "epoch": 2.9277381723914453, + "grad_norm": 0.9476762413978577, + "learning_rate": 7.604077380843067e-09, + "loss": 0.0544, + "step": 18070 + }, + { + "epoch": 2.9279001944264422, + "grad_norm": 0.9895046353340149, + "learning_rate": 7.57003380657234e-09, + "loss": 0.0626, + "step": 18071 + }, + { + "epoch": 2.9280622164614387, + "grad_norm": 0.825796365737915, + "learning_rate": 7.536066494626681e-09, + "loss": 0.0536, + "step": 18072 + }, + { + "epoch": 2.9282242384964356, + "grad_norm": 0.8940446972846985, + "learning_rate": 7.502175446046089e-09, + "loss": 0.0637, + "step": 18073 + }, + { + "epoch": 2.928386260531432, + "grad_norm": 1.003281831741333, + "learning_rate": 7.468360661866957e-09, + "loss": 0.0616, + "step": 18074 + }, + { + "epoch": 2.928548282566429, + "grad_norm": 0.901324987411499, + "learning_rate": 7.434622143124015e-09, + "loss": 0.0595, + "step": 18075 + }, + { + "epoch": 2.9287103046014256, + "grad_norm": 0.998007595539093, + "learning_rate": 7.400959890850046e-09, + "loss": 0.0623, + "step": 18076 + }, + { + "epoch": 2.9288723266364225, + "grad_norm": 0.9806587100028992, + "learning_rate": 7.367373906074782e-09, + "loss": 0.0575, + "step": 18077 + }, + { + "epoch": 2.9290343486714194, + "grad_norm": 0.963106095790863, + "learning_rate": 7.333864189825735e-09, + "loss": 0.0635, + "step": 18078 + }, + { + "epoch": 2.929196370706416, + "grad_norm": 0.9748607277870178, + "learning_rate": 7.3004307431281954e-09, + "loss": 0.0627, + "step": 18079 + }, + { + "epoch": 2.929358392741413, + "grad_norm": 0.8219755291938782, + "learning_rate": 7.267073567005234e-09, + "loss": 0.0534, + "step": 18080 + }, + { + "epoch": 2.92952041477641, + "grad_norm": 0.8908148407936096, + "learning_rate": 7.233792662477701e-09, + "loss": 0.0533, + "step": 18081 + }, + { + "epoch": 2.9296824368114063, + "grad_norm": 0.8585811853408813, + "learning_rate": 7.2005880305636714e-09, + "loss": 0.0563, + "step": 18082 + }, + { + "epoch": 2.929844458846403, + "grad_norm": 0.9972938895225525, + "learning_rate": 7.167459672278998e-09, + "loss": 0.0621, + "step": 18083 + }, + { + "epoch": 2.9300064808814, + "grad_norm": 0.8972851634025574, + "learning_rate": 7.134407588637871e-09, + "loss": 0.0567, + "step": 18084 + }, + { + "epoch": 2.9301685029163966, + "grad_norm": 0.903139591217041, + "learning_rate": 7.1014317806508696e-09, + "loss": 0.0603, + "step": 18085 + }, + { + "epoch": 2.930330524951393, + "grad_norm": 0.8544906377792358, + "learning_rate": 7.068532249327742e-09, + "loss": 0.06, + "step": 18086 + }, + { + "epoch": 2.93049254698639, + "grad_norm": 0.9023297429084778, + "learning_rate": 7.035708995674628e-09, + "loss": 0.0622, + "step": 18087 + }, + { + "epoch": 2.930654569021387, + "grad_norm": 0.9524878859519958, + "learning_rate": 7.002962020695725e-09, + "loss": 0.0581, + "step": 18088 + }, + { + "epoch": 2.9308165910563835, + "grad_norm": 1.0636718273162842, + "learning_rate": 6.970291325393286e-09, + "loss": 0.0624, + "step": 18089 + }, + { + "epoch": 2.9309786130913804, + "grad_norm": 0.9945453405380249, + "learning_rate": 6.937696910767067e-09, + "loss": 0.0608, + "step": 18090 + }, + { + "epoch": 2.9311406351263773, + "grad_norm": 0.8092207312583923, + "learning_rate": 6.905178777814326e-09, + "loss": 0.0559, + "step": 18091 + }, + { + "epoch": 2.931302657161374, + "grad_norm": 1.0313624143600464, + "learning_rate": 6.872736927529822e-09, + "loss": 0.0619, + "step": 18092 + }, + { + "epoch": 2.9314646791963708, + "grad_norm": 1.039193034172058, + "learning_rate": 6.840371360906095e-09, + "loss": 0.0611, + "step": 18093 + }, + { + "epoch": 2.9316267012313677, + "grad_norm": 0.9258902668952942, + "learning_rate": 6.8080820789340195e-09, + "loss": 0.0572, + "step": 18094 + }, + { + "epoch": 2.931788723266364, + "grad_norm": 0.9824669361114502, + "learning_rate": 6.775869082601139e-09, + "loss": 0.0573, + "step": 18095 + }, + { + "epoch": 2.931950745301361, + "grad_norm": 0.8684791922569275, + "learning_rate": 6.743732372893053e-09, + "loss": 0.0595, + "step": 18096 + }, + { + "epoch": 2.9321127673363576, + "grad_norm": 0.8853773474693298, + "learning_rate": 6.711671950793419e-09, + "loss": 0.0513, + "step": 18097 + }, + { + "epoch": 2.9322747893713546, + "grad_norm": 0.944572389125824, + "learning_rate": 6.679687817282843e-09, + "loss": 0.0517, + "step": 18098 + }, + { + "epoch": 2.932436811406351, + "grad_norm": 0.9606372117996216, + "learning_rate": 6.647779973339985e-09, + "loss": 0.0594, + "step": 18099 + }, + { + "epoch": 2.932598833441348, + "grad_norm": 0.8097171187400818, + "learning_rate": 6.615948419941565e-09, + "loss": 0.0531, + "step": 18100 + }, + { + "epoch": 2.932760855476345, + "grad_norm": 0.8059049844741821, + "learning_rate": 6.584193158060969e-09, + "loss": 0.0524, + "step": 18101 + }, + { + "epoch": 2.9329228775113414, + "grad_norm": 0.8145474791526794, + "learning_rate": 6.5525141886702005e-09, + "loss": 0.0572, + "step": 18102 + }, + { + "epoch": 2.9330848995463383, + "grad_norm": 1.0327907800674438, + "learning_rate": 6.520911512738481e-09, + "loss": 0.0654, + "step": 18103 + }, + { + "epoch": 2.9332469215813353, + "grad_norm": 0.9246883392333984, + "learning_rate": 6.489385131232817e-09, + "loss": 0.0663, + "step": 18104 + }, + { + "epoch": 2.9334089436163318, + "grad_norm": 0.9235041737556458, + "learning_rate": 6.4579350451177135e-09, + "loss": 0.0673, + "step": 18105 + }, + { + "epoch": 2.9335709656513287, + "grad_norm": 0.8101209998130798, + "learning_rate": 6.426561255355457e-09, + "loss": 0.0568, + "step": 18106 + }, + { + "epoch": 2.933732987686325, + "grad_norm": 0.8323474526405334, + "learning_rate": 6.395263762906112e-09, + "loss": 0.0514, + "step": 18107 + }, + { + "epoch": 2.933895009721322, + "grad_norm": 0.8345054984092712, + "learning_rate": 6.364042568727524e-09, + "loss": 0.05, + "step": 18108 + }, + { + "epoch": 2.9340570317563186, + "grad_norm": 0.903107225894928, + "learning_rate": 6.332897673774485e-09, + "loss": 0.0614, + "step": 18109 + }, + { + "epoch": 2.9342190537913155, + "grad_norm": 1.5786057710647583, + "learning_rate": 6.301829079000399e-09, + "loss": 0.0695, + "step": 18110 + }, + { + "epoch": 2.9343810758263125, + "grad_norm": 0.8032664060592651, + "learning_rate": 6.270836785355616e-09, + "loss": 0.05, + "step": 18111 + }, + { + "epoch": 2.934543097861309, + "grad_norm": 1.0438019037246704, + "learning_rate": 6.239920793788546e-09, + "loss": 0.0631, + "step": 18112 + }, + { + "epoch": 2.934705119896306, + "grad_norm": 0.9662144184112549, + "learning_rate": 6.209081105245096e-09, + "loss": 0.0647, + "step": 18113 + }, + { + "epoch": 2.934867141931303, + "grad_norm": 1.402650237083435, + "learning_rate": 6.178317720668958e-09, + "loss": 0.0615, + "step": 18114 + }, + { + "epoch": 2.9350291639662993, + "grad_norm": 0.9588105082511902, + "learning_rate": 6.147630641001323e-09, + "loss": 0.0585, + "step": 18115 + }, + { + "epoch": 2.9351911860012962, + "grad_norm": 0.9412093162536621, + "learning_rate": 6.117019867181162e-09, + "loss": 0.0613, + "step": 18116 + }, + { + "epoch": 2.935353208036293, + "grad_norm": 1.0334348678588867, + "learning_rate": 6.0864854001452255e-09, + "loss": 0.0649, + "step": 18117 + }, + { + "epoch": 2.9355152300712897, + "grad_norm": 0.9597827196121216, + "learning_rate": 6.056027240827489e-09, + "loss": 0.0642, + "step": 18118 + }, + { + "epoch": 2.9356772521062866, + "grad_norm": 0.927130401134491, + "learning_rate": 6.025645390160262e-09, + "loss": 0.0632, + "step": 18119 + }, + { + "epoch": 2.935839274141283, + "grad_norm": 0.8240948915481567, + "learning_rate": 5.995339849073079e-09, + "loss": 0.0607, + "step": 18120 + }, + { + "epoch": 2.93600129617628, + "grad_norm": 0.919998824596405, + "learning_rate": 5.965110618492698e-09, + "loss": 0.0637, + "step": 18121 + }, + { + "epoch": 2.9361633182112765, + "grad_norm": 0.9138144254684448, + "learning_rate": 5.9349576993447675e-09, + "loss": 0.0608, + "step": 18122 + }, + { + "epoch": 2.9363253402462735, + "grad_norm": 1.1568125486373901, + "learning_rate": 5.904881092551607e-09, + "loss": 0.0606, + "step": 18123 + }, + { + "epoch": 2.9364873622812704, + "grad_norm": 0.8754058480262756, + "learning_rate": 5.874880799033589e-09, + "loss": 0.0555, + "step": 18124 + }, + { + "epoch": 2.936649384316267, + "grad_norm": 0.8889382481575012, + "learning_rate": 5.844956819708314e-09, + "loss": 0.0585, + "step": 18125 + }, + { + "epoch": 2.936811406351264, + "grad_norm": 0.9085453748703003, + "learning_rate": 5.815109155491716e-09, + "loss": 0.0605, + "step": 18126 + }, + { + "epoch": 2.9369734283862607, + "grad_norm": 0.9168490767478943, + "learning_rate": 5.785337807297231e-09, + "loss": 0.0616, + "step": 18127 + }, + { + "epoch": 2.9371354504212572, + "grad_norm": 1.0325088500976562, + "learning_rate": 5.755642776035242e-09, + "loss": 0.0636, + "step": 18128 + }, + { + "epoch": 2.937297472456254, + "grad_norm": 0.9850202202796936, + "learning_rate": 5.726024062614466e-09, + "loss": 0.0607, + "step": 18129 + }, + { + "epoch": 2.9374594944912507, + "grad_norm": 1.0262207984924316, + "learning_rate": 5.696481667941678e-09, + "loss": 0.0581, + "step": 18130 + }, + { + "epoch": 2.9376215165262476, + "grad_norm": 0.9356762766838074, + "learning_rate": 5.667015592920322e-09, + "loss": 0.064, + "step": 18131 + }, + { + "epoch": 2.937783538561244, + "grad_norm": 0.9865776896476746, + "learning_rate": 5.637625838452176e-09, + "loss": 0.0695, + "step": 18132 + }, + { + "epoch": 2.937945560596241, + "grad_norm": 1.0751196146011353, + "learning_rate": 5.608312405436245e-09, + "loss": 0.0706, + "step": 18133 + }, + { + "epoch": 2.938107582631238, + "grad_norm": 0.8332173824310303, + "learning_rate": 5.579075294769864e-09, + "loss": 0.0523, + "step": 18134 + }, + { + "epoch": 2.9382696046662344, + "grad_norm": 0.9454176425933838, + "learning_rate": 5.54991450734732e-09, + "loss": 0.0566, + "step": 18135 + }, + { + "epoch": 2.9384316267012314, + "grad_norm": 0.8918482661247253, + "learning_rate": 5.520830044060677e-09, + "loss": 0.06, + "step": 18136 + }, + { + "epoch": 2.9385936487362283, + "grad_norm": 0.9533538818359375, + "learning_rate": 5.491821905800332e-09, + "loss": 0.0536, + "step": 18137 + }, + { + "epoch": 2.938755670771225, + "grad_norm": 0.8268304467201233, + "learning_rate": 5.46289009345391e-09, + "loss": 0.0584, + "step": 18138 + }, + { + "epoch": 2.9389176928062217, + "grad_norm": 0.7862445116043091, + "learning_rate": 5.43403460790598e-09, + "loss": 0.0561, + "step": 18139 + }, + { + "epoch": 2.9390797148412187, + "grad_norm": 1.113399624824524, + "learning_rate": 5.405255450040003e-09, + "loss": 0.0603, + "step": 18140 + }, + { + "epoch": 2.939241736876215, + "grad_norm": 1.0670251846313477, + "learning_rate": 5.376552620736664e-09, + "loss": 0.0631, + "step": 18141 + }, + { + "epoch": 2.939403758911212, + "grad_norm": 0.9453678727149963, + "learning_rate": 5.347926120873592e-09, + "loss": 0.0585, + "step": 18142 + }, + { + "epoch": 2.9395657809462086, + "grad_norm": 0.9201284050941467, + "learning_rate": 5.319375951327033e-09, + "loss": 0.0643, + "step": 18143 + }, + { + "epoch": 2.9397278029812055, + "grad_norm": 0.929400622844696, + "learning_rate": 5.290902112970731e-09, + "loss": 0.0587, + "step": 18144 + }, + { + "epoch": 2.939889825016202, + "grad_norm": 1.096695065498352, + "learning_rate": 5.262504606675656e-09, + "loss": 0.0624, + "step": 18145 + }, + { + "epoch": 2.940051847051199, + "grad_norm": 0.8255508542060852, + "learning_rate": 5.234183433310835e-09, + "loss": 0.0513, + "step": 18146 + }, + { + "epoch": 2.940213869086196, + "grad_norm": 1.058481216430664, + "learning_rate": 5.205938593742799e-09, + "loss": 0.0652, + "step": 18147 + }, + { + "epoch": 2.9403758911211924, + "grad_norm": 0.99068284034729, + "learning_rate": 5.177770088835854e-09, + "loss": 0.0587, + "step": 18148 + }, + { + "epoch": 2.9405379131561893, + "grad_norm": 0.9016160368919373, + "learning_rate": 5.149677919451535e-09, + "loss": 0.0575, + "step": 18149 + }, + { + "epoch": 2.940699935191186, + "grad_norm": 1.0625081062316895, + "learning_rate": 5.121662086449708e-09, + "loss": 0.071, + "step": 18150 + }, + { + "epoch": 2.9408619572261827, + "grad_norm": 1.1517467498779297, + "learning_rate": 5.093722590687744e-09, + "loss": 0.0692, + "step": 18151 + }, + { + "epoch": 2.9410239792611796, + "grad_norm": 0.8428022861480713, + "learning_rate": 5.065859433019959e-09, + "loss": 0.051, + "step": 18152 + }, + { + "epoch": 2.941186001296176, + "grad_norm": 0.9025364518165588, + "learning_rate": 5.038072614299561e-09, + "loss": 0.0583, + "step": 18153 + }, + { + "epoch": 2.941348023331173, + "grad_norm": 1.0714325904846191, + "learning_rate": 5.010362135376423e-09, + "loss": 0.0552, + "step": 18154 + }, + { + "epoch": 2.9415100453661696, + "grad_norm": 1.128408432006836, + "learning_rate": 4.9827279970982024e-09, + "loss": 0.0682, + "step": 18155 + }, + { + "epoch": 2.9416720674011665, + "grad_norm": 0.9572305083274841, + "learning_rate": 4.955170200310888e-09, + "loss": 0.0565, + "step": 18156 + }, + { + "epoch": 2.9418340894361634, + "grad_norm": 0.8969508409500122, + "learning_rate": 4.927688745857417e-09, + "loss": 0.0601, + "step": 18157 + }, + { + "epoch": 2.94199611147116, + "grad_norm": 1.0484877824783325, + "learning_rate": 4.9002836345787845e-09, + "loss": 0.0628, + "step": 18158 + }, + { + "epoch": 2.942158133506157, + "grad_norm": 0.9466218948364258, + "learning_rate": 4.872954867313484e-09, + "loss": 0.0631, + "step": 18159 + }, + { + "epoch": 2.942320155541154, + "grad_norm": 1.0803978443145752, + "learning_rate": 4.845702444897515e-09, + "loss": 0.0597, + "step": 18160 + }, + { + "epoch": 2.9424821775761503, + "grad_norm": 0.8907843232154846, + "learning_rate": 4.818526368164933e-09, + "loss": 0.0571, + "step": 18161 + }, + { + "epoch": 2.942644199611147, + "grad_norm": 1.0460045337677002, + "learning_rate": 4.791426637947294e-09, + "loss": 0.0617, + "step": 18162 + }, + { + "epoch": 2.942806221646144, + "grad_norm": 0.893657922744751, + "learning_rate": 4.764403255073657e-09, + "loss": 0.0601, + "step": 18163 + }, + { + "epoch": 2.9429682436811406, + "grad_norm": 1.0240740776062012, + "learning_rate": 4.7374562203708615e-09, + "loss": 0.0714, + "step": 18164 + }, + { + "epoch": 2.943130265716137, + "grad_norm": 0.8384248614311218, + "learning_rate": 4.710585534663526e-09, + "loss": 0.0593, + "step": 18165 + }, + { + "epoch": 2.943292287751134, + "grad_norm": 0.8110920786857605, + "learning_rate": 4.683791198773768e-09, + "loss": 0.0551, + "step": 18166 + }, + { + "epoch": 2.943454309786131, + "grad_norm": 0.8798488974571228, + "learning_rate": 4.657073213521768e-09, + "loss": 0.0547, + "step": 18167 + }, + { + "epoch": 2.9436163318211275, + "grad_norm": 0.8243097066879272, + "learning_rate": 4.630431579724371e-09, + "loss": 0.0606, + "step": 18168 + }, + { + "epoch": 2.9437783538561244, + "grad_norm": 1.1285996437072754, + "learning_rate": 4.603866298197035e-09, + "loss": 0.0602, + "step": 18169 + }, + { + "epoch": 2.9439403758911213, + "grad_norm": 0.9858548641204834, + "learning_rate": 4.577377369752722e-09, + "loss": 0.0655, + "step": 18170 + }, + { + "epoch": 2.944102397926118, + "grad_norm": 1.0158785581588745, + "learning_rate": 4.550964795202173e-09, + "loss": 0.0605, + "step": 18171 + }, + { + "epoch": 2.9442644199611148, + "grad_norm": 0.9173128604888916, + "learning_rate": 4.524628575352796e-09, + "loss": 0.057, + "step": 18172 + }, + { + "epoch": 2.9444264419961117, + "grad_norm": 1.0110288858413696, + "learning_rate": 4.4983687110111695e-09, + "loss": 0.0644, + "step": 18173 + }, + { + "epoch": 2.944588464031108, + "grad_norm": 0.8735485076904297, + "learning_rate": 4.472185202980261e-09, + "loss": 0.0587, + "step": 18174 + }, + { + "epoch": 2.944750486066105, + "grad_norm": 0.8773369193077087, + "learning_rate": 4.446078052061375e-09, + "loss": 0.0586, + "step": 18175 + }, + { + "epoch": 2.9449125081011016, + "grad_norm": 1.5746303796768188, + "learning_rate": 4.420047259053595e-09, + "loss": 0.0564, + "step": 18176 + }, + { + "epoch": 2.9450745301360985, + "grad_norm": 0.9835705161094666, + "learning_rate": 4.394092824752949e-09, + "loss": 0.0608, + "step": 18177 + }, + { + "epoch": 2.945236552171095, + "grad_norm": 0.9602859020233154, + "learning_rate": 4.36821474995408e-09, + "loss": 0.06, + "step": 18178 + }, + { + "epoch": 2.945398574206092, + "grad_norm": 0.9771770238876343, + "learning_rate": 4.342413035448301e-09, + "loss": 0.0584, + "step": 18179 + }, + { + "epoch": 2.945560596241089, + "grad_norm": 0.8505083322525024, + "learning_rate": 4.316687682025256e-09, + "loss": 0.0548, + "step": 18180 + }, + { + "epoch": 2.9457226182760854, + "grad_norm": 0.8177760243415833, + "learning_rate": 4.291038690472371e-09, + "loss": 0.0592, + "step": 18181 + }, + { + "epoch": 2.9458846403110823, + "grad_norm": 0.9233116507530212, + "learning_rate": 4.26546606157402e-09, + "loss": 0.0594, + "step": 18182 + }, + { + "epoch": 2.9460466623460793, + "grad_norm": 0.8462489247322083, + "learning_rate": 4.2399697961129084e-09, + "loss": 0.0502, + "step": 18183 + }, + { + "epoch": 2.9462086843810757, + "grad_norm": 0.8417542576789856, + "learning_rate": 4.2145498948692465e-09, + "loss": 0.0603, + "step": 18184 + }, + { + "epoch": 2.9463707064160727, + "grad_norm": 0.844294011592865, + "learning_rate": 4.189206358620468e-09, + "loss": 0.0534, + "step": 18185 + }, + { + "epoch": 2.9465327284510696, + "grad_norm": 0.9389594197273254, + "learning_rate": 4.163939188142341e-09, + "loss": 0.0582, + "step": 18186 + }, + { + "epoch": 2.946694750486066, + "grad_norm": 0.9046799540519714, + "learning_rate": 4.138748384207858e-09, + "loss": 0.0595, + "step": 18187 + }, + { + "epoch": 2.9468567725210626, + "grad_norm": 0.9170434474945068, + "learning_rate": 4.113633947587792e-09, + "loss": 0.059, + "step": 18188 + }, + { + "epoch": 2.9470187945560595, + "grad_norm": 0.7925693392753601, + "learning_rate": 4.0885958790504165e-09, + "loss": 0.0539, + "step": 18189 + }, + { + "epoch": 2.9471808165910565, + "grad_norm": 0.9091478586196899, + "learning_rate": 4.063634179362341e-09, + "loss": 0.0521, + "step": 18190 + }, + { + "epoch": 2.947342838626053, + "grad_norm": 0.9072052836418152, + "learning_rate": 4.0387488492868446e-09, + "loss": 0.0637, + "step": 18191 + }, + { + "epoch": 2.94750486066105, + "grad_norm": 1.005933165550232, + "learning_rate": 4.013939889585538e-09, + "loss": 0.0623, + "step": 18192 + }, + { + "epoch": 2.947666882696047, + "grad_norm": 0.9582807421684265, + "learning_rate": 3.989207301017262e-09, + "loss": 0.0613, + "step": 18193 + }, + { + "epoch": 2.9478289047310433, + "grad_norm": 1.0312175750732422, + "learning_rate": 3.964551084339463e-09, + "loss": 0.0714, + "step": 18194 + }, + { + "epoch": 2.9479909267660402, + "grad_norm": 0.9463022947311401, + "learning_rate": 3.939971240305707e-09, + "loss": 0.0649, + "step": 18195 + }, + { + "epoch": 2.948152948801037, + "grad_norm": 0.9024858474731445, + "learning_rate": 3.915467769668724e-09, + "loss": 0.0549, + "step": 18196 + }, + { + "epoch": 2.9483149708360337, + "grad_norm": 0.8559155464172363, + "learning_rate": 3.891040673177915e-09, + "loss": 0.0564, + "step": 18197 + }, + { + "epoch": 2.9484769928710306, + "grad_norm": 0.804895281791687, + "learning_rate": 3.866689951580738e-09, + "loss": 0.0563, + "step": 18198 + }, + { + "epoch": 2.948639014906027, + "grad_norm": 0.9484009742736816, + "learning_rate": 3.842415605622429e-09, + "loss": 0.056, + "step": 18199 + }, + { + "epoch": 2.948801036941024, + "grad_norm": 0.813424289226532, + "learning_rate": 3.818217636045729e-09, + "loss": 0.0606, + "step": 18200 + }, + { + "epoch": 2.9489630589760205, + "grad_norm": 0.9334542751312256, + "learning_rate": 3.794096043590878e-09, + "loss": 0.0611, + "step": 18201 + }, + { + "epoch": 2.9491250810110174, + "grad_norm": 0.9005117416381836, + "learning_rate": 3.770050828995897e-09, + "loss": 0.0569, + "step": 18202 + }, + { + "epoch": 2.9492871030460144, + "grad_norm": 0.8461850881576538, + "learning_rate": 3.746081992996587e-09, + "loss": 0.0572, + "step": 18203 + }, + { + "epoch": 2.949449125081011, + "grad_norm": 0.8409425616264343, + "learning_rate": 3.7221895363262485e-09, + "loss": 0.0623, + "step": 18204 + }, + { + "epoch": 2.949611147116008, + "grad_norm": 1.0652631521224976, + "learning_rate": 3.6983734597162423e-09, + "loss": 0.0653, + "step": 18205 + }, + { + "epoch": 2.9497731691510047, + "grad_norm": 0.9219419956207275, + "learning_rate": 3.674633763894875e-09, + "loss": 0.0573, + "step": 18206 + }, + { + "epoch": 2.9499351911860012, + "grad_norm": 1.0178706645965576, + "learning_rate": 3.6509704495887866e-09, + "loss": 0.0608, + "step": 18207 + }, + { + "epoch": 2.950097213220998, + "grad_norm": 0.853714644908905, + "learning_rate": 3.6273835175221204e-09, + "loss": 0.0556, + "step": 18208 + }, + { + "epoch": 2.9502592352559946, + "grad_norm": 0.9474278688430786, + "learning_rate": 3.6038729684162445e-09, + "loss": 0.0633, + "step": 18209 + }, + { + "epoch": 2.9504212572909916, + "grad_norm": 0.962617039680481, + "learning_rate": 3.58043880299086e-09, + "loss": 0.0638, + "step": 18210 + }, + { + "epoch": 2.950583279325988, + "grad_norm": 0.8814941048622131, + "learning_rate": 3.557081021962616e-09, + "loss": 0.0634, + "step": 18211 + }, + { + "epoch": 2.950745301360985, + "grad_norm": 0.8792970776557922, + "learning_rate": 3.533799626046497e-09, + "loss": 0.0634, + "step": 18212 + }, + { + "epoch": 2.950907323395982, + "grad_norm": 0.8841227889060974, + "learning_rate": 3.5105946159549874e-09, + "loss": 0.0564, + "step": 18213 + }, + { + "epoch": 2.9510693454309784, + "grad_norm": 1.0366162061691284, + "learning_rate": 3.487465992397521e-09, + "loss": 0.0656, + "step": 18214 + }, + { + "epoch": 2.9512313674659754, + "grad_norm": 0.873475968837738, + "learning_rate": 3.4644137560824187e-09, + "loss": 0.0582, + "step": 18215 + }, + { + "epoch": 2.9513933895009723, + "grad_norm": 0.8530957698822021, + "learning_rate": 3.4414379077146733e-09, + "loss": 0.0568, + "step": 18216 + }, + { + "epoch": 2.951555411535969, + "grad_norm": 0.9540935158729553, + "learning_rate": 3.418538447997333e-09, + "loss": 0.0599, + "step": 18217 + }, + { + "epoch": 2.9517174335709657, + "grad_norm": 0.8702645301818848, + "learning_rate": 3.3957153776312257e-09, + "loss": 0.0599, + "step": 18218 + }, + { + "epoch": 2.9518794556059627, + "grad_norm": 0.9750285744667053, + "learning_rate": 3.372968697314405e-09, + "loss": 0.0584, + "step": 18219 + }, + { + "epoch": 2.952041477640959, + "grad_norm": 1.2663254737854004, + "learning_rate": 3.3502984077429803e-09, + "loss": 0.0627, + "step": 18220 + }, + { + "epoch": 2.952203499675956, + "grad_norm": 0.9789478778839111, + "learning_rate": 3.3277045096108405e-09, + "loss": 0.0574, + "step": 18221 + }, + { + "epoch": 2.9523655217109526, + "grad_norm": 1.000509262084961, + "learning_rate": 3.3051870036091004e-09, + "loss": 0.0558, + "step": 18222 + }, + { + "epoch": 2.9525275437459495, + "grad_norm": 0.9567583799362183, + "learning_rate": 3.282745890426653e-09, + "loss": 0.0611, + "step": 18223 + }, + { + "epoch": 2.952689565780946, + "grad_norm": 0.9223018288612366, + "learning_rate": 3.260381170750171e-09, + "loss": 0.0581, + "step": 18224 + }, + { + "epoch": 2.952851587815943, + "grad_norm": 0.8357469439506531, + "learning_rate": 3.238092845264107e-09, + "loss": 0.0543, + "step": 18225 + }, + { + "epoch": 2.95301360985094, + "grad_norm": 0.7842499017715454, + "learning_rate": 3.215880914650693e-09, + "loss": 0.0563, + "step": 18226 + }, + { + "epoch": 2.9531756318859363, + "grad_norm": 1.091303825378418, + "learning_rate": 3.1937453795888306e-09, + "loss": 0.0625, + "step": 18227 + }, + { + "epoch": 2.9533376539209333, + "grad_norm": 0.9732781052589417, + "learning_rate": 3.171686240756033e-09, + "loss": 0.0619, + "step": 18228 + }, + { + "epoch": 2.95349967595593, + "grad_norm": 0.8895050883293152, + "learning_rate": 3.1497034988278717e-09, + "loss": 0.0539, + "step": 18229 + }, + { + "epoch": 2.9536616979909267, + "grad_norm": 0.9890386462211609, + "learning_rate": 3.1277971544763088e-09, + "loss": 0.0618, + "step": 18230 + }, + { + "epoch": 2.9538237200259236, + "grad_norm": 0.9173012375831604, + "learning_rate": 3.1059672083719183e-09, + "loss": 0.056, + "step": 18231 + }, + { + "epoch": 2.95398574206092, + "grad_norm": 0.9382359385490417, + "learning_rate": 3.0842136611825004e-09, + "loss": 0.0637, + "step": 18232 + }, + { + "epoch": 2.954147764095917, + "grad_norm": 0.8800296187400818, + "learning_rate": 3.062536513573633e-09, + "loss": 0.0596, + "step": 18233 + }, + { + "epoch": 2.9543097861309136, + "grad_norm": 1.0513750314712524, + "learning_rate": 3.0409357662086748e-09, + "loss": 0.0661, + "step": 18234 + }, + { + "epoch": 2.9544718081659105, + "grad_norm": 0.8666124939918518, + "learning_rate": 3.0194114197487635e-09, + "loss": 0.0535, + "step": 18235 + }, + { + "epoch": 2.9546338302009074, + "grad_norm": 0.9290871024131775, + "learning_rate": 2.997963474852261e-09, + "loss": 0.0626, + "step": 18236 + }, + { + "epoch": 2.954795852235904, + "grad_norm": 0.9381429553031921, + "learning_rate": 2.97659193217531e-09, + "loss": 0.0614, + "step": 18237 + }, + { + "epoch": 2.954957874270901, + "grad_norm": 0.8524567484855652, + "learning_rate": 2.9552967923721086e-09, + "loss": 0.0595, + "step": 18238 + }, + { + "epoch": 2.9551198963058978, + "grad_norm": 0.7893286943435669, + "learning_rate": 2.9340780560938032e-09, + "loss": 0.0515, + "step": 18239 + }, + { + "epoch": 2.9552819183408943, + "grad_norm": 0.8815685510635376, + "learning_rate": 2.9129357239901514e-09, + "loss": 0.0554, + "step": 18240 + }, + { + "epoch": 2.955443940375891, + "grad_norm": 0.7954118251800537, + "learning_rate": 2.8918697967078578e-09, + "loss": 0.0561, + "step": 18241 + }, + { + "epoch": 2.955605962410888, + "grad_norm": 0.8579531908035278, + "learning_rate": 2.8708802748914077e-09, + "loss": 0.0591, + "step": 18242 + }, + { + "epoch": 2.9557679844458846, + "grad_norm": 0.9822569489479065, + "learning_rate": 2.849967159183065e-09, + "loss": 0.0587, + "step": 18243 + }, + { + "epoch": 2.9559300064808816, + "grad_norm": 0.9963696002960205, + "learning_rate": 2.829130450222872e-09, + "loss": 0.0605, + "step": 18244 + }, + { + "epoch": 2.956092028515878, + "grad_norm": 1.0176767110824585, + "learning_rate": 2.8083701486480985e-09, + "loss": 0.0586, + "step": 18245 + }, + { + "epoch": 2.956254050550875, + "grad_norm": 0.9688636064529419, + "learning_rate": 2.7876862550940685e-09, + "loss": 0.0648, + "step": 18246 + }, + { + "epoch": 2.9564160725858715, + "grad_norm": 1.1327593326568604, + "learning_rate": 2.7670787701938874e-09, + "loss": 0.0649, + "step": 18247 + }, + { + "epoch": 2.9565780946208684, + "grad_norm": 0.8865358829498291, + "learning_rate": 2.7465476945778835e-09, + "loss": 0.0517, + "step": 18248 + }, + { + "epoch": 2.9567401166558653, + "grad_norm": 0.9270232319831848, + "learning_rate": 2.726093028874166e-09, + "loss": 0.063, + "step": 18249 + }, + { + "epoch": 2.956902138690862, + "grad_norm": 0.8303735852241516, + "learning_rate": 2.705714773708623e-09, + "loss": 0.0583, + "step": 18250 + }, + { + "epoch": 2.9570641607258588, + "grad_norm": 0.8890751004219055, + "learning_rate": 2.6854129297049225e-09, + "loss": 0.0557, + "step": 18251 + }, + { + "epoch": 2.9572261827608557, + "grad_norm": 0.9399266242980957, + "learning_rate": 2.6651874974845115e-09, + "loss": 0.0563, + "step": 18252 + }, + { + "epoch": 2.957388204795852, + "grad_norm": 0.7995545268058777, + "learning_rate": 2.645038477665507e-09, + "loss": 0.0526, + "step": 18253 + }, + { + "epoch": 2.957550226830849, + "grad_norm": 0.9148885011672974, + "learning_rate": 2.6249658708651928e-09, + "loss": 0.0664, + "step": 18254 + }, + { + "epoch": 2.9577122488658456, + "grad_norm": 0.847109317779541, + "learning_rate": 2.6049696776972443e-09, + "loss": 0.0579, + "step": 18255 + }, + { + "epoch": 2.9578742709008425, + "grad_norm": 0.858539342880249, + "learning_rate": 2.5850498987733952e-09, + "loss": 0.0644, + "step": 18256 + }, + { + "epoch": 2.958036292935839, + "grad_norm": 1.0084835290908813, + "learning_rate": 2.5652065347037126e-09, + "loss": 0.0673, + "step": 18257 + }, + { + "epoch": 2.958198314970836, + "grad_norm": 0.8510595560073853, + "learning_rate": 2.545439586094933e-09, + "loss": 0.0584, + "step": 18258 + }, + { + "epoch": 2.958360337005833, + "grad_norm": 0.9628735780715942, + "learning_rate": 2.525749053552129e-09, + "loss": 0.0601, + "step": 18259 + }, + { + "epoch": 2.9585223590408294, + "grad_norm": 0.8464257121086121, + "learning_rate": 2.506134937677318e-09, + "loss": 0.0567, + "step": 18260 + }, + { + "epoch": 2.9586843810758263, + "grad_norm": 0.838660478591919, + "learning_rate": 2.4865972390711314e-09, + "loss": 0.0583, + "step": 18261 + }, + { + "epoch": 2.9588464031108233, + "grad_norm": 0.9245243668556213, + "learning_rate": 2.4671359583314237e-09, + "loss": 0.0618, + "step": 18262 + }, + { + "epoch": 2.9590084251458197, + "grad_norm": 0.8043138980865479, + "learning_rate": 2.4477510960532747e-09, + "loss": 0.0579, + "step": 18263 + }, + { + "epoch": 2.9591704471808167, + "grad_norm": 0.8946799039840698, + "learning_rate": 2.4284426528298212e-09, + "loss": 0.0584, + "step": 18264 + }, + { + "epoch": 2.9593324692158136, + "grad_norm": 1.0274814367294312, + "learning_rate": 2.409210629251979e-09, + "loss": 0.063, + "step": 18265 + }, + { + "epoch": 2.95949449125081, + "grad_norm": 0.8745455145835876, + "learning_rate": 2.3900550259084445e-09, + "loss": 0.0599, + "step": 18266 + }, + { + "epoch": 2.959656513285807, + "grad_norm": 0.9556804895401001, + "learning_rate": 2.370975843385137e-09, + "loss": 0.0605, + "step": 18267 + }, + { + "epoch": 2.9598185353208035, + "grad_norm": 0.9654064774513245, + "learning_rate": 2.351973082265757e-09, + "loss": 0.0604, + "step": 18268 + }, + { + "epoch": 2.9599805573558005, + "grad_norm": 0.9703561067581177, + "learning_rate": 2.333046743131784e-09, + "loss": 0.0632, + "step": 18269 + }, + { + "epoch": 2.960142579390797, + "grad_norm": 1.048510193824768, + "learning_rate": 2.314196826562476e-09, + "loss": 0.0593, + "step": 18270 + }, + { + "epoch": 2.960304601425794, + "grad_norm": 0.8689060211181641, + "learning_rate": 2.295423333134317e-09, + "loss": 0.0659, + "step": 18271 + }, + { + "epoch": 2.960466623460791, + "grad_norm": 0.9351786971092224, + "learning_rate": 2.2767262634218466e-09, + "loss": 0.0653, + "step": 18272 + }, + { + "epoch": 2.9606286454957873, + "grad_norm": 0.7980687022209167, + "learning_rate": 2.2581056179971084e-09, + "loss": 0.0542, + "step": 18273 + }, + { + "epoch": 2.9607906675307842, + "grad_norm": 0.8763124346733093, + "learning_rate": 2.239561397430201e-09, + "loss": 0.0586, + "step": 18274 + }, + { + "epoch": 2.960952689565781, + "grad_norm": 0.9670480489730835, + "learning_rate": 2.221093602288171e-09, + "loss": 0.0565, + "step": 18275 + }, + { + "epoch": 2.9611147116007777, + "grad_norm": 0.8312298655509949, + "learning_rate": 2.2027022331361226e-09, + "loss": 0.0575, + "step": 18276 + }, + { + "epoch": 2.9612767336357746, + "grad_norm": 0.9964354038238525, + "learning_rate": 2.18438729053666e-09, + "loss": 0.06, + "step": 18277 + }, + { + "epoch": 2.961438755670771, + "grad_norm": 0.9789435863494873, + "learning_rate": 2.1661487750504473e-09, + "loss": 0.0577, + "step": 18278 + }, + { + "epoch": 2.961600777705768, + "grad_norm": 0.977944016456604, + "learning_rate": 2.147986687235648e-09, + "loss": 0.0589, + "step": 18279 + }, + { + "epoch": 2.9617627997407645, + "grad_norm": 0.9870699644088745, + "learning_rate": 2.129901027647652e-09, + "loss": 0.0603, + "step": 18280 + }, + { + "epoch": 2.9619248217757614, + "grad_norm": 0.8196133375167847, + "learning_rate": 2.1118917968399045e-09, + "loss": 0.0516, + "step": 18281 + }, + { + "epoch": 2.9620868438107584, + "grad_norm": 1.1633763313293457, + "learning_rate": 2.0939589953633542e-09, + "loss": 0.0574, + "step": 18282 + }, + { + "epoch": 2.962248865845755, + "grad_norm": 1.174547791481018, + "learning_rate": 2.076102623767007e-09, + "loss": 0.0649, + "step": 18283 + }, + { + "epoch": 2.962410887880752, + "grad_norm": 0.8780858516693115, + "learning_rate": 2.0583226825970915e-09, + "loss": 0.0636, + "step": 18284 + }, + { + "epoch": 2.9625729099157487, + "grad_norm": 0.8388274908065796, + "learning_rate": 2.0406191723976175e-09, + "loss": 0.0627, + "step": 18285 + }, + { + "epoch": 2.962734931950745, + "grad_norm": 0.9338237047195435, + "learning_rate": 2.022992093710097e-09, + "loss": 0.0605, + "step": 18286 + }, + { + "epoch": 2.962896953985742, + "grad_norm": 0.8622428774833679, + "learning_rate": 2.005441447074097e-09, + "loss": 0.0602, + "step": 18287 + }, + { + "epoch": 2.963058976020739, + "grad_norm": 0.9662334322929382, + "learning_rate": 1.9879672330266886e-09, + "loss": 0.0603, + "step": 18288 + }, + { + "epoch": 2.9632209980557356, + "grad_norm": 1.1020525693893433, + "learning_rate": 1.9705694521021666e-09, + "loss": 0.0663, + "step": 18289 + }, + { + "epoch": 2.963383020090732, + "grad_norm": 0.82396000623703, + "learning_rate": 1.9532481048334383e-09, + "loss": 0.0535, + "step": 18290 + }, + { + "epoch": 2.963545042125729, + "grad_norm": 0.8906350135803223, + "learning_rate": 1.9360031917498024e-09, + "loss": 0.06, + "step": 18291 + }, + { + "epoch": 2.963707064160726, + "grad_norm": 0.845971405506134, + "learning_rate": 1.918834713379447e-09, + "loss": 0.0633, + "step": 18292 + }, + { + "epoch": 2.9638690861957224, + "grad_norm": 0.9976989030838013, + "learning_rate": 1.9017426702475084e-09, + "loss": 0.0593, + "step": 18293 + }, + { + "epoch": 2.9640311082307194, + "grad_norm": 0.9086126685142517, + "learning_rate": 1.884727062876901e-09, + "loss": 0.0592, + "step": 18294 + }, + { + "epoch": 2.9641931302657163, + "grad_norm": 1.0157880783081055, + "learning_rate": 1.867787891788597e-09, + "loss": 0.0634, + "step": 18295 + }, + { + "epoch": 2.964355152300713, + "grad_norm": 1.0585784912109375, + "learning_rate": 1.8509251575002386e-09, + "loss": 0.0652, + "step": 18296 + }, + { + "epoch": 2.9645171743357097, + "grad_norm": 0.8272886276245117, + "learning_rate": 1.8341388605283562e-09, + "loss": 0.0594, + "step": 18297 + }, + { + "epoch": 2.9646791963707066, + "grad_norm": 0.8589320182800293, + "learning_rate": 1.8174290013864282e-09, + "loss": 0.0577, + "step": 18298 + }, + { + "epoch": 2.964841218405703, + "grad_norm": 0.8368940949440002, + "learning_rate": 1.800795580585435e-09, + "loss": 0.0585, + "step": 18299 + }, + { + "epoch": 2.9650032404407, + "grad_norm": 0.8517768383026123, + "learning_rate": 1.784238598634691e-09, + "loss": 0.0569, + "step": 18300 + }, + { + "epoch": 2.9651652624756966, + "grad_norm": 1.1536654233932495, + "learning_rate": 1.767758056040736e-09, + "loss": 0.0648, + "step": 18301 + }, + { + "epoch": 2.9653272845106935, + "grad_norm": 1.0494126081466675, + "learning_rate": 1.7513539533078882e-09, + "loss": 0.0679, + "step": 18302 + }, + { + "epoch": 2.96548930654569, + "grad_norm": 0.8454523682594299, + "learning_rate": 1.735026290937969e-09, + "loss": 0.0558, + "step": 18303 + }, + { + "epoch": 2.965651328580687, + "grad_norm": 0.9818223714828491, + "learning_rate": 1.7187750694303007e-09, + "loss": 0.0593, + "step": 18304 + }, + { + "epoch": 2.965813350615684, + "grad_norm": 0.9108536839485168, + "learning_rate": 1.7026002892825411e-09, + "loss": 0.0591, + "step": 18305 + }, + { + "epoch": 2.9659753726506803, + "grad_norm": 0.8628289699554443, + "learning_rate": 1.686501950989572e-09, + "loss": 0.0569, + "step": 18306 + }, + { + "epoch": 2.9661373946856773, + "grad_norm": 0.9580399990081787, + "learning_rate": 1.6704800550434997e-09, + "loss": 0.0551, + "step": 18307 + }, + { + "epoch": 2.966299416720674, + "grad_norm": 0.9089958667755127, + "learning_rate": 1.6545346019350427e-09, + "loss": 0.0599, + "step": 18308 + }, + { + "epoch": 2.9664614387556707, + "grad_norm": 0.875546932220459, + "learning_rate": 1.6386655921521443e-09, + "loss": 0.061, + "step": 18309 + }, + { + "epoch": 2.9666234607906676, + "grad_norm": 1.0294671058654785, + "learning_rate": 1.6228730261799718e-09, + "loss": 0.0663, + "step": 18310 + }, + { + "epoch": 2.9667854828256646, + "grad_norm": 0.8314631581306458, + "learning_rate": 1.6071569045020274e-09, + "loss": 0.0512, + "step": 18311 + }, + { + "epoch": 2.966947504860661, + "grad_norm": 1.001203179359436, + "learning_rate": 1.5915172275990375e-09, + "loss": 0.0555, + "step": 18312 + }, + { + "epoch": 2.9671095268956575, + "grad_norm": 0.8834933638572693, + "learning_rate": 1.5759539959495085e-09, + "loss": 0.0627, + "step": 18313 + }, + { + "epoch": 2.9672715489306545, + "grad_norm": 1.0192748308181763, + "learning_rate": 1.5604672100297258e-09, + "loss": 0.064, + "step": 18314 + }, + { + "epoch": 2.9674335709656514, + "grad_norm": 0.9845191240310669, + "learning_rate": 1.5450568703137547e-09, + "loss": 0.0635, + "step": 18315 + }, + { + "epoch": 2.967595593000648, + "grad_norm": 0.7708606123924255, + "learning_rate": 1.5297229772726075e-09, + "loss": 0.0567, + "step": 18316 + }, + { + "epoch": 2.967757615035645, + "grad_norm": 0.8803342580795288, + "learning_rate": 1.5144655313759082e-09, + "loss": 0.0596, + "step": 18317 + }, + { + "epoch": 2.9679196370706418, + "grad_norm": 0.9847882986068726, + "learning_rate": 1.499284533090506e-09, + "loss": 0.0609, + "step": 18318 + }, + { + "epoch": 2.9680816591056383, + "grad_norm": 0.9394886493682861, + "learning_rate": 1.4841799828804737e-09, + "loss": 0.0655, + "step": 18319 + }, + { + "epoch": 2.968243681140635, + "grad_norm": 0.8597686886787415, + "learning_rate": 1.469151881208497e-09, + "loss": 0.0533, + "step": 18320 + }, + { + "epoch": 2.968405703175632, + "grad_norm": 0.9775875806808472, + "learning_rate": 1.4542002285339307e-09, + "loss": 0.0652, + "step": 18321 + }, + { + "epoch": 2.9685677252106286, + "grad_norm": 0.9064924716949463, + "learning_rate": 1.4393250253144642e-09, + "loss": 0.0605, + "step": 18322 + }, + { + "epoch": 2.9687297472456255, + "grad_norm": 0.788995087146759, + "learning_rate": 1.4245262720052888e-09, + "loss": 0.055, + "step": 18323 + }, + { + "epoch": 2.968891769280622, + "grad_norm": 0.86928391456604, + "learning_rate": 1.4098039690593756e-09, + "loss": 0.0589, + "step": 18324 + }, + { + "epoch": 2.969053791315619, + "grad_norm": 0.894902765750885, + "learning_rate": 1.3951581169266426e-09, + "loss": 0.0581, + "step": 18325 + }, + { + "epoch": 2.9692158133506155, + "grad_norm": 0.9499050974845886, + "learning_rate": 1.3805887160558973e-09, + "loss": 0.0611, + "step": 18326 + }, + { + "epoch": 2.9693778353856124, + "grad_norm": 0.896775484085083, + "learning_rate": 1.3660957668923391e-09, + "loss": 0.0545, + "step": 18327 + }, + { + "epoch": 2.9695398574206093, + "grad_norm": 1.0583899021148682, + "learning_rate": 1.3516792698797797e-09, + "loss": 0.0639, + "step": 18328 + }, + { + "epoch": 2.969701879455606, + "grad_norm": 0.9759207963943481, + "learning_rate": 1.3373392254592554e-09, + "loss": 0.0665, + "step": 18329 + }, + { + "epoch": 2.9698639014906028, + "grad_norm": 0.8481022715568542, + "learning_rate": 1.323075634069304e-09, + "loss": 0.0598, + "step": 18330 + }, + { + "epoch": 2.9700259235255997, + "grad_norm": 1.0313864946365356, + "learning_rate": 1.3088884961467985e-09, + "loss": 0.0618, + "step": 18331 + }, + { + "epoch": 2.970187945560596, + "grad_norm": 0.9152479767799377, + "learning_rate": 1.2947778121255584e-09, + "loss": 0.0561, + "step": 18332 + }, + { + "epoch": 2.970349967595593, + "grad_norm": 0.9564821720123291, + "learning_rate": 1.2807435824371827e-09, + "loss": 0.0579, + "step": 18333 + }, + { + "epoch": 2.9705119896305896, + "grad_norm": 0.9317049384117126, + "learning_rate": 1.2667858075113281e-09, + "loss": 0.0563, + "step": 18334 + }, + { + "epoch": 2.9706740116655865, + "grad_norm": 1.058647632598877, + "learning_rate": 1.2529044877751528e-09, + "loss": 0.0618, + "step": 18335 + }, + { + "epoch": 2.970836033700583, + "grad_norm": 0.8454841375350952, + "learning_rate": 1.239099623653317e-09, + "loss": 0.0543, + "step": 18336 + }, + { + "epoch": 2.97099805573558, + "grad_norm": 1.0585004091262817, + "learning_rate": 1.2253712155679832e-09, + "loss": 0.0597, + "step": 18337 + }, + { + "epoch": 2.971160077770577, + "grad_norm": 0.9348391890525818, + "learning_rate": 1.2117192639393704e-09, + "loss": 0.0573, + "step": 18338 + }, + { + "epoch": 2.9713220998055734, + "grad_norm": 0.8744611740112305, + "learning_rate": 1.1981437691852004e-09, + "loss": 0.0567, + "step": 18339 + }, + { + "epoch": 2.9714841218405703, + "grad_norm": 0.9730286598205566, + "learning_rate": 1.1846447317206967e-09, + "loss": 0.0593, + "step": 18340 + }, + { + "epoch": 2.9716461438755672, + "grad_norm": 1.0506782531738281, + "learning_rate": 1.1712221519594168e-09, + "loss": 0.0634, + "step": 18341 + }, + { + "epoch": 2.9718081659105637, + "grad_norm": 0.9589892625808716, + "learning_rate": 1.1578760303113113e-09, + "loss": 0.0554, + "step": 18342 + }, + { + "epoch": 2.9719701879455607, + "grad_norm": 0.8297640085220337, + "learning_rate": 1.1446063671854969e-09, + "loss": 0.0528, + "step": 18343 + }, + { + "epoch": 2.9721322099805576, + "grad_norm": 0.8789884448051453, + "learning_rate": 1.131413162987205e-09, + "loss": 0.0607, + "step": 18344 + }, + { + "epoch": 2.972294232015554, + "grad_norm": 1.033374309539795, + "learning_rate": 1.1182964181208344e-09, + "loss": 0.0625, + "step": 18345 + }, + { + "epoch": 2.972456254050551, + "grad_norm": 0.8781412839889526, + "learning_rate": 1.1052561329871757e-09, + "loss": 0.0567, + "step": 18346 + }, + { + "epoch": 2.9726182760855475, + "grad_norm": 1.0838392972946167, + "learning_rate": 1.0922923079856319e-09, + "loss": 0.0611, + "step": 18347 + }, + { + "epoch": 2.9727802981205445, + "grad_norm": 0.891605019569397, + "learning_rate": 1.0794049435128296e-09, + "loss": 0.0582, + "step": 18348 + }, + { + "epoch": 2.972942320155541, + "grad_norm": 0.9720555543899536, + "learning_rate": 1.0665940399626207e-09, + "loss": 0.0635, + "step": 18349 + }, + { + "epoch": 2.973104342190538, + "grad_norm": 0.8500351309776306, + "learning_rate": 1.0538595977277466e-09, + "loss": 0.0587, + "step": 18350 + }, + { + "epoch": 2.973266364225535, + "grad_norm": 1.0722614526748657, + "learning_rate": 1.041201617197063e-09, + "loss": 0.0624, + "step": 18351 + }, + { + "epoch": 2.9734283862605313, + "grad_norm": 0.9611464738845825, + "learning_rate": 1.028620098758315e-09, + "loss": 0.0588, + "step": 18352 + }, + { + "epoch": 2.9735904082955282, + "grad_norm": 1.0209434032440186, + "learning_rate": 1.0161150427964727e-09, + "loss": 0.0619, + "step": 18353 + }, + { + "epoch": 2.973752430330525, + "grad_norm": 0.9883117079734802, + "learning_rate": 1.0036864496942856e-09, + "loss": 0.0645, + "step": 18354 + }, + { + "epoch": 2.9739144523655217, + "grad_norm": 0.890546977519989, + "learning_rate": 9.913343198314495e-10, + "loss": 0.0606, + "step": 18355 + }, + { + "epoch": 2.9740764744005186, + "grad_norm": 0.9300897121429443, + "learning_rate": 9.79058653586551e-10, + "loss": 0.0638, + "step": 18356 + }, + { + "epoch": 2.974238496435515, + "grad_norm": 0.7656763792037964, + "learning_rate": 9.66859451334845e-10, + "loss": 0.0535, + "step": 18357 + }, + { + "epoch": 2.974400518470512, + "grad_norm": 0.9918314814567566, + "learning_rate": 9.54736713449922e-10, + "loss": 0.0602, + "step": 18358 + }, + { + "epoch": 2.9745625405055085, + "grad_norm": 0.8427473306655884, + "learning_rate": 9.426904403023185e-10, + "loss": 0.061, + "step": 18359 + }, + { + "epoch": 2.9747245625405054, + "grad_norm": 0.7689868211746216, + "learning_rate": 9.307206322606288e-10, + "loss": 0.0537, + "step": 18360 + }, + { + "epoch": 2.9748865845755024, + "grad_norm": 0.8835668563842773, + "learning_rate": 9.188272896915041e-10, + "loss": 0.0591, + "step": 18361 + }, + { + "epoch": 2.975048606610499, + "grad_norm": 0.8549810647964478, + "learning_rate": 9.070104129582647e-10, + "loss": 0.0575, + "step": 18362 + }, + { + "epoch": 2.975210628645496, + "grad_norm": 0.9912297129631042, + "learning_rate": 8.952700024231209e-10, + "loss": 0.063, + "step": 18363 + }, + { + "epoch": 2.9753726506804927, + "grad_norm": 0.8764829635620117, + "learning_rate": 8.836060584449524e-10, + "loss": 0.0609, + "step": 18364 + }, + { + "epoch": 2.975534672715489, + "grad_norm": 0.7048518061637878, + "learning_rate": 8.720185813806958e-10, + "loss": 0.0481, + "step": 18365 + }, + { + "epoch": 2.975696694750486, + "grad_norm": 0.97881680727005, + "learning_rate": 8.6050757158479e-10, + "loss": 0.0604, + "step": 18366 + }, + { + "epoch": 2.975858716785483, + "grad_norm": 0.9841058850288391, + "learning_rate": 8.490730294097304e-10, + "loss": 0.0615, + "step": 18367 + }, + { + "epoch": 2.9760207388204796, + "grad_norm": 0.8247631788253784, + "learning_rate": 8.377149552049602e-10, + "loss": 0.0558, + "step": 18368 + }, + { + "epoch": 2.9761827608554765, + "grad_norm": 0.9710003137588501, + "learning_rate": 8.264333493185339e-10, + "loss": 0.0596, + "step": 18369 + }, + { + "epoch": 2.976344782890473, + "grad_norm": 1.0644299983978271, + "learning_rate": 8.15228212095176e-10, + "loss": 0.0599, + "step": 18370 + }, + { + "epoch": 2.97650680492547, + "grad_norm": 0.9960538148880005, + "learning_rate": 8.040995438779453e-10, + "loss": 0.07, + "step": 18371 + }, + { + "epoch": 2.9766688269604664, + "grad_norm": 0.9092425107955933, + "learning_rate": 7.930473450074028e-10, + "loss": 0.0665, + "step": 18372 + }, + { + "epoch": 2.9768308489954634, + "grad_norm": 0.7984481453895569, + "learning_rate": 7.820716158216113e-10, + "loss": 0.0556, + "step": 18373 + }, + { + "epoch": 2.9769928710304603, + "grad_norm": 0.8725264668464661, + "learning_rate": 7.711723566564133e-10, + "loss": 0.0587, + "step": 18374 + }, + { + "epoch": 2.9771548930654568, + "grad_norm": 0.873121976852417, + "learning_rate": 7.603495678451534e-10, + "loss": 0.0517, + "step": 18375 + }, + { + "epoch": 2.9773169151004537, + "grad_norm": 1.0072768926620483, + "learning_rate": 7.496032497195105e-10, + "loss": 0.0708, + "step": 18376 + }, + { + "epoch": 2.9774789371354506, + "grad_norm": 0.875882625579834, + "learning_rate": 7.389334026078332e-10, + "loss": 0.052, + "step": 18377 + }, + { + "epoch": 2.977640959170447, + "grad_norm": 0.9897551536560059, + "learning_rate": 7.283400268365271e-10, + "loss": 0.057, + "step": 18378 + }, + { + "epoch": 2.977802981205444, + "grad_norm": 0.9639883637428284, + "learning_rate": 7.178231227300548e-10, + "loss": 0.0682, + "step": 18379 + }, + { + "epoch": 2.9779650032404406, + "grad_norm": 0.8938714861869812, + "learning_rate": 7.073826906098258e-10, + "loss": 0.0534, + "step": 18380 + }, + { + "epoch": 2.9781270252754375, + "grad_norm": 0.8408438563346863, + "learning_rate": 6.97018730795862e-10, + "loss": 0.0569, + "step": 18381 + }, + { + "epoch": 2.978289047310434, + "grad_norm": 0.9069212079048157, + "learning_rate": 6.867312436045769e-10, + "loss": 0.0674, + "step": 18382 + }, + { + "epoch": 2.978451069345431, + "grad_norm": 0.9260680675506592, + "learning_rate": 6.76520229351274e-10, + "loss": 0.0562, + "step": 18383 + }, + { + "epoch": 2.978613091380428, + "grad_norm": 1.0838451385498047, + "learning_rate": 6.663856883482034e-10, + "loss": 0.0593, + "step": 18384 + }, + { + "epoch": 2.9787751134154243, + "grad_norm": 1.0117061138153076, + "learning_rate": 6.563276209053948e-10, + "loss": 0.0628, + "step": 18385 + }, + { + "epoch": 2.9789371354504213, + "grad_norm": 1.174195647239685, + "learning_rate": 6.463460273306577e-10, + "loss": 0.0687, + "step": 18386 + }, + { + "epoch": 2.979099157485418, + "grad_norm": 0.8610493540763855, + "learning_rate": 6.364409079295808e-10, + "loss": 0.0569, + "step": 18387 + }, + { + "epoch": 2.9792611795204147, + "grad_norm": 0.8971181511878967, + "learning_rate": 6.266122630049776e-10, + "loss": 0.061, + "step": 18388 + }, + { + "epoch": 2.9794232015554116, + "grad_norm": 1.0571962594985962, + "learning_rate": 6.168600928577184e-10, + "loss": 0.0602, + "step": 18389 + }, + { + "epoch": 2.9795852235904086, + "grad_norm": 0.8279181718826294, + "learning_rate": 6.071843977861758e-10, + "loss": 0.0559, + "step": 18390 + }, + { + "epoch": 2.979747245625405, + "grad_norm": 0.9802617430686951, + "learning_rate": 5.975851780862241e-10, + "loss": 0.0636, + "step": 18391 + }, + { + "epoch": 2.9799092676604015, + "grad_norm": 1.0149646997451782, + "learning_rate": 5.880624340517948e-10, + "loss": 0.0606, + "step": 18392 + }, + { + "epoch": 2.9800712896953985, + "grad_norm": 0.8134593367576599, + "learning_rate": 5.786161659740441e-10, + "loss": 0.0535, + "step": 18393 + }, + { + "epoch": 2.9802333117303954, + "grad_norm": 0.8985582590103149, + "learning_rate": 5.692463741424625e-10, + "loss": 0.0575, + "step": 18394 + }, + { + "epoch": 2.980395333765392, + "grad_norm": 1.0262095928192139, + "learning_rate": 5.5995305884321e-10, + "loss": 0.0618, + "step": 18395 + }, + { + "epoch": 2.980557355800389, + "grad_norm": 0.9016473293304443, + "learning_rate": 5.507362203607814e-10, + "loss": 0.0611, + "step": 18396 + }, + { + "epoch": 2.9807193778353858, + "grad_norm": 1.423275113105774, + "learning_rate": 5.415958589774506e-10, + "loss": 0.0656, + "step": 18397 + }, + { + "epoch": 2.9808813998703823, + "grad_norm": 1.001585602760315, + "learning_rate": 5.325319749727165e-10, + "loss": 0.059, + "step": 18398 + }, + { + "epoch": 2.981043421905379, + "grad_norm": 1.0050073862075806, + "learning_rate": 5.235445686238572e-10, + "loss": 0.0532, + "step": 18399 + }, + { + "epoch": 2.981205443940376, + "grad_norm": 1.142298698425293, + "learning_rate": 5.146336402059304e-10, + "loss": 0.0697, + "step": 18400 + }, + { + "epoch": 2.9813674659753726, + "grad_norm": 0.8341407775878906, + "learning_rate": 5.057991899917735e-10, + "loss": 0.0491, + "step": 18401 + }, + { + "epoch": 2.9815294880103695, + "grad_norm": 1.106696367263794, + "learning_rate": 4.970412182511708e-10, + "loss": 0.065, + "step": 18402 + }, + { + "epoch": 2.981691510045366, + "grad_norm": 0.9795176982879639, + "learning_rate": 4.883597252525185e-10, + "loss": 0.0658, + "step": 18403 + }, + { + "epoch": 2.981853532080363, + "grad_norm": 0.879523515701294, + "learning_rate": 4.797547112614376e-10, + "loss": 0.0624, + "step": 18404 + }, + { + "epoch": 2.9820155541153595, + "grad_norm": 0.9020334482192993, + "learning_rate": 4.712261765410509e-10, + "loss": 0.0638, + "step": 18405 + }, + { + "epoch": 2.9821775761503564, + "grad_norm": 0.8660544753074646, + "learning_rate": 4.627741213525383e-10, + "loss": 0.0582, + "step": 18406 + }, + { + "epoch": 2.9823395981853533, + "grad_norm": 0.871384859085083, + "learning_rate": 4.5439854595430435e-10, + "loss": 0.0613, + "step": 18407 + }, + { + "epoch": 2.98250162022035, + "grad_norm": 0.9486377835273743, + "learning_rate": 4.460994506028105e-10, + "loss": 0.0588, + "step": 18408 + }, + { + "epoch": 2.9826636422553467, + "grad_norm": 0.9920843839645386, + "learning_rate": 4.378768355514651e-10, + "loss": 0.0621, + "step": 18409 + }, + { + "epoch": 2.9828256642903437, + "grad_norm": 1.2648646831512451, + "learning_rate": 4.2973070105256643e-10, + "loss": 0.066, + "step": 18410 + }, + { + "epoch": 2.98298768632534, + "grad_norm": 0.9957455396652222, + "learning_rate": 4.2166104735508197e-10, + "loss": 0.0688, + "step": 18411 + }, + { + "epoch": 2.983149708360337, + "grad_norm": 0.8266428709030151, + "learning_rate": 4.136678747060363e-10, + "loss": 0.0555, + "step": 18412 + }, + { + "epoch": 2.983311730395334, + "grad_norm": 0.8543840050697327, + "learning_rate": 4.0575118334967854e-10, + "loss": 0.0543, + "step": 18413 + }, + { + "epoch": 2.9834737524303305, + "grad_norm": 0.8616589307785034, + "learning_rate": 3.9791097352831487e-10, + "loss": 0.0617, + "step": 18414 + }, + { + "epoch": 2.983635774465327, + "grad_norm": 1.0665658712387085, + "learning_rate": 3.90147245482031e-10, + "loss": 0.0605, + "step": 18415 + }, + { + "epoch": 2.983797796500324, + "grad_norm": 0.7606042623519897, + "learning_rate": 3.824599994484146e-10, + "loss": 0.0514, + "step": 18416 + }, + { + "epoch": 2.983959818535321, + "grad_norm": 1.0913400650024414, + "learning_rate": 3.748492356625555e-10, + "loss": 0.066, + "step": 18417 + }, + { + "epoch": 2.9841218405703174, + "grad_norm": 0.9275001883506775, + "learning_rate": 3.673149543573229e-10, + "loss": 0.0614, + "step": 18418 + }, + { + "epoch": 2.9842838626053143, + "grad_norm": 0.902944803237915, + "learning_rate": 3.598571557630881e-10, + "loss": 0.0639, + "step": 18419 + }, + { + "epoch": 2.9844458846403112, + "grad_norm": 0.9159849286079407, + "learning_rate": 3.5247584010827953e-10, + "loss": 0.0619, + "step": 18420 + }, + { + "epoch": 2.9846079066753077, + "grad_norm": 0.9164811372756958, + "learning_rate": 3.451710076188275e-10, + "loss": 0.059, + "step": 18421 + }, + { + "epoch": 2.9847699287103047, + "grad_norm": 0.8712190389633179, + "learning_rate": 3.3794265851816444e-10, + "loss": 0.0598, + "step": 18422 + }, + { + "epoch": 2.9849319507453016, + "grad_norm": 0.9287796020507812, + "learning_rate": 3.307907930272247e-10, + "loss": 0.0547, + "step": 18423 + }, + { + "epoch": 2.985093972780298, + "grad_norm": 1.1647449731826782, + "learning_rate": 3.237154113649998e-10, + "loss": 0.0669, + "step": 18424 + }, + { + "epoch": 2.985255994815295, + "grad_norm": 1.0105154514312744, + "learning_rate": 3.167165137479833e-10, + "loss": 0.055, + "step": 18425 + }, + { + "epoch": 2.9854180168502915, + "grad_norm": 0.9725024700164795, + "learning_rate": 3.0979410039017053e-10, + "loss": 0.0611, + "step": 18426 + }, + { + "epoch": 2.9855800388852884, + "grad_norm": 1.0524919033050537, + "learning_rate": 3.029481715038918e-10, + "loss": 0.0653, + "step": 18427 + }, + { + "epoch": 2.985742060920285, + "grad_norm": 1.088059425354004, + "learning_rate": 2.961787272978689e-10, + "loss": 0.0651, + "step": 18428 + }, + { + "epoch": 2.985904082955282, + "grad_norm": 0.8906368613243103, + "learning_rate": 2.8948576797971364e-10, + "loss": 0.0578, + "step": 18429 + }, + { + "epoch": 2.986066104990279, + "grad_norm": 0.9600349068641663, + "learning_rate": 2.828692937542621e-10, + "loss": 0.0644, + "step": 18430 + }, + { + "epoch": 2.9862281270252753, + "grad_norm": 0.9611825346946716, + "learning_rate": 2.7632930482385243e-10, + "loss": 0.0671, + "step": 18431 + }, + { + "epoch": 2.9863901490602722, + "grad_norm": 0.9350132346153259, + "learning_rate": 2.6986580138832487e-10, + "loss": 0.0597, + "step": 18432 + }, + { + "epoch": 2.986552171095269, + "grad_norm": 0.851214587688446, + "learning_rate": 2.634787836458541e-10, + "loss": 0.0572, + "step": 18433 + }, + { + "epoch": 2.9867141931302656, + "grad_norm": 1.1418606042861938, + "learning_rate": 2.571682517915619e-10, + "loss": 0.0617, + "step": 18434 + }, + { + "epoch": 2.9868762151652626, + "grad_norm": 0.9702183604240417, + "learning_rate": 2.5093420601862706e-10, + "loss": 0.0533, + "step": 18435 + }, + { + "epoch": 2.987038237200259, + "grad_norm": 0.8244792819023132, + "learning_rate": 2.447766465180079e-10, + "loss": 0.054, + "step": 18436 + }, + { + "epoch": 2.987200259235256, + "grad_norm": 1.0218993425369263, + "learning_rate": 2.386955734778873e-10, + "loss": 0.0629, + "step": 18437 + }, + { + "epoch": 2.9873622812702525, + "grad_norm": 0.794620156288147, + "learning_rate": 2.3269098708422754e-10, + "loss": 0.048, + "step": 18438 + }, + { + "epoch": 2.9875243033052494, + "grad_norm": 0.7882254719734192, + "learning_rate": 2.2676288752104814e-10, + "loss": 0.0506, + "step": 18439 + }, + { + "epoch": 2.9876863253402464, + "grad_norm": 1.0353506803512573, + "learning_rate": 2.2091127496959298e-10, + "loss": 0.0663, + "step": 18440 + }, + { + "epoch": 2.987848347375243, + "grad_norm": 0.8752200603485107, + "learning_rate": 2.1513614960888552e-10, + "loss": 0.0578, + "step": 18441 + }, + { + "epoch": 2.98801036941024, + "grad_norm": 0.8921242952346802, + "learning_rate": 2.0943751161545122e-10, + "loss": 0.0593, + "step": 18442 + }, + { + "epoch": 2.9881723914452367, + "grad_norm": 0.8822087049484253, + "learning_rate": 2.0381536116415025e-10, + "loss": 0.0559, + "step": 18443 + }, + { + "epoch": 2.988334413480233, + "grad_norm": 1.00393545627594, + "learning_rate": 1.98269698426512e-10, + "loss": 0.0679, + "step": 18444 + }, + { + "epoch": 2.98849643551523, + "grad_norm": 0.8842718005180359, + "learning_rate": 1.9280052357240065e-10, + "loss": 0.0635, + "step": 18445 + }, + { + "epoch": 2.988658457550227, + "grad_norm": 0.875261664390564, + "learning_rate": 1.8740783676945984e-10, + "loss": 0.0622, + "step": 18446 + }, + { + "epoch": 2.9888204795852236, + "grad_norm": 0.9737663865089417, + "learning_rate": 1.820916381820026e-10, + "loss": 0.0655, + "step": 18447 + }, + { + "epoch": 2.9889825016202205, + "grad_norm": 0.8469840884208679, + "learning_rate": 1.768519279732317e-10, + "loss": 0.059, + "step": 18448 + }, + { + "epoch": 2.989144523655217, + "grad_norm": 0.9026631116867065, + "learning_rate": 1.7168870630357437e-10, + "loss": 0.0613, + "step": 18449 + }, + { + "epoch": 2.989306545690214, + "grad_norm": 1.0350512266159058, + "learning_rate": 1.666019733306823e-10, + "loss": 0.0625, + "step": 18450 + }, + { + "epoch": 2.9894685677252104, + "grad_norm": 0.9230927228927612, + "learning_rate": 1.6159172920998667e-10, + "loss": 0.0618, + "step": 18451 + }, + { + "epoch": 2.9896305897602073, + "grad_norm": 0.933576226234436, + "learning_rate": 1.5665797409553097e-10, + "loss": 0.0561, + "step": 18452 + }, + { + "epoch": 2.9897926117952043, + "grad_norm": 0.8842147588729858, + "learning_rate": 1.5180070813747282e-10, + "loss": 0.0591, + "step": 18453 + }, + { + "epoch": 2.9899546338302008, + "grad_norm": 0.9423680305480957, + "learning_rate": 1.4701993148485972e-10, + "loss": 0.0578, + "step": 18454 + }, + { + "epoch": 2.9901166558651977, + "grad_norm": 0.9645872712135315, + "learning_rate": 1.4231564428424105e-10, + "loss": 0.0643, + "step": 18455 + }, + { + "epoch": 2.9902786779001946, + "grad_norm": 1.0968506336212158, + "learning_rate": 1.3768784667883562e-10, + "loss": 0.0557, + "step": 18456 + }, + { + "epoch": 2.990440699935191, + "grad_norm": 0.8200769424438477, + "learning_rate": 1.3313653881075195e-10, + "loss": 0.0579, + "step": 18457 + }, + { + "epoch": 2.990602721970188, + "grad_norm": 0.8951736688613892, + "learning_rate": 1.2866172081904548e-10, + "loss": 0.059, + "step": 18458 + }, + { + "epoch": 2.9907647440051845, + "grad_norm": 0.9866411685943604, + "learning_rate": 1.2426339284082877e-10, + "loss": 0.0585, + "step": 18459 + }, + { + "epoch": 2.9909267660401815, + "grad_norm": 0.8763075470924377, + "learning_rate": 1.1994155501071636e-10, + "loss": 0.0578, + "step": 18460 + }, + { + "epoch": 2.991088788075178, + "grad_norm": 0.9515385627746582, + "learning_rate": 1.1569620746054722e-10, + "loss": 0.0622, + "step": 18461 + }, + { + "epoch": 2.991250810110175, + "grad_norm": 0.9050562977790833, + "learning_rate": 1.1152735032077255e-10, + "loss": 0.0639, + "step": 18462 + }, + { + "epoch": 2.991412832145172, + "grad_norm": 0.997776985168457, + "learning_rate": 1.0743498371823535e-10, + "loss": 0.0671, + "step": 18463 + }, + { + "epoch": 2.9915748541801683, + "grad_norm": 0.8363643288612366, + "learning_rate": 1.0341910777894593e-10, + "loss": 0.0567, + "step": 18464 + }, + { + "epoch": 2.9917368762151653, + "grad_norm": 0.9793019890785217, + "learning_rate": 9.947972262502881e-11, + "loss": 0.0637, + "step": 18465 + }, + { + "epoch": 2.991898898250162, + "grad_norm": 0.83316969871521, + "learning_rate": 9.561682837777586e-11, + "loss": 0.0526, + "step": 18466 + }, + { + "epoch": 2.9920609202851587, + "grad_norm": 0.846834659576416, + "learning_rate": 9.183042515459317e-11, + "loss": 0.0572, + "step": 18467 + }, + { + "epoch": 2.9922229423201556, + "grad_norm": 0.9713281393051147, + "learning_rate": 8.812051307205416e-11, + "loss": 0.0651, + "step": 18468 + }, + { + "epoch": 2.9923849643551526, + "grad_norm": 0.8826894164085388, + "learning_rate": 8.448709224312402e-11, + "loss": 0.067, + "step": 18469 + }, + { + "epoch": 2.992546986390149, + "grad_norm": 1.1817312240600586, + "learning_rate": 8.093016277938015e-11, + "loss": 0.0597, + "step": 18470 + }, + { + "epoch": 2.992709008425146, + "grad_norm": 0.9918498992919922, + "learning_rate": 7.744972478962443e-11, + "loss": 0.0696, + "step": 18471 + }, + { + "epoch": 2.9928710304601425, + "grad_norm": 0.9315574765205383, + "learning_rate": 7.404577837988313e-11, + "loss": 0.0574, + "step": 18472 + }, + { + "epoch": 2.9930330524951394, + "grad_norm": 0.8410326838493347, + "learning_rate": 7.071832365479481e-11, + "loss": 0.0578, + "step": 18473 + }, + { + "epoch": 2.993195074530136, + "grad_norm": 1.0091389417648315, + "learning_rate": 6.746736071594484e-11, + "loss": 0.0696, + "step": 18474 + }, + { + "epoch": 2.993357096565133, + "grad_norm": 0.9494967460632324, + "learning_rate": 6.429288966297576e-11, + "loss": 0.0644, + "step": 18475 + }, + { + "epoch": 2.9935191186001298, + "grad_norm": 0.8485308885574341, + "learning_rate": 6.119491059303206e-11, + "loss": 0.0614, + "step": 18476 + }, + { + "epoch": 2.9936811406351262, + "grad_norm": 0.9347163438796997, + "learning_rate": 5.817342360048273e-11, + "loss": 0.0584, + "step": 18477 + }, + { + "epoch": 2.993843162670123, + "grad_norm": 0.8836044669151306, + "learning_rate": 5.522842877830892e-11, + "loss": 0.0605, + "step": 18478 + }, + { + "epoch": 2.99400518470512, + "grad_norm": 0.8401952981948853, + "learning_rate": 5.235992621616115e-11, + "loss": 0.0585, + "step": 18479 + }, + { + "epoch": 2.9941672067401166, + "grad_norm": 1.1539223194122314, + "learning_rate": 4.956791600230215e-11, + "loss": 0.0589, + "step": 18480 + }, + { + "epoch": 2.9943292287751135, + "grad_norm": 0.8261008262634277, + "learning_rate": 4.685239822166398e-11, + "loss": 0.0574, + "step": 18481 + }, + { + "epoch": 2.99449125081011, + "grad_norm": 0.8534218072891235, + "learning_rate": 4.4213372957790935e-11, + "loss": 0.0544, + "step": 18482 + }, + { + "epoch": 2.994653272845107, + "grad_norm": 0.8698385953903198, + "learning_rate": 4.165084029117417e-11, + "loss": 0.0586, + "step": 18483 + }, + { + "epoch": 2.9948152948801035, + "grad_norm": 1.0100245475769043, + "learning_rate": 3.9164800300084404e-11, + "loss": 0.0542, + "step": 18484 + }, + { + "epoch": 2.9949773169151004, + "grad_norm": 0.8979434967041016, + "learning_rate": 3.6755253060849484e-11, + "loss": 0.0589, + "step": 18485 + }, + { + "epoch": 2.9951393389500973, + "grad_norm": 0.9884781837463379, + "learning_rate": 3.442219864729923e-11, + "loss": 0.065, + "step": 18486 + }, + { + "epoch": 2.995301360985094, + "grad_norm": 1.027869701385498, + "learning_rate": 3.216563713048793e-11, + "loss": 0.0565, + "step": 18487 + }, + { + "epoch": 2.9954633830200907, + "grad_norm": 0.9127057790756226, + "learning_rate": 2.998556857952695e-11, + "loss": 0.06, + "step": 18488 + }, + { + "epoch": 2.9956254050550877, + "grad_norm": 0.9373353123664856, + "learning_rate": 2.7881993061307233e-11, + "loss": 0.0585, + "step": 18489 + }, + { + "epoch": 2.995787427090084, + "grad_norm": 1.021518588066101, + "learning_rate": 2.5854910639944165e-11, + "loss": 0.0629, + "step": 18490 + }, + { + "epoch": 2.995949449125081, + "grad_norm": 0.8520172238349915, + "learning_rate": 2.390432137761023e-11, + "loss": 0.0592, + "step": 18491 + }, + { + "epoch": 2.996111471160078, + "grad_norm": 0.7974404096603394, + "learning_rate": 2.203022533425747e-11, + "loss": 0.0554, + "step": 18492 + }, + { + "epoch": 2.9962734931950745, + "grad_norm": 0.8519773483276367, + "learning_rate": 2.023262256678482e-11, + "loss": 0.0582, + "step": 18493 + }, + { + "epoch": 2.9964355152300715, + "grad_norm": 0.904719352722168, + "learning_rate": 1.8511513130148317e-11, + "loss": 0.0618, + "step": 18494 + }, + { + "epoch": 2.996597537265068, + "grad_norm": 0.9865505695343018, + "learning_rate": 1.686689707736111e-11, + "loss": 0.0664, + "step": 18495 + }, + { + "epoch": 2.996759559300065, + "grad_norm": 0.965429425239563, + "learning_rate": 1.529877445866079e-11, + "loss": 0.0616, + "step": 18496 + }, + { + "epoch": 2.9969215813350614, + "grad_norm": 0.8916358351707458, + "learning_rate": 1.3807145322064508e-11, + "loss": 0.0618, + "step": 18497 + }, + { + "epoch": 2.9970836033700583, + "grad_norm": 0.8863734602928162, + "learning_rate": 1.2392009713091402e-11, + "loss": 0.0537, + "step": 18498 + }, + { + "epoch": 2.9972456254050552, + "grad_norm": 0.9996553659439087, + "learning_rate": 1.1053367674762617e-11, + "loss": 0.0576, + "step": 18499 + }, + { + "epoch": 2.9974076474400517, + "grad_norm": 0.8960807919502258, + "learning_rate": 9.791219248711515e-12, + "loss": 0.0567, + "step": 18500 + }, + { + "epoch": 2.9975696694750487, + "grad_norm": 0.9851888418197632, + "learning_rate": 8.605564472963235e-12, + "loss": 0.0613, + "step": 18501 + }, + { + "epoch": 2.9977316915100456, + "grad_norm": 1.055681824684143, + "learning_rate": 7.496403384155137e-12, + "loss": 0.0598, + "step": 18502 + }, + { + "epoch": 2.997893713545042, + "grad_norm": 0.9974924921989441, + "learning_rate": 6.463736015871469e-12, + "loss": 0.0644, + "step": 18503 + }, + { + "epoch": 2.998055735580039, + "grad_norm": 0.9039761424064636, + "learning_rate": 5.507562400308697e-12, + "loss": 0.0685, + "step": 18504 + }, + { + "epoch": 2.9982177576150355, + "grad_norm": 0.9217861890792847, + "learning_rate": 4.6278825660550645e-12, + "loss": 0.0597, + "step": 18505 + }, + { + "epoch": 2.9983797796500324, + "grad_norm": 0.8809846043586731, + "learning_rate": 3.8246965403110344e-12, + "loss": 0.0548, + "step": 18506 + }, + { + "epoch": 2.998541801685029, + "grad_norm": 0.8097039461135864, + "learning_rate": 3.098004347779071e-12, + "loss": 0.0532, + "step": 18507 + }, + { + "epoch": 2.998703823720026, + "grad_norm": 0.847151517868042, + "learning_rate": 2.4478060103860777e-12, + "loss": 0.0547, + "step": 18508 + }, + { + "epoch": 2.998865845755023, + "grad_norm": 0.8286563754081726, + "learning_rate": 1.8741015483936253e-12, + "loss": 0.0566, + "step": 18509 + }, + { + "epoch": 2.9990278677900193, + "grad_norm": 0.9556106925010681, + "learning_rate": 1.376890979287726e-12, + "loss": 0.0555, + "step": 18510 + }, + { + "epoch": 2.999189889825016, + "grad_norm": 0.7847691774368286, + "learning_rate": 9.56174318056391e-13, + "loss": 0.0503, + "step": 18511 + }, + { + "epoch": 2.999351911860013, + "grad_norm": 1.074299931526184, + "learning_rate": 6.119515774671847e-13, + "loss": 0.0612, + "step": 18512 + }, + { + "epoch": 2.9995139338950096, + "grad_norm": 0.9344704747200012, + "learning_rate": 3.442227686223376e-13, + "loss": 0.0638, + "step": 18513 + }, + { + "epoch": 2.9996759559300066, + "grad_norm": 1.0170155763626099, + "learning_rate": 1.529878990158551e-13, + "loss": 0.0624, + "step": 18514 + }, + { + "epoch": 2.9998379779650035, + "grad_norm": 0.8656219840049744, + "learning_rate": 3.824697503151953e-14, + "loss": 0.0574, + "step": 18515 + }, + { + "epoch": 3.0, + "grad_norm": 0.9637875556945801, + "learning_rate": 0.0, + "loss": 0.0613, + "step": 18516 + } + ], + "logging_steps": 1.0, + "max_steps": 18516, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.8941809923882746e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}